package org.apache.manifoldcf.agents.transformation.htmlextractor;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.manifoldcf.agents.interfaces.IOutputAddActivity;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.agents.transformation.BaseTransformationConnector;
import org.apache.manifoldcf.agents.transformation.htmlextractor.exception.RegexException;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.core.interfaces.VersionContext;
import org.apache.manifoldcf.crawler.system.Logging;

/* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.class */
public class HtmlExtractor extends BaseTransformationConnector {
    /** Revision-control identifier string. */
    public static final String _rcsid = "@(#)$Id$";

    // Resource names handed to Messages.outputResourceWithVelocity by the UI methods of this class.
    private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
    private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
    private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
    private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
    private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";

    /**
     * HTML tag usage code with value 0.
     * NOTE(review): this decompiled source also uses the constant wherever a plain
     * {@code 0} (or {@code null}) was meant, e.g. as a loop start index.
     */
    protected static final int HTML_STRIP_NONE = 0;

    /**
     * Specification node type whose {@code "value"} attribute carries the strip-HTML flag.
     * NOTE(review): the constant name says "keep metadata" but the node type is "striphtml".
     */
    public static final String NODE_KEEPMETADATA = "striphtml";

    // The next three constants are not referenced anywhere in this file.
    public static final String NODE_FILTEREMPTY = "filterEmpty";
    public static final String ATTRIBUTE_SOURCE = "source";
    public static final String ATTRIBUTE_TARGET = "target";

    /** Attribute name "value"; the code in this file uses the literal {@code "value"} for the strip-HTML node. */
    public static final String ATTRIBUTE_VALUE = "value";

    /**
     * Not referenced in this file.
     * NOTE(review): presumably the size threshold for picking memory versus file storage - confirm.
     */
    protected static final long inMemoryMaximumFile = 65536;

    /** Activity type recorded by addOrReplaceDocumentWithException. */
    protected static final String ACTIVITY_PROCESS = "process";

    /** Activities reported by getActivitiesList. */
    protected static final String[] activitiesList = {ACTIVITY_PROCESS};

    /**
     * HTML tag usage code with value 1.
     * NOTE(review): also used throughout this decompiled source where a plain {@code 1}
     * (or {@code true}) was meant, e.g. as a loop increment.
     */
    protected static final int HTML_STRIP_ALL = 1;

    /** Mutable static setting published to the Velocity context under the key "HTMLTAGUSAGE". */
    protected static int html_strip_usage = HTML_STRIP_ALL;

    /* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor$DestinationStorage.class */
    /**
     * Scratch storage that is written through an output stream and read back
     * through an input stream. This file provides a temporary-file backed
     * implementation and an in-memory implementation.
     */
    protected interface DestinationStorage {
        /**
         * @return a stream over the bytes stored so far
         * @throws ManifoldCFException if the stored content cannot be opened
         */
        InputStream getInputStream() throws ManifoldCFException;

        /**
         * @return the stream callers write the content to
         * @throws ManifoldCFException declared for implementations that may fail
         */
        OutputStream getOutputStream() throws ManifoldCFException;

        /**
         * @return the number of bytes currently stored
         * @throws ManifoldCFException declared for implementations that may fail
         */
        long getBinaryLength() throws ManifoldCFException;

        /**
         * Releases whatever the implementation holds (the file-backed variant
         * deletes its temporary file; the in-memory variant has nothing to release).
         *
         * @throws ManifoldCFException declared for implementations that may fail
         */
        void close() throws ManifoldCFException;
    }

    /* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor$FileDestinationStorage.class */
    /**
     * {@link DestinationStorage} backed by a temporary file.
     *
     * <p>I/O failures are reported as {@link ManifoldCFException} instead of being
     * silently ignored, so callers never observe a half-initialised instance or a
     * {@code null} stream.
     */
    protected static class FileDestinationStorage implements DestinationStorage {
        protected final File outputFile;
        protected final OutputStream outputStream;

        /**
         * Creates the temporary file and opens it for writing.
         *
         * @throws ManifoldCFException if the file cannot be created or opened
         */
        public FileDestinationStorage() throws ManifoldCFException {
            File file = null;
            OutputStream stream;
            try {
                file = File.createTempFile("mcftika", "tmp");
                stream = new FileOutputStream(file);
            } catch (IOException e) {
                // Do not leave an orphaned temp file behind when opening the stream failed.
                if (file != null) {
                    file.delete();
                }
                throw toManifoldCFException(e);
            }
            this.outputFile = file;
            this.outputStream = stream;
        }

        /** @return the stream writing into the temporary file */
        @Override
        public OutputStream getOutputStream() throws ManifoldCFException {
            return this.outputStream;
        }

        /** @return the current length of the temporary file in bytes */
        @Override
        public long getBinaryLength() throws ManifoldCFException {
            return this.outputFile.length();
        }

        /**
         * Opens a new stream reading the temporary file from its beginning.
         *
         * @return a fresh input stream; the caller is responsible for closing it
         * @throws ManifoldCFException if the file cannot be opened
         */
        @Override
        public InputStream getInputStream() throws ManifoldCFException {
            try {
                return new FileInputStream(this.outputFile);
            } catch (IOException e) {
                throw toManifoldCFException(e);
            }
        }

        /**
         * Closes the output stream and deletes the temporary file. The file is
         * deleted even when closing the stream fails.
         *
         * @throws ManifoldCFException if closing the output stream fails
         */
        @Override
        public void close() throws ManifoldCFException {
            try {
                this.outputStream.close();
            } catch (IOException e) {
                throw toManifoldCFException(e);
            } finally {
                this.outputFile.delete();
            }
        }

        /**
         * Converts an I/O failure into the framework exception, flagging
         * interruptions so the framework can tell them apart from real errors.
         */
        private static ManifoldCFException toManifoldCFException(IOException e) {
            if (e instanceof InterruptedIOException) {
                return new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
            }
            return new ManifoldCFException(e.getMessage(), e);
        }
    }

    /* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor$MemoryDestinationStorage.class */
    /**
     * {@link DestinationStorage} that keeps everything in a growable in-memory
     * byte buffer.
     */
    protected static class MemoryDestinationStorage implements DestinationStorage {
        protected final ByteArrayOutputStream outputStream;

        /**
         * @param i initial capacity, in bytes, of the backing buffer
         */
        public MemoryDestinationStorage(int i) {
            outputStream = new ByteArrayOutputStream(i);
        }

        /** @return the in-memory buffer, viewed as an output stream */
        @Override
        public OutputStream getOutputStream() throws ManifoldCFException {
            return outputStream;
        }

        /** @return the number of bytes written to the buffer so far */
        @Override
        public long getBinaryLength() throws ManifoldCFException {
            final int byteCount = outputStream.size();
            return byteCount;
        }

        /** @return a stream over a copy of the bytes written so far */
        @Override
        public InputStream getInputStream() throws ManifoldCFException {
            final byte[] snapshot = outputStream.toByteArray();
            return new ByteArrayInputStream(snapshot);
        }

        /** Nothing to release for an in-memory buffer. */
        @Override
        public void close() throws ManifoldCFException {
            // intentionally empty
        }

        /**
         * Always throws: rethrows the given I/O failure as a
         * {@link ManifoldCFException}, using error code 2 when the failure is an
         * {@link InterruptedIOException}.
         *
         * @param iOException the failure to convert
         * @return never returns normally
         * @throws ManifoldCFException always
         */
        protected static int handleIOException(IOException iOException) throws ManifoldCFException {
            final String message = iOException.getMessage();
            if (!(iOException instanceof InterruptedIOException)) {
                throw new ManifoldCFException(message, iOException);
            }
            throw new ManifoldCFException(message, iOException, 2);
        }
    }

    /* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor$SpecPacker.class */
    /**
     * Reads the include filters, exclude filters and strip-HTML flag out of a
     * pipeline {@link Specification} and can render them as a packed string.
     */
    protected static class SpecPacker {
        private final List<String> includeFilters = new ArrayList<>();
        private final List<String> excludeFilters = new ArrayList<>();
        private final boolean striphtml;

        /**
         * Scans the children of the specification.
         *
         * <p>Filter nodes without a regex attribute are skipped so the lists
         * never contain {@code null}. When no include filter is configured the
         * single default {@code "body"} is used. The strip-HTML flag defaults to
         * {@code true} and is overridden by the {@code "value"} attribute of the
         * last "striphtml" node found.
         *
         * @param specification the specification to read
         */
        public SpecPacker(Specification specification) {
            boolean strip = true;
            final int childCount = specification.getChildCount();
            for (int i = 0; i < childCount; i++) {
                SpecificationNode child = specification.getChild(i);
                String type = child.getType();
                if (type.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
                    addIfPresent(this.includeFilters, child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
                } else if (type.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
                    addIfPresent(this.excludeFilters, child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX));
                } else if (type.equals(HtmlExtractor.NODE_KEEPMETADATA)) {
                    // Boolean.parseBoolean(null) is false, so a node without a value disables stripping.
                    strip = Boolean.parseBoolean(child.getAttributeValue("value"));
                }
            }
            if (this.includeFilters.isEmpty()) {
                this.includeFilters.add("body");
            }
            this.striphtml = strip;
        }

        /** Appends the value to the list unless it is {@code null}. */
        private static void addIfPresent(List<String> target, String value) {
            if (value != null) {
                target.add(value);
            }
        }

        /**
         * Renders the settings as one string: the packed include list, the packed
         * exclude list (both with {@code '+'} as delimiter), followed by
         * {@code '+'} when HTML is stripped or {@code '-'} when it is not.
         *
         * @return the packed representation
         */
        public String toPackedString() {
            StringBuilder sb = new StringBuilder();
            HtmlExtractor.packList(sb, this.includeFilters, '+');
            HtmlExtractor.packList(sb, this.excludeFilters, '+');
            sb.append(this.striphtml ? '+' : '-');
            return sb.toString();
        }
    }

    /**
     * Lists the activity types this connector records.
     *
     * @return a fresh copy of the activity names, so callers cannot modify the
     *         shared static array
     */
    public String[] getActivitiesList() {
        return activitiesList.clone();
    }

    /**
     * Runs the HTML extraction on documents whose mime type starts with
     * {@code text/html} and forwards the result down the pipeline; every other
     * document (including one without a mime type) is forwarded untouched.
     *
     * <p>For HTML documents the binary content is replaced by the UTF-8 encoded
     * {@code "extractedDoc"} entry returned by {@code JsoupProcessing}, and every
     * other returned entry is added as a field named {@code jsoup_<key>}. One
     * activity record is written per processed document, carrying {@code "OK"} on
     * success or the upper-cased exception class name on failure.
     *
     * <p>NOTE(review): the previous code also tested for
     * {@code application/xhtml+xml}, but in a position where it could never match;
     * XHTML documents are therefore still not processed. Confirm whether they
     * should be.
     *
     * @param str the document URI
     * @param versionContext carries the pipeline specification
     * @param repositoryDocument the document to transform
     * @param str2 authority name (not used here)
     * @param iOutputAddActivity pipeline activities used to forward and record
     * @return the status returned by {@code sendDocument}
     * @throws ManifoldCFException rethrown from processing or forwarding
     * @throws ServiceInterruption rethrown from processing or forwarding
     * @throws IOException rethrown from processing or forwarding
     */
    public int addOrReplaceDocumentWithException(String str, VersionContext versionContext, RepositoryDocument repositoryDocument, String str2, IOutputAddActivity iOutputAddActivity) throws ManifoldCFException, ServiceInterruption, IOException {
        final long startTime = System.currentTimeMillis();
        final SpecPacker specPacker = new SpecPacker(versionContext.getSpecification());
        Logging.connectors.debug("Processing by HTML Extractor");

        final String mimeType = repositoryDocument.getMimeType();
        if (mimeType == null || !mimeType.startsWith("text/html")) {
            Logging.connectors.debug("no processing, mime type not html");
            return iOutputAddActivity.sendDocument(str, repositoryDocument);
        }

        Long originalLength = null;
        String resultCode = "OK";
        String resultDescription = null;
        try {
            Logging.connectors.debug("Document recognized as HTML - processing");
            originalLength = Long.valueOf(repositoryDocument.getBinaryLength());
            Hashtable<String, String> extracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(
                    repositoryDocument.getBinaryStream(),
                    specPacker.includeFilters.get(0),
                    specPacker.excludeFilters,
                    specPacker.striphtml);

            byte[] extractedBytes = extracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8);
            repositoryDocument.setBinary(new ByteArrayInputStream(extractedBytes), extractedBytes.length);

            for (Map.Entry<String, String> entry : extracted.entrySet()) {
                // Compare by value; reference comparison only works by accident of interning.
                if (!"extractedDoc".equals(entry.getKey())) {
                    repositoryDocument.addField("jsoup_" + entry.getKey(), entry.getValue());
                }
            }
            return iOutputAddActivity.sendDocument(str, repositoryDocument);
        } catch (ManifoldCFException | ServiceInterruption | IOException e) {
            // Framework-level failures are propagated, but recorded truthfully first.
            resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            resultDescription = e.getMessage();
            throw e;
        } catch (Exception e) {
            // Best effort: any other failure is recorded and the document is still forwarded below.
            resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            resultDescription = e.getMessage();
            Logging.connectors.warn("HTML extraction failed for " + str + ", forwarding document without extraction", e);
        } finally {
            iOutputAddActivity.recordActivity(Long.valueOf(startTime), ACTIVITY_PROCESS, originalLength, str, resultCode, resultDescription);
        }

        return iOutputAddActivity.sendDocument(str, repositoryDocument);
    }

    /**
     * Finds the first regular expression in the list that matches somewhere in
     * the given text.
     *
     * @param list candidate regular expressions, tried in iteration order
     * @param str the text to search
     * @return the first expression for which a match is found, or {@code null}
     *         when none matches
     * @throws RegexException if one of the expressions tried is not valid regex syntax
     */
    private String matchingRegex(List<String> list, String str) throws RegexException {
        for (String candidate : list) {
            final Pattern compiled;
            try {
                compiled = Pattern.compile(candidate);
            } catch (PatternSyntaxException invalid) {
                throw new RegexException(candidate, "Invalid regular expression");
            }
            boolean found = compiled.matcher(str).find();
            if (found) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Writes the {@code editConfiguration.js} resource to the output through
     * {@code Messages.outputResourceWithVelocity}, passing {@code null} as the
     * context map. The thread context, configuration parameters and tab list
     * are not used.
     *
     * @param iThreadContext unused
     * @param iHTTPOutput where the resource is written
     * @param locale locale used to select the resource
     * @param configParams unused
     * @param list unused
     * @throws ManifoldCFException propagated from the resource output
     * @throws IOException propagated from the resource output
     */
    public void outputConfigurationHeader(IThreadContext iThreadContext, IHTTPOutput iHTTPOutput, Locale locale, ConfigParams configParams, List<String> list) throws ManifoldCFException, IOException {
        Messages.outputResourceWithVelocity(iHTTPOutput, locale, EDIT_CONFIGURATION_JS, null);
    }

    /**
     * Produces no output: this connector writes nothing into the body of the
     * connection configuration page. All parameters are ignored.
     *
     * @param iThreadContext unused
     * @param iHTTPOutput unused
     * @param locale unused
     * @param configParams unused
     * @param str name of the selected tab; unused
     * @throws ManifoldCFException never thrown by this implementation
     * @throws IOException never thrown by this implementation
     */
    public void outputConfigurationBody(IThreadContext iThreadContext, IHTTPOutput iHTTPOutput, Locale locale, ConfigParams configParams, String str) throws ManifoldCFException, IOException {
        // Intentionally empty. The previous body built a context map that was
        // immediately discarded, which had no observable effect.
    }

    /**
     * Ignores the posted configuration form and all other arguments.
     *
     * @param iThreadContext unused
     * @param iPostParameters unused
     * @param locale unused
     * @param configParams unused
     * @return always {@code null}
     * @throws ManifoldCFException never thrown by this implementation
     */
    public String processConfigurationPost(IThreadContext iThreadContext, IPostParameters iPostParameters, Locale locale, ConfigParams configParams) throws ManifoldCFException {
        return null;
    }

    /**
     * Writes the {@code viewConfiguration.html} resource to the output, rendered
     * with an empty context map.
     *
     * @param iThreadContext unused
     * @param iHTTPOutput where the resource is written
     * @param locale locale used to select the resource
     * @param configParams unused
     * @throws ManifoldCFException propagated from the resource output
     * @throws IOException propagated from the resource output
     */
    public void viewConfiguration(IThreadContext iThreadContext, IHTTPOutput iHTTPOutput, Locale locale, ConfigParams configParams) throws ManifoldCFException, IOException {
        final Map<String, Object> emptyContext = new HashMap<>();
        Messages.outputResourceWithVelocity(iHTTPOutput, locale, VIEW_CONFIGURATION_HTML, emptyContext);
    }

    /**
     * Copies the extractor settings found in the specification into a Velocity
     * context map.
     *
     * <p>Keys written: {@code INCLUDEFILTERS} and {@code EXCLUDEFILTERS} (lists
     * of the non-null regex attributes of the respective filter nodes),
     * {@code HTMLTAGUSAGE} (the current {@code html_strip_usage} value) and
     * {@code STRIPHTML} (the {@code "value"} attribute of the last "striphtml"
     * node, or {@code "true"} when there is no such node).
     *
     * @param map the context map to fill
     * @param specification the specification to read
     */
    protected static void fillInHtmlExtractorSpecification(Map<String, Object> map, Specification specification) {
        List<String> includeFilters = new ArrayList<>();
        List<String> excludeFilters = new ArrayList<>();
        String stripHtml = "true";

        final int childCount = specification.getChildCount();
        for (int i = 0; i < childCount; i++) {
            SpecificationNode child = specification.getChild(i);
            String type = child.getType();
            if (type.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
                String regex = child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
                if (regex != null) {
                    includeFilters.add(regex);
                }
            } else if (type.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
                String regex = child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
                if (regex != null) {
                    excludeFilters.add(regex);
                }
            } else if (type.equals(NODE_KEEPMETADATA)) {
                // Kept as-is even when the attribute is missing (null), as before.
                stripHtml = child.getAttributeValue("value");
            }
        }

        map.put("INCLUDEFILTERS", includeFilters);
        map.put("EXCLUDEFILTERS", excludeFilters);
        map.put("HTMLTAGUSAGE", Integer.valueOf(html_strip_usage));
        map.put("STRIPHTML", stripHtml);
    }

    /**
     * Registers the extractor tab name and writes the
     * {@code editSpecification.js} resource, rendered with the sequence number
     * and the current extractor settings.
     *
     * @param iHTTPOutput where the resource is written
     * @param locale locale used for the tab name and the resource
     * @param specification the specification whose settings are rendered
     * @param i connection sequence number, published as {@code SEQNUM}
     * @param list tab names; the extractor tab name is appended to it
     * @throws ManifoldCFException propagated from the resource output
     * @throws IOException propagated from the resource output
     */
    public void outputSpecificationHeader(IHTTPOutput iHTTPOutput, Locale locale, Specification specification, int i, List<String> list) throws ManifoldCFException, IOException {
        final Map<String, Object> velocityContext = new HashMap<>();
        final String sequenceNumber = String.valueOf(i);
        velocityContext.put("SEQNUM", sequenceNumber);

        final String tabName = Messages.getString(locale, "HtmlExtractorTransformationConnector.HtmlExtractorTabName");
        list.add(tabName);

        fillInHtmlExtractorSpecification(velocityContext, specification);
        Messages.outputResourceWithVelocity(iHTTPOutput, locale, EDIT_SPECIFICATION_JS, velocityContext);
    }

    /**
     * Writes the {@code editSpecification_HTML_Extractor.html} resource, rendered
     * with the tab name, sequence number, selected number and the current
     * extractor settings.
     *
     * @param iHTTPOutput where the resource is written
     * @param locale locale used to select the resource
     * @param specification the specification whose settings are rendered
     * @param i connection sequence number, published as {@code SEQNUM}
     * @param i2 published as {@code SELECTEDNUM}
     * @param str published as {@code TABNAME}
     * @throws ManifoldCFException propagated from the resource output
     * @throws IOException propagated from the resource output
     */
    public void outputSpecificationBody(IHTTPOutput iHTTPOutput, Locale locale, Specification specification, int i, int i2, String str) throws ManifoldCFException, IOException {
        final Map<String, Object> velocityContext = new HashMap<>();
        final String sequenceNumber = String.valueOf(i);
        final String selectedNumber = String.valueOf(i2);

        velocityContext.put("TABNAME", str);
        velocityContext.put("SEQNUM", sequenceNumber);
        velocityContext.put("SELECTEDNUM", selectedNumber);
        fillInHtmlExtractorSpecification(velocityContext, specification);

        Messages.outputResourceWithVelocity(iHTTPOutput, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, velocityContext);
    }

    /**
     * Applies a posted specification form to the specification.
     *
     * <p>All parameter names are prefixed with {@code s<i>_}. Three independent
     * sections are handled, each only when its marker parameter is present and
     * non-empty:
     * <ul>
     *   <li>{@code includefilter_count}: the include filter nodes are rebuilt
     *       from the posted rows;</li>
     *   <li>{@code excludefilter_count}: likewise for exclude filter nodes;</li>
     *   <li>{@code striphtml_present}: the "striphtml" node is replaced, with
     *       value {@code "false"} when {@code striphtml} itself is absent.</li>
     * </ul>
     *
     * @param iPostParameters the posted form values
     * @param locale unused
     * @param specification the specification to update in place
     * @param i connection sequence number used to build the parameter prefix
     * @return always {@code null}
     * @throws ManifoldCFException if a posted filter count is not a valid integer
     */
    public String processSpecificationPost(IPostParameters iPostParameters, Locale locale, Specification specification, int i) throws ManifoldCFException {
        final String seqPrefix = "s" + i + "_";

        updateFiltersFromPost(iPostParameters, specification, seqPrefix + "includefilter_", HtmlExtractorConfig.NODE_INCLUDEFILTER);
        updateFiltersFromPost(iPostParameters, specification, seqPrefix + "excludefilter_", HtmlExtractorConfig.NODE_EXCLUDEFILTER);

        String stripPresent = iPostParameters.getParameter(seqPrefix + "striphtml_present");
        if (stripPresent != null && stripPresent.length() > 0) {
            // An unchecked checkbox is simply not posted, hence the "false" default.
            String stripValue = iPostParameters.getParameter(seqPrefix + "striphtml");
            if (stripValue == null) {
                stripValue = "false";
            }
            removeChildrenOfType(specification, NODE_KEEPMETADATA);
            SpecificationNode stripNode = new SpecificationNode(NODE_KEEPMETADATA);
            stripNode.setAttribute("value", stripValue);
            specification.addChild(specification.getChildCount(), stripNode);
        }
        return null;
    }

    /**
     * Rebuilds all nodes of one filter type from the posted rows: rows whose
     * {@code op_<n>} is "Delete" are dropped, the others are re-added from
     * {@code regex_<n>}, and a new row is appended when {@code op} is "Add".
     * Does nothing when {@code <prefix>count} is missing or empty. The count is
     * validated before the specification is touched.
     */
    private static void updateFiltersFromPost(IPostParameters postParameters, Specification specification, String prefix, String nodeType) throws ManifoldCFException {
        String countValue = postParameters.getParameter(prefix + "count");
        if (countValue == null || countValue.length() == 0) {
            return;
        }
        final int rowCount;
        try {
            rowCount = Integer.parseInt(countValue);
        } catch (NumberFormatException e) {
            throw new ManifoldCFException("Invalid value '" + countValue + "' for parameter " + prefix + "count", e);
        }

        removeChildrenOfType(specification, nodeType);

        for (int row = 0; row < rowCount; row++) {
            String rowSuffix = "_" + row;
            String rowOp = postParameters.getParameter(prefix + "op" + rowSuffix);
            if (!"Delete".equals(rowOp)) {
                appendFilterNode(specification, nodeType, postParameters.getParameter(prefix + "regex" + rowSuffix));
            }
        }

        if ("Add".equals(postParameters.getParameter(prefix + "op"))) {
            appendFilterNode(specification, nodeType, postParameters.getParameter(prefix + "regex"));
        }
    }

    /** Appends a filter node carrying the regex; a missing (null) regex is skipped. */
    private static void appendFilterNode(Specification specification, String nodeType, String regex) {
        if (regex == null) {
            return;
        }
        SpecificationNode filterNode = new SpecificationNode(nodeType);
        filterNode.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
        specification.addChild(specification.getChildCount(), filterNode);
    }

    /** Removes every direct child of the specification whose type equals the given type. */
    private static void removeChildrenOfType(Specification specification, String nodeType) {
        int index = 0;
        while (index < specification.getChildCount()) {
            if (specification.getChild(index).getType().equals(nodeType)) {
                // Do not advance: the next child has shifted into this position.
                specification.removeChild(index);
            } else {
                index++;
            }
        }
    }

    /**
     * Writes the {@code viewSpecification.html} resource, rendered with the
     * sequence number and the current extractor settings.
     *
     * @param iHTTPOutput where the resource is written
     * @param locale locale used to select the resource
     * @param specification the specification whose settings are rendered
     * @param i connection sequence number, published as {@code SEQNUM}
     * @throws ManifoldCFException propagated from the resource output
     * @throws IOException propagated from the resource output
     */
    public void viewSpecification(IHTTPOutput iHTTPOutput, Locale locale, Specification specification, int i) throws ManifoldCFException, IOException {
        final Map<String, Object> velocityContext = new HashMap<>();
        final String sequenceNumber = String.valueOf(i);
        velocityContext.put("SEQNUM", sequenceNumber);
        fillInHtmlExtractorSpecification(velocityContext, specification);
        Messages.outputResourceWithVelocity(iHTTPOutput, locale, VIEW_SPECIFICATION_HTML, velocityContext);
    }
}
