package org.apache.manifoldcf.agents.transformation.htmlextractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import org.apache.manifoldcf.crawler.system.Logging;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Whitelist;

/* loaded from: input_file:org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.class */
/**
 * Static helper that parses an HTML stream with jsoup and returns its metadata
 * together with a cleaned version of a selected part of the document.
 */
public class JsoupProcessing {
    /** Selector value meaning "use the document body" as the extraction scope. */
    private static final String BODY_SELECTOR = "body";

    /**
     * Well-known meta tags that are copied to a fixed output key.
     * Each row is {@code {cssSelector, outputKey}}; rows are applied in order.
     */
    private static final String[][] NAMED_META_FIELDS = {
        {"meta[name='keywords']", "keywords"},
        {"meta[name=\"description\"]", "description"},
        {"meta[name=\"author\"]", "author"},
        {"meta[name=\"dcterms.subject\"]", "dc_terms_subject"},
        {"meta[name=\"dcterms.title\"]", "dc_terms_title"},
        {"meta[name=\"dcterms.creator\"]", "dc_terms_creator"},
        {"meta[name=\"dcterms.description\"]", "dc_terms_description"},
        {"meta[name=\"dcterms.publisher\"]", "dc_terms_publisher"},
        {"meta[name=\"dcterms.contributor\"]", "dc_terms_contributor"},
        {"meta[name=\"dcterms.date\"]", "dc_terms_date"},
        {"meta[name=\"dcterms.type\"]", "dc_terms_type"},
        {"meta[name=\"dcterms.format\"]", "dc_terms_format"},
        {"meta[name=\"dcterms.language\"]", "dc_terms_language"},
        {"meta[name=\"dcterms.identifier\"]", "dc_terms_identifier"},
    };

    /**
     * Parses the given HTML stream as UTF-8 and collects its metadata and cleaned content.
     *
     * <p>The returned table contains, in insertion order of precedence (later entries
     * overwrite earlier ones with the same key):
     * <ul>
     *   <li>one entry per {@code <meta>} element, keyed by its {@code name} attribute
     *       with its {@code content} attribute as value;</li>
     *   <li>{@code "title"}: the text of the {@code <title>} element(s);</li>
     *   <li>the fixed keys listed in {@link #NAMED_META_FIELDS}, when the matching
     *       meta tag exists;</li>
     *   <li>{@code "extractedDoc"}: the cleaned, entity-unescaped HTML of the selected
     *       scope element.</li>
     * </ul>
     *
     * @param inputStream HTML content to parse; it is not closed by this method
     * @param str CSS selector of the element to extract; {@code null}, blank, {@code "body"},
     *            or a selector matching nothing all fall back to the document body
     * @param list CSS selectors whose matches are removed from the scope element before
     *             cleaning; may be {@code null}; {@code null} or blank entries are skipped
     * @param z when {@code true} all tags are stripped ({@code Whitelist.none()}) without
     *          pretty-printing; when {@code false} the relaxed whitelist is applied
     * @return table of metadata entries plus the {@code "extractedDoc"} entry
     * @throws IOException if the stream cannot be read or parsed
     */
    public static Hashtable<String, String> extractTextAndMetadataHtmlDocument(InputStream inputStream, String str, List<String> list, boolean z) throws IOException {
        Document parse = Jsoup.parse(inputStream, "UTF-8", HtmlExtractorConfig.BLACKLIST_DEFAULT);
        parse.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        Hashtable<String, String> hashtable = new Hashtable<>();

        // Generic pass: every meta tag is exposed under its own name attribute.
        for (Element meta : parse.select("meta")) {
            String metaName = meta.attr("name");
            String metaContent = meta.attr("content");
            Logging.connectors.debug("Name: " + metaName + " - Content: " + metaContent);
            hashtable.put(metaName, metaContent);
        }

        // The title is always stored, as an empty string when the document has none.
        hashtable.put("title", parse.select("title").text());

        // Specific pass: well-known meta tags are additionally stored under fixed keys.
        for (String[] field : NAMED_META_FIELDS) {
            Element match = parse.select(field[0]).first();
            if (match != null) {
                hashtable.put(field[1], match.attr("content"));
            }
        }

        Element scope = resolveScope(parse, str);
        if (scope == null) {
            // NOTE(review): presumably only reachable for documents without a body element
            // (e.g. framesets); store an empty result instead of failing with an NPE.
            hashtable.put("extractedDoc", "");
            return hashtable;
        }

        if (list != null) {
            for (String blacklisted : list) {
                // Null/blank selectors are rejected by jsoup; ignore them instead of failing.
                if (blacklisted != null && !blacklisted.trim().isEmpty()) {
                    scope.select(blacklisted).remove();
                }
            }
        }

        String scopeHtml = scope.html();
        String cleaned;
        if (z) {
            cleaned = Jsoup.clean(scopeHtml, HtmlExtractorConfig.BLACKLIST_DEFAULT, Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
        } else {
            cleaned = Jsoup.clean(scopeHtml, Whitelist.relaxed());
        }
        hashtable.put("extractedDoc", Parser.unescapeEntities(cleaned, true));
        return hashtable;
    }

    /** Returns the first element matching {@code selector}, or the document body when it is unusable or matches nothing. */
    private static Element resolveScope(Document document, String selector) {
        Element body = document.body();
        // String content must be compared with equals(); '!=' only compares references.
        if (selector == null || selector.trim().isEmpty() || BODY_SELECTOR.equals(selector)) {
            return body;
        }
        Element selected = document.select(selector).first();
        return selected != null ? selected : body;
    }
}
