package de.l3s.icrawl.util;

import com.google.common.collect.ImmutableSet;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.lang.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

/* loaded from: input_file:de/l3s/icrawl/util/WebPageUtils.class */
/**
 * Static helpers for working with parsed HTML pages (jsoup documents): locating the
 * paragraph-like ancestor of a node, pulling string literals out of inline scripts,
 * parsing HTML and sniffing whether a string looks like HTML.
 *
 * <p>The class is not instantiable.
 */
public final class WebPageUtils {
    /** Lower-case tag names that are treated as paragraph-level containers. */
    private static final Set<String> PARAGRAPH_ELEMENTS = ImmutableSet.of(
            "p", "div", "li", "dd", "dt", "blockquote", "pre", "caption", "th", "td");

    /** Default token threshold used by {@link #findParagraphParent(Element)}. */
    private static final int MIN_PARAGRAPH_TOKENS = 50;

    /** A quoted script string must contain more than this many tokens to be kept. */
    private static final int MIN_JS_TOKENS = 3;

    private WebPageUtils() {
    }

    /**
     * Counts whitespace-separated tokens in the text that {@code TextExtractor}
     * produces for the element.
     *
     * <p>Note that {@code "".split("\\s+")} yields one (empty) token, so the result
     * is never less than 1 for empty text.
     */
    private static int tokenCount(Element element) {
        return TextExtractor.extractText(element).split("\\s+").length;
    }

    /**
     * Walks up from {@code element} until a suitable paragraph container is found.
     *
     * <p>The walk stops at the first element that is a paragraph element (see
     * {@link #PARAGRAPH_ELEMENTS}), or that already has at least {@code minTokens}
     * tokens, or that has no parent element.
     *
     * @param element   start element; may be {@code null}
     * @param minTokens minimum token count that makes an element acceptable; a
     *                  negative value disables the token check so that only a
     *                  paragraph element or the root stops the walk
     * @return the element at which the walk stopped, or {@code null} if
     *         {@code element} was {@code null}
     */
    public static Element findParagraphParent(Element element, int minTokens) {
        Element current = element;
        while (current != null && !isParagraphElement(current) && needsMoreTokens(current, minTokens)) {
            Node parent = current.parentNode();
            if (!(parent instanceof Element)) {
                // Reached the top of the tree; keep the last element we had.
                break;
            }
            current = (Element) parent;
        }
        return current;
    }

    /**
     * Returns the nearest {@link Element} that is {@code node} itself or one of its
     * ancestors.
     *
     * @param node start node; may be {@code null}
     * @return {@code node} if it is an element, otherwise its closest element
     *         ancestor, or {@code null} if there is none
     */
    public static Element containingElement(Node node) {
        Node current = node;
        // Test the cursor, not the original argument: otherwise a non-element start
        // node would walk past every ancestor and always yield null.
        while (current != null && !(current instanceof Element)) {
            current = current.parentNode();
        }
        return (Element) current;
    }

    /** Returns whether the element is still below the token threshold (always true for a negative threshold). */
    private static boolean needsMoreTokens(Element element, int minTokens) {
        return minTokens < 0 || tokenCount(element) < minTokens;
    }

    /** Returns whether the element's tag name (case-insensitively) is one of {@link #PARAGRAPH_ELEMENTS}. */
    private static boolean isParagraphElement(Element element) {
        return PARAGRAPH_ELEMENTS.contains(element.tagName().toLowerCase(Locale.ENGLISH));
    }

    /**
     * Same as {@link #findParagraphParent(Element, int)} with the default threshold
     * of {@value #MIN_PARAGRAPH_TOKENS} tokens.
     */
    public static Element findParagraphParent(Element element) {
        return findParagraphParent(element, MIN_PARAGRAPH_TOKENS);
    }

    /**
     * Finds the paragraph container for an arbitrary node by first resolving its
     * nearest enclosing element and then applying
     * {@link #findParagraphParent(Element, int)}.
     *
     * @param node      start node; must not be {@code null}
     * @param minTokens see {@link #findParagraphParent(Element, int)}
     * @return the paragraph container, or {@code null} if the node has no enclosing
     *         element
     * @throws NullPointerException if {@code node} is {@code null}
     */
    public static Element findParagraphParent(Node node, int minTokens) {
        Element start = containingElement(Objects.requireNonNull(node, "node"));
        return start == null ? null : findParagraphParent(start, minTokens);
    }

    /**
     * Collects the longer double-quoted string literals found in all
     * {@code <script>} elements of the document.
     *
     * @param document parsed page
     * @return the collected literals, each preceded by a blank line, trimmed and
     *         with JavaScript escape sequences resolved
     */
    public static String extractTextFromJavascript(Document document) {
        StringBuilder collected = new StringBuilder();
        for (Element script : document.getElementsByTag("script")) {
            // NOTE(review): depending on the jsoup version, script bodies may be held
            // as data nodes, in which case text() is empty and data() would be the
            // accessor to use — confirm against the jsoup version in use.
            extractTextFromJavascript(script.text(), collected);
        }
        return StringEscapeUtils.unescapeJavaScript(collected.toString().trim());
    }

    /**
     * Scans {@code script} for pairs of double quotes and appends the text between
     * each pair to {@code sb} when it has more than {@link #MIN_JS_TOKENS}
     * whitespace-separated tokens.
     *
     * <p>Quotes are paired purely by position; escaped quotes and single-quoted
     * strings get no special treatment.
     */
    private static void extractTextFromJavascript(String script, StringBuilder sb) {
        int openQuote = -1;
        int pos = script.indexOf('"');
        while (pos >= 0) {
            if (openQuote < 0) {
                openQuote = pos;
            } else {
                String literal = script.substring(openQuote + 1, pos);
                if (literal.split("\\s+").length > MIN_JS_TOKENS) {
                    sb.append("\n\n").append(literal);
                }
                openQuote = -1;
            }
            pos = script.indexOf('"', pos + 1);
        }
    }

    /**
     * Parses HTML from a stream, decoding it as UTF-8.
     *
     * @param inputStream HTML bytes
     * @param baseUri     passed to jsoup as the document's base URI
     * @return the parsed document
     * @throws HtmlParseException if reading the stream fails; constructed with
     *                            {@code baseUri} and the underlying {@link IOException}
     */
    public static Document parseHtml(InputStream inputStream, String baseUri) {
        try {
            return Jsoup.parse(inputStream, "UTF-8", baseUri);
        } catch (IOException e) {
            throw new HtmlParseException(baseUri, e);
        }
    }

    /**
     * Parses HTML from a string.
     *
     * @param html    HTML markup
     * @param baseUri passed to jsoup as the document's base URI
     * @return the parsed document
     */
    public static Document parseHtml(String html, String baseUri) {
        return Jsoup.parse(html, baseUri);
    }

    /**
     * Heuristically checks whether the content looks like an HTML document by
     * searching for {@code <html} (case-insensitively) in its first 1024 characters.
     *
     * @param content text to inspect; may be {@code null}
     * @return {@code true} if the marker is found, {@code false} for {@code null},
     *         empty or non-matching content
     */
    public static boolean hasHtmlContent(String content) {
        if (content == null || content.isEmpty()) {
            return false;
        }
        String head = content.substring(0, Math.min(content.length(), 1024));
        return head.toUpperCase(Locale.ROOT).contains("<HTML");
    }
}
