package de.l3s.icrawl.util;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;

/* loaded from: input_file:de/l3s/icrawl/util/TextExtractor.class */
/**
 * Converts a jsoup DOM tree into plain text.
 *
 * <p>The tree is walked depth-first. Text nodes are appended to a running buffer; whenever a
 * block-level element (see {@code BLOCK_ELEMENTS}) is entered, the buffer collected so far is
 * right-trimmed and, if non-empty, stored as a finished fragment. The result is all fragments
 * joined by a blank line ({@code "\n\n"}). {@code <br>} elements become single line breaks, and
 * the content of {@code script}, {@code style} and {@code head} elements is skipped entirely.
 *
 * <p>This class only has static methods and cannot be instantiated.
 */
public class TextExtractor {
    private static final char LINE_BREAK = '\n';
    private static final String FRAGMENT_SEPARATOR = "\n\n";
    /** Initial capacity of the buffer that collects the text of the current fragment. */
    private static final int INITIAL_BUFFER_CAPACITY = 1024;
    /** Single definition of "whitespace" used for detection as well as trimming. */
    private static final CharMatcher WHITESPACE = CharMatcher.WHITESPACE;
    /** Lower-case tag names that start a new text fragment. */
    private static final Set<String> BLOCK_ELEMENTS = Sets.newHashSet(
            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link",
            "title", "frame", "noframes", "section", "nav", "aside", "hgroup", "header", "footer",
            "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "pre", "div", "blockquote", "hr",
            "address", "figure", "figcaption", "form", "fieldset", "ins", "del", "s", "dl", "dt",
            "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr",
            "th", "td", "video", "audio", "canvas", "details", "menu", "plaintext");
    /** Lower-case tag names whose complete subtree is excluded from the output. */
    private static final Set<String> IGNORED_ELEMENTS = Sets.newHashSet("script", "style", "head");

    private TextExtractor() {
    }

    /**
     * Extracts the text of {@code element} and all of its descendants.
     *
     * @param element root of the subtree to extract; must not be {@code null}
     * @return the text fragments joined by a blank line, or the empty string if the subtree
     *         contains no text
     * @throws IllegalArgumentException if the subtree contains a node type this class does not
     *         know how to handle
     */
    public static String extractText(Element element) {
        List<String> fragments = new ArrayList<String>();
        StringBuilder current = new StringBuilder(INITIAL_BUFFER_CAPACITY);
        extract(element, fragments, current);
        return finish(fragments, current);
    }

    /**
     * Extracts the text of all child elements of {@code document}.
     *
     * @param document the document to extract; must not be {@code null}
     * @return the text fragments joined by a blank line, or the empty string if the document
     *         contains no text
     * @throws IllegalArgumentException if the document contains a node type this class does not
     *         know how to handle
     */
    public static String extractText(Document document) {
        List<String> fragments = new ArrayList<String>();
        StringBuilder current = new StringBuilder(INITIAL_BUFFER_CAPACITY);
        for (Element child : document.children()) {
            extract(child, fragments, current);
        }
        return finish(fragments, current);
    }

    /** Flushes the remaining buffer content and joins all fragments into the final result. */
    private static String finish(List<String> fragments, StringBuilder sb) {
        flushFragment(fragments, sb);
        return Joiner.on(FRAGMENT_SEPARATOR).join(fragments);
    }

    /**
     * Right-trims the buffer and, if anything is left, moves its content to {@code fragments}
     * and resets the buffer.
     */
    private static void flushFragment(List<String> fragments, StringBuilder sb) {
        trimRight(sb);
        if (sb.length() != 0) {
            fragments.add(sb.toString());
            sb.setLength(0);
        }
    }

    /**
     * Appends the content of one text node to the buffer.
     *
     * <p>Surrounding whitespace of {@code str} is collapsed to at most one separating space on
     * each side; leading whitespace never produces a space at the very start of the buffer.
     *
     * @param str text of the node, may be empty
     * @param sb buffer of the current fragment
     */
    private static void handleTextNode(String str, StringBuilder sb) {
        if (str.isEmpty()) {
            return;
        }
        // Trim with the same matcher that is used for detection. String.trim() only strips
        // characters <= U+0020 and would keep e.g. non-breaking spaces at the edges.
        String content = WHITESPACE.trimFrom(str);
        if (content.isEmpty()) {
            // Whitespace-only node: acts as a single separator. ensureEndsWithSpace is a no-op
            // on an empty buffer and when the buffer already ends with whitespace, so no
            // doubled spaces are produced.
            ensureEndsWithSpace(sb);
            return;
        }
        if (WHITESPACE.matches(str.charAt(0))) {
            ensureEndsWithSpace(sb);
        }
        sb.append(content);
        if (WHITESPACE.matches(lastChar(str))) {
            sb.append(' ');
        }
    }

    /**
     * Appends a single space unless the buffer is empty or already ends with whitespace.
     *
     * @param sb buffer to modify
     */
    static void ensureEndsWithSpace(StringBuilder sb) {
        if (sb.length() <= 0 || WHITESPACE.matches(lastChar(sb))) {
            return;
        }
        sb.append(' ');
    }

    /** Returns the last character of a non-empty sequence. */
    private static char lastChar(CharSequence charSequence) {
        return charSequence.charAt(charSequence.length() - 1);
    }

    /**
     * Makes a non-empty buffer end with a paragraph break (two line breaks).
     *
     * <ul>
     * <li>An empty buffer, or a buffer holding a single whitespace character, is left as is.</li>
     * <li>If the buffer does not end with a line break, trailing whitespace is removed and
     * {@code "\n\n"} is appended.</li>
     * <li>If the buffer ends with exactly one line break, a second one is appended.</li>
     * <li>If the buffer already ends with two line breaks, nothing happens.</li>
     * </ul>
     *
     * @param sb buffer to modify
     */
    static void ensureEndsWithParagraph(StringBuilder sb) {
        int length = sb.length();
        if (length == 0) {
            return;
        }
        if (length == 1) {
            if (!WHITESPACE.matches(sb.charAt(0))) {
                sb.append(FRAGMENT_SEPARATOR);
            }
            return;
        }
        if (lastChar(sb) != LINE_BREAK) {
            trimRight(sb);
            sb.append(FRAGMENT_SEPARATOR);
        } else if (sb.charAt(length - 2) != LINE_BREAK) {
            sb.append(LINE_BREAK);
        }
    }

    /**
     * Removes all trailing whitespace from the buffer in place.
     *
     * @param sb buffer to modify
     */
    static void trimRight(StringBuilder sb) {
        int end = sb.length();
        while (end > 0 && WHITESPACE.matches(sb.charAt(end - 1))) {
            end--;
        }
        sb.setLength(end);
    }

    /**
     * Tells whether the element's tag name (compared case-insensitively) is a block element.
     *
     * @param element element to test
     * @return {@code true} if entering this element starts a new text fragment
     */
    static boolean isBlockElement(Element element) {
        return BLOCK_ELEMENTS.contains(element.tagName().toLowerCase(Locale.ROOT));
    }

    /**
     * Tells whether the element's tag name (compared case-insensitively) is an ignored element.
     *
     * @param element element to test
     * @return {@code true} if the element and its subtree are excluded from extraction
     */
    static boolean isIgnoredElement(Element element) {
        return IGNORED_ELEMENTS.contains(element.tagName().toLowerCase(Locale.ROOT));
    }

    /**
     * Processes one element and, recursively, its child nodes.
     *
     * @param element element to process
     * @param list finished fragments, appended to when a block element is entered
     * @param sb buffer of the current fragment
     */
    private static void extract(Element element, List<String> list, StringBuilder sb) {
        if (isIgnoredElement(element)) {
            return;
        }
        if (isBlockElement(element)) {
            // NOTE(review): a fragment is only closed when a block element *opens*. Text that
            // directly follows a closing block tag is therefore appended to that block's
            // fragment - confirm whether this is intended.
            flushFragment(list, sb);
        } else if ("br".equalsIgnoreCase(element.tagName()) && sb.length() > 0) {
            // A line break at the very start of a fragment would leave leading whitespace in
            // the output (trailing whitespace is always trimmed), so it is only emitted once
            // the fragment has content.
            sb.append(LINE_BREAK);
        }
        for (Node child : element.childNodes()) {
            extract(child, list, sb);
        }
    }

    /**
     * Dispatches on the node type: elements are descended into, text nodes are appended, and
     * comments, data nodes, doctypes and XML declarations are skipped.
     *
     * @param node node to process
     * @param list finished fragments
     * @param sb buffer of the current fragment
     * @throws IllegalArgumentException if the node is of any other type
     */
    private static void extract(Node node, List<String> list, StringBuilder sb) {
        if (node instanceof Element) {
            extract((Element) node, list, sb);
        } else if (node instanceof TextNode) {
            handleTextNode(((TextNode) node).text(), sb);
        } else if (!(node instanceof Comment
                || node instanceof DataNode
                || node instanceof DocumentType
                || node instanceof XmlDeclaration)) {
            throw new IllegalArgumentException("Unknown node type " + node.getClass().getName());
        }
    }
}
