package de.jungblut.crawl.extraction;

import de.jungblut.crawl.ConsoleResultWriter;
import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.SequentialCrawler;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import java.io.IOException;
import java.util.HashSet;
import java.util.concurrent.ExecutionException;
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * {@link Extractor} that downloads a page and returns its outlinks, the content
 * of its {@code <title>} tag and the text produced by boilerpipe's
 * {@link ArticleExtractor}.
 */
public final class ArticleContentExtrator
        implements Extractor<ArticleContentExtrator.ContentFetchResult> {

    /** URLs longer than this many characters are rejected by {@link #extract(String)}. */
    private static final int MAX_URL_LENGTH = 500;

    /** Matches the {@link TitleTag} nodes of a parsed document. */
    private static final NodeFilter TITLE_FILTER = new NodeClassFilter(TitleTag.class);

    private final BoilerpipeExtractor extractor = ArticleExtractor.getInstance();

    /**
     * A {@link FetchResult} that additionally carries the title and the
     * extracted text of the fetched page.
     */
    public static class ContentFetchResult extends FetchResult {
        private final String title;
        private final String text;

        /**
         * Creates a result without content; {@link #getTitle()} and
         * {@link #getText()} both return {@code null}.
         *
         * @param url the URL of the fetched page
         * @param outlinks the outlinks found on the page
         */
        public ContentFetchResult(String url, HashSet<String> outlinks) {
            this(url, outlinks, null, null);
        }

        /**
         * Creates a result with content.
         *
         * @param url the URL of the fetched page
         * @param outlinks the outlinks found on the page
         * @param title the page title
         * @param text the extracted text
         */
        public ContentFetchResult(String url, HashSet<String> outlinks, String title,
                String text) {
            super(url, outlinks);
            this.title = title;
            this.text = text;
        }

        /** @return the page title, or {@code null} if none was supplied */
        public String getTitle() {
            return this.title;
        }

        /** @return the extracted text, or {@code null} if none was supplied */
        public String getText() {
            return this.text;
        }

        /** @return the title and the text, separated by an empty line */
        @Override
        public String toString() {
            return this.title + "\n\n" + this.text;
        }
    }

    /**
     * Fetches the given URL and extracts outlinks, title and article text.
     *
     * @param str the URL to fetch
     * @return the extraction result, or {@code null} if the URL is
     *         {@code null}, does not start with {@code "http"}, is longer than
     *         {@value #MAX_URL_LENGTH} characters, or if fetching or parsing fails
     */
    @Override
    public ContentFetchResult extract(String str) {
        if (str == null || !str.startsWith("http") || str.length() > MAX_URL_LENGTH) {
            return null;
        }
        try {
            // NOTE(review): entities are unescaped BEFORE the markup is parsed, so
            // escaped markup such as "&lt;b&gt;" becomes a real tag for the
            // parsers below - confirm this is intended.
            String html = StringEscapeUtils.unescapeHtml(
                    OutlinkExtractor.consumeStream(OutlinkExtractor.getConnection(str)));
            HashSet<String> outlinks = OutlinkExtractor.extractOutlinks(html, str);
            String title = extractTitle(html);
            String text = this.extractor.getText(html);
            return new ContentFetchResult(str, outlinks, title, text);
        } catch (RuntimeException e) {
            // Unexpected failure: keep the full stack trace, but say which URL caused it.
            System.err.println("Unexpected failure >>> URL was: \"" + str + "\"");
            e.printStackTrace();
            return null;
        } catch (Exception e) {
            // Checked failures (including ParserException) are reported on a
            // single line together with the offending URL; the page is skipped.
            System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \"" + str + "\"");
            return null;
        }
    }

    /**
     * Extracts the trimmed content of the {@code <title>} tag from the given
     * HTML. If several title tags are present, the last one wins.
     *
     * @param str the HTML to inspect
     * @return the trimmed title, or the empty string if the input is
     *         {@code null} or empty or no title tag is found
     * @throws ParserException if the HTML cannot be parsed
     */
    public static String extractTitle(String str) throws ParserException {
        String title = "";
        if (str == null || str.isEmpty()) {
            return title;
        }
        SimpleNodeIterator nodes =
                new Parser(str).extractAllNodesThatMatch(TITLE_FILTER).elements();
        while (nodes.hasMoreNodes()) {
            // TITLE_FILTER only accepts TitleTag instances, so the cast is safe.
            String candidate = ((TitleTag) nodes.nextNode()).getTitle();
            if (candidate != null) {
                title = candidate.trim();
            }
        }
        return title;
    }

    /**
     * Demo entry point: processes a single article URL with a
     * {@link SequentialCrawler} and writes the result via a
     * {@link ConsoleResultWriter}.
     *
     * @param strArr command line arguments (unused)
     */
    public static void main(String[] strArr) throws IOException, InterruptedException, ExecutionException {
        new SequentialCrawler(1, new ArticleContentExtrator(), new ConsoleResultWriter()).process("http://www.spiegel.de/wissenschaft/natur/erbgut-entziffert-austern-haben-viele-anti-stress-gene-a-856902.html");
    }
}
