package de.jungblut.crawl.extraction;

import de.jungblut.crawl.FetchResult;
import java.util.HashSet;
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:de/jungblut/crawl/extraction/HtmlExtrator.class */
/**
 * {@link Extractor} that downloads the page behind a URL and returns its
 * entity-unescaped HTML together with the outlinks found in it.
 */
public final class HtmlExtrator implements Extractor<HtmlFetchResult> {

    /**
     * {@link FetchResult} that additionally carries the HTML text of the
     * fetched page.
     */
    public static class HtmlFetchResult extends FetchResult {
        /** HTML of the page; {@code null} when built without a body. */
        private final String html;

        /**
         * Creates a result without HTML content ({@link #getHtml()} will
         * return {@code null}).
         *
         * @param str forwarded to {@link FetchResult}; {@code extract} passes
         *        the fetched URL here
         * @param hashSet forwarded to {@link FetchResult}; {@code extract}
         *        passes the extracted outlinks here
         */
        public HtmlFetchResult(String str, HashSet<String> hashSet) {
            super(str, hashSet);
            this.html = null;
        }

        /**
         * Creates a result that carries the given HTML content.
         *
         * @param str forwarded to {@link FetchResult}; {@code extract} passes
         *        the fetched URL here
         * @param hashSet forwarded to {@link FetchResult}; {@code extract}
         *        passes the extracted outlinks here
         * @param str2 the HTML text to expose through {@link #getHtml()}
         */
        public HtmlFetchResult(String str, HashSet<String> hashSet, String str2) {
            super(str, hashSet);
            this.html = str2;
        }

        /**
         * @return the HTML text, or {@code null} if this result was created
         *         without one
         */
        public String getHtml() {
            return this.html;
        }

        /**
         * @return the HTML text itself; note this is {@code null} when no
         *         HTML was supplied at construction
         */
        @Override // de.jungblut.crawl.FetchResult
        public String toString() {
            return this.html;
        }
    }

    /**
     * Fetches the given URL, unescapes HTML entities in the response and
     * extracts its outlinks.
     *
     * @param str the URL to fetch; must start with {@code "http"} and be at
     *        most 500 characters long
     * @return the fetch result, or {@code null} if the URL was rejected or
     *         fetching/parsing failed for any reason
     */
    @Override // de.jungblut.crawl.extraction.Extractor
    public final HtmlFetchResult extract(String str) {
        // Reject missing, non-http(s) and excessively long URLs up front.
        if (str == null || !str.startsWith("http") || str.length() > 500) {
            return null;
        }
        try {
            String html = StringEscapeUtils.unescapeHtml(
                    OutlinkExtractor.consumeStream(OutlinkExtractor.getConnection(str)));
            return new HtmlFetchResult(str, OutlinkExtractor.extractOutlinks(html, str), html);
        } catch (ParserException ignored) {
            // Parser failures are dropped without logging. This handler has to
            // precede the generic one below, otherwise it is unreachable.
            return null;
        } catch (Exception e) {
            // getMessage() may legitimately be null; fall back to toString()
            // so that logging a failure can never throw itself.
            String message = e.getMessage();
            if (message == null) {
                message = e.toString();
            }
            // Keep the log line short: at most 150 characters, no line breaks.
            if (message.length() > 150) {
                message = message.substring(0, 150);
            }
            System.err.println(message.replace("\n", "") + " >>> URL was: \"" + str + "\"");
            return null;
        }
    }
}
