package de.jungblut.crawl.extraction;

import de.jungblut.crawl.FetchResult;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.mozilla.universalchardet.CharsetListener;
import org.mozilla.universalchardet.UniversalDetector;

/* loaded from: input_file:de/jungblut/crawl/extraction/OutlinkExtractor.class */
/**
 * {@link Extractor} that downloads a page and collects the outgoing links found in
 * its anchor tags.
 *
 * <p>Links are accepted only if they pass {@link #isValid(String)}: they must start
 * with an http(s) base URL, must not end in one of the ignored binary/static file
 * suffixes, and must consist solely of the characters allowed by the general URL
 * pattern. Links that are not absolute are resolved against the page URL first.
 */
public final class OutlinkExtractor implements Extractor<FetchResult> {
    /** Initial size of the download buffer in bytes (1 MiB); it grows on demand. */
    private static final int BUFFER_SIZE = 1048576;
    private static final String USER_AGENT_KEY = "User-Agent";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";
    private static final NodeFilter LINK_FILTER = new NodeClassFilter(LinkTag.class);
    private static final Pattern IGNORE_SUFFIX_PATTERN = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|iso|rm|smil|wmv|swf|wma|zip|rar|gz))$");
    private static final Pattern BASE_URL_PATTERN = Pattern.compile("(http[s]*://[a-z0-9.-]+)");
    private static final Pattern GENERAL_URL_PATTERN = Pattern.compile("\\bhttps?://[-a-zA-Z0-9+&#/%?=~_|!:,.;]*[-a-zA-Z0-9+&#/%=~_|]");
    /** Matches a leading URI scheme such as {@code mailto:} or {@code ftp:}. */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.-]*:");

    /**
     * Fetches the given URL and extracts its outgoing links.
     *
     * @param str the URL to fetch; must start with {@code "http"} and be at most 500
     *        characters long
     * @return the fetch result holding the URL and its outlinks, or {@code null} if the
     *         URL was rejected or fetching/parsing failed
     */
    @Override // de.jungblut.crawl.extraction.Extractor
    public FetchResult extract(String str) {
        if (str == null || !str.startsWith("http") || str.length() > 500) {
            return null;
        }
        // Handlers are ordered from most to least specific; a broader catch placed
        // first would make the later ones unreachable.
        try {
            return new FetchResult(str, extractOutlinks(consumeStream(getConnection(str)), str));
        } catch (ParserException ignored) {
            // Deliberate best-effort: a page that cannot be parsed is skipped silently.
            return null;
        } catch (RuntimeException e) {
            e.printStackTrace();
            return null;
        } catch (Exception e) {
            System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \"" + str + "\"");
            return null;
        }
    }

    /**
     * Opens a connection to the given URL, sending a browser-like user agent, and
     * returns its response stream.
     *
     * <p>NOTE(review): no connect/read timeout is configured here, so a stalled
     * server can presumably block the caller indefinitely — confirm whether that is
     * acceptable for the crawler.
     *
     * @param str the URL to open
     * @return the response body stream; the caller is responsible for closing it
     * @throws IOException if the URL is malformed or the connection fails
     */
    public static InputStream getConnection(String str) throws IOException {
        URLConnection openConnection = new URL(str).openConnection();
        openConnection.addRequestProperty(USER_AGENT_KEY, USER_AGENT);
        return openConnection.getInputStream();
    }

    /**
     * Removes, in place, every entry that does not fully match the given pattern.
     *
     * @param hashSet the set to filter; it is modified and returned
     * @param pattern the pattern entries must match, or {@code null} to keep everything
     * @return the same set instance that was passed in
     */
    public static HashSet<String> filter(HashSet<String> hashSet, Pattern pattern) {
        if (pattern != null) {
            Iterator<String> it = hashSet.iterator();
            while (it.hasNext()) {
                if (!pattern.matcher(it.next()).matches()) {
                    it.remove();
                }
            }
        }
        return hashSet;
    }

    /**
     * Extracts all valid outgoing links from the anchor tags of a page.
     *
     * <p>Fragments (everything from the first {@code '#'}) are stripped. A link that
     * is not valid as-is is resolved against the page URL and added only if the
     * resolved form is valid.
     *
     * @param str the page content handed to the HTML parser
     * @param str2 the URL the page was fetched from, used to resolve relative links
     * @return the set of valid outlinks, or {@code null} if no base URL could be
     *         extracted from {@code str2}
     * @throws ParserException if the HTML parser fails
     */
    public static HashSet<String> extractOutlinks(String str, String str2) throws ParserException {
        String baseUrl = extractBaseUrl(str2);
        if (baseUrl == null) {
            return null;
        }
        HashSet<String> outlinks = new HashSet<>();
        SimpleNodeIterator nodes = new Parser(str).extractAllNodesThatMatch(LINK_FILTER).elements();
        while (nodes.hasMoreNodes()) {
            // LINK_FILTER only lets LinkTag nodes through, so the cast is safe.
            String link = ((LinkTag) nodes.nextNode()).getLink();
            if (link == null) {
                continue;
            }
            link = link.trim();
            int fragmentStart = link.indexOf('#');
            if (fragmentStart >= 0) {
                link = link.substring(0, fragmentStart);
            }
            if (link.isEmpty()) {
                continue;
            }
            String candidate = isValid(link) ? link : resolveLink(link, str2, baseUrl);
            if (candidate != null && isValid(candidate)) {
                outlinks.add(candidate);
            }
        }
        return outlinks;
    }

    /**
     * Resolves a non-absolute link against the page it was found on. Exactly one
     * resolution strategy is applied per link.
     *
     * @param link the non-empty link text that failed validation as-is
     * @param pageUrl the URL of the page containing the link
     * @param baseUrl the scheme-and-host prefix extracted from {@code pageUrl}
     * @return the resolved URL, or {@code null} if the link carries its own scheme
     *         and therefore must not be treated as relative
     */
    private static String resolveLink(String link, String pageUrl, String baseUrl) {
        if (link.startsWith("//")) {
            // Protocol-relative link: inherit the scheme of the containing page.
            return baseUrl.substring(0, baseUrl.indexOf(':') + 1) + link;
        }
        if (link.charAt(0) == '/') {
            // Root-relative link: append to scheme + host.
            return baseUrl + link;
        }
        if (SCHEME_PATTERN.matcher(link).find()) {
            // Absolute link with its own scheme (mailto:, tel:, ftp:, or an http link
            // that failed validation); gluing it onto the page URL would fabricate a
            // URL that does not exist.
            return null;
        }
        if (pageUrl.endsWith("/")) {
            return pageUrl + link;
        }
        int lastSlash = pageUrl.lastIndexOf('/');
        int schemeSeparator = pageUrl.indexOf("://");
        if (schemeSeparator >= 0 && lastSlash <= schemeSeparator + 2) {
            // The page URL has no path, so the last slash belongs to "://";
            // resolve against the host root instead of cutting the host off.
            return pageUrl + '/' + link;
        }
        return pageUrl.substring(0, lastSlash + 1) + link;
    }

    /**
     * Reads the whole stream into memory and decodes it using the charset reported
     * by the charset detector, falling back to UTF-8 when none is detected. The
     * stream is always closed, whether reading succeeds or fails.
     *
     * @param inputStream the stream to consume and close
     * @return the decoded content
     * @throws IOException if reading fails or the detected charset is unsupported
     */
    public static String consumeStream(InputStream inputStream) throws IOException {
        try (InputStream in = inputStream) {
            UniversalDetector detector = new UniversalDetector((CharsetListener) null);
            ReadableByteChannel channel = Channels.newChannel(in);
            ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);
            int read;
            while ((read = channel.read(buffer)) != -1) {
                // Feed only the bytes that were just appended to the buffer.
                detector.handleData(buffer.array(), buffer.position() - read, read);
                buffer = resizeBuffer(buffer);
            }
            detector.dataEnd();
            String detectedCharset = detector.getDetectedCharset();
            return new String(buffer.array(), 0, buffer.position(), detectedCharset == null ? "UTF-8" : detectedCharset);
        }
    }

    /**
     * Returns a buffer with room for more data: the same buffer while at least 10%
     * of its capacity is free, otherwise a new buffer of twice the capacity holding
     * a copy of the bytes written so far.
     */
    private static ByteBuffer resizeBuffer(ByteBuffer byteBuffer) {
        ByteBuffer byteBuffer2 = byteBuffer;
        if (byteBuffer.remaining() < ((int) (byteBuffer.capacity() * 0.1f))) {
            byteBuffer2 = ByteBuffer.allocate(byteBuffer.capacity() * 2);
            // Switch the old buffer to read mode so its content can be copied over.
            byteBuffer.flip();
            byteBuffer2.put(byteBuffer);
        }
        return byteBuffer2;
    }

    /**
     * Extracts the first scheme-and-host prefix (e.g. {@code http://example.com})
     * found in the given string.
     *
     * @param str the URL to inspect; may be {@code null}
     * @return the matched base URL, or {@code null} if there is none
     */
    public static String extractBaseUrl(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = BASE_URL_PATTERN.matcher(str);
        if (matcher.find()) {
            return matcher.group();
        }
        return null;
    }

    /**
     * Checks whether a link is an acceptable outlink: it starts with an http(s) base
     * URL, does not end in an ignored file suffix, and matches the general URL
     * pattern in full.
     *
     * @param str the link to check; may be {@code null}
     * @return {@code true} if the link should be kept
     */
    public static boolean isValid(String str) {
        if (str == null) {
            return false;
        }
        // lookingAt() anchors the match at index 0, which is what find() followed by
        // start() == 0 expressed.
        return BASE_URL_PATTERN.matcher(str).lookingAt()
                && !IGNORE_SUFFIX_PATTERN.matcher(str).matches()
                && GENERAL_URL_PATTERN.matcher(str).matches();
    }
}
