package de.l3s.icrawl.crawler.tools;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.common.io.Files;
import com.google.common.io.Resources;
import de.l3s.icrawl.contentanalysis.DocumentVectorSimilarity;
import de.l3s.icrawl.contentanalysis.LanguageModels;
import de.l3s.icrawl.crawler.ArchiveCrawlSpecification;
import de.l3s.icrawl.crawler.ArchiveCrawler;
import de.l3s.icrawl.crawler.TimeSpecification;
import de.l3s.icrawl.util.TextExtractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.Period;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import net.sourceforge.jwbf.core.actions.HttpActionClient;
import net.sourceforge.jwbf.core.contentRep.ParsedPage;
import net.sourceforge.jwbf.mediawiki.MediaWiki;
import net.sourceforge.jwbf.mediawiki.actions.misc.ParsePage;
import net.sourceforge.jwbf.mediawiki.actions.queries.CategoryMembersSimple;
import net.sourceforge.jwbf.mediawiki.bots.MediaWikiBot;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/l3s/icrawl/crawler/tools/CrawlSpecCreator.class */
public class CrawlSpecCreator {
    /** URL handed to the HTTP action client of the Wikipedia bot. */
    private static final String WIKIPEDIA_API_URL = "https://de.wikipedia.org/w/";

    /** Prefix that is combined with a page title to form a reference document URL. */
    private static final String WIKIPEDIA_BASE_URL = "https://de.wikipedia.org/wiki/";

    /** URL handed to the HTTP action client of the WikiNews bot. */
    private static final String WIKINEWS_API_URL = "https://de.wikinews.org/w/";

    private static final Logger logger = LoggerFactory.getLogger(CrawlSpecCreator.class);

    /** Language passed to the similarity model and to the generated crawl specification. */
    private static final Locale DEFAULT_LANGUAGE = Locale.GERMAN;

    /**
     * Rewrite rules applied by {@link #cleanUrl(String)}: every match of the key pattern is
     * replaced by the value, which in all rules is capture group 1 (the wrapped target URL).
     *
     * <p>The explicit {@code <Pattern, String>} type witness is required: without it the builder
     * chain is inferred as {@code ImmutableMap<Object, Object>}, which cannot be assigned to this
     * field.
     */
    private static final Map<Pattern, String> URL_REPLACEMENTS = ImmutableMap.<Pattern, String>builder()
            .put(Pattern.compile("https://web.archive.org/web/\\d+/(.*)"), "$1")
            .put(Pattern.compile("https://archive.is/\\d+/(.*)"), "$1")
            .put(Pattern.compile("https://archive.is/(.*)\\*$"), "$1")
            .put(Pattern.compile("http://www.webcitation.org/[a-zA-Z0-9]+\\?url=(.*)"), "$1")
            .put(Pattern.compile("http://derefer.unbubble.eu/?\\?u=(.*)"), "$1")
            .put(Pattern.compile("http://deadurl.invalid/(.*)"), "$1")
            .build();

    /** A URL is accepted by {@link #isAllowedUrl(String)} when at least one of these patterns is found in it. */
    private static final Set<Pattern> URL_PATTERNS_WHITELIST = ImmutableSet.of(Pattern.compile("^https?://[a-z0-9.-]*?\\.de/"));

    /** Bot configured with {@link #WIKIPEDIA_API_URL}. */
    private final MediaWikiBot wpBot;

    /** Bot configured with {@link #WIKINEWS_API_URL}. */
    private final MediaWikiBot wnBot;

    /** Page properties requested when parsing a Wikipedia page. */
    private final EnumSet<ParsePage.ParseProp> props;

    /** Matches a trailing parenthesised suffix (preceded by a space) at the end of a link name. */
    private final Pattern parentheses;

    /** IDF dictionary loaded in the constructor and passed on to {@link LanguageModels}. */
    private final Map<String, Double> idfDictionary;

    /** MediaWiki version reported by the Wikipedia bot; reused for every {@link ParsePage} request. */
    private final MediaWiki.Version version;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/l3s/icrawl/crawler/tools/CrawlSpecCreator$CrawlSpecBuilder.class */
    /**
     * Collects the seed URLs, keywords, reference documents and document texts of one topic and
     * turns them into an {@link ArchiveCrawlSpecification}.
     *
     * <p>This is an inner (non-static) class because {@link #createSpec(boolean)} reads the IDF
     * dictionary of the enclosing {@link CrawlSpecCreator}.
     */
    public class CrawlSpecBuilder {
        private final Multiset<String> keywords = HashMultiset.create();
        /** Document text mapped to its language; adding the same text twice keeps one entry. */
        private final Map<String, Locale> documents = new HashMap<>();
        private final Set<String> urls = new HashSet<>();
        private final List<String> referenceDocuments = new ArrayList<>();
        private final TimeSpecification timeSpecification;
        private final String name;
        private final String description;

        /**
         * @param str name of the crawl specification
         * @param str2 description of the crawl specification
         * @param timeSpecification time specification passed through to the generated specification
         */
        public CrawlSpecBuilder(String str, String str2, TimeSpecification timeSpecification) {
            this.name = str;
            this.description = str2;
            this.timeSpecification = timeSpecification;
        }

        /**
         * Cleans every URL with {@link CrawlSpecCreator#cleanUrl(String)} and stores those that
         * pass {@link CrawlSpecCreator#isAllowedUrl(String)}.
         *
         * @param collection candidate URLs
         */
        public void addUrls(Collection<String> collection) {
            collection.stream()
                    .map(CrawlSpecCreator::cleanUrl)
                    .filter(CrawlSpecCreator::isAllowedUrl)
                    .forEach(this.urls::add);
        }

        /**
         * Records a reference document without any cleaning or filtering.
         *
         * @param str reference document URL
         */
        public void addReferenceDocument(String str) {
            this.referenceDocuments.add(str);
        }

        /**
         * Cleans a single URL and stores it if it is allowed.
         *
         * @param str candidate URL
         */
        public void addUrl(String str) {
            String cleanUrl = CrawlSpecCreator.cleanUrl(str);
            if (CrawlSpecCreator.isAllowedUrl(cleanUrl)) {
                this.urls.add(cleanUrl);
            }
        }

        /**
         * Adds one occurrence of a keyword to the keyword multiset.
         *
         * @param str keyword
         */
        public void addKeyword(String str) {
            this.keywords.add(str);
        }

        /**
         * Registers a document text together with its language.
         *
         * @param str document text
         * @param locale language of the document
         */
        public void addDocument(String str, Locale locale) {
            this.documents.put(str, locale);
        }

        /**
         * Builds the crawl specification from everything collected so far.
         *
         * @param z {@code true} to include the distinct collected keywords, {@code false} to build
         *     the specification with an empty keyword set
         * @return the assembled specification
         */
        public ArchiveCrawlSpecification createSpec(boolean z) {
            Set<String> specKeywords = z ? this.keywords.elementSet() : Collections.emptySet();
            LanguageModels languageModels = new LanguageModels(Locale.GERMAN, CrawlSpecCreator.this.idfDictionary, CrawlSpecCreator.DEFAULT_LANGUAGE);
            DocumentVectorSimilarity documentVectorSimilarity = new DocumentVectorSimilarity(this.documents, specKeywords, new HashSet<>(), 100, false, CrawlSpecCreator.DEFAULT_LANGUAGE, languageModels);
            return new ArchiveCrawlSpecification(this.name, new ArrayList<>(this.urls), this.referenceDocuments, this.timeSpecification, documentVectorSimilarity.getReferenceVectors(), ImmutableMap.of(Locale.GERMAN, specKeywords), this.description, CrawlSpecCreator.DEFAULT_LANGUAGE, documentVectorSimilarity.getCorrectionFactors());
        }
    }

    /**
     * Command line entry point: reads a tab-separated topics file and writes the crawl
     * specifications of every topic into the output directory.
     *
     * <p>The first line of the file is treated as a header and skipped. Every following line must
     * provide at least seven columns: topic name, two ISO dates, two ISO-8601 periods, a
     * description and a comma-separated list of sources. The dates and periods are handed to
     * {@link #extract} unchanged.
     *
     * @param strArr {@code topicsFile.tsv [outputDirectory]}; the output directory defaults to the
     *     current working directory
     * @throws IOException if the topics file cannot be read, the output directory cannot be
     *     created, or a specification cannot be written
     * @throws IllegalArgumentException if a data line has fewer than seven columns
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length < 1) {
            System.out.println("Usage: java " + CrawlSpecCreator.class.getName() + " topicsFile.tsv [outputDirectory]");
            System.exit(1);
        }
        CrawlSpecCreator crawlSpecCreator = new CrawlSpecCreator();
        DateTimeFormatter dateFormat = DateTimeFormatter.ISO_DATE;
        // "." rather than "": new File(new File(""), child) is resolved against the system
        // default directory (the root directory on Unix), not the working directory.
        File outputDirectory = new File(strArr.length > 1 ? strArr[1] : ".");
        if (!outputDirectory.isDirectory() && !outputDirectory.mkdirs()) {
            throw new IOException("Could not create output directory " + outputDirectory);
        }
        try (BufferedReader reader = Files.newReader(new File(strArr[0]), StandardCharsets.UTF_8)) {
            // The first line is the column header.
            reader.readLine();
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // Tolerate blank lines, e.g. a trailing newline at the end of the file.
                    continue;
                }
                String[] columns = line.split("\t", 8);
                if (columns.length < 7) {
                    throw new IllegalArgumentException("Expected at least 7 tab-separated columns but got " + columns.length + " in line: " + line);
                }
                String topic = columns[0];
                crawlSpecCreator.extract(topic, Arrays.asList(columns[6].split(",\\s*")), LocalDate.parse(columns[1], dateFormat), LocalDate.parse(columns[2], dateFormat), Period.parse(columns[3]), Period.parse(columns[4]), columns[5], outputDirectory);
                logger.info("Created crawl spec for topic {}", topic);
            }
        }
    }

    public CrawlSpecCreator() throws IOException {
        URL url = new URL(WIKIPEDIA_API_URL);
        CloseableHttpClient build = HttpClientBuilder.create().setDefaultRequestConfig(RequestConfig.custom().setCookieSpec("standard").build()).setUserAgent("L3SSpecBuilder <gossen@l3s.de>").build();
        this.wpBot = new MediaWikiBot(HttpActionClient.builder().withClient(build).withUrl(url).withRequestsPerUnit(10.0d, TimeUnit.MINUTES).build());
        this.wnBot = new MediaWikiBot(HttpActionClient.builder().withClient(build).withUrl(WIKINEWS_API_URL).withRequestsPerUnit(1.0d, TimeUnit.SECONDS).build());
        this.version = this.wpBot.getVersion();
        this.props = EnumSet.of(ParsePage.ParseProp.externallinks, ParsePage.ParseProp.links, ParsePage.ParseProp.text);
        this.parentheses = Pattern.compile(" (\\([^)]+\\))$");
        GZIPInputStream gZIPInputStream = new GZIPInputStream(Resources.getResource(ArchiveCrawler.IDF_DICTIONARY_DE).openStream());
        Throwable th = null;
        try {
            try {
                this.idfDictionary = LanguageModels.readIdfDictionary(gZIPInputStream);
                if (gZIPInputStream != null) {
                    if (0 == 0) {
                        gZIPInputStream.close();
                        return;
                    }
                    try {
                        gZIPInputStream.close();
                    } catch (Throwable th2) {
                        th.addSuppressed(th2);
                    }
                }
            } catch (Throwable th3) {
                th = th3;
                throw th3;
            }
        } catch (Throwable th4) {
            if (gZIPInputStream != null) {
                if (th != null) {
                    try {
                        gZIPInputStream.close();
                    } catch (Throwable th5) {
                        th.addSuppressed(th5);
                    }
                } else {
                    gZIPInputStream.close();
                }
            }
            throw th4;
        }
    }

    /**
     * Gathers the material of one topic and writes two specification files into {@code file}:
     * {@code <str>.json} (built with keywords) and {@code <str>-noKW.json} (built without).
     *
     * @param str topic name; also used as the base name of the output files
     * @param collection sources of the topic; entries starting with {@code news:} are handled as
     *     WikiNews categories, all others as Wikipedia page titles
     * @param localDate first date argument of {@link TimeSpecification#interval}
     * @param localDate2 second date argument of {@link TimeSpecification#interval}
     * @param period first period argument of {@link TimeSpecification#interval}
     * @param period2 second period argument of {@link TimeSpecification#interval}
     * @param str2 description of the topic
     * @param file directory that receives the output files
     * @throws IOException if writing a specification file fails
     */
    public void extract(String str, Collection<String> collection, LocalDate localDate, LocalDate localDate2, Period period, Period period2, String str2, File file) throws IOException {
        final String newsPrefix = "news:";
        TimeSpecification timeSpec = TimeSpecification.interval(localDate, localDate2, period, period2);
        CrawlSpecBuilder builder = new CrawlSpecBuilder(str, str2, timeSpec);

        for (String source : collection) {
            if (!source.startsWith(newsPrefix)) {
                extractWikipediaPage(source, builder);
                continue;
            }
            extractWikiNewsCategory(source.substring(newsPrefix.length()), builder);
        }

        File withKeywords = new File(file, str + ".json");
        builder.createSpec(true).writeFile(withKeywords);
        File withoutKeywords = new File(file, str + "-noKW.json");
        builder.createSpec(false).writeFile(withoutKeywords);
    }

    /**
     * Feeds one Wikipedia page into the builder: the page URL becomes a reference document, its
     * external links become seed URL candidates, the names of its internal links become keywords
     * and its paragraph text becomes a German document.
     *
     * @param str title of the Wikipedia page
     * @param crawlSpecBuilder builder that receives the extracted data
     */
    private void extractWikipediaPage(String str, CrawlSpecBuilder crawlSpecBuilder) {
        crawlSpecBuilder.addReferenceDocument(WIKIPEDIA_BASE_URL + str);

        ParsePage request = new ParsePage(str, this.props, true, this.version);
        ParsedPage page = this.wpBot.getPerformedAction(request).getResult();

        // Protocol-relative links ("//...") are not used as seeds.
        for (String externalLink : page.getExternalLinks()) {
            if (externalLink.startsWith("//")) {
                continue;
            }
            crawlSpecBuilder.addUrl(externalLink);
        }

        for (ParsedPage.Link link : page.getLinks()) {
            String title = link.getName();
            boolean excluded = title.startsWith("Liste ") || title.startsWith("Vorlage:");
            if (excluded) {
                continue;
            }
            // Drop a trailing " (...)" suffix before using the title as a keyword.
            String keyword = this.parentheses.matcher(title).replaceFirst("");
            crawlSpecBuilder.addKeyword(keyword);
        }

        Document parsed = parseHtmlFragment(page.getText());
        boolean hasText = !page.getText().trim().isEmpty();
        String documentText = (hasText && parsed != null) ? cleanWikipediaHtml(parsed) : "";
        crawlSpecBuilder.addDocument(documentText, Locale.GERMAN);
    }

    /**
     * Adds the external links of every member of a WikiNews category to the builder as seed URL
     * candidates.
     *
     * @param str name of the WikiNews category
     * @param crawlSpecBuilder builder that receives the links
     */
    private void extractWikiNewsCategory(String str, CrawlSpecBuilder crawlSpecBuilder) {
        logger.debug("Retrieving WikiNews category {}", str);
        EnumSet<ParsePage.ParseProp> linksOnly = EnumSet.of(ParsePage.ParseProp.externallinks);
        Iterator<?> members = new CategoryMembersSimple(this.wnBot, str, new int[]{0}).iterator();
        while (members.hasNext()) {
            String article = (String) members.next();
            ParsePage request = new ParsePage(article, linksOnly, false, this.version);
            List<String> links = this.wnBot.getPerformedAction(request).getResult().getExternalLinks();
            logger.debug("Got {} links for '{}': {}", Integer.valueOf(links.size()), article, links);
            crawlSpecBuilder.addUrls(links);
        }
    }

    /**
     * Tells whether a URL is accepted as a seed.
     *
     * @param str URL to check
     * @return {@code true} if at least one pattern of {@code URL_PATTERNS_WHITELIST} is found in
     *     the URL
     */
    @VisibleForTesting
    static boolean isAllowedUrl(String str) {
        return URL_PATTERNS_WHITELIST.stream().anyMatch(allowed -> allowed.matcher(str).find());
    }

    /**
     * Applies every rule of {@code URL_REPLACEMENTS} to the URL, one after the other, each on the
     * result of the previous one.
     *
     * @param str URL to clean
     * @return the rewritten URL; equal to the input when no rule matched
     */
    @VisibleForTesting
    static String cleanUrl(String str) {
        String cleaned = str;
        for (Map.Entry<Pattern, String> rule : URL_REPLACEMENTS.entrySet()) {
            Pattern wrapper = rule.getKey();
            String replacement = rule.getValue();
            cleaned = wrapper.matcher(cleaned).replaceAll(replacement);
        }
        if (logger.isDebugEnabled()) {
            if (!cleaned.equals(str)) {
                logger.debug("Replaced URL {} with {}", str, cleaned);
            }
        }
        return cleaned;
    }

    /**
     * Removes every direct child of the document body that is not a {@code p} element and returns
     * the text of what remains. The passed document is modified.
     *
     * @param document parsed page content
     * @return text extracted from the reduced document
     */
    private String cleanWikipediaHtml(Document document) {
        for (Element child : document.body().children()) {
            boolean isParagraph = child.tagName().equals("p");
            if (isParagraph) {
                continue;
            }
            child.remove();
        }
        return domFragmentToString(document);
    }

    /**
     * Parses an HTML body fragment by delegating to {@link Jsoup#parseBodyFragment(String)}.
     *
     * @param str HTML fragment
     * @return the parsed document
     */
    Document parseHtmlFragment(String str) {
        return Jsoup.parseBodyFragment(str);
    }

    /**
     * Converts a document to text by delegating to {@link TextExtractor#extractText}.
     *
     * @param document document to convert
     * @return the extracted text
     */
    private String domFragmentToString(Document document) {
        return TextExtractor.extractText(document);
    }
}
