package de.l3s.icrawl.crawler.analysis;

import com.codahale.metrics.Counter;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.Resources;
import com.sharethis.textrank.TextRank;
import de.l3s.icrawl.contentanalysis.DocumentVectorSimilarity;
import de.l3s.icrawl.contentanalysis.LanguageModels;
import de.l3s.icrawl.contentanalysis.WebPageDateExtractor;
import de.l3s.icrawl.crawler.ArchiveCrawlSpecification;
import de.l3s.icrawl.crawler.ArchiveCrawler;
import de.l3s.icrawl.crawler.CrawlUrl;
import de.l3s.icrawl.crawler.TimeSpecification;
import de.l3s.icrawl.crawler.urls.RegexUrlNormalizer;
import de.l3s.icrawl.crawler.urls.UrlCanonicalizerNormalizer;
import de.l3s.icrawl.crawler.urls.UrlFilter;
import de.l3s.icrawl.crawler.urls.UrlNormalizer;
import de.l3s.icrawl.crawler.urls.UrlNormalizers;
import de.l3s.icrawl.snapshots.Snapshot;
import de.l3s.icrawl.util.TextExtractor;
import java.io.IOException;
import java.time.ZonedDateTime;
import java.util.Collection;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openimaj.text.nlp.language.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/l3s/icrawl/crawler/analysis/ResourceAnalyser.class */
public class ResourceAnalyser {
    /** Class-wide SLF4J logger. */
    private static final Logger logger = LoggerFactory.getLogger(ResourceAnalyser.class);
    /** Filter applied to every normalized outlink; the constructor sets it to {@code UrlFilter.ONLY_HTTP}. */
    private final UrlFilter urlFilter;
    /** Normalizer applied to each absolute outlink URL before it is filtered. */
    private final UrlNormalizer urlNormalizer;
    /** Histogram ("numOutlinks") updated with the number of distinct outlinks found per analysed document. */
    private final Histogram outlinkCount;
    /** Counter ("unknownType") incremented when the snapshot content is not a {@code String}. */
    private final Counter unknowns;
    /** Counter ("empty") incremented when the extracted text is blank. */
    private final Counter empty;
    /** Scores extracted text against the reference vectors and keywords of the crawl specification. */
    private final DocumentVectorSimilarity similarity;
    /** Detects the language of the extracted text; its locale is passed to {@link #similarity}. */
    private final LanguageDetector languageDetector = new LanguageDetector();
    /** Timer ("parseTime") around HTML parsing. */
    private final Timer parseTime;
    /** Timer ("textExtractTime") around text extraction. */
    private final Timer textExtractTime;
    /** Timer ("analysisTime") around language detection and similarity computation. */
    private final Timer analysisTime;
    /** Weighting method that decides how similarity and time relevance are combined into the outlink score. */
    private final WeightingMethod method;
    /** Reference time of the crawl specification, used to compute the time relevance of a modification date. */
    private final TimeSpecification referenceTime;
    /** Timer ("dateExtractionTime") around modification date extraction. */
    private final Timer dateExtractionTime;
    /** Threshold the document similarity must exceed before time relevance is blended into the outlink score. */
    private final float timeRelevanceThreshold;
    /** Weight of the document similarity in the blended score; the constructor requires it to be within [0, 1]. */
    private final float docSimilarityWeight;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: de.l3s.icrawl.crawler.analysis.ResourceAnalyser$1, reason: invalid class name */
    /* loaded from: input_file:de/l3s/icrawl/crawler/analysis/ResourceAnalyser$1.class */
    /**
     * Switch map for {@link WeightingMethod}, as emitted by the decompiler.
     *
     * <p>The array is indexed by {@code WeightingMethod.ordinal()} and holds the case label
     * (1..6) that {@code outlinkScore} switches on. Entries that are never assigned stay 0 and
     * therefore fall into the {@code default} branch of that switch.
     *
     * <p>NOTE(review): this looks like the helper class javac synthesizes for a switch over an
     * enum; it should become unnecessary once the switch is written against the enum constants.
     */
    public static /* synthetic */ class AnonymousClass1 {
        // One slot per enum constant; the value is the case label, not the ordinal.
        static final /* synthetic */ int[] $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod = new int[WeightingMethod.values().length];

        static {
            // Each assignment is guarded separately so that one failing constant lookup does not
            // prevent the remaining entries from being registered.
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.CONTENT.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
                // Deliberately ignored: the entry keeps its default value 0.
            }
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.CONTENT_AND_TIME.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
            }
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.CONTENT_AND_TIME_EXP.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
            }
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.TIME.ordinal()] = 4;
            } catch (NoSuchFieldError e4) {
            }
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.TIME_EXP.ordinal()] = 5;
            } catch (NoSuchFieldError e5) {
            }
            try {
                $SwitchMap$de$l3s$icrawl$crawler$analysis$ResourceAnalyser$WeightingMethod[WeightingMethod.UNFOCUSED.ordinal()] = 6;
            } catch (NoSuchFieldError e6) {
            }
        }
    }

    /* loaded from: input_file:de/l3s/icrawl/crawler/analysis/ResourceAnalyser$Factory.class */
    /**
     * Factory that creates {@link ResourceAnalyser} instances sharing one metric registry and
     * one pair of scoring parameters.
     */
    public static class Factory implements ResourceAnalyserFactory {
        /** Registry handed to every analyser created by this factory. */
        private final MetricRegistry metrics;
        /** Time relevance threshold handed to every analyser created by this factory. */
        private final float timeRelevanceThreshold;
        /** Document similarity weight handed to every analyser created by this factory. */
        private final float docSimilarityWeight;

        /**
         * Stores the shared settings; nothing is validated until {@link #get} is called.
         *
         * @param registry registry in which the analysers register their metrics
         * @param threshold time relevance threshold passed on to each analyser
         * @param similarityWeight document similarity weight passed on to each analyser
         */
        public Factory(MetricRegistry registry, float threshold, float similarityWeight) {
            metrics = registry;
            timeRelevanceThreshold = threshold;
            docSimilarityWeight = similarityWeight;
        }

        /**
         * Builds a new analyser for the given crawl specification and weighting method.
         *
         * @param specification crawl specification the analyser is configured from
         * @param method weighting method the analyser uses to score outlinks
         * @return a freshly constructed analyser
         * @throws IOException if the analyser cannot be constructed
         */
        @Override // de.l3s.icrawl.crawler.analysis.ResourceAnalyserFactory
        public ResourceAnalyser get(ArchiveCrawlSpecification specification, WeightingMethod method) throws IOException {
            return new ResourceAnalyser(specification, method, metrics, timeRelevanceThreshold, docSimilarityWeight);
        }
    }

    /* loaded from: input_file:de/l3s/icrawl/crawler/analysis/ResourceAnalyser$Result.class */
    /**
     * Immutable outcome of analysing one snapshot: the outlinks found, the relevance of the
     * document and the modification date, if one was determined.
     */
    public static class Result {
        /** Shared placeholder result: no outlinks, relevance {@code -1.0} and no modification date. */
        public static final Result EMPTY = new Result(ImmutableSet.<CrawlUrl>of(), -1.0, null);
        private final Collection<CrawlUrl> outlinks;
        private final double relevance;
        private final ZonedDateTime modifiedDate;

        /**
         * @param links outlinks of the document; stored as passed, without copying
         * @param score relevance of the document
         * @param modified modification date of the document, may be {@code null}
         */
        Result(Collection<CrawlUrl> links, double score, ZonedDateTime modified) {
            outlinks = links;
            relevance = score;
            modifiedDate = modified;
        }

        /** @return the outlinks collection this result was created with */
        public Collection<CrawlUrl> getOutlinks() {
            return outlinks;
        }

        /** @return the relevance of the document; {@code -1.0} for {@link #EMPTY} */
        public double getRelevance() {
            return relevance;
        }

        /** @return the modification date, or {@code null} if none is available */
        public ZonedDateTime getModifiedDate() {
            return modifiedDate;
        }
    }

    /* loaded from: input_file:de/l3s/icrawl/crawler/analysis/ResourceAnalyser$WeightingMethod.class */
    /**
     * Strategies for scoring outlinks. Each constant records whether it takes time into account.
     *
     * <p>The declaration order must stay as it is: the ordinals are used to index a switch map
     * elsewhere in the enclosing class.
     */
    public enum WeightingMethod {
        /** Not time sensitive. */
        CONTENT(false),
        /** Time sensitive. */
        TIME(true),
        /** Time sensitive. */
        TIME_EXP(true),
        /** Time sensitive. */
        CONTENT_AND_TIME(true),
        /** Time sensitive. */
        CONTENT_AND_TIME_EXP(true),
        /** Not time sensitive. */
        UNFOCUSED(false);

        /** Flag fixed at construction; see {@link #isTimeSensitive()}. */
        private final boolean timeSensitive;

        WeightingMethod(boolean usesTime) {
            timeSensitive = usesTime;
        }

        /** @return {@code true} if this method was declared as time sensitive */
        public boolean isTimeSensitive() {
            return timeSensitive;
        }
    }

    /**
     * Creates an analyser configured from a crawl specification.
     *
     * <p>Loads the gzip-compressed German IDF dictionary from the classpath, builds the document
     * similarity model from the reference vectors, keywords, default language and correction
     * factors of the specification, and registers all histograms, counters and timers in the
     * given metric registry under names derived from {@code getClass()}.
     *
     * @param archiveCrawlSpecification specification providing reference vectors, keywords,
     *        default language, correction factors and reference time
     * @param weightingMethod method used to combine similarity and time relevance
     * @param metricRegistry registry in which the metrics of this analyser are registered
     * @param timeRelevanceThreshold threshold the document similarity must exceed before time
     *        relevance is blended into the outlink score
     * @param docSimilarityWeight weight of the document similarity in the blended score; must
     *        be within [0, 1]
     * @throws IOException if the IDF dictionary or another required resource cannot be read
     * @throws IllegalArgumentException if {@code docSimilarityWeight} is outside [0, 1]
     */
    public ResourceAnalyser(ArchiveCrawlSpecification archiveCrawlSpecification, WeightingMethod weightingMethod, MetricRegistry metricRegistry, float timeRelevanceThreshold, float docSimilarityWeight) throws IOException {
        Preconditions.checkArgument(0.0f <= docSimilarityWeight && docSimilarityWeight <= 1.0f, "docSimilarityWeight");
        this.method = weightingMethod;
        this.docSimilarityWeight = docSimilarityWeight;
        this.timeRelevanceThreshold = timeRelevanceThreshold;

        // The stream is only needed while the dictionary is read, so it is scoped to exactly
        // that call and closed once, even if reading fails.
        final Map<String, Double> idfDictionary;
        try (GZIPInputStream idfStream = new GZIPInputStream(Resources.getResource(ArchiveCrawler.IDF_DICTIONARY_DE).openStream())) {
            idfDictionary = LanguageModels.readIdfDictionary(idfStream);
        }

        LanguageModels languageModels = new LanguageModels(Locale.GERMAN, idfDictionary, archiveCrawlSpecification.getDefaultLanguage());
        this.similarity = DocumentVectorSimilarity.fromVectors(archiveCrawlSpecification.getReferenceVectors(), archiveCrawlSpecification.getKeywords(), archiveCrawlSpecification.getDefaultLanguage(), languageModels, archiveCrawlSpecification.getCorrectionFactors());
        this.referenceTime = archiveCrawlSpecification.getReferenceTime();
        this.urlFilter = UrlFilter.ONLY_HTTP;
        this.urlNormalizer = new UrlNormalizers(new UrlCanonicalizerNormalizer(), new RegexUrlNormalizer(Resources.getResource("default-regex-normalizers.xml")));

        // getClass() (rather than a class literal) is kept so metric names stay unchanged.
        this.outlinkCount = metricRegistry.histogram(MetricRegistry.name(getClass(), "numOutlinks"));
        this.unknowns = metricRegistry.counter(MetricRegistry.name(getClass(), "unknownType"));
        this.empty = metricRegistry.counter(MetricRegistry.name(getClass(), "empty"));
        this.parseTime = metricRegistry.timer(MetricRegistry.name(getClass(), "parseTime"));
        this.textExtractTime = metricRegistry.timer(MetricRegistry.name(getClass(), "textExtractTime"));
        this.analysisTime = metricRegistry.timer(MetricRegistry.name(getClass(), "analysisTime"));
        this.dateExtractionTime = metricRegistry.timer(MetricRegistry.name(getClass(), "dateExtractionTime"));
    }

    /**
     * Analyses one snapshot: parses its HTML, scores the extracted text against the reference
     * model, determines the modification date and collects the scored outlinks.
     *
     * <p>{@link Result#EMPTY} is returned when the snapshot content is not a {@code String},
     * when the extracted text is blank, or when the thread is interrupted during the analysis.
     * In the interrupted case the interrupt flag of the thread is restored before returning.
     *
     * <p>All timers are stopped even if the timed call throws.
     *
     * @param snapshot snapshot providing content, MIME type, original URL and crawl time
     * @param crawlUrl URL the snapshot was fetched for; base URL for parsing and parent of the
     *        created outlinks
     * @return the distinct outlinks, the document similarity as relevance and the modification
     *         date ({@code null} if none was found), or {@link Result#EMPTY}
     */
    public Result analyse(Snapshot snapshot, CrawlUrl crawlUrl) {
        Object content = snapshot.getContent();
        if (!(content instanceof String)) {
            logger.debug("Unhandled content type '{}' for URL '{}'", snapshot.getMimeType(), crawlUrl);
            this.unknowns.inc();
            return Result.EMPTY;
        }

        final Document document;
        try (Timer.Context ignored = this.parseTime.time()) {
            document = Jsoup.parse((String) content, crawlUrl.getUrl());
        }

        final String text;
        try (Timer.Context ignored = this.textExtractTime.time()) {
            text = TextExtractor.extractText(document);
        }
        if (text.trim().isEmpty()) {
            logger.debug("No content for URL '{}", crawlUrl);
            this.empty.inc();
            return Result.EMPTY;
        }

        final float docSimilarity;
        try (Timer.Context ignored = this.analysisTime.time()) {
            docSimilarity = (float) this.similarity.getSimilarity(this.languageDetector.classify(text).getLocale(), text);
        }

        try {
            final ZonedDateTime modifiedDate;
            // Neutral default, used when no date is found or the method ignores time.
            float timeRelevance = 1.0f;
            try (Timer.Context ignored = this.dateExtractionTime.time()) {
                WebPageDateExtractor.WebPageDate pageDate = WebPageDateExtractor.getModifiedDate(snapshot.getOriginalUrl(), document, Long.valueOf(snapshot.getCrawlTime().toInstant().toEpochMilli()), null);
                modifiedDate = pageDate != null ? pageDate.getDate() : null;
                // Time relevance only influences the score of time sensitive methods, so the
                // reference time is not consulted for the others.
                if (modifiedDate != null && this.method.isTimeSensitive()) {
                    boolean plainRelevance = this.method == WeightingMethod.TIME || this.method == WeightingMethod.CONTENT_AND_TIME;
                    timeRelevance = plainRelevance
                            ? (float) this.referenceTime.getRelevance(modifiedDate)
                            : (float) this.referenceTime.getRelevanceExp(modifiedDate);
                }
            }

            float score = outlinkScore(docSimilarity, timeRelevance);
            // A multiset builder followed by elementSet() collapses duplicate outlinks.
            ImmutableMultiset.Builder<CrawlUrl> collected = ImmutableMultiset.builder();
            for (Element link : document.select("a[href]")) {
                String absUrl = link.absUrl("href");
                if (absUrl.trim().isEmpty() || !absUrl.startsWith("http")) {
                    logger.trace("Skipping URL '{}'", absUrl);
                    continue;
                }
                String normalized = this.urlNormalizer.normalize(absUrl);
                if (this.urlFilter.apply(normalized)) {
                    collected.add(crawlUrl.outlink(normalized, score, snapshot.getCrawlTime()));
                }
            }
            Set<CrawlUrl> outlinks = collected.build().elementSet();
            this.outlinkCount.update(outlinks.size());
            logger.debug("Extracted outlinks for URL {}, got {}", crawlUrl, outlinks.size());
            return new Result(outlinks, docSimilarity, modifiedDate);
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.info("Interrupted while extracting date", e);
            return Result.EMPTY;
        }
    }

    /**
     * Combines document similarity and time relevance into the score assigned to outlinks,
     * according to the configured weighting method.
     *
     * <ul>
     *   <li>{@code CONTENT}: the document similarity.</li>
     *   <li>{@code CONTENT_AND_TIME}, {@code CONTENT_AND_TIME_EXP}: if the document similarity
     *       exceeds {@code timeRelevanceThreshold}, the blend
     *       {@code w * similarity + (1 - w) * timeRelevance} with
     *       {@code w = docSimilarityWeight}; otherwise the document similarity.</li>
     *   <li>{@code TIME}, {@code TIME_EXP}: the time relevance.</li>
     *   <li>{@code UNFOCUSED}: always {@code 1.0}.</li>
     * </ul>
     *
     * @param docSimilarity similarity of the document to the reference model
     * @param timeRelevance relevance of the modification date of the document
     * @return the outlink score
     * @throws IllegalStateException if the weighting method is not handled here
     */
    private float outlinkScore(float docSimilarity, float timeRelevance) {
        // Switching on the enum itself avoids the ordinal-indexed lookup table and magic labels.
        switch (this.method) {
            case CONTENT:
                return docSimilarity;
            case CONTENT_AND_TIME:
            case CONTENT_AND_TIME_EXP:
                if (docSimilarity > this.timeRelevanceThreshold) {
                    return (this.docSimilarityWeight * docSimilarity) + ((1.0f - this.docSimilarityWeight) * timeRelevance);
                }
                return docSimilarity;
            case TIME:
            case TIME_EXP:
                return timeRelevance;
            case UNFOCUSED:
                return 1.0f;
            default:
                throw new IllegalStateException("Unhandled weighting method " + this.method);
        }
    }
}
