package de.l3s.icrawl.contentanalysis;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import de.l3s.icrawl.contentanalysis.LanguageModel;
import de.l3s.icrawl.domain.specification.NamedEntity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.concurrent.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scores documents against per-language reference vectors.
 *
 * <p>For every language the instance holds a reference {@link DocumentVector}, a
 * {@link LanguageModel.KeywordMatcher} and a correction factor. A document's score is the
 * similarity computed by {@link LanguageModels} against the reference vector of its language,
 * divided by that language's correction factor. Languages without their own reference vector
 * or matcher fall back to the ones registered for the default language.
 *
 * <p>NOTE(review): {@code defaultLanguage} is marked {@code @JsonIgnore} on the field while the
 * {@code @JsonCreator} constructor reads a {@code "defaultLanguage"} property - confirm that the
 * value actually survives a JSON round trip.
 */
@ThreadSafe
/* loaded from: input_file:de/l3s/icrawl/contentanalysis/DocumentVectorSimilarity.class */
public class DocumentVectorSimilarity implements Serializable {
    static final String TOKEN_SEPARATOR = " ";
    static final int DEFAULT_NGRAM_SIZE = 2;
    private static final Logger logger = LoggerFactory.getLogger(DocumentVectorSimilarity.class);
    private static final long serialVersionUID = 3;

    /** Reference vector per language. */
    @JsonProperty
    private final Map<Locale, DocumentVector> referenceVectors;

    /** Keyword matcher per language. */
    @JsonProperty
    private final Map<Locale, LanguageModel.KeywordMatcher> matchers;

    /**
     * Language models used to compute similarities. Declared {@code volatile} because it can be
     * replaced through {@link #setLanguageModels(LanguageModels)} after construction while other
     * threads are scoring documents.
     */
    @JsonIgnore
    private volatile LanguageModels languageModels;

    /** Language whose reference vector and matcher are used when a language has none of its own. */
    @JsonIgnore
    private final Locale defaultLanguage;

    /** Divisor applied to the raw similarity, per language; never {@code null}. */
    private final Map<Locale, Double> correctionFactors;

    /**
     * Builds reference vectors, keyword matchers and correction factors from reference documents.
     *
     * @param map reference documents mapped to their language; each key is handed to
     *        {@code LanguageModels.buildDocumentVector} as the document of that language
     * @param set keywords without a language; they are given to the matcher of every language
     * @param set2 named entities; labels with a language become keywords of that language only,
     *        labels without a language are treated like the entries of {@code set}
     * @param i if greater than zero, each merged reference vector is reduced with
     *        {@code DocumentVector.topN(i)}; otherwise the merged vector is kept as is
     * @param z flag passed through unchanged to {@code DocumentVector.merge}
     * @param locale default language used as fallback when scoring
     * @param languageModels language models used to build matchers and vectors and to compute
     *        similarities
     */
    public DocumentVectorSimilarity(Map<String, Locale> map, Set<String> set, Set<NamedEntity> set2, int i, boolean z, Locale locale, LanguageModels languageModels) {
        this.defaultLanguage = locale;
        this.languageModels = languageModels;

        // Invert "document -> language" into "language -> documents".
        Multimap<Locale, String> documentsByLanguage = Multimaps.invertFrom(Multimaps.forMap(map), ArrayListMultimap.<Locale, String>create());

        // Split entity labels into language-specific keywords and keywords shared by all languages.
        Set<String> sharedKeywords = new HashSet<>(set);
        Multimap<Locale, String> keywordsByLanguage = HashMultimap.create();
        for (NamedEntity entity : set2) {
            for (NamedEntity.Label label : entity.getLabels()) {
                if (label.getLanguage() != null) {
                    keywordsByLanguage.put(label.getLanguage(), label.getName());
                } else {
                    sharedKeywords.add(label.getName());
                }
            }
        }

        ImmutableMap.Builder<Locale, DocumentVector> vectorBuilder = ImmutableMap.builder();
        ImmutableMap.Builder<Locale, LanguageModel.KeywordMatcher> matcherBuilder = ImmutableMap.builder();
        for (Map.Entry<Locale, Collection<String>> entry : documentsByLanguage.asMap().entrySet()) {
            Locale language = entry.getKey();
            Collection<String> documents = entry.getValue();
            LanguageModel.KeywordMatcher matcher = languageModels.buildMatcher(language, Iterables.concat(keywordsByLanguage.get(language), sharedKeywords), DEFAULT_NGRAM_SIZE);
            // Kept as ArrayList (not List) so the argument type handed to DocumentVector.merge is unchanged.
            ArrayList<DocumentVector> documentVectors = Lists.newArrayListWithExpectedSize(documents.size());
            for (String document : documents) {
                documentVectors.add(languageModels.buildDocumentVector(language, document, matcher));
            }
            logger.debug("Got doc vectors for language {}: {}", language.getLanguage(), documentVectors);
            DocumentVector merged = DocumentVector.merge(documentVectors, z);
            if (i > 0) {
                merged = merged.topN(i);
            }
            vectorBuilder.put(language, merged);
            matcherBuilder.put(language, matcher);
        }
        this.referenceVectors = vectorBuilder.build();
        this.matchers = matcherBuilder.build();

        if (logger.isDebugEnabled()) {
            for (Map.Entry<Locale, DocumentVector> entry : this.referenceVectors.entrySet()) {
                logger.debug("Reference vector for language '{}': {}...", entry.getKey(), entry.getValue().topComponents(10));
            }
        }

        // Correction factor = average similarity of a language's own reference documents to its
        // reference vector; getSimilarity divides by it.
        ImmutableMap.Builder<Locale, Double> correctionBuilder = ImmutableMap.builder();
        for (Locale language : documentsByLanguage.keySet()) {
            DocumentVector referenceVector = this.referenceVectors.get(language);
            LanguageModel.KeywordMatcher matcher = this.matchers.get(language);
            double averageSimilarity = documentsByLanguage.get(language).stream()
                    .mapToDouble(document -> languageModels.getSimilarity(language, document, referenceVector, matcher))
                    .average()
                    .orElse(1.0d);
            correctionBuilder.put(language, averageSimilarity);
        }
        this.correctionFactors = correctionBuilder.build();
    }

    /**
     * Creates an instance from already computed reference vectors.
     *
     * <p>A matcher is built for every key of {@code map2}; each matcher receives the union of the
     * keywords of <em>all</em> languages, not only those of its own language.
     *
     * @param map reference vector per language
     * @param map2 keywords per language
     * @param locale default language used as fallback when scoring
     * @param languageModels language models used to build the matchers and to score documents
     * @param map3 correction factor per language; {@code null} is treated as "no corrections"
     * @return the configured instance
     */
    public static DocumentVectorSimilarity fromVectors(Map<Locale, DocumentVector> map, Map<Locale, Set<String>> map2, Locale locale, LanguageModels languageModels, Map<Locale, Double> map3) {
        Set<String> allKeywords = map2.values().stream().flatMap(Set::stream).collect(Collectors.toSet());
        ImmutableMap.Builder<Locale, LanguageModel.KeywordMatcher> matcherBuilder = ImmutableMap.builder();
        for (Locale language : map2.keySet()) {
            matcherBuilder.put(language, languageModels.buildMatcher(language, allKeywords, DEFAULT_NGRAM_SIZE));
        }
        DocumentVectorSimilarity documentVectorSimilarity = new DocumentVectorSimilarity(map, matcherBuilder.build(), locale, map3);
        documentVectorSimilarity.setLanguageModels(languageModels);
        return documentVectorSimilarity;
    }

    /**
     * Replaces the language models used for scoring.
     *
     * @param languageModels the language models to use from now on
     */
    public void setLanguageModels(LanguageModels languageModels) {
        this.languageModels = languageModels;
    }

    /**
     * Creator used for JSON deserialization and by {@link #fromVectors}.
     *
     * <p>The language models are initialised with
     * {@code new LanguageModels(locale, new HashMap<>(), locale)}; callers can supply their own
     * through {@link #setLanguageModels(LanguageModels)}.
     *
     * @param map reference vector per language
     * @param map2 keyword matcher per language
     * @param locale default language used as fallback when scoring
     * @param map3 correction factor per language; {@code null} (for example when the property is
     *        missing) is replaced by an empty map so scoring does not fail
     */
    @JsonCreator
    protected DocumentVectorSimilarity(@JsonProperty("referenceVectors") Map<Locale, DocumentVector> map, @JsonProperty("matchers") Map<Locale, LanguageModel.KeywordMatcher> map2, @JsonProperty("defaultLanguage") Locale locale, @JsonProperty("correctionFactors") Map<Locale, Double> map3) {
        this.referenceVectors = map;
        this.matchers = map2;
        this.defaultLanguage = locale;
        this.correctionFactors = map3 != null ? map3 : ImmutableMap.<Locale, Double>of();
        this.languageModels = new LanguageModels(locale, new HashMap<>(), locale);
    }

    /**
     * Returns a summary listing, per language, the result of {@code topComponents(10)} on its
     * reference vector.
     */
    @Override
    public String toString() {
        return this.referenceVectors.entrySet().stream()
                .map(entry -> String.format("%s => %s", entry.getKey(), entry.getValue().topComponents(10)))
                .collect(Collectors.joining(", ", "DocumentVectorSimilarity[", "]"));
    }

    /** Returns the matcher for {@code locale}, or the default language's matcher if there is none. */
    private LanguageModel.KeywordMatcher getMatcher(Locale locale) {
        LanguageModel.KeywordMatcher keywordMatcher = this.matchers.get(locale);
        if (keywordMatcher == null) {
            logger.debug("No keyword matcher for language '{}', falling back to default", locale);
            keywordMatcher = this.matchers.get(this.defaultLanguage);
        }
        return keywordMatcher;
    }

    /** Returns the reference vector for {@code locale}, or the default language's vector if there is none. */
    private DocumentVector getReferenceVector(Locale locale) {
        DocumentVector documentVector = this.referenceVectors.get(locale);
        if (documentVector == null) {
            logger.debug("No reference vector for language '{}', falling back to default", locale);
            documentVector = this.referenceVectors.get(this.defaultLanguage);
        }
        return documentVector;
    }

    /** @return the reference vector per language, as held by this instance (not copied) */
    public Map<Locale, DocumentVector> getReferenceVectors() {
        return this.referenceVectors;
    }

    /** @return the keyword matcher per language, as held by this instance (not copied) */
    public Map<Locale, LanguageModel.KeywordMatcher> getMatchers() {
        return this.matchers;
    }

    /** @return the correction factor per language, as held by this instance (not copied) */
    public Map<Locale, Double> getCorrectionFactors() {
        return this.correctionFactors;
    }

    /**
     * Scores a document against the reference vector of its language.
     *
     * @param locale language of the document
     * @param str the document text
     * @return {@code 0.0} if neither {@code locale} nor the default language has a reference
     *         vector; otherwise the similarity reported by the language models, divided by the
     *         correction factor registered for {@code locale} when that factor is a positive
     *         number (the similarity is returned uncorrected otherwise)
     */
    public double getSimilarity(Locale locale, String str) {
        DocumentVector referenceVector = getReferenceVector(locale);
        if (referenceVector == null) {
            logger.info("No reference vector for language {}", locale);
            return 0.0d;
        }
        LanguageModel.KeywordMatcher matcher = getMatcher(locale);
        if (matcher == null) {
            // Only logged: the (possibly null) matcher is still handed to the language models.
            logger.debug("Available keywords matchers: {}", this.matchers.keySet());
        }
        double similarity = this.languageModels.getSimilarity(locale, str, referenceVector, matcher);
        Double correction = this.correctionFactors.get(locale);
        // A missing, zero, negative or NaN factor would yield an exception, Infinity or NaN;
        // treat all of these as "no correction".
        if (correction == null || !(correction.doubleValue() > 0.0d)) {
            return similarity;
        }
        return similarity / correction.doubleValue();
    }
}
