package de.l3s.icrawl.contentanalysis;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import de.l3s.icrawl.domain.specification.NamedEntity;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import javax.annotation.Nullable;
import org.openimaj.text.nlp.language.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/l3s/icrawl/contentanalysis/ContentAnalyser.class */
/**
 * Analyses the paragraphs of a document: detects the language of the joined text, asks
 * {@link TextRankWrapper} for keywords, collects named entities through an optional
 * {@link Labeler}, and counts the word tokens of every paragraph.
 */
public class ContentAnalyser {
    /** Characters that separate tokens: any whitespace plus the listed punctuation marks. */
    private static final CharMatcher SEPARATOR_MATCHER = CharMatcher.WHITESPACE.or(CharMatcher.anyOf("<>|“”„‚‘’,;.:-_'+*`'()$%!\"?"));
    /** Splits text on {@link #SEPARATOR_MATCHER} and drops the empty pieces between adjacent separators. */
    private static final Splitter TEXT_SPLITTER = Splitter.on(SEPARATOR_MATCHER).omitEmptyStrings();
    private static final Logger logger = LoggerFactory.getLogger(ContentAnalyser.class);
    private final LanguageDetector languageDetector;
    /** May be {@code null}; in that case no named entities are extracted. */
    private final LabelerFactory labelerFactory;
    private final TextRankWrapper textRank = new TextRankWrapper();

    /** Result holder for one {@link ContentAnalyser#analyze(List, Set)} run. */
    public static class Counts {
        private final Multiset<String> keywords;
        private final Multiset<NamedEntity> entities;
        private final List<String> detectedKeywords;
        private final long documentLength;
        private final Locale language;

        Counts(Multiset<String> multiset, Multiset<NamedEntity> multiset2, List<String> list, long j, Locale locale) {
            this.keywords = multiset;
            this.entities = multiset2;
            this.detectedKeywords = list;
            this.documentLength = j;
            this.language = locale;
        }

        /** @return occurrences of the caller-specified keywords found in the document */
        public Multiset<String> getKeywords() {
            return this.keywords;
        }

        /** @return occurrences of the named entities reported by the labeler (empty without a labeler) */
        public Multiset<NamedEntity> getEntities() {
            return this.entities;
        }

        /** @return the number of word tokens counted over all paragraphs */
        public long getDocumentLength() {
            return this.documentLength;
        }

        /** @return the keywords returned by TextRank, or an empty list if TextRank failed */
        public List<String> getDetectedKeywords() {
            return this.detectedKeywords;
        }

        /** @return the locale reported by the language detector for the joined text */
        public Locale getLanguage() {
            return this.language;
        }

        /**
         * Returns the distinct elements of a multiset, most frequent first, limited to a maximum size.
         *
         * @param multiset the multiset to read; it is not modified
         * @param i the maximum number of elements to return
         * @param <T> the element type
         * @return an immutable list of at most {@code i} distinct elements in descending count order
         */
        public static <T> List<T> topK(Multiset<T> multiset, int i) {
            ImmutableList.Builder<T> builder = ImmutableList.builder();
            // Iterate typed entries directly; the entry set holds Multiset.Entry<T>, not T.
            for (Multiset.Entry<T> entry : Iterables.limit(Multisets.copyHighestCountFirst(multiset).entrySet(), i)) {
                builder.add(entry.getElement());
            }
            return builder.build();
        }
    }

    /**
     * @param languageDetector detector used to classify the language of the analysed text
     * @param labelerFactory source of language-specific labelers, or {@code null} to skip entity extraction
     */
    public ContentAnalyser(LanguageDetector languageDetector, @Nullable LabelerFactory labelerFactory) {
        this.languageDetector = languageDetector;
        this.labelerFactory = labelerFactory;
    }

    /**
     * Analyses a document given as a list of paragraphs.
     *
     * <p>The paragraphs are joined with newlines for language detection and TextRank; entity
     * extraction and word counting run per paragraph. A {@link RuntimeException} thrown by TextRank
     * is logged at debug level and results in an empty detected-keyword list.
     *
     * @param list the paragraphs of the document
     * @param set the keywords to count; tokens are lower-cased before lookup, so entries are
     *     presumably expected in lower case (confirm against callers). When empty, tokens are
     *     only counted.
     * @return the collected counts, detected keywords, document length and detected language
     */
    public Counts analyze(List<String> list, Set<String> set) {
        Multiset<String> keywordCounts = HashMultiset.create(set.size());
        Multiset<NamedEntity> entityCounts = HashMultiset.create();
        String text = joinParagraphs(list);
        LanguageDetector.WeightedLocale classify = this.languageDetector.classify(text);
        Labeler labeler = this.labelerFactory != null ? this.labelerFactory.get(classify) : null;

        List<String> detectedKeywords;
        try {
            // NOTE(review): 10 is presumably the number of keywords requested; confirm in TextRankWrapper.
            detectedKeywords = this.textRank.rank(text, classify.getLocale(), 10);
        } catch (RuntimeException e) {
            String snippet = text.length() > 50 ? text.substring(0, 50) + "..." : text;
            // The exception is the trailing argument without a placeholder, so SLF4J logs its stack trace.
            logger.debug("Exception while running TextRank on '{}'@{}: ", new Object[] {snippet, classify.getLocale(), e});
            detectedKeywords = Collections.emptyList();
        }

        boolean hasKeywords = !set.isEmpty();
        long documentLength = 0;
        for (String paragraph : list) {
            entityCounts.addAll(extractEntities(labeler, paragraph));
            documentLength += hasKeywords
                    ? extractSpecifiedKeywords(paragraph, keywordCounts, set)
                    : countWords(paragraph);
        }
        return new Counts(keywordCounts, entityCounts, detectedKeywords, documentLength, classify.getLocale());
    }

    /**
     * Counts the word tokens of a paragraph and records every token that is a specified keyword.
     *
     * @param text the paragraph to tokenize
     * @param found receives one occurrence per token whose lower-cased form is in {@code keywords}
     * @param keywords the keywords to look for
     * @return the number of word tokens in {@code text}
     */
    private long extractSpecifiedKeywords(String text, Multiset<String> found, Set<String> keywords) {
        long words = 0;
        for (String token : TEXT_SPLITTER.split(text)) {
            if (isWord(token)) {
                words++;
                // Locale.ROOT keeps the casing independent of the JVM default locale
                // (e.g. a Turkish default would map 'I' to a dotless 'ı').
                String normalized = token.toLowerCase(Locale.ROOT).trim();
                if (keywords.contains(normalized)) {
                    found.add(normalized);
                }
            }
        }
        return words;
    }

    /**
     * Counts the word tokens of a paragraph.
     *
     * @param text the paragraph to tokenize
     * @return the number of word tokens in {@code text}
     */
    private long countWords(String text) {
        long words = 0;
        for (String token : TEXT_SPLITTER.split(text)) {
            if (isWord(token)) {
                words++;
            }
        }
        return words;
    }

    /**
     * Tells whether a token counts as a word, i.e. it does not consist solely of separator characters.
     * Defensive guard: tokens produced by {@link #TEXT_SPLITTER} should already be free of separators.
     */
    private static boolean isWord(String token) {
        return !SEPARATOR_MATCHER.matchesAllOf(token);
    }

    /**
     * Extracts the named entities of a paragraph.
     *
     * @param labeler the labeler to use, or {@code null}
     * @param str the paragraph text
     * @return the entities reported by the labeler, or an empty set when {@code labeler} is {@code null}
     */
    protected Collection<NamedEntity> extractEntities(Labeler labeler, String str) {
        if (labeler == null) {
            return Collections.emptySet();
        }
        return labeler.extractEntities(str);
    }

    /** Joins the paragraphs into a single text, separated by newline characters. */
    private static String joinParagraphs(List<String> list) {
        return Joiner.on('\n').join(list);
    }
}
