package de.datexis.cdv.index;

import com.google.common.collect.ArrayListMultimap;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.encoder.IEncoder;
import de.datexis.model.Query;
import de.datexis.model.Sentence;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.retrieval.index.InMemoryIndex;
import de.datexis.retrieval.tagger.LSTMSentenceTaggerIterator;
import de.datexis.tagger.AbstractMultiDataSetIterator;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.nd4j.linalg.factory.Nd4j;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/cdv/index/QueryIndex.class */
/**
 * Base class for in-memory indexes whose entries are built from labels or labeled sentences
 * read from a {@link Resource}.
 *
 * <p>Subclasses supply {@link #encodeFromQueries(Collection)}. The remaining methods populate the
 * inherited key vocabulary and vector table through the superclass helpers
 * ({@code buildKeyIndex}, {@code encodeAndBuildVectorIndex}, {@code setModelAvailable}).
 */
public abstract class QueryIndex extends InMemoryIndex {
    /** Logger named after the concrete runtime class; available to subclasses. */
    protected final Logger log = LoggerFactory.getLogger(getClass());

    /* JADX INFO: Access modifiers changed from: protected */
    /** Creates an index using the superclass defaults. */
    public QueryIndex() {
    }

    /**
     * Creates an index that forwards the given key preprocessor and encoder to the superclass.
     *
     * @param tokenPreProcess preprocessor passed to the {@code InMemoryIndex} constructor
     * @param iEncoder encoder passed to the {@code InMemoryIndex} constructor
     */
    public QueryIndex(TokenPreProcess tokenPreProcess, IEncoder iEncoder) {
        super(tokenPreProcess, iEncoder);
    }

    /**
     * Encodes index entries from the given queries; the behavior is defined by subclasses.
     *
     * @param collection queries to encode
     */
    public abstract void encodeFromQueries(Collection<Query> collection);

    /**
     * Replaces the key vocabulary and the vector lookup table with fresh, empty instances.
     * The new table is sized with the current {@code getEmbeddingVectorSize()}.
     */
    public void clear() {
        this.keyVocabulary = new AbstractCache.Builder()
            .hugeModelExpected(false)
            .minElementFrequency(0)
            .build();
        // NOTE(review): the positional arguments are presumably (vocab, vectorLength, useAdaGrad,
        // learningRate, rng, negative, useHS) -- confirm against the DL4J version in use.
        this.lookupVectors = new InMemoryLookupTable(
            this.keyVocabulary,
            (int) getEmbeddingVectorSize(),
            true,
            0.01d,
            Nd4j.getRandom(),
            0.0d,
            true);
    }

    /**
     * Builds the key index from the labels found in {@code resource}.
     * No encoders are handed to the reader here; only its labels are used.
     *
     * @param resource source the labels are read from
     */
    public void buildKeyIndex(Resource resource) {
        LSTMSentenceTaggerIterator labelReader = new LSTMSentenceTaggerIterator(
            AbstractMultiDataSetIterator.Stage.ENCODE,
            (IEncoder) null,
            (IEncoder) null,
            resource,
            "utf-8",
            WordHelpers.Language.EN,
            true,
            1);
        buildKeyIndex(labelReader.getLabels());
    }

    /**
     * Convenience overload of {@link #encodeIndexFromSentences(Resource, Set, boolean)} that passes
     * an empty set and {@code true}.
     *
     * @param resource source the labeled sentences are read from
     */
    public void encodeIndexFromSentences(Resource resource) {
        Set<String> emptySelection = Collections.emptySet();
        encodeIndexFromSentences(resource, emptySelection, true);
    }

    /**
     * Reads every labeled sentence from {@code resource}, groups the sentences under their
     * preprocessed label, then builds the key index and the vector index from that grouping and
     * marks the model as available.
     *
     * @param resource source the labeled sentences are read from
     * @param set forwarded unchanged to the sentence reader (presumably a label filter -- TODO confirm)
     * @param z forwarded unchanged to the sentence reader (meaning not visible here -- TODO confirm)
     */
    public void encodeIndexFromSentences(Resource resource, Set<String> set, boolean z) {
        LSTMSentenceTaggerIterator sentenceReader = new LSTMSentenceTaggerIterator(
            AbstractMultiDataSetIterator.Stage.ENCODE,
            this.encoder,
            (IEncoder) null,
            resource,
            "utf-8",
            WordHelpers.Language.EN,
            set,
            z,
            1);
        this.log.info("Reading {} examples...", Long.valueOf(sentenceReader.getNumExamples()));

        // Raw types are kept as-is: the element types expected by the superclass helpers are not
        // visible from this file.
        ArrayListMultimap sentencesByKey = ArrayListMultimap.create();
        while (sentenceReader.hasNext()) {
            Map.Entry labeled = sentenceReader.nextLabeledSentence();
            String normalizedKey = this.keyPreprocessor.preProcess((String) labeled.getKey());
            sentencesByKey.put(normalizedKey, labeled.getValue());
        }

        // keys() repeats a key once per stored value, so keys with many sentences occur many times.
        buildKeyIndex(sentencesByKey.keys(), false);
        encodeAndBuildVectorIndex(sentencesByKey, false);
        setModelAvailable(true);
    }

    /**
     * Builds the index from the labels of {@code resource} alone: each distinct preprocessed label
     * is stored once, paired with a sentence created from the first raw label that produced it.
     * Afterwards the key index and vector index are built and the model is marked as available.
     *
     * @param resource source the labels are read from
     */
    public void encodeIndexFromLabels(Resource resource) {
        LSTMSentenceTaggerIterator labelReader = new LSTMSentenceTaggerIterator(
            AbstractMultiDataSetIterator.Stage.ENCODE,
            this.encoder,
            (IEncoder) null,
            resource,
            "utf-8",
            WordHelpers.Language.EN,
            true,
            64);
        List<String> labels = labelReader.getLabels();

        ArrayListMultimap sentenceByKey = ArrayListMultimap.create();
        for (String rawLabel : labels) {
            String normalizedKey = this.keyPreprocessor.preProcess(rawLabel);
            Sentence labelSentence = DocumentFactory.createSentenceFromTokenizedString(rawLabel);
            if (sentenceByKey.containsKey(normalizedKey)) {
                // Only the first label that maps to a given key is kept.
                continue;
            }
            sentenceByKey.put(normalizedKey, labelSentence);
        }

        buildKeyIndex(sentenceByKey.keys(), false);
        encodeAndBuildVectorIndex(sentenceByKey, false);
        setModelAvailable(true);
    }

    /**
     * Computes {@code min(1.0, 0.03 / sqrt(probability(str)))}.
     *
     * @param str key whose {@code probability} is looked up
     * @return the weight, never greater than {@code 1.0}
     */
    public double weightFactor(String str) {
        double scaled = 0.03d / Math.sqrt(probability(str));
        return Math.min(1.0d, scaled);
    }
}
