package de.datexis.index.impl;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import de.datexis.common.ObjectSerializer;
import de.datexis.common.Resource;
import de.datexis.index.ArticleIndex;
import de.datexis.index.ArticleRef;
import de.datexis.index.WikiDataArticle;
import de.datexis.model.Article;
import de.datexis.preprocess.MinimalLowercasePreprocessor;
import info.debatty.java.stringsimilarity.JaroWinkler;
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/index/impl/LuceneArticleIndex.class */
/**
 * {@link ArticleIndex} implementation that stores articles in a Lucene index and answers
 * name, prefix, ID and URL lookups against it.
 *
 * <p>Name and term fields are analyzed with an ICU tokenizer plus ICU folding; ID, URL and
 * vector fields are treated as single keywords (see {@link #buildAnalyzer()}). Name queries
 * fetch up to {@link #NUM_BM25_CANDIDATES} candidates from Lucene and re-rank them by
 * Jaro-Winkler similarity between the query and each stored name.
 *
 * <p>An index has to be opened (or created) before any query method is used; until then the
 * searcher is {@code null} and queries log an error and return empty results.
 */
public class LuceneArticleIndex extends ArticleIndex {
    protected static final String FIELD_TITLE = "title";
    protected static final String FIELD_TYPE = "type";
    protected static final String FIELD_REFIDS = "refID_";
    protected static final String FIELD_REFIDS_WIKIDATA = "refID_wikidata";
    protected static final String FIELD_REFIDS_WIKIPEDIA = "refID_wikipedia";
    protected static final String FIELD_REFIDS_FREEBASE = "refID_freebase";
    protected static final String FIELD_REFIDS_UMLS = "refID_umls";
    protected static final String FIELD_REFIDS_ICD10 = "refID_icd10";
    protected static final String FIELD_REFURLS_WIKIPEDIA = "refURL_wikipedia";
    protected static final String FIELD_TEXT = "text";
    protected static final String FIELD_DESCRIPTION = "description";
    protected static final String FIELD_NAMES = "name";
    protected static final String FIELD_TERMS = "term";
    protected static final String FIELD_VECTOR = "vector";
    /** Slop appended to the phrase query in {@link #queryIndexNames(String, int)}. */
    protected static final String PARAM_PROXIMITY = "2";
    protected static final String PARAM_FUZZY = "0.8";
    /** Number of Lucene hits fetched before re-ranking in {@link #queryNames(String, int)}. */
    protected static final int NUM_BM25_CANDIDATES = 1024;

    protected static final Logger log = LoggerFactory.getLogger(LuceneArticleIndex.class);

    /** Stored, tokenized, indexed with positions; used for name and term fields. */
    static final FieldType FIELDTYPE_NAME =
            createStoredFieldType(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, true);
    /** Stored only, not indexed; used for the serialized vector field. */
    static final FieldType FIELDTYPE_VECTOR = createStoredFieldType(IndexOptions.NONE, false);

    protected IndexReader reader;
    protected IndexSearcher searcher;
    protected Analyzer analyzer;
    protected final NormalizedStringSimilarity sim = new JaroWinkler();
    protected final TokenPreProcess preprocessor = new MinimalLowercasePreprocessor();

    /** Builds a frozen, stored, norm-less field type with the given index options. */
    private static FieldType createStoredFieldType(IndexOptions indexOptions, boolean tokenized) {
        FieldType type = new FieldType();
        type.setIndexOptions(indexOptions);
        type.setTokenized(tokenized);
        type.setStored(true);
        type.setOmitNorms(true);
        type.freeze();
        return type;
    }

    /**
     * Opens an existing on-disk index.
     *
     * @param resource location of the index directory
     * @return {@code true} if the index was opened, {@code false} if an I/O error occurred
     */
    public boolean openIndex(Resource resource) {
        try {
            Directory directory = FSDirectory.open(resource.getPath());
            return openIndex(directory);
        } catch (IOException e) {
            log.error("could not open index directory", e);
            return false;
        }
    }

    /**
     * Opens a reader and searcher on the given directory and (re)builds the analyzer.
     *
     * @return {@code true} on success, {@code false} if the directory could not be read
     */
    private boolean openIndex(Directory directory) {
        try {
            this.reader = DirectoryReader.open(directory);
            this.searcher = new IndexSearcher(this.reader);
            this.analyzer = buildAnalyzer();
            return true;
        } catch (IOException e) {
            log.error("could not open index", e);
            return false;
        }
    }

    /**
     * Builds an in-memory index from the given articles and opens it for querying.
     *
     * @param it articles to index; see {@link #createIndex(Iterator, Directory)}
     */
    public void createIndexRAM(Iterator<? extends Article> it) {
        Directory ramDirectory = new RAMDirectory();
        createIndex(it, ramDirectory);
        openIndex(ramDirectory);
    }

    /**
     * Builds an on-disk index from the given articles and opens it for querying.
     *
     * @param it articles to index; see {@link #createIndex(Iterator, Directory)}
     * @param resource location of the index directory
     * @throws IOException if the directory cannot be opened
     */
    public void createIndexDirectory(Iterator<? extends Article> it, Resource resource) throws IOException {
        Directory fsDirectory = FSDirectory.open(resource.getPath());
        createIndex(it, fsDirectory);
        openIndex(fsDirectory);
    }

    /**
     * Writes one Lucene document per article into the given directory.
     *
     * <p>Every element is cast to {@link WikiDataArticle}; other article types cause a
     * {@link ClassCastException}. I/O failures are logged, not rethrown.
     *
     * @param it articles to index
     * @param directory target Lucene directory
     */
    public void createIndex(Iterator<? extends Article> it, Directory directory) {
        log.info("creating new index...");
        this.analyzer = buildAnalyzer();
        int written = 0;
        // try-with-resources: the writer (and its directory write lock) must be released even
        // when indexing fails half-way, otherwise the directory stays locked.
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(this.analyzer))) {
            log.info("writing articles...");
            while (it.hasNext()) {
                writer.addDocument(createLuceneDocument((WikiDataArticle) it.next()));
                written++;
                if (written % 100000 == 0) {
                    log.info("wrote {} articles so far", written);
                }
            }
        } catch (IOException e) {
            log.error("could not write index", e);
            return;
        }
        log.info("{} articles (0 empty) written to index", written);
    }

    /**
     * Finds articles by name and ranks them by string similarity.
     *
     * <p>Each candidate returned by {@link #queryIndexNames(String, int)} is scored with the
     * highest Jaro-Winkler similarity between the preprocessed query and any of its
     * preprocessed names; results are then sorted with {@code ArticleRef.ScoreComparator}.
     *
     * @param str name to look up; {@code null} yields an empty list
     * @param i maximum number of results to return
     * @return at most {@code i} scored article references
     */
    @Override // de.datexis.index.ArticleIndex
    public List<ArticleRef> queryNames(String str, int i) {
        if (str == null) {
            return new ArrayList<>();
        }
        List<Document> candidates = queryIndexNames(str, NUM_BM25_CANDIDATES);
        // The normalized query is the same for every candidate name, so compute it once.
        String normalizedQuery = this.preprocessor.preProcess(str);
        List<ArticleRef> results = new ArrayList<>(candidates.size());
        for (Document candidate : candidates) {
            Article article = createWikidataArticle(candidate);
            double bestSimilarity = 0.0d;
            for (Object name : article.getNames()) {
                double similarity =
                        this.sim.similarity(this.preprocessor.preProcess((String) name), normalizedQuery);
                if (similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                }
            }
            ArticleRef ref = new ArticleRef(article);
            ref.setScore(bestSimilarity);
            results.add(ref);
        }
        Collections.sort(results, new ArticleRef.ScoreComparator());
        return Lists.newArrayList(Iterables.limit(results, i));
    }

    /**
     * Finds articles whose name matches the given prefix query.
     *
     * @param str prefix text
     * @param i maximum number of results
     * @return article references scored with the Lucene hit score
     */
    @Override // de.datexis.index.ArticleIndex
    public List<ArticleRef> queryPrefixNames(String str, int i) {
        return queryIndexPrefix(str, i);
    }

    /**
     * Looks up an article by ID; delegates to {@link #queryWikidataID(String)}.
     *
     * @param str Wikidata ID
     * @return the matching article reference, if any
     */
    @Override // de.datexis.index.ArticleIndex
    public Optional<ArticleRef> queryID(String str) {
        return queryWikidataID(str);
    }

    /**
     * Looks up an article by the value of its Wikidata reference ID field.
     *
     * @param str Wikidata ID
     * @return the matching article reference, if any
     */
    public Optional<ArticleRef> queryWikidataID(String str) {
        return queryIndexID(FIELD_REFIDS_WIKIDATA, str).map(this::createWikidataArticleRef);
    }

    /**
     * Looks up an article by its Wikipedia URL.
     *
     * <p>The URL is normalized to the {@code https://} scheme (added if no scheme is present)
     * and passed through {@link #decodeWikiUrl(String)} before the lookup.
     *
     * @param str Wikipedia URL; {@code null} or empty yields an empty result
     * @return the matching article reference, if any
     */
    public Optional<ArticleRef> queryWikipediaURL(String str) {
        if (str == null || str.isEmpty()) {
            return Optional.empty();
        }
        String url;
        if (str.startsWith("http://")) {
            url = "https://" + str.substring("http://".length());
        } else if (str.startsWith("https://")) {
            url = str;
        } else {
            url = "https://" + str;
        }
        return queryIndexID(FIELD_REFURLS_WIKIPEDIA, decodeWikiUrl(url)).map(this::createWikidataArticleRef);
    }

    /**
     * Looks up an article by its Wikipedia page reference ID.
     *
     * @param str page identifier, decoded with {@link #decodeWikiUrl(String)} before the
     *     lookup; {@code null} or empty yields an empty result
     * @return the matching article reference, if any
     */
    public Optional<ArticleRef> queryWikipediaPage(String str) {
        // Same guard as queryWikipediaURL: decodeWikiUrl cannot handle null.
        if (str == null || str.isEmpty()) {
            return Optional.empty();
        }
        return queryIndexID(FIELD_REFIDS_WIKIPEDIA, decodeWikiUrl(str)).map(this::createWikidataArticleRef);
    }

    /**
     * URL-decodes the given string, replaces spaces with underscores and removes a trailing
     * {@code #fragment}.
     *
     * <p>If the string cannot be decoded (unsupported encoding or malformed percent escapes),
     * the undecoded string is used instead.
     *
     * @param str URL or page title, must not be {@code null}
     * @return the normalized string
     */
    protected String decodeWikiUrl(String str) {
        String decoded = str;
        try {
            decoded = URLDecoder.decode(str, "UTF-8");
        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
            // IllegalArgumentException: malformed escapes such as a bare '%' in a title.
            log.debug("could not decode URL '{}'", str);
        }
        return decoded.replace(" ", "_").replaceFirst("#.+$", "");
    }

    /** @return the sorted, de-duplicated values of the title field over all documents */
    public Collection<String> getAllArticleTitles() {
        return getAllFields(FIELD_TITLE);
    }

    /** @return the sorted, de-duplicated values of the name field over all documents */
    public Collection<String> getAllArticleNames() {
        return getAllFields(FIELD_NAMES);
    }

    /** @return the sorted, de-duplicated values of the term field over all documents */
    public Collection<String> getAllArticleTerms() {
        return getAllFields(FIELD_TERMS);
    }

    /** @return the sorted, de-duplicated values of the Wikipedia URL field over all documents */
    public Collection<String> getAllArticleURLs() {
        return getAllFields(FIELD_REFURLS_WIKIPEDIA);
    }

    /** @return the sorted, de-duplicated values of the Wikidata ID field over all documents */
    public Collection<String> getAllArticleIDs() {
        return getAllFields(FIELD_REFIDS_WIKIDATA);
    }

    /**
     * Collects all stored values of one field by visiting every document number below
     * {@code maxDoc()}.
     *
     * @param str field name
     * @return sorted set of values; whatever was collected so far if reading fails
     */
    protected Collection<String> getAllFields(String str) {
        TreeSet<String> values = new TreeSet<>();
        try {
            IndexReader indexReader = this.searcher.getIndexReader();
            // Load only the requested field instead of the whole stored document.
            TreeSet<String> fieldsToLoad = new TreeSet<>();
            fieldsToLoad.add(str);
            int maxDoc = indexReader.maxDoc();
            for (int docId = 0; docId < maxDoc; docId++) {
                Collections.addAll(values, indexReader.document(docId, fieldsToLoad).getValues(str));
            }
        } catch (Exception e) {
            log.error("could not read values of field '{}'", str, e);
        }
        return values;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Converts an article into a Lucene document with title, type, description, names, terms,
     * reference IDs and Wikipedia URL. Fields whose value is {@code null} are omitted.
     *
     * @param wikiDataArticle article to convert
     * @return the populated document
     */
    public Document createLuceneDocument(WikiDataArticle wikiDataArticle) {
        Document document = new Document();
        // A missing title is skipped like any other null field instead of failing on trim().
        String title = wikiDataArticle.getTitle();
        addTextField(document, FIELD_TITLE, title == null ? null : title.trim(), Field.Store.YES);
        addTextField(document, FIELD_TYPE, wikiDataArticle.getType(), Field.Store.YES);
        addTextField(document, FIELD_DESCRIPTION, wikiDataArticle.getDescription(), Field.Store.YES);
        for (Object name : wikiDataArticle.getNames()) {
            addNameField(document, FIELD_NAMES, (String) name);
        }
        for (Object term : wikiDataArticle.getTerms()) {
            addNameField(document, FIELD_TERMS, (String) term);
        }
        addStringField(document, FIELD_REFIDS_WIKIDATA, wikiDataArticle.getRefID(WikiDataArticle.RefID.WIKIDATA));
        addStringField(document, FIELD_REFIDS_FREEBASE, wikiDataArticle.getRefID(WikiDataArticle.RefID.FREEBASE));
        addStringField(document, FIELD_REFIDS_WIKIPEDIA, wikiDataArticle.getRefID(WikiDataArticle.RefID.WIKIPEDIA));
        addStringField(document, FIELD_REFIDS_UMLS, wikiDataArticle.getRefID(WikiDataArticle.RefID.UMLS));
        addStringField(document, FIELD_REFIDS_ICD10, wikiDataArticle.getRefID(WikiDataArticle.RefID.ICD10));
        addStringField(document, FIELD_REFURLS_WIKIPEDIA, wikiDataArticle.getUrl());
        return document;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds an article reference from the stored fields of a Lucene document. The vector is
     * set only if the document has a stored vector field.
     *
     * @param document Lucene document from this index
     * @return the populated reference (names are not copied)
     */
    public ArticleRef createWikidataArticleRef(Document document) {
        ArticleRef ref = new ArticleRef();
        ref.setTitle(document.get(FIELD_TITLE));
        ref.setType(document.get(FIELD_TYPE));
        ref.setDescription(document.get(FIELD_DESCRIPTION));
        ref.setId(document.get(FIELD_REFIDS_WIKIDATA));
        ref.setUrl(document.get(FIELD_REFURLS_WIKIPEDIA));
        String encodedVector = document.get(FIELD_VECTOR);
        if (encodedVector != null) {
            ref.setVector(ObjectSerializer.getArrayFromBase64String(encodedVector));
        }
        return ref;
    }

    /**
     * Builds an article from the stored fields of a Lucene document, including all stored
     * names. The vector is set only if the document has a stored vector field.
     *
     * @param document Lucene document from this index
     * @return the populated article
     */
    protected Article createWikidataArticle(Document document) {
        Article article = new Article();
        article.setTitle(document.get(FIELD_TITLE));
        article.setType(document.get(FIELD_TYPE));
        article.setDescription(document.get(FIELD_DESCRIPTION));
        article.setId(document.get(FIELD_REFIDS_WIKIDATA));
        article.setUrl(document.get(FIELD_REFURLS_WIKIPEDIA));
        for (IndexableField nameField : document.getFields(FIELD_NAMES)) {
            article.addName(nameField.stringValue());
        }
        String encodedVector = document.get(FIELD_VECTOR);
        if (encodedVector != null) {
            article.setVector(ObjectSerializer.getArrayFromBase64String(encodedVector));
        }
        return article;
    }

    /**
     * Splits the input on single whitespace characters and appends a suffix to every token.
     *
     * @param str text to split
     * @param str2 suffix appended to each token
     * @return the suffixed tokens joined by single spaces
     */
    protected String splitString(String str, String str2) {
        StringBuilder joined = new StringBuilder();
        for (String token : str.split("\\s")) {
            if (joined.length() > 0) {
                joined.append(" ");
            }
            joined.append(token).append(str2);
        }
        return joined.toString();
    }

    /** Adds an analyzed text field unless the value is {@code null}. */
    protected void addTextField(Document document, String str, String str2, Field.Store store) {
        if (str2 == null) {
            return;
        }
        document.add(new TextField(str, str2, store));
    }

    /** Adds a stored, non-analyzed string field unless the value is {@code null}. */
    protected void addStringField(Document document, String str, String str2) {
        if (str2 == null) {
            return;
        }
        document.add(new StringField(str, str2, Field.Store.YES));
    }

    /** Adds a field of type {@link #FIELDTYPE_NAME} unless the value is {@code null}. */
    protected void addNameField(Document document, String str, String str2) {
        if (str2 == null) {
            return;
        }
        document.add(new Field(str, str2, FIELDTYPE_NAME));
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Serializes the array to Base64 and adds it as a field of type {@link #FIELDTYPE_VECTOR};
     * nothing is added if serialization returns {@code null}.
     */
    public void addVectorField(Document document, String str, INDArray iNDArray) {
        String encodedVector = ObjectSerializer.getArrayAsBase64String(iNDArray);
        if (encodedVector == null) {
            return;
        }
        document.add(new Field(str, encodedVector, FIELDTYPE_VECTOR));
    }

    /**
     * Builds the per-field analyzer: ICU tokenizer with ICU folding for names and terms,
     * keyword analysis for vector, ID and URL fields, and {@link StandardAnalyzer} otherwise.
     *
     * <p>If the ICU analyzer cannot be created the error is logged and every field falls back
     * to the standard analyzer.
     *
     * @return the analyzer used for both indexing and query parsing
     */
    protected Analyzer buildAnalyzer() {
        TreeMap<String, Analyzer> analyzerPerField = new TreeMap<>();
        try {
            Analyzer nameAnalyzer = CustomAnalyzer.builder()
                    .withTokenizer(ICUTokenizerFactory.class)
                    .addTokenFilter(ICUFoldingFilterFactory.class)
                    .build();
            analyzerPerField.put(FIELD_NAMES, nameAnalyzer);
            analyzerPerField.put(FIELD_TERMS, nameAnalyzer);
            Analyzer keywordAnalyzer = new KeywordAnalyzer();
            analyzerPerField.put(FIELD_VECTOR, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFIDS_WIKIDATA, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFIDS_WIKIPEDIA, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFIDS_FREEBASE, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFIDS_UMLS, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFIDS_ICD10, keywordAnalyzer);
            analyzerPerField.put(FIELD_REFURLS_WIKIPEDIA, keywordAnalyzer);
        } catch (IOException e) {
            log.error("Could not create Lucene Analyzer: ", e);
        }
        return new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
    }

    /**
     * Runs a proximity phrase query (slop {@link #PARAM_PROXIMITY}) on the name field.
     *
     * @param str phrase to search; {@code null} yields an empty list
     * @param i maximum number of hits
     * @return the stored documents of the hits; empty if the query fails
     */
    protected List<Document> queryIndexNames(String str, int i) {
        List<Document> documents = new ArrayList<>();
        if (str == null) {
            return documents;
        }
        try {
            BooleanQuery query = parseNameQuery(quotePhrase(str) + "~" + PARAM_PROXIMITY);
            for (ScoreDoc scoreDoc : this.searcher.search(query, i).scoreDocs) {
                documents.add(this.searcher.doc(scoreDoc.doc));
            }
        } catch (Exception e) {
            log.error("name query failed for '{}'", str, e);
        }
        return documents;
    }

    /**
     * Runs the prefix query on the name field and scores each result with its Lucene score.
     *
     * @param str prefix text; {@code null} yields an empty list
     * @param i maximum number of hits
     * @return scored article references; empty if the query fails
     */
    protected List<ArticleRef> queryIndexPrefix(String str, int i) {
        List<ArticleRef> results = new ArrayList<>();
        if (str == null) {
            return results;
        }
        try {
            // NOTE(review): the '*' follows a quoted phrase; confirm the classic query parser
            // treats this as the intended prefix match for the Lucene version in use.
            BooleanQuery query = parseNameQuery(quotePhrase(str) + "*");
            for (ScoreDoc scoreDoc : this.searcher.search(query, i).scoreDocs) {
                ArticleRef ref = createWikidataArticleRef(this.searcher.doc(scoreDoc.doc));
                ref.setScore(scoreDoc.score);
                results.add(ref);
            }
        } catch (Exception e) {
            log.error("prefix query failed for '{}'", str, e);
        }
        return results;
    }

    /**
     * Looks up the best-scoring document whose field matches the given value as a phrase.
     *
     * @param str field name
     * @param str2 value to match; {@code null} yields an empty result
     * @return the top hit, or empty if there is none or the query fails
     */
    protected Optional<Document> queryIndexID(String str, String str2) {
        if (str2 == null) {
            return Optional.empty();
        }
        try {
            QueryParser parser = new QueryParser(str, this.analyzer);
            TopDocs hits = this.searcher.search(parser.parse(quotePhrase(str2)), 1);
            if (hits.scoreDocs.length > 0) {
                return Optional.ofNullable(this.searcher.doc(hits.scoreDocs[0].doc));
            }
        } catch (Exception e) {
            log.error("lookup failed for field '{}' and value '{}'", str, str2, e);
        }
        return Optional.empty();
    }

    /** Parses the query text against the name field and wraps it as a single SHOULD clause. */
    private BooleanQuery parseNameQuery(String queryText) throws Exception {
        QueryParser parser = new QueryParser(FIELD_NAMES, this.analyzer);
        return new BooleanQuery.Builder()
                .add(new BoostQuery(parser.parse(queryText), 1.0f), BooleanClause.Occur.SHOULD)
                .build();
    }

    /**
     * Wraps text in double quotes for the classic query parser, escaping query syntax
     * characters so that quotes or backslashes in the text cannot break the phrase.
     */
    private static String quotePhrase(String text) {
        return "\"" + QueryParser.escape(text) + "\"";
    }
}
