package de.julielab.jcore.ae.lingpipegazetteer.chunking;

import com.aliasi.chunk.Chunker;
import com.aliasi.dict.AbstractDictionary;
import com.aliasi.dict.ApproxDictionaryChunker;
import com.aliasi.dict.DictionaryEntry;
import com.aliasi.dict.ExactDictionaryChunker;
import com.aliasi.dict.MapDictionary;
import com.aliasi.dict.TrieDictionary;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.class */
/**
 * Shared UIMA resource that builds a LingPipe dictionary {@link Chunker} from a properties file.
 *
 * <p>The properties file handed to {@link #load(DataResource)} names a tab-separated dictionary
 * file ({@code term<TAB>category} per line) and a stop word file, plus optional flags controlling
 * variant generation, case sensitivity and approximate matching. Optionally the dictionary is
 * read from, or written to, a gzipped serialized file.
 */
public class ChunkerProviderImpl implements ChunkerProvider, SharedResourceObject {
    private static final Logger LOGGER = LoggerFactory.getLogger(ChunkerProviderImpl.class);

    public static final String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching";
    public static final String PARAM_CASE_SENSITIVE = "CaseSensitive";
    public static final String PARAM_MAKE_VARIANTS = "MakeVariants";
    public static final String PARAM_STOPWORD_FILE = "StopWordFile";
    public static final String PARAM_DICTIONARY_FILE = "DictionaryFile";
    public static final String PARAM_SERIALIZED_DICTIONARY_FILE = "SerializedDictionaryFile";

    /** Score assigned to every dictionary entry. */
    private static final double CHUNK_SCORE = 1.0d;
    /** Terms shorter than this (after trimming) are not added to the dictionary. */
    private static final int MIN_TERM_LENGTH = 3;
    /** Hyphen variants are only generated while (number of hyphen-separated parts + 1) stays below this. */
    private static final int NUM_HYPHENS4VARIANTS = 7;
    /** Column separator of the dictionary file and of the buffered "term TAB category" entries. */
    private static final String SEPARATOR = "\t";
    /** Distance threshold handed to the approximate chunker. */
    private static final double APPROX_MATCH_THRESHOLD_SCORE = 100.0d;
    /** Number of buffered entries after which the buffer is flushed into the dictionary. */
    private static final int FLUSH_THRESHOLD = 10000;

    private boolean generateVariants;
    private boolean caseSensitive;
    private boolean useApproximateMatching;
    private AbstractDictionary<String> dict;
    private Chunker dictChunker = null;
    private TreeSet<String> stopWords = new TreeSet<>();

    /**
     * Returns the chunker built by {@link #load(DataResource)}.
     *
     * @return the chunker, or {@code null} if {@code load} has not completed successfully
     */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public Chunker getChunker() {
        return this.dictChunker;
    }

    /**
     * Reads the configuration from the given resource and creates the dictionary chunker.
     *
     * <p>Required settings: {@value #PARAM_DICTIONARY_FILE} and {@value #PARAM_STOPWORD_FILE}.
     * Optional settings: {@value #PARAM_SERIALIZED_DICTIONARY_FILE},
     * {@value #PARAM_MAKE_VARIANTS} (default {@code true}), {@value #PARAM_CASE_SENSITIVE}
     * (default {@code false}) and {@value #PARAM_USE_APPROXIMATE_MATCHING} (default
     * {@code false}).
     *
     * <p>If the serialized dictionary file exists it is loaded instead of the text dictionary;
     * in that case the stop word file is not read and {@link #getStopWords()} stays empty.
     * Otherwise stop words and dictionary are read from their text files and, if a serialized
     * path is configured, the dictionary is written there.
     *
     * @param dataResource resource providing the properties file
     * @throws ResourceInitializationException if the properties cannot be read, a required
     *         setting is missing, or the dictionary or chunker cannot be created
     */
    @Override
    public void load(DataResource dataResource) throws ResourceInitializationException {
        Properties properties = new Properties();
        try (InputStream configStream = dataResource.getInputStream()) {
            properties.load(configStream);
        } catch (IOException e) {
            LOGGER.error("Error while loading properties file", e);
            throw new ResourceInitializationException(e);
        }
        LOGGER.info("Creating dictionary chunker with " + dataResource.getUrl() + " properties file.");

        String dictionaryPath = requireSetting(properties, PARAM_DICTIONARY_FILE);
        String stopWordPath = requireSetting(properties, PARAM_STOPWORD_FILE);

        String serializedPath = properties.getProperty(PARAM_SERIALIZED_DICTIONARY_FILE);
        File serializedFile = serializedPath != null ? new File(serializedPath) : null;
        LOGGER.debug("Serialized dictionary path: {}", serializedPath);

        this.generateVariants = booleanSetting(properties, PARAM_MAKE_VARIANTS, true);
        LOGGER.debug("Generate variants: {}", Boolean.valueOf(this.generateVariants));

        this.caseSensitive = booleanSetting(properties, PARAM_CASE_SENSITIVE, false);
        LOGGER.debug("Case sensitive: {}", Boolean.valueOf(this.caseSensitive));

        this.useApproximateMatching = booleanSetting(properties, PARAM_USE_APPROXIMATE_MATCHING, false);
        LOGGER.debug("Use approximate matching: {}", Boolean.valueOf(this.useApproximateMatching));

        try {
            if (serializedFile != null && serializedFile.exists()) {
                readSerializedDictionaryFile(serializedFile);
            } else {
                // The text resources are opened only in this branch; both readers close the
                // stream they are given.
                initStopWords(openResource(stopWordPath));
                readDictionary(openResource(dictionaryPath));
                if (!StringUtils.isBlank(serializedPath)) {
                    serializeDictionary(serializedFile);
                }
            }
            if (this.useApproximateMatching) {
                this.dictChunker = new ApproxDictionaryChunker(this.dict, IndoEuropeanTokenizerFactory.INSTANCE, ApproxDictionaryChunker.TT_DISTANCE, APPROX_MATCH_THRESHOLD_SCORE);
            } else {
                this.dictChunker = new ExactDictionaryChunker(this.dict, IndoEuropeanTokenizerFactory.INSTANCE, false, this.caseSensitive);
            }
        } catch (Exception e) {
            // Fail initialization instead of continuing with a null chunker.
            LOGGER.error("Exception while creating chunker instance", e);
            throw new ResourceInitializationException(e);
        }
    }

    /** Returns the named setting or fails with a "config_setting_absent" initialization error. */
    private static String requireSetting(Properties properties, String name) throws ResourceInitializationException {
        String value = properties.getProperty(name);
        if (value == null) {
            throw new ResourceInitializationException("config_setting_absent", new Object[]{name});
        }
        return value;
    }

    /** Parses the named setting as a boolean, falling back to {@code defaultValue} when it is absent. */
    private static boolean booleanSetting(Properties properties, String name, boolean defaultValue) {
        String raw = properties.getProperty(name);
        return raw == null ? defaultValue : Boolean.parseBoolean(raw);
    }

    /**
     * Opens {@code path} as a file if such a file exists, otherwise as a classpath resource
     * (a leading "/" is added when missing).
     *
     * @throws IOException if the resource is found in neither location or cannot be opened
     */
    private InputStream openResource(String path) throws IOException {
        if (new File(path).exists()) {
            return new FileInputStream(path);
        }
        String classpathName = path.startsWith("/") ? path : "/" + path;
        InputStream stream = getClass().getResourceAsStream(classpathName);
        if (stream == null) {
            throw new FileNotFoundException("Resource \"" + path + "\" was found neither as a file nor on the classpath");
        }
        return stream;
    }

    /**
     * Loads the dictionary from a gzipped Java-serialized file into {@link #dict}.
     *
     * <p>NOTE(review): this uses Java native deserialization; only point the serialized
     * dictionary setting at files from a trusted source.
     *
     * @param file the serialized dictionary file
     * @throws IOException if the file cannot be read
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readSerializedDictionaryFile(File file) throws FileNotFoundException, IOException, ClassNotFoundException {
        long start = System.currentTimeMillis();
        LOGGER.info("Reading serialized dictionary from: {}", file.getAbsolutePath());
        LOGGER.info("Warning: Loading a serialized dictionary seems to take longer than just reading the original text entries");
        // Each stream is its own resource so that a failing wrapper constructor cannot leak the file handle.
        try (FileInputStream fileIn = new FileInputStream(file);
                GZIPInputStream gzipIn = new GZIPInputStream(fileIn);
                ObjectInputStream objectIn = new ObjectInputStream(gzipIn)) {
            @SuppressWarnings("unchecked") // the file is expected to hold a dictionary written by serializeDictionary()
            AbstractDictionary<String> loaded = (AbstractDictionary<String>) objectIn.readObject();
            this.dict = loaded;
        }
        LOGGER.info("Dictionary has been read.");
        long elapsed = System.currentTimeMillis() - start;
        LOGGER.info("Reading serialized dictionary took {}ms ({}s)", Long.valueOf(elapsed), Long.valueOf(elapsed / 1000));
    }

    /**
     * Writes the current dictionary to {@code file} in gzipped, compiled form.
     *
     * @param file destination file
     * @throws IOException if the file cannot be written
     */
    private void serializeDictionary(File file) throws FileNotFoundException, IOException {
        LOGGER.info("Storing dictionary to: {}", file.getAbsolutePath());
        LOGGER.info("Warning: Loading a serialized dictionary seems to take longer than just reading the original text entries");
        try (FileOutputStream fileOut = new FileOutputStream(file);
                GZIPOutputStream gzipOut = new GZIPOutputStream(fileOut);
                ObjectOutputStream objectOut = new ObjectOutputStream(gzipOut)) {
            this.dict.compileTo(objectOut);
        }
        // Measured after closing so the gzip trailer is included.
        LOGGER.info("{} bytes written.", Long.valueOf(file.length()));
    }

    /**
     * Reads the tab-separated dictionary ({@code term<TAB>category}) and fills {@link #dict}.
     *
     * <p>Terms shorter than {@link #MIN_TERM_LENGTH} and terms (or variants) found in the stop
     * word set are skipped. The stream is decoded with the platform default charset and is
     * closed by this method.
     *
     * @param inputStream dictionary stream
     * @throws IOException if reading fails
     * @throws AnalysisEngineProcessException if a line does not consist of exactly two columns
     */
    private void readDictionary(InputStream inputStream) throws IOException, AnalysisEngineProcessException {
        long start = System.currentTimeMillis();
        if (this.useApproximateMatching) {
            this.dict = new TrieDictionary<String>();
        } else {
            this.dict = new MapDictionary<String>();
        }
        LOGGER.info("readDictionary() - adding entries from " + inputStream + " to dictionary...");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
            // Entries are buffered (and de-duplicated) as "term TAB category" strings.
            TreeSet<String> pending = new TreeSet<>();
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split(SEPARATOR);
                if (columns.length != 2) {
                    LOGGER.error("readDictionary() - wrong format of line: " + line);
                    throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null);
                }
                String term = columns[0].trim();
                String category = columns[1].trim();
                if (term.length() < MIN_TERM_LENGTH) {
                    continue;
                }
                // Terms are lower-cased here only for approximate matching; the exact chunker
                // is given the caseSensitive flag directly in load().
                if (this.useApproximateMatching && !this.caseSensitive) {
                    term = term.toLowerCase();
                }
                if (this.generateVariants) {
                    LOGGER.debug("readDictionary() - make term variants of ({}, {}) and add them to dictionary (NOTE: this may take a while if dictionary is big!)", term, category);
                    for (String variant : makeTermVariants(term)) {
                        if (!this.stopWords.contains(variant.toLowerCase()) && !variant.equals("")) {
                            pending.add(variant + SEPARATOR + category);
                        }
                    }
                } else if (!this.stopWords.contains(term.toLowerCase())) {
                    pending.add(term + SEPARATOR + category);
                }
                if (pending.size() >= FLUSH_THRESHOLD) {
                    LOGGER.debug("readDictionary() - flushing dictionarySet to map dictionary");
                    pending = flushDictionary(pending, this.dict);
                }
            }
            flushDictionary(pending, this.dict);
        }
        long elapsed = System.currentTimeMillis() - start;
        LOGGER.info("Reading dictionary took {}ms ({}s)", Long.valueOf(elapsed), Long.valueOf(elapsed / 1000));
    }

    /**
     * Adds every buffered "term TAB category" string to the dictionary and empties the buffer.
     *
     * @param treeSet buffered entries; cleared on return
     * @param abstractDictionary dictionary receiving the entries
     * @return the same, now empty, set
     * @throws AnalysisEngineProcessException if a buffered string does not split into exactly two parts
     */
    private TreeSet<String> flushDictionary(TreeSet<String> treeSet, AbstractDictionary<String> abstractDictionary) throws AnalysisEngineProcessException {
        for (String entry : treeSet) {
            String[] parts = entry.split(SEPARATOR);
            if (parts.length != 2) {
                LOGGER.error("readDictionary() - wrong split length: " + parts.length);
                throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null);
            }
            abstractDictionary.addEntry(new DictionaryEntry<String>(parts[0], parts[1], CHUNK_SCORE));
        }
        treeSet.clear();
        return treeSet;
    }

    /**
     * Generates spelling variants of a term: hyphens replaced by spaces or removed, parentheses
     * removed, spaces replaced by hyphens, and the possessive "'s" dropped or turned into "s".
     * The original term is always part of the result.
     *
     * @param str the term
     * @return the term together with its variants, sorted and without duplicates
     */
    private TreeSet<String> makeTermVariants(String str) {
        TreeSet<String> treeSet = new TreeSet<>();
        treeSet.add(str);

        // Hyphen variants, only for terms with a limited number of hyphen-separated parts.
        int length = str.split("\\-").length + 1;
        if (length < NUM_HYPHENS4VARIANTS) {
            // Split with increasing limits and join the pieces with spaces, so that a growing
            // number of leading hyphens is replaced.
            for (int i = 0; i < length; i++) {
                String str2 = "";
                for (String str3 : str.split("\\-", i)) {
                    str2 = str2 + " " + str3;
                }
                treeSet.add(str2.trim());
                treeSet.add(str2.replaceFirst("\\-", " ").trim());
            }
            treeSet.add(str.replaceAll("\\-", " "));
            treeSet.add(str.replaceFirst("\\-", " "));
            // Same scheme, but deleting hyphens instead; only for terms longer than 8 characters.
            if (str.length() > 8) {
                int length2 = str.split("\\-").length + 1;
                for (int i2 = 0; i2 < length2; i2++) {
                    String str4 = " ";
                    for (String str5 : str.split("\\-", i2)) {
                        str4 = str4 + str5;
                    }
                    treeSet.add(str4.trim());
                    treeSet.add(str4.replaceFirst("\\-", "").trim());
                }
                treeSet.add(str.replaceAll("\\-", ""));
                treeSet.add(str.replaceFirst("\\-", ""));
            }
        }

        // Parenthesis variants: first pair removed, then all parentheses removed, each also
        // with the first hyphen and with all hyphens deleted.
        if (str.contains("(") && str.contains(")")) {
            String replaceFirst = str.replaceFirst("\\(", "").replaceFirst("\\)", "");
            treeSet.add(replaceFirst);
            String replaceFirst2 = replaceFirst.replaceFirst("\\-", "");
            treeSet.add(replaceFirst2);
            treeSet.add(replaceFirst2.replaceAll("\\-", ""));
            String replaceAll = str.replaceAll("\\(", "").replaceAll("\\)", "");
            treeSet.add(replaceAll);
            String replaceFirst3 = replaceAll.replaceFirst("\\-", "");
            treeSet.add(replaceFirst3);
            treeSet.add(replaceFirst3.replaceAll("\\-", ""));
        }

        // Space variants: split on spaces with increasing limits and join the pieces with hyphens.
        int length3 = str.split(" ").length + 1;
        for (int i3 = 0; i3 < length3; i3++) {
            String str6 = "";
            for (String str7 : str.split(" ", i3)) {
                str6 = str6 + "-" + str7;
            }
            // split() yields at least one piece, so str6 starts with the "-" dropped here.
            String trim = str6.substring(1).trim();
            treeSet.add(trim.trim());
            treeSet.add(trim.replaceFirst(" ", "-").trim());
        }
        treeSet.add(str.replaceAll(" ", "-"));
        treeSet.add(str.replaceFirst(" ", "-"));

        // Possessive variants.
        treeSet.add(str.replaceFirst("'s", ""));
        treeSet.add(str.replaceFirst("'s", "s"));
        return treeSet;
    }

    /**
     * Replaces the stop word set with the lower-cased, trimmed lines of the given stream.
     * Lines starting with "#" are skipped. The stream is decoded with the platform default
     * charset and is closed by this method.
     *
     * @param inputStream stop word stream, one stop word per line
     * @throws IOException if reading fails
     */
    private void initStopWords(InputStream inputStream) throws IOException {
        this.stopWords = new TreeSet<>();
        LOGGER.info("initStopWords() - reading stop words from " + inputStream + "...");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("#")) {
                    this.stopWords.add(line.trim().toLowerCase());
                }
            }
        }
    }

    /**
     * Returns the stop words read during {@link #load(DataResource)}.
     *
     * @return the internal (mutable) stop word set; empty if no stop word file was read
     */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public Set<String> getStopWords() {
        return this.stopWords;
    }

    /** Returns whether the approximate chunker was configured. */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getUseApproximateMatching() {
        return this.useApproximateMatching;
    }

    /** Always {@code false} for this provider. */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getNormalize() {
        return false;
    }

    /** Always {@code false} for this provider. */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getTransliterate() {
        return false;
    }

    /** Returns the configured case sensitivity. */
    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getCaseSensitive() {
        return this.caseSensitive;
    }
}
