package de.julielab.jcore.ae.lingpipegazetteer.chunking;

import com.aliasi.chunk.Chunker;
import com.aliasi.dict.AbstractDictionary;
import com.aliasi.dict.ApproxDictionaryChunker;
import com.aliasi.dict.DictionaryEntry;
import com.aliasi.dict.ExactDictionaryChunker;
import com.aliasi.dict.MapDictionary;
import com.aliasi.dict.TrieDictionary;
import com.aliasi.spell.WeightedEditDistance;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.ibm.icu.text.Transliterator;
import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.NotImplementedException;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.class */
/**
 * Shared UIMA resource that builds a LingPipe dictionary {@link Chunker} from a properties file.
 *
 * <p>The properties file must name a dictionary file ({@value #PARAM_DICTIONARY_FILE}) and a stop
 * word file ({@value #PARAM_STOPWORD_FILE}). Both are looked up on the file system first and on
 * the classpath second; names ending in {@code .gz} or {@code .gzip} are decompressed on the fly.
 * The optional boolean settings select approximate vs. exact matching, case sensitivity,
 * normalization and transliteration of the dictionary entries.
 *
 * <p>Each dictionary line must consist of exactly two tab-separated columns: the term and the
 * value stored as the category of the resulting {@link DictionaryEntry}.
 *
 * <p>NOTE(review): text files are decoded with the platform default charset, as before. If the
 * dictionaries are always UTF-8 this should be made explicit - confirm with the data owners.
 */
public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceObject {
    private static final Logger LOGGER = LoggerFactory.getLogger(ChunkerProviderImplAlt.class);
    public static final String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching";
    public static final String PARAM_CASE_SENSITIVE = "CaseSensitive";
    public static final String PARAM_MAKE_VARIANTS = "MakeVariants";
    public static final String PARAM_STOPWORD_FILE = "StopWordFile";
    public static final String PARAM_DICTIONARY_FILE = "DictionaryFile";
    public static final String PARAM_NORMALIZE_TEXT = "NormalizeText";
    public static final String PARAM_TRANSLITERATE_TEXT = "TransliterateText";

    /** Score assigned to every dictionary entry. */
    private static final double CHUNK_SCORE = 1.0d;
    /** Terms shorter than this (after normalization/transliteration) are not added to the dictionary. */
    private static final int MIN_TERM_LENGTH = 3;
    /** Distance threshold handed to the {@link ApproxDictionaryChunker}. */
    private static final double APPROX_MATCH_THRESHOLD_SCORE = 100.0d;

    private boolean generateVariants;
    private boolean caseSensitive;
    private boolean useApproximateMatching;
    private boolean transliterate;
    private boolean normalize;
    private AbstractDictionary<String> dict;
    private Chunker dictChunker = null;
    private Set<String> stopWords = new HashSet<>();
    private String dictionaryFilePath;
    private String stopwordFilePath;

    /**
     * @return the chunker built by {@link #load(DataResource)}, or {@code null} if {@code load}
     *         has not completed successfully
     */
    @Override
    public Chunker getChunker() {
        return this.dictChunker;
    }

    /**
     * Reads the configuration properties from the given resource, loads the stop words and the
     * dictionary and creates the chunker.
     *
     * @param dataResource the UIMA resource pointing to the properties file; if UIMA cannot
     *                     provide a stream for it, its URI is tried as a classpath resource
     * @throws ResourceInitializationException if the configuration cannot be read, a mandatory
     *         setting is missing, the settings are contradictory, or reading the stop words /
     *         dictionary or building the chunker fails
     */
    public void load(DataResource dataResource) throws ResourceInitializationException {
        LOGGER.info("Loading configuration file from URI \"{}\" (URL: \"{}\").", dataResource.getUri(), dataResource.getUrl());
        Properties properties = new Properties();
        try (InputStream configStream = openConfigurationStream(dataResource)) {
            properties.load(configStream);
        } catch (IOException e) {
            LOGGER.error("Error while loading properties file", e);
            throw new ResourceInitializationException(e);
        }
        LOGGER.info("Creating dictionary chunker with {} properties file.", dataResource.getUrl());

        this.dictionaryFilePath = properties.getProperty(PARAM_DICTIONARY_FILE);
        if (this.dictionaryFilePath == null) {
            throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_DICTIONARY_FILE});
        }
        this.stopwordFilePath = properties.getProperty(PARAM_STOPWORD_FILE);
        if (this.stopwordFilePath == null) {
            throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_STOPWORD_FILE});
        }

        this.generateVariants = readBooleanProperty(properties, PARAM_MAKE_VARIANTS, true);
        LOGGER.info("Generate variants: {}", this.generateVariants);
        this.normalize = readBooleanProperty(properties, PARAM_NORMALIZE_TEXT, false);
        LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", this.normalize);
        this.transliterate = readBooleanProperty(properties, PARAM_TRANSLITERATE_TEXT, false);
        LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}", this.transliterate);
        this.caseSensitive = readBooleanProperty(properties, PARAM_CASE_SENSITIVE, false);
        LOGGER.info("Case sensitive: {}", this.caseSensitive);
        this.useApproximateMatching = readBooleanProperty(properties, PARAM_USE_APPROXIMATE_MATCHING, false);
        LOGGER.info("Use approximate matching: {}", this.useApproximateMatching);

        if (this.normalize && this.generateVariants) {
            throw new ResourceInitializationException(new IllegalStateException("MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one."));
        }

        try {
            // The stop words must be complete before the dictionary is read because
            // readDictionary() filters its entries against them.
            try (InputStream stopStream = readStreamFromFileSystemOrClassPath(this.stopwordFilePath)) {
                initStopWords(stopStream);
            }
            try (InputStream dictStream = readStreamFromFileSystemOrClassPath(this.dictionaryFilePath)) {
                readDictionary(dictStream);
            }
            LOGGER.info("Now creating chunker.");
            long start = System.currentTimeMillis();
            if (this.useApproximateMatching) {
                // readDictionary() always builds a TrieDictionary when approximate matching is on,
                // which is the dictionary type ApproxDictionaryChunker requires.
                this.dictChunker = new ApproxDictionaryChunker((TrieDictionary<String>) this.dict, IndoEuropeanTokenizerFactory.INSTANCE, createEditDistance(), APPROX_MATCH_THRESHOLD_SCORE);
            } else {
                this.dictChunker = new ExactDictionaryChunker(this.dict, IndoEuropeanTokenizerFactory.INSTANCE, false, this.caseSensitive);
            }
            long elapsed = System.currentTimeMillis() - start;
            LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", elapsed, elapsed / 1000);
        } catch (IOException | AnalysisEngineProcessException | RuntimeException e) {
            // Do not swallow: a provider without a chunker is unusable and would only fail later
            // with a far less helpful NullPointerException in the consuming component.
            LOGGER.error("Exception while creating chunker instance", e);
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Opens the configuration stream, preferring the stream offered by UIMA and falling back to a
     * classpath lookup of the resource URI.
     */
    private InputStream openConfigurationStream(DataResource dataResource) throws IOException, ResourceInitializationException {
        InputStream stream = null;
        try {
            stream = dataResource.getInputStream();
        } catch (NullPointerException e) {
            // Kept from the original logic: getInputStream() has been observed to fail with an NPE
            // (presumably when the resource has no resolvable URL - verify against the UIMA
            // DataResource implementation in use). The classpath fallback below handles that case.
        }
        if (stream != null) {
            return stream;
        }
        LOGGER.info("Couldn't get InputStream from UIMA. Trying to load the resource by classpath lookup.");
        if (dataResource.getUri() != null) {
            String uri = dataResource.getUri().toString();
            stream = getClass().getResourceAsStream(uri.startsWith("/") ? uri : "/" + uri);
        }
        if (stream == null) {
            String message = "Couldn't find the resource at \"" + dataResource.getUri() + "\" neither as an UIMA external resource file nor as a classpath resource.";
            LOGGER.error(message);
            throw new ResourceInitializationException(new IllegalArgumentException(message));
        }
        return stream;
    }

    /**
     * Reads a boolean setting; surrounding whitespace is ignored and anything other than
     * "true" (case-insensitive) counts as {@code false}.
     */
    private static boolean readBooleanProperty(Properties properties, String key, boolean defaultValue) {
        String value = properties.getProperty(key);
        return value == null ? defaultValue : Boolean.parseBoolean(value.trim());
    }

    /**
     * Creates the edit distance used for approximate matching: hyphens and spaces are cheap to
     * insert, delete or exchange for one another, every other edit costs more than the match
     * threshold, and transpositions are forbidden.
     */
    private static WeightedEditDistance createEditDistance() {
        // Characters (besides the space) that are treated as cheap token delimiters.
        final Set<Character> delimiters = new HashSet<>();
        delimiters.add('-');
        return new WeightedEditDistance() {
            public double deleteWeight(char c) {
                if (c == '-') {
                    return -5.0d;
                }
                return (c == ' ' || delimiters.contains(c)) ? -10.0d : -110.0d;
            }

            public double insertWeight(char c) {
                return deleteWeight(c);
            }

            public double matchWeight(char c) {
                return 0.0d;
            }

            public double substituteWeight(char c, char c2) {
                if ((c == ' ' && c2 == '-') || (c == '-' && c2 == ' ')) {
                    return -2.0d;
                }
                if (c == ' ' && delimiters.contains(c2)) {
                    return -10.0d;
                }
                return (delimiters.contains(c) && c2 == ' ') ? -10.0d : -110.0d;
            }

            public double transposeWeight(char c, char c2) {
                return Double.NEGATIVE_INFINITY;
            }
        };
    }

    /**
     * Fills {@link #dict} from the given stream. The stream is closed when this method returns,
     * normally or exceptionally.
     *
     * @param inputStream the dictionary, one {@code term<TAB>category} pair per line
     * @throws IOException if reading fails
     * @throws AnalysisEngineProcessException if a line does not consist of exactly two columns
     * @throws NotImplementedException if variant generation is enabled and an eligible entry is found
     */
    private void readDictionary(InputStream inputStream) throws IOException, AnalysisEngineProcessException {
        long start = System.currentTimeMillis();
        if (this.useApproximateMatching) {
            this.dict = new TrieDictionary<String>();
        } else {
            this.dict = new MapDictionary<String>();
        }
        LOGGER.info("readDictionary() - adding entries from " + this.dictionaryFilePath + " to dictionary...");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
            Transliterator transliterator = null;
            if (this.transliterate) {
                transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
            }
            IndoEuropeanTokenizerFactory tokenizerFactory = null;
            if (this.normalize) {
                tokenizerFactory = new IndoEuropeanTokenizerFactory();
            }
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split("\t");
                if (columns.length != 2) {
                    LOGGER.error("readDictionary() - wrong format of line: " + line);
                    throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null);
                }
                String term = columns[0].trim();
                if (this.stopWords.contains(term.toLowerCase())) {
                    continue;
                }
                if (this.normalize) {
                    term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string;
                }
                if (this.transliterate) {
                    term = transliterator.transform(term);
                }
                // The transliterator already lower-cases, so this is only needed without it.
                if (this.useApproximateMatching && !this.caseSensitive && !this.transliterate) {
                    term = term.toLowerCase();
                }
                String category = columns[1].trim();
                if (term.length() < MIN_TERM_LENGTH) {
                    continue;
                }
                if (this.generateVariants) {
                    throw new NotImplementedException("In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)");
                }
                // Check again: normalization/transliteration may have turned the term into a stop word.
                if (!this.stopWords.contains(term.toLowerCase())) {
                    this.dict.addEntry(new DictionaryEntry<String>(term, category, CHUNK_SCORE));
                }
            }
            long elapsed = System.currentTimeMillis() - start;
            LOGGER.info("Reading dictionary took {}ms ({}s)", elapsed, elapsed / 1000);
        }
    }

    /**
     * Replaces {@link #stopWords} with the lower-cased, trimmed lines of the given stream; lines
     * starting with {@code #} are skipped. The stream is closed when this method returns.
     *
     * @param inputStream the stop word list, one entry per line
     * @throws IOException if reading fails; the previous stop word set is left untouched then
     */
    private void initStopWords(InputStream inputStream) throws IOException {
        LOGGER.info("initStopWords() - reading stop words from {}...", this.stopwordFilePath);
        Set<String> loaded = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("#")) {
                    loaded.add(line.trim().toLowerCase());
                }
            }
        }
        this.stopWords = loaded;
    }

    /** @return the stop words read by the last successful {@link #load(DataResource)} (live set, not a copy) */
    @Override
    public Set<String> getStopWords() {
        return this.stopWords;
    }

    /** @return whether approximate instead of exact dictionary matching is configured */
    @Override
    public boolean getUseApproximateMatching() {
        return this.useApproximateMatching;
    }

    /** @return whether dictionary entries are normalized before being added */
    @Override
    public boolean getNormalize() {
        return this.normalize;
    }

    /** @return whether dictionary entries are transliterated before being added */
    @Override
    public boolean getTransliterate() {
        return this.transliterate;
    }

    /** @return the configured case sensitivity setting */
    @Override
    public boolean getCaseSensitive() {
        return this.caseSensitive;
    }

    /**
     * Opens the named resource, trying the file system first and the classpath second, and wraps
     * it in a {@link GZIPInputStream} when the name ends in {@code .gz} or {@code .gzip}.
     *
     * @param str file system path or classpath resource name
     * @return an open stream the caller must close; never {@code null}
     * @throws IOException if the resource exists in neither location, cannot be opened, or is
     *         named like a GZIP file but does not start with a valid GZIP header
     */
    private InputStream readStreamFromFileSystemOrClassPath(String str) throws IOException {
        InputStream inputStream;
        File file = new File(str);
        if (file.exists()) {
            inputStream = new FileInputStream(file);
        } else {
            inputStream = getClass().getResourceAsStream(str.startsWith("/") ? str : "/" + str);
            if (inputStream == null) {
                throw new FileNotFoundException("Resource \"" + str + "\" was found neither on the file system nor on the classpath.");
            }
        }
        if (str.endsWith(".gz") || str.endsWith(".gzip")) {
            try {
                return new GZIPInputStream(inputStream);
            } catch (IOException e) {
                // Don't leak the underlying stream and don't hand compressed bytes to a text reader.
                try {
                    inputStream.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        }
        return inputStream;
    }
}
