package de.julielab.jcore.ae.lingpipegazetteer.uima;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.ibm.icu.text.Transliterator;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk;
import de.julielab.jcore.ae.lingpipegazetteer.utils.AnnotationRetrieval;
import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking;
import de.julielab.jcore.types.Abbreviation;
import de.julielab.jcore.types.ConceptMention;
import de.julielab.jcore.types.mantra.Entity;
import de.julielab.jcore.utility.JCoReAnnotationTools;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.class */
public class GazetteerAnnotator extends JCasAnnotator_ImplBase {
    /** Key of the external resource from which the {@link ChunkerProvider} is fetched in {@code initialize}. */
    public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider";
    /**
     * Name of the boolean parameter that switches the acronym handling on: chunks are then only
     * accepted if they pass the abbreviation check, and abbreviations are annotated after chunking.
     */
    public static final String PARAM_CHECK_ACRONYMS = "CheckAcronyms";
    /** Name of the parameter holding the class name of the annotation type created for matches; {@code initialize} fails if it is unset. */
    public static final String PARAM_OUTPUT_TYPE = "OutputType";
    /** Name of the boolean parameter that makes the annotator create mantra {@code Entity} annotations instead of the output type. */
    private static final String PARAM_USE_MANTRA_MODE = "MantraMode";
    // Taken from the provider in initialize(): run the document text through the transliterator before chunking.
    private boolean transliterate;
    // Taken from the provider in initialize(): chunk a normalized copy of the text and map offsets back.
    private boolean normalize;

    @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true)
    private ChunkerProvider provider;
    private Transliterator transliterator;
    // Only assigned in initialize() when approximate matching is enabled.
    private boolean caseSensitive;
    // Tokenizer factory handed to StringNormalizerForChunking when normalize is set.
    private TokenizerFactory normalizationTokenFactory;
    // Lower-case stop words used by filterStopwords(); filled in initialize().
    private Set<String> stopWords;
    // Written to the componentId feature of every annotation this annotator creates.
    private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName();
    private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class);
    // NOTE(review): only read (logged) in initialize(); nothing visible in this file increments it.
    private static int initializeCount = 0;
    // Taken from the provider; selects the overlap-resolving code path in process().
    private boolean useApproximateMatching = false;

    @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = {"false"})
    private boolean mantraMode = false;

    @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = {"true"})
    private boolean checkAcronyms = true;

    @ConfigurationParameter(name = PARAM_OUTPUT_TYPE)
    private String outputType = null;
    // The chunker obtained from the provider; performs the actual dictionary lookup.
    private Chunker gazetteer = null;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator$ParenthesisType.class */
    public enum ParenthesisType {
        ROUND,
        BRACKET,
        CURLY,
        NONE;

        /**
         * Decompiler-generated alias of {@link #values()}.
         *
         * @return a newly allocated array holding all constants in declaration order
         */
        public static ParenthesisType[] valuesCustom() {
            // values() already returns a fresh array; cloning it keeps the "copy of a copy" shape.
            return values().clone();
        }
    }

    /**
     * Sets the annotator up: fetches the {@link ChunkerProvider} resource and its chunker, fills the
     * stop word set, copies the matching options from the provider and reads the configuration
     * parameters.
     *
     * @param uimaContext the context giving access to the external resource and the parameters
     * @throws ResourceInitializationException if the chunker resource cannot be accessed or is
     *         missing, or if the {@code OutputType} parameter is not set
     */
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        // NOTE(review): initializeCount is never incremented in this file, so this always logs 0.
        LOGGER.info("calls to initialize: " + initializeCount);
        super.initialize(uimaContext);
        LOGGER.info("initialize() - initializing GazetteerAnnotator...");
        try {
            this.provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME);
        } catch (ResourceAccessException e) {
            // Previously only logged, which led to a NullPointerException a few lines further down.
            LOGGER.error("Exception while initializing", e);
            throw new ResourceInitializationException(e);
        }
        if (this.provider == null) {
            throw new ResourceInitializationException(new IllegalStateException(
                    "No external resource bound to key \"" + CHUNKER_RESOURCE_NAME + "\""));
        }
        this.gazetteer = this.provider.getChunker();

        this.stopWords = new HashSet<String>();
        Collections.addAll(this.stopWords,
                "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost",
                "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst",
                "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere",
                "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes",
                "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between",
                "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant",
                "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do",
                "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
                "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere",
                "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for",
                "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get",
                "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here",
                "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "high", "him", "himself", "his",
                "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
                "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
                "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine",
                "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
                "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
                "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once",
                "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
                "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re",
                "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should",
                "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
                "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than",
                "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
                "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those",
                "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top",
                "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon",
                "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
                "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
                "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
                "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves");

        this.useApproximateMatching = this.provider.getUseApproximateMatching();
        LOGGER.info("Use approximate matching (the actual used edit distance is defined by the ChunkerProvider implementation): {}", this.useApproximateMatching);

        // Read null-safely, like MantraMode below; an unset parameter falls back to the declared default (true).
        Object checkAcronymsValue = uimaContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS);
        this.checkAcronyms = checkAcronymsValue == null || ((Boolean) checkAcronymsValue).booleanValue();
        LOGGER.info("Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", this.checkAcronyms);

        // The former null checks on the boxed provider values could never fail, so the helpers
        // are simply always created, exactly as before.
        this.normalize = this.provider.getNormalize();
        this.normalizationTokenFactory = new IndoEuropeanTokenizerFactory();
        LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", this.normalize);

        this.transliterate = this.provider.getTransliterate();
        this.transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
        LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", this.transliterate);

        if (this.useApproximateMatching) {
            this.caseSensitive = this.provider.getCaseSensitive();
            LOGGER.info("Dictionary matching is case sensitive (text is not lower cased): {}", this.caseSensitive);
        }

        this.outputType = (String) uimaContext.getConfigParameterValue(PARAM_OUTPUT_TYPE);
        if (this.outputType == null) {
            LOGGER.error("initialize() - output type not specified.");
            throw new ResourceInitializationException(
                    ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_OUTPUT_TYPE});
        }

        Object mantraModeValue = uimaContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE);
        this.mantraMode = mantraModeValue != null && ((Boolean) mantraModeValue).booleanValue();
    }

    /**
     * Runs the gazetteer over the document text and adds an annotation for every accepted chunk.
     * With approximate matching the chunks are filtered and grouped by overlap first, and only the
     * best chunks of each group are kept. Afterwards, unless mantra mode is on, abbreviations are
     * annotated according to their full forms if acronym checking is enabled.
     *
     * @param jCas the CAS to annotate; nothing happens if it has no document text
     * @throws AnalysisEngineProcessException if an output annotation cannot be created
     */
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        String text = jCas.getDocumentText();
        if (text == null || text.length() == 0) {
            return;
        }
        if (this.transliterate) {
            text = this.transliterator.transform(text);
        }
        if (this.useApproximateMatching && !this.transliterate && !this.caseSensitive) {
            text = text.toLowerCase();
        }

        StringNormalizerForChunking.NormalizedString normalizedText = null;
        if (this.normalize) {
            normalizedText = StringNormalizerForChunking.normalizeString(text, this.normalizationTokenFactory);
        }

        LOGGER.debug("Performing actual Gazetteer annotation...");
        Chunking chunking = this.gazetteer.chunk(this.normalize ? normalizedText.string : text);
        LOGGER.debug("Gazetteer annotation done.");

        if (this.useApproximateMatching) {
            List<OverlappingChunk> groups = groupOverlappingChunks(filterChunking(chunking), chunking.charSequence().toString());
            LOGGER.debug("all overlapping chunks:\n");
            Set<Chunk> alreadyAdded = new HashSet<Chunk>();
            for (OverlappingChunk group : groups) {
                LOGGER.debug(group.toStringAll());
                List<Chunk> best = group.getBestChunks();
                LOGGER.debug("Found {} best chunks.", Integer.valueOf(best.size()));
                int rank = 0;
                for (Chunk candidate : best) {
                    LOGGER.debug("Nr. " + rank + " best chunk: " + candidate.start() + " - " + candidate.end() + ": " + candidate.score() + " ; type: " + candidate.type());
                    // add() returns false when the chunk was already seen in an earlier group.
                    if (!alreadyAdded.add(candidate)) {
                        throw new IllegalStateException("Duplicate best chunk: " + candidate);
                    }
                    add2Cas(jCas, candidate, normalizedText);
                    rank++;
                }
            }
        } else {
            for (Chunk exactMatch : chunking.chunkSet()) {
                add2Cas(jCas, exactMatch, normalizedText);
            }
        }

        if (this.checkAcronyms && !this.mantraMode) {
            LOGGER.debug("process() - checking acronyms");
            annotateAcronymsWithFullFormEntity(jCas);
        }
    }

    /**
     * Collects the chunks of the given chunking whose covered text passes the parenthesis,
     * punctuation and stop word filters.
     *
     * @param chunking the chunker result to filter
     * @return a new list with the chunks that were not filtered out
     */
    private List<Chunk> filterChunking(Chunking chunking) {
        List<Chunk> kept = new ArrayList<Chunk>(chunking.chunkSet().size());
        for (Chunk candidate : chunking.chunkSet()) {
            String covered = chunking.charSequence().subSequence(candidate.start(), candidate.end()).toString();
            if (filterParenthesis(covered)) {
                continue;
            }
            if (filterPunctuationArtifacts(covered)) {
                continue;
            }
            if (filterStopwords(covered)) {
                continue;
            }
            kept.add(candidate);
        }
        return kept;
    }

    /**
     * Tells whether a match begins or ends with a hyphen.
     *
     * @param str the text covered by a chunk
     * @return {@code true} if the chunk should be filtered out
     */
    private boolean filterPunctuationArtifacts(String str) {
        if (str.startsWith("-")) {
            return true;
        }
        return str.endsWith("-");
    }

    /**
     * Tells whether a match should be discarded because of stop words. A match is discarded if it
     * is a stop word as a whole, or if it consists of several space-separated tokens of which at
     * least half (rounded up) are stop words. Comparison is done on the lower-cased text.
     *
     * @param str the text covered by a chunk
     * @return {@code true} if the chunk should be filtered out
     */
    private boolean filterStopwords(String str) {
        if (this.stopWords.contains(str.toLowerCase())) {
            return true;
        }
        if (!str.contains(" ")) {
            return false;
        }
        // Keep the split result in a local: the threshold below needs the token count.
        String[] tokens = str.split(" ");
        int stopWordCount = 0;
        for (String token : tokens) {
            if (this.stopWords.contains(token.toLowerCase())) {
                stopWordCount++;
            }
        }
        if (Math.ceil(tokens.length / 2.0d) > stopWordCount) {
            return false;
        }
        LOGGER.debug("Filtering due to high stop word occurrences: {}", str);
        return true;
    }

    /**
     * Tells whether the parentheses in the given text are unbalanced: a closing character without
     * a matching opener of the same kind, or an opener that is never closed.
     *
     * @param str the text covered by a chunk
     * @return {@code true} if the parentheses do not match up and the chunk should be filtered out
     */
    static boolean filterParenthesis(String str) {
        Stack<Character> openParentheses = new Stack<Character>();
        for (char current : str.toCharArray()) {
            if (!isParentheses(current)) {
                continue;
            }
            if (isOpenedParentheses(current)) {
                openParentheses.push(current);
                continue;
            }
            // A closing character: it needs an opener on the stack, and one of the same kind.
            if (openParentheses.isEmpty()) {
                return true;
            }
            if (!isParenthesisCounterpart(openParentheses.pop(), current)) {
                return true;
            }
        }
        return !openParentheses.isEmpty();
    }

    /**
     * Checks whether two parenthesis characters are of the same kind (round, bracket or curly).
     *
     * @param ch  the first character
     * @param ch2 the second character
     * @return {@code true} if both characters have the same {@link ParenthesisType}
     * @throws IllegalArgumentException if at least one of the characters is not a parenthesis
     */
    private static boolean isParenthesisCounterpart(Character ch, Character ch2) {
        ParenthesisType secondType = getParenthesisType(ch2.charValue());
        ParenthesisType firstType = getParenthesisType(ch.charValue());
        boolean bothAreParentheses = secondType != ParenthesisType.NONE && firstType != ParenthesisType.NONE;
        if (!bothAreParentheses) {
            throw new IllegalArgumentException("The two characters '" + ch + "' and '" + ch2 + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses.");
        }
        return secondType == firstType;
    }

    /**
     * Maps a character to the kind of parenthesis it represents.
     *
     * @param c the character to classify
     * @return the matching {@link ParenthesisType}, or {@code NONE} for any other character
     */
    static ParenthesisType getParenthesisType(char c) {
        if (c == '(' || c == ')') {
            return ParenthesisType.ROUND;
        }
        if (c == '[' || c == ']') {
            return ParenthesisType.BRACKET;
        }
        if (c == '{' || c == '}') {
            return ParenthesisType.CURLY;
        }
        return ParenthesisType.NONE;
    }

    /**
     * Tells whether the character is an opening or closing parenthesis of any kind.
     *
     * @param c the character to test
     * @return {@code true} for any of {@code ( ) [ ] { }}
     */
    static boolean isParentheses(char c) {
        if (isOpenedParentheses(c)) {
            return true;
        }
        return isClosedParentheses(c);
    }

    /**
     * Tells whether the character is an opening parenthesis.
     *
     * @param c the character to test
     * @return {@code true} for {@code (}, {@code [} and <code>{</code>
     */
    static boolean isOpenedParentheses(char c) {
        return c == '(' || c == '[' || c == '{';
    }

    /**
     * Tells whether the character is a closing parenthesis.
     *
     * @param c the character to test
     * @return {@code true} for {@code )}, {@code ]} and <code>}</code>
     */
    static boolean isClosedParentheses(char c) {
        return c == ')' || c == ']' || c == '}';
    }

    /**
     * Sorts the chunks by start offset (the passed list itself is sorted) and groups them into
     * {@link OverlappingChunk}s. A chunk is added to every existing group whose span it overlaps;
     * if it overlaps none, it opens a new group.
     *
     * @param list the chunks to group; reordered in place
     * @param str  the chunked text, handed to every newly created group
     * @return the groups in order of creation
     */
    static List<OverlappingChunk> groupOverlappingChunks(List<Chunk> list, String str) {
        Comparator<Chunk> byStartOffset = new Comparator<Chunk>() {
            @Override
            public int compare(Chunk left, Chunk right) {
                // Plain subtraction as before; presumably offsets are non-negative, so no overflow.
                return left.start() - right.start();
            }
        };
        Collections.sort(list, byStartOffset);

        List<OverlappingChunk> groups = new ArrayList<OverlappingChunk>();
        for (Chunk current : list) {
            boolean joinedExistingGroup = false;
            for (OverlappingChunk group : groups) {
                if (!group.isOverlappingSpan(current.start(), current.end())) {
                    continue;
                }
                group.addChunk(current.start(), current.end(), current);
                joinedExistingGroup = true;
            }
            if (joinedExistingGroup) {
                continue;
            }
            groups.add(new OverlappingChunk(current.start(), current.end(), current, str));
        }
        return groups;
    }

    /**
     * Decides whether a chunk may be annotated with respect to abbreviations. A temporary
     * annotation over the chunk span is indexed, used to look up a matching {@link Abbreviation},
     * and removed again.
     *
     * @param jCas             the CAS being processed
     * @param chunk            the dictionary match to check
     * @param normalizedString offset mapping used when normalization is enabled, otherwise unused
     * @return {@code true} if the chunk is not an abbreviation, or if the abbreviation's full form
     *         has a matching {@link ConceptMention} whose class name equals the output type;
     *         {@code false} otherwise
     */
    private boolean isAcronymWithSameFullFormSpecificType(JCas jCas, Chunk chunk, StringNormalizerForChunking.NormalizedString normalizedString) {
        Annotation probe;
        if (this.normalize) {
            probe = new Annotation(jCas, normalizedString.getOriginalOffset(chunk.start()).intValue(), normalizedString.getOriginalOffset(chunk.end()).intValue());
        } else {
            probe = new Annotation(jCas, chunk.start(), chunk.end());
        }

        // The probe is only indexed for the duration of the lookup.
        probe.addToIndexes(jCas);
        Abbreviation abbreviation = AnnotationRetrieval.getMatchingAnnotation(jCas, probe, Abbreviation.class);
        probe.removeFromIndexes();

        if (abbreviation == null) {
            LOGGER.debug(chunk + " chunk \"{}\" is not an abbreviation\n", probe.getCoveredText());
            return true;
        }

        de.julielab.jcore.types.Annotation fullForm = abbreviation.getTextReference();
        ConceptMention fullFormMention = AnnotationRetrieval.getMatchingAnnotation(jCas, fullForm, ConceptMention.class);
        if (fullFormMention == null) {
            LOGGER.debug(chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", probe.getCoveredText(), fullForm.getCoveredText());
            return false;
        }

        String mentionClassName = fullFormMention.getClass().getCanonicalName();
        if (!mentionClassName.equals(this.outputType)) {
            LOGGER.debug(chunk + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", new Object[]{probe.getCoveredText(), fullFormMention.getCoveredText(), mentionClassName, this.outputType});
            return false;
        }
        LOGGER.debug(chunk + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", probe.getCoveredText(), fullFormMention.getCoveredText());
        return true;
    }

    /**
     * Creates the output annotation(s) for a chunk and adds them to the CAS indexes. With acronym
     * checking enabled, chunks rejected by the abbreviation check are skipped. In normal mode one
     * annotation of the configured output type is created; in mantra mode the chunk type is split
     * at {@code @@TERM@@} and one mantra {@code Entity} is created per term, whose {@code @@}
     * separated fields are source, CUI, semantic type and semantic group.
     *
     * @param jCas             the CAS to add annotations to
     * @param chunk            the dictionary match
     * @param normalizedString offset mapping used when normalization is enabled, otherwise unused
     * @throws AnalysisEngineProcessException if an annotation cannot be created or, in mantra
     *         mode, a term has fewer than four fields
     */
    private void add2Cas(JCas jCas, Chunk chunk, StringNormalizerForChunking.NormalizedString normalizedString) throws AnalysisEngineProcessException {
        if (this.checkAcronyms && !isAcronymWithSameFullFormSpecificType(jCas, chunk, normalizedString)) {
            return;
        }
        // Chunk offsets refer to the normalized text when normalization is on; map them back.
        int begin = this.normalize ? normalizedString.getOriginalOffset(chunk.start()).intValue() : chunk.start();
        int end = this.normalize ? normalizedString.getOriginalOffset(chunk.end()).intValue() : chunk.end();
        String confidence = String.valueOf(chunk.score());
        try {
            if (!this.mantraMode) {
                ConceptMention mention = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(jCas, this.outputType);
                mention.setBegin(begin);
                mention.setEnd(end);
                mention.setSpecificType(chunk.type());
                mention.setComponentId(COMPONENT_ID);
                mention.setConfidence(confidence);
                mention.addToIndexes();
                return;
            }
            for (String term : chunk.type().split("@@TERM@@")) {
                String[] fields = term.split("@@");
                if (fields.length < 4) {
                    // Fail with a readable message instead of a bare ArrayIndexOutOfBoundsException.
                    throw new IllegalArgumentException("Mantra term \"" + term + "\" does not have the four fields source@@cui@@semanticType@@semanticGroup");
                }
                Entity entity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(jCas, "de.julielab.jcore.types.mantra.Entity");
                entity.setBegin(begin);
                entity.setEnd(end);
                entity.setComponentId(COMPONENT_ID);
                entity.setConfidence(confidence);
                entity.setSource(fields[0]);
                entity.setCui(fields[1]);
                entity.setSemanticType(fields[2]);
                entity.setSemanticGroup(fields[3]);
                entity.addToIndexes();
            }
        } catch (Exception e) {
            // Hand the throwable to the logger so the stack trace lands in the log, not on stderr.
            LOGGER.error("process() - could not generate output type: " + e.getMessage(), e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Annotates abbreviations according to their full forms. For every {@link Abbreviation} the
     * {@link ConceptMention}s included in its full form are looked up and the longest one is
     * chosen (the first one on ties). If that mention's class is the configured output type, it was
     * created by this component, and the abbreviation does not already have a matching mention of
     * the same class, a new output annotation is created over the abbreviation. It gets the full
     * form's specific type and confidence and the component id suffix {@code +acronym}.
     * Abbreviations without a full form reference are skipped.
     *
     * @param jCas the CAS being processed
     * @throws AnalysisEngineProcessException if the acronym annotation cannot be created; the
     *         original exception is attached as cause
     */
    private void annotateAcronymsWithFullFormEntity(JCas jCas) throws AnalysisEngineProcessException {
        FSIterator<Annotation> abbreviations = jCas.getJFSIndexRepository().getAnnotationIndex(Abbreviation.type).iterator();
        while (abbreviations.hasNext()) {
            Abbreviation abbreviation = (Abbreviation) abbreviations.next();
            de.julielab.jcore.types.Annotation textReference = abbreviation.getTextReference();
            LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbreviation.getCoveredText());
            if (textReference == null) {
                // Without a full form nothing can be inferred; previously this ended in a NullPointerException.
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - abbreviation has no full form reference, skipping");
                continue;
            }

            ConceptMention fullFormMention = null;
            List<ConceptMention> includedAnnotations = JCoReAnnotationTools.getIncludedAnnotations(jCas, textReference, ConceptMention.class);
            if (includedAnnotations.size() == 1) {
                fullFormMention = (ConceptMention) includedAnnotations.get(0);
                LOGGER.debug("Found a single ConceptMention included in the full form: {}", fullFormMention.getCoveredText());
            } else if (includedAnnotations.size() > 1) {
                int longestLength = -1;
                for (ConceptMention candidate : includedAnnotations) {
                    int length = candidate.getEnd() - candidate.getBegin();
                    // Strictly greater: on equal length the earlier mention wins.
                    if (length > longestLength) {
                        fullFormMention = candidate;
                        longestLength = length;
                    }
                }
                LOGGER.debug("Found multiple ConceptMentions included in the full form \"{}\", returning the longest.", textReference.getCoveredText());
                if (LOGGER.isTraceEnabled()) {
                    LOGGER.trace("All found ConceptMentions:");
                    for (ConceptMention candidate : includedAnnotations) {
                        LOGGER.trace("Text: {}; offsets: {}-{}", new Object[]{candidate.getCoveredText(), Integer.valueOf(candidate.getBegin()), Integer.valueOf(candidate.getEnd())});
                    }
                }
            } else {
                LOGGER.debug("No ConceptMention in the span of acronym fullform \"{}\" found.", textReference.getCoveredText());
            }
            if (fullFormMention == null) {
                continue;
            }

            String fullFormType = fullFormMention.getClass().getCanonicalName();
            if (fullFormType == null || !fullFormType.equals(this.outputType)) {
                continue;
            }

            ConceptMention acronymMention = AnnotationRetrieval.getMatchingAnnotation(jCas, abbreviation, ConceptMention.class);
            boolean acronymHasSameType = acronymMention != null
                    && acronymMention.getClass().getName().equals(fullFormMention.getClass().getName());
            boolean createdByThisComponent = COMPONENT_ID.equals(fullFormMention.getComponentId());

            if (createdByThisComponent && !acronymHasSameType) {
                try {
                    LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" + abbreviation.getCoveredText() + " [begin=" + abbreviation.getBegin() + "; end=" + abbreviation.getEnd() + "]) has ConceptMention: " + fullFormMention.toString());
                    ConceptMention acronymAnnotation = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(jCas, this.outputType);
                    acronymAnnotation.setBegin(abbreviation.getBegin());
                    acronymAnnotation.setEnd(abbreviation.getEnd());
                    acronymAnnotation.setTextualRepresentation(acronymAnnotation.getCoveredText());
                    acronymAnnotation.setSpecificType(fullFormMention.getSpecificType());
                    acronymAnnotation.setComponentId(COMPONENT_ID + "+acronym");
                    acronymAnnotation.setConfidence(String.valueOf(fullFormMention.getConfidence()));
                    acronymAnnotation.addToIndexes();
                } catch (Exception e) {
                    LOGGER.error("process() - could not generate output type: " + e.getMessage(), e);
                    // Same message key as before, but the cause is no longer dropped.
                    throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null, e);
                }
            } else if (acronymMention == null) {
                // NOTE(review): the text says "!= null" although this branch runs when no mention
                // matches the abbreviation; message kept as is, confirm the intended wording.
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null");
            } else if (acronymHasSameType) {
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" + acronymMention.getClass().getCanonicalName() + " == emFullformType=" + fullFormMention.getClass().getCanonicalName());
            }
        }
    }
}
