package de.datexis.ner;

import de.datexis.annotator.Annotator;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Annotation;
import de.datexis.model.Document;
import de.datexis.model.Token;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import net.amygdalum.stringsearchalgorithms.search.MatchOption;
import net.amygdalum.stringsearchalgorithms.search.StringFinderOption;
import net.amygdalum.stringsearchalgorithms.search.StringMatch;
import net.amygdalum.stringsearchalgorithms.search.chars.SetBackwardOracleMatching;
import net.amygdalum.stringsearchalgorithms.search.chars.StringSearchAlgorithm;
import net.amygdalum.util.io.StringCharProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/ner/MatchingAnnotator.class */
/**
 * Annotator that marks mentions in documents by looking up a dictionary of
 * terms in the document text.
 *
 * <p>Terms are loaded via the {@code loadTermsToMatch} methods, normalized
 * according to the configured {@link MatchingStrategy} and compiled into a
 * {@link SetBackwardOracleMatching} search structure. Every time the term list
 * changes, the search structure is rebuilt from scratch.
 */
public class MatchingAnnotator extends Annotator {
    protected static final Logger log = LoggerFactory.getLogger(MatchingAnnotator.class);

    /**
     * All-uppercase words (see {@link #uppercaseMatcher}) are only lowercased by
     * {@link #convertToLowercase(String)} if they have at least this many characters.
     */
    private static final int MIN_UPPERCASE_WORD_LENGTH_TO_LOWERCASE = 8;

    /** Terms shorter than this are dropped by every strategy except CASE_SENSITIVE. */
    protected int minimumWordLength = 3;
    /** Type that is set on every created {@link MentionAnnotation}. */
    protected String type = "GENERIC";
    /** Selects the words that {@link #convertToLowercase(String)} considers for lowercasing. */
    protected Pattern wordLengthMatcher = Pattern.compile("\\b\\w{4,}\\b");
    /** Recognizes words that consist only of uppercase ASCII letters and digits. */
    protected Pattern uppercaseMatcher = Pattern.compile("^[A-Z0-9]+$");
    /** Search structure over {@link #terms}; {@code null} until terms were loaded or cleared once. */
    protected StringSearchAlgorithm stringSearch;
    protected WordHelpers wordHelpers = new WordHelpers(WordHelpers.Language.EN);
    /** The converted terms that make up the dictionary. */
    Collection<String> terms = new ArrayList<>();
    protected MatchingStrategy matchingStrategy = MatchingStrategy.CASE_SENSITIVE;
    /** Default annotation source used by {@link #annotate(Collection)}. */
    protected Annotation.Source source = Annotation.Source.SILVER;

    /** Determines how terms (and, for LOWERCASE, document text) are normalized before matching. */
    public enum MatchingStrategy {
        CASE_SENSITIVE,
        LOWERCASE,
        LEMMA,
        SKIP_STOPWORDS
    }

    /** Creates a case-sensitive annotator that produces SILVER annotations. */
    public MatchingAnnotator() {
        this(MatchingStrategy.CASE_SENSITIVE, Annotation.Source.SILVER);
    }

    /**
     * Creates an annotator that produces SILVER annotations.
     *
     * @param matchingStrategy the strategy used to normalize terms
     */
    public MatchingAnnotator(MatchingStrategy matchingStrategy) {
        this(matchingStrategy, Annotation.Source.SILVER);
    }

    /**
     * @param matchingStrategy the strategy used to normalize terms
     * @param source           the source assigned to annotations created by {@link #annotate(Collection)}
     */
    public MatchingAnnotator(MatchingStrategy matchingStrategy, Annotation.Source source) {
        this.matchingStrategy = matchingStrategy;
        this.source = source;
    }

    /**
     * @param matchingStrategy the strategy used to normalize terms
     * @param source           the source assigned to annotations created by {@link #annotate(Collection)}
     * @param type             the type set on every created mention
     */
    public MatchingAnnotator(MatchingStrategy matchingStrategy, Annotation.Source source, String type) {
        this(matchingStrategy, source);
        this.type = type;
    }

    /**
     * @param matchingStrategy  the strategy used to normalize terms
     * @param source            the source assigned to annotations created by {@link #annotate(Collection)}
     * @param type              the type set on every created mention
     * @param minimumWordLength minimum length a term must have to be kept (not applied for CASE_SENSITIVE)
     */
    public MatchingAnnotator(MatchingStrategy matchingStrategy, Annotation.Source source, String type, int minimumWordLength) {
        this(matchingStrategy, source, type);
        this.minimumWordLength = minimumWordLength;
    }

    /**
     * Normalizes and de-duplicates a stream of raw terms according to the
     * configured {@link MatchingStrategy}.
     *
     * <ul>
     *   <li>LOWERCASE: drops short terms, then applies {@link #convertToLowercase(String)}.</li>
     *   <li>LEMMA: like LOWERCASE, followed by {@link #removePlurals(String)}.</li>
     *   <li>SKIP_STOPWORDS: drops short terms and stop words, keeps the original casing.</li>
     *   <li>otherwise: keeps all terms unchanged.</li>
     * </ul>
     *
     * @param stream the raw terms; consumed by this call
     * @return the distinct converted terms
     */
    protected Collection<String> convertTerms(Stream<String> stream) {
        switch (this.matchingStrategy) {
            case LOWERCASE:
                return stream
                        .filter(this::hasMinimumLength)
                        .map(this::convertToLowercase)
                        .distinct()
                        .collect(Collectors.toList());
            case LEMMA:
                return stream
                        .filter(this::hasMinimumLength)
                        .map(term -> removePlurals(convertToLowercase(term)))
                        .distinct()
                        .collect(Collectors.toList());
            case SKIP_STOPWORDS:
                return stream
                        .filter(term -> hasMinimumLength(term) && !this.wordHelpers.isStopWord(term))
                        .distinct()
                        .collect(Collectors.toList());
            default:
                return stream.distinct().collect(Collectors.toList());
        }
    }

    /** Returns whether the term is at least {@link #minimumWordLength} characters long. */
    private boolean hasMinimumLength(String term) {
        return term.length() >= this.minimumWordLength;
    }

    /** Rebuilds the search structure from the current content of {@link #terms}. */
    private void rebuildSearch() {
        this.stringSearch = new SetBackwardOracleMatching(this.terms);
    }

    /** Opens the resource as a buffered UTF-8 reader; the caller is responsible for closing it. */
    private static BufferedReader openReader(Resource resource) throws IOException {
        return new BufferedReader(new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8));
    }

    /** Removes all terms and rebuilds the search structure over the empty dictionary. */
    public void clearTermsToMatch() {
        this.terms.clear();
        rebuildSearch();
    }

    /**
     * Adds the given terms to the dictionary.
     *
     * @param collection the raw terms to add
     */
    public void loadTermsToMatch(Collection<String> collection) {
        loadTermsToMatch(collection.stream());
    }

    /**
     * Converts the given terms with {@link #convertTerms(Stream)}, adds them to
     * the dictionary and rebuilds the search structure.
     *
     * @param stream the raw terms to add; consumed by this call
     */
    public void loadTermsToMatch(Stream<String> stream) {
        this.terms.addAll(convertTerms(stream));
        log.info("Rebuildung dictionary with {} distinct terms", this.terms.size());
        rebuildSearch();
    }

    /**
     * Loads terms from a file (one term per line, read as UTF-8) or from every
     * regular file below a directory.
     *
     * <p>When walking a directory, a file that fails to load is logged and
     * skipped so that the remaining files are still processed.
     *
     * @param resource a file or directory
     * @throws FileNotFoundException if the resource is neither a directory nor a file
     * @throws IOException           if the directory cannot be walked or the file cannot be read
     */
    public void loadTermsToMatch(Resource resource) throws IOException {
        if (resource.isDirectory()) {
            // Files.walk holds open directory handles until the stream is closed.
            try (Stream<Path> paths = Files.walk(resource.getPath())) {
                paths.filter(path -> Files.isRegularFile(path, LinkOption.NOFOLLOW_LINKS))
                        .forEach(path -> {
                            try {
                                loadTermsToMatch(Resource.fromFile(path.toString()));
                            } catch (IOException e) {
                                // best effort: keep loading the remaining files
                                log.error("Could not load terms from {}", path, e);
                            }
                        });
            }
            return;
        }
        if (!resource.isFile()) {
            throw new FileNotFoundException("cannot open path: " + resource.toString());
        }
        try (BufferedReader reader = openReader(resource)) {
            loadTermsToMatch(reader.lines());
        }
    }

    /**
     * Removes the given terms from the dictionary.
     *
     * @param collection the raw terms to remove
     */
    public void deleteTermsToMatch(Collection<String> collection) {
        deleteTermsToMatch(collection.stream());
    }

    /**
     * Converts the given terms with {@link #convertTerms(Stream)}, removes them
     * from the dictionary and rebuilds the search structure.
     *
     * @param stream the raw terms to remove; consumed by this call
     */
    public void deleteTermsToMatch(Stream<String> stream) {
        this.terms.removeAll(convertTerms(stream));
        log.info("Rebuildung dictionary with {} distinct terms", this.terms.size());
        rebuildSearch();
    }

    /**
     * Removes all terms listed in the given resource (one term per line, read as UTF-8).
     *
     * @param resource the resource to read the terms from
     * @throws IOException if the resource cannot be read
     */
    public void deleteTermsToMatch(Resource resource) throws IOException {
        try (BufferedReader reader = openReader(resource)) {
            deleteTermsToMatch(reader.lines());
        }
    }

    /** Returns the number of terms currently held in the dictionary. */
    public int countTerms() {
        return this.terms.size();
    }

    /**
     * Lowercases the words of a string that are selected by
     * {@link #wordLengthMatcher}, except for words that consist only of
     * uppercase letters and digits and are shorter than 8 characters. All other
     * text is copied unchanged.
     *
     * @param str the text to convert
     * @return the converted text
     */
    protected String convertToLowercase(String str) {
        Matcher matcher = this.wordLengthMatcher.matcher(str);
        // Matcher.appendReplacement only accepts a StringBuffer before Java 9
        StringBuffer converted = new StringBuffer();
        while (matcher.find()) {
            String word = matcher.group();
            boolean keepUppercase = this.uppercaseMatcher.matcher(word).matches()
                    && word.length() < MIN_UPPERCASE_WORD_LENGTH_TO_LOWERCASE;
            if (!keepUppercase) {
                // Locale.ROOT keeps the result independent of the JVM default locale;
                // quoteReplacement guards against '$' and '\' if a subclass swaps the pattern.
                matcher.appendReplacement(converted, Matcher.quoteReplacement(word.toLowerCase(Locale.ROOT)));
            }
            // skipped words are copied verbatim by the next appendReplacement/appendTail
        }
        matcher.appendTail(converted);
        return converted.toString();
    }

    /**
     * Placeholder for the LEMMA strategy.
     *
     * @throws UnsupportedOperationException always
     */
    protected String removePlurals(String str) {
        throw new UnsupportedOperationException("Lemma matching is not yet implemented.");
    }

    /**
     * Annotates the documents using the source configured for this annotator.
     *
     * @param collection the documents to annotate
     */
    public void annotate(Collection<Document> collection) {
        annotate(collection, this.source);
    }

    /**
     * Searches every document for dictionary terms (longest, non-overlapping
     * matches) and adds a {@link MentionAnnotation} for each match that starts
     * and ends exactly at token boundaries.
     *
     * <p>With the LOWERCASE strategy the text is passed through
     * {@link #convertToLowercase(String)} before searching; the match offsets
     * are then applied to the tokens of the original document. If no terms have
     * been loaded, a warning is logged and nothing is annotated.
     *
     * @param iterable the documents to annotate
     * @param source   the source assigned to the created annotations
     */
    public void annotate(Iterable<Document> iterable, Annotation.Source source) {
        if (this.stringSearch == null) {
            log.warn("MatchingAnnotator called without terms loaded");
            return;
        }
        boolean lowercase = this.matchingStrategy == MatchingStrategy.LOWERCASE;
        for (Document document : iterable) {
            String text = lowercase ? convertToLowercase(document.getText()) : document.getText();
            StringCharProvider chars = new StringCharProvider(text, 0);
            List<StringMatch> matches = this.stringSearch
                    .createFinder(chars, MatchOption.LONGEST_MATCH, MatchOption.NON_OVERLAP)
                    .findAll();
            for (StringMatch match : matches) {
                int begin = (int) match.start();
                int end = (int) match.end();
                List<Token> tokens = document.streamTokensInRange(begin, end, true).collect(Collectors.toList());
                if (spanIsAtTokenBoundaries(tokens, begin, end)) {
                    MentionAnnotation mention = new MentionAnnotation(source, tokens);
                    mention.setType(this.type);
                    document.addAnnotation(mention);
                }
            }
        }
    }

    /**
     * Returns whether the span [begin, end) is covered exactly by the given
     * tokens, i.e. the first token begins at {@code begin} and the last token
     * ends at {@code end}.
     */
    private boolean spanIsAtTokenBoundaries(List<Token> tokens, int begin, int end) {
        if (tokens.isEmpty()) {
            return false;
        }
        Token first = tokens.get(0);
        Token last = tokens.get(tokens.size() - 1);
        return first.getBegin() == begin && last.getEnd() == end;
    }
}
