package org.bigml.mimir.nlp.tokenization;

import com.fasterxml.jackson.databind.JsonNode;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.bigml.mimir.nlp.tokenization.DictionaryTokenStream;
import org.bigml.mimir.utils.Json;
import org.trie4j.doublearray.DoubleArray;
import org.trie4j.patricia.PatriciaTrie;

/* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory.class */
public abstract class TokenStreamFactory {
    /** Token mode for which {@code setTokenMode} turns on full-term emission. */
    public static final String ALL_MODE = "all";
    /**
     * Token mode for which the builders select a factory whose stream is {@code null};
     * {@code setTokenMode} also turns on full-term emission for it.
     */
    public static final String FULL_TERMS_MODE = "full_terms_only";
    /** Token mode for which the builders select the character n-gram factory. */
    public static final String CHARACTER_MODE = "characters";

    // NOTE(review): this class does not implement Serializable although its nested
    // subclasses do. Under standard Java serialization the instance fields below are
    // therefore not written with a subclass instance and are re-initialized to the
    // defaults shown here on deserialization - confirm whether that is intended.

    /** N-gram size handed to the n-gram streams; stays at -1 unless the six-argument getBuilder assigns it. */
    protected int _gram = -1;
    /** Case-sensitivity flag passed to every stream; when false, getTokenList lower-cases the full term. */
    protected boolean _case = false;
    /** When true, getTokenList may append the whole input string as an extra token. */
    protected boolean _fullTerm = false;
    /** Stop words passed to the bigram and n-gram streams; null when none were supplied. */
    protected HashSet<String> _stopWords = null;
    /** Compiled patterns passed to the regex stream; null when no patterns were supplied. */
    protected Pattern[] _patterns = null;
    /** Delimiter passed to the delimited stream. */
    protected String _delimiter = null;
    /** Inputs of this length or longer are never appended as a full term by getTokenList. */
    private static final int _MAX_FULL_TERM_LENGTH = 256;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$BigramStreamFactory.class */
    /**
     * Factory producing {@link BigramTokenStream}s configured with this factory's
     * case flag and stop words.
     */
    public static class BigramStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private BigramStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new bigram stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            final HashSet<String> stopWords = _stopWords;
            return new BigramTokenStream(str, caseSensitive, stopWords);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$CharNGramStreamFactory.class */
    /**
     * Factory producing {@link CharNGramTokenStream}s configured with this factory's
     * case flag and n-gram size.
     */
    public static class CharNGramStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private CharNGramStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new character n-gram stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            // The stream's constructor is given the case flag in boxed form.
            final Boolean caseSensitive = Boolean.valueOf(_case);
            return new CharNGramTokenStream(str, caseSensitive, _gram);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$ChineseNGramStreamFactory.class */
    /**
     * Factory producing {@link ChineseNGramTokenStream}s configured with this
     * factory's case flag.
     */
    public static class ChineseNGramStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private ChineseNGramStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new Chinese n-gram stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            return new ChineseNGramTokenStream(str, caseSensitive);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$ChineseStreamFactory.class */
    /**
     * Factory producing {@link ChineseTokenStream}s configured with this factory's
     * case flag.
     */
    public static class ChineseStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private ChineseStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new Chinese token stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            return new ChineseTokenStream(str, caseSensitive);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$CombinationStreamFactory.class */
    /**
     * Factory that asks two other factories for a stream over the same text and
     * wraps both results in a {@link CombinationTokenStream}.
     */
    public static class CombinationStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        private TokenStreamFactory _sf1;
        private TokenStreamFactory _sf2;

        /**
         * @param tokenStreamFactory  factory supplying the first stream
         * @param tokenStreamFactory2 factory supplying the second stream
         */
        public CombinationStreamFactory(TokenStreamFactory tokenStreamFactory, TokenStreamFactory tokenStreamFactory2) {
            _sf1 = tokenStreamFactory;
            _sf2 = tokenStreamFactory2;
        }

        /**
         * @param str the text to tokenize
         * @return a combination stream built from both delegates' streams, first delegate first
         */
        @Override
        public TokenStream getStream(String str) {
            // Evaluation order matters only in that the first delegate is queried first.
            final TokenStream first = _sf1.getStream(str);
            final TokenStream second = _sf2.getStream(str);
            return new CombinationTokenStream(first, second);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$DelimitedStreamFactory.class */
    /**
     * Factory producing {@link DelimitedTokenStream}s configured with this
     * factory's case flag and delimiter.
     */
    public static class DelimitedStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private DelimitedStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new delimited stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            final String delimiter = _delimiter;
            return new DelimitedTokenStream(str, caseSensitive, delimiter);
        }
    }

    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$DictionaryStreamFactory.class */
    /**
     * Factory producing {@link DictionaryTokenStream}s that share a single trie
     * built once from the dictionary terms given at construction time.
     */
    private static class DictionaryStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Root of the term trie; populated by the constructor and handed to every stream. */
        private final DictionaryTokenStream.TrieNode _rootNode = new DictionaryTokenStream.TrieNode();

        /**
         * Inserts every term of {@code iterable} into the trie.
         *
         * @param iterable the dictionary terms; a {@code null} iterable is treated as an
         *                 empty dictionary and {@code null} entries are skipped, instead
         *                 of failing with a {@link NullPointerException}
         */
        public DictionaryStreamFactory(Iterable<String> iterable) {
            if (iterable == null) {
                // The three-argument getBuilder forwards its term set unchecked.
                return;
            }
            for (String term : iterable) {
                if (term != null) {
                    _rootNode.insert(term.toCharArray(), 0);
                }
            }
        }

        /**
         * @param str the text to tokenize
         * @return a new dictionary stream over {@code str} backed by the shared trie
         */
        @Override
        public TokenStream getStream(String str) {
            return new DictionaryTokenStream(str, _case, _rootNode);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$JapaneseStreamFactory.class */
    /**
     * Factory producing {@link JapaneseTokenStream}s configured with this factory's
     * case flag.
     */
    public static class JapaneseStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private JapaneseStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new Japanese token stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            return new JapaneseTokenStream(str, caseSensitive);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$LetterNumberStreamFactory.class */
    /**
     * Factory producing {@link LetterNumberTokenStream}s configured with this
     * factory's case flag.
     */
    public static class LetterNumberStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private LetterNumberStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new letter/number stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            return new LetterNumberTokenStream(str, caseSensitive);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$NGramStreamFactory.class */
    /**
     * Factory producing {@link NGramTokenStream}s configured with this factory's
     * case flag, stop words and n-gram size.
     */
    public static class NGramStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private NGramStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new n-gram stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            final int gramSize = _gram;
            return new NGramTokenStream(str, caseSensitive, _stopWords, gramSize);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$NullStreamFactory.class */
    /**
     * Factory that never produces a stream: {@link #getStream(String)} always
     * yields {@code null}.
     */
    public static class NullStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private NullStreamFactory() {
        }

        /**
         * @param str ignored
         * @return always {@code null}
         */
        @Override
        public TokenStream getStream(String str) {
            final TokenStream none = null;
            return none;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenStreamFactory$RegexStreamFactory.class */
    /**
     * Factory producing {@link RegexTokenStream}s configured with this factory's
     * case flag and compiled patterns.
     */
    public static class RegexStreamFactory extends TokenStreamFactory implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Private: instances are created from within {@code TokenStreamFactory}. */
        private RegexStreamFactory() {
        }

        /**
         * @param str the text to tokenize
         * @return a new regex stream over {@code str}
         */
        @Override
        public TokenStream getStream(String str) {
            final boolean caseSensitive = _case;
            final Pattern[] patterns = _patterns;
            return new RegexTokenStream(str, caseSensitive, patterns);
        }
    }

    /**
     * Creates a token stream over the given text using this factory's configuration.
     *
     * @param str the text to tokenize
     * @return the stream, or {@code null} when the implementation produces none
     *         (as {@code NullStreamFactory} does); {@code getTokenList} treats a
     *         {@code null} stream as having no tokens
     */
    public abstract TokenStream getStream(String str);

    /**
     * Builds a tokenizer factory for a text field from its JSON description.
     *
     * <p>Settings are read from the {@code "term_analysis"} child when present, otherwise
     * from {@code jsonNode} itself: {@code "language"} (default {@code "none"}),
     * {@code "token_mode"} (default {@code "tokens_only"}), {@code "case_sensitive"}
     * (default {@code false}) and the n-gram size, taken from {@code "ngrams"} or, failing
     * that, 2 when {@code "bigrams"} is true, else 1.
     *
     * <p>For language {@code "zh"} the first element of each {@code "tag_cloud"} entry of
     * {@code jsonNode} is collected and passed on as the term set. A missing
     * {@code "tag_cloud"} or a malformed entry is now skipped instead of raising a
     * {@link NullPointerException}.
     *
     * @param jsonNode the field description
     * @return the configured factory
     */
    public static TokenStreamFactory getTextBuilder(JsonNode jsonNode) {
        JsonNode analysis = jsonNode.has("term_analysis") ? jsonNode.get("term_analysis") : jsonNode;

        String language = Json.getString(analysis, "language", "none");
        String tokenMode = Json.getString(analysis, "token_mode", "tokens_only");
        boolean caseSensitive = Json.getBoolean(analysis, "case_sensitive", false);

        int gram = 1;
        if (analysis.has("ngrams")) {
            gram = analysis.get("ngrams").asInt();
        } else if (analysis.has("bigrams") && analysis.get("bigrams").asBoolean()) {
            gram = 2;
        }

        Set<String> terms = null;
        if ("zh".equals(language)) {
            // The tag cloud is read from the outer node, not from "term_analysis".
            JsonNode tagCloud = jsonNode.get("tag_cloud");
            if (tagCloud != null) {
                terms = new HashSet<>(tagCloud.size());
                for (int idx = 0; idx < tagCloud.size(); idx++) {
                    JsonNode entry = tagCloud.get(idx);
                    JsonNode term = entry == null ? null : entry.get(0);
                    if (term != null) {
                        terms.add(term.asText());
                    }
                }
            }
        }
        return getBuilder(language, tokenMode, gram, caseSensitive, terms, null);
    }

    /**
     * Builds a delimiter-based tokenizer factory for an items field.
     *
     * <p>The delimiter is the text of {@code item_analysis.separator} in the given
     * description, and the case flag is always set to {@code true}.
     *
     * @param jsonNode the field description; must contain {@code item_analysis.separator}
     * @return the configured delimited factory
     */
    public static TokenStreamFactory getItemsBuilder(JsonNode jsonNode) {
        JsonNode itemAnalysis = jsonNode.get("item_analysis");
        String separator = itemAnalysis.get("separator").asText();

        DelimitedStreamFactory factory = new DelimitedStreamFactory();
        factory._case = true;
        factory._delimiter = separator;
        return factory;
    }

    /**
     * Builds a tokenizer factory for a field according to its {@code "optype"}.
     *
     * <p>A missing {@code "optype"} is treated as {@code "text"}.
     *
     * @param jsonNode the field description
     * @return an items factory for optype {@code "items"}, a text factory for {@code "text"}
     * @throws IllegalArgumentException for any other optype
     */
    public static TokenStreamFactory getBuilder(JsonNode jsonNode) {
        String optype = "text";
        if (jsonNode.has("optype")) {
            optype = jsonNode.get("optype").asText();
        }

        switch (optype) {
            case "items":
                return getItemsBuilder(jsonNode);
            case "text":
                return getTextBuilder(jsonNode);
            default:
                throw new IllegalArgumentException("Cannot tokenize optype '" + optype + "'!");
        }
    }

    /**
     * Builds a tokenizer factory from a token mode and a dictionary of terms.
     *
     * <p>{@link #CHARACTER_MODE} selects the character n-gram factory,
     * {@link #FULL_TERMS_MODE} selects the factory that yields no stream, and any other
     * mode selects a dictionary factory built from {@code set}. Only the case flag is
     * configured here; the n-gram size and full-term flag keep their field defaults.
     *
     * @param str the token mode
     * @param z   the case flag to store on the factory
     * @param set the dictionary terms, used only for the dictionary factory
     * @return the configured factory
     */
    public static TokenStreamFactory getBuilder(String str, boolean z, Set<String> set) {
        TokenStreamFactory factory;
        if (str.equals(CHARACTER_MODE)) {
            factory = new CharNGramStreamFactory();
        } else if (str.equals(FULL_TERMS_MODE)) {
            factory = new NullStreamFactory();
        } else {
            factory = new DictionaryStreamFactory(set);
        }
        factory._case = z;
        return factory;
    }

    /**
     * Builds a tokenizer factory from explicit settings.
     *
     * <p>Factory selection, in priority order:
     * <ol>
     *   <li>token mode {@link #CHARACTER_MODE}: character n-grams;</li>
     *   <li>token mode {@link #FULL_TERMS_MODE}: the factory that yields no stream;</li>
     *   <li>language {@code "zh"}: Chinese n-grams when {@code i > 1}, else plain Chinese;</li>
     *   <li>language {@code "ja"}: Japanese;</li>
     *   <li>otherwise bigrams for {@code i == 2}, n-grams for {@code i > 2}, and
     *       letter/number tokens for anything smaller.</li>
     * </ol>
     *
     * <p>Stop words are copied onto the factory unless the language is {@code "zh"}.
     * Without regex patterns the token mode is applied to the factory and it is returned
     * directly. With patterns, the factory is combined with a regex factory.
     * NOTE(review): in that case the token mode is not applied to either factory or to
     * the combination - confirm this is intended.
     *
     * @param str  the language code; may be {@code null}
     * @param str2 the token mode
     * @param i    the n-gram size
     * @param z    the case flag
     * @param set  the stop words; may be {@code null}
     * @param list regular expressions to compile; may be {@code null}
     * @return the configured factory
     */
    public static TokenStreamFactory getBuilder(String str, String str2, int i, boolean z, Set<String> set, List<String> list) {
        final boolean chinese = "zh".equals(str);

        TokenStreamFactory base;
        if (str2.equals(CHARACTER_MODE)) {
            base = new CharNGramStreamFactory();
        } else if (str2.equals(FULL_TERMS_MODE)) {
            base = new NullStreamFactory();
        } else if (chinese) {
            if (i > 1) {
                base = new ChineseNGramStreamFactory();
            } else {
                base = new ChineseStreamFactory();
            }
        } else if ("ja".equals(str)) {
            base = new JapaneseStreamFactory();
        } else if (i == 2) {
            base = new BigramStreamFactory();
        } else if (i > 2) {
            base = new NGramStreamFactory();
        } else {
            base = new LetterNumberStreamFactory();
        }

        base._gram = i;
        base._case = z;
        if (set != null && !chinese) {
            // Defensive copy so later changes to the caller's set do not leak in.
            base._stopWords = new HashSet<>(set);
        }

        if (list == null) {
            base._patterns = null;
            base.setTokenMode(str2);
            return base;
        }

        Pattern[] compiled = new Pattern[list.size()];
        int slot = 0;
        for (String regex : list) {
            compiled[slot++] = Pattern.compile(regex);
        }
        RegexStreamFactory regexFactory = new RegexStreamFactory();
        regexFactory._case = z;
        regexFactory._patterns = compiled;
        return new CombinationStreamFactory(base, regexFactory);
    }

    /**
     * Converts a collection of strings into a {@link DoubleArray} trie.
     *
     * @param collection the strings to index; may be {@code null}
     * @return {@code collection} itself when it already is a {@code DoubleArray},
     *         {@code null} when it is {@code null} or empty, otherwise a new
     *         {@code DoubleArray} built from a {@link PatriciaTrie} holding every element
     */
    public static DoubleArray toDoubleArrayTrie(Collection<String> collection) {
        // instanceof is false for null, so the null check can safely come second.
        if (collection instanceof DoubleArray) {
            return (DoubleArray) collection;
        }
        if (collection == null || collection.isEmpty()) {
            return null;
        }

        PatriciaTrie source = new PatriciaTrie();
        for (String term : collection) {
            source.insert(term);
        }
        return new DoubleArray(source);
    }

    /**
     * Tokenizes the given text and returns the tokens as a list.
     *
     * <p>The tokens come from {@link #getStream(String)}; a {@code null} stream yields an
     * empty list. When full-term mode is on and the text is shorter than the maximum
     * full-term length, the whole text (lower-cased unless the factory is case sensitive)
     * is appended as an extra token, unless it is already the list's only token.
     *
     * @param str the text to tokenize; a {@code null} text never contributes a full term
     * @return the token list
     */
    public List<String> getTokenList(String str) {
        TokenStream stream = getStream(str);

        List<String> tokens;
        if (stream == null) {
            tokens = new ArrayList<>();
        } else {
            tokens = stream.toList();
        }

        // Null guard: a factory whose stream is null never dereferences str itself,
        // so this was the only place a null text could fail.
        if (this._fullTerm && str != null && str.length() < _MAX_FULL_TERM_LENGTH) {
            // NOTE(review): default-locale lower-casing is kept as-is; presumably it has
            // to match how the token streams fold case - confirm before changing.
            String fullTerm = this._case ? str : str.toLowerCase();
            boolean alreadyOnlyToken = tokens.size() == 1 && fullTerm.equals(tokens.get(0));
            if (!alreadyOnlyToken) {
                tokens.add(fullTerm);
            }
        }
        return tokens;
    }

    /**
     * Turns full-term emission on for token modes {@link #ALL_MODE} and
     * {@link #FULL_TERMS_MODE}, and off for every other mode.
     *
     * @param str the token mode
     */
    private void setTokenMode(String str) {
        final boolean wantsFullTerm = str.equals(ALL_MODE) || str.equals(FULL_TERMS_MODE);
        this._fullTerm = wantsFullTerm;
    }
}
