package org.bigml.mimir.nlp.tokenization;

import java.io.IOException;
import java.lang.Character;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/* loaded from: input_file:org/bigml/mimir/nlp/tokenization/ChineseTokenStream.class */
/**
 * Token stream for text that may mix Han and non-Han characters.
 *
 * <p>The document is cut into segments by {@link #moveMarkers()}. A segment
 * made of two or more Han characters is handed to a Lucene
 * {@link SmartChineseAnalyzer} and its terms are returned one at a time; any
 * other segment is returned as-is.
 *
 * <p>NOTE(review): all script checks operate on single UTF-16 {@code char}s, so
 * Han characters outside the BMP (surrogate pairs) are not recognised as Han
 * or as letters here - confirm whether that matters for callers.
 */
public class ChineseTokenStream extends TokenStream {
    protected static Character.UnicodeScript HAN = Character.UnicodeScript.HAN;
    protected static Character.UnicodeScript CMN = Character.UnicodeScript.COMMON;
    protected static Class<CharTermAttribute> TERM = CharTermAttribute.class;

    /** Upper bound on the length (in chars) of a non-Han segment. */
    private static final int MAX_NON_HAN_LENGTH = 32;

    /** Analyzer used to segment runs of Han characters. */
    private final Analyzer analyzer;

    /** Lucene stream over the Han segment being consumed, or null when none is open. */
    private org.apache.lucene.analysis.TokenStream stream;

    /**
     * Number of terms returned from the analyzer stream since the last call to
     * {@link #moveMarkers()} made by {@link #getNext()}.
     */
    protected int stringsFromStream;

    /**
     * Creates a stream over the given text.
     *
     * @param str passed unchanged to the superclass constructor
     * @param z   passed unchanged to the superclass constructor
     */
    public ChineseTokenStream(String str, boolean z) {
        super(str, z);
        // 'false' is SmartChineseAnalyzer's useDefaultStopWords flag.
        this.analyzer = new SmartChineseAnalyzer(false);
        this.stringsFromStream = 0;
    }

    /**
     * Opens and resets an analyzer stream over the given Han segment.
     *
     * @throws IllegalStateException if the stream cannot be opened or reset
     */
    private void setChineseString(String str) {
        try {
            this.stream = this.analyzer.tokenStream((String) null, str);
            this.stream.reset();
        } catch (IOException e) {
            // Do not leave a half-initialised stream behind: the analyzer
            // expects every stream it hands out to be closed.
            releaseStream(e);
            throw new IllegalStateException(e);
        }
    }

    /**
     * Closes the current analyzer stream (if any) and clears the field.
     *
     * @param pending an exception already being propagated, or null; when
     *                non-null a failure to close is attached to it as a
     *                suppressed exception instead of being thrown
     * @throws IllegalStateException if closing fails and nothing is pending
     */
    private void releaseStream(IOException pending) {
        org.apache.lucene.analysis.TokenStream finished = this.stream;
        this.stream = null;
        if (finished == null) {
            return;
        }
        try {
            finished.close();
        } catch (IOException e) {
            if (pending == null) {
                throw new IllegalStateException(e);
            }
            pending.addSuppressed(e);
        }
    }

    /**
     * Pulls the next usable term from the open analyzer stream.
     *
     * <p>Terms that are empty or whose first character is not a letter or
     * digit are skipped. When the stream is exhausted it is ended, closed and
     * cleared.
     *
     * @return the next term, or null if no stream is open or it is exhausted
     * @throws IllegalStateException if the underlying stream fails
     */
    private String nextStreamTerm() {
        if (this.stream == null) {
            return null;
        }
        try {
            while (this.stream.incrementToken()) {
                String term = this.stream.getAttribute(TERM).toString();
                if (!term.isEmpty() && Character.isLetterOrDigit(term.charAt(0))) {
                    return term;
                }
            }
            // Lucene's consumer workflow requires end() before close().
            this.stream.end();
        } catch (IOException e) {
            releaseStream(e);
            throw new IllegalStateException(e);
        }
        releaseStream(null);
        return null;
    }

    /**
     * Returns the next term; delegates to {@link #getNext()}.
     */
    @Override // org.bigml.mimir.nlp.tokenization.TokenStream
    public String nextTerm() {
        return getNext();
    }

    /**
     * Returns the next term of the document.
     *
     * <p>If an analyzer stream is open, its next usable term is returned and
     * {@link #stringsFromStream} is incremented. Otherwise the markers are
     * advanced to the next segment and {@link #stringsFromStream} is reset to
     * zero; a segment of two or more Han characters is fed to the analyzer,
     * any other segment is returned directly.
     *
     * @return the next term, or null when the document is exhausted
     * @throws IllegalStateException if the analyzer stream fails
     */
    public String getNext() {
        String term = nextStreamTerm();
        if (term != null) {
            this.stringsFromStream++;
            return term;
        }

        String segment = moveMarkers();
        this.stringsFromStream = 0;
        if (segment == null) {
            return null;
        }

        boolean startsWithHan = Character.UnicodeScript.of(segment.charAt(0)) == HAN;
        if (!startsWithHan || segment.length() <= 1) {
            return segment;
        }

        setChineseString(segment);
        // Kept as a call to getNext() (not a loop) so overriding subclasses
        // see the same dispatch as before.
        return getNext();
    }

    /**
     * Tells whether the given UTF-16 char belongs to the Han script.
     */
    public static boolean chineseChar(char c) {
        return Character.UnicodeScript.of(c) == HAN;
    }

    /**
     * Advances {@code begin}/{@code end} to delimit the next segment.
     *
     * <p>Characters that are neither letters nor digits are skipped first.
     * A segment starting with a Han character extends over the following Han
     * characters. Any other segment extends over non-Han letters, digits and
     * apostrophes, up to {@value #MAX_NON_HAN_LENGTH} chars in total.
     *
     * @return the segment text, or null when no letter or digit remains
     */
    @Override // org.bigml.mimir.nlp.tokenization.TokenStream
    public String moveMarkers() {
        this.begin = this.end;
        while (this.begin < this.docLength && !Character.isLetterOrDigit(this.document[this.begin])) {
            this.begin++;
        }
        this.end = this.begin + 1;

        if (this.begin >= this.docLength) {
            return null;
        }

        if (chineseChar(this.document[this.begin])) {
            while (this.end < this.docLength && chineseChar(this.document[this.end])) {
                this.end++;
            }
        } else {
            while (this.end < this.docLength
                    && this.end - this.begin < MAX_NON_HAN_LENGTH
                    && !chineseChar(this.document[this.end])
                    && (Character.isLetterOrDigit(this.document[this.end]) || this.document[this.end] == '\'')) {
                this.end++;
            }
        }
        return new String(this.document, this.begin, this.end - this.begin);
    }
}
