package cc.unitmesh.rag.splitter;

import cc.unitmesh.nlp.embedding.EncodingTokenizer;
import cc.unitmesh.nlp.embedding.OpenAiEncoding;
import java.util.ArrayList;
import java.util.List;
import kotlin.Metadata;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.text.StringsKt;
import org.antlr.v4.gui.TestRig;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;

/* compiled from: TokenTextSplitter.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��&\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n��\n\u0002\u0010\b\n\u0002\b\t\n\u0002\u0010\u000e\n��\n\u0002\u0010 \n\u0002\b\u0005\u0018��2\u00020\u0001B7\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0006\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0007\u001a\u00020\u0005\u0012\b\b\u0002\u0010\b\u001a\u00020\u0005¢\u0006\u0002\u0010\tJ\u0016\u0010\u000e\u001a\u00020\u000f2\f\u0010\u0010\u001a\b\u0012\u0004\u0012\u00020\u00050\u0011H\u0002J\u0016\u0010\u0012\u001a\b\u0012\u0004\u0012\u00020\u00050\u00112\u0006\u0010\u0013\u001a\u00020\u000fH\u0002J \u0010\u0014\u001a\b\u0012\u0004\u0012\u00020\u000f0\u00112\b\u0010\u0013\u001a\u0004\u0018\u00010\u000f2\u0006\u0010\u0004\u001a\u00020\u0005H\u0002J\u0016\u0010\u0015\u001a\b\u0012\u0004\u0012\u00020\u000f0\u00112\u0006\u0010\u0013\u001a\u00020\u000fH\u0016R\u001a\u0010\u0004\u001a\u00020\u0005X\u0094\u000e¢\u0006\u000e\n��\u001a\u0004\b\n\u0010\u000b\"\u0004\b\f\u0010\rR\u000e\u0010\u0002\u001a\u00020\u0003X\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\b\u001a\u00020\u0005X\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\u0007\u001a\u00020\u0005X\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\u0006\u001a\u00020\u0005X\u0082\u0004¢\u0006\u0002\n��¨\u0006\u0016"}, d2 = {"Lcc/unitmesh/rag/splitter/TokenTextSplitter;", "Lcc/unitmesh/rag/splitter/TextSplitter;", "encoding", "Lcc/unitmesh/nlp/embedding/EncodingTokenizer;", "chunkSize", "", "minChunkSizeChars", "minChunkLengthToEmbed", "maxNumChunks", "(Lcc/unitmesh/nlp/embedding/EncodingTokenizer;IIII)V", "getChunkSize", "()I", "setChunkSize", "(I)V", "decodeTokens", "", TestRig.LEXER_START_RULE_NAME, "", "getEncodedTokens", "text", "split", "splitText", "cocoa-core"})
@SourceDebugExtension({"SMAP\nTokenTextSplitter.kt\nKotlin\n*S Kotlin\n*F\n+ 1 TokenTextSplitter.kt\ncc/unitmesh/rag/splitter/TokenTextSplitter\n+ 2 Strings.kt\nkotlin/text/StringsKt__StringsKt\n*L\n1#1,111:1\n107#2:112\n79#2,22:113\n107#2:135\n79#2,22:136\n107#2:158\n79#2,29:159\n107#2:188\n79#2,22:189\n*S KotlinDebug\n*F\n+ 1 TokenTextSplitter.kt\ncc/unitmesh/rag/splitter/TokenTextSplitter\n*L\n52#1:112\n52#1:113,22\n66#1:135\n66#1:136,22\n82#1:158\n82#1:159,29\n94#1:188\n94#1:189,22\n*E\n"})
/* loaded from: input_file:cc/unitmesh/rag/splitter/TokenTextSplitter.class */
/*
 * Splits text into chunks by token count.
 *
 * The text is encoded with the configured EncodingTokenizer, walked in windows of at most
 * chunkSize tokens, and each window is decoded back to text. A window is cut back to its last
 * '.', '?', '!' or newline when that character sits beyond minChunkSizeChars, so chunks tend to
 * end on a sentence boundary. Chunks whose trimmed length does not exceed minChunkLengthToEmbed
 * are dropped. At most maxNumChunks windows are processed; whatever is left is emitted as one
 * final chunk.
 */
public final class TokenTextSplitter extends TextSplitter {

    // Values applied by the synthetic defaults constructor when the matching mask bit is set.
    private static final int DEFAULT_CHUNK_SIZE = 800;
    private static final int DEFAULT_MIN_CHUNK_SIZE_CHARS = 350;
    private static final int DEFAULT_MIN_CHUNK_LENGTH_TO_EMBED = 5;
    private static final int DEFAULT_MAX_NUM_CHUNKS = 10000;

    /** Tokenizer used to turn text into token ids and token ids back into text. */
    @NotNull
    private final EncodingTokenizer encoding;

    /** Maximum number of tokens decoded into a single chunk. */
    private int chunkSize;

    /** A chunk is only cut back to a sentence terminator whose index is greater than this. */
    private final int minChunkSizeChars;

    /** Chunks whose trimmed length is not greater than this are discarded. */
    private final int minChunkLengthToEmbed;

    /** Maximum number of non-blank windows processed before the rest is emitted as one chunk. */
    private final int maxNumChunks;

    /**
     * Creates a splitter with explicit settings.
     *
     * @param encoding              tokenizer used for encoding and decoding; must not be null
     * @param chunkSize             maximum number of tokens per chunk
     * @param minChunkSizeChars     minimum index a sentence terminator must exceed to be used as a cut point
     * @param minChunkLengthToEmbed chunks not longer than this (after trimming) are dropped
     * @param maxNumChunks          maximum number of windows processed by the main loop
     */
    public TokenTextSplitter(@NotNull EncodingTokenizer encoding, int chunkSize, int minChunkSizeChars,
            int minChunkLengthToEmbed, int maxNumChunks) {
        Intrinsics.checkNotNullParameter(encoding, "encoding");
        this.encoding = encoding;
        this.chunkSize = chunkSize;
        this.minChunkSizeChars = minChunkSizeChars;
        this.minChunkLengthToEmbed = minChunkLengthToEmbed;
        this.maxNumChunks = maxNumChunks;
    }

    /**
     * Kotlin default-arguments bridge. Each bit set in {@code defaultsMask} replaces the
     * corresponding argument with its default: bit 0 = encoding ({@code new OpenAiEncoding()}),
     * bit 1 = chunkSize, bit 2 = minChunkSizeChars, bit 3 = minChunkLengthToEmbed,
     * bit 4 = maxNumChunks. The marker argument is unused.
     */
    public /* synthetic */ TokenTextSplitter(EncodingTokenizer encodingTokenizer, int chunkSize, int minChunkSizeChars,
            int minChunkLengthToEmbed, int maxNumChunks, int defaultsMask,
            DefaultConstructorMarker defaultConstructorMarker) {
        this(
                (defaultsMask & 1) != 0 ? new OpenAiEncoding() : encodingTokenizer,
                (defaultsMask & 2) != 0 ? DEFAULT_CHUNK_SIZE : chunkSize,
                (defaultsMask & 4) != 0 ? DEFAULT_MIN_CHUNK_SIZE_CHARS : minChunkSizeChars,
                (defaultsMask & 8) != 0 ? DEFAULT_MIN_CHUNK_LENGTH_TO_EMBED : minChunkLengthToEmbed,
                (defaultsMask & 16) != 0 ? DEFAULT_MAX_NUM_CHUNKS : maxNumChunks);
    }

    /** Creates a splitter with every setting at its default (all five mask bits set). */
    public TokenTextSplitter() {
        this(null, 0, 0, 0, 0, 31, null);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /** Returns the maximum number of tokens per chunk. */
    @Override // cc.unitmesh.rag.splitter.TextSplitter
    public int getChunkSize() {
        return this.chunkSize;
    }

    /** Sets the maximum number of tokens per chunk. */
    @Override // cc.unitmesh.rag.splitter.TextSplitter
    protected void setChunkSize(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    /**
     * Splits {@code text} into chunks of at most {@link #getChunkSize()} tokens.
     *
     * @param text the text to split; must not be null
     * @return the chunks in document order; empty when {@code text} is blank
     * @throws IllegalArgumentException if {@code text} is not blank and the chunk size is not positive
     */
    @Override // cc.unitmesh.rag.splitter.TextSplitter
    @NotNull
    public List<String> splitText(@NotNull String text) {
        Intrinsics.checkNotNullParameter(text, "text");
        return split(text, getChunkSize());
    }

    /**
     * Core splitting routine.
     *
     * @param text      text to split; {@code null} or blank yields an empty list
     * @param chunkSize maximum number of tokens decoded per window
     * @return a new mutable list of chunks
     */
    private List<String> split(String text, int chunkSize) {
        // String.trim() strips characters <= ' ' from both ends, matching the original inlined loops.
        if (text == null || text.trim().isEmpty()) {
            return new ArrayList<>();
        }
        // A zero chunk size would decode empty windows forever without advancing.
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be positive, but was " + chunkSize);
        }

        List<Integer> tokens = getEncodedTokens(text);
        List<String> chunks = new ArrayList<>();
        int start = 0; // index of the first token not yet consumed
        int processed = 0; // number of non-blank windows handled so far

        while (start < tokens.size() && processed < this.maxNumChunks) {
            int remaining = tokens.size() - start;
            // Computed from 'remaining' so that a very large chunkSize cannot overflow.
            int end = start + Math.min(chunkSize, remaining);
            String chunkText = decodeTokens(tokens.subList(start, end));

            if (chunkText.trim().isEmpty()) {
                // Blank window: skip it entirely without counting it against maxNumChunks.
                start = end;
                continue;
            }

            // Prefer to end the chunk on the last sentence terminator or newline.
            int lastBoundary = Math.max(
                    Math.max(chunkText.lastIndexOf('.'), chunkText.lastIndexOf('?')),
                    Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n')));
            if (lastBoundary != -1 && lastBoundary > this.minChunkSizeChars) {
                chunkText = chunkText.substring(0, lastBoundary + 1);
            }

            String chunk = getKeepSeparator()
                    ? chunkText.trim()
                    : chunkText.replace("\n", " ").trim();
            if (chunk.length() > this.minChunkLengthToEmbed) {
                chunks.add(chunk);
            }

            // Advance by the token length of the (possibly shortened) chunk text. Re-encoding is
            // not guaranteed to reproduce the original token count, so clamp the step to
            // [1, remaining]: never run past the end, and always make progress.
            int consumed = getEncodedTokens(chunkText).size();
            start += Math.max(1, Math.min(consumed, remaining));
            processed++;
        }

        // Tokens left after maxNumChunks windows are emitted as a single trailing chunk.
        // Newlines are always replaced here, regardless of getKeepSeparator().
        if (start < tokens.size()) {
            String tail = decodeTokens(tokens.subList(start, tokens.size())).replace("\n", " ").trim();
            if (tail.length() > this.minChunkLengthToEmbed) {
                chunks.add(tail);
            }
        }
        return chunks;
    }

    /** Encodes {@code text} into token ids using the configured tokenizer. */
    private List<Integer> getEncodedTokens(String text) {
        return this.encoding.encode(text);
    }

    /** Decodes token ids back into text using the configured tokenizer. */
    private String decodeTokens(List<Integer> tokens) {
        return this.encoding.decode(tokens);
    }
}
