package de.jungblut.nlp;

import com.google.common.base.Preconditions;
import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.datastructure.StringPool;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/* loaded from: input_file:de/jungblut/nlp/TokenizerUtils.class */
/**
 * Static helpers for splitting text into tokens, normalizing tokens and
 * building n-grams / shingles out of them.
 *
 * <p>The class is not instantiable; all members are static.
 */
public final class TokenizerUtils {
    /** Marker placed after the last token by {@link #addStartAndEndTags(String[])}. */
    public static final String END_TAG = "<END>";
    /** Marker placed before the first token by {@link #addStartAndEndTags(String[])}. */
    public static final String START_TAG = "<START>";
    /**
     * Delimiter characters used by {@link #wordTokenize(String, boolean)} when
     * separators are kept. Every character of this string is a delimiter there,
     * including the literal backslash; {@link #SEPARATORS_PATTERN} in contrast
     * reads the backslash as an escape of the hyphen, so a backslash only splits
     * in the separator-keeping mode.
     */
    public static final String SEPARATORS = " \r\n\t.,;:'\"()?!\\-/|“„";
    /** Regex character class used by the separator-dropping word tokenizer. */
    private static final Pattern SEPARATORS_PATTERN = Pattern.compile("[ \r\n\t\\.,;:'\"()?!\\-/|“„]");
    /** One or more whitespace characters. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
    /**
     * Lookup table indexed by character code (0..255). An entry of 0 means the
     * character is dropped by {@link #normalizeString(String)}; any other entry
     * is the replacement character.
     */
    private static final char[] CHARACTER_REPLACE_MAPPING = new char[256];

    static {
        // Upper-case ASCII letters map to their lower-case counterpart.
        for (char upper = 'A'; upper <= 'Z'; upper++) {
            CHARACTER_REPLACE_MAPPING[upper] = (char) (upper + ' ');
        }
        // Lower-case ASCII letters and digits are kept unchanged.
        for (char lower = 'a'; lower <= 'z'; lower++) {
            CHARACTER_REPLACE_MAPPING[lower] = lower;
        }
        for (char digit = '0'; digit <= '9'; digit++) {
            CHARACTER_REPLACE_MAPPING[digit] = digit;
        }
        CHARACTER_REPLACE_MAPPING[' '] = ' ';
        // German umlauts (Latin-1 range): lower-case kept, upper-case lowered.
        CHARACTER_REPLACE_MAPPING['\u00e4'] = '\u00e4'; // a-umlaut
        CHARACTER_REPLACE_MAPPING['\u00f6'] = '\u00f6'; // o-umlaut
        CHARACTER_REPLACE_MAPPING['\u00fc'] = '\u00fc'; // u-umlaut
        CHARACTER_REPLACE_MAPPING['\u00c4'] = '\u00e4'; // A-umlaut -> a-umlaut
        CHARACTER_REPLACE_MAPPING['\u00d6'] = '\u00f6'; // O-umlaut -> o-umlaut
        CHARACTER_REPLACE_MAPPING['\u00dc'] = '\u00fc'; // U-umlaut -> u-umlaut
        CHARACTER_REPLACE_MAPPING['\u00df'] = '\u00df'; // sharp s
    }

    private TokenizerUtils() {
        throw new IllegalAccessError();
    }

    /**
     * Applies a regex replacement to every token.
     *
     * @param str regular expression to search for in each token
     * @param str2 replacement string (regex replacement syntax)
     * @param strArr tokens to process; the array itself is not modified
     * @param z if {@code true}, null/empty results are removed from the output
     * @return a new array holding the replaced tokens
     */
    public static String[] removeMatchingRegex(String str, String str2, String[] strArr, boolean z) {
        // Compile once instead of once per token (String.replaceAll recompiles each call).
        Pattern pattern = Pattern.compile(str);
        String[] replaced = new String[strArr.length];
        for (int idx = 0; idx < strArr.length; idx++) {
            replaced[idx] = pattern.matcher(strArr[idx]).replaceAll(str2);
        }
        return z ? removeEmpty(replaced) : replaced;
    }

    /**
     * Alias of {@link #nShinglesTokenize(String, int)}.
     *
     * @param str text to split
     * @param i length of each q-gram
     * @return the q-grams of {@code str}
     */
    public static String[] qGramTokenize(String str, int i) {
        return nShinglesTokenize(str, i);
    }

    /**
     * Splits a string into all of its contiguous substrings of length {@code i}
     * (a sliding window moved one character at a time).
     *
     * @param str text to split
     * @param i window length; expected to be positive
     * @return the substrings in order of their start offset, or a single-element
     *         array containing {@code str} if it is shorter than {@code i}
     */
    public static String[] nShinglesTokenize(String str, int i) {
        if (str.length() < i) {
            return new String[]{str};
        }
        int count = (str.length() - i) + 1;
        String[] shingles = new String[count];
        for (int start = 0; start < count; start++) {
            shingles[start] = str.substring(start, start + i);
        }
        return shingles;
    }

    /**
     * Splits on runs of whitespace ({@code \s+}).
     *
     * @param str text to split
     * @return the resulting tokens
     */
    public static String[] whiteSpaceTokenize(String str) {
        return WHITESPACE_PATTERN.split(str);
    }

    /**
     * Removes duplicate tokens while keeping the order of first occurrence.
     *
     * @param strArr tokens, possibly containing duplicates
     * @return a new array with each distinct token once
     */
    public static String[] deduplicateTokens(String[] strArr) {
        LinkedHashSet<String> unique = new LinkedHashSet<String>();
        Collections.addAll(unique, strArr);
        return unique.toArray(new String[unique.size()]);
    }

    /**
     * Splits on the default separator characters, dropping the separators.
     *
     * @param str text to split
     * @return the resulting tokens
     */
    public static String[] wordTokenize(String str) {
        return wordTokenize(str, false);
    }

    /**
     * Splits on the default separator characters.
     *
     * @param str text to split
     * @param z if {@code false}, separators are dropped (regex split, so
     *        adjacent separators produce empty tokens); if {@code true},
     *        each separator is returned as its own token, except tokens whose
     *        first character is a space or a lower code point (whitespace and
     *        control characters), which are skipped
     * @return the resulting tokens
     */
    public static String[] wordTokenize(String str, boolean z) {
        if (!z) {
            return SEPARATORS_PATTERN.split(str);
        }
        StringTokenizer tokenizer = new StringTokenizer(str, SEPARATORS, true);
        List<String> tokens = new ArrayList<String>(tokenizer.countTokens());
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            // Tokens from StringTokenizer are never empty, so charAt(0) is safe.
            if (token.charAt(0) > ' ') {
                tokens.add(token);
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * Splits with a caller-supplied regular expression.
     *
     * @param str text to split
     * @param str2 regular expression used as delimiter
     * @return the result of {@code str.split(str2)}
     */
    public static String[] wordTokenize(String str, String str2) {
        return str.split(str2);
    }

    /**
     * Normalizes every token with {@link #normalizeString(String)}.
     *
     * <p>Note: the passed array is overwritten in place.
     *
     * @param strArr tokens to normalize (modified)
     * @param z if {@code true}, tokens that became empty are removed and a new
     *        array is returned
     * @return {@code strArr} itself, or a new filtered array when {@code z} is set
     */
    public static String[] normalizeTokens(String[] strArr, boolean z) {
        for (int idx = 0; idx < strArr.length; idx++) {
            strArr[idx] = normalizeString(strArr[idx]);
        }
        return z ? removeEmpty(strArr) : strArr;
    }

    /**
     * Lower-cases ASCII letters and the German upper-case umlauts, keeps digits,
     * spaces, lower-case ASCII letters, lower-case umlauts and sharp s, and drops
     * every other character.
     *
     * @param str text to normalize
     * @return the normalized text, possibly empty
     */
    public static String normalizeString(String str) {
        StringBuilder normalized = new StringBuilder(str.length());
        for (int idx = 0; idx < str.length(); idx++) {
            char current = str.charAt(idx);
            if (current < CHARACTER_REPLACE_MAPPING.length) {
                char mapped = CHARACTER_REPLACE_MAPPING[current];
                if (mapped > 0) {
                    normalized.append(mapped);
                }
            }
        }
        return normalized.toString();
    }

    /**
     * Filters out {@code null} and empty strings.
     *
     * @param strArr tokens to filter; not modified
     * @return a new array containing the remaining tokens in their original order
     */
    public static String[] removeEmpty(String[] strArr) {
        List<String> kept = new ArrayList<String>(strArr.length);
        for (String token : strArr) {
            if (token != null && !token.isEmpty()) {
                kept.add(token);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * Whitespace-tokenizes the text and builds n-grams of the given size.
     *
     * @param str text to split
     * @param i number of tokens per n-gram
     * @return the n-grams, see {@link #buildNGrams(String[], int)}
     */
    public static String[] whiteSpaceTokenizeNGrams(String str, int i) {
        return buildNGrams(whiteSpaceTokenize(str), i);
    }

    /**
     * Joins every run of {@code i} consecutive tokens with a single space.
     *
     * @param strArr source tokens; not modified
     * @param i number of tokens per n-gram; must be positive
     * @return the n-grams in order of their first token; if there are fewer than
     *         {@code i} tokens, {@code strArr} itself is returned (same instance)
     */
    public static String[] buildNGrams(String[] strArr, int i) {
        if (strArr.length < i) {
            return strArr;
        }
        int count = (strArr.length - i) + 1;
        String[] nGrams = new String[count];
        for (int start = 0; start < count; start++) {
            StringBuilder nGram = new StringBuilder(strArr[start]);
            int end = start + i;
            for (int next = start + 1; next < end; next++) {
                nGram.append(' ').append(strArr[next]);
            }
            nGrams[start] = nGram.toString();
        }
        return nGrams;
    }

    /**
     * Builds n-grams for every size from {@code i} to {@code i2} (inclusive) and
     * concatenates them, smallest size first.
     *
     * @param strArr source tokens; not modified
     * @param i smallest n-gram size
     * @param i2 largest n-gram size; if not greater than {@code i}, only the
     *        n-grams of size {@code i} are returned
     * @return all n-grams of the requested sizes
     */
    public static String[] buildNGramsRange(String[] strArr, int i, int i2) {
        String[] smallest = buildNGrams(strArr, i);
        if (i2 <= i) {
            return smallest;
        }
        List<String> all = new ArrayList<String>(Arrays.asList(smallest));
        for (int size = i + 1; size <= i2; size++) {
            Collections.addAll(all, buildNGrams(strArr, size));
        }
        return all.toArray(new String[all.size()]);
    }

    /**
     * Replaces every token with its {@link String#intern() interned} instance.
     *
     * <p>Note: the passed array is overwritten in place.
     *
     * @param strArr tokens to intern (modified)
     * @return {@code strArr} itself
     */
    public static String[] internStrings(String[] strArr) {
        for (int idx = 0; idx < strArr.length; idx++) {
            strArr[idx] = strArr[idx].intern();
        }
        return strArr;
    }

    /**
     * Replaces every token with the instance returned by the given pool.
     *
     * <p>Note: the passed array is overwritten in place.
     *
     * @param strArr tokens to pool (modified)
     * @param stringPool pool to look the tokens up in; must not be {@code null}
     * @return {@code strArr} itself
     * @throws NullPointerException if {@code stringPool} is {@code null}
     */
    public static String[] internStrings(String[] strArr, StringPool stringPool) {
        Preconditions.checkNotNull(stringPool, "Pool shouldn't be null!");
        for (int idx = 0; idx < strArr.length; idx++) {
            strArr[idx] = stringPool.pool(strArr[idx]);
        }
        return strArr;
    }

    /**
     * Surrounds the tokens with {@link #START_TAG} and {@link #END_TAG}.
     *
     * @param strArr tokens; not modified
     * @return a new array two elements longer than {@code strArr}
     */
    public static String[] addStartAndEndTags(String[] strArr) {
        String[] tagged = new String[strArr.length + 2];
        tagged[0] = START_TAG;
        System.arraycopy(strArr, 0, tagged, 1, strArr.length);
        tagged[tagged.length - 1] = END_TAG;
        return tagged;
    }

    /**
     * Joins the tokens, inserting the delimiter between neighbours.
     *
     * @param strArr tokens to join
     * @param str delimiter placed between two tokens
     * @return the joined string; empty for an empty array
     */
    public static String concat(String[] strArr, String str) {
        StringBuilder joined = new StringBuilder();
        for (int idx = 0; idx < strArr.length; idx++) {
            if (idx > 0) {
                joined.append(str);
            }
            joined.append(strArr[idx]);
        }
        return joined.toString();
    }
}
