package org.bigml.mimir.nlp.tokenization;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import org.junit.Assert;
import org.junit.Test;

/* loaded from: input_file:org/bigml/mimir/nlp/tokenization/TokenizationTest.class */
/**
 * Tests for the token lists produced by {@link TokenStreamFactory} builders.
 *
 * <p>Each test builds a factory through one of the {@code getBuilder} overloads and checks the
 * exact, ordered token list returned by {@code getTokenList} for a fixed input string.
 */
public class TokenizationTest {
    /**
     * Asserts that the produced tokens match the expected tokens exactly.
     *
     * <p>The expected array is copied into a list and compared with {@code List.equals}
     * semantics, so the assertion only passes when {@code collection} is itself a list holding
     * the same elements in the same order.
     *
     * @param strArr the expected tokens, in order
     * @param collection the tokens actually produced
     */
    public void compare(String[] strArr, Collection<String> collection) {
        ArrayList<String> expected = new ArrayList<String>(strArr.length);
        for (String token : strArr) {
            expected.add(token);
        }
        // assertEquals evaluates expected.equals(actual), exactly like the former
        // assertTrue(expected.equals(actual)), but reports both values on failure by itself.
        Assert.assertEquals(expected, collection);
    }

    /** Case-insensitive, single-token output: tokens are lower-cased and punctuation is dropped. */
    @Test
    public void tokenize() {
        String text = "Some people and some ANIMALS, not all people.";
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("en", "single", 1, false, null, null);
        // "all" below is the word taken from the input text, not the tokenization mode constant.
        compare(
                new String[]{"some", "people", "and", "some", "animals", "not", "all", "people"},
                builder.getTokenList(text));
    }

    /** Case-sensitive output in ALL_MODE: original casing is kept and the full text is appended. */
    @Test
    public void caseSensitive() {
        String text = "Some people and some ANIMALS, not all people.";
        TokenStreamFactory builder =
                TokenStreamFactory.getBuilder("en", TokenStreamFactory.ALL_MODE, 1, true, null, null);
        compare(
                new String[]{"Some", "people", "and", "some", "ANIMALS", "not", "all", "people", text},
                builder.getTokenList(text));
    }

    /** N-grams up to length three; no expected n-gram contains the word placed in the set. */
    @Test
    public void nGrams() {
        // NOTE(review): the fifth getBuilder argument appears to act as a stop-word set
        // (no expected n-gram below spans "and") - confirm against TokenStreamFactory.
        HashSet<String> stopWords = new HashSet<String>();
        stopWords.add("and");
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("en", "single", 3, false, stopWords, null);
        compare(
                new String[]{
                    "some", "some people", "people", "and", "some", "some animals", "animals",
                    "not", "not all", "all", "not all people", "all people", "people"
                },
                builder.getTokenList("Some people and some animals, not all people."));
    }

    /** In ALL_MODE a multi-token input also yields the full term; a single token is not repeated. */
    @Test
    public void fullTerms() {
        TokenStreamFactory builder =
                TokenStreamFactory.getBuilder("en", TokenStreamFactory.ALL_MODE, 1, false, null, null);
        compare(new String[]{"token1"}, builder.getTokenList("token1"));
        compare(new String[]{"token1", "token2", "token1 token2"}, builder.getTokenList("token1 token2"));
    }

    /** Mixed Latin/Chinese input with the "zh" language and bigrams enabled. */
    @Test
    public void nGramsChinese() {
        // NOTE(review): same fifth-argument set as in nGrams(); presumably stop words - confirm.
        HashSet<String> stopWords = new HashSet<String>();
        stopWords.add("鱷梨");
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("zh", "single", 2, false, stopWords, null);
        compare(new String[]{"charles", "鱷梨", "parker"}, builder.getTokenList("charles 鱷梨 parker"));
    }

    /** Dictionary-based builder: only terms present in the supplied set are expected as tokens. */
    @Test
    public void dictionaryTokens() {
        HashSet<String> dictionary = new HashSet<String>();
        dictionary.add("鱷梨");
        dictionary.add("charles");
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("single", false, dictionary);
        compare(new String[]{"charles", "鱷梨"}, builder.getTokenList("charles 鱷梨 parker"));
        compare(new String[]{"charles"}, builder.getTokenList("charles 鱷 梨 parker"));
        compare(new String[0], builder.getTokenList("鱷 梨 parker"));
        compare(new String[]{"鱷梨"}, builder.getTokenList("charlesp 鱷梨"));
    }

    /** Dictionary-based builder with overlapping and multi-word dictionary entries. */
    @Test
    public void dictionaryNGrams() {
        HashSet<String> dictionary = new HashSet<String>();
        dictionary.add("鱷梨");
        dictionary.add("鱷梨鱷");
        dictionary.add("charles");
        dictionary.add("charles parker");
        dictionary.add("*dr. charles parker");
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("single", false, dictionary);
        compare(
                new String[]{"*dr. charles parker", "charles", "charles parker"},
                builder.getTokenList("$*#)*dr. Charles parker#$*("));
        compare(new String[]{"charles", "鱷梨", "鱷梨鱷"}, builder.getTokenList("charles 鱷梨鱷 parker"));
        compare(new String[]{"鱷梨", "鱷梨鱷"}, builder.getTokenList("鱷梨鱷"));
        compare(new String[]{"charles", "鱷梨", "charles"}, builder.getTokenList("charles鱷梨charles"));
    }

    /** Case-sensitive dictionary builder with entries containing digits and punctuation. */
    @Test
    public void dictionaryDigitsAndCase() {
        HashSet<String> dictionary = new HashSet<String>();
        dictionary.add("鱷梨1985Right?Yes!");
        dictionary.add("1984");
        dictionary.add("CHARLES42");
        TokenStreamFactory builder = TokenStreamFactory.getBuilder("single", true, dictionary);
        // Entries differing only in letter case from the dictionary must not match.
        compare(new String[0], builder.getTokenList("鱷梨1985right?Yes! CHARLEs42"));
        compare(
                new String[]{"鱷梨1985Right?Yes!", "CHARLES42"},
                builder.getTokenList("鱷梨1985Right?Yes! CHARLES42"));
        compare(new String[]{"1984"}, builder.getTokenList("1984 CHARLES421984"));
    }

    /** Tokenizes the same Chinese string 8192 times with one builder; every call must yield tokens. */
    @Test
    public void simplification() {
        String text = "鱷梨鱷梨鱷鱷梨鱷鱷梨鱷梨";
        TokenStreamFactory builder =
                TokenStreamFactory.getBuilder("zh", TokenStreamFactory.ALL_MODE, 2, false, null, null);
        for (int i = 0; i < 8192; i++) {
            Assert.assertTrue(builder.getTokenList(text).size() > 0);
        }
    }
}
