package net.sf.okapi.lib.segmentation;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeSet;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.TextContainer;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

@RunWith(JUnit4.class)
/* loaded from: input_file:net/sf/okapi/lib/segmentation/TestJava7Regex.class */
public class TestJava7Regex {
    private SRXSegmenter segmenter;
    private SRXDocument doc;
    private ArrayList<Rule> rules;

    /**
     * Builds a fresh SRX document, segmenter and rule list before every test,
     * then checks that the new segmenter reports {@code useJavaRegex()} as true.
     */
    @Before
    public void startUp() {
        doc = new SRXDocument();
        segmenter = new SRXSegmenter();
        rules = new ArrayList<>();

        boolean javaRegexEnabled = segmenter.useJavaRegex();
        Assert.assertTrue(javaRegexEnabled);
    }

    /**
     * Verifies that a {@link TreeSet} of characters iterates in ascending
     * natural order, whatever the insertion order was.
     */
    @Test
    public void testSet() {
        // Typed set instead of the raw TreeSet/Iterator pair: no unchecked
        // add() calls and no cast when reading the elements back.
        TreeSet<Character> sorted = new TreeSet<>();
        sorted.add('w');
        sorted.add('a');
        sorted.add('x');
        sorted.add('t');
        sorted.add('s');

        // Concatenate the elements in iteration order.
        StringBuilder ordered = new StringBuilder(sorted.size());
        for (char c : sorted) {
            ordered.append(c);
        }

        Assert.assertEquals("astwx", ordered.toString());
    }

    /**
     * Exercises a wide range of regex constructs in segmentation rules.
     *
     * <p>Every {@code testBreak} call installs a single rule made from its second
     * and third arguments, segments the first argument and expects exactly the
     * two segments given last; {@code testNoBreak} expects the text to stay in
     * one segment.
     */
    @Test
    public void testMetachars() {
        // \a (bell character), alone and inside a character class
        testBreak("before\u0007After", "\\a", "A", "before\u0007", "After");
        testBreak("before\u0007After", "[\\a ]", "A", "before\u0007", "After");
        testBreak("before_After", "[\\a_]", "A", "before_", "After");
        testBreak("before After", "[\\a ]", "A", "before ", "After");
        // \A (beginning of input)
        testBreak("After", "\\AA", "f", "A", "fter");
        testBreak("before\nAfter", "\\Abef", "ore", "bef", "ore\nAfter");
        // \b (word boundary)
        testBreak("before After", "\\b", "A", "before ", "After");
        testBreak("before After", "\\b", " ", "before", " After");
        testBreak("before\rAfter", "\\b", "\r", "before", "\rAfter");
        testBreak("before After", "\\b", "A", "before ", "After");
        testBreak("before After", "\\b", " ", "before", " After");
        testBreak("before After", "\\b|\\r", "A", "before ", "After");
        testBreak("before\rAfter", "\\b|\\r", "A", "before\r", "After");
        // \B (non-word boundary)
        testBreak("before afAter", "\\B", "A", "before af", "Ater");
        testBreak("before after", "\\B", "o", "bef", "ore after");
        testBreak("befo\rreAfter", "\\B", "A", "befo\rre", "After");
        testBreak("before afAter", "\\B", "A", "before af", "Ater");
        testBreak("before after", "\\B", "o", "bef", "ore after");
        testBreak("before after", "\\B|\\r", "o", "bef", "ore after");
        testBreak("beforeAfter", "\\Be", "A", "before", "After");
        testBreak("before\rAfter", "\\B|\\r", "A", "before\r", "After");
        // \cX control-character escapes, alone and inside a character class
        testBreak("before\u0001After", "\\cA", "A", "before\u0001", "After");
        testBreak("before\u0002After", "\\cB", "A", "before\u0002", "After");
        testBreak("before\u0003After", "\\cC", "A", "before\u0003", "After");
        testBreak("before\u0004After", "\\cD", "A", "before\u0004", "After");
        testBreak("before\u0005After", "\\cE", "A", "before\u0005", "After");
        testBreak("before\u0006After", "\\cF", "A", "before\u0006", "After");
        testBreak("before\u0007After", "\\cG", "A", "before\u0007", "After");
        testBreak("before\bAfter", "\\cH", "A", "before\b", "After");
        testBreak("before\tAfter", "\\cI", "A", "before\t", "After");
        testBreak("before\nAfter", "\\cJ", "A", "before\n", "After");
        testBreak("before\u000bAfter", "\\cK", "A", "before\u000b", "After");
        testBreak("before\fAfter", "\\cL", "A", "before\f", "After");
        testBreak("before\rAfter", "\\cM", "A", "before\r", "After");
        testBreak("before\u000eAfter", "\\cN", "A", "before\u000e", "After");
        testBreak("before\u000fAfter", "\\cO", "A", "before\u000f", "After");
        testBreak("before\u0010After", "\\cP", "A", "before\u0010", "After");
        testBreak("before\u0011After", "\\cQ", "A", "before\u0011", "After");
        testBreak("before\u0012After", "\\cR", "A", "before\u0012", "After");
        testBreak("before\u0013After", "\\cS", "A", "before\u0013", "After");
        testBreak("before\u0014After", "\\cT", "A", "before\u0014", "After");
        testBreak("before\u0015After", "\\cU", "A", "before\u0015", "After");
        testBreak("before\u0016After", "\\cV", "A", "before\u0016", "After");
        testBreak("before\u0017After", "\\cW", "A", "before\u0017", "After");
        testBreak("before\u0018After", "\\cX", "A", "before\u0018", "After");
        testBreak("before\u0019After", "\\cY", "A", "before\u0019", "After");
        testBreak("before\u001aAfter", "\\cZ", "A", "before\u001a", "After");
        testBreak("before\u001aAfter", "[\\cZ\\cA]", "A", "before\u001a", "After");
        testBreak("before\u0001After", "[\\cZ\\cA]", "A", "before\u0001", "After");
        // \d and \D (digit / non-digit), including a non-ASCII digit
        testBreak("before001After", "\\d", "A", "before001", "After");
        testNoBreak("beforeAfter", "\\d", "A", "before001", "After");
        testNoBreak("beforeAfter", "[\\d]", "A", "before001", "After");
        testBreak("before३After", "\\d", "A", "before३", "After");
        testBreak("before001After", "[\\d]", "A", "before001", "After");
        testBreak("before001After", "[\\d\\r]", "A", "before001", "After");
        testNoBreak("beforeAfter", "[\\d\\r]", "A", "before001", "After");
        testBreak("before३After", "[\\d\\r]", "A", "before३", "After");
        testBreak("before\rAfter", "[\\d\\r]", "A", "before\r", "After");
        testBreak("beforeAfter", "\\D", "A", "before", "After");
        testNoBreak("before३After", "\\D", "A", "before", "After");
        testBreak("beforeAfter", "[\\D]", "A", "before", "After");
        testNoBreak("before३After", "[\\D]", "A", "before", "After");
        testBreak("beforeAfter", "[\\D\\r]", "A", "before", "After");
        testNoBreak("before३After", "[\\D\\r]", "A", "before", "After");
        testBreak("before\rAfter", "[\\D\\r]", "A", "before\r", "After");
        // \e (escape character)
        testBreak("before\u001bAfter", "\\e", "A", "before\u001b", "After");
        testBreak("before\u001bAfter", "[\\e\\r]", "A", "before\u001b", "After");
        // \Q...\E quoting
        testBreak("be{4}[foreAfter", "be\\Q{4}[\\Efore", "A", "be{4}[fore", "After");
        testBreak("be{4}[foreAfter", "be[\\Q{[\\E]4\\}\\[fore", "A", "be{4}[fore", "After");
        testBreak("be[4}[foreAfter", "be[\\Q{[\\E]4\\}\\[fore", "A", "be[4}[fore", "After");
        // \f (form feed)
        testBreak("before\fAfter", "\\f", "A", "before\f", "After");
        testBreak("before\fAfter", "[\\f\\r]", "A", "before\f", "After");
        // \n, both as a regex escape and as a literal line feed in the pattern
        testBreak("before\nAfter", "\\n", "A", "before\n", "After");
        testBreak("before\nAfter", "[\\n\\r]", "A", "before\n", "After");
        testBreak("before\rAfter", "[\\n\\r]", "A", "before\r", "After");
        testBreak("before\nAfter", "\n", "A", "before\n", "After");
        testBreak("before\nAfter", "[\n\r]", "A", "before\n", "After");
        testBreak("before\rAfter", "[\n\r]", "A", "before\r", "After");
        // \p{...} and \P{...} with the IsCyrillic property
        testBreak("beforeиAfter", "\\p{IsCyrillic}", "A", "beforeи", "After");
        testBreak("beforeъиAfter", "\\p{IsCyrillic}", "A", "beforeъи", "After");
        testBreak("beforeъиAfter", "ъ\\p{IsCyrillic}", "A", "beforeъи", "After");
        testBreak("abcцыпаAfter", "abcцы\\p{IsCyrillic}{2}", "A", "abcцыпа", "After");
        testBreak("beforeиAfter", "[\\p{IsCyrillic}]", "A", "beforeи", "After");
        testBreak("beforeиAfter", "[\\p{IsCyrillic}\r]", "A", "beforeи", "After");
        testBreak("before\rAfter", "[\\p{IsCyrillic}\r]", "A", "before\r", "After");
        testBreak("beforeиbAfter", "\\P{IsCyrillic}", "A", "beforeиb", "After");
        testBreak("abcцыdaAfter", "abcцы\\P{IsCyrillic}{2}", "A", "abcцыda", "After");
        testBreak("beforeиbAfter", "[\\P{IsCyrillic}]", "A", "beforeиb", "After");
        testBreak("beforeиbAfter", "[\\P{IsCyrillic}\r]", "A", "beforeиb", "After");
        testBreak("before\rAfter", "[\\P{IsCyrillic}\r]", "A", "before\r", "After");
        // \Q...\E quoting (same cases as earlier in this method)
        testBreak("be{4}[foreAfter", "be\\Q{4}[\\Efore", "A", "be{4}[fore", "After");
        testBreak("be{4}[foreAfter", "be[\\Q{[\\E]4\\}\\[fore", "A", "be{4}[fore", "After");
        testBreak("be[4}[foreAfter", "be[\\Q{[\\E]4\\}\\[fore", "A", "be[4}[fore", "After");
        // \r, both as a regex escape and as a literal carriage return
        testBreak("before\rAfter", "\\r", "A", "before\r", "After");
        testBreak("before\rAfter", "[\\n\\r]", "A", "before\r", "After");
        testBreak("before\nAfter", "[\\n\\r]", "A", "before\n", "After");
        testBreak("before\rAfter", "\r", "A", "before\r", "After");
        testBreak("before\rAfter", "[\n\r]", "A", "before\r", "After");
        testBreak("before\nAfter", "[\n\r]", "A", "before\n", "After");
        // \s and \S (whitespace / non-whitespace)
        testBreak("before After", "\\s", "A", "before ", "After");
        testBreak("before\u3000After", "\\s", "A", "before\u3000", "After");
        testBreak("before\u2009After", "\\s", "A", "before\u2009", "After");
        testBreak("before\tAfter", "\\s", "A", "before\t", "After");
        testBreak("before After", "[\\s\\t]", "A", "before ", "After");
        testBreak("before\tAfter", "[\\s\\t]", "A", "before\t", "After");
        testBreak("   beforeAfter", "\\S", "A", "   before", "After");
        testBreak("   beforeAfter", "[\\S]", "A", "   before", "After");
        testBreak("   beforeAfter", "[\\S\\t]", "A", "   before", "After");
        // \t (tab)
        testBreak("before\tAfter", "\\t", "A", "before\t", "After");
        testBreak("before\tAfter", "[\\t\\r]", "A", "before\t", "After");
        testBreak("before\tAfter", "e\\tA", "f", "before\tA", "fter");
        // four-digit hexadecimal Unicode escape inside the pattern
        testBreak("beforeሴAfter", "\\u1234", "A", "beforeሴ", "After");
        // \w and \W (word / non-word character)
        testBreak("beforeAfter", "\\w", "A", "before", "After");
        testBreak("beforeAfter", "[\\w\\r]", "A", "before", "After");
        testBreak("before\rAfter", "[\\w\\r]", "A", "before\r", "After");
        testBreak("before After", "\\W", "A", "before ", "After");
        testBreak("before After", "[\\W\\r]", "A", "before ", "After");
        testBreak("before\rAfter", "[\\W\\r]", "A", "before\r", "After");
        // \x{h...h} and \xhh hexadecimal escapes, including supplementary code points
        testBreak("before¯After", "\\x{00AF}", "A", "before¯", "After");
        testBreak("before¯After", "\\x{00af}", "A", "before¯", "After");
        testBreak("before¯After", "\\x{af}", "A", "before¯", "After");
        testBreak("before¯After", "[\\x{00AF}\\x{00AA}]", "A", "before¯", "After");
        testBreak("beforeªAfter", "[\\x{00AF}\\x{00AA}]", "A", "beforeª", "After");
        testBreak("before" + buildString(65535) + "After", "\\x{00FFFF}", "A", "before" + buildString(65535), "After");
        testBreak("before" + buildString(1114111) + "After", "[\\x{10FFFF}\\x{10A000}]", "A", "before" + buildString(1114111), "After");
        testBreak("before" + buildString(1089536) + "After", "[\\x{10FFFF}\\x{10A000}-\\x{10AA00}]", "A", "before" + buildString(1089536), "After");
        testBreak("before¯After", "\\xAF", "A", "before¯", "After");
        testBreak("before¯After", "[\\xAF\\xAA]", "A", "before¯", "After");
        testBreak("beforeªAfter", "[\\xAF\\xAA]", "A", "beforeª", "After");
        // Base characters followed by combining marks.
        // NOTE(review): the two very long patterns look like a hand-expanded
        // grapheme-cluster matcher - confirm against their origin.
        testBreak("beforeg̈After", "(?:(?:\\u000D\\u000A)|(?:[\\u0E40\\u0E41\\u0E42\\u0E43\\u0E44\\u0EC0\\u0EC1\\u0EC2\\u0EC3\\u0EC4\\uAAB5\\uAAB6\\uAAB9\\uAABB\\uAABC]*(?:[\\u1100-\\u115F\\uA960-\\uA97C]+|([\\u1100-\\u115F\\uA960-\\uA97C]*((?:[[\\u1160-\\u11A2\\uD7B0-\\uD7C6][\\uAC00\\uAC1C\\uAC38]][\\u1160-\\u11A2\\uD7B0-\\uD7C6]*|[\\uAC01\\uAC02\\uAC03\\uAC04])[\\u11A8-\\u11F9\\uD7CB-\\uD7FB]*))|[\\u11A8-\\u11F9\\uD7CB-\\uD7FB]+|[^[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}&&[^\\u000D\\u000A\\u200C\\u200D]]\\u000D\\u000A])[[\\p{Mn}\\p{Me}\\u200C\\u200D\\u0488\\u0489\\u20DD\\u20DE\\u20DF\\u20E0\\u20E2\\u20E3\\u20E4\\uA670\\uA671\\uA672\\uFF9E\\uFF9F][\\p{Mc}\\u0E30\\u0E32\\u0E33\\u0E45\\u0EB0\\u0EB2\\u0EB3]]*)|(?s:.))", "A", "beforeg̈", "After");
        testBreak("beforeÁAfter", "(?>\\PM*\\pM*)", "A", "beforeÁ", "After");
        testBreak("beforeÃAfter", "(?>\\PM*\\pM*)", "A", "beforeÃ", "After");
        testBreak("beforeก็After", "(?>\\PM\\pM*)", "A", "beforeก็", "After");
        testBreak("before༿After", "\\u0F3F", "A", "before༿", "After");
        testBreak("before각After", "(?>\\PM\\pM*)", "A", "before각", "After");
        testBreak("beforeநிAfter", "(?>\\PM\\pM*)", "A", "beforeநி", "After");
        testBreak("beforeषिAfter", "(?>\\PM\\pM*)", "A", "beforeषि", "After");
        testBreak("beforeिAfter", "(?>\\PM\\pM*)", "A", "beforeि", "After");
        testBreak("beforeक्षिAfter", "(?:(?:\\u000D\\u000A)|(?:[\\u0E40\\u0E41\\u0E42\\u0E43\\u0E44\\u0EC0\\u0EC1\\u0EC2\\u0EC3\\u0EC4\\uAAB5\\uAAB6\\uAAB9\\uAABB\\uAABC]*(?:[\\u1100-\\u115F\\uA960-\\uA97C]+|([\\u1100-\\u115F\\uA960-\\uA97C]*((?:[[\\u1160-\\u11A2\\uD7B0-\\uD7C6][\\uAC00\\uAC1C\\uAC38]][\\u1160-\\u11A2\\uD7B0-\\uD7C6]*|[\\uAC01\\uAC02\\uAC03\\uAC04])[\\u11A8-\\u11F9\\uD7CB-\\uD7FB]*))|[\\u11A8-\\u11F9\\uD7CB-\\uD7FB]+|[^[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}&&[^\\u000D\\u000A\\u200C\\u200D]]\\u000D\\u000A])[[\\p{Mn}\\p{Me}\\u200C\\u200D\\u0488\\u0489\\u20DD\\u20DE\\u20DF\\u20E0\\u20E2\\u20E3\\u20E4\\uA670\\uA671\\uA672\\uFF9E\\uFF9F][\\p{Mc}\\u0E30\\u0E32\\u0E33\\u0E45\\u0EB0\\u0EB2\\u0EB3]]*)|(?s:.))", "A", "beforeक्षि", "After");
        // \Z and \z (end of input) in the second pattern
        testBreak("terbefore\nAfter", "Af", "ter\\Z", "terbefore\nAf", "ter");
        testBreak("terbefore\nAfter\n", "Af", "ter\\Z", "terbefore\nAf", "ter\n");
        testBreak("terbefore\nAfter", "Af", "ter\\z", "terbefore\nAf", "ter");
        testNoBreak("terbefore\nAfter\n", "Af", "ter\\z", "terbefore\nAf", "ter\n");
        // Back-references. NOTE(review): the patterns refer to group 2 although
        // their text contains one group - presumably the segmenter adds a group
        // of its own when compiling; confirm.
        testBreak("before beforeAfter", "(before) \\2", "A", "before before", "After");
        testBreak("before before After", "(before) \\2", " A", "before before", " After");
        testBreak("before before After", "(before) \\2 ", "A", "before before ", "After");
        testBreak("before beforeก็After", "(before) \\2(?>\\PM\\pM*)?", "A", "before beforeก็", "After");
        testBreak("before before ก็After", "(before) \\2 (?>\\PM\\pM*)?", "A", "before before ก็", "After");
        testBreak("before before After", "(before) \\2 (?>\\PM\\pM*)?", "A", "before before ", "After");
        testBreak("before beforeAfter", "(before) \\2(?>\\PM\\pM*)?", "A", "before before", "After");
        // octal escapes (\0nn and \0mnn), in either pattern
        testBreak("before\nAfter", "\\012", "A", "before\n", "After");
        testBreak("before\u000bAfter", "\\013", "A", "before\u000b", "After");
        testBreak("before\rAfter", "e", "\\015", "before", "\rAfter");
        testBreak("before\u0018After", "\\030", "A", "before\u0018", "After");
        testBreak("before\u001aAfter", "\\032", "A", "before\u001a", "After");
        testBreak("before*After", "\\052", "A", "before*", "After");
        testBreak("before?After", "\\077", "A", "before?", "After");
        testBreak("before@After", "\\0100", "A", "before@", "After");
        testBreak("beforeªAfter", "\\0252", "A", "beforeª", "After");
        testBreak("beforePAfter", "e", "\\0120", "before", "PAfter");
        testBreak("beforeÿAfter", "e", "\\0377", "before", "ÿAfter");
        testBreak("beforeÿAfter", "e", "[\\0377]", "before", "ÿAfter");
        testBreak("beforeÿAfter", "e", "[\\0377\\077]", "before", "ÿAfter");
        testBreak("beforeÿAfter", "e", "[\\0377\\077]", "before", "ÿAfter");
        testNoBreak("before?After", "e", "\\0477", "before", "?After");
        // character-class range combined with a property, then '.', '^' and '$'
        testBreak("beфываfore\nAfter", "[a-z\\p{IsCyrillic}]+", "ore", "beфываf", "ore\nAfter");
        testBreak("before\nAfter", ".", "ore", "bef", "ore\nAfter");
        testBreak("before\nAfter", "^bef", "ore", "bef", "ore\nAfter");
        testBreak("terbefore\nAfter", "Af", "ter$", "terbefore\nAf", "ter");
        // backslash-escaped metacharacters
        testBreak("be{4}[foreAfter", "be\\{4\\}\\[fore", "A", "be{4}[fore", "After");
        testBreak("be{4}[foreAfter", "be[\\{\\[]4\\}\\[fore", "A", "be{4}[fore", "After");
        testBreak("be[4}[foreAfter", "be[\\{\\[]4\\}\\[fore", "A", "be[4}[fore", "After");
        // literal characters in the pattern (space, backspace)
        testBreak("before After", " ", "A", "before ", "After");
        testBreak("before\bAfter", "\b", "A", "before\b", "After");
        testBreak("before\bAfter", "\b", "A", "before\b", "After");
        testBreak("before\bAfter", "\b", "A", "before\b", "After");
        testBreak("before\bAfter", "\b", "A", "before\b", "After");
        testBreak("before\bAfter", "\b", "A", "before\b", "After");
        testBreak("before\bAfter", "e\bA", "f", "before\bA", "fter");
        // alternation in the second pattern, with an optional '/'
        testBreak("Sentence 1. Sentence 2.", "\\.", "\\s|<br/?>", "Sentence 1.", " Sentence 2.");
        testBreak("Sentence 1.<br>Sentence 2.", "\\.", "\\s|<br/?>", "Sentence 1.", "<br>Sentence 2.");
        testBreak("Sentence 1.<br/>Sentence 2.", "\\.", "\\s|<br/?>", "Sentence 1.", "<br/>Sentence 2.");
    }

    /**
     * Segments a sentence containing several abbreviations with a single rule
     * built from an abbreviation pattern and a "whitespace then uppercase
     * letter" pattern, and expects three segments overall.
     */
    @Test
    public void testMetachars2() {
        String text = "Mr. Holmes is from the U.K. not the U.S. Is Dr. Watson from there too? Yes: both are.";
        String abbreviationPattern = "\\b(St|Gen|Hon|Dr|Mr|Ms|Mrs|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.";
        String followingPattern = "\\s+\\p{Lu}";

        // Only the first two of the three expected segments are compared.
        String expectedFirst = "Mr.";
        String expectedSecond = " Holmes is from the U.K. not the U.S. Is Dr.";
        int expectedSegmentCount = 3;

        testBreak(text, abbreviationPattern, followingPattern, expectedFirst, expectedSecond, expectedSegmentCount);
    }

    /**
     * Turns a numeric code point value into a string.
     *
     * @param i the code point value
     * @return a one-char string holding {@code (char) i} when {@code i} is below
     *         U+10000, otherwise a string made of the chars returned by
     *         {@link Character#toChars(int)}
     */
    private String buildString(int i) {
        // Anything below the supplementary range is emitted as a single char
        // (the cast simply truncates the value to 16 bits).
        if (i < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            char single = (char) i;
            return String.valueOf(single);
        }
        char[] units = Character.toChars(i);
        return new String(units);
    }

    /**
     * Checks that a single rule splits the text into exactly two segments.
     *
     * <p>Delegates to {@link #testBreak(String, String, String, String, String, int)}
     * with an expected segment count of 2, so the rule set-up and the
     * assertions are not duplicated here.
     *
     * @param str  text to segment
     * @param str2 first pattern passed to the {@code Rule} constructor
     * @param str3 second pattern passed to the {@code Rule} constructor
     * @param str4 expected content of the first segment
     * @param str5 expected content of the second segment
     */
    private void testBreak(String str, String str2, String str3, String str4, String str5) {
        testBreak(str, str2, str3, str4, str5, 2);
    }

    /**
     * Installs a single rule built from the two patterns, segments the text and
     * checks the number of segments as well as the content of the first two.
     *
     * @param str  text to segment
     * @param str2 first pattern passed to the {@code Rule} constructor
     * @param str3 second pattern passed to the {@code Rule} constructor
     * @param str4 expected content of the first segment
     * @param str5 expected content of the second segment
     * @param i    expected number of segments
     */
    private void testBreak(String str, String str2, String str3, String str4, String str5, int i) {
        // Start from an empty rule list so only the rule under test is active.
        rules.clear();
        Rule rule = new Rule(str2, str3, true);
        rules.add(rule);

        // Register the list as "default" and map every language code to it.
        doc.addLanguageRule("default", rules);
        LanguageMap catchAll = new LanguageMap(".*", "default");
        doc.addLanguageMap(catchAll);

        // Clear the segmenter's language before compiling the rules for English.
        LocaleId noLocale = null;
        segmenter.setLanguage(noLocale);
        doc.compileLanguageRules(LocaleId.ENGLISH, segmenter);

        Assert.assertEquals(i, segmenter.computeSegments(str));

        // Apply the computed ranges to a container and compare the segments.
        TextContainer container = new TextContainer(str);
        container.getSegments().create(segmenter.getRanges());

        String firstSegment = container.getSegments().get(0).toString();
        Assert.assertEquals(str4, firstSegment);

        String secondSegment = container.getSegments().get(1).toString();
        Assert.assertEquals(str5, secondSegment);
    }

    /**
     * Installs a single rule built from the two patterns and checks that the
     * text is left as one segment whose content equals the whole input.
     *
     * @param str  text to segment
     * @param str2 first pattern passed to the {@code Rule} constructor
     * @param str3 second pattern passed to the {@code Rule} constructor
     * @param str4 not used by this method
     * @param str5 not used by this method
     */
    private void testNoBreak(String str, String str2, String str3, String str4, String str5) {
        // Start from an empty rule list so only the rule under test is active.
        rules.clear();
        Rule rule = new Rule(str2, str3, true);
        rules.add(rule);

        // Register the list as "default" and map every language code to it.
        doc.addLanguageRule("default", rules);
        LanguageMap catchAll = new LanguageMap(".*", "default");
        doc.addLanguageMap(catchAll);

        // Clear the segmenter's language before compiling the rules for English.
        LocaleId noLocale = null;
        segmenter.setLanguage(noLocale);
        doc.compileLanguageRules(LocaleId.ENGLISH, segmenter);

        Assert.assertEquals(1L, segmenter.computeSegments(str));

        // The only segment must carry the complete, unsplit text.
        TextContainer container = new TextContainer(str);
        container.getSegments().create(segmenter.getRanges());
        String onlySegment = container.getSegments().get(0).toString();
        Assert.assertEquals(str, onlySegment);
    }
}
