package de.digitalcollections.solrocr.formats.hocr;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.OcrBox;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/hocr/HocrPassageFormatter.class */
/**
 * Passage formatter for hOCR markup.
 *
 * <p>Extracts page identifiers from {@code ocr_page} elements and word boxes (text plus
 * {@code bbox} coordinates) from {@code ocrx_word} spans, tracking which words fall inside a
 * highlighted region delimited by the configured start/end highlight tags.
 *
 * <p>NOTE(review): {@link #determineStartPage} re-targets the shared {@code pageIter} on every
 * call, so instances carry mutable state; confirm the threading expectations with the callers.
 */
public class HocrPassageFormatter extends OcrPassageFormatter {
    /** Matches a single {@code ocrx_word} span, capturing its bounding box and inner text. */
    private static final Pattern WORD_PATTERN = Pattern.compile("<span class=['\"]ocrx_word['\"].+?title=['\"].*?bbox (?<ulx>\\d+) (?<uly>\\d+) (?<lrx>\\d+) (?<lry>\\d+);?.*?>(?<text>.+?)</span>");

    /** Matches the opening tag of an {@code ocr_page} element, capturing its {@code id}. */
    private static final Pattern PAGE_PATTERN = Pattern.compile("<div.+?class=['\"]ocr_page['\"].+?id=['\"](?<pageId>.+?)['\"]");

    /** Number of characters after a page boundary that are searched for the page's opening tag. */
    private static final int PAGE_TAG_WINDOW = 256;

    private final HocrClassBreakIterator pageIter;
    private final String startHlTag;
    private final String endHlTag;

    /**
     * Creates a formatter for hOCR passages.
     *
     * @param startHlTag marker that opens a highlighted region in the passage text
     * @param endHlTag marker that closes a highlighted region in the passage text
     * @param z forwarded unchanged to the superclass constructor; its meaning is defined there
     */
    public HocrPassageFormatter(String startHlTag, String endHlTag, boolean z) {
        super(startHlTag, endHlTag, z);
        this.pageIter = new HocrClassBreakIterator("ocr_page");
        this.startHlTag = startHlTag;
        this.endHlTag = endHlTag;
    }

    /**
     * Determines the identifier of the page that precedes the given offset.
     *
     * @param str not used by this implementation
     * @param startOffset offset in {@code content} from which to search backwards for a page break
     * @param content the text to search
     * @return the {@code id} of the page found at the preceding page boundary, or {@code null} if
     *     no page tag could be matched there
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    public String determineStartPage(String str, int startOffset, IterableCharSequence content) {
        this.pageIter.setText(content);
        int pageOffset = this.pageIter.preceding(startOffset);
        if (pageOffset < 0) {
            // Defensive: a negative offset (e.g. a "no boundary found" sentinel) cannot be used
            // as a subSequence start, so report that no page could be determined.
            return null;
        }
        // Only a bounded window after the boundary is inspected, not the whole remaining text.
        int windowEnd = Math.min(pageOffset + PAGE_TAG_WINDOW, content.length());
        Matcher pageMatcher = PAGE_PATTERN.matcher(content.subSequence(pageOffset, windowEnd).toString());
        return pageMatcher.find() ? pageMatcher.group("pageId") : null;
    }

    /**
     * Collects all page openings in the fragment.
     *
     * @param ocrFragment hOCR markup to scan
     * @return map from the match offset of each {@code ocr_page} opening tag to its page id
     */
    private TreeMap<Integer, String> determinePageBreaks(String ocrFragment) {
        TreeMap<Integer, String> pageBreaks = new TreeMap<>();
        Matcher pageMatcher = PAGE_PATTERN.matcher(ocrFragment);
        while (pageMatcher.find()) {
            pageBreaks.put(pageMatcher.start(), pageMatcher.group("pageId"));
        }
        return pageBreaks;
    }

    /**
     * Parses every {@code ocrx_word} span in the fragment into an {@link OcrBox}.
     *
     * @param ocrFragment hOCR markup, possibly containing the highlight tags
     * @param defaultPageId page id used for words that appear before the first page opening found
     *     in the fragment
     * @return the parsed boxes in document order; empty if the fragment contains no word spans
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    protected List<OcrBox> parseWords(String ocrFragment, String defaultPageId) {
        List<OcrBox> words = new ArrayList<>();
        TreeMap<Integer, String> pageBreaks = determinePageBreaks(ocrFragment);
        Matcher wordMatcher = WORD_PATTERN.matcher(ocrFragment);
        boolean inHighlight = false;
        while (wordMatcher.find()) {
            // The closest page opening at or before this word decides which page it belongs to.
            Map.Entry<Integer, String> pageBreak = pageBreaks.floorEntry(wordMatcher.start());
            String pageId = pageBreak != null ? pageBreak.getValue() : defaultPageId;

            int ulx = Integer.parseInt(wordMatcher.group("ulx"));
            int uly = Integer.parseInt(wordMatcher.group("uly"));
            int lrx = Integer.parseInt(wordMatcher.group("lrx"));
            int lry = Integer.parseInt(wordMatcher.group("lry"));
            String text = wordMatcher.group("text");

            if (text.contains(this.startHlTag)) {
                inHighlight = true;
            }
            String plainText = text.replace(this.startHlTag, "").replace(this.endHlTag, "");
            words.add(new OcrBox(plainText, pageId, ulx, uly, lrx, lry, inHighlight));

            // The highlight ends either inside this word or directly after its closing </span>;
            // the flag is cleared only after the box was added so the closing word stays marked.
            if (text.contains(this.endHlTag) || ocrFragment.startsWith(this.endHlTag, wordMatcher.end())) {
                inHighlight = false;
            }
        }
        return words;
    }
}
