package de.digitalcollections.solrocr.formats.alto;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.OcrBox;
import de.digitalcollections.solrocr.util.TagBreakIterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.text.StringEscapeUtils;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/alto/AltoPassageFormatter.class */
/**
 * Passage formatter for OCR passages stored as ALTO XML.
 *
 * <p>The formatter works on raw XML fragments with regular expressions instead of an XML parser.
 * Words are taken from {@code <String>} elements, page identifiers from the {@code ID} attribute
 * of {@code <Page>} elements. Highlighting tags embedded in the fragment are temporarily replaced
 * with placeholder markers so that they survive the removal of XML markup.
 */
public class AltoPassageFormatter extends OcrPassageFormatter {
    /** Placeholder substituted for the highlighting start tag while markup is being stripped. */
    private static final String START_HL = "@@STARTHLTAG@@";
    /** Placeholder substituted for the highlighting end tag while markup is being stripped. */
    private static final String END_HL = "@@ENDHLTAG@@";

    private static final Pattern pagePat = Pattern.compile("<Page ?(?<attribs>.+?)/?>");
    private static final Pattern wordPat = Pattern.compile("<String ?(?<attribs>.+?)/?>");
    /**
     * Matches one {@code KEY="value"} pair. The look-behind keeps the key from matching only the
     * tail of a longer attribute name (e.g. {@code CONTENT} inside {@code SUBS_CONTENT}), and the
     * value may be empty.
     */
    private static final Pattern attribPat =
        Pattern.compile("(?<![\\w:.-])(?<key>[A-Z_]+)=\"(?<val>[^\"]*)\"");

    // Patterns used by getTextFromXml, compiled once instead of on every call.
    private static final Pattern spacePat = Pattern.compile("<SP.*?>");
    private static final Pattern textLinePat = Pattern.compile("(</?)?TextLine.*?>");
    private static final Pattern descriptionPat = Pattern.compile("(?s)<Description>.+?</Description>");
    private static final Pattern anyTagPat = Pattern.compile("</?[A-Z]?.*?>");
    private static final Pattern whitespacePat = Pattern.compile("\\s+");

    /** Break iterator positioned on {@code Page} tags, used to locate the page preceding an offset. */
    private final TagBreakIterator pageIter;

    /**
     * Creates a formatter; all arguments are forwarded unchanged to the
     * {@link OcrPassageFormatter} constructor.
     */
    public AltoPassageFormatter(String str, String str2, boolean z) {
        super(str, str2, z);
        this.pageIter = new TagBreakIterator("Page");
    }

    /**
     * Parses the attribute section of a tag into a key/value map.
     *
     * @param str the text between the tag name and the closing {@code >}
     * @return a mutable map from attribute name to raw (still XML-escaped) value; later duplicates
     *     overwrite earlier ones
     */
    private Map<String, String> parseAttribs(String str) {
        Map<String, String> attribs = new HashMap<>();
        Matcher matcher = attribPat.matcher(str);
        while (matcher.find()) {
            attribs.put(matcher.group("key"), matcher.group("val"));
        }
        return attribs;
    }

    /**
     * Reads a coordinate attribute as an integer, truncating any fractional part.
     *
     * @throws NumberFormatException if the attribute is missing or not numeric
     */
    private static int parseCoordinate(Map<String, String> attribs, String name) {
        String raw = attribs.get(name);
        if (raw == null) {
            throw new NumberFormatException("ALTO String element is missing the " + name + " attribute");
        }
        // Parsed as double so that fractional coordinates such as "12.5" are accepted too.
        return (int) Double.parseDouble(raw);
    }

    /**
     * Determines the identifier of the page that contains the given offset.
     *
     * @param str unused by this implementation
     * @param i offset into {@code iterableCharSequence}
     * @param iterableCharSequence the OCR document text
     * @return the {@code ID} attribute of the first {@code <Page>} tag found within 512 characters
     *     of the position reported by the page iterator, or {@code null} if there is none
     */
    @Override
    public String determineStartPage(String str, int i, IterableCharSequence iterableCharSequence) {
        this.pageIter.setText(iterableCharSequence);
        int pageStart = this.pageIter.preceding(i);
        if (pageStart < 0) {
            // NOTE(review): presumably TagBreakIterator follows the BreakIterator convention of
            // returning a negative DONE value when nothing precedes the offset - confirm.
            return null;
        }
        int windowEnd = Math.min(pageStart + 512, iterableCharSequence.length());
        Matcher matcher = pagePat.matcher(iterableCharSequence.subSequence(pageStart, windowEnd).toString());
        if (matcher.find()) {
            return parseAttribs(matcher.group("attribs")).get("ID");
        }
        return null;
    }

    /**
     * Collects every {@code <Page>} tag in the fragment.
     *
     * @return a sorted map from the start offset of each page tag to its {@code ID} attribute
     *     (which is {@code null} when the tag has no such attribute)
     */
    private TreeMap<Integer, String> determinePageBreaks(String str) {
        TreeMap<Integer, String> pageBreaks = new TreeMap<>();
        Matcher matcher = pagePat.matcher(str);
        while (matcher.find()) {
            pageBreaks.put(matcher.start(), parseAttribs(matcher.group("attribs")).get("ID"));
        }
        return pageBreaks;
    }

    /**
     * Reduces an ALTO fragment to plain text while keeping the highlighting tags.
     *
     * @param str ALTO XML fragment, possibly containing highlighting tags
     * @return the words of the fragment separated by single spaces, XML entities unescaped, with
     *     the original highlighting tags restored
     */
    @Override
    protected String getTextFromXml(String str) {
        // Literal replacement: the highlighting tags are plain text, not regular expressions.
        String marked = str.replace(this.startHlTag, START_HL).replace(this.endHlTag, END_HL);
        marked = spacePat.matcher(marked).replaceAll(" ");
        marked = textLinePat.matcher(marked).replaceAll(" ");
        marked = descriptionPat.matcher(marked).replaceAll("");

        // Single pass: substitute every <String> element with the value of its CONTENT attribute.
        StringBuilder text = new StringBuilder(marked.length());
        Matcher wordMatcher = wordPat.matcher(marked);
        int copiedUpTo = 0;
        while (wordMatcher.find()) {
            text.append(marked, copiedUpTo, wordMatcher.start());
            String content = parseAttribs(wordMatcher.group("attribs")).get("CONTENT");
            if (content != null) {
                text.append(content);
            }
            copiedUpTo = wordMatcher.end();
        }
        text.append(marked, copiedUpTo, marked.length());

        // Drop all remaining markup, then normalize whitespace.
        String plain = StringEscapeUtils.unescapeXml(anyTagPat.matcher(text).replaceAll(""));
        plain = plain.replace("\n", "");
        plain = whitespacePat.matcher(plain).replaceAll(" ").trim();
        return plain.replace(START_HL, this.startHlTag).replace(END_HL, this.endHlTag);
    }

    /**
     * Extracts one {@link OcrBox} per {@code <String>} element of the fragment.
     *
     * @param str ALTO XML fragment, possibly containing highlighting tags
     * @param str2 page identifier used for words that are not preceded by a {@code <Page>} tag
     *     inside the fragment
     * @return the boxes in document order; a box is flagged as highlighted from the word whose
     *     content contains the start tag up to and including the word that contains, or is directly
     *     followed by, the end tag
     * @throws NumberFormatException if a word lacks a numeric HPOS, VPOS, WIDTH or HEIGHT
     */
    @Override
    protected List<OcrBox> parseWords(String str, String str2) {
        String marked = str.replace(this.startHlTag, START_HL).replace(this.endHlTag, END_HL);
        TreeMap<Integer, String> pageBreaks = determinePageBreaks(marked);
        List<OcrBox> boxes = new ArrayList<>();
        Matcher matcher = wordPat.matcher(marked);
        boolean inHighlight = false;
        while (matcher.find()) {
            // The closest page tag at or before the word decides which page it belongs to.
            Map.Entry<Integer, String> pageEntry = pageBreaks.floorEntry(matcher.start());
            String pageId = pageEntry != null ? pageEntry.getValue() : str2;

            Map<String, String> attribs = parseAttribs(matcher.group("attribs"));
            int x = parseCoordinate(attribs, "HPOS");
            int y = parseCoordinate(attribs, "VPOS");
            int width = parseCoordinate(attribs, "WIDTH");
            int height = parseCoordinate(attribs, "HEIGHT");
            String content = attribs.get("CONTENT");
            if (content == null) {
                content = "";
            }

            if (content.contains(START_HL)) {
                inHighlight = true;
            }
            String word = content.replace(START_HL, "").replace(END_HL, "");
            boxes.add(new OcrBox(word, pageId, x, y, x + width, y + height, inHighlight));
            // The end marker may sit inside the content or immediately behind the element.
            if (content.contains(END_HL) || marked.startsWith(END_HL, matcher.end())) {
                inHighlight = false;
            }
        }
        return boxes;
    }
}
