package de.digitalcollections.solrocr.formats.hocr;

import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.lang3.StringUtils;
import org.codehaus.stax2.XMLStreamReader2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/hocr/HocrParser.class */
/**
 * {@link OcrParser} implementation for hOCR markup.
 *
 * <p>Words are read from {@code <span class="ocrx_word">} elements. Text between two words is
 * collected as the trailing characters of the preceding word, {@code <div class="ocr_page">}
 * elements update the current page (when page tracking is requested), and words that end in a
 * soft hyphen (U+00AD) are linked with the word that follows them.
 */
public class HocrParser extends OcrParser {
    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Soft hyphen, used in the OCR text to mark a word that continues in the next word. */
    private static final String SOFT_HYPHEN = "\u00ad";

    /**
     * Marker that is stripped from the word text before testing for a trailing soft hyphen.
     *
     * <p>NOTE(review): this literal looks like a mis-encoded (possibly non-BMP) character
     * sequence; presumably it should be the highlight end marker used by the highlighting
     * code. Kept byte-for-byte until the intended value is confirmed.
     */
    private static final String TRAILING_MARKER = "��";

    /** Set once a seek reached the end of the document without finding another word. */
    private boolean noMoreWords;

    /** Most recently encountered page, or {@code null} if none was seen so far. */
    private OcrPage currentPage;

    /** Second half of a hyphenated word, already parsed and waiting to be returned. */
    private OcrBox hyphenEnd;

    /**
     * Creates a parser that reads hOCR markup from the given reader.
     *
     * @param reader source of the hOCR markup
     * @param parsingFeatureArr the features that should be parsed
     * @throws XMLStreamException if the superclass fails to set up the XML stream
     */
    public HocrParser(Reader reader, OcrParser.ParsingFeature... parsingFeatureArr) throws XMLStreamException {
        super(reader, parsingFeatureArr);
        this.hyphenEnd = null;
    }

    /**
     * Reads the next word from the stream.
     *
     * @param xMLStreamReader2 stream to read from
     * @param set features to parse
     * @return the next word box, or {@code null} when there are no more words
     * @throws XMLStreamException if reading from the stream fails
     */
    @Override // de.digitalcollections.solrocr.formats.OcrParser
    protected OcrBox readNext(XMLStreamReader2 xMLStreamReader2, Set<OcrParser.ParsingFeature> set) throws XMLStreamException {
        // The second half of a hyphenated word was parsed while handling the first half.
        if (this.hyphenEnd != null) {
            OcrBox pending = this.hyphenEnd;
            this.hyphenEnd = null;
            return pending;
        }
        if (this.noMoreWords) {
            return null;
        }

        boolean withText = set.contains(OcrParser.ParsingFeature.TEXT);
        boolean withCoordinates = set.contains(OcrParser.ParsingFeature.COORDINATES);
        boolean withPages = set.contains(OcrParser.ParsingFeature.PAGES);

        if (!isAtWordStart(xMLStreamReader2)) {
            seekToNextWord(xMLStreamReader2, withPages);
        }
        if (xMLStreamReader2.getEventType() != XMLStreamConstants.START_ELEMENT) {
            // The seek ran to the end of the stream without finding a word.
            return null;
        }

        OcrBox box = new OcrBox();
        // The title attribute has to be read before parseText advances the stream.
        Map<String, String> props = parseTitle(xMLStreamReader2.getAttributeValue("", "title"));
        if (withText) {
            parseText(
                xMLStreamReader2,
                box,
                set.contains(OcrParser.ParsingFeature.HIGHLIGHTS),
                set.contains(OcrParser.ParsingFeature.OFFSETS),
                set.contains(OcrParser.ParsingFeature.ALTERNATIVES));
        }
        if (withCoordinates) {
            parseCoordinates(box, props.get("bbox"));
        }
        if (set.contains(OcrParser.ParsingFeature.CONFIDENCE) && props.containsKey("x_wconf")) {
            box.setConfidence(Double.valueOf(Double.parseDouble(props.get("x_wconf"))));
        }
        if (withPages && this.currentPage != null) {
            box.setPage(this.currentPage);
        }

        // Everything between this word and the next one becomes the trailing characters.
        String trailing = seekToNextWord(xMLStreamReader2, withPages);
        if (withText && !trailing.isEmpty()) {
            box.setTrailingChars(trailing);
        }

        boolean hyphenated = false;
        if (box.getText() != null && box.getText().replace(TRAILING_MARKER, "").endsWith(SOFT_HYPHEN)) {
            hyphenated = true;
            box.setText(box.getText().replace(SOFT_HYPHEN, ""));
            box.setHyphenInfo(true, null);
        } else if (trailing.startsWith(SOFT_HYPHEN)) {
            hyphenated = true;
        }

        if (hyphenated) {
            box.setTrailingChars(null);
            // Read ahead so that both halves can carry the joined text.
            this.hyphenEnd = readNext(xMLStreamReader2, set);
            if (this.hyphenEnd != null) {
                String joined = box.getText() + this.hyphenEnd.getText();
                box.setHyphenInfo(true, joined);
                this.hyphenEnd.setHyphenInfo(false, joined);
            } else {
                // No following word: keep the word as-is with a visible hyphen.
                if (box.getText() != null && !box.getText().endsWith("-")) {
                    box.setText(box.getText() + "-");
                }
                box.setHyphenInfo(null, null);
            }
        }

        boolean missingText = withText && (box.getText() == null || box.getText().isEmpty());
        boolean missingCoordinates = withCoordinates
            && box.getLrx() < 0.0f
            && box.getLry() < 0.0f
            && box.getUlx() < 0.0f
            && box.getUly() < 0.0f;
        if (missingText || missingCoordinates) {
            // Skip boxes that lack requested data and continue with the next one.
            box = readNext(xMLStreamReader2, set);
        }
        return box;
    }

    /** Checks whether the stream is positioned on the start tag of a word span. */
    private static boolean isAtWordStart(XMLStreamReader2 xmlReader) {
        return xmlReader.getEventType() == XMLStreamConstants.START_ELEMENT
            && "span".equals(xmlReader.getLocalName())
            && "ocrx_word".equals(xmlReader.getAttributeValue("", "class"));
    }

    /**
     * Splits an hOCR {@code title} attribute ({@code "name value; name value; ..."}) into a map.
     *
     * <p>Segments that are empty or consist of a name without a value (e.g. the remainder after
     * a trailing {@code ';'}) are skipped instead of raising an exception.
     *
     * @param str the raw attribute value, may be {@code null}
     * @return property names mapped to their trimmed values; empty if {@code str} is null
     */
    private Map<String, String> parseTitle(String str) {
        Map<String, String> props = new HashMap<>();
        if (str == null) {
            return props;
        }
        for (String rawProperty : str.split(";")) {
            String property = rawProperty.trim();
            int separatorIdx = property.indexOf(' ');
            if (separatorIdx < 0) {
                continue;
            }
            props.put(property.substring(0, separatorIdx), property.substring(separatorIdx + 1).trim());
        }
        return props;
    }

    /**
     * Copies the values of a {@code bbox} property ({@code "ulx uly lrx lry"}) to the box.
     *
     * @param ocrBox box to update
     * @param str the bbox value; when {@code null} or blank the box is left untouched
     */
    private void parseCoordinates(OcrBox ocrBox, String str) {
        if (StringUtils.isBlank(str)) {
            log.debug("Word has no bbox attribute, leaving its coordinates unset.");
            return;
        }
        String[] coords = str.split(" ");
        if (coords.length > 0) {
            ocrBox.setUlx(Integer.parseInt(coords[0]));
        }
        if (coords.length > 1) {
            ocrBox.setUly(Integer.parseInt(coords[1]));
        }
        if (coords.length > 2) {
            ocrBox.setLrx(Integer.parseInt(coords[2]));
        }
        if (coords.length > 3) {
            ocrBox.setLry(Integer.parseInt(coords[3]));
        } else {
            log.warn("bbox attribute '{}' is incomplete.", str);
        }
    }

    /**
     * Reads the content of the current word element into the box.
     *
     * <p>The stream is advanced until the end tag that terminates the word content. The text of
     * an {@code <ins>} element replaces any text read so far; {@code <del>} elements are
     * registered as alternatives when requested.
     *
     * @param xMLStreamReader2 stream positioned on the start tag of the word
     * @param ocrBox box that receives text, offset, alternatives and highlight span
     * @param z whether highlight spans should be tracked
     * @param z2 whether character offsets should be recorded
     * @param z3 whether {@code <del>} alternatives should be recorded
     * @throws XMLStreamException if reading from the stream fails
     * @throws IllegalStateException if an {@code <ins>} or (recorded) {@code <del>} element
     *     contains anything other than a single text node
     */
    private void parseText(XMLStreamReader2 xMLStreamReader2, OcrBox ocrBox, boolean z, boolean z2, boolean z3) throws XMLStreamException {
        String text = null;
        int textOffset = -1;
        boolean inAlternatives = false;
        while (xMLStreamReader2.hasNext()) {
            int event = xMLStreamReader2.next();
            if (event == XMLStreamConstants.CHARACTERS && text == null) {
                if (z2) {
                    textOffset = Math.toIntExact(xMLStreamReader2.getLocationInfo().getStartingCharOffset());
                }
                text = xMLStreamReader2.getText();
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                if (inAlternatives) {
                    // First end tag seen after an alternatives span was opened: only clear
                    // the flag; the next end tag finishes the word.
                    inAlternatives = false;
                    continue;
                }
                ocrBox.setText(text);
                if (z2) {
                    ocrBox.setTextOffset(textOffset);
                }
                if (text != null && text.replace(TRAILING_MARKER, "").endsWith(SOFT_HYPHEN)) {
                    ocrBox.setHyphenInfo(true, null);
                }
                if (z && ocrBox.getHighlightSpan() == null) {
                    ocrBox.setHighlightSpan(trackHighlightSpan(text, ocrBox));
                }
                return;
            } else if (event == XMLStreamConstants.START_ELEMENT) {
                String localName = xMLStreamReader2.getLocalName();
                if ("span".equals(localName) && "alternatives".equals(xMLStreamReader2.getAttributeValue("", "class"))) {
                    inAlternatives = true;
                } else if ("ins".equals(localName)) {
                    requireSoleTextChild(xMLStreamReader2.next(), XMLStreamConstants.CHARACTERS, "ins");
                    if (z2) {
                        textOffset = Math.toIntExact(xMLStreamReader2.getLocationInfo().getStartingCharOffset());
                    }
                    text = xMLStreamReader2.getText();
                    requireSoleTextChild(xMLStreamReader2.next(), XMLStreamConstants.END_ELEMENT, "ins");
                } else if (z3 && "del".equals(localName)) {
                    requireSoleTextChild(xMLStreamReader2.next(), XMLStreamConstants.CHARACTERS, "del");
                    String alternative = xMLStreamReader2.getText();
                    Integer alternativeOffset = z2
                        ? Integer.valueOf(Math.toIntExact(xMLStreamReader2.getLocationInfo().getStartingCharOffset()))
                        : null;
                    if (z && ocrBox.getHighlightSpan() == null) {
                        ocrBox.setHighlightSpan(trackHighlightSpan(alternative, ocrBox));
                    }
                    ocrBox.addAlternative(alternative, alternativeOffset);
                    requireSoleTextChild(xMLStreamReader2.next(), XMLStreamConstants.END_ELEMENT, "del");
                }
            }
        }
    }

    /** Throws if the event read inside an {@code <ins>}/{@code <del>} element is not the expected one. */
    private static void requireSoleTextChild(int actualEvent, int expectedEvent, String elementName) {
        if (actualEvent != expectedEvent) {
            throw new IllegalStateException("<" + elementName + "> elements must have a text node as its sole child");
        }
    }

    /**
     * Advances the stream to the start tag of the next word.
     *
     * <p>While seeking, text and whitespace are collected (runs of blank text are reduced to a
     * single space), and the current page is updated from {@code ocr_page} elements if
     * requested. {@link #noMoreWords} is set when the stream ends before another word is found.
     *
     * @param xMLStreamReader2 stream to advance
     * @param z whether {@code ocr_page} elements should update the current page
     * @return the characters found between the previous position and the next word
     * @throws XMLStreamException if reading from the stream fails
     */
    private String seekToNextWord(XMLStreamReader2 xMLStreamReader2, boolean z) throws XMLStreamException {
        boolean foundWord = false;
        StringBuilder trailing = new StringBuilder();
        while (xMLStreamReader2.hasNext()) {
            int event = xMLStreamReader2.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                String localName = xMLStreamReader2.getLocalName();
                String hocrClass = xMLStreamReader2.getAttributeValue("", "class");
                if ("span".equals(localName) && "ocrx_word".equals(hocrClass)) {
                    foundWord = true;
                    break;
                }
                if ("span".equals(localName) && "ocr_line".equals(hocrClass) && trailing.lastIndexOf(" ") < 0) {
                    // A new line separates words even without whitespace in the markup.
                    trailing.append(' ');
                } else if (z && "div".equals(localName) && "ocr_page".equals(hocrClass)) {
                    this.currentPage = parsePage(xMLStreamReader2);
                }
            } else if (event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.SPACE) {
                String text = xMLStreamReader2.getText();
                boolean blank = StringUtils.isBlank(text);
                if (blank && (trailing.length() == 0 || trailing.lastIndexOf(" ") != trailing.length() - 1)) {
                    trailing.append(' ');
                } else if (!blank) {
                    trailing.append(text);
                }
            }
        }
        this.noMoreWords = !foundWord;
        return trailing.toString();
    }

    /**
     * Builds the page for the {@code ocr_page} element the stream is positioned on.
     *
     * <p>The identifier is taken from the {@code id} attribute, falling back to the
     * {@code x_source} and then the {@code ppageno} title property. The dimensions come from
     * the third and fourth {@code bbox} value and stay {@code null} when the bbox is missing or
     * has fewer than four values.
     */
    private OcrPage parsePage(XMLStreamReader2 xmlReader) {
        Map<String, String> props = parseTitle(xmlReader.getAttributeValue("", "title"));
        Dimension dimension = null;
        String bbox = props.get("bbox");
        if (bbox != null) {
            String[] coords = bbox.split(" ");
            if (coords.length >= 4) {
                dimension = new Dimension(Integer.parseInt(coords[2]), Integer.parseInt(coords[3]));
            } else {
                log.warn("bbox attribute '{}' is incomplete.", bbox);
            }
        }
        String pageId = xmlReader.getAttributeValue("", "id");
        if (pageId == null) {
            pageId = props.get("x_source");
        }
        if (pageId == null) {
            pageId = props.get("ppageno");
        }
        return new OcrPage(pageId, dimension);
    }
}
