package de.digitalcollections.solrocr.formats.miniocr;

import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.Set;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.lang3.StringUtils;
import org.codehaus.stax2.XMLStreamReader2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/miniocr/MiniOcrParser.class */
public class MiniOcrParser extends OcrParser {
    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private static final char alternativeMarker = 8703;
    private boolean noMoreWords;
    private OcrPage currentPage;
    private OcrBox hyphenEnd;

    public MiniOcrParser(Reader reader, OcrParser.ParsingFeature... parsingFeatureArr) throws XMLStreamException {
        super(reader, parsingFeatureArr);
        this.hyphenEnd = null;
    }

    @Override // de.digitalcollections.solrocr.formats.OcrParser
    protected OcrBox readNext(XMLStreamReader2 xMLStreamReader2, Set<OcrParser.ParsingFeature> set) throws XMLStreamException {
        String attributeValue;
        if (this.hyphenEnd != null) {
            OcrBox ocrBox = this.hyphenEnd;
            this.hyphenEnd = null;
            return ocrBox;
        }
        if (xMLStreamReader2.getEventType() != 1 || !"w".equals(xMLStreamReader2.getLocalName())) {
            seekToNextWord(xMLStreamReader2, set.contains(OcrParser.ParsingFeature.PAGES));
        }
        if (this.noMoreWords) {
            return null;
        }
        OcrBox ocrBox2 = new OcrBox();
        if (set.contains(OcrParser.ParsingFeature.COORDINATES)) {
            String[] split = xMLStreamReader2.getAttributeValue("", "x").split(" ");
            if (split.length > 0) {
                ocrBox2.setUlx(Float.parseFloat(split[0]));
            }
            if (split.length > 1) {
                ocrBox2.setUly(Float.parseFloat(split[1]));
            }
            if (split.length > 2) {
                ocrBox2.setLrx(ocrBox2.getUlx() + Float.parseFloat(split[2]));
            }
            if (split.length > 3) {
                ocrBox2.setLry(ocrBox2.getUly() + Float.parseFloat(split[3]));
            } else {
                log.warn("x attribute is incomplete: '{}'", String.join(" ", split));
            }
        }
        if (set.contains(OcrParser.ParsingFeature.CONFIDENCE) && (attributeValue = xMLStreamReader2.getAttributeValue("", "c")) != null && !attributeValue.isEmpty()) {
            ocrBox2.setConfidence(Double.valueOf(Double.parseDouble(attributeValue)));
        }
        if (set.contains(OcrParser.ParsingFeature.TEXT)) {
            parseText(xMLStreamReader2, ocrBox2, set.contains(OcrParser.ParsingFeature.HIGHLIGHTS), set.contains(OcrParser.ParsingFeature.OFFSETS), set.contains(OcrParser.ParsingFeature.ALTERNATIVES));
        }
        if (set.contains(OcrParser.ParsingFeature.PAGES) && this.currentPage != null) {
            ocrBox2.setPage(this.currentPage);
        }
        String seekToNextWord = seekToNextWord(xMLStreamReader2, set.contains(OcrParser.ParsingFeature.PAGES));
        if (set.contains(OcrParser.ParsingFeature.TEXT) && !seekToNextWord.isEmpty()) {
            ocrBox2.setTrailingChars(seekToNextWord);
        }
        boolean z = false;
        if (ocrBox2.getText().endsWith("\u00ad")) {
            z = true;
            String text = ocrBox2.getText();
            ocrBox2.setText(text.substring(0, text.length() - 1));
            ocrBox2.setHyphenInfo(true, null);
        } else if (seekToNextWord.startsWith("\u00ad")) {
            z = true;
        }
        if (z) {
            ocrBox2.setTrailingChars(null);
            this.hyphenEnd = readNext(xMLStreamReader2, set);
            if (this.hyphenEnd != null) {
                String str = ocrBox2.getText() + this.hyphenEnd.getText();
                ocrBox2.setHyphenInfo(true, str);
                this.hyphenEnd.setHyphenInfo(false, str);
            } else {
                if (!ocrBox2.getText().endsWith("-") && (ocrBox2.getTrailingChars() == null || ocrBox2.getTrailingChars().endsWith("-"))) {
                    ocrBox2.setTrailingChars("-");
                }
                ocrBox2.setHyphenInfo(null, null);
            }
        }
        if ((set.contains(OcrParser.ParsingFeature.TEXT) && (ocrBox2.getText() == null || ocrBox2.getText().isEmpty())) || (set.contains(OcrParser.ParsingFeature.COORDINATES) && ocrBox2.getLrx() < 0.0f && ocrBox2.getLry() < 0.0f && ocrBox2.getUlx() < 0.0f && ocrBox2.getUly() < 0.0f)) {
            return null;
        }
        return ocrBox2;
    }

    private void parseText(XMLStreamReader2 xMLStreamReader2, OcrBox ocrBox, boolean z, boolean z2, boolean z3) throws XMLStreamException {
        if (xMLStreamReader2.next() != 4) {
            throw new IllegalStateException("A word element must have text content.");
        }
        if (z2) {
            ocrBox.setTextOffset(Math.toIntExact(xMLStreamReader2.getLocationInfo().getStartingCharOffset()));
        }
        String text = xMLStreamReader2.getText();
        if (z) {
            ocrBox.setHighlightSpan(trackHighlightSpan(text, ocrBox));
        }
        if (text.indexOf(alternativeMarker) < 0) {
            ocrBox.setText(text);
            return;
        }
        int i = 0;
        while (true) {
            int i2 = i;
            if (i2 >= text.length()) {
                return;
            }
            int indexOf = text.indexOf(alternativeMarker, i2);
            if (indexOf < 0) {
                indexOf = text.length();
            }
            if (i2 == 0) {
                ocrBox.setText(text.substring(i2, indexOf));
                if (!z3) {
                    return;
                }
            } else {
                ocrBox.addAlternative(text.substring(i2, indexOf), z2 ? Integer.valueOf(ocrBox.getTextOffset() + i2) : null);
            }
            i = Math.min(indexOf + 1, text.length());
        }
    }

    private String seekToNextWord(XMLStreamReader2 xMLStreamReader2, boolean z) throws XMLStreamException {
        boolean z2 = false;
        StringBuilder sb = new StringBuilder();
        while (true) {
            if (!xMLStreamReader2.hasNext()) {
                break;
            }
            int next = xMLStreamReader2.next();
            if (next == 1) {
                String localName = xMLStreamReader2.getLocalName();
                if ("w".equals(localName)) {
                    z2 = true;
                    break;
                }
                if ("l".equals(localName) && sb.lastIndexOf(" ") < 0) {
                    sb.append(' ');
                } else if (z && "p".equals(localName)) {
                    Dimension dimension = null;
                    String attributeValue = xMLStreamReader2.getAttributeValue("", "wh");
                    if (attributeValue != null && !attributeValue.isEmpty()) {
                        String[] split = attributeValue.split(" ");
                        dimension = new Dimension(Integer.parseInt(split[0]), Integer.parseInt(split[1]));
                    }
                    String attributeValue2 = xMLStreamReader2.getAttributeValue("http://www.w3.org/XML/1998/namespace", "id");
                    if (attributeValue2 == null || attributeValue2.isEmpty()) {
                        attributeValue2 = xMLStreamReader2.getAttributeValue("", "pid");
                    }
                    this.currentPage = new OcrPage(attributeValue2, dimension);
                }
            } else if (next == 4) {
                String text = xMLStreamReader2.getText();
                boolean isBlank = StringUtils.isBlank(text);
                if (isBlank && (sb.length() == 0 || sb.lastIndexOf(" ") != sb.length() - 1)) {
                    sb.append(' ');
                } else if (!isBlank) {
                    sb.append(text);
                }
            }
        }
        this.noMoreWords = !z2;
        return sb.toString();
    }
}
