package de.digitalcollections.solrocr.formats;

import com.ctc.wstx.api.WstxInputProperties;
import com.ctc.wstx.exc.WstxLazyException;
import com.ctc.wstx.stax.WstxInputFactory;
import com.google.common.collect.ImmutableMap;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.reader.PeekingReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.UUID;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLStreamReader2;

/**
 * Base class for parsers that turn XML OCR markup into a sequence of {@link OcrBox} words.
 *
 * <p>The parser always holds one word read ahead of the caller, so {@link #hasNext()} and
 * {@link #peek()} never touch the underlying input. Subclasses supply the format-specific logic
 * in {@link #readNext(XMLStreamReader2, Set)}.
 *
 * <p>The class is both an {@link Iterator} and an {@link Iterable}; {@link #iterator()} returns
 * the parser itself, so a given instance can only be traversed once.
 */
public abstract class OcrParser implements Iterator<OcrBox>, Iterable<OcrBox> {
    // NOTE(review): both marker literals consist of Unicode replacement characters in this copy
    // of the source, which looks like an encoding loss -- confirm the intended values against
    // the original before relying on START_HL and END_HL being distinguishable.
    /** Marker that opens a highlighted span inside word text. */
    public static final String START_HL = "��";
    /** Marker that closes a highlighted span inside word text. */
    public static final String END_HL = "��";

    /**
     * Entity names and their replacement text, registered with the XML reader as custom internal
     * entities. Must stay declared above {@link #xmlInputFactory}, whose initializer reads it.
     */
    public static final ImmutableMap<Object, Object> ENTITIES = ImmutableMap.builder()
            .put("shy", "\u00ad")
            .put("nbsp", " ")
            .put("ensp", "\u2002")
            .put("emsp", "\u2003")
            .put("thinsp", "\u2009")
            .put("zwnj", "\u200c")
            .put("zwj", "\u200d")
            .build();

    /** Message template for parse failures; the placeholder receives the input's source. */
    private static final String PARSE_FAILURE_MESSAGE =
            "Failed to parse the OCR markup, make sure your files are well-formed and your regions start/end on complete tags! (Source was: %s)";

    /** Factory shared by all parser instances; configured exactly once at class initialization. */
    private static final WstxInputFactory xmlInputFactory = createInputFactory();

    /** Reader the XML stream is built on; also used to report the source in error messages. */
    protected final PeekingReader input;
    /** Identifier of the highlight span that is currently open, or {@code null} if none is. */
    protected UUID currentHighlightSpan;
    /** When set, the open highlight span is closed by the next {@link #trackHighlightSpan} call. */
    protected boolean terminateHighlightSpanAfterNext = false;

    private final XMLStreamReader2 xmlReader;
    private final Set<ParsingFeature> features;
    /** The word read ahead of the caller; {@code null} once the input is exhausted. */
    private OcrBox nextWord;

    /**
     * Aspects of the OCR markup a parser can be asked to handle. The selected set is passed to
     * {@link OcrParser#readNext(XMLStreamReader2, Set)} on every call.
     */
    public enum ParsingFeature {
        TEXT,
        OFFSETS,
        COORDINATES,
        HIGHLIGHTS,
        CONFIDENCE,
        ALTERNATIVES,
        PAGES
    }

    /**
     * Creates a parser over {@code reader} and eagerly reads the first word.
     *
     * @param reader source of the OCR markup; used directly if it already is a
     *     {@link PeekingReader}, otherwise wrapped in one
     * @param parsingFeatureArr features to enable; when empty, every {@link ParsingFeature} is
     *     enabled
     * @throws XMLStreamException if the XML stream reader cannot be created
     * @throws RuntimeException if reading the first word fails
     */
    public OcrParser(Reader reader, ParsingFeature... parsingFeatureArr) throws XMLStreamException {
        // NOTE(review): 2048 and 16384 are passed through to PeekingReader unchanged; their
        // meaning is defined by that class and is not visible here.
        this.input = reader instanceof PeekingReader
                ? (PeekingReader) reader
                : new PeekingReader(reader, 2048, 16384);
        ParsingFeature[] selected =
                parsingFeatureArr.length == 0 ? ParsingFeature.values() : parsingFeatureArr;
        this.features = new HashSet<>(Arrays.asList(selected));
        this.xmlReader = (XMLStreamReader2) xmlInputFactory.createXMLStreamReader(this.input);
        this.nextWord = readNextOrFail();
    }

    /**
     * Builds and configures the shared input factory. Doing this once, rather than in every
     * constructor call, keeps parser construction from mutating shared static state.
     */
    private static WstxInputFactory createInputFactory() {
        WstxInputFactory factory = new WstxInputFactory();
        factory.getConfig().doCoalesceText(true);
        factory.getConfig().setInputParsingMode(WstxInputProperties.PARSING_MODE_DOCUMENTS);
        factory.getConfig().doSupportDTDs(false);
        factory.getConfig().setCustomInternalEntities(ENTITIES);
        // Entities that are not declared are replaced by an escaped "&amp;...;" sequence built
        // from the resolver's fourth argument (presumably the entity name -- confirm against
        // the Woodstox documentation) instead of failing the parse.
        factory.getConfig().setUndeclaredEntityResolver(
                (publicId, systemId, baseUri, namespace) -> String.format("&amp;%s;", namespace));
        return factory;
    }

    /** Returns this parser; the instance is its own, single-use iterator. */
    @Override // java.lang.Iterable
    public Iterator<OcrBox> iterator() {
        return this;
    }

    /** Returns a sequential, ordered stream over the remaining words of this parser. */
    public Stream<OcrBox> stream() {
        return StreamSupport.stream(
                Spliterators.spliteratorUnknownSize(this, Spliterator.ORDERED), false);
    }

    /** Returns {@code true} while a read-ahead word is available. */
    @Override // java.util.Iterator
    public boolean hasNext() {
        return this.nextWord != null;
    }

    /**
     * Returns the read-ahead word and reads the following one from the input.
     *
     * @return the next word
     * @throws IllegalStateException if no word is left
     * @throws RuntimeException if reading the following word fails
     */
    @Override // java.util.Iterator
    public OcrBox next() {
        if (!hasNext()) {
            throw new IllegalStateException("No more words in input");
        }
        OcrBox current = this.nextWord;
        this.nextWord = readNextOrFail();
        return current;
    }

    /** Returns the word that {@link #next()} would return, without consuming it. */
    public Optional<OcrBox> peek() {
        return Optional.ofNullable(this.nextWord);
    }

    /**
     * Updates the highlight-span state for a piece of word text and returns the span it
     * belongs to.
     *
     * <p>A new span is opened when none is open and the text contains {@link #START_HL}. An open
     * span is closed when the text contains {@link #END_HL}, with one exception: if the box is
     * the start of a hyphenated word, the span is kept open and closed by the following call
     * instead.
     *
     * @param str text to scan for highlight markers
     * @param ocrBox box the text belongs to; consulted for its hyphenation state
     * @return the identifier of the span covering this text, or {@code null} if there is none
     */
    // NOTE(review): the decompiler reports that this method's access modifier was changed from
    // protected; it is left public here so existing callers keep working.
    public UUID trackHighlightSpan(String str, OcrBox ocrBox) {
        if (this.currentHighlightSpan == null && str.contains(START_HL)) {
            this.currentHighlightSpan = UUID.randomUUID();
        }
        if (this.currentHighlightSpan == null) {
            return null;
        }
        if (!this.terminateHighlightSpanAfterNext && !str.contains(END_HL)) {
            // Span stays open: nothing in this text ends it.
            return this.currentHighlightSpan;
        }
        if (this.terminateHighlightSpanAfterNext) {
            this.terminateHighlightSpanAfterNext = false;
        } else if (ocrBox.isHyphenated() && ocrBox.isHyphenStart() && str.contains(END_HL)) {
            // Defer closing by one call so the next box still reports this span.
            this.terminateHighlightSpanAfterNext = true;
            return this.currentHighlightSpan;
        }
        UUID finishedSpan = this.currentHighlightSpan;
        this.currentHighlightSpan = null;
        return finishedSpan;
    }

    /** Returns the reader this parser consumes. */
    public PeekingReader getInput() {
        return this.input;
    }

    /**
     * Reads the next word from the XML stream.
     *
     * @param xMLStreamReader2 stream reader positioned somewhere in the OCR markup
     * @param set features enabled for this parser
     * @return the next word, or {@code null} when the input is exhausted
     * @throws XMLStreamException if the markup cannot be parsed
     */
    protected abstract OcrBox readNext(XMLStreamReader2 xMLStreamReader2, Set<ParsingFeature> set) throws XMLStreamException;

    /**
     * Reads the next word and converts parse errors, including those Woodstox raises lazily,
     * into a {@link RuntimeException} that names the input source.
     */
    private OcrBox readNextOrFail() {
        try {
            return readNext(this.xmlReader, this.features);
        } catch (XMLStreamException | WstxLazyException e) {
            throw new RuntimeException(
                    String.format(PARSE_FAILURE_MESSAGE, this.input.getSource().orElse("[unknown]")), e);
        }
    }

    /**
     * Concatenates the text of the given boxes into a single trimmed string.
     *
     * <p>A hyphen-start box directly followed by its continuation is rendered as the
     * continuation's dehyphenated form; a hyphen-start box without continuation is rendered as
     * its trimmed text with a trailing {@code -}. For other boxes, the first alternative that
     * contains a highlight marker is preferred over the regular text. Trailing characters are
     * appended after each rendered word when present.
     *
     * <p>Side effect: when a hyphenated pair is merged, the trailing characters of the
     * hyphen-start box are overwritten with those of its continuation.
     *
     * @param list boxes to render, in reading order
     * @return the combined text with leading and trailing whitespace removed
     */
    public static String boxesToString(List<OcrBox> list) {
        StringBuilder sb = new StringBuilder();
        int idx = 0;
        Iterator<OcrBox> boxes = list.iterator();
        while (boxes.hasNext()) {
            OcrBox box = boxes.next();
            if (box.isHyphenated() && box.isHyphenStart()) {
                if (isHyphenEnd(list, idx + 1)) {
                    // Consume the continuation here so it is not rendered a second time.
                    OcrBox continuation = boxes.next();
                    sb.append(continuation.getDehyphenatedForm());
                    box.setTrailingChars(continuation.getTrailingChars());
                    idx++;
                } else {
                    String text = box.getText().trim();
                    sb.append(text);
                    if (!text.endsWith("-")) {
                        sb.append("-");
                    }
                }
            } else {
                sb.append(preferredText(box));
            }
            if (box.getTrailingChars() != null) {
                sb.append(box.getTrailingChars());
            }
            idx++;
        }
        return sb.toString().trim();
    }

    /** Tells whether {@code list} has a box at {@code position} that ends a hyphenated word. */
    private static boolean isHyphenEnd(List<OcrBox> list, int position) {
        if (position >= list.size()) {
            return false;
        }
        OcrBox candidate = list.get(position);
        return candidate.isHyphenated() && !candidate.isHyphenStart();
    }

    /**
     * Returns the first alternative of {@code box} that contains a highlight marker, falling
     * back to the box's regular text when no alternative does.
     */
    private static String preferredText(OcrBox box) {
        return box.getAlternatives().stream()
                .filter(alternative -> alternative.contains(START_HL) || alternative.contains(END_HL))
                .findFirst()
                .orElseGet(box::getText);
    }
}
