package de.digitalcollections.solrocr.formats.miniocr;

import com.google.common.collect.ImmutableMap;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.TagBreakLocator;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.io.Reader;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/miniocr/MiniOcrFormat.class */
/**
 * {@link OcrFormat} implementation for MiniOCR markup.
 *
 * <p>MiniOCR uses single-letter element names for its structural levels ({@code p}, {@code s},
 * {@code b}, {@code l}, {@code w}); see {@link #BLOCK_TAG_MAPPING} for the exact mapping used here.
 */
public class MiniOcrFormat implements OcrFormat {
    /**
     * Matches an opening page tag and captures its identifier and, if present, the two integers of
     * the {@code wh} attribute. The {@code wh} attribute is optional in the pattern, so the
     * {@code width} and {@code height} groups may be unmatched.
     */
    private static final Pattern PAGE_PATTERN = Pattern.compile("<p (?:xml)?:id=\"(?<pageId>.+?)\" ?(?:wh=\"(?<width>\\d+) (?<height>\\d+)\")?>");

    /** Element name used in the markup for each block level. */
    private static final Map<OcrBlock, String> BLOCK_TAG_MAPPING = ImmutableMap.of(OcrBlock.PAGE, "p", OcrBlock.SECTION, "s", OcrBlock.BLOCK, "b", OcrBlock.LINE, "l", OcrBlock.WORD, "w");

    /**
     * Creates a break locator for the tag that corresponds to the first given block type.
     *
     * <p>Only {@code ocrBlockArr[0]} is consulted; any further block types are ignored. An empty
     * array results in an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param iterableCharSequence the text to locate breaks in
     * @param ocrBlockArr block types to break on; only the first element is used
     * @return a {@link TagBreakLocator} for the tag mapped to the first block type
     */
    @Override
    public BreakLocator getBreakLocator(IterableCharSequence iterableCharSequence, OcrBlock... ocrBlockArr) {
        String breakTag = BLOCK_TAG_MAPPING.get(ocrBlockArr[0]);
        return new TagBreakLocator(iterableCharSequence, breakTag);
    }

    /**
     * Creates a {@link MiniOcrParser} reading from the given reader.
     *
     * @param reader source of the markup
     * @param parsingFeatureArr parsing features forwarded to the parser
     * @return the newly created parser
     * @throws RuntimeException wrapping the {@link XMLStreamException} if the parser cannot be
     *     constructed
     */
    @Override
    public OcrParser getParser(Reader reader, OcrParser.ParsingFeature... parsingFeatureArr) {
        try {
            return new MiniOcrParser(reader, parsingFeatureArr);
        } catch (XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Extracts page information from the first opening page tag found in the given fragment.
     *
     * @param str markup fragment to search
     * @return the page with its identifier and, when the tag carries a {@code wh} attribute, its
     *     dimensions; {@code null} if the fragment contains no matching page tag
     */
    @Override
    public OcrPage parsePageFragment(String str) {
        Matcher matcher = PAGE_PATTERN.matcher(str);
        if (!matcher.find()) {
            return null;
        }
        // The wh attribute is optional in PAGE_PATTERN. When it is absent both groups are null and
        // Integer.parseInt(null) would throw NumberFormatException, so only build the Dimension
        // when both values were actually captured.
        // NOTE(review): this hands OcrPage a null Dimension for pages without wh; confirm that
        // OcrPage and its consumers tolerate null dimensions.
        String width = matcher.group("width");
        String height = matcher.group("height");
        Dimension dimensions = null;
        if (width != null && height != null) {
            dimensions = new Dimension(Integer.parseInt(width), Integer.parseInt(height));
        }
        return new OcrPage(matcher.group("pageId"), dimensions);
    }

    /**
     * Checks whether the given text contains an opening tag of any known block element, either
     * followed by a space (i.e. with attributes) or closed immediately with {@code >}.
     *
     * @param str text to inspect
     * @return {@code true} if at least one such opening tag occurs in the text
     */
    @Override
    public boolean hasFormat(String str) {
        for (String tag : BLOCK_TAG_MAPPING.values()) {
            if (str.contains("<" + tag + " ") || str.contains("<" + tag + ">")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the index directly after the last {@code >} in the given text.
     *
     * @param str text to inspect
     * @return the index following the last {@code >}, or {@code 0} if the text contains none
     */
    @Override
    public int getLastContentStartIdx(String str) {
        return str.lastIndexOf(">") + 1;
    }

    /**
     * Returns the index of the first closing-tag opener ({@code </}) in the given text.
     *
     * @param str text to inspect
     * @return the index of the first {@code </}, or {@code -1} if the text contains none
     */
    @Override
    public int getFirstContentEndIdx(String str) {
        return str.indexOf("</");
    }
}
