package de.digitalcollections.solrocr.formats.hocr;

import de.digitalcollections.solrocr.util.Streams;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import net.byteseek.compiler.CompileException;
import net.byteseek.compiler.matcher.SequenceMatcherCompiler;
import net.byteseek.matcher.sequence.ByteSequenceMatcher;
import net.byteseek.matcher.sequence.SequenceMatcher;
import net.byteseek.searcher.ForwardSearchIterator;
import net.byteseek.searcher.SearchResult;
import net.byteseek.searcher.Searcher;
import net.byteseek.searcher.sequence.SequenceMatcherSearcher;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/hocr/HocrByteOffsetsParser.class */
/**
 * Extracts the text content of hOCR word spans from a raw byte buffer and annotates each word
 * with the byte offset at which its text starts.
 *
 * <p>For every {@code <span} whose class value is {@code ocrx_word}, the bytes of the word text
 * are written to the output, immediately followed by the UTF-8 encoding of {@code ⚑<offset> },
 * where {@code <offset>} is the decimal position of the first text byte inside the input array.
 */
public class HocrByteOffsetsParser {
    private static final Searcher<SequenceMatcher> BEGIN_SPAN_SEARCHER = new SequenceMatcherSearcher(new ByteSequenceMatcher("<span"));
    private static final Searcher<SequenceMatcher> END_SPAN_SEARCHER = new SequenceMatcherSearcher(new ByteSequenceMatcher("</span>"));

    /**
     * Distance in bytes from the start of a {@code <span} match to the class value that is
     * inspected. This equals the length of {@code <span class='}, i.e. the check presumes that
     * {@code class} is the first attribute of the span.
     */
    private static final int CLASS_VALUE_OFFSET = 13;

    /** Class value that identifies a word span. */
    private static final byte[] WORD_CLASS = "ocrx_word".getBytes(StandardCharsets.UTF_8);

    /**
     * Locates the first {@code ocr_page} div whose id starts with the given page id.
     *
     * <p>Note: the search pattern ends with the id itself and has no terminator, so this is a
     * prefix match ({@code page_1} also matches {@code page_10}); the first hit in the buffer wins.
     * The id is spliced into the byteseek expression verbatim, so an id containing a single quote
     * yields an invalid expression.
     *
     * @param bArr the hOCR document bytes
     * @param str the page id to look for
     * @return the position of the matching {@code <div} in {@code bArr}
     * @throws IllegalArgumentException if no matching page is found
     * @throws RuntimeException wrapping the {@link CompileException} if the expression is invalid
     */
    private static int getPageOffset(byte[] bArr, String str) {
        final SequenceMatcher pageMatcher;
        try {
            pageMatcher = SequenceMatcherCompiler.compileFrom("'<div class=' . 'ocr_page' . ' id=' . '" + str + "'");
        } catch (CompileException e) {
            throw new RuntimeException(e);
        }
        ForwardSearchIterator<SequenceMatcher> pageMatches = new ForwardSearchIterator<>(new SequenceMatcherSearcher(pageMatcher), bArr);
        if (!pageMatches.hasNext()) {
            throw new IllegalArgumentException("Could not find page with id '" + str + "'");
        }
        return (int) pageMatches.next().get(0).getMatchPosition();
    }

    /**
     * Tells whether the span starting at {@code spanStart} carries the {@code ocrx_word} class.
     * Spans too close to the end of the buffer to hold the class value are never word spans.
     */
    private static boolean isWordSpan(byte[] bArr, long spanStart) {
        long classStart = spanStart + CLASS_VALUE_OFFSET;
        if (classStart < 0 || classStart + WORD_CLASS.length > bArr.length) {
            return false;
        }
        int base = (int) classStart;
        for (int i = 0; i < WORD_CLASS.length; i++) {
            if (bArr[base + i] != WORD_CLASS[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Parses the whole buffer; equivalent to {@link #parse(byte[], OutputStream, String, String)}
     * without page bounds.
     *
     * @param bArr the hOCR document bytes
     * @param outputStream receives the annotated words
     * @throws IOException if writing to {@code outputStream} fails
     */
    public static void parse(byte[] bArr, OutputStream outputStream) throws IOException {
        parse(bArr, outputStream, null, null);
    }

    /**
     * Writes every word between two pages to {@code outputStream}, each followed by its byte
     * offset marker.
     *
     * @param bArr the hOCR document bytes
     * @param outputStream receives the annotated words
     * @param str id of the page to start at, or {@code null} to start at the beginning
     * @param str2 id of the page to stop at, or {@code null} to search up to the last byte
     * @throws IOException if writing to {@code outputStream} fails
     * @throws IllegalArgumentException if a given page id cannot be found
     */
    public static void parse(byte[] bArr, OutputStream outputStream, String str, String str2) throws IOException {
        int startOffset = str != null ? getPageOffset(bArr, str) : 0;
        int endOffset = str2 != null ? getPageOffset(bArr, str2) : bArr.length - 1;

        // Matches a '>' directly followed by any byte other than '<', i.e. a tag end followed by text.
        final Searcher<SequenceMatcher> textStartSearcher;
        try {
            textStartSearcher = new SequenceMatcherSearcher(SequenceMatcherCompiler.compileFrom("'>' ^'<'"));
        } catch (CompileException e) {
            // Keep the cause so a broken expression can actually be diagnosed.
            throw new RuntimeException(e);
        }

        ForwardSearchIterator<SequenceMatcher> spanStarts = new ForwardSearchIterator<>(BEGIN_SPAN_SEARCHER, startOffset, endOffset, bArr);
        ForwardSearchIterator<SequenceMatcher> spanEnds = new ForwardSearchIterator<>(END_SPAN_SEARCHER, startOffset, endOffset, bArr);

        // The n-th "<span" is paired with the n-th "</span>"; only pairs whose opening tag is a
        // word span are kept.
        List<ImmutablePair<Long, Long>> wordSpans = Streams.zip(
                Streams.stream(spanStarts).flatMap(List::stream).map(SearchResult::getMatchPosition),
                Streams.stream(spanEnds).flatMap(List::stream).map(SearchResult::getMatchPosition),
                (begin, end) -> new ImmutablePair<Long, Long>(begin, end))
            .filter(span -> isWordSpan(bArr, span.left))
            .collect(Collectors.toList());

        for (ImmutablePair<Long, Long> span : wordSpans) {
            ForwardSearchIterator<SequenceMatcher> textStarts = new ForwardSearchIterator<>(
                    textStartSearcher, Math.toIntExact(span.left), Math.toIntExact(span.right), bArr);
            if (!textStarts.hasNext()) {
                continue;
            }
            // The match sits on the '>' itself; the text begins one byte later.
            int textStart = (int) textStarts.next().get(0).getMatchPosition() + 1;
            int textEnd = ArrayUtils.indexOf(bArr, (byte) '<', textStart);
            if (textEnd == ArrayUtils.INDEX_NOT_FOUND) {
                // Defensive: without a following '<' the write length would be negative.
                continue;
            }
            outputStream.write(bArr, textStart, textEnd - textStart);
            // Plain concatenation keeps the offset in ASCII digits regardless of the default locale.
            outputStream.write(("⚑" + textStart + " ").getBytes(StandardCharsets.UTF_8));
        }
    }

    /**
     * Ad-hoc driver: parses pages {@code page_118} up to {@code page_120} of the bundled test
     * document, then prints the elapsed time and the annotated text to standard output.
     *
     * @param strArr ignored
     * @throws IOException if the test file cannot be read
     */
    public static void main(String[] strArr) throws IOException {
        ByteArrayOutputStream annotated = new ByteArrayOutputStream();
        long startedAt = System.nanoTime();
        parse(Files.readAllBytes(Paths.get("src/test/resources/data/hocr_test.html")), annotated, "page_118", "page_120");
        System.out.println(String.format("Parsing took %.2fms", (System.nanoTime() - startedAt) / 1000000.0d));
        System.out.println(annotated.toString(StandardCharsets.UTF_8.name()));
    }
}
