package de.digitalcollections.solrocr.formats;

import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharStreams;
import de.digitalcollections.solrocr.lucene.fieldloader.PathFieldLoader;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.OcrBox;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/OcrPassageFormatter.class */
public abstract class OcrPassageFormatter extends PassageFormatter {
    /** Matches an alphanumeric character that is directly followed by the start of a closing tag ({@code </}). */
    private static final Pattern LAST_INNER_TAG_PAT = Pattern.compile("[a-zA-Z0-9]</");

    /** Logger for this class (previously created for an unrelated class, which mis-attributed the log output). */
    private static final Logger logger = LoggerFactory.getLogger(OcrPassageFormatter.class);

    /** Tag inserted in front of every highlighted match. */
    protected final String startHlTag;

    /** Tag inserted behind every highlighted match. */
    protected final String endHlTag;

    /**
     * If {@code true}, highlight coordinates are left as they are; otherwise they are shifted by the
     * upper-left corner of the snippet region they belong to.
     */
    protected final boolean absoluteHighlights;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/digitalcollections/solrocr/formats/OcrPassageFormatter$PassageMatch.class */
    /**
     * A mutable {@code [start, end]} offset range of a match inside a passage.
     *
     * <p>Both bounds are treated as inclusive when testing for overlap, so two ranges that merely touch
     * (one ends where the other starts) count as overlapping.
     */
    public static class PassageMatch {
        /** Offset at which the match starts. */
        public int start;
        /** Offset at which the match ends. */
        public int end;

        /**
         * @param i the start offset
         * @param i2 the end offset
         */
        public PassageMatch(int i, int i2) {
            this.start = i;
            this.end = i2;
        }

        /**
         * Checks whether this range and the given one share at least one offset.
         *
         * @param passageMatch the range to test against
         * @return {@code true} if the other range starts or ends inside this range, or if this range
         *     lies completely inside the other one
         */
        public boolean overlaps(PassageMatch passageMatch) {
            boolean otherStartsInside = this.start <= passageMatch.start && passageMatch.start <= this.end;
            boolean otherEndsInside = this.start <= passageMatch.end && passageMatch.end <= this.end;
            boolean containedInOther = passageMatch.start <= this.start && this.start <= passageMatch.end
                && passageMatch.start <= this.end && this.end <= passageMatch.end;
            return otherStartsInside || otherEndsInside || containedInOther;
        }

        /**
         * Grows this range so that it also covers the given range.
         *
         * <p>Both bounds are adjusted independently: a range that sticks out on both sides widens this
         * range on both sides.
         *
         * @param passageMatch the range to absorb
         */
        public void merge(PassageMatch passageMatch) {
            this.start = Math.min(this.start, passageMatch.start);
            this.end = Math.max(this.end, passageMatch.end);
        }

        @Override
        public String toString() {
            return String.format("PassageMatch{start=%d, end=%d}", this.start, this.end);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Stores the highlighting configuration shared by all format-specific subclasses.
     *
     * @param str the tag to insert in front of a match
     * @param str2 the tag to insert behind a match
     * @param z whether highlight coordinates should be kept as-is instead of being made relative to
     *     their snippet region
     */
    public OcrPassageFormatter(String str, String str2, boolean z) {
        this.absoluteHighlights = z;
        this.endHlTag = str2;
        this.startHlTag = str;
    }

    /**
     * Collapses overlapping matches into single ranges.
     *
     * <p>Each match is only compared against the most recently emitted range, so the result is only
     * fully merged if the matches arrive ordered by their start offset.
     *
     * @param i the number of matches to read from the two arrays
     * @param iArr the start offsets of the matches
     * @param iArr2 the end offsets of the matches
     * @return the merged ranges in input order; empty if {@code i} is not positive
     */
    private List<PassageMatch> mergeMatches(int i, int[] iArr, int[] iArr2) {
        List<PassageMatch> merged = new ArrayList<>();
        PassageMatch current = null;
        for (int idx = 0; idx < i; idx++) {
            PassageMatch candidate = new PassageMatch(iArr[idx], iArr2[idx]);
            if (current != null && current.overlaps(candidate)) {
                current.merge(candidate);
            } else {
                // No overlap with the previous range (or no previous range yet): start a new one.
                merged.add(candidate);
                current = candidate;
            }
        }
        return merged;
    }

    /**
     * Formats every passage into an OCR snippet.
     *
     * <p>A passage whose offsets fall outside of the content is logged and skipped; its slot in the
     * result stays {@code null}.
     *
     * @param passageArr the passages to format
     * @param iterableCharSequence the content the passages refer to
     * @return one entry per passage, in the same order; entries may be {@code null}
     */
    public OcrSnippet[] format(Passage[] passageArr, IterableCharSequence iterableCharSequence) {
        OcrSnippet[] snippets = new OcrSnippet[passageArr.length];
        int slot = 0;
        for (Passage current : passageArr) {
            try {
                snippets[slot] = format(current, iterableCharSequence);
            } catch (IndexOutOfBoundsException e) {
                StringBuilder message = new StringBuilder(String.format(
                    "Could not create snippet (start=%d, end=%d) from content at '%s' due to an out-of-bounds error.",
                    current.getStartOffset(), current.getEndOffset(), iterableCharSequence.getIdentifier()));
                if (iterableCharSequence.getOffsetType() == IterableCharSequence.OffsetType.BYTES) {
                    message.append("\nDoes the file on disk correspond to the document that was used for determining the offsets during indexing?");
                } else {
                    message.append("\nDoes the file on disk correspond to the document that was used during indexing?");
                }
                logger.error(message.toString(), e);
            }
            slot++;
        }
        return snippets;
    }

    /**
     * Builds the snippet for a single passage: cuts the passage out of the content, wraps every
     * (merged) match in the highlighting tags and hands the result to {@link #parseFragment}.
     *
     * <p>Passage and match offsets are offsets into {@code iterableCharSequence}, which are not
     * necessarily character offsets (the content can report {@code OffsetType.BYTES}). Positions inside
     * the fragment are therefore always derived from the length of the decoded sub-sequence, never by
     * subtracting offsets from each other.
     *
     * @param passage the passage to format
     * @param iterableCharSequence the content the passage refers to
     * @return the snippet with the passage's score set, or {@code null} if the fragment yielded no snippet
     */
    private OcrSnippet format(Passage passage, IterableCharSequence iterableCharSequence) {
        int passageStart = passage.getStartOffset();
        StringBuilder fragment =
            new StringBuilder(iterableCharSequence.subSequence(passageStart, passage.getEndOffset()));
        // Number of characters added to the fragment by tags inserted so far.
        int insertedChars = 0;
        if (passage.getNumMatches() > 0) {
            List<PassageMatch> matches =
                mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
            for (PassageMatch match : matches) {
                // Character position of the match start, relative to the untagged fragment.
                int matchStart = iterableCharSequence.subSequence(passageStart, match.start).toString().length();
                fragment.insert(insertedChars + matchStart, this.startHlTag);
                insertedChars += this.startHlTag.length();

                // Character position of the match end, relative to the untagged fragment.
                int matchEnd = iterableCharSequence.subSequence(passageStart, match.end).toString().length();
                String matchText = fragment.substring(insertedChars + matchStart, insertedChars + matchEnd);
                if (matchText.trim().endsWith(">")) {
                    // The match ends in markup: close the highlight right after the last text
                    // character that precedes a closing tag inside of the match instead.
                    int lastTextEnd = -1;
                    Matcher innerTags = LAST_INNER_TAG_PAT.matcher(matchText);
                    while (innerTags.find()) {
                        lastTextEnd = innerTags.start() + 1;
                    }
                    if (lastTextEnd > -1) {
                        // lastTextEnd is a character index into matchText, so the new end must be
                        // computed in characters as well. Subtracting match.end - match.start would
                        // mix in source offsets and misplace the tag for byte-based offsets.
                        matchEnd = matchStart + lastTextEnd;
                    }
                }
                fragment.insert(Math.min(insertedChars + matchEnd, fragment.length()), this.endHlTag);
                insertedChars += this.endHlTag.length();
            }
        }
        String highlighted = fragment.toString();
        String startPage = determineStartPage(highlighted, passage.getStartOffset(), iterableCharSequence);
        OcrSnippet snippet = parseFragment(highlighted, startPage);
        if (snippet != null) {
            snippet.setScore(passage.getScore());
        }
        return snippet;
    }

    /**
     * Reduces a markup fragment to its plain text, keeping only the highlighting tags.
     *
     * <p>All markup except the highlighting tag is stripped, XML entities are unescaped, newlines are
     * removed, remaining whitespace runs are collapsed to a single space and the result is trimmed.
     *
     * @param str the markup fragment, with highlighting tags already inserted
     * @return the plain text; if stripping fails with an I/O error, the fragment is returned unmodified
     */
    protected String getTextFromXml(String str) {
        // The tag name is the start tag without its leading '<' and trailing '>'.
        String hlTagName = this.startHlTag.substring(1, this.startHlTag.length() - 1);
        try (HTMLStripCharFilter stripper =
                 new HTMLStripCharFilter(new StringReader(str), ImmutableSet.of(hlTagName))) {
            String stripped = CharStreams.toString(stripper);
            return StringEscapeUtils.unescapeXml(stripped)
                .replaceAll("\n", "")
                .replaceAll("\\s+", " ")
                .trim();
        } catch (IOException e) {
            // Best effort: keep returning the raw fragment, but leave a trace of the failure.
            logger.warn("Could not strip markup from OCR fragment, returning the fragment unmodified.", e);
            return str;
        }
    }

    /**
     * Determines the page a passage starts on.
     *
     * <p>Called while formatting a passage; the returned value is passed on unchanged as the second
     * argument of {@link #parseFragment(String, String)}.
     *
     * @param str the passage fragment, with highlighting tags already inserted
     * @param i the start offset of the passage in {@code iterableCharSequence}
     * @param iterableCharSequence the content the passage was taken from
     * @return the start page for the fragment (presumably a page identifier in the same form as
     *     {@code OcrBox.getPageId()} -- confirm against the implementations)
     */
    public abstract String determineStartPage(String str, int i, IterableCharSequence iterableCharSequence);

    /**
     * Turns a highlighted markup fragment into a snippet with regions and highlights.
     *
     * <p>The words of the fragment are grouped by page (in order of first appearance) to obtain one
     * snippet region per page, and consecutive highlighted words are collected into runs that are
     * then registered as highlights on the snippet.
     *
     * @param str the markup fragment, with highlighting tags already inserted
     * @param str2 the start page, handed on to {@link #parseWords(String, String)}
     * @return the snippet, or {@code null} if the fragment contains no words
     */
    protected OcrSnippet parseFragment(String str, String str2) {
        List<OcrBox> words = parseWords(str, str2);
        if (words.isEmpty()) {
            return null;
        }

        Map<String, List<OcrBox>> wordsByPage = words.stream()
            .collect(Collectors.groupingBy(OcrBox::getPageId, LinkedHashMap::new, Collectors.toList()));

        // Collect runs of directly consecutive highlighted words.
        List<List<OcrBox>> highlightRuns = new ArrayList<>();
        List<OcrBox> openRun = null;
        for (OcrBox word : words) {
            if (!word.isHighlight()) {
                if (openRun != null) {
                    highlightRuns.add(openRun);
                    openRun = null;
                }
                continue;
            }
            if (openRun == null) {
                openRun = new ArrayList<>();
            }
            openRun.add(word);
        }
        if (openRun != null) {
            highlightRuns.add(openRun);
        }

        String text = getTextFromXml(str);
        List<OcrBox> regions = wordsByPage.entrySet().stream()
            .map(pageWords -> determineSnippetRegion(pageWords.getValue(), pageWords.getKey()))
            .collect(Collectors.toList());
        OcrSnippet snippet = new OcrSnippet(text, regions);
        addHighlightsToSnippet(highlightRuns, snippet);
        return snippet;
    }

    /**
     * Computes the bounding box that encloses all of the given boxes.
     *
     * @param list the boxes to enclose; must not be empty
     * @param str the page id to assign to the resulting box
     * @return a non-highlight box without text, spanning from the smallest upper-left to the largest
     *     lower-right coordinates
     */
    private OcrBox determineSnippetRegion(List<OcrBox> list, String str) {
        float minUlx = list.stream().map(OcrBox::getUlx).min(Float::compareTo).get();
        float minUly = list.stream().map(OcrBox::getUly).min(Float::compareTo).get();
        float maxLrx = list.stream().map(OcrBox::getLrx).max(Float::compareTo).get();
        float maxLry = list.stream().map(OcrBox::getLry).max(Float::compareTo).get();
        return new OcrBox(null, str, minUlx, minUly, maxLrx, maxLry, false);
    }

    /**
     * Parses the words contained in a markup fragment.
     *
     * <p>{@link #parseFragment(String, String)} relies on the page id and highlight flag of the
     * returned boxes, and treats an empty list as "no snippet". It calls {@code isEmpty()} on the
     * result directly, so implementations must not return {@code null}.
     *
     * @param str the markup fragment, with highlighting tags already inserted
     * @param str2 the start page as returned by {@link #determineStartPage}
     * @return the word boxes found in the fragment, possibly empty
     */
    protected abstract List<OcrBox> parseWords(String str, String str2);

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Registers the highlight runs on the snippet, once per snippet region.
     *
     * <p>For every region, each run is reduced to the boxes on the region's page, shifted by the
     * region's upper-left corner (unless absolute highlights are configured), merged per line and
     * added to the snippet. A run without boxes on the region's page is still added, as an empty list.
     *
     * @param list the runs of consecutive highlighted boxes
     * @param ocrSnippet the snippet to add the highlights to
     */
    public void addHighlightsToSnippet(List<List<OcrBox>> list, OcrSnippet ocrSnippet) {
        for (OcrBox region : ocrSnippet.getSnippetRegions()) {
            float xOffset = 0.0f;
            float yOffset = 0.0f;
            if (!this.absoluteHighlights) {
                xOffset = region.getUlx();
                yOffset = region.getUly();
            }
            for (List<OcrBox> run : list) {
                List<OcrBox> shifted = new ArrayList<>();
                for (OcrBox box : run) {
                    if (!box.getPageId().equals(region.getPageId())) {
                        continue;
                    }
                    shifted.add(new OcrBox(
                        box.getText(),
                        box.getPageId(),
                        box.getUlx() - xOffset,
                        box.getUly() - yOffset,
                        box.getLrx() - xOffset,
                        box.getLry() - yOffset,
                        box.isHighlight()));
                }
                ocrSnippet.addHighlightRegion(mergeBoxes(shifted));
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Merges boxes that sit on the same line into a single box per line.
     *
     * <p>A box starts a new line if its upper edge differs from the current line's upper edge by more
     * than 75% of the current line's height. Otherwise its text is appended (separated by a space) and
     * the line box is grown to the right, downwards and upwards as needed. The first box of every line
     * is modified in place.
     *
     * @param list the boxes to merge, in reading order
     * @return the given list itself if it has fewer than two boxes, otherwise a new list with one box
     *     per line
     */
    public List<OcrBox> mergeBoxes(List<OcrBox> list) {
        if (list.size() <= 1) {
            return list;
        }
        List<OcrBox> lines = new ArrayList<>();
        OcrBox lineBox = list.get(0);
        StringBuilder lineText = new StringBuilder(lineBox.getText());
        for (OcrBox box : list.subList(1, list.size())) {
            float lineHeight = lineBox.getLry() - lineBox.getUly();
            boolean startsNewLine = Math.abs(box.getUly() - lineBox.getUly()) > 0.75d * lineHeight;
            if (startsNewLine) {
                // Finish the current line and continue with this box as the start of the next one.
                lineBox.setText(lineText.toString());
                lines.add(lineBox);
                lineBox = box;
                lineText = new StringBuilder(lineBox.getText());
                continue;
            }
            lineText.append(" ").append(box.getText());
            if (box.getLrx() > lineBox.getLrx()) {
                lineBox.setLrx(box.getLrx());
            }
            if (box.getLry() > lineBox.getLry()) {
                lineBox.setLry(box.getLry());
            }
            if (box.getUly() < lineBox.getUly()) {
                lineBox.setUly(box.getUly());
            }
        }
        lineBox.setText(lineText.toString());
        lines.add(lineBox);
        return lines;
    }

    /**
     * Formats the passages against plain string content and returns only the snippet texts.
     *
     * <p>The array-based formatting leaves a {@code null} entry for every passage that could not be
     * turned into a snippet; those positions are kept as {@code null} in the result instead of
     * failing with a {@code NullPointerException}.
     *
     * @param passageArr the passages to format
     * @param str the content the passages refer to
     * @return a {@code String[]} with one entry per passage, in the same order; entries may be {@code null}
     */
    @Override
    public Object format(Passage[] passageArr, String str) {
        OcrSnippet[] snippets = format(passageArr, IterableCharSequence.fromString(str));
        return Arrays.stream(snippets)
            .map(snippet -> snippet == null ? null : snippet.getText())
            .toArray(String[]::new);
    }
}
