package de.datexis.sector.reader;

import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/sector/reader/WikiCitiesReader.class */
public class WikiCitiesReader implements DatasetReader {
    protected static final Logger log = LoggerFactory.getLogger(WikiCitiesReader.class);
    protected boolean skipTopLevelSegment = true;
    protected Pattern LINE_PATTERN = Pattern.compile("^(\\d+),(\\d+),(.+?)\u0001(.+?)$");
    protected String TOPLEVEL_STRING = "TOP-LEVEL SEGMENT";

    public WikiCitiesReader withSkipTopLevelSegment(boolean z) {
        this.skipTopLevelSegment = z;
        return this;
    }

    public Dataset read(Resource resource) throws IOException {
        try {
            InputStream inputStream = resource.getInputStream();
            Throwable th = null;
            try {
                try {
                    LineIterator lineIterator = new LineIterator(new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8.newDecoder())));
                    Dataset dataset = new Dataset(resource.getFileName());
                    dataset.setName(resource.getFileName());
                    Document document = new Document();
                    StringBuilder sb = new StringBuilder();
                    String str = "";
                    while (lineIterator.hasNext()) {
                        String str2 = (String) lineIterator.next();
                        Matcher matcher = this.LINE_PATTERN.matcher(str2);
                        if (matcher.matches()) {
                            String group = matcher.group(1);
                            int parseInt = Integer.parseInt(matcher.group(2));
                            String group2 = matcher.group(3);
                            String group3 = matcher.group(4);
                            if (parseInt == 1) {
                                String sb2 = sb.toString();
                                if (sb2.trim().length() > 0) {
                                    addToDocument(sb2, str, document);
                                }
                                if (document.countTokens() > 0) {
                                    dataset.addDocument(document);
                                }
                                document = new Document();
                                document.setId(group);
                                sb = new StringBuilder();
                                str = "";
                            }
                            if (!this.skipTopLevelSegment || !group2.equals(this.TOPLEVEL_STRING)) {
                                if (!group2.equals(str)) {
                                    String sb3 = sb.toString();
                                    if (sb3.trim().length() > 0) {
                                        addToDocument(sb3, str, document);
                                    }
                                    sb = new StringBuilder();
                                    str = group2;
                                }
                                if (sb.length() > 0) {
                                    sb.append(" ");
                                }
                                sb.append(group3).append(" .");
                            }
                        } else {
                            log.error("matcher did not match for lineL\n{}", str2);
                        }
                    }
                    String sb4 = sb.toString();
                    if (sb4.trim().length() > 0) {
                        addToDocument(sb4, str, document);
                    }
                    if (document.countTokens() > 0) {
                        dataset.addDocument(document);
                    }
                    if (inputStream != null) {
                        if (0 != 0) {
                            try {
                                inputStream.close();
                            } catch (Throwable th2) {
                                th.addSuppressed(th2);
                            }
                        } else {
                            inputStream.close();
                        }
                    }
                    return dataset;
                } finally {
                }
            } finally {
            }
        } catch (IOException e) {
            log.error(e.toString());
            throw new RuntimeException(e.toString(), e.getCause());
        }
    }

    private void addToDocument(String str, String str2, Document document) {
        if (str.trim().length() == 0) {
            return;
        }
        Document fromTokenizedText = DocumentFactory.fromTokenizedText(str);
        String lowerCase = str2 == null ? "" : str2.equals(this.TOPLEVEL_STRING) ? "preface" : str2.trim().toLowerCase();
        document.append(fromTokenizedText);
        SectionAnnotation sectionAnnotation = new SectionAnnotation(Annotation.Source.GOLD, "wiki", lowerCase);
        sectionAnnotation.setSectionLabel(lowerCase.replaceAll("\\s+", "_"));
        sectionAnnotation.setBegin(fromTokenizedText.getBegin());
        sectionAnnotation.setEnd(fromTokenizedText.getEnd());
        document.addAnnotation(sectionAnnotation);
    }
}
