package de.datexis.ner.reader;

import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Token;
import de.datexis.model.tag.BIO2Tag;
import de.datexis.model.tag.Tag;
import de.datexis.ner.MentionAnnotation;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/ner/reader/CoNLLDatasetReader.class */
/**
 * Reads a column-formatted (CoNLL-style) token file into a {@link Dataset}.
 *
 * <p>Each non-empty line is split on whitespace: the first column is the token text and one
 * column (the last one by default, see {@link #withTagIndex(int)}) carries a BIO-style tag
 * such as {@code O}, {@code B-PER} or {@code I-PER}. Lines starting with {@code -DOCSTART-}
 * begin a new document, and empty lines are turned into a newline token.
 *
 * <p>The reader is configured through the fluent {@code with...} methods, each of which
 * returns this instance.
 */
public class CoNLLDatasetReader implements DatasetReader {
    private static final Logger log = LoggerFactory.getLogger(CoNLLDatasetReader.class);

    /** Prefix of a line that starts a new document. */
    private static final String LINE_START = "-DOCSTART-";

    /** If set, each document's title is taken from the text of its first sentence. */
    protected boolean useFirstSentenceAsTitle = false;

    /** Annotation source under which the tags that were read are stored on tokens and documents. */
    protected Annotation.Source annotationSource = Annotation.Source.GOLD;

    /** Zero-based column of the tag; a negative value selects the last column of each line. */
    protected int tagIndex = -1;

    /** If non-null, replaces the entity type parsed from every tag. */
    protected String type = null;

    /** Explicit dataset name; if null, the name is derived from the resource's file name. */
    protected String name;

    /** Character encodings supported by {@link #read(Resource, Charset)}. */
    public enum Charset {
        UTF_8,
        ISO_8859_1
    }

    /**
     * Sets the name assigned to datasets produced by {@link #read(Resource, Charset)}.
     *
     * @param str dataset name, or null to derive the name from the resource's file name
     * @return this reader
     */
    public CoNLLDatasetReader withName(String str) {
        this.name = str;
        return this;
    }

    /**
     * Selects the column that holds the tag.
     *
     * @param i zero-based column index; a negative value selects the last column of each line
     * @return this reader
     */
    public CoNLLDatasetReader withTagIndex(int i) {
        this.tagIndex = i;
        return this;
    }

    /**
     * Controls whether the text of the first sentence is used as the document title.
     *
     * @param z true to set each document's title from its first sentence
     * @return this reader
     */
    public CoNLLDatasetReader withFirstSentenceAsTitle(boolean z) {
        this.useFirstSentenceAsTitle = z;
        return this;
    }

    /**
     * Sets the annotation source under which tags are stored.
     *
     * @param source the source to attach to the tags that are read
     * @return this reader
     */
    public CoNLLDatasetReader withAnnotationSource(Annotation.Source source) {
        this.annotationSource = source;
        return this;
    }

    /**
     * Forces a single entity type for all tags, ignoring the type found in the file.
     *
     * @param str the type to use for every B/I tag, or null to use the types from the file
     * @return this reader
     */
    public CoNLLDatasetReader withGenericType(String str) {
        this.type = str;
        return this;
    }

    /**
     * Reads a dataset from the given resource, decoding it as UTF-8.
     *
     * @param resource the resource to read from
     * @return the dataset that was read
     * @throws IOException if the resource cannot be opened or closed
     */
    public Dataset read(Resource resource) throws IOException {
        return read(resource, Charset.UTF_8);
    }

    /**
     * Reads a dataset from the given resource using the given encoding.
     *
     * <p>The dataset is named after {@link #withName(String)} if a name was configured;
     * otherwise the resource's file name is used with everything from its first dot removed.
     *
     * @param resource the resource to read from
     * @param charset the encoding of the resource; anything other than {@code ISO_8859_1}
     *     (including null) is read as UTF-8
     * @return the dataset that was read
     * @throws IOException if the resource cannot be opened or closed
     */
    public Dataset read(Resource resource, Charset charset) throws IOException {
        log.info("Reading Dataset from `{}`...", resource.toString());
        Dataset dataset;
        // The stream is closed exactly once, whether or not parsing succeeds.
        try (InputStream inputStream = resource.getInputStream()) {
            InputStreamReader reader = new InputStreamReader(
                    inputStream,
                    charset == Charset.ISO_8859_1
                            ? StandardCharsets.ISO_8859_1.newDecoder()
                            : StandardCharsets.UTF_8.newDecoder());
            dataset = readLines(new LineIterator(new BufferedReader(reader)));
        }
        if (this.name != null) {
            dataset.setName(this.name);
        } else {
            dataset.setName(resource.getFileName().replaceFirst("\\..+$", ""));
        }
        return dataset;
    }

    /**
     * Reads a dataset with a default-configured reader and assigns it the given name.
     *
     * @param resource the resource to read from
     * @param str the name to set on the resulting dataset
     * @param charset the encoding of the resource
     * @return the dataset that was read
     * @throws IOException if the resource cannot be opened or closed
     */
    public static Dataset readDataset(Resource resource, String str, Charset charset) throws IOException {
        Dataset dataset = new CoNLLDatasetReader().read(resource, charset);
        dataset.setName(str);
        return dataset;
    }

    /**
     * Builds a dataset from the given lines.
     *
     * <p>Token offsets are assigned consecutively within a document. A token is shifted one
     * position to the right unless the previous token's text is in
     * {@code WordHelpers.skipSpaceAfter} or its own text is in
     * {@code WordHelpers.skipSpaceBefore}. Lines that cannot be parsed are skipped.
     *
     * @param it iterator over the raw lines of the file
     * @return the dataset containing one document per {@code -DOCSTART-} section
     */
    protected Dataset readLines(Iterator<String> it) {
        Dataset dataset = new Dataset();
        ArrayList<Token> tokens = new ArrayList<>();
        String previousType = null; // entity type of the preceding token within the sentence
        int cursor = 0;             // end offset of the preceding token within the document
        String previousText = "";   // text of the preceding token within the document
        while (it.hasNext()) {
            String line = it.next().trim();
            if (line.startsWith(LINE_START)) {
                // Document boundary: emit what was collected so far and reset all state.
                addDocumentFromTokens(dataset, tokens);
                cursor = 0;
                tokens = new ArrayList<>();
                previousType = null;
                previousText = "";
            } else if (line.isEmpty()) {
                // Empty line: append a newline token, but never as the first token of a document.
                if (!tokens.isEmpty()) {
                    Token newline = new Token("\n", cursor, cursor + 1);
                    newline.putTag(this.annotationSource, BIO2Tag.O());
                    tokens.add(newline);
                    cursor = newline.getEnd();
                    previousText = newline.getText();
                }
                previousType = null;
            } else {
                Token token = createTokenFromLine(line, cursor, previousType);
                if (token == null) {
                    continue; // unreadable line, already logged
                }
                if (!WordHelpers.skipSpaceAfter.contains(previousText)
                        && !WordHelpers.skipSpaceBefore.contains(token.getText())) {
                    token.setBegin(token.getBegin() + 1);
                    token.setEnd(token.getEnd() + 1);
                }
                tokens.add(token);
                cursor = token.getEnd();
                previousType = token.getTag(this.annotationSource, BIO2Tag.class).getType();
                previousText = token.getText();
            }
        }
        // Emit the trailing document, which has no -DOCSTART- line after it.
        addDocumentFromTokens(dataset, tokens);
        for (Document document : dataset.getDocuments()) {
            if (this.useFirstSentenceAsTitle) {
                if (document.countSentences() > 0) {
                    document.setTitle(document.getSentence(0).getText());
                } else {
                    document.setTitle("");
                }
            }
            document.setTagAvailable(this.annotationSource, BIO2Tag.class, true);
        }
        log.info(String.format("Finished reading dataset (%,d docs, %,d sentences, %,d tokens, %,d mentions)", dataset.countDocuments(), dataset.countSentences(), dataset.countTokens(), dataset.countAnnotations()));
        return dataset;
    }

    /** Turns the collected tokens into an annotated document and adds it; does nothing if empty. */
    private void addDocumentFromTokens(Dataset dataset, ArrayList<Token> tokens) {
        if (tokens.isEmpty()) {
            return;
        }
        Document document = DocumentFactory.fromTokens(tokens);
        MentionAnnotation.annotateFromTags(document, this.annotationSource, (Class<? extends Tag>) BIO2Tag.class);
        dataset.addDocument(document);
    }

    /**
     * Parses one non-empty line into a tagged token.
     *
     * @param str the trimmed line; columns are separated by whitespace
     * @param i begin offset to assign to the token
     * @param str2 entity type of the preceding token, or null if there is none
     * @return the token, or null if the line could not be parsed (for example because the
     *     configured tag column does not exist on this line)
     */
    protected Token createTokenFromLine(String str, int i, String str2) {
        try {
            String[] columns = str.split("\\s+");
            int tagColumn = this.tagIndex >= 0 ? this.tagIndex : columns.length - 1;
            String text = columns[0];
            BIO2Tag tag = createTag(columns[tagColumn], str2);
            Token token = new Token(text, i, i + text.length());
            token.putTag(this.annotationSource, tag);
            return token;
        } catch (RuntimeException e) {
            // Best effort: a malformed line is reported and skipped instead of aborting the read.
            log.warn("could not read line: {}", str);
            log.debug("cause of unreadable line", e);
            return null;
        }
    }

    /**
     * Converts a tag string such as {@code O}, {@code B-PER} or {@code I-PER} into a tag.
     *
     * <p>The part before the first hyphen is the label and the part after it is the type. The
     * type is replaced by the configured generic type if one is set, and defaults to
     * {@code GENERIC} if the tag has none. An {@code I} label whose type differs from the
     * preceding token's type starts a new mention and is therefore returned as {@code B}.
     * Unknown labels are logged and treated as {@code O}.
     *
     * @param str the tag string from the file
     * @param str2 entity type of the preceding token, or null if there is none
     * @return the resulting tag
     */
    protected BIO2Tag createTag(String str, String str2) {
        String[] parts = str.split("\\-");
        String label = parts[0];
        String tagType;
        if (this.type != null) {
            tagType = this.type;
        } else {
            tagType = parts.length > 1 ? parts[1] : "GENERIC";
        }
        switch (label) {
            case "O":
                return new BIO2Tag(BIO2Tag.Label.O, (String) null);
            case "B":
                return new BIO2Tag(BIO2Tag.Label.B, tagType);
            case "I":
                return tagType.equals(str2)
                        ? new BIO2Tag(BIO2Tag.Label.I, tagType)
                        : new BIO2Tag(BIO2Tag.Label.B, tagType);
            default:
                log.warn("reading unknown tag " + str);
                return new BIO2Tag(BIO2Tag.Label.O, (String) null);
        }
    }
}
