package de.datexis.cdv.reader;

import de.datexis.cdv.model.EntityAspectAnnotation;
import de.datexis.cdv.retrieval.EntityAspectQueryAnnotation;
import de.datexis.common.InternalResource;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Query;
import de.datexis.model.impl.PassageAnnotation;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import de.datexis.retrieval.model.RelevanceResult;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Reads MedQuAD question/answer XML files into a {@link Dataset}.
 *
 * <p>Each XML file ({@code /Document}) becomes one {@link Document} whose text is the
 * concatenation of all {@code QAPair/Answer} texts. Every answer is annotated as a passage,
 * and every question that carries an aspect ({@code Question/@qtype}) is turned into a
 * {@link Query} with a {@link RelevanceResult} pointing at its answer passage. Files whose
 * {@code FocusAnnotations/UMLS/SemanticGroup} is not {@code "Disorders"} are skipped.
 *
 * <p>The reader is configured fluently through the {@code with...} methods before calling
 * {@link #read(Resource)}.
 */
public class MedQuADReader implements DatasetReader {

    protected static final Logger log = LoggerFactory.getLogger(MedQuADReader.class);

    /** Matches {@code .../<number>_<subset>/<file>.xml}; group 2 is the subset name. */
    private static final Pattern SUBSET_PATTERN = Pattern.compile(".+\\/(\\d+)_(.+)?\\/.+?\\.xml$");

    /** Matches a question that is repeated at the very beginning of an answer text. */
    private static final Pattern LEADING_QUESTION_PATTERN = Pattern.compile("^(.(?!\\. ))+?\\? ");

    /** Number of bytes in one GiB (1024^3), used for memory reporting only. */
    private static final double BYTES_PER_GB = 1.073741824E9d;

    protected XPath xpath;
    protected DocumentBuilder builder;

    private final XPathExpression docIdQuery;
    private final XPathExpression docUrlQuery;
    private final XPathExpression docSourceQuery;
    private final XPathExpression docFocusQuery;
    private final XPathExpression docFocusIDsQuery;
    private final XPathExpression docFocusGroupQuery;
    private final XPathExpression docPassageQuery;
    private final XPathExpression docQuestionIdQuery;
    private final XPathExpression docQuestionAspectQuery;
    private final XPathExpression docQuestionTextQuery;
    private final XPathExpression docAnswerQuery;

    /** First column to second column of the file given to {@link #withIDMapping(Resource)} (looked up by CUI). */
    Map<String, String> umlsMap = null;
    /** Lower-cased second column to first column of the file given to {@link #withNameMapping(Resource)}. */
    Map<String, String> namesMap = null;
    /** First column to second column of the file given to {@link #withNameMapping(Resource)}. */
    Map<String, String> wikidataMap = null;

    protected boolean keepEmptyDocs = false;
    protected boolean removeQuestions = false;
    protected Set<Class<? extends Annotation>> requestedAnnotations = new HashSet<>();
    /** Raw lines of the passage label file (first line is treated as a header), or {@code null}. */
    protected List<String> labels;

    /**
     * Creates a reader and compiles all XPath expressions used to extract fields from the XML.
     *
     * @throws IllegalStateException if the XML parser or an XPath expression cannot be created
     */
    public MedQuADReader() {
        try {
            this.builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            this.xpath = XPathFactory.newInstance().newXPath();
            this.docIdQuery = this.xpath.compile("/Document/@id");
            this.docUrlQuery = this.xpath.compile("/Document/@url");
            this.docSourceQuery = this.xpath.compile("/Document/@source");
            this.docFocusQuery = this.xpath.compile("/Document/Focus/text()");
            this.docFocusIDsQuery = this.xpath.compile("/Document/FocusAnnotations/UMLS/CUIs/CUI");
            this.docFocusGroupQuery = this.xpath.compile("/Document/FocusAnnotations/UMLS/SemanticGroup");
            this.docPassageQuery = this.xpath.compile("/Document/QAPairs/QAPair");
            this.docQuestionIdQuery = this.xpath.compile("./Question/@qid");
            this.docQuestionAspectQuery = this.xpath.compile("./Question/@qtype");
            this.docQuestionTextQuery = this.xpath.compile("./Question/text()");
            this.docAnswerQuery = this.xpath.compile("./Answer/text()");
        } catch (ParserConfigurationException | XPathExpressionException e) {
            // Fail fast: a half-initialized reader would only produce NullPointerExceptions later.
            throw new IllegalStateException("Could not initialize XML/XPath processing for MedQuADReader", e);
        }
    }

    /**
     * Requests an annotation type to be produced while reading. Requesting
     * {@link EntityAspectAnnotation} makes the reader emit entity/aspect annotations instead of
     * plain {@link PassageAnnotation}s.
     *
     * @param annotationClass the annotation type to request
     * @return this reader
     */
    public MedQuADReader withAnnotations(Class<? extends Annotation> annotationClass) {
        this.requestedAnnotations.add(annotationClass);
        return this;
    }

    /**
     * Loads a tab-separated ID mapping (column 1 to column 2) that is used to translate the CUIs
     * of a document into entity IDs. Lines with fewer than two columns are ignored.
     *
     * @param resource the TSV file to read (UTF-8)
     * @return this reader
     * @throws IOException if the file cannot be read
     */
    public MedQuADReader withIDMapping(Resource resource) throws IOException {
        List<String> lines = FileUtils.readLines(resource.toFile(), StandardCharsets.UTF_8);
        Map<String, String> mapping = new ConcurrentHashMap<>(lines.size());
        for (String line : lines) {
            String[] columns = line.split("\\t");
            if (columns.length < 2) {
                continue; // blank or malformed line
            }
            mapping.put(columns[0], columns[1]);
        }
        this.umlsMap = mapping;
        return this;
    }

    /**
     * Loads a tab-separated name mapping. Column 1 is stored under the lower-cased column 2
     * (used to resolve an entity by name) and column 2 is stored under column 1 (used by the
     * print methods). The first occurrence of a key wins; lines with fewer than two columns are
     * ignored.
     *
     * @param resource the TSV file to read (UTF-8)
     * @return this reader
     * @throws IOException if the file cannot be read
     */
    public MedQuADReader withNameMapping(Resource resource) throws IOException {
        List<String> lines = FileUtils.readLines(resource.toFile(), StandardCharsets.UTF_8);
        Map<String, String> byName = new ConcurrentHashMap<>(lines.size());
        Map<String, String> byId = new ConcurrentHashMap<>(lines.size());
        for (String line : lines) {
            String[] columns = line.split("\\t");
            if (columns.length < 2) {
                continue; // blank or malformed line
            }
            byName.putIfAbsent(columns[1].toLowerCase(), columns[0]);
            byId.putIfAbsent(columns[0], columns[1]);
        }
        this.namesMap = byName;
        this.wikidataMap = byId;
        return this;
    }

    /**
     * Loads per-document labels. The first line is skipped as a header when reading; every
     * following line is expected to be tab-separated as {@code docId, entity name[, entity id]}
     * and is consumed in the order in which documents are accepted.
     *
     * @param resource the label file to read (UTF-8)
     * @return this reader
     * @throws IOException if the file cannot be read
     */
    public MedQuADReader withPassageLabelsCSV(Resource resource) throws IOException {
        try (InputStream in = resource.getInputStream()) {
            this.labels = IOUtils.readLines(in, StandardCharsets.UTF_8);
        }
        return this;
    }

    /**
     * @param keepEmptyDocs whether documents without text (and their results) are kept
     * @return this reader
     */
    public MedQuADReader withKeepEmptyDocs(boolean keepEmptyDocs) {
        this.keepEmptyDocs = keepEmptyDocs;
        return this;
    }

    /**
     * @param removeQuestions whether a question repeated at the start of an answer is stripped
     * @return this reader
     */
    public MedQuADReader withRemoveQuestions(boolean removeQuestions) {
        this.removeQuestions = removeQuestions;
        return this;
    }

    /**
     * Reads a single XML file or, for a directory, all contained {@code .xml} files.
     *
     * @param resource file or directory to read
     * @return the dataset that was read
     * @throws FileNotFoundException if the resource is neither a file nor a directory
     * @throws IOException if the directory cannot be traversed
     */
    public Dataset read(Resource resource) throws IOException {
        if ((resource instanceof InternalResource) || resource.isFile()) {
            Dataset dataset = new Dataset(resource.getFileName());
            addDocumentFromFile(resource, dataset, null);
            return dataset;
        }
        if (resource.isDirectory()) {
            return readDatasetFromDirectory(resource, "\\.xml$");
        }
        throw new FileNotFoundException("cannot open path: " + resource.toString());
    }

    /**
     * Recursively reads all regular files below a directory whose path contains a match of the
     * given regular expression.
     *
     * <p>The expression is applied with "find" semantics, so a suffix pattern such as
     * {@code "\\.xml$"} selects all XML files; patterns that match the entire path keep working.
     *
     * @param resource the directory to traverse
     * @param filePattern regular expression selecting the files to read
     * @return the dataset that was read, named after the directory
     * @throws IOException if the directory cannot be traversed
     */
    public Dataset readDatasetFromDirectory(Resource resource, String filePattern) throws IOException {
        log.info("Reading Documents from {}", resource.toString());
        Dataset dataset = new Dataset(resource.getPath().getFileName().toString());
        Pattern pattern = Pattern.compile(filePattern);
        AtomicInteger progress = new AtomicInteger();
        ListIterator<String> labelIterator = newLabelIterator();
        // Files.walk holds open directory handles and must be closed.
        try (Stream<Path> paths = Files.walk(resource.getPath())) {
            paths.filter(path -> Files.isRegularFile(path, LinkOption.NOFOLLOW_LINKS))
                .map(Path::toString)
                .filter(path -> pattern.matcher(path).find())
                .forEach(path -> addFileWithLabel(path, dataset, labelIterator, progress));
        }
        return dataset;
    }

    /**
     * Reads the given files, in iteration order, into a dataset named {@code "MedQuAD"}.
     *
     * @param files paths of the XML files to read
     * @return the dataset that was read
     * @throws IOException declared for interface stability; read failures surface as runtime exceptions
     */
    public Dataset readDatasetFromFiles(Collection<String> files) throws IOException {
        Dataset dataset = new Dataset("MedQuAD");
        AtomicInteger progress = new AtomicInteger();
        ListIterator<String> labelIterator = newLabelIterator();
        for (String file : files) {
            addFileWithLabel(file, dataset, labelIterator, progress);
        }
        return dataset;
    }

    /** Returns an iterator over the configured labels, positioned after the header line. */
    private ListIterator<String> newLabelIterator() {
        ListIterator<String> iterator =
            this.labels != null ? this.labels.listIterator() : Collections.<String>emptyListIterator();
        if (iterator.hasNext()) {
            iterator.next(); // skip header
        }
        return iterator;
    }

    /**
     * Reads one file, feeding it the next label if one is available, and reports progress.
     * A label is only "spent" when the document is accepted; otherwise it is handed to the
     * next file. The iterator is rewound only if a label was actually consumed for this file.
     */
    private void addFileWithLabel(String file, Dataset dataset, ListIterator<String> labelIterator, AtomicInteger progress) {
        boolean labelConsumed = labelIterator.hasNext();
        String label = labelConsumed ? labelIterator.next() : null;
        boolean added = addDocumentFromFile(Resource.fromFile(file), dataset, label);
        if (!added && labelConsumed) {
            labelIterator.previous();
        }
        int count = progress.incrementAndGet();
        if (count % 1000 == 0) {
            log.debug("read {}k documents, memory usage {} GB", count / 1000, usedMemoryGb());
        }
    }

    /** Returns the currently used heap (total minus free) in GiB, truncated to one decimal. */
    private static double usedMemoryGb() {
        Runtime runtime = Runtime.getRuntime();
        double used = (runtime.totalMemory() / BYTES_PER_GB) - (runtime.freeMemory() / BYTES_PER_GB);
        return ((int) (used * 10.0d)) / 10.0d;
    }

    /**
     * Parses one MedQuAD XML file and adds the resulting document and queries to the dataset.
     *
     * @param resource the XML file; its path must look like {@code .../<number>_<subset>/<file>.xml}
     * @param dataset the dataset to add the document and its queries to
     * @param label optional tab-separated label line ({@code docId, entity name[, entity id]});
     *     when {@code null}, the entity ID is resolved through the configured mappings instead
     * @return {@code true} if the document was added; {@code false} if it was skipped because its
     *     semantic group is not {@code "Disorders"} or because it is empty and empty documents
     *     are not kept
     * @throws IllegalArgumentException if the path does not have the expected structure or the
     *     XML cannot be parsed or queried
     * @throws RuntimeException if the file cannot be read
     */
    protected boolean addDocumentFromFile(Resource resource, Dataset dataset, String label) {
        org.w3c.dom.Document xml = parseXml(resource);
        try {
            Matcher subset = SUBSET_PATTERN.matcher(resource.toString());
            if (!subset.matches()) {
                throw new IllegalArgumentException("Invalid path structure. Please use the original MedQuAD folder.");
            }
            String source = this.docSourceQuery.evaluate(xml);
            Document document = new Document();
            document.setId(source + "_" + this.docIdQuery.evaluate(xml));
            document.setSource(this.docUrlQuery.evaluate(xml));
            String focus = this.docFocusQuery.evaluate(xml);
            if (!"Disorders".equals(this.docFocusGroupQuery.evaluate(xml))) {
                return false;
            }
            document.setTitle(focus);
            // The subset name group is optional in the pattern and may therefore be absent.
            String subsetName = subset.group(2);
            document.setType(subsetName != null ? subsetName.replaceFirst("_QA", "") : "");
            document.setLanguage("en");

            String entity = focus.replace("What I need to know about ", "");
            Set<String> cuis = new TreeSet<>();
            NodeList cuiNodes = (NodeList) this.docFocusIDsQuery.evaluate(xml, XPathConstants.NODESET);
            for (int i = 0; i < cuiNodes.getLength(); i++) {
                cuis.add(cuiNodes.item(i).getTextContent());
            }

            String entityId;
            if (label != null) {
                String[] columns = label.split("\\t");
                if (!columns[0].equals(document.getId())) {
                    log.error("got wrong label for docId {}", document.getId());
                }
                if (columns.length > 1) {
                    entity = columns[1];
                } else {
                    log.error("label for docId {} has no entity column", document.getId());
                }
                entityId = columns.length > 2 ? columns[2] : "";
            } else {
                entityId = resolveEntityId(entity, cuis);
            }
            // Without label and mappings, the raw CUIs serve as the entity ID.
            String effectiveEntityId = entityId != null ? entityId : StringUtils.join(cuis, ";");

            NodeList pairs = (NodeList) this.docPassageQuery.evaluate(xml, XPathConstants.NODESET);
            for (int i = 0; i < pairs.getLength(); i++) {
                Node pair = pairs.item(i);
                String answer = this.docAnswerQuery.evaluate(pair) + "\n";
                if (this.removeQuestions) {
                    answer = LEADING_QUESTION_PATTERN.matcher(answer).replaceFirst("");
                }
                Document passage = DocumentFactory.fromText(answer.replace(" - ", "\n- "), DocumentFactory.Newlines.KEEP);
                document.append(passage);

                String passageId = source + "_" + this.docQuestionIdQuery.evaluate(pair);
                String question = DocumentFactory.createSentenceFromTokenizedString(this.docQuestionTextQuery.evaluate(pair)).toString();
                String aspect = this.docQuestionAspectQuery.evaluate(pair);
                if (source.equals("NIHSeniorHealth") && aspect.equals("support groups")) {
                    aspect = "treatment";
                } else if (source.equals("GHR") && aspect.equals("treatment")) {
                    aspect = null; // no query is created for these pairs
                }

                EntityAspectQueryAnnotation queryAnnotation = new EntityAspectQueryAnnotation(entity, aspect);
                queryAnnotation.setEntityId(effectiveEntityId);

                Query query = null;
                boolean existingQuery = false;
                if (aspect != null) {
                    // Reuse a query for the same entity/aspect so that it collects all results.
                    query = findMatchingQuery(dataset, queryAnnotation);
                    existingQuery = query != null;
                    if (!existingQuery) {
                        query = Query.create(question);
                        query.addAnnotation(queryAnnotation);
                    }
                }

                PassageAnnotation passageAnnotation = createPassageAnnotation(queryAnnotation, aspect);
                passageAnnotation.setId(passageId);
                passageAnnotation.setBegin(passage.getBegin());
                passageAnnotation.setEnd(passage.getEnd());
                if (passageAnnotation.getLength() > 1) {
                    document.addAnnotation(passageAnnotation);
                }

                if (query != null) {
                    RelevanceResult result = new RelevanceResult(Annotation.Source.GOLD, document, passage.getBegin(), passage.getEnd());
                    result.setRelevance(1);
                    result.setId(passageId);
                    result.setDocumentRef(document);
                    if (result.getLength() > 1 || this.keepEmptyDocs) {
                        query.addResult(result);
                    }
                    if (!existingQuery && query.getResults().size() > 0) {
                        dataset.addQuery(query);
                    }
                }
            }

            if (document.isEmpty() && !this.keepEmptyDocs) {
                return false;
            }
            dataset.addDocument(document);
            return true;
        } catch (XPathExpressionException e) {
            throw new IllegalArgumentException(e.toString(), e);
        }
    }

    /** Parses the resource as XML, closing the input stream afterwards. */
    private org.w3c.dom.Document parseXml(Resource resource) {
        try (InputStream in = resource.getInputStream()) {
            return this.builder.parse(in);
        } catch (SAXException e) {
            throw new IllegalArgumentException(e.toString(), e);
        } catch (IOException e) {
            log.error(e.toString());
            throw new RuntimeException(e.toString(), e);
        }
    }

    /**
     * Resolves the entity ID through the ID mapping (by CUI) and, if that yields nothing,
     * through the name mapping. Returns the IDs joined by {@code ";"} (empty if unresolved),
     * or {@code null} if no mapping is configured at all.
     */
    private String resolveEntityId(String entity, Set<String> cuis) {
        if (this.umlsMap == null && this.namesMap == null) {
            return null;
        }
        List<String> ids = new ArrayList<>();
        if (this.umlsMap != null) {
            ids = cuis.stream()
                .map(this.umlsMap::get)
                .filter(Objects::nonNull)
                .distinct()
                .collect(Collectors.toCollection(ArrayList::new));
        }
        if (ids.isEmpty() && this.namesMap != null) {
            String idByName = this.namesMap.get(entity.toLowerCase());
            if (idByName != null) {
                ids.add(idByName);
            }
        }
        if (ids.isEmpty()) {
            log.warn("could not resolve ID for '{}' ({})", entity, StringUtils.join(cuis, ";"));
        }
        return StringUtils.join(ids, ";");
    }

    /** Returns the first query in the dataset whose entity/aspect annotation matches, or {@code null}. */
    private Query findMatchingQuery(Dataset dataset, EntityAspectQueryAnnotation queryAnnotation) {
        for (Query candidate : dataset.getQueries()) {
            EntityAspectQueryAnnotation existing =
                (EntityAspectQueryAnnotation) candidate.getAnnotation(EntityAspectQueryAnnotation.class);
            if (existing != null && existing.matches(queryAnnotation)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Creates the GOLD annotation for an answer passage: an {@link EntityAspectAnnotation} if that
     * type was requested, otherwise a {@link PassageAnnotation} labeled with the capitalized aspect.
     */
    private PassageAnnotation createPassageAnnotation(EntityAspectQueryAnnotation queryAnnotation, String aspect) {
        if (this.requestedAnnotations.contains(EntityAspectAnnotation.class)) {
            EntityAspectAnnotation annotation = new EntityAspectAnnotation(Annotation.Source.GOLD);
            annotation.setAspect(queryAnnotation.getAspect());
            annotation.setEntity(queryAnnotation.getEntity());
            annotation.setEntityId(queryAnnotation.getEntityId());
            return annotation;
        }
        PassageAnnotation annotation = new PassageAnnotation(Annotation.Source.GOLD);
        if (aspect != null && !aspect.isEmpty()) {
            annotation.setLabel(aspect.substring(0, 1).toUpperCase() + aspect.substring(1));
        }
        return annotation;
    }

    /**
     * Looks up every {@code ";"}-separated ID in the name mapping and returns the values, each
     * followed by {@code ";"}. Unknown IDs, or all IDs if no name mapping was loaded, are
     * rendered as {@code "null"}. Returns an empty string for a {@code null} argument.
     */
    private String resolveNames(String entityIds) {
        StringBuilder sb = new StringBuilder();
        if (entityIds != null) {
            for (String id : entityIds.split(";")) {
                String name = this.wikidataMap != null ? this.wikidataMap.get(id) : null;
                sb.append(name).append(";");
            }
        }
        return sb.toString();
    }

    /**
     * Prints all queries of the dataset as tab-separated rows to standard output.
     *
     * @param dataset the dataset whose queries are printed
     */
    public void printQueries(Dataset dataset) {
        System.out.println("Query\tQuestion\tEntity\tQids\tResolved\tAspect");
        int index = 0;
        for (Query query : dataset.getQueries()) {
            EntityAspectQueryAnnotation annotation =
                (EntityAspectQueryAnnotation) query.getAnnotation(EntityAspectQueryAnnotation.class);
            StringBuilder sb = new StringBuilder();
            sb.append(index++).append("\t");
            sb.append(query.getText()).append("\t");
            sb.append(annotation.getEntity()).append("\t");
            sb.append(annotation.getEntityId()).append("\t");
            sb.append(resolveNames(annotation.getEntityId())).append("\t");
            sb.append(annotation.getAspect()).append("\t");
            System.out.println(sb.toString());
        }
    }

    /**
     * Prints all documents of the dataset as tab-separated rows to standard output, using the
     * first GOLD {@link EntityAspectAnnotation} of each document. Documents without such an
     * annotation are printed with empty entity columns.
     *
     * @param dataset the dataset whose documents are printed
     */
    public void printDocuments(Dataset dataset) {
        System.out.println("DocId\tEntity\tQids\tResolved");
        for (Document document : dataset.getDocuments()) {
            EntityAspectAnnotation annotation = (EntityAspectAnnotation) document
                .streamAnnotations(Annotation.Source.GOLD, EntityAspectAnnotation.class)
                .findFirst()
                .orElse(null);
            StringBuilder sb = new StringBuilder();
            sb.append(document.getId()).append("\t");
            if (annotation != null) {
                sb.append(annotation.getEntity()).append("\t");
                sb.append(annotation.getEntityId()).append("\t");
                sb.append(resolveNames(annotation.getEntityId()));
            } else {
                sb.append("\t\t");
            }
            System.out.println(sb.toString());
        }
    }
}
