package de.datexis.reader;

import de.datexis.common.InternalResource;
import de.datexis.common.Resource;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.reader.DatasetReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/reader/DirectoryDatasetReader.class */
public abstract class DirectoryDatasetReader<A extends DatasetReader> implements DatasetReader {
    protected final Logger log = LoggerFactory.getLogger(DirectoryDatasetReader.class);
    protected boolean randomizeDocuments = false;
    protected long limit = -1;

    public A withRandomizedDocuments(boolean z) {
        this.randomizeDocuments = z;
        return this;
    }

    public A withLimitNumberOfDocuments(long j) {
        this.limit = j;
        return this;
    }

    @Override // de.datexis.reader.DatasetReader
    public Dataset read(Resource resource) throws IOException {
        if (!(resource instanceof InternalResource) && !resource.isFile()) {
            if (resource.isDirectory()) {
                return readDatasetFromDirectory(resource, ".+");
            }
            throw new FileNotFoundException("cannot open path: " + resource.toString());
        }
        Document readDocumentFromFile = readDocumentFromFile(resource);
        Dataset dataset = new Dataset(resource.getFileName());
        dataset.addDocument(readDocumentFromFile);
        return dataset;
    }

    public Stream<Document> stream(Resource resource) throws IOException {
        if ((resource instanceof InternalResource) || resource.isFile()) {
            return Stream.of(readDocumentFromFile(resource));
        }
        if (resource.isDirectory()) {
            return streamDocumentsFromDirectory(resource, ".+");
        }
        throw new FileNotFoundException("cannot open path: " + resource.toString());
    }

    public abstract Document readDocumentFromFile(Resource resource) throws IOException;

    public Dataset readDatasetFromDirectory(Resource resource) throws IOException {
        return readDatasetFromDirectory(resource, ".+");
    }

    public Stream<Document> streamDocumentsFromDirectory(Resource resource, String str) throws IOException {
        this.log.info("Streaming Documents from {}", resource.toString());
        Stream<Path> sorted = Files.walk(resource.getPath(), new FileVisitOption[0]).filter(path -> {
            return Files.isRegularFile(path, LinkOption.NOFOLLOW_LINKS);
        }).filter(path2 -> {
            return path2.getFileName().toString().matches(str);
        }).sorted();
        if (this.randomizeDocuments) {
            List list = (List) sorted.collect(Collectors.toList());
            Collections.shuffle(list);
            sorted = list.stream();
        }
        Stream<Document> filter = ((Stream) sorted.parallel()).flatMap(path3 -> {
            return tryReadDocumentsFromFile(Resource.fromFile(path3.toString()));
        }).filter(document -> {
            return (document != null) & (!document.isEmpty());
        });
        if (this.limit >= 0) {
            filter = filter.limit(this.limit);
        }
        return filter;
    }

    public Dataset readDatasetFromDirectory(Resource resource, String str) throws IOException {
        Dataset dataset = new Dataset(resource.getPath().getFileName().toString());
        AtomicInteger atomicInteger = new AtomicInteger();
        streamDocumentsFromDirectory(resource, str).forEach(document -> {
            long incrementAndGet = atomicInteger.incrementAndGet();
            dataset.addDocument(document);
            if (incrementAndGet % 1000 == 0) {
                double freeMemory = Runtime.getRuntime().freeMemory() / 1.073741824E9d;
                this.log.debug("read {}k documents, memory usage {} GB", Long.valueOf(incrementAndGet / 1000), Double.valueOf(((int) (((Runtime.getRuntime().totalMemory() / 1.073741824E9d) - freeMemory) * 10.0d)) / 10.0d));
            }
        });
        return dataset;
    }

    protected Stream<Document> tryReadDocumentsFromFile(Resource resource) {
        try {
            return Stream.of(readDocumentFromFile(resource));
        } catch (IOException e) {
            this.log.error(e.toString());
            throw new RuntimeException(e.toString(), e.getCause());
        }
    }
}
