package de.l3s.icrawl.crawler;

import com.codahale.metrics.CsvReporter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.ScheduledReporter;
import com.codahale.metrics.jvm.ThreadStatesGaugeSet;
import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import de.l3s.icrawl.crawler.analysis.ResourceAnalyser;
import de.l3s.icrawl.crawler.analysis.ResourceAnalyserFactory;
import de.l3s.icrawl.crawler.io.CsvStorer;
import de.l3s.icrawl.crawler.io.ResultStorer;
import de.l3s.icrawl.crawler.io.ZipFileStorer;
import de.l3s.icrawl.crawler.scheduling.NumberOfUrlsStoppingCriterion;
import de.l3s.icrawl.crawler.ui.UiConfig;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.data.jpa.JpaRepositoriesAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.context.embedded.EmbeddedServletContainerInitializedEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Profile;
import org.springframework.context.support.PropertySourcesPlaceholderConfigurer;

@Configuration
@EnableAutoConfiguration(exclude = {HibernateJpaAutoConfiguration.class, JpaRepositoriesAutoConfiguration.class})
@Import({UiConfig.class})
/* loaded from: input_file:de/l3s/icrawl/crawler/ArchiveCrawler.class */
public class ArchiveCrawler implements ApplicationListener<EmbeddedServletContainerInitializedEvent> {
    /** Spring profile name under which {@link CsvStorerConfig} is active. */
    public static final String PROFILE_EVALUATION = "evaluation";
    /** Spring profile name under which {@link ZipFileStorerConfig} is active. */
    public static final String PROFILE_EXTRACT = "extract";
    private static final Logger logger = LoggerFactory.getLogger(ArchiveCrawler.class);

    /** Value of the {@code cdxPath} property; passed to the {@link Crawler} constructor in {@link #crawler()}. */
    @Value("${cdxPath}")
    String indexPath;

    /** Value of the {@code warcRoot} property; passed to the {@link Crawler} constructor in {@link #crawler()}. */
    @Value("${warcRoot}")
    String dataPath;

    /** Value of the {@code numThreads} property (defaults to 10); passed to the {@link Crawler} constructor. */
    @Value("${numThreads:10}")
    int numThreads;

    /** Value of the {@code timeRelevanceThreshold} property (defaults to 0.25); passed to {@code ResourceAnalyser.Factory} in {@link #raf()}. */
    @Value("${timeRelevanceThreshold:0.25}")
    float timeRelevanceThreshold;

    /** Value of the {@code docSimilarityWeight} property (defaults to 0.5); passed to {@code ResourceAnalyser.Factory} in {@link #raf()}. */
    @Value("${docSimilarityWeight:0.5}")
    float docSimilarityWeight;
    /** Port reported by the embedded servlet container; assigned in {@code onApplicationEvent}, 0 until then. */
    private int serverPort;
    // Not referenced anywhere in this file. NOTE(review): presumably the resource name of a
    // German IDF dictionary used by other classes - confirm against its users.
    public static final String IDF_DICTIONARY_DE = "dictionary-DE.tsv.gz";

    /** The profile-specific {@link StorerConfig} implementation; used in {@link #crawler()} to obtain the storer factory. */
    @Inject
    StorerConfig storerConfig;

    /** Value of the {@code logdir} property; the directory handed to the CSV metrics reporter in {@link #reporter()}. */
    @Value("${logdir}")
    File logDir;

    /**
     * Storer configuration for the evaluation profile: every result set is written
     * by a {@link CsvStorer} to its own {@code <name>.csv} file below
     * {@link #outputDirectory}.
     */
    @Profile(ArchiveCrawler.PROFILE_EVALUATION)
    @Configuration
    public static class CsvStorerConfig implements StorerConfig {

        /** Target directory, taken from the {@code outputDirectory} property. */
        @Value("${outputDirectory}")
        public String outputDirectory;

        /**
         * Builds the factory that hands out CSV storers.
         *
         * <p>If the output directory does not exist yet it is created with
         * {@code rwxrwxrwx} permissions before the factory is returned.
         *
         * @param configuration Hadoop configuration used to obtain the file system
         *                      and passed on to every {@link CsvStorer}
         * @return factory mapping a name to a storer writing {@code <name>.csv}
         * @throws IOException if the file system cannot be accessed or the
         *                     directory cannot be checked/created
         */
        @Override
        @Bean
        public ResultStorer.Factory storerFactory(org.apache.hadoop.conf.Configuration configuration) throws IOException {
            final Path outputRoot = new Path(outputDirectory);
            final FileSystem fs = FileSystem.get(configuration);
            final boolean directoryMissing = !fs.exists(outputRoot);
            if (directoryMissing) {
                fs.mkdirs(outputRoot, FsPermission.valueOf("-rwxrwxrwx"));
            }
            return name -> new CsvStorer(configuration, new Path(outputRoot, name + ".csv"));
        }
    }

    /**
     * Supplies the {@link ResultStorer.Factory} used by the crawler. Implemented in
     * this file by {@link CsvStorerConfig} and {@link ZipFileStorerConfig}, each of
     * which is bound to a different Spring profile.
     */
    public interface StorerConfig {
        /**
         * Creates the storer factory.
         *
         * @param configuration Hadoop configuration the implementation may use to reach the file system
         * @return the factory for result storers
         * @throws IOException if the implementation fails to prepare its output location
         */
        ResultStorer.Factory storerFactory(org.apache.hadoop.conf.Configuration configuration) throws IOException;
    }

    /**
     * Storer configuration for the extract profile: every result set is written by
     * a {@link ZipFileStorer} to its own {@code <name>.zip} file below
     * {@link #outputDirectory}.
     */
    @Profile(ArchiveCrawler.PROFILE_EXTRACT)
    @Configuration
    public static class ZipFileStorerConfig implements StorerConfig {

        /** Taken from the {@code maxUrls} property and passed through to each {@link ZipFileStorer}. */
        @Value("${maxUrls}")
        public int maxUrls;

        /** Target directory, taken from the {@code outputDirectory} property. */
        @Value("${outputDirectory}")
        public String outputDirectory;

        /**
         * Builds the factory that hands out ZIP storers.
         *
         * <p>The output directory is created up front. Each factory call logs the
         * requested name and opens {@code <name>.zip} with the overwrite flag set.
         *
         * @param configuration Hadoop configuration used to obtain the file system
         * @return factory mapping a name to a storer writing {@code <name>.zip}
         * @throws IOException if the file system cannot be accessed or the
         *                     directory cannot be created
         */
        @Override
        public ResultStorer.Factory storerFactory(org.apache.hadoop.conf.Configuration configuration) throws IOException {
            final Path outputRoot = new Path(outputDirectory);
            final FileSystem fs = FileSystem.get(configuration);
            fs.mkdirs(outputRoot);
            return name -> {
                logger.info("Creating new ZipFileStorer for '{}'", name);
                final Path zipFile = new Path(outputRoot, name + ".zip");
                return new ZipFileStorer(fs.create(zipFile, true), maxUrls);
            };
        }
    }

    /**
     * Registers the configurer that resolves the {@code ${...}} placeholders used
     * by the {@code @Value} annotations in this class.
     *
     * <p>Declared {@code static}, as Spring recommends for {@code @Bean} methods
     * returning bean-factory post-processors.
     *
     * @return a new placeholder configurer with default settings
     */
    @Bean
    static PropertySourcesPlaceholderConfigurer propertySourcesPlaceholderConfigurer() {
        final PropertySourcesPlaceholderConfigurer configurer = new PropertySourcesPlaceholderConfigurer();
        return configurer;
    }

    /**
     * Creates the resource analyser factory from the shared metric registry and
     * the configured time-relevance threshold and document-similarity weight.
     *
     * @return the analyser factory bean
     */
    @Bean
    ResourceAnalyserFactory raf() {
        final MetricRegistry registry = metrics();
        return new ResourceAnalyser.Factory(registry, timeRelevanceThreshold, docSimilarityWeight);
    }

    /**
     * Creates the metric registry and pre-registers the JVM thread-state gauges
     * under the name {@code threads}.
     *
     * @return the metric registry bean
     */
    @Bean
    MetricRegistry metrics() {
        final MetricRegistry registry = new MetricRegistry();
        final ThreadStatesGaugeSet threadStates = new ThreadStatesGaugeSet();
        registry.register("threads", threadStates);
        return registry;
    }

    /**
     * Creates and starts the reporter that writes all metrics to CSV files in
     * {@code logDir} once per minute (durations in milliseconds, rates per second,
     * numbers formatted with {@link Locale#ROOT}).
     *
     * <p>The log directory is created if it does not exist yet. If it cannot be
     * created, a warning is logged and the reporter is still started, so a bad
     * log directory never prevents the crawler from running.
     *
     * @return the started reporter
     */
    @Bean
    ScheduledReporter reporter() {
        // CsvReporter does not create its target directory; without this, every report
        // against a missing directory only produces a warning and no metrics are written.
        // The trailing isDirectory() covers a concurrent creation by someone else.
        if (!this.logDir.isDirectory() && !this.logDir.mkdirs() && !this.logDir.isDirectory()) {
            logger.warn("Metrics log directory '{}' does not exist and could not be created", this.logDir);
        }
        CsvReporter csvReporter = CsvReporter.forRegistry(metrics())
                .formatFor(Locale.ROOT)
                .convertDurationsTo(TimeUnit.MILLISECONDS)
                .convertRatesTo(TimeUnit.SECONDS)
                .build(this.logDir);
        csvReporter.start(1L, TimeUnit.MINUTES);
        logger.info("Started logging metrics every minute");
        return csvReporter;
    }

    /**
     * Creates the Hadoop configuration: a fresh {@link YarnConfiguration} passed
     * through {@link HBaseConfiguration#create(org.apache.hadoop.conf.Configuration)}.
     *
     * @return the Hadoop configuration bean
     */
    @Bean
    org.apache.hadoop.conf.Configuration conf() {
        final YarnConfiguration yarnConfiguration = new YarnConfiguration();
        return HBaseConfiguration.create(yarnConfiguration);
    }

    /**
     * Assembles the {@link Crawler} from the Hadoop configuration, the index and
     * data paths, the analyser factory, the profile-specific storer factory, the
     * metric registry and the configured thread count.
     *
     * @return the crawler bean
     * @throws IOException if the storer factory cannot be created
     */
    @Bean
    Crawler crawler() throws IOException {
        final org.apache.hadoop.conf.Configuration hadoopConf = conf();
        final ResourceAnalyserFactory analyserFactory = raf();
        // conf() is deliberately invoked a second time so the sequence of bean-method
        // calls is exactly what it has always been.
        final ResultStorer.Factory storerFactory = storerConfig.storerFactory(conf());
        final MetricRegistry registry = metrics();
        return new Crawler(hadoopConf, indexPath, dataPath, analyserFactory, storerFactory, registry, numThreads);
    }

    /**
     * Exposes Jackson's {@link JavaTimeModule} as a bean.
     *
     * @return a new {@code JavaTimeModule}
     */
    @Bean
    Module jsr310Module() {
        final Module javaTimeModule = new JavaTimeModule();
        return javaTimeModule;
    }

    /**
     * Remembers the port the embedded servlet container is listening on so that it
     * can later be queried through {@link #getServerPort()}.
     *
     * @param embeddedServletContainerInitializedEvent event carrying the initialized container
     */
    @Override
    public void onApplicationEvent(EmbeddedServletContainerInitializedEvent embeddedServletContainerInitializedEvent) {
        this.serverPort = embeddedServletContainerInitializedEvent.getEmbeddedServletContainer().getPort();
    }

    /**
     * Returns the port recorded by {@code onApplicationEvent}.
     *
     * @return the embedded servlet container's port, or 0 if no initialization
     *         event has been received yet
     */
    public int getServerPort() {
        return serverPort;
    }

    /**
     * Command-line entry point: starts a non-web Spring context, runs a crawl for
     * the given specification and shuts the crawler down afterwards.
     *
     * <p>Arguments: {@code specification [num_urls [weightingMethod [snapshotsToAnalyze]]]}
     * <ul>
     *   <li>{@code specification} - path of the crawl specification file (required)</li>
     *   <li>{@code num_urls} - limit for the {@link NumberOfUrlsStoppingCriterion}, default 10000</li>
     *   <li>{@code weightingMethod} - name of a {@code ResourceAnalyser.WeightingMethod}
     *       constant, default {@code CONTENT}</li>
     *   <li>{@code snapshotsToAnalyze} - integer passed as the last argument to
     *       {@code crawlContinuously}, default 10</li>
     * </ul>
     * Without arguments a usage message is printed and the JVM exits with status 1.
     *
     * @param strArr command-line arguments; also handed to {@link SpringApplication#run(String...)}
     * @throws IOException if reading the specification or crawling fails with an I/O error
     * @throws NumberFormatException if {@code num_urls} or {@code snapshotsToAnalyze} is not a number
     * @throws IllegalArgumentException if {@code weightingMethod} is not a known constant
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length < 1) {
            System.err.println("Usage: java " + ArchiveCrawler.class.getName()
                    + " specification [num_urls [weightingMethod [snapshotsToAnalyze]]]");
            System.exit(1);
        }

        // Parse the optional arguments first so that a typo fails immediately instead of
        // after the whole Spring/Hadoop context has been started.
        final long maxUrls = strArr.length >= 2 ? Long.parseLong(strArr[1]) : 10000L;
        final ResourceAnalyser.WeightingMethod weightingMethod = strArr.length >= 3
                ? ResourceAnalyser.WeightingMethod.valueOf(strArr[2])
                : ResourceAnalyser.WeightingMethod.CONTENT;
        final int snapshotsToAnalyze = strArr.length >= 4 ? Integer.parseInt(strArr[3]) : 10;

        SpringApplication springApplication = new SpringApplication(ArchiveCrawler.class);
        springApplication.setWebEnvironment(false);
        ConfigurableApplicationContext context = springApplication.run(strArr);
        Crawler crawler = context.getBean(Crawler.class);
        try {
            // -Double.MAX_VALUE is the most negative finite double (same value as before).
            // NOTE(review): presumably a lower bound that never filters anything - confirm
            // against Crawler.crawlContinuously.
            crawler.crawlContinuously(
                    ArchiveCrawlSpecification.readFile(new File(strArr[0])),
                    new NumberOfUrlsStoppingCriterion(maxUrls),
                    weightingMethod,
                    -Double.MAX_VALUE,
                    snapshotsToAnalyze);
            // Flush a final metrics snapshot instead of waiting for the next scheduled report.
            context.getBean(ScheduledReporter.class).report();
        } finally {
            // Always release the crawler, even when crawling failed.
            crawler.shutdown();
        }
    }
}
