package de.jungblut.nlp.mr;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import de.jungblut.nlp.Tokenizer;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/* loaded from: input_file:de/jungblut/nlp/mr/WordCountJob.class */
public class WordCountJob {
    private static final Logger LOG = LogManager.getLogger(WordCountJob.class);
    public static final String MIN_WORD_COUNT_KEY = "min.word.count";

    /* loaded from: input_file:de/jungblut/nlp/mr/WordCountJob$WordFrequencyMapper.class */
    public static class WordFrequencyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Tokenizer tokenizer;
        private final HashMultiset<String> wordCountSet = HashMultiset.create();
        private int minWordCount = 0;

        /* loaded from: input_file:de/jungblut/nlp/mr/WordCountJob$WordFrequencyMapper$TokenCounter.class */
        enum TokenCounter {
            NUM_TOKENS,
            COUNT_SUM
        }

        protected void setup(Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            Configuration configuration = context.getConfiguration();
            this.tokenizer = WordCorpusFrequencyJob.getTokenizer(configuration);
            this.minWordCount = configuration.getInt("min.word.count", this.minWordCount);
        }

        protected void map(LongWritable longWritable, Text text, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            for (String str : this.tokenizer.tokenize(text.toString())) {
                this.wordCountSet.add(str);
            }
        }

        protected void cleanup(Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            context.getCounter(TokenCounter.COUNT_SUM).increment(this.wordCountSet.size());
            Text text = new Text();
            LongWritable longWritable = new LongWritable();
            for (Multiset.Entry entry : this.wordCountSet.entrySet()) {
                if (entry.getCount() > this.minWordCount) {
                    text.set((String) entry.getElement());
                    longWritable.set(entry.getCount());
                    context.getCounter(TokenCounter.NUM_TOKENS).increment(1L);
                    context.progress();
                    context.write(text, longWritable);
                }
            }
        }

        protected /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((LongWritable) obj, (Text) obj2, (Mapper<LongWritable, Text, Text, LongWritable>.Context) context);
        }
    }

    /* loaded from: input_file:de/jungblut/nlp/mr/WordCountJob$WordFrequencyReducer.class */
    public static class WordFrequencyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private final LongWritable sumValue = new LongWritable();

        protected void reduce(Text text, Iterable<LongWritable> iterable, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            long j = 0;
            Iterator<LongWritable> it = iterable.iterator();
            while (it.hasNext()) {
                j += it.next().get();
            }
            this.sumValue.set(j);
            context.write(text, this.sumValue);
        }

        protected /* bridge */ /* synthetic */ void reduce(Object obj, Iterable iterable, Reducer.Context context) throws IOException, InterruptedException {
            reduce((Text) obj, (Iterable<LongWritable>) iterable, (Reducer<Text, LongWritable, Text, LongWritable>.Context) context);
        }
    }

    /**
     * Command-line entry point: builds the job from the two arguments and runs it to
     * completion.
     *
     * <p>Exits with status 1 when the argument count is wrong or when the job does not
     * complete successfully; returns normally on success.
     *
     * @param strArr {@code [0]} comma separated input paths, {@code [1]} output path
     * @throws Exception if creating or running the job fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 2) {
            LOG.fatal("Usage: <Comma separated input paths> <Output path>");
            System.exit(1);
        }
        Job job = createJob(strArr[0], strArr[1], new Configuration());
        // Propagate job failure to the caller instead of silently exiting with status 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Creates (but does not submit) the token frequency job: text input, text output,
     * {@link WordFrequencyMapper} as mapper, {@link WordFrequencyReducer} as both combiner
     * and reducer, and a single reduce task.
     *
     * @param str the input path(s), passed as-is to {@code FileInputFormat.setInputPaths}
     * @param str2 the output path
     * @param configuration the configuration the job is created from
     * @return the configured job
     * @throws IOException if the job cannot be created or the paths cannot be set
     */
    public static Job createJob(String str, String str2, Configuration configuration) throws IOException {
        Job job = Job.getInstance(configuration, "Token Frequency Calculator");
        // Make sure the jar holding the mapper/reducer is shipped with the job, but do not
        // override a jar the caller already configured.
        if (job.getJar() == null) {
            job.setJarByClass(WordFrequencyMapper.class);
        }

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, str);
        FileOutputFormat.setOutputPath(job, new Path(str2));

        job.setMapperClass(WordFrequencyMapper.class);
        job.setReducerClass(WordFrequencyReducer.class);
        job.setCombinerClass(WordFrequencyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);
        return job;
    }
}
