package com.dataartisans.flinktraining.exercises.dataset_java.tf_idf;

import com.dataartisans.flinktraining.dataset_preparation.MBoxParser;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

/* loaded from: input_file:com/dataartisans/flinktraining/exercises/dataset_java/tf_idf/MailTFIDF.class */
public class MailTFIDF {
    /**
     * Words that are excluded from counting. {@code main} passes this array to both
     * {@code UniqueWordExtractor} and {@code TFComputer}, each of which copies it into a
     * {@code HashSet}; the entry "to" is listed twice, which therefore has no effect.
     */
    public static final String[] STOP_WORDS = {"the", "i", "a", "an", "at", "are", "am", "for", "and", "or", "is", "there", "it", "this", "that", "on", "was", "by", "of", "to", "in", "to", "message", "not", "be", "with", "you", "have", "as", "can"};

    /* loaded from: input_file:com/dataartisans/flinktraining/exercises/dataset_java/tf_idf/MailTFIDF$TFComputer.class */
    public static class TFComputer extends RichFlatMapFunction<Tuple2<String, String>, Tuple3<String, String, Integer>> {
        private Set<String> stopWords = new HashSet();
        private transient Map<String, Integer> wordCounts;
        private transient Pattern wordPattern;

        public TFComputer() {
        }

        public TFComputer(String[] strArr) {
            Collections.addAll(this.stopWords, strArr);
        }

        public void open(Configuration configuration) {
            this.wordPattern = Pattern.compile("(\\p{Alpha})+");
            this.wordCounts = new HashMap();
        }

        public void flatMap(Tuple2<String, String> tuple2, Collector<Tuple3<String, String, Integer>> collector) throws Exception {
            this.wordCounts.clear();
            StringTokenizer stringTokenizer = new StringTokenizer((String) tuple2.f1);
            while (stringTokenizer.hasMoreTokens()) {
                String lowerCase = stringTokenizer.nextToken().toLowerCase();
                if (this.wordPattern.matcher(lowerCase).matches() && !this.stopWords.contains(lowerCase)) {
                    this.wordCounts.put(lowerCase, Integer.valueOf((this.wordCounts.containsKey(lowerCase) ? this.wordCounts.get(lowerCase).intValue() : 0) + 1));
                }
            }
            for (String str : this.wordCounts.keySet()) {
                collector.collect(new Tuple3(tuple2.f0, str, this.wordCounts.get(str)));
            }
        }

        public /* bridge */ /* synthetic */ void flatMap(Object obj, Collector collector) throws Exception {
            flatMap((Tuple2<String, String>) obj, (Collector<Tuple3<String, String, Integer>>) collector);
        }
    }

    /* loaded from: input_file:com/dataartisans/flinktraining/exercises/dataset_java/tf_idf/MailTFIDF$TfIdfComputer.class */
    public static class TfIdfComputer implements JoinFunction<Tuple2<String, Integer>, Tuple3<String, String, Integer>, Tuple3<String, String, Double>> {
        private double mailCount;

        public TfIdfComputer() {
        }

        public TfIdfComputer(long j) {
            this.mailCount = j;
        }

        public Tuple3<String, String, Double> join(Tuple2<String, Integer> tuple2, Tuple3<String, String, Integer> tuple3) throws Exception {
            return new Tuple3<>(tuple3.f0, tuple3.f1, Double.valueOf(((Integer) tuple3.f2).intValue() * (this.mailCount / ((Integer) tuple2.f1).intValue())));
        }
    }

    /* loaded from: input_file:com/dataartisans/flinktraining/exercises/dataset_java/tf_idf/MailTFIDF$UniqueWordExtractor.class */
    public static class UniqueWordExtractor extends RichFlatMapFunction<Tuple2<String, String>, Tuple2<String, Integer>> {
        private Set<String> stopWords = new HashSet();
        private transient Set<String> emittedWords;
        private transient Pattern wordPattern;

        public UniqueWordExtractor() {
        }

        public UniqueWordExtractor(String[] strArr) {
            Collections.addAll(this.stopWords, strArr);
        }

        public void open(Configuration configuration) {
            this.emittedWords = new HashSet();
            this.wordPattern = Pattern.compile("(\\p{Alpha})+");
        }

        public void flatMap(Tuple2<String, String> tuple2, Collector<Tuple2<String, Integer>> collector) throws Exception {
            this.emittedWords.clear();
            StringTokenizer stringTokenizer = new StringTokenizer((String) tuple2.f1);
            while (stringTokenizer.hasMoreTokens()) {
                String lowerCase = stringTokenizer.nextToken().toLowerCase();
                if (this.wordPattern.matcher(lowerCase).matches() && !this.stopWords.contains(lowerCase) && !this.emittedWords.contains(lowerCase)) {
                    collector.collect(new Tuple2(lowerCase, 1));
                    this.emittedWords.add(lowerCase);
                }
            }
        }

        public /* bridge */ /* synthetic */ void flatMap(Object obj, Collector collector) throws Exception {
            flatMap((Tuple2<String, String>) obj, (Collector<Tuple2<String, Integer>>) collector);
        }
    }

    /**
     * Reads the input named by the required {@code input} argument as two-string records,
     * joins the per-word counts from {@code UniqueWordExtractor} with the per-record counts
     * from {@code TFComputer}, and prints the weights produced by {@code TfIdfComputer}.
     *
     * @param strArr program arguments; must contain the {@code input} parameter
     * @throws Exception if reading the arguments or running the pipeline fails
     */
    public static void main(String[] strArr) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        String inputPath = ParameterTool.fromArgs(strArr).getRequired("input");

        // The "10001" mask keeps the first and fifth field of every record.
        DataSource<Tuple2<String, String>> mails = env.readCsvFile(inputPath)
                .lineDelimiter(MBoxParser.MAIL_RECORD_DELIM)
                .fieldDelimiter(MBoxParser.MAIL_FIELD_DELIM)
                .includeFields("10001")
                .types(String.class, String.class);

        // Left side: (word, summed count). Right side: (f0, word, count), joined on the word.
        mails.flatMap(new UniqueWordExtractor(STOP_WORDS))
                .groupBy(0)
                .sum(1)
                .join(mails.flatMap(new TFComputer(STOP_WORDS)))
                .where(0)
                .equalTo(1)
                .with(new TfIdfComputer(mails.count()))
                .print();
    }
}
