package org.deeplearning4j.spark.text;

import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.deeplearning4j.berkeley.Pair;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.stopwords.StopWords;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

/* loaded from: input_file:org/deeplearning4j/spark/text/TextPipeline.class */
/**
 * Runs a text corpus held in a Spark RDD through tokenization and vocabulary
 * construction, returning the first resulting vocabulary/count pair.
 *
 * <p>The minimum word frequency handed to the vocabulary function can be
 * overridden through the Spark configuration key {@link #MIN_WORDS}.
 */
public class TextPipeline {
    /**
     * Spark configuration key consulted for the minimum word frequency. When the
     * key is present in the corpus' Spark configuration its value takes
     * precedence over the value passed to the constructor.
     */
    public static final String MIN_WORDS = "org.deeplearning4j.spark.text.minwords";

    /** Minimum word frequency used when the caller does not supply one. */
    private static final int DEFAULT_MIN_WORD_FREQUENCY = 5;

    private final JavaRDD<String> corpus;
    private final List<String> stopWords;
    private final int minWordFrequency;

    /**
     * Creates a pipeline over the given corpus.
     *
     * @param corpus           the RDD of input strings to process
     * @param stopWords        the stop words broadcast to the vocabulary function
     * @param minWordFrequency fallback minimum word frequency, used only when
     *                         {@link #MIN_WORDS} is not set in the Spark configuration
     */
    public TextPipeline(JavaRDD<String> corpus, List<String> stopWords, int minWordFrequency) {
        this.corpus = corpus;
        this.stopWords = stopWords;
        // The Spark configuration wins; the constructor argument is only the default.
        this.minWordFrequency = corpus.context().conf().getInt(MIN_WORDS, minWordFrequency);
    }

    /**
     * Creates a pipeline with the default stop-word list and a fallback minimum
     * word frequency of {@code 5}.
     *
     * @param corpus the RDD of input strings to process
     */
    public TextPipeline(JavaRDD<String> corpus) {
        this(corpus, StopWords.getStopWords(), DEFAULT_MIN_WORD_FREQUENCY);
    }

    /**
     * Creates a pipeline with the default stop-word list.
     *
     * @param corpus           the RDD of input strings to process
     * @param minWordFrequency fallback minimum word frequency, used only when
     *                         {@link #MIN_WORDS} is not set in the Spark configuration
     */
    public TextPipeline(JavaRDD<String> corpus, int minWordFrequency) {
        this(corpus, StopWords.getStopWords(), minWordFrequency);
    }

    /**
     * Maps the corpus through a {@code TokenizerFunction} built from the given
     * name and then through a {@code VocabCacheFunction}, collects the results
     * and returns the first one.
     *
     * @param tokenizerFactoryClassName name passed to {@code TokenizerFunction};
     *                                  the no-argument overload passes the class
     *                                  name of {@code DefaultTokenizerFactory}
     * @return the first element of the collected results
     * @throws IndexOutOfBoundsException if the collected result list is empty
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public Pair<VocabCache, Long> process(String tokenizerFactoryClassName) {
        // The raw cast is kept deliberately: the element type produced by
        // VocabCacheFunction is declared outside this file.
        return (Pair) this.corpus
                .map(new TokenizerFunction(tokenizerFactoryClassName))
                .map(new VocabCacheFunction(
                        this.minWordFrequency,
                        new InMemoryLookupCache(),
                        // Stop words are shipped to the workers as a broadcast variable.
                        new JavaSparkContext(this.corpus.context()).broadcast(this.stopWords)))
                .cache()
                // collect() evaluates every record; only the first result is returned.
                .collect()
                .get(0);
    }

    /**
     * Runs {@link #process(String)} using the class name of
     * {@code DefaultTokenizerFactory} as the tokenizer factory name.
     *
     * @return the first element of the collected results
     * @throws IndexOutOfBoundsException if the collected result list is empty
     */
    public Pair<VocabCache, Long> process() {
        return process(DefaultTokenizerFactory.class.getName());
    }
}
