package org.apache.mahout.vectorizer;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
import org.apache.mahout.vectorizer.term.TermCountCombiner;
import org.apache.mahout.vectorizer.term.TermCountMapper;
import org.apache.mahout.vectorizer.term.TermCountReducer;

/* loaded from: input_file:WEB-INF/lib/mahout-core-0.7.jar:org/apache/mahout/vectorizer/DictionaryVectorizer.class */
/**
 * Converts tokenized documents into term-frequency vectors in three stages:
 * <ol>
 *   <li>count terms (or generate n-grams via {@link CollocDriver}) to obtain the vocabulary,</li>
 *   <li>split the vocabulary into dictionary chunk files and build one set of partial
 *       vectors per chunk,</li>
 *   <li>merge the partial vectors into the final output folder and delete the partial ones.</li>
 * </ol>
 *
 * <p>NOTE(review): the only constructor is private, so the instance method
 * {@link #createVectors} cannot be reached from outside this class as written; callers
 * use the static {@link #createTermFrequencyVectors} entry point.
 */
public final class DictionaryVectorizer implements Vectorizer {
    /** Folder name for term-frequency vectors; not referenced inside this class. */
    public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
    /** Configuration key under which the minimum support is stored for the word-count job. */
    public static final String MIN_SUPPORT = "min.support";
    /** Configuration key under which the maximum n-gram size is stored for the partial-vector job. */
    public static final String MAX_NGRAMS = "max.ngrams";
    /** Minimum support used when the caller passes a negative value. */
    public static final int DEFAULT_MIN_SUPPORT = 2;
    /** File-name prefix of each dictionary chunk; the chunk index is appended. */
    private static final String DICTIONARY_FILE = "dictionary.file-";
    /** Upper bound (in megabytes) applied to the requested dictionary chunk size. */
    private static final int MAX_CHUNKSIZE = 10000;
    /** Lower bound (in megabytes) applied to the requested dictionary chunk size. */
    private static final int MIN_CHUNKSIZE = 100;
    /** Glob matching the part files produced by the vocabulary job. */
    private static final String OUTPUT_FILES_PATTERN = "part-*";
    /** Fixed per-entry byte overhead used when estimating the size of a dictionary chunk. */
    private static final int DICTIONARY_BYTE_OVERHEAD = 4;
    /** Folder-name prefix of each partial-vector output; the chunk index is appended. */
    private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
    /** Folder (under the output path) that receives the vocabulary job output. */
    private static final String DICTIONARY_JOB_FOLDER = "wordcount";

    private DictionaryVectorizer() {
    }

    /**
     * Creates term-frequency vectors using the settings carried by {@code config}.
     * Delegates to {@link #createTermFrequencyVectors}.
     *
     * @param input  path of the tokenized input documents
     * @param output base output path
     * @param config source of every remaining setting (folder name, Hadoop configuration,
     *               support, n-gram size, LLR threshold, normalization, reducers, chunk size,
     *               vector flavour)
     */
    @Override // org.apache.mahout.vectorizer.Vectorizer
    public void createVectors(Path input, Path output, VectorizerConfig config)
            throws IOException, ClassNotFoundException, InterruptedException {
        createTermFrequencyVectors(
                input,
                output,
                config.getTfDirName(),
                config.getConf(),
                config.getMinSupport(),
                config.getMaxNGramSize(),
                config.getMinLLRValue(),
                config.getNormPower(),
                config.isLogNormalize(),
                config.getNumReducers(),
                config.getChunkSizeInMegabytes(),
                config.isSequentialAccess(),
                config.isNamedVectors());
    }

    /**
     * Builds the dictionary, creates partial vectors for each dictionary chunk and merges
     * them into {@code new Path(output, tfVectorsFolderName)}. The partial-vector folders
     * are deleted after the merge.
     *
     * @param input                path of the tokenized input documents
     * @param output               base output path; the vocabulary, dictionary chunks and
     *                             vectors are all written beneath it
     * @param tfVectorsFolderName  name of the folder (under {@code output}) receiving the
     *                             merged vectors
     * @param baseConf             Hadoop configuration that every job configuration is copied from
     * @param minSupport           minimum support forwarded to the vocabulary job; a negative
     *                             value is replaced by {@link #DEFAULT_MIN_SUPPORT}
     * @param maxNGramSize         {@code 1} runs the plain word-count job; any other value
     *                             runs {@link CollocDriver#generateAllGrams}
     * @param minLLRValue          forwarded to {@code CollocDriver}; unused when
     *                             {@code maxNGramSize == 1}
     * @param normPower            {@code -1.0f} is accepted as "not specified"; otherwise must
     *                             be non-negative, and must be finite and {@code > 1} when
     *                             {@code logNormalize} is set
     * @param logNormalize         forwarded to the partial-vector merger
     * @param numReducers          number of reduce tasks for the n-gram, partial-vector and
     *                             merge jobs
     * @param chunkSizeInMegabytes requested dictionary chunk size; clamped to
     *                             [{@value #MIN_CHUNKSIZE}, {@value #MAX_CHUNKSIZE}]
     * @param sequentialAccess     forwarded to the partial-vector and merge jobs
     * @param namedVectors         forwarded to the partial-vector and merge jobs
     * @throws IllegalArgumentException if {@code normPower} violates the constraints above
     * @throws IllegalStateException    if one of the jobs started by this class fails
     */
    public static void createTermFrequencyVectors(Path input,
                                                  Path output,
                                                  String tfVectorsFolderName,
                                                  Configuration baseConf,
                                                  int minSupport,
                                                  int maxNGramSize,
                                                  float minLLRValue,
                                                  float normPower,
                                                  boolean logNormalize,
                                                  int numReducers,
                                                  int chunkSizeInMegabytes,
                                                  boolean sequentialAccess,
                                                  boolean namedVectors)
            throws IOException, InterruptedException, ClassNotFoundException {
        Preconditions.checkArgument(normPower == -1.0f || normPower >= 0.0f,
                "If specified normPower must be nonnegative", Float.valueOf(normPower));
        Preconditions.checkArgument(
                normPower == -1.0f
                        || (normPower > 1.0f && !Float.isInfinite(normPower))
                        || !logNormalize,
                "normPower must be > 1 and not infinite if log normalization is chosen",
                Float.valueOf(normPower));

        int chunkSizeMb = Math.max(MIN_CHUNKSIZE, Math.min(MAX_CHUNKSIZE, chunkSizeInMegabytes));
        int effectiveMinSupport = minSupport < 0 ? DEFAULT_MIN_SUPPORT : minSupport;

        Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);
        // Single-element array used as an out-parameter: receives the dictionary size.
        int[] maxTermDimension = new int[1];
        List<Path> dictionaryChunks;
        if (maxNGramSize == 1) {
            startWordCounting(input, dictionaryJobPath, baseConf, effectiveMinSupport);
            dictionaryChunks =
                    createDictionaryChunks(dictionaryJobPath, output, baseConf, chunkSizeMb, maxTermDimension);
        } else {
            CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize,
                    effectiveMinSupport, minLLRValue, numReducers);
            Path ngramPath = new Path(dictionaryJobPath, CollocDriver.NGRAM_OUTPUT_DIRECTORY);
            dictionaryChunks =
                    createDictionaryChunks(ngramPath, output, baseConf, chunkSizeMb, maxTermDimension);
        }

        // One partial-vector job per dictionary chunk.
        List<Path> partialVectorPaths = Lists.newArrayList();
        int partialVectorIndex = 0;
        for (Path dictionaryChunk : dictionaryChunks) {
            Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
            partialVectorPaths.add(partialVectorOutputPath);
            makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
                    maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        }

        Configuration conf = new Configuration(baseConf);
        Path outputDir = new Path(output, tfVectorsFolderName);
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, conf, normPower, logNormalize,
                maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        HadoopUtil.delete(conf, partialVectorPaths);
    }

    /**
     * Reads the vocabulary job output and writes it as a sequence of dictionary chunk files
     * mapping each term ({@link Text}) to a consecutive integer id ({@link IntWritable}).
     * A new chunk is started once the estimated size of the current one exceeds the limit.
     *
     * @param wordCountPath        folder holding the {@code part-*} files of the vocabulary job
     * @param dictionaryPathBase   folder that receives the {@code dictionary.file-N} chunks
     * @param baseConf             Hadoop configuration to copy
     * @param chunkSizeInMegabytes chunk size limit in megabytes
     * @param maxTermDimension     out-parameter; element 0 is set to the number of terms written
     * @return paths of all chunk files, in creation order (always at least one)
     * @throws IOException if reading the vocabulary or writing/closing a chunk fails
     */
    private static List<Path> createDictionaryChunks(Path wordCountPath,
                                                     Path dictionaryPathBase,
                                                     Configuration baseConf,
                                                     int chunkSizeInMegabytes,
                                                     int[] maxTermDimension) throws IOException {
        List<Path> chunkPaths = Lists.newArrayList();
        Configuration conf = new Configuration(baseConf);
        FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

        // Multiply in long arithmetic: anything from 2048 MB up overflows an int.
        long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
        int chunkIndex = 0;
        Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);

        SequenceFile.Writer dictWriter =
                new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
        // Stays true until the happy path completes, so that a close failure is only
        // swallowed when another exception is already propagating.
        boolean threw = true;
        try {
            long currentChunkSize = 0;
            int termId = 0;
            Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
            for (Pair<Writable, Writable> record
                    : new SequenceFileDirIterable<Writable, Writable>(
                            filesPattern, PathType.GLOB, null, null, true, conf)) {
                if (currentChunkSize > chunkSizeLimit) {
                    // Roll over to the next chunk; a failed close means lost entries, so report it.
                    Closeables.close(dictWriter, false);
                    chunkIndex++;
                    chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                    chunkPaths.add(chunkPath);
                    dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                    currentChunkSize = 0;
                }
                Writable key = record.getFirst();
                // Rough size estimate: fixed overhead + two bytes per character + the int id.
                int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
                currentChunkSize += fieldSize;
                dictWriter.append(key, new IntWritable(termId++));
            }
            maxTermDimension[0] = termId;
            threw = false;
        } finally {
            Closeables.close(dictWriter, threw);
        }
        return chunkPaths;
    }

    /**
     * Runs the job that builds partial vectors for the terms of one dictionary chunk.
     * The dictionary chunk is registered through {@link DistributedCache}, the job uses the
     * base {@link Mapper} class and {@link TFPartialVectorReducer}, and any existing
     * {@code output} is deleted before the job is started.
     *
     * @param input            path of the tokenized input documents
     * @param baseConf         Hadoop configuration to copy
     * @param maxNGramSize     stored under {@link #MAX_NGRAMS}
     * @param dictionaryFilePath dictionary chunk made available to the job
     * @param output           folder receiving the partial vectors
     * @param dimension        stored under {@code PartialVectorMerger.DIMENSION}
     * @param sequentialAccess stored under {@code PartialVectorMerger.SEQUENTIAL_ACCESS}
     * @param namedVectors     stored under {@code PartialVectorMerger.NAMED_VECTOR}
     * @param numReducers      number of reduce tasks
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void makePartialVectors(Path input,
                                           Configuration baseConf,
                                           int maxNGramSize,
                                           Path dictionaryFilePath,
                                           Path output,
                                           int dimension,
                                           boolean sequentialAccess,
                                           boolean namedVectors,
                                           int numReducers)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        conf.setInt(PartialVectorMerger.DIMENSION, dimension);
        conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
        conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
        conf.setInt(MAX_NGRAMS, maxNGramSize);
        DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);

        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath);
        job.setJarByClass(DictionaryVectorizer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(StringTuple.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);
        FileInputFormat.setInputPaths(job, new Path[]{input});
        FileOutputFormat.setOutputPath(job, output);

        job.setMapperClass(Mapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setReducerClass(TFPartialVectorReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(numReducers);

        // Clear any previous output so the job can write to the target folder.
        HadoopUtil.delete(conf, output);

        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Runs the word-count job ({@link TermCountMapper}, {@link TermCountCombiner},
     * {@link TermCountReducer}) over {@code input}, after deleting any existing {@code output}.
     *
     * @param input      path of the tokenized input documents
     * @param output     folder receiving the term counts
     * @param baseConf   Hadoop configuration to copy
     * @param minSupport stored under {@link #MIN_SUPPORT}
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        conf.setInt(MIN_SUPPORT, minSupport);

        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
        job.setJarByClass(DictionaryVectorizer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path[]{input});
        FileOutputFormat.setOutputPath(job, output);

        job.setMapperClass(TermCountMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setCombinerClass(TermCountCombiner.class);
        job.setReducerClass(TermCountReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // Clear any previous output so the job can write to the target folder.
        HadoopUtil.delete(conf, output);

        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }
}
