package eu.dicodeproject.analysis.hbase;

import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.vectorizer.DefaultAnalyzer;

/* loaded from: input_file:WEB-INF/lib/integration-0.0.11.jar:eu/dicodeproject/analysis/hbase/HBaseDocumentProcessor.class */
/**
 * Utility that runs a map-only Hadoop job over an HBase table, tokenizing cell
 * values with a Lucene {@link Analyzer} and writing the resulting
 * {@link StringTuple}s to a sequence file.
 *
 * <p>The class only exposes static members and cannot be instantiated.
 */
public final class HBaseDocumentProcessor {
    /**
     * Job configuration key under which the fully-qualified class name of the
     * analyzer is passed from {@link #tokenizeDocuments} to the mapper.
     */
    public static final String ANALYZER_CLASS = "analyzer.class";

    /**
     * Mapper that tokenizes the value of every {@link KeyValue} in a scanned
     * {@link Result} and emits one (key text, token tuple) pair per cell.
     */
    private static class HBaseDocumentProcessorMapper extends TableMapper<Text, StringTuple> {
        /** Analyzer created once per task in {@link #setup}. */
        private Analyzer analyzer;

        private HBaseDocumentProcessorMapper() {
        }

        /**
         * Instantiates the analyzer named by {@link HBaseDocumentProcessor#ANALYZER_CLASS}
         * in the job configuration, falling back to {@link DefaultAnalyzer} when the key is unset.
         *
         * @param context the task context supplying the job configuration
         * @throws IllegalStateException if the analyzer class cannot be loaded or instantiated
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void setup(Mapper<ImmutableBytesWritable, Result, Text, StringTuple>.Context context) throws IOException, InterruptedException {
            super.setup(context);
            String analyzerClassName = context.getConfiguration().get(HBaseDocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName());
            try {
                this.analyzer = (Analyzer) Thread.currentThread().getContextClassLoader().loadClass(analyzerClassName).newInstance();
            } catch (ClassNotFoundException e) {
                throw new IllegalStateException("Analyzer class not found: " + analyzerClassName, e);
            } catch (IllegalAccessException e2) {
                throw new IllegalStateException("Analyzer class or its no-arg constructor is not accessible: " + analyzerClassName, e2);
            } catch (InstantiationException e3) {
                throw new IllegalStateException("Analyzer class cannot be instantiated: " + analyzerClassName, e3);
            }
        }

        /**
         * Tokenizes each cell of {@code result} and writes the tokens keyed by the cell's key text.
         * Zero-length terms are skipped; a cell that yields no tokens still produces an empty tuple.
         *
         * @param immutableBytesWritable the row key supplied by the table input format (not used here)
         * @param result the scanned row whose cells are tokenized
         * @param context the task context the output pairs are written to
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void map(ImmutableBytesWritable immutableBytesWritable, Result result, Mapper<ImmutableBytesWritable, Result, Text, StringTuple>.Context context) throws IOException, InterruptedException {
            for (KeyValue keyValue : result.list()) {
                // Decode as UTF-8 via Bytes.toString instead of the platform default charset,
                // so the output does not depend on the locale of the task node.
                // NOTE(review): getKey() appears to be the full serialized key (row, family,
                // qualifier, timestamp, type), not just the row - confirm this is the intended output key.
                String key = Bytes.toString(keyValue.getKey());
                String value = Bytes.toString(keyValue.getValue());
                TokenStream tokenStream = this.analyzer.tokenStream(key, new StringReader(value));
                StringTuple tokens = new StringTuple();
                try {
                    TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                    // Follow the TokenStream consumer workflow: reset, consume, end, close.
                    tokenStream.reset();
                    while (tokenStream.incrementToken()) {
                        if (termAttribute.termLength() > 0) {
                            tokens.add(new String(termAttribute.termBuffer(), 0, termAttribute.termLength()));
                        }
                    }
                    tokenStream.end();
                } finally {
                    tokenStream.close();
                }
                context.write(new Text(key), tokens);
            }
        }
    }

    private HBaseDocumentProcessor() {
    }

    /**
     * Runs a map-only job that scans one column of an HBase table, tokenizes every
     * cell value with the given analyzer and writes (Text, StringTuple) pairs as a
     * sequence file. Blocks until the job has finished.
     *
     * @param str name of the HBase table to scan
     * @param str2 column family to scan
     * @param str3 column qualifier to scan within the family
     * @param cls analyzer class; its name is stored under {@link #ANALYZER_CLASS} and instantiated in each map task
     * @param path output directory for the sequence file(s)
     * @throws IOException if job setup or execution fails with an I/O error
     * @throws InterruptedException if the wait for job completion is interrupted
     * @throws ClassNotFoundException if a class required by the job cannot be found
     * @throws IllegalStateException if the job completes unsuccessfully
     */
    public static void tokenizeDocuments(String str, String str2, String str3, Class<? extends Analyzer> cls, Path path) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set(ANALYZER_CLASS, cls.getName());
        Job job = new Job(configuration);
        job.setJarByClass(HBaseDocumentProcessor.class);
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(str2), Bytes.toBytes(str3));
        TableMapReduceUtil.initTableMapperJob(str, scan, HBaseDocumentProcessorMapper.class, Text.class, StringTuple.class, job);
        job.setJobName("HBaseDocumentProcessor::DocumentTokenizer");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(StringTuple.class);
        FileOutputFormat.setOutputPath(job, path);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // Map-only job: mapper output goes straight to the output format.
        job.setNumReduceTasks(0);
        // Surface a failed job instead of silently discarding the completion status.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed: " + job.getJobName());
        }
    }
}
