Home‎ > ‎

Hadoop: How to get distinct values/lines (dedupe) for a file using Hadoop Map Reduce Framework

The following program takes a text file with a single column and writes the distinct lines of that file, each followed by the number of times it occurs, to the output directory.
 
1. Create file CalculateDistinct.java and paste the following code
 
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class CalculateDistinct {
        public static class Map extends MapReduceBase implements Mapper<LongWritable,Text,Text,IntWritable> {
                private final static IntWritable one = new IntWritable(1);
                private Text word = new Text("");
                public void map(LongWritable key, Text value, OutputCollector<Text,IntWritable> output, Reporter reporter) throws IOException {
                        word.set(value.toString());
                        output.collect(word,one);
                }
        }
        public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
                public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                        int sum = 0;
                        while (values.hasNext()) {
                                sum += 1;
                                values.next();
                        }
                        output.collect(key, new IntWritable(sum));
                }
        }

        public static void main(String[] args) throws Exception {
                JobConf conf = new JobConf(CalculateDistinct.class);
                conf.setJobName("Calculate Distinct");
                conf.setOutputKeyClass(Text.class);
                conf.setOutputValueClass(IntWritable.class);
                conf.setMapperClass(Map.class);
                conf.setReducerClass(Reduce.class);
                conf.setInputFormat(TextInputFormat.class);
                conf.setOutputFormat(TextOutputFormat.class);
                FileInputFormat.setInputPaths(conf, new Path(args[0]));
                FileOutputFormat.setOutputPath(conf, new Path(args[1]));
                JobClient.runJob(conf);
        }
}

2. Compile, create Jar and Run
 
# javac -d expects the target directory to exist (older JDKs fail otherwise), so create it first.
mkdir -p CalculateDistinct
javac -classpath hadoop-0.20.1-dev-core.jar -d CalculateDistinct/ CalculateDistinct.java
jar -cvf CalculateDistinct.jar -C CalculateDistinct/ .
# Arguments: <input file> <output directory>. The output directory must not already exist.
hadoop jar CalculateDistinct.jar org.myorg.CalculateDistinct /user/john/in/abc.txt /user/john/out
 
Comments