Using a custom Combiner...it may be ignored? - hadoop

I have this in Main...
job.setMapperClass(AverageIntMapper.class);
job.setCombinerClass(AverageIntCombiner.class);
job.setReducerClass(AverageIntReducer.class);
The Combiner has different code, but it is being completely ignored: the output the Reducer receives is the output from the Mapper.
I understand that a Combiner may not be applied, but I thought that was only the case when the Combiner is the same as the Reducer. I don't really see the point of being able to write a custom Combiner if the framework can still skip it.
If that's not supposed to happen, what could be a reason that the Combiner is not being used?
Code...
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class AverageInt {
public static class AverageIntMapper extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String n_string = value.toString();
context.write(new Text("Value"), new Text(n_string));
}
}
public static class AverageIntCombiner extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
int count = 0;
for(IntWritable value : values) {
int temp = Integer.parseInt(value.toString());
sum += value.get();
count += 1;
}
String sum_count = Integer.toString(sum) + "," + Integer.toString(count);
context.write(key, new Text(sum_count));
}
}
public static class AverageIntReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
int total = 0;
int count = 0;
for(Text value : values) {
String temp = value.toString();
String[] split = temp.split(",");
total += Integer.parseInt(split[0]);
count += Integer.parseInt(split[1]);
}
Double average = (double)total/count;
context.write(key, new Text(average.toString()));
}
}
public static void main(String[] args) throws Exception {
if(args.length != 2) {
System.err.println("Usage: AverageInt <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(AverageInt.class);
job.setJobName("Average");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(AverageIntMapper.class);
job.setCombinerClass(AverageIntCombiner.class);
job.setReducerClass(AverageIntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

If you look at what your mapper is emitting:
public void map(LongWritable key, Text value, Context context)
It's sending two Text objects, but whilst you've declared the combiner class itself correctly, the reduce method has:
public void reduce(Text key, Iterable<IntWritable> values, Context context)
It should be:
public void reduce(Text key, Iterable<Text> values, Context context)
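Because the parameter types don't match, your reduce() never actually overrides Reducer.reduce(), so the framework runs the default identity implementation and the Mapper output passes straight through to the Reducer. A corrected version of the combiner would look roughly like this (a sketch based on your original code, not tested against your data):
public static class AverageIntCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        int count = 0;
        // the mapper emits the raw number as Text, so parse it here
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
            count += 1;
        }
        // emit "sum,count" so the reducer can finish computing the average
        context.write(key, new Text(sum + "," + count));
    }
}
Adding @Override is a cheap way to catch this class of bug: the compiler refuses to build if the method doesn't actually override anything in the superclass.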

Related

How does cleanup() method work?

I am new to Hadoop. I have this solved piece of MapReduce code which finds the parts of a country with the most 'Data Engineer' jobs for each year (for example, if the data in the format (Year, Region, Count(Jobs)) is "2016,'XYZ',35", "2016,'ABC',25" and "2015,'sdf',14", the answer would be "2016,'XYZ',35" and "2015,'sdf',14"), but I am unable to understand the following part of the reducer:
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}//Ignore this bracket for the time being.
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
This is the full code:-
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.TreeMap;
import org.apache.hadoop.mapreduce.Reducer;
public class Q_002a {
public static class Q_002a_Mapper extends
Mapper<LongWritable, Text, Text, LongWritable> {
LongWritable one = new LongWritable(1);
public void map(LongWritable key, Text values, Context context)
throws IOException, InterruptedException {
try {
if (key.get() > 0)
{
String[] token = values.toString().split("\t");
if (token[4].equals("DATA ENGINEER")) {
Text answer = new Text(token[8] + "\t" + token[7]);
context.write(answer, one);
}
}
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println(e.getMessage());
} catch (ArithmeticException e1) {
System.out.println(e1.getMessage());
}
}
}
public static class Q_002a_Partitioner extends Partitioner<Text, LongWritable> {
@Override
public int getPartition(Text key, LongWritable value, int numReduceTasks) {
String[] str = key.toString().split("\t");
if (str[1].equals("2011"))
return 0;
if (str[1].equals("2012"))
return 1;
if (str[1].equals("2013"))
return 2;
if (str[1].equals("2014"))
return 3;
if (str[1].equals("2015"))
return 4;
if (str[1].equals("2016"))
return 5;
else
return 6;
}
}
public static class Q_002a_Reducer extends
Reducer<Text, LongWritable, NullWritable, Text> {
private TreeMap<LongWritable, Text> Top5DataEngineer = new TreeMap<LongWritable, Text>();
long sum = 0;
public void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
sum = 0;
for (LongWritable val : values) {
sum += val.get();
}
Top5DataEngineer.put(new LongWritable(sum), new Text(key + ","
+ sum));
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
}
public static void main(String args[]) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Top 5 Data Engineer in a worksite");
job.setJarByClass(Q_002a.class);
job.setMapperClass(Q_002a_Mapper.class);
job.setPartitionerClass(Q_002a_Partitioner.class);
job.setReducerClass(Q_002a_Reducer.class);
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
This is the output I am getting:-
EDIT: I tried moving the code from the cleanup() method into the reduce() method, but it did not work as expected; it only ran fine inside cleanup(). Any help regarding this would be appreciated.
The cleanup() method is called after the processing stage is complete, and it is called only once per reduce task.
In your example the reduce() method is searching for the biggest count of Data Engineer jobs within each 'year' partition. The Top5DataEngineer TreeMap keeps its keys in ascending order, and after every insert it removes the first (smallest) key whenever it holds more than one entry. In other words, once all of the Iterable<LongWritable> values have been processed, only the location with the biggest number of jobs in that 'year' partition is left in the map.
When the reduce phase is finished, cleanup() simply writes out the result for that partition (the single biggest key-value pair remaining in the Top5DataEngineer map).
Since each 'year' partition is handled by its own reduce task, cleanup() effectively runs once per 'year' partition.
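For context, the reason cleanup() fires exactly once at the very end is visible in the default Reducer.run() loop that drives every reduce task. Roughly (simplified from the Hadoop source; newer versions wrap the loop in a try/finally):
public void run(Context context) throws IOException, InterruptedException {
    setup(context);                 // once, before the first key
    while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);  // once per key
    }
    cleanup(context);               // once, after the last key
}
Any state accumulated across keys (like the Top5DataEngineer map) can therefore only be emitted safely in cleanup(), which is why moving that code into reduce() did not behave as expected.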
Hope this helps.

MapReduce not reducing?

I'm following the tutorial at http://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html and this is my code
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;
public class WordCount {
public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text word = new Text();
private final IntWritable one = new IntWritable(1);
@Override
public void map(Object key, Text val, Context context) throws IOException, InterruptedException {
String line = val.toString();
StringTokenizer tokenizer = new StringTokenizer(line.toLowerCase());
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> value, Context context) throws IOException, InterruptedException {
int sum = 0;
while (value.hasNext()) {
IntWritable val = (IntWritable) value.next();
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration config = new Configuration();
Job job = Job.getInstance(config, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("/user/Icarus/words.txt"));
FileOutputFormat.setOutputPath(job, new Path("/user/Icarus/words.out"));
job.waitForCompletion(true);
}
}
But when I run it, instead of calculating the word frequency, I get this:
bye 1
goodbye 1
hadoop 1
hadoop 1
hello 1
hello 1
hello 1
world 1
I must have missed something trivial, but I can't figure out what. Help please.
The root cause of this problem is that your reduce() does not have the exact signature Hadoop expects to call. The signature should be as below (reference here):
protected void reduce(KEYIN key, Iterable<VALUEIN> values, org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
Since your reduce() takes an Iterator instead of an Iterable, it never overrides the base method, and Hadoop falls back to the default identity reduce, which writes its input out unchanged.
That is why you are getting the Map output back as the Reduce output.
For this problem I can suggest two solutions.
First: try the code below.
public static class WordCountReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
Second: the second solution is quite simple.
Instead of defining your own reducer class, just set the reducer class to IntSumReducer or LongSumReducer, which do the same thing as the code above.
So don't define the WordCountReducer class, and add the following:
job.setReducerClass(LongSumReducer.class); or
job.setReducerClass(IntSumReducer.class);
depending on the count type you want.
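For the second option, the stock sum reducers live in the lib.reduce package. A sketch of the relevant lines (assuming the rest of main() stays exactly as in the question):
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

// in main(), instead of registering WordCountReducer
job.setCombinerClass(IntSumReducer.class);  // summing is associative, so it also works as a combiner
job.setReducerClass(IntSumReducer.class);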
Hope it helps!

comparing two text files using hadoop map reduce

I want to compare two text files line by line to find whether they are equal or not. How can I do it using hadoop map reduce programming?
static int i=0;
public void map(LongWritable key, String value, OutputCollector<String,IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
i++; //used as a line number
output.collect(line, new IntWritable(i));
}
I tried to map each line with its line number, but how can I reduce it and compare it with the other file?
Comparing two text files is equivalent to joining the two files in MapReduce. To join two text files you use two mappers that emit the same keys; in your case the key can be the line offset and the value the line itself. MultipleInputs.addInputPath() is used to attach different mappers to different input files.
Please find below a detailed program for comparing two text files in MapReduce using Java.
The arguments to the program are file 1, file 2, and the output directory.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CompareTwoFiles {
public static class Map extends
Mapper<LongWritable, Text, LongWritable, Text> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
public static class Map2 extends
Mapper<LongWritable, Text, LongWritable, Text> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
public static class Reduce extends
Reducer<LongWritable, Text, LongWritable, Text> {
@Override
public void reduce(LongWritable key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
String[] lines = new String[2];
int i = 0;
for (Text text : values) {
lines[i] = text.toString();
i++;
}
if (lines[0].equals(lines[1])) {
context.write(key, new Text("same"));
} else {
context.write(key,
new Text(lines[0] + " vs " + lines[1]));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://localhost:8020");
Job job = new Job(conf);
job.setJarByClass(CompareTwoFiles.class);
job.setJobName("Compare Two Files and Identify the Difference");
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, new Path(args[0]),
TextInputFormat.class, Map.class);
MultipleInputs.addInputPath(job, new Path(args[1]),
TextInputFormat.class, Map2.class);
job.waitForCompletion(true);
}
}
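Assuming the class is packaged into a jar (the jar name and paths below are illustrative, not from the original post), the job could be launched like:
hadoop jar CompareTwoFiles.jar CompareTwoFiles /user/you/file1.txt /user/you/file2.txt /user/you/compare_out
Note that this approach joins lines by their byte offset, so it assumes both files have identical line boundaries; if a given offset appears in only one file, the reducer above would need a null check before comparing.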

MaxTemperature example for MapReduce on Hadoop

I am trying to run the MaxTemperature example from MapReduce, but I could not find MaxTemperature.jar among the Hadoop MapReduce examples. Can someone help me find the jar file, or show how to run this program and see the output?
try this,
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Temp {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // year is in the first four characters of the line
        String year = line.substring(0, 4);
        // temperature is a two-digit value in characters 6-7 of this fixed-width format
        int temp = Integer.parseInt(line.substring(6, 8));
        context.write(new Text(year), new IntWritable(temp));
    }
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable value : values) {
            maxValue = Math.max(maxValue, value.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}
public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "Temp");
    job.setJarByClass(Temp.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Build a jar from this program and run it with an HDFS input file and a not-yet-existing HDFS output directory:
hadoop jar Temp.jar Temp /hdfs_inputFile /hdfs_outputDir
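The substring offsets in the mapper assume a fixed-width input line, with the year in characters 0-3 and a two-digit temperature in characters 6-7. Purely as an illustration (this sample data is invented, not from the original post), input like
1950  38
1950  43
1951  21
would produce one output line per year containing that year's maximum temperature.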

I have 10 files with CSV and TSV data. I want output that tells me which data is CSV and which is TSV, using MapReduce in Apache Hadoop

One file contains data like this:
robert 10,20,30
john 10,30,20
Another file contains data like this:
surya 10|20|30
sumanth 30|40|10
Across 10 files like this, I want to work out which data is comma-separated and which is pipe-separated, using MapReduce.
Here's the code to replace the comma delimiter with a pipe and combine all the lists for the same name into one:
package my.reader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class ReadRows {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
if (res[1].contains(",")) {
res[1] = res[1].replace(',','|');
}
context.write(new Text(res[0]), new Text(res[1]));
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
String res = "";
for(Text val : values) {
res += "|" + val.toString();
}
context.write(key, new Text(res.substring(1)));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 2) {
System.err.println("Usage: my.reader.ReadRows <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "ReadRows");
job.setJarByClass(ReadRows.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And here's the code just to parse them and calculate max:
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
String[] sal;
if (res[1].contains(",")) {
sal = res[1].split(",");
} else {
sal = res[1].split("\\|");
}
Integer maxSal = 0;
for ( String s : sal ) {
maxSal = Math.max(Integer.valueOf(s), maxSal);
}
context.write(new Text(res[0]), new IntWritable(maxSal));
}
}
public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
Integer maxSal = 0;
for(IntWritable val : values) {
maxSal = Math.max(val.get(), maxSal);
}
context.write(key, new IntWritable(maxSal));
}
}
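To run the max version, the ReadRows driver above only needs its value classes switched from Text to IntWritable; a minimal sketch of the changed lines (assuming the rest of main() stays the same and org.apache.hadoop.io.IntWritable is imported):
job.setMapOutputValueClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
Everything else (mapper/reducer registration, input and output paths) carries over unchanged.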
