Hadoop - Join with MultipleInputs probably skips Reducer - hadoop

So, I want to perform a reduce side join with MR. (No Hive or anything, I'm experimenting on vanilla Hadoop atm).
I have 2 input files, first goes like this:
12 13
12 15
12 16
12 23
the second is simply 12 1000.
So I assign each file to a separate mapper, which tags each key/value pair with 0 or 1 depending on its source file. That works well. How can I tell?
I get the MapOutput as expected:
| key | |value|
12 0 1000
12 1 13
12 1 15
12 1 16 etc
My Partitioner partitions based on first part of key (ie 12).
The Reducer should join by key. Yet, the job seems to skip the reduce step.
I wonder if there's something wrong with my Driver?
My code (Hadoop v0.22, but same results with 0.20.2 with extra libs from the trunk):
Mappers
/**
 * Mapper for the node/degree input: re-emits each degree under a composite
 * key built from the node id and the source tag "0".
 */
public static class JoinDegreeListMapper
        extends Mapper<Text, Text, TextPair, Text> {

    public void map(Text node, Text degree, Context context)
            throws IOException, InterruptedException {
        // The second key component "0" marks the record as a degree-list entry.
        TextPair taggedNode = new TextPair(node.toString(), "0");
        context.write(taggedNode, degree);
    }
}
/**
 * Mapper for the edge-list input: re-emits the second node of each edge under
 * a composite key built from the first node and the source tag "1".
 */
public static class JoinEdgeListMapper
        extends Mapper<Text, Text, TextPair, Text> {

    public void map(Text firstNode, Text secondNode, Context context)
            throws IOException, InterruptedException {
        // The second key component "1" marks the record as an edge-list entry.
        TextPair taggedNode = new TextPair(firstNode.toString(), "1");
        context.write(taggedNode, secondNode);
    }
}
Reducer
// Reducer intended to join a node's degree with its edges: the first value of
// a group is taken as the degree and is prepended to every following value.
// NOTE(review): reduce() below takes an Iterator, while the framework's
// Reducer.reduce takes an Iterable. This method is therefore an unrelated
// overload that the framework never calls, and the identity reducer runs
// instead. It is the cause identified in the explanation that follows the
// driver code. The fix is to declare the parameter as Iterable<Text> and
// iterate over values.iterator().
public static class JoinOnFirstReducer extends
Reducer<TextPair, Text, Text, Text> {
public void reduce(TextPair key, Iterator<Text> values, Context context)
throws IOException, InterruptedException {
context.progress();
// Copy the first value; presumably the "0"-tagged degree record, which relies
// on the sort/grouping configuration in the driver -- TODO confirm.
Text nodeDegree = new Text(values.next());
while (values.hasNext()) {
Text secondNode = values.next();
// Output value is "<degree>\t<secondNode>", keyed by the first node.
Text outValue = new Text(nodeDegree.toString() + "\t"
+ secondNode.toString());
context.write(key.getFirst(), outValue);
}
}
}
Partitioner
public static class JoinOnFirstPartitioner extends
Partitioner<TextPair, Text> {
#Override
public int getPartition(TextPair key, Text Value, int numOfPartitions) {
return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numOfPartitions;
}
}
Driver
/**
 * Configures and runs the reduce-side join job.
 *
 * @param args {@code args[0]} edge list path, {@code args[1]} node/degree list
 *             path, {@code args[2]} output path
 * @return 0 if the job completed successfully, 1 otherwise
 */
public int run(String[] args) throws Exception {
    final Path edgeListPath = new Path(args[0]);
    final Path nodeListPath = new Path(args[1]);
    final Path outputPath = new Path(args[2]);

    Job job = new Job(getConf());
    job.setJobName("Tag first node with degree");
    job.setJarByClass(JoinOnFirstNode.class);

    // Map side: one mapper per input, both producing (TextPair, Text).
    MultipleInputs.addInputPath(job, edgeListPath, EdgeInputFormat.class,
            JoinEdgeListMapper.class);
    MultipleInputs.addInputPath(job, nodeListPath, EdgeInputFormat.class,
            JoinDegreeListMapper.class);
    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(Text.class);

    // Shuffle: partition and group on the first key component.
    job.setPartitionerClass(JoinOnFirstPartitioner.class);
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    //job.setSortComparatorClass(TextPair.FirstComparator.class);

    // Reduce side and final output.
    job.setReducerClass(JoinOnFirstReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}

My reduce function had Iterator<> instead of Iterable, so the job skipped to Identity Reducer.
I can't quite believe I overlooked that. Noob error.
And the answer came from this Q/A
Using Hadoop for the First Time, MapReduce Job does not run Reduce Phase

Related

Hadoop mapreduce - mapping NullPointerException

I need to write a simple map-reduce program that, given as input a directed graph represented as a list of edges, produces the same graph where each edge (x,y) with x>y is replaced by (y,x) and there are no repeated edges in the output graph.
INPUT
1;3
2;1
0;1
3;1
2;0
1;1
2;1
OUTPUT
1;3
1;2
0;1
0;2
1;1
This is the code :
public class ExamGraph {
// mapper class
public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
#Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
value = new Text( value.toString());
String[] campi = value.toString().split(";");
if (Integer.getInteger(campi[0]) > Integer.getInteger(campi[1]))
context.write(new Text(campi[1]+";"+campi[0]), NullWritable.get());
else context.write(new Text(campi[0]+";"+campi[1]), NullWritable.get());
}
}
// reducer class
public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
#Override
protected void reduce(Text key, Iterable <NullWritable> values , Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
// create new job
Job job = Job.getInstance(new Configuration());
// job is based on jar containing this class
job.setJarByClass(ExamGraph.class);
// for logging purposes
job.setJobName("ExamGraph");
// set input path in HDFS
FileInputFormat.addInputPath(job, new Path(args[0]));
// set output path in HDFS (destination must not exist)
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// set mapper and reducer classes
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
// An InputFormat for plain text files.
// Files are broken into lines. Either linefeed or carriage-return are used
// to signal end of line. Keys are the position in the file, and values
// are the line of text.
job.setInputFormatClass(TextInputFormat.class);
// set type of output keys and values for both mappers and reducers
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// start job
job.waitForCompletion(true);
}
}
When I run the jar file using :
hadoop jar path/jar JOBNAME /inputlocation /outputlocation
I got this error :
18/05/22 02:13:11 INFO mapreduce.Job: Task Id : attempt_1526979627085_0001_m_000000_1, Status : FAILED
Error: java.lang.NullPointerException
at ExamGraph$MyMapper.map(ExamGraph.java:38)
at ExamGraph$MyMapper.map(ExamGraph.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:793)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
But I did not find the error in the code.
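Found the problem: I confused Integer.getInteger() with Integer.parseInt() in the mapper. getInteger() looks up a system property with the given name and returns null when there is none, so unboxing the result for the comparison threw the NullPointerException.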

Getting the partition id of input file in Hadoop

I need to know the row index of the partitions of the input file that I'm using. I could force this in the original file by concatenating the row index to the data but I'd rather have a way of doing this in Hadoop. I have this in my mapper...
// Look up the "mapreduce.task.partition" property on the job configuration.
String id = context.getConfiguration().get("mapreduce.task.partition");
But "id" is 0 in every case. In the "Hadoop: The Definitive Guide" it mentions accessing properties like the partition id "can be accessed from the context object passed to all methods of the Mapper or Reducer". It does not, from what I can tell, actually go into how to access this information.
I went through the documentation for the Context object and it seems like the above is the way to do it and the script does compile. But since I'm getting 0 for every value, I'm not sure if I'm actually using the right thing and I'm unable to find any detail online that could help in figuring this out.
Code used to test...
/**
 * Minimal job used to inspect the value of the "mapreduce.task.partition"
 * configuration property as seen by each map call.
 */
public class Test {

    /** Emits every input line under the constant key "Test", prefixed with the property value. */
    public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String id = context.getConfiguration().get("mapreduce.task.partition");
            Text taggedLine = new Text(id + "_" + value.toString());
            context.write(new Text("Test"), taggedLine);
        }
    }

    /** Pass-through reducer: writes every value back out under its key. */
    public static class TestReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text taggedLine : values) {
                context.write(key, taggedLine);
            }
        }
    }

    /**
     * Entry point; expects exactly two arguments: input path and output path.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: Test <input path> <output path>");
            System.exit(-1);
        }

        Job job = new Job();
        job.setJobName("Test");
        job.setJarByClass(Test.class);

        job.setMapperClass(TestMapper.class);
        job.setReducerClass(TestReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
Two options are:
Use the offset instead of the row number
Track the line number in the mapper
For the first one, the key, which is a LongWritable, tells you the offset of the line being processed. Unless your lines are all exactly the same length, you won't be able to calculate the line number from an offset, but it does allow you to determine ordering, if that's useful.
The second option is to just track it in the mapper. You could change your code to something like:
/**
 * Mapper that prefixes each line with a counter kept in this mapper instance.
 * The counter starts at 0 and grows by one per map() call, so it numbers the
 * lines seen by this instance only.
 */
public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {
    private long lineCounter = 0;
    private Text outputKey = new Text("Test");

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        Text numberedLine = new Text(lineCounter + "_" + value);
        context.write(outputKey, numberedLine);
        lineCounter++;
    }
}
You could also represent your matrix as lines of tuples and include the row and column in every tuple, so that you have that information when you read the file. If you use a file of plain space- or comma-separated values that make up a 2D array, it will be extremely hard to figure out which line (row) you are currently working on in the mapper.

If 2 Mappers output the same key , what will the input to the reducer be?

I've the following doubt while learning Map reduce. It will be of great help if some one could answer.
I've two mappers working on the same file - I configured them using MultipleInputFormat
mapper 1 - Expected Output [ after extracting few columns of a file]
a - 1234
b - 3456
c - 1345
Mapper 2 Expected output [After extracting few columns of the same file]
a - Monday
b - Tuesday
c - Wednesday
And there is a reducer function that just outputs the key and value pair that it gets as input
So I expected the output to be as I know that similar keys will be shuffled to make a list.
a - [1234,Monday]
b - [3456, Tuesday]
c - [1345, Wednesday]
But I am getting some weird output. I guess only one mapper is getting run.
Should this not be expected? Will the output of each mapper be shuffled separately? Will both mappers run in parallel?
Excuse me if its a lame question Please understand that I am new to Hadoop and Map Reduce
Below is the code
//Mapper1
/**
 * Mapper 1: splits a comma-separated record and emits (column 0, column 1).
 */
public class numbermapper extends Mapper<Object, Text, Text, Text> {
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String[] columns = value.toString().split(",");
        String id = columns[0];
        String number = columns[1];
        System.out.println("***Mapper number output " + id + " " + number);
        context.write(new Text(id), new Text(number));
    }
}
//Mapper2
/**
 * Mapper 2: splits a comma-separated record and emits (column 0, column 2).
 */
public class weekmapper extends Mapper<Object, Text, Text, Text> {
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] columns = value.toString().split(",");
        String id = columns[0];
        String weekday = columns[2];
        System.out.println("***Mapper week output " + id + " " + weekday);
        context.write(new Text(id), new Text(weekday));
    }
}
//Reducer
public class rjoinreducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Text values, Context context)
throws IOException, InterruptedException {
context.write(key, values);
}
}
//Driver class
/**
 * Driver for the reduce-side join: registers both mappers through
 * MultipleInputs and removes any previous output before running the job.
 */
public class driver {
    /**
     * @param args {@code args[0]} input path (used for both mappers),
     *             {@code args[1]} output path
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "Reduce-side join");
        job.setJarByClass(numbermapper.class);

        // Both mappers are registered on the same path, exactly as in the question.
        Path sourcePath = new Path(args[0]);
        MultipleInputs.addInputPath(job, sourcePath, TextInputFormat.class, numbermapper.class);
        MultipleInputs.addInputPath(job, sourcePath, TextInputFormat.class, weekmapper.class);

        job.setReducerClass(rjoinreducer.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Clear any earlier output so the job can write to the same location.
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputPath);
        outputPath.getFileSystem(conf).delete(outputPath);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
And this is the O/P I got-
a Monday
b Tuesday
c Wednesday
Dataset used
a,1234,Monday
b,3456,Tuesday
c,1345,Wednesday
MultipleInputs was taking just one file and running one mapper on it, because I had given the same path for both mappers.
When I copied the dataset to a different file and ran the same program on the two files (same content, different file names), I got the expected output.
So I now understand that the output from different mapper functions is also combined by key, not just the output from the same mapper function.
Thanks for trying to help....!!!

why Hadoop combiner output not merged by reducer

I ran a simple wordcount MapReduce example adding combiner with a small change in combiner output, The output of combiner is not merged by reducer. scenario is as follows
Test:
Map -> Combiner ->Reducer
In the combiner I added two extra lines to output the word "different" with a count of 1; the reducer is not summing the "different" word count. The output is pasted below.
Text t = new Text("different"); // Added my own output
context.write(t, new IntWritable(1)); // Added my own output
public class wordcountcombiner extends Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
{
sum += val.get();
}
context.write(key, new IntWritable(sum));
Text t = new Text("different"); // Added my own output
context.write(t, new IntWritable(1)); // Added my own output
}
}
Input:
I ran a simple wordcount MapReduce example adding combiner with a small change in combiner output, The output of combiner is not merged by reducer. scenario is as follows
In combiner I added two extra lines to out put a word different and count 1, reducer is not suming the "different" word count. output pasted below.
Output:
"different" 1
different 1
different 1
I 2
different 1
In 1
different 1
MapReduce 1
different 1
The 1
different 1
...
How can this happen?
fullcode:
I ran the wordcount program with a combiner and, just for fun, tweaked the combiner, which is how I ran into this issue.
I have three separate classes for mapper, combiner and reducer.
Driver:
/**
 * Driver for the word-count job with separate mapper, combiner and reducer classes.
 */
public class WordCount {
    /**
     * @param args {@code args[0]} input path, {@code args[1]} output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJobName("Word Count");
        job.setJarByClass(wordcountmapper.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(wordcountmapper.class);
        job.setCombinerClass(wordcountcombiner.class);
        job.setReducerClass(wordcountreducer.class);

        // Replace the local file system implementation with the patched class named here.
        job.getConfiguration().set("fs.file.impl", "com.conga.services.hadoop.patch.HADOOP_7682.WinLocalFileSystem");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
Mapper:
public class wordcountmapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private Text word = new Text();
IntWritable one = new IntWritable(1);
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String line = value.toString();
StringTokenizer token = new StringTokenizer(line);
while (token.hasMoreTokens())
{
word.set(token.nextToken());
context.write(word, one);
}
}
}
Combiner:
public class wordcountcombiner extends Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
{
sum += val.get();
}
context.write(key, new IntWritable(sum));
Text t = new Text("different");
context.write(t, new IntWritable(1));
}
}
Reducer:
public class wordcountreducer extends Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
{
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
The output is expected, because these two lines are doing the wrong thing:
Why do you have this code?
// The two extra statements from the combiner: each reduce() call emits ("different", 1).
Text t = new Text("different"); // Added my own output
context.write(t, new IntWritable(1)); // Added my own output
In your reducer you're doing the sum and then you're adding to the output different 1 ....
You are writing in the final output of the job a new "1 different" in the reduce function, without doing any kind of aggregation. The reduce function is called once per key, as you can see in the method signature, it takes as arguments a key and the list of values for that key, which means that it is called once for each of the keys.
Since you are using as key a word, and in each call of reduce you are writing to the output "1 different", you will get one of those for each of the words in the input data.
Hadoop requires that the reduce method in the combiner writes only the same key that it receives as input. This is required because Hadoop sorts the keys only before the combiner is called; it does not re-sort them after the combiner has run. In your program, the reduce method writes the key "different" in addition to the key that it received as input. This means that the key "different" then appears in different positions in the order of keys, and these occurrences are not merged before they get passed to the reducer.
For example:
Assume the sorted list of keys output by the mapper is: "alpha", "beta", "gamma"
Your combiner is then called three times (once for "alpha", once for "beta", once for "gamma") and produces keys "alpha", "different", then keys "beta", "different", then keys "gamma", "different".
The "sorted" (but actually not sorted) list of keys after the combiner has executed is then:
"alpha", "different", "beta", "different", "gamma", "different"
This list does not get sorted again, so the different occurrences of "different" do not get merged.
The reducer is then called separately six times, and the key "different" appears 3 times in the output of the reducer.

Multiple Input Files Mapreduce Wordcount example done separately

I was going about Hadoop framework for Mapreduce model,and actually tried out basic examples like WordCount, Max_temperature so much so as to create a mapreduce task for my project .I only want to know how to process wordcount as one output file for each input file...as in let me give you an example on that :-
FILE_1 Dog Cat Dog Bull
FILE_2 Cow Ox Tiger Dog Cat
FILE_3 Dog Cow Ox Tiger Bull
should give 3 output files, 1 for each input file as follows:-
Out_1 Dog 2,Cat 1,Bull 1
Out_2 Cow 1,Ox 1,Tiger 1,Dog 1,Cat 1
Out_3 Dog 1,Cow 1,Ox 1,Tiger 1,Bull 1
I went through the answers posted here Hadoop MapReduce - one output file for each input but couldn't grasp it properly.
Help please! Thanks
Each Reducer outputs one output file.
The number of output files is dependent on number of Reducers.
(A)
Assuming you want to process all three input files in a single MapReduce Job.
At the very minimum - you must set number of Reducers equal to the Number of Output Files you want.
Since you are trying to do word-counts Per File. And not across Files.
You will have to ensure that all the file contents (of one file) are processed by a Single Reducer. Using a Custom Partitioner is one way to do this.
(B)
Another way is to simply run your MapReduce Job Three Times. Once for Each Input File. And have Reducer count as 1.
I am also a newbie in Hadoop and found this question very interesting. This is how I resolved it.
/**
 * Word count that keeps a separate result per input file: the mapper prefixes
 * each word with the path of the file it came from, and the reducer writes the
 * count through MultipleOutputs using that path as the base output path.
 */
public class Multiwordcnt {

    /** Separates the input file path from the word inside the map output key. */
    private static final char KEY_SEPARATOR = '*';

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by the input path and the output path
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Parse generic options BEFORE creating the Job: the Job takes its own
        // copy of the configuration, so options applied afterwards are ignored.
        String[] userargs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (userargs.length < 2) {
            System.err.println("Usage: Multiwordcnt <input path> <output path>");
            System.exit(2);
        }

        Job myJob = new Job(conf, "Multiwordcnt");
        myJob.setJarByClass(Multiwordcnt.class);
        myJob.setMapperClass(MyMapper.class);
        myJob.setReducerClass(MyReducer.class);
        myJob.setMapOutputKeyClass(Text.class);
        myJob.setMapOutputValueClass(IntWritable.class);
        myJob.setOutputKeyClass(Text.class);
        myJob.setOutputValueClass(IntWritable.class);
        myJob.setInputFormatClass(TextInputFormat.class);
        myJob.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(myJob, new Path(userargs[0]));
        FileOutputFormat.setOutputPath(myJob, new Path(userargs[1]));
        System.exit(myJob.waitForCompletion(true) ? 0 : 1);
    }

    /** Emits ("&lt;input file path&gt;*&lt;word&gt;", 1) for every whitespace-separated token. */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text emitkey = new Text();
        IntWritable emitvalue = new IntWritable(1);

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String filePathString = ((FileSplit) context.getInputSplit()).getPath().toString();
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                emitkey.set(filePathString + KEY_SEPARATOR + tokenizer.nextToken());
                context.write(emitkey, emitvalue);
            }
        }
    }

    /** Sums the counts of each (file, word) key and writes (word, count) under the file's path. */
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        Text emitkey = new Text();
        IntWritable emitvalue = new IntWritable();
        private MultipleOutputs<Text, IntWritable> multipleoutputs;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            multipleoutputs = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum = sum + value.get();
            }
            // Split on the FIRST separator only. A regex split over every '*'
            // truncates words that contain '*' and throws
            // ArrayIndexOutOfBoundsException for a word made only of '*'.
            String pathandword = key.toString();
            int separatorAt = pathandword.indexOf(KEY_SEPARATOR);
            String path = pathandword.substring(0, separatorAt);
            String word = pathandword.substring(separatorAt + 1);
            emitkey.set(word);
            emitvalue.set(sum);
            System.out.println("word:" + word + "\t" + "sum:" + sum + "\t" + "path: " + path);
            multipleoutputs.write(emitkey, emitvalue, path);
        }

        @Override
        public void cleanup(Context context) throws IOException, InterruptedException {
            // MultipleOutputs holds its own writers and must be closed explicitly.
            multipleoutputs.close();
        }
    }
}

Resources