Error Using InverseMapper and IdentityReducer while Executing MapReduce - hadoop

So I have a huge access log file and I am trying to find the path on the server that is hit the most. It is a traditional word count problem: count the number of times each path is hit.
But, since a MR job sorts only the keys and not the values, I am running a second MR job whose mapper takes the output of the first job as input. I use InverseMapper.java to swap the keys and values, and the identity reducer (Reducer.java) because no aggregation is needed and I just need the keys (i.e., the values of the first job) sorted. Here is my code:
package edu.pitt.cloud.CloudProject;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable.DecreasingComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class AccessLogMostHitPath {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        String configPath = "/usr/local/hadoop-2.7.3/etc/hadoop/";
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        Path finalOutputPath = new Path(args[2]);

        Configuration config = new Configuration(true);
        config.addResource(new Path(configPath + "hdfs-site.xml"));
        config.addResource(new Path(configPath + "core-site.xml"));
        config.addResource(new Path(configPath + "yarn-site.xml"));
        config.addResource(new Path(configPath + "mapred-site.xml"));

        Job job = Job.getInstance(config, "AccessLogMostHitPath");
        job.setJarByClass(AccessLogMostHitPath.class);
        job.setMapperClass(AccessLogMostHitPathMapper.class);
        job.setReducerClass(AccessLogMostHitPathReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        config.set("mapreduce.job.running.map.limit", "2");

        FileInputFormat.addInputPath(job, inputPath);
        job.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(1);

        System.out.println("Starting Job Execution ::: AccessLogMostHitPath");
        int code = job.waitForCompletion(true) ? 0 : 1;
        System.out.println("Job Execution Finished ::: AccessLogMostHitPath");

        if (code != 0) {
            System.out.println("First Job Failed");
            System.exit(code);
        }

        FileSystem hdfs = FileSystem.get(config);
        Path successPath = new Path(outputPath + "/_SUCCESS");
        if (hdfs.exists(successPath))
            hdfs.delete(successPath, true);

        Job job2 = Job.getInstance(config, "AccessLogMostHitPathSort");
        job2.setJarByClass(AccessLogMostHitPath.class);
        job2.setMapperClass(InverseMapper.class);
        job2.setReducerClass(Reducer.class);
        //config.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\\t");
        KeyValueTextInputFormat.addInputPath(job2, outputPath);
        job2.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job2, finalOutputPath);
        job2.setOutputFormatClass(TextOutputFormat.class);
        job2.setNumReduceTasks(1);
        job2.setMapOutputKeyClass(IntWritable.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setSortComparatorClass(DecreasingComparator.class);
        job2.setOutputKeyClass(IntWritable.class);
        job2.setOutputValueClass(Text.class);
        config.set("mapreduce.job.running.map.limit", "2");

        System.out.println("Starting Job Execution ::: AccessLogMostHitPathSort");
        int code2 = job2.waitForCompletion(true) ? 0 : 1;
        System.out.println("Job Execution Finished ::: AccessLogMostHitPathSort");
        System.exit(code2);
    }
}
I get the below exception when I execute this:
Error: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.IntWritable, received org.apache.hadoop.io.Text
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.lib.map.InverseMapper.map(InverseMapper.java:36)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Where is this going wrong? I can see that somewhere there is a mismatch in the key or value type, but I have cross-checked everything. Please help.

The problem is KeyValueTextInputFormat. It is a text input format: it reads the key as Text and the value as Text. But you declared the map output types of job2 as IntWritable and Text:
job2.setMapOutputKeyClass(IntWritable.class);
job2.setMapOutputValueClass(Text.class);
So you have to provide your own input format that reads the input with the correct types.
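An alternative to writing a custom input format (this is just a sketch, not part of the original answer) is to keep KeyValueTextInputFormat and replace InverseMapper with a small mapper that parses the count out of the Text value before inverting. It assumes the first job's output lines look like path<TAB>count, which is what TextOutputFormat writes by default:
// Needs: import org.apache.hadoop.mapreduce.Mapper;
public static class ParseAndInvertMapper extends Mapper<Text, Text, IntWritable, Text> {
    private final IntWritable count = new IntWritable();
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // key = path, value = count as text; emit (count, path) so the shuffle sorts by count
        count.set(Integer.parseInt(value.toString().trim()));
        context.write(count, key);
    }
}
job2 would then use job2.setMapperClass(ParseAndInvertMapper.class). Note as well that DecreasingComparator in the driver comes from LongWritable while the map output key is IntWritable, so the sort comparator may also need to be switched to one that matches the key type.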

Related

MapReduce set to give multiple output files but I only get 1 for the first job

I have three mappers and three reducers that I would like to run. You'll notice in the code that job2 requires the output of job1 as its input. When I run my code I only get one output file, for the first job. The other two jobs seem to be ignored. How can I get all three jobs to run?
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class RunMapReduceJob {
public static void main(String[] args) throws Exception {
new RunMapReduceJob().run(args);
}
public void run(String[] args) throws Exception {
//job 1
Configuration conf = new Configuration();
Job job1 = Job.getInstance(conf, "hourly");
job1.setJarByClass(RunMapReduceJob.class);
job1.setMapperClass(MaxConsumptionMapper.class);
job1.setReducerClass(MaxConsumptionReducer.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1])); //output path 1
job1.waitForCompletion(true);
//job 2
Configuration conf2 = new Configuration();
Job job2 = Job.getInstance(conf2, "max hourly");
job2.setJarByClass(RunMapReduceJob.class);
job2.setMapperClass(MaxConsumptionMapper2.class);
job2.setReducerClass(MaxConsumptionReducer2.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job2, new Path(args[1])); //input file is output path 1
FileOutputFormat.setOutputPath(job2, new Path(args[2])); //output path 2
job2.waitForCompletion(true);
//job 3
Configuration conf3 = new Configuration();
Job job3 = Job.getInstance(conf3, "Avg Daily Consumption");
job3.setJarByClass(RunMapReduceJob.class);
job3.setMapperClass(AvgConsumptionMapper.class);
job3.setReducerClass(AvgComsumptionReducer.class);
job3.setOutputKeyClass(Text.class);
job3.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job3, new Path(args[0])); //original input
FileOutputFormat.setOutputPath(job3, new Path(args[3])); //output path 3
System.exit(job3.waitForCompletion(true) ? 0 : 1);
}
}
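Not an answer as such, but for comparison: the driver in the question at the top of this page checks each job's exit status before launching the next one, which makes a failed or skipped job visible immediately. A minimal sketch of that guard applied here (class and job names reused from the question above, purely illustrative):
// Fail fast if job1 did not succeed before wiring job2 to its output
if (!job1.waitForCompletion(true)) {
    System.err.println("job1 (hourly) failed; not starting job2 or job3");
    System.exit(1);
}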

Hadoop WordCount Tutorial java.lang.ClassNotFoundException

I'm relatively new to Hadoop and I'm struggling a little bit to understand the ClassNotFoundException I get when trying to run the job. I'm using the standard tutorial found here, and here is my WordCount class (running on Ubuntu 16.04, Hadoop 2.7.3, distributed cluster mode):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
To try to stay organized, I added a couple of paths to my ~/.bashrc file:
hduser#mynode:~$ cd $HADOOP_CODE
hduser#mynode:/usr/local/hadoop/code$
This is one directory down from the $HADOOP_HOME directory. To compile the WordCount.java file, I ran:
hduser#mynode:/usr/local/hadoop$ hadoop com.sun.tools.javac.Main $HADOOP_CODE/WordCount.java
hduser#mynode:/usr/local/hadoop$ jar cf wc.jar $HADOOP_CODE/WordCount*.class
I then tried:
hduser#mynode:/usr/local/hadoop$ hadoop jar $HADOOP_CODE/wc.jar $HADOOP_CODE/WordCount /home/hduser/input /home/hduser/output/wordcount
which bombed with the following error:
Exception in thread "main" java.lang.ClassNotFoundException: /usr/local/hadoop/code/WordCount
EDIT
This gave me the same error:
hduser#mynode:/usr/local/hadoop/code$ hadoop jar $HADOOP_CODE/wc.jar WordCount /home/hduser/input /home/hduser/output/wordcount
To get it to run without error, I moved the WordCount.java file up one directory to the default Hadoop ($HADOOP_HOME) folder. I also know from here and here that the solution is to add a package to the file.
What I'm trying to understand is why that is the solution. With no package name, where is Hadoop looking for the specified class, and why can't I pass it a full path to get it to run correctly? This may be a basic Java question (apologies, I'm from the Python world), but what does the package name do during compilation that lets me run the job without a path name, while leaving the package name off means the class has to sit in that default directory? I'd prefer not to have to add a package name to every job I run. An explanation would be greatly appreciated!
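For reference, a minimal sketch of what the package-based fix looks like in practice (the package name mywordcount below is hypothetical): declare the package as the first line of the source file, recompile and rebuild the jar so the class file lands under a matching directory, then pass the fully qualified name to hadoop jar.
// First line of WordCount.java; "mywordcount" is a placeholder package name
package mywordcount;
// After recompiling and re-creating wc.jar (the class file now lives under mywordcount/):
// hadoop jar wc.jar mywordcount.WordCount /home/hduser/input /home/hduser/output/wordcount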

Running a MapReduce job: no output at all. It doesn't even run. Very weird. No error thrown on the terminal

I compiled the MapReduce code (driver, mapper, and reducer classes) and created the JAR files. When I run it on the dataset, it doesn't seem to run; it just comes back to the prompt, as shown in the image. Any suggestions, folks?
thanks much
basam
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
//This driver program will bring all the information needed to submit this Map reduce job.
public class MultiLangDictionary {
public static void main(String[] args) throws Exception{
if (args.length !=2){
System.err.println("Usage: MultiLangDictionary <input path> <output path>");
System.exit(-1);
}
Configuration conf = new Configuration();
Job ajob = new Job(conf, "MultiLangDictionary");
//Assigning the driver class name
ajob.setJarByClass(MultiLangDictionary.class);
FileInputFormat.addInputPath(ajob, new Path(args[0]));
//first argument is the job itself
//second argument is the location of the output dataset
FileOutputFormat.setOutputPath(ajob, new Path(args[1]));
ajob.setInputFormatClass(TextInputFormat.class);
ajob.setOutputFormatClass(TextOutputFormat.class);
//Defining the mapper class name
ajob.setMapperClass(MultiLangDictionaryMapper.class);
//Defining the Reducer class name
ajob.setReducerClass(MultiLangDictionaryReducer.class);
//setting the second argument as a path in a path variable
Path outputPath = new Path(args[1]);
//deleting the output path automatically from hdfs so that we don't have delete it explicitly
outputPath.getFileSystem(conf).delete(outputPath);
}
}
Try it with the fully qualified class name (packagename.classname) in the command:
hadoop jar MultiLangDictionary.jar [yourpackagename].MultiLangDictionary input output
You could try adding the map and reduce output key and value types to your driver, something like this (it is only an example):
job2.setMapOutputKeyClass(Text.class);
job2.setMapOutputValueClass(Text.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(Text.class);
In the above, both the mapper and the reducer would be writing (Text, Text) in their context.write() calls.

Setting number of Reduce tasks using command line

I am a beginner in Hadoop. When I try to set the number of reducers on the command line using GenericOptionsParser, the number of reducers does not change. There is no property for the number of reducers set in the configuration file "mapred-site.xml", which I think makes the number of reducers 1 by default. I am using the Cloudera QuickVM and Hadoop version "Hadoop 2.5.0-cdh5.2.0".
Pointers appreciated. I also want to know the order of precedence among the following ways to set the number of reducers:
1. Using the configuration file "mapred-site.xml":
mapred.reduce.tasks
2. By specifying it in the driver class:
job.setNumReduceTasks(4)
3. By specifying it at the command line using the Tool interface:
-Dmapreduce.job.reduces=2
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
//Split the line into words
for(String word: line.split("\\W+"))
{
//Make sure that the word is legitimate
if(word.length() > 0)
{
//Emit the word as you see it
context.write(new Text(word), new IntWritable(1));
}
}
}
}
Reducer :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
//Initializing the word count to 0 for every key
int count=0;
for(IntWritable value: values)
{
//Adding the word count counter to count
count += value.get();
}
//Finally write the word and its count
context.write(key, new IntWritable(count));
}
}
Driver :
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
//Instantiate the job object for configuring your job
Job job = new Job();
//Specify the class that hadoop needs to look in the JAR file
//This Jar file is then sent to all the machines in the cluster
job.setJarByClass(WordCount.class);
//Set a meaningful name to the job
job.setJobName("Word Count");
//Add the apth from where the file input is to be taken
FileInputFormat.addInputPath(job, new Path(args[0]));
//Set the path where the output must be stored
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//Set the Mapper and the Reducer class
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//Set the type of the key and value of Mapper and reducer
/*
* If the Mapper output type and Reducer output type are not the same then
* also include setMapOutputKeyClass() and setMapOutputKeyValue()
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(4);
//Start the job and wait for it to finish. And exit the program based on
//the success of the program
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}
And I have tried the following commands to run the job:
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -Dmapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take13
and
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -D mapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take14
Answering your query on the order: it would always be 2 > 3 > 1.
The option specified in your driver class takes precedence over the one you pass as an argument to GenericOptionsParser and over the one you set in your site-specific config.
I would recommend debugging the configuration inside your driver class by printing it out before you submit the job. That way you can be sure what the configuration is right before the job goes to the cluster.
Configuration conf = getConf(); // This is available to you since you extended Configured
for (java.util.Map.Entry<String, String> entry : conf) {
    // Sysout the entries here
    System.out.println(entry.getKey() + " = " + entry.getValue());
}
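One more point, offered as an observation rather than as part of the answer above: for -Dmapreduce.job.reduces=2 to reach the job at all, the Job also needs to be built from that same configuration. The driver shown here creates it with new Job(), which starts from a fresh Configuration, so the values parsed by GenericOptionsParser would not make it in. A minimal sketch inside run():
// Build the job from the configuration that ToolRunner/GenericOptionsParser populated
Job job = Job.getInstance(getConf(), "Word Count");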

How do I convert my Java Hadoop code to run on EC2?

I wrote a Driver, Mapper, and Reducer class in Java that runs the k-nearest neighbor algorithm on test data, and pulls in the training set using Distributed Cache. I used a Cloudera virtual machine to test the code, and it works in pseudo-distributed mode.
I'm trying to get through Amazon's EC2/EMR documentation ... it seems like there should be a way to easily convert working Java Hadoop code into something that will work in EC2, but I'm seeing a whole bunch of custom Amazon import statements and methods that I've never seen before.
Here's my driver code for an example:
import java.net.URI;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KNNDriverEC2 extends Configured implements Tool {
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.setInt("rows",1000);
conf.setInt("columns",613);
DistributedCache.createSymlink(conf);
// might have to start next line with ./!!!
DistributedCache.addCacheFile(new URI("knn-jg/cache_data/train_sample.csv#train_sample.csv"),conf);
DistributedCache.addCacheFile(new URI("knn-jg/cache_data/train_labels.csv#train_labels.csv"),conf);
//DistributedCache.addCacheFile(new URI("cacheData/train_sample.csv"),conf);
//DistributedCache.addCacheFile(new URI("cacheData/train_labels.csv"),conf);
Job job = new Job(conf);
job.setJarByClass(KNNDriverEC2.class);
job.setJobName("KNN");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(KNNMapperEC2.class);
job.setReducerClass(KNNReducerEC2.class);
// job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new KNNDriverEC2(), args);
System.exit(exitCode);
}
}
I've gotten my instance running, but an exception is thrown at the line "FileInputFormat.setInputPaths(job, new Path(args[0]));". I'm going to try to work through the documentation on handling arguments, but I've run into so many errors so far that I'm wondering whether I'm far from making this work. Any help appreciated.
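Not a diagnosis of the EMR problem, but a cheap way to see what is actually reaching the driver is to guard and print the arguments before touching args[0], the way the MultiLangDictionary driver further up this page does. A minimal sketch inside run():
// Print and validate the arguments before FileInputFormat.setInputPaths(...)
if (args.length < 2) {
    System.err.println("Expected <input path> <output path>, got: "
            + java.util.Arrays.toString(args));
    return 1;
}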

Resources