Simple MapReduce code fails with a NullPointerException - hadoop

I'm trying to run this simple MapReduce code that counts the appearances of each word in a text file (this code was given in class):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.LongWritable;
public class WordCount {
public static class MapClass extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class ReduceClass extends Reducer<Text,IntWritable,Text,IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
public static class PartitionerClass extends Partitioner<Text, IntWritable> {
@Override
public int getPartition(Text key, IntWritable value, int numPartitions) {
return getLanguage(key) % numPartitions;
}
private int getLanguage(Text key) {
if (key.getLength() > 0) {
int c = key.charAt(0);
if (c >= Long.decode("0x05D0").longValue() && c <= Long.decode("0x05EA").longValue())
return 1;
}
return 0;
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//conf.set("mapred.map.tasks","10");
//conf.set("mapred.reduce.tasks","2");
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(MapClass.class);
job.setPartitionerClass(PartitionerClass.class);
job.setCombinerClass(ReduceClass.class);
job.setReducerClass(ReduceClass.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I'm getting this weird NullPointerException and have no idea where it comes from. I included hadoop-common, hadoop-mapreduce-client-core, and hadoop-hdfs in my pom.xml dependencies.
log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" java.lang.NullPointerException
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1012)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:404)
at org.apache.hadoop.util.Shell.run(Shell.java:379)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:589)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:678)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:661)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:639)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:435)
at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:277)
at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:125)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:344)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1268)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1265)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1491)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1265)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1286)
at WordCount.main(WordCount.java:74)

Looking at your code, the only issue I can see is that you aren't setting the number of reduce tasks, even though your partitioner expects there to be two. You will be using the default of 1.
You can try setting that in your driver using:
job.setNumReduceTasks(2);

Before running the code, we should first understand the functionality and the part of the data we are trying to partition.
Partitioning the data requires the number of reducers to match the number of partitions being generated. (Set the reducer count to a higher number if you do not know how many distinct partition values will come from the file; you can then use LazyOutputFormat so that reducers with 0 records do not generate output files.)
By default, the number of reducers is 1, so setting this value should help:
job.setNumReduceTasks(integer_value);
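As a rough sketch of how those two suggestions could be combined in the driver from the question (the class name WordCountLazyOutput and the reducer count of 4 are placeholders; LazyOutputFormat only creates a part file once a reducer actually writes a record, so over-provisioned reducers leave no empty files behind):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountLazyOutput {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountLazyOutput.class);
        job.setMapperClass(WordCount.MapClass.class);            // classes from the question
        job.setPartitionerClass(WordCount.PartitionerClass.class);
        job.setCombinerClass(WordCount.ReduceClass.class);
        job.setReducerClass(WordCount.ReduceClass.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Over-provision reducers; the partitioner above only ever returns 0 or 1,
        // so reducers 2 and 3 would receive no records at all.
        job.setNumReduceTasks(4);
        // LazyOutputFormat skips part files for reducers that write nothing.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}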

Related

Facing an error when using TotalOrderPartitioner MapReduce

I have written the program below.
I have run it without TotalOrderPartitioner and it ran well, so I don't think there are any issues with the Mapper or Reducer classes as such.
But when I include the code for TotalOrderPartitioner, i.e. writing the partition file and then putting it in the DistributedCache, I get the following error. I'm really clueless about how to go about it.
[train#sandbox TOTALORDERPARTITIONER]$ hadoop jar totalorderpart.jar average.AverageJob counties totpart
//counties is input directory and totpart is output directory
16/01/18 04:14:00 INFO input.FileInputFormat: Total input paths to process : 4
16/01/18 04:14:00 INFO partition.InputSampler: Using 6 samples
16/01/18 04:14:00 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
16/01/18 04:14:00 INFO compress.CodecPool: Got brand-new compressor [.deflate]
java.io.IOException: wrong key class: org.apache.hadoop.io.LongWritable is not class org.apache.hadoop.io.Text
at org.apache.hadoop.io.SequenceFile$RecordCompressWriter.append(SequenceFile.java:1380)
at org.apache.hadoop.mapreduce.lib.partition.InputSampler.writePartitionFile(InputSampler.java:340)
at average.AverageJob.run(AverageJob.java:132)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at average.AverageJob.main(AverageJob.java:146)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
My code
package average;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class AverageJob extends Configured implements Tool {
public enum Counters {MAP, COMINE, REDUCE};
public static class AverageMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text mapOutputKey = new Text();
private Text mapOutputValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] words = StringUtils.split(value.toString(), '\\', ',');
mapOutputKey.set(words[1].trim());
StringBuilder moValue = new StringBuilder();
moValue.append(words[9].trim()).append(",1");
mapOutputValue.set(moValue.toString());
context.write(mapOutputKey, mapOutputValue);
context.getCounter(Counters.MAP).increment(1);
}
}
public static class AverageCombiner extends Reducer<Text, Text, Text, Text> {
private Text combinerOutputValue = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int count=0;
long sum=0;
for(Text value: values)
{
String[] strValues = StringUtils.split(value.toString(), ',');
sum+= Long.parseLong(strValues[0]);
count+= Integer.parseInt(strValues[1]);
}
combinerOutputValue.set(sum + "," + count);
context.write(key, combinerOutputValue);
context.getCounter(Counters.COMINE).increment(1);
}
}
public static class AverageReducer extends Reducer<Text, Text, Text, DoubleWritable> {
private DoubleWritable reduceOutputKey = new DoubleWritable();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int count=0;
double sum=0;
for(Text value: values)
{
String[] strValues = StringUtils.split(value.toString(), ',');
sum+= Double.parseDouble(strValues[0]);
count+= Integer.parseInt(strValues[1]);
}
reduceOutputKey.set(sum/count);
context.write(key, reduceOutputKey);
context.getCounter(Counters.REDUCE).increment(1);
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = Job.getInstance(conf);
job.setJarByClass(getClass());
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
job.setMapperClass(AverageMapper.class);
job.setCombinerClass(AverageCombiner.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setReducerClass(AverageReducer.class);
job.setNumReduceTasks(6);
InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.2, 6, 5);
InputSampler.writePartitionFile(job, sampler);
String partitionFile = TotalOrderPartitioner.getPartitionFile(conf);
URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
job.addCacheFile(partitionUri);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) {
int result=0;
try
{
result = ToolRunner.run(new Configuration(), new AverageJob(), args);
System.exit(result);
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
TotalOrderPartitioner does not run its sampling on the output of the Mapper, but on the input dataset. Your input format has LongWritable as the key and Text as the value, yet you are calling RandomSampler claiming that your format has Text as the key and Text as the value. This is the mismatch that InputSampler finds when it runs, hence the message
wrong key class: org.apache.hadoop.io.LongWritable is not class org.apache.hadoop.io.Text
meaning that it was expecting Text as the key (based on your parametrization) but found LongWritable instead.
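To make the type flow concrete, here is a small sketch (reusing the job variable from run(); the sampler parameters mirror the ones in the question, and this only illustrates the mismatch rather than fixing it):
// With TextInputFormat the sampler draws records typed as:
//   key   = LongWritable (the byte offset of each line)
//   value = Text         (the line itself)
InputSampler.Sampler<LongWritable, Text> sampler =
        new InputSampler.RandomSampler<LongWritable, Text>(0.2, 6, 5);

// writePartitionFile() stores the sampled keys in a SequenceFile whose key class is
// job.getMapOutputKeyClass() -- Text here -- so appending LongWritable samples fails
// with the "wrong key class" IOException from the stack trace above.
InputSampler.writePartitionFile(job, sampler);
The usual way out is to make the input key class match the map output key, for example by first running a preparatory job that writes the data as a SequenceFile keyed by the Text value you actually want to partition on, and sampling that instead.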

MapReduce type mismatch error: I am trying to write a program to find the maximum number from a CSV file but I am getting a key mismatch

A program to find the maximum of the billions of numbers present in a CSV file.
package org.devender;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SortMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
public void map(LongWritable ikey, Text ivalue, Context context)
throws IOException, InterruptedException {
String line = ivalue.toString();
String TextInt[]=line.split(",");
int MAX =0;
for (int i=0;i>TextInt.length;i++) {
int n=Integer.parseInt(TextInt[i].toString());
if (n>MAX) {
MAX = n;
}
}
Text max = new Text("Maximum");
LongWritable BIG = new LongWritable(MAX);
context.write(BIG,max);
}
}
I am getting the error below:
Error: java.io.IOException: Type mismatch in key from map: expected
org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.devender.SortMapper.map(SortMapper.java:31)
at org.devender.SortMapper.map(SortMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
Driver
This is my driver program:
package org.devender;
import org.apache.hadoop.conf.Configuration;
public class SortMapReduce {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "JobName");
job.setJarByClass(org.devender.SortMapReduce.class);
job.setMapperClass(org.devender.SortMapper.class);
job.setReducerClass(org.devender.SortReducer.class);
// TODO: specify output types
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
// TODO: specify input and output DIRECTORIES (not files)
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
if (!job.waitForCompletion(true))
return;
}
}
//Reducer - the output is coming as 0,Maximum ... 0,Maximum, but I was expecting the maximum value from the file along with the "Highest Number" tag.
------------------------------------------------------------------------
public void reduce(Iterable<LongWritable> _key, Text values, Context context)
throws IOException, InterruptedException {
// process values
LongWritable MAX = new LongWritable(0);
for (LongWritable val : _key) {
if (val.compareTo(MAX)>0) {
MAX=val;
}
}
Text t=new Text("Highest Number ");
context.write(MAX,t);
}
}
I am using LongWritable as the key, and the same type is used in the mapper arguments, but I don't know why the compiler says it expected Text. I am reading a line from the file, splitting it into separate numbers, converting each to an int, comparing it with every number in the line, and saving the result to the output context. Yet the error says Text was expected, even though I specifically declared LongWritable in the Mapper. Can someone help me resolve this error? The output is now coming as 0 Maximum, 0 Maximum, ... and so on.
What is your job configuration? Are you using job.setOutputKeyClass, job.setOutputValueClass, job.setMapOutputKeyClass, and job.setMapOutputValueClass in your code?
Can you share the entire code?
Edit 1: Here is the code; you have many issues in yours. You can learn MapReduce in detail here.
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class TestingMapReduce extends Configured implements Tool {
public static class SortMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
String line = ivalue.toString();
String TextInt[] = line.split(",");
int MAX = 0;
for (int i = 0; i < TextInt.length; i++) {
int n = Integer.parseInt(TextInt[i].toString());
if (n > MAX) {
MAX = n;
}
}
Text max = new Text("Maximum");
LongWritable BIG = new LongWritable(MAX);
context.write(max, BIG);
}
}
public static class SortReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
Long MAX = 0L;
public void reduce(Text _key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// process values
for (LongWritable val : values) {
if (val.get() > MAX) {
MAX = val.get();
}
}
context.write(_key, new LongWritable(MAX));
}
}
public int run(String[] arg0) throws Exception {
Job job = Job.getInstance(getConf());
// job.setJar("nyse-0.0.1-SNAPSHOT.jar");
job.setJarByClass(getClass());
job.setMapperClass(SortMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setReducerClass(SortReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(arg0[0]));
FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception {
System.exit(ToolRunner.run(new TestingMapReduce(), args));
}
}

Hadoop multiple input files error

I'm trying to read 2 files from HDFS input with the code below, but I get the error that follows.
I am a beginner in MapReduce programming and have been stuck on this problem for a couple of days; any help will be appreciated.
My code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class Recipe {
public static class TokenizerMapper1
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class TokenizerMapper2
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: recipe <in> <out>");
System.exit(2);
}
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Recipe");
job.setJarByClass(Recipe.class);
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
MultipleInputs.addInputPath(job,new Path(args[0]),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path(args[1]),TextInputFormat.class,TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
//FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/in"));
//FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
// job.submit();
}
And I've set the program's run configuration arguments like this:
/in /put
Error:
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 2
at Recipe.main(Recipe.java:121)
There are several issues. The program expects 3 parameters but you are passing only 2. Also, to process multiple inputs with different mappers you need to use MultipleInputs.
Assume that you invoke the program with /in1 /in2 /out:
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, TokenizerMapper2.class);
You can remove these lines from the code:
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);
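For completeness, a small sketch of how the argument handling in main() could line up with three paths (it reuses the GenericOptionsParser, conf, and job already in the question's code; the usage string is only illustrative):
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
// Two input directories plus one output directory are expected.
if (otherArgs.length != 3) {
    System.err.println("Usage: recipe <in1> <in2> <out>");
    System.exit(2);
}
MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));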
Now it works with the following modifications:
Put every file in a separate directory.
Use real addresses instead of args[], as shown below:
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in1"),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in2"),TextInputFormat.class,TokenizerMapper1.class);
FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
Specify all input and output paths in the run configuration arguments like this:
127.0.0.1:9000/in1/DNAIn.txt 127.0.0.1:9000/in2/DNAIn2.txt 127.0.0.1:9000/out

MapReduce not reducing?

I'm following the tutorial at http://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html and this is my code
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;
public class WordCount {
public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text word = new Text();
private final IntWritable one = new IntWritable(1);
@Override
public void map(Object key, Text val, Context context) throws IOException, InterruptedException {
String line = val.toString();
StringTokenizer tokenizer = new StringTokenizer(line.toLowerCase());
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> value, Context context) throws IOException, InterruptedException {
int sum = 0;
while (value.hasNext()) {
IntWritable val = (IntWritable) value.next();
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration config = new Configuration();
Job job = Job.getInstance(config, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("/user/Icarus/words.txt"));
FileOutputFormat.setOutputPath(job, new Path("/user/Icarus/words.out"));
job.waitForCompletion(true);
}
}
But when I run it, instead of calculating the word frequencies, I get this:
bye 1
goodbye 1
hadoop 1
hadoop 1
hello 1
hello 1
hello 1
world 1
I must have missed something very trivial, but I can't figure out what. Help please.
The root cause of this problem is that you are not overriding reduce() with the exact signature Hadoop requires. The signature should be as below (reference here):
protected void reduce(KEYIN key, Iterable<VALUEIN> values, org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
Since your reduce() does not match this signature, Hadoop falls back to the default identity reduce, which outputs its input unchanged.
That is why you are getting the map output repeated as the reduce output.
For this problem, I can suggest two solutions.
First: try the code below.
public static class WordCountReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
Second: the second solution is quite simple.
Instead of defining the reducer class manually, just set the reducer class to IntSumReducer or LongSumReducer, which do the same as the code above.
So don't define the WordCountReducer class, and add the following:
job.setReducerClass(LongSumReducer.class); or
job.setReducerClass(IntSumReducer.class);
based on the count type you want.
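As a rough sketch of that second option (the class name WordCountBuiltinReducer is a placeholder; it reuses the WordCountMapper from the question together with Hadoop's built-in IntSumReducer from org.apache.hadoop.mapreduce.lib.reduce):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountBuiltinReducer {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountBuiltinReducer.class);
        job.setMapperClass(WordCount.WordCountMapper.class); // mapper from the question
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // IntSumReducer sums the IntWritable values per key, so no hand-written reducer is needed.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}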
Hope it helps!

MapReduce Old API - Passing Command Line Argument to Map

I am coding a MapReduce job to find the occurrences of a search string (passed as a command line argument) in an input file stored in HDFS, using the old API.
Below is my Driver class -
public class StringSearchDriver
{
public static void main(String[] args) throws IOException
{
JobConf jc = new JobConf(StringSearchDriver.class);
jc.set("SearchWord", args[2]);
jc.setJobName("String Search");
FileInputFormat.addInputPath(jc, new Path(args[0]));
FileOutputFormat.setOutputPath(jc, new Path(args[1]));
jc.setMapperClass(StringSearchMap.class);
jc.setReducerClass(StringSearchReduce.class);
jc.setOutputKeyClass(Text.class);
jc.setOutputValueClass(IntWritable.class);
JobClient.runJob(jc);
}
}
Below is my Mapper Class -
public class StringSearchMap extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable>
{
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("SearchWord");
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split(" ");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
}
On running the job (command line string passed is "hi"), I am getting the below error -
14/09/21 22:35:41 INFO mapred.JobClient: Task Id : attempt_201409212134_0005_m_000001_2, Status : FAILED
java.lang.ClassCastException: interface javax.xml.soap.Text
at java.lang.Class.asSubclass(Class.java:3129)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:422)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:366)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Please suggest.
You auto-imported the wrong class.
Instead of org.apache.hadoop.io.Text you imported javax.xml.soap.Text.
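In other words:
// Wrong (auto-imported): this Text is a SOAP interface, not a Hadoop Writable.
// import javax.xml.soap.Text;

// Right: the Writable Text type that the job's key/value settings expect.
import org.apache.hadoop.io.Text;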
You can find a sample of this wrong import in this blog.
One more point: it is better to adopt the new API.
EDIT
I used the new API.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author Unmesha Sreeveni
* @date 23 Sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean success = job.waitForCompletion(true);
return (success ? 0 : 1);
}
}
This works.
For Text, the required Hadoop package is org.apache.hadoop.io. Check your imports:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
