I'm trying to run the basic word-count MapReduce example using OutputCollector, but I'm getting exceptions.
INFO mapreduce.Job: Job job_local1048833344_0001 failed with state FAILED due to: NA
java.lang.Exception: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable ...
Here is the code I'm trying to run:
import java.io.*;
import java.util.StringTokenizer;
import java.util.Iterator;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCountOutputCollector {
public static class WordCountOutputCollectorMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class WordCountOutputCollectorReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count outputcollector");
job.setJarByClass(WordCountOutputCollector.class);
job.setMapperClass(WordCountOutputCollectorMapper.class);
job.setCombinerClass(WordCountOutputCollectorReducer.class);
job.setReducerClass(WordCountOutputCollectorReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//conf.setInputFormat(TextInputFormat.class);
//conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//JobClient.runJob(conf);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The root cause is that the code mixes the old mapred API (OutputCollector, Reporter) with the new mapreduce Mapper/Reducer classes: a map method with the OutputCollector signature never overrides Mapper.map, so the default identity mapper runs and emits the LongWritable offsets as keys, which is exactly the reported type mismatch. Try this instead: hadoop-wordcount
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount
{
public static class Map
extends Mapper<LongWritable, Text, Text, IntWritable>
{
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable paramLongWritable, Text paramText, Mapper<LongWritable, Text, Text, IntWritable>.Context paramMapper)
throws IOException, InterruptedException
{
StringTokenizer localStringTokenizer = new StringTokenizer(paramText.toString());
while (localStringTokenizer.hasMoreTokens())
{
this.word.set(localStringTokenizer.nextToken());
paramMapper.write(this.word, one);
}
}
}
public static class Reduce
extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text paramText, Iterable<IntWritable> paramIterable, Reducer<Text, IntWritable, Text, IntWritable>.Context paramReducer)
throws IOException, InterruptedException
{
int i = 0;
for (IntWritable localIntWritable : paramIterable) {
i += localIntWritable.get();
}
this.result.set(i);
paramReducer.write(paramText, this.result);
}
}
public static void main(String[] paramArrayOfString)
throws Exception
{
Configuration localConfiguration = new Configuration();
String[] arrayOfString = new GenericOptionsParser(localConfiguration, paramArrayOfString).getRemainingArgs();
if (arrayOfString.length != 2)
{
System.err.println("Usage: WordCount <in> <out>");
System.exit(2);
}
Job localJob = new Job(localConfiguration, "wordcount");
localJob.setJarByClass(WordCount.class);
localJob.setMapperClass(WordCount.Map.class);
localJob.setReducerClass(WordCount.Reduce.class);
localJob.setCombinerClass(WordCount.Reduce.class);
localJob.setOutputKeyClass(Text.class);
localJob.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(localJob, new Path(arrayOfString[0]));
FileOutputFormat.setOutputPath(localJob, new Path(arrayOfString[1]));
System.exit(localJob.waitForCompletion(true) ? 0 : 1);
}
}
I think it's mainly because the map output key has not been converted into Text.
Try to uncomment the code below:
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
JobClient.runJob(conf);
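Note that those commented-out calls belong to the old mapred API (they are JobConf/JobClient methods). Since the question's driver uses the new mapreduce Job, the equivalent settings would look like the sketch below, assuming the lib.input/lib.output classes are imported:
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
// TextInputFormat and TextOutputFormat are already the defaults, so these calls are optional
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);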
This code counts words and skips two given words ("in" and "of") from a file.
Please help me figure out why it is not skipping these words.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class skipwc_mapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer t = new StringTokenizer(line);
Text word = null;
while (t.hasMoreTokens()) {
word = new Text(t.nextToken());
context.write(word, new IntWritable(1));
}
}
}
class skipwc_reducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int tot = 0;
if (key.toString() != "in" && key.toString() != "of") {
while (values.iterator().hasNext()) {
tot += values.iterator().next().get();
}
context.write(key, new IntWritable(tot));
}
}
}
public class skipwc_runner {
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(skipwc_runner.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(skipwc_mapper.class);
job.setReducerClass(skipwc_reducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Use the equals method to compare Strings, like:
if (!"in".equals(key.toString()) && !"of".equals(key.toString()))
Also, it would be better to skip "of"/"in" in the mapper rather than in the reducer: filtering the data before the sort-and-shuffle phase is more efficient and avoids additional I/O.
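For example, here is a minimal sketch of a mapper that drops those two words before they are emitted (the class name just mirrors the one in the question):
class skipwc_mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer t = new StringTokenizer(value.toString());
        while (t.hasMoreTokens()) {
            String token = t.nextToken();
            // filter here so "in" and "of" never reach the sort/shuffle phase
            if (!"in".equals(token) && !"of".equals(token)) {
                context.write(new Text(token), one);
            }
        }
    }
}
This way the unwanted keys are never written to the map output at all.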
Write a MapReduce program to print the most frequently occurring words in a text document.
The threshold value can be fixed, and any word whose frequency exceeds the threshold needs to be output.
E.g.: if threshold = 100 and “is” occurs 150 times in the document, it has to be printed in the output.
Program:
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Here's the complete code,
Driver Class
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FrequentWordClassDriver extends Configured implements Tool{
@Override
public int run(String[] args) throws Exception {
if(args.length != 2){
return -1;
}
JobConf conf = new JobConf(getConf(), FrequentWordClassDriver.class);
conf.setJobName(this.getClass().getName());
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
conf.setMapperClass(FrequentWordClassMapper.class);
conf.setReducerClass(FrequentWordClassReducer.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception{
int exitCode = ToolRunner.run(new FrequentWordClassDriver(), args);
System.exit(exitCode);
}
}
Mapper Class
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>{
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
for(String phrase : line.split(" ")){
output.collect(new Text(phrase.toUpperCase()), new IntWritable(1));
}
}
}
Reducer Class
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable>{
@Override
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException{
int wordcount = 0, threshold = 100;
while(values.hasNext()){
IntWritable value = values.next();
wordcount +=value.get();
}
if(wordcount >= threshold){
output.collect(key, new IntWritable(wordcount));
}
}
}
The Driver, Mapper and Reducer classes are fairly simple and self-explanatory. The mapper splits each sentence into words and sends them to the reducer in the format <word, 1>. The reducer receives the data in the format <word, [1, 1, 1, 1]>, aggregates it to count the occurrences of each word, and emits a word as output only if its count is greater than or equal to the threshold value.
Hope this will help you.
It's very simple. Have a look at the traditional word-count example; you can use the same code.
After setting the reducer class, add the line below (if you want your output in a single reduce file):
job.setNumReduceTasks(1);
Then add your condition in the reduce method, just before the call to context.write(key, result):
if ( sum > threshold) {
context.write(key, result);
}
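Putting it together, here is a minimal sketch of such a reduce method with the threshold check (the hardcoded threshold of 100 is only an assumption taken from the problem statement):
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final int THRESHOLD = 100; // assumed fixed threshold
    private IntWritable result = new IntWritable();
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        if (sum > THRESHOLD) { // only emit words above the threshold
            result.set(sum);
            context.write(key, result);
        }
    }
}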
You can also achieve this by using counters, incrementing one counter per distinct word in the reducer:
public void reduce(Text word, Iterable<IntWritable> count,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : count) {
sum += val.get();
}
context.getCounter("WORDS", word.toString()).increment(sum); // "WORDS" is an arbitrary counter group name
}
And then, in your driver program, you can get the counters using
Counters counters = job.getCounters();
With this approach you can run multiple mappers and reducers without compromising performance.
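For example, here is a minimal sketch of reading the counters back in the driver once the job has finished (the "WORDS" group name is just the arbitrary one used in the reducer snippet above, and threshold is assumed to be defined in the driver):
Counters counters = job.getCounters();
for (Counter counter : counters.getGroup("WORDS")) {
    if (counter.getValue() > threshold) {
        System.out.println(counter.getName() + "\t" + counter.getValue());
    }
}
Keep in mind that the framework holds all counters in memory and caps how many a job may create, so this approach only scales to a fairly small number of distinct words.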
I know this is a very basic question, but I am not able to find where I am making a mistake. My Reducer is not getting invoked from the driver code. I would greatly appreciate it if anyone could help me out.
My Driver Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class carsDriver {
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("specified input and output path is not correct");
System.exit(-1);
}
// set up the job details
Job job = new Job(conf,"Cars Avg Fuel Economy");
job.setJarByClass(carsDriver.class);
//job.setJobName("Cars Avg Fuel Economy");
//setup the input and output paths for the MR job
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// setup of the Mapper, combiner and Reducer classes
job.setMapperClass(carsMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//job.setCombinerClass(carsCombiner.class);
job.setReducerClass(carsReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
Mapper Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class carsMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text mapkey = new Text();
private final static IntWritable mapval = new IntWritable(1);
public void map(Object key, Text Value,Mapper<Object, Text, Text, IntWritable>.Context context ) throws IOException, InterruptedException{
System.out.println("Running the Mapper");
String items[] = Value.toString().split(",");
System.out.println(items[2]+" "+Integer.parseInt(items[23].toString()));
mapkey.set(items[2]);
mapval.set(Integer.parseInt(items[23].toString()));
context.write(mapkey, mapval);
}
}
Reducer Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class carsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reducer(Text key, Iterable<IntWritable> value,Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
System.out.println("Reducer Code");
Text redKey = new Text();
IntWritable redVal = new IntWritable();
redKey.set(key);
int sum=0;
int count=0;
for(IntWritable val: value){
sum= sum +val.get();
count= count + 1;
}
redVal.set((sum/count));
context.write(redKey, redVal);
}
}
After a long time debugging the problem, I found that the issue is with the overridden reduce method.
I used
public void reducer
instead of
public void reduce
Observe that it must be reduce, not reducer; with the wrong name the method never overrides Reducer.reduce, so the default identity reducer runs instead.
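A minimal sketch of the corrected signature; adding the @Override annotation makes the compiler reject this kind of typo immediately, because a method named reducer does not override anything:
@Override
public void reduce(Text key, Iterable<IntWritable> value,
        Reducer<Text, IntWritable, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {
    // ... same averaging logic as before ...
}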
I am coding a MapReduce job to find the occurrences of a search string (passed as a command-line argument) in an input file stored in HDFS, using the old API.
Below is my Driver class -
public class StringSearchDriver
{
public static void main(String[] args) throws IOException
{
JobConf jc = new JobConf(StringSearchDriver.class);
jc.set("SearchWord", args[2]);
jc.setJobName("String Search");
FileInputFormat.addInputPath(jc, new Path(args[0]));
FileOutputFormat.setOutputPath(jc, new Path(args[1]));
jc.setMapperClass(StringSearchMap.class);
jc.setReducerClass(StringSearchReduce.class);
jc.setOutputKeyClass(Text.class);
jc.setOutputValueClass(IntWritable.class);
JobClient.runJob(jc);
}
}
Below is my Mapper Class -
public class StringSearchMap extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable>
{
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("SearchWord");
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split("");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
}
On running the job (command line string passed is "hi"), I am getting the below error -
14/09/21 22:35:41 INFO mapred.JobClient: Task Id : attempt_201409212134_0005_m_000001_2, Status : FAILED
java.lang.ClassCastException: interface javax.xml.soap.Text
at java.lang.Class.asSubclass(Class.java:3129)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:422)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:366)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Please suggest.
You auto-imported the wrong class.
Instead of import org.apache.hadoop.io.Text you imported javax.xml.soap.Text.
You can find a sample of this wrong import in this blog.
One more point: it is better to adopt the new API.
EDIT
Here is a version using the new API:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author Unmesha sreeveni
* @date 23 Sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean success = job.waitForCompletion(true);
return (success ? 0 : 1);
}
}
This works.
For Text, the required Hadoop package is org.apache.hadoop.io.
Check your imports:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
I just started working with MapReduce, and I'm running into a weird bug that I haven't been able to solve through Google. I'm writing a basic program using ArrayWritable, but when I run it, I get the following error during the reduce phase:
java.lang.RuntimeException:
java.lang.NoSuchMethodException:org.apache.hadoop.io.ArrayWritable.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:115)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
at org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:1276)
at org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:1214)
at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.moveToNext(ReduceTask.java:250)
at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.next(ReduceTask.java:246)
at PageRank$Reduce.reduce(Unknown Source)
at PageRank$Reduce.reduce(Unknown Source)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:522)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
I'm using Hadoop 1.2.1. Here's my code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.join.*;
import java.io.IOException;
import java.util.Iterator;
public class TempClass {
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, ArrayWritable> {
public void map(LongWritable key, Text value,
OutputCollector<Text, ArrayWritable> output,
Reporter reporter) throws IOException {
String[] arr_str = new String[]{"a","b","c"};
for(int i=0; i<3; i++)
output.collect(new Text("my_key"), new ArrayWritable(arr_str));
}
}
public static class Reduce extends MapReduceBase
implements Reducer<Text, ArrayWritable, Text, ArrayWritable> {
public void reduce(Text key, Iterator<ArrayWritable> values,
OutputCollector<Text, ArrayWritable> output,
Reporter reporter) throws IOException {
ArrayWritable tmp;
while(values.hasNext()){
tmp = values.next();
output.collect(key, tmp);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
JobConf job = new JobConf(conf, TempClass.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayWritable.class);
job.setOutputFormat(TextOutputFormat.class);
job.setInputFormat(TextInputFormat.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
FileInputFormat.setInputPaths( job, new Path( args[0] ) );
FileOutputFormat.setOutputPath( job, new Path( args[1] ) );
job.setJobName( "TempClass" );
JobClient.runJob(job);
}
}
If I comment out the lines below (in the Reduce class):
//while(values.hasNext()){
// tmp = values.next();
output.collect(key, tmp);
//}
everything is OK. Do you have any ideas?
A Writable for arrays containing instances of a class. The elements of
this writable must all be instances of the same class. If this
writable will be the input for a Reducer, you will need to create a
subclass that sets the value to be of the proper type. For example:
public class IntArrayWritable extends ArrayWritable {
    public IntArrayWritable() {
        super(IntWritable.class);
    }
}
That is from the docs of ArrayWritable. In general, a Writable must have a no-argument constructor, because the framework creates instances by reflection when deserializing values.
I just modified your code to:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class TempClass {
public static class TextArrayWritable extends ArrayWritable {
public TextArrayWritable() {
super(Text.class);
}
public TextArrayWritable(String[] strings) {
super(Text.class);
Text[] texts = new Text[strings.length];
for (int i = 0; i < strings.length; i++) {
texts[i] = new Text(strings[i]);
}
set(texts);
}
}
public static class MapClass extends MapReduceBase implements
Mapper<LongWritable, Text, Text, ArrayWritable> {
public void map(LongWritable key, Text value,
OutputCollector<Text, ArrayWritable> output, Reporter reporter)
throws IOException {
String[] arr_str = new String[] {
"a", "b", "c" };
for (int i = 0; i < 3; i++)
output.collect(new Text("my_key"), new TextArrayWritable(
arr_str));
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, TextArrayWritable, Text, TextArrayWritable> {
public void reduce(Text key, Iterator<TextArrayWritable> values,
OutputCollector<Text, TextArrayWritable> output,
Reporter reporter) throws IOException {
TextArrayWritable tmp;
while (values.hasNext()) {
tmp = values.next();
output.collect(key, tmp);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
JobConf job = new JobConf(conf, TempClass.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TextArrayWritable.class);
job.setOutputFormat(TextOutputFormat.class);
job.setInputFormat(TextInputFormat.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJobName("TempClass");
JobClient.runJob(job);
}
}