I am newbie to Hadoop but have read the Yahoo tutorial on that and have already written a few mapReduce jobs. All my previous works used TextInputFormat but I now need to change that to KeyValueInputFormat. The problem is that KeyValueInputFormat.class cannot be found in hadoop 0.20.2?
I am attaching my code below (It is the word count example with only input format being changed)
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(keyValueInputFormat.class); //The modified input format
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
There is KeyValueTextInputFormat in org.apache.hadoop.mapreduce.lib.input.
Some of the old tutorials are based on older versions of Hadoop API. I recommend that you go through some of the newer tutorials.
This is what I get when I do go to source on KeyValueTextInputFormat.
package org.apache.hadoop.mapreduce.lib.input;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class KeyValueTextInputFormat extends FileInputFormat<Text, Text> {
public KeyValueTextInputFormat() {
//compiled code
throw new RuntimeException("Compiled Code");
}
protected boolean isSplitable(JobContext context, Path file) {
//compiled code
throw new RuntimeException("Compiled Code");
}
public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
//compiled code
throw new RuntimeException("Compiled Code");
}
}
Related
I'm trying to run basic wordcount mapreduce example using outputcollector , but im getting exceptions.
INFO mapreduce.Job: Job job_local1048833344_0001 failed with state FAILED due to: NA
java.lang.Exception: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable ...
Here is the code I'm trying to run:
import java.io.*;
import java.util.StringTokenizer;
import java.util.Iterator;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCountOutputCollector {
public static class WordCountOutputCollectorMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class WordCountOutputCollectorReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count outputcollector");
job.setJarByClass(WordCountOutputCollector.class);
job.setMapperClass(WordCountOutputCollectorMapper.class);
job.setCombinerClass(WordCountOutputCollectorReducer.class);
job.setReducerClass(WordCountOutputCollectorReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//conf.setInputFormat(TextInputFormat.class);
//conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//JobClient.runJob(conf);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Try This : hadoop-wordcount
import java.io.IOException;
import java.io.PrintStream;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount
{
public static class Map
extends Mapper<LongWritable, Text, Text, IntWritable>
{
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable paramLongWritable, Text paramText, Mapper<LongWritable, Text, Text, IntWritable>.Context paramMapper)
throws IOException, InterruptedException
{
StringTokenizer localStringTokenizer = new StringTokenizer(paramText.toString());
while (localStringTokenizer.hasMoreTokens())
{
this.word.set(localStringTokenizer.nextToken());
paramMapper.write(this.word, one);
}
}
}
public static class Reduce
extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text paramText, Iterable<IntWritable> paramIterable, Reducer<Text, IntWritable, Text, IntWritable>.Context paramReducer)
throws IOException, InterruptedException
{
int i = 0;
for (IntWritable localIntWritable : paramIterable) {
i += localIntWritable.get();
}
this.result.set(i);
paramReducer.write(paramText, this.result);
}
}
public static void main(String[] paramArrayOfString)
throws Exception
{
Configuration localConfiguration = new Configuration();
String[] arrayOfString = new GenericOptionsParser(localConfiguration, paramArrayOfString).getRemainingArgs();
if (arrayOfString.length != 2)
{
System.err.println("Usage: WordCount <in> <out>");
System.exit(2);
}
Job localJob = new Job(localConfiguration, "wordcount");
localJob.setJarByClass(WordCount.class);
localJob.setMapperClass(WordCount.Map.class);
localJob.setReducerClass(WordCount.Reduce.class);
localJob.setCombinerClass(WordCount.Reduce.class);
localJob.setOutputKeyClass(Text.class);
localJob.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(localJob, new Path(arrayOfString[0]));
FileOutputFormat.setOutputPath(localJob, new Path(arrayOfString[1]));
System.exit(localJob.waitForCompletion(true) ? 0 : 1);
}
}
I think it's mainly because the map output has not been converted into Text.
Try to uncomment the code below:
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
JobClient.runJob(conf);
I am trying to run MaxTemperature example from MapReduce. But I couldnot find the MaxTemperature.jar in Hadoop MapReduce Examples. Can someone help me out finding the jar file or what is the possibility of executing this program and see the output?
try this,
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Temp {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
// private final static IntWritable one = new IntWritable();
// private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException InterruptedException
{
String line = value.toString();
String year=line.substring(0,4);
//StringTokenizer tokenizer = new StringTokenizer(line);
// while (tokenizer.hasMoreTokens()) {
// word.set(tokenizer.nextToken());
// output.collect(word, one);
int Temp=Integer.parseInt(line.substring(6,8));
context.write(new Text(year),new IntWritable(Temp));
}
}
}
/*
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
// int max=Integer.MIN_VALUE;
int sum=0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}*/
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>
{
#Override
public void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException
{
int maxValue = Integer.MIN_VALUE;
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
}
context.write(key, new IntWritable(maxValue));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Temp.class);
conf.setJobName("Temp");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
make jar file of this program and execute command
hadoop jar Temp.jar Temp /hdfs_inputFile /hdfs_inputFile
The output I am expecting is the count of every word in the input file. But my output is the whole input file, as it is.
I am using extends Mapper<LongWritable, Text, Text, IntWritable> for mapper class and Reducer<Text, IntWritable, Text, IntWritable> for reducer class.
Here is my code
driver.java
public class driver extends Configured implements Tool{
public int run(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setMapperClass(mapper.class);
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
//JobClient.runJob((JobConf) conf);
//System.exit(job.waitForCompletion(true) ? 0 : 1);
return 0;
}
public static void main(String[] args) throws Exception
{
long start = System.currentTimeMillis();
//int res = ToolRunner.run(new Configuration(), new driver(),args);
int res = ToolRunner.run(new Configuration(), new driver(),args);
long stop = System.currentTimeMillis();
System.out.println ("Time: " + (stop-start));
System.exit(res);
}
}
mapper.java
public class mapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
//hadoop supported data types
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
//map method that performs the tokenizer job and framing the initial key value pairs
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException
{
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens())
{
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
reducer.java
public class reducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
//reduce method accepts the Key Value pairs from mappers, do the aggregation based on keys and produce the final out put
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException
{
int sum = 0;
while (values.hasNext())
{
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
You are perplexed by the new & old APIs of MapReduce. I think you tried to write WordCount program in new API, but took snippets from the old API(a old blogpost perhaps). You can find the problem yourself, if you just add #override annotation to both the map & reduce methods.
See what happens to them after evolution :
map
reduce
You just wrote two new methods specifying older signature, so they just don't override anything, nowhere being called. The code is doing nothing since the actual methods being called have empty bodies(I don't think there is a default implementation and if there is that will be identity operations only).
Anyway, you should follow basic conventions for coding.
try this,
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class WordCount {
public static class Map extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable> {
#Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
System.out.println(line);
while (tokenizer.hasMoreTokens()) {
value.set(tokenizer.nextToken());
output.collect(value, new IntWritable(1));
}
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, IntWritable, Text, IntWritable> {
#Override
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception,IOException {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("WordCount");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path("/home/user17/test.txt"));
FileOutputFormat.setOutputPath(conf, new Path("hdfs://localhost:9000/out2"));
JobClient.runJob(conf);
}
}
make jar and execute given command on commandLine
hadoop jar WordCount.jar WordCount /inputfile /outputfile
Please run this code if you are facing problem with your code.This code contains mapper,reducer and main functions.
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()){
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
2) After that create a jar of this code say wordcount.jar saved in your home directory (/home/user/wordcount.jar) and run the following command :
hadoop jar wordcount.jar classname /inputfile /outputfile /
This will create a file outputfile under /(root) directory of hadoop. View your result by
hadoop dfs -cat /outputfile/part-m-00000
This will successfully run your wordcount program.
I just started working with MapReduce, and I'm running into a weird bug that I haven't been able to answer through Google. I'm making a basic program using ArrayWritable, but when I run it, I get the following error during Reduce:
java.lang.RuntimeException:
java.lang.NoSuchMethodException:org.apache.hadoop.io.ArrayWritable.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:115)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
at org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:1276)
at org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:1214)
at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.moveToNext(ReduceTask.java:250)
at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.next(ReduceTask.java:246)
at PageRank$Reduce.reduce(Unknown Source)
at PageRank$Reduce.reduce(Unknown Source)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:522)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
I'm using Hadoop 1.2.1. Here's my code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.join.*;
import java.io.IOException;
import java.util.Iterator;
public class TempClass {
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, ArrayWritable> {
public void map(LongWritable key, Text value,
OutputCollector<Text, ArrayWritable> output,
Reporter reporter) throws IOException {
String[] arr_str = new String[]{"a","b","c"};
for(int i=0; i<3; i++)
output.collect(new Text("my_key"), new ArrayWritable(arr_str));
}
}
public static class Reduce extends MapReduceBase
implements Reducer<Text, ArrayWritable, Text, ArrayWritable> {
public void reduce(Text key, Iterator<ArrayWritable> values,
OutputCollector<Text, ArrayWritable> output,
Reporter reporter) throws IOException {
ArrayWritable tmp;
while(values.hasNext()){
tmp = values.next();
output.collect(key, tmp);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
JobConf job = new JobConf(conf, TempClass.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayWritable.class);
job.setOutputFormat(TextOutputFormat.class);
job.setInputFormat(TextInputFormat.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
FileInputFormat.setInputPaths( job, new Path( args[0] ) );
FileOutputFormat.setOutputPath( job, new Path( args[1] ) );
job.setJobName( "TempClass" );
JobClient.runJob(job);
}
}
If I comment below lines (Reduce Class):
//while(values.hasNext()){
// tmp = values.next();
output.collect(key, tmp);
//}
everything will be ok. Do you have any ideas?
A Writable for arrays containing instances of a class. The elements of
this writable must all be instances of the same class. If this
writable will be the input for a Reducer, you will need to create a
subclass that sets the value to be of the proper type. For example:
public class IntArrayWritable extends ArrayWritable { public
IntArrayWritable() { super(IntWritable.class); } }
Here is from the docs of ArrayWritable. Generally, Writable should have an constructor with no parameter.
I just modified your code to:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class TempClass {
public static class TextArrayWritable extends ArrayWritable {
public TextArrayWritable() {
super(Text.class);
}
public TextArrayWritable(String[] strings) {
super(Text.class);
Text[] texts = new Text[strings.length];
for (int i = 0; i < strings.length; i++) {
texts[i] = new Text(strings[i]);
}
set(texts);
}
}
public static class MapClass extends MapReduceBase implements
Mapper<LongWritable, Text, Text, ArrayWritable> {
public void map(LongWritable key, Text value,
OutputCollector<Text, ArrayWritable> output, Reporter reporter)
throws IOException {
String[] arr_str = new String[] {
"a", "b", "c" };
for (int i = 0; i < 3; i++)
output.collect(new Text("my_key"), new TextArrayWritable(
arr_str));
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, TextArrayWritable, Text, TextArrayWritable> {
public void reduce(Text key, Iterator<TextArrayWritable> values,
OutputCollector<Text, TextArrayWritable> output,
Reporter reporter) throws IOException {
TextArrayWritable tmp;
while (values.hasNext()) {
tmp = values.next();
output.collect(key, tmp);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
JobConf job = new JobConf(conf, TempClass.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TextArrayWritable.class);
job.setOutputFormat(TextOutputFormat.class);
job.setInputFormat(TextInputFormat.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJobName("TempClass");
JobClient.runJob(job);
}
}
I have been trying to execute some code that would allow me to 'only' list the words that exist in multiple files; what I have done so far was use the wordcount example and thanx to Chris White I managed to compile it. I tried reading here and there to get the code to work but all I am getting is a blank page with no data. the mapper is suppose to collect each word with its corresponding locations; the reducer is suppose to collect the common words any thoughts as to what might be the problem? the code is:
package org.myorg;
import java.io.IOException;
import java.util.*;
import java.lang.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private Text outvalue=new Text();
private String filename = null;
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
if (filename == null)
{
filename = ((FileSplit) reporter.getInputSplit()).getPath().getName();
}
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens())
{
word.set(tokenizer.nextToken());
outvalue.set(filename);
output.collect(word, outvalue);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private Text src = new Text();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
int sum = 0;
//List<Text> list = new ArrayList<Text>();
while (values.hasNext()) // I believe this would have all locations of the same word in different files?
{
sum += values.next().get();
src =values.next().get();
}
output.collect(key, src);
//while(values.hasNext())
//{
//Text value = values.next();
//list.add(new Text(value));
//System.out.println(value.toString());
//}
//System.out.println(values.toString());
//for(Text value : list)
//{
//System.out.println(value.toString());
//}
}
}
public static void main(String[] args) throws Exception
{
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setInputFormat(KeyValueTextInputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
//conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Am I missing anything?
much obliged...
My Hadoop version : 0.20.203
First of all it seems you're using the old Hadoop API (mapred), and a word of advice would be to use the new Hadoop API (mapreduce) which is compatible with 0.20.203
In the new API, here is a wordcount that will work
import java.io.IOException;
import java.lang.InterruptedException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
/**
* The map class of WordCount.
*/
public static class TokenCounterMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
/**
* The reducer class of WordCount
*/
public static class TokenCounterReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
/**
* The main entry point.
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "Example Hadoop 0.20.1 WordCount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenCounterMapper.class);
job.setReducerClass(TokenCounterReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Then, we build this file and pack the result into a jar file:
mkdir classes
javac -classpath /path/to/hadoop-0.20.203/hadoop-0.20.203-core.jar:/path/to/hadoop- 0.20.203/lib/commons-cli-1.2.jar -d classes WordCount.java && jar -cvf wordcount.jar -C classes/ .
Finally, we run the jar file in standalone mode of Hadoop
echo "hello world bye world" > /tmp/in/0.txt
echo "hello hadoop goodebye hadoop" > /tmp/in/1.txt
hadoop jar wordcount.jar org.packagename.WordCount /tmp/in /tmp/out
In the reducer, maintain a set of the values observed (the filenames emitted in the mapper), if after you consume all the values, this set size is 1, then the word is only used in one file.
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private TreeSet<Text> files = new TreeSet<Text>();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
files.clear();
for (Text file : values)
{
if (!files.contains(value))
{
// make a copy of value as hadoop re-uses the object
files.add(new Text(value));
}
}
if (files.size() == 1) {
output.collect(key, files.first());
}
files.clear();
}
}