I have a MapReduce program as below
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
public class Sample {
public static class SampleMapper extends MapReduceBase implements
Mapper<Text, Text, Text, Text> {
private Text word = new Text();
@Override
public void map(Text key, Text value,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
StringTokenizer itr = new StringTokenizer(value.toString(),",");
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
output.collect(key, word);
}
}
}
public static class SampleReducer extends MapReduceBase implements
Reducer<Text, Text, Text, Text> {
private Text result = new Text();
@Override
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
StringBuffer aggregation = new StringBuffer();
while (values.hasNext()) {
aggregation.append("|" + values.next().toString());
}
result.set(aggregation.toString());
output.collect(key, result);
}
}
public static void main(String args[]) throws IOException {
JobConf conf = new JobConf(Sample.class);
conf.setJobName("Sample");
conf.setMapperClass(SampleMapper.class);
conf.setReducerClass(SampleReducer.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setInputFormat(KeyValueTextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
I've made the jar and I've been trying to get the output, but the output file which is being created is empty.
I'm using the following command to run the job
hadoop jar mapreduce.jar Sample /tmp/input tmp/output
mapreduce.jar is the jar which I have packaged, and my input file looks like
1 a,b,c
2 e,f
1 x,y,z
2 g
expected output
1 a|b|c|x|y|z
2 e|f|g
I'm guessing that since you're using KeyValueTextInputFormat as the input format, it's not finding a separator byte and is therefore using the entire line as the key (the value is ""). That would mean the iteration in your mapper never enters the loop and nothing is written out. Set the property mapreduce.input.keyvaluelinerecordreader.key.value.separator in the config to " " as the separator byte.
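For example, something along these lines in the driver should work; note that older mapred-based releases read the separator from key.value.separator.in.input.line instead, so the exact property name depends on your Hadoop version:
// in main(), before JobClient.runJob(conf)
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", " ");   // newer property name
// conf.set("key.value.separator.in.input.line", " ");                           // older mapred property name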
Try passing a Configuration object to the JobConf; I guess your JobConf is not able to pick up the Hadoop/HDFS configuration.
Configuration configuration = new Configuration();
JobConf jobconf = new JobConf(configuration, exampleClass.class);
jobconf.setJarByClass(exampleClass.class);
.......
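For reference, a rough sketch of the question's main() with that change applied (everything else unchanged from the posted code; requires import org.apache.hadoop.conf.Configuration):
public static void main(String args[]) throws IOException {
    Configuration configuration = new Configuration();   // picks up the cluster/HDFS settings from the classpath
    JobConf conf = new JobConf(configuration, Sample.class);
    conf.setJobName("Sample");
    conf.setMapperClass(SampleMapper.class);
    conf.setReducerClass(SampleReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
}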
I'm trying to read 2 files from HDFS as input with the code below, but I am facing the error shown further down.
I am a beginner in MapReduce programming and have been stuck on this problem for a couple of days; any help will be appreciated.
My code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class Recipe {
public static class TokenizerMapper1
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class TokenizerMapper2
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: recipe <in> <out>");
System.exit(2);
}
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Recipe");
job.setJarByClass(Recipe.class);
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
MultipleInputs.addInputPath(job,new Path(args[0]),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path(args[1]),TextInputFormat.class,TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
//FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/in"));
//FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
// job.submit();
}
And I've set the program's run configuration arguments like this:
/in /put
Error:
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 2
at Recipe.main(Recipe.java:121)
There are several issues. The program expects 3 parameters and you are passing only 2. Also, if you have to process multiple inputs with separate mappers, you need to use MultipleInputs.
Assume that you invoke the program as: recipe /in1 /in2 /out
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, TokenizerMapper2.class);
You can remove these lines from the code:
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);
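Putting it together, a rough sketch of main() for the three-argument invocation (class names taken from the question's code, untested):
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {                       // now three paths: two inputs and one output
    System.err.println("Usage: recipe <in1> <in2> <out>");
    System.exit(2);
}
Job job = new Job(conf, "Recipe");
job.setJarByClass(Recipe.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
System.exit(job.waitForCompletion(true) ? 0 : 1);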
Now it works with the following modifications:
Put every file in a separate directory.
Use real addresses instead of args[], as shown below:
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in1"),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in2"),TextInputFormat.class,TokenizerMapper1.class);
FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
Specify all input and output paths in the run configuration arguments like this:
127.0.0.1:9000/in1/DNAIn.txt 127.0.0.1:9000/in2/DNAIn2.txt 127.0.0.1:9000/out
Write a MapReduce program to print the most frequently occurring words in a text document.
The threshold value can be fixed, and the words whose frequency exceeds the threshold need to be output.
E.g.: if threshold=100 and “is” occurs 150 times in the document, it has to be printed in the output.
Program:
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Here's the complete code,
Driver Class
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FrequentWordClassDriver extends Configured implements Tool{
@Override
public int run(String[] args) throws Exception {
if(args.length != 2){
return -1;
}
JobConf conf = new JobConf(getConf(), FrequentWordClassDriver.class);
conf.setJobName(this.getClass().getName());
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
conf.setMapperClass(FrequentWordClassMapper.class);
conf.setReducerClass(FrequentWordClassReducer.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception{
int exitCode = ToolRunner.run(new FrequentWordClassDriver(), args);
System.exit(exitCode);
}
}
Mapper Class
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>{
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
for(String phrase : line.split(" ")){
output.collect(new Text(phrase.toUpperCase()), new IntWritable(1));
}
}
}
Reducer Class
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable>{
@Override
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException{
int wordcount = 0, threshold = 100;
while(values.hasNext()){
IntWritable value = values.next();
wordcount +=value.get();
}
if(wordcount >= threshold){
output.collect(key, new IntWritable(wordcount));
}
}
}
The Driver Class, Mapper Class and Reducer Class are fairly simple and self-explanatory. The mapper class splits each sentence into words and sends them to the reducer class in the format <word, 1>. The reducer class receives the data in the format <word, [1, 1, 1, 1]>, aggregates and counts the occurrences of each word, and if the count for a word is greater than or equal to the threshold value it emits that word as output.
Hope this will help you.
It's very simple.
Have a look at the traditional word count example. You can use the same code.
After setting the Reducer class, add the line below (if you want your output in a single reduce file):
job.setNumReduceTasks(1);
Add your condition in the reduce method.
Before calling context.write(key, result), add your condition:
if ( sum > threshold) {
context.write(key, result);
}
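Concretely, the reduce method from the word count above could look roughly like this with the threshold check in place (threshold fixed at 100, as in the question):
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final int THRESHOLD = 100;   // fixed threshold from the question
    private IntWritable result = new IntWritable();
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();           // total occurrences of this word
        }
        if (sum > THRESHOLD) {          // only emit words whose count exceeds the threshold
            result.set(sum);
            context.write(key, result);
        }
    }
}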
You can achieve this better by using counters.
In the reducer, increment a counter per word (the counter group name, "Words" in the snippet below, is just a placeholder):
public void reduce(Text word, Iterable<IntWritable> count,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : count) {
sum += val.get();
}
context.getCounter("Words", word.toString()).increment(sum);
}
And then in your driver program, you can get the counters using
Counters counters = job.getCounters();
You can use this approach and run multiple mappers and reducers, thus not compromising performance.
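A hedged sketch of how the driver might read those counters back after the job finishes, assuming the counter group was named "Words" as above and a threshold of 100 (uses org.apache.hadoop.mapreduce.Counter):
for (Counter counter : counters.getGroup("Words")) {      // one counter per distinct word
    if (counter.getValue() > 100) {                        // apply the threshold on the driver side
        System.out.println(counter.getName() + "\t" + counter.getValue());
    }
}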
I know this is a very basic question, but I am not able to find where I am making a mistake. My Reducer is not getting invoked from the driver code. I would greatly appreciate it if anyone could help me out.
My Driver Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class carsDriver {
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("specified input and output path is not correct");
System.exit(-1);
}
// set up the job details
Job job = new Job(conf,"Cars Avg Fuel Economy");
job.setJarByClass(carsDriver.class);
//job.setJobName("Cars Avg Fuel Economy");
//setup the input and output paths for the MR job
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// setup of the Mapper, combiner and Reducer classes
job.setMapperClass(carsMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//job.setCombinerClass(carsCombiner.class);
job.setReducerClass(carsReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
Mapper Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class carsMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text mapkey = new Text();
private final static IntWritable mapval = new IntWritable(1);
public void map(Object key, Text Value,Mapper<Object, Text, Text, IntWritable>.Context context ) throws IOException, InterruptedException{
System.out.println("Running the Mapper");
String items[] = Value.toString().split(",");
System.out.println(items[2]+" "+Integer.parseInt(items[23].toString()));
mapkey.set(items[2]);
mapval.set(Integer.parseInt(items[23].toString()));
context.write(mapkey, mapval);
}
}
Reducer Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class carsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reducer(Text key, Iterable<IntWritable> value,Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
System.out.println("Reducer Code");
Text redKey = new Text();
IntWritable redVal = new IntWritable();
redKey.set(key);
int sum=0;
int count=0;
for(IntWritable val: value){
sum= sum +val.get();
count= count + 1;
}
redVal.set((sum/count));
context.write(redKey, redVal);
}
}
After a long time debugging the problem, I found that the issue is with the reduce override method.
I used
public void reducer
instead of
public void reduce
Observe that it should be reduce instead of reducer. Because reducer does not override Reducer.reduce, the framework falls back to the default identity reduce, which simply writes the map output through unchanged.
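For reference, a sketch of the corrected signature from the question's reducer; annotating it with @Override makes the compiler reject a method that does not actually override Reducer.reduce, which would have caught this immediately:
@Override
public void reduce(Text key, Iterable<IntWritable> value,
        Reducer<Text, IntWritable, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    int count = 0;
    for (IntWritable val : value) {      // sum the values for this key
        sum += val.get();
        count++;
    }
    context.write(key, new IntWritable(sum / count));   // emit the average
}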
I am a newbie to Hadoop but have read the Yahoo tutorial on it and have already written a few MapReduce jobs. All my previous work used TextInputFormat, but I now need to change that to KeyValueInputFormat. The problem is that KeyValueInputFormat.class cannot be found in Hadoop 0.20.2.
I am attaching my code below (it is the word count example with only the input format changed).
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(keyValueInputFormat.class); //The modified input format
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
There is KeyValueTextInputFormat in org.apache.hadoop.mapreduce.lib.input.
Some of the old tutorials are based on older versions of the Hadoop API. I recommend that you go through some of the newer tutorials.
This is what I get when I do Go To Source on KeyValueTextInputFormat.
package org.apache.hadoop.mapreduce.lib.input;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class KeyValueTextInputFormat extends FileInputFormat<Text, Text> {
public KeyValueTextInputFormat() {
//compiled code
throw new RuntimeException("Compiled Code");
}
protected boolean isSplitable(JobContext context, Path file) {
//compiled code
throw new RuntimeException("Compiled Code");
}
public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
//compiled code
throw new RuntimeException("Compiled Code");
}
}
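A minimal sketch of how the driver could be wired to that class once the job is moved to the new API; this assumes the Map and Reduce classes are also rewritten against org.apache.hadoop.mapreduce, and that FileInputFormat/FileOutputFormat are imported from org.apache.hadoop.mapreduce.lib:
Job job = new Job(new Configuration(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(Map.class);          // new-API Mapper<Text, Text, Text, IntWritable>
job.setReducerClass(Reduce.class);      // new-API Reducer<Text, IntWritable, Text, IntWritable>
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);   // keys and values pre-split on the separator byte (tab by default)
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);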
I have been trying to execute some code that would allow me to 'only' list the words that exist in multiple files; what I have done so far was use the word count example, and thanks to Chris White I managed to compile it. I tried reading here and there to get the code to work, but all I am getting is a blank page with no data. The mapper is supposed to collect each word with its corresponding locations; the reducer is supposed to collect the common words. Any thoughts as to what might be the problem? The code is:
package org.myorg;
import java.io.IOException;
import java.util.*;
import java.lang.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private Text outvalue=new Text();
private String filename = null;
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
if (filename == null)
{
filename = ((FileSplit) reporter.getInputSplit()).getPath().getName();
}
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens())
{
word.set(tokenizer.nextToken());
outvalue.set(filename);
output.collect(word, outvalue);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private Text src = new Text();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
int sum = 0;
//List<Text> list = new ArrayList<Text>();
while (values.hasNext()) // I believe this would have all locations of the same word in different files?
{
sum += values.next().get();
src =values.next().get();
}
output.collect(key, src);
//while(values.hasNext())
//{
//Text value = values.next();
//list.add(new Text(value));
//System.out.println(value.toString());
//}
//System.out.println(values.toString());
//for(Text value : list)
//{
//System.out.println(value.toString());
//}
}
}
public static void main(String[] args) throws Exception
{
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setInputFormat(KeyValueTextInputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
//conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Am I missing anything?
much obliged...
My Hadoop version : 0.20.203
First of all, it seems you're using the old Hadoop API (mapred); a word of advice would be to use the new Hadoop API (mapreduce), which is compatible with 0.20.203.
In the new API, here is a word count that will work:
import java.io.IOException;
import java.lang.InterruptedException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
/**
* The map class of WordCount.
*/
public static class TokenCounterMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
/**
* The reducer class of WordCount
*/
public static class TokenCounterReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
/**
* The main entry point.
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "Example Hadoop 0.20.1 WordCount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenCounterMapper.class);
job.setReducerClass(TokenCounterReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Then, we build this file and pack the result into a jar file:
mkdir classes
javac -classpath /path/to/hadoop-0.20.203/hadoop-0.20.203-core.jar:/path/to/hadoop-0.20.203/lib/commons-cli-1.2.jar -d classes WordCount.java && jar -cvf wordcount.jar -C classes/ .
Finally, we run the jar file in Hadoop's standalone mode:
echo "hello world bye world" > /tmp/in/0.txt
echo "hello hadoop goodebye hadoop" > /tmp/in/1.txt
hadoop jar wordcount.jar org.packagename.WordCount /tmp/in /tmp/out
In the reducer, maintain a set of the values observed (the filenames emitted in the mapper); if, after you consume all the values, the set size is 1, then the word is only used in one file.
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private TreeSet<Text> files = new TreeSet<Text>();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
files.clear();
while (values.hasNext())
{
Text value = values.next();
if (!files.contains(value))
{
// make a copy of value as hadoop re-uses the object
files.add(new Text(value));
}
}
if (files.size() == 1) {
output.collect(key, files.first());
}
files.clear();
}
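One caveat, hedged: the question asks for words that exist in multiple files, while the check above keeps words seen in exactly one file. If the common words are what you want, the final test could be inverted, for example:
if (files.size() > 1) {
    // the word was seen in more than one input file;
    // emitting the set of filenames here is just an illustration, any serialization of the set would do
    output.collect(key, new Text(files.toString()));
}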
}