Code not skipping two words in wordcount program

Code not skipping two words in wordcount program - hadoop

This code counts words and skips two given words(in & of) form a file:-
Please help why it is not skipping these words.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class skipwc_mapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer t = new StringTokenizer(line);
Text word = null;
while (t.hasMoreTokens()) {
word = new Text(t.nextToken());
context.write(word, new IntWritable(1));
}
}
}
class skipwc_reducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int tot = 0;
if (key.toString() != "in" && key.toString() != "of") {
while (values.iterator().hasNext()) {
tot += values.iterator().next().get();
}
context.write(key, new IntWritable(tot));
}
}
}
public static class skipwc_runner {
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(skipwc_runner.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(skipwc_mapper.class);
job.setReducerClass(skipwc_reducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
}

Use equals method to compare String like:
if (!"in".equals(key.toString()) && !"of".equals(key.toString()))
Also it would be beneficial if you skip of/in in the mapper rather than reducer as it would efficient to remove the data before sorting and shuffling phase, so you avoid additional IO.

Related

Iterable not working in mapreduce Reduce Task

Hi Guys i am new to hadoop, i am struggling with issue related to reducer. I have simple wordcount programme which not returning expected output
expected output:
this 1
hadoop 2
output:
this 1
hadoop 1
hadoop 1
code for wordcount programme
package in.edureka.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString());
while (tokenizer.hasMoreTokens()){
String token = tokenizer.nextToken();
context.write(new Text(token), new IntWritable(1));
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable v: values){
sum+=v.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = new Job(conf, "WordCount Programme");
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path outputpath = new Path(args[1]);
//Path outputpath = new Path(args[1]);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
outputpath.getFileSystem(conf).delete(outputpath);
System.setProperty("hadoop.home.dir", System.getProperty("user.home"));
System.exit(job.waitForCompletion(true)? 0 : 1);
}
}

I am not sure the problem with your code but I took the following from the documentation (https://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html#Example:_WordCount_v1.0)
and it works as expected.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

The mapreduce code here produces an empty output file. The code and the input is given below

The mapreduce code here produces an empty output file. The code and the input is given below.
package temperature;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TemperatureMapper extends Mapper<Text, Text, Text, IntWritable> {
#Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
if (isValueValid(value.toString())) {
Text key2 = new Text(getStateFromValue(value.toString()));
IntWritable value2 = new IntWritable(getTemperatureFrom(value.toString()));
context.write(key2, value2);
}
}
private boolean isValueValid(final String value) {
// We expect that the value is a String in the form of : State, Temperature. E.g. MP,77
Pattern p = Pattern.compile("\\S\\S\\,\\d+");
Matcher m = p.matcher(value);
return m.matches();
}
private String getStateFromValue(final String value) {
final String[] subvalues = value.split("\\,");
return subvalues[0];
}
private int getTemperatureFrom(final String value) {
final String[] subvalues = value.split("\\,");
return Integer.parseInt(subvalues[1]);
}
}
public class TemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
protected void reduce(final Text key, final Iterable<IntWritable> values, final Context context) throws IOException, InterruptedException {
int sumOfTemperatures = 0;
int nbValues = 0;
int average=0;
for (IntWritable temperature : values) {
sumOfTemperatures += temperature.get();
nbValues++;
}
average = sumOfTemperatures / nbValues;
context.write(key, new IntWritable(average));
}
}
public class average {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (otherArgs.length != 2) {
System.err.println("Usage: Main <in> <out>");
System.exit(-1);
}*/
Job job = new Job(conf, "Calculate average Temperature");
job.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
job.setJarByClass(average.class);
job.setMapperClass(TemperatureMapper.class);
job.setReducerClass(TemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : -1);
}
}
The code works fine for the input:
Ujjain MP,77
Bhopal MP,76
Indore MP,72
Raipur CG,72
Durg CG,75
Raigarth CG,70
Kendujhar OR,69
Bhubaneswar OR,71
Puri OR,76
But not for some random input like:
hello VI,6
bye RE,2
It rather produces an empty output file.

modify your regular expression for the following to support that kind of input
Pattern p = Pattern.compile("[a-zA-Z]*\\s*[a-zA-Z]{2},\\d+$");
Also, you will need split again to get the state
String[] subvalues = value.split("\\,")[0].split(" ");
return subvalues[subvalues.length - 1];
I hope it helps. In my side, I had to change the key type in the value LongWritable, I am not sure why is not complaining in our side, probably a different api version
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

hadoop program only logic to be written.write the program only for reducer

Write a map reduce programm to print the most frequenty ocuring words in a text document.
The threshld value can be fixed and the word whose frequency exceeds the threshold need to be output.
Eg: If thereshold=100, and “is” occurs 150 times in the document, it has to be printed in the output.
program :
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, Inritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context coext)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}

Here's the complete code,
Driver Class
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FrequentWordClassDriver extends Configured implements Tool{
#Override
public int run(String[] args) throws Exception {
if(args.length != 2){
return -1;
}
JobConf conf = new JobConf(getConf(), FrequentWordClassDriver.class);
conf.setJobName(this.getClass().getName());
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
conf.setMapperClass(FrequentWordClassMapper.class);
conf.setReducerClass(FrequentWordClassReducer.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception{
int exitCode = ToolRunner.run(new FrequentWordClassDriver(), args);
System.exit(exitCode);
}
}
Mapper Class
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>{
#Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
for(String phrase : line.split(" ")){
output.collect(new Text(phrase.toUpperCase()), new IntWritable(1));
}
}
}
Reducer Class
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class FrequentWordClassReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException{
int wordcount = 0, threshold = 100;
while(values.hasNext()){
IntWritable value = values.next();
wordcount +=value.get();
}
if(wordcount >= threshold){
output.collect(key, new IntWritable(wordcount));
}
}
}
The Driver Class, Mapper Class and Reducer Class is fairly simple and self explanatory. The mapper class split each sentence into words and send them to reducer class in the format <word, 1>. The reducer class will receive the data in the format <word, [1, 1, 1, 1]> and it will aggregate and count the occurrence of each word, and if the occurrence of each word is greater than or equal to threshold value then it will send the word as output.
Hope this will help you.

It's very simple.
Have a look at traditional word count example. You can use same code.
After setting Reducer class, add below line (If you want your output in single reduce file)
job.setNumReduceTasks(1);
Add your condition in reduce method.
Before writing to context.write(key, result);, add your condition
if ( sum > threshold) {
context.write(key, result);
}

You can better achieve this by using counters.
You can set the number of counter
public void reduce(Text word, Iterable<IntWritable> count,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : count) {
sum += val.get();
}
context.getCounter(word.toString()).increment(sum);
}
And then in the your driver program, you can get the counter using
Counters counters=job.getCounters();
You can use this and run multiple mappers and reducer, thus not compromising the performance.

Hadoop setJarByClass not working

My WordCount example is the following structure:
public class WordCount extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {}
public static void main(String[] args) throws Exception {
BasicConfigurator.configure();
Logger.getRootLogger().setLevel(Level.WARN);
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
#Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(args[1]))) {
fs.delete(new Path(args[1]), true);
}
Job job = Job.getInstance(conf, "wordcount");
long startTime = System.currentTimeMillis();
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(WordCount.class);
// job.setJar(WordCount.class.getSimpleName());
job.waitForCompletion(true);
System.out.println("Job Finished in "
+ (System.currentTimeMillis() - startTime) / 1000.0
+ " seconds");
return 0;
}
}
The job.setJarByClass() call is not working, and I get a "No job jar file set" message. Also, job.getJar() after this call shows "null" value. Anyone knows what's the problem here?
I also tried with job.setJarByClass(this.getClass()), job.setJar("WordCount") and job.setJar(WordCount.class.getSimpleName()). The first one has no effect, job.getJar() returns null, the second and third both give me FileNotFoundException: File WordCount does not exist. Then I tried with job.setJar("src/wordcount/WordCount.java") and job.setJar("bin/wordcount/WordCount.class"), both succeed within eclipse (without this warning message), but still fail with FileNotFoundException when executed as standalone jar file on command line. I guess the problem may relate to class path setting if not unresolved dependencies.

think you should add appropriate jar files.
In your case you must have this jar org.apache.hadoop.mapreduce.Job in your project file.
I imported the following classes and interfaces
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
And your project working fine.
Just check after importing all above mentioned classes. If any problem, give me a comment.

please use this java code for word counting, with two arguments one is input file other one is result file. And add all jar files from mapreduce and common folders in hadoop directory
package org.samples.mapreduce.training;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.job.tracker", "hdfs://localhost:50001");
conf.set("fs.default.name", "hdfs://localhost:50000");
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Or If you want use advance version use this code with three arguments, here third one file which you dont want count example ,
package org.samples.mapreduce.training;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
public class WordCountV2 {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
static enum CountersEnum { INPUT_WORDS }
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private boolean caseSensitive;
private Set<String> patternsToSkip = new HashSet<String>();
private Configuration conf;
private BufferedReader fis;
#Override
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
if (conf.getBoolean("wordcount.skip.patterns", true)) {
URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
for (URI patternsURI : patternsURIs) {
Path patternsPath = new Path(patternsURI.getPath());
String patternsFileName = patternsPath.getName().toString();
parseSkipFile(patternsFileName);
}
}
}
private void parseSkipFile(String fileName) {
try {
fis = new BufferedReader(new FileReader(fileName));
String pattern = null;
while ((pattern = fis.readLine()) != null) {
patternsToSkip.add(pattern);
}
} catch (IOException ioe) {
System.err.println("Caught exception while parsing the cached file '"
+ StringUtils.stringifyException(ioe));
}
}
#Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ?
value.toString() : value.toString().toLowerCase();
for (String pattern : patternsToSkip) {
line = line.replaceAll(pattern, "");
}
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
Counter counter = context.getCounter(CountersEnum.class.getName(),
CountersEnum.INPUT_WORDS.toString());
counter.increment(1);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if (!(remainingArgs.length != 2 || remainingArgs.length != 4)) {
System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCountV2.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
List<String> otherArgs = new ArrayList<String>();
for (int i=0; i < remainingArgs.length; ++i) {
if ("-skip".equals(remainingArgs[i])) {
job.addCacheFile(new Path(remainingArgs[++i]).toUri());
job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
} else {
otherArgs.add(remainingArgs[i]);
}
}
FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Hadoop: reducer not getting invoked

I know this is a very basic question but I am not able to find where I am making a mistake. My Reducer is not getting invoked from the driver code. I would greatly appreciate if anyone can help me out.
My Driver Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class carsDriver {
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("specified input and output path is not correct");
System.exit(-1);
}
// set up the job details
Job job = new Job(conf,"Cars Avg Fuel Economy");
job.setJarByClass(carsDriver.class);
//job.setJobName("Cars Avg Fuel Economy");
//setup the input and output paths for the MR job
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// setup of the Mapper, combiner and Reducer classes
job.setMapperClass(carsMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//job.setCombinerClass(carsCombiner.class);
job.setReducerClass(carsReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
Mapper Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class carsMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text mapkey = new Text();
private final static IntWritable mapval = new IntWritable(1);
public void map(Object key, Text Value,Mapper<Object, Text, Text, IntWritable>.Context context ) throws IOException, InterruptedException{
System.out.println("Running the Mapper");
String items[] = Value.toString().split(",");
System.out.println(items[2]+" "+Integer.parseInt(items[23].toString()));
mapkey.set(items[2]);
mapval.set(Integer.parseInt(items[23].toString()));
context.write(mapkey, mapval);
}
}
Reducer Code
package com.mycompany.myorg;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class carsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reducer(Text key, Iterable<IntWritable> value,Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
System.out.println("Reducer Code");
Text redKey = new Text();
IntWritable redVal = new IntWritable();
redKey.set(key);
int sum=0;
int count=0;
for(IntWritable val: value){
sum= sum +val.get();
count= count + 1;
}
redVal.set((sum/count));
context.write(redKey, redVal);
}
}

After long time debugging the problem I Found that the issue is with the reduce override method.
I used
public void reducer
instead of
public void reduce
observe that it should be reduce instead of reducer.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Code not skipping two words in wordcount program - hadoop

Use equals method to compare String like: if (!"in".equals(key.toString()) && !"of".equals(key.toString())) Also it would be beneficial if you skip of/in in the mapper rather than reducer as it would efficient to remove the data before sorting and shuffling phase, so you avoid additional IO.

Related

Iterable not working in mapreduce Reduce Task

The mapreduce code here produces an empty output file. The code and the input is given below

hadoop program only logic to be written.write the program only for reducer

Hadoop setJarByClass not working

Hadoop: reducer not getting invoked

Categories

Resources