Sort data with Hadoop MapReduce - sorting

I have the following algorithm that sorts data in alphabetical order:
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line+"_"+key.toString());
context.write(word, one);
System.out.println("key:"+key.toString()+";value:"+value.toString());
}
}
public static class ForwardReducer
extends Reducer<Text,NullWritable,Text,NullWritable> {
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values,
Context context
) throws IOException, InterruptedException {
String originalWord = key.toString();
originalWord = originalWord.substring(0, originalWord.lastIndexOf("_"));
key.set(originalWord);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
Job job = Job.getInstance(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
I tried this algorithm to sort my data set, which contains lines like (#xxxxxxx , 0,tcp,xx,1,1,1,2,4,5,....), but in the output all lines starting with # are deleted and the structure of the data lines (0,tcp,x1x1,1,114,....) is modified. I just want to sort my dataset on this specific character (#), with all lines starting with # at the beginning of the file and the rest keeping their original structure.
Can anyone please help me modify this algorithm?

You can use the modified code below to perform the sorting:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class AmaSort
{
static Configuration conf = null;
private static boolean caseSensitive;
private static Text word = new Text();
public static class LineMapper extends Mapper<Object, Text, Text, NullWritable>{
public void setup(Context context) throws IOException, InterruptedException
{
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line);
context.write(word, NullWritable.get());
}
}
public static class ForwardReducer extends Reducer<Text, NullWritable, Text, NullWritable>
{
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
{
context.write(key, result);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
// Job job = Job.getInstance(conf, "word sort");
Job job = new Job(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
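A note on the # requirement: since every input line becomes the key, the shuffle sorts lines byte-wise, and '#' sorts before digits and letters in ASCII, so the header lines naturally come out first. If you want to make that ordering explicit, here is a minimal sketch (not part of the original code; the class names TaggedLineMapper and UntagReducer are made up) that tags the keys in the mapper and strips the tag in the reducer:
// Hypothetical sketch, reusing the imports of the AmaSort class above.
public static class TaggedLineMapper extends Mapper<Object, Text, Text, NullWritable> {
    private final Text outKey = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // "0" sorts before "1", so every line starting with '#' is shuffled first.
        String tag = line.startsWith("#") ? "0" : "1";
        outKey.set(tag + line);
        context.write(outKey, NullWritable.get());
    }
}

public static class UntagReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    public void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Drop the one-character sort tag before writing the original line out.
        context.write(new Text(key.toString().substring(1)), NullWritable.get());
    }
}
Wire these in with job.setMapperClass(TaggedLineMapper.class) and job.setReducerClass(UntagReducer.class) instead of the classes above; everything else in the driver stays the same.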

Related

passing arguments to record reader in mapreduce hadoop

This is my code for using various arguments:
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class Docsparser {
private static String Delimiter;
public static class DocsInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return new DocsLineRecordReader();
}
}
public static class DocsLineRecordReader extends RecordReader<Text, Text> {
private Text key = new Text();
private Text value = new Text();
private int currentword = 0;
private String fileline;
private File file = null;
private String line;
private HWPFDocument document;
private WordExtractor extractor = null;
private String[] filedata;
StringBuilder sb = new StringBuilder();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) split;
final Path file = fileSplit.getPath();
Configuration conf = context.getConfiguration();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream filein = fs.open(fileSplit.getPath());
String Delim = conf.get("Delim");
if (filein != null)
{
HWPFDocument document = new HWPFDocument(filein);
extractor = new WordExtractor(document);
fileline = extractor.getText();
filedata = fileline.split(Delim);
}
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException
{
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
if(currentword < filedata.length)
{
for ( currentword=0;currentword < filedata.length; currentword++)
{
sb.append(filedata[currentword] +",");
line = sb.toString();
}
key.set(line);
value.set("");
return true;
}
else
{
key = null;
value = null;
return false;
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (100.0f / filedata.length * currentword) / 100.0f;
}
@Override
public void close() throws IOException {
}
}
public static class Map extends Mapper<Text, Text, Text, Text>{
public void map(Text key, Text value, Context context) throws IOException, InterruptedException
{
context.write(key,value);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Docsparser");
job.setJarByClass(Docsparser.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
job.setInputFormatClass(DocsInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Exception details:
15/09/28 03:50:04 INFO mapreduce.Job: Task Id :
attempt_1443193152998_2319_m_000000_2, Status : FAILED Error: java.lang.NullPointerException
at java.lang.String.split(String.java:2272)
at java.lang.String.split(String.java:2355)
at com.nielsen.grfe.Docsparser$DocsLineRecordReader.initialize(Docsparser.java:66)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:548)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:786)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
All the configuration variables have to be set before the Job instance is created. Move
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
before
Job job = new Job(conf, "Docsparser");
The NullPointerException occurs in the split call on the fileline string: because "Delim" is set on the Configuration only after the Job has already copied it, conf.get("Delim") returns null inside the record reader, and so the variable Delim is null.
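For clarity, here is a sketch of the reordered main(); it uses the same calls as in the question, with only the Delim setting moved before the Job is constructed:
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Set "Delim" before creating the Job: the Job copies the Configuration,
    // so values set afterwards are not visible to the tasks.
    Delimiter = args[2].toString();
    conf.set("Delim", Delimiter);
    Job job = new Job(conf, "Docsparser");
    job.setJarByClass(Docsparser.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(DocsInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}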

MapReduce not reducing?

I'm following the tutorial at http://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html and this is my code
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;
public class WordCount {
public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text word = new Text();
private final IntWritable one = new IntWritable(1);
@Override
public void map(Object key, Text val, Context context) throws IOException, InterruptedException {
String line = val.toString();
StringTokenizer tokenizer = new StringTokenizer(line.toLowerCase());
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> value, Context context) throws IOException, InterruptedException {
int sum = 0;
while (value.hasNext()) {
IntWritable val = (IntWritable) value.next();
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration config = new Configuration();
Job job = Job.getInstance(config, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("/user/Icarus/words.txt"));
FileOutputFormat.setOutputPath(job, new Path("/user/Icarus/words.out"));
job.waitForCompletion(true);
}
}
But when I run it, instead of calculating the word frequencies, I get this:
bye 1
goodbye 1
hadoop 1
hadoop 1
hello 1
hello 1
hello 1
world 1
I must have missed something very trivial, but I can't figure out what. Help please.
The root cause of this problem is that you are not implementing reduce() with the exact signature Hadoop expects. The signature should be as below (reference here):
protected void reduce(KEYIN key, Iterable<VALUEIN> values, org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
Since your reduce() does not match this signature, Hadoop never calls it and falls back to the default identity implementation in the base Reducer class, which writes its input out unchanged.
That is why you are getting the same output from the reducer as from the map. (Annotating the method with @Override would have made the compiler catch the mismatch.)
I can suggest two solutions for this problem.
First: try the code below.
public static class WordCountReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
Second: the second solution is quite simple.
Instead of defining the reducer class manually, just set the reducer class to IntSumReducer or LongSumReducer, which do the same as the code above.
So don't define the WordCountReducer class, and add the following code,
job.setReducerClass(LongSumReducer.class); or
job.setReducerClass(IntSumReducer.class);
based on the count type you want.
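If you go this route, note that (as far as I recall) these built-in reducers live in the org.apache.hadoop.mapreduce.lib.reduce package, so you would also need the matching import:
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
// or
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;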
Hope it helps!

I have 10 files with CSV and TSV data. I want output showing which is the CSV data and which is the TSV data, using MapReduce in Apache Hadoop

One file contains data like this:
robert 10,20,30
john 10,30,20
Another file contains data like:
surya 10|20|30
sumanth 30|40|10
Like this there are 10 files, and I want output identifying which data is comma-separated and which is pipe-separated, using MapReduce.
Here's the code to replace the comma delimiter with a pipe and combine all the lists for the same name into one:
package my.reader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class ReadRows {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
if (res[1].contains(",")) {
res[1] = res[1].replace(',','|');
}
context.write(new Text(res[0]), new Text(res[1]));
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
String res = "";
for(Text val : values) {
res += "|" + val.toString();
}
context.write(key, new Text(res.substring(1)));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 2) {
System.err.println("Usage: my.reader.ReadRows <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "ReadRows");
job.setJarByClass(ReadRows.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And here's the code just to parse them and calculate max:
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
String[] sal;
if (res[1].contains(",")) {
sal = res[1].split(",");
} else {
sal = res[1].split("\\|");
}
Integer maxSal = 0;
for ( String s : sal ) {
maxSal = Math.max(Integer.valueOf(s), maxSal);
}
context.write(new Text(res[0]), new IntWritable(maxSal));
}
}
public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
Integer maxSal = 0;
for(IntWritable val : values) {
maxSal = Math.max(val.get(), maxSal);
}
context.write(key, new IntWritable(maxSal));
}
}
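A minimal driver sketch for this max variant might look like the following; the enclosing class name MaxRows is hypothetical, and it assumes the same imports as ReadRows plus org.apache.hadoop.io.IntWritable. Only the value classes change compared with the ReadRows driver above:
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "MaxRows");
    job.setJarByClass(MaxRows.class);   // hypothetical enclosing class
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}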

reading parameter in hadoop mapreduce

I am new to Hadoop MapReduce. I am trying to implement search in MapReduce, so my input file is like this:
key1 value1,value3
key2 value2,value6
I want to find the list of values for a key which the user will pass as a command-line argument. For this, my main (driver) class is like this:
public static void main(String[] args) {
JobClient client = new JobClient();
JobConf conf = new JobConf(NameSearchJava.class);
// right now I am hard-coding the search key (Joy); later I'll
// try to pass it as an argument when running the job from hadoop.
conf.set("searcKey", "Joy");
conf.setJobName("Search");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
conf.setMapperClass(SearchMapper.class);
conf.setReducerClass(SearchReducer.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (Exception e) {
e.printStackTrace();
}
}
}
and my configure function is:
String item ;
public void configure(JobConf job) {
{
item = job.get("test");
System.out.println(item);
System.err.println("search" + item);
}
Where should I write the configure function, in the Mapper or the Reducer? How can I use this item parameter to do the comparison in the reducer? Is this the correct way to take parameters in Hadoop?
To add on to Hadooper's answer, this is the full code.
You can refer to Hadooper's answer for the explanation.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author Unmesha sreeveni
* @date 23 sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean success = job.waitForCompletion(true);
return (success ? 0 : 1);
}
}
Read the command line argument in the Driver class as follows -
conf.set("searchKey", args[2]);
where args[2] will be the search key passed as the third argument.
The configure method should be coded in the Mapper as follows -
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("searchKey");
}
This makes the key to be searched available in the Mapper.
You can perform the comparison in the Mapper itself using logic like the following:
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split(" ");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
Let me know if this helps!
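One more detail worth checking against the original snippet: the driver sets the property under the name "searcKey" while configure() reads "test", so the lookup will always return null. Whichever name you pick, it has to match on both sides, for example:
// in the driver:
conf.set("searchKey", "Joy");
// in configure():
item = job.get("searchKey");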

Hadoop JobConf class is deprecated, need updated example

I am writing Hadoop programs, and I really don't want to play with deprecated classes.
Nowhere online am I able to find programs that use the updated org.apache.hadoop.conf.Configuration class instead of the org.apache.hadoop.mapred.JobConf class.
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Test.class);
conf.setJobName("TESST");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
This is what my main() looks like.
Can anyone please provide me with the updated version?
Here is the classic WordCount example. You'll notice a ton of other imports that may not be necessary; reading the code you'll figure out which is which.
What's different? I'm using the Tool interface and the GenericOptionsParser to parse the job command, a.k.a.: hadoop jar ....
In the mapper you'll notice a run method. You can get rid of it; it's usually called by default when you supply the code for the map method. I put it there to show that you can further control the mapping stage. This is all using the new API. I hope you find it useful. Any other questions, let me know!
import java.io.IOException;
import java.util.*;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.GenericOptionsParser;
public class Inception extends Configured implements Tool{
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
public void run (Context context) throws IOException, InterruptedException {
setup(context);
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
cleanup(context);
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(new Configuration());
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(Inception.class);
job.submit();
return 0;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
ToolRunner.run(new Inception(), otherArgs);
}
}
Also take the classic WordCount as an example:
org.apache.hadoop.mapred.JobConf is the old API; in the new version we use Configuration and Job to achieve the same thing.
Please use org.apache.hadoop.mapreduce.lib.* (the new API) instead of org.apache.hadoop.mapred.TextInputFormat (the old one).
The Mapper and Reducer are nothing new; please see the main function, it includes a fairly complete set of configurations, feel free to change them according to your specific requirements.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text outputKey;
private IntWritable outputVal;
@Override
public void setup(Context context) {
outputKey = new Text();
outputVal = new IntWritable(1);
}
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer stk = new StringTokenizer(value.toString());
while(stk.hasMoreTokens()) {
outputKey.set(stk.nextToken());
context.write(outputKey, outputVal);
}
}
}
class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result;
@Override
public void setup(Context context) {
result = new IntWritable();
}
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable val: values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public class WordCount {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
if(args.length != 2) {
System.err.println("Usage: <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "Word Count");
// set jar
job.setJarByClass(WordCount.class);
// set Mapper, Combiner, Reducer
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
/* Optional, set a custom Partitioner:
* job.setPartitionerClass(MyPartitioner.class);
*/
// set output key
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// set input and output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// by default, Hadoop uses TextInputFormat and TextOutputFormat
// any custom input and output class must implement the InputFormat/OutputFormat interface
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
