Group correspoding key and values

Group correspoding key and values - hadoop

I have a use case to write a map reducing code where I have to group the values corresponding to the same queue:
Input:
A,B
A,C
B,A
B,D
Output:
A {B,C}
B {A,D}
I have written this code:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class GroupKeyValues {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context con)
throws IOException, InterruptedException {
Text myKey = new Text();
Text myVal = new Text();
String line = value.toString();
StringTokenizer st = new StringTokenizer(line);
while (st.hasMoreTokens()) {
String thisH = st.nextToken();
String[] splitData = thisH.split(",");
myKey.set(splitData[0]);
myVal.set(splitData[1]);
}
con.write(myKey, myVal);
}
}
#SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
#SuppressWarnings("deprecation")
Job job = new Job(conf, "GroupKeyValues");
job.setJarByClass(GroupKeyValues.class);
job.setMapperClass(Map.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path outputPath = new Path(args[1]);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
outputPath.getFileSystem(conf).delete(outputPath);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

You are missing reducer that will aggregate values into a single "row" value. For example, you can use ArrayWritable like this:
public static class AggregatingReducer extends Reducer<Text, Text, Text, ArrayWritable> {
private ArrayWritable result = new ArrayWritable(Text.class);
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
List<Text> list = new ArrayList<>();
for (Text value : values) {
list.add(value);
}
result.set(list.toArray(new Text[list.size()]));
context.write(key, result);
}
}
In the job setup, make sure to add this:
job.setReducerClass(AggregatingReducer.class);
job.setOutputValueClass(ArrayWritable.class); //instead of Text.class
Alternatively (depending on what you need) you can concatenate reducer values into StringBuilder and emit Text instead of accumulating it into and emitting it as ArrayWritable.
UPDATE:
Here is the example of StringBuilder use with comma delimiter:
public static class AggregatingReducer extends Reducer<Text, Text, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuilder sb = new StringBuilder();
for (Text value : values) {
if (sb.length() != 0) {
sb.append(',');
}
sb.append(value);
}
result.set(sb.toString());
context.write(key, result);
}
}
In the driver value type needs to be changed back to Text:
job.setOutputValueClass(Text.class);

Are you consider using Apache Spark to solve the issue?
The code might look like this
import org.apache.spark.sql.functions._
val df = sqlContext.createDataFrame(Seq(("A","B"),("A","C"),("B","A"),("B","D")))
val dfAgg = df.groupBy("_1").agg(collect_list("_2"))
dfAgg.show()

Related

Hadoop multiple input files error

Im trying to read 2 file from hdfs input with below code but I face with error as follow
I am beginner in mapreduce programing and stuck on this problem for couple of days,any help will be appreciated.
My code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class Recipe {
public static class TokenizerMapper1
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class TokenizerMapper2
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line=value.toString();
word.set(line.substring(2,8));
context.write(word,one);
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: recipe <in> <out>");
System.exit(2);
}
#SuppressWarnings("deprecation")
Job job = new Job(conf, "Recipe");
job.setJarByClass(Recipe.class);
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
MultipleInputs.addInputPath(job,new Path(args[0]),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path(args[1]),TextInputFormat.class,TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
//FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/in"));
//FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
// job.submit();
}
And i've set program run configuration arguments like this:
/in /put
Error:
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 2
at Recipe.main(Recipe.java:121)

There are several issues. Program is expecting 3 parameters and you are passing only 2. Also if you have to process multiple input formats you need to use MultipleInputs.
Assume that you invoke program /in1 /in2 /out
MultipleInputs.addInputPath(job, args[0], TokenizerMapper1.class, FirstMapper.class);
MultipleInputs.addInputPath(job, args[1], TokenizerMapper2.class, SecondMapper.class);
You can remove these lines from the code:
job.setMapperClass(TokenizerMapper1.class);
job.setMapperClass(TokenizerMapper2.class);

Now it works with the following modifications:
Put every file in a separate directory.
Use real address instead of arg[], as shown below:
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in1"),TextInputFormat.class,TokenizerMapper1.class);
MultipleInputs.addInputPath(job,new Path("hdfs://localhost:9000/in2"),TextInputFormat.class,TokenizerMapper1.class);
FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/out"));
Specify all input and output paths in run configurations\arguments like this:
127.0.0.1:9000/in1/DNAIn.txt 127.0.0.1:9000/in2/DNAIn2.txt 127.0.0.1:9000/out

i have 10 files with CSV and TSV. I want to get the output what is CSV and TSV data using MapReduce in Apache Hadoop

one file is containing data like this
robert 10,20,30
john 10,30,20
Another file containing data like
surya 10|20|30
sumanth 30|40|10
like this 10 files i want to get the output data what is coma separated and pipe separated using Map Reduce

Here's the code to replace comma delimiter with pipe and combine all the lists for the same surname into one
package my.reader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class ReadRows {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
if (res[1].contains(",")) {
res[1] = res[1].replace(',','|');
}
context.write(new Text(res[0]), new Text(res[1]));
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
String res = "";
for(Text val : values) {
res += "|" + val.toString();
}
context.write(key, new Text(res.substring(1)));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 2) {
System.err.println("Usage: my.reader.ReadRows <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "ReadRows");
job.setJarByClass(ReadRows.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And here's the code just to parse them and calculate max:
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] res = value.toString().split("\t");
String[] sal;
if (res[1].contains(",")) {
sal = res[1].split(",");
} else {
sal = res[1].split("\\|");
}
Integer maxSal = 0;
for ( String s : sal ) {
maxSal = max(Integer.valueOf(s), maxSal);
}
context.write(new Text(res[0]), new IntWritable(maxSal));
}
}
public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
Integer maxSal = 0;
for(IntWritable val : values) {
maxSal = max(val.get(), maxSal);
}
context.write(key, new IntWritable(maxSal));
}
}

reading parameter in hadoop mapreduce

I am new to hadoop mapreduce.I am trying to implement search in map reduce so my input file is like this
key1 value1,value3
key2 value2,value6
I want to find values list for key which user will pass as command line argument.for this my main (driver) class is like this
public static void main(String[] args) {
JobClient client = new JobClient();
JobConf conf = new JobConf(NameSearchJava.class);
// write now I am trying with writing search key in code (Joy),later I'll
//try to pass argument while running job from hadoop.
conf.set("searcKey", "Joy");
conf.setJobName("Search");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
conf.setMapperClass(SearchMapper.class);
conf.setReducerClass(SearchReducer.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (Exception e) {
e.printStackTrace();
}
}
}
and my configure function is:
String item ;
public void configure(JobConf job) {
{
item = job.get("test");
System.out.println(item);
System.err.println("search" + item);
}
where should I write configure function in Mapper or Reducer.How can I use this item parameter to do comparison in reducer .Is this the correct way to take parameters in hadoop ?

Add on to Hadooper's Answer.
This is the full Code.
You can refer Hadooper's answer for explanation.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* #author Unmesha sreeveni
* #Date 23 sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
#Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean sucess = job.waitForCompletion(true);
return (sucess ? 0 : 1);
}
}

Read the command line argument in the Driver class as follows -
conf.set("searchKey", args[2]);
where args[2] will be the search-key passed as third argument.
The configure method should be coded in the Mapper as follows -
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("searchKey");
}
This will bring your key to be searched in the Mapper function.
You can perform the comparison in the Mapper itself using the logic as follows -
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split(" ");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
Let me know if this helps!

Mutual words in files using hadoop mapreduce

I have been trying to execute some code that would allow me to 'only' list the words that exist in multiple files; what I have done so far was use the wordcount example and thanx to Chris White I managed to compile it. I tried reading here and there to get the code to work but all I am getting is a blank page with no data. the mapper is suppose to collect each word with its corresponding locations; the reducer is suppose to collect the common words any thoughts as to what might be the problem? the code is:
package org.myorg;
import java.io.IOException;
import java.util.*;
import java.lang.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private Text outvalue=new Text();
private String filename = null;
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
if (filename == null)
{
filename = ((FileSplit) reporter.getInputSplit()).getPath().getName();
}
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens())
{
word.set(tokenizer.nextToken());
outvalue.set(filename);
output.collect(word, outvalue);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private Text src = new Text();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
int sum = 0;
//List<Text> list = new ArrayList<Text>();
while (values.hasNext()) // I believe this would have all locations of the same word in different files?
{
sum += values.next().get();
src =values.next().get();
}
output.collect(key, src);
//while(values.hasNext())
//{
//Text value = values.next();
//list.add(new Text(value));
//System.out.println(value.toString());
//}
//System.out.println(values.toString());
//for(Text value : list)
//{
//System.out.println(value.toString());
//}
}
}
public static void main(String[] args) throws Exception
{
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setInputFormat(KeyValueTextInputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
//conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Am I missing anything?
much obliged...
My Hadoop version : 0.20.203

First of all it seems you're using the old Hadoop API (mapred), and a word of advice would be to use the new Hadoop API (mapreduce) which is compatible with 0.20.203
In the new API, here is a wordcount that will work
import java.io.IOException;
import java.lang.InterruptedException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
/**
* The map class of WordCount.
*/
public static class TokenCounterMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
/**
* The reducer class of WordCount
*/
public static class TokenCounterReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
/**
* The main entry point.
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "Example Hadoop 0.20.1 WordCount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenCounterMapper.class);
job.setReducerClass(TokenCounterReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Then, we build this file and pack the result into a jar file:
mkdir classes
javac -classpath /path/to/hadoop-0.20.203/hadoop-0.20.203-core.jar:/path/to/hadoop- 0.20.203/lib/commons-cli-1.2.jar -d classes WordCount.java && jar -cvf wordcount.jar -C classes/ .
Finally, we run the jar file in standalone mode of Hadoop
echo "hello world bye world" > /tmp/in/0.txt
echo "hello hadoop goodebye hadoop" > /tmp/in/1.txt
hadoop jar wordcount.jar org.packagename.WordCount /tmp/in /tmp/out

In the reducer, maintain a set of the values observed (the filenames emitted in the mapper), if after you consume all the values, this set size is 1, then the word is only used in one file.
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
private TreeSet<Text> files = new TreeSet<Text>();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
files.clear();
for (Text file : values)
{
if (!files.contains(value))
{
// make a copy of value as hadoop re-uses the object
files.add(new Text(value));
}
}
if (files.size() == 1) {
output.collect(key, files.first());
}
files.clear();
}
}

Hadoop JobConf class is deprecated , need updated example

I am writing hadoop programs , and i really dont want to play with deprecated classes .
Anywhere online i am not able to find programs with updated
org.apache.hadoop.conf.Configuration
class
insted of
org.apache.hadoop.mapred.JobConf
class.
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Test.class);
conf.setJobName("TESST");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
This is how my main() looks like.
Can please anyone will provide me with updated function.

Here it's the classic WordCount example. You'll notice a tone of other imports that may not be necessary, reading the code you'll figure out which is which.
What's different? I'm using the Tool interface and the GenericOptionParser to parse the job command a.k.a : hadoop jar ....
In the mapper you'll notice a run thing. You can get rid of that, it's usually called by default when you supply the code for the Map method. I put it there to give you the info that you can further control the mapping stage. This is all using the new API. I hope you find it useful. Any other questions, let me know!
import java.io.IOException;
import java.util.*;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.GenericOptionsParser;
public class Inception extends Configured implements Tool{
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
public void run (Context context) throws IOException, InterruptedException {
setup(context);
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
cleanup(context);
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(new Configuration());
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(WordCount.class);
job.submit();
return 0;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
ToolRunner.run(new WordCount(), otherArgs);
}
}

Also take classic WordCount as example:
org.apache.hadoop.mapred.JobConf is old, in new version we use Configuration and Job to achieve.
Please use org.apache.hadoop.mapreduce.lib.* (it is new API) instead of org.apache.hadoop.mapred.TextInputFormat (it is old).
The Mapper and Reducer are nothing new, please see main function, it includes relatively overall configurations, feel free to change them according to your specific requirements.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text outputKey;
private IntWritable outputVal;
#Override
public void setup(Context context) {
outputKey = new Text();
outputVal = new IntWritable(1);
}
#Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer stk = new StringTokenizer(value.toString());
while(stk.hasMoreTokens()) {
outputKey.set(stk.nextToken());
context.write(outputKey, outputVal);
}
}
}
class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result;
#Override
public void setup(Context context) {
result = new IntWritable();
}
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable val: values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public class WordCount {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
if(args.length != 2) {
System.err.println("Usage: <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "Word Count");
// set jar
job.setJarByClass(WordCount.class);
// set Mapper, Combiner, Reducer
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
/* Optional, set customer defined Partioner:
* job.setPartitionerClass(MyPartioner.class);
*/
// set output key
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// set input and output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// by default, Hadoop use TextInputFormat and TextOutputFormat
// any customer defined input and output class must implement InputFormat/OutputFormat interface
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Group correspoding key and values - hadoop

Are you consider using Apache Spark to solve the issue? The code might look like this import org.apache.spark.sql.functions._ val df = sqlContext.createDataFrame(Seq(("A","B"),("A","C"),("B","A"),("B","D"))) val dfAgg = df.groupBy("_1").agg(collect_list("_2")) dfAgg.show()

Related

Hadoop multiple input files error

i have 10 files with CSV and TSV. I want to get the output what is CSV and TSV data using MapReduce in Apache Hadoop

reading parameter in hadoop mapreduce

Mutual words in files using hadoop mapreduce

Hadoop JobConf class is deprecated , need updated example

Categories

Resources