Using Multiple Mappers for multiple output directories in Hadoop MapReduce

I want to run two mappers that produce two different outputs in different directories. The output of the first mapper (passed as an argument) should be sent as the input of the second mapper. I have this code in the driver class:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Export_Column_Mapping
{
private static String[] Detail_output_column_array = new String[27];
private static String[] Shop_output_column_array = new String[8];
private static String details_output = null ;
private static String Shop_output = null;
public static void main(String[] args) throws Exception
{
String Output_filetype = args[3];
String Input_column_number = args[4];
String Output_column_number = args[5];
Configuration Detailsconf = new Configuration(false);
Detailsconf.setStrings("output_filetype",Output_filetype);
Detailsconf.setStrings("Input_column_number",Input_column_number);
Detailsconf.setStrings("Output_column_number",Output_column_number);
Job Details = new Job(Detailsconf," Export_Column_Mapping");
Details.setJarByClass(Export_Column_Mapping.class);
Details.setJobName("DetailsFile_Job");
Details.setMapperClass(DetailFile_Mapper.class);
Details.setNumReduceTasks(0);
Details.setInputFormatClass(TextInputFormat.class);
Details.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Details, new Path(args[0]));
FileOutputFormat.setOutputPath(Details, new Path(args[1]));
if(Details.waitForCompletion(true))
{
Configuration Shopconf = new Configuration();
Job Shop = new Job(Shopconf,"Export_Column_Mapping");
Shop.setJarByClass(Export_Column_Mapping.class);
Shop.setJobName("ShopFile_Job");
Shop.setMapperClass(ShopFile_Mapper.class);
Shop.setNumReduceTasks(0);
Shop.setInputFormatClass(TextInputFormat.class);
Shop.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Shop, new Path(args[1]));
FileOutputFormat.setOutputPath(Shop, new Path(args[2]));
MultipleOutputs.addNamedOutput(Shop, "text", TextOutputFormat.class,LongWritable.class, Text.class);
System.exit(Shop.waitForCompletion(true) ? 0 : 1);
}
}
public static class DetailFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String str_Output_filetype = context.getConfiguration().get("output_filetype");
String str_Input_column_number = context.getConfiguration().get("Input_column_number");
String[] input_columns_number = str_Input_column_number.split(",");
String str_Output_column_number= context.getConfiguration().get("Output_column_number");
String[] output_columns_number = str_Output_column_number.split(",");
String str_line = value.toString();
String[] input_column_array = str_line.split(",");
try
{
for(int i = 0;i<=input_column_array.length+1; i++)
{
int int_outputcolumn = Integer.parseInt(output_columns_number[i]);
int int_inputcolumn = Integer.parseInt(input_columns_number[i]);
if((int_inputcolumn != 0) && (int_outputcolumn != 0) && output_columns_number.length == input_columns_number.length)
{
Detail_output_column_array[int_outputcolumn-1] = input_column_array[int_inputcolumn-1];
if(details_output != null)
{
details_output = details_output+" "+ Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output+" "+ Shop_output_column_array[int_outputcolumn-1];
}else
{
details_output = Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output_column_array[int_outputcolumn-1];
}
}
}
}catch (Exception e)
{
}
context.write(null,new Text(details_output));
}
}
public static class ShopFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
try
{
for(int i = 0;i<=Shop_output_column_array.length; i++)
{
Shop_output_column_array[0] = Detail_output_column_array[0];
Shop_output_column_array[1] = Detail_output_column_array[1];
Shop_output_column_array[2] = Detail_output_column_array[2];
Shop_output_column_array[3] = Detail_output_column_array[3];
Shop_output_column_array[4] = Detail_output_column_array[14];
if(details_output != null)
{
Shop_output = Shop_output+" "+ Shop_output_column_array[i];
}else
{
Shop_output = Shop_output_column_array[i-1];
}
}
}catch (Exception e){
}
context.write(null,new Text(Shop_output));
}
}
}
I get this error:
Error:org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
Input path does not exist:
file:/home/Barath.B.Natarajan.ap/rules/text.txt
I want to run the jobs one by one. Can anyone help me with this?

There is something called JobControl with which you will be able to achieve this.
Suppose there are two jobs A and B
ControlledJob A = new ControlledJob(/* Configuration for job A */);
ControlledJob B = new ControlledJob(/* Configuration for job B */);
B.addDependingJob(A);
JobControl jControl = new JobControl("Name");
jControl.addJob(A);
jControl.addJob(B);
Thread runJControl = new Thread(jControl);
runJControl.start();
while (!jControl.allFinished()) {
code = jControl.getFailedJobList().size() == 0 ? 0 : 1;
Thread.sleep(1000);
}
System.exit(code);
Initialize code at the beginning like this:
int code = 1;
Let the first job in your case be the first mapper with zero reducers, and the second job be the second mapper with zero reducers. The configuration should be such that the input path of B and the output path of A are the same.
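For reference, a minimal sketch of what this could look like with the two jobs from the question, assuming both Job objects (Details and Shop) are fully configured up front rather than inside the if block; ControlledJob and JobControl live in org.apache.hadoop.mapreduce.lib.jobcontrol:
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

// Wrap the already-configured jobs; Shop reads args[1], which Details writes.
ControlledJob controlledDetails = new ControlledJob(Details, null);
ControlledJob controlledShop = new ControlledJob(Shop, null);
controlledShop.addDependingJob(controlledDetails); // Shop starts only after Details succeeds

JobControl jControl = new JobControl("Export_Column_Mapping");
jControl.addJob(controlledDetails);
jControl.addJob(controlledShop);

// JobControl implements Runnable, so it can be driven from its own thread.
Thread runJControl = new Thread(jControl);
runJControl.start();

int code = 1;
while (!jControl.allFinished()) {
    code = jControl.getFailedJobList().size() == 0 ? 0 : 1;
    Thread.sleep(1000);
}
jControl.stop();
System.exit(code);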

Related

mapReduce to get desired output

Kindly point me in a direction to get my desired output.
Current output given:
Albania 3607 ++ Country minPopulation
Albania 418495 ++ Country maxPopulation
Desired Output
country city minPopulation
country city maxPopulation
Reducer Class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Handson3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
minValue = Math.min(minValue, value.get());
}
context.write(key, new IntWritable(minValue));
context.write(key, new IntWritable(maxValue));
}
}
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class handson3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length()-1);
String newString = country.concat(field[0].substring(1, field[0].length()-1));
String population = field[9].substring(1, field[9].length()-1);
String city = field[0].substring(1, field[0].length()-1);
if (!population.matches(".*\\d.*") || population.equals("")||
population.matches("([0-9].*)\\.([0-9].*)") ){
return;
}else{
populationVal = Integer.parseInt(population);
context.write(new Text(country),new IntWritable(populationVal));
}
}
}
Runner Class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class handsonJobRunner {
public int run(String[] args) throws Exception {
if(args.length !=2) {
System.err.println("Usage: Handson3 <input path> <outputpath>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(handsonJobRunner.class);
job.setJobName("Handson 3");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(handson3Mapper.class);
job.setReducerClass(Handson3Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
handsonJobRunner driver = new handsonJobRunner();
driver.run(args);
}
}
Thank you in advance, any pointers would be much appreciated.
You should send both the city and the population as the value to the reducer, and in the reducer select the city with the max and min population for each country.
Your mapper would be like this:
public class Handson3Mapper extends Mapper<LongWritable, Text, Text, Text> {
private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length() - 1);
String newString = country.concat(field[0].substring(1, field[0].length() - 1));
String population = field[9].substring(1, field[9].length() - 1);
String city = field[0].substring(1, field[0].length() - 1);
if (!population.matches(".*\\d.*") || population.equals("") ||
population.matches("([0-9].*)\\.([0-9].*)")) {
return;
} else {
populationVal = Integer.parseInt(population);
context.write(new Text(country), new Text(city + "-" + populationVal));
}
}
}
And Your reducer should change to this one:
public class Handson3Reducer extends Reducer<Text, Text, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String maxPopulationCityName = "";
String minPopulationCityName = "";
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (Text value : values) {
String[] array = value.toString().split("-");
int population = Integer.valueOf(array[1]);
if (population > maxValue) {
maxPopulationCityName = array[0];
maxValue = population;
}
if (population < minValue) {
minPopulationCityName = array[0];
minValue = population;
}
}
context.write(new Text(key + " " + minPopulationCityName), new IntWritable(minValue));
context.write(new Text(key + " " + maxPopulationCityName), new IntWritable(maxValue));
}
}

I want to show max,min and avg temperature using hadoop

My project is to show the max, min and avg temperature. I have already done it, but I have to show these functions using group by key. There are 4 radio buttons for year, month, date and city in my application. If I select one, it asks me to input the aggregate function (max, min, avg). For this I need to change my CompositeGroupKey class, but I don't have any idea how. So please help me, and suggest the changes that need to be made to the code.
The driver :
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature
{
public static void main(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.println("Please Enter the input and output parameters");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path (args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setMapOutputKeyClass(CompositeGroupKey.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(CompositeGroupKey.class);
job.setOutputValueClass(DoubleWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
The mapper :
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper <LongWritable, Text, CompositeGroupKey, IntWritable>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
int year = Integer.parseInt(line.substring(0,4));
String mnth = line.substring(7,10);
int date = Integer.parseInt(line.substring(10,12));
int temp= Integer.parseInt(line.substring(12,14));
CompositeGroupKey cntry = new CompositeGroupKey(year,mnth, date);
context.write(cntry, new IntWritable(temp));
}
}
The reducer :
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer <CompositeGroupKey, IntWritable, CompositeGroupKey, CompositeGroupkeyall >{
public void reduce(CompositeGroupKey key, Iterable<IntWritable> values , Context context) throws IOException,InterruptedException
{
Double max = Double.MIN_VALUE;
Double min =Double.MAX_VALUE;
for (IntWritable value : values )
{
min = Math.min(min, value.get());
max = Math.max(max, value.get());
}
CompositeGroupkeyall val =new CompositeGroupkeyall(max,min);
context.write(key, val);
}
}
And the composite key :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
class CompositeGroupKey implements WritableComparable<CompositeGroupKey> {
int year;
String mnth;
int date;
CompositeGroupKey(int y, String c, int d){
year = y;
mnth = c;
date = d;
}
CompositeGroupKey(){}
public void write(DataOutput out) throws IOException {
out.writeInt(year);
WritableUtils.writeString(out, mnth);
out.writeInt(date);
}
public void readFields(DataInput in) throws IOException {
this.year = in.readInt();
this.mnth = WritableUtils.readString(in);
this.date = in.readInt();
}
public int compareTo(CompositeGroupKey pop) {
if (pop == null)
return 0;
int intcnt;
intcnt = Integer.valueOf(year).toString().compareTo(Integer.valueOf(pop.year).toString());
if(intcnt != 0){
return intcnt;
}else if(mnth.compareTo(pop.mnth) != 0){
return mnth.compareTo(pop.mnth);
}else{
return Integer.valueOf(date).toString().compareTo(Integer.valueOf(pop.date).toString());
}
}
public String toString() {
return year + " :" + mnth.toString() + " :" + date;
}
}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
class CompositeGroupkeyall implements WritableComparable<CompositeGroupkeyall> {
Double max;
Double min;
CompositeGroupkeyall(double x, double y){
max = x ;
min = y ;
}
CompositeGroupkeyall(){}
public void readFields(DataInput in) throws IOException {
this.max = in.readDouble();
this.min = in.readDouble();
}
public void write(DataOutput out) throws IOException {
out.writeDouble(max);
out.writeDouble(min);
}
public int compareTo(CompositeGroupkeyall arg0) {
return -1;
}
public String toString() {
return max + " " + min +" " ;
}
}
You can create more key-value pairs as below and let the same reducer process the data; all the date/month/year aggregates will then be processed by the same reducer:
CompositeGroupKey cntry = new CompositeGroupKey(year, mnth, date);
CompositeGroupKey cntry_date = new CompositeGroupKey((int)0, "ALL", date);
CompositeGroupKey cntry_mnth = new CompositeGroupKey((int)0, mnth, (int) 1);
CompositeGroupKey cntry_year = new CompositeGroupKey(year, "ALL", (int) 1);
context.write(cntry, new IntWritable(temp));
context.write(cntry_date, new IntWritable(temp));
context.write(cntry_mnth, new IntWritable(temp));
context.write(cntry_year, new IntWritable(temp));
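Since the question also asks for the average, here is a hedged sketch (not part of the original answer) of how the reducer could additionally track a sum and count; it assumes CompositeGroupkeyall is extended with an avg field and a three-argument constructor, which are hypothetical changes:
// Hypothetical reducer extension that also emits the average temperature per composite key.
public void reduce(CompositeGroupKey key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    double max = -Double.MAX_VALUE;
    double min = Double.MAX_VALUE;
    long sum = 0;
    long count = 0;
    for (IntWritable value : values) {
        int temp = value.get();
        min = Math.min(min, temp);
        max = Math.max(max, temp);
        sum += temp;
        count++;
    }
    double avg = count == 0 ? 0.0 : (double) sum / count;
    // Assumes CompositeGroupkeyall gains an avg field and a (max, min, avg) constructor.
    context.write(key, new CompositeGroupkeyall(max, min, avg));
}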

Why mapper function not called when using SequenceFileInputFormat

I have spent two days on this issue. Thanks in advance if anyone can help! Here is the description:
The first mapper and reducer work well, and the output written with SequenceFileOutputFormat can be found in the output path.
First mapper:
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
}
}
First reducer:
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
This works well, and then I add my second mapper and reducer to deal with the output of the first part.
Second mapper:
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
Second reducer:
public static class MacInfoTestReducer
extends Reducer<Text,IntWritable,Text,Text> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
//do not use test reducer to make things simple
//secondJob.setReducerClass(MacInfoTestReducer.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
The second mapper function is not called when I run the code, and the output is generated with text like the following:
00:08:CA:6C:A2:81 com.hicapt.xike.IntArrayWritable@234265
It seems like the framework calls the identity mapper instead of mine. How do I change that so my mapper is called with SequenceFileInputFormat as the input format?
All the code is included below:
import java.io.IOException;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RawInputText {
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
private Text word = new Text();
private IntArrayWritable mapv = new IntArrayWritable();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = value.toString();
String[] valArray = line.split(",");
if(valArray.length == 6){
IntWritable[] valInts = new IntWritable[2];
word.set(valArray[0]+"-"+valArray[1]);
valInts[0] = new IntWritable(Integer.parseInt(valArray[2]));
valInts[1] = new IntWritable(Integer.parseInt(valArray[4]));
mapv.set(valInts);
context.write(word, mapv);
}
}
}
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
private Text macKey = new Text();
private IntArrayWritable macInfo = new IntArrayWritable();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String[] keyArray = key.toString().split("-");
if(keyArray.length < 2){
int a = 10;
a= 20;
}
String mac = keyArray[1];
String sen = keyArray[0];
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
int rssi = 0;
int ts = 0;
int i = 0;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
if(i%2 == 0)
rssi = eleVal.get();
else
ts = eleVal.get()/60;
i++;
}
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = 1;
}else{
minSig.rssi += rssi;
minSig.count += 1;
}
rssiTime.put(ts, minSig);
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
macKey.set(mac);
i = 0;
IntWritable[] valInts = new IntWritable[1+treeMap.size()*3];
valInts[i++] = new IntWritable(Integer.parseInt(sen));
Collection<Integer> macs = treeMap.keySet();
Iterator<Integer> it = macs.iterator();
while(it.hasNext()) {
int tsKey = it.next();
valInts[i++] = new IntWritable(tsKey);
valInts[i++] = new IntWritable(treeMap.get(tsKey).rssi);
valInts[i++] = new IntWritable(treeMap.get(tsKey).count);
}
macInfo.set(valInts);
context.write(macKey, macInfo);
}
}
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, new IntWritable(10));
//context.write(key, macInfo);
}
}
public static class MacSensorsTimeLocReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, macInfo);
}
}
public static class MacInfoTestReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String tmp = "";
for (IntArrayWritable val : values) {
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
tmp += String.valueOf(valval) + " ";
}
}
macInfo.set(tmp);
context.write(key, macInfo);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
/*
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
*/
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
//secondJob.setMapperClass(Mapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
secondJob.setReducerClass(MacInfoTestReducer.class);
//secondJob.setOutputKeyClass(Text.class);
//secondJob.setOutputValueClass(IntArrayWritable.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
}
}
package com.hicapt.xike;
public class MinuteSignalInfo {
public int sensor;
public int rssi;
public int count;
public MinuteSignalInfo() {
rssi = 0;
count = 0;
sensor = 0;
}
}
package com.hicapt.xike;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
public class IntArrayWritable extends ArrayWritable {
public IntArrayWritable() {
super(IntWritable.class);
}
/*
public void readFields(DataInput in) throws IOException{
super.readFields(in);
}
public void write(DataOutput out) throws IOException{
super.write(out);
}*/
}
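A likely explanation, offered as an observation rather than a verified fix: MacSensorsTimeLocMapper declares map(Text key, Iterable<IntArrayWritable> values, Context context), which does not override Mapper's map(KEYIN key, VALUEIN value, Context context), so the inherited default map, an identity pass-through, runs instead and writes the raw pairs seen in the output. A sketch of the expected signature:
public static class MacSensorsTimeLocMapper
        extends Mapper<Text, IntArrayWritable, Text, IntWritable> {
    @Override // with @Override, the compiler rejects a signature that does not match the base class
    protected void map(Text key, IntArrayWritable value, Context context)
            throws IOException, InterruptedException {
        // The mapper receives one IntArrayWritable per call; per-key grouping logic belongs in a
        // reducer, which does receive Iterable<IntArrayWritable> values for each key.
    }
}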

How to store input of input file array in Map Reduce(Java)

I've written a Linear Regression program in Java.
Input is -->
2,21.05
3,23.51
4,24.23
5,27.71
6,30.86
8,45.85
10,52.12
11,55.98
I want to store the input in an array like x[] = {2, 3, ..., 11} before processing the input in the reduce task, and then send that array variable to the reduce() function.
But my program only gets one value at a time.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LinearRegression {
public static class RegressionMapper extends
Mapper<LongWritable, Text, Text, CountRegression> {
private Text id = new Text();
private CountRegression countRegression = new CountRegression();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String tempString = value.toString();
String[] inputData = tempString.split(",");
String xVal = inputData[0];
String yVal = inputData[1];
countRegression.setxVal(Integer.parseInt(xVal));
countRegression.setyVal(Float.parseFloat(yVal));
id.set(xVal);
context.write(id, countRegression);
}
}
public static class RegressionReducer extends
Reducer<Text, CountRegression, Text, CountRegression> {
private CountRegression result = new CountRegression();
// static float meanX = 0;
// private float xValues[];
// private float yValues[];
static float xRed = 0.0f;
static float yRed = 0.3f;
static float sum = 0;
static ArrayList<Float> list = new ArrayList<Float>();
public void reduce(Text key, Iterable<CountRegression> values,
Context context) throws IOException, InterruptedException {
//float b = 0;
// while(values.iterator().hasNext())
// {
// xRed = xRed + values.iterator().next().getxVal();
// yRed = yRed + values.iterator().next().getyVal();
// }
for (CountRegression val : values) {
list.add(val.getxVal());
// list.add(val.getyVal());
// xRed += val.getxVal();
// yRed = val.getyVal();
// meanX += val.getxVal();
//xValues = val.getxVal();
}
for (int i=0; i< list.size(); i++) {
int lastIndex = list.listIterator().previousIndex();
sum += list.get(lastIndex);
}
result.setxVal(sum);
result.setyVal(yRed);
context.write(key, result);
}
}
public static class CountRegression implements Writable {
private float xVal = 0;
private float yVal = 0;
public float getxVal() {
return xVal;
}
public void setxVal(float x) {
this.xVal = x;
}
public float getyVal() {
return yVal;
}
public void setyVal(float y) {
this.yVal = y;
}
@Override
public void readFields(DataInput in) throws IOException {
xVal = in.readFloat();
yVal = in.readFloat();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeFloat(xVal);
out.writeFloat(yVal);
}
@Override
public String toString() {
return "y = "+xVal+" +"+yVal+" x" ;
}
}
public static void main(String[] args) throws Exception {
// Provides access to configuration parameters.
Configuration conf = new Configuration();
// Create a new Job It allows the user to configure the job, submit it, control its execution, and query the state.
Job job = new Job(conf);
//Set the user-specified job name.
job.setJobName("LinearRegression");
//Set the Jar by finding where a given class came from.
job.setJarByClass(LinearRegression.class);
// Set the Mapper for the job.
job.setMapperClass(RegressionMapper.class);
// Set the Combiner for the job.
job.setCombinerClass(RegressionReducer.class);
// Set the Reducer for the job.
job.setReducerClass(RegressionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CountRegression.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
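A common way to get every point into a single reduce() call, offered as a sketch rather than a verified fix for this exact program: have the mapper emit one constant key with the (x, y) pair as the value, then collect the pairs into lists in the reducer. With this change the job.setCombinerClass(...) line should be dropped and the driver's output value class updated, since the reducer below emits Text values. Names are illustrative and the CountRegression writable from the question is reused:
// Mapper: emit a single constant key so every (x, y) record reaches the same reduce() call.
public static class RegressionMapper extends Mapper<LongWritable, Text, Text, CountRegression> {
    private static final Text ALL = new Text("all");
    private final CountRegression pair = new CountRegression();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] inputData = value.toString().split(",");
        pair.setxVal(Integer.parseInt(inputData[0]));
        pair.setyVal(Float.parseFloat(inputData[1]));
        context.write(ALL, pair);
    }
}

// Reducer: one call now receives every pair and can build the x and y arrays before fitting.
public static class RegressionReducer extends Reducer<Text, CountRegression, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<CountRegression> values, Context context)
            throws IOException, InterruptedException {
        ArrayList<Float> xs = new ArrayList<Float>();
        ArrayList<Float> ys = new ArrayList<Float>();
        for (CountRegression val : values) {
            xs.add(val.getxVal());
            ys.add(val.getyVal());
        }
        // xs and ys now hold all points; compute the regression coefficients here.
        context.write(key, new Text("n=" + xs.size()));
    }
}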

Why does the last reducer stop with java heap error during merge step

I keep increasing the number of reducers, and I see that while all except one reducer run quickly and finish their job, the last reducer just hangs at the merge step with this message in its tasktracker log:
Down to the last merge-pass, with 3 segments left of total size: 171207264 bytes
... and after staying at this statement for a long time, it throws a Java heap error and starts some cleanup which just doesn't finish.
I increased the child.opts memory to 3.5GB (unable to go beyond this limit) and compressed the map output too.
What might be the cause?
Here is the driver code:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.task.timeout", "6000000");
conf.set("mapred.compress.map.output", "true");
Job job = new Job(conf, "FreebasePreprocess_Phase2");
job.setNumReduceTasks(6);
job.setJarByClass(FreebasePreprocess.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/user/watsonuser/freebase_data100m120m_output"));
FileOutputFormat.setOutputPath(job, new Path("/user/watsonuser/freebase_data100m120m_output_2"));
job.waitForCompletion(true);
}
Here is the mapper:
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
public class Map extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String[] entities = value.toString().split("\\t");
String[] strings = {"/type/object/type", "/common/topic/notable_for", "/type/user/usergroup"};
List<String> filteredPredicates = Arrays.asList(strings);
FileSplit fileSplit = (FileSplit)context.getInputSplit();
String filename = fileSplit.getPath().getName();
// System.out.println("File name "+filename);
if(filename.startsWith("part-r")) {
// if(filename.equalsIgnoreCase("quad.tsv")) {
//this is a quad dump file
String name = null;
String predicate = null;
String oid = null;
String outVal = null;
String outKey = null;
if(entities.length==3) {
oid = entities[0].trim();
predicate = entities[1].trim();
name = entities[2].trim();
/*if(predicate.contains("/type/object/name/lang"))
{
if(predicate.endsWith("/en"))
{*/
/*outKey = sid;
outVal = oid+"#-#-#-#"+"topic_name";
context.write(new Text(outKey), new Text(outVal));*/
/* }
}*/
outKey = oid;
outVal = predicate+"#-#-#-#"+name;
context.write(new Text(outKey), new Text(outVal));
}
}
else if(filename.equalsIgnoreCase("freebase-simple-topic-dump.tsv")) {
//this is a simple topic dump file
String sid = null;
String name = null;
String outKey = null;
String outVal = null;
if(entities.length>1) {
sid = entities[0];
name = entities[1];
outKey = sid;
outVal = name+"#-#-#-#"+"topic_name";
context.write(new Text(outKey), new Text(outVal));
}
}
}
}
Here is the reducer
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException
{
String name = null;
String sid = null;
String predicate = null;
String oid = null;
String id = null;
String outKey = null;
String outVal = null;
ArrayList<Text> valuesList = new ArrayList<Text>();
Iterator<Text> ite = values.iterator();
while(ite.hasNext()) {
Text t = ite.next();
Text txt = new Text();
txt.set(t.toString());
valuesList.add(txt);
String[] entities = t.toString().split("#-#-#-#");
if(entities[entities.length-1].equalsIgnoreCase("topic_name"))
{
name = entities[0];
}
}
for(int i=0; i<valuesList.size(); i++) {
{
Text t2 = valuesList.get(i);
String[] entities = t2.toString().split("#-#-#-#");
if(!entities[entities.length-1].contains("topic_name"))
{
if(name!=null) {
outKey = entities[1]+"\t"+entities[0]+"\t"+name;
}
else {
outKey = entities[1]+"\t"+entities[0]+"\t"+key.toString();
}
context.write(new Text(outKey), null);
}
}
}
}
My guess is that you have a single key with a huge number of values and the following line in your reducer is causing you problems:
valuesList.add(txt);
Let's say you had a key with 100m values; you're trying to build an ArrayList of size 100m, and at some stage your reducer JVM is going to run out of memory.
You can probably confirm this by putting in some debug and inspecting the logs for the reducer that never ends:
valuesList.add(txt);
if (valuesList.size() % 10000 == 0) {
System.err.println(key + "\t" + valuesList.size());
}
I haven't written raw MR in a while, but I would approach it in a way similar to this:
Keeping all values for a key in memory is always dangerous. I would instead add another MR phase to your job. In the first stage, emit newKey = (key, 0), newValue = value when the value contains "topic_name", and newKey = (key, 1), newValue = value when it doesn't. This requires writing a custom WritableComparable that can hold such a pair and knows how to sort it.
For the reducer in the next phase, write a partitioner that partitions on the first element of the new key. Because the composite keys are sorted before they reach the reducer, you are guaranteed to get the k,v pair carrying the name before the other k,v pairs for each original key, so you have access to the name for every value corresponding to that key.
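A rough sketch of the pair key and partitioner described above; the class names here are illustrative, not from the original code, and a grouping comparator on the original key would additionally let a single reduce() call see the name record followed by the rest:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Partitioner;

// Composite key: (original key, flag), where flag 0 marks the "topic_name" record and 1 everything else.
// Sorting by flag within each original key delivers the name record first to the reducer.
class TaggedKey implements WritableComparable<TaggedKey> {
    Text key = new Text();
    int flag;

    public void write(DataOutput out) throws IOException {
        key.write(out);
        out.writeInt(flag);
    }

    public void readFields(DataInput in) throws IOException {
        key.readFields(in);
        flag = in.readInt();
    }

    public int compareTo(TaggedKey other) {
        int cmp = key.compareTo(other.key);
        return cmp != 0 ? cmp : Integer.compare(flag, other.flag);
    }
}

// Partition on the original key only, so all records for a key reach the same reducer.
class TaggedKeyPartitioner extends Partitioner<TaggedKey, Text> {
    @Override
    public int getPartition(TaggedKey key, Text value, int numPartitions) {
        return (key.key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}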
