I want to show max,min and avg temperature using hadoop - hadoop

My project is to show max,min and avg temperature. I have already done it, but I have to show this functions using group by key. There are 4 radio buttons for Year, month, date and city in my application. If I select one then it will ask me to input the aggregate functions(max,min,avg). For these I need to change my CompositeGroupKey class, but I don't have any idea about that. So please help me, and provide inputs about the changes need to be done with the code.
The driver :
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature
{
public static void Main (String[] args) throws Exception
{
if (args.length != 2)
{
System.err.println("Please Enter the input and output parameters");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path (args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setMapOutputKeyClass(CompositeGroupKey.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(CompositeGroupKey.class);
job.setOutputValueClass(DoubleWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
The mapper :
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper <LongWritable, Text, CompositeGroupKey, IntWritable>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
int year = Integer.parseInt(line.substring(0,4));
String mnth = line.substring(7,10);
int date = Integer.parseInt(line.substring(10,12));
int temp= Integer.parseInt(line.substring(12,14));
CompositeGroupKey cntry = new CompositeGroupKey(year,mnth, date);
context.write(cntry, new IntWritable(temp));
}
}
The reducer :
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer <CompositeGroupKey, IntWritable, CompositeGroupKey, CompositeGroupkeyall >{
public void reduce(CompositeGroupKey key, Iterable<IntWritable> values , Context context) throws IOException,InterruptedException
{
Double max = Double.MIN_VALUE;
Double min =Double.MAX_VALUE;
for (IntWritable value : values )
{
min = Math.min(min, value.get());
max = Math.max(max, value.get());
}
CompositeGroupkeyall val =new CompositeGroupkeyall(max,min);
context.write(key, val);
}
}
And the composite key :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
class CompositeGroupKey implements WritableComparable<CompositeGroupKey> {
int year;
String mnth;
int date;
CompositeGroupKey(int y, String c, int d){
year = y;
mnth = c;
date = d;
}
CompositeGroupKey(){}
public void write(DataOutput out) throws IOException {
out.writeInt(year);
WritableUtils.writeString(out, mnth);
out.writeInt(date);
}
public void readFields(DataInput in) throws IOException {
this.year = in.readInt();
this.mnth = WritableUtils.readString(in);
this.date = in.readInt();
}
public int compareTo(CompositeGroupKey pop) {
if (pop == null)
return 0;
int intcnt;
intcnt = Integer.valueOf(year).toString().compareTo(Integer.valueOf(pop.year).toString());
if(intcnt != 0){
return intcnt;
}else if(mnth.compareTo(pop.mnth) != 0){
return mnth.compareTo(pop.mnth);
}else{
return Integer.valueOf(date).toString().compareTo(Integer.valueOf(pop.date).toString());
}
}
public String toString() {
return year + " :" + mnth.toString() + " :" + date;
}
}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
class CompositeGroupkeyall implements WritableComparable<CompositeGroupkeyall> {
Double max;
Double min;
CompositeGroupkeyall(double x, double y){
max = x ;
min = y ;
}
CompositeGroupkeyall(){}
public void readFields(DataInput in) throws IOException {
this.max = in.readDouble();
this.min = in.readDouble();
}
public void write(DataOutput out) throws IOException {
out.writeDouble(max);
out.writeDouble(min);
}
public int compareTo(CompositeGroupkeyall arg0) {
return -1;
}
public String toString() {
return max + " " + min +" " ;
}
}

You can create more key value pairs as below and let the same reducer process the data, all the date/month/year will be processed by the same reducer
CompositeGroupKey cntry = new CompositeGroupKey(year, mnth, date);
CompositeGroupKey cntry_date = new CompositeGroupKey((int)0, "ALL", date);
CompositeGroupKey cntry_mnth = new CompositeGroupKey((int)0, mnth, (int) 1);
CompositeGroupKey cntry_year = new CompositeGroupKey(year, "ALL", (int) 1);
context.write(cntry, new IntWritable(temp));
context.write(cntry_date, new IntWritable(temp));
context.write(cntry_mnth, new IntWritable(temp));
context.write(cntry_year, new IntWritable(temp));

Related

mapReduce to get desired output

Kindly point me in a direction to get my desired output
Current outPut given:
Albania 3607 ++ Country minPopulation
Albania 418495 ++ Country maxPopulation
Desired Output
country city minPopulation
country city maxPopulation
Reducer Class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Handson3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
minValue = Math.min(minValue, value.get());
}
context.write(key, new IntWritable(minValue));
context.write(key, new IntWritable(maxValue));
}
}
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class handson3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length()-1);
String newString = country.concat(field[0].substring(1, field[0].length()-1));
String population = field[9].substring(1, field[9].length()-1);
String city = field[0].substring(1, field[0].length()-1);
if (!population.matches(".*\\d.*") || population.equals("")||
population.matches("([0-9].*)\\.([0-9].*)") ){
return;
}else{
populationVal = Integer.parseInt(population);
context.write(new Text(country),new IntWritable(populationVal));
}
}
}
Runner Class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class handsonJobRunner {
public int run(String[] args) throws Exception {
if(args.length !=2) {
System.err.println("Usage: Handson3 <input path> <outputpath>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(handsonJobRunner.class);
job.setJobName("Handson 3");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(handson3Mapper.class);
job.setReducerClass(Handson3Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0:1);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
handsonJobRunner driver = new handsonJobRunner();
driver.run(args);
}
}
Thank you in advance, any pointers would be much appreciated.
You should send both city and population as value to reducer and at reducer select the city with max and min population for each country.
Your mapper would be like this:
public class Handson3Mapper extends Mapper<LongWritable, Text, Text, Text> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length() - 1);
String newString = country.concat(field[0].substring(1, field[0].length() - 1));
String population = field[9].substring(1, field[9].length() - 1);
String city = field[0].substring(1, field[0].length() - 1);
if (!population.matches(".*\\d.*") || population.equals("") ||
population.matches("([0-9].*)\\.([0-9].*)")) {
return;
} else {
populationVal = Integer.parseInt(population);
context.write(new Text(country), new Text(city + "-" + populationVal));
}
}
}
And Your reducer should change to this one:
public class Handson3Reducer extends Reducer<Text, Text, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String maxPopulationCityName = "";
String minPopulationCityName = "";
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
String[] array = value.toString().split("-");
int population = Integer.valueOf(array[1]);
if (population > maxValue) {
maxPopulationCityName = array[0];
maxValue = population;
}
if (population < minValue) {
minPopulationCityName = array[0];
minValue = population;
}
}
context.write(new Text(key + " " + minPopulationCityName), new IntWritable(minValue));
context.write(new Text(key + " " + maxPopulationCityName), new IntWritable(maxValue));
}
}

I'm getting the following error in My mapreduce code

I'm trying to get pearson's correlation for all pairs of column.
This is My MapReduce code :
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Pearson
{
public static class MyMapper extends Mapper<LongWritable,Text,IndexPair,ValuePair>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] tokens = line.split(",");
double[] arr = toDouble(tokens);
for(int i=0; i < arr.length; i++) {
for(int j=i+1; j < arr.length; j++) {
IndexPair k2 = new IndexPair(i, j);
ValuePair v2 = new ValuePair(arr[i], arr[j]);
context.write(k2, v2);
}
}
}
public double[] toDouble(String[] tokens) {
double[] arr = new double[tokens.length];
for(int i=0; i < tokens.length; i++) {
arr[i] = Double.parseDouble(tokens[i]);
}
return arr;
}
}
public static class MyReduce extends Reducer<IndexPair,ValuePair,IndexPair,DoubleWritable>
{
public void reduce(IndexPair key, Iterable<ValuePair> values, Context context) throws IOException, InterruptedException {
double x = 0.0d;
double y = 0.0d;
double xx = 0.0d;
double yy = 0.0d;
double xy = 0.0d;
double n = 0.0d;
for(ValuePair pairs : values) {
x += pairs.v1;
y += pairs.v2;
xx += Math.pow(pairs.v1, 2.0d);
yy += Math.pow(pairs.v2, 2.0d);
xy += (pairs.v1 * pairs.v2);
n += 1.0d;
}
double numerator = xy - ((x * y) / n);
double denominator1 = xx - (Math.pow(x, 2.0d) / n);
double denominator2 = yy - (Math.pow(y, 2.0d) / n);
double denominator = Math.sqrt(denominator1 * denominator2);
double corr = numerator / denominator;
context.write(key, new DoubleWritable(corr));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Pearson's Correlation");
job.setJarByClass(Pearson.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyReduce.class);
job.setReducerClass(MyReduce.class);
job.setMapOutputKeyClass(IndexPair.class);
job.setMapOutputValueClass(ValuePair.class);
job.setOutputKeyClass(IndexPair.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And Code for IndexPair is this one :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class IndexPair implements WritableComparable<IndexPair>{
public static String[] labels
={"Year","Month","MEI","CO2","CH4","N2O","CFC-11","CFC-12","TSI","Aerosols","Temp"};
public long i,j;
public IndexPair()
{
}
public IndexPair(long i,long j) {
this.i=i;
this.j=j;
}
#Override
public void readFields(DataInput in) throws IOException {
i = in.readLong();
j = in.readLong();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeLong(i);
out.writeLong(j);
}
#Override
public int compareTo(IndexPair o) {
Long i1 = i;
Long j1 = j;
Long i2 = o.i;
Long j2 = o.j;
int result = i1.compareTo(i2);
if (0 == result) {
return j1.compareTo(j2);
}
return result;
}
#Override
public String toString()
{
return "Corelation between column "+labels[(int) i]+"-->"+ labels[(int)j];
}
}
And Code For value Pair is thisone :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class ValuePair implements WritableComparable<ValuePair>{
public double v1,v2;
public ValuePair()
{
}
public ValuePair(double i,double j)
{
v1=i;
v2=j;
}
#Override
public void readFields(DataInput in) throws IOException {
v1=in.readDouble();
v2=in.readDouble();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeDouble(v1);
out.writeDouble(v2);
}
#Override
public int compareTo(ValuePair o) {
// comparator for value pair is not required....
return 0;
}
}
But Whn I'm trying to execute this, I'm getting the following error
17/07/20 13:59:49 INFO mapreduce.Job: map 0% reduce 0%
17/07/20 13:59:53 INFO mapreduce.Job: Task Id : attempt_1500536519279_0007_m_000000_0, Status : FAILED
Error: java.io.IOException: wrong value class: class org.apache.hadoop.io.DoubleWritable is not class ValuePair
at org.apache.hadoop.mapred.IFile$Writer.append(IFile.java:194)
at org.apache.hadoop.mapred.Task$CombineOutputCollector.collect(Task.java:1411)
at org.apache.hadoop.mapred.Task$NewCombinerRunner$OutputConverter.write(Task.java:1728)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
at Pearson$MyReduce.reduce(Pearson.java:66)
at Pearson$MyReduce.reduce(Pearson.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1749)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1639)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1491)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:723)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:793)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1807)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:169)
The problem is that you use the reducer as a combiner:
job.setCombinerClass(MyReduce.class);
The output key and value types of the combiner should be the same as the ones of the mapper, while, when you use the reducer as a combiner, it tries to emit pairs of different types, hence the error.

Using Multiple Mappers for multiple output directories in Hadoop MapReduce

I want to run two mappers that produce two different outputs in different directories.The output of the first mapper(Send as argument) should be send to the input of the second mapper.i have this code in the driver class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Export_Column_Mapping
{
private static String[] Detail_output_column_array = new String[27];
private static String[] Shop_output_column_array = new String[8];
private static String details_output = null ;
private static String Shop_output = null;
public static void main(String[] args) throws Exception
{
String Output_filetype = args[3];
String Input_column_number = args[4];
String Output_column_number = args[5];
Configuration Detailsconf = new Configuration(false);
Detailsconf.setStrings("output_filetype",Output_filetype);
Detailsconf.setStrings("Input_column_number",Input_column_number);
Detailsconf.setStrings("Output_column_number",Output_column_number);
Job Details = new Job(Detailsconf," Export_Column_Mapping");
Details.setJarByClass(Export_Column_Mapping.class);
Details.setJobName("DetailsFile_Job");
Details.setMapperClass(DetailFile_Mapper.class);
Details.setNumReduceTasks(0);
Details.setInputFormatClass(TextInputFormat.class);
Details.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Details, new Path(args[0]));
FileOutputFormat.setOutputPath(Details, new Path(args[1]));
if(Details.waitForCompletion(true))
{
Configuration Shopconf = new Configuration();
Job Shop = new Job(Shopconf,"Export_Column_Mapping");
Shop.setJarByClass(Export_Column_Mapping.class);
Shop.setJobName("ShopFile_Job");
Shop.setMapperClass(ShopFile_Mapper.class);
Shop.setNumReduceTasks(0);
Shop.setInputFormatClass(TextInputFormat.class);
Shop.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Shop, new Path(args[1]));
FileOutputFormat.setOutputPath(Shop, new Path(args[2]));
MultipleOutputs.addNamedOutput(Shop, "text", TextOutputFormat.class,LongWritable.class, Text.class);
System.exit(Shop.waitForCompletion(true) ? 0 : 1);
}
}
public static class DetailFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String str_Output_filetype = context.getConfiguration().get("output_filetype");
String str_Input_column_number = context.getConfiguration().get("Input_column_number");
String[] input_columns_number = str_Input_column_number.split(",");
String str_Output_column_number= context.getConfiguration().get("Output_column_number");
String[] output_columns_number = str_Output_column_number.split(",");
String str_line = value.toString();
String[] input_column_array = str_line.split(",");
try
{
for(int i = 0;i<=input_column_array.length+1; i++)
{
int int_outputcolumn = Integer.parseInt(output_columns_number[i]);
int int_inputcolumn = Integer.parseInt(input_columns_number[i]);
if((int_inputcolumn != 0) && (int_outputcolumn != 0) && output_columns_number.length == input_columns_number.length)
{
Detail_output_column_array[int_outputcolumn-1] = input_column_array[int_inputcolumn-1];
if(details_output != null)
{
details_output = details_output+" "+ Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output+" "+ Shop_output_column_array[int_outputcolumn-1];
}else
{
details_output = Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output_column_array[int_outputcolumn-1];
}
}
}
}catch (Exception e)
{
}
context.write(null,new Text(details_output));
}
}
public static class ShopFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
try
{
for(int i = 0;i<=Shop_output_column_array.length; i++)
{
Shop_output_column_array[0] = Detail_output_column_array[0];
Shop_output_column_array[1] = Detail_output_column_array[1];
Shop_output_column_array[2] = Detail_output_column_array[2];
Shop_output_column_array[3] = Detail_output_column_array[3];
Shop_output_column_array[4] = Detail_output_column_array[14];
if(details_output != null)
{
Shop_output = Shop_output+" "+ Shop_output_column_array[i];
}else
{
Shop_output = Shop_output_column_array[i-1];
}
}
}catch (Exception e){
}
context.write(null,new Text(Shop_output));
}
}
}
I get the error..
Error:org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
Input path does not exist:
file:/home/Barath.B.Natarajan.ap/rules/text.txt
I want to run the jobs one by one can any one help me in this?...
There is something called jobcontrol with which you will be able to achieve it.
Suppose there are two jobs A and B
ControlledJob A= new ControlledJob(JobConf for A);
ControlledJob B= new ControlledJob(JobConf for B);
B.addDependingJob(A);
JobControl jControl = newJobControl("Name");
jControl.addJob(A);
jControl.addJob(B);
Thread runJControl = new Thread(jControl);
runJControl.start();
while (!jControl.allFinished()) {
code = jControl.getFailedJobList().size() == 0 ? 0 : 1;
Thread.sleep(1000);
}
System.exit(1);
Initialize code at the beginning like this:
int code =1;
Let the first job in your case be the first mapper with zero reducer and second job be the second mapper with zero reducer.The configuration should be such that the input path of B and output path of A should be same.

Hadoop Secondary sort - to use or not use

I have accidents input data from Traffic Data Analysis . Some of the columns are :
Accident Id, Accident Date, Day of week
1, 1/1/1979, 5 (Thursday)
2, 1/2/1979, 6 (Friday)
.......
3, 1/1/1980, 0 (Sunday)
I am trying to solve following :
Find number of accidents per year per day
so output should look like :
where Key is (Year, Day of week)
and Value= Number of accidents on that day
Here line 1 represents , year =1979 Day = Sunday and number of accidents =500 and so on.
1979,1 500
1979,2 1500
1979,3 2500
1979,4 3500
1979,5 4500
1979,6 5500
1979,7 6500
1980,1 500
1980,2 1500
1980,3 2500
1980,4 3500
1980,5 4500
In this scenario , I am trying to solve it using secondary sort method . Is that correct way to solve this problem ?
If secondary sort is correct way , its not working for me . Here is the key class, mapper and reducer. But my output doesn't come as expected . Please help ..
public class DOW implements WritableComparable<DOW> {
private Text year;
private Text day;
// private final Text count;
// private int count;
public DOW() {
this.year = new Text();
this.day = new Text();
// this.count = count;
}
public DOW(Text year, Text day) {
this.year = year;
this.day = day;
// this.count = count;
}
public Text getYear() {
return this.year;
}
public void setYear(Text year) {
this.year = year;
}
public Text getDay() {
return this.day;
}
public void setDay(Text day) {
this.day = day;
}
#Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
year.readFields(in);
day.readFields(in);
}
#Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
year.write(out);
day.write(out);
}
#Override
public int compareTo(DOW o) {
// TODO Auto-generated method stub
int cmp = year.compareTo(o.year);
if (cmp != 0) {
return cmp;
}
return o.day.compareTo(this.day);
}
#Override
public String toString() {
// TODO Auto-generated method stub
return year + "," + day;
}
#Override
public boolean equals(Object o) {
// TODO Auto-generated method stub
if (o instanceof DOW) {
DOW tp = (DOW) o;
return year.equals(tp.year) && day.equals(tp.day);
}
return false;
}
#Override
public int hashCode() {
// TODO Auto-generated method stub
return year.hashCode() * 163 + day.hashCode();
}
}
public class AccidentDowDemo extends Configured implements Tool {
public static class DOWMapper extends Mapper<LongWritable, Text, DOW, IntWritable> {
private static final Logger sLogger = Logger.getLogger(DOWMapper.class);
#Override
protected void map(LongWritable key, Text value, Context context)
throws java.io.IOException, InterruptedException {
if (value.toString().contains(",")) {
String[] array = value.toString().split(",");
if (!array[9].equals("Date")) {
Date dt = null;
try {
dt = new SimpleDateFormat("dd/mm/yyyy").parse(array[9]);
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
int year = dt.getYear();
int day = Integer.parseInt(array[10].toString());
context.write(new DOW(new Text(Integer.toString(year)),
new Text(Integer.toString(day))),
new IntWritable(1));
}
}
};
}
public static class DOWReducer extends Reducer<DOW, IntWritable, DOW, IntWritable> {
private static final Logger sLogger = Logger
.getLogger(DOWReducer.class);
#Override
protected void reduce(DOW key, Iterable<IntWritable> values,
Context context) throws java.io.IOException,
InterruptedException {
int count = 0;
sLogger.info("key =" + key);
for (IntWritable x : values) {
int val = Integer.parseInt(x.toString());
count = count + val;
}
context.write(key, new IntWritable(count));
};
}
public static class FirstPartitioner extends Partitioner<DOW, IntWritable> {
#Override
public int getPartition(DOW key, IntWritable value, int numPartitions) {
// TODO Auto-generated method stub
return Math.abs(Integer.parseInt(key.getYear().toString()) * 127)
% numPartitions;
}
}
public static class KeyComparator extends WritableComparator {
protected KeyComparator() {
super(DOW.class, true);
}
#Override
public int compare(WritableComparable w1, WritableComparable w2) {
// TODO Auto-generated method stub
DOW ip1 = (DOW) w1;
DOW ip2 = (DOW) w2;
int cmp = ip1.getYear().compareTo(ip2.getYear());
if (cmp == 0) {
cmp = -1 * ip1.getDay().compareTo(ip2.getDay());
}
return cmp;
}
}
public static class GroupComparator extends WritableComparator {
protected GroupComparator() {
super(DOW.class, true);
}
#Override
public int compare(WritableComparable w1, WritableComparable w2) {
// TODO Auto-generated method stub
DOW ip1 = (DOW) w1;
DOW ip2 = (DOW) w2;
return ip1.getYear().compareTo(ip2.getYear());
}
}
}
If you need to basically simulate
select year, day, count(*) as totalPerDay from DATA group by year, day
than you do not need secondary sort.
But if you need to produce something like a CUBE, where you need to calculate total per year and total per week in one MR job, than secondary sort is the way to go.
It is more or less kind of a secondary sorting but is not. The problem is with GroupComparator, the comparison has to be done both on year and day. The idea of groupcomparator is to make sure that the same year does go into same reducer but here we don't need that,instead the data has to go into same reducer if it has same year and same day(1979 and sunday). It should look something like this.
package accidentexercise;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class ClassGroupComparator extends WritableComparator
{
protected ClassGroupComparator()
{
super(TextpairWritable.class,true);
}
#SuppressWarnings("rawtypes")
public int compare(WritableComparable w,WritableComparable w1)
{
TextpairWritable s=(TextpairWritable)w;
TextpairWritable s1=(TextpairWritable)w1;
int cmp= s.year.compareTo(s1.year);
if(cmp==0)
{
cmp= -1*s.day.compareTo(s1.day);
}
return cmp;
}
}
I am pasting my whole code as well.
TextpairWritable:
package accidentexercise;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class TextpairWritable implements WritableComparable<TextpairWritable>
{
Text year=new Text();
Text day=new Text();
public TextpairWritable()
{
this.year=new Text();
this.day=new Text();
}
public TextpairWritable(Text year,Text day)
{
this.year=year;
this.day=day;
}
public TextpairWritable(String year,String day)
{
this.year=new Text(year);
this.day=new Text(day);
}
public TextpairWritable(TextpairWritable o)
{
this.year=o.year;
this.day=o.day;
}
public void set(Text year,Text day)
{
this.year=year;
this.day=day;
}
public Text getyear()
{
return this.year;
}
public Text getday()
{
return this.day;
}
#Override
public void readFields(DataInput in) throws IOException {
year.readFields(in);
day.readFields(in);
}
#Override
public void write(DataOutput out) throws IOException {
year.write(out);
day.write(out);
}
public String toString()
{
return year+" "+day;
}
public int compareTo(TextpairWritable o)
{
int cmp=year.compareTo(day);
if(cmp==0)
{
cmp=day.compareTo(day);
}
return cmp;
}
}
GroupComparator:
package accidentexercise;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class ClassGroupComparator extends WritableComparator
{
protected ClassGroupComparator()
{
super(TextpairWritable.class,true);
}
#SuppressWarnings("rawtypes")
public int compare(WritableComparable w,WritableComparable w1)
{
TextpairWritable s=(TextpairWritable)w;
TextpairWritable s1=(TextpairWritable)w1;
int cmp= s.year.compareTo(s1.year);
if(cmp==0)
{
cmp= -1*s.day.compareTo(s1.day);
}
return cmp;
}
}
SortComparator:
package accidentexercise;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class ClassSortComparator extends WritableComparator
{
protected ClassSortComparator()
{
super(TextpairWritable.class,true);
}
#SuppressWarnings("rawtypes")
public int compare(WritableComparable w,WritableComparable w1)
{
TextpairWritable s=(TextpairWritable)w;
TextpairWritable s1=(TextpairWritable)w1;
int cmp=s.year.compareTo(s1.year);
if(cmp==0)
{
cmp= -1*s.day.compareTo(s1.day);
}
return cmp;
}
}
Mapper:
package accidentexercise;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ClassMapper extends Mapper<LongWritable,Text,TextpairWritable,IntWritable>
{
public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException
{
Logger log=LoggerFactory.getLogger(ClassMapper.class) ;
String s=value.toString();
String[] orig_data=s.split(",");
SimpleDateFormat df=new SimpleDateFormat("dd/MM/yyyy");
df.setLenient(false);
try
{
#SuppressWarnings("unused")
Date date=df.parse(orig_data[0]);
String myyear=orig_data[0].substring(6, 10);
context.write(new TextpairWritable(new Text(myyear),new Text(orig_data[2])),new IntWritable(Integer.parseInt(orig_data[1])));
}
catch(ParseException e)
{
log.info("Date is not correct"+e);
}
}
}
Reducer:
package accidentexercise;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class ClassReducer extends Reducer<TextpairWritable,IntWritable,TextpairWritable,IntWritable>
{
public void reduce(TextpairWritable key,Iterable<IntWritable> value,Context context) throws IOException,InterruptedException
{
int count=0;
for(IntWritable it:value)
{
count+=it.get();
}
context.write(key,new IntWritable(count));
}
}
Driver:
package accidentexercise;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ClassDriver {
public static void main(String args[]) throws Exception
{
if(args.length!=2)
{
System.err.println("Usage: Worddrivernewapi <input path> <output path>");
System.exit(-1);
}
Job job=new Job();
job.setJarByClass(ClassDriver.class);
job.setJobName("MyDriver");
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(ClassMapper.class);
job.setPartitionerClass(ClassPartitioner.class);
job.setSortComparatorClass(ClassSortComparator.class);
job.setGroupingComparatorClass(ClassGroupComparator.class);
job.setReducerClass(ClassReducer.class);
//job.setNumReduceTasks(0);
job.setOutputKeyClass(TextpairWritable.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Partitioner:
package accidentexercise;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class ClassPartitioner extends Partitioner<TextpairWritable,IntWritable>
{
#Override
public int getPartition(TextpairWritable tp, IntWritable value, int numPartitions) {
return Math.abs(Integer.parseInt(tp.getyear().toString()) * 127) % numPartitions;
}
}
Sample Input:
Date,Number_of_accidents,day
01/03/2014,18,2
02/03/2014,19,3
03/03/2014,20,4
01/03/2014,1,2
02/03/2014,2,3
03/03/2014,4,4
01/03/2014,8,2
02/03/2014,9,3
03/03/2014,2,4
Output:
01/03/2014,2,27
02/03/2014,3,30
03/03/2014,4,26

How to store input of input file array in Map Reduce(Java)

I've write Linear Regression Program in java.
Input is -->
2,21.05
3,23.51
4,24.23
5,27.71
6,30.86
8,45.85
10,52.12
11,55.98
I want store input in array like x[]={2,3,...11} before processing input to reduce task. Then send that array variable to reduce() function
But I'm only on value at a time My program.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LinearRegression {
public static class RegressionMapper extends
Mapper<LongWritable, Text, Text, CountRegression> {
private Text id = new Text();
private CountRegression countRegression = new CountRegression();
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String tempString = value.toString();
String[] inputData = tempString.split(",");
String xVal = inputData[0];
String yVal = inputData[1];
countRegression.setxVal(Integer.parseInt(xVal));
countRegression.setyVal(Float.parseFloat(yVal));
id.set(xVal);
context.write(id, countRegression);
}
}
public static class RegressionReducer extends
Reducer<Text, CountRegression, Text, CountRegression> {
private CountRegression result = new CountRegression();
// static float meanX = 0;
// private float xValues[];
// private float yValues[];
static float xRed = 0.0f;
static float yRed = 0.3f;
static float sum = 0;
static ArrayList<Float> list = new ArrayList<Float>();
public void reduce(Text key, Iterable<CountRegression> values,
Context context) throws IOException, InterruptedException {
//float b = 0;
// while(values.iterator().hasNext())
// {
// xRed = xRed + values.iterator().next().getxVal();
// yRed = yRed + values.iterator().next().getyVal();
// }
for (CountRegression val : values) {
list.add(val.getxVal());
// list.add(val.getyVal());
// xRed += val.getxVal();
// yRed = val.getyVal();
// meanX += val.getxVal();
//xValues = val.getxVal();
}
for (int i=0; i< list.size(); i++) {
int lastIndex = list.listIterator().previousIndex();
sum += list.get(lastIndex);
}
result.setxVal(sum);
result.setyVal(yRed);
context.write(key, result);
}
}
public static class CountRegression implements Writable {
private float xVal = 0;
private float yVal = 0;
public float getxVal() {
return xVal;
}
public void setxVal(float x) {
this.xVal = x;
}
public float getyVal() {
return yVal;
}
public void setyVal(float y) {
this.yVal = y;
}
#Override
public void readFields(DataInput in) throws IOException {
xVal = in.readFloat();
yVal = in.readFloat();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeFloat(xVal);
out.writeFloat(yVal);
}
#Override
public String toString() {
return "y = "+xVal+" +"+yVal+" x" ;
}
}
public static void main(String[] args) throws Exception {
// Provides access to configuration parameters.
Configuration conf = new Configuration();
// Create a new Job It allows the user to configure the job, submit it, control its execution, and query the state.
Job job = new Job(conf);
//Set the user-specified job name.
job.setJobName("LinearRegression");
//Set the Jar by finding where a given class came from.
job.setJarByClass(LinearRegression.class);
// Set the Mapper for the job.
job.setMapperClass(RegressionMapper.class);
// Set the Combiner for the job.
job.setCombinerClass(RegressionReducer.class);
// Set the Reducer for the job.
job.setReducerClass(RegressionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CountRegression.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Resources