Reduce doesn't run but job is successfully completed - hadoop

Firstly, I am a newbie at Hadoop MapReduce. My reducer does not run, yet the job reports that it completed successfully. Below is my console output:
INFO mapreduce.Job: Running job: job_1418240815217_0015
INFO mapreduce.Job: Job job_1418240815217_0015 running in uber mode : false
INFO mapreduce.Job: map 0% reduce 0%
INFO mapreduce.Job: map 100% reduce 0%
INFO mapreduce.Job: Job job_1418240815217_0015 completed successfully
INFO mapreduce.Job: Counters: 30
The main class is:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
@SuppressWarnings("deprecation")
Job job = new Job(conf,"NPhase2");
job.setJarByClass(NPhase2.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(NPhase2Value.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
int numberOfPartition = 0;
List<String> other_args = new ArrayList<String>();
for(int i = 0; i < args.length; ++i)
{
try {
if ("-m".equals(args[i])) {
//conf.setNumMapTasks(Integer.parseInt(args[++i]));
++i;
} else if ("-r".equals(args[i])) {
job.setNumReduceTasks(Integer.parseInt(args[++i]));
} else if ("-k".equals(args[i])) {
int knn = Integer.parseInt(args[++i]);
conf.setInt("knn", knn);
System.out.println(knn);
} else {
other_args.add(args[i]);
}
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
//conf.setNumReduceTasks(1);
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " + args[i-1]);
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
}
FileInputFormat.setInputPaths(job, other_args.get(0));
FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
My mapper is:
public static class MapClass extends Mapper<LongWritable, Text, IntWritable, NPhase2Value>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] parts = line.split("\\s+");
// key format <rid1>
IntWritable mapKey = new IntWritable(Integer.valueOf(parts[0]));
// value format <rid2, dist>
NPhase2Value np2v = new NPhase2Value(Integer.valueOf(parts[1]), Float.valueOf(parts[2]));
context.write(mapKey, np2v);
}
}
My reducer class is:
public static class Reduce extends Reducer<IntWritable, NPhase2Value, NullWritable, Text>
{
int numberOfPartition;
int knn;
class Record
{
public int id2;
public float dist;
Record(int id2, float dist)
{
this.id2 = id2;
this.dist = dist;
}
public String toString()
{
return Integer.toString(id2) + " " + Float.toString(dist);
}
}
class RecordComparator implements Comparator<Record>
{
public int compare(Record o1, Record o2)
{
int ret = 0;
float dist = o1.dist - o2.dist;
if (Math.abs(dist) < 1E-6)
ret = o1.id2 - o2.id2;
else if (dist > 0)
ret = 1;
else
ret = -1;
return -ret;
}
}
public void setup(Context context)
{
Configuration conf = new Configuration();
conf = context.getConfiguration();
numberOfPartition = conf.getInt("numberOfPartition", 2);
knn = conf.getInt("knn", 3);
}
public void reduce(IntWritable key, Iterator<NPhase2Value> values, Context context) throws IOException, InterruptedException
{
//initialize the pq
RecordComparator rc = new RecordComparator();
PriorityQueue<Record> pq = new PriorityQueue<Record>(knn + 1, rc);
// For each record we have a reduce task
// value format <rid1, rid2, dist>
while (values.hasNext())
{
NPhase2Value np2v = values.next();
int id2 = np2v.getFirst().get();
float dist = np2v.getSecond().get();
Record record = new Record(id2, dist);
pq.add(record);
if (pq.size() > knn)
pq.poll();
}
while(pq.size() > 0)
{
context.write(NullWritable.get(), new Text(key.toString() + " " + pq.poll().toString()));
//break; // only ouput the first record
}
} // reduce
}
This is my helper class:
public class NPhase2Value implements WritableComparable<NPhase2Value> {
private IntWritable first;
private FloatWritable second;
public NPhase2Value() {
set(new IntWritable(), new FloatWritable());
}
public NPhase2Value(int first, float second) {
set(new IntWritable(first), new FloatWritable(second));
}
public void set(IntWritable first, FloatWritable second) {
this.first = first;
this.second = second;
}
public IntWritable getFirst() {
return first;
}
public FloatWritable getSecond() {
return second;
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public boolean equals(Object o) {
if (o instanceof NPhase2Value) {
NPhase2Value np2v = (NPhase2Value) o;
return first.equals(np2v.first) && second.equals(np2v.second);
}
return false;
}
@Override
public String toString() {
return first.toString() + " " + second.toString();
}
@Override
public int compareTo(NPhase2Value np2v) {
return 1;
}
}
The command I run on the command line is:
hadoop jar knn.jar NPhase2 -m 1 -r 3 -k 4 phase1out phase2out
I am trying hard to figure out the error but am still not able to come up with a solution. Please help me in this regard, as I am running on a tight schedule.

Because you have set the number of reduce tasks to 0. See this:
int numberOfPartition = 0;
//.......
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
I don't see numberOfPartition reset anywhere in your code. I think you should set it where you parse the -r option, or remove the call to setNumReduceTasks shown above entirely, since you already set the reduce count while parsing the -r option.
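As a minimal sketch of how the option loop might look after that change (variable and option names are taken from the question; this is an illustration, not tested code), the reduce count comes only from -r and the in-loop call that multiplied numberOfPartition by itself is gone. As an aside, knn is set on the job's configuration here so the value still reaches the tasks even though the Job was created earlier:
for (int i = 0; i < args.length; ++i) {
    try {
        if ("-m".equals(args[i])) {
            ++i; // number of map tasks is not configurable here, skip the value
        } else if ("-r".equals(args[i])) {
            numberOfPartition = Integer.parseInt(args[++i]);
            job.setNumReduceTasks(numberOfPartition); // set it once, from -r
        } else if ("-k".equals(args[i])) {
            // write knn into the job's own configuration copy
            job.getConfiguration().setInt("knn", Integer.parseInt(args[++i]));
        } else {
            other_args.add(args[i]);
        }
        // no job.setNumReduceTasks(numberOfPartition * numberOfPartition) here:
        // with numberOfPartition still 0 it reset the reduce count to 0 on every pass
    } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
    } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
    }
}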

Related

Hadoop MapReduce is not producing desired output

I have one large file that contains patent information. The header is as follows: "PATENT","GYEAR","GDATE","APPYEAR","COUNTRY","POSTATE","ASSIGNEE","ASSCODE","CLAIMS".
I want to calculate the average number of claims per patent by year, where the key is the year and the value is the average. However, the reducer output shows that my average is 1.0 every time. Where did my program go wrong?
The Main class
public static void main(String [] args) throws Exception{
int res = ToolRunner.run(new Configuration(), new AvgClaimsByYear(), args);
System.exit(res);
}
The Driver class
Configuration config = this.getConf();
Job job = Job.getInstance(config, "average claims per year");
job.setJarByClass(AvgClaimsByYear.class);
job.setMapperClass(TheMapper.class);
job.setPartitionerClass(ThePartitioner.class);
job.setNumReduceTasks(4);
job.setReducerClass(TheReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
The Mapper class
public static class TheMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
private IntWritable yearAsKeyOut = new IntWritable();
private IntWritable claimsAsValueOut = new IntWritable(1);
@Override
public void map(LongWritable keyIn, Text valueIn, Context context) throws IOException,InterruptedException {
String line = valueIn.toString();
if(line.contains("PATENT")) {
return; //skip header
}
else {
String [] patentData = line.split(",");
yearAsKeyOut.set(Integer.parseInt(patentData[1]));
if (patentData[8].length() > 0) {
claimsAsValueOut.set(Integer.parseInt(patentData[8]));
}
}
context.write(yearAsKeyOut, claimsAsValueOut);
}
}
The Partitioner Class
public static class ThePartitioner extends Partitioner<IntWritable, IntWritable> {
public int getPartition(IntWritable keyIn, IntWritable valueIn, int totalNumPartition) {
int theYear = keyIn.get();
if (theYear <= 1970) {
return 0;
}
else if(theYear > 1970 && theYear <= 1979) {
return 1;
}
else if(theYear > 1979 && theYear <=1989) {
return 2;
}
else{
return 3;
}
}
}
The Reducer class
public static class TheReducer extends Reducer<IntWritable,IntWritable,IntWritable,FloatWritable> {
@Override
public void reduce(IntWritable yearKey, Iterable<IntWritable> values, Context context) throws IOException,InterruptedException {
int totalClaimsThatYear = 0;
int totalPatentCountThatYear = 0;
FloatWritable avgClaim = new FloatWritable();
for(IntWritable value : values) {
totalClaimsThatYear += value.get();
totalPatentCountThatYear += 1;
}
avgClaim.set(calculateAvgClaimPerPatent (totalPatentCountThatYear, totalClaimsThatYear));
context.write(yearKey, avgClaim);
}
public float calculateAvgClaimPerPatent (int totalPatentCount, int totalClaims) {
return (float)totalClaims/totalPatentCount;
}
}
The Input
3070801,1963,1096,,"BE","",,1,,269,6,69,,1,,0,,,,,,,
3070802,1963,1096,,"US","TX",,1,,2,6,63,,0,,,,,,,,,
3070803,1963,1096,,"US","IL",,1,,2,6,63,,9,,0.3704,,,,,,,
3070804,1963,1096,,"US","OH",,1,,2,6,63,,3,,0.6667,,,,,,,
3070805,1963,1096,,"US","CA",,1,,2,6,63,,1,,0,,,,,,,
The Output
1963 1.0
1964 1.0
1965 1.0
1966 1.0
1967 1.0
1968 1.0
1969 1.0
1970 1.0
In calculateAvgClaimPerPatent() your expression performs integer division before conversion to a float. Convert the two integers to float before the division.
-- edit --
Also, looking over the code again, the average written out is really the average number of claims per record, grouped by the 4 intervals defined by your partitioner. In other words, the number of claims for one patent in 1972 is being averaged in with the number of claims for a different patent in 1975. That doesn't match your problem description.
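As a sketch of the first point, using the method name from the question, the conversion has to happen before the division so the result keeps its fractional part:
public float calculateAvgClaimPerPatent(int totalPatentCount, int totalClaims) {
    // cast first; dividing two ints would truncate the result toward zero
    return (float) totalClaims / (float) totalPatentCount;
}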

SimpleTextLoader UDF in Pig

I want to create a custom load function as a Pig UDF. I have created a SimpleTextLoader using the link
https://pig.apache.org/docs/r0.11.0/udf.html , successfully generated the jar file for this code, registered it in Pig, and ran a Pig script. I am getting empty output. I don't know how to solve this issue; any help would be appreciated.
Below is my Java code
public class SimpleTextLoader extends LoadFunc{
protected RecordReader in = null;
private byte fieldDel = '\t';
private ArrayList<Object> mProtoTuple = null;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
private static final int BUFFER_SIZE = 1024;
public SimpleTextLoader() {
}
public SimpleTextLoader(String delimiter)
{
this();
if (delimiter.length() == 1) {
this.fieldDel = (byte)delimiter.charAt(0);
} else if (delimiter.length() > 1 && delimiter.charAt(0) == '\\') {
switch (delimiter.charAt(1)) {
case 't':
this.fieldDel = (byte)'\t';
break;
case 'x':
fieldDel =
Integer.valueOf(delimiter.substring(2), 16).byteValue();
break;
case 'u':
this.fieldDel =
Integer.valueOf(delimiter.substring(2)).byteValue();
break;
default:
throw new RuntimeException("Unknown delimiter " + delimiter);
}
} else {
throw new RuntimeException("PigStorage delimeter must be a single character");
}
}
private void readField(byte[] buf, int start, int end) {
if (mProtoTuple == null) {
mProtoTuple = new ArrayList<Object>();
}
if (start == end) {
// NULL value
mProtoTuple.add(null);
} else {
mProtoTuple.add(new DataByteArray(buf, start, end));
}
}
@Override
public Tuple getNext() throws IOException {
try {
boolean notDone = in.nextKeyValue();
if (notDone) {
return null;
}
Text value = (Text) in.getCurrentValue();
System.out.println("printing value" +value);
byte[] buf = value.getBytes();
int len = value.getLength();
int start = 0;
for (int i = 0; i < len; i++) {
if (buf[i] == fieldDel) {
readField(buf, start, i);
start = i + 1;
}
}
// pick up the last field
readField(buf, start, len);
Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
mProtoTuple = null;
System.out.println(t);
return t;
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
}
@Override
public void setLocation(String string, Job job) throws IOException {
FileInputFormat.setInputPaths(job,string);
}
@Override
public InputFormat getInputFormat() throws IOException {
return new TextInputFormat();
}
@Override
public void prepareToRead(RecordReader reader, PigSplit ps) throws IOException {
in=reader;
}
}
Below is my Pig Script
REGISTER /home/hadoop/netbeans/sampleloader/dist/sampleloader.jar
a= load '/input.txt' using sampleloader.SimpleTextLoader();
store a into 'output';
You are using sampleloader.SimpleTextLoader(), which doesn't do anything because it is just an empty constructor.
Instead, use sampleloader.SimpleTextLoader(String delimiter), which performs the actual split on your delimiter.
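For example, a minimal change to the script from the question is to pass the delimiter explicitly (',' here is an assumption for comma-separated input; use '\\t' or whatever matches your data):
REGISTER /home/hadoop/netbeans/sampleloader/dist/sampleloader.jar
a = load '/input.txt' using sampleloader.SimpleTextLoader(',');
store a into 'output';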

Why mapper function not called when using SequenceFileInputFormat

I have spent two days on this issue. Thanks in advance if anyone can help! Here is the description:
The first mapper and reducer work well, and their output, written with SequenceFileOutputFormat, can be found in the output path.
First mapper:
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
}
}
First reducer:
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
This works well, and then I add my second mapper and reducer to deal with the output of the first part.
Second mapper:
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
Second reducer:
public static class MacInfoTestReducer
extends Reducer<Text,IntWritable,Text,Text> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
//do not use test reducer to make things simple
//secondJob.setReducerClass(MacInfoTestReducer.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
The second mapper function is not called when I run the code, and the output is generated with text like the following:
00:08:CA:6C:A2:81 com.hicapt.xike.IntArrayWritable@234265
It seems like the framework calls the identity Mapper instead of mine. How do I change that so that my mapper is called with SequenceFileInputFormat as the input format?
All the code is added below:
import java.io.IOException;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RawInputText {
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
private Text word = new Text();
private IntArrayWritable mapv = new IntArrayWritable();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = value.toString();
String[] valArray = line.split(",");
if(valArray.length == 6){
IntWritable[] valInts = new IntWritable[2];
word.set(valArray[0]+"-"+valArray[1]);
valInts[0] = new IntWritable(Integer.parseInt(valArray[2]));
valInts[1] = new IntWritable(Integer.parseInt(valArray[4]));
mapv.set(valInts);
context.write(word, mapv);
}
}
}
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
private Text macKey = new Text();
private IntArrayWritable macInfo = new IntArrayWritable();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String[] keyArray = key.toString().split("-");
if(keyArray.length < 2){
int a = 10;
a= 20;
}
String mac = keyArray[1];
String sen = keyArray[0];
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
int rssi = 0;
int ts = 0;
int i = 0;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
if(i%2 == 0)
rssi = eleVal.get();
else
ts = eleVal.get()/60;
i++;
}
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = 1;
}else{
minSig.rssi += rssi;
minSig.count += 1;
}
rssiTime.put(ts, minSig);
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
macKey.set(mac);
i = 0;
IntWritable[] valInts = new IntWritable[1+treeMap.size()*3];
valInts[i++] = new IntWritable(Integer.parseInt(sen));
Collection<Integer> macs = treeMap.keySet();
Iterator<Integer> it = macs.iterator();
while(it.hasNext()) {
int tsKey = it.next();
valInts[i++] = new IntWritable(tsKey);
valInts[i++] = new IntWritable(treeMap.get(tsKey).rssi);
valInts[i++] = new IntWritable(treeMap.get(tsKey).count);
}
macInfo.set(valInts);
context.write(macKey, macInfo);
}
}
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, new IntWritable(10));
//context.write(key, macInfo);
}
}
public static class MacSensorsTimeLocReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, macInfo);
}
}
public static class MacInfoTestReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String tmp = "";
for (IntArrayWritable val : values) {
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
tmp += String.valueOf(valval) + " ";
}
}
macInfo.set(tmp);
context.write(key, macInfo);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
/*
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
*/
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
//secondJob.setMapperClass(Mapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
secondJob.setReducerClass(MacInfoTestReducer.class);
//secondJob.setOutputKeyClass(Text.class);
//secondJob.setOutputValueClass(IntArrayWritable.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
}
}
package com.hicapt.xike;
public class MinuteSignalInfo {
public int sensor;
public int rssi;
public int count;
public MinuteSignalInfo() {
rssi = 0;
count = 0;
sensor = 0;
}
}
package com.hicapt.xike;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
public class IntArrayWritable extends ArrayWritable {
public IntArrayWritable() {
super(IntWritable.class);
}
/*
public void readFields(DataInput in) throws IOException{
super.readFields(in);
}
public void write(DataOutput out) throws IOException{
super.write(out);
}*/
}

Hadoop difficulty with composite key

I'm using Hadoop to analyze GSOD data (ftp://ftp.ncdc.noaa.gov/pub/data/gsod/).
I chose 5 years for my experiments (2005 - 2009).
I've configured a little cluster and executed a simple MapReduce program that gets the maximum temperature registered for a year.
Now I have to create a new MR program that counts, for each station, all the phenomena occurrences across those years.
The files that I have to analyze have this structure:
STN--- ... FRSHTT
722115 110001
722115 011001
722110 111000
722110 001000
722000 001000
The column STN means the station code and FRSHTT means the phenomena:
F - Fog, R - Rain or drizzle, S - Snow or ice pellets, H - Hail, T - Thunder, O - Tornado or funnel cloud.
The value 1 means that the phenomenon occurred on that day; 0 means it did not occur.
I need to find results like the following:
722115: F = 1, R = 2, S = 1, O = 2
722110: F = 1, R = 1, S = 2
722000: S = 1
I was able to run the MR program, but the results are wrong, giving me these results:
722115 F, 1
722115 R, 1
722115 R, 1
722115 S, 1
722115 O, 1
722115 O, 1
722110 F, 1
722110 R, 1
722110 S, 1
722110 S, 1
722000 S, 1
I have used these codes:
Mapper.java
public class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, StationPhenomenun, IntWritable> {
@Override
protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
String line = value.toString();
// Every file starts with a field description line, so, I ignore this line
if (!line.startsWith("STN---")) {
// First field of the line means the station code where data was collected
String station = line.substring(0, 6);
String fog = (line.substring(132, 133));
String rainOrDrizzle = (line.substring(133, 134));
String snowOrIcePellets = (line.substring(134, 135));
String hail = (line.substring(135, 136));
String thunder = (line.substring(136, 137));
String tornadoOrFunnelCloud = (line.substring(137, 138));
if (fog.equals("1"))
context.write(new StationPhenomenun(station,"F"), new IntWritable(1));
if (rainOrDrizzle.equals("1"))
context.write(new StationPhenomenun(station,"R"), new IntWritable(1));
if (snowOrIcePellets.equals("1"))
context.write(new StationPhenomenun(station,"S"), new IntWritable(1));
if (hail.equals("1"))
context.write(new StationPhenomenun(station,"H"), new IntWritable(1));
if (thunder.equals("1"))
context.write(new StationPhenomenun(station,"T"), new IntWritable(1));
if (tornadoOrFunnelCloud.equals("1"))
context.write(new StationPhenomenun(station,"O"), new IntWritable(1));
}
}
}
Reducer.java
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<StationPhenomenun, IntWritable, StationPhenomenun, IntWritable> {
protected void reduce(StationPhenomenun key, Iterable<IntWritable> values, org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count++;
}
String station = key.getStation().toString();
String occurence = key.getPhenomenun().toString();
StationPhenomenun textPair = new StationPhenomenun(station, occurence);
context.write(textPair, new IntWritable(count));
}
}
StationPhenomenum.java
public class StationPhenomenun implements WritableComparable<StationPhenomenun> {
private String station;
private String phenomenun;
public StationPhenomenun(String station, String phenomenun) {
this.station = station;
this.phenomenun = phenomenun;
}
public StationPhenomenun() {
}
public String getStation() {
return station;
}
public String getPhenomenun() {
return phenomenun;
}
@Override
public void readFields(DataInput in) throws IOException {
station = in.readUTF();
phenomenun = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(station);
out.writeUTF(phenomenun);
}
@Override
public int compareTo(StationPhenomenun t) {
int cmp = this.station.compareTo(t.station);
if (cmp != 0) {
return cmp;
}
return this.phenomenun.compareTo(t.phenomenun);
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final StationPhenomenun other = (StationPhenomenun) obj;
if (this.station != other.station && (this.station == null || !this.station.equals(other.station))) {
return false;
}
if (this.phenomenun != other.phenomenun && (this.phenomenun == null || !this.phenomenun.equals(other.phenomenun))) {
return false;
}
return true;
}
@Override
public int hashCode() {
return this.station.hashCode() * 163 + this.phenomenun.hashCode();
}
}
NcdcJob.java
public class NcdcJob {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(NcdcJob.class);
FileInputFormat.addInputPath(job, new Path("/user/hadoop/input"));
FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/station"));
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
job.setMapOutputKeyClass(StationPhenomenun.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(StationPhenomenun.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Has anyone done something similar?
P.S.: I have tried this solution (Hadoop - composite key), but it did not work for me.
Just check whether the following two classes refer to your custom implementations:
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
I was able to get the desired result with the following changes:
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
protected void reduce(StationPhenomenun key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
I also changed the class names to MyMapper and MyReducer.
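A rough outline of those changes (only the class names and the Context parameter type differ from the question; the method bodies and the rest of the driver are assumed unchanged, with the standard org.apache.hadoop.mapreduce imports in place):
public static class MyMapper extends Mapper<LongWritable, Text, StationPhenomenun, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // same parsing and context.write(new StationPhenomenun(station, "F"), ...) calls as in the question
    }
}
public static class MyReducer extends Reducer<StationPhenomenun, IntWritable, StationPhenomenun, IntWritable> {
    @Override
    protected void reduce(StationPhenomenun key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // same counting logic as in the question
    }
}
// and in NcdcJob:
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);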
722115,1,1,0,0,0,1
722115,0,1,1,0,0,1
722110,1,1,1,0,0,0
722110,0,0,1,0,0,0
722000,0,0,1,0,0,0
For this input set, I could get the following result
StationPhenomenun [station=722000, phenomenun=S] 1
StationPhenomenun [station=722110, phenomenun=F] 1
StationPhenomenun [station=722110, phenomenun=R] 1
StationPhenomenun [station=722110, phenomenun=S] 2
StationPhenomenun [station=722115, phenomenun=F] 1
StationPhenomenun [station=722115, phenomenun=O] 2
StationPhenomenun [station=722115, phenomenun=R] 2
StationPhenomenun [station=722115, phenomenun=S] 1
The computation is the same; you just need to customize how the output is displayed.
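One way to do that, as a suggestion of my own rather than something the answer spells out, is to give StationPhenomenun a more compact toString(), since it is also the output key that TextOutputFormat prints:
@Override
public String toString() {
    // print keys like "722115 F" instead of the long field dump
    return station + " " + phenomenun;
}
Producing the fully grouped format from the question (722115: F = 1, R = 2, ...) would additionally require aggregating the per-phenomenon counts of each station, for example in a second pass or by keying on the station alone.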

Why Hadoop shuffle not working as expected

I have this Hadoop MapReduce code that works on graph data (in adjacency-list form) and is roughly similar to an in-adjacency-list to out-adjacency-list transformation. The main MapReduce task code is the following:
public class TestTask extends Configured
implements Tool {
public static class TTMapper extends MapReduceBase
implements Mapper<Text, TextArrayWritable, Text, NeighborWritable> {
@Override
public void map(Text key,
TextArrayWritable value,
OutputCollector<Text, NeighborWritable> output,
Reporter reporter) throws IOException {
int numNeighbors = value.get().length;
double weight = (double)1 / numNeighbors;
Text[] neighbors = (Text[]) value.toArray();
NeighborWritable me = new NeighborWritable(key, new DoubleWritable(weight));
for (int i = 0; i < neighbors.length; i++) {
output.collect(neighbors[i], me);
}
}
}
public static class TTReducer extends MapReduceBase
implements Reducer<Text, NeighborWritable, Text, Text> {
@Override
public void reduce(Text key,
Iterator<NeighborWritable> values,
OutputCollector<Text, Text> output,
Reporter arg3)
throws IOException {
ArrayList<NeighborWritable> neighborList = new ArrayList<NeighborWritable>();
while(values.hasNext()) {
neighborList.add(values.next());
}
NeighborArrayWritable neighbors = new NeighborArrayWritable
(neighborList.toArray(new NeighborWritable[0]));
Text out = new Text(neighbors.toString());
output.collect(key, out);
}
}
@Override
public int run(String[] arg0) throws Exception {
JobConf conf = Util.getMapRedJobConf("testJob",
SequenceFileInputFormat.class,
TTMapper.class,
Text.class,
NeighborWritable.class,
1,
TTReducer.class,
Text.class,
Text.class,
TextOutputFormat.class,
"test/in",
"test/out");
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new TestTask(), args);
System.exit(res);
}
}
The auxiliary code is following:
TextArrayWritable:
public class TextArrayWritable extends ArrayWritable {
public TextArrayWritable() {
super(Text.class);
}
public TextArrayWritable(Text[] values) {
super(Text.class, values);
}
}
NeighborWritable:
public class NeighborWritable implements Writable {
private Text nodeId;
private DoubleWritable weight;
public NeighborWritable(Text nodeId, DoubleWritable weight) {
this.nodeId = nodeId;
this.weight = weight;
}
public NeighborWritable () { }
public Text getNodeId() {
return nodeId;
}
public DoubleWritable getWeight() {
return weight;
}
public void setNodeId(Text nodeId) {
this.nodeId = nodeId;
}
public void setWeight(DoubleWritable weight) {
this.weight = weight;
}
@Override
public void readFields(DataInput in) throws IOException {
nodeId = new Text();
nodeId.readFields(in);
weight = new DoubleWritable();
weight.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
nodeId.write(out);
weight.write(out);
}
public String toString() {
return "NW[nodeId=" + (nodeId != null ? nodeId.toString() : "(null)") +
",weight=" + (weight != null ? weight.toString() : "(null)") + "]";
}
public boolean equals(Object o) {
if (!(o instanceof NeighborWritable)) {
return false;
}
NeighborWritable that = (NeighborWritable)o;
return (nodeId.equals(that.getNodeId()) && (weight.equals(that.getWeight())));
}
}
and the Util class:
public class Util {
public static JobConf getMapRedJobConf(String jobName,
Class<? extends InputFormat> inputFormatClass,
Class<? extends Mapper> mapperClass,
Class<?> mapOutputKeyClass,
Class<?> mapOutputValueClass,
int numReducer,
Class<? extends Reducer> reducerClass,
Class<?> outputKeyClass,
Class<?> outputValueClass,
Class<? extends OutputFormat> outputFormatClass,
String inputDir,
String outputDir) throws IOException {
JobConf conf = new JobConf();
if (jobName != null)
conf.setJobName(jobName);
conf.setInputFormat(inputFormatClass);
conf.setMapperClass(mapperClass);
if (numReducer == 0) {
conf.setNumReduceTasks(0);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
} else {
// may set actual number of reducers
// conf.setNumReduceTasks(numReducer);
conf.setMapOutputKeyClass(mapOutputKeyClass);
conf.setMapOutputValueClass(mapOutputValueClass);
conf.setReducerClass(reducerClass);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
}
// delete the existing target output folder
FileSystem fs = FileSystem.get(conf);
fs.delete(new Path(outputDir), true);
// specify input and output DIRECTORIES (not files)
FileInputFormat.addInputPath(conf, new Path(inputDir));
FileOutputFormat.setOutputPath(conf, new Path(outputDir));
return conf;
}
}
My input is the following graph (stored in binary format; shown here as text):
1 2
2 1,3,5
3 2,4
4 3,5
5 2,4
According to the logic of the code the output should be:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],NW[nodeId=1,weight=1.0],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
But the output is coming as:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
I cannot understand why the expected output is not being produced. Any help will be appreciated.
Thanks.
You're falling foul of object re-use:
while(values.hasNext()) {
neighborList.add(values.next());
}
values.next() will return the same object reference, but the underlying contents of that object change on each iteration (the readFields method is called to re-populate the contents).
I suggest you amend it to the following (you'll need to obtain the Configuration conf variable from a setup method, unless you can get it from the Reporter or OutputCollector; sorry, I don't use the old API):
while (values.hasNext()) {
neighborList.add(
ReflectionUtils.copy(conf, values.next(), new NeighborWritable()));
}
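If getting hold of the Configuration in the old API is inconvenient, an alternative sketch is to deep-copy each value by hand, using the constructor and getters that NeighborWritable already exposes in the question:
while (values.hasNext()) {
    NeighborWritable next = values.next();
    // copy the mutable fields so the list does not end up holding many
    // references to the single object the framework keeps re-using
    neighborList.add(new NeighborWritable(
            new Text(next.getNodeId()),
            new DoubleWritable(next.getWeight().get())));
}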
But then I still can't understand why my unit test passed. Here is the code:
public class UWLTInitReducerTest {
private Text key;
private Iterator<NeighborWritable> values;
private NeighborArrayWritable nodeData;
private TTReducer reducer;
/**
* Set up the states for calling the map function
*/
@Before
public void setUp() throws Exception {
key = new Text("1001");
NeighborWritable[] neighbors = new NeighborWritable[4];
for (int i = 0; i < 4; i++) {
neighbors[i] = new NeighborWritable(new Text("300" + i), new DoubleWritable((double) 1 / (1 + i)));
}
values = Arrays.asList(neighbors).iterator();
nodeData = new NeighborArrayWritable(neighbors);
reducer = new TTReducer();
}
/**
* Test method for InitModelMapper#map - valid input
*/
@Test
public void testMapValid() {
// mock the output object
OutputCollector<Text, UWLTNodeData> output = mock(OutputCollector.class);
try {
// call the API
reducer.reduce(key, values, output, null);
// in order (sequential) verification of the calls to output.collect()
verify(output).collect(key, nodeData);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
Why didn't this code catch the bug?
