I have one large file that contains patent information. The header is as follows: "PATENT","GYEAR","GDATE","APPYEAR","COUNTRY","POSTATE","ASSIGNEE","ASSCODE","CLAIMS".
I want to calculate the average claims per patent by year, where the key is the year and the value is the average amount. However, the reducer output shows an average of 1.0 every time. Where did my program go wrong?
The Main class
public static void main(String [] args) throws Exception{
int res = ToolRunner.run(new Configuration(), new AvgClaimsByYear(), args);
System.exit(res);
}
The Driver class
Configuration config = this.getConf();
Job job = Job.getInstance(config, "average claims per year");
job.setJarByClass(AvgClaimsByYear.class);
job.setMapperClass(TheMapper.class);
job.setPartitionerClass(ThePartitioner.class);
job.setNumReduceTasks(4);
job.setReducerClass(TheReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
The Mapper class
public static class TheMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
private IntWritable yearAsKeyOut = new IntWritable();
private IntWritable claimsAsValueOut = new IntWritable(1);
@Override
public void map(LongWritable keyIn, Text valueIn, Context context) throws IOException,InterruptedException {
String line = valueIn.toString();
if(line.contains("PATENT")) {
return; //skip header
}
else {
String [] patentData = line.split(",");
yearAsKeyOut.set(Integer.parseInt(patentData[1]));
if (patentData[8].length() > 0) {
claimsAsValueOut.set(Integer.parseInt(patentData[8]));
}
}
context.write(yearAsKeyOut, claimsAsValueOut);
}
}
The Partitioner Class
public static class ThePartitioner extends Partitioner<IntWritable, IntWritable> {
public int getPartition(IntWritable keyIn, IntWritable valueIn, int totalNumPartition) {
int theYear = keyIn.get();
if (theYear <= 1970) {
return 0;
}
else if(theYear > 1970 && theYear <= 1979) {
return 1;
}
else if(theYear > 1979 && theYear <=1989) {
return 2;
}
else{
return 3;
}
}
}
The Reducer class
public static class TheReducer extends Reducer<IntWritable,IntWritable,IntWritable,FloatWritable> {
@Override
public void reduce(IntWritable yearKey, Iterable<IntWritable> values, Context context) throws IOException,InterruptedException {
int totalClaimsThatYear = 0;
int totalPatentCountThatYear = 0;
FloatWritable avgClaim = new FloatWritable();
for(IntWritable value : values) {
totalClaimsThatYear += value.get();
totalPatentCountThatYear += 1;
}
avgClaim.set(calculateAvgClaimPerPatent (totalPatentCountThatYear, totalClaimsThatYear));
context.write(yearKey, avgClaim);
}
public float calculateAvgClaimPerPatent (int totalPatentCount, int totalClaims) {
return (float)totalClaims/totalPatentCount;
}
}
The Input
3070801,1963,1096,,"BE","",,1,,269,6,69,,1,,0,,,,,,,
3070802,1963,1096,,"US","TX",,1,,2,6,63,,0,,,,,,,,,
3070803,1963,1096,,"US","IL",,1,,2,6,63,,9,,0.3704,,,,,,,
3070804,1963,1096,,"US","OH",,1,,2,6,63,,3,,0.6667,,,,,,,
3070805,1963,1096,,"US","CA",,1,,2,6,63,,1,,0,,,,,,,
The Output
1963 1.0
1964 1.0
1965 1.0
1966 1.0
1967 1.0
1968 1.0
1969 1.0
1970 1.0
In calculateAvgClaimPerPatent(), your expression performs the integer division before the conversion to a float. Cast the two integers to float before dividing.
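As an illustration only (this is not the asker's exact code), the result depends on whether the cast happens before or after the division:
int totalClaims = 7;
int totalPatentCount = 2;
// integer division runs first, then the result is widened: yields 3.0
float wrong = totalClaims / totalPatentCount;
// the cast makes the division itself floating-point: yields 3.5
float right = (float) totalClaims / totalPatentCount;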
-- edit --
Also, looking over the code again, the average written out is really the average number of claims per record, grouped by the 4 intervals defined by your partitioner. In other words, the number of claims for one patent in 1972 is being averaged in with the number of claims for a different patent in 1975. That doesn't match your problem description.
I want to create a custom load function (Pig UDF). I created a SimpleTextLoader by following
https://pig.apache.org/docs/r0.11.0/udf.html, successfully generated the jar file for this code, registered it in Pig, and ran a Pig script. I am getting empty output. I don't know how to solve this issue; any help would be appreciated.
Below is my Java code
public class SimpleTextLoader extends LoadFunc{
protected RecordReader in = null;
private byte fieldDel = '\t';
private ArrayList<Object> mProtoTuple = null;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
private static final int BUFFER_SIZE = 1024;
public SimpleTextLoader() {
}
public SimpleTextLoader(String delimiter)
{
this();
if (delimiter.length() == 1) {
this.fieldDel = (byte)delimiter.charAt(0);
} else if (delimiter.length() > 1 && delimiter.charAt(0) == '\\') {
switch (delimiter.charAt(1)) {
case 't':
this.fieldDel = (byte)'\t';
break;
case 'x':
fieldDel =
Integer.valueOf(delimiter.substring(2), 16).byteValue();
break;
case 'u':
this.fieldDel =
Integer.valueOf(delimiter.substring(2)).byteValue();
break;
default:
throw new RuntimeException("Unknown delimiter " + delimiter);
}
} else {
throw new RuntimeException("PigStorage delimeter must be a single character");
}
}
private void readField(byte[] buf, int start, int end) {
if (mProtoTuple == null) {
mProtoTuple = new ArrayList<Object>();
}
if (start == end) {
// NULL value
mProtoTuple.add(null);
} else {
mProtoTuple.add(new DataByteArray(buf, start, end));
}
}
@Override
public Tuple getNext() throws IOException {
try {
boolean notDone = in.nextKeyValue();
if (notDone) {
return null;
}
Text value = (Text) in.getCurrentValue();
System.out.println("printing value" +value);
byte[] buf = value.getBytes();
int len = value.getLength();
int start = 0;
for (int i = 0; i < len; i++) {
if (buf[i] == fieldDel) {
readField(buf, start, i);
start = i + 1;
}
}
// pick up the last field
readField(buf, start, len);
Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
mProtoTuple = null;
System.out.println(t);
return t;
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
}
@Override
public void setLocation(String string, Job job) throws IOException {
FileInputFormat.setInputPaths(job,string);
}
@Override
public InputFormat getInputFormat() throws IOException {
return new TextInputFormat();
}
@Override
public void prepareToRead(RecordReader reader, PigSplit ps) throws IOException {
in=reader;
}
}
Below is my Pig Script
REGISTER /home/hadoop/netbeans/sampleloader/dist/sampleloader.jar
a= load '/input.txt' using sampleloader.SimpleTextLoader();
store a into 'output';
You are using sampleloader.SimpleTextLoader(), which doesn't do anything because it is just an empty constructor.
Instead use sampleloader.SimpleTextLoader(String delimiter), which performs the actual split.
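For example, assuming the input file is comma-delimited (adjust the delimiter to match your data), the script would pass it explicitly:
REGISTER /home/hadoop/netbeans/sampleloader/dist/sampleloader.jar
a = load '/input.txt' using sampleloader.SimpleTextLoader(',');
store a into 'output';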
I have spent two days on this issue. Thanks in advance if anyone can help! Here is the description:
The first mapper and reducer work well, and the output written with SequenceFileOutputFormat can be found in the output path.
First mapper:
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
}
}
First reducer:
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
This works well. Then I add my second mapper and reducer to process the output of the first part.
Second mapper:
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
Second reducer:
public static class MacInfoTestReducer
extends Reducer<Text,IntWritable,Text,Text> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
}
}
The Job part:
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
//do not use test reducer to make things simple
//secondJob.setReducerClass(MacInfoTestReducer.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
The second mapper function is not called when I run the code, and the output is generated with text like the following:
00:08:CA:6C:A2:81 com.hicapt.xike.IntArrayWritable@234265
It seems like the framework calls the identity mapper instead of mine. How do I change that so that my mapper is called when SequenceFileInputFormat is the input format?
All the code is added below:
import java.io.IOException;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RawInputText {
public static class TextToRecordMapper
extends Mapper<Object, Text, Text, IntArrayWritable>{
private Text word = new Text();
private IntArrayWritable mapv = new IntArrayWritable();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = value.toString();
String[] valArray = line.split(",");
if(valArray.length == 6){
IntWritable[] valInts = new IntWritable[2];
word.set(valArray[0]+"-"+valArray[1]);
valInts[0] = new IntWritable(Integer.parseInt(valArray[2]));
valInts[1] = new IntWritable(Integer.parseInt(valArray[4]));
mapv.set(valInts);
context.write(word, mapv);
}
}
}
public static class MacOneSensorSigCntReducer
extends Reducer<Text,IntArrayWritable,Text,IntArrayWritable> {
private Text macKey = new Text();
private IntArrayWritable macInfo = new IntArrayWritable();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String[] keyArray = key.toString().split("-");
if(keyArray.length < 2){
int a = 10;
a= 20;
}
String mac = keyArray[1];
String sen = keyArray[0];
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
int rssi = 0;
int ts = 0;
int i = 0;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
if(i%2 == 0)
rssi = eleVal.get();
else
ts = eleVal.get()/60;
i++;
}
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = 1;
}else{
minSig.rssi += rssi;
minSig.count += 1;
}
rssiTime.put(ts, minSig);
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
macKey.set(mac);
i = 0;
IntWritable[] valInts = new IntWritable[1+treeMap.size()*3];
valInts[i++] = new IntWritable(Integer.parseInt(sen));
Collection<Integer> macs = treeMap.keySet();
Iterator<Integer> it = macs.iterator();
while(it.hasNext()) {
int tsKey = it.next();
valInts[i++] = new IntWritable(tsKey);
valInts[i++] = new IntWritable(treeMap.get(tsKey).rssi);
valInts[i++] = new IntWritable(treeMap.get(tsKey).count);
}
macInfo.set(valInts);
context.write(macKey, macInfo);
}
}
public static class MacSensorsTimeLocMapper
extends Mapper<Text,IntArrayWritable,Text,IntWritable> {
private Text macInfo = new Text();
public void map(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, new IntWritable(10));
//context.write(key, macInfo);
}
}
public static class MacSensorsTimeLocReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
int i = 0;
int sensor = 0;
int ts = 0;
int rssi = 0;
int count = 0;
Hashtable<Integer, MinuteSignalInfo> rssiTime = new Hashtable<Integer, MinuteSignalInfo>();
MinuteSignalInfo minSig;
for (IntArrayWritable val : values) {
i = 0;
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
if(i == 0) {
sensor = valval;
}else if(i%3 == 1){
ts = valval;
}else if(i%3 == 2){
rssi = valval;
}else if(i%3 == 0){
count = valval;
minSig = (MinuteSignalInfo)rssiTime.get(ts);
if(minSig == null){
minSig = new MinuteSignalInfo();
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}else{
if((rssi/count) < (minSig.rssi/minSig.count)){
minSig.rssi = rssi;
minSig.count = count;
minSig.sensor = sensor;
rssiTime.put(ts, minSig);
}
}
}
i++;
}
}
TreeMap<Integer, MinuteSignalInfo> treeMap = new TreeMap<Integer, MinuteSignalInfo>();
treeMap.putAll(rssiTime);
String macLocs = "";
Collection<Integer> tss = treeMap.keySet();
Iterator<Integer> it = tss.iterator();
while(it.hasNext()) {
int tsKey = it.next();
macLocs += String.valueOf(tsKey) + ",";
macLocs += String.valueOf(treeMap.get(tsKey).sensor) + ";";
}
macInfo.set(macLocs);
context.write(key, macInfo);
}
}
public static class MacInfoTestReducer
extends Reducer<Text,IntArrayWritable,Text,Text> {
private Text macInfo = new Text();
public void reduce(Text key, Iterable<IntArrayWritable> values,
Context context
) throws IOException, InterruptedException {
String tmp = "";
for (IntArrayWritable val : values) {
for(Writable element : val.get()) {
IntWritable eleVal = (IntWritable)element;
int valval = eleVal.get();
tmp += String.valueOf(valval) + " ";
}
}
macInfo.set(tmp);
context.write(key, macInfo);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
/*
Job job = new Job(conf, "word count");
job.setJarByClass(RawInputText.class);
job.setMapperClass(TextToRecordMapper.class);
job.setReducerClass(MacOneSensorSigCntReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.waitForCompletion(true);
*/
Job secondJob = new Job(conf, "word count 2");
secondJob.setJarByClass(RawInputText.class);
FileInputFormat.addInputPath(secondJob, new Path(otherArgs[1]));
secondJob.setInputFormatClass(SequenceFileInputFormat.class);
secondJob.setMapperClass(MacSensorsTimeLocMapper.class);
//secondJob.setMapperClass(Mapper.class);
secondJob.setMapOutputKeyClass(Text.class);
secondJob.setMapOutputValueClass(IntArrayWritable.class);
secondJob.setReducerClass(MacInfoTestReducer.class);
//secondJob.setOutputKeyClass(Text.class);
//secondJob.setOutputValueClass(IntArrayWritable.class);
FileOutputFormat.setOutputPath(secondJob, new Path(otherArgs[2]));
System.exit(secondJob.waitForCompletion(true) ? 0 : 1);
}
}
package com.hicapt.xike;
public class MinuteSignalInfo {
public int sensor;
public int rssi;
public int count;
public MinuteSignalInfo() {
rssi = 0;
count = 0;
sensor = 0;
}
}
package com.hicapt.xike;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
public class IntArrayWritable extends ArrayWritable {
public IntArrayWritable() {
super(IntWritable.class);
}
/*
public void readFields(DataInput in) throws IOException{
super.readFields(in);
}
public void write(DataOutput out) throws IOException{
super.write(out);
}*/
}
I'm using Hadoop to analyze GSOD data (ftp://ftp.ncdc.noaa.gov/pub/data/gsod/).
I chose 5 years for my experiments (2005 - 2009).
I've configured a little cluster and executed a simple MapReduce program that gets the maximum temperature registered for a year.
Now I have to create a new MR program that counts, for each station, all the phenomena occurrences over those years.
The files that I have to analyze have this structure:
STN--- ... FRSHTT
722115 110001
722115 011001
722110 111000
722110 001000
722000 001000
The column STN means the station code and FRSHTT means the phenomena:
F - Fog, R - Rain or drizzle, S - Snow or ice pellets, H - Hail, T - Thunder, O - Tornado or funnel cloud.
A value of 1 means that the phenomenon occurred on that day; 0 means it did not. For example, 110001 means fog, rain or drizzle, and a tornado or funnel cloud occurred on that day.
I need to find results like following:
722115: F = 1, R = 2, S = 1, O = 2
722110: F = 1, R = 1, S = 2
722000: S = 1
I could run the MR program, but the results are wrong; it gives me these results:
722115 F, 1
722115 R, 1
722115 R, 1
722115 S, 1
722115 O, 1
722115 O, 1
722110 F, 1
722110 R, 1
722110 S, 1
722110 S, 1
722000 S, 1
I have used this code:
Mapper.java
public class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, StationPhenomenun, IntWritable> {
@Override
protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
String line = value.toString();
// Every file starts with a field description line, so, I ignore this line
if (!line.startsWith("STN---")) {
// First field of the line means the station code where data was collected
String station = line.substring(0, 6);
String fog = (line.substring(132, 133));
String rainOrDrizzle = (line.substring(133, 134));
String snowOrIcePellets = (line.substring(134, 135));
String hail = (line.substring(135, 136));
String thunder = (line.substring(136, 137));
String tornadoOrFunnelCloud = (line.substring(137, 138));
if (fog.equals("1"))
context.write(new StationPhenomenun(station,"F"), new IntWritable(1));
if (rainOrDrizzle.equals("1"))
context.write(new StationPhenomenun(station,"R"), new IntWritable(1));
if (snowOrIcePellets.equals("1"))
context.write(new StationPhenomenun(station,"S"), new IntWritable(1));
if (hail.equals("1"))
context.write(new StationPhenomenun(station,"H"), new IntWritable(1));
if (thunder.equals("1"))
context.write(new StationPhenomenun(station,"T"), new IntWritable(1));
if (tornadoOrFunnelCloud.equals("1"))
context.write(new StationPhenomenun(station,"O"), new IntWritable(1));
}
}
}
Reducer.java
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<StationPhenomenun, IntWritable, StationPhenomenun, IntWritable> {
protected void reduce(StationPhenomenun key, Iterable<IntWritable> values, org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count++;
}
String station = key.getStation().toString();
String occurence = key.getPhenomenun().toString();
StationPhenomenun textPair = new StationPhenomenun(station, occurence);
context.write(textPair, new IntWritable(count));
}
}
StationPhenomenun.java
public class StationPhenomenun implements WritableComparable<StationPhenomenun> {
private String station;
private String phenomenun;
public StationPhenomenun(String station, String phenomenun) {
this.station = station;
this.phenomenun = phenomenun;
}
public StationPhenomenun() {
}
public String getStation() {
return station;
}
public String getPhenomenun() {
return phenomenun;
}
@Override
public void readFields(DataInput in) throws IOException {
station = in.readUTF();
phenomenun = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(station);
out.writeUTF(phenomenun);
}
@Override
public int compareTo(StationPhenomenun t) {
int cmp = this.station.compareTo(t.station);
if (cmp != 0) {
return cmp;
}
return this.phenomenun.compareTo(t.phenomenun);
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final StationPhenomenun other = (StationPhenomenun) obj;
if (this.station != other.station && (this.station == null || !this.station.equals(other.station))) {
return false;
}
if (this.phenomenun != other.phenomenun && (this.phenomenun == null || !this.phenomenun.equals(other.phenomenun))) {
return false;
}
return true;
}
@Override
public int hashCode() {
return this.station.hashCode() * 163 + this.phenomenun.hashCode();
}
}
NcdcJob.java
public class NcdcJob {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(NcdcJob.class);
FileInputFormat.addInputPath(job, new Path("/user/hadoop/input"));
FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/station"));
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
job.setMapOutputKeyClass(StationPhenomenun.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(StationPhenomenun.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Has anyone done something similar?
PS: I have tried this solution (Hadoop - composite key), but it did not work for me.
Just check whether the following two classes match your custom implementation.
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
I was able to get the desired result with the following changes:
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
protected void reduce(StationPhenomenun key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
I also changed the class names to MyMapper and MyReducer, so that the mapper and reducer set in the driver unambiguously refer to my own classes.
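A minimal sketch of the matching driver wiring, assuming those renamed classes, so that Mapper.class and Reducer.class can no longer silently resolve to Hadoop's base classes:
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);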
722115,1,1,0,0,0,1
722115,0,1,1,0,0,1
722110,1,1,1,0,0,0
722110,0,0,1,0,0,0
722000,0,0,1,0,0,0
For this input set, I could get the following result
StationPhenomenun [station=722000, phenomenun=S] 1
StationPhenomenun [station=722110, phenomenun=F] 1
StationPhenomenun [station=722110, phenomenun=R] 1
StationPhenomenun [station=722110, phenomenun=S] 2
StationPhenomenun [station=722115, phenomenun=F] 1
StationPhenomenun [station=722115, phenomenun=O] 2
StationPhenomenun [station=722115, phenomenun=R] 2
StationPhenomenun [station=722115, phenomenun=S] 1
The computation is the same; you just need to customize how the output is displayed.
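One possible way to do that (a sketch, not the only option): since TextOutputFormat writes keys by calling toString(), you could override toString() in StationPhenomenun to print the station and phenomenon in whatever layout you prefer, for example:
// hypothetical display-only override added to StationPhenomenun
@Override
public String toString() {
    return station + ": " + phenomenun;
}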
I have this Hadoop MapReduce code that works on graph data (in adjacency-list form) and is broadly similar to an in-adjacency-list to out-adjacency-list transformation. The main MapReduce task code is the following:
public class TestTask extends Configured
implements Tool {
public static class TTMapper extends MapReduceBase
implements Mapper<Text, TextArrayWritable, Text, NeighborWritable> {
@Override
public void map(Text key,
TextArrayWritable value,
OutputCollector<Text, NeighborWritable> output,
Reporter reporter) throws IOException {
int numNeighbors = value.get().length;
double weight = (double)1 / numNeighbors;
Text[] neighbors = (Text[]) value.toArray();
NeighborWritable me = new NeighborWritable(key, new DoubleWritable(weight));
for (int i = 0; i < neighbors.length; i++) {
output.collect(neighbors[i], me);
}
}
}
public static class TTReducer extends MapReduceBase
implements Reducer<Text, NeighborWritable, Text, Text> {
@Override
public void reduce(Text key,
Iterator<NeighborWritable> values,
OutputCollector<Text, Text> output,
Reporter arg3)
throws IOException {
ArrayList<NeighborWritable> neighborList = new ArrayList<NeighborWritable>();
while(values.hasNext()) {
neighborList.add(values.next());
}
NeighborArrayWritable neighbors = new NeighborArrayWritable
(neighborList.toArray(new NeighborWritable[0]));
Text out = new Text(neighbors.toString());
output.collect(key, out);
}
}
@Override
public int run(String[] arg0) throws Exception {
JobConf conf = Util.getMapRedJobConf("testJob",
SequenceFileInputFormat.class,
TTMapper.class,
Text.class,
NeighborWritable.class,
1,
TTReducer.class,
Text.class,
Text.class,
TextOutputFormat.class,
"test/in",
"test/out");
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new TestTask(), args);
System.exit(res);
}
}
The auxiliary code is the following:
TextArrayWritable:
public class TextArrayWritable extends ArrayWritable {
public TextArrayWritable() {
super(Text.class);
}
public TextArrayWritable(Text[] values) {
super(Text.class, values);
}
}
NeighborWritable:
public class NeighborWritable implements Writable {
private Text nodeId;
private DoubleWritable weight;
public NeighborWritable(Text nodeId, DoubleWritable weight) {
this.nodeId = nodeId;
this.weight = weight;
}
public NeighborWritable () { }
public Text getNodeId() {
return nodeId;
}
public DoubleWritable getWeight() {
return weight;
}
public void setNodeId(Text nodeId) {
this.nodeId = nodeId;
}
public void setWeight(DoubleWritable weight) {
this.weight = weight;
}
@Override
public void readFields(DataInput in) throws IOException {
nodeId = new Text();
nodeId.readFields(in);
weight = new DoubleWritable();
weight.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
nodeId.write(out);
weight.write(out);
}
public String toString() {
return "NW[nodeId=" + (nodeId != null ? nodeId.toString() : "(null)") +
",weight=" + (weight != null ? weight.toString() : "(null)") + "]";
}
public boolean equals(Object o) {
if (!(o instanceof NeighborWritable)) {
return false;
}
NeighborWritable that = (NeighborWritable)o;
return (nodeId.equals(that.getNodeId()) && (weight.equals(that.getWeight())));
}
}
and the Util class:
public class Util {
public static JobConf getMapRedJobConf(String jobName,
Class<? extends InputFormat> inputFormatClass,
Class<? extends Mapper> mapperClass,
Class<?> mapOutputKeyClass,
Class<?> mapOutputValueClass,
int numReducer,
Class<? extends Reducer> reducerClass,
Class<?> outputKeyClass,
Class<?> outputValueClass,
Class<? extends OutputFormat> outputFormatClass,
String inputDir,
String outputDir) throws IOException {
JobConf conf = new JobConf();
if (jobName != null)
conf.setJobName(jobName);
conf.setInputFormat(inputFormatClass);
conf.setMapperClass(mapperClass);
if (numReducer == 0) {
conf.setNumReduceTasks(0);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
} else {
// may set actual number of reducers
// conf.setNumReduceTasks(numReducer);
conf.setMapOutputKeyClass(mapOutputKeyClass);
conf.setMapOutputValueClass(mapOutputValueClass);
conf.setReducerClass(reducerClass);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
}
// delete the existing target output folder
FileSystem fs = FileSystem.get(conf);
fs.delete(new Path(outputDir), true);
// specify input and output DIRECTORIES (not files)
FileInputFormat.addInputPath(conf, new Path(inputDir));
FileOutputFormat.setOutputPath(conf, new Path(outputDir));
return conf;
}
}
My input is the following graph (stored in binary format; the text form is shown here):
1 2
2 1,3,5
3 2,4
4 3,5
5 2,4
According to the logic of the code, the output should be:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],NW[nodeId=1,weight=1.0],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
But the output comes out as:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
I cannot understand why the expected output is not being produced. Any help will be appreciated.
Thanks.
You're falling foul of object re-use:
while(values.hasNext()) {
neighborList.add(values.next());
}
values.next() will return the same object reference, but the underlying contents of that object will change for each iteration (the readFields method is called to re-populate the contents).
I suggest you amend it to the following (you'll need to obtain the Configuration conf variable from a setup method, unless you can obtain it from the Reporter or OutputCollector - sorry, I don't use the old API):
while (values.hasNext()) {
    neighborList.add(
        ReflectionUtils.copy(conf, values.next(), new NeighborWritable()));
}
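If you'd rather not construct the copy target yourself, a roughly equivalent alternative is WritableUtils.clone (assuming org.apache.hadoop.io.WritableUtils, which ships with standard Hadoop):
while (values.hasNext()) {
    // clone() deserializes a fresh NeighborWritable for each value
    neighborList.add(WritableUtils.clone(values.next(), conf));
}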
But then I still can't understand why my unit test passed. Here is the code:
public class UWLTInitReducerTest {
private Text key;
private Iterator<NeighborWritable> values;
private NeighborArrayWritable nodeData;
private TTReducer reducer;
/**
* Set up the states for calling the map function
*/
@Before
public void setUp() throws Exception {
key = new Text("1001");
NeighborWritable[] neighbors = new NeighborWritable[4];
for (int i = 0; i < 4; i++) {
neighbors[i] = new NeighborWritable(new Text("300" + i), new DoubleWritable((double) 1 / (1 + i)));
}
values = Arrays.asList(neighbors).iterator();
nodeData = new NeighborArrayWritable(neighbors);
reducer = new TTReducer();
}
/**
* Test method for InitModelMapper#map - valid input
*/
@Test
public void testMapValid() {
// mock the output object
OutputCollector<Text, UWLTNodeData> output = mock(OutputCollector.class);
try {
// call the API
reducer.reduce(key, values, output, null);
// in order (sequential) verification of the calls to output.collect()
verify(output).collect(key, nodeData);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
Why didn't this code catch the bug?