Hadoop and Custom Writable Issue - hadoop

I am using Hadoop 2.7 and I have got an issue when using a custom Writable "TextPair" (page 104 of the Definitive Guide). Basically, my program works fine when I am using just Text, whereas it outputs "test.TextTuple@3b86249a test.TextTuple@63cd18fd" when using the TextPair.
Please, any idea what is wrong with my code (below)?
============
Mapper1:
public class KWMapper extends Mapper<LongWritable, Text, TextTuple, TextTuple> {
@Override
public void map(LongWritable k, Text v, Mapper.Context c) throws IOException, InterruptedException {
String keywordRelRecord[] = v.toString().split(",");
String subTopicID = keywordRelRecord[0];
String paperID = keywordRelRecord[1];
//set the KEY
TextTuple key = new TextTuple();
key.setNaturalKey(new Text(subTopicID));
key.setSecondaryKey(new Text("K"));
//set the VALUE
TextTuple value = new TextTuple();
value.setNaturalKey(new Text(paperID));
value.setSecondaryKey(new Text("K"));
c.write(key, value);
}
}
Mapper2:
public class TDMapper extends Mapper<LongWritable, Text, TextTuple, TextTuple> {
@Override
public void map(LongWritable k, Text v, Mapper.Context c) throws IOException, InterruptedException {
String topicRecord[] = v.toString().split(",");
String superTopicID = topicRecord[0];
String subTopicID = topicRecord[1].substring(1, topicRecord[1].length() - 1);
TextTuple key = new TextTuple();
key.setNaturalKey(new Text(subTopicID));
key.setSecondaryKey(new Text("T"));
TextTuple value = new TextTuple();
value.setNaturalKey(new Text(superTopicID));
value.setSecondaryKey(new Text("T"));
c.write(key, value);
}
}
REDUCER :
public class TDKRReducer extends Reducer<TextTuple, TextTuple, Text, Text>{
public void reduce(TextTuple k, Iterable<TextTuple> values, Reducer.Context c) throws IOException, InterruptedException{
for (TextTuple val : values) {
c.write(k.getNaturalKey(), val.getNaturalKey());
}
}
}
DRIVER:
public class TDDriver {
public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException {
// This class support the user for the configuration of the execution;
Configuration confStage1 = new Configuration();
Job job1 = new Job(confStage1, "TopDecKeywordRel");
// Setting the driver class
job1.setJarByClass(TDDriver.class);
// Setting the input Files and processing them using the corresponding mapper class
MultipleInputs.addInputPath(job1, new Path(args[0]), TextInputFormat.class, TDMapper.class);
MultipleInputs.addInputPath(job1, new Path(args[1]), TextInputFormat.class, KWMapper.class);
job1.setMapOutputKeyClass(TextTuple.class);
job1.setMapOutputValueClass(TextTuple.class);
// Setting the Reducer Class;
job1.setReducerClass(TDKRReducer.class);
// Setting the output class for the Key-value pairs
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
// Setting the output file
Path outputPA = new Path(args[2]);
FileOutputFormat.setOutputPath(job1, outputPA);
// Submitting the Job Monitoring the execution of the Job
System.exit(job1.waitForCompletion(true) ? 0 : 1);
//conf.setPartitionerClass(CustomPartitioner.class);
}
}
CUSTOM VARIABLE
public class TextTuple implements Writable, WritableComparable<TextTuple> {
private Text naturalKey;
private Text secondaryKey;
public TextTuple() {
this.naturalKey = new Text();
this.secondaryKey = new Text();
}
public void setNaturalKey(Text naturalKey) {
this.naturalKey = naturalKey;
}
public void setSecondaryKey(Text secondaryKey) {
this.secondaryKey = secondaryKey;
}
public Text getNaturalKey() {
return naturalKey;
}
public Text getSecondaryKey() {
return secondaryKey;
}
@Override
public void write(DataOutput out) throws IOException {
naturalKey.write(out);
secondaryKey.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
naturalKey.readFields(in);
secondaryKey.readFields(in);
}
//This comparator controls the sort order of the keys.
@Override
public int compareTo(TextTuple o) {
// comparing the naturalKey
int compareValue = this.naturalKey.compareTo(o.naturalKey);
if (compareValue == 0) {
compareValue = this.secondaryKey.compareTo(o.secondaryKey);
}
return -1 * compareValue;
}
}
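For reference, the book's TextPair (which TextTuple mirrors) also overrides toString(), hashCode(), and equals(). A hedged sketch of the analogous methods for TextTuple is below; these are not in the original post. The toString() is what TextOutputFormat falls back to when it writes a non-Text key or value (see the "Custom WritableCompare displays object reference as output" thread further down, which explains the object-reference output), and hashCode() is what the default HashPartitioner uses to route keys.
// Hypothetical additions to TextTuple, modeled on the Definitive Guide's TextPair.
@Override
public String toString() {
    // TextOutputFormat calls toString() on non-Text keys/values.
    return naturalKey + "\t" + secondaryKey;
}

@Override
public int hashCode() {
    // Used by the default HashPartitioner to assign keys to reducers.
    return naturalKey.hashCode() * 163 + secondaryKey.hashCode();
}

@Override
public boolean equals(Object o) {
    if (o instanceof TextTuple) {
        TextTuple other = (TextTuple) o;
        return naturalKey.equals(other.naturalKey)
            && secondaryKey.equals(other.secondaryKey);
    }
    return false;
}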

Related

WritableComparable object is not serializable

I have the following classes for MR jobs, but when I run the job it fails with the exception below. Kindly suggest.
public class MongoKey implements WritableComparable<MongoKey> {
...
private Text name;
private Text place;
public MongoKey() {
this.name = new Text();
this.place = new Text();
}
public MongoKey(Text name, Text place) {
this.name = name;
this.place = place;
}
public void readFields(DataInput in) throws IOException {
name.readFields(in);
place.readFields(in);
}
public void write(DataOutput out) throws IOException {
name.write(out);
place.write(out);
}
public int compareTo(MongoKey o) {
MongoKey other = (MongoKey)o;
int cmp = name.compareTo(other.name);
if(cmp != 0){
return cmp;
}
return place.compareTo(other.place);
}
}
public class MongoValue implements Writable {
...
public void readFields(DataInput in) throws IOException {
profession.readFields(in);
}
public void write(DataOutput out) throws IOException {
profession.write(out);
}
}
public class MongoReducer extends Reducer<MongoKey, MongoValue, MongoKey, BSONWritable> {
...
context.write(key, new BSONWritable(output)); // line 41
}
public class MongoHadoopJobRunner extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.println("usage: [input] [output]");
System.exit(-1);
}
Configuration conf = getConf();
for (String arg : args)
System.out.println(arg);
GenericOptionsParser parser = new GenericOptionsParser(conf, args);
conf.set("mongo.output.uri", "mongodb://localhost/demo.logs_aggregate");
MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/demo.logs_aggregate");
MongoConfigUtil.setOutputFormat(conf, MongoOutputFormat.class);
final Job job = new Job(conf, "mongo_hadoop");
job.setOutputFormatClass(MongoOutputFormat.class);
// Job job = new Job();
job.setJarByClass(MongoHadoopJobRunner.class);
// job.setJobName("mongo_hadoop");
job.setNumReduceTasks(1);
job.setMapperClass(MongoMapper.class);
job.setReducerClass(MongoReducer.class);
job.setMapOutputKeyClass(MongoKey.class);
job.setMapOutputValueClass(MongoValue.class);
job.setOutputKeyClass(MongoKey.class);
job.setOutputValueClass(BSONWritable.class);
job.setInputFormatClass(MongoInputFormat.class);
for (String arg2 : parser.getRemainingArgs()) {
System.out.println("remaining: " + arg2);
}
Path inPath = new Path(parser.getRemainingArgs()[0]);
MongoInputFormat.addInputPath(job, inPath);
job.waitForCompletion(true);
return 0;
}
public static void main(String[] pArgs) throws Exception {
Configuration conf = new Configuration();
for (String arg : pArgs) {
System.out.println(arg);
}
GenericOptionsParser parser = new GenericOptionsParser(conf, pArgs);
for (String arg2 : parser.getRemainingArgs()) {
System.out.println("ree" + arg2);
}
System.exit(ToolRunner.run(conf, new MongoHadoopJobRunner(), parser
.getRemainingArgs()));
}
}
With the following exception
java.lang.Exception: java.lang.IllegalArgumentException: can't serialize class com.name.custom.MongoKey
...
...
at com.mongodb.hadoop.output.MongoRecordWriter.write(MongoRecordWriter.java:93)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:558)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
at com.name.custom.MongoReducer.reduce(MongoReducer.java:41)
at com.name.custom.MongoReducer.reduce(MongoReducer.java:11)
It seems there should not be any issue with the code, but I am totally clueless as to why it is unable to serialize the fields.
Thanks very much in advance.
As I see from the MongoRecordWriter source code, it does not support an arbitrary WritableComparable object as the key. You can use one of these classes as the key: BSONWritable, BSONObject, Text, UTF8, or simple wrappers like IntWritable. I also think you can use a Serializable object as the key. So I can suggest two workarounds:
Make your MongoKey serializable (implement Serializable and the writeObject/readObject methods).
Use one of the supported classes as the key; for example, you can use Text: Text key = new Text(name.toString() + "\t" + place.toString());
This exception:
java.lang.Exception: java.lang.IllegalArgumentException: can't serialize class com.name.custom.MongoKey
is raised because MongoKey doesn't implement java.io.Serializable.
Add Serializable to your class declaration.
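A minimal sketch of that first workaround, assuming the rest of the job stays as posted. Because Hadoop's Text is not itself Serializable, the fields are made transient and written out as strings in writeObject()/readObject(), as the first answer suggests; this is an illustration, not code tested against the mongo-hadoop connector.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

// Sketch: MongoKey additionally implements java.io.Serializable.
// Text is not Serializable, so the fields are kept transient and
// (de)serialized as plain strings in writeObject()/readObject().
public class MongoKey implements WritableComparable<MongoKey>, Serializable {

    private static final long serialVersionUID = 1L;

    private transient Text name = new Text();
    private transient Text place = new Text();

    public MongoKey() { }

    public MongoKey(Text name, Text place) {
        this.name = name;
        this.place = place;
    }

    // Hadoop Writable serialization (unchanged from the original class).
    @Override
    public void readFields(DataInput in) throws IOException {
        name.readFields(in);
        place.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        name.write(out);
        place.write(out);
    }

    @Override
    public int compareTo(MongoKey other) {
        int cmp = name.compareTo(other.name);
        return cmp != 0 ? cmp : place.compareTo(other.place);
    }

    // Java serialization, used when the key is written as a Serializable object.
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeUTF(name.toString());
        out.writeUTF(place.toString());
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        name = new Text(in.readUTF());
        place = new Text(in.readUTF());
    }
}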

Example for running mapreduce on hdfs files and storing reducer results in hbase table

Can somebody give a good example link for MapReduce with HBase? My requirement is to run MapReduce on an HDFS file and store the reducer output in an HBase table. The mapper input will be an HDFS file and its output will be Text/IntWritable key-value pairs. The reducer's output will be a Put object, i.e. it sums the reducer's Iterable of IntWritable values and stores the result in an HBase table.
Here is the code which will solve your problem
Driver
HBaseConfiguration conf = HBaseConfiguration.create();
Job job = new Job(conf,"JOB_NAME");
job.setJarByClass(yourclass.class);
job.setMapperClass(yourMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(inputPath));
TableMapReduceUtil.initTableReducerJob(TABLE,
yourReducer.class, job);
job.setReducerClass(yourReducer.class);
job.waitForCompletion(true);
Mapper&Reducer
class yourMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
// @Override map()
}
class yourReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
// @Override reduce()
}
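The skeleton above leaves the method bodies out. As a concrete (hedged) illustration of the question's requirement (sum the IntWritable values and store the total via a Put), the reducer could look roughly like this; the class name, column family "cf", and qualifier "count" are assumptions, and put.add(...) reflects the older HBase API used in this answer (newer versions use addColumn).
import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical reducer for the requirement above: sum the IntWritable values
// for each Text key and store the total in the target HBase table via a Put.
public class SumToHBaseReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        byte[] rowKey = Bytes.toBytes(key.toString());
        Put put = new Put(rowKey);
        // Column family "cf" and qualifier "count" are illustrative choices.
        put.add(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(sum));
        context.write(new ImmutableBytesWritable(rowKey), put);
    }
}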
Check the code below, which works fine for me with Phoenix, HBase and MapReduce.
This program will read data from an HBase table and insert the result into another table after the map-reduce job.
Tables: STOCK, STOCK_STATS
StockComputationJob.java
public class StockComputationJob {
public static class StockMapper extends Mapper<NullWritable, StockWritable, Text, DoubleWritable> {
private Text stock = new Text();
private DoubleWritable price = new DoubleWritable ();
@Override
protected void map(NullWritable key, StockWritable stockWritable, Context context) throws IOException, InterruptedException {
double[] recordings = stockWritable.getRecordings();
final String stockName = stockWritable.getStockName();
System.out.println("Map-"+recordings);
double maxPrice = Double.MIN_VALUE;
for(double recording : recordings) {
System.out.println("M-"+key+"-"+recording);
if(maxPrice < recording) {
maxPrice = recording;
}
}
System.out.println(stockName+"--"+maxPrice);
stock.set(stockName);
price.set(maxPrice);
context.write(stock,price);
}
}
public static void main(String[] args) throws Exception {
final Configuration conf = new Configuration();
HBaseConfiguration.addHbaseResources(conf);
conf.set(HConstants.ZOOKEEPER_QUORUM, zkUrl);
final Job job = Job.getInstance(conf, "stock-stats-job");
// We can either specify a selectQuery or ignore it when we would like to retrieve all the columns
final String selectQuery = "SELECT STOCK_NAME,RECORDING_YEAR,RECORDINGS_QUARTER FROM STOCK ";
// StockWritable is the DBWritable class that enables us to process the Result of the above query
PhoenixMapReduceUtil.setInput(job,StockWritable.class,"STOCK",selectQuery);
// Set the target Phoenix table and the columns
PhoenixMapReduceUtil.setOutput(job, "STOCK_STATS", "STOCK_NAME,MAX_RECORDING");
job.setMapperClass(StockMapper.class);
job.setReducerClass(StockReducer.class);
job.setOutputFormatClass(PhoenixOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(StockWritable.class);
TableMapReduceUtil.addDependencyJars(job);
job.waitForCompletion(true);
}
}
StockReducer.java
public class StockReducer extends Reducer<Text, DoubleWritable, NullWritable , StockWritable> {
protected void reduce(Text key, Iterable<DoubleWritable> recordings, Context context) throws IOException, InterruptedException {
double maxPrice = Double.MIN_VALUE;
System.out.println(recordings);
for(DoubleWritable recording : recordings) {
System.out.println("R-"+key+"-"+recording);
if(maxPrice < recording.get()) {
maxPrice = recording.get();
}
}
final StockWritable stock = new StockWritable();
stock.setStockName(key.toString());
stock.setMaxPrice(maxPrice);
System.out.println(key+"--"+maxPrice);
context.write(NullWritable.get(),stock);
}
}
StockWritable.java
public class StockWritable implements DBWritable,Writable {
private String stockName;
private int year;
private double[] recordings;
private double maxPrice;
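// Note: readFields(DataInput)/write(DataOutput) below are left empty in the
// original answer; StockWritable is only used as the Phoenix map input and
// reducer output here (the shuffle uses Text/DoubleWritable), so Hadoop's
// Writable serialization is presumably never exercised for it.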
public void readFields(DataInput input) throws IOException {
}
public void write(DataOutput output) throws IOException {
}
public void readFields(ResultSet rs) throws SQLException {
stockName = rs.getString("STOCK_NAME");
setYear(rs.getInt("RECORDING_YEAR"));
final Array recordingsArray = rs.getArray("RECORDINGS_QUARTER");
setRecordings((double[])recordingsArray.getArray());
}
public void write(PreparedStatement pstmt) throws SQLException {
pstmt.setString(1, stockName);
pstmt.setDouble(2, maxPrice);
}
public int getYear() {
return year;
}
public void setYear(int year) {
this.year = year;
}
public double[] getRecordings() {
return recordings;
}
public void setRecordings(double[] recordings) {
this.recordings = recordings;
}
public double getMaxPrice() {
return maxPrice;
}
public void setMaxPrice(double maxPrice) {
this.maxPrice = maxPrice;
}
public String getStockName() {
return stockName;
}
public void setStockName(String stockName) {
this.stockName = stockName;
}
}

Custom WritableCompare displays object reference as output

I am new to Hadoop and Java, and I feel there is something obvious I am just missing. I am using Hadoop 1.0.3 if that means anything.
My goal for using Hadoop is to take a bunch of files and parse them one file at a time (as opposed to line by line). Each file will produce multiple key-values, but context from the other lines is important. The key and value are multi-value/composite, so I have implemented WritableComparable for the key and Writable for the value. Because the processing of each file takes a bit of CPU, I want to save the output of the mapper, then run multiple reducers later on.
For the composite keys, I followed http://stackoverflow.com/questions/12427090/hadoop-composite-key
The problem is, the output is just Java object references as opposed to the composite key and value. Example:
LinkKeyWritable@bd2f9730 LinkValueWritable@8752408c
I am not sure if the problem is related to not reducing the data at all or
Here is my main class:
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Parser.class);
conf.setJobName("raw_parser");
conf.setOutputKeyClass(LinkKeyWritable.class);
conf.setOutputValueClass(LinkValueWritable.class);
conf.setMapperClass(RawMap.class);
conf.setNumMapTasks(0);
conf.setInputFormat(PerFileInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
PerFileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
And my Mapper class:
public class RawMap extends MapReduceBase implements Mapper<NullWritable, Text, LinkKeyWritable, LinkValueWritable> {
public void map(NullWritable key, Text value,
OutputCollector<LinkKeyWritable, LinkValueWritable> output,
Reporter reporter) throws IOException {
String json = value.toString();
SerpyReader reader = new SerpyReader(json);
GoogleParser parser = new GoogleParser(reader);
for (String page : reader.getPages()) {
String content = reader.readPageContent(page);
parser.addPage(content);
}
for (Link link : parser.getLinks()) {
LinkKeyWritable linkKey = new LinkKeyWritable(link);
LinkValueWritable linkValue = new LinkValueWritable(link);
output.collect(linkKey, linkValue);
}
}
}
Link is basically a struct of various pieces of information that gets split between LinkKeyWritable and LinkValueWritable.
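For reference, a hedged reconstruction of Link based only on the fields the Writable implementations below read and write; the real class may differ.
// Hypothetical sketch of the Link holder, inferred from the fields used in
// LinkKeyWritable and LinkValueWritable below; the actual class is not posted.
public class Link {
    public long batchDay;
    public String source;
    public String domain;
    public String path;
    public String type;
    public String description;
}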
LinkKeyWritable:
public class LinkKeyWritable implements WritableComparable<LinkKeyWritable>{
protected Link link;
public LinkKeyWritable() {
super();
link = new Link();
}
public LinkKeyWritable(Link link) {
super();
this.link = link;
}
@Override
public void readFields(DataInput in) throws IOException {
link.batchDay = in.readLong();
link.source = in.readUTF();
link.domain = in.readUTF();
link.path = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(link.batchDay);
out.writeUTF(link.source);
out.writeUTF(link.domain);
out.writeUTF(link.path);
}
@Override
public int compareTo(LinkKeyWritable o) {
return ComparisonChain.start().
compare(link.batchDay, o.link.batchDay).
compare(link.domain, o.link.domain).
compare(link.path, o.link.path).
result();
}
@Override
public int hashCode() {
return Objects.hashCode(link.batchDay, link.source, link.domain, link.path);
}
@Override
public boolean equals(final Object obj){
if(obj instanceof LinkKeyWritable) {
final LinkKeyWritable o = (LinkKeyWritable)obj;
return Objects.equal(link.batchDay, o.link.batchDay)
&& Objects.equal(link.source, o.link.source)
&& Objects.equal(link.domain, o.link.domain)
&& Objects.equal(link.path, o.link.path);
}
return false;
}
}
LinkValueWritable:
public class LinkValueWritable implements Writable{
protected Link link;
public LinkValueWritable() {
link = new Link();
}
public LinkValueWritable(Link link) {
this.link = new Link();
this.link.type = link.type;
this.link.description = link.description;
}
@Override
public void readFields(DataInput in) throws IOException {
link.type = in.readUTF();
link.description = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(link.type);
out.writeUTF(link.description);
}
@Override
public int hashCode() {
return Objects.hashCode(link.type, link.description);
}
@Override
public boolean equals(final Object obj){
if(obj instanceof LinkKeyWritable) {
final LinkKeyWritable o = (LinkKeyWritable)obj;
return Objects.equal(link.type, o.link.type)
&& Objects.equal(link.description, o.link.description);
}
return false;
}
}
I think the answer is in the implementation of the TextOutputFormat. Specifically, the LineRecordWriter's writeObject method:
/**
* Write the object to the byte stream, handling Text as a special
* case.
* @param o the object to print
* @throws IOException if the write throws, we pass it on
*/
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
out.write(o.toString().getBytes(utf8));
}
}
As you can see, if your key or value is not a Text object, it calls the toString method on it and writes that out. Since you've left toString unimplemented in your key and value, it's using the Object class's implementation, which is writing out the reference.
I'd say that you should try writing an appropriate toString function or using a different OutputFormat.
It looks like you have a list of objects just like you wanted. You need to implement toString() on your writable if you want a human-readable version printed out instead of an ugly java reference.
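As both answers say, the minimal fix is a toString() override. A hedged sketch for LinkKeyWritable is below (the tab separator and field order are arbitrary choices); LinkValueWritable would get a similar method.
// Added inside LinkKeyWritable; mirrors the fields serialized in write().
@Override
public String toString() {
    return link.batchDay + "\t" + link.source + "\t" + link.domain + "\t" + link.path;
}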

Type mismatch in key from map, using SequenceFileInputFormat correctly

I am trying to run a recommender example from chapter 6 (listings 6.1 ~ 6.4) of the ebook Mahout in Action. There are two mapper/reducer pairs. Here is the code:
Mapper - 1
public class WikipediaToItemPrefsMapper extends
Mapper<LongWritable,Text,VarLongWritable,VarLongWritable> {
private static final Pattern NUMBERS = Pattern.compile("(\\d+)");
@Override
public void map(LongWritable key,
Text value,
Context context)
throws IOException, InterruptedException {
String line = value.toString();
Matcher m = NUMBERS.matcher(line);
m.find();
VarLongWritable userID = new VarLongWritable(Long.parseLong(m.group()));
VarLongWritable itemID = new VarLongWritable();
while (m.find()) {
itemID.set(Long.parseLong(m.group()));
context.write(userID, itemID);
}
}
}
Reducer - 1
public class WikipediaToUserVectorReducer extends
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
@Override
public void reduce(VarLongWritable userID,
Iterable<VarLongWritable> itemPrefs,
Context context)
throws IOException, InterruptedException {
Vector userVector = new RandomAccessSparseVector(
Integer.MAX_VALUE, 100);
for (VarLongWritable itemPref : itemPrefs) {
userVector.set((int)itemPref.get(), 1.0f);
}
//LongWritable userID_lw = new LongWritable(userID.get());
context.write(userID, new VectorWritable(userVector));
//context.write(userID_lw, new VectorWritable(userVector));
}
}
The reducer outputs a userID and a userVector, and it looks like this: 98955 {590:1.0 22:1.0 9059:1.0 3:1.0 2:1.0 1:1.0} provided FileInputFormat and TextInputFormat are used in the driver.
I want to use another pair of mapper-reducer to process this data further:
Mapper - 2
public class UserVectorToCooccurenceMapper extends
Mapper<VarLongWritable,VectorWritable,IntWritable,IntWritable> {
@Override
public void map(VarLongWritable userID,
VectorWritable userVector,
Context context)
throws IOException, InterruptedException {
Iterator<Vector.Element> it = userVector.get().iterateNonZero();
while (it.hasNext()) {
int index1 = it.next().index();
Iterator<Vector.Element> it2 = userVector.get().iterateNonZero();
while (it2.hasNext()) {
int index2 = it2.next().index();
context.write(new IntWritable(index1),
new IntWritable(index2));
}
}
}
}
Reducer - 2
public class UserVectorToCooccurenceReducer extends
Reducer {
@Override
public void reduce(IntWritable itemIndex1,
Iterable<IntWritable> itemIndex2s,
Context context)
throws IOException, InterruptedException {
Vector cooccurrenceRow = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
for (IntWritable intWritable : itemIndex2s) {
int itemIndex2 = intWritable.get();
cooccurrenceRow.set(itemIndex2, cooccurrenceRow.get(itemIndex2) + 1.0);
}
context.write(itemIndex1, new VectorWritable(cooccurrenceRow));
}
}
This is the driver I am using:
public final class RecommenderJob extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job_preferenceValues = new Job (getConf());
job_preferenceValues.setJarByClass(RecommenderJob.class);
job_preferenceValues.setJobName("job_preferenceValues");
job_preferenceValues.setInputFormatClass(TextInputFormat.class);
job_preferenceValues.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job_preferenceValues, new Path(args[0]));
SequenceFileOutputFormat.setOutputPath(job_preferenceValues, new Path(args[1]));
job_preferenceValues.setMapOutputKeyClass(VarLongWritable.class);
job_preferenceValues.setMapOutputValueClass(VarLongWritable.class);
job_preferenceValues.setOutputKeyClass(VarLongWritable.class);
job_preferenceValues.setOutputValueClass(VectorWritable.class);
job_preferenceValues.setMapperClass(WikipediaToItemPrefsMapper.class);
job_preferenceValues.setReducerClass(WikipediaToUserVectorReducer.class);
job_preferenceValues.waitForCompletion(true);
Job job_cooccurence = new Job (getConf());
job_cooccurence.setJarByClass(RecommenderJob.class);
job_cooccurence.setJobName("job_cooccurence");
job_cooccurence.setInputFormatClass(SequenceFileInputFormat.class);
job_cooccurence.setOutputFormatClass(TextOutputFormat.class);
SequenceFileInputFormat.setInputPaths(job_cooccurence, new Path(args[1]));
FileOutputFormat.setOutputPath(job_cooccurence, new Path(args[2]));
job_cooccurence.setMapOutputKeyClass(VarLongWritable.class);
job_cooccurence.setMapOutputValueClass(VectorWritable.class);
job_cooccurence.setOutputKeyClass(IntWritable.class);
job_cooccurence.setOutputValueClass(VectorWritable.class);
job_cooccurence.setMapperClass(UserVectorToCooccurenceMapper.class);
job_cooccurence.setReducerClass(UserVectorToCooccurenceReducer.class);
job_cooccurence.waitForCompletion(true);
return 0;
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new RecommenderJob(), args);
}
}
The error that I get is:
java.io.IOException: Type mismatch in key from map: expected org.apache.mahout.math.VarLongWritable, received org.apache.hadoop.io.IntWritable
In the course of Googling for a fix, I found out that my issue is similar to this question. But the difference is that I am already using SequenceFileInputFormat and SequenceFileOutputFormat, I believe correctly. I also see that org.apache.mahout.cf.taste.hadoop.item.RecommenderJob does more or less the same thing. According to my understanding and the Yahoo tutorial:
SequenceFileOutputFormat rapidly serializes arbitrary data types to the file; the corresponding SequenceFileInputFormat will deserialize the file into the same types and presents the data to the next Mapper in the same manner as it was emitted by the previous Reducer.
What am I doing wrong? I would really appreciate some pointers from someone. I spent the day trying to fix this and got nowhere :(
Your second mapper has the following signature:
public class UserVectorToCooccurenceMapper extends
Mapper<VarLongWritable,VectorWritable,IntWritable,IntWritable>
But you define the following in your driver code:
job_cooccurence.setMapOutputKeyClass(VarLongWritable.class);
job_cooccurence.setMapOutputValueClass(VectorWritable.class);
The reducer is expecting <IntWritable, IntWritable> as input, so you should just amend your driver code to:
job_cooccurence.setMapOutputKeyClass(IntWritable.class);
job_cooccurence.setMapOutputValueClass(IntWritable.class);

Why Hadoop shuffle not working as expected

I have this Hadoop MapReduce code that works on graph data (in adjacency-list form) and is somewhat similar to an in-adjacency-list to out-adjacency-list transformation algorithm. The main MapReduce task code is the following:
public class TestTask extends Configured
implements Tool {
public static class TTMapper extends MapReduceBase
implements Mapper<Text, TextArrayWritable, Text, NeighborWritable> {
@Override
public void map(Text key,
TextArrayWritable value,
OutputCollector<Text, NeighborWritable> output,
Reporter reporter) throws IOException {
int numNeighbors = value.get().length;
double weight = (double)1 / numNeighbors;
Text[] neighbors = (Text[]) value.toArray();
NeighborWritable me = new NeighborWritable(key, new DoubleWritable(weight));
for (int i = 0; i < neighbors.length; i++) {
output.collect(neighbors[i], me);
}
}
}
public static class TTReducer extends MapReduceBase
implements Reducer<Text, NeighborWritable, Text, Text> {
@Override
public void reduce(Text key,
Iterator<NeighborWritable> values,
OutputCollector<Text, Text> output,
Reporter arg3)
throws IOException {
ArrayList<NeighborWritable> neighborList = new ArrayList<NeighborWritable>();
while(values.hasNext()) {
neighborList.add(values.next());
}
NeighborArrayWritable neighbors = new NeighborArrayWritable
(neighborList.toArray(new NeighborWritable[0]));
Text out = new Text(neighbors.toString());
output.collect(key, out);
}
}
@Override
public int run(String[] arg0) throws Exception {
JobConf conf = Util.getMapRedJobConf("testJob",
SequenceFileInputFormat.class,
TTMapper.class,
Text.class,
NeighborWritable.class,
1,
TTReducer.class,
Text.class,
Text.class,
TextOutputFormat.class,
"test/in",
"test/out");
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new TestTask(), args);
System.exit(res);
}
}
The auxiliary code is the following:
TextArrayWritable:
public class TextArrayWritable extends ArrayWritable {
public TextArrayWritable() {
super(Text.class);
}
public TextArrayWritable(Text[] values) {
super(Text.class, values);
}
}
NeighborWritable:
public class NeighborWritable implements Writable {
private Text nodeId;
private DoubleWritable weight;
public NeighborWritable(Text nodeId, DoubleWritable weight) {
this.nodeId = nodeId;
this.weight = weight;
}
public NeighborWritable () { }
public Text getNodeId() {
return nodeId;
}
public DoubleWritable getWeight() {
return weight;
}
public void setNodeId(Text nodeId) {
this.nodeId = nodeId;
}
public void setWeight(DoubleWritable weight) {
this.weight = weight;
}
@Override
public void readFields(DataInput in) throws IOException {
nodeId = new Text();
nodeId.readFields(in);
weight = new DoubleWritable();
weight.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
nodeId.write(out);
weight.write(out);
}
public String toString() {
return "NW[nodeId=" + (nodeId != null ? nodeId.toString() : "(null)") +
",weight=" + (weight != null ? weight.toString() : "(null)") + "]";
}
public boolean equals(Object o) {
if (!(o instanceof NeighborWritable)) {
return false;
}
NeighborWritable that = (NeighborWritable)o;
return (nodeId.equals(that.getNodeId()) && (weight.equals(that.getWeight())));
}
}
and the Util class:
public class Util {
public static JobConf getMapRedJobConf(String jobName,
Class<? extends InputFormat> inputFormatClass,
Class<? extends Mapper> mapperClass,
Class<?> mapOutputKeyClass,
Class<?> mapOutputValueClass,
int numReducer,
Class<? extends Reducer> reducerClass,
Class<?> outputKeyClass,
Class<?> outputValueClass,
Class<? extends OutputFormat> outputFormatClass,
String inputDir,
String outputDir) throws IOException {
JobConf conf = new JobConf();
if (jobName != null)
conf.setJobName(jobName);
conf.setInputFormat(inputFormatClass);
conf.setMapperClass(mapperClass);
if (numReducer == 0) {
conf.setNumReduceTasks(0);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
} else {
// may set actual number of reducers
// conf.setNumReduceTasks(numReducer);
conf.setMapOutputKeyClass(mapOutputKeyClass);
conf.setMapOutputValueClass(mapOutputValueClass);
conf.setReducerClass(reducerClass);
conf.setOutputKeyClass(outputKeyClass);
conf.setOutputValueClass(outputValueClass);
conf.setOutputFormat(outputFormatClass);
}
// delete the existing target output folder
FileSystem fs = FileSystem.get(conf);
fs.delete(new Path(outputDir), true);
// specify input and output DIRECTORIES (not files)
FileInputFormat.addInputPath(conf, new Path(inputDir));
FileOutputFormat.setOutputPath(conf, new Path(outputDir));
return conf;
}
}
My input is the following graph (stored in binary format; shown here in text form):
1 2
2 1,3,5
3 2,4
4 3,5
5 2,4
According to the logic of the code the output should be:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],NW[nodeId=1,weight=1.0],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=3,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=4,weight=0.5],}]
But the output is coming as:
1 NWArray[size=1,{NW[nodeId=2,weight=0.3333333333333333],}]
2 NWArray[size=3,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
3 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
4 NWArray[size=2,{NW[nodeId=5,weight=0.5],NW[nodeId=5,weight=0.5],}]
5 NWArray[size=2,{NW[nodeId=2,weight=0.3333333333333333],NW[nodeId=2,weight=0.3333333333333333],}]
I cannot understand the reason why the expected output is not coming out. Any help will be appreciated.
Thanks.
You're falling foul of object re-use
while(values.hasNext()) {
neighborList.add(values.next());
}
values.next() will return the same object reference, but the underlying contents of that object will change for each iteration (the readFields method is called to re-populate the contents)
I suggest you amend it to the following (you'll need to obtain the Configuration conf variable from a setup method, unless you can obtain it from the Reporter or OutputCollector; sorry, I don't use the old API):
while (values.hasNext()) {
neighborList.add(
ReflectionUtils.copy(conf, values.next(), new NeighborWritable()));
}
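If getting hold of a Configuration is awkward in the old API, a (hedged) alternative with the same effect is to copy the two fields by hand, since NeighborWritable only wraps a Text and a DoubleWritable:
// Manual deep copy per iteration; avoids needing a Configuration for ReflectionUtils.
while (values.hasNext()) {
    NeighborWritable nw = values.next();
    neighborList.add(new NeighborWritable(
            new Text(nw.getNodeId()),
            new DoubleWritable(nw.getWeight().get())));
}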
But I still can't understand why my unit test passed then. Here is the code -
public class UWLTInitReducerTest {
private Text key;
private Iterator<NeighborWritable> values;
private NeighborArrayWritable nodeData;
private TTReducer reducer;
/**
* Set up the states for calling the map function
*/
@Before
public void setUp() throws Exception {
key = new Text("1001");
NeighborWritable[] neighbors = new NeighborWritable[4];
for (int i = 0; i < 4; i++) {
neighbors[i] = new NeighborWritable(new Text("300" + i), new DoubleWritable((double) 1 / (1 + i)));
}
values = Arrays.asList(neighbors).iterator();
nodeData = new NeighborArrayWritable(neighbors);
reducer = new TTReducer();
}
/**
* Test method for InitModelMapper#map - valid input
*/
@Test
public void testMapValid() {
// mock the output object
OutputCollector<Text, UWLTNodeData> output = mock(OutputCollector.class);
try {
// call the API
reducer.reduce(key, values, output, null);
// in order (sequential) verification of the calls to output.collect()
verify(output).collect(key, nodeData);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
Why didn't this code catch the bug?
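Presumably because Arrays.asList(neighbors).iterator() hands the reducer four distinct objects, while Hadoop's value iterator keeps re-populating one shared instance, which is exactly the behaviour described in the answer above. A hedged sketch of a test helper that mimics that reuse follows (the class is illustrative, not part of any Hadoop test API):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.Writable;

// Illustrative test helper: like Hadoop's value iterator, it always returns the
// SAME instance and re-populates it via readFields() on every next() call.
public class ReusingIterator<T extends Writable> implements Iterator<T> {

    private final Iterator<byte[]> serialized;
    private final T reused;

    public ReusingIterator(List<T> source, T reused) throws IOException {
        // Serialize every value up front so next() can deserialize into 'reused'.
        List<byte[]> buffers = new ArrayList<byte[]>();
        for (T value : source) {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            value.write(new DataOutputStream(bos));
            buffers.add(bos.toByteArray());
        }
        this.serialized = buffers.iterator();
        this.reused = reused;
    }

    @Override
    public boolean hasNext() {
        return serialized.hasNext();
    }

    @Override
    public T next() {
        try {
            reused.readFields(new DataInputStream(new ByteArrayInputStream(serialized.next())));
            return reused; // same object every time, only its contents change
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
Wiring it in as values = new ReusingIterator<NeighborWritable>(Arrays.asList(neighbors), new NeighborWritable()); in setUp() should make the test exercise the same reuse behaviour the real job does.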
