How to normalize columns in a csv with Hadoop

I want to read a csv file and normalize the data. If I understand how Hadoop works correctly, the mapper gets the data line by line.
I found this formula to normalize: Xnew = (X - Xmin) / (Xmax - Xmin)
So I need to know the minimum and maximum values of the column in order to normalize.
How can I do that when, in a mapper, I have access to only one line at a time?

The problem with finding the max and min values of a column in this type of application is the scope of the max/min variables: in a parallel program each map instance is isolated from the others in terms of data, so what needs to be done here is to find a way to give the max/min variables a global scope, so that every instance can access them and they can be synchronized at the end of each map/reduce step.
The closest thing to this supported by Hadoop (at the time this answer was written) is the feature of counters, but counters are designed to only ever increment their values, so you have to be creative to achieve the desired output.
The trick here is to have if-statements that "set" the maximum and minimum counters to the column value of the current line (whenever that value is a new max and/or min), by:
1. resetting the counter to zero by incrementing it with the negative of its current value, and then
2. incrementing the counter by the value read from this specific line of the input csv file.
It's a bit tedious, but it does the job inside the Map function.
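In isolation, and reusing the counter enum and variable names from the full example further below, the counter "set" trick looks roughly like this (a minimal sketch, not the complete mapper):
// Counters can only be incremented, so "setting" MAX_GRADE to a new value
// means cancelling its current value first and then adding the new one.
long current_max = context.getCounter(Global_Counters.MAX_GRADE).getValue();
if (student_grade > current_max)
{
    context.getCounter(Global_Counters.MAX_GRADE).increment(-current_max);  // reset to 0
    context.getCounter(Global_Counters.MAX_GRADE).increment(student_grade); // "set" to the new max
}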
Now, for accessing the max and min counter values from the Reduce function, we can simply read them in a setup method, before any of the reduce calls are executed, and use them to compute the new normalized value of each key-value pair.
So, let's say we have a grades.csv file stored in the grades directory in HDFS, holding the grades of the students of an elementary school class like this:
Jack,3
Dennis,5
Kate,10
Nancy,9
Peter,1
Zack,2
Alex,4
Yvonne,10
Violet,1
Claire,2
We can find the max and min values at the Map stage while turning each line of the input file into key-value pairs, and compute the normalized grade for each student (using the max and min values of course) at the Reduce stage as seen below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Cluster;
import java.io.*;
import java.io.IOException;
import java.util.*;
import java.nio.charset.StandardCharsets;
public class NormGrades
{
public static enum Global_Counters
{
MAX_GRADE,
MIN_GRADE
}
/* input: <byte_offset, line_of_csv>
* output: <student, grade>
*/
public static class Map_Normalize extends Mapper<Object, Text, Text, IntWritable>
{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] columns = line.split(",");
int student_grade = Integer.parseInt(columns[1]);
int max_grade = Math.toIntExact(context.getCounter(Global_Counters.MAX_GRADE).getValue());
int min_grade = Math.toIntExact(context.getCounter(Global_Counters.MIN_GRADE).getValue());
// in order to find the maximum grade, we first set the max grade counter to 0
// by "increasing" it to the negative value of itself, and then increment by
// the new found maximum grade
if(student_grade > max_grade)
{
context.getCounter(Global_Counters.MAX_GRADE).increment(max_grade*(-1));
context.getCounter(Global_Counters.MAX_GRADE).increment(student_grade);
}
// in order to find the minimum grade, we first set the min grade counter to 0
// by "increasing" it to the negative value of itself, and then increment by
// the new found minimum grade
// the contents of this if statement will be accessed at least once in order to
// make sure that the min grade counter value is certainly higher than 0
if((student_grade < min_grade) || (min_grade == 0))
{
context.getCounter(Global_Counters.MIN_GRADE).increment(min_grade*(-1));
context.getCounter(Global_Counters.MIN_GRADE).increment(student_grade);
}
context.write(new Text(columns[0]), new IntWritable(student_grade));
}
}
/* input: <student, grade>
* output: <student, normalized_grade>
*/
public static class Reduce_Normalize extends Reducer<Text, IntWritable, Text, DoubleWritable>
{
public int max_grade, min_grade;
protected void setup(Context context) throws IOException, InterruptedException
{
Configuration conf = context.getConfiguration();
Cluster cluster = new Cluster(conf);
Job current_job = cluster.getJob(context.getJobID());
max_grade = Math.toIntExact(current_job.getCounters().findCounter(Global_Counters.MAX_GRADE).getValue());
min_grade = Math.toIntExact(current_job.getCounters().findCounter(Global_Counters.MIN_GRADE).getValue());
}
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
// each reducer instance is run for each student, so there is only one value/grade to access
int student_grade = values.iterator().next().get();
Double normalized_grade = (double) (student_grade - min_grade) / (max_grade - min_grade);
context.write(key, new DoubleWritable(normalized_grade));
}
}
public static void main(String[] args) throws Exception
{
Path input_dir = new Path("grades");
Path output_dir = new Path("normalized_grades");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output_dir))
fs.delete(output_dir, true);
Job normalize_job = Job.getInstance(conf, "Normalize Grades");
normalize_job.setJarByClass(NormGrades.class);
normalize_job.setMapperClass(Map_Normalize.class);
normalize_job.setReducerClass(Reduce_Normalize.class);
normalize_job.setMapOutputKeyClass(Text.class);
normalize_job.setMapOutputValueClass(IntWritable.class);
normalize_job.setOutputKeyClass(Text.class);
normalize_job.setOutputValueClass(DoubleWritable.class);
TextInputFormat.addInputPath(normalize_job, input_dir);
TextOutputFormat.setOutputPath(normalize_job, output_dir);
normalize_job.waitForCompletion(true);
}
}
The results are being stored as seen through the HDFS Browser in the following screenshot:

Related

Hadoop: Reducer doesn't emit correct calculation

I have the following Reducer class (part of a MapReduce job) that's supposed to compute score = POS / (-1 * sum(NEGs)),
where POS is one positive number and NEGs are 2 negative numbers. It's always this way.
For example, if the input from the mapper is:
<A, A> -15.0
<A, A> 2.0
<A, A> -15.0
The expected output would be:
<A, A> 0.06666666666666667
However, it's outputting infinity for every output record!
<A, A> Infinity
While debugging, if I add a statement to emit the values inside the while loop:
score.set(val);
context.write(key, score);
then it prints the values fine, but the division is repeated, so I get the following:
<A, A> -15.0
<A, A> 2.0
<A, A> -15.0
<A, A> 0.06666666666666667 # correct calculation (2/30)
<A, A> 0.0022222222222222222 # Not sure why it divides twice by 30 (2/30/30)!!
This is MyReducer class
private static class MyReducer extends
Reducer<Pair, DoubleWritable, Pair, DoubleWritable> {
private DoubleWritable score = new DoubleWritable();
int counter = 0;
@Override
public void reduce(Pair key, Iterable<DoubleWritable> values, Context context)
throws IOException, InterruptedException {
Iterator<DoubleWritable> iter = values.iterator();
double nor = 0.0;
double don = 0.0;
double val;
while (iter.hasNext()) {
val = iter.next().get();
if (val < 0)
don += val*-1;
else
nor = val;
//uncomment for debugging!
//score.set(val);
//context.write(key, score);
}
score.set(nor / don);
context.write(key, score);
}
}
Can anyone explain why it:
1. emits Infinity if I don't emit anything inside the while loop, and
2. divides by the denominator twice?
Thanks!
Doubles acting funny in Java is far from rare, of course, but in this particular case the culprit is not the weird ways of doubles themselves, but how they are being handled in Hadoop terms.
First and foremost, it is critical that this type of reduce computation is only used at the Reduce stage of the job and not at the Combine stage (if any). If you have set this reduce computation to also be used as a combiner, you should consider un-setting that. This is not so much a rule of thumb as a frequent source of bugs in MapReduce jobs where one can't quite figure out why the reducers receive weird data or have computations executed twice in a row (just like you have pointed out).
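As a minimal sketch (assuming a driver that wires this reducer in as a combiner, which your posted code does not show, so the mapper class name here is a placeholder), the line to look for and remove would be something like:
// Hypothetical driver wiring: if the same reducer also runs as a combiner,
// the score division can be applied once on the map side and again on the
// reduce side, which matches the "divides twice" symptom.
Job job = Job.getInstance(conf, "Score Computation");
job.setMapperClass(MyMapper.class);       // placeholder mapper class
// job.setCombinerClass(MyReducer.class); // remove this line: the division is not combiner-safe
job.setReducerClass(MyReducer.class);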
However, a possible culprit of the issue is that, in order to have safe double-type divisions, you really need to use explicit type casting to get a proper double-typed result.
To showcase this, I used example input based on your data, stored in an input directory in HDFS. Every unique key has one positive and two negative numbers as values (here the keys are plain strings for the sake of simplicity), as shown below:
A -15.0
A 2.0
A -15.0
B -10.0
B 9.0
B -12.0
C -7.0
C 1.0
C -19.0
D -5.0
D 18.0
D -5.0
E -6.0
E 6.0
E -6.0
Then explicit type casting was used for the calculation of each score, as you can see from the code below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.*;
import java.io.IOException;
import java.util.*;
import java.nio.charset.StandardCharsets;
public class ScoreComp
{
/* input: <Character, Number>
* output: <Character, Number>
*/
public static class Map extends Mapper<Object, Text, Text, DoubleWritable>
{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String record = value.toString();
String[] parts = record.split(" "); // just split the lines into key and value
// create key-value pairs from each line
context.write(new Text(parts[0]), new DoubleWritable(Double.parseDouble(parts[1])));
}
}
/* input: <Character, Number>
* output: <Character, Score>
*/
public static class Reduce extends Reducer<Text, DoubleWritable, Text, DoubleWritable>
{
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException
{
double pos = 0.0;
double neg = 0.0;
// for every value of a unique key...
for(DoubleWritable value : values)
{
// retrieve the positive number and calculate the sum of the two negative numbers
if(value.get() < 0)
neg += value.get();
else
pos = value.get();
}
// calculate the score based on the values of each key (using explicit type casting)
double result = (double) pos / (-1 * neg);
// create key-value pairs for each key with its score
context.write(key, new DoubleWritable(result));
}
}
public static void main(String[] args) throws Exception
{
// set the paths of the input and output directories in the HDFS
Path input_dir = new Path("input");
Path output_dir = new Path("scores");
// in case the output directory already exists, delete it
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output_dir))
fs.delete(output_dir, true);
// configure the MapReduce job
Job scorecomp_job = Job.getInstance(conf, "Score Computation");
scorecomp_job.setJarByClass(ScoreComp.class);
scorecomp_job.setMapperClass(Map.class);
scorecomp_job.setReducerClass(Reduce.class);
scorecomp_job.setMapOutputKeyClass(Text.class);
scorecomp_job.setMapOutputValueClass(DoubleWritable.class);
scorecomp_job.setOutputKeyClass(Text.class);
scorecomp_job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(scorecomp_job, input_dir);
FileOutputFormat.setOutputPath(scorecomp_job, output_dir);
scorecomp_job.waitForCompletion(true);
}
}
And you can see that the results of the MapReduce job in the scores directory make sense math-wise (screenshot taken through the HDFS browsing explorer):

Hadoop MapReduce Program for removing duplicate records

Could anyone help me to write the mapper and reducer for merging these two files and then removing the duplicate records?
These are the two text files:
file1.txt
2012-3-1a
2012-3-2b
2012-3-3c
2012-3-4d
2012-3-5a
2012-3-6b
2012-3-7c
2012-3-3c
and file2.txt:
2012-3-1b
2012-3-2a
2012-3-3b
2012-3-4d
2012-3-5a
2012-3-6c
2012-3-7d
2012-3-3c
A simple word count program will do the job for you. The only change you need to make is to set the output value of the Reducer to NullWritable.get().
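A minimal sketch of that idea (hypothetical class names, assuming plain text lines as input) could look like this:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
// Mapper: emit each whole line as the key so identical lines group together.
class DedupMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}
// Reducer: write each distinct line exactly once, with NullWritable as the value.
class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    public void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}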
Is there a common key in both files that helps identify whether a record matches or not? If so, then:
Mapper's input: standard TextInputFormat.
Mapper's output key: the common key; Mapper's output value: the entire record.
At the reducer: there is no need to iterate over the values, just take one instance of the value and write it (a sketch of this scenario follows this list).
If a match or duplicate can only be concluded when the complete record matches, then:
Mapper's input: standard TextInputFormat.
Mapper's output key: the entire record; Mapper's output value: NullWritable.
At the reducer: there is no need to iterate over the values. Just take one instance of the key and write it as the value.
Reducer's output key: the reducer's input key; Reducer's output value: NullWritable.
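For the first (common-key) scenario, a rough sketch might look like the following; how the common key is derived from a record depends on your data, so extractKey() below is just an illustrative placeholder:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
// Mapper: emit <common key, entire record> so duplicates group under the same key.
class KeyedDedupMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable offset, Text record, Context context)
            throws IOException, InterruptedException {
        String line = record.toString();
        String commonKey = extractKey(line); // placeholder: depends on your record format
        context.write(new Text(commonKey), record);
    }
    private String extractKey(String line) {
        // assumption for illustration only: the key is the date-like prefix of each record
        return line.length() > 8 ? line.substring(0, 8) : line;
    }
}
// Reducer: keep only one record per common key, ignoring the remaining values.
class KeyedDedupReducer extends Reducer<Text, Text, Text, NullWritable> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        context.write(values.iterator().next(), NullWritable.get());
    }
}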
Here's code to remove duplicate lines in large text data, which uses hash for efficiency:
DRMapper.java
import com.google.common.hash.Hashing;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
class DRMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text hashKey = new Text();
private Text mappedValue = new Text();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
hashKey.set(Hashing.murmur3_32().hashString(line, StandardCharsets.UTF_8).toString());
mappedValue.set(line);
context.write(hashKey, mappedValue);
}
}
DRReducer.java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class DRReducer extends Reducer<Text, Text, Text, NullWritable> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
Text value;
if (values.iterator().hasNext()) {
value = values.iterator().next();
if (!(value.toString().isEmpty())) {
context.write(value, NullWritable.get());
}
}
}
}
DuplicateRemover.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DuplicateRemover {
private static final int DEFAULT_NUM_REDUCERS = 210;
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: DuplicateRemover <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(DuplicateRemover.class);
job.setJobName("Duplicate Remover");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(DRMapper.class);
job.setReducerClass(DRReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setNumReduceTasks(DEFAULT_NUM_REDUCERS);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
compile with:
javac -encoding UTF8 -cp $(hadoop classpath) *.java
jar cf dr.jar *.class
Assuming that the input text files are in in_folder, run as:
hadoop jar dr.jar in_folder out_folder

Setting number of Reduce tasks using command line

I am a beginner in Hadoop. When trying to set the number of reducers from the command line using GenericOptionsParser, the number of reducers does not change. There is no property set in the configuration file "mapred-site.xml" for the number of reducers, and I think that would make the number of reducers 1 by default. I am using the Cloudera QuickStart VM and Hadoop version "Hadoop 2.5.0-cdh5.2.0".
Pointers appreciated. My other question is about the order of precedence among the ways to set the number of reducers:
1. Using the configuration file "mapred-site.xml": mapred.reduce.tasks
2. Specifying it in the driver class: job.setNumReduceTasks(4)
3. Specifying it at the command line using the Tool interface: -Dmapreduce.job.reduces=2
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
//Split the line into words
for(String word: line.split("\\W+"))
{
//Make sure that the word is legitimate
if(word.length() > 0)
{
//Emit the word as you see it
context.write(new Text(word), new IntWritable(1));
}
}
}
}
Reducer :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
//Initializing the word count to 0 for every key
int count=0;
for(IntWritable value: values)
{
//Adding the word count counter to count
count += value.get();
}
//Finally write the word and its count
context.write(key, new IntWritable(count));
}
}
Driver :
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
//Instantiate the job object for configuring your job
Job job = new Job();
//Specify the class that hadoop needs to look in the JAR file
//This Jar file is then sent to all the machines in the cluster
job.setJarByClass(WordCount.class);
//Set a meaningful name to the job
job.setJobName("Word Count");
//Add the path from where the file input is to be taken
FileInputFormat.addInputPath(job, new Path(args[0]));
//Set the path where the output must be stored
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//Set the Mapper and the Reducer class
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//Set the type of the key and value of Mapper and reducer
/*
* If the Mapper output type and Reducer output type are not the same then
* also include setMapOutputKeyClass() and setMapOutputValueClass()
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(4);
//Start the job and wait for it to finish, then exit the program
//based on whether the job succeeded
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}
And I have tried the following commands to run the job :
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -Dmapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take13
and
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -D mapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take14
Answering your query about the order: it will always be 2 > 3 > 1.
The option specified in your driver class takes precedence over the one you specify as an argument to your GenericOptionsParser, which in turn takes precedence over the one you specify in your site-specific config.
I would recommend debugging the configuration inside your driver class by printing it out before you submit the job. This way you can be sure what the configuration values are right before the job is submitted to the cluster:
Configuration conf = getConf(); // This is available to you since you extended Configured
for (Map.Entry<String, String> entry : conf) {
    System.out.println(entry.getKey() + "=" + entry.getValue());
}

Hadoop Not Finding Map Class

I am using hadoop-1.2.1 and trying to run a simple RowCount HBase job using ToolRunner. However, no matter what I seem to try, hadoop cannot find the map class. The jar file is being copied correctly into hdfs, but I can't seem to figure out where it is going wrong. Please help!
Here is the code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HBaseRowCountToolRunnerTest extends Configured implements Tool
{
// What to copy.
public static final String JAR_NAME = "myJar.jar";
public static final String LOCAL_JAR = <path_to_jar> + JAR_NAME;
public static final String REMOTE_JAR = "/tmp/"+JAR_NAME;
public static void main(String[] args) throws Exception
{
Configuration config = HBaseConfiguration.create();
//All connection configs set here -- omitted to post the code
config.set("tmpjars", REMOTE_JAR);
FileSystem dfs = FileSystem.get(config);
System.out.println("pathString = " + (new Path(LOCAL_JAR)).toString() + " \n");
// Copy jar file to remote.
dfs.copyFromLocalFile(new Path(LOCAL_JAR), new Path(REMOTE_JAR));
// Get rid of jar file when we're done.
dfs.deleteOnExit(new Path(REMOTE_JAR));
// Run the job.
System.exit(ToolRunner.run(config, new HBaseRowCountToolRunnerTest(), args));
}
@Override
public int run(String[] args) throws Exception
{
Job job = new RowCountJob(getConf(), "testJob", "myLittleHBaseTable");
return job.waitForCompletion(true) ? 0 : 1;
}
public static class RowCountJob extends Job
{
RowCountJob(Configuration conf, String jobName, String tableName) throws IOException
{
super(conf, RowCountJob.class.getCanonicalName() + "_" + jobName);
setJarByClass(getClass());
Scan scan = new Scan();
scan.setCacheBlocks(false);
scan.setFilter(new FirstKeyOnlyFilter());
setOutputFormatClass(NullOutputFormat.class);
TableMapReduceUtil.initTableMapperJob(tableName, scan,
RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, this);
setNumReduceTasks(0);
}
}//end public static class RowCountJob extends Job
//Mapper that runs the count
//TableMapper -- TableMapper<KEYOUT, VALUEOUT> (*OUT by type)
public static class RowCounterMapper extends TableMapper<ImmutableBytesWritable, Result>
{
//Counter enumeration to count the actual rows
public static enum Counters {ROWS}
/**
* Maps the data.
*
* @param row The current table row key.
* @param values The columns.
* @param context The current context.
* @throws IOException When something is broken with the data.
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN,
* org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException
{
// Count every row containing data times 2, whether it's in qualifiers or values
context.getCounter(Counters.ROWS).increment(2);
}
}//end public static class RowCounterMapper extends TableMapper<ImmutableBytesWritable, Result>
}//end public class HBaseRowCountToolRunnerTest
Ok, I found a workaround to the problem and thought I would share it for all others having similar issues...
As it turns out, I abandoned the tmpjars configuration option and just copied the jar file directly into the DistributedCache from the code itself. Here is what it looks like:
// Copy jar file to remote.
FileSystem dfs = FileSystem.get(conf);
dfs.copyFromLocalFile(new Path(LOCAL_JAR), new Path(REMOTE_JAR));
// Get rid of jar file when we're done.
dfs.deleteOnExit(new Path(REMOTE_JAR));
//Place it in the distributed cache
DistributedCache.addFileToClassPath(new Path(REMOTE_JAR), conf, dfs);
Perhaps it doesn't solve what is going on with tmpjars, but it does work.
I got the same problem today. Finally, I found it was because I forgot to insert the following statement in the driver class...
job.setJarByClass(HBaseTestDriver.class);

Mahout Datamodel with duplicate user,item enteries but different preference values

I was wondering how the distributed mahout recommender job org.apache.mahout.cf.taste.hadoop.item.RecommenderJob handled csv files where duplicate and triplicate user,item entries exist but with different preference values. For example, if I had a .csv file that had entries like
1,1,0.7
1,2,0.7
1,2,0.3
1,3,0.7
1,3,-0.7
How would Mahout's datamodel handle this? Would it sum up the preference values for a given user,item entry (e.g. for user item 1,2 the preference would be (0.7 + 0.3)), or does it average the values (e.g. for user item 1,2 the preference is (0.7 + 0.3)/2) or does it default to the last user,item entry it detects (e.g. for user 1,2 the preference value is set to 0.3).
I ask this question because I am considering recommendations based on multiple preference metrics (item views, likes, dislikes, saves to shopping cart, etc.). It would be helpful if the datamodel treated the preference values as linear weights (e.g. item views plus save to wish list would give a higher preference score than item views alone). If the datamodel already handles this by summing, it would save me the chore of an additional map-reduce job to sort and calculate total scores based on multiple metrics. Any clarification anyone could provide on how the Mahout csv datamodel works in this respect for org.apache.mahout.cf.taste.hadoop.item.RecommenderJob would be really appreciated. Thanks.
No, it overwrites. The model is not additive. However, the model in Myrrix, a derivative of this code (which I'm commercializing), has a fundamentally additive data model, for just the reason you give. The input values are weights and are always added.
Merge it before starting the computation. For example:
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public final class Merge {
public Merge() {
}
public static class MergeMapper extends MapReduceBase implements
Mapper<LongWritable, Text, Text, FloatWritable> {
public void map(LongWritable key, Text value, OutputCollector<Text, FloatWritable> collector,
Reporter reporter) throws IOException {
// split each line into userId, itemId and score
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
if (tokenizer.hasMoreTokens()) {
String userId = tokenizer.nextToken(",");
String itemId = tokenizer.nextToken(",");
FloatWritable score = new FloatWritable(Float.valueOf(tokenizer.nextToken(",")));
collector.collect(new Text(userId + "," + itemId), score);
}
else {
System.out.println("empty line " + line);
}
}
}
public static class MergeReducer extends MapReduceBase implements
Reducer<Text, FloatWritable, Text, FloatWritable> {
public void reduce(Text key, Iterator<FloatWritable> scores,
OutputCollector<Text, FloatWritable> collector, Reporter reporter) throws IOException {
// sum all scores for the same (userId, itemId) key
float sum = 0.0f;
while (scores.hasNext()) {
sum += scores.next().get();
}
if (sum != 0.0)
collector.collect(key, new FloatWritable(sum));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
JobConf conf = new JobConf(Merge.class);
conf.setJobName("Merge Data");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(FloatWritable.class);
conf.setMapperClass(MergeMapper.class);
// combine the same key items
conf.setCombinerClass(MergeReducer.class);
conf.setReducerClass(MergeReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.set("mapred.textoutputformat.separator", ",");
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path("hdfs://localhost:49000/tmp/data"));
FileOutputFormat.setOutputPath(conf, new Path("hdfs://localhost:49000/tmp/data/output"));
JobClient.runJob(conf);
}
}
