Hadoop: Reducer doesn't emit correct calculation - hadoop

I have the following Reducer class (part of a MapReduce job) that's supposed to compute score = POS / (-1 * sum(NEGs)), where POS is one positive number and NEGs are two negative numbers. It's always this way.
For example, if the input from the mapper is:
<A, A> -15.0
<A, A> 2.0
<A, A> -15.0
The expected output would be:
<A, A> 0.06666666666666667
However, it's outputting infinity for every output record!
<A, A> Infinity
While debugging, if I add statements to emit the values inside the while loop:
score.set(val);
context.write(key, score);
it prints the values fine, but it repeats the division. So I get the following:
<A, A> -15.0
<A, A> 2.0
<A, A> -15.0
<A, A> 0.06666666666666667 # correct calculation (2/30)
<A, A> 0.0022222222222222222 # not sure why it divides by 30 twice (2/30/30)!!
This is the MyReducer class:
private static class MyReducer extends
        Reducer<Pair, DoubleWritable, Pair, DoubleWritable> {

    private DoubleWritable score = new DoubleWritable();
    int counter = 0;

    @Override
    public void reduce(Pair key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        Iterator<DoubleWritable> iter = values.iterator();
        double nor = 0.0;
        double don = 0.0;
        double val;
        while (iter.hasNext()) {
            val = iter.next().get();
            if (val < 0)
                don += val * -1;
            else
                nor = val;
            //uncomment for debugging!
            //score.set(val);
            //context.write(key, score);
        }
        score.set(nor / don);
        context.write(key, score);
    }
}
Can anyone explain why it:
1. emits Infinity if I don't emit anything inside the while loop, and
2. divides by the denominator twice?
Thanks!

Doubles acting funny in Java is far from rare, of course, but in this particular case it's not the weird ways of doubles themselves so much as how they are handled in Hadoop terms.
First and foremost, this type of reduce computation must only run at the Reduce stage of the job and not at the Combine stage (if any). If you have also set this reduce computation as the job's combiner, consider removing that setting. This is not just a rule of thumb: a lot of MapReduce bugs where one can't quite figure out why the reducers get weird data, or why a computation is executed twice in a row (just like you have pointed out), come down to exactly this.
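For instance, if the driver contains a line like the hypothetical one below, the reduce logic runs once as a combiner over the map output and then again as the reducer, which would match the double division you are seeing (this is only a sketch of what to look for, with placeholder class names, not your actual driver):
// hypothetical driver snippet: the setCombinerClass line is the one to remove
Job job = Job.getInstance(conf, "score job");
job.setMapperClass(MyMapper.class);          // MyMapper is a placeholder name
// job.setCombinerClass(MyReducer.class);    // reusing the reducer as a combiner divides twice
job.setReducerClass(MyReducer.class);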
Beyond that, a possible culprit is the division itself: to get a safe double-typed division you really need explicit type casting so that the result is computed as a proper double.
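As a standalone illustration of the difference (plain Java, independent of Hadoop):
double bad = 2 / 30;            // integer division happens first, so this is 0.0
double good = (double) 2 / 30;  // the cast forces a double division: 0.06666666666666667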
To showcase this, I used an example input based on your input data, stored in an input directory. Every unique key has one positive and two negative numbers as values (here the keys are typed as String for the sake of simplicity), as shown below:
A -15.0
A 2.0
A -15.0
B -10.0
B 9.0
B -12.0
C -7.0
C 1.0
C -19.0
D -5.0
D 18.0
D -5.0
E -6.0
E 6.0
E -6.0
Then explicit type casting was used for the calculation of each score, as you can see from the code below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.*;
import java.io.IOException;
import java.util.*;
import java.nio.charset.StandardCharsets;
public class ScoreComp
{
/* input: <Character, Number>
* output: <Character, Number>
*/
public static class Map extends Mapper<Object, Text, Text, DoubleWritable>
{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String record = value.toString();
String[] parts = record.split(" "); // just split the lines into key and value
// create key-value pairs from each line
context.write(new Text(parts[0]), new DoubleWritable(Double.parseDouble(parts[1])));
}
}
/* input: <Character, Number>
* output: <Character, Score>
*/
public static class Reduce extends Reducer<Text, DoubleWritable, Text, DoubleWritable>
{
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException
{
double pos = 0.0;
double neg = 0.0;
// for every value of a unique key...
for(DoubleWritable value : values)
{
// retrieve the positive number and calculate the sum of the two negative numbers
if(value.get() < 0)
neg += value.get();
else
pos = value.get();
}
// calculate the score based on the values of each key (using explicit type casting)
double result = (double) pos / (-1 * neg);
// create key-value pairs for each key with its score
context.write(key, new DoubleWritable(result));
}
}
public static void main(String[] args) throws Exception
{
// set the paths of the input and output directories in the HDFS
Path input_dir = new Path("input");
Path output_dir = new Path("scores");
// in case the output directory already exists, delete it
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output_dir))
fs.delete(output_dir, true);
// configure the MapReduce job
Job scorecomp_job = Job.getInstance(conf, "Score Computation");
scorecomp_job.setJarByClass(ScoreComp.class);
scorecomp_job.setMapperClass(Map.class);
scorecomp_job.setReducerClass(Reduce.class);
scorecomp_job.setMapOutputKeyClass(Text.class);
scorecomp_job.setMapOutputValueClass(DoubleWritable.class);
scorecomp_job.setOutputKeyClass(Text.class);
scorecomp_job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(scorecomp_job, input_dir);
FileOutputFormat.setOutputPath(scorecomp_job, output_dir);
scorecomp_job.waitForCompletion(true);
}
}
And you can see that the results from the MapReduce job in the /scores directory make sense math-wise (screenshot taken through the HDFS browsing explorer):
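For reference, with the sample input above the job should produce the following scores (each positive value divided by the absolute sum of its two negative values):
A ≈ 0.0667 (2 / 30)
B ≈ 0.4091 (9 / 22)
C ≈ 0.0385 (1 / 26)
D = 1.8 (18 / 10)
E = 0.5 (6 / 12)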

Related

How to normalize columns in csv with hadoop

I want to read a CSV file and normalize the data. If I understand how Hadoop works correctly, the mapper gets the data line by line.
I found this formula to normalize: Xnew = (X - Xmin) / (Xmax - Xmin)
So I need to know the minimum and maximum values of the column in order to normalize.
How can I do that when, in a mapper, I have access to only one line at a time?
The problem with finding the max and min values of a column in this type of application is the scope of the max/min variables: in a parallel program each instance is isolated from the others in terms of data, so the variables cannot simply be shared and modified from everywhere. What needs to be done here is to find a way to give the max/min variables a global scope, so that their values can be accessed and synchronized at the end of each map/reduce step.
The closest thing to this that Hadoop supports (at the time this answer was written) is the feature of counters, but counters are designed to only ever be incremented, so you have to be creative to achieve the desired output.
The trick here is to have if-statements that adjust the maximum and minimum counters to the column value of each line (whenever that value is a new max and/or min), by
resetting the counter to zero (by adding the negative of its current value), and then
incrementing the counter by the value from that specific line of the input CSV file.
It's a bit tedious, but it does the job inside the Map function.
Now, to access the max and min counter values from the Reduce stage, we can simply read them in a setup method that runs before each reducer instance executes, and use them to compute the normalized value of each key-value pair.
So, let's say we have a grades.csv file stored in the grades directory in HDFS, in which the grades of the students of an elementary school class are stored like this:
Jack,3
Dennis,5
Kate,10
Nancy,9
Peter,1
Zack,2
Alex,4
Yvonne,10
Violet,1
Claire,2
We can find the max and min values at the Map stage while turning each line of the input file into key-value pairs, and compute the normalized grade for each student (using the max and min values of course) at the Reduce stage as seen below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Cluster;
import java.io.*;
import java.io.IOException;
import java.util.*;
import java.nio.charset.StandardCharsets;
public class NormGrades
{
public static enum Global_Counters
{
MAX_GRADE,
MIN_GRADE
}
/* input: <byte_offset, line_of_input>
* output: <student, grade>
*/
public static class Map_Normalize extends Mapper<Object, Text, Text, IntWritable>
{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] columns = line.split(",");
int student_grade = Integer.parseInt(columns[1]);
int max_grade = Math.toIntExact(context.getCounter(Global_Counters.MAX_GRADE).getValue());
int min_grade = Math.toIntExact(context.getCounter(Global_Counters.MIN_GRADE).getValue());
// in order to find the maximum grade, we first set the max grade counter to 0
// by "increasing" it to the negative value of itself, and then increment by
// the new found maximum grade
if(student_grade > max_grade)
{
context.getCounter(Global_Counters.MAX_GRADE).increment(max_grade*(-1));
context.getCounter(Global_Counters.MAX_GRADE).increment(student_grade);
}
// in order to find the minimum grade, we first set the min grade counter to 0
// by "increasing" it to the negative value of itself, and then increment by
// the new found minimum grade
// this if statement will be entered at least once (while the counter is still 0) in order to
// make sure that the min grade counter value certainly ends up greater than 0
if((student_grade < min_grade) || (min_grade == 0))
{
context.getCounter(Global_Counters.MIN_GRADE).increment(min_grade*(-1));
context.getCounter(Global_Counters.MIN_GRADE).increment(student_grade);
}
context.write(new Text(columns[0]), new IntWritable(student_grade));
}
}
/* input: <student, grade>
* output: <student, normalized_grade>
*/
public static class Reduce_Normalize extends Reducer<Text, IntWritable, Text, DoubleWritable>
{
public int max_grade, min_grade;
protected void setup(Context context) throws IOException, InterruptedException
{
Configuration conf = context.getConfiguration();
Cluster cluster = new Cluster(conf);
Job current_job = cluster.getJob(context.getJobID());
max_grade = Math.toIntExact(current_job.getCounters().findCounter(Global_Counters.MAX_GRADE).getValue());
min_grade = Math.toIntExact(current_job.getCounters().findCounter(Global_Counters.MIN_GRADE).getValue());
}
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
// each reducer instance is run for each student, so there is only one value/grade to access
int student_grade = values.iterator().next().get();
Double normalized_grade = (double) (student_grade - min_grade) / (max_grade - min_grade);
context.write(key, new DoubleWritable(normalized_grade));
}
}
public static void main(String[] args) throws Exception
{
Path input_dir = new Path("grades");
Path output_dir = new Path("normalized_grades");
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output_dir))
fs.delete(output_dir, true);
Job normalize_job = Job.getInstance(conf, "Normalize Grades");
normalize_job.setJarByClass(NormGrades.class);
normalize_job.setMapperClass(Map_Normalize.class);
normalize_job.setReducerClass(Reduce_Normalize.class);
normalize_job.setMapOutputKeyClass(Text.class);
normalize_job.setMapOutputValueClass(IntWritable.class);
normalize_job.setOutputKeyClass(Text.class);
normalize_job.setOutputValueClass(DoubleWritable.class);
TextInputFormat.addInputPath(normalize_job, input_dir);
TextOutputFormat.setOutputPath(normalize_job, output_dir);
normalize_job.waitForCompletion(true);
}
}
The results are stored as seen through the HDFS Browser in the following screenshot:
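For reference, with the sample grades above (min = 1, max = 10, and assuming a single mapper so the counters see every record), the normalized grades work out to (X - 1) / 9:
Alex ≈ 0.3333
Claire ≈ 0.1111
Dennis ≈ 0.4444
Jack ≈ 0.2222
Kate = 1.0
Nancy ≈ 0.8889
Peter = 0.0
Violet = 0.0
Yvonne = 1.0
Zack ≈ 0.1111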

Passing a bag as an input for UDF in PIG

I'm trying to pass a databag (final) as an input.
dump final;
gives:
(4,john,john,David,Banking ,4,M,20-01-1994,78.65,345000,Arkansasdest1,Destination)
(4,john,john,David,Banking ,4,M,20-01-1994,78.65,345000,Arkanssdest2,Destination)
(4,johns,johns,David,Banking ,4,M,20-01-1994,78.65,345000,ArkansasSrc1,source)
(4,johns,johns,David,Banking ,4,M,20-01-1994,78.65,345000,ArkansaSrc2,source)
I'm about to write a UDF for processing the above databag and finding mismatches between Source and Destination. In order to do that, I have to check whether my UDF accepts a databag or not, so I wrote the sample UDF below:
package PigUDFpck;
import java.io.IOException;
import java.util.Iterator;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
public class databag extends EvalFunc<DataBag> {
TupleFactory mTupleFactory = TupleFactory.getInstance();
BagFactory mBagFactory = BagFactory.getInstance();
public DataBag exec(Tuple input) throws IOException { // different return type
DataBag result = mBagFactory.newDefaultBag(); // change here
DataBag values = (DataBag)input.get(0);
for (Iterator<Tuple> iterator = values.iterator(); iterator.hasNext();) {
Tuple tuple = iterator.next();
//logic
Tuple t = mTupleFactory.getInstance().newTuple();
t.append(tuple);
result.add(t);
}
return result; // change here
}
}
After that I registered the path using
REGISTER /usr/local/pig/UDF/UDFBAG.jar;
DEFINE Databag Databag(); // not sure how to define it
2017-02-16 19:07:05,875 [main] WARN org.apache.pig.newplan.BaseOperatorPlan - Encountered Warning IMPLICIT_CAST_TO_INT 2 time(s). //got this warning after defining.
final1 = FOREACH final GENERATE(Databag(final));
ERROR 1200: Pig script failed to parse:
Invalid scalar projection: final : A column needs to be projected from a relation for it to be used as a scalar
Please help me with defining the UDF and with how to pass a DataBag to the UDF.
Thanks
Try
final1 = FOREACH final GENERATE(Databag(*));
Though as far as I can see, your final contains tuples, not bags of tuples, so you'll probably need to group it by some key first. In that case it will be something like
final1 = FOREACH (group final [by key or all]) GENERATE(Databag(final));

Setting number of Reduce tasks using command line

I am a beginner in Hadoop. When trying to set the number of reducers from the command line using GenericOptionsParser, the number of reducers does not change. There is no property set in the configuration file "mapred-site.xml" for the number of reducers, and I think that makes the number of reducers 1 by default. I am using the Cloudera QuickStart VM and Hadoop version "Hadoop 2.5.0-cdh5.2.0".
Pointers appreciated. I would also like to know the order of precedence among the ways to set the number of reducers:
1. Using the configuration file "mapred-site.xml": mapred.reduce.tasks
2. By specifying it in the driver class: job.setNumReduceTasks(4)
3. By specifying it at the command line using the Tool interface: -Dmapreduce.job.reduces=2
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
//Split the line into words
for(String word: line.split("\\W+"))
{
//Make sure that the word is legitimate
if(word.length() > 0)
{
//Emit the word as you see it
context.write(new Text(word), new IntWritable(1));
}
}
}
}
Reducer :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
//Initializing the word count to 0 for every key
int count=0;
for(IntWritable value: values)
{
//Adding the word count counter to count
count += value.get();
}
//Finally write the word and its count
context.write(key, new IntWritable(count));
}
}
Driver :
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
//Instantiate the job object for configuring your job
Job job = new Job();
//Specify the class that hadoop needs to look in the JAR file
//This Jar file is then sent to all the machines in the cluster
job.setJarByClass(WordCount.class);
//Set a meaningful name to the job
job.setJobName("Word Count");
//Add the path from where the file input is to be taken
FileInputFormat.addInputPath(job, new Path(args[0]));
//Set the path where the output must be stored
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//Set the Mapper and the Reducer class
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//Set the type of the key and value of Mapper and reducer
/*
* If the Mapper output type and Reducer output type are not the same then
* also include setMapOutputKeyClass() and setMapOutputValueClass()
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(4);
//Start the job and wait for it to finish. And exit the program based on
//the success of the program
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}
And I have tried the following commands to run the job :
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -Dmapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take13
and
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -D mapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take14
Answering your query on the order: it is always 2 > 3 > 1.
The option specified in your driver class takes precedence over the one you pass as an argument through GenericOptionsParser, which in turn takes precedence over the one in your site-specific config.
I would recommend debugging the configuration inside your driver class by printing it out before you submit the job. This way, you can be sure what the configuration is right before the job goes to the cluster.
Configuration conf = getConf(); // This is available to you since you extended Configured
for (Map.Entry<String, String> entry : conf) {
    // print every configuration entry that will be submitted with the job
    System.out.println(entry.getKey() + "=" + entry.getValue());
}

K-means on hadoop compile error...

I've downloaded an open-source k-means implementation (in Hadoop MapReduce), but it has compile errors.
---------------------SOURCE----------------------------
/*
* Copyright 2012
* Parallel and Distributed Systems Group (PVS)
* Institute of Computer Science (IFI)
* Heidelberg University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package examples;
import algorithms.kmeans.Cluster;
import algorithms.kmeans.Clusters;
import algorithms.kmeans.SamplesCache;
import org.apache.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.DenseVectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
public class KMeansHadoop {
private final static Logger LOG = LoggerFactory.getLogger(KMeansHadoop.class);
public static class KMeansMapper extends
MRMapper<LongWritable, Text, IntWritable, Clusters, Clusters> {
private SamplesCache cache = new SamplesCache(500);
private int cacheSize = 10000;
private Clusters clusters = null;
private int k = 0;
private int nextCentroidToInit = 0;
/**
* Configures the mapper by reading two configuration options:
* - "numClusters": the k in k-Means
* - "numAuxClusters": the number of in-memory auxiliary clusters representing the input data
*
* @param context the mapper context, used to access the configuration
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
this.k = conf.getInt("numCluster", 5);
this.clusters = new Clusters(k);
this.cacheSize = conf.getInt("numAuxCluster", 500);
this.cache = new SamplesCache(cacheSize);
}
/**
* Maps the input lines to initial centroids and, as a side-effect, stores auxiliary clusters representing the
* input data in memory
*
* @param key the key provided by the input format, not used here
* @param value one line of the input; input format: one data point per line, vector components delimited by spaces
* @param context the mapper context used to send initial centroids to the reducer
* @throws IOException
* @throws InterruptedException
*/
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Input format: one data point per line, components delimited by spaces
final List<Double> doubleValues = new ArrayList<Double>();
final StringTokenizer tk = new StringTokenizer(value.toString());
while(tk.hasMoreElements()) {
final String token = tk.nextToken();
doubleValues.add(Double.parseDouble(token));
}
double[] dv = new double[doubleValues.size()];
for(int i=0; i<doubleValues.size(); i++) {
dv[i] = doubleValues.get(i);
}
DenseVector dvec = new DenseVector(dv);
DenseVectorWritable sample = new DenseVectorWritable(dvec);
// add sample to local auxiliary clusters
this.cache.addSample(sample);
// first k points are chosen as initial centroids
if (nextCentroidToInit < k) {
this.clusters.set(nextCentroidToInit, new Cluster(sample, sample));
this.nextCentroidToInit += 1;
} else if (nextCentroidToInit == k) {
// send initial centroids to reducer
context.write(new IntWritable(0), this.clusters);
this.nextCentroidToInit += 1;
}
}
/**
* Remaps the input data when a new set of preliminary clusters is received from the reducer by recalculating
* the assignment of the local input data, as represented by the auxiliary clusters, to the preliminary clusters
* and sends the updated centroids to the reducer.
* @param cs the preliminary clusters computed by the reducer
* @param context the mapper context used to send the locally recomputed centroids to the reducer
* @throws IOException
* @throws InterruptedException
*/
public void remap(List<Clusters> cs, Context context) throws IOException, InterruptedException {
LOG.info("Remapping preliminary clusters");
// set the preliminary clusters as new clusters
this.clusters = cs.get(0).clone();
this.clusters.reset();
// reassign the local input data, represented by the auxiliary clusters, to the clusters, thereby readjusting
// the clusters centroids
this.cache.reAssignAll(clusters);
// send the locally updated clusters to the reducer
context.write(new IntWritable(0), this.clusters);
}
}
public static class KMeansReducer extends
MRReducer<IntWritable, Clusters, IntWritable, Clusters, Clusters> {
private double lastError = Double.MAX_VALUE;
private float epsilon = Float.MAX_VALUE;
/**
* Configures the mapper by reading the configuration option "epsilon": The minimum change of the MSE needed to
* trigger a new iteration.
*
* @param context the reducer context, used to access the configuration
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
epsilon = conf.getFloat("epsilon", 100f);
}
/**
* Reduces a list of clusters locally computed by the mappers into a preliminary global set of clusters, which
* is then restreamed to the mappers, or, iff the MSE of the global set of clusters has not changed by more than
* epsilon since the last reduce invocation, ends the iteration by emitting the final set of clusters.
*
* @param key the key set by the mapper, not used here
* @param values the list of locally computed clusters computed by the mappers
* @param context the reducer context, used to restream preliminary clusters to the mappers and emit the final
* clusters
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void reduce(IntWritable key, Iterable<Clusters> values,
MRReduceContext<IntWritable, Clusters, IntWritable, Clusters, Clusters> context) throws IOException, InterruptedException {
// Merge the list of clusters into one set of clusters
Clusters results = null;
for(Clusters clusters : values) {
if( results == null ) {
results = clusters;
} else {
results.merge(clusters);
}
}
Double error = results.getMSE();
LOG.info("Last error " + lastError + ", current error " + error);
if (lastError < Double.MAX_VALUE &&
error <= lastError + epsilon &&
error >= lastError - epsilon) {
// MSE has changed by less than epsilon: Emit final result
context.write(new IntWritable(0), results);
LOG.info("Final result written.");
} else {
// MSE has changed by more than epsilon: Send recomputed preliminary clusters to mappers to start a new
// iteration
this.lastError = error;
results.computeNewCentroids();
context.restream(results);
LOG.info("Preliminary result restreamed.");
}
}
}
/**
* Executes the streaming Hadoop MapReduce program
* @param args first arg is input path, second arg is output path
* @throws Exception
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.setBoolean("mrstreamer.hadoop.streaming", true);
// has to be 1 to ensure the algorithm producing valid results
conf.setInt(JobContext.NUM_REDUCES, 1);
conf.setInt(JobContext.NUM_MAPS, 4);
conf.set("numCluster", "5");
conf.set("numAuxCluster", "500");
Job job = new MRSJob(conf, "kmeanshadoop");
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Clusters.class);
job.setMapperClass(KMeansMapper.class);
job.setReducerClass(KMeansReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
---------------------------ERROR------------------------------
Exception in thread "main" java.lang.Error: Unresolved compilation problems:
NUM_REDUCES cannot be resolved or is not a field
NUM_MAPS cannot be resolved or is not a field
at examples.KMeansHadoop.main(KMeansHadoop.java:222)
Probably, you are not using the same version of Hadoop as the authors of this code. The relevant class is pulled in by this line:
import org.apache.hadoop.mapred.JobContext;
Update to Hadoop version 2.2.0 (or later) if you want to use these settings.
Otherwise, instead of these two calls, you can use the following on the old API:
conf.setNumReduceTasks(1);
conf.setNumMapTasks(4); //but this is only a suggestion to hadoop
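If you do move to Hadoop 2.x, a minimal sketch of the equivalent settings with the new API (assuming a plain Job rather than the MRSJob wrapper used above) would be:
// new-API sketch: requires org.apache.hadoop.mapreduce.Job and org.apache.hadoop.mapreduce.MRJobConfig
Job job = Job.getInstance(conf, "kmeanshadoop");
job.setNumReduceTasks(1);                                // has to be 1 for this algorithm to produce valid results
job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, 4); // "mapreduce.job.maps" -- only a hint to Hadoop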

Mahout Datamodel with duplicate user,item enteries but different preference values

I was wondering how the distributed mahout recommender job org.apache.mahout.cf.taste.hadoop.item.RecommenderJob handled csv files where duplicate and triplicate user,item entries exist but with different preference values. For example, if I had a .csv file that had entries like
1,1,0.7
1,2,0.7
1,2,0.3
1,3,0.7
1,3,-0.7
How would Mahout's datamodel handle this? Would it sum up the preference values for a given user,item entry (e.g. for user item 1,2 the preference would be (0.7 + 0.3)), or does it average the values (e.g. for user item 1,2 the preference is (0.7 + 0.3)/2) or does it default to the last user,item entry it detects (e.g. for user 1,2 the preference value is set to 0.3).
I ask this question because I am considering recommendations based on multiple preference metrics (item views, likes, dislikes, saves to shopping cart, etc.). It would be helpful if the datamodel treated the preference values as linear weights (e.g. item views plus a save to the wish list gets a higher preference score than item views alone). If the datamodel already handles this by summing, it would save me the chore of an additional map-reduce to sort and calculate total scores based on multiple metrics. Any clarification anyone could provide on how the Mahout CSV datamodel works in this respect for org.apache.mahout.cf.taste.hadoop.item.RecommenderJob would be really appreciated. Thanks.
No, it overwrites. The model is not additive. However, the model in Myrrix, a derivative of this code (that I'm commercializing), has a fundamentally additive data model, just for the reason you give. The input values are weights and are always added.
Merge it before starting the computation. For example:
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public final class Merge {
public Merge() {
}
public static class MergeMapper extends MapReduceBase implements
Mapper<LongWritable, Text, Text, FloatWritable> {
public void map(LongWritable key, Text value, OutputCollector<Text, FloatWritable> collector,
Reporter reporter) throws IOException {
// TODO Auto-generated method stub
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
if (tokenizer.hasMoreTokens()) {
String userId = tokenizer.nextToken(",");
String itemId = tokenizer.nextToken(",");
FloatWritable score = new FloatWritable(Float.valueOf(tokenizer.nextToken(",")));
collector.collect(new Text(userId + "," + itemId), score);
}
else {
System.out.println("empty line " + line);
}
}
}
public static class MergeReducer extends MapReduceBase implements
Reducer<Text, FloatWritable, Text, FloatWritable> {
public void reduce(Text key, Iterator<FloatWritable> scores,
OutputCollector<Text, FloatWritable> collector, Reporter reporter) throws IOException {
// TODO Auto-generated method stub
float sum = 0.0f;
while (scores.hasNext()) {
sum += scores.next().get();
}
if (sum != 0.0)
collector.collect(key, new FloatWritable(sum));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
JobConf conf = new JobConf(Merge.class);
conf.setJobName("Merge Data");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(FloatWritable.class);
conf.setMapperClass(MergeMapper.class);
// combine the same key items
conf.setCombinerClass(MergeReducer.class);
conf.setReducerClass(MergeReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.set("mapred.textoutputformat.separator", ",");
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path("hdfs://localhost:49000/tmp/data"));
FileOutputFormat.setOutputPath(conf, new Path("hdfs://localhost:49000/tmp/data/output"));
JobClient.runJob(conf);
}
}
