Kindly point me in a direction to get my desired output.
Current output:
Albania 3607 ++ Country minPopulation
Albania 418495 ++ Country maxPopulation
Desired output:
country city minPopulation
country city maxPopulation
Reducer Class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Handson3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
minValue = Math.min(minValue, value.get());
}
context.write(key, new IntWritable(minValue));
context.write(key, new IntWritable(maxValue));
}
}
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class handson3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length()-1);
String newString = country.concat(field[0].substring(1, field[0].length()-1));
String population = field[9].substring(1, field[9].length()-1);
String city = field[0].substring(1, field[0].length()-1);
if (!population.matches(".*\\d.*") || population.equals("")||
population.matches("([0-9].*)\\.([0-9].*)") ){
return;
}else{
populationVal = Integer.parseInt(population);
context.write(new Text(country),new IntWritable(populationVal));
}
}
}
Runner Class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class handsonJobRunner {
public int run(String[] args) throws Exception {
if(args.length !=2) {
System.err.println("Usage: Handson3 <input path> <outputpath>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(handsonJobRunner.class);
job.setJobName("Handson 3");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(handson3Mapper.class);
job.setReducerClass(Handson3Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
handsonJobRunner driver = new handsonJobRunner();
driver.run(args);
}
}
Thank you in advance, any pointers would be much appreciated.
You should send both the city and the population to the reducer as the value, and in the reducer select the city with the maximum and minimum population for each country.
Your mapper would be like this:
public class Handson3Mapper extends Mapper<LongWritable, Text, Text, Text> {
private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length() - 1);
String newString = country.concat(field[0].substring(1, field[0].length() - 1));
String population = field[9].substring(1, field[9].length() - 1);
String city = field[0].substring(1, field[0].length() - 1);
if (!population.matches(".*\\d.*") || population.equals("") ||
population.matches("([0-9].*)\\.([0-9].*)")) {
return;
} else {
populationVal = Integer.parseInt(population);
context.write(new Text(country), new Text(city + "-" + populationVal));
}
}
}
And your reducer should change to this one:
public class Handson3Reducer extends Reducer<Text, Text, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String maxPopulationCityName = "";
String minPopulationCityName = "";
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (Text value : values) {
String[] array = value.toString().split("-");
int population = Integer.valueOf(array[1]);
if (population > maxValue) {
maxPopulationCityName = array[0];
maxValue = population;
}
if (population < minValue) {
minPopulationCityName = array[0];
minValue = population;
}
}
context.write(new Text(key + " " + minPopulationCityName), new IntWritable(minValue));
context.write(new Text(key + " " + maxPopulationCityName), new IntWritable(maxValue));
}
}
I am trying to create a MapReduce program to perform the k-means algorithm. I know using MapReduce isn't the best way to do iterative algorithms.
I have created the mapper and reducer classes.
In the mapper code I read an input file. When a MapReduce pass has completed, I want the results to be stored in the same input file. How do I make the output overwrite the file the mapper read as input?
Also, how do I make the MapReduce iterate until the values from the old input file and the new input file converge, i.e. the difference between the values is less than 0.1?
My code is:
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.ArrayList;
public class kmeansMapper extends Mapper<Object, Text, DoubleWritable,
DoubleWritable> {
private final static String centroidFile = "centroid.txt";
private List<Double> centers = new ArrayList<Double>();
public void setup(Context context) throws IOException{
BufferedReader br = new BufferedReader(new
FileReader(centroidFile));
String contentLine;
while((contentLine = br.readLine())!=null){
centers.add(Double.parseDouble(contentLine));
}
}
public void map(Object key, Text input, Context context) throws IOException,
InterruptedException {
String[] fields = input.toString().split(" ");
Double rating = Double.parseDouble(fields[2]);
Double distance = centers.get(0) - rating;
int position = 0;
for(int i=1; i<centers.size(); i++){
Double cDistance = Math.abs(centers.get(i) - rating);
if(cDistance< distance){
position = i;
distance = cDistance;
}
}
Double closestCenter = centers.get(position);
context.write(new DoubleWritable(closestCenter),new
DoubleWritable(rating)); //outputs closestcenter and rating value
}
}
import java.io.IOException;
import java.lang.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.*;
public class kmeansReducer extends Reducer<DoubleWritable, DoubleWritable,
DoubleWritable, Text> {
public void reduce(DoubleWritable key, Iterable<DoubleWritable> values,
Context context)// get count // get total //get values in a string
throws IOException, InterruptedException {
Iterator<DoubleWritable> v = values.iterator();
double total = 0;
double count = 0;
String value = ""; //value is the rating
while (v.hasNext()){
double i = v.next().get();
value = value + " " + Double.toString(i);
total = total + i;
++count;
}
double nCenter = total/count;
context.write(new DoubleWritable(nCenter), new Text(value));
}
}
import java.util.Arrays;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class run
{
public static void runJob(String[] input, String output) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
Path toCache = new Path("input/centroid.txt");
job.addCacheFile(toCache.toUri());
job.setJarByClass(run.class);
job.setMapperClass(kmeansMapper.class);
job.setReducerClass(kmeansReducer.class);
job.setMapOutputKeyClass(DoubleWritable.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setNumReduceTasks(1);
Path outputPath = new Path(output);
FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
FileOutputFormat.setOutputPath(job, outputPath);
outputPath.getFileSystem(conf).delete(outputPath,true);
job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]);
}
}
Thanks
I know you put the disclaimer... but please switch to Spark or some other framework that can solve problems in-memory. Your life will be so much better.
If you really want to do this, just run the code in runJob iteratively and use a temporary file name for the input. You can see this question on moving files in Hadoop to achieve this. You'll need a FileSystem instance and a temp file for input:
FileSystem fs = FileSystem.get(new Configuration());
Path tempInputPath = new Path("/user/th/kmeans/tmp_input");
Broadly speaking, after each iteration finishes, do:
fs.delete(tempInputPath, true);
fs.rename(outputPath, tempInputPath);
Of course, for the very first iteration you must set the input path to the input paths provided when running the job. Subsequent iterations can use tempInputPath, which will hold the output of the previous iteration.
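To tie this together with the convergence condition in the question, the driver loop could look roughly like the sketch below. This is only an illustration, not tested code, and it makes a few assumptions: a single reduce task (so the output file is part-r-00000), the new center being the first token on each output line (which matches kmeansReducer above), and the centers that actually change between passes living in input/centroid.txt, the file kmeansMapper reads through the distributed cache, so that is the file refreshed here rather than the ratings input.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class KmeansDriver {
    // Reads the first token of every non-empty line as a double and sorts the result,
    // so the comparison does not depend on the order the centers appear in the file.
    static List<Double> readCenters(FileSystem fs, Path path) throws Exception {
        List<Double> centers = new ArrayList<Double>();
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
        String line;
        while ((line = br.readLine()) != null) {
            if (!line.trim().isEmpty()) {
                centers.add(Double.parseDouble(line.trim().split("\\s+")[0]));
            }
        }
        br.close();
        Collections.sort(centers);
        return centers;
    }
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path centroidPath = new Path("input/centroid.txt");       // file kmeansMapper reads in setup()
        Path outputPath = new Path(args[args.length - 1]);
        Path reducerOutput = new Path(outputPath, "part-r-00000"); // single reducer assumed
        boolean converged = false;
        while (!converged) {
            List<Double> oldCenters = readCenters(fs, centroidPath);
            // one k-means pass, reusing the existing job driver
            run.runJob(Arrays.copyOfRange(args, 0, args.length - 1), args[args.length - 1]);
            List<Double> newCenters = readCenters(fs, reducerOutput);
            // converged when every center moved by less than 0.1
            converged = newCenters.size() == oldCenters.size();
            for (int i = 0; converged && i < newCenters.size(); i++) {
                if (Math.abs(newCenters.get(i) - oldCenters.get(i)) >= 0.1) {
                    converged = false;
                }
            }
            // rewrite centroid.txt with one center per line, the format setup() expects
            FSDataOutputStream out = fs.create(centroidPath, true);
            for (Double c : newCenters) {
                out.writeBytes(c + "\n");
            }
            out.close();
        }
    }
}
The delete/rename pattern above is still what you would use if you want the job output to literally replace an input path; for this particular job the per-iteration state is the list of centers, so rewriting centroid.txt is enough.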
Use case:
File 1 contains impression data (trackerId + other fields).
File 2 contains click details (trackerId + clicked).
I am using a different mapper for each of the two files and a single reducer, but the reducer does not seem to combine the data from both files.
package com.hadoop.intellipaat;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.google.common.collect.Lists;
/**
* This job will combine click and impression on TrackerId
*
* @author raghunandangupta
*
*/
public class JoinClickImpressionDetailJob {
public static final String IMPRESSION_PREFIX = "IMPRESSION_PREFIX";
public static final String CLICK_PREFIX = "CLICK_PREFIX";
public static final String SEPERATOR = "~";
private static class ImpressionMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
/**
* Excluding header
*/
if (!(value.toString().indexOf("accountId") != -1)) {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(IMPRESSION_PREFIX + SEPERATOR + value.toString()));
}
} else {
context.write(new Text(""), value);
}
}
}
private static class ClickMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(CLICK_PREFIX + SEPERATOR + value.toString()));
} else {
context.write(new Text(""), new Text("1"));
}
}
}
private static class ImpressionClickReducer extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) {
try {
System.out.println("=========="+key.toString());
if (key.toString().length() != 0) {
List<Text> myList = Lists.newArrayList(values);
for(Text t : myList){
System.out.println("#######"+t.toString());
}
System.out.println("#########################");
if (myList.size() == 2) {
if (myList.get(0).toString().indexOf(IMPRESSION_PREFIX) != -1 && myList.get(1).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(0).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
} else if (myList.get(1).toString().indexOf(IMPRESSION_PREFIX) != -1
&& myList.get(0).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(1).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
}
}
}
} catch (Exception exception) {
exception.printStackTrace();
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
// conf.set("mapreduce.output.fileoutputformat.compress", "true");
// conf.set("mapreduce.output.fileoutputformat.compress.codec",
// "org.apache.hadoop.io.compress.GzipCodec");
// conf.set("mapreduce.map.output.compress.codec",
// "org.apache.hadoop.io.compress.SnappyCodec");
// conf.set("mapreduce.output.fileoutputformat.compress.type",
// "BLOCK");
Job job = Job.getInstance(conf, "IMPRESSION_CLICK_COMBINE_JOB");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setReducerClass(ImpressionClickReducer.class);
FileInputFormat.setInputDirRecursive(job, true);
// FileInputFormat.addInputPath(job, new Path(args[0]));
// job.setMapperClass(ImpressionMapper.class);
/**
* Here directory of impressions will be present
*/
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, ImpressionMapper.class);
/**
* Here directory of clicks will be present
*/
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ClickMapper.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.waitForCompletion(true);
} catch (Exception e) {
e.printStackTrace();
}
}
}
Any leads will be appreciated.
E.g. File 1: [trackerId1, record1]
File 2: [trackerId1, Clicked]
In the reducer I am getting:
trackerId, [record1, record1], while ideally it should be trackerId, [record1, clicked].
Your problem is most likely with this line in the reducer:
List<Text> myList = Lists.newArrayList(values);
The main thing to remember is that the Iterable<Text> values reuses the Text object it gives you as you iterate. So you may be adding two Text references to the list, but they both point at the same underlying object.
If you look at how Lists.newArrayList() works, it just adds the objects to the list without creating new ones.
So if you're going to use Text objects you need to create a new one each time you add a value to the list. This is typically the reason people use Strings in cases like this. A quick check to see whether this is the problem would be to change this code to something like:
List<Text> myList = new ArrayList<Text>();
for (Text v : values) {
myList.add(new Text(v));
}
Thus, you create a new Text each time.
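A small alternative sketch (my own illustration of the String-based approach mentioned above, not part of the original answer): copy each value into a plain String, since the reducer only needs the text contents:
List<String> myList = Lists.newArrayList();   // Guava's Lists is already imported in the job class
for (Text v : values) {
    // toString() copies the bytes into an immutable String,
    // so Hadoop reusing the Text object cannot affect earlier entries
    myList.add(v.toString());
}
The rest of the reducer can then look for IMPRESSION_PREFIX and CLICK_PREFIX in the Strings exactly as before.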
I have been trying to get the wordcount example that comes with Cassandra to work with Hadoop.
The source code:
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.nio.charset.CharacterCodingException;
/**
* This counts the occurrences of words in ColumnFamily
* cql3_worldcount ( user_id text,
* category_id text,
* sub_category_id text,
* title text,
* body text,
* PRIMARY KEY (user_id, category_id, sub_category_id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( row_id1 text,
* row_id2 text,
* word text,
* count_num text,
* PRIMARY KEY ((row_id1, row_id2), word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String KEYSPACE = "cql3_worldcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(0);
}
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"body".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
logger.debug("read {}:{}={} from {}",
new Object[] {toString(keys), column.getKey(), value, context.getInputSplit()});
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
private String toString(Map<String, ByteBuffer> keys)
{
String result = "";
try
{
for (ByteBuffer key : keys.values())
result = result + ByteBufferUtil.string(key) + ":";
}
catch (CharacterCodingException e)
{
logger.error("Failed to print keys", e);
}
return result;
}
}
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
String[] partitionKeys = context.getConfiguration().get(PRIMARY_KEY).split(",");
keys.put("row_id1", ByteBufferUtil.bytes(partitionKeys[0]));
keys.put("row_id2", ByteBufferUtil.bytes(partitionKeys[1]));
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
keys.put("word", ByteBufferUtil.bytes(word.toString()));
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR))
{
String[] s = args[0].split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
job.setInputFormatClass(CqlPagingInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
//this is the user defined filter clauses, you can comment it out if you want count all titles
CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "title='A'");
job.waitForCompletion(true);
return 0;
}
}
After I compile and build the jar file, when I try to run it with Hadoop the program gets to the job.waitForCompletion(true) call and freezes; it doesn't output anything related to MapReduce, nor any error.
I am using Hadoop 1.2.1 and Cassandra 2.0.4.
Does anyone have any idea what the problem is?
Thanks
I am very new to Hadoop. Can anyone tell me what I should keep in the pages folder (/user/gates/pages)? Should I keep a text file containing the data, and if so, what should the format of that file be?
What data should I keep in the file referenced by [FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));]: usernames, ages, or website names? Could you please give the details of the input file?
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;
public class MRExample {
public static class LoadPages extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String key = line.substring(0, firstComma);
String value = line.substring(firstComma + 1);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("1" + value);
oc.collect(outKey, outVal);
}
}
public static class LoadAndFilterUsers extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String value = line.substring(firstComma + 1);
int age = Integer.parseInt(value);
if (age < 18 || age > 25) return;
String key = line.substring(0, firstComma);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("2" + value);
oc.collect(outKey, outVal);
}
}
public static class Join extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key,
Iterator<Text> iter,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// For each value, figure out which file it's from and store it
// accordingly.
List<String> first = new ArrayList<String>();
List<String> second = new ArrayList<String>();
while (iter.hasNext()) {
Text t = iter.next();
String value = t.toString();
if (value.charAt(0) == '1') first.add(value.substring(1));
else second.add(value.substring(1));
reporter.setStatus("OK");
}
// Do the cross product and collect the values
for (String s1 : first) {
for (String s2 : second) {
String outval = key + "," + s1 + "," + s2;
oc.collect(null, new Text(outval));
reporter.setStatus("OK");
}
}
}
}
public static class LoadJoined extends MapReduceBase
implements Mapper<Text, Text, Text, LongWritable> {
public void map(
Text k,
Text val,
OutputCollector<Text, LongWritable> oc,
Reporter reporter) throws IOException {
// Find the url
String line = val.toString();
int firstComma = line.indexOf(',');
int secondComma = line.indexOf(',', firstComma);
String key = line.substring(firstComma, secondComma);
// drop the rest of the record, I don't need it anymore,
// just pass a 1 for the combiner/reducer to sum instead.
Text outKey = new Text(key);
oc.collect(outKey, new LongWritable(1L));
}
}
public static class ReduceUrls extends MapReduceBase
implements Reducer<Text, LongWritable, WritableComparable, Writable> {
public void reduce(
Text key,
Iterator<LongWritable> iter,
OutputCollector<WritableComparable, Writable> oc,
Reporter reporter) throws IOException {
// Add up all the values we see
long sum = 0;
while (iter.hasNext()) {
sum += iter.next().get();
reporter.setStatus("OK");
}
oc.collect(key, new LongWritable(sum));
}
}
public static class LoadClicks extends MapReduceBase
implements Mapper<WritableComparable, Writable, LongWritable, Text> {
public void map(
WritableComparable key,
Writable val,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
oc.collect((LongWritable)val, (Text)key);
}
}
public static class LimitClicks extends MapReduceBase
implements Reducer<LongWritable, Text, LongWritable, Text> {
int count = 0;
public void reduce(
LongWritable key,
Iterator<Text> iter,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
// Only output the first 100 records
while (count < 100 && iter.hasNext()) {
oc.collect(key, iter.next());
count++;
}
}
}
public static void main(String[] args) throws IOException {
JobConf lp = new JobConf(MRExample.class);
lp.setJobName("Load Pages");
lp.setInputFormat(TextInputFormat.class);
lp.setOutputKeyClass(Text.class);
lp.setOutputValueClass(Text.class);
lp.setMapperClass(LoadPages.class);
FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
FileOutputFormat.setOutputPath(lp,
new Path("/user/gates/tmp/indexed_pages"));
lp.setNumReduceTasks(0);
Job loadPages = new Job(lp);
JobConf lfu = new JobConf(MRExample.class);
lfu.setJobName("Load and Filter Users");
lfu.setInputFormat(TextInputFormat.class);
lfu.setOutputKeyClass(Text.class);
lfu.setOutputValueClass(Text.class);
lfu.setMapperClass(LoadAndFilterUsers.class);
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
FileOutputFormat.setOutputPath(lfu,
new Path("/user/gates/tmp/filtered_users"));
lfu.setNumReduceTasks(0);
Job loadUsers = new Job(lfu);
JobConf join = new JobConf(MRExample.class);
join.setJobName("Join Users and Pages");
join.setInputFormat(KeyValueTextInputFormat.class);
join.setOutputKeyClass(Text.class);
join.setOutputValueClass(Text.class);
join.setMapperClass(IdentityMapper.class);
join.setReducerClass(Join.class);
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/indexed_pages"));
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/filtered_users"));
FileOutputFormat.setOutputPath(join, new Path("/user/gates/tmp/joined"));
join.setNumReduceTasks(50);
Job joinJob = new Job(join);
joinJob.addDependingJob(loadPages);
joinJob.addDependingJob(loadUsers);
JobConf group = new JobConf(MRExample.class);
group.setJobName("Group URLs");
group.setInputFormat(KeyValueTextInputFormat.class);
group.setOutputKeyClass(Text.class);
group.setOutputValueClass(LongWritable.class);
group.setOutputFormat(SequenceFileOutputFormat.class);
group.setMapperClass(LoadJoined.class);
group.setCombinerClass(ReduceUrls.class);
group.setReducerClass(ReduceUrls.class);
FileInputFormat.addInputPath(group, new Path("/user/gates/tmp/joined"));
FileOutputFormat.setOutputPath(group, new Path("/user/gates/tmp/grouped"));
group.setNumReduceTasks(50);
Job groupJob = new Job(group);
groupJob.addDependingJob(joinJob);
JobConf top100 = new JobConf(MRExample.class);
top100.setJobName("Top 100 sites");
top100.setInputFormat(SequenceFileInputFormat.class);
top100.setOutputKeyClass(LongWritable.class);
top100.setOutputValueClass(Text.class);
top100.setOutputFormat(SequenceFileOutputFormat.class);
top100.setMapperClass(LoadClicks.class);
top100.setCombinerClass(LimitClicks.class);
top100.setReducerClass(LimitClicks.class);
FileInputFormat.addInputPath(top100, new Path("/user/gates/tmp/grouped"));
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));
top100.setNumReduceTasks(1);
Job limit = new Job(top100);
limit.addDependingJob(groupJob);
JobControl jc = new JobControl("Find top 100 sites for users 18 to 25");
jc.addJob(loadPages);
jc.addJob(loadUsers);
jc.addJob(joinJob);
jc.addJob(groupJob);
jc.addJob(limit);
jc.run();
}
}
FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
...
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
These are the two hard-coded input paths the job expects to find in HDFS, and for output:
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));
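As for the format, this is an inference from the mappers rather than anything documented: LoadPages takes everything before the first comma as the user and everything after it as the page URL, and LoadAndFilterUsers parses everything after the first comma as an integer age. So both inputs are plain comma-separated text files, roughly like this (the names, ages, and URLs below are made up purely for illustration):
/user/gates/pages  (one "user,url" record per line):
alice,http://example.com/index.html
bob,http://example.com/news.html
/user/gates/users  (one "user,age" record per line):
alice,21
bob,30
Both mappers emit the user name as the join key; the later jobs then count page URLs and keep the top 100 for users aged 18 to 25, which is what the final output path name suggests.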