input file format for map-reduce - hadoop

I am very new to hadoop, Can anyone know what should i keep in(/user/gates/pages) pages folder.should i keep on text file which contain data.if only txt file what will be the format of that
What data should I keep in [FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));] pages file.username or ages or websites name could u please give the details of input file
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;
public class MRExample {
public static class LoadPages extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String key = line.substring(0, firstComma);
String value = line.substring(firstComma + 1);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("1" + value);
oc.collect(outKey, outVal);
}
}
public static class LoadAndFilterUsers extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String value = line.substring(firstComma + 1);
int age = Integer.parseInt(value);
if (age < 18 || age > 25) return;
String key = line.substring(0, firstComma);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("2" + value);
oc.collect(outKey, outVal);
}
}
public static class Join extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key,
Iterator<Text> iter,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// For each value, figure out which file it's from and store it
// accordingly.
List<String> first = new ArrayList<String>();
List<String> second = new ArrayList<String>();
while (iter.hasNext()) {
Text t = iter.next();
String value = t.toString();
if (value.charAt(0) == '1') first.add(value.substring(1));
else second.add(value.substring(1));
reporter.setStatus("OK");
}
// Do the cross product and collect the values
for (String s1 : first) {
for (String s2 : second) {
String outval = key + "," + s1 + "," + s2;
oc.collect(null, new Text(outval));
reporter.setStatus("OK");
}
}
}
}
public static class LoadJoined extends MapReduceBase
implements Mapper<Text, Text, Text, LongWritable> {
public void map(
Text k,
Text val,
OutputCollector<Text, LongWritable> oc,
Reporter reporter) throws IOException {
// Find the url
String line = val.toString();
int firstComma = line.indexOf(',');
int secondComma = line.indexOf(',', firstComma);
String key = line.substring(firstComma, secondComma);
// drop the rest of the record, I don't need it anymore,
// just pass a 1 for the combiner/reducer to sum instead.
Text outKey = new Text(key);
oc.collect(outKey, new LongWritable(1L));
}
}
public static class ReduceUrls extends MapReduceBase
implements Reducer<Text, LongWritable, WritableComparable, Writable> {
public void reduce(
Text key,
Iterator<LongWritable> iter,
OutputCollector<WritableComparable, Writable> oc,
Reporter reporter) throws IOException {
// Add up all the values we see
long sum = 0;
while (iter.hasNext()) {
sum += iter.next().get();
reporter.setStatus("OK");
}
oc.collect(key, new LongWritable(sum));
}
}
public static class LoadClicks extends MapReduceBase
implements Mapper<WritableComparable, Writable, LongWritable, Text> {
public void map(
WritableComparable key,
Writable val,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
oc.collect((LongWritable)val, (Text)key);
}
}
public static class LimitClicks extends MapReduceBase
implements Reducer<LongWritable, Text, LongWritable, Text> {
int count = 0;
public void reduce(
LongWritable key,
Iterator<Text> iter,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
// Only output the first 100 records
while (count < 100 && iter.hasNext()) {
oc.collect(key, iter.next());
count++;
}
}
}
public static void main(String[] args) throws IOException {
JobConf lp = new JobConf(MRExample.class);
lp.setJobName("Load Pages");
lp.setInputFormat(TextInputFormat.class);
lp.setOutputKeyClass(Text.class);
lp.setOutputValueClass(Text.class);
lp.setMapperClass(LoadPages.class);
FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
FileOutputFormat.setOutputPath(lp,
new Path("/user/gates/tmp/indexed_pages"));
lp.setNumReduceTasks(0);
Job loadPages = new Job(lp);
JobConf lfu = new JobConf(MRExample.class);
lfu.setJobName("Load and Filter Users");
lfu.setInputFormat(TextInputFormat.class);
lfu.setOutputKeyClass(Text.class);
lfu.setOutputValueClass(Text.class);
lfu.setMapperClass(LoadAndFilterUsers.class);
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
FileOutputFormat.setOutputPath(lfu,
new Path("/user/gates/tmp/filtered_users"));
lfu.setNumReduceTasks(0);
Job loadUsers = new Job(lfu);
JobConf join = new JobConf(MRExample.class);
join.setJobName("Join Users and Pages");
join.setInputFormat(KeyValueTextInputFormat.class);
join.setOutputKeyClass(Text.class);
join.setOutputValueClass(Text.class);
join.setMapperClass(IdentityMapper.class);
join.setReducerClass(Join.class);
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/indexed_pages"));
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/filtered_users"));
FileOutputFormat.setOutputPath(join, new Path("/user/gates/tmp/joined"));
join.setNumReduceTasks(50);
Job joinJob = new Job(join);
joinJob.addDependingJob(loadPages);
joinJob.addDependingJob(loadUsers);
JobConf group = new JobConf(MRExample.class);
group.setJobName("Group URLs");
group.setInputFormat(KeyValueTextInputFormat.class);
group.setOutputKeyClass(Text.class);
group.setOutputValueClass(LongWritable.class);
group.setOutputFormat(SequenceFileOutputFormat.class);
group.setMapperClass(LoadJoined.class);
group.setCombinerClass(ReduceUrls.class);
group.setReducerClass(ReduceUrls.class);
FileInputFormat.addInputPath(group, new Path("/user/gates/tmp/joined"));
FileOutputFormat.setOutputPath(group, new Path("/user/gates/tmp/grouped"));
group.setNumReduceTasks(50);
Job groupJob = new Job(group);
groupJob.addDependingJob(joinJob);
JobConf top100 = new JobConf(MRExample.class);
top100.setJobName("Top 100 sites");
top100.setInputFormat(SequenceFileInputFormat.class);
top100.setOutputKeyClass(LongWritable.class);
top100.setOutputValueClass(Text.class);
top100.setOutputFormat(SequenceFileOutputFormat.class);
top100.setMapperClass(LoadClicks.class);
top100.setCombinerClass(LimitClicks.class);
top100.setReducerClass(LimitClicks.class);
FileInputFormat.addInputPath(top100, new Path("/user/gates/tmp/grouped"));
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));
top100.setNumReduceTasks(1);
Job limit = new Job(top100);
limit.addDependingJob(groupJob);
JobControl jc = new JobControl("Find top 100 sites for users 18 to 25");
jc.addJob(loadPages);
jc.addJob(loadUsers);
jc.addJob(joinJob);
jc.addJob(groupJob);
jc.addJob(limit);
jc.run();
}
}

FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
...
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
These are the two hard coded files it's expecting to find in HDFS, and for output:
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));

Related

mapReduce to get desired output

Kindly point me in a direction to get my desired output
Current outPut given:
Albania 3607 ++ Country minPopulation
Albania 418495 ++ Country maxPopulation
Desired Output
country city minPopulation
country city maxPopulation
Reducer Class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Handson3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
minValue = Math.min(minValue, value.get());
}
context.write(key, new IntWritable(minValue));
context.write(key, new IntWritable(maxValue));
}
}
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class handson3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length()-1);
String newString = country.concat(field[0].substring(1, field[0].length()-1));
String population = field[9].substring(1, field[9].length()-1);
String city = field[0].substring(1, field[0].length()-1);
if (!population.matches(".*\\d.*") || population.equals("")||
population.matches("([0-9].*)\\.([0-9].*)") ){
return;
}else{
populationVal = Integer.parseInt(population);
context.write(new Text(country),new IntWritable(populationVal));
}
}
}
Runner Class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class handsonJobRunner {
public int run(String[] args) throws Exception {
if(args.length !=2) {
System.err.println("Usage: Handson3 <input path> <outputpath>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(handsonJobRunner.class);
job.setJobName("Handson 3");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(handson3Mapper.class);
job.setReducerClass(Handson3Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0:1);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
handsonJobRunner driver = new handsonJobRunner();
driver.run(args);
}
}
Thank you in advance, any pointers would be much appreciated.
You should send both city and population as value to reducer and at reducer select the city with max and min population for each country.
Your mapper would be like this:
public class Handson3Mapper extends Mapper<LongWritable, Text, Text, Text> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length() - 1);
String newString = country.concat(field[0].substring(1, field[0].length() - 1));
String population = field[9].substring(1, field[9].length() - 1);
String city = field[0].substring(1, field[0].length() - 1);
if (!population.matches(".*\\d.*") || population.equals("") ||
population.matches("([0-9].*)\\.([0-9].*)")) {
return;
} else {
populationVal = Integer.parseInt(population);
context.write(new Text(country), new Text(city + "-" + populationVal));
}
}
}
And Your reducer should change to this one:
public class Handson3Reducer extends Reducer<Text, Text, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String maxPopulationCityName = "";
String minPopulationCityName = "";
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
String[] array = value.toString().split("-");
int population = Integer.valueOf(array[1]);
if (population > maxValue) {
maxPopulationCityName = array[0];
maxValue = population;
}
if (population < minValue) {
minPopulationCityName = array[0];
minValue = population;
}
}
context.write(new Text(key + " " + minPopulationCityName), new IntWritable(minValue));
context.write(new Text(key + " " + maxPopulationCityName), new IntWritable(maxValue));
}
}

Reducer not able to group by key for different mappers

Use-case :
File 1 contain impression data which contains trackerId + other fields
File 2 contains click details contains trackerId + clicked
I am using different mappers for above two and one reducer but it seems reducer is not able to combine both files data.
package com.hadoop.intellipaat;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.google.common.collect.Lists;
/**
* This job will combine click and impression on TrackerId
*
* #author raghunandangupta
*
*/
public class JoinClickImpressionDetailJob {
public static final String IMPRESSION_PREFIX = "IMPRESSION_PREFIX";
public static final String CLICK_PREFIX = "CLICK_PREFIX";
public static final String SEPERATOR = "~";
private static class ImpressionMapper extends Mapper<LongWritable, Text, Text, Text> {
#Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
/**
* Excluding header
*/
if (!(value.toString().indexOf("accountId") != -1)) {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(IMPRESSION_PREFIX + SEPERATOR + value.toString()));
}
} else {
context.write(new Text(""), value);
}
}
}
private static class ClickMapper extends Mapper<LongWritable, Text, Text, Text> {
#Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(CLICK_PREFIX + SEPERATOR + value.toString()));
} else {
context.write(new Text(""), new Text("1"));
}
}
}
private static class ImpressionClickReducer extends Reducer<Text, Text, Text, Text> {
#Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) {
try {
System.out.println("=========="+key.toString());
if (key.toString().length() != 0) {
List<Text> myList = Lists.newArrayList(values);
for(Text t : myList){
System.out.println("#######"+t.toString());
}
System.out.println("#########################");
if (myList.size() == 2) {
if (myList.get(0).toString().indexOf(IMPRESSION_PREFIX) != -1 && myList.get(1).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(0).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
} else if (myList.get(1).toString().indexOf(IMPRESSION_PREFIX) != -1
&& myList.get(0).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(1).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
}
}
}
} catch (Exception exception) {
exception.printStackTrace();
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
// conf.set("mapreduce.output.fileoutputformat.compress", "true");
// conf.set("mapreduce.output.fileoutputformat.compress.codec",
// "org.apache.hadoop.io.compress.GzipCodec");
// conf.set("mapreduce.map.output.compress.codec",
// "org.apache.hadoop.io.compress.SnappyCodec");
// conf.set("mapreduce.output.fileoutputformat.compress.type",
// "BLOCK");
Job job = Job.getInstance(conf, "IMPRESSION_CLICK_COMBINE_JOB");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setReducerClass(ImpressionClickReducer.class);
FileInputFormat.setInputDirRecursive(job, true);
// FileInputFormat.addInputPath(job, new Path(args[0]));
// job.setMapperClass(ImpressionMapper.class);
/**
* Here directory of impressions will be present
*/
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, ImpressionMapper.class);
/**
* Here directory of clicks will be present
*/
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ClickMapper.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.waitForCompletion(true);
} catch (Exception e) {
e.printStackTrace();
}
}
}
Any leads will be appreciated.
Eg. File 1 [trackerId1,record1]
File2 [treackerId1, Clicked]
In reducer I am getting :
trackerId,[record1,record1] ideally it should be trackerId ,[record1,clicked]
Your problem is most likely with this line in the reducer:
List<Text> myList = Lists.newArrayList(values);
The main thing to remember is that Iterable<Text> values is reusing the Text object it gives you as you iterate. So you might be adding two Text objects to the array, but they are pointing at the same object.
If you look at how Lists.newArrayList() works, it just adds objects to the array, without creating a new one.
So if you're going to use Text objects you need to create a new one each time you add a value to the array. This is typically a reason why people use Strings in cases like this. A quick check to see if this is the problem would be to change this code to something like:
List<Text> myList = new ArrayList<Text>();
for (Text v : values) {
myList.add(new Text(v));
}
Thus, you create a new Text each time.

Sample code for using hadoop mapreduce against cassandra

I have been trying to get a MapReduce sample code that comes with Cassandra running but I get run time error.
Source code:
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.nio.charset.CharacterCodingException;
/**
* This counts the occurrences of words in ColumnFamily
* cql3_worldcount ( user_id text,
* category_id text,
* sub_category_id text,
* title text,
* body text,
* PRIMARY KEY (user_id, category_id, sub_category_id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( row_id1 text,
* row_id2 text,
* word text,
* count_num text,
* PRIMARY KEY ((row_id1, row_id2), word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String KEYSPACE = "cql3_worldcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(0);
}
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"body".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
logger.debug("read {}:{}={} from {}",
new Object[] {toString(keys), column.getKey(), value, context.getInputSplit()});
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
private String toString(Map<String, ByteBuffer> keys)
{
String result = "";
try
{
for (ByteBuffer key : keys.values())
result = result + ByteBufferUtil.string(key) + ":";
}
catch (CharacterCodingException e)
{
logger.error("Failed to print keys", e);
}
return result;
}
}
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
String[] partitionKeys = context.getConfiguration().get(PRIMARY_KEY).split(",");
keys.put("row_id1", ByteBufferUtil.bytes(partitionKeys[0]));
keys.put("row_id2", ByteBufferUtil.bytes(partitionKeys[1]));
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
keys.put("word", ByteBufferUtil.bytes(word.toString()));
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR))
{
String[] s = args[0].split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
job.setInputFormatClass(CqlPagingInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
//this is the user defined filter clauses, you can comment it out if you want count all titles
CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "title='A'");
job.waitForCompletion(true);
return 0;
}
}
It compiles fine but I get this error:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/cassandra/hadoop/cql3/CqlPagingInputFormat
at WordCount.run(WordCount.java:230)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at WordCount.main(WordCount.java:94)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:160)
Caused by: java.lang.ClassNotFoundException: org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
... 8 more
I am using hadoop 1.2.1 and cassandra 2.0.4.
Help with this error or sample code or instruction for getting hadoop mapreduce to work with cassandra would be appreciated.
To solve the problem copy cassandra jar files to hadoop lib directory.
Please use following path
export HADOOP_CLASSPATH=/< path to cassandra >/lib/*:$HADOOP_CLASSPATH in /< hadoop path >/conf/hadoop-env.sh file.

job.waitForCompletion(true) freezes when using CqlPagingInputFormat as InputFormatClass

I have been trying to get the wordcount example that comes with cassandra to work with hadoop.
The source code:
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.nio.charset.CharacterCodingException;
/**
* This counts the occurrences of words in ColumnFamily
* cql3_worldcount ( user_id text,
* category_id text,
* sub_category_id text,
* title text,
* body text,
* PRIMARY KEY (user_id, category_id, sub_category_id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( row_id1 text,
* row_id2 text,
* word text,
* count_num text,
* PRIMARY KEY ((row_id1, row_id2), word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String KEYSPACE = "cql3_worldcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(0);
}
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"body".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
logger.debug("read {}:{}={} from {}",
new Object[] {toString(keys), column.getKey(), value, context.getInputSplit()});
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
private String toString(Map<String, ByteBuffer> keys)
{
String result = "";
try
{
for (ByteBuffer key : keys.values())
result = result + ByteBufferUtil.string(key) + ":";
}
catch (CharacterCodingException e)
{
logger.error("Failed to print keys", e);
}
return result;
}
}
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
String[] partitionKeys = context.getConfiguration().get(PRIMARY_KEY).split(",");
keys.put("row_id1", ByteBufferUtil.bytes(partitionKeys[0]));
keys.put("row_id2", ByteBufferUtil.bytes(partitionKeys[1]));
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
keys.put("word", ByteBufferUtil.bytes(word.toString()));
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR))
{
String[] s = args[0].split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
job.setInputFormatClass(CqlPagingInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
//this is the user defined filter clauses, you can comment it out if you want count all titles
CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "title='A'");
job.waitForCompletion(true);
return 0;
}
}
After I compile and make the jar file, when I try to run it with hadoop the program runs to the job.waitForCompletion(true); point and freezes, it doesn't output anything related to mapreduce or any error.
I am using hadoop 1.2.1 and cassandra 2.0.4
Does anyone have any idea what the problem is?
Thanks

How to store input of input file array in Map Reduce(Java)

I've write Linear Regression Program in java.
Input is -->
2,21.05
3,23.51
4,24.23
5,27.71
6,30.86
8,45.85
10,52.12
11,55.98
I want store input in array like x[]={2,3,...11} before processing input to reduce task. Then send that array variable to reduce() function
But I'm only on value at a time My program.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LinearRegression {
public static class RegressionMapper extends
Mapper<LongWritable, Text, Text, CountRegression> {
private Text id = new Text();
private CountRegression countRegression = new CountRegression();
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String tempString = value.toString();
String[] inputData = tempString.split(",");
String xVal = inputData[0];
String yVal = inputData[1];
countRegression.setxVal(Integer.parseInt(xVal));
countRegression.setyVal(Float.parseFloat(yVal));
id.set(xVal);
context.write(id, countRegression);
}
}
public static class RegressionReducer extends
Reducer<Text, CountRegression, Text, CountRegression> {
private CountRegression result = new CountRegression();
// static float meanX = 0;
// private float xValues[];
// private float yValues[];
static float xRed = 0.0f;
static float yRed = 0.3f;
static float sum = 0;
static ArrayList<Float> list = new ArrayList<Float>();
public void reduce(Text key, Iterable<CountRegression> values,
Context context) throws IOException, InterruptedException {
//float b = 0;
// while(values.iterator().hasNext())
// {
// xRed = xRed + values.iterator().next().getxVal();
// yRed = yRed + values.iterator().next().getyVal();
// }
for (CountRegression val : values) {
list.add(val.getxVal());
// list.add(val.getyVal());
// xRed += val.getxVal();
// yRed = val.getyVal();
// meanX += val.getxVal();
//xValues = val.getxVal();
}
for (int i=0; i< list.size(); i++) {
int lastIndex = list.listIterator().previousIndex();
sum += list.get(lastIndex);
}
result.setxVal(sum);
result.setyVal(yRed);
context.write(key, result);
}
}
public static class CountRegression implements Writable {
private float xVal = 0;
private float yVal = 0;
public float getxVal() {
return xVal;
}
public void setxVal(float x) {
this.xVal = x;
}
public float getyVal() {
return yVal;
}
public void setyVal(float y) {
this.yVal = y;
}
#Override
public void readFields(DataInput in) throws IOException {
xVal = in.readFloat();
yVal = in.readFloat();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeFloat(xVal);
out.writeFloat(yVal);
}
#Override
public String toString() {
return "y = "+xVal+" +"+yVal+" x" ;
}
}
public static void main(String[] args) throws Exception {
// Provides access to configuration parameters.
Configuration conf = new Configuration();
// Create a new Job It allows the user to configure the job, submit it, control its execution, and query the state.
Job job = new Job(conf);
//Set the user-specified job name.
job.setJobName("LinearRegression");
//Set the Jar by finding where a given class came from.
job.setJarByClass(LinearRegression.class);
// Set the Mapper for the job.
job.setMapperClass(RegressionMapper.class);
// Set the Combiner for the job.
job.setCombinerClass(RegressionReducer.class);
// Set the Reducer for the job.
job.setReducerClass(RegressionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CountRegression.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Resources