mapReduce to get desired output - hadoop

Kindly point me in a direction to get my desired output
Current outPut given:
Albania 3607 ++ Country minPopulation
Albania 418495 ++ Country maxPopulation
Desired Output
country city minPopulation
country city maxPopulation
Reducer Class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Handson3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
minValue = Math.min(minValue, value.get());
}
context.write(key, new IntWritable(minValue));
context.write(key, new IntWritable(maxValue));
}
}
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class handson3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length()-1);
String newString = country.concat(field[0].substring(1, field[0].length()-1));
String population = field[9].substring(1, field[9].length()-1);
String city = field[0].substring(1, field[0].length()-1);
if (!population.matches(".*\\d.*") || population.equals("")||
population.matches("([0-9].*)\\.([0-9].*)") ){
return;
}else{
populationVal = Integer.parseInt(population);
context.write(new Text(country),new IntWritable(populationVal));
}
}
}
Runner Class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class handsonJobRunner {
public int run(String[] args) throws Exception {
if(args.length !=2) {
System.err.println("Usage: Handson3 <input path> <outputpath>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(handsonJobRunner.class);
job.setJobName("Handson 3");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(handson3Mapper.class);
job.setReducerClass(Handson3Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0:1);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
handsonJobRunner driver = new handsonJobRunner();
driver.run(args);
}
}
Thank you in advance, any pointers would be much appreciated.

You should send both city and population as value to reducer and at reducer select the city with max and min population for each country.
Your mapper would be like this:
public class Handson3Mapper extends Mapper<LongWritable, Text, Text, Text> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
int populationVal;
String line = value.toString();
String field[] = line.split(",");
String country = field[4].substring(1, field[4].length() - 1);
String newString = country.concat(field[0].substring(1, field[0].length() - 1));
String population = field[9].substring(1, field[9].length() - 1);
String city = field[0].substring(1, field[0].length() - 1);
if (!population.matches(".*\\d.*") || population.equals("") ||
population.matches("([0-9].*)\\.([0-9].*)")) {
return;
} else {
populationVal = Integer.parseInt(population);
context.write(new Text(country), new Text(city + "-" + populationVal));
}
}
}
And Your reducer should change to this one:
public class Handson3Reducer extends Reducer<Text, Text, Text, IntWritable> {
#Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String maxPopulationCityName = "";
String minPopulationCityName = "";
int maxValue = Integer.MIN_VALUE;
int minValue = Integer.MAX_VALUE;
String line = key.toString();
String field[] = line.split(",");
for (IntWritable value : values) {
String[] array = value.toString().split("-");
int population = Integer.valueOf(array[1]);
if (population > maxValue) {
maxPopulationCityName = array[0];
maxValue = population;
}
if (population < minValue) {
minPopulationCityName = array[0];
minValue = population;
}
}
context.write(new Text(key + " " + minPopulationCityName), new IntWritable(minValue));
context.write(new Text(key + " " + maxPopulationCityName), new IntWritable(maxValue));
}
}

Related

I'm getting the following error in My mapreduce code

I'm trying to get pearson's correlation for all pairs of column.
This is My MapReduce code :
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Pearson
{
public static class MyMapper extends Mapper<LongWritable,Text,IndexPair,ValuePair>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] tokens = line.split(",");
double[] arr = toDouble(tokens);
for(int i=0; i < arr.length; i++) {
for(int j=i+1; j < arr.length; j++) {
IndexPair k2 = new IndexPair(i, j);
ValuePair v2 = new ValuePair(arr[i], arr[j]);
context.write(k2, v2);
}
}
}
public double[] toDouble(String[] tokens) {
double[] arr = new double[tokens.length];
for(int i=0; i < tokens.length; i++) {
arr[i] = Double.parseDouble(tokens[i]);
}
return arr;
}
}
public static class MyReduce extends Reducer<IndexPair,ValuePair,IndexPair,DoubleWritable>
{
public void reduce(IndexPair key, Iterable<ValuePair> values, Context context) throws IOException, InterruptedException {
double x = 0.0d;
double y = 0.0d;
double xx = 0.0d;
double yy = 0.0d;
double xy = 0.0d;
double n = 0.0d;
for(ValuePair pairs : values) {
x += pairs.v1;
y += pairs.v2;
xx += Math.pow(pairs.v1, 2.0d);
yy += Math.pow(pairs.v2, 2.0d);
xy += (pairs.v1 * pairs.v2);
n += 1.0d;
}
double numerator = xy - ((x * y) / n);
double denominator1 = xx - (Math.pow(x, 2.0d) / n);
double denominator2 = yy - (Math.pow(y, 2.0d) / n);
double denominator = Math.sqrt(denominator1 * denominator2);
double corr = numerator / denominator;
context.write(key, new DoubleWritable(corr));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Pearson's Correlation");
job.setJarByClass(Pearson.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyReduce.class);
job.setReducerClass(MyReduce.class);
job.setMapOutputKeyClass(IndexPair.class);
job.setMapOutputValueClass(ValuePair.class);
job.setOutputKeyClass(IndexPair.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And Code for IndexPair is this one :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class IndexPair implements WritableComparable<IndexPair>{
public static String[] labels
={"Year","Month","MEI","CO2","CH4","N2O","CFC-11","CFC-12","TSI","Aerosols","Temp"};
public long i,j;
public IndexPair()
{
}
public IndexPair(long i,long j) {
this.i=i;
this.j=j;
}
#Override
public void readFields(DataInput in) throws IOException {
i = in.readLong();
j = in.readLong();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeLong(i);
out.writeLong(j);
}
#Override
public int compareTo(IndexPair o) {
Long i1 = i;
Long j1 = j;
Long i2 = o.i;
Long j2 = o.j;
int result = i1.compareTo(i2);
if (0 == result) {
return j1.compareTo(j2);
}
return result;
}
#Override
public String toString()
{
return "Corelation between column "+labels[(int) i]+"-->"+ labels[(int)j];
}
}
And Code For value Pair is thisone :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class ValuePair implements WritableComparable<ValuePair>{
public double v1,v2;
public ValuePair()
{
}
public ValuePair(double i,double j)
{
v1=i;
v2=j;
}
#Override
public void readFields(DataInput in) throws IOException {
v1=in.readDouble();
v2=in.readDouble();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeDouble(v1);
out.writeDouble(v2);
}
#Override
public int compareTo(ValuePair o) {
// comparator for value pair is not required....
return 0;
}
}
But Whn I'm trying to execute this, I'm getting the following error
17/07/20 13:59:49 INFO mapreduce.Job: map 0% reduce 0%
17/07/20 13:59:53 INFO mapreduce.Job: Task Id : attempt_1500536519279_0007_m_000000_0, Status : FAILED
Error: java.io.IOException: wrong value class: class org.apache.hadoop.io.DoubleWritable is not class ValuePair
at org.apache.hadoop.mapred.IFile$Writer.append(IFile.java:194)
at org.apache.hadoop.mapred.Task$CombineOutputCollector.collect(Task.java:1411)
at org.apache.hadoop.mapred.Task$NewCombinerRunner$OutputConverter.write(Task.java:1728)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
at Pearson$MyReduce.reduce(Pearson.java:66)
at Pearson$MyReduce.reduce(Pearson.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1749)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1639)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1491)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:723)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:793)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1807)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:169)
The problem is that you use the reducer as a combiner:
job.setCombinerClass(MyReduce.class);
The output key and value types of the combiner should be the same as the ones of the mapper, while, when you use the reducer as a combiner, it tries to emit pairs of different types, hence the error.

I want to show max,min and avg temperature using hadoop

My project is to show max,min and avg temperature. I have already done it, but I have to show this functions using group by key. There are 4 radio buttons for Year, month, date and city in my application. If I select one then it will ask me to input the aggregate functions(max,min,avg). For these I need to change my CompositeGroupKey class, but I don't have any idea about that. So please help me, and provide inputs about the changes need to be done with the code.
The driver :
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature
{
public static void Main (String[] args) throws Exception
{
if (args.length != 2)
{
System.err.println("Please Enter the input and output parameters");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path (args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setMapOutputKeyClass(CompositeGroupKey.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(CompositeGroupKey.class);
job.setOutputValueClass(DoubleWritable.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
The mapper :
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper <LongWritable, Text, CompositeGroupKey, IntWritable>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
int year = Integer.parseInt(line.substring(0,4));
String mnth = line.substring(7,10);
int date = Integer.parseInt(line.substring(10,12));
int temp= Integer.parseInt(line.substring(12,14));
CompositeGroupKey cntry = new CompositeGroupKey(year,mnth, date);
context.write(cntry, new IntWritable(temp));
}
}
The reducer :
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer <CompositeGroupKey, IntWritable, CompositeGroupKey, CompositeGroupkeyall >{
public void reduce(CompositeGroupKey key, Iterable<IntWritable> values , Context context) throws IOException,InterruptedException
{
Double max = Double.MIN_VALUE;
Double min =Double.MAX_VALUE;
for (IntWritable value : values )
{
min = Math.min(min, value.get());
max = Math.max(max, value.get());
}
CompositeGroupkeyall val =new CompositeGroupkeyall(max,min);
context.write(key, val);
}
}
And the composite key :
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
class CompositeGroupKey implements WritableComparable<CompositeGroupKey> {
int year;
String mnth;
int date;
CompositeGroupKey(int y, String c, int d){
year = y;
mnth = c;
date = d;
}
CompositeGroupKey(){}
public void write(DataOutput out) throws IOException {
out.writeInt(year);
WritableUtils.writeString(out, mnth);
out.writeInt(date);
}
public void readFields(DataInput in) throws IOException {
this.year = in.readInt();
this.mnth = WritableUtils.readString(in);
this.date = in.readInt();
}
public int compareTo(CompositeGroupKey pop) {
if (pop == null)
return 0;
int intcnt;
intcnt = Integer.valueOf(year).toString().compareTo(Integer.valueOf(pop.year).toString());
if(intcnt != 0){
return intcnt;
}else if(mnth.compareTo(pop.mnth) != 0){
return mnth.compareTo(pop.mnth);
}else{
return Integer.valueOf(date).toString().compareTo(Integer.valueOf(pop.date).toString());
}
}
public String toString() {
return year + " :" + mnth.toString() + " :" + date;
}
}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
class CompositeGroupkeyall implements WritableComparable<CompositeGroupkeyall> {
Double max;
Double min;
CompositeGroupkeyall(double x, double y){
max = x ;
min = y ;
}
CompositeGroupkeyall(){}
public void readFields(DataInput in) throws IOException {
this.max = in.readDouble();
this.min = in.readDouble();
}
public void write(DataOutput out) throws IOException {
out.writeDouble(max);
out.writeDouble(min);
}
public int compareTo(CompositeGroupkeyall arg0) {
return -1;
}
public String toString() {
return max + " " + min +" " ;
}
}
You can create more key value pairs as below and let the same reducer process the data, all the date/month/year will be processed by the same reducer
CompositeGroupKey cntry = new CompositeGroupKey(year, mnth, date);
CompositeGroupKey cntry_date = new CompositeGroupKey((int)0, "ALL", date);
CompositeGroupKey cntry_mnth = new CompositeGroupKey((int)0, mnth, (int) 1);
CompositeGroupKey cntry_year = new CompositeGroupKey(year, "ALL", (int) 1);
context.write(cntry, new IntWritable(temp));
context.write(cntry_date, new IntWritable(temp));
context.write(cntry_mnth, new IntWritable(temp));
context.write(cntry_year, new IntWritable(temp));

Sample code for using hadoop mapreduce against cassandra

I have been trying to get a MapReduce sample code that comes with Cassandra running but I get run time error.
Source code:
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.nio.charset.CharacterCodingException;
/**
* This counts the occurrences of words in ColumnFamily
* cql3_worldcount ( user_id text,
* category_id text,
* sub_category_id text,
* title text,
* body text,
* PRIMARY KEY (user_id, category_id, sub_category_id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( row_id1 text,
* row_id2 text,
* word text,
* count_num text,
* PRIMARY KEY ((row_id1, row_id2), word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String KEYSPACE = "cql3_worldcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(0);
}
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"body".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
logger.debug("read {}:{}={} from {}",
new Object[] {toString(keys), column.getKey(), value, context.getInputSplit()});
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
private String toString(Map<String, ByteBuffer> keys)
{
String result = "";
try
{
for (ByteBuffer key : keys.values())
result = result + ByteBufferUtil.string(key) + ":";
}
catch (CharacterCodingException e)
{
logger.error("Failed to print keys", e);
}
return result;
}
}
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
String[] partitionKeys = context.getConfiguration().get(PRIMARY_KEY).split(",");
keys.put("row_id1", ByteBufferUtil.bytes(partitionKeys[0]));
keys.put("row_id2", ByteBufferUtil.bytes(partitionKeys[1]));
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
keys.put("word", ByteBufferUtil.bytes(word.toString()));
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR))
{
String[] s = args[0].split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
job.setInputFormatClass(CqlPagingInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
//this is the user defined filter clauses, you can comment it out if you want count all titles
CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "title='A'");
job.waitForCompletion(true);
return 0;
}
}
It compiles fine but I get this error:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/cassandra/hadoop/cql3/CqlPagingInputFormat
at WordCount.run(WordCount.java:230)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at WordCount.main(WordCount.java:94)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:160)
Caused by: java.lang.ClassNotFoundException: org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
... 8 more
I am using hadoop 1.2.1 and cassandra 2.0.4.
Help with this error or sample code or instruction for getting hadoop mapreduce to work with cassandra would be appreciated.
To solve the problem copy cassandra jar files to hadoop lib directory.
Please use following path
export HADOOP_CLASSPATH=/< path to cassandra >/lib/*:$HADOOP_CLASSPATH in /< hadoop path >/conf/hadoop-env.sh file.

How to store input of input file array in Map Reduce(Java)

I've write Linear Regression Program in java.
Input is -->
2,21.05
3,23.51
4,24.23
5,27.71
6,30.86
8,45.85
10,52.12
11,55.98
I want store input in array like x[]={2,3,...11} before processing input to reduce task. Then send that array variable to reduce() function
But I'm only on value at a time My program.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LinearRegression {
public static class RegressionMapper extends
Mapper<LongWritable, Text, Text, CountRegression> {
private Text id = new Text();
private CountRegression countRegression = new CountRegression();
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String tempString = value.toString();
String[] inputData = tempString.split(",");
String xVal = inputData[0];
String yVal = inputData[1];
countRegression.setxVal(Integer.parseInt(xVal));
countRegression.setyVal(Float.parseFloat(yVal));
id.set(xVal);
context.write(id, countRegression);
}
}
public static class RegressionReducer extends
Reducer<Text, CountRegression, Text, CountRegression> {
private CountRegression result = new CountRegression();
// static float meanX = 0;
// private float xValues[];
// private float yValues[];
static float xRed = 0.0f;
static float yRed = 0.3f;
static float sum = 0;
static ArrayList<Float> list = new ArrayList<Float>();
public void reduce(Text key, Iterable<CountRegression> values,
Context context) throws IOException, InterruptedException {
//float b = 0;
// while(values.iterator().hasNext())
// {
// xRed = xRed + values.iterator().next().getxVal();
// yRed = yRed + values.iterator().next().getyVal();
// }
for (CountRegression val : values) {
list.add(val.getxVal());
// list.add(val.getyVal());
// xRed += val.getxVal();
// yRed = val.getyVal();
// meanX += val.getxVal();
//xValues = val.getxVal();
}
for (int i=0; i< list.size(); i++) {
int lastIndex = list.listIterator().previousIndex();
sum += list.get(lastIndex);
}
result.setxVal(sum);
result.setyVal(yRed);
context.write(key, result);
}
}
public static class CountRegression implements Writable {
private float xVal = 0;
private float yVal = 0;
public float getxVal() {
return xVal;
}
public void setxVal(float x) {
this.xVal = x;
}
public float getyVal() {
return yVal;
}
public void setyVal(float y) {
this.yVal = y;
}
#Override
public void readFields(DataInput in) throws IOException {
xVal = in.readFloat();
yVal = in.readFloat();
}
#Override
public void write(DataOutput out) throws IOException {
out.writeFloat(xVal);
out.writeFloat(yVal);
}
#Override
public String toString() {
return "y = "+xVal+" +"+yVal+" x" ;
}
}
public static void main(String[] args) throws Exception {
// Provides access to configuration parameters.
Configuration conf = new Configuration();
// Create a new Job It allows the user to configure the job, submit it, control its execution, and query the state.
Job job = new Job(conf);
//Set the user-specified job name.
job.setJobName("LinearRegression");
//Set the Jar by finding where a given class came from.
job.setJarByClass(LinearRegression.class);
// Set the Mapper for the job.
job.setMapperClass(RegressionMapper.class);
// Set the Combiner for the job.
job.setCombinerClass(RegressionReducer.class);
// Set the Reducer for the job.
job.setReducerClass(RegressionReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CountRegression.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

input file format for map-reduce

I am very new to hadoop, Can anyone know what should i keep in(/user/gates/pages) pages folder.should i keep on text file which contain data.if only txt file what will be the format of that
What data should I keep in [FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));] pages file.username or ages or websites name could u please give the details of input file
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;
public class MRExample {
public static class LoadPages extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String key = line.substring(0, firstComma);
String value = line.substring(firstComma + 1);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("1" + value);
oc.collect(outKey, outVal);
}
}
public static class LoadAndFilterUsers extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable k, Text val,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// Pull the key out
String line = val.toString();
int firstComma = line.indexOf(',');
String value = line.substring(firstComma + 1);
int age = Integer.parseInt(value);
if (age < 18 || age > 25) return;
String key = line.substring(0, firstComma);
Text outKey = new Text(key);
// Prepend an index to the value so we know which file
// it came from.
Text outVal = new Text("2" + value);
oc.collect(outKey, outVal);
}
}
public static class Join extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key,
Iterator<Text> iter,
OutputCollector<Text, Text> oc,
Reporter reporter) throws IOException {
// For each value, figure out which file it's from and store it
// accordingly.
List<String> first = new ArrayList<String>();
List<String> second = new ArrayList<String>();
while (iter.hasNext()) {
Text t = iter.next();
String value = t.toString();
if (value.charAt(0) == '1') first.add(value.substring(1));
else second.add(value.substring(1));
reporter.setStatus("OK");
}
// Do the cross product and collect the values
for (String s1 : first) {
for (String s2 : second) {
String outval = key + "," + s1 + "," + s2;
oc.collect(null, new Text(outval));
reporter.setStatus("OK");
}
}
}
}
public static class LoadJoined extends MapReduceBase
implements Mapper<Text, Text, Text, LongWritable> {
public void map(
Text k,
Text val,
OutputCollector<Text, LongWritable> oc,
Reporter reporter) throws IOException {
// Find the url
String line = val.toString();
int firstComma = line.indexOf(',');
int secondComma = line.indexOf(',', firstComma);
String key = line.substring(firstComma, secondComma);
// drop the rest of the record, I don't need it anymore,
// just pass a 1 for the combiner/reducer to sum instead.
Text outKey = new Text(key);
oc.collect(outKey, new LongWritable(1L));
}
}
public static class ReduceUrls extends MapReduceBase
implements Reducer<Text, LongWritable, WritableComparable, Writable> {
public void reduce(
Text key,
Iterator<LongWritable> iter,
OutputCollector<WritableComparable, Writable> oc,
Reporter reporter) throws IOException {
// Add up all the values we see
long sum = 0;
while (iter.hasNext()) {
sum += iter.next().get();
reporter.setStatus("OK");
}
oc.collect(key, new LongWritable(sum));
}
}
public static class LoadClicks extends MapReduceBase
implements Mapper<WritableComparable, Writable, LongWritable, Text> {
public void map(
WritableComparable key,
Writable val,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
oc.collect((LongWritable)val, (Text)key);
}
}
public static class LimitClicks extends MapReduceBase
implements Reducer<LongWritable, Text, LongWritable, Text> {
int count = 0;
public void reduce(
LongWritable key,
Iterator<Text> iter,
OutputCollector<LongWritable, Text> oc,
Reporter reporter) throws IOException {
// Only output the first 100 records
while (count < 100 && iter.hasNext()) {
oc.collect(key, iter.next());
count++;
}
}
}
public static void main(String[] args) throws IOException {
JobConf lp = new JobConf(MRExample.class);
lp.setJobName("Load Pages");
lp.setInputFormat(TextInputFormat.class);
lp.setOutputKeyClass(Text.class);
lp.setOutputValueClass(Text.class);
lp.setMapperClass(LoadPages.class);
FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
FileOutputFormat.setOutputPath(lp,
new Path("/user/gates/tmp/indexed_pages"));
lp.setNumReduceTasks(0);
Job loadPages = new Job(lp);
JobConf lfu = new JobConf(MRExample.class);
lfu.setJobName("Load and Filter Users");
lfu.setInputFormat(TextInputFormat.class);
lfu.setOutputKeyClass(Text.class);
lfu.setOutputValueClass(Text.class);
lfu.setMapperClass(LoadAndFilterUsers.class);
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
FileOutputFormat.setOutputPath(lfu,
new Path("/user/gates/tmp/filtered_users"));
lfu.setNumReduceTasks(0);
Job loadUsers = new Job(lfu);
JobConf join = new JobConf(MRExample.class);
join.setJobName("Join Users and Pages");
join.setInputFormat(KeyValueTextInputFormat.class);
join.setOutputKeyClass(Text.class);
join.setOutputValueClass(Text.class);
join.setMapperClass(IdentityMapper.class);
join.setReducerClass(Join.class);
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/indexed_pages"));
FileInputFormat.addInputPath(join, new Path("/user/gates/tmp/filtered_users"));
FileOutputFormat.setOutputPath(join, new Path("/user/gates/tmp/joined"));
join.setNumReduceTasks(50);
Job joinJob = new Job(join);
joinJob.addDependingJob(loadPages);
joinJob.addDependingJob(loadUsers);
JobConf group = new JobConf(MRExample.class);
group.setJobName("Group URLs");
group.setInputFormat(KeyValueTextInputFormat.class);
group.setOutputKeyClass(Text.class);
group.setOutputValueClass(LongWritable.class);
group.setOutputFormat(SequenceFileOutputFormat.class);
group.setMapperClass(LoadJoined.class);
group.setCombinerClass(ReduceUrls.class);
group.setReducerClass(ReduceUrls.class);
FileInputFormat.addInputPath(group, new Path("/user/gates/tmp/joined"));
FileOutputFormat.setOutputPath(group, new Path("/user/gates/tmp/grouped"));
group.setNumReduceTasks(50);
Job groupJob = new Job(group);
groupJob.addDependingJob(joinJob);
JobConf top100 = new JobConf(MRExample.class);
top100.setJobName("Top 100 sites");
top100.setInputFormat(SequenceFileInputFormat.class);
top100.setOutputKeyClass(LongWritable.class);
top100.setOutputValueClass(Text.class);
top100.setOutputFormat(SequenceFileOutputFormat.class);
top100.setMapperClass(LoadClicks.class);
top100.setCombinerClass(LimitClicks.class);
top100.setReducerClass(LimitClicks.class);
FileInputFormat.addInputPath(top100, new Path("/user/gates/tmp/grouped"));
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));
top100.setNumReduceTasks(1);
Job limit = new Job(top100);
limit.addDependingJob(groupJob);
JobControl jc = new JobControl("Find top 100 sites for users 18 to 25");
jc.addJob(loadPages);
jc.addJob(loadUsers);
jc.addJob(joinJob);
jc.addJob(groupJob);
jc.addJob(limit);
jc.run();
}
}
FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
...
FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
These are the two hard coded files it's expecting to find in HDFS, and for output:
FileOutputFormat.setOutputPath(top100, new Path("/user/gates/top100sitesforusers18to25"));

Resources