hadoop compiling

I'm new to Hadoop. I got this code from the net:
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class Gender {

    private static String genderCheck = "female";

    public static class Map extends MapReduceBase implements Mapper {
        private final static IntWritable one = new IntWritable(1);
        private Text locText = new Text();

        public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
            String line = value.toString();
            String location = line.split(",")[14] + "," + line.split(",")[15];
            long male = 0L;
            long female = 0L;
            if (line.split(",")[17].matches("\d+") && line.split(",")[18].matches("\d+")) {
                male = Long.parseLong(line.split(",")[17]);
                female = Long.parseLong(line.split(",")[18]);
            }
            long diff = male - female;
            locText.set(location);
            if (Gender.genderCheck.toLowerCase().equals("female") && diff < 0) {
                output.collect(locText, new LongWritable(diff * -1L));
            } else if (Gender.genderCheck.toLowerCase().equals("male") && diff > 0) {
                output.collect(locText, new LongWritable(diff));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(Gender.class);
        conf.setJobName("gender");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);
        conf.setMapperClass(Map.class);

        if (args.length != 3) {
            System.out.println("Usage:");
            System.out.println("[male/female] /path/to/2kh/files /path/to/output");
            System.exit(1);
        }
        if (!args[0].equalsIgnoreCase("male") && !args[0].equalsIgnoreCase("female")) {
            System.out.println("first argument must be male or female");
            System.exit(1);
        }
        Gender.genderCheck = args[0];

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        JobClient.runJob(conf);
    }
}
When I compile this code using "javac -cp /usr/local/hadoop/hadoop-core-1.0.3.jar Gender.java", I get the following error:
"Gender.Map is not abstract and does not override abstract method
map(java.lang.Object,java.lang.Object,org.apache.hadoop.mapred.OutputCollector,org.apache.hadoop.mapred.Reporter)
in org.apache.hadoop.mapred.Mapper
public static class Map extends MapReduceBase implements Mapper "
How can I compile it correctly?

Change the Mapper class declaration as follows:
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable>
If you do not specify the type parameters, you would need to declare the map function as follows:
@Override
public void map(Object arg0, Object arg1, OutputCollector arg2, Reporter arg3) throws IOException {
    // TODO Auto-generated method stub
}
The type parameters denote the expected input key-value pair types and the output key-value pair types of the mapper.
In your case the input key-value pair is LongWritable-Text.
And, judging from your output.collect calls, your mapper's output key-value pair is Text-LongWritable.
Hence, your Map class should implement Mapper<LongWritable, Text, Text, LongWritable>.
There is one more error in your code: "\d+" will not compile, because \d is not a valid Java escape sequence. The backslash itself has to be escaped in the string literal, so the following should work:
line.split(",")[17].matches("\\d+")

Change the Map class as follows:
public static class Map extends MapReduceBase implements Mapper<InputKey, InputValue, OutputKey, OutputValue>
In your case the input key is LongWritable, the input value is Text, the output key is Text, and the output value is LongWritable:
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable>

Related

hadoop partitioner not working

public class Partitioner_2 implements Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        int hashValue = 0;
        for (char c : key.toString().split("\\|\\|")[0].toCharArray()) {
            hashValue += (int) c;
        }
        return Math.abs(hashValue * 127) % numPartitions;
    }
}
That is my partitioner code. The key is of the form "str1||str2", and I would like to send all keys that have the same value of str1 to the same reducer.
My GroupComparator and KeyComparator are as follows:
public static class GroupComparator_2 extends WritableComparator {
    protected GroupComparator_2() {
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        Text kw1 = (Text) w1;
        Text kw2 = (Text) w2;
        String k1 = kw1.toString().split("||")[0].trim();
        String k2 = kw2.toString().split("||")[0].trim();
        return k1.compareTo(k2);
    }
}

public static class KeyComparator_2 extends WritableComparator {
    protected KeyComparator_2() {
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        Text key1 = (Text) w1;
        Text key2 = (Text) w2;
        String kw1_key1 = key1.toString().split("||")[0];
        String kw1_key2 = key2.toString().split("||")[0];
        int cmp = kw1_key1.compareTo(kw1_key2);
        if (cmp == 0) {
            String kw2_key1 = key1.toString().split("||")[1].trim();
            String kw2_key2 = key2.toString().split("||")[1].trim();
            cmp = kw2_key1.compareTo(kw2_key2);
        }
        return cmp;
    }
}
The error I am currently receiving is :
KeywordKeywordCoOccurrence_2.java:92: interface expected here
public class Partitioner_2 implements Partitioner<Text,Text>{
^
KeywordKeywordCoOccurrence_2.java:94: method does not override or implement a method from a supertype
@Override
^
KeywordKeywordCoOccurrence_2.java:147: setPartitionerClass(java.lang.Class<? extends org.apache.hadoop.mapreduce.Partitioner>) in org.apache.hadoop.mapreduce.Job cannot be applied to (java.lang.Class<KeywordKeywordCoOccurrence_2.Partitioner_2>)
job.setPartitionerClass(Partitioner_2.class);
But as far as I can tell I have overridden the getPartition() method which is the only method in the Partitioner interface? Any help in identifying what I am doing wrong and how to fix it would be much appreciated.
Thanks in advance!
Partitioner is an abstract class in the new mapreduce API (that you're apparently using).
So you should define it as:
public class Partitioner_2 extends Partitioner<Text, Text> {
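For example, a minimal sketch of your Partitioner_2 against the new API (same hashing logic as your code, just extending the abstract class instead of implementing an interface) would be:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class Partitioner_2 extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        int hashValue = 0;
        // hash only the part of the key before "||" so equal str1 values land in the same partition
        for (char c : key.toString().split("\\|\\|")[0].toCharArray()) {
            hashValue += (int) c;
        }
        return Math.abs(hashValue * 127) % numPartitions;
    }
}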

in hadoop word count example why not make text also static

In the Hadoop word count example, IntWritable is made static so that it can be reused in the same JVM instead of creating a new object every time. My question is: why not make the Text object static as well?
I made it static and it works fine, but I have never seen that in any example. Am I missing something?
private static Text word = new Text();
private final static IntWritable intWritable = new IntWritable(1);
The original word count example.
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
}
The OutputCollector collects the key-value pairs emitted by mappers and reducers. Whether a variable can be shared like this has to be decided from your own logic and the kind of application you are writing. In the case of the WordCount program it works correctly either way, because the mapper object does not share its state across multiple threads.

CellProcessor for MapEntry

I have a simple POJO that has a Map inside it.
public class Product {
    public Map map;
}
Then my CSV looks like this:
"mapEntry1","mapEntry2","mapEntry3"
So I created a custom cell processor for parsing those:
public class MapEntryCellProcessor extends CellProcessorAdaptor {
    public Object execute(Object val, CsvContext context) {
        return next.execute(new AbstractMap.SimpleEntry<>("somekey", val), context);
    }
}
and then I add an entry setter method in my Product:
public void setName(Entry<String, String> entry) {
    if (getName() == null) {
        name = new HashMap<>();
    }
    name.put(entry.getKey(), entry.getValue());
}
Unfortunately this means I have two setter methods: one that accepts a Map and another that accepts an Entry, which doesn't really work for me (I have no control over how the POJOs are generated). Is there any other way I can parse such a CSV and have only a setter that accepts a Map in my Product?
It's possible to write a cell processor that collects each column into a map. For example, the following processor allows you to specify the key and the map to add to.
package org.supercsv.example;

import java.util.Map;

import org.supercsv.cellprocessor.CellProcessorAdaptor;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.util.CsvContext;

public class MapCollector extends CellProcessorAdaptor {

    private String key;
    private Map<String, String> map;

    public MapCollector(String key, Map<String, String> map) {
        this.key = key;
        this.map = map;
    }

    public MapCollector(String key, Map<String, String> map, CellProcessor next) {
        super(next);
        this.key = key;
        this.map = map;
    }

    public Object execute(Object value, CsvContext context) {
        validateInputNotNull(value, context);
        map.put(key, String.valueOf(value));
        return next.execute(map, context);
    }
}
Then assuming your Product bean has a field name of type Map<String,String>, you can use the processor as follows.
package org.supercsv.example;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import junit.framework.TestCase;

import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvBeanReader;
import org.supercsv.io.ICsvBeanReader;
import org.supercsv.prefs.CsvPreference;

public class MapCollectorTest extends TestCase {

    private static final String CSV = "John,L,Smith\n" + "Sally,P,Jones";

    public void testMapCollector() throws IOException {
        ICsvBeanReader reader = new CsvBeanReader(
            new StringReader(CSV),
            CsvPreference.STANDARD_PREFERENCE);

        // only need to map the field once, so use nulls
        String[] nameMapping = new String[]{"name", null, null};

        // create processors for each row (otherwise every bean
        // will contain the same map!)
        Product product;
        while ((product = reader.read(Product.class,
                nameMapping, createProcessors())) != null) {
            System.out.println(product.getName());
        }
    }

    private static CellProcessor[] createProcessors() {
        Map<String, String> nameMap = new HashMap<String, String>();
        final CellProcessor[] processors = new CellProcessor[]{
            new MapCollector("name1", nameMap),
            new MapCollector("name2", nameMap),
            new MapCollector("name3", nameMap)};
        return processors;
    }
}
This outputs:
{name3=Smith, name2=L, name1=John}
{name3=Jones, name2=P, name1=Sally}
You'll notice that while the processors execute on all 3 columns, it's only mapped to the bean once (hence the nulls in the nameMapping array).
I've also created the processors each time a row is read, otherwise every bean will be using the same map...which probably isn't what you want ;)
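For reference, a minimal Product bean that the test above can populate might look like this; the exact shape is an assumption, since your real POJO is generated:

import java.util.Map;

public class Product {

    private Map<String, String> name;

    public Map<String, String> getName() {
        return name;
    }

    // single Map setter, as requested; MapCollector passes the whole map through to it
    public void setName(Map<String, String> name) {
        this.name = name;
    }
}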

Custom Partitioner Error

I am writing my own custom Partitioner (old API). Below is the code where I am extending the Partitioner class:
public static class WordPairPartitioner extends Partitioner<WordPair, IntWritable> {
    @Override
    public int getPartition(WordPair wordPair, IntWritable intWritable, int numPartitions) {
        return wordPair.getWord().hashCode() % numPartitions;
    }
}
Setting the JobConf:
conf.setPartitionerClass(WordPairPartitioner.class);
The WordPair class contains:
private Text word;
private Text neighbor;
Questions:
1. I am getting the error: "actual argument class (WordPairPartitioner) cannot convert to Class (? extends Partitioner)".
2. Is this the right way to write a custom partitioner, or do I need to override some other functionality as well?
I believe you are mixing up the old API (classes from org.apache.hadoop.mapred.*) and the new API (classes from org.apache.hadoop.mapreduce.*).
Using the old API, you may do as follows:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

public static class WordPairPartitioner implements Partitioner<WordPair, IntWritable> {

    @Override
    public int getPartition(WordPair wordPair, IntWritable intWritable, int numPartitions) {
        return wordPair.getWord().hashCode() % numPartitions;
    }

    @Override
    public void configure(JobConf arg0) {
    }
}
In addition to Amar's answer, you should handle the possibility of hashCode() returning a negative number by masking off the sign bit before taking the modulus:
@Override
public int getPartition(WordPair wordPair, IntWritable intWritable, int numPartitions) {
    // mask the sign bit first so the result is always in [0, numPartitions)
    return (wordPair.getWord().hashCode() & 0x7FFFFFFF) % numPartitions;
}

Hadoop/MapReduce: Reading and writing classes generated from DDL

Can someone walk me through the basic workflow of reading and writing data with classes generated from DDL?
I have defined some struct-like records using DDL. For example:
class Customer {
    ustring FirstName;
    ustring LastName;
    ustring CardNo;
    long LastPurchase;
}
I've compiled this to get a Customer class and included it into my project. I can easily see how to use this as input and output for mappers and reducers (the generated class implements Writable), but not how to read and write it to file.
The JavaDoc for the org.apache.hadoop.record package talks about serializing these records in Binary, CSV or XML format. How do I actually do that? Say my reducer produces IntWritable keys and Customer values. What OutputFormat do I use to write the result in CSV format? What InputFormat would I use to read the resulting files in later, if I wanted to perform analysis over them?
Ok, so I think I have this figured out. I'm not sure if it is the most straightforward way, so please correct me if you know a simpler workflow.
Every class generated from DDL implements the Record interface, and consequently provides two methods:
serialize(RecordOutput out) for writing
deserialize(RecordInput in) for reading
RecordOutput and RecordInput are utility interfaces provided in the org.apache.hadoop.record package. There are a few implementations (e.g. XmlRecordOutput, BinaryRecordOutput, CsvRecordOutput).
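As a small illustration (outside MapReduce), serializing a Customer to CSV and reading it back could look roughly like this; the setter names are assumptions, since they depend on what the DDL compiler generated for your record:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.record.CsvRecordInput;
import org.apache.hadoop.record.CsvRecordOutput;

public class CustomerCsvRoundTrip {
    public static void main(String[] args) throws Exception {
        Customer c = new Customer();
        c.setFirstName("John");          // assumed generated setter names
        c.setLastName("Smith");
        c.setCardNo("1234-5678");
        c.setLastPurchase(1234567890L);

        // serialize(RecordOutput) writes the record in the chosen format
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        c.serialize(new CsvRecordOutput(bytes));

        // deserialize(RecordInput) reads it back
        Customer copy = new Customer();
        copy.deserialize(new CsvRecordInput(new ByteArrayInputStream(bytes.toByteArray())));
    }
}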
As far as I know, you have to implement your own OutputFormat or InputFormat classes to use these. This is fairly easy to do.
For example, the OutputFormat I talked about in the original question (one that writes Integer keys and Customer values in CSV format) would be implemented like this:
private static class CustomerOutputFormat
        extends TextOutputFormat<IntWritable, Customer> {

    public RecordWriter<IntWritable, Customer> getRecordWriter(FileSystem ignored,
            JobConf job, String name, Progressable progress) throws IOException {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new CustomerRecordWriter(fileOut);
    }

    protected static class CustomerRecordWriter
            implements RecordWriter<IntWritable, Customer> {

        protected DataOutputStream outStream;

        public CustomerRecordWriter(DataOutputStream out) {
            this.outStream = out;
        }

        public synchronized void write(IntWritable key, Customer value) throws IOException {
            CsvRecordOutput csvOutput = new CsvRecordOutput(outStream);
            csvOutput.writeInt(key.get(), "id");
            value.serialize(csvOutput);
        }

        public synchronized void close(Reporter reporter) throws IOException {
            outStream.close();
        }
    }
}
Creating the InputFormat is much the same. Because the CSV format is one entry per line, we can use a LineRecordReader internally to do most of the work.
private static class CustomerInputFormat extends FileInputFormat<IntWritable, Customer> {

    public RecordReader<IntWritable, Customer> getRecordReader(InputSplit genericSplit,
            JobConf job, Reporter reporter) throws IOException {
        reporter.setStatus(genericSplit.toString());
        return new CustomerRecordReader(job, (FileSplit) genericSplit);
    }

    private class CustomerRecordReader implements RecordReader<IntWritable, Customer> {

        private LineRecordReader lrr;

        public CustomerRecordReader(Configuration job, FileSplit split) throws IOException {
            this.lrr = new LineRecordReader(job, split);
        }

        public IntWritable createKey() {
            return new IntWritable();
        }

        public Customer createValue() {
            return new Customer();
        }

        public synchronized boolean next(IntWritable key, Customer value) throws IOException {
            LongWritable offset = new LongWritable();
            Text line = new Text();
            if (!lrr.next(offset, line))
                return false;
            CsvRecordInput cri = new CsvRecordInput(
                new ByteArrayInputStream(line.toString().getBytes()));
            key.set(cri.readInt("id"));
            value.deserialize(cri);
            return true;
        }

        public float getProgress() {
            return lrr.getProgress();
        }

        public synchronized long getPos() throws IOException {
            return lrr.getPos();
        }

        public synchronized void close() throws IOException {
            lrr.close();
        }
    }
}
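Finally, a rough sketch of how these might be wired into an old-API job driver; the driver class name and paths are placeholders, mapper/reducer setup is omitted, and the formats would need to be made non-private to be referenced from another class:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class CustomerCsvJob {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CustomerCsvJob.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(Customer.class);

        // write reducer output as CSV records of (id, Customer)
        conf.setOutputFormat(CustomerOutputFormat.class);
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // a later job could read those files back with the matching InputFormat:
        // conf.setInputFormat(CustomerInputFormat.class);
        // FileInputFormat.setInputPaths(conf, new Path(args[0]));

        JobClient.runJob(conf);
    }
}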
