Hadoop working directory

I am attempting to save a file in the main class of a Hadoop application so it can be read later on by the mapper. The file is an encryption key that will be used to encrypt data. My question here is, where will the data end up if I am writing the file to the working directory?
public class HadoopIndexProject {
private static SecretKey generateKey(int size, String Algorithm) throws UnsupportedEncodingException, NoSuchAlgorithmException {
KeyGenerator keyGen = KeyGenerator.getInstance(Algorithm);
keyGen.init(size);
return keyGen.generateKey();
}
private static IvParameterSpec generateIV() {
byte[] b = new byte[16];
new Random().nextBytes(b);
return new IvParameterSpec(b);
}
public static void saveKey(SecretKey key, IvParameterSpec IV, String path) throws IOException {
FileOutputStream stream = new FileOutputStream(path);
//FSDataOutputStream stream = fs.create(new Path(path));
try {
stream.write(key.getEncoded());
stream.write(IV.getIV());
} finally {
stream.close();
}
}
/**
* @param args the command line arguments
* @throws java.lang.Exception
*/
public static void main(String[] args) throws Exception {
// TODO code application logic here
Configuration conf = new Configuration();
//FileSystem fs = FileSystem.getLocal(conf);
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
SecretKey KEY;
IvParameterSpec IV;
if (otherArgs.length != 2) {
System.err.println("Usage: Index <in> <out>");
System.exit(2);
}
try {
if(! new File("key.dat").exists()) {
KEY = generateKey(128, "AES");
IV = generateIV();
saveKey(KEY, IV, "key.dat");
}
} catch (NoSuchAlgorithmException ex) {
Logger.getLogger(HadoopIndexMapper.class.getName()).log(Level.SEVERE, null, ex);
}
conf.set("mapred.textoutputformat.separator", ":");
Job job = Job.getInstance(conf);
job.setJobName("Index creator");
job.setJarByClass(HadoopIndexProject.class);
job.setMapperClass(HadoopIndexMapper.class);
job.setReducerClass(HadoopIndexReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntArrayWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

There is no concept of a working directory in HDFS. All relative paths are resolved against /user/<username>, so your file will be located at /user/<username>/key.dat.
But YARN does have the concept of a distributed cache, so you can ship additional files to your application's tasks by registering them with job.addCacheFile().
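A minimal sketch of that approach for the key file above: write the key material to HDFS, register it in the cache, and read it back in the mapper's setup(). The HDFS path, the "#key.dat" symlink fragment, and the setup() body are assumptions for illustration, not part of the original code.
// Driver side: persist the key material to HDFS and register it in the distributed cache.
Path keyPath = new Path("/user/hadoop/key.dat");            // assumed HDFS location
FileSystem fs = FileSystem.get(conf);
try (FSDataOutputStream out = fs.create(keyPath)) {
    out.write(KEY.getEncoded());                            // KEY and IV as generated in main()
    out.write(IV.getIV());
}
job.addCacheFile(new URI(keyPath.toString() + "#key.dat")); // "#key.dat" names the local symlink

// Mapper side: the cached file shows up in the task's working directory under the symlink name.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    byte[] keyBytes = new byte[16];                         // AES-128 key
    byte[] ivBytes = new byte[16];                          // 16-byte IV
    try (DataInputStream in = new DataInputStream(new FileInputStream("key.dat"))) {
        in.readFully(keyBytes);
        in.readFully(ivBytes);
    }
    SecretKey key = new SecretKeySpec(keyBytes, "AES");     // keep these in fields for use in map()
    IvParameterSpec iv = new IvParameterSpec(ivBytes);
}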

Related

Bloom Filter in MapReduce

I have to use a Bloom filter in a reduce-side join algorithm to filter one of my inputs, but I have a problem with the readFields function that deserializes the input stream of the distributed cache file (the Bloom filter) into a Bloom filter.
public class BloomJoin {
//function map : input transaction.txt
public static class TransactionJoin extends
Mapper<LongWritable, Text, Text, Text> {
private Text CID=new Text();
private Text outValue=new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
CID.set(record[1]);
outValue.set("A"+value);
context.write(CID, outValue);
}
}
//function map : input customer.txt
public static class CustomerJoinMapper extends
Mapper<LongWritable, Text, Text, Text> {
private Text outkey=new Text();
private Text outvalue = new Text();
private BloomFilter bfilter = new BloomFilter();
public void setup(Context context) throws IOException {
URI[] files = DistributedCache
.getCacheFiles(context.getConfiguration());
// if the files in the distributed cache are set
if (files != null) {
System.out.println("Reading Bloom filter from: "
+ files[0].getPath());
// Open local file for read.
DataInputStream strm = new DataInputStream(new FileInputStream(
files[0].toString()));
bfilter.readFields(strm);
strm.close();
// Read into our Bloom filter.
} else {
throw new IOException(
"Bloom filter file not set in the DistributedCache.");
}
};
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
outkey.set(record[0]);
if (bfilter.membershipTest(new Key(outkey.getBytes()))) {
outvalue.set("B"+value);
context.write(outkey, outvalue);
}
}
}
//function reducer: join customer with transaction
public static class JoinReducer extends
Reducer<Text, Text, Text, Text> {
private ArrayList<Text> listA = new ArrayList<Text>();
private ArrayList<Text> listB = new ArrayList<Text>();
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
listA.clear();
listB.clear();
for (Text t : values) {
if (t.charAt(0) == 'A') {
listA.add(new Text(t.toString().substring(1)));
System.out.println("liste A: "+listA);
} else /* if (t.charAt('0') == 'B') */{
listB.add(new Text(t.toString().substring(1)));
System.out.println("listeB :"+listB);
}
}
executeJoinLogic(context);
}
private void executeJoinLogic(Context context) throws IOException,
InterruptedException {
if (!listA.isEmpty() && !listB.isEmpty()) {
for (Text A : listB) {
for (Text B : listA) {
context.write(A, B);
System.out.println("A="+A+",B="+B);
}
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path bloompath=new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
DistributedCache.addCacheFile(bloompath.toUri(),conf);
Job job = new Job(conf, "Bloom Join");
job.setJarByClass(BloomJoin.class);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 3) {
System.err
.println("ReduceSideJoin <Transaction data> <Customer data> <out> ");
System.exit(1);
}
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class,TransactionJoin.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, CustomerJoinMapper.class);
job.setReducerClass(JoinReducer.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 3);
}
}
How can I solve this problem?
Can you try changing
URI[] files = DistributedCache.getCacheFiles(context.getConfiguration());
to
Path[] cacheFilePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
FileSystem localFs = FileSystem.getLocal(context.getConfiguration());
for (Path cacheFilePath : cacheFilePaths) {
    DataInputStream fileInputStream = localFs.open(cacheFilePath);
    bloomFilter.readFields(fileInputStream);
    fileInputStream.close();
}
Also, I think you are doing a map-side join rather than a reduce-side join, since you are using the distributed cache in the mapper.
You can use a Bloom Filter from here:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilter.java
It comes with a dedicated serializer:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
You can serialize like this:
Path file = new Path(bloomFilterPath);
FileSystem hdfs = file.getFileSystem(context.getConfiguration());
OutputStream os = hdfs.create(file);
BloomFilterSerializer serializer = new BloomFilterSerializer();
serializer.serialize(bloomFilter, new DataOutputStream(os));
And deserialize:
Path path = new Path(bloomFilterPath);
InputStream is = path.getFileSystem(context.getConfiguration()).open(path);
BloomFilterSerializer serializer = new BloomFilterSerializer();
BloomFilter bloomFilter = serializer.deserialize(
new DataInputStream(new BufferedInputStream(is)));
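If you stay with Hadoop's built-in org.apache.hadoop.util.bloom.BloomFilter (the class already used in the question's mapper), here is a minimal sketch of how the filter file placed in the distributed cache could be produced; the vector size, hash count, and example ids are illustrative assumptions and should be tuned to your data volume and acceptable false-positive rate.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterTrainer {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Sizing is an assumption: ~1M bits and 5 hash functions.
        BloomFilter filter = new BloomFilter(1000000, 5, Hash.MURMUR_HASH);
        // Add every customer id that should pass the membership test in CustomerJoinMapper.
        for (String customerId : new String[] { "C001", "C002", "C003" }) {
            filter.add(new Key(customerId.getBytes()));
        }
        // Write the trained filter to HDFS so it can be added to the distributed cache
        // and later read back with readFields() in the mapper's setup().
        Path out = new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
        try (FSDataOutputStream os = FileSystem.get(conf).create(out)) {
            filter.write(os);
        }
    }
}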

How to write an MRUnit test case for a (multiple mappers and single reducer) MR driver

I am new to Big Data. I have written an MR program and am trying to write test cases for it using MRUnit, following https://dzone.com/articles/testing-mapreduce-mrunit
But my MR program has 2 mappers and 1 reducer, so I am not able to create a driver object using
newMapReduceDriver()
newMapReduceDriver(mapper,reducer)
newMapReduceDriver(mapper,reducer,combiner)
or
newMultipleInputMapReduceDriver()
newMultipleInputMapReduceDriver(combiner,reducer)
newMultipleInputMapReduceDriver(reducer)
Please suggest any other way, or point out if I am doing something wrong. Thanks in advance.
Here is the code
public class UC_RJoinerTool extends Configured implements Tool{
public int run(String[] args) throws Exception {
if(args == null || args.length < 4 ){
System.err.println("Usage: <User file Input Path> <Comments file Input Path> <Output Path> <Inner/Right Outer/Full join>");
ToolRunner.printGenericCommandUsage(System.err);
return -1;
} else {
Job job = Job.getInstance(getConf(), "Mapping Users with Comments");
job.setJarByClass(UC_RJoinerTool.class);
Path userInputPath = new Path(args[0]);
Path commentsInputPath = new Path(args[1]);
Path outPutPath = new Path(args[2]);
String joinTypeInput = args[3];
MultipleInputs.addInputPath(job, userInputPath, TextInputFormat.class,UserDotXmlMapper.class);
MultipleInputs.addInputPath(job, commentsInputPath, TextInputFormat.class,CommentsDotXmlMapper.class);
//When you are using TextInputFormat explicitly say the map key and value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.getConfiguration().set("joinType",joinTypeInput);
job.setReducerClass(UserCommentsReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job, outPutPath);
return job.waitForCompletion(true)?0:1;
}
}
public static void main(String[] args)throws Exception {
int exitCode = ToolRunner.run(new UC_RJoinerTool(),args);
System.exit(exitCode);
}
}
My Unit testCase Code
public class UCJoinTest {
private MapDriver<LongWritable,Text,Text,Text> mapUsersDriver,mapCommentsDriver;
private ReduceDriver<Text, Text, Text,Text> reduceUCDriver;
//private MapReduceDriver<LongWritable,Text,Text, Text, Text,Text> mapReduceUCDriver;
private MultipleInputsMapReduceDriver<LongWritable,Text,Text,Text> mapReduceUCDriver;
private MapDriver<LongWritable, Text, LongWritable,Text> mapSortDriver;
private ReduceDriver<LongWritable,Text,Text,Text> reduceSortDriver;
private MapReduceDriver<LongWritable,Text,LongWritable,Text,Text,Text> mapReduceSortDriver;
@Before
public void setUp() throws Exception {
final UserDotXmlMapper usersMapper = new UserDotXmlMapper();
final CommentsDotXmlMapper CommentsMapper = new CommentsDotXmlMapper();
final UserCommentsReducer ucReducer = new UserCommentsReducer();
final ReputationSorterMapper sortMapper = new ReputationSorterMapper();
final ReputationSorterReducer sortReducer = new ReputationSorterReducer();
mapUsersDriver = MapDriver.newMapDriver(usersMapper);
mapCommentsDriver = MapDriver.newMapDriver(CommentsMapper);
reduceUCDriver = ReduceDriver.newReduceDriver(ucReducer);
mapReduceUCDriver = MapReduceDriver.newMapReduceDriver(usersMapper,CommentsMapper,ucReducer);
mapReduceUCDriver = MultipleInputsMapReduceDriver.newMultipleInputMapReduceDriver(usersMapper,CommentsMapper,ucReducer);
mapSortDriver = MapDriver.newMapDriver(sortMapper);
reduceSortDriver = ReduceDriver.newReduceDriver(sortReducer);
mapReduceSortDriver = MapReduceDriver.newMapReduceDriver(sortMapper,sortReducer);
}
public class CommentsDotXmlMapper extends Mapper<LongWritable,Text,Text,Text>{
}
public class UserDotXmlMapper extends Mapper<LongWritable,Text,Text,Text>{
}
public class UserCommentsReducer extends Reducer<Text, Text, Text,Text>{
}
}
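One possible direction: in MRUnit 1.1.0 the MultipleInputsMapReduceDriver takes only the reducer in its factory method, and each mapper is attached afterwards together with its own inputs. The generic parameters and method names below are assumptions about that API rather than verified signatures, so treat this as a sketch to check against your MRUnit version, for example inside setUp():
// Sketch: driver is typed on the intermediate (map output) and final (reduce output) key/value classes.
MultipleInputsMapReduceDriver<Text, Text, Text, Text> driver =
        MultipleInputsMapReduceDriver.newMultipleInputMapReduceDriver(ucReducer);
driver.addMapper(usersMapper);
driver.addMapper(CommentsMapper);
// Feed each mapper its own input records, then declare the expected joined output.
driver.withInput(usersMapper, new LongWritable(1L), new Text("<row Id=\"1\" ... />"));
driver.withInput(CommentsMapper, new LongWritable(1L), new Text("<row UserId=\"1\" ... />"));
driver.withOutput(new Text("1"), new Text("expected joined value"));
driver.runTest();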

Disk full while running hadoop

I ran a recursive map/reduce program. Something went wrong and it nearly consumed all the disk space available on my C drive, so I closed the ResourceManager, NodeManager, NameNode, and DataNode consoles.
Now I have a C drive which is almost full and I don't know how to free the disk space and restore my C drive to how it was before. What should I do now? Any help is appreciated.
Here is the code
public class apriori {
public static class CandidateGenMap extends Mapper<LongWritable, Text, Text, Text>
{
private Text word = new Text();
private Text count = new Text();
private int Support = 5;
public void CandidatesGenRecursion(Vector<String> in, Vector<String> out,
int length, int level, int start,
Context context) throws IOException {
int i,size;
for(i=start;i<length;i++) {
if(level==0){
out.add(in.get(i));
} else {
out.add(in.get(i));
int init=1;
StringBuffer current = new StringBuffer();
for(String s:out)
{
if(init==1){
current.append(s);
init=0;
} else {
current.append(" ");
current.append(s);
}
}
word.set(current.toString());
count.set(Integer.toString(1));
try {
context.write(word, count);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
if(i < length-1) {
CandidatesGenRecursion(in, out, length,level+1,i+1, context);
}
size = out.size();
if(size>0){
out.remove(size-1);
}
}
}
@Override
public void map(LongWritable key,Text value,Context context) throws IOException
{
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
String[] token=new String[2];
int i=0;
while(tokenizer.hasMoreTokens()){
token[i]= tokenizer.nextToken();
++i;
}
StringTokenizer urlToken = new StringTokenizer(token[1],",");
Vector<String> lst = new Vector<String>();
int loop=0;
while (urlToken.hasMoreTokens()) {
String str = urlToken.nextToken();
lst.add(str);
loop++;
}
Vector<String> combinations = new Vector<String>();
if(!lst.isEmpty()) {
CandidatesGenRecursion(lst, combinations, loop,0,0, context);
}
}
}
public static class CandidateGenReduce extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key,Iterator<IntWritable> values,Context context) throws IOException
{
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
try {
context.write(key, new IntWritable(sum));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception
{
Date dt;
long start,end; // Start and end time
//Start Timer
dt = new Date();
start = dt.getTime();
Configuration conf1 = new Configuration();
System.out.println("Starting Job2");
Job job2 = new Job(conf1, "apriori candidate gen");
job2.setJarByClass(apriori.class);
job2.setMapperClass(CandidateGenMap.class);
job2.setCombinerClass(CandidateGenReduce.class); //
job2.setReducerClass(CandidateGenReduce.class);
job2.setMapOutputKeyClass(Text.class);
job2.setMapOutputValueClass(Text.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(IntWritable.class);
job2.setInputFormatClass(TextInputFormat.class);
job2.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.waitForCompletion(true);
//End Timer
dt = new Date();
end = dt.getTime();
}
}
Hadoop needs sufficient disk space for its I/O operations at each phase (map, reduce, etc.).
Check your job output path in HDFS and delete its contents.
List contents:
$ sudo -u hdfs hadoop fs -ls [YourJobOutputPath]
Disk used:
$ sudo -u hdfs hadoop fs -du -h [YourJobOutputPath]
Delete contents (be careful, it's recursive):
$ sudo -u hdfs hadoop fs -rm -R [YourJobOutputPath]
Deleting the output directory might help free up the disk space used by the files created by the MapReduce job.
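If you rerun the job, you can also clear a stale output directory from the driver itself before submitting. A minimal sketch, assuming the output path comes from args[1] as in the code above:
// Delete the previous output directory (recursively) before resubmitting the job.
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf1);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = recursive
}
// then set it as before: FileOutputFormat.setOutputPath(job2, outputPath);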

Reading and Writing Sequencefile using Hadoop 2.0 Apis

I am looking for an example that uses the new API to read and write SequenceFiles.
Effectively, I need to know how to use this function:
createWriter(Configuration conf, org.apache.hadoop.io.SequenceFile.Writer.Option... opts)
The old definition is not working for me:
SequenceFile.createWriter( fs, conf, path, key.getClass(), value.getClass());
Similarly, I need to know the code for reading the SequenceFile, as the following is deprecated:
SequenceFile.Reader(fs, path, conf);
Here is how to use the new API:
String uri = args[0];
Configuration conf = new Configuration();
Path path = new Path( uri);
IntWritable key = new IntWritable();
Text value = new Text();
CompressionCodec Codec = new GzipCodec();
SequenceFile.Writer writer = null;
Option optPath = SequenceFile.Writer.file(path);
Option optKey = SequenceFile.Writer.keyClass(key.getClass());
Option optVal = SequenceFile.Writer.valueClass(value.getClass());
Option optCom = SequenceFile.Writer.compression(CompressionType.RECORD, Codec);
writer = SequenceFile.createWriter( conf, optPath, optKey, optVal, optCom);
public class SequenceFilesTest {
@Test
public void testSeqFileReadWrite() throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Path seqFilePath = new Path("file.seq");
SequenceFile.Writer writer = SequenceFile.createWriter(conf,
Writer.file(seqFilePath), Writer.keyClass(Text.class),
Writer.valueClass(IntWritable.class));
writer.append(new Text("key1"), new IntWritable(1));
writer.append(new Text("key2"), new IntWritable(2));
writer.close();
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
Reader.file(seqFilePath));
Text key = new Text();
IntWritable val = new IntWritable();
while (reader.next(key, val)) {
System.err.println(key + "\t" + val);
}
reader.close();
}
}
I'm late by more than a year to answer, but I just got started with Hadoop 2.4.1 :)
Below is the code; someone may find it useful.
Note: it includes the commented-out 1.x code for reading and writing a sequence file. I was wondering where it picks up the file system, but when I executed it directly on the cluster it picked it up properly (probably from core-site.xml, as mentioned in Configuration).
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
public class SequenceFileOperator {
private Configuration conf = new Configuration();
/*private FileSystem fs;
{
try {
fs = FileSystem.get(URI.create("hdfs://cldx-1336-1202:9000"), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
if (args == null || args.length < 2) {
System.out
.println("Following are the possible invocations <operation id> <arg1> <arg2> ...");
System.out
.println("1 <absolute path of directory containing documents> <HDFS path of the sequence file");
System.out.println("2 <HDFS path of the sequence file>");
return;
}
int operation = Integer.valueOf(args[0]);
SequenceFileOperator docToSeqFileWriter = new SequenceFileOperator();
switch (operation) {
case 1: {
String docDirectoryPath = args[1];
String sequenceFilePath = args[2];
System.out.println("Writing files present at " + docDirectoryPath
+ " to the sequence file " + sequenceFilePath);
docToSeqFileWriter.loadDocumentsToSequenceFile(docDirectoryPath,
sequenceFilePath);
break;
}
case 2: {
String sequenceFilePath = args[1];
System.out.println("Reading the sequence file " + sequenceFilePath);
docToSeqFileWriter.readSequenceFile(sequenceFilePath);
break;
}
}
}
private void readSequenceFile(String sequenceFilePath) throws IOException {
// TODO Auto-generated method stub
/*
* SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(fs,
* new Path(sequenceFilePath), conf);
*/
Option filePath = SequenceFile.Reader.file(new Path(sequenceFilePath));
SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(conf,
filePath);
Writable key = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getValueClass(), conf);
try {
while (sequenceFileReader.next(key, value)) {
System.out
.printf("[%s] %s %s \n",
sequenceFileReader.getPosition(), key,
value.getClass());
}
} finally {
IOUtils.closeStream(sequenceFileReader);
}
}
private void loadDocumentsToSequenceFile(String docDirectoryPath,
String sequenceFilePath) throws IOException {
// TODO Auto-generated method stub
File docDirectory = new File(docDirectoryPath);
if (!docDirectory.isDirectory()) {
System.out
.println("Please provide an absolute path of a directory that contains the documents to be added to the sequence file");
return;
}
/*
* SequenceFile.Writer sequenceFileWriter =
* SequenceFile.createWriter(fs, conf, new Path(sequenceFilePath),
* Text.class, BytesWritable.class);
*/
org.apache.hadoop.io.SequenceFile.Writer.Option filePath = SequenceFile.Writer
.file(new Path(sequenceFilePath));
org.apache.hadoop.io.SequenceFile.Writer.Option keyClass = SequenceFile.Writer
.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option valueClass = SequenceFile.Writer
.valueClass(BytesWritable.class);
SequenceFile.Writer sequenceFileWriter = SequenceFile.createWriter(
conf, filePath, keyClass, valueClass);
File[] documents = docDirectory.listFiles();
try {
for (File document : documents) {
RandomAccessFile raf = new RandomAccessFile(document, "r");
byte[] content = new byte[(int) raf.length()];
raf.readFully(content);
sequenceFileWriter.append(new Text(document.getName()),
new BytesWritable(content));
raf.close();
}
} finally {
IOUtils.closeStream(sequenceFileWriter);
}
}
}
For reading, you can use:
Path path= new Path("/bar");
Reader sequenceFileReader = new SequenceFile.Reader(conf,SequenceFile.Reader.file(path));
You need to set SequenceFileInputFormat as the input format:
job.setInputFormatClass(SequenceFileInputFormat.class);
You will find an example of reading a SequenceFile from HDFS here.
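For context, a minimal sketch of a driver that consumes a SequenceFile as MapReduce input; the Text/IntWritable types mirror the test above, while the driver and mapper class names are assumptions for illustration:
// Sketch: job whose input is the SequenceFile written earlier ("file.seq").
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "read sequence file");
job.setJarByClass(MySeqFileDriver.class);             // assumed driver class
job.setInputFormatClass(SequenceFileInputFormat.class);
// With SequenceFileInputFormat the mapper receives the file's key/value types directly,
// i.e. Mapper<Text, IntWritable, ...> for the file produced in the test above.
job.setMapperClass(MySeqFileMapper.class);            // assumed mapper class
FileInputFormat.addInputPath(job, new Path("file.seq"));
FileOutputFormat.setOutputPath(job, new Path("seqfile-out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);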

Read hive table from mapreduce

I am currently writing a MapReduce program to find the difference between two Hive tables.
My Hive tables are partitioned on one or more columns, so the folder names contain the values of the partition columns.
Is there any way to read a partitioned Hive table?
Can it be read in the mapper?
Since the underlying HDFS data of a partitioned Hive table is organized by default as
table/root/folder/x=1/y=1
table/root/folder/x=1/y=2
table/root/folder/x=2/y=1
table/root/folder/x=2/y=2....,
You can build each of these input paths in the driver and add them through multiple calls to FileInputFormat.addInputPath(job, path), one call per folder path that you built.
Sample code is pasted below. Note how the paths are added for MyMapper.class. In this sample, I am using the MultipleInputs API. The table is partitioned by 'part' and 'xdate'.
public class MyDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
Job job = new Job(conf);
//set up various job parameters
job.setJarByClass(MyDriver.class);
job.setJobName(conf.get("job.name"));
MultipleInputs.addInputPath(job, new Path(conf.get("root.folder")+"/xdate="+conf.get("start.date")), TextInputFormat.class, OneMapper.class);
for (Path path : getPathList(job,conf)) {
System.out.println("path: "+path.toString());
MultipleInputs.addInputPath(job, path, Class.forName(conf.get("input.format")).asSubclass(FileInputFormat.class).asSubclass(InputFormat.class), MyMapper.class);
}
...
...
return job.waitForCompletion(true) ? 0 : -2;
}
private static ArrayList<Path> getPathList(Job job, Configuration conf) {
String rootdir = conf.get("input.path.rootfolder");
String partlist = conf.get("part.list");
String startdate_s = conf.get("start.date");
String enxdate_s = conf.get("end.date");
ArrayList<Path> pathlist = new ArrayList<Path>();
String[] partlist_split = partlist.split(",");
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Date startdate_d = null;
Date enxdate_d = null;
Path path = null;
try {
startdate_d = sdf.parse(startdate_s);
enxdate_d = sdf.parse(enxdate_s);
GregorianCalendar gcal = new GregorianCalendar();
gcal.setTime(startdate_d);
Date d = null;
for (String part : partlist_split) {
gcal.setTime(startdate_d);
do {
d = gcal.getTime();
FileSystem fs = FileSystem.get(conf);
path = new Path(rootdir + "/part=" + part + "/xdate="
+ sdf.format(d));
if (fs.exists(path)) {
pathlist.add(path);
}
gcal.add(Calendar.DAY_OF_YEAR, 1);
} while (d.before(enxdate_d));
}
} catch (Exception e) {
e.printStackTrace();
}
return pathlist;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new MyDriver(), args);
System.exit(res);
}
}
Yes, it can be read in the mapper pretty easily. This answer is based on the idea mentioned by @Daniel Koverman.
With the Context object passed to Mapper.map(), you can get the file split path this way:
// this gives you the path plus offsets hdfs://.../tablename/partition1=20/partition2=ABC/000001_0:0+12345678
ctx.getInputSplit().toString();
// or this gets you the path only
((FileSplit)ctx.getInputSplit()).getPath();
Here's a more complete solution that parses out the actual partition value:
class MyMapper extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
// regex to parse out the /partitionName=partitionValue/ pairs
private static Pattern partitionRegex = Pattern.compile("(?<=/)(?<name>[_\\-\\w]+)=(?<value>[^/]*)(?=/)");
public static String parsePartitionValue(String path, String partitionName) throws IllegalArgumentException{
Matcher m = partitionRegex.matcher(path);
while(m.find()){
if(m.group("name").equals(partitionName)){
return m.group("value");
}
}
throw new IllegalArgumentException(String.format("Partition [%s] not found", partitionName));
}
@Override
public void map(KEYIN key, VALUEIN v, Context ctx) throws IOException, InterruptedException {
String partitionVal = parsePartitionValue(ctx.getInputSplit().toString(), "my_partition_col");
}
}
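For example, applied to a split string like the one shown in the comment above (the path itself is made up), the helper returns the value of the requested partition column:
// Returns "2014-01-01" for the xdate partition of this illustrative split path.
String xdate = MyMapper.parsePartitionValue(
        "hdfs://nn/warehouse/mytable/part=A/xdate=2014-01-01/000001_0:0+12345678",
        "xdate");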
