Disk full while running Hadoop

I ran a recursive map/reduce program. Something went wrong and it nearly consumed all the disk space available on my C drive, so I closed the ResourceManager, NodeManager, NameNode, and DataNode consoles.
Now my C drive is almost full, and I don't know how to free the disk space and restore my C drive to how it was before. What should I do now? Any help is appreciated.
Here is the code:
public class apriori {
public static class CandidateGenMap extends Mapper<LongWritable, Text, Text, Text>
{
private Text word = new Text();
private Text count = new Text();
private int Support = 5;
public void CandidatesGenRecursion(Vector<String> in, Vector<String> out,
int length, int level, int start,
Context context) throws IOException {
int i,size;
for(i=start;i<length;i++) {
if(level==0){
out.add(in.get(i));
} else {
out.add(in.get(i));
int init=1;
StringBuffer current = new StringBuffer();
for(String s:out)
{
if(init==1){
current.append(s);
init=0;
} else {
current.append(" ");
current.append(s);
}
}
word.set(current.toString());
count.set(Integer.toString(1));
try {
context.write(word, count);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
if(i < length-1) {
CandidatesGenRecursion(in, out, length,level+1,i+1, context);
}
size = out.size();
if(size>0){
out.remove(size-1);
}
}
}
@Override
public void map(LongWritable key,Text value,Context context) throws IOException
{
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
String[] token=new String[2];
int i=0;
while(tokenizer.hasMoreTokens()){
token[i]= tokenizer.nextToken();
++i;
}
StringTokenizer urlToken = new StringTokenizer(token[1],",");
Vector<String> lst = new Vector<String>();
int loop=0;
while (urlToken.hasMoreTokens()) {
String str = urlToken.nextToken();
lst.add(str);
loop++;
}
Vector<String> combinations = new Vector<String>();
if(!lst.isEmpty()) {
CandidatesGenRecursion(lst, combinations, loop,0,0, context);
}
}
}
public static class CandidateGenReduce extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key,Iterator<IntWritable> values,Context context) throws IOException
{
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
try {
context.write(key, new IntWritable(sum));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception
{
Date dt;
long start,end; // Start and end time
//Start Timer
dt = new Date();
start = dt.getTime();
Configuration conf1 = new Configuration();
System.out.println("Starting Job2");
Job job2 = new Job(conf1, "apriori candidate gen");
job2.setJarByClass(apriori.class);
job2.setMapperClass(CandidateGenMap.class);
job2.setCombinerClass(CandidateGenReduce.class); //
job2.setReducerClass(CandidateGenReduce.class);
job2.setMapOutputKeyClass(Text.class);
job2.setMapOutputValueClass(Text.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(IntWritable.class);
job2.setInputFormatClass(TextInputFormat.class);
job2.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.waitForCompletion(true);
//End Timer
dt = new Date();
end = dt.getTime();
}
}

Hadoop needs sufficient disk space for its I/O operations at each phase (map, reduce, etc.).

Check your job output path in HDFS and delete its contents.
List contents:
$ sudo -u hdfs hadoop fs -ls [YourJobOutputPath]
Disk used:
$ sudo -u hdfs hadoop fs -du -h [YourJobOutputPath]
Delete contents (be careful, it's recursive):
$ sudo -u hdfs hadoop fs -rm -R [YourJobOutputPath]

Deleting the output directory should help free the disk space taken up by the files created by the MapReduce job.
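If you prefer to do this from the driver code, here is a minimal sketch (the class and method names are made up) that removes a stale output directory with the FileSystem API, equivalent to hadoop fs -rm -R:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class CleanJobOutput {
    // Deletes the given output directory if it exists (recursive, like "hadoop fs -rm -R").
    public static void deleteIfExists(Configuration conf, String outputDir) throws IOException {
        Path outputPath = new Path(outputDir);
        FileSystem fs = outputPath.getFileSystem(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
    }
}
In the driver above, you could call something like CleanJobOutput.deleteIfExists(conf1, args[1]) before submitting job2, so a leftover output directory from a failed run does not accumulate.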

Related

Bloom Filter in MapReduce

I have to use a Bloom filter in the reduce-side join algorithm to filter one of my inputs, but I have a problem with the readFields function that deserializes the input stream of the distributed cache file into a Bloom filter.
public class BloomJoin {
//function map : input transaction.txt
public static class TransactionJoin extends
Mapper<LongWritable, Text, Text, Text> {
private Text CID=new Text();
private Text outValue=new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
CID.set(record[1]);
outValue.set("A"+value);
context.write(CID, outValue);
}
}
//function map : input customer.txt
public static class CustomerJoinMapper extends
Mapper<LongWritable, Text, Text, Text> {
private Text outkey=new Text();
private Text outvalue = new Text();
private BloomFilter bfilter = new BloomFilter();
public void setup(Context context) throws IOException {
URI[] files = DistributedCache
.getCacheFiles(context.getConfiguration());
// if the files in the distributed cache are set
if (files != null) {
System.out.println("Reading Bloom filter from: "
+ files[0].getPath());
// Open local file for read.
DataInputStream strm = new DataInputStream(new FileInputStream(
files[0].toString()));
bfilter.readFields(strm);
strm.close();
// Read into our Bloom filter.
} else {
throw new IOException(
"Bloom filter file not set in the DistributedCache.");
}
};
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
outkey.set(record[0]);
if (bfilter.membershipTest(new Key(outkey.getBytes()))) {
outvalue.set("B"+value);
context.write(outkey, outvalue);
}
}
}
//function reducer: join customer with transaction
public static class JoinReducer extends
Reducer<Text, Text, Text, Text> {
private ArrayList<Text> listA = new ArrayList<Text>();
private ArrayList<Text> listB = new ArrayList<Text>();
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
listA.clear();
listB.clear();
for (Text t : values) {
if (t.charAt(0) == 'A') {
listA.add(new Text(t.toString().substring(1)));
System.out.println("liste A: "+listA);
} else /* if (t.charAt('0') == 'B') */{
listB.add(new Text(t.toString().substring(1)));
System.out.println("listeB :"+listB);
}
}
executeJoinLogic(context);
}
private void executeJoinLogic(Context context) throws IOException,
InterruptedException {
if (!listA.isEmpty() && !listB.isEmpty()) {
for (Text A : listB) {
for (Text B : listA) {
context.write(A, B);
System.out.println("A="+A+",B="+B);
}
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path bloompath=new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
DistributedCache.addCacheFile(bloompath.toUri(),conf);
Job job = new Job(conf, "Bloom Join");
job.setJarByClass(BloomJoin.class);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 3) {
System.err
.println("ReduceSideJoin <Transaction data> <Customer data> <out> ");
System.exit(1);
}
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class,TransactionJoin.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, CustomerJoinMapper.class);
job.setReducerClass(JoinReducer.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 3);
}
}
How can I solve this problem?
Can you try changing
URI[] files = DistributedCache.getCacheFiles(context.getConfiguration());
to
Path[] cacheFilePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
// getLocalCacheFiles returns paths on the local filesystem, so a plain FileInputStream works
DataInputStream fileInputStream = new DataInputStream(
        new FileInputStream(cacheFilePaths[0].toString()));
bfilter.readFields(fileInputStream);
fileInputStream.close();
Also, I think you are doing a map-side join rather than a reduce-side join, since you are using the distributed cache in the mapper.
You can use a Bloom Filter from here:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilter.java
It comes with a dedicated serializer:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
You can serialize like this:
Path file = new Path(bloomFilterPath);
FileSystem hdfs = file.getFileSystem(context.getConfiguration());
OutputStream os = hdfs.create(file);
BloomFilterSerializer serializer = new BloomFilterSerializer();
serializer.serialize(bloomFilter, new DataOutputStream(os));
And deserialize:
Path path = new Path(bloomFilterPath);
InputStream is = path.getFileSystem(context.getConfiguration()).open(path);
BloomFilterSerializer serializer = new BloomFilterSerializer();
BloomFilter bloomFilter = serializer.deserialize(
new DataInputStream(new BufferedInputStream(is)));

Reduce doesn't run but job is successfully completed

Firstly, I am a newbie at Hadoop MapReduce. My reducer does not run, but the job is reported as successfully completed. Below is my console output:
INFO mapreduce.Job: Running job: job_1418240815217_0015
INFO mapreduce.Job: Job job_1418240815217_0015 running in uber mode : false
INFO mapreduce.Job: map 0% reduce 0%
INFO mapreduce.Job: map 100% reduce 0%
INFO mapreduce.Job: Job job_1418240815217_0015 completed successfully
INFO mapreduce.Job: Counters: 30
The main class is:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
@SuppressWarnings("deprecation")
Job job = new Job(conf,"NPhase2");
job.setJarByClass(NPhase2.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(NPhase2Value.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
int numberOfPartition = 0;
List<String> other_args = new ArrayList<String>();
for(int i = 0; i < args.length; ++i)
{
try {
if ("-m".equals(args[i])) {
//conf.setNumMapTasks(Integer.parseInt(args[++i]));
++i;
} else if ("-r".equals(args[i])) {
job.setNumReduceTasks(Integer.parseInt(args[++i]));
} else if ("-k".equals(args[i])) {
int knn = Integer.parseInt(args[++i]);
conf.setInt("knn", knn);
System.out.println(knn);
} else {
other_args.add(args[i]);
}
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
//conf.setNumReduceTasks(1);
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " + args[i-1]);
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
}
FileInputFormat.setInputPaths(job, other_args.get(0));
FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
My mapper is:
public static class MapClass extends Mapper
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] parts = line.split("\\s+");
// key format <rid1>
IntWritable mapKey = new IntWritable(Integer.valueOf(parts[0]));
// value format <rid2, dist>
NPhase2Value np2v = new NPhase2Value(Integer.valueOf(parts[1]), Float.valueOf(parts[2]));
context.write(mapKey, np2v);
}
}
My reducer class is:
public static class Reduce extends Reducer<IntWritable, NPhase2Value, NullWritable, Text>
{
int numberOfPartition;
int knn;
class Record
{
public int id2;
public float dist;
Record(int id2, float dist)
{
this.id2 = id2;
this.dist = dist;
}
public String toString()
{
return Integer.toString(id2) + " " + Float.toString(dist);
}
}
class RecordComparator implements Comparator<Record>
{
public int compare(Record o1, Record o2)
{
int ret = 0;
float dist = o1.dist - o2.dist;
if (Math.abs(dist) < 1E-6)
ret = o1.id2 - o2.id2;
else if (dist > 0)
ret = 1;
else
ret = -1;
return -ret;
}
}
public void setup(Context context)
{
Configuration conf = new Configuration();
conf = context.getConfiguration();
numberOfPartition = conf.getInt("numberOfPartition", 2);
knn = conf.getInt("knn", 3);
}
public void reduce(IntWritable key, Iterator<NPhase2Value> values, Context context) throws IOException, InterruptedException
{
//initialize the pq
RecordComparator rc = new RecordComparator();
PriorityQueue<Record> pq = new PriorityQueue<Record>(knn + 1, rc);
// For each record we have a reduce task
// value format <rid1, rid2, dist>
while (values.hasNext())
{
NPhase2Value np2v = values.next();
int id2 = np2v.getFirst().get();
float dist = np2v.getSecond().get();
Record record = new Record(id2, dist);
pq.add(record);
if (pq.size() > knn)
pq.poll();
}
while(pq.size() > 0)
{
context.write(NullWritable.get(), new Text(key.toString() + " " + pq.poll().toString()));
//break; // only output the first record
}
} // reduce
}
This is my helper class:
public class NPhase2Value implements WritableComparable {
private IntWritable first;
private FloatWritable second;
public NPhase2Value() {
set(new IntWritable(), new FloatWritable());
}
public NPhase2Value(int first, float second) {
set(new IntWritable(first), new FloatWritable(second));
}
public void set(IntWritable first, FloatWritable second) {
this.first = first;
this.second = second;
}
public IntWritable getFirst() {
return first;
}
public FloatWritable getSecond() {
return second;
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public boolean equals(Object o) {
if (o instanceof NPhase2Value) {
NPhase2Value np2v = (NPhase2Value) o;
return first.equals(np2v.first) && second.equals(np2v.second);
}
return false;
}
@Override
public String toString() {
return first.toString() + " " + second.toString();
}
@Override
public int compareTo(NPhase2Value np2v) {
return 1;
}
}
The command I use is:
hadoop jar knn.jar NPhase2 -m 1 -r 3 -k 4 phase1out phase2out
I am trying hard to figure out the error but am still not able to come up with a solution. Please help me in this regard, as I am running on a tight schedule.
Because you have set the number of reduce tasks to 0. See this:
int numberOfPartition = 0;
//.......
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
I don't see numberOfPartition being reset anywhere in your code. Either set it where you parse the -r option, or remove the setNumReduceTasks call shown above entirely, since you already set the number of reduce tasks while parsing -r.
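For illustration, a minimal sketch of the fix (the helper name is made up); it just parses -r first and sets the reducer count exactly once, instead of overwriting it with numberOfPartition * numberOfPartition while numberOfPartition is still 0:
import org.apache.hadoop.mapreduce.Job;
public class ReducerCountHelper {
    // Illustrative helper: parse "-r <n>" and apply it once, after parsing.
    public static void applyReducerCount(Job job, String[] args) {
        int numberOfPartition = 1;                 // default so reducers are never silently zero
        for (int i = 0; i < args.length; i++) {
            if ("-r".equals(args[i])) {
                numberOfPartition = Integer.parseInt(args[++i]);
            }
        }
        job.setNumReduceTasks(numberOfPartition);  // set once, here
    }
}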

Accessing file in Mapper through Distributed Cache

I want to access the contents of the distributed cache file in my Mapper. Below is the code I have written, which retrieves the name of the file from the Distributed Cache. Please help me access the contents of the file.
public class DistCacheExampleMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text >
{
Text a = new Text();
Path[] dates = new Path[0];
public void configure(JobConf conf) {
try {
dates = DistributedCache.getLocalCacheFiles(conf);
String astr = dates.toString();
a = new Text(astr);
} catch (IOException ioe) {
System.err.println("Caught exception while getting cached files: " +
StringUtils.stringifyException(ioe));
}
}
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String line = value.toString();
for(Path cacheFile: dates){
output.collect(new Text(line), new Text(cacheFile.getName()));
}
}
}
Try this instead in your configure() method:
List<String []> lines;
Path[] files = new Path[0];
public void configure(JobConf conf) {
lines = new ArrayList<>();
BufferedReader SW;
try {
files = DistributedCache.getLocalCacheFiles(conf);
SW = new BufferedReader(new FileReader(files[0].toString()));
String line;
while ((line = SW.readLine()) != null) {
lines.add(line.split(",")); //now, each lines entry is a String array, with each element being a column
}
SW.close();
} catch (IOException ioe) {
System.err.println("Caught exception while getting cached files: " +
StringUtils.stringifyException(ioe));
}
}
This way, you will have the contents of the file (in this case the first file in the Distributed Cache) in the variable lines. Each lines entry is a String array, split on ','. So the first column of the first row is lines.get(0)[0], the third column of the second row is lines.get(1)[2], and so on.
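For illustration, a hypothetical map() that uses the lines variable loaded in configure() above (the choice of emitted key/value is just an example):
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output,
        Reporter reporter) throws IOException {
    // Emit each input line paired with the first column of every cached row.
    for (String[] cachedRow : lines) {
        output.collect(new Text(value.toString()), new Text(cachedRow[0]));
    }
}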

Distributed cache not working

I am storing a small amount of data (a few MBs) in the distributed cache and using it to perform an anti-join with two big files. For a few lines of data in the cache the functionality works fine, but when the cache holds more data in production it is not able to do the job, yet it does not throw any error either. Only a few records (around 20%) get joined and the others are simply ignored. So is there any upper limit on the number of records that can be stored in the distributed cache? Why does it work for some of the records and ignore the rest? Any suggestion will be extremely helpful.
Below is my code:
public class MyMapper extends Mapper<LongWritable, Text, Text, TextPair> {
Text albumKey = new Text();
Text photoKey = new Text();
private HashSet<String> photoDeleted = new HashSet<String>();
private HashSet<String> albDeleted = new HashSet<String>();
Text interKey = new Text();
private TextPair interValue = new TextPair();
private static final Logger LOGGER = Logger.getLogger(SharedStreamsSlMapper.class);
protected void setup(Context context) throws IOException, InterruptedException {
int count=0;
Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
System.out.println(cacheFiles.length);
LOGGER.info(cacheFiles+"****");
try {
if (cacheFiles != null && cacheFiles.length > 0) {
for (Path path : cacheFiles) {
String line;
String[] tokens;
BufferedReader joinReader = new BufferedReader(new FileReader(path.toString()));
System.out.println(path.toString());
// BufferedReader joinReader = new BufferedReader(new FileReader("/Users/Kunal_Basak/Desktop/ss_test/dsitCache/part-m-00000"));
try {
while ((line = joinReader.readLine()) != null) {
count++;
tokens = line.split(SSConstants.TAB, 2);
if(tokens.length<2){
System.out.println("WL");
continue;
}
if (tokens[0].equals("P")) {
photoDeleted.add(tokens[1]);
}
else if (tokens[0].equals("A")) {
albDeleted.add(tokens[1]);
}
}
}
finally {
joinReader.close();
}
}
}
}
catch (IOException e) {
System.out.println("Exception reading DistributedCache: " + e);
}
System.out.println(count);
System.out.println("albdeleted *****"+albDeleted.size());
System.out.println("photo deleted *****"+photoDeleted.size());
LOGGER.info("albdeleted *****"+albDeleted.size());
LOGGER.info("albdeleted *****"+albDeleted.size());
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
try{
//my mapper code
}
}
}
According to this blog article:
The local.cache.size parameter controls the size of the
DistributedCache.
By default, it’s set to 10 GB.
So if you have more than 10GB in the cache, that may be your problem.
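If you want to check what limit your cluster is actually using, here is a minimal sketch that just reads the property (local.cache.size is the property quoted above; the value is in bytes, and it is normally configured cluster-side in mapred-site.xml rather than per job):
import org.apache.hadoop.conf.Configuration;
public class CacheSizeCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Default shown here is 10 GB expressed in bytes.
        long limitBytes = conf.getLong("local.cache.size", 10L * 1024 * 1024 * 1024);
        System.out.println("DistributedCache size limit: " + limitBytes + " bytes");
    }
}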

Read Hive table from MapReduce

I am currently writing a MapReduce program to find the difference between two Hive tables.
My Hive tables are partitioned on one or more columns, so the folder names contain the values of the partition columns.
Is there any way to read the partitioned Hive table?
Can it be read in the mapper?
Since the underlying HDFS data of a partitioned Hive table is organised by default as
table/root/folder/x=1/y=1
table/root/folder/x=1/y=2
table/root/folder/x=2/y=1
table/root/folder/x=2/y=2
...
you can build each of these input paths in the driver and add them through multiple calls to FileInputFormat.addInputPath(job, path), one call per folder path that you built.
Sample code is pasted below. Note how paths are added for MyMapper.class. In this sample, I am using the MultipleInputs API. The table is partitioned by 'part' and 'xdate'.
public class MyDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
Job job = new Job(conf);
//set up various job parameters
job.setJarByClass(MyDriver.class);
job.setJobName(conf.get("job.name"));
MultipleInputs.addInputPath(job, new Path(conf.get("root.folder")+"/xdate="+conf.get("start.date")), TextInputFormat.class, OneMapper.class);
for (Path path : getPathList(job,conf)) {
System.out.println("path: "+path.toString());
MultipleInputs.addInputPath(job, path, Class.forName(conf.get("input.format")).asSubclass(FileInputFormat.class).asSubclass(InputFormat.class), MyMapper.class);
}
...
...
return job.waitForCompletion(true) ? 0 : -2;
}
private static ArrayList<Path> getPathList(Job job, Configuration conf) {
String rootdir = conf.get("input.path.rootfolder");
String partlist = conf.get("part.list");
String startdate_s = conf.get("start.date");
String enxdate_s = conf.get("end.date");
ArrayList<Path> pathlist = new ArrayList<Path>();
String[] partlist_split = partlist.split(",");
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Date startdate_d = null;
Date enxdate_d = null;
Path path = null;
try {
startdate_d = sdf.parse(startdate_s);
enxdate_d = sdf.parse(enxdate_s);
GregorianCalendar gcal = new GregorianCalendar();
gcal.setTime(startdate_d);
Date d = null;
for (String part : partlist_split) {
gcal.setTime(startdate_d);
do {
d = gcal.getTime();
FileSystem fs = FileSystem.get(conf);
path = new Path(rootdir + "/part=" + part + "/xdate="
+ sdf.format(d));
if (fs.exists(path)) {
pathlist.add(path);
}
gcal.add(Calendar.DAY_OF_YEAR, 1);
} while (d.before(enxdate_d));
}
} catch (Exception e) {
e.printStackTrace();
}
return pathlist;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new MyDriver(), args);
System.exit(res);
}
}
Yes, it can be read in the Mapper pretty easily. This answer is based on the idea mentioned by @Daniel Koverman.
With the Context object passed to Mapper.map(), you can get the file split path this way:
// this gives you the path plus offsets hdfs://.../tablename/partition1=20/partition2=ABC/000001_0:0+12345678
ctx.getInputSplit().toString();
// or this gets you the path only
((FileSplit)ctx.getInputSplit()).getPath();
Here's a more complete solution that parses out the actual partition value:
class MyMapper extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
// regex to parse out the /partitionName=partitionValue/ pairs
private static Pattern partitionRegex = Pattern.compile("(?<=/)(?<name>[_\\-\\w]+)=(?<value>[^/]*)(?=/)");
public static String parsePartitionValue(String path, String partitionName) throws IllegalArgumentException{
Matcher m = partitionRegex.matcher(path);
while(m.find()){
if(m.group("name").equals(partitionName)){
return m.group("value");
}
}
throw new IllegalArgumentException(String.format("Partition [%s] not found", partitionName));
}
@Override
public void map(KEYIN key, VALUEIN v, Context ctx) throws IOException, InterruptedException {
String partitionVal = parsePartitionValue(ctx.getInputSplit().toString(), "my_partition_col");
}
}
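For example, on a split string like the one shown above, hdfs://.../tablename/partition1=20/partition2=ABC/000001_0:0+12345678, parsePartitionValue(path, "partition2") returns "ABC" and parsePartitionValue(path, "partition1") returns "20".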