How to remove the r-00000 extension from reducer output in MapReduce

How to remove the r-00000 extension from reducer output in MapReduce - hadoop

I am able to rename my reducer output file correctly, but the r-00000 suffix still persists.
I have used MultipleOutputs in my reducer class.
Here are the details. I am not sure what I am missing or what else I have to do.
public class MyReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
private Logger logger = Logger.getLogger(MyReducer.class);
private MultipleOutputs<NullWritable, Text> multipleOutputs;
String strName = "";
public void setup(Context context) {
logger.info("Inside Reducer.");
multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
#Override
public void reduce(NullWritable Key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
final String valueStr = value.toString();
StringBuilder sb = new StringBuilder();
sb.append(strArrvalueStr[0] + "|!|");
multipleOutputs.write(NullWritable.get(), new Text(sb.toString()),strName);
}
}
public void cleanup(Context context) throws IOException,
InterruptedException {
multipleOutputs.close();
}
}

I was able to do it explicitly after my job finishes, and that is OK for me. It adds no delay to the job.
// After the job has succeeded (b == true), append ".txt" to the name of every
// non-directory entry directly under the job output directory (args[1]).
if (b) {
    FileSystem hdfs = FileSystem.get(getConf());
    FileStatus[] outputEntries = hdfs.listStatus(new Path(args[1]));
    if (outputEntries != null) {
        for (FileStatus aFile : outputEntries) {
            if (!aFile.isDir()) {
                Path source = aFile.getPath();
                Path target = new Path(source.toString() + ".txt");
                // rename() reports failure through its return value; surface it
                // instead of silently leaving the file under its old name.
                if (!hdfs.rename(source, target)) {
                    System.err.println("Could not rename " + source + " to " + target);
                }
            }
        }
    }
}

A more suitable approach to the problem is to change the OutputFormat.
For example, if you are using TextOutputFormat, take the source code of the TextOutputFormat class and modify the method below so that it returns the plain file name (without r-00000). Then set the modified output format in the driver.
/**
 * Returns the output file name for a task, which is simply {@code name}.
 *
 * <p>This variant deliberately leaves out the pieces the unmodified method
 * appended after the name: the task-type character, the formatted partition
 * number and the extension. Both {@code context} and {@code extension} are
 * therefore accepted but not used.
 *
 * @param context   the task attempt context (unused)
 * @param name      the base name requested by the caller
 * @param extension the file extension (unused)
 * @return {@code name} unchanged, or the string {@code "null"} when it is null
 */
public synchronized static String getUniqueFile(TaskAttemptContext context, String name, String extension) {
    // String.valueOf gives the same result as appending to an empty builder,
    // including the "null" text for a null name.
    return String.valueOf(name);
}
So whatever name is passed through MultipleOutputs, the output file will be created with exactly that name.

Related

Bloom Filter in MapReduce

I have to use a Bloom filter in the reduce-side join algorithm to filter one of my inputs, but I have a problem with the readFields function, which de-serialises the input stream of a distributed cache file (the Bloom filter) into a Bloom filter object.
public class BloomJoin {
//function map : input transaction.txt
public static class TransactionJoin extends
Mapper<LongWritable, Text, Text, Text> {
private Text CID=new Text();
private Text outValue=new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
CID.set(record[1]);
outValue.set("A"+value);
context.write(CID, outValue);
}
}
//function map : input customer.txt
public static class CustomerJoinMapper extends
Mapper<LongWritable, Text, Text, Text> {
private Text outkey=new Text();
private Text outvalue = new Text();
private BloomFilter bfilter = new BloomFilter();
public void setup(Context context) throws IOException {
URI[] files = DistributedCache
.getCacheFiles(context.getConfiguration());
// if the files in the distributed cache are set
if (files != null) {
System.out.println("Reading Bloom filter from: "
+ files[0].getPath());
// Open local file for read.
DataInputStream strm = new DataInputStream(new FileInputStream(
files[0].toString()));
bfilter.readFields(strm);
strm.close();
// Read into our Bloom filter.
} else {
throw new IOException(
"Bloom filter file not set in the DistributedCache.");
}
};
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String record[] = line.split(",", -1);
outkey.set(record[0]);
if (bfilter.membershipTest(new Key(outkey.getBytes()))) {
outvalue.set("B"+value);
context.write(outkey, outvalue);
}
}
}
//function reducer: join customer with transaction
public static class JoinReducer extends
Reducer<Text, Text, Text, Text> {
private ArrayList<Text> listA = new ArrayList<Text>();
private ArrayList<Text> listB = new ArrayList<Text>();
#Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
listA.clear();
listB.clear();
for (Text t : values) {
if (t.charAt(0) == 'A') {
listA.add(new Text(t.toString().substring(1)));
System.out.println("liste A: "+listA);
} else /* if (t.charAt('0') == 'B') */{
listB.add(new Text(t.toString().substring(1)));
System.out.println("listeB :"+listB);
}
}
executeJoinLogic(context);
}
private void executeJoinLogic(Context context) throws IOException,
InterruptedException {
if (!listA.isEmpty() && !listB.isEmpty()) {
for (Text A : listB) {
for (Text B : listA) {
context.write(A, B);
System.out.println("A="+A+",B="+B);
}
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path bloompath=new Path("/user/biadmin/ezzaki/bloomfilter/output/part-00000");
DistributedCache.addCacheFile(bloompath.toUri(),conf);
Job job = new Job(conf, "Bloom Join");
job.setJarByClass(BloomJoin.class);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 3) {
System.err
.println("ReduceSideJoin <Transaction data> <Customer data> <out> ");
System.exit(1);
}
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class,TransactionJoin.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, CustomerJoinMapper.class);
job.setReducerClass(JoinReducer.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 3);
}
}
How can I solve this problem?

Can you try changing
URI[] files = DistributedCache.getCacheFiles(context.getConfiguration());
to
// Read the Bloom filter from the task-local copies of the cached files.
// conf, fs and bloomFilter are expected to be defined by the surrounding code.
Path[] cacheFilePaths = DistributedCache.getLocalCacheFiles(conf);
if (cacheFilePaths == null || cacheFilePaths.length == 0) {
    throw new IOException("Bloom filter file not set in the DistributedCache.");
}
for (Path cacheFilePath : cacheFilePaths) {
    DataInputStream fileInputStream = fs.open(cacheFilePath);
    try {
        // The stream only exists inside the loop, so it must be consumed here.
        bloomFilter.readFields(fileInputStream);
    } finally {
        fileInputStream.close();
    }
}
Also, I think you are using a map-side join rather than a reduce-side join, since you are using the distributed cache in the Mapper.

You can use a Bloom Filter from here:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilter.java
It goes with dedicated serializer:
https://github.com/odnoklassniki/apache-cassandra/blob/master/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
You can serialize like this:
// Write the Bloom filter to bloomFilterPath on the file system that owns that path.
Path file = new Path(bloomFilterPath);
FileSystem hdfs = file.getFileSystem(context.getConfiguration());
DataOutputStream os = new DataOutputStream(hdfs.create(file));
try {
    BloomFilterSerializer serializer = new BloomFilterSerializer();
    serializer.serialize(bloomFilter, os);
} finally {
    // Closing flushes the data; without it the file may be left empty or truncated.
    os.close();
}
And deserialize:
InputStream is = getInputStreamFromHdfs(context, bloomFilterPath);
Path path = new Path(bloomFilterPath);
InputStream is = path.getFileSystem(context.getConfiguration()).open(path);
BloomFilterSerializer serializer = new BloomFilterSerializer();
BloomFilter bloomFilter = serializer.deserialize(
new DataInputStream(new BufferedInputStream(is)));

Hadoop Mapreduce: Custom Input Format

I have a file with data having text and "^" in between:
SOME TEXT^GOES HERE^
AND A FEW^MORE
GOES HERE
I am writing a custom input format to delimit the rows using "^" character. i.e The output of the mapper should be like:
SOME TEXT
GOES HERE
AND A FEW
MORE GOES HERE
I have written a custom input format which extends FileInputFormat, and a custom record reader that extends RecordReader. The code for my custom record reader is given below. I don't know how to proceed with this code: I am having trouble with the WHILE loop in the nextKeyValue() method. How should I read the data from a split and generate my custom key-value pairs? I am using the new mapreduce package throughout instead of the old mapred package.
public class MyRecordReader extends RecordReader<LongWritable, Text>
{
long start, current, end;
Text value;
LongWritable key;
LineReader reader;
FileSplit split;
Path path;
FileSystem fs;
FSDataInputStream in;
Configuration conf;
#Override
public void initialize(InputSplit inputSplit, TaskAttemptContext cont) throws IOException, InterruptedException
{
conf = cont.getConfiguration();
split = (FileSplit)inputSplit;
path = split.getPath();
fs = path.getFileSystem(conf);
in = fs.open(path);
reader = new LineReader(in, conf);
start = split.getStart();
current = start;
end = split.getLength() + start;
}
#Override
public boolean nextKeyValue() throws IOException
{
if(key==null)
key = new LongWritable();
key.set(current);
if(value==null)
value = new Text();
long readSize = 0;
while(current<end)
{
Text tmpText = new Text();
readSize = read //here how should i read data from the split, and generate key-value?
if(readSize==0)
break;
current+=readSize;
}
if(readSize==0)
{
key = null;
value = null;
return false;
}
return true;
}
#Override
public float getProgress() throws IOException
{
}
#Override
public LongWritable getCurrentKey() throws IOException
{
}
#Override
public Text getCurrentValue() throws IOException
{
}
#Override
public void close() throws IOException
{
}
}

There is no need to implement that yourself. You can simply set the configuration value textinputformat.record.delimiter to be the circumflex character.
// Set the record-delimiter property to "^"; per the surrounding answer, the
// standard TextInputFormat then splits records on that character.
conf.set("textinputformat.record.delimiter", "^");
This should work fine with the normal TextInputFormat.

Example for running mapreduce on hdfs files and storing reducer results in hbase table

Can somebody give a good example link for MapReduce with HBase? My requirement is to run MapReduce on an HDFS file and store the reducer output in an HBase table. The mapper input will be the HDFS file and its output will be (Text, IntWritable) key-value pairs. The reducer output will be Put objects, i.e. the reducer adds up the Iterable of IntWritable values and stores the result in the HBase table.

Here is the code which will solve your problem
Driver
// Driver: the mapper reads from HDFS (inputPath), the reducer writes to the HBase table TABLE.
// HBaseConfiguration.create() returns a plain Configuration.
Configuration conf = HBaseConfiguration.create();
Job job = new Job(conf, "JOB_NAME");
job.setJarByClass(yourclass.class);
job.setMapperClass(yourMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(inputPath));
// Wires the reducer to the output table.
TableMapReduceUtil.initTableReducerJob(TABLE,
        yourReducer.class, job);
job.setReducerClass(yourReducer.class);
job.waitForCompletion(true);
Mapper&Reducer
// Skeleton mapper: takes (LongWritable, Text) input records and emits
// (Text, IntWritable) pairs, matching the map output classes set in the driver.
class yourMapper extends Mapper<LongWritable, Text, Text,IntWritable> {
// TODO: override map() here
}
// Skeleton reducer: receives the mapper's (Text, IntWritable) pairs; as a
// TableReducer its output key type is ImmutableBytesWritable.
class yourReducer
extends
TableReducer<Text, IntWritable,
ImmutableBytesWritable>
{
// TODO: override reduce() here
}

**Check the code below, which works fine for me with Phoenix, HBase and MapReduce.**
This program reads data from an HBase table and inserts the result into another table after the MapReduce job.
Table :-> STOCK ,STOCK_STATS
StockComputationJob.java
public static class StockMapper extends Mapper<NullWritable, StockWritable, Text , DoubleWritable> {
private Text stock = new Text();
private DoubleWritable price = new DoubleWritable ();
#Override
protected void map(NullWritable key, StockWritable stockWritable, Context context) throws IOException, InterruptedException {
double[] recordings = stockWritable.getRecordings();
final String stockName = stockWritable.getStockName();
System.out.println("Map-"+recordings);
double maxPrice = Double.MIN_VALUE;
for(double recording : recordings) {
System.out.println("M-"+key+"-"+recording);
if(maxPrice < recording) {
maxPrice = recording;
}
}
System.out.println(stockName+"--"+maxPrice);
stock.set(stockName);
price.set(maxPrice);
context.write(stock,price);
}
}
/**
 * Configures and runs the stock statistics job: reads rows from the Phoenix
 * table STOCK and writes STOCK_NAME / MAX_RECORDING to STOCK_STATS.
 *
 * <p>The process exits with status 0 when the job succeeds and 1 when it
 * fails, so callers and schedulers can detect failure.
 */
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    HBaseConfiguration.addHbaseResources(conf);
    // NOTE(review): zkUrl is not defined in the visible code - it must be supplied by the enclosing class.
    conf.set(HConstants.ZOOKEEPER_QUORUM, zkUrl);
    final Job job = Job.getInstance(conf, "stock-stats-job");
    // We can either specify a selectQuery or ignore it when we would like to retrieve all the columns
    final String selectQuery = "SELECT STOCK_NAME,RECORDING_YEAR,RECORDINGS_QUARTER FROM STOCK ";
    // StockWritable is the DBWritable class that enables us to process the Result of the above query
    PhoenixMapReduceUtil.setInput(job, StockWritable.class, "STOCK", selectQuery);
    // Set the target Phoenix table and the columns
    PhoenixMapReduceUtil.setOutput(job, "STOCK_STATS", "STOCK_NAME,MAX_RECORDING");
    job.setMapperClass(StockMapper.class);
    job.setReducerClass(StockReducer.class);
    job.setOutputFormatClass(PhoenixOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(StockWritable.class);
    TableMapReduceUtil.addDependencyJars(job);
    // Propagate the job outcome instead of always exiting successfully.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
StockReducer.java
public class StockReducer extends Reducer<Text, DoubleWritable, NullWritable , StockWritable> {
protected void reduce(Text key, Iterable<DoubleWritable> recordings, Context context) throws IOException, InterruptedException {
double maxPrice = Double.MIN_VALUE;
System.out.println(recordings);
for(DoubleWritable recording : recordings) {
System.out.println("R-"+key+"-"+recording);
if(maxPrice < recording.get()) {
maxPrice = recording.get();
}
}
final StockWritable stock = new StockWritable();
stock.setStockName(key.toString());
stock.setMaxPrice(maxPrice);
System.out.println(key+"--"+maxPrice);
context.write(NullWritable.get(),stock);
}
}
StockWritable.java
public class StockWritable implements DBWritable,Writable {
private String stockName;
private int year;
private double[] recordings;
private double maxPrice;
public void readFields(DataInput input) throws IOException {
}
public void write(DataOutput output) throws IOException {
}
public void readFields(ResultSet rs) throws SQLException {
stockName = rs.getString("STOCK_NAME");
setYear(rs.getInt("RECORDING_YEAR"));
final Array recordingsArray = rs.getArray("RECORDINGS_QUARTER");
setRecordings((double[])recordingsArray.getArray());
}
public void write(PreparedStatement pstmt) throws SQLException {
pstmt.setString(1, stockName);
pstmt.setDouble(2, maxPrice);
}
public int getYear() {
return year;
}
public void setYear(int year) {
this.year = year;
}
public double[] getRecordings() {
return recordings;
}
public void setRecordings(double[] recordings) {
this.recordings = recordings;
}
public double getMaxPrice() {
return maxPrice;
}
public void setMaxPrice(double maxPrice) {
this.maxPrice = maxPrice;
}
public String getStockName() {
return stockName;
}
public void setStockName(String stockName) {
this.stockName = stockName;
}
}

Distributed Cache Hadoop not retrieving the file content

I am getting some garbage like value instead of the data from the file I want to use as distributed cache.
The Job Configuration is as follows:
// Build the old-API job configuration for Job5.
Configuration config5 = new Configuration();
JobConf conf5 = new JobConf(config5, Job5.class);
conf5.setJobName("Job5");

// Processing classes.
conf5.setMapperClass(MapThree4c.class);
conf5.setReducerClass(ReduceThree5.class);

// Plain-text input and output, with Text keys and values on the output side.
conf5.setInputFormat(TextInputFormat.class);
conf5.setOutputFormat(TextOutputFormat.class);
conf5.setOutputKeyClass(Text.class);
conf5.setOutputValueClass(Text.class);

// Register the side file with the distributed cache, then set the job's input and output paths.
DistributedCache.addCacheFile(new URI("/home/users/mlakshm/ap1228"), conf5);
FileInputFormat.setInputPaths(conf5, new Path(other_args.get(5)));
FileOutputFormat.setOutputPath(conf5, new Path(other_args.get(6)));

// Submit the job and wait for it to finish.
JobClient.runJob(conf5);
In the Mapper, I have the following code:
public class MapThree4c extends MapReduceBase implements Mapper<LongWritable, Text,
Text, Text >{
private Set<String> prefixCandidates = new HashSet<String>();
Text a = new Text();
public void configure(JobConf conf5) {
Path[] dates = new Path[0];
try {
dates = DistributedCache.getLocalCacheFiles(conf5);
System.out.println("candidates: "+candidates);
String astr = dates.toString();
a = new Text(astr);
} catch (IOException ioe) {
System.err.println("Caught exception while getting cached files: " +
StringUtils.stringifyException(ioe));
}
}
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer st = new StringTokenizer(line);
st.nextToken();
String t = st.nextToken();
String uidi = st.nextToken();
String uidj = st.nextToken();
String check = null;
output.collect(new Text(line), a);
}
}
The output value, I am getting from this mapper is:[Lorg.apache.hadoop.fs.Path;#786c1a82
instead of the value from the distributed cache file.

That looks like what you get when you call toString() on an array and if you look at the javadocs for DistributedCache.getLocalCacheFiles(), that is what it returns. If you need to actually read the contents of the files in the cache, you can open/read them with the standard java APIs.

From your code:
Path[] dates = DistributedCache.getLocalCacheFiles(conf5);
Implies that:
String astr = dates.toString(); // is a pointer to the above array (ie.dates) which is what you see in the output as [Lorg.apache.hadoop.fs.Path;#786c1a82.
You need to do the following to see the actual paths:
// Emit the input line once per cached file, paired with that file's name.
for (int i = 0; i < dates.length; i++) {
    String cacheFileName = dates[i].getName();
    output.collect(new Text(line), new Text(cacheFileName));
}

Use hive custom outputformat to handle log files

I want to use Hive version 0.7.0 to handle log files, and I have set a custom InputFormat and OutputFormat. In the InputFormat I replace "\n" with "###", and in the OutputFormat I want to change it back to "\n". In my tests the InputFormat works well but the OutputFormat does not. I want to know why. Here is the code. Thanks!
public class ErrlogOutputFormat, V extends Writable>
extends HiveIgnoreKeyTextOutputFormat {
public static class CustomRecordWriter implements RecordWriter{
RecordWriter writer;
BytesWritable bytesWritable;
public CustomRecordWriter(RecordWriter writer) {
this.writer = writer;
bytesWritable = new BytesWritable();
}
#Override
public void write(Writable w) throws IOException {
//String str = ((Text) w).toString().replaceAll("###","\n");
String[] str = ((Text) w).toString().split("###");
StringBuffer sb = new StringBuffer();
for(String s:str){
sb.append(s).append("\n");
}
Text txtReplace = new Text(sb.toString());
System.out.println("------------------------");
System.out.println(txtReplace.toString());
System.out.println("------------------------");
// Get input data
// Encode
byte[] output = txtReplace.getBytes();
bytesWritable.set(output, 0, output.length);
writer.write(bytesWritable);
}
#Override
public void close(boolean abort) throws IOException {
writer.close(abort);
}
}
#Override
public RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath,
Class valueClass, boolean isCompressed,
Properties tableProperties, Progressable progress)
throws IOException {
CustomRecordWriter writer = new CustomRecordWriter(super
.getHiveRecordWriter(jc, finalOutPath, BytesWritable.class,
isCompressed, tableProperties, progress));
return writer;
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

How to remove the r-00000 extension from reducer output in MapReduce - hadoop

Related

Bloom Filter in MapReduce

Hadoop Mapreduce: Custom Input Format

Example for running mapreduce on hdfs files and storing reducer results in hbase table

Distributed Cache Hadoop not retrieving the file content

Use hive custom outputformat to handle log files

Categories

Resources