I am trying to run a Java program on my Hadoop system that stores an image in a sequence file and then reads that sequence file back.
The sequence file is created, but the image data is not getting appended to it.
I am running the code below with this command:
sudo -u hdfs hadoop jar /usr/java_jar/ImageStorage.jar ImageStorage 12e2baa2ae0e455ac40015942b682c4b.jpg
Please help me out here.
import java.io.*;
import java.util.*;
import java.net.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer.Option;
import org.apache.hadoop.io.Writable;
public class ImageStorage {
private static void openOutputFile(String args1) throws Exception {
String uri = "hdfs://localhost:8020/";
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path("hdfs://localhost:8020/user/img_data/SequenceFileCodecTest.seq");
String string1 = "hdfs://localhost:8020/user/img_data/";
string1 = string1 + args1;
Path inPath = new Path(string1);
FSDataInputStream in = null;
Text key = new Text();
BytesWritable value = new BytesWritable();
SequenceFile.Writer writer = null;
try{
in = fs.open(inPath);
byte buffer[] = new byte[in.available()];
in.read(buffer);
System.out.println(buffer);
in.close();
Option optPath = SequenceFile.Writer.file(path);
Option optKey = SequenceFile.Writer.keyClass(key.getClass());
Option optVal = SequenceFile.Writer.valueClass(value.getClass());
Option optCom = SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK);
FSDataOutputStream fileOutputStream = fs.append(path);
BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fileOutputStream));
writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
br.close();
fileOutputStream.close();
}catch (Exception e) {
System.out.println("Exception MESSAGES = "+e.getMessage());
}
finally {
IOUtils.closeStream(writer);
System.out.println("last line of the code....!!!!!!!!!!");
}
}
private static void openReadFile() throws Exception {
String uri = "hdfs://localhost:8020/";
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path("hdfs://localhost:8020/user/img_data/SequenceFileCodecTest.seq");
/* Reading Operations */
org.apache.hadoop.io.SequenceFile.Reader.Option filePath = SequenceFile.Reader.file(path);
SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(conf,filePath);
Writable key1 = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getKeyClass(), conf);
Writable value1 = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getValueClass(), conf);
try {
while (sequenceFileReader.next(key1, value1)) {
System.out.printf("[%s] %s %s \n", sequenceFileReader.getPosition(), key1,value1.getClass());
}
} finally {
IOUtils.closeStream(sequenceFileReader);
}
/* Reading operations */
}
public static void main(String[] args) throws Exception {
openOutputFile(args[1]);
openReadFile();
}
}
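For reference, note that the options built in openOutputFile are never passed to SequenceFile.createWriter, so writer is still null when writer.append(...) is called and the resulting NullPointerException is swallowed by the catch block. A minimal sketch of how the writer is normally obtained from those options (same path, key and value types as in the code above; an illustration, not a drop-in fix) could look like this:
// Sketch: create the writer from the options instead of using fs.append()
SequenceFile.Writer writer = SequenceFile.createWriter(conf,
SequenceFile.Writer.file(path),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(BytesWritable.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK));
try {
// key: the file name, value: the raw image bytes read from HDFS
writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
} finally {
IOUtils.closeStream(writer);
}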
Related
I have a file, abc.txt, with the following contents:
abc.txt
hhjj
myhome _ dfdfdsfds
fdsfds
dfdfdsfdsd
fdsfdsf
dfdfdsfdsdfdsfd
dfdsfds
dfdsfds
_
hhhh
jjj
kkkk
The character after myhome could be anything: a special character, a letter, or a number. I need to find the character after myhome and delete the lines in between, up to and including the line that ends with that character. In my file I have used '_' after myhome and the block ends with '_'.
The output should be:
abc.txt
hhjj
hhhh
jjj
kkkk
I have tried reading the file line by line:
package test;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
public class ReadStringFromFileLineByLine {
public static void main(String[] args) {
try {
File file = new File("abc.txt");
FileReader fileReader = new FileReader(file);
BufferedReader bufferedReader = new BufferedReader(fileReader);
StringBuffer stringBuffer = new StringBuffer();
String line;
while ((line = bufferedReader.readLine()) != null) {
stringBuffer.append(line);
stringBuffer.append("\n");
}
fileReader.close();
System.out.println("Contents of file:");
System.out.println(stringBuffer.toString());
} catch (IOException e) {
e.printStackTrace();
}
}
}
Could anyone help?
Structure
App.java
package main;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Scanner;
public class App {
public static void main(String[] args)
throws FileNotFoundException, UnsupportedEncodingException {
App app = new App();
app.parseFile();
}
void parseFile() throws FileNotFoundException, UnsupportedEncodingException {
// Reading from package main
File file = new File(getClass().getResource("/main/abc.txt").getFile());
// If you want to read from HDD, for example :
// File file = new File("/home/user/abc.txt"); or File file = new File("c:\\abc.txt");
Scanner input = new Scanner(file);
StringBuilder sb = new StringBuilder();
while (input.hasNextLine()) {
sb.append(input.nextLine());
sb.append("\n");
}
input.close();
String text = sb.toString();
text = text
.replace(
text.substring(
text.indexOf("myhome _"),
text.lastIndexOf('_')+1
),
""
)
.replace("\n\n", "\n");
Path newFile = Paths.get("abc_new.txt");
Files.write(newFile, Arrays.asList(text.split("\n")), Charset.forName("UTF-8"));
}
}
The newly written file will be in the project folder.
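If you prefer to stay with the line-by-line reader from the question, a rough sketch of the same idea is shown below. It assumes the marker is whatever single character follows "myhome " (an underscore in the sample file) and reuses the java.io classes already imported in ReadStringFromFileLineByLine:
// Sketch: take the character right after "myhome " as the end marker and drop every
// line from the "myhome" line up to and including the line equal to that marker
BufferedReader reader = new BufferedReader(new FileReader("abc.txt"));
StringBuilder kept = new StringBuilder();
String marker = null; // non-null while we are inside the block to delete
String line;
while ((line = reader.readLine()) != null) {
int idx = line.indexOf("myhome ");
if (marker == null && idx >= 0) {
marker = String.valueOf(line.charAt(idx + "myhome ".length())); // e.g. "_"
continue; // drop the "myhome" line itself
}
if (marker != null) {
if (line.trim().equals(marker)) {
marker = null; // closing marker found, resume keeping lines
}
continue; // drop lines inside the block
}
kept.append(line).append("\n");
}
reader.close();
System.out.println(kept);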
I want to run two mappers that produce two different outputs in different directories. The output of the first mapper (sent as an argument) should be the input of the second mapper. I have this code in the driver class:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Export_Column_Mapping
{
private static String[] Detail_output_column_array = new String[27];
private static String[] Shop_output_column_array = new String[8];
private static String details_output = null ;
private static String Shop_output = null;
public static void main(String[] args) throws Exception
{
String Output_filetype = args[3];
String Input_column_number = args[4];
String Output_column_number = args[5];
Configuration Detailsconf = new Configuration(false);
Detailsconf.setStrings("output_filetype",Output_filetype);
Detailsconf.setStrings("Input_column_number",Input_column_number);
Detailsconf.setStrings("Output_column_number",Output_column_number);
Job Details = new Job(Detailsconf," Export_Column_Mapping");
Details.setJarByClass(Export_Column_Mapping.class);
Details.setJobName("DetailsFile_Job");
Details.setMapperClass(DetailFile_Mapper.class);
Details.setNumReduceTasks(0);
Details.setInputFormatClass(TextInputFormat.class);
Details.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Details, new Path(args[0]));
FileOutputFormat.setOutputPath(Details, new Path(args[1]));
if(Details.waitForCompletion(true))
{
Configuration Shopconf = new Configuration();
Job Shop = new Job(Shopconf,"Export_Column_Mapping");
Shop.setJarByClass(Export_Column_Mapping.class);
Shop.setJobName("ShopFile_Job");
Shop.setMapperClass(ShopFile_Mapper.class);
Shop.setNumReduceTasks(0);
Shop.setInputFormatClass(TextInputFormat.class);
Shop.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Shop, new Path(args[1]));
FileOutputFormat.setOutputPath(Shop, new Path(args[2]));
MultipleOutputs.addNamedOutput(Shop, "text", TextOutputFormat.class,LongWritable.class, Text.class);
System.exit(Shop.waitForCompletion(true) ? 0 : 1);
}
}
public static class DetailFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String str_Output_filetype = context.getConfiguration().get("output_filetype");
String str_Input_column_number = context.getConfiguration().get("Input_column_number");
String[] input_columns_number = str_Input_column_number.split(",");
String str_Output_column_number= context.getConfiguration().get("Output_column_number");
String[] output_columns_number = str_Output_column_number.split(",");
String str_line = value.toString();
String[] input_column_array = str_line.split(",");
try
{
for(int i = 0;i<=input_column_array.length+1; i++)
{
int int_outputcolumn = Integer.parseInt(output_columns_number[i]);
int int_inputcolumn = Integer.parseInt(input_columns_number[i]);
if((int_inputcolumn != 0) && (int_outputcolumn != 0) && output_columns_number.length == input_columns_number.length)
{
Detail_output_column_array[int_outputcolumn-1] = input_column_array[int_inputcolumn-1];
if(details_output != null)
{
details_output = details_output+" "+ Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output+" "+ Shop_output_column_array[int_outputcolumn-1];
}else
{
details_output = Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output_column_array[int_outputcolumn-1];
}
}
}
}catch (Exception e)
{
}
context.write(null,new Text(details_output));
}
}
public static class ShopFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
try
{
for(int i = 0;i<=Shop_output_column_array.length; i++)
{
Shop_output_column_array[0] = Detail_output_column_array[0];
Shop_output_column_array[1] = Detail_output_column_array[1];
Shop_output_column_array[2] = Detail_output_column_array[2];
Shop_output_column_array[3] = Detail_output_column_array[3];
Shop_output_column_array[4] = Detail_output_column_array[14];
if(details_output != null)
{
Shop_output = Shop_output+" "+ Shop_output_column_array[i];
}else
{
Shop_output = Shop_output_column_array[i-1];
}
}
}catch (Exception e){
}
context.write(null,new Text(Shop_output));
}
}
}
I get this error:
Error:org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
Input path does not exist:
file:/home/Barath.B.Natarajan.ap/rules/text.txt
I want to run the jobs one by one. Can anyone help me with this?
There is something called JobControl with which you will be able to achieve this.
Suppose there are two jobs, A and B:
ControlledJob A= new ControlledJob(JobConf for A);
ControlledJob B= new ControlledJob(JobConf for B);
B.addDependingJob(A);
JobControl jControl = new JobControl("Name");
jControl.addJob(A);
jControl.addJob(B);
Thread runJControl = new Thread(jControl);
runJControl.start();
while (!jControl.allFinished()) {
code = jControl.getFailedJobList().size() == 0 ? 0 : 1;
Thread.sleep(1000);
}
System.exit(code);
Initialize code at the beginning like this:
int code = 1;
Let the first job in your case be the first mapper with zero reducers, and the second job be the second mapper with zero reducers. The configuration should be such that the input path of B and the output path of A are the same.
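Putting it together for the two mapper-only jobs above, a rough sketch using the new-API classes from org.apache.hadoop.mapreduce.lib.jobcontrol could look like this (createDetailsJob and createShopJob are hypothetical helpers that build the Job objects exactly as in the driver code, conf is a Configuration, and the surrounding main is assumed to declare throws Exception for Thread.sleep):
// Sketch: B depends on A, and A's output directory (args[1]) is B's input directory
Job detailsJob = createDetailsJob(conf, args); // hypothetical helper, output -> args[1]
Job shopJob = createShopJob(conf, args); // hypothetical helper, input -> args[1], output -> args[2]
ControlledJob a = new ControlledJob(detailsJob, null);
ControlledJob b = new ControlledJob(shopJob, null);
b.addDependingJob(a); // B will not start until A succeeds
JobControl jControl = new JobControl("Export_Column_Mapping");
jControl.addJob(a);
jControl.addJob(b);
Thread runJControl = new Thread(jControl);
runJControl.start();
int code = 1;
while (!jControl.allFinished()) {
code = jControl.getFailedJobList().isEmpty() ? 0 : 1;
Thread.sleep(1000);
}
jControl.stop();
System.exit(code);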
I have a huge amount of data stored in HDFS, but the individual files are very small (KBs), so the MapReduce processing is taking a lot of time.
Can I reduce the processing time? Would SequenceFile be a good option?
Please provide some Java or MR code to convert multiple smaller text files into a SequenceFile.
SequenceFile would be a good choice in such a scenario. You could do something like this:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class TextToSequenceConverter {
/**
* @param args
* @throws IOException
* @throws IllegalAccessException
* @throws InstantiationException
*/
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException,
InstantiationException, IllegalAccessException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
conf.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
conf.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
FileSystem fs = FileSystem.get(conf);
Path inputFile = new Path("/infile");
FSDataInputStream inputStream = fs.open(inputFile);
Path outputFile = new Path("/outfile");
IntWritable key = new IntWritable();
int count = 0;
Text value = new Text();
String str;
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,outputFile, key.getClass(), value.getClass());
while (inputStream.available() > 0) {
key.set(count++);
str = inputStream.readLine();
value.set(str);
writer.append(key, value);
}
fs.close();
IOUtils.closeStream(writer);
System.out.println("SEQUENCE FILE CREATED SUCCESSFULLY........");
}
}
You might also want to have a look at HAR files.
You might find this a good read:
http://blog.cloudera.com/blog/2009/02/the-small-files-problem/
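For reference, a HAR is built with the hadoop archive tool, along the lines of hadoop archive -archiveName files.har -p /parent/dir /output/dir (the paths here are just placeholders).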
To convert all the files inside an HDFS directory into a single sequence file:
package my.pack;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class BundleSeq {
/**
* @param args
* @throws IOException
* @throws IllegalAccessException
* @throws InstantiationException
*/
public static void main(String[] args) throws IOException,
InstantiationException, IllegalAccessException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
conf.addResource(new Path(
"/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
conf.addResource(new Path(
"/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
FileSystem fs = FileSystem.get(conf);
Path inputFile = new Path("/bundleinput");
Path outputFile = new Path("/outfile");
FSDataInputStream inputStream;
Text key = new Text();
Text value = new Text();
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
outputFile, key.getClass(), value.getClass());
FileStatus[] fStatus = fs.listStatus(inputFile);
for (FileStatus fst : fStatus) {
String str = "";
System.out.println("Processing file : " + fst.getPath().getName() + " and the size is : " + fst.getPath().getName().length());
inputStream = fs.open(fst.getPath());
key.set(fst.getPath().getName());
while (inputStream.available() > 0) {
str = str + inputStream.readLine();
}
inputStream.close();
value.set(str);
writer.append(key, value);
}
fs.close();
IOUtils.closeStream(writer);
System.out.println("SEQUENCE FILE CREATED SUCCESSFULLY........");
}
}
Here filename is the key and file content is the value.
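To read the bundle back, a short sketch (reusing conf and outputFile from the code above) would be:
// Sketch: iterate the bundled sequence file; keys are file names, values are file contents
SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(outputFile));
Text k = new Text();
Text v = new Text();
while (reader.next(k, v)) {
System.out.println(k + " -> " + v.getLength() + " bytes");
}
reader.close();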
You may also override org.apache.hadoop.mapred.lib.CombineFileInputFormat and create your own CombinedInputFormat. For the implementation, see my answer here. By setting the parameter mapred.max.split.size you can control the size you would like the input files to be combined into.
For more, read here.
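For the new (mapreduce) API, a minimal sketch of the same idea using the built-in CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat, available in recent Hadoop 2.x releases; the job name and the 128 MB limit below are just examples) could be:
// Sketch: pack many small text files into splits of at most ~128 MB instead of one split per file
Job job = Job.getInstance(new Configuration(), "small-files-job");
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(job, 128 * 1024 * 1024L); // new-API equivalent of mapred.max.split.size
FileInputFormat.addInputPath(job, new Path("/infile"));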
I am looking for an example that uses the new API to read and write sequence files.
Effectively, I need to know how to use these functions:
createWriter(Configuration conf, org.apache.hadoop.io.SequenceFile.Writer.Option... opts)
The old definition is not working for me:
SequenceFile.createWriter( fs, conf, path, key.getClass(), value.getClass());
Similarly, I need to know what the code for reading the sequence file will be, as the following is deprecated:
SequenceFile.Reader(fs, path, conf);
Here is the way to use it:
String uri = args[0];
Configuration conf = new Configuration();
Path path = new Path( uri);
IntWritable key = new IntWritable();
Text value = new Text();
CompressionCodec Codec = new GzipCodec();
SequenceFile.Writer writer = null;
Option optPath = SequenceFile.Writer.file(path);
Option optKey = SequenceFile.Writer.keyClass(key.getClass());
Option optVal = SequenceFile.Writer.valueClass(value.getClass());
Option optCom = SequenceFile.Writer.compression(CompressionType.RECORD, Codec);
writer = SequenceFile.createWriter( conf, optPath, optKey, optVal, optCom);
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.junit.Test;
public class SequenceFilesTest {
@Test
public void testSeqFileReadWrite() throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Path seqFilePath = new Path("file.seq");
SequenceFile.Writer writer = SequenceFile.createWriter(conf,
Writer.file(seqFilePath), Writer.keyClass(Text.class),
Writer.valueClass(IntWritable.class));
writer.append(new Text("key1"), new IntWritable(1));
writer.append(new Text("key2"), new IntWritable(2));
writer.close();
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
Reader.file(seqFilePath));
Text key = new Text();
IntWritable val = new IntWritable();
while (reader.next(key, val)) {
System.err.println(key + "\t" + val);
}
reader.close();
}
}
I'm late by more than a year to answer, but I just got started with Hadoop 2.4.1 :)
Below is the code; someone may find it useful.
Note: it includes the commented-out 1.x code for reading and writing a sequence file. I was wondering where it picks up the file system, but when I executed it directly on the cluster it picked it up properly (probably from core-site.xml, as mentioned in Configuration).
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
public class SequenceFileOperator {
private Configuration conf = new Configuration();
/*private FileSystem fs;
{
try {
fs = FileSystem.get(URI.create("hdfs://cldx-1336-1202:9000"), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
if (args == null || args.length < 2) {
System.out
.println("Following are the possible invocations <operation id> <arg1> <arg2> ...");
System.out
.println("1 <absolute path of directory containing documents> <HDFS path of the sequence file");
System.out.println("2 <HDFS path of the sequence file>");
return;
}
int operation = Integer.valueOf(args[0]);
SequenceFileOperator docToSeqFileWriter = new SequenceFileOperator();
switch (operation) {
case 1: {
String docDirectoryPath = args[1];
String sequenceFilePath = args[2];
System.out.println("Writing files present at " + docDirectoryPath
+ " to the sequence file " + sequenceFilePath);
docToSeqFileWriter.loadDocumentsToSequenceFile(docDirectoryPath,
sequenceFilePath);
break;
}
case 2: {
String sequenceFilePath = args[1];
System.out.println("Reading the sequence file " + sequenceFilePath);
docToSeqFileWriter.readSequenceFile(sequenceFilePath);
break;
}
}
}
private void readSequenceFile(String sequenceFilePath) throws IOException {
// TODO Auto-generated method stub
/*
* SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(fs,
* new Path(sequenceFilePath), conf);
*/
Option filePath = SequenceFile.Reader.file(new Path(sequenceFilePath));
SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(conf,
filePath);
Writable key = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(
sequenceFileReader.getValueClass(), conf);
try {
while (sequenceFileReader.next(key, value)) {
System.out
.printf("[%s] %s %s \n",
sequenceFileReader.getPosition(), key,
value.getClass());
}
} finally {
IOUtils.closeStream(sequenceFileReader);
}
}
private void loadDocumentsToSequenceFile(String docDirectoryPath,
String sequenceFilePath) throws IOException {
// TODO Auto-generated method stub
File docDirectory = new File(docDirectoryPath);
if (!docDirectory.isDirectory()) {
System.out
.println("Please provide an absolute path of a directory that contains the documents to be added to the sequence file");
return;
}
/*
* SequenceFile.Writer sequenceFileWriter =
* SequenceFile.createWriter(fs, conf, new Path(sequenceFilePath),
* Text.class, BytesWritable.class);
*/
org.apache.hadoop.io.SequenceFile.Writer.Option filePath = SequenceFile.Writer
.file(new Path(sequenceFilePath));
org.apache.hadoop.io.SequenceFile.Writer.Option keyClass = SequenceFile.Writer
.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option valueClass = SequenceFile.Writer
.valueClass(BytesWritable.class);
SequenceFile.Writer sequenceFileWriter = SequenceFile.createWriter(
conf, filePath, keyClass, valueClass);
File[] documents = docDirectory.listFiles();
try {
for (File document : documents) {
RandomAccessFile raf = new RandomAccessFile(document, "r");
byte[] content = new byte[(int) raf.length()];
raf.readFully(content);
sequenceFileWriter.append(new Text(document.getName()),
new BytesWritable(content));
raf.close();
}
} finally {
IOUtils.closeStream(sequenceFileWriter);
}
}
}
For reading, you can use:
Path path= new Path("/bar");
Reader sequenceFileReader = new SequenceFile.Reader(conf,SequenceFile.Reader.file(path));
You need to set SequenceFileInputFormat as the input format:
job.setInputFormatClass(SequenceFileInputFormat.class);
You will find an example of reading a SequenceFile from HDFS here.
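For context, a minimal driver sketch showing where that call fits (SeqReadDriver and SeqReadMapper are placeholder names and the output path is an example; the sequence file path is the one used earlier in this thread):
// Sketch: job that reads the sequence file written above
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "read-sequence-file");
job.setJarByClass(SeqReadDriver.class); // placeholder driver class
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(SeqReadMapper.class); // hypothetical Mapper<Text, BytesWritable, ...>
FileInputFormat.addInputPath(job, new Path("hdfs://localhost:8020/user/img_data/SequenceFileCodecTest.seq"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:8020/user/img_data/seq_out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);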
I am trying to compress an image using JAI in TIFF format. It works fine on Windows XP, but on Windows 7 it produces a file about 10 times bigger than on Windows XP.
I am using JAI 1.1 and JRE 1.6.0_16.
What might be the issue? I appreciate your assistance.
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;
import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.stream.ImageOutputStream;
import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
public class Test {
/**
* @param args
*/
public static void main(String[] args) {
File image = new File("abc.tiff");
File tempFile = new File("compressed.tiff");
try {
BufferedImage bi = ImageIO.read(image);
byte[] tiffArray = toTiff(bi, tempFile,"packBits" );
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static byte[] toTiff(BufferedImage bi, File tempFile, String compType) {
byte[] bImg = null;
try{
tempFile.delete();
String format = "TIF";
Iterator writers = ImageIO.getImageWritersByFormatName(format);
if(writers == null || !writers.hasNext()) {
throw new IllegalArgumentException("Unsupported format (" + format + ")");
}
ImageWriter writer = (ImageWriter)writers.next();
IIOImage iioImg = new IIOImage(bi, null, null);
TIFFImageWriteParam writeParam = new TIFFImageWriteParam(Locale.ENGLISH);
writeParam.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
writeParam.setCompressionType(compType);
ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile);
writer.setOutput(ios);
writer.write(null, iioImg, writeParam);
ios.close();
writer.dispose();
bImg = readImage(tempFile);
}catch(Exception e){
e.printStackTrace();
}
return bImg;
}
public static byte[] readImage(File f) throws Exception {
byte[] bImg = null;
try {
long fLength = f.length();
if(fLength > Integer.MAX_VALUE){
throw new RuntimeException("File is too large to upload....!");
}
bImg = new byte[(int)fLength];
FileInputStream fin = new FileInputStream(f);
fin.read(bImg);
fin.close();
} catch (Exception e) {
e.printStackTrace();
throw e;
}
return bImg;
}
}
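One thing worth checking (a guess, not something the question confirms) is which TIFF ImageWriter plugin each machine actually picks up, since a different plugin, or a missing JAI Image I/O install, changes the available compression. A small diagnostic sketch:
// Sketch: list the TIFF writers ImageIO sees and the compression types each supports
Iterator<ImageWriter> it = ImageIO.getImageWritersByFormatName("TIF");
while (it.hasNext()) {
ImageWriter w = it.next();
ImageWriteParam p = w.getDefaultWriteParam();
System.out.println(w.getClass().getName());
if (p.canWriteCompressed()) {
System.out.println("  compression types: " + java.util.Arrays.toString(p.getCompressionTypes()));
}
}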