ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.hbase.io.ImmutableBytesWritable - hadoop

I have exported a table from HBase to a file in a format similar to org.apache.hadoop.mapreduce.lib.output.TextOutputFormat. To import the exported text-format file, I tweaked the open-source Import code to support importing text-based files instead of SequenceFiles:
job.setInputFormatClass(TextInputFormat.class);
While running the Import class I get the following exception:
java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.hbase.io.ImmutableBytesWritable
at Import$Importer.map(Import.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
Here is my Export class, which was tweaked so the ExporterTable mapper also writes its content to a file.
public class Export
{
private static final Log LOG = LogFactory.getLog(Export.class);
final static String NAME = "export";
final static String RAW_SCAN = "hbase.mapreduce.include.deleted.rows";
private static OutputStream out;
private static final String utf8 = "UTF-8";
private static final byte[] newline;
private static final byte[] keyValueSeparator;
static {
try {
newline = "\n".getBytes(utf8);
keyValueSeparator = "\t".getBytes(utf8);
}
catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
/**
* Mapper.
*/
static class ExporterTable extends TableMapper<ImmutableBytesWritable, Result>
{
/**
* @param row The current table row key.
* @param value The columns.
* @param context The current context.
* @throws IOException When something is broken with the data.
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN,
* org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException {
try {
context.write(row, value);
write(row, value);
System.out.println(row);
System.out.println(value);
}
catch (InterruptedException e) {
e.printStackTrace();
}
}
}
/**
* Sets up the actual job.
*
* @param conf The current configuration.
* @param args The command line parameters.
* @return The newly created job.
* @throws IOException When setting up the job fails.
*/
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
String tableName = args[0];
// this.out = new DataOutputStream(fos);
Path outputDir = new Path(args[1]);
Job job = new Job(conf, NAME + "_" + tableName);
job.setJobName(NAME + "_" + tableName);
job.setJarByClass(ExporterTable.class);
// Set optional scan parameters
Scan s = getConfiguredScanForJob(conf, args);
TableMapReduceUtil.initTableMapperJob(tableName, s, ExporterTable.class, ImmutableBytesWritable.class, IntWritable.class, job);
// No reducers. Just write straight to output files.
job.setNumReduceTasks(0);
job.setOutputValueClass(Text.class);
// FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormatClass(NullOutputFormat.class);
TableMapReduceUtil.addHBaseDependencyJars(conf);
TableMapReduceUtil.addDependencyJars(conf, JsonProcessingException.class);
TableMapReduceUtil.addDependencyJars(job);
return job;
}
private static Scan getConfiguredScanForJob(Configuration conf, String[] args) throws IOException {
Scan s = new Scan();
// Optional arguments.
// Set Scan Versions
int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
s.setMaxVersions(versions);
// Set Scan Range
long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
s.setTimeRange(startTime, endTime);
// Set cache blocks
s.setCacheBlocks(false);
// Set Scan Column Family
boolean raw = Boolean.parseBoolean(conf.get(RAW_SCAN));
if (raw) {
s.setRaw(raw);
}
if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
}
// Set RowFilter or Prefix Filter if applicable.
Filter exportFilter = getExportFilter(args);
if (exportFilter != null) {
LOG.info("Setting Scan Filter for Export.");
s.setFilter(exportFilter);
}
LOG.info("versions=" + versions + ", starttime=" + startTime + ", endtime=" + endTime + ", keepDeletedCells=" + raw);
return s;
}
private static Filter getExportFilter(String[] args) {
Filter exportFilter = null;
String filterCriteria = (args.length > 5) ? args[5] : null;
if (filterCriteria == null)
return null;
if (filterCriteria.startsWith("^")) {
String regexPattern = filterCriteria.substring(1, filterCriteria.length());
exportFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator(regexPattern));
}
else {
exportFilter = new PrefixFilter(Bytes.toBytes(filterCriteria));
}
return exportFilter;
}
/*
* @param errorMsg Error message. Can be null.
*/
private static void usage(final String errorMsg) {
if (errorMsg != null && errorMsg.length() > 0) {
System.err.println("ERROR: " + errorMsg);
}
System.err.println("Usage: Export [-D <property=value>]* <tablename> <outputdir> [<versions> " + "[<starttime> [<endtime>]] [^[regex pattern] or [Prefix] to filter]]\n");
System.err.println(" Note: -D properties will be applied to the conf used. ");
System.err.println(" For example: ");
System.err.println(" -D mapred.output.compress=true");
System.err.println(" -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec");
System.err.println(" -D mapred.output.compression.type=BLOCK");
System.err.println(" Additionally, the following SCAN properties can be specified");
System.err.println(" to control/limit what is exported..");
System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<familyName>");
System.err.println(" -D " + RAW_SCAN + "=true");
System.err.println("For performance consider the following properties:\n" + " -Dhbase.client.scanner.caching=100\n" + " -Dmapred.map.tasks.speculative.execution=false\n" + " -Dmapred.reduce.tasks.speculative.execution=false");
}
/**
* Main entry point.
*
* @param args The command line parameters.
* @throws Exception When running the job fails.
*/
public static void main(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create();
conf.set("mapreduce.framework.name", "local");
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
usage("Wrong number of arguments: " + otherArgs.length);
System.exit(-1);
}
boolean jobStatus = false;
Job job = createSubmittableJob(conf, otherArgs);
try {
File f = new File("Test");
out = new FileOutputStream(f);
jobStatus = job.waitForCompletion(true);
}
catch (Exception e) {
e.printStackTrace();
}
finally {
IOUtils.closeStream(out);
}
// convertTextToSequence(conf);
System.exit(jobStatus ? 0 : 1);
}
public static void write(ImmutableBytesWritable key, Result value) throws IOException {
boolean nullKey = key == null;
boolean nullValue = value == null;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
/**
* Write the object to the byte stream, handling Text as a special
* case.
* @param o the object to print
* @throws IOException if the write throws, we pass it on
*/
private static void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
}
else {
out.write(o.toString().getBytes(utf8));
}
}
}
Any help is appreciated.

You have declared the map method as follows, writing the output key as ImmutableBytesWritable:
public void map(ImmutableBytesWritable row, Result value, Context context)
throws IOException {
try {
context.write(row, value);
You have to set the job's map output key and value classes as follows:
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Result.class);
Have a look at a working example: 7. Export an HBase table to File.
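Also note that, since the job now uses TextInputFormat, the mapper's input key/value types are LongWritable and Text, which is exactly what the ClassCastException is complaining about. Below is a minimal, illustrative sketch (not the actual Import source; the class name, column family and line layout are assumptions) of a mapper whose input signature matches TextInputFormat:
import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch only: assumes each exported line is "<rowKey>\t<value>", as written by
// the Export class above with its tab key/value separator.
public class TextImporter extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        String[] parts = line.toString().split("\t", 2);
        byte[] rowKey = Bytes.toBytes(parts[0]);
        Put put = new Put(rowKey);
        // The column family and qualifier are placeholders; adapt them to the exported layout.
        put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes(parts[1]));
        context.write(new ImmutableBytesWritable(rowKey), put);
    }
}
With a mapper like this, the map output value class set on the job would be Put.class rather than Result.class.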

Related

Chronicle Queue V3. Can Entries be lost on data block roll-over?

I have an application that writes entries to a Chronicle Queue (V3) and also retains excerpt entry index values in other (Chronicle)Maps, by way of providing indexed access into the queue. Sometimes we fail to find a given entry that we saved earlier, and I believe it may be related to data-block roll-over.
Below is a stand-alone test program that reproduces such use-cases at small scale. It repeatedly writes an entry and immediately attempts to look the resulting index value up using a separate ExcerptTailer. All is well for a while, until the first data block is used up and a second data file is assigned; then the retrieval failures start. If the data-block size is increased to avoid roll-overs, no entries are lost. Using a small index data-block size, causing multiple index files to be created, does not cause a problem either.
The test program also tries using an ExcerptListener running in parallel to see if the entries apparently 'lost' by the writer are ever received by the reader thread - they're not. It also tries to re-read the resulting queue from start to end, which reconfirms that they really are lost.
Stepping through the code, I see that when looking up a 'missing entry' within AbstractVanillarExcerpt#index, it appears to successfully locate the correct VanillaMappedBytes object from the dataCache, but determines that there is no entry at the data offset because len == 0. In addition to the entries not being found, at some point after the problems start occurring post-roll-over, an NPE is thrown from within the VanillaMappedFile#fileChannel method because it was passed a null File path. The code path assumes that when an entry is looked up successfully in the index, a file will always have been found, but it isn't in this case.
Is it possible to reliably use Chronicle Queue across data-block roll-overs, and if so, what am I doing that may be causing the problem I'm experiencing?
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import org.junit.Before;
import org.junit.Test;
import net.openhft.affinity.AffinitySupport;
import net.openhft.chronicle.Chronicle;
import net.openhft.chronicle.ChronicleQueueBuilder;
import net.openhft.chronicle.ExcerptAppender;
import net.openhft.chronicle.ExcerptCommon;
import net.openhft.chronicle.ExcerptTailer;
import net.openhft.chronicle.VanillaChronicle;
public class ChronicleTests {
private static final int CQ_LEN = VanillaChronicle.Cycle.DAYS.length();
private static final long CQ_ENT = VanillaChronicle.Cycle.DAYS.entries();
private static final String ROOT_DIR = System.getProperty(ChronicleTests.class.getName() + ".ROOT_DIR",
"C:/Temp/chronicle/");
private static final String QDIR = System.getProperty(ChronicleTests.class.getName() + ".QDIR", "chronicleTests");
private static final int DATA_SIZE = Integer
.parseInt(System.getProperty(ChronicleTests.class.getName() + ".DATA_SIZE", "100000"));
// Chunk file size of CQ index
private static final int INDX_SIZE = Integer
.parseInt(System.getProperty(ChronicleTests.class.getName() + ".INDX_SIZE", "10000"));
private static final int Q_ENTRIES = Integer
.parseInt(System.getProperty(ChronicleTests.class.getName() + ".Q_ENTRIES", "5000"));
// Data type id
protected static final byte FSYNC_DATA = 1;
protected static final byte NORMAL_DATA = 0;
protected static final byte TH_START_DATA = -1;
protected static final byte TH_END_DATA = -2;
protected static final byte CQ_START_DATA = -3;
private static final long MAX_RUNTIME_MILLISECONDS = 30000;
private static String PAYLOAD_STRING = "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
private static byte PAYLOAD_BYTES[] = PAYLOAD_STRING.getBytes();
private Chronicle _chronicle;
private String _cqPath = ROOT_DIR + QDIR;
@Before
public void init() {
buildCQ();
}
@Test
public void test() throws IOException, InterruptedException {
boolean passed = true;
Collection<Long> missingEntries = new LinkedList<Long>();
long sent = 0;
Thread listener = listen();
try {
listener.start();
// Write entries to CQ,
for (int i = 0; i < Q_ENTRIES; i++) {
long entry = writeQEntry(PAYLOAD_BYTES, (i % 100) == 0);
sent++;
// check each entry can be looked up
boolean found = checkEntry(i, entry);
if (!found)
missingEntries.add(entry);
passed &= found;
}
// Wait awhile for the listener
listener.join(MAX_RUNTIME_MILLISECONDS);
if (listener.isAlive())
listener.interrupt();
} finally {
if (listener.isAlive()) { // => exception raised so wait for listener
log("Give listener a chance....");
sleep(MAX_RUNTIME_MILLISECONDS);
listener.interrupt();
}
log("Sent: " + sent + " Received: " + _receivedEntries.size());
// Look for missing entries in receivedEntries
missingEntries.forEach(me -> checkMissingEntry(me));
log("All passed? " + passed);
// Try to find missing entries by searching from the start...
searchFromStartFor(missingEntries);
_chronicle.close();
_chronicle = null;
// Re-initialise CQ and look for missing entries again...
log("Re-initialise");
init();
searchFromStartFor(missingEntries);
}
}
private void buildCQ() {
try {
// build chronicle queue
_chronicle = ChronicleQueueBuilder.vanilla(_cqPath).cycleLength(CQ_LEN).entriesPerCycle(CQ_ENT)
.indexBlockSize(INDX_SIZE).dataBlockSize(DATA_SIZE).build();
} catch (IOException e) {
throw new InitializationException("Failed to initialize Active Trade Store.", e);
}
}
private long writeQEntry(byte dataArray[], boolean fsync) throws IOException {
ExcerptAppender appender = _chronicle.createAppender();
return writeData(appender, dataArray, fsync);
}
private boolean checkEntry(int seqNo, long entry) throws IOException {
ExcerptTailer tailer = _chronicle.createTailer();
if (!tailer.index(entry)) {
log("SeqNo: " + seqNo + " for entry + " + entry + " not found");
return false;
}
boolean isMarker = isMarker(tailer);
boolean isFsyncData = isFsyncData(tailer);
boolean isNormalData = isNormalData(tailer);
String type = isMarker ? "MARKER" : isFsyncData ? "FSYNC" : isNormalData ? "NORMALDATA" : "UNKNOWN";
log("Entry: " + entry + "(" + seqNo + ") is " + type);
return true;
}
private void log(String string) {
System.out.println(string);
}
private void searchFromStartFor(Collection<Long> missingEntries) throws IOException {
Set<Long> foundEntries = new HashSet<Long>(Q_ENTRIES);
ExcerptTailer tailer = _chronicle.createTailer();
tailer.toStart();
while (tailer.nextIndex())
foundEntries.add(tailer.index());
Iterator<Long> iter = missingEntries.iterator();
long foundCount = 0;
while (iter.hasNext()) {
long me = iter.next();
if (foundEntries.contains(me)) {
log("Found missing entry: " + me);
foundCount++;
}
}
log("searchFromStartFor Found: " + foundCount + " of: " + missingEntries.size() + " missing entries");
}
private void checkMissingEntry(long missingEntry) {
if (_receivedEntries.contains(missingEntry))
log("Received missing entry:" + missingEntry);
}
Set<Long> _receivedEntries = new HashSet<Long>(Q_ENTRIES);
private Thread listen() {
Thread returnVal = new Thread("Listener") {
public void run() {
try {
int receivedCount = 0;
ExcerptTailer tailer = _chronicle.createTailer();
tailer.toStart();
while (receivedCount < Q_ENTRIES) {
if (tailer.nextIndex()) {
_receivedEntries.add(tailer.index());
} else {
ChronicleTests.this.sleep(1);
}
}
log("listener complete");
} catch (IOException e) {
log("Interupted before receiving all entries");
}
}
};
return returnVal;
}
private void sleep(long interval) {
try {
Thread.sleep(interval);
} catch (InterruptedException e) {
// No action required
}
}
protected static final int THREAD_ID_LEN = Integer.SIZE / Byte.SIZE;
protected static final int DATA_TYPE_LEN = Byte.SIZE / Byte.SIZE;
protected static final int TIMESTAMP_LEN = Long.SIZE / Byte.SIZE;
protected static final int CRC_LEN = Long.SIZE / Byte.SIZE;
protected static long writeData(ExcerptAppender appender, byte dataArray[],
boolean fsync) {
appender.startExcerpt(DATA_TYPE_LEN + THREAD_ID_LEN + dataArray.length
+ CRC_LEN);
appender.nextSynchronous(fsync);
if (fsync) {
appender.writeByte(FSYNC_DATA);
} else {
appender.writeByte(NORMAL_DATA);
}
appender.writeInt(AffinitySupport.getThreadId());
appender.write(dataArray);
appender.writeLong(CRCCalculator.calcDataAreaCRC(appender));
appender.finish();
return appender.lastWrittenIndex();
}
protected static boolean isMarker(ExcerptCommon excerpt) {
if (isCqStartMarker(excerpt) || isStartMarker(excerpt) || isEndMarker(excerpt)) {
return true;
}
return false;
}
protected static boolean isCqStartMarker(ExcerptCommon excerpt) {
return isDataTypeMatched(excerpt, CQ_START_DATA);
}
protected static boolean isStartMarker(ExcerptCommon excerpt) {
return isDataTypeMatched(excerpt, TH_START_DATA);
}
protected static boolean isEndMarker(ExcerptCommon excerpt) {
return isDataTypeMatched(excerpt, TH_END_DATA);
}
protected static boolean isData(ExcerptTailer tailer, long index) {
if (!tailer.index(index)) {
return false;
}
return isData(tailer);
}
private static void movePosition(ExcerptCommon excerpt, long position) {
if (excerpt.position() != position)
excerpt.position(position);
}
private static void moveToFsyncFlagPos(ExcerptCommon excerpt) {
movePosition(excerpt, 0);
}
private static boolean isDataTypeMatched(ExcerptCommon excerpt, byte type) {
moveToFsyncFlagPos(excerpt);
byte b = excerpt.readByte();
if (b == type) {
return true;
}
return false;
}
protected static boolean isNormalData(ExcerptCommon excerpt) {
return isDataTypeMatched(excerpt, NORMAL_DATA);
}
protected static boolean isFsyncData(ExcerptCommon excerpt) {
return isDataTypeMatched(excerpt, FSYNC_DATA);
}
/**
* Check if this entry is Data
*
* @param excerpt
* @return true if the entry is data
*/
protected static boolean isData(ExcerptCommon excerpt) {
if (isNormalData(excerpt) || isFsyncData(excerpt)) {
return true;
}
return false;
}
}
The problem only occurs when initialising the data-block size with a value that is not a power of two. The built-in configurations on IndexedChronicleQueueBuilder (small(), medium(), large()) take care to initialise using powers of two, which provided the clue to the appropriate usage.
Notwithstanding the above response regarding support, which I totally appreciate, it would be useful if a knowledgeable Chronicle user could confirm that the integrity of Chronicle Queue depends on using a data-block size that is a power of two.
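For reference, a minimal sketch of the builder call with power-of-two block sizes (same builder methods as in buildCQ() above; the sizes themselves are illustrative):
Chronicle chronicle = ChronicleQueueBuilder.vanilla(_cqPath)
        .cycleLength(VanillaChronicle.Cycle.DAYS.length())
        .entriesPerCycle(VanillaChronicle.Cycle.DAYS.entries())
        .dataBlockSize(131072)  // 128 KB = 2^17, a power of two
        .indexBlockSize(16384)  // 16 KB = 2^14, a power of two
        .build();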

Upload File data into database using spring mvc and hibernate

I need to upload data into two database tables (Factory and FactoryType) using the form below, but it isn't working. Can somebody take a look?
Factory table: factoryId, factoryName
FactoryType table: factoryType, factoryTypeId
FactoryConf table: factoryId, factoryTypeId
We are using Hibernate for the database operations.
Model:
@Entity
@Table(name = "FactoryConf", uniqueConstraints = {
@UniqueConstraint(columnNames = { "factoryId" } )
})
public class FactoryConf {
@Id
long factoryId;
@OneToOne
@JoinColumn(name = "factoryId", insertable = false, updatable = false)
Factory factory;
@ManyToOne(optional = false)
@JoinColumn(name = "factoryTypeId")
FactoryType factoryType;
public FactoryConf() {
super();
}
public FactoryConf(long factoryId, FactoryType factoryType) {
super();
this.factoryType = factoryType;
this.factoryId = factoryId;
}
public Factory getFactory() {
return factory;
}
public void setFactory(Factory factory) {
this.factory = factory;
}
public FactoryType getFactoryType() {
return factoryType;
}
public void setFactoryType(FactoryType factoryType) {
this.factoryType = factoryType;
}
public long getFactoryId() {
return factoryId;
}
public void setFactoryId(long factoryId) {
this.factoryId = factoryId;
}
public FactoryType getFactoryTypeByFactoryID(long factoryId){
return factoryType;
}
}
Bean class:
/**
* This bean is defined to parse each record from the CSV file.
* All records are mapped to instances of this bean class.
*
*/
public class FactoryCSVFileInputBean {
private String upload_id;
private String file_name;
private byte file_data;
private long Id;
private String Name;
private String Type;
//getter setters
}
CSV parsing:
/**
* This class is defined for following actions
* 1. Validate the input CSV file format, header columns.
* 2. Parse the CSV file into a list of beans.
* 3. Validate input records with missing data and prepare a valid factory list to be processed. *
*/
public class FactoryCSVUtil {
private static Log log = LogFactory.getLog(FactoryCSVUtil.class);
private static final List<String> fileHeaderFields = new ArrayList<String>();
private static final String UTF8CHARSET = "UTF-8";
static{
for (Field f : FactoryCSVFileInputBean.class.getDeclaredFields()) {
fileHeaderFields.add(f.getName());
}
}
public static List<FactoryCSVFileInputBean> getCSVInputList(InputStream inputStream){
CSVReader reader = null;
List<FactoryCSVFileInputBean> csvList = null;
FactoryCSVFileInputBean inputRecord = null;
String[] header = null;
String[] row = null;
try {
reader = new CSVReader(new InputStreamReader(inputStream,UTF8CHARSET));
csvList = new ArrayList<FactoryCSVFileInputBean>();
header = reader.readNext();
boolean isEmptyLine = true;
while ((row = reader.readNext()) != null) {
isEmptyLine = true;
if(!(row.length==1 && StringUtils.isBlank(row[0]))){//not an empty line, not even containing ','
inputRecord = new FactoryCSVFileInputBean();
isEmptyLine = populateFields(inputRecord, header, row);
if(row.length != header.length)
//inputRecord.setUploadStatus("Not Loaded - Missing or invalid Data");
if(!isEmptyLine)
csvList.add(inputRecord);
}
}
} catch (IOException e) {
log.debug("IOException while accessing FactoryCSVFileInputBean: " + e);
return null;
} catch (IllegalAccessException e) {
log.debug("IllegalAccessException while accessing FactoryCSVFileInputBean: " + e);
return null;
} catch (InvocationTargetException e) {
log.debug("InvocationTargetException while copying FactoryCSVFileInputBean properties: " + e);
return null;
} catch (Exception e) {
log.debug("Exception while parsing CSV file: " + e);
return null;
}finally{
try{
if(reader!=null)
reader.close();
}catch(IOException ioe){}
}
return csvList;
}
protected static boolean populateFields(FactoryCSVFileInputBean inputRecord,String[] header, String[] row) throws IllegalAccessException, InvocationTargetException {
boolean isEmptyLine = true;
for (int i = 0; i < row.length; i++) {
String val = row[i];
if(!StringUtils.isBlank(val)){
BeanUtilsBean.getInstance().copyProperty(inputRecord, header[i], val);
isEmptyLine = false;
} else {
//inputRecord.setUploadStatus(String.format("Not Loaded - Missing or invalid Data for:%s",header[i]));
}
}
return isEmptyLine;
}
public static void validateInputFile(CommonsMultipartFile csvFile, Model model){
InputStream inputStream = null;
CSVReader reader = null;
String fileName = csvFile.getOriginalFilename();
String fileExtension = fileName.substring(fileName.lastIndexOf('.') + 1);
if(fileExtension.toUpperCase().equals("CSV")){
try{
inputStream = csvFile.getInputStream();
reader = new CSVReader(new InputStreamReader(inputStream,UTF8CHARSET));
String[] header = reader.readNext();
if(header!=null){
for (int i = 0; i < header.length; i++) {
if(!header[i].equals("") && !fileHeaderFields.contains(header[i])){
log.debug("Invalid Column found in upload file: " + header[i]);
model.addAttribute("failureMsg", "Invalid Column found in upload file: " + header[i]);
break;
}
}
for(csvHeaderFieldsEnum field : csvHeaderFieldsEnum.values()){
if(!Arrays.asList(header).contains(field.getValue())){
log.debug("Missing column in upload file: " + field.getValue());
model.addAttribute("failureMsg", "Missing column in upload file: " + field.getValue());
break;
}
}
}else{
model.addAttribute("failureMsg", "File is Empty - Please select a valid file");
}
String[] data = reader.readNext();
if(data==null){
log.debug("Empty file with header - No data found");
model.addAttribute("failureMsg", "Empty file with header - No data found");
}
}catch(IOException e){
log.debug("IOException in reading the CSV file: " + e);
model.addAttribute("failureMsg", "Exception in reading the CSV file");
}finally{
if(reader!=null)
try{
reader.close();
}catch(IOException e){ log.debug("IOException in closing reader of CSV file: " + e);}
}
}
else{
model.addAttribute("failureMsg", "Invalid file format - Please select a CSV file");
}
}
}
Model
public class FactoryUploadForm {
private CommonsMultipartFile fileData;
private String uploadComment;
/**
* @return the fileData
*/
public CommonsMultipartFile getFileData() {
return fileData;
}
/**
* @param fileData the fileData to set
*/
public void setFileData(CommonsMultipartFile fileData) {
this.fileData = fileData;
}
/**
* @return the uploadComment
*/
public String getUploadComment() {
return uploadComment;
}
/**
* @param uploadComment the uploadComment to set
*/
public void setUploadComment(String uploadComment) {
this.uploadComment = uploadComment;
}
public String toString(){
return " CSVFileName: " + getFileData().getOriginalFilename() + "; Upload Comment: " + uploadComment;
}
}
Controller
@Controller
public class FactoryUploadDownloadController {
private static final Log logger = LogFactory.getLog(FactoryUploadDownloadController.class);
@Resource
Service Service;
@Resource
FactoryUploadRepository repository;
@RequestMapping(value = "/submitUploadFactoryForm")
public String uploadFactory(FactoryUploadForm uploadform,
HttpServletRequest request, Model model, BindingResult result) {
logger.debug("====================================================================");
List<FactoryCSVFileInputBean> csvList = null;
List<FactoryType> factoryTypes = Service.getFactoryTypes();
try {
CommonsMultipartFile file = uploadform.getFileData();
// parse csv file to list
csvList = FactoryCSVUtil.getCSVInputList(file.getInputStream());
if (csvList == null) {
model.addAttribute("failureMsg","Error in file parsing - Please verify the file");
logger.debug("---------------------------------------------------------");
return "sucess";
}
} catch (Exception e) {
logger.debug("sorry this isn't working for you");
}
try {
CommonsMultipartFile file = uploadform.getFileData();
for (FactoryCSVFileInputBean inputRecord : csvList) {
Factory factoryentity = new Factory();
factoryentity.setId(inputRecord.getId());
factoryentity.setName(inputRecord.getName());
factoryentity = this.Service.saveFactory(factoryentity);
FactoryConf factoryconf = new FactoryConf();
factoryconf.setFactory(factoryentity);
factoryconf.setFactoryType(pickFactoryType(factoryTypes,inputRecord.getType()));
model.addAttribute("factoryconf", factoryconf);
this.Service.savefactoryCfg(factoryconf);
}
} catch (Exception e) {
logger.debug("sorry this isnt working for you");
}
return "success";
}
private FactoryType pickFactoryType(List<FactoryType> types, String typeName) {
for (FactoryType type : types) {
if (type.getFactoryType().equalsIgnoreCase(typeName))
return type;
}
throw new RuntimeException(String.format("Factory Type Invalid :%s", typeName));
}
}
From your question, I understand that you are not able to parse data from the CSV file. Here is sample code for a similar task; I think it should help.
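As a starting point, a minimal sketch (the bean, its setters and the column order are placeholders, not your exact classes) of reading the uploaded file with opencsv's CSVReader, using the same CSVReader and CommonsMultipartFile types already imported in your utility class:
// Sketch: parse an uploaded CSV into simple beans. SampleRecord and its setters
// are hypothetical; replace them with FactoryCSVFileInputBean or your own type.
public List<SampleRecord> parseCsv(CommonsMultipartFile csvFile) throws IOException {
    List<SampleRecord> records = new ArrayList<SampleRecord>();
    CSVReader reader = new CSVReader(new InputStreamReader(csvFile.getInputStream(), "UTF-8"));
    try {
        reader.readNext();                          // skip the header line
        String[] row;
        while ((row = reader.readNext()) != null) {
            if (row.length < 3) {
                continue;                           // skip incomplete lines
            }
            SampleRecord record = new SampleRecord();
            record.setId(Long.parseLong(row[0]));   // assumes columns: Id, Name, Type
            record.setName(row[1]);
            record.setType(row[2]);
            records.add(record);
        }
    } finally {
        reader.close();
    }
    return records;
}
Once the beans parse cleanly, each record can be turned into a Factory / FactoryConf entity and saved through the Hibernate-backed service, as your controller already does; the key is to verify that the parsing step returns a non-null, non-empty list before the persistence loop runs.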

Quartz doesn't recognize schema job_scheduling_data_2_0.xsd present in quartz jar file

I am getting the below exception on server startup.
I am using Quartz 2.2.21 with Spring 3.2.
I have enabled the Quartz plugin (org.quartz.plugins.xml.XMLSchedulingDataProcessorPlugin).
Please find below the start tag of our XML file:
During server startup we get the following log information and stack trace:
Error Message:
Unable to load local schema packaged in quartz distribution jar. Utilizing schema online at http://www.quartz-scheduler.org/xml/job_scheduling_data_2_0.xsd
Exception:
Caused by: org.xml.sax.SAXParseException; systemId: file:///quartz_job_data.xml; lineNumber: 5; columnNumber: 104;
schema_reference.4: Failed to read schema document 'http://www.quartz-scheduler.org/xml/job_scheduling_data_2_0.xsd', because 1) could not find the document; 2) the document could not be read; 3) the root element of the document is not <xsd:schema>.
I had the same problem. I'm using JBoss 7.1.1, and the problem appears when you don't have a connection to the internet. This is easy to reproduce by putting a fake, unreachable address in the hosts file.
I tried to force the local copy, but it does not work.
What I finally did was partially override the functionality until this is fixed. See: https://jira.spring.io/browse/SPR-13706
public class CustomXMLSchedulingDataProcessor extends org.quartz.xml.XMLSchedulingDataProcessor {
public static final String QUARTZ_XSD_PATH_IN_JAR_CLASSPATH = "classpath:org/quartz/xml/job_scheduling_data_2_0.xsd";
public CustomXMLSchedulingDataProcessor(ClassLoadHelper clh) throws ParserConfigurationException {
super(clh);
}
@Override
protected Object resolveSchemaSource() {
InputSource inputSource;
InputStream is = null;
try {
is = classLoadHelper.getResourceAsStream(QUARTZ_XSD_PATH_IN_JAR_CLASSPATH);
} finally {
if (is != null) {
inputSource = new InputSource(is);
inputSource.setSystemId(QUARTZ_SCHEMA_WEB_URL);
}
else {
return QUARTZ_SCHEMA_WEB_URL;
}
}
return inputSource;
}
}
And I wrote a new plugin, XMLSchedulingDataProcessorPlugin, overriding just the instantiation of the class above.
public class XMLSchedulingDataProcessorPlugin
extends SchedulerPluginWithUserTransactionSupport
implements FileScanListener {
/*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Data members.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
private static final int MAX_JOB_TRIGGER_NAME_LEN = 80;
private static final String JOB_INITIALIZATION_PLUGIN_NAME = "JobSchedulingDataLoaderPlugin";
private static final String FILE_NAME_DELIMITERS = ",";
private boolean failOnFileNotFound = true;
private String fileNames = CustomXMLSchedulingDataProcessor.QUARTZ_XML_DEFAULT_FILE_NAME;
// Populated by initialization
private Map<String, JobFile> jobFiles = new LinkedHashMap<String, JobFile>();
private long scanInterval = 0;
boolean started = false;
protected ClassLoadHelper classLoadHelper = null;
private Set<String> jobTriggerNameSet = new HashSet<String>();
/*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Constructors.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
public XMLSchedulingDataProcessorPlugin() {
}
/*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Interface.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/**
* Comma separated list of file names (with paths) to the XML files that should be read.
*/
public String getFileNames() {
return fileNames;
}
/**
* The file name (and path) to the XML file that should be read.
*/
public void setFileNames(String fileNames) {
this.fileNames = fileNames;
}
/**
* The interval (in seconds) at which to scan for changes to the file.
* If the file has been changed, it is re-loaded and parsed. The default
* value for the interval is 0, which disables scanning.
*
* @return Returns the scanInterval.
*/
public long getScanInterval() {
return scanInterval / 1000;
}
/**
* The interval (in seconds) at which to scan for changes to the file.
* If the file has been changed, it is re-loaded and parsed. The default
* value for the interval is 0, which disables scanning.
*
* @param scanInterval The scanInterval to set.
*/
public void setScanInterval(long scanInterval) {
this.scanInterval = scanInterval * 1000;
}
/**
* Whether or not initialization of the plugin should fail (throw an
* exception) if the file cannot be found. Default is <code>true</code>.
*/
public boolean isFailOnFileNotFound() {
return failOnFileNotFound;
}
/**
* Whether or not initialization of the plugin should fail (throw an
* exception) if the file cannot be found. Default is <code>true</code>.
*/
public void setFailOnFileNotFound(boolean failOnFileNotFound) {
this.failOnFileNotFound = failOnFileNotFound;
}
/*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* SchedulerPlugin Interface.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/**
* <p>
* Called during creation of the <code>Scheduler</code> in order to give
* the <code>SchedulerPlugin</code> a chance to initialize.
* </p>
*
* @throws org.quartz.SchedulerConfigException
* if there is an error initializing.
*/
public void initialize(String name, final Scheduler scheduler, ClassLoadHelper schedulerFactoryClassLoadHelper)
throws SchedulerException {
super.initialize(name, scheduler);
this.classLoadHelper = schedulerFactoryClassLoadHelper;
getLog().info("Registering Quartz Job Initialization Plug-in.");
// Create JobFile objects
StringTokenizer stok = new StringTokenizer(fileNames, FILE_NAME_DELIMITERS);
while (stok.hasMoreTokens()) {
final String fileName = stok.nextToken();
final JobFile jobFile = new JobFile(fileName);
jobFiles.put(fileName, jobFile);
}
}
@Override
public void start(UserTransaction userTransaction) {
try {
if (jobFiles.isEmpty() == false) {
if (scanInterval > 0) {
getScheduler().getContext().put(JOB_INITIALIZATION_PLUGIN_NAME + '_' + getName(), this);
}
Iterator<JobFile> iterator = jobFiles.values().iterator();
while (iterator.hasNext()) {
JobFile jobFile = iterator.next();
if (scanInterval > 0) {
String jobTriggerName = buildJobTriggerName(jobFile.getFileBasename());
TriggerKey tKey = new TriggerKey(jobTriggerName, JOB_INITIALIZATION_PLUGIN_NAME);
// remove pre-existing job/trigger, if any
getScheduler().unscheduleJob(tKey);
JobDetail job = newJob().withIdentity(jobTriggerName, JOB_INITIALIZATION_PLUGIN_NAME).ofType(FileScanJob.class)
.usingJobData(FileScanJob.FILE_NAME, jobFile.getFileName())
.usingJobData(FileScanJob.FILE_SCAN_LISTENER_NAME, JOB_INITIALIZATION_PLUGIN_NAME + '_' + getName())
.build();
SimpleTrigger trig = newTrigger().withIdentity(tKey).withSchedule(
simpleSchedule().repeatForever().withIntervalInMilliseconds(scanInterval))
.forJob(job)
.build();
getScheduler().scheduleJob(job, trig);
getLog().debug("Scheduled file scan job for data file: {}, at interval: {}", jobFile.getFileName(), scanInterval);
}
processFile(jobFile);
}
}
} catch(SchedulerException se) {
getLog().error("Error starting background-task for watching jobs file.", se);
} finally {
started = true;
}
}
/**
* Helper method for generating unique job/trigger name for the
* file scanning jobs (one per FileJob). The unique names are saved
* in jobTriggerNameSet.
*/
private String buildJobTriggerName(
String fileBasename) {
// Name w/o collisions will be prefix + _ + filename (with '.' of filename replaced with '_')
// For example: JobInitializationPlugin_jobInitializer_myjobs_xml
String jobTriggerName = JOB_INITIALIZATION_PLUGIN_NAME + '_' + getName() + '_' + fileBasename.replace('.', '_');
// If name is too long (DB column is 80 chars), then truncate to max length
if (jobTriggerName.length() > MAX_JOB_TRIGGER_NAME_LEN) {
jobTriggerName = jobTriggerName.substring(0, MAX_JOB_TRIGGER_NAME_LEN);
}
// Make sure this name is unique in case the same file name under different
// directories is being checked, or had a naming collision due to length truncation.
// If there is a conflict, keep incrementing a _# suffix on the name (being sure
// not to get too long), until we find a unique name.
int currentIndex = 1;
while (jobTriggerNameSet.add(jobTriggerName) == false) {
// If not our first time through, then strip off old numeric suffix
if (currentIndex > 1) {
jobTriggerName = jobTriggerName.substring(0, jobTriggerName.lastIndexOf('_'));
}
String numericSuffix = "_" + currentIndex++;
// If the numeric suffix would make the name too long, then make room for it.
if (jobTriggerName.length() > (MAX_JOB_TRIGGER_NAME_LEN - numericSuffix.length())) {
jobTriggerName = jobTriggerName.substring(0, (MAX_JOB_TRIGGER_NAME_LEN - numericSuffix.length()));
}
jobTriggerName += numericSuffix;
}
return jobTriggerName;
}
/**
* Overridden to ignore <em>wrapInUserTransaction</em> because shutdown()
* does not interact with the <code>Scheduler</code>.
*/
@Override
public void shutdown() {
// Since we have nothing to do, override base shutdown so don't
// get extranious UserTransactions.
}
private void processFile(JobFile jobFile) {
if (jobFile == null || !jobFile.getFileFound()) {
return;
}
try {
CustomXMLSchedulingDataProcessor processor =
new CustomXMLSchedulingDataProcessor(this.classLoadHelper);
processor.addJobGroupToNeverDelete(JOB_INITIALIZATION_PLUGIN_NAME);
processor.addTriggerGroupToNeverDelete(JOB_INITIALIZATION_PLUGIN_NAME);
processor.processFileAndScheduleJobs(
jobFile.getFileName(),
jobFile.getFileName(), // systemId
getScheduler());
} catch (Exception e) {
getLog().error("Error scheduling jobs: " + e.getMessage(), e);
}
}
public void processFile(String filePath) {
processFile((JobFile)jobFiles.get(filePath));
}
/**
* @see org.quartz.jobs.FileScanListener#fileUpdated(java.lang.String)
*/
public void fileUpdated(String fileName) {
if (started) {
processFile(fileName);
}
}
class JobFile {
private String fileName;
// These are set by initialize()
private String filePath;
private String fileBasename;
private boolean fileFound;
protected JobFile(String fileName) throws SchedulerException {
this.fileName = fileName;
initialize();
}
protected String getFileName() {
return fileName;
}
protected boolean getFileFound() {
return fileFound;
}
protected String getFilePath() {
return filePath;
}
protected String getFileBasename() {
return fileBasename;
}
private void initialize() throws SchedulerException {
InputStream f = null;
try {
String furl = null;
File file = new File(getFileName()); // files in filesystem
if (!file.exists()) {
URL url = classLoadHelper.getResource(getFileName());
if(url != null) {
try {
furl = URLDecoder.decode(url.getPath(), "UTF-8");
} catch (UnsupportedEncodingException e) {
furl = url.getPath();
}
file = new File(furl);
try {
f = url.openStream();
} catch (IOException ignor) {
// Swallow the exception
}
}
} else {
try {
f = new java.io.FileInputStream(file);
}catch (FileNotFoundException e) {
// ignore
}
}
if (f == null) {
if (isFailOnFileNotFound()) {
throw new SchedulerException(
"File named '" + getFileName() + "' does not exist.");
} else {
getLog().warn("File named '" + getFileName() + "' does not exist.");
}
} else {
fileFound = true;
}
filePath = (furl != null) ? furl : file.getAbsolutePath();
fileBasename = file.getName();
} finally {
try {
if (f != null) {
f.close();
}
} catch (IOException ioe) {
getLog().warn("Error closing jobs file " + getFileName(), ioe);
}
}
}
}
}
That way you only have to use this plugin in your configuration and everything will work by default.
org.quartz.plugin.jobInitializer.class =
com.level2.quartz.processor.plugin.XMLSchedulingDataProcessorPlugin
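For completeness, the surrounding configuration might look like this (the property names correspond to the plugin's setters - setFileNames, setFailOnFileNotFound, setScanInterval - and the values are just examples):
org.quartz.plugin.jobInitializer.class = com.level2.quartz.processor.plugin.XMLSchedulingDataProcessorPlugin
org.quartz.plugin.jobInitializer.fileNames = quartz_job_data.xml
org.quartz.plugin.jobInitializer.failOnFileNotFound = true
org.quartz.plugin.jobInitializer.scanInterval = 0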

Reduce doesn't run but job is successfully completed

Firstly, I am a newbie at Hadoop MapReduce. My reducer does not run, yet the job shows as successfully completed. Below is my console output:
INFO mapreduce.Job: Running job: job_1418240815217_0015
INFO mapreduce.Job: Job job_1418240815217_0015 running in uber mode : false
INFO mapreduce.Job: map 0% reduce 0%
INFO mapreduce.Job: map 100% reduce 0%
INFO mapreduce.Job: Job job_1418240815217_0015 completed successfully
INFO mapreduce.Job: Counters: 30
The main class is :
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
@SuppressWarnings("deprecation")
Job job = new Job(conf,"NPhase2");
job.setJarByClass(NPhase2.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(NPhase2Value.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
int numberOfPartition = 0;
List<String> other_args = new ArrayList<String>();
for(int i = 0; i < args.length; ++i)
{
try {
if ("-m".equals(args[i])) {
//conf.setNumMapTasks(Integer.parseInt(args[++i]));
++i;
} else if ("-r".equals(args[i])) {
job.setNumReduceTasks(Integer.parseInt(args[++i]));
} else if ("-k".equals(args[i])) {
int knn = Integer.parseInt(args[++i]);
conf.setInt("knn", knn);
System.out.println(knn);
} else {
other_args.add(args[i]);
}
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
//conf.setNumReduceTasks(1);
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " + args[i-1]);
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
}
FileInputFormat.setInputPaths(job, other_args.get(0));
FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
My mapper is :
public static class MapClass extends Mapper
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] parts = line.split("\\s+");
// key format <rid1>
IntWritable mapKey = new IntWritable(Integer.valueOf(parts[0]));
// value format <rid2, dist>
NPhase2Value np2v = new NPhase2Value(Integer.valueOf(parts[1]), Float.valueOf(parts[2]));
context.write(mapKey, np2v);
}
}
My reducer class is :
public static class Reduce extends Reducer<IntWritable, NPhase2Value, NullWritable, Text>
{
int numberOfPartition;
int knn;
class Record
{
public int id2;
public float dist;
Record(int id2, float dist)
{
this.id2 = id2;
this.dist = dist;
}
public String toString()
{
return Integer.toString(id2) + " " + Float.toString(dist);
}
}
class RecordComparator implements Comparator<Record>
{
public int compare(Record o1, Record o2)
{
int ret = 0;
float dist = o1.dist - o2.dist;
if (Math.abs(dist) < 1E-6)
ret = o1.id2 - o2.id2;
else if (dist > 0)
ret = 1;
else
ret = -1;
return -ret;
}
}
public void setup(Context context)
{
Configuration conf = new Configuration();
conf = context.getConfiguration();
numberOfPartition = conf.getInt("numberOfPartition", 2);
knn = conf.getInt("knn", 3);
}
public void reduce(IntWritable key, Iterator<NPhase2Value> values, Context context) throws IOException, InterruptedException
{
//initialize the pq
RecordComparator rc = new RecordComparator();
PriorityQueue<Record> pq = new PriorityQueue<Record>(knn + 1, rc);
// For each record we have a reduce task
// value format <rid1, rid2, dist>
while (values.hasNext())
{
NPhase2Value np2v = values.next();
int id2 = np2v.getFirst().get();
float dist = np2v.getSecond().get();
Record record = new Record(id2, dist);
pq.add(record);
if (pq.size() > knn)
pq.poll();
}
while(pq.size() > 0)
{
context.write(NullWritable.get(), new Text(key.toString() + " " + pq.poll().toString()));
//break; // only ouput the first record
}
} // reduce
}
This is my helper class :
public class NPhase2Value implements WritableComparable {
private IntWritable first;
private FloatWritable second;
public NPhase2Value() {
set(new IntWritable(), new FloatWritable());
}
public NPhase2Value(int first, float second) {
set(new IntWritable(first), new FloatWritable(second));
}
public void set(IntWritable first, FloatWritable second) {
this.first = first;
this.second = second;
}
public IntWritable getFirst() {
return first;
}
public FloatWritable getSecond() {
return second;
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public boolean equals(Object o) {
if (o instanceof NPhase2Value) {
NPhase2Value np2v = (NPhase2Value) o;
return first.equals(np2v.first) && second.equals(np2v.second);
}
return false;
}
@Override
public String toString() {
return first.toString() + " " + second.toString();
}
@Override
public int compareTo(NPhase2Value np2v) {
return 1;
}
}
The command line command I use is :
hadoop jar knn.jar NPhase2 -m 1 -r 3 -k 4 phase1out phase2out
I am trying hard to figure out the error but am still not able to come up with a solution. Please help me in this regard, as I am running on a tight schedule.
Because you have set the number of reduce tasks to 0. See this:
int numberOfPartition = 0;
//.......
job.setNumReduceTasks(numberOfPartition * numberOfPartition);
I don't see numberOfPartition being reset anywhere in your code. You should either set it where you parse the -r option, or remove the setNumReduceTasks call shown above completely, since you are already setting the reduce task count while parsing the -r option.
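A possible cleanup of the argument-parsing loop (a sketch, not a drop-in replacement) that takes the reduce count only from -r and drops the extra call that currently overrides it with 0:
for (int i = 0; i < args.length; ++i) {
    try {
        if ("-m".equals(args[i])) {
            ++i;                                                // number of map tasks, ignored here
        } else if ("-r".equals(args[i])) {
            job.setNumReduceTasks(Integer.parseInt(args[++i])); // reduce count comes only from -r
        } else if ("-k".equals(args[i])) {
            // set knn on the job's configuration so the reducer's setup() can read it
            job.getConfiguration().setInt("knn", Integer.parseInt(args[++i]));
        } else {
            other_args.add(args[i]);
        }
        // note: no job.setNumReduceTasks(numberOfPartition * numberOfPartition) here
    } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
    } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
    }
}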

isSplitable in combineFileInputFormat does not work

I have thousands of small files, and I want to process them with CombineFileInputFormat.
With CombineFileInputFormat, multiple small files go to one mapper, and each file should not be split.
A snippet of one of the small input files looks like this:
vers,3
period,2015-01-26-18-12-00,438469546,449329626,complete
config,libdvm.so,chromeview
pkgproc,com.futuredial.digitchat,10021,,0ns:10860078
pkgpss,com.futuredial.digitchat,10021,,0ns:9:6627:6627:6637:5912:5912:5912
pkgsvc-run,com.futuredial.digitchat,10021,.LiveScreenService,1,0n:10860078
pkgsvc-start,com.futuredial.digitchat,10021,.LiveScreenService,1,0n:10860078
pkgproc,com.google.android.youtube,10103,,0ns:10860078
pkgpss,com.google.android.youtube,10103,,0ns:9:12986:13000:13021:11552:11564:11580
pkgsvc-run,com.google.android.youtube,10103,com.google.android.apps.youtube.app.offline.transfer.OfflineTransferService,1,0n:10860078
pkgsvc-start,com.google.android.youtube,10103,com.google.android.apps.youtube.app.offline.transfer.OfflineTransferService,1,0n:10860078
I want to pass the whole file content to the mapper. However, Hadoop splits the file in half.
For example, the above file may be split into
vers,3
period,2015-01-26-18-12-00,438469546,449329626,complete
config,libdvm.so,chromeview
pkgproc,com.futuredial.digitchat,#the line has been cut
But I want the content of the whole file to be processed.
Here is my code, which references Reading file as single record in hadoop.
The driver code
public class CombineSmallfiles {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: conbinesmallfiles <in> <out>");
System.exit(2);
}
conf.setInt("mapred.min.split.size", 1);
conf.setLong("mapred.max.split.size", 26214400); // 25m
//conf.setLong("mapred.max.split.size", 134217728); // 128m
//conf.setInt("mapred.reduce.tasks", 5);
Job job = new Job(conf, "combine smallfiles");
job.setJarByClass(CombineSmallfiles.class);
job.setMapperClass(CombineSmallfileMapper.class);
//job.setReducerClass(IdentityReducer.class);
job.setNumReduceTasks(0);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
MultipleOutputs.addNamedOutput(job,"pkgproc",TextOutputFormat.class,Text.class,Text.class);
MultipleOutputs.addNamedOutput(job,"pkgpss",TextOutputFormat.class,Text.class,Text.class);
MultipleOutputs.addNamedOutput(job,"pkgsvc",TextOutputFormat.class,Text.class,Text.class);
job.setInputFormatClass(CombineSmallfileInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
int exitFlag = job.waitForCompletion(true) ? 0 : 1;
System.exit(exitFlag);
}
}
My Mapper code
public class CombineSmallfileMapper extends Mapper<NullWritable, Text, Text, Text> {
private Text file = new Text();
private MultipleOutputs mos;
private String period;
private Long elapsed;
@Override
public void setup(Context context) throws IOException, InterruptedException {
mos = new MultipleOutputs(context);
}
@Override
protected void map(NullWritable key, Text value, Context context) throws IOException, InterruptedException {
String file_name = context.getConfiguration().get("map.input.file.name");
String [] filename_tokens = file_name.split("_");
String uuid = filename_tokens[0];
String [] datetime_tokens;
try{
datetime_tokens = filename_tokens[1].split("-");
}catch(ArrayIndexOutOfBoundsException err){
throw new ArrayIndexOutOfBoundsException(file_name);
}
String year,month,day,hour,minute,sec,msec;
year = datetime_tokens[0];
month = datetime_tokens[1];
day = datetime_tokens[2];
hour = datetime_tokens[3];
minute = datetime_tokens[4];
sec = datetime_tokens[5];
msec = datetime_tokens[6];
String datetime = year+"-"+month+"-"+"-"+day+" "+hour+":"+minute+":"+sec+"."+msec;
String content = value.toString();
String []lines = content.split("\n");
for(int u = 0;u<lines.length;u++){
String line = lines[u];
String []tokens = line.split(",");
if(tokens[0].equals("period")){
period = tokens[1];
try{
long startTime = Long.valueOf(tokens[2]);
long endTime = Long.valueOf(tokens[3]);
elapsed = endTime-startTime;
}catch(NumberFormatException err){
throw new NumberFormatException(line);
}
}else if(tokens[0].equals("pkgproc")){
String proc_info = "";
try{
proc_info += period+","+String.valueOf(elapsed)+","+tokens[2]+","+tokens[3];
}catch(ArrayIndexOutOfBoundsException err){
throw new ArrayIndexOutOfBoundsException("pkgproc: "+content+ "line:"+line);
}
for(int i = 4;i<tokens.length;i++){
String []state_info = tokens[i].split(":");
String state = "";
state += ","+state_info[0].charAt(0)+","+state_info[0].charAt(1)+","+state_info[0].charAt(2)+","+state_info[1];
mos.write("pkgproc",new Text(tokens[1]), new Text(proc_info+state+','+uuid+','+datetime));
}
}else if(tokens[0].equals("pkgpss")){
String proc_info = "";
proc_info += period+","+String.valueOf(elapsed)+","+tokens[2]+","+tokens[3];
for(int i = 4;i<tokens.length;i++){
String []state_info = tokens[i].split(":");
String state = "";
state += ","+state_info[0].charAt(0)+","+state_info[0].charAt(1)+","+state_info[0].charAt(2)+","+state_info[1]+","+state_info[2]+","+state_info[3]+","+state_info[4]+","+state_info[5]+","+state_info[6]+","+state_info[7];
mos.write("pkgpss",new Text(tokens[1]), new Text(proc_info+state+','+uuid+','+datetime));
}
}else if(tokens[0].startsWith("pkgsvc")){
String []stateName = tokens[0].split("-");
String proc_info = "";
//tokens[2] = uid, tokens[3] = serviceName
proc_info += stateName[1]+','+period+","+String.valueOf(elapsed)+","+tokens[2]+","+tokens[3];
String opcount = tokens[4];
for(int i = 5;i<tokens.length;i++){
String []state_info = tokens[i].split(":");
String state = "";
state += ","+state_info[0].charAt(0)+","+state_info[0].charAt(1)+","+state_info[1];
mos.write("pkgsvc",new Text(tokens[1]), new Text(proc_info+state+','+opcount+','+uuid+','+datetime));
}
}
}
}
}
My CombineFileInputFormat, which overrides isSplitable and returns false
public class CombineSmallfileInputFormat extends CombineFileInputFormat<NullWritable, Text> {
@Override
public RecordReader<NullWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
return new CombineFileRecordReader<NullWritable,Text>((CombineFileSplit) split,context,WholeFileRecordReader.class);
}
@Override
protected boolean isSplitable(JobContext context,Path file ){
return false;
}
}
The WholeFileRecordReader
public class WholeFileRecordReader extends RecordReader<NullWritable, Text> {
//private static final Logger LOG = Logger.getLogger(WholeFileRecordReader.class);
/** The path to the file to read. */
private final Path mFileToRead;
/** The length of this file. */
private final long mFileLength;
/** The Configuration. */
private final Configuration mConf;
/** Whether this FileSplit has been processed. */
private boolean mProcessed;
/** Single Text to store the file name of the current file. */
// private final Text mFileName;
/** Single Text to store the value of this file (the value) when it is read. */
private final Text mFileText;
/**
* Implementation detail: This constructor is built to be called via
* reflection from within CombineFileRecordReader.
*
* @param fileSplit The CombineFileSplit that this will read from.
* @param context The context for this task.
* @param pathToProcess The path index from the CombineFileSplit to process in this record.
*/
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
Integer pathToProcess) {
mProcessed = false;
mFileToRead = fileSplit.getPath(pathToProcess);
mFileLength = fileSplit.getLength(pathToProcess);
mConf = context.getConfiguration();
context.getConfiguration().set("map.input.file.name", mFileToRead.getName());
assert 0 == fileSplit.getOffset(pathToProcess);
//if (LOG.isDebugEnabled()) {
//LOG.debug("FileToRead is: " + mFileToRead.toString());
//LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());
//try {
//FileSystem fs = FileSystem.get(mConf);
//assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
//} catch (IOException ioe) {
//// oh well, I was just testing.
//}
//}
//mFileName = new Text();
mFileText = new Text();
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
mFileText.clear();
}
/**
* Returns the absolute path to the current file.
*
* @return The absolute path to the current file.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
/**
* <p>Returns the current value. If the file has been read with a call to NextKeyValue(),
* this returns the contents of the file as a BytesWritable. Otherwise, it returns an
* empty BytesWritable.</p>
*
* <p>Throws an IllegalStateException if initialize() is not called first.</p>
*
* @return A BytesWritable containing the contents of the file to read.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return mFileText;
}
/**
* Returns whether the file has been processed or not. Since only one record
* will be generated for a file, progress will be 0.0 if it has not been processed,
* and 1.0 if it has.
*
* @return 0.0 if the file has not been processed. 1.0 if it has.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public float getProgress() throws IOException, InterruptedException {
return (mProcessed) ? (float) 1.0 : (float) 0.0;
}
/**
* All of the internal state is already set on instantiation. This is a no-op.
*
* @param split The InputSplit to read. Unused.
* @param context The context for this task. Unused.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// no-op.
}
/**
* <p>If the file has not already been read, this reads it into memory, so that a call
* to getCurrentValue() will return the entire contents of this file as Text,
* and getCurrentKey() will return the qualified path to this file as Text. Then, returns
* true. If it has already been read, then returns false without updating any internal state.</p>
*
* @return Whether the file was read or not.
* @throws IOException if there is an error reading the file.
* @throws InterruptedException if there is an error.
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!mProcessed) {
if (mFileLength > (long) Integer.MAX_VALUE) {
throw new IOException("File is longer than Integer.MAX_VALUE.");
}
byte[] contents = new byte[(int) mFileLength];
FileSystem fs = mFileToRead.getFileSystem(mConf);
FSDataInputStream in = null;
try {
// Set the contents of this file.
in = fs.open(mFileToRead);
IOUtils.readFully(in, contents, 0, contents.length);
mFileText.set(contents, 0, contents.length);
} finally {
IOUtils.closeQuietly(in);
}
mProcessed = true;
return true;
}
return false;
}
}
I want every mapper to parse multiple small files, and each small file must not be split.
However, the above code still cuts (splits) my input files and raises a parsing error (since my parser splits each line into tokens).
In my understanding, CombineFileInputFormat gathers multiple files into one split, and each split feeds one mapper. Therefore, one mapper can handle multiple files.
In my code, the maximum input split size is set to 25 MB, so I think the problem is that CombineFileInputFormat splits the last part of a small file in the input split to satisfy the split size limit.
However, I have overridden isSplitable to return false, yet it still splits the small files.
What is the correct way to do this?
Is it possible to specify the number of files per mapper, rather than specifying an input split size?
Use the setMaxSplitSize() method in your constructor code; it should work.
It effectively tells CombineFileInputFormat the split size:
public class CFInputFormat extends CombineFileInputFormat<FileLineWritable, Text> {
public CFInputFormat(){
super();
setMaxSplitSize(67108864); // 64 MB, default block size on hadoop
}
public RecordReader<FileLineWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException{
return new CombineFileRecordReader<FileLineWritable, Text>((CombineFileSplit)split, context, CFRecordReader.class);
}
@Override
protected boolean isSplitable(JobContext context, Path file){
return false;
}
}
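Applied to the input format from the question, that would look something like this (a sketch, reusing the same imports as the CombineSmallfileInputFormat above; the 64 MB value is just an example that is comfortably larger than any single small file):
public class CombineSmallfileInputFormat extends CombineFileInputFormat<NullWritable, Text> {

    public CombineSmallfileInputFormat() {
        super();
        setMaxSplitSize(67108864); // 64 MB; replaces the mapred.max.split.size settings in the driver
    }

    @Override
    public RecordReader<NullWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        return new CombineFileRecordReader<NullWritable, Text>((CombineFileSplit) split, context,
                WholeFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}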
