Update column value for a specified time range in Hive table - hadoop

A Hive table "Employee" contains a column "timerange", and the data is
timerange
1:10
1:13
1:17
1:21
1:26
If the last digit range is between (0 & 4), the data must be updated as 0. If the last digit range is between (5 & 9) must be updated as 5.
Expected output is
timerange
1:10
1:10
1:15
1:20
1:25
How can I do this?

You can do this through built-in string manipulation:
-- Round the final digit of timerange down to the nearest multiple of 5:
-- last digits 0-4 map to 0, last digits 5-9 map to 5.
SELECT CONCAT(SUBSTRING(timerange, 1, LENGTH(timerange) - 1),
              CASE WHEN SUBSTRING(timerange, LENGTH(timerange)) < "5"
                   THEN "0"
                   ELSE "5"
              END) AS timerange
FROM Employee;

You can create a generic UDF (GenericUDF).
Here is a sample UDF:
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
public class TimeRangeConverter GenericUDF {
#Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
if (arguments.length != 1) {
throw new UDFArgumentLengthException("The function time_range_converter(time_rage) requires 1 argument.");
}
ObjectInspector timeRangeVal = arguments[0];
if (!(timeRangeVal instanceof StringObjectInspector)) {
throw new UDFArgumentException("First argument must be of type String (time_range as String)");
}
return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
}
#Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
String timeRangeVal = (String) ObjectInspectorUtils.copyToStandardJavaObject(arguments[0].get(),
PrimitiveObjectInspectorFactory.javaStringObjectInspector);
char[] characters = timeRangeVal.toCharArray();
if (characters[characters.length - 1] > '5') {
characters[characters.length - 1] = '5';
} else {
characters[characters.length - 1] = '0';
}
return String.valueOf(characters);
}
#Override
public String getDisplayString(String[] arguments) {
assert (arguments.length == 1);
return "time_range_converter(" + arguments[0] + ")";
}
}
Call the Hive update statement like:
-- BUG FIX: the original was missing the space between the function name and AS
CREATE TEMPORARY FUNCTION time_range_converter AS 'TimeRangeConverter';
-- NOTE(review): UPDATE requires the table to be transactional (ACID) -- verify
UPDATE
    Employee
SET
    timerange = time_range_converter(timerange);

Related

How to create UDF in pig for categorize columns with respect to another filed

I want to categorize one column with respect to other column using UDF in pig.
Data i have
Id,name,age
1,jhon,31
2,adi,15
3,sam,25
4,lina,28
Expected output
1,jhon,31,30-35
2,adi,15,10-15
3,sam,25,20-25
4,lina,28,25-30
Please suggest
You can do this without a UDF. Assuming you have loaded the data to a relation A.
-- Bucket each age into a 5-wide range. Ages that are an exact multiple of 5
-- are treated as the upper bound of their bucket (e.g. 25 -> 20-25).
-- NOTE(review): referencing fields as A.Id / B.Id inside a FOREACH of that
-- same relation may need to be written as plain Id depending on the Pig
-- version -- verify in grunt before relying on this.
B = FOREACH A GENERATE A.Id,A.name,A.age,(A.age%5 == 0 ? A.age-5 : (A.age/5)*5) as lower_age,(A.age%5 == 0 ? A.age : ((A.age/5)*5) + 5) as upper_age;
-- Build the "lower-upper" label by casting both bounds to chararray.
C = FOREACH B GENERATE B.Id,B.name,B.age,CONCAT(CONCAT((chararray)lower_age,'-'),(chararray)upper_age);
DUMP C;
you can create pig udfs in eclipse
create a project in eclipse with pig jars and try below code
package com;
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
public class Age extends EvalFunc<String>{
#Override
public String exec(Tuple a) throws IOException {
// TODO Auto-generated method stub
if(a == null || a.size() == 0){
return null;
}
try{
Object object = a.get(0);
if(object == null){
return null;
}
int i = (Integer) object;
if(i >= 10 && i <= 20 ){
return "10-20";
}
else if (i >= 21 && i <= 30){
return "20-30";
}
else
return ">30";
} catch (ExecException e){
throw new IOException(e);
}
}
}
Now export the project as jar and register it in pig shell
REGISTER <path of your .jar file>
Define it with package and class.
-- Bind a short alias to the fully-qualified UDF class.
DEFINE U com.Age();
-- Load the comma-separated input with a typed schema.
a = LOAD '<input path>' using PigStorage(',') as (id:int,name:chararray,age:int);
-- Emit each row with the age-range label produced by the UDF.
b = FOREACH a GENERATE id,name,age,U(age);

How to find the address where width and height are stored inside an mp4 file?

I need to find the addresses where the width and height are stored, but the IUT version of the standard don't give a clear definition of the file format.
What I found so far... :
Both values are stored in "a QuickTime float". I couldn't find the format, but it seems it use two 16-bits integer: a signed one followed by an unsigned one.
Unlike many file format, there are no fixed position, so it is file specific. It depend on the TrackHeaderBox address.
What I desperately need:
A clear canonical answer describing the places to find only those kind of information. I don't want answers only referring to third party libraries (unless they are written in proper JavaScript). Some pseudo C like structures can help.
There is no fixed position. You need to parse into the file. Please check this Java example.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
public class GetHeight {
    /**
     * Usage: java GetHeight &lt;file.mp4&gt;
     * Scans the MP4 box structure and prints the video track's width/height.
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources: the original never closed the stream
        try (FileInputStream fis = new FileInputStream(new File(args[0]))) {
            GetHeight ps = new GetHeight();
            ps.find(fis);
        }
    }

    // payload of the most recently seen "tkhd" box; the width and height
    // are read from its last 8 bytes once a video "hdlr" box is found
    byte[] lastTkhd;

    /**
     * Walks the MP4 box tree looking for the video track's "tkhd" box and
     * prints the width/height stored there as 16.16 fixed-point values.
     */
    private void find(InputStream fis) throws IOException {
        while (fis.available() > 0) {
            byte[] header = new byte[8];
            readFully(fis, header);
            long size = readUint32(header, 0);
            String type = new String(header, 4, 4, "ISO-8859-1");
            if (containers.contains(type)) {
                // container boxes hold child boxes directly after the header
                find(fis);
            } else if (type.equals("tkhd")) {
                lastTkhd = new byte[(int) (size - 8)];
                readFully(fis, lastTkhd);
            } else if (type.equals("hdlr")) {
                byte[] hdlr = new byte[(int) (size - 8)];
                readFully(fis, hdlr);
                // handler type "vide" => the preceding tkhd belongs to the video track
                if (hdlr[8] == 0x76 && hdlr[9] == 0x69 && hdlr[10] == 0x64 && hdlr[11] == 0x65) {
                    System.out.println("Video Track Header identified");
                    System.out.println("width: " + readFixedPoint1616(lastTkhd, lastTkhd.length - 8));
                    System.out.println("height: " + readFixedPoint1616(lastTkhd, lastTkhd.length - 4));
                    System.exit(1);
                }
            } else {
                fis.skip(size - 8);
            }
        }
    }

    /**
     * Reads exactly b.length bytes.
     * BUG FIX: a single InputStream.read(byte[]) call may return fewer bytes
     * than requested; the original silently parsed partially-filled buffers.
     */
    private static void readFully(InputStream in, byte[] b) throws IOException {
        int off = 0;
        while (off < b.length) {
            int n = in.read(b, off, b.length - off);
            if (n < 0) {
                throw new IOException("Unexpected end of stream");
            }
            off += n;
        }
    }

    /**
     * Reads a big-endian unsigned 32-bit integer starting at offset s.
     * BUG FIX: each byte is masked to a long before shifting so the sign bit
     * of b[s] cannot sign-extend into the upper 32 bits of the result (the
     * original used the int constant 0xFF000000 and returned negative values
     * for sizes >= 0x80000000).
     */
    public static long readUint32(byte[] b, int s) {
        return ((b[s] & 0xFFL) << 24)
                | ((b[s + 1] & 0xFFL) << 16)
                | ((b[s + 2] & 0xFFL) << 8)
                | (b[s + 3] & 0xFFL);
    }

    /** Interprets 4 bytes at offset s as a 16.16 fixed-point number. */
    public static double readFixedPoint1616(byte[] b, int s) {
        return ((double) readUint32(b, s)) / 65536;
    }

    // box types that contain child boxes we must descend into
    List<String> containers = Arrays.asList(
            "moov",
            "mdia",
            "trak"
    );
}

why Array Index Out Of Bound Exception while re arranging doc file paragraph?

Here is a code snippet. It's throwing an ArrayIndexOutOfBoundsException, and I don't know why.
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
public class wordcount
{
    /**
     * Reads a .doc file and prints each paragraph's words to the console,
     * at most 12 words per paragraph.
     */
    public static void main(String[] args) throws Exception
    {
        File file = new File("E:\\myFiles\\abc.doc");
        FileInputStream fis = new FileInputStream(file.getAbsolutePath());
        HWPFDocument document = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(document);
        String[] fileData = extractor.getParagraphText();
        for (int i = 0; i < fileData.length; i++)
        {
            String[] paraword = fileData[i].split(" ");
            // BUG FIX: every check below used paraword[i].length() -- indexing
            // the word array with the paragraph counter i, which throws
            // ArrayIndexOutOfBoundsException as soon as i exceeds the word
            // count of the current paragraph. The intent was the number of
            // words in the paragraph, i.e. paraword.length.
            if (paraword.length == 0)
            {
                System.out.println("\n");
            }
            else if (paraword.length < 12)
            {
                // print every word of a short paragraph
                for (int k = 0; k < paraword.length; k++)
                {
                    System.out.println(paraword[k]);
                }
            }
            else
            {
                // print only the first 12 words of a long paragraph
                for (int k = 0; k < 12; k++)
                {
                    System.out.println(paraword[k]);
                }
            }
            System.out.println("\n");
        }
    }
}
This is the image of the abc.doc file
Note : Expected output will be printed on java console.
and the output will contain 12 words in each line. But after executing first line the error occurs.
Any help would be appreciated
TIA
Honestly, I'm not familiar with the apache.org API, but just by looking at your logic it looks like you want to replace every instance of:
paraword[i].length()
with:
paraword.length
Because it looks like you want to check how many words are in the paragraph and not how long the first word of the paragraph is. Correct me if I'm wrong, but I think that will fix you up.
Here is the correct code snippet
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class ExtractWordDocument
{
    /**
     * Reads the .doc file and returns its text with short paragraphs kept on
     * one line and long paragraphs re-flowed at roughly 75 characters.
     *
     * @return the re-flowed document text
     * @throws IOException if the file cannot be read or parsed
     */
    public String myString() throws IOException
    {
        File file = new File("PATH FOR THE .doc FILE");
        // try-with-resources: the original leaked the FileInputStream
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath()))
        {
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            String[] fileData = extractor.getParagraphText();
            ArrayList<Object> entireDoc = new ArrayList<>();
            for (int i = 0; i < fileData.length; i++)
            {
                String[] paraword = fileData[i].split("\\s+");
                if (paraword.length == 0)
                {
                    entireDoc.add("\n");
                }
                else if (paraword.length < 12)
                {
                    for (int k = 0; k < paraword.length; k++)
                    {
                        entireDoc.add(paraword[k] + " ");
                    }
                }
                else
                {
                    // BUG FIX: the original tested "paraword.length > 12" here,
                    // so paragraphs of exactly 12 words matched no branch and
                    // were silently dropped.
                    java.util.List<String> arrAsList = Arrays.asList(paraword);
                    String formatedString = arrAsList.toString()
                            .replace(",", "")  //remove the commas
                            .replace("[", "")  //remove the right bracket
                            .replace("]", ""); //remove the left bracket
                    StringBuilder sb = new StringBuilder(formatedString);
                    int i1 = 0;
                    // replace the first space at/after every 75 chars with a newline
                    while ((i1 = sb.indexOf(" ", i1 + 75)) != -1)
                    {
                        sb.replace(i1, i1 + 1, "\n");
                    }
                    entireDoc.add(sb.toString());
                }
                entireDoc.add("\n");
            }
            String formatedString = entireDoc.toString()
                    .replace(",", "")  //remove the commas
                    .replace("[", "")  //remove the right bracket
                    .replace("]", ""); //remove the left bracket
            return formatedString;
        }
    }

    public static void main(String[] args)
    {
        try {
            System.out.print(new ExtractWordDocument().myString());
        }
        catch (IOException ioe) { System.out.print(ioe); }
    }
}
Note: this code will not print 12 words per line; instead it wraps the output at 75 characters per line.

Splitting a tuple into multiple tuples in Pig

I like to generate multiple tuples from a single tuple. What I mean is:
I have file with following data in it.
>> cat data
ID | ColumnName1:Value1 | ColumnName2:Value2
so I load it by the following command
grunt >> A = load '$data' using PigStorage('|');
grunt >> dump A;
(ID,ColumnName1:Value1,ColumnName2:Value2)
Now I want to split this tuple into two tuples.
(ID, ColumnName1, Value1)
(ID, ColumnName2, Value2)
Can I use UDF along with foreach and generate. Some thing like the following?
grunt >> foreach A generate SOMEUDF(A)
EDIT:
input tuple : (id1,column1,column2)
output : two tuples (id1,column1) and (id2,column2) so it is List or should I return a Bag?
// NOTE(review): incomplete sketch from the question -- the try block has no
// return statement yet, so this class does not compile as-is. The intended
// body would build output tuples with TupleFactory and return them.
public class SPLITTUPPLE extends EvalFunc <List<Tuple>>
{
// Called once per input tuple; returns null for null/empty input.
public List<Tuple> exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
try{
// not sure how whether I can create tuples on my own. Looks like I should use TupleFactory.
// return list of tuples.
}catch(Exception e){
// wraps any failure as an IOException, as Pig's EvalFunc contract expects
throw WrappedIOException.wrap("Caught exception processing input row ", e);
}
}
}
Is this approach correct?
You could write a UDF or use a PIG script with built-in functions.
For example:
-- data should be chararray, PigStorage('|') return bytearray which will not work for this example
inpt = load '/pig_fun/input/single_tuple_to_multiple.txt' as (line:chararray);
-- split by | and create a row so we can dereference it later
splt = foreach inpt generate FLATTEN(STRSPLIT($0, '\\|')) ;
-- first column is id, rest is converted into a bag and flatten it to make rows
id_vals = foreach splt generate $0 as id, FLATTEN(TOBAG(*)) as value;
-- there will be records with (id, id), but id should not have ':'
id_vals = foreach id_vals generate id, INDEXOF(value, ':') as p, STRSPLIT(value, ':', 2) as vals;
-- keep only real column:value entries (p == -1 marks the duplicated id rows)
-- and split each one into separate (col, val) fields
final = foreach (filter id_vals by p != -1) generate id, FLATTEN(vals) as (col, val);
dump final;
Test INPUT:
1|c1:11:33|c2:12
234|c1:21|c2:22
33|c1:31|c2:32
345|c1:41|c2:42
OUTPUT
(1,c1,11:33)
(1,c2,12)
(234,c1,21)
(234,c2,22)
(33,c1,31)
(33,c2,32)
(345,c1,41)
(345,c2,42)
I hope it helps.
Cheers.
Here is the UDF version. I prefer to return a BAG:
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* Converts input chararray "ID|ColumnName1:Value1|ColumnName2:Value2|.." into a bag
* {(ID, ColumnName1, Value1), (ID, ColumnName2, Value2), ...}
*
* Default rows separator is '|' and key value separator is ':'.
* In this implementation white spaces around separator characters are not removed.
* ID can be made of any character (including sequence of white spaces).
* #author
*
*/
public class TupleToBagColumnValuePairs extends EvalFunc<DataBag> {
private static final TupleFactory tupleFactory = TupleFactory.getInstance();
private static final BagFactory bagFactory = BagFactory.getInstance();
//Row separator character. Default is '|'.
private String rowsSeparator;
//Column value separator character. Default i
private String columnValueSeparator;
public TupleToBagColumnValuePairs() {
this.rowsSeparator = "\\|";
this.columnValueSeparator = ":";
}
public TupleToBagColumnValuePairs(String rowsSeparator, String keyValueSeparator) {
this.rowsSeparator = rowsSeparator;
this.columnValueSeparator = keyValueSeparator;
}
/**
* Creates a tuple with 3 fields (id:chararray, column:chararray, value:chararray)
* #param outputBag Output tuples (id, column, value) are added to this bag
* #param id
* #param column
* #param value
* #throws ExecException
*/
protected void addTuple(DataBag outputBag, String id, String column, String value) throws ExecException {
Tuple outputTuple = tupleFactory.newTuple();
outputTuple.append(id);
outputTuple.append(column);
outputTuple.append( value);
outputBag.add(outputTuple);
}
/**
* Takes column{separator}value from splitInputLine, splits id into column value and adds them to the outputBag as (id, column, value)
* #param outputBag Output tuples (id, column, value) should be added to this bag
* #param id
* #param splitInputLine format column{separator}value, which start from index 1
* #throws ExecException
*/
protected void parseColumnValues(DataBag outputBag, String id,
String[] splitInputLine) throws ExecException {
for (int i = 1; i < splitInputLine.length; i++) {
if (splitInputLine[i] != null) {
int columnValueSplitIndex = splitInputLine[i].indexOf(this.columnValueSeparator);
if (columnValueSplitIndex != -1) {
String column = splitInputLine[i].substring(0, columnValueSplitIndex);
String value = null;
if (columnValueSplitIndex + 1 < splitInputLine[i].length()) {
value = splitInputLine[i].substring(columnValueSplitIndex + 1);
}
this.addTuple(outputBag, id, column, value);
} else {
String column = splitInputLine[i];
this.addTuple(outputBag, id, column, null);
}
}
}
}
/**
* input - contains only one field of type chararray, which will be split by '|'
* All inputs that are: null or of length 0 are ignored.
*/
#Override
public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() != 1 || input.isNull(0)) {
return null;
}
String inputLine = (String)input.get(0);
String[] splitInputLine = inputLine.split(this.rowsSeparator, -1);
if (splitInputLine.length > 1 && splitInputLine[0].length() > 0) {
String id = splitInputLine[0];
DataBag outputBag = bagFactory.newDefaultBag();
if (splitInputLine.length == 1) { // there is just an id in the line
this.addTuple(outputBag, id, null, null);
} else {
this.parseColumnValues(outputBag, id, splitInputLine);
}
return outputBag;
}
return null;
}
#Override
public Schema outputSchema(Schema input) {
try {
if (input.size() != 1) {
throw new RuntimeException("Expected input to have only one field");
}
Schema.FieldSchema inputFieldSchema = input.getField(0);
if (inputFieldSchema.type != DataType.CHARARRAY) {
throw new RuntimeException("Expected a CHARARRAY as input");
}
Schema tupleSchema = new Schema();
tupleSchema.add(new Schema.FieldSchema("id", DataType.CHARARRAY));
tupleSchema.add(new Schema.FieldSchema("column", DataType.CHARARRAY));
tupleSchema.add(new Schema.FieldSchema("value", DataType.CHARARRAY));
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), tupleSchema, DataType.BAG));
} catch (FrontendException exx) {
throw new RuntimeException(exx);
}
}
}
Here is how it is used in PIG:
-- Make the jar containing the UDF available to Pig.
register 'path to the jar';
-- Bind a short alias to the fully-qualified UDF class.
define IdColumnValue myPackage.TupleToBagColumnValuePairs();
inpt = load '/pig_fun/input/single_tuple_to_multiple.txt' as (line:chararray);
-- FLATTEN turns the bag returned by the UDF into one output row per tuple.
result = foreach inpt generate FLATTEN(IdColumnValue($0)) as (id1, c2, v2);
dump result;
A good inspiration for writing UDFs with bags see DataFu source code by LinkedIn
You could use TransposeTupleToBag (UDF from DataFu lib) on the output of STRSPLIT to get the bag, and then FLATTEN the bag to create separate row per original column.

Problems during counting strings in the txt file

I am developing a program which reads a text file and creates a report. The content of the report is the following: the number of every string in the file, its "status", and some characters from the beginning of every string. It works well with files up to 100 MB.
But when I run the program with input files which are bigger than 1,5Gb in size and contain more than 100000 lines, I get the following error:
> Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
> at java.util.Arrays.copyOfRange(Unknown Source) at
> java.lang.String.<init>(Unknown Source) at
> java.lang.StringBuffer.toString(Unknown Source) at
> java.io.BufferedReader.readLine(Unknown Source) at
> java.io.BufferedReader.readLine(Unknown Source) at
> org.apache.commons.io.IOUtils.readLines(IOUtils.java:771) at
> org.apache.commons.io.IOUtils.readLines(IOUtils.java:723) at
> org.apache.commons.io.IOUtils.readLines(IOUtils.java:745) at
> org.apache.commons.io.FileUtils.readLines(FileUtils.java:1512) at
> org.apache.commons.io.FileUtils.readLines(FileUtils.java:1528) at
> org.apache.commons.io.ReadFileToListSample.main(ReadFileToListSample.java:43)
I increased VM arguments up to -Xms128m -Xmx1600m (in eclipse run configuration) but this did not help. Specialists from OTN forum advised me to read some books and improve my program's performance. Could anybody help me to improve it? Thank you.
code:
import org.apache.commons.io.FileUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintStream;
import java.util.List;
public class ReadFileToList {
    // line lengths that are considered valid records
    // (the named constants from the original: c_u_length_f, c_ea_length_f,
    //  c_a_length_f, c_u_length_e, c_ea_length_e, c_a_length_e, h_ext/t_ext)
    private static final int[] VALID_LENGTHS = {12, 13, 2130, 3430, 1331, 442, 6};

    /** Returns "ok" when the length matches one of the known record lengths. */
    private static String status(int length) {
        for (int valid : VALID_LENGTHS) {
            if (length == valid) {
                return "ok";
            }
        }
        return "fail";
    }

    /**
     * Streams test_in.txt line by line and writes one report line per input
     * line (number, length, status, cleaned-up prefix) to test_out.txt.
     */
    public static void main(String[] args) throws FileNotFoundException
    {
        File fileOut = new File("D:\\Docs\\test_out.txt");
        PrintStream ps = new PrintStream(new FileOutputStream(fileOut));
        System.setOut(ps);
        File file = new File("D:\\Docs\\test_in.txt");
        // BUG FIX: stream the file once with LineNumberReader instead of first
        // loading the whole file via FileUtils.readLines (the cause of the
        // OutOfMemoryError on multi-GB inputs) and then reading it a second
        // time through the reader.
        try (LineNumberReader lnr = new LineNumberReader(new FileReader(file))) {
            String line;
            while ((line = lnr.readLine()) != null) {
                // guard: substring(0, 38) threw StringIndexOutOfBoundsException
                // on lines shorter than 38 characters
                String begin = line.length() >= 38 ? line.substring(0, 38) : line;
                begin = begin.replace("\u0000", " ").replaceAll(" +", " ");
                int length = line.length();
                // BUG FIX: the original "+ line_num + stringlenght" summed the
                // two ints before concatenating; print the fields separated.
                System.out.println(lnr.getLineNumber() + " " + length + " " + status(length) + " " + begin);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Also, specialists from OTN said that this program opens the input and reads it twice. Maybe there are some mistakes in the "for" statement? But I can't find them.
Thank you.
You're declaring variables inside the loop and doing a lot of uneeded work, including reading the file twice - not good for peformance either. You can use the line number reader to get the line number and the text and reuse the line variable (declared outside the loop). Here's a shortened version that does what you need. You'll need to complete the validLength method to check all the values since I included only the first couple of tests.
import java.io.*;
public class TestFile {
    /**
     * Returns "ok" when the line length is one of the valid record lengths
     * (the complete set of constants from the original question), else "fail".
     */
    private static String validLength(int length) {
        switch (length) {
            case 12: case 13: case 2130: case 3430: case 1331: case 442: case 6:
                return "ok";
            default:
                return "fail";
        }
    }

    /**
     * args[0] = input file, args[1] = output report file.
     * Writes one line per input line: number, length, status, cleaned prefix.
     */
    public static void main(String[] args) {
        // try-with-resources: the original never closed the reader/writer
        try (LineNumberReader lnr = new LineNumberReader(new FileReader(args[0]));
             BufferedWriter out = new BufferedWriter(new FileWriter(args[1]))) {
            String line;
            while (null != (line = lnr.readLine())) {
                int length = line.length();
                // guard: substring(0, 38) throws on lines shorter than 38 chars
                String snippet = length >= 38 ? line.substring(0, 38) : line;
                snippet = snippet.replace("\u0000", " ");
                // BUG FIX: collapse runs of spaces; the original used
                // replace("+", " "), which replaced literal plus signs instead.
                snippet = snippet.replaceAll(" +", " ");
                // BUG FIX: "getLineNumber() + length" summed the two ints
                // before concatenating; write the fields with separators.
                out.write(lnr.getLineNumber() + " " + length + " " + validLength(length) + " " + snippet);
                out.newLine();
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Call this as java TestFile D:\Docs\test_in.txt D:\Docs\test_in.txt or replace the args[0] and args[1] with the file names if you want to hard code them.

Resources