FileNet exception "An exception occurred during a read of the RenditionEngineConnection" - filenet-p8

I am trying to publish a document using IBM FileNet.
I used the manual:
https://www.ibm.com/support/knowledgecenter/SSNW2F_5.2.0/com.ibm.p8.ce.dev.ce.doc/publish_procedures.htm
But I got an exception "An exception occurred during a read of the RenditionEngineConnection".
What's my mistake?
How should we set up "FileNet P8 Rendition Engine"?
https://www.ibm.com/support/knowledgecenter/it/SSNW2F_5.1.0/com.ibm.p8.installingre.doc/p8pic003.htm
My source code:
/**
 * Creates and saves a {@link PublishStyleTemplate} that accepts plain text and
 * Microsoft Office MIME types as input and produces PDF output.
 *
 * @param objectStore {@link ObjectStore} in which the style template is created
 * @param description text used for both the title and the description of the template
 */
public void createPublishStyleTemplate(final ObjectStore objectStore, final String description) {
    // Input MIME types, registered on the template in exactly this order.
    final String[] acceptedMimeTypes = {
        "text/plain",
        "application/msword",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformat",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    };

    PublishStyleTemplate styleTemplate = Factory.PublishStyleTemplate.createInstance(objectStore);
    styleTemplate.set_Title(description);
    styleTemplate.set_Description(description);

    StringList inputFormats = Factory.StringList.createList();
    for (String mimeType : acceptedMimeTypes) {
        inputFormats.add(mimeType);
    }
    styleTemplate.set_InputFormats(inputFormats);

    // ProviderID must use the well-known handler name.
    styleTemplate.set_ProviderID("PublishRequestPDFHandler");
    // PDF transformation
    styleTemplate.set_OutputFormat("application/pdf");
    styleTemplate.save(RefreshMode.REFRESH);
}
/**
 * Creates a {@link PublishTemplate} bound to the given style template, attaches the
 * publish-template XML as its single content element and checks it in as a major version.
 *
 * @param objectStore {@link ObjectStore} in which the publish template is created
 * @param publishStyleTemplate {@link PublishStyleTemplate} referenced by the new template
 * @param description value stored in the template's {@code DocumentTitle} property
 */
public void createPublishTemplate(final ObjectStore objectStore, final PublishStyleTemplate publishStyleTemplate, final String description) {
    // Create a publish template object.
    PublishTemplate pt = Factory.PublishTemplate.createInstance(objectStore);
    pt.set_StyleTemplate(publishStyleTemplate);
    // Set document title for the publish template
    pt.getProperties().putValue("DocumentTitle", description);
    pt.set_Description("test_PublishTemplate");

    // Whether the publication is deleted automatically when the source document is
    // deleted (cascade delete dependency between source document and publication).
    boolean isSourceDependency = true;
    String sourceDependencyValue = isSourceDependency ? "true" : "false";

    // Publish template content.
    String templateXml =
        "<?xml version='1.0'?>" +
        "<publishtemplatecontent>" +
        "<version>2.0.1</version>" +
        "<newinstructions>" +
        "<issourcedependent>" + sourceDependencyValue + "</issourcedependent>" +
        "<outputfolderid>" + "B0FA3471-0000-CD1D-9D5E-B1E6E5E82135" + "</outputfolderid>" +
        "<applyproperties><from>source</from></applyproperties>" +
        "<applysecurity><from>default</from></applysecurity>" +
        "</newinstructions>" +
        "<republishinstructions>" +
        "<versionablerepublishtype>versionandkeep</versionablerepublishtype>" +
        "<nonversionablerepublishtype>addandkeep</nonversionablerepublishtype>" +
        "<applypropertiesfrom>destination</applypropertiesfrom>" +
        "<applysecurityfrom>destination</applysecurityfrom>" +
        "</republishinstructions>" +
        "</publishtemplatecontent>";

    // Create content elements.
    ContentElementList cel = Factory.ContentElement.createList();
    ContentTransfer ctNew = Factory.ContentTransfer.createInstance();
    // The XML declaration names no encoding, so a parser assumes UTF-8. Encode
    // explicitly instead of relying on the JVM's platform-default charset.
    ByteArrayInputStream is =
        new ByteArrayInputStream(templateXml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    ctNew.setCaptureSource(is);
    ctNew.set_RetrievalName("myNewPublishTemplate.xml");
    ctNew.set_ContentType("application/x-filenet-publishtemplate");
    cel.add(ctNew);
    pt.set_ContentElements(cel);

    // Check in publish template as major version.
    pt.checkin(AutoClassify.DO_NOT_AUTO_CLASSIFY, CheckinType.MAJOR_VERSION);
    pt.save(RefreshMode.REFRESH);
}
/**
 * Logs diagnostic details of the inputs, then creates and saves a {@link PublishRequest}
 * that publishes {@code document} with {@code publishTemplate}. The publication name
 * placed in the publish options is the document's name.
 *
 * @param objectStore {@link ObjectStore} in which the request is created
 * @param document source document to publish
 * @param publishTemplate template that drives the publication
 * @return the saved {@link PublishRequest}
 */
public PublishRequest createPublishRequest(
        final ObjectStore objectStore,
        final Document document,
        final PublishTemplate publishTemplate) {
    System.out.println(String.format("Document Id = %s", document.get_Id().toString()));
    System.out.println(String.format("Document MimeType = %s", document.get_MimeType()));
    System.out.println(String.format("Document Name = %s", document.get_Name()));
    System.out.println(String.format("PublishTemplate Id = %s", publishTemplate.get_Id().toString()));
    PublishStyleTemplate publishStyleTemplate = publishTemplate.get_StyleTemplate();
    System.out.println(String.format("PublishStyleTemplate ProviderID = %s", publishStyleTemplate.get_ProviderID()));
    for (Object inputFormat : publishStyleTemplate.get_InputFormats()) {
        System.out.println(String.format("PublishStyleTemplate InputFormat = %s", (String) inputFormat));
    }

    // The document name is user data: escape it so names containing '&', '<', etc.
    // do not produce malformed publish-options XML.
    String publishOpts =
        "<publishoptions><publicationname>"
            + escapeXml(document.get_Name())
            + "</publicationname></publishoptions>";

    PublishRequest publishRequest = Factory.PublishRequest.createInstance(objectStore);
    publishRequest.set_InputDocument(document);
    publishRequest.set_PublishTemplate(publishTemplate);
    publishRequest.setPublishOptions(publishOpts);
    publishRequest.save(RefreshMode.REFRESH);
    return publishRequest;
}

/** Replaces the five XML special characters in {@code text} with their entity references. */
private static String escapeXml(final String text) {
    // String.valueOf keeps the previous behaviour for a null name (the text "null").
    final String raw = String.valueOf(text);
    final StringBuilder escaped = new StringBuilder(raw.length() + 16);
    for (int i = 0; i < raw.length(); i++) {
        final char c = raw.charAt(i);
        switch (c) {
            case '&': escaped.append("&amp;"); break;
            case '<': escaped.append("&lt;"); break;
            case '>': escaped.append("&gt;"); break;
            case '"': escaped.append("&quot;"); break;
            case '\'': escaped.append("&apos;"); break;
            default: escaped.append(c);
        }
    }
    return escaped.toString();
}
Log:
2020-04-14T12:47:57.492 9F0B4BDE PUBL FNRCE0000E - ERROR ERROR: Reading RenditionEngineConnection threw: An unexpected exception occurred.
2020-04-14T12:47:57.493 9F0B4BDE PUBL FNRCE0000I - INFO InvokeVista exception: An exception occurred during a read of the RenditionEngineConnection.
2020-04-14T12:47:57.493 E0476562 PUBL FNRCE0066E - ERROR Failed dispatching PublishRequest row {B0AB7771-0000-CA39-BFFD-BF7A84D30A96}\ncom.filenet.api.exception.EngineRuntimeException: FNRCE0066E: E_UNEXPECTED_EXCEPTION: An unexpected exception occurred.\n at com.filenet.engine.publish.PublishRequestPDFHandler.publishPDF(PublishRequestPDFHandler.java:435)\n at com.filenet.engine.publish.PublishRequestPDFHandler.execute(PublishRequestPDFHandler.java:169)\n at com.filenet.engine.publish.PublishRequestHandlerBase$1.run(PublishRequestHandlerBase.java:226)\n at com.filenet.engine.context.CallState.doAs(CallState.java:236)\n at com.filenet.engine.context.CallState.doAs(CallState.java:153)\n at com.filenet.engine.publish.PublishRequestHandlerBase.executeAs(PublishRequestHandlerBase.java:215)\n at com.filenet.engine.publish.PublishRequestExecutor.loadAndExecuteQueuedRow(PublishRequestExecutor.java:214)\n at com.filenet.engine.queueitem.QueueExecutor.dispatchQueuedRow(QueueExecutor.java:389)\n at com.filenet.engine.queueitem.QueueExecutor.dispatchEvent(QueueExecutor.java:209)\n at com.filenet.engine.queueitem.QueueExecutor.execute(QueueExecutor.java:133)\n at com.filenet.engine.tasks.BackgroundTask.safeExecute(BackgroundTask.java:275)\n at com.filenet.engine.tasks.BackgroundTask$BackgroundTaskPriviledgedExceptionAction.run(BackgroundTask.java:1110)\n at com.filenet.engine.context.CallState.doAsSystem(CallState.java:575)\n at com.filenet.engine.tasks.BackgroundTask.run(BackgroundTask.java:209)\n at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153)\n at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)\n at java.lang.Thread.run(Thread.java:785)\nCaused by: com.filenet.api.exception.EngineRuntimeException: FNRCU0005E: PUBLISH_READING_REC_THREW: An exception occurred during a read of the RenditionEngineConnection.\n at 
com.filenet.engine.publish.PublishRequestHandlerUtil.readRenditionEngineConnection(PublishRequestHandlerUtil.java:199)\n at com.filenet.engine.publish.PublishRequestPDFHandler$InvokeVistaPDF$1.run(PublishRequestPDFHandler.java:128)\n at com.filenet.engine.context.CallState.doAs(CallState.java:236)\n at com.filenet.engine.context.CallState.doAs(CallState.java:153)\n at com.filenet.engine.publish.PublishRequestPDFHandler$InvokeVistaPDF.run(PublishRequestPDFHandler.java:118)\n ... 3 more\nCaused by: com.filenet.api.exception.EngineRuntimeException: FNRCE0066E: E_UNEXPECTED_EXCEPTION: An unexpected exception occurred.\n at com.filenet.engine.publish.PublishRequestHandlerUtil.readRenditionEngineConnection(PublishRequestHandlerUtil.java:120)\n ... 7 more
2020-04-14T12:47:57.494 E0476562 PUBL FNRCE0000I - INFO dispatchFailed: marked queue item: {B0AB7771-0000-CA39-BFFD-BF7A84D30A96} as "poisoned" and will not retry further.

Related

error in controller.php laravel (array_key_exists())

When I try to save changes in Laravel, I see this error:
/**
 * Derive the permission flags of a role item from the keys present in $content.
 *
 * Keys 1-4 stand for "module show", "show", "create" and "edit"; a flag is 1 when
 * its key is present and 0 otherwise.
 *
 * @param array|null $content     permission values keyed by 1..4; null is treated as empty
 * @param string     $module_name name of the module (not used in the visible part of the method)
 */
public function arrangeRoleItem($content, $module_name)
{
    // array_key_exists() requires an array as its second argument; a null $content
    // previously failed with "expects parameter 2 to be array, null given".
    if (!is_array($content)) {
        $content = [];
    }

    $module_show = array_key_exists(1, $content) ? 1 : 0; // Module Show
    $show        = array_key_exists(2, $content) ? 1 : 0; // Show
    $create      = array_key_exists(3, $content) ? 1 : 0; // Create
    $edit        = array_key_exists(4, $content) ? 1 : 0; // Edit
}
error:
Arguments
"array_key_exists() expects parameter 2 to be array, null given"
thanks for responses.
Your $content variable is null, but array_key_exists() expects its second parameter to be an array.
You need to give it a default value, like this:
/**
* Arrange role item
*
* #param array $content
* #param string $module_name
* #return boolean $bool
*/
public function arrangeRoleItem($content = [], $module_name)

Updating an asset in Composer

/**
 * Business network model for the com.biz namespace: two participant types (User, Bank),
 * one asset type (Document) and three transactions.
 */
namespace com.biz
// A user, identified by its name.
participant User identified by name {
o String name
}
// A bank, identified by its name, holding a points balance and an optional
// relationship to a Document.
participant Bank identified by name {
o String name
o Integer points
--> Document document optional
}
// A document asset. docname is the identifying field, i.e. the key under which
// the asset is stored in its registry.
asset Document identified by docname {
o String docname
o String doctype
o String hash
o String from
o String to
}
// Transaction referencing the Document to operate on.
transaction UploadDoc {
--> Document document
}
// Transaction referencing a Document and the Bank it is shared with.
transaction ShareDoc {
--> Document document
--> Bank bank
}
// Transaction with no fields; used to populate demo data.
transaction SetupDemo {
}
Script File :
/**
 * Replaces the submitted document with one registered under the fixed demo
 * identifier 'BC1.jpg'.
 *
 * The previous version assigned a new value to `docname` (the identifying field) and
 * then called `update()`. The registry entry stayed under its original key while the
 * stored `docname` changed, so a later lookup by 'BC1.jpg' failed with "Object with
 * id 'BC1.jpg' not found". An identifier cannot be changed in place: the asset is
 * added under the new identifier and the old entry is removed.
 *
 * NOTE(review): any relationship that still points at the old identifier (e.g.
 * Bank.document) is not rewritten here - confirm whether that is needed.
 *
 * @param {com.biz.UploadDoc} uploadDoc
 * @transaction
 */
async function uploadDoc(uploadDoc) {
    const NEW_ID = 'BC1.jpg';
    const registry = await getAssetRegistry('com.biz.Document');
    const current = uploadDoc.document;

    const renamed = getFactory().newResource('com.biz', 'Document', NEW_ID);
    renamed.doctype = 'BC';
    renamed.hash = '123456';
    renamed.from = 'Bank_1';
    renamed.to = 'User_1';

    if (current.getIdentifier() === NEW_ID) {
        // Already stored under the target identifier: a plain update is enough.
        await registry.update(renamed);
        return;
    }
    // add() rejects if 'BC1.jpg' already exists, which fails the whole transaction.
    await registry.add(renamed);
    await registry.remove(current);
}
/**
 * Marks the submitted document as shared by 'User_1' with the submitted bank and
 * stores the change.
 *
 * `docname` is the identifying field of Document and is deliberately left untouched:
 * overwriting it before `update()` leaves the registry key and the stored `docname`
 * out of sync, so the asset can no longer be found under the name it displays.
 *
 * @param {com.biz.ShareDoc} shareDoc
 * @transaction
 */
async function shareDoc(shareDoc) {
    const sharedDocument = shareDoc.document;
    sharedDocument.doctype = 'BC';
    sharedDocument.hash = '12346';
    sharedDocument.from = 'User_1';
    sharedDocument.to = shareDoc.bank.name;

    const documentRegistry = await getAssetRegistry('com.biz.Document');
    await documentRegistry.update(sharedDocument);
}
/**
 * Populates the network with demo data: one User, two Banks with 100 points each,
 * and two DrivingLicense documents sent from Bank_1 to User_1.
 *
 * @param {com.biz.SetupDemo} setupDemo
 * @transaction
 */
async function setupDemo(setupDemo) {
    const NS = 'com.biz';
    const factory = getFactory();

    const user = factory.newResource(NS, 'User', 'User_1');
    user.name = 'User_1';

    const banks = ['Bank_1', 'Bank_2'].map((bankName) => {
        const bank = factory.newResource(NS, 'Bank', bankName);
        bank.name = bankName;
        bank.points = 100;
        return bank;
    });

    const userRegistry = await getParticipantRegistry(NS + '.User');
    await userRegistry.addAll([user]);

    const bankRegistry = await getParticipantRegistry(NS + '.Bank');
    await bankRegistry.addAll(banks);

    // [docname, hash] pairs; every other field is the same for both documents.
    const documents = [
        ['Pa60.jpg', '12345'],
        ['Pa80.jpg', '123456']
    ].map(([docname, hash]) => {
        const doc = factory.newResource(NS, 'Document', docname);
        doc.docname = docname;
        doc.doctype = 'DrivingLicense';
        doc.hash = hash;
        doc.from = 'Bank_1';
        doc.to = 'User_1';
        return doc;
    });

    const docRegistry = await getAssetRegistry(NS + '.Document');
    await docRegistry.addAll(documents);
}
Issue : First I run transaction SetupDemo. Runs fine.
Next, I run transaction UploadDoc. Runs fine. Pa60.jpg is updated as BC1.jpg. ID given in Playground when submitting the transaction: Pa60.jpg
Next, I want to run the ShareDoc transaction. If I give the ID as BC1.jpg while submitting the transaction, I get the error "Object with id 'BC1.jpg' not found".
Please help me resolve the issue.

Is there a list of documentation for the properties on the objects used in auto completion?

I'm working on setting up an auto completion list and I've been trying to figure out what each property does. Is there more documentation on this object?
Here's what I have gathered so far:
/**
 * One entry of an auto completion list.
 * */
public class AutoCompleteObject {
/**
 * Creates an entry.
 *
 * @param name stored in value
 * @param metadata stored in meta
 * */
public function AutoCompleteObject(name:String = null, metadata:String = null) {
this.value = name;
meta = metadata;
}
/**
 * Value written upon auto completion
 *
 * @see #caption
 * */
public var value:String;
/**
 * The caption is what is shown in the auto completion list as you type the value
 *
 * @see #value
 * */
public var caption:String;
/**
 * Used by the editor to sort the completion list.
 * NOTE(review): the editor treats this as a number, but it is declared as String here - confirm the intended type.
 * */
public var score:String;
/**
 * What is shown to the right of the value or caption if set in the auto complete list
 *
 * */
public var meta:String;
/**
 * Added as a CSS class name to the entry's row in the completion popup
 * */
public var className:String;
/**
 * Internal property used by the editor's sorting algorithm
 * */
public var matchMask:Object;
/**
 * Internal property used by the editor's sorting algorithm
 * */
public var exactMatch:Object;
/**
 * Custom property used only by the editor's snippet completer
 * Option: "rightAlignedText"
 * */
public var type:String;
}
Here's my function for getting autocompletion objects:
/**
 * Wraps each entry of values in an AutoCompleteObject.
 *
 * The former "testing" branch is removed: its flag was a local that was never
 * assigned, so the branch could not run.
 *
 * @param values values to offer for completion; null yields an empty array
 * @param metadataType text stored in each object's meta property
 * @param className value stored in each object's className property
 * @return a new array with one AutoCompleteObject per entry of values
 * */
public function getObjectsFromArray(values:Array, metadataType:String = "attribute", className:String = null):Array {
    var newValues:Array = [];
    var numberOfItems:int = values ? values.length : 0;
    var autoCompleteObject:AutoCompleteObject;

    for (var i:int = 0; i < numberOfItems; i++) {
        autoCompleteObject = new AutoCompleteObject(values[i], metadataType);
        autoCompleteObject.className = className;
        autoCompleteObject.type = "attribute";
        newValues.push(autoCompleteObject);
    }
    return newValues;
}
My question is what do the following properties mean:
score (I'm guessing it's a weighted value)
className
type
Less important:
matchMask
exactMatch
My related questions, if they should be separate questions let me know, are:
- if class name is what I think it is can I show className in the autocomplete list?
- can I sort the list by meta type? so my list is above the built in list?
- Should the strongly typed object I'm using be changed to dynamic type for future proofing? I found the other properties mentioned because errors were thrown when I changed from using Object.
I can post these as separate questions.
score is a number used for sorting https://github.com/ajaxorg/ace/blob/v1.2.6/lib/ace/autocomplete.js#L494
matchMask and exactMatch are internal properties used by the sorting algorithm
className is added to the row as a class name https://github.com/ajaxorg/ace/blob/v1.2.6/lib/ace/autocomplete/popup.js#L190
type is a custom property used only by snippet completer https://github.com/ajaxorg/ace/blob/v1.2.6/lib/ace/ext/language_tools.js#L67

How to receive email with Laravel?

Is there a way to retrieve emails from a Gmail account through Laravel?
I want to create a mail inbox that reads directly from a Gmail or Outlook account.
Thanks for help.
David,
Below is some code that I use to read emails. I believe it can help you:
/**
 * Look up the highest message UID already stored in the emails table.
 *
 * @return mixed result of the max('uid') aggregate on the Email model
 */
private function getLastUID()
{
    // The Email model holds the saved messages; ask it for the largest uid.
    $highestStoredUid = Email::max('uid');

    return $highestStoredUid;
}
/**
 * Open a read-only IMAP connection using the IMAP, IMAP_EMAIL and IMAP_PASSWORD
 * environment settings.
 *
 * @return resource|false whatever imap_open() returns
 */
private function startEmail()
{
    $mailboxSpec = env('IMAP');
    $account     = env('IMAP_EMAIL');
    $password    = env('IMAP_PASSWORD');

    return imap_open($mailboxSpec, $account, $password, OP_READONLY);
}
/**
 * Fetch today's messages whose UID is higher than the highest UID already stored.
 *
 * Messages are visited newest first and the scan stops at the first message that
 * is already stored.
 *
 * @return array|false new emails keyed by message number, or false when the
 *                     mailbox cannot be opened or the search finds nothing
 */
private function getTodayEmails()
{
    $mailbox = $this->startEmail();
    if ($mailbox === false) {
        return false;
    }

    // Only today's emails are searched: this is meant to run from an hourly cron job.
    $today = Carbon::now()->format('j-M-Y');
    $inbox = imap_search($mailbox, 'SINCE '.$today);

    /* If there is no email */
    if ($inbox === false) {
        // Release the connection on this path as well.
        imap_close($mailbox);
        return false;
    }

    // Sort so the newest message comes first.
    rsort($inbox);

    // Nothing is saved inside the loop, so the stored maximum cannot change: query it once.
    $lastUid = $this->getLastUID();

    $emails = [];
    foreach ($inbox as $box) {
        $uid = imap_uid($mailbox, $box);

        // Already stored: every remaining (older) message is skipped too.
        if (!($uid > $lastUid)) {
            break;
        }

        /* get information specific to this email */
        $overview = imap_fetch_overview($mailbox, $box, 0);
        $header = imap_headerinfo($mailbox, $box);

        $emails[$box]['uid'] = $uid;
        $emails[$box]['date'] = (isset($header->udate)) ? date('Y-m-d H:i:s', $header->udate) : null;
        $emails[$box]['subject'] = (isset($overview[0]->subject)) ? $overview[0]->subject : null;
        $emails[$box]['from'] = (isset($header->from[0])) ? $this->extractEmail($header->from[0]) : null;
        $emails[$box]['from_name'] = (isset($header->from[0]->personal)) ? $header->from[0]->personal : null;
        $emails[$box]['to'] = (isset($header->to[0])) ? $this->extractEmail($header->to[0]) : null;
        $emails[$box]['to_name'] = (isset($header->to[0]->personal)) ? $header->to[0]->personal : null;
        $emails[$box]['reply_to'] = (isset($header->reply_to[0])) ? $this->extractEmail($header->reply_to[0]) : null;
        $emails[$box]['reply_name'] = (isset($header->reply_to[0]->personal)) ? $header->reply_to[0]->personal : null;
        /* output the email body */
        $emails[$box]['message'] = $this->getBody($uid, $mailbox);
    }

    imap_close($mailbox);
    return $emails;
}
/**
 * Build the "mailbox@host" address from an IMAP address object.
 *
 * @param object $email address object with mailbox and host properties
 *
 * @return bool|string the address, or false when mailbox or host is missing
 */
private function extractEmail($email)
{
    if (isset($email->mailbox) && isset($email->host)) {
        // An e-mail address joins the local part and the host with '@' (was '#').
        return $email->mailbox.'@'.$email->host;
    }

    return false;
}
/**
 * Return the body of a message, preferring its HTML part over its plain-text part.
 *
 * @param integer $uid  message UID
 * @param mixed   $imap IMAP connection
 *
 * @return bool|string the HTML part, otherwise whatever getPart() yields for TEXT/PLAIN
 */
private function getBody($uid, $imap)
{
    $html = $this->getPart($imap, $uid, "TEXT/HTML");

    // Fall back to the plain-text part when the HTML part is empty.
    return ($html == "")
        ? $this->getPart($imap, $uid, "TEXT/PLAIN")
        : $html;
}
/**
 * Search a message structure depth-first for the first part of the requested MIME
 * type and return its decoded body.
 *
 * @param mixed       $imap       IMAP connection
 * @param integer     $uid        message UID
 * @param string      $mimetype   wanted type, e.g. "TEXT/HTML"
 * @param bool|object $structure  structure node to inspect; fetched for $uid when false
 * @param bool|string $partNumber section number of $structure; false at the top level
 *
 * @return bool|string the decoded part, or false when no part matches
 */
private function getPart($imap, $uid, $mimetype, $structure = false, $partNumber = false)
{
    $node = $structure ?: imap_fetchstructure($imap, $uid, FT_UID);
    if (!$node) {
        return false;
    }

    if ($mimetype == $this->getMimeType($node)) {
        $section = $partNumber ?: 1;
        $raw = imap_fetchbody($imap, $uid, $section, FT_UID);

        // Encoding 3 is base64, 4 is quoted-printable; anything else is returned as is.
        if ($node->encoding == 3) {
            return imap_base64($raw);
        }
        if ($node->encoding == 4) {
            return imap_qprint($raw);
        }
        return $raw;
    }

    // Only multipart nodes (type 1) have children to descend into.
    if ($node->type != 1) {
        return false;
    }

    $prefix = $partNumber ? $partNumber . "." : "";
    foreach ($node->parts as $index => $child) {
        $found = $this->getPart($imap, $uid, $mimetype, $child, $prefix . ($index + 1));
        if ($found) {
            return $found;
        }
    }

    return false;
}
/**
 * Build the "PRIMARY/SUBTYPE" MIME string of a structure node.
 *
 * @param object $structure node with type (numeric) and subtype properties
 *
 * @return string the MIME type, or "TEXT/PLAIN" when the node has no subtype
 */
private function getMimeType($structure)
{
    if (!$structure->subtype) {
        return "TEXT/PLAIN";
    }

    // Primary type names, indexed by the numeric type of the node.
    $primaryTypes = ["TEXT", "MULTIPART", "MESSAGE", "APPLICATION", "AUDIO", "IMAGE", "VIDEO", "OTHER"];

    return $primaryTypes[(int) $structure->type] . "/" . $structure->subtype;
}
On the .env you must insert:
IMAP={imap.gmail.com:993/ssl/novalidate-cert}INBOX
(or you can use {imap.gmail.com:993/imap/ssl}INBOX)
IMAP_EMAIL=<Your GMAIL>
IMAP_PASSWORD=<PASSWORD>
Although you can use some libraries, like https://github.com/barbushin/php-imap. I really believe that it is easy to go straight with raw php (http://php.net/manual/en/book.imap.php).
You can get more information on: https://davidwalsh.name/gmail-php-imap (Where most of the functions are from, btw)
You can use GMAIL API
https://developers.google.com/gmail/api/quickstart/php
You can search or filter files using the messages.list and threads.list methods https://developers.google.com/gmail/api/guides/filtering
GET https://www.googleapis.com/gmail/v1/users/me/messages?q="in:sent after:2014/01/01 before:2014/01/30"

Splitting a tuple into multiple tuples in Pig

I like to generate multiple tuples from a single tuple. What I mean is:
I have file with following data in it.
>> cat data
ID | ColumnName1:Value1 | ColumnName2:Value2
so I load it by the following command
grunt >> A = load '$data' using PigStorage('|');
grunt >> dump A;
(ID,ColumnName1:Value1,ColumnName2:Value2)
Now I want to split this tuple into two tuples.
(ID, ColumnName1, Value1)
(ID, ColumnName2, Value2)
Can I use UDF along with foreach and generate. Some thing like the following?
grunt >> foreach A generate SOMEUDF(A)
EDIT:
input tuple : (id1,column1,column2)
output : two tuples (id1,column1) and (id2,column2) so it is List or should I return a Bag?
/**
 * Skeleton of a Pig UDF meant to turn one input tuple into several tuples.
 * The body of the try block is not written yet, so exec has no return
 * statement on that path.
 */
public class SPLITTUPPLE extends EvalFunc <List<Tuple>>
{
/**
 * Returns null for a null or empty input tuple; the conversion itself is still to be written.
 */
public List<Tuple> exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
try{
// Open question: can tuples be created directly here? It looks like TupleFactory should be used.
// TODO: build and return the list of tuples.
}catch(Exception e){
throw WrappedIOException.wrap("Caught exception processing input row ", e);
}
}
}
Is this approach correct?
You could write a UDF or use a PIG script with built-in functions.
For example:
-- Load each line as a single chararray; loading with PigStorage('|') would yield bytearray fields, which does not work for this example.
inpt = load '/pig_fun/input/single_tuple_to_multiple.txt' as (line:chararray);
-- Split the line on '|' and flatten the result so the pieces can be dereferenced as fields.
splt = foreach inpt generate FLATTEN(STRSPLIT($0, '\\|')) ;
-- Keep the first field as id; put all fields into a bag and flatten it to get one row per field.
id_vals = foreach splt generate $0 as id, FLATTEN(TOBAG(*)) as value;
-- The bag also contains the id itself, giving an (id, id) row. Such a row has no ':' (the id must not contain one),
-- so p is -1 for it and the filter below drops it.
id_vals = foreach id_vals generate id, INDEXOF(value, ':') as p, STRSPLIT(value, ':', 2) as vals;
-- Keep only the column:value rows and expand them to (id, col, val).
final = foreach (filter id_vals by p != -1) generate id, FLATTEN(vals) as (col, val);
dump final;
Test INPUT:
1|c1:11:33|c2:12
234|c1:21|c2:22
33|c1:31|c2:32
345|c1:41|c2:42
OUTPUT
(1,c1,11:33)
(1,c2,12)
(234,c1,21)
(234,c2,22)
(33,c1,31)
(33,c2,32)
(345,c1,41)
(345,c2,42)
I hope it helps.
Cheers.
Here is the UDF version. I prefer to return a BAG:
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* Converts input chararray "ID|ColumnName1:Value1|ColumnName2:Value2|.." into a bag
* {(ID, ColumnName1, Value1), (ID, ColumnName2, Value2), ...}
*
* Default rows separator is '|' and key value separator is ':'.
* In this implementation white spaces around separator characters are not removed.
* ID can be made of any character (including sequence of white spaces).
* #author
*
*/
public class TupleToBagColumnValuePairs extends EvalFunc<DataBag> {
private static final TupleFactory tupleFactory = TupleFactory.getInstance();
private static final BagFactory bagFactory = BagFactory.getInstance();
//Row separator character. Default is '|'.
private String rowsSeparator;
//Column value separator character. Default i
private String columnValueSeparator;
public TupleToBagColumnValuePairs() {
this.rowsSeparator = "\\|";
this.columnValueSeparator = ":";
}
public TupleToBagColumnValuePairs(String rowsSeparator, String keyValueSeparator) {
this.rowsSeparator = rowsSeparator;
this.columnValueSeparator = keyValueSeparator;
}
/**
* Creates a tuple with 3 fields (id:chararray, column:chararray, value:chararray)
* #param outputBag Output tuples (id, column, value) are added to this bag
* #param id
* #param column
* #param value
* #throws ExecException
*/
protected void addTuple(DataBag outputBag, String id, String column, String value) throws ExecException {
Tuple outputTuple = tupleFactory.newTuple();
outputTuple.append(id);
outputTuple.append(column);
outputTuple.append( value);
outputBag.add(outputTuple);
}
/**
* Takes column{separator}value from splitInputLine, splits id into column value and adds them to the outputBag as (id, column, value)
* #param outputBag Output tuples (id, column, value) should be added to this bag
* #param id
* #param splitInputLine format column{separator}value, which start from index 1
* #throws ExecException
*/
protected void parseColumnValues(DataBag outputBag, String id,
String[] splitInputLine) throws ExecException {
for (int i = 1; i < splitInputLine.length; i++) {
if (splitInputLine[i] != null) {
int columnValueSplitIndex = splitInputLine[i].indexOf(this.columnValueSeparator);
if (columnValueSplitIndex != -1) {
String column = splitInputLine[i].substring(0, columnValueSplitIndex);
String value = null;
if (columnValueSplitIndex + 1 < splitInputLine[i].length()) {
value = splitInputLine[i].substring(columnValueSplitIndex + 1);
}
this.addTuple(outputBag, id, column, value);
} else {
String column = splitInputLine[i];
this.addTuple(outputBag, id, column, null);
}
}
}
}
/**
* input - contains only one field of type chararray, which will be split by '|'
* All inputs that are: null or of length 0 are ignored.
*/
#Override
public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() != 1 || input.isNull(0)) {
return null;
}
String inputLine = (String)input.get(0);
String[] splitInputLine = inputLine.split(this.rowsSeparator, -1);
if (splitInputLine.length > 1 && splitInputLine[0].length() > 0) {
String id = splitInputLine[0];
DataBag outputBag = bagFactory.newDefaultBag();
if (splitInputLine.length == 1) { // there is just an id in the line
this.addTuple(outputBag, id, null, null);
} else {
this.parseColumnValues(outputBag, id, splitInputLine);
}
return outputBag;
}
return null;
}
#Override
public Schema outputSchema(Schema input) {
try {
if (input.size() != 1) {
throw new RuntimeException("Expected input to have only one field");
}
Schema.FieldSchema inputFieldSchema = input.getField(0);
if (inputFieldSchema.type != DataType.CHARARRAY) {
throw new RuntimeException("Expected a CHARARRAY as input");
}
Schema tupleSchema = new Schema();
tupleSchema.add(new Schema.FieldSchema("id", DataType.CHARARRAY));
tupleSchema.add(new Schema.FieldSchema("column", DataType.CHARARRAY));
tupleSchema.add(new Schema.FieldSchema("value", DataType.CHARARRAY));
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), tupleSchema, DataType.BAG));
} catch (FrontendException exx) {
throw new RuntimeException(exx);
}
}
}
Here is how it is used in PIG:
-- Register the jar that contains the UDF and give the UDF a short alias.
register 'path to the jar';
define IdColumnValue myPackage.TupleToBagColumnValuePairs();
-- Load each line as a single chararray, which is the input the UDF's outputSchema requires.
inpt = load '/pig_fun/input/single_tuple_to_multiple.txt' as (line:chararray);
-- The UDF returns a bag; FLATTEN turns it into one (id, column, value) row per tuple.
result = foreach inpt generate FLATTEN(IdColumnValue($0)) as (id1, c2, v2);
dump result;
For good inspiration on writing UDFs that work with bags, see the DataFu source code by LinkedIn.
You could use TransposeTupleToBag (UDF from DataFu lib) on the output of STRSPLIT to get the bag, and then FLATTEN the bag to create separate row per original column.

Resources