Parsing XML that contains incrementing namespace numbers and multiple namespaces with XPath

I need to parse XML that contains incrementing namespace numbers and multiple namespaces; this XML comes from a service which can't be updated. The original approach was to simply unmarshal it to Java objects and be on our way. The provider uses older tooling (Castor) to create the message, which we have no access to. The plan is to parse it and then marshal/unmarshal it.
<TESTXmlResponse xmlns="TEST/TESTXmlResponse">
  <firstRequest>
    <ns1:xmlRequest xmlns:ns1="TEST/XMLRequest">
      <ns2:username xmlns:ns2="TEST/XMLUserNameRequest">
        <ns3:value xmlns:ns3="TEST/XMLValueRequest">test</ns3:value>
      </ns2:username>
    </ns1:xmlRequest>
  </firstRequest>
  <data>
    <ns4:name xmlns:ns4="TEST/XMLConstants">name1</ns4:name>
    <ns5:value xmlns:ns5="TEST/XMLConstants">data1</ns5:value>
  </data>
  <data>
    <ns6:name xmlns:ns6="TEST/XMLConstants">name2</ns6:name>
    <ns7:value xmlns:ns7="TEST/XMLConstants">data2</ns7:value>
  </data>
  <data>
    <ns8:name xmlns:ns8="TEST/XMLConstants">name3</ns8:name>
    <ns9:value xmlns:ns9="TEST/XMLConstants">data3</ns9:value>
  </data>
</TESTXmlResponse>
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Iterator;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class Main
{
    public static void main(String[] args) throws Exception
    {
        ArrayList<String> constants = new ArrayList<String>();
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new FileInputStream(new File("constants.xml")));
        // Get XPath expression
        XPathFactory xpathfactory = XPathFactory.newInstance();
        XPath xpath = xpathfactory.newXPath();
        xpath.setNamespaceContext(new NamespaceResolver(doc));
        XPathExpression expr =
            xpath.compile("//firstRequest/ns1:xmlRequest/ns2:username/ns3:value/text()");
        Object result = expr.evaluate(doc, XPathConstants.NODESET);
        NodeList nodes = (NodeList) result;
        for (int i = 0; i < nodes.getLength(); i++) {
            constants.add(nodes.item(i).getNodeValue());
        }
        if (constants.size() > 0) {
            System.out.println(constants);
        }
    }

    static class NamespaceResolver implements NamespaceContext
    {
        private Document sourceDocument;

        public NamespaceResolver(Document document) {
            sourceDocument = document;
        }

        public String getNamespaceURI(String prefix) {
            if (prefix.equals(XMLConstants.DEFAULT_NS_PREFIX)) {
                return sourceDocument.lookupNamespaceURI(null);
            } else {
                return sourceDocument.lookupNamespaceURI(prefix);
            }
        }

        public String getPrefix(String namespaceURI) {
            return sourceDocument.lookupPrefix(namespaceURI);
        }

        @SuppressWarnings("rawtypes")
        public Iterator getPrefixes(String namespaceURI) {
            return null;
        }
    }
}
The NodeList returned from the expression is where the issue lies: it isn't null, but its length is zero.
I've looked at several examples, and this one seems to be the closest to a solution. The XPathExpression expr appears to be the issue. Refining it for each case would seem to be a reasonable approach.
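For what it's worth, two things stand out in the code above: the resolver looks prefixes up against the Document node, which only sees declarations on the root element (ns1 through ns9 are declared on the nested elements where they are used), and the unprefixed firstRequest step matches elements in no namespace even though that element sits in the default namespace. Here is a minimal sketch that sidesteps both issues by matching on local names only; this is a suggested workaround, not part of the original code:

// A namespace-agnostic variant of the same query: local-name() ignores
// prefixes entirely, so the incrementing ns1..ns9 declarations and the
// default namespace no longer matter. No NamespaceContext is needed.
XPathExpression expr = xpath.compile(
    "//*[local-name()='firstRequest']"
    + "/*[local-name()='xmlRequest']"
    + "/*[local-name()='username']"
    + "/*[local-name()='value']/text()");

This trades namespace safety for robustness against machine-generated prefixes, which seems like an acceptable trade here since the provider's tooling invents a fresh prefix for each element.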


How to use iText7 to convert HTML to PDF with mixed page orientations while preserving the CSS

I need to change the page orientation of some parts of an HTML document. I followed the accepted answer for this question, but the resulting PDF document had neither header nor footer (actually, no style at all).
Here is my first attempt:
package my.awesome.app.html2pdf;
import static com.itextpdf.styledxmlparser.css.media.MediaType.PRINT;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.html2pdf.attach.ITagWorker;
import com.itextpdf.html2pdf.attach.ProcessorContext;
import com.itextpdf.html2pdf.attach.impl.DefaultTagWorkerFactory;
import com.itextpdf.html2pdf.attach.impl.tags.DivTagWorker;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.IPropertyContainer;
import com.itextpdf.layout.element.AreaBreak;
import com.itextpdf.layout.element.Div;
import com.itextpdf.layout.element.IBlockElement;
import com.itextpdf.layout.layout.LayoutContext;
import com.itextpdf.layout.layout.LayoutResult;
import com.itextpdf.layout.renderer.AreaBreakRenderer;
import com.itextpdf.styledxmlparser.css.media.MediaDeviceDescription;
import com.itextpdf.styledxmlparser.node.IElementNode;
@RestController
@RequestMapping("/html2pdf")
public class Html2PdfController {

    @PostMapping(produces = MediaType.APPLICATION_PDF_VALUE)
    public @ResponseBody byte[] convert(@RequestBody String html) throws IOException {
        try (var baos = new ByteArrayOutputStream()) {
            var mediaDeviceDescription = new MediaDeviceDescription(PRINT);
            var converterProperties = new ConverterProperties();
            converterProperties.setMediaDeviceDescription(mediaDeviceDescription);
            var pdfDocument = new PdfDocument(new PdfWriter(baos));
            converterProperties.setTagWorkerFactory(new CustomTagWorkerFactory(pdfDocument));
            var document = new Document(pdfDocument);
            var elements = HtmlConverter.convertToElements(html, converterProperties);
            for (var element : elements) {
                if (element instanceof IBlockElement) {
                    document.add((IBlockElement) element);
                }
            }
            pdfDocument.close();
            return baos.toByteArray();
        }
    }

    private static class CustomTagWorkerFactory extends DefaultTagWorkerFactory {
        PdfDocument pdfDocument;

        public CustomTagWorkerFactory(PdfDocument pdfDocument) {
            this.pdfDocument = pdfDocument;
        }

        @Override
        public ITagWorker getCustomTagWorker(IElementNode tag, ProcessorContext context) {
            if ("landscape".equalsIgnoreCase(tag.name())) {
                return new LandscapeDivTagWorker(tag, context, pdfDocument);
            }
            return null;
        }
    }

    private static class LandscapeDivTagWorker extends DivTagWorker {
        private PdfDocument pdfDocument;

        public LandscapeDivTagWorker(IElementNode tag, ProcessorContext context, PdfDocument pdfDocument) {
            super(tag, context);
            this.pdfDocument = pdfDocument;
        }

        @Override
        public IPropertyContainer getElementResult() {
            IPropertyContainer baseElementResult = super.getElementResult();
            if (baseElementResult instanceof Div) {
                var div = new Div();
                var landscapeAreaBreak = new AreaBreak(new PageSize(PageSize.A4).rotate());
                landscapeAreaBreak.setNextRenderer(
                        new DefaultPageSizeChangingAreaBreakRenderer(landscapeAreaBreak, pdfDocument));
                div.add(landscapeAreaBreak);
                div.add((IBlockElement) baseElementResult);
                var portraitAreaBreak = new AreaBreak(new PageSize(PageSize.A4));
                portraitAreaBreak.setNextRenderer(
                        new DefaultPageSizeChangingAreaBreakRenderer(portraitAreaBreak, pdfDocument));
                div.add(portraitAreaBreak);
                baseElementResult = div;
            }
            return baseElementResult;
        }
    }

    private static class DefaultPageSizeChangingAreaBreakRenderer extends AreaBreakRenderer {
        private PdfDocument pdfDocument;
        private AreaBreak areaBreak;

        public DefaultPageSizeChangingAreaBreakRenderer(AreaBreak areaBreak, PdfDocument pdfDocument) {
            super(areaBreak);
            this.pdfDocument = pdfDocument;
            this.areaBreak = areaBreak;
        }

        @Override
        public LayoutResult layout(LayoutContext layoutContext) {
            pdfDocument.setDefaultPageSize(areaBreak.getPageSize());
            return super.layout(layoutContext);
        }
    }
}
And here is my first result: the document renders, but there is an additional blank page at its end.
After some small changes in the main method:
@PostMapping(produces = MediaType.APPLICATION_PDF_VALUE)
public @ResponseBody byte[] convert(@RequestBody String html) throws IOException {
    try (var baos = new ByteArrayOutputStream()) {
        var mediaDeviceDescription = new MediaDeviceDescription(PRINT);
        var converterProperties = new ConverterProperties();
        converterProperties.setMediaDeviceDescription(mediaDeviceDescription);
        var pdfDocument = new PdfDocument(new PdfWriter(baos));
        converterProperties.setTagWorkerFactory(new CustomTagWorkerFactory(pdfDocument));
        var document = HtmlConverter.convertToDocument(html, pdfDocument, converterProperties);
        document.close();
        pdfDocument.close();
        return baos.toByteArray();
    }
}
I've got this PDF with nice headers and footers, except for the rotated pages. Also, only the first page in a series is in landscape mode.
Actually the landscape pages do have footers, but they are misaligned.
Is there a way to achieve this using iText7 HTML to PDF conversion?
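For comparison, here is a minimal sketch of the single-call conversion path. My assumption (not confirmed here) is that convertToElements processes the HTML without a document context, so document-level CSS such as @page headers and footers gets dropped, while the convertToPdf overload below keeps it, at the cost of the custom orientation handling:

// Baseline conversion: one call, whole document, @page rules applied.
// No orientation switching happens here; this only shows the
// style-preserving path.
try (var baos = new ByteArrayOutputStream()) {
    var converterProperties = new ConverterProperties();
    converterProperties.setMediaDeviceDescription(new MediaDeviceDescription(PRINT));
    HtmlConverter.convertToPdf(html, new PdfWriter(baos), converterProperties);
    return baos.toByteArray();
}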

Find reversed names on a file with hadoop mapreduce

Hello, I have this file http://aminer.org/lab-datasets/citation/citation-network1.zip and I need to find the names of authors who have publications with exactly two authors, where the two names appear in reversed order on at least one of those publications.
The mapper I made is this one:
package bigdatauom;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text keyAuthors = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer authorslinetok = new StringTokenizer(value.toString(), "#");
while (authorslinetok.hasMoreTokens()) {
String tempLine = authorslinetok.nextToken();
if (tempLine.charAt(0) == '#') {
tempLine = tempLine.substring(1);
StringTokenizer seperateAuthorsTok = new StringTokenizer(tempLine, ",");
ArrayList<String> authors = new ArrayList<String>();
while (seperateAuthorsTok.hasMoreTokens()) {
authors.add(seperateAuthorsTok.nextToken());
}
if (authors.size() == 2){
keyAuthors.set(tempLine);
context.write(keyAuthors, one);
}
}
}
}
}
I need to have 2 instances of the reducer, and I have been working on this project for one week with no result.
Any advice is appreciated; thanks in advance!
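Not a full solution, but here is a minimal reducer sketch of one common approach. It assumes the mapper is changed to emit key = the author's name tokens sorted alphabetically and value = the original spelling; that way "John Smith" and "Smith John" land on the same reducer key with different values:

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReversedNameReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Collect the distinct spellings seen under the same sorted-token key.
        Set<String> spellings = new HashSet<>();
        for (Text v : values) {
            spellings.add(v.toString());
        }
        // More than one spelling with identical sorted tokens means the name
        // appears in reversed order on at least one publication.
        if (spellings.size() > 1) {
            context.write(key, new Text(String.join(" | ", spellings)));
        }
    }
}

The two reducer instances then cost nothing extra: job.setNumReduceTasks(2) partitions the keys across them without changing the logic.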

Is this ClassCastException a HtmlUnit bug?

I'm new to htmlunit (2.23) and I can't get this test to work:
I'm getting this ClassCastException thrown out of HtmlUnit and I don't know if it is a bug, or if I am doing something wrong.
java.lang.ClassCastException: com.gargoylesoftware.htmlunit.TextPage cannot be cast to com.gargoylesoftware.htmlunit.html.HtmlPage
at com.gargoylesoftware.htmlunit.WebClient.makeWebResponseForJavaScriptUrl(WebClient.java:1241)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:375)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:304)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:451)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:436)
at org.wyttenbach.dale.mlec.OutageTest.test(OutageTest.java:46)
...
The code:
import java.awt.Desktop;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.junit.Assert;
import org.junit.Test;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.JavaScriptPage;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class OutageTest {
private static final String SITE_URL = "https://ebill.mlecmn.net/woViewer/";
private static final String OUTAGE_MAP_URL = SITE_URL + "mapviewer.html?config=Outage+Web+Map";
@Test
public void test() throws FailingHttpStatusCodeException, MalformedURLException, IOException {
try (final WebClient webClient = new WebClient()) {
webClient.waitForBackgroundJavaScript(20000);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setUseInsecureSSL(true);
Map<String, Page> urls = new HashMap<String, Page>();
LinkedList<String> urlsToVisit = new LinkedList<String>();
urlsToVisit.add(OUTAGE_MAP_URL);
while (!urlsToVisit.isEmpty()) {
String url = urlsToVisit.remove();
if (urls.containsKey(url)) {
continue;
}
Page page = webClient.getPage(url);
urls.put(url, page);
if (page instanceof HtmlPage) {
HtmlPage page2 = (HtmlPage) page;
System.err.println("================================================================");
System.err.println(page2.asXml());
System.err.println("================================================================");
Assert.assertFalse("Outage in Nordland township: " + url, page2.asText().contains("Nordland"));
urlsToVisit.addAll(extractLinks(page2));
} else if (page instanceof JavaScriptPage) {
JavaScriptPage page2 = (JavaScriptPage) page;
Assert.assertFalse("Outage in Nordland township: " + url, page2.getContent().contains("Nordland"));
} else if (page instanceof TextPage) {
TextPage page2 = (TextPage) page;
Assert.assertFalse("Outage in Nordland township: " + url, page2.getContent().contains("Nordland"));
} else {
System.err.println(String.format("%s => %s", url, page.getClass().getName()));
}
}
} catch (AssertionError e) {
reportOutage();
throw e;
}
}
private Collection<String> extractLinks(HtmlPage page) {
List<String> links = new ArrayList<String>();
for (DomElement x : page.getElementsByTagName("script")) {
String src = x.getAttribute("src");
if (!src.contains(":")) {
src = SITE_URL + src;
System.err.println("script src="+src);
}
links.add(src);
}
for (DomElement x : page.getElementsByTagName("link")) {
String href = x.getAttribute("href");
if (!href.contains(":")) {
href = SITE_URL + href;
System.err.println("link href="+href);
}
links.add(href);
}
// Causes ClassCastException com.gargoylesoftware.htmlunit.TextPage cannot be cast to com.gargoylesoftware.htmlunit.html.HtmlPage
//at com.gargoylesoftware.htmlunit.WebClient.makeWebResponseForJavaScriptUrl(WebClient.java:1241)
for (DomElement x : page.getElementsByTagName("iframe")) {
String src = x.getAttribute("src");
if (!src.contains(":")) {
src = SITE_URL + src;
System.err.println("iframe src="+src);
}
links.add(src);
}
return links;
}
private void reportOutage() {
try {
Desktop.getDesktop().browse(new URI(OUTAGE_MAP_URL));
} catch (Exception e) {
e.printStackTrace();
}
}
}
More or less yes, but I have to do a deeper analysis.
But there is some hope for you ;-)
Your code tries to extract URLs from a given web page. During the process you are adding the URL 'javascript:""' to your list of URLs to be processed. This URL results in this class cast exception. If you do not add this URL to the list, the test works (at least for me).
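A minimal sketch of that filter, as a guard in extractLinks() (the helper name is mine, not part of the original answer):

// Skip javascript: pseudo-URLs before queueing them. Fetching one makes
// WebClient build a page from the script result, which is what ends in the
// TextPage-to-HtmlPage ClassCastException shown above.
private static boolean isFetchable(String url) {
    return url != null && !url.trim().toLowerCase().startsWith("javascript:");
}

Each links.add(src) call then becomes if (isFetchable(src)) links.add(src);, and likewise for href.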

Pig replace command

I have one file that has data like
11/16/2015,"others (phone,health,etc.)",cont'd attempts,"resource,inc.",dg
I want to remove the commas that are present only inside double quotes.
Expected Result
11/16/2015,"others (phone health etc.)",cont'd attempts,"resource inc.",dg
So far what I tried
Foreach a generate replace ($1,',','');
Foreach a generate regex_extract($1,'[\,]+',1);
But none of them work.
First of all, use a regular expression to separate the fields in the tuple, and then apply REPLACE.
Try this code:
a = load '<path>' as line;
b = foreach a generate FLATTEN(REGEX_EXTRACT_ALL(line,'(.*)[,]["](.*)["][,](.*)[,]["](.*)["][,](.*)'));
c = foreach b generate $0,REPLACE($1,',',' '),$2,REPLACE($3,',',' '),$4;
dump c;
This can also be achieved using a UDF, which can look at all fields in each tuple passed:
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class CommaRemove extends EvalFunc<Tuple> {
    @Override
    public Tuple exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        try {
            int inputSize = input.size();
            Tuple output = TupleFactory.getInstance().newTuple(inputSize);
            for (int i = 0; i < inputSize; i++) {
                // Drop every comma in the field's string value.
                output.set(i, input.get(i).toString().replace(",", ""));
            }
            return output;
        } catch (Exception e) {
            System.err.println("Failed to process input; error - " + e.getMessage());
            return null;
        }
    }
}
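A hypothetical invocation, assuming the UDF is compiled and packaged as commaremove.jar (the jar name and input path are placeholders):

REGISTER commaremove.jar;
a = load '<path>' as line;
b = foreach a generate FLATTEN(CommaRemove(*));
dump b;

Note that the UDF strips every comma in whatever fields it is given, so the quoted-field splitting still has to happen first, for example with the REGEX_EXTRACT_ALL approach above.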

(How) Can I use Bigram Features with the OpenNLP Document Classifier

(How) Can I use Bigram Features with the OpenNLP Document Classifier?
I have a collection of very short documents (titles, phrases, and sentences), and I would like to add bigram features, of the kind used in the tool LibShortText
http://www.csie.ntu.edu.tw/~cjlin/libshorttext/
Is this possible?
The documentation only explains how to do this with the Name Finder, using
BigramNameFeatureGenerator()
and not with the Document Classifier.
I believe the trainer and classifier allow for custom feature generators in their methods; however, they must be implementations of FeatureGenerator, and BigramNameFeatureGenerator is not an implementation of that interface. So I made a quick implementation as an inner class below. Try this (untested) code when you get a chance.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
public class DoccatUsingBigram {
public static void main(String[] args) throws IOException {
InputStream dataIn = new FileInputStream(args[0]);
try {
ObjectStream<String> lineStream =
new PlainTextByLineStream(dataIn, "UTF-8");
//here you can use it as part of building the model
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
DoccatModel model = DocumentCategorizerME.train("en", sampleStream, 10, 100, new MyBigramFeatureGenerator());
///now you would use it like this
DocumentCategorizerME classifier = new DocumentCategorizerME(model);
String[] someData = "whatever you are trying to classify".split(" ");
Collection<String> bigrams = new MyBigramFeatureGenerator().extractFeatures(someData);
double[] categorize = classifier.categorize(bigrams.toArray(new String[bigrams.size()]));
} catch (IOException e) {
// Failed to read or parse training data, training failed
e.printStackTrace();
}
}
public static class MyBigramFeatureGenerator implements FeatureGenerator {
@Override
public Collection<String> extractFeatures(String[] text) {
return generate(Arrays.asList(text), 2, "");
}
private List<String> generate(List<String> input, int n, String separator) {
List<String> outGrams = new ArrayList<String>();
for (int i = 0; i < input.size() - (n - 2); i++) {
String gram = "";
if ((i + n) <= input.size()) {
for (int x = i; x < (n + i); x++) {
gram += input.get(x) + separator;
}
gram = gram.substring(0, gram.lastIndexOf(separator));
outGrams.add(gram);
}
}
return outGrams;
}
}
}
hope this helps...
You can use the NGramFeatureGenerator.java class in OpenNLP [1] for your use case.
[1] https://github.com/apache/opennlp
Thanks,
Madhawa
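A sketch of how NGramFeatureGenerator might be wired in, assuming a newer OpenNLP release (1.8+); the DoccatFactory constructor and train() overload used here are my reading of that API and should be checked against your version:

import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.doccat.NGramFeatureGenerator;
import opennlp.tools.util.TrainingParameters;

// Inside a method that declares "throws Exception";
// sampleStream is prepared as in the earlier answer.
// NGramFeatureGenerator(2, 2) emits bigrams only.
FeatureGenerator[] features = { new NGramFeatureGenerator(2, 2) };
DoccatFactory factory = new DoccatFactory(features);
DoccatModel model = DocumentCategorizerME.train(
        "en", sampleStream, TrainingParameters.defaultParams(), factory);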
