What is a better way to parse XML like this?
<FindLicensesResponse xmlns="http://abc.com">
<FindLicensesResult>
<Licensies>
<ActivityLicense>
<id>1</id>
<DateIssue>2011-12-29T00:00:00</DateIssue>
<ActivityType xmlns:s01="http://www.w3.org/2001/XMLSchema-instance" s01:type="ActivityType">
<code>somecode1</code>
</ActivityType>
<ActivityTerritory xmlns:s02="http://www.w3.org/2001/XMLSchema-instance" s02:type="Territory">
<code>somecode2</code>
</ActivityTerritory>
<ActivityLicenseAttachments />
</ActivityLicense>
<ActivityLicense>
<id>2</id>
<DateIssue>2011-12-21T00:00:00</DateIssue>
<ActivityType xmlns:s01="http://www.w3.org/2001/XMLSchema-instance" s01:type="ActivityType">
<code>somecode3</code>
</ActivityType>
<ActivityTerritory xmlns:s02="http://www.w3.org/2001/XMLSchema-instance" s02:type="Territory">
<code>somecode4</code>
</ActivityTerritory>
<ActivityLicenseAttachments />
</ActivityLicense>
</Licensies>
</FindLicensesResult>
</FindLicensesResponse>
For each ActivityLicense I need to get the values of id and DateIssue, plus the code of the nested ActivityType and the code of the nested ActivityTerritory.
Now I do it like this:
// Evaluates namespace-qualified XPath expressions against the SOAP part of `result`.
CachedXPathAPI xpathAPI = new CachedXPathAPI();
// Helper element used only as a namespace context: it binds the prefix "el" to http://abc.com.
Element nsctx = result.getSOAPPart().createElementNS(null, "nsctx");
nsctx.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:el","http://abc.com");
// Collect every el:ActivityLicense element in the document.
NodeList activityLicenses = xpathAPI.selectNodeList(result.getSOAPPart(),"//el:ActivityLicense", nsctx);
for (int i = 0; i < activityLicenses.getLength(); i++) {
// NOTE(review): "//el:id" and "//el:DateIssue" start with "//", i.e. they are absolute paths that
// search from the document root, so the context node passed as the first argument does not narrow
// the search. Confirm whether a relative path such as "el:id" was intended.
Node id = xpathAPI.selectSingleNode(activityLicenses.item(i), "//el:id", nsctx);
Node dateIssue = xpathAPI.selectSingleNode(activityLicenses.item(i), "//el:DateIssue",nsctx);
System.out.println("id: " + id.getTextContent());
System.out.println("dateIssue: " + dateIssue.getTextContent());
}
But I can't get values from ActivityType/code and ActivityTerritory/code
Check out this solution:
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
/**
 * Reads {@code xml.xml} from the working directory and prints, for every
 * {@code ActivityLicense} element, its id, its issue date and the codes of its
 * nested {@code ActivityType} and {@code ActivityTerritory} elements.
 *
 * <p>The document is parsed with a non-namespace-aware builder, which is why the
 * unprefixed element names used below match even though the sample document
 * declares a default namespace.
 */
public class StringTest {

    /**
     * Parses {@code xml.xml} and prints the requested values to standard output.
     *
     * @param args ignored
     * @throws Exception if the file cannot be read or parsed, if it contains a
     *     DOCTYPE declaration, or if an XPath expression cannot be evaluated
     */
    public static void main(String[] args) throws Exception {
        javax.xml.parsers.DocumentBuilderFactory dbFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
        // Refuse DOCTYPE declarations so external entities cannot be resolved (XXE hardening).
        dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        javax.xml.parsers.DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

        // Let the parser read the file itself: gluing the lines together by hand drops the
        // line breaks (which can fuse a tag name with an attribute on the next line), and
        // re-encoding the text with the platform charset can contradict the XML declaration.
        org.w3c.dom.Document doc = dBuilder.parse(new File("xml.xml"));
        doc.getDocumentElement().normalize();

        XPath xpath = XPathFactory.newInstance().newXPath();
        org.w3c.dom.NodeList licenses = doc.getElementsByTagName("ActivityLicense");
        for (int i = 0; i < licenses.getLength(); i++) {
            org.w3c.dom.Node license = licenses.item(i);
            // Relative paths: each expression is evaluated against the current license only.
            System.out.println("id: " + xpath.evaluate("id", license, XPathConstants.STRING));
            System.out.println("dateIssue: " + xpath.evaluate("DateIssue", license, XPathConstants.STRING));
            System.out.println("activityType code: "
                    + xpath.evaluate("ActivityType/code", license, XPathConstants.STRING));
            System.out.println("activityTerritory code: "
                    + xpath.evaluate("ActivityTerritory/code", license, XPathConstants.STRING));
        }
    }
}
Related
I am getting this error, and cannot figure out what wrong I am doing:
Error invoking bsh method: eval In file: inline evaluation of: ``import java.util.Set; import java.util.Map; import java.util.List; try { // Map . . . '' Encountered "String" at line 17, column 9.
This is the code that I am using:
// Assertion script: compares the "x_api_key" variable with "apiKeyRec" and stores the
// outcome in Failure. `vars` and `log` are not defined here; presumably they are supplied
// by the hosting test tool (the surrounding discussion mentions JMeter) — confirm.
import java.util.Set;
import java.util.Map;
import java.util.List;
try
{
// Map<String,List<String>> map = new HashMap<String,List<String>>();
// map = vars.getObject("headerMap");
boolean isHeaderValid = false;
// String apiKeySent = "${x_api_key}"
// String clientIdSent = "${X_IBM_Client_id}"
// String clientSecretSent = "${X_IBM_Client_Secret}"
// Read the values that were sent with the request.
String apiKeySent = vars.get("x_api_key")
String clientIdSent = vars.get("X_Client_id")
String clientSecretSent = vars.get("X_Client_Secret")
log.info("apiKeySent: " + vars.get("x_api_key"))
log.info("clientIdSent: " + vars.get("X_Client_id"))
log.info("clientSecretSent: " + vars.get("X_Client_Secret"))
// NOTE(review): != compares object references here, not string contents.
if(apiKeySent != "")
{
apiKeyRec = vars.get("apiKeyRec")
isHeaderValid = apiKeySent.equals(apiKeyRec)
}
// The comparison result is written to Failure unchanged.
Failure = isHeaderValid
}
catch(Exception e)
{
// Any exception raised above is only logged at debug level.
log.debug("Error in verification: ",e)
}
Could anyone please help me in figuring this out? Have been stuck at this for ages.
You need to add semicolons like this
import java.util.Set;
import java.util.Map;
import java.util.List;
// Assertion script: compares the API key sent with the request ("x_api_key") against the
// expected one ("apiKeyRec") and stores the outcome in Failure.
// `vars` and `log` are provided by the hosting JMeter test element.
try
{
    boolean isHeaderValid = false;

    String apiKeySent = vars.get("x_api_key");
    String clientIdSent = vars.get("X_Client_id");
    String clientSecretSent = vars.get("X_Client_Secret");

    // NOTE(review): these lines write credentials to the log; consider removing or masking them.
    log.info("apiKeySent: " + apiKeySent);
    log.info("clientIdSent: " + clientIdSent);
    log.info("clientSecretSent: " + clientSecretSent);

    // Compare by content: `!= ""` only compares references, and a missing variable (null)
    // would otherwise reach equals() and throw.
    if (apiKeySent != null && apiKeySent.length() > 0)
    {
        apiKeyRec = vars.get("apiKeyRec");
        isHeaderValid = apiKeySent.equals(apiKeyRec);
    }

    // NOTE(review): Failure = true marks the sample as failed, so as written the assertion
    // fails when the keys MATCH. Confirm whether `!isHeaderValid` was intended.
    Failure = isHeaderValid;
}
catch(Exception e)
{
    // Log at error level so a broken verification does not pass unnoticed.
    log.error("Error in verification: ", e);
}
Since JMeter 3.1 you should be using JSR223 Test Elements and Groovy language for scripting so consider migrating to JSR223 Assertion and Groovy
Your script can be simplified to
AssertionResult.setFailure(vars.get('x_api_key') == vars.get('apiKeyRec'))
And you don't even need any scripting for comparing 2 variables, it can be done using "normal" Response Assertion
It looks like you are forgetting to end all of your statements with semicolons from line 12 on. Add semicolons and let me know how that works!
There is a Waybill object that has a Set<Packing> field, the Packing object has a PRICE field.
I get a List<Waybill>.
Need to calculate the total cost of all Packing from the entire List<Waybill>.
How can this be done properly with the Stream API?
Thank you.
// A waybill; the only state shown here is the set of packings it holds.
class Waybill {
// Packings that belong to this waybill.
Set<Packing> setOfPacking;
}
// A single packing entry of a waybill.
class Packing {
// Price of this packing, stored as a whole number.
int PRICE;
}
List<Waybill> allWaybills = ...
This worked for me:
// Flatten the packing sets of all waybills into one stream, map each packing to its PRICE
// and add the prices up. sum() on the IntStream yields an int; the assignment widens it to double.
double total = allWaybills.stream()
.flatMap(waybill -> waybill.setOfPacking.stream())
.mapToInt(packing -> packing.PRICE)
.sum();
I think it is easier to reason about because there aren't any multi-level stream operations.
I would be interested to see how to use flatMapToInt to replace both the flatMap and map operations with one operation without making it multi-level.
Here is a test program:
import java.util.Set;
import java.util.List;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.stream.Collectors;
/**
 * Small demo that sums the prices of all packings across a list of waybills
 * with a single flat stream pipeline and prints the result.
 */
public class HelloWorld
{
    /** One packing entry; only its price is relevant to the demo. */
    public static class Packing
    {
        public int PRICE = 0;
    }

    /** A waybill holding a set of packings. */
    public static class Waybill
    {
        public Set<Packing> setOfPacking = new HashSet<Packing>();
    }

    /** Creates a waybill containing one packing per given price. */
    private static Waybill waybillWithPrices(int... prices)
    {
        Waybill waybill = new Waybill();
        for (int price : prices) {
            Packing packing = new Packing();
            packing.PRICE = price;
            waybill.setOfPacking.add(packing);
        }
        return waybill;
    }

    /**
     * Builds two waybills with packings priced 1, 2 and 3, 4 and prints their total.
     *
     * @param args ignored
     */
    public static void main(String []args){
        List<Waybill> waybills = new ArrayList<Waybill>();
        waybills.add(waybillWithPrices(1, 2));
        waybills.add(waybillWithPrices(3, 4));

        // One level of flattening, then a primitive sum; the int result is widened to double.
        double grandTotal = waybills.stream()
                .flatMap(w -> w.setOfPacking.stream())
                .mapToInt(p -> p.PRICE)
                .sum();
        System.out.println("total = "+grandTotal);
    }
}
import java.util.stream.*
List<Waybill> allWaybills = ...
int totalCost = allWaybills
.stream()
.mapToInt(w -> w.setOfPacking
.stream()
.mapToInt(p -> p.PRICE)
.sum()
)
.sum();
How can I add an image to a table (XSLFTable) cell in PowerPoint using the Apache POI API in Java? We are not able to get the CTBlipFillProperties with the latest POI jar, poi-3.15.jar.
How can I add an image to a table (XSLFTable) cell in PowerPoint using the Apache POI API in Java? We are not able to get the CTBlipFillProperties with the latest POI jar, poi-3.15.jar.
// Builds a one-row table on a new slide, tries to give the second cell an image (blip) fill
// backed by a GIF copied into the package, and writes the result to test2.pptx.
public static void main(String[] args) throws Exception {
XMLSlideShow pptx = new XMLSlideShow();
XSLFSlide slide = pptx.createSlide();
// you need to include ooxml-schemas:1.1 for this to work!!!
// otherwise an empty table will be created
// see https://issues.apache.org/bugzilla/show_bug.cgi?id=49934
XSLFTable table = slide.createTable();
table.setAnchor(new Rectangle2D.Double(50, 50, 500, 20));
XSLFTableRow row = table.addRow();
row.addCell().setText("Cell 1");
XSLFTableCell cell = row.addCell();
cell.setText("Cell 2");
// Add a blip-fill element to the cell properties of the second cell.
CTBlipFillProperties blipPr = cell.getXmlObject().getTcPr().addNewBlipFill();
blipPr.setDpi(72);
// http://officeopenxml.com/drwPic-ImageData.php
CTBlip blib = blipPr.addNewBlip();
blipPr.addNewSrcRect();
CTRelativeRect fillRect = blipPr.addNewStretch().addNewFillRect();
fillRect.setL(30000);
fillRect.setR(30000);
// Create a package part for the image and copy the GIF bytes into it.
PackagePartName partName = PackagingURIHelper.createPartName("/ppt/media/100px.gif");
PackagePart part = pptx.getPackage().createPart(partName, "image/gif");
OutputStream partOs = part.getOutputStream();
FileInputStream fis = new FileInputStream("src/test/resources/100px.gif");
byte buf[] = new byte[1024];
// The loop body is empty on purpose: the write happens in the update expression.
for (int readBytes; (readBytes = fis.read(buf)) != -1; partOs.write(buf, 0, readBytes));
// NOTE(review): both streams are closed only when the copy succeeds (no finally block).
fis.close();
partOs.close();
// Relate the slide to the image part and point the blip at that relationship id.
PackageRelationship prs = slide.getPackagePart().addRelationship(partName, TargetMode.INTERNAL, "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image");
blib.setEmbed(prs.getId());
FileOutputStream fos = new FileOutputStream("test2.pptx");
pptx.write(fos);
fos.close();
}
You were quite close ... the following was tested on the POI trunk (POI 3.16-beta2), but should work on POI 3.15 too ...
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import org.apache.poi.POIXMLDocumentPart.RelationPart;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFPictureData;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTable;
import org.apache.poi.xslf.usermodel.XSLFTableCell;
import org.apache.poi.xslf.usermodel.XSLFTableRow;
import org.openxmlformats.schemas.drawingml.x2006.main.CTBlip;
import org.openxmlformats.schemas.drawingml.x2006.main.CTBlipFillProperties;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRelativeRect;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableCell;
/**
 * Creates a presentation with a one-row, two-cell table whose second cell is filled
 * with a picture, and saves it as {@code test2.pptx}.
 */
public class TablePics {

    /**
     * Builds the slide show and writes it to {@code test2.pptx} in the working directory.
     *
     * @param args ignored
     * @throws Exception if {@code wrench.emf} cannot be read or the presentation cannot be written
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources releases the slide show even when building or writing fails.
        try (XMLSlideShow pptx = new XMLSlideShow()) {
            // Register the picture with the slide show first; the cell fill refers to it below.
            XSLFPictureData pd = pptx.addPicture(new File("wrench.emf"), PictureType.EMF);

            XSLFSlide slide = pptx.createSlide();
            XSLFTable table = slide.createTable();
            table.setAnchor(new Rectangle2D.Double(50, 50, 500, 20));

            XSLFTableRow row = table.addRow();
            row.addCell().setText("Cell 1");
            XSLFTableCell cell = row.addCell();
            cell.setText("Cell 2");

            // getXmlObject() is declared with a general XML type, so cast to the table-cell
            // bean before reaching the cell properties.
            CTBlipFillProperties blipPr = ((CTTableCell) cell.getXmlObject()).getTcPr().addNewBlipFill();
            blipPr.setDpi(72);
            // http://officeopenxml.com/drwPic-ImageData.php
            CTBlip blib = blipPr.addNewBlip();
            blipPr.addNewSrcRect();
            CTRelativeRect fillRect = blipPr.addNewStretch().addNewFillRect();
            fillRect.setL(30000);
            fillRect.setR(30000);

            // Relate the slide to the picture and point the blip at that relationship id.
            RelationPart rp = slide.addRelation(null, XSLFRelation.IMAGES, pd);
            blib.setEmbed(rp.getRelationship().getId());

            try (FileOutputStream fos = new FileOutputStream("test2.pptx")) {
                pptx.write(fos);
            }
        }
    }
}
For example, the parsing tree from Stanford Sentiment Treebank
"(2 (2 (2 near) (2 (2 the) (2 end))) (3 (3 (2 takes) (2 (2 on) (2 (2 a) (2 (2 whole) (2 (2 other) (2 meaning)))))) (2 .)))",
where the number is the sentiment label of each node.
I want to add POS tagging information to each node. Such as:
"(NP (ADJP (IN near)) (DT the) (NN end)) "
I have tried to parse the sentence directly, but the resulting tree is different from the one in the Sentiment Treebank (maybe because of the parser version or parameters; I have tried to contact the author, but there was no response).
How can I obtain the tagging information?
I think the code in edu.stanford.nlp.sentiment.BuildBinarizedDataset should be helpful. The main() method steps through how these binary trees can be created in Java code.
Some key lines to look out for in the code:
// Load the parser model and build a binarizer from the parser's head finder and language pack.
LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
...
// Parse the tokens, then pass the resulting tree through the binarizer.
Tree tree = parser.apply(tokens);
Tree binarized = binarizer.transformTree(tree);
You can access the node tag information from the Tree object. You should look at the javadoc for edu.stanford.nlp.trees.Tree to see how to access this information.
Also in this answer I have some code that shows accessing a Tree:
How to get NN and NNS from a text?
You want to look at the label() of each tree and subtree to get the tag for a node.
Here is the reference on GitHub to BuildBinarizedDataset.java:
https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/sentiment/BuildBinarizedDataset.java
Please let me know if anything is unclear about this and I can provide further assistance!
First, you need to download the Stanford Parser
Set up
// Components shared by the parsing and tagging steps.
private LexicalizedParser parser;
private TreeBinarizer binarizer;
private CollapseUnaryTransformer transformer;
// Load the parser model found at PCFG_PATH.
parser = LexicalizedParser.loadModel(PCFG_PATH);
// The binarizer is configured from the parser's own head finder and language pack.
binarizer = TreeBinarizer.simpleTreeBinarizer(
parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
transformer = new CollapseUnaryTransformer();
Parse
Tree tree = parser.apply(tokens);
Access POSTAG
/**
 * Binarizes the given tree, applies the unary-collapsing transformer and returns one
 * label string per non-leaf node of the result.
 *
 * Slots 0 .. leaves-1 belong to the parents of the leaves, in leaf order; the remaining
 * slots are handed out to higher nodes in the order in which they are first reached
 * while walking upward from each leaf.
 *
 * @param tree the parse tree to process
 * @return the label array described above
 */
public String[] constTreePOSTAG(Tree tree) {
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();
List<Tree> leaves = collapsedUnary.getLeaves();
// presumably size() counts every node, making this the number of non-leaf nodes — TODO confirm
int size = collapsedUnary.size() - leaves.size();
String[] tags = new String[size];
// Maps a node number (as reported by nodeNumber) to the slot assigned to that node.
HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
// Next free slot for a node above the leaf parents.
int idx = leaves.size();
int leafIdx = 0;
for (Tree leaf : leaves) {
Tree cur = leaf.parent(collapsedUnary); // go to preterminal
int curIdx = leafIdx++;
boolean done = false;
// Climb toward the root; stop once a parent is reached that already has a slot.
while (!done) {
Tree parent = cur.parent(collapsedUnary);
if (parent == null) {
// cur has no parent (it is the root): store its own label.
tags[curIdx] = cur.label().toString();
break;
}
int parentIdx;
int parentNumber = parent.nodeNumber(collapsedUnary);
if (!index.containsKey(parentNumber)) {
parentIdx = idx++;
index.put(parentNumber, parentIdx);
} else {
parentIdx = index.get(parentNumber);
done = true;
}
// NOTE(review): the slot of the current node receives the PARENT's label here, whereas the
// root branch above stores the node's own label — confirm this asymmetry is intended.
tags[curIdx] = parent.label().toString();
cur = parent;
curIdx = parentIdx;
}
}
return tags;
}
Here is the full source code of ConstituencyParse.java, ready to run:
Run it with these parameters:
java ConstituencyParse -tokpath outputtoken.toks -parentpath outputparent.txt -tagpath outputag.txt < input_sentence_in_text_file_one_sent_per_line.txt
(Note: the source code is adapted from the treelstm repo; you also need to change preprocess-sst.py so that it calls the ConstituencyParse.java file below.)
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.HashMap;
import java.util.Properties;
import java.util.Scanner;
/**
 * Command-line tool that reads one sentence per line from standard input, parses it
 * with the Stanford lexicalized parser and writes, per sentence, one line to each
 * output file: the tokens (optional), the parent-pointer representation of the tree
 * and the node labels.
 *
 * <p>{@code CollapseUnaryTransformer} is not imported here; it is expected to be
 * available in the same package.
 */
public class ConstituencyParse {

    private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    private boolean tokenize;
    private BufferedWriter tokWriter, parentWriter, tagWriter;
    private LexicalizedParser parser;
    private TreeBinarizer binarizer;
    private CollapseUnaryTransformer transformer;
    private GrammaticalStructureFactory gsf;

    /**
     * Opens the output files and loads the parser model.
     *
     * @param tokPath    file for the token lines, or {@code null} to skip token output
     * @param parentPath file for the parent-pointer lines (required)
     * @param tagPath    file for the label lines (required)
     * @param tokenize   whether input lines are run through the PTB tokenizer
     *                   ({@code true}) or simply split on single spaces ({@code false})
     * @throws IOException if an output file cannot be opened
     */
    public ConstituencyParse(String tokPath, String parentPath, String tagPath, boolean tokenize) throws IOException {
        this.tokenize = tokenize;
        if (tokPath != null) {
            tokWriter = new BufferedWriter(new FileWriter(tokPath));
        }
        parentWriter = new BufferedWriter(new FileWriter(parentPath));
        tagWriter = new BufferedWriter(new FileWriter(tagPath));
        parser = LexicalizedParser.loadModel(PCFG_PATH);
        binarizer = TreeBinarizer.simpleTreeBinarizer(
            parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
        transformer = new CollapseUnaryTransformer();

        // set up to produce dependency representations from constituency trees
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        gsf = tlp.grammaticalStructureFactory();
    }

    /**
     * Turns an input line into tokens, either with the PTB tokenizer or by splitting
     * on single spaces, depending on the {@code tokenize} flag.
     *
     * @param line one input sentence
     * @return the tokens of the sentence, in order
     */
    public List<HasWord> sentenceToTokens(String line) {
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            PTBTokenizer<Word> tokenizer =
                new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }
        return tokens;
    }

    /**
     * Parses a tokenized sentence.
     *
     * @param tokens the sentence tokens
     * @return the tree produced by the parser
     */
    public Tree parse(List<HasWord> tokens) {
        return parser.apply(tokens);
    }

    /**
     * Binarizes the tree, applies the unary-collapsing transformer and returns one
     * label string per non-leaf node of the result.
     *
     * <p>Slots {@code 0 .. leaves-1} belong to the parents of the leaves, in leaf order;
     * the remaining slots are handed out to higher nodes in the order in which they are
     * first reached while walking upward from each leaf. This is the same numbering that
     * {@link #constTreeParents(Tree)} uses.
     *
     * @param tree the parse tree to process
     * @return the label array described above
     */
    public String[] constTreePOSTAG(Tree tree) {
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        List<Tree> leaves = collapsedUnary.getLeaves();
        int size = collapsedUnary.size() - leaves.size();
        String[] tags = new String[size];
        // node number -> slot assigned to that node
        HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

        int idx = leaves.size();
        int leafIdx = 0;
        for (Tree leaf : leaves) {
            Tree cur = leaf.parent(collapsedUnary); // go to preterminal
            int curIdx = leafIdx++;
            boolean done = false;
            // Climb toward the root; stop once a parent is reached that already has a slot.
            while (!done) {
                Tree parent = cur.parent(collapsedUnary);
                if (parent == null) {
                    tags[curIdx] = cur.label().toString();
                    break;
                }

                int parentIdx;
                int parentNumber = parent.nodeNumber(collapsedUnary);
                if (!index.containsKey(parentNumber)) {
                    parentIdx = idx++;
                    index.put(parentNumber, parentIdx);
                } else {
                    parentIdx = index.get(parentNumber);
                    done = true;
                }

                // NOTE(review): the slot of the current node receives the PARENT's label,
                // whereas the root branch above stores the node's own label. Behaviour is
                // kept as-is; confirm whether cur.label() was intended.
                tags[curIdx] = parent.label().toString();
                cur = parent;
                curIdx = parentIdx;
            }
        }
        return tags;
    }

    /**
     * Binarizes the tree, applies the unary-collapsing transformer and returns its
     * parent-pointer representation: entry {@code i} holds the 1-based slot of the
     * parent of node {@code i}, or 0 for the root.
     *
     * @param tree the parse tree to process
     * @return the parent-pointer array
     */
    public int[] constTreeParents(Tree tree) {
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        List<Tree> leaves = collapsedUnary.getLeaves();
        int size = collapsedUnary.size() - leaves.size();
        int[] parents = new int[size];
        // node number -> slot assigned to that node
        HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

        int idx = leaves.size();
        int leafIdx = 0;
        for (Tree leaf : leaves) {
            Tree cur = leaf.parent(collapsedUnary); // go to preterminal
            int curIdx = leafIdx++;
            boolean done = false;
            while (!done) {
                Tree parent = cur.parent(collapsedUnary);
                if (parent == null) {
                    parents[curIdx] = 0;
                    break;
                }

                int parentIdx;
                int parentNumber = parent.nodeNumber(collapsedUnary);
                if (!index.containsKey(parentNumber)) {
                    parentIdx = idx++;
                    index.put(parentNumber, parentIdx);
                } else {
                    parentIdx = index.get(parentNumber);
                    done = true;
                }

                parents[curIdx] = parentIdx + 1;
                cur = parent;
                curIdx = parentIdx;
            }
        }
        return parents;
    }

    /**
     * Converts the constituency parse to a dependency representation and returns the
     * parent-pointer representation of that dependency tree.
     *
     * @param tree   the constituency parse
     * @param tokens the sentence tokens; only their count is used
     * @return for each token (0-based position) the index of its governor, or -1 when
     *         no dependency names the token as dependent
     */
    public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependencies();
        int len = tokens.size();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node has a parent of -1 at the end of parsing, then the node
            // has no parent.
            parents[i] = -1;
        }

        for (TypedDependency td : tdl) {
            // let root have index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            parents[child - 1] = parent;
        }

        return parents;
    }

    /**
     * Writes the tokens as one space-separated line to the token file. An empty token
     * list produces an empty line. Requires that a token path was given to the constructor.
     *
     * @param tokens the sentence tokens
     * @throws IOException if writing fails
     */
    public void printTokens(List<HasWord> tokens) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tokens.size(); i++) {
            if (i > 0) {
                sb.append(' ');
            }
            String word = tokens.get(i).word();
            sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
        }
        sb.append('\n');
        tokWriter.write(sb.toString());
    }

    /**
     * Writes the parent pointers as one space-separated line to the parent file.
     * An empty array produces an empty line.
     *
     * @param parents the parent-pointer array
     * @throws IOException if writing fails
     */
    public void printParents(int[] parents) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < parents.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(parents[i]);
        }
        sb.append('\n');
        parentWriter.write(sb.toString());
    }

    /**
     * Writes the labels, lower-cased, as one space-separated line to the tag file.
     * An empty array produces an empty line.
     *
     * @param tags the label array
     * @throws IOException if writing fails
     */
    public void printTags(String[] tags) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tags.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(tags[i]);
        }
        sb.append('\n');
        // Locale.ROOT keeps the result independent of the default locale (e.g. Turkish "I").
        tagWriter.write(sb.toString().toLowerCase(java.util.Locale.ROOT));
    }

    /**
     * Closes all output files; each writer is closed even if closing an earlier one fails.
     *
     * @throws IOException if closing a writer fails
     */
    public void close() throws IOException {
        try {
            if (tokWriter != null) {
                tokWriter.close();
            }
        } finally {
            try {
                parentWriter.close();
            } finally {
                tagWriter.close();
            }
        }
    }

    /**
     * Entry point. Recognised options: {@code -parentpath} and {@code -tagpath}
     * (both required), {@code -tokpath}, {@code -tokenize} and {@code -deps}.
     * Sentences are read from standard input, one per line, until end of input.
     *
     * @param args command-line options
     * @throws Exception if an output file cannot be opened or written
     */
    public static void main(String[] args) throws Exception {
        Properties props = StringUtils.argsToProperties(args);
        // Both output paths are opened unconditionally by the constructor, so both are required.
        if (!props.containsKey("parentpath") || !props.containsKey("tagpath")) {
            System.err.println(
                "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath> -tagpath <tagpath>");
            System.exit(1);
        }

        // whether to tokenize input sentences
        boolean tokenize = props.containsKey("tokenize");

        // whether to produce dependency trees from the constituency parse
        boolean deps = props.containsKey("deps");

        String tokPath = props.getProperty("tokpath");
        String parentPath = props.getProperty("parentpath");
        String tagPath = props.getProperty("tagpath");
        ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tagPath, tokenize);

        int count = 0;
        long start = System.currentTimeMillis();
        try {
            Scanner stdin = new Scanner(System.in);
            while (stdin.hasNextLine()) {
                String line = stdin.nextLine();
                List<HasWord> tokens = processor.sentenceToTokens(line);
                Tree parse = processor.parse(tokens);

                // produce parent pointer representation
                int[] parents = deps ? processor.depTreeParents(parse, tokens)
                                     : processor.constTreeParents(parse);
                String[] tags = processor.constTreePOSTAG(parse);

                // print
                if (tokPath != null) {
                    processor.printTokens(tokens);
                }
                processor.printParents(parents);
                processor.printTags(tags);

                count++;
                if (count % 100 == 0) {
                    double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                    System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
                }
            }
        } finally {
            // Flush and release the output files even when parsing fails midway.
            processor.close();
        }

        long totalTimeMillis = System.currentTimeMillis() - start;
        double millisPerLine = count == 0 ? 0.0 : totalTimeMillis / (double) count;
        System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
            count, totalTimeMillis / 1000.0, millisPerLine);
    }
}
Here is a code snippet. It throws an ArrayIndexOutOfBoundsException and I don't know why.
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
// Reads the paragraphs of a .doc file and prints words from each paragraph to the console.
public class wordcount
{
public static void main(String[] args) throws Exception
{
File file = new File("E:\\myFiles\\abc.doc");
FileInputStream fis=new FileInputStream(file.getAbsolutePath());
HWPFDocument document=new HWPFDocument(fis);
WordExtractor extractor = new WordExtractor(document);
// One array entry per paragraph of the document.
String [] fileData = extractor.getParagraphText();
for (int i = 0; i < fileData.length; i++)
{
// System.out.println(fileData[i].toString());
// Split the current paragraph into words on single spaces.
String[] paraword = fileData[i].toString().split(" ");
// out.println(paraword.length);
// NOTE(review): paraword is indexed with i, which is the PARAGRAPH index of the outer loop,
// and paraword[i].length() is the character count of that one word, not the number of words
// in the paragraph. The same expression is used in every branch below.
if(paraword[i].length() == 0 )
{
System.out.println("\n");
}
else if(paraword[i].length() > 0 && paraword[i].length() < 12)
{
// Loop bound is the character count of paraword[i] minus one, used as a word index.
for(int k=0 ; k < paraword[i].length()-1 ; k++)
{
System.out.println(paraword[k].toString());
}
}
else if(paraword[i].length() >= 12 )
{
// Prints the first 12 entries of paraword regardless of how many entries it has.
for(int k=0 ; k < 12 ; k++)
{
System.out.println(paraword[k].toString());
}
}
System.out.println("\n");
}
}
}
This is the image of the abc.doc file
Note : Expected output will be printed on java console.
The output should contain 12 words on each line, but the error occurs after the first line has been processed.
Any help would be appreciated
TIA
Honestly, I'm not familiar with the apache.org API, but just by looking at your logic it looks like you want to replace every instance of:
paraword[i].length()
with:
paraword.length
Because it looks like you want to check how many words are in the paragraph and not how long the first word of the paragraph is. Correct me if I'm wrong, but I think that will fix you up.
Here is the correct code snippet
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Extracts the text of a Word {@code .doc} file paragraph by paragraph and returns it
 * as one string in which long paragraphs are wrapped at roughly 75 characters.
 */
public class ExtractWordDocument
{
    /** Paragraphs with at least this many words are wrapped. */
    private static final int WRAP_MIN_WORDS = 12;

    /** Minimum number of characters on a line before the next space becomes a line break. */
    private static final int WRAP_WIDTH = 75;

    /**
     * Reads the document and returns its formatted text.
     *
     * <p>Words inside a paragraph are separated by single spaces and every paragraph is
     * terminated by a newline. Punctuation in the document text (commas, brackets) is
     * preserved.
     *
     * @return the formatted document text
     * @throws IOException if the file cannot be opened or read
     */
    public String myString() throws IOException
    {
        File file = new File("PATH FOR THE .doc FILE");
        StringBuilder entireDoc = new StringBuilder();
        // try-with-resources closes the input stream even when reading fails.
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            for (String paragraph : extractor.getParagraphText()) {
                entireDoc.append(formatParagraph(paragraph)).append('\n');
            }
        }
        return entireDoc.toString();
    }

    /**
     * Normalises the whitespace of one paragraph and wraps it when it has 12 or more words.
     * A blank paragraph yields an empty string.
     */
    private static String formatParagraph(String paragraph)
    {
        String[] words = paragraph.trim().split("\\s+");
        // Join the words directly; going through List.toString() and stripping ",", "[" and "]"
        // afterwards would also delete those characters from the document's own text.
        String joined = String.join(" ", words);
        if (words.length < WRAP_MIN_WORDS) {
            return joined;
        }
        StringBuilder wrapped = new StringBuilder(joined);
        int breakAt = 0;
        // Turn the first space found at least WRAP_WIDTH characters after the previous
        // break into a line break, and repeat until no such space is left.
        while ((breakAt = wrapped.indexOf(" ", breakAt + WRAP_WIDTH)) != -1) {
            wrapped.setCharAt(breakAt, '\n');
        }
        return wrapped.toString();
    }

    /**
     * Prints the formatted document to standard output; an I/O failure is reported on
     * standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        try {
            System.out.print(new ExtractWordDocument().myString());
        } catch (IOException ioe) {
            System.err.println(ioe);
        }
    }
}
Note: This code does not print 12 words on each line; it wraps lines at about 75 characters.