(How) Can I use Bigram Features with the OpenNLP Document Classifier - opennlp

(How) Can I use Bigram Features with the OpenNLP Document Classifier?
I have a collection of very short documents (titles, phrases, and sentences), and I would like to add bigram features, of the kind used in the tool LibShortText
http://www.csie.ntu.edu.tw/~cjlin/libshorttext/
is this possible?
The documentation only explains how to do this using the Name Finder using the
BigramNameFeatureGenerator()
and not the Document Classifier

I believe the trainer and classifier allow for custom feature generators in their methods; however, they must be implementations of FeatureGenerator, and BigramNameFeatureGenerator is not an implementation of that. So I made a quick implementation as an inner class below. Try this (untested) code when you get a chance:
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
public class DoccatUsingBigram {
public static void main(String[] args) throws IOException {
InputStream dataIn = new FileInputStream(args[0]);
try {
ObjectStream<String> lineStream =
new PlainTextByLineStream(dataIn, "UTF-8");
//here you can use it as part of building the model
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
DoccatModel model = DocumentCategorizerME.train("en", sampleStream, 10, 100, new MyBigramFeatureGenerator());
///now you would use it like this
DocumentCategorizerME classifier = new DocumentCategorizerME(model);
String[] someData = "whatever you are trying to classify".split(" ");
Collection<String> bigrams = new MyBigramFeatureGenerator().extractFeatures(someData);
double[] categorize = classifier.categorize(bigrams.toArray(new String[bigrams.size()]));
} catch (IOException e) {
// Failed to read or parse training data, training failed
e.printStackTrace();
}
}
public static class MyBigramFeatureGenerator implements FeatureGenerator {
@Override
public Collection<String> extractFeatures(String[] text) {
return generate(Arrays.asList(text), 2, "");
}
private List<String> generate(List<String> input, int n, String separator) {
List<String> outGrams = new ArrayList<String>();
for (int i = 0; i < input.size() - (n - 2); i++) {
String gram = "";
if ((i + n) <= input.size()) {
for (int x = i; x < (n + i); x++) {
gram += input.get(x) + separator;
}
gram = gram.substring(0, gram.lastIndexOf(separator));
outGrams.add(gram);
}
}
return outGrams;
}
}
}
hope this helps...

You can use the NGramFeatureGenerator class in OpenNLP[1] for your use case.
[1] https://github.com/apache/opennlp
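For example, it could be wired into training roughly like this (an untested sketch; it assumes a recent OpenNLP release where DoccatFactory has a constructor taking a FeatureGenerator array and NGramFeatureGenerator takes minimum and maximum n-gram sizes):
import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.doccat.NGramFeatureGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
public class DoccatWithNGrams {
    public static DoccatModel trainWithBigrams(ObjectStream<DocumentSample> samples) throws Exception {
        // Combine plain bag-of-words features with bigram features (min = max = 2).
        FeatureGenerator[] featureGenerators = new FeatureGenerator[] {
            new BagOfWordsFeatureGenerator(),
            new NGramFeatureGenerator(2, 2)
        };
        DoccatFactory factory = new DoccatFactory(featureGenerators);
        return DocumentCategorizerME.train("en", samples, TrainingParameters.defaultParams(), factory);
    }
}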
Thanks,
Madhawa

Related

Parse XML contains incrementing namespace numbers and multiple namespaces

I need to parse XML that contains incrementing namespace numbers and multiple namespaces; the XML comes from a service which can't be updated. The original approach was to simply unmarshal it to Java objects and be on our way. The provider uses older tooling (Castor) to create the message, which we have no access to. The plan is to parse it and then marshal/unmarshal it.
<TESTXmlResponse xmlns="TEST/TESTXmlResponse">
<firstRequest>
<ns1:xmlRequest xmlns:ns1="TEST/XMLRequest">
<ns2:username xmlns:ns2="TEST/XMLUserNameRequest">
<ns3:value xmlns:ns3="TEST/XMLValueRequest">test</ns3:value>
</ns2:username>
</ns1:xmlRequest>
</firstRequest>
<data>
<ns4:name xmlns:ns4="TEST/XMLConstants">name1</ns4:name>
<ns5:value xmlns:ns5="TEST/XMLConstants">data1</ns5:value>
</data>
<data>
<ns6:name xmlns:ns6="TEST/XMLConstants">name2</ns6:name>
<ns7:value xmlns:ns7="TEST/XMLConstants">data2</ns7:value>
</data>
<data>
<ns8:name xmlns:ns8="TEST/XMLConstants">name3</ns8:name>
<ns9:value xmlns:ns9="TEST/XMLConstants">data3</ns9:value>
</data>
</TESTXmlResponse>
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Iterator;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
public class Main
{
public static void main(String[] args) throws Exception
{
ArrayList<String> constants = new ArrayList<String>();
DocumentBuilderFactory factory =
DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(new
FileInputStream(new File("constants.xml")));
// Get XPath expression
XPathFactory xpathfactory = XPathFactory.newInstance();
XPath xpath = xpathfactory.newXPath();
xpath.setNamespaceContext(new NamespaceResolver(doc));
XPathExpression expr =
xpath.compile("//firstRequest/ns1:xmlRequest/ns2:username/ns3:value/text()");
Object result = expr.evaluate(doc, XPathConstants.NODESET);
NodeList nodes = (NodeList) result;
for (int i = 0; i < nodes.getLength(); i++) {
constants.add(nodes.item(i).getNodeValue());
}
if (constants.size() > 0){
System.out.println(constants);
}
}
static class NamespaceResolver implements NamespaceContext
{
private Document sourceDocument;
public NamespaceResolver(Document document) {
sourceDocument = document;
}
public String getNamespaceURI(String prefix) {
if (prefix.equals(XMLConstants.DEFAULT_NS_PREFIX)) {
return sourceDocument.lookupNamespaceURI(null);
} else {
return sourceDocument.lookupNamespaceURI(prefix);
}
}
public String getPrefix(String namespaceURI) {
return sourceDocument.lookupPrefix(namespaceURI);
}
@SuppressWarnings("rawtypes")
public Iterator getPrefixes(String namespaceURI) {
return null;
}
}
}
The NodeList returned from the expression is where the issue lies: it isn't null, but its length is zero.
I've looked at several examples, and this one seems to be the closest to a solution. The XPathExpression expr appears to be the issue; refining it for each case would seem to be a reasonable approach.
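One way to refine it, for illustration: Document.lookupNamespaceURI only sees declarations in scope on the root element, so prefixes declared on nested elements (ns1, ns2, ns3, ...) resolve to null, and an unprefixed step like //firstRequest never matches an element in the default namespace. A rough, untested sketch of a resolver with a hard-coded prefix-to-URI map instead (the class name and the "d" prefix are just illustrative):
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
// Maps the prefixes used in the XPath expression to fixed URIs instead of
// looking them up on the document, where nested declarations are not visible.
public class FixedNamespaceResolver implements NamespaceContext {
    private final Map<String, String> prefixToUri = new HashMap<String, String>();
    public FixedNamespaceResolver() {
        prefixToUri.put("d", "TEST/TESTXmlResponse");      // the document's default namespace
        prefixToUri.put("ns1", "TEST/XMLRequest");
        prefixToUri.put("ns2", "TEST/XMLUserNameRequest");
        prefixToUri.put("ns3", "TEST/XMLValueRequest");
    }
    public String getNamespaceURI(String prefix) {
        String uri = prefixToUri.get(prefix);
        return uri != null ? uri : XMLConstants.NULL_NS_URI;
    }
    public String getPrefix(String namespaceURI) {
        for (Map.Entry<String, String> e : prefixToUri.entrySet()) {
            if (e.getValue().equals(namespaceURI)) {
                return e.getKey();
            }
        }
        return null;
    }
    @SuppressWarnings("rawtypes")
    public Iterator getPrefixes(String namespaceURI) {
        return prefixToUri.keySet().iterator();
    }
}
The expression would then also need a prefix for elements in the default namespace, e.g. //d:firstRequest/ns1:xmlRequest/ns2:username/ns3:value/text().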

Hadoop Mapreduce, How do I rewrite a txt file inputted in the mapper with map reduce output?

I am trying to create a MapReduce program to perform the k-means algorithm. I know using MapReduce isn't the best way to do iterative algorithms.
I have created the mapper and reducer classes.
In the mapper code I read an input file. When the MapReduce job has completed, I want the results to be stored in the same input file. How do I make the output file overwrite the file that was input to the mapper?
Also, how do I make the MapReduce iterate until the values from the old input file and the new input file converge, i.e. the difference between the values is less than 0.1?
My code is:
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.ArrayList;
public class kmeansMapper extends Mapper<Object, Text, DoubleWritable,
DoubleWritable> {
private final static String centroidFile = "centroid.txt";
private List<Double> centers = new ArrayList<Double>();
public void setup(Context context) throws IOException{
BufferedReader br = new BufferedReader(new
FileReader(centroidFile));
String contentLine;
while((contentLine = br.readLine())!=null){
centers.add(Double.parseDouble(contentLine));
}
}
public void map(Object key, Text input, Context context) throws IOException,
InterruptedException {
String[] fields = input.toString().split(" ");
Double rating = Double.parseDouble(fields[2]);
Double distance = Math.abs(centers.get(0) - rating);
int position = 0;
for(int i=1; i<centers.size(); i++){
Double cDistance = Math.abs(centers.get(i) - rating);
if(cDistance< distance){
position = i;
distance = cDistance;
}
}
Double closestCenter = centers.get(position);
context.write(new DoubleWritable(closestCenter),new
DoubleWritable(rating)); //outputs closestcenter and rating value
}
}
import java.io.IOException;
import java.lang.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.*;
public class kmeansReducer extends Reducer<DoubleWritable, DoubleWritable,
DoubleWritable, Text> {
public void reduce(DoubleWritable key, Iterable<DoubleWritable> values,
Context context)// get count // get total //get values in a string
throws IOException, InterruptedException {
Iterator<DoubleWritable> v = values.iterator();
double total = 0;
double count = 0;
String value = ""; //value is the rating
while (v.hasNext()){
double i = v.next().get();
value = value + " " + Double.toString(i);
total = total + i;
++count;
}
double nCenter = total/count;
context.write(new DoubleWritable(nCenter), new Text(value));
}
}
import java.util.Arrays;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class run
{
public static void runJob(String[] input, String output) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
Path toCache = new Path("input/centroid.txt");
job.addCacheFile(toCache.toUri());
job.setJarByClass(run.class);
job.setMapperClass(kmeansMapper.class);
job.setReducerClass(kmeansReducer.class);
job.setMapOutputKeyClass(DoubleWritable.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setNumReduceTasks(1);
Path outputPath = new Path(output);
FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
FileOutputFormat.setOutputPath(job, outputPath);
outputPath.getFileSystem(conf).delete(outputPath,true);
job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]);
}
}
Thanks
I know you put the disclaimer, but please switch to Spark or some other framework that can solve problems in-memory. Your life will be so much better.
If you really want to do this, just iteratively run the code in runJob and use a temporary file name for the input. You can see this question on moving files in Hadoop to achieve this. You'll need a FileSystem instance and a temp file for the input:
FileSystem fs = FileSystem.get(new Configuration());
Path tempInputPath = new Path("/user/th/kmeans/tmp_input");
Broadly speaking, after each iteration is finished, do
fs.delete(tempInputPath, true);
fs.rename(outputPath, tempInputPath);
Of course for the very first iteration you must set the input path to be the input paths provided when running the job. Subsequent iterations can use the tempInputPath, which will be the output of the previous iteration.
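Putting that together, the driver loop might look roughly like this (an untested sketch; the part file name, the temp path, and the convergence check that re-reads the centroids from the reducer output are all assumptions about your setup):
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class IterativeDriver {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path tempInputPath = new Path("/user/th/kmeans/tmp_input");
        Path outputPath = new Path(args[args.length - 1]);
        String[] originalInput = Arrays.copyOfRange(args, 0, args.length - 1);
        List<Double> oldCenters = null;
        boolean converged = false;
        boolean first = true;
        while (!converged) {
            // First pass reads the original input; later passes read the previous output.
            run.runJob(first ? originalInput : new String[] { tempInputPath.toString() },
                       outputPath.toString());
            first = false;
            List<Double> newCenters = readCenters(fs, new Path(outputPath, "part-r-00000"));
            converged = oldCenters != null && maxShift(oldCenters, newCenters) < 0.1;
            oldCenters = newCenters;
            // Move this iteration's output into place as the next iteration's input.
            fs.delete(tempInputPath, true);
            fs.rename(outputPath, tempInputPath);
        }
    }
    // Reads the first column (the new centroid) of each reducer output line.
    private static List<Double> readCenters(FileSystem fs, Path partFile) throws Exception {
        List<Double> centers = new ArrayList<Double>();
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(partFile)));
        String line;
        while ((line = br.readLine()) != null) {
            centers.add(Double.parseDouble(line.split("\t")[0]));
        }
        br.close();
        return centers;
    }
    private static double maxShift(List<Double> oldC, List<Double> newC) {
        double max = 0;
        for (int i = 0; i < Math.min(oldC.size(), newC.size()); i++) {
            max = Math.max(max, Math.abs(oldC.get(i) - newC.get(i)));
        }
        return max;
    }
}
Note that the mapper reads its centroids from the cached centroid.txt, so that file would also need to be refreshed each round for the centroids to actually move; that detail is left out of the sketch.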

Automatic correction of double value input

I have a field with a filter on it that allows only digits and ','.
I want that if I type 1, I automatically get 1,0 when I leave the text field.
I could parse it and check with a substring whether there is a ',' at the end, but in my opinion that is not a very good way to do it. Is there a better way?
Use a converter in the text formatter you are using to filter the input:
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.function.UnaryOperator;
import javafx.application.Application;
import javafx.geometry.Pos;
import javafx.scene.Scene;
import javafx.scene.control.TextArea;
import javafx.scene.control.TextField;
import javafx.scene.control.TextFormatter;
import javafx.scene.control.TextFormatter.Change;
import javafx.scene.layout.VBox;
import javafx.stage.Stage;
import javafx.util.StringConverter;
public class DecimalTextField extends Application {
@Override
public void start(Stage primaryStage) {
// decimal formatter for default locale:
DecimalFormat decimalFormat = new DecimalFormat();
decimalFormat.setMinimumFractionDigits(1);
DecimalFormatSymbols symbols = decimalFormat.getDecimalFormatSymbols() ;
char decimalSep = symbols.getDecimalSeparator() ;
UnaryOperator<Change> filter = change -> {
for (char c : change.getText().toCharArray()) {
if ( (! Character.isDigit(c)) && c != decimalSep) {
return null ;
}
}
return change ;
};
StringConverter<Double> converter = new StringConverter<Double>() {
@Override
public String toString(Double object) {
return object == null ? "" : decimalFormat.format(object);
}
@Override
public Double fromString(String string) {
try {
return string.isEmpty() ? 0.0 : decimalFormat.parse(string).doubleValue();
} catch (ParseException e) {
return 0.0 ;
}
}
};
TextFormatter<Double> formatter = new TextFormatter<>(converter, 0.0, filter);
TextField textField = new TextField();
textField.setTextFormatter(formatter);
VBox root = new VBox(10, textField, new TextArea());
root.setAlignment(Pos.CENTER);
primaryStage.setScene(new Scene(root, 400, 400));
primaryStage.show();
}
public static void main(String[] args) {
launch(args);
}
}
(Obviously the filter could be improved here, e.g. to avoid multiple decimal separator characters in the input.)
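For instance, the filter lambda in the code above could be replaced with a slightly stricter one that also rejects a change if the resulting text would contain more than one separator (untested; it relies on the decimalSep variable already defined above):
UnaryOperator<Change> filter = change -> {
    String newText = change.getControlNewText();
    int separators = 0;
    for (char c : newText.toCharArray()) {
        if (c == decimalSep) {
            separators++;
        } else if (!Character.isDigit(c)) {
            return null; // reject non-digit, non-separator characters
        }
    }
    // reject the change if it would introduce a second decimal separator
    return separators > 1 ? null : change;
};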
I think the best would be to convert the string to a double and then convert the double back to a string using DecimalFormat. That way you know the number is in your desired format.
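A minimal sketch of that idea, reformatting the text whenever the field loses focus (the "0.0" pattern and the helper class are just illustrative, not part of the answer above):
import java.text.DecimalFormat;
import java.text.ParseException;
import javafx.scene.control.TextField;
public class ReformatOnFocusLoss {
    // Rewrites e.g. "1" as "1,0" (in a locale with a comma separator) when the field loses focus.
    public static void install(TextField textField) {
        DecimalFormat format = new DecimalFormat("0.0"); // at least one fraction digit
        textField.focusedProperty().addListener((obs, wasFocused, isFocused) -> {
            if (!isFocused) {
                try {
                    double value = format.parse(textField.getText()).doubleValue();
                    textField.setText(format.format(value));
                } catch (ParseException e) {
                    textField.setText(format.format(0.0)); // fall back on unparseable input
                }
            }
        });
    }
}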

Pig replace command

I have one file that has data like
11/16/2015,"others (phone,health,etc.)",cont'd attempts,"resource,inc.",dg
I want to remove the commas present only inside the double quotes.
Expected Result
11/16/2015,"others (phone health etc.)",cont'd attempts,"resource inc.",dg
So far what I tried
Foreach a generate replace ($1,',','');
Foreach a generate regex_extract($1,'[\,]+',1);
But none of them work.
First of all, use a regular expression to separate the fields in the tuple, and then apply REPLACE.
Try this code :
a = load '<path>' as line;
b = foreach a generate FLATTEN(REGEX_EXTRACT_ALL(line,'(.*)[,]["](.*)["][,](.*)[,]["](.*)["][,](.*)'));
c = foreach b generate $0,REPLACE($1,',',' '),$2,REPLACE($3,',',' '),$4;
dump c;
This can also be achieved using a UDF, which can look at all the fields in each tuple passed to it.
import java.util.HashMap;
import java.util.Iterator;
import java.util.ArrayList;
import java.io.IOException;
import java.lang.Long;
import java.lang.Exception;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.backend.executionengine.ExecException;
public class CommaRemove extends EvalFunc<Tuple> {
@Override
public Tuple exec(Tuple input) throws IOException {
if (input == null || input.size() == 0) {
return null;
}
try {
int inputSize = input.size();
Tuple output = TupleFactory.getInstance().newTuple(inputSize);
for (int i = 0; i < inputSize; i++)
{
Object field = input.get(i);
output.set(i, field == null ? null : field.toString().replace(",", ""));
}
return output;
} catch (Exception e) {
System.err.println("Failed to process input; error - " + e.getMessage());
return null;
}
}
}

Problems with setting up and accessing Distributed Cache

For some reason I can't find any good sources online for getting Distributed Cache working with the new API. Hoping someone here can explain what I'm doing wrong. My current attempt is sort of a mish-mash of various things I've found online.
This program attempts to run the k-nearest neighbors algorithm. The input file is the test dataset, while the distributed cache holds the train dataset and train labels. The mapper should take one row of test data, compare it to every row in the distributed cache data, and return the label of the row it is most similar to.
import java.net.URI;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KNNDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.printf("Usage: %s [generic options] <input dir> <output dir>\n", getClass().getSimpleName());
return -1;
}
Configuration conf = new Configuration();
// conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "^");
conf.setInt ("train_rows",1000);
conf.setInt ("test_rows",1000);
conf.setInt ("cols",612);
DistributedCache.addCacheFile(new URI("cacheData/train_sample.csv"),conf);
DistributedCache.addCacheFile(new URI("cacheData/train_labels.csv"),conf);
Job job = new Job(conf);
job.setJarByClass(KNNDriver.class);
job.setJobName("KNN");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(KNNMapper.class);
job.setReducerClass(KNNReducer.class);
// job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new KNNDriver(), args);
System.exit(exitCode);
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class KNNMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
int[][] train_vals;
int[] train_label_vals;
int train_rows;
int test_rows;
int cols;
@Override
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
// Path[] cacheFiles = context.getLocalCacheFiles();
// assign to the fields (not new locals) so map() sees the configured sizes
train_rows = conf.getInt("train_rows", 0);
test_rows = conf.getInt("test_rows", 0);
cols = conf.getInt("cols", 0);
train_vals = new int[train_rows][cols];
train_label_vals = new int[train_rows];
// read train csv, parse, and store into 2d int array
Scanner myScan;
try {
myScan = new Scanner(new File("train_sample.csv"));
//Set the delimiter used in file
myScan.useDelimiter("[,\r\n]+");
//Get all tokens and store them in some data structure
//I am just printing them
System.out.println("myScan loaded for train_sample");
for(int row = 0; row < train_rows; row++) {
for(int col = 0; col < cols; col++) {
train_vals[row][col] = Integer.parseInt(myScan.next().toString());
}
}
myScan.close();
} catch (FileNotFoundException e) {
System.out.print("Error: Train file not found.");
}
// read train_labels csv, parse, and store into 2d int array
try {
myScan = new Scanner(new File("train_labels.csv"));
//Set the delimiter used in file
myScan.useDelimiter("[,\r\n]+");
//Get all tokens and store them in some data structure
//I am just printing them
System.out.println("myScan loaded for train_sample");
for(int row = 0; row < train_rows; row++) {
train_label_vals[row] = Integer.parseInt(myScan.next().toString());
}
myScan.close();
} catch (FileNotFoundException e) {
System.out.print("Error: Train Labels file not found.");
}
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// setup() gave us train_vals & train_label_vals.
// Each line in map() represents a test observation. We iterate
// through every train_val row to find nearest L2 match, then
// return a key/value pair of <observation #,
// convert from Text to String
String line = value.toString();
double distance;
double best_distance = Double.POSITIVE_INFINITY;
int col_num;
int best_digit = -1;
IntWritable rowId = null;
int i;
IntWritable rowNum;
String[] pixels;
// comma delimited files, split on commas
// first we find the # of rows
for (i = 0; i < train_rows; i++) {
distance = 0;
col_num = 0;
pixels = line.split(",");
rowId = new IntWritable(Integer.parseInt(pixels[0]));
for (int j = 1; j < cols; j++) {
distance += Math.pow(Integer.parseInt(pixels[j]) - train_vals[i][j-1], 2); // note: ^ is XOR in Java, not exponentiation
}
if (distance < best_distance) {
best_distance = distance;
best_digit = train_label_vals[i];
}
}
context.write(rowId, new IntWritable(best_digit));
}
}
I commented out the Path... statement because I don't understand what it does or how it sends the file data to the mapper, but I noticed it listed on a couple of websites. Currently the program is not finding the distributed cache datasets even though they are uploaded to HDFS.
Try to use symlinking:
DistributedCache.createSymlink(conf);
DistributedCache.addCacheFile(new URI("cacheData/train_sample.csv#train_sample.csv"),conf);
DistributedCache.addCacheFile(new URI("cacheData/train_labels.csv#train_labels.csv"),conf);
This will make the files available in the mapper's local working directory under the names you are actually trying to access them by.
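If you are on the new org.apache.hadoop.mapreduce API (Hadoop 2.x), the same thing can be done on the Job object instead of the deprecated DistributedCache class; a rough sketch, assuming your driver builds a Job as in the question:
import java.net.URI;
import org.apache.hadoop.mapreduce.Job;
public class CacheSetup {
    // Driver-side equivalent of the DistributedCache calls; the #name fragments create the symlinks.
    public static void addTrainingFiles(Job job) throws Exception {
        job.addCacheFile(new URI("cacheData/train_sample.csv#train_sample.csv"));
        job.addCacheFile(new URI("cacheData/train_labels.csv#train_labels.csv"));
    }
}
In the mapper, context.getCacheFiles() returns the registered URIs, and the symlinked names (train_sample.csv, train_labels.csv) can be opened directly from the working directory as the setup() code already does.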
