I am trying to train custom relations in Stanford CoreNLP using the birthplace model.
I have gone through this documentation, which tells us to create a properties file (similar to roth.properties) as follows:
#Below are some basic options. See edu.stanford.nlp.ie.machinereading.MachineReadingProperties class for more options.
# Pipeline options
annotators = pos, lemma, parse
parse.maxlen = 100
# MachineReading properties. You need one class to read the dataset into correct format. See edu.stanford.nlp.ie.machinereading.domains.ace.AceReader for another example.
datasetReaderClass = edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader
#Data directory for training. The datasetReaderClass reads data from this path and makes corresponding sentences and annotations.
#Note: values in a .properties file are not quoted; surrounding double quotes would be read as part of the path.
trainPath = D:\\stanford-corenlp-full-2017-06-09\\birthplace.corp
#Whether to crossValidate, that is evaluate, or just train.
crossValidate = false
kfold = 10
#Change this to true if you want to use CoreNLP pipeline generated NER tags. The default model generated with the relation extractor release uses the CoreNLP pipeline provided tags (option set to true).
trainUsePipelineNER=false
# where to save training sentences. uses the file if it exists, otherwise creates it.
# Note: paths are written without surrounding double quotes; in a .properties file quotes would be read as part of the value.
serializedTrainingSentencesPath = D:\\stanford-corenlp-full-2017-06-09\\rel\\sentences.ser
serializedEntityExtractorPath = D:\\stanford-corenlp-full-2017-06-09\\rel\\entity_model.ser
# where to store the output of the extractor (sentence objects with relations generated by the model). This is what you will use as the model when using 'relation' annotator in the CoreNLP pipeline.
serializedRelationExtractorPath = D:\\stanford-corenlp-full-2017-06-09\\rel\\roth_relation_model_pipeline.ser
# uncomment to load a serialized model instead of retraining
# loadModel = true
#relationResultsPrinters = edu.stanford.nlp.ie.machinereading.RelationExtractorResultsPrinter,edu.stanford.nlp.ie.machinereading.domains.roth.RothResultsByRelation. For printing output of the model.
relationResultsPrinters = edu.stanford.nlp.ie.machinereading.RelationExtractorResultsPrinter
#In this domain, this is trivial since all the entities are given (or set using CoreNLP NER tagger).
entityClassifier = edu.stanford.nlp.ie.machinereading.domains.roth.RothEntityExtractor
extractRelations = true
extractEvents = false
#We are setting the entities beforehand so the model does not learn how to extract entities etc.
extractEntities = false
#Opposite of crossValidate.
trainOnly=true
# The set chosen by feature selection using RothCONLL04:
relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path
# The above features plus the features used in Bjorne BioNLP09:
# relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path,dependency_path_POS_unigrams,dependency_path_word_n_grams,dependency_path_POS_n_grams,dependency_path_edge_lowlevel_n_grams,dependency_path_edge-node-edge-grams_lowlevel,dependency_path_node-edge-node-grams_lowlevel,dependency_path_directed_bigrams,dependency_path_edge_unigrams,same_head,entity_counts
I am executing this command in my directory D:\stanford-corenlp-full-2017-06-09:
D:\stanford-corenlp-full-2017-06-09\stanford-corenlp-3.8.0\edu\stanford\nlp>java -cp classpath edu.stanford.nlp.ie.machinereading.MachineReading --arguments roth.properties
and I am getting this error
Error: Could not find or load main class edu.stanford.nlp.ie.machinereading.MachineReading
Caused by: java.lang.ClassNotFoundException: edu.stanford.nlp.ie.machinereading.MachineReading
Also I have tried to programmatically train the custom relation model with the below C# code:
using java.util;
using System.Collections.Generic;
namespace StanfordRelationDemo
{
    /// <summary>
    /// Console program that assembles the MachineReading training options in code
    /// and starts a training run for a custom relation model.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the properties, converts them to "-key value" command-line style
        /// arguments and hands them to MachineReading.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // Verbatim string literals (@"...") keep the backslashes of the Windows paths as-is.
            string jarRoot = @"D:\Stanford English Model\stanford-english-corenlp-2018-10-05-models\";
            string modelsDirectory = jarRoot + @"edu\stanford\nlp\models";
            // NOTE(review): jarRoot, modelsDirectory and sutimeRules are computed but never
            // passed to the pipeline below — confirm whether they were meant to be used.
            string sutimeRules = modelsDirectory + @"\sutime\defs.sutime.txt,"
                //+ modelsDirectory + @"\sutime\english.holidays.sutime.txt,"
                + modelsDirectory + @"\sutime\english.sutime.txt";

            // Same options as the roth.properties-style configuration, set programmatically.
            Properties props = new Properties();
            props.setProperty("annotators", "pos, lemma, parse");
            props.setProperty("parse.maxlen", "100");
            props.setProperty("datasetReaderClass", "edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader");
            props.setProperty("trainPath", "D://Stanford English Model//stanford-english-corenlp-2018-10-05-models//edu//stanford//nlp//models//birthplace.corp");
            props.setProperty("crossValidate", "false");
            props.setProperty("kfold", "10");
            // "trainOnly" is set once here (the original set the same value twice).
            props.setProperty("trainOnly", "true");
            props.setProperty("trainUsePipelineNER", "true");
            props.setProperty("serializedTrainingSentencesPath", "D://Stanford English Model//stanford-english-corenlp-2018-10-05-models//edu//stanford//nlp//models//rel//sentences.ser");
            props.setProperty("serializedEntityExtractorPath", "D://Stanford English Model//stanford-english-corenlp-2018-10-05-models//edu//stanford//nlp//models//rel//entity_model.ser");
            props.setProperty("serializedRelationExtractorPath", "D://Stanford English Model//stanford-english-corenlp-2018-10-05-models//edu//stanford//nlp//models//rel//roth_relation_model_pipeline.ser");
            props.setProperty("relationResultsPrinters", "edu.stanford.nlp.ie.machinereading.RelationExtractorResultsPrinter");
            props.setProperty("entityClassifier", "edu.stanford.nlp.ie.machinereading.domains.roth.RothEntityExtractor");
            props.setProperty("extractRelations", "true");
            props.setProperty("extractEvents", "false");
            props.setProperty("extractEntities", "false");
            props.setProperty("relationFeatures", "arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path");

            // makeMachineReading takes a string[]; flatten every property into a
            // "-key" entry followed by its value.
            var propertyKeys = props.keys();
            var propertyStringArray = new List<string>();
            while (propertyKeys.hasMoreElements())
            {
                var key = propertyKeys.nextElement();
                propertyStringArray.Add($"-{key}");
                propertyStringArray.Add(props.getProperty(key.ToString(), string.Empty));
            }

            var machineReader = edu.stanford.nlp.ie.machinereading.MachineReading.makeMachineReading(propertyStringArray.ToArray());
            var utestResultList = machineReader.run();
        }
    }
}
I am getting this exception:
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Unhandled Exception: edu.stanford.nlp.io.RuntimeIOException: Error while loading a tagger model (probably missing model file) --->
java.io.IOException: Unable to open
"edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger"
as class path, filename or URL
at edu.stanford.nlp.io.IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(String
textFileOrUrl)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.readModelAndInit(Properties
config, String modelFileOrUrl, Boolean printLoading)
--- End of inner exception stack trace ---
at edu.stanford.nlp.tagger.maxent.MaxentTagger.readModelAndInit(Properties
config, String modelFileOrUrl, Boolean printLoading)
at edu.stanford.nlp.tagger.maxent.MaxentTagger..ctor(String modelFile, Properties config, Boolean printLoading)
at edu.stanford.nlp.tagger.maxent.MaxentTagger..ctor(String modelFile)
at edu.stanford.nlp.pipeline.POSTaggerAnnotator.loadModel(String ,
Boolean )
at edu.stanford.nlp.pipeline.POSTaggerAnnotator..ctor(String annotatorName, Properties props)
at edu.stanford.nlp.pipeline.AnnotatorImplementations.posTagger(Properties
properties)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.lambda$getNamedAnnotators$42(Properties
, AnnotatorImplementations )
at edu.stanford.nlp.pipeline.StanfordCoreNLP.<>Anon4.apply(Object ,
Object )
at edu.stanford.nlp.pipeline.StanfordCoreNLP.lambda$getDefaultAnnotatorPool$65(Entry
, Properties , AnnotatorImplementations )
at edu.stanford.nlp.pipeline.StanfordCoreNLP.<>Anon27.get()
at edu.stanford.nlp.util.Lazy.3.compute()
at edu.stanford.nlp.util.Lazy.get()
at edu.stanford.nlp.pipeline.AnnotatorPool.get(String name)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(Properties ,
Boolean , AnnotatorImplementations , AnnotatorPool )
at edu.stanford.nlp.pipeline.StanfordCoreNLP..ctor(Properties props, Boolean enforceRequirements, AnnotatorPool annotatorPool)
at edu.stanford.nlp.pipeline.StanfordCoreNLP..ctor(Properties props, Boolean enforceRequirements)
at edu.stanford.nlp.ie.machinereading.MachineReading.makeMachineReading(String[]
args)
at StanfordRelationDemo.Program.Main(String[] args) in C:\Users\m1039332\Documents\Visual Studio
2017\Projects\StanfordRelationDemo\StanfordRelationDemo\Program.cs:line
46
In short, I am unable to train a custom relation model using CoreNLP. If I am making any obvious mistakes, I would appreciate it if somebody could point them out.
I don't think the machine reading code is distributed with the standard distribution.
You should build a jar from the full GitHub repository:
https://github.com/stanfordnlp/CoreNLP/tree/master/src/edu/stanford/nlp/ie/machinereading
I am using Stanford CoreNLP for extraction. Below is the sentence from which I am trying to extract the currency amount along with the currency symbol:
5 March 2015 Kering Issue of €500,000,000 0.875 per cent
The data that I need to extract is €500,000,000 0.875
By default, CoreNLP gives the sentence as
5 March 2015 Kering Issue of **$**500,000,000 0.875 per cent
So i wrote
// Tokenizer factory producing CoreLabel tokens, created with the option string "normalizeCurrency=false".
public static readonly TokenizerFactory TokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
"normalizeCurrency=false");
// Read the text chunk through a DocumentPreprocessor that uses the factory above.
DocumentPreprocessor docPre = new DocumentPreprocessor(new java.io.StringReader(textChunk));
docPre.setTokenizerFactory(TokenizerFactory);
Now the sentence is coming properly as
5 March 2015 Kering Issue of €500,000,000 0.875 per cent
But when I do
// Annotator chain for the full pipeline.
props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, regexner");
// Sets ner.useSUTime to "0".
props.setProperty("ner.useSUTime", "0");
// Build the pipeline from props and annotate the input text.
_pipeline = new StanfordCoreNLP(props);
Annotation document = new Annotation(text);
_pipeline.annotate(document);
where text = 5 March 2015 Kering Issue of €500,000,000 0.875 per cent
am getting output as
<token id="9">
<word>$</word>
<lemma></lemma>
<CharacterOffsetBegin>48</CharacterOffsetBegin>
<CharacterOffsetEnd>49</CharacterOffsetEnd>
<POS>CD</POS>
<NER>MONEY</NER>
<NormalizedNER>$5.000000000875E9</NormalizedNER>
</token>
So I added the line props.put("tokenize.options", "normalizeCurrency=false");
But the output is still the same, with $5.000000000875E9.
Can anybody please help me? Thank you.
When I ran this code it didn't change the currency symbol to "$":
package edu.stanford.nlp.examples;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import java.util.*;
/**
 * Runs the tokenize and ssplit annotators over a sample sentence with the
 * tokenizer option {@code normalizeCurrency=false} and prints every token.
 */
public class TokenizeOptionsExample {

    public static void main(String[] args) {
        // Pipeline configuration: tokenizer and sentence splitter only,
        // with the currency-normalization option switched off.
        Properties config = new Properties();
        config.setProperty("annotators", "tokenize,ssplit");
        config.setProperty("tokenize.options", "normalizeCurrency=false");
        StanfordCoreNLP corenlp = new StanfordCoreNLP(config);

        // Annotate the sample sentence containing the euro amount.
        Annotation sample = new Annotation("5 March 2015 Kering Issue of €500,000,000 0.875 per cent");
        corenlp.annotate(sample);

        // Print the tokens one per line, in document order.
        List<CoreLabel> tokens = sample.get(CoreAnnotations.TokensAnnotation.class);
        for (int i = 0; i < tokens.size(); i++) {
            System.out.println(tokens.get(i));
        }
    }
}
//tagger
// The single-String MaxentTagger constructor is given the first command-line argument as-is.
MaxentTagger tagger = new MaxentTagger(args[0]);
// Tokenizer factory producing CoreLabel tokens, created with the option string "untokenizable=noneKeep".
TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
"untokenizable=noneKeep");
// Input is read from the file named by args[1], decoded as UTF-8.
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
// Output is written to System.out, encoded as UTF-8.
PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
// The preprocessor reads from r and tokenizes with the factory created above.
DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
// Tag each sentence yielded by the preprocessor and print the tagged words.
for (List<HasWord> sentence : documentPreprocessor) {
List<TaggedWord> tSentence = tagger.tagSentence(sentence);
pw.println(Sentence.listToString(tSentence, false));
}
It fails with the following exception:
Reading POS tagger model from C:\work\development\workspace\stanfordnlp\sample.txt ...
C:\work\development\workspace\stanfordnlp\sample.txtException in thread "main" edu.stanford.nlp.io.RuntimeIOException: Error while loading a tagger model (probably missing model file)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.readModelAndInit(MaxentTagger.java:869)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.readModelAndInit(MaxentTagger.java:767)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.<init>(MaxentTagger.java:298)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.<init>(MaxentTagger.java:263)
at phoenix.TokenizerDemo.main(TokenizerDemo.java:42)
Caused by: java.io.StreamCorruptedException: invalid stream header: 416E6F74
at java.io.ObjectInputStream.readStreamHeader(Unknown Source)
at java.io.ObjectInputStream.<init>(Unknown Source)
at edu.stanford.nlp.tagger.maxent.TaggerConfig.readConfig(TaggerConfig.java:748)
at edu.stanford.nlp.tagger.maxent.MaxentTagger.readModelAndInit(MaxentTagger.java:804)
... 4 more
The log should clearly indicate the problem:
Reading POS tagger model from C:\work\development\workspace\stanfordnlp\sample.txt ...
You are incorrectly instantiating the MaxentTagger instance. If you provide a single string argument to the constructor, that string is expected to provide a path to a tagger model file.
See the documentation for MaxentTagger for more information.
Hi, I am using the pipeline object to parse forum posts. For each one I do the following:
// Wrap the post text in an Annotation and run the pipeline over it.
Annotation document = new Annotation(post);
mPipeline.annotate(document); // annotate the post text
I would like each call to annotate to timeout after a few seconds.
I have followed the example at line 65: https://github.com/stanfordnlp/CoreNLP/blob/master/itest/src/edu/stanford/nlp/pipeline/ParserAnnotatorITest.java
So i am creating the pipeline object as follows :
// Pipeline configuration with maxtime set for the parse and dcoref annotators.
Properties props = new Properties();
// NOTE(review): the unit of the maxtime values is not shown here — confirm against the
// CoreNLP documentation (the intent stated in the question is a timeout of a few seconds).
props.setProperty("parse.maxtime", "30");
props.setProperty("dcoref.maxtime", "30");
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
mPipeline = new StanfordCoreNLP(props);
However, when I add the maxtime properties I get the following exception:
Exception in thread "main" java.lang.NullPointerException
at edu.stanford.nlp.pipeline.SentenceAnnotator.annotate(SentenceAnnotator.java:64)
at edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:68)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:412)
Without the maxtime option there is no exception.
How can I set maxtime properly?
Thanks
I have made a basic application where I use Stanford Parser through the OpenIE lib, and when StanfordCoreNLP is initialized with the specified properties it stops at pos (the POS tagger). I do think all the required models are included, so I am not sure why the process is unable to find the model data.
// Configure the annotator chain and build the pipeline.
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Annotate the input text.
Annotation doc = new Annotation(testString);
pipeline.annotate(doc);
// For every sentence, print each relation triple as tab-separated
// confidence, subject, relation and object lemma glosses.
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class))
{
// NOTE(review): the annotators list above does not name an OpenIE annotator — confirm that
// RelationTriplesAnnotation is actually populated, otherwise triples may be null here.
Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
for (RelationTriple triple : triples) {
System.out.println(triple.confidence + "\t" +
triple.subjectLemmaGloss() + "\t" +
triple.relationLemmaGloss() + "\t" +
triple.objectLemmaGloss());
}
}
The includes are
stanford-parser.jar
stanford-parser-3.5.2-models.jar
stanford-openie.jar
stanford-openie-models.jar
JDK1.8
Stack-trace when the StanfordCoreNLP class is initiated.
Reading POS tagger model from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [1.2 sec].
Exception in thread "AWT-EventQueue-0" java.lang.NoSuchFieldError: REQUIREMENTS
at edu.stanford.nlp.pipeline.POSTaggerAnnotator.requires(POSTaggerAnnotator.java:169)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:362)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.<init>(StanfordCoreNLP.java:131)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.<init>(StanfordCoreNLP.java:127)
at org.sjdp.q2sm.GUI.extractTriplets(GUI.java:373)
at org.sjdp.q2sm.GUI.processSPARQL(GUI.java:353)
at org.sjdp.q2sm.GUI.actionPerformed(GUI.java:153)
at javax.swing.AbstractButton.fireActionPerformed(AbstractButton.java:2022)
at javax.swing.AbstractButton$Handler.actionPerformed(AbstractButton.java:2348)
at javax.swing.DefaultButtonModel.fireActionPerformed(DefaultButtonModel.java:402)
at javax.swing.DefaultButtonModel.setPressed(DefaultButtonModel.java:259)
at javax.swing.plaf.basic.BasicButtonListener.mouseReleased(BasicButtonListener.java:252)
at java.awt.AWTEventMulticaster.mouseReleased(AWTEventMulticaster.java:289)
at java.awt.Component.processMouseEvent(Component.java:6535)
at javax.swing.JComponent.processMouseEvent(JComponent.java:3324)
at java.awt.Component.processEvent(Component.java:6300)
at java.awt.Container.processEvent(Container.java:2236)
at java.awt.Component.dispatchEventImpl(Component.java:4891)
at java.awt.Container.dispatchEventImpl(Container.java:2294)
at java.awt.Component.dispatchEvent(Component.java:4713)
at java.awt.LightweightDispatcher.retargetMouseEvent(Container.java:4888)
at java.awt.LightweightDispatcher.processMouseEvent(Container.java:4525)
at java.awt.LightweightDispatcher.dispatchEvent(Container.java:4466)
at java.awt.Container.dispatchEventImpl(Container.java:2280)
at java.awt.Window.dispatchEventImpl(Window.java:2750)
at java.awt.Component.dispatchEvent(Component.java:4713)
at java.awt.EventQueue.dispatchEventImpl(EventQueue.java:758)
at java.awt.EventQueue.access$500(EventQueue.java:97)
at java.awt.EventQueue$3.run(EventQueue.java:709)
at java.awt.EventQueue$3.run(EventQueue.java:703)
at java.security.AccessController.doPrivileged(Native Method)
at java.security.ProtectionDomain$JavaSecurityAccessImpl.doIntersectionPrivilege(ProtectionDomain.java:76)
at java.security.ProtectionDomain$JavaSecurityAccessImpl.doIntersectionPrivilege(ProtectionDomain.java:86)
at java.awt.EventQueue$4.run(EventQueue.java:731)
at java.awt.EventQueue$4.run(EventQueue.java:729)
at java.security.AccessController.doPrivileged(Native Method)
at java.security.ProtectionDomain$JavaSecurityAccessImpl.doIntersectionPrivilege(ProtectionDomain.java:76)
at java.awt.EventQueue.dispatchEvent(EventQueue.java:728)
at java.awt.EventDispatchThread.pumpOneEventForFilters(EventDispatchThread.java:201)
at java.awt.EventDispatchThread.pumpEventsForFilter(EventDispatchThread.java:116)
at java.awt.EventDispatchThread.pumpEventsForHierarchy(EventDispatchThread.java:105)
at java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:101)
at java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:93)
at java.awt.EventDispatchThread.run(EventDispatchThread.java:82)
This is almost certainly a class incompatibility bug from including both the parser and the OpenIE system at the same time. It'll be fixed in the next release (3.5.3) when everything syncs up again, but since OpenIE didn't exist at the 3.5.2 release the code in that jar is a bit "ahead" of the parser.
The easiest way to run the two at the same time is to run from the Github version of CoreNLP. The command ant jar should create a jar file which contains both the parser and the OpenIE system. The most recent models (warning: large download) should have both the parser and OpenIE models.