I'm trying to do sentiment analysis on Amazon product reviews, using the spaCy module to preprocess the text data. The code I'm using is exactly the code in the link; I only modified the dataset to match what's shown there. I'm getting this error:
TypeError Traceback (most recent call last)
<ipython-input-139-bcbf2d3c9cce> in <module>
4 ('classifier', classifier)])
5 # Fit our data
----> 6 pipe_countvect.fit(X_train,y_train)
7 # Predicting with a test dataset
8 sample_prediction = pipe_countvect.predict(X_test)
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\.conda\envs\py36\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1197
1198 vocabulary, X = self._count_vocab(raw_documents,
-> 1199 self.fixed_vocabulary_)
1200
1201 if self.binary:
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1108 for doc in raw_documents:
1109 feature_counter = {}
-> 1110 for feature in analyze(doc):
1111 try:
1112 feature_idx = vocabulary[feature]
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
104 doc = preprocessor(doc)
105 if tokenizer is not None:
--> 106 doc = tokenizer(doc)
107 if ngrams is not None:
108 if stop_words is not None:
TypeError: 'str' object is not callable
I'm not sure what's causing this error or how to get rid of it. I'm pretty sure the count vectorizer produces a sparse matrix, not a string one. One thing I've considered is the spaCy tokenizer: the link uses vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)), but when I ran the program it said spacy_tokenizer was undefined, so I used vectorizer = CountVectorizer(tokenizer = 'spacy', ngram_range=(1,1)) instead. If I remove this, I don't know how else to use the spaCy tokenizer, and either way I'm not certain this is the actual cause of the problem. Please help me out!
The error comes at this line:
doc = tokenizer(doc)
Since it says 'str' object is not callable, and the only thing being called on this line is the tokenizer, your tokenizer is a string for some reason. That is exactly what tokenizer = 'spacy' does: CountVectorizer expects a callable (or None) for its tokenizer parameter, stores whatever you pass, and then calls it on each document, so the string 'spacy' fails right here.
Based on the code you linked, the spacy_tokenizer object is being configured incorrectly. But that variable isn't defined anywhere in the linked code despite being passed as an option, so the code you linked to can't possibly run as shown.
It would help if you could put together a minimal example that you can actually paste into the question here.
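In the meantime, here is a minimal sketch of the usual way to wire spaCy into CountVectorizer. The model name en_core_web_sm and the filtering inside spacy_tokenizer are my assumptions, since the linked code isn't visible here:
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Assumes the small English model is installed:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    # Lemmatize with spaCy, dropping stop words and punctuation.
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if not tok.is_stop and not tok.is_punct
    ]

# Pass the function object itself, not the string 'spacy':
# CountVectorizer calls tokenizer(doc) on every document.
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
With a real callable in place, fit_transform returns the sparse count matrix you were expecting.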
Related
I am trying to get the SHAP values for a masked language modeling task using a transformer. I get the error KeyError: 'label' in the code where I pass a single data sample to get the explanation. My complete code and error trace are as follows:
import transformers
import shap
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch
model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')
code_example = "if (x <mask> 10)"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
explainer = shap.Explainer(fill_mask)
shap_values = explainer(['x {tokenizer.mask_token} 10'])
Following is the error trace
KeyError Traceback (most recent call last)
<ipython-input-12-bb3832d1772d> in <module>
6 # explain the model on two sample inputs
7 explainer = shap.Explainer(fill_mask)
----> 8 shap_values = explainer(['x {tokenizer.mask_token} 10'])
9 print(shap_values)
10 # visualize the first prediction's explanation for the POSITIVE output class
/usr/local/lib/python3.7/dist-packages/shap/explainers/_partition.py in __call__(self, max_evals, fixed_context, main_effects, error_bounds, batch_size, outputs, silent, *args)
136 return super().__call__(
137 *args, max_evals=max_evals, fixed_context=fixed_context, main_effects=main_effects, error_bounds=error_bounds, batch_size=batch_size,
--> 138 outputs=outputs, silent=silent
139 )
140
/usr/local/lib/python3.7/dist-packages/shap/explainers/_explainer.py in __call__(self, max_evals, main_effects, error_bounds, batch_size, outputs, silent, *args, **kwargs)
266 row_result = self.explain_row(
267 *row_args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds,
--> 268 batch_size=batch_size, outputs=outputs, silent=silent, **kwargs
269 )
270 values.append(row_result.get("values", None))
/usr/local/lib/python3.7/dist-packages/shap/explainers/_partition.py in explain_row(self, max_evals, main_effects, error_bounds, batch_size, outputs, silent, fixed_context, *row_args)
159 # if not fixed background or no base value assigned then compute base value for a row
160 if self._curr_base_value is None or not getattr(self.masker, "fixed_background", False):
--> 161 self._curr_base_value = fm(m00.reshape(1, -1), zero_index=0)[0] # the zero index param tells the masked model what the baseline is
162 f11 = fm(~m00.reshape(1, -1))[0]
163
/usr/local/lib/python3.7/dist-packages/shap/utils/_masked_model.py in __call__(self, masks, zero_index, batch_size)
65
66 else:
---> 67 return self._full_masking_call(masks, batch_size=batch_size)
68
69 def _full_masking_call(self, masks, zero_index=None, batch_size=None):
/usr/local/lib/python3.7/dist-packages/shap/utils/_masked_model.py in _full_masking_call(self, masks, zero_index, batch_size)
142
143 joined_masked_inputs = tuple([np.concatenate(v) for v in all_masked_inputs])
--> 144 outputs = self.model(*joined_masked_inputs)
145 _assert_output_input_match(joined_masked_inputs, outputs)
146 all_outputs.append(outputs)
/usr/local/lib/python3.7/dist-packages/shap/models/_transformers_pipeline.py in __call__(self, strings)
33 val = [val]
34 for obj in val:
---> 35 output[i, self.label2id[obj["label"]]] = sp.special.logit(obj["score"]) if self.rescale_to_logits else obj["score"]
36 return output
KeyError: 'label'
I am learning to use ruamel.yaml, and I am wondering whether it supports type tags the way plain YAML does, like "num: !!float 4"?
The file is like:
num: !!float 4
I tried to load a file like this, but got an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [22], in <cell line: 2>()
1 from ruamel import yaml
2 with open("net.yaml", "r", encoding="utf-8") as yaml_file:
----> 3 yaml_dict = yaml.round_trip_load(yaml_file)
4 yaml_dict
...
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:1469, in RoundTripConstructor.construct_mapping(self, node, maptyp, deep)
1462 if not isinstance(key, Hashable):
1463 raise ConstructorError(
1464 'while constructing a mapping',
1465 node.start_mark,
1466 'found unhashable key',
1467 key_node.start_mark,
1468 )
-> 1469 value = self.construct_object(value_node, deep=deep)
1470 if self.check_mapping_key(node, key_node, maptyp, key, value):
1471 if key_node.comment and len(key_node.comment) > 4 and key_node.comment[4]:
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:146, in BaseConstructor.construct_object(self, node, deep)
142 # raise ConstructorError(
143 # None, None, 'found unconstructable recursive node', node.start_mark
144 # )
145 self.recursive_objects[node] = None
--> 146 data = self.construct_non_recursive_object(node)
148 self.constructed_objects[node] = data
149 del self.recursive_objects[node]
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:181, in BaseConstructor.construct_non_recursive_object(self, node, tag)
179 constructor = self.__class__.construct_mapping
180 if tag_suffix is None:
--> 181 data = constructor(self, node)
182 else:
183 data = constructor(self, tag_suffix, node)
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:1271, in RoundTripConstructor.construct_yaml_float(self, node)
1259 return ScalarFloat(
1260 sign * float(value_s),
1261 width=width,
(...)
1268 anchor=node.anchor,
1269 )
1270 width = len(value_so)
-> 1271 prec = value_so.index('.') # you can use index, this would not be float without dot
1272 lead0 = leading_zeros(value_so)
1273 return ScalarFloat(
1274 sign * float(value_s),
1275 width=width,
(...)
1279 anchor=node.anchor,
1280 )
ValueError: substring not found
Why do I get this error, and how do I get rid of it?
That is a bug in ruamel.yaml<=0.17.21. The comment on the offending line (1271) says
# you can use index, this would not be float without dot
Obviously the author of that comment didn't know what he was talking about, as in your case, when using !!float 4 you have a float without a dot...
It is trivial to "fix" that by replacing index with find on line 1271; with that change your document loads, and you can dump the data.
But the corresponding representer for dumping doesn't cope with that and outputs the float as 4.0, dropping the tag.
You could temporarily fix this by registering a simpler float constructor (e.g. the simple one from the SafeLoader), although this will affect all floats:
import sys
import ruamel.yaml
yaml_str = """\
num: !!float 4
"""
yaml = ruamel.yaml.YAML()
yaml.constructor.add_constructor(
'tag:yaml.org,2002:float', ruamel.yaml.constructor.SafeConstructor.construct_yaml_float
)
data = yaml.load(yaml_str)
yaml.dump(data, sys.stdout)
which gives:
num: 4.0
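Since the SafeLoader constructor returns a plain Python float, you can work with the loaded value directly, for example:
print(data['num'], type(data['num']))  # 4.0 <class 'float'>
As the dump above shows, the !!float tag is still dropped on output.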
I want to edit a tree that I got from BEAST2 treeannotator in Nexus format.
Usually I use the module Phylo from Biopython for such work, but Phylo.read(r"filename.tree", "nexus") gave me the following exception:
---------------------------------------------------------------------------
NexusError Traceback (most recent call last)
Input In [29], in <cell line: 1>()
----> 1 Phylo.read(r"filename.tree", "nexus")
File ~\miniconda3\lib\site-packages\Bio\Phylo\_io.py:60, in read(file, format, **kwargs)
58 try:
59 tree_gen = parse(file, format, **kwargs)
---> 60 tree = next(tree_gen)
61 except StopIteration:
62 raise ValueError("There are no trees in this file.") from None
File ~\miniconda3\lib\site-packages\Bio\Phylo\_io.py:49, in parse(file, format, **kwargs)
34 """Parse a file iteratively, and yield each of the trees it contains.
35
36 If a file only contains one tree, this still returns an iterable object that
(...)
46
47 """
48 with File.as_handle(file) as fp:
---> 49 yield from getattr(supported_formats[format], "parse")(fp, **kwargs)
File ~\miniconda3\lib\site-packages\Bio\Phylo\NexusIO.py:40, in parse(handle)
32 def parse(handle):
33 """Parse the trees in a Nexus file.
34
35 Uses the old Nexus.Trees parser to extract the trees, converts them back to
(...)
38 eventually change Nexus to use the new NewickIO parser directly.)
39 """
---> 40 nex = Nexus.Nexus(handle)
42 # NB: Once Nexus.Trees is modified to use Tree.Newick objects, do this:
43 # return iter(nex.trees)
44 # Until then, convert the Nexus.Trees.Tree object hierarchy:
45 def node2clade(nxtree, node):
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:668, in Nexus.__init__(self, input)
665 self.options["gapmode"] = "missing"
667 if input:
--> 668 self.read(input)
669 else:
670 self.read(DEFAULTNEXUS)
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:718, in Nexus.read(self, input)
716 break
717 if title in KNOWN_NEXUS_BLOCKS:
--> 718 self._parse_nexus_block(title, contents)
719 else:
720 self._unknown_nexus_block(title, contents)
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:759, in Nexus._parse_nexus_block(self, title, contents)
757 for line in block.commandlines:
758 try:
--> 759 getattr(self, "_" + line.command)(line.options)
760 except AttributeError:
761 raise NexusError("Unknown command: %s " % line.command) from None
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:1144, in Nexus._translate(self, options)
1142 break
1143 elif c != ",":
-> 1144 raise NexusError("Missing ',' in line %s." % options)
1145 except NexusError:
1146 raise
NexusError: Missing ',' in line 1 AB298157.1_2015_-7.9133750332192605_114.8086828279248, 2 AB298158.1_2007_-8.41698974207…
Using Nexus.read(Nexus(), input=r"filename.tree") gave the same result. Could anyone please help with this? I cannot understand the reason for this error, because the Nexus file looks correct.
The reason is that Biopython cannot read Nexus trees that use a translation table, i.e. trees built from a Translate block plus a newick tree with numeric labels. So you first have to convert the file so that the full names appear directly in the newick tree (as below).
Begin trees;
tree TREE1 = (((your,tree),(in,(the, newick))),format);
End;
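One rough way to do that conversion is sketched below; the file name, the regular expressions, and the assumption of a single Translate block and a single tree statement are all mine, and BEAST-style [&...] metadata comments are stripped because Phylo's newick parser does not understand them either:
import re
from io import StringIO
from Bio import Phylo

with open("filename.tree") as fh:
    text = fh.read()

# Build the number -> full-name mapping from the Translate block.
block = re.search(r"translate\s+(.*?);", text, re.S | re.I).group(1)
names = {}
for line in block.splitlines():
    line = line.strip().rstrip(",")
    if line:
        num, name = line.split(None, 1)
        names[num] = name

# Pull the newick part out of the tree statement and drop [&...] comments.
newick = re.search(r"tree\s+\S+\s*=\s*(.*?;)", text, re.S | re.I).group(1)
newick = re.sub(r"\[[^\]]*\]", "", newick)

# Replace each numeric leaf label with its full name.
newick = re.sub(
    r"(?<=[(,])(\d+)(?=[:,)])",
    lambda m: names.get(m.group(1), m.group(1)),
    newick,
)

tree = Phylo.read(StringIO(newick), "newick")
From there the tree can be edited and written back out with Phylo.write.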
P.S. The newick format allows labels to be surrounded by quotes, and some programs or scripts add quotes to names that contain ambiguous characters. But this can lead to exceptions in downstream phylogenetic analysis, for instance in BEAST, so be careful with it.
I am trying to get an element like this by XPath, using contains() on its text.
<p><strong>Полное наименование</strong></p>
As a result I am getting this error.
In [4]: response.xpath("//p[contains(text(),'Полное')]").extract()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-7e122465e645> in <module>()
----> 1 response.xpath("//p[contains(text(),'Полное')]").extract()
c:\python27\lib\site-packages\scrapy\http\response\text.pyc in xpath(self, query, **kwargs)
117
118 def xpath(self, query, **kwargs):
--> 119 return self.selector.xpath(query, **kwargs)
120
121 def css(self, query):
c:\python27\lib\site-packages\parsel\selector.pyc in xpath(self, query, namespaces, **kwargs)
226 result = xpathev(query, namespaces=nsp,
227 smart_strings=self._lxml_smart_strings,
--> 228 **kwargs)
229 except etree.XPathError as exc:
230 msg = u"XPath error: %s in %s" % (exc, query)
src\lxml\etree.pyx in lxml.etree._Element.xpath()
src\lxml\xpath.pxi in lxml.etree.XPathElementEvaluator.__call__()
src\lxml\apihelpers.pxi in lxml.etree._utf8()
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
Here is my XPath:
response.xpath("//p[contains(text(),'Полное')]").extract()
'Полное' is the Russian text I use for the search.
How do I fix the error?
Prefix your expression string with a u to make it a unicode string. In Python 2 a plain string literal is a byte string, and lxml only accepts XML-compatible strings (unicode or ASCII), which is what the ValueError is complaining about:
response.xpath(u"//p[contains(text(),'Полное')]").extract()
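A quick way to see the distinction in a Python 2 shell (in Python 3 every string literal is already unicode, so this issue disappears there):
>>> type("Полное")   # plain literal is a byte string in Python 2
<type 'str'>
>>> type(u"Полное")  # u-prefixed literal is unicode
<type 'unicode'>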
I am new to using Pandas on Windows and I'm not sure what I am doing wrong here.
My data is located at 'C:\Users\me\data\lending_club\loan.csv'
path = 'C:\\Users\\me\\data\\lending_club\\loan.csv'
pd.read_csv(path)
And I get this error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-107-b5792b17a3c3> in <module>()
1 path = 'C:\\Users\\me\\data\\lending_club\\loan.csv'
----> 2 pd.read_csv(path)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
707 skip_blank_lines=skip_blank_lines)
708
--> 709 return _read(filepath_or_buffer, kwds)
710
711 parser_f.__name__ = name
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
447
448 # Create the parser.
--> 449 parser = TextFileReader(filepath_or_buffer, **kwds)
450
451 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
816 self.options['has_index_names'] = kwds['has_index_names']
817
--> 818 self._make_engine(self.engine)
819
820 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1047 def _make_engine(self, engine='c'):
1048 if engine == 'c':
-> 1049 self._engine = CParserWrapper(self.f, **self.options)
1050 else:
1051 if engine == 'python':
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
1693 kwds['allow_leading_cols'] = self.index_col is not False
1694
-> 1695 self._reader = parsers.TextReader(src, **kwds)
1696
1697 # XXX
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: File b'C:\\Users\\me\\data\\lending_club\\loan.csv' does not exist
EDIT:
I re-installed Anaconda and the error went away. I'm not sure exactly what was going on, but it may have been related to the first install being global while the second was user-specific. Thanks for the help, everybody!
Just use forward slashes ('/') instead of backslashes ('\'):
path = 'C:/Users/me/data/lending_club/loan.csv'
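For example (the os.path.exists check is just a quick sanity test I would add, and a raw string is an equally valid alternative):
import os
import pandas as pd

path = 'C:/Users/me/data/lending_club/loan.csv'
# A raw string also works if you prefer backslashes:
# path = r'C:\Users\me\data\lending_club\loan.csv'

print(os.path.exists(path))  # confirm the file is really there
df = pd.read_csv(path)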
Python only has access to the current folder's files by default.
If you want to access files from another folder, try this:
import sys
sys.path.insert(0, 'C:/Users/myFolder')
Those lines allow your script to access another folder. Be careful: you should use forward slashes (/), not backslashes (\).