How to use StudentT distribution in pymc3?

I'm not sure whether this counts as a question or a bug report. I posted a GitHub gist here: https://gist.github.com/jbwhit/a9012e04b0f48e582c22
I found this question (pymc3: hierarchical model with multiple observed variables) to be an excellent starting point for my own hierarchical model, but ran into difficulties as soon as I tried to modify it in any substantial way.
First, the model and setup that works:
import numpy as np
import pymc3 as pm

n_individuals = 200
points_per_individual = 10
means = np.random.normal(30, 12, n_individuals)
observed = np.random.normal(means, 1, (points_per_individual, n_individuals))

model = pm.Model()
with model:
    hyper_mean = pm.Normal('hyper_mean', mu=0, sd=100)
    hyper_sigma = pm.HalfNormal('hyper_sigma', sd=3)
    means = pm.Normal('means', mu=hyper_mean, sd=hyper_sigma, shape=n_individuals)
    sigmas = pm.HalfNormal('sigmas', sd=100)
    ye = pm.Normal('ye', mu=means, sd=sigmas, observed=observed)
    trace = pm.sample(10000)
All of the above works as expected (and the traces look nice). The next piece of code makes one change (swapping a T distribution for the Normal):
model = pm.Model()
with model:
    hyper_mean = pm.Normal('hyper_mean', mu=0, sd=100)
    hyper_sigma = pm.HalfNormal('hyper_sigma', sd=3)
    ### Changed to a T distribution ###
    means = pm.StudentT('means', nu=hyper_mean, sd=hyper_sigma, shape=n_individuals)
    sigmas = pm.HalfNormal('sigmas', sd=100)
    ye = pm.Normal('ye', mu=means, sd=sigmas, observed=observed)
    trace = pm.sample(10000)
The following is the output:
Assigned NUTS to hyper_mean
Assigned NUTS to hyper_sigma_log
Assigned NUTS to means
Assigned NUTS to sigmas_log
---------------------------------------------------------------------------
PositiveDefiniteError Traceback (most recent call last)
<ipython-input-12-69f59e2f3d47> in <module>()
18 ye = pm.Normal('ye', mu=means, sd=sigmas, observed=observed)
19
---> 20 trace = pm.sample(10000)
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/sampling.py in sample(draws, step, start, trace, chain, njobs, tune, progressbar, model, random_seed)
121 """
122 model = modelcontext(model)
--> 123
124 step = assign_step_methods(model, step)
125
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/sampling.py in assign_step_methods(model, step, methods)
66 selected_steps[selected].append(var)
67
---> 68 # Instantiate all selected step methods
69 steps += [s(vars=selected_steps[s]) for s in selected_steps if selected_steps[s]]
70
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/sampling.py in <listcomp>(.0)
66 selected_steps[selected].append(var)
67
---> 68 # Instantiate all selected step methods
69 steps += [s(vars=selected_steps[s]) for s in selected_steps if selected_steps[s]]
70
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/step_methods/nuts.py in __init__(self, vars, scaling, step_scale, is_cov, state, Emax, target_accept, gamma, k, t0, model, profile, **kwargs)
76
77
---> 78 self.potential = quad_potential(scaling, is_cov, as_cov=False)
79
80 if state is None:
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/step_methods/quadpotential.py in quad_potential(C, is_cov, as_cov)
33 return QuadPotential_SparseInv(C)
34
---> 35 partial_check_positive_definite(C)
36 if C.ndim == 1:
37 if is_cov != as_cov:
/Users/jonathan/miniconda2/envs/pymc3/lib/python3.5/site-packages/pymc3/step_methods/quadpotential.py in partial_check_positive_definite(C)
56 if len(i):
57 raise PositiveDefiniteError(
---> 58 "Simple check failed. Diagonal contains negatives", i)
59
60
PositiveDefiniteError: Scaling is not positive definite. Simple check failed. Diagonal contains negatives. Check indexes [202]
Any suggestion on how to get this to work?

As I mentioned in the comment, try running:
model = pm.Model()
with model:
    hyper_mean = pm.Normal('hyper_mean', mu=0, sd=100)
    hyper_sigma = pm.HalfNormal('hyper_sigma', sd=3)
    nu = pm.Exponential('nu', 1./10, testval=5.)
    ### Changed to a T distribution ###
    means = pm.StudentT('means', nu=nu, mu=hyper_mean, sd=hyper_sigma, shape=n_individuals)
    sigmas = pm.HalfNormal('sigmas', sd=100)
    ye = pm.Normal('ye', mu=means, sd=sigmas, observed=observed)
    trace = pm.sample(10000)
In other words: pass hyper_mean to the mu argument of pm.StudentT, and use nu for the degrees of freedom.
Once it starts working, you might also try adding pm.find_MAP (as suggested by Chris Fonnesbeck).

Try finding the MAP estimate and use that as the starting point for the MCMC run:
with model:
    start = pm.find_MAP()
    trace = pm.sample(10000, start=start)
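Putting the two answers together, here is a minimal end-to-end sketch (same simulated data as in the question, using the same pymc3-era sd/testval keyword arguments as above):

import numpy as np
import pymc3 as pm

n_individuals = 200
points_per_individual = 10
means = np.random.normal(30, 12, n_individuals)
observed = np.random.normal(means, 1, (points_per_individual, n_individuals))

with pm.Model() as model:
    hyper_mean = pm.Normal('hyper_mean', mu=0, sd=100)
    hyper_sigma = pm.HalfNormal('hyper_sigma', sd=3)
    nu = pm.Exponential('nu', 1./10, testval=5.)
    # hyper_mean goes to mu; nu is the degrees-of-freedom parameter
    means = pm.StudentT('means', nu=nu, mu=hyper_mean, sd=hyper_sigma,
                        shape=n_individuals)
    sigmas = pm.HalfNormal('sigmas', sd=100)
    ye = pm.Normal('ye', mu=means, sd=sigmas, observed=observed)
    start = pm.find_MAP()
    trace = pm.sample(10000, start=start)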

Related

Keep getting this error message "AttributeError: can't set attribute"

The code below is triggering an AttributeError: can't set attribute. I'm still new to programming, so I'm having a difficult time figuring out why this error is occurring. Any help is appreciated.
import cimcb_lite as cb

cv = cb.cross_val.kfold(model=cb.model.PLS_SIMPLS,
                        X=XTknn,
                        Y=Ytrain,
                        param_dict={'n_components': [1, 2, 3, 4, 5]},
                        folds=5,
                        bootnum=100)
cv.run()
I'm seeing this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/rs/f6nsd1894354_821jj157jnr0000gn/T/ipykernel_30013/1292624611.py in <module>
8
9 # run the cross validation
---> 10 cv.run()
11
/opt/anaconda3/lib/python3.9/site-packages/cimcb_lite/cross_val/kfold.py in run(self)
82 def run(self):
83 """Runs all functions prior to plot."""
---> 84 self.calc_ypred()
85 self.calc_stats()
86 if self.bootnum > 1:
/opt/anaconda3/lib/python3.9/site-packages/cimcb_lite/cross_val/kfold.py in calc_ypred(self)
55 model_i = self.model(**params_i)
56 # Full
---> 57 model_i.train(self.X, self.Y)
58 ypred_full_i = model_i.test(self.X)
59 self.ypred_full.append(ypred_full_i)
/opt/anaconda3/lib/python3.9/site-packages/cimcb_lite/model/PLS_SIMPLS.py in train(self, X, Y)
77 # Calculates and store attributes of PLS SIMPLS
78 Xscores, Yscores, Xloadings, Yloadings, Weights, Beta = self.pls_simpls(X, Y, ncomp=self.n_component)
---> 79 self.model.x_scores_ = Xscores
80 self.model.y_scores_ = Yscores
81 self.model.x_loadings_ = Xloadings
AttributeError: can't set attribute
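For context, "AttributeError: can't set attribute" is what Python (before 3.11) raises when code assigns to a property that has no setter. A minimal, hypothetical illustration of that failure mode (not cimcb_lite's actual class) is below; the assignment self.model.x_scores_ = Xscores in the last frame would fail in exactly this way if the wrapped scikit-learn PLS model now exposes x_scores_ as a read-only property:

class Estimator:
    @property
    def x_scores_(self):
        # read-only: there is no corresponding setter
        return getattr(self, '_scores', None)

est = Estimator()
est.x_scores_ = [1, 2, 3]  # AttributeError: can't set attribute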

How to get the shap values for the masked language modeling task using transformer?

I am trying to get the SHAP values for a masked language modeling task using a transformer. I get the error KeyError: 'label' when I pass a single data sample to the explainer. My complete code and error trace are as follows:
import transformers
import shap
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch
model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')
code_example = "if (x <mask> 10)"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
explainer = shap.Explainer(fill_mask)
shap_values = explainer(['x {tokenizer.mask_token} 10'])
Following is the error trace
KeyError Traceback (most recent call last)
<ipython-input-12-bb3832d1772d> in <module>
6 # explain the model on two sample inputs
7 explainer = shap.Explainer(fill_mask)
----> 8 shap_values = explainer(['x {tokenizer.mask_token} 10'])
9 print(shap_values)
10 # visualize the first prediction's explanation for the POSITIVE output class
5 frames
/usr/local/lib/python3.7/dist-packages/shap/explainers/_partition.py in __call__(self, max_evals, fixed_context, main_effects, error_bounds, batch_size, outputs, silent, *args)
136 return super().__call__(
137 *args, max_evals=max_evals, fixed_context=fixed_context, main_effects=main_effects, error_bounds=error_bounds, batch_size=batch_size,
--> 138 outputs=outputs, silent=silent
139 )
140
/usr/local/lib/python3.7/dist-packages/shap/explainers/_explainer.py in __call__(self, max_evals, main_effects, error_bounds, batch_size, outputs, silent, *args, **kwargs)
266 row_result = self.explain_row(
267 *row_args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds,
--> 268 batch_size=batch_size, outputs=outputs, silent=silent, **kwargs
269 )
270 values.append(row_result.get("values", None))
/usr/local/lib/python3.7/dist-packages/shap/explainers/_partition.py in explain_row(self, max_evals, main_effects, error_bounds, batch_size, outputs, silent, fixed_context, *row_args)
159 # if not fixed background or no base value assigned then compute base value for a row
160 if self._curr_base_value is None or not getattr(self.masker, "fixed_background", False):
--> 161 self._curr_base_value = fm(m00.reshape(1, -1), zero_index=0)[0] # the zero index param tells the masked model what the baseline is
162 f11 = fm(~m00.reshape(1, -1))[0]
163
/usr/local/lib/python3.7/dist-packages/shap/utils/_masked_model.py in __call__(self, masks, zero_index, batch_size)
65
66 else:
---> 67 return self._full_masking_call(masks, batch_size=batch_size)
68
69 def _full_masking_call(self, masks, zero_index=None, batch_size=None):
/usr/local/lib/python3.7/dist-packages/shap/utils/_masked_model.py in _full_masking_call(self, masks, zero_index, batch_size)
142
143 joined_masked_inputs = tuple([np.concatenate(v) for v in all_masked_inputs])
--> 144 outputs = self.model(*joined_masked_inputs)
145 _assert_output_input_match(joined_masked_inputs, outputs)
146 all_outputs.append(outputs)
/usr/local/lib/python3.7/dist-packages/shap/models/_transformers_pipeline.py in __call__(self, strings)
33 val = [val]
34 for obj in val:
---> 35 output[i, self.label2id[obj["label"]]] = sp.special.logit(obj["score"]) if self.rescale_to_logits else obj["score"]
36 return output
KeyError: 'label'
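One hint for debugging: the last frame above indexes each prediction with obj["label"], which is the shape of output that text-classification pipelines produce. A fill-mask pipeline instead returns token predictions, so there is no 'label' key to look up. A quick check (a sketch, reusing the fill_mask pipeline and tokenizer from the question):

preds = fill_mask(f"if (x {tokenizer.mask_token} 10)")
print(preds[0].keys())
# typically dict_keys(['score', 'token', 'token_str', 'sequence']) -- no 'label',
# hence the KeyError inside shap's transformers wrapper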

Problem using spacy tokenizer for count vectorizer

I'm trying to do sentiment analysis on Amazon product reviews using the Spacy module for preprocessing the text data. The code I'm using is exactly this. I modified the dataset that I'm using according to what's shown in the link. I'm getting the error:
TypeError Traceback (most recent call last)
<ipython-input-139-bcbf2d3c9cce> in <module>
4 ('classifier', classifier)])
5 # Fit our data
----> 6 pipe_countvect.fit(X_train,y_train)
7 # Predicting with a test dataset
8 sample_prediction = pipe_countvect.predict(X_test)
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\.conda\envs\py36\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\.conda\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1197
1198 vocabulary, X = self._count_vocab(raw_documents,
-> 1199 self.fixed_vocabulary_)
1200
1201 if self.binary:
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1108 for doc in raw_documents:
1109 feature_counter = {}
-> 1110 for feature in analyze(doc):
1111 try:
1112 feature_idx = vocabulary[feature]
~\.conda\envs\py36\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
104 doc = preprocessor(doc)
105 if tokenizer is not None:
--> 106 doc = tokenizer(doc)
107 if ngrams is not None:
108 if stop_words is not None:
TypeError: 'str' object is not callable
I'm not sure what's causing this error or how to get rid of it. I'm pretty sure the count vectorizer produces a sparse matrix, not a string. One thing I've considered: the linked code uses the spacy tokenizer as vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)), but when I ran the program it said spacy_tokenizer was undefined, so I used vectorizer = CountVectorizer(tokenizer = 'spacy', ngram_range=(1,1)) instead. If I remove this I don't know how to use the spacy tokenizer, and either way I'm not certain this was the cause of the problem. Please help me out!
The error comes at this line:
doc = tokenizer(doc)
Since it says 'str' is not callable and the only thing being called here is the tokenizer object, it looks like your tokenizer is a string for some reason.
Based on the code you linked it looks like the spacy_tokenizer object is being configured incorrectly. But that variable isn't defined anywhere in the code despite being passed as an option, so the code you linked to looks like it can't possibly run.
It would help if you could make a minimal example that you could actually paste in the question here.
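For reference, a minimal sketch of the kind of setup the linked code seems to intend: tokenizer must be a callable that turns a document into a list of token strings, not the string 'spacy' (this assumes the en_core_web_sm model is installed):

import spacy
from sklearn.feature_extraction.text import CountVectorizer

nlp = spacy.load("en_core_web_sm")  # assumes this spaCy model is installed

def spacy_tokenizer(text):
    # CountVectorizer calls this on every document; return plain token strings
    return [token.text for token in nlp(text)]

vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
X = vectorizer.fit_transform(["This product was great", "terrible battery life"])
print(X.shape)  # a sparse document-term matrix, not a string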

ArgumentError: In `load': marshal data too short

I want to use multiple processes. I have to send the data that was bubble-sorted in different child processes back to the parent process and then merge it. This is part of my code:
rd1, wt1 = IO.pipe # reader & writer

pid1 = fork {
  rd1.close
  numbers = Marshal.load(Marshal.dump(copylist[0, p]))
  bubble_sort(numbers)
  sList[0] = numbers.clone
  wt1.write Marshal.dump(sList[0])
  Process.exit!(true)
}

Process.waitpid(pid1)
Process.waitpid(pid2)

wt1.close
wt2.close

pid5 = fork {
  rd5.close
  a = Marshal.load(rd1.gets)
  b = Marshal.load(rd2.gets)
  mList[0] = merge(a, b).clone
  wt5.write Marshal.dump(mList[0])
  Process.exit!(true)
}
There are pid1...pid7, rd1...rd7 and wt1...wt7. pid1...pid4 bubble-sort four parts of the data. pid5 and pid6 merge the data from pid1/pid2 and pid3/pid4, respectively. Finally, pid7 merges the data from pid5 and pid6.
When data size is small, it succeeds, but when I input larger data (10000):
Data example : 121 45 73 89 11 452 515 32 1 99 4 88 41 53 159 482 2013 2 ...
then errors occur: in `load': marshal data too short (ArgumentError), and another kind of error: in `load': instance of IO needed (TypeError). The first error is raised at a = ... in pid5 and b = ... in pid6. The other kind of error is raised at b = ... in pid7. Is my data too big for this method?
Marshal.load and Marshal.dump work with binary data. The problem with the short reads is here:
a = Marshal.load(rd1.gets)
b = Marshal.load(rd2.gets)
#gets reads up to a newline (or end of file) and then stops. The trouble is that newline bytes may well appear inside the binary data created by Marshal.dump, so gets hands Marshal.load only a truncated dump.
Change gets to read in both lines, so that each child's entire marshaled output is consumed.

Looking for a way to speed up the write to file portion of my Python code

I have a simple script that reads in a ~2 GB data file, extracts the columns of data that I need, and then writes that data as columns to another file for later processing. I ran the code last night and it took close to nine hours to complete. I ran the two sections separately and determined that the portion writing the data to the new file is the problem. Can anyone point out why it is so slow the way I have written it, and suggest a better method?
Sample of the data being read in:
26980300000000 26980300000000 39 13456502685696 1543 0
26980300000001 26980300000000 38 13282082553856 1523 0.01
26980300000002 26980300000000 37 13465223692288 1544 0.03
26980300000003 26980300000000 36 13290803560448 1524 0.05
26980300000004 26980300000000 35 9514610851840 1091 0.06
26980300000005 26980300000000 34 9575657897984 1098 0.08
26980300000006 26980300000000 33 8494254129152 974 0.1
26980300000007 26980300000000 32 8520417148928 977 0.12
26980300000008 26980300000000 31 8302391459840 952 0.14
26980300000009 26980300000000 30 8232623931392 944 0.16
Code
from itertools import islice

F = r'C:\Users\mass_red.csv'

def filesave(TID, M, R):
    X = str(TID)
    Y = str(M)
    Z = str(R)
    w = open(r'C:\Users\Outfiles\acc1_out3.txt', 'a')
    w.write(X)
    w.write('\t')
    w.write(Y)
    w.write('\t')
    w.write(Z)
    w.write('\n')
    w.close()
    return()

N = 47000000
f = open(F)
f.readline()
nlines = islice(f, N)
for line in nlines:
    if line != '':
        line = line.strip()
        line = line.replace(',', ' ')
        columns = line.split()
        tid = int(columns[1])
        m = float(columns[3])
        r = float(columns[5])
        filesave(tid, m, r)
You open and close the file for each line. Open it once at the beginning.
In modern Python, most file use should be done with with statements: the open is visibly done once in the header, and the close is automatic. Here is a general template for line processing:
inp = r'C:\Users\mass_red.csv'
out = r'C:\Users\Outfiles\acc1_out3.txt'
with open(inp) as fi, open(out, 'a') as fo:
    for line in fi:
        ...
        if keep:
            ...
            fo.write(whatever)
Here's a simplified but complete version of your code:
#!/usr/bin/env python
from __future__ import print_function
from itertools import islice

nlines_limit = 47000000

with open(r'C:\Users\mass_red.csv') as input_file, \
     open(r'C:\Users\Outfiles\acc1_out3.txt', 'w') as output_file:
    next(input_file)  # skip line
    for line in islice(input_file, nlines_limit):
        columns = line.split()
        try:
            tid = int(columns[1])
            m = float(columns[3])
            r = float(columns[5])
        except (ValueError, IndexError):
            pass  # skip invalid lines
        else:
            print(tid, m, r, sep='\t', file=output_file)
I don't see commas in your input; so I've removed line.replace(',', ' ') from the code.
