sparse matrix use in pycaret for nlp - matrix

First of all, thank you for allowing me to use this wonderful library
I am doing Korean NLP now
So I pre-processed the Korean text and vectorized it with a TfidfVectorizer.
I tried to pass the result to setup(), but I get an error.
Can't I use a sparse matrix with PyCaret?
If that is not possible, is there any other way to do Korean NLP?
# Raw texts and labels for the train / test splits.
X_train = train_data.Text.tolist()
Y_train =train_data['Label'].values
X_test = test_data.Text.tolist()
Y_test =test_data['Label'].values
# Learn word statistics from the training texts with soynlp.
from soynlp.word import WordExtractor
word_extractor = WordExtractor(min_frequency=100,
min_cohesion_forward=0.05,
min_right_branching_entropy=0.0
)
word_extractor.train(X_train) # list of str or like
words = word_extractor.extract()
scores = word_extractor.word_scores()
import math
# Per-word tokenizer score: forward cohesion multiplied by
# exp(right branching entropy).
score_dict = {key: scores[key].cohesion_forward *
math.exp(scores[key].right_branching_entropy)
for key in scores}
from soynlp.tokenizer import LTokenizer
# NOTE(review): cohesion_score is built here but not used in the visible lines.
cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=score_dict)
import os
from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
# if not os.path.isfile('soy_train.npz'):
# TF-IDF over unigrams and bigrams, tokenized by the soynlp LTokenizer;
# the vocabulary is fitted on the training texts only.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
min_df=3,
tokenizer=tokenizer.tokenize,
token_pattern=None)
tfidf.fit(X_train)
X_train_soy = tfidf.transform(X_train)
X_test_soy = tfidf.transform(X_test)
# Persist both transformed matrices in scipy's .npz format.
save_npz('soy_train.npz', X_train_soy)
save_npz('soy_test.npz', X_test_soy)
# Bare expression: its value is only shown in an interactive session.
type(X_train_soy[0])
import numpy as np
# NOTE(review): `pd` is not imported anywhere in the visible lines.
# NOTE(review): X_train_soy is passed to save_npz above, so it is presumably a
# scipy sparse matrix; handing it straight to pd.DataFrame likely does not
# yield one numeric column per feature, which would explain the TypeError
# raised by setup(). pd.DataFrame.sparse.from_spmatrix is the documented
# converter -- TODO confirm that PyCaret accepts the result.
train = pd.DataFrame(X_train_soy)
y_train = np.array(Y_train,dtype=float)
train['Label'] = y_train
from pycaret.classification import *
import numpy as np
# Hand the assembled frame to PyCaret with an 80/20 train split.
exp1 = setup(train,train_size=0.8, target = 'Label',use_gpu = True)
TypeError: Cannot compare types 'ndarray(dtype=object)' and 'float'

Related

Forecasting validation loss fluctuation

I have a question for those who have some experience with time series forecasting.
I have been experimenting in this field for a few weeks, and I was trying to forecast some time series with both ARIMA and LSTM models to compare the results.
Basically, I plotted this graph (Figure 1), which has 4 plots:
Top left : ARIMA training data points and fitted model points.
Top right : ARIMA test and forecast points.
Bottom left : LSTM training data and fitted data (I could not really find fitted points for the LSTM, so I just forecasted the training data, but you can ignore that part).
Bottom right : Test and forecast data for the LSTM model.
This graph was acceptable, and I also computed the RMSE and MSE; the LSTM gave a lower error, which agrees with most of the literature online that states the superiority of LSTM over ARIMA models.
However, after I plotted the loss and validation loss of the LSTM model to get more insight, I noticed that the validation loss follows a weird fluctuating pattern (Figure 2).
I can explain this as follows: the time series has a lot of outliers or abnormal behaviour, so splitting it into train/validation/test means the validation set cannot really be a good indicator of how well the model can learn.
But since research papers never show this graph or explain this problem, I don't have a solid argument to defend this idea.
What do you think?
Thank you in advance
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_percentage_error,mean_absolute_percentage_error
from statsmodels.tsa.seasonal import STL
import numpy as np
from pandas import Series, DataFrame
from scipy import stats
from statsmodels.tsa.stattools import adfuller
import statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas.plotting import register_matplotlib_converters
import pmdarima as pm
register_matplotlib_converters()
import warnings
import time
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from numpy import array
import keras_tuner as kt
import tensorflow as tf
print(tf.__version__)
from numpy import array
from tensorflow import keras
import keras_tuner as kt
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Bidirectional
from tensorflow.keras import initializers
import random as rn
np.random.seed(123)
rn.seed(123)
tf.random.set_seed(123)
tf.keras.utils.set_random_seed(123)
keras.utils.set_random_seed(123)
warnings.filterwarnings('ignore')
# Load the series and plot its seasonal decomposition (period of 30 steps).
df3 = pd.read_csv('favorita_train.csv')
series_length = len(df3)
print("TS lenbgth : "+str(series_length))
results = seasonal_decompose(df3['unit_sales'],period=30)
results.plot();

# Chronological split: first 60% train, next 20% validation, last 20% test.
# train_all is train + validation together (the first 80%).
train_end = int(series_length*0.6)
val_end = int(series_length*0.8)
train_all = df3.iloc[:val_end]
train = df3.iloc[:train_end]
val = df3.iloc[train_end:val_end]
test = df3.iloc[val_end:]

# The scaler is fitted on train_all and then applied to every subset.
scaler = MinMaxScaler()
scaler.fit(train_all)
scaled_all, scaled_train, scaled_train_all, scaled_val, scaled_test = (
    scaler.transform(part) for part in (df3, train, train_all, val, test)
)

# Sliding windows of n_input consecutive steps, one sample per batch,
# where each window's target is taken from the same scaled array.
n_features = 1
n_input = 5
generator_options = dict(length=n_input, batch_size=1, shuffle=True)
train_generator_all = TimeseriesGenerator(scaled_train_all, scaled_train_all, **generator_options)
train_generator = TimeseriesGenerator(scaled_train, scaled_train, **generator_options)
val_generator = TimeseriesGenerator(scaled_val, scaled_val, **generator_options)

# Descriptive statistics of the scaled series.
adfPValue = adfuller(scaled_all)[1]
nonzero_count = (scaled_all != 0).sum()
adi = len(scaled_all)/nonzero_count
sd = scaled_all.std()
mean = scaled_all.mean()
cv2 = np.square(sd/mean)
for label, value in (
    ("CV2 (describe magnitude of demande variability <0.5 is good) :", cv2),
    ("SD (-2,2 is good | mean data variance is low) :", sd),
    ("ADI (1.3 or smaller means smooth ts) :", adi),
    ("Stationarity test (stationary if <0.05) :", adfPValue),
):
    print(label+str(value))
def model_builder(hp):
    """Build and compile a stacked bidirectional LSTM model for the tuner.

    Args:
        hp: hyperparameter container supplied by the tuner. Three values are
            registered on it: 'units' (1-50), 'layers' (1-3) and
            'learning_rate' (one of 1e-2, 1e-3, 1e-4).

    Returns:
        The compiled model: `layers` bidirectional LSTM layers of `units`
        units each, followed by a single-unit Dense output, trained with
        Adam on an MSE loss.
    """
    model = keras.Sequential()
    hp_units = hp.Int('units', min_value=1, max_value=50, step=1)
    hp_layers = hp.Int('layers', min_value=1, max_value=3, step=1)

    last_index = hp_layers - 1
    for depth in range(hp_layers):
        # Every layer except the last hands its full sequence to the next one.
        lstm_options = {'return_sequences': True} if depth < last_index else {}
        # Only the first layer declares the input window shape.
        wrapper_options = {'input_shape': (n_input, n_features)} if depth == 0 else {}
        recurrent = LSTM(hp_units, activation='relu', **lstm_options)
        model.add(Bidirectional(recurrent, **wrapper_options))

    model.add(Dense(1))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    optimizer = keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])
    return model
# Hyperband search over model_builder, scored on validation loss.
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=300,
                     factor=3,
                     directory='499',
                     project_name='949',
                     seed=123)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
# The generators already yield batches, so batch_size is not passed
# (Keras documents that it must not be specified for generator input).
tuner.search(train_generator, epochs=300, validation_data=val_generator,
             shuffle=True, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Report only the hyperparameters that model_builder actually registers;
# asking for any other name (the original requested 'window') fails.
for hp_name in ('units', 'layers', 'learning_rate'):
    print(best_hps.get(hp_name))

# Rebuild the best configuration and retrain it on the same windowed
# generators used during the search (the original referenced img_train /
# label_train, which are not defined anywhere in this script).
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_generator, epochs=50, validation_data=val_generator)

# The model is a regressor trained on MSE, so the best epoch is the one with
# the lowest validation loss, matching the tuner objective.
val_loss_per_epoch = history.history['val_loss']
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

How to keep same sequence size of all batches in Bert hugging face training

I'm using this code to train a Hugging Face BERT model, but I noticed that different batches have different sequence lengths during training. I want to keep the same sequence length for all batches. How can I do that? And how does Hugging Face handle different sequence lengths in different batches?
# Tokenizer
# NOTE(review): the variable is named "cased" and do_lower_case=False is set,
# but the checkpoint loaded is "bert-base-uncased" -- confirm which is intended.
from transformers import BertTokenizer
bert_cased_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)

# Define model: a BertForPreTraining model built from a default BertConfig
# (no pretrained weights are loaded here).
from transformers import BertConfig, BertForPreTraining
config = BertConfig()
model = BertForPreTraining(config)

# Next sentence prediction: dataset built from a text file with block_size=256.
from transformers import TextDatasetForNextSentencePrediction
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=bert_cased_tokenizer,
    file_path="/path/to/your/dataset",
    block_size=256)

# Masked language modelling: the collator is configured with a 15% mask probability.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_cased_tokenizer,
    mlm=True,
    mlm_probability=0.15)

# Train
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    # The comma after output_dir was missing in the original (SyntaxError).
    output_dir="/path/to/output/dir/for/training/arguments",
    overwrite_output_dir=True,
    num_train_epochs=2,
    # NOTE(review): newer transformers releases name this argument
    # per_device_train_batch_size -- confirm against the installed version.
    per_gpu_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,)
trainer.train()
trainer.save_model("path/to/your/model")

"name 'pygeos' is not defined"

When doing df = gpd.GeoDataFrame(df1, crs = 'EPSG:4326', geometry = geopandas.points_from_xy(df1.longitude,df1.latitude)) I get "name 'pygeos' is not defined", yet I have installed pygeos in the directory where I dev and
python3.9/site-packages/geopandas/_vectorized.py in points_from_xy(x, y, z)
247
248 if compat.USE_PYGEOS:
--> 249 return pygeos.points(x, y, z)
250 else:
251 out = _points_from_xy(x, y, z)
and `import pygeos` is in the script. Is there a specific way to install pygeos properly in order to avoid this error? Thanks
# Assign each location point to the department polygon that contains it,
# using geopandas with pygeos enabled.
# The author reports that the order of the statements below matters, so the
# imports are intentionally left interleaved with the code.
# NOTE(review): this binds a plain Python variable; it does not by itself set
# an environment variable -- confirm whether os.environ["USE_PYGEOS"] was intended.
USE_PYGEOS=1
import pyproj
import shapely
import pandas as pd
pd.options.display.max_rows = 100
import geopandas as gpd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
# Diagnostics: show the installed versions and the current use_pygeos option.
gpd.show_versions()
print(gpd.options.use_pygeos)
location_df = pd.read_csv("location_01-03_01-04.csv", sep = ";")
import rtree
import pygeos
# Switch the pygeos option on explicitly, after pygeos has been imported.
gpd.options.use_pygeos = True
# Points are built as (longitude, latitude).
# Build a GeoDataFrame whose geometry column holds one point per location row.
location_geo = gpd.GeoDataFrame(location_df, crs = 'EPSG:4326', geometry = gpd.points_from_xy(location_df.longitude, location_df.latitude))
# NOTE(review): sep=";" is passed when reading a .geojson file -- confirm it is needed.
departments_df = gpd.read_file("departements.geojson", sep = ";")
print(departments_df)
import time
start = time.time()
print("hello")
import geopandas
import rtree
# Spatial join of the location points with the department polygons.
# Per the original author: returns the department of each position; NaN
# values are probably positions in another country.
dept_points = geopandas.sjoin(location_geo, departments_df)
end = time.time()
# Elapsed wall-clock time of the join, in seconds.
print(end-start, ' s')
print(dept_points)
Somehow this did it for me.
It was about setting the constant and importing packages in a specific order.

Can't get train and test sets

I applied k-fold cross validation to split data into train and test sets.
But when I want to get train and test sets I have these errors:
AttributeError: 'numpy.ndarray' object has no attribute 'iloc'
Thanks for your help.
# NOTE(review): .values returns a NumPy array, so y has no .iloc attribute;
# the y.iloc[...] lookup further down is what raises the AttributeError.
y = df_dummies['Churn'].values
X = df_dummies.drop(columns = ['Churn'])
from sklearn.preprocessing import MinMaxScaler
# Scale every feature to the range (0, 1), then restore the column names.
features = X.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features
from sklearn.model_selection import KFold
# 5-fold split with shuffling.
kf=KFold(n_splits=5,shuffle=True)
# First pass: only print the train / test indices of each fold.
for train,test in kf.split(X):
print("%s %s" % (train,test))
# Second pass: slice X and y with the indices of each fold.
# NOTE(review): indentation was lost in this paste; the loop-body lines below
# are shown at column 0.
for train_index, test_index in kf.split(X):
print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
from sklearn.linear_model import LogisticRegression
# NOTE(review): presumably this runs after the loop and therefore fits on the
# last fold only -- confirm the intended indentation.
CLF = LogisticRegression().fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(CLF.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(CLF.score(X_test, y_test)))
NameError: name 'y_train' is not defined
The issue is that df_dummies['Churn'].values returns a NumPy array, not a pandas object, so you are trying to access an attribute that does not exist on an array. The iloc indexer is provided by pandas objects such as pandas.DataFrame.
Use y = df_dummies['Churn'] instead.
Reference: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
PS: I don't know how these types of questions can be migrated to a sister site. Perhaps someone who knows how could migrate this to Cross Validated, please.

Scikit learn algorithms performing extremely poorly

I'm new to scikit-learn and I'm banging my head against the wall. I've used both real-world and test data, and the scikit-learn algorithms are not performing above chance level in predicting anything. I've tried kNN, decision trees, SVC and naive Bayes.
Basically, I made a test data set consisting of a column of 0s and 1s, with all the 0s having a feature value between 0 and .5 and all the 1s having a feature value between .5 and 1. This should be extremely easy and give near 100% accuracy. However, none of the algorithms are performing above chance level. Accuracies range from 45 to 55%. I've already tried tweaking a whole bunch of parameters for every algorithm, but nothing helps. I think something is fundamentally wrong with my implementation.
Please help me out. Here's my code:
# Python 2 script (print statements): fits several scikit-learn classifiers on
# a single feature column and prints the accuracy of each.
# NOTE(review): newer scikit-learn releases expose train_test_split from
# sklearn.model_selection -- confirm against the installed version.
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import sklearn
import pandas
import numpy as np
df=pandas.read_excel('Test.xlsx')
# Make data into np arrays
# Column 1 holds the labels and column 2 the feature; both are cast to float.
# The reshape calls hard-code 399 rows.
y = np.array(df[1])
y=y.astype(float)
y=y.reshape(399)
x = np.array(df[2])
x=x.astype(float)
x=x.reshape(399, 1)
# Creating training and test data
# NOTE(review): labels and features are split by two separate calls, so the
# resulting rows are not guaranteed to correspond to each other.
labels_train, labels_test = train_test_split(y)
features_train, features_test = train_test_split(x)
#####################################################################
# PERCEPTRON
#####################################################################
from sklearn import linear_model
perceptron=linear_model.Perceptron()
perceptron.fit(features_train, labels_train)
perc_pred=perceptron.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, perc_pred, normalize=True, sample_weight=None)
print 'perceptron'
#####################################################################
# KNN classifier
#####################################################################
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(features_train, labels_train)
knn_pred = knn.predict(features_test)
# Accuracy
print sklearn.metrics.accuracy_score(labels_test, knn_pred, normalize=True, sample_weight=None)
print 'knn'
#####################################################################
## SVC
#####################################################################
from sklearn.svm import SVC
from sklearn import svm
# The second assignment replaces the linear-kernel instance with a
# default-constructed SVC before fitting.
svm2 = SVC(kernel="linear")
svm2 = svm.SVC()
svm2.fit(features_train, labels_train)
# NOTE(review): bare SVC(...) expression whose result is discarded -- looks
# like pasted interpreter output rather than intended code.
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
gamma=1.0, kernel='linear', max_iter=-1, probability=False,
random_state=None,
shrinking=True, tol=0.001, verbose=False)
svc_pred = svm2.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, svc_pred, normalize=True,
sample_weight=None)
#####################################################################
# Decision tree
#####################################################################
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
tree_pred=clf.predict(features_test)
# Accuracy
print sklearn.metrics.accuracy_score(labels_test, tree_pred, normalize=True,
sample_weight=None)
print 'tree'
#####################################################################
# Naive bayes
#####################################################################
import sklearn
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
# NOTE(review): neither `time` nor `t0` is defined in the visible lines.
print "training time:", round(time()-t0, 3), "s"
# Bare expression whose result is discarded.
GaussianNB()
bayes_pred = clf.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, bayes_pred,
normalize=True, sample_weight=None)
You seem to use train_test_split the wrong way.
labels_train, labels_test = train_test_split(y) #WRONG
features_train, features_test = train_test_split(x) #WRONG
The splits of your labels and your data are not necessarily the same, because each call shuffles independently. One easy way to split your data manually:
# Draw one random boolean mask and apply it to both arrays, so that every
# sample stays paired with its own label.
randomvec = np.random.rand(len(data)) > 0.5
held_out = ~randomvec
train_data, train_label = data[randomvec], labels[randomvec]
test_data, test_label = data[held_out], labels[held_out]
or to use the scikit method properly:
# Features and labels are passed to a single call so that both are split
# together; half of the data is held out and the random seed is fixed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

Resources