Can't get train and test sets - cross-validation

I applied k-fold cross-validation to split my data into train and test sets, but when I try to retrieve the train and test sets I get this error:
AttributeError: 'numpy.ndarray' object has no attribute 'iloc'
Thanks for your help.
import pandas as pd

y = df_dummies['Churn'].values
X = df_dummies.drop(columns=['Churn'])
from sklearn.preprocessing import MinMaxScaler
features = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True)
for train, test in kf.split(X):
    print("%s %s" % (train, test))
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
from sklearn.linear_model import LogisticRegression
CLF = LogisticRegression().fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
      .format(CLF.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
      .format(CLF.score(X_test, y_test)))
NameError: name 'y_train' is not defined

The issue is that df_dummies['Churn'].values returns a NumPy array, not a DataFrame or Series, so you are trying to use an attribute that arrays don't have: the iloc indexer is defined on pandas DataFrames and Series.
Use y = df_dummies['Churn'] instead.
Reference: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
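A minimal sketch of the corrected loop (assuming df_dummies is the frame from the question): keep y as a pandas Series so that both X and y support iloc:

y = df_dummies['Churn']  # a Series, so y.iloc works
# (alternatively keep y = df_dummies['Churn'].values and index it as y[train_index])
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]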

Related

ValueError with Sklearn LinearRegression model.predict()

I am trying to build a simple linear regression model to estimate the sales price of an item for borrowers we don't have contract information on. I'm using data from borrowers we do have price and payment info on, and fitting sklearn's LinearRegression model, but I get an error when I call the predict() method on the model. The exact error:
ValueError: X has 844 features, but LinearRegression is expecting 2529 features as input.
Here is my code; I feel like it's fairly straightforward. build_customer_df is a function that returns the dataframe with some column formatting, nothing fancy:
from pathlib import Path
from sklearn import linear_model
from sklearn.model_selection import train_test_split

fp = Path('master_borrower.xlsx')
df = build_customer_df(fp)
df = df[['payment', 'trailer_sales_price']]
df = df[df['payment'] != 0]
X = df['payment'].values
y = df['trailer_sales_price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
X_test = X_test.reshape(1, -1)
X_train = X_train.reshape(1, -1)
y_train = y_train.reshape(1, -1)
y_test = y_test.reshape(1, -1)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
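A likely fix, assuming payment really is a single feature (this is my reading, not part of the original post): reshape(1, -1) turns each split into one sample with as many features as there are rows, which is why the train and test feature counts disagree. A single-feature design matrix needs shape (n_samples, 1), and y can stay 1-D:

X_train = X_train.reshape(-1, 1)  # (n_samples, 1): one column per feature
X_test = X_test.reshape(-1, 1)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)       # y_train left as a 1-D array
predictions = model.predict(X_test)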

Uploading models with custom forward functions to the huggingface model hub?

Is it possible to upload a model with a custom forward function to the Hugging Face model hub?
I can see how to do it if your model has a standard form, but I can't see how to customise the forward function and still upload it.
Yes, absolutely. You can create your own model with any number of added layers/customisations and upload it to the model hub. Here is a demo that walks through the entire process.
Uploading custom model to 🤗 model hub
import tqdm
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, BertConfig
from transformers import AdamW
from transformers import get_scheduler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# setting device to `cuda` if gpu exists
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# initialising the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
def tokenize_function(examples):
    '''Function for tokenizing raw texts'''
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
# downloading IMDB dataset from 🤗 `datasets`
raw_datasets = load_dataset("imdb")
# Running tokenizing function on the raw texts
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# for simplicity I have taken only the train split
tokenized_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# Now let's create the torch Dataset class
class IMDBClassificationDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        d = self.dataset[idx]
        ids = torch.tensor(d['input_ids'])
        mask = torch.tensor(d['attention_mask'])
        label = torch.tensor(d['label'])
        return ids, mask, label
# Preparing the dataset and the Dataloader
dataset = IMDBClassificationDataset(tokenized_datasets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=8)
# Now let's create a custom Bert model
class CustomBert(transformers.PreTrainedModel):
    '''Custom model class
    ------------------
    The trick is to inherit not from `nn.Module` but from `transformers.PreTrainedModel`,
    and to pass the model config during initialisation'''
    def __init__(self, bert):
        super(CustomBert, self).__init__(config=BertConfig.from_pretrained('google/bert_uncased_L-2_H-128_A-2'))
        self.bert = bert
        self.l1 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent_id, mask):
        '''For simplicity only one linear layer is added; you can create any type of network you want'''
        bert_out = self.bert(sent_id, attention_mask=mask)
        o = bert_out.last_hidden_state[:, 0, :]  # representation of the [CLS] token
        o = self.do(o)
        o = self.relu(o)
        o = self.l1(o)
        o = self.sigmoid(o)
        return o
# initialising model, loss and optimizer
model = CustomBert(bert)
model.to(device)
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
# setting epochs, num_training_steps and the lr_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
# training loop
model.train()
for epoch in tqdm.tqdm(range(num_epochs)):
    for batch in train_dataloader:
        ids, masks, labels = batch
        labels = labels.type(torch.float32)
        o = model(ids.to(device), masks.to(device))
        loss = criterion(torch.squeeze(o), labels.to(device))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
# save the tokenizer and the model in `./test-model/` directory
tokenizer.save_pretrained("./test-model/")
model.save_pretrained("./test-model/", push_to_hub=False)
Now create a new model repository on the 🤗 hub and push all the contents of the test-model directory to it.
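A minimal sketch of the upload step (assuming you are logged in; the repo name test-model matches the demo, but you would use your own):

# one-time login: run `huggingface-cli login` in a terminal, or in a notebook:
# from huggingface_hub import notebook_login; notebook_login()
model.push_to_hub("test-model")      # creates/updates the repo under your account
tokenizer.push_to_hub("test-model")  # upload the tokenizer files too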
To check that the uploaded model behaves correctly, you can try 🤗's pipeline and see if anything is wrong.
from transformers import pipeline
# since this is classification, specify `text-classification` as the task
classifier = pipeline('text-classification', model='tanmoyio/test-model')
classifier("This movie was superb")
It will output something like this
[{'label': 'LABEL_0', 'score': 0.5571992993354797}]
This is a real demo; check the model here: https://huggingface.co/tanmoyio/test-model. Let me know if you have further questions.

AttributeError: 'H2OFrame' object has no attribute 'lower' when converting confusion matrix to data frame

I am trying to convert the confusion matrix to a Python 2D list so I can access its components, but I get an error when converting the confusion matrix to a data frame.
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import pandas as pd
h2o.init()
training_file = "AirlinesTrain.csv"
train = h2o.import_file(training_file)
response_col = "IsDepDelayed"
distribution = "multinomial"
project_name = "airlines"
problem_type = "binary-classification"
predictors = train.columns
gbm = H2OGradientBoostingEstimator(nfolds=3,
                                   distribution=distribution)
gbm.train(x=predictors,
          y=response_col,
          training_frame=train)
print("gbm.confusion_matrix(train).as_data_frame()")
print(gbm.confusion_matrix(train).as_data_frame())  # This raises: AttributeError: 'H2OFrame' object has no attribute 'lower'
NOTE: if I use the cars dataset, there are no errors:
cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
cars["cylinders"] = cars["cylinders"].asfactor()
#r = cars[0].runif()
#train = cars[r > .2]
#valid = cars[r <= .2]
train=cars
response_col = "cylinders"
distribution = "multinomial"
predictors = ["displacement","power","weight","acceleration","year"]
I ran into this same issue. There may be something wrong in the docs, since they mention you can pass an H2OFrame:
https://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html
However, I think if you pass train=True it will work:
print(gbm.confusion_matrix(train=True).as_data_frame())
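Since the original goal was a plain Python 2D list, here is a minimal follow-up sketch (assuming as_data_frame() returns a pandas DataFrame):

cm_df = gbm.confusion_matrix(train=True).as_data_frame()
cm_list = cm_df.values.tolist()  # nested Python lists, one inner list per row
print(cm_list[0])                # first row of the confusion matrix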

Data reshaping in sklearn (Linear regression)

Input code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('test.csv')
data.head()
data['Density'] = data['Flow [Veh/h]'] / data['Speed [km/h]']
data = data.replace(np.nan, 1)
X = data['Density']
y = data['Speed [km/h]']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)  # HERE I GOT AN ERROR
The error:
ValueError: Expected 2D array, got 1D array instead. Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
You can try changing your variable X as follows:
X = data['Density'].values.reshape((-1, 1))
I had faced the same error when my feature set had only one variable; the above change solved the issue for me.
Alternatively, use double brackets when selecting the column, so X stays a two-dimensional DataFrame:
X = data[['Density']]
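A minimal end-to-end sketch with either fix applied (column names taken from the question):

X = data[['Density']]               # 2-D: shape (n_samples, 1)
y = data['Speed [km/h]']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
lm = LinearRegression()
lm.fit(X_train, y_train)            # no reshape error now
predictions = lm.predict(X_test)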

LSTM - future value prediction error

After some research, I was able to predict future values using the LSTM code below. I have also attached the Dmd1ahr.csv file I am using, at the GitHub link:
https://github.com/ukeshchawal/hello-world/blob/master/Dmd1ahr.csv
As you can see below, 90 data points are the training set and the 91st to 100th are the future-value predictions.
However, some questions I still have are:
In order to predict these values I originally had to take more than a hundred data points (here, I have taken 500), which is not exactly my primary goal. Is there a way that, given 500 data points, it will predict the remaining 10 or 20 out-of-sample points? If yes, could you write me sample code that takes 500 data points from the attached Dmd1ahr.csv file and predicts some future values (say 501 to 520) based on those 500 points?
The predictions are way off compared to the ones in your blog (which definitely indicates a need for parameter tuning; I tried changing epochs, LSTM layers, activation, optimizer). What other parameter tuning can I do to make it more robust?
Thank you all in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas
# By tweaking the architecture it could be made more robust
np.random.seed(7)
numOfSamples = 500
lengthTrain = 90
lengthValidation = 100
look_back = 1 # Can be set higher, in my experiments it made performance worse though
transientTime = 90 # Time to "burn in" time series
series = pandas.read_csv('Dmd1ahr.csv')
def generateTrainData(series, i, look_back):
    return series[i:look_back+i+1]
trainX = np.stack([generateTrainData(series, i, look_back) for i in range(lengthTrain)])
testX = np.stack([generateTrainData(series, lengthTrain + i, look_back) for i in range(lengthValidation)])
trainX = trainX.reshape((lengthTrain,look_back+1,1))
testX = testX.reshape((lengthValidation, look_back + 1, 1))
trainY = trainX[:,1:,:]
trainX = trainX[:,:-1,:]
testY = testX[:,1:,:]
testX = testX[:,:-1,:]
############### Build Model ###############
import keras
from keras.models import Model
from keras import layers
from keras import regularizers
inputs = layers.Input(batch_shape=(1,look_back,1), name="main_input")
inputsAux = layers.Input(batch_shape=(1,look_back,1), name="aux_input")
# this layer makes the actual prediction, i.e. decides if and how much it goes up or down
x = layers.recurrent.LSTM(300, return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(200, return_sequences=True, stateful=True)(x)  # stacked: each LSTM consumes the previous layer's output
x = layers.recurrent.LSTM(100, return_sequences=True, stateful=True)(x)
x = layers.recurrent.LSTM(50, return_sequences=True, stateful=True)(x)
x = layers.wrappers.TimeDistributed(layers.Dense(1, activation="linear",
kernel_regularizer=regularizers.l2(0.005),
activity_regularizer=regularizers.l1(0.005)))(x)
# auxiliary input: the current input is fed directly to the output
# this way the prediction from the previous step is used as a "base", and the network just has to
# learn whether it goes a little up or down
auxX = layers.wrappers.TimeDistributed(layers.Dense(1,
kernel_initializer=keras.initializers.Constant(value=1),
bias_initializer='zeros',
input_shape=(1,1), activation="linear", trainable=False
))(inputsAux)
outputs = layers.add([x, auxX], name="main_output")
model = Model(inputs=[inputs, inputsAux], outputs=outputs)
model.compile(optimizer='adam',
loss='mean_squared_error',
metrics=['mean_squared_error'])
#model.summary()
#model.fit({"main_input": trainX, "aux_input": trainX[look_back-1,look_back,:]},{"main_output": trainY}, epochs=4, batch_size=1, shuffle=False)
model.fit({"main_input": trainX, "aux_input": trainX[:,look_back-1,:].reshape(lengthTrain,1,1)},{"main_output": trainY}, epochs=100, batch_size=1, shuffle=False)
############### make predictions ###############
burnedInPredictions = np.zeros(transientTime)
testPredictions = np.zeros(len(testX))
# burn the series in, here using the first transientTime samples from the test data
for i in range(transientTime):
    prediction = model.predict([np.array(testX[i, :, 0].reshape(1, look_back, 1)), np.array(testX[i, look_back - 1, 0].reshape(1, 1, 1))])
    testPredictions[i] = prediction[0, 0, 0]
burnedInPredictions[:] = testPredictions[:transientTime]
# prediction: now don't use any previous data whatsoever, the network just has to run on its own output
for i in range(transientTime, len(testX)):
    prediction = model.predict([prediction, prediction])
    testPredictions[i] = prediction[0, 0, 0]
# for plotting reasons
testPredictions[:np.size(burnedInPredictions)-1] = np.nan
############### plot results ###############
#import matplotlib.pyplot as plt
plt.plot(testX[:, 0, 0])
plt.show()
plt.plot(burnedInPredictions, label = "training")
plt.plot(testPredictions, label = "prediction")
plt.legend()
plt.show()
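Regarding the first question, a minimal sketch of how the closed-loop part above generalises to "train on 500 points, then predict 501 to 520": seed the model with the last observed value and keep feeding it its own output. This assumes the same trained stateful model and look_back = 1 as above; the variable names below are mine, not from the original post:

# seed with the most recent observation, shaped for the (1, look_back, 1) inputs
lastValue = np.array(series.values[-1], dtype=float).reshape(1, 1, 1)
futureSteps = 20
futurePredictions = np.zeros(futureSteps)
prediction = model.predict([lastValue, lastValue])
futurePredictions[0] = prediction[0, 0, 0]
for i in range(1, futureSteps):
    # feed the previous prediction back in as both the main and the aux input
    prediction = model.predict([prediction, prediction])
    futurePredictions[i] = prediction[0, 0, 0]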
