Sklearn RandomizedSearchCV suddenly stuck - performance

I tried to do a RandomizedSearchCV for a SVM model but it seems to take forever. It works fine for KNN. I found the process stuck somewhere after finishing certain tasks. The below is my code:
# SVC Parameter Tuning
svc_params = {'C': np.power(10, np.arange(-5., 1.)),
'kernel': ['rbf', 'linear', 'sigmoid', 'poly'],
'degree': np.arange(3, 21),
'coef0' : np.linspace(0, 1, 100),
'shrinking': [True, False],
'class_weight' : ['balanced', None]}
svc = RandomizedSearchCV(SVC(),
svc_params,
cv = 5,
scoring = 'roc_auc',
n_jobs = 128,
n_iter = 100,
verbose = 2)
after some results, the process stuck.
[CV] kernel=poly, C=0.0001, degree=20, coef0=0.848484848485,
shrinking=True, class_weight=balanced, total= 11.1s
[CV] kernel=poly, C=0.0001, degree=20, coef0=0.848484848485,
shrinking=True, class_weight=balanced, total= 11.0s
[CV] kernel=poly, C=0.0001, degree=20, coef0=0.848484848485,
shrinking=True, class_weight=balanced, total= 11.5s
I am really out of clue. Thanks for your help!

Related

Hyperparameter tuning using GridSearch()

I am trying to do hyperparametric tuning using the following code :
def df_to_new(df,window_size):
df_as_np = df.to_numpy()
X = []
y = []
for i in range(len(df_as_np)-window_size):
row = [[a] for a in df_as_np[i:i+window_size]]
X.append(row)
label = df_as_np[i+window_size]
y.append(label)
return np.array(X),np.array(y)
Xgrid,Xnotuse,ygrid,ynouse = train_test_split(Xtrain,ytrain,test_size=0.8)
input_dim = Xgrid.shape[1]
def define_model(learning_rate=0.01):
model = Sequential()
model.add(LSTM(128,input_dim = input_dim))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))
optimizer = Adam(lr=learning_rate)
# compile the model
model.compile(loss=MeanSquaredError(),
optimizer=optimizer,
metrics=[RootMeanSquaredError()])
return model
model = KerasClassifier(build_fn=define_model,
epochs=epochs,
batch_size = batch_size,
verbose=1)
learning_rate = [0.0001, 0.001, 0.01, 0.1]
epochs = [10, 30, 60, 150]
batch_size = [5, 15, 25, 50]
param_grid = dict(learning_rate=learning_rate, epochs=epochs,batch_size=batch_size)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(Xgrid, ygrid)
But I am getting the following error:
ValueError: Invalid parameter learning_rate for estimator KerasClassifier.
This issue can likely be resolved by setting this parameter in the KerasClassifier constructor:
KerasClassifier(learning_rate=0.0001)
Check the list of available parameters with estimator.get_params().keys()
What I'm trying to do is to get the best epochs, batch_size, and learning_rate combination for my LSTM model. Any help?

Fitting Lightgbm distributed with lgb.train hangs

I'm trying to learn how to use lightgbm distributed.
I wrote a simple hello world kind of code where I use iris dataset with 150 rows, split it into train (100 rows) and test(50 rows). Then training the train test set are further split into two parts. Each part is fed into two machines with appropriate rank.
The problem I see is that lgb.train hangs.
Here is the code:
import argparse
import logging
import lightgbm as lgb
import pandas as pd
from sklearn import datasets
import socket
print('lightgbm', lgb.__version__)
HOST = socket.gethostname()
ip_address = socket.gethostbyname(HOST)
print("IP=", ip_address)
# looks like lightgbm operates only with ip addresses
IPS = ['10.121.22.166', '10.121.22.83']
assert ip_address in IPS
logger = logging.getLogger(__name__)
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 5)
def read_train_data(rank):
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
partition = rank
assert partition < 2
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]
separate_train = 60
separate_test = 30
if partition == 0:
train_df = train_df.iloc[:separate_train]
test_df = test_df.iloc[:separate_test]
else:
train_df = train_df.iloc[separate_train:]
test_df = test_df.iloc[separate_test:]
def get_lgb_dataset(df):
target_column = df.columns[-1]
columns = df.columns[:-1]
assert target_column not in columns
print('Target column', target_column)
x = df[columns]
y = df[target_column]
print(x)
ds = lgb.Dataset(free_raw_data=False, data=x, label=y, params={
"enable_bundle": False
})
ds.construct()
return ds
dtrain = get_lgb_dataset(train_df)
dtest = get_lgb_dataset(test_df)
return dtrain, dtest
def train(args):
port0 = 56456
rank = IPS.index(ip_address)
print("Rank=", rank, HOST)
print("RR", rank)
dtrain, dtest = read_train_data(rank=rank)
params = {'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 1.0,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 2,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 1,
'num_leaves': 31,
'objective': 'regression',
'metric': 'rmse',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': False,
'subsample': 1.0,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'tree_learner': 'data_parallel',
'num_threads': 48,
'machines': ','.join([f'{machine}:{port0}' for i, machine in enumerate(IPS)]),
'local_listen_port': port0,
'time_out': 120,
'num_machines': len(IPS)
}
print(params)
logging.info("starting to train lgb at node with rank %d", rank)
evals_result = {}
if args.scikit == 1:
print("Using scikit learn")
bst = lgb.sklearn.LGBMRegressor(**params)
bst.fit(
dtrain.data,
dtrain.label,
eval_set=[(dtest.data, dtest.label)],
)
else:
print("Using regular LGB")
bst = lgb.train(params,
dtrain,
valid_sets=[dtest],
evals_result=evals_result)
print(evals_result)
logging.info("finish xgboost training at node with rank %d", rank)
return bst
def main(args):
logging.info("starting the train job")
model = train(args)
pd.set_option('display.max_rows', 500)
print("OUT", model.__class__)
try:
print(model.trees_to_dataframe())
except:
print(model.booster_.trees_to_dataframe())
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--scikit',
help='scikit',
default=0,
type=int,
)
main(parser.parse_args())
I can run it with the scikit fit interface by running: python simple_distributed_lgb_test.py --scikit 1
On the two machines. It produces a reasonable result.
However, when I use -- scikit 0 (which uses lgb.train), then fitting just hangs on both nodes. Last messages before it hangs:
[LightGBM] [Info] Total Bins 22
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Start training from score 0.873750
Is that a bug or an expected behavior? dask.py in lightgbm does use scikit learn fit interface.
I use an overnight master version 3.2.1.99. 5b7a6f3e7150aeb704d1dd2b852d246af3e913a3 tag to be exact from Jul 12.
UPDATE 1
I'm trying to dig into the code. So far I see few things:
scikit.train interface appears to have an extra syncronization step before fitting first tree. lgb.train doesn't have it. Dunno yet where it comes from. (I see some Network::Allreduce operations)
It appears that scikit.train has workers syncronized - each worker knows the correct sizes of the blocks to send and receive during reducescatter operations. For example one the first allreduce worker1 sends 208 blocks and receives 368 blocks of data (in Linkers::SendRecv), while worker2 is reversed - sends 368 and receives 208. So allreduce completes fine. ()
On the contrary, lgb.train has workers not syncronized - each worker has numbers for send and receive blocks during reducescatter at the first DataParallelTreeLearner::FindBestSplits encounter. But they don't match. Worker1 sends 208 abd wants to receive 400. Worker2 sends 192 and wants to receive 176. So, the worker that wants to receive more just hangs. The other worker eventually hangs too.
Possibly it has something to do with lgb.Dataset. That thing may need to have same bins or something. I tried to force it by forcedbins_filename parameter. But it doesn't seem to help with lgb.train.
UPDATE 2
Success. If I remove the following line from the example:
ds.construct()
Everything works. So I guess we can't use construct on Dataset when using distributed training.

Am getting error trying to predict on a single image CNN pytorch

Error message
Traceback (most recent call last):
File "pred.py", line 134, in
output = model(data)
Runtime Error: Expected 4-dimensional input for 4-dimensional weight [16, 3, 3, 3], but got 3-dimensional input of size [1, 32, 32] instead.
Prediction code
normalize = transforms.Normalize(mean=[0.4914, 0.4824, 0.4467],
std=[0.2471, 0.2435, 0.2616])
train_set = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
])
model = models.condensenet(args)
model = nn.DataParallel(model)
PATH = "results/savedir/save_models/checkpoint_001.pth.tar"
model.load_state_dict(torch.load(PATH)['state_dict'])
device = torch.device("cpu")
model.eval()
image = Image.open("horse.jpg")
input = train_set(image)
train_loader = torch.utils.data.DataLoader(
input,
batch_size=1,shuffle=True, num_workers=1)
for i, data in enumerate(train_loader):
#input_var = torch.autograd.Variable(data, volatile=True)
#input_var = input_var.view(1, 3, 32,32)
**output = model(data)
topk=(1,5)
maxk = max(topk)
_, pred = output.topk(maxk, 1, True, True)
Am getting this error when am trying to predict on a single image
Image shape/size error message
Link to saved model
Training code repository
Plz uncomment this line #input_var = input_var.view(1, 3, 32,32) so that your input dimension is 4.
I assume that your no. of input channels are 3 if its one then use input_var = input_var.view(1, 1, 32,32) if gray scale
Instead of doing the for loop and train_loader, solved this by just passing the input directly into the model. like this
input = train_set(image)
input = input.unsqueeze(0)
model.eval()
output = model(input)
More details can be found here link

How to do parallel processing in pytorch

I am working on a deep learning problem. I am solving it using pytorch. I have two GPU's which are on the same machine (16273MiB,12193MiB). I want to use both the GPU's for my training (video dataset).
I get a warning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
I also get an error:
raise TypeError('Broadcast function not implemented for CPU tensors')
TypeError: Broadcast function not implemented for CPU tensors
if __name__ == '__main__':
opt.scales = [opt.initial_scale]
for i in range(1, opt.n_scales):
opt.scales.append(opt.scales[-1] * opt.scale_step)
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.mean = get_mean(opt.norm_value)
opt.std = get_std(opt.norm_value)
print("opt",opt)
with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
json.dump(vars(opt), opt_file)
torch.manual_seed(opt.manual_seed)
model, parameters = generate_model(opt)
#print(model)
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable parameters: ", pytorch_total_params)
# Define Class weights
if opt.weighted:
print("Weighted Loss is created")
if opt.n_finetune_classes == 2:
weight = torch.tensor([1.0, 3.0])
else:
weight = torch.ones(opt.n_finetune_classes)
else:
weight = None
criterion = nn.CrossEntropyLoss()
if not opt.no_cuda:
criterion = nn.DataParallel(criterion.cuda())
if opt.no_mean_norm and not opt.std_norm:
norm_method = Normalize([0, 0, 0], [1, 1, 1])
elif not opt.std_norm:
norm_method = Normalize(opt.mean, [1, 1, 1])
else:
norm_method = Normalize(opt.mean, opt.std)
train_loader = torch.utils.data.DataLoader(
training_data,
batch_size=opt.batch_size,
shuffle=True,
num_workers=opt.n_threads,
pin_memory=True)
train_logger = Logger(
os.path.join(opt.result_path, 'train.log'),
['epoch', 'loss', 'acc', 'precision','recall','lr'])
train_batch_logger = Logger(
os.path.join(opt.result_path, 'train_batch.log'),
['epoch', 'batch', 'iter', 'loss', 'acc', 'precision', 'recall', 'lr'])
if opt.nesterov:
dampening = 0
else:
dampening = opt.dampening
optimizer = optim.SGD(
parameters,
lr=opt.learning_rate,
momentum=opt.momentum,
dampening=dampening,
weight_decay=opt.weight_decay,
nesterov=opt.nesterov)
# scheduler = lr_scheduler.ReduceLROnPlateau(
# optimizer, 'min', patience=opt.lr_patience)
if not opt.no_val:
spatial_transform = Compose([
Scale(opt.sample_size),
CenterCrop(opt.sample_size),
ToTensor(opt.norm_value), norm_method
])
print('run')
for i in range(opt.begin_epoch, opt.n_epochs + 1):
if not opt.no_train:
adjust_learning_rate(optimizer, i, opt.lr_steps)
train_epoch(i, train_loader, model, criterion, optimizer, opt,
train_logger, train_batch_logger)
I have also made changes in my train file:
model = nn.DataParallel(model(),device_ids=[0,1]).cuda()
outputs = model(inputs)
It does not seem to work properly and is giving error. Please advice, I am new to pytorch.
Thanks
As mentioned in this link, you have to do model.cuda() before passing it to nn.DataParallel.
net = nn.DataParallel(model.cuda(), device_ids=[0,1])
https://github.com/pytorch/pytorch/issues/17065

Parallel error with GridSearchCv, RandomSearchCV. Works fine with single models

I am having a similar problem on this link: Parallel error with GridSearchCV, works fine with other methods
I tried both of the solutions and neither one worked for me, as well.
When n_jobs = -1 in a grid search I get an error although n_jobs = -1 works fine for single models.
I tried updating sklearn but it did not help.
Here is the code I am trying:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf,
param_distributions = random_grid,
n_iter = 100, cv = 3, verbose = 2, random_state = 42,
n_jobs = -1)
rf_random.fit(X_train, y_train)
I am getting this error:
ModuleNotFoundError: No module named
'sklearn.externals.joblib.externals.loky.backend.popen_loky_win32'
Tried the solution on the link but got the same error:
def randomsearcher():
clf = ensemble.RandomForestClassifier()
param_grid = random_grid
grid_s= model_selection.GridSearchCV(clf, cv=5, param_grid=param_grid
,n_jobs=-1,verbose=1)
grid_s.fit(X_train,y_train)
return grid_s
if __name__ == '__main__':
randomsearcher()
No problem with this code:
knn = KNeighborsClassifier(n_neighbors=50,
weights='distance',algorithm='auto',n_jobs = -1 )
I am working on a VM that has 8 virtual processors on 2 sockets.

Resources