Related
I am trying to run RandomizedSearchCV on various classification models through a "for" loop for hyperparameter tuning. There is no issue running any of the other models, but CatBoost fails. The CatBoost issue also only arises when I use a Pipeline in the function definition.
My Code:
#Building the models:
# One untuned estimator instance per algorithm; the hyperparameters are
# searched later by RandomizedSearchCV. random_state=1 pins the stochastic
# estimators so repeated runs are reproducible.
lr = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()
dt = DecisionTreeClassifier(random_state=1)
bag = BaggingClassifier(random_state=1)
adb = AdaBoostClassifier(random_state=1)
gb = GradientBoostingClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
xgb = XGBClassifier()
cgb = CatBoostClassifier()
lgb = LGBMClassifier()
#Defining a function:
def fun_exp(model, name, x_tr, x_te, y_tr, y_te):
    """Tune ``model`` with RandomizedSearchCV inside a scale -> PCA pipeline.

    The fitted search object is pickled to a file named after the model.
    NOTE: this relies on the module-level ``params`` dict being set by the
    caller before each invocation (see the model loop below); the keys in
    ``params`` must be prefixed with ``name`` so they address the final
    pipeline step.

    Parameters:
        model: an unfitted sklearn-compatible classifier.
        name:  pipeline step name / pickle filename / params key prefix.
        x_tr, x_te, y_tr, y_te: train/test features and labels.

    Returns:
        (best_params, best_score, train_score, test_score, duration, predictions)
        where duration covers the search and scoring but not the pickling.
    """
    start = time.time()
    # The final step must be named exactly ``name`` because the search grid
    # uses keys such as 'CatBoost__learning_rate'.
    pipe = Pipeline([('scale', StandardScaler()),
                     ('pca', PCA(n_components=62)),
                     (name, model)])
    rscv = RandomizedSearchCV(pipe, params, cv=10, random_state=1)
    rscv.fit(x_tr, y_tr)
    rscv_best_params = rscv.best_params_
    rscv_best_score = rscv.best_score_
    rscv_score_train = rscv.score(x_tr, y_tr)
    rscv_score_test = rscv.score(x_te, y_te)
    rscv_pred = rscv.predict(x_te)
    end = time.time()
    # 'with' closes the file deterministically; the original
    # pickle.dump(rscv, open(name, 'wb')) leaked the handle.
    with open(name, 'wb') as fh:
        pickle.dump(rscv, fh)
    rscv_duration = end - start
    return (rscv_best_params, rscv_best_score, rscv_score_train,
            rscv_score_test, rscv_duration, rscv_pred)
Running the above function in for loop & saving the result in a dictionary:
exp_result = {}
# Fitting & Testing the model
# NOTE: identity checks ('is') replace '==' below. CatBoostClassifier
# overrides __eq__ with a trained-model comparison that raises for
# unfitted models, which is why only the CatBoost iteration failed here.
for model, name in zip([lr, knn, svm, dt, bag, adb, gb, rf, lgb, cgb, xgb],
                       ['Logistic Regression', 'KNeighbors', 'SVM', 'DecisionTree',
                        'Bagging', 'AdaBoost', 'GradientBoost', 'Random Forest',
                        'LightGBM', 'CatBoost', 'XGBoost']):
    if model is lr:
        # 'lbfgs' was misspelt 'lfbgs' in the original grid.
        params = {'Logistic Regression__solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
                  'Logistic Regression__penalty': ['elasticnet', 'l1', 'l2', 'none'],
                  'Logistic Regression__multi_class': ['auto', 'ovr', 'multinomial'],
                  'Logistic Regression__C': [0.1, 1, 10],
                  'Logistic Regression__tol': [0.00001, 0.0001, 0.001],
                  'Logistic Regression__class_weight': ['balanced', None]}
    elif model is knn:
        # 'ball_tree' carried a stray 'knn__' prefix in the original list.
        params = {'KNeighbors__n_neighbors': np.arange(5, 50, 5),
                  'KNeighbors__weights': ['uniform', 'distance'],
                  'KNeighbors__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'KNeighbors__leaf_size': np.arange(10, 51, 10),
                  'KNeighbors__metric': ['minkowski', 'euclidean', 'manhattan']}
    elif model is svm:
        params = {'SVM__gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001],
                  'SVM__C': [1000, 100, 10, 1, 0.1, 0.01, 0.001],
                  'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
                  'SVM__class_weight': ['balanced', None],
                  'SVM__decision_function_shape': ['ovo', 'ovr']}
    elif model is dt:
        # [None, np.arange(1, 11)] made the whole array one candidate value;
        # flatten it so each depth 1..10 (or None) can be sampled.
        params = {'DecisionTree__criterion': ['gini', 'entropy', 'log_loss'],
                  'DecisionTree__splitter': ['best', 'random'],
                  'DecisionTree__max_depth': [None] + list(np.arange(1, 11)),
                  'DecisionTree__max_features': np.arange(8, 21, 2),
                  'DecisionTree__random_state': [1],
                  'DecisionTree__class_weight': ['balanced', None]}
    elif model is bag:
        params = {'Bagging__n_estimators': [10, 30, 50, 100, 500],
                  'Bagging__max_features': np.arange(8, 21, 2),
                  'Bagging__random_state': [1]}
    elif model is adb:
        params = {'AdaBoost__n_estimators': [10, 30, 50, 100, 500],
                  'AdaBoost__learning_rate': [0.001, 0.01, 0.1, 1, 10],
                  'AdaBoost__algorithm': ['SAMME.R', 'SAMME'],
                  'AdaBoost__random_state': [1]}
    elif model is gb:
        params = {'GradientBoost__loss': ['log_loss', 'exponential'],
                  'GradientBoost__learning_rate': [0.001, 0.01, 0.1, 1, 10],
                  'GradientBoost__n_estimators': [10, 30, 50, 100, 500],
                  'GradientBoost__max_depth': np.arange(1, 11),
                  'GradientBoost__random_state': [1],
                  'GradientBoost__max_features': np.arange(8, 21, 2)}
    elif model is rf:
        params = {'Random Forest__n_estimators': [10, 30, 50, 100, 500],
                  'Random Forest__criterion': ['gini', 'entropy', 'log_loss'],
                  'Random Forest__max_depth': np.arange(1, 11),
                  'Random Forest__max_features': np.arange(8, 21, 2),
                  'Random Forest__random_state': [1]}
    elif model is lgb:
        params = {'LightGBM__boosting_type': ['gbdt', 'rf'],
                  'LightGBM__num_leaves': np.arange(20, 40),
                  'LightGBM__max_depth': np.arange(1, 11),
                  'LightGBM__learning_rate': [0.001, 0.01, 0.1, 1, 10],
                  'LightGBM__n_estimators': [10, 30, 50, 100, 500],
                  'LightGBM__class_weight': ['balanced', None],
                  'LightGBM__random_state': [1]}
    elif model is cgb:
        params = {'CatBoost__learning_rate': [0.001, 0.01, 0.1, 1],
                  'CatBoost__n_estimators': [100, 500],
                  'CatBoost__max_depth': np.arange(1, 11),
                  'CatBoost__random_state': [1],
                  'CatBoost__feature_border_type': ['Median', 'Uniform', 'UniformAndQuantiles',
                                                    'GreedyLogSum', 'MaxLogSum', 'MinEntropy']}
    elif model is xgb:
        le = LabelEncoder()
        y_tr = le.fit_transform(y_tr)
        # Reuse the encoder fitted on the training labels; a second
        # fit_transform could assign different codes to the test labels.
        y_te = le.transform(y_te)
        params = {'XGBoost__n_estimators': [10, 30, 50, 100, 500],
                  'XGBoost__max_depth': np.arange(1, 11),
                  'XGBoost__max_leaves': np.arange(0, 150),
                  'XGBoost__learning_rate': [0.001, 0.01, 0.1, 1, 10],
                  'XGBoost__random_state': [1]}
    exp_result[name] = fun_exp(model, name, x_tr, x_te, y_tr, y_te)
I am looking for an algorithm for my hobby task.
For example we have test data cases:
// Sample inputs: each array mixes isolated colours with runs of repeats.
var case1 = ['green', 'red', 'red', 'blue', 'green', 'green', 'green'];
var case2 = ['blue', 'blue', 'green', 'yellow', 'blue', 'orange', 'green', 'green', 'green', 'green'];
var case3 = ['purple', 'blue', 'blue', 'blue', 'red'];
Output:
// Expected outputs: only elements belonging to a run of 2+ equal
// consecutive values survive; isolated elements are dropped.
var result1 = ['red', 'red', 'green', 'green', 'green'];
var result2 = ['blue', 'blue', 'green', 'green', 'green', 'green'];
var result3 = ['blue', 'blue', 'blue'];
Can anybody tell me which algorithm I should use? The classic approach of removing duplicates is not what I am looking for.
Short version
we can use the .filter function to achieve this behavior.
.filter((item, i, array) => array[i - 1] == item || item == array[i + 1])
let case1 = ['green', 'red', 'red', 'blue', 'green', 'green', 'green'];
let case2 = ['blue', 'blue', 'green', 'yellow', 'blue', 'orange', 'green', 'green', 'green', 'green'];
let case3 = ['purple', 'blue', 'blue', 'blue', 'red'];
let result1 = case1.filter((item, i, array) => array[i - 1] == item || item == array[i + 1])
let result2 = case2.filter((item, i, array) => array[i - 1] == item || item == array[i + 1])
let result3 = case3.filter((item, i, array) => array[i - 1] == item || item == array[i + 1])
console.log(result1)
console.log(result2)
console.log(result3)
An out-of-bounds index in JavaScript returns undefined, which evaluates to false in your comparison.
So in javascript a naive implementation would be:
function selectSequential(source) {
var result = [];
for (let i = 0; i < source.length; i++) {
if ((i != source.length-1 && source[i+1] === source[i])
|| (i != 0 && source[i-1] === source[i])) {
result.push(source[i]);
}
}
return result;
}
Let say I have two classes:
# Mongoid document for a circle: centre stored as lat/lon plus a radius.
# NOTE(review): the class name is spelt "Cirle" — presumably "Circle";
# left unchanged here since other code may already reference the constant.
class Cirle
include Mongoid::Document
field :lat, type: Float
field :lon, type: Float
field :radius, type: Integer
end
# Mongoid document for a point, stored as lat/lon coordinates.
class Point
include Mongoid::Document
field :lat, type: Float
field :lon, type: Float
end
How can I find all Circles that include a given Point?
I'm not familiar with Mongoid, but perhaps the following will help. Suppose:
# Plain-hash stand-ins for the Circle documents: centre (x, y) plus radius.
circles = [
{ x: 1, y: 2, radius: 3 },
{ x: 3, y: 1, radius: 2 },
{ x: 2, y: 2, radius: 4 },
]
and
point = { x: 4.5, y: 1 } # the query point to test against each circle
then the circles containing point are obtained with the help of Math::hypot:
# A circle contains the point when the centre-to-point distance is at most
# the radius. Math.hypot(a, b) == sqrt(a**2 + b**2) and is sign-insensitive,
# so no .abs is needed on the coordinate differences.
circles.select do |circle|
  Math.hypot(circle[:x] - point[:x], circle[:y] - point[:y]) <= circle[:radius]
end
#=> [{ x: 3, y: 1, radius: 2 }, { x: 2, y: 2, radius: 4 }]
Edit: to improve efficiency as #Drenmi suggests:
# Faster variant: compare squared distances, avoiding the sqrt entirely.
px, py = point.values_at(:x, :y)
circles.select do |circle|
  dx = (circle[:x] - px).abs
  dy = (circle[:y] - py).abs
  radius = circle[:radius]
  dx * dx + dy * dy <= radius * radius
end
EDIT: I believe I didn't state correctly my question, so here is the edit.
I want to be able to compare (and score) a set of images with one image in terms of width and height.
Ideally, I would have a BASE_SCORE value (for example 100) that would be used in order to score each image depending on how close they look to the main image (in terms of width and height).
So, if for example, the main image looks like {:width => 100, :height => 100}, and set_images look like [{:width => 100, :height => 100}, {:width => 10, :height => 40}], the first element would have a score of BASE_SCORE, because they look exactly the same.
I fail to see how to compare width/heights in order to score each element of set_images.
Is there a problem with just using the Euclidean distance? Zero represents equality:
# Straight-line distance between two images in (width, height) space.
# 0.0 means the dimensions match exactly; larger values mean less similar.
def euclidean_distance(a, b)
  dw = a[:width] - b[:width]
  dh = a[:height] - b[:height]
  Math.sqrt(dw * dw + dh * dh)
end
# Reference image and the candidates to score against it.
test_subject = { width: 200, height: 50 }
samples = [
{ width: 100, height: 100 },
{ width: 80, height: 200 },
{ width: 200, height: 50 },
{ width: 10, height: 10 }
]
distances = samples.map { |s| euclidean_distance(test_subject, s) }
# Print each sample alongside its distance from the reference.
samples.zip(distances) { |img, dist| puts "#{img[:width]}x#{img[:height]} => #{dist}" }
Output:
100x100 => 111.80339887498948
80x200 => 192.09372712298546
200x50 => 0.0
10x10 => 194.164878389476
You can then use sort easily enough:
# sort_by is clearer and cheaper: it evaluates euclidean_distance once per
# sample (Schwartzian transform) instead of twice per pairwise comparison.
sorted = samples.sort_by { |sample| euclidean_distance(test_subject, sample) }
Something like this seems to work. Excuse the formatting...
$ cat foo.rb
require 'pp'
# Score each image by how far its aspect ratio (width/height) is from the
# main image's ratio; a score of 0.0 means the shapes match exactly.
main_image = {:width => 100, :height => 50}
set_of_images = [{:width => 200, :height => 300, :id => 2},
{:width => 100, :height => 50, :id => 9}]
aspect_ratio = main_image[:width] / main_image[:height].to_f
# NOTE: the map block mutates each image hash in place by adding :score.
sorted_images = set_of_images.
map{|i| i[:score] = (aspect_ratio - i[:width]/i[:height].to_f).abs; i}.
sort_by{|i| i[:score]}
pp sorted_images
$ ruby foo.rb
[{:width=>100, :height=>50, :id=>9, :score=>0.0},
{:width=>200, :height=>300, :id=>2, :score=>1.3333333333333335}]
array = [0, 0.3, 0.4, 0.2, 0.6]
# key1/key2 hold fresh slice copies; key3..key7 all reference the SAME
# array object, so a later mutation of `array` is visible through each.
hash = {
"key1" => array[0..2],
"key2" => array[0..3],
"key3" => array,
"key4" => array,
"key5" => array,
"key6" => array,
"key7" => array
}
Is there a way I can remove the duplication by doing something like
hash = {
"key1" => array[0..2],
"key2" => array[0..3],
%(key3, key4, key5, key6, key7).each {|ele| ele => array}
}
Try
array = [0, 0.3, 0.4, 0.2, 0.6]
# Seed the literal with the two special keys...
hash = {
  "key1" => array[0..2],
  "key2" => array[0..3]
}
# ...then point the remaining keys at the shared array in one pass.
("key3".."key7").each { |key| hash[key] = array }
array = [0, 0.3, 0.4, 0.2, 0.6]
# Builds all seven pairs in one expression: keys 1-2 get a prefix slice
# (x+2 elements), keys 3-7 get the whole array (slice to index -1).
h = Hash[*Array.new(7) {|x| ["key#{x+1}", array[0..(x<2?x+2:-1)]]}.flatten(1)]
h # => {"key1" => [0, 0.3, 0.4], "key2" => [0, 0.3, 0.4, 0.2],...}
Here's a couple variations on a theme. They work with 1.8.7 or 1.9.2. The insertion order is maintained with 1.9.2 'cause that's what it does:
require 'pp'
array = [0, 0.3, 0.4, 0.2, 0.6]
# Variant 1: build the key3..key7 entries first, then merge! the two
# special keys afterwards (so they appear last under 1.9.2 ordering).
hash = ('key3'..'key7').entries.inject({}) { |m, e| m[e] = array; m }
hash.merge!('key1' => array[0..2], 'key2' => array[0..3])
pp hash
puts '-' * 40
# Variant 2: start from a literal with the special keys and merge in the
# generated key3..key7 entries, producing a new hash.
hash = {
'key1' => array[0..2],
'key2' => array[0..3]
}.merge(('key3'..'key7').entries.inject({}) { |m, e| m[e] = array; m })
pp hash
puts '-' * 40
# Variant 3: seed the literal, then inject the remaining keys straight
# into it (mutates hash in place).
# I think this is the most readable/maintainable
hash = {
'key1' => array[0..2],
'key2' => array[0..3]
}
('key3'..'key7').entries.inject(hash) { |m, e| m[e] = array; m }
pp hash
Which output:
# >> {"key3"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key4"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key5"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key6"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key7"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key1"=>[0, 0.3, 0.4],
# >> "key2"=>[0, 0.3, 0.4, 0.2]}
# >> ----------------------------------------
# >> {"key1"=>[0, 0.3, 0.4],
# >> "key2"=>[0, 0.3, 0.4, 0.2],
# >> "key3"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key4"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key5"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key6"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key7"=>[0, 0.3, 0.4, 0.2, 0.6]}
# >> ----------------------------------------
# >> {"key1"=>[0, 0.3, 0.4],
# >> "key2"=>[0, 0.3, 0.4, 0.2],
# >> "key3"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key4"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key5"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key6"=>[0, 0.3, 0.4, 0.2, 0.6],
# >> "key7"=>[0, 0.3, 0.4, 0.2, 0.6]}
Here is another version:
# tap yields the freshly built literal so the key3..key7 entries can be
# attached before the finished hash is assigned — all in one expression.
hash = {
  "key1" => array[0..2],
  "key2" => array[0..3]
}.tap { |result| ("key3".."key7").each { |key| result[key] = array } }