How to use CatBoost to encode a dataset? - categorical-data

There is a package, category_encoders, that claims to use the CatBoost algorithm to encode datasets, but it does not have all the features of the catboost package that allow refining the model-training phase. After training, I'm trying to find a way to use CatBoost itself to transform the categorical variables of a dataset. Could you help me with that?

# If you want to test this on your local notebook
!pip install category-encoders
# import libraries
import pandas as pd
# Make dataset based on Josh Starmer video example
# Toy dataset: one categorical feature, one numerical feature and a binary label.
categorical = ["Blue", "Red", "Green", "Blue", "Green", "Green", "Blue"]
numerical = [1.72, 1.32, 1.81, 1.56, 1.64, 1.61, 1.73]
Label = [1, 0, 1, 0, 1, 0, 0]
# Column names follow the feature_list comment below and the encoded output
# table shown further down in the post.
df = pd.DataFrame({
    "favorite_color": categorical,
    "Hight(m)": numerical,
    "LovesTroll2": Label,
})
feature_list = list(df.columns)  # ['favorite_color', 'Hight(m)', 'LovesTroll2']
# import libraries
from category_encoders.cat_boost import CatBoostEncoder
import category_encoders as ce
# Define catboost encoder
cbe_encoder = ce.cat_boost.CatBoostEncoder() #approach1
CBE_encoder = CatBoostEncoder() #approach2
# Fit encoder and transform the features
train_cbe = cbe_encoder.fit_transform(df[feature_list], df[feature_list[-1]]) #approach1
Train_cbe = CBE_encoder.fit_transform(df[feature_list], df[feature_list[-1]]) #approach2
# favorite_color Hight(m) LovesTroll2
#0 0.428571 1.72 1
#1 0.428571 1.32 0
#2 0.428571 1.81 1
#3 0.714286 1.56 0
#4 0.714286 1.64 1
#5 0.809524 1.61 0
#6 0.476190 1.73 0
# plot the encoded results over target/label
import matplotlib.pyplot as plt
plt.scatter(Train_cbe['LovesTroll2'], Train_cbe['favorite_color'])


skan.Skeleton returning integer (not skeleton object)

Sorry, this might turn out to be a very basic problem but I'm stumped:
I'm trying to get the lengths of skeleton branches. I'm able to skeletonize my image and can see the skeleton with draw.overlay_skeleton_2d() but when I try to get any statistics on the skeleton back I get an error:
TypeError Traceback (most recent call last)
<ipython-input-3-5924e1b8ffc8> in <module>
4 # fig, ax = plt.subplots()
5 # draw.overlay_skeleton_2d(img, skeleton0, dilate=1, axes=ax)
----> 6 branch_data = summarize(Skeleton(skeleton0))
7 branch_data.head()
~\AppData\Roaming\Python\Python38\site-packages\skan\ in __init__(self, skeleton_image, spacing, source_image, _buffer_size_offset, keep_images, junction_mode, unique_junctions)
383 self.nbgraph = csr_to_nbgraph(graph, pixel_values)
384 self.coordinates = coords
--> 385 self.paths = _build_skeleton_path_graph(
386 self.nbgraph, _buffer_size_offset=_buffer_size_offset
387 )
~\AppData\Roaming\Python\Python38\site-packages\skan\ in _build_skeleton_path_graph(graph, _buffer_size_offset)
259 def _build_skeleton_path_graph(graph, *, _buffer_size_offset=None):
260 if _buffer_size_offset is None:
--> 261 max_num_cycles = graph.indices.size // 4
262 _buffer_size_offset = max_num_cycles
263 degrees = np.diff(graph.indptr)
TypeError: expected dtype object, got 'numpy.dtype[int32]'
Any tips would be great. I'm still a Python beginner so, like I said, the mistake might be a very basic one.
*Edits included
This is my code:
import cv2
from skan import draw, skeleton_to_csgraph, Skeleton, summarize
from skimage import morphology
# import numpy as np
from matplotlib import pyplot as plt
%matplotlib notebook
# load image and convert to bool
img = cv2.imread(save_p + "\\" + img_name + ".png")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img = img.astype(bool)
# skeletonize
skeleton0 = morphology.skeletonize(img)
### check skeleton
# fig, ax = plt.subplots()
# draw.overlay_skeleton_2d(img, skeleton0, dilate=1, axes=ax)
### from Documentation (linked below)
branch_data = summarize(Skeleton(skeleton0)) # ignoring scaling for now
# Skel_obj = Skeleton(skeleton0) # testing this but same problem
# Skel_obj.path_lengths(0)
I have attempted to follow a couple of posts (e.g. this one, but in that particular case the module won't install), and anyway it seems like skan.summarize is exactly what I need (see the documentation).
Name: numpy
Version: 1.23.4
Name: scikit-image
Version: 0.19.3
Name: pandas
Version: 1.3.4
Name: skan
Version: 0.10.0
Name: numba
Version: 0.50.1
Windows 10 PC

visualizing regression tree model with continuous numerical target class?

I am practicing with the Life Expectancy dataset from Kaggle, and I want to train and visualize a classification and regression tree model. However, I keep getting an error that says "InvocationException: GraphViz's executables not found". I am wondering whether this is because the target is a continuous numerical variable. How can I visualize the model?
import warnings
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt,pydotplus
from IPython.display import Image,display
data = pd.read_csv('Life Expectancy Data.csv')
data = data.dropna(how = 'any')
#feature selection
data = data.drop(columns=['infant deaths', ' thinness 5-9 years', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Total expenditure', 'Population', ' thinness 5-9 years', 'Year', 'Country'])
# Creating a instance of label Encoder.
le = LabelEncoder()
# Using .fit_transform function to fit label
# encoder and return encoded label
label = le.fit_transform(data['Status'])
# removing the column 'Status' from df
data.drop('Status', axis=1, inplace=True)
# Appending the array to our dataFrame
# with column name 'Status'
data['Status'] = label
#training model
model_data = data
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
# Fit the regression tree on the training split.
model = DecisionTreeRegressor().fit(X_train, y_train)
#visualizing tree
LEtree = tree.export_graphviz(model,
feature_names = ['Adult Mortality', 'Measles', ' BMI', 'under-five deaths', 'Polio', 'Diphtheria', ' HIV/AIDS', 'GDP', ' thinness 1-19 years', 'Income composition of resources', 'Schooling', 'Status'],
class_names = y,
label = 'all',
rounded = True,
filled = True)
full error message:
InvocationException Traceback (most recent call last)
Input In [27], in <cell line: 2>()
1 graph=pydotplus.graph_from_dot_data(LEtree)
----> 2 display(Image(graph.create_png()))
File ~\Anaconda3\lib\site-packages\pydotplus\, in Dot.__init__.<locals>.<lambda>(f, prog)
1792 # Automatically creates all the methods enabling the creation
1793 # of output in any of the supported formats.
1794 for frmt in self.formats:
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
1800 f.__doc__ = (
1801 '''Refer to the docstring accompanying the'''
1802 ''''create' method for more information.'''
1803 )
File ~\Anaconda3\lib\site-packages\pydotplus\, in Dot.create(self, prog, format)
1957 self.progs = find_graphviz()
1958 if self.progs is None:
-> 1959 raise InvocationException(
1960 'GraphViz\'s executables not found')
1962 if prog not in self.progs:
1963 raise InvocationException(
1964 'GraphViz\'s executable "%s" not found' % prog)
InvocationException: GraphViz's executables not found
The error is not caused by the continuous target; it means that the Graphviz executables could not be found on your system.
Try installing Graphviz in a proper directory.
With Anaconda, you can install it from the conda command prompt using the command below:
conda install -c conda-forge python-graphviz
Then replace the previously installed Graphviz directory; this might solve the problem.

Passing wrong argument for TensorFlow method

I am trying to classify two objects. I would like to get Accuracy and Cross Entropy from the script.
Here is the code I'm trying. (by tensorflow for poets)
# Copyright 2017 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import numpy as np
import PIL.Image as Image
import tensorflow as tf
import scripts.retrain as retrain
from scripts.count_ops import load_graph
def evaluate_graph(graph_file_name):
    """Evaluate a retrained graph on the 'testing' images under ``image_dir``.

    Args:
        graph_file_name: Path handed to ``load_graph``.

    Returns:
        A ``(mean_accuracy, mean_cross_entropy)`` tuple averaged over every
        evaluated image.
    """
    with load_graph(graph_file_name).as_default() as graph:
        # NOTE(review): the class dimension is hard-coded to 5 here, while the
        # ground-truth rows built below are sized from the image lists --
        # confirm the two agree for your dataset.
        ground_truth_input = tf.placeholder(
            tf.float32, [None, 5], name='GroundTruthInput')

        image_buffer_input = graph.get_tensor_by_name('input:0')
        final_tensor = graph.get_tensor_by_name('final_result:0')
        accuracy, _ = retrain.add_evaluation_step(final_tensor, ground_truth_input)

        logits = graph.get_tensor_by_name("final_training_ops/Wx_plus_b/add:0")
        xent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input,
            logits=logits))

    #image_dir = 'tf_files/flower_photos'
    image_dir = 'tf_files/test_images'
    testing_percentage = 10
    validation_percentage = 10
    validation_batch_size = 100
    category = 'testing'

    image_lists = retrain.create_image_lists(
        image_dir, testing_percentage,
        validation_percentage)
    class_count = len(image_lists.keys())

    # Build one one-hot ground-truth row and one file path per test image.
    ground_truths = []
    filenames = []
    for label_index, label_name in enumerate(image_lists.keys()):
        for image_index, image_name in enumerate(image_lists[label_name][category]):
            image_name = retrain.get_image_path(
                image_lists, label_name, image_index, image_dir, category)
            ground_truth = np.zeros([1, class_count], dtype=np.float32)
            ground_truth[0, label_index] = 1.0
            ground_truths.append(ground_truth)
            filenames.append(image_name)

    accuracies = []
    xents = []
    with tf.Session(graph=graph) as sess:
        for filename, ground_truth in zip(filenames, ground_truths):
            image = Image.open(filename).resize((224, 224), Image.ANTIALIAS)
            # Add a leading batch axis, then rescale pixel values around zero.
            image = np.array(image, dtype=np.float32)[None, ...]
            image = (image - 128) / 128.0

            feed_dict = {
                image_buffer_input: image,
                ground_truth_input: ground_truth}

            eval_accuracy, eval_xent = sess.run([accuracy, xent], feed_dict)

            accuracies.append(eval_accuracy)
            xents.append(eval_xent)

    return np.mean(accuracies), np.mean(xents)
if __name__ == "__main__":
    # Lower the verbosity of TensorFlow's native logging before evaluating.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    # Every command-line argument is forwarded positionally to evaluate_graph.
    accuracy, xent = evaluate_graph(*sys.argv[1:])
    print('Accuracy: %g' % accuracy)
    print('Cross Entropy: %g' % xent)
However when I run the above script for prediction I get the following error:
ValueError: Cannot feed value of shape (1, 224, 224) for Tensor
u'input:0', which has shape '(1, 224, 224, 3)'
How can I solve this error?
It seems like you are feeding grayscale images into input placeholder. Grayscale images have only 1 channel, hence the shape (224, 224, ) (dimension of size 1 is omitted), while pretrained network you are trying requires RGB images with 3 channels and shape (224, 224, 3)
If your images are actually RGB, you might have an error here:
image = np.array(image, dtype=np.float32)[None, ...]
this indexing: [None, ...] does not seem necessary.
If your images are actually grayscale, you may convert them into RGB format using PIL.convert() (one channel will be repeated 3 times):
image = image.convert("RGB")
Although with channel duplication running 3-channel CNN is inefficient (computation is performed for the same data 3 times) and likely will perform worse than with colored images, this should run the script and will get you on track quickly.

LSTM - LSTM - future value prediction error

After some research, I was able to predict the future value using the LSTM code below. I have also attached the Dmd1ahr.csv file in the github link that I am using.
As you all can see below, 90 data points are training sets and 91st to 100th are future value prediction.
However some of the questions that I still have are:
In order to predict these values I originally had to take more than a hundred data points (here, I have taken 500), which is not exactly my primary goal. Is there a way, given 500 data points, to predict the next 10 or 20 out-of-sample data points? If so, could you please write some sample code that takes 500 data points from the Dmd1ahr.csv file attached below and predicts some future values (say 501 to 520) based on those 500 points?
The predictions are way off compared to the ones you have in your blog posts (which definitely suggests that parameter tuning is needed - I tried changing the epochs, LSTM layers, activation and optimizer). What other parameter tuning can I do to make the model more robust?
Thank you all in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas
# By tweaking the architecture it could be made more robust
numOfSamples = 500
lengthTrain = 90
lengthValidation = 100
look_back = 1 # Can be set higher, in my experiments it made performance worse though
transientTime = 90 # Time to "burn in" time series
series = pandas.read_csv('Dmd1ahr.csv')
def generateTrainData(series, i, look_back):
    """Return the window of ``series`` that starts at position ``i``.

    The window is the slice ``series[i:i + look_back + 1]``, i.e. up to
    ``look_back + 1`` consecutive items (fewer when the slice runs past the
    end of ``series``).
    """
    return series[i:look_back+i+1]
trainX = np.stack([generateTrainData(series, i, look_back) for i in range(lengthTrain)])
testX = np.stack([generateTrainData(series, lengthTrain + i, look_back) for i in range(lengthValidation)])
trainX = trainX.reshape((lengthTrain,look_back+1,1))
testX = testX.reshape((lengthValidation, look_back + 1, 1))
trainY = trainX[:,1:,:]
trainX = trainX[:,:-1,:]
testY = testX[:,1:,:]
testX = testX[:,:-1,:]
############### Build Model ###############
import keras
from keras.models import Model
from keras import layers
from keras import regularizers
# Two inputs with a fixed batch size of 1: the main sequence input and an
# auxiliary input that is added to the network output further down.
inputs = layers.Input(batch_shape=(1,look_back,1), name="main_input")
inputsAux = layers.Input(batch_shape=(1,look_back,1), name="aux_input")
# this layer makes the actual prediction, i.e. decides if and how much it goes up or down
# NOTE(review): each LSTM below is applied to `inputs` and reassigns `x`, so
# only the last (50-unit) layer feeds the rest of the model; the first three
# results are discarded. If a stacked network was intended, each layer would
# need to take the previous `x` as its input -- confirm the intent.
x = layers.recurrent.LSTM(300,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(200,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(100,return_sequences=True, stateful=True)(inputs)
x = layers.recurrent.LSTM(50,return_sequences=True, stateful=True)(inputs)
# NOTE(review): the call on the next line is cut off in this copy of the post;
# its remaining arguments and closing parentheses are missing.
x = layers.wrappers.TimeDistributed(layers.Dense(1, activation="linear",
# auxiliary input, the current input will be fed directly to the output
# this way the prediction from the step before will be used as a "base", and the Network just has to
# learn if it goes a little up or down
# NOTE(review): the call below is also cut off; its closing parentheses and
# the tensor it is applied to are missing.
auxX = layers.wrappers.TimeDistributed(layers.Dense(1,
input_shape=(1,1), activation="linear", trainable=False
# The model output is the element-wise sum of the two branches.
outputs = layers.add([x, auxX], name="main_output")
model = Model(inputs=[inputs, inputsAux], outputs=outputs)
# NOTE(review): the comment line below looks like several statements
# (model.summary() plus what appear to be the arguments of compile/fit calls)
# fused together; no compile or fit call is visible in this copy, so the
# training step needs to be restored before the predictions below can run.
#model.summary(){"main_input": trainX, "aux_input": trainX[look_back-1,look_back,:]},{"main_output": trainY}, epochs=4, batch_size=1, shuffle=False){"main_input": trainX, "aux_input": trainX[:,look_back-1,:].reshape(lengthTrain,1,1)},{"main_output": trainY}, epochs=100, batch_size=1, shuffle=False)
############### make predictions ###############
burnedInPredictions = np.zeros(transientTime)
testPredictions = np.zeros(len(testX))
# burn series in, here use the first transientTime samples from the test data
for i in range(transientTime):
    prediction = model.predict([np.array(testX[i, :, 0].reshape(1, look_back, 1)), np.array(testX[i, look_back - 1, 0].reshape(1, 1, 1))])
    testPredictions[i] = prediction[0,0,0]
burnedInPredictions[:] = testPredictions[:transientTime]
# prediction, now dont use any previous data whatsoever anymore, network just has to run on its own output
for i in range(transientTime, len(testX)):
    # Feed the previous prediction back in as both the main and the aux input.
    prediction = model.predict([prediction, prediction])
    testPredictions[i] = prediction[0,0,0]
# for plotting reasons: blank out the burn-in part of the prediction curve
testPredictions[:np.size(burnedInPredictions)-1] = np.nan
############### plot results ###############
#import matplotlib.pyplot as plt
plt.plot(testX[:, 0, 0])
plt.plot(burnedInPredictions, label = "training")
plt.plot(testPredictions, label = "prediction")

Use Gensim or other python LDA packages to use trained LDA model from Mallet

I have an LDA model trained through Mallet in Java. Three files are generated from the Mallet LDA model, which allow me to run the model from files and infer the topic distribution of a new text.
Now I would like to implement a Python tool which is able to infer a topic distribution given a new text, based on the trained LDA model. I do not want to re-trained the LDA model in Python. Therefore, I wonder if it is possible to load the trained Mallet LDA model into Gensim or any other python LDA package. If so, how can I do it?
Thanks for any answers or comments.
In short, yes you can! What is nice about using Mallet is that once it has been run you don't have to go through and relabel topics. I'm doing something very similar - I'll post my code below with a few helpful links. Once your model is trained, save the notebook widget state and you'll be free to run your model on new and different datasets with the same topic allocation. This code includes a test and a validation set. Make sure you've downloaded Mallet and Java, then try this:
# future bridges python 2 and 3
from __future__ import print_function
# pandas works with data structures, data manipulation, and analysis specifically for numerical tables, and series like
# the csv we are using here today
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
# Gensim unsupervised topic modeling, natural language processing, statistical machine learning
import gensim
# convert a document to a list of tolkens
from gensim.utils import simple_preprocess
# remove stopwords - words that are not telling: "it" "I" "the" "and" ect.
from gensim.parsing.preprocessing import STOPWORDS
# corpus iterator
from gensim import corpora, models
# nltk - Natural Language Toolkit
# lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed
# into present.
# stemmed — words are reduced to their root form.
import nltk
# Download the WordNet corpus, which WordNetLemmatizer needs at runtime.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# NumPy - multidimensional arrays, matrices, and high-level mathematical formulas
import numpy as np
import os
from gensim.models.wrappers import LdaMallet
from pathlib import Path
import codecs
import logging
import re
import numpy as np
import pandas as pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
data = pd.read_csv('YourData.csv', encoding = "ISO-8859-1");
data_text = data[['Preprocessed Document or your comments column title']]
data_text['index'] = data_text.index
documents = data_text
# Create functions to lemmatize stem, and preprocess
# turn beautiful, beautifuly, beautified into stem beauti
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then return the Porter stem of the lemma."""
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# parse docs into individual words ignoring words that are less than 3 letters long
# and stopwords: him, her, them, for, there, ect since "their" is not a topic.
# then append the tolkens into a list
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens worth keeping.

    A token is kept when it is not a gensim stopword, is not in the custom
    stopword list and is longer than three characters.
    """
    # The custom stopword list does not change per token, so build it once.
    newStopWords = ['yourStopWord1', 'yourStopWord2']
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in newStopWords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
# gensim.parsing.preprocessing.STOPWORDS
# look at a random row 4310 and see if things worked out
# note that the document created was already preprocessed
# First column of the row whose 'index' value is 4310.
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
# let’s look at ten rows passed through the lemmatize stemming and preprocess
documents = documents.dropna(subset=['Preprocessed Document'])
processed_docs = documents['Preprocessed Document'].map(preprocess)
# we create a dictionary of all the words in the csv by iterating through
# contains the number of times a word appears in the training set.
dictionary_valid = gensim.corpora.Dictionary(processed_docs[20000:])
# Preview only the first few (id, word) pairs of the dictionary.
count = 0
for k, v in dictionary_valid.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break
# we create a dictionary of all the words in the csv by iterating through
# contains the number of times a word appears in the training set.
dictionary_test = gensim.corpora.Dictionary(processed_docs[:20000])
# Preview only the first few (id, word) pairs of the dictionary.
count = 0
for k, v in dictionary_test.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break
# we want to throw out words that are so frequent that they tell us little about the topic
# as well as words that are too infrequent >15 rows then keep just 100,000 words
dictionary_valid.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# we want to throw out words that are so frequent that they tell us little about the topic
# as well as words that are too infrequent >15 rows then keep just 100,000 words
dictionary_test.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# the words become numbers and are then counted for frequency
# consider a random row 4310 - it has 8 words word indexed 2 shows up once
# preview the bag of words
bow_corpus_valid = [dictionary_valid.doc2bow(doc) for doc in processed_docs]
# the words become numbers and are then counted for frequency
# consider a random row 4310 - it has 8 words word indexed 2 shows up once
# preview the bag of words
bow_corpus_test = [dictionary_test.doc2bow(doc) for doc in processed_docs]
# same thing in more words
bow_doc_4310 = bow_corpus_test[4310]
# Each bag-of-words entry is indexed as (word id, count); the id is looked up
# in dictionary_test because bow_corpus_test was built from that dictionary.
for word_id, word_count in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary_test[word_id],
                                                     word_count))
# Train a 20-topic Mallet LDA model on the test corpus and list its topics.
mallet_path = 'C:/mallet/mallet-2.0.8/bin/mallet.bat'
ldamallet_test = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus_test, num_topics=20, id2word=dictionary_test)
result = (ldamallet_test.show_topics(num_topics=20, num_words=10,formatted=False))
for each in result:
    print (each)
# Train a 20-topic Mallet LDA model on the validation corpus and list its topics.
mallet_path = 'C:/mallet/mallet-2.0.8/bin/mallet.bat'
ldamallet_valid = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus_valid, num_topics=20, id2word=dictionary_valid)
result = (ldamallet_valid.show_topics(num_topics=20, num_words=10,formatted=False))
for each in result:
    print (each)
# Show Topics
for idx, topic in ldamallet_test.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
# Show Topics
for idx, topic in ldamallet_valid.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
# check out the topics - 30 words - 20 topics
# NOTE(review): `idx` here is whatever value the loop above left behind;
# confirm that is the intended number of topics to print.
ldamallet_valid.print_topics(idx, 30)
# check out the topics - 30 words - 20 topics
ldamallet_test.print_topics(idx, 30)
# Compute Coherence Score
coherence_model_ldamallet_valid = CoherenceModel(model=ldamallet_valid, texts=processed_docs, dictionary=dictionary_valid, coherence='c_v')
coherence_ldamallet_valid = coherence_model_ldamallet_valid.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_valid)
# Compute Coherence Score
coherence_model_ldamallet_test = CoherenceModel(model=ldamallet_test, texts=processed_docs, dictionary=dictionary_test, coherence='c_v')
coherence_ldamallet_test = coherence_model_ldamallet_test.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_test)
Look at 16:
This helped:
and this:
I hope this helps and good luck :)
