Visualizing a regression tree model with a continuous numerical target - graphviz

I am practicing with this life expectancy dataset from Kaggle (https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who?select=Life+Expectancy+Data.csv) and I want to train and visualize a classification and regression tree model. However, I keep getting an error that says "InvocationException: GraphViz's executables not found". I am wondering whether this happens because the target is a continuous numerical variable. How can I visualize the model?
code:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image,display
data = pd.read_csv('Life Expectancy Data.csv')
data = data.dropna(how = 'any')
#feature selection
data = data.drop(columns=['infant deaths', ' thinness 5-9 years', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Total expenditure', 'Population', ' thinness 5-9 years', 'Year', 'Country'])
# Creating a instance of label Encoder.
le = LabelEncoder()
# Using .fit_transform function to fit label
# encoder and return encoded label
label = le.fit_transform(data['Status'])
# removing the column 'Status' from df
data.drop('Status', axis=1, inplace=True)
# Appending the array to our dataFrame
# with column name 'Status'
data['Status'] = label
#training model
model_data = data
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
#visualizing tree
LEtree = tree.export_graphviz(model,
                              feature_names = ['Adult Mortality', 'Measles', ' BMI', 'under-five deaths', 'Polio', 'Diphtheria', ' HIV/AIDS', 'GDP', ' thinness 1-19 years', 'Income composition of resources', 'Schooling', 'Status'],
                              class_names = y,
                              label = 'all',
                              rounded = True,
                              filled = True)
graph=pydotplus.graph_from_dot_data(LEtree)
display(Image(graph.create_png()))
full error message:
InvocationException Traceback (most recent call last)
Input In [27], in <cell line: 2>()
1 graph=pydotplus.graph_from_dot_data(LEtree)
----> 2 display(Image(graph.create_png()))
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1797, in Dot.__init__.<locals>.<lambda>(f, prog)
1792 # Automatically creates all the methods enabling the creation
1793 # of output in any of the supported formats.
1794 for frmt in self.formats:
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
1800 f.__doc__ = (
1801 '''Refer to the docstring accompanying the'''
1802 ''''create' method for more information.'''
1803 )
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1959, in Dot.create(self, prog, format)
1957 self.progs = find_graphviz()
1958 if self.progs is None:
-> 1959 raise InvocationException(
1960 'GraphViz\'s executables not found')
1962 if prog not in self.progs:
1963 raise InvocationException(
1964 'GraphViz\'s executable "%s" not found' % prog)
InvocationException: GraphViz's executables not found

Try installing Graphviz in a proper directory.
You can install it in Anaconda from the conda command prompt using the command below:
conda install -c conda-forge python-graphviz
and replace the previously installed graphviz directory; this might help you with the problem.
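If getting the Graphviz binaries onto the PATH stays painful and you just want a picture of the tree, scikit-learn's built-in plotting needs no external executables. A minimal sketch, reusing the fitted model and feature list from the question (note that class_names only applies to classifiers; for a regressor the leaves show predicted values instead):
import matplotlib.pyplot as plt
from sklearn import tree
# Render the regression tree with matplotlib only; no Graphviz executables needed.
plt.figure(figsize=(20, 10))
tree.plot_tree(model,
               feature_names=list(X.columns),
               max_depth=3,      # limit depth so the plot stays readable
               rounded=True,
               filled=True)
plt.show()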

Related

skan.Skeleton returning integer (not skeleton object)

Sorry, this might turn out to be a very basic problem but I'm stumped:
I'm trying to get the lengths of skeleton branches. I'm able to skeletonize my image and can see the skeleton with draw.overlay_skeleton_2d(), but when I try to get any statistics back on the skeleton I get an error:
TypeError Traceback (most recent call last)
<ipython-input-3-5924e1b8ffc8> in <module>
4 # fig, ax = plt.subplots()
5 # draw.overlay_skeleton_2d(img, skeleton0, dilate=1, axes=ax)
----> 6 branch_data = summarize(Skeleton(skeleton0))
7 branch_data.head()
~\AppData\Roaming\Python\Python38\site-packages\skan\csr.py in __init__(self, skeleton_image, spacing, source_image, _buffer_size_offset, keep_images, junction_mode, unique_junctions)
383 self.nbgraph = csr_to_nbgraph(graph, pixel_values)
384 self.coordinates = coords
--> 385 self.paths = _build_skeleton_path_graph(
386 self.nbgraph, _buffer_size_offset=_buffer_size_offset
387 )
~\AppData\Roaming\Python\Python38\site-packages\skan\csr.py in _build_skeleton_path_graph(graph, _buffer_size_offset)
259 def _build_skeleton_path_graph(graph, *, _buffer_size_offset=None):
260 if _buffer_size_offset is None:
--> 261 max_num_cycles = graph.indices.size // 4
262 _buffer_size_offset = max_num_cycles
263 degrees = np.diff(graph.indptr)
TypeError: expected dtype object, got 'numpy.dtype[int32]'
Any tips would be great. I'm still a Python beginner so, like I said, the mistake might be a very basic one.
Rgds,
Olie
*Edits included
This is my code:
import cv2
from skan import draw, skeleton_to_csgraph, Skeleton, summarize
from skimage import morphology
# import numpy as np
from matplotlib import pyplot as plt
%matplotlib notebook
# load image and convert to bool
img = cv2.imread(save_p + "\\" + img_name + ".png")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img = img.astype(bool)
# skeletonize
skeleton0 = morphology.skeletonize(img)
### check skeleton
# fig, ax = plt.subplots()
# draw.overlay_skeleton_2d(img, skeleton0, dilate=1, axes=ax)
### from Documentation (linked below)
branch_data = summarize(Skeleton(skeleton0)) # ignoring scaling for now
branch_data.head()
# Skel_obj = Skeleton(skeleton0) # testing this but same problem
# Skel_obj.path_lengths(0)
I have attempted to follow a couple of related posts (e.g. this one), but in that particular case the module won't install, and anyway it seems like skan.summarize is exactly what I need - documentation
Versions: numpy 1.23.4, scikit-image 0.19.3, pandas 1.3.4, skan 0.10.0, numba 0.50.1.
Windows 10 PC
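Given those versions, this looks more like a numba/NumPy mismatch than a skan usage problem: skan 0.10 compiles its CSR graph routines with numba, and numba 0.50.1 predates NumPy 1.23, which is known to trigger dtype errors like this inside jitted functions. A minimal check, with the upgrade step offered as a suggestion rather than a confirmed fix:
# Check for a numba/NumPy version mismatch before digging into skan itself.
import numba
import numpy
print(numba.__version__, numpy.__version__)
# If numba is old (e.g. 0.50.x) next to a recent NumPy, try upgrading numba
# and skan together in the same environment, then restart the kernel:
#   pip install --upgrade numba skan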

Forecasting validation loss fluctuation

I have a question for those who have some experience with time series forecasting.
I have been experimenting with this field for a few weeks, and I was trying to forecast some time series with both ARIMA and LSTM models to compare the results.
Basically I plotted this graph (Figure 1), which has 4 panels:
Top left: ARIMA training data points and fitted model points.
Top right: ARIMA test and forecast points.
Bottom left: LSTM training data and fitted data (I could not really find fitted points for the LSTM, so I just forecasted the training data, but you can ignore that part).
Bottom right: Test and forecast data for the LSTM model.
This graph was acceptable, and I also computed the RMSE and MSE; the LSTM gave a lower error, which agrees with most of the literature online stating the superiority of LSTM over ARIMA models.
However, after I plotted the loss and validation loss of the LSTM model to get more insight, I noticed that the validation loss follows a weird fluctuating pattern (Figure 2).
I can explain this as follows: the time series has a lot of outliers or abnormal behaviour, so splitting it into train/validation/test means the validation set cannot really be a good metric of how well the model can learn.
But since research papers never show this graph or discuss this problem, I don't have a solid argument to defend this idea.
What do you think?
Thank you in advance
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_percentage_error,mean_absolute_percentage_error
from statsmodels.tsa.seasonal import STL
import numpy as np
from pandas import Series, DataFrame
from scipy import stats
from statsmodels.tsa.stattools import adfuller
import statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas.plotting import register_matplotlib_converters
import pmdarima as pm
register_matplotlib_converters()
import warnings
import time
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from numpy import array
import keras_tuner as kt
import tensorflow as tf
print(tf.__version__)
from numpy import array
from tensorflow import keras
import keras_tuner as kt
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Bidirectional
from tensorflow.keras import initializers
import random as rn
np.random.seed(123)
rn.seed(123)
tf.random.set_seed(123)
tf.keras.utils.set_random_seed(123)
keras.utils.set_random_seed(123)
warnings.filterwarnings('ignore')
df3 = pd.read_csv('favorita_train.csv')
## 1 - Get TS and do STL
print("TS lenbgth : "+str(len(df3)))
results = seasonal_decompose(df3['unit_sales'],period=30)
results.plot();
train_all = df3.iloc[:int(len(df3)*0.8)]
train = df3.iloc[:int(len(df3)*0.6)]
val = df3.iloc[int(len(df3)*0.6):int(len(df3)*0.8)]
test = df3.iloc[int(len(df3)*0.8):]
scaler = MinMaxScaler()
scaler.fit(train_all)
scaled_all = scaler.transform(df3)
scaled_train = scaler.transform(train)
scaled_train_all = scaler.transform(train_all)
scaled_val = scaler.transform(val)
scaled_test = scaler.transform(test)
# We do the same thing, but now instead for 12 months
n_features = 1
n_input =5
train_generator_all = TimeseriesGenerator(scaled_train_all, scaled_train_all, length=n_input, batch_size=1,shuffle=True)
train_generator = TimeseriesGenerator(scaled_train, scaled_train, length=n_input, batch_size=1,shuffle=True)
val_generator = TimeseriesGenerator(scaled_val, scaled_val, length=n_input, batch_size=1,shuffle=True)
adfPValue = adfuller(scaled_all)
adfPValue=adfPValue[1]
adi = len(scaled_all)/((scaled_all != 0).sum())
sd=scaled_all.std()
mean=scaled_all.mean()
cv2 = np.square(sd/mean)
print("CV2 (describe magnitude of demande variability <0.5 is good) :"+str(cv2))
print("SD (-2,2 is good | mean data variance is low) :"+str(sd))
print("ADI (1.3 or smaller means smooth ts) :"+str(adi))
print("Stationarity test (stationary if <0.05) :"+str(adfPValue))
def model_builder(hp):
    model = keras.Sequential()
    hp_units = hp.Int('units', min_value=1, max_value=50, step=1)
    hp_layers = hp.Int('layers', min_value=1, max_value=3, step=1)
    if hp_layers == 1:
        model.add(Bidirectional(LSTM(hp_units, activation='relu'), input_shape=(n_input, n_features)))
    elif hp_layers == 2:
        model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True), input_shape=(n_input, n_features)))
        model.add(Bidirectional(LSTM(hp_units, activation='relu')))
    else:
        model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True), input_shape=(n_input, n_features)))
        for i in range(hp_layers - 2):
            model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True)))
        model.add(Bidirectional(LSTM(hp_units, activation='relu')))
    model.add(Dense(1))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='mse', metrics=['accuracy'])
    return model
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=300,
                     factor=3,
                     directory='499',
                     project_name='949',
                     seed=123)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
tuner.search(train_generator, epochs=300, validation_data=val_generator, shuffle=True, callbacks=[stop_early], batch_size=len(train_generator))
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.get('units'))
print(best_hps.get('layers'))
print(best_hps.get('window'))
print(best_hps.get('learning_rate'))
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
history = model.fit(img_train, label_train, epochs=50, validation_split=0.2)
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
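One thing worth checking, as a guess based on the code above rather than anything the thread confirms: with batch_size=1 the parameter updates are very noisy, so the model evaluated at the end of each epoch jumps around, and that alone can produce a jagged val_loss curve even when the model is learning. A minimal sketch for separating noise from a real trend, assuming a Keras History object named history returned by a fit call with validation data:
import pandas as pd
import matplotlib.pyplot as plt
# Smooth the raw per-epoch losses with a rolling mean so a systematic
# divergence between training and validation loss is easier to see.
hist = pd.DataFrame(history.history)
hist[['loss', 'val_loss']].rolling(window=5, min_periods=1).mean().plot()
plt.xlabel('epoch')
plt.ylabel('MSE (rolling mean, window=5)')
plt.show()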

"name 'pygeos' is not defined"

When doing df = gpd.GeoDataFrame(df1, crs = 'EPSG:4326', geometry = geopandas.points_from_xy(df1.longitude,df1.latitude)) I get "name 'pygeos' is not defined", yet I have installed pygeos in the environment where I develop, and
python3.9/site-packages/geopandas/_vectorized.py in points_from_xy(x, y, z)
247
248 if compat.USE_PYGEOS:
--> 249 return pygeos.points(x, y, z)
250 else:
251 out = _points_from_xy(x, y, z)
and import pygeos is in the script. Is there a specific way to install pygeos in order to avoid such an error? Thanks
USE_PYGEOS=1
import pyproj
import shapely
import pandas as pd
pd.options.display.max_rows = 100
import geopandas as gpd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
gpd.show_versions()
print(gpd.options.use_pygeos)
location_df = pd.read_csv("location_01-03_01-04.csv", sep = ";")
import rtree
import pygeos
gpd.options.use_pygeos = True
#Point is (longitude, latitude)
# Function making geopandas points of latitude, longitude
location_geo = gpd.GeoDataFrame(location_df, crs = 'EPSG:4326', geometry = gpd.points_from_xy(location_df.longitude, location_df.latitude))
departments_df = gpd.read_file("departements.geojson", sep = ";")
print(departments_df)
import time
start = time.time()
print("hello")
import geopandas
import rtree
# Function to check wether a department contains a position - returns the department of the position. NaN values are probably in another country
dept_points = geopandas.sjoin(location_geo, departments_df)
end = time.time()
print(end-start, ' s')
print(dept_points)
Somehow this did it for me.
It was about setting the constant and importing packages in a specific order.
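For what it's worth, a variant that has also worked for me (a hedged sketch; the key points are that pygeos must be installed in the same environment as geopandas, and the flag has to be set before geopandas builds any geometry arrays):
import os
# Ask geopandas to use pygeos before it is imported; this only works if
# pygeos is installed in the same environment (pip install pygeos),
# and a kernel restart is needed after installing it.
os.environ["USE_PYGEOS"] = "1"

import geopandas as gpd
print(gpd.options.use_pygeos)  # should report True if pygeos was found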

I can't load my nn model that I've trained and saved

I used transfer learning to train the model. The base model was EfficientNet.
You can read more about it here
from tensorflow import keras
from keras.models import Sequential,Model
from keras.layers import (Dense, Dropout, Conv2D, MaxPooling2D,
                          Flatten, BatchNormalization, Activation)
from keras.optimizers import RMSprop , Adam ,SGD
from keras.backend import sigmoid
Activation function
class SwishActivation(Activation):
    def __init__(self, activation, **kwargs):
        super(SwishActivation, self).__init__(activation, **kwargs)
        self.__name__ = 'swish_act'

def swish_act(x, beta=1):
    return (x * sigmoid(beta * x))
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Activation
get_custom_objects().update({'swish_act': SwishActivation(swish_act)})
Model Definition
model = enet.EfficientNetB0(include_top=False, input_shape=(150,50,3), pooling='avg', weights='imagenet')
Adding 2 fully-connected layers to B0.
x = model.output
x = BatchNormalization()(x)
x = Dropout(0.7)(x)
x = Dense(512)(x)
x = BatchNormalization()(x)
x = Activation(swish_act)(x)
x = Dropout(0.5)(x)
x = Dense(128)(x)
x = BatchNormalization()(x)
x = Activation(swish_act)(x)
x = Dense(64)(x)
x = Dense(32)(x)
x = Dense(16)(x)
# Output layer
predictions = Dense(1, activation="sigmoid")(x)
model_final = Model(inputs = model.input, outputs = predictions)
model_final.summary()
I saved it using:
model.save('model.h5')
I get the following error trying to load it:
model=tf.keras.models.load_model('model.h5')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-e3bef1680e4f> in <module>()
1 # Recreate the exact same model, including its weights and the optimizer
----> 2 model = tf.keras.models.load_model('PhoneDetection-CNN_29_July.h5')
3
4 # Show the model architecture
5 model.summary()
10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/utils/generic_utils.py in class_and_config_for_serialized_keras_object(config, module_objects, custom_objects, printable_module_name)
319 cls = get_registered_object(class_name, custom_objects, module_objects)
320 if cls is None:
--> 321 raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
322
323 cls_config = config['config']
ValueError: Unknown layer: FixedDropout
I was getting the same error while trying to do inference by loading my saved model.
Then I just imported the efficientnet library in my inference notebook as well, and the error was gone.
My import command looked like:
import efficientnet.keras as efn
(Note that if you haven't installed efficientnet already (which is unlikely), you can do so with the !pip install efficientnet command.)
I had this same issue with a recent model. Researching the source code, you can find the FixedDropout class. I added it to my inference code along with the imports of backend and layers. The rate should also match the rate from your efficientnet model, so for EfficientNetB0 the rate is 0.2 (others are different).
from tensorflow.keras import backend, layers

class FixedDropout(layers.Dropout):
    def _get_noise_shape(self, inputs):
        if self.noise_shape is None:
            return self.noise_shape
        symbolic_shape = backend.shape(inputs)
        noise_shape = [symbolic_shape[axis] if shape is None else shape
                       for axis, shape in enumerate(self.noise_shape)]
        return tuple(noise_shape)

model = keras.models.load_model('model.h5',
                                custom_objects={'FixedDropout': FixedDropout(rate=0.2)})
I was getting the same error. Then I added the imports below, and it started working properly.
import cv2
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import itertools
import os, glob
from tqdm import tqdm
from efficientnet.tfkeras import EfficientNetB4
If you don't have it installed, run !pip install efficientnet. If you have any problem, post it here.
In my case, I had two files, train.py and test.py.
I was saving my .h5 model inside train.py and attempting to load it inside test.py, and got the same error. To fix it, you need to add the import statements for your efficientnet models inside the file that is attempting to load the model as well (in my case, test.py).
from efficientnet.tfkeras import EfficientNetB0
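To combine the answers above for this particular question: the saved model also uses the custom swish_act activation, so the load call may need that custom object as well. A hedged sketch, reusing names from the question's code and not verified against the saved file:
import tensorflow as tf
from tensorflow.keras import backend as K
import efficientnet.tfkeras  # importing this registers FixedDropout and friends as custom objects

# The custom activation from the question must also be available when loading.
def swish_act(x, beta=1):
    return x * K.sigmoid(beta * x)

model = tf.keras.models.load_model('model.h5',
                                   custom_objects={'swish_act': swish_act})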

Scikit learn algorithms performing extremely poorly

I'm new to scikit-learn and I'm banging my head against the wall. I've used both real-world and test data, and the scikit-learn algorithms are not performing above chance level in predicting anything. I've tried kNN, decision trees, SVC and naive Bayes.
Basically, I made a test data set consisting of a column of 0s and 1s, with all the 0s having a feature value between 0 and .5 and all the 1s having a feature value between .5 and 1. This should be extremely easy and give near 100% accuracy. However, none of the algorithms perform above chance level; accuracies range from 45 to 55%. I've already tried tweaking a whole bunch of parameters for every algorithm, but nothing helps. I think something is fundamentally wrong with my implementation.
Please help me out. Here's my code:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import sklearn
import pandas
import numpy as np
df=pandas.read_excel('Test.xlsx')
# Make data into np arrays
y = np.array(df[1])
y=y.astype(float)
y=y.reshape(399)
x = np.array(df[2])
x=x.astype(float)
x=x.reshape(399, 1)
# Creating training and test data
labels_train, labels_test = train_test_split(y)
features_train, features_test = train_test_split(x)
#####################################################################
# PERCEPTRON
#####################################################################
from sklearn import linear_model
perceptron=linear_model.Perceptron()
perceptron.fit(features_train, labels_train)
perc_pred=perceptron.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, perc_pred, normalize=True, sample_weight=None)
print 'perceptron'
#####################################################################
# KNN classifier
#####################################################################
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(features_train, labels_train)
knn_pred = knn.predict(features_test)
# Accuraatheid
print sklearn.metrics.accuracy_score(labels_test, knn_pred, normalize=True, sample_weight=None)
print 'knn'
#####################################################################
## SVC
#####################################################################
from sklearn.svm import SVC
from sklearn import svm
svm2 = SVC(kernel="linear")
svm2 = svm.SVC()
svm2.fit(features_train, labels_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
gamma=1.0, kernel='linear', max_iter=-1, probability=False,
random_state=None,
shrinking=True, tol=0.001, verbose=False)
svc_pred = svm2.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, svc_pred, normalize=True,
sample_weight=None)
#####################################################################
# Decision tree
#####################################################################
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
tree_pred=clf.predict(features_test)
# Accuraatheid
print sklearn.metrics.accuracy_score(labels_test, tree_pred, normalize=True,
sample_weight=None)
print 'tree'
#####################################################################
# Naive bayes
#####################################################################
import sklearn
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
GaussianNB()
bayes_pred = clf.predict(features_test)
print sklearn.metrics.accuracy_score(labels_test, bayes_pred,
normalize=True, sample_weight=None)
You seem to be using train_test_split the wrong way.
labels_train, labels_test = train_test_split(y) #WRONG
features_train, features_test = train_test_split(x) #WRONG
The splits of your labels and data aren't necessarily the same, so the test labels no longer correspond to the test features. One easy way to split your data manually:
randomvec=np.random.rand(len(data))
randomvec=randomvec>0.5
train_data=data[randomvec]
train_label=labels[randomvec]
test_data=data[np.logical_not(randomvec)]
test_label=labels[np.logical_not(randomvec)]
or to use the scikit method properly:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)
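To make it concrete, here is a hedged end-to-end sketch reusing the x and y arrays from the question (note the import path: train_test_split now lives in sklearn.model_selection, as sklearn.cross_validation was removed long ago). With the features and labels split together, a plain kNN classifier should score close to 100% on this synthetic data:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split features and labels in a single call so the rows stay aligned.
features_train, features_test, labels_train, labels_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

knn = KNeighborsClassifier()
knn.fit(features_train, labels_train)
print(accuracy_score(labels_test, knn.predict(features_test)))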
