How to plot when we have dropdown menu with multiple selections? - drop-down-menu

I am pretty new to dash plotly. I am trying to create a dropdown menu with multiple selections ON. I have a dataframe with the column names [‘col1’, ‘col2’, ‘col3’, ‘col4’, ‘col5’]. I want to plot this dataframe to have ‘col5’ in the y-axis, and the rest of the columns in the dropdown menu for the x-axis selection. I want to add these columns to my x-axis when I selected them from the dropdown menu.
I cannot understand what I need to modify to make it work. I checked the published posts but couldn’t figure it out.
I get “Callback error updating mygraph1.figure” when I run my code below:
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
from dash.exceptions import PreventUpdate
df = pd.DataFrame(np.random.randn(100, 5))
df.columns = (['col1', 'col2', 'col3', 'col4', 'col5'])
app = Dash(external_stylesheets=[dbc.themes.SUPERHERO])
app.layout = dbc.Container([
dbc.Row([
dbc.Col([
dcc.Dropdown(id='mydd1',
options=df.columns.values[0:4],
multi= True,
clearable=True,
value=[])
], width=4),
]),
dbc.Row([
dbc.Col([
dcc.Graph(id='mygraph1', figure={})
], width=4),
])
], fluid=True)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Redraw 'mygraph1' from the dropdown selection (code as posted in the question).

    X is the 'value' of the multi-select dropdown 'mydd1', i.e. a list of
    column names (the layout initialises it to an empty list).
    """
    # The dropdown starts with value=[], so this guard only fires if the
    # component ever reports None.
    if X is None:
        raise PreventUpdate
    # NOTE: this call is what produces the reported
    # "Callback error updating mygraph1.figure": X is a list, so df[X] is a
    # DataFrame rather than a single column. It is left unchanged on purpose,
    # because it is the behaviour the question asks about.
    fig1 = px.line(df, x=df[X], y=df['col5'])
    return fig1
if __name__ == "__main__":
app.run_server(debug = True, port=8055)

The problem is in the value of X: it is a list, so you cannot use it directly as df[X], because df[['col1']] returns a DataFrame rather than a single column.
Full example:
import dash
import dash_bootstrap_components as dbc
from dash import Dash, html, dcc,dash_table
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 5))
df.columns = (['col1', 'col2', 'col3', 'col4', 'col5'])
app = dash.Dash(external_stylesheets=[dbc.themes.SUPERHERO])
app.layout = dbc.Container([
dbc.Row([
dbc.Col([
dcc.Dropdown(id='mydd1',
options=df.columns.values[0:4],
multi= True,
clearable=True,
value=[])
], width=4),
]),
dbc.Row([
dbc.Col([
dcc.Graph(id='mygraph1', figure={})
], width=4),
])
], fluid=True)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Plot 'col5' against every column selected in the dropdown.

    Args:
        X: value of the multi-select dropdown 'mydd1' -- a list of column
            names of ``df`` (possibly empty).

    Returns:
        A ``go.Figure`` holding one line trace per selected column, or
        ``dash.no_update`` when nothing is selected, so the figure that is
        currently displayed is left untouched.
    """
    # ``not X`` covers the empty list as well as None.
    if not X:
        return dash.no_update
    fig = go.Figure()
    # One trace per selected column: the column on x, 'col5' on y.
    for col in X:
        fig.add_trace(go.Scatter(x=df[col], y=df['col5'], mode='lines', name=col))
    return fig
app.run_server(debug=True, use_reloader=False)
Output

Related

visualizing regression tree model with continuous numerical target class?

I am practicing with this life expectancy dataset from Kaggle (https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who?select=Life+Expectancy+Data.csv) and I want to train and visualize a classification and regression tree model. However, I keep getting an error that says "InvocationException: GraphViz's executables not found". I am wondering if this is because the target of this dataset is a continuous numerical value. How can I visualize the model?
code:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt,pydotplus
from IPython.display import Image,display
# Load the dataset and discard every row that has a missing value.
data = pd.read_csv('Life Expectancy Data.csv')
data = data.dropna(how='any')

# Feature selection: drop the columns that are not used as predictors.
# Column names are written exactly as used in this script, including the
# leading space in ' thinness 5-9 years'.
data = data.drop(columns=['infant deaths', ' thinness 5-9 years', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Total expenditure', 'Population', 'Year', 'Country'])

# Encode the categorical 'Status' column as integer labels. The column is
# dropped and re-added, which moves it to the end of the frame.
le = LabelEncoder()
label = le.fit_transform(data['Status'])
data.drop('Status', axis=1, inplace=True)
data['Status'] = label

# Training the model: every remaining column except the target is a feature.
model_data = data
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Visualizing the tree. Feature names are taken from X itself so they can
# never drift out of sync with the training columns. class_names is omitted
# because it only applies to classifiers and this model is a regressor.
LEtree = tree.export_graphviz(model,
                              feature_names=list(X.columns),
                              label='all',
                              rounded=True,
                              filled=True)
# Rendering to PNG requires the Graphviz executables to be installed.
graph=pydotplus.graph_from_dot_data(LEtree)
display(Image(graph.create_png()))
full error message:
InvocationException Traceback (most recent call last)
Input In [27], in <cell line: 2>()
1 graph=pydotplus.graph_from_dot_data(LEtree)
----> 2 display(Image(graph.create_png()))
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1797, in Dot.__init__.<locals>.<lambda>(f, prog)
1792 # Automatically creates all the methods enabling the creation
1793 # of output in any of the supported formats.
1794 for frmt in self.formats:
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
1800 f.__doc__ = (
1801 '''Refer to the docstring accompanying the'''
1802 ''''create' method for more information.'''
1803 )
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1959, in Dot.create(self, prog, format)
1957 self.progs = find_graphviz()
1958 if self.progs is None:
-> 1959 raise InvocationException(
1960 'GraphViz\'s executables not found')
1962 if prog not in self.progs:
1963 raise InvocationException(
1964 'GraphViz\'s executable "%s" not found' % prog)
InvocationException: GraphViz's executables not found
Try installing Graphviz in a proper directory.
You can install it in Anaconda from the conda command prompt using the command below:
conda install -c conda-forge python-graphviz
Then replace the previously installed Graphviz directory; this might help you with the problem.

"name 'pygeos' is not defined"

When doing df = gpd.GeoDataFrame(df1, crs = 'EPSG:4326', geometry = geopandas.points_from_xy(df1.longitude,df1.latitude)) I get "name 'pygeos' is not defined", yet I have installed pygeos in the directory where I dev and
python3.9/site-packages/geopandas/_vectorized.py in points_from_xy(x, y, z)
247
248 if compat.USE_PYGEOS:
--> 249 return pygeos.points(x, y, z)
250 else:
251 out = _points_from_xy(x, y, z)
and import pygeos is in the script. Is there a specific way to install pygeos properly in order to avoid such an error? Thanks
# Script that, per its author, avoids the "name 'pygeos' is not defined"
# error; the order of the statements and imports below is kept as posted.
# NOTE(review): this only binds a Python variable named USE_PYGEOS; nothing in
# this script passes it to geopandas. Presumably the USE_PYGEOS environment
# variable was intended -- confirm.
USE_PYGEOS=1
import pyproj
import shapely
import pandas as pd
pd.options.display.max_rows = 100
import geopandas as gpd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
# Diagnostics: show version information and the current use_pygeos option.
gpd.show_versions()
print(gpd.options.use_pygeos)
# Semicolon-separated CSV; the longitude/latitude columns are used below.
location_df = pd.read_csv("location_01-03_01-04.csv", sep = ";")
import rtree
import pygeos
# Explicitly enable the pygeos option, after pygeos itself has been imported.
gpd.options.use_pygeos = True
# Point is (longitude, latitude)
# Build the point geometries from the longitude and latitude columns
location_geo = gpd.GeoDataFrame(location_df, crs = 'EPSG:4326', geometry = gpd.points_from_xy(location_df.longitude, location_df.latitude))
# NOTE(review): sep=";" looks copied from the read_csv call above -- confirm
# that read_file actually accepts/needs it for a GeoJSON file.
departments_df = gpd.read_file("departements.geojson", sep = ";")
print(departments_df)
import time
# Time the spatial join below.
start = time.time()
print("hello")
import geopandas
import rtree
# Check whether a department contains a position - returns the department of the position. NaN values are probably in another country
dept_points = geopandas.sjoin(location_geo, departments_df)
end = time.time()
print(end-start, ' s')
print(dept_points)
Somehow this did it for me.
It was about setting the constant and importing packages in a specific order.

exclude one of the hue from seaborn catplot visualization

I want to visualize category counts with a seaborn catplot, but one of the hue values is not important and doesn't need to be included in the visualization.
How can I select specific hue values in catplot to visualize without changing or removing any value from the column?
You could remove the rows with that value from the dataframe. If the column is Categorical you might also need to change the categories as the legend will still contain all the categories.
Here is an example:
import seaborn as sns
import pandas as pd
tips = sns.load_dataset('tips')
tips['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
# create a subset, a copy is needed to be able to change the categorical column
tips_weekend = tips[tips['day'].isin(['Sat', 'Sun'])].copy()
tips_weekend['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['Sat', 'Sun'])
tips_weekend['day'].dtype # CategoricalDtype(categories=['Sat', 'Sun'], ordered=False)
sns.catplot(data=tips_weekend, x='smoker', y='tip', hue='day')
For the follow-up question, a histplot with multiple='fill' can show the percentage distribution:
import seaborn as sns
import pandas as pd
from matplotlib.ticker import PercentFormatter
tips = sns.load_dataset('tips')
tips_weekend = tips.copy()
# Collapse every day that is not Sat/Sun into a single 'other' bucket.
tips_weekend['day'] = tips_weekend['day'].apply(lambda x: x if x in ['Sat', 'Sun'] else 'other')
# fix a new order
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['other', 'Sat', 'Sun'])
# multiple='fill' normalises each bar to 1; 'other' is drawn with the colour
# 'none', so only Sat and Sun remain visible.
ax = sns.histplot(data=tips_weekend, x='smoker', hue='day', stat='count', multiple='fill',
                  palette=['none', 'turquoise', 'crimson'])
# remove the first label ('other') in the legend
legend = ax.legend_
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles;
# use whichever attribute the installed version provides.
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
ax.legend(handles=handles[1:], labels=['Sat', 'Sun'], title='day')
ax.yaxis.set_major_formatter(PercentFormatter(1))
# add percentages (the last container is skipped, as in the original answer)
for bar_group in ax.containers[:-1]:
    ax.bar_label(bar_group, label_type='center', labels=[f'{bar.get_height() * 100:.1f} %' for bar in bar_group])

Issues with using data from upload in other components in Dash

When writing a program in Dash, I have been having issues. When using the Upload component, I am struggling to properly use that data on other components.
My goal is to use the data uploaded (CSV file) in order to add options to 2 identical Dropdown components, those being the names of the columns of the imported file.
A graph is to be generated using the selected values on the dropdowns afterward as the axis for the graph.
Any help would be appreciated.
import base64
import datetime
import io
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
df = pd.DataFrame()
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div([
html.Div(children='this is an attempt to do stuff right'),
dcc.Dropdown(id='Drop1'),
dcc.Dropdown(id='Drop2'),
dcc.Dropdown(id='graphtype', options=[
{'label': 'Bar', 'value': 'Bar'},
{'label': 'Scatter', 'value': 'Scatter'},
{'label': 'Histogram', 'value': 'Hist'}
]),
dcc.Upload(
id='upload-data',
children=html.Div([
'Drag and Drop or ',
html.A('Select Files')
]),
style={
'width': '100%',
'height': '60px',
'lineHeight': '60px',
'borderWidth': '1px',
'borderStyle': 'dashed',
'borderRadius': '5px',
'textAlign': 'center',
'margin': '10px'
},
# Allow multiple files to be uploaded
multiple=True
),
html.Div(id='output-data-upload'),
dcc.Graph(id='output-graph')
]
)
def parse_contents(contents, filename, date):
    """Build the preview components for one uploaded file.

    Args:
        contents: upload payload string; the part after the first ',' is
            base64-decoded to obtain the file bytes.
        filename: name of the uploaded file; it selects the parser
            ('csv' -> pandas.read_csv, 'xls' -> pandas.read_excel).
        date: timestamp passed to ``datetime.datetime.fromtimestamp``.

    Returns:
        An ``html.Div`` with the file name, the timestamp, a DataTable of the
        parsed data and the first 200 characters of the raw payload, or an
        ``html.Div`` with an error message when the file cannot be parsed or
        its type is not supported.
    """
    # Split only on the first comma: header on the left, payload on the right.
    content_type, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    try:
        if 'csv' in filename:
            # Assume that the user uploaded a CSV file
            df = pd.read_csv(
                io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            # Assume that the user uploaded an excel file
            df = pd.read_excel(io.BytesIO(decoded))
        else:
            # Unsupported file type: without this branch ``df`` would be
            # unbound below and the callback would raise UnboundLocalError.
            return html.Div([
                'There was an error processing this file.'
            ])
    except Exception as e:
        # Best effort: log the parser error and show a message in the page.
        print(e)
        return html.Div([
            'There was an error processing this file.'
        ])
    return html.Div([
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        dash_table.DataTable(
            data=df.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in df.columns]
        ),
        html.Hr(),  # horizontal line
        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])
@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    """Render a preview for every uploaded file.

    Args:
        list_of_contents: 'contents' of the upload component (one entry per
            file, since the component is declared with multiple=True), or
            None when nothing has been uploaded.
        list_of_names: matching 'filename' values.
        list_of_dates: matching 'last_modified' values.

    Returns:
        A list with one ``parse_contents`` result per file, or None when
        there is nothing to show.
    """
    if list_of_contents is None:
        return None
    children = [
        parse_contents(c, n, d)
        for c, n, d in zip(list_of_contents, list_of_names, list_of_dates)
    ]
    # Debug output kept from the original snippet.
    print(children)
    return children
if __name__ == '__main__':
app.run_server(debug=True)
This is the page from the docs that should give you all you need. If you upload a CSV file you can use:
df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
and from there just use it as a normal pandas data frame.

Display selected features after Gridsearch

I'm using GridSearchCV to perform feature selection (SelectKBest) for a linear regression. The results show that 10 features are selected (using .best_params_), but I'm unsure how to display which features these are.
The code is pasted below. I'm using a pipeline because the next models will also need hyperparameter selection. x_train is a dataframe with 12 columns that I cannot share due to data restrictions.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)), ('regr',
LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(x_train, y_train)
print(clf.best_params_)
You can access the information about feature_selection step like this:
<GridSearch_model_variable>.best_estimator_.named_steps[<feature_selection_step>]
So, in your case, it would be like this:
print(clf.best_estimator_.named_steps['feature_selection'])
#Output: SelectKBest(k=8, score_func=<function mutual_info_regression at 0x13d37b430>)
Next you can use the get_support function to get the boolean map of the selected features:
print(clf.best_estimator_.named_steps['feature_selection'].get_support())
# Output: array([ True, False, True, False, True, True, True, False, False,
#                 True, True, False, True])
Now provide this map over the original columns:
data_columns = X.columns # List of columns in your dataset
# This is the original list of columns
print(data_columns)
# Output: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
#          'PTRATIO', 'B', 'LSTAT']
# Now print the select columns
print(data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()])
# Output: ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']
So you can see that out of 13 features only 8 were selected (as in my data k=8 was the best case).
Here is the full code with boston dataset:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# NOTE(review): load_boston has, as far as I know, been removed from recent
# scikit-learn releases (1.2+) -- confirm the installed version provides it.
boston_dataset = load_boston()
X = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
y = boston_dataset.target

# 5 consecutive (unshuffled) folds, shared by every candidate in the search.
cv_folds = KFold(n_splits=5, shuffle=False)

# Pipeline: univariate feature selection followed by a linear regression.
# The k=3 given here is only a placeholder; the grid search overrides it.
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)

# Try every k from 1 to 12 for the 'feature_selection' step.
search_space = [{'feature_selection__k': list(range(1, 13))}]
# Pass the splitter defined above instead of repeating the literal 5.
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=cv_folds, verbose=0)
clf = clf.fit(X, y)
print(clf.best_params_)

# Map the boolean support mask of the best selector back onto column names.
data_columns = X.columns
selected_features = data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()]
print(selected_features)
# Output : Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
References:
https://stackoverflow.com/a/33378303/8160718
https://stackoverflow.com/a/38788087/8160718

Resources