When writing a program in Dash, I have been having issues. When using the Upload component, I am struggling to properly use that data on other components.
My goal is to use the data uploaded (CSV file) in order to add options to 2 identical Dropdown components, those being the names of the columns of the imported file.
A graph is to be generated using the selected values on the dropdowns afterward as the axis for the graph.
Any help would be appreciated.
import base64
import datetime
import io
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd
# Third-party stylesheet applied to the whole app.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Module-level placeholder frame; not read by the layout below.
df = pd.DataFrame()

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div(
    [
        html.Div(children='this is an attempt to do stuff right'),
        # Two dropdowns that start without any options.
        dcc.Dropdown(id='Drop1'),
        dcc.Dropdown(id='Drop2'),
        # Chart-type selector: (label, value) pairs expanded into option dicts.
        dcc.Dropdown(
            id='graphtype',
            options=[
                {'label': label, 'value': value}
                for label, value in (
                    ('Bar', 'Bar'),
                    ('Scatter', 'Scatter'),
                    ('Histogram', 'Hist'),
                )
            ],
        ),
        dcc.Upload(
            id='upload-data',
            children=html.Div(['Drag and Drop or ', html.A('Select Files')]),
            style=dict(
                width='100%',
                height='60px',
                lineHeight='60px',
                borderWidth='1px',
                borderStyle='dashed',
                borderRadius='5px',
                textAlign='center',
                margin='10px',
            ),
            # Allow multiple files to be uploaded
            multiple=True,
        ),
        # Container for the parsed upload and the graph output.
        html.Div(id='output-data-upload'),
        dcc.Graph(id='output-graph'),
    ]
)
def parse_contents(contents, filename, date):
    """Render one uploaded file as a Dash component tree.

    Args:
        contents: Upload payload of the form "<content type>,<base64 data>".
        filename: Name of the uploaded file; selects the CSV or Excel parser.
        date: Timestamp handed to ``datetime.datetime.fromtimestamp``.

    Returns:
        An ``html.Div`` holding the file name, the timestamp, a ``DataTable``
        of the parsed rows and a preview of the raw payload. When the file
        cannot be decoded or parsed, or its name matches neither parser, an
        ``html.Div`` with an error message is returned instead.
    """
    error_view = html.Div([
        'There was an error processing this file.'
    ])
    try:
        # Only the base64 part after the first comma is needed.
        _, content_string = contents.split(',', 1)
        decoded = base64.b64decode(content_string)
        if 'csv' in filename:
            # Assume that the user uploaded a CSV file
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            # Assume that the user uploaded an excel file
            df = pd.read_excel(io.BytesIO(decoded))
        else:
            # No parser applies; without this branch `df` would be unbound
            # when the table is built below.
            return error_view
    except Exception as e:
        # Best-effort boundary: report the failure and show the error view.
        print(e)
        return error_view

    return html.Div([
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        dash_table.DataTable(
            data=df.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in df.columns]
        ),
        html.Hr(),  # horizontal line
        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])
@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    """Build the preview components for every uploaded file.

    Args:
        list_of_contents: Upload payloads, or None when nothing is uploaded.
        list_of_names: File names, parallel to ``list_of_contents``.
        list_of_dates: Last-modified timestamps, parallel to the above.

    Returns:
        A list with one ``parse_contents`` result per file, or None when
        there is no upload.
    """
    if list_of_contents is None:
        return None
    children = [
        parse_contents(c, n, d) for c, n, d in
        zip(list_of_contents, list_of_names, list_of_dates)]
    print(children)
    return children
if __name__ == '__main__':
app.run_server(debug=True)
This is the page from the docs that should give you all you need. If you upload a CSV file you can use:
df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
and from there just use it as a normal pandas data frame.
Related
I am pretty new to dash plotly. I am trying to create a dropdown menu with multiple selections ON. I have a dataframe with the column names [‘col1’, ‘col2’, ‘col3’, ‘col4’, ‘col5’]. I want to plot this dataframe to have ‘col5’ in the y-axis, and the rest of the columns in the dropdown menu for the x-axis selection. I want to add these columns to my x-axis when I selected them from the dropdown menu.
I cannot understand what I need to modify to make it work. I checked the published posts but couldn’t figure it out.
I get “Callback error updating mygraph1.figure” when I run my code below:
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
from dash.exceptions import PreventUpdate
# Demo frame: 100 rows of random values in five named columns.
df = pd.DataFrame(np.random.randn(100, 5))
df.columns = ['col1', 'col2', 'col3', 'col4', 'col5']

app = Dash(external_stylesheets=[dbc.themes.SUPERHERO])

app.layout = dbc.Container(
    [
        # Row 1: multi-select dropdown offering the first four columns.
        dbc.Row([
            dbc.Col(
                [
                    dcc.Dropdown(
                        id='mydd1',
                        options=df.columns.values[0:4],
                        multi=True,
                        clearable=True,
                        value=[],
                    ),
                ],
                width=4,
            ),
        ]),
        # Row 2: the graph, initially an empty figure.
        dbc.Row([
            dbc.Col([dcc.Graph(id='mygraph1', figure={})], width=4),
        ]),
    ],
    fluid=True,
)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Redraw the line chart for the columns picked in the dropdown.

    Args:
        X: List of column names selected in the multi-select dropdown;
            empty (or None) when nothing is selected.

    Returns:
        A Plotly Express line figure with ``col5`` on the y-axis and the
        selected columns on the x-axis.

    Raises:
        PreventUpdate: when no column is selected, so the graph is kept.
    """
    # The dropdown's default value is [], so test for emptiness, not None.
    if not X:
        raise PreventUpdate
    # Pass column names, not df[X]: a DataFrame is not a valid `x` argument.
    fig1 = px.line(df, x=X, y='col5')
    return fig1
if __name__ == "__main__":
app.run_server(debug = True, port=8055)
The problem is the value of X: it is a list, so you cannot use it directly as df[X] — df[['col1']] yields a DataFrame rather than a single column, which is not a valid x argument here. Add one trace per selected column instead.
Full example:
import dash
import dash_bootstrap_components as dbc
from dash import Dash, html, dcc,dash_table
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
import numpy as np
# Demo frame: 10 rows of random values in five named columns.
df = pd.DataFrame(np.random.randn(10, 5))
df.columns = ['col1', 'col2', 'col3', 'col4', 'col5']

app = dash.Dash(external_stylesheets=[dbc.themes.SUPERHERO])

app.layout = dbc.Container(
    [
        # Row 1: multi-select dropdown offering the first four columns.
        dbc.Row([
            dbc.Col(
                [
                    dcc.Dropdown(
                        id='mydd1',
                        options=df.columns.values[0:4],
                        multi=True,
                        clearable=True,
                        value=[],
                    ),
                ],
                width=4,
            ),
        ]),
        # Row 2: the graph, initially an empty figure.
        dbc.Row([
            dbc.Col([dcc.Graph(id='mygraph1', figure={})], width=4),
        ]),
    ],
    fluid=True,
)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Draw one line trace per selected column against ``col5``.

    Args:
        X: List of column names selected in the dropdown; empty (or None)
            when nothing is selected.

    Returns:
        A ``go.Figure`` with one ``Scatter`` trace (mode ``lines``) per
        selected column, or ``dash.no_update`` when nothing is selected.
    """
    # Covers both the empty list and None.
    if not X:
        return dash.no_update
    fig = go.Figure()
    for col in X:
        fig.add_trace(
            go.Scatter(x=df[col], y=df['col5'], mode='lines', name=col))
    return fig
app.run_server(debug=True, use_reloader=False)
Output
I am practicing with this life expectancy dataset from Kaggle (https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who?select=Life+Expectancy+Data.csv) and I want to train and visualize a classification and regression tree model. however, I keep getting an error that says "InvocationException: GraphViz's executables not found". I am wondering if this is because of the nature of the continuous numerical target dataset type? how can I visualize the model?
code:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt,pydotplus
from IPython.display import Image,display
data = pd.read_csv('Life Expectancy Data.csv')
data = data.dropna(how='any')

# feature selection (each label listed once)
data = data.drop(columns=[
    'infant deaths', ' thinness 5-9 years', 'Alcohol',
    'percentage expenditure', 'Hepatitis B', 'Total expenditure',
    'Population', 'Year', 'Country',
])

# Encode the 'Status' column as integers with a label encoder.
le = LabelEncoder()
label = le.fit_transform(data['Status'])
# Drop the original column and re-add the encoded one; it lands last.
data.drop('Status', axis=1, inplace=True)
data['Status'] = label

# training model
model_data = data
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# visualizing tree
# Feature names come from the training frame so they always match the
# fitted columns. No class_names: the model is a regressor.
LEtree = tree.export_graphviz(model,
                              feature_names=list(X.columns),
                              label='all',
                              rounded=True,
                              filled=True)
graph = pydotplus.graph_from_dot_data(LEtree)
# create_png() raises InvocationException when pydotplus cannot find the
# Graphviz executables.
display(Image(graph.create_png()))
full error message:
InvocationException Traceback (most recent call last)
Input In [27], in <cell line: 2>()
1 graph=pydotplus.graph_from_dot_data(LEtree)
----> 2 display(Image(graph.create_png()))
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1797, in Dot.__init__.<locals>.<lambda>(f, prog)
1792 # Automatically creates all the methods enabling the creation
1793 # of output in any of the supported formats.
1794 for frmt in self.formats:
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
1800 f.__doc__ = (
1801 '''Refer to the docstring accompanying the'''
1802 ''''create' method for more information.'''
1803 )
File ~\Anaconda3\lib\site-packages\pydotplus\graphviz.py:1959, in Dot.create(self, prog, format)
1957 self.progs = find_graphviz()
1958 if self.progs is None:
-> 1959 raise InvocationException(
1960 'GraphViz\'s executables not found')
1962 if prog not in self.progs:
1963 raise InvocationException(
1964 'GraphViz\'s executable "%s" not found' % prog)
InvocationException: GraphViz's executables not found
Try installing Graphviz in a proper directory.
You can install it in Anaconda from the conda command prompt using the command below:
conda install -c conda-forge python-graphviz
Then replace the previously installed Graphviz directory; this might solve the problem.
I'm using GridSearchCV to perform feature selection (SelectKBest) for a linear regression. The results show that 10 features are selected (using .best_params_), but I'm unsure how to display which features these are.
The code is pasted below. I'm using a pipeline because the next models will also need hyperparameter selection. x_train is a dataframe with 12 columns that I cannot share due to data restrictions.
# Five consecutive, unshuffled folds for the grid search.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [
    ('feature_selection', SelectKBest(mutual_info_regression, k=3)),
    ('regr', LinearRegression()),
]
pipe = Pipeline(steps)
# Try every k from 1 to 12 for the feature-selection step.
search_space = [{'feature_selection__k': list(range(1, 13))}]
# Pass the splitter defined above instead of a bare fold count.
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error',
                   cv=cv_folds, verbose=0)
clf = clf.fit(x_train, y_train)
print(clf.best_params_)
You can access the information about feature_selection step like this:
<GridSearch_model_variable>.best_estimator_.named_steps[<feature_selection_step>]
So, in your case, it would be like this:
print(clf.best_estimator_.named_steps['feature_selection'])
#Output: SelectKBest(k=8, score_func=<function mutual_info_regression at 0x13d37b430>)
Next you can use the get_support function to get the boolean map of the selected features:
print(clf.best_estimator_.named_steps['feature_selection'].get_support())
# Output: array([ True, False, True, False, True, True, True, False, False,
True, True, False, True])
Now provide this map over the original columns:
data_columns = X.columns # List of columns in your dataset
# This is the original list of columns
print(data_columns)
# Output: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT']
# Now print the select columns
print(data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()])
# Output: ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']
So you can see that out of 13 features only 8 were selected (as in my data k=8 was the best case).
Here is the full code with boston dataset:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# NOTE(review): load_boston has been removed from newer scikit-learn
# releases; confirm the installed version still provides it.
boston_dataset = load_boston()
X = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
y = boston_dataset.target

# Five consecutive, unshuffled folds for the grid search.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [
    ('feature_selection', SelectKBest(mutual_info_regression, k=3)),
    ('regr', LinearRegression()),
]
pipe = Pipeline(steps)
# Try every k from 1 to 12 for the feature-selection step.
search_space = [{'feature_selection__k': list(range(1, 13))}]
# Pass the splitter defined above instead of a bare fold count.
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error',
                   cv=cv_folds, verbose=0)
clf = clf.fit(X, y)
print(clf.best_params_)

# Use the fitted selector's boolean mask to pick the chosen column names.
data_columns = X.columns
selected_features = data_columns[
    clf.best_estimator_.named_steps['feature_selection'].get_support()]
print(selected_features)
# Output : Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
References:
https://stackoverflow.com/a/33378303/8160718
https://stackoverflow.com/a/38788087/8160718
Spark Version: '2.0.0.2.5.0.0-1245'
So, my original question changed a bit but it's still the same issue.
What I want to do is load a huge amount of JSON files and transform those to a DataFrame - also probably save them as CSV or parquet file for further processing. Each JSON file represents one row in the final DataFrame.
import os
import glob
# Loads the JSON files found under the data-set folder into one DataFrame.
# The two values below are placeholders left unfilled by the author; as
# written these assignments are not valid Python.
HDFS_MOUNT = # ...
DATA_SET_BASE = # ...
# Explicit schema: three nullable string columns.
schema = StructType([
StructField("documentId", StringType(), True),
StructField("group", StringType(), True),
StructField("text", StringType(), True)
])
# Get the file paths
# NOTE(review): glob.glob is called without recursive=True, so '**' matches
# a single directory level like '*' — confirm that depth is intended.
file_paths = glob.glob(os.path.join(HDFS_MOUNT, DATA_SET_BASE, '**/*.json'))
# Strip the mount prefix (and its trailing slash) from every path.
file_paths = [f.replace(HDFS_MOUNT + '/', '') for f in file_paths]
print('Found {:d} files'.format(len(file_paths))) # 676 files
# NOTE(review): `sc`, SQLContext and the pyspark.sql.types names are not
# defined or imported in the visible lines.
sql = SQLContext(sc)
# Read all collected paths with the explicit schema.
df = sql.read.json(file_paths, schema=schema)
print('Loaded {:d} rows'.format(df.count())) # 9660 rows (what !?)
Besides the fact that there are 9660 rows instead of 676 (number of available files) I also have the problem that the content seems to be None:
df.head(2)[0].asDict()
gives
{
'documentId': None,
'group': None,
'text': None,
}
Example Data
This is just fake data of course but it resembles the actual data.
Note: Some fields may be missing, e.g. text need not always be present.
a.json
{
"documentId" : "001",
"group" : "A",
"category" : "indexed_document",
"linkIDs": ["adiojer", "asdi555", "1337"]
}
b.json
{
"documentId" : "002",
"group" : "B",
"category" : "indexed_document",
"linkIDs": ["linkId", "1000"],
"text": "This is the text of this document"
}
Assuming that all your files have the same structure and are in the same directory:
df = sql_cntx.read.json('/hdfs/path/to/folder/*.json')
There might be a problem if any of the columns has null values in all rows: Spark will then not be able to determine the schema, so you have the option of telling Spark which schema to use:
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, LongType
sc = SparkContext(appName="My app")
sql_cntx = SQLContext(sc)
schema = StructType([
StructField("field1", StringType(), True),
StructField("field2", LongType(), True)
])
df = sql_cntx.read.json('/hdfs/path/to/folder/*.json', schema=schema)
UPD:
In case a file contains multi-line formatted JSON, you can try this code:
sc = SparkContext(appName='Test')
sql_context = SQLContext(sc)
rdd = sc.wholeTextFiles('/tmp/test/*.json').values()
df = sql_context.read.json(rdd, schema=schema)
df.show()
Using Mac OS X API, I'm trying to save a PDF file with a Quartz filter applied, just like it is possible from the "Save As" dialog in the Preview application. So far I've written the following code (using Python and pyObjC, but it isn't important for me):
-- filter-pdf.py: begin
from Foundation import *
from Quartz import *
import objc
# Page rectangle passed to the PDF context as its media box.
page_rect = CGRectMake(0, 0, 612, 792)
# A space inside a Python string needs no shell-style backslash escape.
fdict = NSDictionary.dictionaryWithContentsOfFile_(
    "/System/Library/Filters/Blue Tone.qfilter")
in_pdf = CGPDFDocumentCreateWithProvider(
    CGDataProviderCreateWithFilename("test.pdf"))
url = CFURLCreateWithFileSystemPath(
    None, "test_out.pdf", kCFURLPOSIXPathStyle, False)
# NOTE(review): the author reports that passing the filter dictionary here
# does not apply the filter to the output.
c = CGPDFContextCreateWithURL(url, page_rect, fdict)
np = CGPDFDocumentGetNumberOfPages(in_pdf)
# Page numbers start at 1; copy each page using its own media box.
for ip in range(1, np + 1):
    page = CGPDFDocumentGetPage(in_pdf, ip)
    r = CGPDFPageGetBoxRect(page, kCGPDFMediaBox)
    CGContextBeginPage(c, r)
    CGContextDrawPDFPage(c, page)
    CGContextEndPage(c)
# Close the PDF context explicitly once all pages are drawn.
CGPDFContextClose(c)
-- filter-pdf.py: end
Unfortunately, the filter "Blue Tone" isn't applied; the output PDF looks exactly like the input PDF.
Question: what did I miss? How do I apply a filter?
Well, the documentation doesn't promise that creating and using "fdict" this way should cause the filter to be applied. But I have just rewritten (as far as I can) the sample code /Developer/Examples/Quartz/Python/filter-pdf.py, which was distributed with older versions of Mac OS X (meanwhile, this code doesn't work either):
----- filter-pdf-old.py: begin
from CoreGraphics import *
import sys, os, math, getopt, string
def usage():
    """Print the command-line synopsis of filter-pdf.py to standard output."""
    # One parenthesised argument parses the same on Python 2 and Python 3.
    print('''
usage: python filter-pdf.py FILTER INPUT-PDF OUTPUT-PDF
Apply a ColorSync Filter to a PDF document.
''')
def main():
    """Copy a PDF page by page through a context built with a filter.

    Takes three positional arguments from ``sys.argv``: FILTER, INPUT-PDF
    and OUTPUT-PDF. Prints a message and exits with status 1 when the
    arguments are wrong, or when the filter, the input document or the
    output context cannot be created.
    """
    page_rect = CGRectMake(0, 0, 612, 792)
    try:
        _opts, args = getopt.getopt(sys.argv[1:], '', [])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if len(args) != 3:
        usage()
        sys.exit(1)
    filter_path, input_path, output_path = args

    # Named context_filter so the builtin `filter` is not shadowed.
    context_filter = CGContextFilterCreateDictionary(filter_path)
    if not context_filter:
        print('Unable to create context filter')
        sys.exit(1)
    pdf = CGPDFDocumentCreateWithProvider(
        CGDataProviderCreateWithFilename(input_path))
    if not pdf:
        print('Unable to open input file')
        sys.exit(1)
    c = CGPDFContextCreateWithFilename(output_path, page_rect, context_filter)
    if not c:
        print('Unable to create output context')
        sys.exit(1)

    # Page numbers start at 1.
    for p in range(1, pdf.getNumberOfPages() + 1):
        #r = pdf.getMediaBox (p)
        # NOTE(review): the page number is passed to getBoxRect; presumably
        # a box selector is expected there — confirm against the bindings.
        r = pdf.getPage(p).getBoxRect(p)
        c.beginPage(r)
        c.drawPDFDocument(r, pdf, p)
        c.endPage()
    c.finish()
if __name__ == '__main__':
main ()
----- filter-pdf-old.py: end
=======================================================================
The working code based on the answer:
from Foundation import *
from Quartz import *
# Writes a copy of test.pdf with a Quartz filter passed as a write option.
# Open the source document from a file URL.
pdf_url = NSURL.fileURLWithPath_("test.pdf")
pdf_doc = PDFDocument.alloc().initWithURL_(pdf_url)
# Load the filter object from its .qfilter file.
furl = NSURL.fileURLWithPath_("/System/Library/Filters/Blue Tone.qfilter")
fobj = QuartzFilter.quartzFilterWithURL_(furl)
# The filter is handed over under the 'QuartzFilter' option key.
fdict = { 'QuartzFilter': fobj }
pdf_doc.writeToFile_withOptions_("test_out.pdf", fdict)
There are two approaches. If you need to open and modify an already existing file, use PDFKit's PDFDocument (reference) and call PDFDocument's writeToFile_withOptions_ with an options dict that includes the "QuartzFilter" option for the needed filter.
On the other hand, if you need your own drawing and have a CGContext at hand, you can use something along these lines:
from Quartz import *
# Draws into a PDF context with a Quartz filter applied to that context.
# Mutable buffer created with a capacity of 1024**2.
data = NSMutableData.dataWithCapacity_(1024**2)
# Data consumer built on that buffer, used to create the PDF context.
dataConsumer = CGDataConsumerCreateWithCFData(data)
context = CGPDFContextCreate(dataConsumer, None, None)
# Load the filter from its .qfilter file and apply it before any drawing.
f = QuartzFilter.quartzFilterWithURL_(NSURL.fileURLWithPath_("YourFltr.qfilter"))
f.applyToContext_(context)
# do your drawing
CGPDFContextClose(context)
# the PDF is in the data variable. Do whatever you need to do with the data (save to file...).