My altair labels are cut off - how can I see the whole label? [duplicate] - label

bars = alt.Chart(df).mark_bar().encode(
x=alt.X('Pcnt:Q', axis=None),
y=alt.Y('Name',
axis=alt.Axis(domain=False,
ticks=False,
title=None,
labelPadding=15,
labelFontSize=16,
labelColor='#404040',
labelBaseline='middle',
# labelAngle= -45,
# labelExpr=axis_labels
),
sort=name_sort
)
)
text = bars.mark_text(align='left', baseline='middle', dx=3, size=14).\
encode(text=alt.Text('Pcnt:Q',format='.0%'))
Votes = (bars+text).properties(width=500,height=100
).properties(title={
"text": ["Who Shot First?"],
"subtitle": ["According to 834 respondents"],
"fontSize": 26, "color": '#353535',
"subtitleFontSize": 20, "subtitleColor": '#353535',
"anchor": 'start'}
).configure_mark(color='#008fd5'
).configure_view(strokeWidth=0
).configure_scale(bandPaddingInner=0.2
)
Votes
Currently (see the output below), the third label on the y-axis (i.e. "I don't understand this question") gets truncated. I want to wrap it so that the whole label is visible. Can anyone help? Thank you very much!
Desired chart is like this:

You can use labelLimit to control when the label is truncated:
import pandas as pd
import altair as alt
df = pd.DataFrame({
'label': ['Really long label here that will be truncated', 'Short label'],
'value': [4, 5]
})
alt.Chart(df).mark_bar().encode(
x='value',
y='label'
)
alt.Chart(df).mark_bar().encode(
x='value',
y=alt.Y('label', axis=alt.Axis(labelLimit=200))
)
You can also wrap on multiple lines by creating a list, as suggested in the comments:
from textwrap import wrap
# Wrap on whitespace with a max line length of 30 chars
df['label'] = df['label'].apply(wrap, args=[30])
alt.Chart(df).mark_bar().encode(
x='value',
y=alt.Y('label', axis=alt.Axis(labelFontSize=9)),
)

Related

How to plot when we have dropdown menu with multiple selections?

I am pretty new to dash plotly. I am trying to create a dropdown menu with multiple selections ON. I have a dataframe with the column names [‘col1’, ‘col2’, ‘col3’, ‘col4’, ‘col5’]. I want to plot this dataframe to have ‘col5’ in the y-axis, and the rest of the columns in the dropdown menu for the x-axis selection. I want to add these columns to my x-axis when I selected them from the dropdown menu.
I cannot understand what I need to modify to make it work. I checked the published posts but couldn’t figure it out.
I get “Callback error updating mygraph1.figure” when I run my code below:
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
from dash.exceptions import PreventUpdate
df = pd.DataFrame(np.random.randn(100, 5))
df.columns = (['col1', 'col2', 'col3', 'col4', 'col5'])
app = Dash(external_stylesheets=[dbc.themes.SUPERHERO])
app.layout = dbc.Container([
dbc.Row([
dbc.Col([
dcc.Dropdown(id='mydd1',
options=df.columns.values[0:4],
multi= True,
clearable=True,
value=[])
], width=4),
]),
dbc.Row([
dbc.Col([
dcc.Graph(id='mygraph1', figure={})
], width=4),
])
], fluid=True)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Return the figure for 'mygraph1' from the 'mydd1' dropdown value X."""
    if X is None:
        raise PreventUpdate
    # The dropdown is multi-select, so X is a list of column names and
    # df[X] is a DataFrame rather than a single column.
    fig1 = px.line(df, x=df[X], y=df['col5'])
    return fig1
if __name__ == "__main__":
app.run_server(debug = True, port=8055)
The problem is the value of X: it is a list, so you cannot use it directly as df[X], because df[['col1']] returns a DataFrame rather than the single column that the x argument expects. Add one trace per selected column instead.
Full example:
import dash
import dash_bootstrap_components as dbc
from dash import Dash, html, dcc,dash_table
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 5))
df.columns = (['col1', 'col2', 'col3', 'col4', 'col5'])
app = dash.Dash(external_stylesheets=[dbc.themes.SUPERHERO])
app.layout = dbc.Container([
dbc.Row([
dbc.Col([
dcc.Dropdown(id='mydd1',
options=df.columns.values[0:4],
multi= True,
clearable=True,
value=[])
], width=4),
]),
dbc.Row([
dbc.Col([
dcc.Graph(id='mygraph1', figure={})
], width=4),
])
], fluid=True)
@app.callback(
    Output('mygraph1', 'figure'),
    Input('mydd1', 'value'),
)
def update_title(X):
    """Redraw 'mygraph1' with one line trace per column selected in 'mydd1'.

    X is the dropdown's value; the dropdown is multi-select, so it is a
    list of column names. Each selected column is plotted on the x-axis
    against 'col5' on the y-axis.
    """
    # Nothing selected (empty list or None): leave the current figure as is.
    if not X:
        return dash.no_update
    fig = go.Figure()
    for col in X:
        fig.add_trace(go.Scatter(x=df[col], y=df['col5'], mode='lines', name=col))
    return fig
app.run_server(debug=True, use_reloader=False)
Output

exclude one of the hue from seaborn catplot visualization

I want to visualize category counts with a seaborn catplot, but one of the hue values is not important and does not need to be included in the visualization.
How can I select specific hue values to show in a catplot without changing or removing any values from the column?
You could remove the rows with that value from the dataframe. If the column is Categorical you might also need to change the categories as the legend will still contain all the categories.
Here is an example:
import seaborn as sns
import pandas as pd
tips = sns.load_dataset('tips')
tips['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
# create a subset, a copy is needed to be able to change the categorical column
tips_weekend = tips[tips['day'].isin(['Sat', 'Sun'])].copy()
tips_weekend['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['Sat', 'Sun'])
tips_weekend['day'].dtype # CategoricalDtype(categories=['Sat', 'Sun'], ordered=False)
sns.catplot(data=tips_weekend, x='smoker', y='tip', hue='day')
For the follow-up question, a histplot with multiple='fill' can show the percentage distribution:
import seaborn as sns
import pandas as pd
from matplotlib.ticker import PercentFormatter
tips = sns.load_dataset('tips')
tips_weekend = tips.copy()
# Collapse every day that is not Sat/Sun into a single 'other' value.
tips_weekend['day'] = tips_weekend['day'].apply(lambda x: x if x in ['Sat', 'Sun'] else 'other')
# fix a new order
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['other', 'Sat', 'Sun'])
ax = sns.histplot(data=tips_weekend, x='smoker', hue='day', stat='count', multiple='fill',
                  palette=['none', 'turquoise', 'crimson'])
# remove the first label ('other') in the legend
legend = ax.legend_
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles and
# later removed the old name; support both spellings.
if hasattr(legend, 'legend_handles'):
    handles = legend.legend_handles
else:
    handles = legend.legendHandles
ax.legend(handles=handles[1:], labels=['Sat', 'Sun'], title='day')
# The bars are normalized to 1 by multiple='fill', so format the axis as percent.
ax.yaxis.set_major_formatter(PercentFormatter(1))
# add percentages
for bar_group in ax.containers[:-1]:
    ax.bar_label(bar_group, label_type='center',
                 labels=[f'{bar.get_height() * 100:.1f} %' for bar in bar_group])

Issues with using data from upload in other components in Dash

When writing a program in Dash, I have been having issues. When using the Upload component, I am struggling to properly use that data on other components.
My goal is to use the data uploaded (CSV file) in order to add options to 2 identical Dropdown components, those being the names of the columns of the imported file.
A graph is to be generated using the selected values on the dropdowns afterward as the axis for the graph.
Any help would be appreciated.
import base64
import datetime
import io
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
df = pd.DataFrame()
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div([
html.Div(children='this is an attempt to do stuff right'),
dcc.Dropdown(id='Drop1'),
dcc.Dropdown(id='Drop2'),
dcc.Dropdown(id='graphtype', options=[
{'label': 'Bar', 'value': 'Bar'},
{'label': 'Scatter', 'value': 'Scatter'},
{'label': 'Histogram', 'value': 'Hist'}
]),
dcc.Upload(
id='upload-data',
children=html.Div([
'Drag and Drop or ',
html.A('Select Files')
]),
style={
'width': '100%',
'height': '60px',
'lineHeight': '60px',
'borderWidth': '1px',
'borderStyle': 'dashed',
'borderRadius': '5px',
'textAlign': 'center',
'margin': '10px'
},
# Allow multiple files to be uploaded
multiple=True
),
html.Div(id='output-data-upload'),
dcc.Graph(id='output-graph')
]
)
def parse_contents(contents, filename, date):
    """Decode one uploaded file and return a Div previewing it as a table.

    contents is the upload string, split at the comma into a header part
    and a base64 payload; filename selects the parser ('csv' or 'xls' must
    appear in it); date is a timestamp passed to datetime.fromtimestamp.
    Returns an error Div when the file cannot be parsed or its type is not
    recognised.
    """
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    try:
        if 'csv' in filename:
            # Assume that the user uploaded a CSV file
            df = pd.read_csv(
                io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            # Assume that the user uploaded an excel file
            df = pd.read_excel(io.BytesIO(decoded))
        else:
            # Without this branch df would be unbound below and the
            # function would raise UnboundLocalError for other file types.
            return html.Div([
                'There was an error processing this file.'
            ])
    except Exception as e:
        print(e)
        return html.Div([
            'There was an error processing this file.'
        ])
    return html.Div([
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        dash_table.DataTable(
            data=df.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in df.columns]
        ),
        html.Hr(),  # horizontal line
        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])
@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    """Render one preview per uploaded file into 'output-data-upload'.

    The three arguments are parallel lists (the Upload component is
    declared with multiple=True). Returns None implicitly when nothing
    has been uploaded yet.
    """
    if list_of_contents is not None:
        children = [
            parse_contents(c, n, d) for c, n, d in
            zip(list_of_contents, list_of_names, list_of_dates)]
        print(children)
        return children
if __name__ == '__main__':
app.run_server(debug=True)
This is the page from the docs that should give you all you need. If you upload a CSV file you can use:
df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
and from there just use it as a normal pandas data frame.

Display selected features after Gridsearch

I'm using GridSearchCV to perform feature selection (SelectKBest) for a linear regression. The results show that 10 features are selected (using .best_params_), but I'm unsure how to display which features these are.
The code is pasted below. I'm using a pipeline because the next models will also need hyperparameter selection. x_train is a dataframe with 12 columns that I cannot share due to data restrictions.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)), ('regr',
LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(x_train, y_train)
print(clf.best_params_)
You can access the information about feature_selection step like this:
<GridSearch_model_variable>.best_estimator_.named_steps[<feature_selection_step>]
So, in your case, it would be like this:
print(clf.best_estimator_.named_steps['feature_selection'])
#Output: SelectKBest(k=8, score_func=<function mutual_info_regression at 0x13d37b430>)
Next you can use the get_support function to get the boolean map of the selected features:
print(clf.best_estimator_.named_steps['feature_selection'].get_support())
# Output: array([ True, False, True, False, True, True, True, False, False,
True, True, False, True])
Now provide this map over the original columns:
data_columns = X.columns # List of columns in your dataset
# This is the original list of columns
print(data_columns)
# Output: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT']
# Now print the select columns
print(data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()])
# Output: ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']
So you can see that out of 13 features only 8 were selected (as in my data k=8 was the best case).
Here is the full code with boston dataset:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# NOTE: load_boston was removed in scikit-learn 1.2, so this example needs
# an older release (or a different regression dataset).
boston_dataset = load_boston()
X = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
y = boston_dataset.target
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
# Pass the splitter built above instead of a bare cv=5, so cv_folds is
# actually the object that controls the folds.
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=cv_folds, verbose=0)
clf = clf.fit(X, y)
print(clf.best_params_)
data_columns = X.columns
# get_support() is the boolean mask of the columns kept by the best selector.
selector = clf.best_estimator_.named_steps['feature_selection']
selected_features = data_columns[selector.get_support()]
print(selected_features)
# Output : Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
References:
https://stackoverflow.com/a/33378303/8160718
https://stackoverflow.com/a/38788087/8160718

How to Repeat Table Column Headings over Page Breaks in PDF output from ReportLab

I'm using ReportLab to write tables in PDF documents and am very pleased with the results (despite not having a total grasp on flowables just yet).
However, I have not been able to figure out how to make a table that spans a page break have its column headings repeated.
The code below creates a test.pdf in C:\Temp that has a heading row followed by 99 rows of data.
The heading row looks great on the first page but I would like that to repeat at the top of the second and third pages.
I'm keen to hear of any approaches that have been used to accomplish that using the SimpleDocTemplate.
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles=getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column
column1Heading = Paragraph("<para align=center>COLUMN ONE HEADING</para>",styles['Normal'])
column2Heading = Paragraph("<para align=center>COLUMN TWO HEADING</para>",styles['Normal'])
row_array = [column1Heading,column2Heading]
tableHeading = [row_array]
tH = Table(tableHeading, [6 * cm, 6 * cm]) # These are the column widths for the headings on the table
tH.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)])
tblStyle.add('BACKGROUND',(0,0),(-1,-1),colors.lightblue)
tH.setStyle(tblStyle)
elements.append(tH)
# Assemble rows of data for each column
for i in range(1,100):
column1Data = Paragraph("<para align=center> " + "Row " + str(i) + " Column 1 Data" + "</font> </para>",styles['Normal'])
column2Data = Paragraph("<para align=center> " + "Row " + str(i) + " Column 2 Data" + "</font> </para>",styles['Normal'])
row_array = [column1Data,column2Data]
tableRow = [row_array]
tR=Table(tableRow, [6 * cm, 6 * cm])
tR.hAlign = 'LEFT'
tR.setStyle(TableStyle([('BACKGROUND',(0,0),(-1,-1),colors.white),
('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)]))
elements.append(tR)
del tR
elements.append(Spacer(1, 0.3 * cm))
doc.build(elements)
From the documentation (yes, I know, but it's sometimes hard to locate this stuff in the manual):
The repeatRows argument specifies the number of leading rows that
should be repeated when the Table is asked to split itself.
So when you create the table, this is one of the arguments you can pass, and it will turn the first n rows into header rows that repeat. You'll find this part of the text on page 77, but the section relating to creating a Table starts on page 76.
http://www.reportlab.com/docs/reportlab-userguide.pdf
This is the code I developed, after following Gordon's advice to reconsider using repeatRows, and it works!
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles=getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column and start data list
column1Heading = "COLUMN ONE HEADING"
column2Heading = "COLUMN TWO HEADING"
# Assemble data for each column using simple loop to append it into data list
data = [[column1Heading,column2Heading]]
for i in range(1,100):
data.append([str(i),str(i)])
tableThatSplitsOverPages = Table(data, [6 * cm, 6 * cm], repeatRows=1)
tableThatSplitsOverPages.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('LINEBELOW',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)])
tblStyle.add('BACKGROUND',(0,0),(1,0),colors.lightblue)
tblStyle.add('BACKGROUND',(0,1),(-1,-1),colors.white)
tableThatSplitsOverPages.setStyle(tblStyle)
elements.append(tableThatSplitsOverPages)
doc.build(elements)
Use the repeatRows=1 when you create the Table...
from reportlab.platypus import Table
Table(data,repeatRows=1)
I always like to have something you can cut & paste into a .py file to run and test. So here it is...
import os
import pandas as pd
import numpy as np
import reportlab.platypus
import reportlab.lib.styles
from reportlab.lib import colors
from reportlab.lib.units import mm
from reportlab.lib.pagesizes import letter, landscape
# Write test.pdf into the current directory; joining the parts keeps the
# path separator correct on every platform.
reportoutputfilepath = os.path.join('.', 'test.pdf')
pdf_file = reportlab.platypus.SimpleDocTemplate(
    reportoutputfilepath,
    pagesize=landscape(letter),
    rightMargin=10,
    leftMargin=10,
    topMargin=38,
    bottomMargin=23
)
# Style commands; each tuple is (command, start cell, end cell, *args).
ts_tables = [
    ('ALIGN', (4,0), (-1,-1), 'RIGHT'),
    ('LINEBELOW', (0,0), (-1,0), 1, colors.purple),
    ('FONT', (0,0), (-1,0), 'Times-Bold'),
    ('LINEABOVE', (0,-1), (-1,-1), 1, colors.purple),
    ('FONT', (0,-1), (-1,-1), 'Times-Bold'),
    ('BACKGROUND',(1,1),(-2,-2),colors.white),
    ('TEXTCOLOR',(0,0),(1,-1),colors.black),
    ('FONTSIZE', (0,0),(-1,-1), 8),
]
df = pd.DataFrame(np.random.randint(0,1000,size=(1000, 4)), columns=list('ABCD'))
# First row is the header (column names), followed by the data rows.
lista = [df.columns.astype(str).tolist()] + df.values.tolist()
#Here is where you put repeatRows=1
table = reportlab.platypus.Table(lista, colWidths=(20*mm, 20*mm, 20*mm, 20*mm),repeatRows=1)
table_style = reportlab.platypus.TableStyle(ts_tables)
table.setStyle(table_style)
elements = []
elements.append(table)
# Build the PDF
pdf_file.build(elements)
print(reportoutputfilepath)
t1 = Table(lista, colWidths=220, rowHeights=20, repeatRows=1)
just type repeatRows=1
I found this solution to repeat easily the header on a table which is on two pages. Add this line in your CSS for your table:
-fs-table-paginate: paginate;
I also found a class for FPDF which seems powerful (I don't need it at the moment, so I haven't tested it):
http://interpid.eu/fpdf-table

Resources