Exclude one of the hues from a seaborn catplot visualization - seaborn

I want to visualize category counts with a seaborn catplot, but one of the hue values is not important and doesn't need to be included in the visualization.
How can I select specific hues in catplot without changing or removing any values from the column?

You could remove the rows with that value from the dataframe. If the column is Categorical, you might also need to update its categories, as the legend will otherwise still contain all of them.
Here is an example:
import seaborn as sns
import pandas as pd
tips = sns.load_dataset('tips')
tips['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
# create a subset, a copy is needed to be able to change the categorical column
tips_weekend = tips[tips['day'].isin(['Sat', 'Sun'])].copy()
tips_weekend['day'].dtype # CategoricalDtype(categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False)
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['Sat', 'Sun'])
tips_weekend['day'].dtype # CategoricalDtype(categories=['Sat', 'Sun'], ordered=False)
sns.catplot(data=tips_weekend, x='smoker', y='tip', hue='day')
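Depending on your seaborn version, you may not even need to touch the dataframe: passing hue_order with only the wanted levels subsets the data drawn. A minimal sketch; verify against your version, since older releases handled unused categories differently:
import seaborn as sns
tips = sns.load_dataset('tips')
# only 'Sat' and 'Sun' are drawn; 'Thur' and 'Fri' rows are left out of the plot
sns.catplot(data=tips, x='smoker', y='tip', hue='day', hue_order=['Sat', 'Sun'])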
For the follow-up question, a histplot with multiple='fill' can show the percentage distribution:
import seaborn as sns
import pandas as pd
from matplotlib.ticker import PercentFormatter
tips = sns.load_dataset('tips')
tips_weekend = tips.copy()
tips_weekend['day'] = tips_weekend['day'].apply(lambda x: x if x in ['Sat', 'Sun'] else 'other')
# fix a new order
tips_weekend['day'] = pd.Categorical(tips_weekend['day'], ['other', 'Sat', 'Sun'])
ax = sns.histplot(data=tips_weekend, x='smoker', hue='day', stat='count', multiple='fill',
                  palette=['none', 'turquoise', 'crimson'])
# remove the first label ('other') in the legend
ax.legend(handles=ax.legend_.legendHandles[1:], labels=['Sat', 'Sun'], title='day')
ax.yaxis.set_major_formatter(PercentFormatter(1))
# add percentages
for bar_group in ax.containers[:-1]:
    ax.bar_label(bar_group, label_type='center',
                 labels=[f'{bar.get_height() * 100:.1f} %' for bar in bar_group])

Related

Plot milepost along coastline in python

I want to plot mileposts on a map every 100 miles along the coastline. An example is shown in the figure below:
It is easy to plot coastlines with Cartopy, but how do I determine the locations of these mileposts and show them on a map? It would be better still if the coastline between the points were colored.
You can use Shapely's interpolate method to figure out the locations of the mileposts. The tricky part, however, is getting a single-part linestring for the coastline. I experimented with a few downloaded coastline shapefiles, but given their complexity and extent, getting a clean single-part linestring was not simple, so I digitized my own for this example (digitize.shp).
import geopandas as gpd
import numpy as np
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
# hand digitized, simplified version of the coastline
cl_gdf = gpd.read_file("digitize.shp")
# project out of the geographic CRS so the interpolation below works in meters, not degrees
cl_gdf = cl_gdf.to_crs(9311)
# get the shapely LineString from the GeoDataFrame
coastline = cl_gdf.iloc[0].geometry
interval = 160934  # approx 100 miles in meters
interval_arr = np.arange(0, coastline.length, interval)
# interpolate a point every 100 miles and append the line's end point
# (in Shapely 2.x the boundary is a MultiPoint, indexed via .geoms)
points = [coastline.interpolate(d) for d in interval_arr] + [coastline.boundary.geoms[1]]
# create a GeoDataFrame from our list of interpolated Shapely points
points_gdf = gpd.GeoDataFrame(geometry=points, crs=9311)
# transform crs to WGS84 for plotting
points_gdf = points_gdf.to_crs(4326)
# add Lat and Long cols from the point geometries for plotting
points_gdf['Lat'] = points_gdf['geometry'].apply(lambda geom: geom.y)
points_gdf['Long'] = points_gdf['geometry'].apply(lambda geom: geom.x)
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines()
ax.set_extent([points_gdf['Long'].min() - 1,
               points_gdf['Long'].max() + 1,
               points_gdf['Lat'].min() - 1,
               points_gdf['Lat'].max() + 1],
              crs=ccrs.PlateCarree())
# add our interpolated points to the Cartopy coastline
ax.scatter(points_gdf['Long'], points_gdf['Lat'], color='red', s=10)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
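As an aside, if your own coastline shapefile loads as a multipart geometry, shapely.ops.linemerge can sometimes stitch the pieces into the single-part linestring this approach needs. A minimal sketch with a hypothetical two-segment line; it only succeeds when the segments actually touch end to end:
from shapely.geometry import MultiLineString
from shapely.ops import linemerge
# hypothetical multipart coastline made of two touching segments
multi = MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]])
merged = linemerge(multi)
print(merged.geom_type)  # 'LineString' when the pieces can be joined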
EDIT:
Here is a replacement for "digitize.shp" to make this a fully working example. You can use GeoPandas' to_file() if you want to save cl_gdf to a shapefile.
from shapely.geometry import LineString
ls_coords = [(-84.35698841221803, 29.95854398344339),
(-84.05368623055452, 30.09279249008134),
(-83.96252983715839, 30.0613020996354),
(-83.7586709937452, 29.908822314318225),
(-83.44376708928581, 29.68673219222582),
(-83.40854757365547, 29.657727885236138),
(-83.16946921461201, 29.28191493609843),
(-82.7551219719023, 28.99684403311415),
(-82.66893774541866, 28.698514018363166),
(-82.68219685718537, 28.439961338912305),
(-82.79489930720241, 28.158205213869703),
(-82.63910474394356, 27.94937420354401),
(-82.54629096157659, 27.641099854967987),
(-82.63578996600191, 27.392491509342157),
(-82.26122005859231, 26.72290636512327),
(-81.52533935553987, 25.88095276793714),
(-80.85243943337927, 25.168275510476434),
(-80.7596256510123, 25.14175728694301),
(-80.65355275687861, 25.150044231797207),
(-80.59222936495757, 25.176562455330625),
(-80.49278602670725, 25.206395456805726),
(-80.65355275687864, 24.90143588617138),
(-80.53587813994908, 25.03236961486765),
(-80.23920551416893, 25.340643963443675),
(-80.27069590461487, 25.353903075210386),
(-80.37345402080688, 25.312468350939415),
(-80.104957007531, 25.963822216479077),
(-80.07180922811422, 26.91847826368225),
(-80.66846925761622, 28.60238545805451),
(-80.72150570468307, 28.86756769338872),
(-81.19883372828465, 29.663114399391368),
(-81.43749774008545, 30.392365546560455),
(-81.47727507538558, 31.095098470196124),
(-81.26512928711821, 31.406687596713834),
(-81.09276083415097, 31.844238285015287),
(-80.7612830399832, 32.175716079183054),
(-80.48947124876561, 32.48067564981741),
(-80.07180922811422, 32.61326676748452),
(-79.32929896917841, 33.07070612343603),
(-79.20996696327803, 33.22981546463656),
(-78.9779325073606, 33.61432970587117),
(-78.55364093082585, 33.8331050500219),
(-77.93046267779044, 33.919289276505516),
(-77.71831688952308, 34.28391485009006),
(-77.45976421007221, 34.469542414824005),
(-75.76259790393324, 35.20542311787645),
(-75.76259790393324, 36.21974516802982),
(-75.84215257453349, 36.58437074161438),
(-76.17031559075959, 36.939051981373886),
(-76.29959193048501, 36.96722759387815),
(-76.2946197635725, 37.12136476816616),
(-76.41726654741457, 37.16942904832049),
(-76.50179338492735, 37.23738199612488),
(-76.3377118768143, 37.5108511763133),
(-76.72885567393226, 37.79923685723926),
(-76.76200345334904, 37.878791527839525),
(-76.31119365328087, 37.68653440722222),
(-76.23163898268061, 37.89205063960623),
(-76.35428576652268, 37.951716642556434),
(-76.38411876799778, 37.95668880946896),
(-76.50676555183985, 38.02298436830251),
(-76.52665421948991, 38.05613214771928),
(-76.6028941121485, 38.10253903890277),
(-76.60952366803185, 38.12905726243619),
(-76.85978940262852, 38.16717720876549),
(-76.90785368278284, 38.19203804332807),
(-76.94265885117046, 38.2102693220073),
(-77.01724135485821, 38.313027438199306),
(-77.01724135485821, 38.31799960511182),
(-77.26916447842571, 38.3428604396744),
(-77.30396964681333, 38.38263777497453),
(-77.29568270195914, 38.52517322646668),
(-77.23270192106726, 38.60804267500862),
(-77.21612803135888, 38.63953306545456),
(-77.17966547400042, 38.613014841921135),
(-76.96254751882053, 38.4108133874788),
(-76.81338251144503, 38.27987965878253),
(-76.41063699153119, 38.311370049228465),
(-76.53825594228582, 38.710800791200626),
(-76.53494116434413, 39.2047027045106),
(-76.0808165863343, 39.532865720736694),
(-76.02446536132577, 39.370441601594486),
(-76.06755747456758, 39.254424373635764),
(-76.14048258928449, 39.101944588318595),
(-76.1835747025263, 38.956094358884776),
(-76.196833814293, 38.88316924416787),
(-76.18688948046797, 38.76383723826747),
(-76.17031559075959, 38.64782001030875),
(-76.094075698101, 38.51854367058332),
(-75.98468802602564, 38.33291610584937),
(-75.89850379954201, 38.24341710142407),
(-75.84546735247517, 38.190380654357234),
(-75.77917179364162, 38.10419642787361),
(-75.69961712304135, 38.011382645506636),
(-75.65652500979955, 37.95171664255644),
(-75.66978412156625, 37.91525408519799),
(-75.92502202307544, 37.54068417778841),
(-75.76591268187491, 37.481018174838205),
(-75.67972845539128, 37.46112950718814),
(-75.62337723038277, 37.48433295277989),
(-75.58359989508264, 37.59703540279693),
(-75.2189743214981, 38.051159980806766),
(-75.05655020235588, 38.37600821909118),
(-75.03666153470581, 38.44893333380809),
(-75.39465755240701, 39.1450367015604),
(-75.50404522448237, 39.39033026924455),
(-75.54713733772417, 39.536180498678355),
(-75.5438225597825, 39.60413344648275),
(-75.50901739139488, 39.57595783397849),
(-75.51896172521991, 39.48977360749487),
(-75.50073044654069, 39.45331105013641),
(-75.45100877741551, 39.41684849277796),
(-75.3996297193195, 39.38701549130286),
(-75.33333416048596, 39.34889554497357),
(-75.2753255465066, 39.31409037658595),
(-75.17588220825627, 39.27431304128582),
(-75.07146670309342, 39.23453570598569),
(-74.95710686410554, 39.19144359274388),
(-74.8957834721845, 39.169897536122974),
(-74.53778745448334, 39.270998263344154),
(-74.29249388679919, 39.46325538396146),
(-74.10023676618188, 39.89417651637956),
(-74.02731165146497, 40.046656301696736),
(-74.13338454559866, 40.32509764879766),
(-74.23945743973235, 40.4974661017649),
(-74.07371854264846, 40.550502548831744),
(-73.80853630731424, 40.56376166059845),
(-73.60302007493023, 40.550502548831744),
(-73.4439107337297, 40.61016855178194),
(-73.39750384254621, 40.70298233414891),
(-73.41739251019628, 40.88198034299951),
(-73.42070728813798, 40.935016790066356),
(-73.62953829846366, 40.8985542327079),
(-73.77207374995581, 40.83225867387435),
(-73.7588146381891, 40.931702012124674),
(-73.53672451609668, 41.031145350375006),
(-73.15552505280375, 41.1504773562754),
(-72.96658271012812, 41.17368080186715),
(-71.70033753640726, 41.35930836660109),
(-71.04401150395508, 41.50515859603491),
(-70.70590415390394, 41.65100882546872),
(-70.68601548625388, 41.670897493118794),
(-70.67938593037053, 41.51178815191826),
(-70.61309037153697, 41.53830637545168),
(-70.5070174774033, 41.76371127548577),
(-70.54016525682006, 41.86978416961945),
(-70.6528677068371, 42.00237528728656),
(-70.77882926862085, 42.247668854970726),
(-70.93130905393804, 42.41340775205461),
(-70.91142038628796, 42.5924057609052),
(-70.86501349510448, 42.65870131973875),
(-70.54679481270341, 43.32165690807428),
(-70.19542835088558, 43.63987559047534),
(-69.74461855081742, 43.81887359932593),
(-69.53910231843341, 43.885169158159485),
(-68.96233095658148, 44.289572067044155),
(-68.81648072714766, 44.44868140824468),
(-68.41207781826299, 44.48845874354482),
(-67.91486112701133, 44.42216318471126),
(-67.61653111226035, 44.52823607884495),
(-67.09942575335863, 44.70723408769554),
(-67.02650063864172, 44.760270534762384)]
ls = LineString(ls_coords)
cl_gdf = gpd.GeoDataFrame(geometry=[ls], crs=4326)

Two StatsModels modules have totally different 'end-runs'

I'm running StatsModels to estimate the parameters of a multiple regression model, using county-level data for 3085 counties. When I use statsmodels.formula.api and drop a few rows from the data, I get the desired results. All seems well enough.
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
%matplotlib inline
from statsmodels.compat import lzip
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
eg=pd.read_csv(r'C:/Users/user/anaconda3/une_edu_pipc_06.csv')
pd.options.display.precision = 3
plt.rc("figure", figsize=(16,8))
plt.rc("font", size=14)
sm_col = eg["lt_hsd_17"] + eg["hsd_17"]
eg["ut_hsd_17"] = sm_col
sm_col2 = eg["sm_col_17"] + eg["col_17"]
eg["bnd_hsd_17"] = sm_col2
eg["d_09"]= eg["Rate_09"]-eg["Rate_06"]
eg["d_10"]= eg["Rate_10"]-eg["Rate_06"]
inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
res = sm.ols(formula="Rate_18 ~ p_c_inc_18 + ut_hsd_17 + d_10 + inc_2",
             data=eg, missing='drop').fit()
print(res.summary())
(BTW, eg["p_c_inc_18"] is per-capita income, and inc_2 is p_c_inc_18 squared.)
But when I wish to use import statsmodels.api as sm as the module, with everything else staying pretty much the same, and run the following code after all the appropriate preliminaries,
inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
X = eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]]
y = eg["Rate_18"]
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())
then things fall apart, and the Python interpreter throws an error, as follows:
[......]
KeyError: "['inc_2'] not in index"
BTW, the only difference between the two 'runs' is that 15 rows are dropped during the first, successful, model run, while I don't yet know how to drop missing rows from the second model formulation. Could that difference be responsible for the second run failing? (I chose to omit large parts of the error message to reduce clutter.)
You need to assign inc_2 as a column in your DataFrame. Selecting eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]] only works for columns that actually exist in eg, whereas the formula API also picks up variables like inc_2 from the surrounding environment, which is why the first run succeeded.
inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
should be
eg["inc_2"] = eg["p_c_inc_18"]*eg["p_c_inc_18"]

Use Gensim or other python LDA packages to use trained LDA model from Mallet

I have an LDA model trained through Mallet in Java. Three files are generated by the Mallet LDA model, which allow me to run the model from files and infer the topic distribution of a new text.
Now I would like to implement a Python tool which is able to infer a topic distribution for a new text, based on the trained LDA model. I do not want to re-train the LDA model in Python. Therefore, I wonder if it is possible to load the trained Mallet LDA model into Gensim or any other Python LDA package. If so, how can I do it?
Thanks for any answers or comments.
In short, yes you can! What is nice about using Mallet is that once it has run you don't have to go through and relabel the topics. I'm doing something very similar, so I'll post my code below with a few helpful links. Once your model is trained, save the notebook widget state and you'll be free to run your model on new and different data sets with the same topic allocation. This code includes a test and a validation set. Make sure you've downloaded Mallet and Java, then try this:
# future import bridges python 2 and 3
from __future__ import print_function
# pandas works with data structures, data manipulation, and analysis, specifically for numerical tables
# and series like the csv we are using here
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
%matplotlib inline
# Gensim - unsupervised topic modeling, natural language processing, statistical machine learning
import gensim
import gensim.corpora as corpora
# convert a document to a list of tokens
from gensim.utils import simple_preprocess
# remove stopwords - words that are not telling: "it", "I", "the", "and", etc.
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel
# the Mallet wrapper (available in gensim < 4.0)
from gensim.models.wrappers import LdaMallet
# nltk - Natural Language Toolkit
# lemmatized - words in third person are changed to first person, and verbs in past and future tenses
# are changed into present tense
# stemmed - words are reduced to their root form
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# NumPy - multidimensional arrays, matrices, and high-level mathematical functions
import numpy as np
np.random.seed(2018)
import os
from pathlib import Path
import codecs
import re
from pprint import pprint
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
data = pd.read_csv('YourData.csv', encoding="ISO-8859-1")
data_text = data[['Preprocessed Document or your comments column title']]
data_text['index'] = data_text.index
documents = data_text
# Create functions to lemmatize, stem, and preprocess
# turn beautiful, beautifully, beautified into the stem beauti
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# parse docs into individual words, ignoring words that are less than 4 letters long
# and stopwords: him, her, them, for, there, etc., since "their" is not a topic.
# then append the tokens into a list
def preprocess(text):
    result = []
    newStopWords = ['yourStopWord1', 'yourStopWord2']
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in newStopWords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
# look at a random row, 4310, and see if things worked out
# note that the document created was already preprocessed
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
# let’s look at ten rows passed through the lemmatize stemming and preprocess
documents = documents.dropna(subset=['Preprocessed Document'])
processed_docs = documents['Preprocessed Document'].map(preprocess)
processed_docs[:10]
# create a dictionary of all the words in the csv by iterating through it;
# it records how many times each word appears in the training set
dictionary_valid = gensim.corpora.Dictionary(processed_docs[20000:])
count = 0
for k, v in dictionary_valid.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break
# same for the test set
dictionary_test = gensim.corpora.Dictionary(processed_docs[:20000])
count = 0
for k, v in dictionary_test.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break
# throw out words that are so frequent that they tell us little about the topic,
# as well as words that appear in fewer than 15 documents; then keep the 100,000 most frequent
dictionary_valid.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
dictionary_test.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# the words become numbers and are then counted for frequency
# consider a random row, 4310 - it has 8 words; the word indexed 2 shows up once
# preview the bag of words
bow_corpus_valid = [dictionary_valid.doc2bow(doc) for doc in processed_docs]
bow_corpus_valid[4310]
bow_corpus_test = [dictionary_test.doc2bow(doc) for doc in processed_docs]
bow_corpus_test[4310]
# same thing in more words
bow_doc_4310 = bow_corpus_test[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0],
                                                     dictionary_test[bow_doc_4310[i][0]],
                                                     bow_doc_4310[i][1]))
mallet_path = 'C:/mallet/mallet-2.0.8/bin/mallet.bat'
ldamallet_test = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus_test, num_topics=20, id2word=dictionary_test)
result = ldamallet_test.show_topics(num_topics=20, num_words=10, formatted=False)
for each in result:
    print(each)
ldamallet_valid = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus_valid, num_topics=20, id2word=dictionary_valid)
result = ldamallet_valid.show_topics(num_topics=20, num_words=10, formatted=False)
for each in result:
    print(each)
# Show Topics
for idx, topic in ldamallet_test.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
for idx, topic in ldamallet_valid.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
# check out the topics - 30 words for each of the 20 topics
ldamallet_valid.print_topics(20, 30)
ldamallet_test.print_topics(20, 30)
# Compute Coherence Score
coherence_model_ldamallet_valid = CoherenceModel(model=ldamallet_valid, texts=processed_docs, dictionary=dictionary_valid, coherence='c_v')
coherence_ldamallet_valid = coherence_model_ldamallet_valid.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_valid)
# Compute Coherence Score
coherence_model_ldamallet_test = CoherenceModel(model=ldamallet_test, texts=processed_docs, dictionary=dictionary_test, coherence='c_v')
coherence_ldamallet_test = coherence_model_ldamallet_test.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_test)
Look at section 16 here: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
This helped: https://rare-technologies.com/tutorial-on-mallet-in-python/
and this: https://radimrehurek.com/gensim/models/wrappers/ldamallet.html
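If the goal is to reuse the trained model rather than retrain it, note that the wrapper can be persisted with save()/load(), and gensim 3.x also ships gensim.models.wrappers.ldamallet.malletmodel2ldamodel(), which converts the Mallet wrapper into a native LdaModel so new texts can be scored without calling out to Mallet. A minimal sketch, assuming ldamallet_test, dictionary_test, and preprocess() from above:
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
# persist the wrapper for later sessions
ldamallet_test.save('ldamallet_test.model')
# convert to a native gensim LdaModel (an approximation, but Mallet-free at inference time)
lda = malletmodel2ldamodel(ldamallet_test)
# infer the topic distribution of a new text
new_bow = dictionary_test.doc2bow(preprocess('some new text to score'))
print(lda.get_document_topics(new_bow))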
I hope this helps and good luck :)

pandas series sort_index() not working with kind='mergesort'

I needed a stable index sort for DataFrames when I ran into this problem:
In cases where a DataFrame becomes a Series (when only a single column matches the selection), the kind argument raises an error. See this example:
import pandas as pd
df_a = pd.Series(range(10))
df_b = pd.Series(range(100, 110))
df = pd.concat([df_a, df_b])
df.sort_index(kind='mergesort')
with the following error:
----> 6 df.sort_index(kind='mergesort')
TypeError: sort_index() got an unexpected keyword argument 'kind'
With DataFrames (when more than one column is selected), mergesort works fine.
EDIT:
When selecting a single column from a DataFrame for example:
import pandas as pd
import numpy as np
df_a = pd.DataFrame(np.array(range(25)).reshape(5,5))
df_b = pd.DataFrame(np.array(range(100, 125)).reshape(5,5))
df = pd.concat([df_a, df_b])
the following returns an error:
df[0].sort_index(kind='mergesort')
...since the selection is cast to a pandas Series, and, as pointed out, the pandas.Series.sort_index documentation contains a bug.
However,
df[[0]].sort_index(kind='mergesort')
works alright, since its type continues to be a DataFrame.
pandas.Series.sort_index() has no kind parameter.
Here is the definition of this function for pandas 0.18.1 (file: ./pandas/core/series.py):
# line 1729
@Appender(generic._shared_docs['sort_index'] % _shared_doc_kwargs)
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
               sort_remaining=True):
    axis = self._get_axis_number(axis)
    index = self.index
    if level is not None:
        new_index, indexer = index.sortlevel(level, ascending=ascending,
                                             sort_remaining=sort_remaining)
    elif isinstance(index, MultiIndex):
        from pandas.core.groupby import _lexsort_indexer
        indexer = _lexsort_indexer(index.labels, orders=ascending)
        indexer = com._ensure_platform_int(indexer)
        new_index = index.take(indexer)
    else:
        new_index, indexer = index.sort_values(return_indexer=True,
                                               ascending=ascending)
    new_values = self._values.take(indexer)
    result = self._constructor(new_values, index=new_index)
    if inplace:
        self._update_inplace(result)
    else:
        return result.__finalize__(self)
file ./pandas/core/generic.py, line 39:
_shared_doc_kwargs = dict(axes='keywords for axes', klass='NDFrame',
                          axes_single_arg='int or labels for object',
                          args_transpose='axes to permute (int or label for'
                                         ' object)')
So most probably it's a bug in the pandas documentation...
Your df is a Series, not a DataFrame.
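For completeness, you can still get a stable index sort for a Series in these older pandas versions by sorting the index positions yourself. A minimal sketch using NumPy's stable mergesort:
import numpy as np
import pandas as pd
df_a = pd.Series(range(10))
df_b = pd.Series(range(100, 110))
s = pd.concat([df_a, df_b])
# np.argsort with kind='mergesort' is stable, so tied index labels keep their original order
order = np.argsort(s.index.values, kind='mergesort')
print(s.iloc[order])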

How to Repeat Table Column Headings over Page Breaks in PDF output from ReportLab

I'm using ReportLab to write tables in PDF documents and am very pleased with the results (despite not having a total grasp of flowables just yet).
However, I have not been able to figure out how to make a table that spans a page break repeat its column headings.
The code below creates a test.pdf in C:\Temp that has a heading row followed by 99 rows of data.
The heading row looks great on the first page, but I would like it to repeat at the top of the second and third pages.
I'm keen to hear of any approaches that have been used to accomplish this using the SimpleDocTemplate.
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles = getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column
column1Heading = Paragraph("<para align=center>COLUMN ONE HEADING</para>", styles['Normal'])
column2Heading = Paragraph("<para align=center>COLUMN TWO HEADING</para>", styles['Normal'])
row_array = [column1Heading, column2Heading]
tableHeading = [row_array]
tH = Table(tableHeading, [6 * cm, 6 * cm])  # These are the column widths for the headings on the table
tH.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR', (0, 0), (-1, -1), colors.black),
                       ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                       ('BOX', (0, 0), (-1, -1), 1, colors.black),
                       ('BOX', (0, 0), (0, -1), 1, colors.black)])
tblStyle.add('BACKGROUND', (0, 0), (-1, -1), colors.lightblue)
tH.setStyle(tblStyle)
elements.append(tH)
# Assemble rows of data for each column
for i in range(1, 100):
    column1Data = Paragraph("<para align=center>Row " + str(i) + " Column 1 Data</para>", styles['Normal'])
    column2Data = Paragraph("<para align=center>Row " + str(i) + " Column 2 Data</para>", styles['Normal'])
    row_array = [column1Data, column2Data]
    tableRow = [row_array]
    tR = Table(tableRow, [6 * cm, 6 * cm])
    tR.hAlign = 'LEFT'
    tR.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, -1), colors.white),
                            ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),
                            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                            ('BOX', (0, 0), (-1, -1), 1, colors.black),
                            ('BOX', (0, 0), (0, -1), 1, colors.black)]))
    elements.append(tR)
    del tR
    elements.append(Spacer(1, 0.3 * cm))
doc.build(elements)
From the documentation (yes, I know, but it's sometimes hard to locate this stuff in the manual):
The repeatRows argument specifies the number of leading rows that
should be repeated when the Table is asked to split itself.
So when you create the table, this is one of the arguments you can pass, and it will turn the first n rows into header rows that repeat. You'll find this part of the text on page 77, but the section relating to creating a Table starts on page 76.
http://www.reportlab.com/docs/reportlab-userguide.pdf
This is the code I developed, after following Gordon's advice to reconsider using repeatRows, and it works!
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles = getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column and start data list
column1Heading = "COLUMN ONE HEADING"
column2Heading = "COLUMN TWO HEADING"
# Assemble data for each column using a simple loop to append it into the data list
data = [[column1Heading, column2Heading]]
for i in range(1, 100):
    data.append([str(i), str(i)])
tableThatSplitsOverPages = Table(data, [6 * cm, 6 * cm], repeatRows=1)
tableThatSplitsOverPages.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR', (0, 0), (-1, -1), colors.black),
                       ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                       ('LINEBELOW', (0, 0), (-1, -1), 1, colors.black),
                       ('BOX', (0, 0), (-1, -1), 1, colors.black),
                       ('BOX', (0, 0), (0, -1), 1, colors.black)])
tblStyle.add('BACKGROUND', (0, 0), (1, 0), colors.lightblue)
tblStyle.add('BACKGROUND', (0, 1), (-1, -1), colors.white)
tableThatSplitsOverPages.setStyle(tblStyle)
elements.append(tableThatSplitsOverPages)
doc.build(elements)
Use repeatRows=1 when you create the Table...
from reportlab.platypus import Table
Table(data,repeatRows=1)
I always like to have something you can cut & paste into a .py file to run and test. So here it is...
import os
import pandas as pd
import numpy as np
import reportlab.platypus
import reportlab.lib.styles
from reportlab.lib import colors
from reportlab.lib.units import mm
from reportlab.lib.pagesizes import letter, landscape
reportoutputfilepath = os.path.join('.\\test.pdf')
pdf_file = reportlab.platypus.SimpleDocTemplate(
    reportoutputfilepath,
    pagesize=landscape(letter),
    rightMargin=10,
    leftMargin=10,
    topMargin=38,
    bottomMargin=23
)
ts_tables = [
    ('ALIGN', (4, 0), (-1, -1), 'RIGHT'),
    ('LINEBELOW', (0, 0), (-1, 0), 1, colors.purple),
    ('FONT', (0, 0), (-1, 0), 'Times-Bold'),
    ('LINEABOVE', (0, -1), (-1, -1), 1, colors.purple),
    ('FONT', (0, -1), (-1, -1), 'Times-Bold'),
    ('BACKGROUND', (1, 1), (-2, -2), colors.white),
    ('TEXTCOLOR', (0, 0), (1, -1), colors.black),
    ('FONTSIZE', (0, 0), (-1, -1), 8),
]
df = pd.DataFrame(np.random.randint(0, 1000, size=(1000, 4)), columns=list('ABCD'))
lista = [df.columns.values.astype(str).tolist()] + df.values.tolist()
# Here is where you put repeatRows=1
table = reportlab.platypus.Table(lista, colWidths=(20 * mm, 20 * mm, 20 * mm, 20 * mm), repeatRows=1)
table_style = reportlab.platypus.TableStyle(ts_tables)
table.setStyle(table_style)
elements = []
elements.append(table)
# Build the PDF
pdf_file.build(elements)
print(reportoutputfilepath)
Just pass repeatRows=1 when building the table:
t1 = Table(lista, colWidths=220, rowHeights=20, repeatRows=1)
I found this solution, which easily repeats the header of a table that spans two pages. Add this line to the CSS for your table:
-fs-table-paginate: paginate;
I also found a class for FPDF which seems powerful (I don't need it at the moment, so I haven't tested it):
http://interpid.eu/fpdf-table
