# BikeSaferPA / lib / vis_data.py
# Hugging Face file-viewer residue (not part of the module):
# etweedy's picture | Upload 22 files | 5d396e9 | raw | history blame | 12.1 kB
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
def _drop_coordinate_outliers(df, max_abs_z=4):
    """
    Drop rows whose latitude or longitude is an extreme outlier.

    Latitude is filtered first, then longitude is filtered on what remains.
    Rows whose z-score is undefined (NaN) are kept rather than discarded, so
    a subset with a single crash, with identical coordinates, or with a
    missing coordinate is not wiped out entirely.
    """
    for coord in ('DEC_LAT', 'DEC_LONG'):
        # A z-score is meaningless for fewer than two samples
        if len(df) < 2:
            break
        z_scores = np.abs(
            stats.zscore(df[coord].to_numpy(dtype=float), nan_policy='omit')
        )
        # 'not greater than' keeps NaN scores; '<=' would drop them
        df = df[~(z_scores > max_abs_z)]
    return df


def _map_view(df, fallback_zoom=12):
    """
    Compute (zoom, center, width_mult) for the map from the lat/lon ranges.

    Falls back to fallback_zoom, a square aspect ratio, and/or center=None
    when the ranges are degenerate (empty data, a single point, or all
    points sharing a latitude or longitude).
    """
    max_lat, min_lat = df.DEC_LAT.max(), df.DEC_LAT.min()
    max_lon, min_lon = df.DEC_LONG.max(), df.DEC_LONG.min()
    lat_span = max_lat - min_lat
    lon_span = max_lon - min_lon

    # 2^(zoom) = 360/(longitude width of 1 tile)
    extent = max(lon_span, lat_span)
    if np.isfinite(extent) and extent > 0:
        zoom = np.log2(360 / extent)
    else:
        zoom = fallback_zoom

    # Aspect ratio follows the shape of the plotted region when well defined
    if np.isfinite(lat_span) and np.isfinite(lon_span) and lat_span > 0 and lon_span > 0:
        width_mult = lon_span / lat_span
    else:
        width_mult = 1

    lat_center = (max_lat + min_lat) / 2
    lon_center = (max_lon + min_lon) / 2
    if np.isfinite(lat_center) and np.isfinite(lon_center):
        center = {'lat': lat_center, 'lon': lon_center}
    else:
        # No usable coordinates: let plotly choose its default center
        center = None
    return zoom, center, width_mult


def plot_map(df, city=None, county=None, animate=True, color_dots=True,
             animate_by='year', show_fig=True, return_fig=False):
    """
    Displays a plotly.express.scatter_mapbox interactive map
    of crashes in a municipality or county if specified, or
    otherwise over the whole dataframe. Can be animated over
    time or static.

    Parameters:
    -----------
    df : pd.DataFrame
        dataframe of crash samples. The columns read here are
        CRN, DEC_LAT, DEC_LONG, CRASH_YEAR, CRASH_MONTH,
        BICYCLE_SUSP_SERIOUS_INJ_COUNT, BICYCLE_DEATH_COUNT, and
        MUNICIPALITY / COUNTY when city / county is given.
        The input is copied, not modified.
    city or county : tuple or None
        if provided, must be a tuple (code,name)
        - code : str
            the code corresponding to the desired municipality/county
            (see the data dictionary)
        - name : str
            the name you want to use for the municipality/county
            in plot title
        * At most one of these can be not None!
        When either is given, samples whose latitude or longitude
        lies more than 4 standard deviations from the mean of the
        selected area are dropped (their lat,lon may be incorrect).
    animate : bool
        if animate==True, then the map will animate using
        the frequency provided in animate_by
    color_dots : bool
        if color_dots==True, then dots will be color-coded by
        'serious injury or death' status.
        WARNING: if color_dots and animate, then all frames
        will be missing samples in 'serious injury or death'
        classes which aren't present in first frame - due to
        bug in plotly animation_frame implementation.
        Recommend only using both when geographic
        area is statewide or at least has all values of
        'serious injury or death' in first frame
    animate_by : str
        the desired animation frequency, must be
        either 'year' or 'month'. Only checked when animate is True.
    show_fig : bool
        whether to display figure using fig.show()
    return_fig : bool
        whether to return the figure object

    Returns: Either figure or None
    --------

    Raises:
    -------
    AssertionError
        if both city and county are provided
    ValueError
        if animate is True and animate_by is not 'year' or 'month'
    """
    assert (city is None) or (county is None), 'A city and county cannot both be provided.'

    # Copy df and create new column for color coding event type.
    # A death takes precedence over a serious injury.
    df = df.copy()
    severity_col = 'Serious cyclist injury or death'
    df[severity_col] = np.select(
        [df.BICYCLE_DEATH_COUNT > 0, df.BICYCLE_SUSP_SERIOUS_INJ_COUNT > 0],
        ['death', 'serious injury'],
        default='neither',
    )

    # Set animation parameters
    if animate:
        if animate_by == 'year':
            animation_frame = 'CRASH_YEAR'
            title_animate = ' by year'
        elif animate_by == 'month':
            df['DATE'] = pd.to_datetime(
                df['CRASH_MONTH'].astype('str') + '-' + df['CRASH_YEAR'].astype('str'),
                format="%m-%Y",
            )
            # Sort chronologically so the animation frames play in order,
            # then label each frame as 'YYYY-MM'
            df = df.sort_values(by='DATE')
            df['DATE'] = df['DATE'].dt.strftime('%Y-%m')
            animation_frame = 'DATE'
            title_animate = ' by month'
        else:
            raise ValueError("animate_by must be 'year' or 'month'")
    else:
        animation_frame = None
        title_animate = ''

    color = severity_col if color_dots else None

    # Adjustments for when city or county are provided
    if city is not None:
        df = _drop_coordinate_outliers(df[df.MUNICIPALITY == city[0]])
        title_place = city[1] + ', PA'
    elif county is not None:
        df = _drop_coordinate_outliers(df[df.COUNTY == county[0]])
        title_place = county[1] + ' county, PA'
    else:
        title_place = 'PA'

    # Default zoom, center and aspect ratio based on lat,lon ranges
    zoom, center, width_mult = _map_view(df)

    # Plot mapbox
    fig = px.scatter_mapbox(
        df, lat='DEC_LAT', lon='DEC_LONG',
        color=color,
        color_discrete_map={'neither': 'royalblue', 'serious injury': 'orange', 'death': 'crimson'},
        mapbox_style='open-street-map',
        animation_frame=animation_frame,
        animation_group='CRN',
        hover_data={'DEC_LAT': False, 'DEC_LONG': False,
                    'CRASH_YEAR': True, 'CRASH_MONTH': True,
                    'Serious cyclist injury or death': True},
        width=width_mult * 500, height=700, zoom=zoom,
        center=center,
        title=f'Crashes involving bicycles{title_animate}<br> in {title_place}, 2002-2021',
    )
    fig.update_layout(
        legend=dict(orientation='h', xanchor='right', yanchor='bottom', x=1, y=-0.12),
        legend_title_side='top',
    )

    if show_fig:
        fig.show()
    if return_fig:
        return fig
def feat_perc(feat, df, col_name = 'percentage', feat_name = None):
    """
    Builds a one-column dataframe holding the relative
    frequency of each value found in df[feat].

    Parameters:
    -----------
    feat : str
        the column of df to summarize
    df : pd.DataFrame
        the input dataframe
    col_name : str
        name given to the single output column
    feat_name : str or None
        name given to the output index; when missing
        (or empty), feat is used instead

    Returns:
    --------
    pd.DataFrame indexed by the values of df[feat] (sorted by
    index), whose only column holds each value's share of the
    counted samples as a fraction
    """
    shares = df[feat].value_counts(normalize=True)
    result = shares.sort_index().to_frame(name=col_name)
    result.index.name = feat_name or feat
    return result
def feat_perc_bar(feat, df, feat_name=None, cohort_name=None, show_fig=True,
                  return_fig=False, sort=False):
    """
    Makes stacked barplot of two series:
    - distribution of feature among all cyclists in df
    - distribution of feature among cyclists with serious injury or fatality
      (the rows of df with SERIOUS_OR_FATALITY==1)

    Parameters:
    -----------
    feat : str
        The column name of the desired feature
    df : pd.DataFrame
        The input dataframe; must contain the columns
        feat and SERIOUS_OR_FATALITY
    feat_name : str or None
        The feature name to display in the figure title
        and legend. If None, will use feat
    cohort_name : str or None
        qualifier to use in front of 'cyclist' in the
        title, if provided, e.g. 'rural'
    show_fig : bool
        whether to finish with fig.show()
    return_fig : bool
        whether to return the fig object
    sort : bool
        whether to sort bars. If False, will use default sorting
        by category name or feature value. If True, will resort
        in descending order by percentage among all cyclists

    Returns: figure or None
    --------
    """
    if feat_name is None:
        feat_name = feat
    qualifier = f'{cohort_name} ' if cohort_name else ''

    # Distribution among everyone in df
    overall = feat_perc(feat, df).assign(cohort='all')
    # Bar segment order is decided by the overall distribution only
    ordering = list(overall['percentage'].sort_values(ascending=False).index) if sort else None

    # Distribution among the seriously injured or killed
    injured = feat_perc(feat, df.query('SERIOUS_OR_FATALITY==1'))\
        .assign(cohort='seriously injured or killed')

    # Long-format table: one row per (feature value, cohort)
    table = pd.concat([overall, injured], axis=0).reset_index()

    category_orders = {'cohort': ['all', 'seriously injured or killed']}
    if sort:
        category_orders[feat] = ordering

    fig = px.bar(table, y='cohort', x='percentage', color=feat,
                 barmode='stack', text_auto='.1%',
                 category_orders=category_orders,
                 # Show the friendly feature name instead of the raw column name
                 labels={feat: feat_name},
                 title=f'Distributions of {feat_name} values within {qualifier}cyclist cohorts')
    fig.update_yaxes(tickangle=-90)
    fig.update_xaxes(tickformat=".0%")

    if show_fig:
        fig.show()
    if return_fig:
        return fig
# def feat_perc_comp(feat,df,feat_name=None,cohort_name = None,merge_inj_death=True):
# """
# Returns a styled dataframe (Styler object)
# whose underlying dataframe has three columns
# containing value counts of 'feat' among:
# - all cyclists involved in crashes
# - cyclists suffering serious injury or fatality
# each formatted as percentages of the series sum.
# Styled with bars comparing percentages
# Parameters:
# -----------
# feat : str
# The column name of the desired feature
# df : pd.DataFrame
# The input dataframe
# feat_name : str or None
# The feature name to use in the output dataframe
# index name. If None, will use feat
# cohort_name : str or None
# qualifier to use in front of 'cyclists'
# in titles, if provided, e.g. 'rural cyclists'
# merge_inj_death : bool
# whether to merge seriously injured and killed cohorts
# Returns:
# --------
# perc_comp : pd.Styler object
# """
# # Need qualifier for titles if restricting cyclist cohort
# qualifier = cohort_name if cohort_name is not None else ''
# # Two columns or three, depending on merge_inj_death
# if merge_inj_death:
# perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
# col_name='all cyclists',)\
# .merge(feat_perc(feat,feat_name=feat_name,
# df=df.query('SERIOUS_OR_FATALITY==1'),
# col_name=qualifier+'cyclists with serious injury or fatality'),
# on=feat,how='left')
# perc_comp = perc_comp[perc_comp.max(axis=1)>=0.005]
# else:
# perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
# col_name='all cyclists')\
# .merge(feat_perc(feat,feat_name=feat_name,
# df=df.query('INJ_SEVERITY=="susp_serious_injury"'),
# col_name=qualifier+'cyclists with serious injury'),
# on=feat,how='left')\
# .merge(feat_perc(feat,feat_name=feat_name,
# df=df.query('INJ_SEVERITY=="killed"'),
# col_name=qualifier+'cyclists with fatality'),
# on=feat,how='left')
# # If feature is not ordinal, sort rows descending by crash counts
# if feat not in ['AGE_BINS','SPEED_LIMIT','DAY_OF_WEEK','HOUR_OF_DAY']:
# perc_comp=perc_comp.sort_values(by='all cyclists',ascending=False)
# # Relabel day numbers with strings
# if feat == 'DAY_OF_WEEK':
# perc_comp.index=['Sun','Mon','Tues','Wed','Thurs','Fri','Sat']
# perc_comp.index.name='DAY_OF_WEEK'
# perc_comp=perc_comp.fillna(0)
# table_columns = list(perc_comp.columns)
# # Define format for displaying floats
# format_dict={col:'{:.2%}' for col in perc_comp.columns}
# # Define table styles
# styles = [dict(selector="caption",
# props=[("text-align", "center"),
# ("font-size", "100%"),
# ("color", 'black'),
# ("text-decoration","underline"),
# ("font-weight","bold")])]
# # Return formatted dataframe
# if feat_name is None:
# feat_name=feat
# caption = f'Breakdown of {feat_name} among cyclist groups'
# return perc_comp.reset_index().style.set_table_attributes("style='display:inline'")\
# .format(format_dict).bar(color='powderblue',
# subset=table_columns).hide().set_caption(caption)\
# .set_table_styles(styles)