File size: 12,061 Bytes
5d396e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
	
def _drop_coordinate_outliers(df, max_z=4):
    """
    Removes samples whose DEC_LAT or DEC_LONG is missing or is an
    extreme outlier (|z-score| > max_z), since such coordinates
    may be incorrect.

    Parameters:
    -----------
    df : pd.DataFrame
        dataframe of crash samples with DEC_LAT and DEC_LONG columns
    max_z : float
        largest absolute z-score that is still kept

    Returns: pd.DataFrame
    --------
        the filtered dataframe (a selection of rows of df)
    """
    # Missing coordinates would turn every z-score into NaN
    df = df.dropna(subset=['DEC_LAT','DEC_LONG'])
    # Latitude is filtered first; longitude z-scores are then
    # computed on the remaining rows only
    for col in ('DEC_LAT','DEC_LONG'):
        if len(df) < 2:
            # z-scores are undefined for fewer than two samples
            break
        with np.errstate(divide='ignore',invalid='ignore'):
            z = np.abs(np.asarray(stats.zscore(df[col]),dtype=float))
        # A constant column has NaN z-scores; 'not greater than'
        # keeps those rows, whereas 'z<=max_z' would drop them all
        df = df[~(z > max_z)]
    return df

def plot_map(df,city=None,county=None,animate=True,color_dots=True,animate_by='year',show_fig=True,return_fig=False):
    """
    Displays a plotly.express.scatter_mapbox interactive map
    of crashes in a municipality or county if specified, or
    otherwise statewide.  Can be animated over time or static.
    
    Parameters:
    -----------
    df : pd.DataFrame
        dataframe of crash samples.  The columns read here are
        CRN, DEC_LAT, DEC_LONG, CRASH_YEAR, CRASH_MONTH,
        BICYCLE_SUSP_SERIOUS_INJ_COUNT, BICYCLE_DEATH_COUNT and,
        when filtering, MUNICIPALITY or COUNTY.
        The input is copied and never modified.
    city or county : tuple or None
        if provided, must be a tuple (code,name)
        - code : str
            the code corresponding to the desired municipality/county
            (see the data dictionary)
        - name : str
            the name you want to use for the municipality/county
            in plot title
        * At most one of these can be not None!
    animate : bool
        if animate==True, then the map will animate using
        the frequency provided in animate_by
    color_dots : bool
        if color_dots==True, then dots will be color-coded by
        'serious injury or death' status.
        WARNING: if color_dots and animate, then all frames
        will be missing samples in 'serious injury or death'
        classes which aren't present in first frame - due to
        bug in plotly animation_frame implementation.
        Recommend only using both when geographic
        area is statewide or at least has all values of
        'serious injury or death' in first frame
    animate_by : str
        the desired animation frequency, must be
        either 'year' or 'month' (only checked when animate==True)
    show_fig : bool
        whether to display figure using fig.show()
    return_fig : bool
        whether to return the figure object

    Returns: Either figure or None
    --------
        the figure if return_fig==True, otherwise None

    Raises:
    -------
    AssertionError
        if both city and county are provided
    ValueError
        if animate==True and animate_by is not 'year' or 'month',
        or if no sample with valid coordinates is left to plot
    """
    assert (city is None) or (county is None), 'A city and county cannot both be provided.'
    status_col = 'Serious cyclist injury or death'
    # Copy df and create new column for color coding event type.
    # 'death' is assigned last so that it overrides 'serious injury'.
    df = df.copy()
    df.loc[df.BICYCLE_SUSP_SERIOUS_INJ_COUNT>0,status_col]='serious injury'
    df.loc[df.BICYCLE_DEATH_COUNT>0,status_col]='death'
    df[status_col]=df[status_col].fillna('neither')
    
    # Set animation parameters
    if animate:
        if animate_by == 'year':
            animation_frame = 'CRASH_YEAR'
            title_animate = ' by year'
        elif animate_by == 'month':
            df['DATE'] = pd.to_datetime((df['CRASH_MONTH'].astype('str')\
                                         +'-'+df['CRASH_YEAR'].astype('str')),
                                       format = "%m-%Y")
            # Sort chronologically so that the frames appear in order,
            # then label each frame as 'YYYY-MM'
            df=df.sort_values(by='DATE')
            df['DATE']=df['DATE'].dt.strftime('%Y-%m')
            animation_frame = 'DATE'
            title_animate = ' by month'
        else:
            raise ValueError("animate_by must be 'year' or 'month'")
    else:
        animation_frame = None
        title_animate = ''
    
    color = status_col if color_dots else None
    
    # Adjustments for when city or county are provided
    if city is not None:
        df = _drop_coordinate_outliers(df[df.MUNICIPALITY==city[0]])
        title_place = city[1]+', PA'
    elif county is not None:
        df = _drop_coordinate_outliers(df[df.COUNTY==county[0]])
        title_place = county[1]+' county, PA'
    else:
        title_place = 'PA'
    
    # Without any plottable sample the zoom, center and width below
    # would all be NaN, so fail early with a clear message
    if not (df.DEC_LAT.notna() & df.DEC_LONG.notna()).any():
        raise ValueError(f'No crash samples with valid coordinates to plot in {title_place}.')
    
    # Compute default zoom level and center based on lat,lon ranges.
    max_lat, min_lat = df.DEC_LAT.max(), df.DEC_LAT.min()
    max_lon, min_lon = df.DEC_LONG.max(), df.DEC_LONG.min()
    lat_span = max_lat-min_lat
    lon_span = max_lon-min_lon
    
    # 2^(zoom) = 360/(longitude width of 1 tile)
    # If all samples share one location the span is zero and the
    # formula is undefined, so use an arbitrary fixed zoom instead.
    fallback_zoom = 12
    extent = max(lon_span,lat_span)
    zoom = np.log2(360/extent) if extent > 0 else fallback_zoom
    
    lat_center = (max_lat+min_lat)/2
    lon_center = (max_lon+min_lon)/2
    
    # Adjust width so that aspect ratio matches shape of the plotted
    # region; use a neutral ratio when either span is zero
    width_mult = lon_span/lat_span if (lat_span > 0 and lon_span > 0) else 1
    # Plot mapbox
    fig = px.scatter_mapbox(df, lat='DEC_LAT',lon='DEC_LONG',
                            color=color,
                            color_discrete_map={'neither':'royalblue','serious injury':'orange','death':'crimson'},
                            mapbox_style='open-street-map',
                            animation_frame = animation_frame,
                            animation_group='CRN',
                            hover_data = {'DEC_LAT':False,'DEC_LONG':False,
                                         'CRASH_YEAR':True,'CRASH_MONTH':True,
                                         'Serious cyclist injury or death':True},
                            width = width_mult*500,height=700,zoom=zoom,
                            center={'lat':lat_center,'lon':lon_center},
                            title=f'Crashes involving bicycles{title_animate}<br> in {title_place}, 2002-2021')
    fig.update_layout(legend=dict(orientation='h',xanchor='right',yanchor='bottom',x=1,y=-0.12),
                     legend_title_side='top')
    if show_fig:
        fig.show()
    if return_fig:
        return fig
	
def feat_perc(feat, df, col_name = 'percentage', feat_name = None):
    """
    Builds a single-column dataframe holding the relative
    frequency of each value in df[feat].

    Parameters:
    -----------
    feat : str
        the column of df whose values are counted
    df : pd.DataFrame
        the input dataframe
    col_name : str
        name of the single column of the output dataframe
    feat_name : str or None
        index name of the output dataframe.  If None (or
        empty), feat is used as the index name.

    Returns: pd.DataFrame
    --------
        one row per value of df[feat], sorted by value, holding
        that value's share of the counted samples as a fraction
    """
    # Normalized counts, ordered by the feature value itself
    shares = df[feat].value_counts(normalize=True).sort_index()
    index_label = feat_name if feat_name else feat
    perc = pd.DataFrame({col_name:shares})
    perc.index.name = index_label
    return perc

def feat_perc_bar(feat,df,feat_name=None,cohort_name=None,show_fig=True,return_fig=False,sort=False):
    """
    Makes a stacked horizontal bar plot comparing two series:
        - distribution of feature among all cyclists
        - distribution of feature among cyclists with serious injury or fatality

    Parameters:
    -----------
    feat : str
        The column name of the desired feature
    df : pd.DataFrame
        The input dataframe.  Must contain the column feat and
        a SERIOUS_OR_FATALITY column; rows with
        SERIOUS_OR_FATALITY==1 form the second cohort.
    feat_name : str or None
        The feature name to display in the figure title
        and legend title.  If None, will use feat
    cohort_name : str or None
        qualifier to use in front of 'cyclist'
        in the title, if provided, e.g. 'rural'
    show_fig : bool
        whether to finish with fig.show()
    return_fig : bool
        whether to return the fig object
    sort : bool
        whether to sort bars. If False, will use default sorting
        by category name or feature value.  If True, will resort
        in descending order by percentage among all cyclists

    Returns: figure or None
    --------
        the figure if return_fig==True, otherwise None
    """
    if feat_name is None:
        feat_name=feat
    # Optional qualifier for the title, separated by exactly one space
    qualifier = f'{cohort_name.strip()} ' if cohort_name else ''
    df_inj = df.query('SERIOUS_OR_FATALITY==1')
    table = feat_perc(feat,df)
    table.loc[:,'cohort']='all'
    # The ordering is taken from the 'all' cohort before the cohorts are
    # concatenated, so both bars stack their segments in the same order
    ordering = list(table['percentage'].sort_values(ascending=False).index) if sort else None
    table_inj = feat_perc(feat,df_inj)
    table_inj.loc[:,'cohort']='seriously injured or killed'
    table = pd.concat([table,table_inj],axis=0).reset_index()
    category_orders = {'cohort':['all','seriously injured or killed']}
    if sort:
        category_orders[feat]=ordering
    # labels only changes the displayed name of the feat column;
    # category_orders and color still refer to the column by feat
    fig = px.bar(table,y='cohort',x='percentage',color=feat,
                 barmode='stack',text_auto='.1%',
                category_orders=category_orders,
                labels={feat:feat_name},
                title=f'Distributions of {feat_name} values within {qualifier}cyclist cohorts')
    fig.update_yaxes(tickangle=-90)
    fig.update_xaxes(tickformat=".0%")
    if show_fig:
        fig.show()
    if return_fig:
        return fig
    
# def feat_perc_comp(feat,df,feat_name=None,cohort_name = None,merge_inj_death=True):
#     """
#     Returns a styled dataframe (Styler object)
#     whose underlying dataframe has three columns
#     containing value counts of 'feat' among:
#     - all cyclists involved in crashes
#     - cyclists suffering serious injury or fatality
#     each formatted as percentages of the series sum.
#     Styled with bars comparing percentages

#     Parameters:
#     -----------
#     feat : str
#         The column name of the desired feature
#     df : pd.DataFrame
#         The input dataframe
#     feat_name : str or None
#         The feature name to use in the output dataframe
#         index name.  If None, will use feat
#     cohort_name : str or None
#         qualifier to use in front of 'cyclists'
#         in titles, if provided, e.g. 'rural cyclists'
#     merge_inj_death : bool
#         whether to merge seriously injured and killed cohorts
#     Returns:
#     --------
#     perc_comp : pd.Styler object
#     """
#     # Need qualifier for titles if restricting cyclist cohort
#     qualifier = cohort_name if cohort_name is not None else ''
    
#     # Two columns or three, depending on merge_inj_death
#     if merge_inj_death:
#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
#                          col_name='all cyclists',)\
#                 .merge(feat_perc(feat,feat_name=feat_name,
#                                  df=df.query('SERIOUS_OR_FATALITY==1'),
#                                  col_name=qualifier+'cyclists with serious injury or fatality'),
#                       on=feat,how='left')
#         perc_comp = perc_comp[perc_comp.max(axis=1)>=0.005]
#     else:
#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
#                          col_name='all cyclists')\
#                 .merge(feat_perc(feat,feat_name=feat_name,
#                                  df=df.query('INJ_SEVERITY=="susp_serious_injury"'),
#                                  col_name=qualifier+'cyclists with serious injury'),
#                       on=feat,how='left')\
#                 .merge(feat_perc(feat,feat_name=feat_name,
#                                  df=df.query('INJ_SEVERITY=="killed"'),
#                                  col_name=qualifier+'cyclists with fatality'),
#                       on=feat,how='left')
    
#     # If feature is not ordinal, sort rows descending by crash counts
#     if feat not in ['AGE_BINS','SPEED_LIMIT','DAY_OF_WEEK','HOUR_OF_DAY']:
#         perc_comp=perc_comp.sort_values(by='all cyclists',ascending=False)
    
#     # Relabel day numbers with strings
#     if feat == 'DAY_OF_WEEK':
#         perc_comp.index=['Sun','Mon','Tues','Wed','Thurs','Fri','Sat']
#         perc_comp.index.name='DAY_OF_WEEK'
#     perc_comp=perc_comp.fillna(0)
#     table_columns = list(perc_comp.columns)
    
#     # Define format for displaying floats
#     format_dict={col:'{:.2%}' for col in perc_comp.columns}

        
#     # Define table styles
#     styles = [dict(selector="caption",
#                    props=[("text-align", "center"),
#                           ("font-size", "100%"),
#                           ("color", 'black'),
#                           ("text-decoration","underline"),
#                           ("font-weight","bold")])]
    
#     # Return formatted dataframe
#     if feat_name is None:
#         feat_name=feat
#     caption = f'Breakdown of {feat_name} among cyclist groups'
#     return perc_comp.reset_index().style.set_table_attributes("style='display:inline'")\
#                                     .format(format_dict).bar(color='powderblue',
#                                     subset=table_columns).hide().set_caption(caption)\
#                                     .set_table_styles(styles)