Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import OneHotEncoder | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.feature_extraction.text import _VectorizerMixin | |
| from sklearn.feature_selection._base import SelectorMixin | |
| from sklearn.pipeline import Pipeline | |
# Raw incident-category label -> canonical label used downstream.
_CATEGORY_ALIASES = {
    "Human Trafficking (A), Commercial Sex Acts": "Human Trafficking",
    "Human Trafficking (B), Involuntary Servitude": "Human Trafficking",
    "Human Trafficking, Commercial Sex Acts": "Human Trafficking",
    "Weapons Offence": "Weapons Offense",
    "Drug Violation": "Drug Offense",
    "Motor Vehicle Theft?": "Motor Vehicle Theft",
    "Suspicious Occ": "Suspicious",
    "Rape": "Sex Offense",
}


def merge_category(x):
    """Collapse variant incident-category labels into one canonical label.

    Args:
        x: A raw category value (normally a string).

    Returns:
        The canonical label when ``x`` is one of the known variants,
        otherwise ``x`` itself, unchanged.
    """
    try:
        return _CATEGORY_ALIASES.get(x, x)
    except TypeError:
        # Unhashable values cannot equal any string label; pass them through.
        return x
# Coarse group label -> the fine-grained categories folded into it.
_COARSE_GROUPS = {
    "Other": (
        "Gambling",
        "Homicide",
        "Human Trafficking",
        "Liquor Laws",
        "Other Miscellaneous",
        "Prostitution",
        "Case Closure",
        "Courtesy Report",
        "Fire Report",
        "Suicide",
    ),
    "Weapons Offense": (
        "Weapons Carrying Etc",
    ),
    "Other Offenses": (
        "Offences Against The Family And Children",
        "Sex Offense",
        "Disorderly Conduct",
    ),
    "Financial Offense": (
        "Embezzlement",
        "Forgery And Counterfeiting",
        "Fraud",
        "Lost Property",
        "Stolen Property",
    ),
    "Traffic and Vehicle Offense": (
        "Motor Vehicle Theft",
        "Recovered Vehicle",
        "Traffic Collision",
        "Traffic Violation Arrest",
        "Vehicle Impounded",
        "Vehicle Misplaced",
        "Civil Sidewalks",
    ),
    "Theft and Robbery": (
        "Burglary",
        "Larceny Theft",
        "Robbery",
    ),
    "Assault": (
        "Arson",
    ),
    "Malicious Mischief": (
        "Vandalism",
    ),
    "Suspicious": (
        "Miscellaneous Investigation",
    ),
}

# Inverted view of the table above: fine-grained category -> coarse group.
_COARSE_LOOKUP = {
    member: group
    for group, members in _COARSE_GROUPS.items()
    for member in members
}


def merge_category_2(x):
    """Fold a fine-grained incident category into its coarse group.

    Args:
        x: A category value (normally a string).

    Returns:
        The coarse group label when ``x`` is listed in the grouping table,
        otherwise ``x`` itself, unchanged.
    """
    try:
        return _COARSE_LOOKUP.get(x, x)
    except TypeError:
        # Unhashable values cannot equal any string label; pass them through.
        return x
def get_feature_out(estimator, feature_in):
    """Return the output feature names produced by one fitted estimator.

    Args:
        estimator: A fitted transformer, or a placeholder string such as
            ``'passthrough'``.
        feature_in: Names of the features fed into ``estimator``.

    Returns:
        A sequence of output feature names:

        * vectorizers: their vocabulary terms, each prefixed with ``vec_``;
        * other estimators exposing a feature-name method: whatever that
          method returns for ``feature_in``;
        * feature selectors without such a method: the entries of
          ``feature_in`` kept by ``get_support()``;
        * anything else: ``feature_in`` unchanged.
    """
    # scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
    # get_feature_names, so try the current API first and keep the old one
    # as a fallback for legacy installations.
    names_fn = getattr(estimator, 'get_feature_names_out', None)
    if names_fn is None:
        names_fn = getattr(estimator, 'get_feature_names', None)

    if names_fn is not None:
        if isinstance(estimator, _VectorizerMixin):
            # handling all vectorizers: their names do not depend on the
            # input column names
            return [f'vec_{f}' for f in names_fn()]
        return names_fn(feature_in)

    if isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]

    return feature_in
def get_ct_feature_names(ct):
    """Collect the output column names of a fitted ColumnTransformer.

    Handles plain estimators and Pipelines inside the ColumnTransformer.
    Columns kept by ``remainder='passthrough'`` are resolved through the
    input column names the ColumnTransformer recorded while fitting, so that
    case requires a fit on input that carries column names.

    Args:
        ct: A fitted ``ColumnTransformer``.

    Returns:
        A list of feature names, in the order of ``ct.transformers_``.
    """
    output_features = []
    for name, estimator, features in ct.transformers_:
        if name != 'remainder':
            if isinstance(estimator, str) and estimator == 'drop':
                # A dropped column group produces no output columns.
                continue
            if isinstance(estimator, Pipeline):
                # Thread the names through each step, since every step may
                # rename, expand or filter the features it receives.
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            # Prefer the public attribute; older scikit-learn releases only
            # had the private ``_feature_names_in``.
            names_in = getattr(ct, 'feature_names_in_', None)
            if names_in is None:
                names_in = ct._feature_names_in
            output_features.extend(np.asarray(names_in)[features])
    return output_features
def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder on old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 spells the option ``sparse_output``.
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only know the original ``sparse`` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def preprocessing_incident(incident_df):
    """Clean and encode a raw incident-report DataFrame.

    Steps:
        1. Drop the unused columns, then every row containing a null.
        2. Derive ``incident_month``, ``incident_year`` and ``incident_hour``
           from ``incident_datetime``.
        3. Merge ``incident_category`` labels with ``merge_category`` and
           then ``merge_category_2``.
        4. One-hot encode ``incident_day_of_week``, ``report_type_code`` and
           ``police_district``; the remaining kept columns go through a
           most-frequent imputer.
        5. Convert every column except ``incident_datetime`` and
           ``incident_category`` to numeric, parse ``incident_datetime``,
           and rename the ``police_district_Out of SF`` column.

    Args:
        incident_df: Raw incident data. It must contain every column listed
            in step 1 (``DataFrame.drop`` raises ``KeyError`` otherwise) as
            well as the columns used in steps 2-4. The frame is not
            modified, so the function can be called repeatedly on the same
            input.

    Returns:
        A new DataFrame with a fresh default index holding the encoded
        columns.
    """
    # step 1: dropping irrelevant columns and null values
    df = incident_df.drop(columns=[
        'incident_date', 'incident_time', 'incident_year', 'report_datetime',
        'row_id', 'incident_id', 'incident_number',
        'report_type_description', 'filed_online', 'incident_code',
        'incident_subcategory', 'incident_description', 'resolution',
        'cad_number', 'intersection', 'cnn', 'analysis_neighborhood',
        'supervisor_district', 'point',
        ':@computed_region_jwn9_ihcz', ':@computed_region_26cr_cadq',
        ':@computed_region_qgnn_b9vv', ':@computed_region_nqbw_i6c3',
        ':@computed_region_h4ep_8xdi', ':@computed_region_n4xg_c4py',
        ':@computed_region_jg9y_a9du',
    ]).dropna()

    # step 2: create new columns (parse the timestamp once, reuse it thrice)
    incident_when = pd.to_datetime(df["incident_datetime"])
    df = df.assign(
        incident_month=incident_when.dt.month,
        incident_year=incident_when.dt.year,
        incident_hour=incident_when.dt.hour,
    )

    # step 3: merging labels
    df['incident_category'] = (
        df['incident_category'].apply(merge_category).apply(merge_category_2)
    )

    # step 4: onehot encoding using column Transformer Settings
    t = [
        ('ohe-cat', _dense_one_hot_encoder(),
         ['incident_day_of_week', 'report_type_code', 'police_district']),
        # Nulls were already dropped in step 1, so this imputer has nothing
        # to fill; it only carries these columns into the output.
        ('do_nothing', SimpleImputer(strategy='most_frequent'),
         ['incident_datetime', 'incident_category', 'latitude', 'longitude',
          'incident_month', 'incident_year', 'incident_hour']),
    ]
    pre_processor = ColumnTransformer(transformers=t, remainder='drop')
    incident_df_processed = pre_processor.fit_transform(X=df)

    # Get column names
    columns = get_ct_feature_names(pre_processor)
    incident_df_processed = pd.DataFrame(incident_df_processed, columns=columns)

    # step 5: change column types and names
    numeric_columns = incident_df_processed.columns.drop(
        ['incident_datetime', 'incident_category'])
    incident_df_processed[numeric_columns] = (
        incident_df_processed[numeric_columns].apply(pd.to_numeric)
    )
    incident_df_processed['incident_datetime'] = pd.to_datetime(
        incident_df_processed['incident_datetime'])
    incident_df_processed.rename(
        columns={"police_district_Out of SF": "police_district_OutOfSF"},
        inplace=True)
    return incident_df_processed