| | """ |
| | To Do: |
| | - Refactor script to be more readable/smaller main function |
| | """ |
| | import json |
| | import pandas as pd |
| | import numpy as np |
| | from datetime import timedelta |
| |
|
| |
|
def read_pkl_data(dataset, data_path, path_type):
    """
    Load a pickled dataset from disk.
    --------
    :param dataset: name of the dataset to load (e.g. 'adm', 'presc')
    :param data_path: directory prefix where generated data lives
    :param path_type: 'data' selects the processed file; any other value
        selects the first-dates file
    :return: dataframe read from the pickle
    """
    print('Reading in ' + dataset)

    suffix = '_proc.pkl' if path_type == 'data' else '_first_dates.pkl'
    return pd.read_pickle(data_path + dataset + suffix)
| |
|
| |
|
def fill_eth_grp_data(df):
    """
    Fill nulls in the eth_grp column introduced in joining.

    A patient's known ethnicity is forward- then backward-filled within
    each SafeHavenID group so it propagates to all of their rows;
    patients with no recorded value at all become 'Unknown'.

    :param df: dataframe to update (modified in place and returned)
    :return: Filled dataframe
    """
    # GroupBy.ffill/bfill are index-preserving transforms. The previous
    # groupby(...).eth_grp.apply(lambda x: x.ffill().bfill()) pattern
    # returns a differently-indexed result on newer pandas versions and
    # can misalign (or NaN out) the assignment.
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].ffill()
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].bfill()
    df['eth_grp'] = df['eth_grp'].fillna('Unknown')

    return df
| |
|
| |
|
def fill_to_date_columns(df):
    """
    Fill nulls in the *_to_date counter columns introduced in joining.

    Running counters are forward-filled within each SafeHavenID group so
    a patient's total carries over rows with no new events; rows before
    a patient's first event become 0.

    :param df: dataframe to update (modified in place and returned)
    :return: Filled dataframe
    """
    to_date_cols = ['adm_to_date', 'copd_to_date', 'resp_to_date',
                    'presc_to_date', 'rescue_to_date', 'labs_to_date',
                    'anxiety_depression_to_date',
                    'anxiety_depression_presc_to_date']
    # GroupBy.ffill is an index-preserving transform. The previous
    # groupby(...)[cols].apply(lambda x: x.ffill().fillna(0)) pattern
    # changes result shape/index across pandas versions and can
    # misalign on assignment.
    df[to_date_cols] = df.groupby('SafeHavenID')[to_date_cols].ffill().fillna(0)

    return df
| |
|
| |
|
def fill_yearly_columns(df):
    """
    Fill nulls in the per-year rate columns introduced in joining.

    Missing yearly rates mean no recorded activity, so they are set to 0.

    :param df: dataframe to update (modified in place and returned)
    :return: Filled dataframe
    """
    zero_cols = ['adm_per_year', 'total_hosp_days', 'mean_los',
                 'copd_per_year', 'resp_per_year', 'comorb_per_year',
                 'salbutamol_per_year',
                 'saba_inhaler_per_year', 'laba_inhaler_per_year',
                 'lama_inhaler_per_year', 'sama_inhaler_per_year',
                 'ics_inhaler_per_year', 'laba_ics_inhaler_per_year',
                 'lama_laba_ics_inhaler_per_year', 'saba_sama_inhaler_per_year',
                 'mcs_inhaler_per_year', 'rescue_meds_per_year',
                 'presc_per_year', 'labs_per_year',
                 'anxiety_depression_per_year', 'anxiety_depression_presc_per_year']
    for col in zero_cols:
        df[col] = df[col].fillna(0)

    return df
| |
|
| |
|
def fill_days_since(df, typ):
    """
    Add a days_since_<typ> column: the gap between each row's end-of-year
    marker (eoy) and the most recent forward-filled <typ>_date.

    :param df: dataframe to update (one patient's rows when called via
        a groupby apply)
    :param typ: type of feature to fill ('copd', 'resp', 'rescue')
    :return: Filled dataframe
    """
    last_event_date = df[typ + '_date'].ffill()
    df['days_since_' + typ] = df.eoy - last_event_date

    return df
| |
|
| |
|
def process_first_dates(df):
    """
    Reduce the per-dataset first-date table to each patient's earliest
    appearance in the health board region and its source dataset.
    --------
    :param df: dataframe with SafeHavenID plus one first-date column per
        source dataset
    :return: dataframe with SafeHavenID, entry_dataset, first_entry
    """
    indexed = df.set_index('SafeHavenID')
    # Row-wise minimum gives the earliest date; idxmin names the column
    # it came from, whose middle token is the dataset name.
    earliest = indexed.min(axis=1)
    source = indexed.idxmin(axis=1).str.split('_').str[1]
    reduced = pd.DataFrame({'entry_dataset': source, 'first_entry': earliest})

    return reduced.reset_index()
| |
|
| |
|
| | def find_closest_simd(v): |
| | """ |
| | Find closest SIMD vigintile for each row 'v' |
| | -------- |
| | :param v: row of data from apply statement |
| | :param typ: type of simd column to add |
| | :return: simd value |
| | """ |
| | simd_years = [2009, 2012, 2016] |
| | bools = [v.eoy.year >= year for year in simd_years] |
| | if any(bools): |
| | simd_year = str(simd_years[np.where(bools)[0][-1]]) |
| | v['simd_quintile'] = v['simd_' + simd_year + '_quintile'] |
| | v['simd_decile'] = v['simd_' + simd_year + '_decile'] |
| | v['simd_vigintile'] = v['simd_' + simd_year + '_vigintile'] |
| | else: |
| | v['simd_quintile'] = np.nan |
| | v['simd_decile'] = np.nan |
| | v['simd_vigintile'] = np.nan |
| |
|
| | return v |
| |
|
| |
|
def main():
    """
    Build the merged modelling dataset.

    Reads the processed per-domain tables, joins them, fills
    join-introduced nulls, attaches per-patient first-entry dates and
    demographics, derives age and SIMD features, and pickles the merged
    result to <model_data_path>/merged_full.pkl.
    """
    # Repo-level config supplies the data directory; the relative path
    # assumes the script is run from its own directory.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    # Processed per-domain tables. Presumably all share an index keyed on
    # SafeHavenID (plus a time key) so the joins below align — TODO confirm.
    adm = read_pkl_data('adm', data_path, 'data')
    comorb = read_pkl_data('comorb', data_path, 'data')
    presc = read_pkl_data('presc', data_path, 'data')
    labs = read_pkl_data('labs', data_path, 'data')
    demo = read_pkl_data('demo', data_path, 'data')

    # Admissions anchor the table; comorbidities attach only to existing
    # rows (left join), while prescriptions and labs may add rows (outer).
    df = adm.join(
        comorb, how='left').join(
        presc, how='outer').join(
        labs, how='outer')
    df = df.reset_index()

    print('Filling data')
    df = fill_eth_grp_data(df)
    df = fill_to_date_columns(df)
    df = fill_yearly_columns(df)

    # Per-patient "days since last <event>" features.
    # NOTE(review): groupby().apply() may prepend a group-key index level
    # on newer pandas versions — confirm against the pinned pandas version.
    for typ in ['copd', 'resp', 'rescue', 'adm']:
        df = df.groupby('SafeHavenID').apply(fill_days_since, typ)

    # Combined feature: days since the most recent COPD or respiratory
    # event (row-wise minimum of the two gaps).
    ds_cols = ['days_since_copd', 'days_since_resp']
    df['days_since_copd_resp'] = df[ds_cols].min(axis=1)

    print('Adding first dates')
    adm_dates = read_pkl_data('adm', data_path, 'date')
    presc_dates = read_pkl_data('presc', data_path, 'date')
    labs_dates = read_pkl_data('labs', data_path, 'date')

    # One row per patient with their first date in each source dataset.
    first_dates = pd.merge(
        pd.merge(adm_dates, presc_dates, how="outer", on='SafeHavenID'),
        labs_dates, how="outer", on='SafeHavenID')

    # Persist the unreduced first-dates table for downstream use.
    first_dates.to_pickle(data_path + 'overall_first_dates.pkl')

    date_data = process_first_dates(first_dates)

    print('Merging data')
    # Inner merge: keeps only patients present in the first-dates table.
    df_merged = pd.merge(df, date_data, on='SafeHavenID', how='inner')

    # Approximate (rounded) years the patient has been in the region at
    # each end-of-year marker; 'Y' is numpy's mean-year timedelta unit.
    ggc_years = (df_merged.eoy - df_merged.first_entry) / np.timedelta64(1, 'Y')
    df_merged['ggc_years'] = round(ggc_years)

    # Attach demographics (default inner merge on SafeHavenID).
    df_merged = pd.merge(df_merged, demo, on='SafeHavenID')

    # Age in whole years at each end-of-year marker; 365.2425 is the
    # mean Gregorian year length in days.
    dt_diff = df_merged.eoy - pd.to_datetime(df_merged.obf_dob)
    df_merged['age'] = dt_diff // timedelta(days=365.2425)

    # Fold the per-release SIMD columns into generic simd_* features
    # chosen by each row's year.
    df_merged = df_merged.apply(find_closest_simd, axis=1)

    # Drop intermediate date columns and the per-release SIMD columns now
    # superseded by the generic simd_* features.
    cols2drop = ['copd_date', 'resp_date', 'adm_date', 'rescue_date',
                 'simd_2009_quintile', 'simd_2009_decile',
                 'simd_2009_vigintile', 'simd_2012_quintile',
                 'simd_2012_decile', 'simd_2012_vigintile',
                 'simd_2016_quintile', 'simd_2016_decile',
                 'simd_2016_vigintile', 'days_since_copd',
                 'days_since_resp']
    df_merged = df_merged.drop(cols2drop, axis=1)

    df_merged.to_pickle(data_path + 'merged_full.pkl')
| |
|
| |
|
# Guard the entry point so importing this module does not trigger the
# full pipeline (file reads/writes) as a side effect.
if __name__ == '__main__':
    main()
| |
|