# Model E: Unsupervised PCA + clustering risk stratification
# (upstream commit 53a6def, author IamGrooooot)
"""
To Do:
- Refactor script to be more readable/smaller main function
"""
import json
import pandas as pd
import numpy as np
from datetime import timedelta
def read_pkl_data(dataset, data_path, path_type):
    """
    Read in pickled dataset
    --------
    :param dataset: type of dataset to read in
    :param data_path: path to generated data
    :param path_type: type of path to read from ('data' selects the
        processed frame, anything else selects the first-dates frame)
    :return: dataframe
    """
    print('Reading in ' + dataset)
    # Two pickle flavours exist per dataset; pick the suffix accordingly.
    suffix = '_proc.pkl' if path_type == 'data' else '_first_dates.pkl'
    return pd.read_pickle(data_path + dataset + suffix)
def fill_eth_grp_data(df):
    """
    Fill nulls in eth_grp column introduced in joining.

    Within each patient, propagate any known ethnicity value forward and
    backward; patients with no recorded value at all become 'Unknown'.
    :param df: dataframe to update (modified in place)
    :return: Filled dataframe
    """
    # transform keeps the original row index, avoiding the index-alignment
    # pitfalls of SeriesGroupBy.apply on newer pandas versions (which can
    # return a group-keyed MultiIndex and misalign the assignment).
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda s: s.ffill().bfill())
    df['eth_grp'] = df['eth_grp'].fillna('Unknown')
    return df
def fill_to_date_columns(df):
    """
    Fill nulls in to_date columns introduced in joining.

    Within each patient the running "to date" counters are carried forward;
    rows before the first observation (still NaN after the ffill) mean no
    activity yet, so they become 0.
    :param df: dataframe to update (modified in place)
    :return: Filled dataframe
    """
    to_date_cols = ['adm_to_date', 'copd_to_date', 'resp_to_date',
                    'presc_to_date', 'rescue_to_date', 'labs_to_date',
                    'anxiety_depression_to_date',
                    'anxiety_depression_presc_to_date']
    # GroupBy.ffill preserves the original row index, so the assignment is
    # alignment-safe — unlike DataFrameGroupBy.apply, which can reattach the
    # group key as an extra index level on newer pandas versions.
    df[to_date_cols] = df.groupby('SafeHavenID')[to_date_cols].ffill().fillna(0)
    return df
def fill_yearly_columns(df):
    """
    Fill nulls in yearly columns introduced in joining
    :param df: dataframe to update (modified in place)
    :return: Filled dataframe
    """
    per_year_cols = [
        'adm_per_year', 'total_hosp_days', 'mean_los',
        'copd_per_year', 'resp_per_year', 'comorb_per_year',
        'salbutamol_per_year', 'saba_inhaler_per_year',
        'laba_inhaler_per_year', 'lama_inhaler_per_year',
        'sama_inhaler_per_year', 'ics_inhaler_per_year',
        'laba_ics_inhaler_per_year', 'lama_laba_ics_inhaler_per_year',
        'saba_sama_inhaler_per_year', 'mcs_inhaler_per_year',
        'rescue_meds_per_year', 'presc_per_year', 'labs_per_year',
        'anxiety_depression_per_year', 'anxiety_depression_presc_per_year',
    ]
    # A missing yearly count simply means no recorded activity that year.
    df[per_year_cols] = df[per_year_cols].fillna(0)
    return df
def fill_days_since(df, typ):
    """
    Fill days_since_copd/resp/rescue
    :param df: dataframe to update (one patient's rows; modified in place)
    :param typ: type of feature to fill ('copd', 'resp', 'rescue')
    :return: Filled dataframe
    """
    source_col = typ + '_date'
    target_col = 'days_since_' + typ
    # Carry the last known event date forward, then measure the gap to
    # each row's end-of-year timestamp.
    last_known = df[source_col].ffill()
    df[target_col] = df.eoy - last_known
    return df
def process_first_dates(df):
    """
    Process dataframe containing patient's first date in the health board region
    --------
    :param df: dataframe to process (one first-date column per source dataset)
    :return: processed dataframe with SafeHavenID, entry_dataset, first_entry
    """
    indexed = df.set_index('SafeHavenID')
    # Compute both summaries before attaching new columns, so the row-wise
    # min/idxmin only see the original date columns.
    earliest = indexed.min(axis=1)
    # Column names encode the source dataset in their second '_' segment.
    source = indexed.idxmin(axis=1).map(lambda name: name.split('_')[1])
    indexed['entry_dataset'] = source
    indexed['first_entry'] = earliest
    return indexed[['entry_dataset', 'first_entry']].reset_index()
def find_closest_simd(v):
    """
    Find closest SIMD vigintile for each row 'v'
    --------
    Picks the most recent SIMD release year that is not later than the
    row's end-of-year date and copies its quintile/decile/vigintile values
    into generic simd_* columns; rows before the first release get NaN.
    :param v: row of data from apply statement
    :return: row with simd_quintile/simd_decile/simd_vigintile set
    """
    eligible = [year for year in (2009, 2012, 2016) if v.eoy.year >= year]
    if eligible:
        yr = str(max(eligible))
        for level in ('quintile', 'decile', 'vigintile'):
            v['simd_' + level] = v['simd_' + yr + '_' + level]
    else:
        for level in ('quintile', 'decile', 'vigintile'):
            v['simd_' + level] = np.nan
    return v
def main():
    """
    Build the merged modelling dataset.

    Reads the per-domain processed pickles (admissions, comorbidities,
    prescriptions, labs, demographics), joins them per patient, fills the
    nulls introduced by the joins, derives days-since / first-entry /
    age / SIMD features, and pickles the result as 'merged_full.pkl'
    under the configured model data path.
    """
    # Load in config items
    # NOTE(review): relative path assumes the script runs from its own
    # directory three levels below the repo root — confirm.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']
    # Read in data
    adm = read_pkl_data('adm', data_path, 'data')
    comorb = read_pkl_data('comorb', data_path, 'data')
    presc = read_pkl_data('presc', data_path, 'data')
    labs = read_pkl_data('labs', data_path, 'data')
    demo = read_pkl_data('demo', data_path, 'data')
    # Join datasets
    # Comorbidities are left-joined (only kept for rows already in adm);
    # prescriptions and labs are outer-joined so their patients are kept too.
    df = adm.join(
        comorb, how='left').join(
        presc, how='outer').join(
        labs, how='outer')
    df = df.reset_index()
    # Fill nulls introduced in joining
    print('Filling data')
    df = fill_eth_grp_data(df)
    df = fill_to_date_columns(df)
    df = fill_yearly_columns(df)
    # Fill days_since columns
    # fill_days_since forward-fills each event-date column within a patient
    # before differencing against eoy, hence the per-patient groupby.
    for typ in ['copd', 'resp', 'rescue', 'adm']:
        df = df.groupby('SafeHavenID').apply(fill_days_since, typ)
    # Reduce to single column
    # The combined feature is the shorter of the two gaps (most recent event).
    ds_cols = ['days_since_copd', 'days_since_resp']
    df['days_since_copd_resp'] = df[ds_cols].min(axis=1)
    # Read in first date data
    print('Adding first dates')
    adm_dates = read_pkl_data('adm', data_path, 'date')
    presc_dates = read_pkl_data('presc', data_path, 'date')
    labs_dates = read_pkl_data('labs', data_path, 'date')
    # Merge first date data
    first_dates = pd.merge(
        pd.merge(adm_dates, presc_dates, how="outer", on='SafeHavenID'),
        labs_dates, how="outer", on='SafeHavenID')
    # Save first dates if needed
    first_dates.to_pickle(data_path + 'overall_first_dates.pkl')
    # Process first_years
    date_data = process_first_dates(first_dates)
    # Merge first dates data with dataframe
    print('Merging data')
    df_merged = pd.merge(df, date_data, on='SafeHavenID', how='inner')
    # Add years in health board region
    # Rounded whole years between first appearance and each row's end-of-year.
    ggc_years = (df_merged.eoy - df_merged.first_entry) / np.timedelta64(1, 'Y')
    df_merged['ggc_years'] = round(ggc_years)
    # Merge demographics
    df_merged = pd.merge(df_merged, demo, on='SafeHavenID')
    # Calculate age relative to end of year
    # 365.2425 = mean Gregorian year length; floor division gives whole years.
    dt_diff = df_merged.eoy - pd.to_datetime(df_merged.obf_dob)
    df_merged['age'] = dt_diff // timedelta(days=365.2425)
    # Find closest SIMD
    # Row-wise apply: collapses the per-release SIMD columns into simd_*.
    df_merged = df_merged.apply(find_closest_simd, axis=1)
    # Drop additional columns
    # Raw dates and per-release SIMD columns are no longer needed once the
    # derived features above exist.
    cols2drop = ['copd_date', 'resp_date', 'adm_date', 'rescue_date',
                 'simd_2009_quintile', 'simd_2009_decile',
                 'simd_2009_vigintile', 'simd_2012_quintile',
                 'simd_2012_decile', 'simd_2012_vigintile',
                 'simd_2016_quintile', 'simd_2016_decile',
                 'simd_2016_vigintile', 'days_since_copd',
                 'days_since_resp']
    df_merged = df_merged.drop(cols2drop, axis=1)
    # Save dataset
    df_merged.to_pickle(data_path + 'merged_full.pkl')
# Guard the entry point so importing this module (e.g. for testing or reuse
# of its helpers) does not trigger the full pipeline run.
if __name__ == '__main__':
    main()