| | """ |
| | TRAIN |
| | Impute any null data, save ethnicity info for each ID and scale |
| | final dataset |
| | """ |
| | import json |
| | import joblib |
| | import pandas as pd |
| | import numpy as np |
| | from numpy import savetxt |
| | from sklearn.preprocessing import MinMaxScaler |
| | from utils.reduction import calc_ds_med |
| |
|
| |
|
# Demographic grouping keys used for all median-based imputation.
demo_cols = ['age_bin', 'sex_bin']

# "Days since last X event" columns; handled as timedeltas and filled
# with special logic (see ds_fill_5year_train and main).
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# 2-year median lab-result columns whose nulls are filled with the
# median of the patient's demographic (age_bin, sex_bin) group.
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr']

# Columns dropped after imputation, before scaling — identifiers, raw
# demographics and intermediate median columns not used for modelling.
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'sex_bin', 'marital_status', 'age_bin',
             'days_since_copd_resp_med', 'days_since_adm_med',
             'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
             'simd_quintile']
| |
|
| |
|
def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 quantile bins and save the bin edges so the same
    binning can be reproduced on test data.
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column holding
        the (truncated) upper edge of each row's age decile
    """

    # First pass: we only need the decile edges; the categorical result
    # is discarded (the original bound it to an unused variable).
    _, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True)

    # Second pass: label each row with its bin's upper edge so the bin
    # value is itself an age rather than an arbitrary code.
    categories = pd.qcut(df['age'], q=10, precision=0, labels=edges[1:])
    df['age_bin'] = categories.astype(int)

    # Persist the edges so the test pipeline can apply identical bins.
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')

    return df
| |
|
| |
|
def calc_df_med(df, data_path):
    """
    Compute per-demographic-group medians for all feature columns,
    persist them, and join the days_since medians back onto the data.
    --------
    :param df: dataframe to update
    :param data_path: path to generated data
    :return: dataframe with days_since '_med' columns joined on by
        (age_bin, sex_bin)
    """

    # Every column except the identifier and year-end marker.
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])

    # Median of each feature within each (age_bin, sex_bin) group.
    grouped_medians = df[feature_cols].groupby(demo_cols).median()

    # days_since_* columns get their own median logic (project helper).
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)

    # Save the combined median table for imputing the test set later.
    grouped_medians.join(ds_med).to_pickle(data_path + 'medians.pkl')

    # Attach the days_since medians (suffixed '_med') to every row.
    ds_med.columns += '_med'
    return df.join(ds_med, on=demo_cols)
| |
|
| |
|
def ds_fill_5year_train(df, col):
    """
    Fill days_since_X nulls for patients with under five years of
    history, using the column maximum as the fill value.
    --------
    :param df: dataframe to be updated
    :param col: days_since column to fill
    :return: dataframe with nulls in `col` replaced by the column max
        wherever ggc_years < 5
    """
    short_history = df['ggc_years'] < 5
    fill_value = df[col].max()
    # Only rows that are both short-history and null get the max.
    df.loc[short_history & df[col].isna(), col] = fill_value

    return df
| |
|
| |
|
def scale_data_train(df, data_path, scaler):
    """
    Fit the scaler on all feature columns and return the scaled dataset.
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to fit and apply to df
    :return: scaled dataset with the ID and eoy columns re-attached
    """
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])

    # Fit and transform in one step, keeping the result labelled.
    scaled = pd.DataFrame(
        scaler.fit_transform(df[feature_cols].to_numpy()),
        columns=feature_cols)

    # Restore identifier columns; reset the index so rows align with
    # the positionally-indexed scaled frame.
    id_cols = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = id_cols.join(scaled)

    # Persist the fitted scaler for reuse on the test data.
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')

    return df_final
| |
|
| |
|
def main():
    """Run the train-set imputation and scaling pipeline end to end."""

    # Load project configuration to locate the generated-data directory.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
        data_path = config['model_data_path']

    # Training data produced by the upstream merge step.
    df = pd.read_pickle(data_path + 'merged_train.pkl')

    # Bin ages into deciles; edges are persisted for the test pipeline.
    df = calc_age_bins_train(df, data_path)

    # Compute and persist per-demographic medians; joins days_since
    # '_med' columns onto df.
    df = calc_df_med(df, data_path)

    # Fill lab-value nulls with the median of each (age_bin, sex_bin)
    # group's own values.
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda x: x.fillna(x.median()))

    # days_since_* columns are timedeltas (divided by one day below).
    # Persist their maxima, fill nulls — max value for short-history
    # patients, demographic median otherwise — then convert to whole
    # days as ints.
    day = np.timedelta64(1, 'D')
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_train(df, col)
        df[col] = df[col].fillna(df[col + '_med'])
        df[col] = (df[col] / day).astype(int)

    # Snapshot the fully imputed (still unscaled) training set.
    df.to_pickle(data_path + 'filled_train.pkl')

    # Drop identifiers / raw demographics not used for modelling.
    df = df.drop(cols2drop, axis=1)

    scaler = MinMaxScaler()

    # Min-max scale; also persists the fitted scaler to disk.
    df_final = scale_data_train(df, data_path, scaler)

    df_final.to_pickle(data_path + 'min_max_train.pkl')
| |
|
| |
|
if __name__ == '__main__':
    # Guard so importing this module for its helper functions does not
    # run the whole pipeline as a side effect.
    main()
| |
|