| | """ |
| | TESTING |
| | Impute any null data, save ethnicity info for each ID and scale |
| | final dataset |
| | |
| | NB: This script can be used for merged receiver, scale up or testing data |
| | """ |
| | import json |
| | import sys |
| | import joblib |
| | import pandas as pd |
| | import numpy as np |
| | from numpy import loadtxt |
| |
|
| |
|
# Timedelta columns: days since the last event of each type.  Converted to
# whole-day integers in main() after imputation.
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Columns that may contain nulls, imputed with training-set medians per
# (age bin, sex bin) label: 2-year median lab values plus the
# days-since columns above.
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr',
             'days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# Identifier/metadata columns dropped in main() before scaling — presumably
# not model features (verify against the training pipeline).
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'marital_status', 'label', 'simd_vigintile', 'simd_decile',
             'simd_quintile', 'sex_bin']
| |
|
| |
|
def calc_age_bins_test(df, data_path):
    """
    Load training bin edges and assign the corresponding age bins to the
    testing data.
    --------
    :param df: dataframe to be updated (must contain an 'age' column)
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column
    """
    # Bin edges computed on the training set (n+1 edges -> n bins).
    edges = loadtxt(data_path + 'age_bins_train.csv', delimiter=',')
    # BUG FIX: the original used pd.qcut(df['age'], q=10, ...), which
    # recomputes decile edges from the *test* data and only borrowed the
    # training edges as labels.  To reproduce the training binning we must
    # cut the test ages with the training edges themselves.
    # Clip so ages outside the training range land in the boundary bins
    # instead of producing NaN (which would crash astype(int)).
    ages = df['age'].clip(edges[0], edges[-1])
    categories = pd.cut(ages, bins=edges, labels=edges[1:],
                        include_lowest=True)
    df['age_bin'] = categories.astype(float).astype(int)

    return df
| |
|
| |
|
def create_label(df):
    """
    Build a combined '<age_bin>_<sex_bin>' label column and remove the
    now-redundant 'age_bin' column.
    --------
    :param df: dataframe with 'age_bin' and 'sex_bin' columns
    :return: dataframe with 'label' added and 'age_bin' dropped
    """
    age_part = df['age_bin'].astype(str)
    sex_part = df['sex_bin'].astype(str)
    df['label'] = age_part + '_' + sex_part

    return df.drop(columns='age_bin')
| |
|
| |
|
def fill_nulls(label, df, medians, cols=None):
    """
    Fill any null values in testing/REC/SUP data with median values from
    training data.
    --------
    :param label: string label containing age and sex bin values, e.g. '51_0'
        for a male patient in the less than 51 age bin
    :param df: dataframe
    :param medians: dataframe of training set medians for each label and
        column
    :param cols: columns to impute; defaults to the module-level null_cols
    :return: filled dataframe for specified label
    """
    if cols is None:
        cols = null_cols
    meds = medians[medians['label'] == label].iloc[0]
    # BUG FIX: .copy() makes the group an independent frame.  The original
    # assigned into a boolean-mask slice of df, which triggers pandas'
    # SettingWithCopyWarning and may silently fail to write.
    df_2_fill = df[df['label'] == label].copy()
    for col in cols:
        df_2_fill[col] = df_2_fill[col].fillna(meds[col])

    return df_2_fill
| |
|
| |
|
def ds_fill_5year_test(df, col, max_vals):
    """
    Fill days_since_X columns where patient has been in the dataset less
    than 5 years.
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :param max_vals: series with columns and their max value from training
    :return: dataframe with column nulls filled where patient has
        ggc_years < 5
    """
    # Only the short-history rows are imputed; others keep their nulls.
    under_5 = df['ggc_years'] < 5
    df.loc[under_5, col] = df.loc[under_5, col].fillna(max_vals[col])

    return df
| |
|
| |
|
def scale_data_test(df, scaler):
    """
    Min-max scale the final dataset, leaving identifier columns untouched.
    -----
    :param df: dataframe to be scaled
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    # Identifiers are kept aside and re-joined unscaled.
    id_cols = ['SafeHavenID', 'eoy']
    feature_cols = df.columns.drop(id_cols)
    scaled = scaler.transform(df[feature_cols].to_numpy())
    scaled_features = pd.DataFrame(scaled, columns=feature_cols)
    ids = df[id_cols].reset_index(drop=True)

    return ids.join(scaled_features)
| |
|
| |
|
def main():
    """
    Impute, convert and scale a merged dataset chosen by sys.argv[1],
    writing 'filled_<type>.pkl' and 'min_max_<type>.pkl' to the data path.
    """
    # Project configuration holds the path to all generated artefacts.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    data_path = config['model_data_path']

    # Dataset variant to process, e.g. receiver / scale-up / testing.
    data_type = sys.argv[1]

    df = pd.read_pickle(data_path + 'merged_' + data_type + '.pkl')

    # Assign age bins using the edges computed on the training data.
    df = calc_age_bins_test(df, data_path)

    # Training-set medians per (age bin, sex bin) label.
    df_medians = pd.read_pickle(data_path + 'medians.pkl').reset_index()
    df_medians = create_label(df_medians)
    df = create_label(df)
    labels = df_medians['label']

    # Patients with < 5 years of history get the training maximum for the
    # days-since columns before the median imputation below.
    max_vals = pd.read_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_test(df, col, max_vals)

    # Median-impute remaining nulls, one label group at a time.
    df_filled = pd.concat([fill_nulls(lbl, df, df_medians)
                           for lbl in labels])

    # Convert the timedelta columns to whole days.
    one_day = np.timedelta64(1, 'D')
    for col in ds_cols:
        df_filled[col] = (df_filled[col] / one_day).astype(int)

    df_filled.to_pickle(data_path + 'filled_' + data_type + '.pkl')

    # Drop identifier/metadata columns before scaling.
    df_filled = df_filled.drop(cols2drop, axis=1)

    scaler = joblib.load(data_path + 'min_max_scaler_train.pkl')
    df_filled = scale_data_test(df_filled, scaler)

    df_filled.to_pickle(data_path + 'min_max_' + data_type + '.pkl')
| |
|
| |
|
# Guard the entry point so importing this module does not trigger the
# whole pipeline (the original called main() unconditionally on import).
if __name__ == '__main__':
    main()
| |
|