"""Script that performs encoding of categorical features and imputation.

Performs encoding of categorical features, and imputation of missing values. After encoding
and imputation are performed, features are dropped. Two versions of the data are saved:
imputed and not imputed dataframes.
"""
|
|
import atexit
import json
import os
import sys

import joblib
import numpy as np
import pandas as pd
import yaml

import encoding
import imputation
|
|
|
|
# Parse the YAML settings file; the handle gets its own name so it is not confused
# with the parsed mapping that the rest of the script reads from.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Model variant for this run; it is embedded in the log, artifact and data file names.
model_settings = config["model_settings"]
model_type = model_settings["model_type"]
|
|
# Send everything written to stdout from here on into a per-model log file.
log = open("./training/logging/encode_and_impute_" + model_type + ".log", "w")
sys.stdout = log
# The handle has to stay open for the remainder of the script, so it cannot be
# scoped with a ``with`` block; close (and thereby flush) it explicitly when the
# interpreter exits instead of leaving it to garbage collection.
atexit.register(log.close)
|
|
# Name of the dataset being processed; the branches further down handle
# "train", "test" and "forward_val".
data_to_process = config["model_settings"]["data_to_process"]

# Load the combined dataset for this dataset / model type combination.
combined_file_name = "{}_combined_{}.pkl".format(data_to_process, model_type)
combined_file_path = os.path.join(
    config["outputs"]["processed_data_dir"], combined_file_name
)
data = pd.read_pickle(combined_file_path)
|
|
# Columns that are passed to the target encoding helpers as ``cols_to_encode``.
categorical_cols = [
    "LatestSymptomDiaryQ8",
    "LatestSymptomDiaryQ9",
    "LatestSymptomDiaryQ10",
    "DaysSinceLastExac",
    "AgeBinned",
    "Comorbidities",
    "FEV1PercentPredicted",
]

# Replace missing values with the literal string "nan" in every categorical column
# (presumably so that missingness is treated as a category of its own when encoding).
for col_name in categorical_cols:
    column = data[col_name]
    data[col_name] = column.replace(np.nan, "nan")
|
|
if data_to_process == "train":
    # Learn the target encodings on the training data and apply them to it.
    target_encodings = encoding.get_target_encodings(
        train_data=data,
        cols_to_encode=categorical_cols,
        target_col="ExacWithin3Months",
        smooth="auto",
    )
    train_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )
    # Persist the encodings; the test / forward_val branch reads this file back.
    # A context manager guarantees the file is flushed and closed immediately
    # rather than whenever the anonymous handle happens to be collected.
    with open(
        "./data/artifacts/target_encodings_" + model_type + ".json", "w"
    ) as encodings_file:
        json.dump(target_encodings, encodings_file)

    # K-fold target encoding for the cross-validation version of the data.
    # NOTE(review): allow_pickle=True unpickles the file contents; only safe as long
    # as the fold file is produced by this project.
    fold_patients = np.load(
        os.path.join(
            config["outputs"]["cohort_info_dir"],
            "fold_patients_{}.npy".format(model_type),
        ),
        allow_pickle=True,
    )
    train_encoded_cv, target_encodings = encoding.kfold_target_encode(
        df=data,
        fold_ids=fold_patients,
        cols_to_encode=categorical_cols,
        id_col="StudyId",
        target="ExacWithin3Months",
        smooth="auto",
        drop_categorical_cols=False,
    )

    # Drop the original categorical columns, except AgeBinned, which is still
    # needed as a groupby column for imputation.
    categorical_cols.remove("AgeBinned")
    train_encoded = train_encoded.drop(columns=categorical_cols)
    train_encoded_cv = train_encoded_cv.drop(columns=categorical_cols)
|
|
if data_to_process in ("test", "forward_val"):
    # Re-use the encodings that were saved when the training data was processed.
    # The context manager closes the artifact file as soon as it has been read.
    with open(
        "./data/artifacts/target_encodings_" + model_type + ".json"
    ) as encodings_file:
        target_encodings = json.load(encodings_file)
    test_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )

    # Drop the original categorical columns, except AgeBinned, which is still
    # needed as a groupby column for imputation.
    categorical_cols.remove("AgeBinned")
    test_encoded = test_encoded.drop(columns=categorical_cols)
|
|
# Columns that are removed before ``cols_to_impute`` is derived, i.e. columns the
# imputer never fills in. AgeBinned and Sex_F are the imputation groupby columns.
cols_to_ignore = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
    "Age",
    "Sex_F",
    "SafeHavenID",
    "AgeBinned",
]
|
|
if data_to_process == "train":
    # Keep a copy taken before imputation so both versions can be saved later.
    not_imputed_train = train_encoded.copy()
    cols_to_impute = train_encoded.drop(columns=cols_to_ignore).columns

    # Build the imputer from the training data, apply it to that same data and
    # persist it; the test / forward_val branch loads this artifact.
    imputer = imputation.get_imputer(
        train_data=train_encoded,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    imputed_train = imputation.apply_imputer(
        data=train_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    joblib.dump(imputer, "./data/artifacts/imputer_" + model_type + ".pkl")

    # K-fold imputation for the cross-validation data. This has to run on the
    # K-fold encoded frame (train_encoded_cv): imputing train_encoded here would
    # silently replace the per-fold target encodings with encodings learned on the
    # whole training set and make the imputed CV data inconsistent with
    # not_imputed_train_cv.
    not_imputed_train_cv = train_encoded_cv.copy()
    imputed_train_cv = imputation.kfold_impute(
        df=train_encoded_cv,
        fold_ids=fold_patients,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
        id_col="StudyId",
    )

    # Column names used further down to work out which features to drop.
    df_columns = imputed_train.columns.tolist()
|
|
if data_to_process in ("test", "forward_val"):
    # Imputer that was saved while processing the training data.
    imputer = joblib.load("./data/artifacts/imputer_" + model_type + ".pkl")

    # Take the not-imputed copy before the imputer is applied.
    not_imputed_test = test_encoded.copy()
    cols_to_impute = test_encoded.drop(columns=cols_to_ignore).columns

    groupby_cols = ["AgeBinned", "Sex_F"]
    imputed_test = imputation.apply_imputer(
        data=test_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=groupby_cols,
    )

    # Column names used further down to work out which features to drop.
    df_columns = imputed_test.columns.tolist()
|
|
# Any column whose name starts with one of these prefixes is dropped.
cols_to_drop_startswith = (
    "DiffLatest",
    "Var",
    "LatestEQ5D",
    "TotalEngagement",
    "Age",
    "NumHosp",
    "Required",
    "LungFunction",
    "EngagementCAT",
    "LatestSymptomDiary",
    "LatestAlbumin",
    "LatestEosinophils",
    "LatestNeutrophils",
    "LatestWhite Blood Count",
)

# Columns dropped by exact name, in addition to the prefix matches.
additional_cols_to_drop = [
    "PatientId",
    "SafeHavenID",
    "Sex_F",
    "NumCommExacPrior6mo",
    "AsthmaOverlap",
    "TimeSinceLungFunc",
    "LatestNeutLymphRatio",
    "EngagementEQ5DTW1",
    "EngagementMRCTW1",
    "LatestMRCQ1",
    "WeekAvgCATQ1",
    "WeekAvgCATQ3",
    "WeekAvgCATQ4",
    "WeekAvgCATQ5",
    "WeekAvgCATQ6",
    "WeekAvgCATQ7",
    "WeekAvgCATQ8",
    "WeekAvgSymptomDiaryQ1",
    "WeekAvgSymptomDiaryQ3",
    "WeekAvgSymptomDiaryScore",
    "EngagementSymptomDiaryTW1",
    "ScaledSumSymptomDiaryQ3TW1",
]

# Prefix matches first (in dataframe column order), then the explicitly named columns.
# str.startswith accepts the whole tuple of prefixes in one call.
prefix_matched_cols = [
    column for column in df_columns if column.startswith(cols_to_drop_startswith)
]
cols_to_drop = prefix_matched_cols + additional_cols_to_drop
|
|
# Remove the selected columns from every dataframe produced for this dataset.
if data_to_process == "train":
    (
        imputed_train,
        not_imputed_train,
        imputed_train_cv,
        not_imputed_train_cv,
    ) = [
        frame.drop(columns=cols_to_drop)
        for frame in (
            imputed_train,
            not_imputed_train,
            imputed_train_cv,
            not_imputed_train_cv,
        )
    ]
if data_to_process in ("test", "forward_val"):
    imputed_test, not_imputed_test = [
        frame.drop(columns=cols_to_drop)
        for frame in (imputed_test, not_imputed_test)
    ]
|
|
# Write the processed dataframes to the model input directory, creating it if needed.
model_input_dir = config["outputs"]["model_input_data_dir"]
os.makedirs(model_input_dir, exist_ok=True)

# (file name template, dataframe) pairs to write, in order, for the current dataset.
if data_to_process == "train":
    frames_to_save = [
        ("{}_imputed_{}.pkl", imputed_train),
        ("{}_not_imputed_{}.pkl", not_imputed_train),
        ("{}_imputed_cv_{}.pkl", imputed_train_cv),
        ("{}_not_imputed_cv_{}.pkl", not_imputed_train_cv),
    ]
elif data_to_process in ("test", "forward_val"):
    frames_to_save = [
        ("{}_imputed_{}.pkl", imputed_test),
        ("{}_not_imputed_{}.pkl", not_imputed_test),
    ]
else:
    # Nothing is written for any other dataset name.
    frames_to_save = []

for file_name_template, frame in frames_to_save:
    output_path = os.path.join(
        model_input_dir, file_name_template.format(data_to_process, model_type)
    )
    frame.to_pickle(output_path)
|
|