import os
import pickle
import shutil
import sys

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit submodule import; `import scipy` alone does not guarantee scipy.stats
import yaml

import model_h
|
|
# Load the pipeline configuration (paths, model settings) once at import time.
# The handle is named distinctly so it does not shadow the parsed `config` dict.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
|
|
|
|
def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test for every feature.

    Each column of ``train_data`` is compared against the same column of
    ``forward_val_data`` to detect distribution drift.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue`` (rounded to 4 d.p.), ``KS_TestStatistic`` and
        ``KS_DistributionsIdentical`` (1 when p >= 0.05, else 0).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    # Build the frame once instead of pd.concat inside the loop: avoids
    # quadratic copying and, unlike the first-iteration special case, also
    # works when there are no columns at all.
    df_ks = pd.DataFrame(rows, columns=["FeatureName", "KS_PValue", "KS_TestStatistic"])
    # p < 0.05 -> reject "same distribution" -> flag 0 (drift suspected).
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks
|
|
|
|
def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein distance for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName`` and
        ``WassersteinDistance``, sorted by distance in ascending order.
    """
    rows = [
        {
            "FeatureName": feature_name,
            "WassersteinDistance": scipy.stats.wasserstein_distance(
                train_data[feature_name], forward_val_data[feature_name]
            ),
        }
        for feature_name in train_data.columns
    ]
    # Single construction instead of per-feature concat: linear time, and
    # safe when the input has no columns (the original raised NameError).
    df_wd = pd.DataFrame(rows, columns=["FeatureName", "WassersteinDistance"])
    return df_wd.sort_values(by="WassersteinDistance", ascending=True)
|
|
|
|
| |
| |
| |
# Model variant for this run; used to locate input files and name outputs.
model_type = config["model_settings"]["model_type"]


# Redirect all subsequent print() output to a per-model-type log file.
# NOTE(review): the handle is never closed and sys.stdout is never restored;
# tolerable in a one-shot script, but worth confirming that is intended.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log
|
|
| |
# Forward-validation datasets: one with missing values imputed, one without.
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)
|
|
| |
| |
|
|
| |
# Cross-validation (training) data, loaded through the project helper so the
# same preprocessing/selection is applied as at training time.
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)
|
|
| |
| |
| |
# Data drift: compare feature distributions between training data and
# forward-validation data. Identifier columns are excluded from the tests.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(columns=["StudyId", "IndexDate"])


df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
# Left-join onto df_wd keeps the ascending Wasserstein-distance ordering.
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)
|
|
| |
| |
| |
| |
# Class balance of the forward-validation target, then a breakdown of the
# positive cases by exacerbation type (hospital vs community).
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "HospExacWithin3Months"
    ].value_counts()
)
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "CommExacWithin3Months"
    ].value_counts()
)
|
|
| |
# Split each dataset variant into features and target; identifiers and all
# target-related columns are dropped from the feature matrices.
forward_val_features_imp = forward_val_data_imputed.drop(
    columns=["StudyId", "IndexDate", "ExacWithin3Months", 'HospExacWithin3Months',
             'CommExacWithin3Months']
)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(
    columns=["StudyId", "IndexDate", "ExacWithin3Months", 'HospExacWithin3Months',
             'CommExacWithin3Months']
)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]
|
|
| |
| |
# Sanity check: both dataset variants must share the identical target column,
# otherwise a single `test_target` would be meaningless.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp


# Coerce every feature column to numeric in place; unparseable values become NaN.
for features in [forward_val_features_imp, forward_val_features_no_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors="coerce")
|
|
| |
| |
# Models to evaluate: (model name, dataset variant, tuned decision threshold).
models = [
    ("balanced_random_forest", "imputed", 0.27),
]
|
|
| |
| |
| |
# Track runs in a local SQLite-backed MLflow store.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")


with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_info in models:
        model_name, data_variant, best_threshold = model_info
        print(model_name)
        with mlflow.start_run(run_name=model_name, nested=True):
            # Start each model with an empty artifact directory.
            # BUGFIX: the original called makedirs(exist_ok=True) and THEN
            # rmtree, leaving the directory missing so every plt.savefig
            # below would fail. Remove first, then (re)create.
            artifact_dir = config["outputs"]["artifact_dir"]
            if os.path.isdir(artifact_dir):
                shutil.rmtree(artifact_dir)
            os.makedirs(artifact_dir, exist_ok=True)

            # NOTE(review): the filename ends in "_pkl" (underscore, no dot)
            # -- confirm this matches how the training script names the file.
            # SECURITY: pickle.load must only be used on trusted local files.
            with open("./data/model/trained_iso_" + model_name + "_pkl", "rb") as f:
                model = pickle.load(f)

            # Pick the feature set matching how this model was trained.
            if data_variant == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp

            # Positive-class probabilities and default-threshold predictions.
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)

            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=best_threshold,
            )

            # Confusion matrices at a sweep of thresholds plus the tuned one.
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold],
                test_probs,
                test_target,
                model_name,
                model_type,
                "forward_val",
            )

            # Calibration curves with quantile binning at two bin counts.
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                # Diagonal = perfectly calibrated reference line.
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_name)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        artifact_dir,
                        model_name
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()

            # Per-subject probabilities/predictions joined to event-type
            # columns, then metrics and curves broken down by event type.
            preds_events_df_forward_val = model_h.create_df_probabilities_and_predictions(
                test_probs,
                best_threshold,
                forward_val_data_imputed["StudyId"].tolist(),
                test_target,
                forward_val_data_imputed[["ExacWithin3Months", 'HospExacWithin3Months',
                                          'CommExacWithin3Months']],
                model_name,
                model_type,
                output_dir="./data/prediction_and_events/",
                calib_type="forward_val",
            )
            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )
            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )
            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            # Score distributions by true class, saved into the artifact dir.
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                artifact_dir,
                model_name,
                model_type,
            )

            # Log metrics plus every artifact written for this model.
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(artifact_dir)
# Redundant safeguard: the context managers above already close the runs.
mlflow.end_run()
|
|