import os
import pickle
import shutil
import sys

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit submodule import; `import scipy` alone does not guarantee scipy.stats
import yaml

import model_h
|
|
# Load the pipeline configuration (paths, model settings) once at import time.
# The handle is named distinctly so it does not shadow the parsed `config` dict.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
|
|
|
|
def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test for every feature.

    Each column of ``train_data`` is compared against the same column of
    ``forward_val_data`` to detect distribution drift.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue`` (rounded to 4 d.p.), ``KS_TestStatistic`` and
        ``KS_DistributionsIdentical`` (1 when p >= 0.05, else 0).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    # Build the frame once instead of pd.concat inside the loop: avoids
    # quadratic copying and, unlike the first-iteration special case, also
    # works when there are no columns at all.
    df_ks = pd.DataFrame(rows, columns=["FeatureName", "KS_PValue", "KS_TestStatistic"])
    # p < 0.05 -> reject "same distribution" -> flag 0 (drift suspected).
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks
|
|
|
|
def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein distance for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName`` and
        ``WassersteinDistance``, sorted by distance in ascending order.
    """
    rows = [
        {
            "FeatureName": feature_name,
            "WassersteinDistance": scipy.stats.wasserstein_distance(
                train_data[feature_name], forward_val_data[feature_name]
            ),
        }
        for feature_name in train_data.columns
    ]
    # Single construction instead of per-feature concat: linear time, and
    # safe when the input has no columns (the original raised NameError).
    df_wd = pd.DataFrame(rows, columns=["FeatureName", "WassersteinDistance"])
    return df_wd.sort_values(by="WassersteinDistance", ascending=True)
|
|
|
|
| |
| |
| |
# Model variant for this run; used to locate input files and name outputs.
model_type = config["model_settings"]["model_type"]


# Redirect all subsequent print() output to a per-model-type log file.
# NOTE(review): the handle is never closed and sys.stdout is never restored;
# tolerable in a one-shot script, but worth confirming that is intended.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log
|
|
| |
# Forward-validation datasets: one with missing values imputed, one without.
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)
|
|
| |
| |
|
|
| |
# Cross-validation (training) data, loaded through the project helper so the
# same preprocessing/selection is applied as at training time.
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)
|
|
| |
| |
| |
# Data drift: compare feature distributions between training data and
# forward-validation data. Identifier columns are excluded from the tests.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(columns=["StudyId", "IndexDate"])


df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
# Left-join onto df_wd keeps the ascending Wasserstein-distance ordering.
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)
|
|
| |
| |
| |
| |
# Class balance of the forward-validation target, then a breakdown of the
# positive cases by exacerbation type (hospital vs community).
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "HospExacWithin3Months"
    ].value_counts()
)
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "CommExacWithin3Months"
    ].value_counts()
)
|
|
| |
# Split each dataset variant into features and target; identifiers and all
# target-related columns are dropped from the feature matrices.
forward_val_features_imp = forward_val_data_imputed.drop(
    columns=["StudyId", "IndexDate", "ExacWithin3Months", 'HospExacWithin3Months',
             'CommExacWithin3Months']
)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(
    columns=["StudyId", "IndexDate", "ExacWithin3Months", 'HospExacWithin3Months',
             'CommExacWithin3Months']
)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]
|
|
| |
| |
# Sanity check: both dataset variants must share the identical target column,
# otherwise a single `test_target` would be meaningless.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp


# Coerce every feature column to numeric in place; unparseable values become NaN.
for features in [forward_val_features_imp, forward_val_features_no_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors="coerce")
|
|
| |
| |
# Models to evaluate: (model name, dataset variant, tuned decision threshold).
models = [
    ("balanced_random_forest", "imputed", 0.27),
]
|
|
| |
| |
| |
# Track runs in a local SQLite-backed MLflow store.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")


with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_info in models:
        model_name, data_variant, best_threshold = model_info
        print(model_name)
        with mlflow.start_run(run_name=model_name, nested=True):
            # Start each model with an empty artifact directory.
            # BUGFIX: the original called makedirs(exist_ok=True) and THEN
            # rmtree, leaving the directory missing so every plt.savefig
            # below would fail. Remove first, then (re)create.
            artifact_dir = config["outputs"]["artifact_dir"]
            if os.path.isdir(artifact_dir):
                shutil.rmtree(artifact_dir)
            os.makedirs(artifact_dir, exist_ok=True)

            # NOTE(review): the filename ends in "_pkl" (underscore, no dot)
            # -- confirm this matches how the training script names the file.
            # SECURITY: pickle.load must only be used on trusted local files.
            with open("./data/model/trained_iso_" + model_name + "_pkl", "rb") as f:
                model = pickle.load(f)

            # Pick the feature set matching how this model was trained.
            if data_variant == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp

            # Positive-class probabilities and default-threshold predictions.
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)

            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=best_threshold,
            )

            # Confusion matrices at a sweep of thresholds plus the tuned one.
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold],
                test_probs,
                test_target,
                model_name,
                model_type,
                "forward_val",
            )

            # Calibration curves with quantile binning at two bin counts.
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                # Diagonal = perfectly calibrated reference line.
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_name)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        artifact_dir,
                        model_name
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()

            # Per-subject probabilities/predictions joined to event-type
            # columns, then metrics and curves broken down by event type.
            preds_events_df_forward_val = model_h.create_df_probabilities_and_predictions(
                test_probs,
                best_threshold,
                forward_val_data_imputed["StudyId"].tolist(),
                test_target,
                forward_val_data_imputed[["ExacWithin3Months", 'HospExacWithin3Months',
                                          'CommExacWithin3Months']],
                model_name,
                model_type,
                output_dir="./data/prediction_and_events/",
                calib_type="forward_val",
            )
            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )
            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )
            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            # Score distributions by true class, saved into the artifact dir.
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                artifact_dir,
                model_name,
                model_type,
            )

            # Log metrics plus every artifact written for this model.
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(artifact_dir)
# Redundant safeguard: the context managers above already close the runs.
mlflow.end_run()
|
|