| | """ |
| | TRAIN |
| | Impute any null data, save ethnicity info for each ID and scale |
| | final dataset |
| | """ |
| | import json |
| | import joblib |
| | import pandas as pd |
| | import numpy as np |
| | from numpy import savetxt |
| | from sklearn.preprocessing import MinMaxScaler |
| | from utils.reduction import calc_ds_med |
| |
|
| |
|
# Demographic grouping keys used for all median-based imputation.
demo_cols = ['age_bin', 'sex_bin']

# "Days since last X event" columns; handled as timedeltas and filled
# with special logic (see ds_fill_5year_train and main).
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']

# 2-year median lab-result columns whose nulls are filled with the
# median of the patient's demographic (age_bin, sex_bin) group.
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr']

# Columns dropped after imputation, before scaling — identifiers, raw
# demographics and intermediate median columns not used for modelling.
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'sex_bin', 'marital_status', 'age_bin',
             'days_since_copd_resp_med', 'days_since_adm_med',
             'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
             'simd_quintile']
| |
|
| |
|
def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 quantile bins and save the bin edges so the same
    binning can be reproduced on test data.
    --------
    :param df: dataframe to be updated; must contain an 'age' column
    :param data_path: path to generated data
    :return: updated dataframe with an integer 'age_bin' column holding
        the (truncated) upper edge of each row's age decile
    """

    # First pass: we only need the decile edges; the categorical result
    # is discarded (the original bound it to an unused variable).
    _, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True)

    # Second pass: label each row with its bin's upper edge so the bin
    # value is itself an age rather than an arbitrary code.
    categories = pd.qcut(df['age'], q=10, precision=0, labels=edges[1:])
    df['age_bin'] = categories.astype(int)

    # Persist the edges so the test pipeline can apply identical bins.
    savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')

    return df
| |
|
| |
|
def calc_df_med(df, data_path):
    """
    Compute per-demographic-group medians for all feature columns,
    persist them, and join the days_since medians back onto the data.
    --------
    :param df: dataframe to update
    :param data_path: path to generated data
    :return: dataframe with days_since '_med' columns joined on by
        (age_bin, sex_bin)
    """

    # Every column except the identifier and year-end marker.
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])

    # Median of each feature within each (age_bin, sex_bin) group.
    grouped_medians = df[feature_cols].groupby(demo_cols).median()

    # days_since_* columns get their own median logic (project helper).
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)

    # Save the combined median table for imputing the test set later.
    grouped_medians.join(ds_med).to_pickle(data_path + 'medians.pkl')

    # Attach the days_since medians (suffixed '_med') to every row.
    ds_med.columns += '_med'
    return df.join(ds_med, on=demo_cols)
| |
|
| |
|
def ds_fill_5year_train(df, col):
    """
    Fill days_since_X nulls for patients with under five years of
    history, using the column maximum as the fill value.
    --------
    :param df: dataframe to be updated
    :param col: days_since column to fill
    :return: dataframe with nulls in `col` replaced by the column max
        wherever ggc_years < 5
    """
    short_history = df['ggc_years'] < 5
    fill_value = df[col].max()
    # Only rows that are both short-history and null get the max.
    df.loc[short_history & df[col].isna(), col] = fill_value

    return df
| |
|
| |
|
def scale_data_train(df, data_path, scaler):
    """
    Fit the scaler on all feature columns and return the scaled dataset.
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to fit and apply to df
    :return: scaled dataset with the ID and eoy columns re-attached
    """
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])

    # Fit and transform in one step, keeping the result labelled.
    scaled = pd.DataFrame(
        scaler.fit_transform(df[feature_cols].to_numpy()),
        columns=feature_cols)

    # Restore identifier columns; reset the index so rows align with
    # the positionally-indexed scaled frame.
    id_cols = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = id_cols.join(scaled)

    # Persist the fitted scaler for reuse on the test data.
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')

    return df_final
| |
|
| |
|
def main():
    """Run the train-set imputation and scaling pipeline end to end."""

    # Load project configuration to locate the generated-data directory.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
        data_path = config['model_data_path']

    # Training data produced by the upstream merge step.
    df = pd.read_pickle(data_path + 'merged_train.pkl')

    # Bin ages into deciles; edges are persisted for the test pipeline.
    df = calc_age_bins_train(df, data_path)

    # Compute and persist per-demographic medians; joins days_since
    # '_med' columns onto df.
    df = calc_df_med(df, data_path)

    # Fill lab-value nulls with the median of each (age_bin, sex_bin)
    # group's own values.
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda x: x.fillna(x.median()))

    # days_since_* columns are timedeltas (divided by one day below).
    # Persist their maxima, fill nulls — max value for short-history
    # patients, demographic median otherwise — then convert to whole
    # days as ints.
    day = np.timedelta64(1, 'D')
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_train(df, col)
        df[col] = df[col].fillna(df[col + '_med'])
        df[col] = (df[col] / day).astype(int)

    # Snapshot the fully imputed (still unscaled) training set.
    df.to_pickle(data_path + 'filled_train.pkl')

    # Drop identifiers / raw demographics not used for modelling.
    df = df.drop(cols2drop, axis=1)

    scaler = MinMaxScaler()

    # Min-max scale; also persists the fitted scaler to disk.
    df_final = scale_data_train(df, data_path, scaler)

    df_final.to_pickle(data_path + 'min_max_train.pkl')
| |
|
| |
|
if __name__ == '__main__':
    # Guard so importing this module for its helper functions does not
    # run the whole pipeline as a side effect.
    main()
| |
|