# Model E: Unsupervised PCA + clustering risk stratification (commit 53a6def)
"""
Script to remove all receiver IDs from relevant data sources.
"""
import json
import pandas as pd
from sklearn.model_selection import train_test_split
def get_ids(path):
    """
    Load the SafeHavenIDs listed in a CSV lookup file.
    --------
    :param path: path to a CSV file with a 'SafeHavenID' column
        (read with cp1252 encoding)
    :return: list of SafeHavenIDs
    """
    print('Loading IDs from ' + path)
    lookup = pd.read_csv(path, encoding="cp1252")
    return lookup['SafeHavenID'].tolist()
def save_rec_sup(df, data_path, rec_ids, sup_ids):
    """
    Split out receiver and scale-up cohort rows, pickle each cohort,
    and return the dataframe with those rows removed.
    --------
    :param df: pandas dataframe to remove ids from
    :param data_path: path prefix for generated data files
    :param rec_ids: list of SafeHavenIDs in receiver cohort to remove
    :param sup_ids: list of SafeHavenIDs in scale-up cohort to remove
    :return: dataframe with receiver and scale-up cohort rows removed
    """
    print('Saving REC and SUP data')
    # Compute each cohort's membership mask once; reuse for both the
    # cohort extracts and the exclusion below.
    rec_mask = df['SafeHavenID'].isin(rec_ids)
    sup_mask = df['SafeHavenID'].isin(sup_ids)
    # Save cohort data before dropping it from the main dataframe
    df[rec_mask].to_pickle(data_path + 'merged_rec.pkl')
    df[sup_mask].to_pickle(data_path + 'merged_sup.pkl')
    # Keep only rows belonging to neither cohort
    return df[~(rec_mask | sup_mask)]
def save_df_ids(df, data_path, ids, typ):
    """
    Pickle a split's ID list and the matching rows of the dataframe.
    --------
    :param df: dataframe
    :param data_path: path to generated data
    :param ids: list of SafeHavenIDs belonging to this split
    :param typ: type of dataset to create, 'train', 'test', 'val'
    """
    print('Saving ' + typ + ' data')
    # Persist the raw ID list for this split
    id_frame = pd.DataFrame(ids, columns=['SafeHavenID'])
    id_frame.to_pickle(data_path + typ + '_ids.pkl')
    # Persist the subset of rows whose IDs fall in this split
    subset = df[df['SafeHavenID'].isin(ids)]
    subset.to_pickle(data_path + 'merged_' + typ + '.pkl')
def df_tts(df, data_path):
    """
    Split the data into training, testing and validation sets and save
    each split's IDs and rows.
    --------
    :param df: pandas dataframe to split
    :param data_path: path to generated data
    :return: None
    """
    all_ids = df['SafeHavenID'].tolist()
    # 80/20 train/test split, then 25% of the remainder becomes the
    # validation set (overall 60/20/20); fixed seeds for reproducibility.
    remaining_ids, test_ids = train_test_split(
        all_ids, test_size=0.2, random_state=42)
    train_ids, val_ids = train_test_split(
        remaining_ids, test_size=0.25, random_state=42)
    # Save IDs and datasets for each split
    for split_ids, split_name in ((train_ids, 'train'),
                                  (test_ids, 'test'),
                                  (val_ids, 'val')):
        save_df_ids(df, data_path, split_ids, split_name)
def main():
    """
    Remove receiver and scale-up cohort IDs from the merged dataset,
    then split the remainder into train/test/validation sets.
    --------
    :return: None
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    # Set paths
    data_path = config['model_data_path']
    rec_path = config['rec_data_path'] + 'Cohort3Rand.csv'
    sup_path = config['sup_data_path'] + 'Scale_Up_lookup.csv'
    # Get IDs to exclude
    rec_ids = get_ids(rec_path)
    sup_ids = get_ids(sup_path)
    # Remove IDs from datasets
    df = pd.read_pickle(data_path + 'merged.pkl')
    df = save_rec_sup(df, data_path, rec_ids, sup_ids)
    # Split and save the data
    df_tts(df, data_path)


# Guard the entry point so importing this module does not trigger the
# full pipeline (the original called main() unconditionally).
if __name__ == '__main__':
    main()