| """Prepare final train set for cross-validation (K fold encoded, scaled and imputed).""" |
| import copd |
| |
| from lenusml import crossvalidation |
| import os |
| import pandas as pd |
| import numpy as np |
| import seaborn as sns |
| from sklearn.preprocessing import MinMaxScaler |
| from sklearn.impute import SimpleImputer |
|
|
# Global seaborn appearance: 'darkgrid' style at 'talk' context, 'dark' default palette.
sns.set(context='talk', style='darkgrid')
sns.set_palette('dark')

# Named colour palettes kept as module-level variables. Nothing in the visible
# part of this script reads them.
dark = sns.palettes.color_palette(palette='dark')
muted = sns.palettes.color_palette(palette='muted')
|
|
# Input and output locations used throughout the script.
# '<YOUR_DATA_PATH>' is a placeholder that has to be replaced before running.
output_data_dir = "../data/models/model1"
cohort_info_dir = "../data/cohort_info/"
data_dir = "<YOUR_DATA_PATH>/train_data/"
|
|
# Fold assignment passed to every k-fold helper further down.
# NOTE(review): np.load(allow_pickle=True) and pd.read_pickle both unpickle
# arbitrary Python objects - only point them at trusted files.
fold_patients = np.load(
    os.path.join(cohort_info_dir, 'fold_patients.npy'),
    allow_pickle=True,
)

# Train table; the rest of the script reads columns such as PatientId, StudyId,
# IsExac and DateOfEvent from it.
data = pd.read_pickle(
    os.path.join(data_dir, 'train_data.pkl')
)
|
|
# Rows flagged IsExac == 1, and the distinct StudyIds that own at least one such row.
exacs = data.loc[data['IsExac'] == 1]
exac_patients = exacs['StudyId'].unique()
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
# Recode the two boolean admission flags as 1/0.
bool_mapping = {True: 1, False: 0}
for flag_col in ('RequiredAcuteNIV', 'RequiredICUAdmission'):
    data[flag_col] = data[flag_col].replace(bool_mapping)

# Binary-encode sex as Sex_F (F -> 1, M -> 0) and discard the original column.
# Series.map leaves any value that is not a key of the mapping as NaN.
sex_mapping = {'F': 1, 'M': 0}
data['Sex_F'] = data['Sex'].map(sex_mapping)
data = data.drop(columns='Sex')
|
|
| |
| |
| |
# --- Patient-reported outcomes: CAT questionnaire and symptom diary ---------
pro_source_dir = '<YOUR_DATA_PATH>/copd-dataset/'

# CAT file is read with all of its columns.
cat = pd.read_csv(os.path.join(pro_source_dir, 'CopdDatasetProCat.txt'), sep="|")

# Only the identifying columns and the diary questions used later are read.
symptom_diary_columns = [
    'PatientId', 'SubmissionTime',
    'SymptomDiaryQ1', 'SymptomDiaryQ2', 'SymptomDiaryQ3',
    'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10',
]
symptom_diary = pd.read_csv(
    os.path.join(pro_source_dir, 'CopdDatasetProSymptomDiary.txt'),
    usecols=symptom_diary_columns,
    sep="|",
)

# Parse timestamps as UTC and truncate to midnight, so both sources can be
# joined on the calendar day of submission.
cat['SubmissionTime'] = pd.to_datetime(cat['SubmissionTime'], utc=True).dt.normalize()
symptom_diary['SubmissionTime'] = pd.to_datetime(
    symptom_diary['SubmissionTime'], utc=True).dt.normalize()

# Keep only patients that appear in the train table.
cat = cat[cat['PatientId'].isin(data['PatientId'])]
symptom_diary = symptom_diary[symptom_diary['PatientId'].isin(data['PatientId'])]

# One row per patient per day (first occurrence wins), restricted to days on
# which both a CAT and a symptom diary entry exist (inner join).
pro_keys = ['PatientId', 'SubmissionTime']
daily_pros = cat.drop_duplicates(subset=pro_keys).merge(
    symptom_diary.drop_duplicates(subset=pro_keys),
    on=pro_keys,
    how='inner',
)
|
|
| |
# Numeric PRO columns for which rolling-mean based features are built:
# CATQ1..CATQ8, the first two symptom diary questions and 'Score'.
numeric_pros = ['CATQ{}'.format(q) for q in range(1, 9)]
numeric_pros += ['SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']

# Rolling mean over the previous period, computed by the project helper
# (window=3; the helper's exact windowing semantics are not visible here).
mean_pros = copd.rolling_mean_previous_period(
    df=daily_pros,
    cols=numeric_pros,
    date_col='SubmissionTime',
    id_col='StudyId',
    window=3,
)

# Attach the rolling means to the daily rows, then let the helper derive the
# difference-from-rolling-mean columns.
daily_pros = daily_pros.merge(mean_pros, how='left', on=['StudyId', 'SubmissionTime'])
daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Columns whose name ends in '_ave' are not kept.
is_ave_column = daily_pros.columns.str.endswith('_ave')
daily_pros = daily_pros.loc[:, ~is_ave_column]

# merge_asof needs both sides sorted on their time key. For every patient day
# it picks the latest PRO row of the same PatientId with
# SubmissionTime <= DateOfEvent (direction='backward').
events_sorted = data.sort_values(by='DateOfEvent')
pros_sorted = daily_pros.drop(columns=['StudyId']).sort_values(by='SubmissionTime')
train_data = pd.merge_asof(
    events_sorted,
    pros_sorted,
    left_on='DateOfEvent',
    right_on='SubmissionTime',
    by='PatientId',
    direction='backward',
)
|
|
| |
| |
| |
# --- Comorbidities ----------------------------------------------------------
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])

# Every remaining column other than the patient key is treated as a
# comorbidity flag.
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('PatientId')

# Keep train patients only. The explicit copy makes the filtered frame an
# independent object, so the column assignment below is not a chained
# assignment on a slice.
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)].copy()

# Report distinct patients rather than rows, so the figure stays correct even
# if the file holds more than one row for a patient.
print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    comorbidities.PatientId.nunique(), len(data.PatientId.unique())))

# Flags become 1/0; a missing flag counts as "not present".
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Attach the flags to every patient day. Patients without a comorbidity record
# get NaN from the left join, which is then filled with 0.
train_data = train_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      train_data[comorbidity_list].sum())
train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)

# Total number of comorbidities per row, and the per-patient maximum of it.
train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()

# Flag patients that have at least one exacerbation row (1), everyone else 0.
comorb_counts.loc[comorb_counts.StudyId.isin(exac_patients), 'IsExacPatient'] = 1
comorb_counts['IsExacPatient'] = comorb_counts['IsExacPatient'].fillna(0)

# AsthmaOverlap stays in train_data as its own column; the other individual
# flags are dropped now that they are summarised in 'Comorbidities'.
comorbidity_list.remove('AsthmaOverlap')
train_data = train_data.drop(columns=comorbidity_list)
|
|
| |
| |
| |
| |
# --- Usual therapies: inhaler type per patient ------------------------------
inhaler_type = pd.read_csv(
    '<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
    usecols=['StudyId', 'InhalerType'],
    delimiter='|',
)

# Restrict to patients present in the train table.
in_train = inhaler_type['StudyId'].isin(data['StudyId'])
inhaler_type = inhaler_type[in_train]

# Project helper; the result is expected to carry a 'TripleTherapy' column
# (read by the print below) keyed on StudyId.
inhaler_type = copd.triple_inhaler_therapy_service(
    df=inhaler_type,
    id_col='StudyId',
    inhaler_col='InhalerType',
    include_mitt=True,
)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type['TripleTherapy'].value_counts())

# Left join keeps every patient day, including patients without therapy rows.
train_data = pd.merge(train_data, inhaler_type, on='StudyId', how='left')
|
|
| |
| |
| |
|
|
| |
| |
# Replace numeric answer codes with text labels. For the three symptom diary
# questions a missing answer becomes the literal label 'None'.
diary_answer_labels = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult',
                       3: 'Quite difficult', 4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid',
                       4: 'Solid', np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green',
                        4: 'Dark green', np.nan: 'None'},
}
for diary_col, answer_labels in diary_answer_labels.items():
    train_data[diary_col] = train_data[diary_col].replace(answer_labels)

# Smoking status codes to labels; missing values are left untouched here.
smoking_labels = {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'}
train_data['SmokingStatus'] = train_data['SmokingStatus'].replace(smoking_labels)

# Boolean copy of the IsExac target.
train_data['InExacWindow'] = train_data['IsExac'].replace({0: False, 1: True})
|
|
| |
| |
| |
|
|
# Maximum age (in days) of the PRO submission attached to a patient day.
DaysSinceCAT_cutoff = 14

# Whole days between the event date and the attached PRO submission. Where
# SubmissionTime is missing (NaT) the difference is NaN, so the integer cast is
# postponed until those rows have been removed - casting NaN to int raises.
days_since_cat = (train_data.DateOfEvent - train_data.SubmissionTime).dt.days

# NaN <= cutoff evaluates to False, so rows without a submission are dropped
# together with rows whose submission is older than the cutoff.
has_recent_cat = days_since_cat <= DaysSinceCAT_cutoff

# .copy() gives an independent frame, so later column assignments do not act on
# a slice of the unfiltered table.
train_data = train_data[has_recent_cat].copy()
train_data['DaysSinceCAT'] = days_since_cat[has_recent_cat].astype('int')
| |
| |
| |
|
|
| |
# Bin edges and labels for the columns that are turned into categories.
# How values exactly on an edge are assigned is decided inside
# copd.bin_numeric_column, which is not visible here.
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']

age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']

comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']

# Each column is overwritten in place by its binned version.
binning_spec = (
    ('DaysSinceLastExac', exac_bins, exac_labels),
    ('Age', age_bins, age_labels),
    ('Comorbidities', comorb_bins, comorb_labels),
)
for binned_col, bin_edges, bin_names in binning_spec:
    train_data[binned_col] = copd.bin_numeric_column(
        col=train_data[binned_col], bins=bin_edges, labels=bin_names)

# Same comorbidity bins applied to the per-patient counts, stored in a new
# column so the raw count is kept alongside.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
|
|
| |
# FEV1 % predicted is binned into four severity bands.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

# The raw measurement is replaced by its binned version.
train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])

# Rows without a usable FEV1 value are assigned to 'Mild'. Both a real missing
# value and the string 'nan' are matched, so the fill does not depend on
# whether copd.bin_numeric_column returns strings (not visible here).
fev1_missing = (train_data['FEV1PercentPredicted'].isna()
                | (train_data['FEV1PercentPredicted'] == 'nan'))
train_data.loc[fev1_missing, 'FEV1PercentPredicted'] = 'Mild'

# Print the distribution; as a bare expression the result was discarded when
# the file is run as a script.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
|
|
| |
| |
| |
# Binary flag: 1 when the highest eosinophil count is at least 0.3, else 0.
# A missing count compares False and therefore yields 0.
eosinophil_high = train_data['LabsHighestEosinophilCount'] >= 0.3
train_data['HighestEosinophilCount_0_3'] = np.where(eosinophil_high, 1, 0)

# The raw lab value is not kept.
train_data = train_data.drop(columns='LabsHighestEosinophilCount')
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Columns handed to the k-fold encoder. They are cast to str first, which also
# turns any remaining missing value into a string category.
categorical_columns = [
    'SmokingStatus',
    'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10',
    'DaysSinceLastExac', 'Age', 'Comorbidities', 'FEV1PercentPredicted',
]
train_data[categorical_columns] = train_data[categorical_columns].astype(str)

# Project helper: encodes the categorical columns against the IsExac target
# using the patient folds (details of the encoding are not visible here).
data_encoded = copd.kfold_encode_train_data(
    df=train_data,
    fold_patients=fold_patients,
    cols_to_encode=categorical_columns,
    target='IsExac',
    id_col='StudyId',
)

# Drop the original categorical columns together with identifier, date and
# duplicate-target columns. StudyId and IsExac stay: the cross-validation
# helpers take them as id_col and target.
non_feature_columns = ['PatientId', 'InExacWindow', 'DateOfEvent', 'SubmissionTime',
                       'FirstSubmissionDate', 'LatestPredictionDate']
data_encoded = data_encoded.drop(columns=categorical_columns + non_feature_columns)
|
|
# Arguments shared by both cross-validation processing steps.
kfold_common = dict(fold_patients=fold_patients, id_col='StudyId', target='IsExac')

# Step 1: min-max scaling, applied through the project's k-fold helper.
scaler = MinMaxScaler()
train_data_scaled = crossvalidation.kfold_process_train_data(
    df=data_encoded, processor=scaler, **kfold_common)

# Step 2: fill remaining NaNs with the median, through the same helper.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
train_data_imputed = crossvalidation.kfold_process_train_data(
    df=train_data_scaled, processor=imputer, **kfold_common)
| |
| |
| |
|
|
| |
# Write the encoded, scaled and imputed train set; create the folder if needed.
os.makedirs(output_data_dir, exist_ok=True)
cv_output_path = os.path.join(output_data_dir, 'train_data_cv.pkl')
train_data_imputed.to_pickle(cv_output_path)
print('Final train data saved (CV)')
|
|