| """ |
| Derive features from lab tests for 2 models: |
| Parallel model 1: uses both hospital and community exacerbation events |
| Parallel model 2: uses only hospital exacerbation events |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| import sys |
| import os |
| import model_h |
| import ggc.preprocessing.labs as labs_preprocessing |
| import yaml |
|
|
|
|
def calc_lab_metric(lab_df, data, lab_name, metric, weigh_data_by_recency=False):
    """
    Calculate a summary metric for one lab test and merge it onto ``data``.

    Args:
        lab_df (pd.DataFrame): long-format lab results containing at least the
            columns ``StudyId``, ``IndexDate``, ``TimeSinceLab`` and ``lab_name``.
        data (pd.DataFrame): main dataframe (keyed by ``StudyId``/``IndexDate``)
            onto which the calculated metric column is left-merged.
        lab_name (str): name of the single lab column used in the calculation.
        metric (str): name of metric to be calculated. The possible metrics are:
            'MaxLifetime': maximum lab value for the patient before their
                index date, over the entire dataset.
            'MinLifetime': minimum lab value for the patient before their
                index date, over the entire dataset.
            'Max1Year': maximum lab value within 1 year prior to index date.
            'Min1Year': minimum lab value within 1 year prior to index date.
            'Latest': closest lab value prior to index date (within 1 year);
                also emits a ``LatestTimeSinceLab`` recency column.
        weigh_data_by_recency (bool): option to weigh data based on how recent
            it is (only applied for 'Latest'). Older observations are moved
            towards the median via ``model_h.weigh_features_by_recency``.
            Defaults to False.

    Returns:
        pd.DataFrame: the input ``data`` with additional column(s) named
        ``metric + lab_name`` (plus ``LatestTimeSinceLab`` for 'Latest').

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    valid_metrics = {"MaxLifetime", "MinLifetime", "Max1Year", "Min1Year", "Latest"}
    if metric not in valid_metrics:
        raise ValueError(
            "Unknown metric '{}'; expected one of {}".format(
                metric, sorted(valid_metrics)
            )
        )

    # .copy() so the column assignments below never write through to lab_df.
    labs_calc = lab_df[["StudyId", "IndexDate", "TimeSinceLab", lab_name]].copy()

    if metric in ("Max1Year", "Min1Year"):
        labs_calc = labs_calc[labs_calc["TimeSinceLab"] <= 365]
    if metric in ("MaxLifetime", "Max1Year"):
        labs_calc = labs_calc.groupby(["StudyId", "IndexDate"]).max()
        labs_calc = labs_calc.drop(columns=["TimeSinceLab"])
    if metric in ("MinLifetime", "Min1Year"):
        labs_calc = labs_calc.groupby(["StudyId", "IndexDate"]).min()
        labs_calc = labs_calc.drop(columns=["TimeSinceLab"])
    if metric == "Latest":
        labs_calc = labs_calc[labs_calc["TimeSinceLab"] <= 365]
        labs_calc = labs_calc.sort_values(
            by=["StudyId", "IndexDate", "TimeSinceLab"], ascending=True
        )
        # Recency is only meaningful where the lab value itself is present.
        labs_calc["TimeSinceLab"] = np.where(
            labs_calc[lab_name].isna(), np.nan, labs_calc["TimeSinceLab"]
        )
        # Backfill within each patient/index-date group only: a plain
        # DataFrame.bfill() would leak the next group's values into patients
        # whose labs are all missing.
        fill_cols = [lab_name, "TimeSinceLab"]
        labs_calc[fill_cols] = labs_calc.groupby(["StudyId", "IndexDate"])[
            fill_cols
        ].bfill()
        # After the backfill the first row per group holds the closest
        # available (possibly still NaN) lab value.
        labs_calc = labs_calc.drop_duplicates(
            subset=["StudyId", "IndexDate"], keep="first"
        )
        if weigh_data_by_recency:
            median_val = labs_calc[lab_name].median()
            labs_calc = model_h.weigh_features_by_recency(
                df=labs_calc,
                feature=lab_name,
                feature_recency_days="TimeSinceLab",
                median_value=median_val,
                decay_rate=0.001,
            )
        labs_calc = labs_calc.set_index(["StudyId", "IndexDate"])

    # Prefix only the value column(s); the (StudyId, IndexDate) index is
    # untouched and restored as merge keys.
    labs_calc = labs_calc.add_prefix(metric)
    labs_calc = labs_calc.reset_index()
    data = data.merge(labs_calc, on=["StudyId", "IndexDate"], how="left")
    return data
|
|
|
|
# Load the training configuration.  A distinct name for the file handle keeps
# the parsed `config` dict from shadowing it, and the encoding is pinned
# rather than relying on the platform default.
with open("./training/config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Model variant being trained (e.g. hospital-only vs hospital+community).
model_type = config["model_settings"]["model_type"]

# Redirect stdout so every print() in this script (including those emitted by
# the preprocessing helpers) lands in a per-model log file.  The handle is
# deliberately left open for the lifetime of the script; it is flushed and
# closed at interpreter exit.
log = open(
    "./training/logging/process_labs_" + model_type + ".log", "w", encoding="utf-8"
)
sys.stdout = log
|
|
| |
# Which dataset this run processes: forward validation or training.
data_to_process = config["model_settings"]["data_to_process"]

# Forward-validation runs use the dedicated forward-validation extracts;
# otherwise load the model-type-specific training labels.
if data_to_process == "forward_val":
    data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
    patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
else:
    data = pd.read_pickle(f"./data/patient_labels_{model_type}.pkl")
    patient_details = pd.read_pickle("./data/patient_details.pkl")

# Keep only the join keys, then attach each patient's PatientId.
data = data[["StudyId", "IndexDate"]]
patient_details = data.merge(
    patient_details[["StudyId", "PatientId"]], on="StudyId", how="left"
)
|
|
| |
# Mapping from StudyId to SafeHavenID, needed to link patients to the raw
# labs extract.
id_mapping = pd.read_pickle("./data/sh_to_studyid_mapping.pkl")

# Blank out the SafeHaven mapping for StudyId "SU125" so that patient's labs
# are never linked.  NOTE(review): the reason for excluding this specific ID
# is not recorded here -- confirm with the data owners.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
id_mapping["SafeHavenID"] = np.where(
    id_mapping["StudyId"] == "SU125", np.nan, id_mapping["SafeHavenID"]
)

# Restrict the mapping to patients present in this run and report coverage.
id_mapping = id_mapping.merge(
    data[["StudyId"]], on="StudyId", how="inner"
).drop_duplicates()
print(
    "Num patients with SafeHaven mapping: {} of {}".format(
        len(id_mapping), data.StudyId.nunique()
    )
)

# Patients without a mapping keep a NaN SafeHavenID and so pick up no labs.
patient_details = patient_details.merge(id_mapping, on="StudyId", how="left")

# Earliest date from which history is considered for each patient.
patient_details["LookbackStartDate"] = patient_details["IndexDate"] - pd.DateOffset(
    days=config["model_settings"]["lookback_period"]
)
|
|
| |
| |
| |
| |
# Columns required from the raw labs extract.
cols_to_use = [
    "SafeHavenID",
    "ClinicalCodeDescription",
    "QuantityUnit",
    "RangeHighValue",
    "RangeLowValue",
    "QuantityValue",
    "SampleDate",
]

labs = pd.read_csv(config["inputs"]["raw_data_paths"]["labs"], usecols=cols_to_use)

# Keep only labs belonging to patients in this run (linked via SafeHavenID).
labs = labs[labs.SafeHavenID.isin(patient_details.SafeHavenID)]

# Clean the raw labs, restricted to the tests of interest; presumably the
# lookup table harmonises units -- see ggc.preprocessing.labs.clean_labs_data
# for the actual cleaning rules.
lookup_table = pd.read_csv(config["inputs"]["raw_data_paths"]["labs_lookup_table"])
tests_of_interest = [
    "Eosinophils",
    "Albumin",
    "Neutrophils",
    "White Blood Count",
    "Lymphocytes",
]
labs_processed = labs_preprocessing.clean_labs_data(
    df=labs,
    tests_of_interest=tests_of_interest,
    units_lookup=lookup_table,
    print_log=True,
)

# Attach every lab to each (StudyId, IndexDate) pair for that patient, then
# compute the lab's age in days relative to the index date.
# NOTE(review): assumes IndexDate is timezone-aware (UTC) so the subtraction
# against the UTC-localised SampleDate is valid -- confirm upstream.
labs_processed = patient_details[["StudyId", "IndexDate", "SafeHavenID"]].merge(
    labs_processed, on="SafeHavenID", how="left"
)
labs_processed["SampleDate"] = pd.to_datetime(labs_processed["SampleDate"], utc=True)
labs_processed["TimeSinceLab"] = (
    labs_processed["IndexDate"] - labs_processed["SampleDate"]
).dt.days
|
|
| |
# Drop labs sampled after the index date (negative age); rows whose age is
# NaN (patients with no linked labs) also fail the comparison and are removed.
labs_processed = labs_processed[labs_processed["TimeSinceLab"] >= 0]

# Reshape to wide format: one column per lab test, one row per
# patient / index-date / lab-age combination.  pivot_table aggregates any
# duplicates with the mean (its default).
labs_processed = (
    labs_processed.pivot_table(
        values="QuantityValue",
        index=["StudyId", "IndexDate", "TimeSinceLab"],
        columns=["ClinicalCodeDescription"],
    ).reset_index()
)

# Derived feature: neutrophil-to-lymphocyte ratio.
neutrophils = labs_processed["Neutrophils"]
lymphocytes = labs_processed["Lymphocytes"]
labs_processed["NeutLymphRatio"] = neutrophils / lymphocytes
|
|
| |
# Feature: minimum albumin over the year before the index date.
data = calc_lab_metric(labs_processed, data, lab_name="Albumin", metric="Min1Year")

# Features: latest (recency-weighted) value of each lab of interest.
lab_names = [
    "NeutLymphRatio",
    "Albumin",
    "Eosinophils",
    "Neutrophils",
    "White Blood Count",
]
for lab_name in lab_names:
    data = calc_lab_metric(
        labs_processed,
        data,
        lab_name,
        metric="Latest",
        weigh_data_by_recency=True,
    )
|
|
| |
# Persist the derived lab features.  The two branches previously duplicated
# the whole to_pickle call; only the filename actually differs.
os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
if data_to_process == "forward_val":
    out_filename = "labs_forward_val_" + model_type + ".pkl"
else:
    out_filename = "labs_" + model_type + ".pkl"
data.to_pickle(os.path.join(config["outputs"]["processed_data_dir"], out_filename))
|
|