| |
| import pandas as pd |
| import numpy as np |
|
|
| |
# Folder holding the raw study extract read by this script
# (must end with a path separator: file names are appended directly).
input_file_path = "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/"
# Folder receiving the pickled summary outputs (same trailing-separator rule).
output_file_path = "<YOUR_DATA_PATH>/summary_files/"
|
|
|
|
# Text searched for inside each diagnosis description to flag a COPD
# admission (substring match, see track_feature with single=True).
copd = "CHRONIC OBSTRUCTIVE PULMONARY DISEASE"
|
|
|
|
# Diagnosis descriptions that flag a respiratory admission. A diagnosis
# counts only when it equals one of these entries exactly (see
# track_feature with single=False), so the strings must not be altered.
resp = [
    'PNEUMONITIS DUE TO FOOD AND VOMIT',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE UNSPECIFIED',
    'CHRONIC RESPIRATORY FAILURE; TYPE II [HYPERCAPNIC]',
    'BRONCHOPNEUMONIA, UNSPECIFIED',
    'DYSPNOEA',
    'PLEURAL EFFUSION IN CONDITIONS CLASSIFIED ELSEWHERE',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE [HYPERCAPNIC]',
    'PLEURAL EFFUSION, NOT ELSEWHERE CLASSIFIED',
    'CHRONIC RESPIRATORY FAILURE',
    'OTHER BACTERIAL PNEUMONIA',
    'ABN MICROBIOLOGICAL FINDINGS IN SPECS FROM RESPIRATORY ORGANS AND THORAX',
    'RESPIRATORY FAILURE, UNSPECIFIED',
    'PNEUMONIA, UNSPECIFIED',
    'LOBAR PNEUMONIA, UNSPECIFIED',
    'COUGH',
    'PLEURAL PLAQUE WITH PRESENCE OF ASBESTOS',
    'PLEURAL PLAQUE WITHOUT ASBESTOS',
    'OTHER DISORDERS OF LUNG',
    'OTHER SPECIFIED PLEURAL CONDITIONS',
    'PULMONARY COLLAPSE',
    'ACQUIRED ABSENCE OF LUNG [PART OF]',
    'ASPHYXIATION',
    'RESPIRATORY FAILURE, UNSPECIFIED; TYPE [HYPOXIC]',
    'TRACHEOSTOMY STATUS',
    'ACUTE RESPIRATORY FAILURE',
    'UNSPECIFIED ACUTE LOWER RESPIRATORY INFECTION',
    'OTHER SPECIFIED SYMPTOMS AND SIGNS INVOLVING THE CIRC AND RESP SYSTEMS',
    'BACTERIAL PNEUMONIA, UNSPECIFIED',
    'PYOTHORAX WITHOUT FISTULA',
    'DISEASES OF BRONCHUS, NOT ELSEWHERE CLASSIFIED',
    'PNEUMONIA DUE TO HAEMOPHILUS INFLUENZAE',
    'ABNORMAL SPUTUM',
    'OTHER POSTPROCEDURAL RESPIRATORY DISORDERS',
    'OTHER AND UNSPECIFIED ABNORMALITIES OF BREATHING',
    'INFLUENZA WITH OTHER RESP MANIFESTATIONS, SEASONAL INFLUENZA VIRUS IDENTIF',
    'PERSONAL HISTORY OF DISEASES OF THE RESPIRATORY SYSTEM',
    'PNEUMONIA DUE TO STREPTOCOCCUS PNEUMONIAE',
    'WHEEZING',
    'CHEST PAIN ON BREATHING',
    'HAEMOPTYSIS',
    'INFLUENZA WITH OTHER MANIFESTATIONS, VIRUS NOT IDENTIFIED',
    'OTHER SPECIFIED RESPIRATORY DISORDERS',
    'ACUTE UPPER RESPIRATORY INFECTION, UNSPECIFIED',
    'T.B. OF LUNG, W/O MENTION OF BACTERIOLOGICAL OR HISTOLOGICAL CONFIRMATION',
    'DEPENDENCE ON RESPIRATOR',
    'PLEURISY',
    'BRONCHITIS, NOT SPECIFIED AS ACUTE OR CHRONIC',
]
|
|
|
|
def read_data(file, cols, types):
    """
    Load selected columns of a cp1252-encoded CSV with explicit dtypes
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, paired positionally with cols
    :return: dataframe
    """
    dtype_by_column = {name: kind for name, kind in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=dtype_by_column,
        encoding="cp1252",
    )
|
|
|
|
def update_null_stay(df):
    """
    Fill null 'STAY' values with the whole number of days between the
    admission and discharge dates.
    --------
    df : pandas dataframe to be updated
        Must hold datetime 'ADMDATE' and 'DISDATE' columns and a 'STAY'
        column. Updated in place and also returned.
    """
    missing = df['STAY'].isnull()
    if missing.any():
        # Select by boolean mask rather than by position so that any number
        # of null rows, and any index (e.g. one with gaps left by
        # drop_duplicates), is handled correctly.
        elapsed = df.loc[missing, 'DISDATE'] - df.loc[missing, 'ADMDATE']
        df.loc[missing, 'STAY'] = elapsed.dt.days.astype(float)

    return df
|
|
|
|
def calculate_total_stay(df):
    """
    Model A:
    Calculate the cumulative (total) length of stay, given data for one
    patient already sorted by admission date then discharge date. A row is
    treated as a transfer when its admission date equals the previous row's
    discharge date. For each chain of transfers the stays are summed, the
    admission date is set to the first admission of the chain, and every row
    except the final (or only, if the patient was not transferred) record is
    dropped. Works for any number of transfers.

    The transfer flag is computed internally only; no 'transfer' column is
    added to or removed from the dataframe.

    df : pandas dataframe
        dataframe to be updated; its index is reset and it is modified in
        place, and the same object is returned
    """
    df.reset_index(inplace=True, drop=True)
    # Kept as a local array so that an existing 'transfer' column on the
    # frame is never overwritten or dropped.
    is_transfer = df['ADMDATE'].eq(df['DISDATE'].shift()).to_numpy()
    superseded = []
    for pos in range(1, len(df)):
        # Plain truthiness: an identity test against True would depend on
        # whether the flag is a Python bool or a numpy bool.
        if is_transfer[pos]:
            # The previous row already carries the running values for the
            # chain, so chains of any length accumulate correctly.
            df.at[pos, 'ADMDATE'] = df.at[pos - 1, 'ADMDATE']
            df.at[pos, 'STAY'] = df.at[pos, 'STAY'] + df.at[pos - 1, 'STAY']
            superseded.append(pos - 1)
    df.drop(index=superseded, inplace=True)

    return df
|
|
|
|
def track_copd_resp(df, track_type='both'):
    """
    Search for COPD and/or respiratory admissions
    --------
    df : pandas dataframe
        dataframe to be updated; must contain the columns 'DIAG1Desc' to
        'DIAG6Desc'. A 0/1 'copd_event' and/or 'resp_event' column is added
        in place and the same dataframe is returned.
    track_type : str
        'copd', 'resp' or 'both'; any other value leaves df unchanged
    """
    diag_columns = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
                    'DIAG5Desc', 'DIAG6Desc']
    df_diag = df[diag_columns]

    def _flag_rows(desc, single):
        # track_feature returns a plain list, which carries no row labels.
        # Re-attach each column's index so the flags align with df on
        # assignment whatever index df has.
        hits = df_diag.apply(
            lambda col: pd.Series(track_feature(col, desc, single),
                                  index=col.index))
        return hits.any(axis=1).astype(int)

    if track_type in ('copd', 'both'):
        df['copd_event'] = _flag_rows(copd, True)

    if track_type in ('resp', 'both'):
        df['resp_event'] = _flag_rows(resp, False)

    return df
|
|
|
|
def track_feature(x, desc, single):
    """
    Replace nulls with '' and test each entry of x against a description
    -------
    x : str list
        feature to track (must support fillna, e.g. a pandas Series)
    desc : str list
        string (single=True) or string list (single=False) to compare
    single : boolean
        True: an entry matches when desc occurs inside it, e.g. "COPD";
        False: an entry matches when it is a member of desc
    returns : list of booleans, one per entry of x
    """
    filled = x.fillna('')
    if not single:
        return [entry in desc for entry in filled]
    return [desc in entry for entry in filled]
|
|
|
|
def filter_data(data, date):
    """
    Keep only the rows that are COPD or respiratory admission events with an
    admission date on or after the index date
    --------
    :param data: dataframe with 'ADMDATE', 'copd_event' and 'resp_event'
        columns; its 'ADMDATE' column is converted to datetime in place
    :param date: index date
    :return: filtered dataframe
    """
    data['ADMDATE'] = pd.to_datetime(data['ADMDATE'])
    on_or_after_index = data['ADMDATE'] >= date
    is_event = (data['copd_event'] == 1) | (data['resp_event'] == 1)
    return data[on_or_after_index & is_event]
|
|
|
|
def calculate_time_to_first_copd_admission(data, date):
    """
    Calculate days to first COPD admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE' (datetime) and
        'copd_event' columns
    :param date: Index date, parsed with pd.to_datetime.
        NOTE(review): previously documented as 'DD-MM-YYYY', but no dayfirst
        option is passed, so ambiguous strings follow pandas' default
        parsing - confirm the intended format with callers.
    :return: dataframe indexed by SafeHavenID with columns
        'first_copd_admission', 'index_date' and
        'days_to_first_copd_admission' (days from the index date to the
        earliest COPD admission in data)
    """
    copd_rows = data[data['copd_event'] == 1]
    # The string 'min' selects the groupby reduction directly; passing
    # np.min triggers a FutureWarning on recent pandas versions.
    summary = copd_rows.groupby('SafeHavenID').agg(
        first_copd_admission=('ADMDATE', 'min'))
    summary['index_date'] = date
    summary['index_date'] = pd.to_datetime(summary['index_date'])
    summary['days_to_first_copd_admission'] = (
        summary['first_copd_admission'] - summary['index_date']).dt.days
    return summary
|
|
|
|
def calculate_time_to_first_resp_admission(data, date):
    """
    Calculate days to first resp admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE' (datetime) and
        'resp_event' columns
    :param date: Index date, parsed with pd.to_datetime.
        NOTE(review): previously documented as 'DD-MM-YYYY', but no dayfirst
        option is passed, so ambiguous strings follow pandas' default
        parsing - confirm the intended format with callers.
    :return: dataframe indexed by SafeHavenID with columns
        'first_resp_admission', 'index_date' and
        'days_to_first_resp_admission' (days from the index date to the
        earliest resp admission in data)
    """
    resp_rows = data[data['resp_event'] == 1]
    # The string 'min' selects the groupby reduction directly; passing
    # np.min triggers a FutureWarning on recent pandas versions.
    summary = resp_rows.groupby('SafeHavenID').agg(
        first_resp_admission=('ADMDATE', 'min'))
    summary['index_date'] = date
    summary['index_date'] = pd.to_datetime(summary['index_date'])
    summary['days_to_first_resp_admission'] = (
        summary['first_resp_admission'] - summary['index_date']).dt.days
    return summary
|
|
|
|
def calculate_time_to_first_copd_or_resp_admission(data, date):
    """
    Calculate days to first copd or resp admission
    --------
    :param data: dataframe with 'SafeHavenID', 'ADMDATE', 'copd_event' and
        'resp_event' columns. A 'copd_or_resp_event' column (element-wise OR
        of the two event columns) is added to it in place.
    :param date: Index date, parsed with pd.to_datetime.
        NOTE(review): previously documented as 'DD-MM-YYYY', but no dayfirst
        option is passed, so ambiguous strings follow pandas' default
        parsing - confirm the intended format with callers.
    :return: dataframe indexed by SafeHavenID with columns
        'first_copd_or_resp_admission', 'index_date' and
        'days_to_first_copd_or_resp_admission'
    """
    # Side effect kept on purpose: the combined flag is written back onto
    # the caller's frame.
    data['copd_or_resp_event'] = (data['resp_event'] | data['copd_event'])
    event_rows = data[data['copd_or_resp_event'] == 1]
    # The string 'min' selects the groupby reduction directly; passing
    # np.min triggers a FutureWarning on recent pandas versions.
    summary = event_rows.groupby('SafeHavenID').agg(
        first_copd_or_resp_admission=('ADMDATE', 'min'))
    summary['index_date'] = date
    summary['index_date'] = pd.to_datetime(summary['index_date'])
    summary['first_copd_or_resp_admission'] = pd.to_datetime(
        summary['first_copd_or_resp_admission'])
    summary['days_to_first_copd_or_resp_admission'] = (
        summary['first_copd_or_resp_admission']
        - summary['index_date']).dt.days
    return summary
|
|
|
|
def calculate_ad_count_1_year(data, year_censor, first_admission_df, adm_col):
    """
    Count, per ID, the flagged admissions dated before the censor date and
    attach that count to the time-to-first-admission data
    --------
    :param data: dataframe containing admissions dates ('ADMDATE') and
        'SafeHavenID'
    :param year_censor: censor date; rows with ADMDATE strictly before it
        are counted
    :param first_admission_df: dataframe showing days to first admission,
        keyed by SafeHavenID
    :param adm_col: binary column showing if an admission was copd or
        respiratory related or not; it is summed per ID
    :return: outer join of the per-ID counts
        ('admission_count_year_post_index', 0 where an ID has no counted
        row) with first_admission_df
    """
    count_col = 'admission_count_year_post_index'
    before_censor = data.loc[data['ADMDATE'] < year_censor]
    per_id_counts = before_censor.groupby('SafeHavenID').agg(
        **{count_col: (adm_col, 'sum')})
    combined = per_id_counts.merge(
        first_admission_df, on="SafeHavenID", how="outer")
    # IDs present only in first_admission_df have no count after the outer
    # join; report them as zero admissions.
    combined[count_col] = combined[count_col].fillna(0)
    return combined
|
|
|
|
def main():
    """
    Build the COPD / respiratory admission summaries.

    Reads the admissions extract from input_file_path, merges transferred
    stays per patient, flags COPD and respiratory admissions, keeps the
    flagged admissions from the index date onwards, and pickles the filtered
    admissions plus three per-patient summaries into output_file_path.
    """
    # Index date and the censor date one year later, as passed to every
    # step below.
    index_date = '01-01-2020'
    year_censor = '01-01-2021'

    # Load in data
    adm_file = input_file_path + "SMR01_Cohort3R.csv"
    adm_cols = ['SafeHavenID', 'ETHGRP', 'ADMDATE', 'DISDATE', 'DIAG1Desc',
                'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc', 'DIAG5Desc',
                'DIAG6Desc', 'STAY']
    # NOTE(review): 'STAY' is read as 'int', which presumably cannot hold
    # missing values, yet update_null_stay below expects nulls - confirm
    # whether this should be a float / nullable integer type.
    adm_types = ['int', 'object', 'object', 'object', 'str', 'str', 'str',
                 'str', 'str', 'str', 'int']
    adm = read_data(adm_file, adm_cols, adm_types)

    # Drop duplicate rows
    adm = adm.drop_duplicates()

    # Convert date columns to datetime
    adm['ADMDATE'] = pd.to_datetime(adm['ADMDATE'])
    adm['DISDATE'] = pd.to_datetime(adm['DISDATE'])

    # Fill any null STAY values from the dates
    adm = update_null_stay(adm)

    # Merge transferred stays per patient. The groups are iterated
    # explicitly (instead of groupby.apply) so that each call receives its
    # own copy including the SafeHavenID column on every pandas version.
    adm = adm.sort_values(['SafeHavenID', 'ADMDATE', 'DISDATE'])
    merged_stays = [calculate_total_stay(patient_rows.copy())
                    for _, patient_rows in adm.groupby('SafeHavenID')]
    if merged_stays:
        adm = pd.concat(merged_stays, ignore_index=True)
    else:
        # pd.concat rejects an empty list; an empty extract stays empty.
        adm = adm.reset_index(drop=True)

    # Strip surrounding whitespace from text columns
    adm = adm.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    # Flag COPD and respiratory admissions
    adm = track_copd_resp(adm)

    # Keep flagged admissions from the index date onwards
    adm = filter_data(adm, index_date)

    # Days to first admission of each kind
    first_copd_admission = calculate_time_to_first_copd_admission(
        adm, index_date)
    first_resp_admission = calculate_time_to_first_resp_admission(
        adm, index_date)
    # This call also adds the 'copd_or_resp_event' column to adm, which the
    # third count below relies on.
    first_resp_or_copd_admission = (
        calculate_time_to_first_copd_or_resp_admission(adm, index_date))

    # Admission counts before the censor date
    first_copd_admission = calculate_ad_count_1_year(
        adm, year_censor, first_copd_admission, 'copd_event')
    first_resp_admission = calculate_ad_count_1_year(
        adm, year_censor, first_resp_admission, 'resp_event')
    first_resp_or_copd_admission = calculate_ad_count_1_year(
        adm, year_censor, first_resp_or_copd_admission, 'copd_or_resp_event')

    # Save outputs
    adm.to_pickle(output_file_path + 'all_COPD_and_resp_admissions_from_index_date.pkl')
    first_copd_admission.to_pickle(output_file_path + 'copd_admissions_cohort_summary.pkl')
    first_resp_admission.to_pickle(output_file_path + 'resp_admissions_cohort_summary.pkl')
    first_resp_or_copd_admission.to_pickle(output_file_path + 'copd_or_resp_admissions_cohort_summary.pkl')
|
|
|
|
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse its helper functions) does not read or write data.
if __name__ == "__main__":
    main()
|
|