| import copd |
| import os |
| import pandas as pd |
| from scipy.stats import ks_2samp, cramervonmises_2samp |
| import seaborn as sns |
| import matplotlib.pyplot as plt |
# Global seaborn theme for every figure produced below.
sns.set(context='talk', style='darkgrid')
sns.set_palette('dark')

# Indexable colour lists, so individual colours can be picked for single plots.
muted = sns.color_palette('muted')
dark = sns.color_palette('dark')
|
|
# Folder holding the platform export; the placeholder must be replaced with a real path.
data_dir = '<YOUR_DATA_PATH>/lenus-samples-dataset'

# Read only the columns used by this analysis from the pipe-delimited sample file.
sample_path = os.path.join(data_dir, "DataServerDatasetSample.txt")
sample_columns = ['StartDate', 'EndDate',
                  'CreatorSubject', 'QuantityId',
                  'TypeIdentifier', 'CreationDate']
lenus_sample = pd.read_csv(sample_path, sep="|", usecols=sample_columns)

# Parse every date column as UTC and truncate it to midnight, so that later
# joins and daily grouping operate on whole days rather than timestamps.
date_cols = ['StartDate', 'EndDate', 'CreationDate']
for col in date_cols:
    parsed = pd.to_datetime(lenus_sample[col], utc=True)
    lenus_sample[col] = parsed.dt.normalize()
|
|
| |
| |
# Quantity records (pipe-delimited) are joined to the samples through QuantityId -> Id.
quantity_path = os.path.join(data_dir, "DataServerDatasetQuantity.txt")
lenus_quantity = pd.read_csv(quantity_path, sep="|")

platform_data = pd.merge(lenus_sample, lenus_quantity,
                         left_on='QuantityId', right_on='Id')
platform_data = platform_data.drop(columns=['Id'])

# Translate the raw unit codes via the project helper, then attach the
# type lookup rows by matching TypeIdentifier against the lookup's row index.
platform_data['Units'] = copd.unit_lookup(platform_data['Unit'])
type_lookup = pd.read_csv('./lookups/type_lookup.txt')
platform_data = pd.merge(platform_data, type_lookup,
                         left_on='TypeIdentifier', right_on=type_lookup.index)

# The raw identifier columns are no longer needed once the lookups are applied.
platform_data = platform_data.drop(columns=['TypeIdentifier', 'Unit'])

# Go from long to wide: one column per Description holding its Value.
# pivot_table aggregates duplicate index/column combinations with its default (mean).
pivot_index = ['StartDate', 'EndDate', 'CreationDate', 'CreatorSubject']
platform_data = platform_data.pivot_table(values='Value', index=pivot_index,
                                          columns=['Description'])
platform_data = platform_data.reset_index()
|
|
# Previously prepared dataset (presumably the labelled exacerbation data - confirm).
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
exac_path = os.path.join('<YOUR_DATA_PATH>/copd-dataset', 'exac_data.pkl')
data = pd.read_pickle(exac_path)

# Distinct LenusId values, used below to restrict the platform data.
patients = data['LenusId'].unique()
|
|
|
|
def filter_on_date_and_id(df, min_date, patients):
    """Return the rows of ``df`` created on/after ``min_date`` by one of ``patients``.

    Args:
        df: DataFrame with ``CreationDate`` and ``CreatorSubject`` columns.
        min_date: Inclusive lower bound compared against ``CreationDate``.
        patients: Collection of ``CreatorSubject`` values to keep.

    Returns:
        The subset of ``df`` whose rows satisfy both conditions.
    """
    recent_enough = df['CreationDate'] >= min_date
    known_patient = df['CreatorSubject'].isin(patients)
    return df.loc[recent_enough & known_patient]
|
|
|
|
def resample_and_merge_median(df, fitbit):
    """Merge ``df`` with the per-patient daily median of the ``fitbit`` measurements.

    Args:
        df: DataFrame with ``LenusId`` and ``DateOfEvent`` columns.
        fitbit: DataFrame with ``CreationDate`` (datetime), ``CreatorSubject``
            and one or more numeric measurement columns.

    Returns:
        Inner join of ``df`` with the daily medians, matched on
        ``LenusId == CreatorSubject`` and ``DateOfEvent == CreationDate``
        (``CreationDate`` being the start of the day the median covers).
        Days without a complete set of medians are not included.
    """
    # Group on the patient plus a daily time bin. Unlike groupby().resample(),
    # this never passes the CreatorSubject column itself to the aggregation,
    # which (depending on the pandas version) fails for non-numeric IDs.
    per_patient_day = ['CreatorSubject', pd.Grouper(key='CreationDate', freq='1D')]
    daily = (fitbit.groupby(per_patient_day)
             .median(numeric_only=True)
             .dropna()
             .reset_index())
    return df.merge(daily, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'CreationDate'], how='inner')
|
|
|
|
def resample_and_merge_last(df, fitbit):
    """Merge ``df`` with the last ``fitbit`` measurement of each patient-day.

    Args:
        df: DataFrame with ``LenusId`` and ``DateOfEvent`` columns.
        fitbit: DataFrame with ``CreationDate`` (datetime), ``CreatorSubject``
            and one or more measurement columns. It is not modified.

    Returns:
        Inner join of ``df`` with the per-day last values, matched on
        ``LenusId == CreatorSubject`` and on ``DateOfEvent``, where the
        right-hand ``DateOfEvent`` is the last ``CreationDate`` seen that day.
    """
    # assign() returns a new frame, so the caller's DataFrame (often a filtered
    # slice of a larger one) does not silently gain a DateOfEvent column.
    stamped = fitbit.assign(DateOfEvent=fitbit['CreationDate'])

    # Group on the patient plus a daily time bin; CreatorSubject then always comes
    # back from the group index, independent of how the installed pandas treats
    # grouping columns in groupby().resample().
    per_patient_day = ['CreatorSubject', pd.Grouper(key='CreationDate', freq='1D')]
    daily = (stamped.groupby(per_patient_day)
             .last()
             .dropna()
             .reset_index()
             .drop(columns=['CreationDate']))  # bin label; DateOfEvent is the join key
    return df.merge(daily, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'DateOfEvent'], how='inner')
|
|
|
|
def print_numbers(df, measurement):
    """Print row, patient and exacerbation counts for a merged dataset.

    Two lines are written to stdout: the number of rows and unique
    ``PatientId`` values, then the sum of ``IsExac`` and the number of unique
    ``PatientId`` values among rows with ``IsExac == 1``. Each line also
    reports how many unique ``StudyId`` values start with 'RC' and with 'SU'.

    Args:
        df: DataFrame with ``StudyId`` (string), ``PatientId`` and ``IsExac`` columns.
        measurement: Label for the measurement, inserted into the first line.
    """
    study_ids = pd.Series(df['StudyId'].unique())
    n_patients = len(df['PatientId'].unique())
    n_rc = study_ids.str.startswith('RC').sum()
    n_su = study_ids.str.startswith('SU').sum()
    print(f'{len(df)} patient days with {measurement} data across '
          f'{n_patients} unique patients ({n_rc} RC and {n_su} SU)')

    exac_rows = df[df['IsExac'] == 1]
    exac_study_ids = pd.Series(exac_rows['StudyId'].unique())
    n_exac = df['IsExac'].sum()
    n_exac_patients = len(exac_rows['PatientId'].unique())
    n_exac_rc = exac_study_ids.str.startswith('RC').sum()
    n_exac_su = exac_study_ids.str.startswith('SU').sum()
    print(f'{n_exac} exacerbations across {n_exac_patients} patients '
          f'({n_exac_rc} RC and {n_exac_su} SU)')
|
|
|
|
| |
# Single cutoff shared by both measurements: rows created before it are discarded.
min_creation_date = '2010-01-01'

# Heart rate: keep rows that actually carry a value, restrict them to the
# study patients, then join the last reading of each day onto the event data.
heart_rate = platform_data.loc[platform_data['heart rate'].notna(),
                               ['CreationDate', 'CreatorSubject', 'heart rate']]
heart_rate = filter_on_date_and_id(heart_rate, min_date=min_creation_date,
                                   patients=patients)
hr_data = resample_and_merge_last(df=data, fitbit=heart_rate)
print_numbers(hr_data, 'HR')

# Step count: same selection, but joined as the daily median.
# The trailing ';' is part of the column name coming out of the pivot.
steps = platform_data.loc[platform_data['number of steps taken;'].notna(),
                          ['CreationDate', 'CreatorSubject', 'number of steps taken;']]
steps = filter_on_date_and_id(steps, min_date=min_creation_date, patients=patients)
steps_data = resample_and_merge_median(df=data, fitbit=steps)
print_numbers(steps_data, 'steps')
|
|
# --- Heart rate -------------------------------------------------------------
# Only patients with at least one exacerbation row are compared, so both
# samples below come from the same set of patients.
hr_exac_patients = hr_data.loc[hr_data['IsExac'] == 1, 'PatientId'].unique()
hr_data = hr_data[hr_data['PatientId'].isin(hr_exac_patients)]

hr_exac = hr_data.loc[hr_data['IsExac'] == 1, 'heart rate']
hr_no_exac = hr_data.loc[hr_data['IsExac'] == 0, 'heart rate']

# Store and print the results so they are visible when this runs as a script
# (a bare expression statement would compute them and throw them away).
hr_ks = ks_2samp(hr_exac, hr_no_exac)
hr_cvm = cramervonmises_2samp(hr_exac, hr_no_exac)
print('heart rate, exacerbation vs non-exacerbation - KS: {}'.format(hr_ks))
print('heart rate, exacerbation vs non-exacerbation - Cramer-von Mises: {}'.format(hr_cvm))

# --- Steps ------------------------------------------------------------------
steps_exac_patients = steps_data.loc[steps_data['IsExac'] == 1, 'PatientId'].unique()
steps_data = steps_data[steps_data['PatientId'].isin(steps_exac_patients)]

steps_exac = steps_data.loc[steps_data['IsExac'] == 1, 'number of steps taken;']
steps_no_exac = steps_data.loc[steps_data['IsExac'] == 0, 'number of steps taken;']

steps_ks = ks_2samp(steps_exac, steps_no_exac)
steps_cvm = cramervonmises_2samp(steps_exac, steps_no_exac)
print('steps, exacerbation vs non-exacerbation - KS: {}'.format(steps_ks))
print('steps, exacerbation vs non-exacerbation - Cramer-von Mises: {}'.format(steps_cvm))
|
|
# Side-by-side density histograms of heart rate with shared axes:
# panel 'a' shows rows with IsExac == 0, panel 'b' rows with IsExac == 1.
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
                         constrained_layout=True, figsize=(8, 6))
panels = [(0, 'a', dark[0]), (1, 'b', dark[1])]
for ax, (is_exac, panel_label, colour) in zip(axes, panels):
    sns.histplot(hr_data[hr_data.IsExac == is_exac], x="heart rate", binwidth=5,
                 binrange=[50, 100], alpha=.6, stat="density", ax=ax, color=colour)
    # The x label is set once for the whole figure instead of once per panel.
    ax.set_xlabel(None)
    # Attach the legend to this panel explicitly: plt.legend() targets the
    # "current" axes, which is not the axes passed to histplot via ax=.
    ax.legend([panel_label])
fig.supxlabel('heart rate')
|
|