anicolson committed
Commit 453bf0e
1 Parent(s): 28abdb1

Upload model

Files changed (3):
  1. dataset.py +41 -13
  2. lmdb_jpg.py +69 -0
  3. modelling_cxrmate_ed.py +42 -17
dataset.py CHANGED
@@ -1,9 +1,10 @@
 import os
 
+import lmdb
 import pandas as pd
 import torch
 from torch.utils.data import Dataset
-from torchvision.io import read_image
+from torchvision.io import decode_image, read_image
 
 # Ordered by oblique, lateral, AP, and then PA views so that PA views are closest in position to the generated tokens (and oblique is furthest).
 VIEW_ORDER = ['LPO', 'RAO', 'LAO', 'SWIMMERS', 'XTABLE LATERAL', 'LL', 'LATERAL', 'AP AXIAL', 'AP RLD', 'AP LLD', 'AP', 'PA RLD', 'PA LLD', 'PA']
@@ -25,7 +26,8 @@ class StudyIDEDStayIDSubset(Dataset):
         self,
         split,
         records,
-        dataset_dir=None,
+        mimic_cxr_jpg_lmdb_path=None,
+        mimic_cxr_dir=None,
         max_images_per_study=None,
         transforms=None,
         images=True,
@@ -39,8 +41,9 @@ class StudyIDEDStayIDSubset(Dataset):
         """
         Argument/s:
             split - 'train', 'validate', or 'test'.
-            dataset_dir - Dataset directory.
             records - MIMIC-CXR & MIMIC-IV-ED records class instance.
+            mimic_cxr_jpg_lmdb_path - JPG database for MIMIC-CXR-JPG.
+            mimic_cxr_dir - Path to the MIMIC-CXR directory containing the patient study subdirectories with the JPG or DCM images.
             max_images_per_study - the maximum number of images per study.
             transforms - torchvision transformations.
             colour_space - PIL target colour space.
@@ -54,7 +57,8 @@ class StudyIDEDStayIDSubset(Dataset):
         """
         super(StudyIDEDStayIDSubset, self).__init__()
         self.split = split
-        self.dataset_dir = dataset_dir
+        self.mimic_cxr_jpg_lmdb_path = mimic_cxr_jpg_lmdb_path
+        self.mimic_cxr_dir = mimic_cxr_dir
         self.records = records
         self.max_images_per_study = max_images_per_study
         self.transforms = transforms
@@ -68,15 +72,16 @@ class StudyIDEDStayIDSubset(Dataset):
         # If max images per study is not set:
         self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
 
-        assert self.extension == 'jpg' or self.extension == 'dcm'
+        assert self.extension == 'jpg' or self.extension == 'dcm', '"extension" can only be either "jpg" or "dcm".'
+        assert (mimic_cxr_jpg_lmdb_path is None) != (mimic_cxr_dir is None), 'Exactly one of "mimic_cxr_jpg_lmdb_path" or "mimic_cxr_dir" must be set.'
 
-        if self.dataset_dir is not None:
+        if self.mimic_cxr_dir is not None and self.mimic_cxr_jpg_lmdb_path is None:
             if self.extension == 'jpg':
-                if 'physionet.org/files/mimic-cxr-jpg/2.0.0/files' not in self.dataset_dir:
-                    self.dataset_dir = os.path.join(self.dataset_dir, 'physionet.org/files/mimic-cxr-jpg/2.0.0/files')
+                if 'physionet.org/files/mimic-cxr-jpg/2.0.0/files' not in self.mimic_cxr_dir:
+                    self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr-jpg/2.0.0/files')
             elif self.extension == 'dcm':
-                if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.dataset_dir:
-                    self.dataset_dir = os.path.join(self.dataset_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
+                if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.mimic_cxr_dir:
+                    self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
 
         query = f"""
             SELECT {columns}
@@ -108,6 +113,18 @@ class StudyIDEDStayIDSubset(Dataset):
         self.num_dicom_ids = len(df['dicom_id'].unique().tolist())
         self.num_subject_ids = len(df['subject_id'].unique().tolist())
 
+        # Prepare the LMDB .jpg database:
+        if self.mimic_cxr_jpg_lmdb_path is not None:
+
+            print('Loading images using LMDB.')
+
+            # Map size:
+            map_size = int(0.65 * (1024 ** 4))
+            assert isinstance(map_size, int)
+
+            self.env = lmdb.open(self.mimic_cxr_jpg_lmdb_path, map_size=map_size, lock=False, readonly=True)
+            self.txn = self.env.begin(write=False)
+
     def __len__(self):
         return self.num_study_ids
 
@@ -212,9 +229,20 @@ class StudyIDEDStayIDSubset(Dataset):
         """
 
         if self.extension == 'jpg':
-
-            image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
-            image = read_image(image_file_path)
+
+            if self.mimic_cxr_jpg_lmdb_path is not None:
+
+                # Convert to bytes:
+                key = bytes(dicom_id, 'utf-8')
+
+                # Retrieve image:
+                image = bytearray(self.txn.get(key))
+                image = torch.frombuffer(image, dtype=torch.uint8)
+                image = decode_image(image)
+
+            else:
+                image_file_path = mimic_cxr_image_path(self.mimic_cxr_dir, subject_id, study_id, dicom_id, self.extension)
+                image = read_image(image_file_path)
 
         elif self.extension == 'dcm':
             raise NotImplementedError
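For context, the new read path keys raw JPEG bytes by dicom_id and decodes them with torchvision. A minimal sketch of that path in isolation, assuming an LMDB database already built by lmdb_jpg.py (the path and dicom_id below are placeholders):

import lmdb
import torch
from torchvision.io import decode_image

lmdb_path = '/data/mimic_cxr_jpg_lmdb.db'  # placeholder path to the LMDB database
dicom_id = 'example-dicom-id'              # placeholder key; real keys are MIMIC-CXR dicom_ids

env = lmdb.open(lmdb_path, readonly=True, lock=False)  # read-only, no file locking
with env.begin(write=False) as txn:
    value = txn.get(bytes(dicom_id, 'utf-8'))  # raw JPEG bytes stored by lmdb_jpg.py

image = torch.frombuffer(bytearray(value), dtype=torch.uint8)  # 1-D uint8 tensor of encoded bytes
image = decode_image(image)                                    # decode to a CHW uint8 tensor
env.close()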
lmdb_jpg.py ADDED
@@ -0,0 +1,69 @@
+import multiprocessing
+
+import duckdb
+import lmdb
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+
+from .dataset import mimic_cxr_image_path
+
+
+class JPGDataset(Dataset):
+    def __init__(self, df, jpg_path):
+        self.df = df
+        self.jpg_path = jpg_path
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+
+        row = self.df.iloc[idx]
+
+        jpg_path = mimic_cxr_image_path(self.jpg_path, row['subject_id'], row['study_id'], row['dicom_id'], 'jpg')
+
+        # Convert key to bytes:
+        key = bytes(row['dicom_id'], 'utf-8')
+
+        # Read the .jpg file as bytes:
+        with open(jpg_path, 'rb') as f:
+            image = f.read()
+
+        return {
+            'keys': key,
+            'images': image,
+        }
+
+def prepare_mimic_cxr_jpg_lmdb(mimic_iv_duckdb_path, mimic_cxr_jpg_path, mimic_cxr_jpg_lmdb_path, map_size_tb, num_workers=None):
+
+    num_workers = num_workers if num_workers is not None else multiprocessing.cpu_count()
+
+    connect = duckdb.connect(mimic_iv_duckdb_path, read_only=True)
+    df = connect.sql("SELECT DISTINCT ON(dicom_id) subject_id, study_id, dicom_id FROM mimic_cxr").df()
+    connect.close()
+
+    # Map size:
+    map_size = int(map_size_tb * (1024 ** 4))
+    assert isinstance(map_size, int)
+
+    print(f'Map size: {map_size}')
+
+    dataset = JPGDataset(df, mimic_cxr_jpg_path)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=num_workers,
+        shuffle=False,
+        num_workers=num_workers,
+        prefetch_factor=1,
+        collate_fn=lambda x: x,
+    )
+
+    env = lmdb.open(mimic_cxr_jpg_lmdb_path, map_size=map_size, readonly=False)
+    for batch in tqdm(dataloader):
+        for i in batch:
+            with env.begin(write=True) as txn:
+                value = txn.get(i['keys'])  # Skip dicom_ids that are already in the database.
+                if value is None:
+                    txn.put(i['keys'], i['images'])
+    env.sync()
+    env.close()
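A usage sketch for the new helper; all paths are hypothetical, and the DuckDB database is assumed to already contain the mimic_cxr table built by prepare_data:

from lmdb_jpg import prepare_mimic_cxr_jpg_lmdb  # imported as .lmdb_jpg inside the package

prepare_mimic_cxr_jpg_lmdb(
    mimic_iv_duckdb_path='/data/database/mimic_iv_duckdb.db',                  # hypothetical path
    mimic_cxr_jpg_path='/data/physionet.org/files/mimic-cxr-jpg/2.0.0/files',  # hypothetical path
    mimic_cxr_jpg_lmdb_path='/data/database/mimic_cxr_jpg_lmdb.db',            # hypothetical path
    map_size_tb=0.65,   # LMDB map size in TB; prepare_data uses the same value
    num_workers=8,      # defaults to multiprocessing.cpu_count() when omitted
)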
modelling_cxrmate_ed.py CHANGED
@@ -21,6 +21,7 @@ from transformers.utils import logging
 
 from .create_section_files import create_section_files
 from .dataset import StudyIDEDStayIDSubset
+from .lmdb_jpg import prepare_mimic_cxr_jpg_lmdb
 from .modelling_uniformer import MultiUniFormerWithProjectionHead
 from .records import EDCXRSubjectRecords
 from .tables import ed_module_tables, mimic_cxr_tables
@@ -917,11 +918,14 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         return position_ids
 
     @staticmethod
-    def prepare_data(physionet_dir, database_path, dataset_dir=None):
+    def prepare_data(physionet_dir, database_dir):
 
-        dataset_dir = physionet_dir if dataset_dir is None else dataset_dir
-
-        sectioned_dir = os.path.join(dataset_dir, 'mimic_cxr_sectioned')
+        Path(database_dir).mkdir(parents=True, exist_ok=True)
+
+        mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
+        mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db')
+
+        sectioned_dir = os.path.join(database_dir, 'mimic_cxr_sectioned')
 
         mimic_cxr_sectioned_path = os.path.join(sectioned_dir, 'mimic_cxr_sectioned.csv')
         if not os.path.exists(mimic_cxr_sectioned_path):
@@ -947,9 +951,9 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
                 no_split=True,
             )
 
-        if not os.path.exists(database_path):
+        if not os.path.exists(mimic_iv_duckdb_path):
 
-            connect = duckdb.connect(database_path)
+            connect = duckdb.connect(mimic_iv_duckdb_path)
 
             csv_paths = []
             csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'edstays.csv.gz'))[0])
@@ -982,14 +986,16 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
            # MIMIC-CXR report sections:
            print(f'Copying mimic_cxr_sectioned into database...')
            connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr_sectioned AS FROM '{mimic_cxr_sectioned_path}';")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column0 TO study;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column1 TO impression;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column2 TO findings;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column3 TO indication;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column4 TO history;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column5 TO last_paragraph;")
-           connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column6 TO comparison;")
-           connect.sql("DELETE FROM mimic_cxr_sectioned WHERE study='study';")
+           columns = list(connect.sql('FROM mimic_cxr_sectioned LIMIT 1').df().columns)
+           if 'column0' in columns:  # If the column headers are not read correctly:
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column0 TO study;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column1 TO impression;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column2 TO findings;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column3 TO indication;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column4 TO history;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column5 TO last_paragraph;")
+               connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column6 TO comparison;")
+               connect.sql("DELETE FROM mimic_cxr_sectioned WHERE study='study';")
 
            splits = connect.sql("FROM mimic_cxr_2_0_0_split").df()
            reports = connect.sql("FROM mimic_cxr_sectioned").df()
@@ -1065,6 +1071,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
            df = df.sort_values(by='study_datetime', ascending=False)
            df = df.groupby('study_id').first().reset_index()
 
+           print('Searching for studies associated with an ED stay...')
            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                edstays = connect.sql(
                    f"""
@@ -1109,21 +1116,39 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
                df = pd.DataFrame(v)
                df = df.drop_duplicates(subset=['study_id', 'stay_id'])
                connect.sql(f"CREATE TABLE {k}_study_ids AS SELECT * FROM df")
+
+           connect.close()
+
+        if not os.path.exists(mimic_cxr_jpg_lmdb_path):
+            print('Preparing MIMIC-CXR-JPG LMDB database...')
+            pattern = os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'files')
+            mimic_cxr_jpg_dir = glob(pattern)
+            assert len(mimic_cxr_jpg_dir) == 1, f'Exactly one directory must match the pattern {pattern}, found: {mimic_cxr_jpg_dir}.'
+            prepare_mimic_cxr_jpg_lmdb(
+                mimic_iv_duckdb_path=mimic_iv_duckdb_path,
+                mimic_cxr_jpg_path=mimic_cxr_jpg_dir[0],
+                mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
+                map_size_tb=0.65
+            )
 
     @staticmethod
-    def get_dataset(split, transforms, database_path, mimic_cxr_jpg_dir, max_images_per_study=5, records=None):
+    def get_dataset(split, transforms, database_dir, max_images_per_study=5, mimic_cxr_jpg_dir=None, records=None):
+
+        mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
+        mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db') if mimic_cxr_jpg_dir is None else None
 
        if records is None:
 
            # This is the setup for CXRs + all effective inputs - medicine reconciliation:
-           records = EDCXRSubjectRecords(database_path=database_path, time_delta_map=lambda x: 1 / math.sqrt(x + 1))
+           records = EDCXRSubjectRecords(database_path=mimic_iv_duckdb_path, time_delta_map=lambda x: 1 / math.sqrt(x + 1))
 
        records.ed_module_tables = {k: records.ed_module_tables[k] for k in ['edstays', 'triage', 'vitalsign']}
        records.mimic_cxr_tables = {k: records.mimic_cxr_tables[k] for k in ['mimic_cxr_sectioned']}
        records.mimic_cxr_tables['mimic_cxr_sectioned'].text_columns = ['indication', 'history']
 
        dataset = StudyIDEDStayIDSubset(
-           dataset_dir=mimic_cxr_jpg_dir,
+           mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
+           mimic_cxr_dir=mimic_cxr_jpg_dir,
            transforms=transforms,
            split=split,
            max_images_per_study=max_images_per_study,
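End to end, the revised prepare_data/get_dataset pair might be driven as below. This is a sketch only: the Hub repo id, the transforms, and the directory layout are assumptions, and the checkpoint is assumed to expose MIMICIVEDCXRMultimodalModel via trust_remote_code.

import torch
from torchvision import transforms
from transformers import AutoModel

physionet_dir = '/data/physionet.org/files'  # assumed location of the MIMIC-CXR-JPG & MIMIC-IV-ED downloads
database_dir = '/data/database'              # where mimic_iv_duckdb.db and mimic_cxr_jpg_lmdb.db are written

model = AutoModel.from_pretrained('aehrc/cxrmate-ed', trust_remote_code=True)  # assumed repo id

# Builds the DuckDB database and the MIMIC-CXR-JPG LMDB database if they do not already exist:
model.prepare_data(physionet_dir=physionet_dir, database_dir=database_dir)

# Placeholder transforms; the real pipeline should match the encoder's expected input:
test_transforms = transforms.Compose([
    transforms.Resize(384, antialias=True),
    transforms.CenterCrop(384),
    transforms.ConvertImageDtype(torch.float32),
])

# With mimic_cxr_jpg_dir left as None, images are read from the LMDB database:
test_set = model.get_dataset(split='test', transforms=test_transforms, database_dir=database_dir)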