cxrmate-ed / lmdb_jpg.py
import multiprocessing

import duckdb
import lmdb
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from .dataset import mimic_cxr_image_path
class JPGDataset(Dataset):
    """Yields the raw bytes of each MIMIC-CXR .jpg file, keyed by its dicom_id."""

    def __init__(self, df, jpg_path):
        self.df = df
        self.jpg_path = jpg_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        jpg_path = mimic_cxr_image_path(self.jpg_path, row['subject_id'], row['study_id'], row['dicom_id'], 'jpg')

        # Convert key to bytes:
        key = bytes(row['dicom_id'], 'utf-8')

        # Read the .jpg file as bytes:
        with open(jpg_path, 'rb') as f:
            image = f.read()

        return {
            'keys': key,
            'images': image,
        }
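
# Note: `mimic_cxr_image_path` is defined in .dataset (not shown here); it is
# assumed to resolve each row to the standard MIMIC-CXR-JPG directory layout,
# roughly:
#
#     <jpg_path>/p1x/p1xxxxxxx/sxxxxxxxx/<dicom_id>.jpg
#
# so `jpg_path` should point at the directory containing the per-subject folders.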
def prepare_mimic_cxr_jpg_lmdb(mimic_iv_duckdb_path, mimic_cxr_jpg_path, mimic_cxr_jpg_lmdb_path, map_size_tb, num_workers=None):
    """Write every MIMIC-CXR-JPG image into an LMDB database, keyed by dicom_id."""
    num_workers = num_workers if num_workers is not None else multiprocessing.cpu_count()

    # One row per unique dicom_id:
    connect = duckdb.connect(mimic_iv_duckdb_path, read_only=True)
    df = connect.sql("SELECT DISTINCT ON(dicom_id) subject_id, study_id, dicom_id FROM mimic_cxr").df()
    connect.close()

    # Map size (converted from TB to bytes):
    map_size = int(map_size_tb * (1024 ** 4))
    assert isinstance(map_size, int)
    print(f'Map size: {map_size}')

    dataset = JPGDataset(df, mimic_cxr_jpg_path)
    dataloader = DataLoader(
        dataset,
        batch_size=num_workers,
        shuffle=False,
        num_workers=num_workers,
        prefetch_factor=1,
        collate_fn=lambda x: x,  # Keep each batch as a list of dicts; no tensor collation.
    )
    env = lmdb.open(mimic_cxr_jpg_lmdb_path, map_size=map_size, readonly=False)

    for batch in tqdm(dataloader):
        for i in batch:
            with env.begin(write=True) as txn:
                # Only write the image if its key is not already in the database,
                # so an interrupted run can be resumed:
                value = txn.get(i['keys'])
                if value is None:
                    txn.put(i['keys'], i['images'])

    env.sync()
    env.close()
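
# Example usage (a sketch; all paths below are placeholders, not part of this repo):
#
#     prepare_mimic_cxr_jpg_lmdb(
#         mimic_iv_duckdb_path='/data/mimic_iv.duckdb',
#         mimic_cxr_jpg_path='/data/mimic-cxr-jpg/2.0.0/files',
#         mimic_cxr_jpg_lmdb_path='/data/mimic_cxr_jpg.lmdb',
#         map_size_tb=1.0,
#     )
#
# Images can then be read back from the LMDB by their dicom_id:
#
#     env = lmdb.open('/data/mimic_cxr_jpg.lmdb', readonly=True)
#     with env.begin() as txn:
#         jpg_bytes = txn.get(bytes(dicom_id, 'utf-8'))
#     env.close()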