# cxrmate-ed / create_section_files.py
# Web file-viewer residue kept for provenance: "anicolson's picture",
# "Upload model", "7dae7aa verified", "raw", "history blame", "6.05 kB".
import csv
import os
from pathlib import Path
from tqdm import tqdm
from .section_parser import custom_mimic_cxr_rules, section_text
def list_rindex(l, s):
    """Helper function: index of the *last* element of ``l`` equal to ``s``.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        l: Sequence to search (it is sliced, so it must support slicing
            and the slice must provide ``.index``).
        s: Value to look for.

    Returns:
        The zero-based position, counted from the front of ``l``, of the
        last element equal to ``s``.

    Raises:
        ValueError: If ``s`` does not occur in ``l``.
    """
    # Search a reversed copy: the first hit there is the last hit in ``l``.
    offset_from_end = l[::-1].index(s)
    # Convert the offset measured from the end back into a front-based index.
    return len(l) - offset_from_end - 1
def create_section_files(reports_path, output_path, no_split):
    """
    Modification of: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Walk a directory tree of free-text reports, split every report into
    sections and write the results to CSV files in ``output_path``.

    The tree is scanned as ``reports_path/<pXX>/<p...>/<s...>.txt``: group
    folders are three characters long and start with ``p``, patient folders
    start with ``p``, and report files start with ``s`` and end in ``.txt``.

    Files written (only when at least one report was found):

    * ``mimic_cxr_sectioned.csv`` - a header row followed by one row per
      study holding the text of each named section (empty cell when the
      section is absent). Studies handled by one of the custom rules are
      not included in this file.
    * ``mimic_cxr_sections.csv`` when ``no_split`` is truthy, otherwise
      ``mimic_cxr_00.csv``, ``mimic_cxr_01.csv``, ... with up to 10000 rows
      each - rows of ``[study, text]`` where ``text`` is the section chosen
      for the study (empty string when no usable section exists).

    Args:
        reports_path: Root directory of the report tree (str or Path).
        output_path: Directory to write the CSV files to; it is created,
            including missing parents, if it does not exist.
        no_split: If truthy, write all ``[study, text]`` rows to a single
            file instead of splitting them into chunks.

    Returns:
        None.
    """
    reports_path = Path(reports_path)
    output_path = Path(output_path)

    # parents=True: a nested output directory no longer fails when its
    # parent is missing; exist_ok=True replaces the separate exists() check.
    output_path.mkdir(parents=True, exist_ok=True)

    # Section names, in priority order. This single tuple drives the choice
    # of the "main" section, the per-study columns and the CSV header, so
    # the three can never drift apart.
    section_order = (
        'impression', 'findings', 'indication', 'history',
        'technique', 'last_paragraph', 'comparison',
    )

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    custom_section_names, custom_indices = custom_mimic_cxr_rules()

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = sorted(
        p for p in os.listdir(reports_path)
        if p.startswith('p') and len(p) == 3
    )

    # patient_studies will hold the text for use in NLP labeling
    patient_studies = []

    # study_sections will have an element for each study
    # this element will be a list, each element having text for a specific section
    study_sections = []

    for p_grp in p_grp_folders:
        # get patient folders, usually around ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = sorted(p for p in os.listdir(cxr_path) if p.startswith('p'))

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports
            studies = [s for s in os.listdir(patient_path)
                       if s.endswith('.txt') and s.startswith('s')]

            for s in studies:
                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = fp.read()

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports:
                # the rule is a (start, end) pair used to slice the raw text
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    # note: such studies get no row in study_sections
                    continue

                # split text into sections (the third return value is unused)
                sections, section_names, _section_idx = section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    # note: such studies get no row in study_sections
                    continue

                # grab the *last* section with the given title
                # prioritizes impression > findings, etc.

                # "last_paragraph" is text up to the end of the report
                # many reports are simple, and have a single section
                # header followed by a few paragraphs
                # these paragraphs are grouped into section "last_paragraph"

                # note also comparison seems unusual but if no other sections
                # exist the radiologist has usually written the report
                # in the comparison section
                idx = -1
                for sn in section_order:
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find any sections we can use :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of the conclusion section
                    patient_studies.append([s_stem, sections[idx].strip()])

                # one column per known section; None becomes an empty cell
                study_sectioned = [s_stem]
                for sn in section_order:
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)
                study_sections.append(study_sectioned)

    # nothing found: write no files at all
    if not patient_studies:
        return

    # write distinct files to facilitate modular processing

    # write out a single CSV with the sections
    _write_csv_rows(
        output_path / 'mimic_cxr_sectioned.csv',
        study_sections,
        header=['study', *section_order],
    )

    if no_split:
        # write all the reports out to a single file
        _write_csv_rows(output_path / 'mimic_cxr_sections.csv', patient_studies)
    else:
        # write ~22 files with ~10k reports each
        jmp = 10000
        for n in range(0, len(patient_studies), jmp):
            n_fn = n // jmp
            _write_csv_rows(
                output_path / f'mimic_cxr_{n_fn:02d}.csv',
                patient_studies[n:n + jmp],
            )


def _write_csv_rows(path, rows, header=None):
    """Write ``rows``, preceded by ``header`` if given, to ``path`` as CSV."""
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; without it rows are followed by a blank line
    # on Windows and newlines embedded in report text can be altered.
    with open(path, 'w', newline='') as fp:
        csvwriter = csv.writer(fp)
        if header is not None:
            csvwriter.writerow(header)
        csvwriter.writerows(rows)