Upload model
Browse files- README.md +3 -3
- config.json +4 -0
- create_section_files.py +150 -0
- modelling_cxrmate_ed.py +33 -9
- section_parser.py +281 -0
README.md
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
---
|
|
|
|
|
|
|
2 |
library_name: transformers
|
3 |
tags:
|
4 |
- chest X-ray report generation
|
@@ -16,9 +19,6 @@ tags:
|
|
16 |
- patient records
|
17 |
- mimic-cxr
|
18 |
- mimic-iv-ed
|
19 |
-
license: apache-2.0
|
20 |
-
language:
|
21 |
-
- en
|
22 |
---
|
23 |
|
24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
+
- en
|
4 |
+
license: apache-2.0
|
5 |
library_name: transformers
|
6 |
tags:
|
7 |
- chest X-ray report generation
|
|
|
19 |
- patient records
|
20 |
- mimic-cxr
|
21 |
- mimic-iv-ed
|
|
|
|
|
|
|
22 |
---
|
23 |
|
24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
config.json
CHANGED
@@ -85,6 +85,10 @@
|
|
85 |
"rms_norm_eps": 1e-06,
|
86 |
"rope_scaling": null,
|
87 |
"rope_theta": 10000.0,
|
|
|
|
|
|
|
|
|
88 |
"sep_token_id": null,
|
89 |
"suppress_tokens": null,
|
90 |
"task_specific_params": null,
|
|
|
85 |
"rms_norm_eps": 1e-06,
|
86 |
"rope_scaling": null,
|
87 |
"rope_theta": 10000.0,
|
88 |
+
"section_ids": [
|
89 |
+
12,
|
90 |
+
13
|
91 |
+
],
|
92 |
"sep_token_id": null,
|
93 |
"suppress_tokens": null,
|
94 |
"task_specific_params": null,
|
create_section_files.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
# local folder import
|
8 |
+
from .section_parser import custom_mimic_cxr_rules, section_text
|
9 |
+
|
10 |
+
|
11 |
+
def list_rindex(l, s):
    """Return the index of the *last* occurrence of ``s`` in list ``l``.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Raises:
        ValueError: if ``s`` does not occur in ``l``.
    """
    # Locate the first match in the reversed list, then translate that
    # position back to an index in the original orientation.
    reversed_position = l[::-1].index(s)
    return len(l) - 1 - reversed_position
18 |
+
|
19 |
+
|
20 |
+
def create_section_files(reports_path, output_path, no_split):
    """Section the MIMIC-CXR free-text reports and write them to CSV files.

    Modification of:
    https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        reports_path: Directory holding the MIMIC-CXR report files (the
            ``files`` directory containing the ``p1x`` group folders).
        output_path: Directory the CSV files are written to (created,
            including parents, if it does not exist).
        no_split: If True, write every report to a single
            ``mimic_cxr_sections.csv``; otherwise write ~10k-report chunks
            named ``mimic_cxr_NN.csv``.
    """
    reports_path = Path(reports_path)
    output_path = Path(output_path)

    # parents/exist_ok make this robust to missing intermediate directories
    # and to the directory appearing between a check and the mkdir.
    output_path.mkdir(parents=True, exist_ok=True)

    # Not all reports can be automatically sectioned; these dictionaries
    # hold manually determined section names and character offsets.
    custom_section_names, custom_indices = custom_mimic_cxr_rules()

    # Sections to extract, in priority order (impression > findings > ...).
    section_priority = (
        'impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison',
    )

    # get all higher up folders (p10, p11, etc.)
    p_grp_folders = sorted(
        p for p in os.listdir(reports_path) if p.startswith('p') and len(p) == 3
    )

    # patient_studies will hold the text for use in NLP labeling
    patient_studies = []

    # study_sections will have an element for each study; this element is a
    # list: the study id followed by the text of each priority section
    # (None when the section is absent)
    study_sections = []
    for p_grp in p_grp_folders:
        # get patient folders, usually around ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = sorted(p for p in os.listdir(cxr_path) if p.startswith('p'))

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports
            studies = [
                s for s in os.listdir(patient_path)
                if s.endswith('.txt') and s.startswith('s')
            ]

            for s in studies:
                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = fp.read()

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    continue

                # split text into sections
                sections, section_names, section_idx = section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    continue

                # grab the *last* section with the given title
                # prioritizes impression > findings, etc.

                # "last_paragraph" is text up to the end of the report
                # many reports are simple, and have a single section
                # header followed by a few paragraphs
                # these paragraphs are grouped into section "last_paragraph"

                # note also comparison seems unusual but if no other sections
                # exist the radiologist has usually written the report
                # in the comparison section
                idx = -1
                for sn in section_priority:
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find any sections we can use :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of the conclusion section
                    patient_studies.append([s_stem, sections[idx].strip()])

                # one row per study with every priority section (or None)
                study_sectioned = [s_stem]
                for sn in section_priority:
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)
                study_sections.append(study_sectioned)

    # write distinct files to facilitate modular processing
    if len(patient_studies) > 0:
        # write out a single CSV with the sections.
        # newline='' is required by the csv module so that it controls line
        # endings itself (otherwise \r\n platforms get blank rows).
        with open(output_path / 'mimic_cxr_sectioned.csv', 'w', newline='') as fp:
            csvwriter = csv.writer(fp)
            # write header
            csvwriter.writerow(['study'] + list(section_priority))
            csvwriter.writerows(study_sections)

        if no_split:
            # write all the reports out to a single file
            with open(output_path / 'mimic_cxr_sections.csv', 'w', newline='') as fp:
                csv.writer(fp).writerows(patient_studies)
        else:
            # write ~22 files with ~10k reports each
            jmp = 10000
            for n in range(0, len(patient_studies), jmp):
                n_fn = n // jmp
                with open(output_path / f'mimic_cxr_{n_fn:02d}.csv', 'w', newline='') as fp:
                    csv.writer(fp).writerows(patient_studies[n:n + jmp])
modelling_cxrmate_ed.py
CHANGED
@@ -1,12 +1,8 @@
|
|
1 |
-
import csv
|
2 |
-
import functools
|
3 |
import math
|
4 |
import os
|
5 |
-
import re
|
6 |
-
from collections import OrderedDict
|
7 |
from glob import glob
|
8 |
from pathlib import Path
|
9 |
-
from typing import
|
10 |
|
11 |
import duckdb
|
12 |
import pandas as pd
|
@@ -24,10 +20,11 @@ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_dec
|
|
24 |
)
|
25 |
from transformers.utils import logging
|
26 |
|
|
|
27 |
from .dataset import StudyIDEDStayIDSubset
|
28 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
29 |
from .records import EDCXRSubjectRecords
|
30 |
-
from .tables import ed_module_tables
|
31 |
|
32 |
logger = logging.get_logger(__name__)
|
33 |
|
@@ -940,7 +937,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
940 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
941 |
|
942 |
print('Extracting sections from reports...')
|
943 |
-
|
944 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
945 |
output_path=sectioned_dir,
|
946 |
no_split=True,
|
@@ -1009,8 +1006,8 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
1009 |
|
1010 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
1011 |
|
1012 |
-
# Create lookup tables
|
1013 |
-
for k, v in ed_module_tables.items():
|
1014 |
if v.load and v.index_columns:
|
1015 |
start_idx = 0
|
1016 |
for i in v.index_columns_source:
|
@@ -1127,3 +1124,30 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
1127 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
1128 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
1129 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import math
|
2 |
import os
|
|
|
|
|
3 |
from glob import glob
|
4 |
from pathlib import Path
|
5 |
+
from typing import Optional, Tuple, Union
|
6 |
|
7 |
import duckdb
|
8 |
import pandas as pd
|
|
|
20 |
)
|
21 |
from transformers.utils import logging
|
22 |
|
23 |
+
from .create_section_files import create_section_files
|
24 |
from .dataset import StudyIDEDStayIDSubset
|
25 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
26 |
from .records import EDCXRSubjectRecords
|
27 |
+
from .tables import ed_module_tables, mimic_cxr_tables
|
28 |
|
29 |
logger = logging.get_logger(__name__)
|
30 |
|
|
|
937 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
938 |
|
939 |
print('Extracting sections from reports...')
|
940 |
+
create_section_files(
|
941 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
942 |
output_path=sectioned_dir,
|
943 |
no_split=True,
|
|
|
1006 |
|
1007 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
1008 |
|
1009 |
+
# Create lookup tables:
|
1010 |
+
for k, v in (ed_module_tables | mimic_cxr_tables).items():
|
1011 |
if v.load and v.index_columns:
|
1012 |
start_idx = 0
|
1013 |
for i in v.index_columns_source:
|
|
|
1124 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
1125 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
1126 |
)
|
1127 |
+
return dataset
|
1128 |
+
|
1129 |
+
    @staticmethod
    def collate_fn(batch):
        """Collate a list of per-study example dicts into a single batch dict.

        Args:
            batch: list of dicts, one per example. Examples may have
                differing key sets; missing keys are filled in with None.
                'images' must be present and hold tensors (padded below to
                the longest in the batch).

        Returns:
            Dict mapping each key to the list of per-example values, except:
            - 'images': padded into one tensor (padding value 0.0).
            - keys containing 'index_value_feats': padded into a tensor with
              -1, plus derived '*_token_type_ids' (padded with 0) and
              '*_mask' entries.
            - keys containing both 'time_delta' and 'index_value': padded
              into a tensor with 0.
        """
        # Union of all keys across the batch (examples may lack some keys).
        keys = set().union(*(d.keys() for d in batch))
        # NOTE(review): setdefault mutates the incoming example dicts,
        # inserting None for each key they are missing.
        batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
        batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

        for k in keys:
            if 'index_value_feats' in k:

                # Feature width is taken from the first example that has this
                # key; absent examples become empty (0, width) tensors so
                # pad_sequence can align them.
                total_indices = next(i for i in batch[k] if i is not None).shape[-1]
                batch[k] = [i if i is not None else torch.empty(0, total_indices) for i in batch[k]]
                batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
                token_type_id_name = k.replace('_feats', '_token_type_ids')
                batch[token_type_id_name] = [i if i is not None else torch.empty(0, dtype=torch.long) for i in batch[token_type_id_name]]
                batch[token_type_id_name] = torch.nn.utils.rnn.pad_sequence(
                    batch[token_type_id_name], batch_first=True, padding_value=0,
                )
                # Mask row is 1 where any feature differs from the -1 padding
                # value, else 0.
                mask_name = k.replace('_feats', '_mask')
                batch[mask_name] = (batch[k] != -1).any(dim=-1).int()

            if 'time_delta' in k and 'index_value' in k:
                batch[k] = [i if i is not None else torch.empty(0, 1) for i in batch[k]]
                batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=0)

        return batch
section_parser.py
ADDED
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
|
4 |
+
def section_text(text):
    """Split a radiology report into its sections.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/section_parser.py

    Assumes text is in a radiology report format, e.g.:

        COMPARISON: Chest radiograph dated XYZ.

        IMPRESSION: ABC...

    The section type is determined by the all-caps header.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section
    """
    header_re = re.compile(r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

    sections = []
    section_names = []
    section_idx = []

    match = header_re.search(text)

    if match is None:
        # No recognizable headers: the whole report is one section.
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)
    else:
        # Everything before the first header is the preamble.
        sections.append(text[0:match.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while match is not None:
            current_section = match.group(1).lower()
            # Start of this section's body.
            body_start = match.end()
            # Skip past the first newline to avoid some bad parses.
            newline_offset = text[body_start:].find('\n')
            if newline_offset == -1:
                newline_offset = 0

            match = header_re.search(text, body_start + newline_offset)
            body_end = len(text) if match is None else match.start()

            sections.append(text[body_start:body_end])
            section_names.append(current_section)
            section_idx.append(body_start)

    section_names = normalize_section_names(section_names)

    # Remove empty impression/findings sections. This handles reports that
    # start with a finding-like statement that is really a report title,
    # e.g. p10/p10103318/s57408307:
    #     CHEST, PA LATERAL:
    #
    #     INDICATION: This is the actual section ....
    # It also helps when there are multiple findings sections; usually one
    # is empty.
    for i in reversed(range(len(section_names))):
        if section_names[i] in ('impression', 'findings') and sections[i].strip() == '':
            sections.pop(i)
            section_names.pop(i)
            section_idx.pop(i)

    if ('impression' not in section_names) and ('findings' not in section_names):
        # Create a new section from the final paragraph.
        if '\n \n' in sections[-1]:
            paragraphs = sections[-1].split('\n \n')
            sections[-1] = paragraphs[0]
            sections.append('\n \n'.join(paragraphs[1:]))
            section_names.append('last_paragraph')
            section_idx.append(section_idx[-1] + len(sections[-2]))

    return sections, section_names, section_idx
92 |
+
|
93 |
+
|
94 |
+
def normalize_section_names(section_names):
    """Map raw section headers onto a small canonical vocabulary.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/section_parser.py

    Each name is lower-cased and stripped, then resolved by (in order):
    an exact-match table of frequent headers and typos, a substring match
    against the main section names, and a regex of view/projection phrases
    that imply a findings section. Unrecognized names are kept as the
    lower-cased, stripped original.

    Returns a new list; the input list is not modified.
    """
    frequent_sections = {
        "preamble": "preamble",  # 227885
        "impression": "impression",  # 187759
        "comparison": "comparison",  # 154647
        "indication": "indication",  # 153730
        "findings": "findings",  # 149842
        "examination": "examination",  # 94094
        "technique": "technique",  # 81402
        "history": "history",  # 45624
        "comparisons": "comparison",  # 8686
        "clinical history": "history",  # 7121
        "reason for examination": "indication",  # 5845
        "notification": "notification",  # 5749
        "reason for exam": "indication",  # 4430
        "clinical information": "history",  # 4024
        "exam": "examination",  # 3907
        "clinical indication": "indication",  # 1945
        "conclusion": "impression",  # 1802
        "chest, two views": "findings",  # 1735
        "recommendation(s)": "recommendations",  # 1700
        "type of examination": "examination",  # 1678
        "reference exam": "comparison",  # 347
        "patient history": "history",  # 251
        "addendum": "addendum",  # 183
        "comparison exam": "comparison",  # 163
        "date": "date",  # 108
        "comment": "comment",  # 88
        "findings and impression": "impression",  # 87
        "wet read": "wet read",  # 83
        "comparison film": "comparison",  # 79
        "recommendations": "recommendations",  # 72
        "findings/impression": "impression",  # 47
        "pfi": "history",
        'recommendation': 'recommendations',
        'wetread': 'wet read',
        'ndication': 'impression',  # 1
        'impresson': 'impression',  # 2
        'imprression': 'impression',  # 1
        'imoression': 'impression',  # 1
        'impressoin': 'impression',  # 1
        'imprssion': 'impression',  # 1
        'impresion': 'impression',  # 1
        'imperssion': 'impression',  # 1
        'mpression': 'impression',  # 1
        'impession': 'impression',  # 3
        'findings/ impression': 'impression',  # ,1
        'finding': 'findings',  # ,8
        'findins': 'findings',
        'findindgs': 'findings',  # ,1
        'findgings': 'findings',  # ,1
        'findngs': 'findings',  # ,1
        'findnings': 'findings',  # ,1
        'finidngs': 'findings',  # ,2
        'idication': 'indication',  # ,1
        'reference findings': 'findings',  # ,1
        'comparision': 'comparison',  # ,2
        'comparsion': 'comparison',  # ,1
        'comparrison': 'comparison',  # ,1
        'comparisions': 'comparison'  # ,1
    }

    # View/projection phrases: headers like "PA AND LATERAL" describe the
    # whole study and are treated as findings.
    findings_phrases = [
        'chest',
        'portable',
        'pa and lateral',
        'lateral and pa',
        'ap and lateral',
        'lateral and ap',
        'frontal and',
        'two views',
        'frontal view',
        'pa view',
        'ap view',
        'one view',
        'lateral view',
        'bone window',
        'frontal upright',
        'frontal semi-upright',
        'ribs',
        'pa and lat'
    ]
    p_findings = re.compile('({})'.format('|'.join(findings_phrases)))

    main_sections = [
        'impression', 'findings', 'history', 'comparison',
        'addendum'
    ]

    def _canonical(name):
        # Exact matches (frequent headers and known typos) first.
        if name in frequent_sections:
            return frequent_sections[name]
        # A main section name appearing anywhere in the header wins next.
        for major in main_sections:
            if major in name:
                return major
        # Headers describing the whole study are equivalent to findings.
        if p_findings.search(name) is not None:
            return 'findings'
        # Otherwise leave the (lower-cased, stripped) name unchanged.
        return name

    return [_canonical(s.lower().strip()) for s in section_names]
212 |
+
|
213 |
+
|
214 |
+
def custom_mimic_cxr_rules():
    """Manually determined sectioning rules for poorly formatted reports.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/section_parser.py

    Returns:
        custom_section_names: dict mapping study id -> the section name whose
            text should be used as that study's conclusion (looked up with
            ``list_rindex`` against the parsed section names).
        custom_indices: dict mapping study id -> ``[start, end]`` character
            offsets of the conclusion text within the raw report; ``[0, 0]``
            marks reports with no findings at all.
    """
    custom_section_names = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt

    }

    custom_indices = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
        # below have no findings at all in the entire report
        's58235663': [0, 0],  # files/p11/p11573679/s58235663.txt
        's50798377': [0, 0],  # files/p12/p12632853/s50798377.txt
        's54168089': [0, 0],  # files/p14/p14463099/s54168089.txt
        's53071062': [0, 0],  # files/p15/p15774521/s53071062.txt
        's56724958': [0, 0],  # files/p16/p16175671/s56724958.txt
        's54231141': [0, 0],  # files/p16/p16312859/s54231141.txt
        's53607029': [0, 0],  # files/p17/p17603668/s53607029.txt
        's52035334': [0, 0],  # files/p19/p19349312/s52035334.txt
    }

    return custom_section_names, custom_indices