aehrc
/

cxrmate-ed

Model card Files Files and versions Community

cxrmate-ed / section_parser.py

anicolson

Upload model

ae934ba verified 26 days ago

raw

history blame

No virus

11.1 kB

	import re


	def section_text(text):
	"""
	Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE
	"""

	"""Splits text into sections.

	Assumes text is in a radiology report format, e.g.:

	COMPARISON: Chest radiograph dated XYZ.

	IMPRESSION: ABC...

	Given text like this, it will output text from each section,
	where the section type is determined by the all caps header.

	Returns a three element tuple:
	sections - list containing the text of each section
	section_names - a normalized version of the section name
	section_idx - list of start indices of the text in the section
	"""
	p_section = re.compile(
	r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

	sections = list()
	section_names = list()
	section_idx = list()

	idx = 0
	s = p_section.search(text, idx)

	if s:
	sections.append(text[0:s.start(1)])
	section_names.append('preamble')
	section_idx.append(0)

	while s:
	current_section = s.group(1).lower()
	# get the start of the text for this section
	idx_start = s.end()
	# skip past the first newline to avoid some bad parses
	idx_skip = text[idx_start:].find('\n')
	if idx_skip == -1:
	idx_skip = 0

	s = p_section.search(text, idx_start + idx_skip)

	if s is None:
	idx_end = len(text)
	else:
	idx_end = s.start()

	sections.append(text[idx_start:idx_end])
	section_names.append(current_section)
	section_idx.append(idx_start)

	else:
	sections.append(text)
	section_names.append('full report')
	section_idx.append(0)

	section_names = normalize_section_names(section_names)

	# remove empty sections
	# this handles when the report starts with a finding-like statement
	# .. but this statement is not a section, more like a report title
	# e.g. p10/p10103318/s57408307
	# CHEST, PA LATERAL:
	#
	# INDICATION: This is the actual section ....
	# it also helps when there are multiple findings sections
	# usually one is empty
	for i in reversed(range(len(section_names))):
	if section_names[i] in ('impression', 'findings'):
	if sections[i].strip() == '':
	sections.pop(i)
	section_names.pop(i)
	section_idx.pop(i)

	if ('impression' not in section_names) & ('findings' not in section_names):
	# create a new section for the final paragraph
	if '\n \n' in sections[-1]:
	sections.append('\n \n'.join(sections[-1].split('\n \n')[1:]))
	sections[-2] = sections[-2].split('\n \n')[0]
	section_names.append('last_paragraph')
	section_idx.append(section_idx[-1] + len(sections[-2]))

	return sections, section_names, section_idx


	def normalize_section_names(section_names):
	"""
	Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE
	"""

	# first, lower case all
	section_names = [s.lower().strip() for s in section_names]

	frequent_sections = {
	"preamble": "preamble", # 227885
	"impression": "impression", # 187759
	"comparison": "comparison", # 154647
	"indication": "indication", # 153730
	"findings": "findings", # 149842
	"examination": "examination", # 94094
	"technique": "technique", # 81402
	"history": "history", # 45624
	"comparisons": "comparison", # 8686
	"clinical history": "history", # 7121
	"reason for examination": "indication", # 5845
	"notification": "notification", # 5749
	"reason for exam": "indication", # 4430
	"clinical information": "history", # 4024
	"exam": "examination", # 3907
	"clinical indication": "indication", # 1945
	"conclusion": "impression", # 1802
	"chest, two views": "findings", # 1735
	"recommendation(s)": "recommendations", # 1700
	"type of examination": "examination", # 1678
	"reference exam": "comparison", # 347
	"patient history": "history", # 251
	"addendum": "addendum", # 183
	"comparison exam": "comparison", # 163
	"date": "date", # 108
	"comment": "comment", # 88
	"findings and impression": "impression", # 87
	"wet read": "wet read", # 83
	"comparison film": "comparison", # 79
	"recommendations": "recommendations", # 72
	"findings/impression": "impression", # 47
	"pfi": "history",
	'recommendation': 'recommendations',
	'wetread': 'wet read',
	'ndication': 'impression', # 1
	'impresson': 'impression', # 2
	'imprression': 'impression', # 1
	'imoression': 'impression', # 1
	'impressoin': 'impression', # 1
	'imprssion': 'impression', # 1
	'impresion': 'impression', # 1
	'imperssion': 'impression', # 1
	'mpression': 'impression', # 1
	'impession': 'impression', # 3
	'findings/ impression': 'impression', # ,1
	'finding': 'findings', # ,8
	'findins': 'findings',
	'findindgs': 'findings', # ,1
	'findgings': 'findings', # ,1
	'findngs': 'findings', # ,1
	'findnings': 'findings', # ,1
	'finidngs': 'findings', # ,2
	'idication': 'indication', # ,1
	'reference findings': 'findings', # ,1
	'comparision': 'comparison', # ,2
	'comparsion': 'comparison', # ,1
	'comparrison': 'comparison', # ,1
	'comparisions': 'comparison' # ,1
	}

	p_findings = [
	'chest',
	'portable',
	'pa and lateral',
	'lateral and pa',
	'ap and lateral',
	'lateral and ap',
	'frontal and',
	'two views',
	'frontal view',
	'pa view',
	'ap view',
	'one view',
	'lateral view',
	'bone window',
	'frontal upright',
	'frontal semi-upright',
	'ribs',
	'pa and lat'
	]
	p_findings = re.compile('({})'.format('\|'.join(p_findings)))

	main_sections = [
	'impression', 'findings', 'history', 'comparison',
	'addendum'
	]
	for i, s in enumerate(section_names):
	if s in frequent_sections:
	section_names[i] = frequent_sections[s]
	continue

	main_flag = False
	for m in main_sections:
	if m in s:
	section_names[i] = m
	main_flag = True
	break
	if main_flag:
	continue

	m = p_findings.search(s)
	if m is not None:
	section_names[i] = 'findings'

	# if it looks like it is describing the entire study
	# it's equivalent to findings
	# group similar phrasings for impression

	return section_names


	def custom_mimic_cxr_rules():
	"""
	Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE
	"""
	custom_section_names = {
	's50913680': 'recommendations', # files/p11/p11851243/s50913680.txt
	's59363654': 'examination', # files/p12/p12128253/s59363654.txt
	's59279892': 'technique', # files/p13/p13150370/s59279892.txt
	's59768032': 'recommendations', # files/p13/p13249077/s59768032.txt
	's57936451': 'indication', # files/p14/p14325424/s57936451.txt
	's50058765': 'indication', # files/p14/p14731346/s50058765.txt
	's53356173': 'examination', # files/p15/p15898350/s53356173.txt
	's53202765': 'technique', # files/p16/p16076182/s53202765.txt
	's50808053': 'technique', # files/p16/p16631485/s50808053.txt
	's51966317': 'indication', # files/p10/p10817099/s51966317.txt
	's50743547': 'examination', # files/p11/p11388341/s50743547.txt
	's56451190': 'note', # files/p11/p11842879/s56451190.txt
	's59067458': 'recommendations', # files/p11/p11984647/s59067458.txt
	's59215320': 'examination', # files/p12/p12408912/s59215320.txt
	's55124749': 'indication', # files/p12/p12428492/s55124749.txt
	's54365831': 'indication', # files/p13/p13876470/s54365831.txt
	's59087630': 'recommendations', # files/p14/p14267880/s59087630.txt
	's58157373': 'recommendations', # files/p15/p15032392/s58157373.txt
	's56482935': 'recommendations', # files/p15/p15388421/s56482935.txt
	's58375018': 'recommendations', # files/p15/p15505556/s58375018.txt
	's54654948': 'indication', # files/p17/p17090359/s54654948.txt
	's55157853': 'examination', # files/p18/p18975498/s55157853.txt
	's51491012': 'history', # files/p19/p19314266/s51491012.txt

	}

	custom_indices = {
	's50525523': [201, 349], # files/p10/p10602608/s50525523.txt
	's57564132': [233, 554], # files/p10/p10637168/s57564132.txt
	's59982525': [313, 717], # files/p11/p11989982/s59982525.txt
	's53488209': [149, 475], # files/p12/p12458657/s53488209.txt
	's54875119': [234, 988], # files/p13/p13687044/s54875119.txt
	's50196495': [59, 399], # files/p13/p13894879/s50196495.txt
	's56579911': [59, 218], # files/p15/p15394326/s56579911.txt
	's52648681': [292, 631], # files/p15/p15666238/s52648681.txt
	's59889364': [172, 453], # files/p15/p15835529/s59889364.txt
	's53514462': [73, 377], # files/p16/p16297706/s53514462.txt
	's59505494': [59, 450], # files/p16/p16730991/s59505494.txt
	's53182247': [59, 412], # files/p16/p16770442/s53182247.txt
	's51410602': [47, 320], # files/p17/p17069955/s51410602.txt
	's56412866': [522, 822], # files/p17/p17612000/s56412866.txt
	's54986978': [59, 306], # files/p17/p17912487/s54986978.txt
	's59003148': [262, 505], # files/p17/p17916384/s59003148.txt
	's57150433': [61, 394], # files/p18/p18335791/s57150433.txt
	's56760320': [219, 457], # files/p18/p18418794/s56760320.txt
	's59562049': [158, 348], # files/p18/p18502016/s59562049.txt
	's52674888': [145, 296], # files/p19/p19381919/s52674888.txt
	's55258338': [192, 568], # files/p13/p13719117/s55258338.txt
	's59330497': [140, 655], # files/p15/p15479218/s59330497.txt
	's52119491': [179, 454], # files/p17/p17959278/s52119491.txt
	# below have no findings at all in the entire report
	's58235663': [0, 0], # files/p11/p11573679/s58235663.txt
	's50798377': [0, 0], # files/p12/p12632853/s50798377.txt
	's54168089': [0, 0], # files/p14/p14463099/s54168089.txt
	's53071062': [0, 0], # files/p15/p15774521/s53071062.txt
	's56724958': [0, 0], # files/p16/p16175671/s56724958.txt
	's54231141': [0, 0], # files/p16/p16312859/s54231141.txt
	's53607029': [0, 0], # files/p17/p17603668/s53607029.txt
	's52035334': [0, 0], # files/p19/p19349312/s52035334.txt
	}

	return custom_section_names, custom_indices