import re

# Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

# A section header: newline, one space, an all-caps title, a colon, then
# whitespace.  Group 1 captures the title.
_SECTION_HEADER = re.compile(r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

# Separator looked for when carving the final paragraph out of the last
# section of a report that has neither an impression nor a findings section.
_PARAGRAPH_BREAK = '\n \n'

# Exact-match lookup from a lower-cased, stripped header to its canonical
# section name.  Trailing numbers are kept from the original table
# (presumably occurrence counts in the corpus -- not verifiable from here).
_FREQUENT_SECTIONS = {
    "preamble": "preamble",  # 227885
    "impression": "impression",  # 187759
    "comparison": "comparison",  # 154647
    "indication": "indication",  # 153730
    "findings": "findings",  # 149842
    "examination": "examination",  # 94094
    "technique": "technique",  # 81402
    "history": "history",  # 45624
    "comparisons": "comparison",  # 8686
    "clinical history": "history",  # 7121
    "reason for examination": "indication",  # 5845
    "notification": "notification",  # 5749
    "reason for exam": "indication",  # 4430
    "clinical information": "history",  # 4024
    "exam": "examination",  # 3907
    "clinical indication": "indication",  # 1945
    "conclusion": "impression",  # 1802
    "chest, two views": "findings",  # 1735
    "recommendation(s)": "recommendations",  # 1700
    "type of examination": "examination",  # 1678
    "reference exam": "comparison",  # 347
    "patient history": "history",  # 251
    "addendum": "addendum",  # 183
    "comparison exam": "comparison",  # 163
    "date": "date",  # 108
    "comment": "comment",  # 88
    "findings and impression": "impression",  # 87
    "wet read": "wet read",  # 83
    "comparison film": "comparison",  # 79
    "recommendations": "recommendations",  # 72
    "findings/impression": "impression",  # 47
    "pfi": "history",
    'recommendation': 'recommendations',
    'wetread': 'wet read',
    # NOTE(review): 'ndication' looks like a truncated 'indication' yet maps
    # to 'impression'; kept as-is -- confirm against the source report.
    'ndication': 'impression',  # 1
    'impresson': 'impression',  # 2
    'imprression': 'impression',  # 1
    'imoression': 'impression',  # 1
    'impressoin': 'impression',  # 1
    'imprssion': 'impression',  # 1
    'impresion': 'impression',  # 1
    'imperssion': 'impression',  # 1
    'mpression': 'impression',  # 1
    'impession': 'impression',  # 3
    'findings/ impression': 'impression',  # ,1
    'finding': 'findings',  # ,8
    'findins': 'findings',
    'findindgs': 'findings',  # ,1
    'findgings': 'findings',  # ,1
    'findngs': 'findings',  # ,1
    'findnings': 'findings',  # ,1
    'finidngs': 'findings',  # ,2
    'idication': 'indication',  # ,1
    'reference findings': 'findings',  # ,1
    'comparision': 'comparison',  # ,2
    'comparsion': 'comparison',  # ,1
    'comparrison': 'comparison',  # ,1
    'comparisions': 'comparison',  # ,1
}

# Canonical names that win whenever they occur as a substring of a header.
# Order matters: the first hit is used.
_MAIN_SECTIONS = (
    'impression', 'findings', 'history', 'comparison', 'addendum',
)

# Phrases that describe the whole study; a header containing any of them is
# treated as equivalent to a findings section.
_FINDINGS_PHRASES = (
    'chest',
    'portable',
    'pa and lateral',
    'lateral and pa',
    'ap and lateral',
    'lateral and ap',
    'frontal and',
    'two views',
    'frontal view',
    'pa view',
    'ap view',
    'one view',
    'lateral view',
    'bone window',
    'frontal upright',
    'frontal semi-upright',
    'ribs',
    'pa and lat',
)
_P_FINDINGS = re.compile('({})'.format('|'.join(_FINDINGS_PHRASES)))


def section_text(text):
    """Split a radiology report into its sections.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Assumes text is in a radiology report format, e.g.:

        COMPARISON:  Chest radiograph dated XYZ.

        IMPRESSION:  ABC...

    The section type is determined by the all-caps header.  Text before the
    first header is returned as a 'preamble' section; when no header is
    found the whole text is returned as a single 'full report' section.

    Impression/findings sections whose text is only whitespace are dropped.
    If no impression or findings section remains and the last section
    contains a blank-line separator, that section is cut at the first
    separator and the remainder is returned as a 'last_paragraph' section.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section,
            such that text[i:i + len(section)] == section
    """
    sections = []
    section_names = []
    section_idx = []

    match = _SECTION_HEADER.search(text)
    if match is None:
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)
    else:
        # everything up to the first header title is the preamble
        sections.append(text[:match.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while match is not None:
            current_section = match.group(1).lower()
            # get the start of the text for this section
            idx_start = match.end()
            # skip past the first newline to avoid some bad parses; when
            # there is no newline left, search from the section start
            newline_pos = text.find('\n', idx_start)
            search_from = idx_start if newline_pos == -1 else newline_pos

            match = _SECTION_HEADER.search(text, search_from)
            idx_end = len(text) if match is None else match.start()

            sections.append(text[idx_start:idx_end])
            section_names.append(current_section)
            section_idx.append(idx_start)

    section_names = normalize_section_names(section_names)

    # remove empty sections
    # this handles when the report starts with a finding-like statement
    #  .. but this statement is not a section, more like a report title
    #  e.g. p10/p10103318/s57408307
    #    CHEST, PA LATERAL:
    #
    #    INDICATION:   This is the actual section ....
    # it also helps when there are multiple findings sections
    # usually one is empty
    kept = [
        (body, name, start)
        for body, name, start in zip(sections, section_names, section_idx)
        if not (name in ('impression', 'findings') and body.strip() == '')
    ]
    sections = [item[0] for item in kept]
    section_names = [item[1] for item in kept]
    section_idx = [item[2] for item in kept]

    if 'impression' not in section_names and 'findings' not in section_names:
        # create a new section for the final paragraph
        head, sep, tail = sections[-1].partition(_PARAGRAPH_BREAK)
        if sep:
            sections[-1] = head
            sections.append(tail)
            section_names.append('last_paragraph')
            # the new section's text begins after the separator, not at it
            section_idx.append(section_idx[-1] + len(head) + len(sep))

    return sections, section_names, section_idx


def normalize_section_names(section_names):
    """Map raw section headers onto a small set of canonical names.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Each name is lower-cased and stripped, then resolved in this order:
      1. exact match in the table of frequent headers and known typos;
      2. first main section name ('impression', 'findings', 'history',
         'comparison', 'addendum') contained in the header;
      3. 'findings' if the header contains a phrase describing the entire
         study (e.g. 'chest', 'pa and lateral');
      4. otherwise the lower-cased, stripped header itself.

    Returns a new list of the same length as section_names.
    """
    normalized = []
    for raw_name in section_names:
        name = raw_name.lower().strip()

        if name in _FREQUENT_SECTIONS:
            normalized.append(_FREQUENT_SECTIONS[name])
            continue

        main_name = next((m for m in _MAIN_SECTIONS if m in name), None)
        if main_name is not None:
            normalized.append(main_name)
            continue

        # if it looks like it is describing the entire study
        # it's equivalent to findings
        if _P_FINDINGS.search(name) is not None:
            name = 'findings'
        normalized.append(name)

    return normalized


def custom_mimic_cxr_rules():
    """Return hand-curated per-study overrides.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Returns a two element tuple:
        custom_section_names - dict mapping a study identifier to a
            section name
        custom_indices - dict mapping a study identifier to a two-element
            [start, end] list (presumably character offsets into the
            report text -- confirm against the caller); [0, 0] marks
            reports with no findings at all

    Both dicts are built anew on every call.
    """
    custom_section_names = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt
    }

    custom_indices = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
        # below have no findings at all in the entire report
        's58235663': [0, 0],  # files/p11/p11573679/s58235663.txt
        's50798377': [0, 0],  # files/p12/p12632853/s50798377.txt
        's54168089': [0, 0],  # files/p14/p14463099/s54168089.txt
        's53071062': [0, 0],  # files/p15/p15774521/s53071062.txt
        's56724958': [0, 0],  # files/p16/p16175671/s56724958.txt
        's54231141': [0, 0],  # files/p16/p16312859/s54231141.txt
        's53607029': [0, 0],  # files/p17/p17603668/s53607029.txt
        's52035334': [0, 0],  # files/p19/p19349312/s52035334.txt
    }

    return custom_section_names, custom_indices