Spaces:
Sleeping
Sleeping
ljyflores
commited on
Commit
•
2d00e5a
1
Parent(s):
5d4e803
MVP app
Browse files- app.py +40 -0
- requirements.txt +6 -0
- terms.json +1 -0
- utils_casemaker.py +250 -0
- utils_report_parser.py +20 -0
app.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from utils_casemaker import CaseMaker, format_casemaker_data
|
5 |
+
|
6 |
+
st.title("Juni Health Patient Casemaker")


@st.cache_resource
def load_casemaker(terms_path: str = "terms.json") -> CaseMaker:
    """Build the CaseMaker once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, CaseMaker (and the NER model it loads) would be rebuilt each time.

    Args:
        terms_path: Path to the organ keyword JSON file.

    Returns:
        The shared CaseMaker instance.
    """
    return CaseMaker(terms_path)


casemaker = load_casemaker()

uploaded_file = st.file_uploader("Choose a file")

if uploaded_file is not None:
    # The uploaded object is file-like, so pandas can read it directly.
    df = pd.read_csv(uploaded_file)

    # Fail with a readable message instead of a KeyError deep inside pandas.
    required_columns = ("patient_id", "report_id", "text")
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"The uploaded file is missing column(s): {', '.join(missing_columns)}")
        st.stop()

    reports = format_casemaker_data(
        df=df,
        patient_id_column="patient_id",
        # NOTE(review): reports are ordered by "report_id" rather than a real
        # date column -- confirm this is the intended sort key.
        date_column="report_id",
        text_column="text",
    )

    if not reports:
        st.warning("No reports were found in the uploaded file.")
        st.stop()

    # Map the human-readable radio label back to the patient id it stands for.
    patient_options = {
        f"Patient {patient_id}: {len(patient_reports)} reports": patient_id
        for patient_id, patient_reports in reports.items()
    }
    selected_patient_string = st.radio(
        "Select a Patient ID",
        list(patient_options),
        key="patient_select_button",
    )

    if st.button("Generate Case", key="task_begin_button"):
        selected_patient_id = patient_options[selected_patient_string]
        summary_by_organ = casemaker.parse_records(reports[selected_patient_id])
        summary_by_organ = casemaker.format_reports(summary_by_organ)

        # Only render organs that ended up with at least one relevant report.
        for chosen_organ, organ_summary in summary_by_organ.items():
            if organ_summary:
                st.header(chosen_organ.capitalize())
                st.write(organ_summary)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nltk
|
2 |
+
numpy==1.24.1
|
3 |
+
pandas
|
4 |
+
torch==2.0.1
|
5 |
+
transformers
|
6 |
+
streamlit
|
terms.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"brain": ["craniotomy", "corticopontine fibers", "lamina terminalis", "superior frontal sulcus", "other", "diagonal band of broca", "gigantocellular reticular nucleus", "cuneus", "intraparietal sulcus", "cross-section of the midbrain.", "retinohypothalamic tract", "parvocellular reticular nucleus", "rostromedial tegmental nucleus", "serotonergic pathways", "precuneus", "superior salivatory nucleus", "lateral spinothalamic tract", "hypoglossal", "forebrain (prosencephalon)", "nigrostriatal pathway", "tuberomammillary nucleus", "botzinger complex", "posterior cingulate", "cerebrum", "spinomesencephalic tract", "posterior part of lateral nucleus", "uncinate fasciculus", "cuneate nucleus", "dorsomedial hypothalamic nucleus", "nucleus ambiguus", "parietal lobe", "part of supraoptic nucleus", "subthalamus (hpa axis)", "pretectum", "vestibulocochlear nuclei (vestibular nuclei and cochlear nuclei) (viii)", "tegmentum", "cingulate cortex", "ventral striatum", "rubro-olivary tract", "third ventricle", "insula", "intralaminar nuclear group", "other nuclei of preoptic area", "caudal ventrolateral medulla", "crus cerebri", "supramarginal gyrus", "tectum", "interpeduncular nucleus", "entorhinal cortex", "anterior perforated substance", "rostral linear nucleus of the raphe", "adenohypophysis", "paracentral nucleus", "extrapyramidal system", "vagus", "perihypoglossal nuclei", "cerebral cortex", "rhomboidal nucleus", "inferior salivatory nucleus", "medial longitudinal fasciculus", "medial lemniscus", "metathalamus", "gyri", "bed nucleus of the stria terminalis", "tuberal region", "mesolimbic pathway", "anteroventral nucleus (a.k.a. 
ventral anterior nucleus)", "pyramidal tract", "ventrolateral prefrontal cortex", "lateral raphespinal tract", "pars intermedia (intermediate lobe)", "ventral pallidum", "habenular nuclei", "nucleus basalis", "stria terminalis", "primary motor cortex", "tuberal", "interstitiospinal tract", "laterodorsal tegmental nucleus", "vestibulospinal tract", "substantia innominata", "lateral corticospinal tract", "abducens", "metencephalon", "caudal linear nucleus", "habenular commissure", "ventral anterior nucleus", "claustrum", "rostral ventrolateral medulla", "primary somatosensory cortex", "superior frontal gyrus", "fourth ventricle", "facial nerve nucleus (vii)", "superior colliculi", "subinsular cortex", "fusiform gyrus", "paranigral nucleus", "parabrachial area", "perforant pathway", "fastigial nucleus", "inferior olivary nucleus", "accessory", "midline nuclear group", "dorsal raphe nucleus", "corticospinal tract or cerebrospinal fibers", "anterior hypothalamic nucleus", "inferior parietal lobule", "optic radiation", "medullary striae of fourth ventricle", "neurohypophysis", "ventral posterior nucleus", "pituitary", "midbrain reticular formation", "basal ganglia", "superior olivary complex", "mesencephalic duct (cerebral aqueduct, aqueduct of sylvius)", "medial nuclear group", "vestibulocochlear", "parts of preoptic area", "pineal body (pineal gland)", "superior parietal lobule", "lateral sulcus", "substantia nigra", "infundibulum", "lateral nuclear group", "inferior temporal gyrus", "inferior temporal cortex", "trochlear nucleus (iv)", "myoclonic triangle", "trochlear", "anterior corticospinal tract", "solitary tract", "corpora quadrigemina", "cerebellar peduncles", "oculomotor nucleus (iii)", "corpus callosum", "rhinencephalon", "globus pallidus", "corticobulbar tract", "pulvinar", "area postrema", "pars reticulata", "hypoglossal nucleus", "caudate nucleus", "diencephalon", "glossopharyngeal", "pars compacta", "anteromedial nucleus", "anterior nuclear group", 
"frontopontine fibers", "subcommissural organ", "lateral geniculate body", "prefrontal cortex", "olfactory tubercle", "occipital lobe", "gracile fasciculus", "tuberal part of lateral nucleus", "anterodorsal nucleus", "posterior inferior temporal cortex", "periventricular nucleus", "parabrachial pigmented nucleus", "anterior commissure", "parafascicular nucleus", "fornix", "medial geniculate body", "middle cerebellar peduncle", "secondary somatosensory cortex", "raphe nuclei", "parafacial zone", "tegmental pontine reticular nucleus", "edinger-westphal nucleus", "putamen", "oculomotor", "amygdala", "parahippocampal gyrus", "trapezoid body", "cerebral hemisphere", "hypothalamus", "ventral respiratory group or apneustic centre", "central sulcus", "pineal", "septal nuclei", "optic tract", "ventromedial prefrontal cortex", "epithalamus", "paraventricular nucleus", "olfactory bulb", "thalamocortical radiations", "retrotrapezoid nucleus", "dentate nucleus", "supplementary motor cortex", "medial superior olive", "posterior commissure", "anterior olfactory nucleus", "chemoreceptor trigger zone", "basal forebrain", "medial parabrachial nucleus", "caudate", "olivary body", "rostral interstitial nucleus of medial longitudinal fasciculus", "reuniens nucleus", "pontine nuclei", "amygdalofugal pathway", "angular gyrus", "nucleus retrofacialis", "mesencephalic cranial nerve nuclei", "olfactory tract", "middle frontal gyrus", "dorsal column\u2013medial lemniscus pathway", "lateral dorsal nucleus", "olfactory", "pedunculopontine nucleus", "precentral gyrus", "interthalamic adhesion", "cranial nerves", "subparabrachial nucleus", "centromedian nucleus", "emboliform nucleus", "vermis", "incertohypothalamic pathway", "cerebral peduncle", "perirhinal cortex", "abducens nucleus (vi)", "cingulate gyrus", "lateral posterior nucleus", "medulla", "superior longitudinal fasciculus", "lateral occipital gyrus", "dorsolateral prefrontal cortex", "spinocerebellar tract", "lateral tuberal nuclei", 
"suprachiasmatic nucleus", "interfascicular nucleus", "cortex", "middle frontal sulcus", "caudal pontine reticular nucleus", "circumventricular organs (also fourth ventricle)", "thalamic reticular nucleus", "lateral superior olive", "corona radiata", "tuber cinereum", "anterior spinothalamic tract", "lateral preoptic nucleus", "globose nucleus", "medial septal nuclei", "spinothalamic tract", "white matter", "olivocerebellar tract", "nucleus para-ambiguus", "mammillary nucleus", "supraoptic nucleus", "taenia thalami", "chief or pontine nucleus of the trigeminal nerve sensory nucleus (v)", "trigeminal", "olivospinal tract", "cerebellar hemispheres", "interhemispheric fissure", "temporal lobe", "premotor cortex", "prepositus nucleus", "internal capsule", "postcentral gyrus (primary somesthetic area)", "subthalamic nucleus", "medullary cranial nerve nuclei", "lateral area", "reticulospinal tract", "nucleus accumbens", "posterior parietal cortex", "frontal lobe", "rubrospinal tract", "posterior lobe", "midbrain (mesencephalon)", "periamygdaloid cortex", "tectospinal tract", "inferior frontal gyrus", "respiratory center-respiratory groups", "pons", "cerebellar nuclei", "subfornical organ", "medial dorsal nucleus", "stria medullaris", "tuberoinfundibular pathway", "medial nucleus of the trapezoid body", "thalamus", "superior cerebellar peduncle", "longitudinal cerebral fissure", "spinoreticular tract", "piriform cortex", "pontine cranial nerve nuclei", "pontine micturition center (barrington's nucleus)", "paramedian reticular nucleus", "motor cortex", "striatum", "motor nucleus for the trigeminal nerve (v)", "periaqueductal gray", "extreme capsule", "facial", "blood brain barrier", "mesocortical pathway", "paratenial nucleus", "orbitofrontal cortex", "anterior cingulate", "vascular organ of lamina terminalis", "periventricular preoptic nucleus", "red nucleus", "optic chiasm", "external capsule", "anterior lobe", "sublingual nucleus", "temporopontine fibers", "ventral 
posterior lateral nucleus", "corticomesencephalic tract", "precentral sulcus", "cuneate fasciculus", "dentate gyrus", "myelencephalon", "tuberal nucleus", "medial forebrain bundle", "ventral nuclear group", "cingulate sulcus", "centrum semiovale", "dorsomedial prefrontal cortex", "dorsal nucleus of vagus nerve", "inferior colliculi", "arcuate fasciculus", "cerebellum", "dorsal respiratory group", "gracile nucleus", "median eminence", "anterior part of lateral nucleus", "nucleus retroambiguus", "brain stem", "central lateral nucleus", "lateral lemniscus", "insular cortex", "interposed nucleus", "flocculonodular lobe", "ventromedial nucleus", "pontine respiratory group", "cerebrospinal fluid", "zona incerta", "postcentral sulcus", "intercalated nucleus", "medullary pyramids", "ventral posterior medial nucleus", "mammillotegmental fasciculus", "postcentral gyrus", "locus coeruleus", "nucleus incertus", "sylvian fissure", "median preoptic nucleus", "cerebellar vermis", "medial preoptic nucleus", "indusium griseum", "ventral lateral nucleus", "mammillary bodies", "pontine tegmentum", "lateral vestibulospinal tract", "superior temporal gyrus", "lateral parabrachial nucleus", "optic", "middle temporal gyrus", "subcortical", "major dopaminergic pathways from dopaminergic cell groups", "paramedian pontine reticular formation", "retrosplenial cortex", "medulla oblongata", "mammillary nuclei (part of mammillary bodies)", "inferior cerebellar peduncle", "spino-olivary tract", "uncus", "arcuate nucleus", "medial vestibulospinal tract", "solitary nucleus (nucleus of the solitary tract)", "ventral tegmental area"], "spine": [], "meninges": ["posterior horn", "leptomeningeal", "angular bundle", "foramen of magendie", "subdural", "fourth ventricle", "subarachnoid", "pia", "arachnoid septum", "ventricular system", "subarachnoid space", "interventricular foramina", "epidural space", "pia mater", "meningeal coverings", "pontine cistern", "calcar avis", "superior cistern", "dura 
mater", "arachnoid mater", "lateral ventricles", "arachnoid", "subdural space", "chiasmatic cistern", "cisterna magna", "cistern of lamina terminalis", "body of lateral ventricle", "foramina of luschka", "foramina", "inferior horn", "pachymeningeal", "subventricular zone", "interpeduncular cistern", "dura", "spinal subarachnoid space", "cerebral aqueduct", "anterior horn", "third ventricle"], "vascular": ["circle of willis", "circumventricular organs", "blood brain barrier", "middle cerebral artery", "basilar artery", "m4", "v2", "p2", "v4", "a1", "m2", "m3", "superior sagittal sinus", "p4", "p3", "v1", "m1", "anterior cerebral artery", "p1", "vertebral artery", "posterior cerebral artery", "a2", "glymphatic system", "v3", "a3", "superior mesenteric vein", "superior mesenteric artery", "aneurysm", "inferior mesenteric artery", "common femoral vein", "aorta", "sma", "common femoral artery", "iliac artery", "gda", "external iliac artery", "proper hepatic artery", "renal arteries", "ima", "renal artery", "common iliac artery", "common hepatic artery", "gastroduodenal artery", "splenic vein", "celiac axis", "internal iliac artery", "celiac artery", "portal vein", "celiac trunk"], "head": ["neck", "pharyngeal mucosal space", "masticator space", "carotid space", "oral cavity", "hypopharynx", "pharynx", "parotid space", "larynx", "parapharyngeal space", "nasopharynx", "perivertebral space", "oropharynx", "nasal cavity", "retropharyngeal space"], "liver": ["intrahepatic", "hepatic", "cirrhosis"], "biliary": ["right hepatic duct", "common bile duct", "gallbladder", "common hepatic duct", "left hepatic duct"], "spleen": ["splenic"], "pancreas": ["wirsung", "main pancreatic duct", "pancreatic "], "adrenal glands": ["adrenal", "pheochromocytoma", "adrenal adenoma"], "urinary system": ["renal cell carcinoma", "ureter", "renal angiomyolipoma", "renal", "kidneys", "oncocytoma", "bladder", "kidney", "urinary bladder", "rcc"], "gastrointestinal": ["small intestine", "duodenum", "ileum", "appendix", "esophagus", "descending colon", "transverse colon", "sigmoid", "ascending colon", "stomach", "cecum", "pylorus", "jejunum", "rectum", "colon", "antrum", "large intestine", "anus"], "peritoneum": ["pneumoperitoneum", "peritonitis", "ascites", "peritoneal carcinomatosis"], "retroperitoneum": [], "pelvis": ["ovary", "uterus", "uterine tubes", "fallopian tubes", "parametrium", "endometrium", "ovaries"], "lung": ["left lung", "right 
middle lobe", "right upper lobe", "right lung", "left upper lobe", "lingula", "right lower lobe", "left lower lobe"], "airway": ["bronchus intermedius", "bronchi", "main stem bronchus", "small airways", "trachea", "carina", "central airways"], "pleura": ["pneumothorax", "pleural effusion"], "mediastinum": ["mediastinal", "thymic", "thymus"], "heart": ["cardiac", "left atrium", "left ventricle", "right atrium", "right ventricle"], "breast": ["mammogram", "nipple", "mammary", "mammography", "subareolar"], "upper extremity": ["ulnar", "wrist", "radial", "carpal", "ulna", "metacarpal", "humerus", "radius", "hand", "shoulder", "elbow"], "lower extremity": ["femoral", "femur"], "lymphatic": ["lymphatics", "lad", "lymphadenopathy", "lymph node", "lymph nodes"], "soft tissues": ["soft tissues", "dermis", "subcutaneous fat", "epidermis", "skin"]}
|
utils_casemaker.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from nltk.tokenize import sent_tokenize
|
7 |
+
from typing import Dict, List, Sequence
|
8 |
+
from utils_report_parser import get_section_from_report
|
9 |
+
|
10 |
+
from transformers import (
|
11 |
+
AutoModelForTokenClassification,
|
12 |
+
AutoTokenizer,
|
13 |
+
pipeline,
|
14 |
+
)
|
15 |
+
|
16 |
+
@dataclass
class Report:
    """One report (or one organ-level slice of a report) for a patient."""

    # Identifier of the patient the report belongs to.
    patient_id: str|int
    # Report text.
    text: str
    # Value the reports are sorted by in format_casemaker_data.
    date: str
    # Trimmed text assigned by CaseMaker.parse_records; None until then.
    summary: str|None = None
|
22 |
+
|
23 |
+
def clean(s: str) -> str:
    """Flatten a text fragment into a single, single-spaced line.

    Newlines become spaces, underscores are dropped, anything enclosed in
    square brackets or parentheses is removed (non-greedy), and runs of
    whitespace are collapsed.

    Args:
        s (str): Raw text fragment

    Returns:
        str: The normalised text
    """
    flattened = s.replace("\n", " ").replace("_", "")
    # Strip "[...]" first, then "(...)", mirroring the original order.
    for enclosed in (r"\[.*?\]", r"\(.*?\)"):
        flattened = re.sub(enclosed, "", flattened)
    return " ".join(flattened.split())
|
30 |
+
|
31 |
+
|
32 |
+
def split_paragraphs(text: str) -> List[str]:
    """Split text on blank lines and keep only the substantial paragraphs.

    Each paragraph is normalised with clean(); paragraphs of ten words or
    fewer after cleaning are discarded.

    Args:
        text (str): Full report text

    Returns:
        List[str]: Cleaned paragraphs with more than ten words each
    """
    cleaned_chunks = (clean(chunk) for chunk in text.split("\n\n"))
    return [chunk for chunk in cleaned_chunks if len(chunk.split()) > 10]
|
37 |
+
|
38 |
+
|
39 |
+
def format_casemaker_data(
    df: pd.DataFrame, patient_id_column: str, text_column: str, date_column: str
) -> Dict[str, List["Report"]]:
    """Group report-level rows into per-patient lists of Report objects.

    Takes a dataframe where each row is one report and returns a dictionary
    keyed by patient id (as a string) whose values are that patient's reports
    sorted by the date column.

    Args:
        df (pd.DataFrame): Input dataframe on report level
        patient_id_column (str): Name of the patient ID column
        text_column (str): Name of the text/report column
        date_column (str): Name of the date column (used to sort)

    Returns:
        Dict[str, List[Report]]: Reports per patient; empty if df has no rows
    """
    df = df.rename(
        columns={
            patient_id_column: "patient_id",
            text_column: "text",
            date_column: "date",
        }
    ).sort_values(by=["patient_id", "date"])

    reports_by_patient: Dict[str, List["Report"]] = {}
    # Iterate the groups directly instead of groupby().apply(): apply() no
    # longer reliably passes the grouping column ("patient_id") to the callback
    # in newer pandas releases, and row order inside each group is preserved.
    for patient_id, patient_rows in df.groupby("patient_id"):
        records = patient_rows[["patient_id", "text", "date"]].to_dict("records")
        reports_by_patient[str(patient_id)] = [Report(**record) for record in records]
    return reports_by_patient
|
70 |
+
|
71 |
+
|
72 |
+
class CaseMaker:
    """Split patient reports by organ and keep the clinically relevant sentences.

    Organs are recognised through a keyword dictionary loaded from JSON
    (organ name -> list of keywords); relevant sentences are selected with a
    biomedical NER pipeline.
    """

    def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
        """Load the organ keyword dictionary and build the NER pipeline.

        Args:
            organ_keywords_dict_path (str): Path to the organ keyword JSON file
        """
        # Context manager so the file handle is closed deterministically.
        with open(organ_keywords_dict_path, "r", encoding="utf-8") as terms_file:
            self.organ_keyword_dict = json.load(terms_file)
        self._organ_patterns = self._build_organ_patterns()

        self.ner_pipe = pipeline(
            "ner",
            model=AutoModelForTokenClassification.from_pretrained(
                "d4data/biomedical-ner-all"
            ),
            tokenizer=AutoTokenizer.from_pretrained("d4data/biomedical-ner-all"),
            aggregation_strategy="simple",
            device_map="auto",
        )
        # The summarisation pipeline ("text2text-generation" with model
        # "starmpcc/Asclepius-7B") is currently disabled. Assign a pipeline to
        # self.summ_pipe to enable summarize_report().
        self.summ_pipe = None

    def _build_organ_patterns(self) -> Dict:
        """Compile one whole-term regex per organ from organ_keyword_dict."""
        patterns = {}
        for organ, keywords in self.organ_keyword_dict.items():
            # Keywords are stripped because some entries carry stray whitespace.
            terms = [term.strip().lower() for term in [organ, *keywords]]
            terms = [re.escape(term) for term in terms if term]
            if terms:
                # Lookarounds rather than \b so terms that end in punctuation
                # (e.g. "(viii)") still match as a unit.
                patterns[organ] = re.compile(
                    r"(?<!\w)(?:" + "|".join(terms) + r")(?!\w)"
                )
        return patterns

    def standardize_organ(self, organ_entity: Dict) -> Dict:
        """Map an entity's name onto the organ names in organ_keyword_dict.

        The entity is matched case-insensitively against each organ name and
        exactly against that organ's keyword list. The input dictionary is
        modified in place and also returned.

        Args:
            organ_entity (Dict): Entity dictionary; must contain a "word" key

        Returns:
            Dict: The same dictionary with "word" set to the matching organ, or
                with "word" set to "Other" and "score" set to 0.0 if none match
        """
        word = organ_entity["word"].lower()
        for organ, keywords in self.organ_keyword_dict.items():
            if word == organ.lower() or word in keywords:
                organ_entity["word"] = organ
                return organ_entity

        # No match: mark it as a bad match by zeroing the score.
        organ_entity["word"] = "Other"
        organ_entity["score"] = 0.0
        return organ_entity

    def pick_organ_by_keyword(self, s: str) -> str:
        """Return the first organ whose name or keywords occur in the text.

        Matching is case-insensitive and on whole terms, so a short keyword
        such as "lad" does not fire inside "bladder". Organs are tried in the
        order of organ_keyword_dict. The compiled patterns are cached on first
        use; rebuild them with _build_organ_patterns() if organ_keyword_dict
        is changed afterwards.

        Args:
            s (str): Text to classify

        Returns:
            str: The matching organ name, or "other" if nothing matches
        """
        patterns = self.__dict__.get("_organ_patterns")
        if patterns is None:
            patterns = self._organ_patterns = self._build_organ_patterns()

        lowered = s.lower()
        for organ, pattern in patterns.items():
            if pattern.search(lowered):
                return organ
        return "other"

    def parse_report_by_organ(self, report: str) -> Dict[str, str]:
        """Take in a text report and group its paragraphs by body organ.

        Args:
            report (str): Input report

        Returns:
            Dict[str, str]: For each organ found, its paragraphs joined by a space
        """
        paragraphs_by_organ: Dict[str, List[str]] = {}
        for paragraph in split_paragraphs(report):
            organ = self.pick_organ_by_keyword(paragraph)
            paragraphs_by_organ.setdefault(organ, []).append(paragraph)

        # Join with a space so the last word of one paragraph is not fused with
        # the first word of the next before sentence tokenisation.
        return {
            organ: " ".join(paragraphs)
            for organ, paragraphs in paragraphs_by_organ.items()
        }

    def trim_to_relevant_portion(self, report: str) -> str:
        """Keep only the sentences of the findings that mention a symptom or disease.

        Args:
            report (str): Report text

        Returns:
            str: Relevant sentences joined by newlines; empty if there are none
        """
        # Cut the report to the findings
        report = get_section_from_report(report, "findings")

        relevant_labels = ("Sign_symptom", "Disease_disorder")
        relevant_sentences = [
            sentence
            for sentence in sent_tokenize(report)
            if any(
                entity["entity_group"] in relevant_labels
                for entity in self.ner_pipe(sentence)
            )
        ]
        return "\n".join(relevant_sentences)

    def summarize_report(self, text: str) -> str:
        """Format text into prompt and summarize clinical text

        Args:
            text (str): Input report

        Returns:
            str: The generated text after the "[Instruction End]" marker, cleaned

        Raises:
            RuntimeError: If no summarisation pipeline is set on self.summ_pipe
        """
        summ_pipe = getattr(self, "summ_pipe", None)
        if summ_pipe is None:
            raise RuntimeError(
                "No summarisation pipeline is configured; set `summ_pipe` "
                "before calling summarize_report()."
            )

        question = (
            "Can you provide a succinct summary of the key clinical findings "
            "and treatment recommendations outlined in this discharge summary?"
        )

        # The template is left-aligned so no source indentation leaks into the
        # prompt. NOTE(review): the wording (including "languge") is kept
        # verbatim; it looks like a model-specific template -- confirm before
        # correcting the spelling.
        prompt = """
You are an intelligent clinical languge model.
Below is a snippet of patient's discharge summary and a following instruction from healthcare professional.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.

[Discharge Summary Begin]
{note}
[Discharge Summary End]

[Instruction Begin]
{question}
[Instruction End]
""".format(
            question=question, note=text
        )

        # Budget roughly half the input length, but never zero tokens.
        max_new_tokens = max(1, len(text.split()) // 2)
        output = summ_pipe(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
        answer = output.split("[Instruction End]")[-1]
        return clean(answer)

    def parse_records(
        self,
        reports: Sequence["Report"],
    ) -> Dict[str, List["Report"]]:
        """Split each report by organ and keep the relevant text per organ.

        Every report is split with parse_report_by_organ, producing one
        organ-level Report per organ mentioned. Each organ-level report is then
        trimmed with trim_to_relevant_portion; the trimmed text is stored in
        its `summary` field and reports with nothing relevant are dropped.

        Args:
            reports (Sequence[Report]): The patient's reports

        Returns:
            Dict[str, List[Report]]: For each organ, the organ-level reports that
                contain relevant text (possibly an empty list)
        """
        # For each organ, collect the organ-level slices across all reports
        reports_by_organ: Dict[str, List["Report"]] = {}
        for report in reports:
            for organ, organ_text in self.parse_report_by_organ(report.text).items():
                reports_by_organ.setdefault(organ, []).append(
                    Report(
                        text=organ_text,
                        date=report.date,
                        patient_id=report.patient_id,
                    )
                )

        # For each organ, keep only the reports with relevant content
        summarized_reports_by_organ: Dict[str, List["Report"]] = {}
        for organ, organ_reports in reports_by_organ.items():
            kept_reports = []
            for organ_report in organ_reports:
                relevant_text = self.trim_to_relevant_portion(organ_report.text)
                if relevant_text:
                    organ_report.summary = relevant_text
                    kept_reports.append(organ_report)
            summarized_reports_by_organ[organ] = kept_reports

        return summarized_reports_by_organ

    def format_reports(self, all_reports: Dict[str, Sequence["Report"]]) -> Dict[str, str]:
        """Render each organ's reports as one markdown string.

        Args:
            all_reports (Dict[str, Sequence[Report]]): Reports per organ; each
                report must expose `date` and `summary` attributes

        Returns:
            Dict[str, str]: For each organ, its reports as "**Report <date>**"
                headings followed by the summary, separated by blank lines
        """
        return {
            organ: "\n\n".join(
                f"**Report {str(r.date)}**\n\n{str(r.summary)}" for r in organ_reports
            )
            for organ, organ_reports in all_reports.items()
        }
|
utils_report_parser.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def get_section_from_report(report: str, section: str) -> str:
    """Return the part of a report that follows a section header.

    The header is looked up first as "<section>:" in any letter case, then as
    the bare upper-case section name. If neither is present the whole report
    is returned unchanged.

    Args:
        report: Full report text.
        section: Section name without the trailing colon, e.g. "findings".

    Returns:
        The text after the first matching header, or `report` itself when no
        header is found.
    """
    import re

    # Case-insensitive search on the original string, so the match offsets
    # always index into `report` itself.
    labelled = re.search(re.escape(section) + ":", report, flags=re.IGNORECASE)
    if labelled is not None:
        return report[labelled.end():]

    # Fall back to an upper-case heading written without a colon.
    heading = section.upper()
    heading_idx = report.find(heading)
    if heading_idx != -1:
        return report[heading_idx + len(heading):]

    # No header at all: keep the full text rather than truncating it.
    return report
|