ljyflores committed
Commit: 2b98370 • 1 Parent(s): facce4e
Turn reports into a table, remove header names, and use the organ of the previous sentence when none is found
Files changed:
- __pycache__/utils_casemaker.cpython-310.pyc +0 -0
- __pycache__/utils_report_parser.cpython-310.pyc +0 -0
- app.py +7 -3
- utils_casemaker.py +48 -47
- utils_report_parser.py +4 -11
__pycache__/utils_casemaker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils_casemaker.cpython-310.pyc and b/__pycache__/utils_casemaker.cpython-310.pyc differ
__pycache__/utils_report_parser.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils_report_parser.cpython-310.pyc and b/__pycache__/utils_report_parser.cpython-310.pyc differ
app.py
CHANGED
@@ -3,6 +3,8 @@ import streamlit as st
 
 from utils_casemaker import CaseMaker, format_casemaker_data
 
+st.set_page_config(layout="wide")
+
 st.title("Juni Health Patient Casemaker")
 
 casemaker = CaseMaker("terms.json")
@@ -38,9 +40,11 @@ if uploaded_file is not None:
     col1, col2 = st.columns(2)
     with col1:
         st.subheader("Original")
-        for ...
-        ...
-        ...
+        display_table = pd.DataFrame.from_records([item.dict() for item in reports[selected_patient_id]])
+        display_table = display_table[["date", "text"]]
+        display_table["text"] = display_table["text"].apply(lambda s: casemaker.remove_header_names(s))
+        display_table = display_table.rename(columns={"date": "ID/Date", "text": "Report"})
+        st.table(display_table)
 
     with col2:
         st.subheader("With Casemaker")
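For context, a minimal self-contained sketch of what the new "Original" column does (toy records stand in for the real Report objects; note that pd must already be bound to pandas in app.py, since the visible hunks do not add that import):

import pandas as pd
import streamlit as st

# Hypothetical records shaped like Report.dict() output.
toy_records = [
    {"date": "2023-01-01", "text": "Lungs are clear.", "summary": "None"},
    {"date": "2023-02-01", "text": "No acute disease.", "summary": "None"},
]

display_table = pd.DataFrame.from_records(toy_records)
display_table = display_table[["date", "text"]]  # keep only the displayed columns
display_table = display_table.rename(columns={"date": "ID/Date", "text": "Report"})
st.table(display_table)  # renders a static table, unlike the scrollable st.dataframe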
utils_casemaker.py
CHANGED
@@ -5,9 +5,9 @@ import re
 
 nltk.download('punkt')
 
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from nltk.tokenize import sent_tokenize
-from typing import Dict, List, Sequence
+from typing import Dict, List, Mapping, Sequence
 from utils_report_parser import get_section_from_report
 
 from transformers import (
@@ -24,6 +24,9 @@ class Report:
     date: str
     summary: str | None = None
 
+    def dict(self):
+        return {k: str(v) for k, v in asdict(self).items()}
+
 
 def clean(s: str) -> str:
     s = s.replace("\n", " ")  # Concatenate into one string
@@ -35,19 +38,16 @@ def clean(s: str) -> str:
 
 
 def split_into_sentences(text: str):
-    # Split paragraphs
-    paragraphs = text.split("\n\n")
-    paragraphs = list(map(clean, paragraphs))
-    paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
-
     # Split into sentences
-    ...
+    return sent_tokenize(text)
+
+
+def remove_keyword(text: str, keyword: str):
+    start_idx = text.lower().find(keyword.lower())
+    if start_idx > -1:
+        substring_to_replace = text[start_idx: start_idx+len(keyword)]
+        text = text.replace(substring_to_replace, " ")
+    return text
 
 
 def format_casemaker_data(
@@ -85,7 +85,7 @@ def format_casemaker_data(
 
 class CaseMaker:
     def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
-        self.organ_keyword_dict = json.load(open(organ_keywords_dict_path, "r"))
+        self.organ_keyword_dict: Mapping[str, list[str]] = json.load(open(organ_keywords_dict_path, "r"))
 
         self.ner_pipe = pipeline(
             "ner",
@@ -100,30 +100,22 @@ class CaseMaker:
         # "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
         # )
 
-    def ...
-    ...
-    for ...
-    ...
-    ):
-            organ_entity["word"] = key
-            return organ_entity
-        # Otherwise, it's a bad match so set the score to 0 and return other
-        organ_entity["word"] = "Other"
-        organ_entity["score"] = 0.0
-
-        return organ_entity
+    def filter_out_irrelevant_sentences(self, lst: list[str]):
+        bad_keywords = [
+            "date of procedure", "physicians", "report initiated by",
+            "reported by", "reported and signed by"
+        ]
+        return [s for s in lst if not any([s.lower().startswith(k) for k in bad_keywords])]
+
+    def remove_header_names(self, s: str):
+        headers = [
+            "IMPRESSION", "FINDINGS", "RECOMMENDATION",
+            "COMPARISON", "INDICATION", "TECHNIQUE", "STUDY",
+            "MEDICATIONS", "TECHNIQUE AND FINDINGS"
+        ]
+        for header in headers:
+            s = remove_keyword(s, f"{header}:")
+        return s
 
     def pick_organ_by_keyword(self, s: str):
         words = s.lower()
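A quick standalone check of the remove_keyword helper added above, driven the same way remove_header_names drives it (trimmed header list for brevity). One subtlety worth noting: str.replace substitutes every occurrence of the matched substring, not just the one located by find:

def remove_keyword(text: str, keyword: str):
    # Case-insensitive search; replace the matched span with a space.
    start_idx = text.lower().find(keyword.lower())
    if start_idx > -1:
        substring_to_replace = text[start_idx: start_idx+len(keyword)]
        text = text.replace(substring_to_replace, " ")
    return text

s = "TECHNIQUE: CT chest. FINDINGS: Lungs are clear."
for header in ["IMPRESSION", "FINDINGS", "RECOMMENDATION"]:
    s = remove_keyword(s, f"{header}:")
print(s)  # 'TECHNIQUE: CT chest.   Lungs are clear.'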
@@ -135,7 +127,7 @@ class CaseMaker:
             ]
         ):
             return organ
-        return
+        return None
 
     def parse_report_by_organ(self, report: str):
         """Take in a text report and output a dictionary of body organs
@@ -147,17 +139,23 @@ class CaseMaker:
         report_string_by_organ = dict[str, str]()
 
         # Split the report into a list of sentences
-
+        sentences = split_into_sentences(report)
+        # Filter out irrelevant sentences using rules
+        sentences = self.filter_out_irrelevant_sentences(sentences)
         # Collect a list of paragraphs related to each organ
-
+        previous_sentence_organ = "Other"
+        for s in sentences:
             # Figure out which organ is being referenced
-            selected_organ = self.pick_organ_by_keyword(
-
+            selected_organ = self.pick_organ_by_keyword(s)
+            if selected_organ is None:
+                selected_organ = previous_sentence_organ
+            else:
+                previous_sentence_organ = selected_organ
             # Concatenate the report to its corresponding organ
             if selected_organ not in report_string_by_organ:
-                report_string_by_organ[selected_organ] =
+                report_string_by_organ[selected_organ] = s
             else:
-                report_string_by_organ[selected_organ] +=
+                report_string_by_organ[selected_organ] += f" {s}"
 
         return report_string_by_organ
 
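The previous-sentence fallback added to parse_report_by_organ, walked through on toy input (pick_organ below is a stand-in for CaseMaker.pick_organ_by_keyword, with a hypothetical subset of the terms.json keyword dict):

def pick_organ(sentence: str):
    keywords = {"lungs": "Lung", "heart": "Heart"}  # hypothetical keyword dict
    for kw, organ in keywords.items():
        if kw in sentence.lower():
            return organ
    return None

sentences = ["The lungs are clear.", "No nodules seen.", "Heart size is normal."]
report_string_by_organ: dict[str, str] = {}
previous_sentence_organ = "Other"
for s in sentences:
    selected_organ = pick_organ(s)
    if selected_organ is None:
        # No keyword hit: attribute the sentence to the previous organ.
        selected_organ = previous_sentence_organ
    else:
        previous_sentence_organ = selected_organ
    if selected_organ not in report_string_by_organ:
        report_string_by_organ[selected_organ] = s
    else:
        report_string_by_organ[selected_organ] += f" {s}"

print(report_string_by_organ)
# {'Lung': 'The lungs are clear. No nodules seen.', 'Heart': 'Heart size is normal.'}

Without the fallback, "No nodules seen." would have landed under "Other".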
@@ -232,7 +230,10 @@ class CaseMaker:
         for report in reports:
             # Cut the report to the findings
             report_findings = get_section_from_report(report.text, "findings")
-
+
+            # Remove any other keywords
+            report_findings = self.remove_header_names(report_findings)
+
             # For each organ, collect a list of relevant records containing the text and date
             report_by_organ = self.parse_report_by_organ(report_findings)
             for organ, report_text in report_by_organ.items():
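The Report.dict method added earlier exists to feed this pipeline's output into the new app.py table; a minimal reproduction (the real Report also has at least a text field, used above):

from dataclasses import asdict, dataclass

@dataclass
class Report:
    text: str
    date: str
    summary: str | None = None

    def dict(self):
        # Stringify every field so the values render cleanly in st.table;
        # note that a None summary becomes the string 'None'.
        return {k: str(v) for k, v in asdict(self).items()}

print(Report(text="Lungs are clear.", date="2023-01-01").dict())
# {'text': 'Lungs are clear.', 'date': '2023-01-01', 'summary': 'None'}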
utils_report_parser.py
CHANGED
@@ -1,20 +1,13 @@
 def get_section_from_report(report: str, section: str):
-    section_upper = section.upper()
     section_lower = section.lower()
-    findings_start_idx = report.lower().find(f"{section_lower}:") + len(
-        f"{section_lower}:"
-    )
+    findings_start_idx = report.lower().find(f"{section_lower}:")
 
-    if findings_start_idx ...
+    if findings_start_idx > -1:
         findings_start_idx = report.lower().find(f"{section_lower}:") + len(
             f"{section_lower}:"
         )
-    if findings_start_idx == -1:
-        findings_start_idx = report.find(f"{section_upper}") + len(f"{section_upper}")
-
-    if findings_start_idx == -1:
-        findings = report
-    else:
         findings = report[findings_start_idx:]
+    else:
+        findings = report
 
     return findings
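The patched function, exercised end to end: when the "section:" marker is found, everything after it is returned; otherwise the report comes back unchanged. (In the removed uppercase fallback, len(section) was added to find's result, so a miss never stayed at -1 and could slice from a bogus index.)

def get_section_from_report(report: str, section: str):
    section_lower = section.lower()
    findings_start_idx = report.lower().find(f"{section_lower}:")

    if findings_start_idx > -1:
        # Skip past the marker itself before slicing.
        findings_start_idx = report.lower().find(f"{section_lower}:") + len(
            f"{section_lower}:"
        )
        findings = report[findings_start_idx:]
    else:
        findings = report

    return findings

print(get_section_from_report("TECHNIQUE: CT. FINDINGS: Clear lungs.", "findings"))
# ' Clear lungs.'
print(get_section_from_report("No section markers here.", "findings"))
# 'No section markers here.'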