ljyflores committed on
Commit
2b98370
1 Parent(s): facce4e

Turn reports into table, remove header, use logic to use organ of previous sentence

Browse files
__pycache__/utils_casemaker.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils_casemaker.cpython-310.pyc and b/__pycache__/utils_casemaker.cpython-310.pyc differ
 
__pycache__/utils_report_parser.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils_report_parser.cpython-310.pyc and b/__pycache__/utils_report_parser.cpython-310.pyc differ
 
app.py CHANGED
@@ -3,6 +3,8 @@ import streamlit as st
3
 
4
  from utils_casemaker import CaseMaker, format_casemaker_data
5
 
 
 
6
  st.title("Juni Health Patient Casemaker")
7
 
8
  casemaker = CaseMaker("terms.json")
@@ -38,9 +40,11 @@ if uploaded_file is not None:
38
  col1, col2 = st.columns(2)
39
  with col1:
40
  st.subheader("Original")
41
- for report in reports[selected_patient_id]:
42
- st.write(f"**Report {report.date}**")
43
- st.write(report.text)
 
 
44
 
45
  with col2:
46
  st.subheader("With Casemaker")
 
3
 
4
  from utils_casemaker import CaseMaker, format_casemaker_data
5
 
6
+ st.set_page_config(layout="wide")
7
+
8
  st.title("Juni Health Patient Casemaker")
9
 
10
  casemaker = CaseMaker("terms.json")
 
40
  col1, col2 = st.columns(2)
41
  with col1:
42
  st.subheader("Original")
43
+ display_table = pd.DataFrame.from_records([item.dict() for item in reports[selected_patient_id]])
44
+ display_table = display_table[["date", "text"]]
45
+ display_table["text"] = display_table["text"].apply(lambda s: casemaker.remove_header_names(s))
46
+ display_table = display_table.rename(columns={"date": "ID/Date", "text": "Report"})
47
+ st.table(display_table)
48
 
49
  with col2:
50
  st.subheader("With Casemaker")
utils_casemaker.py CHANGED
@@ -5,9 +5,9 @@ import re
5
 
6
  nltk.download('punkt')
7
 
8
- from dataclasses import dataclass
9
  from nltk.tokenize import sent_tokenize
10
- from typing import Dict, List, Sequence
11
  from utils_report_parser import get_section_from_report
12
 
13
  from transformers import (
@@ -24,6 +24,9 @@ class Report:
24
  date: str
25
  summary: str | None = None
26
 
 
 
 
27
 
28
  def clean(s: str) -> str:
29
  s = s.replace("\n", " ") # Concatenate into one string
@@ -35,19 +38,16 @@ def clean(s: str) -> str:
35
 
36
 
37
  def split_into_sentences(text: str):
38
- # Split paragraphs
39
- paragraphs = text.split("\n\n")
40
- paragraphs = list(map(clean, paragraphs))
41
- paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
42
-
43
  # Split into sentences
44
- sentences = [sent_tokenize(p) for p in paragraphs]
45
- sentences = [
46
- sent
47
- for lst in sentences
48
- for sent in lst if isinstance(sent, str)
49
- ]
50
- return sentences
 
 
51
 
52
 
53
  def format_casemaker_data(
@@ -85,7 +85,7 @@ def format_casemaker_data(
85
 
86
  class CaseMaker:
87
  def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
88
- self.organ_keyword_dict = json.load(open(organ_keywords_dict_path, "r"))
89
 
90
  self.ner_pipe = pipeline(
91
  "ner",
@@ -100,30 +100,22 @@ class CaseMaker:
100
  # "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
101
  # )
102
 
103
- def standardize_organ(self, organ_entity: Dict) -> Dict:
104
- """Given an entity, map its name to a set of recognized entities provided in
105
- organ_keyword_dict if it matches any of the keywords; otherwise set it as "Other"
106
-
107
- Args:
108
- organ_entity (Dict): Dictionary corresponding to entity; should contain "word" key
109
- which is the entity
110
-
111
- Returns:
112
- Dict: Same dictionary where the "word" key has been updated to either a set of standard
113
- body organs or "Other"
114
- """
115
- # If the organ matches any of the keys or their synonyms, replace the name and return
116
- for key in self.organ_keyword_dict:
117
- if (organ_entity["word"].lower() == key.lower()) or (
118
- organ_entity["word"].lower() in self.organ_keyword_dict[key]
119
- ):
120
- organ_entity["word"] = key
121
- return organ_entity
122
- # Otherwise, it's a bad match so set the score to 0 and return other
123
- organ_entity["word"] = "Other"
124
- organ_entity["score"] = 0.0
125
-
126
- return organ_entity
127
 
128
  def pick_organ_by_keyword(self, s: str):
129
  words = s.lower()
@@ -135,7 +127,7 @@ class CaseMaker:
135
  ]
136
  ):
137
  return organ
138
- return "other"
139
 
140
  def parse_report_by_organ(self, report: str):
141
  """Take in a text report and output a dictionary of body organs
@@ -147,17 +139,23 @@ class CaseMaker:
147
  report_string_by_organ = dict[str, str]()
148
 
149
  # Split the report into a list of sentences
150
- paragraphs = split_into_sentences(report)
 
 
151
  # Collect a list of paragraphs related to each organ
152
- for p in paragraphs:
 
153
  # Figure out which organ is being referenced
154
- selected_organ = self.pick_organ_by_keyword(p)
155
-
 
 
 
156
  # Concatenate the report to its corresponding organ
157
  if selected_organ not in report_string_by_organ:
158
- report_string_by_organ[selected_organ] = p
159
  else:
160
- report_string_by_organ[selected_organ] += p
161
 
162
  return report_string_by_organ
163
 
@@ -232,7 +230,10 @@ class CaseMaker:
232
  for report in reports:
233
  # Cut the report to the findings
234
  report_findings = get_section_from_report(report.text, "findings")
235
-
 
 
 
236
  # For each organ, collect a list of relevant records containing the text and date
237
  report_by_organ = self.parse_report_by_organ(report_findings)
238
  for organ, report_text in report_by_organ.items():
 
5
 
6
  nltk.download('punkt')
7
 
8
+ from dataclasses import asdict, dataclass
9
  from nltk.tokenize import sent_tokenize
10
+ from typing import Dict, List, Mapping, Sequence
11
  from utils_report_parser import get_section_from_report
12
 
13
  from transformers import (
 
24
  date: str
25
  summary: str | None = None
26
 
27
def dict(self):
    """Return this dataclass instance as a plain mapping of strings.

    Field names are kept as keys; every value is passed through ``str``,
    so a ``None`` field becomes the string ``"None"``.
    """
    stringified = {}
    for field_name, field_value in asdict(self).items():
        stringified[field_name] = str(field_value)
    return stringified
29
+
30
 
31
  def clean(s: str) -> str:
32
  s = s.replace("\n", " ") # Concatenate into one string
 
38
 
39
 
40
def split_into_sentences(text: str):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    The text is handed to the tokenizer as-is; no cleaning or paragraph
    filtering is applied here.
    """
    sentences = sent_tokenize(text)
    return sentences
43
+
44
+
45
def remove_keyword(text: str, keyword: str):
    """Replace every occurrence of ``keyword`` in ``text`` with one space.

    Matching is case-insensitive and literal (regex metacharacters in
    ``keyword`` have no special meaning).

    Args:
        text: The string to scrub.
        keyword: The literal substring to remove.

    Returns:
        ``text`` with each match replaced by ``" "``. The input is returned
        unchanged when ``keyword`` is empty or does not occur.
    """
    import re  # local import so the helper does not depend on file-level imports

    # An empty keyword "matches" at every position; replacing it would
    # scatter spaces between all characters, so treat it as a no-op.
    if not keyword:
        return text

    # Remove all casings of the keyword, not only the casing of the first
    # hit, so "Findings:" and "FINDINGS:" are handled the same way.
    return re.sub(re.escape(keyword), " ", text, flags=re.IGNORECASE)
51
 
52
 
53
  def format_casemaker_data(
 
85
 
86
  class CaseMaker:
87
  def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
88
+ self.organ_keyword_dict: Mapping[str, list[str]] = json.load(open(organ_keywords_dict_path, "r"))
89
 
90
  self.ner_pipe = pipeline(
91
  "ner",
 
100
  # "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
101
  # )
102
 
103
def filter_out_irrelevant_sentences(self, lst: list[str]):
    """Drop sentences that begin with a known administrative phrase.

    A sentence is discarded when its lower-cased text starts with one of
    the prefixes below; all other sentences are returned in their
    original order.
    """
    bad_prefixes = (
        "date of procedure",
        "physicians",
        "report initiated by",
        "reported by",
        "reported and signed by",
    )
    kept = []
    for sentence in lst:
        # str.startswith accepts a tuple and checks every prefix in one call.
        if sentence.lower().startswith(bad_prefixes):
            continue
        kept.append(sentence)
    return kept
109
+
110
def remove_header_names(self, s: str):
    """Strip known report section headers (e.g. ``"FINDINGS:"``) from ``s``.

    Each header name is suffixed with a colon and handed to
    ``remove_keyword``, which blanks it out of the text.

    Args:
        s: Report text that may contain section headers.

    Returns:
        The text with the listed ``"<HEADER>:"`` markers removed.
    """
    headers = [
        "IMPRESSION", "FINDINGS", "RECOMMENDATION",
        "COMPARISON", "INDICATION", "TECHNIQUE", "STUDY",
        "MEDICATIONS", "TECHNIQUE AND FINDINGS"
    ]
    # Process the longest headers first: "TECHNIQUE AND FINDINGS:" ends with
    # "FINDINGS:", so removing the short header first would prevent the
    # compound header from matching and leave "TECHNIQUE AND" in the text.
    for header in sorted(headers, key=len, reverse=True):
        s = remove_keyword(s, f"{header}:")
    return s
 
 
 
 
 
 
 
 
119
 
120
  def pick_organ_by_keyword(self, s: str):
121
  words = s.lower()
 
127
  ]
128
  ):
129
  return organ
130
+ return None
131
 
132
  def parse_report_by_organ(self, report: str):
133
  """Take in a text report and output a dictionary of body organs
 
139
  report_string_by_organ = dict[str, str]()
140
 
141
  # Split the report into a list of sentences
142
+ sentences = split_into_sentences(report)
143
+ # Filter out irrelevant sentences using rules
144
+ sentences = self.filter_out_irrelevant_sentences(sentences)
145
  # Collect a list of paragraphs related to each organ
146
+ previous_sentence_organ = "Other"
147
+ for s in sentences:
148
  # Figure out which organ is being referenced
149
+ selected_organ = self.pick_organ_by_keyword(s)
150
+ if selected_organ is None:
151
+ selected_organ = previous_sentence_organ
152
+ else:
153
+ previous_sentence_organ = selected_organ
154
  # Concatenate the report to its corresponding organ
155
  if selected_organ not in report_string_by_organ:
156
+ report_string_by_organ[selected_organ] = s
157
  else:
158
+ report_string_by_organ[selected_organ] += f" {s}"
159
 
160
  return report_string_by_organ
161
 
 
230
  for report in reports:
231
  # Cut the report to the findings
232
  report_findings = get_section_from_report(report.text, "findings")
233
+
234
+ # Remove any other keywords
235
+ report_findings = self.remove_header_names(report_findings)
236
+
237
  # For each organ, collect a list of relevant records containing the text and date
238
  report_by_organ = self.parse_report_by_organ(report_findings)
239
  for organ, report_text in report_by_organ.items():
utils_report_parser.py CHANGED
@@ -1,20 +1,13 @@
1
  def get_section_from_report(report: str, section: str):
2
- section_upper = section.upper()
3
  section_lower = section.lower()
4
- findings_start_idx = report.lower().find(f"{section_lower}:") + len(
5
- f"{section_lower}:"
6
- )
7
 
8
- if findings_start_idx == -1:
9
  findings_start_idx = report.lower().find(f"{section_lower}:") + len(
10
  f"{section_lower}:"
11
  )
12
- if findings_start_idx == -1:
13
- findings_start_idx = report.find(f"{section_upper}") + len(f"{section_upper}")
14
-
15
- if findings_start_idx == -1:
16
- findings = report
17
- else:
18
  findings = report[findings_start_idx:]
 
 
19
 
20
  return findings
 
1
def get_section_from_report(report: str, section: str):
    """Return the text of ``report`` that follows the ``"<section>:"`` marker.

    The marker is searched for case-insensitively and the first occurrence
    wins. When the marker is not present, the whole report is returned
    unchanged.
    """
    marker = f"{section.lower()}:"
    marker_pos = report.lower().find(marker)

    # No such section header: fall back to the full report.
    if marker_pos == -1:
        return report

    # Skip past the marker itself so only the section body remains.
    return report[marker_pos + len(marker):]