ljyflores committed
Commit: 2b98370 • 1 Parent(s): facce4e
Turn reports into a table, remove header names, and use the organ of the previous sentence when none is found
Files changed:
- __pycache__/utils_casemaker.cpython-310.pyc +0 -0
- __pycache__/utils_report_parser.cpython-310.pyc +0 -0
- app.py +7 -3
- utils_casemaker.py +48 -47
- utils_report_parser.py +4 -11
__pycache__/utils_casemaker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils_casemaker.cpython-310.pyc and b/__pycache__/utils_casemaker.cpython-310.pyc differ
__pycache__/utils_report_parser.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils_report_parser.cpython-310.pyc and b/__pycache__/utils_report_parser.cpython-310.pyc differ
app.py
CHANGED
@@ -3,6 +3,8 @@ import streamlit as st
 
 from utils_casemaker import CaseMaker, format_casemaker_data
 
+st.set_page_config(layout="wide")
+
 st.title("Juni Health Patient Casemaker")
 
 casemaker = CaseMaker("terms.json")
@@ -38,9 +40,11 @@ if uploaded_file is not None:
     col1, col2 = st.columns(2)
     with col1:
         st.subheader("Original")
-        for ...
-        ...
-        ...
+        display_table = pd.DataFrame.from_records([item.dict() for item in reports[selected_patient_id]])
+        display_table = display_table[["date", "text"]]
+        display_table["text"] = display_table["text"].apply(lambda s: casemaker.remove_header_names(s))
+        display_table = display_table.rename(columns={"date": "ID/Date", "text": "Report"})
+        st.table(display_table)
 
     with col2:
         st.subheader("With Casemaker")
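For context, a minimal self-contained sketch of what the new "Original" column does (toy records stand in for the real Report objects; note that pd must already be bound to pandas in app.py, since the visible hunks do not add that import):

import pandas as pd
import streamlit as st

# Hypothetical records shaped like Report.dict() output.
toy_records = [
    {"date": "2023-01-01", "text": "Lungs are clear.", "summary": "None"},
    {"date": "2023-02-01", "text": "No acute disease.", "summary": "None"},
]

display_table = pd.DataFrame.from_records(toy_records)
display_table = display_table[["date", "text"]]  # keep only the displayed columns
display_table = display_table.rename(columns={"date": "ID/Date", "text": "Report"})
st.table(display_table)  # renders a static table, unlike the scrollable st.dataframe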
utils_casemaker.py
CHANGED
@@ -5,9 +5,9 @@ import re
 
 nltk.download('punkt')
 
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from nltk.tokenize import sent_tokenize
-from typing import Dict, List, Sequence
+from typing import Dict, List, Mapping, Sequence
 from utils_report_parser import get_section_from_report
 
 from transformers import (
@@ -24,6 +24,9 @@ class Report:
     date: str
     summary: str | None = None
 
+    def dict(self):
+        return {k: str(v) for k, v in asdict(self).items()}
+
 
 def clean(s: str) -> str:
     s = s.replace("\n", " ")  # Concatenate into one string
@@ -35,19 +38,16 @@ def clean(s: str) -> str:
 
 
 def split_into_sentences(text: str):
-    # Split paragraphs
-    paragraphs = text.split("\n\n")
-    paragraphs = list(map(clean, paragraphs))
-    paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
-
     # Split into sentences
-    ...
+    return sent_tokenize(text)
+
+
+def remove_keyword(text: str, keyword: str):
+    start_idx = text.lower().find(keyword.lower())
+    if start_idx > -1:
+        substring_to_replace = text[start_idx: start_idx+len(keyword)]
+        text = text.replace(substring_to_replace, " ")
+    return text
 
 
 def format_casemaker_data(
@@ -85,7 +85,7 @@ def format_casemaker_data(
 
 class CaseMaker:
     def __init__(self, organ_keywords_dict_path: str = "../assets/terms.json"):
-        self.organ_keyword_dict = json.load(open(organ_keywords_dict_path, "r"))
+        self.organ_keyword_dict: Mapping[str, list[str]] = json.load(open(organ_keywords_dict_path, "r"))
 
         self.ner_pipe = pipeline(
             "ner",
@@ -100,30 +100,22 @@ class CaseMaker:
         # "text2text-generation", model="starmpcc/Asclepius-7B", device_map="auto"
         # )
 
-    def ...
-    ...
-    for ...
-    ...
-    ):
-            organ_entity["word"] = key
-            return organ_entity
-        # Otherwise, it's a bad match so set the score to 0 and return other
-        organ_entity["word"] = "Other"
-        organ_entity["score"] = 0.0
-
-        return organ_entity
+    def filter_out_irrelevant_sentences(self, lst: list[str]):
+        bad_keywords = [
+            "date of procedure", "physicians", "report initiated by",
+            "reported by", "reported and signed by"
+        ]
+        return [s for s in lst if not any([s.lower().startswith(k) for k in bad_keywords])]
+
+    def remove_header_names(self, s: str):
+        headers = [
+            "IMPRESSION", "FINDINGS", "RECOMMENDATION",
+            "COMPARISON", "INDICATION", "TECHNIQUE", "STUDY",
+            "MEDICATIONS", "TECHNIQUE AND FINDINGS"
+        ]
+        for header in headers:
+            s = remove_keyword(s, f"{header}:")
+        return s
 
     def pick_organ_by_keyword(self, s: str):
         words = s.lower()
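A quick standalone check of the remove_keyword helper added above, driven the same way remove_header_names drives it (trimmed header list for brevity). One subtlety worth noting: str.replace substitutes every occurrence of the matched substring, not just the one located by find:

def remove_keyword(text: str, keyword: str):
    # Case-insensitive search; replace the matched span with a space.
    start_idx = text.lower().find(keyword.lower())
    if start_idx > -1:
        substring_to_replace = text[start_idx: start_idx+len(keyword)]
        text = text.replace(substring_to_replace, " ")
    return text

s = "TECHNIQUE: CT chest. FINDINGS: Lungs are clear."
for header in ["IMPRESSION", "FINDINGS", "RECOMMENDATION"]:
    s = remove_keyword(s, f"{header}:")
print(s)  # 'TECHNIQUE: CT chest.   Lungs are clear.'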
@@ -135,7 +127,7 @@ class CaseMaker:
             ]
         ):
             return organ
-        return
+        return None
 
     def parse_report_by_organ(self, report: str):
         """Take in a text report and output a dictionary of body organs
@@ -147,17 +139,23 @@ class CaseMaker:
         report_string_by_organ = dict[str, str]()
 
         # Split the report into a list of sentences
-
+        sentences = split_into_sentences(report)
+        # Filter out irrelevant sentences using rules
+        sentences = self.filter_out_irrelevant_sentences(sentences)
         # Collect a list of paragraphs related to each organ
-
+        previous_sentence_organ = "Other"
+        for s in sentences:
             # Figure out which organ is being referenced
-            selected_organ = self.pick_organ_by_keyword(
-
+            selected_organ = self.pick_organ_by_keyword(s)
+            if selected_organ is None:
+                selected_organ = previous_sentence_organ
+            else:
+                previous_sentence_organ = selected_organ
             # Concatenate the report to its corresponding organ
             if selected_organ not in report_string_by_organ:
-                report_string_by_organ[selected_organ] =
+                report_string_by_organ[selected_organ] = s
             else:
-                report_string_by_organ[selected_organ] +=
+                report_string_by_organ[selected_organ] += f" {s}"
 
         return report_string_by_organ
 
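The previous-sentence fallback added to parse_report_by_organ, walked through on toy input (pick_organ below is a stand-in for CaseMaker.pick_organ_by_keyword, with a hypothetical subset of the terms.json keyword dict):

def pick_organ(sentence: str):
    keywords = {"lungs": "Lung", "heart": "Heart"}  # hypothetical keyword dict
    for kw, organ in keywords.items():
        if kw in sentence.lower():
            return organ
    return None

sentences = ["The lungs are clear.", "No nodules seen.", "Heart size is normal."]
report_string_by_organ: dict[str, str] = {}
previous_sentence_organ = "Other"
for s in sentences:
    selected_organ = pick_organ(s)
    if selected_organ is None:
        # No keyword hit: attribute the sentence to the previous organ.
        selected_organ = previous_sentence_organ
    else:
        previous_sentence_organ = selected_organ
    if selected_organ not in report_string_by_organ:
        report_string_by_organ[selected_organ] = s
    else:
        report_string_by_organ[selected_organ] += f" {s}"

print(report_string_by_organ)
# {'Lung': 'The lungs are clear. No nodules seen.', 'Heart': 'Heart size is normal.'}

Without the fallback, "No nodules seen." would have landed under "Other".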
@@ -232,7 +230,10 @@ class CaseMaker:
         for report in reports:
             # Cut the report to the findings
             report_findings = get_section_from_report(report.text, "findings")
-
+
+            # Remove any other keywords
+            report_findings = self.remove_header_names(report_findings)
+
             # For each organ, collect a list of relevant records containing the text and date
             report_by_organ = self.parse_report_by_organ(report_findings)
             for organ, report_text in report_by_organ.items():
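The Report.dict method added earlier exists to feed this pipeline's output into the new app.py table; a minimal reproduction (the real Report also has at least a text field, used above):

from dataclasses import asdict, dataclass

@dataclass
class Report:
    text: str
    date: str
    summary: str | None = None

    def dict(self):
        # Stringify every field so the values render cleanly in st.table;
        # note that a None summary becomes the string 'None'.
        return {k: str(v) for k, v in asdict(self).items()}

print(Report(text="Lungs are clear.", date="2023-01-01").dict())
# {'text': 'Lungs are clear.', 'date': '2023-01-01', 'summary': 'None'}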
utils_report_parser.py
CHANGED
@@ -1,20 +1,13 @@
 def get_section_from_report(report: str, section: str):
-    section_upper = section.upper()
     section_lower = section.lower()
-    findings_start_idx = report.lower().find(f"{section_lower}:") + len(
-        f"{section_lower}:"
-    )
+    findings_start_idx = report.lower().find(f"{section_lower}:")
 
-    if findings_start_idx ...
+    if findings_start_idx > -1:
         findings_start_idx = report.lower().find(f"{section_lower}:") + len(
             f"{section_lower}:"
         )
-    if findings_start_idx == -1:
-        findings_start_idx = report.find(f"{section_upper}") + len(f"{section_upper}")
-
-    if findings_start_idx == -1:
-        findings = report
-    else:
         findings = report[findings_start_idx:]
+    else:
+        findings = report
 
     return findings
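The patched function, exercised end to end: when the "section:" marker is found, everything after it is returned; otherwise the report comes back unchanged. (In the removed uppercase fallback, len(section) was added to find's result, so a miss never stayed at -1 and could slice from a bogus index.)

def get_section_from_report(report: str, section: str):
    section_lower = section.lower()
    findings_start_idx = report.lower().find(f"{section_lower}:")

    if findings_start_idx > -1:
        # Skip past the marker itself before slicing.
        findings_start_idx = report.lower().find(f"{section_lower}:") + len(
            f"{section_lower}:"
        )
        findings = report[findings_start_idx:]
    else:
        findings = report

    return findings

print(get_section_from_report("TECHNIQUE: CT. FINDINGS: Clear lungs.", "findings"))
# ' Clear lungs.'
print(get_section_from_report("No section markers here.", "findings"))
# 'No section markers here.'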