Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

ACMCMC committed on May 5, 2024

Commit

e7d7b51

1 Parent(s): 47c6369

UI

Browse files

Files changed (4) hide show

app.py +16 -13
llm_res.py +12 -9
requirements.txt +1 -0
utils.py +64 -27

app.py CHANGED Viewed

@@ -13,12 +13,14 @@ from utils import (
     augment_the_set_of_diseaces,
     get_clinical_trials_related_to_diseases,
     get_clinical_records_by_ids,
-    render_trial_details
 )
 from llm_res import get_short_summary_out_of_json_files, tagging_insights_from_json
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
 # variables to reveal next steps
@@ -71,8 +73,13 @@ with st.container():
             status.write("Getting the similarities among the diseases to filter out less promising ones...")
             diseases_uris = [disease["uri"] for disease in diseases_related_to_the_user_text]
             similarities = get_similarities_among_diseases_uris(diseases_uris)
-            status.info(f'Obtained similarity information among the diseases by measuring the cosine similarity of their embeddings. Using the similarity information to filter out less promising diseases.')
             status.json(similarities, expanded=False)
             status.divider()
             # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
             # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
@@ -80,6 +87,8 @@ with st.container():
             augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
             # print(augmented_set_of_diseases)
             status.info(f'Augmented set of diseases: {len(augmented_set_of_diseases)} diseases.')
             # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
             status.write("Getting the clinical trials related to the diseases found...")
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
@@ -97,18 +106,19 @@ with st.container():
             # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
             status.write("Getting a summary of the clinical trials...")
             response, stats_dict = get_short_summary_out_of_json_files(json_of_clinical_trials)
-            print(f'Response from LLM summarization: {response}')
-            print(f'basic_stats_dict:{stats_dict}')
             status.write(f'Response from LLM summarization: {response}')
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
             status.write("Getting summary statistics of the clinical trials...")
-            response = tagging_insights_from_json(json_of_clinical_trials)
             print(f'Response from LLM tagging: {response}')
             status.write(f'Response from LLM tagging: {response}')
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
             status.update(label="Done!", state="complete")
             status.balloons()
             show_graph = True
 # graph
@@ -158,8 +168,7 @@ $$"""
 # overview
 with st.container():
     if show_overview:
-        st.write("## Disease Overview")
-        disease_overview = ":red[lorem ipsum]"  # TODO
         st.write(disease_overview)
         time.sleep(2)
         show_details = True
@@ -169,12 +178,6 @@ with st.container():
 with st.container():
     if show_details:
         st.write("## Clinical Trials Details")
-        trials = []
-        # TODO replace mock data
-        with open("mock_trial.json") as f:
-            d = json.load(f)
-        for i in range(0, 8):
-            trials.append(d)
         tab_titles = [f"{trial['protocolSection']['identificationModule']['nctId']}" for trial in trials]

     augment_the_set_of_diseaces,
     get_clinical_trials_related_to_diseases,
     get_clinical_records_by_ids,
+    render_trial_details,
+    filter_out_less_promising_diseases
 )
 from llm_res import get_short_summary_out_of_json_files, tagging_insights_from_json
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
+import matplotlib
 # variables to reveal next steps
             status.write("Getting the similarities among the diseases to filter out less promising ones...")
             diseases_uris = [disease["uri"] for disease in diseases_related_to_the_user_text]
             similarities = get_similarities_among_diseases_uris(diseases_uris)
+            status.info(f'Obtained similarity information among the diseases by measuring the cosine similarity of their embeddings.')
             status.json(similarities, expanded=False)
+            filtered_diseases_uris, df_similarities = filter_out_less_promising_diseases(similarities)
+            # Apply a colormap to the table
+            status.table(df_similarities.style.background_gradient(cmap='viridis', axis=None))
+            status.info(f'Filtered out less promising diseases, keeping {len(filtered_diseases_uris)} diseases.')
+            status.json(filtered_diseases_uris, expanded=False)
             status.divider()
             # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
             # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
             augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
             # print(augmented_set_of_diseases)
             status.info(f'Augmented set of diseases: {len(augmented_set_of_diseases)} diseases.')
+            status.json(augmented_set_of_diseases, expanded=False)
+            status.divider()
             # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
             status.write("Getting the clinical trials related to the diseases found...")
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
             # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
             status.write("Getting a summary of the clinical trials...")
             response, stats_dict = get_short_summary_out_of_json_files(json_of_clinical_trials)
+            disease_overview = response
             status.write(f'Response from LLM summarization: {response}')
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
             status.write("Getting summary statistics of the clinical trials...")
+            #response = tagging_insights_from_json(json_of_clinical_trials)
+            response = ""
             print(f'Response from LLM tagging: {response}')
             status.write(f'Response from LLM tagging: {response}')
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
             status.update(label="Done!", state="complete")
             status.balloons()
             show_graph = True
+            trials = json_of_clinical_trials
 # graph
 # overview
 with st.container():
     if show_overview:
+        st.write("## Overview of Related Clinical Trials")
         st.write(disease_overview)
         time.sleep(2)
         show_details = True
 with st.container():
     if show_details:
         st.write("## Clinical Trials Details")
         tab_titles = [f"{trial['protocolSection']['identificationModule']['nctId']}" for trial in trials]

llm_res.py CHANGED Viewed

@@ -221,20 +221,23 @@ def process_dictionaty_with_llm_to_generate_response(json_data):
     return filtered_data
 def get_short_summary_out_of_json_files(data_json):
-    prompt_template = """You are an expert clinician working on the analysis of reports of clinical trials.
-#       # Task
-#       You will be given a set of descriptions of clinical trials. Your job is to come up with a short summary (100-200 words) of the descriptions of the clinical trials. Your users are clinical researchers who are experts in medicine, so you should be technical and specific, including scientific terms. Always be faithful to the original information written in the reports.
-#       To write your summary, you will need to read the following examples, labeled as "Report 1", "Report 2", and so on. Your answer should be a single paragraph (100-200 words) that summarizes the general content of all the reports.
-# {text}
-# General summary:"""
-    prompt_template = """ You are an expert on clinicial trials and their analysis of their reports.
-          # Task
-          You will be given a text of descriptions of multiple clinical trials realed to similar diseases. Your job is to come up with a short and detailed summary of the descriptions of the clinical trials. Your users are clinical researchers, so you should be technical and specific, including scientific terms in the summary."""
     prompt = PromptTemplate.from_template(prompt_template)

     return filtered_data
 def get_short_summary_out_of_json_files(data_json):
+    prompt_template = """You are an expert on clinicial trials and their analysis of their reports.
+# Task
+You will be given a text of descriptions of multiple clinical trials realed to similar diseases. Your job is to come up with a short and detailed summary of the descriptions of the clinical trials. Your users are clinical researchers, so you should be technical and specific, including scientific terms in the summary.
+{text}"""
+    prompt_template = """You are an expert clinician working on the analysis of reports of clinical trials.
+# Task
+You will be given a set of descriptions of clinical trials. Your job is to come up with a short summary (100-200 words) of the descriptions of the clinical trials. Your users are clinical researchers who are experts in medicine, so you should be technical and specific, including scientific terms. Always be faithful to the original information written in the reports.
+To write your summary, you will need to read the following examples, labeled as "Report 1", "Report 2", and so on. Your answer should be a single paragraph (100-200 words) that summarizes the general content of all the reports. Format your answer in Markdown format, **highlighting** the most important concepts, and _italicizing_ the technical concepts extracted from the reports. Be very specific about the details of the clinical trials.
+{text}
+General summary:"""
     prompt = PromptTemplate.from_template(prompt_template)

requirements.txt CHANGED Viewed

@@ -11,3 +11,4 @@ sentence_transformers==2.7.0
 streamlit-agraph
 streamlit==1.34.0
 langchain-openai==0.1.6

 streamlit-agraph
 streamlit==1.34.0
 langchain-openai==0.1.6
+matplotlib==3.8.4

utils.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sqlalchemy import create_engine, text
 import requests
 from sentence_transformers import SentenceTransformer
 import streamlit as st
 username = "demo"
 password = "demo"
@@ -121,7 +122,12 @@ def get_similarities_among_diseases_uris(
                 """
             result = conn.execute(text(sql))
             data = result.fetchall()
-    return data
 def augment_the_set_of_diseaces(diseases: List[str]) -> str:
@@ -169,7 +175,7 @@ def get_diseases_related_to_a_textual_description(
             result = conn.execute(text(sql))
             data = result.fetchall()
-    return [{"uri": row[0], "distance": row[1]} for row in data if row[1] > 0.8]
 def get_clinical_trials_related_to_diseases(
     diseases: List[str], encoder
@@ -191,6 +197,20 @@ def get_clinical_trials_related_to_diseases(
     return [{"nct_id": row[0], "distance": row[1]} for row in data]
 def to_capitalized_case(string: str) -> str:
     string = string.replace("_", " ")
     if string.isupper():
@@ -206,36 +226,53 @@ def render_trial_details(trial: dict) -> None:
             official_title = trial["protocolSection"]["identificationModule"]["officialTitle"]
             st.write(f"##### {official_title}")
-            brief_summary = trial["protocolSection"]["descriptionModule"]["briefSummary"]
-            st.write(brief_summary)
-            status_module = {
-                "Status": to_capitalized_case(trial["protocolSection"]["statusModule"]["overallStatus"]),
-                "Status Date": trial["protocolSection"]["statusModule"]["statusVerifiedDate"],
-                "Has Results": trial["hasResults"]
-            }
             st.write("###### Status")
-            st.table(status_module)
-            design_module = {
-                "Study Type": to_capitalized_case(trial["protocolSection"]["designModule"]["studyType"]),
-                "Phases": list_to_capitalized_case(trial["protocolSection"]["designModule"]["phases"]),
-                "Allocation": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["allocation"]),
-                "Primary Purpose": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["primaryPurpose"]),
-                "Participants": trial["protocolSection"]["designModule"]["enrollmentInfo"]["count"],
-                "Masking": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"]["masking"]),
-                "Who Masked": list_to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"]["whoMasked"])
-            }
             st.write("###### Design")
-            st.table(design_module)
-            interventions_module = {}
-            for intervention in trial["protocolSection"]["armsInterventionsModule"]["interventions"]:
-                name = intervention["name"]
-                desc = intervention["description"]
-                interventions_module[name] = desc
             st.write("###### Interventions")
-            st.table(interventions_module)
 if __name__ == "__main__":
     username = "demo"

 import requests
 from sentence_transformers import SentenceTransformer
 import streamlit as st
+import pandas as pd
 username = "demo"
 password = "demo"
                 """
             result = conn.execute(text(sql))
             data = result.fetchall()
+    return [{
+        "uri1": row[0].split("/")[-1],
+        "uri2": row[1].split("/")[-1],
+        "distance": float(row[2]),
+    } for row in data]
 def augment_the_set_of_diseaces(diseases: List[str]) -> str:
             result = conn.execute(text(sql))
             data = result.fetchall()
+    return [{"uri": row[0], "distance": float(row[1])} for row in data if float(row[1]) > 0.8]
 def get_clinical_trials_related_to_diseases(
     diseases: List[str], encoder
     return [{"nct_id": row[0], "distance": row[1]} for row in data]
+def filter_out_less_promising_diseases(info_dicts: List[Dict[str, Any]]) -> List[str]:
+    # Score each disease by averaging the pairwise values recorded for it, then keep the diseases whose score is not too far below the overall average. NOTE(review): this returns a (kept disease ids, pivoted DataFrame) pair, although the annotation says List[str].
+    df_diseases_similarities = pd.DataFrame(info_dicts)
+    # Reshape into a matrix: "uri1" values become the row index, "uri2" values become the columns, and "distance" supplies the cell values.
+    df_diseases_similarities = df_diseases_similarities.pivot(index="uri1", columns="uri2", values="distance")
+    # Fill missing cells with 1.0. NOTE(review): fillna replaces every NaN, not only the diagonal; if the input lacks some (uri1, uri2) orderings those cells also become 1.0 - confirm against the caller.
+    df_diseases_similarities = df_diseases_similarities.fillna(1.0)
+    # Keep the columns (uri2 values) whose mean exceeds the mean of all column means minus one standard deviation of those column means.
+    mean = df_diseases_similarities.mean().mean()
+    std = df_diseases_similarities.mean().std()
+    filtered_diseases = df_diseases_similarities.mean()[df_diseases_similarities.mean() > mean - std].index.tolist()
+    return filtered_diseases, df_diseases_similarities
 def to_capitalized_case(string: str) -> str:
     string = string.replace("_", " ")
     if string.isupper():
             official_title = trial["protocolSection"]["identificationModule"]["officialTitle"]
             st.write(f"##### {official_title}")
+            try:
+                st.write(trial["protocolSection"]["descriptionModule"]["briefSummary"])
+            except KeyError:
+                try:
+                    st.write(trial["protocolSection"]["descriptionModule"]["detailedDescription"])
+                except KeyError:
+                    st.error("No description available.")
             st.write("###### Status")
+            try:
+                status_module = {
+                    "Status": to_capitalized_case(trial["protocolSection"]["statusModule"]["overallStatus"]),
+                    "Status Date": trial["protocolSection"]["statusModule"]["statusVerifiedDate"],
+                    "Has Results": trial["hasResults"]
+                }
+                st.table(status_module)
+            except KeyError:
+                st.info("No status information available.")
             st.write("###### Design")
+            try:
+                design_module = {
+                    "Study Type": to_capitalized_case(trial["protocolSection"]["designModule"]["studyType"]),
+                    "Phases": list_to_capitalized_case(trial["protocolSection"]["designModule"]["phases"]),
+                    "Allocation": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["allocation"]),
+                    "Primary Purpose": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["primaryPurpose"]),
+                    "Participants": trial["protocolSection"]["designModule"]["enrollmentInfo"]["count"],
+                    "Masking": to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"]["masking"]),
+                    "Who Masked": list_to_capitalized_case(trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"]["whoMasked"])
+                }
+                st.table(design_module)
+            except KeyError:
+                st.info("No design information available.")
             st.write("###### Interventions")
+            try:
+                interventions_module = {}
+                for intervention in trial["protocolSection"]["armsInterventionsModule"]["interventions"]:
+                    name = intervention["name"]
+                    desc = intervention["description"]
+                    interventions_module[name] = desc
+                st.table(interventions_module)
+            except KeyError:
+                st.info("No interventions information available.")
+            # Button to go to ClinicalTrials.gov and see the trial. It takes the user to the official page of the trial.
+            st.markdown(f"See more in [ClinicalTrials.gov](https://clinicaltrials.gov/study/{trial['protocolSection']['identificationModule']['nctId']})")
 if __name__ == "__main__":
     username = "demo"