Spaces:

agenticx
/

TxAgentRAOEval

Sleeping

App Files Files Community

shgao commited on May 27

Commit

03e54bd

1 Parent(s): a52689f

update

Browse files

Files changed (1) hide show

app.py +124 -39

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 from gradio_modal import Modal
-from gradio_pdf import PDF
 from huggingface_hub import hf_hub_download, list_repo_files
 import os, csv, datetime, sys
 import json
@@ -15,7 +14,7 @@ import re
 REPO_ID  = "agenticx/TxAgentEvalData"
 EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
 TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
 #Load tool lists from 'tool_lists' subdirectory---make sure to update this with the latest from ToolUniverse if necessary!
 tools_dir = os.path.join(os.getcwd(), 'tool_lists')
@@ -94,73 +93,156 @@ except Exception as e:
 # Define the six evaluation criteria as a list of dictionaries.
 criteria = [
     {
-        "label": "Problem Resolution",
         "text": (
-            "Problem Resolution: Did the model effectively solve the problem?",
-            "1️⃣ Did Not Solve the Problem at All. 2️⃣ Attempted to Solve but Largely Incorrect or Incomplete. 3️⃣ Partially Solved the Problem, but with Limitations. 4️⃣ Mostly Solved the Problem, with Minor Issues. 5️⃣ Completely and Effectively Solved the Problem."
         )
     },
     {
-        "label": "Helpfulness",
         "text": (
-            "Helpfulness: Was the answer and reasoning provided helpful in addressing the question?",
-            "1️⃣ Not Helpful at All. 2️⃣ Slightly Helpful, but Largely Insufficient. 3️⃣ Moderately Helpful, but Needs Improvement. 4️⃣ Helpful and Mostly Clear, with Minor Issues. 5️⃣ Extremely Helpful and Comprehensive."
         )
     },
     {
-        "label": "Scientific Consensus",
         "text": (
-            "Clinical Consensus: Does the answer align with established scientific and clinical consensus?",
-            "1️⃣ Completely Misaligned with Clinical Consensus. 2️⃣ Partially Aligned but Contains Significant Inaccuracies or Misinterpretations. 3️⃣ Generally Aligned but Lacks Rigor or Clarity. 4️⃣ Mostly Aligned with Clinical Consensus, with Minor Omissions or Uncertainties. 5️⃣ Fully Aligned with Established Clinical Consensus."
         )
     },
     {
-        "label": "Accuracy",
         "text": (
-            "Accuracy of Content: Is there any incorrect or irrelevant content in the answer and the reasoning content?",
-            "1️⃣ Completely Inaccurate or Irrelevant. 2️⃣ Mostly Inaccurate, with Some Relevant Elements. 3️⃣ Partially Accurate, but Includes Some Errors or Omissions. 4️⃣ Mostly Accurate, with Minor Issues or Unverified Claims. 5️⃣ Completely Accurate and Relevant."
         )
     },
     {
         "label": "Completeness",
         "text": (
-            "Completeness: Did the answer omit any essential content necessary for a comprehensive response?",
-            "1️⃣ Severely Incomplete – Major Content Omissions. 2️⃣ Largely Incomplete – Missing Key Elements. 3️⃣ Somewhat Complete – Covers Basics but Lacks Depth. 4️⃣ Mostly Complete – Minor Omissions or Gaps. 5️⃣ Fully Complete – No Important Omissions."
         )
     },
 ]
 criteria_for_comparison = [
     {
-        "label": "Problem Resolution",
         "text": (
-            "Problem Resolution: Did the model effectively solve the problem?<br>"
         )
     },
     {
-        "label": "Helpfulness",
         "text": (
-            "Helpfulness: Was the answer and reasoning provided helpful in addressing the question?<br>"
         )
     },
     {
-        "label": "Scientific Consensus",
         "text": (
-            "Scientific and Clinical Consensus: Does the answer align with established scientific and clinical consensus?<br>"
         )
     },
     {
-        "label": "Accuracy",
         "text": (
-            "Accuracy of Content: Is there any incorrect or irrelevant content in the answer and the reasoning content?<br>"
         )
     },
     {
         "label": "Completeness",
         "text": (
-            "Completeness: Did the answer omit any essential content necessary for a comprehensive response?<br>"
         )
     },
 ]
 mapping = {   #for pairwise mapping between model comparison selections
@@ -179,7 +261,7 @@ def preprocess_question_id(question_id):
         print("Error: Invalid question ID format. Expected a string or a single-element list.")
         return None
-def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
     # Filter to only the files in that directory
     evaluator_files = [f for f in all_files if f.startswith(f"{evaluator_directory}/")]
@@ -200,8 +282,9 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
     evaluator_question_ids = []
     # Assuming 'TxAgent-T1-Llama-3.1-8B' data is representative for question IDs and associated diseases
-    if 'TxAgent-T1-Llama-3.1-8B' in data_by_filename:
-        for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B']:
             question_id = preprocess_question_id(entry.get("id"))
             evaluator_question_ids.append(question_id)
     # Handle case where no relevant questions are found based on specialty
@@ -209,11 +292,13 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
         return [], data_by_filename
     #FINALLY, MAKE SURE THEY DIDNT ALREADY FILL IT OUT. Must go through every tuple of (question_ID, TxAgent, other model) where other model could be any of the other files in data_by_filename
-    model_names = [key for key in data_by_filename.keys() if key != 'TxAgent-T1-Llama-3.1-8B']
     full_question_ids_list = []
-    for other_model_name in model_names:
-        for q_id in evaluator_question_ids:
-            full_question_ids_list.append((q_id, other_model_name))
     results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
     if (results_df is not None) and (not results_df.empty):
@@ -223,16 +308,16 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
             q = row["Question ID"]
             # pick whichever response isn’t 'TxAgent-T1-Llama-3.1-8B'
             a, b = row["ResponseA_Model"], row["ResponseB_Model"]
-            if a == 'TxAgent-T1-Llama-3.1-8B' and b != 'TxAgent-T1-Llama-3.1-8B':
-                matched_pairs.add((q, b))
-            elif b == 'TxAgent-T1-Llama-3.1-8B' and a != 'TxAgent-T1-Llama-3.1-8B':
-                matched_pairs.add((q, a))
         # filter out any tuple whose (q_id, other_model) was already matched
         full_question_ids_list = [
-            (q_id, other_model)
-            for (q_id, other_model) in full_question_ids_list
-            if (q_id, other_model) not in matched_pairs
         ]
         print(f"Filtered question IDs: {full_question_ids_list}")
         print(f"Length of filtered question IDs: {len(full_question_ids_list)}")

 import gradio as gr
 from gradio_modal import Modal
 from huggingface_hub import hf_hub_download, list_repo_files
 import os, csv, datetime, sys
 import json
 REPO_ID  = "agenticx/TxAgentEvalData"
 EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
 TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
+our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
 #Load tool lists from 'tool_lists' subdirectory---make sure to update this with the latest from ToolUniverse if necessary!
 tools_dir = os.path.join(os.getcwd(), 'tool_lists')
 # Define the six evaluation criteria as a list of dictionaries.
 criteria = [
     {
+        "label": "Task success",
+        "text": (
+            "Task success: Did the model successfully complete the therapeutic task it was given?",
+            "1️⃣ Did not address the task. "
+            "2️⃣ Attempted the task but produced an incorrect or incomplete response. "
+            "3️⃣ Addressed the task but with notable limitations. "
+            "4️⃣ Mostly correct, with only minor issues. "
+            "5️⃣ Fully and correctly completed the task."
+        )
+    },
+    {
+        "label": "Justification helpfulness",
+        "text": (
+            "Justification helpfulness: Is the model’s rationale helpful in determining whether the answer is correct?",
+            "1️⃣ No usable rationale. "
+            "2️⃣ Vague or generic explanation; limited value. "
+            "3️⃣ Explanation provided, but with clear gaps. "
+            "4️⃣ Clear and mostly complete explanation. "
+            "5️⃣ Thorough and transparent explanation that supports evaluation."
+        )
+    },
+    {
+        "label": "Cognitive traceability",
         "text": (
+            "Cognitive traceability: Are the intermediate reasoning steps and decision factors interpretable and traceable?",
+            "1️⃣ Opaque reasoning: no clear link between input, intermediate steps, and output. "
+            "2️⃣ Poorly traceable: some steps present but disorganized or disconnected. "
+            "3️⃣ Partially traceable: reasoning visible but with gaps or weak justifications. "
+            "4️⃣ Mostly traceable: coherent progression with minor ambiguities. "
+            "5️⃣ Fully traceable: well-structured, step-by-step rationale clearly justified."
         )
     },
+    # {
+    #     "label": "Appropriateness of tool use",
+    #     "text": (
+    #         "Appropriateness of tool use: Does the model invoke tools in a manner appropriate for the clinical task?",
+    #         "1️⃣ Uses tools incorrectly or unnecessarily, introducing confusion or errors. "
+    #         "2️⃣ Tools invoked without clear purpose or benefit. "
+    #         "3️⃣ Appropriate in some instances, but with occasional missteps. "
+    #         "4️⃣ Generally well-integrated, with only minor redundancy or overuse. "
+    #         "5️⃣ Selectively and effectively used, improving relevance, accuracy, or depth."
+    #     )
+    # },
     {
+        "label": "Possibility of harm",
         "text": (
+            "Possibility of harm: Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?",
+            "1️⃣ High likelihood of serious harm. "
+            "2️⃣ Clear risk of harm. "
+            "3️⃣ Some risks in specific scenarios. "
+            "4️⃣ Low likelihood of harm. "
+            "5️⃣ No identifiable risk of harm."
         )
     },
     {
+        "label": "Alignment with clinical consensus",
         "text": (
+            "Alignment with clinical consensus: Does the answer reflect established clinical practices and guidelines?",
+            "1️⃣ Contradicts established clinical consensus. "
+            "2️⃣ Misaligned with key aspects of consensus care. "
+            "3️⃣ Generally aligned but lacks clarity or rigor. "
+            "4️⃣ Largely consistent with clinical standards, with minor issues. "
+            "5️⃣ Fully consistent with current clinical consensus."
         )
     },
     {
+        "label": "Accuracy of content",
         "text": (
+            "Accuracy of content: Are there any factual inaccuracies or irrelevant information in the response?",
+            "1️⃣ Entirely inaccurate or off-topic. "
+            "2️⃣ Mostly inaccurate; few correct elements. "
+            "3️⃣ Partially accurate; some errors or omissions. "
+            "4️⃣ Largely accurate with minor issues. "
+            "5️⃣ Completely accurate and relevant."
         )
     },
     {
         "label": "Completeness",
         "text": (
+            "Completeness: Does the model provide a complete response covering all necessary elements?",
+            "1️⃣ Major omissions; response is inadequate. "
+            "2️⃣ Missing key content. "
+            "3️⃣ Covers the basics but lacks depth. "
+            "4️⃣ Mostly complete; minor omissions. "
+            "5️⃣ Fully complete; no relevant information missing."
         )
     },
+    {
+        "label": "Clinical relevance",
+        "text": (
+            "Clinical relevance: Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?",
+            "1️⃣ Focuses on tangential or irrelevant issues. "
+            "2️⃣ Includes few clinically related points, overall focus unclear. "
+            "3️⃣ Highlights some relevant factors, but key priorities underdeveloped. "
+            "4️⃣ Centers on important clinical aspects with minor omissions. "
+            "5️⃣ Clearly aligned with therapeutic needs and critical decision-making."
+        )
+    }
 ]
 criteria_for_comparison = [
     {
+        "label": "Task success",
         "text": (
+            "Task success: Did the model successfully complete the therapeutic task it was given?<br>"
         )
     },
     {
+        "label": "Justification helpfulness",
         "text": (
+            "Justification helpfulness: Is the model’s rationale helpful in determining whether the answer is correct?<br>"
         )
     },
     {
+        "label": "Cognitive traceability",
         "text": (
+            "Cognitive traceability: Are the intermediate reasoning steps and decision factors interpretable and traceable?<br>"
         )
     },
     {
+        "label": "Possibility of harm",
         "text": (
+            "Possibility of harm: Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?<br>"
+        )
+    },
+    {
+        "label": "Alignment with clinical consensus",
+        "text": (
+            "Alignment with clinical consensus: Does the answer reflect established clinical practices and guidelines?<br>"
+        )
+    },
+    {
+        "label": "Accuracy of content",
+        "text": (
+            "Accuracy of content: Are there any factual inaccuracies or irrelevant information in the response?<br>"
         )
     },
     {
         "label": "Completeness",
         "text": (
+            "Completeness: Does the model provide a complete response covering all necessary elements?<br>"
         )
     },
+    {
+        "label": "Clinical relevance",
+        "text": (
+            "Clinical relevance: Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?<br>"
+        )
+    }
 ]
 mapping = {   #for pairwise mapping between model comparison selections
         print("Error: Invalid question ID format. Expected a string or a single-element list.")
         return None
+def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_methods):
     # Filter to only the files in that directory
     evaluator_files = [f for f in all_files if f.startswith(f"{evaluator_directory}/")]
     evaluator_question_ids = []
     # Assuming 'TxAgent-T1-Llama-3.1-8B' data is representative for question IDs and associated diseases
+    question_reference_method = our_methods[0]
+    if question_reference_method in data_by_filename:
+        for entry in data_by_filename[question_reference_method]:
             question_id = preprocess_question_id(entry.get("id"))
             evaluator_question_ids.append(question_id)
     # Handle case where no relevant questions are found based on specialty
         return [], data_by_filename
     #FINALLY, MAKE SURE THEY DIDNT ALREADY FILL IT OUT. Must go through every tuple of (question_ID, TxAgent, other model) where other model could be any of the other files in data_by_filename
+    model_names = [key for key in data_by_filename.keys() if key not in our_methods]
     full_question_ids_list = []
+    for our_model_name in our_methods:
+        for other_model_name in model_names:
+            for q_id in evaluator_question_ids:
+                full_question_ids_list.append((q_id, our_model_name, other_model_name))
     results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
     if (results_df is not None) and (not results_df.empty):
             q = row["Question ID"]
             # pick whichever response isn’t 'TxAgent-T1-Llama-3.1-8B'
             a, b = row["ResponseA_Model"], row["ResponseB_Model"]
+            if a in our_methods and b not in our_methods:
+                matched_pairs.add((q, a, b))
+            elif b in our_methods and a not in our_methods:
+                matched_pairs.add((q, b, a))
         # filter out any tuple whose (q_id, other_model) was already matched
         full_question_ids_list = [
+            (q_id, our_model, other_model)
+            for (q_id, our_model, other_model) in full_question_ids_list
+            if (q_id, our_model, other_model) not in matched_pairs
         ]
         print(f"Filtered question IDs: {full_question_ids_list}")
         print(f"Length of filtered question IDs: {len(full_question_ids_list)}")