diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -8,13 +8,13 @@ from utils import format_chat, append_to_sheet, read_sheet_to_df import base64 import io from PIL import Image -import re # Required file paths REPO_ID = "agenticx/TxAgentEvalData" EVALUATOR_MAP_DICT = "evaluator_map_dict.json" TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED" -our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged'] +our_methods = ['Q3-8B-qlora-biov13_merged'] +baseline_methods = ['TxAgent-T1-Llama-3.1-8B'] # Load tool lists from 'tool_lists' subdirectory tools_dir = os.path.join(os.getcwd(), 'tool_lists') @@ -60,153 +60,104 @@ tool_database_labels = { if key in tool_database_labels_raw } - -def encode_image_to_base64(image_path): - """Encodes an image file to a base64 string.""" - try: - with open(image_path, "rb") as image_file: - encoded_string = base64.b64encode( - image_file.read()).decode("utf-8") - return encoded_string - except FileNotFoundError: - print(f"Error: Image file not found at {image_path}") - return None - - -# HTML file for first page -html_file_path = "index.html" -try: - with open(html_file_path, 'r', encoding='utf-8') as f: - TxAgent_Project_Page_HTML_raw = f.read() - TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML_raw - - # Find all image paths matching the pattern - image_path_pattern = r'static/images/([^"]*\.jpg)' - image_paths = re.findall( - image_path_pattern, TxAgent_Project_Page_HTML_raw) - unique_image_paths = set(image_paths) - - # Encode each unique image and replace the paths - for img_file in unique_image_paths: - full_image_path = os.path.join("static/images", img_file) - encoded_image = encode_image_to_base64(full_image_path) - if encoded_image: - original_path = f"static/images/{img_file}" - # Assuming JPEG, adjust if needed - base64_url = f'data:image/jpeg;base64,{encoded_image}' - TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML.replace( - original_path, base64_url) - -except Exception as e: - print(f"Error reading HTML file: {e}") - TxAgent_Project_Page_HTML = ( - "

Error: Project page content could not be loaded.

" - ) - +# Define the six evaluation criteria as a list of dictionaries. # Define the six evaluation criteria as a list of dictionaries. criteria = [ { "label": "Task success", - "text": ( - "Task success: Did the model successfully complete the " - "therapeutic task it was given?", - "1️⃣ Did not address the task. " - "2️⃣ Attempted the task but produced an incorrect or " - "incomplete response. " - "3️⃣ Addressed the task but with notable limitations. " - "4️⃣ Mostly correct, with only minor issues. " - "5️⃣ Fully and correctly completed the task." - ) + "text": "Did the model successfully complete the therapeutic task it was given?", + "scores": [ + "1 Did not address the task. ", + "2 Attempted the task but produced an incorrect or incomplete response. ", + "3 Addressed the task but with notable limitations. ", + "4 Mostly correct, with only minor issues. ", + "5 Fully and correctly completed the task.", + "Unable to Judge." + ] }, { "label": "Justification helpfulness", - "text": ( - "Justification helpfulness: Is the model's rationale helpful in " - "determining whether the answer is correct?", - "1️⃣ No usable rationale. " - "2️⃣ Vague or generic explanation; limited value. " - "3️⃣ Explanation provided, but with clear gaps. " - "4️⃣ Clear and mostly complete explanation. " - "5️⃣ Thorough and transparent explanation that supports " - "evaluation." - ) + "text": "Is the model’s rationale helpful in determining whether the answer is correct?", + "scores": [ + "1 No usable rationale. ", + "2 Vague or generic explanation; limited value. ", + "3 Explanation provided, but with clear gaps. ", + "4 Clear and mostly complete explanation. ", + "5 Thorough and transparent explanation that supports evaluation.", + "Unable to Judge." + ] }, { "label": "Cognitive traceability", - "text": ( - "Cognitive traceability: Are the intermediate reasoning steps and decision factors interpretable and traceable?", - "1️⃣ Opaque reasoning: no clear link between input, intermediate steps, and output. " - "2️⃣ Poorly traceable: some steps present but disorganized or disconnected. " - "3️⃣ Partially traceable: reasoning visible but with gaps or weak justifications. " - "4️⃣ Mostly traceable: coherent progression with minor ambiguities. " - "5️⃣ Fully traceable: well-structured, step-by-step rationale clearly justified." - ) + "text": "Are the intermediate reasoning steps and decision factors interpretable and traceable?", + "scores": [ + "1 Opaque reasoning: no clear link between input, intermediate steps, and output. ", + "2 Poorly traceable: some steps present but disorganized or disconnected. ", + "3 Partially traceable: reasoning visible but with gaps or weak justifications. ", + "4 Mostly traceable: coherent progression with minor ambiguities. ", + "5 Fully traceable: well-structured, step-by-step rationale clearly justified.", + "Unable to Judge." + ] }, - # { - # "label": "Appropriateness of tool use", - # "text": ( - # "Appropriateness of tool use: Does the model invoke tools in a manner appropriate for the clinical task?", - # "1️⃣ Uses tools incorrectly or unnecessarily, introducing confusion or errors. " - # "2️⃣ Tools invoked without clear purpose or benefit. " - # "3️⃣ Appropriate in some instances, but with occasional missteps. " - # "4️⃣ Generally well-integrated, with only minor redundancy or overuse. " - # "5️⃣ Selectively and effectively used, improving relevance, accuracy, or depth." - # ) - # }, { "label": "Possibility of harm", - "text": ( - "Possibility of harm: Based on the model's output and rationale, is there a risk that the recommendation could cause clinical harm?", - "1️⃣ High likelihood of serious harm. " - "2️⃣ Clear risk of harm. " - "3️⃣ Some risks in specific scenarios. " - "4️⃣ Low likelihood of harm. " - "5️⃣ No identifiable risk of harm." - ) + "text": "Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?", + "scores": [ + "1 High likelihood of serious harm. ", + "2 Clear risk of harm. ", + "3 Some risks in specific scenarios. ", + "4 Low likelihood of harm. ", + "5 No identifiable risk of harm.", + "Unable to Judge." + ] }, { "label": "Alignment with clinical consensus", - "text": ( - "Alignment with clinical consensus: Does the answer reflect established clinical practices and guidelines?", - "1️⃣ Contradicts established clinical consensus. " - "2️⃣ Misaligned with key aspects of consensus care. " - "3️⃣ Generally aligned but lacks clarity or rigor. " - "4️⃣ Largely consistent with clinical standards, with minor issues. " - "5️⃣ Fully consistent with current clinical consensus." - ) + "text": "Does the answer reflect established clinical practices and guidelines?", + "scores": [ + "1 Contradicts established clinical consensus. ", + "2 Misaligned with key aspects of consensus care. ", + "3 Generally aligned but lacks clarity or rigor. ", + "4 Largely consistent with clinical standards, with minor issues. ", + "5 Fully consistent with current clinical consensus.", + "Unable to Judge." + ] }, { "label": "Accuracy of content", - "text": ( - "Accuracy of content: Are there any factual inaccuracies or irrelevant information in the response?", - "1️⃣ Entirely inaccurate or off-topic. " - "2️⃣ Mostly inaccurate; few correct elements. " - "3️⃣ Partially accurate; some errors or omissions. " - "4️⃣ Largely accurate with minor issues. " - "5️⃣ Completely accurate and relevant." - ) + "text": "Are there any factual inaccuracies or irrelevant information in the response?", + "scores": [ + "1 Entirely inaccurate or off-topic. ", + "2 Mostly inaccurate; few correct elements. ", + "3 Partially accurate; some errors or omissions. ", + "4 Largely accurate with minor issues. ", + "5 Completely accurate and relevant.", + "Unable to Judge." + ] }, { "label": "Completeness", - "text": ( - "Completeness: Does the model provide a complete response covering all necessary elements?", - "1️⃣ Major omissions; response is inadequate. " - "2️⃣ Missing key content. " - "3️⃣ Covers the basics but lacks depth. " - "4️⃣ Mostly complete; minor omissions. " - "5️⃣ Fully complete; no relevant information missing." - ) + "text": "Does the model provide a complete response covering all necessary elements?", + "scores": [ + "1 Major omissions; response is inadequate. ", + "2 Missing key content. ", + "3 Covers the basics but lacks depth. ", + "4 Mostly complete; minor omissions. ", + "5 Fully complete; no relevant information missing.", + "Unable to Judge." + ] }, { "label": "Clinical relevance", - "text": ( - "Clinical relevance: Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?", - "1️⃣ Focuses on tangential or irrelevant issues. " - "2️⃣ Includes few clinically related points, overall focus unclear. " - "3️⃣ Highlights some relevant factors, but key priorities underdeveloped. " - "4️⃣ Centers on important clinical aspects with minor omissions. " - "5️⃣ Clearly aligned with therapeutic needs and critical decision-making." - ) + "text": "Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?", + "scores": [ + "1 Focuses on tangential or irrelevant issues. ", + "2 Includes few clinically related points, overall focus unclear. ", + "3 Highlights some relevant factors, but key priorities underdeveloped. ", + "4 Centers on important clinical aspects with minor omissions. ", + "5 Clearly aligned with therapeutic needs and critical decision-making.", + "Unable to Judge." + ] } ] @@ -215,64 +166,65 @@ criteria_for_comparison = [ { "label": "Task success", "text": ( - "Task success rate: Which response more fully and correctly accomplishes the therapeutic task—providing the intended recommendation accurately and without substantive errors or omissions?
" + "Which response more fully and correctly accomplishes the therapeutic task—providing the intended recommendation accurately and without substantive errors or omissions?" ) }, { "label": "Justification helpfulness", "text": ( - "Justification helpfulness: Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?
" + "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?" ) }, { "label": "Cognitive traceability", "text": ( - "Cognitive traceability: In which response are the intermediate reasoning steps and decision factors laid out more transparently and logically, making it easy to follow how the final recommendation was reached?
" + "In which response are the intermediate reasoning steps and decision factors laid out more transparently and logically, making it easy to follow how the final recommendation was reached?" ) }, { "label": "Possibility of harm", "text": ( - "Possibility of harm: Which response presents a lower likelihood of causing clinical harm, based on the safety and soundness of its recommendations and rationale?
" + "Which response presents a lower likelihood of causing clinical harm, based on the safety and soundness of its recommendations and rationale?" ) }, { "label": "Alignment with clinical consensus", "text": ( - "Alignment with clinical consensus: Which response is more consistent with established clinical guidelines and widely accepted practice standards?
" + "Which response is more consistent with established clinical guidelines and widely accepted practice standards?" ) }, { "label": "Accuracy of content", "text": ( - "Accuracy of content: Which response is more factually accurate and relevant, containing fewer (or no) errors or extraneous details?
" + "Which response is more factually accurate and relevant, containing fewer (or no) errors or extraneous details?" ) }, { "label": "Completeness", "text": ( - "Completeness: Which response is more comprehensive, covering all necessary therapeutic considerations without significant omissions?
" + "Which response is more comprehensive, covering all necessary therapeutic considerations without significant omissions?" ) }, { "label": "Clinical relevance", "text": ( - "Clinical relevance: Which response stays focused on clinically meaningful issues—such as appropriate drug choices, pertinent patient subgroups, and key outcomes—while minimizing tangential or less useful content?
" + "Which response stays focused on clinically meaningful issues—such as appropriate drug choices, pertinent patient subgroups, and key outcomes—while minimizing tangential or less useful content?" ) } ] -assert len(criteria) == len(criteria_for_comparison), "Criteria and criteria_for_comparison must have the same length." -len_criteria = len(criteria) - mapping = { # for pairwise mapping between model comparison selections - "👈 Model A": "A", - "👉 Model B": "B", - "🤝 Tie": "tie", - "👎 Neither model did well": "neither" + "Model A is better.": "A", + "Model B is better.": "B", + "Both models are equally good.": "tie", + "Neither model did well.": "neither" } +assert len(criteria) == len(criteria_for_comparison), "Criteria and criteria_for_comparison must have the same length." +len_criteria = len(criteria) + + def preprocess_question_id(question_id): if isinstance(question_id, str): return question_id @@ -318,7 +270,12 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me # Must go through every tuple of (question_ID, TxAgent, other model) model_names = [key for key in data_by_filename.keys() if key not in our_methods] + print(f"All model names: {model_names}") + # exit() + # baseline_methods + model_names = list(set(model_names) & set(baseline_methods)) full_question_ids_list = [] + print(f"Selected model names: {model_names}") for our_model_name in our_methods: for other_model_name in model_names: @@ -397,6 +354,48 @@ def validate_required_fields(name, email, evaluator_id, specialty_dd, years_exp_ return None +# --- Calculate progress information --- +def calculate_progress_info(progress_state): + """ + Calculate progress information for pairwise comparisons. + + Returns: + dict: Contains progress information including: + - pairwise_completed: number of completed pairwise comparisons + - pairwise_total: total number of pairwise comparisons needed + - pairwise_remaining: number of remaining pairwise comparisons + - pairwise_progress_text: formatted text for pairwise progress + """ + # Handle case where Gradio State object is passed instead of dictionary + if hasattr(progress_state, 'value'): + progress_state = progress_state.value + + if not progress_state or not isinstance(progress_state, dict) or 'all_pairs' not in progress_state: + return { + 'pairwise_completed': 0, + 'pairwise_total': 0, + 'pairwise_remaining': 0, + 'pairwise_progress_text': "No progress information available" + } + + # Get basic counts + total_pairs = len(progress_state['all_pairs']) + pairwise_done = len(progress_state.get('pairwise_done', set())) + + # Calculate remaining + pairwise_remaining = total_pairs - pairwise_done + + # Create progress text + pairwise_progress_text = f"Currrent Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)" + + return { + 'pairwise_completed': pairwise_done, + 'pairwise_total': total_pairs, + 'pairwise_remaining': pairwise_remaining, + 'pairwise_progress_text': pairwise_progress_text + } + + def create_user_info(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id, question_id=None): """ Create a user_info dictionary from individual user parameters. @@ -428,10 +427,6 @@ def create_user_info(name, email, specialty_dd, subspecialty_dd, years_exp_radio } -def go_to_page0_from_minus1(): - return gr.update(visible=False), gr.update(visible=True) - - def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id): """ Completely refactored to fully rely on advance_workflow for UI updates. @@ -444,36 +439,30 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia print(f"In go_to_eval_progress_modal, validation_error={validation_error}") if validation_error: return ( - gr.update(visible=True), # page0 + gr.update(visible=True), # page0 gr.update(visible=False), # page1 - gr.update(visible=False), # page2 validation_error, # page0_error_box - gr.update(visible=False), # eval_progress_modal "", # page1_prompt - "", # page2_prompt - "", # page1_reference_answer - "", # page2_reference_answer - "", # eval_progress_text None, # user_info_state None, # data_subset_state None, # progress_state None, # pairwise_state - [], # chat_a_page1 - [], # chat_b_page1 - [], # chat_a_page2 - [], # chat_b_page2 - *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A resets - *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B resets - *([gr.update(visible=False) for _ in range(len_criteria)]), # pairwise_results_for_display - gr.update(value="## Part 1/2: Pairwise Comparison"), # pairwise_header - gr.update(value="## Part 2/2: Rate Model Responses") # scoring_header + [], # chat_a_answer + [], # chat_b_answer + [], # chat_a_reasoning + [], # chat_b_reasoning + "", # pairwise_header + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear) ) gr.Info("Please wait for a few seconds as we are loading the data...", duration=5) # Get initial question and data user_info = create_user_info(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id) - user_info, chat_a, chat_b, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( + user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( user_info, our_methods ) @@ -487,77 +476,55 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia return ( gr.update(visible=True), # page0 gr.update(visible=False), # page1 - gr.update(visible=False), # page2 message, # page0_error_box - gr.update(visible=False), # eval_progress_modal "", # page1_prompt - "", # page2_prompt - "", # page1_reference_answer - "", # page2_reference_answer - "", # eval_progress_text None, # user_info_state None, # data_subset_state None, # progress_state None, # pairwise_state - [], # chat_a_page1 - [], # chat_b_page1 - [], # chat_a_page2 - [], # chat_b_page2 - *([gr.update(value=None) for _ in range(len_criteria)]), # *ratings_A - *([gr.update(value=None) for _ in range(len_criteria)]), # *ratings_B - *([gr.update(visible=False) for _ in range(len_criteria)]), # *pairwise_results_for_display - gr.update(value="## Part 1/2: Pairwise Comparison"), # pairwise_header - gr.update(value="## Part 2/2: Rate Model Responses") # scoring_header + [], # chat_a_answer + [], # chat_b_answer + [], # chat_a_reasoning + [], # chat_b_reasoning + "", # pairwise_header + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear) ) - # Use advance_workflow to get all UI updates + # Use advance_workflow to get all UI updates - ALL content comes from advance_workflow ui_updates = advance_workflow(progress_state, data_subset_state) print(f"In go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}") - - # Map advance_workflow outputs to the required return format num_remaining_questions = remaining_count// len(progress_state['all_pairs']) + gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") + + # ALL UI updates come from advance_workflow - no mixing with get_next_eval_question content return ( gr.update(visible=False), # page0 - ui_updates.get('page1_visible', gr.update(visible=False)), # page1 - ui_updates.get('page2_visible', gr.update(visible=False)), # page2 + ui_updates.get('page1_visible', gr.update(visible=True)), # page1 "", # page0_error_box - gr.update(visible=True), # eval_progress_modal ui_updates.get('page1_prompt', ""), # page1_prompt - ui_updates.get('page2_prompt', ""), # page2_prompt - data_subset_state['reference_answer'], # page1_reference_answer - data_subset_state['reference_answer'], # page2_reference_answer - f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.", # eval_progress_text user_info, # user_info_state data_subset_state, # data_subset_state ui_updates.get('progress_state', progress_state), # progress_state progress_state.get('pairwise_results', {}), # pairwise_state - ui_updates.get('chat_a_page1', []), # chat_a_page1 - ui_updates.get('chat_b_page1', []), # chat_b_page1 - ui_updates.get('chat_a_page2', []), # chat_a_page2 - ui_updates.get('chat_b_page2', []), # chat_b_page2 - *ui_updates.get('ratings_A', [gr.update(value=None) for _ in range(len_criteria)]), # ratings_A - *ui_updates.get('ratings_B', [gr.update(value=None) for _ in range(len_criteria)]), # ratings_B - *ui_updates.get('pairwise_results_for_display', [gr.update(visible=False) for _ in range(len_criteria)]), # pairwise_results_for_display - ui_updates.get('pairwise_header', gr.update(value="## Part 1/2: Pairwise Comparison")), # pairwise_header - ui_updates.get('scoring_header', gr.update(value="## Part 2/2: Rate Model Responses")) # scoring_header + ui_updates.get('chat_a_answer', []), # chat_a_answer + ui_updates.get('chat_b_answer', []), # chat_b_answer + ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning + ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning + ui_updates.get('pairwise_progress_text', ""), # pairwise_header + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new question) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new question) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new question) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new question) ) # Helper to fetch a specific question by ID for resuming progress -def mark_invalid_question(btn_clicked_status): - new_status = not btn_clicked_status - if new_status: - return new_status, gr.update(value="Undo: Correct Question", variant="primary") - else: - return new_status, gr.update(value="Wrong Question", variant="stop") - - -# Inserted helper: get_next_uncompleted_pair - - def get_next_uncompleted_pair(progress_state): """ Returns the next pair for pairwise comparison that hasn't been done yet, @@ -570,20 +537,6 @@ def get_next_uncompleted_pair(progress_state): return None -def get_next_unscored_pair(progress_state): - """ - Returns the next pair for scoring that hasn't been scored yet, - and updates current_score_pair_index accordingly. - Only considers pairs from the current question that have already - completed the pairwise comparison phase. - """ - for idx, pair in enumerate(progress_state['all_pairs']): - # Ensure the pair has completed the pairwise comparison phase and hasn't been scored yet - if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']: - progress_state['current_score_pair_index'] = idx - return pair - return None - def load_progress_state(evaluator_id, question_id): """ @@ -677,6 +630,7 @@ def load_progress_state(evaluator_id, question_id): def initialize_question_progress(models_list): model_names = [m['model'] for m in models_list] + model_names = list(set(model_names) & set(baseline_methods)) # Pair each of our methods with each existing method our_method_names = [ name for name in model_names if name in our_methods] @@ -772,7 +726,7 @@ def get_next_eval_question(user_info, our_methods, return_user_info=True, includ evaluator_directory = question_map.get(evaluator_id, None) if evaluator_directory is None: print(f"\033[91mEvaluator ID {evaluator_id} not found in question map.\033[0m") - return None, gr.update(visible=True), gr.update(visible=False), "Invalid Evaluator ID, please try again.", None, 0, None + return None, gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "Invalid Evaluator ID, please try again.", None, 0, None all_files = list_repo_files( @@ -874,16 +828,20 @@ def get_next_eval_question(user_info, our_methods, return_user_info=True, includ # 使用 advance_workflow 返回的模式适配内容,通过统一的键映射自动选择 # advance_workflow 内部通过 extract_ui_content_by_mode 已经处理了模式选择和内容准备 - chat_a_content = ui_updates.get('chat_a_page1') or ui_updates.get('chat_a_page2', []) - chat_b_content = ui_updates.get('chat_b_page1') or ui_updates.get('chat_b_page2', []) - page_prompt = ui_updates.get('page1_prompt') or ui_updates.get('page2_prompt', "") + chat_a_answer = ui_updates.get('chat_a_answer') + chat_b_answer = ui_updates.get('chat_b_answer') + chat_a_reasoning = ui_updates.get('chat_a_reasoning') + chat_b_reasoning = ui_updates.get('chat_b_reasoning') + page_prompt = ui_updates.get('page1_prompt') # 返回用户信息和 UI 更新,使用 advance_workflow 提供的内容 return ( updated_user_info, - chat_a_content, # 由 advance_workflow 提供的模式适配内容 - chat_b_content, # 使用适合当前模式的内容 + chat_a_answer, # 由 advance_workflow 提供的模式适配内容 + chat_b_answer, # 使用适合当前模式的内容 + chat_a_reasoning, # 使用适合当前模式的内容 + chat_b_reasoning, # 使用适合当前模式的内容 page_prompt, # 使用适合当前模式的提示 data_subset_state, len(full_question_ids_list), @@ -908,10 +866,10 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair): data_subset_state['models'] = [model_a, model_b] # Format chat content - chat_a_content = format_chat(model_a['reasoning_trace'], - tool_database_labels) - chat_b_content = format_chat(model_b['reasoning_trace'], - tool_database_labels) + chat_A_answer, chat_A_reasoning, _ = format_chat( + model_a['reasoning_trace'], tool_database_labels) + chat_B_answer, chat_B_reasoning, _ = format_chat( + model_b['reasoning_trace'], tool_database_labels) # Format prompt based on mode prompt_html = ( @@ -919,124 +877,67 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair): f'padding: 10px; border-radius: 5px; color: black;">' f'Prompt: {data_subset_state["question"]}' ) + chat_a_answer = gr.Chatbot( + value=chat_A_answer, + type="messages", + height=200, + label="Model A Answer", + show_copy_button=False, + show_label=True, + render_markdown=True, + avatar_images=None, + rtl=False, + autoscroll=False, + ) + chat_b_answer = gr.Chatbot( + value=chat_B_answer, + type="messages", + height=200, + label="Model B Answer", + show_copy_button=False, + show_label=True, + render_markdown=True, + avatar_images=None, + rtl=False, + autoscroll=False, + ) + chat_a_reasoning = gr.Chatbot( + value=chat_A_reasoning, + type="messages", + height=300, + label="Model A Reasoning", + show_copy_button=False, + show_label=True, + render_markdown=True, + avatar_images=None, + rtl=False, + autoscroll=False, + ) + chat_b_reasoning = gr.Chatbot( + value=chat_B_reasoning, + type="messages", + height=300, + label="Model B Reasoning", + show_copy_button=False, + show_label=True, + render_markdown=True, + avatar_images=None, + rtl=False, + autoscroll=False, + ) current_mode = progress_state.get('mode', 'pairwise') - if current_mode == 'scoring': - # For scoring mode, prepare additional context and enhanced displays - current_pair = next_pair - pairwise_results = progress_state['pairwise_results'].get( - current_pair, [None] * len_criteria) - data_subset_state['current_pairwise_results'] = pairwise_results - - # Create enhanced pairwise results display with conflict warnings - pairwise_results_for_display = [] - for i in range(len_criteria): # Use len_criteria to generate correct number of display elements - # Get corresponding comparison result - choice = pairwise_results[i] if i < len(pairwise_results) else None - - # Normalize choice, support both mapping key and value forms - normalized_choice = choice - if choice in mapping: - normalized_choice = mapping[choice] - - # Determine color based on normalized choice - if normalized_choice == "A": - choice_color = "#2E8B57" - constraint_text = "Constraint: Model A score ≥ Model B score" - display_choice = "A" - elif normalized_choice == "B": - choice_color = "#FF6347" - constraint_text = "Constraint: Model B score ≥ Model A score" - display_choice = "B" - elif normalized_choice == "tie": - choice_color = "#4682B4" - constraint_text = "Constraint: Model A score = Model B score" - display_choice = "Tie" - elif normalized_choice == "neither": - choice_color = "#808080" - constraint_text = "Constraint: Neither model performed well" - display_choice = "Neither" - else: - choice_color = "#808080" - constraint_text = "Constraint: No score constraints applied" - display_choice = "Not selected" - - # Check for existing scores and conflicts using unified logic - conflict_warning = "" - current_scoring_pair = next_pair - pairwise_scores = progress_state.get('pairwise_scores', {}) - pair_scores_tuple = pairwise_scores.get(current_scoring_pair) - - if pair_scores_tuple and len(pair_scores_tuple) >= 2 * len_criteria: - # Extract scores from tuple format [A1, A2, ..., An, B1, B2, ..., Bn] - score_a = pair_scores_tuple[i] # A scores are in first half - score_b = pair_scores_tuple[i + len_criteria] # B scores are in second half - - # Use unified conflict detection - if _detect_scoring_conflict(choice, score_a, score_b): - conflict_warning = _generate_conflict_warning_html(score_a, score_b) - - # Create enhanced display with better formatting and conflict warning - display_html = ( - f'
' - f'Your pairwise choice: ' - f'' - f'{display_choice}' - f' {constraint_text}
' - f'{conflict_warning}' - f'
' - ) - pairwise_results_for_display.append(gr.Markdown(display_html, visible=True)) - - - # Ensure correct length - assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}" - - # Create enhanced chatbot components for scoring page - chat_a_page2 = gr.Chatbot( - value=chat_a_content, - type="messages", - height=400, - label="Model A Response", - show_copy_button=False, - render_markdown=True, - autoscroll=False, - ) - - chat_b_page2 = gr.Chatbot( - value=chat_b_content, - type="messages", - height=400, - label="Model B Response", - show_copy_button=False, - render_markdown=True, - autoscroll=False, - ) - - result_dict = { - 'chat_a_page1': None, # Pairwise content (unused in scoring) - 'chat_b_page1': None, # Pairwise content (unused in scoring) - 'page1_prompt': None, # Pairwise prompt (unused in pairwise) - 'chat_a_page2': chat_a_page2, # Enhanced scoring content - 'chat_b_page2': chat_b_page2, # Enhanced scoring content - 'page2_prompt': gr.HTML(prompt_html), # Scoring prompt - 'pairwise_results_for_display': pairwise_results_for_display - } - - return result_dict - else: # pairwise mode - return { - 'chat_a_page1': chat_a_content, # Pairwise content - 'chat_b_page1': chat_b_content, # Pairwise content - 'page1_prompt': gr.HTML(prompt_html), # Pairwise prompt - 'chat_a_page2': None, # Scoring content (unused in pairwise) - 'chat_b_page2': None, # Scoring content (unused in pairwise) - 'page2_prompt': None, # Scoring prompt (unused in pairwise) - 'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] # Ensure correct length - } + return { + 'chat_a_answer': chat_a_answer, # Pairwise content + 'chat_b_answer': chat_b_answer, # Pairwise content + 'chat_a_reasoning': chat_a_reasoning, # Scoring content + 'chat_b_reasoning': chat_b_reasoning, # Scoring content + 'page1_prompt': gr.HTML(prompt_html), # Pairwise prompt + 'chat_a_page2': None, # Scoring content (unused in pairwise) + 'chat_b_page2': None, # Scoring content (unused in pairwise) + 'page2_prompt': None, # Scoring prompt (unused in pairwise) + } def _extract_pairwise_choice(progress_state, index): @@ -1071,65 +972,6 @@ def _extract_pairwise_choice(progress_state, index): return None -def _detect_scoring_conflict(pairwise_choice, score_a, score_b): - """ - Unified conflict detection logic for scoring vs pairwise comparison. - - Args: - pairwise_choice: The pairwise comparison choice (raw or normalized) - score_a: Score for model A - score_b: Score for model B - - Returns: - bool: True if conflict detected, False otherwise - """ - if pairwise_choice is None or score_a is None or score_b is None: - return False - - # Normalize the pairwise choice - normalized_choice = mapping.get(pairwise_choice, pairwise_choice) - - # Helper to safely convert scores to integers for comparison - def safe_int(x): - try: - return int(x) - except (ValueError, TypeError): - return None - - score_a_int = safe_int(score_a) - score_b_int = safe_int(score_b) - - # Check for conflicts based on normalized pairwise choice - if normalized_choice == "A" and score_a_int is not None and score_b_int is not None: - return score_a_int < score_b_int # A should be >= B for A preferred - elif normalized_choice == "B" and score_a_int is not None and score_b_int is not None: - return score_b_int < score_a_int # B should be >= A for B preferred - elif normalized_choice == "tie": - return score_a != score_b # Tie requires exact match - - return False - - -def _generate_conflict_warning_html(score_a, score_b): - """ - Generate HTML for conflict warning display. - - Args: - score_a: Score for model A - score_b: Score for model B - - Returns: - str: HTML string for conflict warning - """ - return ( - f'
' - f'⚠️ CONFLICT WARNING: ' - f'Your current scores (A={score_a}, B={score_b}) conflict with your pairwise choice. ' - f'The system will prioritize your pairwise choice when you submit.' - f'
' - ) - def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values=True): """ @@ -1198,104 +1040,31 @@ def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values return upd_A, upd_B -def apply_scoring_restrictions(progress_state, index, score_a, score_b): - """ - Apply scoring restrictions based on pairwise comparison results. - This is a standalone version of the restrict_choices logic used for initial page load. - - Key logic (COMPARISON-FIRST): - 1. If both scores exist, check for conflicts with pairwise choice: - - If scores match pairwise choice: lock both to existing values - - If scores conflict with pairwise choice: prioritize pairwise and clear conflicting scores - 2. If one score exists, lock it and apply restrictions to the other based on the locked score - 3. If no scores exist, apply normal pairwise restrictions - """ - print(f"[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}") - - # Extract the pairwise choice for the current criterion (needed for restrictions) - pairwise_choice = _extract_pairwise_choice(progress_state, index) - - if pairwise_choice is not None: - print(f"[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}") - else: - print(f"[Auto-restrict] No pairwise choice found for criterion {index}") - - # Case 1: Both scores exist - COMPARISON-FIRST logic - if score_a is not None and score_b is not None: - print(f"[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice") - - # Use unified conflict detection - conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b) - - if conflict_detected: - print(f"[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}") - print(f"[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores") - # Apply normal pairwise restrictions with cleared values to prioritize pairwise choice - result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True) - else: - print(f"[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both") - # Ensure scores are strings for Gradio compatibility - score_a_str = str(score_a) - score_b_str = str(score_b) - result_A = gr.update(choices=[score_a_str], value=score_a_str) - result_B = gr.update(choices=[score_b_str], value=score_b_str) - - # Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None) - elif score_a is not None and score_b is None: - print(f"[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)") - score_a_str = str(score_a) - result_A = gr.update(choices=[score_a_str], value=score_a_str) - # Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None - _, temp_result_B = _apply_rating_restrictions(pairwise_choice, score_a_str, None, include_values=False) - # Explicitly ensure B value is None while keeping the restricted choices - result_B = gr.update(choices=temp_result_B.get('choices', ["1", "2", "3", "4", "5", "Unable to Judge"]), value=None) - - # Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None) - elif score_a is None and score_b is not None: - print(f"[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)") - score_b_str = str(score_b) - result_B = gr.update(choices=[score_b_str], value=score_b_str) - # Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None - temp_result_A, _ = _apply_rating_restrictions(pairwise_choice, None, score_b_str, include_values=False) - # Explicitly ensure A value is None while keeping the restricted choices - result_A = gr.update(choices=temp_result_A.get('choices', ["1", "2", "3", "4", "5", "Unable to Judge"]), value=None) - - # Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing - else: - print(f"[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing") - # Apply restrictions using the shared utility function with explicit None values - result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True) - - print(f"[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates") - - return result_A, result_B - - def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None): """ Unified workflow manager that handles all state transitions and UI updates. """ - print(f"Advance workflow called, previous mode: {progress_state.get('mode')}") - print(progress_state) - default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)] + # print(f"Advance workflow called, previous mode: {progress_state.get('mode')}") + # print(progress_state) # Validate input for pairwise comparisons - if current_pairwise is not None and progress_state.get('mode') == 'pairwise' and any(answer is None for answer in current_pairwise): - gr.Warning("Error: Please select an option for every comparison.", - duration=5) + if current_pairwise is not None and any(answer is None for answer in current_pairwise): + missing_comparisons = [] + for i, answer in enumerate(current_pairwise): + if answer is None: + missing_comparisons.append(criteria_for_comparison[i]['label']) + + missing_text = ", ".join(missing_comparisons) + error_msg = f"Please select an option for the following pairwise comparison(s): {missing_text}" + gr.Info(error_msg) return { 'progress_state': progress_state, 'page1_visible': gr.update(visible=True), # Keep page1 visible - 'page2_visible': gr.update(visible=False), # Hide page2 - 'pairwise_radios': [gr.update() for _ in range(len_criteria)], - 'pairwise_reasons': [gr.update() for _ in range(len_criteria)], - 'chat_a_page1': gr.update(), # Keep chat_a unchanged - 'chat_b_page1': gr.update(), # Keep chat_b unchanged + 'chat_a_answer': gr.update(), # Keep chat_a unchanged + 'chat_b_answer': gr.update(), # Keep chat_b unchanged 'page1_prompt': gr.update(), # Keep page1_prompt unchanged - 'chat_a_page2': gr.update(), # Keep chat_a_page2 unchanged - 'chat_b_page2': gr.update(), # Keep chat_b_page2 unchanged - 'page2_prompt': gr.update(), # Keep page2_prompt unchanged - 'pairwise_results_for_display': default_pairwise_results + 'chat_a_reasoning': gr.update(), # Keep chat_a_page2 unchanged + 'chat_b_reasoning': gr.update(), # Keep chat_b_page2 unchanged } # Validate input for scoring @@ -1306,308 +1075,158 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c duration=5) return { 'progress_state': progress_state, - 'page1_visible': gr.update(visible=False), # Hide page1 - 'page2_visible': gr.update(visible=True), # Keep page2 visible - 'pairwise_radios': [gr.update(value=None) for _ in range(len_criteria)], - 'pairwise_reasons': [gr.update(value=None) for _ in range(len_criteria)], - 'chat_a_page1': gr.update(), # Keep chat_a unchanged - 'chat_b_page1': gr.update(), # Keep chat_b unchanged + 'page1_visible': gr.update(visible=True), # Show page1 + 'chat_a_answer': gr.update(), # Keep chat_a unchanged + 'chat_b_answer': gr.update(), # Keep chat_b unchanged 'page1_prompt': gr.update(), # Keep page1_prompt unchanged - 'chat_a_page2': gr.update(), # Keep chat_a_page2 unchanged - 'chat_b_page2': gr.update(), # Keep chat_b_page2 unchanged - 'page2_prompt': gr.update(), # Keep page2_prompt unchanged - 'ratings_A': [gr.update() for _ in range(len_criteria)], # Keep current ratings - 'ratings_B': [gr.update() for _ in range(len_criteria)], # Keep current ratings - 'pairwise_results_for_display': progress_state['ui_buffer']['pairwise_results_for_display'] + 'chat_a_reasoning': gr.update(), # Keep chat_a_page2 unchanged + 'chat_b_reasoning': gr.update(), # Keep chat_b_page2 unchanged } # 1. Determine next task based on current progress - next_pairwise_pair = get_next_uncompleted_pair(progress_state) - next_scoring_pair = get_next_unscored_pair(progress_state) + next_pair = get_next_uncompleted_pair(progress_state) # 2. Determine workflow phase and set mode - if next_pairwise_pair is not None: + if next_pair is not None: progress_state['mode'] = 'pairwise' - next_pair = next_pairwise_pair print(f"Pairwise mode: next pair {next_pair}") - elif next_scoring_pair is not None: - progress_state['mode'] = 'scoring' - next_pair = next_scoring_pair - print(f"Scoring mode: next pair {next_pair}") else: - progress_state['mode'] = 'completed' - next_pair = None - print("All evaluations completed") + # Current question completed, but this doesn't mean all questions are done + # The caller (submit_pairwise_scoring) will handle question transitions + progress_state['mode'] = 'current_question_completed' + print("Current question completed - awaiting next question") # 3. Create base UI update structure current_mode = progress_state.get('mode', 'pairwise') - ui_updates = { 'progress_state': progress_state, - 'page1_visible': gr.update(visible=(current_mode == 'pairwise')), - 'page2_visible': gr.update(visible=(current_mode == 'scoring')), - 'pairwise_radios': [gr.update(value=None) for _ in range(len_criteria)], - 'pairwise_reasons': [gr.update(value=None) for _ in range(len_criteria)], - 'chat_a_page1': None, - 'chat_b_page1': None, + 'page1_visible': gr.update(visible=True), + 'chat_a_answer': None, + 'chat_b_answer': None, 'page1_prompt': None, - 'chat_a_page2': None, - 'chat_b_page2': None, - 'page2_prompt': None, - 'pairwise_results_for_display': default_pairwise_results + 'chat_a_reasoning': None, + 'chat_b_reasoning': None, } - # 4. If in scoring mode, check historical scoring records - if current_mode == 'scoring' and next_pair is not None: - # Get historical scores for current model pair (if exists) - current_model_A, current_model_B = next_pair - - # Search for historical scores directly from method-keyed storage - simpler and more efficient! - relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A) - relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B) - - print(f"Relevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}") - - # Debug check for None values - if relevant_scores_A is None: - print(f"[DEBUG] relevant_scores_A is None for pair {next_pair}") - if relevant_scores_B is None: - print(f"[DEBUG] relevant_scores_B is None for pair {next_pair}") - - # Set initial values for UI scoring controls - ratings_A_updates = [] - ratings_B_updates = [] - - # Set initial values for A and B model scoring controls and apply scoring restrictions - for i in range(len_criteria): - # For A model: use historical score if available, otherwise clear - score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None - # For B model: use historical score if available, otherwise clear - score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None - - print(f"[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}") - - # Apply scoring restrictions based on pairwise comparison results - restricted_update_A, restricted_update_B = apply_scoring_restrictions( - progress_state, i, score_A, score_B) - - print(f"[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}") - - ratings_A_updates.append(restricted_update_A) - ratings_B_updates.append(restricted_update_B) - - ui_updates['ratings_A'] = ratings_A_updates - ui_updates['ratings_B'] = ratings_B_updates - - print(f"Loaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}") - print(f"Applied automatic scoring restrictions based on pairwise choices") - else: - # Non-scoring mode, keep blank - ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)] - ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)] - - # 5. Extract content for current phase + # 4. Extract content for current phase if next_pair is not None: - print("debug: Extracting UI content for next pair") - print("progress_state:", progress_state) - print("next_pair:", next_pair) + # print("debug: Extracting UI content for next pair") + # print("progress_state:", progress_state) + # print("next_pair:", next_pair) content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair) ui_updates.update(content_updates) - - # If extract_ui_content_by_mode returned new pairwise_results_for_display, - # ensure it has correct length - if 'pairwise_results_for_display' in content_updates: - pairwise_results = content_updates['pairwise_results_for_display'] - if pairwise_results is None or len(pairwise_results) != len_criteria: - print(f"Warning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}") - ui_updates['pairwise_results_for_display'] = default_pairwise_results - - # 6. Ensure all required arrays have correct lengths before returning - # Validate pairwise_radios and pairwise_reasons always have the right length - if len(ui_updates.get('pairwise_radios', [])) != len_criteria: - ui_updates['pairwise_radios'] = [gr.update(value=None) for _ in range(len_criteria)] - if len(ui_updates.get('pairwise_reasons', [])) != len_criteria: - ui_updates['pairwise_reasons'] = [gr.update(value=None) for _ in range(len_criteria)] - - # Validate pairwise_results_for_display always has the right length - if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria: - print(f"[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!") - print(f" Original length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}") - ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)] - if 'pairwise_results_for_display' in ui_updates: - if 'ui_buffer' not in progress_state: - progress_state['ui_buffer'] = {} - progress_state['ui_buffer']['pairwise_results_for_display'] = ui_updates['pairwise_results_for_display'] - - # 7. Calculate and add progress information + + # 5. Calculate and add progress information progress_info = calculate_progress_info(progress_state) # Update progress bar headers with dynamic content current_mode = progress_state.get('mode', 'pairwise') if current_mode == 'pairwise': ui_updates['pairwise_header'] = gr.update(value=f"## {progress_info['pairwise_progress_text']}") - ui_updates['scoring_header'] = gr.update(value="## Part 2/2: Rate Model Responses") - elif current_mode == 'scoring': - ui_updates['pairwise_header'] = gr.update(value="## Part 1/2: Pairwise Comparison") - ui_updates['scoring_header'] = gr.update(value=f"## {progress_info['scoring_progress_text']}") + ui_updates['pairwise_progress_text'] = progress_info['pairwise_progress_text'] + elif current_mode == 'current_question_completed': + # Current question is done, show completion status for this question + ui_updates['pairwise_header'] = gr.update(value="## Current Question Completed") + ui_updates['pairwise_progress_text'] = "Current question evaluation completed" else: - # Completed mode - ui_updates['pairwise_header'] = gr.update(value="## Part 1/2: Pairwise Comparison (Completed)") - ui_updates['scoring_header'] = gr.update(value="## Part 2/2: Rate Model Responses (Completed)") + # Completed mode (all questions done) + ui_updates['pairwise_header'] = gr.update(value="## All Evaluations Completed") + ui_updates['pairwise_progress_text'] = "All evaluations completed" return ui_updates -def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons): - """Process pairwise comparison submission and transition to scoring mode.""" - print(f"=== SUBMIT PAIRWISE COMPARISON DEBUG ===") - print(f"Input progress_state: {progress_state}") - print(f"Pairwise comparisons: {pairwise_comparisons}") - +def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *combined_values): + """ + Submit scoring results and proceed to the next step. + Simplified to use unified workflow management. + """ + # print(f"Input progress_state: {progress_state}") + # print(f"Pairwise comparisons: {combined_values}") + # Process input parameters criteria_count = len_criteria - pairwise = list(pairwise_comparisons[:criteria_count]) - comparison_reasons = list(pairwise_comparisons[criteria_count:]) + + pairwise = list(combined_values[:criteria_count]) + comparison_reasons = list( + combined_values[criteria_count:criteria_count*2]) + ratings_A = list( + combined_values[criteria_count*2:criteria_count*3]) + ratings_B = list(combined_values[criteria_count*3:]) + pairwise = [mapping.get(choice, choice) for choice in pairwise] # Normalize choices + # Save current ratings - now store by method instead of by pair + pair = progress_state['all_pairs'][progress_state['current_score_pair_index']] + model_A, model_B = pair + gr.Info(f"Submitting your evaluation results and loading next question...") + + # Validate input if any(answer is None for answer in pairwise): # Return current state with no changes - let advance_workflow handle the structure ui_updates = advance_workflow(progress_state, data_subset_state, current_pairwise=pairwise) return [ - ui_updates.get('page1_visible'), - ui_updates.get('page2_visible'), - ui_updates.get('page1_prompt'), - ui_updates.get('page2_prompt'), - ui_updates.get('progress_state'), - ui_updates.get('chat_a_page1'), - ui_updates.get('chat_b_page1'), - ui_updates.get('chat_a_page2'), - ui_updates.get('chat_b_page2'), - *ui_updates.get('pairwise_radios'), - *ui_updates.get('pairwise_reasons'), - *ui_updates.get('pairwise_results_for_display') + gr.update(visible=False), # page0 + gr.update(visible=True), # page1 + "", # page0_error_box + ui_updates.get('page1_prompt'), # page1_prompt + user_info, # user_info_state + data_subset_state, # data_subset_state + ui_updates.get('progress_state'), # progress_state + progress_state.get('pairwise_results', {}), # pairwise_state + ui_updates.get('chat_a_answer'), # chat_a_answer + ui_updates.get('chat_b_answer'), # chat_b_answer + ui_updates.get('chat_a_reasoning'), # chat_a_reasoning + ui_updates.get('chat_b_reasoning'), # chat_b_reasoning + ui_updates.get('pairwise_header'), # pairwise_header + *([gr.update() for _ in range(len_criteria)]), # pairwise_inputs (keep current values) + *([gr.update() for _ in range(len_criteria)]), # comparison_reasons_inputs (keep current values) + *([gr.update() for _ in range(len_criteria)]), # ratings_A_page1 (keep current values) + *([gr.update() for _ in range(len_criteria)]), # ratings_B_page1 (keep current values) ] - # Save current results - ratings_A_vals = None - ratings_B_vals = None - row_dict = build_row_dict( - data_subset_state, user_info, pairwise, - comparison_reasons, ratings_A_vals, ratings_B_vals - ) - append_to_sheet( - user_data=None, - custom_row_dict=row_dict, - custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + - f"_{user_info['evaluator_id']}"), - add_header_when_create_sheet=True - ) - - # Update progress state - pair = progress_state['all_pairs'][progress_state['current_pair_index']] - progress_state['pairwise_results'][pair] = pairwise - progress_state['pairwise_done'].add(pair) - - # Use unified workflow manager - completely delegate UI updates - ui_updates = advance_workflow(progress_state, data_subset_state) - - # Check individual elements if it's a list - if isinstance(pairwise_results_for_display, list): - for i, item in enumerate(pairwise_results_for_display): - print(f" Element {i}: {type(item)} - {item}") - - return [ - ui_updates.get('page1_visible'), - ui_updates.get('page2_visible'), - ui_updates.get('page1_prompt'), - ui_updates.get('page2_prompt'), - ui_updates.get('progress_state'), - ui_updates.get('chat_a_page1'), - ui_updates.get('chat_b_page1'), - ui_updates.get('chat_a_page2'), - ui_updates.get('chat_b_page2'), - *ui_updates.get('pairwise_radios'), - *ui_updates.get('pairwise_reasons'), - *ui_updates.get('pairwise_results_for_display'), - ui_updates.get('pairwise_header'), # Add progress header - ui_updates.get('scoring_header') # Add progress header - ] - - -def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratings): - """ - Submit scoring results and proceed to the next step. - Simplified to use unified workflow management. - """ - print(f"=== SUBMIT PAIRWISE SCORING DEBUG ===") - print(f"Input progress_state: {progress_state}") - print(f"Ratings: {ratings}") - - # Save current ratings - now store by method instead of by pair - pair = progress_state['all_pairs'][progress_state['current_score_pair_index']] - model_A, model_B = pair - criteria_count = len_criteria - - # Split ratings into A and B scores - ratings_A = list(ratings[:criteria_count]) - ratings_B = list(ratings[criteria_count:]) - # Validate input - check if all ratings are provided if any(rating is None for rating in ratings_A) or any(rating is None for rating in ratings_B): # Return current state with no changes - let advance_workflow handle the structure ui_updates = advance_workflow(progress_state, data_subset_state, current_scoring=[ratings_A, ratings_B]) - # Determine modal visibility based on completion status - all_scoring_done = (len(progress_state['scoring_done_pairs']) == - len(progress_state['all_pairs'])) - return [ - ui_updates.get('page1_visible'), # 5 - ui_updates.get('page2_visible'), # 6 - ui_updates.get('page1_prompt'), # 4 - ui_updates.get('page2_prompt'), # 25 - data_subset_state['reference_answer'], # page1_reference_answer - data_subset_state['reference_answer'], # page2_reference_answer + gr.update(visible=False), # page0 + gr.update(visible=True), # page1 + "", # page0_error_box + ui_updates.get('page1_prompt'), # page1_prompt user_info, # user_info_state data_subset_state, # data_subset_state - ui_updates.get('progress_state'), # 1 + ui_updates.get('progress_state'), # progress_state progress_state.get('pairwise_results', {}), # pairwise_state - ui_updates.get('chat_a_page1'), # 2 - ui_updates.get('chat_b_page1'), # 3 - ui_updates.get('chat_a_page2'), # 23 - ui_updates.get('chat_b_page2'), # 24 - *ui_updates.get('pairwise_radios'), # 7-14 - *ui_updates.get('pairwise_reasons'), # 15-22 - *ui_updates.get('ratings_A'), - *ui_updates.get('ratings_B'), - *ui_updates.get('pairwise_results_for_display'), - ui_updates.get('pairwise_header'), # Add pairwise progress header - ui_updates.get('scoring_header') # Add scoring progress header + ui_updates.get('chat_a_answer'), # chat_a_answer + ui_updates.get('chat_b_answer'), # chat_b_answer + ui_updates.get('chat_a_reasoning'), # chat_a_reasoning + ui_updates.get('chat_b_reasoning'), # chat_b_reasoning + ui_updates.get('pairwise_header'), # pairwise_header + *([gr.update() for _ in range(len_criteria)]), # pairwise_inputs (keep current values) + *([gr.update() for _ in range(len_criteria)]), # comparison_reasons_inputs (keep current values) + *([gr.update() for _ in range(len_criteria)]), # ratings_A_page1 (keep current values) + *([gr.update() for _ in range(len_criteria)]), # ratings_B_page1 (keep current values) ] # Initialize pairwise_scores as method-keyed dict if it doesn't exist if 'pairwise_scores' not in progress_state: progress_state['pairwise_scores'] = {} + progress_state['pairwise_results'][pair] = pairwise + progress_state['pairwise_done'].add(pair) # Store scores by method name instead of by pair progress_state['pairwise_scores'][model_A] = ratings_A progress_state['pairwise_scores'][model_B] = ratings_B - - progress_state['scoring_done_pairs'].add(pair) - - print(f"[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}") - print(f"[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}") # Save results to database like submit_pairwise_comparison does - # Extract pairwise comparison results for this pair - pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria) - comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page - + # Build and save the row row_dict = build_row_dict( - data_subset_state, user_info, pairwise_results, + data_subset_state, user_info, pairwise, comparison_reasons, ratings_A, ratings_B ) append_to_sheet( @@ -1618,98 +1237,95 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin add_header_when_create_sheet=True ) - # Use unified workflow manager - completely delegate UI updates - ui_updates = advance_workflow(progress_state, data_subset_state) - - # Determine modal visibility based on completion status - all_scoring_done = (len(progress_state['scoring_done_pairs']) == - len(progress_state['all_pairs'])) - + # Check if current question is completed (all pairs done) + current_question_completed = (len(progress_state['pairwise_done']) == len(progress_state['all_pairs'])) - if not all_scoring_done: - # advance_workflow handles all UI updates properly - # Return UI updates using advance_workflow results directly + if not current_question_completed: + # Still have pairs to evaluate in current question + # Use unified workflow manager for within-question navigation + ui_updates = advance_workflow(progress_state, data_subset_state) return [ - ui_updates.get('page1_visible'), # 5 - ui_updates.get('page2_visible'), # 6 - ui_updates.get('page1_prompt'), # 4 - ui_updates.get('page2_prompt'), # 25 - data_subset_state['reference_answer'], # page1_reference_answer - data_subset_state['reference_answer'], # page2_reference_answer + gr.update(visible=False), # page0 + gr.update(visible=True), # page1 + "", # page0_error_box + ui_updates.get('page1_prompt'), # page1_prompt user_info, # user_info_state data_subset_state, # data_subset_state - ui_updates.get('progress_state'), # 1 + ui_updates.get('progress_state'), # progress_state progress_state.get('pairwise_results', {}), # pairwise_state - ui_updates.get('chat_a_page1'), # 2 - ui_updates.get('chat_b_page1'), # 3 - ui_updates.get('chat_a_page2'), # 23 - ui_updates.get('chat_b_page2'), # 24 - *ui_updates.get('pairwise_radios'), # 7-14 - *ui_updates.get('pairwise_reasons'), # 15-22 - *ui_updates.get('ratings_A'), - *ui_updates.get('ratings_B'), - *ui_updates.get('pairwise_results_for_display'), - ui_updates.get('pairwise_header'), # Add pairwise progress header - ui_updates.get('scoring_header') # Add scoring progress header + ui_updates.get('chat_a_answer'), # chat_a_answer + ui_updates.get('chat_b_answer'), # chat_b_answer + ui_updates.get('chat_a_reasoning'), # chat_a_reasoning + ui_updates.get('chat_b_reasoning'), # chat_b_reasoning + ui_updates.get('pairwise_header'), # pairwise_header + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new pair) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new pair) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new pair) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new pair) ] - user_info, chat_a, chat_b, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( + + # Get fresh question data when current question is completed + user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( user_info, our_methods ) - if remaining_count == 0: # TODO: handle completion + if remaining_count == 0: # Handle completion gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!") + # Create a completion state for advance_workflow to handle properly + if progress_state is None: + progress_state = {'mode': 'completed'} + else: + progress_state['mode'] = 'completed' + + # Use advance_workflow for completion state + ui_updates = advance_workflow(progress_state, data_subset_state) return [ - ui_updates.get('page1_visible'), # 5 - ui_updates.get('page2_visible'), # 6 - ui_updates.get('page1_prompt'), # 4 - ui_updates.get('page2_prompt'), # 25 - data_subset_state['reference_answer'], # page1_reference_answer - data_subset_state['reference_answer'], # page2_reference_answer + gr.update(visible=False), # page0 + gr.update(visible=True), # page1 + "", # page0_error_box + ui_updates.get('page1_prompt', "## All Evaluations Completed"), # page1_prompt user_info, # user_info_state data_subset_state, # data_subset_state - ui_updates.get('progress_state'), # 1 - progress_state.get('pairwise_results', {}), # pairwise_state - ui_updates.get('chat_a_page1'), # 2 - ui_updates.get('chat_b_page1'), # 3 - ui_updates.get('chat_a_page2'), # 23 - ui_updates.get('chat_b_page2'), # 24 - *ui_updates.get('pairwise_radios'), # 7-14 - *ui_updates.get('pairwise_reasons'), # 15-22 - *ui_updates.get('ratings_A'), - *ui_updates.get('ratings_B'), - *ui_updates.get('pairwise_results_for_display'), # 26-33 - ui_updates.get('pairwise_header'), # Add pairwise progress header - ui_updates.get('scoring_header') # Add scoring progress header + progress_state, # progress_state + progress_state.get('pairwise_results', {}) if progress_state else {}, # pairwise_state + ui_updates.get('chat_a_answer', []), # chat_a_answer + ui_updates.get('chat_b_answer', []), # chat_b_answer + ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning + ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning + ui_updates.get('pairwise_header', gr.update(value="## All Evaluations Completed")), # pairwise_header + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for completion) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for completion) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for completion) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for completion) ] - # Use advance_workflow to get all UI updates + + # Calculate progress and show info message + num_remaining_questions = remaining_count // len(progress_state['all_pairs']) + gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") + + # Use advance_workflow to get ALL UI updates for new question ui_updates = advance_workflow(progress_state, data_subset_state) - print(f"In submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}") - num_remaining_questions = remaining_count// len(progress_state['all_pairs']) - gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text + + # Return using ONLY advance_workflow results - complete delegation return ( - ui_updates.get('page1_visible'), # 5 - ui_updates.get('page2_visible'), # 6 - ui_updates.get('page1_prompt'), # 4 - ui_updates.get('page2_prompt'), # 25 - data_subset_state['reference_answer'], # page1_reference_answer - data_subset_state['reference_answer'], # page2_reference_answer + gr.update(visible=False), # page0 + gr.update(visible=True), # page1 + "", # page0_error_box + ui_updates.get('page1_prompt', ""), # page1_prompt - use advance_workflow content user_info, # user_info_state - data_subset_state, # data_subset_state - ui_updates.get('progress_state'), # 1 + data_subset_state, # data_subset_state - use fresh content + ui_updates.get('progress_state', progress_state), # progress_state - use advance_workflow content progress_state.get('pairwise_results', {}), # pairwise_state - ui_updates.get('chat_a_page1'), # 2 - ui_updates.get('chat_b_page1'), # 3 - ui_updates.get('chat_a_page2'), # 23 - ui_updates.get('chat_b_page2'), # 24 - *ui_updates.get('pairwise_radios'), # 7-14 - *ui_updates.get('pairwise_reasons'), # 15-22 - *ui_updates.get('ratings_A'), - *ui_updates.get('ratings_B'), - *ui_updates.get('pairwise_results_for_display'), - ui_updates.get('pairwise_header'), # Add pairwise progress header - ui_updates.get('scoring_header') # Add scoring progress header - # next_question_modal_visibility + ui_updates.get('chat_a_answer', []), # chat_a_answer - use advance_workflow content + ui_updates.get('chat_b_answer', []), # chat_b_answer - use advance_workflow content + ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning - use advance_workflow content + ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning - use advance_workflow content + ui_updates.get('pairwise_progress_text', ""), # pairwise_header - use advance_workflow content + *([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new question) + *([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new question) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new question) + *([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new question) ) # --- Define Callback Functions for Confirmation Flow --- @@ -1757,50 +1373,6 @@ def build_row_dict( return row -def reset_everything_except_user_info(): - - # 3) Reset all pairwise radios & textboxes - reset_pairwise_radios = [gr.update(value=None) - for i in range(len_criteria)] - reset_pairwise_reasoning_texts = [ - gr.update(value=None) for i in range(len_criteria)] - - # 4) Reset all rating radios - reset_ratings_A = [gr.update(value=None) for i in range(len_criteria)] - reset_ratings_B = [gr.update(value=None) for i in range(len_criteria)] - - return ( - # pages - gr.update(visible=True), # page0 - gr.update(visible=False), # final_page - - # states - # gr.update(value=None), # user_info_state - gr.update(value=None), # pairwise_state - gr.update(value=None), # scores_A_state - gr.update(value=None), # comparison_reasons - gr.update(value=None), # unqualified_A_state - - # page0 elements that need to be reset - gr.update(value=""), # page0_error_box - - # page1 elements that need to be reset - gr.update(value=""), # page1_error_box - - # page2 elements that need to be reset - gr.update(value=""), # page2_prompt - gr.update(value=""), # page2_reference_answer - gr.update(value=[]), # chat_a_page2 - gr.update(value=[]), # chat_b_page2 - gr.update(value=""), # result_text - - # lists of gradio elements that need to be unrolled - *reset_pairwise_radios, - *reset_pairwise_reasoning_texts, - *reset_ratings_A, - *reset_ratings_B - ) - def restrict_choices(progress_state, index, score_a, score_b): """ Returns (update_for_A, update_for_B). @@ -1831,6 +1403,135 @@ def restrict_choices(progress_state, index, score_a, score_b): def clear_selection(): return None, None +def make_restrict_function(base_choices): + def restrict_choices_page1(radio_choice, score_a, score_b): + """ + Returns (update_for_A, update_for_B). + Enforces rating constraints based on the radio choice for page 1. + """ + # Helper to parse int safely + def to_int(x): + try: + # Extract number from "1 text..." format + return int(x.split()[0]) + except (ValueError, TypeError, AttributeError): + return None + + # Default: no restrictions, but ensure current values are valid + upd_A = gr.update(choices=base_choices, + value=score_a if score_a in base_choices else None) + upd_B = gr.update(choices=base_choices, + value=score_b if score_b in base_choices else None) + + # Skip if no meaningful pairwise choice + if radio_choice is None or radio_choice == "Neither model did well.": + return upd_A, upd_B + + a_int = to_int(score_a) + b_int = to_int(score_b) + + # Apply Restrictions based on radio choice + if radio_choice == "Model A is better.": + # Rule: A >= B + if a_int is not None and b_int is not None: + # Both are numeric, enforce A >= B + if a_int < b_int: + # Violation: A < B, reset the one that doesn't match the constraint + upd_A = gr.update(choices=base_choices, value=None) + upd_B = gr.update(choices=base_choices, value=None) + else: + # Valid: A >= B, apply mutual restrictions + allowed_a_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) >= b_int] + allowed_b_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) <= a_int] + upd_A = gr.update( + choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) + upd_B = gr.update( + choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) + elif a_int is not None: + # Only A is numeric, B must be <= A + allowed_b_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) <= a_int] + upd_B = gr.update( + choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) + elif b_int is not None: + # Only B is numeric, A must be >= B + allowed_a_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) >= b_int] + upd_A = gr.update( + choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) + # If both are "Unable to Judge", no restrictions needed + + elif radio_choice == "Model B is better.": + # Rule: B >= A + if a_int is not None and b_int is not None: + # Both are numeric, enforce B >= A + if b_int < a_int: + # Violation: B < A, reset both + upd_A = gr.update(choices=base_choices, value=None) + upd_B = gr.update(choices=base_choices, value=None) + else: + # Valid: B >= A, apply mutual restrictions + allowed_a_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) <= b_int] + allowed_b_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) >= a_int] + upd_A = gr.update( + choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) + upd_B = gr.update( + choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) + elif a_int is not None: + # Only A is numeric, B must be >= A + allowed_b_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) >= a_int] + upd_B = gr.update( + choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) + elif b_int is not None: + # Only B is numeric, A must be <= B + allowed_a_choices = [choice for choice in base_choices if to_int( + choice) is None or to_int(choice) <= b_int] + upd_A = gr.update( + choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) + + elif radio_choice == "Both models are equally good.": + # Rule: A == B + if a_int is not None and b_int is not None: + # Both are numeric + if a_int == b_int: + # Valid: A == B, restrict both to the same value + upd_A = gr.update(choices=[score_a], value=score_a) + upd_B = gr.update(choices=[score_b], value=score_b) + else: + # Invalid: A != B, reset both + upd_A = gr.update(choices=base_choices, value=None) + upd_B = gr.update(choices=base_choices, value=None) + elif a_int is not None: + # A is numeric, B must match A + upd_B = gr.update(choices=[score_a], value=score_a) + elif b_int is not None: + # B is numeric, A must match B + upd_A = gr.update(choices=[score_b], value=score_b) + elif score_a == "Unable to Judge." and score_b == "Unable to Judge.": + # Both are "Unable to Judge", restrict both to that + upd_A = gr.update( + choices=["Unable to Judge."], value="Unable to Judge.") + upd_B = gr.update( + choices=["Unable to Judge."], value="Unable to Judge.") + elif score_a == "Unable to Judge.": + # A is "Unable to Judge", B must match + upd_B = gr.update( + choices=["Unable to Judge."], value="Unable to Judge.") + elif score_b == "Unable to Judge.": + # B is "Unable to Judge", A must match + upd_A = gr.update( + choices=["Unable to Judge."], value="Unable to Judge.") + # If neither has a value, no restrictions needed + + return upd_A, upd_B + return restrict_choices_page1 + + centered_col_css = """ #centered-column { margin-left: auto; @@ -1874,7 +1575,9 @@ centered_col_css = """ } .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; } .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; } + """ + with gr.Blocks(css=centered_col_css) as demo: # States to save information between pages. user_info_state = gr.State() @@ -1906,329 +1609,224 @@ with gr.Blocks(css=centered_col_css) as demo: specialties_list = ["Error loading specialties"] subspecialties_list = ["Error parsing subspecialties"] - # Page -1: Page to link them to question submission form or evaluation portal - with gr.Column(visible=True, elem_id="page-1") as page_minus1: + # Page 0: Welcome / Informational page. + with gr.Column(visible=True, elem_id="page0") as page0: gr.HTML("""

TxAgent Evaluation Portal

-

Welcome to the TxAgent Evaluation Portal.

""") - with gr.Row(): - participate_eval_btn = gr.Button( - value="Click to 🌟 Participate in TxAgent Evaluation 🌟", - variant="primary", - size="lg", - elem_id="participate-btn" - ) - gr.HTML(TxAgent_Project_Page_HTML) + gr.Markdown("## Information:") + name = gr.Textbox(label="Name (required)", value="") + email = gr.Textbox( + label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="") + evaluator_id = gr.Textbox( + label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False) + + # Auto-sync evaluator_id with email + def sync_evaluator_id(email_value): + return email_value + + email.change( + fn=sync_evaluator_id, + inputs=[email], + outputs=[evaluator_id] + ) + + specialty_dd = gr.Dropdown( + choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True, value=["None"], visible=False) + subspecialty_dd = gr.Dropdown( + choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True, value=["None"], visible=False) + npi_id = gr.Textbox( + label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.") + years_exp_radio = gr.Radio( + choices=["0-2 years", "3-5 years", "6-10 years", + "11-20 years", "20+ years", "Not Applicable"], + label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)", + value="Not Applicable", + visible=False + ) + exp_explanation_tb = gr.Textbox( + label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)") - # Page 0: Welcome / Informational page. - with gr.Column(visible=False, elem_id="page0") as page0: - gr.Markdown("## Welcome to the TxAgent Evaluation Study!") + page0_error_box = gr.Markdown("") with gr.Row(): - with gr.Column(): - gr.Markdown( - "Please read the following instructions and then enter your information to begin:") - # Existing informational markdown... - gr.Markdown(""" - - Each session requires a minimum commitment of 5-10 minutes to complete one page of one question. - - Some questions will contain multiple pages. In the first few pages, you will be asked to compare the responses of two different models to the question. In the remaining pages, you will be asked to rate each model's response on a scale of 1-5. - - If a question contains multiple pages, your progress through the question will be saved if you exit the evaluation platform in the middle (provided you submit the page you are currently on). - - If you feel that a question does not make sense or is not biomedically relevant, there is a RED BUTTON at the top of the first model comparison page to indicate this - - You may use the Next buttons at the bottom of each page to submit your current response and go to the next question. - - You may stop in between questions and return at a later time; however, you must submit your answers to the current question or page if you would like the current answers saved. - - Please review the example question and LLM model response below: - - """) - # Assume 'your_image.png' is in the same directory - with open("anatomyofAgentResponse.jpg", "rb") as image_file: - img = Image.open(image_file) - new_size = (int(img.width * 0.5), int(img.height * 0.5)) - img = img.resize(new_size, Image.LANCZOS) - buffer = io.BytesIO() - img.save(buffer, format="PNG") - encoded_string = base64.b64encode( - buffer.getvalue()).decode("utf-8") - - image_html = f'
Your Image
' - ReasoningTraceExampleHTML = f""" -
- {image_html} -
- """ - gr.HTML(ReasoningTraceExampleHTML) - with gr.Column(): - gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation! - """) - gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.") - name = gr.Textbox(label="Name (required)", value="") - email = gr.Textbox( - label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="") - evaluator_id = gr.Textbox( - label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False) - - # Auto-sync evaluator_id with email - def sync_evaluator_id(email_value): - return email_value - - email.change( - fn=sync_evaluator_id, - inputs=[email], - outputs=[evaluator_id] - ) - - specialty_dd = gr.Dropdown( - choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True, value=["None"], visible=False) - subspecialty_dd = gr.Dropdown( - choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True, value=["None"], visible=False) - npi_id = gr.Textbox( - label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.") - years_exp_radio = gr.Radio( - choices=["0-2 years", "3-5 years", "6-10 years", - "11-20 years", "20+ years", "Not Applicable"], - label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)", - value="Not Applicable", - visible=False - ) - exp_explanation_tb = gr.Textbox( - label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)") + next_btn_0 = gr.Button("Next") - page0_error_box = gr.Markdown("") - with gr.Row(): - next_btn_0 = gr.Button("Next") - with gr.Row(): - # (your registration info will be saved) - home_btn_0 = gr.Button("Home") + gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation! + """) + + gr.Markdown(""" + ## Instructions: + Please review these instructions and enter your information to begin: + + - Each session requires at least 5-10 minutes per question. + - You can evaluate multiple questions; you will not repeat evaluations. + - For each question, compare responses from two models and rate them (scale: 1-5). + - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page. + - Use the Back and Next buttons to edit responses before submission. + - Use the Home Page button to return to the homepage; progress will save but not submit. + - Submit answers to the current question before moving to the next. + - You can pause between questions and return later; ensure current answers are submitted to save them. + """) + with open("anatomyofAgentResponse.jpg", "rb") as image_file: + img = Image.open(image_file) + new_size = (int(img.width * 0.5), int(img.height * 0.5)) + img = img.resize(new_size, Image.LANCZOS) + buffer = io.BytesIO() + img.save(buffer, format="PNG") + encoded_string = base64.b64encode( + buffer.getvalue()).decode("utf-8") - with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal: - eval_progress_text = gr.Markdown("You have X questions remaining.") + image_html = f'
Your Image
' + ReasoningTraceExampleHTML = f""" +
+ {image_html} +
+ """ + gr.HTML(ReasoningTraceExampleHTML) # Page 1: Pairwise Comparison. with gr.Column(visible=False) as page1: - # Dynamic progress header for pairwise comparison + # Make the number controlled by question indexing! pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison") + gr.Markdown("") + gr.Markdown("") + # Add small red button and comments text box in the same row page1_prompt = gr.HTML() - with gr.Accordion("Click to reveal a reference answer.", open=False, elem_id="answer-reference-btn"): - note_reference_answer = gr.Markdown( - """ - Warning: This answer has been generated automatically and may be incomplete or one of several correct solutions—please use it for reference only. - """, - elem_classes="reference-box" - ) - page1_reference_answer = gr.Markdown( - """ - **Reference Answer:** - - This is the reference answer content. - """, - elem_classes="reference-box" - ) - - - # Add small red button under the prompt - with gr.Row(): - nonsense_btn = gr.Button( - "Wrong Question?", - size="sm", - variant="stop", # red variant - elem_id="invalid-question-btn", - elem_classes=["short-btn"] - ) - gr.Markdown( - "Click the button if you think this question does not make sense or is not biomedically-relevant", - render=True - ) - - nonsense_btn.click( - fn=mark_invalid_question, - inputs=[nonsense_btn_clicked], - outputs=[nonsense_btn_clicked, nonsense_btn], - queue=False, - ) - + # --- Define four chat components: answer and reasoning for each model --- with gr.Row(): - # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A. + # Model A components with gr.Column(): - gr.Markdown("**Model A Response:** (You may have encountered this earlier because of the pairwise comparison.)") # Already bold label. - chat_a_page1 = gr.Chatbot( + gr.Markdown("**Model A Response:**") + chat_a_answer = gr.Chatbot( value=[], # Placeholder for chat history type="messages", - height=400, - label="Model A Response", + height=200, + label="Model A Answer", show_copy_button=False, show_label=True, - render_markdown=True, # Required for markdown/HTML support in messages - avatar_images=None, # Optional: omit user/assistant icons - rtl=False, - autoscroll=False + render_markdown=True, + avatar_images=None, + rtl=False ) - # ADDED: Use gr.Chatbot to display the scrollable chat window for Response B. - with gr.Column(): - gr.Markdown("**Model B Response:** (You may have encountered this earlier because of the pairwise comparison.)") - chat_b_page1 = gr.Chatbot( + # gr.Markdown("**Model A Reasoning:**") + chat_a_reasoning = gr.Chatbot( value=[], type="messages", - height=400, - label="Model B Response", + height=300, + label="Model A Reasoning", show_copy_button=False, show_label=True, - render_markdown=True, # Required for markdown/HTML support in messages - avatar_images=None, # Optional: omit user/assistant icons - rtl=False, - autoscroll=False - ) - gr.Markdown("

") - gr.Markdown("### For each criterion, select which response did better:") - pairwise_reasons = [] # ADDED: list to store the free-text inputs - pairwise_radios = [] - for crit in criteria_for_comparison: - with gr.Row(): - gr.Markdown(crit['text']) - radio = gr.Radio( - choices=[ - "👈 Model A", # A - "👉 Model B", # B - "🤝 Tie", # tie - "👎 Neither model did well" # neither - ], - label="Which is better?" + render_markdown=True, + avatar_images=None, + rtl=False ) - pairwise_radios.append(radio) - # ADDED: free text under each comparison - text_input = gr.Textbox( - label="Reasons for your selection (optional)") - pairwise_reasons.append(text_input) - - page1_error_box = gr.Markdown("") # ADDED: display validation errors - with gr.Row(): - # back_btn_0 = gr.Button("Back") - next_btn_1 = gr.Button("Submit & Next One (Once submitted, you cannot edit your responses)",) - - with gr.Row(): - # ADDED: Home button on page11 - home_btn_1 = gr.Button( - "Home Page") - - # Page 2: Combined Rating Page for both responses. - with gr.Column(visible=False) as page2: - # Add progress bar for scoring - # Dynamic progress header for scoring - scoring_header = gr.Markdown("## Part 2/2: Rate Model Responses") - # ### EDIT: Show a highlighted prompt as on previous pages. - page2_prompt = gr.HTML() - with gr.Accordion("Click to reveal a reference answer.", open=False, elem_id="answer-reference-btn"): - note2_reference_answer = gr.Markdown( - """ - Warning: This answer has been generated automatically and may be incomplete or one of several correct solutions—please use it for reference only. - """, - elem_classes="reference-box" - ) - page2_reference_answer = gr.Markdown( - """ - **Reference Answer:** - - This is the reference answer content. - """, - elem_classes="reference-box" - ) - # ### EDIT: Display both responses side-by-side using Chatbot windows. - with gr.Row(): + # Model B components with gr.Column(): - gr.Markdown("**Model A Response:** (You may have encountered this earlier because of the pairwise comparison.)") # Already bold label. - chat_a_page2 = gr.Chatbot( + gr.Markdown("**Model B Response:**") + chat_b_answer = gr.Chatbot( value=[], type="messages", - height=400, - label="Model A Response", + height=200, + label="Model B Answer", show_copy_button=False, + show_label=True, render_markdown=True, - autoscroll=False + avatar_images=None, + rtl=False ) - with gr.Column(): - gr.Markdown("**Model B Response:** (You may have encountered this earlier because of the pairwise comparison.)") - chat_b_page2 = gr.Chatbot( + # gr.Markdown("**Model B Reasoning:**") + chat_b_reasoning = gr.Chatbot( value=[], type="messages", - height=400, - label="Model B Response", + height=300, + label="Model B Reasoning", show_copy_button=False, + show_label=True, render_markdown=True, - autoscroll=False + avatar_images=None, + rtl=False ) - gr.Markdown("
") - gr.HTML( - """ -
- For each criterion, rate each model’s response below. (If you have already scored these responses, your previous ratings will be loaded automatically, provided they are consistent with your pairwise comparison choices.) -
- """ - ) - # ### EDIT: For each criterion, create a row with two multiple-choice sets (left: Response A, right: Response B) separated by a border. - ratings_A = [] # to store the radio components for response A - ratings_B = [] # to store the radio components for response B - + comparison_reasons_inputs = [] # ADDED: list to store the free-text inputs + pairwise_inputs = [] + ratings_A_page1 = [] # Store rating components for page 1 + ratings_B_page1 = [] # Store rating components for page 1 + + for i, crit_comp in enumerate(criteria_for_comparison): + # for crit in criteria_for_comparison: + crit_score = criteria[i] # Get the corresponding score criterion + + restrict_fn = make_restrict_function(sorted(crit_score["scores"])) + + # Add bold formatting + gr.Markdown(f"**{crit_comp['label']}**", + elem_classes="criteria-font-large") + radio = gr.Radio( + choices=[ + "Model A is better.", + "Model B is better.", + "Both models are equally good.", + "Neither model did well." + ], + # Remove duplicate label since we have markdown above + label=crit_comp['text'], + elem_classes="criteria-radio-label" # <--- add class here + ) + pairwise_inputs.append(radio) + # ADDED: free text under each comparison - pairwise_results_for_display = [gr.Markdown( - render=False) for _ in range(len_criteria)] - indices_for_change = [] - for i, crit in enumerate(criteria): + # for i, crit in enumerate(criteria): index_component = gr.Number( value=i, visible=False, interactive=False) - indices_for_change.append(index_component) - - with gr.Column(elem_id="centered-column"): - gr.Markdown( - f'
{crit["text"][0]}
') - gr.Markdown( - f'
{crit["text"][1]}
') - pairwise_results_for_display[i].render() + # indices_for_change.append(index_component) + with gr.Row(): with gr.Column(scale=1): - rating_a = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"], - label=f"Score for Response A - {crit['label']}", - interactive=True) + rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"], + label=f"Response A - {crit_score['text']}", + interactive=True, + elem_classes="criteria-radio-label") with gr.Column(scale=1): - rating_b = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"], - label=f"Score for Response B - {crit['label']}", - interactive=True) - with gr.Row(): - clear_btn = gr.Button( - "Clear Selection", size="sm", elem_id="clear_btn") - clear_btn.click(fn=clear_selection, outputs=[ - rating_a, rating_b]) + rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"], + label=f"Response B - {crit_score['text']}", + interactive=True, + elem_classes="criteria-radio-label") + # Add clear button and wire up the restrictions + with gr.Row(): # wire each to re‐restrict the other on change + radio.change( + fn=restrict_fn, + inputs=[radio, rating_a, rating_b], + outputs=[rating_a, rating_b] + ) rating_a.change( - fn=restrict_choices, - inputs=[progress_state, index_component, - rating_a, rating_b], + fn=restrict_fn, + inputs=[radio, rating_a, rating_b], outputs=[rating_a, rating_b] ) rating_b.change( - fn=restrict_choices, - inputs=[progress_state, index_component, - rating_a, rating_b], + fn=restrict_fn, + inputs=[radio, rating_a, rating_b], outputs=[rating_a, rating_b] ) - ratings_A.append(rating_a) - ratings_B.append(rating_b) - with gr.Row(): - # back_btn_2 = gr.Button("Back") - submit_btn = gr.Button( - "Submit & Next One (Once submitted, you cannot edit your responses)", elem_id="submit_btn") - with gr.Row(): - home_btn_2 = gr.Button( - "Home Page") + ratings_A_page1.append(rating_a) + ratings_B_page1.append(rating_b) + + text_input = gr.Textbox( + # Remove label since we have markdown above + placeholder="Comments for your selection (optional)", + show_label=False, + # elem_classes="textbox-bold-label" + ) + comparison_reasons_inputs.append(text_input) - result_text = gr.Textbox(label="Validation Result") + with gr.Row(): + submit_btn_1 = gr.Button( + "Submit Evaluation", variant="primary", elem_id="submit_btn") # Final Page: Thank you message. with gr.Column(visible=False, elem_id="final_page") as final_page: @@ -2244,142 +1842,39 @@ with gr.Blocks(css=centered_col_css) as demo: # --- Define Transitions Between Pages --- - # For the "Participate in Evaluation" button, transition to page0 - participate_eval_btn.click( - fn=go_to_page0_from_minus1, - inputs=None, - outputs=[page_minus1, page0] - ) - # Transition from Page 0 (Welcome) to Page 1. next_btn_0.click( fn=go_to_eval_progress_modal, inputs=[name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id], outputs=[ - page0, page1, page2, page0_error_box, eval_progress_modal, - page1_prompt, page2_prompt, page1_reference_answer, page2_reference_answer, eval_progress_text, + page0, page1, page0_error_box, + page1_prompt, user_info_state, data_subset_state, progress_state, pairwise_state, - chat_a_page1, chat_b_page1, chat_a_page2, chat_b_page2, - *ratings_A, *ratings_B, - *pairwise_results_for_display, - pairwise_header, # Add pairwise progress header - scoring_header # Add scoring progress header + chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, pairwise_header, + *pairwise_inputs, *comparison_reasons_inputs, + *ratings_A_page1, *ratings_B_page1 ], scroll_to_output=True ) - # Home page buttons to simply shown page-1 - home_btn_0.click(lambda: (gr.update(visible=True), gr.update( - visible=False)), None, [page_minus1, page0], scroll_to_output=True) - home_btn_1.click(lambda: (gr.update(visible=True), gr.update( - visible=False)), None, [page_minus1, page1], scroll_to_output=True) - home_btn_2.click(lambda: (gr.update(visible=True), gr.update( - visible=False)), None, [page_minus1, page2], scroll_to_output=True) # Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2). - next_btn_1.click( - fn=submit_pairwise_comparison, + submit_btn_1.click( + fn=submit_pairwise_scoring, inputs=[progress_state, data_subset_state, - user_info_state, *pairwise_radios, *pairwise_reasons], + user_info_state, + *pairwise_inputs, *comparison_reasons_inputs, + *ratings_A_page1, *ratings_B_page1], outputs=[ - page1, page2, - page1_prompt, page2_prompt, - progress_state, - chat_a_page1, chat_b_page1, chat_a_page2, chat_b_page2, - *pairwise_radios, - *pairwise_reasons, - *pairwise_results_for_display, - pairwise_header, # Add pairwise progress header - scoring_header # Add scoring progress header + page0, page1, page0_error_box, + page1_prompt, + user_info_state, data_subset_state, progress_state, pairwise_state, + chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, pairwise_header, + *pairwise_inputs, *comparison_reasons_inputs, + *ratings_A_page1, *ratings_B_page1 ], scroll_to_output=True, ) - submit_btn.click( - fn=submit_pairwise_scoring, - inputs=[progress_state, - data_subset_state, user_info_state, *ratings_A, *ratings_B], - outputs=[ - page1, # gr.update(visible=False) - page2, # gr.update(visible=True) - page1_prompt, # None (page1_prompt) - page2_prompt, # page2_prompt_val - page1_reference_answer, # page1_reference_answer - page2_reference_answer, # page2_reference_answer - user_info_state, - data_subset_state, - progress_state, # progress_state - pairwise_state, - chat_a_page1, # None (page1 chat_a) - chat_b_page1, # None (page1 chat_b) - chat_a_page2, # chat_a_val (不是None!) - chat_b_page2, # chat_b_val (不是None!) - *pairwise_radios, # reset radios - 修复:不应该使用ui_updates - *pairwise_reasons, # reset texts - 修复:不应该使用ui_updates - *ratings_A, *ratings_B, - *pairwise_results_for_display, # pairwise results display - pairwise_header, # Add pairwise progress header - scoring_header # Add scoring progress header - # next_question_modal_visibility # next question modal visibility - ], - scroll_to_output=True - ) - - # --- Calculate progress information --- - def calculate_progress_info(progress_state): - """ - Calculate progress information for both pairwise and scoring phases. - - Returns: - dict: Contains progress information including: - - pairwise_completed: number of completed pairwise comparisons - - pairwise_total: total number of pairwise comparisons needed - - pairwise_remaining: number of remaining pairwise comparisons - - scoring_completed: number of completed scoring pairs - - scoring_total: total number of scoring pairs needed (same as pairwise_total) - - scoring_remaining: number of remaining scoring pairs - - pairwise_progress_text: formatted text for pairwise progress - - scoring_progress_text: formatted text for scoring progress - """ - # Handle case where Gradio State object is passed instead of dictionary - if hasattr(progress_state, 'value'): - progress_state = progress_state.value - - if not progress_state or not isinstance(progress_state, dict) or 'all_pairs' not in progress_state: - return { - 'pairwise_completed': 0, - 'pairwise_total': 0, - 'pairwise_remaining': 0, - 'scoring_completed': 0, - 'scoring_total': 0, - 'scoring_remaining': 0, - 'pairwise_progress_text': "No progress information available", - 'scoring_progress_text': "No progress information available" - } - - # Get basic counts - total_pairs = len(progress_state['all_pairs']) - pairwise_done = len(progress_state.get('pairwise_done', set())) - scoring_done = len(progress_state.get('scoring_done_pairs', set())) - - # Calculate remaining - pairwise_remaining = total_pairs - pairwise_done - scoring_remaining = total_pairs - scoring_done - - # Create progress text - pairwise_progress_text = f"Pairwise Comparison Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)" - scoring_progress_text = f"Scoring Progress: {scoring_done}/{total_pairs} pairs completed ({scoring_remaining} remaining)" - - return { - 'pairwise_completed': pairwise_done, - 'pairwise_total': total_pairs, - 'pairwise_remaining': pairwise_remaining, - 'scoring_completed': scoring_done, - 'scoring_total': total_pairs, - 'scoring_remaining': scoring_remaining, - 'pairwise_progress_text': pairwise_progress_text, - 'scoring_progress_text': scoring_progress_text - } - demo.launch(share=True, allowed_paths=["."])