import gradio as gr import random import pandas as pd import os import threading import time from utils.data_loader import get_random_example from utils.models import generate_summaries, model_names from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html from utils.vote_logger import save_vote_details from utils.shared import generation_interrupt # Import from shared module # Feedback options for different voting outcomes feedback_options = { "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"], "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"], "tie": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)", "Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"], "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)", "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"] } def load_context(set_interrupt=False): """ Load a new question and context Parameters: - set_interrupt: If True, will interrupt any ongoing inference before loading """ if set_interrupt: # Interrupt any ongoing inference generation_interrupt.set() time.sleep(0.2) # Short delay to allow threads to detect interrupt # Always clear the flag before starting new work generation_interrupt.clear() example = get_random_example() # Format the context description context_desc = example.get('processed_context_desc', '') if context_desc: context_desc = f"
The question and context are about: {context_desc}
" show_full = False context_html = get_context_html(example, show_full=show_full) return [ example, gr.update(value=example['question']), gr.update(value=context_desc, visible=bool(context_desc)), gr.update(value=context_html), gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]), show_full ] def load_leaderboard(): """Loads and displays the leaderboard data""" results = load_leaderboard_data() leaderboard_html = generate_leaderboard_html(results) return leaderboard_html def generate_model_summaries(example): """Run model inference""" result = { "model_a": "", "model_b": "", "summary_a": "", "summary_b": "", "completed": False } if generation_interrupt.is_set(): return result try: m_a_name, m_b_name = random.sample(model_names, 2) # Track the partial completion state result["model_a"] = m_a_name result["model_b"] = m_b_name s_a, s_b = generate_summaries(example, m_a_name, m_b_name) if not generation_interrupt.is_set(): result["summary_a"] = s_a result["summary_b"] = s_b result["completed"] = bool(s_a and s_b) # Only mark complete if both have content except Exception as e: print(f"Error in generation: {e}") return result def process_generation_result(result): """Process the results from the generation function""" if not result["completed"] or not result["summary_a"] or not result["summary_b"]: # Either generation was interrupted or both summaries aren't ready return [ result.get("model_a", ""), result.get("model_b", ""), result.get("summary_a", ""), result.get("summary_b", ""), None, [], False, load_leaderboard_data(), gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")), gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")), gr.update(interactive=False, elem_classes=["vote-button"]), # Explicitly disable gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]), gr.update(choices=[], value=[], interactive=False, visible=False), gr.update(visible=False), gr.update(interactive=False, visible=True), gr.update(visible=False), gr.update(interactive=True), gr.update(elem_classes=[]) ] # Only enable voting when both summaries are complete and non-empty buttons_interactive = bool(result["summary_a"] and result["summary_b"]) # Generation completed successfully agg_results = load_leaderboard_data() return [ result["model_a"], result["model_b"], result["summary_a"], result["summary_b"], None, [], False, agg_results, gr.update(value=result["summary_a"]), gr.update(value=result["summary_b"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button", "vote-button-neither"]), gr.update(choices=[], value=[], interactive=False, visible=False), gr.update(visible=False), gr.update(interactive=False, visible=True), gr.update(visible=False), gr.update(interactive=True), gr.update(elem_classes=[]) ] def process_example(example): result = generate_model_summaries(example) return process_generation_result(result) def select_vote_improved(winner_choice): """Updates UI based on vote selection""" feedback_choices = feedback_options.get(winner_choice, []) btn_a_classes = ["vote-button"] btn_b_classes = ["vote-button"] btn_tie_classes = ["vote-button"] btn_neither_classes = ["vote-button", "vote-button-neither"] if winner_choice == 'left': btn_a_classes.append("selected") elif winner_choice == 'right': btn_b_classes.append("selected") elif winner_choice == 'tie': btn_tie_classes.append("selected") elif winner_choice == 'neither': btn_neither_classes.append("selected") return [ winner_choice, gr.update(choices=feedback_choices, value=[], interactive=True, visible=True), gr.update(visible=True), gr.update(interactive=True), gr.update(elem_classes=btn_a_classes), gr.update(elem_classes=btn_b_classes), gr.update(elem_classes=btn_tie_classes), gr.update(elem_classes=btn_neither_classes) ] def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results): """Handle vote submission - logs details and updates leaderboard""" if winner is None: print("Warning: Submit called without a winner selected.") return {} # Save detailed vote information save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b) # Update Elo ratings and get UI updates return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results) def show_loading_state(): """Show loading state while fetching new content""" return [ gr.update(value="Loading new question and summaries...", interactive=False), gr.update(value="Loading new question and summaries...", interactive=False), gr.update(interactive=False), # For vote_button_a gr.update(interactive=False), # For vote_button_b gr.update(interactive=False), # For vote_button_tie gr.update(interactive=False) # For vote_button_neither ] def handle_new_example_click(): """Handle clicking 'Get new example' button""" # Use the centralized approach - set_interrupt=True tells load_context to handle interruption return load_context(set_interrupt=True)[0] def update_ui_for_new_context(example): """Update UI with new context information""" # Format the context description context_desc = example.get('processed_context_desc', '') if context_desc: context_desc = f"
The question and context are about: {context_desc}
" return [ gr.update(value=example['question']), gr.update(value=context_desc, visible=bool(context_desc)), gr.update(value=get_context_html(example, False)), gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]), False ] # Resource cleanup function for unload event def cleanup_on_disconnect(): """Clean up resources when browser disconnects""" print(f"Browser disconnected. Cleaning up resources...") generation_interrupt.set() # No need for time.sleep here as this is just setting the flag # Threads will detect it on their next check # Create Gradio interface with gr.Blocks(theme=gr.themes.Default( primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.slate )) as demo: # Load CSS css_path = os.path.join(os.getcwd(), 'static', 'styles.css') # Load the CSS file with open(css_path, 'r') as f: css_content = f.read() # Create HTML components with CSS gr.HTML(f"") # Add JavaScript to handle browser unload events unload_js = """ """ gr.HTML(unload_js) # State Variables current_example = gr.State({}) model_a_name = gr.State("") model_b_name = gr.State("") summary_a_text = gr.State("") summary_b_text = gr.State("") selected_winner = gr.State(None) feedback_list = gr.State([]) show_results_state = gr.State(False) results_agg = gr.State(load_leaderboard_data()) show_full_context = gr.State(False) # Create Tabs with gr.Tabs() as tabs: # Main Arena Tab with gr.TabItem("Arena", id="arena-tab"): gr.Markdown("# RAG SLM Summarizer/Generator Arena") gr.Markdown(""" 1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information! )\n 2️⃣ Compare answers generated by two different models side-by-side\n 3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""") gr.HTML("
") # Main container with gr.Column(elem_id="main-interface-area") as main_interface_area: # Query section with gr.Row(elem_id="query-title-row"): gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading") with gr.Row(elem_id="query-container"): with gr.Row(elem_classes="query-box-row"): query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section") random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button") # Context description and display context_description = gr.Markdown("", elem_classes="context-description") gr.HTML("
") with gr.Row(elem_id="context-header-row"): gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title") context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"]) context_display = gr.HTML(value="Loading context...", label="Context Chunks") gr.Markdown("---") gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading") # Model summaries - Add ID for JavaScript to target and disable autoscroll with gr.Row(elem_id="summary-containers"): with gr.Column(scale=1): with gr.Group(elem_classes=["summary-card", "summary-card-a"]): summary_a_display = gr.Textbox( label="Model A", lines=10, interactive=False, show_copy_button=True, autoscroll=False, # Disable auto-scrolling elem_id="summary-a-display" ) with gr.Column(scale=1): with gr.Group(elem_classes=["summary-card", "summary-card-b"]): summary_b_display = gr.Textbox( label="Model B", lines=10, interactive=False, show_copy_button=True, autoscroll=False, # Disable auto-scrolling elem_id="summary-b-display" ) gr.HTML("
") # Voting section gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading") with gr.Row(): vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False) vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False) vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False) vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False) # Feedback and Submit sections with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section: feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False) submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button") # Results area with gr.Column(visible=False) as results_reveal_area: gr.Markdown("---") gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading") # Model reveal section with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Model A was:", elem_classes="section-heading") model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal") with gr.Column(scale=1): gr.Markdown("### Model B was:", elem_classes="section-heading") model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal") gr.HTML("
") # Try another button with gr.Row(elem_classes=["control-buttons"]): try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn") # Leaderboard Tab with gr.TabItem("Leaderboard", id="leaderboard-tab"): gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title") gr.Markdown("View performance statistics for all models ranked by Elo rating.") with gr.Group(elem_id="leaderboard-info"): gr.Markdown("""### About Elo Ratings The Elo rating system provides a more accurate ranking than simple win rates: - All models start at 1500 points - Points are exchanged after each comparison based on the expected outcome - Beating a stronger model earns more points than beating a weaker one - The ± value shows the statistical confidence interval (95%) """) results_table_display = gr.HTML(label="Model Performance") # Event handling # Toggle context display context_toggle_btn.click( fn=toggle_context_display, inputs=[current_example, show_full_context], outputs=[show_full_context, context_display, context_toggle_btn] ) # Initial loading - context first, then summaries # Uses load_context without interruption since it's the first load demo.load( fn=load_context, # Default is set_interrupt=False inputs=[], outputs=[current_example, query_display, context_description, context_display, context_toggle_btn, show_full_context] ).then( fn=process_example, inputs=[current_example], outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text, selected_winner, feedback_list, show_results_state, results_agg, summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, main_interface_area] ) # Load leaderboard content on app start demo.load( fn=load_leaderboard, inputs=[], outputs=[results_table_display] ) # Use a single event chain for each button, structured to update UI first, then run inference for btn in [random_question_btn, try_another_btn]: btn.click( # Step 1: Show loading state immediately fn=show_loading_state, inputs=[], outputs=[summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither] ).then( # Step 2: Get new example fn=handle_new_example_click, inputs=[], outputs=[current_example] ).then( # Step 3: Update context UI immediately fn=update_ui_for_new_context, inputs=[current_example], outputs=[query_display, context_description, context_display, context_toggle_btn, show_full_context] ).then( # Step 4: Then process example for model outputs fn=process_example, inputs=[current_example], outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text, selected_winner, feedback_list, show_results_state, results_agg, summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, main_interface_area] ) # Vote button handlers for btn, choice in zip( [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither], ['left', 'right', 'tie', 'neither'] ): btn.click( fn=lambda choice=choice: select_vote_improved(choice), inputs=None, outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither] ) # Update feedback when checkboxes change feedback_checkboxes.change( fn=update_feedback, inputs=[feedback_checkboxes], outputs=[feedback_list] ) # Process vote submission and reveal results submit_button.click( fn=handle_vote_submission, inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg], outputs=[show_results_state, results_agg, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, results_table_display, main_interface_area, context_toggle_btn, model_a_reveal, model_b_reveal] ) # Refresh leaderboard when switching to the leaderboard tab tabs.select( fn=load_leaderboard, inputs=[], outputs=[results_table_display], api_name="refresh_leaderboard" ) # Register unload event for browser disconnections demo.unload(cleanup_on_disconnect) if __name__ == "__main__": demo.launch(debug=True)