import gradio as gr
import random
import os
import time
from utils.data_loader import get_random_example
from utils.models import generate_summaries, model_names
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
from utils.vote_logger import save_vote_details
from utils.shared import generation_interrupt # Import from shared module
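# `generation_interrupt` is a cross-thread cancellation flag. A minimal sketch
# of what utils/shared.py provides (an assumption about that module, for context):
#
#     import threading
#     generation_interrupt = threading.Event()
#
# UI callbacks call .set() to ask in-flight inference to stop and .clear()
# before starting new work; worker code polls .is_set() at checkpoints.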
# Feedback options for different voting outcomes
feedback_options = {
"left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
"right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
"tie": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)",
"Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
"neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
"Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
}
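# The keys above mirror the `winner` value produced by select_vote_improved():
# 'left' = Model A preferred, 'right' = Model B preferred, 'tie', or 'neither'.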
def load_context(set_interrupt=False):
"""
Load a new question and context
Parameters:
- set_interrupt: If True, will interrupt any ongoing inference before loading
"""
if set_interrupt:
# Interrupt any ongoing inference
generation_interrupt.set()
time.sleep(0.2) # Short delay to allow threads to detect interrupt
# Always clear the flag before starting new work
generation_interrupt.clear()
example = get_random_example()
# Format the context description
context_desc = example.get('processed_context_desc', '')
if context_desc:
context_desc = f"
The question and context are about: {context_desc}
"
show_full = False
context_html = get_context_html(example, show_full=show_full)
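    # Return order must match the `outputs=` list wired to load_context in
    # demo.load() and the per-button event chains below.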
return [
example,
gr.update(value=example['question']),
gr.update(value=context_desc, visible=bool(context_desc)),
gr.update(value=context_html),
gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
show_full
]
def load_leaderboard():
"""Loads and displays the leaderboard data"""
results = load_leaderboard_data()
leaderboard_html = generate_leaderboard_html(results)
return leaderboard_html
def generate_model_summaries(example):
"""Run model inference"""
result = {
"model_a": "",
"model_b": "",
"summary_a": "",
"summary_b": "",
"completed": False
}
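    # Check the interrupt flag both before and after the (potentially slow)
    # generate_summaries call, so a user requesting a new example mid-inference
    # never sees stale results.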
if generation_interrupt.is_set():
return result
try:
m_a_name, m_b_name = random.sample(model_names, 2)
# Track the partial completion state
result["model_a"] = m_a_name
result["model_b"] = m_b_name
s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
if not generation_interrupt.is_set():
result["summary_a"] = s_a
result["summary_b"] = s_b
result["completed"] = bool(s_a and s_b) # Only mark complete if both have content
except Exception as e:
print(f"Error in generation: {e}")
return result
def process_generation_result(result):
"""Process the results from the generation function"""
if not result["completed"] or not result["summary_a"] or not result["summary_b"]:
# Either generation was interrupted or both summaries aren't ready
return [
result.get("model_a", ""),
result.get("model_b", ""),
result.get("summary_a", ""),
result.get("summary_b", ""),
None, [], False, load_leaderboard_data(),
gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")),
gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")),
gr.update(interactive=False, elem_classes=["vote-button"]), # Explicitly disable
gr.update(interactive=False, elem_classes=["vote-button"]),
gr.update(interactive=False, elem_classes=["vote-button"]),
gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]),
gr.update(choices=[], value=[], interactive=False, visible=False),
gr.update(visible=False),
gr.update(interactive=False, visible=True),
gr.update(visible=False),
gr.update(interactive=True),
gr.update(elem_classes=[])
]
# Only enable voting when both summaries are complete and non-empty
buttons_interactive = bool(result["summary_a"] and result["summary_b"])
# Generation completed successfully
agg_results = load_leaderboard_data()
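    # Return order must match the `outputs=` wiring for process_example in the
    # event handlers below (names, summaries, vote state, then UI updates).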
return [
result["model_a"], result["model_b"],
result["summary_a"], result["summary_b"],
None, [], False, agg_results,
gr.update(value=result["summary_a"]),
gr.update(value=result["summary_b"]),
gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]),
gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]),
gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]),
gr.update(interactive=buttons_interactive, elem_classes=["vote-button", "vote-button-neither"]),
gr.update(choices=[], value=[], interactive=False, visible=False),
gr.update(visible=False),
gr.update(interactive=False, visible=True),
gr.update(visible=False),
gr.update(interactive=True),
gr.update(elem_classes=[])
]
def process_example(example):
    """Generate summaries for an example and map the result to UI updates"""
result = generate_model_summaries(example)
return process_generation_result(result)
def select_vote_improved(winner_choice):
"""Updates UI based on vote selection"""
feedback_choices = feedback_options.get(winner_choice, [])
btn_a_classes = ["vote-button"]
btn_b_classes = ["vote-button"]
btn_tie_classes = ["vote-button"]
btn_neither_classes = ["vote-button", "vote-button-neither"]
if winner_choice == 'left':
btn_a_classes.append("selected")
elif winner_choice == 'right':
btn_b_classes.append("selected")
elif winner_choice == 'tie':
btn_tie_classes.append("selected")
elif winner_choice == 'neither':
btn_neither_classes.append("selected")
return [
winner_choice,
gr.update(choices=feedback_choices, value=[], interactive=True, visible=True),
gr.update(visible=True),
gr.update(interactive=True),
gr.update(elem_classes=btn_a_classes),
gr.update(elem_classes=btn_b_classes),
gr.update(elem_classes=btn_tie_classes),
gr.update(elem_classes=btn_neither_classes)
]
def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
"""Handle vote submission - logs details and updates leaderboard"""
if winner is None:
print("Warning: Submit called without a winner selected.")
        return {}  # an empty dict leaves every output component unchanged in Gradio
# Save detailed vote information
save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
# Update Elo ratings and get UI updates
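    # submit_vote_with_elo (utils/leaderboard.py) applies the standard Elo
    # update; a sketch of the math (K and exact details are assumptions here):
    #
    #     expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    #     rating_a  += K * (score_a - expected_a)  # score_a: 1 win, 0.5 tie, 0 loss
    #
    # e.g. with K=32, a 1500-rated model beating a 1600-rated one gains about
    # 32 * (1 - 0.36) ≈ 20.5 points, versus 16 for beating an equal opponent.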
return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
def show_loading_state():
"""Show loading state while fetching new content"""
return [
gr.update(value="Loading new question and summaries...", interactive=False),
gr.update(value="Loading new question and summaries...", interactive=False),
gr.update(interactive=False), # For vote_button_a
gr.update(interactive=False), # For vote_button_b
gr.update(interactive=False), # For vote_button_tie
gr.update(interactive=False) # For vote_button_neither
]
def handle_new_example_click():
"""Handle clicking 'Get new example' button"""
    # set_interrupt=True tells load_context to stop any in-flight generation
    # before fetching a new example. load_context returns six values; only the
    # first (the example dict) is needed, since this handler's sole output is
    # current_example.
    return load_context(set_interrupt=True)[0]
def update_ui_for_new_context(example):
"""Update UI with new context information"""
# Format the context description
context_desc = example.get('processed_context_desc', '')
if context_desc:
context_desc = f"The question and context are about: {context_desc}
"
return [
gr.update(value=example['question']),
gr.update(value=context_desc, visible=bool(context_desc)),
gr.update(value=get_context_html(example, False)),
gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
False
]
# Resource cleanup function for unload event
def cleanup_on_disconnect():
"""Clean up resources when browser disconnects"""
print(f"Browser disconnected. Cleaning up resources...")
generation_interrupt.set()
# No need for time.sleep here as this is just setting the flag
# Threads will detect it on their next check
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default(
primary_hue=gr.themes.colors.orange,
secondary_hue=gr.themes.colors.slate
)) as demo:
    # Load the CSS file and inject it into the page
    css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
    with open(css_path, 'r') as f:
        css_content = f.read()
    gr.HTML(f"<style>{css_content}</style>")
    # Add JavaScript to handle browser unload events. A minimal client-side
    # sketch (assumed; the actual cleanup is handled server-side by
    # demo.unload(cleanup_on_disconnect) at the bottom of this file):
    unload_js = """
    <script>
        window.addEventListener('beforeunload', () => {
            // Client-side teardown hook; inference interruption is handled by
            // the server-side unload callback.
        });
    </script>
    """
    gr.HTML(unload_js)
# State Variables
current_example = gr.State({})
model_a_name = gr.State("")
model_b_name = gr.State("")
summary_a_text = gr.State("")
summary_b_text = gr.State("")
selected_winner = gr.State(None)
feedback_list = gr.State([])
show_results_state = gr.State(False)
results_agg = gr.State(load_leaderboard_data())
show_full_context = gr.State(False)
# Create Tabs
with gr.Tabs() as tabs:
# Main Arena Tab
with gr.TabItem("Arena", id="arena-tab"):
gr.Markdown("# RAG SLM Summarizer/Generator Arena")
gr.Markdown("""
1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information!)\n
2️⃣ Compare answers generated by two different models side-by-side\n
3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""")
gr.HTML("
")
# Main container
with gr.Column(elem_id="main-interface-area") as main_interface_area:
# Query section
with gr.Row(elem_id="query-title-row"):
gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading")
with gr.Row(elem_id="query-container"):
with gr.Row(elem_classes="query-box-row"):
query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
# Context description and display
context_description = gr.Markdown("", elem_classes="context-description")
gr.HTML("
")
with gr.Row(elem_id="context-header-row"):
gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title")
context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
context_display = gr.HTML(value="Loading context...", label="Context Chunks")
gr.Markdown("---")
gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading")
# Model summaries - Add ID for JavaScript to target and disable autoscroll
with gr.Row(elem_id="summary-containers"):
with gr.Column(scale=1):
with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
summary_a_display = gr.Textbox(
label="Model A",
lines=10,
interactive=False,
show_copy_button=True,
autoscroll=False, # Disable auto-scrolling
elem_id="summary-a-display"
)
with gr.Column(scale=1):
with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
summary_b_display = gr.Textbox(
label="Model B",
lines=10,
interactive=False,
show_copy_button=True,
autoscroll=False, # Disable auto-scrolling
elem_id="summary-b-display"
)
gr.HTML("
")
# Voting section
gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
with gr.Row():
vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
# Feedback and Submit sections
with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
# Results area
with gr.Column(visible=False) as results_reveal_area:
gr.Markdown("---")
gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
# Model reveal section
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Model A was:", elem_classes="section-heading")
model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
with gr.Column(scale=1):
gr.Markdown("### Model B was:", elem_classes="section-heading")
model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")
gr.HTML("
")
# Try another button
with gr.Row(elem_classes=["control-buttons"]):
try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
# Leaderboard Tab
with gr.TabItem("Leaderboard", id="leaderboard-tab"):
gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
gr.Markdown("View performance statistics for all models ranked by Elo rating.")
with gr.Group(elem_id="leaderboard-info"):
gr.Markdown("""### About Elo Ratings
The Elo rating system provides a more accurate ranking than simple win rates:
- All models start at 1500 points
- Points are exchanged after each comparison based on the expected outcome
- Beating a stronger model earns more points than beating a weaker one
- The ± value shows the statistical confidence interval (95%)
""")
results_table_display = gr.HTML(label="Model Performance")
# Event handling
# Toggle context display
context_toggle_btn.click(
fn=toggle_context_display,
inputs=[current_example, show_full_context],
outputs=[show_full_context, context_display, context_toggle_btn]
)
# Initial loading - context first, then summaries
# Uses load_context without interruption since it's the first load
demo.load(
fn=load_context, # Default is set_interrupt=False
inputs=[],
outputs=[current_example, query_display, context_description, context_display,
context_toggle_btn, show_full_context]
).then(
fn=process_example,
inputs=[current_example],
outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
selected_winner, feedback_list, show_results_state, results_agg,
summary_a_display, summary_b_display, vote_button_a, vote_button_b,
vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
submit_button, results_reveal_area, random_question_btn, main_interface_area]
)
# Load leaderboard content on app start
demo.load(
fn=load_leaderboard,
inputs=[],
outputs=[results_table_display]
)
# Use a single event chain for each button, structured to update UI first, then run inference
for btn in [random_question_btn, try_another_btn]:
btn.click(
# Step 1: Show loading state immediately
fn=show_loading_state,
inputs=[],
outputs=[summary_a_display, summary_b_display, vote_button_a,
vote_button_b, vote_button_tie, vote_button_neither]
).then(
# Step 2: Get new example
fn=handle_new_example_click,
inputs=[],
outputs=[current_example]
).then(
# Step 3: Update context UI immediately
fn=update_ui_for_new_context,
inputs=[current_example],
outputs=[query_display, context_description, context_display,
context_toggle_btn, show_full_context]
).then(
# Step 4: Then process example for model outputs
fn=process_example,
inputs=[current_example],
outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
selected_winner, feedback_list, show_results_state, results_agg,
summary_a_display, summary_b_display, vote_button_a, vote_button_b,
vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
submit_button, results_reveal_area, random_question_btn, main_interface_area]
)
# Vote button handlers
for btn, choice in zip(
[vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
['left', 'right', 'tie', 'neither']
):
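        # Bind `choice` via a default argument: Python closures are late-binding,
        # so a bare `lambda: select_vote_improved(choice)` would make all four
        # buttons report the final loop value ('neither').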
btn.click(
fn=lambda choice=choice: select_vote_improved(choice),
inputs=None,
outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
)
# Update feedback when checkboxes change
feedback_checkboxes.change(
fn=update_feedback,
inputs=[feedback_checkboxes],
outputs=[feedback_list]
)
# Process vote submission and reveal results
submit_button.click(
fn=handle_vote_submission,
inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
vote_button_tie, vote_button_neither, feedback_checkboxes,
feedback_section, submit_button, results_reveal_area,
random_question_btn, results_table_display, main_interface_area,
context_toggle_btn, model_a_reveal, model_b_reveal]
)
    # Refresh the leaderboard on tab selection (fires for any tab switch, which
    # is cheap and keeps the display current)
tabs.select(
fn=load_leaderboard,
inputs=[],
outputs=[results_table_display],
api_name="refresh_leaderboard"
)
# Register unload event for browser disconnections
demo.unload(cleanup_on_disconnect)
if __name__ == "__main__":
demo.launch(debug=True)