SLM-RAG-Arena

Running on Zero

App Files Files Community

aizip-dev committed 8 days ago

Commit

7198068

verified ·

1 Parent(s): 31bbdd9

update instruction

Browse files

Files changed (1) hide show

app.py +36 -89

app.py CHANGED Viewed

@@ -9,35 +9,25 @@ from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
 from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
 from utils.vote_logger import save_vote_details
-from utils.shared import generation_interrupt  # Import from shared module
-# Feedback options for different voting outcomes
 feedback_options = {
     "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
     "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
-    "tie": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)",
-           "Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
     "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
                "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 def load_context(set_interrupt=False):
-    """
-    Load a new question and context
-    Parameters:
-    - set_interrupt: If True, will interrupt any ongoing inference before loading
-    """
     if set_interrupt:
-        # Interrupt any ongoing inference
         generation_interrupt.set()
-        time.sleep(0.2)  # Short delay to allow threads to detect interrupt
-    # Always clear the flag before starting new work
     generation_interrupt.clear()
     example = get_random_example()
-    # Format the context description
     context_desc = example.get('processed_context_desc', '')
     if context_desc:
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
@@ -55,14 +45,11 @@ def load_context(set_interrupt=False):
     ]
 def load_leaderboard():
-    """Loads and displays the leaderboard data"""
     results = load_leaderboard_data()
     leaderboard_html = generate_leaderboard_html(results)
     return leaderboard_html
 def generate_model_summaries(example):
-    """Run model inference"""
     result = {
         "model_a": "",
         "model_b": "",
@@ -77,7 +64,6 @@ def generate_model_summaries(example):
     try:
         m_a_name, m_b_name = random.sample(model_names, 2)
-        # Track the partial completion state
         result["model_a"] = m_a_name
         result["model_b"] = m_b_name
@@ -86,16 +72,14 @@ def generate_model_summaries(example):
         if not generation_interrupt.is_set():
             result["summary_a"] = s_a
             result["summary_b"] = s_b
-            result["completed"] = bool(s_a and s_b)  # Only mark complete if both have content
     except Exception as e:
         print(f"Error in generation: {e}")
     return result
 def process_generation_result(result):
-    """Process the results from the generation function"""
     if not result["completed"] or not result["summary_a"] or not result["summary_b"]:
-        # Either generation was interrupted or both summaries aren't ready
         return [
             result.get("model_a", ""),
             result.get("model_b", ""),
@@ -104,7 +88,7 @@ def process_generation_result(result):
             None, [], False, load_leaderboard_data(),
             gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")),
             gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")),
-            gr.update(interactive=False, elem_classes=["vote-button"]),  # Explicitly disable
             gr.update(interactive=False, elem_classes=["vote-button"]),
             gr.update(interactive=False, elem_classes=["vote-button"]),
             gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]),
@@ -116,10 +100,8 @@ def process_generation_result(result):
             gr.update(elem_classes=[])
         ]
-    # Only enable voting when both summaries are complete and non-empty
     buttons_interactive = bool(result["summary_a"] and result["summary_b"])
-    # Generation completed successfully
     agg_results = load_leaderboard_data()
     return [
         result["model_a"], result["model_b"],
@@ -144,7 +126,6 @@ def process_example(example):
     return process_generation_result(result)
 def select_vote_improved(winner_choice):
-    """Updates UI based on vote selection"""
     feedback_choices = feedback_options.get(winner_choice, [])
     btn_a_classes = ["vote-button"]
@@ -173,36 +154,31 @@ def select_vote_improved(winner_choice):
     ]
 def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
-    """Handle vote submission - logs details and updates leaderboard"""
     if winner is None:
         print("Warning: Submit called without a winner selected.")
         return {}
-    # Save detailed vote information
     save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
-    # Update Elo ratings and get UI updates
     return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
 def show_loading_state():
-    """Show loading state while fetching new content"""
     return [
         gr.update(value="Loading new question and summaries...", interactive=False),
         gr.update(value="Loading new question and summaries...", interactive=False),
-        gr.update(interactive=False),  # For vote_button_a
-        gr.update(interactive=False),  # For vote_button_b
-        gr.update(interactive=False),  # For vote_button_tie
-        gr.update(interactive=False)   # For vote_button_neither
     ]
 def handle_new_example_click():
-    """Handle clicking 'Get new example' button"""
-    # Use the centralized approach - set_interrupt=True tells load_context to handle interruption
     return load_context(set_interrupt=True)[0]
 def update_ui_for_new_context(example):
-    """Update UI with new context information"""
-    # Format the context description
     context_desc = example.get('processed_context_desc', '')
     if context_desc:
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
@@ -215,42 +191,30 @@ def update_ui_for_new_context(example):
         False
     ]
-# Resource cleanup function for unload event
 def cleanup_on_disconnect():
-    """Clean up resources when browser disconnects"""
     print(f"Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()
-    # No need for time.sleep here as this is just setting the flag
-    # Threads will detect it on their next check
-# Create Gradio interface
 with gr.Blocks(theme=gr.themes.Default(
     primary_hue=gr.themes.colors.orange,
     secondary_hue=gr.themes.colors.slate
 )) as demo:
-    # Load CSS
     css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
-    # Load the CSS file
     with open(css_path, 'r') as f:
         css_content = f.read()
-    # Create HTML components with CSS
     gr.HTML(f"<style>{css_content}</style>")
-    # Add JavaScript to handle browser unload events
     unload_js = """
     <script>
-    // This runs when the page is about to be closed or refreshed
     window.addEventListener('beforeunload', function(e) {
-        // Send a synchronous request to the server
         navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
     });
     </script>
     """
     gr.HTML(unload_js)
-    # State Variables
     current_example = gr.State({})
     model_a_name = gr.State("")
     model_b_name = gr.State("")
@@ -262,44 +226,45 @@ with gr.Blocks(theme=gr.themes.Default(
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
-    # Create Tabs
     with gr.Tabs() as tabs:
-        # Main Arena Tab
         with gr.TabItem("Arena", id="arena-tab"):
-            gr.Markdown("# RAG SLM Summarizer/Generator Arena")
             gr.Markdown("""
-1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information! )\n
-2️⃣ Compare answers generated by two different models side-by-side\n
-3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""")
             gr.HTML("<hr>")
-            # Main container
             with gr.Column(elem_id="main-interface-area") as main_interface_area:
-                # Query section
                 with gr.Row(elem_id="query-title-row"):
-                    gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading")
                 with gr.Row(elem_id="query-container"):
                     with gr.Row(elem_classes="query-box-row"):
                         query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
                     random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
-                # Context description and display
                 context_description = gr.Markdown("", elem_classes="context-description")
                 gr.HTML("<hr>")
                 with gr.Row(elem_id="context-header-row"):
-                    gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title")
                     context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
                 context_display = gr.HTML(value="Loading context...", label="Context Chunks")
                 gr.Markdown("---")
-                gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading")
-                # Model summaries - Add ID for JavaScript to target and disable autoscroll
                 with gr.Row(elem_id="summary-containers"):
                     with gr.Column(scale=1):
                         with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
@@ -308,7 +273,7 @@ with gr.Blocks(theme=gr.themes.Default(
                                 lines=10,
                                 interactive=False,
                                 show_copy_button=True,
-                                autoscroll=False,  # Disable auto-scrolling
                                 elem_id="summary-a-display"
                             )
                     with gr.Column(scale=1):
@@ -318,13 +283,12 @@ with gr.Blocks(theme=gr.themes.Default(
                                 lines=10,
                                 interactive=False,
                                 show_copy_button=True,
-                                autoscroll=False,  # Disable auto-scrolling
                                 elem_id="summary-b-display"
                             )
                 gr.HTML("<hr>")
-                # Voting section
                 gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
                 with gr.Row():
                     vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
@@ -332,17 +296,14 @@ with gr.Blocks(theme=gr.themes.Default(
                     vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
                     vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
-                # Feedback and Submit sections
                 with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                     feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
                 submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
-                # Results area
                 with gr.Column(visible=False) as results_reveal_area:
                     gr.Markdown("---")
                     gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
-                    # Model reveal section
                     with gr.Row():
                         with gr.Column(scale=1):
                             gr.Markdown("### Model A was:", elem_classes="section-heading")
@@ -353,11 +314,9 @@ with gr.Blocks(theme=gr.themes.Default(
                     gr.HTML("<hr>")
-                    # Try another button
                     with gr.Row(elem_classes=["control-buttons"]):
                         try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
-        # Leaderboard Tab
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
             gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
             gr.Markdown("View performance statistics for all models ranked by Elo rating.")
@@ -374,18 +333,14 @@ The Elo rating system provides a more accurate ranking than simple win rates:
             results_table_display = gr.HTML(label="Model Performance")
-    # Event handling
-    # Toggle context display
     context_toggle_btn.click(
         fn=toggle_context_display,
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
-    # Initial loading - context first, then summaries
-    # Uses load_context without interruption since it's the first load
     demo.load(
-        fn=load_context,  # Default is set_interrupt=False
         inputs=[],
         outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
@@ -399,34 +354,31 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )
-    # Load leaderboard content on app start
     demo.load(
         fn=load_leaderboard,
         inputs=[],
         outputs=[results_table_display]
     )
-    # Use a single event chain for each button, structured to update UI first, then run inference
     for btn in [random_question_btn, try_another_btn]:
         btn.click(
-            # Step 1: Show loading state immediately
             fn=show_loading_state,
             inputs=[],
-            outputs=[summary_a_display, summary_b_display, vote_button_a,
-                    vote_button_b, vote_button_tie, vote_button_neither]
         ).then(
-            # Step 2: Get new example
             fn=handle_new_example_click,
             inputs=[],
             outputs=[current_example]
         ).then(
-            # Step 3: Update context UI immediately
             fn=update_ui_for_new_context,
             inputs=[current_example],
             outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
         ).then(
-            # Step 4: Then process example for model outputs
             fn=process_example,
             inputs=[current_example],
             outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
@@ -436,7 +388,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
         )
-    # Vote button handlers
     for btn, choice in zip(
         [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
         ['left', 'right', 'tie', 'neither']
@@ -448,14 +399,12 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
         )
-    # Update feedback when checkboxes change
     feedback_checkboxes.change(
         fn=update_feedback,
         inputs=[feedback_checkboxes],
         outputs=[feedback_list]
     )
-    # Process vote submission and reveal results
     submit_button.click(
         fn=handle_vote_submission,
         inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
@@ -466,7 +415,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                 context_toggle_btn, model_a_reveal, model_b_reveal]
     )
-    # Refresh leaderboard when switching to the leaderboard tab
     tabs.select(
         fn=load_leaderboard,
         inputs=[],
@@ -474,7 +422,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         api_name="refresh_leaderboard"
     )
-    # Register unload event for browser disconnections
     demo.unload(cleanup_on_disconnect)
 if __name__ == "__main__":

 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
 from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
 from utils.vote_logger import save_vote_details
+from utils.shared import generation_interrupt
 feedback_options = {
     "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
     "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
+    "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)",
+           "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Correct refusal (if applicable)"],
     "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
                "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 def load_context(set_interrupt=False):
     if set_interrupt:
         generation_interrupt.set()
+        time.sleep(0.2)
     generation_interrupt.clear()
     example = get_random_example()
     context_desc = example.get('processed_context_desc', '')
     if context_desc:
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
     ]
 def load_leaderboard():
     """Build the leaderboard HTML from the current leaderboard data.

     The data returned by ``load_leaderboard_data`` is handed straight to
     ``generate_leaderboard_html`` and the resulting markup is returned.
     """
     return generate_leaderboard_html(load_leaderboard_data())
 def generate_model_summaries(example):
     result = {
         "model_a": "",
         "model_b": "",
     try:
         m_a_name, m_b_name = random.sample(model_names, 2)
         result["model_a"] = m_a_name
         result["model_b"] = m_b_name
         if not generation_interrupt.is_set():
             result["summary_a"] = s_a
             result["summary_b"] = s_b
+            result["completed"] = bool(s_a and s_b)
     except Exception as e:
         print(f"Error in generation: {e}")
     return result
 def process_generation_result(result):
     if not result["completed"] or not result["summary_a"] or not result["summary_b"]:
         return [
             result.get("model_a", ""),
             result.get("model_b", ""),
             None, [], False, load_leaderboard_data(),
             gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")),
             gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")),
+            gr.update(interactive=False, elem_classes=["vote-button"]),
             gr.update(interactive=False, elem_classes=["vote-button"]),
             gr.update(interactive=False, elem_classes=["vote-button"]),
             gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]),
             gr.update(elem_classes=[])
         ]
     buttons_interactive = bool(result["summary_a"] and result["summary_b"])
     agg_results = load_leaderboard_data()
     return [
         result["model_a"], result["model_b"],
     return process_generation_result(result)
 def select_vote_improved(winner_choice):
     feedback_choices = feedback_options.get(winner_choice, [])
     btn_a_classes = ["vote-button"]
     ]
 def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
     """Record a submitted vote and return the rating update.

     When ``winner`` is ``None`` a warning is printed and an empty dict is
     returned without recording anything. Otherwise the vote details are
     passed to ``save_vote_details`` and the value returned by
     ``submit_vote_with_elo`` is handed back to the caller.
     """
     if winner is not None:
         # Persist the detailed vote first, then compute the rating update.
         save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
         return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)

     print("Warning: Submit called without a winner selected.")
     return {}
 def show_loading_state():
     return [
         gr.update(value="Loading new question and summaries...", interactive=False),
         gr.update(value="Loading new question and summaries...", interactive=False),
+        gr.update(interactive=False),
+        gr.update(interactive=False),
+        gr.update(interactive=False),
+        gr.update(interactive=False),
+        gr.update(visible=False),
+        gr.update(interactive=False),
+        gr.update(visible=False),
+        gr.update(interactive=False)
     ]
 def handle_new_example_click():
     """Return the first element of a fresh ``load_context`` result.

     ``load_context`` is invoked with ``set_interrupt=True``; only the first
     item of what it returns is passed on.
     """
     fresh_context = load_context(set_interrupt=True)
     return fresh_context[0]
 def update_ui_for_new_context(example):
     context_desc = example.get('processed_context_desc', '')
     if context_desc:
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
         False
     ]
 def cleanup_on_disconnect():
     """Log the browser disconnect and set the ``generation_interrupt`` flag."""
     # Plain string literal: the message has no placeholders, so an f-string
     # prefix is unnecessary (and flagged by linters as F541).
     print("Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()
 with gr.Blocks(theme=gr.themes.Default(
     primary_hue=gr.themes.colors.orange,
     secondary_hue=gr.themes.colors.slate
 )) as demo:
     css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
     with open(css_path, 'r') as f:
         css_content = f.read()
     gr.HTML(f"<style>{css_content}</style>")
     unload_js = """
     <script>
     window.addEventListener('beforeunload', function(e) {
         navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
     });
     </script>
     """
     gr.HTML(unload_js)
     current_example = gr.State({})
     model_a_name = gr.State("")
     model_b_name = gr.State("")
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
     with gr.Tabs() as tabs:
         with gr.TabItem("Arena", id="arena-tab"):
+            gr.Markdown("# SLM RAG Summarization/Generation Arena")
             gr.Markdown("""
+🏟️ This arena evaluates small language models on document QA tasks with retrieved context. Models should provide **grounded, comprehensive** answers or **properly decline** with clarification when information is insufficient.
+1️⃣ **Review the query and context** - ✨Highlighted text✨ contains key information that should be included in good answers
+2️⃣ **Compare answers** generated by two different models working with the same query and context
+3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
+> **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
+""")
             gr.HTML("<hr>")
             with gr.Column(elem_id="main-interface-area") as main_interface_area:
                 with gr.Row(elem_id="query-title-row"):
+                    gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading")
                 with gr.Row(elem_id="query-container"):
                     with gr.Row(elem_classes="query-box-row"):
                         query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
                     random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
                 context_description = gr.Markdown("", elem_classes="context-description")
                 gr.HTML("<hr>")
                 with gr.Row(elem_id="context-header-row"):
+                    gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
                     context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
                 context_display = gr.HTML(value="Loading context...", label="Context Chunks")
                 gr.Markdown("---")
+                gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
                 with gr.Row(elem_id="summary-containers"):
                     with gr.Column(scale=1):
                         with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
                                 lines=10,
                                 interactive=False,
                                 show_copy_button=True,
+                                autoscroll=False,
                                 elem_id="summary-a-display"
                             )
                     with gr.Column(scale=1):
                                 lines=10,
                                 interactive=False,
                                 show_copy_button=True,
+                                autoscroll=False,
                                 elem_id="summary-b-display"
                             )
                 gr.HTML("<hr>")
                 gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
                 with gr.Row():
                     vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
                     vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
                     vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
                 with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                     feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
                 submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
                 with gr.Column(visible=False) as results_reveal_area:
                     gr.Markdown("---")
                     gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
                     with gr.Row():
                         with gr.Column(scale=1):
                             gr.Markdown("### Model A was:", elem_classes="section-heading")
                     gr.HTML("<hr>")
                     with gr.Row(elem_classes=["control-buttons"]):
                         try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
             gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
             gr.Markdown("View performance statistics for all models ranked by Elo rating.")
             results_table_display = gr.HTML(label="Model Performance")
     context_toggle_btn.click(
         fn=toggle_context_display,
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
     demo.load(
+        fn=load_context,
         inputs=[],
         outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )
     demo.load(
         fn=load_leaderboard,
         inputs=[],
         outputs=[results_table_display]
     )
     for btn in [random_question_btn, try_another_btn]:
         btn.click(
             fn=show_loading_state,
             inputs=[],
+            outputs=[
+                summary_a_display, summary_b_display,
+                vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+                feedback_section, submit_button, results_reveal_area, random_question_btn
+            ]
         ).then(
             fn=handle_new_example_click,
             inputs=[],
             outputs=[current_example]
         ).then(
             fn=update_ui_for_new_context,
             inputs=[current_example],
             outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
         ).then(
             fn=process_example,
             inputs=[current_example],
             outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
         )
     for btn, choice in zip(
         [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
         ['left', 'right', 'tie', 'neither']
                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
         )
     feedback_checkboxes.change(
         fn=update_feedback,
         inputs=[feedback_checkboxes],
         outputs=[feedback_list]
     )
     submit_button.click(
         fn=handle_vote_submission,
         inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
                 context_toggle_btn, model_a_reveal, model_b_reveal]
     )
     tabs.select(
         fn=load_leaderboard,
         inputs=[],
         api_name="refresh_leaderboard"
     )
     demo.unload(cleanup_on_disconnect)
 if __name__ == "__main__":