SLM-RAG-Arena

Running on Zero

App Files Files Community

oliver-aizip

kai-aizip committed on 6 days ago

Commit

ce4dda5

verified ·

1 Parent(s): d9de1e9

Remove timeout & update elo ranking (#4)

Browse files

- Remove timeout & update elo ranking (ab1541bea9ef113b68d60b1c48c4be1d9ec0b2a7)

Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>

Files changed (1) hide show

app.py +68 -147

app.py CHANGED Viewed

@@ -7,7 +7,8 @@ from threading import Event
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
-from utils.leaderboard import load_leaderboard_data, save_leaderboard_data
 # Global interrupt mechanism for model generation
 generation_interrupt = Event()
@@ -41,54 +42,14 @@ def load_context():
         show_full
     ]
-def generate_model_summaries_with_timeout(example, timeout=60):
-    """Run model inference in a separate thread with timeout for interruptibility"""
-    import threading
-    import time
-    result = {
-        "model_a": "",
-        "model_b": "",
-        "summary_a": "",
-        "summary_b": "",
-        "completed": False
-    }
-    if generation_interrupt.is_set():
-        return result
-    def run_generation():
-        try:
-            m_a_name, m_b_name = random.sample(model_names, 2)
-            s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
-            if not generation_interrupt.is_set():
-                result["model_a"] = m_a_name
-                result["model_b"] = m_b_name
-                result["summary_a"] = s_a
-                result["summary_b"] = s_b
-                result["completed"] = True
-        except Exception as e:
-            print(f"Error in generation thread: {e}")
-    generation_thread = threading.Thread(target=run_generation)
-    generation_thread.daemon = True
-    generation_thread.start()
-    # Uncomment this critical waiting code
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-        if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
-            break
-        time.sleep(0.1)
-    return result
-async def generate_model_summaries_with_timeout_async(example, timeout=30):
-    """Async version that properly waits for the thread"""
-    import asyncio
-    import threading
-    import time
     result = {
         "model_a": "",
@@ -101,41 +62,29 @@ async def generate_model_summaries_with_timeout_async(example, timeout=30):
     if generation_interrupt.is_set():
         return result
-    def run_generation():
-        try:
-            m_a_name, m_b_name = random.sample(model_names, 2)
-            s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
-            if not generation_interrupt.is_set():
-                result["model_a"] = m_a_name
-                result["model_b"] = m_b_name
-                result["summary_a"] = s_a
-                result["summary_b"] = s_b
-                result["completed"] = True
-        except Exception as e:
-            print(f"Error in generation thread: {e}")
-    generation_thread = threading.Thread(target=run_generation)
-    generation_thread.daemon = True
-    generation_thread.start()
-    # Use asyncio.sleep instead of time.sleep for async waiting
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-        if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
-            break
-        await asyncio.sleep(0.1)  # Non-blocking sleep
     return result
 def process_generation_result(result):
-    """Process the results from the threaded generation function"""
     if not result["completed"]:
         # Generation was interrupted or failed
         return [
             "", "", "", "", None, [], False, load_leaderboard_data(),
-            gr.update(value="Generation was interrupted or timed out. Please try again."),
-            gr.update(value="Generation was interrupted or timed out. Please try again."),
             gr.update(interactive=True, elem_classes=["vote-button"]),
             gr.update(interactive=True, elem_classes=["vote-button"]),
             gr.update(interactive=True, elem_classes=["vote-button"]),
@@ -167,12 +116,9 @@ def process_generation_result(result):
         gr.update(interactive=True),
         gr.update(elem_classes=[])
     ]
-async def process_example_async(example):
-    result = await generate_model_summaries_with_timeout_async(example)
-    return process_generation_result(result)
-def process_example_sync(example):
-    result = generate_model_summaries_with_timeout(example)
     return process_generation_result(result)
 def select_vote_improved(winner_choice):
@@ -204,69 +150,17 @@ def select_vote_improved(winner_choice):
         gr.update(elem_classes=btn_neither_classes)
     ]
-def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
-    """Processes vote and updates leaderboard"""
     if winner is None:
         print("Warning: Submit called without a winner selected.")
         return {}
-    updated_results = current_results.copy()
-    models_involved = [m_a, m_b]
-    for model in models_involved:
-         if model not in updated_results["wins"]:
-            updated_results["wins"][model] = 0
-            updated_results["losses"][model] = 0
-            updated_results["ties"][model] = 0
-    if winner == 'left':
-        updated_results["wins"][m_a] = updated_results["wins"].get(m_a, 0) + 1
-        updated_results["losses"][m_b] = updated_results["losses"].get(m_b, 0) + 1
-    elif winner == 'right':
-        updated_results["wins"][m_b] = updated_results["wins"].get(m_b, 0) + 1
-        updated_results["losses"][m_a] = updated_results["losses"].get(m_a, 0) + 1
-    elif winner == 'tie':
-        updated_results["ties"][m_a] = updated_results["ties"].get(m_a, 0) + 1
-        updated_results["ties"][m_b] = updated_results["ties"].get(m_b, 0) + 1
-    updated_results["votes"] = updated_results.get("votes", 0) + 1
-    save_leaderboard_data(updated_results)
-    # Prepare Results Table
-    results_list = []
-    all_models = list(set(list(updated_results["wins"].keys()) +
-                          list(updated_results["losses"].keys()) +
-                          list(updated_results["ties"].keys())))
-    for model in sorted(all_models):
-        wins = updated_results["wins"].get(model, 0)
-        losses = updated_results["losses"].get(model, 0)
-        ties = updated_results["ties"].get(model, 0)
-        total_comparisons = wins + losses + ties
-        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
-        results_list.append({
-            "Model": model,
-            "Win Rate (%)": f"{win_rate:.1%}",
-            "Wins": wins,
-            "Losses": losses,
-            "Ties": ties,
-            "Comparisons": total_comparisons
-        })
-    results_df = pd.DataFrame(results_list)
-    if not results_df.empty:
-        results_df['Win Rate Value'] = results_df['Win Rate (%)'].str.rstrip('%').astype('float') / 100.0
-        results_df = results_df.sort_values(by='Win Rate Value', ascending=False).drop(columns=['Win Rate Value'])
-    return [
-        True, updated_results,
-        gr.update(interactive=False), gr.update(interactive=False),
-        gr.update(interactive=False), gr.update(interactive=False),
-        gr.update(interactive=False), gr.update(visible=True),
-        gr.update(visible=False), gr.update(visible=True),
-        gr.update(interactive=False), gr.update(value=results_df, visible=True),
-        gr.update(elem_classes=["results-revealed"]),
-        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
-    ]
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Default(
@@ -288,7 +182,7 @@ with gr.Blocks(theme=gr.themes.Default(
     selected_winner = gr.State(None)
     feedback_list = gr.State([])
     show_results_state = gr.State(False)
-    results_agg = gr.State({"wins": {}, "losses": {}, "ties": {}, "votes": 0})
     show_full_context = gr.State(False)
     # Create Tabs
@@ -365,9 +259,21 @@ with gr.Blocks(theme=gr.themes.Default(
         # Leaderboard Tab
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
-            gr.Markdown("# Model Performance Leaderboard")
-            gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
-            results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)
     # Generic function to handle starting a new example
     def handle_new_example_click():
@@ -398,7 +304,7 @@ with gr.Blocks(theme=gr.themes.Default(
         outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
     ).then(
-        fn=process_example_async,
         inputs=[current_example],
         outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                 selected_winner, feedback_list, show_results_state, results_agg,
@@ -407,6 +313,13 @@ with gr.Blocks(theme=gr.themes.Default(
                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )
     # Random Question and Try Another buttons with interruption
     for btn in [random_question_btn, try_another_btn]:
         btn.click(
@@ -419,7 +332,7 @@ with gr.Blocks(theme=gr.themes.Default(
             outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
         ).then(
-            fn=process_example_sync,
             inputs=[current_example],
             outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                     selected_winner, feedback_list, show_results_state, results_agg,
@@ -449,14 +362,22 @@ with gr.Blocks(theme=gr.themes.Default(
     # Process vote submission and reveal results
     submit_button.click(
-        fn=submit_vote_fixed,
-        inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
         outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes,
                 feedback_section, submit_button, results_reveal_area,
                 random_question_btn, results_table_display, main_interface_area,
                 context_toggle_btn, model_a_reveal, model_b_reveal]
     )
 if __name__ == "__main__":
     demo.launch(debug=True)

 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
+from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
+from utils.vote_logger import save_vote_details
 # Global interrupt mechanism for model generation
 generation_interrupt = Event()
         show_full
     ]
+def load_leaderboard():
+    """Loads and displays the leaderboard data"""
+    results = load_leaderboard_data()
+    leaderboard_html = generate_leaderboard_html(results)
+    return leaderboard_html
+def generate_model_summaries(example):
+    """Run model inference"""
     result = {
         "model_a": "",
     if generation_interrupt.is_set():
         return result
+    try:
+        m_a_name, m_b_name = random.sample(model_names, 2)
+        s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
+        if not generation_interrupt.is_set():
+            result["model_a"] = m_a_name
+            result["model_b"] = m_b_name
+            result["summary_a"] = s_a
+            result["summary_b"] = s_b
+            result["completed"] = True
+    except Exception as e:
+        print(f"Error in generation: {e}")
     return result
 def process_generation_result(result):
+    """Process the results from the generation function"""
     if not result["completed"]:
         # Generation was interrupted or failed
         return [
             "", "", "", "", None, [], False, load_leaderboard_data(),
+            gr.update(value="Generation was interrupted or failed. Please try again."),
+            gr.update(value="Generation was interrupted or failed. Please try again."),
             gr.update(interactive=True, elem_classes=["vote-button"]),
             gr.update(interactive=True, elem_classes=["vote-button"]),
             gr.update(interactive=True, elem_classes=["vote-button"]),
         gr.update(interactive=True),
         gr.update(elem_classes=[])
     ]
+def process_example(example):
+    result = generate_model_summaries(example)
     return process_generation_result(result)
 def select_vote_improved(winner_choice):
         gr.update(elem_classes=btn_neither_classes)
     ]
+def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
+    """Handle vote submission - logs details and updates leaderboard"""
     if winner is None:
         print("Warning: Submit called without a winner selected.")
         return {}
+    # Save detailed vote information
+    save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
+    # Update Elo ratings and get UI updates
+    return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Default(
     selected_winner = gr.State(None)
     feedback_list = gr.State([])
     show_results_state = gr.State(False)
+    results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
     # Create Tabs
         # Leaderboard Tab
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
+            gr.Markdown("# Model Performance Leaderboard", elem_classes="orange-title")
+            gr.Markdown("View performance statistics for all models ranked by Elo rating.")
+            with gr.Group(elem_id="leaderboard-info"):
+                gr.Markdown("""### About Elo Ratings
+The Elo rating system provides a more accurate ranking than simple win rates:
+- All models start at 1500 points
+- Points are exchanged after each comparison based on the expected outcome
+- Beating a stronger model earns more points than beating a weaker one
+- The ± value shows the statistical confidence interval (95%)
+""")
+            results_table_display = gr.HTML(label="Model Performance")
     # Generic function to handle starting a new example
     def handle_new_example_click():
         outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
     ).then(
+        fn=process_example,
         inputs=[current_example],
         outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                 selected_winner, feedback_list, show_results_state, results_agg,
                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )
+    # Load leaderboard content on app start
+    demo.load(
+        fn=load_leaderboard,
+        inputs=[],
+        outputs=[results_table_display]
+    )
     # Random Question and Try Another buttons with interruption
     for btn in [random_question_btn, try_another_btn]:
         btn.click(
             outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
         ).then(
+            fn=process_example,
             inputs=[current_example],
             outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                     selected_winner, feedback_list, show_results_state, results_agg,
     # Process vote submission and reveal results
     submit_button.click(
+        fn=handle_vote_submission,
+        inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
         outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes,
                 feedback_section, submit_button, results_reveal_area,
                 random_question_btn, results_table_display, main_interface_area,
                 context_toggle_btn, model_a_reveal, model_b_reveal]
     )
+    # Refresh leaderboard when switching to the leaderboard tab
+    tabs.select(
+        fn=load_leaderboard,
+        inputs=[],
+        outputs=[results_table_display],
+        api_name="refresh_leaderboard"
+    )
 if __name__ == "__main__":
     demo.launch(debug=True)