oliver-aizip kai-aizip committed on
Commit
ce4dda5
·
verified ·
1 Parent(s): d9de1e9

Remove timeout & update elo ranking (#4)

Browse files

- Remove timeout & update elo ranking (ab1541bea9ef113b68d60b1c48c4be1d9ec0b2a7)


Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>

Files changed (1) hide show
  1. app.py +68 -147
app.py CHANGED
@@ -7,7 +7,8 @@ from threading import Event
7
  from utils.data_loader import get_random_example
8
  from utils.models import generate_summaries, model_names
9
  from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
10
- from utils.leaderboard import load_leaderboard_data, save_leaderboard_data
 
11
 
12
  # Global interrupt mechanism for model generation
13
  generation_interrupt = Event()
@@ -41,54 +42,14 @@ def load_context():
41
  show_full
42
  ]
43
 
44
- def generate_model_summaries_with_timeout(example, timeout=60):
45
- """Run model inference in a separate thread with timeout for interruptibility"""
46
- import threading
47
- import time
48
-
49
- result = {
50
- "model_a": "",
51
- "model_b": "",
52
- "summary_a": "",
53
- "summary_b": "",
54
- "completed": False
55
- }
56
-
57
- if generation_interrupt.is_set():
58
- return result
59
 
60
- def run_generation():
61
- try:
62
- m_a_name, m_b_name = random.sample(model_names, 2)
63
- s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
64
-
65
- if not generation_interrupt.is_set():
66
- result["model_a"] = m_a_name
67
- result["model_b"] = m_b_name
68
- result["summary_a"] = s_a
69
- result["summary_b"] = s_b
70
- result["completed"] = True
71
- except Exception as e:
72
- print(f"Error in generation thread: {e}")
73
-
74
- generation_thread = threading.Thread(target=run_generation)
75
- generation_thread.daemon = True
76
- generation_thread.start()
77
-
78
- # Uncomment this critical waiting code
79
- start_time = time.time()
80
- while time.time() - start_time < timeout:
81
- if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
82
- break
83
- time.sleep(0.1)
84
-
85
- return result
86
-
87
- async def generate_model_summaries_with_timeout_async(example, timeout=30):
88
- """Async version that properly waits for the thread"""
89
- import asyncio
90
- import threading
91
- import time
92
 
93
  result = {
94
  "model_a": "",
@@ -101,41 +62,29 @@ async def generate_model_summaries_with_timeout_async(example, timeout=30):
101
  if generation_interrupt.is_set():
102
  return result
103
 
104
- def run_generation():
105
- try:
106
- m_a_name, m_b_name = random.sample(model_names, 2)
107
- s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
108
-
109
- if not generation_interrupt.is_set():
110
- result["model_a"] = m_a_name
111
- result["model_b"] = m_b_name
112
- result["summary_a"] = s_a
113
- result["summary_b"] = s_b
114
- result["completed"] = True
115
- except Exception as e:
116
- print(f"Error in generation thread: {e}")
117
-
118
- generation_thread = threading.Thread(target=run_generation)
119
- generation_thread.daemon = True
120
- generation_thread.start()
121
-
122
- # Use asyncio.sleep instead of time.sleep for async waiting
123
- start_time = time.time()
124
- while time.time() - start_time < timeout:
125
- if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
126
- break
127
- await asyncio.sleep(0.1) # Non-blocking sleep
128
 
129
  return result
130
 
131
  def process_generation_result(result):
132
- """Process the results from the threaded generation function"""
133
  if not result["completed"]:
134
  # Generation was interrupted or failed
135
  return [
136
  "", "", "", "", None, [], False, load_leaderboard_data(),
137
- gr.update(value="Generation was interrupted or timed out. Please try again."),
138
- gr.update(value="Generation was interrupted or timed out. Please try again."),
139
  gr.update(interactive=True, elem_classes=["vote-button"]),
140
  gr.update(interactive=True, elem_classes=["vote-button"]),
141
  gr.update(interactive=True, elem_classes=["vote-button"]),
@@ -167,12 +116,9 @@ def process_generation_result(result):
167
  gr.update(interactive=True),
168
  gr.update(elem_classes=[])
169
  ]
170
- async def process_example_async(example):
171
- result = await generate_model_summaries_with_timeout_async(example)
172
- return process_generation_result(result)
173
 
174
- def process_example_sync(example):
175
- result = generate_model_summaries_with_timeout(example)
176
  return process_generation_result(result)
177
 
178
  def select_vote_improved(winner_choice):
@@ -204,69 +150,17 @@ def select_vote_improved(winner_choice):
204
  gr.update(elem_classes=btn_neither_classes)
205
  ]
206
 
207
- def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
208
- """Processes vote and updates leaderboard"""
209
  if winner is None:
210
  print("Warning: Submit called without a winner selected.")
211
  return {}
212
 
213
- updated_results = current_results.copy()
214
- models_involved = [m_a, m_b]
215
- for model in models_involved:
216
- if model not in updated_results["wins"]:
217
- updated_results["wins"][model] = 0
218
- updated_results["losses"][model] = 0
219
- updated_results["ties"][model] = 0
220
-
221
- if winner == 'left':
222
- updated_results["wins"][m_a] = updated_results["wins"].get(m_a, 0) + 1
223
- updated_results["losses"][m_b] = updated_results["losses"].get(m_b, 0) + 1
224
- elif winner == 'right':
225
- updated_results["wins"][m_b] = updated_results["wins"].get(m_b, 0) + 1
226
- updated_results["losses"][m_a] = updated_results["losses"].get(m_a, 0) + 1
227
- elif winner == 'tie':
228
- updated_results["ties"][m_a] = updated_results["ties"].get(m_a, 0) + 1
229
- updated_results["ties"][m_b] = updated_results["ties"].get(m_b, 0) + 1
230
-
231
- updated_results["votes"] = updated_results.get("votes", 0) + 1
232
- save_leaderboard_data(updated_results)
233
-
234
- # Prepare Results Table
235
- results_list = []
236
- all_models = list(set(list(updated_results["wins"].keys()) +
237
- list(updated_results["losses"].keys()) +
238
- list(updated_results["ties"].keys())))
239
-
240
- for model in sorted(all_models):
241
- wins = updated_results["wins"].get(model, 0)
242
- losses = updated_results["losses"].get(model, 0)
243
- ties = updated_results["ties"].get(model, 0)
244
- total_comparisons = wins + losses + ties
245
- win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
246
- results_list.append({
247
- "Model": model,
248
- "Win Rate (%)": f"{win_rate:.1%}",
249
- "Wins": wins,
250
- "Losses": losses,
251
- "Ties": ties,
252
- "Comparisons": total_comparisons
253
- })
254
-
255
- results_df = pd.DataFrame(results_list)
256
- if not results_df.empty:
257
- results_df['Win Rate Value'] = results_df['Win Rate (%)'].str.rstrip('%').astype('float') / 100.0
258
- results_df = results_df.sort_values(by='Win Rate Value', ascending=False).drop(columns=['Win Rate Value'])
259
-
260
- return [
261
- True, updated_results,
262
- gr.update(interactive=False), gr.update(interactive=False),
263
- gr.update(interactive=False), gr.update(interactive=False),
264
- gr.update(interactive=False), gr.update(visible=True),
265
- gr.update(visible=False), gr.update(visible=True),
266
- gr.update(interactive=False), gr.update(value=results_df, visible=True),
267
- gr.update(elem_classes=["results-revealed"]),
268
- gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
269
- ]
270
 
271
  # Create Gradio interface
272
  with gr.Blocks(theme=gr.themes.Default(
@@ -288,7 +182,7 @@ with gr.Blocks(theme=gr.themes.Default(
288
  selected_winner = gr.State(None)
289
  feedback_list = gr.State([])
290
  show_results_state = gr.State(False)
291
- results_agg = gr.State({"wins": {}, "losses": {}, "ties": {}, "votes": 0})
292
  show_full_context = gr.State(False)
293
 
294
  # Create Tabs
@@ -365,9 +259,21 @@ with gr.Blocks(theme=gr.themes.Default(
365
 
366
  # Leaderboard Tab
367
  with gr.TabItem("Leaderboard", id="leaderboard-tab"):
368
- gr.Markdown("# Model Performance Leaderboard")
369
- gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
370
- results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  # Generic function to handle starting a new example
373
  def handle_new_example_click():
@@ -398,7 +304,7 @@ with gr.Blocks(theme=gr.themes.Default(
398
  outputs=[current_example, query_display, context_description, context_display,
399
  context_toggle_btn, show_full_context]
400
  ).then(
401
- fn=process_example_async,
402
  inputs=[current_example],
403
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
404
  selected_winner, feedback_list, show_results_state, results_agg,
@@ -407,6 +313,13 @@ with gr.Blocks(theme=gr.themes.Default(
407
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
408
  )
409
 
 
 
 
 
 
 
 
410
  # Random Question and Try Another buttons with interruption
411
  for btn in [random_question_btn, try_another_btn]:
412
  btn.click(
@@ -419,7 +332,7 @@ with gr.Blocks(theme=gr.themes.Default(
419
  outputs=[query_display, context_description, context_display,
420
  context_toggle_btn, show_full_context]
421
  ).then(
422
- fn=process_example_sync,
423
  inputs=[current_example],
424
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
425
  selected_winner, feedback_list, show_results_state, results_agg,
@@ -449,14 +362,22 @@ with gr.Blocks(theme=gr.themes.Default(
449
 
450
  # Process vote submission and reveal results
451
  submit_button.click(
452
- fn=submit_vote_fixed,
453
- inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
454
  outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
455
  vote_button_tie, vote_button_neither, feedback_checkboxes,
456
  feedback_section, submit_button, results_reveal_area,
457
  random_question_btn, results_table_display, main_interface_area,
458
  context_toggle_btn, model_a_reveal, model_b_reveal]
459
  )
 
 
 
 
 
 
 
 
460
 
461
  if __name__ == "__main__":
462
  demo.launch(debug=True)
 
7
  from utils.data_loader import get_random_example
8
  from utils.models import generate_summaries, model_names
9
  from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
10
+ from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
11
+ from utils.vote_logger import save_vote_details
12
 
13
  # Global interrupt mechanism for model generation
14
  generation_interrupt = Event()
 
42
  show_full
43
  ]
44
 
45
+ def load_leaderboard():
46
+ """Loads and displays the leaderboard data"""
47
+ results = load_leaderboard_data()
48
+ leaderboard_html = generate_leaderboard_html(results)
49
+ return leaderboard_html
 
 
 
 
 
 
 
 
 
 
50
 
51
+ def generate_model_summaries(example):
52
+ """Run model inference"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  result = {
55
  "model_a": "",
 
62
  if generation_interrupt.is_set():
63
  return result
64
 
65
+ try:
66
+ m_a_name, m_b_name = random.sample(model_names, 2)
67
+ s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
68
+
69
+ if not generation_interrupt.is_set():
70
+ result["model_a"] = m_a_name
71
+ result["model_b"] = m_b_name
72
+ result["summary_a"] = s_a
73
+ result["summary_b"] = s_b
74
+ result["completed"] = True
75
+ except Exception as e:
76
+ print(f"Error in generation: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  return result
79
 
80
  def process_generation_result(result):
81
+ """Process the results from the generation function"""
82
  if not result["completed"]:
83
  # Generation was interrupted or failed
84
  return [
85
  "", "", "", "", None, [], False, load_leaderboard_data(),
86
+ gr.update(value="Generation was interrupted or failed. Please try again."),
87
+ gr.update(value="Generation was interrupted or failed. Please try again."),
88
  gr.update(interactive=True, elem_classes=["vote-button"]),
89
  gr.update(interactive=True, elem_classes=["vote-button"]),
90
  gr.update(interactive=True, elem_classes=["vote-button"]),
 
116
  gr.update(interactive=True),
117
  gr.update(elem_classes=[])
118
  ]
 
 
 
119
 
120
+ def process_example(example):
121
+ result = generate_model_summaries(example)
122
  return process_generation_result(result)
123
 
124
  def select_vote_improved(winner_choice):
 
150
  gr.update(elem_classes=btn_neither_classes)
151
  ]
152
 
153
+ def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
154
+ """Handle vote submission - logs details and updates leaderboard"""
155
  if winner is None:
156
  print("Warning: Submit called without a winner selected.")
157
  return {}
158
 
159
+ # Save detailed vote information
160
+ save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
161
+
162
+ # Update Elo ratings and get UI updates
163
+ return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  # Create Gradio interface
166
  with gr.Blocks(theme=gr.themes.Default(
 
182
  selected_winner = gr.State(None)
183
  feedback_list = gr.State([])
184
  show_results_state = gr.State(False)
185
+ results_agg = gr.State(load_leaderboard_data())
186
  show_full_context = gr.State(False)
187
 
188
  # Create Tabs
 
259
 
260
  # Leaderboard Tab
261
  with gr.TabItem("Leaderboard", id="leaderboard-tab"):
262
+ gr.Markdown("# Model Performance Leaderboard", elem_classes="orange-title")
263
+ gr.Markdown("View performance statistics for all models ranked by Elo rating.")
264
+
265
+ with gr.Group(elem_id="leaderboard-info"):
266
+ gr.Markdown("""### About Elo Ratings
267
+
268
+ The Elo rating system provides a more accurate ranking than simple win rates:
269
+
270
+ - All models start at 1500 points
271
+ - Points are exchanged after each comparison based on the expected outcome
272
+ - Beating a stronger model earns more points than beating a weaker one
273
+ - The ± value shows the statistical confidence interval (95%)
274
+ """)
275
+
276
+ results_table_display = gr.HTML(label="Model Performance")
277
 
278
  # Generic function to handle starting a new example
279
  def handle_new_example_click():
 
304
  outputs=[current_example, query_display, context_description, context_display,
305
  context_toggle_btn, show_full_context]
306
  ).then(
307
+ fn=process_example,
308
  inputs=[current_example],
309
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
310
  selected_winner, feedback_list, show_results_state, results_agg,
 
313
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
314
  )
315
 
316
+ # Load leaderboard content on app start
317
+ demo.load(
318
+ fn=load_leaderboard,
319
+ inputs=[],
320
+ outputs=[results_table_display]
321
+ )
322
+
323
  # Random Question and Try Another buttons with interruption
324
  for btn in [random_question_btn, try_another_btn]:
325
  btn.click(
 
332
  outputs=[query_display, context_description, context_display,
333
  context_toggle_btn, show_full_context]
334
  ).then(
335
+ fn=process_example,
336
  inputs=[current_example],
337
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
338
  selected_winner, feedback_list, show_results_state, results_agg,
 
362
 
363
  # Process vote submission and reveal results
364
  submit_button.click(
365
+ fn=handle_vote_submission,
366
+ inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
367
  outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
368
  vote_button_tie, vote_button_neither, feedback_checkboxes,
369
  feedback_section, submit_button, results_reveal_area,
370
  random_question_btn, results_table_display, main_interface_area,
371
  context_toggle_btn, model_a_reveal, model_b_reveal]
372
  )
373
+
374
+ # Refresh leaderboard when switching to the leaderboard tab
375
+ tabs.select(
376
+ fn=load_leaderboard,
377
+ inputs=[],
378
+ outputs=[results_table_display],
379
+ api_name="refresh_leaderboard"
380
+ )
381
 
382
  if __name__ == "__main__":
383
  demo.launch(debug=True)