Spaces:
Running
Running
13-14 Nov changes
Browse files
app.py
CHANGED
@@ -4,13 +4,19 @@ import random
|
|
4 |
from collections import defaultdict
|
5 |
from datetime import datetime, timezone
|
6 |
import hashlib
|
|
|
7 |
|
8 |
from dotenv import load_dotenv
|
9 |
|
10 |
load_dotenv()
|
11 |
|
12 |
import gradio as gr
|
13 |
-
from gen_api_answer import
|
|
|
|
|
|
|
|
|
|
|
14 |
from db import add_vote, create_db_connection, get_votes
|
15 |
from utils import Vote
|
16 |
from common import (
|
@@ -26,12 +32,16 @@ from common import (
|
|
26 |
EVAL_DESCRIPTION,
|
27 |
VOTING_HEADER,
|
28 |
)
|
29 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
|
32 |
-
# Model and ELO score data
|
33 |
-
DEFAULT_ELO = 1200 # Starting ELO for new models
|
34 |
-
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
35 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
36 |
vote_counts = defaultdict(int)
|
37 |
|
@@ -143,6 +153,30 @@ def get_ip(request: gr.Request) -> str:
|
|
143 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
144 |
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
def vote(
|
147 |
choice,
|
148 |
model_a,
|
@@ -192,16 +226,20 @@ def vote(
|
|
192 |
store_vote_data(
|
193 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
194 |
)
|
195 |
-
|
|
|
|
|
|
|
196 |
# Return updates for UI components
|
197 |
return [
|
198 |
-
gr.update(
|
199 |
-
gr.update(
|
200 |
-
gr.update(
|
201 |
gr.update(value=f"*Model: {model_a}*"), # model_name_a
|
202 |
gr.update(value=f"*Model: {model_b}*"), # model_name_b
|
203 |
-
gr.update(interactive=True, value="
|
204 |
-
gr.update(
|
|
|
205 |
]
|
206 |
|
207 |
|
@@ -210,150 +248,24 @@ def get_current_votes():
|
|
210 |
return get_votes(db)
|
211 |
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
voting_data = get_current_votes()
|
217 |
-
print(f"Fetched {len(voting_data)} votes from database") # Debug log
|
218 |
-
|
219 |
-
# Initialize dictionaries for tracking
|
220 |
-
ratings = defaultdict(lambda: DEFAULT_ELO)
|
221 |
-
matches = defaultdict(int)
|
222 |
-
|
223 |
-
# Process each vote
|
224 |
-
for vote in voting_data:
|
225 |
-
try:
|
226 |
-
model_a = vote.get("model_a")
|
227 |
-
model_b = vote.get("model_b")
|
228 |
-
winner = vote.get("winner")
|
229 |
-
|
230 |
-
# Skip if models aren't in current model_data
|
231 |
-
if (
|
232 |
-
not all([model_a, model_b, winner])
|
233 |
-
or model_a not in model_data
|
234 |
-
or model_b not in model_data
|
235 |
-
):
|
236 |
-
continue
|
237 |
-
|
238 |
-
# Update match counts
|
239 |
-
matches[model_a] += 1
|
240 |
-
matches[model_b] += 1
|
241 |
-
|
242 |
-
# Calculate ELO changes
|
243 |
-
elo_a = ratings[model_a]
|
244 |
-
elo_b = ratings[model_b]
|
245 |
-
|
246 |
-
# Expected scores
|
247 |
-
expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
|
248 |
-
expected_b = 1 - expected_a
|
249 |
-
|
250 |
-
# Actual scores
|
251 |
-
score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
|
252 |
-
score_b = 1 - score_a
|
253 |
-
|
254 |
-
# Update ratings
|
255 |
-
ratings[model_a] += K_FACTOR * (score_a - expected_a)
|
256 |
-
ratings[model_b] += K_FACTOR * (score_b - expected_b)
|
257 |
-
|
258 |
-
except Exception as e:
|
259 |
-
print(f"Error processing vote: {e}")
|
260 |
-
continue
|
261 |
-
|
262 |
-
# Generate leaderboard data
|
263 |
-
leaderboard = []
|
264 |
-
for model in model_data.keys():
|
265 |
-
votes = matches[model]
|
266 |
-
# Skip models with < 500 votes if show_preliminary is False
|
267 |
-
if not show_preliminary and votes < 500:
|
268 |
-
continue
|
269 |
-
|
270 |
-
elo = ratings[model]
|
271 |
-
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
272 |
-
data = {
|
273 |
-
"Model": model,
|
274 |
-
"ELO Score": f"{int(elo)}",
|
275 |
-
"95% CI": f"±{int(ci)}",
|
276 |
-
"# Votes": votes,
|
277 |
-
"Organization": model_data[model]["organization"],
|
278 |
-
"License": model_data[model]["license"],
|
279 |
-
}
|
280 |
-
leaderboard.append(data)
|
281 |
-
|
282 |
-
# Sort leaderboard by ELO score in descending order
|
283 |
-
leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
|
284 |
-
|
285 |
-
return leaderboard
|
286 |
-
|
287 |
-
|
288 |
-
def calculate_elo_change(rating_a, rating_b, winner):
    """Return the (change_a, change_b) ELO deltas for one comparison.

    ``winner`` equal to "A" or "B" awards that side a full point; any other
    value is scored as a tie (half a point each). The deltas are scaled by
    the module-level ``K_FACTOR``.
    """
    # Expected score of A from the rating gap; B gets the remainder.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a

    # Actual result: 1 for a win, 0 for a loss, 0.5 each on a tie.
    score_a = 1 if winner == "A" else (0 if winner == "B" else 0.5)
    score_b = 1 - score_a

    return (
        K_FACTOR * (score_a - expected_a),
        K_FACTOR * (score_b - expected_b),
    )
|
304 |
-
|
305 |
-
|
306 |
-
def update_leaderboard():
|
307 |
-
"""Generate leaderboard DataFrame using fresh votes from MongoDB."""
|
308 |
-
# Get fresh voting data
|
309 |
voting_data = get_current_votes()
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
if not all([model_a, model_b, winner]):
|
325 |
-
print(f"Missing required fields in vote: {vote}")
|
326 |
-
continue
|
327 |
-
|
328 |
-
if model_a not in model_data:
|
329 |
-
print(f"Model A '{model_a}' not found in model_data")
|
330 |
-
continue
|
331 |
-
|
332 |
-
if model_b not in model_data:
|
333 |
-
print(f"Model B '{model_b}' not found in model_data")
|
334 |
-
continue
|
335 |
-
|
336 |
-
# Update match counts
|
337 |
-
matches[model_a] += 1
|
338 |
-
matches[model_b] += 1
|
339 |
-
print(
|
340 |
-
f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
|
341 |
-
)
|
342 |
-
except Exception as e:
|
343 |
-
print(f"Error processing vote: {e}")
|
344 |
-
print(f"Problematic vote data: {vote}")
|
345 |
-
continue
|
346 |
-
|
347 |
-
|
348 |
-
# Update the display_leaderboard function
|
349 |
-
def display_leaderboard():
    """Wrap the result of ``update_leaderboard()`` in a ``gr.DataFrame``."""
    df = update_leaderboard()
    column_headers = ["Model", "ELO", "95% CI", "Matches", "Organization", "License"]
    # NOTE(review): seven datatypes are listed for six headers - confirm
    # whether the trailing "str" is intentional.
    column_types = ["str", "number", "str", "number", "str", "str", "str"]
    # One row more than the data holds; the row count is marked "dynamic".
    table_rows = (len(df) + 1, "dynamic")
    return gr.DataFrame(
        value=df,
        headers=column_headers,
        datatype=column_types,
        row_count=table_rows,
    )
|
357 |
|
358 |
|
359 |
# Update the leaderboard table definition in the UI
|
@@ -363,63 +275,22 @@ leaderboard_table = gr.Dataframe(
|
|
363 |
)
|
364 |
|
365 |
|
366 |
-
def get_leaderboard_stats():
    """Return a markdown summary of the leaderboard.

    Reports the number of entries in ``model_data``, the number of votes
    returned by ``get_current_votes()``, and the current UTC time truncated
    to the start of the hour.
    """
    current_time = datetime.now(timezone.utc)
    total_votes = len(get_current_votes())
    total_models = len(model_data)
    # Truncate to the top of the hour before formatting.
    top_of_hour = current_time.replace(minute=0, second=0, microsecond=0)
    last_updated = top_of_hour.strftime("%B %d, %Y at %H:00 UTC")

    return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
|
381 |
-
|
382 |
-
|
383 |
-
#def set_example_metric(metric_name):
|
384 |
-
# if metric_name == "Custom":
|
385 |
-
# variables = parse_variables(DEFAULT_EVAL_PROMPT)
|
386 |
-
# variable_values = []
|
387 |
-
# for var in variables:
|
388 |
-
# if var == "input":
|
389 |
-
# variable_values.append(DEFAULT_INPUT)
|
390 |
-
# elif var == "response":
|
391 |
-
# variable_values.append(DEFAULT_RESPONSE)
|
392 |
-
# else:
|
393 |
-
# variable_values.append("") # Default empty value
|
394 |
-
# Pad variable_values to match the length of variable_rows
|
395 |
-
# while len(variable_values) < len(variable_rows):
|
396 |
-
# variable_values.append("")
|
397 |
-
# return [DEFAULT_EVAL_PROMPT] + variable_values
|
398 |
-
|
399 |
-
# metric_data = EXAMPLE_METRICS[metric_name]
|
400 |
-
# variables = parse_variables(metric_data["prompt"])
|
401 |
-
# variable_values = []
|
402 |
-
# for var in variables:
|
403 |
-
# value = metric_data.get(var, "") # Default to empty string if not found
|
404 |
-
# variable_values.append(value)
|
405 |
-
# Pad variable_values to match the length of variable_rows
|
406 |
-
# while len(variable_values) < len(variable_rows):
|
407 |
-
# variable_values.append("")
|
408 |
-
# return [metric_data["prompt"]] + variable_values
|
409 |
-
|
410 |
-
|
411 |
-
# Select random metric at startup
|
412 |
-
# def get_random_metric():
|
413 |
-
# metrics = list(EXAMPLE_METRICS.keys())
|
414 |
-
# return set_example_metric(random.choice(metrics))
|
415 |
-
|
416 |
-
|
417 |
def populate_random_example(request: gr.Request):
    """Return updates that fill in a random human message and AI response.

    ``request`` is accepted but not used by the body.
    """
    human_msg, ai_msg = get_random_human_ai_pair()
    # One update per text area, human message first.
    return [gr.update(value=text) for text in (human_msg, ai_msg)]
|
424 |
|
425 |
|
@@ -435,27 +306,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
435 |
|
436 |
with gr.Tabs():
|
437 |
with gr.TabItem("Judge Arena"):
|
438 |
-
random_btn = gr.Button("🎲", scale=0)
|
439 |
with gr.Row():
|
440 |
# Left side - Input section
|
441 |
with gr.Column(scale=1):
|
442 |
with gr.Group():
|
443 |
human_input = gr.TextArea(
|
444 |
label="👩 Human Input",
|
445 |
-
lines=
|
446 |
placeholder="Enter the human message here..."
|
447 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
|
449 |
ai_response = gr.TextArea(
|
450 |
label="🤖 AI Response",
|
451 |
-
lines=
|
452 |
placeholder="Enter the AI response here..."
|
453 |
)
|
454 |
|
|
|
|
|
455 |
send_btn = gr.Button(
|
456 |
-
value="Run
|
457 |
variant="primary",
|
458 |
-
size="lg"
|
|
|
459 |
)
|
460 |
|
461 |
# Right side - Model outputs
|
@@ -466,17 +345,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
466 |
with gr.Row():
|
467 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
468 |
score_a = gr.Textbox(label="Score", lines=6, interactive=False)
|
469 |
-
vote_a = gr.Button("Vote A", variant="primary",
|
470 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
471 |
critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
|
472 |
|
473 |
-
# Spacing div that's visible only when tie button is hidden
|
474 |
-
spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
|
475 |
-
|
476 |
# Tie button row
|
477 |
-
with gr.Row(
|
478 |
with gr.Column():
|
479 |
-
vote_tie = gr.Button("Tie", variant="
|
480 |
|
481 |
|
482 |
gr.Markdown("### 🧑⚖️ Judge B")
|
@@ -485,13 +361,17 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
485 |
with gr.Row():
|
486 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
487 |
score_b = gr.Textbox(label="Score", lines=6, interactive=False)
|
488 |
-
vote_b = gr.Button("Vote B", variant="primary",
|
489 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
490 |
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
|
491 |
# Place Vote B button directly under Judge B
|
492 |
|
493 |
gr.Markdown("<br>")
|
494 |
|
|
|
|
|
|
|
|
|
495 |
# Add spacing and acknowledgements at the bottom
|
496 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
497 |
|
@@ -510,24 +390,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
510 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
511 |
)
|
512 |
|
513 |
-
# Update refresh_leaderboard to use the checkbox value
|
514 |
-
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard rows and the stats text.

    Returns two ``gr.update`` objects: the table rows first, the stats
    markdown second.
    """
    rows = []
    for entry in get_leaderboard(show_preliminary):
        rows.append(
            [
                entry["Model"],
                float(entry["ELO Score"]),  # stored as text; the table wants a number
                entry["95% CI"],
                entry["# Votes"],
                entry["Organization"],
                entry["License"],
            ]
        )
    summary = get_leaderboard_stats()
    return [gr.update(value=rows), gr.update(value=summary)]
|
530 |
-
|
531 |
# Add change handler for checkbox
|
532 |
show_preliminary.change(
|
533 |
fn=refresh_leaderboard,
|
@@ -551,35 +413,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
551 |
final_prompt_state = gr.State()
|
552 |
|
553 |
# Update variable inputs based on the eval prompt
|
554 |
-
def update_variables(eval_prompt):
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
gr.update(visible=False), # Hide the variable row
|
579 |
-
gr.update(value="", visible=False), # Clear value when hidden
|
580 |
-
]
|
581 |
-
|
582 |
-
|
583 |
|
584 |
#eval_prompt.change(
|
585 |
# fn=update_variables,
|
@@ -619,7 +481,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
619 |
vote_a.click(
|
620 |
fn=vote,
|
621 |
inputs=[
|
622 |
-
gr.State("A"),
|
623 |
model_a_state,
|
624 |
model_b_state,
|
625 |
final_prompt_state,
|
@@ -631,18 +493,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
631 |
outputs=[
|
632 |
vote_a,
|
633 |
vote_b,
|
634 |
-
|
635 |
model_name_a,
|
636 |
model_name_b,
|
637 |
send_btn,
|
638 |
-
|
|
|
639 |
],
|
640 |
)
|
641 |
|
642 |
vote_b.click(
|
643 |
fn=vote,
|
644 |
inputs=[
|
645 |
-
gr.State("B"),
|
646 |
model_a_state,
|
647 |
model_b_state,
|
648 |
final_prompt_state,
|
@@ -654,18 +517,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
654 |
outputs=[
|
655 |
vote_a,
|
656 |
vote_b,
|
657 |
-
|
658 |
model_name_a,
|
659 |
model_name_b,
|
660 |
send_btn,
|
661 |
-
|
|
|
662 |
],
|
663 |
)
|
664 |
|
665 |
vote_tie.click(
|
666 |
fn=vote,
|
667 |
inputs=[
|
668 |
-
gr.State("Tie"),
|
669 |
model_a_state,
|
670 |
model_b_state,
|
671 |
final_prompt_state,
|
@@ -677,11 +541,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
677 |
outputs=[
|
678 |
vote_a,
|
679 |
vote_b,
|
680 |
-
|
681 |
model_name_a,
|
682 |
model_name_b,
|
683 |
send_btn,
|
684 |
-
|
|
|
685 |
],
|
686 |
)
|
687 |
|
@@ -717,21 +582,20 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
717 |
critique_a,
|
718 |
score_b,
|
719 |
critique_b,
|
720 |
-
gr.update(
|
721 |
-
gr.update(
|
722 |
-
gr.update(
|
723 |
model_a,
|
724 |
model_b,
|
725 |
-
final_prompt,
|
726 |
gr.update(value="*Model: Hidden*"),
|
727 |
gr.update(value="*Model: Hidden*"),
|
728 |
-
# Change the button to "Regenerate" mode after evaluation
|
729 |
gr.update(
|
730 |
-
value="Regenerate
|
731 |
variant="secondary",
|
732 |
interactive=True
|
733 |
),
|
734 |
-
gr.update(
|
735 |
)
|
736 |
|
737 |
send_btn.click(
|
@@ -744,29 +608,29 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
744 |
critique_b,
|
745 |
vote_a,
|
746 |
vote_b,
|
747 |
-
|
748 |
model_a_state,
|
749 |
model_b_state,
|
750 |
final_prompt_state,
|
751 |
model_name_a,
|
752 |
model_name_b,
|
753 |
send_btn,
|
754 |
-
|
755 |
],
|
756 |
)
|
757 |
|
758 |
# Update the input change handlers to also disable regenerate button
|
759 |
-
def handle_input_changes(prompt, *variables):
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
|
771 |
# Update the change handlers for prompt and variables
|
772 |
#eval_prompt.change(
|
@@ -813,24 +677,62 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
813 |
random_btn.click(
|
814 |
fn=populate_random_example,
|
815 |
inputs=[],
|
816 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
817 |
)
|
818 |
|
819 |
# Add new input change handlers
|
820 |
def handle_input_change():
|
821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
822 |
|
823 |
# Update the change handlers for inputs
|
824 |
human_input.change(
|
825 |
fn=handle_input_change,
|
826 |
inputs=[],
|
827 |
-
outputs=[send_btn]
|
828 |
)
|
829 |
|
830 |
ai_response.change(
|
831 |
fn=handle_input_change,
|
832 |
inputs=[],
|
833 |
-
outputs=[send_btn]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
834 |
)
|
835 |
|
836 |
# Update the demo.load to include the random example population
|
|
|
4 |
from collections import defaultdict
|
5 |
from datetime import datetime, timezone
|
6 |
import hashlib
|
7 |
+
from typing import Dict, List
|
8 |
|
9 |
from dotenv import load_dotenv
|
10 |
|
11 |
load_dotenv()
|
12 |
|
13 |
import gradio as gr
|
14 |
+
from gen_api_answer import (
|
15 |
+
get_model_response,
|
16 |
+
parse_model_response,
|
17 |
+
get_random_human_ai_pair,
|
18 |
+
generate_ai_response
|
19 |
+
)
|
20 |
from db import add_vote, create_db_connection, get_votes
|
21 |
from utils import Vote
|
22 |
from common import (
|
|
|
32 |
EVAL_DESCRIPTION,
|
33 |
VOTING_HEADER,
|
34 |
)
|
35 |
+
from leaderboard import (
|
36 |
+
get_leaderboard,
|
37 |
+
get_leaderboard_stats,
|
38 |
+
calculate_elo_change,
|
39 |
+
get_model_rankings,
|
40 |
+
DEFAULT_ELO,
|
41 |
+
K_FACTOR
|
42 |
+
)
|
43 |
|
44 |
|
|
|
|
|
|
|
45 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
46 |
vote_counts = defaultdict(int)
|
47 |
|
|
|
153 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
154 |
|
155 |
|
156 |
+
def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
    """Build the message shown to the user after a vote.

    Fetches the current votes, builds the leaderboard with preliminary
    entries included, and compares the leaderboard positions of the two
    models with the user's choice.

    Args:
        choice: The vote; the callers in this file pass "A", "B" or "Tie".
        model_a: Name of the model behind judge A.
        model_b: Name of the model behind judge B.

    Returns:
        A message describing the tie, telling the user whether the vote
        agrees with the current ranking, or a neutral note when either
        model has no ranking entry.
    """
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)

    # Use None rather than 0 for a missing entry: a default of 0 would be
    # shown as "#0" and would always compare as ahead of every real position.
    pos_a = rankings.get(model_a)
    pos_b = rankings.get(model_b)

    footer = " \nYour votes shape the leaderboard, carry on voting responsibly :)"

    if pos_a is None or pos_b is None:
        return f"At least one of these models is not ranked on the leaderboard yet.{footer}"

    if choice == "Tie":
        return (
            f"It's a tie! Currently, {model_a} ranks #{pos_a} "
            f"and {model_b} ranks #{pos_b}.{footer}"
        )

    # Get chosen and rejected models based on vote
    if choice == "A":
        model_chosen, model_rejected = model_a, model_b
        pos_chosen, pos_rejected = pos_a, pos_b
    else:
        model_chosen, model_rejected = model_b, model_a
        pos_chosen, pos_rejected = pos_b, pos_a

    # A smaller position is treated as "ahead". Only an explicit "A" or "B"
    # vote can count as agreeing with the leaderboard.
    agrees_with_board = choice in ("A", "B") and pos_chosen < pos_rejected

    if agrees_with_board:
        return (
            f"You're in touch with the community! {model_chosen} ranks #{pos_chosen} "
            f"ahead of {model_rejected} in #{pos_rejected}.{footer}"
        )
    return (
        f"You don't think like everyone else ;) {model_chosen} ranks #{pos_chosen} "
        f"which is behind {model_rejected} in #{pos_rejected}.{footer}"
    )
|
178 |
+
|
179 |
+
|
180 |
def vote(
|
181 |
choice,
|
182 |
model_a,
|
|
|
226 |
store_vote_data(
|
227 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
228 |
)
|
229 |
+
|
230 |
+
# Generate vote message
|
231 |
+
message = get_vote_message(choice, model_a, model_b)
|
232 |
+
|
233 |
# Return updates for UI components
|
234 |
return [
|
235 |
+
gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
|
236 |
+
gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
|
237 |
+
gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
|
238 |
gr.update(value=f"*Model: {model_a}*"), # model_name_a
|
239 |
gr.update(value=f"*Model: {model_b}*"), # model_name_b
|
240 |
+
gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
|
241 |
+
gr.update(value="🎲 New round", variant="primary"), # random_btn
|
242 |
+
gr.Info(message, title = "🥳 Thanks for your vote!"), # success message
|
243 |
]
|
244 |
|
245 |
|
|
|
248 |
return get_votes(db)
|
249 |
|
250 |
|
251 |
+
# Update the refresh_leaderboard function
|
252 |
+
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard rows and the stats text from current votes.

    The votes are fetched once and passed to both ``get_leaderboard`` and
    ``get_leaderboard_stats``. Returns two ``gr.update`` objects: the table
    rows first, the stats markdown second.
    """
    voting_data = get_current_votes()
    board = get_leaderboard(model_data, voting_data, show_preliminary)

    rows = []
    for entry in board:
        rows.append(
            [
                entry["Model"],
                float(entry["ELO Score"]),  # stored as text; the table wants a number
                entry["95% CI"],
                entry["# Votes"],
                entry["Organization"],
                entry["License"],
            ]
        )

    summary = get_leaderboard_stats(model_data, voting_data)
    return [gr.update(value=rows), gr.update(value=summary)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
|
271 |
# Update the leaderboard table definition in the UI
|
|
|
275 |
)
|
276 |
|
277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
def populate_random_example(request: gr.Request):
    """Load a random human/AI pair and reset the judging widgets.

    Returns twelve updates in this order: human input, AI response, random
    button, score A, critique A, score B, critique B, vote A, vote B,
    vote tie, model name A, model name B. ``request`` is accepted but not
    used by the body.
    """
    human_msg, ai_msg = get_random_human_ai_pair()

    updates = [gr.update(value=human_msg), gr.update(value=ai_msg)]
    # Reset random button appearance
    updates.append(gr.update(value="🎲", variant="secondary"))
    # Clear score A, critique A, score B, critique B
    updates.extend(gr.update(value="") for _ in range(4))
    # Reset vote A, vote B and vote tie
    updates.extend(gr.update(interactive=False, variant="primary") for _ in range(3))
    # Reset model name A and model name B
    updates.extend(gr.update(value="*Model: Hidden*") for _ in range(2))
    return updates
|
295 |
|
296 |
|
|
|
306 |
|
307 |
with gr.Tabs():
|
308 |
with gr.TabItem("Judge Arena"):
|
|
|
309 |
with gr.Row():
|
310 |
# Left side - Input section
|
311 |
with gr.Column(scale=1):
|
312 |
with gr.Group():
|
313 |
human_input = gr.TextArea(
|
314 |
label="👩 Human Input",
|
315 |
+
lines=10,
|
316 |
placeholder="Enter the human message here..."
|
317 |
)
|
318 |
+
with gr.Row():
|
319 |
+
generate_btn = gr.Button(
|
320 |
+
"Generate AI Response",
|
321 |
+
size="sm",
|
322 |
+
interactive=False
|
323 |
+
)
|
324 |
|
325 |
ai_response = gr.TextArea(
|
326 |
label="🤖 AI Response",
|
327 |
+
lines=15,
|
328 |
placeholder="Enter the AI response here..."
|
329 |
)
|
330 |
|
331 |
+
with gr.Row():
|
332 |
+
random_btn = gr.Button("🎲", scale=2)
|
333 |
send_btn = gr.Button(
|
334 |
+
value="Run judges",
|
335 |
variant="primary",
|
336 |
+
size="lg",
|
337 |
+
scale=8
|
338 |
)
|
339 |
|
340 |
# Right side - Model outputs
|
|
|
345 |
with gr.Row():
|
346 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
347 |
score_a = gr.Textbox(label="Score", lines=6, interactive=False)
|
348 |
+
vote_a = gr.Button("Vote A", variant="primary", interactive=False)
|
349 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
350 |
critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
|
351 |
|
|
|
|
|
|
|
352 |
# Tie button row
|
353 |
+
with gr.Row() as tie_button_row:
|
354 |
with gr.Column():
|
355 |
+
vote_tie = gr.Button("Tie", variant="primary", interactive=False)
|
356 |
|
357 |
|
358 |
gr.Markdown("### 🧑⚖️ Judge B")
|
|
|
361 |
with gr.Row():
|
362 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
363 |
score_b = gr.Textbox(label="Score", lines=6, interactive=False)
|
364 |
+
vote_b = gr.Button("Vote B", variant="primary", interactive=False)
|
365 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
366 |
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
|
367 |
# Place Vote B button directly under Judge B
|
368 |
|
369 |
gr.Markdown("<br>")
|
370 |
|
371 |
+
# Add Evaluator Prompt Accordion
|
372 |
+
with gr.Accordion("📝 Evaluator Prompt", open=False):
|
373 |
+
gr.Markdown(f"```\n{DEFAULT_EVAL_PROMPT}\n```")
|
374 |
+
|
375 |
# Add spacing and acknowledgements at the bottom
|
376 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
377 |
|
|
|
390 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
391 |
)
|
392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
# Add change handler for checkbox
|
394 |
show_preliminary.change(
|
395 |
fn=refresh_leaderboard,
|
|
|
413 |
final_prompt_state = gr.State()
|
414 |
|
415 |
# Update variable inputs based on the eval prompt
|
416 |
+
#def update_variables(eval_prompt):
|
417 |
+
# variables = parse_variables(eval_prompt)
|
418 |
+
# updates = []
|
419 |
+
|
420 |
+
# for i in range(len(variable_rows)):
|
421 |
+
# var_row, var_input = variable_rows[i]
|
422 |
+
# if i < len(variables):
|
423 |
+
# var_name = variables[i]
|
424 |
+
# # Set the number of lines based on the variable name
|
425 |
+
# if var_name == "response":
|
426 |
+
# lines = 4 # Adjust this number as needed
|
427 |
+
# else:
|
428 |
+
# lines = 1 # Default to single line for other variables
|
429 |
+
# updates.extend(
|
430 |
+
# [
|
431 |
+
# gr.update(visible=True), # Show the variable row
|
432 |
+
# gr.update(
|
433 |
+
# label=var_name, visible=True, lines=lines
|
434 |
+
# ), # Update label and lines
|
435 |
+
# ]
|
436 |
+
# )
|
437 |
+
# else:
|
438 |
+
# updates.extend(
|
439 |
+
# [
|
440 |
+
# gr.update(visible=False), # Hide the variable row
|
441 |
+
# gr.update(value="", visible=False), # Clear value when hidden
|
442 |
+
# ]
|
443 |
+
# )
|
444 |
+
# return updates
|
445 |
|
446 |
#eval_prompt.change(
|
447 |
# fn=update_variables,
|
|
|
481 |
vote_a.click(
|
482 |
fn=vote,
|
483 |
inputs=[
|
484 |
+
gr.State("A"),
|
485 |
model_a_state,
|
486 |
model_b_state,
|
487 |
final_prompt_state,
|
|
|
493 |
outputs=[
|
494 |
vote_a,
|
495 |
vote_b,
|
496 |
+
vote_tie,
|
497 |
model_name_a,
|
498 |
model_name_b,
|
499 |
send_btn,
|
500 |
+
random_btn,
|
501 |
+
gr.State(), # placeholder for success message
|
502 |
],
|
503 |
)
|
504 |
|
505 |
vote_b.click(
|
506 |
fn=vote,
|
507 |
inputs=[
|
508 |
+
gr.State("B"),
|
509 |
model_a_state,
|
510 |
model_b_state,
|
511 |
final_prompt_state,
|
|
|
517 |
outputs=[
|
518 |
vote_a,
|
519 |
vote_b,
|
520 |
+
vote_tie,
|
521 |
model_name_a,
|
522 |
model_name_b,
|
523 |
send_btn,
|
524 |
+
random_btn,
|
525 |
+
gr.State(), # placeholder for success message
|
526 |
],
|
527 |
)
|
528 |
|
529 |
vote_tie.click(
|
530 |
fn=vote,
|
531 |
inputs=[
|
532 |
+
gr.State("Tie"),
|
533 |
model_a_state,
|
534 |
model_b_state,
|
535 |
final_prompt_state,
|
|
|
541 |
outputs=[
|
542 |
vote_a,
|
543 |
vote_b,
|
544 |
+
vote_tie,
|
545 |
model_name_a,
|
546 |
model_name_b,
|
547 |
send_btn,
|
548 |
+
random_btn,
|
549 |
+
gr.State(), # placeholder for success message
|
550 |
],
|
551 |
)
|
552 |
|
|
|
582 |
critique_a,
|
583 |
score_b,
|
584 |
critique_b,
|
585 |
+
gr.update(interactive=True, variant="primary"), # vote_a
|
586 |
+
gr.update(interactive=True, variant="primary"), # vote_b
|
587 |
+
gr.update(interactive=True, variant="primary"), # vote_tie
|
588 |
model_a,
|
589 |
model_b,
|
590 |
+
final_prompt,
|
591 |
gr.update(value="*Model: Hidden*"),
|
592 |
gr.update(value="*Model: Hidden*"),
|
|
|
593 |
gr.update(
|
594 |
+
value="Regenerate judges",
|
595 |
variant="secondary",
|
596 |
interactive=True
|
597 |
),
|
598 |
+
gr.update(value="🎲"), # random_btn
|
599 |
)
|
600 |
|
601 |
send_btn.click(
|
|
|
608 |
critique_b,
|
609 |
vote_a,
|
610 |
vote_b,
|
611 |
+
vote_tie,
|
612 |
model_a_state,
|
613 |
model_b_state,
|
614 |
final_prompt_state,
|
615 |
model_name_a,
|
616 |
model_name_b,
|
617 |
send_btn,
|
618 |
+
random_btn,
|
619 |
],
|
620 |
)
|
621 |
|
622 |
# Update the input change handlers to also disable regenerate button
|
623 |
+
# def handle_input_changes(prompt, *variables):
|
624 |
+
# """Enable send button and manage regenerate button based on input changes"""
|
625 |
+
# last_inputs = last_submission.value
|
626 |
+
# current_inputs = {"prompt": prompt, "variables": variables}
|
627 |
+
# inputs_changed = last_inputs != current_inputs
|
628 |
+
# return [
|
629 |
+
# gr.update(interactive=True), # send button always enabled
|
630 |
+
# gr.update(
|
631 |
+
# interactive=not inputs_changed
|
632 |
+
# ), # regenerate button disabled if inputs changed
|
633 |
+
# ]
|
634 |
|
635 |
# Update the change handlers for prompt and variables
|
636 |
#eval_prompt.change(
|
|
|
677 |
random_btn.click(
|
678 |
fn=populate_random_example,
|
679 |
inputs=[],
|
680 |
+
outputs=[
|
681 |
+
human_input,
|
682 |
+
ai_response,
|
683 |
+
random_btn,
|
684 |
+
score_a,
|
685 |
+
critique_a,
|
686 |
+
score_b,
|
687 |
+
critique_b,
|
688 |
+
vote_a,
|
689 |
+
vote_b,
|
690 |
+
vote_tie,
|
691 |
+
model_name_a,
|
692 |
+
model_name_b,
|
693 |
+
]
|
694 |
)
|
695 |
|
696 |
# Add new input change handlers
|
697 |
def handle_input_change():
    """Return the UI updates applied whenever an input text area changes.

    The list holds five updates in this order: vote_a, vote_b, vote_tie,
    send_btn, random_btn.
    """
    # The three vote buttons are made non-interactive.
    vote_locks = [gr.update(interactive=False) for _ in range(3)]
    send_reset = gr.update(value="Run judges", variant="primary")
    random_reset = gr.update(value="🎲", variant="secondary")
    return [*vote_locks, send_reset, random_reset]
|
706 |
|
707 |
# Update the change handlers for inputs
|
708 |
human_input.change(
|
709 |
fn=handle_input_change,
|
710 |
inputs=[],
|
711 |
+
outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
|
712 |
)
|
713 |
|
714 |
ai_response.change(
|
715 |
fn=handle_input_change,
|
716 |
inputs=[],
|
717 |
+
outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
|
718 |
+
)
|
719 |
+
|
720 |
+
generate_btn.click(
|
721 |
+
fn=lambda msg: (
|
722 |
+
generate_ai_response(msg)[0], # Only take the response text
|
723 |
+
gr.update(
|
724 |
+
value="Generate AI Response", # Keep the label
|
725 |
+
interactive=False # Disable the button
|
726 |
+
)
|
727 |
+
),
|
728 |
+
inputs=[human_input],
|
729 |
+
outputs=[ai_response, generate_btn]
|
730 |
+
)
|
731 |
+
|
732 |
+
human_input.change(
|
733 |
+
fn=lambda x: gr.update(interactive=bool(x.strip())),
|
734 |
+
inputs=[human_input],
|
735 |
+
outputs=[generate_btn]
|
736 |
)
|
737 |
|
738 |
# Update the demo.load to include the random example population
|