Update app.py

app.py CHANGED
@@ -31,6 +31,8 @@ from common import (
     BATTLE_RULES,
     EVAL_DESCRIPTION,
     VOTING_HEADER,
+    DEFAULT_EVAL_PROMPT_EDITABLE,
+    FIXED_EVAL_SUFFIX,
 )
 from leaderboard import (
     get_leaderboard,
@@ -153,8 +155,10 @@ def get_ip(request: gr.Request) -> str:
     return hashlib.sha256(ip.encode()).hexdigest()[:16]


-def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
-    """Generate appropriate message based on vote and model rankings."""
+def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
+    """Generate appropriate message based on vote and model rankings.
+    Returns (title, message) tuple."""
+    # Get current rankings
     voting_data = get_current_votes()
     leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
     rankings = get_model_rankings(leaderboard)
@@ -162,19 +166,13 @@ def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
     pos_b = rankings.get(model_b, 0)

     if choice == "Tie":
-        return
-
-    # Get chosen and rejected models based on vote
-    model_chosen = model_a if choice == "A" else model_b
-    model_rejected = model_b if choice == "A" else model_a
-    pos_chosen = pos_a if choice == "A" else pos_b
-    pos_rejected = pos_b if choice == "A" else pos_a
+        return "It's a tie!", "Keep voting responsibly 🤗"

     # Check if vote aligns with leaderboard
     if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
-        return
+        return "The favourite wins!", "Keep voting responsibly 🤗"
     else:
-        return
+        return "The underdog wins!", "Keep voting responsibly 🤗"


 def vote(
@@ -227,19 +225,38 @@ def vote(
         final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
     )

+    # Get model positions for display
+    voting_data = get_current_votes()
+    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
+    rankings = get_model_rankings(leaderboard)
+    pos_a = rankings.get(model_a, 0)
+    pos_b = rankings.get(model_b, 0)
+
+    # Format model names with positions and win/loss indicators
+    if choice == "Tie":
+        model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
+        model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
+    else:
+        winner = model_a if choice == "A" else model_b
+        loser = model_b if choice == "A" else model_a
+        winner_pos = pos_a if choice == "A" else pos_b
+        loser_pos = pos_b if choice == "A" else pos_a
+
+        model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
+        model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
+
     # Generate vote message
-    message = get_vote_message(choice, model_a, model_b)
+    title, message = get_vote_message(choice, model_a, model_b)

-    # Return updates for UI components
     return [
         gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
         gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
         gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
-        gr.update(value=
-        gr.update(value=
+        gr.update(value=model_a_display), # model_name_a
+        gr.update(value=model_b_display), # model_name_b
         gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
         gr.update(value="🎲 New round", variant="primary"), # random_btn
-        gr.Info(message, title
+        gr.Info(message, title=title), # success message
     ]

@@ -311,7 +328,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             with gr.Column(scale=1):
                 with gr.Group():
                     human_input = gr.TextArea(
-                        label="👩
+                        label="👩 User Input",
                         lines=10,
                         placeholder="Enter the human message here..."
                     )
@@ -368,12 +385,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

             gr.Markdown("<br>")

-            #
+            # Update Evaluator Prompt Accordion
             with gr.Accordion("📝 Evaluator Prompt", open=False):
-                gr.
-
-
-
+                eval_prompt_editable = gr.TextArea(
+                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
+                    label="Evaluation Criteria",
+                    lines=12
+                )
+                with gr.Row(visible=False) as edit_buttons_row: # Make buttons row initially hidden
+                    cancel_prompt_btn = gr.Button("Cancel")
+                    save_prompt_btn = gr.Button("Save", variant="primary")
+                gr.Markdown("*The sample being evaluated is always appended as:*")
+                gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

         with gr.TabItem("Leaderboard"):
             with gr.Row():
@@ -406,11 +429,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

         with gr.TabItem("Policy"):
             gr.Markdown(POLICY_CONTENT)
+            gr.Markdown(ACKNOWLEDGEMENTS)

     # Define state variables for model tracking
     model_a_state = gr.State()
     model_b_state = gr.State()
     final_prompt_state = gr.State()
+    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
+    is_editing = gr.State(False) # Track editing state

     # Update variable inputs based on the eval prompt
     #def update_variables(eval_prompt):
@@ -550,12 +576,50 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         ],
     )

-    #
-    def
-
-
+    # Add handlers for save/cancel buttons
+    def save_prompt(new_prompt, previous_prompt):
+        return [
+            gr.update(value=new_prompt), # Update the prompt
+            new_prompt, # Update the previous prompt state
+            gr.update(visible=False) # Hide the buttons
+        ]
+
+    def cancel_prompt(previous_prompt):
+        return [
+            gr.update(value=previous_prompt), # Revert to previous prompt
+            previous_prompt, # Keep the previous prompt state
+            gr.update(visible=False) # Hide the buttons
+        ]
+
+    def show_edit_buttons(current_value, previous_value):
+        # Show buttons only if the current value differs from the previous value
+        return gr.update(visible=current_value != previous_value)
+
+    # Add handlers for save/cancel buttons and prompt changes
+    save_prompt_btn.click(
+        fn=save_prompt,
+        inputs=[eval_prompt_editable, eval_prompt_previous],
+        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
+    )
+
+    cancel_prompt_btn.click(
+        fn=cancel_prompt,
+        inputs=[eval_prompt_previous],
+        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
+    )
+
+    eval_prompt_editable.change(
+        fn=show_edit_buttons,
+        inputs=[eval_prompt_editable, eval_prompt_previous],
+        outputs=edit_buttons_row
+    )

-
+    # Update the submit function to combine editable and fixed parts
+    def submit_and_store(editable_prompt, *variables):
+        # Combine the editable prompt with fixed suffix
+        full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
+
+        # Get the responses using the full prompt
         (
             response_a,
             response_b,
@@ -564,18 +628,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             model_a,
             model_b,
             final_prompt,
-        ) = submit_prompt(
+        ) = submit_prompt(full_prompt, *variables)

         # Parse the responses
         score_a, critique_a = parse_model_response(response_a)
         score_b, critique_b = parse_model_response(response_b)

-        #
-
-
+        # Only append "/ 5" if using the default prompt
+        if editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
+            score_a = f"{score_a} / 5"
+            score_b = f"{score_b} / 5"

         # Update the last_submission state with the current values
-        last_submission.value =
+        last_submission.value = {"prompt": full_prompt, "variables": variables}

         return (
             score_a,
@@ -598,9 +663,10 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             gr.update(value="🎲"), # random_btn
         )

+    # Update the click handler to use the editable prompt
     send_btn.click(
         fn=submit_and_store,
-        inputs=[
+        inputs=[eval_prompt_editable, human_input, ai_response],
         outputs=[
             score_a,
             critique_a,