app.py CHANGED
@@ -10,7 +10,7 @@ from dotenv import load_dotenv
 load_dotenv()
 
 import gradio as gr
-from gen_api_answer import get_model_response, parse_model_response
+from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
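The new import, get_random_human_ai_pair, comes from gen_api_answer, whose implementation is not part of this diff. A minimal sketch of what such a helper might look like, assuming it simply draws one (human, AI) message pair from a hard-coded sample pool (the pool contents below are invented for illustration):

import random

SAMPLE_PAIRS = [
    ("What causes rainbows?",
     "Rainbows appear when sunlight is refracted and reflected inside water droplets."),
    ("Summarize the water cycle in one sentence.",
     "Water evaporates, condenses into clouds, and returns to the surface as precipitation."),
]

def get_random_human_ai_pair():
    # Return one (human_message, ai_message) tuple at random
    return random.choice(SAMPLE_PAIRS)

It is used further down in populate_random_example to refill the two input boxes.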
@@ -195,11 +195,12 @@ def vote(
 
     # Return updates for UI components
     return [
-        gr.update(visible=False),  # action_buttons_row
+        gr.update(visible=False),  # vote_a
+        gr.update(visible=False),  # vote_b
+        gr.update(visible=False),  # tie_button_row
         gr.update(value=f"*Model: {model_a}*"),  # model_name_a
         gr.update(value=f"*Model: {model_b}*"),  # model_name_b
-        gr.update(interactive=True),  # send_btn
-        gr.update(visible=True, interactive=True),  # regenerate_button
+        gr.update(interactive=True, value="Run the evaluators", variant="primary"),  # send_btn
     ]
 
 
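Gradio maps a handler's returned list positionally onto its outputs= list, which is why each gr.update(...) above carries a comment naming the component it targets; reordering the list silently retargets the updates. A self-contained illustration of the pattern (component names here are invented for the demo):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Vote")
    label_a = gr.Markdown("*Model: Hidden*")
    label_b = gr.Markdown("*Model: Hidden*")

    def reveal():
        # One update per output component, in the same order as outputs=[...]
        return [
            gr.update(visible=False),           # btn
            gr.update(value="*Model: Alpha*"),  # label_a
            gr.update(value="*Model: Beta*"),   # label_b
        ]

    btn.click(fn=reveal, inputs=[], outputs=[btn, label_a, label_b])

demo.launch()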
@@ -283,41 +284,6 @@ def get_leaderboard(show_preliminary=True):
     return leaderboard
 
 
-def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
-    variables = parse_variables(eval_prompt)
-    variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
-    final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
-
-    # Get available models excluding the previous ones
-    available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]
-
-    # If we have enough models for new pairs
-    if len(available_models) >= 2:
-        model1, model2 = random.sample(available_models, 2)
-    else:
-        # Fallback to allowing previous models if necessary
-        model1, model2 = random.sample(list(model_data.keys()), 2)
-
-    response_a = get_model_response(model1, model_data.get(model1), final_prompt)
-    response_b = get_model_response(model2, model_data.get(model2), final_prompt)
-
-    # Parse the responses
-    score_a, critique_a = parse_model_response(response_a)
-    score_b, critique_b = parse_model_response(response_b)
-
-    return (
-        score_a,  # score_a textbox
-        critique_a,  # critique_a textbox
-        score_b,  # score_b textbox
-        critique_b,  # critique_b textbox
-        gr.update(visible=True),  # action_buttons_row
-        gr.update(value="*Model: Hidden*"),  # model_name_a
-        gr.update(value="*Model: Hidden*"),  # model_name_b
-        model1,  # model_a_state
-        model2,  # model_b_state
-    )
-
-
 def calculate_elo_change(rating_a, rating_b, winner):
     """Calculate ELO rating changes for both players."""
     expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
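The context line above is the standard Elo expected-score formula. The body of calculate_elo_change beyond that line is not shown in this diff; a sketch of a conventional full update consistent with it, assuming a K-factor of 32 and a winner value of "a", "b", or "tie" (both assumptions, not taken from the source):

def calculate_elo_change(rating_a, rating_b, winner, k=32):
    """Sketch of a standard Elo update (K and winner encoding are assumed)."""
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a
    score_a = {"a": 1.0, "b": 0.0, "tie": 0.5}[winner]  # actual result for A
    score_b = 1.0 - score_a
    return k * (score_a - expected_a), k * (score_b - expected_b)

# Example: two 1500-rated models, A wins -> (+16.0, -16.0)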
@@ -413,138 +379,120 @@ def get_leaderboard_stats():
     """
 
 
-def set_example_metric(metric_name):
-    if metric_name == "Custom":
-        variables = parse_variables(DEFAULT_EVAL_PROMPT)
-        variable_values = []
-        for var in variables:
-            if var == "input":
-                variable_values.append(DEFAULT_INPUT)
-            elif var == "response":
-                variable_values.append(DEFAULT_RESPONSE)
-            else:
-                variable_values.append("")  # Default empty value
+#def set_example_metric(metric_name):
+#    if metric_name == "Custom":
+#        variables = parse_variables(DEFAULT_EVAL_PROMPT)
+#        variable_values = []
+#        for var in variables:
+#            if var == "input":
+#                variable_values.append(DEFAULT_INPUT)
+#            elif var == "response":
+#                variable_values.append(DEFAULT_RESPONSE)
+#            else:
+#                variable_values.append("")  # Default empty value
         # Pad variable_values to match the length of variable_rows
-        while len(variable_values) < len(variable_rows):
-            variable_values.append("")
-        return [DEFAULT_EVAL_PROMPT] + variable_values
-
-    metric_data = EXAMPLE_METRICS[metric_name]
-    variables = parse_variables(metric_data["prompt"])
-    variable_values = []
-    for var in variables:
-        value = metric_data.get(var, "")  # Default to empty string if not found
-        variable_values.append(value)
+#        while len(variable_values) < len(variable_rows):
+#            variable_values.append("")
+#        return [DEFAULT_EVAL_PROMPT] + variable_values
+
+#    metric_data = EXAMPLE_METRICS[metric_name]
+#    variables = parse_variables(metric_data["prompt"])
+#    variable_values = []
+#    for var in variables:
+#        value = metric_data.get(var, "")  # Default to empty string if not found
+#        variable_values.append(value)
     # Pad variable_values to match the length of variable_rows
-    while len(variable_values) < len(variable_rows):
-        variable_values.append("")
-    return [metric_data["prompt"]] + variable_values
+#    while len(variable_values) < len(variable_rows):
+#        variable_values.append("")
+#    return [metric_data["prompt"]] + variable_values
 
 
 # Select random metric at startup
-def get_random_metric():
-    metrics = list(EXAMPLE_METRICS.keys())
-    return set_example_metric(random.choice(metrics))
+# def get_random_metric():
+#     metrics = list(EXAMPLE_METRICS.keys())
+#     return set_example_metric(random.choice(metrics))
+
+
+def populate_random_example(request: gr.Request):
+    """Generate a random human-AI conversation example."""
+    human_msg, ai_msg = get_random_human_ai_pair()
+    return [
+        gr.update(value=human_msg),
+        gr.update(value=ai_msg)
+    ]
 
 
 with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     gr.Markdown(MAIN_TITLE)
     gr.Markdown(HOW_IT_WORKS)
+
+    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
+    eval_prompt = gr.Textbox(
+        value=DEFAULT_EVAL_PROMPT,
+        visible=False
+    )
 
     with gr.Tabs():
         with gr.TabItem("Judge Arena"):
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown(BATTLE_RULES)
-
-            # Add Example Metrics Section
-            with gr.Accordion("Evaluator Prompt Templates", open=False):
-                with gr.Row():
-                    custom_btn = gr.Button("Custom", variant="secondary")
-                    hallucination_btn = gr.Button("Hallucination")
-                    precision_btn = gr.Button("Precision")
-                    recall_btn = gr.Button("Recall")
-                    coherence_btn = gr.Button("Logical coherence")
-                    faithfulness_btn = gr.Button("Faithfulness")
-
-            # Eval Prompt and Variables side by side
             with gr.Row():
-                # Left
+                # Left side - Input section
                 with gr.Column(scale=1):
-                    gr.
-                        default_label = (
-                            "input" if i == 0
-                            else "ground_truth" if i == 1
-                            else "response" if i == 2
-                            else ""
-                        )
-                        default_value = (
-                            EXAMPLE_METRICS["Hallucination"]["input"] if i == 0
-                            else EXAMPLE_METRICS["Hallucination"]["ground_truth"] if i == 1
-                            else EXAMPLE_METRICS["Hallucination"]["response"] if i == 2
-                            else ""
+                    random_btn = gr.Button("🎲", scale=0)
+                    with gr.Group():
+                        human_input = gr.TextArea(
+                            label="👩 Human Input",
+                            lines=8,
+                            placeholder="Enter the human message here..."
+                        )
+
+                        ai_response = gr.TextArea(
+                            label="🤖 AI Response",
+                            lines=8,
+                            placeholder="Enter the AI response here..."
+                        )
+
+                    with gr.Row(elem_classes="send-button-row"):
+                        send_btn = gr.Button(
+                            value="Run the evaluators",
+                            variant="primary",
+                            size="lg"
                         )
-                        var_input = gr.Textbox(
-                            container=True,
-                            label=default_label,
-                            value=default_value
-                        )
-                        variable_rows.append((var_row, var_input))
-
-            # Send button
-            with gr.Row(elem_classes="send-button-row"):
-                send_btn = gr.Button(
-                    value="Test the evaluators", variant="primary", size="lg", scale=1
-                )
-
-            # Add divider heading for model outputs
-            gr.Markdown(VOTING_HEADER)
 
-
-
-
-                    gr.Markdown("###
-
-
+                # Right side - Model outputs
+                with gr.Column(scale=1):
+                    gr.Markdown("<br>")
+                    gr.Markdown("\n### 👩‍⚖️ Judge A")
+                    with gr.Group():
+                        with gr.Row():
+                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
+                                score_a = gr.Textbox(label="Score", interactive=False)
+                                vote_a = gr.Button("Vote A", variant="primary", visible=False)
+                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
+                                critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
                     model_name_a = gr.Markdown("*Model: Hidden*")
-
-
-
-
+
+                    # Add spacing between judges
+                    gr.Markdown("<br>")
+
+                    # Center the Tie button between judges
+                    with gr.Row(visible=False) as tie_button_row:
+                        with gr.Column():
+                            vote_tie = gr.Button("Tie", variant="secondary")
+                    gr.Markdown("<br>")
+
+                    gr.Markdown("### 👩‍⚖️ Judge B")
+                    with gr.Group():
+                        with gr.Row():
+                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
+                                score_b = gr.Textbox(label="Score", interactive=False)
+                                vote_b = gr.Button("Vote B", variant="primary", visible=False)
+                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
+                                critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
                     model_name_b = gr.Markdown("*Model: Hidden*")
-
-
-            with gr.Row(visible=False) as action_buttons_row:
-                vote_a = gr.Button("Choose A", variant="primary")
-                vote_tie = gr.Button("Tie", variant="secondary")
-                vote_b = gr.Button("Choose B", variant="primary")
-            regenerate_button = gr.Button(
-                "Regenerate with different models", variant="secondary", visible=False
-            )
-
+                    # Place Vote B button directly under Judge B
+
             gr.Markdown("<br>")
 
-            # Add evaluation tips
-            gr.Markdown(EVAL_DESCRIPTION)
-
             # Add spacing and acknowledgements at the bottom
             gr.Markdown(ACKNOWLEDGEMENTS)
 
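One detail worth noting in the new code: populate_random_example declares a request: gr.Request parameter, yet the handler that wires it up (further down) passes inputs=[]. Gradio treats parameters annotated with gr.Request specially and injects the request object itself, outside the inputs mapping. A minimal standalone example of that mechanism:

import gradio as gr

def whoami(request: gr.Request):
    # `request` is injected by Gradio, not supplied via inputs=[]
    return f"Request from {request.client.host}"

with gr.Blocks() as demo:
    btn = gr.Button("Who am I?")
    out = gr.Textbox(label="Client")
    btn.click(fn=whoami, inputs=[], outputs=[out])

demo.launch()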
@@ -634,29 +582,30 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         )
         return updates
 
-    eval_prompt.change(
-        fn=update_variables,
-        inputs=eval_prompt,
-        outputs=[item for sublist in variable_rows for item in sublist],
-    )
+    #eval_prompt.change(
+    #    fn=update_variables,
+    #    inputs=eval_prompt,
+    #    outputs=[item for sublist in variable_rows for item in sublist],
+    #)
 
     # Regenerate button functionality
-    regenerate_button.click(
-        fn=regenerate_prompt,
-        inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, var_input in variable_rows],
-        outputs=[
-            score_a,
-            critique_a,
-            score_b,
-            critique_b,
-            action_buttons_row,
-            model_name_a,
-            model_name_b,
-            model_a_state,
-            model_b_state,
-        ],
-    )
+    #regenerate_button.click(
+    #    fn=regenerate_prompt,
+    #    inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
+    #    outputs=[
+    #        score_a,
+    #        critique_a,
+    #        score_b,
+    #        critique_b,
+    #        vote_a,
+    #        vote_b,
+    #        tie_button_row,
+    #        model_name_a,
+    #        model_name_b,
+    #        model_a_state,
+    #        model_b_state,
+    #    ],
+    #)
 
     # Update model names after responses are generated
     def update_model_names(model_a, model_b):
@@ -681,11 +630,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
         ],
     )
 
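This six-element outputs list is repeated verbatim in the next two hunks, for the other two vote buttons. A possible tidy-up, not present in the source, would hoist the shared list into one variable so the three handlers cannot drift apart:

# Hypothetical refactor: the list repeated in all three vote handlers, defined once
vote_outputs = [
    vote_a,
    vote_b,
    tie_button_row,
    model_name_a,
    model_name_b,
    send_btn,
]
# ...then each of vote_a.click, vote_tie.click, and vote_b.click can use outputs=vote_outputs.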
@@ -702,11 +652,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
         ],
     )
 
@@ -723,11 +674,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
         ],
     )
 
@@ -759,32 +711,39 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_a,
             score_b,
             critique_b,
-            gr.update(visible=True),  # action_buttons_row
-            gr.update(
-                visible=True, interactive=True
-            ),  # Show and enable regenerate button
+            gr.update(visible=True),  # vote_a
+            gr.update(visible=True),  # vote_b
+            gr.update(visible=True),  # tie_button_row
             model_a,
             model_b,
             final_prompt,  # Add final_prompt to state
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
+            # Change the button to "Regenerate" mode after evaluation
+            gr.update(
+                value="Regenerate with different models",
+                variant="secondary",
+                interactive=True
+            ),
         )
 
     send_btn.click(
         fn=submit_and_store,
-        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+        inputs=[eval_prompt, human_input, ai_response],
         outputs=[
             score_a,
             critique_a,
             score_b,
             critique_b,
-            action_buttons_row,
-            regenerate_button,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_a_state,
             model_b_state,
-            final_prompt_state,
+            final_prompt_state,
             model_name_a,
             model_name_b,
+            send_btn,
         ],
     )
 
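The new wiring removes the separate regenerate button entirely: send_btn appears in its own outputs list, so the click handler can restyle the trigger into a "Regenerate with different models" button once results arrive, and the .change() handlers in the last hunk flip it back. A minimal sketch of this mode-switching pattern:

import gradio as gr

with gr.Blocks() as demo:
    send_btn = gr.Button("Run the evaluators", variant="primary")
    result = gr.Textbox(label="Result")

    def run():
        # Do the work, then flip the trigger button into "regenerate" mode
        return (
            "scores ready",
            gr.update(value="Regenerate with different models",
                      variant="secondary", interactive=True),
        )

    # The button is both trigger and output, so one click can restyle it
    send_btn.click(fn=run, inputs=[], outputs=[result, send_btn])

demo.launch()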
@@ -802,44 +761,68 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     ]
 
     # Update the change handlers for prompt and variables
-    eval_prompt.change(
-        fn=handle_input_changes,
-        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-        outputs=[send_btn, regenerate_button],
-    )
-
-    for _, var_input in variable_rows:
-        var_input.change(
-            fn=handle_input_changes,
-            inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-            outputs=[send_btn, regenerate_button],
-        )
+    #eval_prompt.change(
+    #    fn=handle_input_changes,
+    #    inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    #    outputs=[send_btn, regenerate_button],
+    #)
+
+    # for _, var_input in variable_rows:
+    #     var_input.change(
+    #         fn=handle_input_changes,
+    #         inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    #         outputs=[send_btn, regenerate_button],
+    #     )
 
     # Add click handlers for metric buttons
-    outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
+    #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
 
-    custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
+    #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
 
-    hallucination_btn.click(
-        fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
-    )
+    #hallucination_btn.click(
+    #    fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
+    #)
 
-    precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
+    #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
 
-    recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
+    #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
 
-    coherence_btn.click(
-        fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
-    )
+    #coherence_btn.click(
+    #    fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
+    #)
 
-    faithfulness_btn.click(
-        fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
-    )
+    #faithfulness_btn.click(
+    #    fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
+    #)
 
     # Set default metric at startup
     demo.load(
-        fn=lambda: set_example_metric("Hallucination"),
-        outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+        #fn=lambda: set_example_metric("Hallucination"),
+        #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    )
+
+    # Add random button handler
+    random_btn.click(
+        fn=populate_random_example,
+        inputs=[],
+        outputs=[human_input, ai_response]
+    )
+
+    # Add new input change handlers
+    def handle_input_change():
+        return gr.update(value="Run the evaluators", variant="primary")
+
+    # Update the change handlers for inputs
+    human_input.change(
+        fn=handle_input_change,
+        inputs=[],
+        outputs=[send_btn]
+    )
+
+    ai_response.change(
+        fn=handle_input_change,
+        inputs=[],
+        outputs=[send_btn]
     )
 
 if __name__ == "__main__":