Likert-5 by default with Prometheus prompt template
app.py CHANGED
@@ -2,9 +2,8 @@ import json
 import re
 import random
 from collections import defaultdict
-from datetime import datetime
+from datetime import datetime
 import hashlib
-from typing import Dict, List
 
 from dotenv import load_dotenv
 
@@ -14,7 +13,7 @@ import gradio as gr
 from gen_api_answer import (
     get_model_response,
     parse_model_response,
-
+    prometheus_parse_model_response
 )
 
 from random_sample_generation import (
@@ -27,15 +26,12 @@ from utils import Vote
 from common import (
     POLICY_CONTENT,
     ACKNOWLEDGEMENTS,
-    DEFAULT_EVAL_PROMPT,
-    DEFAULT_INPUT,
-    DEFAULT_RESPONSE,
     CSS_STYLES,
     MAIN_TITLE,
     HOW_IT_WORKS,
-
-
-
+)
+from prompts import (
+    DEFAULT_EVAL_PROMPT,
     DEFAULT_EVAL_PROMPT_EDITABLE,
     FIXED_EVAL_SUFFIX,
     DEFAULT_EVAL_CRITERIA,
@@ -48,7 +44,6 @@ from common import (
 from leaderboard import (
     get_leaderboard,
     get_leaderboard_stats,
-    calculate_elo_change,
     get_model_rankings,
     DEFAULT_ELO,
     K_FACTOR
@@ -85,9 +80,11 @@ def load_model_data():
 model_data = load_model_data()
 
 def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
+    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
+
     vote = Vote(
         timestamp=datetime.now().isoformat(),
-        prompt=
+        prompt=prompt_value,
         response_a=response_a,
         response_b=response_b,
         model_a=model_a,
@@ -416,13 +413,13 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     # Replace the "Edit Judge Prompt" Accordion section with:
     with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
         gr.Markdown("<br>")
-
-            label="Use a
+        use_reference_toggle = gr.Checkbox(
+            label="Use a reference response",
             value=False
         )
 
-        #
-        with gr.Column(visible=
+        # Hide the default prompt editor
+        with gr.Column(visible=False) as default_prompt_editor:
             eval_prompt_editable = gr.TextArea(
                 value=DEFAULT_EVAL_PROMPT_EDITABLE,
                 label="Evaluation Criteria",
@@ -435,8 +432,8 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             gr.Markdown("*The sample being evaluated is always appended as:*")
             gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
 
-        #
-        with gr.Column(visible=
+        # Show the compatible mode editor
+        with gr.Column(visible=True) as compatible_prompt_editor:
             with gr.Row():
                 # Left column - Evaluation Criteria
                 with gr.Column(scale=1):
@@ -447,8 +444,8 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                        placeholder="Enter the evaluation criteria..."
                    )
                    prometheus_reference = gr.Markdown(
-                        "<br> *
-                        visible=
+                        "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
+                        visible=True
                    )
 
                # Right column - Score Descriptions
@@ -658,89 +655,80 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     )
 
     # Function to toggle visibility based on compatible mode
-    def
-
-
-
-
-
-
-
-
-
-
+    def toggle_use_reference(checked):
+        if checked:
+            # Get new random samples with ground truth when enabling reference mode
+            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
+            return {
+                ground_truth: gr.update(visible=True, value=ground_truth_msg),
+                human_input: gr.update(value=human_msg),
+                ai_response: gr.update(value=ai_msg),
+                # Reset other UI elements
+                score_a: gr.update(value=""),
+                critique_a: gr.update(value=""),
+                score_b: gr.update(value=""),
+                critique_b: gr.update(value=""),
+                vote_a: gr.update(interactive=False, variant="primary"),
+                vote_b: gr.update(interactive=False, variant="primary"),
+                vote_tie: gr.update(interactive=False, variant="primary"),
+                model_name_a: gr.update(value="*Model: Hidden*"),
+                model_name_b: gr.update(value="*Model: Hidden*"),
+                random_btn: gr.update(value="🎲", variant="secondary"),
+            }
+        else:
+            # Just hide ground truth when disabling reference mode
+            return {
+                ground_truth: gr.update(visible=False)
+            }
+
+    # Update the change handler to include all necessary outputs
+    use_reference_toggle.change(
+        fn=toggle_use_reference,
+        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
-
-
-
+            human_input,
+            ai_response,
+            score_a,
+            critique_a,
+            score_b,
+            critique_b,
+            vote_a,
+            vote_b,
+            vote_tie,
+            model_name_a,
+            model_name_b,
+            random_btn,
        ]
    )
 
-    # Update the submit function to handle
+    # Update the submit function to handle different prompts
    def submit_and_store(
-
-
+        use_reference,
+        eval_criteria_text_input,
        human_input,
        ai_response,
        ground_truth_input,
-
-
-
-
-
-        score5_desc,
+        score1_description,
+        score2_description,
+        score3_description,
+        score4_description,
+        score5_description,
    ):
-
-
-
-
-
-
-
-
-
-
-
-
-###Response to evaluate:
-{ai_response}
-
-###Reference Answer (Score 5):
-{ground_truth_input}
-
-###Score Rubrics:
-[{eval_criteria_text_input}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}
-
-###Feedback:
-"""
-            final_prompt = prompt
-            use_alternative_prompt = True
-        else:
-            # Combine the editable prompt with fixed suffix
-            full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
-            # Replace variables in the eval prompt
-            variable_values = {'input': human_input, 'response': ai_response}
-            final_prompt = get_final_prompt(full_prompt, variable_values)
-            use_alternative_prompt = False
-
-        # Filter models based on compatible mode
-        if compatible_mode:
-            # Include all models when compatible mode is enabled
-            models = list(model_data.keys())
-        else:
-            # Exclude Prometheus models when not in compatible mode
-            models = [
-                model_name for model_name in model_data.keys()
-                if model_data[model_name]["organization"] != "Prometheus"
-            ]
+        # Build prompt data dictionary
+        prompt_data = {
+            'human_input': human_input,
+            'ai_response': ai_response,
+            'ground_truth_input': ground_truth_input,
+            'eval_criteria': eval_criteria_text_input,
+            'score1_desc': score1_description,
+            'score2_desc': score2_description,
+            'score3_desc': score3_description,
+            'score4_desc': score4_description,
+            'score5_desc': score5_description,
+        }
 
-
+        models = list(model_data.keys())
        model1, model2 = random.sample(models, 2)
        model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
 
@@ -748,31 +736,33 @@ Score 5: {score5_desc}
        response_a = get_model_response(
            model_a,
            model_data.get(model_a),
-
-
+            prompt_data,
+            use_reference=use_reference
        )
        response_b = get_model_response(
            model_b,
            model_data.get(model_b),
-
-
+            prompt_data,
+            use_reference=use_reference
        )
 
-        # Parse the responses based on
-
-
-
+        # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
+        is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
+        is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
+
+        if is_prometheus_a:
+            score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
+            score_a_val = f"{score_a_val} / 5"
        else:
            score_a_val, critique_a_val = parse_model_response(response_a)
-            score_b_val, critique_b_val = parse_model_response(response_b)
-
-        # Only append "/ 5" if using the default prompt
-        if not compatible_mode and editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
            score_a_val = f"{score_a_val} / 5"
-            score_b_val = f"{score_b_val} / 5"
 
-
-
+        if is_prometheus_b:
+            score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
+        else:
+            score_b_val, critique_b_val = parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
 
        return (
            score_a_val,
@@ -784,7 +774,7 @@ Score 5: {score5_desc}
            gr.update(interactive=True, variant="primary"), # vote_tie
            model_a,
            model_b,
-
+            eval_prompt,
            gr.update(value="*Model: Hidden*"),
            gr.update(value="*Model: Hidden*"),
            gr.update(value="Regenerate judges", variant="secondary", interactive=True),
@@ -795,12 +785,11 @@ Score 5: {score5_desc}
    send_btn.click(
        fn=submit_and_store,
        inputs=[
-
-
+            use_reference_toggle,
+            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
-            eval_criteria_text,
            score1_description,
            score2_description,
            score3_description,
@@ -828,7 +817,7 @@ Score 5: {score5_desc}
    # Add random button handler
    random_btn.click(
        fn=populate_random_example,
-        inputs=[
+        inputs=[use_reference_toggle],  # Use compatible mode toggle to decide behavior
        outputs=[
            human_input,
            ai_response,
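A note on the prompt refactor: the deleted block in the `@@ -658,89 +655,80 @@` hunk shows that the Prometheus grading prompt used to be assembled inline inside `submit_and_store`; this commit replaces that with a `prompt_data` dictionary handed to `get_model_response(..., use_reference=...)`, with the templates themselves now behind the new `prompts` import. The sketch below is only an illustration of how such a template could be filled from `prompt_data`; the helper name `build_prometheus_prompt` and the instruction preamble are assumptions, not the code actually shipped in `gen_api_answer.py` or `prompts.py`.

```python
# Illustrative sketch only (assumed helper, not the repo's actual implementation):
# filling a Prometheus-style absolute-grading template from the prompt_data dict
# that submit_and_store now builds. The section headers mirror the template that
# the removed inline code used; the instruction preamble is an assumption.
def build_prometheus_prompt(prompt_data: dict, use_reference: bool) -> str:
    reference_block = (
        f"\n###Reference Answer (Score 5):\n{prompt_data['ground_truth_input']}\n"
        if use_reference
        else ""
    )
    return (
        f"###The instruction to evaluate:\n{prompt_data['human_input']}\n"
        f"\n###Response to evaluate:\n{prompt_data['ai_response']}\n"
        f"{reference_block}"
        f"\n###Score Rubrics:\n[{prompt_data['eval_criteria']}]\n"
        f"Score 1: {prompt_data['score1_desc']}\n"
        f"Score 2: {prompt_data['score2_desc']}\n"
        f"Score 3: {prompt_data['score3_desc']}\n"
        f"Score 4: {prompt_data['score4_desc']}\n"
        f"Score 5: {prompt_data['score5_desc']}\n"
        "\n###Feedback:"
    )
```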
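On parsing: `submit_and_store` now branches on the judge's organization, sending Prometheus judges through the newly imported `prometheus_parse_model_response` and everything else through the existing `parse_model_response`, then renders both scores as `<score> / 5`. The parser itself is not part of this diff; a minimal sketch, assuming the common Prometheus output convention of free-text feedback followed by `[RESULT] <score>`, could look like:

```python
import re

# Minimal sketch, assuming "<feedback> [RESULT] <1-5>" output; this is not the
# actual prometheus_parse_model_response defined in gen_api_answer.py.
def parse_prometheus_output(output: str) -> tuple[str, str]:
    match = re.search(r"\[RESULT\]\s*([1-5])", output)
    if match:
        score = match.group(1)
        critique = output[: match.start()].strip()
        return score, critique
    # No score marker found: surface the raw text so the UI still shows something
    return "N/A", output.strip()
```

Under that assumption, `parse_prometheus_output("The answer is well grounded. [RESULT] 5")` returns `("5", "The answer is well grounded.")`.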
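The new `toggle_use_reference` handler leans on a Gradio behaviour worth calling out: when an event function returns a dictionary keyed by components, only the components present in the dictionary are updated, even though every possible target still has to be listed in `outputs`. That is why the `else` branch can return just `{ground_truth: gr.update(visible=False)}` while the checkbox's `change` event declares thirteen outputs. A stripped-down, self-contained illustration of the same pattern (the component names here are placeholders, not the app's real ones):

```python
import gradio as gr

# Self-contained illustration of the dict-return pattern used by toggle_use_reference.
# Only the components included in the returned dict are updated.
with gr.Blocks() as sketch_demo:
    use_reference = gr.Checkbox(label="Use a reference response", value=False)
    reference_box = gr.TextArea(label="Reference response", visible=False)
    input_box = gr.TextArea(label="Human input")

    def on_toggle(checked):
        if checked:
            # Show the reference box and refresh the input alongside it
            return {
                reference_box: gr.update(visible=True, value="(sample reference)"),
                input_box: gr.update(value="(sample input)"),
            }
        # Hide the reference box and leave the other output untouched
        return {reference_box: gr.update(visible=False)}

    use_reference.change(fn=on_toggle, inputs=[use_reference],
                         outputs=[reference_box, input_box])

if __name__ == "__main__":
    sketch_demo.launch()
```

The same mechanism is what lets the app's handler reset the vote buttons and score panels only when reference mode is switched on, while the "off" path touches nothing but the ground-truth box.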