kaikaidai committed on
Commit: 65ba9f3
1 Parent(s): 0b4911a

Likert-5 by default with Prometheus prompt template
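
The diff below removes the inline Prometheus prompt from submit_and_store: the handler now just assembles a prompt_data dict plus a use_reference flag for get_model_response, and Prometheus judges are parsed with the newly imported prometheus_parse_model_response. That parser's body is not shown in this diff; a minimal sketch of what it presumably does, given the "Feedback: ... [RESULT] <integer 1-5>" format the template enforces (names and logic here are illustrative assumptions, not the repo's actual gen_api_answer.py code):

import re

def prometheus_parse_model_response(output: str):
    # Hypothetical sketch: Prometheus-style judges answer
    # "Feedback: <critique> [RESULT] <integer 1-5>", so split the critique
    # from the integer score on the [RESULT] marker.
    match = re.search(r"\[RESULT\]\s*([1-5])", output)
    score = match.group(1) if match else "N/A"
    critique = output.split("[RESULT]")[0].replace("Feedback:", "", 1).strip()
    return score, critique

Whichever parser runs, app.py now appends " / 5" to the parsed score, which is the Likert-5 default referred to in the commit message.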

Files changed (1): app.py (+100, -111)
app.py CHANGED
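For reference, the absolute-grading template that judges now receive by default is the f-string removed from app.py below; it is presumably filled from the new prompt_data dict inside get_model_response. A hypothetical helper showing that mapping (the constant and function names are illustrative, not the repo's actual code):

PROMETHEUS_ABSOLUTE_TEMPLATE = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other openings, closings, or explanations.

###The instruction to evaluate:
{human_input}

###Response to evaluate:
{ai_response}

###Reference Answer (Score 5):
{ground_truth_input}

###Score Rubrics:
[{eval_criteria}]
Score 1: {score1_desc}
Score 2: {score2_desc}
Score 3: {score3_desc}
Score 4: {score4_desc}
Score 5: {score5_desc}

###Feedback:
"""


def build_prometheus_prompt(prompt_data: dict) -> str:
    # Hypothetical helper: prompt_data carries exactly the keys assembled in
    # submit_and_store(); when use_reference is False, the "###Reference Answer"
    # section would presumably be omitted.
    return PROMETHEUS_ABSOLUTE_TEMPLATE.format(**prompt_data)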
@@ -2,9 +2,8 @@ import json
 import re
 import random
 from collections import defaultdict
-from datetime import datetime, timezone
+from datetime import datetime
 import hashlib
-from typing import Dict, List
 
 from dotenv import load_dotenv
 
@@ -14,7 +13,7 @@ import gradio as gr
 from gen_api_answer import (
     get_model_response,
     parse_model_response,
-    alternative_parse_model_response
+    prometheus_parse_model_response
 )
 
 from random_sample_generation import (
@@ -27,15 +26,12 @@ from utils import Vote
 from common import (
     POLICY_CONTENT,
     ACKNOWLEDGEMENTS,
-    DEFAULT_EVAL_PROMPT,
-    DEFAULT_INPUT,
-    DEFAULT_RESPONSE,
     CSS_STYLES,
     MAIN_TITLE,
     HOW_IT_WORKS,
-    BATTLE_RULES,
-    EVAL_DESCRIPTION,
-    VOTING_HEADER,
+)
+from prompts import (
+    DEFAULT_EVAL_PROMPT,
     DEFAULT_EVAL_PROMPT_EDITABLE,
     FIXED_EVAL_SUFFIX,
     DEFAULT_EVAL_CRITERIA,
@@ -48,7 +44,6 @@ from common import (
 from leaderboard import (
     get_leaderboard,
     get_leaderboard_stats,
-    calculate_elo_change,
     get_model_rankings,
     DEFAULT_ELO,
     K_FACTOR
@@ -85,9 +80,11 @@ def load_model_data():
 model_data = load_model_data()
 
 def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
+    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
+
     vote = Vote(
         timestamp=datetime.now().isoformat(),
-        prompt=prompt,
+        prompt=prompt_value,
         response_a=response_a,
         response_b=response_b,
         model_a=model_a,
@@ -416,13 +413,13 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     # Replace the "Edit Judge Prompt" Accordion section with:
     with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
         gr.Markdown("<br>")
-        compatible_mode_toggle = gr.Checkbox(
-            label="Use a prompt compatible with Prometheus models",
+        use_reference_toggle = gr.Checkbox(
+            label="Use a reference response",
             value=False
         )
 
-        # Default prompt editor
-        with gr.Column(visible=True) as default_prompt_editor:
+        # Hide the default prompt editor
+        with gr.Column(visible=False) as default_prompt_editor:
             eval_prompt_editable = gr.TextArea(
                 value=DEFAULT_EVAL_PROMPT_EDITABLE,
                 label="Evaluation Criteria",
@@ -435,8 +432,8 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             gr.Markdown("*The sample being evaluated is always appended as:*")
             gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
 
-        # Compatible mode editor
-        with gr.Column(visible=False) as compatible_prompt_editor:
+        # Show the compatible mode editor
+        with gr.Column(visible=True) as compatible_prompt_editor:
            with gr.Row():
                 # Left column - Evaluation Criteria
                 with gr.Column(scale=1):
@@ -447,8 +444,8 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                         placeholder="Enter the evaluation criteria..."
                     )
                     prometheus_reference = gr.Markdown(
-                        "<br> *This enforces the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
-                        visible=False  # Initially hidden
+                        "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
+                        visible=True
                     )
 
                 # Right column - Score Descriptions
@@ -658,89 +655,80 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     )
 
     # Function to toggle visibility based on compatible mode
-    def toggle_compatible_mode(checked):
-        return {
-            ground_truth: gr.update(visible=checked),
-            default_prompt_editor: gr.update(visible=not checked),
-            compatible_prompt_editor: gr.update(visible=checked),
-            prometheus_reference: gr.update(visible=checked),
-        }
-
-    compatible_mode_toggle.change(
-        fn=toggle_compatible_mode,
-        inputs=[compatible_mode_toggle],
+    def toggle_use_reference(checked):
+        if checked:
+            # Get new random samples with ground truth when enabling reference mode
+            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
+            return {
+                ground_truth: gr.update(visible=True, value=ground_truth_msg),
+                human_input: gr.update(value=human_msg),
+                ai_response: gr.update(value=ai_msg),
+                # Reset other UI elements
+                score_a: gr.update(value=""),
+                critique_a: gr.update(value=""),
+                score_b: gr.update(value=""),
+                critique_b: gr.update(value=""),
+                vote_a: gr.update(interactive=False, variant="primary"),
+                vote_b: gr.update(interactive=False, variant="primary"),
+                vote_tie: gr.update(interactive=False, variant="primary"),
+                model_name_a: gr.update(value="*Model: Hidden*"),
+                model_name_b: gr.update(value="*Model: Hidden*"),
+                random_btn: gr.update(value="🎲", variant="secondary"),
+            }
+        else:
+            # Just hide ground truth when disabling reference mode
+            return {
+                ground_truth: gr.update(visible=False)
+            }
+
+    # Update the change handler to include all necessary outputs
+    use_reference_toggle.change(
+        fn=toggle_use_reference,
+        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
-           default_prompt_editor,
-           compatible_prompt_editor,
-           prometheus_reference,
+           human_input,
+           ai_response,
+           score_a,
+           critique_a,
+           score_b,
+           critique_b,
+           vote_a,
+           vote_b,
+           vote_tie,
+           model_name_a,
+           model_name_b,
+           random_btn,
        ]
    )
 
-    # Update the submit function to handle compatible mode
+    # Update the submit function to handle different prompts
     def submit_and_store(
-        compatible_mode,
-        editable_prompt,
+        use_reference,
+        eval_criteria_text_input,
         human_input,
         ai_response,
         ground_truth_input,
-        eval_criteria_text_input,
-        score1_desc,
-        score2_desc,
-        score3_desc,
-        score4_desc,
-        score5_desc,
+        score1_description,
+        score2_description,
+        score3_description,
+        score4_description,
+        score5_description,
    ):
-        if compatible_mode:
-            # Build the prompt using the new format
-            prompt = f"""###Task Description:
-An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
-1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
-2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
-3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
-4. Please do not generate any other openings, closings, or explanations.
-
-###The instruction to evaluate:
-{human_input}
-
-###Response to evaluate:
-{ai_response}
-
-###Reference Answer (Score 5):
-{ground_truth_input}
-
-###Score Rubrics:
-[{eval_criteria_text_input}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}
-
-###Feedback:
-"""
-            final_prompt = prompt
-            use_alternative_prompt = True
-        else:
-            # Combine the editable prompt with fixed suffix
-            full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
-            # Replace variables in the eval prompt
-            variable_values = {'input': human_input, 'response': ai_response}
-            final_prompt = get_final_prompt(full_prompt, variable_values)
-            use_alternative_prompt = False
-
-        # Filter models based on compatible mode
-        if compatible_mode:
-            # Include all models when compatible mode is enabled
-            models = list(model_data.keys())
-        else:
-            # Exclude Prometheus models when not in compatible mode
-            models = [
-                model_name for model_name in model_data.keys()
-                if model_data[model_name]["organization"] != "Prometheus"
-            ]
+        # Build prompt data dictionary
+        prompt_data = {
+            'human_input': human_input,
+            'ai_response': ai_response,
+            'ground_truth_input': ground_truth_input,
+            'eval_criteria': eval_criteria_text_input,
+            'score1_desc': score1_description,
+            'score2_desc': score2_description,
+            'score3_desc': score3_description,
+            'score4_desc': score4_description,
+            'score5_desc': score5_description,
+        }
 
-        # Select two models randomly from the filtered list
+        models = list(model_data.keys())
        model1, model2 = random.sample(models, 2)
        model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
 
@@ -748,31 +736,33 @@ Score 5: {score5_desc}
         response_a = get_model_response(
             model_a,
             model_data.get(model_a),
-            final_prompt,
-            use_alternative_prompt=use_alternative_prompt
+            prompt_data,
+            use_reference=use_reference
         )
         response_b = get_model_response(
             model_b,
             model_data.get(model_b),
-            final_prompt,
-            use_alternative_prompt=use_alternative_prompt
+            prompt_data,
+            use_reference=use_reference
         )
 
-        # Parse the responses based on mode
-        if compatible_mode:
-            score_a_val, critique_a_val = alternative_parse_model_response(response_a)
-            score_b_val, critique_b_val = alternative_parse_model_response(response_b)
+        # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
+        is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
+        is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
+
+        if is_prometheus_a:
+            score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
+            score_a_val = f"{score_a_val} / 5"
         else:
             score_a_val, critique_a_val = parse_model_response(response_a)
-            score_b_val, critique_b_val = parse_model_response(response_b)
-
-        # Only append "/ 5" if using the default prompt
-        if not compatible_mode and editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
             score_a_val = f"{score_a_val} / 5"
-            score_b_val = f"{score_b_val} / 5"
 
-        # Update the last_submission state
-        last_submission.value = {"prompt": final_prompt, "variables": [human_input, ai_response]}
+        if is_prometheus_b:
+            score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
+        else:
+            score_b_val, critique_b_val = parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
 
         return (
             score_a_val,
@@ -784,7 +774,7 @@ Score 5: {score5_desc}
             gr.update(interactive=True, variant="primary"),  # vote_tie
             model_a,
             model_b,
-            final_prompt,
+            eval_prompt,
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
             gr.update(value="Regenerate judges", variant="secondary", interactive=True),
@@ -795,12 +785,11 @@ Score 5: {score5_desc}
     send_btn.click(
         fn=submit_and_store,
         inputs=[
-            compatible_mode_toggle,
-            eval_prompt_editable,
+            use_reference_toggle,
+            eval_criteria_text,
             human_input,
             ai_response,
             ground_truth,
-            eval_criteria_text,
             score1_description,
             score2_description,
             score3_description,
@@ -828,7 +817,7 @@ Score 5: {score5_desc}
     # Add random button handler
     random_btn.click(
         fn=populate_random_example,
-        inputs=[compatible_mode_toggle],  # Use compatible mode toggle to decide behavior
+        inputs=[use_reference_toggle],  # Use compatible mode toggle to decide behavior
         outputs=[
             human_input,
             ai_response,