kaikaidai committed
Commit 7af825c
1 Parent(s): 2cb716b

Changed structure

Files changed (1):
  1. app.py +171 -371
app.py CHANGED
@@ -3,96 +3,40 @@ import json
3
  import gradio as gr
4
  import re
5
  import random
6
- import time
7
  from collections import defaultdict
8
- from functools import partial
9
- import openai
10
- from openai import OpenAI
11
- import anthropic
12
  import pandas as pd
13
- from together import Together
14
  import os
 
 
 
15
 
16
- anthropic_client = anthropic.Anthropic()
17
- openai_client = OpenAI()
18
- together_client = Together()
19
 
20
  # Model and ELO score data
21
- DEFAULT_ELO = 1000 # Starting ELO for new models
 
22
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
23
  vote_counts = defaultdict(int)
24
- model_data = {
25
- 'Meta Llama 3.1 70B Instruct Turbo': {
26
- 'organization': 'Meta',
27
- 'license': 'Open Source',
28
- 'api_model': 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
29
- },
30
- 'Meta Llama 3.1 405B Instruct Turbo': {
31
- 'organization': 'Meta',
32
- 'license': 'Open Source',
33
- 'api_model': 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
34
- },
35
- 'Gemma 2 27B': {
36
- 'organization': 'Google',
37
- 'license': 'Open Source',
38
- 'api_model': 'google/gemma-2-27b-it'
39
- },
40
- 'Gemma 2 9B': {
41
- 'organization': 'Google',
42
- 'license': 'Open Source',
43
- 'api_model': 'google/gemma-2-9b-it'
44
- },
45
- 'Qwen 2 Instruct (72B)': {
46
- 'organization': 'Alibaba',
47
- 'license': 'Open Source',
48
- 'api_model': 'Qwen/Qwen2-72B-Instruct'
49
- },
50
- 'Mistral (7B) Instruct v0.3': {
51
- 'organization': 'Mistral AI',
52
- 'license': 'Open Source',
53
- 'api_model': 'mistralai/Mistral-7B-Instruct-v0.3'
54
- },
55
- 'GPT-4o': {
56
- 'organization': 'OpenAI',
57
- 'license': 'Proprietary',
58
- 'api_model': 'gpt-4o'
59
- },
60
- 'GPT-4 Turbo': {
61
- 'organization': 'OpenAI',
62
- 'license': 'Proprietary',
63
- 'api_model': 'gpt-4-turbo'
64
- },
65
- 'GPT-3.5 Turbo': {
66
- 'organization': 'OpenAI',
67
- 'license': 'Proprietary',
68
- 'api_model': 'gpt-3.5-turbo'
69
- },
70
- 'Claude 3 Haiku': {
71
- 'organization': 'Anthropic',
72
- 'license': 'Proprietary',
73
- 'api_model': 'claude-3-haiku-20240307'
74
- },
75
- 'Claude 3 Sonnet': {
76
- 'organization': 'Anthropic',
77
- 'license': 'Proprietary',
78
- 'api_model': 'claude-3-sonnet-20240229'
79
- },
80
- 'Claude 3 Opus': {
81
- 'organization': 'Anthropic',
82
- 'license': 'Proprietary',
83
- 'api_model': 'claude-3-opus-20240229'
84
- },
85
- 'Meta Llama 3.1 8B Instruct Turbo': {
86
- 'organization': 'Meta',
87
- 'license': 'Open Source',
88
- 'api_model': 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
89
- },
90
- 'Meta Llama 3.1 70B Instruct Turbo': {
91
- 'organization': 'Meta',
92
- 'license': 'Open Source',
93
- 'api_model': 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
94
- },
95
- }
96
 
97
  current_session_id = 0
98
  voting_data = []
@@ -115,7 +59,7 @@ def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, ju
115
  }
116
  voting_data.append(vote_entry)
117
 
118
- # Optionally save to file after each vote
119
  with open('voting_data.json', 'w') as f:
120
  json.dump(voting_data, f, indent=2)
121
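Because voting_data.json is rewritten after every vote, the collected votes can be inspected offline at any point. A minimal sketch, assuming each stored entry carries a 'winner' field (the full vote_entry schema is not visible in this hunk):

```python
import json
from collections import Counter

# Offline inspection of voting_data.json, which store_vote_data rewrites after every vote.
# Assumes each entry has a 'winner' key; the exact schema isn't shown in this hunk.
with open('voting_data.json') as f:
    votes = json.load(f)

print(f"{len(votes)} votes recorded")
print(Counter(v.get('winner', 'unknown') for v in votes))
```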
 
@@ -133,56 +77,6 @@ def get_final_prompt(eval_prompt, variable_values):
133
  eval_prompt = eval_prompt.replace('{{' + var + '}}', val)
134
  return eval_prompt
135
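The eval prompt uses `{{variable}}` placeholders that get_final_prompt fills in with a plain string replace. A standalone sketch of that substitution, mirroring the replace loop shown above rather than importing it from app.py:

```python
# Standalone sketch of the {{variable}} substitution performed by get_final_prompt.
eval_prompt = "[User Query]: {{input}}\n[Response]: {{response}}"
variable_values = {
    "input": "Can you tell me the weather today?",
    "response": "It is 27 degrees Celsius today.",
}

final_prompt = eval_prompt
for var, val in variable_values.items():
    final_prompt = final_prompt.replace('{{' + var + '}}', val)

print(final_prompt)
# [User Query]: Can you tell me the weather today?
# [Response]: It is 27 degrees Celsius today.
```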
 
136
- # Add this near the top with other constants
137
- SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should be a JSON as follows: {{"feedback": "(write a feedback for the evaluation criteria)", "result": "(a score based on the evaluation criteria)"}}"""
138
-
139
- def get_openai_response(model_name, prompt):
140
- try:
141
- response = openai_client.chat.completions.create(
142
- model=model_name,
143
- messages=[
144
- {"role": "system", "content": SYSTEM_PROMPT},
145
- {"role": "user", "content": prompt}
146
- ]
147
- )
148
- return response.choices[0].message.content
149
- except Exception as e:
150
- return f"Error with OpenAI model {model_name}: {str(e)}"
151
-
152
- def get_anthropic_response(model_name, prompt):
153
- try:
154
- response = anthropic_client.messages.create(
155
- model=model_name,
156
- max_tokens=1000,
157
- temperature=0,
158
- system=SYSTEM_PROMPT,
159
- messages=[
160
- {"role": "user", "content": [{"type": "text", "text": prompt}]}
161
- ]
162
- )
163
- return response.content[0].text
164
- except Exception as e:
165
- return f"Error with Anthropic model {model_name}: {str(e)}"
166
-
167
- def get_model_response(model_name, prompt):
168
- model_info = model_data.get(model_name)
169
- if not model_info:
170
- return "Model not found or unsupported."
171
-
172
- api_model = model_info['api_model']
173
- organization = model_info['organization']
174
-
175
- try:
176
- if organization == 'OpenAI':
177
- return get_openai_response(api_model, prompt)
178
- elif organization == 'Anthropic':
179
- return get_anthropic_response(api_model, prompt)
180
- else:
181
- # All other organizations use Together API
182
- return get_together_response(api_model, prompt)
183
- except Exception as e:
184
- return f"Error with {organization} model {model_name}: {str(e)}"
185
-
186
  def submit_prompt(eval_prompt, *variable_values):
187
  try:
188
  variables = parse_variables(eval_prompt)
@@ -193,20 +87,19 @@ def submit_prompt(eval_prompt, *variable_values):
193
  model1, model2 = random.sample(models, 2)
194
  model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
195
 
196
- response_a = get_model_response(model_a, final_prompt)
197
- response_b = get_model_response(model_b, final_prompt)
198
 
199
  return (
200
- response_a, # response_a textbox
201
- response_b, # response_b textbox
202
- gr.update(visible=True), # action_buttons_row
203
- gr.update(visible=True), # regenerate_button
204
- model_a, # model_a_state
205
- model_b # model_b_state
206
  )
207
  except Exception as e:
208
  print(f"Error in submit_prompt: {str(e)}")
209
- # Return default values in case of error
210
  return (
211
  "Error generating response",
212
  "Error generating response",
@@ -220,7 +113,6 @@ def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
220
  # Update ELO scores based on user choice
221
  elo_a = elo_scores[model_a]
222
  elo_b = elo_scores[model_b]
223
- K = 32 # ELO K-factor
224
 
225
  # Calculate expected scores
226
  Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
@@ -235,8 +127,8 @@ def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
235
  Sa, Sb = 0.5, 0.5
236
 
237
  # Update scores and vote counts
238
- elo_scores[model_a] += K * (Sa - Ea)
239
- elo_scores[model_b] += K * (Sb - Eb)
240
  vote_counts[model_a] += 1
241
  vote_counts[model_b] += 1
242
 
@@ -252,6 +144,8 @@ def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
252
  regenerate_button: gr.update(visible=True, interactive=True)
253
  }
254
 
 
 
255
  def get_leaderboard():
256
  # Generate leaderboard data
257
  leaderboard = []
@@ -286,8 +180,8 @@ def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
286
  # Fallback to allowing previous models if necessary
287
  model1, model2 = random.sample(list(model_data.keys()), 2)
288
 
289
- response_a = get_model_response(model1, final_prompt)
290
- response_b = get_model_response(model2, final_prompt)
291
 
292
  # Parse the responses
293
  score_a, critique_a = parse_model_response(response_a)
@@ -305,10 +199,6 @@ def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
305
  model2 # model_b_state
306
  )
307
 
308
- # Add these constants at the top of your file
309
- K_FACTOR = 32 # Standard chess K-factor, adjust as needed
310
- DEFAULT_ELO = 1500 # Starting ELO for new models
311
-
312
  def calculate_elo_change(rating_a, rating_b, winner):
313
  """Calculate ELO rating changes for both players."""
314
  expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
@@ -400,72 +290,117 @@ leaderboard_table = gr.Dataframe(
400
  datatype=['str', 'number', 'str', 'number', 'str', 'str', 'str']
401
  )
402
 
403
- def get_together_response(model_name, prompt):
404
  try:
405
- response = together_client.chat.completions.create(
406
- model=model_name,
407
- messages=[
408
- {"role": "system", "content": SYSTEM_PROMPT},
409
- {"role": "user", "content": prompt}
410
- ],
411
- stream=False
412
- )
413
- return response.choices[0].message.content
414
  except Exception as e:
415
- return f"Error with Together model {model_name}: {str(e)}"
 
 
416
 
417
- def parse_model_response(response):
 
418
  try:
419
- # Parse JSON response
420
- data = json.loads(response)
421
- return data.get('result', 'N/A'), data.get('feedback', 'N/A')
422
- except:
423
- # If JSON parsing fails, return original response
424
- return 'Error', response
425
-
426
- with gr.Blocks(theme='default', css="""
427
- .prompt-row {
428
- align-items: flex-start !important;
429
- }
430
- .send-button-row {
431
- display: flex;
432
- justify-content: flex-end;
433
- margin-top: 8px;
434
- }
435
- """) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
436
  judge_id = gr.State(get_new_session_id())
437
- gr.Markdown("# Judge Arena")
438
- gr.Markdown("*Free LLM Evals to test your GenAI application.*")
439
 
440
  with gr.Tabs():
441
  with gr.TabItem("Judge Arena"):
442
- # Add introduction section with side-by-side rules and scoring
443
- gr.Markdown("""
444
- # How the Arena Works:
445
-
446
- ## Test two anonymous LLM judges side by side
447
- Try out different eval metrics - from simple hallucination detection to qualitative interpretations
448
- """)
449
 
450
  with gr.Row():
451
  with gr.Column():
452
- gr.Markdown("""
453
- ## 🤺 Battle Rules:
454
- - Both AIs stay anonymous - if either reveals its identity, the duel is void
455
- - Evaluate anything: coding, analysis, creative writing, math, or general knowledge
456
- """)
457
- with gr.Column():
458
- gr.Markdown("""
459
- ## 🧮 Scoring System:
460
- - Choose the LLM judge that is most aligned with your choice as a human
461
- - If both score the same - choose the critique that you prefer more!
462
- - Your votes shape our real-time leaderboard
463
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
- # Add divider heading
466
- gr.Markdown("""
467
- # Start Voting Now
468
- """)
469
 
470
  # Model Responses side-by-side
471
  with gr.Row():
@@ -479,163 +414,27 @@ with gr.Blocks(theme='default', css="""
479
  score_b = gr.Textbox(label="Score", interactive=False)
480
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
481
  model_name_b = gr.Markdown("*Model: Unknown*")
 
482
  # Initially hide vote buttons and regenerate button
483
  with gr.Row(visible=False) as action_buttons_row:
484
  vote_a = gr.Button("Choose A", variant="primary")
485
  vote_tie = gr.Button("Tie", variant="secondary")
486
  vote_b = gr.Button("Choose B", variant="primary")
487
  regenerate_button = gr.Button("Regenerate with different models", variant="secondary", visible=False)
488
- # Eval Prompt and Variables below
489
- with gr.Row(elem_classes="prompt-row"):
490
- eval_prompt = gr.TextArea(
491
- label="Eval Prompt",
492
- lines=1,
493
- value="""You are assessing a chat bot response to a user's input based on the helpfulness of the response.\n
494
-
495
- Score:
496
-
497
- A score of 1 means that the response's answer meets all of the evaluation criteria.
498
-
499
- A score of 0 means that the response's answer does not meet all of the evaluation criteria.
500
-
501
- Here is the data:\n
502
-
503
- [BEGIN DATA]
504
-
505
- ***
506
-
507
- [User Query]: {{input}}
508
-
509
- ***
510
-
511
- [Response]: {{response}}
512
-
513
- ***
514
-
515
- [END DATA]""",
516
- placeholder="Type your eval prompt here... denote variables like a ground truth response with {{variable}} to be populated below.",
517
- show_label=True,
518
- scale=8
519
- )
520
- with gr.Row(elem_classes="send-button-row"):
521
- send_btn = gr.Button(
522
- value="Send",
523
- variant="primary",
524
- size="lg",
525
- scale=1 # Make button larger
526
- )
527
- gr.Markdown("### Variable Mapping")
528
- # Create inputs for up to 5 variables, with first two visible by default
529
- variable_rows = []
530
- for i in range(5):
531
- # Set initial visibility True for first two rows (input and response)
532
- initial_visibility = True if i < 2 else False
533
- with gr.Row(visible=initial_visibility) as var_row:
534
- with gr.Column(scale=0.2, min_width=80):
535
- # Set initial labels for input and response
536
- initial_label = "**input:**" if i == 0 else "**response:**" if i == 1 else "Variable"
537
- var_label = gr.Markdown(initial_label)
538
- with gr.Column(scale=1):
539
- # Set initial values for input and response
540
- initial_value = "Hello! Can you tell me the weather today?" if i == 0 else \
541
- "Hi there! It is 27 degrees Celsius today. Would you like the weather for the week ahead?" if i == 1 else ""
542
- var_input = gr.Textbox(label="", container=False, value=initial_value)
543
- variable_rows.append((var_row, var_label, var_input))
544
 
545
  # Add spacing and acknowledgements at the bottom
546
- gr.Markdown("""
547
- <br><br><br>
548
- # Acknowledgements
549
-
550
- We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
551
-
552
- We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
553
- """)
554
 
555
  with gr.TabItem("Leaderboard"):
556
  refresh_button = gr.Button("Refresh")
 
557
  leaderboard_table = gr.Dataframe(
558
  headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
559
  datatype=['str', 'number', 'str', 'number', 'str', 'str']
560
  )
561
 
562
  with gr.TabItem("Policy"):
563
- gr.Markdown("""
564
- # About Atla
565
-
566
- Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
567
-
568
- # Our Mission
569
-
570
- By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
571
-
572
- # Judge Arena Policy
573
-
574
- ## Overview
575
-
576
- Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments and guidelines to ensure a fair, open, and collaborative environment for both users and model providers.
577
-
578
- ## Transparency
579
-
580
- - **Open-Source**: Judge Arena's code is open-source and available on GitHub. This approach allows anyone to review, replicate, or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
581
- - **Community Engagement**: We actively encourage contributions from the community. Feedback, code contributions, and discussions are welcome to improve the platform's functionality, fairness, and transparency.
582
- - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. This transparency ensures that our processes are understandable and reproducible by others.
583
- - **Data Sharing**: Periodically, we will share 20% of the collected evaluation data with the community. This data includes anonymized prompts, model responses, and aggregated evaluation results.
584
-
585
- ## Model Inclusion Criteria
586
-
587
- Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges), including but not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
588
-
589
- - **Judge Capability**: The model must possess the ability to score AND critique responses, content, or other models' outputs effectively.
590
- - **Adaptable:** The model must be promptable to evaluate in different scoring formats and against different criteria.
591
- - **Accessibility**:
592
- - **Public API Access**: Models accessible through public APIs without restrictive barriers.
593
- - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
594
-
595
- ## Evaluation Methodology
596
-
597
- - **User Participation**: Users run evaluations and select preferred model responses based on quality, relevance, and accuracy contributing to the model's overall rating.
598
- - **Blind Testing**: All model evaluations are conducted blindly. Users are not informed which model produced which response to eliminate bias.
599
- - **Data Collection**: We collect sufficient data to ensure statistical significance in our evaluations. We additionally show the 95% confidence interval in the leaderboard to provide a signal of reliability.
600
- - **Anomaly Detection**: We monitor user activity to detect and mitigate anomalous behavior or voting patterns that could skew results.
601
-
602
- ## Leaderboard Management
603
-
604
- - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
605
- - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks to allow for comprehensive community evaluation.
606
- - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
607
-
608
- ## Privacy and Data Protection
609
-
610
- - **Anonymization**: All shared data is anonymized to prevent the identification of individual users.
611
-
612
- ## Policy Updates and Communication
613
-
614
- - **Ongoing Revisions**: This policy may be updated to reflect changes in our practices or in response to community feedback.
615
- - **Notification of Changes**: Policy changes will be communicated to users and stakeholders on this page.
616
-
617
- # FAQ
618
-
619
- **Isn't this the same as Chatbot Arena?**
620
-
621
- - We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
622
-
623
- \n\n**Why should I trust this leaderboard?**
624
-
625
- - We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
626
-
627
- \n\n**Who funds this effort?**
628
-
629
- - Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
630
-
631
- \n\n**What is Atla working on?**
632
-
633
- - We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
634
-
635
- ## Get in touch
636
-
637
- Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!
638
- """)
639
 
640
  # Define state variables for model tracking
641
  model_a_state = gr.State()
@@ -646,17 +445,17 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
646
  variables = parse_variables(eval_prompt)
647
  updates = []
648
  for i in range(5):
649
- var_row, var_label, var_input = variable_rows[i]
650
  if i < len(variables):
651
  updates.extend([
652
  gr.update(visible=True), # var_row
653
- gr.update(value=f"**{variables[i]}:**"), # var_label
654
  gr.update(visible=True) # var_input
655
  ])
656
  else:
657
  updates.extend([
658
  gr.update(visible=False), # var_row
659
- gr.update(), # var_label
660
  gr.update(visible=False, value="") # var_input
661
  ])
662
  return updates
@@ -666,7 +465,7 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
666
  # Regenerate button functionality
667
  regenerate_button.click(
668
  fn=regenerate_prompt,
669
- inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, _, var_input in variable_rows],
670
  outputs=[
671
  score_a,
672
  critique_a,
@@ -687,15 +486,6 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
687
  # Store the last submitted prompt and variables for comparison
688
  last_submission = gr.State({})
689
 
690
- def handle_input_changes(prompt, *variables):
691
- """Enable send button and disable regenerate button if inputs have changed"""
692
- last_inputs = last_submission.value
693
- current_inputs = {"prompt": prompt, "variables": variables}
694
- inputs_changed = last_inputs != current_inputs
695
- return [
696
- gr.update(interactive=True), # Always keep send button enabled
697
- gr.update(visible=False) # Hide regenerate button when inputs change
698
- ]
699
 
700
  # Update the vote button click handlers
701
  vote_a.click(
@@ -731,7 +521,7 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
731
  score_b,
732
  critique_b,
733
  buttons_visible,
734
- gr.update(visible=False), # Hide regenerate button on new submission
735
  model_a,
736
  model_b,
737
  gr.update(value="*Model: Unknown*"),
@@ -740,7 +530,7 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
740
 
741
  send_btn.click(
742
  fn=submit_and_store,
743
- inputs=[eval_prompt] + [var_input for _, _, var_input in variable_rows],
744
  outputs=[
745
  score_a,
746
  critique_a,
@@ -757,31 +547,31 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
757
 
758
  # Update the input change handlers to also disable regenerate button
759
  def handle_input_changes(prompt, *variables):
760
- """Enable send button and disable regenerate button if inputs have changed"""
761
  last_inputs = last_submission.value
762
  current_inputs = {"prompt": prompt, "variables": variables}
763
  inputs_changed = last_inputs != current_inputs
764
  return [
765
- gr.update(interactive=inputs_changed), # send button
766
- gr.update(interactive=not inputs_changed) # regenerate button
767
  ]
768
 
769
  # Update the change handlers for prompt and variables
770
  eval_prompt.change(
771
  fn=handle_input_changes,
772
- inputs=[eval_prompt] + [var_input for _, _, var_input in variable_rows],
773
  outputs=[send_btn, regenerate_button]
774
  )
775
 
776
- for _, _, var_input in variable_rows:
777
  var_input.change(
778
  fn=handle_input_changes,
779
- inputs=[eval_prompt] + [var_input for _, _, var_input in variable_rows],
780
  outputs=[send_btn, regenerate_button]
781
  )
782
 
783
  # Update the leaderboard
784
- def update_leaderboard():
785
  leaderboard = get_leaderboard()
786
  data = [
787
  [
@@ -793,10 +583,20 @@ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or le
793
  entry['License']
794
  ] for entry in leaderboard
795
  ]
796
- return gr.update(value=data)
 
797
 
798
- refresh_button.click(fn=update_leaderboard, inputs=None, outputs=leaderboard_table)
799
-
800
- demo.launch()
3
  import gradio as gr
4
  import re
5
  import random
 
6
  from collections import defaultdict
7
  import pandas as pd
 
8
  import os
9
+ from dotenv import load_dotenv
10
+ from gen_api_answer import get_model_response
11
+ from common import *
12
 
13
+ load_dotenv()
 
 
14
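With the provider clients moved out of app.py, load_dotenv() is what makes locally stored API keys visible to the rest of the app. A minimal sketch of that wiring; the variable names below are the providers' conventional defaults and are an assumption, since this diff does not list them:

```python
import os
from dotenv import load_dotenv

# Reads key=value pairs from a local .env file into os.environ
# without overriding variables that are already set.
load_dotenv()

# Conventional provider defaults -- assumed, not specified by this diff.
for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "TOGETHER_API_KEY"):
    print(key, "set" if os.getenv(key) else "missing")
```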
 
15
  # Model and ELO score data
16
+ DEFAULT_ELO = 1500 # Starting ELO for new models
17
+ K_FACTOR = 32 # Standard chess K-factor, adjust as needed
18
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
19
  vote_counts = defaultdict(int)
20
+
21
+
22
+ # Load the model_data from JSONL
23
+ def load_model_data():
24
+ model_data = {}
25
+ try:
26
+ with open('data/models.jsonl', 'r') as f:
27
+ for line in f:
28
+ model = json.loads(line)
29
+ model_data[model['name']] = {
30
+ 'organization': model['organization'],
31
+ 'license': model['license'],
32
+ 'api_model': model['api_model']
33
+ }
34
+ except FileNotFoundError:
35
+ print("Warning: models.jsonl not found")
36
+ return {}
37
+ return model_data
38
+
39
+ model_data = load_model_data()
40
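load_model_data() expects data/models.jsonl to hold one JSON object per line with name, organization, license, and api_model fields. A short sketch of writing a record in that shape, reusing values from one entry of the old inline model_data dict; the append mode and file path are illustrative:

```python
import json

# One record per line in data/models.jsonl, matching the fields load_model_data() reads.
# Values mirror the old inline model_data entry for GPT-4o and are illustrative here.
record = {
    "name": "GPT-4o",
    "organization": "OpenAI",
    "license": "Proprietary",
    "api_model": "gpt-4o",
}
with open("data/models.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```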
 
41
  current_session_id = 0
42
  voting_data = []
 
59
  }
60
  voting_data.append(vote_entry)
61
 
62
+ # Save to file after each vote
63
  with open('voting_data.json', 'w') as f:
64
  json.dump(voting_data, f, indent=2)
65
 
 
77
  eval_prompt = eval_prompt.replace('{{' + var + '}}', val)
78
  return eval_prompt
79
80
  def submit_prompt(eval_prompt, *variable_values):
81
  try:
82
  variables = parse_variables(eval_prompt)
 
87
  model1, model2 = random.sample(models, 2)
88
  model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
89
 
90
+ response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
91
+ response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
92
 
93
  return (
94
+ response_a,
95
+ response_b,
96
+ gr.update(visible=True),
97
+ gr.update(visible=True),
98
+ model_a,
99
+ model_b
100
  )
101
  except Exception as e:
102
  print(f"Error in submit_prompt: {str(e)}")
 
103
  return (
104
  "Error generating response",
105
  "Error generating response",
 
113
  # Update ELO scores based on user choice
114
  elo_a = elo_scores[model_a]
115
  elo_b = elo_scores[model_b]
 
116
 
117
  # Calculate expected scores
118
  Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
 
127
  Sa, Sb = 0.5, 0.5
128
 
129
  # Update scores and vote counts
130
+ elo_scores[model_a] += K_FACTOR * (Sa - Ea)
131
+ elo_scores[model_b] += K_FACTOR * (Sb - Eb)
132
  vote_counts[model_a] += 1
133
  vote_counts[model_b] += 1
134
 
 
144
  regenerate_button: gr.update(visible=True, interactive=True)
145
  }
146
 
147
+
148
+
149
  def get_leaderboard():
150
  # Generate leaderboard data
151
  leaderboard = []
 
180
  # Fallback to allowing previous models if necessary
181
  model1, model2 = random.sample(list(model_data.keys()), 2)
182
 
183
+ response_a = get_model_response(model1, model_data.get(model1), final_prompt)
184
+ response_b = get_model_response(model2, model_data.get(model2), final_prompt)
185
 
186
  # Parse the responses
187
  score_a, critique_a = parse_model_response(response_a)
 
199
  model2 # model_b_state
200
  )
201
 
 
 
 
 
202
  def calculate_elo_change(rating_a, rating_b, winner):
203
  """Calculate ELO rating changes for both players."""
204
  expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
 
290
  datatype=['str', 'number', 'str', 'number', 'str', 'str', 'str']
291
  )
292
 
293
+ def parse_model_response(response):
294
  try:
295
+ # Debug print
296
+ print(f"Raw model response: {response}")
297
+
298
+ # First try to parse the entire response as JSON
299
+ try:
300
+ data = json.loads(response)
301
+ return str(data.get('result', 'N/A')), data.get('feedback', 'N/A')
302
+ except json.JSONDecodeError:
303
+ # If that fails (typically for smaller models), try to find JSON within the response
304
+ json_match = re.search(r'{.*}', response)
305
+ if json_match:
306
+ data = json.loads(json_match.group(0))
307
+ return str(data.get('result', 'N/A')), data.get('feedback', 'N/A')
308
+ else:
309
+ return 'Error', f"Failed to parse response: {response}"
310
+
311
  except Exception as e:
312
+ # Debug print for error case
313
+ print(f"Failed to parse response: {str(e)}")
314
+ return 'Error', f"Failed to parse response: {response}"
315
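parse_model_response first tries to parse the whole response as JSON and only then falls back to pulling the first {...} span out of surrounding prose; note the fallback pattern has no DOTALL flag, so it only catches JSON kept on a single line. A standalone sketch of both paths, using a simplified copy of the same logic so it can run without importing app.py:

```python
import json
import re

def parse_judge_output(response: str):
    """Simplified copy of the parse_model_response fallback above,
    kept standalone so it runs without importing app.py."""
    try:
        data = json.loads(response)              # whole response is JSON
    except json.JSONDecodeError:
        match = re.search(r'{.*}', response)     # single-line JSON inside prose
        if not match:
            return 'Error', f"Failed to parse response: {response}"
        data = json.loads(match.group(0))
    return str(data.get('result', 'N/A')), data.get('feedback', 'N/A')

print(parse_judge_output('{"feedback": "Clear and helpful.", "result": "1"}'))
print(parse_judge_output('Verdict: {"feedback": "Off-topic.", "result": "0"}'))
```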
 
316
+ def get_leaderboard_stats():
317
+ """Get summary statistics for the leaderboard."""
318
  try:
319
+ with open('voting_data.json', 'r') as f:
320
+ voting_data = json.load(f)
321
+
322
+ total_votes = len(voting_data)
323
+ total_models = len(model_data)
324
+ last_updated = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
325
+
326
+ return f"""
327
+ ### Leaderboard Stats
328
+ - **Total Models**: {total_models}
329
+ - **Total Votes**: {total_votes}
330
+ - **Last Updated**: {last_updated}
331
+ """
332
+ except FileNotFoundError:
333
+ return "No voting data available"
334
+
335
+ def initialize_voting_data():
336
+ """Initialize or clear the voting data file."""
337
+ empty_data = []
338
+ with open('voting_data.json', 'w') as f:
339
+ json.dump(empty_data, f)
340
+
341
+ # Add this near the start of your app initialization, before the Gradio interface setup
342
+ if __name__ == "__main__":
343
+ initialize_voting_data()
344
+
345
+ # ... rest of your Gradio app setup ...
346
+
347
+ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
348
  judge_id = gr.State(get_new_session_id())
349
+ gr.Markdown(MAIN_TITLE)
350
+ gr.Markdown(SUBTITLE)
351
 
352
  with gr.Tabs():
353
  with gr.TabItem("Judge Arena"):
354
+ gr.Markdown(HOW_IT_WORKS)
355
 
356
  with gr.Row():
357
  with gr.Column():
358
+ gr.Markdown(BATTLE_RULES)
359
+
360
+ # Add heading for Eval Prompt
361
+ gr.Markdown("\n")
362
+
363
+ # Eval Prompt and Variables side by side
364
+ with gr.Row():
365
+ # Left column - Eval Prompt
366
+ with gr.Column(scale=1):
367
+ eval_prompt = gr.TextArea(
368
+ label="Eval Prompt",
369
+ lines=1,
370
+ value=DEFAULT_EVAL_PROMPT,
371
+ placeholder="Type your eval prompt here... denote variables in {{curly brackets}} to be populated on the right.",
372
+ show_label=True
373
+ )
374
+
375
+ # Right column - Variable Mapping
376
+ with gr.Column(scale=1):
377
+ gr.Markdown("### Variable Mapping")
378
+ # Create inputs for up to 5 variables, with first two visible by default
379
+ variable_rows = []
380
+ for i in range(5):
381
+ initial_visibility = True if i < 2 else False
382
+ with gr.Group(visible=initial_visibility) as var_row:
383
+ # Variable input with direct label
384
+ initial_value = DEFAULT_INPUT if i == 0 else DEFAULT_RESPONSE
385
+ initial_label = "input" if i == 0 else "response" if i == 1 else f"variable_{i+1}"
386
+ var_input = gr.Textbox(
387
+ label=initial_label,
388
+ value=initial_value,
389
+ container=True
390
+ )
391
+ variable_rows.append((var_row, var_input))
392
+
393
+ # Send button
394
+ with gr.Row(elem_classes="send-button-row"):
395
+ send_btn = gr.Button(
396
+ value="Send",
397
+ variant="primary",
398
+ size="lg",
399
+ scale=1
400
+ )
401
 
402
+ # Add divider heading for model outputs
403
+ gr.Markdown(VOTING_HEADER)
 
 
404
 
405
  # Model Responses side-by-side
406
  with gr.Row():
 
414
  score_b = gr.Textbox(label="Score", interactive=False)
415
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
416
  model_name_b = gr.Markdown("*Model: Unknown*")
417
+
418
  # Initially hide vote buttons and regenerate button
419
  with gr.Row(visible=False) as action_buttons_row:
420
  vote_a = gr.Button("Choose A", variant="primary")
421
  vote_tie = gr.Button("Tie", variant="secondary")
422
  vote_b = gr.Button("Choose B", variant="primary")
423
  regenerate_button = gr.Button("Regenerate with different models", variant="secondary", visible=False)
424
 
425
  # Add spacing and acknowledgements at the bottom
426
+ gr.Markdown(ACKNOWLEDGEMENTS)
427
 
428
  with gr.TabItem("Leaderboard"):
429
  refresh_button = gr.Button("Refresh")
430
+ stats_display = gr.Markdown()
431
  leaderboard_table = gr.Dataframe(
432
  headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
433
  datatype=['str', 'number', 'str', 'number', 'str', 'str']
434
  )
435
 
436
  with gr.TabItem("Policy"):
437
+ gr.Markdown(POLICY_CONTENT)
438
 
439
  # Define state variables for model tracking
440
  model_a_state = gr.State()
 
445
  variables = parse_variables(eval_prompt)
446
  updates = []
447
  for i in range(5):
448
+ var_row, var_input = variable_rows[i]
449
  if i < len(variables):
450
  updates.extend([
451
  gr.update(visible=True), # var_row
452
+ gr.update(value=f"**{variables[i]}:**"), # var_input
453
  gr.update(visible=True) # var_input
454
  ])
455
  else:
456
  updates.extend([
457
  gr.update(visible=False), # var_row
458
+ gr.update(), # var_input
459
  gr.update(visible=False, value="") # var_input
460
  ])
461
  return updates
 
465
  # Regenerate button functionality
466
  regenerate_button.click(
467
  fn=regenerate_prompt,
468
+ inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, var_input in variable_rows],
469
  outputs=[
470
  score_a,
471
  critique_a,
 
486
  # Store the last submitted prompt and variables for comparison
487
  last_submission = gr.State({})
488
489
 
490
  # Update the vote button click handlers
491
  vote_a.click(
 
521
  score_b,
522
  critique_b,
523
  buttons_visible,
524
+ gr.update(visible=True), # Show regenerate button
525
  model_a,
526
  model_b,
527
  gr.update(value="*Model: Unknown*"),
 
530
 
531
  send_btn.click(
532
  fn=submit_and_store,
533
+ inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
534
  outputs=[
535
  score_a,
536
  critique_a,
 
547
 
548
  # Update the input change handlers to also disable regenerate button
549
  def handle_input_changes(prompt, *variables):
550
+ """Enable send button and manage regenerate button based on input changes"""
551
  last_inputs = last_submission.value
552
  current_inputs = {"prompt": prompt, "variables": variables}
553
  inputs_changed = last_inputs != current_inputs
554
  return [
555
+ gr.update(interactive=True), # send button always enabled
556
+ gr.update(interactive=not inputs_changed) # regenerate button disabled if inputs changed
557
  ]
558
 
559
  # Update the change handlers for prompt and variables
560
  eval_prompt.change(
561
  fn=handle_input_changes,
562
+ inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
563
  outputs=[send_btn, regenerate_button]
564
  )
565
 
566
+ for _, var_input in variable_rows:
567
  var_input.change(
568
  fn=handle_input_changes,
569
+ inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
570
  outputs=[send_btn, regenerate_button]
571
  )
572
 
573
  # Update the leaderboard
574
+ def refresh_leaderboard():
575
  leaderboard = get_leaderboard()
576
  data = [
577
  [
 
583
  entry['License']
584
  ] for entry in leaderboard
585
  ]
586
+ stats = get_leaderboard_stats()
587
+ return [gr.update(value=data), gr.update(value=stats)]
588
 
589
+ refresh_button.click(
590
+ fn=refresh_leaderboard,
591
+ inputs=None,
592
+ outputs=[leaderboard_table, stats_display]
593
+ )
594
 
595
+ # Add the load event at the very end, just before demo.launch()
596
+ demo.load(
597
+ fn=refresh_leaderboard,
598
+ inputs=None,
599
+ outputs=[leaderboard_table, stats_display]
600
+ )
601
 
602
+ demo.launch()