tobydrane committed on
Commit
0136a5b
1 Parent(s): 3f0d906
Files changed (7)
  1. app.py +409 -292
  2. common.py +23 -33
  3. db.py +33 -0
  4. example_metrics.py +113 -0
  5. gen_api_answer.py +24 -21
  6. requirements.txt +6 -5
  7. utils.py +27 -0
app.py CHANGED
@@ -1,13 +1,32 @@
1
- from datetime import datetime
2
  import json
3
- import gradio as gr
4
  import re
5
  import random
6
  from collections import defaultdict
7
- import pandas as pd
8
- import os
 
 
 
 
 
9
  from gen_api_answer import get_model_response, parse_model_response
10
- from common import *
 
 
 
11
 
12
  # Model and ELO score data
13
  DEFAULT_ELO = 1500 # Starting ELO for new models
@@ -15,65 +34,73 @@ K_FACTOR = 32 # Standard chess K-factor, adjust as needed
15
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
16
  vote_counts = defaultdict(int)
17
 
 
 
 
 
 
18
 
19
  # Load the model_data from JSONL
20
  def load_model_data():
21
  model_data = {}
22
  try:
23
- with open('data/models.jsonl', 'r') as f:
24
  for line in f:
25
  model = json.loads(line)
26
- model_data[model['name']] = {
27
- 'organization': model['organization'],
28
- 'license': model['license'],
29
- 'api_model': model['api_model']
30
  }
31
  except FileNotFoundError:
32
  print("Warning: models.jsonl not found")
33
  return {}
34
  return model_data
35
 
 
36
  model_data = load_model_data()
37
 
38
  current_session_id = 0
39
- voting_data = []
40
 
41
  def get_new_session_id():
42
  global current_session_id
43
  current_session_id += 1
44
  return f"user{current_session_id}"
45
 
 
46
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
47
- vote_entry = {
48
- "timestamp": datetime.now().isoformat(),
49
- "prompt": prompt,
50
- "response_a": response_a,
51
- "response_b": response_b,
52
- "model_a": model_a,
53
- "model_b": model_b,
54
- "winner": winner,
55
- "judge_id": judge_id,
56
- }
57
- voting_data.append(vote_entry)
58
-
59
- # Save to file after each vote
60
- with open('voting_data.json', 'w') as f:
61
- json.dump(voting_data, f, indent=2)
62
 
63
  def parse_variables(prompt):
64
  # Extract variables enclosed in double curly braces
65
- variables = re.findall(r'{{(.*?)}}', prompt)
66
  # Remove duplicates while preserving order
67
  seen = set()
68
- variables = [x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))]
 
 
69
  return variables
70
 
 
71
  def get_final_prompt(eval_prompt, variable_values):
72
  # Replace variables in the eval prompt with their values
73
  for var, val in variable_values.items():
74
- eval_prompt = eval_prompt.replace('{{' + var + '}}', val)
75
  return eval_prompt
76
 
 
77
  def submit_prompt(eval_prompt, *variable_values):
78
  try:
79
  variables = parse_variables(eval_prompt)
@@ -93,7 +120,8 @@ def submit_prompt(eval_prompt, *variable_values):
93
  gr.update(visible=True),
94
  gr.update(visible=True),
95
  model_a,
96
- model_b
 
97
  )
98
  except Exception as e:
99
  print(f"Error in submit_prompt: {str(e)}")
@@ -103,10 +131,22 @@ def submit_prompt(eval_prompt, *variable_values):
103
  gr.update(visible=False),
104
  gr.update(visible=False),
105
  None,
106
- None
 
107
  )
108
 
109
- def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
 
 
 
110
  # Update ELO scores based on user choice
111
  elo_a = elo_scores[model_a]
112
  elo_b = elo_scores[model_b]
@@ -116,9 +156,9 @@ def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
116
  Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
117
 
118
  # Assign actual scores
119
- if choice == 'A':
120
  Sa, Sb = 1, 0
121
- elif choice == 'B':
122
  Sa, Sb = 0, 1
123
  else:
124
  Sa, Sb = 0.5, 0.5
@@ -129,39 +169,103 @@ def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
129
  vote_counts[model_a] += 1
130
  vote_counts[model_b] += 1
131
 
132
- # Store the vote data
133
- store_vote_data(prompt, response_a, response_b, model_a, model_b, choice, judge_id)
 
 
 
134
 
135
  # Return updates for UI components
136
- return {
137
- action_buttons_row: gr.update(visible=False),
138
- model_name_a: gr.update(value=f"*Model: {model_a}*"),
139
- model_name_b: gr.update(value=f"*Model: {model_b}*"),
140
- send_btn: gr.update(interactive=True),
141
- regenerate_button: gr.update(visible=True, interactive=True)
142
- }
 
143
 
 
 
 
144
 
145
 
146
  def get_leaderboard():
 
 
 
147
  # Generate leaderboard data
148
  leaderboard = []
149
- for model, elo in elo_scores.items():
150
- votes = vote_counts[model]
151
- ci = 1.96 * (400 / (votes + 1) ** 0.5) # Approximate 95% confidence interval
 
152
  data = {
153
- 'Model': model,
154
- 'ELO Score': f"{elo:.2f}",
155
- '95% CI': f"±{ci:.2f}",
156
- '# Votes': votes,
157
- 'Organization': model_data[model]['organization'],
158
- 'License': model_data[model]['license'],
159
  }
160
  leaderboard.append(data)
161
- # Sort by ELO score
162
- leaderboard.sort(key=lambda x: float(x['ELO Score']), reverse=True)
163
  return leaderboard
164
 
 
165
  def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
166
  variables = parse_variables(eval_prompt)
167
  variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
@@ -169,14 +273,14 @@ def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
169
 
170
  # Get available models excluding the previous ones
171
  available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]
172
-
173
  # If we have enough models for new pairs
174
  if len(available_models) >= 2:
175
  model1, model2 = random.sample(available_models, 2)
176
  else:
177
  # Fallback to allowing previous models if necessary
178
  model1, model2 = random.sample(list(model_data.keys()), 2)
179
-
180
  response_a = get_model_response(model1, model_data.get(model1), final_prompt)
181
  response_b = get_model_response(model2, model_data.get(model2), final_prompt)
182
 
@@ -185,203 +289,162 @@ def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
185
  score_b, critique_b = parse_model_response(response_b)
186
 
187
  return (
188
- score_a, # score_a textbox
189
- critique_a, # critique_a textbox
190
- score_b, # score_b textbox
191
- critique_b, # critique_b textbox
192
  gr.update(visible=True), # action_buttons_row
193
  gr.update(value="*Model: Unknown*"), # model_name_a
194
  gr.update(value="*Model: Unknown*"), # model_name_b
195
- model1, # model_a_state
196
- model2 # model_b_state
197
  )
198
 
 
199
  def calculate_elo_change(rating_a, rating_b, winner):
200
  """Calculate ELO rating changes for both players."""
201
  expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
202
  expected_b = 1 - expected_a
203
-
204
  if winner == "A":
205
  score_a, score_b = 1, 0
206
  elif winner == "B":
207
  score_a, score_b = 0, 1
208
  else: # Handle ties
209
  score_a, score_b = 0.5, 0.5
210
-
211
  change_a = K_FACTOR * (score_a - expected_a)
212
  change_b = K_FACTOR * (score_b - expected_b)
213
-
214
  return change_a, change_b
215
 
 
216
  def update_leaderboard():
217
- """Calculate current ELO ratings from voting history."""
218
- ratings = defaultdict(lambda: DEFAULT_ELO)
 
 
219
  matches = defaultdict(int)
220
- wins = defaultdict(int)
221
-
222
- # Load voting data
223
- try:
224
- with open('voting_data.json', 'r') as f:
225
- voting_data = json.load(f)
226
- except FileNotFoundError:
227
- return pd.DataFrame()
228
-
229
- # Process each vote
230
  for vote in voting_data:
231
- model_a = vote['model_a']
232
- model_b = vote['model_b']
233
- winner = vote['winner']
234
-
235
- # Skip if models aren't in current model_data
236
- if model_a not in model_data or model_b not in model_data:
 
 
 
237
  continue
238
-
239
- # Update match counts
240
- matches[model_a] += 1
241
- matches[model_b] += 1
242
- if winner == "A":
243
- wins[model_a] += 1
244
- elif winner == "B":
245
- wins[model_b] += 1
246
- else: # Handle ties
247
- wins[model_a] += 0.5
248
- wins[model_b] += 0.5
249
-
250
- # Update ELO ratings
251
- change_a, change_b = calculate_elo_change(ratings[model_a], ratings[model_b], winner)
252
- ratings[model_a] += change_a
253
- ratings[model_b] += change_b
254
-
255
- # Create leaderboard DataFrame
256
- leaderboard_data = []
257
- for model in model_data.keys(): # Only include current models
258
- win_rate = (wins[model] / matches[model] * 100) if matches[model] > 0 else 0
259
- ci = 1.96 * (400 / (matches[model] + 1) ** 0.5) if matches[model] > 0 else 0 # Confidence interval
260
- leaderboard_data.append({
261
- 'Model': model,
262
- 'ELO': round(ratings[model], 1),
263
- '95% CI': f"±{ci:.1f}",
264
- 'Matches': matches[model],
265
- 'Win Rate': f"{win_rate:.1f}%",
266
- 'Organization': model_data[model]['organization'],
267
- 'License': model_data[model]['license']
268
- })
269
-
270
- # Sort by ELO rating
271
- df = pd.DataFrame(leaderboard_data)
272
- return df.sort_values('ELO', ascending=False).reset_index(drop=True)
273
 
274
  # Update the display_leaderboard function
275
  def display_leaderboard():
276
  df = update_leaderboard()
277
  return gr.DataFrame(
278
  value=df,
279
- headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
280
- datatype=['str', 'number', 'str', 'number', 'str', 'str', 'str'],
281
- row_count=(len(df) + 1, 'dynamic'),
282
  )
283
 
 
284
  # Update the leaderboard table definition in the UI
285
  leaderboard_table = gr.Dataframe(
286
- headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
287
- datatype=['str', 'number', 'str', 'number', 'str', 'str', 'str']
288
  )
289
 
 
290
  def get_leaderboard_stats():
291
  """Get summary statistics for the leaderboard."""
292
- try:
293
- with open('voting_data.json', 'r') as f:
294
- voting_data = json.load(f)
295
-
296
- total_votes = len(voting_data)
297
- total_models = len(model_data)
298
- last_updated = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
299
-
300
- return f"""
301
  ### Leaderboard Stats
302
  - **Total Models**: {total_models}
303
  - **Total Votes**: {total_votes}
304
  - **Last Updated**: {last_updated}
305
  """
306
- except FileNotFoundError:
307
- return "No voting data available"
308
 
309
- def initialize_voting_data():
310
- """Initialize or clear the voting data file."""
311
- empty_data = []
312
- with open('voting_data.json', 'w') as f:
313
- json.dump(empty_data, f)
314
-
315
- # Add this near the start of your app initialization, before the Gradio interface setup
316
- if __name__ == "__main__":
317
- initialize_voting_data()
318
-
319
- # ... rest of your Gradio app setup ...
320
-
321
- # Example evaluation metrics data
322
- EXAMPLE_METRICS = {
323
- "Hallucination": {
324
- "prompt": DEFAULT_EVAL_PROMPT, # We'll replace these with actual examples
325
- "input": DEFAULT_INPUT,
326
- "response": DEFAULT_RESPONSE
327
- },
328
- "Precision": {
329
- "prompt": DEFAULT_EVAL_PROMPT,
330
- "input": DEFAULT_INPUT,
331
- "response": DEFAULT_RESPONSE
332
- },
333
- "Recall": {
334
- "prompt": DEFAULT_EVAL_PROMPT,
335
- "input": DEFAULT_INPUT,
336
- "response": DEFAULT_RESPONSE
337
- },
338
- "Logical coherence": {
339
- "prompt": DEFAULT_EVAL_PROMPT,
340
- "input": DEFAULT_INPUT,
341
- "response": DEFAULT_RESPONSE
342
- },
343
- "Faithfulness": {
344
- "prompt": DEFAULT_EVAL_PROMPT,
345
- "input": DEFAULT_INPUT,
346
- "response": DEFAULT_RESPONSE
347
- }
348
- }
349
 
350
  def set_example_metric(metric_name):
351
  if metric_name == "Custom":
352
- return [
353
- DEFAULT_EVAL_PROMPT,
354
- DEFAULT_INPUT,
355
- DEFAULT_RESPONSE
356
- ]
357
-
 
 
 
 
 
 
 
 
358
  metric_data = EXAMPLE_METRICS[metric_name]
359
- return [
360
- metric_data["prompt"],
361
- metric_data["input"],
362
- metric_data["response"]
363
- ]
 
 
 
 
 
364
 
365
  # Select random metric at startup
366
  def get_random_metric():
367
  metrics = list(EXAMPLE_METRICS.keys())
368
  return set_example_metric(random.choice(metrics))
369
 
370
- with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
 
371
  judge_id = gr.State(get_new_session_id())
372
  gr.Markdown(MAIN_TITLE)
373
  gr.Markdown(HOW_IT_WORKS)
374
-
375
  with gr.Tabs():
376
  with gr.TabItem("Judge Arena"):
377
-
378
  with gr.Row():
379
  with gr.Column():
380
  gr.Markdown(BATTLE_RULES)
381
  gr.Markdown(EVAL_DESCRIPTION)
382
-
383
  # Add Example Metrics Section
384
- with gr.Accordion("Example evaluation metrics", open=True):
385
  with gr.Row():
386
  custom_btn = gr.Button("Custom", variant="secondary")
387
  hallucination_btn = gr.Button("Hallucination")
@@ -399,7 +462,7 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
399
  lines=1,
400
  value=DEFAULT_EVAL_PROMPT,
401
  placeholder="Type your eval prompt here... denote variables in {{curly brackets}} to be populated on the right.",
402
- show_label=True
403
  )
404
 
405
  # Right column - Variable Mapping
@@ -410,28 +473,25 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
410
  for i in range(5):
411
  initial_visibility = True if i < 2 else False
412
  with gr.Group(visible=initial_visibility) as var_row:
413
- # Variable input with direct label
414
- initial_value = DEFAULT_INPUT if i == 0 else DEFAULT_RESPONSE
415
- initial_label = "input" if i == 0 else "response" if i == 1 else f"variable_{i+1}"
 
416
  var_input = gr.Textbox(
417
- label=initial_label,
418
- value=initial_value,
419
- container=True
420
  )
421
  variable_rows.append((var_row, var_input))
422
 
423
  # Send button
424
  with gr.Row(elem_classes="send-button-row"):
425
  send_btn = gr.Button(
426
- value="Test the evaluators",
427
- variant="primary",
428
- size="lg",
429
- scale=1
430
  )
431
-
432
  # Add divider heading for model outputs
433
  gr.Markdown(VOTING_HEADER)
434
-
435
  # Model Responses side-by-side
436
  with gr.Row():
437
  with gr.Column():
@@ -444,23 +504,24 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
444
  score_b = gr.Textbox(label="Score", interactive=False)
445
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
446
  model_name_b = gr.Markdown("*Model: Unknown*")
447
-
448
  # Initially hide vote buttons and regenerate button
449
  with gr.Row(visible=False) as action_buttons_row:
450
  vote_a = gr.Button("Choose A", variant="primary")
451
  vote_tie = gr.Button("Tie", variant="secondary")
452
  vote_b = gr.Button("Choose B", variant="primary")
453
- regenerate_button = gr.Button("Regenerate with different models", variant="secondary", visible=False)
454
-
 
 
455
  # Add spacing and acknowledgements at the bottom
456
  gr.Markdown(ACKNOWLEDGEMENTS)
457
 
458
  with gr.TabItem("Leaderboard"):
459
- refresh_button = gr.Button("Refresh")
460
  stats_display = gr.Markdown()
461
  leaderboard_table = gr.Dataframe(
462
- headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
463
- datatype=['str', 'number', 'str', 'number', 'str', 'str']
464
  )
465
 
466
  with gr.TabItem("Policy"):
@@ -469,39 +530,50 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
469
  # Define state variables for model tracking
470
  model_a_state = gr.State()
471
  model_b_state = gr.State()
 
472
 
473
  # Update variable inputs based on the eval prompt
474
  def update_variables(eval_prompt):
475
  variables = parse_variables(eval_prompt)
476
  updates = []
477
- for i in range(5):
 
478
  var_row, var_input = variable_rows[i]
479
  if i < len(variables):
480
- # Set default values for 'input' and 'response', otherwise leave empty
481
- if variables[i] == "input":
482
- initial_value = DEFAULT_INPUT
483
- elif variables[i] == "response":
484
- initial_value = DEFAULT_RESPONSE
485
  else:
486
- initial_value = "" # Empty for new variables
487
-
488
- updates.extend([
489
- gr.update(visible=True), # var_row
490
- gr.update(value=initial_value, label=variables[i], visible=True) # var_input with dynamic label
491
- ])
 
 
 
492
  else:
493
- updates.extend([
494
- gr.update(visible=False), # var_row
495
- gr.update(value="", visible=False) # var_input
496
- ])
 
 
497
  return updates
498
 
499
- eval_prompt.change(fn=update_variables, inputs=eval_prompt, outputs=[item for sublist in variable_rows for item in sublist])
 
 
 
 
500
 
501
  # Regenerate button functionality
502
  regenerate_button.click(
503
  fn=regenerate_prompt,
504
- inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, var_input in variable_rows],
 
505
  outputs=[
506
  score_a,
507
  critique_a,
@@ -511,63 +583,120 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
511
  model_name_a,
512
  model_name_b,
513
  model_a_state,
514
- model_b_state
515
- ]
516
  )
517
 
518
  # Update model names after responses are generated
519
  def update_model_names(model_a, model_b):
520
- return gr.update(value=f"*Model: {model_a}*"), gr.update(value=f"*Model: {model_b}*")
 
 
521
 
522
  # Store the last submitted prompt and variables for comparison
523
  last_submission = gr.State({})
524
 
525
-
526
  # Update the vote button click handlers
527
  vote_a.click(
528
- fn=lambda *args: vote('A', *args),
529
- inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
530
- outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
 
 
 
531
  )
532
 
533
  vote_b.click(
534
- fn=lambda *args: vote('B', *args),
535
- inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
536
- outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
 
 
 
537
  )
538
 
539
  vote_tie.click(
540
- fn=lambda *args: vote('Tie', *args),
541
- inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
542
- outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
 
 
 
543
  )
544
 
545
  # Update the send button handler to store the submitted inputs
546
  def submit_and_store(prompt, *variables):
547
  # Create a copy of the current submission
548
  current_submission = {"prompt": prompt, "variables": variables}
549
-
550
  # Get the responses
551
- response_a, response_b, buttons_visible, regen_visible, model_a, model_b = submit_prompt(prompt, *variables)
552
-
 
 
 
 
 
 
 
 
553
  # Parse the responses
554
  score_a, critique_a = parse_model_response(response_a)
555
  score_b, critique_b = parse_model_response(response_b)
556
-
557
  # Update the last_submission state with the current values
558
  last_submission.value = current_submission
559
-
560
  return (
561
  score_a,
562
  critique_a,
563
  score_b,
564
  critique_b,
565
  buttons_visible,
566
- gr.update(visible=True, interactive=True), # Show and enable regenerate button
 
 
567
  model_a,
568
  model_b,
 
 
569
  gr.update(value="*Model: Unknown*"),
570
- gr.update(value="*Model: Unknown*")
571
  )
572
 
573
  send_btn.click(
@@ -582,9 +711,10 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
582
  regenerate_button,
583
  model_a_state,
584
  model_b_state,
585
- model_name_a, # Add model name outputs
586
- model_name_b
587
- ]
 
588
  )
589
 
590
  # Update the input change handlers to also disable regenerate button
@@ -594,88 +724,75 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
594
  current_inputs = {"prompt": prompt, "variables": variables}
595
  inputs_changed = last_inputs != current_inputs
596
  return [
597
- gr.update(interactive=True), # send button always enabled
598
- gr.update(interactive=not inputs_changed) # regenerate button disabled if inputs changed
 
 
599
  ]
600
 
601
  # Update the change handlers for prompt and variables
602
  eval_prompt.change(
603
  fn=handle_input_changes,
604
  inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
605
- outputs=[send_btn, regenerate_button]
606
  )
607
 
608
  for _, var_input in variable_rows:
609
  var_input.change(
610
  fn=handle_input_changes,
611
  inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
612
- outputs=[send_btn, regenerate_button]
613
  )
614
 
615
  # Update the leaderboard
616
  def refresh_leaderboard():
 
617
  leaderboard = get_leaderboard()
618
  data = [
619
  [
620
- entry['Model'],
621
- float(entry['ELO Score']),
622
- entry['95% CI'],
623
- entry['# Votes'],
624
- entry['Organization'],
625
- entry['License']
626
- ] for entry in leaderboard
 
627
  ]
628
  stats = get_leaderboard_stats()
629
  return [gr.update(value=data), gr.update(value=stats)]
630
 
631
- refresh_button.click(
632
- fn=refresh_leaderboard,
633
- inputs=None,
634
- outputs=[leaderboard_table, stats_display]
635
- )
636
-
637
  # Add the load event at the very end, just before demo.launch()
638
  demo.load(
639
- fn=refresh_leaderboard,
640
- inputs=None,
641
- outputs=[leaderboard_table, stats_display]
642
  )
643
 
644
  # Add click handlers for metric buttons
645
- custom_btn.click(
646
- fn=lambda: set_example_metric("Custom"),
647
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
648
- )
649
 
650
  hallucination_btn.click(
651
- fn=lambda: set_example_metric("Hallucination"),
652
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
653
  )
654
 
655
- precision_btn.click(
656
- fn=lambda: set_example_metric("Precision"),
657
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
658
- )
659
 
660
- recall_btn.click(
661
- fn=lambda: set_example_metric("Recall"),
662
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
663
- )
664
 
665
  coherence_btn.click(
666
- fn=lambda: set_example_metric("Logical coherence"),
667
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
668
  )
669
 
670
  faithfulness_btn.click(
671
- fn=lambda: set_example_metric("Faithfulness"),
672
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
673
  )
674
 
675
- # Set random metric at startup
676
  demo.load(
677
- fn=get_random_metric,
678
- outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
679
  )
680
 
681
- demo.launch()
 
 
 
1
  import json
 
2
  import re
3
  import random
4
  from collections import defaultdict
5
+ from datetime import datetime, timezone
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
+ import gradio as gr
12
  from gen_api_answer import get_model_response, parse_model_response
13
+ from db import add_vote, create_db_connection, get_votes
14
+ from utils import Vote
15
+ from common import (
16
+ POLICY_CONTENT,
17
+ ACKNOWLEDGEMENTS,
18
+ DEFAULT_EVAL_PROMPT,
19
+ DEFAULT_INPUT,
20
+ DEFAULT_RESPONSE,
21
+ CSS_STYLES,
22
+ MAIN_TITLE,
23
+ HOW_IT_WORKS,
24
+ BATTLE_RULES,
25
+ EVAL_DESCRIPTION,
26
+ VOTING_HEADER,
27
+ )
28
+ from example_metrics import EXAMPLE_METRICS
29
+
30
 
31
  # Model and ELO score data
32
  DEFAULT_ELO = 1500 # Starting ELO for new models
 
34
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
35
  vote_counts = defaultdict(int)
36
 
37
+ db = create_db_connection()
38
+ votes_collection = get_votes(db)
39
+
40
+ current_time = datetime.now()
41
+
42
 
43
  # Load the model_data from JSONL
44
  def load_model_data():
45
  model_data = {}
46
  try:
47
+ with open("data/models.jsonl", "r") as f:
48
  for line in f:
49
  model = json.loads(line)
50
+ model_data[model["name"]] = {
51
+ "organization": model["organization"],
52
+ "license": model["license"],
53
+ "api_model": model["api_model"],
54
  }
55
  except FileNotFoundError:
56
  print("Warning: models.jsonl not found")
57
  return {}
58
  return model_data
59
 
60
+
61
  model_data = load_model_data()
62
 
63
  current_session_id = 0
64
+
65
 
66
  def get_new_session_id():
67
  global current_session_id
68
  current_session_id += 1
69
  return f"user{current_session_id}"
70
 
71
+
72
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
73
+ vote = Vote(
74
+ timestamp=datetime.now().isoformat(),
75
+ prompt=prompt,
76
+ response_a=response_a,
77
+ response_b=response_b,
78
+ model_a=model_a,
79
+ model_b=model_b,
80
+ winner=winner,
81
+ judge_id=judge_id,
82
+ )
83
+ add_vote(vote, db)
84
+
 
 
 
85
 
86
  def parse_variables(prompt):
87
  # Extract variables enclosed in double curly braces
88
+ variables = re.findall(r"{{(.*?)}}", prompt)
89
  # Remove duplicates while preserving order
90
  seen = set()
91
+ variables = [
92
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
93
+ ]
94
  return variables
95
 
96
+
97
  def get_final_prompt(eval_prompt, variable_values):
98
  # Replace variables in the eval prompt with their values
99
  for var, val in variable_values.items():
100
+ eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
101
  return eval_prompt
102
 
103
+
104
  def submit_prompt(eval_prompt, *variable_values):
105
  try:
106
  variables = parse_variables(eval_prompt)
 
120
  gr.update(visible=True),
121
  gr.update(visible=True),
122
  model_a,
123
+ model_b,
124
+ final_prompt,
125
  )
126
  except Exception as e:
127
  print(f"Error in submit_prompt: {str(e)}")
 
131
  gr.update(visible=False),
132
  gr.update(visible=False),
133
  None,
134
+ None,
135
+ None,
136
  )
137
 
138
+
139
+ def vote(
140
+ choice,
141
+ model_a,
142
+ model_b,
143
+ final_prompt,
144
+ score_a,
145
+ critique_a,
146
+ score_b,
147
+ critique_b,
148
+ judge_id,
149
+ ):
150
  # Update ELO scores based on user choice
151
  elo_a = elo_scores[model_a]
152
  elo_b = elo_scores[model_b]
 
156
  Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
157
 
158
  # Assign actual scores
159
+ if choice == "A":
160
  Sa, Sb = 1, 0
161
+ elif choice == "B":
162
  Sa, Sb = 0, 1
163
  else:
164
  Sa, Sb = 0.5, 0.5
 
169
  vote_counts[model_a] += 1
170
  vote_counts[model_b] += 1
171
 
172
+ # Format the full responses with score and critique
173
+ response_a = f"""{score_a}
174
+
175
+ {critique_a}"""
176
+
177
+ response_b = f"""{score_b}
178
+
179
+ {critique_b}"""
180
+
181
+ # Store the vote data with the final prompt
182
+ store_vote_data(
183
+ final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
184
+ )
185
 
186
  # Return updates for UI components
187
+ return [
188
+ gr.update(visible=False), # action_buttons_row
189
+ gr.update(value=f"*Model: {model_a}*"), # model_name_a
190
+ gr.update(value=f"*Model: {model_b}*"), # model_name_b
191
+ gr.update(interactive=True), # send_btn
192
+ gr.update(visible=True, interactive=True), # regenerate_button
193
+ ]
194
+
195
 
196
+ def get_current_votes():
197
+ """Get current votes from database."""
198
+ return get_votes(db)
199
 
200
 
201
  def get_leaderboard():
202
+ """Generate leaderboard data using fresh votes from MongoDB."""
203
+ # Get fresh voting data
204
+ voting_data = get_current_votes()
205
+ print(f"Fetched {len(voting_data)} votes from database") # Debug log
206
+
207
+ # Initialize dictionaries for tracking
208
+ ratings = defaultdict(lambda: DEFAULT_ELO)
209
+ matches = defaultdict(int)
210
+
211
+ # Process each vote
212
+ for vote in voting_data:
213
+ try:
214
+ model_a = vote.get("model_a")
215
+ model_b = vote.get("model_b")
216
+ winner = vote.get("winner")
217
+
218
+ # Skip if models aren't in current model_data
219
+ if (
220
+ not all([model_a, model_b, winner])
221
+ or model_a not in model_data
222
+ or model_b not in model_data
223
+ ):
224
+ continue
225
+
226
+ # Update match counts
227
+ matches[model_a] += 1
228
+ matches[model_b] += 1
229
+
230
+ # Calculate ELO changes
231
+ elo_a = ratings[model_a]
232
+ elo_b = ratings[model_b]
233
+
234
+ # Expected scores
235
+ expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
236
+ expected_b = 1 - expected_a
237
+
238
+ # Actual scores
239
+ score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
240
+ score_b = 1 - score_a
241
+
242
+ # Update ratings
243
+ ratings[model_a] += K_FACTOR * (score_a - expected_a)
244
+ ratings[model_b] += K_FACTOR * (score_b - expected_b)
245
+
246
+ except Exception as e:
247
+ print(f"Error processing vote: {e}")
248
+ continue
249
+
250
  # Generate leaderboard data
251
  leaderboard = []
252
+ for model in model_data.keys():
253
+ votes = matches[model]
254
+ elo = ratings[model]
255
+ ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
256
  data = {
257
+ "Model": model,
258
+ "ELO Score": f"{elo:.2f}",
259
+ "95% CI": f"±{ci:.2f}",
260
+ "# Votes": votes,
261
+ "Organization": model_data[model]["organization"],
262
+ "License": model_data[model]["license"],
263
  }
264
  leaderboard.append(data)
265
+
 
266
  return leaderboard
267
 
268
+
269
  def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
270
  variables = parse_variables(eval_prompt)
271
  variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
 
273
 
274
  # Get available models excluding the previous ones
275
  available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]
276
+
277
  # If we have enough models for new pairs
278
  if len(available_models) >= 2:
279
  model1, model2 = random.sample(available_models, 2)
280
  else:
281
  # Fallback to allowing previous models if necessary
282
  model1, model2 = random.sample(list(model_data.keys()), 2)
283
+
284
  response_a = get_model_response(model1, model_data.get(model1), final_prompt)
285
  response_b = get_model_response(model2, model_data.get(model2), final_prompt)
286
 
 
289
  score_b, critique_b = parse_model_response(response_b)
290
 
291
  return (
292
+ score_a, # score_a textbox
293
+ critique_a, # critique_a textbox
294
+ score_b, # score_b textbox
295
+ critique_b, # critique_b textbox
296
  gr.update(visible=True), # action_buttons_row
297
  gr.update(value="*Model: Unknown*"), # model_name_a
298
  gr.update(value="*Model: Unknown*"), # model_name_b
299
+ model1, # model_a_state
300
+ model2, # model_b_state
301
  )
302
 
303
+
304
  def calculate_elo_change(rating_a, rating_b, winner):
305
  """Calculate ELO rating changes for both players."""
306
  expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
307
  expected_b = 1 - expected_a
308
+
309
  if winner == "A":
310
  score_a, score_b = 1, 0
311
  elif winner == "B":
312
  score_a, score_b = 0, 1
313
  else: # Handle ties
314
  score_a, score_b = 0.5, 0.5
315
+
316
  change_a = K_FACTOR * (score_a - expected_a)
317
  change_b = K_FACTOR * (score_b - expected_b)
318
+
319
  return change_a, change_b
320
 
321
+
322
  def update_leaderboard():
323
+ """Generate leaderboard DataFrame using fresh votes from MongoDB."""
324
+ # Get fresh voting data
325
+ voting_data = get_current_votes()
326
+ print(f"Found {len(voting_data)} votes in database")
327
  matches = defaultdict(int)
328
+
329
+ # Process each vote chronologically
 
 
 
 
 
 
 
 
330
  for vote in voting_data:
331
+ # Extract model names from the vote document
332
+ try:
333
+ model_a = vote.get("model_a")
334
+ model_b = vote.get("model_b")
335
+ winner = vote.get("winner")
336
+
337
+ print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
338
+
339
+ # Skip if any required field is missing or models aren't in current model_data
340
+ if not all([model_a, model_b, winner]):
341
+ print(f"Missing required fields in vote: {vote}")
342
+ continue
343
+
344
+ if model_a not in model_data:
345
+ print(f"Model A '{model_a}' not found in model_data")
346
+ continue
347
+
348
+ if model_b not in model_data:
349
+ print(f"Model B '{model_b}' not found in model_data")
350
+ continue
351
+
352
+ # Update match counts
353
+ matches[model_a] += 1
354
+ matches[model_b] += 1
355
+ print(
356
+ f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
357
+ )
358
+ except Exception as e:
359
+ print(f"Error processing vote: {e}")
360
+ print(f"Problematic vote data: {vote}")
361
  continue
362
+
 
 
 
 
363
 
364
  # Update the display_leaderboard function
365
  def display_leaderboard():
366
  df = update_leaderboard()
367
  return gr.DataFrame(
368
  value=df,
369
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
370
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
371
+ row_count=(len(df) + 1, "dynamic"),
372
  )
373
 
374
+
375
  # Update the leaderboard table definition in the UI
376
  leaderboard_table = gr.Dataframe(
377
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
378
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
379
  )
380
 
381
+
382
  def get_leaderboard_stats():
383
  """Get summary statistics for the leaderboard."""
384
+ now = datetime.now(timezone.utc)
385
+ total_votes = len(get_current_votes())
386
+ total_models = len(model_data)
387
+ last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
388
+ "%B %d, %Y at %H:00 UTC"
389
+ )
390
+
391
+ return f"""
 
392
  ### Leaderboard Stats
393
  - **Total Models**: {total_models}
394
  - **Total Votes**: {total_votes}
395
  - **Last Updated**: {last_updated}
396
  """
 
 
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
  def set_example_metric(metric_name):
400
  if metric_name == "Custom":
401
+ variables = parse_variables(DEFAULT_EVAL_PROMPT)
402
+ variable_values = []
403
+ for var in variables:
404
+ if var == "input":
405
+ variable_values.append(DEFAULT_INPUT)
406
+ elif var == "response":
407
+ variable_values.append(DEFAULT_RESPONSE)
408
+ else:
409
+ variable_values.append("") # Default empty value
410
+ # Pad variable_values to match the length of variable_rows
411
+ while len(variable_values) < len(variable_rows):
412
+ variable_values.append("")
413
+ return [DEFAULT_EVAL_PROMPT] + variable_values
414
+
415
  metric_data = EXAMPLE_METRICS[metric_name]
416
+ variables = parse_variables(metric_data["prompt"])
417
+ variable_values = []
418
+ for var in variables:
419
+ value = metric_data.get(var, "") # Default to empty string if not found
420
+ variable_values.append(value)
421
+ # Pad variable_values to match the length of variable_rows
422
+ while len(variable_values) < len(variable_rows):
423
+ variable_values.append("")
424
+ return [metric_data["prompt"]] + variable_values
425
+
426
 
427
  # Select random metric at startup
428
  def get_random_metric():
429
  metrics = list(EXAMPLE_METRICS.keys())
430
  return set_example_metric(random.choice(metrics))
431
 
432
+
433
+ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
434
  judge_id = gr.State(get_new_session_id())
435
  gr.Markdown(MAIN_TITLE)
436
  gr.Markdown(HOW_IT_WORKS)
437
+
438
  with gr.Tabs():
439
  with gr.TabItem("Judge Arena"):
440
+
441
  with gr.Row():
442
  with gr.Column():
443
  gr.Markdown(BATTLE_RULES)
444
  gr.Markdown(EVAL_DESCRIPTION)
445
+
446
  # Add Example Metrics Section
447
+ with gr.Accordion("Evaluator Prompt Templates", open=False):
448
  with gr.Row():
449
  custom_btn = gr.Button("Custom", variant="secondary")
450
  hallucination_btn = gr.Button("Hallucination")
 
462
  lines=1,
463
  value=DEFAULT_EVAL_PROMPT,
464
  placeholder="Type your eval prompt here... denote variables in {{curly brackets}} to be populated on the right.",
465
+ show_label=True,
466
  )
467
 
468
  # Right column - Variable Mapping
 
473
  for i in range(5):
474
  initial_visibility = True if i < 2 else False
475
  with gr.Group(visible=initial_visibility) as var_row:
476
+ # Set default labels for the first two inputs
477
+ default_label = (
478
+ "input" if i == 0 else "response" if i == 1 else ""
479
+ )
480
  var_input = gr.Textbox(
481
+ container=True,
482
+ label=default_label, # Add default label here
 
483
  )
484
  variable_rows.append((var_row, var_input))
485
 
486
  # Send button
487
  with gr.Row(elem_classes="send-button-row"):
488
  send_btn = gr.Button(
489
+ value="Test the evaluators", variant="primary", size="lg", scale=1
 
 
 
490
  )
491
+
492
  # Add divider heading for model outputs
493
  gr.Markdown(VOTING_HEADER)
494
+
495
  # Model Responses side-by-side
496
  with gr.Row():
497
  with gr.Column():
 
504
  score_b = gr.Textbox(label="Score", interactive=False)
505
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
506
  model_name_b = gr.Markdown("*Model: Unknown*")
507
+
508
  # Initially hide vote buttons and regenerate button
509
  with gr.Row(visible=False) as action_buttons_row:
510
  vote_a = gr.Button("Choose A", variant="primary")
511
  vote_tie = gr.Button("Tie", variant="secondary")
512
  vote_b = gr.Button("Choose B", variant="primary")
513
+ regenerate_button = gr.Button(
514
+ "Regenerate with different models", variant="secondary", visible=False
515
+ )
516
+
517
  # Add spacing and acknowledgements at the bottom
518
  gr.Markdown(ACKNOWLEDGEMENTS)
519
 
520
  with gr.TabItem("Leaderboard"):
 
521
  stats_display = gr.Markdown()
522
  leaderboard_table = gr.Dataframe(
523
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
524
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
525
  )
526
 
527
  with gr.TabItem("Policy"):
 
530
  # Define state variables for model tracking
531
  model_a_state = gr.State()
532
  model_b_state = gr.State()
533
+ final_prompt_state = gr.State()
534
 
535
  # Update variable inputs based on the eval prompt
536
  def update_variables(eval_prompt):
537
  variables = parse_variables(eval_prompt)
538
  updates = []
539
+
540
+ for i in range(len(variable_rows)):
541
  var_row, var_input = variable_rows[i]
542
  if i < len(variables):
543
+ var_name = variables[i]
544
+ # Set the number of lines based on the variable name
545
+ if var_name == "response":
546
+ lines = 4 # Adjust this number as needed
 
547
  else:
548
+ lines = 1 # Default to single line for other variables
549
+ updates.extend(
550
+ [
551
+ gr.update(visible=True), # Show the variable row
552
+ gr.update(
553
+ label=var_name, visible=True, lines=lines
554
+ ), # Update label and lines
555
+ ]
556
+ )
557
  else:
558
+ updates.extend(
559
+ [
560
+ gr.update(visible=False), # Hide the variable row
561
+ gr.update(value="", visible=False), # Clear value when hidden
562
+ ]
563
+ )
564
  return updates
565
 
566
+ eval_prompt.change(
567
+ fn=update_variables,
568
+ inputs=eval_prompt,
569
+ outputs=[item for sublist in variable_rows for item in sublist],
570
+ )
571
 
572
  # Regenerate button functionality
573
  regenerate_button.click(
574
  fn=regenerate_prompt,
575
+ inputs=[model_a_state, model_b_state, eval_prompt]
576
+ + [var_input for _, var_input in variable_rows],
577
  outputs=[
578
  score_a,
579
  critique_a,
 
583
  model_name_a,
584
  model_name_b,
585
  model_a_state,
586
+ model_b_state,
587
+ ],
588
  )
589
 
590
  # Update model names after responses are generated
591
  def update_model_names(model_a, model_b):
592
+ return gr.update(value=f"*Model: {model_a}*"), gr.update(
593
+ value=f"*Model: {model_b}*"
594
+ )
595
 
596
  # Store the last submitted prompt and variables for comparison
597
  last_submission = gr.State({})
598
 
 
599
  # Update the vote button click handlers
600
  vote_a.click(
601
+ fn=lambda *args: vote("A", *args),
602
+ inputs=[
603
+ model_a_state,
604
+ model_b_state,
605
+ final_prompt_state,
606
+ score_a,
607
+ critique_a,
608
+ score_b,
609
+ critique_b,
610
+ judge_id,
611
+ ],
612
+ outputs=[
613
+ action_buttons_row,
614
+ model_name_a,
615
+ model_name_b,
616
+ send_btn,
617
+ regenerate_button,
618
+ ],
619
  )
620
 
621
  vote_b.click(
622
+ fn=lambda *args: vote("B", *args),
623
+ inputs=[
624
+ model_a_state,
625
+ model_b_state,
626
+ final_prompt_state,
627
+ score_a,
628
+ critique_a,
629
+ score_b,
630
+ critique_b,
631
+ judge_id,
632
+ ],
633
+ outputs=[
634
+ action_buttons_row,
635
+ model_name_a,
636
+ model_name_b,
637
+ send_btn,
638
+ regenerate_button,
639
+ ],
640
  )
641
 
642
  vote_tie.click(
643
+ fn=lambda *args: vote("Tie", *args),
644
+ inputs=[
645
+ model_a_state,
646
+ model_b_state,
647
+ final_prompt_state,
648
+ score_a,
649
+ critique_a,
650
+ score_b,
651
+ critique_b,
652
+ judge_id,
653
+ ],
654
+ outputs=[
655
+ action_buttons_row,
656
+ model_name_a,
657
+ model_name_b,
658
+ send_btn,
659
+ regenerate_button,
660
+ ],
661
  )
662
 
663
  # Update the send button handler to store the submitted inputs
664
  def submit_and_store(prompt, *variables):
665
  # Create a copy of the current submission
666
  current_submission = {"prompt": prompt, "variables": variables}
667
+
668
  # Get the responses
669
+ (
670
+ response_a,
671
+ response_b,
672
+ buttons_visible,
673
+ regen_visible,
674
+ model_a,
675
+ model_b,
676
+ final_prompt,
677
+ ) = submit_prompt(prompt, *variables)
678
+
679
  # Parse the responses
680
  score_a, critique_a = parse_model_response(response_a)
681
  score_b, critique_b = parse_model_response(response_b)
682
+
683
  # Update the last_submission state with the current values
684
  last_submission.value = current_submission
685
+
686
  return (
687
  score_a,
688
  critique_a,
689
  score_b,
690
  critique_b,
691
  buttons_visible,
692
+ gr.update(
693
+ visible=True, interactive=True
694
+ ), # Show and enable regenerate button
695
  model_a,
696
  model_b,
697
+ final_prompt, # Add final_prompt to state
698
+ gr.update(value="*Model: Unknown*"),
699
  gr.update(value="*Model: Unknown*"),
 
700
  )
701
 
702
  send_btn.click(
 
711
  regenerate_button,
712
  model_a_state,
713
  model_b_state,
714
+ final_prompt_state, # Add final_prompt_state to outputs
715
+ model_name_a,
716
+ model_name_b,
717
+ ],
718
  )
719
 
720
  # Update the input change handlers to also disable regenerate button
 
724
  current_inputs = {"prompt": prompt, "variables": variables}
725
  inputs_changed = last_inputs != current_inputs
726
  return [
727
+ gr.update(interactive=True), # send button always enabled
728
+ gr.update(
729
+ interactive=not inputs_changed
730
+ ), # regenerate button disabled if inputs changed
731
  ]
732
 
733
  # Update the change handlers for prompt and variables
734
  eval_prompt.change(
735
  fn=handle_input_changes,
736
  inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
737
+ outputs=[send_btn, regenerate_button],
738
  )
739
 
740
  for _, var_input in variable_rows:
741
  var_input.change(
742
  fn=handle_input_changes,
743
  inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
744
+ outputs=[send_btn, regenerate_button],
745
  )
746
 
747
  # Update the leaderboard
748
  def refresh_leaderboard():
749
+ """Refresh the leaderboard data and stats."""
750
  leaderboard = get_leaderboard()
751
  data = [
752
  [
753
+ entry["Model"],
754
+ float(entry["ELO Score"]),
755
+ entry["95% CI"],
756
+ entry["# Votes"],
757
+ entry["Organization"],
758
+ entry["License"],
759
+ ]
760
+ for entry in leaderboard
761
  ]
762
  stats = get_leaderboard_stats()
763
  return [gr.update(value=data), gr.update(value=stats)]
764
 
 
 
 
 
 
 
765
  # Add the load event at the very end, just before demo.launch()
766
  demo.load(
767
+ fn=refresh_leaderboard, inputs=None, outputs=[leaderboard_table, stats_display]
 
 
768
  )
769
 
770
  # Add click handlers for metric buttons
771
+ outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
772
+
773
+ custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
 
774
 
775
  hallucination_btn.click(
776
+ fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
 
777
  )
778
 
779
+ precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
 
 
 
780
 
781
+ recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
 
 
 
782
 
783
  coherence_btn.click(
784
+ fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
 
785
  )
786
 
787
  faithfulness_btn.click(
788
+ fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
 
789
  )
790
 
791
+ # Set default metric at startup
792
  demo.load(
793
+ fn=lambda: set_example_metric("Custom"),
794
+ outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
795
  )
796
 
797
+ if __name__ == "__main__":
798
+ demo.launch()
common.py CHANGED
@@ -49,7 +49,7 @@ EVAL_DESCRIPTION = """
49
  <br><br>
50
  """
51
 
52
- DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on the helpfulness of the response.
53
 
54
  Score:
55
  A score of 1 means that the response's answer meets all of the evaluation criteria.
@@ -101,65 +101,55 @@ By creating advanced evaluation models, we enable AI developers to identify and
101
 
102
  ## Overview
103
 
104
- Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments and guidelines to ensure a fair, open, and collaborative environment for both users and model providers.
105
 
106
  ## Transparency
107
 
108
- - **Open-Source**: Judge Arena's code is open-source and available on GitHub. This approach allows anyone to review, replicate, or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
109
- - **Community Engagement**: We actively encourage contributions from the community. Feedback, code contributions, and discussions are welcome to improve the platform's functionality, fairness, and transparency.
110
- - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. This transparency ensures that our processes are understandable and reproducible by others.
111
- - **Data Sharing**: Periodically, we will share 20% of the collected evaluation data with the community. This data includes anonymized prompts, model responses, and aggregated evaluation results.
112
 
113
  ## Model Inclusion Criteria
114
 
115
- Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges), including but not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
116
 
117
- - **Judge Capability**: The model must possess the ability to score AND critique responses, content, or other models' outputs effectively.
118
  - **Adaptable:** The model must be prompt-able to evaluate in different scoring formats, for different criteria.
119
  - **Accessibility**:
120
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
121
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
122
 
123
- ## Evaluation Methodology
124
-
125
- - **User Participation**: Users run evaluations and select preferred model responses based on quality, relevance, and accuracy contributing to the model's overall rating.
126
- - **Blind Testing**: All model evaluations are conducted blindly. Users are not informed which model produced which response to eliminate bias.
127
- - **Data Collection**: We collect sufficient data to ensure statistical significance in our evaluations. We additionally show the 95% confidence interval in the leaderboard to provide a signal of reliability.
128
- - **Anomaly Detection**: We monitor user activity to detect and mitigate anomalous behavior or voting patterns that could skew results.
129
-
130
  ## Leaderboard Management
131
 
132
  - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
133
- - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks to allow for comprehensive community evaluation.
134
- - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible, are no longer publicly available.
135
 
136
- ## Privacy and Data Protection
137
 
138
- - **Anonymization**: All shared data is anonymized to prevent the identification of individual users.
139
 
140
- ## Policy Updates and Communication
141
 
142
- - **Ongoing Revisions**: This policy may be updated to reflect changes in our practices or in response to community feedback.
143
- - **Notification of Changes**: Policy changes will be communicated to users and stakeholders on this page.
144
- <br><br>
145
 
146
- # FAQ
147
 
148
- **Isn't this the same as Chatbot Arena?**
149
 
150
- - We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
151
 
152
- \n\n**Why should I trust this leaderboard?**
153
 
154
- - We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
155
 
156
- \n\n**Who funds this effort?**
157
 
158
- - Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
159
 
160
- \n\n**What is Atla working on?**
161
 
162
- - We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
163
  <br><br>
164
  # Get in touch
165
- Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
 
49
  <br><br>
50
  """
51
 
52
+ DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on [INSERT CRITERIA]
53
 
54
  Score:
55
  A score of 1 means that the response's answer meets all of the evaluation criteria.
 
101
 
102
  ## Overview
103
 
104
+ Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
105
 
106
  ## Transparency
107
 
108
+ - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
109
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
110
+ - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
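For reference, the vote record behind the Data Sharing bullet above looks roughly like the following. This is a minimal sketch: the real `Vote` class lives in `utils.py` (added in this commit but not shown on this page), so the dataclass wrapper is an assumption; the field names are taken from the `Vote(...)` call in `store_vote_data` in `app.py`.

```python
from dataclasses import dataclass


@dataclass
class Vote:
    timestamp: str    # ISO timestamp, e.g. datetime.now().isoformat()
    prompt: str       # the final prompt sent to both judges
    response_a: str   # judge A's score and critique
    response_b: str   # judge B's score and critique
    model_a: str
    model_b: str
    winner: str       # "A", "B", or "Tie"
    judge_id: str     # anonymized session ID, e.g. "user1"
```

In `db.py` the record is inserted as-is via `vote.__dict__`, which is why a flat structure like this is all that is stored.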
 
111
 
112
  ## Model Inclusion Criteria
113
 
114
+ Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a. judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
115
 
116
+ - **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
117
  - **Adaptable:** The model must be prompt-able to evaluate in different scoring formats, for different criteria.
118
  - **Accessibility**:
119
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
120
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
121
 
 
 
 
 
 
 
 
122
  ## Leaderboard Management
123
 
124
  - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
125
+ - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
126
+ - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
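To make the ELO bullet above concrete, here is a minimal sketch of the rating update it describes (1500 starting rating, K-factor of 32). It mirrors `calculate_elo_change` in `app.py`; the function name `elo_update` and the choice to return updated ratings rather than deltas are illustrative, not part of the commit.

```python
DEFAULT_ELO = 1500  # starting rating for every judge
K_FACTOR = 32       # maximum adjustment per evaluation


def elo_update(rating_a: float, rating_b: float, winner: str) -> tuple:
    # Expected scores from the standard logistic curve
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a
    # Actual scores: 1 for a win, 0 for a loss, 0.5 each for a tie
    score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
    score_b = 1 - score_a
    return (
        rating_a + K_FACTOR * (score_a - expected_a),
        rating_b + K_FACTOR * (score_b - expected_b),
    )


# Two fresh judges, A wins: each rating moves by K/2 = 16 points.
print(elo_update(DEFAULT_ELO, DEFAULT_ELO, "A"))  # (1516.0, 1484.0)
```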
127
 
128
+ This policy might be updated to reflect changes in our practices or in response to community feedback.
129
 
130
+ # FAQ
131
 
132
+ **Isn't this the same as Chatbot Arena?**
133
 
134
+ We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals, to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
 
 
135
 
136
+ **What are the Evaluator Prompt Templates based on?**
137
 
138
+ As a quick start, we've set up templates that cover the most popular evaluation metrics out there on LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular datasets from academia - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
139
 
140
+ These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
141
 
142
+ **Why should I trust this leaderboard?**
143
 
144
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
145
 
146
+ **Who funds this effort?**
147
 
148
+ Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
149
 
150
+ **What is Atla working on?**
151
 
152
+ We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
153
  <br><br>
154
  # Get in touch
155
+ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
db.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
1
+ import os
2
+ from pymongo import MongoClient
3
+ from pymongo.database import Database
4
+ from utils import get_logger, Vote
5
+ from datetime import datetime, timedelta, timezone
6
+ from typing import List
7
+
8
+ logger = get_logger()
9
+
10
+
11
+ def create_db_connection() -> Database:
12
+ print(os.getenv("MONGO_URI"))
13
+ print(os.getenv("MONGO_DB"))
14
+ db = MongoClient(os.getenv("MONGO_URI")).get_database(os.getenv("MONGO_DB"))
15
+ return db
16
+
17
+
18
+ def add_vote(vote: Vote, db: Database) -> None:
19
+ try:
20
+ db.get_collection("votes").insert_one(vote.__dict__)
21
+ logger.info("Vote added to database")
22
+ except Exception as e:
23
+ logger.error("Error adding vote to database")
24
+ logger.error(e)
25
+
26
+
27
+ def get_votes(db: Database) -> List[Vote]:
28
+ now = datetime.now(timezone.utc)
29
+ current_hour = now.replace(minute=0, second=0, microsecond=0)
30
+ votes = list(
31
+ db.get_collection("votes").find({"timestamp": {"$lte": current_hour.isoformat()}})
32
+ )
33
+ return votes
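A minimal sketch of how these helpers might be exercised, assuming the module is importable as `db` (matching the file name in this commit) and that `MONGO_URI` / `MONGO_DB` point at a reachable MongoDB instance; the connection string below is a placeholder.

```python
import os

from db import create_db_connection, get_votes

# Placeholder values -- create_db_connection() reads both from the environment.
os.environ.setdefault("MONGO_URI", "mongodb://localhost:27017")
os.environ.setdefault("MONGO_DB", "judge_arena")

db = create_db_connection()

# Only votes time-stamped before the top of the current hour are returned;
# the $lte filter works on ISO-8601 strings because they sort chronologically.
votes = get_votes(db)
print(f"{len(votes)} votes recorded so far")
```

Votes are stored via `vote.__dict__` and come back from `get_votes` as plain dicts, so the fields of the `Vote` dataclass (defined in `utils.py` below) map one-to-one onto the document keys in the `votes` collection.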
example_metrics.py ADDED
@@ -0,0 +1,113 @@
1
+ # Example evaluation metrics data
2
+
3
+ EXAMPLE_METRICS = {
4
+ "Hallucination": {
5
+ "prompt": """You are assessing a chat bot response to a user's input based on whether it contradicts the known ground truth. Focus on factual inconsistencies and opposing statements.\n
6
+ Score:
7
+ A score of 0 means the response contains no contradictions to the ground truth.
8
+ A score of 1 means the response contains contradictions to the ground truth.\n
9
+ Here is the data:
10
+ [BEGIN DATA]
11
+ ***
12
+ [User Query]: {{input}}
13
+ ***
14
+ [Ground Truth Response]: {{ground_truth}}
15
+ ***
16
+ [Response]: {{response}}
17
+ ***
18
+ [END DATA]""",
19
+ "input": """Lichens are symbiotic organisms made of green algae and fungi. What do the green algae supply to the fungi in this symbiotic relationship?
20
+ A) Carbon dioxide
21
+ B) Food
22
+ C) Protection
23
+ D) Water""",
24
+ # Example from ARC Evals
25
+ "ground_truth": "B) Food",
26
+ "response": "C) Protection",
27
+ },
28
+ "Precision": {
29
+ "prompt": """You are assessing a chat bot response to a user's input based on how completely it captures the key facts and details from the ground truth response.\n
30
+ Score:
31
+ A score of 1 means the response is completely irrelevant or contradicts the ground truth response.
32
+ A score of 2 means the response includes some relevant information but is mostly filled with irrelevant or incorrect details.
33
+ A score of 3 means the response is somewhat relevant with a balanced mix of relevant and irrelevant information.
34
+ A score of 4 means the response is mostly relevant, containing only minor irrelevant details.
35
+ A score of 5 means the response is entirely relevant, perfectly aligning with the ground truth response without any irrelevant information.\n
36
+ Here is the data:
37
+ [BEGIN DATA]
38
+ ***
39
+ [User Query]: {{input}}
40
+ ***
41
+ [Ground Truth Response]: {{ground_truth}}
42
+ ***
43
+ [Response]: {{response}}
44
+ ***
45
+ [END DATA]""",
46
+ # Example from Prometheus
47
+ "input": "In a city, there are three different colored houses - red, blue, and yellow. The city council has made a rule that no two same colored houses can be adjacent to each other. A query is raised by the city planner who is designing the layout of the houses. The query is, 'Can you help me design a layout where I can place these houses without violating the rule of the city council?' The user didn't provide information about the total number of houses, the layout structure, or the proportion of house colors.",
48
+ "ground_truth": "In order to provide an accurate layout design, I'll need a bit more information. Could you please specify how many houses in total are to be placed in the city? Also, it would be helpful to know the ratio of the different colored houses. Are there equal numbers of red, blue, and yellow houses, or does one color predominate? Furthermore, understanding the layout structure you have in mind is also important. Is it a linear layout, a circular one, a block-based one, or some other structure? Once I have these details, I'll be able to provide a more precise and useful answer.",
49
+ "response": "In designing the layout, you could alternate between the three colors - red, blue, and yellow, to ensure no two houses of the same color are next to each other. You could possibly start with a red house, followed by a blue, then a yellow, and then repeat the sequence until all houses are placed. However, the total number of houses and the ratio of the different colored houses could affect the final layout. For instance, if there are more red houses than blue and yellow, some adjustments may need to be made. Also, the structure of the layout could play a role, whether it's a linear or block-based layout, or any other. It might be helpful to get some additional information about these aspects, but based on the information provided, this approach could work.",
50
+ },
51
+ "Recall": {
52
+ "prompt": """You are assessing a chat bot response to a user's input based on how completely it captures the key facts and details from the ground truth response.\n
53
+ Score:
54
+ A score of 1 means the response misses almost all the key facts and details from the ground truth.
55
+ A score of 2 means the response includes only a few of the key facts and details from the ground truth.
56
+ A score of 3 means the response captures a moderate amount of the key facts and details from the ground truth.
57
+ A score of 4 means the response includes most of the key facts and details from the ground truth.
58
+ A score of 5 means the response perfectly captures all key facts and details from the ground truth without omission.\n
59
+ Here is the data:
60
+ [BEGIN DATA]
61
+ ***
62
+ [User Query]: {{input}}
63
+ ***
64
+ [Ground Truth Response]: {{ground_truth}}
65
+ ***
66
+ [Response]: {{response}}
67
+ ***
68
+ [END DATA]""",
69
+ # Example from Prometheus
70
+ "input": """There's a thing that's bothering me. It's related to a kind of fruit, but not really the usual ones we eat. It's sweet, crunchy, and a lot of people like it, but it's not something you'd normally find in a grocery store. What kind of fruit am I talking about?""",
71
+ "ground_truth": """Based on your description, it seems like you might be referring to the 'honeycrisp apple'. While it is a type of fruit and is known for its sweetness and crunchiness, it's not as commonly found in grocery stores as other fruit varieties due to its specific growing requirements and seasonality. Its unique taste and texture make it a favorite among many people. However, it's important to note that there could potentially be other fruits that also fit the given description, as taste and texture can vary significantly among fruit varieties.""",
72
+ "response": """From your clues, it seems that you might be referring to a 'honeycrisp apple'. This fruit is known for its sweetness and crunchiness, but it isn't typically seen in many grocery stores. Many people do enjoy its unique taste and texture. However, without more specific details, there could be other fruits that also match your description. Some fruits can share similar characteristics like sweetness and crunchiness.""",
73
+ },
74
+ "Logical_Coherence": {
75
+ "prompt": """You are assessing a chat bot response to a user's input based on the logical coherence of the response.\n
76
+ Score:
77
+ A score of 1 means the response is illogical, contains internal contradictions, or has significant jumps in reasoning that make it hard to follow.
78
+ A score of 3 means the response follows some logical flow but has minor inconsistencies or gaps in reasoning.
79
+ A score of 5 means the response is completely logically coherent, with clear connections between ideas and a natural progression of thought.\n
80
+ Here is the data:
81
+ [BEGIN DATA]
82
+ ***
83
+ [User Query]: {{input}}
84
+ ***
85
+ [Response]: {{response}}
86
+ ***
87
+ [END DATA]""",
88
+ # Example from RewardBench
89
+ "input": "How did US states get their names?",
90
+ "response": "US states got their names for a variety of reasons. Many are named after early explorers, leaders, or patriots. Some are named after geographical features, such as mountains or rivers. Others are named after plants or animals that are native to the state. Some are even named after historical figures or characters from literature or mythology.",
91
+ },
92
+ "Faithfulness": {
93
+ "prompt": """You are assessing a chat bot response to a user's input based on the faithfulness of the response to the context retrieved.\n
94
+ Score:
95
+ A score of 1 means that the response contains statements that directly contradict the context or are entirely unsupported by it.
96
+ A score of 3 means that the response contains some statements that are not directly supported by the context.
97
+ A score of 5 means that the response is fully grounded in the context, with all statements accurately reflecting the provided information.\n
98
+ Here is the data:
99
+ [BEGIN DATA]
100
+ ***
101
+ [User Query]: {{input}}
102
+ ***
103
+ [Retrieved Context]: {{context}}
104
+ ***
105
+ [Response]: {{response}}
106
+ ***
107
+ [END DATA]""",
108
+ # Example from RAGTruth
109
+ "input": "How to prepare to get an ultrasound?",
110
+ "context": """Passage 1: 1 If you are just interested in checking the gender, you can go as early as 17 weeks. 2 Any earlier, and the results may not be accurate, leading to an extra coat of paint for your bedroom. 3 To get good pictures, 3D ultrasounds are best performed between 20 and 32 weeks, and the best pictures are between 24 and 30 weeks.""",
111
+ "response": """ Based on the given passages, here’s how to prepare for an ultrasound: For a gender reveal ultrasound, wait until at least 17 weeks of pregnancy to ensure accurate results. For 3D ultrasounds, schedule the appointment between 20 and 32 weeks of pregnancy for the best pictures""",
112
+ },
113
+ }
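Each template above marks its inputs with double-curly-brace placeholders ({{input}}, {{ground_truth}}, {{response}}, {{context}}) whose names match the remaining keys of the metric entry, so a sample prompt can be assembled with a plain string replace. A rough sketch, assuming the module is importable as `example_metrics`; the helper name is ours:

```python
from example_metrics import EXAMPLE_METRICS


def fill_template(metric_name: str) -> str:
    """Substitute a metric's sample fields into its {{...}} placeholders."""
    metric = EXAMPLE_METRICS[metric_name]
    prompt = metric["prompt"]
    for key, value in metric.items():
        if key != "prompt":
            prompt = prompt.replace("{{" + key + "}}", value)
    return prompt


# Produces a fully specified judge prompt for the ARC-based hallucination sample.
print(fill_template("Hallucination"))
```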
gen_api_answer.py CHANGED
@@ -11,6 +11,7 @@ together_client = Together()
11
 
12
  SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
13
 
 
14
  def get_openai_response(model_name, prompt):
15
  """Get response from OpenAI API"""
16
  try:
@@ -18,13 +19,14 @@ def get_openai_response(model_name, prompt):
18
  model=model_name,
19
  messages=[
20
  {"role": "system", "content": SYSTEM_PROMPT},
21
- {"role": "user", "content": prompt}
22
- ]
23
  )
24
  return response.choices[0].message.content
25
  except Exception as e:
26
  return f"Error with OpenAI model {model_name}: {str(e)}"
27
 
 
28
  def get_anthropic_response(model_name, prompt):
29
  """Get response from Anthropic API"""
30
  try:
@@ -33,14 +35,13 @@ def get_anthropic_response(model_name, prompt):
33
  max_tokens=1000,
34
  temperature=0,
35
  system=SYSTEM_PROMPT,
36
- messages=[
37
- {"role": "user", "content": [{"type": "text", "text": prompt}]}
38
- ]
39
  )
40
  return response.content[0].text
41
  except Exception as e:
42
  return f"Error with Anthropic model {model_name}: {str(e)}"
43
 
 
44
  def get_together_response(model_name, prompt):
45
  """Get response from Together API"""
46
  try:
@@ -48,52 +49,54 @@ def get_together_response(model_name, prompt):
48
  model=model_name,
49
  messages=[
50
  {"role": "system", "content": SYSTEM_PROMPT},
51
- {"role": "user", "content": prompt}
52
  ],
53
- stream=False
54
  )
55
  return response.choices[0].message.content
56
  except Exception as e:
57
  return f"Error with Together model {model_name}: {str(e)}"
58
 
 
59
  def get_model_response(model_name, model_info, prompt):
60
  """Get response from appropriate API based on model organization"""
61
  if not model_info:
62
  return "Model not found or unsupported."
63
-
64
- api_model = model_info['api_model']
65
- organization = model_info['organization']
66
-
67
  try:
68
- if organization == 'OpenAI':
69
  return get_openai_response(api_model, prompt)
70
- elif organization == 'Anthropic':
71
  return get_anthropic_response(api_model, prompt)
72
  else:
73
  # All other organizations use Together API
74
  return get_together_response(api_model, prompt)
75
  except Exception as e:
76
- return f"Error with {organization} model {model_name}: {str(e)}"
 
77
 
78
  def parse_model_response(response):
79
  try:
80
  # Debug print
81
  print(f"Raw model response: {response}")
82
-
83
  # First try to parse the entire response as JSON
84
  try:
85
  data = json.loads(response)
86
- return str(data.get('result', 'N/A')), data.get('feedback', 'N/A')
87
  except json.JSONDecodeError:
88
  # If that fails (typically for smaller models), try to find JSON within the response
89
- json_match = re.search(r'{.*}', response)
90
  if json_match:
91
  data = json.loads(json_match.group(0))
92
- return str(data.get('result', 'N/A')), data.get('feedback', 'N/A')
93
  else:
94
- return 'Error', f"Failed to parse response: {response}"
95
-
96
  except Exception as e:
97
  # Debug print for error case
98
  print(f"Failed to parse response: {str(e)}")
99
- return 'Error', f"Failed to parse response: {response}"
 
11
 
12
  SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
13
 
14
+
15
  def get_openai_response(model_name, prompt):
16
  """Get response from OpenAI API"""
17
  try:
 
19
  model=model_name,
20
  messages=[
21
  {"role": "system", "content": SYSTEM_PROMPT},
22
+ {"role": "user", "content": prompt},
23
+ ],
24
  )
25
  return response.choices[0].message.content
26
  except Exception as e:
27
  return f"Error with OpenAI model {model_name}: {str(e)}"
28
 
29
+
30
  def get_anthropic_response(model_name, prompt):
31
  """Get response from Anthropic API"""
32
  try:
 
35
  max_tokens=1000,
36
  temperature=0,
37
  system=SYSTEM_PROMPT,
38
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
 
 
39
  )
40
  return response.content[0].text
41
  except Exception as e:
42
  return f"Error with Anthropic model {model_name}: {str(e)}"
43
 
44
+
45
  def get_together_response(model_name, prompt):
46
  """Get response from Together API"""
47
  try:
 
49
  model=model_name,
50
  messages=[
51
  {"role": "system", "content": SYSTEM_PROMPT},
52
+ {"role": "user", "content": prompt},
53
  ],
54
+ stream=False,
55
  )
56
  return response.choices[0].message.content
57
  except Exception as e:
58
  return f"Error with Together model {model_name}: {str(e)}"
59
 
60
+
61
  def get_model_response(model_name, model_info, prompt):
62
  """Get response from appropriate API based on model organization"""
63
  if not model_info:
64
  return "Model not found or unsupported."
65
+
66
+ api_model = model_info["api_model"]
67
+ organization = model_info["organization"]
68
+
69
  try:
70
+ if organization == "OpenAI":
71
  return get_openai_response(api_model, prompt)
72
+ elif organization == "Anthropic":
73
  return get_anthropic_response(api_model, prompt)
74
  else:
75
  # All other organizations use Together API
76
  return get_together_response(api_model, prompt)
77
  except Exception as e:
78
+ return f"Error with {organization} model {model_name}: {str(e)}"
79
+
80
 
81
  def parse_model_response(response):
82
  try:
83
  # Debug print
84
  print(f"Raw model response: {response}")
85
+
86
  # First try to parse the entire response as JSON
87
  try:
88
  data = json.loads(response)
89
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
90
  except json.JSONDecodeError:
91
  # If that fails (typically for smaller models), try to find JSON within the response
92
+ json_match = re.search(r"{.*}", response)
93
  if json_match:
94
  data = json.loads(json_match.group(0))
95
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
96
  else:
97
+ return "Error", f"Failed to parse response: {response}"
98
+
99
  except Exception as e:
100
  # Debug print for error case
101
  print(f"Failed to parse response: {str(e)}")
102
+ return "Error", f"Failed to parse response: {response}"
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- gradio>=4.19.2
2
- openai>=1.12.0
3
- anthropic>=0.18.1
4
- pandas>=2.2.1
5
- together>=0.2.8
 
 
1
+ pymongo
2
+ gradio
3
+ python-dotenv
4
+ openai
5
+ anthropic
6
+ together
utils.py ADDED
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+ from datetime import datetime
3
+ import logging
4
+
5
+
6
+ def get_logger(sink_name: str = "core_utils") -> logging.Logger:
7
+ logging.basicConfig(
8
+ format="%(asctime)s,%(msecs)03d %(levelname)-8s "
9
+ "[%(filename)s:%(lineno)d] %(message)s",
10
+ datefmt="%Y-%m-%d:%H:%M:%S",
11
+ level=logging.INFO,
12
+ force=True,
13
+ )
14
+ logger = logging.getLogger(sink_name)
15
+ return logger
16
+
17
+
18
+ @dataclass
19
+ class Vote:
20
+ timestamp: str
21
+ prompt: str
22
+ response_a: str
23
+ response_b: str
24
+ model_a: str
25
+ model_b: str
26
+ winner: str
27
+ judge_id: str
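Finally, a small sketch of how the logger and the Vote dataclass above fit together; the field values are placeholders, not data from the app.

```python
from datetime import datetime, timezone

from utils import Vote, get_logger

logger = get_logger("example")

vote = Vote(
    timestamp=datetime.now(timezone.utc).isoformat(),
    prompt="Score the response for logical coherence.",
    response_a='{"feedback": "Clear progression of ideas.", "result": 5}',
    response_b='{"feedback": "Some gaps in reasoning.", "result": 3}',
    model_a="judge-a",
    model_b="judge-b",
    winner="A",
    judge_id="user42",
)

# vote.__dict__ is the document shape that db.add_vote() inserts.
logger.info("Recorded vote: %s", vote.__dict__)
```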