saattrupdan committed
Commit 5f70754
Parent(s): 734648f

feat: Update app with log rank scores

Files changed (1)
  1. app.py +32 -27
app.py CHANGED
@@ -43,11 +43,14 @@ The generative models are evaluated using in-context learning with few-shot prom
 The few-shot examples are sampled randomly from the training split, and we benchmark
 the models 10 times with bootstrapped test sets and different few-shot examples in each
 iteration. This allows us to better measure the uncertainty of the results. We use the
- uncertainty in the radial plot when we compute the win ratios (i.e., the percentage of
- other models that a model beats on a task). Namely, we compute the win ratio as the
- percentage of other models that a model _significantly_ beats on a task, where we use a
- paired t-test with a significance level of 0.05 to determine whether a model
- significantly beats another model.
+ uncertainty in the radial plot when we compute the rank scores for the models. Namely,
+ we compute the rank score by first computing the rank of the model on each task,
+ where two models are considered to have the same rank if there is no statistically
+ significant difference between their scores (one-tailed t-test with p < 0.05). We then
+ apply a logarithmic transformation to the ranks to downplay the importance of the
+ poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+ range [0, 1], so that the best-performing models end up with large rank scores and the
+ worst-performing models with small rank scores.
 
 ## The Benchmark Datasets
 
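To make the added description concrete, here is a minimal, self-contained sketch of that pipeline. The function name `compute_rank_scores`, the input format (one list of bootstrapped scores per model) and the use of scipy's independent one-tailed t-test are illustrative assumptions; the app's own ranking loop lives outside the hunks shown in this commit and may differ in detail.

```python
import numpy as np
from scipy import stats


def compute_rank_scores(bootstrap_scores: dict[str, list[float]]) -> dict[str, float]:
    """Turn bootstrapped scores per model into rank scores in [0, 1] (sketch)."""
    # Sort the models from best to worst mean score
    model_ids = sorted(bootstrap_scores, key=lambda m: -np.mean(bootstrap_scores[m]))

    # Assign ranks, giving a model the same rank as the current rank group when
    # the group's representative does not significantly beat it
    # (one-tailed t-test, p < 0.05)
    ranks: list[int] = []
    rank = 1
    best_scores = bootstrap_scores[model_ids[0]]
    for model_id in model_ids:
        scores = bootstrap_scores[model_id]
        p_value = stats.ttest_ind(best_scores, scores, alternative="greater").pvalue
        if p_value < 0.05:
            rank += 1
            best_scores = scores
        ranks.append(rank)

    # Log-transform the ranks to downplay poorly performing models, normalise by
    # the largest log-rank and invert, so rank 1 maps to a rank score of 1
    log_ranks = np.log(ranks)
    if log_ranks.max() == 0:  # edge case: every model is tied at rank 1
        return {model_id: 1.0 for model_id in model_ids}
    rank_scores = 1 - log_ranks / log_ranks.max()
    return dict(zip(model_ids, rank_scores))
```

With, say, three models where the top two are statistically tied and the third is clearly worse, this yields rank scores of 1.0, 1.0 and 0.0, which the radial plot then scales to percentages and averages over languages, as the later hunks below show.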
@@ -276,8 +279,9 @@ def main() -> None:
 scale=2,
 )
 with gr.Row():
- use_win_ratio_checkbox = gr.Checkbox(
- label="Compare models with win ratios (as opposed to raw scores)",
+ use_rank_score_checkbox = gr.Checkbox(
+ label="Compare models with rank scores (as opposed to raw "
+ "scores)",
 value=True,
 interactive=True,
 scale=1,
@@ -316,7 +320,7 @@ def main() -> None:
 value=produce_radial_plot(
 model_ids_dropdown.value,
 language_names=language_names_dropdown.value,
- use_win_ratio=use_win_ratio_checkbox.value,
+ use_rank_score=use_rank_score_checkbox.value,
 show_scale=show_scale_checkbox.value,
 plot_width=plot_width_slider.value,
 plot_height=plot_height_slider.value,
@@ -346,7 +350,7 @@ def main() -> None:
 inputs=[
 model_ids_dropdown,
 language_names_dropdown,
- use_win_ratio_checkbox,
+ use_rank_score_checkbox,
 show_scale_checkbox,
 plot_width_slider,
 plot_height_slider,
@@ -355,7 +359,7 @@ def main() -> None:
 )
 language_names_dropdown.change(**update_plot_kwargs)
 model_ids_dropdown.change(**update_plot_kwargs)
- use_win_ratio_checkbox.change(**update_plot_kwargs)
+ use_rank_score_checkbox.change(**update_plot_kwargs)
 show_scale_checkbox.change(**update_plot_kwargs)
 plot_width_slider.change(**update_plot_kwargs)
 plot_height_slider.change(**update_plot_kwargs)
@@ -453,7 +457,7 @@ def update_model_ids_dropdown(
 def produce_radial_plot(
 model_ids: list[str],
 language_names: list[str],
- use_win_ratio: bool,
+ use_rank_score: bool,
 show_scale: bool,
 plot_width: int,
 plot_height: int,
@@ -466,8 +470,8 @@ def produce_radial_plot(
 The ids of the models to include in the plot.
 language_names:
 The names of the languages to include in the plot.
- use_win_ratio:
- Whether to use win ratios (as opposed to raw scores).
+ use_rank_score:
+ Whether to use rank scores (as opposed to raw scores).
 show_scale:
 Whether to show the scale on the plot.
 plot_width:
@@ -515,8 +519,8 @@ def produce_radial_plot(
 ]
 
 
- logger.info("Computing win ratios...")
- all_win_ratios: dict[Task, dict[Language, dict[str, float]]] = {
+ logger.info("Computing rank scores...")
+ all_rank_scores: dict[Task, dict[Language, dict[str, float]]] = {
 task: {
 language: dict()
 for language in languages
@@ -546,10 +550,11 @@ def produce_radial_plot(
 best_scores = scores
 ranks.append(rank)
 
- for model_id, rank in zip(model_ids_sorted, ranks):
- pct_models_with_higher_rank = np.mean(np.asarray(ranks) >= rank)
- all_win_ratios[task][language][model_id] = pct_models_with_higher_rank
- logger.info("Successfully computed win ratios.")
+ log_ranks = np.log(ranks)
+ scores = log_ranks / log_ranks.max()
+ for model_id, score in zip(model_ids_sorted, scores):
+ all_rank_scores[task][language][model_id] = 1 - score
+ logger.info("Successfully computed rank scores.")
 
 # Add all the evaluation results for each model
 results: list[list[float]] = list()
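For intuition, the added lines map ranks to scores as in the short sketch below; the rank values are made up for illustration.

```python
import numpy as np

# Hypothetical ranks for four models: the second and third are statistically
# tied, and the fourth is significantly worse again
ranks = [1, 2, 2, 4]

log_ranks = np.log(ranks)             # ≈ [0., 0.693, 0.693, 1.386]
scores = log_ranks / log_ranks.max()  # ≈ [0., 0.5, 0.5, 1.]
print(1 - scores)                     # -> [1.  0.5 0.5 0. ]
```

The log transform means that dropping from rank 1 to rank 2 costs much more rank score than dropping from rank 9 to rank 10, which matches the stated goal of downplaying the poorly performing models.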
@@ -557,7 +562,7 @@ def produce_radial_plot(
 result_list = list()
 for task in tasks:
 
- win_ratios = list()
+ rank_scores = list()
 scores = list()
 for language in languages:
 if model_id not in results_dfs_filtered[language].index:
@@ -565,15 +570,15 @@ def produce_radial_plot(
 
 score_list = results_dfs_filtered[language].loc[model_id][task]
 
- win_ratio = 100 * all_win_ratios[task][language][model_id]
- win_ratios.append(win_ratio)
+ rank_score = 100 * all_rank_scores[task][language][model_id]
+ rank_scores.append(rank_score)
 
 if np.mean(score_list) < 1:
 score_list = [100 * score for score in score_list]
 
 scores.append(np.mean(score_list))
- if use_win_ratio:
- result_list.append(np.mean(win_ratios))
+ if use_rank_score:
+ result_list.append(np.mean(rank_scores))
 else:
 result_list.append(np.mean(scores))
 results.append(result_list)
@@ -616,10 +621,10 @@ def produce_radial_plot(
 languages_str += " and "
 languages_str += languages[-1].name
 
- if use_win_ratio:
- title = f'Win Ratio on on {languages_str} Language Tasks'
+ if use_rank_score:
+ title = f'Rank Score on {languages_str} Language Tasks'
 else:
- title = f'LLM Score on on {languages_str} Language Tasks'
+ title = f'Raw Score on {languages_str} Language Tasks'
 
 # Builds the radial plot from the results
 fig.update_layout(
 