saattrupdan committed
Commit 734648f
1 Parent(s): c34e772

fix: Update win ratios to take ranks into account

Files changed (1): app.py (+96 -53)
app.py CHANGED
@@ -232,52 +232,6 @@ DATASETS = [
 ]
 
 
-def update_colour_mapping(results_dfs: dict[Language, pd.DataFrame]) -> None:
-    """Get a mapping from model ids to RGB triplets.
-
-    Args:
-        results_dfs:
-            The results dataframes for each language.
-    """
-    global colour_mapping
-    global seed
-    seed += 1
-
-    gr.Info(f"Updating colour mapping...")
-
-    # Get distinct RGB values for all models
-    all_models = list(
-        {model_id for df in results_dfs.values() for model_id in df.index}
-    )
-    colour_mapping = dict()
-
-    for i in it.count():
-        min_colour_distance = MIN_COLOUR_DISTANCE_BETWEEN_MODELS - i
-        retries_left = 10 * len(all_models)
-        for model_id in all_models:
-            random.seed(hash(model_id) + i + seed)
-            r, g, b = 0, 0, 0
-            too_bright, similar_to_other_model = True, True
-            while (too_bright or similar_to_other_model) and retries_left > 0:
-                r, g, b = tuple(random.randint(0, 255) for _ in range(3))
-                too_bright = np.min([r, g, b]) > 200
-                similar_to_other_model = any(
-                    np.abs(
-                        np.array(colour) - np.array([r, g, b])
-                    ).sum() < min_colour_distance
-                    for colour in colour_mapping.values()
-                )
-                retries_left -= 1
-            colour_mapping[model_id] = (r, g, b)
-
-        if retries_left:
-            logger.info(
-                f"Successfully found a colour mapping with min colour distance "
-                f"{min_colour_distance}."
-            )
-            break
-
-
 def main() -> None:
     """Produce a radial plot."""
 
@@ -560,26 +514,61 @@ def produce_radial_plot(
         if all(task in df.columns for df in results_dfs_filtered.values())
     ]
 
+
+    logger.info("Computing win ratios...")
+    all_win_ratios: dict[Task, dict[Language, dict[str, float]]] = {
+        task: {
+            language: dict()
+            for language in languages
+        }
+        for task in tasks
+    }
+    for task in tasks:
+        for language in languages:
+            df = results_dfs_filtered[language][task].dropna()
+            model_ids_sorted: list[str] = (
+                df.map(np.mean).sort_values(ascending=False).index.tolist()
+            )
+            ranks = list()
+            rank = 0
+            best_scores = None
+            for model_id in model_ids_sorted:
+                if best_scores is None:
+                    best_scores = df.loc[model_id]
+                    rank = 1
+                else:
+                    scores = df.loc[model_id]
+                    worse_than_previous_models = stats.ttest_rel(
+                        a=best_scores, b=scores, alternative="greater"
+                    ).pvalue < 0.05
+                    if worse_than_previous_models:
+                        rank += 1
+                        best_scores = scores
+                ranks.append(rank)
+
+            for model_id, rank in zip(model_ids_sorted, ranks):
+                pct_models_with_higher_rank = np.mean(np.asarray(ranks) >= rank)
+                all_win_ratios[task][language][model_id] = pct_models_with_higher_rank
+    logger.info("Successfully computed win ratios.")
+
     # Add all the evaluation results for each model
     results: list[list[float]] = list()
     for model_id in model_ids:
         result_list = list()
         for task in tasks:
+
             win_ratios = list()
             scores = list()
             for language in languages:
                 if model_id not in results_dfs_filtered[language].index:
                     continue
+
                 score_list = results_dfs_filtered[language].loc[model_id][task]
-                win_ratio = 100 * np.mean([
-                    stats.ttest_rel(
-                        a=score_list, b=other_scores, alternative="greater"
-                    ).pvalue < 0.05
-                    for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
-                ])
+
+                win_ratio = 100 * all_win_ratios[task][language][model_id]
                 win_ratios.append(win_ratio)
 
-                if all(score < 1 for score in score_list):
+                if np.mean(score_list) < 1:
                     score_list = [100 * score for score in score_list]
 
                 scores.append(np.mean(score_list))
@@ -645,6 +634,7 @@ def produce_radial_plot(
 
     return fig
 
+
 def fetch_results() -> dict[Language, pd.DataFrame]:
     """Fetch the results from the ScandEval benchmark.
 
@@ -674,6 +664,12 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
     data_dict = defaultdict(dict)
     for record in records:
         model_name = record["model"]
+
+        # Manual fix for OpenAI models: Only keep the validation split results
+        if "gpt-3.5" in model_name or "gpt-4" in model_name:
+            if not record.get("validation_split", False):
+                continue
+
         dataset_name = record["dataset"]
         if dataset_name in possible_dataset_names:
             dataset = next(
@@ -702,5 +698,52 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
 
     return results_dfs
 
+
+def update_colour_mapping(results_dfs: dict[Language, pd.DataFrame]) -> None:
+    """Get a mapping from model ids to RGB triplets.
+
+    Args:
+        results_dfs:
+            The results dataframes for each language.
+    """
+    global colour_mapping
+    global seed
+    seed += 1
+
+    gr.Info(f"Updating colour mapping...")
+
+    # Get distinct RGB values for all models
+    all_models = list(
+        {model_id for df in results_dfs.values() for model_id in df.index}
+    )
+    colour_mapping = dict()
+
+    for i in it.count():
+        min_colour_distance = MIN_COLOUR_DISTANCE_BETWEEN_MODELS - i
+        retries_left = 10 * len(all_models)
+        for model_id in all_models:
+            random.seed(hash(model_id) + i + seed)
+            r, g, b = 0, 0, 0
+            too_bright, similar_to_other_model = True, True
+            while (too_bright or similar_to_other_model) and retries_left > 0:
+                r, g, b = tuple(random.randint(0, 255) for _ in range(3))
+                too_bright = np.min([r, g, b]) > 200
+                similar_to_other_model = any(
+                    np.abs(
+                        np.array(colour) - np.array([r, g, b])
+                    ).sum() < min_colour_distance
+                    for colour in colour_mapping.values()
+                )
+                retries_left -= 1
+            colour_mapping[model_id] = (r, g, b)
+
+        if retries_left:
+            logger.info(
+                f"Successfully found a colour mapping with min colour distance "
+                f"{min_colour_distance}."
+            )
+            break
+
+
 if __name__ == "__main__":
     main()
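
For reference, a minimal standalone sketch of the rank-based win ratio added above, run on toy data. The helper name rank_based_win_ratios and the score lists are illustrative, not part of app.py; only the ranking logic mirrors the new code: models are sorted by mean score, a model's rank increases only when a one-sided paired t-test finds it significantly worse than the model anchoring the current rank, and the win ratio is the share of models ranked no better than it.

# Sketch of the rank-based win ratio introduced in this commit, on toy data.
import numpy as np
from scipy import stats


def rank_based_win_ratios(scores_per_model: dict[str, list[float]]) -> dict[str, float]:
    """Rank models with one-sided paired t-tests, then compute win ratios."""
    # Sort models by mean score, best first
    model_ids_sorted = sorted(
        scores_per_model, key=lambda m: np.mean(scores_per_model[m]), reverse=True
    )

    ranks: list[int] = []
    rank = 0
    best_scores: list[float] | None = None
    for model_id in model_ids_sorted:
        if best_scores is None:
            # The top model anchors rank 1
            best_scores = scores_per_model[model_id]
            rank = 1
        else:
            scores = scores_per_model[model_id]
            # Bump the rank only if this model is significantly worse than
            # the model anchoring the current rank
            worse = stats.ttest_rel(
                a=best_scores, b=scores, alternative="greater"
            ).pvalue < 0.05
            if worse:
                rank += 1
                best_scores = scores
        ranks.append(rank)

    # Win ratio = share of models ranked no better than this one; the
    # comparison includes the model itself, so rank 1 yields 1.0
    rank_arr = np.asarray(ranks)
    return {
        model_id: float(np.mean(rank_arr >= rank))
        for model_id, rank in zip(model_ids_sorted, ranks)
    }


if __name__ == "__main__":
    toy_scores = {
        "model-a": [0.81, 0.83, 0.82, 0.84, 0.80],  # clearly ahead -> rank 1
        "model-b": [0.71, 0.70, 0.73, 0.69, 0.72],  # rank 2
        "model-c": [0.70, 0.71, 0.72, 0.70, 0.71],  # ties model-b -> rank 2
    }
    # {'model-a': 1.0, 'model-b': 0.666..., 'model-c': 0.666...}
    print(rank_based_win_ratios(toy_scores))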
 
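For contrast, the computation this commit removes scored each model pairwise: the percentage of other models whose scores it beat in a one-sided paired t-test. A standalone reconstruction of that removed logic under the same assumptions; the function name pairwise_win_ratio and the toy call are illustrative.

# Sketch of the pairwise win ratio that this commit replaces.
import numpy as np
from scipy import stats


def pairwise_win_ratio(
    own_scores: list[float], other_models: list[list[float]]
) -> float:
    """Percentage of other models whose scores this model significantly beats."""
    wins = [
        stats.ttest_rel(a=own_scores, b=other, alternative="greater").pvalue < 0.05
        for other in other_models
    ]
    return 100 * float(np.mean(wins))


if __name__ == "__main__":
    own = [0.81, 0.83, 0.82, 0.84, 0.80]
    others = [
        [0.71, 0.70, 0.73, 0.69, 0.72],  # clearly beaten
        [0.80, 0.82, 0.83, 0.83, 0.81],  # statistically indistinguishable
    ]
    print(pairwise_win_ratio(own, others))  # 50.0: beats one of the two

Under the pairwise scheme, statistically tied models can still end up with different percentages, since each only "wins" against the models it significantly beats; the rank-based version gives tied models the same rank and hence the same win ratio.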