saattrupdan committed on
Commit
9a46da5
1 Parent(s): 65f7993

feat: Use t-tests to determine win ratios

Browse files
Files changed (2) hide show
  1. app.py +26 -13
  2. requirements.txt +1 -0
app.py CHANGED
@@ -12,6 +12,8 @@ import requests
12
  import random
13
  import logging
14
  import datetime as dt
 
 
15
 
16
 
17
  fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
@@ -408,13 +410,19 @@ def produce_radial_plot(
408
  for language in languages:
409
  if model_id not in results_dfs_filtered[language].index:
410
  continue
411
- score = results_dfs_filtered[language].loc[model_id][task]
412
  win_ratio = 100 * np.mean([
413
- score >= other_score
414
- for other_score in results_dfs_filtered[language][task].dropna()
 
 
415
  ])
416
  win_ratios.append(win_ratio)
417
- scores.append(score)
 
 
 
 
418
  if use_win_ratio:
419
  result_list.append(np.mean(win_ratios))
420
  else:
@@ -515,18 +523,23 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
515
  dataset = next(
516
  dataset for dataset in DATASETS if dataset.name == dataset_name
517
  )
518
- results_dict = record['results']['total']
519
- score = results_dict.get(
520
- f"test_{dataset.task.metric}", results_dict.get(dataset.task.metric)
521
- )
 
 
 
522
  if dataset.task in data_dict[model_name]:
523
- data_dict[model_name][dataset.task].append(score)
524
  else:
525
- data_dict[model_name][dataset.task] = [score]
526
  results_df = pd.DataFrame(data_dict).T.map(
527
- lambda list_or_nan:
528
- np.mean(list_or_nan) if list_or_nan == list_or_nan else list_or_nan
529
- ).dropna()
 
 
530
  results_dfs[language] = results_df
531
 
532
  logger.info("Successfully fetched results from ScandEval benchmark.")
 
12
  import random
13
  import logging
14
  import datetime as dt
15
+ import scipy.stats as stats
16
+ import itertools as it
17
 
18
 
19
  fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
 
410
  for language in languages:
411
  if model_id not in results_dfs_filtered[language].index:
412
  continue
413
+ score_list = results_dfs_filtered[language].loc[model_id][task]
414
  win_ratio = 100 * np.mean([
415
+ stats.ttest_rel(
416
+ a=score_list, b=other_scores, alternative="greater"
417
+ ).pvalue < 0.05
418
+ for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
419
  ])
420
  win_ratios.append(win_ratio)
421
+
422
+ if all(score < 1 for score in score_list):
423
+ score_list = [100 * score for score in score_list]
424
+
425
+ scores.append(np.mean(score_list))
426
  if use_win_ratio:
427
  result_list.append(np.mean(win_ratios))
428
  else:
 
523
  dataset = next(
524
  dataset for dataset in DATASETS if dataset.name == dataset_name
525
  )
526
+ scores = [
527
+ test_score_dict.get(
528
+ f"test_{dataset.task.metric}",
529
+ test_score_dict.get(dataset.task.metric)
530
+ )
531
+ for test_score_dict in record["results"]["raw"]["test"]
532
+ ]
533
  if dataset.task in data_dict[model_name]:
534
+ data_dict[model_name][dataset.task].append(scores)
535
  else:
536
+ data_dict[model_name][dataset.task] = [scores]
537
  results_df = pd.DataFrame(data_dict).T.map(
538
+ lambda lists_or_nan:
539
+ list(it.chain(lists_or_nan))
540
+ if lists_or_nan == lists_or_nan
541
+ else lists_or_nan
542
+ ).dropna().map(lambda lst: lst[0])
543
  results_dfs[language] = results_df
544
 
545
  logger.info("Successfully fetched results from ScandEval benchmark.")
requirements.txt CHANGED
@@ -52,6 +52,7 @@ requests==2.31.0
52
  rich==13.7.0
53
  rpds-py==0.17.1
54
  ruff==0.1.14
 
55
  semantic-version==2.10.0
56
  shellingham==1.5.4
57
  six==1.16.0
 
52
  rich==13.7.0
53
  rpds-py==0.17.1
54
  ruff==0.1.14
55
+ scipy==1.12.0
56
  semantic-version==2.10.0
57
  shellingham==1.5.4
58
  six==1.16.0