saattrupdan
committed on
Commit
•
9a46da5
1
Parent(s):
65f7993
feat: Use t-tests to determine win ratios
Browse files
- app.py +26 -13
- requirements.txt +1 -0
app.py
CHANGED
@@ -12,6 +12,8 @@ import requests
|
|
12 |
import random
|
13 |
import logging
|
14 |
import datetime as dt
|
|
|
|
|
15 |
|
16 |
|
17 |
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
|
@@ -408,13 +410,19 @@ def produce_radial_plot(
|
|
408 |
for language in languages:
|
409 |
if model_id not in results_dfs_filtered[language].index:
|
410 |
continue
|
411 |
-
|
412 |
win_ratio = 100 * np.mean([
|
413 |
-
|
414 |
-
|
|
|
|
|
415 |
])
|
416 |
win_ratios.append(win_ratio)
|
417 |
-
|
|
|
|
|
|
|
|
|
418 |
if use_win_ratio:
|
419 |
result_list.append(np.mean(win_ratios))
|
420 |
else:
|
@@ -515,18 +523,23 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
515 |
dataset = next(
|
516 |
dataset for dataset in DATASETS if dataset.name == dataset_name
|
517 |
)
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
|
|
|
|
|
|
522 |
if dataset.task in data_dict[model_name]:
|
523 |
-
data_dict[model_name][dataset.task].append(
|
524 |
else:
|
525 |
-
data_dict[model_name][dataset.task] = [
|
526 |
results_df = pd.DataFrame(data_dict).T.map(
|
527 |
-
lambda
|
528 |
-
|
529 |
-
|
|
|
|
|
530 |
results_dfs[language] = results_df
|
531 |
|
532 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
|
|
12 |
import random
|
13 |
import logging
|
14 |
import datetime as dt
|
15 |
+
import scipy.stats as stats
|
16 |
+
import itertools as it
|
17 |
|
18 |
|
19 |
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
|
|
|
410 |
for language in languages:
|
411 |
if model_id not in results_dfs_filtered[language].index:
|
412 |
continue
|
413 |
+
score_list = results_dfs_filtered[language].loc[model_id][task]
|
414 |
win_ratio = 100 * np.mean([
|
415 |
+
stats.ttest_rel(
|
416 |
+
a=score_list, b=other_scores, alternative="greater"
|
417 |
+
).pvalue < 0.05
|
418 |
+
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
419 |
])
|
420 |
win_ratios.append(win_ratio)
|
421 |
+
|
422 |
+
if all(score < 1 for score in score_list):
|
423 |
+
score_list = [100 * score for score in score_list]
|
424 |
+
|
425 |
+
scores.append(np.mean(score_list))
|
426 |
if use_win_ratio:
|
427 |
result_list.append(np.mean(win_ratios))
|
428 |
else:
|
|
|
523 |
dataset = next(
|
524 |
dataset for dataset in DATASETS if dataset.name == dataset_name
|
525 |
)
|
526 |
+
scores = [
|
527 |
+
test_score_dict.get(
|
528 |
+
f"test_{dataset.task.metric}",
|
529 |
+
test_score_dict.get(dataset.task.metric)
|
530 |
+
)
|
531 |
+
for test_score_dict in record["results"]["raw"]["test"]
|
532 |
+
]
|
533 |
if dataset.task in data_dict[model_name]:
|
534 |
+
data_dict[model_name][dataset.task].append(scores)
|
535 |
else:
|
536 |
+
data_dict[model_name][dataset.task] = [scores]
|
537 |
results_df = pd.DataFrame(data_dict).T.map(
|
538 |
+
lambda lists_or_nan:
|
539 |
+
list(it.chain(lists_or_nan))
|
540 |
+
if lists_or_nan == lists_or_nan
|
541 |
+
else lists_or_nan
|
542 |
+
).dropna().map(lambda lst: lst[0])
|
543 |
results_dfs[language] = results_df
|
544 |
|
545 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
requirements.txt
CHANGED
@@ -52,6 +52,7 @@ requests==2.31.0
|
|
52 |
rich==13.7.0
|
53 |
rpds-py==0.17.1
|
54 |
ruff==0.1.14
|
|
|
55 |
semantic-version==2.10.0
|
56 |
shellingham==1.5.4
|
57 |
six==1.16.0
|
|
|
52 |
rich==13.7.0
|
53 |
rpds-py==0.17.1
|
54 |
ruff==0.1.14
|
55 |
+
scipy==1.12.0
|
56 |
semantic-version==2.10.0
|
57 |
shellingham==1.5.4
|
58 |
six==1.16.0
|