saattrupdan committed
Commit 3a84b3b · 1 Parent(s): 5f70754
chore: Small update

app.py CHANGED
@@ -45,12 +45,12 @@ the models 10 times with bootstrapped test sets and different few-shot examples
 iteration. This allows us to better measure the uncertainty of the results. We use the
 uncertainty in the radial plot when we compute the rank scores for the models. Namely,
 we compute the rank score by firstly computing the rank of the model on each task,
-where two models are considered to have the same rank if
-
-
-
-
-
+where two models are considered to have the same rank if there is not a statistically
+significant difference between their scores (one-tailed t-test with p < 0.05). We next
+apply a logaritmic transformation to the ranks, to downplay the importance of the
+poorly performing models. Lastly, we invert and normalise the logaritmic ranks to the
+range [0, 1], resulting in the best performing models having rank scores close to 1 and
+the worst performing models having rank scores close to 0.
 
 ## The Benchmark Datasets
 
@@ -551,9 +551,9 @@ def produce_radial_plot(
             ranks.append(rank)
 
         log_ranks = np.log(ranks)
-        scores = log_ranks / log_ranks.max()
+        scores = 1 - (log_ranks / log_ranks.max())
         for model_id, score in zip(model_ids_sorted, scores):
-            all_rank_scores[task][language][model_id] =
+            all_rank_scores[task][language][model_id] = score
    logger.info("Successfully computed rank scores.")
 
    # Add all the evaluation results for each model
@@ -568,15 +568,13 @@ def produce_radial_plot(
            if model_id not in results_dfs_filtered[language].index:
                continue
 
-            score_list = results_dfs_filtered[language].loc[model_id][task]
-
            rank_score = 100 * all_rank_scores[task][language][model_id]
            rank_scores.append(rank_score)
 
-
-
-
-            scores.append(
+            score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
+            if score_arr.mean() < 1:
+                score_arr *= 100
+            scores.append(score_arr.mean())
            if use_rank_score:
                result_list.append(np.mean(rank_scores))
            else: