saattrupdan committed
Commit: 5f70754 · Parent: 734648f

feat: Update app with log rank scores

app.py CHANGED
@@ -43,11 +43,14 @@ The generative models are evaluated using in-context learning with few-shot prom
 The few-shot examples are sampled randomly from the training split, and we benchmark
 the models 10 times with bootstrapped test sets and different few-shot examples in each
 iteration. This allows us to better measure the uncertainty of the results. We use the
-uncertainty in the radial plot when we compute the
-
-
-
-
+uncertainty in the radial plot when we compute the rank scores for the models. Namely,
+we compute the rank score by first computing the rank of the model on each task, where
+two models are considered to have the same rank if there is no statistically
+significant difference between their scores (one-tailed t-test with p < 0.05). We next
+apply a logarithmic transformation to the ranks, to downplay the importance of the
+poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+range [0, 1], resulting in the best performing models having large rank scores and the
+worst performing models having small rank scores.
 
 ## The Benchmark Datasets
 
@@ -276,8 +279,9 @@ def main() -> None:
                 scale=2,
             )
             with gr.Row():
-
-                    label="Compare models with
+                use_rank_score_checkbox = gr.Checkbox(
+                    label="Compare models with rank scores (as opposed to raw "
+                    "scores)",
                     value=True,
                     interactive=True,
                     scale=1,
@@ -316,7 +320,7 @@ def main() -> None:
                 value=produce_radial_plot(
                     model_ids_dropdown.value,
                     language_names=language_names_dropdown.value,
-
+                    use_rank_score=use_rank_score_checkbox.value,
                     show_scale=show_scale_checkbox.value,
                     plot_width=plot_width_slider.value,
                     plot_height=plot_height_slider.value,
@@ -346,7 +350,7 @@ def main() -> None:
             inputs=[
                 model_ids_dropdown,
                 language_names_dropdown,
-
+                use_rank_score_checkbox,
                 show_scale_checkbox,
                 plot_width_slider,
                 plot_height_slider,
@@ -355,7 +359,7 @@ def main() -> None:
         )
         language_names_dropdown.change(**update_plot_kwargs)
         model_ids_dropdown.change(**update_plot_kwargs)
-
+        use_rank_score_checkbox.change(**update_plot_kwargs)
         show_scale_checkbox.change(**update_plot_kwargs)
         plot_width_slider.change(**update_plot_kwargs)
         plot_height_slider.change(**update_plot_kwargs)
@@ -453,7 +457,7 @@ def update_model_ids_dropdown(
 def produce_radial_plot(
     model_ids: list[str],
     language_names: list[str],
-
+    use_rank_score: bool,
     show_scale: bool,
     plot_width: int,
     plot_height: int,
@@ -466,8 +470,8 @@ def produce_radial_plot(
             The ids of the models to include in the plot.
         language_names:
             The names of the languages to include in the plot.
-
-            Whether to use
+        use_rank_score:
+            Whether to use rank scores (as opposed to raw scores).
         show_scale:
             Whether to show the scale on the plot.
         plot_width:
@@ -515,8 +519,8 @@ def produce_radial_plot(
     ]
 
 
-    logger.info("Computing
-
+    logger.info("Computing rank scores...")
+    all_rank_scores: dict[Task, dict[Language, dict[str, float]]] = {
         task: {
             language: dict()
             for language in languages
@@ -546,10 +550,11 @@ def produce_radial_plot(
                     best_scores = scores
                 ranks.append(rank)
 
-
-
-
-
+            log_ranks = np.log(ranks)
+            scores = log_ranks / log_ranks.max()
+            for model_id, score in zip(model_ids_sorted, scores):
+                all_rank_scores[task][language][model_id] = 1 - score
+    logger.info("Successfully computed rank scores.")
 
     # Add all the evaluation results for each model
     results: list[list[float]] = list()
@@ -557,7 +562,7 @@ def produce_radial_plot(
         result_list = list()
         for task in tasks:
 
-
+            rank_scores = list()
             scores = list()
             for language in languages:
                 if model_id not in results_dfs_filtered[language].index:
@@ -565,15 +570,15 @@ def produce_radial_plot(
 
                 score_list = results_dfs_filtered[language].loc[model_id][task]
 
-
-
+                rank_score = 100 * all_rank_scores[task][language][model_id]
+                rank_scores.append(rank_score)
 
                 if np.mean(score_list) < 1:
                     score_list = [100 * score for score in score_list]
 
                 scores.append(np.mean(score_list))
-            if
-                result_list.append(np.mean(
+            if use_rank_score:
+                result_list.append(np.mean(rank_scores))
             else:
                 result_list.append(np.mean(scores))
         results.append(result_list)
@@ -616,10 +621,10 @@ def produce_radial_plot(
             languages_str += " and "
     languages_str += languages[-1].name
 
-    if
-        title = f'
+    if use_rank_score:
+        title = f'Rank Score on {languages_str} Language Tasks'
     else:
-        title = f'
+        title = f'Raw Score on {languages_str} Language Tasks'
 
     # Builds the radial plot from the results
     fig.update_layout(
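
The rank-score procedure described in the new docstring (new lines 46-53) can be sketched end to end. The helper below is an illustration of that description, not the app's actual code: the function name, the use of scipy.stats.ttest_ind, and the exact tie-handling details are assumptions layered on top of the prose.

import numpy as np
from scipy import stats


def rank_scores(bootstrap_scores: dict[str, list[float]]) -> dict[str, float]:
    """Sketch of the rank-score computation for a single task/language pair.

    Args:
        bootstrap_scores:
            Mapping from model ID to its bootstrapped scores on the task.

    Returns:
        Mapping from model ID to a rank score in [0, 1]; higher is better.
    """
    # Sort the models by mean score, best first
    model_ids = sorted(bootstrap_scores, key=lambda m: -np.mean(bootstrap_scores[m]))

    # A model shares the previous model's rank if a one-tailed t-test cannot
    # separate their score distributions at p < 0.05 (assumed tie rule)
    ranks = [1]
    for prev, curr in zip(model_ids, model_ids[1:]):
        test = stats.ttest_ind(
            bootstrap_scores[prev], bootstrap_scores[curr], alternative="greater"
        )
        ranks.append(ranks[-1] if test.pvalue >= 0.05 else ranks[-1] + 1)

    # Log-transform to downplay differences among poorly performing models, then
    # invert and normalise so the best rank maps to 1 and the worst to 0
    log_ranks = np.log(ranks)
    if log_ranks.max() == 0:  # every model is tied at rank 1
        return {model_id: 1.0 for model_id in model_ids}
    scores = 1 - log_ranks / log_ranks.max()
    return dict(zip(model_ids, scores))

For instance, bootstrapped scores around 72, 65 and 65 for three models would yield ranks [1, 2, 2] and rank scores [1.0, 0.0, 0.0]: the two statistically tied models share a rank, and the worst rank always normalises to 0.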
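
The new use_rank_score_checkbox (new lines 282-284) is wired up like the existing controls: each component's change event re-invokes produce_radial_plot with the current values of all inputs. Below is a minimal self-contained sketch of that Gradio pattern, with toy data and made-up component values rather than the leaderboard's actual layout.

import gradio as gr
import plotly.graph_objects as go


def make_plot(use_rank_score: bool) -> go.Figure:
    # Toy stand-ins for the leaderboard's rank scores and raw scores
    values = [1.0, 0.4, 0.0] if use_rank_score else [62.0, 55.0, 48.0]
    return go.Figure(go.Bar(x=["model-a", "model-b", "model-c"], y=values))


with gr.Blocks() as demo:
    with gr.Row():
        use_rank_score_checkbox = gr.Checkbox(
            label="Compare models with rank scores (as opposed to raw scores)",
            value=True,
            interactive=True,
        )
    plot = gr.Plot(value=make_plot(True))

    # Toggling the checkbox re-renders the plot, mirroring the
    # update_plot_kwargs pattern in the diff above
    use_rank_score_checkbox.change(
        fn=make_plot, inputs=use_rank_score_checkbox, outputs=plot
    )

demo.launch()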
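
Numerically, the log transform added at new lines 553-554 compresses differences among the poorly performing models. A quick worked example with made-up ranks:

import numpy as np

ranks = [1, 2, 3, 8]                      # rank 8: a model far behind the rest
log_ranks = np.log(ranks)                 # [0.000, 0.693, 1.099, 2.079]
rank_scores = 1 - log_ranks / log_ranks.max()
print(rank_scores.round(3))               # approx. [1.0, 0.667, 0.472, 0.0]

Dropping from rank 1 to rank 2 costs 0.333 in rank score, while dropping from rank 3 all the way to rank 8 costs only 0.472 in total, which is exactly the intended downplaying of the poorly performing models.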
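
Note the interplay between the two score scales in this loop: rank scores are multiplied by 100 (new line 573), and raw metric scores that look like fractions are heuristically rescaled so that every task lives on a common 0-100 scale before averaging. A tiny sketch of that fraction-detection branch in isolation:

import numpy as np

score_list = [0.70, 0.75, 0.80]           # e.g. accuracy reported as a fraction
if np.mean(score_list) < 1:               # looks like a fraction, not a percentage
    score_list = [100 * score for score in score_list]
print(score_list)                         # [70.0, 75.0, 80.0]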