Spaces:
Sleeping
Sleeping
"""Script to produce radial plots.""" | |
from functools import partial | |
import plotly.graph_objects as go | |
import json | |
import numpy as np | |
from collections import defaultdict | |
import pandas as pd | |
from pydantic import BaseModel | |
import gradio as gr | |
import requests | |
import random | |
import logging | |
import datetime as dt | |
import scipy.stats as stats | |
import itertools as it | |
# Log line layout: timestamp, level, logger name in angle brackets, message.
fmt: str = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
logging.basicConfig(format=fmt, level=logging.INFO)
logger: logging.Logger = logging.getLogger(name="radial_plot_generator")
INTRO_MARKDOWN = """ | |
# Radial Plot Generator | |
This demo allows you to generate a radial plot comparing the performance of different | |
language models on different tasks. It is based on the generative results from the | |
[ScandEval benchmark](https://scandeval.com). | |
""" | |
ABOUT_MARKDOWN = """ | |
## About the ScandEval Benchmark | |
The [ScandEval benchmark](https://scandeval.com) is used to compare pretrained language
models on tasks in Danish, Swedish, Norwegian Bokmål, Norwegian Nynorsk, Icelandic, | |
Faroese, German, Dutch and English. The benchmark supports both encoder models (such as | |
BERT) and generative models (such as GPT), and leaderboards for both kinds [are | |
available](https://scandeval.com). | |
The generative models are evaluated using in-context learning with few-shot prompts. | |
The few-shot examples are sampled randomly from the training split, and we benchmark | |
the models 10 times with bootstrapped test sets and different few-shot examples in each | |
iteration. This allows us to better measure the uncertainty of the results. We use the | |
uncertainty in the radial plot when we compute the win ratios (i.e., the percentage of | |
other models that a model beats on a task). Namely, we compute the win ratio as the | |
percentage of other models that a model _significantly_ beats on a task, where we use a | |
paired t-test with a significance level of 0.05 to determine whether a model | |
significantly beats another model. | |
## The Benchmark Datasets | |
The ScandEval generative benchmark currently covers the languages Danish, Swedish, | |
Norwegian, Icelandic, German, Dutch and English. For each language, the benchmark | |
consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are | |
the following: | |
### Text Classification | |
Given a piece of text, classify it into a number of classes. For this task we extract | |
the first token of the possible labels, and choose the label whose first token has the | |
highest probability. All datasets in this category are currently trinary sentiment | |
classification datasets. We use the Matthews Correlation Coefficient (MCC) as the | |
evaluation metric. | |
### Information Extraction | |
Given a piece of text, extract a number of entities from the text. As the model needs | |
to extract multiple entities, we use [structured | |
generation](https://github.com/noamgat/lm-format-enforcer) to make the model generate a | |
JSON dictionary with keys being the entity categories and values being lists of the | |
identified entities. All datasets in this task are named entity recognition datasets. | |
We use the micro-averaged F1 score as the evaluation metric, where we ignore the | |
Miscellaneous category. | |
### Grammar | |
Given a piece of text, determine whether it is grammatically correct or not. All | |
datasets in this task are built from the dependency treebanks of the languages, where | |
words are removed or swapped, in a way that makes the sentence ungrammatical. We use | |
the Matthews Correlation Coefficient (MCC) as the evaluation metric. | |
### Question Answering | |
Given a question and a piece of text, extract the answer to the question from the text. | |
All datasets in this task are extractive question answering datasets. We use the exact | |
match (EM) score as the evaluation metric. | |
### Summarisation | |
Given a piece of text, generate a summary of the text. All the datasets come from | |
either news articles or WikiHow articles. We use the BERTScore metric as the evaluation | |
metric, where the encoder model used is | |
[microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base). | |
### Knowledge | |
Given a trivia-style question with multiple choice answers, choose the correct answer. | |
As with text classification, we use the probabilities of the answer letter (a, b, c or | |
d) to choose the answer. The datasets in this task are machine translated versions of | |
the [MMLU](https://doi.org/10.48550/arXiv.2009.03300) and | |
[ARC](https://allenai.org/data/arc) datasets. We use the Matthews Correlation | |
Coefficient (MCC) as the evaluation metric. | |
### Reasoning | |
Given a scenario and multiple possible endings, choose the correct ending. As with text | |
classification, we use the probabilities of the answer letter (a, b, c or d) to choose | |
the answer. The datasets in this task are machine translated versions of the | |
[HellaSwag](https://rowanzellers.com/hellaswag/) dataset. We use the Matthews | |
Correlation Coefficient (MCC) as the evaluation metric. | |
## Citation | |
If you use the ScandEval benchmark in your work, please cite [the | |
paper](https://aclanthology.org/2023.nodalida-1.20): | |
``` | |
@inproceedings{nielsen2023scandeval, | |
title={ScandEval: A Benchmark for Scandinavian Natural Language Processing}, | |
author={Nielsen, Dan}, | |
booktitle={Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, | |
pages={185--201}, | |
year={2023} | |
} | |
``` | |
""" | |
# Age (in minutes) after which the callbacks fetch the benchmark results again.
UPDATE_FREQUENCY_MINUTES: int = 30

# Starting threshold for the summed absolute RGB difference required between the
# colours of two models; the colour search lowers it by one on every attempt.
MIN_COLOUR_DISTANCE_BETWEEN_MODELS: int = 200
class Task(BaseModel):
    """A benchmark task and the key of the metric used to score it.

    Instances are hashed by their name only, which makes them usable as dictionary
    keys and dataframe column labels.
    """

    name: str
    metric: str

    def __hash__(self) -> int:
        task_name = self.name
        return hash(task_name)
class Language(BaseModel):
    """A language, identified by a short code and a display name.

    Instances are hashed by their code only, which makes them usable as dictionary
    keys.
    """

    code: str
    name: str

    def __hash__(self) -> int:
        language_code = self.code
        return hash(language_code)
class Dataset(BaseModel):
    """A benchmark dataset, tied to one language and one task.

    Instances are hashed by their name only.
    """

    name: str
    language: Language
    task: Task

    def __hash__(self) -> int:
        dataset_name = self.name
        return hash(dataset_name)
# One constant per task; `metric` is the key used to look up the score in the raw
# benchmark records.
SUMMARISATION = Task(name="summarisation", metric="bertscore")
KNOWLEDGE = Task(name="knowledge", metric="mcc")
REASONING = Task(name="reasoning", metric="mcc")
GRAMMAR = Task(name="grammar", metric="mcc")
QUESTION_ANSWERING = Task(name="question answering", metric="em")
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
INFORMATION_EXTRACTION = Task(
    name="information extraction", metric="micro_f1_no_misc"
)

# Every Task defined at module level so far, in definition order. The globals are
# snapshotted first so the scan never iterates a dictionary that is being changed.
ALL_TASKS = [
    candidate
    for candidate in list(globals().values())
    if isinstance(candidate, Task)
]
# One constant per supported language.
DANISH = Language(code="da", name="Danish")
NORWEGIAN = Language(code="no", name="Norwegian")
SWEDISH = Language(code="sv", name="Swedish")
ICELANDIC = Language(code="is", name="Icelandic")
GERMAN = Language(code="de", name="German")
DUTCH = Language(code="nl", name="Dutch")
ENGLISH = Language(code="en", name="English")

# Mapping from display name to every Language defined at module level so far, in
# definition order. The globals are snapshotted before they are scanned.
ALL_LANGUAGES = {
    candidate.name: candidate
    for candidate in list(globals().values())
    if isinstance(candidate, Language)
}
# All benchmark datasets. The table below groups (dataset name, language) pairs by
# task and is expanded into `Dataset` objects in exactly the listed order.
DATASETS = [
    Dataset(name=dataset_name, language=dataset_language, task=dataset_task)
    for dataset_task, dataset_entries in (
        (
            TEXT_CLASSIFICATION,
            (
                ("swerec", SWEDISH),
                ("angry-tweets", DANISH),
                ("norec", NORWEGIAN),
                ("sb10k", GERMAN),
                ("dutch-social", DUTCH),
                ("sst5", ENGLISH),
            ),
        ),
        (
            INFORMATION_EXTRACTION,
            (
                ("suc3", SWEDISH),
                ("dansk", DANISH),
                ("norne-nb", NORWEGIAN),
                ("norne-nn", NORWEGIAN),
                ("mim-gold-ner", ICELANDIC),
                ("germeval", GERMAN),
                ("conll-nl", DUTCH),
                ("conll-en", ENGLISH),
            ),
        ),
        (
            GRAMMAR,
            (
                ("scala-sv", SWEDISH),
                ("scala-da", DANISH),
                ("scala-nb", NORWEGIAN),
                ("scala-nn", NORWEGIAN),
                ("scala-is", ICELANDIC),
                ("scala-de", GERMAN),
                ("scala-nl", DUTCH),
                ("scala-en", ENGLISH),
            ),
        ),
        (
            QUESTION_ANSWERING,
            (
                ("scandiqa-da", DANISH),
                ("norquad", NORWEGIAN),
                ("scandiqa-sv", SWEDISH),
                ("nqii", ICELANDIC),
                ("germanquad", GERMAN),
                ("squad", ENGLISH),
                ("squad-nl", DUTCH),
            ),
        ),
        (
            SUMMARISATION,
            (
                ("nordjylland-news", DANISH),
                ("mlsum", GERMAN),
                ("rrn", ICELANDIC),
                ("no-sammendrag", NORWEGIAN),
                ("wiki-lingua-nl", DUTCH),
                ("swedn", SWEDISH),
                ("cnn-dailymail", ENGLISH),
            ),
        ),
        (
            KNOWLEDGE,
            (
                ("danish-citizen-tests", DANISH),
                ("danske-talemaader", DANISH),
                ("mmlu-no", NORWEGIAN),
                ("mmlu-sv", SWEDISH),
                ("mmlu-is", ICELANDIC),
                ("mmlu-de", GERMAN),
                ("mmlu-nl", DUTCH),
                ("mmlu", ENGLISH),
            ),
        ),
        (
            REASONING,
            (
                ("hellaswag-da", DANISH),
                ("hellaswag-no", NORWEGIAN),
                ("hellaswag-sv", SWEDISH),
                ("hellaswag-is", ICELANDIC),
                ("hellaswag-de", GERMAN),
                ("hellaswag-nl", DUTCH),
                ("hellaswag", ENGLISH),
            ),
        ),
    )
    for dataset_name, dataset_language in dataset_entries
]
def update_colour_mapping(results_dfs: dict[Language, pd.DataFrame]) -> None:
    """Assign an RGB colour to every model and publish it as `colour_mapping`.

    Colours are drawn at random, rejecting colours that are too bright or too
    close (summed absolute RGB difference) to an already assigned colour. If no
    valid assignment is found within the retry budget, the whole assignment is
    restarted with the minimum colour distance lowered by one.

    The global `seed` is incremented on every call, so repeated calls give new
    colours. The global `colour_mapping` is only replaced once a complete mapping
    has been found.

    Args:
        results_dfs:
            The results dataframes for each language. The model ids are read from
            the index of each dataframe.
    """
    global colour_mapping
    global seed
    seed += 1
    gr.Info("Updating colour mapping...")

    # Collect the distinct model ids across all languages
    all_models = list(
        {model_id for df in results_dfs.values() for model_id in df.index}
    )

    new_mapping: dict = {}
    for attempt in it.count():
        min_colour_distance = MIN_COLOUR_DISTANCE_BETWEEN_MODELS - attempt

        # The retry budget is shared by all models within a single attempt
        retries_left = 10 * len(all_models)

        # Start every attempt from a clean slate, so that colours left over from a
        # failed attempt (including a model's own previous colour) cannot block
        # the new assignment
        new_mapping = {}
        attempt_succeeded = True

        for model_id in all_models:
            # A dedicated generator yields the same draws as reseeding the global
            # generator would, but leaves the global `random` state untouched
            rng = random.Random(hash(model_id) + attempt + seed)

            colour = None
            while colour is None and retries_left > 0:
                retries_left -= 1
                candidate = tuple(rng.randint(0, 255) for _ in range(3))
                too_bright = min(candidate) > 200
                similar_to_other_model = any(
                    sum(abs(new - old) for new, old in zip(candidate, other))
                    < min_colour_distance
                    for other in new_mapping.values()
                )
                if not (too_bright or similar_to_other_model):
                    colour = candidate

            # Out of retries: give up on this attempt and relax the distance
            if colour is None:
                attempt_succeeded = False
                break

            new_mapping[model_id] = colour

        # An explicit flag (rather than the remaining retry count) also terminates
        # correctly when there are no models or when the last retry succeeded
        if attempt_succeeded:
            logger.info(
                f"Successfully found a colour mapping with min colour distance "
                f"{min_colour_distance}."
            )
            break

    colour_mapping = new_mapping
def main() -> None:
    """Fetch the benchmark results, build the Gradio demo and launch it.

    Sets the globals `last_fetch` and `seed`, and (through
    `update_colour_mapping`) `colour_mapping`, before the interface is built.
    """
    global last_fetch
    global seed

    results_dfs = fetch_results()
    last_fetch = dt.datetime.now()

    all_languages = sorted(
        (language.name for language in ALL_LANGUAGES.values()),
        key=lambda language_name: language_name.lower(),
    )
    danish_models = sorted(
        set(results_dfs[DANISH].index),
        key=lambda model_id: model_id.lower(),
    )

    # Only preselect models that are actually present in the fetched results;
    # selecting a missing model would break the initial plot. Fall back to the
    # first available models if none of the preferred ones exist.
    preferred_models = ["gpt-4-0613", "mistralai/Mistral-7B-v0.1"]
    default_models = [
        model_id for model_id in preferred_models if model_id in danish_models
    ]
    if not default_models:
        default_models = danish_models[:2]

    seed = 4242
    update_colour_mapping(results_dfs=results_dfs)

    with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
        gr.Markdown(INTRO_MARKDOWN)

        with gr.Tab(label="Build a Radial Plot"):
            with gr.Column():
                with gr.Row():
                    language_names_dropdown = gr.Dropdown(
                        choices=all_languages,
                        multiselect=True,
                        label="Languages",
                        value=["Danish"],
                        interactive=True,
                        scale=2,
                    )
                    model_ids_dropdown = gr.Dropdown(
                        choices=danish_models,
                        multiselect=True,
                        label="Models",
                        value=default_models,
                        interactive=True,
                        scale=2,
                    )
                with gr.Row():
                    use_win_ratio_checkbox = gr.Checkbox(
                        label="Compare models with win ratios (as opposed to raw scores)",
                        value=True,
                        interactive=True,
                        scale=1,
                    )
                    show_scale_checkbox = gr.Checkbox(
                        label="Show the scale on the plot (always 0-100)",
                        value=False,
                        interactive=True,
                        scale=1,
                    )
                    plot_width_slider = gr.Slider(
                        label="Plot width",
                        minimum=600,
                        maximum=1000,
                        step=10,
                        value=800,
                        interactive=True,
                        scale=1,
                    )
                    plot_height_slider = gr.Slider(
                        label="Plot height",
                        minimum=300,
                        maximum=700,
                        step=10,
                        value=500,
                        interactive=True,
                        scale=1,
                    )
                    update_colours_button = gr.Button(
                        value="Update colours",
                        interactive=True,
                        scale=1,
                    )
                with gr.Row():
                    plot = gr.Plot(
                        value=produce_radial_plot(
                            model_ids_dropdown.value,
                            language_names=language_names_dropdown.value,
                            use_win_ratio=use_win_ratio_checkbox.value,
                            show_scale=show_scale_checkbox.value,
                            plot_width=plot_width_slider.value,
                            plot_height=plot_height_slider.value,
                            results_dfs=results_dfs,
                        ),
                    )

        with gr.Tab(label="About"):
            gr.Markdown(ABOUT_MARKDOWN)

        gr.Markdown(
            "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
            "Alexandra Institute</a>.</center>"
        )

        # Update the available models when the selected languages change
        language_names_dropdown.change(
            fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
            inputs=[language_names_dropdown, model_ids_dropdown],
            outputs=model_ids_dropdown,
        )

        # Update plot when anything changes
        update_plot_kwargs = dict(
            fn=partial(
                produce_radial_plot,
                results_dfs=results_dfs,
            ),
            inputs=[
                model_ids_dropdown,
                language_names_dropdown,
                use_win_ratio_checkbox,
                show_scale_checkbox,
                plot_width_slider,
                plot_height_slider,
            ],
            outputs=plot,
        )
        language_names_dropdown.change(**update_plot_kwargs)
        model_ids_dropdown.change(**update_plot_kwargs)
        use_win_ratio_checkbox.change(**update_plot_kwargs)
        show_scale_checkbox.change(**update_plot_kwargs)
        plot_width_slider.change(**update_plot_kwargs)
        plot_height_slider.change(**update_plot_kwargs)

        # Update colours when the button is clicked
        update_colours_button.click(
            fn=partial(update_colour_mapping, results_dfs=results_dfs),
        ).then(**update_plot_kwargs)

    demo.launch()
def update_model_ids_dropdown(
    language_names: list[str],
    model_ids: list[str],
    results_dfs: dict[Language, pd.DataFrame] | None,
) -> dict:
    """Recompute the model dropdown after the language selection has changed.

    Only models that have results in every selected language are offered. The
    currently selected models that remain valid are kept; if none remain, up to
    two of the valid models are picked at random.

    Args:
        language_names:
            The names of the languages to include in the plot.
        model_ids:
            The ids of the models to include in the plot.
        results_dfs:
            The results dataframes for each language.

    Returns:
        The Gradio update to the model ids dropdown.
    """
    global last_fetch

    # Fetch fresh results if the previous fetch is too old
    elapsed_minutes = (dt.datetime.now() - last_fetch).total_seconds() / 60
    if elapsed_minutes > UPDATE_FREQUENCY_MINUTES:
        results_dfs = fetch_results()
        last_fetch = dt.datetime.now()

    # Nothing to choose from without results or without a language selection
    if results_dfs is None:
        logger.info("No results fetched yet. Resetting model ids dropdown.")
        return gr.update(choices=[], value=[])
    if len(language_names) == 0:
        logger.info("No languages selected. Resetting model ids dropdown.")
        return gr.update(choices=[], value=[])

    # Restrict the selected languages' dataframes to the tasks they all share
    selected_dfs = [
        df for language, df in results_dfs.items() if language.name in language_names
    ]
    shared_tasks = [
        task for task in ALL_TASKS if all(task in df.columns for df in selected_dfs)
    ]
    per_language_dfs = [df[shared_tasks] for df in selected_dfs]

    # Keep the models that appear in every one of the selected languages
    candidate_ids: set[str] = set()
    for df in per_language_dfs:
        candidate_ids.update(str(model_id) for model_id in df.index)
    available_models: list[str] = sorted(
        model_id
        for model_id in candidate_ids
        if all(model_id in df.index for df in per_language_dfs)
    )

    if not available_models:
        logger.info(
            "No valid models for the selected languages. Resetting model ids dropdown."
        )
        return gr.update(choices=[], value=[])

    # Preserve the current selection where possible, otherwise sample a new one
    kept_models: list[str] = [
        model_id for model_id in model_ids if model_id in available_models
    ]
    if not kept_models:
        kept_models = random.sample(
            population=available_models, k=min(2, len(available_models))
        )

    logger.info(
        f"Updated model ids dropdown with {len(available_models):,} valid models for "
        f"the selected languages, with {kept_models} selected."
    )
    return gr.update(choices=available_models, value=kept_models)
def produce_radial_plot(
    model_ids: list[str],
    language_names: list[str],
    use_win_ratio: bool,
    show_scale: bool,
    plot_width: int,
    plot_height: int,
    results_dfs: dict[Language, pd.DataFrame] | None,
) -> go.Figure:
    """Produce a radial plot as a plotly figure.

    Each selected model becomes one trace with one value per task, averaged over
    the selected languages. Only tasks present for all selected languages are
    shown. A model's win ratio on a task and language is the percentage of other
    models it significantly beats (one-sided paired t-test, p < 0.05).

    Args:
        model_ids:
            The ids of the models to include in the plot.
        language_names:
            The names of the languages to include in the plot.
        use_win_ratio:
            Whether to use win ratios (as opposed to raw scores).
        show_scale:
            Whether to show the scale on the plot.
        plot_width:
            The width of the plot.
        plot_height:
            The height of the plot.
        results_dfs:
            The results dataframes for each language.

    Returns:
        A plotly figure, which is empty if there are no results, no selected
        languages or no selected models.
    """
    global last_fetch

    # Fetch fresh results if the previous fetch is too old
    minutes_since_last_fetch = (dt.datetime.now() - last_fetch).total_seconds() / 60
    if minutes_since_last_fetch > UPDATE_FREQUENCY_MINUTES:
        results_dfs = fetch_results()
        last_fetch = dt.datetime.now()

    if results_dfs is None or len(language_names) == 0 or len(model_ids) == 0:
        if results_dfs is None:
            logger.info("No results fetched yet. Resetting plot.")
        elif len(language_names) == 0:
            logger.info("No languages selected. Resetting plot.")
        else:
            logger.info("No models selected. Resetting plot.")
        return go.Figure()

    logger.info(
        f"Producing radial plot for models {model_ids!r} on languages "
        f"{language_names!r}..."
    )

    languages = [ALL_LANGUAGES[language_name] for language_name in language_names]
    results_dfs_filtered = {
        language: df
        for language, df in results_dfs.items()
        if language.name in language_names
    }

    # Only plot the tasks that are available for all selected languages
    tasks = [
        task
        for task in ALL_TASKS
        if all(task in df.columns for df in results_dfs_filtered.values())
    ]

    # Add all the evaluation results for each model
    results: list[list[float]] = list()
    for model_id in model_ids:
        result_list = list()
        for task in tasks:
            win_ratios = list()
            scores = list()
            for language in languages:
                language_df = results_dfs_filtered[language]
                if model_id not in language_df.index:
                    continue

                score_list = language_df.loc[model_id][task]

                # Percentage of the other models that this model significantly
                # beats on this task and language
                other_models_scores = language_df[task].dropna().drop(index=model_id)
                win_ratio = 100 * np.mean([
                    stats.ttest_rel(
                        a=score_list, b=other_scores, alternative="greater"
                    ).pvalue < 0.05
                    for other_scores in other_models_scores
                ])
                win_ratios.append(win_ratio)

                # Scores that all lie below 1 are taken to be fractions and are
                # rescaled to percentages
                if all(score < 1 for score in score_list):
                    score_list = [100 * score for score in score_list]
                scores.append(np.mean(score_list))

            if use_win_ratio:
                result_list.append(np.mean(win_ratios))
            else:
                result_list.append(np.mean(scores))
        results.append(result_list)

    # For every model, count how many (model, task) comparisons it wins. Entry
    # (i, k, j) of the broadcast comparison tells whether model i beats model k on
    # task j.
    result_matrix = np.array(results)
    beats = result_matrix[:, np.newaxis, :] > result_matrix[np.newaxis, :, :]
    total_models_beaten = beats.sum(axis=(1, 2))

    # Sort the models (and their results) such that the model that beats the most
    # other models comes first. This will result in the "smaller areas" being on
    # top of the "larger areas", which is more aesthetically pleasing. The stable
    # sort keeps tied models in the order in which they were selected.
    sorted_idxs = np.argsort(-total_models_beaten, kind="stable")
    model_ids = [model_ids[idx] for idx in sorted_idxs]
    results = result_matrix[sorted_idxs].tolist()

    # Models without an assigned colour (e.g. models that appeared after the colour
    # mapping was built) are drawn in a neutral grey instead of raising an error
    fallback_colour = (128, 128, 128)

    # Add the results to a plotly figure
    fig = go.Figure()
    task_names = [task.name for task in tasks]
    for model_id, result_list in zip(model_ids, results):
        r, g, b = colour_mapping.get(model_id, fallback_colour)
        fig.add_trace(go.Scatterpolar(
            r=result_list,
            theta=task_names,
            name=model_id,
            fill='toself',
            fillcolor=f'rgba({r}, {g}, {b}, 0.6)',
            line=dict(color=f'rgb({r}, {g}, {b})'),
        ))

    # Build a human readable list of languages, e.g. "Danish, Swedish and German"
    selected_language_names = [language.name for language in languages]
    if len(selected_language_names) > 1:
        languages_str = (
            ", ".join(selected_language_names[:-1])
            + " and "
            + selected_language_names[-1]
        )
    else:
        languages_str = selected_language_names[-1]

    if use_win_ratio:
        title = f'Win Ratio on {languages_str} Language Tasks'
    else:
        title = f'LLM Score on {languages_str} Language Tasks'

    # Builds the radial plot from the results
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=show_scale, range=[0, 100])),
        showlegend=True,
        title=title,
        width=plot_width,
        height=plot_height,
    )

    logger.info("Successfully produced radial plot.")
    return fig
def fetch_results() -> dict[Language, pd.DataFrame]:
    """Fetch the results from the ScandEval benchmark.

    Returns:
        A dictionary of languages -> results-dataframes, whose indices are the
        models and columns are the tasks. Every cell holds the list of test scores
        of one dataset of that task. Models lacking a result for any of the
        language's tasks are dropped.

    Raises:
        requests.RequestException:
            If the download fails, times out or returns an error status code.
    """
    logger.info("Fetching results from ScandEval benchmark...")

    # A timeout prevents an unresponsive server from blocking the app forever
    response = requests.get(
        "https://www.scandeval.com/scandeval_benchmark_results.jsonl",
        timeout=60,
    )
    response.raise_for_status()

    # One JSON record per line; skip blank and whitespace-only lines (such as a
    # stray carriage return), which are not valid JSON
    records = [
        json.loads(line) for line in response.text.split("\n") if line.strip()
    ]

    # Look datasets up by name once instead of scanning the list for every record
    datasets_by_name = {dataset.name: dataset for dataset in DATASETS}

    # Build a dictionary of languages -> results-dataframes, whose indices are the
    # models and columns are the tasks.
    results_dfs = dict()
    for language in {dataset.language for dataset in DATASETS}:
        data_dict = defaultdict(dict)
        for record in records:
            model_name = record["model"]
            dataset = datasets_by_name.get(record["dataset"])
            if dataset is None or dataset.language != language:
                continue

            # The metric is stored either with or without a "test_" prefix
            metric = dataset.task.metric
            scores = [
                test_score_dict.get(f"test_{metric}", test_score_dict.get(metric))
                for test_score_dict in record["results"]["raw"]["test"]
            ]
            data_dict[model_name].setdefault(dataset.task, []).append(scores)

        # Rows are models and columns are tasks. Drop models that lack any of the
        # tasks and keep the scores of the first collected dataset per task.
        # NOTE(review): any further datasets of the same task are ignored here, as
        # in the previous implementation - confirm that this is intended.
        results_dfs[language] = (
            pd.DataFrame(data_dict)
            .T.dropna()
            .map(lambda score_lists: score_lists[0])
        )

    logger.info("Successfully fetched results from ScandEval benchmark.")
    return results_dfs
# Only build and launch the demo when the file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()