Spaces:

openGPT-X
/

european-llm-leaderboard

Running on CPU Upgrade

File size: 7,810 Bytes

2b62c4c

import itertools
import os

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset

import style

# Index of the currently selected tab. Written by update_tab_tasks() and read
# by fix_zeroshot(); valid values are the keys of the mapping in
# get_selected_task_type() (0 and 1).
# FIXME: this is module-level mutable state. NOTE(review): presumably shared
# by every session served by this process - confirm and consider per-session
# state instead.
TAB_STATE = 0  # FIXME
# Name of the task group that is removed from the selectable tasks whenever
# the 0-shot evaluation type is active (see fix_zeroshot/update_tab_tasks).
# NOTE(review): presumably hard-coded because this task group has no 0-shot
# results - confirm against the dataset.
GSM8K_TASK_GROUP_NAME = "GSM8K"  # FIXME


def init():
    """Load the leaderboard dataset and fill the module-level lookup tables.

    The dataset location is read from the environment variables
    ``OGX_LEADERBOARD_DATASET_NAME``, ``OGX_LEADERBOARD_DATASET_CONFIG`` and
    ``OGX_LEADERBOARD_DATASET_SPLIT``. After the call the following globals
    are set:

    * ``task_group_names_list``: distinct values of the ``Task_Group`` column
    * ``task_group_type_dict``: ``Task_Group`` -> ``Task_Type``
    * ``task_groups_shots_dict``: ``Task_Group`` -> ``Number_Shots`` (built
      from rows whose ``Few_Shot`` flag is true)
    * ``languages_list``: distinct ``Language`` values, upper-cased
    * ``model_type_dict``: ``Model_Name`` -> ``Model_Type``
    * ``hidden_df``: one row per model, with ``Value`` pivoted into
      ``(Task_Group, Few_Shot, Language)`` columns plus a ``Type`` column
      holding the symbol from ``style.T_SYMBOLS``
    """
    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict

    def unique_mapping(frame, key_col, value_col):
        # Collapse the frame to its distinct (key, value) pairs and expose
        # them as a plain dict keyed by `key_col`.
        pairs = frame[[key_col, value_col]].drop_duplicates()
        return pairs.set_index(key_col)[value_col].to_dict()

    def type_symbol(model_name):
        # Translate a model name into the icon registered for its model type.
        return style.T_SYMBOLS[model_type_dict[model_name]]

    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

    # Long-format table: the code below reads its Task_Group, Task_Type,
    # Few_Shot, Number_Shots, Language, Model_Name, Model_Type and Value
    # columns.
    hidden_df = load_dataset(repo_id, config_name, split=split_name).to_pandas()

    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
    task_group_type_dict = unique_mapping(hidden_df, "Task_Group", "Task_Type")
    # Element-wise comparison on purpose: selects the rows flagged as few-shot.
    few_shot_rows = hidden_df[hidden_df["Few_Shot"] == True]
    task_groups_shots_dict = unique_mapping(few_shot_rows, "Task_Group", "Number_Shots")
    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
    model_type_dict = unique_mapping(hidden_df, "Model_Name", "Model_Type")

    # Wide format: one row per model, one column per
    # (task group, few-shot flag, language) combination.
    wide_df = hidden_df.pivot_table(
        index=["Model_Name"],
        columns=["Task_Group", "Few_Shot", "Language"],
        values="Value",
        dropna=False,
    )
    hidden_df = wide_df.reset_index()

    hidden_df["Type"] = hidden_df["Model_Name"].apply(type_symbol)


def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
    """Return ``df`` with ``Type``, ``Model_Name``, ``Average`` first and the
    task columns after them in sorted order.

    Args:
        df: Table containing ``Type``, ``Model_Name`` and ``Average`` columns
            plus one column per task.
        fewshot: When true, each task column that has an entry in
            ``task_groups_shots_dict`` is renamed to ``"<task> (<n>-shot)"``.
            Task columns without such an entry are not part of the result.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    task_cols = get_task_columns(df)
    if fewshot:
        renamer = {col: f"{col} ({task_groups_shots_dict[col]}-shot)" for col in task_cols if col in task_groups_shots_dict}
        # Rename on a new object instead of in place, so the caller's frame
        # (which may be a slice of another frame) is never modified.
        df = df.rename(columns=renamer)
        task_cols = list(renamer.values())
    return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)


def get_task_columns(df: pd.DataFrame) -> list[str]:
    """Return the column labels of ``df`` without the metadata columns.

    The metadata columns are ``Model_Name``, ``Average`` and ``Type``; the
    remaining labels are returned in their original order.

    Raises:
        ValueError: If one of the three metadata columns is missing.
    """
    columns = list(df.columns)
    for meta_col in ("Model_Name", "Average", "Type"):
        columns.remove(meta_col)
    return columns


def get_models(df: pd.DataFrame) -> pd.DataFrame:
    return df["Model_Name"].unique()


def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``Type`` value is one of ``model_types``."""
    wanted = df["Type"].isin(model_types)
    return df[wanted]


def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Keep only rows whose model name matches the search query.

    The query may hold several search terms separated by ``;``. A row is kept
    when its ``Model_Name`` contains at least one term, ignoring case. Terms
    are matched as literal substrings, so characters such as ``+``, ``(`` or
    ``.`` in a model name can be searched for without raising a regex error.
    Whitespace around a term and empty terms (e.g. from a trailing ``;``) are
    ignored.

    Args:
        df: Table with a ``Model_Name`` column.
        query: Search string typed by the user.

    Returns:
        The matching rows; all rows when the query contains no search term.
    """
    terms = [term.strip() for term in query.split(";")]
    terms = [term for term in terms if term]
    if not terms:
        # Nothing to filter on: an empty search shows every model.
        return df.copy()

    names = df["Model_Name"]
    keep = np.zeros(len(df), dtype=bool)
    for term in terms:
        # regex=False: user input is data, not a pattern.
        # na=False: a missing name never matches.
        keep |= names.str.contains(term, case=False, regex=False, na=False).to_numpy(dtype=bool)
    return df[keep]


def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list):
    """Aggregate results over ``langs`` for each task in ``tasks``.

    The column labels of ``df`` are expected to be tuples such as
    ``(task, language)``; they are flattened to ``"<task>_<language>"``
    (empty parts are dropped, so ``("Model_Name", "")`` becomes
    ``"Model_Name"``). For every task the mean over the selected languages is
    computed. If a language does not exist for a task, or one of the values is
    NaN, the aggregate for that task is NaN. ``Average`` is the mean over the
    task aggregates, ignoring NaN.

    Args:
        df: Table with ``Type`` and ``Model_Name`` columns and one column per
            task/language pair.
        tasks: Task names to aggregate.
        langs: Language codes to average over; matched in lower case.

    Returns:
        A new DataFrame with the columns ``Type``, ``Model_Name``, ``Average``
        followed by one column per task. ``df`` itself is not modified.
    """
    tasks = list(tasks)
    langs_lower = [lang.lower() for lang in langs]

    # Work on a copy so neither the relabelling nor the added columns leak
    # into the caller's frame.
    flat = df.copy()
    flat.columns = ["_".join(filter(None, col)) if isinstance(col, tuple) else col for col in flat.columns]
    available = set(flat.columns)

    for task in tasks:
        lang_cols = [f"{task}_{lang}" for lang in langs_lower]
        if available.issuperset(lang_cols):
            # skipna=False: one missing language result makes the task NaN.
            flat[task] = flat[lang_cols].mean(axis=1, skipna=False)
        else:
            flat[task] = np.nan

    flat["Average"] = flat[tasks].mean(axis=1)
    return flat[["Type", "Model_Name", "Average"] + tasks]


def select_shots(df: pd.DataFrame, fewshot: bool = False):
    """Return the columns of ``df`` that belong to the requested shot setting.

    Column labels are tuples whose second element is compared with
    ``fewshot``. The ``Model_Name`` and ``Type`` columns are appended after
    the selected result columns, and the second column level is dropped from
    the returned frame.
    """
    selected = []
    for label in df.columns:
        if label[1] == fewshot:
            selected.append(label)
    # Model name and type icon go last.
    selected.extend([("Model_Name", "", ""), ("Type", "", "")])
    subset = df[selected]
    return subset.droplevel(level=1, axis="columns")


def update_df(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
    format: bool = True,
) -> pd.DataFrame:
    """Build the leaderboard table for the current selection.

    Starting from the module-level ``hidden_df`` the table is reduced to the
    requested shot setting, aggregated over ``langs`` for each task in
    ``tasks``, filtered by the search query and the model types, and finally
    column-sorted.

    When ``format`` is true the result is the table's ``.style`` object with
    numbers formatted to a precision of 2; otherwise the plain DataFrame is
    returned.
    """
    # 1) keep only the columns of the selected shot setting
    shot_view = select_shots(hidden_df, fewshot)
    # 2) one aggregate per task over the selected languages
    per_task = aggregate_langs(shot_view, tasks, langs)
    # 3) apply the search bar, then the model type filter
    matching = filter_type(search_model(per_task, model_query), model_types)
    table = sort_cols(matching, fewshot)

    if not format:
        return table
    return table.style.format(precision=2, decimal=".")


def make_plot(df: pd.DataFrame):
    """Create a plotly figure from a transposed leaderboard table.

    ``df`` must have a ``Model_Name`` row in its index. That row becomes the
    column labels (assigned on ``df`` itself), is then dropped, and the
    remaining index is moved into a ``task`` column that is used as the
    x axis. With more than two columns in the resulting frame a line chart
    over all columns is drawn, otherwise a bar chart of the last column.
    """
    df.columns = df.loc["Model_Name"]
    plot_df = df.drop("Model_Name").reset_index(names="task")
    # NOTE(review): every row other than "Model_Name" is kept here, and the
    # line chart receives all columns (including "task") as y - confirm this
    # is the intended input for the figure.
    if len(plot_df.columns) <= 2:
        figure = px.bar(data_frame=plot_df, x="task", y=plot_df.columns[-1], width=1200)
    else:
        figure = px.line(data_frame=plot_df, x="task", y=plot_df.columns, markers=True, width=1200)
    # Treat the x values as discrete categories.
    figure.update_xaxes(type="category")
    return figure


def update_plot(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
):
    """Return the figure for the current selection.

    The unformatted table from ``update_df`` is transposed and handed to
    ``make_plot``.
    """
    table = update_df(tasks, model_query, langs, model_types, fewshot, False)
    return make_plot(table.transpose())


def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
    """Rebuild the task checkbox group for the selected evaluation type.

    The selectable tasks are the task groups of the currently active tab
    (module-level ``TAB_STATE``). In 0-shot mode ``GSM8K_TASK_GROUP_NAME`` is
    removed from the choices. The previously selected ``tasks`` are kept as
    far as they are still selectable. In few-shot mode on tab 0 the GSM8K task
    group is additionally pre-selected, but only if it is a valid choice and
    not selected already, so the value never contains duplicates or entries
    outside ``choices``.

    Args:
        tasks: Currently selected task names.
        fewshot: True for few-shot, False for 0-shot evaluation.

    Returns:
        A ``gr.CheckboxGroup`` with the updated choices and selection.
    """
    # TAB_STATE is only read here, so no `global` declaration is needed.
    choices = task_groups_with_task_type(get_selected_task_type(TAB_STATE))
    if not fewshot and GSM8K_TASK_GROUP_NAME in choices:
        choices.remove(GSM8K_TASK_GROUP_NAME)

    # Keep the previous selection where possible.
    value = [task for task in tasks if task in choices]
    if (
        fewshot
        and TAB_STATE == 0
        and GSM8K_TASK_GROUP_NAME in choices
        and GSM8K_TASK_GROUP_NAME not in value
    ):
        value.append(GSM8K_TASK_GROUP_NAME)

    return gr.CheckboxGroup(
        choices=choices,
        value=value,
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )


def update_tab_tasks(id: int, fewshot: bool = False):
    """React to a tab change.

    Stores ``id`` in the module-level ``TAB_STATE`` and rebuilds the two
    controls that depend on the tab:

    * the task checkbox group, offering and pre-selecting every task group of
      the tab's task type (without ``GSM8K_TASK_GROUP_NAME`` when ``fewshot``
      is false);
    * the evaluation type radio: few-shot and editable on tab 0, 0-shot and
      locked on tab 1.

    Returns:
        ``[checkbox_group, radio]``.
    """
    # Remember the active tab for later callbacks.
    global TAB_STATE
    TAB_STATE = id

    choices = task_groups_with_task_type(get_selected_task_type(TAB_STATE))
    if not fewshot and GSM8K_TASK_GROUP_NAME in choices:
        choices.remove(GSM8K_TASK_GROUP_NAME)

    shown_tasks = gr.CheckboxGroup(
        choices=choices,
        value=list(choices),
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )

    # tab id -> (selected radio value, whether the radio can be changed)
    #   0: accuracy tab, defaults to few-shot
    #   1: second tab, fixed to 0-shot with the selection disabled
    radio_settings = {0: (True, True), 1: (False, False)}
    if id in radio_settings:
        selected, editable = radio_settings[id]
        fewshot = gr.Radio(
            choices=[("0-Shot", False), ("Few-shot", True)],
            value=selected,
            label="Select evaluation type",
            interactive=editable,
            scale=29,
        )
    return [shown_tasks, fewshot]


def get_selected_task_type(task_type_id):
    """Map a tab id to its task type name (0 -> "accuracy", 1 -> "misc").

    Raises ``KeyError`` for any other id.
    """
    return {0: "accuracy", 1: "misc"}[task_type_id]


def task_groups_with_task_type(selected_task_type):
    """Return a new list with the names of all task groups whose type equals
    ``selected_task_type``, in the order of ``task_group_type_dict``."""
    matching = []
    for group_name, group_type in task_group_type_dict.items():
        if group_type == selected_task_type:
            matching.append(group_name)
    return matching


# Runs at import time: loads the dataset and fills the module-level tables
# (hidden_df, task_group_type_dict, ...) that the functions above read.
init()