File size: 3,580 Bytes
0061e14
416ebf1
b49d2fb
e00a798
0061e14
416ebf1
e00a798
 
0061e14
e00a798
0061e14
e00a798
 
416ebf1
 
 
0061e14
54e1175
 
 
 
 
 
 
 
 
 
 
b49d2fb
416ebf1
 
 
 
b49d2fb
416ebf1
b49d2fb
416ebf1
54e1175
 
 
 
 
e00a798
 
416ebf1
 
 
 
 
 
 
 
 
 
 
e00a798
 
 
 
 
 
54e1175
e00a798
54e1175
 
e00a798
 
 
 
54e1175
 
 
 
 
 
e00a798
 
 
 
 
6d7c674
a6adcf8
e00a798
 
 
ea641c7
 
e00a798
 
 
ea641c7
 
 
 
 
e00a798
0061e14
e00a798
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import pandas as pd
from datasets import get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger

logger = get_logger(__name__)


def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """
    @brief Creates a dataframe from all the individual experiment results.

    @param results_dataset_name Name of the HF dataset repo that stores one
        config per submission.
    @return DataFrame with one row per unique (System, Organization) pair,
        keeping only the most recent submission and sorted by overall success
        rate descending. An empty DataFrame with the expected columns is
        returned when the dataset is missing or contains no valid submissions.
    """

    empty_df = pd.DataFrame(
        columns=[
            AutoEvalColumn.system.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.success_rate_overall.name,
            AutoEvalColumn.success_rate_tier1.name,
            AutoEvalColumn.success_rate_tier2.name,
            AutoEvalColumn.submitted_on.name,
        ]
    )

    try:
        configs = get_dataset_config_names(
            results_dataset_name,
            token=TOKEN,
        )
    except (DatasetNotFoundError, FileNotFoundError):
        # Dataset does not exist (yet) — return an empty DataFrame with the
        # expected columns so downstream display code still works.
        logger.warning("Failed to load configuration", exc_info=True)
        return empty_df

    if configs == ["default"]:
        logger.info("Dataset has only default config — treating as empty")
        return empty_df

    rows = []
    for submission_id in tqdm(
        configs,
        total=len(configs),
        desc="Processing Submission Results",
    ):
        submission_ds = load_dataset(
            results_dataset_name,
            submission_id,
            split="train",
            token=TOKEN,
        )
        submission_df = pd.DataFrame(submission_ds)

        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning("Skipping %s due to invalid did_pass values", submission_id)
            continue

        # FIX: was an `assert`, which is stripped under `python -O` and
        # crashed the whole leaderboard build on one bad submission.
        # Skip the offending submission instead, mirroring the did_pass check.
        if not submission_df["tier"].isin([1, 2]).all():
            logger.warning("Skipping %s due to invalid tier values", submission_id)
            continue

        # Success rates as percentages; a tier with no rows yields NaN,
        # which pandas propagates and `round` leaves untouched.
        success_rate = 100 * submission_df["did_pass"].mean()
        tier1_success_rate = 100 * submission_df.loc[submission_df["tier"] == 1, "did_pass"].mean()
        tier2_success_rate = 100 * submission_df.loc[submission_df["tier"] == 2, "did_pass"].mean()
        # Submission-level metadata is identical across rows; take the first.
        first_row = submission_df.iloc[0]

        rows.append(
            {
                AutoEvalColumn.system.name: first_row["system_name"],
                AutoEvalColumn.organization.name: first_row["organization"],
                AutoEvalColumn.success_rate_overall.name: success_rate,
                AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
                AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
                AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )

    # FIX: if every submission was skipped, `pd.DataFrame([])` has no columns
    # and the `sort_values` below would raise KeyError.
    if not rows:
        logger.info("No valid submissions found — returning empty leaderboard")
        return empty_df

    full_df = pd.DataFrame(rows)

    logger.info("Loaded results df with %d entries", len(full_df))

    # Keep only the latest entry per unique (System, Organization) pair.
    # FIX: use AutoEvalColumn.submitted_on.name instead of the hardcoded
    # "Submitted On" literal, consistent with the rest of this function.
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )

    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)

    return final_df