import logging
import os

import pandas as pd  # type: ignore[import]
from datasets import (  # type: ignore[import]
    get_dataset_config_names,
    load_dataset,
)

from .leaderboard_formatting import (
    COLUMNS_PRETTY,
    METRICS_PER_TASK,
    SORT_COLUMN_PER_TASK,
    get_columns_per_task,
)
from .tasks import TASKS_PRETTY_REVERSE

# Task configs that actually have results published in the results dataset.
AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])


def _get_results_stub() -> pd.DataFrame:
    """Return a placeholder leaderboard for tasks without published results."""
    stub_df = pd.DataFrame(
        [
            {
                "Model Name": "GPT-4",
                "Availability": "Proprietary",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
            {
                "Model Name": "CodeLlama-7b (instruct)",
                "Availability": "Llama 2 license",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
        ]
    )
    return stub_df


def _get_results_dataset(task_id: str) -> pd.DataFrame:
    """Load the results for a task and format them for display."""
    results_df = load_dataset(
        os.environ["DATASET_ID"], task_id, split="test"
    ).to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")

    # Render context sizes of 1000 tokens or more as "Nk" (e.g., 16000 -> "16k").
    results_df["Context Size"] = results_df["Context Size"].map(
        lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x
    )
    results_df = results_df.sort_values(
        by=SORT_COLUMN_PER_TASK[task_id], ascending=False
    )

    # Format BERTScore columns with five decimal places, all other metrics with two.
    for metric_column in METRICS_PER_TASK[task_id]:
        if "BERTScore" in metric_column:
            results_df[metric_column] = results_df[metric_column].map(
                lambda x: f"{x:.5f}"
            )
        else:
            results_df[metric_column] = results_df[metric_column].map(
                lambda x: f"{x:.2f}"
            )

    # Keep only the columns relevant to this task, in display order.
    results_df = results_df[get_columns_per_task(task_id)]
    return results_df


def get_results_for_task(task_pretty: str) -> pd.DataFrame:
    """Return the leaderboard for a task, or a stub if no results exist yet."""
    task_id = TASKS_PRETTY_REVERSE[task_pretty]
    if task_id in AVAILABLE_TASKS:
        logging.info(f"Retrieving results for {task_pretty}...")
        return _get_results_dataset(task_id)
    logging.info(f"Generating leaderboard stub for {task_pretty}...")
    return _get_results_stub()
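

# Illustrative usage sketch (an assumption, not part of the leaderboard app itself):
# get_results_for_task is presumably called from the app's UI layer. Running this
# block requires the DATASET_ID environment variable to point at the results
# dataset, and, because of the relative imports above, the module must be run as
# part of its package (e.g., `python -m <package>.<this_module>`).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Pick an arbitrary pretty task name; any key of TASKS_PRETTY_REVERSE works.
    example_task = next(iter(TASKS_PRETTY_REVERSE))
    print(get_results_for_task(example_task).to_string(index=False))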