"""Streamlit app that renders the RAFT benchmark leaderboard from Hub submissions."""

import os
from datetime import datetime
from pathlib import Path
from re import sub

import pandas as pd
import requests
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv

if Path(".env").is_file():
    load_dotenv(".env")

auth_token = os.getenv("HF_HUB_TOKEN")
if not auth_token:
    # Fail with an actionable message instead of the opaque TypeError that
    # `"Bearer " + None` would raise.
    raise RuntimeError("HF_HUB_TOKEN is not set; define it in the environment or in a .env file.")
header = {"Authorization": f"Bearer {auth_token}"}

# Seconds to wait for the Hub API before giving up, so the app cannot hang forever.
REQUEST_TIMEOUT = 30

TASKS = get_dataset_config_names("ought/raft")
# TODO(lewtun): Evaluate with new subtasks and remove this filter
TASKS_TO_EXCLUDE = ["gpai_initiatives", "ade_corpus_v2", "tweet_eval_hate"]
TASKS = [t for t in TASKS if t not in TASKS_TO_EXCLUDE]
# Split and capitalize the task names, e.g. banking_77 => Banking 77
FORMATTED_TASK_NAMES = [" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]


def extract_tags(dataset):
    """Return the ``key:value`` tags of a dataset entry as a dict.

    Each tag is split on its first colon. Tags without a colon are skipped,
    and a missing or empty ``"tags"`` entry yields an empty dict, so that one
    unusual dataset in the listing cannot crash the whole app.
    """
    tags = {}
    for tag in dataset.get("tags") or []:
        key, separator, value = tag.partition(":")
        if not separator:
            # Not a key:value tag; nothing to record.
            continue
        tags[key] = value
    return tags


def download_submissions():
    """Fetch the dataset listing and return the RAFT evaluation entries.

    An entry is kept when its tags contain ``benchmark:ought/raft`` and
    ``type:evaluation``. The result is sorted in ascending order of the
    integer that follows the last ``-`` in each entry's ``"id"``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # HTTPS so the bearer token is never sent in clear text.
    response = requests.get(
        "https://huggingface.co/api/datasets",
        headers=header,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    all_datasets = response.json()

    submissions = []
    for dataset in all_datasets:
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "ought/raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)
    return sorted(submissions, key=lambda x: int(x["id"].split("-")[-1]))


def format_submissions(submissions):
    """Build the leaderboard table from the downloaded submissions.

    Returns a DataFrame with the columns ``Rank``, ``Submission``, ``Date``,
    ``Overall`` (mean of the task scores) and one column per formatted task
    name, sorted by ``Overall`` in descending order. ``Rank`` counts from 1
    in that sorted order.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    submission_data = {"Submission": [], "Date": [], **{t: [] for t in TASKS}}

    # TODO(lewtun): delete / filter all the junk repos from development
    # The following picks the latest submissions which adhere to the model card schema
    for submission in submissions[-1:]:
        submission_id = submission["id"]
        response = requests.get(
            f"https://huggingface.co/api/datasets/{submission_id}?full=true",
            headers=header,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        card_data = response.json()["card_data"]

        submission_data["Submission"].append(card_data["submission_dataset"])

        # The submission date is encoded as the integer after the last "-".
        timestamp = card_data["submission_id"].split("-")[-1]
        # NOTE(review): pd.to_datetime treats a bare integer as nanoseconds
        # since the epoch -- confirm that matches the unit of this suffix.
        timestamp = pd.to_datetime(int(timestamp))
        submission_data["Date"].append(timestamp.date())

        for task in card_data["results"]:
            task_data = task["task"]
            task_name = task_data["name"]
            # TODO(lewtun): Evaluate with new subtasks and remove this filter
            if task_name in TASKS_TO_EXCLUDE:
                continue
            score = task_data["metrics"][0]["value"]
            submission_data[task_name].append(score)

    df = pd.DataFrame(submission_data)
    df.insert(2, "Overall", df[TASKS].mean(axis=1))
    df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
    # Rank is the position after sorting, not the pre-sort row index.
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)))
    return df


###########
### APP ###
###########
st.set_page_config(layout="wide")
st.title("RAFT Leaderboard")
submissions = download_submissions()
df = format_submissions(submissions)
# hack to remove index column from https://github.com/streamlit/streamlit/issues/641
st.table(df.assign(hack="").set_index("hack"))