"""Streamlit leaderboard for the RAFT (Real-world Annotated Few-shot Tasks) benchmark.

The script lists datasets from the Hugging Face Hub API, keeps those tagged as
RAFT evaluation submissions, reads the per-task scores from each submission's
card data and renders them as a ranked table.
"""
import os
from datetime import datetime
from pathlib import Path
from re import sub

import numpy as np
import pandas as pd
import requests
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv

if Path(".env").is_file():
    load_dotenv(".env")

auth_token = os.getenv("HF_HUB_TOKEN")
# Only build an Authorization header when a token is configured; concatenating
# with a missing token (None) would raise TypeError at import time. Without a
# token the requests below are sent unauthenticated.
header = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

# HTTPS so that the bearer token is never sent in cleartext.
HF_DATASETS_API = "https://huggingface.co/api/datasets"
# Seconds to wait on the Hub before giving up instead of hanging the app.
REQUEST_TIMEOUT = 30

TASKS = sorted(get_dataset_config_names("ought/raft"))
# Split and capitalize the task names, e.g. banking_77 => Banking 77
# Built in the same order as TASKS (no second sort) so that zip(TASKS,
# FORMATTED_TASK_NAMES) always pairs a task with its own display name.
FORMATTED_TASK_NAMES = [" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]


def extract_tags(dataset):
    """Return the ``key:value`` tags of a Hub dataset entry as a dict.

    Args:
        dataset: Mapping describing one dataset; its ``"tags"`` entry, when
            present, is an iterable of tag strings.

    Returns:
        Dict mapping each tag's key (text before the first ``:``) to its value
        (text after it). Tags without a ``:`` carry no key/value pair and are
        ignored; a missing or empty ``"tags"`` entry yields an empty dict.
    """
    tags = {}
    for tag in dataset.get("tags") or []:
        key, separator, value = tag.partition(":")
        if separator:
            tags[key] = value
    return tags


def download_submissions():
    """Fetch the Hub dataset listing and keep the RAFT evaluation submissions.

    Returns:
        List of dataset entries (as returned by the API) whose tags contain
        ``benchmark:ought/raft`` and ``type:evaluation``.

    Raises:
        requests.RequestException: If the request fails, times out or returns
            an HTTP error status.
    """
    response = requests.get(HF_DATASETS_API, headers=header, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    submissions = []
    for dataset in response.json():
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "ought/raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)
    return submissions


def _submission_record(card_data):
    """Build one leaderboard row (a dict keyed by column name) from card data.

    Raises KeyError / IndexError / TypeError / ValueError when ``card_data``
    does not follow the expected schema.
    """
    # The submission id ends in "-<integer timestamp>".
    timestamp = pd.to_datetime(int(card_data["submission_id"].split("-")[-1]))
    record = {"Submission": card_data["submission_dataset"], "Date": timestamp.date()}
    for result in card_data["results"]:
        task_data = result["task"]
        task_name = task_data["name"]
        # Ignore results for names that are not RAFT tasks so they can neither
        # crash the table nor overwrite the "Submission"/"Date" fields.
        if task_name in TASKS:
            record[task_name] = task_data["metrics"][0]["value"]
    return record


def format_submissions(submissions):
    """Turn raw submission entries into the ranked leaderboard table.

    Args:
        submissions: Iterable of dataset entries, each with an ``"id"`` key,
            as returned by :func:`download_submissions`.

    Returns:
        ``pandas.DataFrame`` with columns ``Rank``, ``Submission``, ``Date``,
        ``Overall`` and one column per task (display names), sorted by
        ``Overall`` in descending order.

    Raises:
        requests.RequestException: If fetching a submission's details fails.
    """
    # TODO(lewtun): delete / filter all the junk repos from development
    # The following picks the latest submissions which adhere to the model card schema
    records = []
    for submission in submissions:
        submission_id = submission["id"]
        response = requests.get(
            f"{HF_DATASETS_API}/{submission_id}?full=true",
            headers=header,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        try:
            records.append(_submission_record(response.json()["card_data"]))
        except (KeyError, IndexError, TypeError, ValueError):
            # Card data does not follow the schema: skip this repo rather than
            # take the whole leaderboard down.
            continue

    # One record per submission keeps every score on its own row; a task that
    # a submission did not report becomes NaN instead of shifting the column.
    df = pd.DataFrame(records, columns=["Submission", "Date", *TASKS])
    # skipna=False: a submission missing any task gets no overall score rather
    # than an average over only the tasks it reported.
    df.insert(2, "Overall", df[TASKS].mean(axis=1, skipna=False))
    df = df.sort_values("Overall", ascending=False)
    df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)), inplace=True)
    # Start ranking from 1
    df.insert(0, "Rank", np.arange(1, len(df) + 1))
    return df


###########
### APP ###
###########
st.set_page_config(layout="wide")
st.title("RAFT: Real-world Annotated Few-shot Tasks")
st.markdown(
    """
Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants?

[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:

- across multiple domains (lit review, tweets, customer interaction, etc.)
- on economically valuable classification tasks (someone inherently cares about the task)
- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)

To submit to RAFT, follow the instruction posted on [this page](https://github.com/oughtinc/raft_submission).
"""
)
submissions = download_submissions()
df = format_submissions(submissions)
# hack to remove index column from https://github.com/streamlit/streamlit/issues/641
st.table(df.assign(hack="").set_index("hack").style.format(precision=3))