Spaces:
Runtime error
Runtime error
import os | |
from datetime import datetime | |
from pathlib import Path | |
import numpy as np | |
import pandas as pd | |
import requests | |
import streamlit as st | |
from datasets import get_dataset_config_names | |
from dotenv import load_dotenv | |
# Load credentials from a local .env file when one is present.
if Path(".env").is_file():
    load_dotenv(".env")

auth_token = os.getenv("HF_HUB_TOKEN")
# Only build an Authorization header when a token is configured: concatenating
# "Bearer " with None would raise a TypeError at import time. Without a token
# the requests below are sent unauthenticated.
header = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

TASKS = sorted(get_dataset_config_names("ought/raft"))
# Split and capitalize the task names, e.g. banking_77 => Banking 77.
# The list deliberately keeps the order of TASKS instead of being re-sorted:
# format_submissions pairs both lists positionally with zip(), and sorting the
# formatted names separately can reorder them (" " sorts before digits, "_"
# sorts after them), which would mislabel the score columns.
FORMATTED_TASK_NAMES = [" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]
def extract_tags(dataset):
    """Turn a dataset's ``key:value`` tag strings into a dictionary.

    Args:
        dataset: Mapping describing a dataset. Its ``"tags"`` entry, when
            present, is an iterable of strings.

    Returns:
        A dict mapping the text before the first ``:`` of each tag to the
        text after it. Tags that contain no ``:`` are skipped, and a later
        tag overrides an earlier one that shares the same key. A missing or
        empty ``"tags"`` entry yields an empty dict.
    """
    tags = {}
    for tag in dataset.get("tags") or []:
        # partition splits on the first ":" only, so values may contain colons
        key, separator, value = tag.partition(":")
        if not separator:
            # Free-form tag without key:value structure; unpacking a split()
            # result would raise ValueError here.
            continue
        tags[key] = value
    return tags
def download_submissions():
    """Fetch the dataset listing and keep the RAFT evaluation datasets.

    Returns:
        A list with every dataset entry from the listing whose tags contain
        both ``benchmark:raft`` and ``type:evaluation``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    # HTTPS so the bearer token in `header` is never sent in clear text.
    response = requests.get("https://huggingface.co/api/datasets", headers=header, timeout=30)
    # Fail with the HTTP error itself rather than trying to iterate an
    # error payload as if it were the dataset list.
    response.raise_for_status()

    submissions = []
    for dataset in response.json():
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)
    return submissions
def format_submissions(submissions):
    """Build the leaderboard table from a list of submission datasets.

    For every submission the full dataset record is requested from the API
    and its ``cardData`` is read for the submitter, the submission name and
    date, and one score per task.

    Args:
        submissions: Iterable of dataset entries, each with an ``"id"`` key.

    Returns:
        A DataFrame with the columns ``Rank``, ``Submitter``,
        ``Submission Name``, ``Submission Date``, ``Overall`` and one column
        per task (renamed to its formatted name), sorted by ``Overall`` in
        descending order. ``Rank`` starts at 1.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    rows = []
    for submission in submissions:
        # HTTPS so the bearer token in `header` is never sent in clear text.
        response = requests.get(
            f"https://huggingface.co/api/datasets/{submission['id']}?full=true",
            headers=header,
            timeout=30,
        )
        response.raise_for_status()
        card_data = response.json()["cardData"]

        # The id is "<name>__<sha>__<timestamp>"; splitting from the right
        # keeps a submission name that itself contains "__" intact.
        submission_name, _sha, timestamp = card_data["submission_id"].rsplit("__", 2)
        # NOTE(review): pd.to_datetime reads a bare integer as nanoseconds
        # since the epoch -- confirm that matches the unit of the id suffix.
        timestamp = pd.to_datetime(int(timestamp))

        row = {
            "Submitter": card_data["submission_dataset"].split("/")[0],
            "Submission Name": submission_name,
            "Submission Date": timestamp.strftime("%b %d, %Y"),
        }
        for result in card_data["results"]:
            task_data = result["task"]
            row[task_data["name"]] = task_data["metrics"][0]["value"]
        rows.append(row)

    # One record per submission keeps every row aligned: a task missing from
    # a submission becomes NaN, and keys that are not in `columns` are dropped
    # instead of leaving the per-column lists with unequal lengths.
    columns = ["Submitter", "Submission Name", "Submission Date", *TASKS]
    df = pd.DataFrame(rows, columns=columns)

    # skipna=False: a submission lacking any task score gets a NaN average
    # (sorted last) rather than an average over fewer tasks.
    df.insert(3, "Overall", df[TASKS].mean(axis=1, skipna=False))
    df = df.sort_values("Overall", ascending=False)
    df = df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)))
    # Start ranking from 1
    df.insert(0, "Rank", np.arange(1, len(df) + 1))
    return df
###########
### APP ###
###########
st.set_page_config(layout="wide")
st.title("RAFT: Real-world Annotated Few-shot Tasks")
st.markdown(
    """
Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants?
[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:
- across multiple domains (lit review, tweets, customer interaction, etc.)
- on economically valuable classification tasks (someone inherently cares about the task)
- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)
To submit to RAFT, follow the instruction posted on [this page](https://huggingface.co/datasets/ought/raft-submission).
"""
)

# Download the submissions and turn them into the leaderboard table.
submissions = download_submissions()
df = format_submissions(submissions)

# Show floats with three decimals. Styler.set_precision was deprecated in
# pandas 1.3 and removed in 2.0; Styler.format(precision=...) replaces it.
try:
    styler = df.style.format(precision=3)
except TypeError:
    # pandas < 1.3: Styler.format has no `precision` keyword yet
    styler = df.style.set_precision(3)
styler = styler.set_properties(**{"white-space": "pre-wrap", "text-align": "center"})

# hack to remove index column: https://discuss.streamlit.io/t/questions-on-st-table/6878/3
st.markdown(
    """
<style>
table td:nth-child(1) {
display: none
}
table th:nth-child(1) {
display: none
}
</style>
""",
    unsafe_allow_html=True,
)
st.table(styler)