1 import os
2 from datetime import datetime
3 from pathlib import Path
4
5 import numpy as np
6 import pandas as pd
7 import requests
8 import streamlit as st
9 from datasets import get_dataset_config_names
10 from dotenv import load_dotenv
11
# Load environment variables from a local .env file if present (development setup).
if Path(".env").is_file():
    load_dotenv(".env")

auth_token = os.getenv("HF_HUB_TOKEN")
# Only send an Authorization header when a token is configured:
# "Bearer " + None would raise a TypeError at import time.
header = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

# All RAFT task (config) names, e.g. "banking_77".
TASKS = sorted(get_dataset_config_names("ought/raft"))
# Split and capitalize the task names, e.g. banking_77 => Banking 77
FORMATTED_TASK_NAMES = sorted(" ".join(word.capitalize() for word in task.split("_")) for task in TASKS)
21
22
def extract_tags(dataset):
    """Return the dataset's tags as a key -> value mapping.

    Each tag is a "key:value" string; only the first colon is treated as the
    separator, so values may themselves contain colons. If a key appears more
    than once, the last occurrence wins.
    """
    pairs = (tag.split(":", 1) for tag in dataset["tags"])
    return {key: value for key, value in pairs}
29
30
def download_submissions():
    """List all Hub datasets and keep those tagged as RAFT evaluation submissions.

    Returns:
        list[dict]: raw dataset metadata for every dataset whose tags include
        ``benchmark:raft`` and ``type:evaluation``.

    Raises:
        requests.HTTPError: if the Hub API request fails.
    """
    # Use HTTPS and fail loudly on HTTP errors instead of silently trying to
    # parse an error payload as JSON.
    response = requests.get("https://huggingface.co/api/datasets", headers=header)
    response.raise_for_status()
    all_datasets = response.json()

    submissions = []
    for dataset in all_datasets:
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)
    return submissions
42
43
def format_submissions(submissions):
    """Build the leaderboard DataFrame from the submission datasets.

    For each submission, the full dataset card is fetched from the Hub and its
    per-task scores are collected into one leaderboard row.

    Args:
        submissions: dataset metadata dicts as returned by ``download_submissions``.

    Returns:
        pd.DataFrame with columns Rank / Submitter / Submission Name /
        Submission Date / Overall followed by one column per formatted task
        name, sorted by the "Overall" mean score in descending order.

    Raises:
        requests.HTTPError: if a dataset-card request fails.
    """
    submission_data = {
        "Submitter": [],
        "Submission Name": [],
        "Submission Date": [],
        **{task: [] for task in TASKS},
    }

    # The following picks the latest submissions which adhere to the model card schema
    for submission in submissions:
        dataset_id = submission["id"]
        # Use HTTPS and surface HTTP errors instead of parsing an error body.
        response = requests.get(
            f"https://huggingface.co/api/datasets/{dataset_id}?full=true",
            headers=header,
        )
        response.raise_for_status()
        card_data = response.json()["cardData"]

        # The submission dataset lives under the submitter's namespace.
        username = card_data["submission_dataset"].split("/")[0]
        submission_data["Submitter"].append(username)

        # submission_id has the form "<name>__<sha>__<timestamp>".
        submission_name, _sha, timestamp = card_data["submission_id"].split("__")
        submission_data["Submission Name"].append(submission_name)
        # pd.to_datetime on an int interprets it as nanoseconds since the epoch;
        # .date() replaces the original unbound-method call datetime.date(ts).
        submission_date = pd.to_datetime(int(timestamp)).date()
        submission_data["Submission Date"].append(submission_date.strftime("%b %d, %Y"))

        for task in card_data["results"]:
            task_data = task["task"]
            submission_data[task_data["name"]].append(task_data["metrics"][0]["value"])

    df = pd.DataFrame(submission_data)
    df.insert(3, "Overall", df[TASKS].mean(axis=1))
    # sort_values already returns a new frame; the intermediate .copy() was redundant.
    df = df.sort_values("Overall", ascending=False)
    df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)), inplace=True)
    # Start ranking from 1
    df.insert(0, "Rank", np.arange(1, len(df) + 1))
    return df
77
78
###########
### APP ###
###########
st.set_page_config(layout="wide")
st.title("RAFT: Real-world Annotated Few-shot Tasks")
st.markdown(
    """
Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants?

[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:

- across multiple domains (lit review, tweets, customer interaction, etc.)
- on economically valuable classification tasks (someone inherently cares about the task)
- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)

To submit to RAFT, follow the instruction posted on [this page](https://huggingface.co/datasets/ought/raft-submission).
"""
)
submissions = download_submissions()
df = format_submissions(submissions)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is the supported replacement.
styler = df.style.format(precision=3).set_properties(**{"white-space": "pre-wrap", "text-align": "center"})
# hack to remove index column: https://discuss.streamlit.io/t/questions-on-st-table/6878/3
st.markdown(
    """
<style>
table td:nth-child(1) {
    display: none
}
table th:nth-child(1) {
    display: none
}
</style>
""",
    unsafe_allow_html=True,
)
st.table(styler)
115