Spaces:

ought
/

raft-leaderboard

Running

App Files Community

lewtun HF staff committed on Oct 11, 2021

Commit

7cfc852

•

1 Parent(s): b3c67da

Tweak submission column names

Browse files

Files changed (2) hide show

Untitled.ipynb +1833 -0
app.py +3 -4

Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,1833 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "c0cdda73-430c-4e18-bce4-b2218e2597b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, get_dataset_config_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4981ce75-5d13-4fd2-b08f-af077066f7d3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "13e20072-0304-424a-923d-ac31a1769e94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from datetime import datetime\n",
+    "from pathlib import Path\n",
+    "from re import sub\n",
+    "\n",
+    "import pandas as pd\n",
+    "import requests\n",
+    "import streamlit as st\n",
+    "from datasets import get_dataset_config_names\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "if Path(\".env\").is_file():\n",
+    "    load_dotenv(\".env\")\n",
+    "\n",
+    "auth_token = os.getenv(\"HF_HUB_TOKEN\")\n",
+    "header = {\"Authorization\": \"Bearer \" + auth_token}\n",
+    "\n",
+    "TASKS = get_dataset_config_names(\"ought/raft\")\n",
+    "# Split and capitalize the task names, e.g. banking_77 => Banking 77\n",
+    "FORMATTED_TASK_NAMES = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]\n",
+    "\n",
+    "\n",
+    "def extract_tags(dataset):\n",
+    "    tags = {}\n",
+    "    for tag in dataset[\"tags\"]:\n",
+    "        k, v = tuple(tag.split(\":\", 1))\n",
+    "        tags[k] = v\n",
+    "    return tags\n",
+    "\n",
+    "\n",
+    "def download_submissions():\n",
+    "    response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n",
+    "    all_datasets = response.json()\n",
+    "\n",
+    "    submissions = []\n",
+    "\n",
+    "    for dataset in all_datasets:\n",
+    "        tags = extract_tags(dataset)\n",
+    "        if tags.get(\"benchmark\") == \"ought/raft\" and tags.get(\"type\") == \"evaluation\":\n",
+    "            submissions.append(dataset)\n",
+    "    return submissions\n",
+    "\n",
+    "\n",
+    "def format_submissions(submissions):\n",
+    "    submission_data = {**{\"Submission\": []}, **{\"Date\": []}, **{t: [] for t in TASKS}}\n",
+    "\n",
+    "    # TODO(lewtun): delete / filter all the junk repos from development\n",
+    "    # The following picks the latest submissions which adhere to the model card schema\n",
+    "    for submission in submissions:\n",
+    "        submission_id = submission[\"id\"]\n",
+    "        response = requests.get(\n",
+    "            f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n",
+    "            headers=header,\n",
+    "        )\n",
+    "        data = response.json()\n",
+    "        card_data = data[\"card_data\"]\n",
+    "        submission_name = card_data[\"submission_dataset\"]\n",
+    "        submission_data[\"Submission\"].append(submission_name)\n",
+    "        submission_id = card_data[\"submission_id\"]\n",
+    "        timestamp = submission_id.split(\"-\")[-1]\n",
+    "        timestamp = pd.to_datetime(int(timestamp))\n",
+    "        submission_data[\"Date\"].append(datetime.date(timestamp))\n",
+    "\n",
+    "        for task in card_data[\"results\"]:\n",
+    "            task_data = task[\"task\"]\n",
+    "            task_name = task_data[\"name\"]\n",
+    "            score = task_data[\"metrics\"][0][\"value\"]\n",
+    "            submission_data[task_name].append(score)\n",
+    "\n",
+    "    df = pd.DataFrame(submission_data)\n",
+    "    df.insert(2, \"Overall\", df[TASKS].mean(axis=1))\n",
+    "    df = df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\": \"Rank\"})\n",
+    "    df.rename(columns={k: v for k, v in zip(TASKS, FORMATTED_TASK_NAMES)}, inplace=True)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "8dccc419-7b18-4a10-a4bf-2d69cc3b5888",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "submissions = download_submissions()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "934ea3b9-76dd-4d8f-a62d-8e2fa5959111",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(submissions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "c3803890-d664-4d24-86bc-8fb095cad40a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = format_submissions(submissions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "2de6f903-c327-42b6-a1ca-a530a62cc412",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Rank</th>\n",
+       "      <th>Submission</th>\n",
+       "      <th>Date</th>\n",
+       "      <th>Overall</th>\n",
+       "      <th>Ade Corpus V2</th>\n",
+       "      <th>Banking 77</th>\n",
+       "      <th>Terms Of Service</th>\n",
+       "      <th>Tai Safety Research</th>\n",
+       "      <th>Neurips Impact Statement Risks</th>\n",
+       "      <th>Overruling</th>\n",
+       "      <th>Systematic Review Inclusion</th>\n",
+       "      <th>One Stop English</th>\n",
+       "      <th>Tweet Eval Hate</th>\n",
+       "      <th>Twitter Complaints</th>\n",
+       "      <th>Semiconductor Org Types</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Human baseline (crowdsourced)</td>\n",
+       "      <td>2021-08-27</td>\n",
+       "      <td>0.735273</td>\n",
+       "      <td>0.830</td>\n",
+       "      <td>0.607</td>\n",
+       "      <td>0.627</td>\n",
+       "      <td>0.609</td>\n",
+       "      <td>0.857</td>\n",
+       "      <td>0.917</td>\n",
+       "      <td>0.468</td>\n",
+       "      <td>0.646</td>\n",
+       "      <td>0.722</td>\n",
+       "      <td>0.897</td>\n",
+       "      <td>0.908</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>GPT-3 baseline</td>\n",
+       "      <td>2021-08-27</td>\n",
+       "      <td>0.631000</td>\n",
+       "      <td>0.688</td>\n",
+       "      <td>0.295</td>\n",
+       "      <td>0.579</td>\n",
+       "      <td>0.667</td>\n",
+       "      <td>0.595</td>\n",
+       "      <td>0.940</td>\n",
+       "      <td>0.535</td>\n",
+       "      <td>0.407</td>\n",
+       "      <td>0.529</td>\n",
+       "      <td>0.822</td>\n",
+       "      <td>0.884</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Rank                     Submission        Date   Overall  Ade Corpus V2  \\\n",
+       "0     1  Human baseline (crowdsourced)  2021-08-27  0.735273          0.830   \n",
+       "1     0                 GPT-3 baseline  2021-08-27  0.631000          0.688   \n",
+       "\n",
+       "   Banking 77  Terms Of Service  Tai Safety Research  \\\n",
+       "0       0.607             0.627                0.609   \n",
+       "1       0.295             0.579                0.667   \n",
+       "\n",
+       "   Neurips Impact Statement Risks  Overruling  Systematic Review Inclusion  \\\n",
+       "0                           0.857       0.917                        0.468   \n",
+       "1                           0.595       0.940                        0.535   \n",
+       "\n",
+       "   One Stop English  Tweet Eval Hate  Twitter Complaints  \\\n",
+       "0             0.646            0.722               0.897   \n",
+       "1             0.407            0.529               0.822   \n",
+       "\n",
+       "   Semiconductor Org Types  \n",
+       "0                    0.908  \n",
+       "1                    0.884  "
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "ca6ba762-047f-4074-a5c3-b4168c13d398",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "</style>\n",
+       "<table id=\"T_b6d1f_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Rank</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Submission</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Date</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Overall</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Ade Corpus V2</th>\n",
+       "      <th class=\"col_heading level0 col5\" >Banking 77</th>\n",
+       "      <th class=\"col_heading level0 col6\" >Terms Of Service</th>\n",
+       "      <th class=\"col_heading level0 col7\" >Tai Safety Research</th>\n",
+       "      <th class=\"col_heading level0 col8\" >Neurips Impact Statement Risks</th>\n",
+       "      <th class=\"col_heading level0 col9\" >Overruling</th>\n",
+       "      <th class=\"col_heading level0 col10\" >Systematic Review Inclusion</th>\n",
+       "      <th class=\"col_heading level0 col11\" >One Stop English</th>\n",
+       "      <th class=\"col_heading level0 col12\" >Tweet Eval Hate</th>\n",
+       "      <th class=\"col_heading level0 col13\" >Twitter Complaints</th>\n",
+       "      <th class=\"col_heading level0 col14\" >Semiconductor Org Types</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b6d1f_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
+       "      <td id=\"T_b6d1f_row0_col0\" class=\"data row0 col0\" >1</td>\n",
+       "      <td id=\"T_b6d1f_row0_col1\" class=\"data row0 col1\" >Human baseline (crowdsourced)</td>\n",
+       "      <td id=\"T_b6d1f_row0_col2\" class=\"data row0 col2\" >2021-08-27</td>\n",
+       "      <td id=\"T_b6d1f_row0_col3\" class=\"data row0 col3\" >0.735</td>\n",
+       "      <td id=\"T_b6d1f_row0_col4\" class=\"data row0 col4\" >0.830</td>\n",
+       "      <td id=\"T_b6d1f_row0_col5\" class=\"data row0 col5\" >0.607</td>\n",
+       "      <td id=\"T_b6d1f_row0_col6\" class=\"data row0 col6\" >0.627</td>\n",
+       "      <td id=\"T_b6d1f_row0_col7\" class=\"data row0 col7\" >0.609</td>\n",
+       "      <td id=\"T_b6d1f_row0_col8\" class=\"data row0 col8\" >0.857</td>\n",
+       "      <td id=\"T_b6d1f_row0_col9\" class=\"data row0 col9\" >0.917</td>\n",
+       "      <td id=\"T_b6d1f_row0_col10\" class=\"data row0 col10\" >0.468</td>\n",
+       "      <td id=\"T_b6d1f_row0_col11\" class=\"data row0 col11\" >0.646</td>\n",
+       "      <td id=\"T_b6d1f_row0_col12\" class=\"data row0 col12\" >0.722</td>\n",
+       "      <td id=\"T_b6d1f_row0_col13\" class=\"data row0 col13\" >0.897</td>\n",
+       "      <td id=\"T_b6d1f_row0_col14\" class=\"data row0 col14\" >0.908</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b6d1f_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
+       "      <td id=\"T_b6d1f_row1_col0\" class=\"data row1 col0\" >0</td>\n",
+       "      <td id=\"T_b6d1f_row1_col1\" class=\"data row1 col1\" >GPT-3 baseline</td>\n",
+       "      <td id=\"T_b6d1f_row1_col2\" class=\"data row1 col2\" >2021-08-27</td>\n",
+       "      <td id=\"T_b6d1f_row1_col3\" class=\"data row1 col3\" >0.631</td>\n",
+       "      <td id=\"T_b6d1f_row1_col4\" class=\"data row1 col4\" >0.688</td>\n",
+       "      <td id=\"T_b6d1f_row1_col5\" class=\"data row1 col5\" >0.295</td>\n",
+       "      <td id=\"T_b6d1f_row1_col6\" class=\"data row1 col6\" >0.579</td>\n",
+       "      <td id=\"T_b6d1f_row1_col7\" class=\"data row1 col7\" >0.667</td>\n",
+       "      <td id=\"T_b6d1f_row1_col8\" class=\"data row1 col8\" >0.595</td>\n",
+       "      <td id=\"T_b6d1f_row1_col9\" class=\"data row1 col9\" >0.940</td>\n",
+       "      <td id=\"T_b6d1f_row1_col10\" class=\"data row1 col10\" >0.535</td>\n",
+       "      <td id=\"T_b6d1f_row1_col11\" class=\"data row1 col11\" >0.407</td>\n",
+       "      <td id=\"T_b6d1f_row1_col12\" class=\"data row1 col12\" >0.529</td>\n",
+       "      <td id=\"T_b6d1f_row1_col13\" class=\"data row1 col13\" >0.822</td>\n",
+       "      <td id=\"T_b6d1f_row1_col14\" class=\"data row1 col14\" >0.884</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x7fba946d44c0>"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.style.format(precision=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "094e757c-1c6a-4d01-abb1-872face8c72b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = df.assign(hack=\"\").set_index(\"hack\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "2ff434e2-5bf6-453f-8470-28c7b1034154",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "</style>\n",
+       "<table id=\"T_59a1f_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Rank</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Submission</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Date</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Overall</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Ade Corpus V2</th>\n",
+       "      <th class=\"col_heading level0 col5\" >Banking 77</th>\n",
+       "      <th class=\"col_heading level0 col6\" >Terms Of Service</th>\n",
+       "      <th class=\"col_heading level0 col7\" >Tai Safety Research</th>\n",
+       "      <th class=\"col_heading level0 col8\" >Neurips Impact Statement Risks</th>\n",
+       "      <th class=\"col_heading level0 col9\" >Overruling</th>\n",
+       "      <th class=\"col_heading level0 col10\" >Systematic Review Inclusion</th>\n",
+       "      <th class=\"col_heading level0 col11\" >One Stop English</th>\n",
+       "      <th class=\"col_heading level0 col12\" >Tweet Eval Hate</th>\n",
+       "      <th class=\"col_heading level0 col13\" >Twitter Complaints</th>\n",
+       "      <th class=\"col_heading level0 col14\" >Semiconductor Org Types</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >hack</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "      <th class=\"blank col5\" >&nbsp;</th>\n",
+       "      <th class=\"blank col6\" >&nbsp;</th>\n",
+       "      <th class=\"blank col7\" >&nbsp;</th>\n",
+       "      <th class=\"blank col8\" >&nbsp;</th>\n",
+       "      <th class=\"blank col9\" >&nbsp;</th>\n",
+       "      <th class=\"blank col10\" >&nbsp;</th>\n",
+       "      <th class=\"blank col11\" >&nbsp;</th>\n",
+       "      <th class=\"blank col12\" >&nbsp;</th>\n",
+       "      <th class=\"blank col13\" >&nbsp;</th>\n",
+       "      <th class=\"blank col14\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_59a1f_level0_row0\" class=\"row_heading level0 row0\" ></th>\n",
+       "      <td id=\"T_59a1f_row0_col0\" class=\"data row0 col0\" >1</td>\n",
+       "      <td id=\"T_59a1f_row0_col1\" class=\"data row0 col1\" >Human baseline (crowdsourced)</td>\n",
+       "      <td id=\"T_59a1f_row0_col2\" class=\"data row0 col2\" >2021-08-27</td>\n",
+       "      <td id=\"T_59a1f_row0_col3\" class=\"data row0 col3\" >0.735</td>\n",
+       "      <td id=\"T_59a1f_row0_col4\" class=\"data row0 col4\" >0.830</td>\n",
+       "      <td id=\"T_59a1f_row0_col5\" class=\"data row0 col5\" >0.607</td>\n",
+       "      <td id=\"T_59a1f_row0_col6\" class=\"data row0 col6\" >0.627</td>\n",
+       "      <td id=\"T_59a1f_row0_col7\" class=\"data row0 col7\" >0.609</td>\n",
+       "      <td id=\"T_59a1f_row0_col8\" class=\"data row0 col8\" >0.857</td>\n",
+       "      <td id=\"T_59a1f_row0_col9\" class=\"data row0 col9\" >0.917</td>\n",
+       "      <td id=\"T_59a1f_row0_col10\" class=\"data row0 col10\" >0.468</td>\n",
+       "      <td id=\"T_59a1f_row0_col11\" class=\"data row0 col11\" >0.646</td>\n",
+       "      <td id=\"T_59a1f_row0_col12\" class=\"data row0 col12\" >0.722</td>\n",
+       "      <td id=\"T_59a1f_row0_col13\" class=\"data row0 col13\" >0.897</td>\n",
+       "      <td id=\"T_59a1f_row0_col14\" class=\"data row0 col14\" >0.908</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_59a1f_level0_row1\" class=\"row_heading level0 row1\" ></th>\n",
+       "      <td id=\"T_59a1f_row1_col0\" class=\"data row1 col0\" >0</td>\n",
+       "      <td id=\"T_59a1f_row1_col1\" class=\"data row1 col1\" >GPT-3 baseline</td>\n",
+       "      <td id=\"T_59a1f_row1_col2\" class=\"data row1 col2\" >2021-08-27</td>\n",
+       "      <td id=\"T_59a1f_row1_col3\" class=\"data row1 col3\" >0.631</td>\n",
+       "      <td id=\"T_59a1f_row1_col4\" class=\"data row1 col4\" >0.688</td>\n",
+       "      <td id=\"T_59a1f_row1_col5\" class=\"data row1 col5\" >0.295</td>\n",
+       "      <td id=\"T_59a1f_row1_col6\" class=\"data row1 col6\" >0.579</td>\n",
+       "      <td id=\"T_59a1f_row1_col7\" class=\"data row1 col7\" >0.667</td>\n",
+       "      <td id=\"T_59a1f_row1_col8\" class=\"data row1 col8\" >0.595</td>\n",
+       "      <td id=\"T_59a1f_row1_col9\" class=\"data row1 col9\" >0.940</td>\n",
+       "      <td id=\"T_59a1f_row1_col10\" class=\"data row1 col10\" >0.535</td>\n",
+       "      <td id=\"T_59a1f_row1_col11\" class=\"data row1 col11\" >0.407</td>\n",
+       "      <td id=\"T_59a1f_row1_col12\" class=\"data row1 col12\" >0.529</td>\n",
+       "      <td id=\"T_59a1f_row1_col13\" class=\"data row1 col13\" >0.822</td>\n",
+       "      <td id=\"T_59a1f_row1_col14\" class=\"data row1 col14\" >0.884</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x7fba946d4910>"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2.style.format(precision=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 186,
+   "id": "8be02c77-bda3-499b-9ac4-d50ec35644a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for submission in submissions[-1:]:\n",
+    "    submission_id = submission[\"id\"]\n",
+    "    response = requests.get(\n",
+    "        f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n",
+    "        headers=header,\n",
+    "    )\n",
+    "    data = response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 188,
+   "id": "7ab07904-0f7e-401b-96f8-3558433e479a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'Submission': [], 'foo': [], 'bar': []}"
+      ]
+     },
+     "execution_count": 188,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "{**{\"Submission\": []}, **{\"foo\":[]}, **{\"bar\": []}}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 191,
+   "id": "69ffb778-09cf-4eb8-ab95-739700d68420",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'f5a21c3fcb58ac17c8a47cfffd509b55cbad7ccf-1629986165000000000'"
+      ]
+     },
+     "execution_count": 191,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sub_id = data[\"card_data\"][\"submission_id\"]\n",
+    "sub_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 195,
+   "id": "f7c3e8c0-68c7-4bad-802b-1b39703e100d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'1629986165000000000'"
+      ]
+     },
+     "execution_count": 195,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t = sub_id.split(\"-\")[-1]\n",
+    "t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 197,
+   "id": "34a7483c-0b00-42a7-99b9-ee6bdf34048a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Timestamp('2021-08-26 13:56:05')"
+      ]
+     },
+     "execution_count": 197,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "d = pd.to_datetime(int(t))\n",
+    "d"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 201,
+   "id": "95f1ecff-025f-4b42-8761-2c0964dfac5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "id": "c606cfea-165d-4b58-ba37-6fc9b06795cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>d</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2021-08-26</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            d\n",
+       "0  2021-08-26"
+      ]
+     },
+     "execution_count": 205,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame({\"d\":[datetime.datetime.date(d)]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "id": "eebefd5a-6451-44b9-bc0f-d0663f321e34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timestamp = data[\"lastModified\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 149,
+   "id": "bf663ca3-12e8-4178-9aef-aba46621477a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "id": "4723aeb6-3993-49b1-b779-c1394b54d776",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Timestamp('2021-08-04 22:52:57+0000', tz='UTC')"
+      ]
+     },
+     "execution_count": 160,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t = pd.to_datetime(timestamp)\n",
+    "t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "id": "cbcc2bf8-e2c8-449c-9f00-38ed80e46ae0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2021-08-04T22:52:57.000Z'"
+      ]
+     },
+     "execution_count": 157,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "timestamp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "id": "177574a9-327e-4999-a1db-c316bb741c8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t_int = int(t.timestamp() * 10 **9)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 162,
+   "id": "b3aa4f70-50af-47b3-a492-c77f65266a5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Timestamp('2021-08-04 22:52:57')"
+      ]
+     },
+     "execution_count": 162,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.to_datetime(t_int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "id": "ae7c9100-a630-4b4e-a060-331914f86055",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "submissions = download_submissions()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "id": "831077a1-7f44-4d31-94b3-49257a62c5f7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "16"
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(submissions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "4a8a9ff9-7f94-4abb-8194-9d570ad2216b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'id': 'autonlp/autonlp-benchmark-raft-ought__raft-ought__raft-dummy-predictions-642',\n",
+       "  'private': True,\n",
+       "  'tags': ['benchmark:ought/raft',\n",
+       "   'type:evaluation',\n",
+       "   'submission_dataset:ought/raft-dummy-predictions',\n",
+       "   'tags:autonlp',\n",
+       "   'tags:evaluation',\n",
+       "   'tags:benchmark'],\n",
+       "  'author': 'autonlp',\n",
+       "  'key': ''}]"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "submissions[-1:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "2d6e56cb-fca3-4e9e-9a8b-9d2e26816773",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = format_submissions(submissions[-2:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "7d8a3402-f7b8-4edb-8d1e-afb704dc3c67",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Submission</th>\n",
+       "      <th>Overall</th>\n",
+       "      <th>banking_77</th>\n",
+       "      <th>medical_subdomain_of_clinical_notes</th>\n",
+       "      <th>overruling</th>\n",
+       "      <th>gpai_initiatives</th>\n",
+       "      <th>semiconductor_org_types</th>\n",
+       "      <th>twitter_complaints</th>\n",
+       "      <th>neurips_impact_statement_risks</th>\n",
+       "      <th>systematic_review_inclusion</th>\n",
+       "      <th>terms_of_service</th>\n",
+       "      <th>tai_safety_research</th>\n",
+       "      <th>one_stop_english</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>lewtun/my-raft-dummy-predictions</td>\n",
+       "      <td>0.605079</td>\n",
+       "      <td>0.948903</td>\n",
+       "      <td>0.716526</td>\n",
+       "      <td>0.064395</td>\n",
+       "      <td>0.529422</td>\n",
+       "      <td>0.643723</td>\n",
+       "      <td>0.873478</td>\n",
+       "      <td>0.756919</td>\n",
+       "      <td>0.381609</td>\n",
+       "      <td>0.461302</td>\n",
+       "      <td>0.624133</td>\n",
+       "      <td>0.655457</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ought/raft-dummy-predictions</td>\n",
+       "      <td>0.407345</td>\n",
+       "      <td>0.009504</td>\n",
+       "      <td>0.591213</td>\n",
+       "      <td>0.552390</td>\n",
+       "      <td>0.594769</td>\n",
+       "      <td>0.339822</td>\n",
+       "      <td>0.728116</td>\n",
+       "      <td>0.878378</td>\n",
+       "      <td>0.291842</td>\n",
+       "      <td>0.144772</td>\n",
+       "      <td>0.089622</td>\n",
+       "      <td>0.260366</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Submission   Overall  banking_77  \\\n",
+       "1  lewtun/my-raft-dummy-predictions  0.605079    0.948903   \n",
+       "0      ought/raft-dummy-predictions  0.407345    0.009504   \n",
+       "\n",
+       "   medical_subdomain_of_clinical_notes  overruling  gpai_initiatives  \\\n",
+       "1                             0.716526    0.064395          0.529422   \n",
+       "0                             0.591213    0.552390          0.594769   \n",
+       "\n",
+       "   semiconductor_org_types  twitter_complaints  \\\n",
+       "1                 0.643723            0.873478   \n",
+       "0                 0.339822            0.728116   \n",
+       "\n",
+       "   neurips_impact_statement_risks  systematic_review_inclusion  \\\n",
+       "1                        0.756919                     0.381609   \n",
+       "0                        0.878378                     0.291842   \n",
+       "\n",
+       "   terms_of_service  tai_safety_research  one_stop_english  \n",
+       "1          0.461302             0.624133          0.655457  \n",
+       "0          0.144772             0.089622          0.260366  "
+      ]
+     },
+     "execution_count": 109,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "id": "f60f453b-2457-4597-9eee-324d4c3a2f2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.insert(1, \"Overall\", df[TASKS].mean(axis=1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "1fd83f7a-b554-4e7d-aef6-4338b01f3eec",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Rank</th>\n",
+       "      <th>Submission</th>\n",
+       "      <th>Overall</th>\n",
+       "      <th>banking_77</th>\n",
+       "      <th>medical_subdomain_of_clinical_notes</th>\n",
+       "      <th>overruling</th>\n",
+       "      <th>gpai_initiatives</th>\n",
+       "      <th>semiconductor_org_types</th>\n",
+       "      <th>twitter_complaints</th>\n",
+       "      <th>neurips_impact_statement_risks</th>\n",
+       "      <th>systematic_review_inclusion</th>\n",
+       "      <th>terms_of_service</th>\n",
+       "      <th>tai_safety_research</th>\n",
+       "      <th>one_stop_english</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>lewtun/my-raft-dummy-predictions</td>\n",
+       "      <td>0.605079</td>\n",
+       "      <td>0.948903</td>\n",
+       "      <td>0.716526</td>\n",
+       "      <td>0.064395</td>\n",
+       "      <td>0.529422</td>\n",
+       "      <td>0.643723</td>\n",
+       "      <td>0.873478</td>\n",
+       "      <td>0.756919</td>\n",
+       "      <td>0.381609</td>\n",
+       "      <td>0.461302</td>\n",
+       "      <td>0.624133</td>\n",
+       "      <td>0.655457</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>ought/raft-dummy-predictions</td>\n",
+       "      <td>0.407345</td>\n",
+       "      <td>0.009504</td>\n",
+       "      <td>0.591213</td>\n",
+       "      <td>0.552390</td>\n",
+       "      <td>0.594769</td>\n",
+       "      <td>0.339822</td>\n",
+       "      <td>0.728116</td>\n",
+       "      <td>0.878378</td>\n",
+       "      <td>0.291842</td>\n",
+       "      <td>0.144772</td>\n",
+       "      <td>0.089622</td>\n",
+       "      <td>0.260366</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Rank                        Submission   Overall  banking_77  \\\n",
+       "0     1  lewtun/my-raft-dummy-predictions  0.605079    0.948903   \n",
+       "1     0      ought/raft-dummy-predictions  0.407345    0.009504   \n",
+       "\n",
+       "   medical_subdomain_of_clinical_notes  overruling  gpai_initiatives  \\\n",
+       "0                             0.716526    0.064395          0.529422   \n",
+       "1                             0.591213    0.552390          0.594769   \n",
+       "\n",
+       "   semiconductor_org_types  twitter_complaints  \\\n",
+       "0                 0.643723            0.873478   \n",
+       "1                 0.339822            0.728116   \n",
+       "\n",
+       "   neurips_impact_statement_risks  systematic_review_inclusion  \\\n",
+       "0                        0.756919                     0.381609   \n",
+       "1                        0.878378                     0.291842   \n",
+       "\n",
+       "   terms_of_service  tai_safety_research  one_stop_english  \n",
+       "0          0.461302             0.624133          0.655457  \n",
+       "1          0.144772             0.089622          0.260366  "
+      ]
+     },
+     "execution_count": 110,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\":\"Rank\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "id": "e1262ff5-6ea3-41ca-affc-b106dd9df5fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "task_names = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "id": "45d74b9c-c472-4494-aadc-909976d13b08",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Submission</th>\n",
+       "      <th>Overall</th>\n",
+       "      <th>Banking 77</th>\n",
+       "      <th>Medical Subdomain Of Clinical Notes</th>\n",
+       "      <th>Overruling</th>\n",
+       "      <th>Gpai Initiatives</th>\n",
+       "      <th>Semiconductor Org Types</th>\n",
+       "      <th>Twitter Complaints</th>\n",
+       "      <th>Neurips Impact Statement Risks</th>\n",
+       "      <th>Systematic Review Inclusion</th>\n",
+       "      <th>Terms Of Service</th>\n",
+       "      <th>Tai Safety Research</th>\n",
+       "      <th>One Stop English</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>lewtun/my-raft-dummy-predictions</td>\n",
+       "      <td>0.605079</td>\n",
+       "      <td>0.948903</td>\n",
+       "      <td>0.716526</td>\n",
+       "      <td>0.064395</td>\n",
+       "      <td>0.529422</td>\n",
+       "      <td>0.643723</td>\n",
+       "      <td>0.873478</td>\n",
+       "      <td>0.756919</td>\n",
+       "      <td>0.381609</td>\n",
+       "      <td>0.461302</td>\n",
+       "      <td>0.624133</td>\n",
+       "      <td>0.655457</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ought/raft-dummy-predictions</td>\n",
+       "      <td>0.407345</td>\n",
+       "      <td>0.009504</td>\n",
+       "      <td>0.591213</td>\n",
+       "      <td>0.552390</td>\n",
+       "      <td>0.594769</td>\n",
+       "      <td>0.339822</td>\n",
+       "      <td>0.728116</td>\n",
+       "      <td>0.878378</td>\n",
+       "      <td>0.291842</td>\n",
+       "      <td>0.144772</td>\n",
+       "      <td>0.089622</td>\n",
+       "      <td>0.260366</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Submission   Overall  Banking 77  \\\n",
+       "1  lewtun/my-raft-dummy-predictions  0.605079    0.948903   \n",
+       "0      ought/raft-dummy-predictions  0.407345    0.009504   \n",
+       "\n",
+       "   Medical Subdomain Of Clinical Notes  Overruling  Gpai Initiatives  \\\n",
+       "1                             0.716526    0.064395          0.529422   \n",
+       "0                             0.591213    0.552390          0.594769   \n",
+       "\n",
+       "   Semiconductor Org Types  Twitter Complaints  \\\n",
+       "1                 0.643723            0.873478   \n",
+       "0                 0.339822            0.728116   \n",
+       "\n",
+       "   Neurips Impact Statement Risks  Systematic Review Inclusion  \\\n",
+       "1                        0.756919                     0.381609   \n",
+       "0                        0.878378                     0.291842   \n",
+       "\n",
+       "   Terms Of Service  Tai Safety Research  One Stop English  \n",
+       "1          0.461302             0.624133          0.655457  \n",
+       "0          0.144772             0.089622          0.260366  "
+      ]
+     },
+     "execution_count": 121,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.rename(columns={k:v for k,v in zip(TASKS, task_names)})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "d31c2bde-1645-4c1b-982b-c9daac40311d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Submission</th>\n",
+       "      <th>Overall</th>\n",
+       "      <th>banking_77</th>\n",
+       "      <th>medical_subdomain_of_clinical_notes</th>\n",
+       "      <th>overruling</th>\n",
+       "      <th>gpai_initiatives</th>\n",
+       "      <th>semiconductor_org_types</th>\n",
+       "      <th>twitter_complaints</th>\n",
+       "      <th>neurips_impact_statement_risks</th>\n",
+       "      <th>systematic_review_inclusion</th>\n",
+       "      <th>terms_of_service</th>\n",
+       "      <th>tai_safety_research</th>\n",
+       "      <th>one_stop_english</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ought/raft-dummy-predictions</td>\n",
+       "      <td>0.407345</td>\n",
+       "      <td>0.009504</td>\n",
+       "      <td>0.591213</td>\n",
+       "      <td>0.55239</td>\n",
+       "      <td>0.594769</td>\n",
+       "      <td>0.339822</td>\n",
+       "      <td>0.728116</td>\n",
+       "      <td>0.878378</td>\n",
+       "      <td>0.291842</td>\n",
+       "      <td>0.144772</td>\n",
+       "      <td>0.089622</td>\n",
+       "      <td>0.260366</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                     Submission   Overall  banking_77  \\\n",
+       "0  ought/raft-dummy-predictions  0.407345    0.009504   \n",
+       "\n",
+       "   medical_subdomain_of_clinical_notes  overruling  gpai_initiatives  \\\n",
+       "0                             0.591213     0.55239          0.594769   \n",
+       "\n",
+       "   semiconductor_org_types  twitter_complaints  \\\n",
+       "0                 0.339822            0.728116   \n",
+       "\n",
+       "   neurips_impact_statement_risks  systematic_review_inclusion  \\\n",
+       "0                        0.878378                     0.291842   \n",
+       "\n",
+       "   terms_of_service  tai_safety_research  one_stop_english  \n",
+       "0          0.144772             0.089622          0.260366  "
+      ]
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.sort_values(\"Overall\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4df33059-020a-43cf-aa3a-de6939268cc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"Overall\"] = df.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "327539f3-3bf7-4a2e-ac10-89973a2ba37f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"Submission\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "f07ec556-2ebf-400e-85f7-c978d03b0dc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = format_submissions(submissions[-1:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "a982e024-ab16-4752-984a-5368fa238f1d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>bank</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   bank\n",
+       "0   0.2"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame({\"bank\":[0.2]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "b7c73606-d7f9-4f17-bf4d-17cfbb3aa664",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import get_dataset_config_names\n",
+    "\n",
+    "configs = get_dataset_config_names(\"glue\")\n",
+    "print(configs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "92eea464-6b63-4613-ab4d-aa5003e0bb3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import get_dataset_config_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "4f9c2924-001a-4b76-a8ed-a072b43eedbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tasks = get_dataset_config_names(\"ought/raft\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "9b27374f-b118-440a-acab-6e4aa09f42a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "submission_data = {t:[] for t in tasks}\n",
+    "\n",
+    "for task in data[\"card_data\"][\"results\"]:\n",
+    "    task_data = task[\"task\"]\n",
+    "    task_name = task_data[\"name\"]\n",
+    "    score = task_data[\"metrics\"][0][\"value\"]\n",
+    "    submission_data[task_name].append(score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "6b7cf2e0-ee92-4647-8d9b-6edef48e06f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'banking_77': [0.009504218288713173],\n",
+       " 'medical_subdomain_of_clinical_notes': [0.5912133593265538],\n",
+       " 'overruling': [0.5523904885287522],\n",
+       " 'gpai_initiatives': [0.5947694876413803],\n",
+       " 'semiconductor_org_types': [0.33982211621333613],\n",
+       " 'twitter_complaints': [0.7281156178656647],\n",
+       " 'neurips_impact_statement_risks': [0.8783775228874845],\n",
+       " 'systematic_review_inclusion': [0.2918416872180052],\n",
+       " 'terms_of_service': [0.14477157391911066],\n",
+       " 'tai_safety_research': [0.08962249895220364],\n",
+       " 'one_stop_english': [0.2603661495335281]}"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "submission_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "5df282e4-87c4-4ea7-833e-6a87886e2f76",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'benchmark': 'ought/raft',\n",
+       " 'type': 'evaluation',\n",
+       " 'submission_dataset': 'ought/raft-dummy-predictions',\n",
+       " 'tags': ['autonlp', 'evaluation', 'benchmark'],\n",
+       " 'model-index': None,\n",
+       " 'results': [{'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.009504218288713173}],\n",
+       "    'name': 'banking_77',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.5912133593265538}],\n",
+       "    'name': 'medical_subdomain_of_clinical_notes',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.5523904885287522}],\n",
+       "    'name': 'overruling',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.5947694876413803}],\n",
+       "    'name': 'gpai_initiatives',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.33982211621333613}],\n",
+       "    'name': 'semiconductor_org_types',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.7281156178656647}],\n",
+       "    'name': 'twitter_complaints',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.8783775228874845}],\n",
+       "    'name': 'neurips_impact_statement_risks',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.2918416872180052}],\n",
+       "    'name': 'systematic_review_inclusion',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.14477157391911066}],\n",
+       "    'name': 'terms_of_service',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.08962249895220364}],\n",
+       "    'name': 'tai_safety_research',\n",
+       "    'type': 'text-classification'}},\n",
+       "  {'task': {'metrics': [{'name': 'f1',\n",
+       "      'type': 'f1',\n",
+       "      'value': 0.2603661495335281}],\n",
+       "    'name': 'one_stop_english',\n",
+       "    'type': 'text-classification'}}]}"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[\"card_data\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b07a4fa9-176e-4ff3-bc3f-eb2a6fc9efda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n",
+    "all_datasets = response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "63dc07ec-2f28-483f-8163-c97e8a6a4005",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2510"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(all_datasets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "296f68c1-608d-4ea6-8d0e-cc35fb7d74c4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': 'disfl_qa',\n",
+       " 'tags': ['annotations_creators:expert-generated',\n",
+       "  'language_creators:found',\n",
+       "  'languages:en',\n",
+       "  'licenses:cc-by-4.0',\n",
+       "  'multilinguality:monolingual',\n",
+       "  'pretty_name:DISFL-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering',\n",
+       "  'size_categories:10K<n<100K',\n",
+       "  'source_datasets:original',\n",
+       "  'task_categories:question-answering',\n",
+       "  'task_ids:extractive-qa',\n",
+       "  'task_ids:open-domain-qa'],\n",
+       " 'citation': '@inproceedings{gupta-etal-2021-disflqa,\\n    title = \"{Disfl-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering}\",\\n    author = \"Gupta, Aditya and Xu, Jiacheng and Upadhyay, Shyam and Yang, Diyi and Faruqui, Manaal\",\\n    booktitle = \"Findings of ACL\",\\n    year = \"2021\"\\n}',\n",
+       " 'description': 'Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\\na source of distractors.\\n\\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\\ntesting robustness of models against disfluent inputs.\\n\\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\\nDisfl-QA. Detailed experiments and analyses can be found in our paper.',\n",
+       " 'key': ''}"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_datasets[154]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "8c73c912-c903-48f9-9ccf-fdb70d0bd556",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_tags(dataset):\n",
+    "    tags = {}\n",
+    "    for tag in dataset[\"tags\"]:\n",
+    "        k,v = tuple(tag.split(\":\", 1))\n",
+    "        tags[k] = v\n",
+    "    return tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "d4aa1b62-1501-4f3d-8613-e2dfb5fef79d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tags = extract_tags(all_datasets[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "b8c25fe5-d0d5-4ca9-afc6-8d5cf68f20fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tags.get(\"benchmark\") == \"raft\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "441f0b74-68a4-4b82-862d-2fcc69331cc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for idx, dset in enumerate(all_datasets):\n",
+    "    try:\n",
+    "        extract_tags(dset)\n",
+    "    except:\n",
+    "        print(dset[\"id\"], idx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b43f6131-6509-455f-ac02-1efabd9cdd1c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'annotations_creators': 'expert-generated',\n",
+       " 'language_creators': 'found',\n",
+       " 'languages': 'en',\n",
+       " 'licenses': 'mit',\n",
+       " 'multilinguality': 'monolingual',\n",
+       " 'size_categories': '10K<n<100K',\n",
+       " 'source_datasets': 'original',\n",
+       " 'task_categories': 'structure-prediction',\n",
+       " 'task_ids': 'structure-prediction-other-acronym-identification'}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "{i[0]:i[1] for t.split(\":\") in all_datasets[0][\"tags\"]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "63420516-9870-4ecf-80d8-d922994e4b17",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "('a',)\n",
+      "('b',)\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in zip(\"a:b\".split(\":\")):\n",
+    "    print(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "dc43998b-c93b-48c0-bee4-e80845950246",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "not enough values to unpack (expected 2, got 1)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[0;32m/var/folders/28/k4cy5q7s2hs92xq7_h89_vgm0000gn/T/ipykernel_19497/2621214275.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"a\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
+     ]
+    }
+   ],
+   "source": [
+    "a, b = zip(*[\"a\", \"b\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4990ce09-a53f-47dd-b662-3f498352b641",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "annotations_creators expert-generated\n",
+      "language_creators found\n",
+      "languages en\n",
+      "licenses mit\n",
+      "multilinguality monolingual\n",
+      "size_categories 10K<n<100K\n",
+      "source_datasets original\n",
+      "task_categories structure-prediction\n",
+      "task_ids structure-prediction-other-acronym-identification\n"
+     ]
+    }
+   ],
+   "source": [
+    "for tag in all_datasets[0][\"tags\"]:\n",
+    "    k,v = tuple(tag.split(\":\"))\n",
+    "    print(k,v)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "id": "cf6b59da-02ff-4522-892d-8fe0aa254d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 <s>\n",
+      "1922 ¡\n",
+      "11884 hola\n",
+      "16 ,\n",
+      "378  me\n",
+      "13496  llamo\n",
+      "466  le\n",
+      "91 w\n",
+      "350 is\n",
+      "5 !\n",
+      "2 </s>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "model_ckpt = \"bertin-project/bertin-roberta-base-spanish\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)\n",
+    "input_ids = tokenizer(\"¡hola, me llamo lewis!\").input_ids\n",
+    "for token in input_ids:\n",
+    "    print(token, tokenizer.decode(token))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "430400f2-2c04-48d7-bf8e-63528441d410",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1922 ¡\n",
+    "# 11884 hola\n",
+    "# 16 ,\n",
+    "# 378  me\n",
+    "# 13496  llamo\n",
+    "# 466  le\n",
+    "# 91 w\n",
+    "# 350 is\n",
+    "# 5 !"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 130,
+   "id": "2ecdd872-af9b-4258-8a5e-d867f3785520",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 130,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer.vocab[\"<s>\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "16941c33-5e22-485f-9d24-ac8f8542c368",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'<s>'"
+      ]
+     },
+     "execution_count": 131,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer.bos_token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71929465-5ad5-444d-8c77-22f586b1ba23",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

app.py CHANGED Viewed

@@ -42,9 +42,8 @@ def download_submissions():
 def format_submissions(submissions):
-    submission_data = {**{"Team": []}, **{"Model": []}, **{"Submission Date": []}, **{t: [] for t in TASKS}}
-    # TODO(lewtun): delete / filter all the junk repos from development
     # The following picks the latest submissions which adhere to the model card schema
     for submission in submissions:
         submission_id = submission["id"]
@@ -55,10 +54,10 @@ def format_submissions(submissions):
         data = response.json()
         card_data = data["card_data"]
         username = card_data["submission_dataset"].split("/")[0]
-        submission_data["Team"].append(username)
         submission_id = card_data["submission_id"]
         submission_name, sha, timestamp = submission_id.split("__")
-        submission_data["Model"].append(submission_name)
         timestamp = pd.to_datetime(int(timestamp))
         submission_data["Submission Date"].append(datetime.date(timestamp).strftime("%b %d, %Y"))

 def format_submissions(submissions):
+    submission_data = {**{"Submitter": []}, **{"Submission Name": []}, **{"Submission Date": []}, **{t: [] for t in TASKS}}
     # The following picks the latest submissions which adhere to the model card schema
     for submission in submissions:
         submission_id = submission["id"]
         data = response.json()
         card_data = data["card_data"]
         username = card_data["submission_dataset"].split("/")[0]
+        submission_data["Submitter"].append(username)
         submission_id = card_data["submission_id"]
         submission_name, sha, timestamp = submission_id.split("__")
+        submission_data["Submission Name"].append(submission_name)
         timestamp = pd.to_datetime(int(timestamp))
         submission_data["Submission Date"].append(datetime.date(timestamp).strftime("%b %d, %Y"))