Spaces:
Runtime error
Runtime error
import json | |
import os | |
import shutil | |
from datetime import datetime | |
from pathlib import Path | |
import jsonlines | |
import streamlit as st | |
from dotenv import load_dotenv | |
from huggingface_hub import HfApi, Repository, hf_hub_url, cached_download | |
from utils import http_post, validate_json | |
# Pick up secrets from a local `.env` file when one sits next to the app
dotenv_path = Path(".env")
if dotenv_path.is_file():
    load_dotenv(str(dotenv_path))

# Credentials and backend endpoint, read from the environment (None when unset)
HF_TOKEN = os.environ.get("HF_TOKEN")
AUTONLP_USERNAME = os.environ.get("AUTONLP_USERNAME")
HF_AUTONLP_BACKEND_API = os.environ.get("HF_AUTONLP_BACKEND_API")

# Local working directories used for the submission and log repositories
LOCAL_REPO = "submission_repo"
LOGS_REPO = "submission-logs"
## TODO ## | |
# 1. Add check that fields are nested under `tasks` field correctly | |
# 2. Add check that names of tasks and datasets are valid | |
# Template for the dataset card (README.md) written for every submission.
# The leading block delimited by `---` lines is YAML front matter, so those
# delimiter lines must contain nothing but the three dashes.
# `{submission_name}` is filled in with `str.format`.
MARKDOWN = """---
benchmark: gem
type: prediction
submission_name: {submission_name}
tags:
- evaluation
- benchmark
---
# GEM Submission
Submission name: {submission_name}
"""
def generate_dataset_card(submission_name):
    """
    Generate dataset card for the submission

    Fills the `MARKDOWN` template with `submission_name` and writes the
    result to `README.md` inside the `LOCAL_REPO` directory, which must
    already exist.

    Args:
        submission_name: Name of the submission to insert into the card.
    """
    markdown = MARKDOWN.format(
        submission_name=submission_name,
    )
    # Submission names are free-form user text, so write the card as UTF-8
    # instead of relying on the platform's default encoding.
    with open(os.path.join(LOCAL_REPO, "README.md"), "w", encoding="utf-8") as f:
        f.write(markdown)
def load_json(path):
    """
    Load and return the parsed content of a JSON file

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# Submission names have to be unique for the GEM frontend. Fetch the scores
# file from the Hub and collect every name already in use, so that uploads
# can be checked against them.
# NOTE(review): `cached_download` is deprecated in recent `huggingface_hub`
# releases in favour of `hf_hub_download` - confirm against the pinned version.
scores_url = hf_hub_url(
    repo_id="GEM-submissions/submission-scores",
    filename="scores.json",
    repo_type="dataset",
)
scores_filepath = cached_download(scores_url)
scores_data = load_json(scores_filepath)
submission_names = [entry["submission_name"] for entry in scores_data]
###########
### APP ###
###########
# Page header: the title followed by a short Markdown introduction to the
# benchmark. Blank lines separate the paragraphs from the bullet list so the
# closing sentence is not rendered as part of the last bullet.
st.title("GEM Submissions")
st.markdown(
    """
Welcome to the [GEM benchmark](https://gem-benchmark.com/)! GEM is a benchmark
environment for Natural Language Generation with a focus on its Evaluation, both
through human annotations and automated Metrics.

GEM aims to:

- measure NLG progress across many NLG tasks across languages.
- audit data and models and present results via data cards and model robustness
reports.
- develop standards for evaluation of generated text using both automated and
human metrics.

Use this page to submit your system's predictions to the benchmark.
"""
)
# Submission form: upload widget with validation, format help, username field
# and the submit button. It leaves `submission_errors`, `uploaded_file`,
# `json_data`, `user_name` and `submit_button` behind for the code that
# processes the submission.
with st.form(key="form"):
    # Flush local repo
    shutil.rmtree(LOCAL_REPO, ignore_errors=True)
    submission_errors = 0
    json_data = None
    uploaded_file = st.file_uploader("Upload submission file", type=["json"])
    if uploaded_file:
        try:
            # getvalue() returns the whole buffer regardless of the current
            # read position, unlike read().
            json_data = json.loads(uploaded_file.getvalue().decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as err:
            # Report unreadable uploads to the user instead of crashing the app
            st.error(f"The uploaded file is not valid UTF-8 encoded JSON: {err}")
            submission_errors += 1
        else:
            if not isinstance(json_data, dict) or "submission_name" not in json_data:
                # The name is looked up before `validate_json` runs, so its
                # absence has to be handled here.
                st.error("The submission file must be a JSON object with a `submission_name` field.")
                submission_errors += 1
            else:
                submission_name = json_data["submission_name"]
                if submission_name in submission_names:
                    st.error(
                        f"🙈 Submission name `{submission_name}` is already taken. Please rename your submission."
                    )
                    submission_errors += 1
                else:
                    is_valid, message = validate_json(json_data)
                    if is_valid:
                        st.success(message)
                    else:
                        st.error(message)
                        submission_errors += 1

    with st.expander("Submission format"):
        st.markdown(
            """
Please follow this JSON format for your `submission.json` file:
```json
{
  "submission_name": "An identifying name of your system",
  "param_count": 123, # The number of parameters your system has.
  "description": "An optional brief description of the system that will be shown on the results page",
  "tasks":
    {
      "dataset_identifier": {
        "values": ["output-0", "output-1", "..."], # A list of system outputs.
        "keys": ["gem_id-0", "gem_id-1", ...] # A list of GEM IDs.
      }
    }
}
```
Here, `dataset_identifier` is the identifier of the dataset followed by
an identifier of the set the outputs were created from, for example
`_validation` or `_test`. For example, the `mlsum_de` test set has the
identifier `mlsum_de_test`. The `keys` field is needed to avoid
accidental shuffling that will impact your metrics. Simply add a list of
IDs from the `gem_id` column of each evaluation dataset in the same
order as your values. Please see the sample submission below:
"""
        )
        with open("sample-submission.json", "r", encoding="utf-8") as f:
            example_submission = json.load(f)
        st.json(example_submission)

    user_name = st.text_input("Enter your 🤗 Hub username.")
    submit_button = st.form_submit_button("Make Submission")
# Process a submission once the form was submitted without validation errors:
# push the file to a new dataset repo, request an evaluation job from the
# backend, record the backend response in the logs repo and report the
# outcome to the user.
if submit_button and submission_errors == 0:
    if uploaded_file is None:
        # Without an upload there is no parsed submission to work with
        st.error("Please upload a submission file before making a submission.")
    elif not user_name.strip():
        # The username is part of the repo name, so it must not be empty
        st.error("Please enter your Hub username before making a submission.")
    else:
        try:
            with st.spinner("⏳ Preparing submission for evaluation ..."):
                hub_user_name = user_name.strip()
                submission_name = json_data["submission_name"]
                submission_name_formatted = submission_name.lower().replace(" ", "-").replace("/", "-")
                submission_time = str(int(datetime.now().timestamp()))
                # Create submission dataset under benchmarks ORG
                submission_repo_id = f"{hub_user_name}__{submission_name_formatted}__{submission_time}"
                dataset_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{submission_repo_id}"
                repo = Repository(
                    local_dir=LOCAL_REPO,
                    clone_from=dataset_repo_url,
                    repo_type="dataset",
                    private=False,
                    use_auth_token=HF_TOKEN,
                )
                generate_dataset_card(submission_name)
                with open(f"{LOCAL_REPO}/submission.json", "w", encoding="utf-8") as f:
                    json.dump(json_data, f)
                # TODO: add informative commit msg
                commit_url = repo.push_to_hub()
                # Fall back to the head commit when the push returns no URL
                if commit_url is not None:
                    commit_sha = commit_url.split("/")[-1]
                else:
                    commit_sha = repo.git_head_commit_url().split("/")[-1]
                submission_id = submission_name + "__" + commit_sha + "__" + submission_time
                payload = {
                    "username": AUTONLP_USERNAME,
                    "dataset": "GEM/references",
                    "task": 1,
                    "model": "gem",
                    "submission_dataset": f"GEM-submissions/{submission_repo_id}",
                    "submission_id": submission_id,
                    "col_mapping": {},
                    "split": "test",
                    "config": None,
                }
                json_resp = http_post(
                    path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=HF_AUTONLP_BACKEND_API
                ).json()
                # An error response may lack `id`; use None rather than raising
                job_id = json_resp.get("id")

                logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
                logs_repo = Repository(
                    local_dir=LOGS_REPO,
                    clone_from=logs_repo_url,
                    repo_type="dataset",
                    private=True,
                    use_auth_token=HF_TOKEN,
                )
                json_resp["submission_name"] = submission_name
                # Append the new record rather than reading and rewriting the
                # whole log file.
                with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="a") as writer:
                    writer.write(json_resp)
                logs_repo.push_to_hub(commit_message=f"Submission with job ID {job_id}")

            if json_resp.get("status") == 1:
                st.success(
                    f"✅ Submission {submission_name} was successfully submitted for evaluation with job ID {job_id}"
                )
                st.markdown(
                    f"""
Evaluation takes approximately 1-2 hours to complete, so grab a ☕ or 🍵 while you wait:

* 📊 Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
* 💾 Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub

Please [contact the organisers](mailto:gehrmann@google.com) if you would like your submission and/or evaluation scores deleted.
"""
                )
            else:
                st.error(
                    "🙈 Oh noes, there was an error submitting your submission! Please [contact the organisers](mailto:gehrmann@google.com)"
                )
        finally:
            # Flush local repos, also when one of the steps above failed
            shutil.rmtree(LOCAL_REPO, ignore_errors=True)
            shutil.rmtree(LOGS_REPO, ignore_errors=True)
# Collapsible section offering the archive of all GEM submissions and scores
with st.expander("Download all submissions and scores"):
    st.markdown("Click the button below if you'd like to download all the submissions and evaluations from GEM:")
    outputs_url = hf_hub_url(
        repo_id="GEM-submissions/v2-outputs-and-scores",
        filename="gem-v2-outputs-and-scores.zip",
        repo_type="dataset",
    )
    outputs_filepath = cached_download(outputs_url)
    # The open binary file handle is passed to the download button as its data
    with open(outputs_filepath, mode="rb") as archive:
        btn = st.download_button(
            label="Download submissions and scores",
            data=archive,
            file_name="outputs-and-scores.zip",
        )