import json
import os
import shutil
import uuid
from datetime import datetime
from pathlib import Path

import jsonlines
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import Repository, cached_download, hf_hub_url

from utils import http_get, http_post, validate_json

if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
LOCAL_REPO = "submission_repo"
LOGS_REPO = "submission-logs"

# TODO
# 1. Add check that fields are nested under `tasks` field correctly
# 2. Add check that names of tasks and datasets are valid
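

# A minimal sketch of what check (1) could look like. This helper is
# hypothetical and is not wired into the app; the actual validation lives in
# `utils.validate_json`.
def _check_tasks_structure(json_data):
    """Return (is_valid, message) after verifying the `tasks` nesting."""
    tasks = json_data.get("tasks")
    if not isinstance(tasks, dict):
        return False, "The `tasks` field must be a mapping of dataset identifiers"
    for identifier, fields in tasks.items():
        if not isinstance(fields, dict) or {"values", "keys"} - set(fields):
            return False, f"Task `{identifier}` must nest `values` and `keys` lists"
        if len(fields["values"]) != len(fields["keys"]):
            return False, f"Task `{identifier}` has mismatched `values`/`keys` lengths"
    return True, "All tasks are nested correctly"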

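# Dataset card template for submission repos; the YAML front matter marks the
# repo as a GEM prediction submission.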
MARKDOWN = """---
benchmark: gem
type: prediction
submission_name: {submission_name}
tags:
- evaluation
- benchmark
---
# GEM Submission

Submission name: {submission_name}

"""


def generate_dataset_card(submission_name):
    """
    Generate dataset card for the submission
    """
    markdown = MARKDOWN.format(
        submission_name=submission_name,
    )
    with open(os.path.join(LOCAL_REPO, "README.md"), "w") as f:
        f.write(markdown)


def load_json(path):
    with open(path, "r") as f:
        return json.load(f)


def get_submission_names():
    """Download all submission names.

    The GEM frontend requires the submission names to be unique, so here we
    download all submission names and use them as a check against the user
    submissions.
    """
    scores_url = hf_hub_url("GEM-submissions/submission-scores", "scores.json", repo_type="dataset")
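    # force_download=True bypasses the local cache so we always check against
    # the latest list of submission names.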
    scores_filepath = cached_download(scores_url, force_download=True)
    scores_data = load_json(scores_filepath)
    return [score["submission_name"] for score in scores_data]


#######
# APP #
#######
st.title("GEM Submissions")
st.markdown(
    """
    Welcome to the [GEM benchmark](https://gem-benchmark.com/)! GEM is a benchmark
    environment for Natural Language Generation with a focus on its Evaluation, both
    through human annotations and automated Metrics.

    GEM aims to:

    - measure NLG progress across many NLG tasks and languages.
    - audit data and models and present results via data cards and model robustness
    reports.
    - develop standards for evaluation of generated text using both automated and
    human metrics.

    Use this page to submit your system's predictions to the benchmark.
    """
)

with st.form(key="form"):
    # Flush local repo
    shutil.rmtree(LOCAL_REPO, ignore_errors=True)
    submission_errors = 0
    json_data = None
    uploaded_file = st.file_uploader("Upload submission file", type=["json"])

    if uploaded_file:
        data = uploaded_file.read().decode("utf-8")
        try:
            json_data = json.loads(data)
        except json.JSONDecodeError:
            st.error("πŸ™ˆ The uploaded file is not valid JSON. Please check the file and upload it again.")
            submission_errors += 1
        else:
            submission_names = get_submission_names()
            submission_name = json_data.get("submission_name")
            if submission_name in submission_names:
                st.error(f"πŸ™ˆ Submission name `{submission_name}` is already taken. Please rename your submission.")
                submission_errors += 1
            else:
                is_valid, message = validate_json(json_data)
                if is_valid:
                    st.success(message)
                else:
                    st.error(message)
                    submission_errors += 1

    with st.expander("Submission format"):
        st.markdown(
            """
        Please follow this JSON format for your `submission.json` file:

        ```json
        {
            "submission_name": "An identifying name of your system",
            "param_count": 123, # The number of parameters your system has.
            "description": "An optional brief description of the system that will be shown on the results page",
            "tasks": {
                "dataset_identifier": {
                    "values": ["output-0", "output-1", "..."], # A list of system outputs.
                    "keys": ["gem_id-0", "gem_id-1", "..."] # A list of GEM IDs.
                }
            }
        }
        ```
        Here, `dataset_identifier` is the identifier of the dataset followed by
        an identifier of the split the outputs were created from, such as
        `_validation` or `_test`. For example, the `mlsum_de` test set has the
        identifier `mlsum_de_test`. The `keys` field is needed to avoid
        accidental shuffling that would impact your metrics: simply list the
        IDs from the `gem_id` column of each evaluation dataset in the same
        order as your values. Please see the sample submission below, followed
        by a sketch of how such a file can be assembled:
        """
        )
        with open("sample-submission.json", "r") as f:
            example_submission = json.load(f)
            st.json(example_submission)
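        st.markdown(
            """
        For illustration, here is a minimal sketch of how a submission file could
        be assembled in Python (the dataset identifier, outputs, and GEM IDs below
        are hypothetical placeholders):

        ```python
        import json

        submission = {
            "submission_name": "my-fancy-system",
            "param_count": 123,
            "description": "A short description of my system",
            "tasks": {
                "mlsum_de_test": {
                    "values": ["Erste Zusammenfassung", "Zweite Zusammenfassung"],
                    "keys": ["mlsum_de-test-0", "mlsum_de-test-1"],
                },
            },
        }

        with open("submission.json", "w") as f:
            json.dump(submission, f)
        ```
        """
        )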

    user_name = st.text_input("Enter your πŸ€— Hub username", help="This field is required to track your submission and cannot be empty")
    submit_button = st.form_submit_button("Make Submission")

if submit_button and submission_errors == 0 and json_data is not None:
    with st.spinner("⏳ Preparing submission for evaluation ..."):
        submission_name = json_data["submission_name"]
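        # Slugify the name and append a Unix timestamp so repeat submissions
        # map to unique repo ids.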
        submission_name_formatted = submission_name.lower().replace(" ", "-").replace("/", "-")
        submission_time = str(int(datetime.now().timestamp()))

        # Create the submission dataset under the GEM-submissions org
        submission_repo_id = f"GEM-submissions/{user_name}__{submission_name_formatted}__{submission_time}"
        dataset_repo_url = f"https://huggingface.co/datasets/{submission_repo_id}"
        repo = Repository(
            local_dir=LOCAL_REPO,
            clone_from=dataset_repo_url,
            repo_type="dataset",
            private=False,
            use_auth_token=HF_TOKEN,
        )
        generate_dataset_card(submission_name)

        with open(f"{LOCAL_REPO}/submission.json", "w", encoding="utf-8") as f:
            json.dump(json_data, f)

        # TODO: add informative commit msg
        commit_url = repo.push_to_hub()
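        # push_to_hub returns None when there is nothing new to push, so fall
        # back to the URL of the current HEAD commit.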
        if commit_url is not None:
            commit_sha = commit_url.split("/")[-1]
        else:
            commit_sha = repo.git_head_commit_url().split("/")[-1]

        submission_id = submission_name + "__" + str(uuid.uuid4())[:6] + "__" + submission_time

        # Define AutoTrain payload
        project_config = {}
        # Need a dummy dataset to use the dataset loader in AutoTrain
        project_config["dataset_name"] = "lewtun/imdb-dummy"
        project_config["dataset_config"] = "lewtun--imdb-dummy"
        project_config["dataset_split"] = "train"
        project_config["col_mapping"] = {"text": "text", "label": "target"}
        # Specify benchmark parameters
        project_config["model"] = "gem"
        project_config["dataset"] = "GEM/references"
        project_config["submission_dataset"] = submission_repo_id
        project_id = str(uuid.uuid4()).split("-")[0]
        project_payload = {
            "username": AUTOTRAIN_USERNAME,
            "proj_name": f"benchmark-gem-{project_id}",
            "task": 1,
            "config": {
                "language": "en",
                "max_models": 5,
                "instance": {
                    "provider": "aws",
                    "instance_type": "ml.g4dn.4xlarge",
                    "max_runtime_seconds": 172800,
                    "num_instances": 1,
                    "disk_size_gb": 150,
                },
                "benchmark": {
                    "dataset": project_config["dataset"],
                    "model": project_config["model"],
                    "submission_dataset": project_config["submission_dataset"],
                },
            },
        }
        project_json_resp = http_post(
            path="/projects/create", payload=project_payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
        ).json()
        print(f"Project creation: {project_json_resp}")

        # Upload the dummy dataset to the AutoTrain project (shuffling disabled
        # so row order is preserved)
        payload = {
            "split": 4,
            "col_mapping": project_config["col_mapping"],
            "load_config": {"max_size_bytes": 0, "shuffle": False},
        }
        data_json_resp = http_post(
            path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}",
            payload=payload,
            token=HF_TOKEN,
            domain=AUTOTRAIN_BACKEND_API,
            params={
                "type": "dataset",
                "config_name": project_config["dataset_config"],
                "split_name": project_config["dataset_split"],
            },
        ).json()
        print(f"Dataset creation: {data_json_resp}")

        # Start processing; AutoTrain runs the benchmark evaluation as a training job
        train_json_resp = http_get(
            path=f"/projects/{project_json_resp['id']}/data/start_process",
            token=HF_TOKEN,
            domain=AUTOTRAIN_BACKEND_API,
        ).json()
        print(f"Training job response: {train_json_resp}")

        logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
        logs_repo = Repository(
            local_dir=LOGS_REPO,
            clone_from=logs_repo_url,
            repo_type="dataset",
            private=True,
            use_auth_token=HF_TOKEN,
        )
        # Append this submission's metadata to the shared logs file
        evaluation_log = {
            "payload": project_payload,
            "project_creation_response": project_json_resp,
            "dataset_creation_response": data_json_resp,
            "autotrain_job_response": train_json_resp,
        }
        with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as reader:
            lines = list(reader)
        lines.append(evaluation_log)
        with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
            for job in lines:
                writer.write(job)
        logs_repo.push_to_hub(commit_message=f"Submission with job ID {project_json_resp['id']}")

    if train_json_resp["success"] == 1:
        st.success(
            f"βœ… Submission {submission_name} was successfully submitted for evaluation!"
        )
        st.markdown(
            f"""
            Evaluation can take up to 1 hour to complete, so grab a β˜• or 🍡 while you wait:

            * πŸ“Š Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
            * πŸ’Ύ Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub

            Please [contact the organisers](mailto:gehrmann@google.com) if you would like your submission and/or evaluation scores deleted.
            """
        )
    else:
        st.error(
            "πŸ™ˆ Oh noes, there was an error submitting your submission! Please [contact the organisers](mailto:gehrmann@google.com)"
        )

    # Flush local repos
    shutil.rmtree(LOCAL_REPO, ignore_errors=True)
    shutil.rmtree(LOGS_REPO, ignore_errors=True)


with st.expander("Download all submissions and scores"):
    st.markdown("Click the button below if you'd like to download all the submissions and evaluations from GEM:")
    outputs_url = hf_hub_url(
        "GEM-submissions/v2-outputs-and-scores", "gem-v2-outputs-and-scores.zip", repo_type="dataset"
    )
    outputs_filepath = cached_download(outputs_url)

    with open(outputs_filepath, "rb") as f:
        btn = st.download_button(label="Download submissions and scores", data=f, file_name="outputs-and-scores.zip")