lewtun committed
Commit 1b95f45 • 1 Parent(s): 0ef1d60

Refactor to match new AutoTrain API

Files changed (5)
  1. .gitignore +4 -1
  2. Makefile +8 -0
  3. app.py +84 -30
  4. requirements.txt +1 -1
  5. utils.py +14 -6
.gitignore CHANGED
@@ -143,4 +143,7 @@ cython_debug/
 # Submissions
 submission_repo/
 GEM-outputs/
-sample-submissions/
+sample-submissions/
+
+# mac OS
+.DS_Store
Makefile ADDED
@@ -0,0 +1,8 @@
+style:
+    python -m black --line-length 119 --target-version py39 .
+    python -m isort .
+
+quality:
+    python -m black --check --line-length 119 --target-version py39 .
+    python -m isort --check-only .
+    python -m flake8 --max-line-length 119
app.py CHANGED
@@ -1,26 +1,27 @@
 import json
 import os
 import shutil
+import uuid
 from datetime import datetime
 from pathlib import Path
 
 import jsonlines
 import streamlit as st
 from dotenv import load_dotenv
-from huggingface_hub import HfApi, Repository, cached_download, hf_hub_url
+from huggingface_hub import Repository, cached_download, hf_hub_url
 
-from utils import http_post, validate_json
+from utils import http_get, http_post, validate_json
 
 if Path(".env").is_file():
     load_dotenv(".env")
 
 HF_TOKEN = os.getenv("HF_TOKEN")
-AUTONLP_USERNAME = os.getenv("AUTONLP_USERNAME")
-HF_AUTONLP_BACKEND_API = os.getenv("HF_AUTONLP_BACKEND_API")
+AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
+AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
 LOCAL_REPO = "submission_repo"
 LOGS_REPO = "submission-logs"
 
-## TODO ##
+# TODO
 # 1. Add check that fields are nested under `tasks` field correctly
 # 2. Add check that names of tasks and datasets are valid
 
@@ -68,9 +69,9 @@ def get_submission_names():
     return [score["submission_name"] for score in scores_data]
 
 
-###########
-### APP ###
-###########
+#######
+# APP #
+#######
 st.title("GEM Submissions")
 st.markdown(
     """
@@ -144,8 +145,7 @@ with st.form(key="form"):
            example_submission = json.load(f)
        st.json(example_submission)
 
-    user_name = st.text_input("Enter your 🤗 Hub username.")
-
+    user_name = st.text_input("Enter your 🤗 Hub username", help="This field is required to track your submission and cannot be empty")
     submit_button = st.form_submit_button("Make Submission")
 
 if submit_button and submission_errors == 0:
@@ -155,8 +155,8 @@ if submit_button and submission_errors == 0:
     submission_time = str(int(datetime.now().timestamp()))
 
     # Create submission dataset under benchmarks ORG
-    submission_repo_id = f"{user_name}__{submission_name_formatted}__{submission_time}"
-    dataset_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{submission_repo_id}"
+    submission_repo_id = f"GEM-submissions/{user_name}__{submission_name_formatted}__{submission_time}"
+    dataset_repo_url = f"https://huggingface.co/datasets/{submission_repo_id}"
     repo = Repository(
         local_dir=LOCAL_REPO,
         clone_from=dataset_repo_url,
@@ -176,22 +176,72 @@ if submit_button and submission_errors == 0:
     else:
         commit_sha = repo.git_head_commit_url().split("/")[-1]
 
-    submission_id = submission_name + "__" + commit_sha + "__" + submission_time
+    submission_id = submission_name + "__" + str(uuid.uuid4())[:6] + "__" + submission_time
+
+    # Define AutoTrain payload
+    project_config = {}
+    # Need a dummy dataset to use the dataset loader in AutoTrain
+    project_config["dataset_name"] = "lewtun/imdb-dummy"
+    project_config["dataset_config"] = "lewtun--imdb-dummy"
+    project_config["dataset_split"] = "train"
+    project_config["col_mapping"] = {"text": "text", "label": "target"}
+    # Specify benchmark parameters
+    project_config["model"] = "gem"
+    project_config["dataset"] = "GEM/references"
+    project_config["submission_dataset"] = submission_repo_id
+    project_id = str(uuid.uuid4()).split("-")[0]
+    project_payload = {
+        "username": AUTOTRAIN_USERNAME,
+        "proj_name": f"benchmark-gem-{project_id}",
+        "task": 1,
+        "config": {
+            "language": "en",
+            "max_models": 5,
+            "instance": {
+                "provider": "aws",
+                "instance_type": "ml.g4dn.4xlarge",
+                "max_runtime_seconds": 172800,
+                "num_instances": 1,
+                "disk_size_gb": 150,
+            },
+            "benchmark": {
+                "dataset": project_config["dataset"],
+                "model": project_config["model"],
+                "submission_dataset": project_config["submission_dataset"],
+            },
+        },
+    }
+    project_json_resp = http_post(
+        path="/projects/create", payload=project_payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
+    ).json()
+    print(f"Project creation: {project_json_resp}")
 
+    # Upload data
     payload = {
-        "username": AUTONLP_USERNAME,
-        "dataset": "GEM/references",
-        "task": 1,
-        "model": "gem",
-        "submission_dataset": f"GEM-submissions/{submission_repo_id}",
-        "submission_id": submission_id,
-        "col_mapping": {},
-        "split": "test",
-        "config": None,
+        "split": 4,
+        "col_mapping": project_config["col_mapping"],
+        "load_config": {"max_size_bytes": 0, "shuffle": False},
     }
-    json_resp = http_post(
-        path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=HF_AUTONLP_BACKEND_API
+    data_json_resp = http_post(
+        path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}",
+        payload=payload,
+        token=HF_TOKEN,
+        domain=AUTOTRAIN_BACKEND_API,
+        params={
+            "type": "dataset",
+            "config_name": project_config["dataset_config"],
+            "split_name": project_config["dataset_split"],
+        },
+    ).json()
+    print(f"Dataset creation: {data_json_resp}")
+
+    # Run training
+    train_json_resp = http_get(
+        path=f"/projects/{project_json_resp['id']}/data/start_process",
+        token=HF_TOKEN,
+        domain=AUTOTRAIN_BACKEND_API,
    ).json()
+    print(f"Training job response: {train_json_resp}")
 
     logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
     logs_repo = Repository(
@@ -201,25 +251,29 @@ if submit_button and submission_errors == 0:
         private=True,
         use_auth_token=HF_TOKEN,
     )
-    json_resp["submission_name"] = submission_name
+    evaluation_log = {}
+    evaluation_log["payload"] = project_payload
+    evaluation_log["project_creation_response"] = project_json_resp
+    evaluation_log["dataset_creation_response"] = data_json_resp
+    evaluation_log["autotrain_job_response"] = train_json_resp
     with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
         lines = []
         for obj in r:
             lines.append(obj)
 
-    lines.append(json_resp)
+    lines.append(evaluation_log)
     with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
         for job in lines:
             writer.write(job)
-    logs_repo.push_to_hub(commit_message=f"Submission with job ID {json_resp['id']}")
+    logs_repo.push_to_hub(commit_message=f"Submission with job ID {project_json_resp['id']}")
 
-    if json_resp["status"] == 1:
+    if train_json_resp["success"] == 1:
         st.success(
-            f"✅ Submission {submission_name} was successfully submitted for evaluation with job ID {json_resp['id']}"
+            f"✅ Submission {submission_name} was successfully submitted for evaluation!"
         )
         st.markdown(
             f"""
-        Evaluation takes appoximately 1-2 hours to complete, so grab a ☕ or 🍵 while you wait:
+        Evaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:
 
        * 📊 Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
        * 💾 Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub
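
For reference, the new submission path boils down to three calls against the AutoTrain backend: create a benchmark project, attach the dummy dataset, then start the job. Below is a condensed sketch of that flow, not part of the commit itself, assuming the http_get/http_post helpers from utils.py and the HF_TOKEN / AUTOTRAIN_USERNAME / AUTOTRAIN_BACKEND_API environment variables used above; the submission_repo_id argument stands in for whatever dataset id app.py just pushed.

# Sketch of the refactored submission flow (assumes utils.py from this repo and the
# same environment variables as app.py; submission_repo_id is a placeholder argument).
import os
import uuid

from utils import http_get, http_post

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")


def submit_for_evaluation(submission_repo_id: str) -> dict:
    """Create an AutoTrain benchmark project, attach the dummy dataset, and start the job."""
    project_id = str(uuid.uuid4()).split("-")[0]
    # 1. Create the benchmark project
    project_payload = {
        "username": AUTOTRAIN_USERNAME,
        "proj_name": f"benchmark-gem-{project_id}",
        "task": 1,
        "config": {
            "language": "en",
            "max_models": 5,
            "instance": {
                "provider": "aws",
                "instance_type": "ml.g4dn.4xlarge",
                "max_runtime_seconds": 172800,
                "num_instances": 1,
                "disk_size_gb": 150,
            },
            "benchmark": {
                "dataset": "GEM/references",
                "model": "gem",
                "submission_dataset": submission_repo_id,
            },
        },
    }
    project_resp = http_post(
        path="/projects/create", payload=project_payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
    ).json()

    # 2. Attach the dummy dataset so AutoTrain's data loader has something to process
    data_payload = {
        "split": 4,
        "col_mapping": {"text": "text", "label": "target"},
        "load_config": {"max_size_bytes": 0, "shuffle": False},
    }
    data_resp = http_post(
        path=f"/projects/{project_resp['id']}/data/lewtun/imdb-dummy",
        payload=data_payload,
        token=HF_TOKEN,
        domain=AUTOTRAIN_BACKEND_API,
        params={"type": "dataset", "config_name": "lewtun--imdb-dummy", "split_name": "train"},
    ).json()

    # 3. Kick off the evaluation job
    train_resp = http_get(
        path=f"/projects/{project_resp['id']}/data/start_process",
        token=HF_TOKEN,
        domain=AUTOTRAIN_BACKEND_API,
    ).json()
    return {"project": project_resp, "data": data_resp, "train": train_resp}

As in app.py, the evaluation itself is driven by the "benchmark" block of the project config; the dummy IMDB dataset only exists to satisfy AutoTrain's dataset loader.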
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 python-dotenv
-huggingface-hub==0.2.1
+huggingface-hub==0.8.1
 jsonlines
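
The pin jumps from huggingface-hub 0.2.1 to 0.8.1, the release the refactored app.py runs against. A minimal sketch of the Repository pattern it relies on follows; the dataset URL, token, and file name are placeholders, not values from the commit.

# Minimal sketch of the Repository usage in app.py under huggingface-hub 0.8.1.
# The clone URL, token, and submission file are placeholders.
import shutil

from huggingface_hub import Repository

repo = Repository(
    local_dir="submission_repo",  # same name as LOCAL_REPO in app.py
    clone_from="https://huggingface.co/datasets/GEM-submissions/example-submission",  # placeholder
    private=True,
    use_auth_token="hf_xxx",  # placeholder token
)
shutil.copy("submission.json", "submission_repo/submission.json")  # placeholder file
repo.push_to_hub(commit_message="Add example submission")
print(repo.git_head_commit_url())  # commit URL, parsed in app.py for the SHA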
utils.py CHANGED
@@ -2,7 +2,6 @@ import json
 
 import jsonschema
 import requests
-import streamlit as st
 
 
 def load_schema():
@@ -28,17 +27,26 @@ def get_auth_headers(token: str, prefix: str = "autonlp"):
     return {"Authorization": f"{prefix} {token}"}
 
 
-def http_post(
+def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
+    """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
+    try:
+        response = requests.post(
+            url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True, params=params
+        )
+    except requests.exceptions.ConnectionError:
+        print("❌ Failed to reach AutoNLP API, check your internet connection")
+    response.raise_for_status()
+    return response
+
+
+def http_get(
     path: str,
     token: str,
-    payload=None,
     domain: str = None,
 ) -> requests.Response:
     """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
     try:
-        response = requests.post(
-            url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True
-        )
+        response = requests.get(url=domain + path, headers=get_auth_headers(token=token), allow_redirects=True)
     except requests.exceptions.ConnectionError:
         print("❌ Failed to reach AutoNLP API, check your internet connection")
     response.raise_for_status()
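
A short usage sketch of the two helpers after the refactor; the backend domain and paths below are placeholders rather than documented AutoTrain endpoints, and the payload is truncated for illustration.

# Usage sketch for the refactored helpers. Domain and paths are placeholders;
# in app.py they come from AUTOTRAIN_BACKEND_API and the AutoTrain project routes.
import os

from utils import http_get, http_post

token = os.getenv("HF_TOKEN", "hf_xxx")  # placeholder token
domain = "https://api.example.com"       # placeholder backend domain

# POST with a JSON body and optional query params (the new `params` argument)
resp = http_post(path="/projects/create", token=token, payload={"task": 1}, domain=domain, params={"type": "dataset"})
print(resp.status_code)

# GET with the same "autonlp <token>" Authorization header built by get_auth_headers
resp = http_get(path="/projects/123", token=token, domain=domain)  # placeholder path
print(resp.json())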