Spaces:

douwekiela
/

dadc

Runtime error

App Files Files Community

Make it Work

by Tristan - opened Jul 19, 2022

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+191

-50

Files changed (4) hide show

README.md +37 -0
app.py +113 -28
collect.py +35 -19
requirements.txt +6 -3

README.md CHANGED Viewed

@@ -11,3 +11,40 @@ license: bigscience-bloom-rail-1.0
 ---
 A basic example of dynamic adversarial data collection with a Gradio app.

 ---
 A basic example of dynamic adversarial data collection with a Gradio app.
+**Instructions for someone to use for their own project:**
+*Setting up the Space*
+1. Clone this repo and deploy it on your own Hugging Face space.
+2. Add one of your Hugging Face tokens to the secrets for your space, with the
+   name `HF_TOKEN`. Next, create an empty Hugging Face dataset on the hub. Put
+   the URL of this dataset in the secrets for your space, with the name
+   `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
+   space on mturk and when people visit your space on huggingface.co, the app
+   will use your token to automatically store new HITs in your dataset. NOTE:
+   if you push something to your dataset manually, you need to reboot your space;
+   otherwise the app may hit merge conflicts when it tries to push HIT data.
+*Running Data Collection*
+1. In your local clone of this repo, copy `config.py.example` to a new file
+   named `config.py`. Then put the keys from your AWS account in `config.py`.
+   These keys should be for an AWS account that has the
+   AmazonMechanicalTurkFullAccess permission. You also need to
+   create an mturk requester account associated with your AWS account.
+2. Run `python collect.py` locally.
+*Profit*
+You should now see HITs arriving in your Hugging Face dataset
+automatically!
+*Tips and Tricks*
+- If you are developing and running this space locally to test it out, try
+deleting the data directory that the app clones before running the app again.
+Otherwise, the app could get merge conflicts when storing new HITs on the hub.
+When you redeploy your app on Hugging Face spaces, the data directory is deleted
+automatically.
+- Hugging Face Spaces have limited computational resources and memory. If you
+run too many HITs and/or assignments at once, then you could encounter issues.
+You could also encounter issues if you are trying to create a dataset that is
+very large. Check the log of your space for any errors that could be happening.

app.py CHANGED Viewed

@@ -1,13 +1,30 @@
 # Basic example for doing model-in-the-loop dynamic adversarial data collection
 # using Gradio Blocks.
 import random
 from urllib.parse import parse_qs
 import gradio as gr
 import requests
 from transformers import pipeline
 pipe = pipeline("sentiment-analysis")
 demo = gr.Blocks()
@@ -16,9 +33,9 @@ with demo:
     total_cnt = 2 # How many examples per HIT
     dummy = gr.Textbox(visible=False)  # dummy for passing assignmentId
-    # We keep track of state as a Variable
-    state_dict = {"assignmentId": "", "cnt": 0, "fooled": 0, "data": [], "metadata": {}}
-    state = gr.Variable(state_dict)
     gr.Markdown("# DADC in Gradio example")
     gr.Markdown("Try to fool the model and find an example where it predicts the wrong label!")
@@ -27,26 +44,41 @@ with demo:
     # Generate model prediction
     # Default model: distilbert-base-uncased-finetuned-sst-2-english
-    def _predict(txt, tgt, state):
         pred = pipe(txt)[0]
         other_label = 'negative' if pred['label'].lower() == "positive" else "positive"
         pred_confidences = {pred['label'].lower(): pred['score'], other_label: 1 - pred['score']}
         pred["label"] = pred["label"].title()
         ret = f"Target: **{tgt}**. Model prediction: **{pred['label']}**\n\n"
-        if pred["label"] != tgt:
-            state["fooled"] += 1
             ret += " You fooled the model! Well done!"
         else:
             ret += " You did not fool the model! Too bad, try again!"
-        state["data"].append(ret)
         state["cnt"] += 1
         done = state["cnt"] == total_cnt
-        toggle_final_submit = gr.update(visible=done)
         toggle_example_submit = gr.update(visible=not done)
-        new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['fooled']} fooled)"
-        return pred_confidences, ret, state, toggle_example_submit, toggle_final_submit, new_state_md
     # Input fields
     text_input = gr.Textbox(placeholder="Enter model-fooling statement", show_label=False)
@@ -59,28 +91,81 @@ with demo:
         submit_ex_button = gr.Button("Submit")
     with gr.Column(visible=False) as final_submit:
         submit_hit_button = gr.Button("Submit HIT")
-    # Submit state to MTurk backend for ExternalQuestion
-    # Update the URL below to switch from Sandbox to real data collection
-    def _submit(state, dummy):
-        query = parse_qs(dummy[1:])
-        assert "assignmentId" in query, "No assignment ID provided, unable to submit"
-        state["assignmentId"] = query["assignmentId"]
-        url = "https://workersandbox.mturk.com/mturk/externalSubmit"
-        return requests.post(url, data=state)
     # Button event handlers
     submit_ex_button.click(
         _predict,
-        inputs=[text_input, label_input, state],
-        outputs=[label_output, text_output, state, example_submit, final_submit, state_display],
     )
     submit_hit_button.click(
-        _submit,
-        inputs=[state, dummy],
-        outputs=None,
-        _js="function(state, dummy) { return [state, window.location.search]; }",
     )
-demo.launch(favicon_path="https://huggingface.co/favicon.ico")

 # Basic example for doing model-in-the-loop dynamic adversarial data collection
 # using Gradio Blocks.
+import os
 import random
 from urllib.parse import parse_qs
 import gradio as gr
 import requests
 from transformers import pipeline
+from huggingface_hub import Repository
+from dotenv import load_dotenv
+from pathlib import Path
+import json
+from filelock import FileLock
+# These variables are for storing the mturk HITs in a Hugging Face dataset.
+# If a local .env file exists it is loaded first, so the os.getenv calls
+# below can pick up values defined there.
+if Path(".env").is_file():
+    load_dotenv(".env")
+DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
+HF_TOKEN = os.getenv("HF_TOKEN")
+DATA_FILENAME = "data.jsonl"
+DATA_FILE = os.path.join("data", DATA_FILENAME)
+# Clone the dataset repo into the local "data" directory; DATA_FILE lives
+# inside that clone.
+repo = Repository(
+    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
+)
+# Now let's run the app!
 pipe = pipeline("sentiment-analysis")
 demo = gr.Blocks()
     total_cnt = 2 # How many examples per HIT
     dummy = gr.Textbox(visible=False)  # dummy for passing assignmentId
+    # We keep track of state as a JSON
+    state_dict = {"assignmentId": "", "cnt": 0, "cnt_fooled": 0, "data": []}
+    state = gr.JSON(state_dict, visible=False)
     gr.Markdown("# DADC in Gradio example")
     gr.Markdown("Try to fool the model and find an example where it predicts the wrong label!")
     # Generate model prediction
     # Default model: distilbert-base-uncased-finetuned-sst-2-english
+    def _predict(txt, tgt, state, dummy):
+        """Score one example, update the HIT state and toggle the submit buttons.
+
+        txt is the text entered by the user, tgt is the target label they chose,
+        state is the per-session dict (assignmentId, cnt, cnt_fooled, data) and
+        dummy carries the page's query string (window.location.search).
+
+        Returns the values for the click handler's outputs, in order: the
+        prediction confidences, the result message, the updated state, the
+        visibility updates for the example / final / preview submit columns,
+        the state summary markdown, and dummy unchanged.
+        """
         pred = pipe(txt)[0]
+        # Build a two-class confidence dict: the predicted label gets the model
+        # score and the other label gets the remainder.
         other_label = 'negative' if pred['label'].lower() == "positive" else "positive"
         pred_confidences = {pred['label'].lower(): pred['score'], other_label: 1 - pred['score']}
+        # Title-case the label so it can be compared with tgt below.
+        # NOTE(review): assumes tgt is "Positive"/"Negative" - confirm against label_input.
         pred["label"] = pred["label"].title()
         ret = f"Target: **{tgt}**. Model prediction: **{pred['label']}**\n\n"
+        fooled = pred["label"] != tgt
+        if fooled:
+            state["cnt_fooled"] += 1
             ret += " You fooled the model! Well done!"
         else:
             ret += " You did not fool the model! Too bad, try again!"
         state["cnt"] += 1
+        # The HIT is complete once total_cnt examples have been submitted.
         done = state["cnt"] == total_cnt
         toggle_example_submit = gr.update(visible=not done)
+        new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['cnt_fooled']} fooled)"
+        state["data"].append({"cnt": state["cnt"], "text": txt, "target": tgt.lower(), "model_pred": pred["label"].lower(), "fooled": fooled})
+        # dummy holds window.location.search; drop its first character (the "?")
+        # before parsing the query parameters.
+        query = parse_qs(dummy[1:])
+        if "assignmentId" in query and query["assignmentId"][0] != "ASSIGNMENT_ID_NOT_AVAILABLE":
+            # It seems that someone is using this app on mturk. We need to
+            # store the assignmentId in the state before submit_hit_button
+            # is clicked. We can do this here in _predict. We need to save the
+            # assignmentId so that the turker can get credit for their HIT.
+            state["assignmentId"] = query["assignmentId"][0]
+            toggle_final_submit = gr.update(visible=done)
+            toggle_final_submit_preview = gr.update(visible=False)
+        else:
+            # No usable assignmentId: show only the preview-mode submit button.
+            toggle_final_submit_preview = gr.update(visible=done)
+            toggle_final_submit = gr.update(visible=False)
+        return pred_confidences, ret, state, toggle_example_submit, toggle_final_submit, toggle_final_submit_preview, new_state_md, dummy
     # Input fields
     text_input = gr.Textbox(placeholder="Enter model-fooling statement", show_label=False)
         submit_ex_button = gr.Button("Submit")
     with gr.Column(visible=False) as final_submit:
         submit_hit_button = gr.Button("Submit HIT")
+    with gr.Column(visible=False) as final_submit_preview:
+        submit_hit_button_preview = gr.Button("Submit Work (preview mode; no mturk HIT credit)")
+    # Store the HIT data into a Hugging Face dataset.
+    # The HIT is also stored and logged on mturk when post_hit_js is run below.
+    # This _store_in_huggingface_dataset function just demonstrates how easy it is
+    # to automatically create a Hugging Face dataset from mturk.
+    def _store_in_huggingface_dataset(state):
+        lock = FileLock(DATA_FILE + ".lock")
+        lock.acquire()
+        try:
+            with open(DATA_FILE, "a") as jsonlfile:
+                json_data_with_assignment_id =\
+                    [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
+                jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
+            repo.push_to_hub()
+        finally:
+            lock.release()
+        return state
     # Button event handlers
+    get_window_location_search_js = """
+        function(text_input, label_input, state, dummy) {
+            return [text_input, label_input, state, window.location.search];
+        }
+        """
     submit_ex_button.click(
         _predict,
+        inputs=[text_input, label_input, state, dummy],
+        outputs=[label_output, text_output, state, example_submit, final_submit, final_submit_preview, state_display, dummy],
+        _js=get_window_location_search_js,
     )
+    post_hit_js = """
+        function(state) {
+            // If there is an assignmentId, then the submitter is on mturk
+            // and has accepted the HIT. So, we need to submit their HIT.
+            const form = document.createElement('form');
+            form.action = 'https://workersandbox.mturk.com/mturk/externalSubmit';
+            form.method = 'post';
+            for (const key in state) {
+                const hiddenField = document.createElement('input');
+                hiddenField.type = 'hidden';
+                hiddenField.name = key;
+                hiddenField.value = state[key];
+                form.appendChild(hiddenField);
+            };
+            document.body.appendChild(form);
+            form.submit();
+            return state;
+        }
+        """
     submit_hit_button.click(
+        _store_in_huggingface_dataset,
+        inputs=[state],
+        outputs=[state],
+        _js=post_hit_js,
+    )
+    refresh_app_js = """
+        function(state) {
+            // The following line here loads the app again so the user can
+            // enter in another preview-mode "HIT".
+            window.location.href = window.location.href;
+            return state;
+        }
+        """
+    submit_hit_button_preview.click(
+        _store_in_huggingface_dataset,
+        inputs=[state],
+        outputs=[state],
+        _js=refresh_app_js,
     )
+demo.launch()

collect.py CHANGED Viewed

@@ -5,36 +5,52 @@ import boto3
 from boto.mturk.question import ExternalQuestion
 from config import MTURK_KEY, MTURK_SECRET
-MTURK_REGION = "us-east-1"
-MTURK_SANDBOX = "https://mturk-requester-sandbox.us-east-1.amazonaws.com"
 mturk = boto3.client(
     "mturk",
     aws_access_key_id=MTURK_KEY,
     aws_secret_access_key=MTURK_SECRET,
-    region_name=MTURK_REGION,
-    endpoint_url=MTURK_SANDBOX,
 )
-# The + in the URL makes the Space easily embeddable in an iframe
-question = ExternalQuestion(
-    "https://huggingface.co/spaces/douwekiela/dadc/+", frame_height=600
 )
-new_hit = mturk.create_hit(
-    Title="DADC with Gradio",
-    Description="Hello world",
-    Keywords="fool the model",
-    Reward="0.15",
-    MaxAssignments=1,
-    LifetimeInSeconds=172800,
-    AssignmentDurationInSeconds=600,
-    AutoApprovalDelayInSeconds=14400,
-    Question=question.get_as_xml(),
-)
 print(
-    "Sandbox link: https://workersandbox.mturk.com/mturk/preview?groupId="
     + new_hit["HIT"]["HITGroupId"]
 )

 from boto.mturk.question import ExternalQuestion
 from config import MTURK_KEY, MTURK_SECRET
+import argparse
+# Command-line options controlling where and how many HITs are created.
+parser = argparse.ArgumentParser()
+parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
+parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
+parser.add_argument("--num_hits", type=int, default=5, help="The number of HITs.")
+parser.add_argument("--num_assignments", type=int, default=1, help="The number of times that the HIT can be accepted and completed.")
+parser.add_argument("--live_mode", action="store_true", help="""
+    Whether to run in live mode with real turkers. This will charge your account money.
+    If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
+    which will not charge your account money.
+    """
+)
+args = parser.parse_args()
+# The requester endpoint gets a "-sandbox" suffix unless --live_mode is given.
+MTURK_URL = f"https://mturk-requester{'' if args.live_mode else '-sandbox'}.{args.mturk_region}.amazonaws.com"
 mturk = boto3.client(
     "mturk",
     aws_access_key_id=MTURK_KEY,
     aws_secret_access_key=MTURK_SECRET,
+    region_name=args.mturk_region,
+    endpoint_url=MTURK_URL,
 )
+# This is the URL that makes the space embeddable in an mturk iframe
+question = ExternalQuestion(f"https://hf.space/embed/{args.space_name}/+?__theme=light",
+    frame_height=600
 )
+for i in range(args.num_hits):
+    new_hit = mturk.create_hit(
+        Title="Beat the AI",
+        Description="Try to fool an AI by creating examples that it gets wrong",
+        Keywords="fool the model",
+        Reward="0.15",
+        MaxAssignments=args.num_assignments,
+        LifetimeInSeconds=172800,
+        AssignmentDurationInSeconds=600,
+        AutoApprovalDelayInSeconds=14400,
+        Question=question.get_as_xml(),
+    )
 print(
+    f"HIT Group Link: https://worker{'' if args.live_mode else 'sandbox'}.mturk.com/mturk/preview?groupId="
     + new_hit["HIT"]["HITGroupId"]
 )

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
-requests
-torch
-transformers

+torch==1.12.0
+transformers==4.20.1
+gradio==3.0.26
+boto3==1.24.32
+huggingface_hub==0.8.1
+python-dotenv==0.20.0