Tristan Thrush committed on
Commit
829775d
1 Parent(s): e3e024d
Files changed (4) hide show
  1. README.md +16 -9
  2. app.py +13 -6
  3. collect.py +21 -17
  4. requirements.txt +6 -7
README.md CHANGED
@@ -20,8 +20,10 @@ A basic example of dynamic adversarial data collection with a Gradio app.
20
  name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
21
  the url of this dataset in the secrets for your space, with the name
22
  `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
23
- space on mturk in the following lines, the app will use your token to
24
- automatically store new hits to your dataset.
 
 
25
 
26
  **Running Data Collection**
27
  1. On your local repo that you pulled, create a copy of `config.py.example`,
@@ -29,15 +31,20 @@ A basic example of dynamic adversarial data collection with a Gradio app.
29
  These keys should be for an AWS account that has the
30
  AmazonMechanicalTurkFullAccess permission. You also need to
31
  create an mturk requester account associated with your AWS account.
32
- 2. Run `python collect.py` locally. If you run it with the `--live_mode` flag,
33
- it launches HITs on mturk, using the app you deployed on the space as the
34
- data collection UI and backend. NOTE: this means that you will need to pay
35
- real workers. If you don't use the `--live_mode` flag, then it will run the
36
- HITs on mturk sandbox, which is identical to the normal mturk, but just for
37
- testing. You can create a worker account and go to the sandbox version to
38
- test your HIT.
39
 
40
  **Profit**
41
  Now, you should be watching HITs come into your Hugging Face dataset
42
  automatically!
43
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
21
  the url of this dataset in the secrets for your space, with the name
22
  `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
23
+ space on mturk and when people visit your space on huggingface.co, the app
24
+ will use your token to automatically store new HITs in your dataset. NOTE:
25
+ if you push something to your dataset manually, you need to restart your space
26
+ or it could get merge conflicts when trying to push HIT data.
27
 
28
  **Running Data Collection**
29
  1. On your local repo that you pulled, create a copy of `config.py.example`,
 
31
  These keys should be for an AWS account that has the
32
  AmazonMechanicalTurkFullAccess permission. You also need to
33
  create an mturk requester account associated with your AWS account.
34
+ 2. Run `python collect.py` locally.
 
 
 
 
 
 
35
 
36
  **Profit**
37
  Now, you should be watching HITs come into your Hugging Face dataset
38
  automatically!
39
 
40
+ **Tips and Tricks**
41
+ - If you are developing and running this space locally to test it out, try
42
+ deleting the data directory that the app clones before running the app again.
43
+ Otherwise, the app could get merge conflicts when storing new HITs on the hub.
44
+ When you redeploy your app on Hugging Face spaces, the data directory is deleted
45
+ automatically.
46
+ - Hugging Face Spaces have limited computational resources and memory. If you
47
+ run too many HITs and/or assignments at once, then you could encounter issues.
48
+ You could also encounter issues if you are trying to create a dataset that is
49
+ very large. Check the log of your space for any errors that could be happening.
50
+
app.py CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import Repository
11
  from dotenv import load_dotenv
12
  from pathlib import Path
13
  import json
 
14
 
15
  # These variables are for storing the mturk HITs in a Hugging Face dataset.
16
  if Path(".env").is_file():
@@ -92,11 +93,16 @@ with demo:
92
  # This _store_in_huggingface_dataset function just demonstrates how easy it is
93
  # to automatically create a Hugging Face dataset from mturk.
94
  def _store_in_huggingface_dataset(state):
95
- with open(DATA_FILE, "a") as jsonlfile:
96
- json_data_with_assignment_id =\
97
- [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
98
- jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
99
- repo.push_to_hub()
 
 
 
 
 
100
  return state
101
 
102
  # Button event handlers
@@ -130,7 +136,7 @@ with demo:
130
  };
131
  document.body.appendChild(form);
132
  form.submit();
133
- return [state];
134
  } else {
135
  // If there is no assignmentId, then we assume that the submitter is
136
  // on huggingface.co and we can't submit a HIT to mturk. But
@@ -138,6 +144,7 @@ with demo:
138
  // our dataset without an assignmentId. The following line here
139
  // loads the app again so the user can enter in another "fake" HIT.
140
  window.location.href = window.location.href;
 
141
  }
142
  }
143
  """
 
11
  from dotenv import load_dotenv
12
  from pathlib import Path
13
  import json
14
+ from filelock import FileLock
15
 
16
  # These variables are for storing the mturk HITs in a Hugging Face dataset.
17
  if Path(".env").is_file():
 
93
  # This _store_in_huggingface_dataset function just demonstrates how easy it is
94
  # to automatically create a Hugging Face dataset from mturk.
95
  def _store_in_huggingface_dataset(state):
96
+ lock = FileLock(DATA_FILE + ".lock")
97
+ lock.acquire()
98
+ try:
99
+ with open(DATA_FILE, "a") as jsonlfile:
100
+ json_data_with_assignment_id =\
101
+ [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
102
+ jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
103
+ repo.push_to_hub()
104
+ finally:
105
+ lock.release()
106
  return state
107
 
108
  # Button event handlers
 
136
  };
137
  document.body.appendChild(form);
138
  form.submit();
139
+ return state;
140
  } else {
141
  // If there is no assignmentId, then we assume that the submitter is
142
  // on huggingface.co and we can't submit a HIT to mturk. But
 
144
  // our dataset without an assignmentId. The following line here
145
  // loads the app again so the user can enter in another "fake" HIT.
146
  window.location.href = window.location.href;
147
+ return state;
148
  }
149
  }
150
  """
collect.py CHANGED
@@ -10,7 +10,8 @@ import argparse
10
  parser = argparse.ArgumentParser()
11
  parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
12
  parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
13
- parser.add_argument("--num_assignments", type=int, default=5, help="The number of times that the HIT can be accepted and completed.")
 
14
  parser.add_argument("--live_mode", action="store_true", help="""
15
  Whether to run in live mode with real turkers. This will charge your account money.
16
  If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
@@ -35,19 +36,22 @@ question = ExternalQuestion(f"https://hf.space/embed/{args.space_name}/+?__theme
35
  frame_height=600
36
  )
37
 
38
- new_hit = mturk.create_hit(
39
- Title="DADC with Gradio",
40
- Description="Hello",
41
- Keywords="fool the model",
42
- Reward="0.15",
43
- MaxAssignments=args.num_assignments,
44
- LifetimeInSeconds=172800,
45
- AssignmentDurationInSeconds=600,
46
- AutoApprovalDelayInSeconds=14400,
47
- Question=question.get_as_xml(),
48
- )
49
-
50
- print(
51
- f"Link: https://worker{'' if args.live_mode else 'sandbox'}.mturk.com/mturk/preview?groupId="
52
- + new_hit["HIT"]["HITGroupId"]
53
- )
 
 
 
 
10
  parser = argparse.ArgumentParser()
11
  parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
12
  parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
13
+ parser.add_argument("--num_hits", type=int, default=5, help="The number of HITs.")
14
+ parser.add_argument("--num_assignments", type=int, default=1, help="The number of times that the HIT can be accepted and completed.")
15
  parser.add_argument("--live_mode", action="store_true", help="""
16
  Whether to run in live mode with real turkers. This will charge your account money.
17
  If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
 
36
  frame_height=600
37
  )
38
 
39
+ for i in range(args.num_hits):
40
+ new_hit = mturk.create_hit(
41
+ Title="Beat the AI",
42
+ Description="Try to fool an AI by creating examples that it gets wrong",
43
+ Keywords="fool the model",
44
+ Reward="0.15",
45
+ MaxAssignments=args.num_assignments,
46
+ LifetimeInSeconds=172800,
47
+ AssignmentDurationInSeconds=600,
48
+ AutoApprovalDelayInSeconds=14400,
49
+ Question=question.get_as_xml(),
50
+ )
51
+
52
+ print(
53
+ f"HIT #{i} Link: https://worker{'' if args.live_mode else 'sandbox'}.mturk.com/mturk/preview?groupId="
54
+ + new_hit["HIT"]["HITGroupId"]
55
+ )
56
+
57
+ new_hits.append(new_hit)
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
- requests
2
- torch
3
- transformers
4
- gradio
5
- boto3
6
- huggingface_hub
7
- python-dotenv
 
1
+ torch==1.12.0
2
+ transformers==4.20.1
3
+ gradio==3.0.26
4
+ boto3==1.24.32
5
+ huggingface_hub==0.8.1
6
+ python-dotenv==0.20.0