saridormi committed on
Commit 04b315a • 1 Parent(s): 6c92442

Add a separate dataset for aggregating requests metadata
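In short: each submission's request metadata (now including contact information and an optional comment) is mirrored into a second, private dataset alongside the existing public results dataset. A minimal sketch of the new wiring, assuming placeholder repo IDs (the real values live in the Space configuration, and HF_TOKEN must be a write-enabled token):

    import os

    from src.submission_uploader import SubmissionUploader

    # All three values below are placeholders, not the real configuration.
    os.environ["HF_TOKEN"] = "hf_xxx"                             # write-enabled token
    os.environ["DATASET_ID"] = "my-org/results"                   # public results dataset
    os.environ["PRIVATE_DATASET_ID"] = "my-org/requests-private"  # new private requests dataset

    submission_uploader = SubmissionUploader(
        dataset_id=os.environ["DATASET_ID"],
        private_dataset_id=os.environ["PRIVATE_DATASET_ID"],
    )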

Files changed (2)
  1. app.py +6 -2
  2. src/submission_uploader.py +83 -12
app.py CHANGED

@@ -30,7 +30,9 @@ logging.basicConfig(
     handlers=[logging.StreamHandler()],
 )
 
-submission_uploader = SubmissionUploader(os.environ["DATASET_ID"])
+submission_uploader = SubmissionUploader(
+    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.environ["PRIVATE_DATASET_ID"]
+)
 
 
 with gr.Blocks() as demo:
@@ -61,7 +63,7 @@ with gr.Blocks() as demo:
         with gr.Column():
             model_folder_textbox = gr.Textbox(
                 label="Model Folder",
-                placeholder="How to call a folder related to this submission in our results dataset.",
+                placeholder="How to call a folder related to this submission in our results dataset (should be unique).",
             )
             model_name_textbox = gr.Textbox(
                 label="Model Name",
@@ -111,6 +113,8 @@ with gr.Blocks() as demo:
             url_textbox,
             context_size_textbox,
             submitted_by_textbox,
+            contact_textbox,
+            comment_textbox,
             file_output,
         ],
         submission_result,
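The two inputs newly wired into the click handler, contact_textbox and comment_textbox, are defined elsewhere in app.py and are not part of this diff. Their definitions presumably mirror the other textboxes; a sketch, with labels and placeholders that are assumptions:

    with gr.Column():
        # ... existing textboxes ...
        contact_textbox = gr.Textbox(
            label="Contact Information",  # assumed label; the new validation requires a non-empty value
            placeholder="How to reach you about this submission; stored only in the private requests dataset.",
        )
        comment_textbox = gr.Textbox(
            label="Comment",  # assumed label; this field is optional
        )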
src/submission_uploader.py CHANGED

@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import time
 from tempfile import TemporaryDirectory
 from typing import Dict, List, Optional
 
@@ -26,10 +27,11 @@ class SubmissionUploader:
     * https://huggingface.co/spaces/gaia-benchmark/leaderboard
     """
 
-    def __init__(self, dataset_id: str):
+    def __init__(self, dataset_id: str, private_dataset_id: str):
         self._api = HfApi(token=os.environ["HF_TOKEN"])
         self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
         self._dataset_id = dataset_id
+        self._private_dataset_id = private_dataset_id
 
     def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
         """Searches among discussions of dataset repo for a PR with the given title."""
@@ -46,10 +48,10 @@
         self,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
-    ) -> Dict[str, str]:
+    ) -> Dict[str, Optional[str]]:
         return {
             "model_name": model_name_pretty,
             "model_availability": model_availability,
@@ -58,6 +60,45 @@ class SubmissionUploader:
             "submitted_by": submitted_by,
         }
 
+    def _upload_request(
+        self,
+        task_id: str,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: Optional[str],
+        context_size: str,
+        submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
+        pr_url: str,
+        temp_directory: str,
+    ) -> List[CommitOperationAdd]:
+        request_metadata = {
+            "model_folder": model_folder,
+            "model_name_pretty": model_name_pretty,
+            "model_availability": model_availability,
+            "urls": urls,
+            "context_size": context_size,
+            "submitted_by": submitted_by,
+            "contact_information": contact_information,
+            "comment": comment,
+            "timestamp": time.time(),
+            "pr_url": pr_url,
+        }
+
+        with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
+            json.dump(request_metadata, f)
+
+        num_requests_already_present = len(self._fs.ls(f"datasets/{self._private_dataset_id}/{task_id}/"))
+        commit_operations = [
+            CommitOperationAdd(
+                path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
+                path_or_fileobj=os.path.join(temp_directory, "request_metadata.json"),
+            )
+        ]
+        return commit_operations
+
     def _upload_predictions(
         self,
         task_id: str,
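For a hypothetical submission, _upload_request above would stage a file named {task_id}/{counter}_{model_folder}.json, e.g. commit_message_generation/3_my-model-16k.json, with contents roughly like this (the task folder name and every value below are invented for illustration):

    {
        "model_folder": "my-model-16k",
        "model_name_pretty": "My Model (16k context)",
        "model_availability": "Open-source",
        "urls": "https://huggingface.co/my-org/my-model",
        "context_size": "16k",
        "submitted_by": "Jane Doe",
        "contact_information": "jane.doe@example.com",
        "comment": null,
        "timestamp": 1700000000.0,
        "pr_url": "https://huggingface.co/datasets/my-org/results/discussions/1"
    }

The counter comes from ls() on the task folder at staging time, so the model_folder suffix is what keeps files from two different submissions distinguishable even if they are staged concurrently.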
@@ -107,7 +148,7 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
         temp_directory: str,
@@ -141,9 +182,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
     ):
         assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
@@ -158,6 +201,7 @@
 
         assert submitted_by, "Please, specify non-empty information about a submission's author(s)."
         assert filenames, "Please, attach at least one file with predictions."
+        assert contact_information, "Please, fill in the field with contact information."
 
     def upload_files(
         self,
@@ -165,9 +209,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
         force: bool = False,
     ) -> str:
@@ -180,6 +226,8 @@
             urls=urls,
             context_size=context_size,
             submitted_by=submitted_by,
+            contact_information=contact_information,
+            comment=comment,
             filenames=filenames,
         )
         pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
@@ -190,11 +238,10 @@
 
         logging.info("Checking if this request has already been submitted...")
         if not force:
-            if model_name_pretty in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions") and all(
-                filename in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}")
-                for filename in filenames
-            ):
-                return styled_warning(f"{model_name_pretty} is already present in {self._dataset_id}.")
+            if model_folder in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions"):
+                return styled_warning(
+                    f"{model_folder} is already present in {self._dataset_id}, please, select another folder name."
+                )
 
             prev_pr = self._get_previous_pr(pr_title)
             if prev_pr is not None:
@@ -224,7 +271,7 @@
                 temp_directory=str(d),
             )
 
-            logging.info("Creating commit...")
+            logging.info(f"Creating commit to results dataset...")
             new_pr = self._api.create_commit(
                 repo_id=self._dataset_id,
                 operations=predictions_commit_operations + results_commit_operations,
@@ -233,6 +280,30 @@
                 create_pr=True,
                 repo_type="dataset",
             )
+
+            logging.info(f"Creating commit to requests dataset...")
+            request_commit_operations = self._upload_request(
+                task_id=task_id,
+                model_folder=model_folder,
+                temp_directory=str(d),
+                model_name_pretty=model_name_pretty,
+                model_availability=model_availability,
+                urls=urls,
+                context_size=context_size,
+                submitted_by=submitted_by,
+                contact_information=contact_information,
+                comment=comment,
+                pr_url=new_pr.pr_url,
+            )
+            self._api.create_commit(
+                repo_id=self._private_dataset_id,
+                operations=request_commit_operations,
+                commit_message=pr_title,
+                commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
+                create_pr=True,
+                repo_type="dataset",
+            )
+
             return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
 
         except Exception as e:
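End to end, a successful submission now opens a PR against the public results dataset and a second PR against the private requests dataset. A hypothetical call through the extended upload_files signature (every value below is invented for illustration):

    message = submission_uploader.upload_files(
        task_pretty="Commit message generation",     # invented; must be a key of TASKS_PRETTY_REVERSE
        model_folder="my-model-16k",                 # must be unique within the results dataset
        model_name_pretty="My Model (16k context)",
        model_availability="Open-source",
        urls="https://huggingface.co/my-org/my-model",
        context_size="16k",
        submitted_by="Jane Doe",
        contact_information="jane.doe@example.com",  # now required by _verify_arguments
        comment=None,                                # optional
        filenames=["predictions.jsonl"],
    )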
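After a few accepted submissions, the private requests dataset would accumulate one metadata file per request, named by counter and folder. A sketch of the resulting layout (task folder names are assumptions, based on the path_in_repo pattern above):

    commit_message_generation/
        0_baseline-model.json
        1_my-model-16k.json
    bug_localization/
        0_baseline-model.json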