Jason committed on
Commit
d60c9d9
·
unverified ·
1 Parent(s): 011f79c

Disable HF account age requirement; submission fixes (#76)

Browse files
Files changed (1) hide show
  1. submission.py +25 -72
submission.py CHANGED
@@ -4,6 +4,7 @@ import sys
4
  import matplotlib
5
  from agenteval.cli import SUBMISSION_METADATA_FILENAME
6
  from agenteval.models import SubmissionMetadata
 
7
  from gradio_modal import Modal
8
 
9
  matplotlib.use('Agg')
@@ -62,6 +63,8 @@ def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to av
62
  return DatasetDict()
63
  except ValueError: # Handles cases where dataset is empty or ill-formed
64
  return DatasetDict()
 
 
65
 
66
  def checked_upload_folder(
67
  api_hf: HfApi, # Renamed to avoid conflict with global api
@@ -138,17 +141,16 @@ def add_new_eval(
138
 
139
  logger.debug(f"agent {agent_name}: User account age check {profile.username}")
140
  try:
141
- user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
142
- user_data_resp.raise_for_status()
143
- creation_date_str = user_data_resp.json()["createdAt"]
144
- created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
145
- if submission_time - created_at < timedelta(days=60):
146
- return (
147
- format_error("This account is not authorized to submit here (account too new)."), # error_message
148
- gr.update(visible=True), # error_modal
149
- gr.update(visible=False), # success_modal
150
- gr.update(visible=False) # loading_modal
151
- )
152
  except Exception as e:
153
  logger.warning(f"Error checking user account age: {e}")
154
  return (
@@ -271,9 +273,8 @@ def add_new_eval(
271
  contact_info["submit_time"] = submission_time.isoformat()
272
  contact_info["username_auth"] = profile.username
273
  contact_info["email"] = email
274
- contact_info["email_opt_in"] = email_opt_in,
275
- contact_info["role"] = role,
276
- contact_info
277
 
278
  logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
279
  if val_or_test in contact_infos:
@@ -299,65 +300,14 @@ def add_new_eval(
299
  gr.update(visible=False) # loading_modal
300
  )
301
 
302
- def _deprecated_scoring_logic():
303
- # No longer triggered on eval submission. Kept for quick reference for a little while (2025). TODO delete this.
304
-
305
- # 3. Process and score the submission
306
- eval_result_obj = None # Define to avoid NameError
307
- try:
308
- json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
309
- if not json_path.exists():
310
- return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
311
 
312
- eval_result_obj = LeaderboardSubmission.model_validate_json(json_path.read_text(encoding="utf-8"))
313
- if eval_result_obj.suite_config.version != CONFIG_NAME:
314
- return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
315
- if eval_result_obj.split != val_or_test:
316
- return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
 
317
 
318
- # Re-compute results from logs for integrity
319
- eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
320
- eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
321
-
322
- except Exception as e:
323
- return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
324
-
325
- # 4. Upload scored submission files
326
- logs_url_private_val, logs_url_public_val = None, None
327
- scored_submission_name = f"{submission_name}_scored"
328
- if not LOCAL_DEBUG:
329
- try:
330
- logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
331
- if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
332
- logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
333
- except ValueError as e: return format_error(str(e))
334
- except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
335
- else: print("mock uploaded scored submission", flush=True)
336
-
337
-
338
- # Update LeaderboardSubmission with submission details
339
- eval_result_obj.submission.agent_name = agent_name
340
- eval_result_obj.submission.agent_description = agent_description
341
- eval_result_obj.submission.agent_url = agent_url
342
- eval_result_obj.submission.openness = openness
343
- eval_result_obj.submission.degree_of_control = degree_of_control
344
- eval_result_obj.submission.username = username
345
- eval_result_obj.submission.submit_time = submission_time
346
- eval_result_obj.submission.logs_url = logs_url_private_val
347
- eval_result_obj.submission.logs_url_public = logs_url_public_val
348
-
349
- # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
350
- if not LOCAL_DEBUG:
351
- try:
352
- upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
353
- except Exception as e:
354
- return format_error(f"Failed to upload summary results to leaderboard: {e}")
355
- else: print("mock uploaded results to lb", flush=True)
356
-
357
- return format_log(
358
- f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
359
- "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
360
- )
361
 
362
  openness_label_html = """
363
  <div class="form-label-with-tooltip">
@@ -422,7 +372,10 @@ def build_page():
422
  with gr.Group(elem_classes="custom-form-group"):
423
  gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
424
  gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
425
- level_of_test_radio = gr.Radio(["Test set", "Validation set"], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
 
 
 
426
  gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
427
  agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
428
  gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
 
4
  import matplotlib
5
  from agenteval.cli import SUBMISSION_METADATA_FILENAME
6
  from agenteval.models import SubmissionMetadata
7
+ from datasets.exceptions import DataFilesNotFoundError
8
  from gradio_modal import Modal
9
 
10
  matplotlib.use('Agg')
 
63
  return DatasetDict()
64
  except ValueError: # Handles cases where dataset is empty or ill-formed
65
  return DatasetDict()
66
+ except DataFilesNotFoundError:
67
+ return DatasetDict()
68
 
69
  def checked_upload_folder(
70
  api_hf: HfApi, # Renamed to avoid conflict with global api
 
141
 
142
  logger.debug(f"agent {agent_name}: User account age check {profile.username}")
143
  try:
144
+ # Account age check disabled for launch.
145
+ # https://github.com/allenai/astabench-issues/issues/419
146
+ # if _is_hf_acct_too_new(submission_time, profile.username):
147
+ # return (
148
+ # format_error("This account is not authorized to submit here (account too new)."), # error_message
149
+ # gr.update(visible=True), # error_modal
150
+ # gr.update(visible=False), # success_modal
151
+ # gr.update(visible=False) # loading_modal
152
+ # )
153
+ pass
 
154
  except Exception as e:
155
  logger.warning(f"Error checking user account age: {e}")
156
  return (
 
273
  contact_info["submit_time"] = submission_time.isoformat()
274
  contact_info["username_auth"] = profile.username
275
  contact_info["email"] = email
276
+ contact_info["email_opt_in"] = email_opt_in
277
+ contact_info["role"] = role
 
278
 
279
  logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
280
  if val_or_test in contact_infos:
 
300
  gr.update(visible=False) # loading_modal
301
  )
302
 
 
 
 
 
 
 
 
 
 
303
 
304
def _is_hf_acct_too_new(
    submission_time: datetime,
    username: str,
    min_age_days: int = 60,
) -> bool:
    """Return True if the Hugging Face account is younger than *min_age_days*.

    Args:
        submission_time: Timezone-aware timestamp of the submission attempt.
        username: HF username to look up via the public users API.
        min_age_days: Minimum required account age in days (default 60,
            matching the original hard-coded threshold).

    Raises:
        requests.HTTPError: If the HF API responds with an error status.
        KeyError: If the API response lacks a ``createdAt`` field.
    """
    # Timeout prevents the submission flow from hanging indefinitely if the
    # HF API is unresponsive (the original call had no timeout).
    user_data_resp = requests.get(
        f"https://huggingface.co/api/users/{username}/overview",
        timeout=10,
    )
    user_data_resp.raise_for_status()
    creation_date_str = user_data_resp.json()["createdAt"]
    # fromisoformat (after mapping the trailing 'Z' to an explicit UTC
    # offset) accepts timestamps both with and without fractional seconds,
    # unlike the previous strict "%Y-%m-%dT%H:%M:%S.%fZ" strptime format.
    created_at = datetime.fromisoformat(creation_date_str.replace("Z", "+00:00"))
    if created_at.tzinfo is None:
        created_at = created_at.replace(tzinfo=timezone.utc)
    return submission_time - created_at < timedelta(days=min_age_days)
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  openness_label_html = """
313
  <div class="form-label-with-tooltip">
 
372
  with gr.Group(elem_classes="custom-form-group"):
373
  gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
374
  gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
375
+ level_of_test_radio = gr.Radio(choices=[
376
+ ("Test set", "test"),
377
+ ("Validation set", "validation"),
378
+ ], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
379
  gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
380
  agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
381
  gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")