Spaces:
Running
Running
Jason
committed on
Disable HF account age requirement; submission fixes (#76)
Browse files- submission.py +25 -72
submission.py
CHANGED
|
@@ -4,6 +4,7 @@ import sys
|
|
| 4 |
import matplotlib
|
| 5 |
from agenteval.cli import SUBMISSION_METADATA_FILENAME
|
| 6 |
from agenteval.models import SubmissionMetadata
|
|
|
|
| 7 |
from gradio_modal import Modal
|
| 8 |
|
| 9 |
matplotlib.use('Agg')
|
|
@@ -62,6 +63,8 @@ def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to av
|
|
| 62 |
return DatasetDict()
|
| 63 |
except ValueError: # Handles cases where dataset is empty or ill-formed
|
| 64 |
return DatasetDict()
|
|
|
|
|
|
|
| 65 |
|
| 66 |
def checked_upload_folder(
|
| 67 |
api_hf: HfApi, # Renamed to avoid conflict with global api
|
|
@@ -138,17 +141,16 @@ def add_new_eval(
|
|
| 138 |
|
| 139 |
logger.debug(f"agent {agent_name}: User account age check {profile.username}")
|
| 140 |
try:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
)
|
| 152 |
except Exception as e:
|
| 153 |
logger.warning(f"Error checking user account age: {e}")
|
| 154 |
return (
|
|
@@ -271,9 +273,8 @@ def add_new_eval(
|
|
| 271 |
contact_info["submit_time"] = submission_time.isoformat()
|
| 272 |
contact_info["username_auth"] = profile.username
|
| 273 |
contact_info["email"] = email
|
| 274 |
-
contact_info["email_opt_in"] = email_opt_in
|
| 275 |
-
contact_info["role"] = role
|
| 276 |
-
contact_info
|
| 277 |
|
| 278 |
logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
|
| 279 |
if val_or_test in contact_infos:
|
|
@@ -299,65 +300,14 @@ def add_new_eval(
|
|
| 299 |
gr.update(visible=False) # loading_modal
|
| 300 |
)
|
| 301 |
|
| 302 |
-
def _deprecated_scoring_logic():
|
| 303 |
-
# No longer triggered on eval submission. Kept for quick reference for a little while (2025). TODO delete this.
|
| 304 |
-
|
| 305 |
-
# 3. Process and score the submission
|
| 306 |
-
eval_result_obj = None # Define to avoid NameError
|
| 307 |
-
try:
|
| 308 |
-
json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
|
| 309 |
-
if not json_path.exists():
|
| 310 |
-
return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
|
|
|
| 317 |
|
| 318 |
-
# Re-compute results from logs for integrity
|
| 319 |
-
eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
|
| 320 |
-
eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
|
| 321 |
-
|
| 322 |
-
except Exception as e:
|
| 323 |
-
return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
|
| 324 |
-
|
| 325 |
-
# 4. Upload scored submission files
|
| 326 |
-
logs_url_private_val, logs_url_public_val = None, None
|
| 327 |
-
scored_submission_name = f"{submission_name}_scored"
|
| 328 |
-
if not LOCAL_DEBUG:
|
| 329 |
-
try:
|
| 330 |
-
logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
|
| 331 |
-
if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
|
| 332 |
-
logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
|
| 333 |
-
except ValueError as e: return format_error(str(e))
|
| 334 |
-
except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
|
| 335 |
-
else: print("mock uploaded scored submission", flush=True)
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
# Update LeaderboardSubmission with submission details
|
| 339 |
-
eval_result_obj.submission.agent_name = agent_name
|
| 340 |
-
eval_result_obj.submission.agent_description = agent_description
|
| 341 |
-
eval_result_obj.submission.agent_url = agent_url
|
| 342 |
-
eval_result_obj.submission.openness = openness
|
| 343 |
-
eval_result_obj.submission.degree_of_control = degree_of_control
|
| 344 |
-
eval_result_obj.submission.username = username
|
| 345 |
-
eval_result_obj.submission.submit_time = submission_time
|
| 346 |
-
eval_result_obj.submission.logs_url = logs_url_private_val
|
| 347 |
-
eval_result_obj.submission.logs_url_public = logs_url_public_val
|
| 348 |
-
|
| 349 |
-
# 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
|
| 350 |
-
if not LOCAL_DEBUG:
|
| 351 |
-
try:
|
| 352 |
-
upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
|
| 353 |
-
except Exception as e:
|
| 354 |
-
return format_error(f"Failed to upload summary results to leaderboard: {e}")
|
| 355 |
-
else: print("mock uploaded results to lb", flush=True)
|
| 356 |
-
|
| 357 |
-
return format_log(
|
| 358 |
-
f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
|
| 359 |
-
"Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
|
| 360 |
-
)
|
| 361 |
|
| 362 |
openness_label_html = """
|
| 363 |
<div class="form-label-with-tooltip">
|
|
@@ -422,7 +372,10 @@ def build_page():
|
|
| 422 |
with gr.Group(elem_classes="custom-form-group"):
|
| 423 |
gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
|
| 424 |
gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
|
| 425 |
-
level_of_test_radio = gr.Radio([
|
|
|
|
|
|
|
|
|
|
| 426 |
gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
|
| 427 |
agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
|
| 428 |
gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
|
|
|
|
| 4 |
import matplotlib
|
| 5 |
from agenteval.cli import SUBMISSION_METADATA_FILENAME
|
| 6 |
from agenteval.models import SubmissionMetadata
|
| 7 |
+
from datasets.exceptions import DataFilesNotFoundError
|
| 8 |
from gradio_modal import Modal
|
| 9 |
|
| 10 |
matplotlib.use('Agg')
|
|
|
|
| 63 |
return DatasetDict()
|
| 64 |
except ValueError: # Handles cases where dataset is empty or ill-formed
|
| 65 |
return DatasetDict()
|
| 66 |
+
except DataFilesNotFoundError:
|
| 67 |
+
return DatasetDict()
|
| 68 |
|
| 69 |
def checked_upload_folder(
|
| 70 |
api_hf: HfApi, # Renamed to avoid conflict with global api
|
|
|
|
| 141 |
|
| 142 |
logger.debug(f"agent {agent_name}: User account age check {profile.username}")
|
| 143 |
try:
|
| 144 |
+
# Account age check disabled for launch.
|
| 145 |
+
# https://github.com/allenai/astabench-issues/issues/419
|
| 146 |
+
# if _is_hf_acct_too_new(submission_time, profile.username):
|
| 147 |
+
# return (
|
| 148 |
+
# format_error("This account is not authorized to submit here (account too new)."), # error_message
|
| 149 |
+
# gr.update(visible=True), # error_modal
|
| 150 |
+
# gr.update(visible=False), # success_modal
|
| 151 |
+
# gr.update(visible=False) # loading_modal
|
| 152 |
+
# )
|
| 153 |
+
pass
|
|
|
|
| 154 |
except Exception as e:
|
| 155 |
logger.warning(f"Error checking user account age: {e}")
|
| 156 |
return (
|
|
|
|
| 273 |
contact_info["submit_time"] = submission_time.isoformat()
|
| 274 |
contact_info["username_auth"] = profile.username
|
| 275 |
contact_info["email"] = email
|
| 276 |
+
contact_info["email_opt_in"] = email_opt_in
|
| 277 |
+
contact_info["role"] = role
|
|
|
|
| 278 |
|
| 279 |
logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
|
| 280 |
if val_or_test in contact_infos:
|
|
|
|
| 300 |
gr.update(visible=False) # loading_modal
|
| 301 |
)
|
| 302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
|
| 304 |
+
def _is_hf_acct_too_new(submission_time: datetime, username: str) -> bool:
    """Return True when the Hugging Face account is younger than 60 days.

    Fetches the account overview from the Hugging Face Hub API, reads its
    ``createdAt`` timestamp, and compares the account age at
    ``submission_time`` against a 60-day minimum.

    Args:
        submission_time: Moment of the submission. It is subtracted from a
            UTC-aware datetime, so it must be timezone-aware as well
            (NOTE(review): confirm the caller always passes an aware value).
        username: Hugging Face username whose account age is checked.

    Returns:
        True if the account was created less than 60 days before
        ``submission_time``, False otherwise.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            response (via ``raise_for_status``).
        KeyError: If the response JSON has no ``createdAt`` field.
        ValueError: If ``createdAt`` matches neither expected timestamp form.
    """
    # Bounded wait: without a timeout a stalled Hub request would block the
    # submission handler indefinitely.
    user_data_resp = requests.get(
        f"https://huggingface.co/api/users/{username}/overview",
        timeout=10,
    )
    user_data_resp.raise_for_status()
    creation_date_str = user_data_resp.json()["createdAt"]
    try:
        created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        # Tolerate timestamps serialized without fractional seconds; a value
        # matching neither form still raises ValueError as before.
        created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%SZ")
    # The trailing "Z" denotes UTC; strptime yields a naive value, so attach it.
    created_at = created_at.replace(tzinfo=timezone.utc)
    return submission_time - created_at < timedelta(days=60)
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
openness_label_html = """
|
| 313 |
<div class="form-label-with-tooltip">
|
|
|
|
| 372 |
with gr.Group(elem_classes="custom-form-group"):
|
| 373 |
gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
|
| 374 |
gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
|
| 375 |
+
level_of_test_radio = gr.Radio(choices=[
|
| 376 |
+
("Test set", "test"),
|
| 377 |
+
("Validation set", "validation"),
|
| 378 |
+
], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
|
| 379 |
gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
|
| 380 |
agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
|
| 381 |
gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
|