chore: create final submit space
Browse files- README.md +66 -6
- __init__.py +1 -0
- app.py +660 -0
- repo_ops.py +114 -0
- requirements.txt +2 -0
- validator.py +518 -0
README.md
CHANGED
|
@@ -1,12 +1,72 @@
|
|
| 1 |
---
|
| 2 |
-
title: ResearchClawBench Task
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ResearchClawBench Task Submission
|
| 3 |
+
emoji: 📦
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# ResearchClawBench Hugging Face Submission Space
|
| 13 |
+
|
| 14 |
+
This directory contains a deployable MVP for a Hugging Face Space that lets users submit a new ResearchClawBench task as a zip archive.
|
| 15 |
+
|
| 16 |
+
## What it does
|
| 17 |
+
|
| 18 |
+
- accepts a single `.zip` upload
|
| 19 |
+
- requires exactly one top-level task directory inside the archive
|
| 20 |
+
- validates the full ResearchClawBench task structure and JSON/path format
|
| 21 |
+
- allocates the next available `Domain_NNN` task id from the Hugging Face dataset repo
|
| 22 |
+
- creates a PR against `InternScience/ResearchClawBench` when validation passes
|
| 23 |
+
|
| 24 |
+
## Files
|
| 25 |
+
|
| 26 |
+
- `app.py`: Gradio Space UI
|
| 27 |
+
- `validator.py`: archive extraction and task-format validation
|
| 28 |
+
- `repo_ops.py`: Hugging Face repo scanning, task-id allocation, PR creation
|
| 29 |
+
- `requirements.txt`: extra Python dependencies beyond the built-in Gradio SDK
|
| 30 |
+
|
| 31 |
+
## Expected upload format
|
| 32 |
+
|
| 33 |
+
The uploaded zip must contain exactly one task directory:
|
| 34 |
+
|
| 35 |
+
```text
|
| 36 |
+
Astronomy_submission.zip
|
| 37 |
+
└── some_folder_name/
|
| 38 |
+
├── task_info.json
|
| 39 |
+
├── data/
|
| 40 |
+
├── related_work/
|
| 41 |
+
└── target_study/
|
| 42 |
+
├── checklist.json
|
| 43 |
+
├── paper.pdf
|
| 44 |
+
└── images/
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
The top-level directory name inside the zip does not need to be the final task id. The Space validates the structure, then renames it to the next available `Domain_NNN` id when opening the PR.
|
| 48 |
+
|
| 49 |
+
## Required environment variables / Space secrets
|
| 50 |
+
|
| 51 |
+
- `RCB_SPACE_HF_TOKEN` or `HF_TOKEN`: Hugging Face write token for creating PRs to `InternScience/ResearchClawBench`
|
| 52 |
+
|
| 53 |
+
Optional limits:
|
| 54 |
+
|
| 55 |
+
- `RCB_SPACE_MAX_FILES`
|
| 56 |
+
- `RCB_SPACE_MAX_TOTAL_BYTES`
|
| 57 |
+
- `RCB_SPACE_MAX_SINGLE_FILE_BYTES`
|
| 58 |
+
|
| 59 |
+
## Local run
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
cd ResearchClawBench-Self/huggingface/space_submitter
|
| 63 |
+
python -m pip install gradio==5.49.1 -r requirements.txt
|
| 64 |
+
python app.py
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Notes
|
| 68 |
+
|
| 69 |
+
- validation does not modify the main `ResearchClawBench` repo
|
| 70 |
+
- PR creation targets the Hugging Face dataset repo directly with `create_pr=True`
|
| 71 |
+
- after a PR is created, maintainers still decide whether to merge it
|
| 72 |
+
- on Hugging Face Spaces, the Gradio version comes from the README YAML `sdk_version`, not from `requirements.txt`
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""ResearchClawBench Hugging Face Space submission tools."""
|
app.py
ADDED
|
@@ -0,0 +1,660 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from .repo_ops import DEFAULT_REPO_ID, allocate_next_task_id, create_dataset_pr, list_existing_task_ids, load_hf_token
|
| 10 |
+
from .validator import (
|
| 11 |
+
DOMAINS,
|
| 12 |
+
PreparedSubmission,
|
| 13 |
+
SubmissionMetadata,
|
| 14 |
+
ValidationError,
|
| 15 |
+
build_public_report,
|
| 16 |
+
cleanup_work_dir,
|
| 17 |
+
normalize_domain_token,
|
| 18 |
+
validate_and_prepare_submission,
|
| 19 |
+
)
|
| 20 |
+
except ImportError:
|
| 21 |
+
from repo_ops import DEFAULT_REPO_ID, allocate_next_task_id, create_dataset_pr, list_existing_task_ids, load_hf_token
|
| 22 |
+
from validator import (
|
| 23 |
+
DOMAINS,
|
| 24 |
+
PreparedSubmission,
|
| 25 |
+
SubmissionMetadata,
|
| 26 |
+
ValidationError,
|
| 27 |
+
build_public_report,
|
| 28 |
+
cleanup_work_dir,
|
| 29 |
+
normalize_domain_token,
|
| 30 |
+
validate_and_prepare_submission,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
SPACE_TITLE = 'ResearchClawBench Task Submission'
|
| 35 |
+
GITHUB_REPO_URL = 'https://github.com/InternScience/ResearchClawBench'
|
| 36 |
+
DATASET_URL = f'https://huggingface.co/datasets/{DEFAULT_REPO_ID}'
|
| 37 |
+
SPACE_URL = 'https://huggingface.co/spaces/InternScience/ResearchClawBench-Task-Submit'
|
| 38 |
+
|
| 39 |
+
CSS = """
|
| 40 |
+
@import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;600;700;800&display=swap');
|
| 41 |
+
|
| 42 |
+
:root {
|
| 43 |
+
--page-text: #0f172a;
|
| 44 |
+
--page-muted: #526075;
|
| 45 |
+
--page-line: rgba(15, 23, 42, 0.12);
|
| 46 |
+
--page-surface: rgba(255, 255, 255, 0.78);
|
| 47 |
+
--page-surface-strong: #ffffff;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
body {
|
| 51 |
+
background:
|
| 52 |
+
radial-gradient(circle at top left, rgba(54, 107, 245, 0.12), transparent 34%),
|
| 53 |
+
radial-gradient(circle at top right, rgba(15, 118, 110, 0.08), transparent 28%),
|
| 54 |
+
linear-gradient(180deg, #f8fafc 0%, #f3f6fb 55%, #f6f8fb 100%);
|
| 55 |
+
color: var(--page-text);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
body,
|
| 59 |
+
button,
|
| 60 |
+
input,
|
| 61 |
+
textarea {
|
| 62 |
+
font-family: 'Manrope', 'Noto Sans SC', 'PingFang SC', 'Microsoft YaHei', sans-serif !important;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.gradio-container {
|
| 66 |
+
max-width: 1220px !important;
|
| 67 |
+
margin: 0 auto !important;
|
| 68 |
+
padding: 34px 28px 56px !important;
|
| 69 |
+
--block-background-fill: transparent;
|
| 70 |
+
--block-border-width: 0px;
|
| 71 |
+
--block-border-color: transparent;
|
| 72 |
+
--block-label-background-fill: transparent;
|
| 73 |
+
--block-label-border-width: 0px;
|
| 74 |
+
--panel-background-fill: transparent;
|
| 75 |
+
--panel-border-width: 0px;
|
| 76 |
+
--panel-border-color: transparent;
|
| 77 |
+
--background-fill-secondary: transparent;
|
| 78 |
+
--body-background-fill: transparent;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
.page-shell {
|
| 82 |
+
margin-top: 26px;
|
| 83 |
+
padding: 30px 34px 34px;
|
| 84 |
+
background: #ffffff;
|
| 85 |
+
border: 1px solid rgba(15, 23, 42, 0.08);
|
| 86 |
+
border-radius: 22px;
|
| 87 |
+
box-shadow: 0 18px 48px rgba(15, 23, 42, 0.05);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.hero {
|
| 91 |
+
padding: 38px 42px 34px;
|
| 92 |
+
border-radius: 24px;
|
| 93 |
+
color: #f8fbff;
|
| 94 |
+
background:
|
| 95 |
+
radial-gradient(circle at 14% 18%, rgba(255, 255, 255, 0.16), transparent 18%),
|
| 96 |
+
linear-gradient(135deg, #0f274d 0%, #133c7c 46%, #124f75 100%);
|
| 97 |
+
box-shadow: 0 26px 60px rgba(15, 39, 77, 0.18);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.hero h1 {
|
| 101 |
+
margin: 0;
|
| 102 |
+
font-size: 2.4rem;
|
| 103 |
+
line-height: 1.02;
|
| 104 |
+
letter-spacing: -0.04em;
|
| 105 |
+
color: #f8fbff !important;
|
| 106 |
+
text-shadow: 0 1px 12px rgba(0, 0, 0, 0.14);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.hero-copy {
|
| 110 |
+
margin-top: 16px;
|
| 111 |
+
max-width: 860px;
|
| 112 |
+
font-size: 1.04rem;
|
| 113 |
+
line-height: 1.72;
|
| 114 |
+
color: rgba(248, 251, 255, 0.9) !important;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.hero-links {
|
| 118 |
+
display: flex;
|
| 119 |
+
gap: 14px;
|
| 120 |
+
flex-wrap: wrap;
|
| 121 |
+
margin-top: 22px;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
.hero-links a {
|
| 125 |
+
color: #f8fbff !important;
|
| 126 |
+
text-decoration: none;
|
| 127 |
+
font-weight: 700;
|
| 128 |
+
letter-spacing: -0.01em;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
.hero-links a:hover {
|
| 132 |
+
text-decoration: underline;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.hero-meta {
|
| 136 |
+
margin-top: 18px;
|
| 137 |
+
font-size: 0.93rem;
|
| 138 |
+
color: rgba(248, 251, 255, 0.72) !important;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.section-row {
|
| 142 |
+
margin-top: 30px;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.section-row,
|
| 146 |
+
.section-row > div,
|
| 147 |
+
.section-copy,
|
| 148 |
+
.section-copy > div,
|
| 149 |
+
.main-form,
|
| 150 |
+
.side-notes {
|
| 151 |
+
background: transparent !important;
|
| 152 |
+
border: 0 !important;
|
| 153 |
+
box-shadow: none !important;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.section-copy h2 {
|
| 157 |
+
margin: 0 0 10px;
|
| 158 |
+
font-size: 1.2rem;
|
| 159 |
+
letter-spacing: -0.03em;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.section-copy h3 {
|
| 163 |
+
margin: 24px 0 8px;
|
| 164 |
+
font-size: 1rem;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
.section-copy p,
|
| 168 |
+
.section-copy li {
|
| 169 |
+
color: #5a667a;
|
| 170 |
+
line-height: 1.72;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
.section-copy ul,
|
| 174 |
+
.section-copy ol {
|
| 175 |
+
margin: 10px 0 0;
|
| 176 |
+
padding-left: 1.2rem;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.section-copy code {
|
| 180 |
+
font-size: 0.95em;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
.section-copy .prose {
|
| 184 |
+
max-width: 100%;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
.subtle-block {
|
| 188 |
+
padding-bottom: 22px;
|
| 189 |
+
border-bottom: 1px solid var(--page-line);
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.section-copy .prose,
|
| 193 |
+
.section-copy .prose *,
|
| 194 |
+
.section-copy .md,
|
| 195 |
+
.section-copy .md *,
|
| 196 |
+
.section-copy .markdown,
|
| 197 |
+
.section-copy .markdown * {
|
| 198 |
+
background: transparent !important;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.main-form {
|
| 202 |
+
padding-right: 14px;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.side-notes {
|
| 206 |
+
padding-left: 10px;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.caption {
|
| 210 |
+
margin-top: 4px;
|
| 211 |
+
color: var(--page-muted);
|
| 212 |
+
font-size: 0.93rem;
|
| 213 |
+
line-height: 1.6;
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
.field-label {
|
| 217 |
+
margin: 18px 0 8px;
|
| 218 |
+
color: var(--page-text);
|
| 219 |
+
font-size: 0.95rem;
|
| 220 |
+
font-weight: 700;
|
| 221 |
+
letter-spacing: -0.01em;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
.results-shell {
|
| 225 |
+
margin-top: 26px;
|
| 226 |
+
padding-top: 22px;
|
| 227 |
+
border-top: 1px solid var(--page-line);
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.action-row {
|
| 231 |
+
margin-top: 10px;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.upload-row {
|
| 235 |
+
margin-top: 18px;
|
| 236 |
+
margin-bottom: 10px;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.upload-button button {
|
| 240 |
+
border-radius: 12px !important;
|
| 241 |
+
min-height: 48px !important;
|
| 242 |
+
padding: 0 18px !important;
|
| 243 |
+
background: #ffffff !important;
|
| 244 |
+
color: var(--page-text) !important;
|
| 245 |
+
border: 1px solid rgba(19, 70, 162, 0.16) !important;
|
| 246 |
+
box-shadow: 0 8px 22px rgba(15, 23, 42, 0.04) !important;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.upload-status {
|
| 250 |
+
padding-top: 10px;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
.upload-status p {
|
| 254 |
+
margin: 0 !important;
|
| 255 |
+
color: var(--page-muted) !important;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
.primary-button button,
|
| 259 |
+
.secondary-button button {
|
| 260 |
+
border-radius: 12px !important;
|
| 261 |
+
min-height: 48px !important;
|
| 262 |
+
font-weight: 700 !important;
|
| 263 |
+
letter-spacing: -0.01em;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.primary-button button {
|
| 267 |
+
background: linear-gradient(135deg, #1346a2 0%, #155eef 100%) !important;
|
| 268 |
+
box-shadow: 0 16px 32px rgba(21, 94, 239, 0.2) !important;
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
.secondary-button button {
|
| 272 |
+
background: var(--page-surface-strong) !important;
|
| 273 |
+
color: var(--page-text) !important;
|
| 274 |
+
border: 1px solid rgba(15, 23, 42, 0.12) !important;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
.gradio-container .block,
|
| 278 |
+
.gradio-container .gr-box,
|
| 279 |
+
.gradio-container .gr-form,
|
| 280 |
+
.gradio-container .gr-group,
|
| 281 |
+
.gradio-container .form,
|
| 282 |
+
.gradio-container .input-container,
|
| 283 |
+
.gradio-container .wrap,
|
| 284 |
+
.gradio-container .row,
|
| 285 |
+
.gradio-container .column,
|
| 286 |
+
.gradio-container fieldset {
|
| 287 |
+
background: transparent !important;
|
| 288 |
+
box-shadow: none !important;
|
| 289 |
+
border-color: transparent !important;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
.gradio-container input:not([type="checkbox"]),
|
| 293 |
+
.gradio-container textarea,
|
| 294 |
+
.gradio-container button[aria-haspopup="listbox"],
|
| 295 |
+
.gradio-container button[role="listbox"],
|
| 296 |
+
.gradio-container .wrap:has(input:not([type="checkbox"])),
|
| 297 |
+
.gradio-container .wrap:has(textarea),
|
| 298 |
+
.gradio-container .wrap:has(button[aria-haspopup="listbox"]),
|
| 299 |
+
.gradio-container .wrap:has(button[role="listbox"]),
|
| 300 |
+
.gradio-container .wrap:has(select),
|
| 301 |
+
.gradio-container .input-container:has(input:not([type="checkbox"])),
|
| 302 |
+
.gradio-container .input-container:has(textarea),
|
| 303 |
+
.gradio-container .input-container:has(button[aria-haspopup="listbox"]),
|
| 304 |
+
.gradio-container .input-container:has(button[role="listbox"]),
|
| 305 |
+
.gradio-container input:not([type="checkbox"]),
|
| 306 |
+
.gradio-container textarea {
|
| 307 |
+
background: var(--page-surface-strong) !important;
|
| 308 |
+
border: 1px solid rgba(19, 70, 162, 0.16) !important;
|
| 309 |
+
border-radius: 10px !important;
|
| 310 |
+
box-shadow: 0 1px 0 rgba(15, 23, 42, 0.02), 0 8px 22px rgba(15, 23, 42, 0.04) !important;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.gradio-container .block,
|
| 314 |
+
.gradio-container .wrap,
|
| 315 |
+
.gradio-container .gr-box,
|
| 316 |
+
.gradio-container .gr-form,
|
| 317 |
+
.gradio-container .gr-panel,
|
| 318 |
+
.gradio-container .gr-group,
|
| 319 |
+
.gradio-container .form,
|
| 320 |
+
.gradio-container .input-container,
|
| 321 |
+
.gradio-container .wrap-inner {
|
| 322 |
+
overflow: visible !important;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
.gradio-container label,
|
| 326 |
+
.gradio-container .label-wrap,
|
| 327 |
+
.gradio-container .caption-label {
|
| 328 |
+
color: var(--page-text) !important;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
.link-list a {
|
| 332 |
+
color: #1346a2;
|
| 333 |
+
text-decoration: none;
|
| 334 |
+
font-weight: 600;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.link-list a:hover {
|
| 338 |
+
text-decoration: underline;
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
@media (max-width: 900px) {
|
| 342 |
+
.gradio-container {
|
| 343 |
+
padding: 22px 16px 42px !important;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
.hero {
|
| 347 |
+
padding: 28px 24px 26px;
|
| 348 |
+
border-radius: 20px;
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
.hero h1 {
|
| 352 |
+
font-size: 2rem;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
.main-form,
|
| 356 |
+
.side-notes {
|
| 357 |
+
padding-right: 0;
|
| 358 |
+
padding-left: 0;
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
"""
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def build_hero_html() -> str:
    """Return the hero banner HTML: page title, intro copy, external links, and a meta line."""
    # SPACE_TITLE / GITHUB_REPO_URL / DATASET_URL / SPACE_URL are module-level constants.
    return f"""
    <section class="hero">
        <h1>{SPACE_TITLE}</h1>
        <p class="hero-copy">
            Submit a new ResearchClawBench task as a single ZIP archive. This Space validates the full task
            structure, checks JSON fields and referenced paths, allocates the next available task ID, and then
            opens a PR against the official Hugging Face dataset for maintainer review.
        </p>
        <div class="hero-links">
            <a href="{GITHUB_REPO_URL}" target="_blank">GitHub Repository</a>
            <a href="{DATASET_URL}" target="_blank">Hugging Face Dataset</a>
            <a href="{SPACE_URL}" target="_blank">Space Repository</a>
        </div>
        <div class="hero-meta">
            ZIP upload only · full task-format validation · PR to dataset repo after passing checks
        </div>
    </section>
    """
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
def field_label_html(text: str) -> str:
    """Wrap *text* in the styled field-label div shown above each form input."""
    return '<div class="field-label">' + text + '</div>'
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def submission_guide_markdown() -> str:
    """Return the sidebar markdown guide rendered next to the submission form."""
    return """
## Before You Upload

1. Put exactly one task directory at the top level of the ZIP.
2. Make sure the directory contains `task_info.json`, `data/`, `related_work/`, and `target_study/`.
3. Keep every data reference inside `task_info.json` in the `./data/...` format.
4. Make sure every checklist image path points to `target_study/images/...`.
5. Ensure that uploaded files can be redistributed through Hugging Face before submitting.

## Expected ZIP Layout

```text
your_submission.zip
└── any_folder_name/
    ├── task_info.json
    ├── data/
    ├── related_work/
    └── target_study/
        ├── checklist.json
        ├── paper.pdf
        └── images/
```

## What The Space Checks

- top-level folder structure and missing or extra files
- `task_info.json` and `checklist.json` parseability and required keys
- file naming conventions such as `related_work/paper_000.pdf`
- whether declared data paths actually exist
- whether image references actually exist
- whether old source paths or stale `/tasks/...` references remain in descriptions

Example task in GitHub:
[tasks/Astronomy_000](https://github.com/InternScience/ResearchClawBench/tree/main/tasks/Astronomy_000)
"""
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def final_task_help_html() -> str:
    """Return the caption explaining that the final task ID is allocated by the Space."""
    body = ''.join((
        'The final task ID is assigned automatically after the Space scans existing <code>tasks/</code> folders. ',
        'You do not need to choose the numeric suffix yourself. The selected domain becomes the prefix, and if the ',
        'custom field is filled, it overrides the suggested domain.',
    ))
    return '<div class="caption">' + body + '</div>'
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def resolve_domain(selected_domain: str, custom_domain: str) -> str:
    """Choose the domain token for the task ID; a non-blank custom domain wins.

    Raises ValidationError when neither field yields a usable token after
    normalization.
    """
    candidate = (custom_domain or '').strip()
    if not candidate:
        candidate = (selected_domain or '').strip()
    token = normalize_domain_token(candidate)
    if token:
        return token
    raise ValidationError('Please select a suggested domain or provide a custom domain.')
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def handle_archive_upload(archive_path: str | None):
    """Record the uploaded archive path and report which file was selected.

    Returns a ``(archive_path, status_markdown)`` pair feeding the
    ``archive_state`` and ``archive_notice`` outputs; an empty/missing path
    resets both to the "nothing selected" state.
    """
    if not archive_path:
        return '', 'No ZIP file selected yet.'
    filename = Path(archive_path).name
    # Bug fix: the message previously hard-coded a `(unknown)` placeholder and
    # never used `filename`; interpolate the actual uploaded file's name.
    return archive_path, f'Selected ZIP: `{filename}`'
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
def build_validation_markdown(prepared: PreparedSubmission) -> str:
    """Render a successful validation run as the markdown summary shown in the UI."""
    meta = prepared.metadata
    stats = prepared.archive_stats
    lines = ['## Validation passed', '']
    lines.append(f'- Final task ID: `{prepared.assigned_task_id}`')
    lines.append('- This is the folder name that will be created under `tasks/` in the dataset repo.')
    lines.append(f'- Domain token used for allocation: `{meta.domain}`')
    lines.append(f'- Submitter: `{meta.submitter}`')
    lines.append(f'- Archive file count: `{stats.file_count}`')
    lines.append(f'- Archive total bytes: `{stats.total_bytes}`')
    lines.append('')
    lines.append('You can now create a PR to the Hugging Face dataset repo.')
    return '\n'.join(lines)
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def build_failure_markdown(message: str) -> str:
    """Render a (possibly multi-line) error message as a '## Validation failed' bullet list."""
    bullets = []
    for raw_line in message.splitlines():
        stripped = raw_line.strip()
        if stripped:
            bullets.append(f'- {stripped}')
    if not bullets:
        bullets = ['- Unknown validation error']
    return '## Validation failed\n\n' + '\n'.join(bullets)
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def validate_submission(
    archive_path: str,
    suggested_domain: str,
    custom_domain: str,
    submitter: str,
    email: str,
    paper_title: str,
    paper_url: str,
    notes: str,
    current_state: dict | None,
):
    """Validate an uploaded archive and prepare it for PR creation.

    Returns a 6-tuple matching the Gradio outputs:
    (state, assigned task id, validation markdown, JSON report,
    create-PR button update, PR status message).
    """
    # Release the scratch directory left over from any previous validation run.
    if current_state:
        cleanup_work_dir(current_state.get('work_dir'))

    if not archive_path:
        return None, '', '## Validation failed\n\n- Please upload a zip file.', '{}', gr.update(interactive=False), ''

    try:
        # Bug fix: domain resolution can raise ValidationError (no domain
        # chosen). It previously ran outside this try block, so the error
        # escaped as a raw Gradio exception instead of the friendly report.
        domain = resolve_domain(suggested_domain, custom_domain)
        metadata = SubmissionMetadata(
            domain=domain,
            submitter=submitter,
            email=email,
            paper_title=paper_title,
            paper_url=paper_url,
            notes=notes or '',
        )
        existing_ids = list_existing_task_ids(repo_id=DEFAULT_REPO_ID, token=load_hf_token())
        assigned_task_id = allocate_next_task_id(domain, existing_ids)
        prepared = validate_and_prepare_submission(archive_path, metadata, assigned_task_id)
        # The PR button stays disabled unless a write token is configured.
        pr_ready = bool(load_hf_token())
        return (
            prepared.to_state(),
            prepared.assigned_task_id,
            build_validation_markdown(prepared),
            json.dumps(build_public_report(prepared), indent=2, ensure_ascii=False),
            gr.update(interactive=pr_ready),
            '' if pr_ready else 'Validation passed, but PR creation is disabled until a write token is configured.',
        )
    except ValidationError as exc:
        # Expected validation failures: one reported error per message line.
        return (
            None,
            '',
            build_failure_markdown(str(exc)),
            json.dumps({'status': 'error', 'errors': str(exc).splitlines()}, indent=2, ensure_ascii=False),
            gr.update(interactive=False),
            '',
        )
    except Exception as exc:
        # Unexpected failures (network, repo access, ...) become a single-item error list.
        return (
            None,
            '',
            build_failure_markdown(str(exc)),
            json.dumps({'status': 'error', 'errors': [str(exc)]}, indent=2, ensure_ascii=False),
            gr.update(interactive=False),
            '',
        )
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def create_pr(state: dict | None):
|
| 535 |
+
if not state:
|
| 536 |
+
return '## PR creation failed\n\n- Validate a submission first.'
|
| 537 |
+
|
| 538 |
+
prepared = PreparedSubmission.from_state(state)
|
| 539 |
+
try:
|
| 540 |
+
commit_info = create_dataset_pr(prepared, repo_id=DEFAULT_REPO_ID, token=load_hf_token())
|
| 541 |
+
pr_url = commit_info.pr_url or commit_info.commit_url
|
| 542 |
+
return '\n'.join([
|
| 543 |
+
'## PR created',
|
| 544 |
+
'',
|
| 545 |
+
f'- Task ID: `{prepared.assigned_task_id}`',
|
| 546 |
+
f'- PR: {pr_url}',
|
| 547 |
+
])
|
| 548 |
+
finally:
|
| 549 |
+
cleanup_work_dir(prepared.work_dir)
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
# Top-level Gradio UI definition. Layout: hero banner, then a two-column shell
# (form on the left, guide on the right), action buttons, and a results panel.
with gr.Blocks(title=SPACE_TITLE, theme=gr.themes.Base(), css=CSS, fill_width=True) as demo:
    # Cross-event state: prepared-submission dict and the uploaded archive path.
    state = gr.State(None)
    archive_state = gr.State('')

    gr.HTML(build_hero_html())

    with gr.Group(elem_classes=['page-shell']):
        with gr.Row(elem_classes=['section-row']):
            # Left column: the submission form.
            with gr.Column(scale=7, elem_classes=['section-copy', 'main-form']):
                gr.HTML(field_label_html('Task ZIP archive'))
                with gr.Row(elem_classes=['upload-row']):
                    archive = gr.UploadButton(
                        'Select ZIP file',
                        file_types=['.zip'],
                        file_count='single',
                        type='filepath',
                        variant='secondary',
                        elem_classes=['upload-button'],
                    )
                    archive_notice = gr.Markdown('No ZIP file selected yet.', elem_classes=['upload-status'])
                with gr.Row():
                    with gr.Column():
                        gr.HTML(field_label_html('Suggested domain'))
                        suggested_domain = gr.Dropdown(
                            choices=list(DOMAINS),
                            value='Astronomy',
                            show_label=False,
                            container=False,
                        )
                    with gr.Column():
                        gr.HTML(field_label_html('Custom domain (optional)'))
                        custom_domain = gr.Textbox(
                            placeholder='e.g. Robotics or Robot-Learning',
                            show_label=False,
                            container=False,
                        )
                gr.Markdown(
                    '<div class="caption">Use the custom field if your task does not belong to the suggested list. '
                    'If the custom field is filled, it overrides the suggested domain and becomes the prefix of the final task ID.</div>'
                )
                gr.HTML(field_label_html('Submitter name or HF username'))
                submitter = gr.Textbox(
                    placeholder='e.g. your-hf-handle',
                    show_label=False,
                    container=False,
                )
                gr.HTML(field_label_html('Contact email'))
                email = gr.Textbox(
                    placeholder='name@example.com',
                    show_label=False,
                    container=False,
                )
                gr.HTML(field_label_html('Target paper title'))
                paper_title = gr.Textbox(show_label=False, container=False)
                gr.HTML(field_label_html('Target paper URL or DOI'))
                paper_url = gr.Textbox(
                    placeholder='https://... or DOI',
                    show_label=False,
                    container=False,
                )
                gr.HTML(field_label_html('Optional notes for reviewers'))
                notes = gr.Textbox(
                    lines=4,
                    placeholder='Anything maintainers should know about licensing, preprocessing, or provenance.',
                    show_label=False,
                    container=False,
                )
            # Right column: the static submission guide.
            with gr.Column(scale=5, elem_classes=['section-copy', 'side-notes']):
                gr.Markdown(submission_guide_markdown(), elem_classes=['subtle-block'])

        with gr.Row(elem_classes=['action-row']):
            validate_btn = gr.Button('Validate ZIP', variant='primary', elem_classes=['primary-button'])
            # Disabled until a validation run succeeds (and a write token exists).
            create_pr_btn = gr.Button('Create Dataset PR', interactive=False, elem_classes=['secondary-button'])

        # Results panel: assigned ID, validation summary, JSON report, PR status.
        with gr.Column(elem_classes=['section-copy', 'results-shell']):
            gr.HTML(field_label_html('Final task ID (assigned automatically)'))
            assigned_task_id = gr.Textbox(
                interactive=False,
                show_label=False,
                container=False,
            )
            gr.Markdown(final_task_help_html())
            validation_md = gr.Markdown()
            gr.HTML(field_label_html('Validation report'))
            validation_report = gr.Code(language='json', show_label=False, container=False)
            pr_md = gr.Markdown()

    # Event wiring: upload caches the path, validate fills the results panel,
    # and the PR button consumes the prepared state.
    archive.upload(fn=handle_archive_upload, inputs=[archive], outputs=[archive_state, archive_notice])

    validate_btn.click(
        fn=validate_submission,
        inputs=[
            archive_state,
            suggested_domain,
            custom_domain,
            submitter,
            email,
            paper_title,
            paper_url,
            notes,
            state,
        ],
        outputs=[state, assigned_task_id, validation_md, validation_report, create_pr_btn, pr_md],
    )
    create_pr_btn.click(fn=create_pr, inputs=[state], outputs=[pr_md])
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
# Launch the Gradio app when this file is executed directly (local runs).
if __name__ == '__main__':
    demo.launch()
|
repo_ops.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path, PurePosixPath
|
| 4 |
+
from typing import Iterable
|
| 5 |
+
|
| 6 |
+
from huggingface_hub import CommitOperationAdd, HfApi
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from .validator import DOMAIN_TOKEN_RE, PreparedSubmission, TASK_ID_RE, normalize_domain_token
|
| 10 |
+
except ImportError:
|
| 11 |
+
from validator import DOMAIN_TOKEN_RE, PreparedSubmission, TASK_ID_RE, normalize_domain_token
|
| 12 |
+
|
| 13 |
+
# Target dataset repository that submission PRs are opened against.
DEFAULT_REPO_ID = 'InternScience/ResearchClawBench'
# Environment variables probed, in priority order, for a Hugging Face write token.
TOKEN_ENV_KEYS = (
    'RCB_SPACE_HF_TOKEN',
    'HF_TOKEN',
    'HUGGINGFACEHUB_API_TOKEN',
    'HUGGINGFACE_TOKEN',
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_hf_token() -> str | None:
    """Return the first non-empty Hugging Face token found in the environment.

    Probes ``TOKEN_ENV_KEYS`` in priority order; returns ``None`` when no
    candidate variable holds a value.
    """
    import os

    return next(
        (value for value in (os.environ.get(key) for key in TOKEN_ENV_KEYS) if value),
        None,
    )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def list_existing_task_ids(repo_id: str = DEFAULT_REPO_ID, token: str | None = None) -> set[str]:
    """Collect the task-id directory names under ``tasks/`` in the dataset repo."""
    api = HfApi(token=token)
    found: set[str] = set()
    for remote_path in api.list_repo_files(repo_id=repo_id, repo_type='dataset', token=token):
        segments = PurePosixPath(remote_path).parts
        # Only paths of the form tasks/<task_id>/... contribute a task id.
        if len(segments) < 2 or segments[0] != 'tasks':
            continue
        found.add(segments[1])
    return found
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def allocate_next_task_id(domain: str, existing_task_ids: Iterable[str]) -> str:
    """Return the next free ``{Domain}_{NNN}`` id for *domain*.

    Numbering starts at 000 and continues after the highest used number.
    Raises ``ValueError`` when the normalized domain token is invalid or the
    three-digit number space (000-999) is exhausted.
    """
    domain = normalize_domain_token(domain)
    if not DOMAIN_TOKEN_RE.fullmatch(domain):
        raise ValueError(
            'Domain must start with a letter and contain only letters, numbers, or hyphens '
            f'after normalization. Got: {domain!r}'
        )
    taken = [
        int(m.group(2))
        for m in (TASK_ID_RE.match(task_id) for task_id in existing_task_ids)
        if m and m.group(1) == domain
    ]
    candidate = max(taken) + 1 if taken else 0
    if candidate > 999:
        raise ValueError(f'No task IDs left for domain {domain}.')
    return f'{domain}_{candidate:03d}'
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def build_commit_description(prepared: PreparedSubmission) -> str:
    """Render the PR description summarizing submitter metadata and archive stats."""
    meta = prepared.metadata
    stats = prepared.archive_stats
    body = [
        f'Submitter: {meta.submitter}',
        f'Contact email: {meta.email}',
        f'Domain: {meta.domain}',
        f'Assigned task id: {prepared.assigned_task_id}',
        f'Paper title: {meta.paper_title}',
        f'Paper URL/DOI: {meta.paper_url}',
        f'Archive files: {stats.file_count}',
        f'Archive total bytes: {stats.total_bytes}',
    ]
    notes = meta.notes.strip()
    if notes:
        # Only include the notes section when the submitter actually wrote something.
        body += ['', 'Submitter notes:', notes]
    body += ['', 'This PR was created automatically by the ResearchClawBench submission Space after passing format validation.']
    return '\n'.join(body)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def create_dataset_pr(
    prepared: PreparedSubmission,
    *,
    repo_id: str = DEFAULT_REPO_ID,
    token: str | None = None,
):
    """Open a pull request against *repo_id* that adds the staged task files.

    Every file under the staged task directory is uploaded to
    ``tasks/<assigned_task_id>/...``.  Returns the result of
    ``HfApi.create_commit``; raises ``RuntimeError`` when no write token is
    configured or the staged directory is missing.
    """
    token = token or load_hf_token()
    if not token:
        raise RuntimeError('No Hugging Face write token configured. Set RCB_SPACE_HF_TOKEN or HF_TOKEN.')

    staged = Path(prepared.staged_task_dir)
    if not staged.is_dir():
        raise RuntimeError(f'Staged task directory does not exist: {staged}')

    # One upload operation per regular file, sorted for deterministic commits.
    ops = [
        CommitOperationAdd(
            path_in_repo=f'tasks/{prepared.assigned_task_id}/{file_path.relative_to(staged).as_posix()}',
            path_or_fileobj=str(file_path),
        )
        for file_path in sorted(staged.rglob('*'))
        if file_path.is_file()
    ]

    api = HfApi(token=token)
    return api.create_commit(
        repo_id=repo_id,
        repo_type='dataset',
        operations=ops,
        commit_message=f'Add task submission {prepared.assigned_task_id}',
        commit_description=build_commit_description(prepared),
        token=token,
        create_pr=True,
        revision='main',
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gradio is provided by the Hugging Face Space SDK via the `sdk_version` field in the README.md front matter.
|
| 2 |
+
huggingface_hub>=0.34.0,<1
|
validator.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import shutil
|
| 7 |
+
import stat
|
| 8 |
+
import tempfile
|
| 9 |
+
import zipfile
|
| 10 |
+
from dataclasses import asdict, dataclass
|
| 11 |
+
from pathlib import Path, PurePosixPath
|
| 12 |
+
from typing import Any
|
| 13 |
+
|
| 14 |
+
# Canonical benchmark domains offered in the submission UI.
DOMAINS = (
    'Astronomy',
    'Chemistry',
    'Earth',
    'Energy',
    'Information',
    'Life',
    'Material',
    'Math',
    'Neuroscience',
    'Physics',
)
# A normalized domain token: starts with a letter, then letters/digits/hyphens.
DOMAIN_TOKEN_RE = re.compile(r'^[A-Za-z][A-Za-z0-9-]*$')
# Task directory ids look like '<Domain>_<NNN>' with a zero-padded 3-digit number.
TASK_ID_RE = re.compile(r'^([A-Za-z][A-Za-z0-9-]*)_(\d{3})$')
# Substrings indicating leftover internal source paths or legacy layout names.
STALE_TOKENS = (
    '/mnt/shared-storage-user/',
    'SGI-EvalAgent',
    'prior_literature',
    'target_literature',
)
# Absolute or './'-relative references into a tasks/ tree (stale after re-homing).
STALE_TASK_REF_RE = re.compile(r'(?:\./|/)tasks/[\w/.\-]+')
# './data/...' references inside free text, terminated by quote/newline/; or ,.
DATA_REF_RE = re.compile(r"""\./data/[^'"`\n;,]+""")
# Exact set of entries required at the top of a task directory.
EXPECTED_TOP_LEVEL = {'data', 'related_work', 'target_study', 'task_info.json'}
# Exact sorted entries required inside target_study/ ('/' marks a directory).
EXPECTED_TARGET_STUDY = ('checklist.json', 'images/', 'paper.pdf')
# Sorted key tuples expected in the JSON payloads.
EXPECTED_TASK_INFO_KEYS = ('data', 'task')
EXPECTED_DATA_ITEM_KEYS = ('description', 'name', 'path', 'type')
EXPECTED_CHECKLIST_ITEM_KEYS = ('content', 'keywords', 'path', 'type', 'weight')
# Archive junk (macOS metadata) silently skipped during extraction.
IGNORED_ARCHIVE_PARTS = {'__MACOSX'}
IGNORED_ARCHIVE_NAMES = {'.DS_Store'}
# Upload limits, overridable via environment variables.
DEFAULT_MAX_FILES = int(os.environ.get('RCB_SPACE_MAX_FILES', '5000'))
DEFAULT_MAX_TOTAL_BYTES = int(os.environ.get('RCB_SPACE_MAX_TOTAL_BYTES', str(5 * 1024 * 1024 * 1024)))  # 5 GiB
DEFAULT_MAX_SINGLE_FILE_BYTES = int(os.environ.get('RCB_SPACE_MAX_SINGLE_FILE_BYTES', str(1024 * 1024 * 1024)))  # 1 GiB
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class SubmissionMetadata:
    """User-entered form metadata accompanying a task archive upload."""

    domain: str  # benchmark domain; normalized by normalize_domain_token() before use
    submitter: str  # submitter name or Hugging Face username
    email: str  # contact email address
    paper_title: str  # title of the target paper
    paper_url: str  # URL or DOI of the target paper
    notes: str = ''  # optional free-form notes for maintainers
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class ArchiveStats:
    """Counters gathered while vetting an uploaded zip archive."""

    file_count: int  # number of regular files (junk entries excluded)
    total_bytes: int  # sum of declared uncompressed file sizes
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@dataclass
class PreparedSubmission:
    """A validated, staged submission ready to be turned into a dataset PR."""

    work_dir: str  # scratch directory owning all extracted/staged content
    uploaded_task_dir: str  # the single task directory found in the archive
    staged_task_dir: str  # copy of the task dir renamed to assigned_task_id
    assigned_task_id: str  # allocated '<Domain>_<NNN>' id
    archive_stats: ArchiveStats
    metadata: SubmissionMetadata

    def to_state(self) -> dict[str, Any]:
        """Serialize to a plain JSON-safe dict (nested dataclasses become dicts)."""
        return asdict(self)

    @classmethod
    def from_state(cls, state: dict[str, Any]) -> 'PreparedSubmission':
        """Rebuild a PreparedSubmission from a dict produced by ``to_state``."""
        plain = {
            key: state[key]
            for key in ('work_dir', 'uploaded_task_dir', 'staged_task_dir', 'assigned_task_id')
        }
        return cls(
            archive_stats=ArchiveStats(**state['archive_stats']),
            metadata=SubmissionMetadata(**state['metadata']),
            **plain,
        )
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class ValidationError(RuntimeError):
    """Raised when an uploaded archive or its metadata fails format validation."""
    pass
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def normalize_domain_token(domain: str) -> str:
    """Canonicalize a user-supplied domain string into a hyphenated token.

    Runs of whitespace/underscores become single hyphens, hyphen runs are
    collapsed, and leading/trailing hyphens are stripped.  Falsy input
    yields the empty string.
    """
    token = (domain or '').strip()
    token = re.sub(r'[\s_]+', '-', token)
    token = re.sub(r'-{2,}', '-', token)
    return token.strip('-')
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def load_json(path: Path) -> Any:
    """Parse *path* as UTF-8 JSON, wrapping any failure in ``ValidationError``."""
    try:
        raw = path.read_text(encoding='utf-8')
        return json.loads(raw)
    except Exception as exc:
        raise ValidationError(f'Failed to parse JSON: {path}: {exc}') from exc
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def rel(path: Path, base: Path) -> str:
    """Best-effort display path: relative to *base* when possible, else as-is."""
    try:
        relative = path.relative_to(base)
    except Exception:
        return str(path)
    return str(relative)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _target_entries(target_dir: Path) -> tuple[str, ...]:
|
| 120 |
+
return tuple(sorted(x.name + ('/' if x.is_dir() else '') for x in target_dir.iterdir()))
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _is_ignored_archive_path(path: PurePosixPath) -> bool:
    """True for macOS metadata and other junk entries that are silently dropped."""
    if path.name in IGNORED_ARCHIVE_NAMES or path.name.startswith('._'):
        return True
    return any(part in IGNORED_ARCHIVE_PARTS for part in path.parts)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _is_zip_symlink(info: zipfile.ZipInfo) -> bool:
|
| 128 |
+
mode = info.external_attr >> 16
|
| 129 |
+
return stat.S_ISLNK(mode)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _iter_data_refs(text: str) -> list[str]:
    """Extract unique './data/...' references from *text*, in first-seen order."""
    seen: list[str] = []
    for candidate in DATA_REF_RE.findall(text):
        cleaned = candidate.rstrip('.')  # drop sentence-ending periods
        if cleaned not in seen:
            seen.append(cleaned)
    return seen
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def cleanup_work_dir(work_dir: str | Path | None) -> None:
    """Delete a scratch directory, tolerating absence and deletion errors."""
    if work_dir:
        shutil.rmtree(Path(work_dir), ignore_errors=True)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def create_work_dir() -> Path:
    """Allocate a fresh scratch directory for one submission attempt."""
    temp_path = tempfile.mkdtemp(prefix='rcb_space_submit_')
    return Path(temp_path)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def extract_submission_zip(
    archive_path: str | Path,
    work_dir: str | Path,
    *,
    max_files: int = DEFAULT_MAX_FILES,
    max_total_bytes: int = DEFAULT_MAX_TOTAL_BYTES,
    max_single_file_bytes: int = DEFAULT_MAX_SINGLE_FILE_BYTES,
) -> tuple[Path, ArchiveStats]:
    """Safely extract the uploaded .zip into ``work_dir/extracted``.

    Rejects non-zip uploads, absolute or ``..`` (zip-slip) entry paths,
    symlink entries, and archives exceeding the file-count / size limits.
    macOS junk entries (``__MACOSX``, ``.DS_Store``, AppleDouble ``._*``)
    are skipped rather than rejected.

    Returns the extraction root plus counted ``ArchiveStats``.
    Raises ``ValidationError`` on any rejected archive.
    """
    archive_path = Path(archive_path)
    work_dir = Path(work_dir)
    extract_root = work_dir / 'extracted'
    extract_root.mkdir(parents=True, exist_ok=True)

    if archive_path.suffix.lower() != '.zip':
        raise ValidationError('Only .zip uploads are supported.')

    file_count = 0
    total_bytes = 0
    safe_infos: list[tuple[zipfile.ZipInfo, PurePosixPath]] = []

    with zipfile.ZipFile(archive_path) as zf:
        infos = zf.infolist()
        if not infos:
            raise ValidationError('The uploaded zip archive is empty.')

        # First pass: vet every entry before writing anything to disk.
        for info in infos:
            raw_name = info.filename.replace('\\', '/')  # tolerate Windows-built archives
            if not raw_name:
                continue
            posix_path = PurePosixPath(raw_name)
            if _is_ignored_archive_path(posix_path):
                continue
            # Zip-slip guard: no absolute paths, no parent-directory escapes.
            if posix_path.is_absolute() or '..' in posix_path.parts:
                raise ValidationError(f'Archive contains an invalid path: {raw_name}')
            if _is_zip_symlink(info):
                raise ValidationError(f'Archive contains a symbolic link, which is not allowed: {raw_name}')
            safe_infos.append((info, posix_path))
            if info.is_dir():
                continue
            file_count += 1
            total_bytes += info.file_size  # declared (uncompressed) size
            if info.file_size > max_single_file_bytes:
                raise ValidationError(
                    f'Archive file exceeds the per-file limit ({max_single_file_bytes} bytes): {raw_name}'
                )
            if file_count > max_files:
                raise ValidationError(f'Archive exceeds the file-count limit ({max_files}).')
            if total_bytes > max_total_bytes:
                raise ValidationError(f'Archive exceeds the total-size limit ({max_total_bytes} bytes).')

        if file_count == 0:
            raise ValidationError('The uploaded zip archive does not contain any files.')

        # Second pass: write only the vetted entries under extract_root.
        for info, posix_path in safe_infos:
            destination = extract_root.joinpath(*posix_path.parts)
            if info.is_dir():
                destination.mkdir(parents=True, exist_ok=True)
                continue
            destination.parent.mkdir(parents=True, exist_ok=True)
            # Stream the entry to disk instead of loading it into memory.
            with zf.open(info) as src, destination.open('wb') as dst:
                shutil.copyfileobj(src, dst)

    return extract_root, ArchiveStats(file_count=file_count, total_bytes=total_bytes)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def find_single_task_dir(extract_root: str | Path) -> Path:
    """Return the single top-level task directory inside the extracted archive.

    Junk entries (``.DS_Store``, ``__MACOSX``, AppleDouble ``._*`` files) are
    ignored.  Anything other than exactly one directory raises
    ``ValidationError``.
    """
    root = Path(extract_root)
    candidates = [
        entry
        for entry in sorted(root.iterdir(), key=lambda p: p.name.lower())
        if entry.name not in IGNORED_ARCHIVE_NAMES
        and entry.name not in IGNORED_ARCHIVE_PARTS
        and not entry.name.startswith('._')
    ]

    if len(candidates) == 1 and candidates[0].is_dir():
        return candidates[0]

    names = [entry.name for entry in candidates]
    raise ValidationError(
        'Zip must contain exactly one top-level task directory. '
        f'Found: {names if names else "(none)"}'
    )
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def validate_submission_metadata(metadata: SubmissionMetadata) -> list[str]:
    """Return human-readable problems with the submission form (empty list = OK)."""
    problems: list[str] = []

    domain_token = normalize_domain_token(metadata.domain)
    if not domain_token:
        problems.append('A domain is required.')
    elif not DOMAIN_TOKEN_RE.fullmatch(domain_token):
        problems.append(
            'Domain must start with a letter and contain only letters, numbers, or hyphens '
            f'after normalization. Got: {metadata.domain!r}'
        )

    if not metadata.submitter.strip():
        problems.append('Submitter name or HF username is required.')

    email = metadata.email.strip()
    if not email:
        problems.append('Contact email is required.')
    elif not re.fullmatch(r'[^@\s]+@[^@\s]+\.[^@\s]+', email):
        problems.append('Contact email must be a valid email address.')

    if not metadata.paper_title.strip():
        problems.append('Paper title is required.')
    if not metadata.paper_url.strip():
        problems.append('Paper URL or DOI is required.')
    return problems
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def validate_task_dir(
    task_dir: str | Path,
    *,
    enforce_task_name: bool = True,
    expected_domain: str | None = None,
) -> list[str]:
    """Validate the on-disk ResearchClawBench task layout rooted at *task_dir*.

    Checks, in order: directory naming, required top-level entries,
    ``task_info.json`` schema and data coverage, ``related_work`` PDF naming,
    ``target_study`` contents, ``checklist.json`` schema and image coverage,
    and the absence of stale internal path references.

    Returns a list of human-readable error strings; an empty list means the
    task passed every check.  When a core file (manifest/checklist) is
    missing or unparseable, validation short-circuits with the errors found
    so far.
    """
    task_dir = Path(task_dir)
    errors: list[str] = []
    task_name = task_dir.name
    match = TASK_ID_RE.match(task_name)

    if enforce_task_name:
        if not match:
            errors.append(f'invalid task directory name: {task_name}')
        elif expected_domain and match.group(1) != expected_domain:
            errors.append(f'task directory domain {match.group(1)!r} does not match selected domain {expected_domain!r}')
    # Even without name enforcement, a well-formed name must agree with the domain.
    elif expected_domain and match and match.group(1) != expected_domain:
        errors.append(f'task directory domain {match.group(1)!r} does not match selected domain {expected_domain!r}')

    if not task_dir.is_dir():
        return [f'task directory does not exist: {task_dir}']

    actual_top = {p.name for p in task_dir.iterdir()}
    if actual_top != EXPECTED_TOP_LEVEL:
        errors.append(f'top-level entries mismatch: expected {sorted(EXPECTED_TOP_LEVEL)}, got {sorted(actual_top)}')

    data_dir = task_dir / 'data'
    related_dir = task_dir / 'related_work'
    target_dir = task_dir / 'target_study'
    task_info_path = task_dir / 'task_info.json'
    checklist_path = target_dir / 'checklist.json'
    paper_path = target_dir / 'paper.pdf'
    images_dir = target_dir / 'images'

    if not data_dir.is_dir():
        errors.append('missing data/ directory')
    if not related_dir.is_dir():
        errors.append('missing related_work/ directory')
    if not target_dir.is_dir():
        errors.append('missing target_study/ directory')
    if not task_info_path.is_file():
        errors.append('missing task_info.json')
        return errors  # cannot continue without the task manifest
    if not checklist_path.is_file():
        errors.append('missing target_study/checklist.json')
        return errors  # checklist checks below depend on this file
    if not paper_path.is_file():
        errors.append('missing target_study/paper.pdf')
    if not images_dir.is_dir():
        errors.append('missing target_study/images/ directory')

    try:
        task_info = load_json(task_info_path)
    except ValidationError as exc:
        errors.append(str(exc))
        return errors

    # Schema: exactly the keys ('data', 'task').
    # NOTE(review): assumes task_info is a JSON object — a non-object payload
    # would raise AttributeError here; confirm upstream guarantees.
    if tuple(sorted(task_info.keys())) != EXPECTED_TASK_INFO_KEYS:
        errors.append(f'task_info.json keys mismatch: {sorted(task_info.keys())}')

    if not isinstance(task_info.get('task'), str) or not task_info['task'].strip():
        errors.append('task_info.json field `task` must be a non-empty string')
    if not isinstance(task_info.get('data'), list):
        errors.append('task_info.json field `data` must be a list')
        task_info['data'] = []  # continue with an empty list so later loops are safe

    # Files under data/ accounted for by declared `data` entries.
    covered_files: set[Path] = set()
    declared_paths: set[str] = set()
    for idx, item in enumerate(task_info['data']):
        prefix = f'task_info.data[{idx}]'
        if tuple(sorted(item.keys())) != EXPECTED_DATA_ITEM_KEYS:
            errors.append(f'{prefix} keys mismatch: {sorted(item.keys())}')
            continue
        for field in EXPECTED_DATA_ITEM_KEYS:
            if not isinstance(item.get(field), str) or not item[field].strip():
                errors.append(f'{prefix}.{field} must be a non-empty string')
        data_path = item.get('path')
        if not isinstance(data_path, str):
            continue
        if not data_path.startswith('./data/') and data_path != './data':
            errors.append(f'{prefix}.path must start with ./data/: {data_path}')
            continue
        if '\\' in data_path or '..' in Path(data_path).parts:
            errors.append(f'{prefix}.path contains an invalid segment: {data_path}')
            continue
        if data_path in declared_paths:
            errors.append(f'duplicate data path declaration: {data_path}')
            continue
        declared_paths.add(data_path)
        rel_path = data_path[2:] if data_path.startswith('./') else data_path  # drop './' prefix
        target = task_dir / rel_path
        if not target.exists():
            errors.append(f'{prefix}.path does not exist: {data_path}')
            continue
        if target.is_file():
            covered_files.add(target)
        elif target.is_dir():
            # A directory declaration covers every file nested under it.
            nested_files = {p for p in target.rglob('*') if p.is_file()}
            if not nested_files:
                errors.append(f'{prefix}.path points to an empty directory: {data_path}')
            covered_files.update(nested_files)
        else:
            errors.append(f'{prefix}.path is neither file nor directory: {data_path}')
        description = item.get('description', '')
        if any(token in description for token in STALE_TOKENS):
            errors.append(f'{prefix}.description still contains stale source paths or legacy directories')

    # Declared coverage must exactly match what exists on disk under data/.
    actual_data_files = {p for p in data_dir.rglob('*') if p.is_file()} if data_dir.exists() else set()
    uncovered = sorted(actual_data_files - covered_files)
    if uncovered:
        errors.append('data/ contains undeclared files: ' + ', '.join(rel(p, task_dir) for p in uncovered[:20]))
    missing_backing = sorted(covered_files - actual_data_files)
    if missing_backing:
        errors.append('declared data coverage points outside data/: ' + ', '.join(rel(p, task_dir) for p in missing_backing[:20]))

    # related_work/ must be a flat list of paper_000.pdf, paper_001.pdf, ...
    related_entries = sorted(related_dir.iterdir(), key=lambda p: p.name) if related_dir.exists() else []
    related_files = [p for p in related_entries if p.is_file()]
    related_dirs = [p for p in related_entries if p.is_dir()]
    if related_dirs:
        errors.append('related_work/ must not contain subdirectories')
    if not related_files:
        errors.append('related_work/ must contain at least one PDF')
    pdf_names = []
    for path in related_files:
        if not re.fullmatch(r'paper_\d{3}\.pdf', path.name):
            errors.append(f'invalid related_work filename: {path.name}')
        pdf_names.append(path.name)
    expected_pdf_names = [f'paper_{i:03d}.pdf' for i in range(len(pdf_names))]
    if pdf_names and pdf_names != expected_pdf_names:
        errors.append(f'related_work PDFs must be contiguous starting from paper_000.pdf; got {pdf_names}')

    if target_dir.exists() and _target_entries(target_dir) != EXPECTED_TARGET_STUDY:
        errors.append(f'target_study entries mismatch: {_target_entries(target_dir)}')

    try:
        checklist = load_json(checklist_path)
    except ValidationError as exc:
        errors.append(str(exc))
        return errors

    if not isinstance(checklist, list) or not checklist:
        errors.append('checklist.json must be a non-empty list')
        checklist = []  # skip the per-item checks below

    # Image paths referenced by checklist entries (relative to target_study/).
    referenced_images: set[str] = set()
    for idx, item in enumerate(checklist):
        prefix = f'checklist[{idx}]'
        if tuple(sorted(item.keys())) != EXPECTED_CHECKLIST_ITEM_KEYS:
            errors.append(f'{prefix} keys mismatch: {sorted(item.keys())}')
            continue
        item_type = item.get('type')
        if item_type not in {'text', 'image'}:
            errors.append(f'{prefix}.type must be text or image, got {item_type!r}')
        if not isinstance(item.get('content'), str) or not item['content'].strip():
            errors.append(f'{prefix}.content must be a non-empty string')
        if not isinstance(item.get('keywords'), list) or not item['keywords']:
            errors.append(f'{prefix}.keywords must be a non-empty list')
        elif not all(isinstance(x, str) and x.strip() for x in item['keywords']):
            errors.append(f'{prefix}.keywords must contain only non-empty strings')
        if not isinstance(item.get('weight'), (int, float)) or item['weight'] <= 0:
            errors.append(f'{prefix}.weight must be a positive number')
        path_value = item.get('path')
        if item_type == 'text':
            if path_value is not None:
                errors.append(f'{prefix}.path must be null for text items')
        elif item_type == 'image':
            if not isinstance(path_value, str) or not path_value.startswith('images/'):
                errors.append(f'{prefix}.path must start with images/ for image items')
            else:
                if '\\' in path_value or '..' in Path(path_value).parts:
                    errors.append(f'{prefix}.path contains an invalid segment: {path_value}')
                image_path = target_dir / path_value
                if not image_path.is_file():
                    errors.append(f'{prefix}.path does not exist: {path_value}')
                referenced_images.add(path_value)

    # Image coverage must be exact in both directions.
    actual_image_files = {str(p.relative_to(target_dir)) for p in images_dir.rglob('*') if p.is_file()} if images_dir.exists() else set()
    extra_images = sorted(actual_image_files - referenced_images)
    missing_images = sorted(referenced_images - actual_image_files)
    if extra_images:
        errors.append('target_study/images contains unreferenced files: ' + ', '.join(extra_images[:20]))
    if missing_images:
        errors.append('checklist image references are missing from target_study/images: ' + ', '.join(missing_images[:20]))

    # No stale internal path tokens may survive anywhere in the JSON payloads.
    for text_path in (task_info_path, checklist_path):
        text = text_path.read_text(encoding='utf-8')
        if any(token in text for token in STALE_TOKENS):
            errors.append(f'stale source path tokens remain in {rel(text_path, task_dir)}')

    task_text = task_info.get('task', '') if isinstance(task_info, dict) else ''
    if isinstance(task_text, str):
        for ref in STALE_TASK_REF_RE.findall(task_text):
            errors.append(f'task description contains stale path: {ref}')

    # Descriptions must not reference tasks/ trees, and any ./data/... refs
    # they mention must actually exist inside the task directory.
    for idx, item in enumerate(task_info.get('data', [])):
        desc = item.get('description', '')
        for ref in STALE_TASK_REF_RE.findall(desc):
            errors.append(f'task_info.data[{idx}].description contains stale path: {ref}')
        for data_ref in _iter_data_refs(desc):
            rel_ref = data_ref[2:]  # strip './'
            if not (task_dir / rel_ref).exists():
                errors.append(f'task_info.data[{idx}].description references non-existent path: {data_ref}')

    return errors
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
def stage_submission(task_dir: str | Path, assigned_task_id: str, work_dir: str | Path) -> Path:
    """Copy the uploaded task into ``work_dir/staged/<assigned_task_id>``.

    Any previous staging for the same id is removed first.  Returns the
    staged task directory.
    """
    source = Path(task_dir)
    destination = Path(work_dir) / 'staged' / assigned_task_id
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination)
    return destination
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def build_public_report(prepared: PreparedSubmission) -> dict[str, Any]:
    """Build the JSON-safe validation summary shown to the submitter.

    Deliberately omits the contact email and notes from the public report.
    """
    stats = prepared.archive_stats
    meta = prepared.metadata
    return {
        'status': 'ok',
        'assigned_task_id': prepared.assigned_task_id,
        'archive': {
            'file_count': stats.file_count,
            'total_bytes': stats.total_bytes,
        },
        'metadata': {
            'domain': meta.domain,
            'submitter': meta.submitter,
            'paper_title': meta.paper_title,
            'paper_url': meta.paper_url,
        },
    }
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def validate_and_prepare_submission(
    archive_path: str | Path,
    metadata: SubmissionMetadata,
    assigned_task_id: str,
) -> PreparedSubmission:
    """End-to-end pipeline: validate the form, extract/validate the zip, stage it.

    On any failure the scratch directory is removed before the exception is
    re-raised; on success the caller owns ``work_dir`` and must clean it up.
    """
    form_errors = validate_submission_metadata(metadata)
    if form_errors:
        raise ValidationError('\n'.join(form_errors))

    work_dir = create_work_dir()
    try:
        extract_root, stats = extract_submission_zip(archive_path, work_dir)
        task_dir = find_single_task_dir(extract_root)
        structure_errors = validate_task_dir(
            task_dir, enforce_task_name=False, expected_domain=metadata.domain
        )
        if structure_errors:
            raise ValidationError('\n'.join(structure_errors))
        staged = stage_submission(task_dir, assigned_task_id, work_dir)
    except Exception:
        cleanup_work_dir(work_dir)
        raise
    return PreparedSubmission(
        work_dir=str(work_dir),
        uploaded_task_dir=str(task_dir),
        staged_task_dir=str(staged),
        assigned_task_id=assigned_task_id,
        archive_stats=stats,
        metadata=metadata,
    )
|