Spaces:
Sleeping
Single-repo dataset hosting on HF (GLUE-style subdirs)
Browse files

All 4 tasks now live under one public dataset repo
`lanczos/graphtestbed-data`, organized as one subdir per task:
graphtestbed-data/
arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
figraph/... (+ edges20{14..18}.csv)
ibm-aml/...
ieee-fraud-detection/...
Why one repo: easier to manage permissions, README, versioning. New tasks
become a `git push` of a folder, not a new HF repo per dataset.
Changes:
- server/space/push_data.py: one-shot uploader (folder per task), writes
a top-level dataset card too.
- manifest.yaml: every task points at lanczos/graphtestbed-data with a new
`hf_subdir: <task>` field.
- graphtestbed/fetch.py: prepends hf_subdir/ when downloading.
- agents/common/tasks.py: instruction text now points the agent at the HF
source explicitly + reminds them not to query upstream for test labels.
Test labels remain in the private companion repo (lanczos/graphtestbed-gt)
and never enter the public dataset.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- agents/common/tasks.py +6 -1
- datasets/manifest.yaml +8 -4
- graphtestbed/fetch.py +8 -1
- server/space/push_data.py +170 -0
|
@@ -24,8 +24,13 @@ _TEMPLATE = """\
|
|
| 24 |
- `val_features.csv` — labeled validation rows (use for HPO / early stopping)
|
| 25 |
- `test_features.csv` — **unlabeled** test rows; predict here
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
The `Label` (or task-specific target) column is present in train/val and
|
| 28 |
-
absent from test.
|
| 29 |
|
| 30 |
## Submission format
|
| 31 |
|
|
|
|
| 24 |
- `val_features.csv` — labeled validation rows (use for HPO / early stopping)
|
| 25 |
- `test_features.csv` — **unlabeled** test rows; predict here
|
| 26 |
|
| 27 |
+
These are pulled from `lanczos/graphtestbed-data` on HuggingFace (subdir
|
| 28 |
+
`{task}/`). **Train and HPO on these files only** — do not pull from the
|
| 29 |
+
upstream source mentioned above to recover test labels. The benchmark is
|
| 30 |
+
non-adversarial; we trust agent authors to honor the contract.
|
| 31 |
+
|
| 32 |
The `Label` (or task-specific target) column is present in train/val and
|
| 33 |
+
absent from test.
|
| 34 |
|
| 35 |
## Submission format
|
| 36 |
|
|
@@ -1,5 +1,6 @@
|
|
| 1 |
ieee-fraud-detection:
|
| 2 |
-
hf_repo:
|
|
|
|
| 3 |
hf_revision: main
|
| 4 |
files:
|
| 5 |
train_features:
|
|
@@ -44,7 +45,8 @@ ieee-fraud-detection:
|
|
| 44 |
backend_config:
|
| 45 |
competition: ieee-fraud-detection
|
| 46 |
arxiv-citation:
|
| 47 |
-
hf_repo:
|
|
|
|
| 48 |
hf_revision: main
|
| 49 |
files:
|
| 50 |
train_features:
|
|
@@ -79,7 +81,8 @@ arxiv-citation:
|
|
| 79 |
for this task). The split is balanced enough (~42.7% positive) that AUC-ROC discriminates
|
| 80 |
models well.'
|
| 81 |
figraph:
|
| 82 |
-
hf_repo:
|
|
|
|
| 83 |
hf_revision: main
|
| 84 |
files:
|
| 85 |
train_features:
|
|
@@ -129,7 +132,8 @@ figraph:
|
|
| 129 |
Metric: AUC-ROC. The FiGraph paper uses AUC-ROC for the company anomaly task (~4.7%
|
| 130 |
positive); secondary AUC-PR and F1 reported for context.'
|
| 131 |
ibm-aml:
|
| 132 |
-
hf_repo:
|
|
|
|
| 133 |
hf_revision: main
|
| 134 |
files:
|
| 135 |
train_features:
|
|
|
|
| 1 |
ieee-fraud-detection:
|
| 2 |
+
hf_repo: lanczos/graphtestbed-data
|
| 3 |
+
hf_subdir: ieee-fraud-detection
|
| 4 |
hf_revision: main
|
| 5 |
files:
|
| 6 |
train_features:
|
|
|
|
| 45 |
backend_config:
|
| 46 |
competition: ieee-fraud-detection
|
| 47 |
arxiv-citation:
|
| 48 |
+
hf_repo: lanczos/graphtestbed-data
|
| 49 |
+
hf_subdir: arxiv-citation
|
| 50 |
hf_revision: main
|
| 51 |
files:
|
| 52 |
train_features:
|
|
|
|
| 81 |
for this task). The split is balanced enough (~42.7% positive) that AUC-ROC discriminates
|
| 82 |
models well.'
|
| 83 |
figraph:
|
| 84 |
+
hf_repo: lanczos/graphtestbed-data
|
| 85 |
+
hf_subdir: figraph
|
| 86 |
hf_revision: main
|
| 87 |
files:
|
| 88 |
train_features:
|
|
|
|
| 132 |
Metric: AUC-ROC. The FiGraph paper uses AUC-ROC for the company anomaly task (~4.7%
|
| 133 |
positive); secondary AUC-PR and F1 reported for context.'
|
| 134 |
ibm-aml:
|
| 135 |
+
hf_repo: lanczos/graphtestbed-data
|
| 136 |
+
hf_subdir: ibm-aml
|
| 137 |
hf_revision: main
|
| 138 |
files:
|
| 139 |
train_features:
|
|
@@ -29,12 +29,19 @@ def fetch_task(task: str, allow_unverified: bool = False) -> Path:
|
|
| 29 |
out = cache_dir() / task
|
| 30 |
out.mkdir(parents=True, exist_ok=True)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
n_unpinned = 0
|
| 33 |
for key, spec in cfg["files"].items():
|
|
|
|
|
|
|
| 34 |
try:
|
| 35 |
local = hf_hub_download(
|
| 36 |
repo_id=cfg["hf_repo"],
|
| 37 |
-
filename=
|
| 38 |
revision=cfg.get("hf_revision", "main"),
|
| 39 |
repo_type="dataset",
|
| 40 |
cache_dir=str(out / "_hf_cache"),
|
|
|
|
| 29 |
out = cache_dir() / task
|
| 30 |
out.mkdir(parents=True, exist_ok=True)
|
| 31 |
|
| 32 |
+
# If hf_subdir is set, the file is laid out as <subdir>/<filename> inside
|
| 33 |
+
# the repo (GLUE-style single-repo-many-subsets). Older single-repo-per-
|
| 34 |
+
# task entries leave hf_subdir unset and use bare filenames.
|
| 35 |
+
hf_subdir = cfg.get("hf_subdir", "").strip("/")
|
| 36 |
+
|
| 37 |
n_unpinned = 0
|
| 38 |
for key, spec in cfg["files"].items():
|
| 39 |
+
path_in_repo = (f"{hf_subdir}/{spec['filename']}"
|
| 40 |
+
if hf_subdir else spec["filename"])
|
| 41 |
try:
|
| 42 |
local = hf_hub_download(
|
| 43 |
repo_id=cfg["hf_repo"],
|
| 44 |
+
filename=path_in_repo,
|
| 45 |
revision=cfg.get("hf_revision", "main"),
|
| 46 |
repo_type="dataset",
|
| 47 |
cache_dir=str(out / "_hf_cache"),
|
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""One-shot uploader for the agent-visible features (train/val/test) to a
|
| 2 |
+
single public HF dataset repo, organized GLUE-style as one subdir per task.
|
| 3 |
+
|
| 4 |
+
Layout in the repo:
|
| 5 |
+
|
| 6 |
+
lanczos/graphtestbed-data/
|
| 7 |
+
├── README.md
|
| 8 |
+
├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
|
| 9 |
+
├── figraph/...
|
| 10 |
+
├── ibm-aml/...
|
| 11 |
+
└── ieee-fraud-detection/...
|
| 12 |
+
|
| 13 |
+
The test_features.csv MUST already have its label column stripped out — this
|
| 14 |
+
script does NOT strip it. Spot-check before upload by running with --dry-run.
|
| 15 |
+
|
| 16 |
+
Usage:
|
| 17 |
+
HF_TOKEN=hf_xxx python server/space/push_data.py \
|
| 18 |
+
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data
|
| 19 |
+
# or one task at a time:
|
| 20 |
+
HF_TOKEN=hf_xxx python server/space/push_data.py \
|
| 21 |
+
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
|
| 22 |
+
--tasks figraph arxiv-citation
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import argparse
|
| 28 |
+
import os
|
| 29 |
+
import sys
|
| 30 |
+
import tempfile
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
import yaml
|
| 34 |
+
from huggingface_hub import HfApi, create_repo
|
| 35 |
+
|
| 36 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 37 |
+
MANIFEST = REPO_ROOT / "datasets" / "manifest.yaml"
|
| 38 |
+
|
| 39 |
+
FILES = ["train_features.csv", "val_features.csv",
|
| 40 |
+
"test_features.csv", "sample_submission.csv"]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _readme(tasks: list[str], cfg: dict) -> str:
|
| 44 |
+
lines = [
|
| 45 |
+
"---",
|
| 46 |
+
"license: mit",
|
| 47 |
+
"tags: [graph, benchmark, fraud-detection, graph-ml]",
|
| 48 |
+
"---",
|
| 49 |
+
"",
|
| 50 |
+
"# GraphTestbed Datasets",
|
| 51 |
+
"",
|
| 52 |
+
"Public train/val/test features for the four [GraphTestbed]"
|
| 53 |
+
"(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
|
| 54 |
+
" held privately by the scoring server.",
|
| 55 |
+
"",
|
| 56 |
+
"## Why a single repo",
|
| 57 |
+
"",
|
| 58 |
+
"GLUE-style: one repo, one subdir per task, one README. Adding a"
|
| 59 |
+
" new task is a `git push` of one folder, not a new HF repo.",
|
| 60 |
+
"",
|
| 61 |
+
"## Subsets",
|
| 62 |
+
"",
|
| 63 |
+
"| Task | id col | metric | rows (train/val/test) | Source |",
|
| 64 |
+
"| --- | --- | --- | --- | --- |",
|
| 65 |
+
]
|
| 66 |
+
for t in tasks:
|
| 67 |
+
c = cfg[t]
|
| 68 |
+
s = c["submission_schema"]
|
| 69 |
+
m = c["metric"]
|
| 70 |
+
# Pull the first sentence of the description as the source line
|
| 71 |
+
desc = (c.get("description", "") or "").split(".")[0]
|
| 72 |
+
lines.append(
|
| 73 |
+
f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
|
| 74 |
+
f"see csv | {desc.strip()[:60]} |"
|
| 75 |
+
)
|
| 76 |
+
lines += [
|
| 77 |
+
"",
|
| 78 |
+
"## Use",
|
| 79 |
+
"",
|
| 80 |
+
"```python",
|
| 81 |
+
"from huggingface_hub import hf_hub_download",
|
| 82 |
+
"import pandas as pd",
|
| 83 |
+
"",
|
| 84 |
+
"p = hf_hub_download(",
|
| 85 |
+
" 'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
|
| 86 |
+
" repo_type='dataset',",
|
| 87 |
+
")",
|
| 88 |
+
"train = pd.read_csv(p)",
|
| 89 |
+
"```",
|
| 90 |
+
"",
|
| 91 |
+
"**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
|
| 92 |
+
" IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
|
| 93 |
+
" HPO on what's in this repo only.",
|
| 94 |
+
"",
|
| 95 |
+
"Test labels are scored against a private companion repo by the"
|
| 96 |
+
" GraphTestbed server: <https://lanczos-graphtestbed.hf.space/>.",
|
| 97 |
+
]
|
| 98 |
+
return "\n".join(lines)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def main() -> None:
    """CLI entry point: validate the local CSV layout, then push one folder
    per task (plus a generated dataset card) to the public HF dataset repo.

    Exits non-zero when expected files are missing or (for a real push) when
    no write token is available in the environment.
    """
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data) — "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    cfg = yaml.safe_load(MANIFEST.read_text())
    tasks = args.tasks or sorted(cfg)

    # Fail fast if any expected CSV is missing locally, listing all gaps.
    src_root = args.src.expanduser()
    missing = []
    for t in tasks:
        for f in FILES:
            if not (src_root / t / f).exists():
                missing.append(f"{t}/{f}")
    if missing:
        sys.exit("Missing files:\n " + "\n ".join(missing))

    # A dry run touches nothing remote, so it must not require a token.
    # (Bug fix: previously the HF_TOKEN check ran first, making --dry-run
    # exit early in environments with no token configured.)
    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f" {t}/{f} ({size_mb:.1f} MB)")
        return

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")

    api = HfApi(token=token)

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Write the README into a tempdir so we don't dirty the source root
    with tempfile.TemporaryDirectory() as td:
        readme = Path(td) / "README.md"
        readme.write_text(_readme(tasks, cfg))
        api.upload_file(
            path_or_fileobj=str(readme),
            path_in_repo="README.md",
            repo_id=args.repo,
            repo_type="dataset",
            commit_message="Update README (auto from push_data.py)",
        )

    for t in tasks:
        # upload_folder follows symlinks via the underlying open() calls.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f" ✓ {t}/")
    print("Done.")


if __name__ == "__main__":
    main()
|