arxiv-citation: ship the heterograph (citations + author/category tables)
Without the relation tables, agents collapsed arxiv-citation into a pure
tabular regression on the per-paper feature CSV – figraph and ibm-aml run
the same way. That's a significant under-fit; the task is a graph task
because RelBench rel-arxiv:paper-citation is a temporal heterograph.
Added 5 tables under the arxiv-citation/ subdir on HF:

- citations.csv (Paper_ID, References_Paper_ID, Submission_Date) – 1.2M rows;
  filtered to Submission_Date < 2023-01-01 so test-period citations (which
  encode the labels) do not leak.
- paperAuthors.csv (Paper_ID, Author_ID, Submission_Date) – 617k rows.
- paperCategories.csv (Paper_ID, Category_ID, Submission_Date) – 155k rows.
- authors.csv (Author_ID, Name, ORCID) – 144k rows.
- categories.csv (Category_ID, Category) – 53 rows.
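To make the intended use concrete: a minimal sketch of how an agent might assemble the heterograph from these tables. PyTorch Geometric, the relation names, and the contiguous-ID remapping are all assumptions for illustration – nothing here ships with the data, and any graph stack would do:

```python
# Sketch only: build the paper-author-category heterograph with citation
# edges from the shipped relation tables. Assumes PyTorch Geometric.
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import HeteroData

cites = pd.read_csv("citations.csv")     # Paper_ID, References_Paper_ID, Submission_Date
pa = pd.read_csv("paperAuthors.csv")     # Paper_ID, Author_ID, Submission_Date
pc = pd.read_csv("paperCategories.csv")  # Paper_ID, Category_ID, Submission_Date

# Remap raw IDs to contiguous per-type node indices (PyG expects 0..N-1).
# Papers that appear only in the feature CSVs would need adding here too.
papers = pd.concat([cites["Paper_ID"], cites["References_Paper_ID"],
                    pa["Paper_ID"], pc["Paper_ID"]]).unique()
paper_idx = {p: i for i, p in enumerate(papers)}
author_idx = {a: i for i, a in enumerate(pa["Author_ID"].unique())}
cat_idx = {c: i for i, c in enumerate(pc["Category_ID"].unique())}

def edge_index(df, src, dst, smap, dmap):
    # Stack the mapped source/destination columns into a 2 x E index tensor.
    return torch.from_numpy(np.stack([df[src].map(smap).to_numpy(),
                                      df[dst].map(dmap).to_numpy()])).long()

data = HeteroData()
data["paper", "cites", "paper"].edge_index = edge_index(
    cites, "Paper_ID", "References_Paper_ID", paper_idx, paper_idx)
data["paper", "written_by", "author"].edge_index = edge_index(
    pa, "Paper_ID", "Author_ID", paper_idx, author_idx)
data["paper", "has_category", "category"].edge_index = edge_index(
    pc, "Paper_ID", "Category_ID", paper_idx, cat_idx)
```

From there a GraphSAGE / R-GCN head over `data` replaces the pure-tabular baseline; the citation table is already cut at 2023-01-01, so no extra temporal filtering is needed to avoid label leakage.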
The manifest gets one entry per file; the auto-generated agent instruction
template now lists every file declared in `files:` (previously a hardcoded
3-row list), so agents see them. The mlevolve adapter forwards any
non-canonical files into the public tree beside train.csv/test.csv so its
REPL can read them with a relative path.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- agents/common/tasks.py +34 -5
- agents/mlevolve/adapter.py +13 -0
- datasets/manifest.yaml +32 -1
agents/common/tasks.py:

```diff
@@ -20,17 +20,14 @@ _TEMPLATE = """\
 
 ## Files you will see
 
-- `train_features.csv` – labeled training rows
-- `val_features.csv` – labeled validation rows (use for HPO / early stopping)
-- `test_features.csv` – **unlabeled** test rows; predict here
+{files_block}
 
 These are pulled from `lanczos/graphtestbed-data` on HuggingFace (subdir
 `{task}/`). **Train and HPO on these files only** – do not pull from the
 upstream source mentioned above to recover test labels. The benchmark is
 non-adversarial; we trust agent authors to honor the contract.
 
-The label column is present in train/val and
-absent from test.
+The label column is present in train/val and absent from test.
 
 ## Submission format
 
@@ -56,6 +53,37 @@ _DTYPE_DESC = {
 }
 
 
+_KNOWN_FILE_HINTS = {
+    "train_features.csv": "labeled training rows",
+    "val_features.csv": "labeled validation rows (use for HPO / early stopping)",
+    "test_features.csv": "**unlabeled** test rows; predict here",
+    "sample_submission.csv": "the schema you must match (column order + row IDs)",
+}
+
+
+def _files_block(cfg: dict) -> str:
+    """Render every file declared in manifest, with a known hint when we have
+    one – otherwise just the filename so the agent knows it's available."""
+    lines = []
+    seen = set()
+    # Show the canonical four first in a fixed order, then everything else
+    # (graph tables, edges, etc.) in manifest declaration order.
+    canonical = ["train_features.csv", "val_features.csv",
+                 "test_features.csv", "sample_submission.csv"]
+    by_name = {spec["filename"]: key for key, spec in cfg["files"].items()}
+    for fn in canonical:
+        if fn in by_name:
+            lines.append(f"- `{fn}` – {_KNOWN_FILE_HINTS[fn]}")
+            seen.add(fn)
+    for key, spec in cfg["files"].items():
+        fn = spec["filename"]
+        if fn in seen:
+            continue
+        hint = _KNOWN_FILE_HINTS.get(fn, "additional task data (see description above)")
+        lines.append(f"- `{fn}` – {hint}")
+    return "\n".join(lines)
+
+
 def task_instruction(task: str) -> str:
     override = Path(__file__).parent / "tasks_md" / f"{task}.md"
     if override.exists():
@@ -69,6 +97,7 @@ def task_instruction(task: str) -> str:
     return _TEMPLATE.format(
         task=task,
         description=str(cfg.get("description", "")).strip(),
+        files_block=_files_block(cfg),
        id_col=s["id_col"],
         pred_col=s["pred_col"],
         n_rows=s.get("n_rows", "?"),
```
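For arxiv-citation, the rendered files block should come out roughly as below (hand-derived from the manifest entries in this commit, not captured output):

```
- `train_features.csv` – labeled training rows
- `val_features.csv` – labeled validation rows (use for HPO / early stopping)
- `test_features.csv` – **unlabeled** test rows; predict here
- `sample_submission.csv` – the schema you must match (column order + row IDs)
- `citations.csv` – additional task data (see description above)
- `paperAuthors.csv` – additional task data (see description above)
- `paperCategories.csv` – additional task data (see description above)
- `authors.csv` – additional task data (see description above)
- `categories.csv` – additional task data (see description above)
```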
agents/mlevolve/adapter.py:

```diff
@@ -76,4 +76,17 @@ def stage(task: str, root: Path) -> Path:
     # Stash the real test set for post-search re-execution by the user.
     test.to_csv(root / task / "REAL_TEST_FEATURES.csv", index=False)
 
+    # Forward any additional task data files declared in the manifest (graph
+    # edges, relation tables, …) into the public tree so the agent can build
+    # a real graph model instead of treating the task as pure tabular.
+    canonical = {"train_features.csv", "val_features.csv",
+                 "test_features.csv", "sample_submission.csv"}
+    for spec in cfg["files"].values():
+        fn = spec["filename"]
+        if fn in canonical:
+            continue
+        src_path = src / fn
+        if src_path.exists():
+            (pub / fn).write_bytes(src_path.read_bytes())
+
     return base
```
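Inside the mlevolve REPL the forwarded tables then resolve with bare relative paths, e.g. (illustrative, not part of the diff):

```python
import pandas as pd

# The adapter copied citations.csv beside train.csv/test.csv in the public
# tree, so no extra path plumbing is needed.
cites = pd.read_csv("citations.csv")
print(cites.columns.tolist())  # ['Paper_ID', 'References_Paper_ID', 'Submission_Date']
```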
datasets/manifest.yaml:

```diff
@@ -61,6 +61,21 @@ arxiv-citation:
   sample_submission:
     filename: sample_submission.csv
     sha256: TBD
+  citations:
+    filename: citations.csv
+    sha256: TBD
+  paper_authors:
+    filename: paperAuthors.csv
+    sha256: TBD
+  paper_categories:
+    filename: paperCategories.csv
+    sha256: TBD
+  authors:
+    filename: authors.csv
+    sha256: TBD
+  categories:
+    filename: categories.csv
+    sha256: TBD
   submission_schema:
     id_col: Paper_ID
     pred_col: Label
@@ -74,7 +89,23 @@ arxiv-citation:
   description: 'Predict whether each arXiv paper receives ≥1 citation within 6 months
     after submission. Source: RelBench rel-arxiv:paper-citation (stanford-snap/relbench,
     MIT). Temporal split: train cutoff 2022-01-01, val cutoff 2023-01-01, test from
-    val cutoff onward. Test rows: 193,696.
+    val cutoff onward. Test rows: 193,696 (~42.7% positive).
+
+
+    This is a GRAPH task. Beyond train/val/test_features.csv (one row per paper with
+    pre-extracted scalar features), the subdir also ships the relational tables that
+    let you build the actual paper-author-category-citation heterograph:
+
+    citations.csv (Paper_ID, References_Paper_ID, Submission_Date) – 1.2M
+    edges; filtered to Submission_Date < 2023-01-01 to
+    prevent test-label leakage.
+    paperAuthors.csv (Paper_ID, Author_ID, Submission_Date) – 617k edges.
+    paperCategories.csv (Paper_ID, Category_ID, Submission_Date) – 155k edges.
+    authors.csv (Author_ID, Name, ORCID) – 144k author entities.
+    categories.csv (Category_ID, Category) – 53 category entities.
+
+    A purely tabular model that ignores these will under-fit. Most baselines for this
+    benchmark use a GNN (GraphSAGE / R-GCN / temporal HGN) over the heterograph.
 
 
     Metric: AUC-ROC, matching RelBench rel-arxiv:paper-citation (the official benchmark
```
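The `sha256: TBD` placeholders still need real digests once the uploaded files are final. One way to generate manifest-ready values (a local sketch, assuming the CSVs sit in the working directory):

```python
import hashlib
from pathlib import Path

# Print a digest per new relation table, ready to paste into manifest.yaml.
for fn in ["citations.csv", "paperAuthors.csv", "paperCategories.csv",
           "authors.csv", "categories.csv"]:
    print(fn, hashlib.sha256(Path(fn).read_bytes()).hexdigest())
```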