| """Bootstrap a private Kaggle Dataset that holds the secrets the Bee |
| training kernel needs. |
| |
| WHY THIS EXISTS |
| --------------- |
Kaggle's `UserSecretsClient` (Add-ons → Secrets) is UI-managed. Bindings
between a kernel and a secret are NOT preserved when the kernel is pushed
via the Kaggle CLI / API — and the cron at /api/cron/kaggle-dispatch
| pushes on every tick. So every cron-driven run loses access to its |
| secrets and aborts. |
| |
| The fix: store the same secrets in a PRIVATE Kaggle Dataset and attach |
| that dataset to the kernel via `kernel-metadata.json`'s `dataset_sources`. |
| Dataset attachments DO survive CLI pushes (they're part of the metadata |
| file the kernel itself owns). |
| |
| Security delta vs Kaggle Secrets: |
| - Kaggle Secrets: encrypted at rest by Kaggle; UI-only management. |
| - Private Dataset: cleartext file inside a private Kaggle Dataset; |
| only readable by the dataset owner (you) and the cron's |
| KAGGLE_KEY (also yours). For a single-tenant private kernel, |
| practically equivalent. Both gated by Kaggle authentication. |
| |
| Run locally with HF_TOKEN + CRON_SECRET + KAGGLE creds in env: |
| |
| HF_TOKEN=... CRON_SECRET=... \\ |
| python scripts/bootstrap_kaggle_secrets.py |
| """ |
| from __future__ import annotations |
|
|
| import json |
| import os |
| import subprocess |
| import sys |
| import tempfile |
| from pathlib import Path |
|
|
# Kaggle account name used as the owner half of the dataset id.
DATASET_OWNER = "ceocxx"
# Dataset slug; joined with the owner as "<owner>/<slug>" in dataset-metadata.json.
DATASET_SLUG = "bee-secrets"
# Human-readable title written to dataset-metadata.json.
DATASET_TITLE = "Bee training kernel secrets (private)"
|
|
|
|
def _require_env_secrets() -> tuple[str, str]:
    """Return ``(hf_token, cron_secret)`` read from the environment.

    ``CRON_SECRET`` is preferred; ``BEE_CRON_SECRET`` is used when the former
    is unset or blank. Values are returned unmodified.

    Raises:
        SystemExit: if either secret is unset, empty, or whitespace-only.
    """
    hf_token = os.environ.get("HF_TOKEN", "")
    cron_secret = ""
    for name in ("CRON_SECRET", "BEE_CRON_SECRET"):
        candidate = os.environ.get(name, "")
        if candidate.strip():
            cron_secret = candidate
            break
    # A whitespace-only value would pass a plain truthiness check and be
    # uploaded as an unusable secret, so treat it the same as "missing".
    if not hf_token.strip() or not cron_secret:
        raise SystemExit(
            "Both HF_TOKEN and CRON_SECRET (or BEE_CRON_SECRET) env vars are required."
        )
    return hf_token, cron_secret


def _run_kaggle(args: list[str]) -> subprocess.CompletedProcess[str]:
    """Run ``kaggle <args>``, echo its combined stdout/stderr, and return the result.

    Raises:
        SystemExit: if the ``kaggle`` executable cannot be found.
    """
    try:
        result = subprocess.run(["kaggle", *args], capture_output=True, text=True)
    except FileNotFoundError:
        # Without this the user gets a bare traceback that hides the real cause.
        raise SystemExit(
            "The `kaggle` CLI was not found on PATH; install it and retry."
        ) from None
    print((result.stdout + result.stderr).strip())
    return result


def main() -> None:
    """Create (or rotate) the private Kaggle Dataset that carries the secrets.

    Reads ``HF_TOKEN`` and ``CRON_SECRET`` (or ``BEE_CRON_SECRET``) from the
    environment, writes ``secrets.json`` and ``dataset-metadata.json`` into a
    temporary directory, and runs ``kaggle datasets create`` on it. If create
    exits non-zero, falls back to ``kaggle datasets version`` to push a new
    version of the existing dataset.

    Raises:
        SystemExit: if a required env var is missing or blank, if the
            ``kaggle`` CLI is not installed, or (with the CLI's return code)
            if the fallback ``datasets version`` call also fails.
    """
    hf_token, cron_secret = _require_env_secrets()

    secrets = {
        "hf_token": hf_token,
        "cron_secret": cron_secret,
        # Endpoint URLs shipped next to the credentials; presumably read by
        # the training kernel -- verify against the kernel code.
        "ingest_url": "https://workspace.bee.cuilabs.io/api/training/runs",
        "next_domain_url": "https://workspace.bee.cuilabs.io/api/training/next-domain",
    }
    metadata = {
        "title": DATASET_TITLE,
        "id": f"{DATASET_OWNER}/{DATASET_SLUG}",
        "licenses": [{"name": "other"}],
        "subtitle": "Cleartext secrets attached to bee-train-online — private only.",
        "description": (
            "PRIVATE. Holds the HF write token and CRON bearer that "
            "the Bee training kernel needs. This dataset is attached "
            "to ceocxx/bee-train-online via the kernel-metadata.json "
            "dataset_sources field. Do not make public."
        ),
        "isPrivate": True,
        "keywords": [],
    }

    # The staging directory (and the cleartext secrets in it) is removed when
    # the ``with`` block exits, whether the upload succeeded or not.
    with tempfile.TemporaryDirectory() as tmp:
        staging = Path(tmp)
        (staging / "secrets.json").write_text(
            json.dumps(secrets, indent=2), encoding="utf-8"
        )
        (staging / "dataset-metadata.json").write_text(
            json.dumps(metadata, indent=2), encoding="utf-8"
        )

        create = _run_kaggle(["datasets", "create", "-p", str(staging)])
        if create.returncode == 0:
            return

        # Any create failure falls through to a version push of the dataset.
        print("create failed — trying `datasets version` (rotates existing)")
        version = _run_kaggle(
            [
                "datasets", "version", "-p", str(staging),
                "-m", "rotate bee-secrets", "--dir-mode", "zip",
            ]
        )
        if version.returncode != 0:
            sys.exit(version.returncode)
|
|
|
|
# Run the bootstrap only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|