# NOTE: the HF Spaces status banner ("Spaces: Sleeping") that was scraped into
# this file was page chrome, not source — reduced to a comment so the module parses.
"""Entry point for the GraphTestbed scoring server on HF Spaces.

On boot:
1. snapshot_download the companion dataset repo (lanczos/graphtestbed-gt by
   default) into /data: gt/*.csv, leaderboard.db, submissions/**/*.csv.
2. Spawn a daemon thread that every BACKUP_INTERVAL seconds:
   a. SELECT COUNT(*) FROM submissions; bail if unchanged.
   b. sqlite3.Connection.backup() into a temp file (atomic, lock-safe).
   c. upload_file the temp file → leaderboard.db in the dataset repo.
   d. upload_folder /data/submissions/ → submissions/ in the dataset repo
      (huggingface_hub diffs by content-hash; unchanged files don't transfer).
3. Hand off to server/api.py via Flask app.run(threaded=True).

Env vars (all have sensible defaults baked into the Dockerfile):
  HF_TOKEN            required  write scope on GT_DATASET_REPO
  GT_DATASET_REPO     optional  default: lanczos/graphtestbed-gt
  GT_DATA_ROOT        optional  default: /data
  GT_BACKUP_INTERVAL  optional  default: 60 (seconds)
  PORT                optional  default: 7860
"""
from __future__ import annotations

import os
import sqlite3
import sys
import threading
import time
from pathlib import Path

from huggingface_hub import snapshot_download, upload_file, upload_folder

# Hub credentials + repo coordinates. Token presence is enforced lazily by
# _require_token() so importing this module never aborts.
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_REPO = os.environ.get("GT_DATASET_REPO", "lanczos/graphtestbed-gt")

# Local layout under GT_DATA_ROOT, mirroring the dataset repo's structure.
DATA_DIR = Path(os.environ.get("GT_DATA_ROOT", "/data"))
GT_DIR = DATA_DIR / "gt"                # ground-truth *.csv files
DB_PATH = DATA_DIR / "leaderboard.db"   # SQLite leaderboard database
ARCHIVE_DIR = DATA_DIR / "submissions"  # archived submission CSVs

BACKUP_INTERVAL = int(os.environ.get("GT_BACKUP_INTERVAL", "60"))  # seconds
PORT = int(os.environ.get("PORT", "7860"))
def _require_token() -> str:
    """Return the HF write token, aborting with a clear message if unset."""
    token = HF_TOKEN
    if token:
        return token
    raise SystemExit(
        "HF_TOKEN is unset. Set it as a Space secret with write scope on "
        f"{HF_REPO}."
    )
def bootstrap() -> None:
    """Pull GT files, leaderboard, and submission archive from the dataset repo.

    A missing or empty repo (first deploy) is tolerated: the error from
    snapshot_download is logged and we continue with an empty /data tree.

    Raises:
        SystemExit: if HF_TOKEN is unset (via _require_token).
    """
    token = _require_token()
    for d in (DATA_DIR, GT_DIR, ARCHIVE_DIR):
        d.mkdir(parents=True, exist_ok=True)
    # "->": the original log string carried a mojibake "β" here.
    print(f"snapshot_download {HF_REPO} -> {DATA_DIR}", flush=True)
    try:
        snapshot_download(
            HF_REPO,
            repo_type="dataset",
            local_dir=str(DATA_DIR),
            allow_patterns=["gt/*.csv", "leaderboard.db", "submissions/**/*.csv"],
            token=token,
        )
    except Exception as e:
        # First-deploy or empty repo: keep going with empty /data.
        print(f"snapshot_download warning ({type(e).__name__}): {e}", flush=True)
    n_gt = len(list(GT_DIR.glob("*.csv")))
    print(f"GT files present: {n_gt}", flush=True)
    if not DB_PATH.exists():
        print("no prior leaderboard.db; starting fresh", flush=True)
        return
    # Fix: the original never closed this connection (leaked fd + potential lock).
    conn = sqlite3.connect(DB_PATH)
    try:
        n = int(conn.execute("SELECT COUNT(*) FROM submissions").fetchone()[0])
        print(f"restored leaderboard.db ({n} submissions)", flush=True)
    except sqlite3.OperationalError:
        # DB restored but schema not created yet (server creates it on demand).
        print("leaderboard.db present but no submissions table yet", flush=True)
    finally:
        conn.close()
def _submission_count() -> int:
    """Number of rows in the submissions table; 0 if DB or table is absent."""
    if not DB_PATH.exists():
        return 0
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        row = conn.execute("SELECT COUNT(*) FROM submissions").fetchone()
    except sqlite3.OperationalError:
        # No submissions table yet — treat as empty.
        return 0
    finally:
        if conn is not None:
            conn.close()
    return int(row[0]) if row else 0
def _atomic_db_copy(dst: Path) -> None:
    """Snapshot the live leaderboard DB into *dst* via sqlite3's backup API.

    Connection.backup() takes the source locks correctly, so concurrent
    readers/writers still see a consistent database.
    """
    source = sqlite3.connect(DB_PATH)
    target = None
    try:
        target = sqlite3.connect(dst)
        source.backup(target)
    finally:
        # Same close order as a nested try/finally: target first, then source.
        if target is not None:
            target.close()
        source.close()
def backup_loop() -> None:
    """Daemon loop: push leaderboard.db and submission CSVs to the dataset repo.

    Every BACKUP_INTERVAL seconds, compares the current submission count to
    the last pushed one and uploads only when it changed. Upload failures are
    logged and retried on the next tick (last_count stays stale so we retry).

    Raises:
        SystemExit: if HF_TOKEN is unset (via _require_token).
    """
    token = _require_token()
    last_count = -1
    print(f"backup_loop started (interval={BACKUP_INTERVAL}s)", flush=True)
    while True:
        time.sleep(BACKUP_INTERVAL)
        n = _submission_count()
        if n == last_count:
            continue  # nothing new since the last successful push
        tmp = DATA_DIR / "_leaderboard.db.tmp"
        try:
            _atomic_db_copy(tmp)
            upload_file(
                path_or_fileobj=str(tmp),
                path_in_repo="leaderboard.db",
                repo_id=HF_REPO, repo_type="dataset",
                token=token,
                commit_message=f"backup leaderboard ({n} submissions)",
            )
        except Exception as e:
            print(f"leaderboard backup failed: {type(e).__name__}: {e}", flush=True)
            continue
        finally:
            # Fix: original unlinked only on success, leaving the temp file
            # behind whenever upload_file raised.
            if tmp.exists():
                tmp.unlink()
        if ARCHIVE_DIR.exists() and any(ARCHIVE_DIR.rglob("*.csv")):
            try:
                upload_folder(
                    folder_path=str(ARCHIVE_DIR),
                    path_in_repo="submissions",
                    repo_id=HF_REPO, repo_type="dataset",
                    token=token,
                    commit_message=f"archive submissions ({n} total)",
                    allow_patterns=["**/*.csv"],
                )
            except Exception as e:
                # Best-effort: leaderboard already pushed, so we still advance
                # last_count; the folder diff retries on the next change.
                print(f"submission archive failed: {type(e).__name__}: {e}", flush=True)
        last_count = n
        print(f"backup pushed: {n} submissions", flush=True)
def main() -> int:
    """Bootstrap state from the Hub, start the backup thread, serve Flask."""
    bootstrap()
    # server/api.py reads these paths; export them before it gets imported so
    # they match what bootstrap() just populated.
    exported = {"GT_DIR": GT_DIR, "GT_DB": DB_PATH, "GT_ARCHIVE_DIR": ARCHIVE_DIR}
    for var, path in exported.items():
        os.environ.setdefault(var, str(path))
    backup_thread = threading.Thread(target=backup_loop, daemon=True)
    backup_thread.start()
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    from api import app  # noqa: E402 -- env vars must be set first
    print(f"serving on 0.0.0.0:{PORT}", flush=True)
    app.run(host="0.0.0.0", port=PORT, threaded=True, use_reloader=False)
    return 0
if __name__ == "__main__":
    # sys.exit raises SystemExit(main()) — same behavior as the explicit raise.
    sys.exit(main())