| |
| """ |
| Hindsight Restore — Download pg_dump from HF Dataset and restore into running PG. |
| |
| Called AFTER Hindsight starts (PostgreSQL must be running). |
| Uses pg_restore --clean to replace the fresh empty database with backup data. |
| |
| Usage (called by entrypoint.sh): |
| python3 /opt/backup/restore.py |
| |
| Env vars: |
| HF_TOKEN — HuggingFace token (read access) |
| HF_BACKUP_REPO — Dataset repo (default: Arnwald84/atum-hindsight-backup) |
| """ |
|
|
| import glob |
| import os |
| import subprocess |
| import sys |
| from pathlib import Path |
|
|
# HuggingFace access: the token is read from the environment (empty string when
# unset, which main() treats as "skip restore"); the dataset repo can be
# overridden with HF_BACKUP_REPO.
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_REPO = os.getenv("HF_BACKUP_REPO", "Arnwald84/atum-hindsight-backup")

# Connection settings passed to pg_restore (-U / -d / -p and PGPASSWORD).
# The port is kept as a string because it is used directly as a CLI argument.
PG_USER = "hindsight"
PG_PASSWORD = "hindsight"
PG_DATABASE = "hindsight"
PG_PORT = "5432"
|
|
|
|
def log(msg: str) -> None:
    """Print *msg* to stdout with the ``[RESTORE]`` prefix, flushing immediately."""
    tagged = "[RESTORE] {}".format(msg)
    # Flush on every call so the line shows up right away in the caller's output.
    print(tagged, flush=True)
|
|
|
|
def _pg_version_key(path: str) -> tuple:
    """Return a sort key ordering installation paths by numeric version."""
    # Matched paths look like <installation>/<version>/bin/<name>; take <version>.
    version_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
    components = []
    for piece in version_dir.replace("-", ".").split("."):
        if piece.isdecimal():
            # Numeric components compare as integers, so 16 > 9.
            components.append((1, int(piece), ""))
        else:
            # Non-numeric components (e.g. "0rc1") compare as text and sort
            # below any numeric component in the same position.
            components.append((0, 0, piece))
    # The full path is a deterministic tie-breaker for equal version keys.
    return tuple(components), path


def find_pg_bin(name: str) -> str:
    """Find a PostgreSQL binary in the pg0 installation.

    Looks for ``~/.pg0/installation/*/bin/<name>``. When several installation
    directories match, the one with the highest version is chosen, comparing
    version components numerically (a plain string sort would rank "9.6.0"
    above "16.1.0").

    Args:
        name: Binary file name, e.g. ``"pg_restore"``.

    Returns:
        Path of the matching binary.

    Raises:
        FileNotFoundError: If no installation contains the binary.
    """
    pattern = os.path.expanduser(f"~/.pg0/installation/*/bin/{name}")
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")
    return max(matches, key=_pg_version_key)
|
|
|
|
# Process exit codes returned by main():
#   EXIT_RESTORED  - a backup was downloaded and restored
#   EXIT_ERROR     - the restore was attempted but failed
#   EXIT_NO_BACKUP - nothing to restore (restore skipped)
EXIT_RESTORED, EXIT_ERROR, EXIT_NO_BACKUP = 0, 1, 2
|
|
|
|
def main() -> int:
    """Download the latest pg_dump snapshot from the HF Dataset and restore it.

    Steps: check prerequisites (HF_TOKEN, huggingface_hub), look for
    ``snapshots/latest.pgdump`` in the dataset repo, download it, then run
    ``pg_restore`` against the configured database.

    Returns:
        EXIT_RESTORED (0) when the dump was restored;
        EXIT_ERROR (1) when the repo could not be listed or pg_restore failed;
        EXIT_NO_BACKUP (2) when there is nothing to restore (no token,
        huggingface_hub missing, or no snapshot in the repo).

    Raises:
        FileNotFoundError: From find_pg_bin when pg_restore is not installed.
        Exception: Errors raised by hf_hub_download are not caught here and
            propagate to the caller.
    """
    if not HF_TOKEN:
        log("HF_TOKEN not set — skipping restore")
        return EXIT_NO_BACKUP

    # Imported lazily so a missing package is reported as "no backup"
    # instead of failing at module import time.
    try:
        from huggingface_hub import HfApi, hf_hub_download
    except ImportError:
        log("huggingface_hub not installed — skipping restore")
        return EXIT_NO_BACKUP

    snapshot_name = "snapshots/latest.pgdump"
    api = HfApi(token=HF_TOKEN)

    try:
        files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
    except Exception as e:
        log(f"Cannot access repo {HF_REPO}: {e}")
        return EXIT_ERROR

    if snapshot_name not in files:
        log("No pg_dump backup found in HF Dataset — starting fresh")
        return EXIT_NO_BACKUP

    log(f"Downloading latest backup from {HF_REPO}...")
    local_path = hf_hub_download(
        repo_id=HF_REPO,
        filename=snapshot_name,
        repo_type="dataset",
        token=HF_TOKEN,
        cache_dir="/tmp/hf_cache",
    )

    size_kb = Path(local_path).stat().st_size / 1024
    log(f"Downloaded: {size_kb:.0f} KB")

    pg_restore = find_pg_bin("pg_restore")
    # The password is handed to pg_restore through its environment only.
    env = os.environ.copy()
    env["PGPASSWORD"] = PG_PASSWORD

    log("Restoring database...")
    result = subprocess.run(
        [
            pg_restore,
            "-U", PG_USER,
            "-d", PG_DATABASE,
            "-p", PG_PORT,
            "--clean",
            "--if-exists",
            "--no-owner",
            "--no-acl",
            "--single-transaction",
            local_path,
        ],
        capture_output=True,
        text=True,
        env=env,
    )

    if result.returncode != 0:
        # --single-transaction implies --exit-on-error: a non-zero status means
        # pg_restore stopped at the first error and no changes were applied.
        # There is therefore no "minor warnings only" outcome to tolerate, and
        # failures without an "ERROR" line (e.g. connection or authentication
        # failures) must not be reported as a successful restore.
        stderr_lines = [
            line.strip() for line in result.stderr.splitlines() if line.strip()
        ]
        error_lines = [
            line for line in stderr_lines
            if "error" in line.lower() or "FATAL" in line
        ]
        details = (error_lines or stderr_lines)[:5]
        summary = "; ".join(details) if details else f"exit code {result.returncode}"
        log(f"pg_restore had errors: {summary}")
        return EXIT_ERROR

    log("pg_restore completed successfully")
    log("Restore complete — Hindsight should be restarted to load restored data")
    return EXIT_RESTORED
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status;
    # any unexpected exception is logged and mapped to exit status 1.
    try:
        exit_status = main()
    except Exception as exc:
        log(f"FAILED: {exc}")
        exit_status = 1
    sys.exit(exit_status)
|
|