fix: restart Hindsight after pg_restore to reload data
pg_restore modifies PG underneath but the running API
has cached state. Restarting forces reconnection to
the restored database.
restore.py now returns exit codes:
0 = data restored (restart needed)
2 = no backup found (skip restart)
1 = error
- scripts/entrypoint.sh +35 -3
- scripts/restore.py +16 -7
scripts/entrypoint.sh
CHANGED
|
@@ -53,12 +53,44 @@ done
|
|
| 53 |
|
| 54 |
# ============================================================
|
| 55 |
# STEP 4: Restore from backup (PG is now running)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# ============================================================
|
| 57 |
if [ -n "${HF_TOKEN:-}" ]; then
|
| 58 |
log "Attempting restore from HF Dataset..."
|
| 59 |
-
python3 /opt/backup/restore.py
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
else
|
| 63 |
log "HF_TOKEN not set — skipping restore"
|
| 64 |
fi
|
|
|
|
| 53 |
|
| 54 |
# ============================================================
|
| 55 |
# STEP 4: Restore from backup (PG is now running)
|
| 56 |
+
# After pg_restore, we must restart Hindsight so the API
|
| 57 |
+
# reconnects to PG and sees the restored data.
|
| 58 |
+
# Exit codes from restore.py:
|
| 59 |
+
# 0 = data restored successfully
|
| 60 |
+
# 2 = no backup found (skip restart)
|
| 61 |
+
# 1 = error
|
| 62 |
# ============================================================
|
| 63 |
if [ -n "${HF_TOKEN:-}" ]; then
|
| 64 |
log "Attempting restore from HF Dataset..."
|
| 65 |
+
python3 /opt/backup/restore.py
|
| 66 |
+
restore_exit=$?
|
| 67 |
+
|
| 68 |
+
if [ "$restore_exit" -eq 0 ]; then
|
| 69 |
+
log "Restore succeeded — restarting Hindsight to load restored data..."
|
| 70 |
+
kill "$HINDSIGHT_PID" 2>/dev/null || true
|
| 71 |
+
wait "$HINDSIGHT_PID" 2>/dev/null || true
|
| 72 |
+
sleep 2
|
| 73 |
+
|
| 74 |
+
/app/start-all.sh &
|
| 75 |
+
HINDSIGHT_PID=$!
|
| 76 |
+
log "Hindsight restarted (PID $HINDSIGHT_PID)"
|
| 77 |
+
|
| 78 |
+
for i in $(seq 1 60); do
|
| 79 |
+
if curl -sf http://localhost:${HINDSIGHT_API_PORT:-7860}/health > /dev/null 2>&1; then
|
| 80 |
+
log "Hindsight is healthy after restore"
|
| 81 |
+
break
|
| 82 |
+
fi
|
| 83 |
+
if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
|
| 84 |
+
log "Hindsight died after restore restart"
|
| 85 |
+
exit 1
|
| 86 |
+
fi
|
| 87 |
+
sleep 5
|
| 88 |
+
done
|
| 89 |
+
elif [ "$restore_exit" -eq 2 ]; then
|
| 90 |
+
log "No backup found — continuing with fresh database"
|
| 91 |
+
else
|
| 92 |
+
log "Restore failed — continuing with fresh database"
|
| 93 |
+
fi
|
| 94 |
else
|
| 95 |
log "HF_TOKEN not set — skipping restore"
|
| 96 |
fi
|
scripts/restore.py
CHANGED
|
@@ -41,16 +41,22 @@ def find_pg_bin(name: str) -> str:
|
|
| 41 |
raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")
|
| 42 |
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
if not HF_TOKEN:
|
| 46 |
log("HF_TOKEN not set — skipping restore")
|
| 47 |
-
return
|
| 48 |
|
| 49 |
try:
|
| 50 |
from huggingface_hub import HfApi, hf_hub_download
|
| 51 |
except ImportError:
|
| 52 |
log("huggingface_hub not installed — skipping restore")
|
| 53 |
-
return
|
| 54 |
|
| 55 |
api = HfApi(token=HF_TOKEN)
|
| 56 |
|
|
@@ -59,11 +65,11 @@ def main() -> None:
|
|
| 59 |
files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
|
| 60 |
except Exception as e:
|
| 61 |
log(f"Cannot access repo {HF_REPO}: {e}")
|
| 62 |
-
return
|
| 63 |
|
| 64 |
if "snapshots/latest.pgdump" not in files:
|
| 65 |
log("No pg_dump backup found in HF Dataset — starting fresh")
|
| 66 |
-
return
|
| 67 |
|
| 68 |
log(f"Downloading latest backup from {HF_REPO}...")
|
| 69 |
|
|
@@ -114,17 +120,20 @@ def main() -> None:
|
|
| 114 |
]
|
| 115 |
if real_errors:
|
| 116 |
log(f"pg_restore had errors: {'; '.join(real_errors[:5])}")
|
|
|
|
| 117 |
else:
|
| 118 |
log("pg_restore completed (minor warnings only)")
|
| 119 |
else:
|
| 120 |
log("pg_restore completed successfully")
|
| 121 |
|
| 122 |
-
log("Restore complete")
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
try:
|
| 127 |
-
main()
|
|
|
|
| 128 |
except Exception as e:
|
| 129 |
log(f"FAILED: {e}")
|
| 130 |
sys.exit(1)
|
|
|
|
| 41 |
raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")
|
| 42 |
|
| 43 |
|
| 44 |
+
EXIT_RESTORED = 0 # Data was restored — caller should restart Hindsight
|
| 45 |
+
EXIT_ERROR = 1 # Restore failed
|
| 46 |
+
EXIT_NO_BACKUP = 2 # No backup found — skip restart
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def main() -> int:
|
| 50 |
+
"""Returns exit code: 0=restored, 1=error, 2=no backup."""
|
| 51 |
if not HF_TOKEN:
|
| 52 |
log("HF_TOKEN not set — skipping restore")
|
| 53 |
+
return EXIT_NO_BACKUP
|
| 54 |
|
| 55 |
try:
|
| 56 |
from huggingface_hub import HfApi, hf_hub_download
|
| 57 |
except ImportError:
|
| 58 |
log("huggingface_hub not installed — skipping restore")
|
| 59 |
+
return EXIT_NO_BACKUP
|
| 60 |
|
| 61 |
api = HfApi(token=HF_TOKEN)
|
| 62 |
|
|
|
|
| 65 |
files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
|
| 66 |
except Exception as e:
|
| 67 |
log(f"Cannot access repo {HF_REPO}: {e}")
|
| 68 |
+
return EXIT_ERROR
|
| 69 |
|
| 70 |
if "snapshots/latest.pgdump" not in files:
|
| 71 |
log("No pg_dump backup found in HF Dataset — starting fresh")
|
| 72 |
+
return EXIT_NO_BACKUP
|
| 73 |
|
| 74 |
log(f"Downloading latest backup from {HF_REPO}...")
|
| 75 |
|
|
|
|
| 120 |
]
|
| 121 |
if real_errors:
|
| 122 |
log(f"pg_restore had errors: {'; '.join(real_errors[:5])}")
|
| 123 |
+
return EXIT_ERROR
|
| 124 |
else:
|
| 125 |
log("pg_restore completed (minor warnings only)")
|
| 126 |
else:
|
| 127 |
log("pg_restore completed successfully")
|
| 128 |
|
| 129 |
+
log("Restore complete — Hindsight should be restarted to load restored data")
|
| 130 |
+
return EXIT_RESTORED
|
| 131 |
|
| 132 |
|
| 133 |
if __name__ == "__main__":
|
| 134 |
try:
|
| 135 |
+
code = main()
|
| 136 |
+
sys.exit(code)
|
| 137 |
except Exception as e:
|
| 138 |
log(f"FAILED: {e}")
|
| 139 |
sys.exit(1)
|