#!/usr/bin/env bash
# ============================================================
# OpenClaw HF Dataset 备份清理脚本
#
# 根据指定日期删除 Hugging Face Dataset 中所有匹配的备份文件。
# 如日期为 20260427,则删除文件名中包含 openclaw-backup-20260427、
# openclaw-backup-20260426、openclaw-backup-20260425 … 及更早日期的备份文件。
# 注意 包含“openclaw-backup-20260427、openclaw-backup-20260426、openclaw-backup-20260425” 这样的,都是文件名,不是文件夹。
# 以下是实际存在dataset中的文件路径示例
# backups/openclaw-backup-20260430-214003.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-215503.tar.gz.enc
# backups/openclaw-backup-20260430-215503.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-220045.tar.gz.enc
# backups/openclaw-backup-20260430-220045.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-221503.tar.gz.enc
# backups/openclaw-backup-20260430-221503.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-223004.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-223004.tar.gz.enc.part-aa
# backups/openclaw-backup-20260430-223004.tar.gz.enc.part-ab
# backups/openclaw-backup-20260430-223004.tar.gz.enc.part-ac
# backups/openclaw-backup-20260430-223503.tar.gz.enc
# backups/openclaw-backup-20260430-223503.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-225003.tar.gz.enc
# backups/openclaw-backup-20260430-225003.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-230502.tar.gz.enc
# backups/openclaw-backup-20260430-230502.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-232004.tar.gz.enc
# backups/openclaw-backup-20260430-232004.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-233503.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-233503.tar.gz.enc.part-aa
# backups/openclaw-backup-20260430-233503.tar.gz.enc.part-ab
# backups/openclaw-backup-20260430-233503.tar.gz.enc.part-ac
# backups/openclaw-backup-20260430-234003.tar.gz.enc
# backups/openclaw-backup-20260430-234003.tar.gz.enc.meta.json
# backups/openclaw-backup-20260430-235503.tar.gz.enc
# backups/openclaw-backup-20260430-235503.tar.gz.enc.meta.json
#
#
# 用法:
# ./delete-backups.sh <DATASET_REPO> <DATE> [HF_TOKEN]
#
# 示例:
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
# 方式1:直接提供参数
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
# 方式2:设置环境变量
# export DATASET_REPO="GGSheng/page-backup"
# export DELETE_DATE="20260427"
# export HF_TOKEN="hf_xxxxx"
# ./scripts/delete-backups.sh
#
# 方式3:使用缓存的 token(默认从 ~/.cache/huggingface/token 读取)
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427
#
# 环境变量 (可选):
# DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)
# DELETE_DATE - 要清理的日期 (YYYYMMDD 格式)
# HF_TOKEN - Hugging Face API Token
# HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)
# DRY_RUN - 设为 1 时仅列出匹配文件而不删除 (默认 0)
# BATCH_DELETE - 设为 1 时使用批量删除 API (默认 0,逐条删除)
#
# 删除模式:
# BATCH_DELETE=0 (默认): 逐条删除,每条独立 commit,有重试/限速处理,容错性好
# BATCH_DELETE=1: 批量删除,每批最多 100 个文件,单次 commit 含多个文件,效率更高
#
# 注意事项:
# 1. 请确保 HF_TOKEN 有对该 Dataset 的 write 权限
# 2. 删除操作不可逆,建议先用 DRY_RUN=1 预览匹配的文件
# 3. 日期格式固定为 YYYYMMDD(如 20260427)
# 4. 批量删除模式使用 delete_files() API,单次 commit 含多个文件,失败会全部回滚
#
############################################################
# 与 push-to-space.sh 同样的参数模式
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
# 使用缓存的 token
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427
# 使用环境变量
# DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 ./scripts/delete-backups.sh
# 仅预览
# DRY_RUN=1 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# 删除
# BATCH_DELETE=1 DRY_RUN=0 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# ============================================================
set -euo pipefail

# Resolve the directory holding this script and the repository root.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# ---- Choose the Python launcher: prefer uv when this is a uv project ----
PYTHON="python3"
if [[ -f "$REPO_ROOT/pyproject.toml" ]] && command -v uv &>/dev/null; then
  PYTHON="uv run python3"
  echo "[UV] Detected uv project, using: $PYTHON"
fi
# ---- Argument parsing ----
# Positional arguments take precedence over env vars; DATASET_REPO also
# falls back to OPENCLAW_BACKUP_DATASET_REPO for parity with sibling scripts.
DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
DELETE_DATE="${2:-${DELETE_DATE:-}}"
HF_TOKEN="${3:-${HF_TOKEN:-}}"

# ---- Preconditions ----
if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
  echo "Usage: $0 <DATASET_REPO> <DATE> [HF_TOKEN]"
  echo ""
  echo "Examples:"
  echo " $0 GGSheng/page-backup 20260427 hf_xxxxx"
  echo " DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0"
  echo " $0 GGSheng/page-backup 20260427"
  echo ""
  echo "Environment variables:"
  echo " DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)"
  echo " DELETE_DATE - 要清理的日期 (YYYYMMDD)"
  echo " HF_TOKEN - Hugging Face API Token"
  echo " HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)"
  echo " DRY_RUN - 设为 1 仅预览 (默认 0)"
  echo " BATCH_DELETE - 设为 1 使用批量删除 (默认 0)"
  exit 1
fi

if ! [[ "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
  echo "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)"
  exit 1
fi

# Calendar sanity check. The previous `date -d "Y-M-D"` test is GNU-only:
# on BSD/macOS `date -d` means "set the DST flag", so every valid date was
# wrongly rejected there. Plain numeric range checks are portable. The 10#
# prefix forces base 10 so "08"/"09" are not misread as invalid octal.
MONTH=$((10#${DELETE_DATE:4:2}))
DAY=$((10#${DELETE_DATE:6:2}))
if (( MONTH < 1 || MONTH > 12 || DAY < 1 || DAY > 31 )); then
  echo "Error: 无效的日期 '$DELETE_DATE'"
  exit 1
fi
# ---- Resolve the HF token ----
# Precedence: 3rd positional arg / HF_TOKEN env var, then the cached token
# file (default ~/.cache/huggingface/token).
if [[ -z "$HF_TOKEN" ]]; then
  HF_TOKEN_FILE="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
  if [[ -f "$HF_TOKEN_FILE" ]]; then
    HF_TOKEN="$(< "$HF_TOKEN_FILE")"
  fi
fi

if [[ -z "$HF_TOKEN" ]]; then
  echo "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure ~/.cache/huggingface/token exists."
  exit 1
fi

# Optional behavior switches with their documented defaults.
DRY_RUN="${DRY_RUN:-0}"
REPO_TYPE="${REPO_TYPE:-dataset}"
BATCH_DELETE="${BATCH_DELETE:-0}"

# ---- Run banner ----
echo "============================================"
echo "OpenClaw HF Dataset Cleanup Script"
echo "============================================"
echo "Dataset: $DATASET_REPO"
echo "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
if [[ "$DRY_RUN" = "1" ]]; then
  echo "DRY_RUN: YES (preview only)"
else
  echo "DRY_RUN: NO"
fi
if [[ "$BATCH_DELETE" = "1" ]]; then
  echo "Delete Mode: BATCH (delete_files API)"
else
  echo "Delete Mode: INDIVIDUAL (delete_file per file)"
fi
echo ""
# ---- Temp-file cleanup ----
# Paths appended to this array are removed on every exit path via the trap.
CLEANUP_FILES=()

# Remove all registered temp files. The length guard matters: expanding an
# empty array with "${CLEANUP_FILES[@]}" is an "unbound variable" error
# under `set -u` on bash <= 4.3, which would make the EXIT trap itself fail.
cleanup() {
  local f
  if (( ${#CLEANUP_FILES[@]} > 0 )); then
    for f in "${CLEANUP_FILES[@]}"; do
      rm -f -- "$f"
    done
  fi
}
trap cleanup EXIT
# ---- Step 1: list and filter files ----
echo "[1/3] Fetching file list from Dataset..."
ALL_FILES_FILE="$(mktemp)"
MATCHED_FILES_FILE="$(mktemp)"
# Also register the ".patterns" side-car written by the listing step below;
# it was previously never cleaned up, leaking one temp file per run.
CLEANUP_FILES+=("$ALL_FILES_FILE" "$MATCHED_FILES_FILE" "$MATCHED_FILES_FILE.patterns")

# Make sure huggingface_hub is importable before doing any real work.
if ! $PYTHON -c "import huggingface_hub" 2>/dev/null; then
  echo "Error: 'huggingface_hub' module not found."
  echo ""
  echo "Install it with:"
  echo " uv add huggingface_hub"
  echo ""
  echo "Or with CLI support:"
  echo " uv add 'huggingface_hub[cli]'"
  exit 1
fi
# ---- Print huggingface_hub version and installation diagnostics ----
echo ""
echo "[INFO] huggingface_hub version and installation information:"

# Importability was already verified in step 1, so the duplicate check that
# used to live here is gone; go straight to collecting details.
MODULE_VERSION=$($PYTHON -c "import huggingface_hub; print(huggingface_hub.__version__)" 2>/dev/null || echo "N/A")
echo " - Module version: $MODULE_VERSION"

MODULE_FILE=$($PYTHON -c "import huggingface_hub; print(huggingface_hub.__file__)" 2>/dev/null || echo "N/A")
if [ "$MODULE_FILE" = "N/A" ]; then
  MODULE_DIR="N/A"   # dirname "N/A" would misleadingly print "."
else
  MODULE_DIR=$(dirname "$MODULE_FILE")
fi
echo " - Module location: $MODULE_DIR"

# Parse both fields from one `pip show` invocation. The previous
# `$(pip show ... | grep ...)` form was fatal under `set -euo pipefail`:
# grep exits 1 when the package is unknown, pipefail propagates it into the
# assignment, and -e silently killed the whole script.
PKG_VERSION=""
PKG_LOCATION=""
# Extract Version:/Location: fields from a `pip show`-style dump in $1.
read_pkg_info() {
  local info="$1"
  PKG_VERSION=$(awk '/^Version:/ {print $2}' <<<"$info")
  PKG_LOCATION=$(awk '/^Location:/ {print $2}' <<<"$info")
}
if $PYTHON -m pip --version >/dev/null 2>&1; then
  read_pkg_info "$($PYTHON -m pip show huggingface-hub 2>/dev/null || true)"
fi
# Fall back to uv pip when plain pip does not know the package.
if [ -z "$PKG_VERSION" ] && command -v uv >/dev/null 2>&1; then
  read_pkg_info "$(uv pip show huggingface-hub 2>/dev/null || true)"
fi

if [ -n "$PKG_VERSION" ]; then
  echo " - Package version: $PKG_VERSION"
  if [ -n "$PKG_LOCATION" ]; then
    echo " - Package location: $PKG_LOCATION"
  fi
  if [ "$MODULE_VERSION" != "N/A" ] && [ "$MODULE_VERSION" != "$PKG_VERSION" ]; then
    echo " (Note: Module version $MODULE_VERSION may differ from package version $PKG_VERSION)"
  fi
else
  echo " - Package version: Unable to determine"
fi

# Which interpreter actually runs (relevant when uv manages a venv).
PYTHON_EXEC=$($PYTHON -c "import sys; print(sys.executable)" 2>/dev/null || echo "N/A")
echo " - Python executable: $PYTHON_EXEC"

# huggingface_hub[cli] is the same package with extra CLI dependencies.
CLI_AVAILABLE=$($PYTHON -c "import huggingface_hub.commands.huggingface_cli; print('YES')" 2>/dev/null || echo "NO")
echo " - CLI support (huggingface_hub[cli]): $CLI_AVAILABLE"
if [ "$CLI_AVAILABLE" = "NO" ]; then
  echo " (To install CLI support: uv add 'huggingface_hub[cli]' or pip install 'huggingface_hub[cli]')"
fi

# Report where each related dependency is imported from.
echo ""
echo "[INFO] Related dependencies installation locations:"
$PYTHON << 'PYEOF_DEPS'
import sys

dependencies = ['huggingface_hub', 'requests', 'tqdm', 'typer']
print(" Checking module locations:")
for dep in dependencies:
    try:
        mod = __import__(dep)
        if hasattr(mod, '__file__') and mod.__file__:
            print(f" - {dep}: {mod.__file__}")
        else:
            print(f" - {dep}: built-in or namespace package")
    except ImportError:
        print(f" - {dep}: NOT INSTALLED")
PYEOF_DEPS
echo ""
# Pass parameters to the Python helper through the environment and use a
# quoted heredoc. The previous unquoted heredoc interpolated the token and
# repo id directly into Python source, which breaks (or injects code) when
# they contain quotes/backslashes, and `max_date = $DELETE_DATE` was a bare
# literal that is a Python SyntaxError for values with a leading zero.
export HF_TOKEN DATASET_REPO REPO_TYPE DELETE_DATE ALL_FILES_FILE MATCHED_FILES_FILE
$PYTHON << 'PYEOF'
import json, os, sys
from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["DATASET_REPO"]
repo_type = os.environ["REPO_TYPE"]
max_date = int(os.environ["DELETE_DATE"])  # shell already validated 8 digits
all_files_path = os.environ["ALL_FILES_FILE"]
matched_files_path = os.environ["MATCHED_FILES_FILE"]

api = HfApi(token=token)
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print(f"[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print(f"[HINT] Backup dataset is typically named like '<space-name>-backup'.", file=sys.stderr)
        print(f"[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    sys.exit(1)

matched = []
date_set = set()  # every distinct backup date that must be deleted
for f in files:
    # Path format: backups/openclaw-backup-YYYYMMDD-HHMMSS.tar.gz.enc[...]
    if "openclaw-backup-" in f:
        date_part = f.split("openclaw-backup-")[1][:8]  # YYYYMMDD
        if date_part.isdigit() and int(date_part) <= max_date:
            matched.append(f)
            date_set.add(date_part)

print(f"Total files: {len(files)}, matched: {len(matched)}")

# One wildcard pattern per date: backups/openclaw-backup-YYYYMMDD*
pattern_list = sorted(f"backups/openclaw-backup-{d}*" for d in date_set)
print(f"Delete patterns ({len(pattern_list)} date patterns):")
for p in pattern_list:
    print(f" - {p}")

with open(all_files_path, 'w') as fh:
    json.dump(files, fh)
with open(matched_files_path, 'w') as fh:
    json.dump(matched, fh)
# Side-car consumed by the batch-delete step.
with open(matched_files_path + ".patterns", 'w') as fh:
    json.dump(pattern_list, fh)
PYEOF

MATCHED_COUNT=$($PYTHON -c "import json, os; print(len(json.load(open(os.environ['MATCHED_FILES_FILE']))))")
if [[ "$MATCHED_COUNT" -eq 0 ]]; then
  echo "No matching backup files found. Nothing to clean up."
  exit 0
fi
echo ""
echo "Matched files:"
$PYTHON -c "
import json, os
files = json.load(open(os.environ['MATCHED_FILES_FILE']))
for f in sorted(files):
    print(f' - {f}')
"
# ---- Step 2: confirmation / DRY_RUN gate ----
echo ""
echo "[2/3] Ready to delete $MATCHED_COUNT file(s)..."
case "$DRY_RUN" in
  1)
    echo "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete."
    exit 0
    ;;
esac
# ---- Step 3: perform the deletion ----
echo "[3/3] Deleting files..."

# Parameters reach the Python helpers via the environment with quoted
# heredocs, so special characters in the token/repo id cannot break or
# inject into the generated Python source. (The redundant re-default of
# BATCH_DELETE is dropped; it is already defaulted right after the banner.)
export HF_TOKEN DATASET_REPO REPO_TYPE DELETE_DATE MATCHED_FILES_FILE

if [[ "$BATCH_DELETE" = "1" ]]; then
  echo "Mode: BATCH DELETE (using delete_files API)"
  echo ""
  $PYTHON << 'PYEOF_BATCH'
import json, os, sys
from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["DATASET_REPO"]
repo_type = os.environ["REPO_TYPE"]
delete_date = os.environ["DELETE_DATE"]

# One wildcard pattern per date, written by the listing step.
with open(os.environ["MATCHED_FILES_FILE"] + ".patterns") as fh:
    patterns_to_delete = json.load(fh)

api = HfApi(token=token)

# Each pattern (e.g. "backups/openclaw-backup-20260101*") covers every file
# of one date; group up to 10 patterns into a single commit.
BATCH_SIZE = 10
total = len(patterns_to_delete)
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE
deleted_total = 0
for batch_idx in range(total_batches):
    batch = patterns_to_delete[batch_idx * BATCH_SIZE:(batch_idx + 1) * BATCH_SIZE]
    try:
        api.delete_files(
            repo_id=repo_id,
            repo_type=repo_type,
            delete_patterns=batch,
            commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} date patterns, date {delete_date})"
        )
        deleted_total += len(batch)
        print(f" Batch {batch_idx + 1}/{total_batches}: deleted files matching {len(batch)} date pattern(s)")
    except Exception as e:
        # A failed commit rolls the whole batch back; stop and report progress.
        print(f" Batch {batch_idx + 1}/{total_batches}: FAILED - {e}")
        print(f"\nResult: deleted {deleted_total}/{total} date pattern(s) before failure")
        sys.exit(1)
print(f"\nResult: deleted all files matching {total} date pattern(s) in {total_batches} batch(es)")
PYEOF_BATCH
else
  echo "Mode: INDIVIDUAL DELETE (one by one with retry)"
  echo ""
  $PYTHON << 'PYEOF_INDIVIDUAL'
import json, os, sys, time
from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["DATASET_REPO"]
repo_type = os.environ["REPO_TYPE"]
delete_date = os.environ["DELETE_DATE"]

with open(os.environ["MATCHED_FILES_FILE"]) as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)
total = len(files_to_delete)
deleted = 0
failed = 0
rate_limit_hits = 0
max_retries = 3

for idx, path in enumerate(sorted(files_to_delete), 1):
    retries = 0
    while retries < max_retries:
        try:
            api.delete_file(
                repo_id=repo_id,
                repo_type=repo_type,
                path_in_repo=path,
                commit_message=f"backup cleanup: delete {path} (date {delete_date})"
            )
            deleted += 1
            print(f" [{idx}/{total}] Deleted: {path}")
            break
        except Exception as e:
            err = str(e)
            if "404" in err or "Entry Not Found" in err:
                # Already gone (e.g. removed by a previous run) - count as done.
                print(f" [{idx}/{total}] Skipped (not found): {path}")
                deleted += 1
                break
            elif "429" in err or "Too Many Requests" in err:
                retries += 1
                rate_limit_hits += 1
                if retries >= max_retries:
                    print(f" [{idx}/{total}] Failed (rate limit): {path}")
                    failed += 1
                    break
                # Exponential backoff: 30s, 60s, ... capped at 30 minutes.
                wait = min(2 ** retries * 15, 1800)
                print(f" [{idx}/{total}] Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            else:
                print(f" [{idx}/{total}] Failed: {path} - {err}")
                failed += 1
                break

print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}")
# Propagate partial failure to the shell. The original always exited 0, so
# the script printed a success banner even when some deletions failed.
sys.exit(1 if failed else 0)
PYEOF_INDIVIDUAL
fi

echo ""
echo "============================================"
echo "Done! Cleanup finished."
echo "============================================"