| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
set -euo pipefail

# Resolve the directory holding this script and the repository root.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# Interpreter selection: prefer `uv run python3` when this repo is a uv
# project, plain python3 otherwise. NOTE: PYTHON is deliberately a
# multi-word string that is expanded unquoted at every call site.
PYTHON="python3"
if command -v uv &>/dev/null && [[ -f "$REPO_ROOT/pyproject.toml" ]]; then
  PYTHON="uv run python3"
  echo "[UV] Detected uv project, using: $PYTHON"
fi
|
|
| |
# ---------------------------------------------------------------------
# Argument handling — positional args take precedence over env vars:
#   $1 → DATASET_REPO (falls back to OPENCLAW_BACKUP_DATASET_REPO)
#   $2 → DELETE_DATE  (YYYYMMDD)
#   $3 → HF_TOKEN
# ---------------------------------------------------------------------
DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
DELETE_DATE="${2:-${DELETE_DATE:-}}"
HF_TOKEN="${3:-${HF_TOKEN:-}}"


# Both required values must be present; otherwise print usage and stop.
if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
  cat <<USAGE
Usage: $0 <DATASET_REPO> <DATE> [HF_TOKEN]

Examples:
 $0 GGSheng/page-backup 20260427 hf_xxxxx
 DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0
 $0 GGSheng/page-backup 20260427

Environment variables:
 DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)
 DELETE_DATE - 要清理的日期 (YYYYMMDD)
 HF_TOKEN - Hugging Face API Token
 HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)
 DRY_RUN - 设为 1 仅预览 (默认 0)
 BATCH_DELETE - 设为 1 使用批量删除 (默认 0)
USAGE
  exit 1
fi
|
|
# Format check: DELETE_DATE must be exactly eight digits (YYYYMMDD).
if ! [[ "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
  echo "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)"
  exit 1
fi


# Calendar check: reject impossible dates such as 20260231.
# GNU date parses with -d; BSD/macOS date needs -j -f — try both so the
# check is portable. (The previous `&>/dev/null 2>&1` also stacked two
# redundant stderr redirections.)
iso_date="${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
if ! date -d "$iso_date" >/dev/null 2>&1 \
  && ! date -j -f '%Y-%m-%d' "$iso_date" >/dev/null 2>&1; then
  echo "Error: 无效的日期 '$DELETE_DATE'"
  exit 1
fi
|
|
| |
# No token supplied yet — fall back to the cache file written by
# `huggingface-cli login` (path overridable via HF_TOKEN_FILE).
if [[ -z "$HF_TOKEN" ]]; then
  HF_TOKEN_FILE="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
  if [[ -f "$HF_TOKEN_FILE" ]]; then
    # $(< file) is the bash builtin equivalent of $(cat file).
    HF_TOKEN="$(<"$HF_TOKEN_FILE")"
  fi
fi


# A token is mandatory for all API calls below.
if [[ -z "$HF_TOKEN" ]]; then
  echo "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure ~/.cache/huggingface/token exists."
  exit 1
fi
|
|
# Runtime options (all overridable from the environment).
DRY_RUN="${DRY_RUN:-0}"
REPO_TYPE="${REPO_TYPE:-dataset}"
BATCH_DELETE="${BATCH_DELETE:-0}"


# Human-readable labels for the banner.
dry_run_label='NO'
if [[ "$DRY_RUN" == "1" ]]; then
  dry_run_label='YES (preview only)'
fi
delete_mode_label='INDIVIDUAL (delete_file per file)'
if [[ "$BATCH_DELETE" == "1" ]]; then
  delete_mode_label='BATCH (delete_files API)'
fi

echo "============================================"
echo "OpenClaw HF Dataset Cleanup Script"
echo "============================================"
echo "Dataset: $DATASET_REPO"
echo "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
echo "DRY_RUN: $dry_run_label"
echo "Delete Mode: $delete_mode_label"
echo ""
|
|
| |
# Temp files created by this script; removed by the EXIT trap below.
CLEANUP_FILES=()

# Remove every registered temp file. The length guard matters: under
# `set -u`, expanding "${CLEANUP_FILES[@]}" on an EMPTY array raises an
# "unbound variable" error on bash < 4.4, and the trap can fire before
# any file has been registered (e.g. on an early validation exit).
cleanup() {
  local f
  if (( ${#CLEANUP_FILES[@]} > 0 )); then
    for f in "${CLEANUP_FILES[@]}"; do
      rm -f -- "$f"
    done
  fi
}
trap cleanup EXIT


echo "[1/3] Fetching file list from Dataset..."


# Scratch files holding the full and matched file lists (JSON).
ALL_FILES_FILE="$(mktemp)"
MATCHED_FILES_FILE="$(mktemp)"
CLEANUP_FILES+=("$ALL_FILES_FILE" "$MATCHED_FILES_FILE")
|
|
| |
# Abort early, with install instructions, when huggingface_hub is not
# importable by the selected interpreter.
if ! $PYTHON -c "import huggingface_hub" 2>/dev/null; then
  printf '%s\n' \
    "Error: 'huggingface_hub' module not found." \
    "" \
    "Install it with:" \
    " uv add huggingface_hub" \
    "" \
    "Or with CLI support:" \
    " uv add 'huggingface_hub[cli]'"
  exit 1
fi
|
|
| |
# --- Diagnostics: how is huggingface_hub installed? --------------------
# (The module's presence was already verified right above, so the former
# duplicate import re-check here was unreachable and has been removed.)
echo ""
echo "[INFO] huggingface_hub version and installation information:"
|
|
| |
# Gather version and on-disk location of the imported module; every
# probe falls back to "N/A" if the interpreter or module is unavailable.
MODULE_VERSION=$($PYTHON -c 'import huggingface_hub; print(huggingface_hub.__version__)' 2>/dev/null || echo "N/A")
hub_module_path=$($PYTHON -c 'import huggingface_hub; print(huggingface_hub.__file__)' 2>/dev/null || echo "N/A")
MODULE_DIR=$(dirname "$hub_module_path" 2>/dev/null || echo "N/A")
echo " - Module version: $MODULE_VERSION"
echo " - Module location: $MODULE_DIR"
|
|
| |
# Package metadata as reported by the installer: pip first, then uv.
# BUG FIX: the original `pip show … | grep … | awk …` pipelines aborted
# the entire script under `set -euo pipefail` whenever the package was
# missing (grep exits 1 → pipefail → failing assignment → errexit).
# A single awk stage always exits 0, and the `||` guards keep the probe
# best-effort even when pip/uv themselves fail.
PKG_VERSION=""
PKG_LOCATION=""
if $PYTHON -m pip --version &>/dev/null; then
  PKG_VERSION=$($PYTHON -m pip show huggingface-hub 2>/dev/null | awk '/^Version:/ {print $2}') || PKG_VERSION=""
  PKG_LOCATION=$($PYTHON -m pip show huggingface-hub 2>/dev/null | awk '/^Location:/ {print $2}') || PKG_LOCATION=""
fi


# Fall back to uv when pip is unavailable or found nothing.
if [ -z "$PKG_VERSION" ] && command -v uv &>/dev/null; then
  PKG_VERSION=$(uv pip show huggingface-hub 2>/dev/null | awk '/^Version:/ {print $2}') || PKG_VERSION=""
  PKG_LOCATION=$(uv pip show huggingface-hub 2>/dev/null | awk '/^Location:/ {print $2}') || PKG_LOCATION=""
fi
|
|
# Report package-level info; note any mismatch against the module's
# own reported version.
if [ -z "$PKG_VERSION" ]; then
  echo " - Package version: Unable to determine"
else
  echo " - Package version: $PKG_VERSION"
  if [ -n "$PKG_LOCATION" ]; then
    echo " - Package location: $PKG_LOCATION"
  fi
  if [ "$MODULE_VERSION" != "N/A" ] && [ "$MODULE_VERSION" != "$PKG_VERSION" ]; then
    echo " (Note: Module version $MODULE_VERSION may differ from package version $PKG_VERSION)"
  fi
fi
|
|
| |
# Interpreter path actually used by $PYTHON ("N/A" when unavailable).
PYTHON_EXEC=$($PYTHON -c 'import sys; print(sys.executable)' 2>/dev/null || echo "N/A")
echo " - Python executable: $PYTHON_EXEC"


# Whether the optional CLI extra is importable.
if $PYTHON -c 'import huggingface_hub.commands.huggingface_cli' 2>/dev/null; then
  CLI_AVAILABLE="YES"
else
  CLI_AVAILABLE="NO"
fi
echo " - CLI support (huggingface_hub[cli]): $CLI_AVAILABLE"
if [ "$CLI_AVAILABLE" = "NO" ]; then
  echo " (To install CLI support: uv add 'huggingface_hub[cli]' or pip install 'huggingface_hub[cli]')"
fi
|
|
| |
# --- Diagnostics: where are the related dependencies installed? --------
echo ""
echo "[INFO] Related dependencies installation locations:"
$PYTHON << 'PYEOF_DEPS'
# Print the on-disk location of each dependency used by this script.
# (The former `import sys` / `import pkgutil` lines were unused and
# have been removed.)
dependencies = ['huggingface_hub', 'requests', 'tqdm', 'typer']

print(" Checking module locations:")
for dep in dependencies:
    try:
        mod = __import__(dep)
        if hasattr(mod, '__file__') and mod.__file__:
            print(f" - {dep}: {mod.__file__}")
        else:
            print(f" - {dep}: built-in or namespace package")
    except ImportError:
        print(f" - {dep}: NOT INSTALLED")
PYEOF_DEPS


echo ""
|
|
# Query the repo's full file list and compute which files fall on or
# before DELETE_DATE. Results are written as JSON to the temp files:
#   ALL_FILES_FILE      - every path in the repo
#   MATCHED_FILES_FILE  - paths selected for deletion
#   *.patterns          - one wildcard pattern per affected date
# NOTE(review): HF_TOKEN is interpolated directly into the Python source
# below; a token containing triple quotes or a trailing backslash would
# break the syntax — consider passing it via the environment instead.
# TODO confirm HF tokens can never contain such characters.
$PYTHON << PYEOF
import json, sys
from huggingface_hub import HfApi

token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
max_date = ${DELETE_DATE}  # eight digits, validated by the shell above

api = HfApi(token=token)
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print(f"[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print(f"[HINT] Backup dataset is typically named like '<space-name>-backup'.", file=sys.stderr)
        print(f"[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    # Without a file list there is nothing to clean up — abort.
    sys.exit(1)

matched = []
date_set = set()  # every date (YYYYMMDD) that has at least one file to delete
for f in files:
    # File path format: backups/openclaw-backup-YYYYMMDD-HHMMSS.tar.gz.enc
    if "openclaw-backup-" in f:
        parts = f.split("openclaw-backup-")
        date_part = parts[1][:8]  # extract YYYYMMDD
        if date_part.isdigit() and int(date_part) <= max_date:
            matched.append(f)
            date_set.add(date_part)

print(f"Total files: {len(files)}, matched: {len(matched)}")

# Build one wildcard pattern per date:
#   file path: backups/openclaw-backup-YYYYMMDD-HHMMSS.tar.gz.enc
#   pattern:   backups/openclaw-backup-YYYYMMDD*
pattern_list = sorted([f"backups/openclaw-backup-{d}*" for d in date_set])
print(f"Delete patterns ({len(pattern_list)} date patterns):")
for p in pattern_list:
    print(f" - {p}")

with open("${ALL_FILES_FILE//\\///}", 'w') as fh:
    json.dump(files, fh)
with open("${MATCHED_FILES_FILE//\\///}", 'w') as fh:
    json.dump(matched, fh)
# Save the wildcard pattern list for the batch-delete step below.
with open("${MATCHED_FILES_FILE//\\///}.patterns", 'w') as fh:
    json.dump(pattern_list, fh)
PYEOF
|
|
# Number of files selected for deletion (length of the matched list).
MATCHED_COUNT=$($PYTHON -c "import json; print(len(json.load(open('${MATCHED_FILES_FILE//\\///}'))))")


# Nothing matched — we are done.
if (( MATCHED_COUNT == 0 )); then
  echo "No matching backup files found. Nothing to clean up."
  exit 0
fi


echo ""
echo "Matched files:"
# Print the matched paths sorted, one per line.
$PYTHON -c "
import json, sys
for name in sorted(json.load(open(sys.argv[1]))):
    print(f' - {name}')
" "${MATCHED_FILES_FILE//\\///}"
|
|
| |
# Announce what is about to happen before anything destructive runs.
echo ""
echo "[2/3] Ready to delete $MATCHED_COUNT file(s)..."


# Preview mode: stop here without touching the repo.
if [[ "$DRY_RUN" == "1" ]]; then
  echo "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete."
  exit 0
fi
|
|
| |
# ---------------------------------------------------------------------
# [3/3] Delete the matched files. Two modes:
#   BATCH_DELETE=1 -> delete_files() with per-date wildcard patterns,
#                     one commit per batch of BATCH_SIZE patterns.
#   BATCH_DELETE=0 -> delete_file() per path (one commit each), with
#                     exponential backoff on HTTP 429 rate limits.
# ---------------------------------------------------------------------
echo "[3/3] Deleting files..."


BATCH_DELETE="${BATCH_DELETE:-0}"  # already defaulted above; harmless re-default


if [[ "$BATCH_DELETE" = "1" ]]; then
  echo "Mode: BATCH DELETE (using delete_files API)"
  echo ""


  # NOTE(review): HF_TOKEN is interpolated directly into the Python
  # source; a token containing triple quotes would break the syntax.
  $PYTHON << PYEOF_BATCH
import json, sys
from huggingface_hub import HfApi

token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
delete_date = """${DELETE_DATE}"""

# Wildcard patterns (one per date) written by the listing step above.
with open("${MATCHED_FILES_FILE//\\///}.patterns", 'r') as fh:
    patterns_to_delete = json.load(fh)

api = HfApi(token=token)

# Delete by wildcard pattern; each pattern covers one day's backups,
# e.g. "openclaw-backup-20260101*" matches every file for that date.
BATCH_SIZE = 10  # at most 10 date patterns per commit
total = len(patterns_to_delete)
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
deleted_total = 0

for batch_idx in range(total_batches):
    start = batch_idx * BATCH_SIZE
    end = min(start + BATCH_SIZE, total)
    batch = patterns_to_delete[start:end]

    try:
        api.delete_files(
            repo_id=repo_id,
            repo_type=repo_type,
            delete_patterns=batch,
            commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} date patterns, date {delete_date})"
        )
        deleted_total += len(batch)
        print(f" Batch {batch_idx + 1}/{total_batches}: deleted files matching {len(batch)} date pattern(s)")
    except Exception as e:
        # Abort on the first failed batch; report progress so far.
        print(f" Batch {batch_idx + 1}/{total_batches}: FAILED - {e}")
        print(f"\nResult: deleted {deleted_total}/{total} date pattern(s) before failure")
        sys.exit(1)

print(f"\nResult: deleted all files matching {total} date pattern(s) in {total_batches} batch(es)")
PYEOF_BATCH


else
  echo "Mode: INDIVIDUAL DELETE (one by one with retry)"
  echo ""


  $PYTHON << PYEOF_INDIVIDUAL
import json, sys, time

token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
delete_date = """${DELETE_DATE}"""

# Full list of matched file paths written by the listing step above.
with open("${MATCHED_FILES_FILE//\\///}", 'r') as fh:
    files_to_delete = json.load(fh)

from huggingface_hub import HfApi
api = HfApi(token=token)

total = len(files_to_delete)
deleted = 0           # successfully removed (or already absent)
failed = 0            # gave up after errors / exhausted retries
rate_limit_hits = 0   # number of 429 responses observed
max_retries = 3

for idx, path in enumerate(sorted(files_to_delete), 1):
    retries = 0
    while retries < max_retries:
        try:
            api.delete_file(
                repo_id=repo_id,
                repo_type=repo_type,
                path_in_repo=path,
                commit_message=f"backup cleanup: delete {path} (date {delete_date})"
            )
            deleted += 1
            print(f" [{idx}/{total}] Deleted: {path}")
            break
        except Exception as e:
            err = str(e)
            if "404" in err or "Entry Not Found" in err:
                # Already gone — count as success and move on.
                print(f" [{idx}/{total}] Skipped (not found): {path}")
                deleted += 1
                break
            elif "429" in err or "Too Many Requests" in err:
                # Rate limited: exponential backoff, capped at 30 minutes.
                retries += 1
                rate_limit_hits += 1
                if retries >= max_retries:
                    print(f" [{idx}/{total}] Failed (rate limit): {path}")
                    failed += 1
                    break
                wait = min(2 ** retries * 15, 1800)
                print(f" [{idx}/{total}] Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            else:
                # Any other error: record it and continue with the next file.
                print(f" [{idx}/{total}] Failed: {path} - {err}")
                failed += 1
                break

print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}")
PYEOF_INDIVIDUAL
fi
|
|
# Closing banner.
printf '\n'
printf '%s\n' \
  "============================================" \
  "Done! Cleanup finished." \
  "============================================"
|
|