| |
| """ |
| Monitor HF checkpoints repo and delete intermediate checkpoints. |
| Keeps every 5000th step checkpoint plus the latest one; deletes the rest. |
| Runs in a loop, checking every 10 minutes. |
| """ |
|
|
| import time |
| import re |
| import os |
| from huggingface_hub import HfApi |
|
|
# Repository holding the checkpoints; override with the CHECKPOINT_REPO env var.
REPO_ID = os.getenv(
    "CHECKPOINT_REPO",
    "StrongRoboticsLab/pi05-so100-diverse-checkpoints",
)
# Checkpoints whose step number is a multiple of this value are retained.
KEEP_EVERY = 5000
# Seconds to wait between two cleanup passes.
CHECK_INTERVAL = 600

# Shared Hub client; the token is read from the HF_TOKEN env var (None if unset).
api = HfApi(token=os.getenv("HF_TOKEN"))
|
|
|
|
def get_checkpoint_steps():
    """List all checkpoint step numbers in the repo.

    Looks at the top-level entries of ``REPO_ID`` and collects the numeric
    part of every entry named exactly ``step_<digits>``.

    Returns:
        A sorted list of unique step numbers (ints). If listing the repo
        fails for any reason, the error is printed and an empty list is
        returned so the caller can simply retry on the next pass.
    """
    try:
        entries = api.list_repo_tree(REPO_ID, recursive=False)
        steps = set()
        for entry in entries:
            # Checkpoints are top-level folders. Folder entries expose their
            # name as `path`; `rfilename` is only a legacy alias on file
            # entries, so reading it on a folder raised and made this
            # function always return [].
            # NOTE(review): confirm against the installed huggingface_hub.
            name = getattr(entry, "path", None) or getattr(entry, "rfilename", "")
            # fullmatch: ignore stray names such as "step_100.json".
            match = re.fullmatch(r"step_(\d+)", name)
            if match:
                steps.add(int(match.group(1)))
        return sorted(steps)
    except Exception as e:
        # Best-effort: the monitor loop must survive transient Hub errors.
        print(f"Error listing repo: {e}")
        return []
|
|
|
|
def cleanup():
    """Prune intermediate checkpoints from the repo.

    A checkpoint is retained when it is the highest step present or when its
    step number is a multiple of ``KEEP_EVERY``; every other checkpoint
    folder is deleted. Does nothing when fewer than two checkpoints exist.
    Individual deletion failures are printed and do not stop the pass.
    """
    steps = get_checkpoint_steps()
    if len(steps) < 2:
        return

    newest = max(steps)

    def is_retained(step):
        # The newest checkpoint is always kept, milestone or not.
        return step == newest or step % KEEP_EVERY == 0

    to_keep = [step for step in steps if is_retained(step)]
    to_delete = [step for step in steps if not is_retained(step)]

    if not to_delete:
        print(f"Nothing to delete. {len(to_keep)} checkpoints kept.")
        return

    print(f"Keeping {len(to_keep)} checkpoints: {to_keep}")
    print(f"Deleting {len(to_delete)} checkpoints...")

    for step in to_delete:
        # Folder names are built with the step zero-padded to six digits.
        folder = f"step_{step:06d}"
        try:
            api.delete_folder(
                path_in_repo=folder,
                repo_id=REPO_ID,
                repo_type="model",
            )
            print(f" Deleted {folder}")
        except Exception as e:
            print(f" Failed to delete {folder}: {e}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run a cleanup pass, wait CHECK_INTERVAL seconds,
    # and repeat until the process is stopped.
    print(f"Monitoring {REPO_ID}, keeping every {KEEP_EVERY} steps")
    sleep_notice = f"Sleeping {CHECK_INTERVAL}s..."
    while True:
        cleanup()
        print(sleep_notice)
        time.sleep(CHECK_INTERVAL)
|
|