# oracle / archive_cache.sh
# zirobtc's picture
# Upload folder using huggingface_hub
# f53b3ee
# Pack the cached sample tensors ("sample_*.pt" directly inside CACHE_DIR)
# into numbered zip archives of at most BATCH_SIZE files each, written to
# OUTPUT_DIR as cache_batch_NNN.zip.
#
# NOTE: zip is run with -m, so the source files are deleted from CACHE_DIR
# once the batch containing them has been written without error.
#
# Requires: find, split (with -d numeric suffixes), zip, du, mktemp.
# Exit status: 0 on success or when there is nothing to archive,
#              non-zero as soon as any step fails.

CACHE_DIR="data/cache"
OUTPUT_DIR="data/archives"
BATCH_SIZE=60000 # Smaller for frequent updates

# Scratch directory holding the file lists; created by main, removed by cleanup.
WORK_DIR=""

# Remove the scratch directory, if one is currently allocated.
cleanup() {
  if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then
    rm -rf -- "$WORK_DIR"
  fi
  WORK_DIR=""
}

# Scan CACHE_DIR, split the file list into batches and zip each batch in turn.
# Globals: CACHE_DIR, OUTPUT_DIR, BATCH_SIZE, WORK_DIR (all read).
# Returns: 0 on success / nothing to do, non-zero on the first failing step.
archive_batches() {
  local all_files="$WORK_DIR/all_files.txt"
  local total_files list_file part_num zip_name batch_count zip_status size

  if ! mkdir -p "$OUTPUT_DIR"; then
    echo "ERROR: Cannot create output directory '$OUTPUT_DIR'" >&2
    return 1
  fi

  echo "========================================================"
  echo "Archiving '$CACHE_DIR' into multiple zip files..."
  echo "Batch Size: $BATCH_SIZE files per archive"
  echo "========================================================"

  echo "Scanning for .pt files..."
  if [ -d "$CACHE_DIR" ]; then
    if ! find "$CACHE_DIR" -maxdepth 1 -name "sample_*.pt" > "$all_files"; then
      echo "ERROR: Failed to scan '$CACHE_DIR'" >&2
      return 1
    fi
  else
    # A missing cache directory is treated as "nothing to archive".
    echo "WARNING: Cache directory '$CACHE_DIR' does not exist." >&2
    : > "$all_files"
  fi

  # Arithmetic expansion strips the padding some wc implementations emit.
  total_files=$(($(wc -l < "$all_files")))
  echo "Found $total_files .pt files."

  if [ "$total_files" -eq 0 ]; then
    echo "No files found to archive."
    return 0
  fi

  # Split the list into per-batch chunk files (numeric suffixes 000, 001, ...).
  # The chunks live in the private scratch directory, so leftovers from an
  # earlier aborted run can never be mistaken for batches of this run.
  if ! split -l "$BATCH_SIZE" -d -a 3 "$all_files" "$WORK_DIR/file_list_part_"; then
    echo "ERROR: Failed to split the file list into batches" >&2
    return 1
  fi

  echo "Starting sequential archiving..."
  for list_file in "$WORK_DIR"/file_list_part_*; do
    [ -f "$list_file" ] || continue

    part_num=${list_file##*_}
    zip_name="$OUTPUT_DIR/cache_batch_$part_num.zip"
    # The last batch is usually shorter than BATCH_SIZE; report the real count.
    batch_count=$(($(wc -l < "$list_file")))

    echo "[$(date +%T)] Starting batch $part_num ($batch_count files) -> $zip_name"

    # zip options:
    #   -1  fastest compression
    #   -m  move: delete the source files after the archive is written
    #   -q  quiet
    #   -j  junk directory paths, store bare file names
    #   -@  read the names of the files to add from stdin
    # If $zip_name already exists, zip adds to / updates that archive.
    zip -1 -mq -j "$zip_name" -@ < "$list_file"
    zip_status=$?

    # Check zip's own exit status: the mere existence of the archive proves
    # nothing when a file of that name is left over from a previous run.
    if [ "$zip_status" -ne 0 ]; then
      echo "ERROR: zip exited with status $zip_status while writing $zip_name" >&2
      return 1
    fi
    if [ ! -f "$zip_name" ]; then
      echo "ERROR: Failed to create $zip_name" >&2
      return 1
    fi

    size=$(du -h "$zip_name" | cut -f1)
    echo "[$(date +%T)] Finished batch $part_num (Size: $size)"
    rm -f -- "$list_file"
  done

  echo "========================================================"
  echo "Done! Archives are in $OUTPUT_DIR"
  echo "========================================================"
  return 0
}

# Entry point: allocate the scratch directory, run the archiving and always
# clean up afterwards. Returns the status of archive_batches.
main() {
  local status

  WORK_DIR=$(mktemp -d) || {
    echo "ERROR: Cannot create a temporary working directory" >&2
    return 1
  }
  # Also clean up if the script exits early.
  trap cleanup EXIT

  archive_batches
  status=$?

  cleanup
  return "$status"
}

main "$@" || exit $?