wd / train.sh
AlotaibiFahad's picture
End of training
16b48db verified
#!/bin/bash
#
# Provision a machine and start a training run:
#   1. download and install AzCopy v10 into /usr/bin (needs sudo),
#   2. download the "programs" folder from Azure Blob Storage,
#   3. create and activate the "base2" conda environment (Python 3.11)
#      and install the packages listed in req.txt,
#   4. launch run.py in the background, logging to output.txt,
#   5. sync DIR_NAME back to blob storage every 15 minutes, forever.
#
# Environment:
#   SAS_TOKEN  optional; overrides the built-in SAS token when set.
#
# Must be run from the directory that contains req.txt, run.py and config/.

# Abort on the first failing setup step instead of launching a training run
# on a half-provisioned machine. `-u` is deliberately left off: conda's
# activation hooks may reference unset variables.
set -eo pipefail
trap 'printf "train.sh: command failed near line %s\n" "${LINENO}" >&2' ERR

# Download and install AzCopy.
# Extract into a fresh temporary directory so the version glob below matches
# exactly one directory, even when the script is rerun after a new release.
azcopy_tmp=$(mktemp -d)
wget -q https://aka.ms/downloadazcopy-v10-linux -O "${azcopy_tmp}/azcopy-v10.tar.gz"
tar -xf "${azcopy_tmp}/azcopy-v10.tar.gz" -C "${azcopy_tmp}"
sudo cp "${azcopy_tmp}"/azcopy_linux_amd64_*/azcopy /usr/bin/
rm -rf -- "${azcopy_tmp:?}"

# Define blob storage path and SAS token.
export PATH_TO_THE_BLOB="https://multimodel.blob.core.windows.net/fahad2"
# NOTE(review): a credential is committed here in plain text. Prefer passing
# SAS_TOKEN through the environment and rotating/removing this default; it is
# kept only so existing invocations without SAS_TOKEN keep working.
export SAS_TOKEN="${SAS_TOKEN:-sp=racwdli&st=2025-01-06T08:40:20Z&se=2025-03-31T16:40:20Z&spr=https&sv=2022-11-02&sr=c&sig=O7v6p%2FE6IMF8uFeDoLYjS7Jy8Y0I9GLt9M3A%2B7UclH4%3D}"

# Make the local download directory (-p: do not fail when it already exists).
mkdir -p tmp_20_01_2025

# Use AzCopy to download files from blob storage.
azcopy copy "${PATH_TO_THE_BLOB}/tmp_18_01_2025/programs?${SAS_TOKEN}" "tmp_20_01_2025/" --recursive

# Create the conda environment, suppressing confirmation prompts.
conda create -n base2 python=3.11 -y

# `conda activate` only works after conda's shell integration has been
# loaded; a non-interactive script does not get it from ~/.bashrc.
conda_base=$(conda info --base)
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh"
conda activate base2

# Install required Python packages.
python -m pip install -r req.txt

# Run the program in the background; stdout and stderr go to output.txt.
nohup python run.py config/examples/train_lora_flux_schnell_24gb.yaml > output.txt 2>&1 &

# Local directory that is mirrored to blob storage.
# NOTE(review): the download above goes to tmp_20_01_2025 while this uploads
# tmp_18_01_2025 -- confirm that the differing names are intentional.
export DIR_NAME="tmp_18_01_2025"
readonly SYNC_INTERVAL_SECONDS=900

# One-off full upload, kept for reference:
# azcopy copy "${DIR_NAME}" "${PATH_TO_THE_BLOB}/${DIR_NAME}?${SAS_TOKEN}" --recursive=true

# Loop to sync the workspace to the blob container every 15 minutes.
while true; do
  # A failed sync (e.g. a network blip) is reported but must not end the
  # loop; the next iteration retries.
  azcopy sync "${DIR_NAME}" "${PATH_TO_THE_BLOB}/${DIR_NAME}?${SAS_TOKEN}" --recursive=true \
    || printf 'train.sh: azcopy sync failed; retrying in %s s\n' "${SYNC_INTERVAL_SECONDS}" >&2
  sleep "${SYNC_INTERVAL_SECONDS}"
done