# Two-step audio preprocessing pipeline:
#   1. split the audio clips and write a partition file;
#   2. extract audio latents for the clips listed in that partition file.
# Fill in the empty variables below before running.

# Fail fast: stop at the first failing command or unset variable so the
# latent-extraction step never runs on a missing or partial partition file.
set -euo pipefail

## split audio clips
PATH_TO_AUDIO_DIR=      # dir of audio clips, e.g.: /home/to/audiocaps_wav
OUTPUT_PARTITION_FILE=  # output partition file path, e.g.: /home/to/output/audiocaps-test-partition.tsv

# Abort with a clear message when a required path was left empty, instead of
# handing the Python scripts a flag with no value.
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory of audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the output partition file path}"

# NOTE(review): the partition *file* path is passed via --output_dir; this
# mirrors the original invocation -- confirm against partition_clips.py.
python training/partition_clips.py \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --output_dir "$OUTPUT_PARTITION_FILE"

## extract audio latents
# Expose only GPU 0; torchrun below starts a single process (--nproc_per_node=1).
export CUDA_VISIBLE_DEVICES=0
CAPTIONS_TSV=./sets/audiocaps-test.tsv  # captions tsv path, e.g.: /home/to/audiocaps-test.tsv
OUTPUT_LATENT_DIR=  # output latent dir, e.g.: /home/to/output/audiocaps-test-latent
OUTPUT_NPZ_DIR=     # output npz dir, e.g.: /home/to/output/audiocaps-test-npz

: "${OUTPUT_LATENT_DIR:?set OUTPUT_LATENT_DIR to the output latent directory}"
: "${OUTPUT_NPZ_DIR:?set OUTPUT_NPZ_DIR to the output npz directory}"

# --text_encoder options: ['clip', 't5', 't5_clap']
torchrun --standalone --nproc_per_node=1 training/extract_audio_latents.py \
  --captions_tsv "$CAPTIONS_TSV" \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --clips_tsv "$OUTPUT_PARTITION_FILE" \
  --latent_dir "$OUTPUT_LATENT_DIR" \
  --output_dir "$OUTPUT_NPZ_DIR" \
  --text_encoder='t5_clap'