---
# Training configuration: output and checkpoint paths, dataset settings,
# validation inputs, and the optimisation / checkpointing schedule.
# NOTE(review): the block layout below was reconstructed from key order;
# confirm the nesting against the script that consumes this file.

output_dir: "outputs"
pretrained_model_path: ""
motion_module_path: "models/mm_sd_v15_v2.ckpt"

train_data:
  csv_path: "./curated.csv"
  audio_fps: 48000
  # 480000 / 48000 = 10 (presumably seconds of audio per sample; confirm).
  audio_size: 480000

validation_data:
  # Paths to the input images used as validation prompts.
  prompts:
    - "./data/input/lighthouse.png"
    - "./data/input/guitar.png"
    - "./data/input/lion.png"
    - "./data/input/gun.png"
  num_inference_steps: 25
  guidance_scale: 7.5
  # NOTE(review): kept under validation_data because it directly follows
  # guidance_scale; move to the top level if the consumer expects it there.
  sample_size: 512

# Entries end with "." in the original; presumably matched as parameter-name
# prefixes/substrings when selecting what to train (confirm in the trainer).
trainable_modules:
  - "time_conv_in."
  - "conv_in."

video_unet_checkpoint_path: "models/vggsound_unet.ckpt"
audio_unet_checkpoint_path: ""

learning_rate: 5.0e-5
train_batch_size: 1  # max for mixed
gradient_accumulation_steps: 1

# NOTE(review): -1 presumably means "no epoch limit" so that max_train_steps
# governs the run length; verify against the training loop.
max_train_epoch: -1
max_train_steps: 500000
checkpointing_epochs: 4000
checkpointing_steps: 500

validation_steps: 3000
# NOTE(review): presumably extra step indices at which validation also runs;
# confirm against the training loop.
validation_steps_tuple: [2, 300, 1000]

global_seed: 42
mixed_precision_training: true
is_debug: false

resume_ckpt: ""
zero_no_label_mel: false