# FoleyCrafter / configs/train/train_semantic_adapter.yaml
# Source-page metadata (appears to be): uploader ymzhang319, commit "init" (7f2690b), 970 bytes.
# Training configuration for the semantic adapter.

# Output location and pretrained weights. Empty strings are kept as explicit
# empty values; how the consumer treats "" is not visible from this file.
output_dir: "outputs"
pretrained_model_path: ""
motion_module_path: "models/mm_sd_v15_v2.ckpt"

# Training dataset settings (nested mapping).
train_data:
  csv_path: "./curated.csv"
  audio_fps: 48000
  # 480000 / 48000 = 10; presumably the clip length in samples (10 s at
  # audio_fps) -- TODO confirm against the dataset loader.
  audio_size: 480000

# Validation settings (nested mapping).
validation_data:
  # NOTE(review): the key is named "prompts" but the entries are image paths.
  prompts:
    - "./data/input/lighthouse.png"
    - "./data/input/guitar.png"
    - "./data/input/lion.png"
    - "./data/input/gun.png"
  num_inference_steps: 25
  guidance_scale: 7.5
  sample_size: 512

# Names listed as trainable; presumably matched against module/parameter
# names by the training script -- verify against the consumer.
trainable_modules:
  - "to_k_ip"
  - "to_v_ip"

audio_unet_checkpoint_path: ""

# Optimisation and schedule.
learning_rate: 1.0e-4
train_batch_size: 1  # max for mixed
gradient_accumulation_steps: 1
# NOTE(review): -1 presumably disables the epoch limit so max_train_steps
# governs the run length -- confirm.
max_train_epoch: -1
max_train_steps: 200000

# Checkpointing and validation cadence.
checkpointing_epochs: 4000
checkpointing_steps: 500
validation_steps: 3000
# Presumably extra steps at which validation also runs -- TODO confirm.
validation_steps_tuple: [2, 50, 300, 1000]

# Reproducibility and runtime flags.
global_seed: 42
mixed_precision_training: true
is_debug: false
resume_ckpt: ""

# params for adapter
init_from_ip_adapter: false
always_null_text: false
# NOTE(review): the name suggests a probability but the value is a boolean --
# confirm the expected type with the consumer.
reverse_null_text_prob: true
frame_wise_condition: true