---
# Fine-tuning run configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML. The block structure below is reconstructed from the keys:
# `train_data` and `validation_data` are nested mappings because `width` and
# `height` appear once under each (they would be duplicate keys at one level).
# Everything from `dataset_types` onward is assumed to be top-level -- confirm
# against the consumer of this file.

# Model input and run output locations.
pretrained_model_path: /home/ubuntu/models/model_scope_diffusers/
output_dir: /home/ubuntu/outputs

# Training data settings.
train_data:
  width: 1024
  height: 576
  use_bucketing: true
  sample_start_idx: 1
  fps: 24
  frame_step: 1
  n_sample_frames: 20
  json_path: /home/ubuntu/Video-BLIP2-Preprocessor/train_data/my_videos.json

# Validation sampling settings. The resolution matches train_data
# (1024x576); num_frames (16) differs from train_data.n_sample_frames (20).
validation_data:
  prompt: duck
  sample_preview: false
  num_frames: 16
  width: 1024
  height: 576
  num_inference_steps: 25
  guidance_scale: 9

dataset_types:
  - json
validation_steps: 500
extra_unet_params: null
extra_text_encoder_params: null

# Batch size, step count, learning rate and scheduler settings.
train_batch_size: 1
max_train_steps: 50000
learning_rate: 5.0e-06
scale_lr: false
lr_scheduler: constant
lr_warmup_steps: 0
adam_beta1: 0.9
adam_beta2: 0.999
adam_weight_decay: 0.01
adam_epsilon: 1.0e-08
max_grad_norm: 1.0
gradient_accumulation_steps: 1

checkpointing_steps: 5000
resume_from_checkpoint: null

# Precision and attention backend switches.
mixed_precision: fp16
use_8bit_adam: false
enable_xformers_memory_efficient_attention: false
enable_torch_2_attn: true

seed: 64
extend_dataset: false
cached_latent_dir: null

# NOTE(review): both LoRA switches (`use_unet_lora` here, `use_text_lora` at
# the end of the file) are false, so the LoRA module lists, rank and path
# below are presumably unused in this run -- confirm.
use_unet_lora: false
unet_lora_modules:
  - ResnetBlock2D
text_encoder_lora_modules:
  - CLIPEncoderLayer
lora_rank: 16
lora_path: ''
kwargs: {}

cache_latents: false
gradient_checkpointing: true
# NOTE(review): a strength is set here while `use_offset_noise` (below) is
# false; presumably the strength is ignored in that case -- confirm.
offset_noise_strength: 0.1
text_encoder_gradient_checkpointing: false

train_text_encoder: true
trainable_modules:
  - attn1
  - attn2
  - temp_conv
trainable_text_modules:
  - all
use_offset_noise: false
use_text_lora: false