# Run mode, output location, and base model for this training config.
# NOTE(review): the flag name suggests an image-level fine-tuning stage;
# confirm how the training script branches on it.
image_finetune: true

# Both paths are relative; presumably resolved against the launch directory
# (verify against the trainer).
output_dir: 'outputs'
pretrained_model_path: 'models/StableDiffusion/stable-diffusion-v1-5'

# Settings for the training noise scheduler. The key name suggests these are
# forwarded as keyword arguments to a scheduler constructor; confirm which
# scheduler class the trainer instantiates before renaming or adding keys.
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: 'scaled_linear'
  steps_offset: 1
  clip_sample: false

# Training dataset: a CSV index, the folder holding the media it refers to,
# and the sample size.
# NOTE(review): both paths are absolute and specific to one machine/user;
# they have to be edited before this config can run elsewhere. The file names
# also point at a "val" split; confirm that is the intended training source.
train_data:
  csv_path: '/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv'
  video_folder: '/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val'
  # Presumably the spatial resolution in pixels; TODO confirm.
  sample_size: 256

# Prompts and sampling settings used for validation.
validation_data:
  # Prompt strings are runtime inputs; keep them byte-for-byte (including
  # spelling) so validation outputs stay comparable across runs.
  prompts:
    - 'Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons.'
    - 'A drone view of celebration with Christma tree and fireworks, starry sky - background.'
    - 'Robot dancing in times square.'
    - 'Pacific coast, carmel by the sea ocean and waves.'
  num_inference_steps: 25
  # Written with an explicit fractional digit; the parsed value is the float 8.0.
  guidance_scale: 8.0

# Patterns selecting which modules are trained.
# NOTE(review): the single entry '.' presumably matches every parameter name
# (i.e. train everything); verify how the trainer matches these patterns.
trainable_modules:
  - '.'

# Empty string, not null. Presumably means "no extra UNet checkpoint to load";
# confirm against the trainer.
unet_checkpoint_path: ''

# Optimisation settings.
# Keep the decimal point and the signed exponent in the learning rate: some
# YAML 1.1 loaders treat a bare form such as 1e-5 as a string, not a float.
learning_rate: 1.0e-5
train_batch_size: 50

# Run length and checkpoint cadence. Each is given both per-epoch and per-step;
# -1 presumably disables that variant (TODO confirm against the trainer).
max_train_epoch: -1
max_train_steps: 100
checkpointing_epochs: -1
checkpointing_steps: 60

# NOTE(review): validation_steps (5000) is larger than max_train_steps (100).
# If validation is interval-based, only the explicit steps listed in
# validation_steps_tuple would trigger it in this run; confirm.
validation_steps: 5000
validation_steps_tuple: [2, 50]

# Reproducibility and runtime switches.
# Booleans are spelled in canonical lowercase (true/false) to match the rest
# of this file and to avoid loader-dependent handling of True/False.
global_seed: 42
mixed_precision_training: true
enable_xformers_memory_efficient_attention: true

# NOTE(review): the exact effect of the debug flag is defined by the trainer,
# not visible here.
is_debug: false