---
# Training configuration: model paths, UNet motion-module options, noise
# scheduler parameters, dataset location, validation prompts and the
# optimisation / checkpointing schedule.

image_finetune: false

output_dir: "outputs"
pretrained_model_path: "runwayml/stable-diffusion-v1-5"

# Extra keyword arguments for the UNet.
unet_additional_kwargs:
  use_motion_module: true
  motion_module_resolutions: [1, 2, 4, 8]
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false

  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types: ["Temporal_Self", "Temporal_Self"]
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 24
    temporal_attention_dim_div: 1
    zero_initialize: true

# Keyword arguments for the noise scheduler.
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

# Training dataset: CSV index plus the folder holding the video files.
train_data:
  csv_path: "data/output.csv"
  video_folder: "data/output"
  sample_size: 256
  sample_stride: 4
  sample_n_frames: 5

validation_data:
  prompts:
    - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons."
    - "A drone view of celebration with Christmas tree and fireworks, starry sky - background."
    - "Robot dancing in times square."
    - "Pacific coast, carmel by the sea ocean and waves."
  num_inference_steps: 20
  guidance_scale: 12.5
  # NOTE(review): the next three keys are assumed to belong to
  # validation_data based on their position in the file - confirm against
  # the code that reads this config.
  # The first two keys previously carried a stray trailing double quote
  # (temporal_context" / use_inv_latent"), which made the quote part of the
  # key name. With the names corrected these settings may now take effect
  # where they were silently ignored before.
  temporal_context: 24
  use_inv_latent: true
  num_inv_steps: 50

# Name fragments selecting which modules are trained.
trainable_modules:
  - "motion_modules."

# Empty string: no UNet checkpoint path is given.
unet_checkpoint_path: ""

learning_rate: 3.0e-5
train_batch_size: 1

max_train_epoch: -1
max_train_steps: 300
checkpointing_epochs: -1
checkpointing_steps: 1000

validation_steps: 100
# validation_steps_tuple: [2, 50]

global_seed: 42
mixed_precision_training: true
enable_xformers_memory_efficient_attention: true

is_debug: true