File size: 2,259 Bytes
38e3f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Output location and base-model wiring for this run.
# NOTE(review): key semantics are inferred from their names; confirm against
# the script that loads this file.
output_dir: "output/cameractrl_model"
# Placeholder value: must be replaced with a real path before use.
pretrained_model_path: "[replace with SVD root path]"
unet_subfolder: "unet"
# Four entries: three 'CrossAttnDownBlockSpatioTemporalPoseCond' followed by
# one 'DownBlockSpatioTemporal'.
down_block_types:
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'CrossAttnDownBlockSpatioTemporalPoseCond'
  - 'DownBlockSpatioTemporal'
# Four entries in the reverse arrangement: one 'UpBlockSpatioTemporal'
# followed by three 'CrossAttnUpBlockSpatioTemporalPoseCond'.
up_block_types:
  - 'UpBlockSpatioTemporal'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'
  - 'CrossAttnUpBlockSpatioTemporalPoseCond'

# Training dataset settings.
# NOTE(review): per-key semantics are defined by whatever consumes this
# mapping, which is not visible in this file; notes below are hedged.
train_data:
  # Placeholder value: replace with the local dataset root before running.
  root_path: "[replace RealEstate10K root path]"
  # presumably resolved relative to root_path; confirm in the dataset loader
  annotation_json: "annotations/train.json"
  sample_stride: 8
  # Same value (14) as pose_encoder_kwargs.temporal_position_encoding_max_len.
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  # presumably [height, width]; TODO confirm axis order
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false

# Validation dataset settings. Identical to train_data except for the
# annotation file and the additional return_clip_name flag.
# NOTE(review): per-key semantics are defined by the consumer of this
# mapping, which is not visible in this file.
validation_data:
  # Placeholder value: replace with the local dataset root before running.
  root_path: "[replace RealEstate10K root path]"
  # presumably resolved relative to root_path; confirm in the dataset loader
  annotation_json: "annotations/validation.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  # presumably [height, width]; TODO confirm axis order
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  # Only present for validation; train_data does not set this key.
  return_clip_name: true

# NOTE(review): by its name, the fraction (0.15) of training samples whose
# image condition is replaced by a null input; confirm against the trainer.
random_null_image_ratio: 0.15

# Keyword arguments for the pose encoder.
# NOTE(review): the encoder's constructor is not visible here; confirm each
# key against its signature before changing values.
pose_encoder_kwargs:
  downscale_factor: 8
  # Same four values as attention_processor_kwargs.pose_feature_dimensions.
  channels: [320, 640, 1280, 1280]
  nums_rb: 2
  # NOTE(review): 384 == 6 * downscale_factor ** 2 (6 * 8 * 8); possibly
  # derived from downscale_factor, so verify before editing either value.
  cin: 384
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  # One-element list.
  attention_block_types: ["Temporal_Self"]
  temporal_position_encoding: true
  # Same value (14) as sample_n_frames in train_data and validation_data.
  temporal_position_encoding_max_len: 14

# Keyword arguments for the pose-conditioned attention processors.
# NOTE(review): the consuming code is not visible here; key semantics are
# inferred from names and should be confirmed.
attention_processor_kwargs:
  # Spatial off, temporal on in this configuration.
  add_spatial: false
  add_temporal: true
  attn_processor_name: 'attn1'
  # Same four values as pose_encoder_kwargs.channels.
  pose_feature_dimensions: [320, 640, 1280, 1280]
  query_condition: true
  key_value_condition: true
  scale: 1.0

# Pre-flight switches, training length, validation cadence and learning rate.
# NOTE(review): semantics are inferred from key names; the trainer that reads
# them is not visible in this file.
do_sanity_check: true
sample_before_training: false

# Negative epoch count next to a positive step count: presumably the run is
# bounded by steps rather than epochs (confirm in the trainer).
max_train_epoch: -1
max_train_steps: 50000
validation_steps: 2500
# One-element list.
validation_steps_tuple: [500]

# 3.0e-5 == 0.00003
learning_rate: 3.0e-5

# Noise-sampling parameters and conditioning flags.
# NOTE(review): the *_mean / *_std pairs look like parameters of a sampled
# noise-level distribution; confirm the exact distribution in the trainer.
P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5
sample_latent: true
first_image_cond: true

# Sampling settings; presumably used when generating validation samples
# (confirm). The guidance scale is given as a min/max pair (1.0 to 3.0).
num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0

# Data loading, batch size and checkpoint cadence.
# NOTE(review): semantics are inferred from key names; confirm in the trainer.
num_workers: 8
train_batch_size: 1
# Negative epoch interval next to a positive step interval: presumably
# checkpoints are written by step count only (confirm).
checkpointing_epochs: -1
checkpointing_steps: 10000

# Precision and attention-backend switches.
mixed_precision_training: false
enable_xformers_memory_efficient_attention: true

# Seed and logging interval.
global_seed: 42
logger_interval: 10