---
# Training configuration for a pose-conditioned Stable Video Diffusion (CameraCtrl-style) model.
# Output and pretrained-model locations.
output_dir: "output/cameractrl_model"
pretrained_model_path: "[replace with SVD root path]"  # root of the pretrained SVD checkpoint
unet_subfolder: "unet"

# UNet block layout: pose-conditioned spatio-temporal variants for the
# cross-attention levels (block sequences instead of over-long flow lists).
down_block_types:
  - "CrossAttnDownBlockSpatioTemporalPoseCond"
  - "CrossAttnDownBlockSpatioTemporalPoseCond"
  - "CrossAttnDownBlockSpatioTemporalPoseCond"
  - "DownBlockSpatioTemporal"
up_block_types:
  - "UpBlockSpatioTemporal"
  - "CrossAttnUpBlockSpatioTemporalPoseCond"
  - "CrossAttnUpBlockSpatioTemporalPoseCond"
  - "CrossAttnUpBlockSpatioTemporalPoseCond"
# Training dataset (RealEstate10K-style layout with JSON annotations).
train_data:
  root_path: "[replace RealEstate10K root path]"
  annotation_json: "annotations/train.json"
  # Frame sampling: presumably every `sample_stride`-th frame, `sample_n_frames`
  # per clip — confirm against the dataset class.
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true       # poses expressed relative to a reference frame — TODO confirm
  zero_t_first_frame: true  # presumably zeroes translation of the first frame; verify in dataset code
  sample_size: [320, 576]   # (height, width)? order not shown here — confirm against loader
  rescale_fxy: true         # presumably rescales focal lengths to the sample size — TODO confirm
  shuffle_frames: false
  use_flip: false           # no horizontal-flip augmentation
# Validation dataset — mirrors train_data except for the annotation file
# and the extra return_clip_name flag.
validation_data:
  root_path: "[replace RealEstate10K root path]"
  annotation_json: "annotations/validation.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  return_clip_name: true    # validation samples also carry the clip name (e.g. for logging)

# Probability of dropping the conditioning image during training — presumably
# for classifier-free guidance; confirm in the training loop.
random_null_image_ratio: 0.15
# Pose-encoder (adapter) construction arguments.
pose_encoder_kwargs:
  downscale_factor: 8
  channels: [320, 640, 1280, 1280]  # per-level widths; same list as attention_processor_kwargs.pose_feature_dimensions
  nums_rb: 2                        # residual blocks per level — TODO confirm against encoder implementation
  cin: 384                          # input channel count — presumably the raw pose embedding; verify
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  attention_block_types: ["Temporal_Self"]  # trailing comma removed — identical value
  temporal_position_encoding: true
  temporal_position_encoding_max_len: 14    # matches sample_n_frames above
# How pose features are injected into the UNet attention processors.
attention_processor_kwargs:
  add_spatial: false
  add_temporal: true                # condition only the temporal attention layers
  attn_processor_name: "attn1"      # double-quoted for consistency with the rest of the file
  pose_feature_dimensions: [320, 640, 1280, 1280]  # matches pose_encoder_kwargs.channels
  query_condition: true
  key_value_condition: true
  scale: 1.0                        # conditioning strength — presumably a multiplier; confirm in processor code
# ---- Training schedule and runtime settings ----
do_sanity_check: true
sample_before_training: false

max_train_epoch: -1             # -1 presumably disables the epoch limit in favor of max_train_steps — confirm
max_train_steps: 50000
validation_steps: 2500
validation_steps_tuple: [500]   # extra early validation step; trailing comma removed — identical value
learning_rate: 3.0e-5           # was "3.e-5" — canonical float form so all YAML resolvers parse it as a number

# Noise-level sampling parameters (EDM-style log-normal? TODO confirm in trainer).
P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5

sample_latent: true
first_image_cond: true          # condition generation on the first frame
num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0

num_workers: 8
train_batch_size: 1
checkpointing_epochs: -1        # -1 presumably disables epoch-based checkpointing — confirm
checkpointing_steps: 10000

mixed_precision_training: false
enable_xformers_memory_efficient_attention: true
global_seed: 42
logger_interval: 10             # presumably log every N steps — confirm in training loop