logging:
  project: titok_video
  run_name: BaseAll-CB16k-TL128-256x33-init-BS64-clipGrad1.0
  logging_interval: 50
  save_path: out_tiny
  save_step_interval: 5000
  keep_prior_checkpoints: -1  # -1 to keep all
  resume_from_checkpoint:
  init_from_checkpoint: base-interp-256x33-TL128.ckpt
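  # note: init_from_checkpoint presumably loads model weights only, while
  # resume_from_checkpoint (left empty here) would restore the full training state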

model:
  titok:
    temporal_patch_size: 2
    spatial_patch_size: 4
    fsq_levels: [8, 8, 8, 6, 5]  # [7, 5, 5, 5, 5]
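    # effective FSQ codebook size is the product of the levels:
    # 8 * 8 * 8 * 6 * 5 = 15,360 entries (~16k, cf. "CB16k" in run_name)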
    num_latent_tokens: 128
    encoder_size: base
    decoder_size: base
    exp_residual: False
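    # each clip is represented by num_latent_tokens = 128 discrete tokens ("TL128" in run_name)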
  vae:
    type: wfvae  # options: cogvideox, vidtok, wfvae
    path: preprocess_dataset/wf-16
    latent_channels: 16
    temporal_compression: 4
    spatial_compression: 8
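    # at resolution 256 with spatial_compression 8, latents are 32x32; with temporal_compression 4,
    # a 33-frame clip presumably yields (33 - 1) / 4 + 1 = 9 latent frames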
  disc:  # experimental
    use_disc: False
    model_layers: 1
    model_heads: 1
    model_dim: 128
    temporal_patch_size: 4
    spatial_patch_size: 4
    disc_start: 45000
    disc_factor: 1.0
    disc_weight: 0.1
    lecam_weight: 0.0  # disabled
    base_gamma: 1  # a higher gamma applies more smoothing earlier in training
    final_gamma: 0.1
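    # these settings are inert while use_disc is False; disc_start presumably delays the
    # adversarial loss until step 45000, with gamma annealed from base_gamma to final_gamma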

dataset:
  train_dataset: "/workspace/out_enc_256_33/**/*.pt"
  eval_dataset: "/workspace/out_enc_256_33_eval/*.pt"
  resolution: 256
  num_frames: 33
  frames_per_second: 8
  workers: 8
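  # the .pt globs presumably point at clips pre-encoded to VAE latents offline
  # (see model.vae.path and the out_enc_* naming)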

optimizer:
  titok:
    learning_rate: 1e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1e-4
    warmup_steps: 5000  # 10000
    end_lr: 1e-5
  disc:  # not used
    learning_rate: 1e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1e-4
    warmup_steps: 1000
    end_lr: 1e-5
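  # both schedules presumably warm up to learning_rate over warmup_steps, then decay to end_lr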

training:
  torch_compile: True
  seed: 42
  max_grad_norm: 1.0  # not needed?
  batch_size: 64
  # strategy:  # ddp
  enable_tf32: True
  precision: bf16-mixed
  train_devices: 1
  accelerator: 'gpu'
  max_steps: 500000
  val_step_interval: 2000
  eval_recon_log_num: 4
  eval_sample_size: 32
  eval_batch_size: 1
  eval_clear_cache: True
  eval_shuffle: True
  log_codebook: True