logging:
    project: titok_video
    run_name: BaseAll-CB16k-TL128-256x33-init-BS64-clipGrad1.0
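    # run_name presumably encodes the setup configured below: base encoder/decoder,
    # ~16k FSQ codebook, 128 latent tokens, 256x33 clips, init from a checkpoint,
    # batch size 64, grad clip 1.0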

    logging_interval: 50

    save_path: out_tiny
    save_step_interval: 5000
    keep_prior_checkpoints: -1 # -1 to keep all

    resume_from_checkpoint:
    init_from_checkpoint: base-interp-256x33-TL128.ckpt

model:
    titok:
        temporal_patch_size: 2
        spatial_patch_size: 4

        fsq_levels: [8, 8, 8, 6, 5] # [7, 5, 5, 5, 5]
        num_latent_tokens: 128
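        # FSQ codebook size is the product of the levels: 8*8*8*6*5 = 15,360 (~16k),
        # which presumably is the "CB16k" tag in run_name; the 128 latent tokens
        # match "TL128"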

        encoder_size: base
        decoder_size: base
        exp_residual: False

    vae:
        type: wfvae # cogvideox, vidtok, wfvae
        path: preprocess_dataset/wf-16
        latent_channels: 16
        temporal_compression: 4
        spatial_compression: 8
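        # with the 256x256, 33-frame clips set under dataset, 8x spatial compression
        # gives 32x32 latent maps; 4x temporal compression gives roughly 9 latent
        # frames (assuming the usual causal 1 + (T-1)/4 layout)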

    disc: # experimental
        use_disc: False

        model_layers: 1
        model_heads: 1
        model_dim: 128

        temporal_patch_size: 4
        spatial_patch_size: 4

        disc_start: 45000
        disc_factor: 1.0
        disc_weight: 0.1
        lecam_weight: 0.0 # disabled

        base_gamma: 1 # a higher gamma applies stronger smoothing earlier in training
        final_gamma: 0.1

dataset:
    train_dataset: "/workspace/out_enc_256_33/**/*.pt"
    eval_dataset: "/workspace/out_enc_256_33_eval/*.pt"
    resolution: 256
    num_frames: 33
    frames_per_second: 8
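    # 33 frames at 8 fps is roughly a 4-second clip per sample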
    workers: 8

optimizer:
    titok:
        learning_rate: 1e-4
        beta1: 0.9
        beta2: 0.99
        weight_decay: 1e-4
        warmup_steps: 5000 # 10000
        end_lr: 1e-5
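        # lr presumably warms up over warmup_steps and then decays toward end_lr;
        # the exact decay shape depends on the scheduler used by the trainer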

    disc: # not used
        learning_rate: 1e-4
        beta1: 0.9
        beta2: 0.99
        weight_decay: 1e-4
        warmup_steps: 1000
        end_lr: 1e-5

training:
    torch_compile: True
    seed: 42
    max_grad_norm: 1.0 # not needed?
    
    batch_size: 64
    # strategy: # ddp

    enable_tf32: True
    precision: bf16-mixed
    train_devices: 1
    accelerator: 'gpu'

    max_steps: 500000
    val_step_interval: 2000

    eval_recon_log_num: 4
    eval_sample_size: 32
    eval_batch_size: 1
    eval_clear_cache: True
    eval_shuffle: True

    log_codebook: True