Doubiiu committed on
Commit
93e8f1d
1 Parent(s): 6818527

Upload 2 files

Browse files
configs/inference_1024_v1.0.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
3
+ params:
4
+ rescale_betas_zero_snr: True
5
+ parameterization: "v"
6
+ linear_start: 0.00085
7
+ linear_end: 0.012
8
+ num_timesteps_cond: 1
9
+ timesteps: 1000
10
+ first_stage_key: video
11
+ cond_stage_key: caption
12
+ cond_stage_trainable: False
13
+ conditioning_key: hybrid
14
+ image_size: [72, 128]
15
+ channels: 4
16
+ scale_by_std: False
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+ uncond_type: 'empty_seq'
20
+ use_dynamic_rescale: true
21
+ base_scale: 0.3
22
+ fps_condition_type: 'fps'
23
+ perframe_ae: True
24
+ unet_config:
25
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
26
+ params:
27
+ in_channels: 8
28
+ out_channels: 4
29
+ model_channels: 320
30
+ attention_resolutions:
31
+ - 4
32
+ - 2
33
+ - 1
34
+ num_res_blocks: 2
35
+ channel_mult:
36
+ - 1
37
+ - 2
38
+ - 4
39
+ - 4
40
+ dropout: 0.1
41
+ num_head_channels: 64
42
+ transformer_depth: 1
43
+ context_dim: 1024
44
+ use_linear: true
45
+ use_checkpoint: True
46
+ temporal_conv: True
47
+ temporal_attention: True
48
+ temporal_selfatt_only: true
49
+ use_relative_position: false
50
+ use_causal_attention: False
51
+ temporal_length: 16
52
+ addition_attention: true
53
+ image_cross_attention: true
54
+ default_fs: 10
55
+ fs_condition: true
56
+
57
+ first_stage_config:
58
+ target: lvdm.models.autoencoder.AutoencoderKL
59
+ params:
60
+ embed_dim: 4
61
+ monitor: val/rec_loss
62
+ ddconfig:
63
+ double_z: True
64
+ z_channels: 4
65
+ resolution: 256
66
+ in_channels: 3
67
+ out_ch: 3
68
+ ch: 128
69
+ ch_mult:
70
+ - 1
71
+ - 2
72
+ - 4
73
+ - 4
74
+ num_res_blocks: 2
75
+ attn_resolutions: []
76
+ dropout: 0.0
77
+ lossconfig:
78
+ target: torch.nn.Identity
79
+
80
+ cond_stage_config:
81
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
82
+ params:
83
+ version: "/apdcephfs_cq2/share_1290939/jinboxing/Pretrained/hub/models--laion--CLIP-ViT-H-14-laion2B-s32B-b79K/blobs/9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
84
+ freeze: true
85
+ layer: "penultimate"
86
+
87
+ img_cond_stage_config:
88
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
89
+ params:
90
+ version: "/apdcephfs_cq2/share_1290939/jinboxing/Pretrained/hub/models--laion--CLIP-ViT-H-14-laion2B-s32B-b79K/blobs/9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
91
+ freeze: true
92
+
93
+ image_proj_stage_config:
94
+ target: lvdm.modules.encoders.resampler.Resampler
95
+ params:
96
+ dim: 1024
97
+ depth: 4
98
+ dim_head: 64
99
+ heads: 12
100
+ num_queries: 16
101
+ embedding_dim: 1280
102
+ output_dim: 1024
103
+ ff_mult: 4
104
+ video_length: 16
105
+
configs/inference_512_v1.0.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
3
+ params:
4
+ rescale_betas_zero_snr: True
5
+ parameterization: "v"
6
+ linear_start: 0.00085
7
+ linear_end: 0.012
8
+ num_timesteps_cond: 1
9
+ timesteps: 1000
10
+ first_stage_key: video
11
+ cond_stage_key: caption
12
+ cond_stage_trainable: False
13
+ conditioning_key: hybrid
14
+ image_size: [40, 64]
15
+ channels: 4
16
+ scale_by_std: False
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+ uncond_type: 'empty_seq'
20
+ use_dynamic_rescale: true
21
+ base_scale: 0.7
22
+ fps_condition_type: 'fps'
23
+ unet_config:
24
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
25
+ params:
26
+ in_channels: 8
27
+ out_channels: 4
28
+ model_channels: 320
29
+ attention_resolutions:
30
+ - 4
31
+ - 2
32
+ - 1
33
+ num_res_blocks: 2
34
+ channel_mult:
35
+ - 1
36
+ - 2
37
+ - 4
38
+ - 4
39
+ dropout: 0.1
40
+ num_head_channels: 64
41
+ transformer_depth: 1
42
+ context_dim: 1024
43
+ use_linear: true
44
+ use_checkpoint: True
45
+ temporal_conv: True
46
+ temporal_attention: True
47
+ temporal_selfatt_only: true
48
+ use_relative_position: false
49
+ use_causal_attention: False
50
+ temporal_length: 16
51
+ addition_attention: true
52
+ image_cross_attention: true
53
+ default_fs: 24
54
+ fs_condition: true
55
+
56
+ first_stage_config:
57
+ target: lvdm.models.autoencoder.AutoencoderKL
58
+ params:
59
+ embed_dim: 4
60
+ monitor: val/rec_loss
61
+ ddconfig:
62
+ double_z: True
63
+ z_channels: 4
64
+ resolution: 256
65
+ in_channels: 3
66
+ out_ch: 3
67
+ ch: 128
68
+ ch_mult:
69
+ - 1
70
+ - 2
71
+ - 4
72
+ - 4
73
+ num_res_blocks: 2
74
+ attn_resolutions: []
75
+ dropout: 0.0
76
+ lossconfig:
77
+ target: torch.nn.Identity
78
+
79
+ cond_stage_config:
80
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
81
+ params:
82
+ version: "/apdcephfs_cq2/share_1290939/jinboxing/Pretrained/hub/models--laion--CLIP-ViT-H-14-laion2B-s32B-b79K/blobs/9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
83
+ freeze: true
84
+ layer: "penultimate"
85
+
86
+ img_cond_stage_config:
87
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
88
+ params:
89
+ version: "/apdcephfs_cq2/share_1290939/jinboxing/Pretrained/hub/models--laion--CLIP-ViT-H-14-laion2B-s32B-b79K/blobs/9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
90
+ freeze: true
91
+
92
+ image_proj_stage_config:
93
+ target: lvdm.modules.encoders.resampler.Resampler
94
+ params:
95
+ dim: 1024
96
+ depth: 4
97
+ dim_head: 64
98
+ heads: 12
99
+ num_queries: 16
100
+ embedding_dim: 1280
101
+ output_dim: 1024
102
+ ff_mult: 4
103
+ video_length: 16
104
+