benjamin-paine committed on
Commit
32152cc
1 Parent(s): c8e02a0

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +218 -0
config.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "reference_unet": {
3
+ "_class_name": "UNet2DConditionModel",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "cross_attention_dim": 768,
14
+ "down_block_types": [
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "DownBlock2D"
19
+ ],
20
+ "downsample_padding": 1,
21
+ "flip_sin_to_cos": true,
22
+ "freq_shift": 0,
23
+ "in_channels": 4,
24
+ "layers_per_block": 2,
25
+ "mid_block_scale_factor": 1,
26
+ "norm_eps": 1e-05,
27
+ "norm_num_groups": 32,
28
+ "out_channels": 4,
29
+ "sample_size": 64,
30
+ "up_block_types": [
31
+ "UpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D"
35
+ ]
36
+ },
37
+ "denoising_unet": {
38
+ "_class_name": "UNet2DConditionModel",
39
+ "act_fn": "silu",
40
+ "attention_head_dim": 8,
41
+ "block_out_channels": [
42
+ 320,
43
+ 640,
44
+ 1280,
45
+ 1280
46
+ ],
47
+ "center_input_sample": false,
48
+ "cross_attention_dim": 768,
49
+ "down_block_types": [
50
+ "CrossAttnDownBlock3D",
51
+ "CrossAttnDownBlock3D",
52
+ "CrossAttnDownBlock3D",
53
+ "DownBlock3D"
54
+ ],
55
+ "downsample_padding": 1,
56
+ "flip_sin_to_cos": true,
57
+ "freq_shift": 0,
58
+ "in_channels": 4,
59
+ "layers_per_block": 2,
60
+ "mid_block_scale_factor": 1,
61
+ "norm_eps": 1e-05,
62
+ "norm_num_groups": 32,
63
+ "out_channels": 4,
64
+ "sample_size": 64,
65
+ "up_block_types": [
66
+ "UpBlock3D",
67
+ "CrossAttnUpBlock3D",
68
+ "CrossAttnUpBlock3D",
69
+ "CrossAttnUpBlock3D"
70
+ ],
71
+ "mid_block_type": "UNetMidBlock3DCrossAttn",
72
+ "use_inflated_groupnorm": true,
73
+ "unet_use_cross_frame_attention": false,
74
+ "unet_use_temporal_attention": false,
75
+ "use_motion_module": true,
76
+ "motion_module_resolutions": [
77
+ 1,
78
+ 2,
79
+ 4,
80
+ 8
81
+ ],
82
+ "motion_module_mid_block": true,
83
+ "motion_module_decoder_only": false,
84
+ "motion_module_type": "Vanilla",
85
+ "motion_module_kwargs": {
86
+ "num_attention_heads": 8,
87
+ "num_transformer_block": 1,
88
+ "attention_block_types": [
89
+ "Temporal_Self",
90
+ "Temporal_Self"
91
+ ],
92
+ "temporal_position_encoding": true,
93
+ "temporal_position_encoding_max_len": 32,
94
+ "temporal_attention_dim_div": 1
95
+ }
96
+ },
97
+ "vae": {
98
+ "_class_name": "AutoencoderKL",
99
+ "act_fn": "silu",
100
+ "block_out_channels": [
101
+ 128,
102
+ 256,
103
+ 512,
104
+ 512
105
+ ],
106
+ "down_block_types": [
107
+ "DownEncoderBlock2D",
108
+ "DownEncoderBlock2D",
109
+ "DownEncoderBlock2D",
110
+ "DownEncoderBlock2D"
111
+ ],
112
+ "in_channels": 3,
113
+ "latent_channels": 4,
114
+ "layers_per_block": 2,
115
+ "norm_num_groups": 32,
116
+ "out_channels": 3,
117
+ "sample_size": 256,
118
+ "up_block_types": [
119
+ "UpDecoderBlock2D",
120
+ "UpDecoderBlock2D",
121
+ "UpDecoderBlock2D",
122
+ "UpDecoderBlock2D"
123
+ ]
124
+ },
125
+ "image_encoder": {
126
+ "architectures": [
127
+ "CLIPVisionModelWithProjection"
128
+ ],
129
+ "attention_dropout": 0,
130
+ "dropout": 0,
131
+ "hidden_act": "quick_gelu",
132
+ "hidden_size": 1024,
133
+ "image_size": 224,
134
+ "initializer_factor": 1,
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 4096,
137
+ "layer_norm_eps": 0.00001,
138
+ "model_type": "clip_vision_model",
139
+ "num_attention_heads": 16,
140
+ "num_channels": 3,
141
+ "num_hidden_layers": 24,
142
+ "patch_size": 14,
143
+ "projection_dim": 768,
144
+ "torch_dtype": "float32"
145
+ },
146
+ "pose_guider": {
147
+ "noise_latent_channels": 320,
148
+ "use_ca": true
149
+ },
150
+ "audio_mesher": {
151
+ "_class_name": "Audio2Mesh",
152
+ "attention_dropout": 0.1,
153
+ "bos_token_id": 1,
154
+ "codevector_dim": 256,
155
+ "contrastive_logits_temperature": 0.1,
156
+ "conv_bias": false,
157
+ "conv_dim": [512,512,512,512,512,512,512],
158
+ "conv_kernel": [10,3,3,3,3,2,2],
159
+ "conv_stride": [5,2,2,2,2,2,2],
160
+ "ctc_loss_reduction": "sum",
161
+ "ctc_zero_infinity": false,
162
+ "diversity_loss_weight": 0.1,
163
+ "do_stable_layer_norm": false,
164
+ "eos_token_id": 2,
165
+ "feat_extract_activation": "gelu",
166
+ "feat_extract_dropout": 0.0,
167
+ "feat_extract_norm": "group",
168
+ "feat_proj_dropout": 0.1,
169
+ "feat_quantizer_dropout": 0.0,
170
+ "final_dropout": 0.1,
171
+ "gradient_checkpointing": false,
172
+ "hidden_act": "gelu",
173
+ "hidden_dropout": 0.1,
174
+ "hidden_dropout_prob": 0.1,
175
+ "hidden_size": 768,
176
+ "initializer_range": 0.02,
177
+ "intermediate_size": 3072,
178
+ "layer_norm_eps": 1e-05,
179
+ "layerdrop": 0.1,
180
+ "mask_feature_length": 10,
181
+ "mask_feature_prob": 0.0,
182
+ "mask_time_length": 10,
183
+ "mask_time_prob": 0.05,
184
+ "model_type": "wav2vec2",
185
+ "num_attention_heads": 12,
186
+ "num_codevector_groups": 2,
187
+ "num_codevectors_per_group": 320,
188
+ "num_conv_pos_embedding_groups": 16,
189
+ "num_conv_pos_embeddings": 128,
190
+ "num_feat_extract_layers": 7,
191
+ "num_hidden_layers": 12,
192
+ "num_negatives": 100,
193
+ "pad_token_id": 0,
194
+ "proj_codevector_dim": 256,
195
+ "vocab_size": 32,
196
+ "feature_size": 1,
197
+ "sampling_rate": 16000,
198
+ "padding_value": 0.0,
199
+ "padding_side": "right",
200
+ "do_normalize": true,
201
+ "return_attention_mask": false,
202
+ "out_dim": 1404,
203
+ "latent_dim": 512,
204
+ "use_final_features": true,
205
+ "zero_init": false
206
+ },
207
+ "scheduler": {
208
+ "num_train_timesteps": 1000,
209
+ "beta_start": 0.00085,
210
+ "beta_end": 0.012,
211
+ "beta_schedule": "linear",
212
+ "steps_offset": 1,
213
+ "clip_sample": false,
214
+ "rescale_betas_zero_snr": true,
215
+ "timestep_spacing": "trailing",
216
+ "prediction_type": "v_prediction"
217
+ }
218
+ }