chrisc36 committed on
Commit
2131f03
1 Parent(s): f1393f1

Upload UnifiedIOPreprocessing

Files changed (1)
  1. preprocessor_config.json +268 -0
preprocessor_config.json ADDED
@@ -0,0 +1,268 @@
+ {
+   "config": {
+     "audio_history_cfg": {
+       "attn_qk_norm": true,
+       "attn_scaled_cosine": false,
+       "clip_attn_logit": null,
+       "dropout_broadcast_dims": [
+         -2
+       ],
+       "dropout_rate": 0.0,
+       "droppath_rate": 0.0,
+       "dtype": "float32",
+       "emb_dim": 768,
+       "float32_attention_logits": true,
+       "head_dim": 64,
+       "latents_size": 16,
+       "layer_drop": 0.0,
+       "max_frames": 8,
+       "mlp_activations": [
+         "gelu"
+       ],
+       "mlp_dim": 2048,
+       "num_heads": 12,
+       "num_layers": 2,
+       "resampler_type": "perceiver",
+       "xattention_index": [
+         0,
+         1
+       ],
+       "xattn_qk_norm": true,
+       "xattn_scaled_cosine": false
+     },
+     "audio_vit_cfg": {
+       "default_input_size": [
+         256,
+         128
+       ],
+       "dropout_broadcast_dims": [],
+       "dropout_rate": 0.0,
+       "dtype": "float32",
+       "emb_dim": 768,
+       "float32_attention_logits": true,
+       "head_dim": 64,
+       "mlp_activations": [
+         "gelu"
+       ],
+       "mlp_dim": 3072,
+       "num_heads": 12,
+       "num_layers": 11,
+       "patch_size": 16,
+       "pos_patch_size": 16,
+       "transpose_input": true,
+       "vit_embed": true
+     },
+     "audio_vqgan": {
+       "act_fn": "relu",
+       "attention_dropout_rate": 0.0,
+       "checkpoint_path": "",
+       "decoder_head_dim": 64,
+       "decoder_hidden_size": 512,
+       "decoder_mlp_dim": 2048,
+       "decoder_num_heads": 8,
+       "decoder_num_layers": 8,
+       "default_input_size": [
+         128,
+         256
+       ],
+       "dropout_rate": 0.0,
+       "droppath_rate": 0.0,
+       "dtype": "float32",
+       "encoder_head_dim": 64,
+       "encoder_hidden_size": 512,
+       "encoder_mlp_dim": 2048,
+       "encoder_num_heads": 8,
+       "encoder_num_layers": 8,
+       "output_channel": 1,
+       "patch_size": [
+         8,
+         8
+       ],
+       "proj_dim": 32,
+       "use_bias": false,
+       "use_decoder": true,
+       "vocab_size": 8192
+     },
+     "freeze_vit": true,
+     "image_history_cfg": {
+       "attn_qk_norm": true,
+       "attn_scaled_cosine": false,
+       "clip_attn_logit": null,
+       "dropout_broadcast_dims": [
+         -2
+       ],
+       "dropout_rate": 0.0,
+       "droppath_rate": 0.0,
+       "dtype": "float32",
+       "emb_dim": 768,
+       "float32_attention_logits": true,
+       "head_dim": 64,
+       "latents_size": 32,
+       "layer_drop": 0.0,
+       "max_frames": 8,
+       "mlp_activations": [
+         "gelu"
+       ],
+       "mlp_dim": 2048,
+       "num_heads": 12,
+       "num_layers": 2,
+       "resampler_type": "perceiver",
+       "xattention_index": [
+         0,
+         1
+       ],
+       "xattn_qk_norm": true,
+       "xattn_scaled_cosine": false
+     },
+     "image_vit_cfg": {
+       "default_input_size": [
+         256,
+         256
+       ],
+       "dropout_broadcast_dims": [],
+       "dropout_rate": 0.0,
+       "dtype": "float32",
+       "emb_dim": 768,
+       "float32_attention_logits": true,
+       "head_dim": 64,
+       "mlp_activations": [
+         "gelu"
+       ],
+       "mlp_dim": 3072,
+       "num_heads": 12,
+       "num_layers": 11,
+       "num_pos": 197,
+       "patch_size": 16,
+       "pos_patch_size": 16
+     },
+     "image_vqgan": {
+       "attn_resolutions": [
+         32
+       ],
+       "ch": 128,
+       "ch_mult": [
+         1,
+         2,
+         2,
+         4
+       ],
+       "checkpoint_path": "",
+       "default_input_size": [
+         256,
+         256
+       ],
+       "double_z": false,
+       "dropout": 0,
+       "dtype": "float32",
+       "embed_dim": 4,
+       "in_channels": 3,
+       "n_embed": 16384,
+       "num_res_blocks": 2,
+       "out_ch": 3,
+       "patch_size": [
+         8,
+         8
+       ],
+       "resolution": 256,
+       "z_channels": 4
+     },
+     "input_modalities": [
+       "text",
+       "image",
+       "image_history",
+       "audio",
+       "audio_history"
+     ],
+     "sequence_length": {
+       "audio_history_input_samples": 128,
+       "audio_input_samples": 128,
+       "image_history_input_samples": 256,
+       "image_input_samples": 576,
+       "is_training": true,
+       "num_frames": 4
+     },
+     "t5_config": {
+       "audio_history_pos_emb": "llama_rope",
+       "audio_patch_size": 16,
+       "audio_pos_emb": "llama_rope",
+       "audio_vit_patch_size": 16,
+       "audio_vocab_size": 8320,
+       "dalle_attn_mask": true,
+       "decoder_max_audio_length": 512,
+       "decoder_max_image_length": 1024,
+       "decoder_max_text_length": 512,
+       "decoder_xattention_internval": 1,
+       "default_audio_history_vit_size": [
+         256,
+         128
+       ],
+       "default_audio_size": [
+         256,
+         128
+       ],
+       "default_audio_vit_size": [
+         256,
+         128
+       ],
+       "default_image_history_vit_size": [
+         256,
+         256
+       ],
+       "default_image_size": [
+         256,
+         256
+       ],
+       "default_image_vit_size": [
+         384,
+         384
+       ],
+       "dropout_broadcast_dims": [
+         -2
+       ],
+       "dropout_rate": 0.0,
+       "dtype": "float32",
+       "dynamic_unk_mask": true,
+       "emb_dim": 1024,
+       "encoder_max_audio_length": 128,
+       "encoder_max_image_length": 576,
+       "encoder_max_text_length": 512,
+       "float32_attention_logits": true,
+       "head_dim": 64,
+       "image_history_pos_emb": "llama_rope",
+       "image_patch_size": 16,
+       "image_pos_emb": "llama_rope",
+       "image_tokenizer_type": "vqgan",
+       "image_vit_patch_size": 16,
+       "image_vocab_size": 16512,
+       "logits_via_embedding": true,
+       "mlp_activations": [
+         "silu",
+         "linear"
+       ],
+       "mlp_dim": 2816,
+       "num_decoder_layers": 24,
+       "num_encoder_layers": 24,
+       "num_heads": 16,
+       "qk_norm": true,
+       "text_pos_emb": "llama_rope",
+       "vocab_size": 33280
+     },
+     "target_modalities": [
+       "text",
+       "image",
+       "audio"
+     ],
+     "use_audio_history_vit": true,
+     "use_audio_vit": true,
+     "use_image_history_vit": true,
+     "use_image_vit": true
+   },
+   "sequence_length": {
+     "audio_history_input_samples": 128,
+     "audio_input_samples": 128,
+     "image_history_input_samples": 256,
+     "image_input_samples": 576,
+     "is_training": true,
+     "num_frames": 4
+   }
+ }