uio2-preprocessor / preprocessor_config.json
{
  "config": {
    "audio_history_cfg": {
      "attn_qk_norm": true,
      "attn_scaled_cosine": false,
      "clip_attn_logit": null,
      "dropout_broadcast_dims": [
        -2
      ],
      "dropout_rate": 0.0,
      "droppath_rate": 0.0,
      "emb_dim": 768,
      "float32_attention_logits": true,
      "head_dim": 64,
      "latents_size": 16,
      "layer_drop": 0.0,
      "max_frames": 8,
      "mlp_activations": [
        "gelu"
      ],
      "mlp_dim": 2048,
      "num_heads": 12,
      "num_layers": 2,
      "resampler_type": "perceiver",
      "xattention_index": [
        0,
        1
      ],
      "xattn_qk_norm": true,
      "xattn_scaled_cosine": false
    },
    "audio_vit_cfg": {
      "default_input_size": [
        256,
        128
      ],
      "dropout_broadcast_dims": [],
      "dropout_rate": 0.0,
      "emb_dim": 768,
      "float32_attention_logits": true,
      "head_dim": 64,
      "mlp_activations": [
        "gelu"
      ],
      "mlp_dim": 3072,
      "num_heads": 12,
      "num_layers": 11,
      "patch_size": 16,
      "pos_patch_size": 16,
      "transpose_input": true,
      "vit_embed": true
    },
    "audio_vqgan": {
      "act_fn": "relu",
      "attention_dropout_rate": 0.0,
      "checkpoint_path": "",
      "decoder_head_dim": 64,
      "decoder_hidden_size": 512,
      "decoder_mlp_dim": 2048,
      "decoder_num_heads": 8,
      "decoder_num_layers": 8,
      "default_input_size": [
        128,
        256
      ],
      "dropout_rate": 0.0,
      "droppath_rate": 0.0,
      "encoder_head_dim": 64,
      "encoder_hidden_size": 512,
      "encoder_mlp_dim": 2048,
      "encoder_num_heads": 8,
      "encoder_num_layers": 8,
      "output_channel": 1,
      "patch_size": [
        8,
        8
      ],
      "proj_dim": 32,
      "use_bias": false,
      "use_decoder": true,
      "vocab_size": 8192
    },
    "freeze_vit": true,
    "image_history_cfg": {
      "attn_qk_norm": true,
      "attn_scaled_cosine": false,
      "clip_attn_logit": null,
      "dropout_broadcast_dims": [
        -2
      ],
      "dropout_rate": 0.0,
      "droppath_rate": 0.0,
      "emb_dim": 768,
      "float32_attention_logits": true,
      "head_dim": 64,
      "latents_size": 32,
      "layer_drop": 0.0,
      "max_frames": 8,
      "mlp_activations": [
        "gelu"
      ],
      "mlp_dim": 2048,
      "num_heads": 12,
      "num_layers": 2,
      "resampler_type": "perceiver",
      "xattention_index": [
        0,
        1
      ],
      "xattn_qk_norm": true,
      "xattn_scaled_cosine": false
    },
    "image_vit_cfg": {
      "default_input_size": [
        256,
        256
      ],
      "dropout_broadcast_dims": [],
      "dropout_rate": 0.0,
      "emb_dim": 768,
      "float32_attention_logits": true,
      "head_dim": 64,
      "mlp_activations": [
        "gelu"
      ],
      "mlp_dim": 3072,
      "num_heads": 12,
      "num_layers": 11,
      "num_pos": 197,
      "patch_size": 16,
      "pos_patch_size": 16
    },
    "image_vqgan": {
      "attn_resolutions": [
        32
      ],
      "ch": 128,
      "ch_mult": [
        1,
        2,
        2,
        4
      ],
      "checkpoint_path": "",
      "default_input_size": [
        256,
        256
      ],
      "double_z": false,
      "dropout": 0,
      "embed_dim": 4,
      "in_channels": 3,
      "n_embed": 16384,
      "num_res_blocks": 2,
      "out_ch": 3,
      "patch_size": [
        8,
        8
      ],
      "resolution": 256,
      "z_channels": 4
    },
    "input_modalities": [
      "text",
      "image",
      "image_history",
      "audio",
      "audio_history"
    ],
    "sequence_length": {
      "audio_history_input_samples": 128,
      "audio_input_samples": 128,
      "image_history_input_samples": 256,
      "image_input_samples": 576,
      "is_training": true,
      "num_frames": 4
    },
    "t5_config": {
      "audio_history_pos_emb": "llama_rope",
      "audio_patch_size": 16,
      "audio_pos_emb": "llama_rope",
      "audio_vit_patch_size": 16,
      "audio_vocab_size": 8320,
      "dalle_attn_mask": true,
      "decoder_max_audio_length": 512,
      "decoder_max_image_length": 1024,
      "decoder_max_text_length": 512,
      "decoder_xattention_internval": 1,
      "default_audio_history_vit_size": [
        256,
        128
      ],
      "default_audio_size": [
        256,
        128
      ],
      "default_audio_vit_size": [
        256,
        128
      ],
      "default_image_history_vit_size": [
        256,
        256
      ],
      "default_image_size": [
        256,
        256
      ],
      "default_image_vit_size": [
        384,
        384
      ],
      "dropout_broadcast_dims": [
        -2
      ],
      "dropout_rate": 0.0,
      "dynamic_unk_mask": true,
      "emb_dim": 1024,
      "encoder_max_audio_length": 128,
      "encoder_max_image_length": 576,
      "encoder_max_text_length": 512,
      "float32_attention_logits": true,
      "head_dim": 64,
      "image_history_pos_emb": "llama_rope",
      "image_patch_size": 16,
      "image_pos_emb": "llama_rope",
      "image_tokenizer_type": "vqgan",
      "image_vit_patch_size": 16,
      "image_vocab_size": 16512,
      "logits_via_embedding": true,
      "mlp_activations": [
        "silu",
        "linear"
      ],
      "mlp_dim": 2816,
      "num_decoder_layers": 24,
      "num_encoder_layers": 24,
      "num_heads": 16,
      "qk_norm": true,
      "text_pos_emb": "llama_rope",
      "vocab_size": 33280
    },
    "target_modalities": [
      "text",
      "image",
      "audio"
    ],
    "use_audio_history_vit": true,
    "use_audio_vit": true,
    "use_image_history_vit": true,
    "use_image_vit": true
  },
  "sequence_length": {
    "audio_history_input_samples": 128,
    "audio_input_samples": 128,
    "image_history_input_samples": 256,
    "image_input_samples": 576,
    "is_training": true,
    "num_frames": 4
  }
}
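
The JSON above is the complete preprocessor configuration. As a minimal Python sketch of how one might inspect it, the snippet below reads a locally saved copy of this file with the standard json module and prints the modality lists, the shared sequence-length settings, and the core dimensions of the T5-style backbone; the local filename "preprocessor_config.json" is an assumption about where the file was saved, and the printed values simply restate fields that appear in the config.

import json

# Minimal sketch: read a locally saved copy of this config file.
# The path "preprocessor_config.json" is an assumption about where it was downloaded.
with open("preprocessor_config.json") as f:
    raw = json.load(f)

cfg = raw["config"]

# Modalities the preprocessor prepares as inputs vs. those targeted for generation.
print(cfg["input_modalities"])   # ['text', 'image', 'image_history', 'audio', 'audio_history']
print(cfg["target_modalities"])  # ['text', 'image', 'audio']

# Shared sequence-length settings (mirrored at the top level of the file).
print(raw["sequence_length"])    # e.g. image_input_samples=576, audio_input_samples=128, num_frames=4

# Core dimensions of the T5-style encoder-decoder backbone.
t5 = cfg["t5_config"]
print(t5["emb_dim"], t5["num_heads"], t5["num_encoder_layers"], t5["num_decoder_layers"])  # 1024 16 24 24

In normal use this file is consumed when the preprocessor (the UnifiedIOPreprocessing object named in the commit message) is instantiated rather than read by hand; the sketch above is only for inspecting the stored values.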