|
{ |
|
"config": { |
|
"audio_history_cfg": { |
|
"attn_qk_norm": true, |
|
"attn_scaled_cosine": false, |
|
"clip_attn_logit": null, |
|
"dropout_broadcast_dims": [ |
|
-2 |
|
], |
|
"dropout_rate": 0.0, |
|
"droppath_rate": 0.0, |
|
"emb_dim": 768, |
|
"float32_attention_logits": true, |
|
"head_dim": 64, |
|
"latents_size": 16, |
|
"layer_drop": 0.0, |
|
"max_frames": 8, |
|
"mlp_activations": [ |
|
"gelu" |
|
], |
|
"mlp_dim": 2048, |
|
"num_heads": 12, |
|
"num_layers": 2, |
|
"resampler_type": "perceiver", |
|
"xattention_index": [ |
|
0, |
|
1 |
|
], |
|
"xattn_qk_norm": true, |
|
"xattn_scaled_cosine": false |
|
}, |
|
"audio_vit_cfg": { |
|
"default_input_size": [ |
|
256, |
|
128 |
|
], |
|
"dropout_broadcast_dims": [], |
|
"dropout_rate": 0.0, |
|
"emb_dim": 768, |
|
"float32_attention_logits": true, |
|
"head_dim": 64, |
|
"mlp_activations": [ |
|
"gelu" |
|
], |
|
"mlp_dim": 3072, |
|
"num_heads": 12, |
|
"num_layers": 11, |
|
"patch_size": 16, |
|
"pos_patch_size": 16, |
|
"transpose_input": true, |
|
"vit_embed": true |
|
}, |
|
"audio_vqgan": { |
|
"act_fn": "relu", |
|
"attention_dropout_rate": 0.0, |
|
"checkpoint_path": "", |
|
"decoder_head_dim": 64, |
|
"decoder_hidden_size": 512, |
|
"decoder_mlp_dim": 2048, |
|
"decoder_num_heads": 8, |
|
"decoder_num_layers": 8, |
|
"default_input_size": [ |
|
128, |
|
256 |
|
], |
|
"dropout_rate": 0.0, |
|
"droppath_rate": 0.0, |
|
"encoder_head_dim": 64, |
|
"encoder_hidden_size": 512, |
|
"encoder_mlp_dim": 2048, |
|
"encoder_num_heads": 8, |
|
"encoder_num_layers": 8, |
|
"output_channel": 1, |
|
"patch_size": [ |
|
8, |
|
8 |
|
], |
|
"proj_dim": 32, |
|
"use_bias": false, |
|
"use_decoder": true, |
|
"vocab_size": 8192 |
|
}, |
|
"freeze_vit": true, |
|
"image_history_cfg": { |
|
"attn_qk_norm": true, |
|
"attn_scaled_cosine": false, |
|
"clip_attn_logit": null, |
|
"dropout_broadcast_dims": [ |
|
-2 |
|
], |
|
"dropout_rate": 0.0, |
|
"droppath_rate": 0.0, |
|
"emb_dim": 768, |
|
"float32_attention_logits": true, |
|
"head_dim": 64, |
|
"latents_size": 32, |
|
"layer_drop": 0.0, |
|
"max_frames": 8, |
|
"mlp_activations": [ |
|
"gelu" |
|
], |
|
"mlp_dim": 2048, |
|
"num_heads": 12, |
|
"num_layers": 2, |
|
"resampler_type": "perceiver", |
|
"xattention_index": [ |
|
0, |
|
1 |
|
], |
|
"xattn_qk_norm": true, |
|
"xattn_scaled_cosine": false |
|
}, |
|
"image_vit_cfg": { |
|
"default_input_size": [ |
|
256, |
|
256 |
|
], |
|
"dropout_broadcast_dims": [], |
|
"dropout_rate": 0.0, |
|
"emb_dim": 768, |
|
"float32_attention_logits": true, |
|
"head_dim": 64, |
|
"mlp_activations": [ |
|
"gelu" |
|
], |
|
"mlp_dim": 3072, |
|
"num_heads": 12, |
|
"num_layers": 11, |
|
"num_pos": 197, |
|
"patch_size": 16, |
|
"pos_patch_size": 16 |
|
}, |
|
"image_vqgan": { |
|
"attn_resolutions": [ |
|
32 |
|
], |
|
"ch": 128, |
|
"ch_mult": [ |
|
1, |
|
2, |
|
2, |
|
4 |
|
], |
|
"checkpoint_path": "", |
|
"default_input_size": [ |
|
256, |
|
256 |
|
], |
|
"double_z": false, |
|
"dropout": 0, |
|
"embed_dim": 4, |
|
"in_channels": 3, |
|
"n_embed": 16384, |
|
"num_res_blocks": 2, |
|
"out_ch": 3, |
|
"patch_size": [ |
|
8, |
|
8 |
|
], |
|
"resolution": 256, |
|
"z_channels": 4 |
|
}, |
|
"input_modalities": [ |
|
"text", |
|
"image", |
|
"image_history", |
|
"audio", |
|
"audio_history" |
|
], |
|
"sequence_length": { |
|
"audio_history_input_samples": 128, |
|
"audio_input_samples": 128, |
|
"image_history_input_samples": 256, |
|
"image_input_samples": 576, |
|
"is_training": true, |
|
"num_frames": 4 |
|
}, |
|
"t5_config": { |
|
"audio_history_pos_emb": "llama_rope", |
|
"audio_patch_size": 16, |
|
"audio_pos_emb": "llama_rope", |
|
"audio_vit_patch_size": 16, |
|
"audio_vocab_size": 8320, |
|
"dalle_attn_mask": true, |
|
"decoder_max_audio_length": 512, |
|
"decoder_max_image_length": 1024, |
|
"decoder_max_text_length": 512, |
|
"decoder_xattention_internval": 1, |
|
"default_audio_history_vit_size": [ |
|
256, |
|
128 |
|
], |
|
"default_audio_size": [ |
|
256, |
|
128 |
|
], |
|
"default_audio_vit_size": [ |
|
256, |
|
128 |
|
], |
|
"default_image_history_vit_size": [ |
|
256, |
|
256 |
|
], |
|
"default_image_size": [ |
|
256, |
|
256 |
|
], |
|
"default_image_vit_size": [ |
|
384, |
|
384 |
|
], |
|
"dropout_broadcast_dims": [ |
|
-2 |
|
], |
|
"dropout_rate": 0.0, |
|
"dynamic_unk_mask": true, |
|
"emb_dim": 1024, |
|
"encoder_max_audio_length": 128, |
|
"encoder_max_image_length": 576, |
|
"encoder_max_text_length": 512, |
|
"float32_attention_logits": true, |
|
"head_dim": 64, |
|
"image_history_pos_emb": "llama_rope", |
|
"image_patch_size": 16, |
|
"image_pos_emb": "llama_rope", |
|
"image_tokenizer_type": "vqgan", |
|
"image_vit_patch_size": 16, |
|
"image_vocab_size": 16512, |
|
"logits_via_embedding": true, |
|
"mlp_activations": [ |
|
"silu", |
|
"linear" |
|
], |
|
"mlp_dim": 2816, |
|
"num_decoder_layers": 24, |
|
"num_encoder_layers": 24, |
|
"num_heads": 16, |
|
"qk_norm": true, |
|
"text_pos_emb": "llama_rope", |
|
"vocab_size": 33280 |
|
}, |
|
"target_modalities": [ |
|
"text", |
|
"image", |
|
"audio" |
|
], |
|
"use_audio_history_vit": true, |
|
"use_audio_vit": true, |
|
"use_image_history_vit": true, |
|
"use_image_vit": true |
|
}, |
|
"sequence_length": { |
|
"audio_history_input_samples": 128, |
|
"audio_input_samples": 128, |
|
"image_history_input_samples": 256, |
|
"image_input_samples": 576, |
|
"is_training": true, |
|
"num_frames": 4 |
|
} |
|
} |
|
|