{ "config": { "audio_history_cfg": { "attn_qk_norm": true, "attn_scaled_cosine": false, "clip_attn_logit": null, "dropout_broadcast_dims": [ -2 ], "dropout_rate": 0.0, "droppath_rate": 0.0, "emb_dim": 768, "float32_attention_logits": true, "head_dim": 64, "latents_size": 16, "layer_drop": 0.0, "max_frames": 8, "mlp_activations": [ "gelu" ], "mlp_dim": 2048, "num_heads": 12, "num_layers": 2, "resampler_type": "perceiver", "xattention_index": [ 0, 1 ], "xattn_qk_norm": true, "xattn_scaled_cosine": false }, "audio_vit_cfg": { "default_input_size": [ 256, 128 ], "dropout_broadcast_dims": [], "dropout_rate": 0.0, "emb_dim": 768, "float32_attention_logits": true, "head_dim": 64, "mlp_activations": [ "gelu" ], "mlp_dim": 3072, "num_heads": 12, "num_layers": 11, "patch_size": 16, "pos_patch_size": 16, "transpose_input": true, "vit_embed": true }, "audio_vqgan": { "act_fn": "relu", "attention_dropout_rate": 0.0, "checkpoint_path": "", "decoder_head_dim": 64, "decoder_hidden_size": 512, "decoder_mlp_dim": 2048, "decoder_num_heads": 8, "decoder_num_layers": 8, "default_input_size": [ 128, 256 ], "dropout_rate": 0.0, "droppath_rate": 0.0, "encoder_head_dim": 64, "encoder_hidden_size": 512, "encoder_mlp_dim": 2048, "encoder_num_heads": 8, "encoder_num_layers": 8, "output_channel": 1, "patch_size": [ 8, 8 ], "proj_dim": 32, "use_bias": false, "use_decoder": true, "vocab_size": 8192 }, "freeze_vit": true, "image_history_cfg": { "attn_qk_norm": true, "attn_scaled_cosine": false, "clip_attn_logit": null, "dropout_broadcast_dims": [ -2 ], "dropout_rate": 0.0, "droppath_rate": 0.0, "emb_dim": 768, "float32_attention_logits": true, "head_dim": 64, "latents_size": 32, "layer_drop": 0.0, "max_frames": 8, "mlp_activations": [ "gelu" ], "mlp_dim": 2048, "num_heads": 12, "num_layers": 2, "resampler_type": "perceiver", "xattention_index": [ 0, 1 ], "xattn_qk_norm": true, "xattn_scaled_cosine": false }, "image_vit_cfg": { "default_input_size": [ 256, 256 ], "dropout_broadcast_dims": [], "dropout_rate": 0.0, "emb_dim": 768, "float32_attention_logits": true, "head_dim": 64, "mlp_activations": [ "gelu" ], "mlp_dim": 3072, "num_heads": 12, "num_layers": 11, "num_pos": 197, "patch_size": 16, "pos_patch_size": 16 }, "image_vqgan": { "attn_resolutions": [ 32 ], "ch": 128, "ch_mult": [ 1, 2, 2, 4 ], "checkpoint_path": "", "default_input_size": [ 256, 256 ], "double_z": false, "dropout": 0, "embed_dim": 4, "in_channels": 3, "n_embed": 16384, "num_res_blocks": 2, "out_ch": 3, "patch_size": [ 8, 8 ], "resolution": 256, "z_channels": 4 }, "input_modalities": [ "text", "image", "image_history", "audio", "audio_history" ], "sequence_length": { "audio_history_input_samples": 128, "audio_input_samples": 128, "image_history_input_samples": 256, "image_input_samples": 576, "is_training": true, "num_frames": 4 }, "t5_config": { "audio_history_pos_emb": "llama_rope", "audio_patch_size": 16, "audio_pos_emb": "llama_rope", "audio_vit_patch_size": 16, "audio_vocab_size": 8320, "dalle_attn_mask": true, "decoder_max_audio_length": 512, "decoder_max_image_length": 1024, "decoder_max_text_length": 512, "decoder_xattention_internval": 1, "default_audio_history_vit_size": [ 256, 128 ], "default_audio_size": [ 256, 128 ], "default_audio_vit_size": [ 256, 128 ], "default_image_history_vit_size": [ 256, 256 ], "default_image_size": [ 256, 256 ], "default_image_vit_size": [ 384, 384 ], "dropout_broadcast_dims": [ -2 ], "dropout_rate": 0.0, "dynamic_unk_mask": true, "emb_dim": 1024, "encoder_max_audio_length": 128, "encoder_max_image_length": 576, "encoder_max_text_length": 512, "float32_attention_logits": true, "head_dim": 64, "image_history_pos_emb": "llama_rope", "image_patch_size": 16, "image_pos_emb": "llama_rope", "image_tokenizer_type": "vqgan", "image_vit_patch_size": 16, "image_vocab_size": 16512, "logits_via_embedding": true, "mlp_activations": [ "silu", "linear" ], "mlp_dim": 2816, "num_decoder_layers": 24, "num_encoder_layers": 24, "num_heads": 16, "qk_norm": true, "text_pos_emb": "llama_rope", "vocab_size": 33280 }, "target_modalities": [ "text", "image", "audio" ], "use_audio_history_vit": true, "use_audio_vit": true, "use_image_history_vit": true, "use_image_vit": true }, "sequence_length": { "audio_history_input_samples": 128, "audio_input_samples": 128, "image_history_input_samples": 256, "image_input_samples": 576, "is_training": true, "num_frames": 4 } }