Josiev47(
  (encoder): EncoderModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): make_image_to_video()
            (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
      (audio): AudioPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
    )
    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): Identity()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): DropPath()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
    )
  )
  (llm): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
      (layers): ModuleList(
        (0-n): LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
            (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): RMSNorm()
          (post_attention_layernorm): RMSNorm()
        )
      )
      (norm): RMSNorm()
    )
    (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
  )
  (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
)
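For orientation, below is a minimal runnable sketch of one EncoderTransformerLayer as implied by the dump above. The module names (RMSNorm, qkv, proj, fc1/fc2, norm_1/norm_2) mirror the printed tree, but everything else is an assumption: the dump shows module registration order, not the forward pass, so the pre-norm residual ordering, the head count, and the hidden width are inferred. In particular, llm_in_embedding = 1024 is deduced from out_features=3072 on the fused qkv projection (3 x 1024) and out_features=4096 on fc1 (4 x 1024); dropout and drop_path are omitted since both are p=0.0 / Identity in the vision branch.

import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    # Root-mean-square layer norm, used throughout the printed model.
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
        return x * rms * self.weight


class EncoderTransformerLayer(nn.Module):
    # Sketch of one encoder block: fused qkv attention + SiLU MLP,
    # two RMSNorms. Pre-norm ordering is an ASSUMPTION, not shown in the dump.
    def __init__(self, dim: int = 1024, num_heads: int = 8, mlp_dim: int = 4096):
        super().__init__()
        self.num_heads = num_heads
        self.norm_1 = RMSNorm(dim)
        self.qkv = nn.Linear(dim, 3 * dim, bias=True)   # out_features = 3072 in the dump
        self.proj = nn.Linear(dim, dim, bias=True)
        self.norm_2 = RMSNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim, bias=True),         # fc1: 1024 -> 4096
            nn.SiLU(),                                  # act
            nn.Linear(mlp_dim, dim, bias=True),         # fc2: 4096 -> 1024
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, d = x.shape
        h = self.norm_1(x)
        q, k, v = self.qkv(h).chunk(3, dim=-1)
        # Split into (batch, heads, tokens, head_dim) for attention.
        q, k, v = (
            z.view(b, t, self.num_heads, d // self.num_heads).transpose(1, 2)
            for z in (q, k, v)
        )
        attn = nn.functional.scaled_dot_product_attention(q, k, v)
        attn = attn.transpose(1, 2).reshape(b, t, d)
        x = x + self.proj(attn)           # residual around attention
        x = x + self.mlp(self.norm_2(x))  # residual around MLP
        return x


if __name__ == "__main__":
    layer = EncoderTransformerLayer()
    tokens = torch.randn(2, 16, 1024)     # (batch, tokens, llm_in_embedding)
    print(layer(tokens).shape)            # torch.Size([2, 16, 1024])

In the full model, a stack of these layers (per modality, under modality_transformers) produces embeddings in the same width as the LLaMA token embeddings, so the head output can be prepended or interleaved with embed_tokens output before the LlamaDecoderLayer stack; that wiring is likewise implied by the shared {llm_in_embedding} width rather than stated in the dump.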