```text
Josiev47(
  (encoder): EncoderModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): make_image_to_video()
            (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
      (audio): AudioPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
    )
    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): Identity()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): DropPath()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
    )
  )
  (llm): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, padding_idx={padding_idx})
      (layers): ModuleList(
        (0-n): LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
            (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): RMSNorm()
          (post_attention_layernorm): RMSNorm()
        )
      )
      (norm): RMSNorm()
    )
    (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
  )
  (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, padding_idx={padding_idx})
)
```
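`RMSNorm()` appears at every normalization point in both the encoder towers and the Llama stack. The printout shows no arguments, so the `eps` value and learnable scale below are assumptions; the operation itself is the standard root-mean-square LayerNorm:

```python
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square LayerNorm: no mean subtraction, no bias term."""
    def __init__(self, dim: int, eps: float = 1e-6):  # eps is an assumed default
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x * 1/sqrt(mean(x^2) + eps), then a learned per-channel scale
        rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
        return self.weight * (x * rms)
```

Compared with LayerNorm, skipping the mean subtraction and bias saves a small amount of compute per token, which is why it is the default in Llama-family models.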
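In the vision branch, `make_image_to_video()` followed by `Conv3d(3, d, kernel_size=(2, 14, 14), stride=(2, 14, 14))` suggests that still images are lifted to short clips and then tubelet-patchified, with each 2-frame x 14x14-pixel tubelet becoming one token. A minimal sketch under that assumption (the real `PatchEmbedGeneric` and `make_image_to_video` internals may differ; `RMSNorm` is the sketch above):

```python
import torch
import torch.nn as nn

class VisionStem(nn.Module):
    """Sketch of the vision rgbt_stem: tubelet patch embedding."""
    def __init__(self, embed_dim: int):
        super().__init__()
        self.proj = nn.Conv3d(3, embed_dim, kernel_size=(2, 14, 14),
                              stride=(2, 14, 14), bias=False)
        self.norm_layer = RMSNorm(embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Assumed behaviour of make_image_to_video(): repeat a still image
        # along a new time axis so the 2-frame tubelet kernel has input.
        if x.ndim == 4:                                   # (B, 3, H, W)
            x = x.unsqueeze(2).expand(-1, -1, 2, -1, -1)  # (B, 3, 2, H, W)
        x = self.proj(x)                                  # (B, D, T', H', W')
        x = x.flatten(2).transpose(1, 2)                  # (B, N_tokens, D)
        return self.norm_layer(x)
```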
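The audio stem patchifies a single-channel input with 16x16 kernels at stride 10, i.e. overlapping patches; this is the same kernel/stride combination ImageBind-style preprocessors use over mel spectrograms, so a spectrogram input layout is assumed here:

```python
class AudioStem(nn.Module):
    """Sketch of the audio rgbt_stem: overlapping 16x16 patches, stride 10."""
    def __init__(self, embed_dim: int):
        super().__init__()
        self.proj = nn.Conv2d(1, embed_dim, kernel_size=16, stride=10, bias=False)
        self.norm_layer = RMSNorm(embed_dim)

    def forward(self, spec: torch.Tensor) -> torch.Tensor:
        # spec: (B, 1, n_mels, time) -- assumed input layout
        x = self.proj(spec)                   # (B, D, F', T')
        x = x.flatten(2).transpose(1, 2)      # (B, N_tokens, D)
        return self.norm_layer(x)
```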
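Each `EncoderTransformerLayer` registers a fused `qkv` projection, an output `proj`, two `RMSNorm`s, an `fc1 -> SiLU -> fc2` MLP, and a `drop_path` that is `Identity()` in the vision tower but `DropPath()` (stochastic depth) in the audio tower. The repr fixes the submodules but not the residual wiring, so the pre-norm ViT-style ordering below is an assumption; note also that if `qkv` is the usual fused 3x-width projection, `out_features=3072` would imply an encoder width of 1024, so treat the templated `{llm_in_embedding}` literals with care. The `p=0.0` dropouts are omitted:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class EncoderTransformerLayer(nn.Module):
    """Sketch matching the module names above; residual wiring assumed."""
    def __init__(self, dim: int, n_heads: int, mlp_dim: int = 4096,
                 drop_path: float = 0.0):
        super().__init__()
        assert dim % n_heads == 0
        self.n_heads = n_heads
        self.norm_1 = RMSNorm(dim)
        self.qkv = nn.Linear(dim, 3 * dim, bias=True)
        self.proj = nn.Linear(dim, dim, bias=True)
        self.norm_2 = RMSNorm(dim)
        self.mlp = nn.Sequential(              # fc1 -> act -> fc2, per the repr
            nn.Linear(dim, mlp_dim, bias=True),
            nn.SiLU(),
            nn.Linear(mlp_dim, dim, bias=True),
        )
        self.drop_path_p = drop_path           # 0.0 behaves like Identity()

    def _drop_path(self, x: torch.Tensor) -> torch.Tensor:
        # Stochastic depth: randomly drop the whole residual branch per sample.
        if not self.training or self.drop_path_p == 0.0:
            return x
        keep = 1.0 - self.drop_path_p
        mask = x.new_empty(x.shape[0], 1, 1).bernoulli_(keep)
        return x * mask / keep

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, D = x.shape
        h = self.norm_1(x)
        q, k, v = self.qkv(h).chunk(3, dim=-1)
        q, k, v = (t.view(B, N, self.n_heads, -1).transpose(1, 2)
                   for t in (q, k, v))
        h = F.scaled_dot_product_attention(q, k, v)   # torch >= 2.0
        h = h.transpose(1, 2).reshape(B, N, D)
        x = x + self._drop_path(self.proj(h))
        x = x + self._drop_path(self.mlp(self.norm_2(x)))
        return x
```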
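Finally, a hedged sketch of how the pieces plausibly compose end to end. The printout only fixes the module layout, not the forward pass, so everything below is hypothetical wiring: each tower's `head` is read as the projection into the LLM embedding space, and the resulting modality tokens are prepended to the text embeddings. Passing precomputed embeddings via `inputs_embeds` is a real `LlamaForCausalLM` API:

```python
import torch

@torch.no_grad()
def multimodal_prefill(model, clip, input_ids):
    # Hypothetical forward pass over the attribute names shown in the printout.
    enc = model.encoder
    tokens = enc.modality_preprocessors["vision"](clip)    # patchify + pos-embed
    tokens = enc.modality_transformers["vision"](tokens)   # ends in `head`, mapping
                                                           # into the LLM token space
    text = model.input_embeddings(input_ids)               # (B, L, D)
    inputs_embeds = torch.cat([tokens, text], dim=1)       # modality tokens first
    return model.llm(inputs_embeds=inputs_embeds).logits
```

The audio path would run identically through the `"audio"` entries of the two `ModuleDict`s; nothing in the repr indicates whether modalities are interleaved or simply concatenated, so the prefix ordering here is another assumption.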