J.O.S.I.E.v4o / Version4.7-architecture.txt
Josiev47(
  (encoder): EncoderModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): make_image_to_video()
            (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
      (audio): AudioPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
    )
    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): Identity()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0-n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): DropPath()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
    )
  )
  (llm): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, padding_idx={padding_idx})
      (layers): ModuleList(
        (0-n): LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
            (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): RMSNorm()
          (post_attention_layernorm): RMSNorm()
        )
      )
      (norm): RMSNorm()
    )
    (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
  )
  (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, padding_idx={padding_idx})
)
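
The tree above is a raw PyTorch module printout; {llm_in_embedding}, {vocab_size}, and {padding_idx} are configuration placeholders. Below is a minimal sketch of the two patch-embedding stems (PatchEmbedGeneric). The Conv3d/Conv2d hyperparameters are copied from the printout; everything else is an assumption: make_image_to_video() is read as "tile a still image along a new time axis so the temporal kernel of size 2 has frames to cover", the audio input is read as a 1-channel mel spectrogram, and {llm_in_embedding} is taken to be 1024 (the qkv layer's out_features of 3072 = 3 * 1024 hints at this). nn.RMSNorm (PyTorch >= 2.4) stands in for the repo's RMSNorm throughout these sketches.

import torch
import torch.nn as nn

class MakeImageToVideo(nn.Module):
    """Assumed behavior of make_image_to_video(): tile one frame into a clip."""
    def __init__(self, num_frames: int = 2):
        super().__init__()
        self.num_frames = num_frames

    def forward(self, x):
        if x.ndim == 4:  # (B, C, H, W) -> (B, C, T, H, W)
            x = x.unsqueeze(2).repeat(1, 1, self.num_frames, 1, 1)
        return x

llm_in_embedding = 1024  # assumption, see lead-in

vision_stem = nn.Sequential(
    MakeImageToVideo(),
    nn.Conv3d(3, llm_in_embedding, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False),
)
audio_stem = nn.Conv2d(1, llm_in_embedding, kernel_size=(16, 16), stride=(10, 10), bias=False)
norm_layer = nn.RMSNorm(llm_in_embedding)

img = torch.randn(1, 3, 224, 224)  # one RGB image (resolution assumed)
mel = torch.randn(1, 1, 128, 204)  # assumed mel spectrogram: (batch, 1, freq, time)

# Patchify, flatten the spatial grid into a token axis, then normalize.
vision_tokens = norm_layer(vision_stem(img).flatten(2).transpose(1, 2))  # (1, 256, 1024)
audio_tokens = norm_layer(audio_stem(mel).flatten(2).transpose(1, 2))    # (1, 228, 1024)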
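
One EncoderTransformerLayer, sketched under the assumption of pre-norm residual wiring (norm_1 before attn, norm_2 before mlp) and 8 attention heads; neither detail is fixed by the printout. Note the only structural difference between the two towers: vision shows Identity() where audio shows DropPath(), i.e. only the audio tower applies stochastic depth.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DropPath(nn.Module):
    """Stochastic depth: randomly skip the residual branch per sample."""
    def __init__(self, p: float = 0.1):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        keep = 1.0 - self.p
        mask = x.new_empty(x.shape[0], *([1] * (x.ndim - 1))).bernoulli_(keep)
        return x * mask / keep

class EncoderTransformerAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8):  # num_heads is an assumption
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, 3 * dim, bias=True)  # out_features=3072 when dim=1024
        self.attn_drop = nn.Dropout(0.0)
        self.proj = nn.Linear(dim, dim, bias=True)
        self.proj_drop = nn.Dropout(0.0)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)  # each (B, heads, N, head_dim)
        out = F.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p)
        out = out.transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))

class EncoderMLP(nn.Module):
    def __init__(self, dim: int, hidden: int = 4096):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden, bias=True)
        self.fc2 = nn.Linear(hidden, dim, bias=True)
        self.drop = nn.Dropout(0.0)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc1(x))))

class EncoderTransformerLayer(nn.Module):
    def __init__(self, dim: int, drop_path: float = 0.0):
        super().__init__()
        self.attn = EncoderTransformerAttention(dim)
        # Identity() in the vision tower, DropPath() in the audio tower.
        self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity()
        self.norm_1 = nn.RMSNorm(dim)
        self.mlp = EncoderMLP(dim)
        self.norm_2 = nn.RMSNorm(dim)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm_1(x)))
        x = x + self.drop_path(self.mlp(self.norm_2(x)))
        return x

layer = EncoderTransformerLayer(dim=1024, drop_path=0.1)  # audio-style layer
tokens = torch.randn(1, 228, 1024)
assert layer(tokens).shape == tokens.shape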
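
The LlamaMLP blocks are the standard gated SiLU feed-forward used by Llama in Hugging Face transformers: down_proj(act_fn(gate_proj(x)) * up_proj(x)). A sketch with the 4096 hidden width from the printout and the assumed 1024 model width:

import torch
import torch.nn as nn

class LlamaMLP(nn.Module):
    def __init__(self, dim: int = 1024, hidden: int = 4096):  # dim is an assumption
        super().__init__()
        self.gate_proj = nn.Linear(dim, hidden, bias=False)
        self.down_proj = nn.Linear(hidden, dim, bias=False)
        self.up_proj = nn.Linear(dim, hidden, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        # Gated feed-forward: the SiLU-activated gate modulates the up projection.
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))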
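
Finally, how the pieces plausibly connect end to end: each modality is patchified, run through its EncoderTransformer, projected by the per-modality head into the LLM embedding space, and fused with the text embeddings before the Llama decoder. The printout does not show the fusion step, so the signature of josie.encoder and the simple token concatenation below are assumptions, not the repo's confirmed interface.

import torch

def forward(josie, input_ids, image=None, audio=None):
    parts = []
    if image is not None:
        # Assumed encoder signature; the printout only names the submodules.
        parts.append(josie.encoder(image, modality="vision"))  # (B, N_img, D)
    if audio is not None:
        parts.append(josie.encoder(audio, modality="audio"))   # (B, N_aud, D)
    parts.append(josie.input_embeddings(input_ids))            # (B, N_txt, D)
    inputs_embeds = torch.cat(parts, dim=1)                    # assumed fusion: concat
    return josie.llm(inputs_embeds=inputs_embeds)              # logits over {vocab_size}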