MLX
German
English
Mixture of Experts
multimodal
vision
audio
endtoend
j.o.s.i.e.
File size: 4,795 Bytes
a905bf9
 
 
 
 
 
5c1268c
a905bf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
Josiev47(
  (encoder): EncoderModel(
    (modality_preprocessors): ModuleDict(
        (vision): RGBDTPreprocessor(
            (rgbt_stem): PatchEmbedGeneric(
                (proj): Sequential(
                    (0): make_image_to_video()
                    (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
                )
                (norm_layer): RMSNorm()
            )
            (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
        )
        (audio): AudioPreprocessor(
            (rgbt_stem): PatchEmbedGeneric(
                (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
                (norm_layer): RMSNorm()
            )
            (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
        )
    )

    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
            (0 - n): EncoderTransformerLayer(
                (attn): EncoderTransformerAttention(
                    (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
                    (attn_drop): Dropout(p=0.0, inplace=False)
                    (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
                    (proj_drop): Dropout(p=0.0, inplace=False)
                )
                (drop_path): Identity()
                (norm_1): RMSNorm()
                (mlp): EncoderMLP(
                    (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
                    (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
                    (drop): Dropout(p=0.0, inplace=False)
                    (act): SiLU()
                )
                (norm_2): RMSNorm()
            )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
            (0 - n): EncoderTransformerLayer(
                (attn): EncoderTransformerAttention(
                    (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
                    (attn_drop): Dropout(p=0.0, inplace=False)
                    (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
                    (proj_drop): Dropout(p=0.0, inplace=False)
                )
                (drop_path): DropPath()
                (norm_1): RMSNorm()
                (mlp): EncoderMLP(
                    (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
                    (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
                    (drop): Dropout(p=0.0, inplace=False)
                    (act): SiLU()
                )
                (norm_2): RMSNorm()
            )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
    )
  )


  (llm): LlamaForCausalLM(
    (model): LlamaModel(
        (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})

        (layers): ModuleList(
            (0 - n): LlamaDecoderLayer(
                (self_attn): LlamaAttention(
                    (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
                    (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
                    (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
                    (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
                    (rotary_emb): LlamaRotaryEmbedding()
                )
                (mlp): LlamaMLP(
                    (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
                    (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
                    (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
                    (act_fn): SiLU()
                )
                (input_layernorm): RMSNorm()
                (post_attention_layernorm): RMSNorm()
            )
        )
        (norm): RMSNorm()
    )
    (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
  )

  (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
)