Tags: MLX, German, English, Mixture of Experts, multimodal, vision, audio, end-to-end, j.o.s.i.e.
Commit a905bf9 by Isaak-Carter
1 Parent(s): 287e1c0

Upload Version4.7-architecture.txt

Files changed (1): Version4.7-architecture.txt (+105, -0)
Version4.7-architecture.txt ADDED
@@ -0,0 +1,105 @@
+ Josiev47(
+   (encoder): EncoderModel(
+     (modality_preprocessors): ModuleDict(
+       (vision): RGBDTPreprocessor(
+         (rgbt_stem): PatchEmbedGeneric(
+           (proj): Sequential(
+             (0): PadIm2Video()
+             (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
+           )
+           (norm_layer): RMSNorm()
+         )
+         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
+       )
+       (audio): AudioPreprocessor(
+         (rgbt_stem): PatchEmbedGeneric(
+           (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
+           (norm_layer): RMSNorm()
+         )
+         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
+       )
+     )
+
+     (modality_transformers): ModuleDict(
+       (vision): EncoderTransformer(
+         (pre_transformer_layer_norm): RMSNorm()
+         (layers): ModuleList(
+           (0 - n): EncoderTransformerLayer(
+             (attn): EncoderTransformerAttention(
+               (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
+               (attn_drop): Dropout(p=0.0, inplace=False)
+               (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
+               (proj_drop): Dropout(p=0.0, inplace=False)
+             )
+             (drop_path): Identity()
+             (norm_1): RMSNorm()
+             (mlp): EncoderMLP(
+               (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
+               (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
+               (drop): Dropout(p=0.0, inplace=False)
+               (act): SiLU()
+             )
+             (norm_2): RMSNorm()
+           )
+         )
+         (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+         (post_transformer_layer_norm): RMSNorm()
+       )
+       (audio): EncoderTransformer(
+         (pre_transformer_layer_norm): RMSNorm()
+         (layers): ModuleList(
+           (0 - n): EncoderTransformerLayer(
+             (attn): EncoderTransformerAttention(
+               (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
+               (attn_drop): Dropout(p=0.0, inplace=False)
+               (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
+               (proj_drop): Dropout(p=0.0, inplace=False)
+             )
+             (drop_path): DropPath()
+             (norm_1): RMSNorm()
+             (mlp): EncoderMLP(
+               (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
+               (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
+               (drop): Dropout(p=0.0, inplace=False)
+               (act): SiLU()
+             )
+             (norm_2): RMSNorm()
+           )
+         )
+         (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+         (post_transformer_layer_norm): RMSNorm()
+       )
+     )
+   )
+
+
+   (llm): LlamaForCausalLM(
+     (model): LlamaModel(
+       (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
+
+       (layers): ModuleList(
+         (0 - n): LlamaDecoderLayer(
+           (self_attn): LlamaAttention(
+             (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+             (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+             (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+             (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
+             (rotary_emb): LlamaRotaryEmbedding()
+           )
+           (mlp): LlamaMLP(
+             (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
+             (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
+             (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
+             (act_fn): SiLU()
+           )
+           (input_layernorm): RMSNorm()
+           (post_attention_layernorm): RMSNorm()
+         )
+       )
+       (norm): RMSNorm()
+     )
+     (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
+   )
+
+   (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
+ )
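
The vision stem above turns an image or clip into a token sequence: PadIm2Video lifts a 4D image batch into a 5D clip so one Conv3d can serve both, and the Conv3d cuts non-overlapping 2x14x14 tubelets, one embedding each. Below is a minimal PyTorch sketch of that stem; PadIm2Video's repeat-the-frame behaviour and the concrete width D=1024 are assumptions (the dump leaves {llm_in_embedding} templated, though the hard-coded qkv width 3072 = 3 x 1024 hints at 1024).

import torch
import torch.nn as nn

D = 1024  # assumption: {llm_in_embedding} is templated in the dump

class PadIm2Video(nn.Module):
    """Assumed behaviour: repeat a 4D image batch (B, C, H, W) along a new
    time axis to get (B, C, T, H, W), so images and clips share one stem."""
    def __init__(self, ntimes=2):
        super().__init__()
        self.ntimes = ntimes

    def forward(self, x):
        if x.ndim == 4:  # image -> fake 2-frame clip
            x = x.unsqueeze(2).repeat(1, 1, self.ntimes, 1, 1)
        return x

rgbt_stem = nn.Sequential(
    PadIm2Video(ntimes=2),
    # one embedding per non-overlapping 2x14x14 tubelet, as in the dump
    nn.Conv3d(3, D, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False),
)

img = torch.randn(1, 3, 224, 224)    # illustrative input size
tok = rgbt_stem(img)                 # (1, D, 1, 16, 16)
tok = tok.flatten(2).transpose(1, 2) # (1, 256, D) token sequence
print(tok.shape)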
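
The audio stem is a single Conv2d(1, D, kernel_size=16, stride=10) over a 1-channel spectrogram; because the stride is smaller than the kernel, neighbouring 16x16 patches overlap by 6 bins. A sketch under the same assumed D; the mel-spectrogram input and its size are illustrative only.

import torch
import torch.nn as nn

D = 1024  # same assumption as above

audio_stem = nn.Conv2d(1, D, kernel_size=(16, 16), stride=(10, 10), bias=False)

mel = torch.randn(1, 1, 128, 204)    # (B, 1, mel_bins, frames); size illustrative
tok = audio_stem(mel)                # (1, D, 12, 19): overlapping 16x16 patches
tok = tok.flatten(2).transpose(1, 2) # (1, 228, D) audio token sequence
print(tok.shape)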
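
Each EncoderTransformerLayer pairs a fused-qkv attention with a SiLU MLP, and the norm_1/norm_2 RMSNorms suggest the usual pre-norm residual wiring: x + attn(norm_1(x)), then x + mlp(norm_2(x)). A sketch assuming that wiring and an illustrative head count of 16 (the dump prints neither); dropout and drop_path are 0 or Identity above, so they are omitted here.

import torch
import torch.nn as nn
import torch.nn.functional as F

D, MLP_DIM, N_HEADS = 1024, 4096, 16  # D inferred from qkv width 3*1024; heads assumed

class EncoderTransformerLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm_1 = nn.RMSNorm(D)  # nn.RMSNorm needs PyTorch >= 2.4
        self.qkv = nn.Linear(D, 3 * D, bias=True)
        self.proj = nn.Linear(D, D, bias=True)
        self.norm_2 = nn.RMSNorm(D)
        self.fc1 = nn.Linear(D, MLP_DIM, bias=True)
        self.fc2 = nn.Linear(MLP_DIM, D, bias=True)

    def forward(self, x):
        # Attention sub-block: pre-norm, fused qkv projection, residual add.
        B, N, _ = x.shape
        q, k, v = self.qkv(self.norm_1(x)).chunk(3, dim=-1)
        q, k, v = (t.reshape(B, N, N_HEADS, D // N_HEADS).transpose(1, 2)
                   for t in (q, k, v))
        attn = F.scaled_dot_product_attention(q, k, v)
        x = x + self.proj(attn.transpose(1, 2).reshape(B, N, D))
        # MLP sub-block: pre-norm, SiLU activation, residual add.
        return x + self.fc2(F.silu(self.fc1(self.norm_2(x))))

layer = EncoderTransformerLayer()
print(layer(torch.randn(1, 256, D)).shape)  # torch.Size([1, 256, 1024])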
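
The LlamaMLP printed in the decoder is the standard gated (SwiGLU-style) MLP from the transformers Llama implementation: down_proj(silu(gate_proj(x)) * up_proj(x)), with the 4096-wide hidden layer shown above.

import torch
import torch.nn as nn
import torch.nn.functional as F

D, HIDDEN = 1024, 4096  # hidden width 4096 as printed; D assumed as above

class LlamaMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.gate_proj = nn.Linear(D, HIDDEN, bias=False)
        self.up_proj = nn.Linear(D, HIDDEN, bias=False)
        self.down_proj = nn.Linear(HIDDEN, D, bias=False)

    def forward(self, x):
        # SiLU-gated expansion, then projection back to the model width
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))

print(LlamaMLP()(torch.randn(1, 8, D)).shape)  # torch.Size([1, 8, 1024])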
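
Finally, each encoder ends in a bias-free head Linear into {llm_in_embedding}, which is what lets the encoder tokens share a space with the embed_tokens output and reach the Llama decoder as one sequence. The dump shows only the modules, not the forward pass, so the fusion below (modalities first, then text, passed via inputs_embeds) is an assumption about how the pieces fit together.

import torch

def fuse_and_forward(llm, text_ids, vision_tokens, audio_tokens):
    """llm: a transformers LlamaForCausalLM. *_tokens: (B, N, D) encoder
    outputs, already mapped by each encoder's bias-free `head` Linear
    into the LLM embedding space. The ordering here is a guess."""
    text_embeds = llm.get_input_embeddings()(text_ids)  # (B, T, D)
    inputs_embeds = torch.cat([vision_tokens, audio_tokens, text_embeds], dim=1)
    return llm(inputs_embeds=inputs_embeds)             # logits over {vocab_size}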