itriedcoding committed on
Commit
3b1dd61
·
verified ·
1 Parent(s): ff73b98

Add config.json

Browse files
Files changed (1) hide show
  1. config.json +83 -0
config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "OMNI-LITE (Unified Sparse-Multimodal Transformer)",
3
+ "layers": [
4
+ {
5
+ "type": "Conv2d",
6
+ "params": {
7
+ "in_channels": 3,
8
+ "out_channels": 1024,
9
+ "kernel_size": 14,
10
+ "stride": 14,
11
+ "note": "Vision Patch Embedding for ViT encoder"
12
+ }
13
+ },
14
+ {
15
+ "type": "TransformerBlock",
16
+ "params": {
17
+ "embed_dim": 1024,
18
+ "num_heads": 16,
19
+ "ff_dim": 4096,
20
+ "depth": 12,
21
+ "note": "Lightweight Vision Transformer (ViT) Backbone"
22
+ }
23
+ },
24
+ {
25
+ "type": "TransformerBlock",
26
+ "params": {
27
+ "type": "PerceiverResampler",
28
+ "num_latents": 64,
29
+ "embed_dim": 2048,
30
+ "note": "Maps visual features to text latent space"
31
+ }
32
+ },
33
+ {
34
+ "type": "Linear",
35
+ "params": {
36
+ "in_features": 32000,
37
+ "out_features": 2048,
38
+ "note": "Text Token Embedding layer"
39
+ }
40
+ },
41
+ {
42
+ "type": "TransformerBlock",
43
+ "params": {
44
+ "type": "GQA_MoE_Layer",
45
+ "repeat": 24,
46
+ "num_experts": 16,
47
+ "top_k": 2,
48
+ "hidden_dim": 2048,
49
+ "num_heads": 32,
50
+ "num_kv_heads": 8,
51
+ "rope_dim": 64,
52
+ "note": "Shared Backbone: 480M active parameters per token"
53
+ }
54
+ },
55
+ {
56
+ "type": "Linear",
57
+ "params": {
58
+ "in_features": 2048,
59
+ "out_features": 32000,
60
+ "note": "Causal Language Modeling (CLM) Head"
61
+ }
62
+ },
63
+ {
64
+ "type": "Linear",
65
+ "params": {
66
+ "in_features": 2048,
67
+ "out_features": 64,
68
+ "note": "Rectified Flow-Matching (RFM) Head for DiT Latents"
69
+ }
70
+ },
71
+ {
72
+ "type": "Conv2d",
73
+ "params": {
74
+ "in_channels": 4,
75
+ "out_channels": 3,
76
+ "kernel_size": 3,
77
+ "stride": 1,
78
+ "note": "VQ-VAE Decoder for 8x8 Latent Reconstruction"
79
+ }
80
+ }
81
+ ],
82
+ "explanation": "OMNI-LITE utilizes a Sparse MoE backbone to minimize active compute (480M params) while maintaining a 2.5B-parameter knowledge capacity, fitting within the 6 GB VRAM limit when quantized via NF4/AWQ. Grouped-Query Attention (GQA) significantly reduces the KV-cache footprint for edge deployment. The Perceiver Resampler allows the model to treat visual inputs as a fixed set of tokens within the Causal Transformer's context window. For generation, the dual-head design supports standard autoregressive text generation, while a separate Flow-Matching head handles Diffusion Transformer (DiT) logic within the same latent space, ensuring hardware-agnostic efficiency through RFM's low step count."
83
+ }