{
  "name": "OMNI-LITE (Unified Sparse-Multimodal Transformer)",
  "layers": [
    {
      "type": "Conv2d",
      "params": {
        "in_channels": 3,
        "out_channels": 1024,
        "kernel_size": 14,
        "stride": 14,
        "note": "Vision Patch Embedding for ViT encoder"
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "embed_dim": 1024,
        "num_heads": 16,
        "ff_dim": 4096,
        "depth": 12,
        "note": "Lightweight Vision Transformer (ViT) Backbone"
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "type": "PerceiverResampler",
        "num_latents": 64,
        "embed_dim": 2048,
        "note": "Maps visual features to text latent space"
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 32000,
        "out_features": 2048,
        "note": "Text Token Embedding layer"
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "type": "GQA_MoE_Layer",
        "repeat": 24,
        "num_experts": 16,
        "top_k": 2,
        "hidden_dim": 2048,
        "num_heads": 32,
        "num_kv_heads": 8,
        "rope_dim": 64,
        "note": "Shared Backbone: 480M active parameters per token"
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 2048,
        "out_features": 32000,
        "note": "Causal Language Modeling (CLM) Head"
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 2048,
        "out_features": 64,
        "note": "Rectified Flow-Matching (RFM) Head for DiT Latents"
      }
    },
    {
      "type": "Conv2d",
      "params": {
        "in_channels": 4,
        "out_channels": 3,
        "kernel_size": 3,
        "stride": 1,
        "note": "VQ-VAE Decoder for 8x8 Latent Reconstruction"
      }
    }
  ],
| "explanation": "OMNI-LITE utilizes a Sparse MoE backbone to minimize active compute (480M params) while maintaining 2.5B knowledge capacity, fitting within the 6GB VRAM limit when quantized via NF4/AWQ. Grouped-Query Attention (GQA) significantly reduces the KV-cache footprint for edge deployment. The Perceiver Resampler allows the model to treat visual inputs as a fixed set of tokens within the Causal Transformer's context window. For generation, the dual-head design supports standard autoregressive text while a separate Flow-Matching head handles Diffusion Transformer (DiT) logic within the same latent space, ensuring hardware-agnostic efficiency through RFM's low-step count." |
| } |
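
The PerceiverResampler entry is the piece that turns a variable number of ViT patch features into exactly 64 tokens of width 2048. Below is a minimal PyTorch sketch of how that stage could look; the head count, the single cross-attention block, and all class and variable names are assumptions, since the spec only fixes `num_latents` and `embed_dim`.

```python
# Hypothetical sketch of the Perceiver Resampler stage: 64 learned latents
# cross-attend to 1024-dim ViT patch features and emit 2048-dim tokens for the
# text backbone. Depth and head count are assumptions, not taken from the spec.
import torch
import torch.nn as nn

class PerceiverResampler(nn.Module):
    def __init__(self, vis_dim=1024, embed_dim=2048, num_latents=64, num_heads=8):
        super().__init__()
        # Fixed bank of learned queries: the visual input is compressed to
        # exactly `num_latents` tokens regardless of image resolution.
        self.latents = nn.Parameter(torch.randn(num_latents, embed_dim) * 0.02)
        self.vis_proj = nn.Linear(vis_dim, embed_dim)   # lift ViT features to text width
        self.cross_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm_q = nn.LayerNorm(embed_dim)
        self.norm_kv = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim), nn.GELU(),
            nn.Linear(4 * embed_dim, embed_dim),
        )

    def forward(self, vis_tokens):                      # (B, N_patches, 1024)
        kv = self.norm_kv(self.vis_proj(vis_tokens))    # (B, N_patches, 2048)
        q = self.norm_q(self.latents).expand(vis_tokens.size(0), -1, -1)
        out, _ = self.cross_attn(q, kv, kv)             # latents attend to patches
        return out + self.ffn(out)                      # (B, 64, 2048)

# Usage: a 224x224 image with 14x14 patches yields 256 patch tokens, which the
# resampler compresses 4x down to 64 latent tokens (224 is an assumed input size).
# resampler = PerceiverResampler()
# print(resampler(torch.randn(1, 256, 1024)).shape)    # torch.Size([1, 64, 2048])
```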
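
The repeated GQA_MoE_Layer is the shared backbone: 32 query heads share 8 key/value heads (so the KV cache is a quarter of the full multi-head size), and a router activates 2 of 16 expert FFNs per token, so only a small slice of the total parameters runs for any given token. The sketch below is a hedged reading of that block; the expert hidden size, the omission of RoPE, and the norm placement are assumptions the spec does not fix.

```python
# Hedged sketch of one GQA_MoE_Layer (the spec repeats it 24 times). RoPE is
# omitted for brevity and the expert hidden size (5632) is an assumption; the
# spec only fixes hidden_dim, head counts, num_experts, and top_k.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GQABlock(nn.Module):
    def __init__(self, dim=2048, n_heads=32, n_kv_heads=8, n_experts=16,
                 top_k=2, expert_dim=5632):
        super().__init__()
        self.n_heads, self.n_kv_heads, self.top_k = n_heads, n_kv_heads, top_k
        self.head_dim = dim // n_heads                   # 64, matches rope_dim
        self.q_proj = nn.Linear(dim, n_heads * self.head_dim, bias=False)
        # K/V are projected to only 8 heads: the KV cache is 4x smaller than MHA.
        self.k_proj = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(dim, dim, bias=False)
        self.norm1, self.norm2 = nn.LayerNorm(dim), nn.LayerNorm(dim)
        # Mixture-of-Experts FFN: a router scores 16 experts, only the top-2 run.
        self.router = nn.Linear(dim, n_experts, bias=False)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(dim, expert_dim, bias=False), nn.SiLU(),
                          nn.Linear(expert_dim, dim, bias=False))
            for _ in range(n_experts)])

    def attention(self, x):
        B, T, D = x.shape
        q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)
        # Each KV head is shared by 32 / 8 = 4 query heads.
        k = k.repeat_interleave(self.n_heads // self.n_kv_heads, dim=1)
        v = v.repeat_interleave(self.n_heads // self.n_kv_heads, dim=1)
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.o_proj(out.transpose(1, 2).reshape(B, T, D))

    def moe(self, x):
        scores = self.router(x)                          # (B, T, 16)
        weights, idx = scores.topk(self.top_k, dim=-1)   # keep 2 experts per token
        weights = weights.softmax(dim=-1)
        out = torch.zeros_like(x)
        for slot in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[..., slot] == e               # tokens routed to expert e
                if mask.any():
                    out[mask] += weights[..., slot][mask].unsqueeze(-1) * expert(x[mask])
        return out

    def forward(self, x):
        x = x + self.attention(self.norm1(x))
        return x + self.moe(self.norm2(x))

# blk = GQABlock(); print(blk(torch.randn(2, 16, 2048)).shape)  # (2, 16, 2048)
```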
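
The second output head realizes the flow-matching objective: the backbone is assumed to be conditioned on a noisy image latent and a timestep, and the 2048-to-64 Linear reads out the velocity that moves that latent toward the data. The sketch below uses a toy stand-in for the backbone purely to make the objective and the few-step Euler sampler concrete; every name in it is illustrative rather than part of the spec.

```python
# Hedged sketch of how the Rectified Flow-Matching (RFM) head would be trained
# and sampled. `toy_backbone` is a stand-in for the shared GQA/MoE stack: in the
# real model the noisy latent and timestep would be injected into its token
# sequence, which is why the head itself can be a plain 2048 -> 64 Linear.
import torch
import torch.nn as nn

rfm_head = nn.Linear(2048, 64)
toy_backbone = nn.Linear(64 + 1, 2048)          # stand-in: consumes (x_t, t) only

def rfm_loss(x1):
    """x1: (B, 64) clean image latents from the VAE encoder."""
    x0 = torch.randn_like(x1)                   # noise endpoint of the flow
    t = torch.rand(x1.size(0), 1)               # uniform timestep per sample
    xt = (1.0 - t) * x0 + t * x1                # straight-line interpolation
    h = toy_backbone(torch.cat([xt, t], dim=-1))  # backbone sees x_t and t
    v_pred = rfm_head(h)                        # head reads out the velocity
    return ((v_pred - (x1 - x0)) ** 2).mean()   # regress constant velocity x1 - x0

@torch.no_grad()
def sample(batch=1, steps=4):
    """Few-step Euler integration; near-straight flows tolerate tiny step counts."""
    x = torch.randn(batch, 64)
    for i in range(steps):
        t = torch.full((batch, 1), i / steps)
        v = rfm_head(toy_backbone(torch.cat([x, t], dim=-1)))
        x = x + v / steps
    return x

print(rfm_loss(torch.randn(8, 64)).item(), sample().shape)
```

The few-step Euler loop is what the explanation's "low-step count" efficiency refers to: rectified flows are trained toward straight transport paths, so far fewer integration steps are needed than with conventional diffusion samplers.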
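
The 6GB VRAM claim is easy to sanity-check with back-of-envelope arithmetic. The roughly 4 bits per weight under NF4, the fp16 KV cache, and the 4K-token context are assumptions; only the layer count, KV head count, and head dimension come from the layer list above.

```python
# Back-of-envelope check of the 6GB VRAM claim (assumed precisions and context).
total_params   = 2.5e9
weight_bytes   = total_params * 4 / 8                         # ~4 bits/param under NF4
layers, kv_heads, head_dim, ctx = 24, 8, 64, 4096
kv_cache_bytes = layers * 2 * kv_heads * head_dim * ctx * 2   # K and V, fp16
print(f"weights  ~{weight_bytes / 1e9:.2f} GB")               # ~1.25 GB
print(f"KV cache ~{kv_cache_bytes / 1e9:.2f} GB")             # ~0.20 GB
# With 32 KV heads (full MHA) the cache term would be 4x larger; GQA keeps it small.
```

Activations, dequantization scales, and the vision tower add overhead on top of the roughly 1.5 GB above, but the margin to the 6GB budget remains wide.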