{ "bias": false, "capacity_factor": 0.12, "d_model": 1024, "dropout": 0.2, "ffn": "swiglu", "hidden_dim": 4096, "mixture_of_depth": true, "mixture_of_expert": false, "model_type": { "mixture_of_depth": true, "name": "mixture of depth" }, "moe_num_experts": 4, "moe_num_experts_per_tok": 2, "multiple_of": 4, "num_heads": 16, "num_kv_heads": 0, "num_layers": 16, "seq_len": 512, "vocab_size": 50257, "weight_tying": true, "window_size": 128 }