{
  "n_embd": 1792,
  "n_inner": 3584,
  "n_head": 16,
  "n_layer": 36,
  "mlp_fc1_bias": false,
  "mlp_fc2_bias": false,
  "out_proj_bias": false,
  "qkv_proj_bias": false,
  "reorder_and_upcast_attn": false,
  "scale_attn_by_inverse_layer_idx": false,
  "activation_function": "swiglu",
  "resid_pdrop": 0.0,
  "rms_norm": true,
  "residual_in_fp32": true,
  "pad_vocab_size_multiple": 8,
  "use_flash_attn": true,
  "special_initializer": true,
  "rotary_emb_fraction": 1,
  "max_position_embeddings": 0,
  "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
  "alt_mixer_2_layers": [2, 7, 12, 17, 22, 28, 34],
  "mixer": {
    "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
    "expand_proj": 4,
    "l_max": 2048,
    "kernel_sizes": [3],
    "use_bias": true
  },
  "alt_mixer": {
    "_target_": "based.models.mixers.linear_attn.LinearAttention",
    "feature_dim": 16,
    "feature_name": "taylor_exp",
    "l_max": 2048,
    "num_heads": 16,
    "num_key_value_heads": 16,
    "train_view": "linear"
  },
  "alt_mixer_2": {
    "_target_": "based.models.mixers.slide_fa2.SlidingsMHA",
    "causal": true,
    "num_heads": 16,
    "window_size": 128
  }
}