{
    "n_embd": 1792,
    "n_inner": 3584,
    "n_head": 16,
    "n_layer": 36,

    "mlp_fc1_bias": false,
    "mlp_fc2_bias": false,
    "out_proj_bias": false,
    "qkv_proj_bias": false,
    "reorder_and_upcast_attn": false,
    "scale_attn_by_inverse_layer_idx": false,

    "activation_function": "swiglu",
    "resid_pdrop": 0.0,
    "rms_norm": true,
    "residual_in_fp32": true,
    "pad_vocab_size_multiple": 8,
    "use_flash_attn": true,
    "special_initializer": true,
    "rotary_emb_fraction": 1,
    "max_position_embeddings": 0,

    "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
    "alt_mixer_2_layers": [2, 7, 12, 17, 22, 28, 34],
    "mixer": {
        "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
        "expand_proj": 4,
        "l_max": 2048,
        "kernel_sizes": [3],
        "use_bias": true
    },
    "alt_mixer": {
        "_target_": "based.models.mixers.linear_attn.LinearAttention",
        "feature_dim": 16,
        "feature_name": "taylor_exp",
        "l_max": 2048,
        "num_heads": 16,
        "num_key_value_heads": 16,
        "train_view": "linear"
    },
    "alt_mixer_2": {
        "_target_": "based.models.mixers.slide_fa2.SlidingsMHA",
        "causal": true,
        "num_heads": 16,
        "window_size": 128
    }
}