simarora committed
Commit be96765
1 Parent(s): 5df808e

Update config.json

Files changed (1):
  config.json +45 -45
config.json CHANGED
@@ -1,48 +1,48 @@
-config_check = {
-    "n_embd": 1792,
-    "n_inner": 3584,
-    "n_head": 16,
-    "n_layer": 36,
-
-    "mlp_fc1_bias": False,
-    "mlp_fc2_bias": False,
-    "out_proj_bias": False,
-    "qkv_proj_bias": False,
-    "reorder_and_upcast_attn": False,
-    "scale_attn_by_inverse_layer_idx": False,
-
-    "activation_function": "swiglu",
-    "resid_pdrop": 0.0,
-    "rms_norm": True,
-    "residual_in_fp32": True,
-    "pad_vocab_size_multiple": 8,
-    "use_flash_attn": True,
-    "special_initializer": True,
-    "rotary_emb_fraction": 1,
-    "max_position_embeddings": 0,
-
-    "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
-    "alt_mixer_2_layers": [2, 7, 12, 17, 22, 28, 34],
-    "mixer": {
-        "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
-        "expand_proj": 4,
-        "l_max": 2048,
-        "kernel_sizes": [3],
-        "use_bias": True,
-    },
-    "alt_mixer": {
-        "_target_": "based.models.mixers.linear_attn.LinearAttention",
-        "feature_dim": 16,
-        "feature_name": "taylor_exp",
-        "l_max": 2048,
-        "num_heads": 16,
-        "num_key_value_heads": 16,
-        "train_view": "linear",
-    },
-    "alt_mixer_2": {
-        "_target_": "based.models.mixers.slide_fa2.SlidingsMHA",
-        "causal": True,
-        "num_heads": 16,
-        "window_size": 128,
-    }
-}

+{
+    "n_embd": 1792,
+    "n_inner": 3584,
+    "n_head": 16,
+    "n_layer": 36,
+
+    "mlp_fc1_bias": false,
+    "mlp_fc2_bias": false,
+    "out_proj_bias": false,
+    "qkv_proj_bias": false,
+    "reorder_and_upcast_attn": false,
+    "scale_attn_by_inverse_layer_idx": false,
+
+    "activation_function": "swiglu",
+    "resid_pdrop": 0.0,
+    "rms_norm": true,
+    "residual_in_fp32": true,
+    "pad_vocab_size_multiple": 8,
+    "use_flash_attn": true,
+    "special_initializer": true,
+    "rotary_emb_fraction": 1,
+    "max_position_embeddings": 0,
+
+    "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
+    "alt_mixer_2_layers": [2, 7, 12, 17, 22, 28, 34],
+    "mixer": {
+        "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
+        "expand_proj": 4,
+        "l_max": 2048,
+        "kernel_sizes": [3],
+        "use_bias": true
+    },
+    "alt_mixer": {
+        "_target_": "based.models.mixers.linear_attn.LinearAttention",
+        "feature_dim": 16,
+        "feature_name": "taylor_exp",
+        "l_max": 2048,
+        "num_heads": 16,
+        "num_key_value_heads": 16,
+        "train_view": "linear"
+    },
+    "alt_mixer_2": {
+        "_target_": "based.models.mixers.slide_fa2.SlidingsMHA",
+        "causal": true,
+        "num_heads": 16,
+        "window_size": 128
+    }
+}
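The change above converts config.json from a Python dict literal (a leading "config_check =" assignment, True/False booleans, trailing commas) into strict JSON. A minimal check that the updated file now parses, assuming it sits in the working directory:

import json

# The previous version would fail here: json.load rejects the
# "config_check =" assignment, Python-style True/False literals,
# and the trailing commas before closing braces.
with open("config.json") as f:
    config = json.load(f)

# Spot-check a few values against the diff above.
assert config["n_layer"] == 36
assert config["mixer"]["use_bias"] is True
print("config.json parses as valid JSON")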
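On the config itself: the key names suggest a hybrid 36-layer stack in which the seven layer indices in alt_mixer_layers use the alt_mixer spec (LinearAttention), the seven in alt_mixer_2_layers use the alt_mixer_2 spec (sliding-window attention), and the remaining layers use the default mixer spec (BaseConvWithSiLU4). A hypothetical sketch of that lookup; the helper name and dispatch rule are assumptions for illustration, not the based repo's actual code:

# Hypothetical helper: map a layer index to its mixer spec, following
# the alt_mixer_layers / alt_mixer_2_layers lists in the config. This
# dispatch rule is an assumption; the based repo may resolve it differently.
def mixer_spec_for_layer(config: dict, layer_idx: int) -> dict:
    if layer_idx in config["alt_mixer_layers"]:
        return config["alt_mixer"]      # LinearAttention layers
    if layer_idx in config["alt_mixer_2_layers"]:
        return config["alt_mixer_2"]    # sliding-window attention layers
    return config["mixer"]              # default BaseConv layers

Under that rule, mixer_spec_for_layer(config, 6) would return the LinearAttention spec, while layer 0 falls through to the default BaseConv mixer.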