File size: 834 Bytes
d90b3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
{
  # Model/parallelism configuration written as a single flow-style mapping.
  # NOTE(review): the `#` comments and the trailing comma after the last entry
  # are not valid strict JSON; this file must be read by a parser that accepts
  # them (e.g. a YAML loader) -- confirm against the consumer before renaming
  # or re-serializing it.

  # parallelism settings: both sizes are 1, i.e. no pipeline/model sharding is
  # requested here (presumably overridden per cluster -- TODO confirm).
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,
  # A divisor of 1 is satisfied by every vocab size, so presumably no vocab
  # padding is applied -- verify against the loader.
  "make_vocab_size_divisible_by": 1,

  # model settings
  "num_layers": 48,
  "hidden_size": 8192,
  # 8192 / 64 = 128 hidden units per attention head.
  "num_attention_heads": 64,
  # 64 / 8 = 8 attention heads per KV head (presumably grouped-query
  # attention -- confirm how the consumer interprets num_kv_heads).
  "num_kv_heads": 8,
  # Codellama was uptrained on 16k token sequence lengths
  # with rotary_emb_base adjusted to 1_000_000.
  # seq_length and max_position_embeddings are kept equal (both 16384).
  "seq_length": 16384,
  "max_position_embeddings": 16384,
  "pos_emb": "rotary",
  # presumably the fraction of each head's dimensions that receive rotary
  # embeddings (1 = all of them) -- TODO confirm.
  "rotary_pct": 1,
  "rotary_emb_base": 1000000,
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,

  # The count here (48) matches num_layers above; presumably this selects
  # "flash" attention for every layer. Keep the two values in sync if
  # num_layers changes.
  "attention_config": [[["flash"], 48]],

  # fusion / bias / activation settings
  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,
  "activation": "swiglu",
  # presumably rounds the MLP intermediate width up to a multiple of 256 --
  # verify against the consumer.
  "mlp_multiple_of": 256,
}