File size: 1,277 Bytes
a4f4290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
{
  "n_layers": 4,
  "d_model": 512,
  "d_mlp": 2048,
  "d_head": 64,
  "n_heads": 8,
  "lr_hidden": 0.002,
  "lr_vector": 0.001,
  "batch_size_per_device": 32,
  "batches_per_step": 1,
  "seed": 9153,
  "save_checkpoints": true,
  "debug": false,
  "debug_batch": false,
  "normalization": "LN",
  "max_tokens": 22000000000,
  "version": 203,
  "use_bfloat16_matmul": true,
  "n_ctx": 1024,
  "d_vocab": 48262,
  "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits",
  "betas": [
    0.9,
    0.99
  ],
  "weight_decay": 0.05,
  "dataset_name": "c4_code",
  "grad_norm_clip": 1.0,
  "n_devices": 8,
  "act_fn": "gelu",
  "shortformer_pos": false,
  "attn_only": false,
  "ln_eps": 1e-05,
  "lr_schedule": "cosine_warmup",
  "warmup_tokens": 300000000,
  "train_loss_ewma_beta": 0.99,
  "truncate_tokens": 1000000000000,
  "log_interval": 50,
  "initializer_scale_global": 1.0,
  "initializer_scale_hidden": 0.02,
  "initializer_scale_embed": 0.1,
  "initializer_scale_unembed": 0.02,
  "neuron_scale": 1.0,
  "neuron_temp": 1.0,
  "use_acc": false,
  "weight_init_scheme": "gpt2",
  "fixed_init": "4L512W_init",
  "store_init": false,
  "control": 1.0,
  "tokens_per_step": 262144,
  "batch_size": 256,
  "max_steps": 83923,
  "warmup_steps": 1144,
  "n_params": 12582912
}