{
"n_layers": 4,
"d_model": 512,
"d_mlp": 2048,
"d_head": 64,
"n_heads": 8,
"lr_hidden": 0.002,
"lr_vector": 0.001,
"batch_size_per_device": 32,
"batches_per_step": 1,
"seed": 9153,
"save_checkpoints": true,
"debug": false,
"debug_batch": false,
"normalization": "LN",
"max_tokens": 22000000000,
"version": 203,
"use_bfloat16_matmul": true,
"n_ctx": 1024,
"d_vocab": 48262,
"tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits",
"betas": [
0.9,
0.99
],
"weight_decay": 0.05,
"dataset_name": "c4_code",
"grad_norm_clip": 1.0,
"n_devices": 8,
"act_fn": "gelu",
"shortformer_pos": false,
"attn_only": false,
"ln_eps": 1e-05,
"lr_schedule": "cosine_warmup",
"warmup_tokens": 300000000,
"train_loss_ewma_beta": 0.99,
"truncate_tokens": 1000000000000,
"log_interval": 50,
"initializer_scale_global": 1.0,
"initializer_scale_hidden": 0.02,
"initializer_scale_embed": 0.1,
"initializer_scale_unembed": 0.02,
"neuron_scale": 1.0,
"neuron_temp": 1.0,
"use_acc": false,
"weight_init_scheme": "gpt2",
"fixed_init": "4L512W_init",
"store_init": false,
"control": 1.0,
"tokens_per_step": 262144,
"batch_size": 256,
"max_steps": 83923,
"warmup_steps": 1144,
"n_params": 12582912
}
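
The derived fields in this config (batch_size, tokens_per_step, max_steps, warmup_steps, n_params) follow from the base hyperparameters. Below is a minimal consistency-check sketch; the file name "config.json" and the 12 * n_layers * d_model^2 parameter-count formula are assumptions inferred from the values, not stated in the file.

import json

# Load the config above (saved locally as "config.json" -- assumed path).
with open("config.json") as f:
    cfg = json.load(f)

# The residual stream splits evenly across attention heads.
assert cfg["d_model"] == cfg["n_heads"] * cfg["d_head"]            # 8 * 64 = 512

# MLP width uses the conventional 4x expansion.
assert cfg["d_mlp"] == 4 * cfg["d_model"]                          # 2048

# Global batch size = per-device batch * device count.
assert cfg["batch_size"] == cfg["batch_size_per_device"] * cfg["n_devices"]  # 32 * 8 = 256

# Tokens per optimizer step = global batch size * context length.
assert cfg["tokens_per_step"] == cfg["batch_size"] * cfg["n_ctx"]  # 256 * 1024 = 262144

# Step counts are the token budgets divided by tokens per step.
assert cfg["max_steps"] == cfg["max_tokens"] // cfg["tokens_per_step"]         # 83923
assert cfg["warmup_steps"] == cfg["warmup_tokens"] // cfg["tokens_per_step"]   # 1144

# Non-embedding parameter count for a standard transformer block:
# 4 * d_model^2 (attention) + 2 * d_model * d_mlp (MLP) = 12 * d_model^2
# per layer, assuming d_mlp = 4 * d_model and biases excluded.
assert cfg["n_params"] == 12 * cfg["n_layers"] * cfg["d_model"] ** 2  # 12,582,912

print("config is internally consistent")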