{ "n_layers": 10, "d_model": 1280, "d_mlp": 5120, "d_head": 64, "n_heads": 20, "lr_hidden": 0.002, "lr_vector": 0.001, "batch_size_per_device": 8, "batches_per_step": 3, "seed": 101010, "save_checkpoints": true, "debug": false, "debug_batch": false, "normalization": "LN", "max_tokens": 30000000000, "version": 418, "use_bfloat16_matmul": true, "n_ctx": 1024, "d_vocab": 48262, "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "betas": [ 0.9, 0.99 ], "weight_decay": 0.05, "dataset_name": "c4_code", "grad_norm_clip": 1.0, "n_devices": 8, "act_fn": "solu_ln", "shortformer_pos": false, "attn_only": false, "ln_eps": 1e-05, "lr_schedule": "cosine_warmup", "warmup_tokens": 300000000, "train_loss_ewma_beta": 0.99, "truncate_tokens": 1000000000000, "log_interval": 50, "initializer_scale_global": 1.0, "initializer_scale_hidden": 0.02, "initializer_scale_embed": 0.1, "initializer_scale_unembed": 0.02, "neuron_scale": 1.0, "neuron_temp": 1.0, "use_acc": false, "weight_init_scheme": "gpt2", "fixed_init": "", "store_init": false, "control": 1.0, "tokens_per_step": 196608, "batch_size": 64, "max_steps": 152587, "warmup_steps": 1525, "n_params": 196608000 }