{ "n_layers": 1, "d_model": 64, "d_mlp": 256, "d_head": 32, "n_heads": 2, "lr_hidden": 0.002, "lr_vector": 0.001, "batch_size_per_device": 32, "batches_per_step": 1, "seed": 1297, "save_checkpoints": true, "debug": false, "debug_batch": false, "normalization": "LN", "max_tokens": 10000000000, "version": 425, "use_bfloat16_matmul": true, "n_ctx": 1024, "d_vocab": 48262, "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "betas": [ 0.9, 0.99 ], "weight_decay": 0.05, "dataset_name": "c4", "grad_norm_clip": 1.0, "n_devices": 8, "act_fn": "solu_ln", "shortformer_pos": false, "attn_only": false, "ln_eps": 1e-05, "lr_schedule": "cosine_warmup", "warmup_tokens": 300000000, "train_loss_ewma_beta": 0.99, "truncate_tokens": 1000000000000, "log_interval": 50, "initializer_scale_global": 1.0, "initializer_scale_hidden": 0.02, "initializer_scale_embed": 0.1, "initializer_scale_unembed": 0.02, "neuron_scale": 1.0, "neuron_temp": 1.0, "use_acc": false, "weight_init_scheme": "gpt2", "fixed_init": "", "store_init": false, "control": 1.0, "tokens_per_step": 262144, "batch_size": 256, "max_steps": 38146, "warmup_steps": 1144, "n_params": 49152 }