```toml
model_name = "basic_reference_200m"
n_layers = 2
d_model = 512
d_mlp = 2048
d_head = 64
n_heads = 8
attn_only = false
layer_norm_eps = 1e-05
init_range = 0.02
n_ctx = 1024
d_vocab = 48262

dataset_name = "eoinf/unprocessed-c4-code-test"
tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"
seed = 10
device = "cuda"
use_bfloat16_matmul = false

batch_size_per_device = 32
n_devices = 1
batches_per_step = 1
max_tokens = 200000000

lr_hidden = 0.002
lr_vector = 0.001
lr_schedule = "constant_with_warmup"
warmup_tokens = 30000000
weight_decay = 0.05
grad_norm_clip = 1.0

train_loss_moving_average_beta = 0.99
log_interval = 25
save_checkpoints = true
checkpoint_interval = 1000
checkpoint_interval_ratio = 1.10
save_log_checkpoints = true
```
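
These key/value pairs are plain TOML, so the file can be read with Python's standard-library parser. A minimal sketch, assuming Python 3.11+ for `tomllib`; the filename and the printed keys are illustrative, not taken from any particular training repo:

```python
import tomllib  # stdlib TOML parser, Python 3.11+

# Hypothetical filename; substitute the actual path to this config.
with open("basic_reference_200m.toml", "rb") as f:  # tomllib requires binary mode
    cfg = tomllib.load(f)

print(cfg["model_name"])                              # basic_reference_200m
print(cfg["d_model"], cfg["n_heads"], cfg["d_head"])  # 512 8 64
```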
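The batching and token-budget fields pin down the length of the run, and the architecture fields pin down a rough parameter count. A back-of-the-envelope sketch of that arithmetic (plain Python; the parameter formula assumes a standard GPT-style transformer with untied embed/unembed and ignores biases and LayerNorm, details the config itself does not spell out):

```python
# Values copied from the config above.
n_ctx, batch, n_devices, batches_per_step = 1024, 32, 1, 1
max_tokens, warmup_tokens = 200_000_000, 30_000_000
d_model, d_mlp, n_layers, d_vocab = 512, 2048, 2, 48262

tokens_per_step = batch * n_devices * batches_per_step * n_ctx  # 32_768 tokens
total_steps = max_tokens // tokens_per_step                     # ~6_103 optimizer steps
warmup_steps = warmup_tokens // tokens_per_step                 # ~915 steps of LR warmup

# Rough parameter count. Note n_heads * d_head == d_model here, so the
# attention weights (Q, K, V, O) contribute 4 * d_model^2 per layer,
# and the MLP (in/out projections) contributes 2 * d_model * d_mlp.
per_layer = 4 * d_model**2 + 2 * d_model * d_mlp  # 3_145_728
embeds = 2 * d_vocab * d_model                    # 49_420_288 (untied embed + unembed)
total_params = n_layers * per_layer + embeds      # ~55.7M

print(tokens_per_step, total_steps, warmup_steps, total_params)
```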
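Under these assumptions the parameter count comes out near 56M, so the "200m" in `model_name` appears to refer to the 200M-token budget in `max_tokens` rather than to model size.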