{ "always_save_checkpoint": true, "architectures": [ "CustomGPTModel" ], "backend": "nccl", "batch_size": 12, "beta1": 0.9, "beta2": 0.95, "bias": false, "block_size": 1024, "compile": true, "dataset": "openwebtext", "decay_lr": true, "device": "cuda", "dropout": 0.0, "dtype": "bfloat16", "eval_interval": 1, "eval_iters": 1, "eval_only": false, "grad_clip": 1.0, "gradient_accumulation_steps": 40, "init_from": "scratch", "learning_rate": 0.0006, "log_interval": 1, "lr_decay_iters": 5, "max_iters": 5, "min_lr": 6e-05, "model_type": "custom_gpt", "n_embd": 768, "n_head": 12, "n_layer": 12, "out_dir": "out", "torch_dtype": "float32", "transformers_version": "4.42.3", "vocab_size": 50304, "wandb_log": false, "wandb_project": "owt", "wandb_run_name": "gpt2", "warmup_iters": 1, "weight_decay": 0.1 }