{ "n_head": 6, "n_vocab": 50257, "embed_dropout": 0.1, "lr": 0.0006, "lr_decay": "cosine", "warmup_steps": 3000, "beta1": 0.9, "beta2": 0.95, "epsilon": 1e-8, "opt_name": "adam", "weight_decay": 0, "train_batch_size": 512, "attn_dropout": 0.1, "train_steps": 1000000, "lr_decay_end": 300000, "eval_steps": 30, "predict_steps": 0, "res_dropout": 0.1, "eval_batch_size": 128, "predict_batch_size": 8, "iterations": 2500, "n_embd": 768, "datasets": ["openwebtext2_new_inputs"], "model_path": "gs://neo-models/GPT2_SMALL", "n_ctx": 1024, "n_layer": 12, "scale_by_depth": true, "scale_by_in": false, "attention_types" : [[["global"],12]], "activation_function": "gelu", "mesh_shape": "all:64", "layout": "batch:all", "recompute_grad": false, "gradient_clipping": 1.0 }