{
  "n_head": 16,
  "n_vocab": 50304,
  "embed_dropout": 0,
  "lr": 0.00025,
  "lr_decay": "cosine",
  "warmup_steps": 3000,
  "beta1": 0.9,
  "beta2": 0.95,
  "epsilon": 1e-8,
  "ada_epsilon1": 1e-30,
  "ada_epsilon2": 1e-3,
  "opt_name": "adam",
  "weight_decay": 0.10,
  "train_batch_size": 256,
  "attn_dropout": 0,
  "train_steps": 572300,
  "eval_steps": 0,
  "predict_steps": 1,
  "res_dropout": 0,
  "eval_batch_size": 64,
  "predict_batch_size": 1,
  "iterations": 2500,
  "n_embd": 1536,
  "datasets": [
    ["openwebtext-documents", 25, "documents_random", 1.0]
  ],
  "model_path": "gs://neo-models/GPT3_LARGE",
  "n_ctx": 2048,
  "n_layer": 24,
  "scale_by_depth": true,
  "scale_by_in": false,
  "attention_types": [
    [["global"], 24]
  ],
  "mesh_shape": "x:64,y:4",
  "layout": "batch:x,vocab:y,heads:y",
  "activation_function": "gelu",
  "recompute_grad": true,
  "gradient_clipping": 1.0,
  "tokens_per_mb_per_replica": 2048
}