{
  "n_head": 40,
  "n_vocab": 50257,
  "embed_dropout": 0,
  "lr": 0.0001,
  "lr_decay": "cosine",
  "warmup_steps": 3000,
  "beta1": 0.9,
  "beta2": 0.95,
  "epsilon": 1e-8,
  "opt_name": "adam",
  "weight_decay": 0.1,
  "train_batch_size": 1024,
  "attn_dropout": 0,
  "train_steps": 286150,
  "eval_steps": 10,
  "predict_steps": 1,
  "res_dropout": 0,
  "eval_batch_size": 512,
  "predict_batch_size": 1,
  "iterations": 500,
  "n_embd": 5120,
  "datasets": [
    ["pile", 25, "documents_random", 1.0]
  ],
  "model_path": "gs://neo-models/GPT3_13B_Pile",
  "n_ctx": 2048,
  "n_layer": 40,
  "scale_by_depth": true,
  "scale_by_in": false,
  "attention_types": [
    [["global"], 40]
  ],
  "mesh_shape": "x:16,y:16",
  "layout": "batch:x,memory_length:y,embd:y",
  "activation_function": "gelu",
  "recompute_grad": true,
  "gradient_clipping": 1.0,
  "tokens_per_mb_per_replica": 2048,
  "precision": "bfloat16"
}