{ "n_head": 12, "n_vocab": 50304, "embed_dropout": 0, "lr": 0.0006, "lr_decay": "cosine", "warmup_steps": 3000, "beta1": 0.9, "beta2": 0.95, "epsilon": 1e-8, "opt_name": "adam", "weight_decay": 0.10, "train_batch_size": 256, "attn_dropout": 0, "train_steps": 572300, "eval_steps": 0, "predict_steps": 1, "res_dropout": 0, "eval_batch_size": 64, "predict_batch_size": 1, "iterations": 1000, "n_embd": 768, "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]], "model_path": "gs://neo-models/GPT3_PAR_SMALL", "n_ctx": 2048, "n_layer": 19, "scale_by_depth": true, "scale_by_in": false, "attention_types": [[["global", "none", "none"],5], [["none"], 4]], "mesh_shape": "x:64,y:4", "layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y", "activation_function": "gelu", "recompute_grad": false, "gradient_clipping": 1.0 }