gpt-neo/configs/gpt3_13B_256.json
{
    "n_head": 40,
    "n_vocab": 50257,
    "embed_dropout": 0,
    "lr": 0.0001,
    "lr_decay": "cosine",
    "warmup_steps": 3000,
    "beta1": 0.9,
    "beta2": 0.95,
    "epsilon": 1e-8,
    "ada_epsilon1": 1e-30,
    "ada_epsilon2": 1e-3,
    "opt_name": "adam",
    "weight_decay": 0.10,
    "train_batch_size": 1024,
    "attn_dropout": 0,
    "train_steps": 143075,
    "eval_steps": 0,
    "predict_steps": 1,
    "res_dropout": 0,
    "eval_batch_size": 128,
    "predict_batch_size": 1,
    "iterations": 500,
    "n_embd": 5120,
    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
    "model_path": "gs://neo-models/GPT3_13B",
    "n_ctx": 2048,
    "n_layer": 40,
    "scale_by_depth": true,
    "scale_by_in": false,
    "attention_types": [[["global", "local"], 20]],
    "mesh_shape": "x:16,y:16",
    "layout": "batch:x,embd:y,memory_length:y",
    "activation_function": "gelu",
    "recompute_grad": true,
    "gradient_clipping": 1.0,
    "tokens_per_mb_per_replica": 2048,
    "precision": "bfloat16"
}
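For reference, the sketch below (not part of the GPT-Neo codebase) loads this config with Python's json module and estimates the parameter count implied by n_layer, n_embd, n_vocab, and n_ctx, ignoring biases and layer norms; it lands close to the "13B" in the filename. The relative path is an assumption based on the repository's configs/ directory, and the 12*d^2 per-block figure is the standard rough count for a transformer block (4*d^2 for attention projections plus 8*d^2 for a 4x MLP), not something stated in the config itself. The mesh_shape "x:16,y:16" describes a 256-way mesh, presumably the "256" in the filename.

```python
# Hypothetical sketch: sanity-check the model size implied by this config.
import json

# Assumed path, following the repo's configs/ layout.
with open("configs/gpt3_13B_256.json") as f:
    cfg = json.load(f)

n_layer, n_embd = cfg["n_layer"], cfg["n_embd"]
n_vocab, n_ctx = cfg["n_vocab"], cfg["n_ctx"]

# Rough per-block count: 4*d^2 attention projections (q, k, v, out)
# plus 8*d^2 for the MLP with 4x expansion; biases/layernorms ignored.
block_params = 12 * n_embd ** 2
# Token embedding plus learned position embedding.
embedding_params = (n_vocab + n_ctx) * n_embd

total = n_layer * block_params + embedding_params
print(f"~{total / 1e9:.1f}B parameters")  # prints ~12.9B for this config
```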