{
  "n_head": 40,
  "n_vocab": 50257,
  "embed_dropout": 0,
  "lr": 0.0001,
  "lr_decay": "cosine",
  "warmup_steps": 3000,
  "beta1": 0.9,
  "beta2": 0.95,
  "epsilon": 1e-8,
  "ada_epsilon1": 1e-30,
  "ada_epsilon2": 1e-3,
  "opt_name": "adam",
  "weight_decay": 0.10,
  "train_batch_size": 1024,
  "attn_dropout": 0,
  "train_steps": 143075,
  "eval_steps": 0,
  "predict_steps": 1,
  "res_dropout": 0,
  "eval_batch_size": 128,
  "predict_batch_size": 1,
  "iterations": 500,
  "n_embd": 5120,
  "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
  "model_path": "gs://neo-models/GPT3_13B",
  "n_ctx": 2048,
  "n_layer": 40,
  "scale_by_depth": true,
  "scale_by_in": false,
  "attention_types": [[["global", "local"],20]],
  "mesh_shape": "x:16,y:16",
  "layout": "batch:x,embd:y,memory_length:y",
  "activation_function": "gelu",
  "recompute_grad": true,
  "gradient_clipping": 1.0,
  "tokens_per_mb_per_replica": 2048,
  "precision": "bfloat16"
}