Update README.md
Browse files
README.md
CHANGED
```diff
@@ -63,8 +63,9 @@ The following hyperparameters were used during pre-training:
 - learning_rate: 2e-4
 - per_device_train_batch_size: 36
 - gradient_accumulation_steps: 32
-- optimizer:
+- optimizer: AdamW with betas=(0.9, 0.999) and epsilon=1e-06
 - weight_decay: 0.01
+- lr_scheduler_type: linear
 - max_grad_norm: 1.0
 - max_steps: 500,000 (but terminated at *** steps)
 - warmup_steps: 10,000
```