micro_batch_size: 4
Browse files- README.md +25 -0
- scripts/pretrain-core-model.yaml +2 -1
README.md
CHANGED
|
@@ -65,6 +65,31 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
|
|
| 65 |
```
|
| 66 |
|
| 67 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# ...
|
| 69 |
```
|
| 70 |
|
|
|
|
| 65 |
```
|
| 66 |
|
| 67 |
```
|
| 68 |
+
Seed set to 23
|
| 69 |
+
Time to instantiate model: 0.23 seconds.
|
| 70 |
+
Total parameters: 138,084,864
|
| 71 |
+
Verifying settings ...
|
| 72 |
+
Measured TFLOPs: 6972.54
|
| 73 |
+
Epoch 1 | iter 256 step 1 | loss train: 10.530, val: n/a | iter time: 3230.47 ms (step) remaining time: 3 days, 5:34:13
|
| 74 |
+
Epoch 1 | iter 512 step 2 | loss train: 10.520, val: n/a | iter time: 589.19 ms (step) remaining time: 3 days, 0:40:40
|
| 75 |
+
Epoch 1 | iter 768 step 3 | loss train: 10.485, val: n/a | iter time: 591.81 ms (step) remaining time: 2 days, 23:01:54
|
| 76 |
+
Epoch 1 | iter 1024 step 4 | loss train: 10.447, val: n/a | iter time: 589.35 ms (step) remaining time: 2 days, 22:11:32
|
| 77 |
+
Epoch 1 | iter 1280 step 5 | loss train: 10.350, val: n/a | iter time: 589.38 ms (step) remaining time: 2 days, 21:40:13
|
| 78 |
+
Epoch 1 | iter 1536 step 6 | loss train: 10.241, val: n/a | iter time: 593.75 ms (step) remaining time: 2 days, 21:18:19
|
| 79 |
+
Epoch 1 | iter 1792 step 7 | loss train: 10.134, val: n/a | iter time: 592.92 ms (step) remaining time: 2 days, 21:01:58
|
| 80 |
+
Epoch 1 | iter 2048 step 8 | loss train: 10.049, val: n/a | iter time: 590.74 ms (step) remaining time: 2 days, 20:49:12
|
| 81 |
+
Epoch 1 | iter 2304 step 9 | loss train: 9.869, val: n/a | iter time: 594.27 ms (step) remaining time: 2 days, 20:39:10
|
| 82 |
+
Epoch 1 | iter 2560 step 10 | loss train: 9.771, val: n/a | iter time: 590.04 ms (step) remaining time: 2 days, 20:30:14
|
| 83 |
+
Epoch 1 | iter 2816 step 11 | loss train: 9.643, val: n/a | iter time: 588.32 ms (step) remaining time: 2 days, 20:22:22
|
| 84 |
+
Epoch 1 | iter 3072 step 12 | loss train: 9.557, val: n/a | iter time: 588.95 ms (step) remaining time: 2 days, 20:15:26
|
| 85 |
+
Epoch 1 | iter 3328 step 13 | loss train: 9.487, val: n/a | iter time: 589.32 ms (step) remaining time: 2 days, 20:09:05
|
| 86 |
+
Epoch 1 | iter 3584 step 14 | loss train: 9.413, val: n/a | iter time: 588.95 ms (step) remaining time: 2 days, 20:03:24
|
| 87 |
+
Epoch 1 | iter 3840 step 15 | loss train: 9.322, val: n/a | iter time: 591.62 ms (step) remaining time: 2 days, 19:58:18
|
| 88 |
+
Epoch 1 | iter 4096 step 16 | loss train: 9.241, val: n/a | iter time: 593.65 ms (step) remaining time: 2 days, 19:53:30
|
| 89 |
+
Epoch 1 | iter 4352 step 17 | loss train: 9.163, val: n/a | iter time: 593.89 ms (step) remaining time: 2 days, 19:49:00
|
| 90 |
+
Epoch 1 | iter 4608 step 18 | loss train: 9.122, val: n/a | iter time: 590.63 ms (step) remaining time: 2 days, 19:44:42
|
| 91 |
+
Epoch 1 | iter 4864 step 19 | loss train: 9.077, val: n/a | iter time: 590.87 ms (step) remaining time: 2 days, 19:40:47
|
| 92 |
+
Epoch 1 | iter 5120 step 20 | loss train: 9.018, val: n/a | iter time: 588.44 ms (step) remaining time: 2 days, 19:36:59
|
| 93 |
# ...
|
| 94 |
```
|
| 95 |
|
scripts/pretrain-core-model.yaml
CHANGED
|
@@ -67,7 +67,8 @@ train:
|
|
| 67 |
# global_batch_size: 256
|
| 68 |
|
| 69 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 70 |
-
micro_batch_size:
|
|
|
|
| 71 |
# micro_batch_size: 1
|
| 72 |
|
| 73 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
|
|
|
| 67 |
# global_batch_size: 256
|
| 68 |
|
| 69 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 70 |
+
micro_batch_size: 4
|
| 71 |
+
# micro_batch_size: 2
|
| 72 |
# micro_batch_size: 1
|
| 73 |
|
| 74 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|