step 100
Browse files- log/debug_0.log +64 -0
- model.safetensors +1 -1
- my_checkpoint/model.safetensors +1 -1
- my_checkpoint/optimizer.bin +1 -1
- my_checkpoint/random_states_0.pkl +1 -1
- my_checkpoint/scaler.pt +1 -1
- runs/Jul25_08-52-29_lab/events.out.tfevents.1721897549.lab.173640.0 +2 -2
- torch_checkpoint/latest_checkpoint.pth +1 -1
log/debug_0.log
CHANGED
@@ -249,3 +249,67 @@ Mixed precision type: fp16
|
|
249 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
250 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
251 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
250 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
251 |
07/25/2024 08:53:18 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
252 |
+
07/25/2024 08:54:21 - WARNING - huggingface_hub.repository - To https://huggingface.co/shng2025/gptesla-small
|
253 |
+
4d63b0c..a942953 spring-music-133 -> spring-music-133
|
254 |
+
|
255 |
+
07/25/2024 08:54:21 - INFO - __main__ - Step 51: {'lr': 3.571428571428571e-05, 'samples': 2448, 'steps': 50, 'loss/train': 8.343396186828613}
|
256 |
+
07/25/2024 08:54:21 - INFO - __main__ - Step 52: {'lr': 3.642857142857143e-05, 'samples': 2496, 'steps': 51, 'loss/train': 8.461634635925293}
|
257 |
+
07/25/2024 08:54:22 - INFO - __main__ - Step 53: {'lr': 3.7142857142857143e-05, 'samples': 2544, 'steps': 52, 'loss/train': 8.43316650390625}
|
258 |
+
07/25/2024 08:54:22 - INFO - __main__ - Step 54: {'lr': 3.7857142857142864e-05, 'samples': 2592, 'steps': 53, 'loss/train': 8.464268684387207}
|
259 |
+
07/25/2024 08:54:22 - INFO - __main__ - Step 55: {'lr': 3.857142857142857e-05, 'samples': 2640, 'steps': 54, 'loss/train': 8.371450424194336}
|
260 |
+
07/25/2024 08:54:23 - INFO - __main__ - Step 56: {'lr': 3.928571428571428e-05, 'samples': 2688, 'steps': 55, 'loss/train': 8.155680656433105}
|
261 |
+
07/25/2024 08:54:23 - INFO - __main__ - Step 57: {'lr': 4e-05, 'samples': 2736, 'steps': 56, 'loss/train': 8.359997749328613}
|
262 |
+
07/25/2024 08:54:23 - INFO - __main__ - Step 58: {'lr': 4.0714285714285717e-05, 'samples': 2784, 'steps': 57, 'loss/train': 7.883953094482422}
|
263 |
+
07/25/2024 08:54:23 - INFO - __main__ - Step 59: {'lr': 4.142857142857143e-05, 'samples': 2832, 'steps': 58, 'loss/train': 8.425983428955078}
|
264 |
+
07/25/2024 08:54:24 - INFO - __main__ - Step 60: {'lr': 4.214285714285714e-05, 'samples': 2880, 'steps': 59, 'loss/train': 8.220914840698242}
|
265 |
+
07/25/2024 08:54:24 - INFO - __main__ - Step 61: {'lr': 4.2857142857142856e-05, 'samples': 2928, 'steps': 60, 'loss/train': 8.216103553771973}
|
266 |
+
07/25/2024 08:54:24 - INFO - __main__ - Step 62: {'lr': 4.3571428571428576e-05, 'samples': 2976, 'steps': 61, 'loss/train': 8.129951477050781}
|
267 |
+
07/25/2024 08:54:25 - INFO - __main__ - Step 63: {'lr': 4.428571428571428e-05, 'samples': 3024, 'steps': 62, 'loss/train': 7.993805885314941}
|
268 |
+
07/25/2024 08:54:25 - INFO - __main__ - Step 64: {'lr': 4.4999999999999996e-05, 'samples': 3072, 'steps': 63, 'loss/train': 6.955376625061035}
|
269 |
+
07/25/2024 08:54:25 - INFO - __main__ - Step 65: {'lr': 4.5714285714285716e-05, 'samples': 3120, 'steps': 64, 'loss/train': 7.9038238525390625}
|
270 |
+
07/25/2024 08:54:25 - INFO - __main__ - Step 66: {'lr': 4.642857142857143e-05, 'samples': 3168, 'steps': 65, 'loss/train': 7.659880638122559}
|
271 |
+
07/25/2024 08:54:26 - INFO - __main__ - Step 67: {'lr': 4.714285714285715e-05, 'samples': 3216, 'steps': 66, 'loss/train': 7.462357521057129}
|
272 |
+
07/25/2024 08:54:26 - INFO - __main__ - Step 68: {'lr': 4.7857142857142856e-05, 'samples': 3264, 'steps': 67, 'loss/train': 7.9803571701049805}
|
273 |
+
07/25/2024 08:54:26 - INFO - __main__ - Step 69: {'lr': 4.857142857142857e-05, 'samples': 3312, 'steps': 68, 'loss/train': 7.895639896392822}
|
274 |
+
07/25/2024 08:54:27 - INFO - __main__ - Step 70: {'lr': 4.928571428571429e-05, 'samples': 3360, 'steps': 69, 'loss/train': 7.726537704467773}
|
275 |
+
07/25/2024 08:54:27 - INFO - __main__ - Step 71: {'lr': 5e-05, 'samples': 3408, 'steps': 70, 'loss/train': 7.8505425453186035}
|
276 |
+
07/25/2024 08:54:27 - INFO - __main__ - Step 72: {'lr': 5.0714285714285716e-05, 'samples': 3456, 'steps': 71, 'loss/train': 7.492800235748291}
|
277 |
+
07/25/2024 08:54:27 - INFO - __main__ - Step 73: {'lr': 5.142857142857143e-05, 'samples': 3504, 'steps': 72, 'loss/train': 7.890054225921631}
|
278 |
+
07/25/2024 08:54:28 - INFO - __main__ - Step 74: {'lr': 5.214285714285714e-05, 'samples': 3552, 'steps': 73, 'loss/train': 7.429488182067871}
|
279 |
+
07/25/2024 08:54:28 - INFO - __main__ - Step 75: {'lr': 5.285714285714286e-05, 'samples': 3600, 'steps': 74, 'loss/train': 7.520913600921631}
|
280 |
+
07/25/2024 08:54:28 - INFO - __main__ - Step 76: {'lr': 5.357142857142857e-05, 'samples': 3648, 'steps': 75, 'loss/train': 7.66839075088501}
|
281 |
+
07/25/2024 08:54:28 - INFO - __main__ - Step 77: {'lr': 5.428571428571429e-05, 'samples': 3696, 'steps': 76, 'loss/train': 7.810487270355225}
|
282 |
+
07/25/2024 08:54:29 - INFO - __main__ - Step 78: {'lr': 5.5e-05, 'samples': 3744, 'steps': 77, 'loss/train': 7.009271621704102}
|
283 |
+
07/25/2024 08:54:29 - INFO - __main__ - Step 79: {'lr': 5.5714285714285715e-05, 'samples': 3792, 'steps': 78, 'loss/train': 7.631109714508057}
|
284 |
+
07/25/2024 08:54:29 - INFO - __main__ - Step 80: {'lr': 5.642857142857143e-05, 'samples': 3840, 'steps': 79, 'loss/train': 6.9839606285095215}
|
285 |
+
07/25/2024 08:54:30 - INFO - __main__ - Step 81: {'lr': 5.714285714285714e-05, 'samples': 3888, 'steps': 80, 'loss/train': 7.642471790313721}
|
286 |
+
07/25/2024 08:54:30 - INFO - __main__ - Step 82: {'lr': 5.7857142857142855e-05, 'samples': 3936, 'steps': 81, 'loss/train': 7.183259010314941}
|
287 |
+
07/25/2024 08:54:30 - INFO - __main__ - Step 83: {'lr': 5.8571428571428575e-05, 'samples': 3984, 'steps': 82, 'loss/train': 7.3919596672058105}
|
288 |
+
07/25/2024 08:54:30 - INFO - __main__ - Step 84: {'lr': 5.928571428571429e-05, 'samples': 4032, 'steps': 83, 'loss/train': 7.52573299407959}
|
289 |
+
07/25/2024 08:54:31 - INFO - __main__ - Step 85: {'lr': 6e-05, 'samples': 4080, 'steps': 84, 'loss/train': 7.169320583343506}
|
290 |
+
07/25/2024 08:54:31 - INFO - __main__ - Step 86: {'lr': 6.0714285714285715e-05, 'samples': 4128, 'steps': 85, 'loss/train': 7.095631122589111}
|
291 |
+
07/25/2024 08:54:31 - INFO - __main__ - Step 87: {'lr': 6.142857142857143e-05, 'samples': 4176, 'steps': 86, 'loss/train': 7.257204532623291}
|
292 |
+
07/25/2024 08:54:32 - INFO - __main__ - Step 88: {'lr': 6.214285714285714e-05, 'samples': 4224, 'steps': 87, 'loss/train': 6.010106563568115}
|
293 |
+
07/25/2024 08:54:32 - INFO - __main__ - Step 89: {'lr': 6.285714285714286e-05, 'samples': 4272, 'steps': 88, 'loss/train': 7.189196586608887}
|
294 |
+
07/25/2024 08:54:32 - INFO - __main__ - Step 90: {'lr': 6.357142857142857e-05, 'samples': 4320, 'steps': 89, 'loss/train': 6.902089595794678}
|
295 |
+
07/25/2024 08:54:32 - INFO - __main__ - Step 91: {'lr': 6.428571428571427e-05, 'samples': 4368, 'steps': 90, 'loss/train': 6.5942535400390625}
|
296 |
+
07/25/2024 08:54:33 - INFO - __main__ - Step 92: {'lr': 6.500000000000001e-05, 'samples': 4416, 'steps': 91, 'loss/train': 7.392148017883301}
|
297 |
+
07/25/2024 08:54:33 - INFO - __main__ - Step 93: {'lr': 6.571428571428571e-05, 'samples': 4464, 'steps': 92, 'loss/train': 6.586553573608398}
|
298 |
+
07/25/2024 08:54:33 - INFO - __main__ - Step 94: {'lr': 6.642857142857143e-05, 'samples': 4512, 'steps': 93, 'loss/train': 7.5296549797058105}
|
299 |
+
07/25/2024 08:54:34 - INFO - __main__ - Step 95: {'lr': 6.714285714285714e-05, 'samples': 4560, 'steps': 94, 'loss/train': 7.048985481262207}
|
300 |
+
07/25/2024 08:54:34 - INFO - __main__ - Step 96: {'lr': 6.785714285714285e-05, 'samples': 4608, 'steps': 95, 'loss/train': 4.687469959259033}
|
301 |
+
07/25/2024 08:54:34 - INFO - __main__ - Step 97: {'lr': 6.857142857142858e-05, 'samples': 4656, 'steps': 96, 'loss/train': 7.1623854637146}
|
302 |
+
07/25/2024 08:54:34 - INFO - __main__ - Step 98: {'lr': 6.928571428571429e-05, 'samples': 4704, 'steps': 97, 'loss/train': 6.722190856933594}
|
303 |
+
07/25/2024 08:54:35 - INFO - __main__ - Step 99: {'lr': 7.000000000000001e-05, 'samples': 4752, 'steps': 98, 'loss/train': 6.930887699127197}
|
304 |
+
07/25/2024 08:54:35 - INFO - __main__ - Step 100: {'lr': 7.071428571428571e-05, 'samples': 4800, 'steps': 99, 'loss/train': 7.2268805503845215}
|
305 |
+
07/25/2024 08:54:35 - INFO - __main__ - Evaluating and saving model checkpoint
|
306 |
+
07/25/2024 08:54:35 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 1/1 shards.
|
307 |
+
07/25/2024 08:54:39 - INFO - __main__ - Step 100: {'loss/eval': 7.000552177429199, 'perplexity': 1097.2388916015625}
|
308 |
+
07/25/2024 08:54:41 - INFO - accelerate.accelerator - Saving current state to my_checkpoint
|
309 |
+
07/25/2024 08:54:41 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
310 |
+
07/25/2024 08:54:42 - INFO - accelerate.checkpointing - Model weights saved in my_checkpoint/model.safetensors
|
311 |
+
07/25/2024 08:54:43 - INFO - accelerate.checkpointing - Optimizer state saved in my_checkpoint/optimizer.bin
|
312 |
+
07/25/2024 08:54:43 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in my_checkpoint/sampler.bin
|
313 |
+
07/25/2024 08:54:43 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
314 |
+
07/25/2024 08:54:43 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
315 |
+
07/25/2024 08:54:43 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5b53a26927d2b43758728f34ad60f2c710d38bfb6715cc5a25b0b4eddc03e34
|
3 |
size 444048000
|
my_checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5b53a26927d2b43758728f34ad60f2c710d38bfb6715cc5a25b0b4eddc03e34
|
3 |
size 444048000
|
my_checkpoint/optimizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 888189882
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fce95cdbff74158e4b8bd4f9caea7fd7daf803e0fd9fe145f1a3628a99c075fc
|
3 |
size 888189882
|
my_checkpoint/random_states_0.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15124
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1a2c0e274d1095442c199a361a0073b9b9573004b962744ec7f39207787b113
|
3 |
size 15124
|
my_checkpoint/scaler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 988
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22e351a6446f3cec84d6aad5716ef9dbee9f7eb8766ebf9f8ce2fcfefe161ba4
|
3 |
size 988
|
runs/Jul25_08-52-29_lab/events.out.tfevents.1721897549.lab.173640.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:706b377dd2e6fa04f28d73b17d74d366447ae15a73b733ec8757e3d1e778530c
|
3 |
+
size 17878
|
torch_checkpoint/latest_checkpoint.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1333274074
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f757b60fe2a84adf317d9e51bcc70355bb190fdcd0c17b89b824e72ce547dee
|
3 |
size 1333274074
|