step 350
Browse files
log/debug_0.log
CHANGED
@@ -735,3 +735,69 @@ Mixed precision type: fp16
|
|
735 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
736 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
737 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
735 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
736 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
737 |
07/25/2024 06:31:33 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
738 |
+
07/25/2024 06:32:35 - WARNING - huggingface_hub.repository - Several commits (6) will be pushed upstream.
|
739 |
+
07/25/2024 06:32:35 - WARNING - huggingface_hub.repository - The progress bars may be unreliable.
|
740 |
+
07/25/2024 06:33:00 - WARNING - huggingface_hub.repository - To https://huggingface.co/shng2025/gptesla-small
|
741 |
+
4d31a9f..aae7e8d celestial-aardvark-128 -> celestial-aardvark-128
|
742 |
+
|
743 |
+
07/25/2024 06:33:00 - INFO - __main__ - Step 301: {'lr': 0.00021428571428571427, 'samples': 14448, 'steps': 300, 'loss/train': 5.406757354736328}
|
744 |
+
07/25/2024 06:33:00 - INFO - __main__ - Step 302: {'lr': 0.000215, 'samples': 14496, 'steps': 301, 'loss/train': 5.90996789932251}
|
745 |
+
07/25/2024 06:33:01 - INFO - __main__ - Step 303: {'lr': 0.00021571428571428571, 'samples': 14544, 'steps': 302, 'loss/train': 6.092479228973389}
|
746 |
+
07/25/2024 06:33:01 - INFO - __main__ - Step 304: {'lr': 0.00021642857142857143, 'samples': 14592, 'steps': 303, 'loss/train': 5.216100215911865}
|
747 |
+
07/25/2024 06:33:01 - INFO - __main__ - Step 305: {'lr': 0.00021714285714285715, 'samples': 14640, 'steps': 304, 'loss/train': 5.621682643890381}
|
748 |
+
07/25/2024 06:33:01 - INFO - __main__ - Step 306: {'lr': 0.00021785714285714287, 'samples': 14688, 'steps': 305, 'loss/train': 5.823093414306641}
|
749 |
+
07/25/2024 06:33:02 - INFO - __main__ - Step 307: {'lr': 0.0002185714285714286, 'samples': 14736, 'steps': 306, 'loss/train': 6.228525161743164}
|
750 |
+
07/25/2024 06:33:02 - INFO - __main__ - Step 308: {'lr': 0.0002192857142857143, 'samples': 14784, 'steps': 307, 'loss/train': 5.9510087966918945}
|
751 |
+
07/25/2024 06:33:02 - INFO - __main__ - Step 309: {'lr': 0.00022, 'samples': 14832, 'steps': 308, 'loss/train': 5.266091346740723}
|
752 |
+
07/25/2024 06:33:03 - INFO - __main__ - Step 310: {'lr': 0.00022071428571428573, 'samples': 14880, 'steps': 309, 'loss/train': 5.217267036437988}
|
753 |
+
07/25/2024 06:33:03 - INFO - __main__ - Step 311: {'lr': 0.00022142857142857142, 'samples': 14928, 'steps': 310, 'loss/train': 7.697060585021973}
|
754 |
+
07/25/2024 06:33:03 - INFO - __main__ - Step 312: {'lr': 0.00022214285714285714, 'samples': 14976, 'steps': 311, 'loss/train': 5.666650772094727}
|
755 |
+
07/25/2024 06:33:03 - INFO - __main__ - Step 313: {'lr': 0.00022285714285714286, 'samples': 15024, 'steps': 312, 'loss/train': 6.425085067749023}
|
756 |
+
07/25/2024 06:33:04 - INFO - __main__ - Step 314: {'lr': 0.00022357142857142855, 'samples': 15072, 'steps': 313, 'loss/train': 4.396389007568359}
|
757 |
+
07/25/2024 06:33:04 - INFO - __main__ - Step 315: {'lr': 0.0002242857142857143, 'samples': 15120, 'steps': 314, 'loss/train': 5.2941131591796875}
|
758 |
+
07/25/2024 06:33:04 - INFO - __main__ - Step 316: {'lr': 0.00022500000000000002, 'samples': 15168, 'steps': 315, 'loss/train': 5.752312183380127}
|
759 |
+
07/25/2024 06:33:04 - INFO - __main__ - Step 317: {'lr': 0.00022571428571428571, 'samples': 15216, 'steps': 316, 'loss/train': 6.089960098266602}
|
760 |
+
07/25/2024 06:33:05 - INFO - __main__ - Step 318: {'lr': 0.00022642857142857143, 'samples': 15264, 'steps': 317, 'loss/train': 5.828670978546143}
|
761 |
+
07/25/2024 06:33:05 - INFO - __main__ - Step 319: {'lr': 0.00022714285714285715, 'samples': 15312, 'steps': 318, 'loss/train': 5.34361457824707}
|
762 |
+
07/25/2024 06:33:05 - INFO - __main__ - Step 320: {'lr': 0.00022785714285714287, 'samples': 15360, 'steps': 319, 'loss/train': 3.9433271884918213}
|
763 |
+
07/25/2024 06:33:06 - INFO - __main__ - Step 321: {'lr': 0.00022857142857142857, 'samples': 15408, 'steps': 320, 'loss/train': 5.489405632019043}
|
764 |
+
07/25/2024 06:33:06 - INFO - __main__ - Step 322: {'lr': 0.0002292857142857143, 'samples': 15456, 'steps': 321, 'loss/train': 5.065426826477051}
|
765 |
+
07/25/2024 06:33:06 - INFO - __main__ - Step 323: {'lr': 0.00023, 'samples': 15504, 'steps': 322, 'loss/train': 4.657402038574219}
|
766 |
+
07/25/2024 06:33:06 - INFO - __main__ - Step 324: {'lr': 0.0002307142857142857, 'samples': 15552, 'steps': 323, 'loss/train': 6.042489528656006}
|
767 |
+
07/25/2024 06:33:07 - INFO - __main__ - Step 325: {'lr': 0.00023142857142857142, 'samples': 15600, 'steps': 324, 'loss/train': 5.562082290649414}
|
768 |
+
07/25/2024 06:33:07 - INFO - __main__ - Step 326: {'lr': 0.00023214285714285717, 'samples': 15648, 'steps': 325, 'loss/train': 5.726541519165039}
|
769 |
+
07/25/2024 06:33:07 - INFO - __main__ - Step 327: {'lr': 0.00023285714285714286, 'samples': 15696, 'steps': 326, 'loss/train': 5.573945045471191}
|
770 |
+
07/25/2024 06:33:08 - INFO - __main__ - Step 328: {'lr': 0.00023357142857142858, 'samples': 15744, 'steps': 327, 'loss/train': 6.105917930603027}
|
771 |
+
07/25/2024 06:33:08 - INFO - __main__ - Step 329: {'lr': 0.0002342857142857143, 'samples': 15792, 'steps': 328, 'loss/train': 5.546865463256836}
|
772 |
+
07/25/2024 06:33:08 - INFO - __main__ - Step 330: {'lr': 0.000235, 'samples': 15840, 'steps': 329, 'loss/train': 5.543821334838867}
|
773 |
+
07/25/2024 06:33:08 - INFO - __main__ - Step 331: {'lr': 0.0002357142857142857, 'samples': 15888, 'steps': 330, 'loss/train': 5.6774582862854}
|
774 |
+
07/25/2024 06:33:09 - INFO - __main__ - Step 332: {'lr': 0.00023642857142857143, 'samples': 15936, 'steps': 331, 'loss/train': 5.767722129821777}
|
775 |
+
07/25/2024 06:33:09 - INFO - __main__ - Step 333: {'lr': 0.00023714285714285715, 'samples': 15984, 'steps': 332, 'loss/train': 5.70899772644043}
|
776 |
+
07/25/2024 06:33:09 - INFO - __main__ - Step 334: {'lr': 0.00023785714285714285, 'samples': 16032, 'steps': 333, 'loss/train': 5.67036247253418}
|
777 |
+
07/25/2024 06:33:10 - INFO - __main__ - Step 335: {'lr': 0.00023857142857142857, 'samples': 16080, 'steps': 334, 'loss/train': 5.325812339782715}
|
778 |
+
07/25/2024 06:33:10 - INFO - __main__ - Step 336: {'lr': 0.0002392857142857143, 'samples': 16128, 'steps': 335, 'loss/train': 5.349172592163086}
|
779 |
+
07/25/2024 06:33:10 - INFO - __main__ - Step 337: {'lr': 0.00024, 'samples': 16176, 'steps': 336, 'loss/train': 5.448930263519287}
|
780 |
+
07/25/2024 06:33:10 - INFO - __main__ - Step 338: {'lr': 0.00024071428571428573, 'samples': 16224, 'steps': 337, 'loss/train': 3.7934205532073975}
|
781 |
+
07/25/2024 06:33:11 - INFO - __main__ - Step 339: {'lr': 0.00024142857142857145, 'samples': 16272, 'steps': 338, 'loss/train': 5.1056013107299805}
|
782 |
+
07/25/2024 06:33:11 - INFO - __main__ - Step 340: {'lr': 0.00024214285714285714, 'samples': 16320, 'steps': 339, 'loss/train': 5.9682464599609375}
|
783 |
+
07/25/2024 06:33:11 - INFO - __main__ - Step 341: {'lr': 0.00024285714285714286, 'samples': 16368, 'steps': 340, 'loss/train': 5.546884536743164}
|
784 |
+
07/25/2024 06:33:12 - INFO - __main__ - Step 342: {'lr': 0.00024357142857142858, 'samples': 16416, 'steps': 341, 'loss/train': 6.586970329284668}
|
785 |
+
07/25/2024 06:33:12 - INFO - __main__ - Step 343: {'lr': 0.0002442857142857143, 'samples': 16464, 'steps': 342, 'loss/train': 5.654937744140625}
|
786 |
+
07/25/2024 06:33:12 - INFO - __main__ - Step 344: {'lr': 0.000245, 'samples': 16512, 'steps': 343, 'loss/train': 3.9033658504486084}
|
787 |
+
07/25/2024 06:33:12 - INFO - __main__ - Step 345: {'lr': 0.00024571428571428574, 'samples': 16560, 'steps': 344, 'loss/train': 6.266292095184326}
|
788 |
+
07/25/2024 06:33:13 - INFO - __main__ - Step 346: {'lr': 0.00024642857142857143, 'samples': 16608, 'steps': 345, 'loss/train': 5.5901007652282715}
|
789 |
+
07/25/2024 06:33:13 - INFO - __main__ - Step 347: {'lr': 0.0002471428571428571, 'samples': 16656, 'steps': 346, 'loss/train': 5.836148738861084}
|
790 |
+
07/25/2024 06:33:13 - INFO - __main__ - Step 348: {'lr': 0.00024785714285714287, 'samples': 16704, 'steps': 347, 'loss/train': 5.447431564331055}
|
791 |
+
07/25/2024 06:33:13 - INFO - __main__ - Step 349: {'lr': 0.00024857142857142857, 'samples': 16752, 'steps': 348, 'loss/train': 5.124023914337158}
|
792 |
+
07/25/2024 06:33:14 - INFO - __main__ - Step 350: {'lr': 0.00024928571428571426, 'samples': 16800, 'steps': 349, 'loss/train': 5.541380405426025}
|
793 |
+
07/25/2024 06:33:14 - INFO - __main__ - Evaluating and saving model checkpoint
|
794 |
+
07/25/2024 06:33:14 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 1/1 shards.
|
795 |
+
07/25/2024 06:33:17 - INFO - __main__ - Step 350: {'loss/eval': 5.6890645027160645, 'perplexity': 295.616943359375}
|
796 |
+
07/25/2024 06:33:18 - INFO - accelerate.accelerator - Saving current state to my_checkpoint
|
797 |
+
07/25/2024 06:33:18 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
798 |
+
07/25/2024 06:33:18 - INFO - accelerate.checkpointing - Model weights saved in my_checkpoint/model.safetensors
|
799 |
+
07/25/2024 06:33:20 - INFO - accelerate.checkpointing - Optimizer state saved in my_checkpoint/optimizer.bin
|
800 |
+
07/25/2024 06:33:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in my_checkpoint/sampler.bin
|
801 |
+
07/25/2024 06:33:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
802 |
+
07/25/2024 06:33:20 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
803 |
+
07/25/2024 06:33:20 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74d03dfe4f9a7c3dbed0e2772823c452d61c735ff1d9e4edb373db7f38aa08d6
|
3 |
size 444048000
|
my_checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74d03dfe4f9a7c3dbed0e2772823c452d61c735ff1d9e4edb373db7f38aa08d6
|
3 |
size 444048000
|
my_checkpoint/optimizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 888189882
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4e8ba2424b27ae17a33f056fcd4d41d0614ae78ffb99613e9bb2e0bc0fa8eca
|
3 |
size 888189882
|
my_checkpoint/random_states_0.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15124
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3553afdcff7fb785ac1270d5080609b4c8a0923b15553a0bc793529bfaf13828
|
3 |
size 15124
|
my_checkpoint/scaler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 988
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:decf8f85268f875c08a51552b9878ffacb11e12823a8bff511e2a56422200275
|
3 |
size 988
|
runs/Jul25_06-22-39_lab/events.out.tfevents.1721888559.lab.31151.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c78aa4878fb5b5894e0d0f86e8686fc9b7fc6019540c6c1cc9cd1b3f15083f2a
|
3 |
+
size 63255
|