step 200
Browse files
log/debug_0.log
CHANGED
@@ -537,3 +537,69 @@ Mixed precision type: fp16
|
|
537 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
538 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
539 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
537 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
538 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
539 |
07/25/2024 06:26:14 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
540 |
+
07/25/2024 06:27:15 - WARNING - huggingface_hub.repository - Several commits (3) will be pushed upstream.
|
541 |
+
07/25/2024 06:27:15 - WARNING - huggingface_hub.repository - The progress bars may be unreliable.
|
542 |
+
07/25/2024 06:27:38 - WARNING - huggingface_hub.repository - To https://huggingface.co/shng2025/gptesla-small
|
543 |
+
1e419a3..dcc8019 celestial-aardvark-128 -> celestial-aardvark-128
|
544 |
+
|
545 |
+
07/25/2024 06:27:38 - INFO - __main__ - Step 151: {'lr': 0.00010714285714285714, 'samples': 7248, 'steps': 150, 'loss/train': 6.574682235717773}
|
546 |
+
07/25/2024 06:27:38 - INFO - __main__ - Step 152: {'lr': 0.00010785714285714286, 'samples': 7296, 'steps': 151, 'loss/train': 5.2919840812683105}
|
547 |
+
07/25/2024 06:27:39 - INFO - __main__ - Step 153: {'lr': 0.00010857142857142858, 'samples': 7344, 'steps': 152, 'loss/train': 6.282163143157959}
|
548 |
+
07/25/2024 06:27:39 - INFO - __main__ - Step 154: {'lr': 0.0001092857142857143, 'samples': 7392, 'steps': 153, 'loss/train': 6.462711334228516}
|
549 |
+
07/25/2024 06:27:39 - INFO - __main__ - Step 155: {'lr': 0.00011, 'samples': 7440, 'steps': 154, 'loss/train': 5.595396518707275}
|
550 |
+
07/25/2024 06:27:39 - INFO - __main__ - Step 156: {'lr': 0.00011071428571428571, 'samples': 7488, 'steps': 155, 'loss/train': 6.128833293914795}
|
551 |
+
07/25/2024 06:27:40 - INFO - __main__ - Step 157: {'lr': 0.00011142857142857143, 'samples': 7536, 'steps': 156, 'loss/train': 6.035909652709961}
|
552 |
+
07/25/2024 06:27:40 - INFO - __main__ - Step 158: {'lr': 0.00011214285714285715, 'samples': 7584, 'steps': 157, 'loss/train': 6.275477886199951}
|
553 |
+
07/25/2024 06:27:40 - INFO - __main__ - Step 159: {'lr': 0.00011285714285714286, 'samples': 7632, 'steps': 158, 'loss/train': 6.1195969581604}
|
554 |
+
07/25/2024 06:27:40 - INFO - __main__ - Step 160: {'lr': 0.00011357142857142858, 'samples': 7680, 'steps': 159, 'loss/train': 8.316116333007812}
|
555 |
+
07/25/2024 06:27:41 - INFO - __main__ - Step 161: {'lr': 0.00011428571428571428, 'samples': 7728, 'steps': 160, 'loss/train': 6.287449836730957}
|
556 |
+
07/25/2024 06:27:41 - INFO - __main__ - Step 162: {'lr': 0.000115, 'samples': 7776, 'steps': 161, 'loss/train': 5.879787445068359}
|
557 |
+
07/25/2024 06:27:41 - INFO - __main__ - Step 163: {'lr': 0.00011571428571428571, 'samples': 7824, 'steps': 162, 'loss/train': 6.221517086029053}
|
558 |
+
07/25/2024 06:27:42 - INFO - __main__ - Step 164: {'lr': 0.00011642857142857143, 'samples': 7872, 'steps': 163, 'loss/train': 5.967787265777588}
|
559 |
+
07/25/2024 06:27:42 - INFO - __main__ - Step 165: {'lr': 0.00011714285714285715, 'samples': 7920, 'steps': 164, 'loss/train': 6.09508752822876}
|
560 |
+
07/25/2024 06:27:42 - INFO - __main__ - Step 166: {'lr': 0.00011785714285714286, 'samples': 7968, 'steps': 165, 'loss/train': 6.462942123413086}
|
561 |
+
07/25/2024 06:27:42 - INFO - __main__ - Step 167: {'lr': 0.00011857142857142858, 'samples': 8016, 'steps': 166, 'loss/train': 6.146663188934326}
|
562 |
+
07/25/2024 06:27:43 - INFO - __main__ - Step 168: {'lr': 0.00011928571428571428, 'samples': 8064, 'steps': 167, 'loss/train': 6.4038286209106445}
|
563 |
+
07/25/2024 06:27:43 - INFO - __main__ - Step 169: {'lr': 0.00012, 'samples': 8112, 'steps': 168, 'loss/train': 6.267633438110352}
|
564 |
+
07/25/2024 06:27:43 - INFO - __main__ - Step 170: {'lr': 0.00012071428571428572, 'samples': 8160, 'steps': 169, 'loss/train': 6.64249324798584}
|
565 |
+
07/25/2024 06:27:44 - INFO - __main__ - Step 171: {'lr': 0.00012142857142857143, 'samples': 8208, 'steps': 170, 'loss/train': 6.448271751403809}
|
566 |
+
07/25/2024 06:27:44 - INFO - __main__ - Step 172: {'lr': 0.00012214285714285715, 'samples': 8256, 'steps': 171, 'loss/train': 6.485412120819092}
|
567 |
+
07/25/2024 06:27:44 - INFO - __main__ - Step 173: {'lr': 0.00012285714285714287, 'samples': 8304, 'steps': 172, 'loss/train': 6.213407516479492}
|
568 |
+
07/25/2024 06:27:44 - INFO - __main__ - Step 174: {'lr': 0.00012357142857142856, 'samples': 8352, 'steps': 173, 'loss/train': 5.832103729248047}
|
569 |
+
07/25/2024 06:27:45 - INFO - __main__ - Step 175: {'lr': 0.00012428571428571428, 'samples': 8400, 'steps': 174, 'loss/train': 5.645206928253174}
|
570 |
+
07/25/2024 06:27:45 - INFO - __main__ - Step 176: {'lr': 0.000125, 'samples': 8448, 'steps': 175, 'loss/train': 5.942577838897705}
|
571 |
+
07/25/2024 06:27:45 - INFO - __main__ - Step 177: {'lr': 0.00012571428571428572, 'samples': 8496, 'steps': 176, 'loss/train': 6.108009338378906}
|
572 |
+
07/25/2024 06:27:46 - INFO - __main__ - Step 178: {'lr': 0.00012642857142857142, 'samples': 8544, 'steps': 177, 'loss/train': 6.048696994781494}
|
573 |
+
07/25/2024 06:27:46 - INFO - __main__ - Step 179: {'lr': 0.00012714285714285714, 'samples': 8592, 'steps': 178, 'loss/train': 6.014152526855469}
|
574 |
+
07/25/2024 06:27:46 - INFO - __main__ - Step 180: {'lr': 0.00012785714285714286, 'samples': 8640, 'steps': 179, 'loss/train': 6.590332508087158}
|
575 |
+
07/25/2024 06:27:46 - INFO - __main__ - Step 181: {'lr': 0.00012857142857142855, 'samples': 8688, 'steps': 180, 'loss/train': 6.095800399780273}
|
576 |
+
07/25/2024 06:27:47 - INFO - __main__ - Step 182: {'lr': 0.0001292857142857143, 'samples': 8736, 'steps': 181, 'loss/train': 5.968374729156494}
|
577 |
+
07/25/2024 06:27:47 - INFO - __main__ - Step 183: {'lr': 0.00013000000000000002, 'samples': 8784, 'steps': 182, 'loss/train': 6.073035717010498}
|
578 |
+
07/25/2024 06:27:47 - INFO - __main__ - Step 184: {'lr': 0.00013071428571428574, 'samples': 8832, 'steps': 183, 'loss/train': 7.681509494781494}
|
579 |
+
07/25/2024 06:27:47 - INFO - __main__ - Step 185: {'lr': 0.00013142857142857143, 'samples': 8880, 'steps': 184, 'loss/train': 5.806171417236328}
|
580 |
+
07/25/2024 06:27:48 - INFO - __main__ - Step 186: {'lr': 0.00013214285714285715, 'samples': 8928, 'steps': 185, 'loss/train': 5.868297576904297}
|
581 |
+
07/25/2024 06:27:48 - INFO - __main__ - Step 187: {'lr': 0.00013285714285714287, 'samples': 8976, 'steps': 186, 'loss/train': 5.532838344573975}
|
582 |
+
07/25/2024 06:27:48 - INFO - __main__ - Step 188: {'lr': 0.00013357142857142856, 'samples': 9024, 'steps': 187, 'loss/train': 6.210916042327881}
|
583 |
+
07/25/2024 06:27:49 - INFO - __main__ - Step 189: {'lr': 0.00013428571428571428, 'samples': 9072, 'steps': 188, 'loss/train': 5.803860187530518}
|
584 |
+
07/25/2024 06:27:49 - INFO - __main__ - Step 190: {'lr': 0.000135, 'samples': 9120, 'steps': 189, 'loss/train': 6.666335105895996}
|
585 |
+
07/25/2024 06:27:49 - INFO - __main__ - Step 191: {'lr': 0.0001357142857142857, 'samples': 9168, 'steps': 190, 'loss/train': 5.624790668487549}
|
586 |
+
07/25/2024 06:27:49 - INFO - __main__ - Step 192: {'lr': 0.00013642857142857144, 'samples': 9216, 'steps': 191, 'loss/train': 5.217100143432617}
|
587 |
+
07/25/2024 06:27:50 - INFO - __main__ - Step 193: {'lr': 0.00013714285714285716, 'samples': 9264, 'steps': 192, 'loss/train': 5.951303482055664}
|
588 |
+
07/25/2024 06:27:50 - INFO - __main__ - Step 194: {'lr': 0.00013785714285714285, 'samples': 9312, 'steps': 193, 'loss/train': 5.851853847503662}
|
589 |
+
07/25/2024 06:27:50 - INFO - __main__ - Step 195: {'lr': 0.00013857142857142857, 'samples': 9360, 'steps': 194, 'loss/train': 5.776468276977539}
|
590 |
+
07/25/2024 06:27:51 - INFO - __main__ - Step 196: {'lr': 0.0001392857142857143, 'samples': 9408, 'steps': 195, 'loss/train': 5.7882866859436035}
|
591 |
+
07/25/2024 06:27:51 - INFO - __main__ - Step 197: {'lr': 0.00014000000000000001, 'samples': 9456, 'steps': 196, 'loss/train': 5.621963024139404}
|
592 |
+
07/25/2024 06:27:51 - INFO - __main__ - Step 198: {'lr': 0.0001407142857142857, 'samples': 9504, 'steps': 197, 'loss/train': 5.277397632598877}
|
593 |
+
07/25/2024 06:27:51 - INFO - __main__ - Step 199: {'lr': 0.00014142857142857143, 'samples': 9552, 'steps': 198, 'loss/train': 5.9324951171875}
|
594 |
+
07/25/2024 06:27:52 - INFO - __main__ - Step 200: {'lr': 0.00014214285714285715, 'samples': 9600, 'steps': 199, 'loss/train': 6.0901618003845215}
|
595 |
+
07/25/2024 06:27:52 - INFO - __main__ - Evaluating and saving model checkpoint
|
596 |
+
07/25/2024 06:27:52 - DEBUG - datasets.iterable_dataset - dataloader worker#0, ': Starting to iterate over 1/1 shards.
|
597 |
+
07/25/2024 06:27:55 - INFO - __main__ - Step 200: {'loss/eval': 6.142789840698242, 'perplexity': 465.3500061035156}
|
598 |
+
07/25/2024 06:27:56 - INFO - accelerate.accelerator - Saving current state to my_checkpoint
|
599 |
+
07/25/2024 06:27:56 - WARNING - accelerate.utils.other - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
600 |
+
07/25/2024 06:27:56 - INFO - accelerate.checkpointing - Model weights saved in my_checkpoint/model.safetensors
|
601 |
+
07/25/2024 06:27:58 - INFO - accelerate.checkpointing - Optimizer state saved in my_checkpoint/optimizer.bin
|
602 |
+
07/25/2024 06:27:58 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in my_checkpoint/sampler.bin
|
603 |
+
07/25/2024 06:27:58 - INFO - accelerate.checkpointing - Sampler state for dataloader 1 saved in my_checkpoint/sampler_1.bin
|
604 |
+
07/25/2024 06:27:58 - INFO - accelerate.checkpointing - Gradient scaler state saved in my_checkpoint/scaler.pt
|
605 |
+
07/25/2024 06:27:58 - INFO - accelerate.checkpointing - Random states saved in my_checkpoint/random_states_0.pkl
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2915166eeb447787f7f807f52abb7974ddbe6809c764e6a97c08f1342ed1aaeb
|
3 |
size 444048000
|
my_checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 444048000
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2915166eeb447787f7f807f52abb7974ddbe6809c764e6a97c08f1342ed1aaeb
|
3 |
size 444048000
|
my_checkpoint/optimizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 888189882
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f33a96f559673c18a958f574b923919a15ab83a3abd184e0036e3c177b7ed038
|
3 |
size 888189882
|
my_checkpoint/random_states_0.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15124
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b78ba6311cb397c7d4865e76a561647029bc9a753964384051d9e4f61d2f5df
|
3 |
size 15124
|
my_checkpoint/scaler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 988
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:875ef2d9f0990004d87a6506b33ab8a55d70c5ab5c100eb1bd25758e01924e1f
|
3 |
size 988
|
runs/Jul25_06-22-39_lab/events.out.tfevents.1721888559.lab.31151.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:063bc9f0c16dc09f2795efc3d7d747202fd549a883528e8b40574e6715141ad7
|
3 |
+
size 35964
|