Training in progress, step 12000

Browse files

Files changed (5) hide show

optimizer.pt +1 -1
rng_state.pth +1 -1
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK +2 -2
scheduler.pt +1 -1
trainer_state.json +712 -3

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf57752b7fe8b0f27d2c11bd437ea0d0546c101a5edf6c2b8ef58464ff8128e9
 size 11230198

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf5c3ae23e0061e18908dedb416fcb0d32e5ecadfc7f18abb693d4f1c6a53a96
 size 11230198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:69f50a692634404f2eebb2eab9f456865957578d752987bc52d843ac2a774366
 size 14244

runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b87ebb898ef8d4cf2fd130cf8dc476f284c6ad66d1da7de9a1509f9a6001400
-size 131495

 version https://git-lfs.github.com/spec/v1
+oid sha256:43356ad5026fe7bb50ebc6b79634c8aa625fabdcc290f5b9e335f623df1606f8
+size 132761

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c903d70ba831f7bc91d767743519849df9eeb11f7c11a55a187111672ce37e65
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:af7225d5b0731cf57528f6961f709bb0e7ed929fa0d79711b3aa9685866e262d
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.045244978938462306,
   "eval_steps": 2000,
-  "global_step": 10000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -3552,6 +3552,715 @@
       "eval_samples_per_second": 2857.071,
       "eval_steps_per_second": 11.163,
       "step": 10000
     }
   ],
   "logging_steps": 20,
@@ -3559,7 +4268,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
-  "total_flos": 3595905269760000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.05429397472615476,
   "eval_steps": 2000,
+  "global_step": 12000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2857.071,
       "eval_steps_per_second": 11.163,
       "step": 10000
+    },
+    {
+      "epoch": 0.04533546889633923,
+      "grad_norm": 12.951713562011719,
+      "learning_rate": 0.00013589720387295268,
+      "loss": 9.154,
+      "step": 10020
+    },
+    {
+      "epoch": 0.04542595885421615,
+      "grad_norm": 9.139362335205078,
+      "learning_rate": 0.00013616867251832414,
+      "loss": 9.154,
+      "step": 10040
+    },
+    {
+      "epoch": 0.04551644881209308,
+      "grad_norm": 8.388337135314941,
+      "learning_rate": 0.0001364401411636956,
+      "loss": 9.1391,
+      "step": 10060
+    },
+    {
+      "epoch": 0.045606938769970004,
+      "grad_norm": 10.0809326171875,
+      "learning_rate": 0.00013671160980906704,
+      "loss": 9.1417,
+      "step": 10080
+    },
+    {
+      "epoch": 0.04569742872784693,
+      "grad_norm": 8.565701484680176,
+      "learning_rate": 0.0001369830784544385,
+      "loss": 9.1112,
+      "step": 10100
+    },
+    {
+      "epoch": 0.04578791868572385,
+      "grad_norm": 10.437520027160645,
+      "learning_rate": 0.00013725454709980997,
+      "loss": 9.1169,
+      "step": 10120
+    },
+    {
+      "epoch": 0.04587840864360078,
+      "grad_norm": 8.615896224975586,
+      "learning_rate": 0.00013752601574518143,
+      "loss": 9.1003,
+      "step": 10140
+    },
+    {
+      "epoch": 0.0459688986014777,
+      "grad_norm": 10.89583683013916,
+      "learning_rate": 0.0001377974843905529,
+      "loss": 9.101,
+      "step": 10160
+    },
+    {
+      "epoch": 0.046059388559354625,
+      "grad_norm": 9.786931991577148,
+      "learning_rate": 0.00013806895303592433,
+      "loss": 9.0689,
+      "step": 10180
+    },
+    {
+      "epoch": 0.04614987851723155,
+      "grad_norm": 9.010174751281738,
+      "learning_rate": 0.0001383404216812958,
+      "loss": 9.0579,
+      "step": 10200
+    },
+    {
+      "epoch": 0.04624036847510848,
+      "grad_norm": 11.039669036865234,
+      "learning_rate": 0.00013861189032666725,
+      "loss": 9.0865,
+      "step": 10220
+    },
+    {
+      "epoch": 0.0463308584329854,
+      "grad_norm": 12.055830001831055,
+      "learning_rate": 0.00013888335897203872,
+      "loss": 9.0955,
+      "step": 10240
+    },
+    {
+      "epoch": 0.04642134839086232,
+      "grad_norm": 8.361885070800781,
+      "learning_rate": 0.00013915482761741018,
+      "loss": 9.07,
+      "step": 10260
+    },
+    {
+      "epoch": 0.046511838348739246,
+      "grad_norm": 7.196146011352539,
+      "learning_rate": 0.00013942629626278164,
+      "loss": 9.0528,
+      "step": 10280
+    },
+    {
+      "epoch": 0.046602328306616175,
+      "grad_norm": 9.67076587677002,
+      "learning_rate": 0.0001396977649081531,
+      "loss": 9.0546,
+      "step": 10300
+    },
+    {
+      "epoch": 0.0466928182644931,
+      "grad_norm": 10.09327220916748,
+      "learning_rate": 0.00013996923355352457,
+      "loss": 9.0741,
+      "step": 10320
+    },
+    {
+      "epoch": 0.04678330822237002,
+      "grad_norm": 9.639015197753906,
+      "learning_rate": 0.00014024070219889603,
+      "loss": 9.0633,
+      "step": 10340
+    },
+    {
+      "epoch": 0.04687379818024695,
+      "grad_norm": 10.251932144165039,
+      "learning_rate": 0.0001405121708442675,
+      "loss": 9.0446,
+      "step": 10360
+    },
+    {
+      "epoch": 0.04696428813812387,
+      "grad_norm": 11.07875919342041,
+      "learning_rate": 0.00014078363948963896,
+      "loss": 9.0418,
+      "step": 10380
+    },
+    {
+      "epoch": 0.047054778096000796,
+      "grad_norm": 9.328507423400879,
+      "learning_rate": 0.00014105510813501042,
+      "loss": 9.0287,
+      "step": 10400
+    },
+    {
+      "epoch": 0.04714526805387772,
+      "grad_norm": 7.056753635406494,
+      "learning_rate": 0.00014132657678038186,
+      "loss": 9.0362,
+      "step": 10420
+    },
+    {
+      "epoch": 0.04723575801175465,
+      "grad_norm": 8.899680137634277,
+      "learning_rate": 0.0001415980454257533,
+      "loss": 9.036,
+      "step": 10440
+    },
+    {
+      "epoch": 0.04732624796963157,
+      "grad_norm": 9.175132751464844,
+      "learning_rate": 0.00014186951407112476,
+      "loss": 9.0444,
+      "step": 10460
+    },
+    {
+      "epoch": 0.047416737927508494,
+      "grad_norm": 9.374978065490723,
+      "learning_rate": 0.00014214098271649622,
+      "loss": 9.0372,
+      "step": 10480
+    },
+    {
+      "epoch": 0.04750722788538542,
+      "grad_norm": 9.893750190734863,
+      "learning_rate": 0.00014241245136186769,
+      "loss": 9.0424,
+      "step": 10500
+    },
+    {
+      "epoch": 0.04759771784326235,
+      "grad_norm": 7.787280082702637,
+      "learning_rate": 0.00014265677314270202,
+      "loss": 8.9691,
+      "step": 10520
+    },
+    {
+      "epoch": 0.04768820780113927,
+      "grad_norm": 17.40734100341797,
+      "learning_rate": 0.00014277893403311917,
+      "loss": 8.2225,
+      "step": 10540
+    },
+    {
+      "epoch": 0.04777869775901619,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014286037462673062,
+      "loss": 6.6046,
+      "step": 10560
+    },
+    {
+      "epoch": 0.047869187716893115,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001429146683558049,
+      "loss": 3.0921,
+      "step": 10580
+    },
+    {
+      "epoch": 0.047959677674770045,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014294181522034205,
+      "loss": 3.9765,
+      "step": 10600
+    },
+    {
+      "epoch": 0.04805016763264697,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 6.9972,
+      "step": 10620
+    },
+    {
+      "epoch": 0.04814065759052389,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10640
+    },
+    {
+      "epoch": 0.04823114754840081,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10660
+    },
+    {
+      "epoch": 0.04832163750627774,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10680
+    },
+    {
+      "epoch": 0.048412127464154665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10700
+    },
+    {
+      "epoch": 0.04850261742203159,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10720
+    },
+    {
+      "epoch": 0.04859310737990852,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10740
+    },
+    {
+      "epoch": 0.04868359733778544,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10760
+    },
+    {
+      "epoch": 0.04877408729566236,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10780
+    },
+    {
+      "epoch": 0.048864577253539286,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10800
+    },
+    {
+      "epoch": 0.048955067211416216,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10820
+    },
+    {
+      "epoch": 0.04904555716929314,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10840
+    },
+    {
+      "epoch": 0.04913604712717006,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10860
+    },
+    {
+      "epoch": 0.049226537085046984,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10880
+    },
+    {
+      "epoch": 0.049317027042923914,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10900
+    },
+    {
+      "epoch": 0.04940751700080084,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10920
+    },
+    {
+      "epoch": 0.04949800695867776,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10940
+    },
+    {
+      "epoch": 0.04958849691655468,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10960
+    },
+    {
+      "epoch": 0.04967898687443161,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 10980
+    },
+    {
+      "epoch": 0.049769476832308535,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11000
+    },
+    {
+      "epoch": 0.04985996679018546,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11020
+    },
+    {
+      "epoch": 0.04995045674806239,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11040
+    },
+    {
+      "epoch": 0.05004094670593931,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11060
+    },
+    {
+      "epoch": 0.05013143666381623,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11080
+    },
+    {
+      "epoch": 0.050221926621693155,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11100
+    },
+    {
+      "epoch": 0.050312416579570085,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11120
+    },
+    {
+      "epoch": 0.05040290653744701,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11140
+    },
+    {
+      "epoch": 0.05049339649532393,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11160
+    },
+    {
+      "epoch": 0.05058388645320085,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11180
+    },
+    {
+      "epoch": 0.05067437641107778,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11200
+    },
+    {
+      "epoch": 0.050764866368954706,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11220
+    },
+    {
+      "epoch": 0.05085535632683163,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11240
+    },
+    {
+      "epoch": 0.05094584628470855,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11260
+    },
+    {
+      "epoch": 0.05103633624258548,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11280
+    },
+    {
+      "epoch": 0.051126826200462404,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11300
+    },
+    {
+      "epoch": 0.05121731615833933,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11320
+    },
+    {
+      "epoch": 0.051307806116216256,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11340
+    },
+    {
+      "epoch": 0.05139829607409318,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11360
+    },
+    {
+      "epoch": 0.0514887860319701,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11380
+    },
+    {
+      "epoch": 0.051579275989847025,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11400
+    },
+    {
+      "epoch": 0.051669765947723954,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11420
+    },
+    {
+      "epoch": 0.05176025590560088,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11440
+    },
+    {
+      "epoch": 0.0518507458634778,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11460
+    },
+    {
+      "epoch": 0.05194123582135472,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11480
+    },
+    {
+      "epoch": 0.05203172577923165,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11500
+    },
+    {
+      "epoch": 0.052122215737108575,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11520
+    },
+    {
+      "epoch": 0.0522127056949855,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11540
+    },
+    {
+      "epoch": 0.05230319565286242,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11560
+    },
+    {
+      "epoch": 0.05239368561073935,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11580
+    },
+    {
+      "epoch": 0.05248417556861627,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11600
+    },
+    {
+      "epoch": 0.052574665526493196,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11620
+    },
+    {
+      "epoch": 0.05266515548437012,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11640
+    },
+    {
+      "epoch": 0.05275564544224705,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11660
+    },
+    {
+      "epoch": 0.05284613540012397,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11680
+    },
+    {
+      "epoch": 0.052936625358000894,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11700
+    },
+    {
+      "epoch": 0.053027115315877824,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11720
+    },
+    {
+      "epoch": 0.053117605273754746,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11740
+    },
+    {
+      "epoch": 0.05320809523163167,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11760
+    },
+    {
+      "epoch": 0.05329858518950859,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11780
+    },
+    {
+      "epoch": 0.05338907514738552,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11800
+    },
+    {
+      "epoch": 0.053479565105262444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11820
+    },
+    {
+      "epoch": 0.05357005506313937,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11840
+    },
+    {
+      "epoch": 0.05366054502101629,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11860
+    },
+    {
+      "epoch": 0.05375103497889322,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11880
+    },
+    {
+      "epoch": 0.05384152493677014,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11900
+    },
+    {
+      "epoch": 0.053932014894647065,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11920
+    },
+    {
+      "epoch": 0.05402250485252399,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11940
+    },
+    {
+      "epoch": 0.05411299481040092,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11960
+    },
+    {
+      "epoch": 0.05420348476827784,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 11980
+    },
+    {
+      "epoch": 0.05429397472615476,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014298253551714776,
+      "loss": 0.0,
+      "step": 12000
+    },
+    {
+      "epoch": 0.05429397472615476,
+      "eval_accuracy": 0.021626624590642192,
+      "eval_loss": NaN,
+      "eval_runtime": 218.9297,
+      "eval_samples_per_second": 2776.417,
+      "eval_steps_per_second": 10.848,
+      "step": 12000
     }
   ],
   "logging_steps": 20,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
+  "total_flos": 4315086323712000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null