Training in progress, step 500, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +10 -1414
last-checkpoint/training_args.bin +1 -1

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99e38e3602d41711cdbb533cb7d6b84bc03b5cfdcec09079e6e94d8e9c933c1f
 size 577789320

 version https://git-lfs.github.com/spec/v1
+oid sha256:cc0336bab9ad53a5d9ba35f689531e4f56cffd1eb07fbe59ee2bf923acde76a8
 size 577789320

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4324a138b51ccff30cd6582605837ac09263f11a633b6ebf8526d9c133d37e6a
 size 1155772233

 version https://git-lfs.github.com/spec/v1
+oid sha256:52d54884dc3d75a7228a6f73783a44ed6321489769a21ba9feb34fcacc24f3c9
 size 1155772233

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ddee4e1cd9c11fae1531a3888b26c6306dfc6effea2b5b4de3f934096de4907a
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:99eeec94447248854bf811d769a2a208fc950d0961184a4f99f03ffdc252b32b
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bc650e789a99f1d69f79d9aa960ac1927e43a2cad64cee2ef28fa7a0ac21a5a3
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2a1a187666ea0e44f9d015f844e1601f5b4c6844588e1b362a3c9b6a7527a74f
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.3102165162563324,
-  "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-9500",
-  "epoch": 15.916230366492147,
   "eval_steps": 500,
-  "global_step": 9500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -81,1420 +81,16 @@
     {
       "epoch": 0.837696335078534,
       "eval_loss": 0.4566049873828888,
-      "eval_runtime": 268.5202,
-      "eval_samples_per_second": 31.614,
-      "eval_steps_per_second": 3.955,
       "step": 500
-    },
-    {
-      "epoch": 0.9214659685863874,
-      "grad_norm": 1.9436837434768677,
-      "learning_rate": 2.7450000000000003e-05,
-      "loss": 0.5079,
-      "step": 550
-    },
-    {
-      "epoch": 1.0052356020942408,
-      "grad_norm": 1.819956660270691,
-      "learning_rate": 2.995e-05,
-      "loss": 0.4969,
-      "step": 600
-    },
-    {
-      "epoch": 1.0890052356020943,
-      "grad_norm": 5.457251071929932,
-      "learning_rate": 3.245e-05,
-      "loss": 0.4977,
-      "step": 650
-    },
-    {
-      "epoch": 1.1727748691099475,
-      "grad_norm": 3.183980703353882,
-      "learning_rate": 3.495e-05,
-      "loss": 0.4923,
-      "step": 700
-    },
-    {
-      "epoch": 1.256544502617801,
-      "grad_norm": 7.1660051345825195,
-      "learning_rate": 3.745e-05,
-      "loss": 0.4802,
-      "step": 750
-    },
-    {
-      "epoch": 1.3403141361256545,
-      "grad_norm": 5.499026775360107,
-      "learning_rate": 3.995e-05,
-      "loss": 0.4754,
-      "step": 800
-    },
-    {
-      "epoch": 1.4240837696335078,
-      "grad_norm": 2.8053908348083496,
-      "learning_rate": 4.245e-05,
-      "loss": 0.4669,
-      "step": 850
-    },
-    {
-      "epoch": 1.5078534031413613,
-      "grad_norm": 3.017005443572998,
-      "learning_rate": 4.495e-05,
-      "loss": 0.4604,
-      "step": 900
-    },
-    {
-      "epoch": 1.5916230366492146,
-      "grad_norm": 2.7971177101135254,
-      "learning_rate": 4.745e-05,
-      "loss": 0.4565,
-      "step": 950
-    },
-    {
-      "epoch": 1.675392670157068,
-      "grad_norm": 3.1588356494903564,
-      "learning_rate": 4.995e-05,
-      "loss": 0.455,
-      "step": 1000
-    },
-    {
-      "epoch": 1.675392670157068,
-      "eval_loss": 0.40312233567237854,
-      "eval_runtime": 271.3585,
-      "eval_samples_per_second": 31.283,
-      "eval_steps_per_second": 3.914,
-      "step": 1000
-    },
-    {
-      "epoch": 1.7591623036649215,
-      "grad_norm": 2.2053232192993164,
-      "learning_rate": 5.245e-05,
-      "loss": 0.4543,
-      "step": 1050
-    },
-    {
-      "epoch": 1.8429319371727748,
-      "grad_norm": 2.0562164783477783,
-      "learning_rate": 5.495e-05,
-      "loss": 0.4456,
-      "step": 1100
-    },
-    {
-      "epoch": 1.9267015706806283,
-      "grad_norm": 2.730119466781616,
-      "learning_rate": 5.745e-05,
-      "loss": 0.4355,
-      "step": 1150
-    },
-    {
-      "epoch": 2.0104712041884816,
-      "grad_norm": 1.7484283447265625,
-      "learning_rate": 5.995000000000001e-05,
-      "loss": 0.4299,
-      "step": 1200
-    },
-    {
-      "epoch": 2.094240837696335,
-      "grad_norm": 1.1786061525344849,
-      "learning_rate": 6.245000000000001e-05,
-      "loss": 0.4305,
-      "step": 1250
-    },
-    {
-      "epoch": 2.1780104712041886,
-      "grad_norm": 1.98978590965271,
-      "learning_rate": 6.494999999999999e-05,
-      "loss": 0.4295,
-      "step": 1300
-    },
-    {
-      "epoch": 2.261780104712042,
-      "grad_norm": 2.818659782409668,
-      "learning_rate": 6.745e-05,
-      "loss": 0.4235,
-      "step": 1350
-    },
-    {
-      "epoch": 2.345549738219895,
-      "grad_norm": 2.3864262104034424,
-      "learning_rate": 6.995e-05,
-      "loss": 0.4271,
-      "step": 1400
-    },
-    {
-      "epoch": 2.4293193717277486,
-      "grad_norm": 1.3647903203964233,
-      "learning_rate": 7.245000000000001e-05,
-      "loss": 0.4208,
-      "step": 1450
-    },
-    {
-      "epoch": 2.513089005235602,
-      "grad_norm": 2.2144172191619873,
-      "learning_rate": 7.495e-05,
-      "loss": 0.4175,
-      "step": 1500
-    },
-    {
-      "epoch": 2.513089005235602,
-      "eval_loss": 0.3777858018875122,
-      "eval_runtime": 273.3281,
-      "eval_samples_per_second": 31.058,
-      "eval_steps_per_second": 3.885,
-      "step": 1500
-    },
-    {
-      "epoch": 2.5968586387434556,
-      "grad_norm": 1.6483193635940552,
-      "learning_rate": 7.745e-05,
-      "loss": 0.414,
-      "step": 1550
-    },
-    {
-      "epoch": 2.680628272251309,
-      "grad_norm": 1.7688554525375366,
-      "learning_rate": 7.995e-05,
-      "loss": 0.4153,
-      "step": 1600
-    },
-    {
-      "epoch": 2.7643979057591626,
-      "grad_norm": 1.2314317226409912,
-      "learning_rate": 8.245e-05,
-      "loss": 0.4089,
-      "step": 1650
-    },
-    {
-      "epoch": 2.8481675392670156,
-      "grad_norm": 1.6623793840408325,
-      "learning_rate": 8.495e-05,
-      "loss": 0.4124,
-      "step": 1700
-    },
-    {
-      "epoch": 2.931937172774869,
-      "grad_norm": 3.812507390975952,
-      "learning_rate": 8.745000000000001e-05,
-      "loss": 0.4112,
-      "step": 1750
-    },
-    {
-      "epoch": 3.0157068062827226,
-      "grad_norm": 2.141019821166992,
-      "learning_rate": 8.995e-05,
-      "loss": 0.4081,
-      "step": 1800
-    },
-    {
-      "epoch": 3.099476439790576,
-      "grad_norm": 1.8928133249282837,
-      "learning_rate": 9.245e-05,
-      "loss": 0.4067,
-      "step": 1850
-    },
-    {
-      "epoch": 3.183246073298429,
-      "grad_norm": 2.322817087173462,
-      "learning_rate": 9.495e-05,
-      "loss": 0.4088,
-      "step": 1900
-    },
-    {
-      "epoch": 3.2670157068062826,
-      "grad_norm": 2.1984918117523193,
-      "learning_rate": 9.745000000000001e-05,
-      "loss": 0.3976,
-      "step": 1950
-    },
-    {
-      "epoch": 3.350785340314136,
-      "grad_norm": 2.0455121994018555,
-      "learning_rate": 9.995e-05,
-      "loss": 0.4022,
-      "step": 2000
-    },
-    {
-      "epoch": 3.350785340314136,
-      "eval_loss": 0.3677983582019806,
-      "eval_runtime": 274.4574,
-      "eval_samples_per_second": 30.93,
-      "eval_steps_per_second": 3.869,
-      "step": 2000
-    },
-    {
-      "epoch": 3.4345549738219896,
-      "grad_norm": 1.2897744178771973,
-      "learning_rate": 9.951e-05,
-      "loss": 0.4026,
-      "step": 2050
-    },
-    {
-      "epoch": 3.518324607329843,
-      "grad_norm": 1.470860242843628,
-      "learning_rate": 9.901e-05,
-      "loss": 0.4008,
-      "step": 2100
-    },
-    {
-      "epoch": 3.6020942408376966,
-      "grad_norm": 1.2159388065338135,
-      "learning_rate": 9.851e-05,
-      "loss": 0.3971,
-      "step": 2150
-    },
-    {
-      "epoch": 3.6858638743455496,
-      "grad_norm": 2.0348379611968994,
-      "learning_rate": 9.801e-05,
-      "loss": 0.396,
-      "step": 2200
-    },
-    {
-      "epoch": 3.769633507853403,
-      "grad_norm": 1.7535659074783325,
-      "learning_rate": 9.751e-05,
-      "loss": 0.3929,
-      "step": 2250
-    },
-    {
-      "epoch": 3.8534031413612566,
-      "grad_norm": 1.361984372138977,
-      "learning_rate": 9.701e-05,
-      "loss": 0.3905,
-      "step": 2300
-    },
-    {
-      "epoch": 3.93717277486911,
-      "grad_norm": 1.7380383014678955,
-      "learning_rate": 9.651e-05,
-      "loss": 0.3957,
-      "step": 2350
-    },
-    {
-      "epoch": 4.020942408376963,
-      "grad_norm": 1.2679184675216675,
-      "learning_rate": 9.601e-05,
-      "loss": 0.388,
-      "step": 2400
-    },
-    {
-      "epoch": 4.104712041884817,
-      "grad_norm": 1.274625301361084,
-      "learning_rate": 9.551e-05,
-      "loss": 0.3887,
-      "step": 2450
-    },
-    {
-      "epoch": 4.18848167539267,
-      "grad_norm": 1.813714861869812,
-      "learning_rate": 9.501e-05,
-      "loss": 0.3865,
-      "step": 2500
-    },
-    {
-      "epoch": 4.18848167539267,
-      "eval_loss": 0.35398951172828674,
-      "eval_runtime": 271.385,
-      "eval_samples_per_second": 31.28,
-      "eval_steps_per_second": 3.913,
-      "step": 2500
-    },
-    {
-      "epoch": 4.272251308900524,
-      "grad_norm": 2.468984842300415,
-      "learning_rate": 9.451000000000002e-05,
-      "loss": 0.3902,
-      "step": 2550
-    },
-    {
-      "epoch": 4.356020942408377,
-      "grad_norm": 1.2810943126678467,
-      "learning_rate": 9.401e-05,
-      "loss": 0.386,
-      "step": 2600
-    },
-    {
-      "epoch": 4.439790575916231,
-      "grad_norm": 1.6781765222549438,
-      "learning_rate": 9.351e-05,
-      "loss": 0.383,
-      "step": 2650
-    },
-    {
-      "epoch": 4.523560209424084,
-      "grad_norm": 1.617163896560669,
-      "learning_rate": 9.301e-05,
-      "loss": 0.3849,
-      "step": 2700
-    },
-    {
-      "epoch": 4.607329842931938,
-      "grad_norm": 1.4169151782989502,
-      "learning_rate": 9.251000000000001e-05,
-      "loss": 0.3807,
-      "step": 2750
-    },
-    {
-      "epoch": 4.69109947643979,
-      "grad_norm": 1.1944037675857544,
-      "learning_rate": 9.201000000000001e-05,
-      "loss": 0.3838,
-      "step": 2800
-    },
-    {
-      "epoch": 4.774869109947644,
-      "grad_norm": 1.7312718629837036,
-      "learning_rate": 9.151000000000001e-05,
-      "loss": 0.3808,
-      "step": 2850
-    },
-    {
-      "epoch": 4.858638743455497,
-      "grad_norm": 1.357228398323059,
-      "learning_rate": 9.101000000000001e-05,
-      "loss": 0.3832,
-      "step": 2900
-    },
-    {
-      "epoch": 4.942408376963351,
-      "grad_norm": 1.2495553493499756,
-      "learning_rate": 9.051000000000001e-05,
-      "loss": 0.3837,
-      "step": 2950
-    },
-    {
-      "epoch": 5.026178010471204,
-      "grad_norm": 1.3688994646072388,
-      "learning_rate": 9.001e-05,
-      "loss": 0.3802,
-      "step": 3000
-    },
-    {
-      "epoch": 5.026178010471204,
-      "eval_loss": 0.3458922803401947,
-      "eval_runtime": 277.371,
-      "eval_samples_per_second": 30.605,
-      "eval_steps_per_second": 3.829,
-      "step": 3000
-    },
-    {
-      "epoch": 5.109947643979058,
-      "grad_norm": 1.0916550159454346,
-      "learning_rate": 8.951e-05,
-      "loss": 0.3747,
-      "step": 3050
-    },
-    {
-      "epoch": 5.193717277486911,
-      "grad_norm": 1.4605640172958374,
-      "learning_rate": 8.901e-05,
-      "loss": 0.3765,
-      "step": 3100
-    },
-    {
-      "epoch": 5.277486910994765,
-      "grad_norm": 1.302049994468689,
-      "learning_rate": 8.851e-05,
-      "loss": 0.3753,
-      "step": 3150
-    },
-    {
-      "epoch": 5.361256544502618,
-      "grad_norm": 1.0380531549453735,
-      "learning_rate": 8.801e-05,
-      "loss": 0.3735,
-      "step": 3200
-    },
-    {
-      "epoch": 5.445026178010472,
-      "grad_norm": 2.157710075378418,
-      "learning_rate": 8.751000000000001e-05,
-      "loss": 0.3766,
-      "step": 3250
-    },
-    {
-      "epoch": 5.528795811518324,
-      "grad_norm": 2.2072594165802,
-      "learning_rate": 8.701000000000001e-05,
-      "loss": 0.3767,
-      "step": 3300
-    },
-    {
-      "epoch": 5.612565445026178,
-      "grad_norm": 1.258347749710083,
-      "learning_rate": 8.651e-05,
-      "loss": 0.3709,
-      "step": 3350
-    },
-    {
-      "epoch": 5.696335078534031,
-      "grad_norm": 1.7026106119155884,
-      "learning_rate": 8.601e-05,
-      "loss": 0.3715,
-      "step": 3400
-    },
-    {
-      "epoch": 5.780104712041885,
-      "grad_norm": 1.1708229780197144,
-      "learning_rate": 8.551e-05,
-      "loss": 0.3716,
-      "step": 3450
-    },
-    {
-      "epoch": 5.863874345549738,
-      "grad_norm": 2.3675355911254883,
-      "learning_rate": 8.501e-05,
-      "loss": 0.3693,
-      "step": 3500
-    },
-    {
-      "epoch": 5.863874345549738,
-      "eval_loss": 0.3417563736438751,
-      "eval_runtime": 272.8827,
-      "eval_samples_per_second": 31.109,
-      "eval_steps_per_second": 3.892,
-      "step": 3500
-    },
-    {
-      "epoch": 5.947643979057592,
-      "grad_norm": 1.6144191026687622,
-      "learning_rate": 8.451e-05,
-      "loss": 0.3666,
-      "step": 3550
-    },
-    {
-      "epoch": 6.031413612565445,
-      "grad_norm": 1.4944205284118652,
-      "learning_rate": 8.401e-05,
-      "loss": 0.3657,
-      "step": 3600
-    },
-    {
-      "epoch": 6.115183246073299,
-      "grad_norm": 1.0198278427124023,
-      "learning_rate": 8.351e-05,
-      "loss": 0.3702,
-      "step": 3650
-    },
-    {
-      "epoch": 6.198952879581152,
-      "grad_norm": 2.195380926132202,
-      "learning_rate": 8.300999999999999e-05,
-      "loss": 0.3686,
-      "step": 3700
-    },
-    {
-      "epoch": 6.282722513089006,
-      "grad_norm": 1.3650749921798706,
-      "learning_rate": 8.251e-05,
-      "loss": 0.3701,
-      "step": 3750
-    },
-    {
-      "epoch": 6.366492146596858,
-      "grad_norm": 1.6887727975845337,
-      "learning_rate": 8.201000000000001e-05,
-      "loss": 0.3677,
-      "step": 3800
-    },
-    {
-      "epoch": 6.450261780104712,
-      "grad_norm": 0.8709685206413269,
-      "learning_rate": 8.151000000000001e-05,
-      "loss": 0.3678,
-      "step": 3850
-    },
-    {
-      "epoch": 6.534031413612565,
-      "grad_norm": 1.0899595022201538,
-      "learning_rate": 8.101000000000001e-05,
-      "loss": 0.3641,
-      "step": 3900
-    },
-    {
-      "epoch": 6.617801047120419,
-      "grad_norm": 1.1222867965698242,
-      "learning_rate": 8.051000000000001e-05,
-      "loss": 0.3691,
-      "step": 3950
-    },
-    {
-      "epoch": 6.701570680628272,
-      "grad_norm": 1.0771104097366333,
-      "learning_rate": 8.001e-05,
-      "loss": 0.3674,
-      "step": 4000
-    },
-    {
-      "epoch": 6.701570680628272,
-      "eval_loss": 0.3313756585121155,
-      "eval_runtime": 279.286,
-      "eval_samples_per_second": 30.395,
-      "eval_steps_per_second": 3.803,
-      "step": 4000
-    },
-    {
-      "epoch": 6.785340314136126,
-      "grad_norm": 1.868295669555664,
-      "learning_rate": 7.951e-05,
-      "loss": 0.3617,
-      "step": 4050
-    },
-    {
-      "epoch": 6.869109947643979,
-      "grad_norm": 1.0599360466003418,
-      "learning_rate": 7.901e-05,
-      "loss": 0.3637,
-      "step": 4100
-    },
-    {
-      "epoch": 6.952879581151833,
-      "grad_norm": 1.4801158905029297,
-      "learning_rate": 7.851e-05,
-      "loss": 0.363,
-      "step": 4150
-    },
-    {
-      "epoch": 7.036649214659686,
-      "grad_norm": 1.137289047241211,
-      "learning_rate": 7.801000000000001e-05,
-      "loss": 0.3622,
-      "step": 4200
-    },
-    {
-      "epoch": 7.12041884816754,
-      "grad_norm": 1.2109190225601196,
-      "learning_rate": 7.751000000000001e-05,
-      "loss": 0.3668,
-      "step": 4250
-    },
-    {
-      "epoch": 7.204188481675392,
-      "grad_norm": 1.1171132326126099,
-      "learning_rate": 7.701000000000001e-05,
-      "loss": 0.3594,
-      "step": 4300
-    },
-    {
-      "epoch": 7.287958115183246,
-      "grad_norm": 1.2529895305633545,
-      "learning_rate": 7.651e-05,
-      "loss": 0.3635,
-      "step": 4350
-    },
-    {
-      "epoch": 7.371727748691099,
-      "grad_norm": 1.352792739868164,
-      "learning_rate": 7.601e-05,
-      "loss": 0.3627,
-      "step": 4400
-    },
-    {
-      "epoch": 7.455497382198953,
-      "grad_norm": 0.8809813261032104,
-      "learning_rate": 7.552e-05,
-      "loss": 0.3647,
-      "step": 4450
-    },
-    {
-      "epoch": 7.539267015706806,
-      "grad_norm": 4.0386962890625,
-      "learning_rate": 7.502e-05,
-      "loss": 0.3582,
-      "step": 4500
-    },
-    {
-      "epoch": 7.539267015706806,
-      "eval_loss": 0.32692766189575195,
-      "eval_runtime": 272.0854,
-      "eval_samples_per_second": 31.2,
-      "eval_steps_per_second": 3.903,
-      "step": 4500
-    },
-    {
-      "epoch": 7.62303664921466,
-      "grad_norm": 1.616075873374939,
-      "learning_rate": 7.452e-05,
-      "loss": 0.3603,
-      "step": 4550
-    },
-    {
-      "epoch": 7.706806282722513,
-      "grad_norm": 2.2668583393096924,
-      "learning_rate": 7.402e-05,
-      "loss": 0.3622,
-      "step": 4600
-    },
-    {
-      "epoch": 7.790575916230367,
-      "grad_norm": 1.0464789867401123,
-      "learning_rate": 7.352e-05,
-      "loss": 0.3667,
-      "step": 4650
-    },
-    {
-      "epoch": 7.87434554973822,
-      "grad_norm": 1.2528297901153564,
-      "learning_rate": 7.302e-05,
-      "loss": 0.3631,
-      "step": 4700
-    },
-    {
-      "epoch": 7.958115183246074,
-      "grad_norm": 1.72895085811615,
-      "learning_rate": 7.252e-05,
-      "loss": 0.3567,
-      "step": 4750
-    },
-    {
-      "epoch": 8.041884816753926,
-      "grad_norm": 1.5020617246627808,
-      "learning_rate": 7.202e-05,
-      "loss": 0.3553,
-      "step": 4800
-    },
-    {
-      "epoch": 8.12565445026178,
-      "grad_norm": 1.976888656616211,
-      "learning_rate": 7.151999999999999e-05,
-      "loss": 0.3569,
-      "step": 4850
-    },
-    {
-      "epoch": 8.209424083769633,
-      "grad_norm": 1.156580924987793,
-      "learning_rate": 7.102000000000001e-05,
-      "loss": 0.3659,
-      "step": 4900
-    },
-    {
-      "epoch": 8.293193717277488,
-      "grad_norm": 0.9017566442489624,
-      "learning_rate": 7.052000000000001e-05,
-      "loss": 0.3549,
-      "step": 4950
-    },
-    {
-      "epoch": 8.37696335078534,
-      "grad_norm": 1.5168513059616089,
-      "learning_rate": 7.002000000000001e-05,
-      "loss": 0.362,
-      "step": 5000
-    },
-    {
-      "epoch": 8.37696335078534,
-      "eval_loss": 0.34056970477104187,
-      "eval_runtime": 276.7614,
-      "eval_samples_per_second": 30.673,
-      "eval_steps_per_second": 3.837,
-      "step": 5000
-    },
-    {
-      "epoch": 8.460732984293193,
-      "grad_norm": 1.111985206604004,
-      "learning_rate": 6.952000000000001e-05,
-      "loss": 0.3553,
-      "step": 5050
-    },
-    {
-      "epoch": 8.544502617801047,
-      "grad_norm": 1.3966108560562134,
-      "learning_rate": 6.902000000000001e-05,
-      "loss": 0.3545,
-      "step": 5100
-    },
-    {
-      "epoch": 8.6282722513089,
-      "grad_norm": 1.3428140878677368,
-      "learning_rate": 6.852e-05,
-      "loss": 0.3609,
-      "step": 5150
-    },
-    {
-      "epoch": 8.712041884816754,
-      "grad_norm": 1.9436802864074707,
-      "learning_rate": 6.802e-05,
-      "loss": 0.3547,
-      "step": 5200
-    },
-    {
-      "epoch": 8.795811518324607,
-      "grad_norm": 1.1481266021728516,
-      "learning_rate": 6.752e-05,
-      "loss": 0.3569,
-      "step": 5250
-    },
-    {
-      "epoch": 8.879581151832461,
-      "grad_norm": 1.410223364830017,
-      "learning_rate": 6.702e-05,
-      "loss": 0.3558,
-      "step": 5300
-    },
-    {
-      "epoch": 8.963350785340314,
-      "grad_norm": 1.7548959255218506,
-      "learning_rate": 6.652000000000001e-05,
-      "loss": 0.3561,
-      "step": 5350
-    },
-    {
-      "epoch": 9.047120418848168,
-      "grad_norm": 1.343935489654541,
-      "learning_rate": 6.602000000000001e-05,
-      "loss": 0.3609,
-      "step": 5400
-    },
-    {
-      "epoch": 9.13089005235602,
-      "grad_norm": 1.5190401077270508,
-      "learning_rate": 6.552000000000001e-05,
-      "loss": 0.3504,
-      "step": 5450
-    },
-    {
-      "epoch": 9.214659685863875,
-      "grad_norm": 0.8521016240119934,
-      "learning_rate": 6.502e-05,
-      "loss": 0.3521,
-      "step": 5500
-    },
-    {
-      "epoch": 9.214659685863875,
-      "eval_loss": 0.3218235671520233,
-      "eval_runtime": 279.5684,
-      "eval_samples_per_second": 30.365,
-      "eval_steps_per_second": 3.799,
-      "step": 5500
-    },
-    {
-      "epoch": 9.298429319371728,
-      "grad_norm": 1.0284796953201294,
-      "learning_rate": 6.452e-05,
-      "loss": 0.356,
-      "step": 5550
-    },
-    {
-      "epoch": 9.38219895287958,
-      "grad_norm": 1.8278234004974365,
-      "learning_rate": 6.402e-05,
-      "loss": 0.356,
-      "step": 5600
-    },
-    {
-      "epoch": 9.465968586387435,
-      "grad_norm": 0.9208963513374329,
-      "learning_rate": 6.352e-05,
-      "loss": 0.3504,
-      "step": 5650
-    },
-    {
-      "epoch": 9.549738219895287,
-      "grad_norm": 1.295639991760254,
-      "learning_rate": 6.302e-05,
-      "loss": 0.3551,
-      "step": 5700
-    },
-    {
-      "epoch": 9.633507853403142,
-      "grad_norm": 0.9757601022720337,
-      "learning_rate": 6.252e-05,
-      "loss": 0.3529,
-      "step": 5750
-    },
-    {
-      "epoch": 9.717277486910994,
-      "grad_norm": 1.451418399810791,
-      "learning_rate": 6.202e-05,
-      "loss": 0.3537,
-      "step": 5800
-    },
-    {
-      "epoch": 9.801047120418849,
-      "grad_norm": 2.2001028060913086,
-      "learning_rate": 6.152e-05,
-      "loss": 0.3522,
-      "step": 5850
-    },
-    {
-      "epoch": 9.884816753926701,
-      "grad_norm": 1.1149827241897583,
-      "learning_rate": 6.102e-05,
-      "loss": 0.3472,
-      "step": 5900
-    },
-    {
-      "epoch": 9.968586387434556,
-      "grad_norm": 1.4035720825195312,
-      "learning_rate": 6.0519999999999997e-05,
-      "loss": 0.3525,
-      "step": 5950
-    },
-    {
-      "epoch": 10.052356020942408,
-      "grad_norm": 1.0732487440109253,
-      "learning_rate": 6.002e-05,
-      "loss": 0.3485,
-      "step": 6000
-    },
-    {
-      "epoch": 10.052356020942408,
-      "eval_loss": 0.31853485107421875,
-      "eval_runtime": 271.779,
-      "eval_samples_per_second": 31.235,
-      "eval_steps_per_second": 3.908,
-      "step": 6000
-    },
-    {
-      "epoch": 10.136125654450261,
-      "grad_norm": 1.2576690912246704,
-      "learning_rate": 5.952e-05,
-      "loss": 0.3488,
-      "step": 6050
-    },
-    {
-      "epoch": 10.219895287958115,
-      "grad_norm": 1.2645186185836792,
-      "learning_rate": 5.902e-05,
-      "loss": 0.3537,
-      "step": 6100
-    },
-    {
-      "epoch": 10.303664921465968,
-      "grad_norm": 1.743445634841919,
-      "learning_rate": 5.852000000000001e-05,
-      "loss": 0.3501,
-      "step": 6150
-    },
-    {
-      "epoch": 10.387434554973822,
-      "grad_norm": 1.2827191352844238,
-      "learning_rate": 5.802000000000001e-05,
-      "loss": 0.349,
-      "step": 6200
-    },
-    {
-      "epoch": 10.471204188481675,
-      "grad_norm": 1.0109118223190308,
-      "learning_rate": 5.7520000000000005e-05,
-      "loss": 0.3495,
-      "step": 6250
-    },
-    {
-      "epoch": 10.55497382198953,
-      "grad_norm": 1.420745611190796,
-      "learning_rate": 5.7020000000000006e-05,
-      "loss": 0.3493,
-      "step": 6300
-    },
-    {
-      "epoch": 10.638743455497382,
-      "grad_norm": 1.2105921506881714,
-      "learning_rate": 5.652000000000001e-05,
-      "loss": 0.3487,
-      "step": 6350
-    },
-    {
-      "epoch": 10.722513089005236,
-      "grad_norm": 1.1536401510238647,
-      "learning_rate": 5.602000000000001e-05,
-      "loss": 0.35,
-      "step": 6400
-    },
-    {
-      "epoch": 10.806282722513089,
-      "grad_norm": 1.0635104179382324,
-      "learning_rate": 5.5520000000000004e-05,
-      "loss": 0.3475,
-      "step": 6450
-    },
-    {
-      "epoch": 10.890052356020943,
-      "grad_norm": 1.4069427251815796,
-      "learning_rate": 5.5020000000000005e-05,
-      "loss": 0.3472,
-      "step": 6500
-    },
-    {
-      "epoch": 10.890052356020943,
-      "eval_loss": 0.3199196457862854,
-      "eval_runtime": 276.9702,
-      "eval_samples_per_second": 30.65,
-      "eval_steps_per_second": 3.834,
-      "step": 6500
-    },
-    {
-      "epoch": 10.973821989528796,
-      "grad_norm": 0.8649620413780212,
-      "learning_rate": 5.4520000000000007e-05,
-      "loss": 0.3496,
-      "step": 6550
-    },
-    {
-      "epoch": 11.057591623036648,
-      "grad_norm": 2.6794686317443848,
-      "learning_rate": 5.402e-05,
-      "loss": 0.3482,
-      "step": 6600
-    },
-    {
-      "epoch": 11.141361256544503,
-      "grad_norm": 1.6224123239517212,
-      "learning_rate": 5.352e-05,
-      "loss": 0.3498,
-      "step": 6650
-    },
-    {
-      "epoch": 11.225130890052355,
-      "grad_norm": 1.2548692226409912,
-      "learning_rate": 5.3020000000000004e-05,
-      "loss": 0.346,
-      "step": 6700
-    },
-    {
-      "epoch": 11.30890052356021,
-      "grad_norm": 1.390360713005066,
-      "learning_rate": 5.2520000000000005e-05,
-      "loss": 0.345,
-      "step": 6750
-    },
-    {
-      "epoch": 11.392670157068062,
-      "grad_norm": 1.1040029525756836,
-      "learning_rate": 5.202e-05,
-      "loss": 0.3477,
-      "step": 6800
-    },
-    {
-      "epoch": 11.476439790575917,
-      "grad_norm": 1.0738588571548462,
-      "learning_rate": 5.152e-05,
-      "loss": 0.3455,
-      "step": 6850
-    },
-    {
-      "epoch": 11.56020942408377,
-      "grad_norm": 1.0175799131393433,
-      "learning_rate": 5.102e-05,
-      "loss": 0.3448,
-      "step": 6900
-    },
-    {
-      "epoch": 11.643979057591624,
-      "grad_norm": 1.8546490669250488,
-      "learning_rate": 5.052e-05,
-      "loss": 0.346,
-      "step": 6950
-    },
-    {
-      "epoch": 11.727748691099476,
-      "grad_norm": 1.7156524658203125,
-      "learning_rate": 5.002e-05,
-      "loss": 0.3469,
-      "step": 7000
-    },
-    {
-      "epoch": 11.727748691099476,
-      "eval_loss": 0.31849026679992676,
-      "eval_runtime": 283.0428,
-      "eval_samples_per_second": 29.992,
-      "eval_steps_per_second": 3.752,
-      "step": 7000
-    },
-    {
-      "epoch": 11.81151832460733,
-      "grad_norm": 1.1094063520431519,
-      "learning_rate": 4.952e-05,
-      "loss": 0.346,
-      "step": 7050
-    },
-    {
-      "epoch": 11.895287958115183,
-      "grad_norm": 1.8263230323791504,
-      "learning_rate": 4.902e-05,
-      "loss": 0.3496,
-      "step": 7100
-    },
-    {
-      "epoch": 11.979057591623036,
-      "grad_norm": 1.4049593210220337,
-      "learning_rate": 4.852e-05,
-      "loss": 0.3433,
-      "step": 7150
-    },
-    {
-      "epoch": 12.06282722513089,
-      "grad_norm": 1.3455963134765625,
-      "learning_rate": 4.8030000000000006e-05,
-      "loss": 0.3518,
-      "step": 7200
-    },
-    {
-      "epoch": 12.146596858638743,
-      "grad_norm": 1.174660325050354,
-      "learning_rate": 4.753e-05,
-      "loss": 0.348,
-      "step": 7250
-    },
-    {
-      "epoch": 12.230366492146597,
-      "grad_norm": 1.2765902280807495,
-      "learning_rate": 4.703e-05,
-      "loss": 0.345,
-      "step": 7300
-    },
-    {
-      "epoch": 12.31413612565445,
-      "grad_norm": 1.419295072555542,
-      "learning_rate": 4.6530000000000003e-05,
-      "loss": 0.3436,
-      "step": 7350
-    },
-    {
-      "epoch": 12.397905759162304,
-      "grad_norm": 1.3437247276306152,
-      "learning_rate": 4.603e-05,
-      "loss": 0.3469,
-      "step": 7400
-    },
-    {
-      "epoch": 12.481675392670157,
-      "grad_norm": 1.6074751615524292,
-      "learning_rate": 4.553e-05,
-      "loss": 0.3461,
-      "step": 7450
-    },
-    {
-      "epoch": 12.565445026178011,
-      "grad_norm": 1.432062029838562,
-      "learning_rate": 4.503e-05,
-      "loss": 0.3441,
-      "step": 7500
-    },
-    {
-      "epoch": 12.565445026178011,
-      "eval_loss": 0.3222896158695221,
-      "eval_runtime": 282.6486,
-      "eval_samples_per_second": 30.034,
-      "eval_steps_per_second": 3.757,
-      "step": 7500
-    },
-    {
-      "epoch": 12.649214659685864,
-      "grad_norm": 1.4210392236709595,
-      "learning_rate": 4.453e-05,
-      "loss": 0.3436,
-      "step": 7550
-    },
-    {
-      "epoch": 12.732984293193716,
-      "grad_norm": 1.275467038154602,
-      "learning_rate": 4.4030000000000004e-05,
-      "loss": 0.3453,
-      "step": 7600
-    },
-    {
-      "epoch": 12.81675392670157,
-      "grad_norm": 1.1207870244979858,
-      "learning_rate": 4.3530000000000005e-05,
-      "loss": 0.3438,
-      "step": 7650
-    },
-    {
-      "epoch": 12.900523560209423,
-      "grad_norm": 1.8535631895065308,
-      "learning_rate": 4.3030000000000006e-05,
-      "loss": 0.3442,
-      "step": 7700
-    },
-    {
-      "epoch": 12.984293193717278,
-      "grad_norm": 1.0426372289657593,
-      "learning_rate": 4.253e-05,
-      "loss": 0.3494,
-      "step": 7750
-    },
-    {
-      "epoch": 13.06806282722513,
-      "grad_norm": 1.3337020874023438,
-      "learning_rate": 4.203e-05,
-      "loss": 0.3413,
-      "step": 7800
-    },
-    {
-      "epoch": 13.151832460732985,
-      "grad_norm": 1.017905592918396,
-      "learning_rate": 4.1530000000000004e-05,
-      "loss": 0.3417,
-      "step": 7850
-    },
-    {
-      "epoch": 13.235602094240837,
-      "grad_norm": 1.166343331336975,
-      "learning_rate": 4.103e-05,
-      "loss": 0.3443,
-      "step": 7900
-    },
-    {
-      "epoch": 13.319371727748692,
-      "grad_norm": 1.4170418977737427,
-      "learning_rate": 4.053e-05,
-      "loss": 0.3433,
-      "step": 7950
-    },
-    {
-      "epoch": 13.403141361256544,
-      "grad_norm": 1.125741720199585,
-      "learning_rate": 4.003e-05,
-      "loss": 0.3422,
-      "step": 8000
-    },
-    {
-      "epoch": 13.403141361256544,
-      "eval_loss": 0.31487980484962463,
-      "eval_runtime": 278.3852,
-      "eval_samples_per_second": 30.494,
-      "eval_steps_per_second": 3.815,
-      "step": 8000
-    },
-    {
-      "epoch": 13.486910994764397,
-      "grad_norm": 1.5452402830123901,
-      "learning_rate": 3.953e-05,
-      "loss": 0.3403,
-      "step": 8050
-    },
-    {
-      "epoch": 13.570680628272251,
-      "grad_norm": 0.9096773862838745,
-      "learning_rate": 3.903e-05,
-      "loss": 0.3409,
-      "step": 8100
-    },
-    {
-      "epoch": 13.654450261780104,
-      "grad_norm": 1.6249001026153564,
-      "learning_rate": 3.853e-05,
-      "loss": 0.3414,
-      "step": 8150
-    },
-    {
-      "epoch": 13.738219895287958,
-      "grad_norm": 0.9276340007781982,
-      "learning_rate": 3.803000000000001e-05,
-      "loss": 0.3389,
-      "step": 8200
-    },
-    {
-      "epoch": 13.821989528795811,
-      "grad_norm": 1.7416585683822632,
-      "learning_rate": 3.753e-05,
-      "loss": 0.343,
-      "step": 8250
-    },
-    {
-      "epoch": 13.905759162303665,
-      "grad_norm": 2.2160768508911133,
-      "learning_rate": 3.703e-05,
-      "loss": 0.3402,
-      "step": 8300
-    },
-    {
-      "epoch": 13.989528795811518,
-      "grad_norm": 1.0885984897613525,
-      "learning_rate": 3.6530000000000004e-05,
-      "loss": 0.3407,
-      "step": 8350
-    },
-    {
-      "epoch": 14.073298429319372,
-      "grad_norm": 0.9969326853752136,
-      "learning_rate": 3.6030000000000006e-05,
-      "loss": 0.3447,
-      "step": 8400
-    },
-    {
-      "epoch": 14.157068062827225,
-      "grad_norm": 1.2978531122207642,
-      "learning_rate": 3.553e-05,
-      "loss": 0.3377,
-      "step": 8450
-    },
-    {
-      "epoch": 14.24083769633508,
-      "grad_norm": 1.0465147495269775,
-      "learning_rate": 3.503e-05,
-      "loss": 0.3396,
-      "step": 8500
-    },
-    {
-      "epoch": 14.24083769633508,
-      "eval_loss": 0.310507208108902,
-      "eval_runtime": 279.5625,
-      "eval_samples_per_second": 30.365,
-      "eval_steps_per_second": 3.799,
-      "step": 8500
-    },
-    {
-      "epoch": 14.324607329842932,
-      "grad_norm": 2.537041425704956,
-      "learning_rate": 3.453e-05,
-      "loss": 0.3418,
-      "step": 8550
-    },
-    {
-      "epoch": 14.408376963350785,
-      "grad_norm": 1.3357998132705688,
-      "learning_rate": 3.403e-05,
-      "loss": 0.3408,
-      "step": 8600
-    },
-    {
-      "epoch": 14.492146596858639,
-      "grad_norm": 0.8550173044204712,
-      "learning_rate": 3.353e-05,
-      "loss": 0.3408,
-      "step": 8650
-    },
-    {
-      "epoch": 14.575916230366492,
-      "grad_norm": 1.4455218315124512,
-      "learning_rate": 3.303e-05,
-      "loss": 0.3407,
-      "step": 8700
-    },
-    {
-      "epoch": 14.659685863874346,
-      "grad_norm": 1.0547473430633545,
-      "learning_rate": 3.253e-05,
-      "loss": 0.3382,
-      "step": 8750
-    },
-    {
-      "epoch": 14.743455497382199,
-      "grad_norm": 1.5398694276809692,
-      "learning_rate": 3.2029999999999997e-05,
-      "loss": 0.3402,
-      "step": 8800
-    },
-    {
-      "epoch": 14.827225130890053,
-      "grad_norm": 1.008465051651001,
-      "learning_rate": 3.1530000000000005e-05,
-      "loss": 0.3433,
-      "step": 8850
-    },
-    {
-      "epoch": 14.910994764397905,
-      "grad_norm": 1.8319462537765503,
-      "learning_rate": 3.1030000000000006e-05,
-      "loss": 0.341,
-      "step": 8900
-    },
-    {
-      "epoch": 14.99476439790576,
-      "grad_norm": 1.1432167291641235,
-      "learning_rate": 3.053e-05,
-      "loss": 0.3369,
-      "step": 8950
-    },
-    {
-      "epoch": 15.078534031413612,
-      "grad_norm": 1.098186731338501,
-      "learning_rate": 3.0030000000000002e-05,
-      "loss": 0.3396,
-      "step": 9000
-    },
-    {
-      "epoch": 15.078534031413612,
-      "eval_loss": 0.31039854884147644,
-      "eval_runtime": 280.3967,
-      "eval_samples_per_second": 30.275,
-      "eval_steps_per_second": 3.787,
-      "step": 9000
-    },
-    {
-      "epoch": 15.162303664921467,
-      "grad_norm": 1.0989015102386475,
-      "learning_rate": 2.9530000000000004e-05,
-      "loss": 0.3381,
-      "step": 9050
-    },
-    {
-      "epoch": 15.24607329842932,
-      "grad_norm": 1.1959214210510254,
-      "learning_rate": 2.903e-05,
-      "loss": 0.3381,
-      "step": 9100
-    },
-    {
-      "epoch": 15.329842931937172,
-      "grad_norm": 0.9721996188163757,
-      "learning_rate": 2.853e-05,
-      "loss": 0.3384,
-      "step": 9150
-    },
-    {
-      "epoch": 15.413612565445026,
-      "grad_norm": 1.2921016216278076,
-      "learning_rate": 2.803e-05,
-      "loss": 0.3375,
-      "step": 9200
-    },
-    {
-      "epoch": 15.497382198952879,
-      "grad_norm": 1.1854231357574463,
-      "learning_rate": 2.753e-05,
-      "loss": 0.3389,
-      "step": 9250
-    },
-    {
-      "epoch": 15.581151832460733,
-      "grad_norm": 1.571321725845337,
-      "learning_rate": 2.703e-05,
-      "loss": 0.3406,
-      "step": 9300
-    },
-    {
-      "epoch": 15.664921465968586,
-      "grad_norm": 1.2595016956329346,
-      "learning_rate": 2.6540000000000003e-05,
-      "loss": 0.3392,
-      "step": 9350
-    },
-    {
-      "epoch": 15.74869109947644,
-      "grad_norm": 1.2291969060897827,
-      "learning_rate": 2.6040000000000005e-05,
-      "loss": 0.3362,
-      "step": 9400
-    },
-    {
-      "epoch": 15.832460732984293,
-      "grad_norm": 1.0605494976043701,
-      "learning_rate": 2.5540000000000003e-05,
-      "loss": 0.3388,
-      "step": 9450
-    },
-    {
-      "epoch": 15.916230366492147,
-      "grad_norm": 0.9927255511283875,
-      "learning_rate": 2.504e-05,
-      "loss": 0.3391,
-      "step": 9500
-    },
-    {
-      "epoch": 15.916230366492147,
-      "eval_loss": 0.3102165162563324,
-      "eval_runtime": 279.552,
-      "eval_samples_per_second": 30.366,
-      "eval_steps_per_second": 3.799,
-      "step": 9500
     }
   ],
   "logging_steps": 50,
-  "max_steps": 12000,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 21,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -1508,7 +104,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.7021322045447034e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.4566049873828888,
+  "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-500",
+  "epoch": 0.837696335078534,
   "eval_steps": 500,
+  "global_step": 500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
     {
       "epoch": 0.837696335078534,
       "eval_loss": 0.4566049873828888,
+      "eval_runtime": 281.3712,
+      "eval_samples_per_second": 30.17,
+      "eval_steps_per_second": 3.774,
       "step": 500
     }
   ],
   "logging_steps": 50,
+  "max_steps": 8000,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 14,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 8963407491426432.0,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a94a4f6f1d582f4a7bb7ca6a113897e919df3fe53304bd598160a40931f24f6
 size 5304

 version https://git-lfs.github.com/spec/v1
+oid sha256:11675416f8a34c5963cafc78c11d51d2aedc5632f839698999d98e8c1dadbc99
 size 5304