zlm_b128_le4_s8000 / last-checkpoint / trainer_state.json
{
"best_metric": 0.3102165162563324,
"best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-9500",
"epoch": 15.916230366492147,
"eval_steps": 500,
"global_step": 9500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08376963350785341,
"grad_norm": 2.9717624187469482,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.0424,
"step": 50
},
{
"epoch": 0.16753926701570682,
"grad_norm": 2.9720630645751953,
"learning_rate": 4.950000000000001e-06,
"loss": 0.8474,
"step": 100
},
{
"epoch": 0.2513089005235602,
"grad_norm": 2.445929765701294,
"learning_rate": 7.45e-06,
"loss": 0.7336,
"step": 150
},
{
"epoch": 0.33507853403141363,
"grad_norm": 5.502955913543701,
"learning_rate": 9.950000000000001e-06,
"loss": 0.6492,
"step": 200
},
{
"epoch": 0.418848167539267,
"grad_norm": 2.3356130123138428,
"learning_rate": 1.2450000000000001e-05,
"loss": 0.6133,
"step": 250
},
{
"epoch": 0.5026178010471204,
"grad_norm": 1.937270164489746,
"learning_rate": 1.4950000000000001e-05,
"loss": 0.5889,
"step": 300
},
{
"epoch": 0.5863874345549738,
"grad_norm": 2.392244338989258,
"learning_rate": 1.745e-05,
"loss": 0.5694,
"step": 350
},
{
"epoch": 0.6701570680628273,
"grad_norm": 7.3209919929504395,
"learning_rate": 1.995e-05,
"loss": 0.5477,
"step": 400
},
{
"epoch": 0.7539267015706806,
"grad_norm": 3.415917158126831,
"learning_rate": 2.245e-05,
"loss": 0.5329,
"step": 450
},
{
"epoch": 0.837696335078534,
"grad_norm": 3.0256705284118652,
"learning_rate": 2.495e-05,
"loss": 0.5173,
"step": 500
},
{
"epoch": 0.837696335078534,
"eval_loss": 0.4566049873828888,
"eval_runtime": 268.5202,
"eval_samples_per_second": 31.614,
"eval_steps_per_second": 3.955,
"step": 500
},
{
"epoch": 0.9214659685863874,
"grad_norm": 1.9436837434768677,
"learning_rate": 2.7450000000000003e-05,
"loss": 0.5079,
"step": 550
},
{
"epoch": 1.0052356020942408,
"grad_norm": 1.819956660270691,
"learning_rate": 2.995e-05,
"loss": 0.4969,
"step": 600
},
{
"epoch": 1.0890052356020943,
"grad_norm": 5.457251071929932,
"learning_rate": 3.245e-05,
"loss": 0.4977,
"step": 650
},
{
"epoch": 1.1727748691099475,
"grad_norm": 3.183980703353882,
"learning_rate": 3.495e-05,
"loss": 0.4923,
"step": 700
},
{
"epoch": 1.256544502617801,
"grad_norm": 7.1660051345825195,
"learning_rate": 3.745e-05,
"loss": 0.4802,
"step": 750
},
{
"epoch": 1.3403141361256545,
"grad_norm": 5.499026775360107,
"learning_rate": 3.995e-05,
"loss": 0.4754,
"step": 800
},
{
"epoch": 1.4240837696335078,
"grad_norm": 2.8053908348083496,
"learning_rate": 4.245e-05,
"loss": 0.4669,
"step": 850
},
{
"epoch": 1.5078534031413613,
"grad_norm": 3.017005443572998,
"learning_rate": 4.495e-05,
"loss": 0.4604,
"step": 900
},
{
"epoch": 1.5916230366492146,
"grad_norm": 2.7971177101135254,
"learning_rate": 4.745e-05,
"loss": 0.4565,
"step": 950
},
{
"epoch": 1.675392670157068,
"grad_norm": 3.1588356494903564,
"learning_rate": 4.995e-05,
"loss": 0.455,
"step": 1000
},
{
"epoch": 1.675392670157068,
"eval_loss": 0.40312233567237854,
"eval_runtime": 271.3585,
"eval_samples_per_second": 31.283,
"eval_steps_per_second": 3.914,
"step": 1000
},
{
"epoch": 1.7591623036649215,
"grad_norm": 2.2053232192993164,
"learning_rate": 5.245e-05,
"loss": 0.4543,
"step": 1050
},
{
"epoch": 1.8429319371727748,
"grad_norm": 2.0562164783477783,
"learning_rate": 5.495e-05,
"loss": 0.4456,
"step": 1100
},
{
"epoch": 1.9267015706806283,
"grad_norm": 2.730119466781616,
"learning_rate": 5.745e-05,
"loss": 0.4355,
"step": 1150
},
{
"epoch": 2.0104712041884816,
"grad_norm": 1.7484283447265625,
"learning_rate": 5.995000000000001e-05,
"loss": 0.4299,
"step": 1200
},
{
"epoch": 2.094240837696335,
"grad_norm": 1.1786061525344849,
"learning_rate": 6.245000000000001e-05,
"loss": 0.4305,
"step": 1250
},
{
"epoch": 2.1780104712041886,
"grad_norm": 1.98978590965271,
"learning_rate": 6.494999999999999e-05,
"loss": 0.4295,
"step": 1300
},
{
"epoch": 2.261780104712042,
"grad_norm": 2.818659782409668,
"learning_rate": 6.745e-05,
"loss": 0.4235,
"step": 1350
},
{
"epoch": 2.345549738219895,
"grad_norm": 2.3864262104034424,
"learning_rate": 6.995e-05,
"loss": 0.4271,
"step": 1400
},
{
"epoch": 2.4293193717277486,
"grad_norm": 1.3647903203964233,
"learning_rate": 7.245000000000001e-05,
"loss": 0.4208,
"step": 1450
},
{
"epoch": 2.513089005235602,
"grad_norm": 2.2144172191619873,
"learning_rate": 7.495e-05,
"loss": 0.4175,
"step": 1500
},
{
"epoch": 2.513089005235602,
"eval_loss": 0.3777858018875122,
"eval_runtime": 273.3281,
"eval_samples_per_second": 31.058,
"eval_steps_per_second": 3.885,
"step": 1500
},
{
"epoch": 2.5968586387434556,
"grad_norm": 1.6483193635940552,
"learning_rate": 7.745e-05,
"loss": 0.414,
"step": 1550
},
{
"epoch": 2.680628272251309,
"grad_norm": 1.7688554525375366,
"learning_rate": 7.995e-05,
"loss": 0.4153,
"step": 1600
},
{
"epoch": 2.7643979057591626,
"grad_norm": 1.2314317226409912,
"learning_rate": 8.245e-05,
"loss": 0.4089,
"step": 1650
},
{
"epoch": 2.8481675392670156,
"grad_norm": 1.6623793840408325,
"learning_rate": 8.495e-05,
"loss": 0.4124,
"step": 1700
},
{
"epoch": 2.931937172774869,
"grad_norm": 3.812507390975952,
"learning_rate": 8.745000000000001e-05,
"loss": 0.4112,
"step": 1750
},
{
"epoch": 3.0157068062827226,
"grad_norm": 2.141019821166992,
"learning_rate": 8.995e-05,
"loss": 0.4081,
"step": 1800
},
{
"epoch": 3.099476439790576,
"grad_norm": 1.8928133249282837,
"learning_rate": 9.245e-05,
"loss": 0.4067,
"step": 1850
},
{
"epoch": 3.183246073298429,
"grad_norm": 2.322817087173462,
"learning_rate": 9.495e-05,
"loss": 0.4088,
"step": 1900
},
{
"epoch": 3.2670157068062826,
"grad_norm": 2.1984918117523193,
"learning_rate": 9.745000000000001e-05,
"loss": 0.3976,
"step": 1950
},
{
"epoch": 3.350785340314136,
"grad_norm": 2.0455121994018555,
"learning_rate": 9.995e-05,
"loss": 0.4022,
"step": 2000
},
{
"epoch": 3.350785340314136,
"eval_loss": 0.3677983582019806,
"eval_runtime": 274.4574,
"eval_samples_per_second": 30.93,
"eval_steps_per_second": 3.869,
"step": 2000
},
{
"epoch": 3.4345549738219896,
"grad_norm": 1.2897744178771973,
"learning_rate": 9.951e-05,
"loss": 0.4026,
"step": 2050
},
{
"epoch": 3.518324607329843,
"grad_norm": 1.470860242843628,
"learning_rate": 9.901e-05,
"loss": 0.4008,
"step": 2100
},
{
"epoch": 3.6020942408376966,
"grad_norm": 1.2159388065338135,
"learning_rate": 9.851e-05,
"loss": 0.3971,
"step": 2150
},
{
"epoch": 3.6858638743455496,
"grad_norm": 2.0348379611968994,
"learning_rate": 9.801e-05,
"loss": 0.396,
"step": 2200
},
{
"epoch": 3.769633507853403,
"grad_norm": 1.7535659074783325,
"learning_rate": 9.751e-05,
"loss": 0.3929,
"step": 2250
},
{
"epoch": 3.8534031413612566,
"grad_norm": 1.361984372138977,
"learning_rate": 9.701e-05,
"loss": 0.3905,
"step": 2300
},
{
"epoch": 3.93717277486911,
"grad_norm": 1.7380383014678955,
"learning_rate": 9.651e-05,
"loss": 0.3957,
"step": 2350
},
{
"epoch": 4.020942408376963,
"grad_norm": 1.2679184675216675,
"learning_rate": 9.601e-05,
"loss": 0.388,
"step": 2400
},
{
"epoch": 4.104712041884817,
"grad_norm": 1.274625301361084,
"learning_rate": 9.551e-05,
"loss": 0.3887,
"step": 2450
},
{
"epoch": 4.18848167539267,
"grad_norm": 1.813714861869812,
"learning_rate": 9.501e-05,
"loss": 0.3865,
"step": 2500
},
{
"epoch": 4.18848167539267,
"eval_loss": 0.35398951172828674,
"eval_runtime": 271.385,
"eval_samples_per_second": 31.28,
"eval_steps_per_second": 3.913,
"step": 2500
},
{
"epoch": 4.272251308900524,
"grad_norm": 2.468984842300415,
"learning_rate": 9.451000000000002e-05,
"loss": 0.3902,
"step": 2550
},
{
"epoch": 4.356020942408377,
"grad_norm": 1.2810943126678467,
"learning_rate": 9.401e-05,
"loss": 0.386,
"step": 2600
},
{
"epoch": 4.439790575916231,
"grad_norm": 1.6781765222549438,
"learning_rate": 9.351e-05,
"loss": 0.383,
"step": 2650
},
{
"epoch": 4.523560209424084,
"grad_norm": 1.617163896560669,
"learning_rate": 9.301e-05,
"loss": 0.3849,
"step": 2700
},
{
"epoch": 4.607329842931938,
"grad_norm": 1.4169151782989502,
"learning_rate": 9.251000000000001e-05,
"loss": 0.3807,
"step": 2750
},
{
"epoch": 4.69109947643979,
"grad_norm": 1.1944037675857544,
"learning_rate": 9.201000000000001e-05,
"loss": 0.3838,
"step": 2800
},
{
"epoch": 4.774869109947644,
"grad_norm": 1.7312718629837036,
"learning_rate": 9.151000000000001e-05,
"loss": 0.3808,
"step": 2850
},
{
"epoch": 4.858638743455497,
"grad_norm": 1.357228398323059,
"learning_rate": 9.101000000000001e-05,
"loss": 0.3832,
"step": 2900
},
{
"epoch": 4.942408376963351,
"grad_norm": 1.2495553493499756,
"learning_rate": 9.051000000000001e-05,
"loss": 0.3837,
"step": 2950
},
{
"epoch": 5.026178010471204,
"grad_norm": 1.3688994646072388,
"learning_rate": 9.001e-05,
"loss": 0.3802,
"step": 3000
},
{
"epoch": 5.026178010471204,
"eval_loss": 0.3458922803401947,
"eval_runtime": 277.371,
"eval_samples_per_second": 30.605,
"eval_steps_per_second": 3.829,
"step": 3000
},
{
"epoch": 5.109947643979058,
"grad_norm": 1.0916550159454346,
"learning_rate": 8.951e-05,
"loss": 0.3747,
"step": 3050
},
{
"epoch": 5.193717277486911,
"grad_norm": 1.4605640172958374,
"learning_rate": 8.901e-05,
"loss": 0.3765,
"step": 3100
},
{
"epoch": 5.277486910994765,
"grad_norm": 1.302049994468689,
"learning_rate": 8.851e-05,
"loss": 0.3753,
"step": 3150
},
{
"epoch": 5.361256544502618,
"grad_norm": 1.0380531549453735,
"learning_rate": 8.801e-05,
"loss": 0.3735,
"step": 3200
},
{
"epoch": 5.445026178010472,
"grad_norm": 2.157710075378418,
"learning_rate": 8.751000000000001e-05,
"loss": 0.3766,
"step": 3250
},
{
"epoch": 5.528795811518324,
"grad_norm": 2.2072594165802,
"learning_rate": 8.701000000000001e-05,
"loss": 0.3767,
"step": 3300
},
{
"epoch": 5.612565445026178,
"grad_norm": 1.258347749710083,
"learning_rate": 8.651e-05,
"loss": 0.3709,
"step": 3350
},
{
"epoch": 5.696335078534031,
"grad_norm": 1.7026106119155884,
"learning_rate": 8.601e-05,
"loss": 0.3715,
"step": 3400
},
{
"epoch": 5.780104712041885,
"grad_norm": 1.1708229780197144,
"learning_rate": 8.551e-05,
"loss": 0.3716,
"step": 3450
},
{
"epoch": 5.863874345549738,
"grad_norm": 2.3675355911254883,
"learning_rate": 8.501e-05,
"loss": 0.3693,
"step": 3500
},
{
"epoch": 5.863874345549738,
"eval_loss": 0.3417563736438751,
"eval_runtime": 272.8827,
"eval_samples_per_second": 31.109,
"eval_steps_per_second": 3.892,
"step": 3500
},
{
"epoch": 5.947643979057592,
"grad_norm": 1.6144191026687622,
"learning_rate": 8.451e-05,
"loss": 0.3666,
"step": 3550
},
{
"epoch": 6.031413612565445,
"grad_norm": 1.4944205284118652,
"learning_rate": 8.401e-05,
"loss": 0.3657,
"step": 3600
},
{
"epoch": 6.115183246073299,
"grad_norm": 1.0198278427124023,
"learning_rate": 8.351e-05,
"loss": 0.3702,
"step": 3650
},
{
"epoch": 6.198952879581152,
"grad_norm": 2.195380926132202,
"learning_rate": 8.300999999999999e-05,
"loss": 0.3686,
"step": 3700
},
{
"epoch": 6.282722513089006,
"grad_norm": 1.3650749921798706,
"learning_rate": 8.251e-05,
"loss": 0.3701,
"step": 3750
},
{
"epoch": 6.366492146596858,
"grad_norm": 1.6887727975845337,
"learning_rate": 8.201000000000001e-05,
"loss": 0.3677,
"step": 3800
},
{
"epoch": 6.450261780104712,
"grad_norm": 0.8709685206413269,
"learning_rate": 8.151000000000001e-05,
"loss": 0.3678,
"step": 3850
},
{
"epoch": 6.534031413612565,
"grad_norm": 1.0899595022201538,
"learning_rate": 8.101000000000001e-05,
"loss": 0.3641,
"step": 3900
},
{
"epoch": 6.617801047120419,
"grad_norm": 1.1222867965698242,
"learning_rate": 8.051000000000001e-05,
"loss": 0.3691,
"step": 3950
},
{
"epoch": 6.701570680628272,
"grad_norm": 1.0771104097366333,
"learning_rate": 8.001e-05,
"loss": 0.3674,
"step": 4000
},
{
"epoch": 6.701570680628272,
"eval_loss": 0.3313756585121155,
"eval_runtime": 279.286,
"eval_samples_per_second": 30.395,
"eval_steps_per_second": 3.803,
"step": 4000
},
{
"epoch": 6.785340314136126,
"grad_norm": 1.868295669555664,
"learning_rate": 7.951e-05,
"loss": 0.3617,
"step": 4050
},
{
"epoch": 6.869109947643979,
"grad_norm": 1.0599360466003418,
"learning_rate": 7.901e-05,
"loss": 0.3637,
"step": 4100
},
{
"epoch": 6.952879581151833,
"grad_norm": 1.4801158905029297,
"learning_rate": 7.851e-05,
"loss": 0.363,
"step": 4150
},
{
"epoch": 7.036649214659686,
"grad_norm": 1.137289047241211,
"learning_rate": 7.801000000000001e-05,
"loss": 0.3622,
"step": 4200
},
{
"epoch": 7.12041884816754,
"grad_norm": 1.2109190225601196,
"learning_rate": 7.751000000000001e-05,
"loss": 0.3668,
"step": 4250
},
{
"epoch": 7.204188481675392,
"grad_norm": 1.1171132326126099,
"learning_rate": 7.701000000000001e-05,
"loss": 0.3594,
"step": 4300
},
{
"epoch": 7.287958115183246,
"grad_norm": 1.2529895305633545,
"learning_rate": 7.651e-05,
"loss": 0.3635,
"step": 4350
},
{
"epoch": 7.371727748691099,
"grad_norm": 1.352792739868164,
"learning_rate": 7.601e-05,
"loss": 0.3627,
"step": 4400
},
{
"epoch": 7.455497382198953,
"grad_norm": 0.8809813261032104,
"learning_rate": 7.552e-05,
"loss": 0.3647,
"step": 4450
},
{
"epoch": 7.539267015706806,
"grad_norm": 4.0386962890625,
"learning_rate": 7.502e-05,
"loss": 0.3582,
"step": 4500
},
{
"epoch": 7.539267015706806,
"eval_loss": 0.32692766189575195,
"eval_runtime": 272.0854,
"eval_samples_per_second": 31.2,
"eval_steps_per_second": 3.903,
"step": 4500
},
{
"epoch": 7.62303664921466,
"grad_norm": 1.616075873374939,
"learning_rate": 7.452e-05,
"loss": 0.3603,
"step": 4550
},
{
"epoch": 7.706806282722513,
"grad_norm": 2.2668583393096924,
"learning_rate": 7.402e-05,
"loss": 0.3622,
"step": 4600
},
{
"epoch": 7.790575916230367,
"grad_norm": 1.0464789867401123,
"learning_rate": 7.352e-05,
"loss": 0.3667,
"step": 4650
},
{
"epoch": 7.87434554973822,
"grad_norm": 1.2528297901153564,
"learning_rate": 7.302e-05,
"loss": 0.3631,
"step": 4700
},
{
"epoch": 7.958115183246074,
"grad_norm": 1.72895085811615,
"learning_rate": 7.252e-05,
"loss": 0.3567,
"step": 4750
},
{
"epoch": 8.041884816753926,
"grad_norm": 1.5020617246627808,
"learning_rate": 7.202e-05,
"loss": 0.3553,
"step": 4800
},
{
"epoch": 8.12565445026178,
"grad_norm": 1.976888656616211,
"learning_rate": 7.151999999999999e-05,
"loss": 0.3569,
"step": 4850
},
{
"epoch": 8.209424083769633,
"grad_norm": 1.156580924987793,
"learning_rate": 7.102000000000001e-05,
"loss": 0.3659,
"step": 4900
},
{
"epoch": 8.293193717277488,
"grad_norm": 0.9017566442489624,
"learning_rate": 7.052000000000001e-05,
"loss": 0.3549,
"step": 4950
},
{
"epoch": 8.37696335078534,
"grad_norm": 1.5168513059616089,
"learning_rate": 7.002000000000001e-05,
"loss": 0.362,
"step": 5000
},
{
"epoch": 8.37696335078534,
"eval_loss": 0.34056970477104187,
"eval_runtime": 276.7614,
"eval_samples_per_second": 30.673,
"eval_steps_per_second": 3.837,
"step": 5000
},
{
"epoch": 8.460732984293193,
"grad_norm": 1.111985206604004,
"learning_rate": 6.952000000000001e-05,
"loss": 0.3553,
"step": 5050
},
{
"epoch": 8.544502617801047,
"grad_norm": 1.3966108560562134,
"learning_rate": 6.902000000000001e-05,
"loss": 0.3545,
"step": 5100
},
{
"epoch": 8.6282722513089,
"grad_norm": 1.3428140878677368,
"learning_rate": 6.852e-05,
"loss": 0.3609,
"step": 5150
},
{
"epoch": 8.712041884816754,
"grad_norm": 1.9436802864074707,
"learning_rate": 6.802e-05,
"loss": 0.3547,
"step": 5200
},
{
"epoch": 8.795811518324607,
"grad_norm": 1.1481266021728516,
"learning_rate": 6.752e-05,
"loss": 0.3569,
"step": 5250
},
{
"epoch": 8.879581151832461,
"grad_norm": 1.410223364830017,
"learning_rate": 6.702e-05,
"loss": 0.3558,
"step": 5300
},
{
"epoch": 8.963350785340314,
"grad_norm": 1.7548959255218506,
"learning_rate": 6.652000000000001e-05,
"loss": 0.3561,
"step": 5350
},
{
"epoch": 9.047120418848168,
"grad_norm": 1.343935489654541,
"learning_rate": 6.602000000000001e-05,
"loss": 0.3609,
"step": 5400
},
{
"epoch": 9.13089005235602,
"grad_norm": 1.5190401077270508,
"learning_rate": 6.552000000000001e-05,
"loss": 0.3504,
"step": 5450
},
{
"epoch": 9.214659685863875,
"grad_norm": 0.8521016240119934,
"learning_rate": 6.502e-05,
"loss": 0.3521,
"step": 5500
},
{
"epoch": 9.214659685863875,
"eval_loss": 0.3218235671520233,
"eval_runtime": 279.5684,
"eval_samples_per_second": 30.365,
"eval_steps_per_second": 3.799,
"step": 5500
},
{
"epoch": 9.298429319371728,
"grad_norm": 1.0284796953201294,
"learning_rate": 6.452e-05,
"loss": 0.356,
"step": 5550
},
{
"epoch": 9.38219895287958,
"grad_norm": 1.8278234004974365,
"learning_rate": 6.402e-05,
"loss": 0.356,
"step": 5600
},
{
"epoch": 9.465968586387435,
"grad_norm": 0.9208963513374329,
"learning_rate": 6.352e-05,
"loss": 0.3504,
"step": 5650
},
{
"epoch": 9.549738219895287,
"grad_norm": 1.295639991760254,
"learning_rate": 6.302e-05,
"loss": 0.3551,
"step": 5700
},
{
"epoch": 9.633507853403142,
"grad_norm": 0.9757601022720337,
"learning_rate": 6.252e-05,
"loss": 0.3529,
"step": 5750
},
{
"epoch": 9.717277486910994,
"grad_norm": 1.451418399810791,
"learning_rate": 6.202e-05,
"loss": 0.3537,
"step": 5800
},
{
"epoch": 9.801047120418849,
"grad_norm": 2.2001028060913086,
"learning_rate": 6.152e-05,
"loss": 0.3522,
"step": 5850
},
{
"epoch": 9.884816753926701,
"grad_norm": 1.1149827241897583,
"learning_rate": 6.102e-05,
"loss": 0.3472,
"step": 5900
},
{
"epoch": 9.968586387434556,
"grad_norm": 1.4035720825195312,
"learning_rate": 6.0519999999999997e-05,
"loss": 0.3525,
"step": 5950
},
{
"epoch": 10.052356020942408,
"grad_norm": 1.0732487440109253,
"learning_rate": 6.002e-05,
"loss": 0.3485,
"step": 6000
},
{
"epoch": 10.052356020942408,
"eval_loss": 0.31853485107421875,
"eval_runtime": 271.779,
"eval_samples_per_second": 31.235,
"eval_steps_per_second": 3.908,
"step": 6000
},
{
"epoch": 10.136125654450261,
"grad_norm": 1.2576690912246704,
"learning_rate": 5.952e-05,
"loss": 0.3488,
"step": 6050
},
{
"epoch": 10.219895287958115,
"grad_norm": 1.2645186185836792,
"learning_rate": 5.902e-05,
"loss": 0.3537,
"step": 6100
},
{
"epoch": 10.303664921465968,
"grad_norm": 1.743445634841919,
"learning_rate": 5.852000000000001e-05,
"loss": 0.3501,
"step": 6150
},
{
"epoch": 10.387434554973822,
"grad_norm": 1.2827191352844238,
"learning_rate": 5.802000000000001e-05,
"loss": 0.349,
"step": 6200
},
{
"epoch": 10.471204188481675,
"grad_norm": 1.0109118223190308,
"learning_rate": 5.7520000000000005e-05,
"loss": 0.3495,
"step": 6250
},
{
"epoch": 10.55497382198953,
"grad_norm": 1.420745611190796,
"learning_rate": 5.7020000000000006e-05,
"loss": 0.3493,
"step": 6300
},
{
"epoch": 10.638743455497382,
"grad_norm": 1.2105921506881714,
"learning_rate": 5.652000000000001e-05,
"loss": 0.3487,
"step": 6350
},
{
"epoch": 10.722513089005236,
"grad_norm": 1.1536401510238647,
"learning_rate": 5.602000000000001e-05,
"loss": 0.35,
"step": 6400
},
{
"epoch": 10.806282722513089,
"grad_norm": 1.0635104179382324,
"learning_rate": 5.5520000000000004e-05,
"loss": 0.3475,
"step": 6450
},
{
"epoch": 10.890052356020943,
"grad_norm": 1.4069427251815796,
"learning_rate": 5.5020000000000005e-05,
"loss": 0.3472,
"step": 6500
},
{
"epoch": 10.890052356020943,
"eval_loss": 0.3199196457862854,
"eval_runtime": 276.9702,
"eval_samples_per_second": 30.65,
"eval_steps_per_second": 3.834,
"step": 6500
},
{
"epoch": 10.973821989528796,
"grad_norm": 0.8649620413780212,
"learning_rate": 5.4520000000000007e-05,
"loss": 0.3496,
"step": 6550
},
{
"epoch": 11.057591623036648,
"grad_norm": 2.6794686317443848,
"learning_rate": 5.402e-05,
"loss": 0.3482,
"step": 6600
},
{
"epoch": 11.141361256544503,
"grad_norm": 1.6224123239517212,
"learning_rate": 5.352e-05,
"loss": 0.3498,
"step": 6650
},
{
"epoch": 11.225130890052355,
"grad_norm": 1.2548692226409912,
"learning_rate": 5.3020000000000004e-05,
"loss": 0.346,
"step": 6700
},
{
"epoch": 11.30890052356021,
"grad_norm": 1.390360713005066,
"learning_rate": 5.2520000000000005e-05,
"loss": 0.345,
"step": 6750
},
{
"epoch": 11.392670157068062,
"grad_norm": 1.1040029525756836,
"learning_rate": 5.202e-05,
"loss": 0.3477,
"step": 6800
},
{
"epoch": 11.476439790575917,
"grad_norm": 1.0738588571548462,
"learning_rate": 5.152e-05,
"loss": 0.3455,
"step": 6850
},
{
"epoch": 11.56020942408377,
"grad_norm": 1.0175799131393433,
"learning_rate": 5.102e-05,
"loss": 0.3448,
"step": 6900
},
{
"epoch": 11.643979057591624,
"grad_norm": 1.8546490669250488,
"learning_rate": 5.052e-05,
"loss": 0.346,
"step": 6950
},
{
"epoch": 11.727748691099476,
"grad_norm": 1.7156524658203125,
"learning_rate": 5.002e-05,
"loss": 0.3469,
"step": 7000
},
{
"epoch": 11.727748691099476,
"eval_loss": 0.31849026679992676,
"eval_runtime": 283.0428,
"eval_samples_per_second": 29.992,
"eval_steps_per_second": 3.752,
"step": 7000
},
{
"epoch": 11.81151832460733,
"grad_norm": 1.1094063520431519,
"learning_rate": 4.952e-05,
"loss": 0.346,
"step": 7050
},
{
"epoch": 11.895287958115183,
"grad_norm": 1.8263230323791504,
"learning_rate": 4.902e-05,
"loss": 0.3496,
"step": 7100
},
{
"epoch": 11.979057591623036,
"grad_norm": 1.4049593210220337,
"learning_rate": 4.852e-05,
"loss": 0.3433,
"step": 7150
},
{
"epoch": 12.06282722513089,
"grad_norm": 1.3455963134765625,
"learning_rate": 4.8030000000000006e-05,
"loss": 0.3518,
"step": 7200
},
{
"epoch": 12.146596858638743,
"grad_norm": 1.174660325050354,
"learning_rate": 4.753e-05,
"loss": 0.348,
"step": 7250
},
{
"epoch": 12.230366492146597,
"grad_norm": 1.2765902280807495,
"learning_rate": 4.703e-05,
"loss": 0.345,
"step": 7300
},
{
"epoch": 12.31413612565445,
"grad_norm": 1.419295072555542,
"learning_rate": 4.6530000000000003e-05,
"loss": 0.3436,
"step": 7350
},
{
"epoch": 12.397905759162304,
"grad_norm": 1.3437247276306152,
"learning_rate": 4.603e-05,
"loss": 0.3469,
"step": 7400
},
{
"epoch": 12.481675392670157,
"grad_norm": 1.6074751615524292,
"learning_rate": 4.553e-05,
"loss": 0.3461,
"step": 7450
},
{
"epoch": 12.565445026178011,
"grad_norm": 1.432062029838562,
"learning_rate": 4.503e-05,
"loss": 0.3441,
"step": 7500
},
{
"epoch": 12.565445026178011,
"eval_loss": 0.3222896158695221,
"eval_runtime": 282.6486,
"eval_samples_per_second": 30.034,
"eval_steps_per_second": 3.757,
"step": 7500
},
{
"epoch": 12.649214659685864,
"grad_norm": 1.4210392236709595,
"learning_rate": 4.453e-05,
"loss": 0.3436,
"step": 7550
},
{
"epoch": 12.732984293193716,
"grad_norm": 1.275467038154602,
"learning_rate": 4.4030000000000004e-05,
"loss": 0.3453,
"step": 7600
},
{
"epoch": 12.81675392670157,
"grad_norm": 1.1207870244979858,
"learning_rate": 4.3530000000000005e-05,
"loss": 0.3438,
"step": 7650
},
{
"epoch": 12.900523560209423,
"grad_norm": 1.8535631895065308,
"learning_rate": 4.3030000000000006e-05,
"loss": 0.3442,
"step": 7700
},
{
"epoch": 12.984293193717278,
"grad_norm": 1.0426372289657593,
"learning_rate": 4.253e-05,
"loss": 0.3494,
"step": 7750
},
{
"epoch": 13.06806282722513,
"grad_norm": 1.3337020874023438,
"learning_rate": 4.203e-05,
"loss": 0.3413,
"step": 7800
},
{
"epoch": 13.151832460732985,
"grad_norm": 1.017905592918396,
"learning_rate": 4.1530000000000004e-05,
"loss": 0.3417,
"step": 7850
},
{
"epoch": 13.235602094240837,
"grad_norm": 1.166343331336975,
"learning_rate": 4.103e-05,
"loss": 0.3443,
"step": 7900
},
{
"epoch": 13.319371727748692,
"grad_norm": 1.4170418977737427,
"learning_rate": 4.053e-05,
"loss": 0.3433,
"step": 7950
},
{
"epoch": 13.403141361256544,
"grad_norm": 1.125741720199585,
"learning_rate": 4.003e-05,
"loss": 0.3422,
"step": 8000
},
{
"epoch": 13.403141361256544,
"eval_loss": 0.31487980484962463,
"eval_runtime": 278.3852,
"eval_samples_per_second": 30.494,
"eval_steps_per_second": 3.815,
"step": 8000
},
{
"epoch": 13.486910994764397,
"grad_norm": 1.5452402830123901,
"learning_rate": 3.953e-05,
"loss": 0.3403,
"step": 8050
},
{
"epoch": 13.570680628272251,
"grad_norm": 0.9096773862838745,
"learning_rate": 3.903e-05,
"loss": 0.3409,
"step": 8100
},
{
"epoch": 13.654450261780104,
"grad_norm": 1.6249001026153564,
"learning_rate": 3.853e-05,
"loss": 0.3414,
"step": 8150
},
{
"epoch": 13.738219895287958,
"grad_norm": 0.9276340007781982,
"learning_rate": 3.803000000000001e-05,
"loss": 0.3389,
"step": 8200
},
{
"epoch": 13.821989528795811,
"grad_norm": 1.7416585683822632,
"learning_rate": 3.753e-05,
"loss": 0.343,
"step": 8250
},
{
"epoch": 13.905759162303665,
"grad_norm": 2.2160768508911133,
"learning_rate": 3.703e-05,
"loss": 0.3402,
"step": 8300
},
{
"epoch": 13.989528795811518,
"grad_norm": 1.0885984897613525,
"learning_rate": 3.6530000000000004e-05,
"loss": 0.3407,
"step": 8350
},
{
"epoch": 14.073298429319372,
"grad_norm": 0.9969326853752136,
"learning_rate": 3.6030000000000006e-05,
"loss": 0.3447,
"step": 8400
},
{
"epoch": 14.157068062827225,
"grad_norm": 1.2978531122207642,
"learning_rate": 3.553e-05,
"loss": 0.3377,
"step": 8450
},
{
"epoch": 14.24083769633508,
"grad_norm": 1.0465147495269775,
"learning_rate": 3.503e-05,
"loss": 0.3396,
"step": 8500
},
{
"epoch": 14.24083769633508,
"eval_loss": 0.310507208108902,
"eval_runtime": 279.5625,
"eval_samples_per_second": 30.365,
"eval_steps_per_second": 3.799,
"step": 8500
},
{
"epoch": 14.324607329842932,
"grad_norm": 2.537041425704956,
"learning_rate": 3.453e-05,
"loss": 0.3418,
"step": 8550
},
{
"epoch": 14.408376963350785,
"grad_norm": 1.3357998132705688,
"learning_rate": 3.403e-05,
"loss": 0.3408,
"step": 8600
},
{
"epoch": 14.492146596858639,
"grad_norm": 0.8550173044204712,
"learning_rate": 3.353e-05,
"loss": 0.3408,
"step": 8650
},
{
"epoch": 14.575916230366492,
"grad_norm": 1.4455218315124512,
"learning_rate": 3.303e-05,
"loss": 0.3407,
"step": 8700
},
{
"epoch": 14.659685863874346,
"grad_norm": 1.0547473430633545,
"learning_rate": 3.253e-05,
"loss": 0.3382,
"step": 8750
},
{
"epoch": 14.743455497382199,
"grad_norm": 1.5398694276809692,
"learning_rate": 3.2029999999999997e-05,
"loss": 0.3402,
"step": 8800
},
{
"epoch": 14.827225130890053,
"grad_norm": 1.008465051651001,
"learning_rate": 3.1530000000000005e-05,
"loss": 0.3433,
"step": 8850
},
{
"epoch": 14.910994764397905,
"grad_norm": 1.8319462537765503,
"learning_rate": 3.1030000000000006e-05,
"loss": 0.341,
"step": 8900
},
{
"epoch": 14.99476439790576,
"grad_norm": 1.1432167291641235,
"learning_rate": 3.053e-05,
"loss": 0.3369,
"step": 8950
},
{
"epoch": 15.078534031413612,
"grad_norm": 1.098186731338501,
"learning_rate": 3.0030000000000002e-05,
"loss": 0.3396,
"step": 9000
},
{
"epoch": 15.078534031413612,
"eval_loss": 0.31039854884147644,
"eval_runtime": 280.3967,
"eval_samples_per_second": 30.275,
"eval_steps_per_second": 3.787,
"step": 9000
},
{
"epoch": 15.162303664921467,
"grad_norm": 1.0989015102386475,
"learning_rate": 2.9530000000000004e-05,
"loss": 0.3381,
"step": 9050
},
{
"epoch": 15.24607329842932,
"grad_norm": 1.1959214210510254,
"learning_rate": 2.903e-05,
"loss": 0.3381,
"step": 9100
},
{
"epoch": 15.329842931937172,
"grad_norm": 0.9721996188163757,
"learning_rate": 2.853e-05,
"loss": 0.3384,
"step": 9150
},
{
"epoch": 15.413612565445026,
"grad_norm": 1.2921016216278076,
"learning_rate": 2.803e-05,
"loss": 0.3375,
"step": 9200
},
{
"epoch": 15.497382198952879,
"grad_norm": 1.1854231357574463,
"learning_rate": 2.753e-05,
"loss": 0.3389,
"step": 9250
},
{
"epoch": 15.581151832460733,
"grad_norm": 1.571321725845337,
"learning_rate": 2.703e-05,
"loss": 0.3406,
"step": 9300
},
{
"epoch": 15.664921465968586,
"grad_norm": 1.2595016956329346,
"learning_rate": 2.6540000000000003e-05,
"loss": 0.3392,
"step": 9350
},
{
"epoch": 15.74869109947644,
"grad_norm": 1.2291969060897827,
"learning_rate": 2.6040000000000005e-05,
"loss": 0.3362,
"step": 9400
},
{
"epoch": 15.832460732984293,
"grad_norm": 1.0605494976043701,
"learning_rate": 2.5540000000000003e-05,
"loss": 0.3388,
"step": 9450
},
{
"epoch": 15.916230366492147,
"grad_norm": 0.9927255511283875,
"learning_rate": 2.504e-05,
"loss": 0.3391,
"step": 9500
},
{
"epoch": 15.916230366492147,
"eval_loss": 0.3102165162563324,
"eval_runtime": 279.552,
"eval_samples_per_second": 30.366,
"eval_steps_per_second": 3.799,
"step": 9500
}
],
"logging_steps": 50,
"max_steps": 12000,
"num_input_tokens_seen": 0,
"num_train_epochs": 21,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7021322045447034e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
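
The JSON above is the Trainer state saved alongside this checkpoint. A minimal sketch, assuming a local copy at `last-checkpoint/trainer_state.json` (the path is an assumption, not part of the file), shows one way to read out the best metric and the interleaved train/eval loss entries recorded in `log_history`:

```python
# Minimal inspection sketch (standard library only); the file path is an
# assumption -- point it at wherever this trainer_state.json lives locally.
import json

with open("last-checkpoint/trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Top-level summary fields present in this file.
print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])
print("global_step:", state["global_step"], "/", state["max_steps"])

# log_history interleaves training entries (carry "loss") and evaluation
# entries (carry "eval_loss"); split them into two (step, value) series.
train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("last logged train loss:", train_log[-1])
print("last logged eval loss:", eval_log[-1])
```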