{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 23445,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06397952655150352,
"grad_norm": 1.3757662773132324,
"learning_rate": 0.0002987204094689699,
"loss": 2.6581,
"step": 100
},
{
"epoch": 0.12795905310300704,
"grad_norm": 1.3149511814117432,
"learning_rate": 0.0002974408189379398,
"loss": 2.2938,
"step": 200
},
{
"epoch": 0.19193857965451055,
"grad_norm": 1.3127187490463257,
"learning_rate": 0.00029617402431222006,
"loss": 2.2174,
"step": 300
},
{
"epoch": 0.2559181062060141,
"grad_norm": 1.1597858667373657,
"learning_rate": 0.00029489443378119,
"loss": 2.1893,
"step": 400
},
{
"epoch": 0.3198976327575176,
"grad_norm": 1.3044805526733398,
"learning_rate": 0.00029361484325015995,
"loss": 2.173,
"step": 500
},
{
"epoch": 0.3838771593090211,
"grad_norm": 1.2641485929489136,
"learning_rate": 0.00029233525271912987,
"loss": 2.1609,
"step": 600
},
{
"epoch": 0.44785668586052463,
"grad_norm": 1.2622572183609009,
"learning_rate": 0.0002910556621880998,
"loss": 2.1425,
"step": 700
},
{
"epoch": 0.5118362124120281,
"grad_norm": 1.181531548500061,
"learning_rate": 0.0002897760716570697,
"loss": 2.1016,
"step": 800
},
{
"epoch": 0.5758157389635317,
"grad_norm": 1.1225402355194092,
"learning_rate": 0.0002884964811260397,
"loss": 2.087,
"step": 900
},
{
"epoch": 0.6397952655150352,
"grad_norm": 1.2168927192687988,
"learning_rate": 0.00028721689059500955,
"loss": 2.1028,
"step": 1000
},
{
"epoch": 0.7037747920665387,
"grad_norm": 1.1682486534118652,
"learning_rate": 0.0002859373000639795,
"loss": 2.0909,
"step": 1100
},
{
"epoch": 0.7677543186180422,
"grad_norm": 1.1344854831695557,
"learning_rate": 0.00028465770953294944,
"loss": 2.0896,
"step": 1200
},
{
"epoch": 0.8317338451695457,
"grad_norm": 1.1923776865005493,
"learning_rate": 0.00028337811900191936,
"loss": 2.0488,
"step": 1300
},
{
"epoch": 0.8957133717210493,
"grad_norm": 1.3097106218338013,
"learning_rate": 0.0002820985284708893,
"loss": 2.0488,
"step": 1400
},
{
"epoch": 0.9596928982725528,
"grad_norm": 1.1138262748718262,
"learning_rate": 0.0002808189379398592,
"loss": 2.0354,
"step": 1500
},
{
"epoch": 1.0236724248240563,
"grad_norm": 1.2939698696136475,
"learning_rate": 0.00027953934740882917,
"loss": 1.9789,
"step": 1600
},
{
"epoch": 1.0876519513755598,
"grad_norm": 1.2986066341400146,
"learning_rate": 0.0002782597568777991,
"loss": 1.9117,
"step": 1700
},
{
"epoch": 1.1516314779270633,
"grad_norm": 1.36199152469635,
"learning_rate": 0.000276980166346769,
"loss": 1.9463,
"step": 1800
},
{
"epoch": 1.2156110044785668,
"grad_norm": 1.1844524145126343,
"learning_rate": 0.00027570057581573893,
"loss": 1.9163,
"step": 1900
},
{
"epoch": 1.2795905310300704,
"grad_norm": 1.3748372793197632,
"learning_rate": 0.0002744209852847089,
"loss": 1.9157,
"step": 2000
},
{
"epoch": 1.3435700575815739,
"grad_norm": 1.3857694864273071,
"learning_rate": 0.00027314139475367877,
"loss": 1.9234,
"step": 2100
},
{
"epoch": 1.4075495841330774,
"grad_norm": 1.244759202003479,
"learning_rate": 0.00027186180422264874,
"loss": 1.9202,
"step": 2200
},
{
"epoch": 1.471529110684581,
"grad_norm": 1.3650200366973877,
"learning_rate": 0.00027058221369161866,
"loss": 1.9262,
"step": 2300
},
{
"epoch": 1.5355086372360844,
"grad_norm": 1.3304725885391235,
"learning_rate": 0.0002693026231605886,
"loss": 1.9225,
"step": 2400
},
{
"epoch": 1.599488163787588,
"grad_norm": 1.2908028364181519,
"learning_rate": 0.0002680230326295585,
"loss": 1.9448,
"step": 2500
},
{
"epoch": 1.6634676903390915,
"grad_norm": 1.4424597024917603,
"learning_rate": 0.00026674344209852847,
"loss": 1.9327,
"step": 2600
},
{
"epoch": 1.727447216890595,
"grad_norm": 1.3038992881774902,
"learning_rate": 0.0002654638515674984,
"loss": 1.9156,
"step": 2700
},
{
"epoch": 1.7914267434420985,
"grad_norm": 1.2507870197296143,
"learning_rate": 0.0002641842610364683,
"loss": 1.9111,
"step": 2800
},
{
"epoch": 1.855406269993602,
"grad_norm": 1.3444691896438599,
"learning_rate": 0.00026290467050543823,
"loss": 1.9101,
"step": 2900
},
{
"epoch": 1.9193857965451055,
"grad_norm": 1.3593779802322388,
"learning_rate": 0.00026162507997440815,
"loss": 1.9322,
"step": 3000
},
{
"epoch": 1.983365323096609,
"grad_norm": 1.3340197801589966,
"learning_rate": 0.0002603454894433781,
"loss": 1.9361,
"step": 3100
},
{
"epoch": 2.0473448496481126,
"grad_norm": 1.5682820081710815,
"learning_rate": 0.000259065898912348,
"loss": 1.8007,
"step": 3200
},
{
"epoch": 2.111324376199616,
"grad_norm": 1.5191752910614014,
"learning_rate": 0.00025778630838131796,
"loss": 1.7626,
"step": 3300
},
{
"epoch": 2.1753039027511196,
"grad_norm": 1.614561915397644,
"learning_rate": 0.0002565067178502879,
"loss": 1.7621,
"step": 3400
},
{
"epoch": 2.239283429302623,
"grad_norm": 1.4934717416763306,
"learning_rate": 0.00025522712731925785,
"loss": 1.7753,
"step": 3500
},
{
"epoch": 2.3032629558541267,
"grad_norm": 1.6563340425491333,
"learning_rate": 0.0002539475367882277,
"loss": 1.771,
"step": 3600
},
{
"epoch": 2.36724248240563,
"grad_norm": 1.502406120300293,
"learning_rate": 0.0002526679462571977,
"loss": 1.7823,
"step": 3700
},
{
"epoch": 2.4312220089571337,
"grad_norm": 1.6300321817398071,
"learning_rate": 0.0002513883557261676,
"loss": 1.7792,
"step": 3800
},
{
"epoch": 2.495201535508637,
"grad_norm": 1.5266133546829224,
"learning_rate": 0.00025010876519513753,
"loss": 1.7994,
"step": 3900
},
{
"epoch": 2.5591810620601407,
"grad_norm": 1.5854878425598145,
"learning_rate": 0.00024882917466410745,
"loss": 1.7947,
"step": 4000
},
{
"epoch": 2.6231605886116443,
"grad_norm": 1.4971704483032227,
"learning_rate": 0.00024754958413307737,
"loss": 1.8,
"step": 4100
},
{
"epoch": 2.6871401151631478,
"grad_norm": 1.4385017156600952,
"learning_rate": 0.00024626999360204734,
"loss": 1.8041,
"step": 4200
},
{
"epoch": 2.7511196417146513,
"grad_norm": 1.526516079902649,
"learning_rate": 0.00024499040307101726,
"loss": 1.8172,
"step": 4300
},
{
"epoch": 2.815099168266155,
"grad_norm": 1.5419269800186157,
"learning_rate": 0.00024371081253998718,
"loss": 1.7861,
"step": 4400
},
{
"epoch": 2.8790786948176583,
"grad_norm": 1.6025625467300415,
"learning_rate": 0.00024243122200895713,
"loss": 1.8099,
"step": 4500
},
{
"epoch": 2.943058221369162,
"grad_norm": 1.6059303283691406,
"learning_rate": 0.00024115163147792705,
"loss": 1.8057,
"step": 4600
},
{
"epoch": 3.0070377479206654,
"grad_norm": 1.4824891090393066,
"learning_rate": 0.00023987204094689697,
"loss": 1.7666,
"step": 4700
},
{
"epoch": 3.071017274472169,
"grad_norm": 1.7200456857681274,
"learning_rate": 0.00023859245041586688,
"loss": 1.5827,
"step": 4800
},
{
"epoch": 3.1349968010236724,
"grad_norm": 1.7883553504943848,
"learning_rate": 0.00023731285988483683,
"loss": 1.6154,
"step": 4900
},
{
"epoch": 3.198976327575176,
"grad_norm": 1.7612642049789429,
"learning_rate": 0.00023604606525911705,
"loss": 1.6331,
"step": 5000
},
{
"epoch": 3.2629558541266794,
"grad_norm": 1.8273234367370605,
"learning_rate": 0.000234766474728087,
"loss": 1.6489,
"step": 5100
},
{
"epoch": 3.326935380678183,
"grad_norm": 1.9144798517227173,
"learning_rate": 0.00023348688419705691,
"loss": 1.6451,
"step": 5200
},
{
"epoch": 3.3909149072296865,
"grad_norm": 1.7729228734970093,
"learning_rate": 0.00023220729366602686,
"loss": 1.6504,
"step": 5300
},
{
"epoch": 3.45489443378119,
"grad_norm": 1.7989096641540527,
"learning_rate": 0.00023092770313499675,
"loss": 1.6774,
"step": 5400
},
{
"epoch": 3.5188739603326935,
"grad_norm": 1.9125975370407104,
"learning_rate": 0.0002296481126039667,
"loss": 1.6734,
"step": 5500
},
{
"epoch": 3.582853486884197,
"grad_norm": 1.764624834060669,
"learning_rate": 0.00022836852207293665,
"loss": 1.6829,
"step": 5600
},
{
"epoch": 3.6468330134357005,
"grad_norm": 1.8827048540115356,
"learning_rate": 0.0002270889315419066,
"loss": 1.6917,
"step": 5700
},
{
"epoch": 3.710812539987204,
"grad_norm": 1.7637380361557007,
"learning_rate": 0.00022580934101087648,
"loss": 1.6957,
"step": 5800
},
{
"epoch": 3.7747920665387076,
"grad_norm": 1.9357694387435913,
"learning_rate": 0.00022452975047984643,
"loss": 1.7071,
"step": 5900
},
{
"epoch": 3.838771593090211,
"grad_norm": 1.8760075569152832,
"learning_rate": 0.00022325015994881638,
"loss": 1.689,
"step": 6000
},
{
"epoch": 3.9027511196417146,
"grad_norm": 1.83319890499115,
"learning_rate": 0.0002219705694177863,
"loss": 1.7135,
"step": 6100
},
{
"epoch": 3.966730646193218,
"grad_norm": 1.8324424028396606,
"learning_rate": 0.00022069097888675622,
"loss": 1.7052,
"step": 6200
},
{
"epoch": 4.030710172744722,
"grad_norm": 1.772605538368225,
"learning_rate": 0.00021941138835572613,
"loss": 1.6098,
"step": 6300
},
{
"epoch": 4.094689699296225,
"grad_norm": 1.936546802520752,
"learning_rate": 0.00021813179782469608,
"loss": 1.4814,
"step": 6400
},
{
"epoch": 4.158669225847729,
"grad_norm": 2.055730104446411,
"learning_rate": 0.000216852207293666,
"loss": 1.5106,
"step": 6500
},
{
"epoch": 4.222648752399232,
"grad_norm": 2.3238749504089355,
"learning_rate": 0.00021557261676263592,
"loss": 1.5136,
"step": 6600
},
{
"epoch": 4.286628278950736,
"grad_norm": 1.9317381381988525,
"learning_rate": 0.00021429302623160587,
"loss": 1.5307,
"step": 6700
},
{
"epoch": 4.350607805502239,
"grad_norm": 2.057237386703491,
"learning_rate": 0.0002130134357005758,
"loss": 1.5524,
"step": 6800
},
{
"epoch": 4.414587332053743,
"grad_norm": 2.2331132888793945,
"learning_rate": 0.0002117338451695457,
"loss": 1.5674,
"step": 6900
},
{
"epoch": 4.478566858605246,
"grad_norm": 2.0981638431549072,
"learning_rate": 0.00021046705054382595,
"loss": 1.5627,
"step": 7000
},
{
"epoch": 4.54254638515675,
"grad_norm": 2.1681783199310303,
"learning_rate": 0.0002091874600127959,
"loss": 1.5615,
"step": 7100
},
{
"epoch": 4.606525911708253,
"grad_norm": 2.0720126628875732,
"learning_rate": 0.00020790786948176584,
"loss": 1.5598,
"step": 7200
},
{
"epoch": 4.670505438259757,
"grad_norm": 1.9248210191726685,
"learning_rate": 0.00020662827895073573,
"loss": 1.5712,
"step": 7300
},
{
"epoch": 4.73448496481126,
"grad_norm": 1.9172708988189697,
"learning_rate": 0.00020534868841970568,
"loss": 1.5629,
"step": 7400
},
{
"epoch": 4.798464491362764,
"grad_norm": 2.141303539276123,
"learning_rate": 0.00020406909788867563,
"loss": 1.5871,
"step": 7500
},
{
"epoch": 4.862444017914267,
"grad_norm": 1.9447873830795288,
"learning_rate": 0.00020278950735764552,
"loss": 1.6006,
"step": 7600
},
{
"epoch": 4.926423544465771,
"grad_norm": 2.089735984802246,
"learning_rate": 0.00020150991682661547,
"loss": 1.5995,
"step": 7700
},
{
"epoch": 4.990403071017274,
"grad_norm": 2.056344985961914,
"learning_rate": 0.00020023032629558539,
"loss": 1.6122,
"step": 7800
},
{
"epoch": 5.054382597568778,
"grad_norm": 2.0974326133728027,
"learning_rate": 0.00019895073576455533,
"loss": 1.4254,
"step": 7900
},
{
"epoch": 5.1183621241202815,
"grad_norm": 2.250195264816284,
"learning_rate": 0.00019767114523352525,
"loss": 1.3966,
"step": 8000
},
{
"epoch": 5.182341650671785,
"grad_norm": 2.278472900390625,
"learning_rate": 0.00019639155470249517,
"loss": 1.4051,
"step": 8100
},
{
"epoch": 5.2463211772232885,
"grad_norm": 2.3049135208129883,
"learning_rate": 0.00019511196417146512,
"loss": 1.4147,
"step": 8200
},
{
"epoch": 5.310300703774792,
"grad_norm": 2.423823833465576,
"learning_rate": 0.00019383237364043506,
"loss": 1.4283,
"step": 8300
},
{
"epoch": 5.3742802303262955,
"grad_norm": 2.4221420288085938,
"learning_rate": 0.00019255278310940495,
"loss": 1.4538,
"step": 8400
},
{
"epoch": 5.438259756877799,
"grad_norm": 2.3886525630950928,
"learning_rate": 0.0001912731925783749,
"loss": 1.4435,
"step": 8500
},
{
"epoch": 5.502239283429303,
"grad_norm": 2.4025745391845703,
"learning_rate": 0.00018999360204734485,
"loss": 1.4512,
"step": 8600
},
{
"epoch": 5.566218809980806,
"grad_norm": 2.312255382537842,
"learning_rate": 0.00018871401151631477,
"loss": 1.4644,
"step": 8700
},
{
"epoch": 5.63019833653231,
"grad_norm": 2.2402162551879883,
"learning_rate": 0.00018743442098528469,
"loss": 1.4844,
"step": 8800
},
{
"epoch": 5.694177863083813,
"grad_norm": 2.3729898929595947,
"learning_rate": 0.00018615483045425463,
"loss": 1.4749,
"step": 8900
},
{
"epoch": 5.758157389635317,
"grad_norm": 2.137364149093628,
"learning_rate": 0.00018488803582853488,
"loss": 1.4875,
"step": 9000
},
{
"epoch": 5.82213691618682,
"grad_norm": 2.404008626937866,
"learning_rate": 0.00018360844529750477,
"loss": 1.4954,
"step": 9100
},
{
"epoch": 5.886116442738324,
"grad_norm": 2.167051076889038,
"learning_rate": 0.00018232885476647472,
"loss": 1.4923,
"step": 9200
},
{
"epoch": 5.950095969289827,
"grad_norm": 2.2124693393707275,
"learning_rate": 0.00018104926423544464,
"loss": 1.4972,
"step": 9300
},
{
"epoch": 6.014075495841331,
"grad_norm": 2.4574244022369385,
"learning_rate": 0.00017976967370441458,
"loss": 1.4535,
"step": 9400
},
{
"epoch": 6.078055022392834,
"grad_norm": 2.2989892959594727,
"learning_rate": 0.0001784900831733845,
"loss": 1.272,
"step": 9500
},
{
"epoch": 6.142034548944338,
"grad_norm": 2.8099708557128906,
"learning_rate": 0.00017721049264235442,
"loss": 1.3033,
"step": 9600
},
{
"epoch": 6.206014075495841,
"grad_norm": 2.516444206237793,
"learning_rate": 0.00017593090211132437,
"loss": 1.3066,
"step": 9700
},
{
"epoch": 6.269993602047345,
"grad_norm": 2.617293357849121,
"learning_rate": 0.0001746513115802943,
"loss": 1.3263,
"step": 9800
},
{
"epoch": 6.333973128598848,
"grad_norm": 2.6873817443847656,
"learning_rate": 0.0001733717210492642,
"loss": 1.3415,
"step": 9900
},
{
"epoch": 6.397952655150352,
"grad_norm": 2.558847427368164,
"learning_rate": 0.00017209213051823415,
"loss": 1.3599,
"step": 10000
},
{
"epoch": 6.461932181701855,
"grad_norm": 2.6037933826446533,
"learning_rate": 0.0001708125399872041,
"loss": 1.3486,
"step": 10100
},
{
"epoch": 6.525911708253359,
"grad_norm": 2.470381259918213,
"learning_rate": 0.000169532949456174,
"loss": 1.3639,
"step": 10200
},
{
"epoch": 6.589891234804862,
"grad_norm": 2.6497058868408203,
"learning_rate": 0.00016825335892514394,
"loss": 1.365,
"step": 10300
},
{
"epoch": 6.653870761356366,
"grad_norm": 2.8465592861175537,
"learning_rate": 0.00016697376839411388,
"loss": 1.3754,
"step": 10400
},
{
"epoch": 6.717850287907869,
"grad_norm": 2.4625210762023926,
"learning_rate": 0.0001656941778630838,
"loss": 1.3743,
"step": 10500
},
{
"epoch": 6.781829814459373,
"grad_norm": 2.633486747741699,
"learning_rate": 0.00016441458733205372,
"loss": 1.3786,
"step": 10600
},
{
"epoch": 6.8458093410108765,
"grad_norm": 2.799623727798462,
"learning_rate": 0.00016313499680102367,
"loss": 1.3837,
"step": 10700
},
{
"epoch": 6.90978886756238,
"grad_norm": 2.4427671432495117,
"learning_rate": 0.0001618554062699936,
"loss": 1.4229,
"step": 10800
},
{
"epoch": 6.9737683941138835,
"grad_norm": 2.680490016937256,
"learning_rate": 0.00016057581573896353,
"loss": 1.3956,
"step": 10900
},
{
"epoch": 7.037747920665387,
"grad_norm": 2.6439030170440674,
"learning_rate": 0.00015929622520793343,
"loss": 1.2917,
"step": 11000
},
{
"epoch": 7.1017274472168905,
"grad_norm": 2.618804454803467,
"learning_rate": 0.00015802943058221367,
"loss": 1.2111,
"step": 11100
},
{
"epoch": 7.165706973768394,
"grad_norm": 2.7387425899505615,
"learning_rate": 0.00015674984005118362,
"loss": 1.2036,
"step": 11200
},
{
"epoch": 7.229686500319898,
"grad_norm": 2.7440712451934814,
"learning_rate": 0.00015547024952015354,
"loss": 1.2316,
"step": 11300
},
{
"epoch": 7.293666026871401,
"grad_norm": 2.687152147293091,
"learning_rate": 0.00015419065898912346,
"loss": 1.2214,
"step": 11400
},
{
"epoch": 7.357645553422905,
"grad_norm": 2.7659342288970947,
"learning_rate": 0.0001529110684580934,
"loss": 1.2316,
"step": 11500
},
{
"epoch": 7.421625079974408,
"grad_norm": 2.877668857574463,
"learning_rate": 0.00015163147792706335,
"loss": 1.2567,
"step": 11600
},
{
"epoch": 7.485604606525912,
"grad_norm": 2.554941415786743,
"learning_rate": 0.00015035188739603324,
"loss": 1.2661,
"step": 11700
},
{
"epoch": 7.549584133077415,
"grad_norm": 2.9956531524658203,
"learning_rate": 0.0001490722968650032,
"loss": 1.2855,
"step": 11800
},
{
"epoch": 7.613563659628919,
"grad_norm": 2.6620912551879883,
"learning_rate": 0.00014779270633397313,
"loss": 1.288,
"step": 11900
},
{
"epoch": 7.677543186180422,
"grad_norm": 2.637007713317871,
"learning_rate": 0.00014651311580294305,
"loss": 1.2665,
"step": 12000
},
{
"epoch": 7.741522712731926,
"grad_norm": 2.8605268001556396,
"learning_rate": 0.00014523352527191297,
"loss": 1.2897,
"step": 12100
},
{
"epoch": 7.805502239283429,
"grad_norm": 2.8997604846954346,
"learning_rate": 0.0001439539347408829,
"loss": 1.3052,
"step": 12200
},
{
"epoch": 7.869481765834933,
"grad_norm": 2.889934778213501,
"learning_rate": 0.00014267434420985284,
"loss": 1.2934,
"step": 12300
},
{
"epoch": 7.933461292386436,
"grad_norm": 2.7243242263793945,
"learning_rate": 0.00014139475367882276,
"loss": 1.3035,
"step": 12400
},
{
"epoch": 7.99744081893794,
"grad_norm": 2.7266464233398438,
"learning_rate": 0.00014011516314779268,
"loss": 1.3032,
"step": 12500
},
{
"epoch": 8.061420345489443,
"grad_norm": 2.5178606510162354,
"learning_rate": 0.00013883557261676262,
"loss": 1.125,
"step": 12600
},
{
"epoch": 8.125399872040948,
"grad_norm": 2.5766046047210693,
"learning_rate": 0.00013755598208573254,
"loss": 1.1352,
"step": 12700
},
{
"epoch": 8.18937939859245,
"grad_norm": 2.6406075954437256,
"learning_rate": 0.0001362763915547025,
"loss": 1.1377,
"step": 12800
},
{
"epoch": 8.253358925143955,
"grad_norm": 2.6050360202789307,
"learning_rate": 0.0001349968010236724,
"loss": 1.1539,
"step": 12900
},
{
"epoch": 8.317338451695457,
"grad_norm": 3.0384466648101807,
"learning_rate": 0.00013371721049264235,
"loss": 1.1446,
"step": 13000
},
{
"epoch": 8.381317978246962,
"grad_norm": 2.728938102722168,
"learning_rate": 0.00013245041586692257,
"loss": 1.154,
"step": 13100
},
{
"epoch": 8.445297504798464,
"grad_norm": 2.723478078842163,
"learning_rate": 0.00013117082533589252,
"loss": 1.1583,
"step": 13200
},
{
"epoch": 8.509277031349969,
"grad_norm": 2.9142537117004395,
"learning_rate": 0.00012989123480486244,
"loss": 1.1697,
"step": 13300
},
{
"epoch": 8.573256557901471,
"grad_norm": 3.0678508281707764,
"learning_rate": 0.00012861164427383238,
"loss": 1.1759,
"step": 13400
},
{
"epoch": 8.637236084452976,
"grad_norm": 2.6915736198425293,
"learning_rate": 0.0001273320537428023,
"loss": 1.18,
"step": 13500
},
{
"epoch": 8.701215611004478,
"grad_norm": 2.8362019062042236,
"learning_rate": 0.00012605246321177222,
"loss": 1.1975,
"step": 13600
},
{
"epoch": 8.765195137555983,
"grad_norm": 2.8301174640655518,
"learning_rate": 0.00012477287268074214,
"loss": 1.2049,
"step": 13700
},
{
"epoch": 8.829174664107486,
"grad_norm": 2.866494655609131,
"learning_rate": 0.00012350607805502239,
"loss": 1.1916,
"step": 13800
},
{
"epoch": 8.89315419065899,
"grad_norm": 3.0079123973846436,
"learning_rate": 0.0001222264875239923,
"loss": 1.1894,
"step": 13900
},
{
"epoch": 8.957133717210493,
"grad_norm": 2.8487589359283447,
"learning_rate": 0.00012094689699296224,
"loss": 1.2077,
"step": 14000
},
{
"epoch": 9.021113243761997,
"grad_norm": 2.8493270874023438,
"learning_rate": 0.00011966730646193218,
"loss": 1.1668,
"step": 14100
},
{
"epoch": 9.0850927703135,
"grad_norm": 2.9474496841430664,
"learning_rate": 0.0001183877159309021,
"loss": 1.035,
"step": 14200
},
{
"epoch": 9.149072296865004,
"grad_norm": 2.907160758972168,
"learning_rate": 0.00011710812539987204,
"loss": 1.0505,
"step": 14300
},
{
"epoch": 9.213051823416507,
"grad_norm": 2.9124813079833984,
"learning_rate": 0.00011582853486884196,
"loss": 1.0573,
"step": 14400
},
{
"epoch": 9.277031349968011,
"grad_norm": 2.925597906112671,
"learning_rate": 0.00011454894433781189,
"loss": 1.0654,
"step": 14500
},
{
"epoch": 9.341010876519514,
"grad_norm": 2.844844102859497,
"learning_rate": 0.00011326935380678182,
"loss": 1.0703,
"step": 14600
},
{
"epoch": 9.404990403071018,
"grad_norm": 3.0153064727783203,
"learning_rate": 0.00011198976327575174,
"loss": 1.0727,
"step": 14700
},
{
"epoch": 9.46896992962252,
"grad_norm": 3.0144588947296143,
"learning_rate": 0.00011071017274472169,
"loss": 1.0791,
"step": 14800
},
{
"epoch": 9.532949456174025,
"grad_norm": 3.219855308532715,
"learning_rate": 0.0001094305822136916,
"loss": 1.0921,
"step": 14900
},
{
"epoch": 9.596928982725528,
"grad_norm": 3.0687131881713867,
"learning_rate": 0.00010815099168266154,
"loss": 1.0968,
"step": 15000
},
{
"epoch": 9.660908509277032,
"grad_norm": 2.7260611057281494,
"learning_rate": 0.00010687140115163146,
"loss": 1.0924,
"step": 15100
},
{
"epoch": 9.724888035828535,
"grad_norm": 3.266075372695923,
"learning_rate": 0.0001055918106206014,
"loss": 1.0871,
"step": 15200
},
{
"epoch": 9.78886756238004,
"grad_norm": 2.816058874130249,
"learning_rate": 0.00010431222008957132,
"loss": 1.1183,
"step": 15300
},
{
"epoch": 9.852847088931542,
"grad_norm": 2.7959651947021484,
"learning_rate": 0.00010303262955854126,
"loss": 1.1188,
"step": 15400
},
{
"epoch": 9.916826615483046,
"grad_norm": 2.996344566345215,
"learning_rate": 0.00010175303902751119,
"loss": 1.1112,
"step": 15500
},
{
"epoch": 9.980806142034549,
"grad_norm": 3.110300302505493,
"learning_rate": 0.00010047344849648112,
"loss": 1.13,
"step": 15600
},
{
"epoch": 10.044785668586053,
"grad_norm": 2.8949873447418213,
"learning_rate": 9.919385796545104e-05,
"loss": 1.0038,
"step": 15700
},
{
"epoch": 10.108765195137556,
"grad_norm": 2.7868170738220215,
"learning_rate": 9.791426743442099e-05,
"loss": 0.9668,
"step": 15800
},
{
"epoch": 10.17274472168906,
"grad_norm": 3.0076348781585693,
"learning_rate": 9.663467690339091e-05,
"loss": 0.9592,
"step": 15900
},
{
"epoch": 10.236724248240563,
"grad_norm": 3.313863515853882,
"learning_rate": 9.535508637236083e-05,
"loss": 0.9923,
"step": 16000
},
{
"epoch": 10.300703774792067,
"grad_norm": 3.0162906646728516,
"learning_rate": 9.407549584133077e-05,
"loss": 0.9932,
"step": 16100
},
{
"epoch": 10.36468330134357,
"grad_norm": 3.1555402278900146,
"learning_rate": 9.279590531030069e-05,
"loss": 0.9909,
"step": 16200
},
{
"epoch": 10.428662827895074,
"grad_norm": 2.6180832386016846,
"learning_rate": 9.151631477927062e-05,
"loss": 1.0052,
"step": 16300
},
{
"epoch": 10.492642354446577,
"grad_norm": 3.2279481887817383,
"learning_rate": 9.023672424824054e-05,
"loss": 1.0014,
"step": 16400
},
{
"epoch": 10.556621880998081,
"grad_norm": 3.1148877143859863,
"learning_rate": 8.895713371721049e-05,
"loss": 1.0179,
"step": 16500
},
{
"epoch": 10.620601407549584,
"grad_norm": 2.617116928100586,
"learning_rate": 8.767754318618041e-05,
"loss": 1.0234,
"step": 16600
},
{
"epoch": 10.684580934101088,
"grad_norm": 3.179914951324463,
"learning_rate": 8.639795265515034e-05,
"loss": 1.018,
"step": 16700
},
{
"epoch": 10.748560460652591,
"grad_norm": 3.002013921737671,
"learning_rate": 8.511836212412028e-05,
"loss": 1.03,
"step": 16800
},
{
"epoch": 10.812539987204095,
"grad_norm": 3.1604723930358887,
"learning_rate": 8.383877159309021e-05,
"loss": 1.0289,
"step": 16900
},
{
"epoch": 10.876519513755598,
"grad_norm": 3.0055463314056396,
"learning_rate": 8.255918106206013e-05,
"loss": 1.0244,
"step": 17000
},
{
"epoch": 10.940499040307103,
"grad_norm": 3.2063984870910645,
"learning_rate": 8.127959053103007e-05,
"loss": 1.0246,
"step": 17100
},
{
"epoch": 11.004478566858605,
"grad_norm": 2.9076807498931885,
"learning_rate": 7.999999999999999e-05,
"loss": 1.0284,
"step": 17200
},
{
"epoch": 11.06845809341011,
"grad_norm": 2.873387098312378,
"learning_rate": 7.872040946896993e-05,
"loss": 0.8981,
"step": 17300
},
{
"epoch": 11.132437619961612,
"grad_norm": 3.000307083129883,
"learning_rate": 7.744081893793986e-05,
"loss": 0.8994,
"step": 17400
},
{
"epoch": 11.196417146513117,
"grad_norm": 2.965081214904785,
"learning_rate": 7.616122840690978e-05,
"loss": 0.9159,
"step": 17500
},
{
"epoch": 11.26039667306462,
"grad_norm": 2.9889376163482666,
"learning_rate": 7.488163787587971e-05,
"loss": 0.9196,
"step": 17600
},
{
"epoch": 11.324376199616124,
"grad_norm": 3.2027907371520996,
"learning_rate": 7.360204734484964e-05,
"loss": 0.9258,
"step": 17700
},
{
"epoch": 11.388355726167626,
"grad_norm": 2.8633768558502197,
"learning_rate": 7.233525271912987e-05,
"loss": 0.9217,
"step": 17800
},
{
"epoch": 11.45233525271913,
"grad_norm": 3.171734571456909,
"learning_rate": 7.10556621880998e-05,
"loss": 0.92,
"step": 17900
},
{
"epoch": 11.516314779270633,
"grad_norm": 3.0337626934051514,
"learning_rate": 6.977607165706973e-05,
"loss": 0.9361,
"step": 18000
},
{
"epoch": 11.580294305822138,
"grad_norm": 3.2068841457366943,
"learning_rate": 6.849648112603966e-05,
"loss": 0.9255,
"step": 18100
},
{
"epoch": 11.64427383237364,
"grad_norm": 3.1199960708618164,
"learning_rate": 6.721689059500959e-05,
"loss": 0.9362,
"step": 18200
},
{
"epoch": 11.708253358925145,
"grad_norm": 3.16876220703125,
"learning_rate": 6.593730006397953e-05,
"loss": 0.9509,
"step": 18300
},
{
"epoch": 11.772232885476647,
"grad_norm": 2.9640047550201416,
"learning_rate": 6.465770953294944e-05,
"loss": 0.9338,
"step": 18400
},
{
"epoch": 11.836212412028152,
"grad_norm": 3.112344980239868,
"learning_rate": 6.337811900191938e-05,
"loss": 0.9417,
"step": 18500
},
{
"epoch": 11.900191938579654,
"grad_norm": 3.0994222164154053,
"learning_rate": 6.211132437619961e-05,
"loss": 0.9573,
"step": 18600
},
{
"epoch": 11.964171465131159,
"grad_norm": 3.336512327194214,
"learning_rate": 6.0831733845169535e-05,
"loss": 0.9523,
"step": 18700
},
{
"epoch": 12.028150991682661,
"grad_norm": 2.5776188373565674,
"learning_rate": 5.955214331413947e-05,
"loss": 0.9043,
"step": 18800
},
{
"epoch": 12.092130518234166,
"grad_norm": 2.4695932865142822,
"learning_rate": 5.8272552783109394e-05,
"loss": 0.8316,
"step": 18900
},
{
"epoch": 12.156110044785668,
"grad_norm": 2.912343740463257,
"learning_rate": 5.699296225207933e-05,
"loss": 0.8499,
"step": 19000
},
{
"epoch": 12.220089571337173,
"grad_norm": 2.9618356227874756,
"learning_rate": 5.571337172104926e-05,
"loss": 0.8406,
"step": 19100
},
{
"epoch": 12.284069097888676,
"grad_norm": 2.899482011795044,
"learning_rate": 5.4433781190019186e-05,
"loss": 0.8472,
"step": 19200
},
{
"epoch": 12.34804862444018,
"grad_norm": 3.3630785942077637,
"learning_rate": 5.315419065898912e-05,
"loss": 0.8585,
"step": 19300
},
{
"epoch": 12.412028150991683,
"grad_norm": 3.245290517807007,
"learning_rate": 5.187460012795905e-05,
"loss": 0.8593,
"step": 19400
},
{
"epoch": 12.476007677543187,
"grad_norm": 3.1242306232452393,
"learning_rate": 5.059500959692898e-05,
"loss": 0.8659,
"step": 19500
},
{
"epoch": 12.53998720409469,
"grad_norm": 3.265775442123413,
"learning_rate": 4.931541906589891e-05,
"loss": 0.879,
"step": 19600
},
{
"epoch": 12.603966730646194,
"grad_norm": 2.995530843734741,
"learning_rate": 4.8035828534868836e-05,
"loss": 0.8688,
"step": 19700
},
{
"epoch": 12.667946257197697,
"grad_norm": 3.1246113777160645,
"learning_rate": 4.675623800383877e-05,
"loss": 0.8759,
"step": 19800
},
{
"epoch": 12.731925783749201,
"grad_norm": 3.249753713607788,
"learning_rate": 4.54766474728087e-05,
"loss": 0.8663,
"step": 19900
},
{
"epoch": 12.795905310300704,
"grad_norm": 3.2970869541168213,
"learning_rate": 4.419705694177863e-05,
"loss": 0.8618,
"step": 20000
},
{
"epoch": 12.859884836852208,
"grad_norm": 3.212738275527954,
"learning_rate": 4.291746641074856e-05,
"loss": 0.8709,
"step": 20100
},
{
"epoch": 12.92386436340371,
"grad_norm": 3.064932107925415,
"learning_rate": 4.163787587971848e-05,
"loss": 0.8756,
"step": 20200
},
{
"epoch": 12.987843889955215,
"grad_norm": 3.0357518196105957,
"learning_rate": 4.035828534868841e-05,
"loss": 0.8792,
"step": 20300
},
{
"epoch": 13.051823416506718,
"grad_norm": 2.5418007373809814,
"learning_rate": 3.9078694817658345e-05,
"loss": 0.8018,
"step": 20400
},
{
"epoch": 13.115802943058222,
"grad_norm": 3.088637590408325,
"learning_rate": 3.779910428662827e-05,
"loss": 0.7951,
"step": 20500
},
{
"epoch": 13.179782469609725,
"grad_norm": 3.0088999271392822,
"learning_rate": 3.6519513755598204e-05,
"loss": 0.7759,
"step": 20600
},
{
"epoch": 13.24376199616123,
"grad_norm": 2.929150104522705,
"learning_rate": 3.523992322456814e-05,
"loss": 0.7967,
"step": 20700
},
{
"epoch": 13.307741522712732,
"grad_norm": 2.795482873916626,
"learning_rate": 3.396033269353806e-05,
"loss": 0.8023,
"step": 20800
},
{
"epoch": 13.371721049264236,
"grad_norm": 2.998296022415161,
"learning_rate": 3.2680742162507996e-05,
"loss": 0.7929,
"step": 20900
},
{
"epoch": 13.435700575815739,
"grad_norm": 3.09736967086792,
"learning_rate": 3.140115163147792e-05,
"loss": 0.801,
"step": 21000
},
{
"epoch": 13.499680102367243,
"grad_norm": 3.1578280925750732,
"learning_rate": 3.0134357005758153e-05,
"loss": 0.8051,
"step": 21100
},
{
"epoch": 13.563659628918746,
"grad_norm": 3.167719841003418,
"learning_rate": 2.8854766474728086e-05,
"loss": 0.8035,
"step": 21200
},
{
"epoch": 13.62763915547025,
"grad_norm": 3.2616937160491943,
"learning_rate": 2.7575175943698016e-05,
"loss": 0.8152,
"step": 21300
},
{
"epoch": 13.691618682021753,
"grad_norm": 3.2310030460357666,
"learning_rate": 2.6295585412667945e-05,
"loss": 0.8136,
"step": 21400
},
{
"epoch": 13.755598208573257,
"grad_norm": 3.1108453273773193,
"learning_rate": 2.5015994881637874e-05,
"loss": 0.8158,
"step": 21500
},
{
"epoch": 13.81957773512476,
"grad_norm": 2.875555992126465,
"learning_rate": 2.37364043506078e-05,
"loss": 0.8203,
"step": 21600
},
{
"epoch": 13.883557261676264,
"grad_norm": 2.9165468215942383,
"learning_rate": 2.2456813819577733e-05,
"loss": 0.8142,
"step": 21700
},
{
"epoch": 13.947536788227767,
"grad_norm": 2.839167356491089,
"learning_rate": 2.1177223288547663e-05,
"loss": 0.8075,
"step": 21800
},
{
"epoch": 14.011516314779271,
"grad_norm": 2.6591968536376953,
"learning_rate": 1.9897632757517592e-05,
"loss": 0.7897,
"step": 21900
},
{
"epoch": 14.075495841330774,
"grad_norm": 2.895771026611328,
"learning_rate": 1.861804222648752e-05,
"loss": 0.7396,
"step": 22000
},
{
"epoch": 14.139475367882278,
"grad_norm": 3.234828472137451,
"learning_rate": 1.733845169545745e-05,
"loss": 0.7564,
"step": 22100
},
{
"epoch": 14.203454894433781,
"grad_norm": 3.1565563678741455,
"learning_rate": 1.605886116442738e-05,
"loss": 0.7555,
"step": 22200
},
{
"epoch": 14.267434420985285,
"grad_norm": 2.6395761966705322,
"learning_rate": 1.4779270633397312e-05,
"loss": 0.752,
"step": 22300
},
{
"epoch": 14.331413947536788,
"grad_norm": 2.860470771789551,
"learning_rate": 1.3499680102367243e-05,
"loss": 0.7592,
"step": 22400
},
{
"epoch": 14.395393474088293,
"grad_norm": 3.0977256298065186,
"learning_rate": 1.222008957133717e-05,
"loss": 0.7538,
"step": 22500
},
{
"epoch": 14.459373000639795,
"grad_norm": 3.199491024017334,
"learning_rate": 1.09404990403071e-05,
"loss": 0.7554,
"step": 22600
},
{
"epoch": 14.5233525271913,
"grad_norm": 3.1228437423706055,
"learning_rate": 9.660908509277031e-06,
"loss": 0.7536,
"step": 22700
},
{
"epoch": 14.587332053742802,
"grad_norm": 3.0725913047790527,
"learning_rate": 8.38131797824696e-06,
"loss": 0.745,
"step": 22800
},
{
"epoch": 14.651311580294307,
"grad_norm": 2.7372710704803467,
"learning_rate": 7.101727447216891e-06,
"loss": 0.7761,
"step": 22900
},
{
"epoch": 14.71529110684581,
"grad_norm": 2.8115973472595215,
"learning_rate": 5.822136916186819e-06,
"loss": 0.7549,
"step": 23000
},
{
"epoch": 14.779270633397314,
"grad_norm": 3.0110971927642822,
"learning_rate": 4.5425463851567495e-06,
"loss": 0.7585,
"step": 23100
},
{
"epoch": 14.843250159948816,
"grad_norm": 3.1187989711761475,
"learning_rate": 3.2629558541266794e-06,
"loss": 0.7612,
"step": 23200
},
{
"epoch": 14.90722968650032,
"grad_norm": 3.022102117538452,
"learning_rate": 1.983365323096609e-06,
"loss": 0.7555,
"step": 23300
},
{
"epoch": 14.971209213051823,
"grad_norm": 3.1614649295806885,
"learning_rate": 7.037747920665386e-07,
"loss": 0.7546,
"step": 23400
}
],
"logging_steps": 100,
"max_steps": 23445,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 2500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.649122201906708e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}