{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999882826231301,
"eval_steps": 500,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006249267663945631,
"grad_norm": 6.042294502258301,
"learning_rate": 4.166666666666667e-06,
"loss": 0.7433,
"step": 10
},
{
"epoch": 0.012498535327891263,
"grad_norm": 3.4981777667999268,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6848,
"step": 20
},
{
"epoch": 0.018747802991836895,
"grad_norm": 7.156867980957031,
"learning_rate": 1.25e-05,
"loss": 0.6722,
"step": 30
},
{
"epoch": 0.024997070655782525,
"grad_norm": 3.301668167114258,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6924,
"step": 40
},
{
"epoch": 0.031246338319728156,
"grad_norm": 2.7308597564697266,
"learning_rate": 1.999991805061211e-05,
"loss": 0.6759,
"step": 50
},
{
"epoch": 0.03749560598367379,
"grad_norm": 5.730428695678711,
"learning_rate": 1.999704996306308e-05,
"loss": 0.7484,
"step": 60
},
{
"epoch": 0.04374487364761942,
"grad_norm": 2.7434427738189697,
"learning_rate": 1.999008574916082e-05,
"loss": 0.7298,
"step": 70
},
{
"epoch": 0.04999414131156505,
"grad_norm": 2.67563533782959,
"learning_rate": 1.997902826237712e-05,
"loss": 0.7381,
"step": 80
},
{
"epoch": 0.05624340897551068,
"grad_norm": 4.653203010559082,
"learning_rate": 1.9963882033334827e-05,
"loss": 0.716,
"step": 90
},
{
"epoch": 0.06249267663945631,
"grad_norm": 3.6553049087524414,
"learning_rate": 1.9944653267951507e-05,
"loss": 0.713,
"step": 100
},
{
"epoch": 0.06874194430340194,
"grad_norm": 3.856987953186035,
"learning_rate": 1.9921349844896655e-05,
"loss": 0.6886,
"step": 110
},
{
"epoch": 0.07499121196734758,
"grad_norm": 2.80169677734375,
"learning_rate": 1.9893981312363563e-05,
"loss": 0.7459,
"step": 120
},
{
"epoch": 0.0812404796312932,
"grad_norm": 9.439187049865723,
"learning_rate": 1.9862558884157067e-05,
"loss": 0.7131,
"step": 130
},
{
"epoch": 0.08748974729523884,
"grad_norm": 3.0188660621643066,
"learning_rate": 1.9827095435098926e-05,
"loss": 0.7419,
"step": 140
},
{
"epoch": 0.09373901495918448,
"grad_norm": 6.821515083312988,
"learning_rate": 1.9787605495752528e-05,
"loss": 0.7093,
"step": 150
},
{
"epoch": 0.0999882826231301,
"grad_norm": 3.1190407276153564,
"learning_rate": 1.9744105246469264e-05,
"loss": 0.7162,
"step": 160
},
{
"epoch": 0.10623755028707574,
"grad_norm": 3.4266233444213867,
"learning_rate": 1.9696612510758878e-05,
"loss": 0.7501,
"step": 170
},
{
"epoch": 0.11248681795102136,
"grad_norm": 6.012339115142822,
"learning_rate": 1.964514674798659e-05,
"loss": 0.7228,
"step": 180
},
{
"epoch": 0.118736085614967,
"grad_norm": 2.518491268157959,
"learning_rate": 1.9589729045399935e-05,
"loss": 0.7283,
"step": 190
},
{
"epoch": 0.12498535327891262,
"grad_norm": 2.739582061767578,
"learning_rate": 1.953038210948861e-05,
"loss": 0.7464,
"step": 200
},
{
"epoch": 0.13123462094285826,
"grad_norm": 2.70062518119812,
"learning_rate": 1.9467130256680867e-05,
"loss": 0.721,
"step": 210
},
{
"epoch": 0.13748388860680388,
"grad_norm": 2.4166440963745117,
"learning_rate": 1.9399999403380266e-05,
"loss": 0.711,
"step": 220
},
{
"epoch": 0.14373315627074953,
"grad_norm": 3.875697374343872,
"learning_rate": 1.932901705534683e-05,
"loss": 0.7296,
"step": 230
},
{
"epoch": 0.14998242393469516,
"grad_norm": 3.1276426315307617,
"learning_rate": 1.9254212296427043e-05,
"loss": 0.7079,
"step": 240
},
{
"epoch": 0.15623169159864078,
"grad_norm": 3.067948341369629,
"learning_rate": 1.9175615776637212e-05,
"loss": 0.7231,
"step": 250
},
{
"epoch": 0.1624809592625864,
"grad_norm": 2.604984760284424,
"learning_rate": 1.9093259699605125e-05,
"loss": 0.7202,
"step": 260
},
{
"epoch": 0.16873022692653206,
"grad_norm": 2.3994998931884766,
"learning_rate": 1.900717780937514e-05,
"loss": 0.7129,
"step": 270
},
{
"epoch": 0.17497949459047768,
"grad_norm": 2.617325782775879,
"learning_rate": 1.8917405376582144e-05,
"loss": 0.7142,
"step": 280
},
{
"epoch": 0.1812287622544233,
"grad_norm": 2.6472878456115723,
"learning_rate": 1.8823979183999965e-05,
"loss": 0.7168,
"step": 290
},
{
"epoch": 0.18747802991836895,
"grad_norm": 2.1472179889678955,
"learning_rate": 1.8726937511470247e-05,
"loss": 0.7209,
"step": 300
},
{
"epoch": 0.19372729758231458,
"grad_norm": 2.5548176765441895,
"learning_rate": 1.8626320120217922e-05,
"loss": 0.7278,
"step": 310
},
{
"epoch": 0.1999765652462602,
"grad_norm": 2.2397587299346924,
"learning_rate": 1.8522168236559693e-05,
"loss": 0.7229,
"step": 320
},
{
"epoch": 0.20622583291020583,
"grad_norm": 2.72623872756958,
"learning_rate": 1.8414524535012244e-05,
"loss": 0.7003,
"step": 330
},
{
"epoch": 0.21247510057415148,
"grad_norm": 2.6225929260253906,
"learning_rate": 1.8303433120807043e-05,
"loss": 0.729,
"step": 340
},
{
"epoch": 0.2187243682380971,
"grad_norm": 4.628861427307129,
"learning_rate": 1.8188939511818965e-05,
"loss": 0.7437,
"step": 350
},
{
"epoch": 0.22497363590204272,
"grad_norm": 2.2117223739624023,
"learning_rate": 1.8071090619916095e-05,
"loss": 0.7101,
"step": 360
},
{
"epoch": 0.23122290356598835,
"grad_norm": 2.2710976600646973,
"learning_rate": 1.7949934731738348e-05,
"loss": 0.7297,
"step": 370
},
{
"epoch": 0.237472171229934,
"grad_norm": 2.2278854846954346,
"learning_rate": 1.7825521488912833e-05,
"loss": 0.7275,
"step": 380
},
{
"epoch": 0.24372143889387962,
"grad_norm": 2.332300901412964,
"learning_rate": 1.7697901867713997e-05,
"loss": 0.7103,
"step": 390
},
{
"epoch": 0.24997070655782525,
"grad_norm": 1.9451990127563477,
"learning_rate": 1.7567128158176955e-05,
"loss": 0.6981,
"step": 400
},
{
"epoch": 0.2562199742217709,
"grad_norm": 2.7532973289489746,
"learning_rate": 1.7433253942672497e-05,
"loss": 0.7015,
"step": 410
},
{
"epoch": 0.2624692418857165,
"grad_norm": 2.6847236156463623,
"learning_rate": 1.7296334073952606e-05,
"loss": 0.7242,
"step": 420
},
{
"epoch": 0.26871850954966214,
"grad_norm": 2.6689813137054443,
"learning_rate": 1.7156424652675433e-05,
"loss": 0.7115,
"step": 430
},
{
"epoch": 0.27496777721360777,
"grad_norm": 2.2913591861724854,
"learning_rate": 1.7013583004418994e-05,
"loss": 0.6912,
"step": 440
},
{
"epoch": 0.2812170448775534,
"grad_norm": 2.5969536304473877,
"learning_rate": 1.6867867656192946e-05,
"loss": 0.7074,
"step": 450
},
{
"epoch": 0.28746631254149907,
"grad_norm": 2.246922731399536,
"learning_rate": 1.6719338312458123e-05,
"loss": 0.7048,
"step": 460
},
{
"epoch": 0.2937155802054447,
"grad_norm": 2.251768112182617,
"learning_rate": 1.656805583066361e-05,
"loss": 0.7085,
"step": 470
},
{
"epoch": 0.2999648478693903,
"grad_norm": 2.249643325805664,
"learning_rate": 1.6414082196311402e-05,
"loss": 0.6897,
"step": 480
},
{
"epoch": 0.30621411553333594,
"grad_norm": 1.974482536315918,
"learning_rate": 1.6257480497558873e-05,
"loss": 0.6941,
"step": 490
},
{
"epoch": 0.31246338319728156,
"grad_norm": 2.1665918827056885,
"learning_rate": 1.6098314899369446e-05,
"loss": 0.7061,
"step": 500
},
{
"epoch": 0.3187126508612272,
"grad_norm": 2.391334295272827,
"learning_rate": 1.5936650617222063e-05,
"loss": 0.6958,
"step": 510
},
{
"epoch": 0.3249619185251728,
"grad_norm": 2.047011375427246,
"learning_rate": 1.5772553890390196e-05,
"loss": 0.7177,
"step": 520
},
{
"epoch": 0.3312111861891185,
"grad_norm": 1.8378033638000488,
"learning_rate": 1.560609195480142e-05,
"loss": 0.6803,
"step": 530
},
{
"epoch": 0.3374604538530641,
"grad_norm": 2.126997232437134,
"learning_rate": 1.5437333015488586e-05,
"loss": 0.6694,
"step": 540
},
{
"epoch": 0.34370972151700974,
"grad_norm": 2.4227328300476074,
"learning_rate": 1.526634621864395e-05,
"loss": 0.6773,
"step": 550
},
{
"epoch": 0.34995898918095536,
"grad_norm": 2.368255615234375,
"learning_rate": 1.5093201623287631e-05,
"loss": 0.7048,
"step": 560
},
{
"epoch": 0.356208256844901,
"grad_norm": 1.9544981718063354,
"learning_rate": 1.4917970172562122e-05,
"loss": 0.686,
"step": 570
},
{
"epoch": 0.3624575245088466,
"grad_norm": 2.019627809524536,
"learning_rate": 1.4740723664664483e-05,
"loss": 0.6911,
"step": 580
},
{
"epoch": 0.36870679217279223,
"grad_norm": 1.8915493488311768,
"learning_rate": 1.4561534723428205e-05,
"loss": 0.6828,
"step": 590
},
{
"epoch": 0.3749560598367379,
"grad_norm": 2.211825370788574,
"learning_rate": 1.4380476768566825e-05,
"loss": 0.6636,
"step": 600
},
{
"epoch": 0.38120532750068353,
"grad_norm": 1.9755256175994873,
"learning_rate": 1.4197623985591373e-05,
"loss": 0.6902,
"step": 610
},
{
"epoch": 0.38745459516462916,
"grad_norm": 1.9239295721054077,
"learning_rate": 1.4013051295414108e-05,
"loss": 0.6795,
"step": 620
},
{
"epoch": 0.3937038628285748,
"grad_norm": 2.1901464462280273,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.7005,
"step": 630
},
{
"epoch": 0.3999531304925204,
"grad_norm": 1.9579427242279053,
"learning_rate": 1.3639049369634878e-05,
"loss": 0.6789,
"step": 640
},
{
"epoch": 0.406202398156466,
"grad_norm": 1.819069266319275,
"learning_rate": 1.344977337515404e-05,
"loss": 0.6808,
"step": 650
},
{
"epoch": 0.41245166582041165,
"grad_norm": 2.11354923248291,
"learning_rate": 1.3259083892925633e-05,
"loss": 0.6798,
"step": 660
},
{
"epoch": 0.41870093348435733,
"grad_norm": 2.0537407398223877,
"learning_rate": 1.3067059054820184e-05,
"loss": 0.6587,
"step": 670
},
{
"epoch": 0.42495020114830295,
"grad_norm": 2.4250621795654297,
"learning_rate": 1.2873777539848284e-05,
"loss": 0.6699,
"step": 680
},
{
"epoch": 0.4311994688122486,
"grad_norm": 2.0022125244140625,
"learning_rate": 1.2679318541923131e-05,
"loss": 0.6644,
"step": 690
},
{
"epoch": 0.4374487364761942,
"grad_norm": 1.9328835010528564,
"learning_rate": 1.248376173741215e-05,
"loss": 0.68,
"step": 700
},
{
"epoch": 0.4436980041401398,
"grad_norm": 2.0376391410827637,
"learning_rate": 1.2287187252490914e-05,
"loss": 0.684,
"step": 710
},
{
"epoch": 0.44994727180408545,
"grad_norm": 1.9673105478286743,
"learning_rate": 1.2089675630312755e-05,
"loss": 0.6515,
"step": 720
},
{
"epoch": 0.45619653946803107,
"grad_norm": 1.7611652612686157,
"learning_rate": 1.1891307798007536e-05,
"loss": 0.6609,
"step": 730
},
{
"epoch": 0.4624458071319767,
"grad_norm": 1.9665021896362305,
"learning_rate": 1.1692165033523117e-05,
"loss": 0.6801,
"step": 740
},
{
"epoch": 0.4686950747959224,
"grad_norm": 2.349421977996826,
"learning_rate": 1.1492328932323022e-05,
"loss": 0.6831,
"step": 750
},
{
"epoch": 0.474944342459868,
"grad_norm": 2.5945515632629395,
"learning_rate": 1.1291881373954066e-05,
"loss": 0.6725,
"step": 760
},
{
"epoch": 0.4811936101238136,
"grad_norm": 1.7327567338943481,
"learning_rate": 1.109090448849755e-05,
"loss": 0.6394,
"step": 770
},
{
"epoch": 0.48744287778775924,
"grad_norm": 1.980264663696289,
"learning_rate": 1.088948062291783e-05,
"loss": 0.6682,
"step": 780
},
{
"epoch": 0.49369214545170487,
"grad_norm": 2.313495635986328,
"learning_rate": 1.0687692307321984e-05,
"loss": 0.6353,
"step": 790
},
{
"epoch": 0.4999414131156505,
"grad_norm": 2.176020860671997,
"learning_rate": 1.0485622221144485e-05,
"loss": 0.6433,
"step": 800
},
{
"epoch": 0.5061906807795962,
"grad_norm": 2.5199544429779053,
"learning_rate": 1.0283353159270644e-05,
"loss": 0.6719,
"step": 810
},
{
"epoch": 0.5124399484435418,
"grad_norm": 2.6333529949188232,
"learning_rate": 1.0080967998112787e-05,
"loss": 0.6576,
"step": 820
},
{
"epoch": 0.5186892161074874,
"grad_norm": 2.367847204208374,
"learning_rate": 9.878549661653013e-06,
"loss": 0.6547,
"step": 830
},
{
"epoch": 0.524938483771433,
"grad_norm": 1.736178994178772,
"learning_rate": 9.676181087466444e-06,
"loss": 0.6361,
"step": 840
},
{
"epoch": 0.5311877514353787,
"grad_norm": 1.8639039993286133,
"learning_rate": 9.473945192738933e-06,
"loss": 0.6497,
"step": 850
},
{
"epoch": 0.5374370190993243,
"grad_norm": 1.7635918855667114,
"learning_rate": 9.27192484029312e-06,
"loss": 0.6654,
"step": 860
},
{
"epoch": 0.5436862867632699,
"grad_norm": 2.0125083923339844,
"learning_rate": 9.070202804636745e-06,
"loss": 0.6397,
"step": 870
},
{
"epoch": 0.5499355544272155,
"grad_norm": 2.078883171081543,
"learning_rate": 8.868861738047158e-06,
"loss": 0.6603,
"step": 880
},
{
"epoch": 0.5561848220911612,
"grad_norm": 1.781632423400879,
"learning_rate": 8.667984136705927e-06,
"loss": 0.6556,
"step": 890
},
{
"epoch": 0.5624340897551068,
"grad_norm": 2.0591318607330322,
"learning_rate": 8.46765230689737e-06,
"loss": 0.6258,
"step": 900
},
{
"epoch": 0.5686833574190524,
"grad_norm": 1.6111246347427368,
"learning_rate": 8.267948331284923e-06,
"loss": 0.6472,
"step": 910
},
{
"epoch": 0.5749326250829981,
"grad_norm": 2.417381763458252,
"learning_rate": 8.068954035279121e-06,
"loss": 0.6226,
"step": 920
},
{
"epoch": 0.5811818927469438,
"grad_norm": 2.185147762298584,
"learning_rate": 7.870750953510983e-06,
"loss": 0.617,
"step": 930
},
{
"epoch": 0.5874311604108894,
"grad_norm": 2.3496596813201904,
"learning_rate": 7.673420296424541e-06,
"loss": 0.633,
"step": 940
},
{
"epoch": 0.593680428074835,
"grad_norm": 2.4552910327911377,
"learning_rate": 7.4770429170022e-06,
"loss": 0.6312,
"step": 950
},
{
"epoch": 0.5999296957387806,
"grad_norm": 2.7454309463500977,
"learning_rate": 7.2816992776365714e-06,
"loss": 0.6252,
"step": 960
},
{
"epoch": 0.6061789634027263,
"grad_norm": 2.967708110809326,
"learning_rate": 7.08746941716232e-06,
"loss": 0.634,
"step": 970
},
{
"epoch": 0.6124282310666719,
"grad_norm": 2.755345582962036,
"learning_rate": 6.894432918061579e-06,
"loss": 0.6216,
"step": 980
},
{
"epoch": 0.6186774987306175,
"grad_norm": 1.987317681312561,
"learning_rate": 6.702668873856339e-06,
"loss": 0.643,
"step": 990
},
{
"epoch": 0.6249267663945631,
"grad_norm": 2.4610655307769775,
"learning_rate": 6.5122558567011775e-06,
"loss": 0.6409,
"step": 1000
},
{
"epoch": 0.6311760340585088,
"grad_norm": 1.9931960105895996,
"learning_rate": 6.323271885189636e-06,
"loss": 0.6418,
"step": 1010
},
{
"epoch": 0.6374253017224544,
"grad_norm": 3.0215256214141846,
"learning_rate": 6.135794392387353e-06,
"loss": 0.635,
"step": 1020
},
{
"epoch": 0.6436745693864,
"grad_norm": 1.752156376838684,
"learning_rate": 5.949900194105167e-06,
"loss": 0.6326,
"step": 1030
},
{
"epoch": 0.6499238370503456,
"grad_norm": 1.8890056610107422,
"learning_rate": 5.765665457425102e-06,
"loss": 0.6338,
"step": 1040
},
{
"epoch": 0.6561731047142912,
"grad_norm": 2.2345621585845947,
"learning_rate": 5.5831656694921465e-06,
"loss": 0.6262,
"step": 1050
},
{
"epoch": 0.662422372378237,
"grad_norm": 1.983519196510315,
"learning_rate": 5.40247560658467e-06,
"loss": 0.6421,
"step": 1060
},
{
"epoch": 0.6686716400421826,
"grad_norm": 2.091252326965332,
"learning_rate": 5.223669303476041e-06,
"loss": 0.6166,
"step": 1070
},
{
"epoch": 0.6749209077061282,
"grad_norm": 1.9888851642608643,
"learning_rate": 5.046820023100129e-06,
"loss": 0.6033,
"step": 1080
},
{
"epoch": 0.6811701753700738,
"grad_norm": 3.788809061050415,
"learning_rate": 4.872000226533001e-06,
"loss": 0.6292,
"step": 1090
},
{
"epoch": 0.6874194430340195,
"grad_norm": 2.0238468647003174,
"learning_rate": 4.699281543303222e-06,
"loss": 0.6238,
"step": 1100
},
{
"epoch": 0.6936687106979651,
"grad_norm": 1.7076054811477661,
"learning_rate": 4.528734742042803e-06,
"loss": 0.609,
"step": 1110
},
{
"epoch": 0.6999179783619107,
"grad_norm": 2.1124353408813477,
"learning_rate": 4.360429701490935e-06,
"loss": 0.6116,
"step": 1120
},
{
"epoch": 0.7061672460258563,
"grad_norm": 1.8266746997833252,
"learning_rate": 4.194435381862343e-06,
"loss": 0.6217,
"step": 1130
},
{
"epoch": 0.712416513689802,
"grad_norm": 1.7669917345046997,
"learning_rate": 4.03081979659195e-06,
"loss": 0.6247,
"step": 1140
},
{
"epoch": 0.7186657813537476,
"grad_norm": 2.847994565963745,
"learning_rate": 3.869649984467504e-06,
"loss": 0.6092,
"step": 1150
},
{
"epoch": 0.7249150490176932,
"grad_norm": 5.524293899536133,
"learning_rate": 3.7109919821615546e-06,
"loss": 0.6063,
"step": 1160
},
{
"epoch": 0.7311643166816388,
"grad_norm": 2.3931970596313477,
"learning_rate": 3.5549107971739905e-06,
"loss": 0.6154,
"step": 1170
},
{
"epoch": 0.7374135843455845,
"grad_norm": 1.9852912425994873,
"learning_rate": 3.4014703811963024e-06,
"loss": 0.6119,
"step": 1180
},
{
"epoch": 0.7436628520095301,
"grad_norm": 1.8052198886871338,
"learning_rate": 3.2507336039084315e-06,
"loss": 0.6043,
"step": 1190
},
{
"epoch": 0.7499121196734758,
"grad_norm": 1.8438955545425415,
"learning_rate": 3.1027622272189572e-06,
"loss": 0.6308,
"step": 1200
},
{
"epoch": 0.7561613873374214,
"grad_norm": 2.11885929107666,
"learning_rate": 2.9576168799591663e-06,
"loss": 0.5984,
"step": 1210
},
{
"epoch": 0.7624106550013671,
"grad_norm": 2.6443002223968506,
"learning_rate": 2.8153570330413925e-06,
"loss": 0.6139,
"step": 1220
},
{
"epoch": 0.7686599226653127,
"grad_norm": 2.413456678390503,
"learning_rate": 2.6760409750917925e-06,
"loss": 0.5974,
"step": 1230
},
{
"epoch": 0.7749091903292583,
"grad_norm": 2.166473627090454,
"learning_rate": 2.5397257885675396e-06,
"loss": 0.6034,
"step": 1240
},
{
"epoch": 0.7811584579932039,
"grad_norm": 1.9241559505462646,
"learning_rate": 2.406467326368237e-06,
"loss": 0.6026,
"step": 1250
},
{
"epoch": 0.7874077256571496,
"grad_norm": 2.043766736984253,
"learning_rate": 2.2763201889510987e-06,
"loss": 0.6216,
"step": 1260
},
{
"epoch": 0.7936569933210952,
"grad_norm": 1.6110656261444092,
"learning_rate": 2.149337701959325e-06,
"loss": 0.6198,
"step": 1270
},
{
"epoch": 0.7999062609850408,
"grad_norm": 1.7661818265914917,
"learning_rate": 2.025571894372794e-06,
"loss": 0.5838,
"step": 1280
},
{
"epoch": 0.8061555286489864,
"grad_norm": 1.8362523317337036,
"learning_rate": 1.9050734771900414e-06,
"loss": 0.5999,
"step": 1290
},
{
"epoch": 0.812404796312932,
"grad_norm": 1.9224945306777954,
"learning_rate": 1.7878918226502816e-06,
"loss": 0.5956,
"step": 1300
},
{
"epoch": 0.8186540639768777,
"grad_norm": 2.024622678756714,
"learning_rate": 1.6740749440039262e-06,
"loss": 0.6083,
"step": 1310
},
{
"epoch": 0.8249033316408233,
"grad_norm": 2.0744571685791016,
"learning_rate": 1.5636694758399563e-06,
"loss": 0.5917,
"step": 1320
},
{
"epoch": 0.8311525993047689,
"grad_norm": 2.3073995113372803,
"learning_rate": 1.4567206549781699e-06,
"loss": 0.5929,
"step": 1330
},
{
"epoch": 0.8374018669687147,
"grad_norm": 1.8136156797409058,
"learning_rate": 1.3532723019341376e-06,
"loss": 0.5936,
"step": 1340
},
{
"epoch": 0.8436511346326603,
"grad_norm": 1.8207391500473022,
"learning_rate": 1.2533668029644751e-06,
"loss": 0.5933,
"step": 1350
},
{
"epoch": 0.8499004022966059,
"grad_norm": 2.001443386077881,
"learning_rate": 1.1570450926997657e-06,
"loss": 0.5891,
"step": 1360
},
{
"epoch": 0.8561496699605515,
"grad_norm": 1.8668729066848755,
"learning_rate": 1.064346637372271e-06,
"loss": 0.5799,
"step": 1370
},
{
"epoch": 0.8623989376244972,
"grad_norm": 2.3950822353363037,
"learning_rate": 9.753094186453028e-07,
"loss": 0.5994,
"step": 1380
},
{
"epoch": 0.8686482052884428,
"grad_norm": 1.9551249742507935,
"learning_rate": 8.89969918050847e-07,
"loss": 0.6119,
"step": 1390
},
{
"epoch": 0.8748974729523884,
"grad_norm": 1.774511456489563,
"learning_rate": 8.083631020418792e-07,
"loss": 0.5958,
"step": 1400
},
{
"epoch": 0.881146740616334,
"grad_norm": 1.6152182817459106,
"learning_rate": 7.305224076654127e-07,
"loss": 0.6025,
"step": 1410
},
{
"epoch": 0.8873960082802796,
"grad_norm": 3.579341173171997,
"learning_rate": 6.564797288622371e-07,
"loss": 0.5985,
"step": 1420
},
{
"epoch": 0.8936452759442253,
"grad_norm": 1.9974656105041504,
"learning_rate": 5.86265403398899e-07,
"loss": 0.5992,
"step": 1430
},
{
"epoch": 0.8998945436081709,
"grad_norm": 1.8749598264694214,
"learning_rate": 5.199082004372958e-07,
"loss": 0.5714,
"step": 1440
},
{
"epoch": 0.9061438112721165,
"grad_norm": 2.20654296875,
"learning_rate": 4.5743530874699293e-07,
"loss": 0.5967,
"step": 1450
},
{
"epoch": 0.9123930789360621,
"grad_norm": 1.7783575057983398,
"learning_rate": 3.988723255650728e-07,
"loss": 0.5883,
"step": 1460
},
{
"epoch": 0.9186423466000078,
"grad_norm": 1.8559459447860718,
"learning_rate": 3.442432461080858e-07,
"loss": 0.6041,
"step": 1470
},
{
"epoch": 0.9248916142639534,
"grad_norm": 1.6267383098602295,
"learning_rate": 2.935704537404083e-07,
"loss": 0.6076,
"step": 1480
},
{
"epoch": 0.9311408819278991,
"grad_norm": 1.5899640321731567,
"learning_rate": 2.468747108030289e-07,
"loss": 0.593,
"step": 1490
},
{
"epoch": 0.9373901495918447,
"grad_norm": 1.630462884902954,
"learning_rate": 2.0417515010652032e-07,
"loss": 0.5928,
"step": 1500
},
{
"epoch": 0.9436394172557904,
"grad_norm": 1.6635667085647583,
"learning_rate": 1.6548926709168634e-07,
"loss": 0.6025,
"step": 1510
},
{
"epoch": 0.949888684919736,
"grad_norm": 1.7267752885818481,
"learning_rate": 1.30832912661093e-07,
"loss": 0.5937,
"step": 1520
},
{
"epoch": 0.9561379525836816,
"grad_norm": 1.8789523839950562,
"learning_rate": 1.0022028668442374e-07,
"loss": 0.5874,
"step": 1530
},
{
"epoch": 0.9623872202476272,
"grad_norm": 1.6144094467163086,
"learning_rate": 7.366393218031564e-08,
"loss": 0.6099,
"step": 1540
},
{
"epoch": 0.9686364879115729,
"grad_norm": 1.5856353044509888,
"learning_rate": 5.1174730177064866e-08,
"loss": 0.5902,
"step": 1550
},
{
"epoch": 0.9748857555755185,
"grad_norm": 1.5565356016159058,
"learning_rate": 3.2761895254306285e-08,
"loss": 0.5852,
"step": 1560
},
{
"epoch": 0.9811350232394641,
"grad_norm": 1.7339156866073608,
"learning_rate": 1.8432971767488038e-08,
"loss": 0.5962,
"step": 1570
},
{
"epoch": 0.9873842909034097,
"grad_norm": 1.5991542339324951,
"learning_rate": 8.193830756699773e-09,
"loss": 0.6002,
"step": 1580
},
{
"epoch": 0.9936335585673554,
"grad_norm": 1.7472448348999023,
"learning_rate": 2.0486675411102165e-09,
"loss": 0.5943,
"step": 1590
},
{
"epoch": 0.999882826231301,
"grad_norm": 1.9169718027114868,
"learning_rate": 0.0,
"loss": 0.5911,
"step": 1600
},
{
"epoch": 0.999882826231301,
"step": 1600,
"total_flos": 1.7902479199415828e+19,
"train_loss": 0.6576289132237434,
"train_runtime": 37752.4225,
"train_samples_per_second": 2.713,
"train_steps_per_second": 0.042
}
],
"logging_steps": 10,
"max_steps": 1600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 256,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7902479199415828e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}