{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995759717314487,
"eval_steps": 200,
"global_step": 884,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011307420494699647,
"grad_norm": 8.3125,
"learning_rate": 3.3707865168539325e-07,
"loss": 0.493535578250885,
"step": 1,
"token_acc": 0.8620689655172413
},
{
"epoch": 0.005653710247349823,
"grad_norm": 10.0625,
"learning_rate": 1.6853932584269663e-06,
"loss": 0.4733062982559204,
"step": 5,
"token_acc": 0.8886255924170616
},
{
"epoch": 0.011307420494699646,
"grad_norm": 7.8125,
"learning_rate": 3.3707865168539327e-06,
"loss": 0.575874137878418,
"step": 10,
"token_acc": 0.8524590163934426
},
{
"epoch": 0.01696113074204947,
"grad_norm": 10.625,
"learning_rate": 5.056179775280899e-06,
"loss": 0.5115116596221924,
"step": 15,
"token_acc": 0.8833652007648184
},
{
"epoch": 0.022614840989399292,
"grad_norm": 9.1875,
"learning_rate": 6.741573033707865e-06,
"loss": 0.5164161682128906,
"step": 20,
"token_acc": 0.858195211786372
},
{
"epoch": 0.028268551236749116,
"grad_norm": 7.28125,
"learning_rate": 8.426966292134832e-06,
"loss": 0.500755786895752,
"step": 25,
"token_acc": 0.8675623800383877
},
{
"epoch": 0.03392226148409894,
"grad_norm": 11.4375,
"learning_rate": 1.0112359550561798e-05,
"loss": 0.45065789222717284,
"step": 30,
"token_acc": 0.8818011257035647
},
{
"epoch": 0.039575971731448764,
"grad_norm": 9.375,
"learning_rate": 1.1797752808988765e-05,
"loss": 0.5452223300933838,
"step": 35,
"token_acc": 0.8604651162790697
},
{
"epoch": 0.045229681978798585,
"grad_norm": 12.0,
"learning_rate": 1.348314606741573e-05,
"loss": 0.46700444221496584,
"step": 40,
"token_acc": 0.8830188679245283
},
{
"epoch": 0.05088339222614841,
"grad_norm": 8.4375,
"learning_rate": 1.5168539325842698e-05,
"loss": 0.4567443370819092,
"step": 45,
"token_acc": 0.8964879852125693
},
{
"epoch": 0.05653710247349823,
"grad_norm": 13.5,
"learning_rate": 1.6853932584269665e-05,
"loss": 0.5110898494720459,
"step": 50,
"token_acc": 0.864406779661017
},
{
"epoch": 0.06219081272084806,
"grad_norm": 5.0,
"learning_rate": 1.853932584269663e-05,
"loss": 0.4209805965423584,
"step": 55,
"token_acc": 0.8962264150943396
},
{
"epoch": 0.06784452296819787,
"grad_norm": 8.125,
"learning_rate": 2.0224719101123596e-05,
"loss": 0.4727331638336182,
"step": 60,
"token_acc": 0.891941391941392
},
{
"epoch": 0.0734982332155477,
"grad_norm": 11.125,
"learning_rate": 2.1910112359550563e-05,
"loss": 0.390924072265625,
"step": 65,
"token_acc": 0.8983364140480592
},
{
"epoch": 0.07915194346289753,
"grad_norm": 7.09375,
"learning_rate": 2.359550561797753e-05,
"loss": 0.3485325813293457,
"step": 70,
"token_acc": 0.9133709981167608
},
{
"epoch": 0.08480565371024736,
"grad_norm": 9.875,
"learning_rate": 2.5280898876404494e-05,
"loss": 0.4826796531677246,
"step": 75,
"token_acc": 0.8802946593001841
},
{
"epoch": 0.09045936395759717,
"grad_norm": 8.5,
"learning_rate": 2.696629213483146e-05,
"loss": 0.4428101539611816,
"step": 80,
"token_acc": 0.8783269961977186
},
{
"epoch": 0.096113074204947,
"grad_norm": 9.375,
"learning_rate": 2.865168539325843e-05,
"loss": 0.48508028984069823,
"step": 85,
"token_acc": 0.886654478976234
},
{
"epoch": 0.10176678445229682,
"grad_norm": 13.5,
"learning_rate": 2.999989459318379e-05,
"loss": 0.4538856506347656,
"step": 90,
"token_acc": 0.9044943820224719
},
{
"epoch": 0.10742049469964664,
"grad_norm": 8.4375,
"learning_rate": 2.999620552744515e-05,
"loss": 0.4510812282562256,
"step": 95,
"token_acc": 0.9045801526717557
},
{
"epoch": 0.11307420494699646,
"grad_norm": 16.5,
"learning_rate": 2.998724776679495e-05,
"loss": 0.49095354080200193,
"step": 100,
"token_acc": 0.8860294117647058
},
{
"epoch": 0.11872791519434629,
"grad_norm": 9.875,
"learning_rate": 2.997302480819445e-05,
"loss": 0.41774497032165525,
"step": 105,
"token_acc": 0.8880597014925373
},
{
"epoch": 0.12438162544169612,
"grad_norm": 7.15625,
"learning_rate": 2.9953542204050917e-05,
"loss": 0.30308961868286133,
"step": 110,
"token_acc": 0.9306049822064056
},
{
"epoch": 0.13003533568904593,
"grad_norm": 16.0,
"learning_rate": 2.9928807560050043e-05,
"loss": 0.5130989074707031,
"step": 115,
"token_acc": 0.8803738317757009
},
{
"epoch": 0.13568904593639575,
"grad_norm": 6.84375,
"learning_rate": 2.9898830532186824e-05,
"loss": 0.4366453647613525,
"step": 120,
"token_acc": 0.886654478976234
},
{
"epoch": 0.1413427561837456,
"grad_norm": 10.875,
"learning_rate": 2.9863622822996006e-05,
"loss": 0.5397056102752685,
"step": 125,
"token_acc": 0.8822429906542056
},
{
"epoch": 0.1469964664310954,
"grad_norm": 12.5,
"learning_rate": 2.982319817698363e-05,
"loss": 0.5331222534179687,
"step": 130,
"token_acc": 0.8604651162790697
},
{
"epoch": 0.15265017667844524,
"grad_norm": 12.1875,
"learning_rate": 2.977757237526136e-05,
"loss": 0.343440842628479,
"step": 135,
"token_acc": 0.9131238447319778
},
{
"epoch": 0.15830388692579506,
"grad_norm": 12.625,
"learning_rate": 2.9726763229385863e-05,
"loss": 0.47114005088806155,
"step": 140,
"token_acc": 0.886654478976234
},
{
"epoch": 0.16395759717314487,
"grad_norm": 8.625,
"learning_rate": 2.9670790574405432e-05,
"loss": 0.4388119697570801,
"step": 145,
"token_acc": 0.8894927536231884
},
{
"epoch": 0.1696113074204947,
"grad_norm": 6.6875,
"learning_rate": 2.9609676261116703e-05,
"loss": 0.558556079864502,
"step": 150,
"token_acc": 0.8729582577132486
},
{
"epoch": 0.17526501766784452,
"grad_norm": 10.375,
"learning_rate": 2.9543444147534497e-05,
"loss": 0.4793074131011963,
"step": 155,
"token_acc": 0.8905660377358491
},
{
"epoch": 0.18091872791519434,
"grad_norm": 8.3125,
"learning_rate": 2.947212008957803e-05,
"loss": 0.48024735450744627,
"step": 160,
"token_acc": 0.8924528301886793
},
{
"epoch": 0.18657243816254418,
"grad_norm": 19.875,
"learning_rate": 2.9395731930977187e-05,
"loss": 0.36183197498321534,
"step": 165,
"token_acc": 0.9111969111969112
},
{
"epoch": 0.192226148409894,
"grad_norm": 16.5,
"learning_rate": 2.9314309492402806e-05,
"loss": 0.5196819305419922,
"step": 170,
"token_acc": 0.8822393822393823
},
{
"epoch": 0.1978798586572438,
"grad_norm": 11.5625,
"learning_rate": 2.922788455982516e-05,
"loss": 0.46062512397766114,
"step": 175,
"token_acc": 0.8813559322033898
},
{
"epoch": 0.20353356890459365,
"grad_norm": 4.65625,
"learning_rate": 2.9136490872105272e-05,
"loss": 0.35768780708312986,
"step": 180,
"token_acc": 0.9044943820224719
},
{
"epoch": 0.20918727915194346,
"grad_norm": 9.5625,
"learning_rate": 2.904016410782379e-05,
"loss": 0.49846739768981935,
"step": 185,
"token_acc": 0.875
},
{
"epoch": 0.21484098939929328,
"grad_norm": 9.625,
"learning_rate": 2.8938941871352683e-05,
"loss": 0.5851227760314941,
"step": 190,
"token_acc": 0.8588007736943907
},
{
"epoch": 0.22049469964664312,
"grad_norm": 14.3125,
"learning_rate": 2.883286367817511e-05,
"loss": 0.4418447017669678,
"step": 195,
"token_acc": 0.8899253731343284
},
{
"epoch": 0.22614840989399293,
"grad_norm": 11.4375,
"learning_rate": 2.872197093945924e-05,
"loss": 0.7016141414642334,
"step": 200,
"token_acc": 0.8148148148148148
},
{
"epoch": 0.23180212014134274,
"grad_norm": 17.875,
"learning_rate": 2.860630694589199e-05,
"loss": 0.3626969337463379,
"step": 205,
"token_acc": 0.9100917431192661
},
{
"epoch": 0.23745583038869258,
"grad_norm": 12.9375,
"learning_rate": 2.8485916850779088e-05,
"loss": 0.4723252296447754,
"step": 210,
"token_acc": 0.8835978835978836
},
{
"epoch": 0.2431095406360424,
"grad_norm": 11.4375,
"learning_rate": 2.8360847652417973e-05,
"loss": 0.5085994720458984,
"step": 215,
"token_acc": 0.893048128342246
},
{
"epoch": 0.24876325088339224,
"grad_norm": 9.9375,
"learning_rate": 2.82311481757504e-05,
"loss": 0.4896233081817627,
"step": 220,
"token_acc": 0.8727272727272727
},
{
"epoch": 0.254416961130742,
"grad_norm": 11.375,
"learning_rate": 2.8096869053302046e-05,
"loss": 0.4948256492614746,
"step": 225,
"token_acc": 0.8743068391866913
},
{
"epoch": 0.26007067137809187,
"grad_norm": 8.5625,
"learning_rate": 2.7958062705416376e-05,
"loss": 0.4669034481048584,
"step": 230,
"token_acc": 0.8822463768115942
},
{
"epoch": 0.2657243816254417,
"grad_norm": 9.8125,
"learning_rate": 2.7814783319790595e-05,
"loss": 0.4465358734130859,
"step": 235,
"token_acc": 0.8836772983114447
},
{
"epoch": 0.2713780918727915,
"grad_norm": 10.9375,
"learning_rate": 2.766708683032173e-05,
"loss": 0.4827105522155762,
"step": 240,
"token_acc": 0.8766859344894027
},
{
"epoch": 0.27703180212014133,
"grad_norm": 9.4375,
"learning_rate": 2.75150308952709e-05,
"loss": 0.40639634132385255,
"step": 245,
"token_acc": 0.9010791366906474
},
{
"epoch": 0.2826855123674912,
"grad_norm": 11.4375,
"learning_rate": 2.735867487475452e-05,
"loss": 0.5378179550170898,
"step": 250,
"token_acc": 0.874766355140187
},
{
"epoch": 0.28833922261484096,
"grad_norm": 19.125,
"learning_rate": 2.7198079807571094e-05,
"loss": 0.42697606086730955,
"step": 255,
"token_acc": 0.8934579439252337
},
{
"epoch": 0.2939929328621908,
"grad_norm": 7.625,
"learning_rate": 2.7033308387372666e-05,
"loss": 0.45955357551574705,
"step": 260,
"token_acc": 0.888045540796964
},
{
"epoch": 0.29964664310954064,
"grad_norm": 8.875,
"learning_rate": 2.6864424938190263e-05,
"loss": 0.51542067527771,
"step": 265,
"token_acc": 0.8812260536398467
},
{
"epoch": 0.3053003533568905,
"grad_norm": 10.25,
"learning_rate": 2.6691495389322878e-05,
"loss": 0.49557199478149416,
"step": 270,
"token_acc": 0.8658536585365854
},
{
"epoch": 0.31095406360424027,
"grad_norm": 7.125,
"learning_rate": 2.651458724959973e-05,
"loss": 0.4124931812286377,
"step": 275,
"token_acc": 0.8986615678776291
},
{
"epoch": 0.3166077738515901,
"grad_norm": 8.9375,
"learning_rate": 2.633376958102597e-05,
"loss": 0.6442465782165527,
"step": 280,
"token_acc": 0.8523364485981308
},
{
"epoch": 0.32226148409893995,
"grad_norm": 9.875,
"learning_rate": 2.614911297182199e-05,
"loss": 0.43247137069702146,
"step": 285,
"token_acc": 0.8936567164179104
},
{
"epoch": 0.32791519434628974,
"grad_norm": 8.1875,
"learning_rate": 2.596068950886699e-05,
"loss": 0.40369553565979005,
"step": 290,
"token_acc": 0.9065934065934066
},
{
"epoch": 0.3335689045936396,
"grad_norm": 6.75,
"learning_rate": 2.5768572749557398e-05,
"loss": 0.35304784774780273,
"step": 295,
"token_acc": 0.9064885496183206
},
{
"epoch": 0.3392226148409894,
"grad_norm": 10.625,
"learning_rate": 2.5572837693091338e-05,
"loss": 0.42280235290527346,
"step": 300,
"token_acc": 0.8916518650088809
},
{
"epoch": 0.3448763250883392,
"grad_norm": 15.3125,
"learning_rate": 2.5373560751190164e-05,
"loss": 0.41590089797973634,
"step": 305,
"token_acc": 0.8983050847457628
},
{
"epoch": 0.35053003533568905,
"grad_norm": 15.5625,
"learning_rate": 2.517081971826858e-05,
"loss": 0.5931228637695313,
"step": 310,
"token_acc": 0.8493408662900188
},
{
"epoch": 0.3561837455830389,
"grad_norm": 16.0,
"learning_rate": 2.4964693741065e-05,
"loss": 0.3908259630203247,
"step": 315,
"token_acc": 0.8956043956043956
},
{
"epoch": 0.3618374558303887,
"grad_norm": 6.9375,
"learning_rate": 2.4755263287743982e-05,
"loss": 0.533808708190918,
"step": 320,
"token_acc": 0.8736842105263158
},
{
"epoch": 0.3674911660777385,
"grad_norm": 11.75,
"learning_rate": 2.4542610116482777e-05,
"loss": 0.43399462699890134,
"step": 325,
"token_acc": 0.8923076923076924
},
{
"epoch": 0.37314487632508836,
"grad_norm": 16.375,
"learning_rate": 2.43268172435543e-05,
"loss": 0.5363679885864258,
"step": 330,
"token_acc": 0.8554006968641115
},
{
"epoch": 0.37879858657243815,
"grad_norm": 6.46875,
"learning_rate": 2.4107968910918943e-05,
"loss": 0.4685643196105957,
"step": 335,
"token_acc": 0.8926553672316384
},
{
"epoch": 0.384452296819788,
"grad_norm": 11.25,
"learning_rate": 2.3886150553337925e-05,
"loss": 0.4552040100097656,
"step": 340,
"token_acc": 0.8849557522123894
},
{
"epoch": 0.3901060070671378,
"grad_norm": 9.375,
"learning_rate": 2.366144876502097e-05,
"loss": 0.41571660041809083,
"step": 345,
"token_acc": 0.8919925512104283
},
{
"epoch": 0.3957597173144876,
"grad_norm": 16.75,
"learning_rate": 2.3433951265821347e-05,
"loss": 0.48478131294250487,
"step": 350,
"token_acc": 0.8825688073394495
},
{
"epoch": 0.40141342756183745,
"grad_norm": 12.875,
"learning_rate": 2.320374686699154e-05,
"loss": 0.48756847381591795,
"step": 355,
"token_acc": 0.8759398496240601
},
{
"epoch": 0.4070671378091873,
"grad_norm": 10.25,
"learning_rate": 2.2970925436512743e-05,
"loss": 0.5582265853881836,
"step": 360,
"token_acc": 0.8727272727272727
},
{
"epoch": 0.4127208480565371,
"grad_norm": 7.125,
"learning_rate": 2.2735577864011946e-05,
"loss": 0.40789146423339845,
"step": 365,
"token_acc": 0.903954802259887
},
{
"epoch": 0.4183745583038869,
"grad_norm": 9.0625,
"learning_rate": 2.2497796025280097e-05,
"loss": 0.4335779666900635,
"step": 370,
"token_acc": 0.8895027624309392
},
{
"epoch": 0.42402826855123676,
"grad_norm": 8.6875,
"learning_rate": 2.2257672746405337e-05,
"loss": 0.5792682647705079,
"step": 375,
"token_acc": 0.8518518518518519
},
{
"epoch": 0.42968197879858655,
"grad_norm": 9.125,
"learning_rate": 2.201530176753521e-05,
"loss": 0.5717463016510009,
"step": 380,
"token_acc": 0.8565965583173997
},
{
"epoch": 0.4353356890459364,
"grad_norm": 6.9375,
"learning_rate": 2.17707777062821e-05,
"loss": 0.3671257972717285,
"step": 385,
"token_acc": 0.9074410163339383
},
{
"epoch": 0.44098939929328623,
"grad_norm": 12.125,
"learning_rate": 2.1524196020786038e-05,
"loss": 0.5280078887939453,
"step": 390,
"token_acc": 0.8771929824561403
},
{
"epoch": 0.446643109540636,
"grad_norm": 21.25,
"learning_rate": 2.127565297244947e-05,
"loss": 0.4925088882446289,
"step": 395,
"token_acc": 0.8908765652951699
},
{
"epoch": 0.45229681978798586,
"grad_norm": 10.3125,
"learning_rate": 2.1025245588358365e-05,
"loss": 0.4084740161895752,
"step": 400,
"token_acc": 0.8884758364312267
},
{
"epoch": 0.4579505300353357,
"grad_norm": 11.1875,
"learning_rate": 2.0773071623404486e-05,
"loss": 0.6456653594970703,
"step": 405,
"token_acc": 0.8585461689587426
},
{
"epoch": 0.4636042402826855,
"grad_norm": 6.6875,
"learning_rate": 2.0519229522123453e-05,
"loss": 0.6197998046875,
"step": 410,
"token_acc": 0.8672727272727273
},
{
"epoch": 0.46925795053003533,
"grad_norm": 8.8125,
"learning_rate": 2.026381838026368e-05,
"loss": 0.4290182113647461,
"step": 415,
"token_acc": 0.8931860036832413
},
{
"epoch": 0.47491166077738517,
"grad_norm": 9.9375,
"learning_rate": 2.0006937906100998e-05,
"loss": 0.4530322551727295,
"step": 420,
"token_acc": 0.9066901408450704
},
{
"epoch": 0.48056537102473496,
"grad_norm": 14.5625,
"learning_rate": 1.9748688381514224e-05,
"loss": 0.4739545345306396,
"step": 425,
"token_acc": 0.8712121212121212
},
{
"epoch": 0.4862190812720848,
"grad_norm": 14.1875,
"learning_rate": 1.9489170622836754e-05,
"loss": 0.5166975498199463,
"step": 430,
"token_acc": 0.8643122676579925
},
{
"epoch": 0.49187279151943464,
"grad_norm": 7.25,
"learning_rate": 1.922848594149955e-05,
"loss": 0.41360926628112793,
"step": 435,
"token_acc": 0.8875968992248062
},
{
"epoch": 0.4975265017667845,
"grad_norm": 8.1875,
"learning_rate": 1.896673610448085e-05,
"loss": 0.3576143741607666,
"step": 440,
"token_acc": 0.9126559714795008
},
{
"epoch": 0.5031802120141343,
"grad_norm": 9.8125,
"learning_rate": 1.8704023294578e-05,
"loss": 0.4304816246032715,
"step": 445,
"token_acc": 0.8942486085343229
},
{
"epoch": 0.508833922261484,
"grad_norm": 9.3125,
"learning_rate": 1.8440450070517e-05,
"loss": 0.40277585983276365,
"step": 450,
"token_acc": 0.8975791433891993
},
{
"epoch": 0.5144876325088339,
"grad_norm": 11.25,
"learning_rate": 1.817611932691528e-05,
"loss": 0.4203328609466553,
"step": 455,
"token_acc": 0.8958333333333334
},
{
"epoch": 0.5201413427561837,
"grad_norm": 15.375,
"learning_rate": 1.791113425411332e-05,
"loss": 0.4957026481628418,
"step": 460,
"token_acc": 0.869811320754717
},
{
"epoch": 0.5257950530035336,
"grad_norm": 12.5,
"learning_rate": 1.7645598297890914e-05,
"loss": 0.38120887279510496,
"step": 465,
"token_acc": 0.8954372623574145
},
{
"epoch": 0.5314487632508834,
"grad_norm": 12.5625,
"learning_rate": 1.7379615119083562e-05,
"loss": 0.46092791557312013,
"step": 470,
"token_acc": 0.9040590405904059
},
{
"epoch": 0.5371024734982333,
"grad_norm": 8.875,
"learning_rate": 1.7113288553115094e-05,
"loss": 0.35474748611450196,
"step": 475,
"token_acc": 0.9045045045045045
},
{
"epoch": 0.542756183745583,
"grad_norm": 11.5,
"learning_rate": 1.6846722569461957e-05,
"loss": 0.6311816215515137,
"step": 480,
"token_acc": 0.864376130198915
},
{
"epoch": 0.5484098939929328,
"grad_norm": 7.46875,
"learning_rate": 1.658002123106531e-05,
"loss": 0.4077010154724121,
"step": 485,
"token_acc": 0.8971428571428571
},
{
"epoch": 0.5540636042402827,
"grad_norm": 9.25,
"learning_rate": 1.6313288653706577e-05,
"loss": 0.5004054546356201,
"step": 490,
"token_acc": 0.8745173745173745
},
{
"epoch": 0.5597173144876325,
"grad_norm": 10.875,
"learning_rate": 1.6046628965362325e-05,
"loss": 0.49433560371398927,
"step": 495,
"token_acc": 0.8756660746003553
},
{
"epoch": 0.5653710247349824,
"grad_norm": 13.625,
"learning_rate": 1.5780146265554462e-05,
"loss": 0.5154177188873291,
"step": 500,
"token_acc": 0.8778359511343804
},
{
"epoch": 0.5710247349823322,
"grad_norm": 16.0,
"learning_rate": 1.5513944584711537e-05,
"loss": 0.5768596172332764,
"step": 505,
"token_acc": 0.8682170542635659
},
{
"epoch": 0.5766784452296819,
"grad_norm": 11.5,
"learning_rate": 1.5248127843556906e-05,
"loss": 0.542631196975708,
"step": 510,
"token_acc": 0.8648148148148148
},
{
"epoch": 0.5823321554770318,
"grad_norm": 9.3125,
"learning_rate": 1.4982799812539898e-05,
"loss": 0.44904112815856934,
"step": 515,
"token_acc": 0.8768656716417911
},
{
"epoch": 0.5879858657243816,
"grad_norm": 12.0,
"learning_rate": 1.471806407132547e-05,
"loss": 0.4998485088348389,
"step": 520,
"token_acc": 0.8848920863309353
},
{
"epoch": 0.5936395759717314,
"grad_norm": 14.9375,
"learning_rate": 1.445402396835848e-05,
"loss": 0.4888237476348877,
"step": 525,
"token_acc": 0.8878676470588235
},
{
"epoch": 0.5992932862190813,
"grad_norm": 11.875,
"learning_rate": 1.4190782580518134e-05,
"loss": 0.4358950614929199,
"step": 530,
"token_acc": 0.8945454545454545
},
{
"epoch": 0.6049469964664311,
"grad_norm": 13.4375,
"learning_rate": 1.3928442672878498e-05,
"loss": 0.3919216632843018,
"step": 535,
"token_acc": 0.9005424954792043
},
{
"epoch": 0.610600706713781,
"grad_norm": 14.625,
"learning_rate": 1.3667106658590713e-05,
"loss": 0.4299191474914551,
"step": 540,
"token_acc": 0.8834586466165414
},
{
"epoch": 0.6162544169611307,
"grad_norm": 11.0,
"learning_rate": 1.3406876558902596e-05,
"loss": 0.4752546787261963,
"step": 545,
"token_acc": 0.8946395563770795
},
{
"epoch": 0.6219081272084805,
"grad_norm": 8.0,
"learning_rate": 1.3147853963331226e-05,
"loss": 0.462324857711792,
"step": 550,
"token_acc": 0.8884892086330936
},
{
"epoch": 0.6275618374558304,
"grad_norm": 14.125,
"learning_rate": 1.2890139990004112e-05,
"loss": 0.4813478946685791,
"step": 555,
"token_acc": 0.8781818181818182
},
{
"epoch": 0.6332155477031802,
"grad_norm": 10.375,
"learning_rate": 1.2633835246184317e-05,
"loss": 0.4766115188598633,
"step": 560,
"token_acc": 0.8956692913385826
},
{
"epoch": 0.6388692579505301,
"grad_norm": 13.75,
"learning_rate": 1.2379039788995068e-05,
"loss": 0.555994701385498,
"step": 565,
"token_acc": 0.8626692456479691
},
{
"epoch": 0.6445229681978799,
"grad_norm": 11.4375,
"learning_rate": 1.2125853086359117e-05,
"loss": 0.554969596862793,
"step": 570,
"token_acc": 0.864406779661017
},
{
"epoch": 0.6501766784452296,
"grad_norm": 9.375,
"learning_rate": 1.1874373978168092e-05,
"loss": 0.48480896949768065,
"step": 575,
"token_acc": 0.8574144486692015
},
{
"epoch": 0.6558303886925795,
"grad_norm": 10.75,
"learning_rate": 1.1624700637697078e-05,
"loss": 0.6284814357757569,
"step": 580,
"token_acc": 0.8512241054613936
},
{
"epoch": 0.6614840989399293,
"grad_norm": 10.1875,
"learning_rate": 1.1376930533279357e-05,
"loss": 0.38442087173461914,
"step": 585,
"token_acc": 0.9003690036900369
},
{
"epoch": 0.6671378091872792,
"grad_norm": 10.3125,
"learning_rate": 1.1131160390256417e-05,
"loss": 0.5038439750671386,
"step": 590,
"token_acc": 0.8627819548872181
},
{
"epoch": 0.672791519434629,
"grad_norm": 14.3125,
"learning_rate": 1.0887486153217962e-05,
"loss": 0.45342187881469725,
"step": 595,
"token_acc": 0.8917910447761194
},
{
"epoch": 0.6784452296819788,
"grad_norm": 10.3125,
"learning_rate": 1.064600294854675e-05,
"loss": 0.5414403915405274,
"step": 600,
"token_acc": 0.8771266540642723
},
{
"epoch": 0.6840989399293286,
"grad_norm": 12.1875,
"learning_rate": 1.0406805047282826e-05,
"loss": 0.44243249893188474,
"step": 605,
"token_acc": 0.8971428571428571
},
{
"epoch": 0.6897526501766784,
"grad_norm": 9.4375,
"learning_rate": 1.0169985828321664e-05,
"loss": 0.4240866661071777,
"step": 610,
"token_acc": 0.900562851782364
},
{
"epoch": 0.6954063604240283,
"grad_norm": 17.0,
"learning_rate": 9.935637741960595e-06,
"loss": 0.5268006324768066,
"step": 615,
"token_acc": 0.8721804511278195
},
{
"epoch": 0.7010600706713781,
"grad_norm": 13.75,
"learning_rate": 9.703852273807745e-06,
"loss": 0.5309527397155762,
"step": 620,
"token_acc": 0.864963503649635
},
{
"epoch": 0.7067137809187279,
"grad_norm": 10.625,
"learning_rate": 9.474719909067592e-06,
"loss": 0.46470232009887696,
"step": 625,
"token_acc": 0.8984674329501916
},
{
"epoch": 0.7123674911660778,
"grad_norm": 10.75,
"learning_rate": 9.248330097216998e-06,
"loss": 0.3585221290588379,
"step": 630,
"token_acc": 0.9072356215213359
},
{
"epoch": 0.7180212014134275,
"grad_norm": 10.8125,
"learning_rate": 9.024771217085648e-06,
"loss": 0.45208401679992677,
"step": 635,
"token_acc": 0.9046728971962616
},
{
"epoch": 0.7236749116607774,
"grad_norm": 9.0,
"learning_rate": 8.804130542354423e-06,
"loss": 0.40645594596862794,
"step": 640,
"token_acc": 0.8970588235294118
},
{
"epoch": 0.7293286219081272,
"grad_norm": 11.1875,
"learning_rate": 8.586494207485173e-06,
"loss": 0.5020310878753662,
"step": 645,
"token_acc": 0.8729582577132486
},
{
"epoch": 0.734982332155477,
"grad_norm": 26.25,
"learning_rate": 8.371947174095276e-06,
"loss": 0.5611002445220947,
"step": 650,
"token_acc": 0.8490566037735849
},
{
"epoch": 0.7406360424028269,
"grad_norm": 14.8125,
"learning_rate": 8.160573197790034e-06,
"loss": 0.504447078704834,
"step": 655,
"token_acc": 0.8843416370106761
},
{
"epoch": 0.7462897526501767,
"grad_norm": 18.375,
"learning_rate": 7.952454795465847e-06,
"loss": 0.41913480758666993,
"step": 660,
"token_acc": 0.8958333333333334
},
{
"epoch": 0.7519434628975264,
"grad_norm": 8.75,
"learning_rate": 7.747673213097013e-06,
"loss": 0.4940896511077881,
"step": 665,
"token_acc": 0.8772893772893773
},
{
"epoch": 0.7575971731448763,
"grad_norm": 13.5625,
"learning_rate": 7.5463083940186235e-06,
"loss": 0.47562193870544434,
"step": 670,
"token_acc": 0.8994614003590664
},
{
"epoch": 0.7632508833922261,
"grad_norm": 10.0625,
"learning_rate": 7.3484389477180245e-06,
"loss": 0.48333349227905276,
"step": 675,
"token_acc": 0.8953488372093024
},
{
"epoch": 0.768904593639576,
"grad_norm": 8.5,
"learning_rate": 7.154142119146981e-06,
"loss": 0.41202802658081056,
"step": 680,
"token_acc": 0.9003831417624522
},
{
"epoch": 0.7745583038869258,
"grad_norm": 12.9375,
"learning_rate": 6.9634937585665066e-06,
"loss": 0.47983555793762206,
"step": 685,
"token_acc": 0.8998211091234347
},
{
"epoch": 0.7802120141342757,
"grad_norm": 8.6875,
"learning_rate": 6.776568291936193e-06,
"loss": 0.4969668388366699,
"step": 690,
"token_acc": 0.8790786948176583
},
{
"epoch": 0.7858657243816255,
"grad_norm": 10.375,
"learning_rate": 6.593438691859566e-06,
"loss": 0.44586987495422364,
"step": 695,
"token_acc": 0.884469696969697
},
{
"epoch": 0.7915194346289752,
"grad_norm": 11.875,
"learning_rate": 6.414176449096749e-06,
"loss": 0.5935549736022949,
"step": 700,
"token_acc": 0.8718929254302104
},
{
"epoch": 0.7971731448763251,
"grad_norm": 16.75,
"learning_rate": 6.238851544655688e-06,
"loss": 0.5915599822998047,
"step": 705,
"token_acc": 0.8622641509433963
},
{
"epoch": 0.8028268551236749,
"grad_norm": 13.4375,
"learning_rate": 6.067532422472728e-06,
"loss": 0.45562114715576174,
"step": 710,
"token_acc": 0.8954372623574145
},
{
"epoch": 0.8084805653710248,
"grad_norm": 9.5625,
"learning_rate": 5.9002859626932115e-06,
"loss": 0.44912257194519045,
"step": 715,
"token_acc": 0.8789571694599627
},
{
"epoch": 0.8141342756183746,
"grad_norm": 12.6875,
"learning_rate": 5.7371774555625925e-06,
"loss": 0.4588914394378662,
"step": 720,
"token_acc": 0.8875638841567292
},
{
"epoch": 0.8197879858657244,
"grad_norm": 10.5625,
"learning_rate": 5.578270575938212e-06,
"loss": 0.42406349182128905,
"step": 725,
"token_acc": 0.9009009009009009
},
{
"epoch": 0.8254416961130742,
"grad_norm": 13.875,
"learning_rate": 5.423627358431671e-06,
"loss": 0.49872541427612305,
"step": 730,
"token_acc": 0.8799249530956847
},
{
"epoch": 0.831095406360424,
"grad_norm": 17.0,
"learning_rate": 5.273308173191575e-06,
"loss": 0.48893170356750487,
"step": 735,
"token_acc": 0.8968105065666041
},
{
"epoch": 0.8367491166077738,
"grad_norm": 10.3125,
"learning_rate": 5.127371702336002e-06,
"loss": 0.5636299133300782,
"step": 740,
"token_acc": 0.860236220472441
},
{
"epoch": 0.8424028268551237,
"grad_norm": 12.0,
"learning_rate": 4.985874917043985e-06,
"loss": 0.41251296997070314,
"step": 745,
"token_acc": 0.8983364140480592
},
{
"epoch": 0.8480565371024735,
"grad_norm": 12.3125,
"learning_rate": 4.848873055314914e-06,
"loss": 0.5803286552429199,
"step": 750,
"token_acc": 0.8612612612612612
},
{
"epoch": 0.8537102473498234,
"grad_norm": 9.75,
"learning_rate": 4.7164196004045305e-06,
"loss": 0.5717785835266114,
"step": 755,
"token_acc": 0.8810408921933085
},
{
"epoch": 0.8593639575971731,
"grad_norm": 8.9375,
"learning_rate": 4.588566259945948e-06,
"loss": 0.545508623123169,
"step": 760,
"token_acc": 0.8704761904761905
},
{
"epoch": 0.8650176678445229,
"grad_norm": 11.5,
"learning_rate": 4.465362945763868e-06,
"loss": 0.40853538513183596,
"step": 765,
"token_acc": 0.9005424954792043
},
{
"epoch": 0.8706713780918728,
"grad_norm": 15.1875,
"learning_rate": 4.3468577543898026e-06,
"loss": 0.469269323348999,
"step": 770,
"token_acc": 0.8923357664233577
},
{
"epoch": 0.8763250883392226,
"grad_norm": 14.875,
"learning_rate": 4.233096948286008e-06,
"loss": 0.485385799407959,
"step": 775,
"token_acc": 0.8886792452830189
},
{
"epoch": 0.8819787985865725,
"grad_norm": 10.25,
"learning_rate": 4.124124937785375e-06,
"loss": 0.5617117404937744,
"step": 780,
"token_acc": 0.8745318352059925
},
{
"epoch": 0.8876325088339223,
"grad_norm": 14.625,
"learning_rate": 4.019984263754374e-06,
"loss": 0.5018572807312012,
"step": 785,
"token_acc": 0.8718861209964412
},
{
"epoch": 0.893286219081272,
"grad_norm": 20.25,
"learning_rate": 3.920715580985813e-06,
"loss": 0.5424814224243164,
"step": 790,
"token_acc": 0.8679245283018868
},
{
"epoch": 0.8989399293286219,
"grad_norm": 7.375,
"learning_rate": 3.8263576423278684e-06,
"loss": 0.3595900058746338,
"step": 795,
"token_acc": 0.9087591240875912
},
{
"epoch": 0.9045936395759717,
"grad_norm": 13.9375,
"learning_rate": 3.736947283555621e-06,
"loss": 0.5959813117980957,
"step": 800,
"token_acc": 0.8745019920318725
},
{
"epoch": 0.9102473498233216,
"grad_norm": 9.6875,
"learning_rate": 3.6525194089909827e-06,
"loss": 0.6750380039215088,
"step": 805,
"token_acc": 0.8412098298676749
},
{
"epoch": 0.9159010600706714,
"grad_norm": 14.4375,
"learning_rate": 3.5731069778766223e-06,
"loss": 0.680885648727417,
"step": 810,
"token_acc": 0.8374291115311909
},
{
"epoch": 0.9215547703180212,
"grad_norm": 9.0,
"learning_rate": 3.498740991509231e-06,
"loss": 0.5743994235992431,
"step": 815,
"token_acc": 0.8820224719101124
},
{
"epoch": 0.927208480565371,
"grad_norm": 17.5,
"learning_rate": 3.4294504811371234e-06,
"loss": 0.5227997779846192,
"step": 820,
"token_acc": 0.887189292543021
},
{
"epoch": 0.9328621908127208,
"grad_norm": 12.8125,
"learning_rate": 3.3652624966269193e-06,
"loss": 0.41265015602111815,
"step": 825,
"token_acc": 0.8854545454545455
},
{
"epoch": 0.9385159010600707,
"grad_norm": 17.125,
"learning_rate": 3.306202095903728e-06,
"loss": 0.5101790428161621,
"step": 830,
"token_acc": 0.891588785046729
},
{
"epoch": 0.9441696113074205,
"grad_norm": 12.5,
"learning_rate": 3.252292335168949e-06,
"loss": 0.48482298851013184,
"step": 835,
"token_acc": 0.8655616942909761
},
{
"epoch": 0.9498233215547703,
"grad_norm": 11.0,
"learning_rate": 3.2035542598995146e-06,
"loss": 0.5188216209411621,
"step": 840,
"token_acc": 0.8686679174484052
},
{
"epoch": 0.9554770318021202,
"grad_norm": 16.625,
"learning_rate": 3.1600068966320774e-06,
"loss": 0.5178674697875977,
"step": 845,
"token_acc": 0.8787878787878788
},
{
"epoch": 0.9611307420494699,
"grad_norm": 9.375,
"learning_rate": 3.1216672455353746e-06,
"loss": 0.3571352958679199,
"step": 850,
"token_acc": 0.9163636363636364
},
{
"epoch": 0.9667844522968198,
"grad_norm": 19.75,
"learning_rate": 3.0885502737736366e-06,
"loss": 0.5291311740875244,
"step": 855,
"token_acc": 0.8817829457364341
},
{
"epoch": 0.9724381625441696,
"grad_norm": 10.625,
"learning_rate": 3.0606689096636604e-06,
"loss": 0.7415075778961182,
"step": 860,
"token_acc": 0.8426763110307414
},
{
"epoch": 0.9780918727915194,
"grad_norm": 11.125,
"learning_rate": 3.0380340376278078e-06,
"loss": 0.4753167152404785,
"step": 865,
"token_acc": 0.8872180451127819
},
{
"epoch": 0.9837455830388693,
"grad_norm": 13.75,
"learning_rate": 3.0206544939449e-06,
"loss": 0.5666730403900146,
"step": 870,
"token_acc": 0.8776595744680851
},
{
"epoch": 0.9893992932862191,
"grad_norm": 14.1875,
"learning_rate": 3.0085370633006945e-06,
"loss": 0.5065449714660645,
"step": 875,
"token_acc": 0.8717948717948718
},
{
"epoch": 0.995053003533569,
"grad_norm": 12.8125,
"learning_rate": 3.0016864761392417e-06,
"loss": 0.5106320858001709,
"step": 880,
"token_acc": 0.874031007751938
}
],
"logging_steps": 5,
"max_steps": 884,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.272720385457565e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}