PHI2-SFT-OASST1 / trainer_state.json
satyanayak's picture
upload all model files
0865df8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998981773750127,
"eval_steps": 500,
"global_step": 2455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004072904999490887,
"grad_norm": 0.7094523906707764,
"learning_rate": 2.702702702702703e-05,
"loss": 1.8961,
"mean_token_accuracy": 0.5438283555209636,
"step": 10
},
{
"epoch": 0.008145809998981774,
"grad_norm": 0.4965726137161255,
"learning_rate": 5.405405405405406e-05,
"loss": 2.0135,
"mean_token_accuracy": 0.5206024497747421,
"step": 20
},
{
"epoch": 0.01221871499847266,
"grad_norm": 0.5204955339431763,
"learning_rate": 8.108108108108109e-05,
"loss": 1.7338,
"mean_token_accuracy": 0.5830004885792732,
"step": 30
},
{
"epoch": 0.01629161999796355,
"grad_norm": 0.4678299129009247,
"learning_rate": 0.00010810810810810812,
"loss": 1.7561,
"mean_token_accuracy": 0.5730855345726014,
"step": 40
},
{
"epoch": 0.020364524997454433,
"grad_norm": 0.439376562833786,
"learning_rate": 0.00013513513513513514,
"loss": 1.7277,
"mean_token_accuracy": 0.5785414174199104,
"step": 50
},
{
"epoch": 0.02443742999694532,
"grad_norm": 0.5652154684066772,
"learning_rate": 0.00016216216216216218,
"loss": 1.5663,
"mean_token_accuracy": 0.6036677993834019,
"step": 60
},
{
"epoch": 0.02851033499643621,
"grad_norm": 0.5163070559501648,
"learning_rate": 0.0001891891891891892,
"loss": 1.8259,
"mean_token_accuracy": 0.5530456639826298,
"step": 70
},
{
"epoch": 0.0325832399959271,
"grad_norm": 0.41974571347236633,
"learning_rate": 0.00019999686634492516,
"loss": 1.6554,
"mean_token_accuracy": 0.6073097884654999,
"step": 80
},
{
"epoch": 0.03665614499541798,
"grad_norm": 0.5097134709358215,
"learning_rate": 0.00019997771694180204,
"loss": 1.7208,
"mean_token_accuracy": 0.5835812106728554,
"step": 90
},
{
"epoch": 0.04072904999490887,
"grad_norm": 0.3469955623149872,
"learning_rate": 0.00019994116238472668,
"loss": 1.7954,
"mean_token_accuracy": 0.5794057920575142,
"step": 100
},
{
"epoch": 0.044801954994399755,
"grad_norm": 0.5898286700248718,
"learning_rate": 0.0001998872090374941,
"loss": 1.8089,
"mean_token_accuracy": 0.5614037178456783,
"step": 110
},
{
"epoch": 0.04887485999389064,
"grad_norm": 0.3150334656238556,
"learning_rate": 0.0001998158662928604,
"loss": 1.5827,
"mean_token_accuracy": 0.5801003761589527,
"step": 120
},
{
"epoch": 0.05294776499338153,
"grad_norm": 0.3100312352180481,
"learning_rate": 0.00019972714657090772,
"loss": 1.6712,
"mean_token_accuracy": 0.5900266923010349,
"step": 130
},
{
"epoch": 0.05702066999287242,
"grad_norm": 0.30420345067977905,
"learning_rate": 0.0001996210653168819,
"loss": 1.646,
"mean_token_accuracy": 0.5839473098516464,
"step": 140
},
{
"epoch": 0.0610935749923633,
"grad_norm": 0.454593688249588,
"learning_rate": 0.0001994976409985037,
"loss": 1.7184,
"mean_token_accuracy": 0.566600239276886,
"step": 150
},
{
"epoch": 0.0651664799918542,
"grad_norm": 0.35688647627830505,
"learning_rate": 0.0001993568951027537,
"loss": 1.6766,
"mean_token_accuracy": 0.5824202686548233,
"step": 160
},
{
"epoch": 0.06923938499134508,
"grad_norm": 0.3199939727783203,
"learning_rate": 0.0001991988521321317,
"loss": 1.542,
"mean_token_accuracy": 0.6095141984522343,
"step": 170
},
{
"epoch": 0.07331228999083596,
"grad_norm": 0.5523242950439453,
"learning_rate": 0.00019902353960039087,
"loss": 1.7218,
"mean_token_accuracy": 0.5745485998690129,
"step": 180
},
{
"epoch": 0.07738519499032685,
"grad_norm": 0.3872721493244171,
"learning_rate": 0.00019883098802774812,
"loss": 1.7306,
"mean_token_accuracy": 0.5514535710215569,
"step": 190
},
{
"epoch": 0.08145809998981773,
"grad_norm": 0.2679811120033264,
"learning_rate": 0.0001986212309355707,
"loss": 1.6524,
"mean_token_accuracy": 0.5822832569479942,
"step": 200
},
{
"epoch": 0.08553100498930863,
"grad_norm": 0.5167363882064819,
"learning_rate": 0.00019839430484054046,
"loss": 1.6964,
"mean_token_accuracy": 0.573430598527193,
"step": 210
},
{
"epoch": 0.08960390998879951,
"grad_norm": 0.4363399147987366,
"learning_rate": 0.0001981502492482967,
"loss": 1.7296,
"mean_token_accuracy": 0.5835007324814796,
"step": 220
},
{
"epoch": 0.0936768149882904,
"grad_norm": 0.4052150845527649,
"learning_rate": 0.00019788910664655848,
"loss": 1.5772,
"mean_token_accuracy": 0.5771500714123249,
"step": 230
},
{
"epoch": 0.09774971998778129,
"grad_norm": 0.34224212169647217,
"learning_rate": 0.0001976109224977281,
"loss": 1.6263,
"mean_token_accuracy": 0.5942870646715164,
"step": 240
},
{
"epoch": 0.10182262498727217,
"grad_norm": 0.4852873980998993,
"learning_rate": 0.00019731574523097647,
"loss": 1.5731,
"mean_token_accuracy": 0.5886094763875007,
"step": 250
},
{
"epoch": 0.10589552998676306,
"grad_norm": 0.30241233110427856,
"learning_rate": 0.00019700362623381197,
"loss": 1.8311,
"mean_token_accuracy": 0.5616028495132923,
"step": 260
},
{
"epoch": 0.10996843498625394,
"grad_norm": 0.38147303462028503,
"learning_rate": 0.00019667461984313448,
"loss": 1.652,
"mean_token_accuracy": 0.5836799181997776,
"step": 270
},
{
"epoch": 0.11404133998574484,
"grad_norm": 0.3174324333667755,
"learning_rate": 0.00019632878333577592,
"loss": 1.6831,
"mean_token_accuracy": 0.5850063987076283,
"step": 280
},
{
"epoch": 0.11811424498523572,
"grad_norm": 0.350323349237442,
"learning_rate": 0.00019596617691852863,
"loss": 1.6644,
"mean_token_accuracy": 0.5841067053377629,
"step": 290
},
{
"epoch": 0.1221871499847266,
"grad_norm": 0.30346542596817017,
"learning_rate": 0.0001955868637176643,
"loss": 1.656,
"mean_token_accuracy": 0.584677055478096,
"step": 300
},
{
"epoch": 0.1262600549842175,
"grad_norm": 0.2639765739440918,
"learning_rate": 0.00019519090976794406,
"loss": 1.7454,
"mean_token_accuracy": 0.5678185373544693,
"step": 310
},
{
"epoch": 0.1303329599837084,
"grad_norm": 0.3039887547492981,
"learning_rate": 0.00019477838400112254,
"loss": 1.754,
"mean_token_accuracy": 0.5744720883667469,
"step": 320
},
{
"epoch": 0.13440586498319926,
"grad_norm": 0.35102295875549316,
"learning_rate": 0.00019434935823394746,
"loss": 1.6665,
"mean_token_accuracy": 0.5876846723258495,
"step": 330
},
{
"epoch": 0.13847876998269015,
"grad_norm": 0.3325759470462799,
"learning_rate": 0.00019390390715565725,
"loss": 1.6773,
"mean_token_accuracy": 0.5869172632694244,
"step": 340
},
{
"epoch": 0.14255167498218105,
"grad_norm": 0.37209993600845337,
"learning_rate": 0.000193442108314978,
"loss": 1.6328,
"mean_token_accuracy": 0.5927142709493637,
"step": 350
},
{
"epoch": 0.14662457998167192,
"grad_norm": 0.2964717149734497,
"learning_rate": 0.00019296404210662331,
"loss": 1.5659,
"mean_token_accuracy": 0.60322862342,
"step": 360
},
{
"epoch": 0.1506974849811628,
"grad_norm": 0.29879456758499146,
"learning_rate": 0.00019246979175729822,
"loss": 1.7083,
"mean_token_accuracy": 0.5893984287977219,
"step": 370
},
{
"epoch": 0.1547703899806537,
"grad_norm": 0.3726056218147278,
"learning_rate": 0.00019195944331121015,
"loss": 1.6854,
"mean_token_accuracy": 0.5761750474572181,
"step": 380
},
{
"epoch": 0.1588432949801446,
"grad_norm": 0.517816424369812,
"learning_rate": 0.0001914330856150897,
"loss": 1.7282,
"mean_token_accuracy": 0.5854727104306221,
"step": 390
},
{
"epoch": 0.16291619997963547,
"grad_norm": 0.25848233699798584,
"learning_rate": 0.00019089081030272296,
"loss": 1.5562,
"mean_token_accuracy": 0.6038706839084625,
"step": 400
},
{
"epoch": 0.16698910497912636,
"grad_norm": 0.3191607892513275,
"learning_rate": 0.00019033271177899922,
"loss": 1.6452,
"mean_token_accuracy": 0.5812859788537026,
"step": 410
},
{
"epoch": 0.17106200997861726,
"grad_norm": 0.3990655243396759,
"learning_rate": 0.0001897588872034758,
"loss": 1.626,
"mean_token_accuracy": 0.569889971613884,
"step": 420
},
{
"epoch": 0.17513491497810812,
"grad_norm": 0.346086323261261,
"learning_rate": 0.00018916943647346375,
"loss": 1.7451,
"mean_token_accuracy": 0.578500047326088,
"step": 430
},
{
"epoch": 0.17920781997759902,
"grad_norm": 0.36437422037124634,
"learning_rate": 0.0001885644622066364,
"loss": 1.846,
"mean_token_accuracy": 0.5627885892987251,
"step": 440
},
{
"epoch": 0.18328072497708991,
"grad_norm": 0.2968160808086395,
"learning_rate": 0.00018794406972316482,
"loss": 1.671,
"mean_token_accuracy": 0.5769762165844441,
"step": 450
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.2781198024749756,
"learning_rate": 0.00018730836702738257,
"loss": 1.4983,
"mean_token_accuracy": 0.613883113116026,
"step": 460
},
{
"epoch": 0.19142653497607168,
"grad_norm": 0.4645621180534363,
"learning_rate": 0.0001866574647889831,
"loss": 1.6776,
"mean_token_accuracy": 0.5890260674059391,
"step": 470
},
{
"epoch": 0.19549943997556257,
"grad_norm": 0.3920878767967224,
"learning_rate": 0.00018599147632375332,
"loss": 1.802,
"mean_token_accuracy": 0.568213502317667,
"step": 480
},
{
"epoch": 0.19957234497505347,
"grad_norm": 0.3473225235939026,
"learning_rate": 0.00018531051757384633,
"loss": 1.7161,
"mean_token_accuracy": 0.5727271348237991,
"step": 490
},
{
"epoch": 0.20364524997454433,
"grad_norm": 0.30091673135757446,
"learning_rate": 0.00018461470708759712,
"loss": 1.7042,
"mean_token_accuracy": 0.5763454169034958,
"step": 500
},
{
"epoch": 0.20771815497403523,
"grad_norm": 0.31175661087036133,
"learning_rate": 0.00018390416599888435,
"loss": 1.689,
"mean_token_accuracy": 0.5796464458107948,
"step": 510
},
{
"epoch": 0.21179105997352612,
"grad_norm": 0.3624255955219269,
"learning_rate": 0.0001831790180060422,
"loss": 1.5619,
"mean_token_accuracy": 0.6015763126313687,
"step": 520
},
{
"epoch": 0.215863964973017,
"grad_norm": 0.2667541205883026,
"learning_rate": 0.00018243938935032561,
"loss": 1.6877,
"mean_token_accuracy": 0.5839527539908886,
"step": 530
},
{
"epoch": 0.2199368699725079,
"grad_norm": 0.31019967794418335,
"learning_rate": 0.00018168540879393296,
"loss": 1.7831,
"mean_token_accuracy": 0.5688935197889805,
"step": 540
},
{
"epoch": 0.22400977497199878,
"grad_norm": 0.2726418673992157,
"learning_rate": 0.0001809172075975897,
"loss": 1.7288,
"mean_token_accuracy": 0.5798229008913041,
"step": 550
},
{
"epoch": 0.22808267997148968,
"grad_norm": 0.29514381289482117,
"learning_rate": 0.00018013491949769734,
"loss": 1.7188,
"mean_token_accuracy": 0.5756009854376316,
"step": 560
},
{
"epoch": 0.23215558497098054,
"grad_norm": 0.37964069843292236,
"learning_rate": 0.00017933868068305104,
"loss": 1.6244,
"mean_token_accuracy": 0.5932842157781124,
"step": 570
},
{
"epoch": 0.23622848997047144,
"grad_norm": 0.300620436668396,
"learning_rate": 0.0001785286297711305,
"loss": 1.5565,
"mean_token_accuracy": 0.5965760670602321,
"step": 580
},
{
"epoch": 0.24030139496996233,
"grad_norm": 0.5466737151145935,
"learning_rate": 0.00017770490778396808,
"loss": 1.6532,
"mean_token_accuracy": 0.5821332208812237,
"step": 590
},
{
"epoch": 0.2443742999694532,
"grad_norm": 0.3445660173892975,
"learning_rate": 0.00017686765812359808,
"loss": 1.7585,
"mean_token_accuracy": 0.5790032669901848,
"step": 600
},
{
"epoch": 0.2484472049689441,
"grad_norm": 0.3492606282234192,
"learning_rate": 0.0001760170265470921,
"loss": 1.6401,
"mean_token_accuracy": 0.6002471588551999,
"step": 610
},
{
"epoch": 0.252520109968435,
"grad_norm": 0.31294527649879456,
"learning_rate": 0.00017515316114118375,
"loss": 1.6915,
"mean_token_accuracy": 0.5570432722568512,
"step": 620
},
{
"epoch": 0.25659301496792586,
"grad_norm": 0.27257561683654785,
"learning_rate": 0.00017427621229648853,
"loss": 1.5666,
"mean_token_accuracy": 0.6028999522328377,
"step": 630
},
{
"epoch": 0.2606659199674168,
"grad_norm": 0.29818692803382874,
"learning_rate": 0.00017338633268132212,
"loss": 1.5926,
"mean_token_accuracy": 0.5965964362025261,
"step": 640
},
{
"epoch": 0.26473882496690765,
"grad_norm": 0.49210649728775024,
"learning_rate": 0.0001724836772151223,
"loss": 1.5925,
"mean_token_accuracy": 0.5952631443738937,
"step": 650
},
{
"epoch": 0.2688117299663985,
"grad_norm": 0.3807302713394165,
"learning_rate": 0.00017156840304147902,
"loss": 1.6237,
"mean_token_accuracy": 0.5884141281247139,
"step": 660
},
{
"epoch": 0.27288463496588944,
"grad_norm": 0.2621661126613617,
"learning_rate": 0.00017064066950077722,
"loss": 1.7356,
"mean_token_accuracy": 0.5827617473900318,
"step": 670
},
{
"epoch": 0.2769575399653803,
"grad_norm": 0.30957838892936707,
"learning_rate": 0.00016970063810245716,
"loss": 1.5585,
"mean_token_accuracy": 0.5888052701950073,
"step": 680
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.2501460611820221,
"learning_rate": 0.00016874847249689722,
"loss": 1.5913,
"mean_token_accuracy": 0.5886548452079297,
"step": 690
},
{
"epoch": 0.2851033499643621,
"grad_norm": 0.3207811415195465,
"learning_rate": 0.00016778433844692397,
"loss": 1.6791,
"mean_token_accuracy": 0.5861249402165413,
"step": 700
},
{
"epoch": 0.28917625496385296,
"grad_norm": 0.45466601848602295,
"learning_rate": 0.0001668084037989544,
"loss": 1.5153,
"mean_token_accuracy": 0.5999807387590408,
"step": 710
},
{
"epoch": 0.29324915996334383,
"grad_norm": 0.34910282492637634,
"learning_rate": 0.00016582083845377552,
"loss": 1.6821,
"mean_token_accuracy": 0.5889992110431195,
"step": 720
},
{
"epoch": 0.29732206496283475,
"grad_norm": 0.4916020929813385,
"learning_rate": 0.00016482181433696643,
"loss": 1.8462,
"mean_token_accuracy": 0.5748938458040357,
"step": 730
},
{
"epoch": 0.3013949699623256,
"grad_norm": 0.2545833885669708,
"learning_rate": 0.00016381150536896736,
"loss": 1.5756,
"mean_token_accuracy": 0.6056667067110538,
"step": 740
},
{
"epoch": 0.3054678749618165,
"grad_norm": 0.30347147583961487,
"learning_rate": 0.0001627900874348022,
"loss": 1.6003,
"mean_token_accuracy": 0.5873342089354991,
"step": 750
},
{
"epoch": 0.3095407799613074,
"grad_norm": 0.37371426820755005,
"learning_rate": 0.0001617577383534584,
"loss": 1.6576,
"mean_token_accuracy": 0.5790071420371532,
"step": 760
},
{
"epoch": 0.3136136849607983,
"grad_norm": 0.41969504952430725,
"learning_rate": 0.00016071463784693045,
"loss": 1.6181,
"mean_token_accuracy": 0.5854876518249512,
"step": 770
},
{
"epoch": 0.3176865899602892,
"grad_norm": 0.17495319247245789,
"learning_rate": 0.00015966096750893197,
"loss": 1.5142,
"mean_token_accuracy": 0.6079291738569736,
"step": 780
},
{
"epoch": 0.32175949495978007,
"grad_norm": 0.30013784766197205,
"learning_rate": 0.00015859691077328215,
"loss": 1.6583,
"mean_token_accuracy": 0.581703095138073,
"step": 790
},
{
"epoch": 0.32583239995927094,
"grad_norm": 0.3358050584793091,
"learning_rate": 0.00015752265288197155,
"loss": 1.6468,
"mean_token_accuracy": 0.6049091577529907,
"step": 800
},
{
"epoch": 0.32990530495876186,
"grad_norm": 0.3690403699874878,
"learning_rate": 0.00015643838085291323,
"loss": 1.8431,
"mean_token_accuracy": 0.5602408707141876,
"step": 810
},
{
"epoch": 0.3339782099582527,
"grad_norm": 0.34296655654907227,
"learning_rate": 0.00015534428344738505,
"loss": 1.7042,
"mean_token_accuracy": 0.5799131192266941,
"step": 820
},
{
"epoch": 0.3380511149577436,
"grad_norm": 0.2764555513858795,
"learning_rate": 0.00015424055113716763,
"loss": 1.5479,
"mean_token_accuracy": 0.5909703068435193,
"step": 830
},
{
"epoch": 0.3421240199572345,
"grad_norm": 0.26227012276649475,
"learning_rate": 0.0001531273760713855,
"loss": 1.5303,
"mean_token_accuracy": 0.5910052061080933,
"step": 840
},
{
"epoch": 0.3461969249567254,
"grad_norm": 0.3656936585903168,
"learning_rate": 0.00015200495204305574,
"loss": 1.5586,
"mean_token_accuracy": 0.5943005800247192,
"step": 850
},
{
"epoch": 0.35026982995621625,
"grad_norm": 0.29997819662094116,
"learning_rate": 0.00015087347445535013,
"loss": 1.8219,
"mean_token_accuracy": 0.5533552631735802,
"step": 860
},
{
"epoch": 0.3543427349557072,
"grad_norm": 0.290685772895813,
"learning_rate": 0.00014973314028757787,
"loss": 1.7261,
"mean_token_accuracy": 0.5844682581722737,
"step": 870
},
{
"epoch": 0.35841563995519804,
"grad_norm": 0.34553012251853943,
"learning_rate": 0.00014858414806089295,
"loss": 1.6982,
"mean_token_accuracy": 0.5762835793197155,
"step": 880
},
{
"epoch": 0.3624885449546889,
"grad_norm": 0.2141156941652298,
"learning_rate": 0.0001474266978037338,
"loss": 1.5318,
"mean_token_accuracy": 0.6048024773597718,
"step": 890
},
{
"epoch": 0.36656144995417983,
"grad_norm": 0.30456602573394775,
"learning_rate": 0.00014626099101700018,
"loss": 1.7901,
"mean_token_accuracy": 0.5763920709490776,
"step": 900
},
{
"epoch": 0.3706343549536707,
"grad_norm": 0.26921945810317993,
"learning_rate": 0.00014508723063897376,
"loss": 1.4936,
"mean_token_accuracy": 0.6303243085741996,
"step": 910
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.28455570340156555,
"learning_rate": 0.00014390562100998868,
"loss": 1.5804,
"mean_token_accuracy": 0.6074232332408428,
"step": 920
},
{
"epoch": 0.3787801649526525,
"grad_norm": 0.3388415575027466,
"learning_rate": 0.00014271636783685777,
"loss": 1.6731,
"mean_token_accuracy": 0.5768752813339233,
"step": 930
},
{
"epoch": 0.38285306995214335,
"grad_norm": 0.4311608076095581,
"learning_rate": 0.00014151967815706091,
"loss": 1.7237,
"mean_token_accuracy": 0.5706497602164745,
"step": 940
},
{
"epoch": 0.3869259749516343,
"grad_norm": 0.35940027236938477,
"learning_rate": 0.00014031576030270202,
"loss": 1.5355,
"mean_token_accuracy": 0.5908183179795742,
"step": 950
},
{
"epoch": 0.39099887995112514,
"grad_norm": 0.34071287512779236,
"learning_rate": 0.00013910482386424023,
"loss": 1.7247,
"mean_token_accuracy": 0.5757749699056148,
"step": 960
},
{
"epoch": 0.395071784950616,
"grad_norm": 0.413870245218277,
"learning_rate": 0.00013788707965400236,
"loss": 1.6796,
"mean_token_accuracy": 0.592286454886198,
"step": 970
},
{
"epoch": 0.39914468995010693,
"grad_norm": 0.2649496793746948,
"learning_rate": 0.00013666273966948252,
"loss": 1.5955,
"mean_token_accuracy": 0.5936679825186729,
"step": 980
},
{
"epoch": 0.4032175949495978,
"grad_norm": 0.3525199294090271,
"learning_rate": 0.00013543201705643526,
"loss": 1.647,
"mean_token_accuracy": 0.5950982637703419,
"step": 990
},
{
"epoch": 0.40729049994908867,
"grad_norm": 0.33436283469200134,
"learning_rate": 0.00013419512607176914,
"loss": 1.7161,
"mean_token_accuracy": 0.574284989386797,
"step": 1000
},
{
"epoch": 0.4113634049485796,
"grad_norm": 0.46867313981056213,
"learning_rate": 0.00013295228204624648,
"loss": 1.544,
"mean_token_accuracy": 0.6102774910628795,
"step": 1010
},
{
"epoch": 0.41543630994807046,
"grad_norm": 0.30373555421829224,
"learning_rate": 0.00013170370134699653,
"loss": 1.6287,
"mean_token_accuracy": 0.5843084178864956,
"step": 1020
},
{
"epoch": 0.4195092149475613,
"grad_norm": 0.2981624901294708,
"learning_rate": 0.00013044960133984804,
"loss": 1.6858,
"mean_token_accuracy": 0.5856122255325318,
"step": 1030
},
{
"epoch": 0.42358211994705225,
"grad_norm": 0.3545626997947693,
"learning_rate": 0.00012919020035148776,
"loss": 1.7392,
"mean_token_accuracy": 0.5841099888086319,
"step": 1040
},
{
"epoch": 0.4276550249465431,
"grad_norm": 0.2896677553653717,
"learning_rate": 0.0001279257176314521,
"loss": 1.5007,
"mean_token_accuracy": 0.573243772238493,
"step": 1050
},
{
"epoch": 0.431727929946034,
"grad_norm": 0.36384209990501404,
"learning_rate": 0.00012665637331395785,
"loss": 1.487,
"mean_token_accuracy": 0.6025885075330735,
"step": 1060
},
{
"epoch": 0.4358008349455249,
"grad_norm": 0.3681187033653259,
"learning_rate": 0.00012538238837957882,
"loss": 1.4913,
"mean_token_accuracy": 0.5982382036745548,
"step": 1070
},
{
"epoch": 0.4398737399450158,
"grad_norm": 0.2680988311767578,
"learning_rate": 0.00012410398461677554,
"loss": 1.6263,
"mean_token_accuracy": 0.5956345148384571,
"step": 1080
},
{
"epoch": 0.4439466449445067,
"grad_norm": 0.23174384236335754,
"learning_rate": 0.00012282138458328358,
"loss": 1.7378,
"mean_token_accuracy": 0.590882021188736,
"step": 1090
},
{
"epoch": 0.44801954994399756,
"grad_norm": 0.34088292717933655,
"learning_rate": 0.00012153481156736892,
"loss": 1.7385,
"mean_token_accuracy": 0.5994494572281838,
"step": 1100
},
{
"epoch": 0.45209245494348843,
"grad_norm": 0.24563632905483246,
"learning_rate": 0.00012024448954895522,
"loss": 1.5212,
"mean_token_accuracy": 0.6165470741689205,
"step": 1110
},
{
"epoch": 0.45616535994297935,
"grad_norm": 0.26980966329574585,
"learning_rate": 0.00011895064316063127,
"loss": 1.5254,
"mean_token_accuracy": 0.5898841544985771,
"step": 1120
},
{
"epoch": 0.4602382649424702,
"grad_norm": 0.32573202252388,
"learning_rate": 0.00011765349764854461,
"loss": 1.5704,
"mean_token_accuracy": 0.6047514051198959,
"step": 1130
},
{
"epoch": 0.4643111699419611,
"grad_norm": 0.3137454390525818,
"learning_rate": 0.00011635327883318831,
"loss": 1.5893,
"mean_token_accuracy": 0.5792985640466213,
"step": 1140
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.368747353553772,
"learning_rate": 0.00011505021307008785,
"loss": 1.6388,
"mean_token_accuracy": 0.5851111486554146,
"step": 1150
},
{
"epoch": 0.4724569799409429,
"grad_norm": 0.325250506401062,
"learning_rate": 0.00011374452721039477,
"loss": 1.7192,
"mean_token_accuracy": 0.5636343933641911,
"step": 1160
},
{
"epoch": 0.47652988494043375,
"grad_norm": 0.32918378710746765,
"learning_rate": 0.00011243644856139403,
"loss": 1.6048,
"mean_token_accuracy": 0.6072004094719887,
"step": 1170
},
{
"epoch": 0.48060278993992467,
"grad_norm": 0.2892746031284332,
"learning_rate": 0.00011112620484693223,
"loss": 1.6785,
"mean_token_accuracy": 0.5872686378657818,
"step": 1180
},
{
"epoch": 0.48467569493941554,
"grad_norm": 0.2459000200033188,
"learning_rate": 0.0001098140241677728,
"loss": 1.5799,
"mean_token_accuracy": 0.6077749952673912,
"step": 1190
},
{
"epoch": 0.4887485999389064,
"grad_norm": 0.3696756660938263,
"learning_rate": 0.00010850013496188606,
"loss": 1.5966,
"mean_token_accuracy": 0.5970290452241898,
"step": 1200
},
{
"epoch": 0.4928215049383973,
"grad_norm": 0.27681517601013184,
"learning_rate": 0.00010718476596468028,
"loss": 1.7161,
"mean_token_accuracy": 0.5730410292744637,
"step": 1210
},
{
"epoch": 0.4968944099378882,
"grad_norm": 0.2720302641391754,
"learning_rate": 0.00010586814616918113,
"loss": 1.6991,
"mean_token_accuracy": 0.5764113113284111,
"step": 1220
},
{
"epoch": 0.5009673149373791,
"grad_norm": 0.34990179538726807,
"learning_rate": 0.00010455050478616617,
"loss": 1.7114,
"mean_token_accuracy": 0.5776129819452762,
"step": 1230
},
{
"epoch": 0.50504021993687,
"grad_norm": 0.33753877878189087,
"learning_rate": 0.00010323207120426142,
"loss": 1.8174,
"mean_token_accuracy": 0.5551487416028976,
"step": 1240
},
{
"epoch": 0.5091131249363609,
"grad_norm": 0.41568267345428467,
"learning_rate": 0.00010191307495000712,
"loss": 1.799,
"mean_token_accuracy": 0.5767477229237556,
"step": 1250
},
{
"epoch": 0.5131860299358517,
"grad_norm": 0.2747114300727844,
"learning_rate": 0.00010059374564789932,
"loss": 1.4763,
"mean_token_accuracy": 0.6238099962472916,
"step": 1260
},
{
"epoch": 0.5172589349353426,
"grad_norm": 0.2458280771970749,
"learning_rate": 9.927431298041441e-05,
"loss": 1.5262,
"mean_token_accuracy": 0.6056429393589496,
"step": 1270
},
{
"epoch": 0.5213318399348336,
"grad_norm": 0.2757134437561035,
"learning_rate": 9.795500664802385e-05,
"loss": 1.621,
"mean_token_accuracy": 0.5842474676668644,
"step": 1280
},
{
"epoch": 0.5254047449343244,
"grad_norm": 0.21551673114299774,
"learning_rate": 9.663605632920518e-05,
"loss": 1.659,
"mean_token_accuracy": 0.5935076788067818,
"step": 1290
},
{
"epoch": 0.5294776499338153,
"grad_norm": 0.5034237504005432,
"learning_rate": 9.53176916404576e-05,
"loss": 1.7666,
"mean_token_accuracy": 0.5699214018881321,
"step": 1300
},
{
"epoch": 0.5335505549333062,
"grad_norm": 0.26525890827178955,
"learning_rate": 9.400014209632763e-05,
"loss": 1.6026,
"mean_token_accuracy": 0.5935329027473927,
"step": 1310
},
{
"epoch": 0.537623459932797,
"grad_norm": 0.28077974915504456,
"learning_rate": 9.268363706945312e-05,
"loss": 1.7769,
"mean_token_accuracy": 0.5664741955697536,
"step": 1320
},
{
"epoch": 0.5416963649322879,
"grad_norm": 0.514976978302002,
"learning_rate": 9.136840575063147e-05,
"loss": 1.5157,
"mean_token_accuracy": 0.6034789860248566,
"step": 1330
},
{
"epoch": 0.5457692699317789,
"grad_norm": 0.318249374628067,
"learning_rate": 9.005467710891987e-05,
"loss": 1.8756,
"mean_token_accuracy": 0.5630597174167633,
"step": 1340
},
{
"epoch": 0.5498421749312697,
"grad_norm": 0.24940232932567596,
"learning_rate": 8.874267985177394e-05,
"loss": 1.5708,
"mean_token_accuracy": 0.5888857699930667,
"step": 1350
},
{
"epoch": 0.5539150799307606,
"grad_norm": 0.26299914717674255,
"learning_rate": 8.743264238523199e-05,
"loss": 1.6876,
"mean_token_accuracy": 0.5782084472477436,
"step": 1360
},
{
"epoch": 0.5579879849302515,
"grad_norm": 0.2588869333267212,
"learning_rate": 8.612479277415174e-05,
"loss": 1.6694,
"mean_token_accuracy": 0.585976778715849,
"step": 1370
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.2464841604232788,
"learning_rate": 8.481935870250637e-05,
"loss": 1.5838,
"mean_token_accuracy": 0.605075704306364,
"step": 1380
},
{
"epoch": 0.5661337949292333,
"grad_norm": 0.3231446146965027,
"learning_rate": 8.351656743374709e-05,
"loss": 1.6321,
"mean_token_accuracy": 0.5716924026608468,
"step": 1390
},
{
"epoch": 0.5702066999287242,
"grad_norm": 0.23010632395744324,
"learning_rate": 8.22166457712386e-05,
"loss": 1.5016,
"mean_token_accuracy": 0.6048496462404728,
"step": 1400
},
{
"epoch": 0.5742796049282151,
"grad_norm": 0.3723667860031128,
"learning_rate": 8.091982001877493e-05,
"loss": 1.5412,
"mean_token_accuracy": 0.6111127749085427,
"step": 1410
},
{
"epoch": 0.5783525099277059,
"grad_norm": 0.24990710616111755,
"learning_rate": 7.962631594118208e-05,
"loss": 1.7629,
"mean_token_accuracy": 0.5585654892027379,
"step": 1420
},
{
"epoch": 0.5824254149271968,
"grad_norm": 0.3681967556476593,
"learning_rate": 7.833635872501462e-05,
"loss": 1.6342,
"mean_token_accuracy": 0.5907308183610439,
"step": 1430
},
{
"epoch": 0.5864983199266877,
"grad_norm": 0.3382493555545807,
"learning_rate": 7.705017293935281e-05,
"loss": 1.5803,
"mean_token_accuracy": 0.6061145611107349,
"step": 1440
},
{
"epoch": 0.5905712249261786,
"grad_norm": 0.28145501017570496,
"learning_rate": 7.576798249670725e-05,
"loss": 1.8459,
"mean_token_accuracy": 0.5457224696874619,
"step": 1450
},
{
"epoch": 0.5946441299256695,
"grad_norm": 0.3189752697944641,
"learning_rate": 7.449001061403809e-05,
"loss": 1.5263,
"mean_token_accuracy": 0.5937092356383801,
"step": 1460
},
{
"epoch": 0.5987170349251604,
"grad_norm": 0.2588890492916107,
"learning_rate": 7.321647977389479e-05,
"loss": 1.5965,
"mean_token_accuracy": 0.5941358201205731,
"step": 1470
},
{
"epoch": 0.6027899399246512,
"grad_norm": 0.2777283191680908,
"learning_rate": 7.194761168568445e-05,
"loss": 1.5667,
"mean_token_accuracy": 0.6003799811005592,
"step": 1480
},
{
"epoch": 0.6068628449241421,
"grad_norm": 0.23376941680908203,
"learning_rate": 7.068362724707392e-05,
"loss": 1.4813,
"mean_token_accuracy": 0.6078310683369637,
"step": 1490
},
{
"epoch": 0.610935749923633,
"grad_norm": 0.2295948565006256,
"learning_rate": 6.942474650553408e-05,
"loss": 1.6786,
"mean_token_accuracy": 0.5886344678699971,
"step": 1500
},
{
"epoch": 0.615008654923124,
"grad_norm": 0.3243666887283325,
"learning_rate": 6.817118862003132e-05,
"loss": 1.6343,
"mean_token_accuracy": 0.5855603873729706,
"step": 1510
},
{
"epoch": 0.6190815599226148,
"grad_norm": 0.7187057733535767,
"learning_rate": 6.692317182287432e-05,
"loss": 1.8144,
"mean_token_accuracy": 0.5671629451215268,
"step": 1520
},
{
"epoch": 0.6231544649221057,
"grad_norm": 0.35659492015838623,
"learning_rate": 6.568091338172195e-05,
"loss": 1.6117,
"mean_token_accuracy": 0.601442601531744,
"step": 1530
},
{
"epoch": 0.6272273699215966,
"grad_norm": 0.3395217955112457,
"learning_rate": 6.444462956175876e-05,
"loss": 1.6222,
"mean_token_accuracy": 0.5970501154661179,
"step": 1540
},
{
"epoch": 0.6313002749210874,
"grad_norm": 0.26399192214012146,
"learning_rate": 6.321453558804571e-05,
"loss": 1.6048,
"mean_token_accuracy": 0.5844796732068062,
"step": 1550
},
{
"epoch": 0.6353731799205784,
"grad_norm": 0.2993052899837494,
"learning_rate": 6.199084560805121e-05,
"loss": 1.7073,
"mean_token_accuracy": 0.5789771333336831,
"step": 1560
},
{
"epoch": 0.6394460849200693,
"grad_norm": 0.2676873505115509,
"learning_rate": 6.077377265437043e-05,
"loss": 1.8152,
"mean_token_accuracy": 0.5734024614095687,
"step": 1570
},
{
"epoch": 0.6435189899195601,
"grad_norm": 0.293557733297348,
"learning_rate": 5.956352860763809e-05,
"loss": 1.7108,
"mean_token_accuracy": 0.5808110930025577,
"step": 1580
},
{
"epoch": 0.647591894919051,
"grad_norm": 0.23729322850704193,
"learning_rate": 5.83603241596423e-05,
"loss": 1.4793,
"mean_token_accuracy": 0.6202867470681668,
"step": 1590
},
{
"epoch": 0.6516647999185419,
"grad_norm": 0.30609002709388733,
"learning_rate": 5.716436877664517e-05,
"loss": 1.752,
"mean_token_accuracy": 0.5730870619416237,
"step": 1600
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.30717799067497253,
"learning_rate": 5.5975870662916484e-05,
"loss": 1.7172,
"mean_token_accuracy": 0.5701417997479439,
"step": 1610
},
{
"epoch": 0.6598106099175237,
"grad_norm": 0.44037064909935,
"learning_rate": 5.4795036724487735e-05,
"loss": 1.5377,
"mean_token_accuracy": 0.6102925211191177,
"step": 1620
},
{
"epoch": 0.6638835149170146,
"grad_norm": 0.24488377571105957,
"learning_rate": 5.362207253313136e-05,
"loss": 1.4547,
"mean_token_accuracy": 0.6181615687906742,
"step": 1630
},
{
"epoch": 0.6679564199165055,
"grad_norm": 0.2750435769557953,
"learning_rate": 5.245718229057326e-05,
"loss": 1.6086,
"mean_token_accuracy": 0.5703060247004033,
"step": 1640
},
{
"epoch": 0.6720293249159963,
"grad_norm": 0.2821342647075653,
"learning_rate": 5.1300568792942535e-05,
"loss": 1.6018,
"mean_token_accuracy": 0.5989562854170799,
"step": 1650
},
{
"epoch": 0.6761022299154872,
"grad_norm": 0.22521165013313293,
"learning_rate": 5.015243339546731e-05,
"loss": 1.7574,
"mean_token_accuracy": 0.5801547184586525,
"step": 1660
},
{
"epoch": 0.6801751349149782,
"grad_norm": 0.29259297251701355,
"learning_rate": 4.90129759774202e-05,
"loss": 1.7425,
"mean_token_accuracy": 0.5723637498915195,
"step": 1670
},
{
"epoch": 0.684248039914469,
"grad_norm": 0.2705146074295044,
"learning_rate": 4.7882394907321674e-05,
"loss": 1.6121,
"mean_token_accuracy": 0.6098110035061837,
"step": 1680
},
{
"epoch": 0.6883209449139599,
"grad_norm": 0.2677505910396576,
"learning_rate": 4.676088700840575e-05,
"loss": 1.6416,
"mean_token_accuracy": 0.5757282719016075,
"step": 1690
},
{
"epoch": 0.6923938499134508,
"grad_norm": 0.2644527554512024,
"learning_rate": 4.564864752435509e-05,
"loss": 1.6675,
"mean_token_accuracy": 0.6154301188886165,
"step": 1700
},
{
"epoch": 0.6964667549129416,
"grad_norm": 0.23048701882362366,
"learning_rate": 4.454587008531097e-05,
"loss": 1.6641,
"mean_token_accuracy": 0.5855869121849537,
"step": 1710
},
{
"epoch": 0.7005396599124325,
"grad_norm": 0.2789078652858734,
"learning_rate": 4.345274667416399e-05,
"loss": 1.6978,
"mean_token_accuracy": 0.5762215368449688,
"step": 1720
},
{
"epoch": 0.7046125649119235,
"grad_norm": 0.271881103515625,
"learning_rate": 4.2369467593131926e-05,
"loss": 1.681,
"mean_token_accuracy": 0.5667479492723941,
"step": 1730
},
{
"epoch": 0.7086854699114143,
"grad_norm": 0.24953240156173706,
"learning_rate": 4.129622143062985e-05,
"loss": 1.5405,
"mean_token_accuracy": 0.6005463972687721,
"step": 1740
},
{
"epoch": 0.7127583749109052,
"grad_norm": 0.3925758898258209,
"learning_rate": 4.02331950284387e-05,
"loss": 1.7217,
"mean_token_accuracy": 0.5689709268510341,
"step": 1750
},
{
"epoch": 0.7168312799103961,
"grad_norm": 0.2544846832752228,
"learning_rate": 3.918057344917795e-05,
"loss": 1.5948,
"mean_token_accuracy": 0.5933421194553375,
"step": 1760
},
{
"epoch": 0.720904184909887,
"grad_norm": 0.32760509848594666,
"learning_rate": 3.813853994408793e-05,
"loss": 1.6678,
"mean_token_accuracy": 0.5856216661632061,
"step": 1770
},
{
"epoch": 0.7249770899093778,
"grad_norm": 0.2847062647342682,
"learning_rate": 3.7107275921127704e-05,
"loss": 1.682,
"mean_token_accuracy": 0.5889982558786869,
"step": 1780
},
{
"epoch": 0.7290499949088688,
"grad_norm": 0.22774401307106018,
"learning_rate": 3.60869609133936e-05,
"loss": 1.7135,
"mean_token_accuracy": 0.5773006275296211,
"step": 1790
},
{
"epoch": 0.7331228999083597,
"grad_norm": 0.2606080174446106,
"learning_rate": 3.507777254786425e-05,
"loss": 1.4999,
"mean_token_accuracy": 0.6269011601805687,
"step": 1800
},
{
"epoch": 0.7371958049078505,
"grad_norm": 0.2962757647037506,
"learning_rate": 3.407988651447738e-05,
"loss": 1.6202,
"mean_token_accuracy": 0.5973276488482953,
"step": 1810
},
{
"epoch": 0.7412687099073414,
"grad_norm": 0.29107147455215454,
"learning_rate": 3.3093476535544074e-05,
"loss": 1.5502,
"mean_token_accuracy": 0.6133273020386696,
"step": 1820
},
{
"epoch": 0.7453416149068323,
"grad_norm": 0.20980948209762573,
"learning_rate": 3.211871433550513e-05,
"loss": 1.6333,
"mean_token_accuracy": 0.6155988665297627,
"step": 1830
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.24882718920707703,
"learning_rate": 3.1155769611035825e-05,
"loss": 1.4907,
"mean_token_accuracy": 0.6201219961047173,
"step": 1840
},
{
"epoch": 0.7534874249058141,
"grad_norm": 0.23715901374816895,
"learning_rate": 3.0204810001503124e-05,
"loss": 1.8018,
"mean_token_accuracy": 0.5756942637264728,
"step": 1850
},
{
"epoch": 0.757560329905305,
"grad_norm": 0.35216882824897766,
"learning_rate": 2.9266001059781258e-05,
"loss": 1.7305,
"mean_token_accuracy": 0.5722471877932549,
"step": 1860
},
{
"epoch": 0.7616332349047958,
"grad_norm": 0.2924104332923889,
"learning_rate": 2.83395062234308e-05,
"loss": 1.6642,
"mean_token_accuracy": 0.58627370595932,
"step": 1870
},
{
"epoch": 0.7657061399042867,
"grad_norm": 0.27772393822669983,
"learning_rate": 2.742548678624548e-05,
"loss": 1.8349,
"mean_token_accuracy": 0.5614061944186688,
"step": 1880
},
{
"epoch": 0.7697790449037776,
"grad_norm": 0.31574469804763794,
"learning_rate": 2.6524101870172846e-05,
"loss": 1.7883,
"mean_token_accuracy": 0.561104378849268,
"step": 1890
},
{
"epoch": 0.7738519499032686,
"grad_norm": 0.253779798746109,
"learning_rate": 2.5635508397612262e-05,
"loss": 1.6654,
"mean_token_accuracy": 0.5888113439083099,
"step": 1900
},
{
"epoch": 0.7779248549027594,
"grad_norm": 0.2504970133304596,
"learning_rate": 2.4759861064096603e-05,
"loss": 1.6478,
"mean_token_accuracy": 0.5726306334137916,
"step": 1910
},
{
"epoch": 0.7819977599022503,
"grad_norm": 0.23571030795574188,
"learning_rate": 2.3897312311360955e-05,
"loss": 1.5355,
"mean_token_accuracy": 0.6026113323867321,
"step": 1920
},
{
"epoch": 0.7860706649017412,
"grad_norm": 0.2395690232515335,
"learning_rate": 2.3048012300804222e-05,
"loss": 1.5565,
"mean_token_accuracy": 0.5976604223251343,
"step": 1930
},
{
"epoch": 0.790143569901232,
"grad_norm": 0.5269713997840881,
"learning_rate": 2.221210888734736e-05,
"loss": 1.636,
"mean_token_accuracy": 0.5818449839949608,
"step": 1940
},
{
"epoch": 0.7942164749007229,
"grad_norm": 0.4233987033367157,
"learning_rate": 2.13897475936933e-05,
"loss": 1.7844,
"mean_token_accuracy": 0.5720866233110428,
"step": 1950
},
{
"epoch": 0.7982893799002139,
"grad_norm": 0.2641923427581787,
"learning_rate": 2.0581071584992818e-05,
"loss": 1.5874,
"mean_token_accuracy": 0.5966846913099288,
"step": 1960
},
{
"epoch": 0.8023622848997047,
"grad_norm": 0.27280351519584656,
"learning_rate": 1.9786221643920844e-05,
"loss": 1.6279,
"mean_token_accuracy": 0.5751761384308338,
"step": 1970
},
{
"epoch": 0.8064351898991956,
"grad_norm": 0.3823714256286621,
"learning_rate": 1.9005336146167686e-05,
"loss": 1.6269,
"mean_token_accuracy": 0.5963201723992825,
"step": 1980
},
{
"epoch": 0.8105080948986865,
"grad_norm": 0.25173816084861755,
"learning_rate": 1.8238551036349028e-05,
"loss": 1.5308,
"mean_token_accuracy": 0.6112879984080791,
"step": 1990
},
{
"epoch": 0.8145809998981773,
"grad_norm": 0.21256780624389648,
"learning_rate": 1.7485999804339348e-05,
"loss": 1.5568,
"mean_token_accuracy": 0.5963364981114865,
"step": 2000
},
{
"epoch": 0.8186539048976683,
"grad_norm": 0.2510949969291687,
"learning_rate": 1.6747813462032615e-05,
"loss": 1.6787,
"mean_token_accuracy": 0.58960345312953,
"step": 2010
},
{
"epoch": 0.8227268098971592,
"grad_norm": 0.255790650844574,
"learning_rate": 1.6024120520534326e-05,
"loss": 1.6416,
"mean_token_accuracy": 0.5875880800187587,
"step": 2020
},
{
"epoch": 0.82679971489665,
"grad_norm": 0.307492196559906,
"learning_rate": 1.5315046967789082e-05,
"loss": 1.69,
"mean_token_accuracy": 0.5625761769711971,
"step": 2030
},
{
"epoch": 0.8308726198961409,
"grad_norm": 0.2648999094963074,
"learning_rate": 1.4620716246647203e-05,
"loss": 1.6092,
"mean_token_accuracy": 0.6106476083397865,
"step": 2040
},
{
"epoch": 0.8349455248956318,
"grad_norm": 0.2488166093826294,
"learning_rate": 1.394124923337462e-05,
"loss": 1.6848,
"mean_token_accuracy": 0.5697021905332804,
"step": 2050
},
{
"epoch": 0.8390184298951227,
"grad_norm": 0.2427694946527481,
"learning_rate": 1.3276764216609294e-05,
"loss": 1.5843,
"mean_token_accuracy": 0.6084981314837933,
"step": 2060
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.2833966910839081,
"learning_rate": 1.2627376876768593e-05,
"loss": 1.5443,
"mean_token_accuracy": 0.6015144042670727,
"step": 2070
},
{
"epoch": 0.8471642398941045,
"grad_norm": 0.4057978689670563,
"learning_rate": 1.1993200265910131e-05,
"loss": 1.6073,
"mean_token_accuracy": 0.5917512811720371,
"step": 2080
},
{
"epoch": 0.8512371448935954,
"grad_norm": 0.25613030791282654,
"learning_rate": 1.1374344788050829e-05,
"loss": 1.8038,
"mean_token_accuracy": 0.5568435616791249,
"step": 2090
},
{
"epoch": 0.8553100498930862,
"grad_norm": 0.30181950330734253,
"learning_rate": 1.0770918179946388e-05,
"loss": 1.5022,
"mean_token_accuracy": 0.6081097513437271,
"step": 2100
},
{
"epoch": 0.8593829548925771,
"grad_norm": 0.23373402655124664,
"learning_rate": 1.0183025492335408e-05,
"loss": 1.7432,
"mean_token_accuracy": 0.5653887689113617,
"step": 2110
},
{
"epoch": 0.863455859892068,
"grad_norm": 0.2826649248600006,
"learning_rate": 9.610769071651193e-06,
"loss": 1.6706,
"mean_token_accuracy": 0.5875243842601776,
"step": 2120
},
{
"epoch": 0.867528764891559,
"grad_norm": 0.3047688603401184,
"learning_rate": 9.05424854220408e-06,
"loss": 1.5901,
"mean_token_accuracy": 0.6013362683355808,
"step": 2130
},
{
"epoch": 0.8716016698910498,
"grad_norm": 0.3211512863636017,
"learning_rate": 8.513560788837916e-06,
"loss": 1.6414,
"mean_token_accuracy": 0.5845984369516373,
"step": 2140
},
{
"epoch": 0.8756745748905407,
"grad_norm": 0.22475050389766693,
"learning_rate": 7.988799940063297e-06,
"loss": 1.6038,
"mean_token_accuracy": 0.5835995152592659,
"step": 2150
},
{
"epoch": 0.8797474798900315,
"grad_norm": 0.2239948809146881,
"learning_rate": 7.480057351670688e-06,
"loss": 1.6661,
"mean_token_accuracy": 0.5898953646421432,
"step": 2160
},
{
"epoch": 0.8838203848895224,
"grad_norm": 0.3669275641441345,
"learning_rate": 6.987421590826282e-06,
"loss": 1.6066,
"mean_token_accuracy": 0.5877827815711498,
"step": 2170
},
{
"epoch": 0.8878932898890134,
"grad_norm": 0.30003634095191956,
"learning_rate": 6.510978420653335e-06,
"loss": 1.6816,
"mean_token_accuracy": 0.5926426865160466,
"step": 2180
},
{
"epoch": 0.8919661948885043,
"grad_norm": 0.2707299590110779,
"learning_rate": 6.050810785301597e-06,
"loss": 1.7702,
"mean_token_accuracy": 0.561020129173994,
"step": 2190
},
{
"epoch": 0.8960390998879951,
"grad_norm": 0.3029952347278595,
"learning_rate": 5.606998795507578e-06,
"loss": 1.5417,
"mean_token_accuracy": 0.598423033952713,
"step": 2200
},
{
"epoch": 0.900112004887486,
"grad_norm": 0.27840766310691833,
"learning_rate": 5.1796197146479985e-06,
"loss": 1.5119,
"mean_token_accuracy": 0.6152562454342843,
"step": 2210
},
{
"epoch": 0.9041849098869769,
"grad_norm": 0.28235796093940735,
"learning_rate": 4.768747945288987e-06,
"loss": 1.5287,
"mean_token_accuracy": 0.61318289488554,
"step": 2220
},
{
"epoch": 0.9082578148864677,
"grad_norm": 0.21450947225093842,
"learning_rate": 4.37445501623337e-06,
"loss": 1.5842,
"mean_token_accuracy": 0.6025399126112461,
"step": 2230
},
{
"epoch": 0.9123307198859587,
"grad_norm": 0.29954469203948975,
"learning_rate": 3.996809570068127e-06,
"loss": 1.5514,
"mean_token_accuracy": 0.6040661752223968,
"step": 2240
},
{
"epoch": 0.9164036248854496,
"grad_norm": 0.34261876344680786,
"learning_rate": 3.635877351214445e-06,
"loss": 1.5493,
"mean_token_accuracy": 0.5996488876640796,
"step": 2250
},
{
"epoch": 0.9204765298849404,
"grad_norm": 0.24511079490184784,
"learning_rate": 3.291721194482189e-06,
"loss": 1.5494,
"mean_token_accuracy": 0.6054005287587643,
"step": 2260
},
{
"epoch": 0.9245494348844313,
"grad_norm": 0.21510252356529236,
"learning_rate": 2.9644010141310017e-06,
"loss": 1.6294,
"mean_token_accuracy": 0.5961603626608849,
"step": 2270
},
{
"epoch": 0.9286223398839222,
"grad_norm": 0.23636655509471893,
"learning_rate": 2.65397379343979e-06,
"loss": 1.7332,
"mean_token_accuracy": 0.5859133303165436,
"step": 2280
},
{
"epoch": 0.932695244883413,
"grad_norm": 0.25582408905029297,
"learning_rate": 2.3604935747865377e-06,
"loss": 1.6691,
"mean_token_accuracy": 0.5889919593930244,
"step": 2290
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.3853449523448944,
"learning_rate": 2.0840114502400086e-06,
"loss": 1.5358,
"mean_token_accuracy": 0.5844359740614891,
"step": 2300
},
{
"epoch": 0.9408410548823949,
"grad_norm": 0.2177136093378067,
"learning_rate": 1.8245755526650753e-06,
"loss": 1.6318,
"mean_token_accuracy": 0.5915890723466873,
"step": 2310
},
{
"epoch": 0.9449139598818858,
"grad_norm": 0.23138591647148132,
"learning_rate": 1.5822310473433411e-06,
"loss": 1.5595,
"mean_token_accuracy": 0.5974130786955356,
"step": 2320
},
{
"epoch": 0.9489868648813766,
"grad_norm": 0.2235519289970398,
"learning_rate": 1.357020124110231e-06,
"loss": 1.7522,
"mean_token_accuracy": 0.5713608346879482,
"step": 2330
},
{
"epoch": 0.9530597698808675,
"grad_norm": 0.37900933623313904,
"learning_rate": 1.1489819900101784e-06,
"loss": 1.5307,
"mean_token_accuracy": 0.6045880667865277,
"step": 2340
},
{
"epoch": 0.9571326748803585,
"grad_norm": 0.2911360561847687,
"learning_rate": 9.581528624710734e-07,
"loss": 1.5633,
"mean_token_accuracy": 0.5826431967318058,
"step": 2350
},
{
"epoch": 0.9612055798798493,
"grad_norm": 0.25369352102279663,
"learning_rate": 7.845659629990842e-07,
"loss": 1.6927,
"mean_token_accuracy": 0.5901580177247524,
"step": 2360
},
{
"epoch": 0.9652784848793402,
"grad_norm": 0.32107028365135193,
"learning_rate": 6.282515113952281e-07,
"loss": 1.815,
"mean_token_accuracy": 0.56534923017025,
"step": 2370
},
{
"epoch": 0.9693513898788311,
"grad_norm": 0.3105465769767761,
"learning_rate": 4.892367204943016e-07,
"loss": 1.5694,
"mean_token_accuracy": 0.5809950686991214,
"step": 2380
},
{
"epoch": 0.9734242948783219,
"grad_norm": 0.2689298689365387,
"learning_rate": 3.6754579142741495e-07,
"loss": 1.6555,
"mean_token_accuracy": 0.591179046779871,
"step": 2390
},
{
"epoch": 0.9774971998778128,
"grad_norm": 0.44850870966911316,
"learning_rate": 2.6319990940885107e-07,
"loss": 1.7315,
"mean_token_accuracy": 0.5772897489368916,
"step": 2400
},
{
"epoch": 0.9815701048773038,
"grad_norm": 0.25496381521224976,
"learning_rate": 1.762172400478601e-07,
"loss": 1.5847,
"mean_token_accuracy": 0.5798953503370285,
"step": 2410
},
{
"epoch": 0.9856430098767947,
"grad_norm": 0.2383822500705719,
"learning_rate": 1.0661292618624474e-07,
"loss": 1.54,
"mean_token_accuracy": 0.6138455606997013,
"step": 2420
},
{
"epoch": 0.9897159148762855,
"grad_norm": 0.2854715585708618,
"learning_rate": 5.439908526212456e-08,
"loss": 1.4109,
"mean_token_accuracy": 0.6151122771203518,
"step": 2430
},
{
"epoch": 0.9937888198757764,
"grad_norm": 0.297370046377182,
"learning_rate": 1.9584807200423438e-08,
"loss": 1.5128,
"mean_token_accuracy": 0.6013165354728699,
"step": 2440
},
{
"epoch": 0.9978617248752673,
"grad_norm": 0.2563394010066986,
"learning_rate": 2.176152830357658e-09,
"loss": 1.6287,
"mean_token_accuracy": 0.5945099242031574,
"step": 2450
}
],
"logging_steps": 10,
"max_steps": 2455,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.283473658609664e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}