{
"best_metric": 0.6174443364143372,
"best_model_checkpoint": "ckpt/llama2_13b_other/fuze_28_balance_no_sys/checkpoint-12000",
"epoch": 2.0,
"eval_steps": 3000,
"global_step": 13776,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014518002322880372,
"grad_norm": 0.5469616055488586,
"learning_rate": 2.5e-05,
"loss": 2.7374,
"step": 10
},
{
"epoch": 0.0029036004645760743,
"grad_norm": 3.591161012649536,
"learning_rate": 5e-05,
"loss": 2.639,
"step": 20
},
{
"epoch": 0.004355400696864111,
"grad_norm": 4.15903902053833,
"learning_rate": 4.9999934803356854e-05,
"loss": 1.9973,
"step": 30
},
{
"epoch": 0.005807200929152149,
"grad_norm": 0.7227652072906494,
"learning_rate": 4.999973921376744e-05,
"loss": 1.0062,
"step": 40
},
{
"epoch": 0.007259001161440186,
"grad_norm": 2.8751795291900635,
"learning_rate": 4.9999413232251924e-05,
"loss": 0.8956,
"step": 50
},
{
"epoch": 0.008710801393728223,
"grad_norm": 1.205117106437683,
"learning_rate": 4.9998956860510515e-05,
"loss": 0.6662,
"step": 60
},
{
"epoch": 0.01016260162601626,
"grad_norm": 2.018847703933716,
"learning_rate": 4.9998370100923546e-05,
"loss": 0.843,
"step": 70
},
{
"epoch": 0.011614401858304297,
"grad_norm": 1.5158370733261108,
"learning_rate": 4.9997652956551386e-05,
"loss": 0.6177,
"step": 80
},
{
"epoch": 0.013066202090592335,
"grad_norm": 2.3284151554107666,
"learning_rate": 4.999680543113447e-05,
"loss": 0.9499,
"step": 90
},
{
"epoch": 0.014518002322880372,
"grad_norm": 1.6479175090789795,
"learning_rate": 4.999582752909326e-05,
"loss": 0.94,
"step": 100
},
{
"epoch": 0.01596980255516841,
"grad_norm": 2.4836409091949463,
"learning_rate": 4.999471925552824e-05,
"loss": 1.0131,
"step": 110
},
{
"epoch": 0.017421602787456445,
"grad_norm": 0.8880699872970581,
"learning_rate": 4.999348061621986e-05,
"loss": 0.7342,
"step": 120
},
{
"epoch": 0.018873403019744484,
"grad_norm": 1.1543691158294678,
"learning_rate": 4.999211161762852e-05,
"loss": 0.9561,
"step": 130
},
{
"epoch": 0.02032520325203252,
"grad_norm": 1.358257532119751,
"learning_rate": 4.9990612266894574e-05,
"loss": 0.8214,
"step": 140
},
{
"epoch": 0.02177700348432056,
"grad_norm": 1.9943785667419434,
"learning_rate": 4.9988982571838214e-05,
"loss": 0.8319,
"step": 150
},
{
"epoch": 0.023228803716608595,
"grad_norm": 6.3814520835876465,
"learning_rate": 4.998722254095949e-05,
"loss": 0.9133,
"step": 160
},
{
"epoch": 0.02468060394889663,
"grad_norm": 5.650225639343262,
"learning_rate": 4.998533218343826e-05,
"loss": 0.8295,
"step": 170
},
{
"epoch": 0.02613240418118467,
"grad_norm": 0.8774411082267761,
"learning_rate": 4.998331150913412e-05,
"loss": 0.7959,
"step": 180
},
{
"epoch": 0.027584204413472705,
"grad_norm": 1.1295948028564453,
"learning_rate": 4.998116052858636e-05,
"loss": 0.6883,
"step": 190
},
{
"epoch": 0.029036004645760744,
"grad_norm": 1.1888564825057983,
"learning_rate": 4.9978879253013925e-05,
"loss": 0.9037,
"step": 200
},
{
"epoch": 0.03048780487804878,
"grad_norm": 2.754950761795044,
"learning_rate": 4.997646769431532e-05,
"loss": 0.9584,
"step": 210
},
{
"epoch": 0.03193960511033682,
"grad_norm": 0.6559054851531982,
"learning_rate": 4.9973925865068604e-05,
"loss": 0.9589,
"step": 220
},
{
"epoch": 0.03339140534262486,
"grad_norm": 0.49671778082847595,
"learning_rate": 4.997125377853127e-05,
"loss": 0.6646,
"step": 230
},
{
"epoch": 0.03484320557491289,
"grad_norm": 1.9896801710128784,
"learning_rate": 4.996845144864021e-05,
"loss": 0.9931,
"step": 240
},
{
"epoch": 0.03629500580720093,
"grad_norm": 5.797499179840088,
"learning_rate": 4.9965518890011606e-05,
"loss": 0.6066,
"step": 250
},
{
"epoch": 0.03774680603948897,
"grad_norm": 1.675417423248291,
"learning_rate": 4.996245611794091e-05,
"loss": 0.8694,
"step": 260
},
{
"epoch": 0.039198606271777,
"grad_norm": 1.7982059717178345,
"learning_rate": 4.9959263148402713e-05,
"loss": 0.699,
"step": 270
},
{
"epoch": 0.04065040650406504,
"grad_norm": 2.3844377994537354,
"learning_rate": 4.9955939998050696e-05,
"loss": 0.9412,
"step": 280
},
{
"epoch": 0.04210220673635308,
"grad_norm": 1.0922398567199707,
"learning_rate": 4.9952486684217516e-05,
"loss": 0.7914,
"step": 290
},
{
"epoch": 0.04355400696864112,
"grad_norm": 0.8432398438453674,
"learning_rate": 4.994890322491472e-05,
"loss": 0.8929,
"step": 300
},
{
"epoch": 0.04500580720092915,
"grad_norm": 0.7825923562049866,
"learning_rate": 4.9945189638832676e-05,
"loss": 0.9772,
"step": 310
},
{
"epoch": 0.04645760743321719,
"grad_norm": 3.1025567054748535,
"learning_rate": 4.994134594534046e-05,
"loss": 0.9957,
"step": 320
},
{
"epoch": 0.04790940766550523,
"grad_norm": 0.7623017430305481,
"learning_rate": 4.993737216448573e-05,
"loss": 0.5879,
"step": 330
},
{
"epoch": 0.04936120789779326,
"grad_norm": 0.5946126580238342,
"learning_rate": 4.9933268316994665e-05,
"loss": 0.7163,
"step": 340
},
{
"epoch": 0.0508130081300813,
"grad_norm": 2.404294490814209,
"learning_rate": 4.992903442427184e-05,
"loss": 0.7242,
"step": 350
},
{
"epoch": 0.05226480836236934,
"grad_norm": 2.895533323287964,
"learning_rate": 4.9924670508400096e-05,
"loss": 0.7379,
"step": 360
},
{
"epoch": 0.05371660859465738,
"grad_norm": 1.498542070388794,
"learning_rate": 4.992017659214044e-05,
"loss": 0.8413,
"step": 370
},
{
"epoch": 0.05516840882694541,
"grad_norm": 2.449810743331909,
"learning_rate": 4.991555269893194e-05,
"loss": 0.7442,
"step": 380
},
{
"epoch": 0.05662020905923345,
"grad_norm": 0.6948937773704529,
"learning_rate": 4.991079885289159e-05,
"loss": 0.7314,
"step": 390
},
{
"epoch": 0.05807200929152149,
"grad_norm": 1.1707820892333984,
"learning_rate": 4.990591507881416e-05,
"loss": 0.6596,
"step": 400
},
{
"epoch": 0.05952380952380952,
"grad_norm": 2.644362211227417,
"learning_rate": 4.99009014021721e-05,
"loss": 0.6841,
"step": 410
},
{
"epoch": 0.06097560975609756,
"grad_norm": 4.067262649536133,
"learning_rate": 4.9895757849115415e-05,
"loss": 0.9483,
"step": 420
},
{
"epoch": 0.0624274099883856,
"grad_norm": 0.9770334959030151,
"learning_rate": 4.989048444647149e-05,
"loss": 0.738,
"step": 430
},
{
"epoch": 0.06387921022067364,
"grad_norm": 0.6074944734573364,
"learning_rate": 4.988508122174498e-05,
"loss": 0.8884,
"step": 440
},
{
"epoch": 0.06533101045296168,
"grad_norm": 1.6675050258636475,
"learning_rate": 4.9879548203117654e-05,
"loss": 0.713,
"step": 450
},
{
"epoch": 0.06678281068524972,
"grad_norm": 2.338547945022583,
"learning_rate": 4.987388541944824e-05,
"loss": 1.0344,
"step": 460
},
{
"epoch": 0.06823461091753774,
"grad_norm": 2.637160301208496,
"learning_rate": 4.986809290027231e-05,
"loss": 0.6869,
"step": 470
},
{
"epoch": 0.06968641114982578,
"grad_norm": 1.666285753250122,
"learning_rate": 4.986217067580209e-05,
"loss": 0.7925,
"step": 480
},
{
"epoch": 0.07113821138211382,
"grad_norm": 3.2277042865753174,
"learning_rate": 4.98561187769263e-05,
"loss": 0.4641,
"step": 490
},
{
"epoch": 0.07259001161440186,
"grad_norm": 2.3560967445373535,
"learning_rate": 4.984993723521003e-05,
"loss": 0.5966,
"step": 500
},
{
"epoch": 0.0740418118466899,
"grad_norm": 1.709749698638916,
"learning_rate": 4.984362608289454e-05,
"loss": 0.672,
"step": 510
},
{
"epoch": 0.07549361207897794,
"grad_norm": 0.9946199655532837,
"learning_rate": 4.98371853528971e-05,
"loss": 0.7569,
"step": 520
},
{
"epoch": 0.07694541231126598,
"grad_norm": 0.839363694190979,
"learning_rate": 4.983061507881083e-05,
"loss": 0.6376,
"step": 530
},
{
"epoch": 0.078397212543554,
"grad_norm": 1.063310146331787,
"learning_rate": 4.982391529490452e-05,
"loss": 0.9329,
"step": 540
},
{
"epoch": 0.07984901277584204,
"grad_norm": 1.1951221227645874,
"learning_rate": 4.981708603612244e-05,
"loss": 0.7935,
"step": 550
},
{
"epoch": 0.08130081300813008,
"grad_norm": 1.3913841247558594,
"learning_rate": 4.981012733808417e-05,
"loss": 0.7963,
"step": 560
},
{
"epoch": 0.08275261324041812,
"grad_norm": 1.2319680452346802,
"learning_rate": 4.980303923708441e-05,
"loss": 0.8177,
"step": 570
},
{
"epoch": 0.08420441347270616,
"grad_norm": 1.925199031829834,
"learning_rate": 4.979582177009279e-05,
"loss": 0.7387,
"step": 580
},
{
"epoch": 0.0856562137049942,
"grad_norm": 1.3763043880462646,
"learning_rate": 4.9788474974753686e-05,
"loss": 0.6866,
"step": 590
},
{
"epoch": 0.08710801393728224,
"grad_norm": 1.1121422052383423,
"learning_rate": 4.9780998889386e-05,
"loss": 1.0793,
"step": 600
},
{
"epoch": 0.08855981416957026,
"grad_norm": 4.042593479156494,
"learning_rate": 4.9773393552982994e-05,
"loss": 0.7474,
"step": 610
},
{
"epoch": 0.0900116144018583,
"grad_norm": 1.7447359561920166,
"learning_rate": 4.976565900521205e-05,
"loss": 0.6573,
"step": 620
},
{
"epoch": 0.09146341463414634,
"grad_norm": 2.484264373779297,
"learning_rate": 4.975779528641451e-05,
"loss": 0.7327,
"step": 630
},
{
"epoch": 0.09291521486643438,
"grad_norm": 1.6296310424804688,
"learning_rate": 4.97498024376054e-05,
"loss": 1.0403,
"step": 640
},
{
"epoch": 0.09436701509872242,
"grad_norm": 0.7024840712547302,
"learning_rate": 4.9741680500473276e-05,
"loss": 0.8121,
"step": 650
},
{
"epoch": 0.09581881533101046,
"grad_norm": 5.444680690765381,
"learning_rate": 4.973342951737999e-05,
"loss": 0.6586,
"step": 660
},
{
"epoch": 0.0972706155632985,
"grad_norm": 3.954261302947998,
"learning_rate": 4.9725049531360454e-05,
"loss": 1.0836,
"step": 670
},
{
"epoch": 0.09872241579558652,
"grad_norm": 0.8371906876564026,
"learning_rate": 4.9716540586122425e-05,
"loss": 0.7811,
"step": 680
},
{
"epoch": 0.10017421602787456,
"grad_norm": 1.8764899969100952,
"learning_rate": 4.970790272604626e-05,
"loss": 0.5407,
"step": 690
},
{
"epoch": 0.1016260162601626,
"grad_norm": 1.2408713102340698,
"learning_rate": 4.9699135996184745e-05,
"loss": 0.751,
"step": 700
},
{
"epoch": 0.10307781649245064,
"grad_norm": 1.968125343322754,
"learning_rate": 4.969024044226276e-05,
"loss": 0.7491,
"step": 710
},
{
"epoch": 0.10452961672473868,
"grad_norm": 2.832357168197632,
"learning_rate": 4.9681216110677145e-05,
"loss": 0.8841,
"step": 720
},
{
"epoch": 0.10598141695702672,
"grad_norm": 1.2367392778396606,
"learning_rate": 4.9672063048496384e-05,
"loss": 0.6756,
"step": 730
},
{
"epoch": 0.10743321718931476,
"grad_norm": 1.3740241527557373,
"learning_rate": 4.9662781303460385e-05,
"loss": 0.62,
"step": 740
},
{
"epoch": 0.10888501742160278,
"grad_norm": 1.3408523797988892,
"learning_rate": 4.9653370923980245e-05,
"loss": 1.1724,
"step": 750
},
{
"epoch": 0.11033681765389082,
"grad_norm": 0.5014351010322571,
"learning_rate": 4.964383195913798e-05,
"loss": 0.6388,
"step": 760
},
{
"epoch": 0.11178861788617886,
"grad_norm": 4.638858318328857,
"learning_rate": 4.963416445868626e-05,
"loss": 0.9585,
"step": 770
},
{
"epoch": 0.1132404181184669,
"grad_norm": 1.0444010496139526,
"learning_rate": 4.962436847304818e-05,
"loss": 0.8725,
"step": 780
},
{
"epoch": 0.11469221835075494,
"grad_norm": 0.930262565612793,
"learning_rate": 4.9614444053316954e-05,
"loss": 0.6572,
"step": 790
},
{
"epoch": 0.11614401858304298,
"grad_norm": 2.152299642562866,
"learning_rate": 4.960439125125571e-05,
"loss": 0.9154,
"step": 800
},
{
"epoch": 0.11759581881533102,
"grad_norm": 0.5455219745635986,
"learning_rate": 4.959421011929716e-05,
"loss": 0.6822,
"step": 810
},
{
"epoch": 0.11904761904761904,
"grad_norm": 2.061328172683716,
"learning_rate": 4.9583900710543344e-05,
"loss": 0.6367,
"step": 820
},
{
"epoch": 0.12049941927990708,
"grad_norm": 0.8445961475372314,
"learning_rate": 4.957346307876537e-05,
"loss": 0.7663,
"step": 830
},
{
"epoch": 0.12195121951219512,
"grad_norm": 1.552341341972351,
"learning_rate": 4.956289727840313e-05,
"loss": 0.8241,
"step": 840
},
{
"epoch": 0.12340301974448316,
"grad_norm": 1.2934677600860596,
"learning_rate": 4.9552203364565e-05,
"loss": 0.8214,
"step": 850
},
{
"epoch": 0.1248548199767712,
"grad_norm": 1.5896968841552734,
"learning_rate": 4.9541381393027564e-05,
"loss": 0.8306,
"step": 860
},
{
"epoch": 0.12630662020905922,
"grad_norm": 1.5255497694015503,
"learning_rate": 4.953043142023531e-05,
"loss": 1.0784,
"step": 870
},
{
"epoch": 0.12775842044134728,
"grad_norm": 1.1418253183364868,
"learning_rate": 4.951935350330037e-05,
"loss": 0.7049,
"step": 880
},
{
"epoch": 0.1292102206736353,
"grad_norm": 1.0762348175048828,
"learning_rate": 4.950814770000217e-05,
"loss": 0.7348,
"step": 890
},
{
"epoch": 0.13066202090592335,
"grad_norm": 2.7543962001800537,
"learning_rate": 4.949681406878718e-05,
"loss": 0.5945,
"step": 900
},
{
"epoch": 0.13211382113821138,
"grad_norm": 0.9333184361457825,
"learning_rate": 4.948535266876857e-05,
"loss": 0.5863,
"step": 910
},
{
"epoch": 0.13356562137049943,
"grad_norm": 0.7856501340866089,
"learning_rate": 4.947376355972593e-05,
"loss": 0.8137,
"step": 920
},
{
"epoch": 0.13501742160278746,
"grad_norm": 2.296924352645874,
"learning_rate": 4.9462046802104945e-05,
"loss": 0.6391,
"step": 930
},
{
"epoch": 0.13646922183507548,
"grad_norm": 1.829867959022522,
"learning_rate": 4.9450202457017055e-05,
"loss": 0.763,
"step": 940
},
{
"epoch": 0.13792102206736354,
"grad_norm": 0.624662458896637,
"learning_rate": 4.9438230586239207e-05,
"loss": 0.789,
"step": 950
},
{
"epoch": 0.13937282229965156,
"grad_norm": 1.0507394075393677,
"learning_rate": 4.942613125221346e-05,
"loss": 0.7278,
"step": 960
},
{
"epoch": 0.14082462253193961,
"grad_norm": 0.9353327751159668,
"learning_rate": 4.9413904518046674e-05,
"loss": 0.6717,
"step": 970
},
{
"epoch": 0.14227642276422764,
"grad_norm": 4.606626510620117,
"learning_rate": 4.9401550447510235e-05,
"loss": 0.6505,
"step": 980
},
{
"epoch": 0.1437282229965157,
"grad_norm": 0.5310667753219604,
"learning_rate": 4.9389069105039634e-05,
"loss": 0.6163,
"step": 990
},
{
"epoch": 0.14518002322880372,
"grad_norm": 1.1119409799575806,
"learning_rate": 4.9376460555734225e-05,
"loss": 0.6708,
"step": 1000
},
{
"epoch": 0.14663182346109174,
"grad_norm": 0.934678852558136,
"learning_rate": 4.936372486535679e-05,
"loss": 0.63,
"step": 1010
},
{
"epoch": 0.1480836236933798,
"grad_norm": 0.9781250357627869,
"learning_rate": 4.9350862100333294e-05,
"loss": 0.7353,
"step": 1020
},
{
"epoch": 0.14953542392566782,
"grad_norm": 0.732448935508728,
"learning_rate": 4.9337872327752444e-05,
"loss": 0.6336,
"step": 1030
},
{
"epoch": 0.15098722415795587,
"grad_norm": 0.9106850624084473,
"learning_rate": 4.932475561536542e-05,
"loss": 0.5646,
"step": 1040
},
{
"epoch": 0.1524390243902439,
"grad_norm": 3.07547926902771,
"learning_rate": 4.931151203158547e-05,
"loss": 0.5629,
"step": 1050
},
{
"epoch": 0.15389082462253195,
"grad_norm": 2.424933433532715,
"learning_rate": 4.929814164548756e-05,
"loss": 0.9348,
"step": 1060
},
{
"epoch": 0.15534262485481998,
"grad_norm": 0.5797663927078247,
"learning_rate": 4.928464452680804e-05,
"loss": 0.7293,
"step": 1070
},
{
"epoch": 0.156794425087108,
"grad_norm": 4.099000453948975,
"learning_rate": 4.9271020745944265e-05,
"loss": 0.3943,
"step": 1080
},
{
"epoch": 0.15824622531939606,
"grad_norm": 2.4443376064300537,
"learning_rate": 4.92572703739542e-05,
"loss": 0.503,
"step": 1090
},
{
"epoch": 0.15969802555168408,
"grad_norm": 1.4358808994293213,
"learning_rate": 4.924339348255611e-05,
"loss": 0.7181,
"step": 1100
},
{
"epoch": 0.16114982578397213,
"grad_norm": 1.4664112329483032,
"learning_rate": 4.922939014412812e-05,
"loss": 0.7096,
"step": 1110
},
{
"epoch": 0.16260162601626016,
"grad_norm": 1.605031967163086,
"learning_rate": 4.9215260431707885e-05,
"loss": 0.7917,
"step": 1120
},
{
"epoch": 0.1640534262485482,
"grad_norm": 2.435290813446045,
"learning_rate": 4.92010044189922e-05,
"loss": 0.7983,
"step": 1130
},
{
"epoch": 0.16550522648083624,
"grad_norm": 3.4949209690093994,
"learning_rate": 4.9186622180336595e-05,
"loss": 0.811,
"step": 1140
},
{
"epoch": 0.16695702671312426,
"grad_norm": 0.8932238221168518,
"learning_rate": 4.917211379075496e-05,
"loss": 0.5875,
"step": 1150
},
{
"epoch": 0.16840882694541232,
"grad_norm": 1.7695764303207397,
"learning_rate": 4.9157479325919156e-05,
"loss": 0.8934,
"step": 1160
},
{
"epoch": 0.16986062717770034,
"grad_norm": 5.012516975402832,
"learning_rate": 4.9142718862158634e-05,
"loss": 0.6394,
"step": 1170
},
{
"epoch": 0.1713124274099884,
"grad_norm": 1.3308697938919067,
"learning_rate": 4.912783247646e-05,
"loss": 0.5884,
"step": 1180
},
{
"epoch": 0.17276422764227642,
"grad_norm": 0.9371745586395264,
"learning_rate": 4.911282024646664e-05,
"loss": 0.8007,
"step": 1190
},
{
"epoch": 0.17421602787456447,
"grad_norm": 3.8555784225463867,
"learning_rate": 4.909768225047833e-05,
"loss": 0.632,
"step": 1200
},
{
"epoch": 0.1756678281068525,
"grad_norm": 3.392313003540039,
"learning_rate": 4.908241856745077e-05,
"loss": 0.8346,
"step": 1210
},
{
"epoch": 0.17711962833914052,
"grad_norm": 1.3405215740203857,
"learning_rate": 4.906702927699525e-05,
"loss": 0.7455,
"step": 1220
},
{
"epoch": 0.17857142857142858,
"grad_norm": 1.5076225996017456,
"learning_rate": 4.905151445937817e-05,
"loss": 0.8539,
"step": 1230
},
{
"epoch": 0.1800232288037166,
"grad_norm": 1.722262978553772,
"learning_rate": 4.903587419552065e-05,
"loss": 0.641,
"step": 1240
},
{
"epoch": 0.18147502903600465,
"grad_norm": 0.5747788548469543,
"learning_rate": 4.902010856699811e-05,
"loss": 0.669,
"step": 1250
},
{
"epoch": 0.18292682926829268,
"grad_norm": 3.8853611946105957,
"learning_rate": 4.900421765603983e-05,
"loss": 0.598,
"step": 1260
},
{
"epoch": 0.18437862950058073,
"grad_norm": 0.7678889036178589,
"learning_rate": 4.8988201545528536e-05,
"loss": 0.57,
"step": 1270
},
{
"epoch": 0.18583042973286876,
"grad_norm": 0.890204906463623,
"learning_rate": 4.897206031899997e-05,
"loss": 0.52,
"step": 1280
},
{
"epoch": 0.18728222996515678,
"grad_norm": 1.0817334651947021,
"learning_rate": 4.8955794060642416e-05,
"loss": 0.5813,
"step": 1290
},
{
"epoch": 0.18873403019744484,
"grad_norm": 0.7758299112319946,
"learning_rate": 4.893940285529631e-05,
"loss": 0.8182,
"step": 1300
},
{
"epoch": 0.19018583042973286,
"grad_norm": 1.0594083070755005,
"learning_rate": 4.8922886788453796e-05,
"loss": 0.6143,
"step": 1310
},
{
"epoch": 0.1916376306620209,
"grad_norm": 0.9304606914520264,
"learning_rate": 4.8906245946258235e-05,
"loss": 0.7401,
"step": 1320
},
{
"epoch": 0.19308943089430894,
"grad_norm": 2.11362361907959,
"learning_rate": 4.8889480415503785e-05,
"loss": 0.4487,
"step": 1330
},
{
"epoch": 0.194541231126597,
"grad_norm": 2.9040818214416504,
"learning_rate": 4.8872590283634955e-05,
"loss": 0.7218,
"step": 1340
},
{
"epoch": 0.19599303135888502,
"grad_norm": 0.6021516919136047,
"learning_rate": 4.8855575638746135e-05,
"loss": 0.7179,
"step": 1350
},
{
"epoch": 0.19744483159117304,
"grad_norm": 3.067187786102295,
"learning_rate": 4.883843656958115e-05,
"loss": 0.9561,
"step": 1360
},
{
"epoch": 0.1988966318234611,
"grad_norm": 4.093753337860107,
"learning_rate": 4.882117316553278e-05,
"loss": 0.8025,
"step": 1370
},
{
"epoch": 0.20034843205574912,
"grad_norm": 1.0853984355926514,
"learning_rate": 4.88037855166423e-05,
"loss": 0.7298,
"step": 1380
},
{
"epoch": 0.20180023228803717,
"grad_norm": 1.4068083763122559,
"learning_rate": 4.878627371359902e-05,
"loss": 0.5038,
"step": 1390
},
{
"epoch": 0.2032520325203252,
"grad_norm": 1.063698649406433,
"learning_rate": 4.876863784773981e-05,
"loss": 0.8824,
"step": 1400
},
{
"epoch": 0.20470383275261325,
"grad_norm": 1.4493242502212524,
"learning_rate": 4.875087801104859e-05,
"loss": 0.8179,
"step": 1410
},
{
"epoch": 0.20615563298490128,
"grad_norm": 1.8046404123306274,
"learning_rate": 4.8732994296155915e-05,
"loss": 0.7289,
"step": 1420
},
{
"epoch": 0.2076074332171893,
"grad_norm": 1.531055212020874,
"learning_rate": 4.871498679633844e-05,
"loss": 0.9306,
"step": 1430
},
{
"epoch": 0.20905923344947736,
"grad_norm": 1.2926791906356812,
"learning_rate": 4.869685560551844e-05,
"loss": 0.7812,
"step": 1440
},
{
"epoch": 0.21051103368176538,
"grad_norm": 2.004673957824707,
"learning_rate": 4.867860081826334e-05,
"loss": 0.6344,
"step": 1450
},
{
"epoch": 0.21196283391405343,
"grad_norm": 0.8372285962104797,
"learning_rate": 4.866022252978521e-05,
"loss": 0.9279,
"step": 1460
},
{
"epoch": 0.21341463414634146,
"grad_norm": 3.9492061138153076,
"learning_rate": 4.8641720835940265e-05,
"loss": 0.6554,
"step": 1470
},
{
"epoch": 0.2148664343786295,
"grad_norm": 1.1838141679763794,
"learning_rate": 4.862309583322837e-05,
"loss": 0.35,
"step": 1480
},
{
"epoch": 0.21631823461091754,
"grad_norm": 0.5205928683280945,
"learning_rate": 4.860434761879255e-05,
"loss": 0.8758,
"step": 1490
},
{
"epoch": 0.21777003484320556,
"grad_norm": 1.2075397968292236,
"learning_rate": 4.858547629041844e-05,
"loss": 0.8463,
"step": 1500
},
{
"epoch": 0.21922183507549362,
"grad_norm": 0.9651175141334534,
"learning_rate": 4.8566481946533824e-05,
"loss": 0.5918,
"step": 1510
},
{
"epoch": 0.22067363530778164,
"grad_norm": 1.0648430585861206,
"learning_rate": 4.8547364686208106e-05,
"loss": 0.7321,
"step": 1520
},
{
"epoch": 0.2221254355400697,
"grad_norm": 1.3580704927444458,
"learning_rate": 4.852812460915178e-05,
"loss": 0.8827,
"step": 1530
},
{
"epoch": 0.22357723577235772,
"grad_norm": 1.9950529336929321,
"learning_rate": 4.850876181571592e-05,
"loss": 0.8698,
"step": 1540
},
{
"epoch": 0.22502903600464577,
"grad_norm": 0.6319971680641174,
"learning_rate": 4.848927640689165e-05,
"loss": 0.8824,
"step": 1550
},
{
"epoch": 0.2264808362369338,
"grad_norm": 0.40468019247055054,
"learning_rate": 4.846966848430964e-05,
"loss": 0.454,
"step": 1560
},
{
"epoch": 0.22793263646922182,
"grad_norm": 2.219438076019287,
"learning_rate": 4.8449938150239544e-05,
"loss": 0.7014,
"step": 1570
},
{
"epoch": 0.22938443670150988,
"grad_norm": 0.6382218599319458,
"learning_rate": 4.843008550758948e-05,
"loss": 0.7618,
"step": 1580
},
{
"epoch": 0.2308362369337979,
"grad_norm": 1.5169848203659058,
"learning_rate": 4.8410110659905514e-05,
"loss": 0.9599,
"step": 1590
},
{
"epoch": 0.23228803716608595,
"grad_norm": 1.203534483909607,
"learning_rate": 4.8390013711371085e-05,
"loss": 0.4722,
"step": 1600
},
{
"epoch": 0.23373983739837398,
"grad_norm": 1.456782341003418,
"learning_rate": 4.836979476680647e-05,
"loss": 0.8534,
"step": 1610
},
{
"epoch": 0.23519163763066203,
"grad_norm": 0.9215080142021179,
"learning_rate": 4.834945393166826e-05,
"loss": 0.8088,
"step": 1620
},
{
"epoch": 0.23664343786295006,
"grad_norm": 0.7815489768981934,
"learning_rate": 4.832899131204879e-05,
"loss": 0.8544,
"step": 1630
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.947912871837616,
"learning_rate": 4.8308407014675577e-05,
"loss": 0.6289,
"step": 1640
},
{
"epoch": 0.23954703832752614,
"grad_norm": 0.6381635665893555,
"learning_rate": 4.82877011469108e-05,
"loss": 0.7655,
"step": 1650
},
{
"epoch": 0.24099883855981416,
"grad_norm": 1.064013957977295,
"learning_rate": 4.8266873816750716e-05,
"loss": 0.5693,
"step": 1660
},
{
"epoch": 0.2424506387921022,
"grad_norm": 2.0902225971221924,
"learning_rate": 4.824592513282505e-05,
"loss": 0.8012,
"step": 1670
},
{
"epoch": 0.24390243902439024,
"grad_norm": 6.462097644805908,
"learning_rate": 4.8224855204396555e-05,
"loss": 0.628,
"step": 1680
},
{
"epoch": 0.2453542392566783,
"grad_norm": 1.036537766456604,
"learning_rate": 4.820366414136028e-05,
"loss": 0.7784,
"step": 1690
},
{
"epoch": 0.24680603948896632,
"grad_norm": 1.232399582862854,
"learning_rate": 4.818235205424315e-05,
"loss": 0.7538,
"step": 1700
},
{
"epoch": 0.24825783972125434,
"grad_norm": 1.105141282081604,
"learning_rate": 4.816091905420327e-05,
"loss": 0.9042,
"step": 1710
},
{
"epoch": 0.2497096399535424,
"grad_norm": 0.6853220462799072,
"learning_rate": 4.813936525302942e-05,
"loss": 0.5936,
"step": 1720
},
{
"epoch": 0.2511614401858304,
"grad_norm": 0.4660559892654419,
"learning_rate": 4.811769076314044e-05,
"loss": 0.7323,
"step": 1730
},
{
"epoch": 0.25261324041811845,
"grad_norm": 1.0349425077438354,
"learning_rate": 4.809589569758464e-05,
"loss": 0.5865,
"step": 1740
},
{
"epoch": 0.2540650406504065,
"grad_norm": 0.4405325651168823,
"learning_rate": 4.8073980170039234e-05,
"loss": 0.7297,
"step": 1750
},
{
"epoch": 0.25551684088269455,
"grad_norm": 1.4432979822158813,
"learning_rate": 4.805194429480972e-05,
"loss": 0.6268,
"step": 1760
},
{
"epoch": 0.2569686411149826,
"grad_norm": 0.7807000279426575,
"learning_rate": 4.802978818682933e-05,
"loss": 0.7536,
"step": 1770
},
{
"epoch": 0.2584204413472706,
"grad_norm": 1.3717634677886963,
"learning_rate": 4.800751196165835e-05,
"loss": 0.908,
"step": 1780
},
{
"epoch": 0.2598722415795586,
"grad_norm": 1.9359996318817139,
"learning_rate": 4.79851157354836e-05,
"loss": 0.4698,
"step": 1790
},
{
"epoch": 0.2613240418118467,
"grad_norm": 2.113598346710205,
"learning_rate": 4.7962599625117773e-05,
"loss": 0.6629,
"step": 1800
},
{
"epoch": 0.26277584204413473,
"grad_norm": 0.7605477571487427,
"learning_rate": 4.7939963747998855e-05,
"loss": 0.727,
"step": 1810
},
{
"epoch": 0.26422764227642276,
"grad_norm": 0.6016331315040588,
"learning_rate": 4.7917208222189506e-05,
"loss": 0.8574,
"step": 1820
},
{
"epoch": 0.2656794425087108,
"grad_norm": 0.8621135950088501,
"learning_rate": 4.789433316637644e-05,
"loss": 0.7995,
"step": 1830
},
{
"epoch": 0.26713124274099886,
"grad_norm": 1.2249228954315186,
"learning_rate": 4.7871338699869796e-05,
"loss": 0.9538,
"step": 1840
},
{
"epoch": 0.2685830429732869,
"grad_norm": 3.5839085578918457,
"learning_rate": 4.784822494260255e-05,
"loss": 0.602,
"step": 1850
},
{
"epoch": 0.2700348432055749,
"grad_norm": 1.334702491760254,
"learning_rate": 4.782499201512983e-05,
"loss": 0.702,
"step": 1860
},
{
"epoch": 0.27148664343786294,
"grad_norm": 0.8643277287483215,
"learning_rate": 4.780164003862838e-05,
"loss": 0.7837,
"step": 1870
},
{
"epoch": 0.27293844367015097,
"grad_norm": 0.9091192483901978,
"learning_rate": 4.777816913489581e-05,
"loss": 0.658,
"step": 1880
},
{
"epoch": 0.27439024390243905,
"grad_norm": 4.237992763519287,
"learning_rate": 4.775457942635006e-05,
"loss": 0.7956,
"step": 1890
},
{
"epoch": 0.27584204413472707,
"grad_norm": 0.5401553511619568,
"learning_rate": 4.773087103602871e-05,
"loss": 0.6637,
"step": 1900
},
{
"epoch": 0.2772938443670151,
"grad_norm": 2.7873334884643555,
"learning_rate": 4.770704408758837e-05,
"loss": 0.4589,
"step": 1910
},
{
"epoch": 0.2787456445993031,
"grad_norm": 0.6120592355728149,
"learning_rate": 4.7683098705304e-05,
"loss": 0.6523,
"step": 1920
},
{
"epoch": 0.28019744483159115,
"grad_norm": 0.4932442307472229,
"learning_rate": 4.765903501406826e-05,
"loss": 0.7068,
"step": 1930
},
{
"epoch": 0.28164924506387923,
"grad_norm": 1.102984070777893,
"learning_rate": 4.7634853139390945e-05,
"loss": 0.7414,
"step": 1940
},
{
"epoch": 0.28310104529616725,
"grad_norm": 0.7468515634536743,
"learning_rate": 4.7610553207398185e-05,
"loss": 0.8069,
"step": 1950
},
{
"epoch": 0.2845528455284553,
"grad_norm": 1.3317950963974,
"learning_rate": 4.758613534483191e-05,
"loss": 0.8219,
"step": 1960
},
{
"epoch": 0.2860046457607433,
"grad_norm": 1.7681723833084106,
"learning_rate": 4.7561599679049135e-05,
"loss": 0.5898,
"step": 1970
},
{
"epoch": 0.2874564459930314,
"grad_norm": 1.0765740871429443,
"learning_rate": 4.7536946338021306e-05,
"loss": 0.552,
"step": 1980
},
{
"epoch": 0.2889082462253194,
"grad_norm": 1.1886732578277588,
"learning_rate": 4.751217545033362e-05,
"loss": 0.5558,
"step": 1990
},
{
"epoch": 0.29036004645760743,
"grad_norm": 1.0681451559066772,
"learning_rate": 4.748728714518438e-05,
"loss": 0.6335,
"step": 2000
},
{
"epoch": 0.29181184668989546,
"grad_norm": 0.8771520256996155,
"learning_rate": 4.7462281552384306e-05,
"loss": 0.6354,
"step": 2010
},
{
"epoch": 0.2932636469221835,
"grad_norm": 1.085581660270691,
"learning_rate": 4.7437158802355854e-05,
"loss": 0.4697,
"step": 2020
},
{
"epoch": 0.29471544715447157,
"grad_norm": 1.2349504232406616,
"learning_rate": 4.7411919026132536e-05,
"loss": 0.5823,
"step": 2030
},
{
"epoch": 0.2961672473867596,
"grad_norm": 0.8741536736488342,
"learning_rate": 4.7386562355358254e-05,
"loss": 0.7622,
"step": 2040
},
{
"epoch": 0.2976190476190476,
"grad_norm": 3.957540273666382,
"learning_rate": 4.736108892228658e-05,
"loss": 0.696,
"step": 2050
},
{
"epoch": 0.29907084785133564,
"grad_norm": 1.2028242349624634,
"learning_rate": 4.733549885978012e-05,
"loss": 0.5248,
"step": 2060
},
{
"epoch": 0.30052264808362367,
"grad_norm": 2.623757839202881,
"learning_rate": 4.7309792301309755e-05,
"loss": 0.7899,
"step": 2070
},
{
"epoch": 0.30197444831591175,
"grad_norm": 0.8219063878059387,
"learning_rate": 4.728396938095399e-05,
"loss": 0.8088,
"step": 2080
},
{
"epoch": 0.3034262485481998,
"grad_norm": 2.02731990814209,
"learning_rate": 4.7258030233398244e-05,
"loss": 0.7673,
"step": 2090
},
{
"epoch": 0.3048780487804878,
"grad_norm": 1.400942087173462,
"learning_rate": 4.723197499393415e-05,
"loss": 0.648,
"step": 2100
},
{
"epoch": 0.3063298490127758,
"grad_norm": 2.6127829551696777,
"learning_rate": 4.7205803798458836e-05,
"loss": 0.7408,
"step": 2110
},
{
"epoch": 0.3077816492450639,
"grad_norm": 2.252988338470459,
"learning_rate": 4.7179516783474226e-05,
"loss": 0.7625,
"step": 2120
},
{
"epoch": 0.30923344947735193,
"grad_norm": 1.4618316888809204,
"learning_rate": 4.7153114086086336e-05,
"loss": 0.9155,
"step": 2130
},
{
"epoch": 0.31068524970963995,
"grad_norm": 0.945075511932373,
"learning_rate": 4.712659584400454e-05,
"loss": 0.8939,
"step": 2140
},
{
"epoch": 0.312137049941928,
"grad_norm": 1.9799119234085083,
"learning_rate": 4.709996219554088e-05,
"loss": 0.7928,
"step": 2150
},
{
"epoch": 0.313588850174216,
"grad_norm": 3.0045998096466064,
"learning_rate": 4.7073213279609293e-05,
"loss": 0.7881,
"step": 2160
},
{
"epoch": 0.3150406504065041,
"grad_norm": 1.4035004377365112,
"learning_rate": 4.7046349235724964e-05,
"loss": 0.8062,
"step": 2170
},
{
"epoch": 0.3164924506387921,
"grad_norm": 1.9164339303970337,
"learning_rate": 4.701937020400352e-05,
"loss": 0.7617,
"step": 2180
},
{
"epoch": 0.31794425087108014,
"grad_norm": 1.0605820417404175,
"learning_rate": 4.699227632516034e-05,
"loss": 0.7231,
"step": 2190
},
{
"epoch": 0.31939605110336816,
"grad_norm": 0.9426791071891785,
"learning_rate": 4.6965067740509825e-05,
"loss": 0.6771,
"step": 2200
},
{
"epoch": 0.3208478513356562,
"grad_norm": 1.0823321342468262,
"learning_rate": 4.693774459196465e-05,
"loss": 0.8387,
"step": 2210
},
{
"epoch": 0.32229965156794427,
"grad_norm": 1.703384518623352,
"learning_rate": 4.691030702203502e-05,
"loss": 0.4302,
"step": 2220
},
{
"epoch": 0.3237514518002323,
"grad_norm": 1.2216838598251343,
"learning_rate": 4.6882755173827933e-05,
"loss": 0.5434,
"step": 2230
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.8944600820541382,
"learning_rate": 4.6855089191046406e-05,
"loss": 0.7718,
"step": 2240
},
{
"epoch": 0.32665505226480834,
"grad_norm": 1.4830057621002197,
"learning_rate": 4.682730921798881e-05,
"loss": 0.7067,
"step": 2250
},
{
"epoch": 0.3281068524970964,
"grad_norm": 1.1373881101608276,
"learning_rate": 4.679941539954801e-05,
"loss": 0.6134,
"step": 2260
},
{
"epoch": 0.32955865272938445,
"grad_norm": 3.311741352081299,
"learning_rate": 4.677140788121067e-05,
"loss": 0.5914,
"step": 2270
},
{
"epoch": 0.3310104529616725,
"grad_norm": 1.3183683156967163,
"learning_rate": 4.674328680905649e-05,
"loss": 0.6412,
"step": 2280
},
{
"epoch": 0.3324622531939605,
"grad_norm": 0.6239253282546997,
"learning_rate": 4.671505232975741e-05,
"loss": 0.8585,
"step": 2290
},
{
"epoch": 0.3339140534262485,
"grad_norm": 0.6019532680511475,
"learning_rate": 4.668670459057692e-05,
"loss": 0.6322,
"step": 2300
},
{
"epoch": 0.3353658536585366,
"grad_norm": 1.3409720659255981,
"learning_rate": 4.665824373936921e-05,
"loss": 0.8676,
"step": 2310
},
{
"epoch": 0.33681765389082463,
"grad_norm": 1.3901034593582153,
"learning_rate": 4.662966992457842e-05,
"loss": 0.6381,
"step": 2320
},
{
"epoch": 0.33826945412311266,
"grad_norm": 0.4752490818500519,
"learning_rate": 4.660098329523791e-05,
"loss": 0.7852,
"step": 2330
},
{
"epoch": 0.3397212543554007,
"grad_norm": 0.8826183676719666,
"learning_rate": 4.657218400096942e-05,
"loss": 0.7941,
"step": 2340
},
{
"epoch": 0.3411730545876887,
"grad_norm": 1.8894481658935547,
"learning_rate": 4.654327219198235e-05,
"loss": 0.554,
"step": 2350
},
{
"epoch": 0.3426248548199768,
"grad_norm": 4.281989097595215,
"learning_rate": 4.6514248019072926e-05,
"loss": 0.6456,
"step": 2360
},
{
"epoch": 0.3440766550522648,
"grad_norm": 1.1848098039627075,
"learning_rate": 4.648511163362343e-05,
"loss": 0.8237,
"step": 2370
},
{
"epoch": 0.34552845528455284,
"grad_norm": 1.174756646156311,
"learning_rate": 4.645586318760145e-05,
"loss": 0.709,
"step": 2380
},
{
"epoch": 0.34698025551684086,
"grad_norm": 2.8332509994506836,
"learning_rate": 4.6426502833559e-05,
"loss": 0.6055,
"step": 2390
},
{
"epoch": 0.34843205574912894,
"grad_norm": 0.6192472577095032,
"learning_rate": 4.639703072463181e-05,
"loss": 0.8328,
"step": 2400
},
{
"epoch": 0.34988385598141697,
"grad_norm": 0.6660485863685608,
"learning_rate": 4.636744701453849e-05,
"loss": 0.92,
"step": 2410
},
{
"epoch": 0.351335656213705,
"grad_norm": 1.6284211874008179,
"learning_rate": 4.633775185757973e-05,
"loss": 0.7252,
"step": 2420
},
{
"epoch": 0.352787456445993,
"grad_norm": 0.7274760007858276,
"learning_rate": 4.630794540863747e-05,
"loss": 0.6107,
"step": 2430
},
{
"epoch": 0.35423925667828104,
"grad_norm": 2.6577463150024414,
"learning_rate": 4.627802782317417e-05,
"loss": 0.647,
"step": 2440
},
{
"epoch": 0.3556910569105691,
"grad_norm": 1.4532408714294434,
"learning_rate": 4.624799925723191e-05,
"loss": 0.435,
"step": 2450
},
{
"epoch": 0.35714285714285715,
"grad_norm": 2.7971816062927246,
"learning_rate": 4.621785986743163e-05,
"loss": 0.5866,
"step": 2460
},
{
"epoch": 0.3585946573751452,
"grad_norm": 1.4571512937545776,
"learning_rate": 4.61876098109723e-05,
"loss": 0.7796,
"step": 2470
},
{
"epoch": 0.3600464576074332,
"grad_norm": 2.3864150047302246,
"learning_rate": 4.6157249245630075e-05,
"loss": 0.9921,
"step": 2480
},
{
"epoch": 0.3614982578397213,
"grad_norm": 2.915992021560669,
"learning_rate": 4.6126778329757516e-05,
"loss": 0.7665,
"step": 2490
},
{
"epoch": 0.3629500580720093,
"grad_norm": 2.1495201587677,
"learning_rate": 4.609619722228274e-05,
"loss": 0.6569,
"step": 2500
},
{
"epoch": 0.36440185830429733,
"grad_norm": 2.9136157035827637,
"learning_rate": 4.606550608270859e-05,
"loss": 0.835,
"step": 2510
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.8638590574264526,
"learning_rate": 4.603470507111182e-05,
"loss": 0.7063,
"step": 2520
},
{
"epoch": 0.3673054587688734,
"grad_norm": 2.173835277557373,
"learning_rate": 4.600379434814221e-05,
"loss": 0.761,
"step": 2530
},
{
"epoch": 0.36875725900116146,
"grad_norm": 2.0101635456085205,
"learning_rate": 4.597277407502181e-05,
"loss": 0.5618,
"step": 2540
},
{
"epoch": 0.3702090592334495,
"grad_norm": 1.1493425369262695,
"learning_rate": 4.5941644413544024e-05,
"loss": 0.671,
"step": 2550
},
{
"epoch": 0.3716608594657375,
"grad_norm": 1.129114031791687,
"learning_rate": 4.591040552607281e-05,
"loss": 0.601,
"step": 2560
},
{
"epoch": 0.37311265969802554,
"grad_norm": 2.0701091289520264,
"learning_rate": 4.587905757554182e-05,
"loss": 0.8573,
"step": 2570
},
{
"epoch": 0.37456445993031356,
"grad_norm": 1.2713189125061035,
"learning_rate": 4.5847600725453536e-05,
"loss": 0.6449,
"step": 2580
},
{
"epoch": 0.37601626016260165,
"grad_norm": 1.8538284301757812,
"learning_rate": 4.581603513987845e-05,
"loss": 0.6038,
"step": 2590
},
{
"epoch": 0.37746806039488967,
"grad_norm": 1.350251317024231,
"learning_rate": 4.5784360983454175e-05,
"loss": 0.5973,
"step": 2600
},
{
"epoch": 0.3789198606271777,
"grad_norm": 0.7953972220420837,
"learning_rate": 4.5752578421384606e-05,
"loss": 0.9078,
"step": 2610
},
{
"epoch": 0.3803716608594657,
"grad_norm": 0.8986756205558777,
"learning_rate": 4.572068761943905e-05,
"loss": 0.6951,
"step": 2620
},
{
"epoch": 0.3818234610917538,
"grad_norm": 0.920846700668335,
"learning_rate": 4.568868874395137e-05,
"loss": 0.4939,
"step": 2630
},
{
"epoch": 0.3832752613240418,
"grad_norm": 1.8228408098220825,
"learning_rate": 4.565658196181909e-05,
"loss": 0.8694,
"step": 2640
},
{
"epoch": 0.38472706155632985,
"grad_norm": 1.1996351480484009,
"learning_rate": 4.5624367440502594e-05,
"loss": 0.6528,
"step": 2650
},
{
"epoch": 0.3861788617886179,
"grad_norm": 1.8428452014923096,
"learning_rate": 4.559204534802415e-05,
"loss": 0.6755,
"step": 2660
},
{
"epoch": 0.3876306620209059,
"grad_norm": 1.1987791061401367,
"learning_rate": 4.555961585296712e-05,
"loss": 0.5469,
"step": 2670
},
{
"epoch": 0.389082462253194,
"grad_norm": 1.405840277671814,
"learning_rate": 4.5527079124475045e-05,
"loss": 0.7443,
"step": 2680
},
{
"epoch": 0.390534262485482,
"grad_norm": 1.371089220046997,
"learning_rate": 4.549443533225075e-05,
"loss": 0.7145,
"step": 2690
},
{
"epoch": 0.39198606271777003,
"grad_norm": 1.3392704725265503,
"learning_rate": 4.546168464655551e-05,
"loss": 0.6241,
"step": 2700
},
{
"epoch": 0.39343786295005806,
"grad_norm": 1.8694888353347778,
"learning_rate": 4.542882723820809e-05,
"loss": 0.7412,
"step": 2710
},
{
"epoch": 0.3948896631823461,
"grad_norm": 0.8335723876953125,
"learning_rate": 4.5395863278583914e-05,
"loss": 0.5457,
"step": 2720
},
{
"epoch": 0.39634146341463417,
"grad_norm": 1.200954556465149,
"learning_rate": 4.5362792939614126e-05,
"loss": 0.8856,
"step": 2730
},
{
"epoch": 0.3977932636469222,
"grad_norm": 0.7490825057029724,
"learning_rate": 4.532961639378477e-05,
"loss": 0.7058,
"step": 2740
},
{
"epoch": 0.3992450638792102,
"grad_norm": 0.5736889839172363,
"learning_rate": 4.529633381413577e-05,
"loss": 0.8461,
"step": 2750
},
{
"epoch": 0.40069686411149824,
"grad_norm": 3.038465976715088,
"learning_rate": 4.526294537426013e-05,
"loss": 0.9319,
"step": 2760
},
{
"epoch": 0.4021486643437863,
"grad_norm": 3.3678839206695557,
"learning_rate": 4.5229451248302996e-05,
"loss": 0.7878,
"step": 2770
},
{
"epoch": 0.40360046457607435,
"grad_norm": 0.9918755292892456,
"learning_rate": 4.5195851610960716e-05,
"loss": 0.5738,
"step": 2780
},
{
"epoch": 0.40505226480836237,
"grad_norm": 0.45315515995025635,
"learning_rate": 4.516214663747999e-05,
"loss": 0.8513,
"step": 2790
},
{
"epoch": 0.4065040650406504,
"grad_norm": 3.0047781467437744,
"learning_rate": 4.512833650365691e-05,
"loss": 0.494,
"step": 2800
},
{
"epoch": 0.4079558652729384,
"grad_norm": 1.6291121244430542,
"learning_rate": 4.509442138583604e-05,
"loss": 0.4759,
"step": 2810
},
{
"epoch": 0.4094076655052265,
"grad_norm": 1.279628038406372,
"learning_rate": 4.506040146090953e-05,
"loss": 0.75,
"step": 2820
},
{
"epoch": 0.41085946573751453,
"grad_norm": 0.6952537894248962,
"learning_rate": 4.502627690631618e-05,
"loss": 0.6722,
"step": 2830
},
{
"epoch": 0.41231126596980255,
"grad_norm": 6.771650791168213,
"learning_rate": 4.499204790004051e-05,
"loss": 0.6538,
"step": 2840
},
{
"epoch": 0.4137630662020906,
"grad_norm": 1.1350947618484497,
"learning_rate": 4.49577146206118e-05,
"loss": 0.651,
"step": 2850
},
{
"epoch": 0.4152148664343786,
"grad_norm": 1.379130482673645,
"learning_rate": 4.492327724710324e-05,
"loss": 0.8259,
"step": 2860
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.9285208582878113,
"learning_rate": 4.488873595913091e-05,
"loss": 0.5317,
"step": 2870
},
{
"epoch": 0.4181184668989547,
"grad_norm": 1.3536639213562012,
"learning_rate": 4.485409093685289e-05,
"loss": 0.9471,
"step": 2880
},
{
"epoch": 0.41957026713124274,
"grad_norm": 1.6582531929016113,
"learning_rate": 4.4819342360968316e-05,
"loss": 0.6531,
"step": 2890
},
{
"epoch": 0.42102206736353076,
"grad_norm": 0.5296352505683899,
"learning_rate": 4.478449041271644e-05,
"loss": 0.8268,
"step": 2900
},
{
"epoch": 0.42247386759581884,
"grad_norm": 1.2088879346847534,
"learning_rate": 4.474953527387564e-05,
"loss": 0.9049,
"step": 2910
},
{
"epoch": 0.42392566782810687,
"grad_norm": 0.5331336855888367,
"learning_rate": 4.471447712676256e-05,
"loss": 1.1198,
"step": 2920
},
{
"epoch": 0.4253774680603949,
"grad_norm": 1.4603538513183594,
"learning_rate": 4.4679316154231054e-05,
"loss": 0.5809,
"step": 2930
},
{
"epoch": 0.4268292682926829,
"grad_norm": 0.9502357840538025,
"learning_rate": 4.464405253967133e-05,
"loss": 0.5471,
"step": 2940
},
{
"epoch": 0.42828106852497094,
"grad_norm": 1.7284854650497437,
"learning_rate": 4.4608686467008926e-05,
"loss": 0.6076,
"step": 2950
},
{
"epoch": 0.429732868757259,
"grad_norm": 0.7732632160186768,
"learning_rate": 4.457321812070378e-05,
"loss": 0.8251,
"step": 2960
},
{
"epoch": 0.43118466898954705,
"grad_norm": 1.8970303535461426,
"learning_rate": 4.453764768574926e-05,
"loss": 0.6548,
"step": 2970
},
{
"epoch": 0.4326364692218351,
"grad_norm": 0.50247722864151,
"learning_rate": 4.450197534767121e-05,
"loss": 0.6137,
"step": 2980
},
{
"epoch": 0.4340882694541231,
"grad_norm": 1.2860316038131714,
"learning_rate": 4.4466201292526956e-05,
"loss": 0.5776,
"step": 2990
},
{
"epoch": 0.4355400696864111,
"grad_norm": 1.1598414182662964,
"learning_rate": 4.4430325706904366e-05,
"loss": 0.9806,
"step": 3000
},
{
"epoch": 0.4355400696864111,
"eval_loss": 0.6489894390106201,
"eval_runtime": 107.7493,
"eval_samples_per_second": 13.457,
"eval_steps_per_second": 3.369,
"step": 3000
},
{
"epoch": 0.4369918699186992,
"grad_norm": 0.9044310450553894,
"learning_rate": 4.439434877792086e-05,
"loss": 0.666,
"step": 3010
},
{
"epoch": 0.43844367015098723,
"grad_norm": 0.6645646095275879,
"learning_rate": 4.435827069322244e-05,
"loss": 0.5448,
"step": 3020
},
{
"epoch": 0.43989547038327526,
"grad_norm": 4.799647331237793,
"learning_rate": 4.4322091640982705e-05,
"loss": 0.6945,
"step": 3030
},
{
"epoch": 0.4413472706155633,
"grad_norm": 1.6476815938949585,
"learning_rate": 4.428581180990188e-05,
"loss": 0.6551,
"step": 3040
},
{
"epoch": 0.44279907084785136,
"grad_norm": 0.8809843063354492,
"learning_rate": 4.424943138920581e-05,
"loss": 0.9209,
"step": 3050
},
{
"epoch": 0.4442508710801394,
"grad_norm": 2.0106568336486816,
"learning_rate": 4.4212950568645007e-05,
"loss": 0.6188,
"step": 3060
},
{
"epoch": 0.4457026713124274,
"grad_norm": 0.4192439913749695,
"learning_rate": 4.417636953849364e-05,
"loss": 0.7886,
"step": 3070
},
{
"epoch": 0.44715447154471544,
"grad_norm": 1.8189557790756226,
"learning_rate": 4.4139688489548534e-05,
"loss": 1.0636,
"step": 3080
},
{
"epoch": 0.44860627177700346,
"grad_norm": 1.1486669778823853,
"learning_rate": 4.410290761312818e-05,
"loss": 0.7989,
"step": 3090
},
{
"epoch": 0.45005807200929154,
"grad_norm": 0.6434163451194763,
"learning_rate": 4.406602710107177e-05,
"loss": 0.7368,
"step": 3100
},
{
"epoch": 0.45150987224157957,
"grad_norm": 1.370603084564209,
"learning_rate": 4.4029047145738134e-05,
"loss": 0.6113,
"step": 3110
},
{
"epoch": 0.4529616724738676,
"grad_norm": 1.5696393251419067,
"learning_rate": 4.39919679400048e-05,
"loss": 0.6274,
"step": 3120
},
{
"epoch": 0.4544134727061556,
"grad_norm": 21.59466552734375,
"learning_rate": 4.3954789677266936e-05,
"loss": 0.7229,
"step": 3130
},
{
"epoch": 0.45586527293844364,
"grad_norm": 0.975163459777832,
"learning_rate": 4.391751255143639e-05,
"loss": 0.7115,
"step": 3140
},
{
"epoch": 0.4573170731707317,
"grad_norm": 0.6678398251533508,
"learning_rate": 4.3880136756940624e-05,
"loss": 0.6668,
"step": 3150
},
{
"epoch": 0.45876887340301975,
"grad_norm": 0.9730459451675415,
"learning_rate": 4.384266248872176e-05,
"loss": 0.6139,
"step": 3160
},
{
"epoch": 0.4602206736353078,
"grad_norm": 0.7275809049606323,
"learning_rate": 4.380508994223551e-05,
"loss": 0.9358,
"step": 3170
},
{
"epoch": 0.4616724738675958,
"grad_norm": 4.506844520568848,
"learning_rate": 4.376741931345019e-05,
"loss": 0.5481,
"step": 3180
},
{
"epoch": 0.4631242740998839,
"grad_norm": 0.5535733699798584,
"learning_rate": 4.3729650798845676e-05,
"loss": 0.7074,
"step": 3190
},
{
"epoch": 0.4645760743321719,
"grad_norm": 0.7955453991889954,
"learning_rate": 4.36917845954124e-05,
"loss": 0.5912,
"step": 3200
},
{
"epoch": 0.46602787456445993,
"grad_norm": 1.144351601600647,
"learning_rate": 4.365382090065032e-05,
"loss": 0.893,
"step": 3210
},
{
"epoch": 0.46747967479674796,
"grad_norm": 2.5055947303771973,
"learning_rate": 4.3615759912567864e-05,
"loss": 0.7052,
"step": 3220
},
{
"epoch": 0.468931475029036,
"grad_norm": 2.367400884628296,
"learning_rate": 4.3577601829680925e-05,
"loss": 0.5374,
"step": 3230
},
{
"epoch": 0.47038327526132406,
"grad_norm": 2.6038706302642822,
"learning_rate": 4.353934685101181e-05,
"loss": 0.5551,
"step": 3240
},
{
"epoch": 0.4718350754936121,
"grad_norm": 1.4026364088058472,
"learning_rate": 4.350099517608823e-05,
"loss": 0.7855,
"step": 3250
},
{
"epoch": 0.4732868757259001,
"grad_norm": 1.1398979425430298,
"learning_rate": 4.346254700494221e-05,
"loss": 0.6862,
"step": 3260
},
{
"epoch": 0.47473867595818814,
"grad_norm": 0.881351888179779,
"learning_rate": 4.3424002538109096e-05,
"loss": 0.7258,
"step": 3270
},
{
"epoch": 0.47619047619047616,
"grad_norm": 3.777125120162964,
"learning_rate": 4.338536197662646e-05,
"loss": 0.6882,
"step": 3280
},
{
"epoch": 0.47764227642276424,
"grad_norm": 1.4731556177139282,
"learning_rate": 4.3346625522033105e-05,
"loss": 0.8303,
"step": 3290
},
{
"epoch": 0.47909407665505227,
"grad_norm": 1.810880184173584,
"learning_rate": 4.330779337636798e-05,
"loss": 0.7837,
"step": 3300
},
{
"epoch": 0.4805458768873403,
"grad_norm": 1.3891079425811768,
"learning_rate": 4.326886574216911e-05,
"loss": 0.4782,
"step": 3310
},
{
"epoch": 0.4819976771196283,
"grad_norm": 3.7195885181427,
"learning_rate": 4.32298428224726e-05,
"loss": 0.6343,
"step": 3320
},
{
"epoch": 0.4834494773519164,
"grad_norm": 1.837262511253357,
"learning_rate": 4.319072482081151e-05,
"loss": 0.4242,
"step": 3330
},
{
"epoch": 0.4849012775842044,
"grad_norm": 0.9354246854782104,
"learning_rate": 4.315151194121484e-05,
"loss": 0.6616,
"step": 3340
},
{
"epoch": 0.48635307781649245,
"grad_norm": 5.568230152130127,
"learning_rate": 4.3112204388206436e-05,
"loss": 0.5538,
"step": 3350
},
{
"epoch": 0.4878048780487805,
"grad_norm": 1.4984145164489746,
"learning_rate": 4.307280236680393e-05,
"loss": 0.6217,
"step": 3360
},
{
"epoch": 0.4892566782810685,
"grad_norm": 1.278181552886963,
"learning_rate": 4.303330608251769e-05,
"loss": 0.6273,
"step": 3370
},
{
"epoch": 0.4907084785133566,
"grad_norm": 0.48235225677490234,
"learning_rate": 4.2993715741349726e-05,
"loss": 0.5814,
"step": 3380
},
{
"epoch": 0.4921602787456446,
"grad_norm": 0.9399949312210083,
"learning_rate": 4.2954031549792634e-05,
"loss": 0.869,
"step": 3390
},
{
"epoch": 0.49361207897793263,
"grad_norm": 1.9232203960418701,
"learning_rate": 4.291425371482849e-05,
"loss": 0.8627,
"step": 3400
},
{
"epoch": 0.49506387921022066,
"grad_norm": 0.5802033543586731,
"learning_rate": 4.287438244392781e-05,
"loss": 0.8384,
"step": 3410
},
{
"epoch": 0.4965156794425087,
"grad_norm": 0.3144931197166443,
"learning_rate": 4.283441794504842e-05,
"loss": 0.6346,
"step": 3420
},
{
"epoch": 0.49796747967479676,
"grad_norm": 5.040658473968506,
"learning_rate": 4.279436042663443e-05,
"loss": 0.6497,
"step": 3430
},
{
"epoch": 0.4994192799070848,
"grad_norm": 0.7379769682884216,
"learning_rate": 4.275421009761509e-05,
"loss": 0.6061,
"step": 3440
},
{
"epoch": 0.5008710801393729,
"grad_norm": 0.975500226020813,
"learning_rate": 4.271396716740374e-05,
"loss": 0.601,
"step": 3450
},
{
"epoch": 0.5023228803716608,
"grad_norm": 1.0296087265014648,
"learning_rate": 4.267363184589669e-05,
"loss": 0.5649,
"step": 3460
},
{
"epoch": 0.5037746806039489,
"grad_norm": 1.4896851778030396,
"learning_rate": 4.2633204343472146e-05,
"loss": 0.6021,
"step": 3470
},
{
"epoch": 0.5052264808362369,
"grad_norm": 1.235889196395874,
"learning_rate": 4.25926848709891e-05,
"loss": 0.4451,
"step": 3480
},
{
"epoch": 0.506678281068525,
"grad_norm": 0.9615374207496643,
"learning_rate": 4.255207363978625e-05,
"loss": 0.4711,
"step": 3490
},
{
"epoch": 0.508130081300813,
"grad_norm": 1.6776018142700195,
"learning_rate": 4.251137086168086e-05,
"loss": 0.7406,
"step": 3500
},
{
"epoch": 0.509581881533101,
"grad_norm": 1.5150796175003052,
"learning_rate": 4.247057674896771e-05,
"loss": 0.496,
"step": 3510
},
{
"epoch": 0.5110336817653891,
"grad_norm": 1.1669261455535889,
"learning_rate": 4.24296915144179e-05,
"loss": 0.8257,
"step": 3520
},
{
"epoch": 0.5124854819976771,
"grad_norm": 0.6701371073722839,
"learning_rate": 4.2388715371277875e-05,
"loss": 0.8408,
"step": 3530
},
{
"epoch": 0.5139372822299652,
"grad_norm": 1.5670065879821777,
"learning_rate": 4.234764853326817e-05,
"loss": 0.9285,
"step": 3540
},
{
"epoch": 0.5153890824622532,
"grad_norm": 0.589513897895813,
"learning_rate": 4.230649121458239e-05,
"loss": 0.7376,
"step": 3550
},
{
"epoch": 0.5168408826945412,
"grad_norm": 0.7740994095802307,
"learning_rate": 4.226524362988605e-05,
"loss": 0.5336,
"step": 3560
},
{
"epoch": 0.5182926829268293,
"grad_norm": 1.503607153892517,
"learning_rate": 4.222390599431549e-05,
"loss": 0.8121,
"step": 3570
},
{
"epoch": 0.5197444831591173,
"grad_norm": 1.1378567218780518,
"learning_rate": 4.21824785234767e-05,
"loss": 1.0838,
"step": 3580
},
{
"epoch": 0.5211962833914053,
"grad_norm": 0.8732675313949585,
"learning_rate": 4.214096143344425e-05,
"loss": 0.6242,
"step": 3590
},
{
"epoch": 0.5226480836236934,
"grad_norm": 1.4234071969985962,
"learning_rate": 4.2099354940760124e-05,
"loss": 0.7382,
"step": 3600
},
{
"epoch": 0.5240998838559814,
"grad_norm": 0.9399917721748352,
"learning_rate": 4.205765926243264e-05,
"loss": 0.6173,
"step": 3610
},
{
"epoch": 0.5255516840882695,
"grad_norm": 1.9771159887313843,
"learning_rate": 4.201587461593522e-05,
"loss": 0.9029,
"step": 3620
},
{
"epoch": 0.5270034843205574,
"grad_norm": 2.2527432441711426,
"learning_rate": 4.197400121920539e-05,
"loss": 0.624,
"step": 3630
},
{
"epoch": 0.5284552845528455,
"grad_norm": 1.7543494701385498,
"learning_rate": 4.193203929064353e-05,
"loss": 0.5714,
"step": 3640
},
{
"epoch": 0.5299070847851336,
"grad_norm": 0.9363800883293152,
"learning_rate": 4.1889989049111794e-05,
"loss": 0.5273,
"step": 3650
},
{
"epoch": 0.5313588850174216,
"grad_norm": 0.6811619400978088,
"learning_rate": 4.184785071393295e-05,
"loss": 0.634,
"step": 3660
},
{
"epoch": 0.5328106852497096,
"grad_norm": 1.3300182819366455,
"learning_rate": 4.180562450488923e-05,
"loss": 0.7374,
"step": 3670
},
{
"epoch": 0.5342624854819977,
"grad_norm": 5.183244228363037,
"learning_rate": 4.17633106422212e-05,
"loss": 0.6945,
"step": 3680
},
{
"epoch": 0.5357142857142857,
"grad_norm": 3.467090368270874,
"learning_rate": 4.1720909346626624e-05,
"loss": 0.52,
"step": 3690
},
{
"epoch": 0.5371660859465738,
"grad_norm": 0.5636081099510193,
"learning_rate": 4.167842083925926e-05,
"loss": 0.7019,
"step": 3700
},
{
"epoch": 0.5386178861788617,
"grad_norm": 0.8139100074768066,
"learning_rate": 4.163584534172774e-05,
"loss": 0.6844,
"step": 3710
},
{
"epoch": 0.5400696864111498,
"grad_norm": 0.3868808150291443,
"learning_rate": 4.1593183076094445e-05,
"loss": 0.4764,
"step": 3720
},
{
"epoch": 0.5415214866434379,
"grad_norm": 3.8870656490325928,
"learning_rate": 4.155043426487429e-05,
"loss": 0.6925,
"step": 3730
},
{
"epoch": 0.5429732868757259,
"grad_norm": 1.7030867338180542,
"learning_rate": 4.150759913103359e-05,
"loss": 0.5368,
"step": 3740
},
{
"epoch": 0.544425087108014,
"grad_norm": 1.52249276638031,
"learning_rate": 4.1464677897988904e-05,
"loss": 0.6469,
"step": 3750
},
{
"epoch": 0.5458768873403019,
"grad_norm": 1.3640564680099487,
"learning_rate": 4.1421670789605856e-05,
"loss": 0.6186,
"step": 3760
},
{
"epoch": 0.54732868757259,
"grad_norm": 0.9472920298576355,
"learning_rate": 4.137857803019797e-05,
"loss": 0.6701,
"step": 3770
},
{
"epoch": 0.5487804878048781,
"grad_norm": 2.9700679779052734,
"learning_rate": 4.1335399844525514e-05,
"loss": 0.6616,
"step": 3780
},
{
"epoch": 0.5502322880371661,
"grad_norm": 1.1544781923294067,
"learning_rate": 4.129213645779431e-05,
"loss": 0.6644,
"step": 3790
},
{
"epoch": 0.5516840882694541,
"grad_norm": 2.1192784309387207,
"learning_rate": 4.124878809565455e-05,
"loss": 0.5912,
"step": 3800
},
{
"epoch": 0.5531358885017421,
"grad_norm": 0.9204639196395874,
"learning_rate": 4.1205354984199665e-05,
"loss": 1.0158,
"step": 3810
},
{
"epoch": 0.5545876887340302,
"grad_norm": 1.1523475646972656,
"learning_rate": 4.116183734996509e-05,
"loss": 0.5879,
"step": 3820
},
{
"epoch": 0.5560394889663183,
"grad_norm": 1.5894629955291748,
"learning_rate": 4.1118235419927125e-05,
"loss": 0.5309,
"step": 3830
},
{
"epoch": 0.5574912891986062,
"grad_norm": 1.463646650314331,
"learning_rate": 4.107454942150173e-05,
"loss": 0.5955,
"step": 3840
},
{
"epoch": 0.5589430894308943,
"grad_norm": 0.8998947739601135,
"learning_rate": 4.103077958254334e-05,
"loss": 0.5999,
"step": 3850
},
{
"epoch": 0.5603948896631823,
"grad_norm": 1.8093136548995972,
"learning_rate": 4.098692613134367e-05,
"loss": 0.7605,
"step": 3860
},
{
"epoch": 0.5618466898954704,
"grad_norm": 1.070966124534607,
"learning_rate": 4.0942989296630566e-05,
"loss": 0.7076,
"step": 3870
},
{
"epoch": 0.5632984901277585,
"grad_norm": 1.424028754234314,
"learning_rate": 4.0898969307566734e-05,
"loss": 0.553,
"step": 3880
},
{
"epoch": 0.5647502903600464,
"grad_norm": 4.3886189460754395,
"learning_rate": 4.0854866393748633e-05,
"loss": 0.6369,
"step": 3890
},
{
"epoch": 0.5662020905923345,
"grad_norm": 0.7212158441543579,
"learning_rate": 4.081068078520521e-05,
"loss": 0.5729,
"step": 3900
},
{
"epoch": 0.5676538908246226,
"grad_norm": 1.5475590229034424,
"learning_rate": 4.076641271239674e-05,
"loss": 0.6781,
"step": 3910
},
{
"epoch": 0.5691056910569106,
"grad_norm": 2.9124624729156494,
"learning_rate": 4.072206240621359e-05,
"loss": 0.3627,
"step": 3920
},
{
"epoch": 0.5705574912891986,
"grad_norm": 3.567720651626587,
"learning_rate": 4.067763009797506e-05,
"loss": 0.6201,
"step": 3930
},
{
"epoch": 0.5720092915214866,
"grad_norm": 1.0543193817138672,
"learning_rate": 4.063311601942814e-05,
"loss": 0.8288,
"step": 3940
},
{
"epoch": 0.5734610917537747,
"grad_norm": 2.356640338897705,
"learning_rate": 4.058852040274629e-05,
"loss": 0.7107,
"step": 3950
},
{
"epoch": 0.5749128919860628,
"grad_norm": 1.225469946861267,
"learning_rate": 4.054384348052829e-05,
"loss": 0.7114,
"step": 3960
},
{
"epoch": 0.5763646922183507,
"grad_norm": 1.6612083911895752,
"learning_rate": 4.049908548579695e-05,
"loss": 0.6198,
"step": 3970
},
{
"epoch": 0.5778164924506388,
"grad_norm": 0.8432019352912903,
"learning_rate": 4.0454246651997976e-05,
"loss": 0.641,
"step": 3980
},
{
"epoch": 0.5792682926829268,
"grad_norm": 1.41001296043396,
"learning_rate": 4.040932721299866e-05,
"loss": 0.6773,
"step": 3990
},
{
"epoch": 0.5807200929152149,
"grad_norm": 4.1915788650512695,
"learning_rate": 4.036432740308675e-05,
"loss": 0.708,
"step": 4000
},
{
"epoch": 0.582171893147503,
"grad_norm": 1.1455175876617432,
"learning_rate": 4.031924745696915e-05,
"loss": 0.687,
"step": 4010
},
{
"epoch": 0.5836236933797909,
"grad_norm": 0.27715983986854553,
"learning_rate": 4.027408760977078e-05,
"loss": 0.6192,
"step": 4020
},
{
"epoch": 0.585075493612079,
"grad_norm": 0.9823393821716309,
"learning_rate": 4.022884809703325e-05,
"loss": 0.7417,
"step": 4030
},
{
"epoch": 0.586527293844367,
"grad_norm": 1.0322932004928589,
"learning_rate": 4.018352915471373e-05,
"loss": 0.6031,
"step": 4040
},
{
"epoch": 0.587979094076655,
"grad_norm": 1.231325387954712,
"learning_rate": 4.0138131019183635e-05,
"loss": 0.6654,
"step": 4050
},
{
"epoch": 0.5894308943089431,
"grad_norm": 0.7293880581855774,
"learning_rate": 4.009265392722745e-05,
"loss": 0.7368,
"step": 4060
},
{
"epoch": 0.5908826945412311,
"grad_norm": 1.2683119773864746,
"learning_rate": 4.0047098116041494e-05,
"loss": 0.7025,
"step": 4070
},
{
"epoch": 0.5923344947735192,
"grad_norm": 3.7659318447113037,
"learning_rate": 4.000146382323262e-05,
"loss": 0.6851,
"step": 4080
},
{
"epoch": 0.5937862950058072,
"grad_norm": 0.5533025860786438,
"learning_rate": 3.995575128681706e-05,
"loss": 0.7296,
"step": 4090
},
{
"epoch": 0.5952380952380952,
"grad_norm": 1.3915671110153198,
"learning_rate": 3.990996074521912e-05,
"loss": 0.8556,
"step": 4100
},
{
"epoch": 0.5966898954703833,
"grad_norm": 1.290931224822998,
"learning_rate": 3.986409243726997e-05,
"loss": 0.6936,
"step": 4110
},
{
"epoch": 0.5981416957026713,
"grad_norm": 1.8250644207000732,
"learning_rate": 3.981814660220639e-05,
"loss": 0.48,
"step": 4120
},
{
"epoch": 0.5995934959349594,
"grad_norm": 5.125851631164551,
"learning_rate": 3.977212347966951e-05,
"loss": 0.6769,
"step": 4130
},
{
"epoch": 0.6010452961672473,
"grad_norm": 1.0293982028961182,
"learning_rate": 3.9726023309703586e-05,
"loss": 0.4873,
"step": 4140
},
{
"epoch": 0.6024970963995354,
"grad_norm": 1.5232713222503662,
"learning_rate": 3.9679846332754716e-05,
"loss": 0.5796,
"step": 4150
},
{
"epoch": 0.6039488966318235,
"grad_norm": 1.948309302330017,
"learning_rate": 3.963359278966962e-05,
"loss": 0.7975,
"step": 4160
},
{
"epoch": 0.6054006968641115,
"grad_norm": 4.971721649169922,
"learning_rate": 3.9587262921694343e-05,
"loss": 0.5604,
"step": 4170
},
{
"epoch": 0.6068524970963995,
"grad_norm": 0.7850014567375183,
"learning_rate": 3.954085697047305e-05,
"loss": 0.6898,
"step": 4180
},
{
"epoch": 0.6083042973286876,
"grad_norm": 0.5327876210212708,
"learning_rate": 3.949437517804672e-05,
"loss": 0.7244,
"step": 4190
},
{
"epoch": 0.6097560975609756,
"grad_norm": 2.595165729522705,
"learning_rate": 3.944781778685189e-05,
"loss": 0.6537,
"step": 4200
},
{
"epoch": 0.6112078977932637,
"grad_norm": 3.179577350616455,
"learning_rate": 3.940118503971941e-05,
"loss": 0.6315,
"step": 4210
},
{
"epoch": 0.6126596980255516,
"grad_norm": 4.726830959320068,
"learning_rate": 3.935447717987318e-05,
"loss": 0.9359,
"step": 4220
},
{
"epoch": 0.6141114982578397,
"grad_norm": 0.4002162516117096,
"learning_rate": 3.930769445092883e-05,
"loss": 0.7475,
"step": 4230
},
{
"epoch": 0.6155632984901278,
"grad_norm": 1.5376918315887451,
"learning_rate": 3.9260837096892536e-05,
"loss": 0.8695,
"step": 4240
},
{
"epoch": 0.6170150987224158,
"grad_norm": 1.1458797454833984,
"learning_rate": 3.921390536215966e-05,
"loss": 0.5302,
"step": 4250
},
{
"epoch": 0.6184668989547039,
"grad_norm": 2.180319309234619,
"learning_rate": 3.916689949151352e-05,
"loss": 0.6508,
"step": 4260
},
{
"epoch": 0.6199186991869918,
"grad_norm": 0.7947795391082764,
"learning_rate": 3.911981973012413e-05,
"loss": 0.5396,
"step": 4270
},
{
"epoch": 0.6213704994192799,
"grad_norm": 2.065096616744995,
"learning_rate": 3.907266632354687e-05,
"loss": 0.6551,
"step": 4280
},
{
"epoch": 0.622822299651568,
"grad_norm": 0.585402250289917,
"learning_rate": 3.902543951772125e-05,
"loss": 0.8218,
"step": 4290
},
{
"epoch": 0.624274099883856,
"grad_norm": 0.9007218480110168,
"learning_rate": 3.897813955896961e-05,
"loss": 0.6261,
"step": 4300
},
{
"epoch": 0.625725900116144,
"grad_norm": 1.722657322883606,
"learning_rate": 3.8930766693995836e-05,
"loss": 0.6373,
"step": 4310
},
{
"epoch": 0.627177700348432,
"grad_norm": 2.8142952919006348,
"learning_rate": 3.888332116988405e-05,
"loss": 0.7586,
"step": 4320
},
{
"epoch": 0.6286295005807201,
"grad_norm": 0.6167258620262146,
"learning_rate": 3.883580323409739e-05,
"loss": 0.6376,
"step": 4330
},
{
"epoch": 0.6300813008130082,
"grad_norm": 1.2382534742355347,
"learning_rate": 3.878821313447662e-05,
"loss": 0.7507,
"step": 4340
},
{
"epoch": 0.6315331010452961,
"grad_norm": 1.4185280799865723,
"learning_rate": 3.874055111923895e-05,
"loss": 0.8366,
"step": 4350
},
{
"epoch": 0.6329849012775842,
"grad_norm": 1.5447771549224854,
"learning_rate": 3.869281743697664e-05,
"loss": 0.7417,
"step": 4360
},
{
"epoch": 0.6344367015098722,
"grad_norm": 0.8044071793556213,
"learning_rate": 3.864501233665574e-05,
"loss": 0.6307,
"step": 4370
},
{
"epoch": 0.6358885017421603,
"grad_norm": 1.0656015872955322,
"learning_rate": 3.8597136067614834e-05,
"loss": 0.8411,
"step": 4380
},
{
"epoch": 0.6373403019744484,
"grad_norm": 1.03560471534729,
"learning_rate": 3.854918887956369e-05,
"loss": 0.4866,
"step": 4390
},
{
"epoch": 0.6387921022067363,
"grad_norm": 3.3328843116760254,
"learning_rate": 3.850117102258194e-05,
"loss": 0.5966,
"step": 4400
},
{
"epoch": 0.6402439024390244,
"grad_norm": 0.6904016733169556,
"learning_rate": 3.8453082747117866e-05,
"loss": 0.7452,
"step": 4410
},
{
"epoch": 0.6416957026713124,
"grad_norm": 1.4979177713394165,
"learning_rate": 3.8404924303986966e-05,
"loss": 0.5983,
"step": 4420
},
{
"epoch": 0.6431475029036005,
"grad_norm": 0.5199301838874817,
"learning_rate": 3.8356695944370766e-05,
"loss": 0.6088,
"step": 4430
},
{
"epoch": 0.6445993031358885,
"grad_norm": 0.7011024355888367,
"learning_rate": 3.8308397919815425e-05,
"loss": 0.8235,
"step": 4440
},
{
"epoch": 0.6460511033681765,
"grad_norm": 0.6176084280014038,
"learning_rate": 3.826003048223048e-05,
"loss": 0.5582,
"step": 4450
},
{
"epoch": 0.6475029036004646,
"grad_norm": 0.8521440029144287,
"learning_rate": 3.8211593883887486e-05,
"loss": 0.608,
"step": 4460
},
{
"epoch": 0.6489547038327527,
"grad_norm": 1.2053148746490479,
"learning_rate": 3.816308837741875e-05,
"loss": 0.6533,
"step": 4470
},
{
"epoch": 0.6504065040650406,
"grad_norm": 1.701720952987671,
"learning_rate": 3.811451421581595e-05,
"loss": 0.6655,
"step": 4480
},
{
"epoch": 0.6518583042973287,
"grad_norm": 1.8435336351394653,
"learning_rate": 3.8065871652428874e-05,
"loss": 0.6773,
"step": 4490
},
{
"epoch": 0.6533101045296167,
"grad_norm": 3.5968480110168457,
"learning_rate": 3.801716094096407e-05,
"loss": 0.8139,
"step": 4500
},
{
"epoch": 0.6547619047619048,
"grad_norm": 0.776545524597168,
"learning_rate": 3.796838233548353e-05,
"loss": 0.758,
"step": 4510
},
{
"epoch": 0.6562137049941928,
"grad_norm": 1.1160175800323486,
"learning_rate": 3.7919536090403366e-05,
"loss": 0.4703,
"step": 4520
},
{
"epoch": 0.6576655052264808,
"grad_norm": 1.2551127672195435,
"learning_rate": 3.787062246049245e-05,
"loss": 0.8029,
"step": 4530
},
{
"epoch": 0.6591173054587689,
"grad_norm": 1.130473256111145,
"learning_rate": 3.7821641700871174e-05,
"loss": 0.6633,
"step": 4540
},
{
"epoch": 0.6605691056910569,
"grad_norm": 0.6870506405830383,
"learning_rate": 3.7772594067010005e-05,
"loss": 0.5136,
"step": 4550
},
{
"epoch": 0.662020905923345,
"grad_norm": 1.1664706468582153,
"learning_rate": 3.772347981472824e-05,
"loss": 0.7384,
"step": 4560
},
{
"epoch": 0.663472706155633,
"grad_norm": 1.849837303161621,
"learning_rate": 3.767429920019261e-05,
"loss": 0.6037,
"step": 4570
},
{
"epoch": 0.664924506387921,
"grad_norm": 1.2257493734359741,
"learning_rate": 3.7625052479916015e-05,
"loss": 0.7564,
"step": 4580
},
{
"epoch": 0.6663763066202091,
"grad_norm": 1.277335286140442,
"learning_rate": 3.7575739910756124e-05,
"loss": 0.6522,
"step": 4590
},
{
"epoch": 0.667828106852497,
"grad_norm": 0.8080965280532837,
"learning_rate": 3.752636174991403e-05,
"loss": 0.8077,
"step": 4600
},
{
"epoch": 0.6692799070847851,
"grad_norm": 1.9517686367034912,
"learning_rate": 3.747691825493298e-05,
"loss": 0.5579,
"step": 4610
},
{
"epoch": 0.6707317073170732,
"grad_norm": 1.0174436569213867,
"learning_rate": 3.742740968369697e-05,
"loss": 0.8038,
"step": 4620
},
{
"epoch": 0.6721835075493612,
"grad_norm": 0.6888383626937866,
"learning_rate": 3.73778362944294e-05,
"loss": 0.8365,
"step": 4630
},
{
"epoch": 0.6736353077816493,
"grad_norm": 2.7746047973632812,
"learning_rate": 3.732819834569176e-05,
"loss": 0.5363,
"step": 4640
},
{
"epoch": 0.6750871080139372,
"grad_norm": 0.43378978967666626,
"learning_rate": 3.7278496096382254e-05,
"loss": 0.5768,
"step": 4650
},
{
"epoch": 0.6765389082462253,
"grad_norm": 1.7999366521835327,
"learning_rate": 3.722872980573448e-05,
"loss": 0.7168,
"step": 4660
},
{
"epoch": 0.6779907084785134,
"grad_norm": 0.7228707075119019,
"learning_rate": 3.717889973331603e-05,
"loss": 0.8107,
"step": 4670
},
{
"epoch": 0.6794425087108014,
"grad_norm": 1.048464059829712,
"learning_rate": 3.7129006139027203e-05,
"loss": 0.6335,
"step": 4680
},
{
"epoch": 0.6808943089430894,
"grad_norm": 3.776031494140625,
"learning_rate": 3.707904928309956e-05,
"loss": 0.5367,
"step": 4690
},
{
"epoch": 0.6823461091753774,
"grad_norm": 4.042102336883545,
"learning_rate": 3.7029029426094666e-05,
"loss": 0.5869,
"step": 4700
},
{
"epoch": 0.6837979094076655,
"grad_norm": 2.6105918884277344,
"learning_rate": 3.6978946828902646e-05,
"loss": 0.4038,
"step": 4710
},
{
"epoch": 0.6852497096399536,
"grad_norm": 0.17694531381130219,
"learning_rate": 3.6928801752740895e-05,
"loss": 0.6876,
"step": 4720
},
{
"epoch": 0.6867015098722415,
"grad_norm": 1.4261376857757568,
"learning_rate": 3.687859445915265e-05,
"loss": 0.4988,
"step": 4730
},
{
"epoch": 0.6881533101045296,
"grad_norm": 3.2906527519226074,
"learning_rate": 3.682832521000568e-05,
"loss": 0.6203,
"step": 4740
},
{
"epoch": 0.6896051103368177,
"grad_norm": 0.8446171283721924,
"learning_rate": 3.677799426749088e-05,
"loss": 0.9472,
"step": 4750
},
{
"epoch": 0.6910569105691057,
"grad_norm": 1.2324299812316895,
"learning_rate": 3.6727601894120945e-05,
"loss": 0.6428,
"step": 4760
},
{
"epoch": 0.6925087108013938,
"grad_norm": 2.0250608921051025,
"learning_rate": 3.667714835272895e-05,
"loss": 0.55,
"step": 4770
},
{
"epoch": 0.6939605110336817,
"grad_norm": 1.788245677947998,
"learning_rate": 3.662663390646701e-05,
"loss": 0.672,
"step": 4780
},
{
"epoch": 0.6954123112659698,
"grad_norm": 2.5829572677612305,
"learning_rate": 3.657605881880493e-05,
"loss": 0.4385,
"step": 4790
},
{
"epoch": 0.6968641114982579,
"grad_norm": 0.9620968699455261,
"learning_rate": 3.652542335352878e-05,
"loss": 0.8065,
"step": 4800
},
{
"epoch": 0.6983159117305459,
"grad_norm": 1.38759183883667,
"learning_rate": 3.647472777473954e-05,
"loss": 0.7473,
"step": 4810
},
{
"epoch": 0.6997677119628339,
"grad_norm": 1.4988477230072021,
"learning_rate": 3.6423972346851744e-05,
"loss": 0.6581,
"step": 4820
},
{
"epoch": 0.7012195121951219,
"grad_norm": 1.095119595527649,
"learning_rate": 3.637315733459207e-05,
"loss": 0.5304,
"step": 4830
},
{
"epoch": 0.70267131242741,
"grad_norm": 0.6751285791397095,
"learning_rate": 3.6322283002997964e-05,
"loss": 0.7912,
"step": 4840
},
{
"epoch": 0.7041231126596981,
"grad_norm": 4.3074564933776855,
"learning_rate": 3.62713496174163e-05,
"loss": 0.545,
"step": 4850
},
{
"epoch": 0.705574912891986,
"grad_norm": 1.85584557056427,
"learning_rate": 3.622035744350192e-05,
"loss": 0.9848,
"step": 4860
},
{
"epoch": 0.7070267131242741,
"grad_norm": 1.2834818363189697,
"learning_rate": 3.6169306747216324e-05,
"loss": 0.7151,
"step": 4870
},
{
"epoch": 0.7084785133565621,
"grad_norm": 2.248262882232666,
"learning_rate": 3.611819779482623e-05,
"loss": 0.5322,
"step": 4880
},
{
"epoch": 0.7099303135888502,
"grad_norm": 2.055523633956909,
"learning_rate": 3.606703085290221e-05,
"loss": 0.6814,
"step": 4890
},
{
"epoch": 0.7113821138211383,
"grad_norm": 1.6206103563308716,
"learning_rate": 3.601580618831727e-05,
"loss": 0.8505,
"step": 4900
},
{
"epoch": 0.7128339140534262,
"grad_norm": 1.4901407957077026,
"learning_rate": 3.5964524068245536e-05,
"loss": 0.9409,
"step": 4910
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.2524491548538208,
"learning_rate": 3.591318476016076e-05,
"loss": 0.6961,
"step": 4920
},
{
"epoch": 0.7157375145180023,
"grad_norm": 1.2523133754730225,
"learning_rate": 3.586178853183498e-05,
"loss": 0.7585,
"step": 4930
},
{
"epoch": 0.7171893147502904,
"grad_norm": 1.0829603672027588,
"learning_rate": 3.581033565133713e-05,
"loss": 0.6737,
"step": 4940
},
{
"epoch": 0.7186411149825784,
"grad_norm": 2.06748628616333,
"learning_rate": 3.5758826387031626e-05,
"loss": 0.7715,
"step": 4950
},
{
"epoch": 0.7200929152148664,
"grad_norm": 0.8570627570152283,
"learning_rate": 3.570726100757693e-05,
"loss": 0.7153,
"step": 4960
},
{
"epoch": 0.7215447154471545,
"grad_norm": 4.75230073928833,
"learning_rate": 3.5655639781924247e-05,
"loss": 0.447,
"step": 4970
},
{
"epoch": 0.7229965156794426,
"grad_norm": 2.5116281509399414,
"learning_rate": 3.5603962979315996e-05,
"loss": 0.5853,
"step": 4980
},
{
"epoch": 0.7244483159117305,
"grad_norm": 1.00091552734375,
"learning_rate": 3.555223086928453e-05,
"loss": 0.8609,
"step": 4990
},
{
"epoch": 0.7259001161440186,
"grad_norm": 1.0202133655548096,
"learning_rate": 3.550044372165062e-05,
"loss": 0.879,
"step": 5000
},
{
"epoch": 0.7273519163763066,
"grad_norm": 1.4836984872817993,
"learning_rate": 3.5448601806522134e-05,
"loss": 0.3201,
"step": 5010
},
{
"epoch": 0.7288037166085947,
"grad_norm": 1.1861945390701294,
"learning_rate": 3.539670539429256e-05,
"loss": 0.4413,
"step": 5020
},
{
"epoch": 0.7302555168408827,
"grad_norm": 1.24436616897583,
"learning_rate": 3.534475475563967e-05,
"loss": 0.7143,
"step": 5030
},
{
"epoch": 0.7317073170731707,
"grad_norm": 1.162705421447754,
"learning_rate": 3.5292750161524045e-05,
"loss": 0.6185,
"step": 5040
},
{
"epoch": 0.7331591173054588,
"grad_norm": 1.116911768913269,
"learning_rate": 3.5240691883187666e-05,
"loss": 0.6876,
"step": 5050
},
{
"epoch": 0.7346109175377468,
"grad_norm": 0.6887683272361755,
"learning_rate": 3.5188580192152544e-05,
"loss": 0.5068,
"step": 5060
},
{
"epoch": 0.7360627177700348,
"grad_norm": 0.8753703832626343,
"learning_rate": 3.513641536021925e-05,
"loss": 0.8465,
"step": 5070
},
{
"epoch": 0.7375145180023229,
"grad_norm": 1.0913424491882324,
"learning_rate": 3.5084197659465555e-05,
"loss": 0.5948,
"step": 5080
},
{
"epoch": 0.7389663182346109,
"grad_norm": 4.28510856628418,
"learning_rate": 3.503192736224496e-05,
"loss": 0.6233,
"step": 5090
},
{
"epoch": 0.740418118466899,
"grad_norm": 1.444339632987976,
"learning_rate": 3.49796047411853e-05,
"loss": 0.4999,
"step": 5100
},
{
"epoch": 0.741869918699187,
"grad_norm": 1.1212478876113892,
"learning_rate": 3.4927230069187307e-05,
"loss": 0.5284,
"step": 5110
},
{
"epoch": 0.743321718931475,
"grad_norm": 0.00559547683224082,
"learning_rate": 3.487480361942321e-05,
"loss": 0.4229,
"step": 5120
},
{
"epoch": 0.7447735191637631,
"grad_norm": 13.444393157958984,
"learning_rate": 3.482232566533529e-05,
"loss": 0.7992,
"step": 5130
},
{
"epoch": 0.7462253193960511,
"grad_norm": 0.6348085403442383,
"learning_rate": 3.4769796480634456e-05,
"loss": 0.7238,
"step": 5140
},
{
"epoch": 0.7476771196283392,
"grad_norm": 1.069054126739502,
"learning_rate": 3.471721633929885e-05,
"loss": 0.4417,
"step": 5150
},
{
"epoch": 0.7491289198606271,
"grad_norm": 0.9457240104675293,
"learning_rate": 3.466458551557235e-05,
"loss": 0.7843,
"step": 5160
},
{
"epoch": 0.7505807200929152,
"grad_norm": 5.595800399780273,
"learning_rate": 3.4611904283963205e-05,
"loss": 0.8307,
"step": 5170
},
{
"epoch": 0.7520325203252033,
"grad_norm": 0.6603794693946838,
"learning_rate": 3.455917291924256e-05,
"loss": 0.5221,
"step": 5180
},
{
"epoch": 0.7534843205574913,
"grad_norm": 1.487289309501648,
"learning_rate": 3.450639169644308e-05,
"loss": 0.6535,
"step": 5190
},
{
"epoch": 0.7549361207897793,
"grad_norm": 0.9417099952697754,
"learning_rate": 3.445356089085743e-05,
"loss": 0.7801,
"step": 5200
},
{
"epoch": 0.7563879210220673,
"grad_norm": 0.5838674306869507,
"learning_rate": 3.4400680778036906e-05,
"loss": 0.5079,
"step": 5210
},
{
"epoch": 0.7578397212543554,
"grad_norm": 1.297662377357483,
"learning_rate": 3.434775163378997e-05,
"loss": 0.6784,
"step": 5220
},
{
"epoch": 0.7592915214866435,
"grad_norm": 0.6394696235656738,
"learning_rate": 3.4294773734180825e-05,
"loss": 0.5856,
"step": 5230
},
{
"epoch": 0.7607433217189314,
"grad_norm": 3.172327756881714,
"learning_rate": 3.424174735552799e-05,
"loss": 0.7602,
"step": 5240
},
{
"epoch": 0.7621951219512195,
"grad_norm": 1.0046736001968384,
"learning_rate": 3.418867277440278e-05,
"loss": 0.8301,
"step": 5250
},
{
"epoch": 0.7636469221835076,
"grad_norm": 5.960042953491211,
"learning_rate": 3.413555026762799e-05,
"loss": 0.745,
"step": 5260
},
{
"epoch": 0.7650987224157956,
"grad_norm": 0.9394850730895996,
"learning_rate": 3.408238011227635e-05,
"loss": 0.7655,
"step": 5270
},
{
"epoch": 0.7665505226480837,
"grad_norm": 1.9447022676467896,
"learning_rate": 3.402916258566907e-05,
"loss": 0.909,
"step": 5280
},
{
"epoch": 0.7680023228803716,
"grad_norm": 1.3960545063018799,
"learning_rate": 3.3975897965374515e-05,
"loss": 1.0169,
"step": 5290
},
{
"epoch": 0.7694541231126597,
"grad_norm": 1.291868805885315,
"learning_rate": 3.392258652920664e-05,
"loss": 0.8068,
"step": 5300
},
{
"epoch": 0.7709059233449478,
"grad_norm": 0.8512223362922668,
"learning_rate": 3.386922855522356e-05,
"loss": 0.6296,
"step": 5310
},
{
"epoch": 0.7723577235772358,
"grad_norm": 1.03252112865448,
"learning_rate": 3.3815824321726154e-05,
"loss": 0.7254,
"step": 5320
},
{
"epoch": 0.7738095238095238,
"grad_norm": 0.5753119587898254,
"learning_rate": 3.376237410725655e-05,
"loss": 0.8159,
"step": 5330
},
{
"epoch": 0.7752613240418118,
"grad_norm": 0.9350941181182861,
"learning_rate": 3.370887819059672e-05,
"loss": 0.6446,
"step": 5340
},
{
"epoch": 0.7767131242740999,
"grad_norm": 1.6437619924545288,
"learning_rate": 3.3655336850767e-05,
"loss": 0.891,
"step": 5350
},
{
"epoch": 0.778164924506388,
"grad_norm": 2.669983386993408,
"learning_rate": 3.3601750367024645e-05,
"loss": 0.8369,
"step": 5360
},
{
"epoch": 0.7796167247386759,
"grad_norm": 1.661522388458252,
"learning_rate": 3.354811901886234e-05,
"loss": 0.7392,
"step": 5370
},
{
"epoch": 0.781068524970964,
"grad_norm": 0.7996639609336853,
"learning_rate": 3.3494443086006824e-05,
"loss": 0.745,
"step": 5380
},
{
"epoch": 0.782520325203252,
"grad_norm": 0.6470725536346436,
"learning_rate": 3.344072284841734e-05,
"loss": 0.7941,
"step": 5390
},
{
"epoch": 0.7839721254355401,
"grad_norm": 1.523929476737976,
"learning_rate": 3.3386958586284204e-05,
"loss": 0.5812,
"step": 5400
},
{
"epoch": 0.7854239256678281,
"grad_norm": 0.7597313523292542,
"learning_rate": 3.333315058002739e-05,
"loss": 0.4126,
"step": 5410
},
{
"epoch": 0.7868757259001161,
"grad_norm": 2.064470052719116,
"learning_rate": 3.3279299110295e-05,
"loss": 0.7855,
"step": 5420
},
{
"epoch": 0.7883275261324042,
"grad_norm": 0.6145796179771423,
"learning_rate": 3.3225404457961834e-05,
"loss": 0.6219,
"step": 5430
},
{
"epoch": 0.7897793263646922,
"grad_norm": 3.158587694168091,
"learning_rate": 3.317146690412793e-05,
"loss": 0.7321,
"step": 5440
},
{
"epoch": 0.7912311265969802,
"grad_norm": 4.978558540344238,
"learning_rate": 3.311748673011709e-05,
"loss": 0.5758,
"step": 5450
},
{
"epoch": 0.7926829268292683,
"grad_norm": 1.3039811849594116,
"learning_rate": 3.306346421747539e-05,
"loss": 0.7172,
"step": 5460
},
{
"epoch": 0.7941347270615563,
"grad_norm": 0.47538790106773376,
"learning_rate": 3.300939964796977e-05,
"loss": 0.5409,
"step": 5470
},
{
"epoch": 0.7955865272938444,
"grad_norm": 1.0770827531814575,
"learning_rate": 3.295529330358649e-05,
"loss": 0.4414,
"step": 5480
},
{
"epoch": 0.7970383275261324,
"grad_norm": 0.7383883595466614,
"learning_rate": 3.290114546652971e-05,
"loss": 0.5318,
"step": 5490
},
{
"epoch": 0.7984901277584204,
"grad_norm": 0.9000987410545349,
"learning_rate": 3.284695641922e-05,
"loss": 0.5446,
"step": 5500
},
{
"epoch": 0.7999419279907085,
"grad_norm": 2.9022693634033203,
"learning_rate": 3.279272644429291e-05,
"loss": 0.725,
"step": 5510
},
{
"epoch": 0.8013937282229965,
"grad_norm": 1.3384835720062256,
"learning_rate": 3.2738455824597405e-05,
"loss": 0.6995,
"step": 5520
},
{
"epoch": 0.8028455284552846,
"grad_norm": 0.9091627597808838,
"learning_rate": 3.268414484319445e-05,
"loss": 0.5134,
"step": 5530
},
{
"epoch": 0.8042973286875726,
"grad_norm": 3.8653523921966553,
"learning_rate": 3.262979378335557e-05,
"loss": 0.7161,
"step": 5540
},
{
"epoch": 0.8057491289198606,
"grad_norm": 0.8096335530281067,
"learning_rate": 3.257540292856126e-05,
"loss": 0.5652,
"step": 5550
},
{
"epoch": 0.8072009291521487,
"grad_norm": 1.397865653038025,
"learning_rate": 3.252097256249965e-05,
"loss": 0.6965,
"step": 5560
},
{
"epoch": 0.8086527293844367,
"grad_norm": 2.277859926223755,
"learning_rate": 3.246650296906489e-05,
"loss": 0.6531,
"step": 5570
},
{
"epoch": 0.8101045296167247,
"grad_norm": 2.0666253566741943,
"learning_rate": 3.241199443235576e-05,
"loss": 0.4249,
"step": 5580
},
{
"epoch": 0.8115563298490128,
"grad_norm": 1.2161462306976318,
"learning_rate": 3.2357447236674136e-05,
"loss": 0.4259,
"step": 5590
},
{
"epoch": 0.8130081300813008,
"grad_norm": 2.8734538555145264,
"learning_rate": 3.2302861666523564e-05,
"loss": 0.4658,
"step": 5600
},
{
"epoch": 0.8144599303135889,
"grad_norm": 0.739331841468811,
"learning_rate": 3.22482380066077e-05,
"loss": 0.6863,
"step": 5610
},
{
"epoch": 0.8159117305458768,
"grad_norm": 0.8823861479759216,
"learning_rate": 3.2193576541828894e-05,
"loss": 0.6399,
"step": 5620
},
{
"epoch": 0.8173635307781649,
"grad_norm": 1.240403175354004,
"learning_rate": 3.2138877557286675e-05,
"loss": 0.8784,
"step": 5630
},
{
"epoch": 0.818815331010453,
"grad_norm": 1.1647741794586182,
"learning_rate": 3.208414133827623e-05,
"loss": 0.9796,
"step": 5640
},
{
"epoch": 0.820267131242741,
"grad_norm": 1.0195775032043457,
"learning_rate": 3.2029368170287e-05,
"loss": 0.4319,
"step": 5650
},
{
"epoch": 0.8217189314750291,
"grad_norm": 1.4524924755096436,
"learning_rate": 3.197455833900112e-05,
"loss": 0.7408,
"step": 5660
},
{
"epoch": 0.823170731707317,
"grad_norm": 0.5133039355278015,
"learning_rate": 3.191971213029195e-05,
"loss": 0.4198,
"step": 5670
},
{
"epoch": 0.8246225319396051,
"grad_norm": 1.205497145652771,
"learning_rate": 3.186482983022257e-05,
"loss": 0.4425,
"step": 5680
},
{
"epoch": 0.8260743321718932,
"grad_norm": 0.6108511090278625,
"learning_rate": 3.180991172504434e-05,
"loss": 0.6768,
"step": 5690
},
{
"epoch": 0.8275261324041812,
"grad_norm": 1.1527341604232788,
"learning_rate": 3.175495810119533e-05,
"loss": 0.5248,
"step": 5700
},
{
"epoch": 0.8289779326364692,
"grad_norm": 1.3975361585617065,
"learning_rate": 3.16999692452989e-05,
"loss": 0.8838,
"step": 5710
},
{
"epoch": 0.8304297328687572,
"grad_norm": 4.7035603523254395,
"learning_rate": 3.164494544416215e-05,
"loss": 1.0907,
"step": 5720
},
{
"epoch": 0.8318815331010453,
"grad_norm": 1.6571784019470215,
"learning_rate": 3.158988698477445e-05,
"loss": 0.732,
"step": 5730
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.6325092315673828,
"learning_rate": 3.1534794154305935e-05,
"loss": 0.8245,
"step": 5740
},
{
"epoch": 0.8347851335656213,
"grad_norm": 0.9876767992973328,
"learning_rate": 3.1479667240106016e-05,
"loss": 0.7428,
"step": 5750
},
{
"epoch": 0.8362369337979094,
"grad_norm": 0.37352874875068665,
"learning_rate": 3.142450652970187e-05,
"loss": 0.5489,
"step": 5760
},
{
"epoch": 0.8376887340301974,
"grad_norm": 1.924856424331665,
"learning_rate": 3.136931231079696e-05,
"loss": 0.7834,
"step": 5770
},
{
"epoch": 0.8391405342624855,
"grad_norm": 1.8147573471069336,
"learning_rate": 3.1314084871269496e-05,
"loss": 0.6688,
"step": 5780
},
{
"epoch": 0.8405923344947736,
"grad_norm": 0.680001437664032,
"learning_rate": 3.1258824499170975e-05,
"loss": 0.6193,
"step": 5790
},
{
"epoch": 0.8420441347270615,
"grad_norm": 1.3029786348342896,
"learning_rate": 3.1203531482724665e-05,
"loss": 0.694,
"step": 5800
},
{
"epoch": 0.8434959349593496,
"grad_norm": 1.4556697607040405,
"learning_rate": 3.114820611032408e-05,
"loss": 0.7933,
"step": 5810
},
{
"epoch": 0.8449477351916377,
"grad_norm": 1.7649941444396973,
"learning_rate": 3.1092848670531514e-05,
"loss": 0.4818,
"step": 5820
},
{
"epoch": 0.8463995354239257,
"grad_norm": 0.8985478281974792,
"learning_rate": 3.1037459452076504e-05,
"loss": 0.6992,
"step": 5830
},
{
"epoch": 0.8478513356562137,
"grad_norm": 1.0186079740524292,
"learning_rate": 3.0982038743854346e-05,
"loss": 0.2927,
"step": 5840
},
{
"epoch": 0.8493031358885017,
"grad_norm": 1.724191427230835,
"learning_rate": 3.0926586834924555e-05,
"loss": 0.6936,
"step": 5850
},
{
"epoch": 0.8507549361207898,
"grad_norm": 0.25378018617630005,
"learning_rate": 3.087110401450941e-05,
"loss": 0.6692,
"step": 5860
},
{
"epoch": 0.8522067363530779,
"grad_norm": 0.5603824853897095,
"learning_rate": 3.0815590571992394e-05,
"loss": 0.4975,
"step": 5870
},
{
"epoch": 0.8536585365853658,
"grad_norm": 0.4890209138393402,
"learning_rate": 3.076004679691672e-05,
"loss": 0.832,
"step": 5880
},
{
"epoch": 0.8551103368176539,
"grad_norm": 0.6338871121406555,
"learning_rate": 3.0704472978983795e-05,
"loss": 0.6447,
"step": 5890
},
{
"epoch": 0.8565621370499419,
"grad_norm": 0.9809471964836121,
"learning_rate": 3.064886940805174e-05,
"loss": 0.6176,
"step": 5900
},
{
"epoch": 0.85801393728223,
"grad_norm": 0.7329034209251404,
"learning_rate": 3.059323637413385e-05,
"loss": 0.4022,
"step": 5910
},
{
"epoch": 0.859465737514518,
"grad_norm": 1.2352603673934937,
"learning_rate": 3.053757416739708e-05,
"loss": 0.9392,
"step": 5920
},
{
"epoch": 0.860917537746806,
"grad_norm": 1.056897759437561,
"learning_rate": 3.0481883078160555e-05,
"loss": 0.616,
"step": 5930
},
{
"epoch": 0.8623693379790941,
"grad_norm": 0.6841446757316589,
"learning_rate": 3.042616339689404e-05,
"loss": 0.5995,
"step": 5940
},
{
"epoch": 0.8638211382113821,
"grad_norm": 1.3766181468963623,
"learning_rate": 3.0370415414216436e-05,
"loss": 0.6945,
"step": 5950
},
{
"epoch": 0.8652729384436701,
"grad_norm": 0.960422694683075,
"learning_rate": 3.0314639420894242e-05,
"loss": 0.6205,
"step": 5960
},
{
"epoch": 0.8667247386759582,
"grad_norm": 2.2252063751220703,
"learning_rate": 3.0258835707840062e-05,
"loss": 0.67,
"step": 5970
},
{
"epoch": 0.8681765389082462,
"grad_norm": 4.834002494812012,
"learning_rate": 3.020300456611109e-05,
"loss": 0.5169,
"step": 5980
},
{
"epoch": 0.8696283391405343,
"grad_norm": 0.3208721876144409,
"learning_rate": 3.0147146286907546e-05,
"loss": 0.7802,
"step": 5990
},
{
"epoch": 0.8710801393728222,
"grad_norm": 0.6140812039375305,
"learning_rate": 3.0091261161571227e-05,
"loss": 0.753,
"step": 6000
},
{
"epoch": 0.8710801393728222,
"eval_loss": 0.6239650249481201,
"eval_runtime": 107.7332,
"eval_samples_per_second": 13.459,
"eval_steps_per_second": 3.369,
"step": 6000
},
{
"epoch": 0.8725319396051103,
"grad_norm": 0.7981186509132385,
"learning_rate": 3.003534948158393e-05,
"loss": 0.581,
"step": 6010
},
{
"epoch": 0.8739837398373984,
"grad_norm": 1.1279065608978271,
"learning_rate": 2.9979411538565977e-05,
"loss": 0.5993,
"step": 6020
},
{
"epoch": 0.8754355400696864,
"grad_norm": 0.7594296336174011,
"learning_rate": 2.9923447624274647e-05,
"loss": 0.7433,
"step": 6030
},
{
"epoch": 0.8768873403019745,
"grad_norm": 4.225851058959961,
"learning_rate": 2.9867458030602684e-05,
"loss": 0.5974,
"step": 6040
},
{
"epoch": 0.8783391405342624,
"grad_norm": 1.2313289642333984,
"learning_rate": 2.9811443049576793e-05,
"loss": 0.5609,
"step": 6050
},
{
"epoch": 0.8797909407665505,
"grad_norm": 2.6386501789093018,
"learning_rate": 2.9755402973356045e-05,
"loss": 0.9846,
"step": 6060
},
{
"epoch": 0.8812427409988386,
"grad_norm": 1.1028252840042114,
"learning_rate": 2.969933809423045e-05,
"loss": 0.5933,
"step": 6070
},
{
"epoch": 0.8826945412311266,
"grad_norm": 1.0655920505523682,
"learning_rate": 2.964324870461935e-05,
"loss": 0.8486,
"step": 6080
},
{
"epoch": 0.8841463414634146,
"grad_norm": 2.2200887203216553,
"learning_rate": 2.9587135097069934e-05,
"loss": 0.3357,
"step": 6090
},
{
"epoch": 0.8855981416957027,
"grad_norm": 8.945457458496094,
"learning_rate": 2.9530997564255725e-05,
"loss": 0.7661,
"step": 6100
},
{
"epoch": 0.8870499419279907,
"grad_norm": 0.8916497230529785,
"learning_rate": 2.9474836398975005e-05,
"loss": 0.3096,
"step": 6110
},
{
"epoch": 0.8885017421602788,
"grad_norm": 1.2500933408737183,
"learning_rate": 2.9418651894149334e-05,
"loss": 0.7636,
"step": 6120
},
{
"epoch": 0.8899535423925667,
"grad_norm": 1.3231313228607178,
"learning_rate": 2.9362444342822015e-05,
"loss": 0.8473,
"step": 6130
},
{
"epoch": 0.8914053426248548,
"grad_norm": 1.0085506439208984,
"learning_rate": 2.9306214038156516e-05,
"loss": 0.6876,
"step": 6140
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.7650404572486877,
"learning_rate": 2.924996127343502e-05,
"loss": 0.4889,
"step": 6150
},
{
"epoch": 0.8943089430894309,
"grad_norm": 0.7335465550422668,
"learning_rate": 2.9193686342056847e-05,
"loss": 0.6647,
"step": 6160
},
{
"epoch": 0.895760743321719,
"grad_norm": 0.5137434005737305,
"learning_rate": 2.9137389537536913e-05,
"loss": 0.6737,
"step": 6170
},
{
"epoch": 0.8972125435540069,
"grad_norm": 0.9400390386581421,
"learning_rate": 2.9081071153504236e-05,
"loss": 0.6747,
"step": 6180
},
{
"epoch": 0.898664343786295,
"grad_norm": 0.660967230796814,
"learning_rate": 2.9024731483700396e-05,
"loss": 0.4432,
"step": 6190
},
{
"epoch": 0.9001161440185831,
"grad_norm": 2.423039197921753,
"learning_rate": 2.8968370821977963e-05,
"loss": 0.6982,
"step": 6200
},
{
"epoch": 0.9015679442508711,
"grad_norm": 3.0828261375427246,
"learning_rate": 2.8911989462299016e-05,
"loss": 0.5868,
"step": 6210
},
{
"epoch": 0.9030197444831591,
"grad_norm": 2.1633851528167725,
"learning_rate": 2.8855587698733595e-05,
"loss": 0.5404,
"step": 6220
},
{
"epoch": 0.9044715447154471,
"grad_norm": 5.27179479598999,
"learning_rate": 2.8799165825458145e-05,
"loss": 0.7313,
"step": 6230
},
{
"epoch": 0.9059233449477352,
"grad_norm": 0.805304229259491,
"learning_rate": 2.8742724136754005e-05,
"loss": 0.5804,
"step": 6240
},
{
"epoch": 0.9073751451800233,
"grad_norm": 2.6164822578430176,
"learning_rate": 2.868626292700588e-05,
"loss": 0.6612,
"step": 6250
},
{
"epoch": 0.9088269454123112,
"grad_norm": 1.3976331949234009,
"learning_rate": 2.8629782490700253e-05,
"loss": 0.5746,
"step": 6260
},
{
"epoch": 0.9102787456445993,
"grad_norm": 1.42573881149292,
"learning_rate": 2.857328312242392e-05,
"loss": 0.576,
"step": 6270
},
{
"epoch": 0.9117305458768873,
"grad_norm": 2.0388023853302,
"learning_rate": 2.851676511686243e-05,
"loss": 0.7672,
"step": 6280
},
{
"epoch": 0.9131823461091754,
"grad_norm": 1.3161983489990234,
"learning_rate": 2.8460228768798506e-05,
"loss": 0.6011,
"step": 6290
},
{
"epoch": 0.9146341463414634,
"grad_norm": 1.2606275081634521,
"learning_rate": 2.8403674373110562e-05,
"loss": 0.6017,
"step": 6300
},
{
"epoch": 0.9160859465737514,
"grad_norm": 2.2314658164978027,
"learning_rate": 2.8347102224771144e-05,
"loss": 0.6201,
"step": 6310
},
{
"epoch": 0.9175377468060395,
"grad_norm": 1.990546703338623,
"learning_rate": 2.8290512618845367e-05,
"loss": 0.6775,
"step": 6320
},
{
"epoch": 0.9189895470383276,
"grad_norm": 1.7261875867843628,
"learning_rate": 2.823390585048943e-05,
"loss": 0.6419,
"step": 6330
},
{
"epoch": 0.9204413472706156,
"grad_norm": 2.2154932022094727,
"learning_rate": 2.8177282214949047e-05,
"loss": 0.8979,
"step": 6340
},
{
"epoch": 0.9218931475029036,
"grad_norm": 6.259598731994629,
"learning_rate": 2.8120642007557873e-05,
"loss": 0.767,
"step": 6350
},
{
"epoch": 0.9233449477351916,
"grad_norm": 1.4923880100250244,
"learning_rate": 2.806398552373603e-05,
"loss": 0.7091,
"step": 6360
},
{
"epoch": 0.9247967479674797,
"grad_norm": 0.6974102258682251,
"learning_rate": 2.8007313058988527e-05,
"loss": 0.6863,
"step": 6370
},
{
"epoch": 0.9262485481997678,
"grad_norm": 1.9238085746765137,
"learning_rate": 2.7950624908903705e-05,
"loss": 0.555,
"step": 6380
},
{
"epoch": 0.9277003484320557,
"grad_norm": 0.496724933385849,
"learning_rate": 2.789392136915175e-05,
"loss": 0.9554,
"step": 6390
},
{
"epoch": 0.9291521486643438,
"grad_norm": 1.349373459815979,
"learning_rate": 2.7837202735483093e-05,
"loss": 0.8156,
"step": 6400
},
{
"epoch": 0.9306039488966318,
"grad_norm": 1.377130150794983,
"learning_rate": 2.778046930372689e-05,
"loss": 0.7222,
"step": 6410
},
{
"epoch": 0.9320557491289199,
"grad_norm": 1.0762406587600708,
"learning_rate": 2.7723721369789486e-05,
"loss": 0.6956,
"step": 6420
},
{
"epoch": 0.9335075493612079,
"grad_norm": 1.7975473403930664,
"learning_rate": 2.7666959229652867e-05,
"loss": 0.7824,
"step": 6430
},
{
"epoch": 0.9349593495934959,
"grad_norm": 1.836282730102539,
"learning_rate": 2.761018317937311e-05,
"loss": 0.6559,
"step": 6440
},
{
"epoch": 0.936411149825784,
"grad_norm": 1.9735631942749023,
"learning_rate": 2.7553393515078852e-05,
"loss": 0.578,
"step": 6450
},
{
"epoch": 0.937862950058072,
"grad_norm": 1.7507141828536987,
"learning_rate": 2.749659053296973e-05,
"loss": 0.897,
"step": 6460
},
{
"epoch": 0.93931475029036,
"grad_norm": 1.1130051612854004,
"learning_rate": 2.743977452931484e-05,
"loss": 0.5654,
"step": 6470
},
{
"epoch": 0.9407665505226481,
"grad_norm": 0.851780354976654,
"learning_rate": 2.738294580045119e-05,
"loss": 0.5722,
"step": 6480
},
{
"epoch": 0.9422183507549361,
"grad_norm": 0.6273514628410339,
"learning_rate": 2.732610464278219e-05,
"loss": 0.6938,
"step": 6490
},
{
"epoch": 0.9436701509872242,
"grad_norm": 1.4148989915847778,
"learning_rate": 2.7269251352776042e-05,
"loss": 0.5636,
"step": 6500
},
{
"epoch": 0.9451219512195121,
"grad_norm": 0.9783958792686462,
"learning_rate": 2.7212386226964242e-05,
"loss": 0.5425,
"step": 6510
},
{
"epoch": 0.9465737514518002,
"grad_norm": 0.860564649105072,
"learning_rate": 2.7155509561940017e-05,
"loss": 0.6981,
"step": 6520
},
{
"epoch": 0.9480255516840883,
"grad_norm": 1.0383031368255615,
"learning_rate": 2.7098621654356766e-05,
"loss": 0.7683,
"step": 6530
},
{
"epoch": 0.9494773519163763,
"grad_norm": 0.6206135153770447,
"learning_rate": 2.704172280092655e-05,
"loss": 0.5571,
"step": 6540
},
{
"epoch": 0.9509291521486644,
"grad_norm": 1.0526723861694336,
"learning_rate": 2.698481329841851e-05,
"loss": 0.9023,
"step": 6550
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.7944720983505249,
"learning_rate": 2.6927893443657316e-05,
"loss": 0.5719,
"step": 6560
},
{
"epoch": 0.9538327526132404,
"grad_norm": 0.16957837343215942,
"learning_rate": 2.6870963533521655e-05,
"loss": 0.641,
"step": 6570
},
{
"epoch": 0.9552845528455285,
"grad_norm": 0.8909958004951477,
"learning_rate": 2.681402386494264e-05,
"loss": 0.5357,
"step": 6580
},
{
"epoch": 0.9567363530778165,
"grad_norm": 0.8061552047729492,
"learning_rate": 2.6757074734902303e-05,
"loss": 0.8705,
"step": 6590
},
{
"epoch": 0.9581881533101045,
"grad_norm": 0.7766616940498352,
"learning_rate": 2.6700116440432005e-05,
"loss": 0.6641,
"step": 6600
},
{
"epoch": 0.9596399535423926,
"grad_norm": 4.805869102478027,
"learning_rate": 2.6643149278610925e-05,
"loss": 0.4838,
"step": 6610
},
{
"epoch": 0.9610917537746806,
"grad_norm": 1.1826244592666626,
"learning_rate": 2.6586173546564465e-05,
"loss": 0.8335,
"step": 6620
},
{
"epoch": 0.9625435540069687,
"grad_norm": 4.609352111816406,
"learning_rate": 2.6529189541462745e-05,
"loss": 0.5172,
"step": 6630
},
{
"epoch": 0.9639953542392566,
"grad_norm": 1.5737910270690918,
"learning_rate": 2.647219756051904e-05,
"loss": 0.4788,
"step": 6640
},
{
"epoch": 0.9654471544715447,
"grad_norm": 4.146353244781494,
"learning_rate": 2.6415197900988213e-05,
"loss": 0.7194,
"step": 6650
},
{
"epoch": 0.9668989547038328,
"grad_norm": 0.5611397624015808,
"learning_rate": 2.6358190860165187e-05,
"loss": 0.489,
"step": 6660
},
{
"epoch": 0.9683507549361208,
"grad_norm": 2.0827231407165527,
"learning_rate": 2.6301176735383382e-05,
"loss": 0.5859,
"step": 6670
},
{
"epoch": 0.9698025551684089,
"grad_norm": 2.0396342277526855,
"learning_rate": 2.624415582401314e-05,
"loss": 0.7885,
"step": 6680
},
{
"epoch": 0.9712543554006968,
"grad_norm": 2.5447700023651123,
"learning_rate": 2.6187128423460233e-05,
"loss": 0.722,
"step": 6690
},
{
"epoch": 0.9727061556329849,
"grad_norm": 4.586677551269531,
"learning_rate": 2.6130094831164282e-05,
"loss": 0.5383,
"step": 6700
},
{
"epoch": 0.974157955865273,
"grad_norm": 2.4895076751708984,
"learning_rate": 2.607305534459717e-05,
"loss": 0.6993,
"step": 6710
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.8436042666435242,
"learning_rate": 2.6016010261261546e-05,
"loss": 0.6571,
"step": 6720
},
{
"epoch": 0.977061556329849,
"grad_norm": 0.6883308291435242,
"learning_rate": 2.5958959878689253e-05,
"loss": 0.5514,
"step": 6730
},
{
"epoch": 0.978513356562137,
"grad_norm": 2.935514211654663,
"learning_rate": 2.590190449443975e-05,
"loss": 0.6725,
"step": 6740
},
{
"epoch": 0.9799651567944251,
"grad_norm": 2.491732597351074,
"learning_rate": 2.584484440609861e-05,
"loss": 0.6864,
"step": 6750
},
{
"epoch": 0.9814169570267132,
"grad_norm": 2.6545393466949463,
"learning_rate": 2.5787779911275937e-05,
"loss": 0.6371,
"step": 6760
},
{
"epoch": 0.9828687572590011,
"grad_norm": 0.4963870942592621,
"learning_rate": 2.57307113076048e-05,
"loss": 0.6246,
"step": 6770
},
{
"epoch": 0.9843205574912892,
"grad_norm": 0.5385538339614868,
"learning_rate": 2.567363889273971e-05,
"loss": 0.8436,
"step": 6780
},
{
"epoch": 0.9857723577235772,
"grad_norm": 7.346200466156006,
"learning_rate": 2.561656296435506e-05,
"loss": 0.65,
"step": 6790
},
{
"epoch": 0.9872241579558653,
"grad_norm": 1.658962368965149,
"learning_rate": 2.555948382014357e-05,
"loss": 0.6879,
"step": 6800
},
{
"epoch": 0.9886759581881533,
"grad_norm": 1.5802571773529053,
"learning_rate": 2.5502401757814714e-05,
"loss": 0.7704,
"step": 6810
},
{
"epoch": 0.9901277584204413,
"grad_norm": 2.8722903728485107,
"learning_rate": 2.5445317075093223e-05,
"loss": 0.4583,
"step": 6820
},
{
"epoch": 0.9915795586527294,
"grad_norm": 0.8566171526908875,
"learning_rate": 2.5388230069717446e-05,
"loss": 0.8975,
"step": 6830
},
{
"epoch": 0.9930313588850174,
"grad_norm": 0.7352342009544373,
"learning_rate": 2.5331141039437882e-05,
"loss": 0.7039,
"step": 6840
},
{
"epoch": 0.9944831591173054,
"grad_norm": 1.2166061401367188,
"learning_rate": 2.5274050282015587e-05,
"loss": 0.6728,
"step": 6850
},
{
"epoch": 0.9959349593495935,
"grad_norm": 1.2508012056350708,
"learning_rate": 2.521695809522061e-05,
"loss": 0.7019,
"step": 6860
},
{
"epoch": 0.9973867595818815,
"grad_norm": 1.4797356128692627,
"learning_rate": 2.515986477683048e-05,
"loss": 0.5035,
"step": 6870
},
{
"epoch": 0.9988385598141696,
"grad_norm": 0.6893177628517151,
"learning_rate": 2.510277062462861e-05,
"loss": 0.6175,
"step": 6880
},
{
"epoch": 1.0002903600464577,
"grad_norm": 3.4936206340789795,
"learning_rate": 2.504567593640275e-05,
"loss": 0.7674,
"step": 6890
},
{
"epoch": 1.0017421602787457,
"grad_norm": 1.3632289171218872,
"learning_rate": 2.4988581009943477e-05,
"loss": 0.3736,
"step": 6900
},
{
"epoch": 1.0031939605110336,
"grad_norm": 0.46680641174316406,
"learning_rate": 2.4931486143042586e-05,
"loss": 0.4425,
"step": 6910
},
{
"epoch": 1.0046457607433217,
"grad_norm": 0.8818445801734924,
"learning_rate": 2.4874391633491576e-05,
"loss": 0.6905,
"step": 6920
},
{
"epoch": 1.0060975609756098,
"grad_norm": 0.5474444031715393,
"learning_rate": 2.4817297779080073e-05,
"loss": 0.7923,
"step": 6930
},
{
"epoch": 1.0075493612078978,
"grad_norm": 0.8076862096786499,
"learning_rate": 2.4760204877594297e-05,
"loss": 0.6344,
"step": 6940
},
{
"epoch": 1.009001161440186,
"grad_norm": 1.0539401769638062,
"learning_rate": 2.4703113226815474e-05,
"loss": 0.7762,
"step": 6950
},
{
"epoch": 1.0104529616724738,
"grad_norm": 1.6129015684127808,
"learning_rate": 2.4646023124518336e-05,
"loss": 0.5475,
"step": 6960
},
{
"epoch": 1.0119047619047619,
"grad_norm": 3.270751476287842,
"learning_rate": 2.4588934868469522e-05,
"loss": 0.7106,
"step": 6970
},
{
"epoch": 1.01335656213705,
"grad_norm": 1.1488885879516602,
"learning_rate": 2.4531848756426032e-05,
"loss": 0.6126,
"step": 6980
},
{
"epoch": 1.014808362369338,
"grad_norm": 0.7339674234390259,
"learning_rate": 2.447476508613372e-05,
"loss": 0.5384,
"step": 6990
},
{
"epoch": 1.016260162601626,
"grad_norm": 0.2865770161151886,
"learning_rate": 2.4417684155325664e-05,
"loss": 0.6378,
"step": 7000
},
{
"epoch": 1.017711962833914,
"grad_norm": 1.421481728553772,
"learning_rate": 2.4360606261720673e-05,
"loss": 0.6757,
"step": 7010
},
{
"epoch": 1.019163763066202,
"grad_norm": 0.846333384513855,
"learning_rate": 2.430353170302172e-05,
"loss": 0.517,
"step": 7020
},
{
"epoch": 1.0206155632984901,
"grad_norm": 0.3524300158023834,
"learning_rate": 2.4246460776914363e-05,
"loss": 0.6129,
"step": 7030
},
{
"epoch": 1.0220673635307782,
"grad_norm": 0.7928240299224854,
"learning_rate": 2.4189393781065232e-05,
"loss": 0.4327,
"step": 7040
},
{
"epoch": 1.0235191637630663,
"grad_norm": 0.9376094341278076,
"learning_rate": 2.4132331013120453e-05,
"loss": 0.6137,
"step": 7050
},
{
"epoch": 1.0249709639953541,
"grad_norm": 1.046407699584961,
"learning_rate": 2.4075272770704104e-05,
"loss": 0.6877,
"step": 7060
},
{
"epoch": 1.0264227642276422,
"grad_norm": 2.0462183952331543,
"learning_rate": 2.4018219351416645e-05,
"loss": 0.4539,
"step": 7070
},
{
"epoch": 1.0278745644599303,
"grad_norm": 0.4574951231479645,
"learning_rate": 2.3961171052833386e-05,
"loss": 0.9033,
"step": 7080
},
{
"epoch": 1.0293263646922184,
"grad_norm": 3.518298864364624,
"learning_rate": 2.3904128172502946e-05,
"loss": 0.5817,
"step": 7090
},
{
"epoch": 1.0307781649245065,
"grad_norm": 0.598048985004425,
"learning_rate": 2.3847091007945667e-05,
"loss": 0.4244,
"step": 7100
},
{
"epoch": 1.0322299651567943,
"grad_norm": 1.5225111246109009,
"learning_rate": 2.3790059856652083e-05,
"loss": 0.9356,
"step": 7110
},
{
"epoch": 1.0336817653890824,
"grad_norm": 0.9001873135566711,
"learning_rate": 2.3733035016081355e-05,
"loss": 0.4678,
"step": 7120
},
{
"epoch": 1.0351335656213705,
"grad_norm": 2.5215003490448,
"learning_rate": 2.367601678365974e-05,
"loss": 0.5787,
"step": 7130
},
{
"epoch": 1.0365853658536586,
"grad_norm": 0.9304032325744629,
"learning_rate": 2.361900545677903e-05,
"loss": 0.3138,
"step": 7140
},
{
"epoch": 1.0380371660859466,
"grad_norm": 0.9305661916732788,
"learning_rate": 2.3562001332795e-05,
"loss": 0.5626,
"step": 7150
},
{
"epoch": 1.0394889663182345,
"grad_norm": 1.5378453731536865,
"learning_rate": 2.3505004709025842e-05,
"loss": 0.7586,
"step": 7160
},
{
"epoch": 1.0409407665505226,
"grad_norm": 0.8000249266624451,
"learning_rate": 2.3448015882750647e-05,
"loss": 0.4352,
"step": 7170
},
{
"epoch": 1.0423925667828107,
"grad_norm": 0.8322232365608215,
"learning_rate": 2.339103515120783e-05,
"loss": 0.7357,
"step": 7180
},
{
"epoch": 1.0438443670150988,
"grad_norm": 0.9948438405990601,
"learning_rate": 2.3334062811593556e-05,
"loss": 0.657,
"step": 7190
},
{
"epoch": 1.0452961672473868,
"grad_norm": 1.071321725845337,
"learning_rate": 2.3277099161060298e-05,
"loss": 0.5158,
"step": 7200
},
{
"epoch": 1.0467479674796747,
"grad_norm": 0.7249424457550049,
"learning_rate": 2.3220144496715125e-05,
"loss": 0.606,
"step": 7210
},
{
"epoch": 1.0481997677119628,
"grad_norm": 1.2231613397598267,
"learning_rate": 2.3163199115618282e-05,
"loss": 0.4094,
"step": 7220
},
{
"epoch": 1.0496515679442509,
"grad_norm": 1.2972086668014526,
"learning_rate": 2.310626331478159e-05,
"loss": 0.4112,
"step": 7230
},
{
"epoch": 1.051103368176539,
"grad_norm": 1.0579259395599365,
"learning_rate": 2.304933739116688e-05,
"loss": 0.6859,
"step": 7240
},
{
"epoch": 1.052555168408827,
"grad_norm": 1.3413074016571045,
"learning_rate": 2.2992421641684494e-05,
"loss": 0.4698,
"step": 7250
},
{
"epoch": 1.054006968641115,
"grad_norm": 1.203018069267273,
"learning_rate": 2.2935516363191693e-05,
"loss": 0.4366,
"step": 7260
},
{
"epoch": 1.055458768873403,
"grad_norm": 1.540850281715393,
"learning_rate": 2.2878621852491135e-05,
"loss": 0.5985,
"step": 7270
},
{
"epoch": 1.056910569105691,
"grad_norm": 0.8544327616691589,
"learning_rate": 2.28217384063293e-05,
"loss": 0.6348,
"step": 7280
},
{
"epoch": 1.0583623693379791,
"grad_norm": 0.9405458569526672,
"learning_rate": 2.2764866321394963e-05,
"loss": 0.5561,
"step": 7290
},
{
"epoch": 1.0598141695702672,
"grad_norm": 0.6483383178710938,
"learning_rate": 2.2708005894317657e-05,
"loss": 0.6295,
"step": 7300
},
{
"epoch": 1.0612659698025553,
"grad_norm": 1.3376249074935913,
"learning_rate": 2.2651157421666096e-05,
"loss": 0.6177,
"step": 7310
},
{
"epoch": 1.0627177700348431,
"grad_norm": 2.9725067615509033,
"learning_rate": 2.2594321199946656e-05,
"loss": 0.4115,
"step": 7320
},
{
"epoch": 1.0641695702671312,
"grad_norm": 1.1227383613586426,
"learning_rate": 2.253749752560179e-05,
"loss": 0.7575,
"step": 7330
},
{
"epoch": 1.0656213704994193,
"grad_norm": 7.148159027099609,
"learning_rate": 2.248068669500853e-05,
"loss": 0.6736,
"step": 7340
},
{
"epoch": 1.0670731707317074,
"grad_norm": 1.4029227495193481,
"learning_rate": 2.2423889004476915e-05,
"loss": 0.5547,
"step": 7350
},
{
"epoch": 1.0685249709639955,
"grad_norm": 0.13588035106658936,
"learning_rate": 2.2367104750248444e-05,
"loss": 0.5272,
"step": 7360
},
{
"epoch": 1.0699767711962833,
"grad_norm": 1.2609344720840454,
"learning_rate": 2.2310334228494536e-05,
"loss": 0.6262,
"step": 7370
},
{
"epoch": 1.0714285714285714,
"grad_norm": 1.735031008720398,
"learning_rate": 2.2253577735314987e-05,
"loss": 0.4278,
"step": 7380
},
{
"epoch": 1.0728803716608595,
"grad_norm": 5.371007919311523,
"learning_rate": 2.219683556673642e-05,
"loss": 0.6081,
"step": 7390
},
{
"epoch": 1.0743321718931476,
"grad_norm": 2.175072431564331,
"learning_rate": 2.2140108018710758e-05,
"loss": 0.7055,
"step": 7400
},
{
"epoch": 1.0757839721254356,
"grad_norm": 0.49367207288742065,
"learning_rate": 2.208339538711366e-05,
"loss": 0.3842,
"step": 7410
},
{
"epoch": 1.0772357723577235,
"grad_norm": 1.9475051164627075,
"learning_rate": 2.2026697967742968e-05,
"loss": 0.4956,
"step": 7420
},
{
"epoch": 1.0786875725900116,
"grad_norm": 1.74053955078125,
"learning_rate": 2.1970016056317203e-05,
"loss": 0.6627,
"step": 7430
},
{
"epoch": 1.0801393728222997,
"grad_norm": 1.1123576164245605,
"learning_rate": 2.1913349948473996e-05,
"loss": 0.4789,
"step": 7440
},
{
"epoch": 1.0815911730545877,
"grad_norm": 1.6125507354736328,
"learning_rate": 2.1856699939768545e-05,
"loss": 0.4892,
"step": 7450
},
{
"epoch": 1.0830429732868758,
"grad_norm": 1.4963864088058472,
"learning_rate": 2.1800066325672074e-05,
"loss": 0.4966,
"step": 7460
},
{
"epoch": 1.0844947735191637,
"grad_norm": 1.2943956851959229,
"learning_rate": 2.1743449401570324e-05,
"loss": 0.7522,
"step": 7470
},
{
"epoch": 1.0859465737514518,
"grad_norm": 0.6681497097015381,
"learning_rate": 2.1686849462761947e-05,
"loss": 0.5014,
"step": 7480
},
{
"epoch": 1.0873983739837398,
"grad_norm": 1.1527822017669678,
"learning_rate": 2.1630266804457035e-05,
"loss": 0.4268,
"step": 7490
},
{
"epoch": 1.088850174216028,
"grad_norm": 1.0493078231811523,
"learning_rate": 2.157370172177553e-05,
"loss": 0.6676,
"step": 7500
},
{
"epoch": 1.090301974448316,
"grad_norm": 0.7843257784843445,
"learning_rate": 2.1517154509745724e-05,
"loss": 0.4035,
"step": 7510
},
{
"epoch": 1.0917537746806039,
"grad_norm": 1.5716508626937866,
"learning_rate": 2.1460625463302686e-05,
"loss": 0.4774,
"step": 7520
},
{
"epoch": 1.093205574912892,
"grad_norm": 0.881391704082489,
"learning_rate": 2.1404114877286747e-05,
"loss": 0.6217,
"step": 7530
},
{
"epoch": 1.09465737514518,
"grad_norm": 0.4978386461734772,
"learning_rate": 2.134762304644193e-05,
"loss": 0.7448,
"step": 7540
},
{
"epoch": 1.096109175377468,
"grad_norm": 1.047534465789795,
"learning_rate": 2.129115026541447e-05,
"loss": 0.7455,
"step": 7550
},
{
"epoch": 1.0975609756097562,
"grad_norm": 3.125924825668335,
"learning_rate": 2.1234696828751226e-05,
"loss": 0.3793,
"step": 7560
},
{
"epoch": 1.099012775842044,
"grad_norm": 4.937119960784912,
"learning_rate": 2.1178263030898155e-05,
"loss": 0.6671,
"step": 7570
},
{
"epoch": 1.1004645760743321,
"grad_norm": 0.9988604187965393,
"learning_rate": 2.1121849166198793e-05,
"loss": 0.6868,
"step": 7580
},
{
"epoch": 1.1019163763066202,
"grad_norm": 1.7846256494522095,
"learning_rate": 2.106545552889272e-05,
"loss": 0.7165,
"step": 7590
},
{
"epoch": 1.1033681765389083,
"grad_norm": 1.7793424129486084,
"learning_rate": 2.1009082413113973e-05,
"loss": 0.6098,
"step": 7600
},
{
"epoch": 1.1048199767711964,
"grad_norm": 0.6615446209907532,
"learning_rate": 2.095273011288963e-05,
"loss": 0.5701,
"step": 7610
},
{
"epoch": 1.1062717770034842,
"grad_norm": 1.3341655731201172,
"learning_rate": 2.0896398922138122e-05,
"loss": 0.676,
"step": 7620
},
{
"epoch": 1.1077235772357723,
"grad_norm": 1.0205527544021606,
"learning_rate": 2.0840089134667824e-05,
"loss": 0.5475,
"step": 7630
},
{
"epoch": 1.1091753774680604,
"grad_norm": 1.5262418985366821,
"learning_rate": 2.0783801044175467e-05,
"loss": 0.582,
"step": 7640
},
{
"epoch": 1.1106271777003485,
"grad_norm": 2.5063817501068115,
"learning_rate": 2.0727534944244615e-05,
"loss": 0.7552,
"step": 7650
},
{
"epoch": 1.1120789779326365,
"grad_norm": 3.6351969242095947,
"learning_rate": 2.067129112834413e-05,
"loss": 0.6419,
"step": 7660
},
{
"epoch": 1.1135307781649244,
"grad_norm": 0.8957704305648804,
"learning_rate": 2.061506988982665e-05,
"loss": 0.4333,
"step": 7670
},
{
"epoch": 1.1149825783972125,
"grad_norm": 1.9803669452667236,
"learning_rate": 2.0558871521927073e-05,
"loss": 0.4656,
"step": 7680
},
{
"epoch": 1.1164343786295006,
"grad_norm": 0.8719884157180786,
"learning_rate": 2.0502696317760973e-05,
"loss": 0.4252,
"step": 7690
},
{
"epoch": 1.1178861788617886,
"grad_norm": 1.6916320323944092,
"learning_rate": 2.044654457032314e-05,
"loss": 0.7204,
"step": 7700
},
{
"epoch": 1.1193379790940767,
"grad_norm": 1.6074903011322021,
"learning_rate": 2.0390416572486e-05,
"loss": 0.4984,
"step": 7710
},
{
"epoch": 1.1207897793263646,
"grad_norm": 0.2988170087337494,
"learning_rate": 2.033431261699813e-05,
"loss": 0.4557,
"step": 7720
},
{
"epoch": 1.1222415795586527,
"grad_norm": 15.167128562927246,
"learning_rate": 2.0278232996482688e-05,
"loss": 0.551,
"step": 7730
},
{
"epoch": 1.1236933797909407,
"grad_norm": 0.8808531761169434,
"learning_rate": 2.0222178003435926e-05,
"loss": 0.434,
"step": 7740
},
{
"epoch": 1.1251451800232288,
"grad_norm": 0.7921860814094543,
"learning_rate": 2.0166147930225615e-05,
"loss": 0.4803,
"step": 7750
},
{
"epoch": 1.126596980255517,
"grad_norm": 1.9591280221939087,
"learning_rate": 2.011014306908958e-05,
"loss": 0.786,
"step": 7760
},
{
"epoch": 1.1280487804878048,
"grad_norm": 1.479054570198059,
"learning_rate": 2.0054163712134145e-05,
"loss": 0.655,
"step": 7770
},
{
"epoch": 1.1295005807200929,
"grad_norm": 3.091681480407715,
"learning_rate": 1.9998210151332585e-05,
"loss": 0.7444,
"step": 7780
},
{
"epoch": 1.130952380952381,
"grad_norm": 2.064387321472168,
"learning_rate": 1.994228267852366e-05,
"loss": 0.4337,
"step": 7790
},
{
"epoch": 1.132404181184669,
"grad_norm": 1.0761544704437256,
"learning_rate": 1.9886381585410045e-05,
"loss": 0.5395,
"step": 7800
},
{
"epoch": 1.133855981416957,
"grad_norm": 1.1305792331695557,
"learning_rate": 1.9830507163556816e-05,
"loss": 0.6013,
"step": 7810
},
{
"epoch": 1.135307781649245,
"grad_norm": 3.304077386856079,
"learning_rate": 1.977465970438998e-05,
"loss": 0.8103,
"step": 7820
},
{
"epoch": 1.136759581881533,
"grad_norm": 0.8400141596794128,
"learning_rate": 1.9718839499194868e-05,
"loss": 0.5292,
"step": 7830
},
{
"epoch": 1.1382113821138211,
"grad_norm": 5.679340839385986,
"learning_rate": 1.9663046839114684e-05,
"loss": 0.5317,
"step": 7840
},
{
"epoch": 1.1396631823461092,
"grad_norm": 2.914165496826172,
"learning_rate": 1.960728201514896e-05,
"loss": 0.7501,
"step": 7850
},
{
"epoch": 1.1411149825783973,
"grad_norm": 3.093472957611084,
"learning_rate": 1.9551545318152047e-05,
"loss": 0.5741,
"step": 7860
},
{
"epoch": 1.1425667828106851,
"grad_norm": 1.7415759563446045,
"learning_rate": 1.949583703883158e-05,
"loss": 0.5044,
"step": 7870
},
{
"epoch": 1.1440185830429732,
"grad_norm": 4.8877668380737305,
"learning_rate": 1.9440157467746985e-05,
"loss": 0.786,
"step": 7880
},
{
"epoch": 1.1454703832752613,
"grad_norm": 1.9730969667434692,
"learning_rate": 1.9384506895307964e-05,
"loss": 0.7195,
"step": 7890
},
{
"epoch": 1.1469221835075494,
"grad_norm": 12.92557430267334,
"learning_rate": 1.932888561177294e-05,
"loss": 0.5679,
"step": 7900
},
{
"epoch": 1.1483739837398375,
"grad_norm": 2.283071517944336,
"learning_rate": 1.92732939072476e-05,
"loss": 0.5129,
"step": 7910
},
{
"epoch": 1.1498257839721253,
"grad_norm": 0.8420314788818359,
"learning_rate": 1.9217732071683343e-05,
"loss": 0.6232,
"step": 7920
},
{
"epoch": 1.1512775842044134,
"grad_norm": 1.523573637008667,
"learning_rate": 1.9162200394875783e-05,
"loss": 0.6329,
"step": 7930
},
{
"epoch": 1.1527293844367015,
"grad_norm": 3.2268831729888916,
"learning_rate": 1.9106699166463247e-05,
"loss": 0.5248,
"step": 7940
},
{
"epoch": 1.1541811846689896,
"grad_norm": 2.4383325576782227,
"learning_rate": 1.905122867592522e-05,
"loss": 0.725,
"step": 7950
},
{
"epoch": 1.1556329849012776,
"grad_norm": 7.215484142303467,
"learning_rate": 1.8995789212580884e-05,
"loss": 0.4331,
"step": 7960
},
{
"epoch": 1.1570847851335655,
"grad_norm": 1.5999699831008911,
"learning_rate": 1.89403810655876e-05,
"loss": 0.421,
"step": 7970
},
{
"epoch": 1.1585365853658536,
"grad_norm": 0.6313633918762207,
"learning_rate": 1.8885004523939386e-05,
"loss": 0.3322,
"step": 7980
},
{
"epoch": 1.1599883855981417,
"grad_norm": 1.2481117248535156,
"learning_rate": 1.8829659876465406e-05,
"loss": 0.4594,
"step": 7990
},
{
"epoch": 1.1614401858304297,
"grad_norm": 1.2827537059783936,
"learning_rate": 1.8774347411828472e-05,
"loss": 0.603,
"step": 8000
},
{
"epoch": 1.1628919860627178,
"grad_norm": 0.5014917254447937,
"learning_rate": 1.871906741852356e-05,
"loss": 0.3013,
"step": 8010
},
{
"epoch": 1.164343786295006,
"grad_norm": 1.5885872840881348,
"learning_rate": 1.8663820184876247e-05,
"loss": 0.5299,
"step": 8020
},
{
"epoch": 1.1657955865272938,
"grad_norm": 0.0035865483805537224,
"learning_rate": 1.8608605999041297e-05,
"loss": 0.5274,
"step": 8030
},
{
"epoch": 1.1672473867595818,
"grad_norm": 1.4468114376068115,
"learning_rate": 1.8553425149001057e-05,
"loss": 0.4781,
"step": 8040
},
{
"epoch": 1.16869918699187,
"grad_norm": 2.688275098800659,
"learning_rate": 1.8498277922564026e-05,
"loss": 0.4668,
"step": 8050
},
{
"epoch": 1.170150987224158,
"grad_norm": 1.6105045080184937,
"learning_rate": 1.8443164607363333e-05,
"loss": 0.6738,
"step": 8060
},
{
"epoch": 1.171602787456446,
"grad_norm": 1.46797513961792,
"learning_rate": 1.8388085490855217e-05,
"loss": 0.552,
"step": 8070
},
{
"epoch": 1.173054587688734,
"grad_norm": 1.1859605312347412,
"learning_rate": 1.833304086031757e-05,
"loss": 0.4247,
"step": 8080
},
{
"epoch": 1.174506387921022,
"grad_norm": 1.4056955575942993,
"learning_rate": 1.8278031002848394e-05,
"loss": 0.4875,
"step": 8090
},
{
"epoch": 1.17595818815331,
"grad_norm": 1.6861822605133057,
"learning_rate": 1.8223056205364342e-05,
"loss": 0.5837,
"step": 8100
},
{
"epoch": 1.1774099883855982,
"grad_norm": 1.9432148933410645,
"learning_rate": 1.8168116754599186e-05,
"loss": 0.6512,
"step": 8110
},
{
"epoch": 1.1788617886178863,
"grad_norm": 1.438887119293213,
"learning_rate": 1.811321293710235e-05,
"loss": 0.5249,
"step": 8120
},
{
"epoch": 1.1803135888501741,
"grad_norm": 4.159003734588623,
"learning_rate": 1.8058345039237395e-05,
"loss": 0.4055,
"step": 8130
},
{
"epoch": 1.1817653890824622,
"grad_norm": 1.9116485118865967,
"learning_rate": 1.8003513347180557e-05,
"loss": 0.6732,
"step": 8140
},
{
"epoch": 1.1832171893147503,
"grad_norm": 0.8615849614143372,
"learning_rate": 1.7948718146919212e-05,
"loss": 0.4732,
"step": 8150
},
{
"epoch": 1.1846689895470384,
"grad_norm": 1.812454342842102,
"learning_rate": 1.7893959724250402e-05,
"loss": 0.4385,
"step": 8160
},
{
"epoch": 1.1861207897793264,
"grad_norm": 1.0954737663269043,
"learning_rate": 1.7839238364779358e-05,
"loss": 0.4728,
"step": 8170
},
{
"epoch": 1.1875725900116145,
"grad_norm": 3.3820154666900635,
"learning_rate": 1.7784554353918002e-05,
"loss": 0.4665,
"step": 8180
},
{
"epoch": 1.1890243902439024,
"grad_norm": 1.3054349422454834,
"learning_rate": 1.772990797688344e-05,
"loss": 0.8027,
"step": 8190
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.9854569435119629,
"learning_rate": 1.7675299518696503e-05,
"loss": 0.6728,
"step": 8200
},
{
"epoch": 1.1919279907084785,
"grad_norm": 1.3417751789093018,
"learning_rate": 1.7620729264180244e-05,
"loss": 0.6094,
"step": 8210
},
{
"epoch": 1.1933797909407666,
"grad_norm": 1.8543522357940674,
"learning_rate": 1.756619749795846e-05,
"loss": 0.3593,
"step": 8220
},
{
"epoch": 1.1948315911730547,
"grad_norm": 4.067511081695557,
"learning_rate": 1.751170450445418e-05,
"loss": 0.4437,
"step": 8230
},
{
"epoch": 1.1962833914053426,
"grad_norm": 0.7072954773902893,
"learning_rate": 1.7457250567888255e-05,
"loss": 0.4523,
"step": 8240
},
{
"epoch": 1.1977351916376306,
"grad_norm": 2.400019884109497,
"learning_rate": 1.7402835972277774e-05,
"loss": 0.7181,
"step": 8250
},
{
"epoch": 1.1991869918699187,
"grad_norm": 0.7188956141471863,
"learning_rate": 1.734846100143466e-05,
"loss": 0.6106,
"step": 8260
},
{
"epoch": 1.2006387921022068,
"grad_norm": 0.9549878835678101,
"learning_rate": 1.7294125938964163e-05,
"loss": 0.6636,
"step": 8270
},
{
"epoch": 1.202090592334495,
"grad_norm": 1.26228928565979,
"learning_rate": 1.7239831068263366e-05,
"loss": 0.3134,
"step": 8280
},
{
"epoch": 1.2035423925667827,
"grad_norm": 1.7492179870605469,
"learning_rate": 1.718557667251974e-05,
"loss": 0.7868,
"step": 8290
},
{
"epoch": 1.2049941927990708,
"grad_norm": 5.789414405822754,
"learning_rate": 1.7131363034709647e-05,
"loss": 0.3828,
"step": 8300
},
{
"epoch": 1.206445993031359,
"grad_norm": 5.111294746398926,
"learning_rate": 1.7077190437596864e-05,
"loss": 0.5902,
"step": 8310
},
{
"epoch": 1.207897793263647,
"grad_norm": 1.8779693841934204,
"learning_rate": 1.7023059163731097e-05,
"loss": 0.5968,
"step": 8320
},
{
"epoch": 1.209349593495935,
"grad_norm": 4.733475685119629,
"learning_rate": 1.696896949544654e-05,
"loss": 0.6245,
"step": 8330
},
{
"epoch": 1.210801393728223,
"grad_norm": 0.9428911805152893,
"learning_rate": 1.6914921714860378e-05,
"loss": 0.537,
"step": 8340
},
{
"epoch": 1.212253193960511,
"grad_norm": 0.8777297735214233,
"learning_rate": 1.686091610387133e-05,
"loss": 0.5012,
"step": 8350
},
{
"epoch": 1.213704994192799,
"grad_norm": 4.631138801574707,
"learning_rate": 1.680695294415815e-05,
"loss": 0.6156,
"step": 8360
},
{
"epoch": 1.2151567944250872,
"grad_norm": 0.6276788711547852,
"learning_rate": 1.6753032517178187e-05,
"loss": 0.6097,
"step": 8370
},
{
"epoch": 1.2166085946573753,
"grad_norm": 0.7549428939819336,
"learning_rate": 1.6699155104165904e-05,
"loss": 0.7467,
"step": 8380
},
{
"epoch": 1.2180603948896631,
"grad_norm": 0.9138199687004089,
"learning_rate": 1.6645320986131433e-05,
"loss": 0.5846,
"step": 8390
},
{
"epoch": 1.2195121951219512,
"grad_norm": 1.1513859033584595,
"learning_rate": 1.659153044385906e-05,
"loss": 0.4798,
"step": 8400
},
{
"epoch": 1.2209639953542393,
"grad_norm": 1.6771997213363647,
"learning_rate": 1.6537783757905816e-05,
"loss": 0.8278,
"step": 8410
},
{
"epoch": 1.2224157955865274,
"grad_norm": 1.2027699947357178,
"learning_rate": 1.648408120859998e-05,
"loss": 0.7619,
"step": 8420
},
{
"epoch": 1.2238675958188154,
"grad_norm": 2.257286310195923,
"learning_rate": 1.643042307603964e-05,
"loss": 0.7877,
"step": 8430
},
{
"epoch": 1.2253193960511033,
"grad_norm": 0.687853217124939,
"learning_rate": 1.6376809640091174e-05,
"loss": 0.6319,
"step": 8440
},
{
"epoch": 1.2267711962833914,
"grad_norm": 1.3753950595855713,
"learning_rate": 1.63232411803879e-05,
"loss": 0.5018,
"step": 8450
},
{
"epoch": 1.2282229965156795,
"grad_norm": 3.117898464202881,
"learning_rate": 1.6269717976328503e-05,
"loss": 0.6428,
"step": 8460
},
{
"epoch": 1.2296747967479675,
"grad_norm": 1.2253605127334595,
"learning_rate": 1.6216240307075642e-05,
"loss": 0.6265,
"step": 8470
},
{
"epoch": 1.2311265969802556,
"grad_norm": 1.9370412826538086,
"learning_rate": 1.6162808451554483e-05,
"loss": 0.6584,
"step": 8480
},
{
"epoch": 1.2325783972125435,
"grad_norm": 4.468973636627197,
"learning_rate": 1.6109422688451224e-05,
"loss": 0.6343,
"step": 8490
},
{
"epoch": 1.2340301974448316,
"grad_norm": 6.738311290740967,
"learning_rate": 1.605608329621168e-05,
"loss": 0.6665,
"step": 8500
},
{
"epoch": 1.2354819976771196,
"grad_norm": 1.266482949256897,
"learning_rate": 1.6002790553039803e-05,
"loss": 0.7137,
"step": 8510
},
{
"epoch": 1.2369337979094077,
"grad_norm": 0.7233752012252808,
"learning_rate": 1.594954473689621e-05,
"loss": 0.5351,
"step": 8520
},
{
"epoch": 1.2383855981416958,
"grad_norm": 3.379714012145996,
"learning_rate": 1.5896346125496793e-05,
"loss": 0.5488,
"step": 8530
},
{
"epoch": 1.2398373983739837,
"grad_norm": 2.4713003635406494,
"learning_rate": 1.5843194996311213e-05,
"loss": 0.7367,
"step": 8540
},
{
"epoch": 1.2412891986062717,
"grad_norm": 0.4656989574432373,
"learning_rate": 1.5790091626561494e-05,
"loss": 0.3323,
"step": 8550
},
{
"epoch": 1.2427409988385598,
"grad_norm": 1.3530571460723877,
"learning_rate": 1.5737036293220554e-05,
"loss": 0.5089,
"step": 8560
},
{
"epoch": 1.244192799070848,
"grad_norm": 1.5478246212005615,
"learning_rate": 1.568402927301076e-05,
"loss": 0.6737,
"step": 8570
},
{
"epoch": 1.245644599303136,
"grad_norm": 1.6007646322250366,
"learning_rate": 1.5631070842402494e-05,
"loss": 0.5032,
"step": 8580
},
{
"epoch": 1.2470963995354238,
"grad_norm": 1.9949185848236084,
"learning_rate": 1.5578161277612707e-05,
"loss": 0.746,
"step": 8590
},
{
"epoch": 1.248548199767712,
"grad_norm": 1.552194595336914,
"learning_rate": 1.5525300854603486e-05,
"loss": 0.4807,
"step": 8600
},
{
"epoch": 1.25,
"grad_norm": 6.406808376312256,
"learning_rate": 1.547248984908059e-05,
"loss": 0.5125,
"step": 8610
},
{
"epoch": 1.251451800232288,
"grad_norm": 0.6398019790649414,
"learning_rate": 1.5419728536492055e-05,
"loss": 0.3386,
"step": 8620
},
{
"epoch": 1.2529036004645762,
"grad_norm": 1.874664306640625,
"learning_rate": 1.5367017192026713e-05,
"loss": 0.5268,
"step": 8630
},
{
"epoch": 1.254355400696864,
"grad_norm": 2.999232053756714,
"learning_rate": 1.5314356090612776e-05,
"loss": 0.5744,
"step": 8640
},
{
"epoch": 1.255807200929152,
"grad_norm": 9.338212966918945,
"learning_rate": 1.5261745506916408e-05,
"loss": 0.6682,
"step": 8650
},
{
"epoch": 1.2572590011614402,
"grad_norm": 3.1387779712677,
"learning_rate": 1.5209185715340294e-05,
"loss": 0.4691,
"step": 8660
},
{
"epoch": 1.2587108013937283,
"grad_norm": 0.6614925861358643,
"learning_rate": 1.5156676990022184e-05,
"loss": 0.4255,
"step": 8670
},
{
"epoch": 1.2601626016260163,
"grad_norm": 0.9042619466781616,
"learning_rate": 1.5104219604833494e-05,
"loss": 0.3958,
"step": 8680
},
{
"epoch": 1.2616144018583042,
"grad_norm": 0.8313902020454407,
"learning_rate": 1.5051813833377859e-05,
"loss": 0.5207,
"step": 8690
},
{
"epoch": 1.2630662020905923,
"grad_norm": 1.0558016300201416,
"learning_rate": 1.4999459948989702e-05,
"loss": 0.3235,
"step": 8700
},
{
"epoch": 1.2645180023228804,
"grad_norm": 1.1987258195877075,
"learning_rate": 1.4947158224732827e-05,
"loss": 0.4936,
"step": 8710
},
{
"epoch": 1.2659698025551684,
"grad_norm": 1.0946906805038452,
"learning_rate": 1.4894908933398989e-05,
"loss": 0.6256,
"step": 8720
},
{
"epoch": 1.2674216027874565,
"grad_norm": 1.2409650087356567,
"learning_rate": 1.4842712347506443e-05,
"loss": 0.793,
"step": 8730
},
{
"epoch": 1.2688734030197444,
"grad_norm": 0.7660655379295349,
"learning_rate": 1.4790568739298582e-05,
"loss": 0.5611,
"step": 8740
},
{
"epoch": 1.2703252032520325,
"grad_norm": 0.7420207262039185,
"learning_rate": 1.473847838074245e-05,
"loss": 0.6045,
"step": 8750
},
{
"epoch": 1.2717770034843205,
"grad_norm": 0.743302047252655,
"learning_rate": 1.4686441543527374e-05,
"loss": 0.7294,
"step": 8760
},
{
"epoch": 1.2732288037166086,
"grad_norm": 1.441884160041809,
"learning_rate": 1.4634458499063536e-05,
"loss": 0.6125,
"step": 8770
},
{
"epoch": 1.2746806039488967,
"grad_norm": 0.13829253613948822,
"learning_rate": 1.458252951848051e-05,
"loss": 0.4259,
"step": 8780
},
{
"epoch": 1.2761324041811846,
"grad_norm": 1.4842077493667603,
"learning_rate": 1.4530654872625935e-05,
"loss": 0.5568,
"step": 8790
},
{
"epoch": 1.2775842044134726,
"grad_norm": 1.0749858617782593,
"learning_rate": 1.4478834832064026e-05,
"loss": 0.5374,
"step": 8800
},
{
"epoch": 1.2790360046457607,
"grad_norm": 8.395951271057129,
"learning_rate": 1.4427069667074184e-05,
"loss": 0.4693,
"step": 8810
},
{
"epoch": 1.2804878048780488,
"grad_norm": 0.30601173639297485,
"learning_rate": 1.4375359647649634e-05,
"loss": 0.3597,
"step": 8820
},
{
"epoch": 1.2819396051103369,
"grad_norm": 1.7369287014007568,
"learning_rate": 1.4323705043495938e-05,
"loss": 0.4448,
"step": 8830
},
{
"epoch": 1.2833914053426247,
"grad_norm": 1.2835052013397217,
"learning_rate": 1.4272106124029627e-05,
"loss": 0.7685,
"step": 8840
},
{
"epoch": 1.2848432055749128,
"grad_norm": 3.1556379795074463,
"learning_rate": 1.4220563158376832e-05,
"loss": 0.5719,
"step": 8850
},
{
"epoch": 1.286295005807201,
"grad_norm": 2.188831090927124,
"learning_rate": 1.4169076415371802e-05,
"loss": 0.5663,
"step": 8860
},
{
"epoch": 1.287746806039489,
"grad_norm": 2.645719051361084,
"learning_rate": 1.4117646163555565e-05,
"loss": 0.653,
"step": 8870
},
{
"epoch": 1.289198606271777,
"grad_norm": 1.1469491720199585,
"learning_rate": 1.4066272671174512e-05,
"loss": 0.5314,
"step": 8880
},
{
"epoch": 1.290650406504065,
"grad_norm": 1.8526806831359863,
"learning_rate": 1.4014956206178987e-05,
"loss": 0.4409,
"step": 8890
},
{
"epoch": 1.292102206736353,
"grad_norm": 0.9696226716041565,
"learning_rate": 1.3963697036221863e-05,
"loss": 0.7264,
"step": 8900
},
{
"epoch": 1.293554006968641,
"grad_norm": 2.522721529006958,
"learning_rate": 1.3912495428657236e-05,
"loss": 0.7832,
"step": 8910
},
{
"epoch": 1.2950058072009292,
"grad_norm": 1.7248927354812622,
"learning_rate": 1.3861351650538929e-05,
"loss": 0.548,
"step": 8920
},
{
"epoch": 1.2964576074332173,
"grad_norm": 0.5419870018959045,
"learning_rate": 1.3810265968619141e-05,
"loss": 0.6291,
"step": 8930
},
{
"epoch": 1.297909407665505,
"grad_norm": 0.7840960025787354,
"learning_rate": 1.3759238649347091e-05,
"loss": 0.5772,
"step": 8940
},
{
"epoch": 1.2993612078977932,
"grad_norm": 1.4585460424423218,
"learning_rate": 1.3708269958867565e-05,
"loss": 0.8735,
"step": 8950
},
{
"epoch": 1.3008130081300813,
"grad_norm": 3.5455801486968994,
"learning_rate": 1.3657360163019544e-05,
"loss": 0.6392,
"step": 8960
},
{
"epoch": 1.3022648083623694,
"grad_norm": 0.890296220779419,
"learning_rate": 1.3606509527334894e-05,
"loss": 0.853,
"step": 8970
},
{
"epoch": 1.3037166085946574,
"grad_norm": 2.1235806941986084,
"learning_rate": 1.3555718317036847e-05,
"loss": 0.6268,
"step": 8980
},
{
"epoch": 1.3051684088269453,
"grad_norm": 1.9171247482299805,
"learning_rate": 1.3504986797038715e-05,
"loss": 0.5688,
"step": 8990
},
{
"epoch": 1.3066202090592334,
"grad_norm": 2.7999086380004883,
"learning_rate": 1.3454315231942499e-05,
"loss": 0.5062,
"step": 9000
},
{
"epoch": 1.3066202090592334,
"eval_loss": 0.6196444034576416,
"eval_runtime": 107.7639,
"eval_samples_per_second": 13.455,
"eval_steps_per_second": 3.368,
"step": 9000
},
{
"epoch": 1.3080720092915215,
"grad_norm": 1.666410207748413,
"learning_rate": 1.3403703886037466e-05,
"loss": 0.7899,
"step": 9010
},
{
"epoch": 1.3095238095238095,
"grad_norm": 1.47067129611969,
"learning_rate": 1.3353153023298789e-05,
"loss": 0.5773,
"step": 9020
},
{
"epoch": 1.3109756097560976,
"grad_norm": 1.4645687341690063,
"learning_rate": 1.3302662907386222e-05,
"loss": 0.6352,
"step": 9030
},
{
"epoch": 1.3124274099883855,
"grad_norm": 1.135907530784607,
"learning_rate": 1.325223380164263e-05,
"loss": 0.5388,
"step": 9040
},
{
"epoch": 1.3138792102206736,
"grad_norm": 0.8413094282150269,
"learning_rate": 1.3201865969092686e-05,
"loss": 0.7493,
"step": 9050
},
{
"epoch": 1.3153310104529616,
"grad_norm": 1.01530921459198,
"learning_rate": 1.315155967244149e-05,
"loss": 0.4492,
"step": 9060
},
{
"epoch": 1.3167828106852497,
"grad_norm": 2.6221423149108887,
"learning_rate": 1.3101315174073162e-05,
"loss": 0.5208,
"step": 9070
},
{
"epoch": 1.3182346109175378,
"grad_norm": 5.264577865600586,
"learning_rate": 1.305113273604952e-05,
"loss": 0.4573,
"step": 9080
},
{
"epoch": 1.3196864111498257,
"grad_norm": 1.6519479751586914,
"learning_rate": 1.3001012620108693e-05,
"loss": 0.5216,
"step": 9090
},
{
"epoch": 1.321138211382114,
"grad_norm": 1.1643894910812378,
"learning_rate": 1.2950955087663741e-05,
"loss": 0.4458,
"step": 9100
},
{
"epoch": 1.3225900116144018,
"grad_norm": 1.967511534690857,
"learning_rate": 1.2900960399801292e-05,
"loss": 0.7898,
"step": 9110
},
{
"epoch": 1.32404181184669,
"grad_norm": 1.269264578819275,
"learning_rate": 1.2851028817280242e-05,
"loss": 0.5747,
"step": 9120
},
{
"epoch": 1.325493612078978,
"grad_norm": 1.0032755136489868,
"learning_rate": 1.2801160600530299e-05,
"loss": 0.5245,
"step": 9130
},
{
"epoch": 1.3269454123112658,
"grad_norm": 4.5418925285339355,
"learning_rate": 1.2751356009650681e-05,
"loss": 0.6442,
"step": 9140
},
{
"epoch": 1.3283972125435541,
"grad_norm": 1.1265850067138672,
"learning_rate": 1.270161530440878e-05,
"loss": 0.4234,
"step": 9150
},
{
"epoch": 1.329849012775842,
"grad_norm": 0.029596175998449326,
"learning_rate": 1.2651938744238745e-05,
"loss": 0.4876,
"step": 9160
},
{
"epoch": 1.33130081300813,
"grad_norm": 4.938312530517578,
"learning_rate": 1.2602326588240168e-05,
"loss": 0.5431,
"step": 9170
},
{
"epoch": 1.3327526132404182,
"grad_norm": 1.1647799015045166,
"learning_rate": 1.2552779095176737e-05,
"loss": 0.5084,
"step": 9180
},
{
"epoch": 1.334204413472706,
"grad_norm": 0.8059009313583374,
"learning_rate": 1.2503296523474883e-05,
"loss": 0.7431,
"step": 9190
},
{
"epoch": 1.3356562137049943,
"grad_norm": 1.5592460632324219,
"learning_rate": 1.245387913122239e-05,
"loss": 0.312,
"step": 9200
},
{
"epoch": 1.3371080139372822,
"grad_norm": 1.1873098611831665,
"learning_rate": 1.2404527176167124e-05,
"loss": 0.7229,
"step": 9210
},
{
"epoch": 1.3385598141695703,
"grad_norm": 1.4901853799819946,
"learning_rate": 1.2355240915715618e-05,
"loss": 0.538,
"step": 9220
},
{
"epoch": 1.3400116144018583,
"grad_norm": 1.5089656114578247,
"learning_rate": 1.2306020606931767e-05,
"loss": 0.5226,
"step": 9230
},
{
"epoch": 1.3414634146341464,
"grad_norm": 0.9845458269119263,
"learning_rate": 1.2256866506535497e-05,
"loss": 0.61,
"step": 9240
},
{
"epoch": 1.3429152148664345,
"grad_norm": 0.9404434561729431,
"learning_rate": 1.220777887090139e-05,
"loss": 0.5815,
"step": 9250
},
{
"epoch": 1.3443670150987224,
"grad_norm": 1.297400712966919,
"learning_rate": 1.2158757956057357e-05,
"loss": 0.5703,
"step": 9260
},
{
"epoch": 1.3458188153310104,
"grad_norm": 5.133298397064209,
"learning_rate": 1.2109804017683349e-05,
"loss": 0.3776,
"step": 9270
},
{
"epoch": 1.3472706155632985,
"grad_norm": 0.2664077579975128,
"learning_rate": 1.206091731110994e-05,
"loss": 0.4978,
"step": 9280
},
{
"epoch": 1.3487224157955866,
"grad_norm": 0.8112949132919312,
"learning_rate": 1.2012098091317083e-05,
"loss": 0.4887,
"step": 9290
},
{
"epoch": 1.3501742160278747,
"grad_norm": 1.9871488809585571,
"learning_rate": 1.1963346612932702e-05,
"loss": 0.7117,
"step": 9300
},
{
"epoch": 1.3516260162601625,
"grad_norm": 3.5719833374023438,
"learning_rate": 1.191466313023143e-05,
"loss": 0.568,
"step": 9310
},
{
"epoch": 1.3530778164924506,
"grad_norm": 1.1161819696426392,
"learning_rate": 1.1866047897133223e-05,
"loss": 0.4455,
"step": 9320
},
{
"epoch": 1.3545296167247387,
"grad_norm": 1.2592240571975708,
"learning_rate": 1.1817501167202099e-05,
"loss": 0.5396,
"step": 9330
},
{
"epoch": 1.3559814169570268,
"grad_norm": 8.793890953063965,
"learning_rate": 1.1769023193644757e-05,
"loss": 0.5515,
"step": 9340
},
{
"epoch": 1.3574332171893149,
"grad_norm": 1.0319164991378784,
"learning_rate": 1.1720614229309277e-05,
"loss": 0.62,
"step": 9350
},
{
"epoch": 1.3588850174216027,
"grad_norm": 1.9891750812530518,
"learning_rate": 1.1672274526683835e-05,
"loss": 0.5769,
"step": 9360
},
{
"epoch": 1.3603368176538908,
"grad_norm": 3.4943082332611084,
"learning_rate": 1.162400433789533e-05,
"loss": 0.6463,
"step": 9370
},
{
"epoch": 1.3617886178861789,
"grad_norm": 1.8810696601867676,
"learning_rate": 1.1575803914708096e-05,
"loss": 0.7964,
"step": 9380
},
{
"epoch": 1.363240418118467,
"grad_norm": 1.418583869934082,
"learning_rate": 1.1527673508522604e-05,
"loss": 0.428,
"step": 9390
},
{
"epoch": 1.364692218350755,
"grad_norm": 4.40504264831543,
"learning_rate": 1.1479613370374136e-05,
"loss": 0.6119,
"step": 9400
},
{
"epoch": 1.366144018583043,
"grad_norm": 2.4559905529022217,
"learning_rate": 1.143162375093145e-05,
"loss": 0.5134,
"step": 9410
},
{
"epoch": 1.367595818815331,
"grad_norm": 3.8561477661132812,
"learning_rate": 1.1383704900495529e-05,
"loss": 0.4626,
"step": 9420
},
{
"epoch": 1.369047619047619,
"grad_norm": 1.6356045007705688,
"learning_rate": 1.1335857068998221e-05,
"loss": 0.5223,
"step": 9430
},
{
"epoch": 1.3704994192799071,
"grad_norm": 1.7519195079803467,
"learning_rate": 1.1288080506000955e-05,
"loss": 0.641,
"step": 9440
},
{
"epoch": 1.3719512195121952,
"grad_norm": 0.4097733199596405,
"learning_rate": 1.1240375460693475e-05,
"loss": 0.5781,
"step": 9450
},
{
"epoch": 1.373403019744483,
"grad_norm": 2.5884532928466797,
"learning_rate": 1.119274218189247e-05,
"loss": 0.5514,
"step": 9460
},
{
"epoch": 1.3748548199767712,
"grad_norm": 1.4594874382019043,
"learning_rate": 1.1145180918040332e-05,
"loss": 0.7619,
"step": 9470
},
{
"epoch": 1.3763066202090593,
"grad_norm": 7.807918548583984,
"learning_rate": 1.109769191720384e-05,
"loss": 0.3226,
"step": 9480
},
{
"epoch": 1.3777584204413473,
"grad_norm": 0.6364027261734009,
"learning_rate": 1.1050275427072884e-05,
"loss": 0.5776,
"step": 9490
},
{
"epoch": 1.3792102206736354,
"grad_norm": 0.4011842608451843,
"learning_rate": 1.1002931694959131e-05,
"loss": 0.4091,
"step": 9500
},
{
"epoch": 1.3806620209059233,
"grad_norm": 5.032822132110596,
"learning_rate": 1.0955660967794768e-05,
"loss": 0.5523,
"step": 9510
},
{
"epoch": 1.3821138211382114,
"grad_norm": 3.3209786415100098,
"learning_rate": 1.0908463492131227e-05,
"loss": 0.5782,
"step": 9520
},
{
"epoch": 1.3835656213704994,
"grad_norm": 0.4670596718788147,
"learning_rate": 1.086133951413785e-05,
"loss": 0.6112,
"step": 9530
},
{
"epoch": 1.3850174216027875,
"grad_norm": 6.041258335113525,
"learning_rate": 1.081428927960067e-05,
"loss": 0.6415,
"step": 9540
},
{
"epoch": 1.3864692218350756,
"grad_norm": 2.76751446723938,
"learning_rate": 1.0767313033921067e-05,
"loss": 0.3524,
"step": 9550
},
{
"epoch": 1.3879210220673635,
"grad_norm": 2.8424673080444336,
"learning_rate": 1.0720411022114512e-05,
"loss": 0.6496,
"step": 9560
},
{
"epoch": 1.3893728222996515,
"grad_norm": 1.2790861129760742,
"learning_rate": 1.0673583488809321e-05,
"loss": 0.6281,
"step": 9570
},
{
"epoch": 1.3908246225319396,
"grad_norm": 2.5029184818267822,
"learning_rate": 1.0626830678245329e-05,
"loss": 0.6078,
"step": 9580
},
{
"epoch": 1.3922764227642277,
"grad_norm": 1.0946515798568726,
"learning_rate": 1.0580152834272622e-05,
"loss": 0.5256,
"step": 9590
},
{
"epoch": 1.3937282229965158,
"grad_norm": 1.5489437580108643,
"learning_rate": 1.0533550200350314e-05,
"loss": 0.6867,
"step": 9600
},
{
"epoch": 1.3951800232288036,
"grad_norm": 1.4204350709915161,
"learning_rate": 1.0487023019545235e-05,
"loss": 0.6683,
"step": 9610
},
{
"epoch": 1.3966318234610917,
"grad_norm": 1.674791932106018,
"learning_rate": 1.044057153453066e-05,
"loss": 0.7691,
"step": 9620
},
{
"epoch": 1.3980836236933798,
"grad_norm": 2.3372557163238525,
"learning_rate": 1.039419598758505e-05,
"loss": 0.5875,
"step": 9630
},
{
"epoch": 1.3995354239256679,
"grad_norm": 4.951801300048828,
"learning_rate": 1.0347896620590819e-05,
"loss": 0.4327,
"step": 9640
},
{
"epoch": 1.400987224157956,
"grad_norm": 1.4369560480117798,
"learning_rate": 1.0301673675033017e-05,
"loss": 0.4592,
"step": 9650
},
{
"epoch": 1.4024390243902438,
"grad_norm": 1.2974849939346313,
"learning_rate": 1.025552739199813e-05,
"loss": 0.5833,
"step": 9660
},
{
"epoch": 1.403890824622532,
"grad_norm": 1.037194848060608,
"learning_rate": 1.0209458012172768e-05,
"loss": 0.4698,
"step": 9670
},
{
"epoch": 1.40534262485482,
"grad_norm": 2.5829808712005615,
"learning_rate": 1.016346577584244e-05,
"loss": 0.5585,
"step": 9680
},
{
"epoch": 1.406794425087108,
"grad_norm": 2.260946273803711,
"learning_rate": 1.0117550922890307e-05,
"loss": 0.6017,
"step": 9690
},
{
"epoch": 1.4082462253193961,
"grad_norm": 1.81033194065094,
"learning_rate": 1.0071713692795918e-05,
"loss": 0.6426,
"step": 9700
},
{
"epoch": 1.409698025551684,
"grad_norm": 1.978293776512146,
"learning_rate": 1.0025954324633948e-05,
"loss": 0.4709,
"step": 9710
},
{
"epoch": 1.411149825783972,
"grad_norm": 1.209401249885559,
"learning_rate": 9.980273057072968e-06,
"loss": 0.4459,
"step": 9720
},
{
"epoch": 1.4126016260162602,
"grad_norm": 1.3207520246505737,
"learning_rate": 9.934670128374212e-06,
"loss": 0.3628,
"step": 9730
},
{
"epoch": 1.4140534262485482,
"grad_norm": 0.9167854189872742,
"learning_rate": 9.889145776390308e-06,
"loss": 0.5037,
"step": 9740
},
{
"epoch": 1.4155052264808363,
"grad_norm": 2.53662109375,
"learning_rate": 9.843700238564035e-06,
"loss": 0.4758,
"step": 9750
},
{
"epoch": 1.4169570267131242,
"grad_norm": 2.7502434253692627,
"learning_rate": 9.798333751927139e-06,
"loss": 0.6707,
"step": 9760
},
{
"epoch": 1.4184088269454123,
"grad_norm": 1.7120157480239868,
"learning_rate": 9.753046553099007e-06,
"loss": 0.7902,
"step": 9770
},
{
"epoch": 1.4198606271777003,
"grad_norm": 2.2297070026397705,
"learning_rate": 9.707838878285527e-06,
"loss": 0.7242,
"step": 9780
},
{
"epoch": 1.4213124274099884,
"grad_norm": 2.1308581829071045,
"learning_rate": 9.662710963277783e-06,
"loss": 0.5492,
"step": 9790
},
{
"epoch": 1.4227642276422765,
"grad_norm": 7.506939888000488,
"learning_rate": 9.617663043450847e-06,
"loss": 0.469,
"step": 9800
},
{
"epoch": 1.4242160278745644,
"grad_norm": 7.771796703338623,
"learning_rate": 9.572695353762584e-06,
"loss": 0.4342,
"step": 9810
},
{
"epoch": 1.4256678281068524,
"grad_norm": 5.498221397399902,
"learning_rate": 9.527808128752397e-06,
"loss": 0.6446,
"step": 9820
},
{
"epoch": 1.4271196283391405,
"grad_norm": 3.744574785232544,
"learning_rate": 9.483001602539984e-06,
"loss": 0.4798,
"step": 9830
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.2909547090530396,
"learning_rate": 9.43827600882415e-06,
"loss": 0.4513,
"step": 9840
},
{
"epoch": 1.4300232288037167,
"grad_norm": 1.5674768686294556,
"learning_rate": 9.393631580881596e-06,
"loss": 0.3784,
"step": 9850
},
{
"epoch": 1.4314750290360045,
"grad_norm": 0.9246693253517151,
"learning_rate": 9.349068551565649e-06,
"loss": 0.3879,
"step": 9860
},
{
"epoch": 1.4329268292682926,
"grad_norm": 4.146523952484131,
"learning_rate": 9.304587153305122e-06,
"loss": 0.4375,
"step": 9870
},
{
"epoch": 1.4343786295005807,
"grad_norm": 0.4749496579170227,
"learning_rate": 9.260187618103036e-06,
"loss": 0.6098,
"step": 9880
},
{
"epoch": 1.4358304297328688,
"grad_norm": 0.9822236895561218,
"learning_rate": 9.215870177535433e-06,
"loss": 0.6339,
"step": 9890
},
{
"epoch": 1.4372822299651569,
"grad_norm": 1.2814334630966187,
"learning_rate": 9.171635062750189e-06,
"loss": 0.8344,
"step": 9900
},
{
"epoch": 1.4387340301974447,
"grad_norm": 2.548846483230591,
"learning_rate": 9.127482504465792e-06,
"loss": 0.5309,
"step": 9910
},
{
"epoch": 1.4401858304297328,
"grad_norm": 2.3129475116729736,
"learning_rate": 9.083412732970123e-06,
"loss": 0.6082,
"step": 9920
},
{
"epoch": 1.4416376306620209,
"grad_norm": 1.459028959274292,
"learning_rate": 9.039425978119267e-06,
"loss": 0.5144,
"step": 9930
},
{
"epoch": 1.443089430894309,
"grad_norm": 2.7794339656829834,
"learning_rate": 8.995522469336337e-06,
"loss": 0.6107,
"step": 9940
},
{
"epoch": 1.444541231126597,
"grad_norm": 1.3913737535476685,
"learning_rate": 8.951702435610244e-06,
"loss": 0.5444,
"step": 9950
},
{
"epoch": 1.445993031358885,
"grad_norm": 1.145751714706421,
"learning_rate": 8.907966105494498e-06,
"loss": 0.5478,
"step": 9960
},
{
"epoch": 1.447444831591173,
"grad_norm": 1.6435590982437134,
"learning_rate": 8.864313707106075e-06,
"loss": 0.5803,
"step": 9970
},
{
"epoch": 1.448896631823461,
"grad_norm": 2.8100969791412354,
"learning_rate": 8.820745468124144e-06,
"loss": 0.6449,
"step": 9980
},
{
"epoch": 1.4503484320557491,
"grad_norm": 0.4345937967300415,
"learning_rate": 8.777261615788956e-06,
"loss": 0.7335,
"step": 9990
},
{
"epoch": 1.4518002322880372,
"grad_norm": 3.7761106491088867,
"learning_rate": 8.733862376900597e-06,
"loss": 0.4368,
"step": 10000
},
{
"epoch": 1.453252032520325,
"grad_norm": 0.9145069718360901,
"learning_rate": 8.690547977817839e-06,
"loss": 0.6349,
"step": 10010
},
{
"epoch": 1.4547038327526132,
"grad_norm": 0.9794019460678101,
"learning_rate": 8.64731864445696e-06,
"loss": 0.5048,
"step": 10020
},
{
"epoch": 1.4561556329849012,
"grad_norm": 2.5523462295532227,
"learning_rate": 8.604174602290563e-06,
"loss": 0.5025,
"step": 10030
},
{
"epoch": 1.4576074332171893,
"grad_norm": 1.2542840242385864,
"learning_rate": 8.561116076346377e-06,
"loss": 0.342,
"step": 10040
},
{
"epoch": 1.4590592334494774,
"grad_norm": 4.584123611450195,
"learning_rate": 8.518143291206099e-06,
"loss": 0.5593,
"step": 10050
},
{
"epoch": 1.4605110336817653,
"grad_norm": 2.7633087635040283,
"learning_rate": 8.475256471004259e-06,
"loss": 0.5616,
"step": 10060
},
{
"epoch": 1.4619628339140534,
"grad_norm": 4.123738765716553,
"learning_rate": 8.43245583942698e-06,
"loss": 0.5572,
"step": 10070
},
{
"epoch": 1.4634146341463414,
"grad_norm": 6.2447428703308105,
"learning_rate": 8.389741619710855e-06,
"loss": 0.3971,
"step": 10080
},
{
"epoch": 1.4648664343786295,
"grad_norm": 1.1899082660675049,
"learning_rate": 8.347114034641806e-06,
"loss": 0.333,
"step": 10090
},
{
"epoch": 1.4663182346109176,
"grad_norm": 5.325255393981934,
"learning_rate": 8.304573306553846e-06,
"loss": 0.4626,
"step": 10100
},
{
"epoch": 1.4677700348432055,
"grad_norm": 2.715012788772583,
"learning_rate": 8.262119657327996e-06,
"loss": 0.4834,
"step": 10110
},
{
"epoch": 1.4692218350754935,
"grad_norm": 1.1475021839141846,
"learning_rate": 8.219753308391101e-06,
"loss": 0.5551,
"step": 10120
},
{
"epoch": 1.4706736353077816,
"grad_norm": 7.364482402801514,
"learning_rate": 8.17747448071465e-06,
"loss": 0.5282,
"step": 10130
},
{
"epoch": 1.4721254355400697,
"grad_norm": 1.1067121028900146,
"learning_rate": 8.135283394813651e-06,
"loss": 0.569,
"step": 10140
},
{
"epoch": 1.4735772357723578,
"grad_norm": 1.5818873643875122,
"learning_rate": 8.093180270745485e-06,
"loss": 0.5892,
"step": 10150
},
{
"epoch": 1.4750290360046456,
"grad_norm": 2.0473148822784424,
"learning_rate": 8.05116532810874e-06,
"loss": 0.8704,
"step": 10160
},
{
"epoch": 1.476480836236934,
"grad_norm": 1.3639038801193237,
"learning_rate": 8.009238786042062e-06,
"loss": 0.517,
"step": 10170
},
{
"epoch": 1.4779326364692218,
"grad_norm": 1.7621372938156128,
"learning_rate": 7.967400863223051e-06,
"loss": 0.469,
"step": 10180
},
{
"epoch": 1.4793844367015099,
"grad_norm": 1.9453188180923462,
"learning_rate": 7.925651777867068e-06,
"loss": 0.5911,
"step": 10190
},
{
"epoch": 1.480836236933798,
"grad_norm": 1.6738969087600708,
"learning_rate": 7.883991747726127e-06,
"loss": 0.5271,
"step": 10200
},
{
"epoch": 1.4822880371660858,
"grad_norm": 1.6197839975357056,
"learning_rate": 7.842420990087774e-06,
"loss": 0.5143,
"step": 10210
},
{
"epoch": 1.4837398373983741,
"grad_norm": 1.3254222869873047,
"learning_rate": 7.800939721773893e-06,
"loss": 0.5526,
"step": 10220
},
{
"epoch": 1.485191637630662,
"grad_norm": 2.3349244594573975,
"learning_rate": 7.759548159139654e-06,
"loss": 0.557,
"step": 10230
},
{
"epoch": 1.48664343786295,
"grad_norm": 1.9867162704467773,
"learning_rate": 7.718246518072341e-06,
"loss": 0.4553,
"step": 10240
},
{
"epoch": 1.4880952380952381,
"grad_norm": 0.5736078023910522,
"learning_rate": 7.677035013990211e-06,
"loss": 0.6118,
"step": 10250
},
{
"epoch": 1.489547038327526,
"grad_norm": 2.2709176540374756,
"learning_rate": 7.635913861841395e-06,
"loss": 0.7102,
"step": 10260
},
{
"epoch": 1.4909988385598143,
"grad_norm": 0.5769612789154053,
"learning_rate": 7.594883276102799e-06,
"loss": 0.639,
"step": 10270
},
{
"epoch": 1.4924506387921022,
"grad_norm": 2.3885700702667236,
"learning_rate": 7.5539434707789266e-06,
"loss": 0.767,
"step": 10280
},
{
"epoch": 1.4939024390243902,
"grad_norm": 2.5631144046783447,
"learning_rate": 7.513094659400802e-06,
"loss": 0.557,
"step": 10290
},
{
"epoch": 1.4953542392566783,
"grad_norm": 1.6625310182571411,
"learning_rate": 7.47233705502487e-06,
"loss": 0.419,
"step": 10300
},
{
"epoch": 1.4968060394889664,
"grad_norm": 3.3970789909362793,
"learning_rate": 7.431670870231844e-06,
"loss": 0.4773,
"step": 10310
},
{
"epoch": 1.4982578397212545,
"grad_norm": 2.158837080001831,
"learning_rate": 7.391096317125607e-06,
"loss": 0.5095,
"step": 10320
},
{
"epoch": 1.4997096399535423,
"grad_norm": 2.132723569869995,
"learning_rate": 7.350613607332163e-06,
"loss": 0.582,
"step": 10330
},
{
"epoch": 1.5011614401858304,
"grad_norm": 3.694959878921509,
"learning_rate": 7.310222951998438e-06,
"loss": 0.3228,
"step": 10340
},
{
"epoch": 1.5026132404181185,
"grad_norm": 3.7945165634155273,
"learning_rate": 7.269924561791236e-06,
"loss": 0.5246,
"step": 10350
},
{
"epoch": 1.5040650406504064,
"grad_norm": 1.9424091577529907,
"learning_rate": 7.2297186468961554e-06,
"loss": 0.6539,
"step": 10360
},
{
"epoch": 1.5055168408826947,
"grad_norm": 1.384211540222168,
"learning_rate": 7.189605417016443e-06,
"loss": 0.5089,
"step": 10370
},
{
"epoch": 1.5069686411149825,
"grad_norm": 1.18372642993927,
"learning_rate": 7.149585081371923e-06,
"loss": 0.624,
"step": 10380
},
{
"epoch": 1.5084204413472706,
"grad_norm": 2.478210926055908,
"learning_rate": 7.109657848697937e-06,
"loss": 0.5944,
"step": 10390
},
{
"epoch": 1.5098722415795587,
"grad_norm": 1.7582294940948486,
"learning_rate": 7.0698239272441985e-06,
"loss": 0.3679,
"step": 10400
},
{
"epoch": 1.5113240418118465,
"grad_norm": 3.0840678215026855,
"learning_rate": 7.03008352477374e-06,
"loss": 0.6691,
"step": 10410
},
{
"epoch": 1.5127758420441348,
"grad_norm": 5.244002342224121,
"learning_rate": 6.99043684856184e-06,
"loss": 0.5773,
"step": 10420
},
{
"epoch": 1.5142276422764227,
"grad_norm": 2.812211513519287,
"learning_rate": 6.950884105394903e-06,
"loss": 0.4341,
"step": 10430
},
{
"epoch": 1.5156794425087108,
"grad_norm": 0.9920812845230103,
"learning_rate": 6.911425501569418e-06,
"loss": 0.5441,
"step": 10440
},
{
"epoch": 1.5171312427409989,
"grad_norm": 0.8474797606468201,
"learning_rate": 6.872061242890882e-06,
"loss": 0.7427,
"step": 10450
},
{
"epoch": 1.5185830429732867,
"grad_norm": 1.2484221458435059,
"learning_rate": 6.8327915346726806e-06,
"loss": 0.5319,
"step": 10460
},
{
"epoch": 1.520034843205575,
"grad_norm": 2.2322065830230713,
"learning_rate": 6.793616581735062e-06,
"loss": 0.7047,
"step": 10470
},
{
"epoch": 1.5214866434378629,
"grad_norm": 3.255192756652832,
"learning_rate": 6.754536588404078e-06,
"loss": 0.5605,
"step": 10480
},
{
"epoch": 1.522938443670151,
"grad_norm": 2.065782308578491,
"learning_rate": 6.715551758510469e-06,
"loss": 0.609,
"step": 10490
},
{
"epoch": 1.524390243902439,
"grad_norm": 1.5074211359024048,
"learning_rate": 6.676662295388631e-06,
"loss": 0.4149,
"step": 10500
},
{
"epoch": 1.525842044134727,
"grad_norm": 1.3542487621307373,
"learning_rate": 6.637868401875577e-06,
"loss": 0.4952,
"step": 10510
},
{
"epoch": 1.5272938443670152,
"grad_norm": 6.184685230255127,
"learning_rate": 6.599170280309824e-06,
"loss": 0.6942,
"step": 10520
},
{
"epoch": 1.528745644599303,
"grad_norm": 1.49580979347229,
"learning_rate": 6.560568132530376e-06,
"loss": 0.5696,
"step": 10530
},
{
"epoch": 1.5301974448315911,
"grad_norm": 1.4806469678878784,
"learning_rate": 6.522062159875692e-06,
"loss": 0.6504,
"step": 10540
},
{
"epoch": 1.5316492450638792,
"grad_norm": 2.461064577102661,
"learning_rate": 6.4836525631825714e-06,
"loss": 0.5862,
"step": 10550
},
{
"epoch": 1.533101045296167,
"grad_norm": 1.635206937789917,
"learning_rate": 6.4453395427851475e-06,
"loss": 0.5664,
"step": 10560
},
{
"epoch": 1.5345528455284554,
"grad_norm": 2.978720188140869,
"learning_rate": 6.407123298513865e-06,
"loss": 0.6014,
"step": 10570
},
{
"epoch": 1.5360046457607432,
"grad_norm": 3.055194854736328,
"learning_rate": 6.369004029694378e-06,
"loss": 0.5824,
"step": 10580
},
{
"epoch": 1.5374564459930313,
"grad_norm": 2.0090768337249756,
"learning_rate": 6.330981935146555e-06,
"loss": 0.7431,
"step": 10590
},
{
"epoch": 1.5389082462253194,
"grad_norm": 3.167921781539917,
"learning_rate": 6.29305721318344e-06,
"loss": 0.4873,
"step": 10600
},
{
"epoch": 1.5403600464576073,
"grad_norm": 2.447772264480591,
"learning_rate": 6.25523006161019e-06,
"loss": 0.6174,
"step": 10610
},
{
"epoch": 1.5418118466898956,
"grad_norm": 1.068217396736145,
"learning_rate": 6.217500677723065e-06,
"loss": 0.6131,
"step": 10620
},
{
"epoch": 1.5432636469221834,
"grad_norm": 3.059321403503418,
"learning_rate": 6.179869258308407e-06,
"loss": 0.5651,
"step": 10630
},
{
"epoch": 1.5447154471544715,
"grad_norm": 1.0179533958435059,
"learning_rate": 6.142335999641599e-06,
"loss": 0.8561,
"step": 10640
},
{
"epoch": 1.5461672473867596,
"grad_norm": 3.0670573711395264,
"learning_rate": 6.104901097486024e-06,
"loss": 0.4205,
"step": 10650
},
{
"epoch": 1.5476190476190477,
"grad_norm": 1.0629135370254517,
"learning_rate": 6.067564747092094e-06,
"loss": 0.7445,
"step": 10660
},
{
"epoch": 1.5490708478513358,
"grad_norm": 1.5961717367172241,
"learning_rate": 6.030327143196179e-06,
"loss": 0.6035,
"step": 10670
},
{
"epoch": 1.5505226480836236,
"grad_norm": 2.1358516216278076,
"learning_rate": 5.993188480019615e-06,
"loss": 0.3647,
"step": 10680
},
{
"epoch": 1.5519744483159117,
"grad_norm": 3.7955398559570312,
"learning_rate": 5.956148951267706e-06,
"loss": 0.4885,
"step": 10690
},
{
"epoch": 1.5534262485481998,
"grad_norm": 1.8019474744796753,
"learning_rate": 5.919208750128685e-06,
"loss": 0.4086,
"step": 10700
},
{
"epoch": 1.5548780487804879,
"grad_norm": 0.8065319061279297,
"learning_rate": 5.882368069272709e-06,
"loss": 0.6092,
"step": 10710
},
{
"epoch": 1.556329849012776,
"grad_norm": 1.8280988931655884,
"learning_rate": 5.8456271008508955e-06,
"loss": 0.583,
"step": 10720
},
{
"epoch": 1.5577816492450638,
"grad_norm": 2.872685670852661,
"learning_rate": 5.808986036494254e-06,
"loss": 0.3497,
"step": 10730
},
{
"epoch": 1.5592334494773519,
"grad_norm": 2.1516687870025635,
"learning_rate": 5.772445067312729e-06,
"loss": 0.4461,
"step": 10740
},
{
"epoch": 1.56068524970964,
"grad_norm": 0.9675107598304749,
"learning_rate": 5.736004383894231e-06,
"loss": 0.8109,
"step": 10750
},
{
"epoch": 1.562137049941928,
"grad_norm": 3.2056965827941895,
"learning_rate": 5.69966417630356e-06,
"loss": 0.7312,
"step": 10760
},
{
"epoch": 1.5635888501742161,
"grad_norm": 1.4558873176574707,
"learning_rate": 5.663424634081474e-06,
"loss": 0.5516,
"step": 10770
},
{
"epoch": 1.565040650406504,
"grad_norm": 1.376585602760315,
"learning_rate": 5.62728594624371e-06,
"loss": 0.3938,
"step": 10780
},
{
"epoch": 1.566492450638792,
"grad_norm": 1.4354243278503418,
"learning_rate": 5.59124830127995e-06,
"loss": 0.6572,
"step": 10790
},
{
"epoch": 1.5679442508710801,
"grad_norm": 2.6437528133392334,
"learning_rate": 5.555311887152867e-06,
"loss": 0.4434,
"step": 10800
},
{
"epoch": 1.5693960511033682,
"grad_norm": 2.039637327194214,
"learning_rate": 5.5194768912971565e-06,
"loss": 0.4561,
"step": 10810
},
{
"epoch": 1.5708478513356563,
"grad_norm": 1.7926547527313232,
"learning_rate": 5.483743500618529e-06,
"loss": 0.7296,
"step": 10820
},
{
"epoch": 1.5722996515679442,
"grad_norm": 2.8525867462158203,
"learning_rate": 5.448111901492747e-06,
"loss": 0.5546,
"step": 10830
},
{
"epoch": 1.5737514518002322,
"grad_norm": 1.2368862628936768,
"learning_rate": 5.412582279764669e-06,
"loss": 0.5491,
"step": 10840
},
{
"epoch": 1.5752032520325203,
"grad_norm": 2.139909505844116,
"learning_rate": 5.377154820747271e-06,
"loss": 0.5243,
"step": 10850
},
{
"epoch": 1.5766550522648084,
"grad_norm": 1.4064335823059082,
"learning_rate": 5.341829709220647e-06,
"loss": 0.9336,
"step": 10860
},
{
"epoch": 1.5781068524970965,
"grad_norm": 3.3268957138061523,
"learning_rate": 5.306607129431107e-06,
"loss": 0.57,
"step": 10870
},
{
"epoch": 1.5795586527293843,
"grad_norm": 5.089993000030518,
"learning_rate": 5.271487265090163e-06,
"loss": 0.5028,
"step": 10880
},
{
"epoch": 1.5810104529616724,
"grad_norm": 1.5383248329162598,
"learning_rate": 5.236470299373589e-06,
"loss": 0.4664,
"step": 10890
},
{
"epoch": 1.5824622531939605,
"grad_norm": 1.7498929500579834,
"learning_rate": 5.201556414920486e-06,
"loss": 0.8543,
"step": 10900
},
{
"epoch": 1.5839140534262486,
"grad_norm": 1.0989433526992798,
"learning_rate": 5.1667457938322925e-06,
"loss": 0.4634,
"step": 10910
},
{
"epoch": 1.5853658536585367,
"grad_norm": 1.1718672513961792,
"learning_rate": 5.1320386176718555e-06,
"loss": 0.2643,
"step": 10920
},
{
"epoch": 1.5868176538908245,
"grad_norm": 2.5529532432556152,
"learning_rate": 5.097435067462497e-06,
"loss": 0.7085,
"step": 10930
},
{
"epoch": 1.5882694541231128,
"grad_norm": 0.7183501124382019,
"learning_rate": 5.0629353236870375e-06,
"loss": 0.6274,
"step": 10940
},
{
"epoch": 1.5897212543554007,
"grad_norm": 0.3472674787044525,
"learning_rate": 5.02853956628686e-06,
"loss": 0.6934,
"step": 10950
},
{
"epoch": 1.5911730545876888,
"grad_norm": 1.4065948724746704,
"learning_rate": 4.994247974661026e-06,
"loss": 0.7115,
"step": 10960
},
{
"epoch": 1.5926248548199768,
"grad_norm": 0.9510209560394287,
"learning_rate": 4.960060727665255e-06,
"loss": 0.5962,
"step": 10970
},
{
"epoch": 1.5940766550522647,
"grad_norm": 3.3892316818237305,
"learning_rate": 4.92597800361104e-06,
"loss": 0.4962,
"step": 10980
},
{
"epoch": 1.595528455284553,
"grad_norm": 3.7970123291015625,
"learning_rate": 4.891999980264728e-06,
"loss": 0.537,
"step": 10990
},
{
"epoch": 1.5969802555168409,
"grad_norm": 1.8874022960662842,
"learning_rate": 4.85812683484656e-06,
"loss": 0.6591,
"step": 11000
},
{
"epoch": 1.598432055749129,
"grad_norm": 3.3695411682128906,
"learning_rate": 4.824358744029761e-06,
"loss": 0.4808,
"step": 11010
},
{
"epoch": 1.599883855981417,
"grad_norm": 4.303611755371094,
"learning_rate": 4.790695883939633e-06,
"loss": 0.4313,
"step": 11020
},
{
"epoch": 1.6013356562137049,
"grad_norm": 2.4233243465423584,
"learning_rate": 4.757138430152608e-06,
"loss": 0.4927,
"step": 11030
},
{
"epoch": 1.6027874564459932,
"grad_norm": 1.4356447458267212,
"learning_rate": 4.72368655769535e-06,
"loss": 0.4185,
"step": 11040
},
{
"epoch": 1.604239256678281,
"grad_norm": 5.6396636962890625,
"learning_rate": 4.690340441043847e-06,
"loss": 0.5059,
"step": 11050
},
{
"epoch": 1.6056910569105691,
"grad_norm": 0.8661177754402161,
"learning_rate": 4.6571002541224955e-06,
"loss": 0.6568,
"step": 11060
},
{
"epoch": 1.6071428571428572,
"grad_norm": 8.649468421936035,
"learning_rate": 4.623966170303171e-06,
"loss": 0.4749,
"step": 11070
},
{
"epoch": 1.608594657375145,
"grad_norm": 0.2540145516395569,
"learning_rate": 4.590938362404368e-06,
"loss": 0.6654,
"step": 11080
},
{
"epoch": 1.6100464576074334,
"grad_norm": 2.6379213333129883,
"learning_rate": 4.558017002690257e-06,
"loss": 0.5673,
"step": 11090
},
{
"epoch": 1.6114982578397212,
"grad_norm": 0.4164746403694153,
"learning_rate": 4.525202262869804e-06,
"loss": 0.4536,
"step": 11100
},
{
"epoch": 1.6129500580720093,
"grad_norm": 0.745841920375824,
"learning_rate": 4.492494314095891e-06,
"loss": 0.5186,
"step": 11110
},
{
"epoch": 1.6144018583042974,
"grad_norm": 2.1661605834960938,
"learning_rate": 4.45989332696439e-06,
"loss": 0.67,
"step": 11120
},
{
"epoch": 1.6158536585365852,
"grad_norm": 1.8644750118255615,
"learning_rate": 4.427399471513288e-06,
"loss": 0.6665,
"step": 11130
},
{
"epoch": 1.6173054587688735,
"grad_norm": 0.8033472299575806,
"learning_rate": 4.395012917221825e-06,
"loss": 0.6176,
"step": 11140
},
{
"epoch": 1.6187572590011614,
"grad_norm": 1.7303358316421509,
"learning_rate": 4.362733833009558e-06,
"loss": 0.4351,
"step": 11150
},
{
"epoch": 1.6202090592334495,
"grad_norm": 5.510407447814941,
"learning_rate": 4.330562387235512e-06,
"loss": 0.7516,
"step": 11160
},
{
"epoch": 1.6216608594657376,
"grad_norm": 0.6413615942001343,
"learning_rate": 4.298498747697335e-06,
"loss": 0.3923,
"step": 11170
},
{
"epoch": 1.6231126596980254,
"grad_norm": 1.7095063924789429,
"learning_rate": 4.266543081630347e-06,
"loss": 0.3482,
"step": 11180
},
{
"epoch": 1.6245644599303137,
"grad_norm": 0.7411553859710693,
"learning_rate": 4.234695555706714e-06,
"loss": 0.3467,
"step": 11190
},
{
"epoch": 1.6260162601626016,
"grad_norm": 6.189662456512451,
"learning_rate": 4.202956336034591e-06,
"loss": 0.6474,
"step": 11200
},
{
"epoch": 1.6274680603948897,
"grad_norm": 0.5904057025909424,
"learning_rate": 4.171325588157218e-06,
"loss": 0.3935,
"step": 11210
},
{
"epoch": 1.6289198606271778,
"grad_norm": 1.5648072957992554,
"learning_rate": 4.139803477052076e-06,
"loss": 0.6161,
"step": 11220
},
{
"epoch": 1.6303716608594656,
"grad_norm": 4.8867597579956055,
"learning_rate": 4.108390167130044e-06,
"loss": 0.5963,
"step": 11230
},
{
"epoch": 1.631823461091754,
"grad_norm": 0.865047037601471,
"learning_rate": 4.077085822234503e-06,
"loss": 0.4213,
"step": 11240
},
{
"epoch": 1.6332752613240418,
"grad_norm": 1.2489089965820312,
"learning_rate": 4.045890605640504e-06,
"loss": 0.4975,
"step": 11250
},
{
"epoch": 1.6347270615563299,
"grad_norm": 0.8895522952079773,
"learning_rate": 4.0148046800539265e-06,
"loss": 0.5152,
"step": 11260
},
{
"epoch": 1.636178861788618,
"grad_norm": 7.4556121826171875,
"learning_rate": 3.983828207610615e-06,
"loss": 0.8086,
"step": 11270
},
{
"epoch": 1.6376306620209058,
"grad_norm": 2.2906975746154785,
"learning_rate": 3.9529613498755165e-06,
"loss": 0.4963,
"step": 11280
},
{
"epoch": 1.639082462253194,
"grad_norm": 1.934874415397644,
"learning_rate": 3.922204267841889e-06,
"loss": 0.4317,
"step": 11290
},
{
"epoch": 1.640534262485482,
"grad_norm": 1.597822666168213,
"learning_rate": 3.8915571219304055e-06,
"loss": 0.5763,
"step": 11300
},
{
"epoch": 1.64198606271777,
"grad_norm": 1.507169485092163,
"learning_rate": 3.861020071988339e-06,
"loss": 0.4695,
"step": 11310
},
{
"epoch": 1.6434378629500581,
"grad_norm": 0.8798676133155823,
"learning_rate": 3.830593277288757e-06,
"loss": 0.4347,
"step": 11320
},
{
"epoch": 1.644889663182346,
"grad_norm": 2.6623973846435547,
"learning_rate": 3.800276896529642e-06,
"loss": 0.4887,
"step": 11330
},
{
"epoch": 1.6463414634146343,
"grad_norm": 1.0119774341583252,
"learning_rate": 3.7700710878330907e-06,
"loss": 0.4776,
"step": 11340
},
{
"epoch": 1.6477932636469221,
"grad_norm": 1.744946002960205,
"learning_rate": 3.7399760087444975e-06,
"loss": 0.3542,
"step": 11350
},
{
"epoch": 1.6492450638792102,
"grad_norm": 0.9083417057991028,
"learning_rate": 3.7099918162317114e-06,
"loss": 0.5441,
"step": 11360
},
{
"epoch": 1.6506968641114983,
"grad_norm": 1.0866427421569824,
"learning_rate": 3.680118666684218e-06,
"loss": 0.6087,
"step": 11370
},
{
"epoch": 1.6521486643437862,
"grad_norm": 1.4837919473648071,
"learning_rate": 3.6503567159123536e-06,
"loss": 0.5775,
"step": 11380
},
{
"epoch": 1.6536004645760745,
"grad_norm": 1.312999963760376,
"learning_rate": 3.6207061191464636e-06,
"loss": 0.6444,
"step": 11390
},
{
"epoch": 1.6550522648083623,
"grad_norm": 1.7816232442855835,
"learning_rate": 3.5911670310360882e-06,
"loss": 0.7579,
"step": 11400
},
{
"epoch": 1.6565040650406504,
"grad_norm": 5.434678077697754,
"learning_rate": 3.561739605649189e-06,
"loss": 0.5099,
"step": 11410
},
{
"epoch": 1.6579558652729385,
"grad_norm": 3.44694185256958,
"learning_rate": 3.532423996471307e-06,
"loss": 0.8014,
"step": 11420
},
{
"epoch": 1.6594076655052263,
"grad_norm": 1.121071219444275,
"learning_rate": 3.503220356404785e-06,
"loss": 0.7484,
"step": 11430
},
{
"epoch": 1.6608594657375146,
"grad_norm": 1.6010463237762451,
"learning_rate": 3.4741288377679732e-06,
"loss": 0.6689,
"step": 11440
},
{
"epoch": 1.6623112659698025,
"grad_norm": 2.2549779415130615,
"learning_rate": 3.4451495922944195e-06,
"loss": 0.5535,
"step": 11450
},
{
"epoch": 1.6637630662020906,
"grad_norm": 0.8929911255836487,
"learning_rate": 3.4162827711320788e-06,
"loss": 0.6548,
"step": 11460
},
{
"epoch": 1.6652148664343787,
"grad_norm": 0.8602511286735535,
"learning_rate": 3.3875285248425427e-06,
"loss": 0.4342,
"step": 11470
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.5482707023620605,
"learning_rate": 3.358887003400246e-06,
"loss": 0.5578,
"step": 11480
},
{
"epoch": 1.6681184668989548,
"grad_norm": 2.0349433422088623,
"learning_rate": 3.3303583561916624e-06,
"loss": 0.4982,
"step": 11490
},
{
"epoch": 1.6695702671312427,
"grad_norm": 0.9225724339485168,
"learning_rate": 3.3019427320145542e-06,
"loss": 0.4313,
"step": 11500
},
{
"epoch": 1.6710220673635308,
"grad_norm": 1.9111758470535278,
"learning_rate": 3.2736402790771948e-06,
"loss": 0.4434,
"step": 11510
},
{
"epoch": 1.6724738675958188,
"grad_norm": 3.7338778972625732,
"learning_rate": 3.245451144997569e-06,
"loss": 0.6636,
"step": 11520
},
{
"epoch": 1.6739256678281067,
"grad_norm": 1.3860654830932617,
"learning_rate": 3.2173754768026394e-06,
"loss": 0.5516,
"step": 11530
},
{
"epoch": 1.675377468060395,
"grad_norm": 1.1442302465438843,
"learning_rate": 3.189413420927545e-06,
"loss": 0.3753,
"step": 11540
},
{
"epoch": 1.6768292682926829,
"grad_norm": 1.216781497001648,
"learning_rate": 3.1615651232148547e-06,
"loss": 0.478,
"step": 11550
},
{
"epoch": 1.678281068524971,
"grad_norm": 1.9530525207519531,
"learning_rate": 3.1338307289138254e-06,
"loss": 0.5791,
"step": 11560
},
{
"epoch": 1.679732868757259,
"grad_norm": 1.5259203910827637,
"learning_rate": 3.1062103826796e-06,
"loss": 0.8218,
"step": 11570
},
{
"epoch": 1.6811846689895469,
"grad_norm": 1.0574325323104858,
"learning_rate": 3.078704228572485e-06,
"loss": 0.4432,
"step": 11580
},
{
"epoch": 1.6826364692218352,
"grad_norm": 1.1627357006072998,
"learning_rate": 3.0513124100571944e-06,
"loss": 0.4513,
"step": 11590
},
{
"epoch": 1.684088269454123,
"grad_norm": 1.7360894680023193,
"learning_rate": 3.0240350700021097e-06,
"loss": 0.4008,
"step": 11600
},
{
"epoch": 1.6855400696864111,
"grad_norm": 1.3201311826705933,
"learning_rate": 2.9968723506784953e-06,
"loss": 0.7546,
"step": 11610
},
{
"epoch": 1.6869918699186992,
"grad_norm": 2.6173791885375977,
"learning_rate": 2.9698243937598125e-06,
"loss": 0.587,
"step": 11620
},
{
"epoch": 1.688443670150987,
"grad_norm": 1.6192914247512817,
"learning_rate": 2.942891340320936e-06,
"loss": 0.5349,
"step": 11630
},
{
"epoch": 1.6898954703832754,
"grad_norm": 1.5096668004989624,
"learning_rate": 2.9160733308374347e-06,
"loss": 0.5358,
"step": 11640
},
{
"epoch": 1.6913472706155632,
"grad_norm": 2.530461072921753,
"learning_rate": 2.8893705051848546e-06,
"loss": 0.4036,
"step": 11650
},
{
"epoch": 1.6927990708478513,
"grad_norm": 1.12082040309906,
"learning_rate": 2.862783002637959e-06,
"loss": 0.6056,
"step": 11660
},
{
"epoch": 1.6942508710801394,
"grad_norm": 2.175119400024414,
"learning_rate": 2.836310961870012e-06,
"loss": 0.5726,
"step": 11670
},
{
"epoch": 1.6957026713124272,
"grad_norm": 1.1249805688858032,
"learning_rate": 2.8099545209520794e-06,
"loss": 0.6046,
"step": 11680
},
{
"epoch": 1.6971544715447155,
"grad_norm": 1.3748245239257812,
"learning_rate": 2.783713817352282e-06,
"loss": 0.5619,
"step": 11690
},
{
"epoch": 1.6986062717770034,
"grad_norm": 1.3081672191619873,
"learning_rate": 2.757588987935078e-06,
"loss": 0.4904,
"step": 11700
},
{
"epoch": 1.7000580720092915,
"grad_norm": 1.0695126056671143,
"learning_rate": 2.731580168960557e-06,
"loss": 0.4588,
"step": 11710
},
{
"epoch": 1.7015098722415796,
"grad_norm": 0.9099944829940796,
"learning_rate": 2.705687496083742e-06,
"loss": 0.6389,
"step": 11720
},
{
"epoch": 1.7029616724738676,
"grad_norm": 3.102823495864868,
"learning_rate": 2.679911104353855e-06,
"loss": 0.4546,
"step": 11730
},
{
"epoch": 1.7044134727061557,
"grad_norm": 2.0113303661346436,
"learning_rate": 2.654251128213642e-06,
"loss": 0.5193,
"step": 11740
},
{
"epoch": 1.7058652729384436,
"grad_norm": 5.120201110839844,
"learning_rate": 2.6287077014986396e-06,
"loss": 0.3837,
"step": 11750
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1.8609308004379272,
"learning_rate": 2.603280957436499e-06,
"loss": 0.6775,
"step": 11760
},
{
"epoch": 1.7087688734030198,
"grad_norm": 0.5862835049629211,
"learning_rate": 2.5779710286463006e-06,
"loss": 0.4387,
"step": 11770
},
{
"epoch": 1.7102206736353078,
"grad_norm": 1.4512196779251099,
"learning_rate": 2.552778047137824e-06,
"loss": 0.551,
"step": 11780
},
{
"epoch": 1.711672473867596,
"grad_norm": 1.8097496032714844,
"learning_rate": 2.527702144310909e-06,
"loss": 0.4015,
"step": 11790
},
{
"epoch": 1.7131242740998838,
"grad_norm": 1.6561768054962158,
"learning_rate": 2.502743450954714e-06,
"loss": 0.6328,
"step": 11800
},
{
"epoch": 1.7145760743321719,
"grad_norm": 3.0371804237365723,
"learning_rate": 2.477902097247095e-06,
"loss": 0.5383,
"step": 11810
},
{
"epoch": 1.71602787456446,
"grad_norm": 1.1968135833740234,
"learning_rate": 2.453178212753876e-06,
"loss": 0.4626,
"step": 11820
},
{
"epoch": 1.717479674796748,
"grad_norm": 1.953162431716919,
"learning_rate": 2.428571926428194e-06,
"loss": 0.416,
"step": 11830
},
{
"epoch": 1.718931475029036,
"grad_norm": 5.2054443359375,
"learning_rate": 2.4040833666098413e-06,
"loss": 0.5872,
"step": 11840
},
{
"epoch": 1.720383275261324,
"grad_norm": 0.8436479568481445,
"learning_rate": 2.3797126610245605e-06,
"loss": 0.6168,
"step": 11850
},
{
"epoch": 1.721835075493612,
"grad_norm": 1.6038563251495361,
"learning_rate": 2.3554599367834137e-06,
"loss": 0.4696,
"step": 11860
},
{
"epoch": 1.7232868757259001,
"grad_norm": 1.6510204076766968,
"learning_rate": 2.3313253203820965e-06,
"loss": 0.5301,
"step": 11870
},
{
"epoch": 1.7247386759581882,
"grad_norm": 1.8987841606140137,
"learning_rate": 2.307308937700278e-06,
"loss": 0.4072,
"step": 11880
},
{
"epoch": 1.7261904761904763,
"grad_norm": 1.4687321186065674,
"learning_rate": 2.283410914000969e-06,
"loss": 0.6519,
"step": 11890
},
{
"epoch": 1.7276422764227641,
"grad_norm": 1.3027409315109253,
"learning_rate": 2.2596313739298462e-06,
"loss": 0.5261,
"step": 11900
},
{
"epoch": 1.7290940766550522,
"grad_norm": 1.6218777894973755,
"learning_rate": 2.235970441514598e-06,
"loss": 0.5705,
"step": 11910
},
{
"epoch": 1.7305458768873403,
"grad_norm": 1.294359803199768,
"learning_rate": 2.2124282401642936e-06,
"loss": 0.5077,
"step": 11920
},
{
"epoch": 1.7319976771196284,
"grad_norm": 3.3764312267303467,
"learning_rate": 2.189004892668742e-06,
"loss": 0.5721,
"step": 11930
},
{
"epoch": 1.7334494773519165,
"grad_norm": 2.6626949310302734,
"learning_rate": 2.165700521197825e-06,
"loss": 0.4905,
"step": 11940
},
{
"epoch": 1.7349012775842043,
"grad_norm": 1.1768718957901,
"learning_rate": 2.1425152473008832e-06,
"loss": 0.6437,
"step": 11950
},
{
"epoch": 1.7363530778164924,
"grad_norm": 3.1630823612213135,
"learning_rate": 2.119449191906089e-06,
"loss": 0.5341,
"step": 11960
},
{
"epoch": 1.7378048780487805,
"grad_norm": 6.051577568054199,
"learning_rate": 2.096502475319781e-06,
"loss": 0.4468,
"step": 11970
},
{
"epoch": 1.7392566782810686,
"grad_norm": 1.5965017080307007,
"learning_rate": 2.0736752172258846e-06,
"loss": 0.8102,
"step": 11980
},
{
"epoch": 1.7407084785133566,
"grad_norm": 1.8911795616149902,
"learning_rate": 2.050967536685233e-06,
"loss": 0.7823,
"step": 11990
},
{
"epoch": 1.7421602787456445,
"grad_norm": 1.6226707696914673,
"learning_rate": 2.0283795521350042e-06,
"loss": 0.7913,
"step": 12000
},
{
"epoch": 1.7421602787456445,
"eval_loss": 0.6174443364143372,
"eval_runtime": 107.8262,
"eval_samples_per_second": 13.448,
"eval_steps_per_second": 3.367,
"step": 12000
},
{
"epoch": 1.7436120789779328,
"grad_norm": 1.1205623149871826,
"learning_rate": 2.005911381388048e-06,
"loss": 0.6055,
"step": 12010
},
{
"epoch": 1.7450638792102207,
"grad_norm": 1.6299880743026733,
"learning_rate": 1.9835631416323164e-06,
"loss": 0.599,
"step": 12020
},
{
"epoch": 1.7465156794425087,
"grad_norm": 1.34322190284729,
"learning_rate": 1.961334949430227e-06,
"loss": 0.5602,
"step": 12030
},
{
"epoch": 1.7479674796747968,
"grad_norm": 0.7588962912559509,
"learning_rate": 1.9392269207180512e-06,
"loss": 0.4644,
"step": 12040
},
{
"epoch": 1.7494192799070847,
"grad_norm": 3.5957090854644775,
"learning_rate": 1.9172391708053408e-06,
"loss": 1.0411,
"step": 12050
},
{
"epoch": 1.750871080139373,
"grad_norm": 1.6750432252883911,
"learning_rate": 1.895371814374286e-06,
"loss": 0.5805,
"step": 12060
},
{
"epoch": 1.7523228803716608,
"grad_norm": 1.9335286617279053,
"learning_rate": 1.8736249654791538e-06,
"loss": 0.7541,
"step": 12070
},
{
"epoch": 1.753774680603949,
"grad_norm": 1.4929413795471191,
"learning_rate": 1.8519987375456654e-06,
"loss": 0.5656,
"step": 12080
},
{
"epoch": 1.755226480836237,
"grad_norm": 1.5122934579849243,
"learning_rate": 1.8304932433704097e-06,
"loss": 0.5031,
"step": 12090
},
{
"epoch": 1.7566782810685249,
"grad_norm": 1.0562947988510132,
"learning_rate": 1.809108595120279e-06,
"loss": 0.5551,
"step": 12100
},
{
"epoch": 1.7581300813008132,
"grad_norm": 1.0522669553756714,
"learning_rate": 1.7878449043318534e-06,
"loss": 0.4314,
"step": 12110
},
{
"epoch": 1.759581881533101,
"grad_norm": 0.8575490117073059,
"learning_rate": 1.766702281910837e-06,
"loss": 0.4565,
"step": 12120
},
{
"epoch": 1.761033681765389,
"grad_norm": 1.6525681018829346,
"learning_rate": 1.7456808381314583e-06,
"loss": 0.4301,
"step": 12130
},
{
"epoch": 1.7624854819976772,
"grad_norm": 1.931264042854309,
"learning_rate": 1.7247806826359375e-06,
"loss": 0.4871,
"step": 12140
},
{
"epoch": 1.763937282229965,
"grad_norm": 1.8501496315002441,
"learning_rate": 1.704001924433865e-06,
"loss": 0.657,
"step": 12150
},
{
"epoch": 1.7653890824622533,
"grad_norm": 1.272760272026062,
"learning_rate": 1.6833446719016627e-06,
"loss": 0.606,
"step": 12160
},
{
"epoch": 1.7668408826945412,
"grad_norm": 0.9544461369514465,
"learning_rate": 1.6628090327820172e-06,
"loss": 0.6067,
"step": 12170
},
{
"epoch": 1.7682926829268293,
"grad_norm": 9.855928421020508,
"learning_rate": 1.6423951141833011e-06,
"loss": 0.3548,
"step": 12180
},
{
"epoch": 1.7697444831591174,
"grad_norm": 3.282947540283203,
"learning_rate": 1.6221030225790413e-06,
"loss": 0.3999,
"step": 12190
},
{
"epoch": 1.7711962833914052,
"grad_norm": 3.6573681831359863,
"learning_rate": 1.6019328638073261e-06,
"loss": 0.5574,
"step": 12200
},
{
"epoch": 1.7726480836236935,
"grad_norm": 3.0135483741760254,
"learning_rate": 1.581884743070297e-06,
"loss": 0.7062,
"step": 12210
},
{
"epoch": 1.7740998838559814,
"grad_norm": 1.4390877485275269,
"learning_rate": 1.5619587649335605e-06,
"loss": 0.5718,
"step": 12220
},
{
"epoch": 1.7755516840882695,
"grad_norm": 0.5463725328445435,
"learning_rate": 1.5421550333256734e-06,
"loss": 0.609,
"step": 12230
},
{
"epoch": 1.7770034843205575,
"grad_norm": 2.245574712753296,
"learning_rate": 1.5224736515375814e-06,
"loss": 0.5087,
"step": 12240
},
{
"epoch": 1.7784552845528454,
"grad_norm": 1.2518837451934814,
"learning_rate": 1.502914722222079e-06,
"loss": 0.6448,
"step": 12250
},
{
"epoch": 1.7799070847851337,
"grad_norm": 1.8152894973754883,
"learning_rate": 1.4834783473932994e-06,
"loss": 0.6077,
"step": 12260
},
{
"epoch": 1.7813588850174216,
"grad_norm": 1.3485125303268433,
"learning_rate": 1.4641646284261485e-06,
"loss": 0.5192,
"step": 12270
},
{
"epoch": 1.7828106852497096,
"grad_norm": 4.338469982147217,
"learning_rate": 1.444973666055796e-06,
"loss": 0.6732,
"step": 12280
},
{
"epoch": 1.7842624854819977,
"grad_norm": 0.5736151933670044,
"learning_rate": 1.4259055603771527e-06,
"loss": 0.4268,
"step": 12290
},
{
"epoch": 1.7857142857142856,
"grad_norm": 2.7274398803710938,
"learning_rate": 1.4069604108443296e-06,
"loss": 0.6137,
"step": 12300
},
{
"epoch": 1.787166085946574,
"grad_norm": 1.3027504682540894,
"learning_rate": 1.3881383162701433e-06,
"loss": 0.48,
"step": 12310
},
{
"epoch": 1.7886178861788617,
"grad_norm": 1.979504942893982,
"learning_rate": 1.3694393748255902e-06,
"loss": 0.3862,
"step": 12320
},
{
"epoch": 1.7900696864111498,
"grad_norm": 2.1074235439300537,
"learning_rate": 1.3508636840393246e-06,
"loss": 0.5215,
"step": 12330
},
{
"epoch": 1.791521486643438,
"grad_norm": 2.5477986335754395,
"learning_rate": 1.3324113407971516e-06,
"loss": 0.4583,
"step": 12340
},
{
"epoch": 1.7929732868757258,
"grad_norm": 0.5263664126396179,
"learning_rate": 1.314082441341552e-06,
"loss": 0.6051,
"step": 12350
},
{
"epoch": 1.794425087108014,
"grad_norm": 1.8962557315826416,
"learning_rate": 1.2958770812711352e-06,
"loss": 0.6069,
"step": 12360
},
{
"epoch": 1.795876887340302,
"grad_norm": 2.079145908355713,
"learning_rate": 1.2777953555401678e-06,
"loss": 0.7225,
"step": 12370
},
{
"epoch": 1.79732868757259,
"grad_norm": 0.8144702315330505,
"learning_rate": 1.2598373584580824e-06,
"loss": 0.474,
"step": 12380
},
{
"epoch": 1.798780487804878,
"grad_norm": 6.011617660522461,
"learning_rate": 1.2420031836889668e-06,
"loss": 0.4614,
"step": 12390
},
{
"epoch": 1.800232288037166,
"grad_norm": 2.0123348236083984,
"learning_rate": 1.224292924251083e-06,
"loss": 0.4744,
"step": 12400
},
{
"epoch": 1.8016840882694543,
"grad_norm": 1.1453293561935425,
"learning_rate": 1.2067066725163946e-06,
"loss": 0.7232,
"step": 12410
},
{
"epoch": 1.8031358885017421,
"grad_norm": 3.011850357055664,
"learning_rate": 1.1892445202100643e-06,
"loss": 0.6242,
"step": 12420
},
{
"epoch": 1.8045876887340302,
"grad_norm": 1.3641666173934937,
"learning_rate": 1.1719065584099881e-06,
"loss": 0.6855,
"step": 12430
},
{
"epoch": 1.8060394889663183,
"grad_norm": 2.096034288406372,
"learning_rate": 1.1546928775463234e-06,
"loss": 0.5658,
"step": 12440
},
{
"epoch": 1.8074912891986061,
"grad_norm": 1.173338770866394,
"learning_rate": 1.137603567401005e-06,
"loss": 0.6926,
"step": 12450
},
{
"epoch": 1.8089430894308944,
"grad_norm": 7.650896072387695,
"learning_rate": 1.1206387171072808e-06,
"loss": 0.5958,
"step": 12460
},
{
"epoch": 1.8103948896631823,
"grad_norm": 4.46699857711792,
"learning_rate": 1.1037984151492624e-06,
"loss": 0.4605,
"step": 12470
},
{
"epoch": 1.8118466898954704,
"grad_norm": 2.164135217666626,
"learning_rate": 1.0870827493614344e-06,
"loss": 0.665,
"step": 12480
},
{
"epoch": 1.8132984901277585,
"grad_norm": 1.531969428062439,
"learning_rate": 1.0704918069282226e-06,
"loss": 0.4462,
"step": 12490
},
{
"epoch": 1.8147502903600463,
"grad_norm": 1.4021626710891724,
"learning_rate": 1.0540256743835297e-06,
"loss": 0.5399,
"step": 12500
},
{
"epoch": 1.8162020905923346,
"grad_norm": 1.4886596202850342,
"learning_rate": 1.0376844376102784e-06,
"loss": 0.5748,
"step": 12510
},
{
"epoch": 1.8176538908246225,
"grad_norm": 2.1300623416900635,
"learning_rate": 1.0214681818399712e-06,
"loss": 0.6041,
"step": 12520
},
{
"epoch": 1.8191056910569106,
"grad_norm": 2.5872268676757812,
"learning_rate": 1.0053769916522488e-06,
"loss": 0.6594,
"step": 12530
},
{
"epoch": 1.8205574912891986,
"grad_norm": 0.8167919516563416,
"learning_rate": 9.894109509744342e-07,
"loss": 0.5516,
"step": 12540
},
{
"epoch": 1.8220092915214865,
"grad_norm": 1.7698092460632324,
"learning_rate": 9.735701430811067e-07,
"loss": 0.4946,
"step": 12550
},
{
"epoch": 1.8234610917537748,
"grad_norm": 1.8498523235321045,
"learning_rate": 9.578546505936676e-07,
"loss": 0.6975,
"step": 12560
},
{
"epoch": 1.8249128919860627,
"grad_norm": 2.3681557178497314,
"learning_rate": 9.422645554799048e-07,
"loss": 0.7246,
"step": 12570
},
{
"epoch": 1.8263646922183507,
"grad_norm": 1.6103743314743042,
"learning_rate": 9.267999390535659e-07,
"loss": 0.597,
"step": 12580
},
{
"epoch": 1.8278164924506388,
"grad_norm": 1.0685875415802002,
"learning_rate": 9.11460881973944e-07,
"loss": 0.4749,
"step": 12590
},
{
"epoch": 1.8292682926829267,
"grad_norm": 1.7253142595291138,
"learning_rate": 8.962474642454338e-07,
"loss": 0.7401,
"step": 12600
},
{
"epoch": 1.830720092915215,
"grad_norm": 3.835946798324585,
"learning_rate": 8.811597652171377e-07,
"loss": 0.5107,
"step": 12610
},
{
"epoch": 1.8321718931475028,
"grad_norm": 3.366118907928467,
"learning_rate": 8.661978635824464e-07,
"loss": 0.5523,
"step": 12620
},
{
"epoch": 1.833623693379791,
"grad_norm": 1.4780124425888062,
"learning_rate": 8.513618373786198e-07,
"loss": 0.4592,
"step": 12630
},
{
"epoch": 1.835075493612079,
"grad_norm": 1.5837668180465698,
"learning_rate": 8.366517639863819e-07,
"loss": 0.5838,
"step": 12640
},
{
"epoch": 1.8365272938443669,
"grad_norm": 5.799704551696777,
"learning_rate": 8.220677201295296e-07,
"loss": 0.7116,
"step": 12650
},
{
"epoch": 1.8379790940766552,
"grad_norm": 1.668204665184021,
"learning_rate": 8.076097818745188e-07,
"loss": 0.5013,
"step": 12660
},
{
"epoch": 1.839430894308943,
"grad_norm": 2.408761501312256,
"learning_rate": 7.932780246300703e-07,
"loss": 0.4475,
"step": 12670
},
{
"epoch": 1.840882694541231,
"grad_norm": 1.6833887100219727,
"learning_rate": 7.790725231467844e-07,
"loss": 0.3637,
"step": 12680
},
{
"epoch": 1.8423344947735192,
"grad_norm": 2.4150564670562744,
"learning_rate": 7.649933515167407e-07,
"loss": 0.5217,
"step": 12690
},
{
"epoch": 1.843786295005807,
"grad_norm": 0.23005646467208862,
"learning_rate": 7.510405831731155e-07,
"loss": 0.7733,
"step": 12700
},
{
"epoch": 1.8452380952380953,
"grad_norm": 5.178878307342529,
"learning_rate": 7.372142908898038e-07,
"loss": 0.5562,
"step": 12710
},
{
"epoch": 1.8466898954703832,
"grad_norm": 1.1063512563705444,
"learning_rate": 7.235145467810344e-07,
"loss": 0.6543,
"step": 12720
},
{
"epoch": 1.8481416957026713,
"grad_norm": 1.4870764017105103,
"learning_rate": 7.099414223009859e-07,
"loss": 0.5468,
"step": 12730
},
{
"epoch": 1.8495934959349594,
"grad_norm": 0.8903436660766602,
"learning_rate": 6.964949882434402e-07,
"loss": 0.441,
"step": 12740
},
{
"epoch": 1.8510452961672472,
"grad_norm": 2.586010694503784,
"learning_rate": 6.831753147413827e-07,
"loss": 0.7283,
"step": 12750
},
{
"epoch": 1.8524970963995355,
"grad_norm": 7.29203987121582,
"learning_rate": 6.699824712666503e-07,
"loss": 0.5616,
"step": 12760
},
{
"epoch": 1.8539488966318234,
"grad_norm": 2.853286027908325,
"learning_rate": 6.569165266295779e-07,
"loss": 0.6829,
"step": 12770
},
{
"epoch": 1.8554006968641115,
"grad_norm": 1.3794944286346436,
"learning_rate": 6.439775489786193e-07,
"loss": 0.6023,
"step": 12780
},
{
"epoch": 1.8568524970963995,
"grad_norm": 1.2747677564620972,
"learning_rate": 6.311656058000076e-07,
"loss": 0.5941,
"step": 12790
},
{
"epoch": 1.8583042973286876,
"grad_norm": 1.2692632675170898,
"learning_rate": 6.184807639173979e-07,
"loss": 0.54,
"step": 12800
},
{
"epoch": 1.8597560975609757,
"grad_norm": 3.867017984390259,
"learning_rate": 6.059230894915224e-07,
"loss": 0.4035,
"step": 12810
},
{
"epoch": 1.8612078977932636,
"grad_norm": 5.028682708740234,
"learning_rate": 5.934926480198333e-07,
"loss": 0.6283,
"step": 12820
},
{
"epoch": 1.8626596980255516,
"grad_norm": 1.321096658706665,
"learning_rate": 5.811895043361742e-07,
"loss": 0.3401,
"step": 12830
},
{
"epoch": 1.8641114982578397,
"grad_norm": 1.8228753805160522,
"learning_rate": 5.690137226104481e-07,
"loss": 0.6275,
"step": 12840
},
{
"epoch": 1.8655632984901278,
"grad_norm": 1.2849879264831543,
"learning_rate": 5.569653663482527e-07,
"loss": 0.5017,
"step": 12850
},
{
"epoch": 1.8670150987224159,
"grad_norm": 1.4431346654891968,
"learning_rate": 5.450444983905845e-07,
"loss": 0.4334,
"step": 12860
},
{
"epoch": 1.8684668989547037,
"grad_norm": 3.2771754264831543,
"learning_rate": 5.332511809134883e-07,
"loss": 0.5051,
"step": 12870
},
{
"epoch": 1.8699186991869918,
"grad_norm": 0.42705005407333374,
"learning_rate": 5.215854754277382e-07,
"loss": 0.5255,
"step": 12880
},
{
"epoch": 1.87137049941928,
"grad_norm": 1.7732504606246948,
"learning_rate": 5.100474427785245e-07,
"loss": 0.5235,
"step": 12890
},
{
"epoch": 1.872822299651568,
"grad_norm": 1.3279632329940796,
"learning_rate": 4.986371431451254e-07,
"loss": 0.7319,
"step": 12900
},
{
"epoch": 1.874274099883856,
"grad_norm": 3.910167694091797,
"learning_rate": 4.87354636040599e-07,
"loss": 0.4989,
"step": 12910
},
{
"epoch": 1.875725900116144,
"grad_norm": 4.26170015335083,
"learning_rate": 4.7619998031147304e-07,
"loss": 0.3566,
"step": 12920
},
{
"epoch": 1.877177700348432,
"grad_norm": 1.6784484386444092,
"learning_rate": 4.651732341374365e-07,
"loss": 0.4187,
"step": 12930
},
{
"epoch": 1.87862950058072,
"grad_norm": 2.036226511001587,
"learning_rate": 4.5427445503103684e-07,
"loss": 0.504,
"step": 12940
},
{
"epoch": 1.8800813008130082,
"grad_norm": 1.2957289218902588,
"learning_rate": 4.435036998373776e-07,
"loss": 0.419,
"step": 12950
},
{
"epoch": 1.8815331010452963,
"grad_norm": 1.349066972732544,
"learning_rate": 4.3286102473382994e-07,
"loss": 0.3789,
"step": 12960
},
{
"epoch": 1.8829849012775841,
"grad_norm": 8.302398681640625,
"learning_rate": 4.2234648522972156e-07,
"loss": 0.3884,
"step": 12970
},
{
"epoch": 1.8844367015098722,
"grad_norm": 4.297000885009766,
"learning_rate": 4.11960136166073e-07,
"loss": 0.4167,
"step": 12980
},
{
"epoch": 1.8858885017421603,
"grad_norm": 0.8533451557159424,
"learning_rate": 4.0170203171528974e-07,
"loss": 0.3217,
"step": 12990
},
{
"epoch": 1.8873403019744484,
"grad_norm": 1.4826991558074951,
"learning_rate": 3.9157222538088454e-07,
"loss": 0.3191,
"step": 13000
},
{
"epoch": 1.8887921022067364,
"grad_norm": 2.1450116634368896,
"learning_rate": 3.815707699972165e-07,
"loss": 0.3731,
"step": 13010
},
{
"epoch": 1.8902439024390243,
"grad_norm": 1.522496223449707,
"learning_rate": 3.716977177291886e-07,
"loss": 0.5099,
"step": 13020
},
{
"epoch": 1.8916957026713124,
"grad_norm": 0.7258571982383728,
"learning_rate": 3.619531200719839e-07,
"loss": 0.5385,
"step": 13030
},
{
"epoch": 1.8931475029036005,
"grad_norm": 3.8711488246917725,
"learning_rate": 3.5233702785081035e-07,
"loss": 0.3283,
"step": 13040
},
{
"epoch": 1.8945993031358885,
"grad_norm": 1.6500895023345947,
"learning_rate": 3.428494912206259e-07,
"loss": 0.2962,
"step": 13050
},
{
"epoch": 1.8960511033681766,
"grad_norm": 1.1146255731582642,
"learning_rate": 3.334905596658666e-07,
"loss": 0.7563,
"step": 13060
},
{
"epoch": 1.8975029036004645,
"grad_norm": 1.4949523210525513,
"learning_rate": 3.242602820002161e-07,
"loss": 0.4394,
"step": 13070
},
{
"epoch": 1.8989547038327528,
"grad_norm": 3.016923189163208,
"learning_rate": 3.1515870636631696e-07,
"loss": 0.5093,
"step": 13080
},
{
"epoch": 1.9004065040650406,
"grad_norm": 1.2064356803894043,
"learning_rate": 3.061858802355433e-07,
"loss": 0.5408,
"step": 13090
},
{
"epoch": 1.9018583042973287,
"grad_norm": 2.9825875759124756,
"learning_rate": 2.97341850407748e-07,
"loss": 0.4946,
"step": 13100
},
{
"epoch": 1.9033101045296168,
"grad_norm": 2.217625617980957,
"learning_rate": 2.886266630110185e-07,
"loss": 0.5713,
"step": 13110
},
{
"epoch": 1.9047619047619047,
"grad_norm": 1.1467139720916748,
"learning_rate": 2.8004036350142705e-07,
"loss": 0.4261,
"step": 13120
},
{
"epoch": 1.906213704994193,
"grad_norm": 2.285097599029541,
"learning_rate": 2.7158299666280864e-07,
"loss": 0.53,
"step": 13130
},
{
"epoch": 1.9076655052264808,
"grad_norm": 3.382395029067993,
"learning_rate": 2.6325460660651393e-07,
"loss": 0.761,
"step": 13140
},
{
"epoch": 1.909117305458769,
"grad_norm": 2.245380163192749,
"learning_rate": 2.550552367711956e-07,
"loss": 0.6742,
"step": 13150
},
{
"epoch": 1.910569105691057,
"grad_norm": 2.25600528717041,
"learning_rate": 2.469849299225585e-07,
"loss": 0.5978,
"step": 13160
},
{
"epoch": 1.9120209059233448,
"grad_norm": 1.8853719234466553,
"learning_rate": 2.390437281531627e-07,
"loss": 0.422,
"step": 13170
},
{
"epoch": 1.9134727061556331,
"grad_norm": 1.3422927856445312,
"learning_rate": 2.3123167288217618e-07,
"loss": 0.6992,
"step": 13180
},
{
"epoch": 1.914924506387921,
"grad_norm": 1.3293629884719849,
"learning_rate": 2.2354880485518648e-07,
"loss": 0.3887,
"step": 13190
},
{
"epoch": 1.916376306620209,
"grad_norm": 1.4933987855911255,
"learning_rate": 2.1599516414396726e-07,
"loss": 0.7679,
"step": 13200
},
{
"epoch": 1.9178281068524972,
"grad_norm": 2.126613140106201,
"learning_rate": 2.0857079014628135e-07,
"loss": 0.5869,
"step": 13210
},
{
"epoch": 1.919279907084785,
"grad_norm": 1.1330955028533936,
"learning_rate": 2.0127572158566976e-07,
"loss": 0.6385,
"step": 13220
},
{
"epoch": 1.9207317073170733,
"grad_norm": 3.0028634071350098,
"learning_rate": 1.9410999651125196e-07,
"loss": 0.3487,
"step": 13230
},
{
"epoch": 1.9221835075493612,
"grad_norm": 3.6127331256866455,
"learning_rate": 1.8707365229752306e-07,
"loss": 0.3929,
"step": 13240
},
{
"epoch": 1.9236353077816493,
"grad_norm": 3.5471410751342773,
"learning_rate": 1.8016672564416526e-07,
"loss": 0.4829,
"step": 13250
},
{
"epoch": 1.9250871080139373,
"grad_norm": 2.57403564453125,
"learning_rate": 1.7338925257585626e-07,
"loss": 0.4579,
"step": 13260
},
{
"epoch": 1.9265389082462252,
"grad_norm": 0.892292320728302,
"learning_rate": 1.6674126844207215e-07,
"loss": 0.6123,
"step": 13270
},
{
"epoch": 1.9279907084785135,
"grad_norm": 5.541355609893799,
"learning_rate": 1.6022280791691547e-07,
"loss": 0.5871,
"step": 13280
},
{
"epoch": 1.9294425087108014,
"grad_norm": 4.043185234069824,
"learning_rate": 1.5383390499892625e-07,
"loss": 0.7962,
"step": 13290
},
{
"epoch": 1.9308943089430894,
"grad_norm": 3.4358088970184326,
"learning_rate": 1.4757459301089904e-07,
"loss": 0.8971,
"step": 13300
},
{
"epoch": 1.9323461091753775,
"grad_norm": 2.0731537342071533,
"learning_rate": 1.414449045997357e-07,
"loss": 0.587,
"step": 13310
},
{
"epoch": 1.9337979094076654,
"grad_norm": 2.296551465988159,
"learning_rate": 1.3544487173623443e-07,
"loss": 0.6924,
"step": 13320
},
{
"epoch": 1.9352497096399537,
"grad_norm": 1.4577088356018066,
"learning_rate": 1.295745257149622e-07,
"loss": 0.607,
"step": 13330
},
{
"epoch": 1.9367015098722415,
"grad_norm": 2.300415277481079,
"learning_rate": 1.2383389715406592e-07,
"loss": 0.584,
"step": 13340
},
{
"epoch": 1.9381533101045296,
"grad_norm": 5.345014572143555,
"learning_rate": 1.1822301599511976e-07,
"loss": 0.5331,
"step": 13350
},
{
"epoch": 1.9396051103368177,
"grad_norm": 1.9530677795410156,
"learning_rate": 1.1274191150297542e-07,
"loss": 0.4817,
"step": 13360
},
{
"epoch": 1.9410569105691056,
"grad_norm": 2.270688533782959,
"learning_rate": 1.0739061226560099e-07,
"loss": 0.3805,
"step": 13370
},
{
"epoch": 1.9425087108013939,
"grad_norm": 4.100897312164307,
"learning_rate": 1.021691461939367e-07,
"loss": 0.5722,
"step": 13380
},
{
"epoch": 1.9439605110336817,
"grad_norm": 1.5650062561035156,
"learning_rate": 9.707754052174777e-08,
"loss": 0.4488,
"step": 13390
},
{
"epoch": 1.9454123112659698,
"grad_norm": 3.1894681453704834,
"learning_rate": 9.211582180548295e-08,
"loss": 0.8613,
"step": 13400
},
{
"epoch": 1.9468641114982579,
"grad_norm": 4.314057350158691,
"learning_rate": 8.728401592413283e-08,
"loss": 0.5325,
"step": 13410
},
{
"epoch": 1.9483159117305457,
"grad_norm": 1.2972936630249023,
"learning_rate": 8.258214807909947e-08,
"loss": 0.3854,
"step": 13420
},
{
"epoch": 1.949767711962834,
"grad_norm": 1.0964834690093994,
"learning_rate": 7.801024279406599e-08,
"loss": 0.5104,
"step": 13430
},
{
"epoch": 1.951219512195122,
"grad_norm": 2.3164939880371094,
"learning_rate": 7.356832391485769e-08,
"loss": 0.5469,
"step": 13440
},
{
"epoch": 1.95267131242741,
"grad_norm": 0.8079650402069092,
"learning_rate": 6.925641460933107e-08,
"loss": 0.5622,
"step": 13450
},
{
"epoch": 1.954123112659698,
"grad_norm": 0.8280888795852661,
"learning_rate": 6.5074537367249e-08,
"loss": 0.4257,
"step": 13460
},
{
"epoch": 1.955574912891986,
"grad_norm": 1.360197901725769,
"learning_rate": 6.102271400016124e-08,
"loss": 0.4915,
"step": 13470
},
{
"epoch": 1.9570267131242742,
"grad_norm": 4.283596992492676,
"learning_rate": 5.710096564128797e-08,
"loss": 0.4378,
"step": 13480
},
{
"epoch": 1.958478513356562,
"grad_norm": 1.3778785467147827,
"learning_rate": 5.3309312745419835e-08,
"loss": 0.5003,
"step": 13490
},
{
"epoch": 1.9599303135888502,
"grad_norm": 2.252000093460083,
"learning_rate": 4.9647775088793035e-08,
"loss": 0.5867,
"step": 13500
},
{
"epoch": 1.9613821138211383,
"grad_norm": 2.1115314960479736,
"learning_rate": 4.611637176901162e-08,
"loss": 0.6936,
"step": 13510
},
{
"epoch": 1.962833914053426,
"grad_norm": 0.7023373246192932,
"learning_rate": 4.2715121204922606e-08,
"loss": 0.4358,
"step": 13520
},
{
"epoch": 1.9642857142857144,
"grad_norm": 2.6030709743499756,
"learning_rate": 3.944404113653544e-08,
"loss": 0.6004,
"step": 13530
},
{
"epoch": 1.9657375145180023,
"grad_norm": 1.5096830129623413,
"learning_rate": 3.630314862492767e-08,
"loss": 0.66,
"step": 13540
},
{
"epoch": 1.9671893147502904,
"grad_norm": 2.45739483833313,
"learning_rate": 3.3292460052147814e-08,
"loss": 0.5751,
"step": 13550
},
{
"epoch": 1.9686411149825784,
"grad_norm": 1.5947012901306152,
"learning_rate": 3.0411991121143124e-08,
"loss": 0.4873,
"step": 13560
},
{
"epoch": 1.9700929152148663,
"grad_norm": 4.944673538208008,
"learning_rate": 2.76617568556653e-08,
"loss": 0.5386,
"step": 13570
},
{
"epoch": 1.9715447154471546,
"grad_norm": 1.3134737014770508,
"learning_rate": 2.5041771600195496e-08,
"loss": 0.596,
"step": 13580
},
{
"epoch": 1.9729965156794425,
"grad_norm": 1.0245544910430908,
"learning_rate": 2.2552049019874955e-08,
"loss": 0.7618,
"step": 13590
},
{
"epoch": 1.9744483159117305,
"grad_norm": 1.1660908460617065,
"learning_rate": 2.0192602100424507e-08,
"loss": 0.5421,
"step": 13600
},
{
"epoch": 1.9759001161440186,
"grad_norm": 2.883108377456665,
"learning_rate": 1.796344314809184e-08,
"loss": 0.7773,
"step": 13610
},
{
"epoch": 1.9773519163763065,
"grad_norm": 1.4589025974273682,
"learning_rate": 1.5864583789565457e-08,
"loss": 0.7103,
"step": 13620
},
{
"epoch": 1.9788037166085948,
"grad_norm": 0.33630794286727905,
"learning_rate": 1.3896034971935812e-08,
"loss": 0.5487,
"step": 13630
},
{
"epoch": 1.9802555168408826,
"grad_norm": 8.537004470825195,
"learning_rate": 1.2057806962625928e-08,
"loss": 0.7031,
"step": 13640
},
{
"epoch": 1.9817073170731707,
"grad_norm": 2.483400583267212,
"learning_rate": 1.0349909349333109e-08,
"loss": 0.5642,
"step": 13650
},
{
"epoch": 1.9831591173054588,
"grad_norm": 3.4803388118743896,
"learning_rate": 8.77235104000118e-09,
"loss": 0.3511,
"step": 13660
},
{
"epoch": 1.9846109175377467,
"grad_norm": 3.3463430404663086,
"learning_rate": 7.3251402627427805e-09,
"loss": 0.3859,
"step": 13670
},
{
"epoch": 1.986062717770035,
"grad_norm": 2.710141897201538,
"learning_rate": 6.008284565825473e-09,
"loss": 0.5433,
"step": 13680
},
{
"epoch": 1.9875145180023228,
"grad_norm": 1.5595346689224243,
"learning_rate": 4.82179081761347e-09,
"loss": 0.5447,
"step": 13690
},
{
"epoch": 1.988966318234611,
"grad_norm": 2.3480842113494873,
"learning_rate": 3.76566520653987e-09,
"loss": 0.4713,
"step": 13700
},
{
"epoch": 1.990418118466899,
"grad_norm": 0.828123152256012,
"learning_rate": 2.8399132410733553e-09,
"loss": 0.5635,
"step": 13710
},
{
"epoch": 1.9918699186991868,
"grad_norm": 2.631500720977783,
"learning_rate": 2.044539749684882e-09,
"loss": 0.5184,
"step": 13720
},
{
"epoch": 1.9933217189314751,
"grad_norm": 2.2084085941314697,
"learning_rate": 1.3795488808310274e-09,
"loss": 0.5075,
"step": 13730
},
{
"epoch": 1.994773519163763,
"grad_norm": 2.0961720943450928,
"learning_rate": 8.449441029234617e-10,
"loss": 0.9044,
"step": 13740
},
{
"epoch": 1.996225319396051,
"grad_norm": 1.8734853267669678,
"learning_rate": 4.4072820432061733e-10,
"loss": 0.4804,
"step": 13750
},
{
"epoch": 1.9976771196283392,
"grad_norm": 1.1035456657409668,
"learning_rate": 1.6690329330271147e-10,
"loss": 0.4869,
"step": 13760
},
{
"epoch": 1.999128919860627,
"grad_norm": 3.4342620372772217,
"learning_rate": 2.3470798063418564e-11,
"loss": 0.4943,
"step": 13770
},
{
"epoch": 2.0,
"step": 13776,
"total_flos": 2.442921933399982e+18,
"train_loss": 0.63349506684712,
"train_runtime": 13338.7431,
"train_samples_per_second": 4.131,
"train_steps_per_second": 1.033
}
],
"logging_steps": 10,
"max_steps": 13776,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3000,
"total_flos": 2.442921933399982e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}