Training in progress, step 111000, checkpoint
0e6a9dc verified
196 kB
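The JSON below is the trainer state for this checkpoint: a few top-level fields (epoch, eval_steps, global_step, and so on) followed by a log_history array whose entries record epoch, grad_norm, learning_rate, loss, and step every 100 steps. A minimal sketch of how one might inspect it, assuming the file has been downloaded locally as trainer_state.json (hypothetical path) and that matplotlib is available:

# Minimal sketch: load the checkpoint's trainer state and plot training loss vs. step.
# Assumes the file is saved locally as "trainer_state.json" (hypothetical path).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry holds epoch, grad_norm, learning_rate, loss, and step.
# The "loss" guard skips any entries (e.g. evaluation logs) without a training loss.
history = state["log_history"]
steps = [entry["step"] for entry in history if "loss" in entry]
losses = [entry["loss"] for entry in history if "loss" in entry]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"Training loss up to global_step {state['global_step']}")
plt.show()

The filter on the "loss" key is purely defensive; in the excerpt shown here every logged entry carries a training loss.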
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3615599847558493,
"eval_steps": 500,
"global_step": 111000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003257297159962606,
"grad_norm": 2.2308592796325684,
"learning_rate": 4.99853416853153e-05,
"loss": 1.4483,
"step": 100
},
{
"epoch": 0.0006514594319925212,
"grad_norm": 2.3997225761413574,
"learning_rate": 4.996905466899897e-05,
"loss": 1.3276,
"step": 200
},
{
"epoch": 0.0009771891479887819,
"grad_norm": 1.4687339067459106,
"learning_rate": 4.995276765268264e-05,
"loss": 1.3394,
"step": 300
},
{
"epoch": 0.0013029188639850425,
"grad_norm": 0.6583470702171326,
"learning_rate": 4.993648063636631e-05,
"loss": 1.3245,
"step": 400
},
{
"epoch": 0.0016286485799813031,
"grad_norm": 1.6252340078353882,
"learning_rate": 4.992019362004997e-05,
"loss": 1.3249,
"step": 500
},
{
"epoch": 0.0019543782959775637,
"grad_norm": 2.0806777477264404,
"learning_rate": 4.9903906603733634e-05,
"loss": 1.32,
"step": 600
},
{
"epoch": 0.002280108011973824,
"grad_norm": 1.376539707183838,
"learning_rate": 4.988761958741731e-05,
"loss": 1.3133,
"step": 700
},
{
"epoch": 0.002605837727970085,
"grad_norm": 2.234644889831543,
"learning_rate": 4.987133257110097e-05,
"loss": 1.3179,
"step": 800
},
{
"epoch": 0.0029315674439663454,
"grad_norm": 1.4599684476852417,
"learning_rate": 4.985504555478464e-05,
"loss": 1.3097,
"step": 900
},
{
"epoch": 0.0032572971599626062,
"grad_norm": 1.7078094482421875,
"learning_rate": 4.9838758538468304e-05,
"loss": 1.3083,
"step": 1000
},
{
"epoch": 0.0035830268759588666,
"grad_norm": 0.6953567266464233,
"learning_rate": 4.9822471522151976e-05,
"loss": 1.3075,
"step": 1100
},
{
"epoch": 0.0039087565919551275,
"grad_norm": 1.225602626800537,
"learning_rate": 4.980618450583564e-05,
"loss": 1.3054,
"step": 1200
},
{
"epoch": 0.004234486307951388,
"grad_norm": 1.3010519742965698,
"learning_rate": 4.978989748951931e-05,
"loss": 1.3066,
"step": 1300
},
{
"epoch": 0.004560216023947648,
"grad_norm": 0.6475724577903748,
"learning_rate": 4.9773610473202974e-05,
"loss": 1.3109,
"step": 1400
},
{
"epoch": 0.004885945739943909,
"grad_norm": 1.046614646911621,
"learning_rate": 4.975732345688664e-05,
"loss": 1.3074,
"step": 1500
},
{
"epoch": 0.00521167545594017,
"grad_norm": 1.113573670387268,
"learning_rate": 4.974103644057031e-05,
"loss": 1.3083,
"step": 1600
},
{
"epoch": 0.005537405171936431,
"grad_norm": 1.4273550510406494,
"learning_rate": 4.972474942425398e-05,
"loss": 1.3018,
"step": 1700
},
{
"epoch": 0.005863134887932691,
"grad_norm": 0.5519908666610718,
"learning_rate": 4.970846240793764e-05,
"loss": 1.2945,
"step": 1800
},
{
"epoch": 0.006188864603928952,
"grad_norm": 0.6653416156768799,
"learning_rate": 4.969217539162131e-05,
"loss": 1.3004,
"step": 1900
},
{
"epoch": 0.0065145943199252125,
"grad_norm": 0.732170581817627,
"learning_rate": 4.9675888375304975e-05,
"loss": 1.3014,
"step": 2000
},
{
"epoch": 0.006840324035921473,
"grad_norm": 0.405608594417572,
"learning_rate": 4.965960135898865e-05,
"loss": 1.2939,
"step": 2100
},
{
"epoch": 0.007166053751917733,
"grad_norm": 0.9849847555160522,
"learning_rate": 4.9643314342672306e-05,
"loss": 1.2922,
"step": 2200
},
{
"epoch": 0.007491783467913994,
"grad_norm": 0.7152832746505737,
"learning_rate": 4.962702732635598e-05,
"loss": 1.2905,
"step": 2300
},
{
"epoch": 0.007817513183910255,
"grad_norm": 1.1164734363555908,
"learning_rate": 4.9610740310039644e-05,
"loss": 1.3024,
"step": 2400
},
{
"epoch": 0.008143242899906516,
"grad_norm": 0.574243426322937,
"learning_rate": 4.959445329372332e-05,
"loss": 1.2944,
"step": 2500
},
{
"epoch": 0.008468972615902777,
"grad_norm": 0.6976324319839478,
"learning_rate": 4.9578166277406976e-05,
"loss": 1.2939,
"step": 2600
},
{
"epoch": 0.008794702331899037,
"grad_norm": 0.4648737609386444,
"learning_rate": 4.956187926109064e-05,
"loss": 1.2841,
"step": 2700
},
{
"epoch": 0.009120432047895297,
"grad_norm": 1.189271092414856,
"learning_rate": 4.9545592244774314e-05,
"loss": 1.294,
"step": 2800
},
{
"epoch": 0.009446161763891557,
"grad_norm": 0.6437670588493347,
"learning_rate": 4.952930522845798e-05,
"loss": 1.2882,
"step": 2900
},
{
"epoch": 0.009771891479887818,
"grad_norm": 1.591304898262024,
"learning_rate": 4.9513018212141646e-05,
"loss": 1.2805,
"step": 3000
},
{
"epoch": 0.010097621195884079,
"grad_norm": 0.2836475670337677,
"learning_rate": 4.949673119582531e-05,
"loss": 1.2802,
"step": 3100
},
{
"epoch": 0.01042335091188034,
"grad_norm": 1.304417610168457,
"learning_rate": 4.9480444179508984e-05,
"loss": 1.2833,
"step": 3200
},
{
"epoch": 0.0107490806278766,
"grad_norm": 0.27579864859580994,
"learning_rate": 4.946415716319265e-05,
"loss": 1.2852,
"step": 3300
},
{
"epoch": 0.011074810343872862,
"grad_norm": 1.1080585718154907,
"learning_rate": 4.9447870146876315e-05,
"loss": 1.289,
"step": 3400
},
{
"epoch": 0.011400540059869122,
"grad_norm": 0.2783690392971039,
"learning_rate": 4.943158313055998e-05,
"loss": 1.2885,
"step": 3500
},
{
"epoch": 0.011726269775865382,
"grad_norm": 0.6603112816810608,
"learning_rate": 4.941529611424365e-05,
"loss": 1.2882,
"step": 3600
},
{
"epoch": 0.012051999491861642,
"grad_norm": 0.9498095512390137,
"learning_rate": 4.939900909792732e-05,
"loss": 1.2835,
"step": 3700
},
{
"epoch": 0.012377729207857903,
"grad_norm": 0.5274548530578613,
"learning_rate": 4.9382722081610985e-05,
"loss": 1.279,
"step": 3800
},
{
"epoch": 0.012703458923854164,
"grad_norm": 0.5299821496009827,
"learning_rate": 4.936643506529465e-05,
"loss": 1.2879,
"step": 3900
},
{
"epoch": 0.013029188639850425,
"grad_norm": 1.0898863077163696,
"learning_rate": 4.9350148048978316e-05,
"loss": 1.2913,
"step": 4000
},
{
"epoch": 0.013354918355846686,
"grad_norm": 0.6892501711845398,
"learning_rate": 4.933386103266198e-05,
"loss": 1.2835,
"step": 4100
},
{
"epoch": 0.013680648071842947,
"grad_norm": 0.9103847146034241,
"learning_rate": 4.9317574016345655e-05,
"loss": 1.2876,
"step": 4200
},
{
"epoch": 0.014006377787839207,
"grad_norm": 0.8750960826873779,
"learning_rate": 4.9301287000029314e-05,
"loss": 1.2761,
"step": 4300
},
{
"epoch": 0.014332107503835467,
"grad_norm": 1.7296843528747559,
"learning_rate": 4.9284999983712986e-05,
"loss": 1.2825,
"step": 4400
},
{
"epoch": 0.014657837219831727,
"grad_norm": 0.7019387483596802,
"learning_rate": 4.926871296739665e-05,
"loss": 1.2774,
"step": 4500
},
{
"epoch": 0.014983566935827988,
"grad_norm": 0.9353660345077515,
"learning_rate": 4.9252425951080324e-05,
"loss": 1.2701,
"step": 4600
},
{
"epoch": 0.015309296651824249,
"grad_norm": 0.7081932425498962,
"learning_rate": 4.923613893476399e-05,
"loss": 1.276,
"step": 4700
},
{
"epoch": 0.01563502636782051,
"grad_norm": 0.8366962671279907,
"learning_rate": 4.9219851918447656e-05,
"loss": 1.2767,
"step": 4800
},
{
"epoch": 0.01596075608381677,
"grad_norm": 1.765871286392212,
"learning_rate": 4.920356490213132e-05,
"loss": 1.2617,
"step": 4900
},
{
"epoch": 0.01628648579981303,
"grad_norm": 0.2926379442214966,
"learning_rate": 4.918727788581499e-05,
"loss": 1.2762,
"step": 5000
},
{
"epoch": 0.01661221551580929,
"grad_norm": 1.1176525354385376,
"learning_rate": 4.917099086949866e-05,
"loss": 1.2647,
"step": 5100
},
{
"epoch": 0.016937945231805553,
"grad_norm": 0.384264200925827,
"learning_rate": 4.915470385318232e-05,
"loss": 1.2628,
"step": 5200
},
{
"epoch": 0.017263674947801812,
"grad_norm": 1.5339140892028809,
"learning_rate": 4.913841683686599e-05,
"loss": 1.2692,
"step": 5300
},
{
"epoch": 0.017589404663798075,
"grad_norm": 1.2026703357696533,
"learning_rate": 4.912212982054966e-05,
"loss": 1.2618,
"step": 5400
},
{
"epoch": 0.017915134379794334,
"grad_norm": 0.6754997968673706,
"learning_rate": 4.910584280423333e-05,
"loss": 1.2495,
"step": 5500
},
{
"epoch": 0.018240864095790593,
"grad_norm": 0.8240428566932678,
"learning_rate": 4.908955578791699e-05,
"loss": 1.2498,
"step": 5600
},
{
"epoch": 0.018566593811786856,
"grad_norm": 0.6363087892532349,
"learning_rate": 4.9073268771600654e-05,
"loss": 1.2514,
"step": 5700
},
{
"epoch": 0.018892323527783115,
"grad_norm": 1.393833875656128,
"learning_rate": 4.905698175528433e-05,
"loss": 1.2509,
"step": 5800
},
{
"epoch": 0.019218053243779377,
"grad_norm": 0.6422170996665955,
"learning_rate": 4.904069473896799e-05,
"loss": 1.2405,
"step": 5900
},
{
"epoch": 0.019543782959775637,
"grad_norm": 0.7575420141220093,
"learning_rate": 4.902440772265166e-05,
"loss": 1.2241,
"step": 6000
},
{
"epoch": 0.0198695126757719,
"grad_norm": 0.7148196697235107,
"learning_rate": 4.9008120706335324e-05,
"loss": 1.2372,
"step": 6100
},
{
"epoch": 0.020195242391768158,
"grad_norm": 1.1207329034805298,
"learning_rate": 4.8991833690018996e-05,
"loss": 1.2372,
"step": 6200
},
{
"epoch": 0.02052097210776442,
"grad_norm": 1.3915568590164185,
"learning_rate": 4.897554667370266e-05,
"loss": 1.2129,
"step": 6300
},
{
"epoch": 0.02084670182376068,
"grad_norm": 0.8674553036689758,
"learning_rate": 4.895925965738633e-05,
"loss": 1.2262,
"step": 6400
},
{
"epoch": 0.02117243153975694,
"grad_norm": 0.7640644311904907,
"learning_rate": 4.8942972641069994e-05,
"loss": 1.1998,
"step": 6500
},
{
"epoch": 0.0214981612557532,
"grad_norm": 0.7928606271743774,
"learning_rate": 4.892668562475366e-05,
"loss": 1.1776,
"step": 6600
},
{
"epoch": 0.02182389097174946,
"grad_norm": 1.1644946336746216,
"learning_rate": 4.891039860843733e-05,
"loss": 1.1916,
"step": 6700
},
{
"epoch": 0.022149620687745723,
"grad_norm": 1.1310213804244995,
"learning_rate": 4.8894111592121e-05,
"loss": 1.1786,
"step": 6800
},
{
"epoch": 0.022475350403741982,
"grad_norm": 1.3858141899108887,
"learning_rate": 4.887782457580466e-05,
"loss": 1.1728,
"step": 6900
},
{
"epoch": 0.022801080119738245,
"grad_norm": 3.814767360687256,
"learning_rate": 4.886153755948833e-05,
"loss": 1.1384,
"step": 7000
},
{
"epoch": 0.023126809835734504,
"grad_norm": 1.2411885261535645,
"learning_rate": 4.8845250543171995e-05,
"loss": 1.1588,
"step": 7100
},
{
"epoch": 0.023452539551730763,
"grad_norm": 1.4492881298065186,
"learning_rate": 4.882896352685567e-05,
"loss": 1.1266,
"step": 7200
},
{
"epoch": 0.023778269267727026,
"grad_norm": 0.8389878869056702,
"learning_rate": 4.8812676510539326e-05,
"loss": 1.1446,
"step": 7300
},
{
"epoch": 0.024103998983723285,
"grad_norm": 0.33955487608909607,
"learning_rate": 4.8796389494223e-05,
"loss": 1.1111,
"step": 7400
},
{
"epoch": 0.024429728699719547,
"grad_norm": 0.7004753351211548,
"learning_rate": 4.8780102477906664e-05,
"loss": 1.0954,
"step": 7500
},
{
"epoch": 0.024755458415715807,
"grad_norm": 0.7213209271430969,
"learning_rate": 4.876381546159034e-05,
"loss": 1.1123,
"step": 7600
},
{
"epoch": 0.02508118813171207,
"grad_norm": 0.960991382598877,
"learning_rate": 4.8747528445273996e-05,
"loss": 1.0982,
"step": 7700
},
{
"epoch": 0.025406917847708328,
"grad_norm": 0.6955804228782654,
"learning_rate": 4.873124142895766e-05,
"loss": 1.0827,
"step": 7800
},
{
"epoch": 0.02573264756370459,
"grad_norm": 0.47498619556427,
"learning_rate": 4.8714954412641334e-05,
"loss": 1.1043,
"step": 7900
},
{
"epoch": 0.02605837727970085,
"grad_norm": 0.304063618183136,
"learning_rate": 4.8698667396325e-05,
"loss": 1.0699,
"step": 8000
},
{
"epoch": 0.02638410699569711,
"grad_norm": 0.9996088743209839,
"learning_rate": 4.8682380380008666e-05,
"loss": 1.0697,
"step": 8100
},
{
"epoch": 0.02670983671169337,
"grad_norm": 0.5986392498016357,
"learning_rate": 4.866609336369233e-05,
"loss": 1.0733,
"step": 8200
},
{
"epoch": 0.02703556642768963,
"grad_norm": 0.41347017884254456,
"learning_rate": 4.8649806347376004e-05,
"loss": 1.0643,
"step": 8300
},
{
"epoch": 0.027361296143685893,
"grad_norm": 0.3976612687110901,
"learning_rate": 4.863351933105967e-05,
"loss": 1.0401,
"step": 8400
},
{
"epoch": 0.027687025859682152,
"grad_norm": 1.1716387271881104,
"learning_rate": 4.8617232314743335e-05,
"loss": 1.0298,
"step": 8500
},
{
"epoch": 0.028012755575678415,
"grad_norm": 0.7384105324745178,
"learning_rate": 4.8600945298427e-05,
"loss": 1.0223,
"step": 8600
},
{
"epoch": 0.028338485291674674,
"grad_norm": 0.517280638217926,
"learning_rate": 4.858465828211067e-05,
"loss": 1.0445,
"step": 8700
},
{
"epoch": 0.028664215007670933,
"grad_norm": 0.7129126787185669,
"learning_rate": 4.856837126579434e-05,
"loss": 1.0508,
"step": 8800
},
{
"epoch": 0.028989944723667196,
"grad_norm": 0.35596320033073425,
"learning_rate": 4.8552084249478005e-05,
"loss": 1.0296,
"step": 8900
},
{
"epoch": 0.029315674439663455,
"grad_norm": 0.9362590909004211,
"learning_rate": 4.853579723316167e-05,
"loss": 1.0785,
"step": 9000
},
{
"epoch": 0.029641404155659717,
"grad_norm": 0.8223775625228882,
"learning_rate": 4.8519510216845336e-05,
"loss": 1.043,
"step": 9100
},
{
"epoch": 0.029967133871655977,
"grad_norm": 0.7149192690849304,
"learning_rate": 4.8503223200529e-05,
"loss": 1.0036,
"step": 9200
},
{
"epoch": 0.03029286358765224,
"grad_norm": 0.5907948017120361,
"learning_rate": 4.8486936184212675e-05,
"loss": 1.0408,
"step": 9300
},
{
"epoch": 0.030618593303648498,
"grad_norm": 0.6083859801292419,
"learning_rate": 4.847064916789634e-05,
"loss": 1.0313,
"step": 9400
},
{
"epoch": 0.03094432301964476,
"grad_norm": 0.5470224618911743,
"learning_rate": 4.8454362151580006e-05,
"loss": 1.0395,
"step": 9500
},
{
"epoch": 0.03127005273564102,
"grad_norm": 0.9455150961875916,
"learning_rate": 4.843807513526367e-05,
"loss": 1.0132,
"step": 9600
},
{
"epoch": 0.03159578245163728,
"grad_norm": 0.9068177938461304,
"learning_rate": 4.8421788118947344e-05,
"loss": 1.0219,
"step": 9700
},
{
"epoch": 0.03192151216763354,
"grad_norm": 0.6018943190574646,
"learning_rate": 4.840550110263101e-05,
"loss": 0.9966,
"step": 9800
},
{
"epoch": 0.032247241883629804,
"grad_norm": 1.1521615982055664,
"learning_rate": 4.838921408631467e-05,
"loss": 0.9782,
"step": 9900
},
{
"epoch": 0.03257297159962606,
"grad_norm": 0.33281368017196655,
"learning_rate": 4.837292706999834e-05,
"loss": 1.0325,
"step": 10000
},
{
"epoch": 0.03289870131562232,
"grad_norm": 0.8903327584266663,
"learning_rate": 4.835664005368201e-05,
"loss": 0.9889,
"step": 10100
},
{
"epoch": 0.03322443103161858,
"grad_norm": 0.5526803731918335,
"learning_rate": 4.834035303736568e-05,
"loss": 1.0018,
"step": 10200
},
{
"epoch": 0.03355016074761485,
"grad_norm": 0.8086706399917603,
"learning_rate": 4.832406602104934e-05,
"loss": 1.0189,
"step": 10300
},
{
"epoch": 0.03387589046361111,
"grad_norm": 0.6990864276885986,
"learning_rate": 4.830777900473301e-05,
"loss": 0.996,
"step": 10400
},
{
"epoch": 0.034201620179607366,
"grad_norm": 0.4859602451324463,
"learning_rate": 4.829149198841668e-05,
"loss": 0.992,
"step": 10500
},
{
"epoch": 0.034527349895603625,
"grad_norm": 1.2284592390060425,
"learning_rate": 4.827520497210034e-05,
"loss": 1.0139,
"step": 10600
},
{
"epoch": 0.034853079611599884,
"grad_norm": 0.6529733538627625,
"learning_rate": 4.825891795578401e-05,
"loss": 1.025,
"step": 10700
},
{
"epoch": 0.03517880932759615,
"grad_norm": 0.6755232810974121,
"learning_rate": 4.8242630939467674e-05,
"loss": 1.0123,
"step": 10800
},
{
"epoch": 0.03550453904359241,
"grad_norm": 0.9006055593490601,
"learning_rate": 4.8226343923151347e-05,
"loss": 0.9936,
"step": 10900
},
{
"epoch": 0.03583026875958867,
"grad_norm": 0.7058572769165039,
"learning_rate": 4.821005690683501e-05,
"loss": 0.934,
"step": 11000
},
{
"epoch": 0.03615599847558493,
"grad_norm": 0.4535008668899536,
"learning_rate": 4.819376989051868e-05,
"loss": 1.0269,
"step": 11100
},
{
"epoch": 0.036481728191581186,
"grad_norm": 0.39823395013809204,
"learning_rate": 4.8177482874202344e-05,
"loss": 0.9866,
"step": 11200
},
{
"epoch": 0.03680745790757745,
"grad_norm": 0.8109054565429688,
"learning_rate": 4.816119585788601e-05,
"loss": 1.0209,
"step": 11300
},
{
"epoch": 0.03713318762357371,
"grad_norm": 0.760396420955658,
"learning_rate": 4.814490884156968e-05,
"loss": 0.9711,
"step": 11400
},
{
"epoch": 0.03745891733956997,
"grad_norm": 0.8584955334663391,
"learning_rate": 4.812862182525335e-05,
"loss": 1.0151,
"step": 11500
},
{
"epoch": 0.03778464705556623,
"grad_norm": 1.104041576385498,
"learning_rate": 4.8112334808937013e-05,
"loss": 0.9826,
"step": 11600
},
{
"epoch": 0.038110376771562496,
"grad_norm": 0.6111257672309875,
"learning_rate": 4.809604779262068e-05,
"loss": 0.9524,
"step": 11700
},
{
"epoch": 0.038436106487558755,
"grad_norm": 0.6601366996765137,
"learning_rate": 4.807976077630435e-05,
"loss": 0.9527,
"step": 11800
},
{
"epoch": 0.038761836203555014,
"grad_norm": 0.4624398350715637,
"learning_rate": 4.806347375998802e-05,
"loss": 1.0077,
"step": 11900
},
{
"epoch": 0.03908756591955127,
"grad_norm": 0.2786065638065338,
"learning_rate": 4.8047186743671676e-05,
"loss": 0.956,
"step": 12000
},
{
"epoch": 0.03941329563554753,
"grad_norm": 1.0275955200195312,
"learning_rate": 4.803089972735535e-05,
"loss": 0.9484,
"step": 12100
},
{
"epoch": 0.0397390253515438,
"grad_norm": 0.6198407411575317,
"learning_rate": 4.8014612711039015e-05,
"loss": 0.9847,
"step": 12200
},
{
"epoch": 0.04006475506754006,
"grad_norm": 0.5880489945411682,
"learning_rate": 4.799832569472269e-05,
"loss": 0.9559,
"step": 12300
},
{
"epoch": 0.040390484783536316,
"grad_norm": 0.39753594994544983,
"learning_rate": 4.7982038678406346e-05,
"loss": 0.9489,
"step": 12400
},
{
"epoch": 0.040716214499532576,
"grad_norm": 0.5815085768699646,
"learning_rate": 4.796575166209002e-05,
"loss": 0.9567,
"step": 12500
},
{
"epoch": 0.04104194421552884,
"grad_norm": 0.8463611602783203,
"learning_rate": 4.7949464645773684e-05,
"loss": 0.9706,
"step": 12600
},
{
"epoch": 0.0413676739315251,
"grad_norm": 0.7260481715202332,
"learning_rate": 4.793317762945736e-05,
"loss": 1.0032,
"step": 12700
},
{
"epoch": 0.04169340364752136,
"grad_norm": 0.6970434188842773,
"learning_rate": 4.7916890613141016e-05,
"loss": 0.9559,
"step": 12800
},
{
"epoch": 0.04201913336351762,
"grad_norm": 0.6083927750587463,
"learning_rate": 4.790060359682468e-05,
"loss": 0.9558,
"step": 12900
},
{
"epoch": 0.04234486307951388,
"grad_norm": 0.4736403524875641,
"learning_rate": 4.7884316580508354e-05,
"loss": 0.9444,
"step": 13000
},
{
"epoch": 0.042670592795510144,
"grad_norm": 0.34586021304130554,
"learning_rate": 4.786802956419202e-05,
"loss": 0.9186,
"step": 13100
},
{
"epoch": 0.0429963225115064,
"grad_norm": 0.5979019403457642,
"learning_rate": 4.7851742547875685e-05,
"loss": 0.9367,
"step": 13200
},
{
"epoch": 0.04332205222750266,
"grad_norm": 1.0827624797821045,
"learning_rate": 4.783545553155935e-05,
"loss": 0.9324,
"step": 13300
},
{
"epoch": 0.04364778194349892,
"grad_norm": 1.1920030117034912,
"learning_rate": 4.7819168515243024e-05,
"loss": 0.9367,
"step": 13400
},
{
"epoch": 0.04397351165949519,
"grad_norm": 0.6469812989234924,
"learning_rate": 4.780288149892669e-05,
"loss": 0.9815,
"step": 13500
},
{
"epoch": 0.04429924137549145,
"grad_norm": 0.8156530857086182,
"learning_rate": 4.7786594482610355e-05,
"loss": 0.9679,
"step": 13600
},
{
"epoch": 0.044624971091487706,
"grad_norm": 1.2997325658798218,
"learning_rate": 4.777030746629402e-05,
"loss": 0.9358,
"step": 13700
},
{
"epoch": 0.044950700807483965,
"grad_norm": 0.42360150814056396,
"learning_rate": 4.7754020449977687e-05,
"loss": 0.9326,
"step": 13800
},
{
"epoch": 0.045276430523480224,
"grad_norm": 0.7316247820854187,
"learning_rate": 4.773773343366136e-05,
"loss": 0.9283,
"step": 13900
},
{
"epoch": 0.04560216023947649,
"grad_norm": 0.5978175401687622,
"learning_rate": 4.7721446417345025e-05,
"loss": 0.9699,
"step": 14000
},
{
"epoch": 0.04592788995547275,
"grad_norm": 0.5278334617614746,
"learning_rate": 4.770515940102869e-05,
"loss": 0.99,
"step": 14100
},
{
"epoch": 0.04625361967146901,
"grad_norm": 0.7452822327613831,
"learning_rate": 4.7688872384712356e-05,
"loss": 0.8824,
"step": 14200
},
{
"epoch": 0.04657934938746527,
"grad_norm": 0.4158065617084503,
"learning_rate": 4.767258536839602e-05,
"loss": 0.9076,
"step": 14300
},
{
"epoch": 0.046905079103461526,
"grad_norm": 0.6929590106010437,
"learning_rate": 4.7656298352079694e-05,
"loss": 0.926,
"step": 14400
},
{
"epoch": 0.04723080881945779,
"grad_norm": 0.8249752521514893,
"learning_rate": 4.764001133576336e-05,
"loss": 0.9342,
"step": 14500
},
{
"epoch": 0.04755653853545405,
"grad_norm": 0.6523115038871765,
"learning_rate": 4.7623724319447026e-05,
"loss": 0.9312,
"step": 14600
},
{
"epoch": 0.04788226825145031,
"grad_norm": 0.7809571027755737,
"learning_rate": 4.760743730313069e-05,
"loss": 0.927,
"step": 14700
},
{
"epoch": 0.04820799796744657,
"grad_norm": 0.4370424747467041,
"learning_rate": 4.7591150286814364e-05,
"loss": 0.9275,
"step": 14800
},
{
"epoch": 0.048533727683442836,
"grad_norm": 0.8082228302955627,
"learning_rate": 4.757486327049803e-05,
"loss": 0.9524,
"step": 14900
},
{
"epoch": 0.048859457399439095,
"grad_norm": 0.7073273658752441,
"learning_rate": 4.755857625418169e-05,
"loss": 0.9069,
"step": 15000
},
{
"epoch": 0.049185187115435354,
"grad_norm": 0.9150802493095398,
"learning_rate": 4.754228923786536e-05,
"loss": 0.9669,
"step": 15100
},
{
"epoch": 0.04951091683143161,
"grad_norm": 0.6621295809745789,
"learning_rate": 4.752600222154903e-05,
"loss": 0.9117,
"step": 15200
},
{
"epoch": 0.04983664654742787,
"grad_norm": 1.1658425331115723,
"learning_rate": 4.75097152052327e-05,
"loss": 0.9061,
"step": 15300
},
{
"epoch": 0.05016237626342414,
"grad_norm": 1.1669522523880005,
"learning_rate": 4.749342818891636e-05,
"loss": 0.9625,
"step": 15400
},
{
"epoch": 0.0504881059794204,
"grad_norm": 0.6995384693145752,
"learning_rate": 4.747714117260003e-05,
"loss": 0.9098,
"step": 15500
},
{
"epoch": 0.050813835695416656,
"grad_norm": 0.5169076919555664,
"learning_rate": 4.74608541562837e-05,
"loss": 0.9243,
"step": 15600
},
{
"epoch": 0.051139565411412916,
"grad_norm": 0.33565372228622437,
"learning_rate": 4.744456713996736e-05,
"loss": 0.9375,
"step": 15700
},
{
"epoch": 0.05146529512740918,
"grad_norm": 0.4140024781227112,
"learning_rate": 4.742828012365103e-05,
"loss": 0.919,
"step": 15800
},
{
"epoch": 0.05179102484340544,
"grad_norm": 0.9499224424362183,
"learning_rate": 4.7411993107334694e-05,
"loss": 0.9034,
"step": 15900
},
{
"epoch": 0.0521167545594017,
"grad_norm": 0.8801336288452148,
"learning_rate": 4.7395706091018366e-05,
"loss": 0.881,
"step": 16000
},
{
"epoch": 0.05244248427539796,
"grad_norm": 0.7208696007728577,
"learning_rate": 4.737941907470203e-05,
"loss": 0.8518,
"step": 16100
},
{
"epoch": 0.05276821399139422,
"grad_norm": 0.5132054686546326,
"learning_rate": 4.73631320583857e-05,
"loss": 0.8933,
"step": 16200
},
{
"epoch": 0.053093943707390484,
"grad_norm": 0.6521860957145691,
"learning_rate": 4.7346845042069364e-05,
"loss": 0.9332,
"step": 16300
},
{
"epoch": 0.05341967342338674,
"grad_norm": 0.7121620178222656,
"learning_rate": 4.733055802575303e-05,
"loss": 0.9067,
"step": 16400
},
{
"epoch": 0.053745403139383,
"grad_norm": 0.5065134763717651,
"learning_rate": 4.73142710094367e-05,
"loss": 0.9062,
"step": 16500
},
{
"epoch": 0.05407113285537926,
"grad_norm": 0.5855521559715271,
"learning_rate": 4.729798399312037e-05,
"loss": 0.915,
"step": 16600
},
{
"epoch": 0.05439686257137553,
"grad_norm": 0.5392531156539917,
"learning_rate": 4.728169697680403e-05,
"loss": 0.9124,
"step": 16700
},
{
"epoch": 0.05472259228737179,
"grad_norm": 0.6617989540100098,
"learning_rate": 4.72654099604877e-05,
"loss": 0.8594,
"step": 16800
},
{
"epoch": 0.055048322003368046,
"grad_norm": 0.6459785103797913,
"learning_rate": 4.724912294417137e-05,
"loss": 0.9262,
"step": 16900
},
{
"epoch": 0.055374051719364305,
"grad_norm": 0.34565970301628113,
"learning_rate": 4.723283592785504e-05,
"loss": 0.8747,
"step": 17000
},
{
"epoch": 0.055699781435360564,
"grad_norm": 0.9510948061943054,
"learning_rate": 4.7216548911538696e-05,
"loss": 0.9027,
"step": 17100
},
{
"epoch": 0.05602551115135683,
"grad_norm": 0.577192485332489,
"learning_rate": 4.720026189522237e-05,
"loss": 0.9192,
"step": 17200
},
{
"epoch": 0.05635124086735309,
"grad_norm": 0.38653406500816345,
"learning_rate": 4.7183974878906034e-05,
"loss": 0.8759,
"step": 17300
},
{
"epoch": 0.05667697058334935,
"grad_norm": 0.6405381560325623,
"learning_rate": 4.716768786258971e-05,
"loss": 0.8486,
"step": 17400
},
{
"epoch": 0.05700270029934561,
"grad_norm": 0.6968704462051392,
"learning_rate": 4.7151400846273366e-05,
"loss": 0.903,
"step": 17500
},
{
"epoch": 0.057328430015341866,
"grad_norm": 0.8094695210456848,
"learning_rate": 4.713511382995704e-05,
"loss": 0.864,
"step": 17600
},
{
"epoch": 0.05765415973133813,
"grad_norm": 0.8325287103652954,
"learning_rate": 4.7118826813640704e-05,
"loss": 0.8886,
"step": 17700
},
{
"epoch": 0.05797988944733439,
"grad_norm": 0.5068339705467224,
"learning_rate": 4.710253979732437e-05,
"loss": 0.8767,
"step": 17800
},
{
"epoch": 0.05830561916333065,
"grad_norm": 0.7535611391067505,
"learning_rate": 4.7086252781008036e-05,
"loss": 0.8661,
"step": 17900
},
{
"epoch": 0.05863134887932691,
"grad_norm": 0.9104974865913391,
"learning_rate": 4.70699657646917e-05,
"loss": 0.8612,
"step": 18000
},
{
"epoch": 0.058957078595323176,
"grad_norm": 0.9106101989746094,
"learning_rate": 4.7053678748375374e-05,
"loss": 0.8885,
"step": 18100
},
{
"epoch": 0.059282808311319435,
"grad_norm": 0.9990994334220886,
"learning_rate": 4.703739173205904e-05,
"loss": 0.9097,
"step": 18200
},
{
"epoch": 0.059608538027315694,
"grad_norm": 0.6219133138656616,
"learning_rate": 4.7021104715742705e-05,
"loss": 0.8349,
"step": 18300
},
{
"epoch": 0.05993426774331195,
"grad_norm": 0.28884798288345337,
"learning_rate": 4.700481769942637e-05,
"loss": 0.8359,
"step": 18400
},
{
"epoch": 0.06025999745930821,
"grad_norm": 0.6142743229866028,
"learning_rate": 4.698853068311004e-05,
"loss": 0.8686,
"step": 18500
},
{
"epoch": 0.06058572717530448,
"grad_norm": 0.7121238708496094,
"learning_rate": 4.697224366679371e-05,
"loss": 0.8318,
"step": 18600
},
{
"epoch": 0.06091145689130074,
"grad_norm": 0.3502013683319092,
"learning_rate": 4.6955956650477375e-05,
"loss": 0.8353,
"step": 18700
},
{
"epoch": 0.061237186607296996,
"grad_norm": 0.869159460067749,
"learning_rate": 4.693966963416104e-05,
"loss": 0.8811,
"step": 18800
},
{
"epoch": 0.061562916323293256,
"grad_norm": 0.4008027911186218,
"learning_rate": 4.6923382617844706e-05,
"loss": 0.8595,
"step": 18900
},
{
"epoch": 0.06188864603928952,
"grad_norm": 0.6609760522842407,
"learning_rate": 4.690709560152838e-05,
"loss": 0.8591,
"step": 19000
},
{
"epoch": 0.06221437575528578,
"grad_norm": 0.41599878668785095,
"learning_rate": 4.6890808585212045e-05,
"loss": 0.8792,
"step": 19100
},
{
"epoch": 0.06254010547128204,
"grad_norm": 0.8219528794288635,
"learning_rate": 4.687452156889571e-05,
"loss": 0.8469,
"step": 19200
},
{
"epoch": 0.0628658351872783,
"grad_norm": 0.5383628010749817,
"learning_rate": 4.6858234552579376e-05,
"loss": 0.8619,
"step": 19300
},
{
"epoch": 0.06319156490327456,
"grad_norm": 1.0892442464828491,
"learning_rate": 4.684194753626304e-05,
"loss": 0.8219,
"step": 19400
},
{
"epoch": 0.06351729461927082,
"grad_norm": 0.7258702516555786,
"learning_rate": 4.6825660519946714e-05,
"loss": 0.8243,
"step": 19500
},
{
"epoch": 0.06384302433526708,
"grad_norm": 1.2622634172439575,
"learning_rate": 4.680937350363038e-05,
"loss": 0.8619,
"step": 19600
},
{
"epoch": 0.06416875405126335,
"grad_norm": 0.3901592195034027,
"learning_rate": 4.6793086487314046e-05,
"loss": 0.8315,
"step": 19700
},
{
"epoch": 0.06449448376725961,
"grad_norm": 0.5976518392562866,
"learning_rate": 4.677679947099771e-05,
"loss": 0.8193,
"step": 19800
},
{
"epoch": 0.06482021348325587,
"grad_norm": 1.0668984651565552,
"learning_rate": 4.676051245468138e-05,
"loss": 0.8381,
"step": 19900
},
{
"epoch": 0.06514594319925213,
"grad_norm": 0.6844903826713562,
"learning_rate": 4.674422543836505e-05,
"loss": 0.8202,
"step": 20000
},
{
"epoch": 0.06547167291524839,
"grad_norm": 0.6987929344177246,
"learning_rate": 4.672793842204871e-05,
"loss": 0.844,
"step": 20100
},
{
"epoch": 0.06579740263124464,
"grad_norm": 1.0227413177490234,
"learning_rate": 4.671165140573238e-05,
"loss": 0.8093,
"step": 20200
},
{
"epoch": 0.0661231323472409,
"grad_norm": 0.5901645421981812,
"learning_rate": 4.669536438941605e-05,
"loss": 0.8068,
"step": 20300
},
{
"epoch": 0.06644886206323716,
"grad_norm": 0.7951213717460632,
"learning_rate": 4.667907737309972e-05,
"loss": 0.8581,
"step": 20400
},
{
"epoch": 0.06677459177923342,
"grad_norm": 0.617341160774231,
"learning_rate": 4.666279035678338e-05,
"loss": 0.8427,
"step": 20500
},
{
"epoch": 0.0671003214952297,
"grad_norm": 0.694558322429657,
"learning_rate": 4.6646503340467044e-05,
"loss": 0.8619,
"step": 20600
},
{
"epoch": 0.06742605121122595,
"grad_norm": 0.6441329717636108,
"learning_rate": 4.663021632415072e-05,
"loss": 0.8866,
"step": 20700
},
{
"epoch": 0.06775178092722221,
"grad_norm": 0.46440285444259644,
"learning_rate": 4.661392930783438e-05,
"loss": 0.8435,
"step": 20800
},
{
"epoch": 0.06807751064321847,
"grad_norm": 0.42911046743392944,
"learning_rate": 4.659764229151805e-05,
"loss": 0.8145,
"step": 20900
},
{
"epoch": 0.06840324035921473,
"grad_norm": 0.7508918046951294,
"learning_rate": 4.6581355275201714e-05,
"loss": 0.8576,
"step": 21000
},
{
"epoch": 0.06872897007521099,
"grad_norm": 0.6361901164054871,
"learning_rate": 4.6565068258885386e-05,
"loss": 0.7982,
"step": 21100
},
{
"epoch": 0.06905469979120725,
"grad_norm": 0.804426372051239,
"learning_rate": 4.654878124256905e-05,
"loss": 0.8386,
"step": 21200
},
{
"epoch": 0.06938042950720351,
"grad_norm": 0.5336636304855347,
"learning_rate": 4.653249422625272e-05,
"loss": 0.8296,
"step": 21300
},
{
"epoch": 0.06970615922319977,
"grad_norm": 0.5880811810493469,
"learning_rate": 4.6516207209936384e-05,
"loss": 0.8065,
"step": 21400
},
{
"epoch": 0.07003188893919603,
"grad_norm": 0.4607875347137451,
"learning_rate": 4.649992019362005e-05,
"loss": 0.8601,
"step": 21500
},
{
"epoch": 0.0703576186551923,
"grad_norm": 0.6503331065177917,
"learning_rate": 4.648363317730372e-05,
"loss": 0.7925,
"step": 21600
},
{
"epoch": 0.07068334837118856,
"grad_norm": 0.7841913104057312,
"learning_rate": 4.646734616098739e-05,
"loss": 0.8218,
"step": 21700
},
{
"epoch": 0.07100907808718482,
"grad_norm": 0.45437848567962646,
"learning_rate": 4.645105914467105e-05,
"loss": 0.8663,
"step": 21800
},
{
"epoch": 0.07133480780318108,
"grad_norm": 0.6052650213241577,
"learning_rate": 4.643477212835472e-05,
"loss": 0.8634,
"step": 21900
},
{
"epoch": 0.07166053751917734,
"grad_norm": 0.5301306247711182,
"learning_rate": 4.641848511203839e-05,
"loss": 0.8215,
"step": 22000
},
{
"epoch": 0.0719862672351736,
"grad_norm": 0.8724095821380615,
"learning_rate": 4.640219809572206e-05,
"loss": 0.8304,
"step": 22100
},
{
"epoch": 0.07231199695116985,
"grad_norm": 0.8219661116600037,
"learning_rate": 4.6385911079405716e-05,
"loss": 0.8515,
"step": 22200
},
{
"epoch": 0.07263772666716611,
"grad_norm": 0.6308414936065674,
"learning_rate": 4.636962406308939e-05,
"loss": 0.7233,
"step": 22300
},
{
"epoch": 0.07296345638316237,
"grad_norm": 0.35772112011909485,
"learning_rate": 4.6353337046773054e-05,
"loss": 0.7792,
"step": 22400
},
{
"epoch": 0.07328918609915865,
"grad_norm": 0.519975483417511,
"learning_rate": 4.633705003045673e-05,
"loss": 0.8265,
"step": 22500
},
{
"epoch": 0.0736149158151549,
"grad_norm": 0.8935458660125732,
"learning_rate": 4.6320763014140386e-05,
"loss": 0.8276,
"step": 22600
},
{
"epoch": 0.07394064553115116,
"grad_norm": 0.4765929877758026,
"learning_rate": 4.630447599782406e-05,
"loss": 0.8088,
"step": 22700
},
{
"epoch": 0.07426637524714742,
"grad_norm": 0.5910876989364624,
"learning_rate": 4.6288188981507724e-05,
"loss": 0.8003,
"step": 22800
},
{
"epoch": 0.07459210496314368,
"grad_norm": 0.6108260154724121,
"learning_rate": 4.627190196519139e-05,
"loss": 0.7949,
"step": 22900
},
{
"epoch": 0.07491783467913994,
"grad_norm": 0.9665610194206238,
"learning_rate": 4.625561494887506e-05,
"loss": 0.7989,
"step": 23000
},
{
"epoch": 0.0752435643951362,
"grad_norm": 0.43020346760749817,
"learning_rate": 4.623932793255872e-05,
"loss": 0.8052,
"step": 23100
},
{
"epoch": 0.07556929411113246,
"grad_norm": 0.3901965022087097,
"learning_rate": 4.6223040916242394e-05,
"loss": 0.7756,
"step": 23200
},
{
"epoch": 0.07589502382712872,
"grad_norm": 0.8132317066192627,
"learning_rate": 4.620675389992606e-05,
"loss": 0.797,
"step": 23300
},
{
"epoch": 0.07622075354312499,
"grad_norm": 0.6211370825767517,
"learning_rate": 4.619046688360973e-05,
"loss": 0.7698,
"step": 23400
},
{
"epoch": 0.07654648325912125,
"grad_norm": 0.8378313779830933,
"learning_rate": 4.617417986729339e-05,
"loss": 0.805,
"step": 23500
},
{
"epoch": 0.07687221297511751,
"grad_norm": 0.9225132465362549,
"learning_rate": 4.615789285097706e-05,
"loss": 0.7999,
"step": 23600
},
{
"epoch": 0.07719794269111377,
"grad_norm": 0.46878713369369507,
"learning_rate": 4.614160583466073e-05,
"loss": 0.75,
"step": 23700
},
{
"epoch": 0.07752367240711003,
"grad_norm": 0.409138560295105,
"learning_rate": 4.6125318818344395e-05,
"loss": 0.7944,
"step": 23800
},
{
"epoch": 0.07784940212310629,
"grad_norm": 0.4791303277015686,
"learning_rate": 4.610903180202806e-05,
"loss": 0.7912,
"step": 23900
},
{
"epoch": 0.07817513183910255,
"grad_norm": 0.8759014010429382,
"learning_rate": 4.6092744785711726e-05,
"loss": 0.8198,
"step": 24000
},
{
"epoch": 0.0785008615550988,
"grad_norm": 0.47595012187957764,
"learning_rate": 4.60764577693954e-05,
"loss": 0.7984,
"step": 24100
},
{
"epoch": 0.07882659127109506,
"grad_norm": 0.7923133373260498,
"learning_rate": 4.6060170753079065e-05,
"loss": 0.7436,
"step": 24200
},
{
"epoch": 0.07915232098709134,
"grad_norm": 0.39254361391067505,
"learning_rate": 4.604388373676273e-05,
"loss": 0.7771,
"step": 24300
},
{
"epoch": 0.0794780507030876,
"grad_norm": 0.6828033924102783,
"learning_rate": 4.6027596720446396e-05,
"loss": 0.8083,
"step": 24400
},
{
"epoch": 0.07980378041908386,
"grad_norm": 0.6189585328102112,
"learning_rate": 4.601130970413006e-05,
"loss": 0.7885,
"step": 24500
},
{
"epoch": 0.08012951013508011,
"grad_norm": 0.6750975847244263,
"learning_rate": 4.5995022687813734e-05,
"loss": 0.759,
"step": 24600
},
{
"epoch": 0.08045523985107637,
"grad_norm": 0.6616020798683167,
"learning_rate": 4.59787356714974e-05,
"loss": 0.8226,
"step": 24700
},
{
"epoch": 0.08078096956707263,
"grad_norm": 0.7598117589950562,
"learning_rate": 4.5962448655181066e-05,
"loss": 0.7806,
"step": 24800
},
{
"epoch": 0.08110669928306889,
"grad_norm": 0.41183263063430786,
"learning_rate": 4.594616163886473e-05,
"loss": 0.7939,
"step": 24900
},
{
"epoch": 0.08143242899906515,
"grad_norm": 0.40911582112312317,
"learning_rate": 4.59298746225484e-05,
"loss": 0.7635,
"step": 25000
},
{
"epoch": 0.08175815871506141,
"grad_norm": 0.8820083737373352,
"learning_rate": 4.591358760623207e-05,
"loss": 0.7886,
"step": 25100
},
{
"epoch": 0.08208388843105768,
"grad_norm": 0.9055482745170593,
"learning_rate": 4.589730058991573e-05,
"loss": 0.7487,
"step": 25200
},
{
"epoch": 0.08240961814705394,
"grad_norm": 0.5680561065673828,
"learning_rate": 4.58810135735994e-05,
"loss": 0.7505,
"step": 25300
},
{
"epoch": 0.0827353478630502,
"grad_norm": 0.5064377188682556,
"learning_rate": 4.586472655728307e-05,
"loss": 0.768,
"step": 25400
},
{
"epoch": 0.08306107757904646,
"grad_norm": 0.462200403213501,
"learning_rate": 4.584843954096674e-05,
"loss": 0.7399,
"step": 25500
},
{
"epoch": 0.08338680729504272,
"grad_norm": 0.7820500731468201,
"learning_rate": 4.58321525246504e-05,
"loss": 0.8109,
"step": 25600
},
{
"epoch": 0.08371253701103898,
"grad_norm": 0.4833464026451111,
"learning_rate": 4.5815865508334064e-05,
"loss": 0.764,
"step": 25700
},
{
"epoch": 0.08403826672703524,
"grad_norm": 0.3821680247783661,
"learning_rate": 4.5799578492017737e-05,
"loss": 0.7397,
"step": 25800
},
{
"epoch": 0.0843639964430315,
"grad_norm": 0.5084909200668335,
"learning_rate": 4.57832914757014e-05,
"loss": 0.7428,
"step": 25900
},
{
"epoch": 0.08468972615902776,
"grad_norm": 0.925619900226593,
"learning_rate": 4.576700445938507e-05,
"loss": 0.7386,
"step": 26000
},
{
"epoch": 0.08501545587502403,
"grad_norm": 0.8126088380813599,
"learning_rate": 4.5750717443068734e-05,
"loss": 0.7798,
"step": 26100
},
{
"epoch": 0.08534118559102029,
"grad_norm": 1.0178046226501465,
"learning_rate": 4.5734430426752406e-05,
"loss": 0.7796,
"step": 26200
},
{
"epoch": 0.08566691530701655,
"grad_norm": 0.4879295229911804,
"learning_rate": 4.571814341043607e-05,
"loss": 0.7762,
"step": 26300
},
{
"epoch": 0.0859926450230128,
"grad_norm": 0.6722548604011536,
"learning_rate": 4.570185639411974e-05,
"loss": 0.7234,
"step": 26400
},
{
"epoch": 0.08631837473900907,
"grad_norm": 0.6326486468315125,
"learning_rate": 4.5685569377803403e-05,
"loss": 0.72,
"step": 26500
},
{
"epoch": 0.08664410445500532,
"grad_norm": 0.4354076087474823,
"learning_rate": 4.566928236148707e-05,
"loss": 0.7704,
"step": 26600
},
{
"epoch": 0.08696983417100158,
"grad_norm": 0.7113054394721985,
"learning_rate": 4.565299534517074e-05,
"loss": 0.7623,
"step": 26700
},
{
"epoch": 0.08729556388699784,
"grad_norm": 0.595664381980896,
"learning_rate": 4.563670832885441e-05,
"loss": 0.765,
"step": 26800
},
{
"epoch": 0.0876212936029941,
"grad_norm": 0.5344740152359009,
"learning_rate": 4.562042131253807e-05,
"loss": 0.7201,
"step": 26900
},
{
"epoch": 0.08794702331899037,
"grad_norm": 0.5330939292907715,
"learning_rate": 4.560413429622174e-05,
"loss": 0.7617,
"step": 27000
},
{
"epoch": 0.08827275303498663,
"grad_norm": 0.45265939831733704,
"learning_rate": 4.5587847279905405e-05,
"loss": 0.7806,
"step": 27100
},
{
"epoch": 0.0885984827509829,
"grad_norm": 0.5947338342666626,
"learning_rate": 4.557156026358908e-05,
"loss": 0.7524,
"step": 27200
},
{
"epoch": 0.08892421246697915,
"grad_norm": 0.8656592965126038,
"learning_rate": 4.555527324727274e-05,
"loss": 0.7599,
"step": 27300
},
{
"epoch": 0.08924994218297541,
"grad_norm": 0.645728349685669,
"learning_rate": 4.553898623095641e-05,
"loss": 0.7629,
"step": 27400
},
{
"epoch": 0.08957567189897167,
"grad_norm": 0.8474392890930176,
"learning_rate": 4.5522699214640074e-05,
"loss": 0.7641,
"step": 27500
},
{
"epoch": 0.08990140161496793,
"grad_norm": 0.7386724948883057,
"learning_rate": 4.550641219832375e-05,
"loss": 0.7523,
"step": 27600
},
{
"epoch": 0.09022713133096419,
"grad_norm": 0.9216130971908569,
"learning_rate": 4.549012518200741e-05,
"loss": 0.7562,
"step": 27700
},
{
"epoch": 0.09055286104696045,
"grad_norm": 0.8789349794387817,
"learning_rate": 4.547383816569107e-05,
"loss": 0.7229,
"step": 27800
},
{
"epoch": 0.0908785907629567,
"grad_norm": 0.582091748714447,
"learning_rate": 4.5457551149374744e-05,
"loss": 0.7274,
"step": 27900
},
{
"epoch": 0.09120432047895298,
"grad_norm": 0.6011328101158142,
"learning_rate": 4.544126413305841e-05,
"loss": 0.7297,
"step": 28000
},
{
"epoch": 0.09153005019494924,
"grad_norm": 0.6041598916053772,
"learning_rate": 4.542497711674208e-05,
"loss": 0.7409,
"step": 28100
},
{
"epoch": 0.0918557799109455,
"grad_norm": 0.7190874814987183,
"learning_rate": 4.540869010042574e-05,
"loss": 0.7149,
"step": 28200
},
{
"epoch": 0.09218150962694176,
"grad_norm": 0.5705780982971191,
"learning_rate": 4.5392403084109414e-05,
"loss": 0.76,
"step": 28300
},
{
"epoch": 0.09250723934293802,
"grad_norm": 0.7988401651382446,
"learning_rate": 4.537611606779308e-05,
"loss": 0.7594,
"step": 28400
},
{
"epoch": 0.09283296905893428,
"grad_norm": 0.48971208930015564,
"learning_rate": 4.5359829051476745e-05,
"loss": 0.7505,
"step": 28500
},
{
"epoch": 0.09315869877493053,
"grad_norm": 0.6600379347801208,
"learning_rate": 4.534354203516041e-05,
"loss": 0.7902,
"step": 28600
},
{
"epoch": 0.0934844284909268,
"grad_norm": 0.6095920205116272,
"learning_rate": 4.5327255018844077e-05,
"loss": 0.7166,
"step": 28700
},
{
"epoch": 0.09381015820692305,
"grad_norm": 0.6808424592018127,
"learning_rate": 4.531096800252775e-05,
"loss": 0.7148,
"step": 28800
},
{
"epoch": 0.09413588792291933,
"grad_norm": 0.9923068284988403,
"learning_rate": 4.5294680986211415e-05,
"loss": 0.7226,
"step": 28900
},
{
"epoch": 0.09446161763891558,
"grad_norm": 0.8952274918556213,
"learning_rate": 4.527839396989508e-05,
"loss": 0.7645,
"step": 29000
},
{
"epoch": 0.09478734735491184,
"grad_norm": 0.7416999936103821,
"learning_rate": 4.5262106953578746e-05,
"loss": 0.7503,
"step": 29100
},
{
"epoch": 0.0951130770709081,
"grad_norm": 0.7862002849578857,
"learning_rate": 4.524581993726242e-05,
"loss": 0.7469,
"step": 29200
},
{
"epoch": 0.09543880678690436,
"grad_norm": 0.6296769380569458,
"learning_rate": 4.5229532920946085e-05,
"loss": 0.6873,
"step": 29300
},
{
"epoch": 0.09576453650290062,
"grad_norm": 0.9056894779205322,
"learning_rate": 4.521324590462975e-05,
"loss": 0.7126,
"step": 29400
},
{
"epoch": 0.09609026621889688,
"grad_norm": 0.624724268913269,
"learning_rate": 4.5196958888313416e-05,
"loss": 0.7668,
"step": 29500
},
{
"epoch": 0.09641599593489314,
"grad_norm": 0.680957555770874,
"learning_rate": 4.518067187199708e-05,
"loss": 0.7783,
"step": 29600
},
{
"epoch": 0.0967417256508894,
"grad_norm": 0.5778472423553467,
"learning_rate": 4.5164384855680754e-05,
"loss": 0.7355,
"step": 29700
},
{
"epoch": 0.09706745536688567,
"grad_norm": 0.6346442699432373,
"learning_rate": 4.514809783936442e-05,
"loss": 0.7276,
"step": 29800
},
{
"epoch": 0.09739318508288193,
"grad_norm": 0.9289300441741943,
"learning_rate": 4.5131810823048086e-05,
"loss": 0.7179,
"step": 29900
},
{
"epoch": 0.09771891479887819,
"grad_norm": 0.7473464012145996,
"learning_rate": 4.511552380673175e-05,
"loss": 0.7172,
"step": 30000
},
{
"epoch": 0.09804464451487445,
"grad_norm": 0.6801792979240417,
"learning_rate": 4.509923679041542e-05,
"loss": 0.7074,
"step": 30100
},
{
"epoch": 0.09837037423087071,
"grad_norm": 0.6129624247550964,
"learning_rate": 4.508294977409909e-05,
"loss": 0.7166,
"step": 30200
},
{
"epoch": 0.09869610394686697,
"grad_norm": 0.8195613026618958,
"learning_rate": 4.506666275778275e-05,
"loss": 0.7709,
"step": 30300
},
{
"epoch": 0.09902183366286323,
"grad_norm": 0.4703550934791565,
"learning_rate": 4.505037574146642e-05,
"loss": 0.7037,
"step": 30400
},
{
"epoch": 0.09934756337885949,
"grad_norm": 0.7674877047538757,
"learning_rate": 4.503408872515009e-05,
"loss": 0.7202,
"step": 30500
},
{
"epoch": 0.09967329309485574,
"grad_norm": 0.8670388460159302,
"learning_rate": 4.501780170883376e-05,
"loss": 0.7183,
"step": 30600
},
{
"epoch": 0.09999902281085202,
"grad_norm": 0.280652791261673,
"learning_rate": 4.500151469251742e-05,
"loss": 0.6998,
"step": 30700
},
{
"epoch": 0.10032475252684828,
"grad_norm": 0.7346746325492859,
"learning_rate": 4.4985227676201084e-05,
"loss": 0.7358,
"step": 30800
},
{
"epoch": 0.10065048224284454,
"grad_norm": 0.978670060634613,
"learning_rate": 4.4968940659884756e-05,
"loss": 0.7259,
"step": 30900
},
{
"epoch": 0.1009762119588408,
"grad_norm": 0.5910704135894775,
"learning_rate": 4.495265364356842e-05,
"loss": 0.7074,
"step": 31000
},
{
"epoch": 0.10130194167483705,
"grad_norm": 0.7966532707214355,
"learning_rate": 4.493636662725209e-05,
"loss": 0.7117,
"step": 31100
},
{
"epoch": 0.10162767139083331,
"grad_norm": 0.9344640374183655,
"learning_rate": 4.4920079610935754e-05,
"loss": 0.7349,
"step": 31200
},
{
"epoch": 0.10195340110682957,
"grad_norm": 0.8043787479400635,
"learning_rate": 4.4903792594619426e-05,
"loss": 0.7361,
"step": 31300
},
{
"epoch": 0.10227913082282583,
"grad_norm": 0.6786687970161438,
"learning_rate": 4.488750557830309e-05,
"loss": 0.6969,
"step": 31400
},
{
"epoch": 0.10260486053882209,
"grad_norm": 0.4679253399372101,
"learning_rate": 4.487121856198676e-05,
"loss": 0.7157,
"step": 31500
},
{
"epoch": 0.10293059025481836,
"grad_norm": 0.5903817415237427,
"learning_rate": 4.485493154567042e-05,
"loss": 0.7352,
"step": 31600
},
{
"epoch": 0.10325631997081462,
"grad_norm": 0.715834379196167,
"learning_rate": 4.483864452935409e-05,
"loss": 0.7532,
"step": 31700
},
{
"epoch": 0.10358204968681088,
"grad_norm": 0.6664106249809265,
"learning_rate": 4.482235751303776e-05,
"loss": 0.6853,
"step": 31800
},
{
"epoch": 0.10390777940280714,
"grad_norm": 0.700243353843689,
"learning_rate": 4.480607049672143e-05,
"loss": 0.6835,
"step": 31900
},
{
"epoch": 0.1042335091188034,
"grad_norm": 0.7481942772865295,
"learning_rate": 4.478978348040509e-05,
"loss": 0.7343,
"step": 32000
},
{
"epoch": 0.10455923883479966,
"grad_norm": 0.5347774028778076,
"learning_rate": 4.477349646408876e-05,
"loss": 0.6688,
"step": 32100
},
{
"epoch": 0.10488496855079592,
"grad_norm": 0.541346549987793,
"learning_rate": 4.4757209447772425e-05,
"loss": 0.7088,
"step": 32200
},
{
"epoch": 0.10521069826679218,
"grad_norm": 0.6126936674118042,
"learning_rate": 4.47409224314561e-05,
"loss": 0.7333,
"step": 32300
},
{
"epoch": 0.10553642798278844,
"grad_norm": 0.952684760093689,
"learning_rate": 4.472463541513976e-05,
"loss": 0.7242,
"step": 32400
},
{
"epoch": 0.10586215769878471,
"grad_norm": 0.72658771276474,
"learning_rate": 4.470834839882343e-05,
"loss": 0.7422,
"step": 32500
},
{
"epoch": 0.10618788741478097,
"grad_norm": 0.5741873383522034,
"learning_rate": 4.4692061382507094e-05,
"loss": 0.7307,
"step": 32600
},
{
"epoch": 0.10651361713077723,
"grad_norm": 0.646496057510376,
"learning_rate": 4.467577436619077e-05,
"loss": 0.7138,
"step": 32700
},
{
"epoch": 0.10683934684677349,
"grad_norm": 0.40007448196411133,
"learning_rate": 4.465948734987443e-05,
"loss": 0.7045,
"step": 32800
},
{
"epoch": 0.10716507656276975,
"grad_norm": 0.6594932675361633,
"learning_rate": 4.464320033355809e-05,
"loss": 0.6874,
"step": 32900
},
{
"epoch": 0.107490806278766,
"grad_norm": 0.7663995623588562,
"learning_rate": 4.4626913317241764e-05,
"loss": 0.7303,
"step": 33000
},
{
"epoch": 0.10781653599476226,
"grad_norm": 0.5867152810096741,
"learning_rate": 4.461062630092543e-05,
"loss": 0.7072,
"step": 33100
},
{
"epoch": 0.10814226571075852,
"grad_norm": 0.5017038583755493,
"learning_rate": 4.45943392846091e-05,
"loss": 0.6879,
"step": 33200
},
{
"epoch": 0.10846799542675478,
"grad_norm": 0.6196131110191345,
"learning_rate": 4.457805226829276e-05,
"loss": 0.7094,
"step": 33300
},
{
"epoch": 0.10879372514275105,
"grad_norm": 0.643118679523468,
"learning_rate": 4.4561765251976434e-05,
"loss": 0.6763,
"step": 33400
},
{
"epoch": 0.10911945485874731,
"grad_norm": 0.516583263874054,
"learning_rate": 4.45454782356601e-05,
"loss": 0.6744,
"step": 33500
},
{
"epoch": 0.10944518457474357,
"grad_norm": 0.6565887928009033,
"learning_rate": 4.4529191219343765e-05,
"loss": 0.6818,
"step": 33600
},
{
"epoch": 0.10977091429073983,
"grad_norm": 0.644209623336792,
"learning_rate": 4.451290420302743e-05,
"loss": 0.6795,
"step": 33700
},
{
"epoch": 0.11009664400673609,
"grad_norm": 0.5720322132110596,
"learning_rate": 4.4496617186711096e-05,
"loss": 0.6444,
"step": 33800
},
{
"epoch": 0.11042237372273235,
"grad_norm": 0.7580476999282837,
"learning_rate": 4.448033017039477e-05,
"loss": 0.7067,
"step": 33900
},
{
"epoch": 0.11074810343872861,
"grad_norm": 0.3334468603134155,
"learning_rate": 4.4464043154078435e-05,
"loss": 0.7245,
"step": 34000
},
{
"epoch": 0.11107383315472487,
"grad_norm": 0.7232679724693298,
"learning_rate": 4.44477561377621e-05,
"loss": 0.6476,
"step": 34100
},
{
"epoch": 0.11139956287072113,
"grad_norm": 0.49447712302207947,
"learning_rate": 4.4431469121445766e-05,
"loss": 0.6813,
"step": 34200
},
{
"epoch": 0.11172529258671739,
"grad_norm": 0.9112755656242371,
"learning_rate": 4.441518210512943e-05,
"loss": 0.7039,
"step": 34300
},
{
"epoch": 0.11205102230271366,
"grad_norm": 0.9391865134239197,
"learning_rate": 4.4398895088813104e-05,
"loss": 0.7154,
"step": 34400
},
{
"epoch": 0.11237675201870992,
"grad_norm": 0.6869890689849854,
"learning_rate": 4.438260807249677e-05,
"loss": 0.7462,
"step": 34500
},
{
"epoch": 0.11270248173470618,
"grad_norm": 0.6954273581504822,
"learning_rate": 4.4366321056180436e-05,
"loss": 0.7151,
"step": 34600
},
{
"epoch": 0.11302821145070244,
"grad_norm": 0.8512132167816162,
"learning_rate": 4.43500340398641e-05,
"loss": 0.7157,
"step": 34700
},
{
"epoch": 0.1133539411666987,
"grad_norm": 0.7044045329093933,
"learning_rate": 4.4333747023547774e-05,
"loss": 0.6649,
"step": 34800
},
{
"epoch": 0.11367967088269496,
"grad_norm": 0.6773298978805542,
"learning_rate": 4.431746000723144e-05,
"loss": 0.6137,
"step": 34900
},
{
"epoch": 0.11400540059869121,
"grad_norm": 0.544491171836853,
"learning_rate": 4.43011729909151e-05,
"loss": 0.6577,
"step": 35000
},
{
"epoch": 0.11433113031468747,
"grad_norm": 0.543596625328064,
"learning_rate": 4.428488597459877e-05,
"loss": 0.6699,
"step": 35100
},
{
"epoch": 0.11465686003068373,
"grad_norm": 0.7878594398498535,
"learning_rate": 4.426859895828244e-05,
"loss": 0.709,
"step": 35200
},
{
"epoch": 0.11498258974668,
"grad_norm": 0.8226998448371887,
"learning_rate": 4.425231194196611e-05,
"loss": 0.6954,
"step": 35300
},
{
"epoch": 0.11530831946267626,
"grad_norm": 0.48608875274658203,
"learning_rate": 4.423602492564977e-05,
"loss": 0.7502,
"step": 35400
},
{
"epoch": 0.11563404917867252,
"grad_norm": 0.6490182280540466,
"learning_rate": 4.421973790933344e-05,
"loss": 0.7085,
"step": 35500
},
{
"epoch": 0.11595977889466878,
"grad_norm": 0.3032003343105316,
"learning_rate": 4.420345089301711e-05,
"loss": 0.6778,
"step": 35600
},
{
"epoch": 0.11628550861066504,
"grad_norm": 0.7003344297409058,
"learning_rate": 4.418716387670077e-05,
"loss": 0.71,
"step": 35700
},
{
"epoch": 0.1166112383266613,
"grad_norm": 0.6569785475730896,
"learning_rate": 4.417087686038444e-05,
"loss": 0.653,
"step": 35800
},
{
"epoch": 0.11693696804265756,
"grad_norm": 0.5428867936134338,
"learning_rate": 4.4154589844068104e-05,
"loss": 0.6733,
"step": 35900
},
{
"epoch": 0.11726269775865382,
"grad_norm": 0.6179760098457336,
"learning_rate": 4.4138302827751776e-05,
"loss": 0.7081,
"step": 36000
},
{
"epoch": 0.11758842747465008,
"grad_norm": 0.7397803068161011,
"learning_rate": 4.412201581143544e-05,
"loss": 0.6894,
"step": 36100
},
{
"epoch": 0.11791415719064635,
"grad_norm": 0.725395679473877,
"learning_rate": 4.410572879511911e-05,
"loss": 0.6874,
"step": 36200
},
{
"epoch": 0.11823988690664261,
"grad_norm": 0.45658519864082336,
"learning_rate": 4.4089441778802774e-05,
"loss": 0.6821,
"step": 36300
},
{
"epoch": 0.11856561662263887,
"grad_norm": 0.9002487063407898,
"learning_rate": 4.407315476248644e-05,
"loss": 0.641,
"step": 36400
},
{
"epoch": 0.11889134633863513,
"grad_norm": 0.8738647103309631,
"learning_rate": 4.405686774617011e-05,
"loss": 0.6763,
"step": 36500
},
{
"epoch": 0.11921707605463139,
"grad_norm": 1.0051002502441406,
"learning_rate": 4.404058072985378e-05,
"loss": 0.6775,
"step": 36600
},
{
"epoch": 0.11954280577062765,
"grad_norm": 0.8074469566345215,
"learning_rate": 4.402429371353744e-05,
"loss": 0.7408,
"step": 36700
},
{
"epoch": 0.1198685354866239,
"grad_norm": 0.485388845205307,
"learning_rate": 4.400800669722111e-05,
"loss": 0.6729,
"step": 36800
},
{
"epoch": 0.12019426520262017,
"grad_norm": 0.7123886942863464,
"learning_rate": 4.399171968090478e-05,
"loss": 0.661,
"step": 36900
},
{
"epoch": 0.12051999491861642,
"grad_norm": 0.4587586522102356,
"learning_rate": 4.397543266458845e-05,
"loss": 0.6662,
"step": 37000
},
{
"epoch": 0.1208457246346127,
"grad_norm": 0.7726449966430664,
"learning_rate": 4.395914564827211e-05,
"loss": 0.7469,
"step": 37100
},
{
"epoch": 0.12117145435060896,
"grad_norm": 0.8636273741722107,
"learning_rate": 4.394285863195578e-05,
"loss": 0.6669,
"step": 37200
},
{
"epoch": 0.12149718406660522,
"grad_norm": 0.6817033886909485,
"learning_rate": 4.3926571615639444e-05,
"loss": 0.6874,
"step": 37300
},
{
"epoch": 0.12182291378260147,
"grad_norm": 0.5549355149269104,
"learning_rate": 4.391028459932312e-05,
"loss": 0.6939,
"step": 37400
},
{
"epoch": 0.12214864349859773,
"grad_norm": 0.6180316805839539,
"learning_rate": 4.389399758300678e-05,
"loss": 0.6299,
"step": 37500
},
{
"epoch": 0.12247437321459399,
"grad_norm": 0.7779985070228577,
"learning_rate": 4.387771056669045e-05,
"loss": 0.7181,
"step": 37600
},
{
"epoch": 0.12280010293059025,
"grad_norm": 0.7182669043540955,
"learning_rate": 4.3861423550374114e-05,
"loss": 0.6703,
"step": 37700
},
{
"epoch": 0.12312583264658651,
"grad_norm": 0.7191387414932251,
"learning_rate": 4.3845136534057787e-05,
"loss": 0.6802,
"step": 37800
},
{
"epoch": 0.12345156236258277,
"grad_norm": 0.6137369275093079,
"learning_rate": 4.382884951774145e-05,
"loss": 0.7028,
"step": 37900
},
{
"epoch": 0.12377729207857904,
"grad_norm": 0.7508791089057922,
"learning_rate": 4.381256250142511e-05,
"loss": 0.642,
"step": 38000
},
{
"epoch": 0.1241030217945753,
"grad_norm": 0.6414891481399536,
"learning_rate": 4.3796275485108784e-05,
"loss": 0.6255,
"step": 38100
},
{
"epoch": 0.12442875151057156,
"grad_norm": 0.6669697165489197,
"learning_rate": 4.377998846879245e-05,
"loss": 0.6691,
"step": 38200
},
{
"epoch": 0.12475448122656782,
"grad_norm": 0.8991898894309998,
"learning_rate": 4.376370145247612e-05,
"loss": 0.6727,
"step": 38300
},
{
"epoch": 0.12508021094256408,
"grad_norm": 0.4924679398536682,
"learning_rate": 4.374741443615978e-05,
"loss": 0.6661,
"step": 38400
},
{
"epoch": 0.12540594065856034,
"grad_norm": 0.3712103068828583,
"learning_rate": 4.3731127419843453e-05,
"loss": 0.7306,
"step": 38500
},
{
"epoch": 0.1257316703745566,
"grad_norm": 0.9136518836021423,
"learning_rate": 4.371484040352712e-05,
"loss": 0.6453,
"step": 38600
},
{
"epoch": 0.12605740009055286,
"grad_norm": 0.6828204393386841,
"learning_rate": 4.3698553387210785e-05,
"loss": 0.6587,
"step": 38700
},
{
"epoch": 0.12638312980654912,
"grad_norm": 0.6366333961486816,
"learning_rate": 4.368226637089445e-05,
"loss": 0.6606,
"step": 38800
},
{
"epoch": 0.12670885952254538,
"grad_norm": 0.39375558495521545,
"learning_rate": 4.3665979354578116e-05,
"loss": 0.6937,
"step": 38900
},
{
"epoch": 0.12703458923854163,
"grad_norm": 0.46293410658836365,
"learning_rate": 4.364969233826179e-05,
"loss": 0.6504,
"step": 39000
},
{
"epoch": 0.1273603189545379,
"grad_norm": 0.9897958040237427,
"learning_rate": 4.3633405321945455e-05,
"loss": 0.7126,
"step": 39100
},
{
"epoch": 0.12768604867053415,
"grad_norm": 0.5616987347602844,
"learning_rate": 4.361711830562912e-05,
"loss": 0.5956,
"step": 39200
},
{
"epoch": 0.1280117783865304,
"grad_norm": 0.4081191122531891,
"learning_rate": 4.3600831289312786e-05,
"loss": 0.6648,
"step": 39300
},
{
"epoch": 0.1283375081025267,
"grad_norm": 0.485188364982605,
"learning_rate": 4.358454427299645e-05,
"loss": 0.6694,
"step": 39400
},
{
"epoch": 0.12866323781852296,
"grad_norm": 0.7212422490119934,
"learning_rate": 4.3568257256680124e-05,
"loss": 0.6767,
"step": 39500
},
{
"epoch": 0.12898896753451922,
"grad_norm": 0.5502139925956726,
"learning_rate": 4.355197024036379e-05,
"loss": 0.6721,
"step": 39600
},
{
"epoch": 0.12931469725051548,
"grad_norm": 0.49975594878196716,
"learning_rate": 4.3535683224047456e-05,
"loss": 0.6669,
"step": 39700
},
{
"epoch": 0.12964042696651173,
"grad_norm": 0.4203544557094574,
"learning_rate": 4.351939620773112e-05,
"loss": 0.6716,
"step": 39800
},
{
"epoch": 0.129966156682508,
"grad_norm": 0.5464275479316711,
"learning_rate": 4.3503109191414794e-05,
"loss": 0.6544,
"step": 39900
},
{
"epoch": 0.13029188639850425,
"grad_norm": 0.6473097801208496,
"learning_rate": 4.348682217509846e-05,
"loss": 0.6977,
"step": 40000
},
{
"epoch": 0.1306176161145005,
"grad_norm": 0.39890334010124207,
"learning_rate": 4.347053515878212e-05,
"loss": 0.6704,
"step": 40100
},
{
"epoch": 0.13094334583049677,
"grad_norm": 1.0785876512527466,
"learning_rate": 4.345424814246579e-05,
"loss": 0.6196,
"step": 40200
},
{
"epoch": 0.13126907554649303,
"grad_norm": 0.6607077121734619,
"learning_rate": 4.343796112614946e-05,
"loss": 0.6608,
"step": 40300
},
{
"epoch": 0.1315948052624893,
"grad_norm": 0.5987501740455627,
"learning_rate": 4.342167410983313e-05,
"loss": 0.6334,
"step": 40400
},
{
"epoch": 0.13192053497848555,
"grad_norm": 0.3443163335323334,
"learning_rate": 4.340538709351679e-05,
"loss": 0.6621,
"step": 40500
},
{
"epoch": 0.1322462646944818,
"grad_norm": 0.9362694025039673,
"learning_rate": 4.338910007720046e-05,
"loss": 0.6404,
"step": 40600
},
{
"epoch": 0.13257199441047807,
"grad_norm": 0.5049243569374084,
"learning_rate": 4.3372813060884127e-05,
"loss": 0.6426,
"step": 40700
},
{
"epoch": 0.13289772412647433,
"grad_norm": 0.787389874458313,
"learning_rate": 4.335652604456779e-05,
"loss": 0.6432,
"step": 40800
},
{
"epoch": 0.13322345384247058,
"grad_norm": 0.8065658211708069,
"learning_rate": 4.334023902825146e-05,
"loss": 0.6477,
"step": 40900
},
{
"epoch": 0.13354918355846684,
"grad_norm": 0.5166397094726562,
"learning_rate": 4.3323952011935124e-05,
"loss": 0.6384,
"step": 41000
},
{
"epoch": 0.1338749132744631,
"grad_norm": 0.9597229957580566,
"learning_rate": 4.3307664995618796e-05,
"loss": 0.6832,
"step": 41100
},
{
"epoch": 0.1342006429904594,
"grad_norm": 0.5936517715454102,
"learning_rate": 4.329137797930246e-05,
"loss": 0.6767,
"step": 41200
},
{
"epoch": 0.13452637270645565,
"grad_norm": 0.8391766548156738,
"learning_rate": 4.3275090962986135e-05,
"loss": 0.6215,
"step": 41300
},
{
"epoch": 0.1348521024224519,
"grad_norm": 0.977497398853302,
"learning_rate": 4.3258803946669793e-05,
"loss": 0.6307,
"step": 41400
},
{
"epoch": 0.13517783213844817,
"grad_norm": 0.6750873923301697,
"learning_rate": 4.324251693035346e-05,
"loss": 0.631,
"step": 41500
},
{
"epoch": 0.13550356185444443,
"grad_norm": 0.4655423164367676,
"learning_rate": 4.322622991403713e-05,
"loss": 0.7025,
"step": 41600
},
{
"epoch": 0.13582929157044069,
"grad_norm": 0.43544334173202515,
"learning_rate": 4.32099428977208e-05,
"loss": 0.6555,
"step": 41700
},
{
"epoch": 0.13615502128643694,
"grad_norm": 0.7595189213752747,
"learning_rate": 4.319365588140446e-05,
"loss": 0.6197,
"step": 41800
},
{
"epoch": 0.1364807510024332,
"grad_norm": 0.4422534108161926,
"learning_rate": 4.317736886508813e-05,
"loss": 0.5798,
"step": 41900
},
{
"epoch": 0.13680648071842946,
"grad_norm": 0.4622032344341278,
"learning_rate": 4.31610818487718e-05,
"loss": 0.6493,
"step": 42000
},
{
"epoch": 0.13713221043442572,
"grad_norm": 0.7267939448356628,
"learning_rate": 4.314479483245547e-05,
"loss": 0.6228,
"step": 42100
},
{
"epoch": 0.13745794015042198,
"grad_norm": 0.66838139295578,
"learning_rate": 4.312850781613913e-05,
"loss": 0.6507,
"step": 42200
},
{
"epoch": 0.13778366986641824,
"grad_norm": 0.40865644812583923,
"learning_rate": 4.31122207998228e-05,
"loss": 0.6388,
"step": 42300
},
{
"epoch": 0.1381093995824145,
"grad_norm": 0.7203364968299866,
"learning_rate": 4.3095933783506464e-05,
"loss": 0.589,
"step": 42400
},
{
"epoch": 0.13843512929841076,
"grad_norm": 0.7719990015029907,
"learning_rate": 4.307964676719014e-05,
"loss": 0.6446,
"step": 42500
},
{
"epoch": 0.13876085901440702,
"grad_norm": 0.35780540108680725,
"learning_rate": 4.30633597508738e-05,
"loss": 0.683,
"step": 42600
},
{
"epoch": 0.13908658873040328,
"grad_norm": 0.5952534675598145,
"learning_rate": 4.304707273455747e-05,
"loss": 0.6697,
"step": 42700
},
{
"epoch": 0.13941231844639954,
"grad_norm": 0.539117157459259,
"learning_rate": 4.3030785718241134e-05,
"loss": 0.6582,
"step": 42800
},
{
"epoch": 0.1397380481623958,
"grad_norm": 0.8181525468826294,
"learning_rate": 4.30144987019248e-05,
"loss": 0.6695,
"step": 42900
},
{
"epoch": 0.14006377787839205,
"grad_norm": 0.8720047473907471,
"learning_rate": 4.299821168560847e-05,
"loss": 0.5931,
"step": 43000
},
{
"epoch": 0.14038950759438834,
"grad_norm": 0.9138098955154419,
"learning_rate": 4.298192466929213e-05,
"loss": 0.6874,
"step": 43100
},
{
"epoch": 0.1407152373103846,
"grad_norm": 0.8015493750572205,
"learning_rate": 4.2965637652975804e-05,
"loss": 0.6574,
"step": 43200
},
{
"epoch": 0.14104096702638086,
"grad_norm": 0.8426867723464966,
"learning_rate": 4.294935063665947e-05,
"loss": 0.6662,
"step": 43300
},
{
"epoch": 0.14136669674237712,
"grad_norm": 0.3480939567089081,
"learning_rate": 4.293306362034314e-05,
"loss": 0.6351,
"step": 43400
},
{
"epoch": 0.14169242645837338,
"grad_norm": 0.5666735172271729,
"learning_rate": 4.29167766040268e-05,
"loss": 0.641,
"step": 43500
},
{
"epoch": 0.14201815617436964,
"grad_norm": 0.9445961117744446,
"learning_rate": 4.2900489587710467e-05,
"loss": 0.6608,
"step": 43600
},
{
"epoch": 0.1423438858903659,
"grad_norm": 0.7916907072067261,
"learning_rate": 4.288420257139414e-05,
"loss": 0.6615,
"step": 43700
},
{
"epoch": 0.14266961560636215,
"grad_norm": 0.9159532785415649,
"learning_rate": 4.2867915555077805e-05,
"loss": 0.5919,
"step": 43800
},
{
"epoch": 0.1429953453223584,
"grad_norm": 0.5766249895095825,
"learning_rate": 4.285162853876147e-05,
"loss": 0.6724,
"step": 43900
},
{
"epoch": 0.14332107503835467,
"grad_norm": 0.753519594669342,
"learning_rate": 4.2835341522445136e-05,
"loss": 0.6995,
"step": 44000
},
{
"epoch": 0.14364680475435093,
"grad_norm": 1.1004271507263184,
"learning_rate": 4.281905450612881e-05,
"loss": 0.6636,
"step": 44100
},
{
"epoch": 0.1439725344703472,
"grad_norm": 0.7064334154129028,
"learning_rate": 4.2802767489812475e-05,
"loss": 0.6793,
"step": 44200
},
{
"epoch": 0.14429826418634345,
"grad_norm": 0.5158839225769043,
"learning_rate": 4.278648047349614e-05,
"loss": 0.6336,
"step": 44300
},
{
"epoch": 0.1446239939023397,
"grad_norm": 1.0451433658599854,
"learning_rate": 4.2770193457179806e-05,
"loss": 0.6227,
"step": 44400
},
{
"epoch": 0.14494972361833597,
"grad_norm": 0.5956864356994629,
"learning_rate": 4.275390644086347e-05,
"loss": 0.6517,
"step": 44500
},
{
"epoch": 0.14527545333433223,
"grad_norm": 0.9525729417800903,
"learning_rate": 4.2737619424547144e-05,
"loss": 0.6245,
"step": 44600
},
{
"epoch": 0.1456011830503285,
"grad_norm": 0.7456961274147034,
"learning_rate": 4.272133240823081e-05,
"loss": 0.6577,
"step": 44700
},
{
"epoch": 0.14592691276632475,
"grad_norm": 0.5686585307121277,
"learning_rate": 4.2705045391914476e-05,
"loss": 0.6675,
"step": 44800
},
{
"epoch": 0.14625264248232103,
"grad_norm": 0.5127500295639038,
"learning_rate": 4.268875837559814e-05,
"loss": 0.5966,
"step": 44900
},
{
"epoch": 0.1465783721983173,
"grad_norm": 0.6099263429641724,
"learning_rate": 4.267247135928181e-05,
"loss": 0.6259,
"step": 45000
},
{
"epoch": 0.14690410191431355,
"grad_norm": 0.5734119415283203,
"learning_rate": 4.265618434296548e-05,
"loss": 0.6251,
"step": 45100
},
{
"epoch": 0.1472298316303098,
"grad_norm": 0.40758875012397766,
"learning_rate": 4.263989732664914e-05,
"loss": 0.5856,
"step": 45200
},
{
"epoch": 0.14755556134630607,
"grad_norm": 0.5974459052085876,
"learning_rate": 4.262361031033281e-05,
"loss": 0.6443,
"step": 45300
},
{
"epoch": 0.14788129106230233,
"grad_norm": 0.48085859417915344,
"learning_rate": 4.260732329401648e-05,
"loss": 0.6612,
"step": 45400
},
{
"epoch": 0.1482070207782986,
"grad_norm": 0.5771530270576477,
"learning_rate": 4.259103627770015e-05,
"loss": 0.6272,
"step": 45500
},
{
"epoch": 0.14853275049429485,
"grad_norm": 0.8463455438613892,
"learning_rate": 4.2574749261383815e-05,
"loss": 0.6008,
"step": 45600
},
{
"epoch": 0.1488584802102911,
"grad_norm": 0.7014292478561401,
"learning_rate": 4.255846224506748e-05,
"loss": 0.5353,
"step": 45700
},
{
"epoch": 0.14918420992628736,
"grad_norm": 0.6181588768959045,
"learning_rate": 4.2542175228751146e-05,
"loss": 0.6139,
"step": 45800
},
{
"epoch": 0.14950993964228362,
"grad_norm": 0.6540141701698303,
"learning_rate": 4.252588821243481e-05,
"loss": 0.5997,
"step": 45900
},
{
"epoch": 0.14983566935827988,
"grad_norm": 0.47981733083724976,
"learning_rate": 4.2509601196118485e-05,
"loss": 0.6511,
"step": 46000
},
{
"epoch": 0.15016139907427614,
"grad_norm": 0.964857816696167,
"learning_rate": 4.2493314179802144e-05,
"loss": 0.6365,
"step": 46100
},
{
"epoch": 0.1504871287902724,
"grad_norm": 0.6706714034080505,
"learning_rate": 4.2477027163485816e-05,
"loss": 0.664,
"step": 46200
},
{
"epoch": 0.15081285850626866,
"grad_norm": 0.5073367953300476,
"learning_rate": 4.246074014716948e-05,
"loss": 0.5633,
"step": 46300
},
{
"epoch": 0.15113858822226492,
"grad_norm": 0.37114378809928894,
"learning_rate": 4.2444453130853154e-05,
"loss": 0.6498,
"step": 46400
},
{
"epoch": 0.15146431793826118,
"grad_norm": 1.153325080871582,
"learning_rate": 4.242816611453681e-05,
"loss": 0.6254,
"step": 46500
},
{
"epoch": 0.15179004765425744,
"grad_norm": 0.7353873252868652,
"learning_rate": 4.241187909822048e-05,
"loss": 0.6573,
"step": 46600
},
{
"epoch": 0.15211577737025372,
"grad_norm": 0.5379579067230225,
"learning_rate": 4.239559208190415e-05,
"loss": 0.6642,
"step": 46700
},
{
"epoch": 0.15244150708624998,
"grad_norm": 0.341907799243927,
"learning_rate": 4.237930506558782e-05,
"loss": 0.6294,
"step": 46800
},
{
"epoch": 0.15276723680224624,
"grad_norm": 0.3866462707519531,
"learning_rate": 4.236301804927148e-05,
"loss": 0.6212,
"step": 46900
},
{
"epoch": 0.1530929665182425,
"grad_norm": 0.6686252951622009,
"learning_rate": 4.234673103295515e-05,
"loss": 0.64,
"step": 47000
},
{
"epoch": 0.15341869623423876,
"grad_norm": 0.6398385167121887,
"learning_rate": 4.233044401663882e-05,
"loss": 0.6156,
"step": 47100
},
{
"epoch": 0.15374442595023502,
"grad_norm": 0.8679475784301758,
"learning_rate": 4.231415700032249e-05,
"loss": 0.6492,
"step": 47200
},
{
"epoch": 0.15407015566623128,
"grad_norm": 0.6425623297691345,
"learning_rate": 4.229786998400615e-05,
"loss": 0.6661,
"step": 47300
},
{
"epoch": 0.15439588538222754,
"grad_norm": 0.7811526656150818,
"learning_rate": 4.228158296768982e-05,
"loss": 0.6416,
"step": 47400
},
{
"epoch": 0.1547216150982238,
"grad_norm": 0.6820793747901917,
"learning_rate": 4.2265295951373484e-05,
"loss": 0.6426,
"step": 47500
},
{
"epoch": 0.15504734481422006,
"grad_norm": 0.8748511672019958,
"learning_rate": 4.224900893505716e-05,
"loss": 0.6038,
"step": 47600
},
{
"epoch": 0.15537307453021632,
"grad_norm": 0.6828723549842834,
"learning_rate": 4.223272191874082e-05,
"loss": 0.6408,
"step": 47700
},
{
"epoch": 0.15569880424621257,
"grad_norm": 1.01051926612854,
"learning_rate": 4.221643490242449e-05,
"loss": 0.6218,
"step": 47800
},
{
"epoch": 0.15602453396220883,
"grad_norm": 0.6920143961906433,
"learning_rate": 4.2200147886108154e-05,
"loss": 0.63,
"step": 47900
},
{
"epoch": 0.1563502636782051,
"grad_norm": 0.6410394310951233,
"learning_rate": 4.218386086979182e-05,
"loss": 0.6176,
"step": 48000
},
{
"epoch": 0.15667599339420135,
"grad_norm": 0.5157743692398071,
"learning_rate": 4.216757385347549e-05,
"loss": 0.5947,
"step": 48100
},
{
"epoch": 0.1570017231101976,
"grad_norm": 0.6770983934402466,
"learning_rate": 4.215128683715915e-05,
"loss": 0.6192,
"step": 48200
},
{
"epoch": 0.15732745282619387,
"grad_norm": 0.49714550375938416,
"learning_rate": 4.2134999820842824e-05,
"loss": 0.6121,
"step": 48300
},
{
"epoch": 0.15765318254219013,
"grad_norm": 0.3486001789569855,
"learning_rate": 4.211871280452649e-05,
"loss": 0.5821,
"step": 48400
},
{
"epoch": 0.15797891225818642,
"grad_norm": 0.4202999770641327,
"learning_rate": 4.210242578821016e-05,
"loss": 0.5909,
"step": 48500
},
{
"epoch": 0.15830464197418267,
"grad_norm": 0.44769522547721863,
"learning_rate": 4.208613877189382e-05,
"loss": 0.6369,
"step": 48600
},
{
"epoch": 0.15863037169017893,
"grad_norm": 0.6501901745796204,
"learning_rate": 4.2069851755577486e-05,
"loss": 0.6187,
"step": 48700
},
{
"epoch": 0.1589561014061752,
"grad_norm": 0.8261470794677734,
"learning_rate": 4.205356473926116e-05,
"loss": 0.6136,
"step": 48800
},
{
"epoch": 0.15928183112217145,
"grad_norm": 0.9979439973831177,
"learning_rate": 4.2037277722944825e-05,
"loss": 0.623,
"step": 48900
},
{
"epoch": 0.1596075608381677,
"grad_norm": 0.5651659369468689,
"learning_rate": 4.202099070662849e-05,
"loss": 0.6742,
"step": 49000
},
{
"epoch": 0.15993329055416397,
"grad_norm": 0.7412470579147339,
"learning_rate": 4.2004703690312156e-05,
"loss": 0.6272,
"step": 49100
},
{
"epoch": 0.16025902027016023,
"grad_norm": 0.43271690607070923,
"learning_rate": 4.198841667399583e-05,
"loss": 0.5729,
"step": 49200
},
{
"epoch": 0.1605847499861565,
"grad_norm": 0.5117851495742798,
"learning_rate": 4.1972129657679494e-05,
"loss": 0.6156,
"step": 49300
},
{
"epoch": 0.16091047970215275,
"grad_norm": 0.7106539011001587,
"learning_rate": 4.195584264136316e-05,
"loss": 0.6052,
"step": 49400
},
{
"epoch": 0.161236209418149,
"grad_norm": 0.6146919131278992,
"learning_rate": 4.1939555625046826e-05,
"loss": 0.5932,
"step": 49500
},
{
"epoch": 0.16156193913414527,
"grad_norm": 0.49088531732559204,
"learning_rate": 4.192326860873049e-05,
"loss": 0.568,
"step": 49600
},
{
"epoch": 0.16188766885014153,
"grad_norm": 0.9923317432403564,
"learning_rate": 4.1906981592414164e-05,
"loss": 0.596,
"step": 49700
},
{
"epoch": 0.16221339856613778,
"grad_norm": 0.3995937705039978,
"learning_rate": 4.189069457609783e-05,
"loss": 0.6442,
"step": 49800
},
{
"epoch": 0.16253912828213404,
"grad_norm": 0.5258984565734863,
"learning_rate": 4.1874407559781496e-05,
"loss": 0.5601,
"step": 49900
},
{
"epoch": 0.1628648579981303,
"grad_norm": 0.19585928320884705,
"learning_rate": 4.185812054346516e-05,
"loss": 0.6509,
"step": 50000
},
{
"epoch": 0.16319058771412656,
"grad_norm": 0.625548243522644,
"learning_rate": 4.184183352714883e-05,
"loss": 0.6411,
"step": 50100
},
{
"epoch": 0.16351631743012282,
"grad_norm": 0.7014303207397461,
"learning_rate": 4.18255465108325e-05,
"loss": 0.6125,
"step": 50200
},
{
"epoch": 0.16384204714611908,
"grad_norm": 0.5523779988288879,
"learning_rate": 4.1809259494516165e-05,
"loss": 0.5811,
"step": 50300
},
{
"epoch": 0.16416777686211537,
"grad_norm": 0.5742841958999634,
"learning_rate": 4.179297247819983e-05,
"loss": 0.6282,
"step": 50400
},
{
"epoch": 0.16449350657811163,
"grad_norm": 0.5776492357254028,
"learning_rate": 4.17766854618835e-05,
"loss": 0.6622,
"step": 50500
},
{
"epoch": 0.16481923629410788,
"grad_norm": 0.7464694380760193,
"learning_rate": 4.176039844556717e-05,
"loss": 0.6309,
"step": 50600
},
{
"epoch": 0.16514496601010414,
"grad_norm": 0.5271546244621277,
"learning_rate": 4.1744111429250835e-05,
"loss": 0.645,
"step": 50700
},
{
"epoch": 0.1654706957261004,
"grad_norm": 0.6904231905937195,
"learning_rate": 4.1727824412934494e-05,
"loss": 0.5927,
"step": 50800
},
{
"epoch": 0.16579642544209666,
"grad_norm": 0.578195333480835,
"learning_rate": 4.1711537396618166e-05,
"loss": 0.5812,
"step": 50900
},
{
"epoch": 0.16612215515809292,
"grad_norm": 0.8716936707496643,
"learning_rate": 4.169525038030183e-05,
"loss": 0.6261,
"step": 51000
},
{
"epoch": 0.16644788487408918,
"grad_norm": 0.6577697992324829,
"learning_rate": 4.1678963363985505e-05,
"loss": 0.6101,
"step": 51100
},
{
"epoch": 0.16677361459008544,
"grad_norm": 0.7431929111480713,
"learning_rate": 4.1662676347669164e-05,
"loss": 0.6227,
"step": 51200
},
{
"epoch": 0.1670993443060817,
"grad_norm": 0.9198315739631653,
"learning_rate": 4.1646389331352836e-05,
"loss": 0.6399,
"step": 51300
},
{
"epoch": 0.16742507402207796,
"grad_norm": 0.5159572958946228,
"learning_rate": 4.16301023150365e-05,
"loss": 0.6329,
"step": 51400
},
{
"epoch": 0.16775080373807422,
"grad_norm": 0.7744697332382202,
"learning_rate": 4.161381529872017e-05,
"loss": 0.5579,
"step": 51500
},
{
"epoch": 0.16807653345407048,
"grad_norm": 0.4429173767566681,
"learning_rate": 4.159752828240383e-05,
"loss": 0.5786,
"step": 51600
},
{
"epoch": 0.16840226317006673,
"grad_norm": 0.7796801924705505,
"learning_rate": 4.15812412660875e-05,
"loss": 0.6353,
"step": 51700
},
{
"epoch": 0.168727992886063,
"grad_norm": 0.43117523193359375,
"learning_rate": 4.156495424977117e-05,
"loss": 0.5807,
"step": 51800
},
{
"epoch": 0.16905372260205925,
"grad_norm": 0.44315412640571594,
"learning_rate": 4.154866723345484e-05,
"loss": 0.5979,
"step": 51900
},
{
"epoch": 0.1693794523180555,
"grad_norm": 0.4306319057941437,
"learning_rate": 4.15323802171385e-05,
"loss": 0.6498,
"step": 52000
},
{
"epoch": 0.16970518203405177,
"grad_norm": 0.283033549785614,
"learning_rate": 4.151609320082217e-05,
"loss": 0.6329,
"step": 52100
},
{
"epoch": 0.17003091175004806,
"grad_norm": 0.4118421673774719,
"learning_rate": 4.1499806184505834e-05,
"loss": 0.5933,
"step": 52200
},
{
"epoch": 0.17035664146604432,
"grad_norm": 0.9130700826644897,
"learning_rate": 4.148351916818951e-05,
"loss": 0.5349,
"step": 52300
},
{
"epoch": 0.17068237118204058,
"grad_norm": 0.33348548412323,
"learning_rate": 4.146723215187317e-05,
"loss": 0.6182,
"step": 52400
},
{
"epoch": 0.17100810089803684,
"grad_norm": 0.6642253398895264,
"learning_rate": 4.145094513555684e-05,
"loss": 0.5989,
"step": 52500
},
{
"epoch": 0.1713338306140331,
"grad_norm": 0.7113855481147766,
"learning_rate": 4.1434658119240504e-05,
"loss": 0.6063,
"step": 52600
},
{
"epoch": 0.17165956033002935,
"grad_norm": 1.0840643644332886,
"learning_rate": 4.1418371102924177e-05,
"loss": 0.615,
"step": 52700
},
{
"epoch": 0.1719852900460256,
"grad_norm": 0.5277838706970215,
"learning_rate": 4.140208408660784e-05,
"loss": 0.6234,
"step": 52800
},
{
"epoch": 0.17231101976202187,
"grad_norm": 0.5993104577064514,
"learning_rate": 4.13857970702915e-05,
"loss": 0.5905,
"step": 52900
},
{
"epoch": 0.17263674947801813,
"grad_norm": 0.7363581657409668,
"learning_rate": 4.1369510053975174e-05,
"loss": 0.6032,
"step": 53000
},
{
"epoch": 0.1729624791940144,
"grad_norm": 0.6299027800559998,
"learning_rate": 4.135322303765884e-05,
"loss": 0.5717,
"step": 53100
},
{
"epoch": 0.17328820891001065,
"grad_norm": 0.49232372641563416,
"learning_rate": 4.133693602134251e-05,
"loss": 0.6031,
"step": 53200
},
{
"epoch": 0.1736139386260069,
"grad_norm": 0.7371428608894348,
"learning_rate": 4.132064900502617e-05,
"loss": 0.5608,
"step": 53300
},
{
"epoch": 0.17393966834200317,
"grad_norm": 1.0730559825897217,
"learning_rate": 4.1304361988709843e-05,
"loss": 0.6026,
"step": 53400
},
{
"epoch": 0.17426539805799943,
"grad_norm": 0.674548327922821,
"learning_rate": 4.128807497239351e-05,
"loss": 0.5721,
"step": 53500
},
{
"epoch": 0.17459112777399569,
"grad_norm": 0.5990965962409973,
"learning_rate": 4.1271787956077175e-05,
"loss": 0.6185,
"step": 53600
},
{
"epoch": 0.17491685748999194,
"grad_norm": 0.61868816614151,
"learning_rate": 4.125550093976084e-05,
"loss": 0.6089,
"step": 53700
},
{
"epoch": 0.1752425872059882,
"grad_norm": 0.4897661507129669,
"learning_rate": 4.1239213923444506e-05,
"loss": 0.6025,
"step": 53800
},
{
"epoch": 0.17556831692198446,
"grad_norm": 0.2856525480747223,
"learning_rate": 4.122292690712818e-05,
"loss": 0.5609,
"step": 53900
},
{
"epoch": 0.17589404663798075,
"grad_norm": 0.5488519668579102,
"learning_rate": 4.1206639890811845e-05,
"loss": 0.5781,
"step": 54000
},
{
"epoch": 0.176219776353977,
"grad_norm": 0.7812597155570984,
"learning_rate": 4.119035287449551e-05,
"loss": 0.665,
"step": 54100
},
{
"epoch": 0.17654550606997327,
"grad_norm": 0.5567785501480103,
"learning_rate": 4.1174065858179176e-05,
"loss": 0.6178,
"step": 54200
},
{
"epoch": 0.17687123578596953,
"grad_norm": 0.7302952408790588,
"learning_rate": 4.115777884186285e-05,
"loss": 0.5912,
"step": 54300
},
{
"epoch": 0.1771969655019658,
"grad_norm": 0.6872962713241577,
"learning_rate": 4.1141491825546514e-05,
"loss": 0.5698,
"step": 54400
},
{
"epoch": 0.17752269521796205,
"grad_norm": 0.6139744520187378,
"learning_rate": 4.112520480923018e-05,
"loss": 0.6148,
"step": 54500
},
{
"epoch": 0.1778484249339583,
"grad_norm": 0.6646268367767334,
"learning_rate": 4.1108917792913846e-05,
"loss": 0.5222,
"step": 54600
},
{
"epoch": 0.17817415464995456,
"grad_norm": 0.4842844009399414,
"learning_rate": 4.109263077659751e-05,
"loss": 0.6225,
"step": 54700
},
{
"epoch": 0.17849988436595082,
"grad_norm": 0.6158716082572937,
"learning_rate": 4.1076343760281184e-05,
"loss": 0.634,
"step": 54800
},
{
"epoch": 0.17882561408194708,
"grad_norm": 0.5122677683830261,
"learning_rate": 4.106005674396485e-05,
"loss": 0.6355,
"step": 54900
},
{
"epoch": 0.17915134379794334,
"grad_norm": 0.6086121201515198,
"learning_rate": 4.1043769727648515e-05,
"loss": 0.5787,
"step": 55000
},
{
"epoch": 0.1794770735139396,
"grad_norm": 0.5853461623191833,
"learning_rate": 4.102748271133218e-05,
"loss": 0.5935,
"step": 55100
},
{
"epoch": 0.17980280322993586,
"grad_norm": 0.9216148853302002,
"learning_rate": 4.101119569501585e-05,
"loss": 0.575,
"step": 55200
},
{
"epoch": 0.18012853294593212,
"grad_norm": 0.6602348685264587,
"learning_rate": 4.099490867869952e-05,
"loss": 0.6324,
"step": 55300
},
{
"epoch": 0.18045426266192838,
"grad_norm": 0.7494210004806519,
"learning_rate": 4.0978621662383185e-05,
"loss": 0.5859,
"step": 55400
},
{
"epoch": 0.18077999237792464,
"grad_norm": 0.6391832232475281,
"learning_rate": 4.096233464606685e-05,
"loss": 0.6172,
"step": 55500
},
{
"epoch": 0.1811057220939209,
"grad_norm": 0.5824201107025146,
"learning_rate": 4.0946047629750517e-05,
"loss": 0.6298,
"step": 55600
},
{
"epoch": 0.18143145180991715,
"grad_norm": 0.6924212574958801,
"learning_rate": 4.092976061343419e-05,
"loss": 0.6105,
"step": 55700
},
{
"epoch": 0.1817571815259134,
"grad_norm": 0.4423877000808716,
"learning_rate": 4.0913473597117855e-05,
"loss": 0.5613,
"step": 55800
},
{
"epoch": 0.1820829112419097,
"grad_norm": 0.6090314984321594,
"learning_rate": 4.0897186580801514e-05,
"loss": 0.6643,
"step": 55900
},
{
"epoch": 0.18240864095790596,
"grad_norm": 0.7554407119750977,
"learning_rate": 4.0880899564485186e-05,
"loss": 0.6017,
"step": 56000
},
{
"epoch": 0.18273437067390222,
"grad_norm": 0.8148972988128662,
"learning_rate": 4.086461254816885e-05,
"loss": 0.6539,
"step": 56100
},
{
"epoch": 0.18306010038989848,
"grad_norm": 0.5610066652297974,
"learning_rate": 4.0848325531852525e-05,
"loss": 0.5872,
"step": 56200
},
{
"epoch": 0.18338583010589474,
"grad_norm": 0.6361645460128784,
"learning_rate": 4.0832038515536183e-05,
"loss": 0.5815,
"step": 56300
},
{
"epoch": 0.183711559821891,
"grad_norm": 0.4567771553993225,
"learning_rate": 4.0815751499219856e-05,
"loss": 0.5799,
"step": 56400
},
{
"epoch": 0.18403728953788726,
"grad_norm": 0.8705578446388245,
"learning_rate": 4.079946448290352e-05,
"loss": 0.6088,
"step": 56500
},
{
"epoch": 0.18436301925388351,
"grad_norm": 0.8278294801712036,
"learning_rate": 4.078317746658719e-05,
"loss": 0.6064,
"step": 56600
},
{
"epoch": 0.18468874896987977,
"grad_norm": 0.38864201307296753,
"learning_rate": 4.076689045027085e-05,
"loss": 0.5705,
"step": 56700
},
{
"epoch": 0.18501447868587603,
"grad_norm": 0.6986147165298462,
"learning_rate": 4.075060343395452e-05,
"loss": 0.6071,
"step": 56800
},
{
"epoch": 0.1853402084018723,
"grad_norm": 0.9127377867698669,
"learning_rate": 4.073431641763819e-05,
"loss": 0.608,
"step": 56900
},
{
"epoch": 0.18566593811786855,
"grad_norm": 0.5072229504585266,
"learning_rate": 4.071802940132186e-05,
"loss": 0.583,
"step": 57000
},
{
"epoch": 0.1859916678338648,
"grad_norm": 0.47545337677001953,
"learning_rate": 4.070174238500552e-05,
"loss": 0.5826,
"step": 57100
},
{
"epoch": 0.18631739754986107,
"grad_norm": 0.5175743103027344,
"learning_rate": 4.068545536868919e-05,
"loss": 0.6184,
"step": 57200
},
{
"epoch": 0.18664312726585733,
"grad_norm": 0.7252177596092224,
"learning_rate": 4.0669168352372854e-05,
"loss": 0.6042,
"step": 57300
},
{
"epoch": 0.1869688569818536,
"grad_norm": 0.21297673881053925,
"learning_rate": 4.065288133605653e-05,
"loss": 0.5874,
"step": 57400
},
{
"epoch": 0.18729458669784985,
"grad_norm": 0.6985592246055603,
"learning_rate": 4.063659431974019e-05,
"loss": 0.5641,
"step": 57500
},
{
"epoch": 0.1876203164138461,
"grad_norm": 0.35783612728118896,
"learning_rate": 4.062030730342386e-05,
"loss": 0.5743,
"step": 57600
},
{
"epoch": 0.1879460461298424,
"grad_norm": 0.40871796011924744,
"learning_rate": 4.0604020287107524e-05,
"loss": 0.6418,
"step": 57700
},
{
"epoch": 0.18827177584583865,
"grad_norm": 0.6412025094032288,
"learning_rate": 4.0587733270791197e-05,
"loss": 0.6048,
"step": 57800
},
{
"epoch": 0.1885975055618349,
"grad_norm": 0.6944416165351868,
"learning_rate": 4.057144625447486e-05,
"loss": 0.5647,
"step": 57900
},
{
"epoch": 0.18892323527783117,
"grad_norm": 0.8592963218688965,
"learning_rate": 4.055515923815852e-05,
"loss": 0.5703,
"step": 58000
},
{
"epoch": 0.18924896499382743,
"grad_norm": 0.7240419983863831,
"learning_rate": 4.0538872221842194e-05,
"loss": 0.6025,
"step": 58100
},
{
"epoch": 0.1895746947098237,
"grad_norm": 0.3861270546913147,
"learning_rate": 4.052258520552586e-05,
"loss": 0.5864,
"step": 58200
},
{
"epoch": 0.18990042442581995,
"grad_norm": 0.6718447208404541,
"learning_rate": 4.050629818920953e-05,
"loss": 0.6139,
"step": 58300
},
{
"epoch": 0.1902261541418162,
"grad_norm": 0.7049744129180908,
"learning_rate": 4.049001117289319e-05,
"loss": 0.5697,
"step": 58400
},
{
"epoch": 0.19055188385781247,
"grad_norm": 0.39576876163482666,
"learning_rate": 4.047372415657686e-05,
"loss": 0.5987,
"step": 58500
},
{
"epoch": 0.19087761357380872,
"grad_norm": 0.7814981341362,
"learning_rate": 4.045743714026053e-05,
"loss": 0.5715,
"step": 58600
},
{
"epoch": 0.19120334328980498,
"grad_norm": 1.0083011388778687,
"learning_rate": 4.0441150123944195e-05,
"loss": 0.6355,
"step": 58700
},
{
"epoch": 0.19152907300580124,
"grad_norm": 0.7083866596221924,
"learning_rate": 4.042486310762786e-05,
"loss": 0.6666,
"step": 58800
},
{
"epoch": 0.1918548027217975,
"grad_norm": 0.4740765690803528,
"learning_rate": 4.0408576091311526e-05,
"loss": 0.5773,
"step": 58900
},
{
"epoch": 0.19218053243779376,
"grad_norm": 0.3599790632724762,
"learning_rate": 4.03922890749952e-05,
"loss": 0.5916,
"step": 59000
},
{
"epoch": 0.19250626215379002,
"grad_norm": 0.6107310652732849,
"learning_rate": 4.0376002058678865e-05,
"loss": 0.63,
"step": 59100
},
{
"epoch": 0.19283199186978628,
"grad_norm": 0.6388813257217407,
"learning_rate": 4.035971504236253e-05,
"loss": 0.6197,
"step": 59200
},
{
"epoch": 0.19315772158578254,
"grad_norm": 0.4137844145298004,
"learning_rate": 4.0343428026046196e-05,
"loss": 0.6185,
"step": 59300
},
{
"epoch": 0.1934834513017788,
"grad_norm": 0.6289616823196411,
"learning_rate": 4.032714100972986e-05,
"loss": 0.6367,
"step": 59400
},
{
"epoch": 0.19380918101777508,
"grad_norm": 0.7528841495513916,
"learning_rate": 4.0310853993413534e-05,
"loss": 0.5783,
"step": 59500
},
{
"epoch": 0.19413491073377134,
"grad_norm": 0.7345238924026489,
"learning_rate": 4.02945669770972e-05,
"loss": 0.6378,
"step": 59600
},
{
"epoch": 0.1944606404497676,
"grad_norm": 0.7652753591537476,
"learning_rate": 4.0278279960780866e-05,
"loss": 0.5419,
"step": 59700
},
{
"epoch": 0.19478637016576386,
"grad_norm": 0.3726235032081604,
"learning_rate": 4.026199294446453e-05,
"loss": 0.5933,
"step": 59800
},
{
"epoch": 0.19511209988176012,
"grad_norm": 0.475990355014801,
"learning_rate": 4.0245705928148204e-05,
"loss": 0.5421,
"step": 59900
},
{
"epoch": 0.19543782959775638,
"grad_norm": 0.8618846535682678,
"learning_rate": 4.022941891183187e-05,
"loss": 0.6149,
"step": 60000
},
{
"epoch": 0.19576355931375264,
"grad_norm": 0.3643835484981537,
"learning_rate": 4.0213131895515535e-05,
"loss": 0.5898,
"step": 60100
},
{
"epoch": 0.1960892890297489,
"grad_norm": 0.6492701172828674,
"learning_rate": 4.01968448791992e-05,
"loss": 0.6115,
"step": 60200
},
{
"epoch": 0.19641501874574516,
"grad_norm": 0.46400219202041626,
"learning_rate": 4.018055786288287e-05,
"loss": 0.6093,
"step": 60300
},
{
"epoch": 0.19674074846174142,
"grad_norm": 0.6529611349105835,
"learning_rate": 4.016427084656654e-05,
"loss": 0.5663,
"step": 60400
},
{
"epoch": 0.19706647817773768,
"grad_norm": 0.8332497477531433,
"learning_rate": 4.0147983830250205e-05,
"loss": 0.557,
"step": 60500
},
{
"epoch": 0.19739220789373393,
"grad_norm": 0.43394774198532104,
"learning_rate": 4.013169681393387e-05,
"loss": 0.5864,
"step": 60600
},
{
"epoch": 0.1977179376097302,
"grad_norm": 0.3713783919811249,
"learning_rate": 4.0115409797617537e-05,
"loss": 0.597,
"step": 60700
},
{
"epoch": 0.19804366732572645,
"grad_norm": 0.5605040788650513,
"learning_rate": 4.00991227813012e-05,
"loss": 0.5965,
"step": 60800
},
{
"epoch": 0.1983693970417227,
"grad_norm": 0.4591531455516815,
"learning_rate": 4.0082835764984875e-05,
"loss": 0.5718,
"step": 60900
},
{
"epoch": 0.19869512675771897,
"grad_norm": 0.7599985003471375,
"learning_rate": 4.0066548748668534e-05,
"loss": 0.6088,
"step": 61000
},
{
"epoch": 0.19902085647371523,
"grad_norm": 0.7234918475151062,
"learning_rate": 4.0050261732352206e-05,
"loss": 0.6022,
"step": 61100
},
{
"epoch": 0.1993465861897115,
"grad_norm": 0.8344034552574158,
"learning_rate": 4.003397471603587e-05,
"loss": 0.5978,
"step": 61200
},
{
"epoch": 0.19967231590570778,
"grad_norm": 0.7539324164390564,
"learning_rate": 4.0017687699719544e-05,
"loss": 0.5979,
"step": 61300
},
{
"epoch": 0.19999804562170403,
"grad_norm": 0.7535436153411865,
"learning_rate": 4.00014006834032e-05,
"loss": 0.5632,
"step": 61400
},
{
"epoch": 0.2003237753377003,
"grad_norm": 1.0253859758377075,
"learning_rate": 3.998511366708687e-05,
"loss": 0.6245,
"step": 61500
},
{
"epoch": 0.20064950505369655,
"grad_norm": 0.8442240357398987,
"learning_rate": 3.996882665077054e-05,
"loss": 0.56,
"step": 61600
},
{
"epoch": 0.2009752347696928,
"grad_norm": 0.7696794867515564,
"learning_rate": 3.995253963445421e-05,
"loss": 0.5525,
"step": 61700
},
{
"epoch": 0.20130096448568907,
"grad_norm": 1.0839108228683472,
"learning_rate": 3.993625261813787e-05,
"loss": 0.576,
"step": 61800
},
{
"epoch": 0.20162669420168533,
"grad_norm": 0.4837821125984192,
"learning_rate": 3.991996560182154e-05,
"loss": 0.6654,
"step": 61900
},
{
"epoch": 0.2019524239176816,
"grad_norm": 0.8696286082267761,
"learning_rate": 3.990367858550521e-05,
"loss": 0.5237,
"step": 62000
},
{
"epoch": 0.20227815363367785,
"grad_norm": 0.5389662384986877,
"learning_rate": 3.988739156918888e-05,
"loss": 0.5765,
"step": 62100
},
{
"epoch": 0.2026038833496741,
"grad_norm": 0.39996546506881714,
"learning_rate": 3.987110455287254e-05,
"loss": 0.5666,
"step": 62200
},
{
"epoch": 0.20292961306567037,
"grad_norm": 0.5612654685974121,
"learning_rate": 3.985481753655621e-05,
"loss": 0.5975,
"step": 62300
},
{
"epoch": 0.20325534278166663,
"grad_norm": 0.4764688014984131,
"learning_rate": 3.9838530520239874e-05,
"loss": 0.5973,
"step": 62400
},
{
"epoch": 0.20358107249766288,
"grad_norm": 0.538745105266571,
"learning_rate": 3.982224350392355e-05,
"loss": 0.6108,
"step": 62500
},
{
"epoch": 0.20390680221365914,
"grad_norm": 0.6589317321777344,
"learning_rate": 3.980595648760721e-05,
"loss": 0.5482,
"step": 62600
},
{
"epoch": 0.2042325319296554,
"grad_norm": 0.8373557925224304,
"learning_rate": 3.978966947129088e-05,
"loss": 0.5671,
"step": 62700
},
{
"epoch": 0.20455826164565166,
"grad_norm": 0.6305526494979858,
"learning_rate": 3.9773382454974544e-05,
"loss": 0.6205,
"step": 62800
},
{
"epoch": 0.20488399136164792,
"grad_norm": 0.6550065875053406,
"learning_rate": 3.9757095438658216e-05,
"loss": 0.5805,
"step": 62900
},
{
"epoch": 0.20520972107764418,
"grad_norm": 0.6951280236244202,
"learning_rate": 3.974080842234188e-05,
"loss": 0.6103,
"step": 63000
},
{
"epoch": 0.20553545079364044,
"grad_norm": 0.5202652215957642,
"learning_rate": 3.972452140602554e-05,
"loss": 0.5623,
"step": 63100
},
{
"epoch": 0.20586118050963673,
"grad_norm": 1.0889042615890503,
"learning_rate": 3.9708234389709214e-05,
"loss": 0.5879,
"step": 63200
},
{
"epoch": 0.20618691022563299,
"grad_norm": 0.4142896234989166,
"learning_rate": 3.969194737339288e-05,
"loss": 0.6148,
"step": 63300
},
{
"epoch": 0.20651263994162924,
"grad_norm": 0.6650342345237732,
"learning_rate": 3.967566035707655e-05,
"loss": 0.5902,
"step": 63400
},
{
"epoch": 0.2068383696576255,
"grad_norm": 0.42452552914619446,
"learning_rate": 3.965937334076021e-05,
"loss": 0.4877,
"step": 63500
},
{
"epoch": 0.20716409937362176,
"grad_norm": 0.6702756881713867,
"learning_rate": 3.964308632444388e-05,
"loss": 0.5943,
"step": 63600
},
{
"epoch": 0.20748982908961802,
"grad_norm": 0.9007012248039246,
"learning_rate": 3.962679930812755e-05,
"loss": 0.5652,
"step": 63700
},
{
"epoch": 0.20781555880561428,
"grad_norm": 0.8962705135345459,
"learning_rate": 3.9610512291811215e-05,
"loss": 0.5731,
"step": 63800
},
{
"epoch": 0.20814128852161054,
"grad_norm": 0.8256299495697021,
"learning_rate": 3.959422527549489e-05,
"loss": 0.5596,
"step": 63900
},
{
"epoch": 0.2084670182376068,
"grad_norm": 0.5674106478691101,
"learning_rate": 3.9577938259178546e-05,
"loss": 0.557,
"step": 64000
},
{
"epoch": 0.20879274795360306,
"grad_norm": 0.564755916595459,
"learning_rate": 3.956165124286222e-05,
"loss": 0.5735,
"step": 64100
},
{
"epoch": 0.20911847766959932,
"grad_norm": 1.0437874794006348,
"learning_rate": 3.9545364226545884e-05,
"loss": 0.5371,
"step": 64200
},
{
"epoch": 0.20944420738559558,
"grad_norm": 0.877699077129364,
"learning_rate": 3.952907721022956e-05,
"loss": 0.538,
"step": 64300
},
{
"epoch": 0.20976993710159184,
"grad_norm": 0.6481153964996338,
"learning_rate": 3.9512790193913216e-05,
"loss": 0.5763,
"step": 64400
},
{
"epoch": 0.2100956668175881,
"grad_norm": 0.7963904142379761,
"learning_rate": 3.949650317759688e-05,
"loss": 0.5617,
"step": 64500
},
{
"epoch": 0.21042139653358435,
"grad_norm": 1.1034698486328125,
"learning_rate": 3.9480216161280554e-05,
"loss": 0.5876,
"step": 64600
},
{
"epoch": 0.2107471262495806,
"grad_norm": 0.7540128827095032,
"learning_rate": 3.946392914496422e-05,
"loss": 0.574,
"step": 64700
},
{
"epoch": 0.21107285596557687,
"grad_norm": 0.7184910178184509,
"learning_rate": 3.9447642128647886e-05,
"loss": 0.5328,
"step": 64800
},
{
"epoch": 0.21139858568157313,
"grad_norm": 0.7150009274482727,
"learning_rate": 3.943135511233155e-05,
"loss": 0.6049,
"step": 64900
},
{
"epoch": 0.21172431539756942,
"grad_norm": 0.4451941251754761,
"learning_rate": 3.9415068096015224e-05,
"loss": 0.5958,
"step": 65000
},
{
"epoch": 0.21205004511356568,
"grad_norm": 1.00858736038208,
"learning_rate": 3.939878107969889e-05,
"loss": 0.5752,
"step": 65100
},
{
"epoch": 0.21237577482956194,
"grad_norm": 0.7953845858573914,
"learning_rate": 3.9382494063382555e-05,
"loss": 0.5555,
"step": 65200
},
{
"epoch": 0.2127015045455582,
"grad_norm": 0.5992127060890198,
"learning_rate": 3.936620704706622e-05,
"loss": 0.59,
"step": 65300
},
{
"epoch": 0.21302723426155445,
"grad_norm": 0.5878809690475464,
"learning_rate": 3.934992003074989e-05,
"loss": 0.5881,
"step": 65400
},
{
"epoch": 0.2133529639775507,
"grad_norm": 0.9159529805183411,
"learning_rate": 3.933363301443356e-05,
"loss": 0.5951,
"step": 65500
},
{
"epoch": 0.21367869369354697,
"grad_norm": 0.6340069770812988,
"learning_rate": 3.9317345998117225e-05,
"loss": 0.5799,
"step": 65600
},
{
"epoch": 0.21400442340954323,
"grad_norm": 0.8940368890762329,
"learning_rate": 3.930105898180089e-05,
"loss": 0.5273,
"step": 65700
},
{
"epoch": 0.2143301531255395,
"grad_norm": 0.7908622622489929,
"learning_rate": 3.9284771965484556e-05,
"loss": 0.5472,
"step": 65800
},
{
"epoch": 0.21465588284153575,
"grad_norm": 0.9964277744293213,
"learning_rate": 3.926848494916822e-05,
"loss": 0.5719,
"step": 65900
},
{
"epoch": 0.214981612557532,
"grad_norm": 0.6497515439987183,
"learning_rate": 3.9252197932851895e-05,
"loss": 0.5338,
"step": 66000
},
{
"epoch": 0.21530734227352827,
"grad_norm": 0.8303185105323792,
"learning_rate": 3.9235910916535554e-05,
"loss": 0.5237,
"step": 66100
},
{
"epoch": 0.21563307198952453,
"grad_norm": 0.8530830144882202,
"learning_rate": 3.9219623900219226e-05,
"loss": 0.5328,
"step": 66200
},
{
"epoch": 0.2159588017055208,
"grad_norm": 0.9482616782188416,
"learning_rate": 3.920333688390289e-05,
"loss": 0.5548,
"step": 66300
},
{
"epoch": 0.21628453142151705,
"grad_norm": 0.430633008480072,
"learning_rate": 3.9187049867586564e-05,
"loss": 0.551,
"step": 66400
},
{
"epoch": 0.2166102611375133,
"grad_norm": 0.5612674355506897,
"learning_rate": 3.917076285127022e-05,
"loss": 0.5571,
"step": 66500
},
{
"epoch": 0.21693599085350956,
"grad_norm": 0.7157821655273438,
"learning_rate": 3.915447583495389e-05,
"loss": 0.555,
"step": 66600
},
{
"epoch": 0.21726172056950582,
"grad_norm": 0.6013966202735901,
"learning_rate": 3.913818881863756e-05,
"loss": 0.585,
"step": 66700
},
{
"epoch": 0.2175874502855021,
"grad_norm": 0.4616648554801941,
"learning_rate": 3.912190180232123e-05,
"loss": 0.5832,
"step": 66800
},
{
"epoch": 0.21791318000149837,
"grad_norm": 0.6870980858802795,
"learning_rate": 3.910561478600489e-05,
"loss": 0.5944,
"step": 66900
},
{
"epoch": 0.21823890971749463,
"grad_norm": 0.629490315914154,
"learning_rate": 3.908932776968856e-05,
"loss": 0.5279,
"step": 67000
},
{
"epoch": 0.2185646394334909,
"grad_norm": 0.5478650331497192,
"learning_rate": 3.907304075337223e-05,
"loss": 0.5815,
"step": 67100
},
{
"epoch": 0.21889036914948715,
"grad_norm": 0.6581255793571472,
"learning_rate": 3.90567537370559e-05,
"loss": 0.5661,
"step": 67200
},
{
"epoch": 0.2192160988654834,
"grad_norm": 0.7738802433013916,
"learning_rate": 3.904046672073956e-05,
"loss": 0.5901,
"step": 67300
},
{
"epoch": 0.21954182858147966,
"grad_norm": 0.5748447179794312,
"learning_rate": 3.902417970442323e-05,
"loss": 0.5813,
"step": 67400
},
{
"epoch": 0.21986755829747592,
"grad_norm": 0.7152987718582153,
"learning_rate": 3.9007892688106894e-05,
"loss": 0.5359,
"step": 67500
},
{
"epoch": 0.22019328801347218,
"grad_norm": 0.867574155330658,
"learning_rate": 3.899160567179057e-05,
"loss": 0.5419,
"step": 67600
},
{
"epoch": 0.22051901772946844,
"grad_norm": 0.8477634787559509,
"learning_rate": 3.897531865547423e-05,
"loss": 0.5788,
"step": 67700
},
{
"epoch": 0.2208447474454647,
"grad_norm": 0.7993571758270264,
"learning_rate": 3.89590316391579e-05,
"loss": 0.528,
"step": 67800
},
{
"epoch": 0.22117047716146096,
"grad_norm": 0.6607359647750854,
"learning_rate": 3.8942744622841564e-05,
"loss": 0.5647,
"step": 67900
},
{
"epoch": 0.22149620687745722,
"grad_norm": 0.6910780072212219,
"learning_rate": 3.892645760652523e-05,
"loss": 0.5418,
"step": 68000
},
{
"epoch": 0.22182193659345348,
"grad_norm": 0.4793308675289154,
"learning_rate": 3.89101705902089e-05,
"loss": 0.5913,
"step": 68100
},
{
"epoch": 0.22214766630944974,
"grad_norm": 0.7222141027450562,
"learning_rate": 3.889388357389257e-05,
"loss": 0.6128,
"step": 68200
},
{
"epoch": 0.222473396025446,
"grad_norm": 0.43712884187698364,
"learning_rate": 3.8877596557576233e-05,
"loss": 0.583,
"step": 68300
},
{
"epoch": 0.22279912574144226,
"grad_norm": 0.5187420845031738,
"learning_rate": 3.88613095412599e-05,
"loss": 0.5758,
"step": 68400
},
{
"epoch": 0.22312485545743851,
"grad_norm": 0.5550572872161865,
"learning_rate": 3.884502252494357e-05,
"loss": 0.5269,
"step": 68500
},
{
"epoch": 0.22345058517343477,
"grad_norm": 0.7551735639572144,
"learning_rate": 3.882873550862724e-05,
"loss": 0.6005,
"step": 68600
},
{
"epoch": 0.22377631488943106,
"grad_norm": 0.7213869690895081,
"learning_rate": 3.8812448492310896e-05,
"loss": 0.5174,
"step": 68700
},
{
"epoch": 0.22410204460542732,
"grad_norm": 0.6445099115371704,
"learning_rate": 3.879616147599457e-05,
"loss": 0.5501,
"step": 68800
},
{
"epoch": 0.22442777432142358,
"grad_norm": 0.7937589883804321,
"learning_rate": 3.8779874459678235e-05,
"loss": 0.5598,
"step": 68900
},
{
"epoch": 0.22475350403741984,
"grad_norm": 0.5327324271202087,
"learning_rate": 3.876358744336191e-05,
"loss": 0.531,
"step": 69000
},
{
"epoch": 0.2250792337534161,
"grad_norm": 0.7627710103988647,
"learning_rate": 3.8747300427045566e-05,
"loss": 0.578,
"step": 69100
},
{
"epoch": 0.22540496346941236,
"grad_norm": 0.5054932832717896,
"learning_rate": 3.873101341072924e-05,
"loss": 0.5905,
"step": 69200
},
{
"epoch": 0.22573069318540862,
"grad_norm": 0.6468352675437927,
"learning_rate": 3.8714726394412904e-05,
"loss": 0.5931,
"step": 69300
},
{
"epoch": 0.22605642290140487,
"grad_norm": 0.37974539399147034,
"learning_rate": 3.869843937809657e-05,
"loss": 0.5777,
"step": 69400
},
{
"epoch": 0.22638215261740113,
"grad_norm": 0.8011950850486755,
"learning_rate": 3.8682152361780236e-05,
"loss": 0.5187,
"step": 69500
},
{
"epoch": 0.2267078823333974,
"grad_norm": 0.40006023645401,
"learning_rate": 3.86658653454639e-05,
"loss": 0.5292,
"step": 69600
},
{
"epoch": 0.22703361204939365,
"grad_norm": 0.42605412006378174,
"learning_rate": 3.8649578329147574e-05,
"loss": 0.5704,
"step": 69700
},
{
"epoch": 0.2273593417653899,
"grad_norm": 0.820277988910675,
"learning_rate": 3.863329131283124e-05,
"loss": 0.5641,
"step": 69800
},
{
"epoch": 0.22768507148138617,
"grad_norm": 0.6671209931373596,
"learning_rate": 3.8617004296514905e-05,
"loss": 0.5942,
"step": 69900
},
{
"epoch": 0.22801080119738243,
"grad_norm": 0.7214267253875732,
"learning_rate": 3.860071728019857e-05,
"loss": 0.6078,
"step": 70000
},
{
"epoch": 0.2283365309133787,
"grad_norm": 0.5705024003982544,
"learning_rate": 3.858443026388224e-05,
"loss": 0.5111,
"step": 70100
},
{
"epoch": 0.22866226062937495,
"grad_norm": 0.7017680406570435,
"learning_rate": 3.856814324756591e-05,
"loss": 0.5386,
"step": 70200
},
{
"epoch": 0.2289879903453712,
"grad_norm": 0.36700716614723206,
"learning_rate": 3.8551856231249575e-05,
"loss": 0.5947,
"step": 70300
},
{
"epoch": 0.22931372006136747,
"grad_norm": 1.018539309501648,
"learning_rate": 3.853556921493324e-05,
"loss": 0.5739,
"step": 70400
},
{
"epoch": 0.22963944977736375,
"grad_norm": 0.8273037672042847,
"learning_rate": 3.851928219861691e-05,
"loss": 0.5247,
"step": 70500
},
{
"epoch": 0.22996517949336,
"grad_norm": 1.0655425786972046,
"learning_rate": 3.850299518230058e-05,
"loss": 0.5397,
"step": 70600
},
{
"epoch": 0.23029090920935627,
"grad_norm": 0.38495421409606934,
"learning_rate": 3.8486708165984245e-05,
"loss": 0.5844,
"step": 70700
},
{
"epoch": 0.23061663892535253,
"grad_norm": 0.9659711122512817,
"learning_rate": 3.847042114966791e-05,
"loss": 0.5873,
"step": 70800
},
{
"epoch": 0.2309423686413488,
"grad_norm": 0.7230137586593628,
"learning_rate": 3.8454134133351576e-05,
"loss": 0.593,
"step": 70900
},
{
"epoch": 0.23126809835734505,
"grad_norm": 0.9325969219207764,
"learning_rate": 3.843784711703524e-05,
"loss": 0.5965,
"step": 71000
},
{
"epoch": 0.2315938280733413,
"grad_norm": 0.6791651248931885,
"learning_rate": 3.8421560100718915e-05,
"loss": 0.6223,
"step": 71100
},
{
"epoch": 0.23191955778933757,
"grad_norm": 0.8241651058197021,
"learning_rate": 3.8405273084402573e-05,
"loss": 0.5257,
"step": 71200
},
{
"epoch": 0.23224528750533383,
"grad_norm": 0.8813059329986572,
"learning_rate": 3.8388986068086246e-05,
"loss": 0.5965,
"step": 71300
},
{
"epoch": 0.23257101722133008,
"grad_norm": 0.7717010378837585,
"learning_rate": 3.837269905176991e-05,
"loss": 0.5502,
"step": 71400
},
{
"epoch": 0.23289674693732634,
"grad_norm": 0.39482927322387695,
"learning_rate": 3.8356412035453584e-05,
"loss": 0.5618,
"step": 71500
},
{
"epoch": 0.2332224766533226,
"grad_norm": 0.8985998630523682,
"learning_rate": 3.834012501913724e-05,
"loss": 0.5247,
"step": 71600
},
{
"epoch": 0.23354820636931886,
"grad_norm": 0.4451032876968384,
"learning_rate": 3.832383800282091e-05,
"loss": 0.565,
"step": 71700
},
{
"epoch": 0.23387393608531512,
"grad_norm": 0.46427956223487854,
"learning_rate": 3.830755098650458e-05,
"loss": 0.5511,
"step": 71800
},
{
"epoch": 0.23419966580131138,
"grad_norm": 1.1371232271194458,
"learning_rate": 3.829126397018825e-05,
"loss": 0.5867,
"step": 71900
},
{
"epoch": 0.23452539551730764,
"grad_norm": 0.5856015086174011,
"learning_rate": 3.827497695387191e-05,
"loss": 0.5425,
"step": 72000
},
{
"epoch": 0.2348511252333039,
"grad_norm": 0.5723338723182678,
"learning_rate": 3.825868993755558e-05,
"loss": 0.5828,
"step": 72100
},
{
"epoch": 0.23517685494930016,
"grad_norm": 0.6274189352989197,
"learning_rate": 3.824240292123925e-05,
"loss": 0.4961,
"step": 72200
},
{
"epoch": 0.23550258466529644,
"grad_norm": 0.5841485857963562,
"learning_rate": 3.822611590492292e-05,
"loss": 0.5639,
"step": 72300
},
{
"epoch": 0.2358283143812927,
"grad_norm": 0.9061130285263062,
"learning_rate": 3.820982888860658e-05,
"loss": 0.5126,
"step": 72400
},
{
"epoch": 0.23615404409728896,
"grad_norm": 0.9499684572219849,
"learning_rate": 3.819354187229025e-05,
"loss": 0.5684,
"step": 72500
},
{
"epoch": 0.23647977381328522,
"grad_norm": 0.7132393717765808,
"learning_rate": 3.8177254855973914e-05,
"loss": 0.5287,
"step": 72600
},
{
"epoch": 0.23680550352928148,
"grad_norm": 0.8645475506782532,
"learning_rate": 3.8160967839657587e-05,
"loss": 0.564,
"step": 72700
},
{
"epoch": 0.23713123324527774,
"grad_norm": 0.8675580024719238,
"learning_rate": 3.814468082334125e-05,
"loss": 0.5435,
"step": 72800
},
{
"epoch": 0.237456962961274,
"grad_norm": 0.7194923162460327,
"learning_rate": 3.812839380702492e-05,
"loss": 0.5843,
"step": 72900
},
{
"epoch": 0.23778269267727026,
"grad_norm": 0.782618522644043,
"learning_rate": 3.8112106790708584e-05,
"loss": 0.5609,
"step": 73000
},
{
"epoch": 0.23810842239326652,
"grad_norm": 0.6671516299247742,
"learning_rate": 3.809581977439225e-05,
"loss": 0.4925,
"step": 73100
},
{
"epoch": 0.23843415210926278,
"grad_norm": 0.8488081097602844,
"learning_rate": 3.807953275807592e-05,
"loss": 0.5536,
"step": 73200
},
{
"epoch": 0.23875988182525903,
"grad_norm": 0.7259848117828369,
"learning_rate": 3.806324574175959e-05,
"loss": 0.5372,
"step": 73300
},
{
"epoch": 0.2390856115412553,
"grad_norm": 0.5849174857139587,
"learning_rate": 3.8046958725443253e-05,
"loss": 0.5602,
"step": 73400
},
{
"epoch": 0.23941134125725155,
"grad_norm": 0.36567142605781555,
"learning_rate": 3.803067170912692e-05,
"loss": 0.5976,
"step": 73500
},
{
"epoch": 0.2397370709732478,
"grad_norm": 0.8540560007095337,
"learning_rate": 3.801438469281059e-05,
"loss": 0.576,
"step": 73600
},
{
"epoch": 0.24006280068924407,
"grad_norm": 0.7733421921730042,
"learning_rate": 3.799809767649426e-05,
"loss": 0.5446,
"step": 73700
},
{
"epoch": 0.24038853040524033,
"grad_norm": 0.6541240811347961,
"learning_rate": 3.7981810660177916e-05,
"loss": 0.5302,
"step": 73800
},
{
"epoch": 0.2407142601212366,
"grad_norm": 0.6777580976486206,
"learning_rate": 3.796552364386159e-05,
"loss": 0.5742,
"step": 73900
},
{
"epoch": 0.24103998983723285,
"grad_norm": 1.1045103073120117,
"learning_rate": 3.7949236627545255e-05,
"loss": 0.5391,
"step": 74000
},
{
"epoch": 0.2413657195532291,
"grad_norm": 1.223781943321228,
"learning_rate": 3.793294961122893e-05,
"loss": 0.5754,
"step": 74100
},
{
"epoch": 0.2416914492692254,
"grad_norm": 0.7645404934883118,
"learning_rate": 3.7916662594912586e-05,
"loss": 0.5424,
"step": 74200
},
{
"epoch": 0.24201717898522165,
"grad_norm": 0.8637171983718872,
"learning_rate": 3.790037557859626e-05,
"loss": 0.5577,
"step": 74300
},
{
"epoch": 0.2423429087012179,
"grad_norm": 0.633642315864563,
"learning_rate": 3.7884088562279924e-05,
"loss": 0.5513,
"step": 74400
},
{
"epoch": 0.24266863841721417,
"grad_norm": 0.48609936237335205,
"learning_rate": 3.786780154596359e-05,
"loss": 0.6002,
"step": 74500
},
{
"epoch": 0.24299436813321043,
"grad_norm": 0.3668748140335083,
"learning_rate": 3.7851514529647256e-05,
"loss": 0.5947,
"step": 74600
},
{
"epoch": 0.2433200978492067,
"grad_norm": 0.735894501209259,
"learning_rate": 3.783522751333092e-05,
"loss": 0.5862,
"step": 74700
},
{
"epoch": 0.24364582756520295,
"grad_norm": 0.8264063000679016,
"learning_rate": 3.7818940497014594e-05,
"loss": 0.5749,
"step": 74800
},
{
"epoch": 0.2439715572811992,
"grad_norm": 0.482183575630188,
"learning_rate": 3.780265348069826e-05,
"loss": 0.5553,
"step": 74900
},
{
"epoch": 0.24429728699719547,
"grad_norm": 0.6649850606918335,
"learning_rate": 3.7786366464381925e-05,
"loss": 0.6042,
"step": 75000
},
{
"epoch": 0.24462301671319173,
"grad_norm": 0.5215208530426025,
"learning_rate": 3.777007944806559e-05,
"loss": 0.5134,
"step": 75100
},
{
"epoch": 0.24494874642918799,
"grad_norm": 0.6028915643692017,
"learning_rate": 3.775379243174926e-05,
"loss": 0.5,
"step": 75200
},
{
"epoch": 0.24527447614518424,
"grad_norm": 0.5038050413131714,
"learning_rate": 3.773750541543293e-05,
"loss": 0.6081,
"step": 75300
},
{
"epoch": 0.2456002058611805,
"grad_norm": 0.568586528301239,
"learning_rate": 3.7721218399116595e-05,
"loss": 0.5484,
"step": 75400
},
{
"epoch": 0.24592593557717676,
"grad_norm": 0.4442402720451355,
"learning_rate": 3.770493138280026e-05,
"loss": 0.5983,
"step": 75500
},
{
"epoch": 0.24625166529317302,
"grad_norm": 0.775284469127655,
"learning_rate": 3.7688644366483927e-05,
"loss": 0.549,
"step": 75600
},
{
"epoch": 0.24657739500916928,
"grad_norm": 0.7132833003997803,
"learning_rate": 3.76723573501676e-05,
"loss": 0.5317,
"step": 75700
},
{
"epoch": 0.24690312472516554,
"grad_norm": 0.7935360074043274,
"learning_rate": 3.7656070333851265e-05,
"loss": 0.5389,
"step": 75800
},
{
"epoch": 0.2472288544411618,
"grad_norm": 0.5749487280845642,
"learning_rate": 3.7639783317534924e-05,
"loss": 0.5918,
"step": 75900
},
{
"epoch": 0.2475545841571581,
"grad_norm": 0.6536827087402344,
"learning_rate": 3.7623496301218596e-05,
"loss": 0.5245,
"step": 76000
},
{
"epoch": 0.24788031387315435,
"grad_norm": 0.7014347314834595,
"learning_rate": 3.760720928490226e-05,
"loss": 0.5661,
"step": 76100
},
{
"epoch": 0.2482060435891506,
"grad_norm": 0.8436623811721802,
"learning_rate": 3.7590922268585934e-05,
"loss": 0.5714,
"step": 76200
},
{
"epoch": 0.24853177330514686,
"grad_norm": 0.6371897459030151,
"learning_rate": 3.7574635252269593e-05,
"loss": 0.5767,
"step": 76300
},
{
"epoch": 0.24885750302114312,
"grad_norm": 0.7796430587768555,
"learning_rate": 3.7558348235953266e-05,
"loss": 0.5308,
"step": 76400
},
{
"epoch": 0.24918323273713938,
"grad_norm": 0.6565324664115906,
"learning_rate": 3.754206121963693e-05,
"loss": 0.5377,
"step": 76500
},
{
"epoch": 0.24950896245313564,
"grad_norm": 0.6670543551445007,
"learning_rate": 3.75257742033206e-05,
"loss": 0.6095,
"step": 76600
},
{
"epoch": 0.2498346921691319,
"grad_norm": 0.8650514483451843,
"learning_rate": 3.750948718700426e-05,
"loss": 0.5586,
"step": 76700
},
{
"epoch": 0.25016042188512816,
"grad_norm": 0.42015933990478516,
"learning_rate": 3.749320017068793e-05,
"loss": 0.5274,
"step": 76800
},
{
"epoch": 0.2504861516011244,
"grad_norm": 0.5667533278465271,
"learning_rate": 3.74769131543716e-05,
"loss": 0.5628,
"step": 76900
},
{
"epoch": 0.2508118813171207,
"grad_norm": 0.6887187361717224,
"learning_rate": 3.746062613805527e-05,
"loss": 0.5663,
"step": 77000
},
{
"epoch": 0.25113761103311694,
"grad_norm": 0.4367005527019501,
"learning_rate": 3.744433912173893e-05,
"loss": 0.5368,
"step": 77100
},
{
"epoch": 0.2514633407491132,
"grad_norm": 0.3392166197299957,
"learning_rate": 3.74280521054226e-05,
"loss": 0.5353,
"step": 77200
},
{
"epoch": 0.25178907046510945,
"grad_norm": 0.5449352860450745,
"learning_rate": 3.7411765089106264e-05,
"loss": 0.5611,
"step": 77300
},
{
"epoch": 0.2521148001811057,
"grad_norm": 0.6924061179161072,
"learning_rate": 3.739547807278994e-05,
"loss": 0.5918,
"step": 77400
},
{
"epoch": 0.252440529897102,
"grad_norm": 0.8356592655181885,
"learning_rate": 3.73791910564736e-05,
"loss": 0.5713,
"step": 77500
},
{
"epoch": 0.25276625961309823,
"grad_norm": 0.9207838177680969,
"learning_rate": 3.736290404015727e-05,
"loss": 0.5078,
"step": 77600
},
{
"epoch": 0.2530919893290945,
"grad_norm": 0.6466575860977173,
"learning_rate": 3.7346617023840934e-05,
"loss": 0.5274,
"step": 77700
},
{
"epoch": 0.25341771904509075,
"grad_norm": 0.5351524353027344,
"learning_rate": 3.7330330007524606e-05,
"loss": 0.5411,
"step": 77800
},
{
"epoch": 0.253743448761087,
"grad_norm": 0.7786761522293091,
"learning_rate": 3.731404299120827e-05,
"loss": 0.4859,
"step": 77900
},
{
"epoch": 0.25406917847708327,
"grad_norm": 0.6750699281692505,
"learning_rate": 3.729775597489194e-05,
"loss": 0.5689,
"step": 78000
},
{
"epoch": 0.2543949081930795,
"grad_norm": 0.7088775038719177,
"learning_rate": 3.7281468958575604e-05,
"loss": 0.5325,
"step": 78100
},
{
"epoch": 0.2547206379090758,
"grad_norm": 0.8920672535896301,
"learning_rate": 3.726518194225927e-05,
"loss": 0.5284,
"step": 78200
},
{
"epoch": 0.25504636762507205,
"grad_norm": 0.6582838296890259,
"learning_rate": 3.724889492594294e-05,
"loss": 0.511,
"step": 78300
},
{
"epoch": 0.2553720973410683,
"grad_norm": 0.6662094593048096,
"learning_rate": 3.723260790962661e-05,
"loss": 0.5618,
"step": 78400
},
{
"epoch": 0.25569782705706456,
"grad_norm": 0.4346591830253601,
"learning_rate": 3.721632089331027e-05,
"loss": 0.54,
"step": 78500
},
{
"epoch": 0.2560235567730608,
"grad_norm": 0.7967207431793213,
"learning_rate": 3.720003387699394e-05,
"loss": 0.5884,
"step": 78600
},
{
"epoch": 0.25634928648905714,
"grad_norm": 0.4879821538925171,
"learning_rate": 3.7183746860677605e-05,
"loss": 0.5557,
"step": 78700
},
{
"epoch": 0.2566750162050534,
"grad_norm": 0.5626016855239868,
"learning_rate": 3.716745984436128e-05,
"loss": 0.498,
"step": 78800
},
{
"epoch": 0.25700074592104966,
"grad_norm": 0.5859974026679993,
"learning_rate": 3.7151172828044936e-05,
"loss": 0.5218,
"step": 78900
},
{
"epoch": 0.2573264756370459,
"grad_norm": 0.7462596893310547,
"learning_rate": 3.713488581172861e-05,
"loss": 0.5093,
"step": 79000
},
{
"epoch": 0.2576522053530422,
"grad_norm": 0.9555974006652832,
"learning_rate": 3.7118598795412274e-05,
"loss": 0.5348,
"step": 79100
},
{
"epoch": 0.25797793506903843,
"grad_norm": 0.7466504573822021,
"learning_rate": 3.710231177909595e-05,
"loss": 0.5383,
"step": 79200
},
{
"epoch": 0.2583036647850347,
"grad_norm": 0.8801865577697754,
"learning_rate": 3.7086024762779606e-05,
"loss": 0.4767,
"step": 79300
},
{
"epoch": 0.25862939450103095,
"grad_norm": 0.48174184560775757,
"learning_rate": 3.706973774646328e-05,
"loss": 0.5528,
"step": 79400
},
{
"epoch": 0.2589551242170272,
"grad_norm": 0.7198649048805237,
"learning_rate": 3.7053450730146944e-05,
"loss": 0.5953,
"step": 79500
},
{
"epoch": 0.25928085393302347,
"grad_norm": 0.4515075385570526,
"learning_rate": 3.703716371383061e-05,
"loss": 0.5505,
"step": 79600
},
{
"epoch": 0.25960658364901973,
"grad_norm": 0.706524670124054,
"learning_rate": 3.7020876697514276e-05,
"loss": 0.6011,
"step": 79700
},
{
"epoch": 0.259932313365016,
"grad_norm": 0.6895307302474976,
"learning_rate": 3.700458968119794e-05,
"loss": 0.5188,
"step": 79800
},
{
"epoch": 0.26025804308101225,
"grad_norm": 0.7927341461181641,
"learning_rate": 3.6988302664881614e-05,
"loss": 0.5739,
"step": 79900
},
{
"epoch": 0.2605837727970085,
"grad_norm": 0.8496550917625427,
"learning_rate": 3.697201564856528e-05,
"loss": 0.5152,
"step": 80000
},
{
"epoch": 0.26090950251300477,
"grad_norm": 0.47138693928718567,
"learning_rate": 3.6955728632248945e-05,
"loss": 0.5475,
"step": 80100
},
{
"epoch": 0.261235232229001,
"grad_norm": 0.8020485639572144,
"learning_rate": 3.693944161593261e-05,
"loss": 0.5489,
"step": 80200
},
{
"epoch": 0.2615609619449973,
"grad_norm": 0.6385429501533508,
"learning_rate": 3.692315459961628e-05,
"loss": 0.5457,
"step": 80300
},
{
"epoch": 0.26188669166099354,
"grad_norm": 0.6027743220329285,
"learning_rate": 3.690686758329995e-05,
"loss": 0.5412,
"step": 80400
},
{
"epoch": 0.2622124213769898,
"grad_norm": 0.6040454506874084,
"learning_rate": 3.6890580566983615e-05,
"loss": 0.5348,
"step": 80500
},
{
"epoch": 0.26253815109298606,
"grad_norm": 0.6697177290916443,
"learning_rate": 3.687429355066728e-05,
"loss": 0.509,
"step": 80600
},
{
"epoch": 0.2628638808089823,
"grad_norm": 0.8428653478622437,
"learning_rate": 3.6858006534350946e-05,
"loss": 0.5505,
"step": 80700
},
{
"epoch": 0.2631896105249786,
"grad_norm": 0.9421257972717285,
"learning_rate": 3.684171951803462e-05,
"loss": 0.5587,
"step": 80800
},
{
"epoch": 0.26351534024097484,
"grad_norm": 0.7752894759178162,
"learning_rate": 3.6825432501718285e-05,
"loss": 0.5308,
"step": 80900
},
{
"epoch": 0.2638410699569711,
"grad_norm": 0.9658520817756653,
"learning_rate": 3.6809145485401944e-05,
"loss": 0.5394,
"step": 81000
},
{
"epoch": 0.26416679967296736,
"grad_norm": 0.3100132644176483,
"learning_rate": 3.6792858469085616e-05,
"loss": 0.5616,
"step": 81100
},
{
"epoch": 0.2644925293889636,
"grad_norm": 1.0838834047317505,
"learning_rate": 3.677657145276928e-05,
"loss": 0.5374,
"step": 81200
},
{
"epoch": 0.2648182591049599,
"grad_norm": 0.9311345219612122,
"learning_rate": 3.6760284436452954e-05,
"loss": 0.5353,
"step": 81300
},
{
"epoch": 0.26514398882095613,
"grad_norm": 0.32365360856056213,
"learning_rate": 3.674399742013661e-05,
"loss": 0.5493,
"step": 81400
},
{
"epoch": 0.2654697185369524,
"grad_norm": 0.6390203833580017,
"learning_rate": 3.6727710403820286e-05,
"loss": 0.5205,
"step": 81500
},
{
"epoch": 0.26579544825294865,
"grad_norm": 0.6106113195419312,
"learning_rate": 3.671142338750395e-05,
"loss": 0.5161,
"step": 81600
},
{
"epoch": 0.2661211779689449,
"grad_norm": 0.4415883421897888,
"learning_rate": 3.669513637118762e-05,
"loss": 0.5235,
"step": 81700
},
{
"epoch": 0.26644690768494117,
"grad_norm": 0.8828484416007996,
"learning_rate": 3.667884935487128e-05,
"loss": 0.5214,
"step": 81800
},
{
"epoch": 0.26677263740093743,
"grad_norm": 0.8186760544776917,
"learning_rate": 3.666256233855495e-05,
"loss": 0.5435,
"step": 81900
},
{
"epoch": 0.2670983671169337,
"grad_norm": 0.43989554047584534,
"learning_rate": 3.664627532223862e-05,
"loss": 0.5653,
"step": 82000
},
{
"epoch": 0.26742409683292995,
"grad_norm": 1.083422303199768,
"learning_rate": 3.662998830592229e-05,
"loss": 0.5338,
"step": 82100
},
{
"epoch": 0.2677498265489262,
"grad_norm": 0.40522611141204834,
"learning_rate": 3.661370128960596e-05,
"loss": 0.4892,
"step": 82200
},
{
"epoch": 0.26807555626492247,
"grad_norm": 0.7010061740875244,
"learning_rate": 3.659741427328962e-05,
"loss": 0.5372,
"step": 82300
},
{
"epoch": 0.2684012859809188,
"grad_norm": 0.9971382021903992,
"learning_rate": 3.6581127256973284e-05,
"loss": 0.501,
"step": 82400
},
{
"epoch": 0.26872701569691504,
"grad_norm": 0.5222276449203491,
"learning_rate": 3.656484024065696e-05,
"loss": 0.5194,
"step": 82500
},
{
"epoch": 0.2690527454129113,
"grad_norm": 0.724824845790863,
"learning_rate": 3.654855322434062e-05,
"loss": 0.499,
"step": 82600
},
{
"epoch": 0.26937847512890756,
"grad_norm": 0.48272421956062317,
"learning_rate": 3.653226620802429e-05,
"loss": 0.486,
"step": 82700
},
{
"epoch": 0.2697042048449038,
"grad_norm": 0.8187432885169983,
"learning_rate": 3.6515979191707954e-05,
"loss": 0.5634,
"step": 82800
},
{
"epoch": 0.2700299345609001,
"grad_norm": 0.46917855739593506,
"learning_rate": 3.6499692175391626e-05,
"loss": 0.5468,
"step": 82900
},
{
"epoch": 0.27035566427689633,
"grad_norm": 0.5338607430458069,
"learning_rate": 3.648340515907529e-05,
"loss": 0.481,
"step": 83000
},
{
"epoch": 0.2706813939928926,
"grad_norm": 0.5420836806297302,
"learning_rate": 3.646711814275896e-05,
"loss": 0.5391,
"step": 83100
},
{
"epoch": 0.27100712370888885,
"grad_norm": 0.5124307870864868,
"learning_rate": 3.6450831126442624e-05,
"loss": 0.5446,
"step": 83200
},
{
"epoch": 0.2713328534248851,
"grad_norm": 0.5944223403930664,
"learning_rate": 3.643454411012629e-05,
"loss": 0.5759,
"step": 83300
},
{
"epoch": 0.27165858314088137,
"grad_norm": 1.1431384086608887,
"learning_rate": 3.641825709380996e-05,
"loss": 0.5416,
"step": 83400
},
{
"epoch": 0.27198431285687763,
"grad_norm": 0.9613766670227051,
"learning_rate": 3.640197007749363e-05,
"loss": 0.521,
"step": 83500
},
{
"epoch": 0.2723100425728739,
"grad_norm": 0.7477935552597046,
"learning_rate": 3.638568306117729e-05,
"loss": 0.558,
"step": 83600
},
{
"epoch": 0.27263577228887015,
"grad_norm": 0.47112804651260376,
"learning_rate": 3.636939604486096e-05,
"loss": 0.5083,
"step": 83700
},
{
"epoch": 0.2729615020048664,
"grad_norm": 0.5914379954338074,
"learning_rate": 3.6353109028544625e-05,
"loss": 0.5776,
"step": 83800
},
{
"epoch": 0.27328723172086267,
"grad_norm": 0.5500662326812744,
"learning_rate": 3.63368220122283e-05,
"loss": 0.5194,
"step": 83900
},
{
"epoch": 0.2736129614368589,
"grad_norm": 0.41591793298721313,
"learning_rate": 3.6320534995911956e-05,
"loss": 0.5266,
"step": 84000
},
{
"epoch": 0.2739386911528552,
"grad_norm": 1.080356478691101,
"learning_rate": 3.630424797959563e-05,
"loss": 0.4964,
"step": 84100
},
{
"epoch": 0.27426442086885144,
"grad_norm": 0.40892690420150757,
"learning_rate": 3.6287960963279294e-05,
"loss": 0.5163,
"step": 84200
},
{
"epoch": 0.2745901505848477,
"grad_norm": 0.7729841470718384,
"learning_rate": 3.627167394696297e-05,
"loss": 0.5336,
"step": 84300
},
{
"epoch": 0.27491588030084396,
"grad_norm": 0.6264617443084717,
"learning_rate": 3.6255386930646626e-05,
"loss": 0.5762,
"step": 84400
},
{
"epoch": 0.2752416100168402,
"grad_norm": 0.8050372004508972,
"learning_rate": 3.623909991433029e-05,
"loss": 0.4509,
"step": 84500
},
{
"epoch": 0.2755673397328365,
"grad_norm": 0.621804416179657,
"learning_rate": 3.6222812898013964e-05,
"loss": 0.5174,
"step": 84600
},
{
"epoch": 0.27589306944883274,
"grad_norm": 0.5717790126800537,
"learning_rate": 3.620652588169763e-05,
"loss": 0.5431,
"step": 84700
},
{
"epoch": 0.276218799164829,
"grad_norm": 0.394345223903656,
"learning_rate": 3.6190238865381295e-05,
"loss": 0.5294,
"step": 84800
},
{
"epoch": 0.27654452888082526,
"grad_norm": 0.8917814493179321,
"learning_rate": 3.617395184906496e-05,
"loss": 0.4955,
"step": 84900
},
{
"epoch": 0.2768702585968215,
"grad_norm": 0.721481442451477,
"learning_rate": 3.6157664832748634e-05,
"loss": 0.5433,
"step": 85000
},
{
"epoch": 0.2771959883128178,
"grad_norm": 0.6476948857307434,
"learning_rate": 3.61413778164323e-05,
"loss": 0.563,
"step": 85100
},
{
"epoch": 0.27752171802881404,
"grad_norm": 0.38036003708839417,
"learning_rate": 3.6125090800115965e-05,
"loss": 0.516,
"step": 85200
},
{
"epoch": 0.2778474477448103,
"grad_norm": 0.6185033917427063,
"learning_rate": 3.610880378379963e-05,
"loss": 0.5178,
"step": 85300
},
{
"epoch": 0.27817317746080655,
"grad_norm": 0.8313725590705872,
"learning_rate": 3.60925167674833e-05,
"loss": 0.5296,
"step": 85400
},
{
"epoch": 0.2784989071768028,
"grad_norm": 0.5369439721107483,
"learning_rate": 3.607622975116697e-05,
"loss": 0.5803,
"step": 85500
},
{
"epoch": 0.27882463689279907,
"grad_norm": 0.7777513265609741,
"learning_rate": 3.6059942734850635e-05,
"loss": 0.4875,
"step": 85600
},
{
"epoch": 0.27915036660879533,
"grad_norm": 0.5527925491333008,
"learning_rate": 3.60436557185343e-05,
"loss": 0.5141,
"step": 85700
},
{
"epoch": 0.2794760963247916,
"grad_norm": 0.8335199356079102,
"learning_rate": 3.6027368702217966e-05,
"loss": 0.4851,
"step": 85800
},
{
"epoch": 0.27980182604078785,
"grad_norm": 0.7015230059623718,
"learning_rate": 3.601108168590163e-05,
"loss": 0.5395,
"step": 85900
},
{
"epoch": 0.2801275557567841,
"grad_norm": 0.7245033979415894,
"learning_rate": 3.5994794669585305e-05,
"loss": 0.5204,
"step": 86000
},
{
"epoch": 0.2804532854727804,
"grad_norm": 0.8472508192062378,
"learning_rate": 3.5978507653268964e-05,
"loss": 0.5087,
"step": 86100
},
{
"epoch": 0.2807790151887767,
"grad_norm": 0.7517431974411011,
"learning_rate": 3.5962220636952636e-05,
"loss": 0.5176,
"step": 86200
},
{
"epoch": 0.28110474490477294,
"grad_norm": 0.5864343643188477,
"learning_rate": 3.59459336206363e-05,
"loss": 0.5828,
"step": 86300
},
{
"epoch": 0.2814304746207692,
"grad_norm": 0.8981267809867859,
"learning_rate": 3.5929646604319974e-05,
"loss": 0.5309,
"step": 86400
},
{
"epoch": 0.28175620433676546,
"grad_norm": 0.8167164325714111,
"learning_rate": 3.591335958800364e-05,
"loss": 0.5513,
"step": 86500
},
{
"epoch": 0.2820819340527617,
"grad_norm": 0.7764830589294434,
"learning_rate": 3.58970725716873e-05,
"loss": 0.5249,
"step": 86600
},
{
"epoch": 0.282407663768758,
"grad_norm": 0.7545201182365417,
"learning_rate": 3.588078555537097e-05,
"loss": 0.5293,
"step": 86700
},
{
"epoch": 0.28273339348475424,
"grad_norm": 0.6954336166381836,
"learning_rate": 3.586449853905464e-05,
"loss": 0.5532,
"step": 86800
},
{
"epoch": 0.2830591232007505,
"grad_norm": 0.6742025017738342,
"learning_rate": 3.584821152273831e-05,
"loss": 0.5356,
"step": 86900
},
{
"epoch": 0.28338485291674675,
"grad_norm": 0.731679379940033,
"learning_rate": 3.583192450642197e-05,
"loss": 0.5128,
"step": 87000
},
{
"epoch": 0.283710582632743,
"grad_norm": 0.7906468510627747,
"learning_rate": 3.581563749010564e-05,
"loss": 0.5359,
"step": 87100
},
{
"epoch": 0.2840363123487393,
"grad_norm": 0.36753523349761963,
"learning_rate": 3.579935047378931e-05,
"loss": 0.5366,
"step": 87200
},
{
"epoch": 0.28436204206473553,
"grad_norm": 0.6043976545333862,
"learning_rate": 3.578306345747298e-05,
"loss": 0.4995,
"step": 87300
},
{
"epoch": 0.2846877717807318,
"grad_norm": 0.7573038339614868,
"learning_rate": 3.576677644115664e-05,
"loss": 0.5093,
"step": 87400
},
{
"epoch": 0.28501350149672805,
"grad_norm": 0.25290992856025696,
"learning_rate": 3.5750489424840304e-05,
"loss": 0.4948,
"step": 87500
},
{
"epoch": 0.2853392312127243,
"grad_norm": 0.6551434397697449,
"learning_rate": 3.5734202408523977e-05,
"loss": 0.5116,
"step": 87600
},
{
"epoch": 0.28566496092872057,
"grad_norm": 0.6715214252471924,
"learning_rate": 3.571791539220764e-05,
"loss": 0.6104,
"step": 87700
},
{
"epoch": 0.2859906906447168,
"grad_norm": 0.7275449633598328,
"learning_rate": 3.570162837589131e-05,
"loss": 0.506,
"step": 87800
},
{
"epoch": 0.2863164203607131,
"grad_norm": 0.2885235846042633,
"learning_rate": 3.5685341359574974e-05,
"loss": 0.4684,
"step": 87900
},
{
"epoch": 0.28664215007670935,
"grad_norm": 0.9342713356018066,
"learning_rate": 3.5669054343258646e-05,
"loss": 0.5293,
"step": 88000
},
{
"epoch": 0.2869678797927056,
"grad_norm": 1.0423755645751953,
"learning_rate": 3.565276732694231e-05,
"loss": 0.5466,
"step": 88100
},
{
"epoch": 0.28729360950870186,
"grad_norm": 1.0259456634521484,
"learning_rate": 3.563648031062598e-05,
"loss": 0.4885,
"step": 88200
},
{
"epoch": 0.2876193392246981,
"grad_norm": 0.8733958601951599,
"learning_rate": 3.5620193294309643e-05,
"loss": 0.5353,
"step": 88300
},
{
"epoch": 0.2879450689406944,
"grad_norm": 0.33869871497154236,
"learning_rate": 3.560390627799331e-05,
"loss": 0.5465,
"step": 88400
},
{
"epoch": 0.28827079865669064,
"grad_norm": 0.5838894844055176,
"learning_rate": 3.558761926167698e-05,
"loss": 0.555,
"step": 88500
},
{
"epoch": 0.2885965283726869,
"grad_norm": 0.8616543412208557,
"learning_rate": 3.557133224536065e-05,
"loss": 0.5173,
"step": 88600
},
{
"epoch": 0.28892225808868316,
"grad_norm": 0.8486323356628418,
"learning_rate": 3.555504522904431e-05,
"loss": 0.5258,
"step": 88700
},
{
"epoch": 0.2892479878046794,
"grad_norm": 0.6569567918777466,
"learning_rate": 3.553875821272798e-05,
"loss": 0.5097,
"step": 88800
},
{
"epoch": 0.2895737175206757,
"grad_norm": 0.6821163296699524,
"learning_rate": 3.5522471196411645e-05,
"loss": 0.5428,
"step": 88900
},
{
"epoch": 0.28989944723667194,
"grad_norm": 0.6147534251213074,
"learning_rate": 3.550618418009532e-05,
"loss": 0.5544,
"step": 89000
},
{
"epoch": 0.2902251769526682,
"grad_norm": 0.42478904128074646,
"learning_rate": 3.5489897163778976e-05,
"loss": 0.5376,
"step": 89100
},
{
"epoch": 0.29055090666866445,
"grad_norm": 0.5254961252212524,
"learning_rate": 3.547361014746265e-05,
"loss": 0.4964,
"step": 89200
},
{
"epoch": 0.2908766363846607,
"grad_norm": 0.6934669017791748,
"learning_rate": 3.5457323131146314e-05,
"loss": 0.4835,
"step": 89300
},
{
"epoch": 0.291202366100657,
"grad_norm": 0.4250465929508209,
"learning_rate": 3.544103611482999e-05,
"loss": 0.4954,
"step": 89400
},
{
"epoch": 0.29152809581665323,
"grad_norm": 0.6067728996276855,
"learning_rate": 3.5424749098513646e-05,
"loss": 0.4926,
"step": 89500
},
{
"epoch": 0.2918538255326495,
"grad_norm": 0.5424463748931885,
"learning_rate": 3.540846208219731e-05,
"loss": 0.5627,
"step": 89600
},
{
"epoch": 0.2921795552486458,
"grad_norm": 0.5810889005661011,
"learning_rate": 3.5392175065880984e-05,
"loss": 0.4316,
"step": 89700
},
{
"epoch": 0.29250528496464206,
"grad_norm": 0.4583912491798401,
"learning_rate": 3.537588804956465e-05,
"loss": 0.4987,
"step": 89800
},
{
"epoch": 0.2928310146806383,
"grad_norm": 0.4320780634880066,
"learning_rate": 3.5359601033248315e-05,
"loss": 0.5204,
"step": 89900
},
{
"epoch": 0.2931567443966346,
"grad_norm": 0.6955101490020752,
"learning_rate": 3.534331401693198e-05,
"loss": 0.5179,
"step": 90000
},
{
"epoch": 0.29348247411263084,
"grad_norm": 0.512250542640686,
"learning_rate": 3.5327027000615654e-05,
"loss": 0.4909,
"step": 90100
},
{
"epoch": 0.2938082038286271,
"grad_norm": 0.7975231409072876,
"learning_rate": 3.531073998429932e-05,
"loss": 0.4845,
"step": 90200
},
{
"epoch": 0.29413393354462336,
"grad_norm": 0.25338149070739746,
"learning_rate": 3.5294452967982985e-05,
"loss": 0.4963,
"step": 90300
},
{
"epoch": 0.2944596632606196,
"grad_norm": 0.43115437030792236,
"learning_rate": 3.527816595166665e-05,
"loss": 0.5203,
"step": 90400
},
{
"epoch": 0.2947853929766159,
"grad_norm": 0.830754280090332,
"learning_rate": 3.5261878935350317e-05,
"loss": 0.4916,
"step": 90500
},
{
"epoch": 0.29511112269261214,
"grad_norm": 0.8370751738548279,
"learning_rate": 3.524559191903399e-05,
"loss": 0.547,
"step": 90600
},
{
"epoch": 0.2954368524086084,
"grad_norm": 0.7122400403022766,
"learning_rate": 3.5229304902717655e-05,
"loss": 0.5126,
"step": 90700
},
{
"epoch": 0.29576258212460466,
"grad_norm": 0.4084763824939728,
"learning_rate": 3.521301788640132e-05,
"loss": 0.4971,
"step": 90800
},
{
"epoch": 0.2960883118406009,
"grad_norm": 0.8079352974891663,
"learning_rate": 3.5196730870084986e-05,
"loss": 0.4992,
"step": 90900
},
{
"epoch": 0.2964140415565972,
"grad_norm": 0.25352516770362854,
"learning_rate": 3.518044385376865e-05,
"loss": 0.5333,
"step": 91000
},
{
"epoch": 0.29673977127259343,
"grad_norm": 0.5390329957008362,
"learning_rate": 3.5164156837452324e-05,
"loss": 0.5007,
"step": 91100
},
{
"epoch": 0.2970655009885897,
"grad_norm": 0.6617804765701294,
"learning_rate": 3.514786982113599e-05,
"loss": 0.548,
"step": 91200
},
{
"epoch": 0.29739123070458595,
"grad_norm": 0.7202132940292358,
"learning_rate": 3.5131582804819656e-05,
"loss": 0.5417,
"step": 91300
},
{
"epoch": 0.2977169604205822,
"grad_norm": 0.28012895584106445,
"learning_rate": 3.511529578850332e-05,
"loss": 0.4883,
"step": 91400
},
{
"epoch": 0.29804269013657847,
"grad_norm": 0.3527827560901642,
"learning_rate": 3.5099008772186994e-05,
"loss": 0.523,
"step": 91500
},
{
"epoch": 0.29836841985257473,
"grad_norm": 0.7193790078163147,
"learning_rate": 3.508272175587066e-05,
"loss": 0.5148,
"step": 91600
},
{
"epoch": 0.298694149568571,
"grad_norm": 0.9702345728874207,
"learning_rate": 3.506643473955432e-05,
"loss": 0.4781,
"step": 91700
},
{
"epoch": 0.29901987928456725,
"grad_norm": 0.7323670983314514,
"learning_rate": 3.505014772323799e-05,
"loss": 0.5394,
"step": 91800
},
{
"epoch": 0.2993456090005635,
"grad_norm": 0.6757960915565491,
"learning_rate": 3.503386070692166e-05,
"loss": 0.4984,
"step": 91900
},
{
"epoch": 0.29967133871655977,
"grad_norm": 0.7119109630584717,
"learning_rate": 3.501757369060533e-05,
"loss": 0.5502,
"step": 92000
},
{
"epoch": 0.299997068432556,
"grad_norm": 0.6820542216300964,
"learning_rate": 3.500128667428899e-05,
"loss": 0.5498,
"step": 92100
},
{
"epoch": 0.3003227981485523,
"grad_norm": 0.784050703048706,
"learning_rate": 3.498499965797266e-05,
"loss": 0.5445,
"step": 92200
},
{
"epoch": 0.30064852786454854,
"grad_norm": 0.6549366116523743,
"learning_rate": 3.496871264165633e-05,
"loss": 0.5326,
"step": 92300
},
{
"epoch": 0.3009742575805448,
"grad_norm": 0.4872061014175415,
"learning_rate": 3.495242562533999e-05,
"loss": 0.5093,
"step": 92400
},
{
"epoch": 0.30129998729654106,
"grad_norm": 0.3646996319293976,
"learning_rate": 3.493613860902366e-05,
"loss": 0.5476,
"step": 92500
},
{
"epoch": 0.3016257170125373,
"grad_norm": 0.5709706544876099,
"learning_rate": 3.4919851592707324e-05,
"loss": 0.4513,
"step": 92600
},
{
"epoch": 0.3019514467285336,
"grad_norm": 0.6031984090805054,
"learning_rate": 3.4903564576390996e-05,
"loss": 0.5044,
"step": 92700
},
{
"epoch": 0.30227717644452984,
"grad_norm": 0.8381587862968445,
"learning_rate": 3.488727756007466e-05,
"loss": 0.5128,
"step": 92800
},
{
"epoch": 0.3026029061605261,
"grad_norm": 1.0859401226043701,
"learning_rate": 3.487099054375833e-05,
"loss": 0.5328,
"step": 92900
},
{
"epoch": 0.30292863587652236,
"grad_norm": 0.34642109274864197,
"learning_rate": 3.4854703527441994e-05,
"loss": 0.4852,
"step": 93000
},
{
"epoch": 0.3032543655925186,
"grad_norm": 0.6529460549354553,
"learning_rate": 3.483841651112566e-05,
"loss": 0.5032,
"step": 93100
},
{
"epoch": 0.3035800953085149,
"grad_norm": 0.7026881575584412,
"learning_rate": 3.482212949480933e-05,
"loss": 0.6338,
"step": 93200
},
{
"epoch": 0.30390582502451113,
"grad_norm": 0.49741417169570923,
"learning_rate": 3.4805842478493e-05,
"loss": 0.5231,
"step": 93300
},
{
"epoch": 0.30423155474050745,
"grad_norm": 0.6611301898956299,
"learning_rate": 3.478955546217666e-05,
"loss": 0.5189,
"step": 93400
},
{
"epoch": 0.3045572844565037,
"grad_norm": 0.6907228827476501,
"learning_rate": 3.477326844586033e-05,
"loss": 0.5256,
"step": 93500
},
{
"epoch": 0.30488301417249997,
"grad_norm": 0.5975654721260071,
"learning_rate": 3.4756981429544e-05,
"loss": 0.522,
"step": 93600
},
{
"epoch": 0.3052087438884962,
"grad_norm": 0.6043006777763367,
"learning_rate": 3.474069441322767e-05,
"loss": 0.5018,
"step": 93700
},
{
"epoch": 0.3055344736044925,
"grad_norm": 0.5697898864746094,
"learning_rate": 3.4724407396911326e-05,
"loss": 0.5009,
"step": 93800
},
{
"epoch": 0.30586020332048874,
"grad_norm": 0.40364518761634827,
"learning_rate": 3.4708120380595e-05,
"loss": 0.4642,
"step": 93900
},
{
"epoch": 0.306185933036485,
"grad_norm": 0.940877377986908,
"learning_rate": 3.4691833364278664e-05,
"loss": 0.5136,
"step": 94000
},
{
"epoch": 0.30651166275248126,
"grad_norm": 0.7497209310531616,
"learning_rate": 3.467554634796234e-05,
"loss": 0.5261,
"step": 94100
},
{
"epoch": 0.3068373924684775,
"grad_norm": 0.8120318651199341,
"learning_rate": 3.4659259331645996e-05,
"loss": 0.4756,
"step": 94200
},
{
"epoch": 0.3071631221844738,
"grad_norm": 0.6802115440368652,
"learning_rate": 3.464297231532967e-05,
"loss": 0.5257,
"step": 94300
},
{
"epoch": 0.30748885190047004,
"grad_norm": 0.43083488941192627,
"learning_rate": 3.4626685299013334e-05,
"loss": 0.5365,
"step": 94400
},
{
"epoch": 0.3078145816164663,
"grad_norm": 0.6194273233413696,
"learning_rate": 3.4610398282697e-05,
"loss": 0.5157,
"step": 94500
},
{
"epoch": 0.30814031133246256,
"grad_norm": 0.5603410601615906,
"learning_rate": 3.4594111266380666e-05,
"loss": 0.51,
"step": 94600
},
{
"epoch": 0.3084660410484588,
"grad_norm": 1.0651506185531616,
"learning_rate": 3.457782425006433e-05,
"loss": 0.4759,
"step": 94700
},
{
"epoch": 0.3087917707644551,
"grad_norm": 0.7674971222877502,
"learning_rate": 3.4561537233748004e-05,
"loss": 0.467,
"step": 94800
},
{
"epoch": 0.30911750048045133,
"grad_norm": 0.9666951298713684,
"learning_rate": 3.454525021743167e-05,
"loss": 0.5524,
"step": 94900
},
{
"epoch": 0.3094432301964476,
"grad_norm": 0.6148163080215454,
"learning_rate": 3.4528963201115335e-05,
"loss": 0.5345,
"step": 95000
},
{
"epoch": 0.30976895991244385,
"grad_norm": 0.7641096711158752,
"learning_rate": 3.4512676184799e-05,
"loss": 0.4872,
"step": 95100
},
{
"epoch": 0.3100946896284401,
"grad_norm": 0.6152538657188416,
"learning_rate": 3.449638916848267e-05,
"loss": 0.4832,
"step": 95200
},
{
"epoch": 0.31042041934443637,
"grad_norm": 0.7761083841323853,
"learning_rate": 3.448010215216634e-05,
"loss": 0.4761,
"step": 95300
},
{
"epoch": 0.31074614906043263,
"grad_norm": 0.6005348563194275,
"learning_rate": 3.4463815135850005e-05,
"loss": 0.4585,
"step": 95400
},
{
"epoch": 0.3110718787764289,
"grad_norm": 0.7649496793746948,
"learning_rate": 3.444752811953367e-05,
"loss": 0.5283,
"step": 95500
},
{
"epoch": 0.31139760849242515,
"grad_norm": 0.9503573179244995,
"learning_rate": 3.4431241103217336e-05,
"loss": 0.5032,
"step": 95600
},
{
"epoch": 0.3117233382084214,
"grad_norm": 0.8403215408325195,
"learning_rate": 3.441495408690101e-05,
"loss": 0.5172,
"step": 95700
},
{
"epoch": 0.31204906792441767,
"grad_norm": 0.5137957334518433,
"learning_rate": 3.4398667070584675e-05,
"loss": 0.5551,
"step": 95800
},
{
"epoch": 0.3123747976404139,
"grad_norm": 0.6618998646736145,
"learning_rate": 3.438238005426834e-05,
"loss": 0.5237,
"step": 95900
},
{
"epoch": 0.3127005273564102,
"grad_norm": 0.3272695541381836,
"learning_rate": 3.4366093037952006e-05,
"loss": 0.4556,
"step": 96000
},
{
"epoch": 0.31302625707240644,
"grad_norm": 0.7416215538978577,
"learning_rate": 3.434980602163567e-05,
"loss": 0.5039,
"step": 96100
},
{
"epoch": 0.3133519867884027,
"grad_norm": 0.9183087944984436,
"learning_rate": 3.4333519005319344e-05,
"loss": 0.5408,
"step": 96200
},
{
"epoch": 0.31367771650439896,
"grad_norm": 0.3782617151737213,
"learning_rate": 3.431723198900301e-05,
"loss": 0.5113,
"step": 96300
},
{
"epoch": 0.3140034462203952,
"grad_norm": 0.6314922571182251,
"learning_rate": 3.4300944972686676e-05,
"loss": 0.4955,
"step": 96400
},
{
"epoch": 0.3143291759363915,
"grad_norm": 0.3009500801563263,
"learning_rate": 3.428465795637034e-05,
"loss": 0.5114,
"step": 96500
},
{
"epoch": 0.31465490565238774,
"grad_norm": 0.8378229737281799,
"learning_rate": 3.4268370940054014e-05,
"loss": 0.5287,
"step": 96600
},
{
"epoch": 0.314980635368384,
"grad_norm": 0.7249593138694763,
"learning_rate": 3.425208392373768e-05,
"loss": 0.5209,
"step": 96700
},
{
"epoch": 0.31530636508438026,
"grad_norm": 0.45489412546157837,
"learning_rate": 3.423579690742134e-05,
"loss": 0.5745,
"step": 96800
},
{
"epoch": 0.3156320948003765,
"grad_norm": 0.6379255056381226,
"learning_rate": 3.421950989110501e-05,
"loss": 0.5199,
"step": 96900
},
{
"epoch": 0.31595782451637283,
"grad_norm": 0.8550392389297485,
"learning_rate": 3.420322287478868e-05,
"loss": 0.5374,
"step": 97000
},
{
"epoch": 0.3162835542323691,
"grad_norm": 0.5571677684783936,
"learning_rate": 3.418693585847235e-05,
"loss": 0.5057,
"step": 97100
},
{
"epoch": 0.31660928394836535,
"grad_norm": 0.48302140831947327,
"learning_rate": 3.417064884215601e-05,
"loss": 0.5496,
"step": 97200
},
{
"epoch": 0.3169350136643616,
"grad_norm": 0.7864711284637451,
"learning_rate": 3.415436182583968e-05,
"loss": 0.5132,
"step": 97300
},
{
"epoch": 0.31726074338035787,
"grad_norm": 0.5517250299453735,
"learning_rate": 3.413807480952335e-05,
"loss": 0.4826,
"step": 97400
},
{
"epoch": 0.3175864730963541,
"grad_norm": 0.7834230065345764,
"learning_rate": 3.412178779320701e-05,
"loss": 0.5186,
"step": 97500
},
{
"epoch": 0.3179122028123504,
"grad_norm": 0.938097357749939,
"learning_rate": 3.410550077689068e-05,
"loss": 0.4817,
"step": 97600
},
{
"epoch": 0.31823793252834665,
"grad_norm": 0.25078582763671875,
"learning_rate": 3.4089213760574344e-05,
"loss": 0.4996,
"step": 97700
},
{
"epoch": 0.3185636622443429,
"grad_norm": 0.7896013259887695,
"learning_rate": 3.4072926744258016e-05,
"loss": 0.5163,
"step": 97800
},
{
"epoch": 0.31888939196033916,
"grad_norm": 0.6857266426086426,
"learning_rate": 3.405663972794168e-05,
"loss": 0.4952,
"step": 97900
},
{
"epoch": 0.3192151216763354,
"grad_norm": 0.5710707306861877,
"learning_rate": 3.404035271162535e-05,
"loss": 0.5273,
"step": 98000
},
{
"epoch": 0.3195408513923317,
"grad_norm": 0.5274339914321899,
"learning_rate": 3.4024065695309014e-05,
"loss": 0.5385,
"step": 98100
},
{
"epoch": 0.31986658110832794,
"grad_norm": 0.27135804295539856,
"learning_rate": 3.400777867899268e-05,
"loss": 0.5042,
"step": 98200
},
{
"epoch": 0.3201923108243242,
"grad_norm": 0.6852828860282898,
"learning_rate": 3.399149166267635e-05,
"loss": 0.5214,
"step": 98300
},
{
"epoch": 0.32051804054032046,
"grad_norm": 0.5614081621170044,
"learning_rate": 3.397520464636002e-05,
"loss": 0.5023,
"step": 98400
},
{
"epoch": 0.3208437702563167,
"grad_norm": 0.7719017863273621,
"learning_rate": 3.395891763004368e-05,
"loss": 0.4919,
"step": 98500
},
{
"epoch": 0.321169499972313,
"grad_norm": 0.8100476264953613,
"learning_rate": 3.394263061372735e-05,
"loss": 0.4607,
"step": 98600
},
{
"epoch": 0.32149522968830924,
"grad_norm": 0.6814531087875366,
"learning_rate": 3.392634359741102e-05,
"loss": 0.5457,
"step": 98700
},
{
"epoch": 0.3218209594043055,
"grad_norm": 1.0356829166412354,
"learning_rate": 3.391005658109469e-05,
"loss": 0.4844,
"step": 98800
},
{
"epoch": 0.32214668912030175,
"grad_norm": 0.8719603419303894,
"learning_rate": 3.3893769564778346e-05,
"loss": 0.5182,
"step": 98900
},
{
"epoch": 0.322472418836298,
"grad_norm": 0.6145396828651428,
"learning_rate": 3.387748254846202e-05,
"loss": 0.4732,
"step": 99000
},
{
"epoch": 0.3227981485522943,
"grad_norm": 1.005679726600647,
"learning_rate": 3.3861195532145684e-05,
"loss": 0.5182,
"step": 99100
},
{
"epoch": 0.32312387826829053,
"grad_norm": 0.29751360416412354,
"learning_rate": 3.384490851582936e-05,
"loss": 0.4823,
"step": 99200
},
{
"epoch": 0.3234496079842868,
"grad_norm": 0.7968891263008118,
"learning_rate": 3.3828621499513016e-05,
"loss": 0.5235,
"step": 99300
},
{
"epoch": 0.32377533770028305,
"grad_norm": 0.7049364447593689,
"learning_rate": 3.381233448319669e-05,
"loss": 0.5392,
"step": 99400
},
{
"epoch": 0.3241010674162793,
"grad_norm": 0.6265050172805786,
"learning_rate": 3.3796047466880354e-05,
"loss": 0.5119,
"step": 99500
},
{
"epoch": 0.32442679713227557,
"grad_norm": 0.6732152104377747,
"learning_rate": 3.377976045056402e-05,
"loss": 0.4837,
"step": 99600
},
{
"epoch": 0.3247525268482718,
"grad_norm": 0.25657424330711365,
"learning_rate": 3.3763473434247686e-05,
"loss": 0.5199,
"step": 99700
},
{
"epoch": 0.3250782565642681,
"grad_norm": 0.4994146227836609,
"learning_rate": 3.374718641793135e-05,
"loss": 0.4894,
"step": 99800
},
{
"epoch": 0.32540398628026435,
"grad_norm": 0.7468940615653992,
"learning_rate": 3.3730899401615024e-05,
"loss": 0.5409,
"step": 99900
},
{
"epoch": 0.3257297159962606,
"grad_norm": 0.17829063534736633,
"learning_rate": 3.371461238529869e-05,
"loss": 0.5111,
"step": 100000
},
{
"epoch": 0.32605544571225686,
"grad_norm": 0.6492403745651245,
"learning_rate": 3.369832536898236e-05,
"loss": 0.5085,
"step": 100100
},
{
"epoch": 0.3263811754282531,
"grad_norm": 0.41203296184539795,
"learning_rate": 3.368203835266602e-05,
"loss": 0.4674,
"step": 100200
},
{
"epoch": 0.3267069051442494,
"grad_norm": 0.6258901953697205,
"learning_rate": 3.366575133634969e-05,
"loss": 0.4797,
"step": 100300
},
{
"epoch": 0.32703263486024564,
"grad_norm": 0.5243533849716187,
"learning_rate": 3.364946432003336e-05,
"loss": 0.4851,
"step": 100400
},
{
"epoch": 0.3273583645762419,
"grad_norm": 0.7344015836715698,
"learning_rate": 3.3633177303717025e-05,
"loss": 0.4964,
"step": 100500
},
{
"epoch": 0.32768409429223816,
"grad_norm": 1.1914827823638916,
"learning_rate": 3.361689028740069e-05,
"loss": 0.4923,
"step": 100600
},
{
"epoch": 0.3280098240082345,
"grad_norm": 0.7036446928977966,
"learning_rate": 3.3600603271084356e-05,
"loss": 0.5234,
"step": 100700
},
{
"epoch": 0.32833555372423073,
"grad_norm": 0.8239650726318359,
"learning_rate": 3.358431625476803e-05,
"loss": 0.4715,
"step": 100800
},
{
"epoch": 0.328661283440227,
"grad_norm": 0.6158246397972107,
"learning_rate": 3.3568029238451695e-05,
"loss": 0.488,
"step": 100900
},
{
"epoch": 0.32898701315622325,
"grad_norm": 0.708604633808136,
"learning_rate": 3.355174222213536e-05,
"loss": 0.4674,
"step": 101000
},
{
"epoch": 0.3293127428722195,
"grad_norm": 0.5420898199081421,
"learning_rate": 3.3535455205819026e-05,
"loss": 0.4741,
"step": 101100
},
{
"epoch": 0.32963847258821577,
"grad_norm": 0.49769943952560425,
"learning_rate": 3.351916818950269e-05,
"loss": 0.4638,
"step": 101200
},
{
"epoch": 0.32996420230421203,
"grad_norm": 0.7099531888961792,
"learning_rate": 3.3502881173186364e-05,
"loss": 0.5236,
"step": 101300
},
{
"epoch": 0.3302899320202083,
"grad_norm": 0.712815523147583,
"learning_rate": 3.348659415687003e-05,
"loss": 0.5268,
"step": 101400
},
{
"epoch": 0.33061566173620455,
"grad_norm": 0.8762120008468628,
"learning_rate": 3.3470307140553696e-05,
"loss": 0.5045,
"step": 101500
},
{
"epoch": 0.3309413914522008,
"grad_norm": 0.7411269545555115,
"learning_rate": 3.345402012423736e-05,
"loss": 0.5017,
"step": 101600
},
{
"epoch": 0.33126712116819707,
"grad_norm": 0.7993664145469666,
"learning_rate": 3.343773310792103e-05,
"loss": 0.4866,
"step": 101700
},
{
"epoch": 0.3315928508841933,
"grad_norm": 0.9997897148132324,
"learning_rate": 3.34214460916047e-05,
"loss": 0.5033,
"step": 101800
},
{
"epoch": 0.3319185806001896,
"grad_norm": 0.3995771110057831,
"learning_rate": 3.340515907528836e-05,
"loss": 0.5037,
"step": 101900
},
{
"epoch": 0.33224431031618584,
"grad_norm": 0.4990951418876648,
"learning_rate": 3.338887205897203e-05,
"loss": 0.5353,
"step": 102000
},
{
"epoch": 0.3325700400321821,
"grad_norm": 0.4299832880496979,
"learning_rate": 3.33725850426557e-05,
"loss": 0.5121,
"step": 102100
},
{
"epoch": 0.33289576974817836,
"grad_norm": 0.9922016263008118,
"learning_rate": 3.335629802633937e-05,
"loss": 0.4948,
"step": 102200
},
{
"epoch": 0.3332214994641746,
"grad_norm": 0.547074556350708,
"learning_rate": 3.334001101002303e-05,
"loss": 0.5031,
"step": 102300
},
{
"epoch": 0.3335472291801709,
"grad_norm": 0.799204409122467,
"learning_rate": 3.3323723993706694e-05,
"loss": 0.4683,
"step": 102400
},
{
"epoch": 0.33387295889616714,
"grad_norm": 0.8631702065467834,
"learning_rate": 3.3307436977390367e-05,
"loss": 0.4813,
"step": 102500
},
{
"epoch": 0.3341986886121634,
"grad_norm": 1.0079576969146729,
"learning_rate": 3.329114996107403e-05,
"loss": 0.48,
"step": 102600
},
{
"epoch": 0.33452441832815966,
"grad_norm": 0.6884191036224365,
"learning_rate": 3.32748629447577e-05,
"loss": 0.5356,
"step": 102700
},
{
"epoch": 0.3348501480441559,
"grad_norm": 0.9845031499862671,
"learning_rate": 3.3258575928441364e-05,
"loss": 0.5276,
"step": 102800
},
{
"epoch": 0.3351758777601522,
"grad_norm": 0.5960990786552429,
"learning_rate": 3.3242288912125036e-05,
"loss": 0.4858,
"step": 102900
},
{
"epoch": 0.33550160747614843,
"grad_norm": 0.5453081727027893,
"learning_rate": 3.32260018958087e-05,
"loss": 0.5118,
"step": 103000
},
{
"epoch": 0.3358273371921447,
"grad_norm": 0.5795672535896301,
"learning_rate": 3.320971487949237e-05,
"loss": 0.4631,
"step": 103100
},
{
"epoch": 0.33615306690814095,
"grad_norm": 1.148959994316101,
"learning_rate": 3.3193427863176033e-05,
"loss": 0.4791,
"step": 103200
},
{
"epoch": 0.3364787966241372,
"grad_norm": 0.5743905901908875,
"learning_rate": 3.31771408468597e-05,
"loss": 0.4983,
"step": 103300
},
{
"epoch": 0.33680452634013347,
"grad_norm": 1.2373428344726562,
"learning_rate": 3.316085383054337e-05,
"loss": 0.4886,
"step": 103400
},
{
"epoch": 0.33713025605612973,
"grad_norm": 0.6242794990539551,
"learning_rate": 3.314456681422704e-05,
"loss": 0.4817,
"step": 103500
},
{
"epoch": 0.337455985772126,
"grad_norm": 0.3083389103412628,
"learning_rate": 3.31282797979107e-05,
"loss": 0.4843,
"step": 103600
},
{
"epoch": 0.33778171548812225,
"grad_norm": 0.4972945749759674,
"learning_rate": 3.311199278159437e-05,
"loss": 0.4806,
"step": 103700
},
{
"epoch": 0.3381074452041185,
"grad_norm": 0.7972423434257507,
"learning_rate": 3.309570576527804e-05,
"loss": 0.4699,
"step": 103800
},
{
"epoch": 0.33843317492011477,
"grad_norm": 0.5987827777862549,
"learning_rate": 3.307941874896171e-05,
"loss": 0.4969,
"step": 103900
},
{
"epoch": 0.338758904636111,
"grad_norm": 0.7832911014556885,
"learning_rate": 3.3063131732645366e-05,
"loss": 0.4627,
"step": 104000
},
{
"epoch": 0.3390846343521073,
"grad_norm": 0.4860471189022064,
"learning_rate": 3.304684471632904e-05,
"loss": 0.4596,
"step": 104100
},
{
"epoch": 0.33941036406810354,
"grad_norm": 0.3446727693080902,
"learning_rate": 3.3030557700012704e-05,
"loss": 0.4668,
"step": 104200
},
{
"epoch": 0.33973609378409986,
"grad_norm": 0.5124432444572449,
"learning_rate": 3.301427068369638e-05,
"loss": 0.5025,
"step": 104300
},
{
"epoch": 0.3400618235000961,
"grad_norm": 0.6023364663124084,
"learning_rate": 3.2997983667380036e-05,
"loss": 0.4446,
"step": 104400
},
{
"epoch": 0.3403875532160924,
"grad_norm": 0.7395136952400208,
"learning_rate": 3.298169665106371e-05,
"loss": 0.4543,
"step": 104500
},
{
"epoch": 0.34071328293208863,
"grad_norm": 0.8566365838050842,
"learning_rate": 3.2965409634747374e-05,
"loss": 0.5162,
"step": 104600
},
{
"epoch": 0.3410390126480849,
"grad_norm": 0.5422640442848206,
"learning_rate": 3.294912261843104e-05,
"loss": 0.4841,
"step": 104700
},
{
"epoch": 0.34136474236408115,
"grad_norm": 1.2125647068023682,
"learning_rate": 3.293283560211471e-05,
"loss": 0.5119,
"step": 104800
},
{
"epoch": 0.3416904720800774,
"grad_norm": 0.7454204559326172,
"learning_rate": 3.291654858579837e-05,
"loss": 0.4564,
"step": 104900
},
{
"epoch": 0.34201620179607367,
"grad_norm": 0.4049842953681946,
"learning_rate": 3.2900261569482044e-05,
"loss": 0.512,
"step": 105000
},
{
"epoch": 0.34234193151206993,
"grad_norm": 0.4401283264160156,
"learning_rate": 3.288397455316571e-05,
"loss": 0.515,
"step": 105100
},
{
"epoch": 0.3426676612280662,
"grad_norm": 1.0636835098266602,
"learning_rate": 3.286768753684938e-05,
"loss": 0.5331,
"step": 105200
},
{
"epoch": 0.34299339094406245,
"grad_norm": 0.5115429759025574,
"learning_rate": 3.285140052053304e-05,
"loss": 0.4552,
"step": 105300
},
{
"epoch": 0.3433191206600587,
"grad_norm": 0.5709575414657593,
"learning_rate": 3.2835113504216707e-05,
"loss": 0.4783,
"step": 105400
},
{
"epoch": 0.34364485037605497,
"grad_norm": 0.3476814329624176,
"learning_rate": 3.281882648790038e-05,
"loss": 0.4824,
"step": 105500
},
{
"epoch": 0.3439705800920512,
"grad_norm": 0.5530911684036255,
"learning_rate": 3.2802539471584045e-05,
"loss": 0.4672,
"step": 105600
},
{
"epoch": 0.3442963098080475,
"grad_norm": 0.7868565320968628,
"learning_rate": 3.278625245526771e-05,
"loss": 0.4574,
"step": 105700
},
{
"epoch": 0.34462203952404374,
"grad_norm": 0.9544464945793152,
"learning_rate": 3.2769965438951376e-05,
"loss": 0.4785,
"step": 105800
},
{
"epoch": 0.34494776924004,
"grad_norm": 0.6327000856399536,
"learning_rate": 3.275367842263505e-05,
"loss": 0.4899,
"step": 105900
},
{
"epoch": 0.34527349895603626,
"grad_norm": 0.5785555839538574,
"learning_rate": 3.2737391406318714e-05,
"loss": 0.5283,
"step": 106000
},
{
"epoch": 0.3455992286720325,
"grad_norm": 1.1979222297668457,
"learning_rate": 3.272110439000238e-05,
"loss": 0.4504,
"step": 106100
},
{
"epoch": 0.3459249583880288,
"grad_norm": 0.64732426404953,
"learning_rate": 3.2704817373686046e-05,
"loss": 0.4814,
"step": 106200
},
{
"epoch": 0.34625068810402504,
"grad_norm": 0.46975287795066833,
"learning_rate": 3.268853035736971e-05,
"loss": 0.453,
"step": 106300
},
{
"epoch": 0.3465764178200213,
"grad_norm": 0.3508839011192322,
"learning_rate": 3.2672243341053384e-05,
"loss": 0.5106,
"step": 106400
},
{
"epoch": 0.34690214753601756,
"grad_norm": 0.6801757216453552,
"learning_rate": 3.265595632473705e-05,
"loss": 0.4647,
"step": 106500
},
{
"epoch": 0.3472278772520138,
"grad_norm": 0.9168288111686707,
"learning_rate": 3.2639669308420716e-05,
"loss": 0.5184,
"step": 106600
},
{
"epoch": 0.3475536069680101,
"grad_norm": 0.7734511494636536,
"learning_rate": 3.262338229210438e-05,
"loss": 0.5379,
"step": 106700
},
{
"epoch": 0.34787933668400634,
"grad_norm": 0.4107971489429474,
"learning_rate": 3.260709527578805e-05,
"loss": 0.4379,
"step": 106800
},
{
"epoch": 0.3482050664000026,
"grad_norm": 0.7145285606384277,
"learning_rate": 3.259080825947172e-05,
"loss": 0.4784,
"step": 106900
},
{
"epoch": 0.34853079611599885,
"grad_norm": 0.6061236262321472,
"learning_rate": 3.257452124315538e-05,
"loss": 0.5022,
"step": 107000
},
{
"epoch": 0.3488565258319951,
"grad_norm": 0.49363043904304504,
"learning_rate": 3.255823422683905e-05,
"loss": 0.5555,
"step": 107100
},
{
"epoch": 0.34918225554799137,
"grad_norm": 0.9029503464698792,
"learning_rate": 3.254194721052272e-05,
"loss": 0.4924,
"step": 107200
},
{
"epoch": 0.34950798526398763,
"grad_norm": 0.914335310459137,
"learning_rate": 3.252566019420639e-05,
"loss": 0.4352,
"step": 107300
},
{
"epoch": 0.3498337149799839,
"grad_norm": 0.8748767375946045,
"learning_rate": 3.250937317789005e-05,
"loss": 0.4577,
"step": 107400
},
{
"epoch": 0.35015944469598015,
"grad_norm": 0.6719549298286438,
"learning_rate": 3.2493086161573714e-05,
"loss": 0.4886,
"step": 107500
},
{
"epoch": 0.3504851744119764,
"grad_norm": 0.7287290096282959,
"learning_rate": 3.2476799145257386e-05,
"loss": 0.5013,
"step": 107600
},
{
"epoch": 0.35081090412797267,
"grad_norm": 0.5061945915222168,
"learning_rate": 3.246051212894105e-05,
"loss": 0.4945,
"step": 107700
},
{
"epoch": 0.3511366338439689,
"grad_norm": 0.4315279722213745,
"learning_rate": 3.244422511262472e-05,
"loss": 0.4889,
"step": 107800
},
{
"epoch": 0.3514623635599652,
"grad_norm": 0.7010080814361572,
"learning_rate": 3.2427938096308384e-05,
"loss": 0.4927,
"step": 107900
},
{
"epoch": 0.3517880932759615,
"grad_norm": 0.5063943862915039,
"learning_rate": 3.2411651079992056e-05,
"loss": 0.485,
"step": 108000
},
{
"epoch": 0.35211382299195776,
"grad_norm": 0.4876722991466522,
"learning_rate": 3.239536406367572e-05,
"loss": 0.4779,
"step": 108100
},
{
"epoch": 0.352439552707954,
"grad_norm": 0.5269170999526978,
"learning_rate": 3.237907704735939e-05,
"loss": 0.4836,
"step": 108200
},
{
"epoch": 0.3527652824239503,
"grad_norm": 0.49777817726135254,
"learning_rate": 3.236279003104305e-05,
"loss": 0.5038,
"step": 108300
},
{
"epoch": 0.35309101213994654,
"grad_norm": 0.6626752018928528,
"learning_rate": 3.234650301472672e-05,
"loss": 0.434,
"step": 108400
},
{
"epoch": 0.3534167418559428,
"grad_norm": 0.5564941167831421,
"learning_rate": 3.233021599841039e-05,
"loss": 0.4893,
"step": 108500
},
{
"epoch": 0.35374247157193905,
"grad_norm": 0.9265629649162292,
"learning_rate": 3.231392898209406e-05,
"loss": 0.4447,
"step": 108600
},
{
"epoch": 0.3540682012879353,
"grad_norm": 0.3788335919380188,
"learning_rate": 3.229764196577772e-05,
"loss": 0.4792,
"step": 108700
},
{
"epoch": 0.3543939310039316,
"grad_norm": 0.7376036643981934,
"learning_rate": 3.228135494946139e-05,
"loss": 0.4611,
"step": 108800
},
{
"epoch": 0.35471966071992783,
"grad_norm": 0.6144190430641174,
"learning_rate": 3.2265067933145054e-05,
"loss": 0.5023,
"step": 108900
},
{
"epoch": 0.3550453904359241,
"grad_norm": 0.8389730453491211,
"learning_rate": 3.224878091682873e-05,
"loss": 0.4752,
"step": 109000
},
{
"epoch": 0.35537112015192035,
"grad_norm": 0.6739189624786377,
"learning_rate": 3.223249390051239e-05,
"loss": 0.5024,
"step": 109100
},
{
"epoch": 0.3556968498679166,
"grad_norm": 0.6198161840438843,
"learning_rate": 3.221620688419606e-05,
"loss": 0.4733,
"step": 109200
},
{
"epoch": 0.35602257958391287,
"grad_norm": 0.6034826636314392,
"learning_rate": 3.2199919867879724e-05,
"loss": 0.4766,
"step": 109300
},
{
"epoch": 0.3563483092999091,
"grad_norm": 0.38500547409057617,
"learning_rate": 3.21836328515634e-05,
"loss": 0.4524,
"step": 109400
},
{
"epoch": 0.3566740390159054,
"grad_norm": 0.8445745706558228,
"learning_rate": 3.216734583524706e-05,
"loss": 0.4634,
"step": 109500
},
{
"epoch": 0.35699976873190165,
"grad_norm": 0.6940500140190125,
"learning_rate": 3.215105881893072e-05,
"loss": 0.4598,
"step": 109600
},
{
"epoch": 0.3573254984478979,
"grad_norm": 0.7966079115867615,
"learning_rate": 3.2134771802614394e-05,
"loss": 0.5149,
"step": 109700
},
{
"epoch": 0.35765122816389416,
"grad_norm": 0.47482743859291077,
"learning_rate": 3.211848478629806e-05,
"loss": 0.4576,
"step": 109800
},
{
"epoch": 0.3579769578798904,
"grad_norm": 0.6817350387573242,
"learning_rate": 3.210219776998173e-05,
"loss": 0.4921,
"step": 109900
},
{
"epoch": 0.3583026875958867,
"grad_norm": 0.7756426930427551,
"learning_rate": 3.208591075366539e-05,
"loss": 0.4892,
"step": 110000
},
{
"epoch": 0.35862841731188294,
"grad_norm": 0.5921733975410461,
"learning_rate": 3.2069623737349064e-05,
"loss": 0.5321,
"step": 110100
},
{
"epoch": 0.3589541470278792,
"grad_norm": 0.9905286431312561,
"learning_rate": 3.205333672103273e-05,
"loss": 0.4957,
"step": 110200
},
{
"epoch": 0.35927987674387546,
"grad_norm": 0.5868031978607178,
"learning_rate": 3.2037049704716395e-05,
"loss": 0.5048,
"step": 110300
},
{
"epoch": 0.3596056064598717,
"grad_norm": 0.8284581899642944,
"learning_rate": 3.202076268840006e-05,
"loss": 0.5218,
"step": 110400
},
{
"epoch": 0.359931336175868,
"grad_norm": 0.7068589925765991,
"learning_rate": 3.2004475672083726e-05,
"loss": 0.5366,
"step": 110500
},
{
"epoch": 0.36025706589186424,
"grad_norm": 0.5528571009635925,
"learning_rate": 3.19881886557674e-05,
"loss": 0.5041,
"step": 110600
},
{
"epoch": 0.3605827956078605,
"grad_norm": 0.39369356632232666,
"learning_rate": 3.1971901639451065e-05,
"loss": 0.4997,
"step": 110700
},
{
"epoch": 0.36090852532385675,
"grad_norm": 0.9514594674110413,
"learning_rate": 3.195561462313473e-05,
"loss": 0.4835,
"step": 110800
},
{
"epoch": 0.361234255039853,
"grad_norm": 0.7980431318283081,
"learning_rate": 3.1939327606818396e-05,
"loss": 0.4503,
"step": 110900
},
{
"epoch": 0.3615599847558493,
"grad_norm": 0.6403480172157288,
"learning_rate": 3.192304059050206e-05,
"loss": 0.4764,
"step": 111000
}
],
"logging_steps": 100,
"max_steps": 307003,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.649417127985152e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}