zzjrabbit3-pre2 / checkpoint-82500 /trainer_state.json
xiaoyewuz-Ruster's picture
Add files using upload-large-folder tool
3a0dae7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1689600865075643,
"eval_steps": 500,
"global_step": 82500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010240005242882685,
"grad_norm": 1.5689221620559692,
"learning_rate": 1.47e-05,
"loss": 10.437386474609376,
"step": 50
},
{
"epoch": 0.0002048001048576537,
"grad_norm": 1.4314604997634888,
"learning_rate": 2.97e-05,
"loss": 8.872786865234374,
"step": 100
},
{
"epoch": 0.00030720015728648054,
"grad_norm": 2.1900956630706787,
"learning_rate": 4.4699999999999996e-05,
"loss": 6.67523681640625,
"step": 150
},
{
"epoch": 0.0004096002097153074,
"grad_norm": 0.7161903381347656,
"learning_rate": 5.97e-05,
"loss": 4.3745730590820315,
"step": 200
},
{
"epoch": 0.0005120002621441342,
"grad_norm": 1.2864420413970947,
"learning_rate": 7.47e-05,
"loss": 1.9937155151367187,
"step": 250
},
{
"epoch": 0.0006144003145729611,
"grad_norm": 1.5255779027938843,
"learning_rate": 8.969999999999998e-05,
"loss": 6.841383056640625,
"step": 300
},
{
"epoch": 0.0007168003670017879,
"grad_norm": 1.0778907537460327,
"learning_rate": 0.00010469999999999998,
"loss": 6.193285522460937,
"step": 350
},
{
"epoch": 0.0008192004194306148,
"grad_norm": 0.8099711537361145,
"learning_rate": 0.0001197,
"loss": 6.307914428710937,
"step": 400
},
{
"epoch": 0.0009216004718594416,
"grad_norm": 1.3735090494155884,
"learning_rate": 0.0001347,
"loss": 5.728865966796875,
"step": 450
},
{
"epoch": 0.0010240005242882684,
"grad_norm": 1.2599254846572876,
"learning_rate": 0.00014969999999999998,
"loss": 5.8577117919921875,
"step": 500
},
{
"epoch": 0.0011264005767170954,
"grad_norm": 1.0690525770187378,
"learning_rate": 0.0001647,
"loss": 5.578800048828125,
"step": 550
},
{
"epoch": 0.0012288006291459222,
"grad_norm": 0.9692347049713135,
"learning_rate": 0.00017969999999999998,
"loss": 5.012375183105469,
"step": 600
},
{
"epoch": 0.001331200681574749,
"grad_norm": 1.241825819015503,
"learning_rate": 0.0001947,
"loss": 6.2367095947265625,
"step": 650
},
{
"epoch": 0.0014336007340035757,
"grad_norm": 1.1092249155044556,
"learning_rate": 0.00020969999999999997,
"loss": 5.6980218505859375,
"step": 700
},
{
"epoch": 0.0015360007864324027,
"grad_norm": 0.8965554237365723,
"learning_rate": 0.0002247,
"loss": 5.374319458007813,
"step": 750
},
{
"epoch": 0.0016384008388612295,
"grad_norm": 1.4790899753570557,
"learning_rate": 0.0002397,
"loss": 4.950992126464843,
"step": 800
},
{
"epoch": 0.0017408008912900563,
"grad_norm": 0.9521295428276062,
"learning_rate": 0.00025469999999999996,
"loss": 5.40803955078125,
"step": 850
},
{
"epoch": 0.0018432009437188831,
"grad_norm": 0.836391806602478,
"learning_rate": 0.0002697,
"loss": 5.781373291015625,
"step": 900
},
{
"epoch": 0.0019456009961477101,
"grad_norm": 0.9251846075057983,
"learning_rate": 0.0002847,
"loss": 5.027839660644531,
"step": 950
},
{
"epoch": 0.0020480010485765367,
"grad_norm": 0.8701666593551636,
"learning_rate": 0.00029969999999999997,
"loss": 5.40314208984375,
"step": 1000
},
{
"epoch": 0.002150401101005364,
"grad_norm": 1.102386474609375,
"learning_rate": 0.0002999999925149585,
"loss": 5.350908203125,
"step": 1050
},
{
"epoch": 0.0022528011534341907,
"grad_norm": 1.2463473081588745,
"learning_rate": 0.0002999999694456937,
"loss": 5.666819458007812,
"step": 1100
},
{
"epoch": 0.0023552012058630175,
"grad_norm": 1.5089119672775269,
"learning_rate": 0.00029999993078909046,
"loss": 5.699287109375,
"step": 1150
},
{
"epoch": 0.0024576012582918443,
"grad_norm": 0.8943452835083008,
"learning_rate": 0.0002999998765451527,
"loss": 5.515684814453125,
"step": 1200
},
{
"epoch": 0.002560001310720671,
"grad_norm": 0.9431995749473572,
"learning_rate": 0.0002999998067138862,
"loss": 5.668785400390625,
"step": 1250
},
{
"epoch": 0.002662401363149498,
"grad_norm": 0.7088674902915955,
"learning_rate": 0.00029999972129529813,
"loss": 5.059076843261718,
"step": 1300
},
{
"epoch": 0.0027648014155783247,
"grad_norm": 0.7139531970024109,
"learning_rate": 0.00029999962028939744,
"loss": 5.189839477539063,
"step": 1350
},
{
"epoch": 0.0028672014680071515,
"grad_norm": 0.6557895541191101,
"learning_rate": 0.0002999995036961946,
"loss": 5.100037841796875,
"step": 1400
},
{
"epoch": 0.0029696015204359783,
"grad_norm": 2.214290142059326,
"learning_rate": 0.0002999993715157016,
"loss": 3.933666076660156,
"step": 1450
},
{
"epoch": 0.0030720015728648055,
"grad_norm": 1.1693249940872192,
"learning_rate": 0.0002999992237479324,
"loss": 5.641339721679688,
"step": 1500
},
{
"epoch": 0.0031744016252936323,
"grad_norm": 0.812566876411438,
"learning_rate": 0.0002999990603929022,
"loss": 4.826256408691406,
"step": 1550
},
{
"epoch": 0.003276801677722459,
"grad_norm": 0.8744778037071228,
"learning_rate": 0.00029999888145062803,
"loss": 5.060762329101562,
"step": 1600
},
{
"epoch": 0.003379201730151286,
"grad_norm": 1.4869335889816284,
"learning_rate": 0.0002999986869211285,
"loss": 5.231287231445313,
"step": 1650
},
{
"epoch": 0.0034816017825801127,
"grad_norm": 2.121548652648926,
"learning_rate": 0.0002999984768044237,
"loss": 5.097483520507812,
"step": 1700
},
{
"epoch": 0.0035840018350089395,
"grad_norm": 0.8741556406021118,
"learning_rate": 0.00029999825110053565,
"loss": 4.697709045410156,
"step": 1750
},
{
"epoch": 0.0036864018874377662,
"grad_norm": 0.6771953105926514,
"learning_rate": 0.00029999800980948764,
"loss": 5.405962524414062,
"step": 1800
},
{
"epoch": 0.003788801939866593,
"grad_norm": 0.7090007066726685,
"learning_rate": 0.00029999775293130485,
"loss": 5.24799560546875,
"step": 1850
},
{
"epoch": 0.0038912019922954203,
"grad_norm": 0.561838686466217,
"learning_rate": 0.00029999748046601396,
"loss": 5.034546813964844,
"step": 1900
},
{
"epoch": 0.003993602044724247,
"grad_norm": 1.955099105834961,
"learning_rate": 0.0002999971924136432,
"loss": 4.816056823730468,
"step": 1950
},
{
"epoch": 0.004096002097153073,
"grad_norm": 1.5861859321594238,
"learning_rate": 0.00029999688877422264,
"loss": 4.836883544921875,
"step": 2000
},
{
"epoch": 0.0041984021495819,
"grad_norm": 0.599829375743866,
"learning_rate": 0.00029999656954778374,
"loss": 4.677350463867188,
"step": 2050
},
{
"epoch": 0.004300802202010728,
"grad_norm": 0.7785560488700867,
"learning_rate": 0.0002999962347343597,
"loss": 4.665549621582032,
"step": 2100
},
{
"epoch": 0.004403202254439555,
"grad_norm": 0.7040075659751892,
"learning_rate": 0.00029999588433398533,
"loss": 4.816753540039063,
"step": 2150
},
{
"epoch": 0.0045056023068683814,
"grad_norm": 0.9000102877616882,
"learning_rate": 0.00029999551834669695,
"loss": 4.776250915527344,
"step": 2200
},
{
"epoch": 0.004608002359297208,
"grad_norm": 0.8187811374664307,
"learning_rate": 0.0002999951367725327,
"loss": 5.544743041992188,
"step": 2250
},
{
"epoch": 0.004710402411726035,
"grad_norm": 0.684819757938385,
"learning_rate": 0.0002999947396115322,
"loss": 5.165157470703125,
"step": 2300
},
{
"epoch": 0.004812802464154862,
"grad_norm": 1.125178337097168,
"learning_rate": 0.0002999943268637367,
"loss": 4.768605651855469,
"step": 2350
},
{
"epoch": 0.004915202516583689,
"grad_norm": 0.8499088287353516,
"learning_rate": 0.0002999938985291891,
"loss": 4.563653869628906,
"step": 2400
},
{
"epoch": 0.005017602569012515,
"grad_norm": 0.8239416480064392,
"learning_rate": 0.0002999934546079339,
"loss": 4.3343331909179685,
"step": 2450
},
{
"epoch": 0.005120002621441342,
"grad_norm": 0.9708461761474609,
"learning_rate": 0.00029999299510001726,
"loss": 4.572106018066406,
"step": 2500
},
{
"epoch": 0.005222402673870169,
"grad_norm": 0.5595722794532776,
"learning_rate": 0.0002999925200054869,
"loss": 3.886677551269531,
"step": 2550
},
{
"epoch": 0.005324802726298996,
"grad_norm": 0.843467116355896,
"learning_rate": 0.0002999920293243922,
"loss": 4.781981506347656,
"step": 2600
},
{
"epoch": 0.005427202778727823,
"grad_norm": 0.7127471566200256,
"learning_rate": 0.0002999915230567842,
"loss": 4.583160400390625,
"step": 2650
},
{
"epoch": 0.005529602831156649,
"grad_norm": 1.2107303142547607,
"learning_rate": 0.00029999100120271544,
"loss": 4.792764587402344,
"step": 2700
},
{
"epoch": 0.005632002883585476,
"grad_norm": 0.46370163559913635,
"learning_rate": 0.0002999904637622402,
"loss": 4.452548522949218,
"step": 2750
},
{
"epoch": 0.005734402936014303,
"grad_norm": 0.8558986186981201,
"learning_rate": 0.00029998991073541424,
"loss": 4.687911376953125,
"step": 2800
},
{
"epoch": 0.00583680298844313,
"grad_norm": 0.716712236404419,
"learning_rate": 0.0002999893421222951,
"loss": 5.1007318115234375,
"step": 2850
},
{
"epoch": 0.0059392030408719565,
"grad_norm": 0.6236938238143921,
"learning_rate": 0.00029998875792294186,
"loss": 4.0649325561523435,
"step": 2900
},
{
"epoch": 0.006041603093300784,
"grad_norm": 0.7991392612457275,
"learning_rate": 0.0002999881581374152,
"loss": 5.119035339355468,
"step": 2950
},
{
"epoch": 0.006144003145729611,
"grad_norm": 0.8357495665550232,
"learning_rate": 0.00029998754276577757,
"loss": 4.757432556152343,
"step": 3000
},
{
"epoch": 0.006246403198158438,
"grad_norm": 0.6117859482765198,
"learning_rate": 0.0002999869118080927,
"loss": 4.448386840820312,
"step": 3050
},
{
"epoch": 0.0063488032505872646,
"grad_norm": 0.49256569147109985,
"learning_rate": 0.0002999862652644263,
"loss": 3.11305419921875,
"step": 3100
},
{
"epoch": 0.006451203303016091,
"grad_norm": 0.6232755184173584,
"learning_rate": 0.00029998560313484557,
"loss": 4.7346923828125,
"step": 3150
},
{
"epoch": 0.006553603355444918,
"grad_norm": 0.9806835055351257,
"learning_rate": 0.00029998492541941926,
"loss": 5.011588745117187,
"step": 3200
},
{
"epoch": 0.006656003407873745,
"grad_norm": 0.5504988431930542,
"learning_rate": 0.00029998423211821776,
"loss": 4.568263549804687,
"step": 3250
},
{
"epoch": 0.006758403460302572,
"grad_norm": 1.2172794342041016,
"learning_rate": 0.0002999835232313133,
"loss": 4.617164306640625,
"step": 3300
},
{
"epoch": 0.0068608035127313985,
"grad_norm": 0.8813052773475647,
"learning_rate": 0.0002999827987587793,
"loss": 4.053099975585938,
"step": 3350
},
{
"epoch": 0.006963203565160225,
"grad_norm": 0.9132696986198425,
"learning_rate": 0.0002999820587006912,
"loss": 3.3842108154296877,
"step": 3400
},
{
"epoch": 0.007065603617589052,
"grad_norm": 0.6898446679115295,
"learning_rate": 0.0002999813030571258,
"loss": 4.640269470214844,
"step": 3450
},
{
"epoch": 0.007168003670017879,
"grad_norm": 0.8895163536071777,
"learning_rate": 0.0002999805318281617,
"loss": 4.337832641601563,
"step": 3500
},
{
"epoch": 0.007270403722446706,
"grad_norm": 0.8650217056274414,
"learning_rate": 0.000299979745013879,
"loss": 4.312217102050782,
"step": 3550
},
{
"epoch": 0.0073728037748755325,
"grad_norm": 0.8591002821922302,
"learning_rate": 0.0002999789426143595,
"loss": 4.517200622558594,
"step": 3600
},
{
"epoch": 0.007475203827304359,
"grad_norm": 1.0993435382843018,
"learning_rate": 0.0002999781246296866,
"loss": 5.017222900390625,
"step": 3650
},
{
"epoch": 0.007577603879733186,
"grad_norm": 0.826409101486206,
"learning_rate": 0.00029997729105994523,
"loss": 5.4449609375,
"step": 3700
},
{
"epoch": 0.007680003932162013,
"grad_norm": 0.7336626052856445,
"learning_rate": 0.0002999764419052221,
"loss": 5.442882080078125,
"step": 3750
},
{
"epoch": 0.0077824039845908405,
"grad_norm": 0.8554229140281677,
"learning_rate": 0.00029997557716560536,
"loss": 5.044765625,
"step": 3800
},
{
"epoch": 0.007884804037019667,
"grad_norm": 1.2047715187072754,
"learning_rate": 0.0002999746968411849,
"loss": 5.347750244140625,
"step": 3850
},
{
"epoch": 0.007987204089448493,
"grad_norm": 0.6852602362632751,
"learning_rate": 0.00029997380093205227,
"loss": 5.431246948242188,
"step": 3900
},
{
"epoch": 0.008089604141877321,
"grad_norm": 0.599185585975647,
"learning_rate": 0.00029997288943830043,
"loss": 5.4587548828125,
"step": 3950
},
{
"epoch": 0.008192004194306147,
"grad_norm": 0.6573649644851685,
"learning_rate": 0.0002999719623600242,
"loss": 5.388607177734375,
"step": 4000
},
{
"epoch": 0.008294404246734974,
"grad_norm": 0.8899281024932861,
"learning_rate": 0.00029997101969731995,
"loss": 5.013424072265625,
"step": 4050
},
{
"epoch": 0.0083968042991638,
"grad_norm": 0.7623964548110962,
"learning_rate": 0.0002999700614502855,
"loss": 5.455863037109375,
"step": 4100
},
{
"epoch": 0.008499204351592628,
"grad_norm": 0.6434335112571716,
"learning_rate": 0.0002999690876190205,
"loss": 4.965211791992187,
"step": 4150
},
{
"epoch": 0.008601604404021456,
"grad_norm": 1.0846576690673828,
"learning_rate": 0.0002999680982036263,
"loss": 5.367398071289062,
"step": 4200
},
{
"epoch": 0.008704004456450282,
"grad_norm": 0.687623143196106,
"learning_rate": 0.0002999670932042054,
"loss": 5.260775146484375,
"step": 4250
},
{
"epoch": 0.00880640450887911,
"grad_norm": 0.7438795566558838,
"learning_rate": 0.0002999660726208625,
"loss": 4.861600341796875,
"step": 4300
},
{
"epoch": 0.008908804561307935,
"grad_norm": 0.653516948223114,
"learning_rate": 0.0002999650364537035,
"loss": 5.213981323242187,
"step": 4350
},
{
"epoch": 0.009011204613736763,
"grad_norm": 0.6365879774093628,
"learning_rate": 0.0002999639847028362,
"loss": 5.282333984375,
"step": 4400
},
{
"epoch": 0.009113604666165589,
"grad_norm": 1.073702335357666,
"learning_rate": 0.00029996291736836977,
"loss": 4.728897705078125,
"step": 4450
},
{
"epoch": 0.009216004718594416,
"grad_norm": 0.5726307034492493,
"learning_rate": 0.00029996183445041524,
"loss": 4.985563354492188,
"step": 4500
},
{
"epoch": 0.009318404771023242,
"grad_norm": 0.8428155779838562,
"learning_rate": 0.00029996073594908503,
"loss": 5.237740478515625,
"step": 4550
},
{
"epoch": 0.00942080482345207,
"grad_norm": 0.7983867526054382,
"learning_rate": 0.0002999596218644934,
"loss": 5.2612847900390625,
"step": 4600
},
{
"epoch": 0.009523204875880896,
"grad_norm": 1.4800513982772827,
"learning_rate": 0.000299958492196756,
"loss": 5.220035400390625,
"step": 4650
},
{
"epoch": 0.009625604928309724,
"grad_norm": 0.7891004085540771,
"learning_rate": 0.00029995734694599033,
"loss": 4.930169677734375,
"step": 4700
},
{
"epoch": 0.00972800498073855,
"grad_norm": 0.6847373247146606,
"learning_rate": 0.0002999561861123153,
"loss": 4.984630126953125,
"step": 4750
},
{
"epoch": 0.009830405033167377,
"grad_norm": 0.6594445705413818,
"learning_rate": 0.0002999550096958517,
"loss": 5.030910034179687,
"step": 4800
},
{
"epoch": 0.009932805085596203,
"grad_norm": 0.6435703635215759,
"learning_rate": 0.0002999538176967216,
"loss": 5.204117431640625,
"step": 4850
},
{
"epoch": 0.01003520513802503,
"grad_norm": 0.43691107630729675,
"learning_rate": 0.0002999526101150489,
"loss": 4.9494412231445315,
"step": 4900
},
{
"epoch": 0.010137605190453857,
"grad_norm": 0.839853823184967,
"learning_rate": 0.00029995138695095914,
"loss": 3.1014248657226564,
"step": 4950
},
{
"epoch": 0.010240005242882684,
"grad_norm": 0.8040403723716736,
"learning_rate": 0.00029995014820457947,
"loss": 5.11622314453125,
"step": 5000
},
{
"epoch": 0.010342405295311512,
"grad_norm": 0.6953795552253723,
"learning_rate": 0.0002999488938760385,
"loss": 5.122266235351563,
"step": 5050
},
{
"epoch": 0.010444805347740338,
"grad_norm": 0.5960660576820374,
"learning_rate": 0.00029994762396546665,
"loss": 4.512597961425781,
"step": 5100
},
{
"epoch": 0.010547205400169166,
"grad_norm": 0.7795936465263367,
"learning_rate": 0.0002999463384729958,
"loss": 4.1439907836914065,
"step": 5150
},
{
"epoch": 0.010649605452597992,
"grad_norm": 0.5827996730804443,
"learning_rate": 0.0002999450373987597,
"loss": 5.13221435546875,
"step": 5200
},
{
"epoch": 0.01075200550502682,
"grad_norm": 0.5559226870536804,
"learning_rate": 0.0002999437207428934,
"loss": 5.330996704101563,
"step": 5250
},
{
"epoch": 0.010854405557455645,
"grad_norm": 0.7576444745063782,
"learning_rate": 0.0002999423885055338,
"loss": 5.0482110595703125,
"step": 5300
},
{
"epoch": 0.010956805609884473,
"grad_norm": 0.6038886308670044,
"learning_rate": 0.0002999410406868193,
"loss": 5.026975708007813,
"step": 5350
},
{
"epoch": 0.011059205662313299,
"grad_norm": 0.8441299200057983,
"learning_rate": 0.00029993967728688997,
"loss": 5.212452392578125,
"step": 5400
},
{
"epoch": 0.011161605714742126,
"grad_norm": 0.6785016655921936,
"learning_rate": 0.00029993829830588745,
"loss": 5.052464599609375,
"step": 5450
},
{
"epoch": 0.011264005767170952,
"grad_norm": 0.7248463034629822,
"learning_rate": 0.0002999369037439551,
"loss": 4.948311157226563,
"step": 5500
},
{
"epoch": 0.01136640581959978,
"grad_norm": 1.8698147535324097,
"learning_rate": 0.00029993549360123777,
"loss": 4.748592529296875,
"step": 5550
},
{
"epoch": 0.011468805872028606,
"grad_norm": 0.5474430918693542,
"learning_rate": 0.0002999340678778821,
"loss": 4.849425659179688,
"step": 5600
},
{
"epoch": 0.011571205924457434,
"grad_norm": 0.6169009804725647,
"learning_rate": 0.00029993262657403613,
"loss": 4.795867919921875,
"step": 5650
},
{
"epoch": 0.01167360597688626,
"grad_norm": 0.773813009262085,
"learning_rate": 0.0002999311696898497,
"loss": 4.561126098632813,
"step": 5700
},
{
"epoch": 0.011776006029315087,
"grad_norm": 0.841324508190155,
"learning_rate": 0.00029992969722547424,
"loss": 4.801204223632812,
"step": 5750
},
{
"epoch": 0.011878406081743913,
"grad_norm": 0.6325180530548096,
"learning_rate": 0.0002999282091810627,
"loss": 5.0141598510742185,
"step": 5800
},
{
"epoch": 0.01198080613417274,
"grad_norm": 0.6073687672615051,
"learning_rate": 0.00029992670555676964,
"loss": 4.727720642089844,
"step": 5850
},
{
"epoch": 0.012083206186601568,
"grad_norm": 0.7254152297973633,
"learning_rate": 0.00029992518635275147,
"loss": 5.180827026367187,
"step": 5900
},
{
"epoch": 0.012185606239030394,
"grad_norm": 0.6669420599937439,
"learning_rate": 0.000299923651569166,
"loss": 5.232777099609375,
"step": 5950
},
{
"epoch": 0.012288006291459222,
"grad_norm": 0.9034198522567749,
"learning_rate": 0.0002999221012061726,
"loss": 4.571735229492187,
"step": 6000
},
{
"epoch": 0.012390406343888048,
"grad_norm": 0.9541974663734436,
"learning_rate": 0.0002999205352639326,
"loss": 4.678871459960938,
"step": 6050
},
{
"epoch": 0.012492806396316876,
"grad_norm": 1.4738138914108276,
"learning_rate": 0.0002999189537426085,
"loss": 4.96472412109375,
"step": 6100
},
{
"epoch": 0.012595206448745701,
"grad_norm": 0.7434485554695129,
"learning_rate": 0.0002999173566423648,
"loss": 5.090062255859375,
"step": 6150
},
{
"epoch": 0.012697606501174529,
"grad_norm": 0.5921583771705627,
"learning_rate": 0.0002999157439633674,
"loss": 4.839577026367188,
"step": 6200
},
{
"epoch": 0.012800006553603355,
"grad_norm": 0.5730924606323242,
"learning_rate": 0.00029991411570578385,
"loss": 4.44057373046875,
"step": 6250
},
{
"epoch": 0.012902406606032183,
"grad_norm": 0.6314680576324463,
"learning_rate": 0.0002999124718697834,
"loss": 4.906407165527344,
"step": 6300
},
{
"epoch": 0.013004806658461009,
"grad_norm": 0.5586856603622437,
"learning_rate": 0.00029991081245553695,
"loss": 4.8386752319335935,
"step": 6350
},
{
"epoch": 0.013107206710889836,
"grad_norm": 0.4960859417915344,
"learning_rate": 0.0002999091374632168,
"loss": 4.7797067260742185,
"step": 6400
},
{
"epoch": 0.013209606763318662,
"grad_norm": 0.7504858374595642,
"learning_rate": 0.0002999074468929971,
"loss": 4.906391906738281,
"step": 6450
},
{
"epoch": 0.01331200681574749,
"grad_norm": 0.5791200995445251,
"learning_rate": 0.0002999057407450534,
"loss": 4.6073193359375,
"step": 6500
},
{
"epoch": 0.013414406868176316,
"grad_norm": 1.04066002368927,
"learning_rate": 0.00029990401901956314,
"loss": 4.697982177734375,
"step": 6550
},
{
"epoch": 0.013516806920605143,
"grad_norm": 0.5570167899131775,
"learning_rate": 0.0002999022817167052,
"loss": 5.063222351074219,
"step": 6600
},
{
"epoch": 0.01361920697303397,
"grad_norm": 0.6061655879020691,
"learning_rate": 0.00029990052883666004,
"loss": 4.329053955078125,
"step": 6650
},
{
"epoch": 0.013721607025462797,
"grad_norm": 0.6637709736824036,
"learning_rate": 0.0002998987603796099,
"loss": 4.776343688964844,
"step": 6700
},
{
"epoch": 0.013824007077891625,
"grad_norm": 0.6519717574119568,
"learning_rate": 0.0002998969763457385,
"loss": 4.839088439941406,
"step": 6750
},
{
"epoch": 0.01392640713032045,
"grad_norm": 0.643963098526001,
"learning_rate": 0.00029989517673523127,
"loss": 4.581628112792969,
"step": 6800
},
{
"epoch": 0.014028807182749278,
"grad_norm": 1.4058446884155273,
"learning_rate": 0.0002998933615482751,
"loss": 4.007187194824219,
"step": 6850
},
{
"epoch": 0.014131207235178104,
"grad_norm": 0.7021802067756653,
"learning_rate": 0.00029989153078505886,
"loss": 4.761097106933594,
"step": 6900
},
{
"epoch": 0.014233607287606932,
"grad_norm": 0.6105393171310425,
"learning_rate": 0.0002998896844457725,
"loss": 5.0122119140625,
"step": 6950
},
{
"epoch": 0.014336007340035758,
"grad_norm": 0.7652610540390015,
"learning_rate": 0.00029988782253060806,
"loss": 4.946090393066406,
"step": 7000
},
{
"epoch": 0.014438407392464585,
"grad_norm": 0.7618656754493713,
"learning_rate": 0.000299885945039759,
"loss": 3.561051025390625,
"step": 7050
},
{
"epoch": 0.014540807444893411,
"grad_norm": 0.6516929864883423,
"learning_rate": 0.0002998840519734204,
"loss": 4.529894409179687,
"step": 7100
},
{
"epoch": 0.014643207497322239,
"grad_norm": 1.0100959539413452,
"learning_rate": 0.000299882143331789,
"loss": 4.72200927734375,
"step": 7150
},
{
"epoch": 0.014745607549751065,
"grad_norm": 0.9135130047798157,
"learning_rate": 0.0002998802191150631,
"loss": 4.017086791992187,
"step": 7200
},
{
"epoch": 0.014848007602179893,
"grad_norm": 1.0336369276046753,
"learning_rate": 0.0002998782793234427,
"loss": 4.969613952636719,
"step": 7250
},
{
"epoch": 0.014950407654608719,
"grad_norm": 0.6827586889266968,
"learning_rate": 0.0002998763239571293,
"loss": 4.958232421875,
"step": 7300
},
{
"epoch": 0.015052807707037546,
"grad_norm": 0.8095134496688843,
"learning_rate": 0.00029987435301632624,
"loss": 4.539352722167969,
"step": 7350
},
{
"epoch": 0.015155207759466372,
"grad_norm": 0.811736524105072,
"learning_rate": 0.0002998723665012382,
"loss": 4.618602905273438,
"step": 7400
},
{
"epoch": 0.0152576078118952,
"grad_norm": 0.6750462651252747,
"learning_rate": 0.00029987036441207163,
"loss": 4.390194702148437,
"step": 7450
},
{
"epoch": 0.015360007864324026,
"grad_norm": 0.6136668920516968,
"learning_rate": 0.0002998683467490346,
"loss": 4.691050109863281,
"step": 7500
},
{
"epoch": 0.015462407916752853,
"grad_norm": 0.608397364616394,
"learning_rate": 0.0002998663135123368,
"loss": 5.00837646484375,
"step": 7550
},
{
"epoch": 0.015564807969181681,
"grad_norm": 0.6426307559013367,
"learning_rate": 0.0002998642647021895,
"loss": 4.924872741699219,
"step": 7600
},
{
"epoch": 0.015667208021610507,
"grad_norm": 0.8153278827667236,
"learning_rate": 0.00029986220031880557,
"loss": 4.830538635253906,
"step": 7650
},
{
"epoch": 0.015769608074039335,
"grad_norm": 0.6194471120834351,
"learning_rate": 0.0002998601203623995,
"loss": 4.807819213867187,
"step": 7700
},
{
"epoch": 0.015872008126468162,
"grad_norm": 1.5707075595855713,
"learning_rate": 0.00029985802483318755,
"loss": 4.509772644042969,
"step": 7750
},
{
"epoch": 0.015974408178896986,
"grad_norm": 0.8517248630523682,
"learning_rate": 0.0002998559137313874,
"loss": 4.2860891723632815,
"step": 7800
},
{
"epoch": 0.016076808231325814,
"grad_norm": 1.0736734867095947,
"learning_rate": 0.00029985378705721843,
"loss": 4.5593634033203125,
"step": 7850
},
{
"epoch": 0.016179208283754642,
"grad_norm": 0.6145778894424438,
"learning_rate": 0.0002998516448109016,
"loss": 4.50625,
"step": 7900
},
{
"epoch": 0.01628160833618347,
"grad_norm": 0.7230775356292725,
"learning_rate": 0.00029984948699265967,
"loss": 4.884090270996094,
"step": 7950
},
{
"epoch": 0.016384008388612294,
"grad_norm": 0.7744879722595215,
"learning_rate": 0.0002998473136027167,
"loss": 4.186481018066406,
"step": 8000
},
{
"epoch": 0.01648640844104112,
"grad_norm": 0.7375713586807251,
"learning_rate": 0.00029984512464129856,
"loss": 4.879469299316407,
"step": 8050
},
{
"epoch": 0.01658880849346995,
"grad_norm": 1.0072307586669922,
"learning_rate": 0.0002998429201086329,
"loss": 4.755104064941406,
"step": 8100
},
{
"epoch": 0.016691208545898777,
"grad_norm": 0.9491130113601685,
"learning_rate": 0.00029984070000494854,
"loss": 4.182529907226563,
"step": 8150
},
{
"epoch": 0.0167936085983276,
"grad_norm": 0.9159969687461853,
"learning_rate": 0.00029983846433047633,
"loss": 4.361718444824219,
"step": 8200
},
{
"epoch": 0.01689600865075643,
"grad_norm": 0.9138163328170776,
"learning_rate": 0.00029983621308544864,
"loss": 4.748040466308594,
"step": 8250
},
{
"epoch": 0.016998408703185256,
"grad_norm": 0.7999444603919983,
"learning_rate": 0.0002998339462700993,
"loss": 4.52157470703125,
"step": 8300
},
{
"epoch": 0.017100808755614084,
"grad_norm": 0.732362687587738,
"learning_rate": 0.0002998316638846639,
"loss": 4.664584045410156,
"step": 8350
},
{
"epoch": 0.01720320880804291,
"grad_norm": 0.9679093956947327,
"learning_rate": 0.00029982936592937967,
"loss": 4.6484066772460935,
"step": 8400
},
{
"epoch": 0.017305608860471736,
"grad_norm": 0.7307636141777039,
"learning_rate": 0.0002998270524044853,
"loss": 4.694376220703125,
"step": 8450
},
{
"epoch": 0.017408008912900563,
"grad_norm": 0.7069781422615051,
"learning_rate": 0.00029982472331022126,
"loss": 4.551060180664063,
"step": 8500
},
{
"epoch": 0.01751040896532939,
"grad_norm": 0.764034628868103,
"learning_rate": 0.00029982237864682965,
"loss": 4.622559814453125,
"step": 8550
},
{
"epoch": 0.01761280901775822,
"grad_norm": 0.7239750623703003,
"learning_rate": 0.000299820018414554,
"loss": 4.617013549804687,
"step": 8600
},
{
"epoch": 0.017715209070187043,
"grad_norm": 0.6056758165359497,
"learning_rate": 0.0002998176426136396,
"loss": 4.456921997070313,
"step": 8650
},
{
"epoch": 0.01781760912261587,
"grad_norm": 0.8634012341499329,
"learning_rate": 0.0002998152512443334,
"loss": 4.55794677734375,
"step": 8700
},
{
"epoch": 0.017920009175044698,
"grad_norm": 0.7804837226867676,
"learning_rate": 0.00029981284430688384,
"loss": 4.680322570800781,
"step": 8750
},
{
"epoch": 0.018022409227473526,
"grad_norm": 0.773954451084137,
"learning_rate": 0.00029981042180154103,
"loss": 4.5744256591796875,
"step": 8800
},
{
"epoch": 0.01812480927990235,
"grad_norm": 0.691335916519165,
"learning_rate": 0.0002998079837285568,
"loss": 4.607868347167969,
"step": 8850
},
{
"epoch": 0.018227209332331178,
"grad_norm": 0.4418846368789673,
"learning_rate": 0.0002998055300881844,
"loss": 4.455259094238281,
"step": 8900
},
{
"epoch": 0.018329609384760005,
"grad_norm": 1.0125758647918701,
"learning_rate": 0.00029980306088067877,
"loss": 3.1990432739257812,
"step": 8950
},
{
"epoch": 0.018432009437188833,
"grad_norm": 0.7495264410972595,
"learning_rate": 0.00029980057610629664,
"loss": 4.650667419433594,
"step": 9000
},
{
"epoch": 0.018534409489617657,
"grad_norm": 0.8682289123535156,
"learning_rate": 0.0002997980757652961,
"loss": 3.851683349609375,
"step": 9050
},
{
"epoch": 0.018636809542046485,
"grad_norm": 0.9349716305732727,
"learning_rate": 0.000299795559857937,
"loss": 4.859715576171875,
"step": 9100
},
{
"epoch": 0.018739209594475312,
"grad_norm": 0.7786422967910767,
"learning_rate": 0.0002997930283844809,
"loss": 4.666428833007813,
"step": 9150
},
{
"epoch": 0.01884160964690414,
"grad_norm": 0.7877052426338196,
"learning_rate": 0.0002997904813451907,
"loss": 4.6610784912109375,
"step": 9200
},
{
"epoch": 0.018944009699332968,
"grad_norm": 0.9601690173149109,
"learning_rate": 0.00029978791874033114,
"loss": 4.808619384765625,
"step": 9250
},
{
"epoch": 0.019046409751761792,
"grad_norm": 0.5345655083656311,
"learning_rate": 0.0002997853405701684,
"loss": 4.262407836914062,
"step": 9300
},
{
"epoch": 0.01914880980419062,
"grad_norm": 0.8365965485572815,
"learning_rate": 0.00029978274683497067,
"loss": 3.8195550537109373,
"step": 9350
},
{
"epoch": 0.019251209856619447,
"grad_norm": 0.8324418663978577,
"learning_rate": 0.00029978013753500723,
"loss": 4.371593933105469,
"step": 9400
},
{
"epoch": 0.019353609909048275,
"grad_norm": 0.7757883071899414,
"learning_rate": 0.00029977751267054934,
"loss": 4.406093444824219,
"step": 9450
},
{
"epoch": 0.0194560099614771,
"grad_norm": 0.8704003095626831,
"learning_rate": 0.0002997748722418697,
"loss": 4.736319885253907,
"step": 9500
},
{
"epoch": 0.019558410013905927,
"grad_norm": 0.8212069869041443,
"learning_rate": 0.0002997722162492427,
"loss": 4.341388549804687,
"step": 9550
},
{
"epoch": 0.019660810066334754,
"grad_norm": 0.5836915373802185,
"learning_rate": 0.0002997695446929444,
"loss": 4.658592529296875,
"step": 9600
},
{
"epoch": 0.019763210118763582,
"grad_norm": 0.8792363405227661,
"learning_rate": 0.0002997668575732524,
"loss": 4.1852349853515625,
"step": 9650
},
{
"epoch": 0.019865610171192406,
"grad_norm": 0.6817139387130737,
"learning_rate": 0.00029976415489044585,
"loss": 4.120821838378906,
"step": 9700
},
{
"epoch": 0.019968010223621234,
"grad_norm": 0.9270561337471008,
"learning_rate": 0.0002997614366448057,
"loss": 4.595604553222656,
"step": 9750
},
{
"epoch": 0.02007041027605006,
"grad_norm": 0.7752207517623901,
"learning_rate": 0.0002997587028366144,
"loss": 4.643276977539062,
"step": 9800
},
{
"epoch": 0.02017281032847889,
"grad_norm": 0.6949714422225952,
"learning_rate": 0.000299755953466156,
"loss": 4.598296203613281,
"step": 9850
},
{
"epoch": 0.020275210380907713,
"grad_norm": 0.6971185207366943,
"learning_rate": 0.00029975318853371624,
"loss": 3.976045837402344,
"step": 9900
},
{
"epoch": 0.02037761043333654,
"grad_norm": 0.6620817184448242,
"learning_rate": 0.00029975040803958237,
"loss": 4.670194396972656,
"step": 9950
},
{
"epoch": 0.02048001048576537,
"grad_norm": 0.7390024065971375,
"learning_rate": 0.0002997476119840434,
"loss": 4.440447998046875,
"step": 10000
},
{
"epoch": 0.020582410538194196,
"grad_norm": 1.074389934539795,
"learning_rate": 0.0002997448003673899,
"loss": 4.406011352539062,
"step": 10050
},
{
"epoch": 0.020684810590623024,
"grad_norm": 0.7580602765083313,
"learning_rate": 0.000299741973189914,
"loss": 4.489655456542969,
"step": 10100
},
{
"epoch": 0.02078721064305185,
"grad_norm": 0.8966153860092163,
"learning_rate": 0.0002997391304519094,
"loss": 4.419082946777344,
"step": 10150
},
{
"epoch": 0.020889610695480676,
"grad_norm": 0.8477383255958557,
"learning_rate": 0.00029973627215367166,
"loss": 4.569579467773438,
"step": 10200
},
{
"epoch": 0.020992010747909504,
"grad_norm": 1.0875380039215088,
"learning_rate": 0.00029973339829549776,
"loss": 4.634755859375,
"step": 10250
},
{
"epoch": 0.02109441080033833,
"grad_norm": 1.043662190437317,
"learning_rate": 0.00029973050887768625,
"loss": 4.522914123535156,
"step": 10300
},
{
"epoch": 0.021196810852767155,
"grad_norm": 0.7864259481430054,
"learning_rate": 0.0002997276039005375,
"loss": 4.525141296386718,
"step": 10350
},
{
"epoch": 0.021299210905195983,
"grad_norm": 0.7917724251747131,
"learning_rate": 0.00029972468336435335,
"loss": 4.140654602050781,
"step": 10400
},
{
"epoch": 0.02140161095762481,
"grad_norm": 0.8878633975982666,
"learning_rate": 0.0002997217472694372,
"loss": 4.351778564453125,
"step": 10450
},
{
"epoch": 0.02150401101005364,
"grad_norm": 0.8213766813278198,
"learning_rate": 0.0002997187956160943,
"loss": 4.426820068359375,
"step": 10500
},
{
"epoch": 0.021606411062482463,
"grad_norm": 0.8385847210884094,
"learning_rate": 0.0002997158284046313,
"loss": 4.410148315429687,
"step": 10550
},
{
"epoch": 0.02170881111491129,
"grad_norm": 1.029899001121521,
"learning_rate": 0.0002997128456353565,
"loss": 4.456363830566406,
"step": 10600
},
{
"epoch": 0.021811211167340118,
"grad_norm": 0.8777541518211365,
"learning_rate": 0.0002997098473085799,
"loss": 4.56017578125,
"step": 10650
},
{
"epoch": 0.021913611219768946,
"grad_norm": 0.7988455891609192,
"learning_rate": 0.0002997068334246131,
"loss": 4.490418701171875,
"step": 10700
},
{
"epoch": 0.02201601127219777,
"grad_norm": 1.208889126777649,
"learning_rate": 0.00029970380398376917,
"loss": 4.553769836425781,
"step": 10750
},
{
"epoch": 0.022118411324626597,
"grad_norm": 1.0305062532424927,
"learning_rate": 0.0002997007589863631,
"loss": 4.295799865722656,
"step": 10800
},
{
"epoch": 0.022220811377055425,
"grad_norm": 0.8051795363426208,
"learning_rate": 0.00029969769843271116,
"loss": 4.561275329589844,
"step": 10850
},
{
"epoch": 0.022323211429484253,
"grad_norm": 0.8639199733734131,
"learning_rate": 0.00029969462232313154,
"loss": 4.470157165527343,
"step": 10900
},
{
"epoch": 0.02242561148191308,
"grad_norm": 0.7574597597122192,
"learning_rate": 0.00029969153065794374,
"loss": 4.476951599121094,
"step": 10950
},
{
"epoch": 0.022528011534341905,
"grad_norm": 0.787169337272644,
"learning_rate": 0.00029968842343746906,
"loss": 4.459609375,
"step": 11000
},
{
"epoch": 0.022630411586770732,
"grad_norm": 0.7591275572776794,
"learning_rate": 0.0002996853006620305,
"loss": 4.777853393554688,
"step": 11050
},
{
"epoch": 0.02273281163919956,
"grad_norm": 0.9186727404594421,
"learning_rate": 0.0002996821623319524,
"loss": 4.658177185058594,
"step": 11100
},
{
"epoch": 0.022835211691628388,
"grad_norm": 0.7670950293540955,
"learning_rate": 0.0002996790084475611,
"loss": 4.718930053710937,
"step": 11150
},
{
"epoch": 0.022937611744057212,
"grad_norm": 0.9697529077529907,
"learning_rate": 0.00029967583900918413,
"loss": 4.4181521606445315,
"step": 11200
},
{
"epoch": 0.02304001179648604,
"grad_norm": 1.807626724243164,
"learning_rate": 0.00029967265401715083,
"loss": 4.645519104003906,
"step": 11250
},
{
"epoch": 0.023142411848914867,
"grad_norm": 0.8117107152938843,
"learning_rate": 0.00029966945347179236,
"loss": 3.835715637207031,
"step": 11300
},
{
"epoch": 0.023244811901343695,
"grad_norm": 1.2005183696746826,
"learning_rate": 0.00029966623737344124,
"loss": 4.443558959960938,
"step": 11350
},
{
"epoch": 0.02334721195377252,
"grad_norm": 0.8746537566184998,
"learning_rate": 0.0002996630057224316,
"loss": 4.395650024414063,
"step": 11400
},
{
"epoch": 0.023449612006201347,
"grad_norm": 0.7293654680252075,
"learning_rate": 0.00029965975851909934,
"loss": 4.513606262207031,
"step": 11450
},
{
"epoch": 0.023552012058630174,
"grad_norm": 0.7779085636138916,
"learning_rate": 0.00029965649576378184,
"loss": 4.524747009277344,
"step": 11500
},
{
"epoch": 0.023654412111059002,
"grad_norm": 1.0146737098693848,
"learning_rate": 0.00029965321745681816,
"loss": 4.670032348632812,
"step": 11550
},
{
"epoch": 0.023756812163487826,
"grad_norm": 0.9226559400558472,
"learning_rate": 0.00029964992359854896,
"loss": 4.319842529296875,
"step": 11600
},
{
"epoch": 0.023859212215916654,
"grad_norm": 0.729659378528595,
"learning_rate": 0.0002996466141893166,
"loss": 4.3390591430664065,
"step": 11650
},
{
"epoch": 0.02396161226834548,
"grad_norm": 0.8851988315582275,
"learning_rate": 0.00029964328922946486,
"loss": 4.193225708007812,
"step": 11700
},
{
"epoch": 0.02406401232077431,
"grad_norm": 1.142880916595459,
"learning_rate": 0.0002996399487193393,
"loss": 4.7212896728515625,
"step": 11750
},
{
"epoch": 0.024166412373203137,
"grad_norm": 0.6688424944877625,
"learning_rate": 0.0002996365926592871,
"loss": 4.5033807373046875,
"step": 11800
},
{
"epoch": 0.02426881242563196,
"grad_norm": 0.89569491147995,
"learning_rate": 0.00029963322104965693,
"loss": 4.241100463867188,
"step": 11850
},
{
"epoch": 0.02437121247806079,
"grad_norm": 0.8132964372634888,
"learning_rate": 0.0002996298338907992,
"loss": 4.217136535644531,
"step": 11900
},
{
"epoch": 0.024473612530489616,
"grad_norm": 1.4552931785583496,
"learning_rate": 0.00029962643118306597,
"loss": 4.451352844238281,
"step": 11950
},
{
"epoch": 0.024576012582918444,
"grad_norm": 0.7032333612442017,
"learning_rate": 0.00029962301292681066,
"loss": 3.709466857910156,
"step": 12000
},
{
"epoch": 0.024678412635347268,
"grad_norm": 0.7736782431602478,
"learning_rate": 0.0002996195791223886,
"loss": 3.931116027832031,
"step": 12050
},
{
"epoch": 0.024780812687776096,
"grad_norm": 1.0214853286743164,
"learning_rate": 0.0002996161297701566,
"loss": 4.091096496582031,
"step": 12100
},
{
"epoch": 0.024883212740204923,
"grad_norm": 0.7319433093070984,
"learning_rate": 0.00029961266487047307,
"loss": 4.754253234863281,
"step": 12150
},
{
"epoch": 0.02498561279263375,
"grad_norm": 0.7848948240280151,
"learning_rate": 0.00029960918442369804,
"loss": 4.210378723144531,
"step": 12200
},
{
"epoch": 0.025088012845062575,
"grad_norm": 0.8420546650886536,
"learning_rate": 0.00029960568843019327,
"loss": 4.331927185058594,
"step": 12250
},
{
"epoch": 0.025190412897491403,
"grad_norm": 0.7843689322471619,
"learning_rate": 0.00029960217689032205,
"loss": 4.491570129394531,
"step": 12300
},
{
"epoch": 0.02529281294992023,
"grad_norm": 1.0013247728347778,
"learning_rate": 0.0002995986498044491,
"loss": 4.356235961914063,
"step": 12350
},
{
"epoch": 0.025395213002349058,
"grad_norm": 0.8285472989082336,
"learning_rate": 0.0002995951071729412,
"loss": 4.19695556640625,
"step": 12400
},
{
"epoch": 0.025497613054777882,
"grad_norm": 0.8935615420341492,
"learning_rate": 0.0002995915489961663,
"loss": 4.556292724609375,
"step": 12450
},
{
"epoch": 0.02560001310720671,
"grad_norm": 1.1061961650848389,
"learning_rate": 0.0002995879752744942,
"loss": 4.260919799804688,
"step": 12500
},
{
"epoch": 0.025702413159635538,
"grad_norm": 0.7796922922134399,
"learning_rate": 0.00029958438600829633,
"loss": 3.7681890869140626,
"step": 12550
},
{
"epoch": 0.025804813212064365,
"grad_norm": 0.9937464594841003,
"learning_rate": 0.0002995807811979456,
"loss": 4.396112670898438,
"step": 12600
},
{
"epoch": 0.025907213264493193,
"grad_norm": 0.9796547889709473,
"learning_rate": 0.0002995771608438166,
"loss": 4.378516540527344,
"step": 12650
},
{
"epoch": 0.026009613316922017,
"grad_norm": 0.9051157236099243,
"learning_rate": 0.00029957352494628563,
"loss": 4.480902404785156,
"step": 12700
},
{
"epoch": 0.026112013369350845,
"grad_norm": 0.706322968006134,
"learning_rate": 0.0002995698735057304,
"loss": 4.157791442871094,
"step": 12750
},
{
"epoch": 0.026214413421779673,
"grad_norm": 1.033637285232544,
"learning_rate": 0.0002995662065225304,
"loss": 4.6359164428710935,
"step": 12800
},
{
"epoch": 0.0263168134742085,
"grad_norm": 0.9319335222244263,
"learning_rate": 0.00029956252399706673,
"loss": 4.510284423828125,
"step": 12850
},
{
"epoch": 0.026419213526637324,
"grad_norm": 0.887332022190094,
"learning_rate": 0.000299558825929722,
"loss": 4.224294738769531,
"step": 12900
},
{
"epoch": 0.026521613579066152,
"grad_norm": 0.7545831203460693,
"learning_rate": 0.0002995551123208805,
"loss": 3.612664794921875,
"step": 12950
},
{
"epoch": 0.02662401363149498,
"grad_norm": 1.4527435302734375,
"learning_rate": 0.0002995513831709281,
"loss": 3.0348556518554686,
"step": 13000
},
{
"epoch": 0.026726413683923807,
"grad_norm": 0.826316237449646,
"learning_rate": 0.00029954763848025244,
"loss": 3.7530322265625,
"step": 13050
},
{
"epoch": 0.02682881373635263,
"grad_norm": 0.7737396955490112,
"learning_rate": 0.0002995438782492426,
"loss": 4.3491796875,
"step": 13100
},
{
"epoch": 0.02693121378878146,
"grad_norm": 0.7360561490058899,
"learning_rate": 0.0002995401024782892,
"loss": 4.23507568359375,
"step": 13150
},
{
"epoch": 0.027033613841210287,
"grad_norm": 1.048795223236084,
"learning_rate": 0.00029953631116778483,
"loss": 4.128821716308594,
"step": 13200
},
{
"epoch": 0.027136013893639115,
"grad_norm": 0.744465172290802,
"learning_rate": 0.00029953250431812326,
"loss": 4.229864501953125,
"step": 13250
},
{
"epoch": 0.02723841394606794,
"grad_norm": 1.0225343704223633,
"learning_rate": 0.0002995286819297002,
"loss": 4.329259033203125,
"step": 13300
},
{
"epoch": 0.027340813998496766,
"grad_norm": 0.8426514863967896,
"learning_rate": 0.0002995248440029128,
"loss": 4.405516662597656,
"step": 13350
},
{
"epoch": 0.027443214050925594,
"grad_norm": 0.8175310492515564,
"learning_rate": 0.00029952099053815996,
"loss": 4.2612826538085935,
"step": 13400
},
{
"epoch": 0.027545614103354422,
"grad_norm": 0.9133870601654053,
"learning_rate": 0.000299517121535842,
"loss": 4.32334228515625,
"step": 13450
},
{
"epoch": 0.02764801415578325,
"grad_norm": 0.9261609315872192,
"learning_rate": 0.00029951323699636107,
"loss": 4.267542114257813,
"step": 13500
},
{
"epoch": 0.027750414208212074,
"grad_norm": 0.964561402797699,
"learning_rate": 0.00029950933692012076,
"loss": 4.246123657226563,
"step": 13550
},
{
"epoch": 0.0278528142606409,
"grad_norm": 1.1370861530303955,
"learning_rate": 0.00029950542130752634,
"loss": 4.350406188964843,
"step": 13600
},
{
"epoch": 0.02795521431306973,
"grad_norm": 0.8274940848350525,
"learning_rate": 0.00029950149015898483,
"loss": 4.124059448242187,
"step": 13650
},
{
"epoch": 0.028057614365498557,
"grad_norm": 1.0486522912979126,
"learning_rate": 0.0002994975434749046,
"loss": 4.241673278808594,
"step": 13700
},
{
"epoch": 0.02816001441792738,
"grad_norm": 0.8022660613059998,
"learning_rate": 0.0002994935812556958,
"loss": 3.647921447753906,
"step": 13750
},
{
"epoch": 0.02826241447035621,
"grad_norm": 1.1589747667312622,
"learning_rate": 0.00029948960350177026,
"loss": 4.2052005004882815,
"step": 13800
},
{
"epoch": 0.028364814522785036,
"grad_norm": 0.6878979802131653,
"learning_rate": 0.0002994856102135412,
"loss": 4.0764639282226565,
"step": 13850
},
{
"epoch": 0.028467214575213864,
"grad_norm": 1.299386739730835,
"learning_rate": 0.0002994816013914236,
"loss": 3.8260293579101563,
"step": 13900
},
{
"epoch": 0.028569614627642688,
"grad_norm": 0.7897019982337952,
"learning_rate": 0.0002994775770358342,
"loss": 4.474502258300781,
"step": 13950
},
{
"epoch": 0.028672014680071516,
"grad_norm": 1.031049132347107,
"learning_rate": 0.000299473537147191,
"loss": 4.189122924804687,
"step": 14000
},
{
"epoch": 0.028774414732500343,
"grad_norm": 1.224804401397705,
"learning_rate": 0.0002994694817259139,
"loss": 4.223143615722656,
"step": 14050
},
{
"epoch": 0.02887681478492917,
"grad_norm": 0.8684813380241394,
"learning_rate": 0.00029946541077242433,
"loss": 4.23610107421875,
"step": 14100
},
{
"epoch": 0.028979214837357995,
"grad_norm": 0.6440140008926392,
"learning_rate": 0.0002994613242871453,
"loss": 3.841741638183594,
"step": 14150
},
{
"epoch": 0.029081614889786823,
"grad_norm": 0.49674278497695923,
"learning_rate": 0.0002994572222705014,
"loss": 2.3330259704589844,
"step": 14200
},
{
"epoch": 0.02918401494221565,
"grad_norm": 0.8202585577964783,
"learning_rate": 0.00029945310472291906,
"loss": 3.3214230346679687,
"step": 14250
},
{
"epoch": 0.029286414994644478,
"grad_norm": 0.9601882100105286,
"learning_rate": 0.00029944897164482597,
"loss": 4.437399291992188,
"step": 14300
},
{
"epoch": 0.029388815047073306,
"grad_norm": 0.8373337388038635,
"learning_rate": 0.00029944482303665175,
"loss": 4.476743469238281,
"step": 14350
},
{
"epoch": 0.02949121509950213,
"grad_norm": 0.7051481008529663,
"learning_rate": 0.0002994406588988274,
"loss": 4.182169189453125,
"step": 14400
},
{
"epoch": 0.029593615151930958,
"grad_norm": 1.0870895385742188,
"learning_rate": 0.00029943647923178575,
"loss": 4.3550872802734375,
"step": 14450
},
{
"epoch": 0.029696015204359785,
"grad_norm": 0.768278181552887,
"learning_rate": 0.00029943228403596107,
"loss": 4.4534228515625,
"step": 14500
},
{
"epoch": 0.029798415256788613,
"grad_norm": 0.8001137971878052,
"learning_rate": 0.00029942807331178933,
"loss": 4.255840148925781,
"step": 14550
},
{
"epoch": 0.029900815309217437,
"grad_norm": 0.779834508895874,
"learning_rate": 0.000299423847059708,
"loss": 4.268035888671875,
"step": 14600
},
{
"epoch": 0.030003215361646265,
"grad_norm": 0.7155870199203491,
"learning_rate": 0.00029941960528015644,
"loss": 3.9607696533203125,
"step": 14650
},
{
"epoch": 0.030105615414075092,
"grad_norm": 0.8414117693901062,
"learning_rate": 0.0002994153479735753,
"loss": 3.9243670654296876,
"step": 14700
},
{
"epoch": 0.03020801546650392,
"grad_norm": 1.1119954586029053,
"learning_rate": 0.00029941107514040694,
"loss": 4.47902099609375,
"step": 14750
},
{
"epoch": 0.030310415518932744,
"grad_norm": 1.0905205011367798,
"learning_rate": 0.00029940678678109546,
"loss": 4.391621398925781,
"step": 14800
},
{
"epoch": 0.030412815571361572,
"grad_norm": 0.9594521522521973,
"learning_rate": 0.00029940248289608655,
"loss": 4.156022644042968,
"step": 14850
},
{
"epoch": 0.0305152156237904,
"grad_norm": 0.830136775970459,
"learning_rate": 0.0002993981634858273,
"loss": 4.088116760253906,
"step": 14900
},
{
"epoch": 0.030617615676219227,
"grad_norm": 0.7149996161460876,
"learning_rate": 0.00029939382855076664,
"loss": 3.857545166015625,
"step": 14950
},
{
"epoch": 0.03072001572864805,
"grad_norm": 0.8593119978904724,
"learning_rate": 0.0002993894780913551,
"loss": 4.10996826171875,
"step": 15000
},
{
"epoch": 0.03082241578107688,
"grad_norm": 0.8296166658401489,
"learning_rate": 0.0002993851121080446,
"loss": 4.353337097167969,
"step": 15050
},
{
"epoch": 0.030924815833505707,
"grad_norm": 0.7708966732025146,
"learning_rate": 0.00029938073060128896,
"loss": 4.261842651367187,
"step": 15100
},
{
"epoch": 0.031027215885934534,
"grad_norm": 0.6590582132339478,
"learning_rate": 0.00029937633357154345,
"loss": 3.885545349121094,
"step": 15150
},
{
"epoch": 0.031129615938363362,
"grad_norm": 1.0012860298156738,
"learning_rate": 0.000299371921019265,
"loss": 4.263138427734375,
"step": 15200
},
{
"epoch": 0.031232015990792186,
"grad_norm": 0.9702419638633728,
"learning_rate": 0.00029936749294491214,
"loss": 4.242536926269532,
"step": 15250
},
{
"epoch": 0.031334416043221014,
"grad_norm": 1.740096092224121,
"learning_rate": 0.000299363049348945,
"loss": 3.794849853515625,
"step": 15300
},
{
"epoch": 0.03143681609564984,
"grad_norm": 0.8641059994697571,
"learning_rate": 0.0002993585902318254,
"loss": 3.7392898559570313,
"step": 15350
},
{
"epoch": 0.03153921614807867,
"grad_norm": 0.7307964563369751,
"learning_rate": 0.0002993541155940166,
"loss": 4.284304809570313,
"step": 15400
},
{
"epoch": 0.0316416162005075,
"grad_norm": 0.8395029902458191,
"learning_rate": 0.0002993496254359837,
"loss": 3.666776428222656,
"step": 15450
},
{
"epoch": 0.031744016252936325,
"grad_norm": 0.88369220495224,
"learning_rate": 0.00029934511975819323,
"loss": 4.232069396972657,
"step": 15500
},
{
"epoch": 0.031846416305365145,
"grad_norm": 0.900976836681366,
"learning_rate": 0.00029934059856111337,
"loss": 4.181927490234375,
"step": 15550
},
{
"epoch": 0.03194881635779397,
"grad_norm": 0.8746826648712158,
"learning_rate": 0.00029933606184521404,
"loss": 4.177504577636719,
"step": 15600
},
{
"epoch": 0.0320512164102228,
"grad_norm": 0.9220513105392456,
"learning_rate": 0.0002993315096109666,
"loss": 4.219546813964843,
"step": 15650
},
{
"epoch": 0.03215361646265163,
"grad_norm": 0.9001684784889221,
"learning_rate": 0.00029932694185884416,
"loss": 4.161190490722657,
"step": 15700
},
{
"epoch": 0.032256016515080456,
"grad_norm": 1.1615084409713745,
"learning_rate": 0.0002993223585893213,
"loss": 4.272937316894531,
"step": 15750
},
{
"epoch": 0.032358416567509284,
"grad_norm": 0.9227635860443115,
"learning_rate": 0.0002993177598028743,
"loss": 4.0247500610351565,
"step": 15800
},
{
"epoch": 0.03246081661993811,
"grad_norm": 0.9501990675926208,
"learning_rate": 0.0002993131454999812,
"loss": 3.6547119140625,
"step": 15850
},
{
"epoch": 0.03256321667236694,
"grad_norm": 0.8894864320755005,
"learning_rate": 0.0002993085156811213,
"loss": 4.238618469238281,
"step": 15900
},
{
"epoch": 0.03266561672479577,
"grad_norm": 1.1804680824279785,
"learning_rate": 0.0002993038703467758,
"loss": 4.274075317382812,
"step": 15950
},
{
"epoch": 0.03276801677722459,
"grad_norm": 0.9597388505935669,
"learning_rate": 0.00029929920949742743,
"loss": 2.690977783203125,
"step": 16000
},
{
"epoch": 0.032870416829653415,
"grad_norm": 0.8713410496711731,
"learning_rate": 0.0002992945331335605,
"loss": 4.216771850585937,
"step": 16050
},
{
"epoch": 0.03297281688208224,
"grad_norm": 0.7275038361549377,
"learning_rate": 0.000299289841255661,
"loss": 3.7311639404296875,
"step": 16100
},
{
"epoch": 0.03307521693451107,
"grad_norm": 0.987648069858551,
"learning_rate": 0.0002992851338642164,
"loss": 4.2468328857421875,
"step": 16150
},
{
"epoch": 0.0331776169869399,
"grad_norm": 0.8776699900627136,
"learning_rate": 0.00029928041095971593,
"loss": 4.107083435058594,
"step": 16200
},
{
"epoch": 0.033280017039368726,
"grad_norm": 1.0074553489685059,
"learning_rate": 0.00029927567254265037,
"loss": 4.185172119140625,
"step": 16250
},
{
"epoch": 0.03338241709179755,
"grad_norm": 0.8109734058380127,
"learning_rate": 0.00029927091861351216,
"loss": 4.268891296386719,
"step": 16300
},
{
"epoch": 0.03348481714422638,
"grad_norm": 1.1346583366394043,
"learning_rate": 0.00029926614917279523,
"loss": 4.282049865722656,
"step": 16350
},
{
"epoch": 0.0335872171966552,
"grad_norm": 0.8583949208259583,
"learning_rate": 0.0002992613642209952,
"loss": 4.328241577148438,
"step": 16400
},
{
"epoch": 0.03368961724908403,
"grad_norm": 0.8398747444152832,
"learning_rate": 0.0002992565637586094,
"loss": 4.186492004394531,
"step": 16450
},
{
"epoch": 0.03379201730151286,
"grad_norm": 0.643873929977417,
"learning_rate": 0.0002992517477861366,
"loss": 3.162231140136719,
"step": 16500
},
{
"epoch": 0.033894417353941685,
"grad_norm": 0.9688578248023987,
"learning_rate": 0.00029924691630407724,
"loss": 4.235280151367188,
"step": 16550
},
{
"epoch": 0.03399681740637051,
"grad_norm": 0.8266287446022034,
"learning_rate": 0.0002992420693129334,
"loss": 4.479638977050781,
"step": 16600
},
{
"epoch": 0.03409921745879934,
"grad_norm": 0.8200719356536865,
"learning_rate": 0.0002992372068132088,
"loss": 4.379118957519531,
"step": 16650
},
{
"epoch": 0.03420161751122817,
"grad_norm": 0.9193712472915649,
"learning_rate": 0.00029923232880540865,
"loss": 4.209988708496094,
"step": 16700
},
{
"epoch": 0.034304017563656995,
"grad_norm": 0.9132387638092041,
"learning_rate": 0.0002992274352900399,
"loss": 4.341851501464844,
"step": 16750
},
{
"epoch": 0.03440641761608582,
"grad_norm": 1.0033169984817505,
"learning_rate": 0.0002992225262676111,
"loss": 4.356620483398437,
"step": 16800
},
{
"epoch": 0.034508817668514644,
"grad_norm": 1.109008550643921,
"learning_rate": 0.0002992176017386323,
"loss": 4.189815368652344,
"step": 16850
},
{
"epoch": 0.03461121772094347,
"grad_norm": 1.2428394556045532,
"learning_rate": 0.00029921266170361533,
"loss": 4.286259460449219,
"step": 16900
},
{
"epoch": 0.0347136177733723,
"grad_norm": 0.9120133519172668,
"learning_rate": 0.0002992077061630734,
"loss": 4.392665405273437,
"step": 16950
},
{
"epoch": 0.03481601782580113,
"grad_norm": 1.2237519025802612,
"learning_rate": 0.0002992027351175216,
"loss": 4.461217041015625,
"step": 17000
},
{
"epoch": 0.034918417878229954,
"grad_norm": 0.9254854917526245,
"learning_rate": 0.00029919774856747636,
"loss": 4.2495333862304685,
"step": 17050
},
{
"epoch": 0.03502081793065878,
"grad_norm": 1.204923391342163,
"learning_rate": 0.000299192746513456,
"loss": 4.237576293945312,
"step": 17100
},
{
"epoch": 0.03512321798308761,
"grad_norm": 0.8846333026885986,
"learning_rate": 0.0002991877289559803,
"loss": 3.958520812988281,
"step": 17150
},
{
"epoch": 0.03522561803551644,
"grad_norm": 0.8742989897727966,
"learning_rate": 0.00029918269589557055,
"loss": 4.097115173339843,
"step": 17200
},
{
"epoch": 0.03532801808794526,
"grad_norm": 0.9790547490119934,
"learning_rate": 0.0002991776473327499,
"loss": 4.068385314941406,
"step": 17250
},
{
"epoch": 0.035430418140374086,
"grad_norm": 0.8808755278587341,
"learning_rate": 0.0002991725832680428,
"loss": 4.071025390625,
"step": 17300
},
{
"epoch": 0.03553281819280291,
"grad_norm": 0.9796196818351746,
"learning_rate": 0.00029916750370197567,
"loss": 3.7829425048828127,
"step": 17350
},
{
"epoch": 0.03563521824523174,
"grad_norm": 0.9726704955101013,
"learning_rate": 0.00029916240863507625,
"loss": 4.105780334472656,
"step": 17400
},
{
"epoch": 0.03573761829766057,
"grad_norm": 1.0631580352783203,
"learning_rate": 0.000299157298067874,
"loss": 4.146686401367187,
"step": 17450
},
{
"epoch": 0.035840018350089396,
"grad_norm": 0.8494559526443481,
"learning_rate": 0.0002991521720009001,
"loss": 4.303363342285156,
"step": 17500
},
{
"epoch": 0.035942418402518224,
"grad_norm": 1.3400248289108276,
"learning_rate": 0.00029914703043468704,
"loss": 4.124955749511718,
"step": 17550
},
{
"epoch": 0.03604481845494705,
"grad_norm": 1.2535253763198853,
"learning_rate": 0.00029914187336976925,
"loss": 3.625634765625,
"step": 17600
},
{
"epoch": 0.03614721850737588,
"grad_norm": 0.9625725746154785,
"learning_rate": 0.0002991367008066826,
"loss": 4.224259948730468,
"step": 17650
},
{
"epoch": 0.0362496185598047,
"grad_norm": 0.9419931769371033,
"learning_rate": 0.00029913151274596456,
"loss": 4.3089794921875,
"step": 17700
},
{
"epoch": 0.03635201861223353,
"grad_norm": 1.2326748371124268,
"learning_rate": 0.0002991263091881543,
"loss": 4.07185791015625,
"step": 17750
},
{
"epoch": 0.036454418664662355,
"grad_norm": 0.9051257967948914,
"learning_rate": 0.00029912109013379253,
"loss": 4.346282958984375,
"step": 17800
},
{
"epoch": 0.03655681871709118,
"grad_norm": 0.8675338625907898,
"learning_rate": 0.0002991158555834216,
"loss": 4.14196044921875,
"step": 17850
},
{
"epoch": 0.03665921876952001,
"grad_norm": 1.7800242900848389,
"learning_rate": 0.0002991106055375854,
"loss": 4.262186279296875,
"step": 17900
},
{
"epoch": 0.03676161882194884,
"grad_norm": 0.8730024099349976,
"learning_rate": 0.0002991053399968296,
"loss": 3.647480163574219,
"step": 17950
},
{
"epoch": 0.036864018874377666,
"grad_norm": 0.8715499639511108,
"learning_rate": 0.0002991000589617013,
"loss": 3.8062033081054687,
"step": 18000
},
{
"epoch": 0.036966418926806494,
"grad_norm": 1.1045186519622803,
"learning_rate": 0.0002990947624327493,
"loss": 3.142933349609375,
"step": 18050
},
{
"epoch": 0.037068818979235314,
"grad_norm": 0.9436252117156982,
"learning_rate": 0.000299089450410524,
"loss": 3.2230126953125,
"step": 18100
},
{
"epoch": 0.03717121903166414,
"grad_norm": 0.7957382798194885,
"learning_rate": 0.00029908412289557737,
"loss": 4.389481811523438,
"step": 18150
},
{
"epoch": 0.03727361908409297,
"grad_norm": 1.0775970220565796,
"learning_rate": 0.0002990787798884631,
"loss": 3.8384576416015626,
"step": 18200
},
{
"epoch": 0.0373760191365218,
"grad_norm": 0.9266685843467712,
"learning_rate": 0.00029907342138973627,
"loss": 4.209334106445312,
"step": 18250
},
{
"epoch": 0.037478419188950625,
"grad_norm": 0.9169478416442871,
"learning_rate": 0.00029906804739995385,
"loss": 4.067582092285156,
"step": 18300
},
{
"epoch": 0.03758081924137945,
"grad_norm": 0.8588764071464539,
"learning_rate": 0.0002990626579196742,
"loss": 4.140736694335938,
"step": 18350
},
{
"epoch": 0.03768321929380828,
"grad_norm": 1.0396775007247925,
"learning_rate": 0.0002990572529494574,
"loss": 4.312765502929688,
"step": 18400
},
{
"epoch": 0.03778561934623711,
"grad_norm": 1.0524662733078003,
"learning_rate": 0.0002990518324898652,
"loss": 3.2222711181640626,
"step": 18450
},
{
"epoch": 0.037888019398665936,
"grad_norm": 0.8703554272651672,
"learning_rate": 0.00029904639654146066,
"loss": 4.180811462402343,
"step": 18500
},
{
"epoch": 0.037990419451094756,
"grad_norm": 2.620311737060547,
"learning_rate": 0.00029904094510480885,
"loss": 4.130848388671875,
"step": 18550
},
{
"epoch": 0.038092819503523584,
"grad_norm": 0.8157406449317932,
"learning_rate": 0.0002990354781804762,
"loss": 3.6872372436523437,
"step": 18600
},
{
"epoch": 0.03819521955595241,
"grad_norm": 0.8512464165687561,
"learning_rate": 0.0002990299957690308,
"loss": 4.433642883300781,
"step": 18650
},
{
"epoch": 0.03829761960838124,
"grad_norm": 0.9459244012832642,
"learning_rate": 0.0002990244978710423,
"loss": 4.312282104492187,
"step": 18700
},
{
"epoch": 0.03840001966081007,
"grad_norm": 0.8191068768501282,
"learning_rate": 0.0002990189844870821,
"loss": 4.3835546875,
"step": 18750
},
{
"epoch": 0.038502419713238895,
"grad_norm": 0.9797852039337158,
"learning_rate": 0.0002990134556177231,
"loss": 4.277929077148437,
"step": 18800
},
{
"epoch": 0.03860481976566772,
"grad_norm": 0.957114040851593,
"learning_rate": 0.00029900791126353984,
"loss": 4.525142822265625,
"step": 18850
},
{
"epoch": 0.03870721981809655,
"grad_norm": 0.9237158894538879,
"learning_rate": 0.0002990023514251085,
"loss": 3.7692413330078125,
"step": 18900
},
{
"epoch": 0.03880961987052537,
"grad_norm": 1.055321455001831,
"learning_rate": 0.0002989967761030067,
"loss": 4.0058810424804685,
"step": 18950
},
{
"epoch": 0.0389120199229542,
"grad_norm": 0.9850941896438599,
"learning_rate": 0.000298991185297814,
"loss": 3.8927227783203127,
"step": 19000
},
{
"epoch": 0.039014419975383026,
"grad_norm": 0.8424584269523621,
"learning_rate": 0.0002989855790101112,
"loss": 4.3304986572265625,
"step": 19050
},
{
"epoch": 0.039116820027811854,
"grad_norm": 0.8309029936790466,
"learning_rate": 0.00029897995724048105,
"loss": 4.19474609375,
"step": 19100
},
{
"epoch": 0.03921922008024068,
"grad_norm": 0.8734010457992554,
"learning_rate": 0.00029897431998950763,
"loss": 4.056589965820312,
"step": 19150
},
{
"epoch": 0.03932162013266951,
"grad_norm": 1.723552942276001,
"learning_rate": 0.0002989686672577767,
"loss": 4.061507568359375,
"step": 19200
},
{
"epoch": 0.03942402018509834,
"grad_norm": 1.0202237367630005,
"learning_rate": 0.0002989629990458757,
"loss": 3.8971566772460937,
"step": 19250
},
{
"epoch": 0.039526420237527164,
"grad_norm": 1.2921315431594849,
"learning_rate": 0.00029895731535439367,
"loss": 3.0908432006835938,
"step": 19300
},
{
"epoch": 0.03962882028995599,
"grad_norm": 1.0007706880569458,
"learning_rate": 0.00029895161618392126,
"loss": 3.4613546752929687,
"step": 19350
},
{
"epoch": 0.03973122034238481,
"grad_norm": 1.0438216924667358,
"learning_rate": 0.00029894590153505066,
"loss": 3.344393615722656,
"step": 19400
},
{
"epoch": 0.03983362039481364,
"grad_norm": 1.0282576084136963,
"learning_rate": 0.0002989401714083757,
"loss": 3.807875671386719,
"step": 19450
},
{
"epoch": 0.03993602044724247,
"grad_norm": 1.20839262008667,
"learning_rate": 0.00029893442580449187,
"loss": 4.143163452148437,
"step": 19500
},
{
"epoch": 0.040038420499671296,
"grad_norm": 1.1626482009887695,
"learning_rate": 0.0002989286647239962,
"loss": 4.075806884765625,
"step": 19550
},
{
"epoch": 0.04014082055210012,
"grad_norm": 0.7632113695144653,
"learning_rate": 0.0002989228881674874,
"loss": 4.186883239746094,
"step": 19600
},
{
"epoch": 0.04024322060452895,
"grad_norm": 0.8571646213531494,
"learning_rate": 0.00029891709613556565,
"loss": 3.8722219848632813,
"step": 19650
},
{
"epoch": 0.04034562065695778,
"grad_norm": 1.4133912324905396,
"learning_rate": 0.0002989112886288329,
"loss": 3.877001953125,
"step": 19700
},
{
"epoch": 0.040448020709386606,
"grad_norm": 0.9766141176223755,
"learning_rate": 0.0002989054656478927,
"loss": 3.9540411376953126,
"step": 19750
},
{
"epoch": 0.04055042076181543,
"grad_norm": 0.8429685235023499,
"learning_rate": 0.00029889962719335003,
"loss": 4.412438049316406,
"step": 19800
},
{
"epoch": 0.040652820814244255,
"grad_norm": 0.7656176686286926,
"learning_rate": 0.0002988937732658116,
"loss": 4.269136657714844,
"step": 19850
},
{
"epoch": 0.04075522086667308,
"grad_norm": 1.1075332164764404,
"learning_rate": 0.0002988879038658859,
"loss": 4.419913330078125,
"step": 19900
},
{
"epoch": 0.04085762091910191,
"grad_norm": 0.8199209570884705,
"learning_rate": 0.0002988820189941826,
"loss": 4.36384765625,
"step": 19950
},
{
"epoch": 0.04096002097153074,
"grad_norm": 0.8144904375076294,
"learning_rate": 0.00029887611865131344,
"loss": 4.030648803710937,
"step": 20000
},
{
"epoch": 0.041062421023959565,
"grad_norm": 0.9372329711914062,
"learning_rate": 0.00029887020283789147,
"loss": 4.1174404907226565,
"step": 20050
},
{
"epoch": 0.04116482107638839,
"grad_norm": 0.8546763062477112,
"learning_rate": 0.0002988642715545314,
"loss": 4.441152648925781,
"step": 20100
},
{
"epoch": 0.04126722112881722,
"grad_norm": 1.333139181137085,
"learning_rate": 0.00029885832480184963,
"loss": 4.200628356933594,
"step": 20150
},
{
"epoch": 0.04136962118124605,
"grad_norm": 1.320517659187317,
"learning_rate": 0.0002988523625804641,
"loss": 3.89320068359375,
"step": 20200
},
{
"epoch": 0.04147202123367487,
"grad_norm": 0.9039347171783447,
"learning_rate": 0.0002988463848909944,
"loss": 3.9010406494140626,
"step": 20250
},
{
"epoch": 0.0415744212861037,
"grad_norm": 0.9151229858398438,
"learning_rate": 0.00029884039173406167,
"loss": 3.6283367919921874,
"step": 20300
},
{
"epoch": 0.041676821338532524,
"grad_norm": 0.8544915318489075,
"learning_rate": 0.00029883438311028876,
"loss": 4.021604919433594,
"step": 20350
},
{
"epoch": 0.04177922139096135,
"grad_norm": 1.2115877866744995,
"learning_rate": 0.0002988283590203,
"loss": 4.037056579589843,
"step": 20400
},
{
"epoch": 0.04188162144339018,
"grad_norm": 0.8434769511222839,
"learning_rate": 0.0002988223194647214,
"loss": 4.190481262207031,
"step": 20450
},
{
"epoch": 0.04198402149581901,
"grad_norm": 1.0086390972137451,
"learning_rate": 0.00029881626444418056,
"loss": 3.7280892944335937,
"step": 20500
},
{
"epoch": 0.042086421548247835,
"grad_norm": 1.0009269714355469,
"learning_rate": 0.0002988101939593067,
"loss": 4.065418090820312,
"step": 20550
},
{
"epoch": 0.04218882160067666,
"grad_norm": 0.7844799160957336,
"learning_rate": 0.0002988041080107307,
"loss": 3.97632080078125,
"step": 20600
},
{
"epoch": 0.04229122165310548,
"grad_norm": 0.9640885591506958,
"learning_rate": 0.00029879800659908485,
"loss": 4.065289916992188,
"step": 20650
},
{
"epoch": 0.04239362170553431,
"grad_norm": 0.8006758093833923,
"learning_rate": 0.0002987918897250033,
"loss": 4.137116088867187,
"step": 20700
},
{
"epoch": 0.04249602175796314,
"grad_norm": 0.624839186668396,
"learning_rate": 0.00029878575738912156,
"loss": 2.075597839355469,
"step": 20750
},
{
"epoch": 0.042598421810391966,
"grad_norm": 0.8152270317077637,
"learning_rate": 0.00029877960959207706,
"loss": 3.2935858154296875,
"step": 20800
},
{
"epoch": 0.042700821862820794,
"grad_norm": 0.9872801303863525,
"learning_rate": 0.0002987734463345085,
"loss": 3.3229608154296875,
"step": 20850
},
{
"epoch": 0.04280322191524962,
"grad_norm": 0.6640042066574097,
"learning_rate": 0.00029876726761705636,
"loss": 2.9013262939453126,
"step": 20900
},
{
"epoch": 0.04290562196767845,
"grad_norm": 0.6145225167274475,
"learning_rate": 0.00029876107344036277,
"loss": 2.4409584045410155,
"step": 20950
},
{
"epoch": 0.04300802202010728,
"grad_norm": 1.0556402206420898,
"learning_rate": 0.0002987548638050714,
"loss": 2.4114979553222655,
"step": 21000
},
{
"epoch": 0.043110422072536105,
"grad_norm": 0.9862767457962036,
"learning_rate": 0.00029874863871182745,
"loss": 3.802875671386719,
"step": 21050
},
{
"epoch": 0.043212822124964925,
"grad_norm": 0.852150559425354,
"learning_rate": 0.0002987423981612778,
"loss": 3.66058349609375,
"step": 21100
},
{
"epoch": 0.04331522217739375,
"grad_norm": 0.8836477398872375,
"learning_rate": 0.0002987361421540711,
"loss": 3.4694943237304687,
"step": 21150
},
{
"epoch": 0.04341762222982258,
"grad_norm": 1.5402307510375977,
"learning_rate": 0.00029872987069085727,
"loss": 3.277726135253906,
"step": 21200
},
{
"epoch": 0.04352002228225141,
"grad_norm": 0.9419423341751099,
"learning_rate": 0.0002987235837722881,
"loss": 3.5211444091796875,
"step": 21250
},
{
"epoch": 0.043622422334680236,
"grad_norm": 0.7486373782157898,
"learning_rate": 0.0002987172813990169,
"loss": 3.471663818359375,
"step": 21300
},
{
"epoch": 0.043724822387109064,
"grad_norm": 0.7535277605056763,
"learning_rate": 0.0002987109635716985,
"loss": 3.376907958984375,
"step": 21350
},
{
"epoch": 0.04382722243953789,
"grad_norm": 0.8332289457321167,
"learning_rate": 0.0002987046302909895,
"loss": 3.9842266845703125,
"step": 21400
},
{
"epoch": 0.04392962249196672,
"grad_norm": 1.322947382926941,
"learning_rate": 0.000298698281557548,
"loss": 3.1945089721679687,
"step": 21450
},
{
"epoch": 0.04403202254439554,
"grad_norm": 1.0296247005462646,
"learning_rate": 0.00029869191737203377,
"loss": 3.6288201904296873,
"step": 21500
},
{
"epoch": 0.04413442259682437,
"grad_norm": 0.9314439296722412,
"learning_rate": 0.0002986855377351081,
"loss": 3.4926687622070314,
"step": 21550
},
{
"epoch": 0.044236822649253195,
"grad_norm": 0.7597600221633911,
"learning_rate": 0.000298679142647434,
"loss": 2.996235046386719,
"step": 21600
},
{
"epoch": 0.04433922270168202,
"grad_norm": 1.4043519496917725,
"learning_rate": 0.00029867273210967593,
"loss": 3.252802429199219,
"step": 21650
},
{
"epoch": 0.04444162275411085,
"grad_norm": 3.3350236415863037,
"learning_rate": 0.00029866630612250013,
"loss": 3.2056927490234375,
"step": 21700
},
{
"epoch": 0.04454402280653968,
"grad_norm": 0.8740987777709961,
"learning_rate": 0.0002986598646865743,
"loss": 3.5895626831054686,
"step": 21750
},
{
"epoch": 0.044646422858968506,
"grad_norm": 1.1191177368164062,
"learning_rate": 0.00029865340780256777,
"loss": 3.456165466308594,
"step": 21800
},
{
"epoch": 0.04474882291139733,
"grad_norm": 0.8428330421447754,
"learning_rate": 0.0002986469354711516,
"loss": 3.3481961059570313,
"step": 21850
},
{
"epoch": 0.04485122296382616,
"grad_norm": 0.9282798767089844,
"learning_rate": 0.0002986404476929984,
"loss": 3.2974124145507813,
"step": 21900
},
{
"epoch": 0.04495362301625498,
"grad_norm": 1.1790461540222168,
"learning_rate": 0.00029863394446878223,
"loss": 2.619112854003906,
"step": 21950
},
{
"epoch": 0.04505602306868381,
"grad_norm": 0.905838131904602,
"learning_rate": 0.00029862742579917894,
"loss": 3.3288262939453124,
"step": 22000
},
{
"epoch": 0.04515842312111264,
"grad_norm": 0.7021234631538391,
"learning_rate": 0.00029862089168486596,
"loss": 3.40490234375,
"step": 22050
},
{
"epoch": 0.045260823173541465,
"grad_norm": 0.8678475618362427,
"learning_rate": 0.00029861434212652215,
"loss": 3.6314691162109374,
"step": 22100
},
{
"epoch": 0.04536322322597029,
"grad_norm": 0.9551572203636169,
"learning_rate": 0.00029860777712482824,
"loss": 3.654752197265625,
"step": 22150
},
{
"epoch": 0.04546562327839912,
"grad_norm": 1.1007713079452515,
"learning_rate": 0.00029860119668046636,
"loss": 3.439637451171875,
"step": 22200
},
{
"epoch": 0.04556802333082795,
"grad_norm": 0.8319056034088135,
"learning_rate": 0.0002985946007941204,
"loss": 3.5101995849609375,
"step": 22250
},
{
"epoch": 0.045670423383256775,
"grad_norm": 1.040257215499878,
"learning_rate": 0.0002985879894664757,
"loss": 3.7279443359375,
"step": 22300
},
{
"epoch": 0.045772823435685596,
"grad_norm": 1.222548246383667,
"learning_rate": 0.00029858136269821935,
"loss": 3.6467132568359375,
"step": 22350
},
{
"epoch": 0.045875223488114424,
"grad_norm": 0.7653852701187134,
"learning_rate": 0.00029857472049003993,
"loss": 3.789747619628906,
"step": 22400
},
{
"epoch": 0.04597762354054325,
"grad_norm": 1.0074176788330078,
"learning_rate": 0.00029856806284262767,
"loss": 3.3356439208984376,
"step": 22450
},
{
"epoch": 0.04608002359297208,
"grad_norm": 0.9829652309417725,
"learning_rate": 0.0002985613897566744,
"loss": 2.86457763671875,
"step": 22500
},
{
"epoch": 0.04618242364540091,
"grad_norm": 1.0552867650985718,
"learning_rate": 0.0002985547012328736,
"loss": 3.389576110839844,
"step": 22550
},
{
"epoch": 0.046284823697829734,
"grad_norm": 0.7977453470230103,
"learning_rate": 0.00029854799727192024,
"loss": 3.094827880859375,
"step": 22600
},
{
"epoch": 0.04638722375025856,
"grad_norm": 1.0439661741256714,
"learning_rate": 0.00029854127787451104,
"loss": 3.353898620605469,
"step": 22650
},
{
"epoch": 0.04648962380268739,
"grad_norm": 0.8338518738746643,
"learning_rate": 0.0002985345430413442,
"loss": 3.2231854248046874,
"step": 22700
},
{
"epoch": 0.04659202385511622,
"grad_norm": 1.1333472728729248,
"learning_rate": 0.0002985277927731196,
"loss": 3.30358642578125,
"step": 22750
},
{
"epoch": 0.04669442390754504,
"grad_norm": 0.8333401679992676,
"learning_rate": 0.0002985210270705387,
"loss": 3.2313726806640624,
"step": 22800
},
{
"epoch": 0.046796823959973866,
"grad_norm": 0.926623523235321,
"learning_rate": 0.0002985142459343045,
"loss": 3.3423468017578126,
"step": 22850
},
{
"epoch": 0.04689922401240269,
"grad_norm": 0.7728790640830994,
"learning_rate": 0.00029850744936512177,
"loss": 3.470130615234375,
"step": 22900
},
{
"epoch": 0.04700162406483152,
"grad_norm": 1.0513544082641602,
"learning_rate": 0.0002985006373636967,
"loss": 4.155077514648437,
"step": 22950
},
{
"epoch": 0.04710402411726035,
"grad_norm": 0.8886310458183289,
"learning_rate": 0.00029849380993073716,
"loss": 4.144877319335937,
"step": 23000
},
{
"epoch": 0.047206424169689176,
"grad_norm": 0.615044116973877,
"learning_rate": 0.0002984869670669527,
"loss": 4.217498779296875,
"step": 23050
},
{
"epoch": 0.047308824222118004,
"grad_norm": 1.0154633522033691,
"learning_rate": 0.00029848010877305437,
"loss": 3.5084097290039065,
"step": 23100
},
{
"epoch": 0.04741122427454683,
"grad_norm": 1.1519191265106201,
"learning_rate": 0.0002984732350497548,
"loss": 4.138232727050781,
"step": 23150
},
{
"epoch": 0.04751362432697565,
"grad_norm": 1.1761195659637451,
"learning_rate": 0.0002984663458977683,
"loss": 4.233868713378906,
"step": 23200
},
{
"epoch": 0.04761602437940448,
"grad_norm": 1.0882890224456787,
"learning_rate": 0.00029845944131781085,
"loss": 3.8094412231445314,
"step": 23250
},
{
"epoch": 0.04771842443183331,
"grad_norm": 1.145857810974121,
"learning_rate": 0.0002984525213105998,
"loss": 4.4981906127929685,
"step": 23300
},
{
"epoch": 0.047820824484262135,
"grad_norm": 1.0446664094924927,
"learning_rate": 0.0002984455858768544,
"loss": 3.7824630737304688,
"step": 23350
},
{
"epoch": 0.04792322453669096,
"grad_norm": 0.9234415292739868,
"learning_rate": 0.0002984386350172952,
"loss": 4.244895629882812,
"step": 23400
},
{
"epoch": 0.04802562458911979,
"grad_norm": 0.8664620518684387,
"learning_rate": 0.0002984316687326446,
"loss": 4.05336181640625,
"step": 23450
},
{
"epoch": 0.04812802464154862,
"grad_norm": 1.1607353687286377,
"learning_rate": 0.0002984246870236265,
"loss": 3.920790710449219,
"step": 23500
},
{
"epoch": 0.048230424693977446,
"grad_norm": 1.0881608724594116,
"learning_rate": 0.00029841768989096633,
"loss": 4.012793273925781,
"step": 23550
},
{
"epoch": 0.048332824746406274,
"grad_norm": 1.136512041091919,
"learning_rate": 0.0002984106773353913,
"loss": 3.7952926635742186,
"step": 23600
},
{
"epoch": 0.048435224798835094,
"grad_norm": 0.9657559990882874,
"learning_rate": 0.0002984036493576301,
"loss": 3.48884033203125,
"step": 23650
},
{
"epoch": 0.04853762485126392,
"grad_norm": 0.8505204319953918,
"learning_rate": 0.000298396605958413,
"loss": 3.842665710449219,
"step": 23700
},
{
"epoch": 0.04864002490369275,
"grad_norm": 0.9779611825942993,
"learning_rate": 0.00029838954713847193,
"loss": 3.847880859375,
"step": 23750
},
{
"epoch": 0.04874242495612158,
"grad_norm": 1.0220547914505005,
"learning_rate": 0.0002983824728985404,
"loss": 4.149264831542968,
"step": 23800
},
{
"epoch": 0.048844825008550405,
"grad_norm": 1.3035789728164673,
"learning_rate": 0.00029837538323935364,
"loss": 4.045937194824218,
"step": 23850
},
{
"epoch": 0.04894722506097923,
"grad_norm": 1.0806480646133423,
"learning_rate": 0.00029836827816164826,
"loss": 3.93858154296875,
"step": 23900
},
{
"epoch": 0.04904962511340806,
"grad_norm": 1.0183125734329224,
"learning_rate": 0.0002983611576661626,
"loss": 3.665546875,
"step": 23950
},
{
"epoch": 0.04915202516583689,
"grad_norm": 1.1539430618286133,
"learning_rate": 0.0002983540217536367,
"loss": 4.074727783203125,
"step": 24000
},
{
"epoch": 0.04925442521826571,
"grad_norm": 1.0822535753250122,
"learning_rate": 0.00029834687042481193,
"loss": 4.032168579101563,
"step": 24050
},
{
"epoch": 0.049356825270694536,
"grad_norm": 1.0588322877883911,
"learning_rate": 0.00029833970368043153,
"loss": 4.178402404785157,
"step": 24100
},
{
"epoch": 0.049459225323123364,
"grad_norm": 0.7627548575401306,
"learning_rate": 0.0002983325215212402,
"loss": 4.084798889160156,
"step": 24150
},
{
"epoch": 0.04956162537555219,
"grad_norm": 1.185702919960022,
"learning_rate": 0.0002983253239479843,
"loss": 4.136662292480469,
"step": 24200
},
{
"epoch": 0.04966402542798102,
"grad_norm": 2.4309804439544678,
"learning_rate": 0.0002983181109614118,
"loss": 4.230069885253906,
"step": 24250
},
{
"epoch": 0.04976642548040985,
"grad_norm": 1.0039188861846924,
"learning_rate": 0.00029831088256227216,
"loss": 3.9972125244140626,
"step": 24300
},
{
"epoch": 0.049868825532838675,
"grad_norm": 0.9414103627204895,
"learning_rate": 0.0002983036387513166,
"loss": 4.060273742675781,
"step": 24350
},
{
"epoch": 0.0499712255852675,
"grad_norm": 1.0714952945709229,
"learning_rate": 0.0002982963795292978,
"loss": 3.6833465576171873,
"step": 24400
},
{
"epoch": 0.05007362563769633,
"grad_norm": 0.924064576625824,
"learning_rate": 0.00029828910489697016,
"loss": 3.9215875244140626,
"step": 24450
},
{
"epoch": 0.05017602569012515,
"grad_norm": 0.9032275080680847,
"learning_rate": 0.00029828181485508956,
"loss": 4.0937020874023435,
"step": 24500
},
{
"epoch": 0.05027842574255398,
"grad_norm": 0.9629778861999512,
"learning_rate": 0.00029827450940441363,
"loss": 3.5827789306640625,
"step": 24550
},
{
"epoch": 0.050380825794982806,
"grad_norm": 1.0797669887542725,
"learning_rate": 0.00029826718854570147,
"loss": 3.6074313354492187,
"step": 24600
},
{
"epoch": 0.050483225847411634,
"grad_norm": 1.1837302446365356,
"learning_rate": 0.00029825985227971386,
"loss": 3.8778558349609376,
"step": 24650
},
{
"epoch": 0.05058562589984046,
"grad_norm": 1.0532505512237549,
"learning_rate": 0.0002982525006072131,
"loss": 4.007304382324219,
"step": 24700
},
{
"epoch": 0.05068802595226929,
"grad_norm": 1.024993896484375,
"learning_rate": 0.00029824513352896327,
"loss": 4.1383056640625,
"step": 24750
},
{
"epoch": 0.050790426004698117,
"grad_norm": 2.709007978439331,
"learning_rate": 0.00029823775104572976,
"loss": 3.71488525390625,
"step": 24800
},
{
"epoch": 0.050892826057126944,
"grad_norm": 0.9420567750930786,
"learning_rate": 0.0002982303531582799,
"loss": 4.161868591308593,
"step": 24850
},
{
"epoch": 0.050995226109555765,
"grad_norm": 1.638623595237732,
"learning_rate": 0.0002982229398673822,
"loss": 4.007568969726562,
"step": 24900
},
{
"epoch": 0.05109762616198459,
"grad_norm": 0.7433112859725952,
"learning_rate": 0.0002982155111738073,
"loss": 3.716796875,
"step": 24950
},
{
"epoch": 0.05120002621441342,
"grad_norm": 1.1634193658828735,
"learning_rate": 0.00029820806707832694,
"loss": 4.099712524414063,
"step": 25000
},
{
"epoch": 0.05130242626684225,
"grad_norm": 1.0174721479415894,
"learning_rate": 0.0002982006075817148,
"loss": 3.70357666015625,
"step": 25050
},
{
"epoch": 0.051404826319271076,
"grad_norm": 1.041905164718628,
"learning_rate": 0.00029819313268474593,
"loss": 3.85610107421875,
"step": 25100
},
{
"epoch": 0.0515072263716999,
"grad_norm": 1.108231782913208,
"learning_rate": 0.00029818564238819723,
"loss": 4.048504333496094,
"step": 25150
},
{
"epoch": 0.05160962642412873,
"grad_norm": 0.8780749440193176,
"learning_rate": 0.00029817813669284695,
"loss": 4.2607119750976565,
"step": 25200
},
{
"epoch": 0.05171202647655756,
"grad_norm": 1.0939981937408447,
"learning_rate": 0.0002981706155994751,
"loss": 4.242766723632813,
"step": 25250
},
{
"epoch": 0.051814426528986386,
"grad_norm": 0.9443891644477844,
"learning_rate": 0.00029816307910886323,
"loss": 4.077508850097656,
"step": 25300
},
{
"epoch": 0.05191682658141521,
"grad_norm": 0.8710380792617798,
"learning_rate": 0.00029815552722179447,
"loss": 3.954695739746094,
"step": 25350
},
{
"epoch": 0.052019226633844035,
"grad_norm": 0.9465594291687012,
"learning_rate": 0.0002981479599390536,
"loss": 3.9642620849609376,
"step": 25400
},
{
"epoch": 0.05212162668627286,
"grad_norm": 1.2072516679763794,
"learning_rate": 0.00029814037726142703,
"loss": 3.5950994873046875,
"step": 25450
},
{
"epoch": 0.05222402673870169,
"grad_norm": 0.9787052869796753,
"learning_rate": 0.0002981327791897026,
"loss": 3.669163818359375,
"step": 25500
},
{
"epoch": 0.05232642679113052,
"grad_norm": 0.9823593497276306,
"learning_rate": 0.00029812516572467,
"loss": 3.70659423828125,
"step": 25550
},
{
"epoch": 0.052428826843559345,
"grad_norm": 0.9548662304878235,
"learning_rate": 0.00029811753686712024,
"loss": 4.188983459472656,
"step": 25600
},
{
"epoch": 0.05253122689598817,
"grad_norm": 0.8237021565437317,
"learning_rate": 0.0002981098926178462,
"loss": 4.097180786132813,
"step": 25650
},
{
"epoch": 0.052633626948417,
"grad_norm": 0.8100720047950745,
"learning_rate": 0.00029810223297764224,
"loss": 4.057103271484375,
"step": 25700
},
{
"epoch": 0.05273602700084582,
"grad_norm": 0.9498805403709412,
"learning_rate": 0.00029809455794730424,
"loss": 3.9076028442382813,
"step": 25750
},
{
"epoch": 0.05283842705327465,
"grad_norm": 0.9514391422271729,
"learning_rate": 0.00029808686752762984,
"loss": 3.881569519042969,
"step": 25800
},
{
"epoch": 0.05294082710570348,
"grad_norm": 0.5591891407966614,
"learning_rate": 0.0002980791617194181,
"loss": 4.816184692382812,
"step": 25850
},
{
"epoch": 0.053043227158132304,
"grad_norm": 0.8840929269790649,
"learning_rate": 0.0002980714405234698,
"loss": 3.9826123046875,
"step": 25900
},
{
"epoch": 0.05314562721056113,
"grad_norm": 0.6732226610183716,
"learning_rate": 0.00029806370394058735,
"loss": 3.7573004150390625,
"step": 25950
},
{
"epoch": 0.05324802726298996,
"grad_norm": 1.1279404163360596,
"learning_rate": 0.0002980559519715747,
"loss": 3.7439083862304687,
"step": 26000
},
{
"epoch": 0.05335042731541879,
"grad_norm": 1.28814697265625,
"learning_rate": 0.0002980481846172372,
"loss": 3.40891357421875,
"step": 26050
},
{
"epoch": 0.053452827367847615,
"grad_norm": 0.8305365443229675,
"learning_rate": 0.0002980404018783823,
"loss": 3.9074551391601564,
"step": 26100
},
{
"epoch": 0.05355522742027644,
"grad_norm": 1.059561848640442,
"learning_rate": 0.0002980326037558186,
"loss": 3.3790802001953124,
"step": 26150
},
{
"epoch": 0.05365762747270526,
"grad_norm": 0.7863622903823853,
"learning_rate": 0.00029802479025035645,
"loss": 3.8910751342773438,
"step": 26200
},
{
"epoch": 0.05376002752513409,
"grad_norm": 0.8412345051765442,
"learning_rate": 0.0002980169613628078,
"loss": 3.905106201171875,
"step": 26250
},
{
"epoch": 0.05386242757756292,
"grad_norm": 0.6786169409751892,
"learning_rate": 0.0002980091170939862,
"loss": 3.6419586181640624,
"step": 26300
},
{
"epoch": 0.053964827629991746,
"grad_norm": 0.8411727547645569,
"learning_rate": 0.00029800125744470677,
"loss": 3.3573968505859373,
"step": 26350
},
{
"epoch": 0.054067227682420574,
"grad_norm": 0.9979608654975891,
"learning_rate": 0.0002979933824157863,
"loss": 3.6130526733398436,
"step": 26400
},
{
"epoch": 0.0541696277348494,
"grad_norm": 0.8738940358161926,
"learning_rate": 0.00029798549200804305,
"loss": 3.2773031616210937,
"step": 26450
},
{
"epoch": 0.05427202778727823,
"grad_norm": 0.8625099062919617,
"learning_rate": 0.0002979775862222971,
"loss": 3.92064453125,
"step": 26500
},
{
"epoch": 0.05437442783970706,
"grad_norm": 1.1380776166915894,
"learning_rate": 0.00029796966505936975,
"loss": 3.9016488647460936,
"step": 26550
},
{
"epoch": 0.05447682789213588,
"grad_norm": 0.8728241324424744,
"learning_rate": 0.0002979617285200844,
"loss": 4.155015258789063,
"step": 26600
},
{
"epoch": 0.054579227944564705,
"grad_norm": 1.174974799156189,
"learning_rate": 0.0002979537766052656,
"loss": 3.755271301269531,
"step": 26650
},
{
"epoch": 0.05468162799699353,
"grad_norm": 1.0797170400619507,
"learning_rate": 0.00029794580931573973,
"loss": 3.6002767944335936,
"step": 26700
},
{
"epoch": 0.05478402804942236,
"grad_norm": 0.8095331192016602,
"learning_rate": 0.0002979378266523347,
"loss": 3.9049578857421876,
"step": 26750
},
{
"epoch": 0.05488642810185119,
"grad_norm": 0.8785421252250671,
"learning_rate": 0.00029792982861588007,
"loss": 3.594248046875,
"step": 26800
},
{
"epoch": 0.054988828154280016,
"grad_norm": 0.8992822766304016,
"learning_rate": 0.0002979218152072069,
"loss": 4.156261901855469,
"step": 26850
},
{
"epoch": 0.055091228206708844,
"grad_norm": 1.633196234703064,
"learning_rate": 0.000297913786427148,
"loss": 3.608190612792969,
"step": 26900
},
{
"epoch": 0.05519362825913767,
"grad_norm": 1.1997803449630737,
"learning_rate": 0.0002979057422765376,
"loss": 3.6971340942382813,
"step": 26950
},
{
"epoch": 0.0552960283115665,
"grad_norm": 0.987196147441864,
"learning_rate": 0.00029789768275621163,
"loss": 3.6062017822265626,
"step": 27000
},
{
"epoch": 0.05539842836399532,
"grad_norm": 1.0470249652862549,
"learning_rate": 0.00029788960786700767,
"loss": 3.6216055297851564,
"step": 27050
},
{
"epoch": 0.05550082841642415,
"grad_norm": 1.3368786573410034,
"learning_rate": 0.00029788151760976473,
"loss": 3.4363177490234373,
"step": 27100
},
{
"epoch": 0.055603228468852975,
"grad_norm": 1.0057690143585205,
"learning_rate": 0.0002978734119853236,
"loss": 2.9398748779296877,
"step": 27150
},
{
"epoch": 0.0557056285212818,
"grad_norm": 1.0253512859344482,
"learning_rate": 0.0002978652909945265,
"loss": 3.5486212158203125,
"step": 27200
},
{
"epoch": 0.05580802857371063,
"grad_norm": 0.9567630887031555,
"learning_rate": 0.0002978571546382174,
"loss": 3.531204833984375,
"step": 27250
},
{
"epoch": 0.05591042862613946,
"grad_norm": 0.7189958691596985,
"learning_rate": 0.00029784900291724174,
"loss": 4.003550415039062,
"step": 27300
},
{
"epoch": 0.056012828678568286,
"grad_norm": 0.7804083228111267,
"learning_rate": 0.0002978408358324466,
"loss": 3.952115173339844,
"step": 27350
},
{
"epoch": 0.05611522873099711,
"grad_norm": 0.7131394743919373,
"learning_rate": 0.00029783265338468077,
"loss": 3.712818298339844,
"step": 27400
},
{
"epoch": 0.056217628783425934,
"grad_norm": 0.9421349167823792,
"learning_rate": 0.0002978244555747944,
"loss": 3.955911865234375,
"step": 27450
},
{
"epoch": 0.05632002883585476,
"grad_norm": 1.1702853441238403,
"learning_rate": 0.0002978162424036395,
"loss": 3.715908203125,
"step": 27500
},
{
"epoch": 0.05642242888828359,
"grad_norm": 1.0307793617248535,
"learning_rate": 0.0002978080138720694,
"loss": 4.063231506347656,
"step": 27550
},
{
"epoch": 0.05652482894071242,
"grad_norm": 1.0633025169372559,
"learning_rate": 0.00029779976998093926,
"loss": 3.9883132934570313,
"step": 27600
},
{
"epoch": 0.056627228993141245,
"grad_norm": 1.6195343732833862,
"learning_rate": 0.0002977915107311058,
"loss": 4.001260681152344,
"step": 27650
},
{
"epoch": 0.05672962904557007,
"grad_norm": 0.9477188587188721,
"learning_rate": 0.00029778323612342716,
"loss": 3.9925576782226564,
"step": 27700
},
{
"epoch": 0.0568320290979989,
"grad_norm": 0.7277911305427551,
"learning_rate": 0.00029777494615876337,
"loss": 3.8298355102539063,
"step": 27750
},
{
"epoch": 0.05693442915042773,
"grad_norm": 0.8074896931648254,
"learning_rate": 0.0002977666408379757,
"loss": 3.53470947265625,
"step": 27800
},
{
"epoch": 0.057036829202856555,
"grad_norm": 1.870801568031311,
"learning_rate": 0.0002977583201619273,
"loss": 4.093720703125,
"step": 27850
},
{
"epoch": 0.057139229255285376,
"grad_norm": 0.9061904549598694,
"learning_rate": 0.00029774998413148283,
"loss": 3.6751202392578124,
"step": 27900
},
{
"epoch": 0.057241629307714204,
"grad_norm": 0.766776978969574,
"learning_rate": 0.0002977416327475085,
"loss": 3.7472940063476563,
"step": 27950
},
{
"epoch": 0.05734402936014303,
"grad_norm": 0.9437297582626343,
"learning_rate": 0.0002977332660108722,
"loss": 3.1673342895507814,
"step": 28000
},
{
"epoch": 0.05744642941257186,
"grad_norm": 0.9875741004943848,
"learning_rate": 0.00029772488392244324,
"loss": 3.69399658203125,
"step": 28050
},
{
"epoch": 0.057548829465000687,
"grad_norm": 1.2089347839355469,
"learning_rate": 0.00029771648648309275,
"loss": 3.5663076782226564,
"step": 28100
},
{
"epoch": 0.057651229517429514,
"grad_norm": 1.0613031387329102,
"learning_rate": 0.00029770807369369334,
"loss": 3.696695556640625,
"step": 28150
},
{
"epoch": 0.05775362956985834,
"grad_norm": 1.1133229732513428,
"learning_rate": 0.00029769964555511925,
"loss": 3.527508544921875,
"step": 28200
},
{
"epoch": 0.05785602962228717,
"grad_norm": 1.0089772939682007,
"learning_rate": 0.0002976912020682463,
"loss": 3.744898376464844,
"step": 28250
},
{
"epoch": 0.05795842967471599,
"grad_norm": 0.9647061824798584,
"learning_rate": 0.00029768274323395183,
"loss": 3.6294049072265624,
"step": 28300
},
{
"epoch": 0.05806082972714482,
"grad_norm": 1.680829644203186,
"learning_rate": 0.00029767426905311485,
"loss": 3.6455474853515626,
"step": 28350
},
{
"epoch": 0.058163229779573646,
"grad_norm": 0.9101169109344482,
"learning_rate": 0.00029766577952661607,
"loss": 3.9211056518554686,
"step": 28400
},
{
"epoch": 0.05826562983200247,
"grad_norm": 1.0310935974121094,
"learning_rate": 0.00029765727465533764,
"loss": 3.7476397705078126,
"step": 28450
},
{
"epoch": 0.0583680298844313,
"grad_norm": 1.042888879776001,
"learning_rate": 0.00029764875444016325,
"loss": 4.1402108764648435,
"step": 28500
},
{
"epoch": 0.05847042993686013,
"grad_norm": 1.0709255933761597,
"learning_rate": 0.00029764021888197835,
"loss": 3.9610775756835936,
"step": 28550
},
{
"epoch": 0.058572829989288956,
"grad_norm": 1.027099370956421,
"learning_rate": 0.00029763166798166995,
"loss": 3.751552734375,
"step": 28600
},
{
"epoch": 0.058675230041717784,
"grad_norm": 0.7349804639816284,
"learning_rate": 0.0002976231017401266,
"loss": 3.742770080566406,
"step": 28650
},
{
"epoch": 0.05877763009414661,
"grad_norm": 1.0283441543579102,
"learning_rate": 0.0002976145201582384,
"loss": 3.7890921020507813,
"step": 28700
},
{
"epoch": 0.05888003014657543,
"grad_norm": 0.8082360029220581,
"learning_rate": 0.00029760592323689725,
"loss": 4.149041748046875,
"step": 28750
},
{
"epoch": 0.05898243019900426,
"grad_norm": 0.9537481665611267,
"learning_rate": 0.00029759731097699635,
"loss": 4.166469421386719,
"step": 28800
},
{
"epoch": 0.05908483025143309,
"grad_norm": 1.1642649173736572,
"learning_rate": 0.0002975886833794308,
"loss": 4.074107360839844,
"step": 28850
},
{
"epoch": 0.059187230303861915,
"grad_norm": 1.0695040225982666,
"learning_rate": 0.00029758004044509707,
"loss": 4.009411926269531,
"step": 28900
},
{
"epoch": 0.05928963035629074,
"grad_norm": 0.933382511138916,
"learning_rate": 0.00029757138217489324,
"loss": 3.857533264160156,
"step": 28950
},
{
"epoch": 0.05939203040871957,
"grad_norm": 1.0519219636917114,
"learning_rate": 0.0002975627085697191,
"loss": 3.5922341918945313,
"step": 29000
},
{
"epoch": 0.0594944304611484,
"grad_norm": 0.879135251045227,
"learning_rate": 0.00029755401963047596,
"loss": 4.271012268066406,
"step": 29050
},
{
"epoch": 0.059596830513577226,
"grad_norm": 1.0314289331436157,
"learning_rate": 0.0002975453153580667,
"loss": 3.891732177734375,
"step": 29100
},
{
"epoch": 0.05969923056600605,
"grad_norm": 0.9761302471160889,
"learning_rate": 0.000297536595753396,
"loss": 3.860959167480469,
"step": 29150
},
{
"epoch": 0.059801630618434874,
"grad_norm": 0.866371750831604,
"learning_rate": 0.0002975278608173697,
"loss": 3.8342303466796874,
"step": 29200
},
{
"epoch": 0.0599040306708637,
"grad_norm": 0.9015768766403198,
"learning_rate": 0.0002975191105508957,
"loss": 3.8901824951171875,
"step": 29250
},
{
"epoch": 0.06000643072329253,
"grad_norm": 0.9253438711166382,
"learning_rate": 0.0002975103449548832,
"loss": 3.8019094848632813,
"step": 29300
},
{
"epoch": 0.06010883077572136,
"grad_norm": 0.7289124727249146,
"learning_rate": 0.0002975015640302431,
"loss": 3.34075439453125,
"step": 29350
},
{
"epoch": 0.060211230828150185,
"grad_norm": 0.713688313961029,
"learning_rate": 0.0002974927677778879,
"loss": 3.235279235839844,
"step": 29400
},
{
"epoch": 0.06031363088057901,
"grad_norm": 0.6275246143341064,
"learning_rate": 0.0002974839561987316,
"loss": 3.8884927368164064,
"step": 29450
},
{
"epoch": 0.06041603093300784,
"grad_norm": 1.1090385913848877,
"learning_rate": 0.0002974751292936899,
"loss": 3.3796435546875,
"step": 29500
},
{
"epoch": 0.06051843098543667,
"grad_norm": 0.8206045031547546,
"learning_rate": 0.0002974662870636801,
"loss": 3.9724603271484376,
"step": 29550
},
{
"epoch": 0.06062083103786549,
"grad_norm": 1.3841317892074585,
"learning_rate": 0.00029745742950962095,
"loss": 3.951322021484375,
"step": 29600
},
{
"epoch": 0.060723231090294316,
"grad_norm": 0.9978547692298889,
"learning_rate": 0.000297448556632433,
"loss": 4.5418917846679685,
"step": 29650
},
{
"epoch": 0.060825631142723144,
"grad_norm": 0.9191545248031616,
"learning_rate": 0.0002974396684330382,
"loss": 3.5654345703125,
"step": 29700
},
{
"epoch": 0.06092803119515197,
"grad_norm": 1.4994065761566162,
"learning_rate": 0.0002974307649123602,
"loss": 3.7218731689453124,
"step": 29750
},
{
"epoch": 0.0610304312475808,
"grad_norm": 0.6516634821891785,
"learning_rate": 0.0002974218460713242,
"loss": 3.561522216796875,
"step": 29800
},
{
"epoch": 0.06113283130000963,
"grad_norm": 1.1022820472717285,
"learning_rate": 0.000297412911910857,
"loss": 4.207413024902344,
"step": 29850
},
{
"epoch": 0.061235231352438454,
"grad_norm": 0.861346960067749,
"learning_rate": 0.000297403962431887,
"loss": 3.959631042480469,
"step": 29900
},
{
"epoch": 0.06133763140486728,
"grad_norm": 0.8098173141479492,
"learning_rate": 0.0002973949976353442,
"loss": 4.688843383789062,
"step": 29950
},
{
"epoch": 0.0614400314572961,
"grad_norm": 0.8004640936851501,
"learning_rate": 0.0002973860175221603,
"loss": 4.384559631347656,
"step": 30000
},
{
"epoch": 0.06154243150972493,
"grad_norm": 1.4548406600952148,
"learning_rate": 0.0002973770220932683,
"loss": 4.232876281738282,
"step": 30050
},
{
"epoch": 0.06164483156215376,
"grad_norm": 1.1136951446533203,
"learning_rate": 0.00029736801134960296,
"loss": 4.017593994140625,
"step": 30100
},
{
"epoch": 0.061747231614582586,
"grad_norm": 0.9526700377464294,
"learning_rate": 0.00029735898529210074,
"loss": 3.694122619628906,
"step": 30150
},
{
"epoch": 0.061849631667011414,
"grad_norm": 0.9094407558441162,
"learning_rate": 0.0002973499439216996,
"loss": 2.5258544921875,
"step": 30200
},
{
"epoch": 0.06195203171944024,
"grad_norm": 0.9788475632667542,
"learning_rate": 0.000297340887239339,
"loss": 3.6749945068359375,
"step": 30250
},
{
"epoch": 0.06205443177186907,
"grad_norm": 0.9837728142738342,
"learning_rate": 0.00029733181524596006,
"loss": 3.9548965454101563,
"step": 30300
},
{
"epoch": 0.062156831824297896,
"grad_norm": 0.9949678778648376,
"learning_rate": 0.00029732272794250563,
"loss": 3.194211730957031,
"step": 30350
},
{
"epoch": 0.062259231876726724,
"grad_norm": 1.091620683670044,
"learning_rate": 0.00029731362532991985,
"loss": 3.8439263916015625,
"step": 30400
},
{
"epoch": 0.062361631929155545,
"grad_norm": 0.759272575378418,
"learning_rate": 0.0002973045074091488,
"loss": 3.965645751953125,
"step": 30450
},
{
"epoch": 0.06246403198158437,
"grad_norm": 0.9479434490203857,
"learning_rate": 0.0002972953741811398,
"loss": 3.6418606567382814,
"step": 30500
},
{
"epoch": 0.0625664320340132,
"grad_norm": 0.8087990880012512,
"learning_rate": 0.00029728622564684204,
"loss": 3.7622882080078126,
"step": 30550
},
{
"epoch": 0.06266883208644203,
"grad_norm": 1.2932571172714233,
"learning_rate": 0.0002972770618072062,
"loss": 4.1614468383789065,
"step": 30600
},
{
"epoch": 0.06277123213887086,
"grad_norm": 0.6852632761001587,
"learning_rate": 0.00029726788266318455,
"loss": 3.5135552978515623,
"step": 30650
},
{
"epoch": 0.06287363219129968,
"grad_norm": 0.9849332571029663,
"learning_rate": 0.0002972586882157309,
"loss": 3.3184869384765623,
"step": 30700
},
{
"epoch": 0.06297603224372851,
"grad_norm": 1.1004332304000854,
"learning_rate": 0.00029724947846580064,
"loss": 3.4316140747070314,
"step": 30750
},
{
"epoch": 0.06307843229615734,
"grad_norm": 0.9240966439247131,
"learning_rate": 0.00029724025341435097,
"loss": 4.058392333984375,
"step": 30800
},
{
"epoch": 0.06318083234858617,
"grad_norm": 0.8939677476882935,
"learning_rate": 0.0002972310130623404,
"loss": 4.048366088867187,
"step": 30850
},
{
"epoch": 0.063283232401015,
"grad_norm": 0.8218761086463928,
"learning_rate": 0.00029722175741072915,
"loss": 4.063833618164063,
"step": 30900
},
{
"epoch": 0.06338563245344382,
"grad_norm": 0.9675712585449219,
"learning_rate": 0.0002972124864604791,
"loss": 3.7749728393554687,
"step": 30950
},
{
"epoch": 0.06348803250587265,
"grad_norm": 1.2063570022583008,
"learning_rate": 0.0002972032002125536,
"loss": 3.5751220703125,
"step": 31000
},
{
"epoch": 0.06359043255830148,
"grad_norm": 1.0709924697875977,
"learning_rate": 0.00029719389866791755,
"loss": 3.7293637084960936,
"step": 31050
},
{
"epoch": 0.06369283261073029,
"grad_norm": 0.8866503834724426,
"learning_rate": 0.0002971845818275377,
"loss": 3.907535400390625,
"step": 31100
},
{
"epoch": 0.06379523266315912,
"grad_norm": 1.0057551860809326,
"learning_rate": 0.00029717524969238206,
"loss": 3.222738037109375,
"step": 31150
},
{
"epoch": 0.06389763271558795,
"grad_norm": 0.9129172563552856,
"learning_rate": 0.0002971659022634205,
"loss": 3.4403155517578123,
"step": 31200
},
{
"epoch": 0.06400003276801677,
"grad_norm": 0.9336997866630554,
"learning_rate": 0.0002971565395416243,
"loss": 3.30571044921875,
"step": 31250
},
{
"epoch": 0.0641024328204456,
"grad_norm": 1.1164926290512085,
"learning_rate": 0.0002971471615279664,
"loss": 3.8188116455078127,
"step": 31300
},
{
"epoch": 0.06420483287287443,
"grad_norm": 0.8115789890289307,
"learning_rate": 0.0002971377682234213,
"loss": 3.9151617431640626,
"step": 31350
},
{
"epoch": 0.06430723292530326,
"grad_norm": 0.9240061044692993,
"learning_rate": 0.00029712835962896514,
"loss": 3.709864196777344,
"step": 31400
},
{
"epoch": 0.06440963297773208,
"grad_norm": 1.6785798072814941,
"learning_rate": 0.0002971189357455756,
"loss": 3.689013671875,
"step": 31450
},
{
"epoch": 0.06451203303016091,
"grad_norm": 0.7833497524261475,
"learning_rate": 0.0002971094965742321,
"loss": 3.3715243530273438,
"step": 31500
},
{
"epoch": 0.06461443308258974,
"grad_norm": 0.8799951076507568,
"learning_rate": 0.0002971000421159153,
"loss": 4.09000244140625,
"step": 31550
},
{
"epoch": 0.06471683313501857,
"grad_norm": 0.7977895736694336,
"learning_rate": 0.0002970905723716078,
"loss": 4.248508911132813,
"step": 31600
},
{
"epoch": 0.0648192331874474,
"grad_norm": 0.8709924221038818,
"learning_rate": 0.00029708108734229365,
"loss": 3.489057922363281,
"step": 31650
},
{
"epoch": 0.06492163323987622,
"grad_norm": 0.8895650506019592,
"learning_rate": 0.00029707158702895847,
"loss": 3.898555908203125,
"step": 31700
},
{
"epoch": 0.06502403329230505,
"grad_norm": 0.8814746737480164,
"learning_rate": 0.00029706207143258945,
"loss": 3.7208917236328123,
"step": 31750
},
{
"epoch": 0.06512643334473388,
"grad_norm": 0.9977162480354309,
"learning_rate": 0.0002970525405541755,
"loss": 4.208245849609375,
"step": 31800
},
{
"epoch": 0.0652288333971627,
"grad_norm": 0.7882950901985168,
"learning_rate": 0.0002970429943947069,
"loss": 3.7341409301757813,
"step": 31850
},
{
"epoch": 0.06533123344959153,
"grad_norm": 0.9084259867668152,
"learning_rate": 0.00029703343295517577,
"loss": 3.782439880371094,
"step": 31900
},
{
"epoch": 0.06543363350202035,
"grad_norm": 1.0745272636413574,
"learning_rate": 0.0002970238562365756,
"loss": 3.530187072753906,
"step": 31950
},
{
"epoch": 0.06553603355444917,
"grad_norm": 0.7873273491859436,
"learning_rate": 0.0002970142642399017,
"loss": 3.5862966918945314,
"step": 32000
},
{
"epoch": 0.065638433606878,
"grad_norm": 0.9771028757095337,
"learning_rate": 0.0002970046569661506,
"loss": 3.8827175903320312,
"step": 32050
},
{
"epoch": 0.06574083365930683,
"grad_norm": 0.8443105816841125,
"learning_rate": 0.00029699503441632085,
"loss": 3.28268310546875,
"step": 32100
},
{
"epoch": 0.06584323371173566,
"grad_norm": 0.7213400602340698,
"learning_rate": 0.0002969853965914123,
"loss": 3.13387939453125,
"step": 32150
},
{
"epoch": 0.06594563376416449,
"grad_norm": 1.1795644760131836,
"learning_rate": 0.0002969757434924265,
"loss": 3.658702087402344,
"step": 32200
},
{
"epoch": 0.06604803381659331,
"grad_norm": 0.7857722640037537,
"learning_rate": 0.0002969660751203665,
"loss": 3.4446502685546876,
"step": 32250
},
{
"epoch": 0.06615043386902214,
"grad_norm": 1.0390616655349731,
"learning_rate": 0.00029695639147623703,
"loss": 3.644783630371094,
"step": 32300
},
{
"epoch": 0.06625283392145097,
"grad_norm": 0.7487825155258179,
"learning_rate": 0.00029694669256104446,
"loss": 3.63455810546875,
"step": 32350
},
{
"epoch": 0.0663552339738798,
"grad_norm": 0.8825246691703796,
"learning_rate": 0.0002969369783757965,
"loss": 3.3496524047851564,
"step": 32400
},
{
"epoch": 0.06645763402630862,
"grad_norm": 1.1626224517822266,
"learning_rate": 0.00029692724892150266,
"loss": 3.726259460449219,
"step": 32450
},
{
"epoch": 0.06656003407873745,
"grad_norm": 0.74493008852005,
"learning_rate": 0.00029691750419917406,
"loss": 3.7289053344726564,
"step": 32500
},
{
"epoch": 0.06666243413116628,
"grad_norm": 0.6749517917633057,
"learning_rate": 0.00029690774420982317,
"loss": 3.5053274536132815,
"step": 32550
},
{
"epoch": 0.0667648341835951,
"grad_norm": 1.099471926689148,
"learning_rate": 0.0002968979689544644,
"loss": 3.514427490234375,
"step": 32600
},
{
"epoch": 0.06686723423602393,
"grad_norm": 0.9038723111152649,
"learning_rate": 0.00029688817843411344,
"loss": 3.616097106933594,
"step": 32650
},
{
"epoch": 0.06696963428845276,
"grad_norm": 0.7338837385177612,
"learning_rate": 0.0002968783726497877,
"loss": 3.4425479125976564,
"step": 32700
},
{
"epoch": 0.06707203434088159,
"grad_norm": 1.254689335823059,
"learning_rate": 0.0002968685516025061,
"loss": 3.3777651977539063,
"step": 32750
},
{
"epoch": 0.0671744343933104,
"grad_norm": 0.8535405397415161,
"learning_rate": 0.00029685871529328933,
"loss": 4.319814758300781,
"step": 32800
},
{
"epoch": 0.06727683444573923,
"grad_norm": 0.9299177527427673,
"learning_rate": 0.00029684886372315935,
"loss": 3.78345458984375,
"step": 32850
},
{
"epoch": 0.06737923449816806,
"grad_norm": 1.0497288703918457,
"learning_rate": 0.0002968389968931401,
"loss": 3.619969787597656,
"step": 32900
},
{
"epoch": 0.06748163455059689,
"grad_norm": 0.9285115599632263,
"learning_rate": 0.00029682911480425673,
"loss": 3.488844909667969,
"step": 32950
},
{
"epoch": 0.06758403460302571,
"grad_norm": 1.2114810943603516,
"learning_rate": 0.0002968192174575362,
"loss": 3.8050308227539062,
"step": 33000
},
{
"epoch": 0.06768643465545454,
"grad_norm": 0.9714403748512268,
"learning_rate": 0.0002968093048540071,
"loss": 3.402801208496094,
"step": 33050
},
{
"epoch": 0.06778883470788337,
"grad_norm": 1.049149990081787,
"learning_rate": 0.00029679937699469934,
"loss": 3.4410101318359376,
"step": 33100
},
{
"epoch": 0.0678912347603122,
"grad_norm": 0.8005252480506897,
"learning_rate": 0.0002967894338806446,
"loss": 3.9667138671875,
"step": 33150
},
{
"epoch": 0.06799363481274102,
"grad_norm": 1.0901520252227783,
"learning_rate": 0.00029677947551287625,
"loss": 3.6659295654296873,
"step": 33200
},
{
"epoch": 0.06809603486516985,
"grad_norm": 0.9532211422920227,
"learning_rate": 0.000296769501892429,
"loss": 3.9132586669921876,
"step": 33250
},
{
"epoch": 0.06819843491759868,
"grad_norm": 1.3878906965255737,
"learning_rate": 0.0002967595130203394,
"loss": 3.927642822265625,
"step": 33300
},
{
"epoch": 0.06830083497002751,
"grad_norm": 1.046176552772522,
"learning_rate": 0.00029674950889764523,
"loss": 3.9409329223632814,
"step": 33350
},
{
"epoch": 0.06840323502245634,
"grad_norm": 0.8497132062911987,
"learning_rate": 0.0002967394895253863,
"loss": 3.60568359375,
"step": 33400
},
{
"epoch": 0.06850563507488516,
"grad_norm": 1.1313576698303223,
"learning_rate": 0.00029672945490460365,
"loss": 4.024774780273438,
"step": 33450
},
{
"epoch": 0.06860803512731399,
"grad_norm": 1.0038946866989136,
"learning_rate": 0.00029671940503634006,
"loss": 3.707646484375,
"step": 33500
},
{
"epoch": 0.06871043517974282,
"grad_norm": 1.1383546590805054,
"learning_rate": 0.0002967093399216399,
"loss": 3.2730068969726562,
"step": 33550
},
{
"epoch": 0.06881283523217165,
"grad_norm": 0.9146387577056885,
"learning_rate": 0.00029669925956154905,
"loss": 3.9269442749023438,
"step": 33600
},
{
"epoch": 0.06891523528460046,
"grad_norm": 0.6965939402580261,
"learning_rate": 0.000296689163957115,
"loss": 2.7985528564453124,
"step": 33650
},
{
"epoch": 0.06901763533702929,
"grad_norm": 0.8769970536231995,
"learning_rate": 0.00029667905310938695,
"loss": 4.186055908203125,
"step": 33700
},
{
"epoch": 0.06912003538945811,
"grad_norm": 0.8398081660270691,
"learning_rate": 0.0002966689270194154,
"loss": 3.677633056640625,
"step": 33750
},
{
"epoch": 0.06922243544188694,
"grad_norm": 0.7318697571754456,
"learning_rate": 0.00029665878568825284,
"loss": 4.001636352539062,
"step": 33800
},
{
"epoch": 0.06932483549431577,
"grad_norm": 1.0592197179794312,
"learning_rate": 0.00029664862911695286,
"loss": 3.9292962646484373,
"step": 33850
},
{
"epoch": 0.0694272355467446,
"grad_norm": 1.3345533609390259,
"learning_rate": 0.0002966384573065711,
"loss": 3.7566705322265626,
"step": 33900
},
{
"epoch": 0.06952963559917343,
"grad_norm": 0.9815147519111633,
"learning_rate": 0.00029662827025816443,
"loss": 3.8881317138671876,
"step": 33950
},
{
"epoch": 0.06963203565160225,
"grad_norm": 0.996683657169342,
"learning_rate": 0.00029661806797279147,
"loss": 3.9453826904296876,
"step": 34000
},
{
"epoch": 0.06973443570403108,
"grad_norm": 1.0983341932296753,
"learning_rate": 0.0002966078504515125,
"loss": 4.025393371582031,
"step": 34050
},
{
"epoch": 0.06983683575645991,
"grad_norm": 1.2514588832855225,
"learning_rate": 0.0002965976176953891,
"loss": 4.020445556640625,
"step": 34100
},
{
"epoch": 0.06993923580888874,
"grad_norm": 0.7997650504112244,
"learning_rate": 0.00029658736970548477,
"loss": 3.7041055297851564,
"step": 34150
},
{
"epoch": 0.07004163586131756,
"grad_norm": 0.7876397371292114,
"learning_rate": 0.00029657710648286437,
"loss": 3.2046856689453125,
"step": 34200
},
{
"epoch": 0.07014403591374639,
"grad_norm": 0.9293930530548096,
"learning_rate": 0.00029656682802859443,
"loss": 3.7819009399414063,
"step": 34250
},
{
"epoch": 0.07024643596617522,
"grad_norm": 0.6517935395240784,
"learning_rate": 0.000296556534343743,
"loss": 2.6689993286132814,
"step": 34300
},
{
"epoch": 0.07034883601860405,
"grad_norm": 1.03813898563385,
"learning_rate": 0.00029654622542937977,
"loss": 2.5518731689453125,
"step": 34350
},
{
"epoch": 0.07045123607103287,
"grad_norm": 0.7847388386726379,
"learning_rate": 0.00029653590128657603,
"loss": 3.8658258056640626,
"step": 34400
},
{
"epoch": 0.0705536361234617,
"grad_norm": 0.9255051612854004,
"learning_rate": 0.0002965255619164046,
"loss": 3.4440853881835936,
"step": 34450
},
{
"epoch": 0.07065603617589052,
"grad_norm": 0.8334102630615234,
"learning_rate": 0.00029651520731993993,
"loss": 3.837626647949219,
"step": 34500
},
{
"epoch": 0.07075843622831934,
"grad_norm": 1.0661958456039429,
"learning_rate": 0.000296504837498258,
"loss": 4.052589111328125,
"step": 34550
},
{
"epoch": 0.07086083628074817,
"grad_norm": 0.8307774662971497,
"learning_rate": 0.0002964944524524363,
"loss": 4.152563781738281,
"step": 34600
},
{
"epoch": 0.070963236333177,
"grad_norm": 1.1851427555084229,
"learning_rate": 0.00029648405218355415,
"loss": 3.877910461425781,
"step": 34650
},
{
"epoch": 0.07106563638560583,
"grad_norm": 1.024609088897705,
"learning_rate": 0.0002964736366926923,
"loss": 2.9125543212890626,
"step": 34700
},
{
"epoch": 0.07116803643803465,
"grad_norm": 1.091864824295044,
"learning_rate": 0.00029646320598093295,
"loss": 3.8221173095703125,
"step": 34750
},
{
"epoch": 0.07127043649046348,
"grad_norm": 0.9245619177818298,
"learning_rate": 0.0002964527600493601,
"loss": 2.984726867675781,
"step": 34800
},
{
"epoch": 0.07137283654289231,
"grad_norm": 0.9920394420623779,
"learning_rate": 0.0002964422988990592,
"loss": 3.581501159667969,
"step": 34850
},
{
"epoch": 0.07147523659532114,
"grad_norm": 0.7046719789505005,
"learning_rate": 0.0002964318225311174,
"loss": 2.9280935668945314,
"step": 34900
},
{
"epoch": 0.07157763664774996,
"grad_norm": 0.7766258120536804,
"learning_rate": 0.0002964213309466233,
"loss": 2.3795321655273436,
"step": 34950
},
{
"epoch": 0.07168003670017879,
"grad_norm": 1.3687994480133057,
"learning_rate": 0.0002964108241466672,
"loss": 3.43721923828125,
"step": 35000
},
{
"epoch": 0.07178243675260762,
"grad_norm": 1.0140045881271362,
"learning_rate": 0.00029640030213234084,
"loss": 3.19546875,
"step": 35050
},
{
"epoch": 0.07188483680503645,
"grad_norm": 0.8950518369674683,
"learning_rate": 0.0002963897649047376,
"loss": 3.794825134277344,
"step": 35100
},
{
"epoch": 0.07198723685746528,
"grad_norm": 1.7291576862335205,
"learning_rate": 0.0002963792124649526,
"loss": 4.249531555175781,
"step": 35150
},
{
"epoch": 0.0720896369098941,
"grad_norm": 1.7866417169570923,
"learning_rate": 0.0002963686448140823,
"loss": 3.9559259033203125,
"step": 35200
},
{
"epoch": 0.07219203696232293,
"grad_norm": 0.9784793257713318,
"learning_rate": 0.0002963580619532249,
"loss": 3.6990866088867187,
"step": 35250
},
{
"epoch": 0.07229443701475176,
"grad_norm": 1.6409136056900024,
"learning_rate": 0.00029634746388348005,
"loss": 3.6978335571289063,
"step": 35300
},
{
"epoch": 0.07239683706718057,
"grad_norm": 1.109778642654419,
"learning_rate": 0.00029633685060594914,
"loss": 3.7638284301757814,
"step": 35350
},
{
"epoch": 0.0724992371196094,
"grad_norm": 1.3247849941253662,
"learning_rate": 0.000296326222121735,
"loss": 4.101665954589844,
"step": 35400
},
{
"epoch": 0.07260163717203823,
"grad_norm": 1.0803288221359253,
"learning_rate": 0.0002963155784319421,
"loss": 3.9325439453125,
"step": 35450
},
{
"epoch": 0.07270403722446706,
"grad_norm": 1.2640902996063232,
"learning_rate": 0.00029630491953767647,
"loss": 3.4765811157226563,
"step": 35500
},
{
"epoch": 0.07280643727689588,
"grad_norm": 1.0323841571807861,
"learning_rate": 0.0002962942454400458,
"loss": 3.8790185546875,
"step": 35550
},
{
"epoch": 0.07290883732932471,
"grad_norm": 0.9365559816360474,
"learning_rate": 0.0002962835561401592,
"loss": 3.8441122436523436,
"step": 35600
},
{
"epoch": 0.07301123738175354,
"grad_norm": 0.9189864993095398,
"learning_rate": 0.00029627285163912753,
"loss": 3.819436340332031,
"step": 35650
},
{
"epoch": 0.07311363743418237,
"grad_norm": 1.2897831201553345,
"learning_rate": 0.00029626213193806317,
"loss": 3.544706115722656,
"step": 35700
},
{
"epoch": 0.0732160374866112,
"grad_norm": 0.86373370885849,
"learning_rate": 0.00029625139703807996,
"loss": 3.7399945068359375,
"step": 35750
},
{
"epoch": 0.07331843753904002,
"grad_norm": 1.0938329696655273,
"learning_rate": 0.00029624064694029357,
"loss": 3.89250244140625,
"step": 35800
},
{
"epoch": 0.07342083759146885,
"grad_norm": 0.9408879280090332,
"learning_rate": 0.000296229881645821,
"loss": 3.056258239746094,
"step": 35850
},
{
"epoch": 0.07352323764389768,
"grad_norm": 1.1271533966064453,
"learning_rate": 0.0002962191011557809,
"loss": 3.544586181640625,
"step": 35900
},
{
"epoch": 0.0736256376963265,
"grad_norm": 1.011702537536621,
"learning_rate": 0.0002962083054712936,
"loss": 3.683125305175781,
"step": 35950
},
{
"epoch": 0.07372803774875533,
"grad_norm": 0.8757224678993225,
"learning_rate": 0.000296197494593481,
"loss": 3.3673382568359376,
"step": 36000
},
{
"epoch": 0.07383043780118416,
"grad_norm": 0.6535724997520447,
"learning_rate": 0.00029618666852346644,
"loss": 4.935340881347656,
"step": 36050
},
{
"epoch": 0.07393283785361299,
"grad_norm": 0.6584002375602722,
"learning_rate": 0.0002961758272623749,
"loss": 4.499714660644531,
"step": 36100
},
{
"epoch": 0.07403523790604181,
"grad_norm": 0.8999959230422974,
"learning_rate": 0.000296164970811333,
"loss": 4.462132568359375,
"step": 36150
},
{
"epoch": 0.07413763795847063,
"grad_norm": 1.0016320943832397,
"learning_rate": 0.00029615409917146886,
"loss": 3.6168402099609374,
"step": 36200
},
{
"epoch": 0.07424003801089946,
"grad_norm": 0.9233262538909912,
"learning_rate": 0.0002961432123439122,
"loss": 3.7079287719726564,
"step": 36250
},
{
"epoch": 0.07434243806332828,
"grad_norm": 1.2862437963485718,
"learning_rate": 0.0002961323103297944,
"loss": 3.554483642578125,
"step": 36300
},
{
"epoch": 0.07444483811575711,
"grad_norm": 1.0531319379806519,
"learning_rate": 0.0002961213931302483,
"loss": 3.9057122802734376,
"step": 36350
},
{
"epoch": 0.07454723816818594,
"grad_norm": 1.0585157871246338,
"learning_rate": 0.00029611046074640835,
"loss": 4.065590209960938,
"step": 36400
},
{
"epoch": 0.07464963822061477,
"grad_norm": 0.9923078417778015,
"learning_rate": 0.00029609951317941067,
"loss": 3.753091125488281,
"step": 36450
},
{
"epoch": 0.0747520382730436,
"grad_norm": 1.4187533855438232,
"learning_rate": 0.0002960885504303928,
"loss": 3.81512939453125,
"step": 36500
},
{
"epoch": 0.07485443832547242,
"grad_norm": 0.9602039456367493,
"learning_rate": 0.000296077572500494,
"loss": 3.3714874267578123,
"step": 36550
},
{
"epoch": 0.07495683837790125,
"grad_norm": 0.9583538770675659,
"learning_rate": 0.000296066579390855,
"loss": 4.172694702148437,
"step": 36600
},
{
"epoch": 0.07505923843033008,
"grad_norm": 0.9498001933097839,
"learning_rate": 0.0002960555711026182,
"loss": 3.799460144042969,
"step": 36650
},
{
"epoch": 0.0751616384827589,
"grad_norm": 1.037429928779602,
"learning_rate": 0.00029604454763692753,
"loss": 3.3060308837890626,
"step": 36700
},
{
"epoch": 0.07526403853518773,
"grad_norm": 0.9222440123558044,
"learning_rate": 0.0002960335089949284,
"loss": 3.724703063964844,
"step": 36750
},
{
"epoch": 0.07536643858761656,
"grad_norm": 0.891686201095581,
"learning_rate": 0.0002960224551777681,
"loss": 3.8415121459960937,
"step": 36800
},
{
"epoch": 0.07546883864004539,
"grad_norm": 1.1739381551742554,
"learning_rate": 0.0002960113861865951,
"loss": 3.6421640014648435,
"step": 36850
},
{
"epoch": 0.07557123869247422,
"grad_norm": 1.118273138999939,
"learning_rate": 0.0002960003020225598,
"loss": 4.042873229980469,
"step": 36900
},
{
"epoch": 0.07567363874490304,
"grad_norm": 1.1903246641159058,
"learning_rate": 0.00029598920268681387,
"loss": 3.7228439331054686,
"step": 36950
},
{
"epoch": 0.07577603879733187,
"grad_norm": 0.8074274063110352,
"learning_rate": 0.00029597808818051076,
"loss": 3.74279296875,
"step": 37000
},
{
"epoch": 0.07587843884976068,
"grad_norm": 0.7993521690368652,
"learning_rate": 0.00029596695850480547,
"loss": 3.5909658813476564,
"step": 37050
},
{
"epoch": 0.07598083890218951,
"grad_norm": 0.9763518571853638,
"learning_rate": 0.0002959558136608545,
"loss": 3.4760845947265624,
"step": 37100
},
{
"epoch": 0.07608323895461834,
"grad_norm": 0.9700019359588623,
"learning_rate": 0.000295944653649816,
"loss": 3.5362197875976564,
"step": 37150
},
{
"epoch": 0.07618563900704717,
"grad_norm": 0.9611456990242004,
"learning_rate": 0.0002959334784728497,
"loss": 3.392528381347656,
"step": 37200
},
{
"epoch": 0.076288039059476,
"grad_norm": 1.0106040239334106,
"learning_rate": 0.0002959222881311168,
"loss": 3.9602230834960936,
"step": 37250
},
{
"epoch": 0.07639043911190482,
"grad_norm": 0.9530378580093384,
"learning_rate": 0.00029591108262578023,
"loss": 3.755385437011719,
"step": 37300
},
{
"epoch": 0.07649283916433365,
"grad_norm": 1.2167539596557617,
"learning_rate": 0.0002958998619580044,
"loss": 3.5471917724609376,
"step": 37350
},
{
"epoch": 0.07659523921676248,
"grad_norm": 0.6693082451820374,
"learning_rate": 0.0002958886261289553,
"loss": 2.953871154785156,
"step": 37400
},
{
"epoch": 0.0766976392691913,
"grad_norm": 0.8044131398200989,
"learning_rate": 0.0002958773751398004,
"loss": 3.6775543212890627,
"step": 37450
},
{
"epoch": 0.07680003932162013,
"grad_norm": 0.9389724731445312,
"learning_rate": 0.00029586610899170904,
"loss": 3.951288757324219,
"step": 37500
},
{
"epoch": 0.07690243937404896,
"grad_norm": 0.9143916964530945,
"learning_rate": 0.0002958548276858519,
"loss": 3.773663330078125,
"step": 37550
},
{
"epoch": 0.07700483942647779,
"grad_norm": 1.3429774045944214,
"learning_rate": 0.0002958435312234012,
"loss": 3.516551818847656,
"step": 37600
},
{
"epoch": 0.07710723947890662,
"grad_norm": 0.9084817171096802,
"learning_rate": 0.00029583221960553086,
"loss": 3.5966671752929686,
"step": 37650
},
{
"epoch": 0.07720963953133544,
"grad_norm": 0.9403077363967896,
"learning_rate": 0.0002958208928334164,
"loss": 3.7960610961914063,
"step": 37700
},
{
"epoch": 0.07731203958376427,
"grad_norm": 0.9307132363319397,
"learning_rate": 0.0002958095509082347,
"loss": 3.526631164550781,
"step": 37750
},
{
"epoch": 0.0774144396361931,
"grad_norm": 1.0403499603271484,
"learning_rate": 0.0002957981938311645,
"loss": 3.657856140136719,
"step": 37800
},
{
"epoch": 0.07751683968862193,
"grad_norm": 0.8535223007202148,
"learning_rate": 0.00029578682160338594,
"loss": 3.4064453125,
"step": 37850
},
{
"epoch": 0.07761923974105074,
"grad_norm": 0.6557066440582275,
"learning_rate": 0.00029577543422608073,
"loss": 3.3173226928710937,
"step": 37900
},
{
"epoch": 0.07772163979347957,
"grad_norm": 1.1363869905471802,
"learning_rate": 0.0002957640317004323,
"loss": 3.3008172607421873,
"step": 37950
},
{
"epoch": 0.0778240398459084,
"grad_norm": 0.9868215322494507,
"learning_rate": 0.0002957526140276254,
"loss": 3.6425216674804686,
"step": 38000
},
{
"epoch": 0.07792643989833722,
"grad_norm": 0.6904926896095276,
"learning_rate": 0.00029574118120884657,
"loss": 3.5323916625976564,
"step": 38050
},
{
"epoch": 0.07802883995076605,
"grad_norm": 1.4620712995529175,
"learning_rate": 0.00029572973324528394,
"loss": 3.751639709472656,
"step": 38100
},
{
"epoch": 0.07813124000319488,
"grad_norm": 0.9893333315849304,
"learning_rate": 0.000295718270138127,
"loss": 3.4107330322265623,
"step": 38150
},
{
"epoch": 0.07823364005562371,
"grad_norm": 0.8329883217811584,
"learning_rate": 0.00029570679188856705,
"loss": 3.1873550415039062,
"step": 38200
},
{
"epoch": 0.07833604010805253,
"grad_norm": 1.5774136781692505,
"learning_rate": 0.00029569529849779685,
"loss": 3.432158203125,
"step": 38250
},
{
"epoch": 0.07843844016048136,
"grad_norm": 0.8458206057548523,
"learning_rate": 0.0002956837899670107,
"loss": 3.302817077636719,
"step": 38300
},
{
"epoch": 0.07854084021291019,
"grad_norm": 1.3035576343536377,
"learning_rate": 0.00029567226629740445,
"loss": 3.1465521240234375,
"step": 38350
},
{
"epoch": 0.07864324026533902,
"grad_norm": 0.9802455902099609,
"learning_rate": 0.00029566072749017574,
"loss": 3.3001138305664064,
"step": 38400
},
{
"epoch": 0.07874564031776785,
"grad_norm": 0.9335483312606812,
"learning_rate": 0.00029564917354652355,
"loss": 3.2266500854492186,
"step": 38450
},
{
"epoch": 0.07884804037019667,
"grad_norm": 1.0493320226669312,
"learning_rate": 0.0002956376044676485,
"loss": 3.0587277221679687,
"step": 38500
},
{
"epoch": 0.0789504404226255,
"grad_norm": 1.096993088722229,
"learning_rate": 0.00029562602025475285,
"loss": 4.07334716796875,
"step": 38550
},
{
"epoch": 0.07905284047505433,
"grad_norm": 0.7419833540916443,
"learning_rate": 0.0002956144209090403,
"loss": 3.633370056152344,
"step": 38600
},
{
"epoch": 0.07915524052748316,
"grad_norm": 1.1980725526809692,
"learning_rate": 0.00029560280643171633,
"loss": 3.7685275268554688,
"step": 38650
},
{
"epoch": 0.07925764057991198,
"grad_norm": 2.977545738220215,
"learning_rate": 0.00029559117682398774,
"loss": 3.9755072021484374,
"step": 38700
},
{
"epoch": 0.0793600406323408,
"grad_norm": 1.0279217958450317,
"learning_rate": 0.0002955795320870631,
"loss": 3.4383935546875,
"step": 38750
},
{
"epoch": 0.07946244068476963,
"grad_norm": 0.9934809803962708,
"learning_rate": 0.00029556787222215247,
"loss": 3.663726501464844,
"step": 38800
},
{
"epoch": 0.07956484073719845,
"grad_norm": 1.145448088645935,
"learning_rate": 0.00029555619723046746,
"loss": 3.51242431640625,
"step": 38850
},
{
"epoch": 0.07966724078962728,
"grad_norm": 0.6591020226478577,
"learning_rate": 0.00029554450711322133,
"loss": 3.402906494140625,
"step": 38900
},
{
"epoch": 0.07976964084205611,
"grad_norm": 0.9392556548118591,
"learning_rate": 0.00029553280187162876,
"loss": 3.1334713745117186,
"step": 38950
},
{
"epoch": 0.07987204089448494,
"grad_norm": 1.0616618394851685,
"learning_rate": 0.0002955210815069063,
"loss": 3.934781494140625,
"step": 39000
},
{
"epoch": 0.07997444094691376,
"grad_norm": 0.7064653038978577,
"learning_rate": 0.0002955093460202717,
"loss": 3.5139471435546876,
"step": 39050
},
{
"epoch": 0.08007684099934259,
"grad_norm": 0.8327042460441589,
"learning_rate": 0.0002954975954129445,
"loss": 3.4110308837890626,
"step": 39100
},
{
"epoch": 0.08017924105177142,
"grad_norm": 1.7272720336914062,
"learning_rate": 0.0002954858296861459,
"loss": 3.403736877441406,
"step": 39150
},
{
"epoch": 0.08028164110420025,
"grad_norm": 0.7907924056053162,
"learning_rate": 0.00029547404884109837,
"loss": 2.493211212158203,
"step": 39200
},
{
"epoch": 0.08038404115662907,
"grad_norm": 0.8305758237838745,
"learning_rate": 0.00029546225287902623,
"loss": 3.276422119140625,
"step": 39250
},
{
"epoch": 0.0804864412090579,
"grad_norm": 1.02776300907135,
"learning_rate": 0.0002954504418011552,
"loss": 3.471695861816406,
"step": 39300
},
{
"epoch": 0.08058884126148673,
"grad_norm": 1.1195554733276367,
"learning_rate": 0.0002954386156087127,
"loss": 3.8007080078125,
"step": 39350
},
{
"epoch": 0.08069124131391556,
"grad_norm": 1.0131675004959106,
"learning_rate": 0.00029542677430292755,
"loss": 3.80172119140625,
"step": 39400
},
{
"epoch": 0.08079364136634438,
"grad_norm": 1.0135658979415894,
"learning_rate": 0.0002954149178850304,
"loss": 3.3349169921875,
"step": 39450
},
{
"epoch": 0.08089604141877321,
"grad_norm": 0.9203832149505615,
"learning_rate": 0.00029540304635625316,
"loss": 3.532286376953125,
"step": 39500
},
{
"epoch": 0.08099844147120204,
"grad_norm": 0.9541352987289429,
"learning_rate": 0.0002953911597178296,
"loss": 3.9118218994140626,
"step": 39550
},
{
"epoch": 0.08110084152363085,
"grad_norm": 0.8768864870071411,
"learning_rate": 0.0002953792579709948,
"loss": 2.9628286743164063,
"step": 39600
},
{
"epoch": 0.08120324157605968,
"grad_norm": 0.9336755275726318,
"learning_rate": 0.00029536734111698567,
"loss": 3.8077597045898437,
"step": 39650
},
{
"epoch": 0.08130564162848851,
"grad_norm": 0.7618170380592346,
"learning_rate": 0.00029535540915704046,
"loss": 3.6045367431640627,
"step": 39700
},
{
"epoch": 0.08140804168091734,
"grad_norm": 0.7901123762130737,
"learning_rate": 0.0002953434620923991,
"loss": 3.8127349853515624,
"step": 39750
},
{
"epoch": 0.08151044173334616,
"grad_norm": 0.90858393907547,
"learning_rate": 0.0002953314999243032,
"loss": 3.246180419921875,
"step": 39800
},
{
"epoch": 0.08161284178577499,
"grad_norm": 1.1294665336608887,
"learning_rate": 0.00029531952265399565,
"loss": 3.8714260864257812,
"step": 39850
},
{
"epoch": 0.08171524183820382,
"grad_norm": 0.8707161545753479,
"learning_rate": 0.0002953075302827211,
"loss": 3.1378076171875,
"step": 39900
},
{
"epoch": 0.08181764189063265,
"grad_norm": 0.9953368902206421,
"learning_rate": 0.0002952955228117258,
"loss": 3.7785629272460937,
"step": 39950
},
{
"epoch": 0.08192004194306148,
"grad_norm": 1.116952657699585,
"learning_rate": 0.00029528350024225753,
"loss": 3.962169494628906,
"step": 40000
},
{
"epoch": 0.0820224419954903,
"grad_norm": 1.140546202659607,
"learning_rate": 0.0002952714625755656,
"loss": 4.067502136230469,
"step": 40050
},
{
"epoch": 0.08212484204791913,
"grad_norm": 0.8150861859321594,
"learning_rate": 0.0002952594098129008,
"loss": 3.7707586669921875,
"step": 40100
},
{
"epoch": 0.08222724210034796,
"grad_norm": 1.0610780715942383,
"learning_rate": 0.00029524734195551577,
"loss": 3.650087890625,
"step": 40150
},
{
"epoch": 0.08232964215277679,
"grad_norm": 1.0550084114074707,
"learning_rate": 0.00029523525900466453,
"loss": 3.259920959472656,
"step": 40200
},
{
"epoch": 0.08243204220520561,
"grad_norm": 0.9846071004867554,
"learning_rate": 0.00029522316096160256,
"loss": 3.823460998535156,
"step": 40250
},
{
"epoch": 0.08253444225763444,
"grad_norm": 0.932036280632019,
"learning_rate": 0.00029521104782758714,
"loss": 3.7148446655273437,
"step": 40300
},
{
"epoch": 0.08263684231006327,
"grad_norm": 0.7645614743232727,
"learning_rate": 0.00029519891960387703,
"loss": 2.9853546142578127,
"step": 40350
},
{
"epoch": 0.0827392423624921,
"grad_norm": 0.9208267331123352,
"learning_rate": 0.00029518677629173246,
"loss": 3.2360791015625,
"step": 40400
},
{
"epoch": 0.08284164241492091,
"grad_norm": 0.6297294497489929,
"learning_rate": 0.0002951746178924153,
"loss": 3.471868896484375,
"step": 40450
},
{
"epoch": 0.08294404246734974,
"grad_norm": 0.7016891241073608,
"learning_rate": 0.0002951624444071891,
"loss": 1.757879180908203,
"step": 40500
},
{
"epoch": 0.08304644251977857,
"grad_norm": 0.9499281048774719,
"learning_rate": 0.00029515025583731877,
"loss": 3.28075439453125,
"step": 40550
},
{
"epoch": 0.0831488425722074,
"grad_norm": 1.3156087398529053,
"learning_rate": 0.00029513805218407105,
"loss": 3.5208966064453127,
"step": 40600
},
{
"epoch": 0.08325124262463622,
"grad_norm": 0.9431557059288025,
"learning_rate": 0.00029512583344871383,
"loss": 3.794385986328125,
"step": 40650
},
{
"epoch": 0.08335364267706505,
"grad_norm": 0.9936553835868835,
"learning_rate": 0.0002951135996325171,
"loss": 3.725905456542969,
"step": 40700
},
{
"epoch": 0.08345604272949388,
"grad_norm": 0.9810524582862854,
"learning_rate": 0.00029510135073675196,
"loss": 3.7150784301757813,
"step": 40750
},
{
"epoch": 0.0835584427819227,
"grad_norm": 0.9625670313835144,
"learning_rate": 0.0002950890867626914,
"loss": 3.2056814575195314,
"step": 40800
},
{
"epoch": 0.08366084283435153,
"grad_norm": 0.8976542949676514,
"learning_rate": 0.0002950768077116097,
"loss": 3.6317849731445313,
"step": 40850
},
{
"epoch": 0.08376324288678036,
"grad_norm": 0.835045576095581,
"learning_rate": 0.00029506451358478293,
"loss": 3.50963623046875,
"step": 40900
},
{
"epoch": 0.08386564293920919,
"grad_norm": 0.9474772810935974,
"learning_rate": 0.0002950522043834886,
"loss": 3.357117919921875,
"step": 40950
},
{
"epoch": 0.08396804299163801,
"grad_norm": 0.9703244566917419,
"learning_rate": 0.0002950398801090059,
"loss": 3.6155599975585937,
"step": 41000
},
{
"epoch": 0.08407044304406684,
"grad_norm": 0.9425392746925354,
"learning_rate": 0.0002950275407626154,
"loss": 3.6346676635742186,
"step": 41050
},
{
"epoch": 0.08417284309649567,
"grad_norm": 0.8194516897201538,
"learning_rate": 0.00029501518634559947,
"loss": 3.578563232421875,
"step": 41100
},
{
"epoch": 0.0842752431489245,
"grad_norm": 0.9572771191596985,
"learning_rate": 0.00029500281685924186,
"loss": 3.8818609619140627,
"step": 41150
},
{
"epoch": 0.08437764320135333,
"grad_norm": 0.9095619320869446,
"learning_rate": 0.0002949904323048279,
"loss": 4.035207214355469,
"step": 41200
},
{
"epoch": 0.08448004325378215,
"grad_norm": 1.0001695156097412,
"learning_rate": 0.0002949780326836447,
"loss": 3.6542062377929687,
"step": 41250
},
{
"epoch": 0.08458244330621097,
"grad_norm": 0.7178096175193787,
"learning_rate": 0.00029496561799698064,
"loss": 4.071335754394531,
"step": 41300
},
{
"epoch": 0.0846848433586398,
"grad_norm": 1.246907114982605,
"learning_rate": 0.0002949531882461258,
"loss": 3.1734967041015625,
"step": 41350
},
{
"epoch": 0.08478724341106862,
"grad_norm": 0.930445671081543,
"learning_rate": 0.0002949407434323719,
"loss": 3.6632540893554686,
"step": 41400
},
{
"epoch": 0.08488964346349745,
"grad_norm": 1.0094538927078247,
"learning_rate": 0.0002949282835570121,
"loss": 3.3916510009765624,
"step": 41450
},
{
"epoch": 0.08499204351592628,
"grad_norm": 0.8062929511070251,
"learning_rate": 0.0002949158086213412,
"loss": 3.894981689453125,
"step": 41500
},
{
"epoch": 0.0850944435683551,
"grad_norm": 0.9255414009094238,
"learning_rate": 0.0002949033186266555,
"loss": 2.9597015380859375,
"step": 41550
},
{
"epoch": 0.08519684362078393,
"grad_norm": 1.0212881565093994,
"learning_rate": 0.00029489081357425296,
"loss": 3.641199951171875,
"step": 41600
},
{
"epoch": 0.08529924367321276,
"grad_norm": 1.0403817892074585,
"learning_rate": 0.000294878293465433,
"loss": 3.671487121582031,
"step": 41650
},
{
"epoch": 0.08540164372564159,
"grad_norm": 1.1286342144012451,
"learning_rate": 0.0002948657583014967,
"loss": 3.9286517333984374,
"step": 41700
},
{
"epoch": 0.08550404377807042,
"grad_norm": 0.8692294359207153,
"learning_rate": 0.00029485320808374666,
"loss": 3.6199725341796873,
"step": 41750
},
{
"epoch": 0.08560644383049924,
"grad_norm": 0.9791249632835388,
"learning_rate": 0.000294840642813487,
"loss": 4.0375192260742185,
"step": 41800
},
{
"epoch": 0.08570884388292807,
"grad_norm": 0.9460155367851257,
"learning_rate": 0.0002948280624920234,
"loss": 3.7891888427734375,
"step": 41850
},
{
"epoch": 0.0858112439353569,
"grad_norm": 0.9373695254325867,
"learning_rate": 0.0002948154671206633,
"loss": 3.796607666015625,
"step": 41900
},
{
"epoch": 0.08591364398778573,
"grad_norm": 0.9017443656921387,
"learning_rate": 0.0002948028567007155,
"loss": 3.572817077636719,
"step": 41950
},
{
"epoch": 0.08601604404021455,
"grad_norm": 1.0376108884811401,
"learning_rate": 0.0002947902312334904,
"loss": 3.006630859375,
"step": 42000
},
{
"epoch": 0.08611844409264338,
"grad_norm": 1.2481657266616821,
"learning_rate": 0.00029477759072029985,
"loss": 3.8225225830078124,
"step": 42050
},
{
"epoch": 0.08622084414507221,
"grad_norm": 0.9803751707077026,
"learning_rate": 0.00029476493516245766,
"loss": 3.7674041748046876,
"step": 42100
},
{
"epoch": 0.08632324419750102,
"grad_norm": 0.6287221908569336,
"learning_rate": 0.00029475226456127877,
"loss": 3.707611083984375,
"step": 42150
},
{
"epoch": 0.08642564424992985,
"grad_norm": 0.8981334567070007,
"learning_rate": 0.00029473957891807984,
"loss": 3.517123107910156,
"step": 42200
},
{
"epoch": 0.08652804430235868,
"grad_norm": 0.8311030864715576,
"learning_rate": 0.0002947268782341792,
"loss": 3.3404605102539064,
"step": 42250
},
{
"epoch": 0.0866304443547875,
"grad_norm": 0.9770768284797668,
"learning_rate": 0.00029471416251089657,
"loss": 3.50871337890625,
"step": 42300
},
{
"epoch": 0.08673284440721633,
"grad_norm": 0.9695789813995361,
"learning_rate": 0.0002947014317495534,
"loss": 2.9593045043945314,
"step": 42350
},
{
"epoch": 0.08683524445964516,
"grad_norm": 1.279931902885437,
"learning_rate": 0.0002946886859514726,
"loss": 3.4032522583007814,
"step": 42400
},
{
"epoch": 0.08693764451207399,
"grad_norm": 0.846924364566803,
"learning_rate": 0.00029467592511797853,
"loss": 3.4247207641601562,
"step": 42450
},
{
"epoch": 0.08704004456450282,
"grad_norm": 0.9505274891853333,
"learning_rate": 0.0002946631492503974,
"loss": 3.43554931640625,
"step": 42500
},
{
"epoch": 0.08714244461693164,
"grad_norm": 0.8762588500976562,
"learning_rate": 0.00029465035835005664,
"loss": 3.82656982421875,
"step": 42550
},
{
"epoch": 0.08724484466936047,
"grad_norm": 1.11127769947052,
"learning_rate": 0.0002946375524182856,
"loss": 3.80216796875,
"step": 42600
},
{
"epoch": 0.0873472447217893,
"grad_norm": 0.989086925983429,
"learning_rate": 0.00029462473145641497,
"loss": 3.4192626953125,
"step": 42650
},
{
"epoch": 0.08744964477421813,
"grad_norm": 1.940375804901123,
"learning_rate": 0.000294611895465777,
"loss": 3.6263909912109376,
"step": 42700
},
{
"epoch": 0.08755204482664695,
"grad_norm": 0.9889708161354065,
"learning_rate": 0.0002945990444477056,
"loss": 3.809459533691406,
"step": 42750
},
{
"epoch": 0.08765444487907578,
"grad_norm": 0.7751711010932922,
"learning_rate": 0.0002945861784035362,
"loss": 3.572983703613281,
"step": 42800
},
{
"epoch": 0.08775684493150461,
"grad_norm": 0.9419236779212952,
"learning_rate": 0.0002945732973346057,
"loss": 2.524838714599609,
"step": 42850
},
{
"epoch": 0.08785924498393344,
"grad_norm": 0.8961177468299866,
"learning_rate": 0.0002945604012422527,
"loss": 3.410054016113281,
"step": 42900
},
{
"epoch": 0.08796164503636227,
"grad_norm": 1.059244155883789,
"learning_rate": 0.00029454749012781733,
"loss": 3.40218994140625,
"step": 42950
},
{
"epoch": 0.08806404508879108,
"grad_norm": 1.054032325744629,
"learning_rate": 0.0002945345639926412,
"loss": 3.8609942626953124,
"step": 43000
},
{
"epoch": 0.0881664451412199,
"grad_norm": 1.5438601970672607,
"learning_rate": 0.00029452162283806764,
"loss": 3.6072705078125,
"step": 43050
},
{
"epoch": 0.08826884519364873,
"grad_norm": 1.3585573434829712,
"learning_rate": 0.0002945086666654413,
"loss": 2.9131259155273437,
"step": 43100
},
{
"epoch": 0.08837124524607756,
"grad_norm": 0.9829738736152649,
"learning_rate": 0.0002944956954761086,
"loss": 3.900540771484375,
"step": 43150
},
{
"epoch": 0.08847364529850639,
"grad_norm": 0.8328433036804199,
"learning_rate": 0.00029448270927141747,
"loss": 2.7704718017578127,
"step": 43200
},
{
"epoch": 0.08857604535093522,
"grad_norm": 0.9175614714622498,
"learning_rate": 0.0002944697080527173,
"loss": 3.7308444213867187,
"step": 43250
},
{
"epoch": 0.08867844540336405,
"grad_norm": 0.991267204284668,
"learning_rate": 0.0002944566918213592,
"loss": 3.726257019042969,
"step": 43300
},
{
"epoch": 0.08878084545579287,
"grad_norm": 0.8164985775947571,
"learning_rate": 0.0002944436605786958,
"loss": 3.6927761840820312,
"step": 43350
},
{
"epoch": 0.0888832455082217,
"grad_norm": 0.6943197846412659,
"learning_rate": 0.00029443061432608104,
"loss": 3.4184146118164063,
"step": 43400
},
{
"epoch": 0.08898564556065053,
"grad_norm": 0.7790623307228088,
"learning_rate": 0.00029441755306487086,
"loss": 2.976038818359375,
"step": 43450
},
{
"epoch": 0.08908804561307936,
"grad_norm": 1.6038352251052856,
"learning_rate": 0.00029440447679642245,
"loss": 3.82299560546875,
"step": 43500
},
{
"epoch": 0.08919044566550818,
"grad_norm": 0.8337633013725281,
"learning_rate": 0.0002943913855220946,
"loss": 3.1569622802734374,
"step": 43550
},
{
"epoch": 0.08929284571793701,
"grad_norm": 0.7999888062477112,
"learning_rate": 0.0002943782792432477,
"loss": 1.872430877685547,
"step": 43600
},
{
"epoch": 0.08939524577036584,
"grad_norm": 1.0902249813079834,
"learning_rate": 0.00029436515796124374,
"loss": 3.7972311401367187,
"step": 43650
},
{
"epoch": 0.08949764582279467,
"grad_norm": 1.0413930416107178,
"learning_rate": 0.0002943520216774462,
"loss": 3.2716705322265627,
"step": 43700
},
{
"epoch": 0.0896000458752235,
"grad_norm": 1.2011021375656128,
"learning_rate": 0.00029433887039322017,
"loss": 3.626478271484375,
"step": 43750
},
{
"epoch": 0.08970244592765232,
"grad_norm": 0.9314827919006348,
"learning_rate": 0.00029432570410993226,
"loss": 2.8823446655273437,
"step": 43800
},
{
"epoch": 0.08980484598008114,
"grad_norm": 0.7399948835372925,
"learning_rate": 0.0002943125228289507,
"loss": 4.1010568237304685,
"step": 43850
},
{
"epoch": 0.08990724603250996,
"grad_norm": 0.8250963687896729,
"learning_rate": 0.0002942993265516451,
"loss": 3.1346914672851565,
"step": 43900
},
{
"epoch": 0.09000964608493879,
"grad_norm": 0.9391615390777588,
"learning_rate": 0.00029428611527938683,
"loss": 3.304781188964844,
"step": 43950
},
{
"epoch": 0.09011204613736762,
"grad_norm": 1.3079005479812622,
"learning_rate": 0.0002942728890135488,
"loss": 3.67544189453125,
"step": 44000
},
{
"epoch": 0.09021444618979645,
"grad_norm": 0.8453630208969116,
"learning_rate": 0.0002942596477555054,
"loss": 3.5969857788085937,
"step": 44050
},
{
"epoch": 0.09031684624222527,
"grad_norm": 0.8319234251976013,
"learning_rate": 0.0002942463915066326,
"loss": 3.4878445434570313,
"step": 44100
},
{
"epoch": 0.0904192462946541,
"grad_norm": 0.8988983035087585,
"learning_rate": 0.0002942331202683078,
"loss": 3.648201599121094,
"step": 44150
},
{
"epoch": 0.09052164634708293,
"grad_norm": 0.9342271089553833,
"learning_rate": 0.00029421983404191027,
"loss": 3.274960632324219,
"step": 44200
},
{
"epoch": 0.09062404639951176,
"grad_norm": 0.6616138219833374,
"learning_rate": 0.0002942065328288206,
"loss": 3.499600830078125,
"step": 44250
},
{
"epoch": 0.09072644645194058,
"grad_norm": 0.5895013213157654,
"learning_rate": 0.00029419321663042106,
"loss": 3.312397155761719,
"step": 44300
},
{
"epoch": 0.09082884650436941,
"grad_norm": 0.9072443246841431,
"learning_rate": 0.0002941798854480952,
"loss": 3.4259283447265627,
"step": 44350
},
{
"epoch": 0.09093124655679824,
"grad_norm": 0.9704260230064392,
"learning_rate": 0.00029416653928322854,
"loss": 3.4393576049804686,
"step": 44400
},
{
"epoch": 0.09103364660922707,
"grad_norm": 1.2480257749557495,
"learning_rate": 0.0002941531781372079,
"loss": 3.769176025390625,
"step": 44450
},
{
"epoch": 0.0911360466616559,
"grad_norm": 0.7178895473480225,
"learning_rate": 0.0002941398020114217,
"loss": 3.466965637207031,
"step": 44500
},
{
"epoch": 0.09123844671408472,
"grad_norm": 0.870692789554596,
"learning_rate": 0.0002941264109072599,
"loss": 3.033865661621094,
"step": 44550
},
{
"epoch": 0.09134084676651355,
"grad_norm": 0.8359827399253845,
"learning_rate": 0.0002941130048261141,
"loss": 2.9268179321289063,
"step": 44600
},
{
"epoch": 0.09144324681894238,
"grad_norm": 0.8175760507583618,
"learning_rate": 0.0002940995837693774,
"loss": 3.4120257568359373,
"step": 44650
},
{
"epoch": 0.09154564687137119,
"grad_norm": 0.9331879615783691,
"learning_rate": 0.00029408614773844435,
"loss": 3.2513809204101562,
"step": 44700
},
{
"epoch": 0.09164804692380002,
"grad_norm": 1.0784333944320679,
"learning_rate": 0.0002940726967347113,
"loss": 3.6439849853515627,
"step": 44750
},
{
"epoch": 0.09175044697622885,
"grad_norm": 1.1995110511779785,
"learning_rate": 0.000294059230759576,
"loss": 3.4360501098632814,
"step": 44800
},
{
"epoch": 0.09185284702865767,
"grad_norm": 1.0588244199752808,
"learning_rate": 0.0002940457498144377,
"loss": 3.7745849609375,
"step": 44850
},
{
"epoch": 0.0919552470810865,
"grad_norm": 0.8767450451850891,
"learning_rate": 0.0002940322539006973,
"loss": 3.844217529296875,
"step": 44900
},
{
"epoch": 0.09205764713351533,
"grad_norm": 1.0808109045028687,
"learning_rate": 0.00029401874301975727,
"loss": 3.6274505615234376,
"step": 44950
},
{
"epoch": 0.09216004718594416,
"grad_norm": 0.7221155762672424,
"learning_rate": 0.00029400521717302166,
"loss": 3.7380535888671873,
"step": 45000
},
{
"epoch": 0.09226244723837299,
"grad_norm": 0.847489595413208,
"learning_rate": 0.0002939916763618958,
"loss": 3.911776428222656,
"step": 45050
},
{
"epoch": 0.09236484729080181,
"grad_norm": 0.9442451596260071,
"learning_rate": 0.00029397812058778707,
"loss": 3.614713439941406,
"step": 45100
},
{
"epoch": 0.09246724734323064,
"grad_norm": 0.8224995136260986,
"learning_rate": 0.0002939645498521039,
"loss": 2.9758087158203126,
"step": 45150
},
{
"epoch": 0.09256964739565947,
"grad_norm": 0.5964682698249817,
"learning_rate": 0.0002939509641562567,
"loss": 1.742069854736328,
"step": 45200
},
{
"epoch": 0.0926720474480883,
"grad_norm": 0.5689894556999207,
"learning_rate": 0.000293937363501657,
"loss": 2.5378482055664064,
"step": 45250
},
{
"epoch": 0.09277444750051712,
"grad_norm": 1.1072094440460205,
"learning_rate": 0.00029392374788971833,
"loss": 3.6468490600585937,
"step": 45300
},
{
"epoch": 0.09287684755294595,
"grad_norm": 0.8903659582138062,
"learning_rate": 0.0002939101173218555,
"loss": 3.3196234130859374,
"step": 45350
},
{
"epoch": 0.09297924760537478,
"grad_norm": 0.7800249457359314,
"learning_rate": 0.0002938964717994849,
"loss": 3.895177001953125,
"step": 45400
},
{
"epoch": 0.0930816476578036,
"grad_norm": 0.8999978303909302,
"learning_rate": 0.00029388281132402454,
"loss": 3.9075274658203125,
"step": 45450
},
{
"epoch": 0.09318404771023243,
"grad_norm": 0.8825941681861877,
"learning_rate": 0.00029386913589689393,
"loss": 3.787184753417969,
"step": 45500
},
{
"epoch": 0.09328644776266125,
"grad_norm": 0.8387885689735413,
"learning_rate": 0.0002938554455195142,
"loss": 3.0614547729492188,
"step": 45550
},
{
"epoch": 0.09338884781509008,
"grad_norm": 0.8926045894622803,
"learning_rate": 0.000293841740193308,
"loss": 3.60509765625,
"step": 45600
},
{
"epoch": 0.0934912478675189,
"grad_norm": 0.8237414956092834,
"learning_rate": 0.00029382801991969945,
"loss": 3.7173092651367186,
"step": 45650
},
{
"epoch": 0.09359364791994773,
"grad_norm": 0.9613683223724365,
"learning_rate": 0.0002938142847001144,
"loss": 3.6430303955078127,
"step": 45700
},
{
"epoch": 0.09369604797237656,
"grad_norm": 0.6676517724990845,
"learning_rate": 0.0002938005345359801,
"loss": 3.367287292480469,
"step": 45750
},
{
"epoch": 0.09379844802480539,
"grad_norm": 1.0207992792129517,
"learning_rate": 0.0002937867694287254,
"loss": 3.720477294921875,
"step": 45800
},
{
"epoch": 0.09390084807723421,
"grad_norm": 1.102271318435669,
"learning_rate": 0.00029377298937978077,
"loss": 3.620904846191406,
"step": 45850
},
{
"epoch": 0.09400324812966304,
"grad_norm": 0.7163046598434448,
"learning_rate": 0.0002937591943905781,
"loss": 2.606007080078125,
"step": 45900
},
{
"epoch": 0.09410564818209187,
"grad_norm": 0.4335630238056183,
"learning_rate": 0.0002937453844625509,
"loss": 1.4070957946777343,
"step": 45950
},
{
"epoch": 0.0942080482345207,
"grad_norm": 0.6361094117164612,
"learning_rate": 0.0002937315595971343,
"loss": 3.254652099609375,
"step": 46000
},
{
"epoch": 0.09431044828694952,
"grad_norm": 0.8255375027656555,
"learning_rate": 0.0002937177197957649,
"loss": 3.422788391113281,
"step": 46050
},
{
"epoch": 0.09441284833937835,
"grad_norm": 0.5633572340011597,
"learning_rate": 0.0002937038650598809,
"loss": 3.6044976806640623,
"step": 46100
},
{
"epoch": 0.09451524839180718,
"grad_norm": 1.2230876684188843,
"learning_rate": 0.0002936899953909219,
"loss": 3.22015869140625,
"step": 46150
},
{
"epoch": 0.09461764844423601,
"grad_norm": 0.8899142742156982,
"learning_rate": 0.0002936761107903293,
"loss": 4.0206103515625,
"step": 46200
},
{
"epoch": 0.09472004849666484,
"grad_norm": 0.9843432307243347,
"learning_rate": 0.00029366221125954586,
"loss": 3.70310791015625,
"step": 46250
},
{
"epoch": 0.09482244854909366,
"grad_norm": 0.9883196353912354,
"learning_rate": 0.000293648296800016,
"loss": 3.84428466796875,
"step": 46300
},
{
"epoch": 0.09492484860152249,
"grad_norm": 0.8828408718109131,
"learning_rate": 0.0002936343674131856,
"loss": 3.496847839355469,
"step": 46350
},
{
"epoch": 0.0950272486539513,
"grad_norm": 0.6513479351997375,
"learning_rate": 0.0002936204231005023,
"loss": 3.3328936767578123,
"step": 46400
},
{
"epoch": 0.09512964870638013,
"grad_norm": 0.9128335118293762,
"learning_rate": 0.0002936064638634149,
"loss": 2.726371154785156,
"step": 46450
},
{
"epoch": 0.09523204875880896,
"grad_norm": 0.9786936044692993,
"learning_rate": 0.00029359248970337406,
"loss": 3.190602111816406,
"step": 46500
},
{
"epoch": 0.09533444881123779,
"grad_norm": 0.8290608525276184,
"learning_rate": 0.00029357850062183203,
"loss": 3.8881295776367186,
"step": 46550
},
{
"epoch": 0.09543684886366662,
"grad_norm": 0.9058592319488525,
"learning_rate": 0.0002935644966202424,
"loss": 3.583518371582031,
"step": 46600
},
{
"epoch": 0.09553924891609544,
"grad_norm": 1.0789927244186401,
"learning_rate": 0.00029355047770006034,
"loss": 3.5978643798828127,
"step": 46650
},
{
"epoch": 0.09564164896852427,
"grad_norm": 0.9313961863517761,
"learning_rate": 0.00029353644386274273,
"loss": 3.6306307983398436,
"step": 46700
},
{
"epoch": 0.0957440490209531,
"grad_norm": 0.9315224289894104,
"learning_rate": 0.00029352239510974787,
"loss": 3.5369802856445314,
"step": 46750
},
{
"epoch": 0.09584644907338193,
"grad_norm": 0.8773080110549927,
"learning_rate": 0.0002935083314425357,
"loss": 3.766584777832031,
"step": 46800
},
{
"epoch": 0.09594884912581075,
"grad_norm": 0.8457773923873901,
"learning_rate": 0.00029349425286256763,
"loss": 3.876020812988281,
"step": 46850
},
{
"epoch": 0.09605124917823958,
"grad_norm": 0.9530948400497437,
"learning_rate": 0.00029348015937130656,
"loss": 3.862485046386719,
"step": 46900
},
{
"epoch": 0.09615364923066841,
"grad_norm": 1.1303527355194092,
"learning_rate": 0.0002934660509702171,
"loss": 2.8852374267578127,
"step": 46950
},
{
"epoch": 0.09625604928309724,
"grad_norm": 1.0009031295776367,
"learning_rate": 0.0002934519276607653,
"loss": 3.7352252197265625,
"step": 47000
},
{
"epoch": 0.09635844933552606,
"grad_norm": 0.5930376052856445,
"learning_rate": 0.00029343778944441887,
"loss": 2.9531982421875,
"step": 47050
},
{
"epoch": 0.09646084938795489,
"grad_norm": 0.8898753523826599,
"learning_rate": 0.0002934236363226469,
"loss": 3.4945404052734377,
"step": 47100
},
{
"epoch": 0.09656324944038372,
"grad_norm": 1.1294547319412231,
"learning_rate": 0.00029340946829692013,
"loss": 3.6753500366210936,
"step": 47150
},
{
"epoch": 0.09666564949281255,
"grad_norm": 1.0108319520950317,
"learning_rate": 0.00029339528536871087,
"loss": 3.531564025878906,
"step": 47200
},
{
"epoch": 0.09676804954524136,
"grad_norm": 1.2252336740493774,
"learning_rate": 0.00029338108753949296,
"loss": 3.4618963623046874,
"step": 47250
},
{
"epoch": 0.09687044959767019,
"grad_norm": 0.975235641002655,
"learning_rate": 0.0002933668748107418,
"loss": 3.814194641113281,
"step": 47300
},
{
"epoch": 0.09697284965009902,
"grad_norm": 0.9449312090873718,
"learning_rate": 0.00029335264718393424,
"loss": 3.584350891113281,
"step": 47350
},
{
"epoch": 0.09707524970252784,
"grad_norm": 0.677931010723114,
"learning_rate": 0.00029333840466054875,
"loss": 3.3688113403320314,
"step": 47400
},
{
"epoch": 0.09717764975495667,
"grad_norm": 0.8441532254219055,
"learning_rate": 0.0002933241472420654,
"loss": 3.5264968872070312,
"step": 47450
},
{
"epoch": 0.0972800498073855,
"grad_norm": 1.312727451324463,
"learning_rate": 0.0002933098749299657,
"loss": 3.4481561279296873,
"step": 47500
},
{
"epoch": 0.09738244985981433,
"grad_norm": 1.3327820301055908,
"learning_rate": 0.0002932955877257329,
"loss": 3.394440002441406,
"step": 47550
},
{
"epoch": 0.09748484991224315,
"grad_norm": 0.9248800277709961,
"learning_rate": 0.00029328128563085154,
"loss": 3.8233456420898437,
"step": 47600
},
{
"epoch": 0.09758724996467198,
"grad_norm": 0.8401134014129639,
"learning_rate": 0.00029326696864680787,
"loss": 3.518874206542969,
"step": 47650
},
{
"epoch": 0.09768965001710081,
"grad_norm": 1.0612273216247559,
"learning_rate": 0.0002932526367750896,
"loss": 3.5591195678710936,
"step": 47700
},
{
"epoch": 0.09779205006952964,
"grad_norm": 0.7437798380851746,
"learning_rate": 0.00029323829001718613,
"loss": 3.0408529663085937,
"step": 47750
},
{
"epoch": 0.09789445012195847,
"grad_norm": 0.8572849631309509,
"learning_rate": 0.0002932239283745882,
"loss": 3.37136474609375,
"step": 47800
},
{
"epoch": 0.09799685017438729,
"grad_norm": 1.062315583229065,
"learning_rate": 0.0002932095518487883,
"loss": 3.3033380126953125,
"step": 47850
},
{
"epoch": 0.09809925022681612,
"grad_norm": 1.2052414417266846,
"learning_rate": 0.0002931951604412804,
"loss": 2.8181661987304687,
"step": 47900
},
{
"epoch": 0.09820165027924495,
"grad_norm": 1.1409345865249634,
"learning_rate": 0.00029318075415355984,
"loss": 3.9486019897460936,
"step": 47950
},
{
"epoch": 0.09830405033167378,
"grad_norm": 1.0399415493011475,
"learning_rate": 0.0002931663329871238,
"loss": 3.3418069458007813,
"step": 48000
},
{
"epoch": 0.0984064503841026,
"grad_norm": 0.919865608215332,
"learning_rate": 0.0002931518969434708,
"loss": 3.688287658691406,
"step": 48050
},
{
"epoch": 0.09850885043653142,
"grad_norm": 0.994552493095398,
"learning_rate": 0.000293137446024101,
"loss": 3.589768371582031,
"step": 48100
},
{
"epoch": 0.09861125048896024,
"grad_norm": 0.9309687614440918,
"learning_rate": 0.00029312298023051605,
"loss": 3.7281314086914064,
"step": 48150
},
{
"epoch": 0.09871365054138907,
"grad_norm": 1.0229836702346802,
"learning_rate": 0.0002931084995642192,
"loss": 3.7916598510742188,
"step": 48200
},
{
"epoch": 0.0988160505938179,
"grad_norm": 0.7249611616134644,
"learning_rate": 0.0002930940040267152,
"loss": 3.300664367675781,
"step": 48250
},
{
"epoch": 0.09891845064624673,
"grad_norm": 1.0371336936950684,
"learning_rate": 0.0002930794936195104,
"loss": 3.6068963623046875,
"step": 48300
},
{
"epoch": 0.09902085069867556,
"grad_norm": 1.082552194595337,
"learning_rate": 0.0002930649683441126,
"loss": 3.424382629394531,
"step": 48350
},
{
"epoch": 0.09912325075110438,
"grad_norm": 1.1194738149642944,
"learning_rate": 0.0002930504282020312,
"loss": 3.5506494140625,
"step": 48400
},
{
"epoch": 0.09922565080353321,
"grad_norm": 0.8479589223861694,
"learning_rate": 0.00029303587319477715,
"loss": 3.7008261108398437,
"step": 48450
},
{
"epoch": 0.09932805085596204,
"grad_norm": 1.6099497079849243,
"learning_rate": 0.00029302130332386307,
"loss": 3.1615875244140623,
"step": 48500
},
{
"epoch": 0.09943045090839087,
"grad_norm": 0.9683935046195984,
"learning_rate": 0.00029300671859080275,
"loss": 3.681039123535156,
"step": 48550
},
{
"epoch": 0.0995328509608197,
"grad_norm": 1.2820624113082886,
"learning_rate": 0.000292992118997112,
"loss": 3.5511508178710938,
"step": 48600
},
{
"epoch": 0.09963525101324852,
"grad_norm": 0.9168037176132202,
"learning_rate": 0.00029297750454430785,
"loss": 3.657781677246094,
"step": 48650
},
{
"epoch": 0.09973765106567735,
"grad_norm": 4.044058322906494,
"learning_rate": 0.000292962875233909,
"loss": 3.3354425048828125,
"step": 48700
},
{
"epoch": 0.09984005111810618,
"grad_norm": 1.3989704847335815,
"learning_rate": 0.00029294823106743565,
"loss": 3.1698623657226563,
"step": 48750
},
{
"epoch": 0.099942451170535,
"grad_norm": 1.0335566997528076,
"learning_rate": 0.00029293357204640953,
"loss": 2.8218838500976564,
"step": 48800
},
{
"epoch": 0.10004485122296383,
"grad_norm": 0.9593238234519958,
"learning_rate": 0.00029291889817235396,
"loss": 3.712968444824219,
"step": 48850
},
{
"epoch": 0.10014725127539266,
"grad_norm": 0.8883773684501648,
"learning_rate": 0.0002929042094467938,
"loss": 3.7107012939453123,
"step": 48900
},
{
"epoch": 0.10024965132782147,
"grad_norm": 0.817356526851654,
"learning_rate": 0.00029288950587125543,
"loss": 3.442810363769531,
"step": 48950
},
{
"epoch": 0.1003520513802503,
"grad_norm": 0.7103580236434937,
"learning_rate": 0.0002928747874472667,
"loss": 3.518086242675781,
"step": 49000
},
{
"epoch": 0.10045445143267913,
"grad_norm": 0.9503324031829834,
"learning_rate": 0.0002928600541763573,
"loss": 3.3786196899414063,
"step": 49050
},
{
"epoch": 0.10055685148510796,
"grad_norm": 1.2780108451843262,
"learning_rate": 0.000292845306060058,
"loss": 3.57686767578125,
"step": 49100
},
{
"epoch": 0.10065925153753678,
"grad_norm": 0.7724966406822205,
"learning_rate": 0.0002928305430999015,
"loss": 3.4349874877929687,
"step": 49150
},
{
"epoch": 0.10076165158996561,
"grad_norm": 0.73604816198349,
"learning_rate": 0.0002928157652974219,
"loss": 2.7600396728515624,
"step": 49200
},
{
"epoch": 0.10086405164239444,
"grad_norm": 1.0744940042495728,
"learning_rate": 0.00029280097265415477,
"loss": 3.0249954223632813,
"step": 49250
},
{
"epoch": 0.10096645169482327,
"grad_norm": 1.065955400466919,
"learning_rate": 0.0002927861651716373,
"loss": 3.8623785400390624,
"step": 49300
},
{
"epoch": 0.1010688517472521,
"grad_norm": 0.9593607783317566,
"learning_rate": 0.00029277134285140833,
"loss": 3.2714468383789064,
"step": 49350
},
{
"epoch": 0.10117125179968092,
"grad_norm": 0.875238835811615,
"learning_rate": 0.00029275650569500803,
"loss": 3.738236999511719,
"step": 49400
},
{
"epoch": 0.10127365185210975,
"grad_norm": 0.6924005150794983,
"learning_rate": 0.00029274165370397827,
"loss": 3.841283874511719,
"step": 49450
},
{
"epoch": 0.10137605190453858,
"grad_norm": 0.8927252292633057,
"learning_rate": 0.00029272678687986236,
"loss": 3.7357077026367187,
"step": 49500
},
{
"epoch": 0.1014784519569674,
"grad_norm": 0.9974352121353149,
"learning_rate": 0.0002927119052242052,
"loss": 3.653075866699219,
"step": 49550
},
{
"epoch": 0.10158085200939623,
"grad_norm": 0.9928423762321472,
"learning_rate": 0.00029269700873855325,
"loss": 3.923564147949219,
"step": 49600
},
{
"epoch": 0.10168325206182506,
"grad_norm": 0.727768063545227,
"learning_rate": 0.0002926820974244544,
"loss": 3.398980712890625,
"step": 49650
},
{
"epoch": 0.10178565211425389,
"grad_norm": 0.5926229357719421,
"learning_rate": 0.00029266717128345837,
"loss": 2.8432931518554687,
"step": 49700
},
{
"epoch": 0.10188805216668272,
"grad_norm": 0.7883087992668152,
"learning_rate": 0.000292652230317116,
"loss": 3.0143255615234374,
"step": 49750
},
{
"epoch": 0.10199045221911153,
"grad_norm": 0.7131246328353882,
"learning_rate": 0.00029263727452698,
"loss": 2.733319091796875,
"step": 49800
},
{
"epoch": 0.10209285227154036,
"grad_norm": 0.6186954379081726,
"learning_rate": 0.0002926223039146045,
"loss": 2.7473519897460936,
"step": 49850
},
{
"epoch": 0.10219525232396919,
"grad_norm": 1.1395238637924194,
"learning_rate": 0.0002926073184815452,
"loss": 3.2758560180664062,
"step": 49900
},
{
"epoch": 0.10229765237639801,
"grad_norm": 0.9928449988365173,
"learning_rate": 0.0002925923182293592,
"loss": 4.000916442871094,
"step": 49950
},
{
"epoch": 0.10240005242882684,
"grad_norm": 2.3027689456939697,
"learning_rate": 0.00029257730315960547,
"loss": 3.35286865234375,
"step": 50000
},
{
"epoch": 0.10250245248125567,
"grad_norm": 0.7039443254470825,
"learning_rate": 0.0002925622732738441,
"loss": 3.695789794921875,
"step": 50050
},
{
"epoch": 0.1026048525336845,
"grad_norm": 0.8762661218643188,
"learning_rate": 0.00029254722857363706,
"loss": 3.2338931274414064,
"step": 50100
},
{
"epoch": 0.10270725258611332,
"grad_norm": 1.2041362524032593,
"learning_rate": 0.00029253216906054765,
"loss": 3.807637023925781,
"step": 50150
},
{
"epoch": 0.10280965263854215,
"grad_norm": 0.7898900508880615,
"learning_rate": 0.0002925170947361409,
"loss": 3.60753662109375,
"step": 50200
},
{
"epoch": 0.10291205269097098,
"grad_norm": 1.3529953956604004,
"learning_rate": 0.00029250200560198316,
"loss": 3.5016552734375,
"step": 50250
},
{
"epoch": 0.1030144527433998,
"grad_norm": 1.0065248012542725,
"learning_rate": 0.00029248690165964246,
"loss": 3.634730224609375,
"step": 50300
},
{
"epoch": 0.10311685279582863,
"grad_norm": 0.8603098392486572,
"learning_rate": 0.00029247178291068836,
"loss": 3.8783328247070314,
"step": 50350
},
{
"epoch": 0.10321925284825746,
"grad_norm": 0.9008740186691284,
"learning_rate": 0.00029245664935669186,
"loss": 3.682059631347656,
"step": 50400
},
{
"epoch": 0.10332165290068629,
"grad_norm": 1.109923243522644,
"learning_rate": 0.00029244150099922567,
"loss": 3.8022805786132814,
"step": 50450
},
{
"epoch": 0.10342405295311512,
"grad_norm": 0.9621108770370483,
"learning_rate": 0.0002924263378398639,
"loss": 4.338629455566406,
"step": 50500
},
{
"epoch": 0.10352645300554394,
"grad_norm": 0.8833173513412476,
"learning_rate": 0.00029241115988018224,
"loss": 3.44856689453125,
"step": 50550
},
{
"epoch": 0.10362885305797277,
"grad_norm": 0.8892256617546082,
"learning_rate": 0.0002923959671217579,
"loss": 3.9488174438476564,
"step": 50600
},
{
"epoch": 0.10373125311040159,
"grad_norm": 0.8669362664222717,
"learning_rate": 0.00029238075956616963,
"loss": 3.4380224609375,
"step": 50650
},
{
"epoch": 0.10383365316283041,
"grad_norm": 0.8463394045829773,
"learning_rate": 0.0002923655372149978,
"loss": 3.5007855224609377,
"step": 50700
},
{
"epoch": 0.10393605321525924,
"grad_norm": 1.0633169412612915,
"learning_rate": 0.00029235030006982416,
"loss": 3.4543692016601564,
"step": 50750
},
{
"epoch": 0.10403845326768807,
"grad_norm": 1.0498319864273071,
"learning_rate": 0.0002923350481322322,
"loss": 3.207664794921875,
"step": 50800
},
{
"epoch": 0.1041408533201169,
"grad_norm": 1.1241377592086792,
"learning_rate": 0.00029231978140380676,
"loss": 3.3383258056640623,
"step": 50850
},
{
"epoch": 0.10424325337254572,
"grad_norm": 0.990468442440033,
"learning_rate": 0.0002923044998861343,
"loss": 3.237965393066406,
"step": 50900
},
{
"epoch": 0.10434565342497455,
"grad_norm": 1.204306721687317,
"learning_rate": 0.0002922892035808027,
"loss": 3.101645812988281,
"step": 50950
},
{
"epoch": 0.10444805347740338,
"grad_norm": 0.9017521739006042,
"learning_rate": 0.00029227389248940173,
"loss": 2.567582702636719,
"step": 51000
},
{
"epoch": 0.10455045352983221,
"grad_norm": 1.0600897073745728,
"learning_rate": 0.00029225856661352226,
"loss": 3.3536370849609374,
"step": 51050
},
{
"epoch": 0.10465285358226104,
"grad_norm": 0.9761303663253784,
"learning_rate": 0.00029224322595475694,
"loss": 3.4230682373046877,
"step": 51100
},
{
"epoch": 0.10475525363468986,
"grad_norm": 0.7504201531410217,
"learning_rate": 0.0002922278705147,
"loss": 3.14410888671875,
"step": 51150
},
{
"epoch": 0.10485765368711869,
"grad_norm": 1.2177993059158325,
"learning_rate": 0.00029221250029494694,
"loss": 3.2004080200195313,
"step": 51200
},
{
"epoch": 0.10496005373954752,
"grad_norm": 0.744257926940918,
"learning_rate": 0.000292197115297095,
"loss": 3.388042297363281,
"step": 51250
},
{
"epoch": 0.10506245379197635,
"grad_norm": 0.8854607939720154,
"learning_rate": 0.000292181715522743,
"loss": 3.436142578125,
"step": 51300
},
{
"epoch": 0.10516485384440517,
"grad_norm": 1.030616283416748,
"learning_rate": 0.00029216630097349125,
"loss": 3.2875115966796873,
"step": 51350
},
{
"epoch": 0.105267253896834,
"grad_norm": 0.8778656721115112,
"learning_rate": 0.00029215087165094145,
"loss": 3.6679806518554687,
"step": 51400
},
{
"epoch": 0.10536965394926283,
"grad_norm": 0.6675243377685547,
"learning_rate": 0.000292135427556697,
"loss": 2.4680940246582033,
"step": 51450
},
{
"epoch": 0.10547205400169164,
"grad_norm": 0.7153152227401733,
"learning_rate": 0.0002921199686923628,
"loss": 2.0736355590820312,
"step": 51500
},
{
"epoch": 0.10557445405412047,
"grad_norm": 0.9837950468063354,
"learning_rate": 0.0002921044950595452,
"loss": 3.5398410034179686,
"step": 51550
},
{
"epoch": 0.1056768541065493,
"grad_norm": 0.9022551774978638,
"learning_rate": 0.00029208900665985213,
"loss": 3.0752154541015626,
"step": 51600
},
{
"epoch": 0.10577925415897813,
"grad_norm": 0.802068293094635,
"learning_rate": 0.0002920735034948932,
"loss": 3.4963739013671873,
"step": 51650
},
{
"epoch": 0.10588165421140695,
"grad_norm": 0.8395520448684692,
"learning_rate": 0.00029205798556627944,
"loss": 3.05790771484375,
"step": 51700
},
{
"epoch": 0.10598405426383578,
"grad_norm": 0.8915894627571106,
"learning_rate": 0.0002920424528756233,
"loss": 3.4001889038085937,
"step": 51750
},
{
"epoch": 0.10608645431626461,
"grad_norm": 0.6118106842041016,
"learning_rate": 0.00029202690542453886,
"loss": 2.7612185668945313,
"step": 51800
},
{
"epoch": 0.10618885436869344,
"grad_norm": 0.8468356132507324,
"learning_rate": 0.00029201134321464177,
"loss": 3.045502014160156,
"step": 51850
},
{
"epoch": 0.10629125442112226,
"grad_norm": 0.837311327457428,
"learning_rate": 0.00029199576624754927,
"loss": 2.9287734985351563,
"step": 51900
},
{
"epoch": 0.10639365447355109,
"grad_norm": 0.8536468744277954,
"learning_rate": 0.00029198017452487996,
"loss": 3.28405517578125,
"step": 51950
},
{
"epoch": 0.10649605452597992,
"grad_norm": 0.5843203067779541,
"learning_rate": 0.0002919645680482541,
"loss": 3.158900146484375,
"step": 52000
},
{
"epoch": 0.10659845457840875,
"grad_norm": 0.8552372455596924,
"learning_rate": 0.0002919489468192934,
"loss": 2.7685336303710937,
"step": 52050
},
{
"epoch": 0.10670085463083757,
"grad_norm": 0.9539399743080139,
"learning_rate": 0.00029193331083962127,
"loss": 3.3179580688476564,
"step": 52100
},
{
"epoch": 0.1068032546832664,
"grad_norm": 0.9066966772079468,
"learning_rate": 0.00029191766011086234,
"loss": 3.2776177978515624,
"step": 52150
},
{
"epoch": 0.10690565473569523,
"grad_norm": 1.242811918258667,
"learning_rate": 0.0002919019946346431,
"loss": 3.0137930297851563,
"step": 52200
},
{
"epoch": 0.10700805478812406,
"grad_norm": 0.7365036606788635,
"learning_rate": 0.0002918863144125915,
"loss": 3.2038583374023437,
"step": 52250
},
{
"epoch": 0.10711045484055289,
"grad_norm": 1.0112019777297974,
"learning_rate": 0.00029187061944633674,
"loss": 3.0209481811523435,
"step": 52300
},
{
"epoch": 0.1072128548929817,
"grad_norm": 0.9452877640724182,
"learning_rate": 0.00029185490973751,
"loss": 3.137978210449219,
"step": 52350
},
{
"epoch": 0.10731525494541053,
"grad_norm": 0.6544405817985535,
"learning_rate": 0.0002918391852877436,
"loss": 3.417147521972656,
"step": 52400
},
{
"epoch": 0.10741765499783935,
"grad_norm": 0.5501437783241272,
"learning_rate": 0.0002918234460986717,
"loss": 2.9117431640625,
"step": 52450
},
{
"epoch": 0.10752005505026818,
"grad_norm": 0.8474003076553345,
"learning_rate": 0.0002918076921719297,
"loss": 3.441578063964844,
"step": 52500
},
{
"epoch": 0.10762245510269701,
"grad_norm": 1.1429380178451538,
"learning_rate": 0.0002917919235091548,
"loss": 3.6524429321289062,
"step": 52550
},
{
"epoch": 0.10772485515512584,
"grad_norm": 0.6069464683532715,
"learning_rate": 0.0002917761401119855,
"loss": 3.335174865722656,
"step": 52600
},
{
"epoch": 0.10782725520755466,
"grad_norm": 0.880016028881073,
"learning_rate": 0.00029176034198206204,
"loss": 3.1002215576171874,
"step": 52650
},
{
"epoch": 0.10792965525998349,
"grad_norm": 0.8357744812965393,
"learning_rate": 0.000291744529121026,
"loss": 3.146689147949219,
"step": 52700
},
{
"epoch": 0.10803205531241232,
"grad_norm": 1.0419034957885742,
"learning_rate": 0.0002917287015305207,
"loss": 2.675668029785156,
"step": 52750
},
{
"epoch": 0.10813445536484115,
"grad_norm": 1.0537099838256836,
"learning_rate": 0.0002917128592121908,
"loss": 3.45556640625,
"step": 52800
},
{
"epoch": 0.10823685541726998,
"grad_norm": 0.8088594675064087,
"learning_rate": 0.0002916970021676825,
"loss": 3.272125244140625,
"step": 52850
},
{
"epoch": 0.1083392554696988,
"grad_norm": 0.9163171648979187,
"learning_rate": 0.0002916811303986437,
"loss": 3.303933410644531,
"step": 52900
},
{
"epoch": 0.10844165552212763,
"grad_norm": 1.4584760665893555,
"learning_rate": 0.00029166524390672374,
"loss": 3.420548095703125,
"step": 52950
},
{
"epoch": 0.10854405557455646,
"grad_norm": 0.9255147576332092,
"learning_rate": 0.0002916493426935734,
"loss": 3.422330017089844,
"step": 53000
},
{
"epoch": 0.10864645562698529,
"grad_norm": 0.7951823472976685,
"learning_rate": 0.0002916334267608451,
"loss": 3.435027770996094,
"step": 53050
},
{
"epoch": 0.10874885567941411,
"grad_norm": 0.8804548978805542,
"learning_rate": 0.00029161749611019273,
"loss": 3.147249755859375,
"step": 53100
},
{
"epoch": 0.10885125573184294,
"grad_norm": 0.6578207015991211,
"learning_rate": 0.00029160155074327174,
"loss": 3.2707638549804687,
"step": 53150
},
{
"epoch": 0.10895365578427176,
"grad_norm": 0.9386240243911743,
"learning_rate": 0.0002915855906617391,
"loss": 3.227253723144531,
"step": 53200
},
{
"epoch": 0.10905605583670058,
"grad_norm": 1.0355428457260132,
"learning_rate": 0.00029156961586725334,
"loss": 3.180726318359375,
"step": 53250
},
{
"epoch": 0.10915845588912941,
"grad_norm": 0.7636610269546509,
"learning_rate": 0.0002915536263614745,
"loss": 3.335189208984375,
"step": 53300
},
{
"epoch": 0.10926085594155824,
"grad_norm": 0.8508041501045227,
"learning_rate": 0.0002915376221460641,
"loss": 3.4625811767578125,
"step": 53350
},
{
"epoch": 0.10936325599398707,
"grad_norm": 0.8359129428863525,
"learning_rate": 0.0002915216032226852,
"loss": 3.2652053833007812,
"step": 53400
},
{
"epoch": 0.1094656560464159,
"grad_norm": 0.7385704517364502,
"learning_rate": 0.0002915055695930025,
"loss": 3.322744140625,
"step": 53450
},
{
"epoch": 0.10956805609884472,
"grad_norm": 0.8769361972808838,
"learning_rate": 0.0002914895212586821,
"loss": 2.853853454589844,
"step": 53500
},
{
"epoch": 0.10967045615127355,
"grad_norm": 0.8474721908569336,
"learning_rate": 0.00029147345822139165,
"loss": 3.6555825805664064,
"step": 53550
},
{
"epoch": 0.10977285620370238,
"grad_norm": 1.0334205627441406,
"learning_rate": 0.0002914573804828004,
"loss": 3.35653564453125,
"step": 53600
},
{
"epoch": 0.1098752562561312,
"grad_norm": 0.9623045325279236,
"learning_rate": 0.000291441288044579,
"loss": 3.910505676269531,
"step": 53650
},
{
"epoch": 0.10997765630856003,
"grad_norm": 1.3744142055511475,
"learning_rate": 0.0002914251809083998,
"loss": 3.520959777832031,
"step": 53700
},
{
"epoch": 0.11008005636098886,
"grad_norm": 0.7532607316970825,
"learning_rate": 0.00029140905907593654,
"loss": 3.816366882324219,
"step": 53750
},
{
"epoch": 0.11018245641341769,
"grad_norm": 0.8342536091804504,
"learning_rate": 0.00029139292254886447,
"loss": 3.52778076171875,
"step": 53800
},
{
"epoch": 0.11028485646584651,
"grad_norm": 0.928033173084259,
"learning_rate": 0.0002913767713288606,
"loss": 3.7655780029296877,
"step": 53850
},
{
"epoch": 0.11038725651827534,
"grad_norm": 0.7441654801368713,
"learning_rate": 0.00029136060541760304,
"loss": 3.655460205078125,
"step": 53900
},
{
"epoch": 0.11048965657070417,
"grad_norm": 0.9803574085235596,
"learning_rate": 0.0002913444248167719,
"loss": 3.4816769409179686,
"step": 53950
},
{
"epoch": 0.110592056623133,
"grad_norm": 0.9637227654457092,
"learning_rate": 0.00029132822952804846,
"loss": 3.7213009643554686,
"step": 54000
},
{
"epoch": 0.11069445667556181,
"grad_norm": 1.0016183853149414,
"learning_rate": 0.0002913120195531158,
"loss": 3.2758993530273437,
"step": 54050
},
{
"epoch": 0.11079685672799064,
"grad_norm": 0.8481519818305969,
"learning_rate": 0.0002912957948936583,
"loss": 4.208623962402344,
"step": 54100
},
{
"epoch": 0.11089925678041947,
"grad_norm": 0.9075637459754944,
"learning_rate": 0.00029127955555136194,
"loss": 3.764527587890625,
"step": 54150
},
{
"epoch": 0.1110016568328483,
"grad_norm": 0.648526132106781,
"learning_rate": 0.0002912633015279143,
"loss": 3.3832241821289064,
"step": 54200
},
{
"epoch": 0.11110405688527712,
"grad_norm": 0.9115505218505859,
"learning_rate": 0.0002912470328250044,
"loss": 3.866526794433594,
"step": 54250
},
{
"epoch": 0.11120645693770595,
"grad_norm": 1.0018901824951172,
"learning_rate": 0.00029123074944432275,
"loss": 3.3686892700195314,
"step": 54300
},
{
"epoch": 0.11130885699013478,
"grad_norm": 0.9239192605018616,
"learning_rate": 0.0002912144513875615,
"loss": 3.808809509277344,
"step": 54350
},
{
"epoch": 0.1114112570425636,
"grad_norm": 0.7452714443206787,
"learning_rate": 0.0002911981386564143,
"loss": 3.228931884765625,
"step": 54400
},
{
"epoch": 0.11151365709499243,
"grad_norm": 1.3459135293960571,
"learning_rate": 0.0002911818112525763,
"loss": 3.20574951171875,
"step": 54450
},
{
"epoch": 0.11161605714742126,
"grad_norm": 1.0211737155914307,
"learning_rate": 0.0002911654691777441,
"loss": 2.2418772888183596,
"step": 54500
},
{
"epoch": 0.11171845719985009,
"grad_norm": 1.1944788694381714,
"learning_rate": 0.00029114911243361595,
"loss": 3.102964172363281,
"step": 54550
},
{
"epoch": 0.11182085725227892,
"grad_norm": 0.8922857642173767,
"learning_rate": 0.0002911327410218916,
"loss": 3.3322735595703126,
"step": 54600
},
{
"epoch": 0.11192325730470774,
"grad_norm": 0.8530144095420837,
"learning_rate": 0.0002911163549442722,
"loss": 3.8322744750976563,
"step": 54650
},
{
"epoch": 0.11202565735713657,
"grad_norm": 0.9170437455177307,
"learning_rate": 0.00029109995420246066,
"loss": 4.006968994140625,
"step": 54700
},
{
"epoch": 0.1121280574095654,
"grad_norm": 0.8506373763084412,
"learning_rate": 0.0002910835387981612,
"loss": 3.5530404663085937,
"step": 54750
},
{
"epoch": 0.11223045746199423,
"grad_norm": 1.0387393236160278,
"learning_rate": 0.00029106710873307956,
"loss": 3.33231201171875,
"step": 54800
},
{
"epoch": 0.11233285751442305,
"grad_norm": 0.5560868382453918,
"learning_rate": 0.00029105066400892315,
"loss": 2.382226867675781,
"step": 54850
},
{
"epoch": 0.11243525756685187,
"grad_norm": 0.9407156705856323,
"learning_rate": 0.00029103420462740087,
"loss": 3.685501708984375,
"step": 54900
},
{
"epoch": 0.1125376576192807,
"grad_norm": 0.6936938166618347,
"learning_rate": 0.000291017730590223,
"loss": 3.4359500122070314,
"step": 54950
},
{
"epoch": 0.11264005767170952,
"grad_norm": 0.9318833947181702,
"learning_rate": 0.0002910012418991016,
"loss": 3.430309143066406,
"step": 55000
},
{
"epoch": 0.11274245772413835,
"grad_norm": 1.0374469757080078,
"learning_rate": 0.00029098473855574997,
"loss": 3.467359619140625,
"step": 55050
},
{
"epoch": 0.11284485777656718,
"grad_norm": 1.1647804975509644,
"learning_rate": 0.0002909682205618831,
"loss": 3.3597537231445314,
"step": 55100
},
{
"epoch": 0.112947257828996,
"grad_norm": 1.8363399505615234,
"learning_rate": 0.00029095168791921753,
"loss": 3.4801220703125,
"step": 55150
},
{
"epoch": 0.11304965788142483,
"grad_norm": 0.8499084711074829,
"learning_rate": 0.0002909351406294712,
"loss": 3.76212158203125,
"step": 55200
},
{
"epoch": 0.11315205793385366,
"grad_norm": 0.6988071203231812,
"learning_rate": 0.0002909185786943636,
"loss": 3.5692535400390626,
"step": 55250
},
{
"epoch": 0.11325445798628249,
"grad_norm": 0.6894858479499817,
"learning_rate": 0.0002909020021156159,
"loss": 3.47705810546875,
"step": 55300
},
{
"epoch": 0.11335685803871132,
"grad_norm": 1.0136258602142334,
"learning_rate": 0.00029088541089495056,
"loss": 3.2412789916992186,
"step": 55350
},
{
"epoch": 0.11345925809114014,
"grad_norm": 0.6407994031906128,
"learning_rate": 0.00029086880503409164,
"loss": 3.150567626953125,
"step": 55400
},
{
"epoch": 0.11356165814356897,
"grad_norm": 0.8704736828804016,
"learning_rate": 0.00029085218453476483,
"loss": 3.5778497314453124,
"step": 55450
},
{
"epoch": 0.1136640581959978,
"grad_norm": 0.9866794943809509,
"learning_rate": 0.00029083554939869725,
"loss": 3.52720703125,
"step": 55500
},
{
"epoch": 0.11376645824842663,
"grad_norm": 0.4646882712841034,
"learning_rate": 0.0002908188996276175,
"loss": 2.8131982421875,
"step": 55550
},
{
"epoch": 0.11386885830085546,
"grad_norm": 0.7451179623603821,
"learning_rate": 0.00029080223522325575,
"loss": 3.3548162841796874,
"step": 55600
},
{
"epoch": 0.11397125835328428,
"grad_norm": 1.027496337890625,
"learning_rate": 0.0002907855561873438,
"loss": 3.047060852050781,
"step": 55650
},
{
"epoch": 0.11407365840571311,
"grad_norm": 1.0272102355957031,
"learning_rate": 0.0002907688625216147,
"loss": 3.21407958984375,
"step": 55700
},
{
"epoch": 0.11417605845814192,
"grad_norm": 0.8392390012741089,
"learning_rate": 0.0002907521542278033,
"loss": 3.5421328735351563,
"step": 55750
},
{
"epoch": 0.11427845851057075,
"grad_norm": 0.8752363324165344,
"learning_rate": 0.0002907354313076458,
"loss": 3.536468811035156,
"step": 55800
},
{
"epoch": 0.11438085856299958,
"grad_norm": 0.6718413233757019,
"learning_rate": 0.00029071869376288,
"loss": 3.5975299072265625,
"step": 55850
},
{
"epoch": 0.11448325861542841,
"grad_norm": 0.8909393548965454,
"learning_rate": 0.0002907019415952452,
"loss": 3.8420440673828127,
"step": 55900
},
{
"epoch": 0.11458565866785723,
"grad_norm": 0.7395539879798889,
"learning_rate": 0.00029068517480648217,
"loss": 3.4465701293945314,
"step": 55950
},
{
"epoch": 0.11468805872028606,
"grad_norm": 0.7831642627716064,
"learning_rate": 0.00029066839339833333,
"loss": 3.2164300537109374,
"step": 56000
},
{
"epoch": 0.11479045877271489,
"grad_norm": 0.8047283291816711,
"learning_rate": 0.0002906515973725424,
"loss": 3.697811279296875,
"step": 56050
},
{
"epoch": 0.11489285882514372,
"grad_norm": 0.7210569977760315,
"learning_rate": 0.00029063478673085484,
"loss": 3.0727462768554688,
"step": 56100
},
{
"epoch": 0.11499525887757255,
"grad_norm": 0.9832913875579834,
"learning_rate": 0.0002906179614750175,
"loss": 3.4165048217773437,
"step": 56150
},
{
"epoch": 0.11509765893000137,
"grad_norm": 0.9115371108055115,
"learning_rate": 0.0002906011216067788,
"loss": 3.485976257324219,
"step": 56200
},
{
"epoch": 0.1152000589824302,
"grad_norm": 0.9409294724464417,
"learning_rate": 0.0002905842671278887,
"loss": 2.942160339355469,
"step": 56250
},
{
"epoch": 0.11530245903485903,
"grad_norm": 1.1528805494308472,
"learning_rate": 0.0002905673980400986,
"loss": 3.5174395751953127,
"step": 56300
},
{
"epoch": 0.11540485908728786,
"grad_norm": 0.740906834602356,
"learning_rate": 0.0002905505143451614,
"loss": 3.086756286621094,
"step": 56350
},
{
"epoch": 0.11550725913971668,
"grad_norm": 0.888832688331604,
"learning_rate": 0.00029053361604483173,
"loss": 3.832029113769531,
"step": 56400
},
{
"epoch": 0.11560965919214551,
"grad_norm": 0.889111340045929,
"learning_rate": 0.00029051670314086546,
"loss": 3.207186584472656,
"step": 56450
},
{
"epoch": 0.11571205924457434,
"grad_norm": 0.7387615442276001,
"learning_rate": 0.0002904997756350202,
"loss": 3.152142333984375,
"step": 56500
},
{
"epoch": 0.11581445929700317,
"grad_norm": 0.8200859427452087,
"learning_rate": 0.00029048283352905486,
"loss": 3.671814270019531,
"step": 56550
},
{
"epoch": 0.11591685934943198,
"grad_norm": 1.1036324501037598,
"learning_rate": 0.0002904658768247301,
"loss": 3.4229196166992186,
"step": 56600
},
{
"epoch": 0.11601925940186081,
"grad_norm": 0.8785697221755981,
"learning_rate": 0.00029044890552380796,
"loss": 3.5630813598632813,
"step": 56650
},
{
"epoch": 0.11612165945428964,
"grad_norm": 1.0525559186935425,
"learning_rate": 0.000290431919628052,
"loss": 3.124271240234375,
"step": 56700
},
{
"epoch": 0.11622405950671846,
"grad_norm": 0.9268920421600342,
"learning_rate": 0.00029041491913922736,
"loss": 3.26138916015625,
"step": 56750
},
{
"epoch": 0.11632645955914729,
"grad_norm": 0.8036125898361206,
"learning_rate": 0.0002903979040591006,
"loss": 3.1208505249023437,
"step": 56800
},
{
"epoch": 0.11642885961157612,
"grad_norm": 0.871330976486206,
"learning_rate": 0.0002903808743894399,
"loss": 3.6094674682617187,
"step": 56850
},
{
"epoch": 0.11653125966400495,
"grad_norm": 0.7573062181472778,
"learning_rate": 0.00029036383013201486,
"loss": 3.4403109741210938,
"step": 56900
},
{
"epoch": 0.11663365971643377,
"grad_norm": 0.8866212964057922,
"learning_rate": 0.0002903467712885967,
"loss": 3.023941955566406,
"step": 56950
},
{
"epoch": 0.1167360597688626,
"grad_norm": 1.240868330001831,
"learning_rate": 0.00029032969786095807,
"loss": 3.81320556640625,
"step": 57000
},
{
"epoch": 0.11683845982129143,
"grad_norm": 0.9338199496269226,
"learning_rate": 0.0002903126098508732,
"loss": 3.1648443603515624,
"step": 57050
},
{
"epoch": 0.11694085987372026,
"grad_norm": 0.8921442031860352,
"learning_rate": 0.0002902955072601177,
"loss": 3.7754312133789063,
"step": 57100
},
{
"epoch": 0.11704325992614908,
"grad_norm": 0.7555287480354309,
"learning_rate": 0.00029027839009046887,
"loss": 3.6020452880859377,
"step": 57150
},
{
"epoch": 0.11714565997857791,
"grad_norm": 0.8668673038482666,
"learning_rate": 0.00029026125834370547,
"loss": 3.1613735961914062,
"step": 57200
},
{
"epoch": 0.11724806003100674,
"grad_norm": 0.8572468757629395,
"learning_rate": 0.00029024411202160775,
"loss": 3.5449398803710936,
"step": 57250
},
{
"epoch": 0.11735046008343557,
"grad_norm": 1.0183916091918945,
"learning_rate": 0.0002902269511259575,
"loss": 3.4537921142578125,
"step": 57300
},
{
"epoch": 0.1174528601358644,
"grad_norm": 0.7662498354911804,
"learning_rate": 0.00029020977565853793,
"loss": 3.6329010009765623,
"step": 57350
},
{
"epoch": 0.11755526018829322,
"grad_norm": 0.7248380780220032,
"learning_rate": 0.0002901925856211339,
"loss": 3.476121826171875,
"step": 57400
},
{
"epoch": 0.11765766024072204,
"grad_norm": 0.5883516073226929,
"learning_rate": 0.0002901753810155316,
"loss": 3.1229867553710937,
"step": 57450
},
{
"epoch": 0.11776006029315086,
"grad_norm": 1.3006634712219238,
"learning_rate": 0.00029015816184351905,
"loss": 3.42736572265625,
"step": 57500
},
{
"epoch": 0.11786246034557969,
"grad_norm": 0.9047501683235168,
"learning_rate": 0.0002901409281068855,
"loss": 3.190472412109375,
"step": 57550
},
{
"epoch": 0.11796486039800852,
"grad_norm": 0.8864745497703552,
"learning_rate": 0.00029012367980742177,
"loss": 3.5744329833984376,
"step": 57600
},
{
"epoch": 0.11806726045043735,
"grad_norm": 0.9945940971374512,
"learning_rate": 0.0002901064169469203,
"loss": 3.613050537109375,
"step": 57650
},
{
"epoch": 0.11816966050286618,
"grad_norm": 0.9474062919616699,
"learning_rate": 0.00029008913952717486,
"loss": 3.755731201171875,
"step": 57700
},
{
"epoch": 0.118272060555295,
"grad_norm": 0.9160423874855042,
"learning_rate": 0.000290071847549981,
"loss": 3.7074078369140624,
"step": 57750
},
{
"epoch": 0.11837446060772383,
"grad_norm": 0.7401031851768494,
"learning_rate": 0.0002900545410171355,
"loss": 3.9821441650390623,
"step": 57800
},
{
"epoch": 0.11847686066015266,
"grad_norm": 0.9346218705177307,
"learning_rate": 0.00029003721993043686,
"loss": 3.6518328857421873,
"step": 57850
},
{
"epoch": 0.11857926071258149,
"grad_norm": 0.8762102723121643,
"learning_rate": 0.0002900198842916849,
"loss": 3.3181643676757813,
"step": 57900
},
{
"epoch": 0.11868166076501031,
"grad_norm": 0.8260309100151062,
"learning_rate": 0.00029000253410268117,
"loss": 3.745126953125,
"step": 57950
},
{
"epoch": 0.11878406081743914,
"grad_norm": 0.728134274482727,
"learning_rate": 0.00028998516936522864,
"loss": 3.5524822998046877,
"step": 58000
},
{
"epoch": 0.11888646086986797,
"grad_norm": 1.0089305639266968,
"learning_rate": 0.0002899677900811316,
"loss": 3.7944915771484373,
"step": 58050
},
{
"epoch": 0.1189888609222968,
"grad_norm": 0.8292215466499329,
"learning_rate": 0.0002899503962521963,
"loss": 3.89322021484375,
"step": 58100
},
{
"epoch": 0.11909126097472562,
"grad_norm": 0.8529530167579651,
"learning_rate": 0.00028993298788023005,
"loss": 2.9111569213867186,
"step": 58150
},
{
"epoch": 0.11919366102715445,
"grad_norm": 0.8902004361152649,
"learning_rate": 0.00028991556496704186,
"loss": 3.6739492797851563,
"step": 58200
},
{
"epoch": 0.11929606107958328,
"grad_norm": 0.9264180660247803,
"learning_rate": 0.0002898981275144423,
"loss": 4.041621704101562,
"step": 58250
},
{
"epoch": 0.1193984611320121,
"grad_norm": 0.8773983716964722,
"learning_rate": 0.0002898806755242433,
"loss": 3.3137640380859374,
"step": 58300
},
{
"epoch": 0.11950086118444092,
"grad_norm": 0.8258083462715149,
"learning_rate": 0.00028986320899825855,
"loss": 3.7007760620117187,
"step": 58350
},
{
"epoch": 0.11960326123686975,
"grad_norm": 0.4484880566596985,
"learning_rate": 0.00028984572793830295,
"loss": 2.6619467163085937,
"step": 58400
},
{
"epoch": 0.11970566128929858,
"grad_norm": 0.7841198444366455,
"learning_rate": 0.0002898282323461931,
"loss": 3.9329864501953127,
"step": 58450
},
{
"epoch": 0.1198080613417274,
"grad_norm": 1.0872740745544434,
"learning_rate": 0.0002898107222237471,
"loss": 3.7529037475585936,
"step": 58500
},
{
"epoch": 0.11991046139415623,
"grad_norm": 1.1370536088943481,
"learning_rate": 0.0002897931975727845,
"loss": 3.598294372558594,
"step": 58550
},
{
"epoch": 0.12001286144658506,
"grad_norm": 0.5730462670326233,
"learning_rate": 0.0002897756583951264,
"loss": 3.4985086059570314,
"step": 58600
},
{
"epoch": 0.12011526149901389,
"grad_norm": 0.8920771479606628,
"learning_rate": 0.00028975810469259535,
"loss": 3.025179443359375,
"step": 58650
},
{
"epoch": 0.12021766155144271,
"grad_norm": 0.877116858959198,
"learning_rate": 0.0002897405364670155,
"loss": 3.4373843383789064,
"step": 58700
},
{
"epoch": 0.12032006160387154,
"grad_norm": 0.9569665193557739,
"learning_rate": 0.0002897229537202124,
"loss": 4.03067626953125,
"step": 58750
},
{
"epoch": 0.12042246165630037,
"grad_norm": 0.9027877449989319,
"learning_rate": 0.00028970535645401324,
"loss": 3.0247479248046876,
"step": 58800
},
{
"epoch": 0.1205248617087292,
"grad_norm": 0.7448411583900452,
"learning_rate": 0.0002896877446702467,
"loss": 3.632384948730469,
"step": 58850
},
{
"epoch": 0.12062726176115803,
"grad_norm": 0.8100590705871582,
"learning_rate": 0.0002896701183707428,
"loss": 3.263778076171875,
"step": 58900
},
{
"epoch": 0.12072966181358685,
"grad_norm": 1.191540241241455,
"learning_rate": 0.0002896524775573332,
"loss": 3.6475961303710935,
"step": 58950
},
{
"epoch": 0.12083206186601568,
"grad_norm": 0.7784574031829834,
"learning_rate": 0.00028963482223185106,
"loss": 3.554160461425781,
"step": 59000
},
{
"epoch": 0.12093446191844451,
"grad_norm": 0.5998643040657043,
"learning_rate": 0.0002896171523961312,
"loss": 3.2943960571289064,
"step": 59050
},
{
"epoch": 0.12103686197087334,
"grad_norm": 0.8640596270561218,
"learning_rate": 0.0002895994680520096,
"loss": 2.6187591552734375,
"step": 59100
},
{
"epoch": 0.12113926202330215,
"grad_norm": 0.6376426219940186,
"learning_rate": 0.00028958176920132396,
"loss": 3.336057434082031,
"step": 59150
},
{
"epoch": 0.12124166207573098,
"grad_norm": 1.1029490232467651,
"learning_rate": 0.0002895640558459136,
"loss": 3.192468566894531,
"step": 59200
},
{
"epoch": 0.1213440621281598,
"grad_norm": 0.9253978729248047,
"learning_rate": 0.00028954632798761906,
"loss": 3.82802490234375,
"step": 59250
},
{
"epoch": 0.12144646218058863,
"grad_norm": 1.0808192491531372,
"learning_rate": 0.0002895285856282826,
"loss": 3.537474365234375,
"step": 59300
},
{
"epoch": 0.12154886223301746,
"grad_norm": 0.7610458731651306,
"learning_rate": 0.000289510828769748,
"loss": 3.7149560546875,
"step": 59350
},
{
"epoch": 0.12165126228544629,
"grad_norm": 1.0239511728286743,
"learning_rate": 0.0002894930574138604,
"loss": 3.168520202636719,
"step": 59400
},
{
"epoch": 0.12175366233787512,
"grad_norm": 1.482177495956421,
"learning_rate": 0.0002894752715624665,
"loss": 3.8551751708984376,
"step": 59450
},
{
"epoch": 0.12185606239030394,
"grad_norm": 0.8012579083442688,
"learning_rate": 0.00028945747121741455,
"loss": 3.244693603515625,
"step": 59500
},
{
"epoch": 0.12195846244273277,
"grad_norm": 0.6927148699760437,
"learning_rate": 0.0002894396563805543,
"loss": 3.722396240234375,
"step": 59550
},
{
"epoch": 0.1220608624951616,
"grad_norm": 0.7614629864692688,
"learning_rate": 0.00028942182705373707,
"loss": 3.183421630859375,
"step": 59600
},
{
"epoch": 0.12216326254759043,
"grad_norm": 0.8808593153953552,
"learning_rate": 0.0002894039832388154,
"loss": 3.444737854003906,
"step": 59650
},
{
"epoch": 0.12226566260001925,
"grad_norm": 0.9153810143470764,
"learning_rate": 0.0002893861249376437,
"loss": 3.483736267089844,
"step": 59700
},
{
"epoch": 0.12236806265244808,
"grad_norm": 0.8851150870323181,
"learning_rate": 0.0002893682521520777,
"loss": 3.7175869750976562,
"step": 59750
},
{
"epoch": 0.12247046270487691,
"grad_norm": 0.7266696095466614,
"learning_rate": 0.00028935036488397466,
"loss": 3.335245361328125,
"step": 59800
},
{
"epoch": 0.12257286275730574,
"grad_norm": 0.9137750864028931,
"learning_rate": 0.0002893324631351933,
"loss": 2.9954302978515623,
"step": 59850
},
{
"epoch": 0.12267526280973456,
"grad_norm": 0.8360620141029358,
"learning_rate": 0.00028931454690759396,
"loss": 3.0706732177734377,
"step": 59900
},
{
"epoch": 0.12277766286216339,
"grad_norm": 0.8443347811698914,
"learning_rate": 0.00028929661620303833,
"loss": 3.848203430175781,
"step": 59950
},
{
"epoch": 0.1228800629145922,
"grad_norm": 0.9306533932685852,
"learning_rate": 0.0002892786710233898,
"loss": 2.975295104980469,
"step": 60000
},
{
"epoch": 0.12298246296702103,
"grad_norm": 0.9441879391670227,
"learning_rate": 0.00028926071137051307,
"loss": 3.9100912475585936,
"step": 60050
},
{
"epoch": 0.12308486301944986,
"grad_norm": 0.7004597187042236,
"learning_rate": 0.00028924273724627444,
"loss": 3.670945739746094,
"step": 60100
},
{
"epoch": 0.12318726307187869,
"grad_norm": 0.7978895306587219,
"learning_rate": 0.00028922474865254174,
"loss": 3.39288818359375,
"step": 60150
},
{
"epoch": 0.12328966312430752,
"grad_norm": 0.8944730758666992,
"learning_rate": 0.0002892067455911842,
"loss": 3.4790631103515626,
"step": 60200
},
{
"epoch": 0.12339206317673634,
"grad_norm": 0.9302740097045898,
"learning_rate": 0.0002891887280640727,
"loss": 3.564194641113281,
"step": 60250
},
{
"epoch": 0.12349446322916517,
"grad_norm": 0.7751696109771729,
"learning_rate": 0.0002891706960730795,
"loss": 3.4631011962890623,
"step": 60300
},
{
"epoch": 0.123596863281594,
"grad_norm": 0.998839795589447,
"learning_rate": 0.00028915264962007836,
"loss": 3.490992126464844,
"step": 60350
},
{
"epoch": 0.12369926333402283,
"grad_norm": 1.2390878200531006,
"learning_rate": 0.0002891345887069447,
"loss": 3.6483099365234377,
"step": 60400
},
{
"epoch": 0.12380166338645165,
"grad_norm": 0.8795660138130188,
"learning_rate": 0.0002891165133355553,
"loss": 3.6523648071289063,
"step": 60450
},
{
"epoch": 0.12390406343888048,
"grad_norm": 0.8491701483726501,
"learning_rate": 0.00028909842350778836,
"loss": 3.5479266357421877,
"step": 60500
},
{
"epoch": 0.12400646349130931,
"grad_norm": 0.7775550484657288,
"learning_rate": 0.00028908031922552377,
"loss": 3.1797994995117187,
"step": 60550
},
{
"epoch": 0.12410886354373814,
"grad_norm": 0.7711923718452454,
"learning_rate": 0.0002890622004906429,
"loss": 3.398070068359375,
"step": 60600
},
{
"epoch": 0.12421126359616697,
"grad_norm": 0.490875244140625,
"learning_rate": 0.0002890440673050285,
"loss": 3.022109069824219,
"step": 60650
},
{
"epoch": 0.12431366364859579,
"grad_norm": 0.7348693609237671,
"learning_rate": 0.0002890259196705649,
"loss": 3.46414794921875,
"step": 60700
},
{
"epoch": 0.12441606370102462,
"grad_norm": 0.9327791929244995,
"learning_rate": 0.000289007757589138,
"loss": 3.681882629394531,
"step": 60750
},
{
"epoch": 0.12451846375345345,
"grad_norm": 0.8426567912101746,
"learning_rate": 0.000288989581062635,
"loss": 4.021507568359375,
"step": 60800
},
{
"epoch": 0.12462086380588226,
"grad_norm": 0.9796308875083923,
"learning_rate": 0.0002889713900929448,
"loss": 3.8382940673828125,
"step": 60850
},
{
"epoch": 0.12472326385831109,
"grad_norm": 0.7347166538238525,
"learning_rate": 0.0002889531846819577,
"loss": 3.32147216796875,
"step": 60900
},
{
"epoch": 0.12482566391073992,
"grad_norm": 0.770237147808075,
"learning_rate": 0.0002889349648315655,
"loss": 3.648823547363281,
"step": 60950
},
{
"epoch": 0.12492806396316875,
"grad_norm": 0.6420400738716125,
"learning_rate": 0.00028891673054366165,
"loss": 3.17007568359375,
"step": 61000
},
{
"epoch": 0.1250304640155976,
"grad_norm": 0.7027015089988708,
"learning_rate": 0.00028889848182014086,
"loss": 3.246382141113281,
"step": 61050
},
{
"epoch": 0.1251328640680264,
"grad_norm": 0.868607759475708,
"learning_rate": 0.0002888802186628995,
"loss": 3.6044903564453126,
"step": 61100
},
{
"epoch": 0.12523526412045524,
"grad_norm": 0.8410335183143616,
"learning_rate": 0.00028886194107383535,
"loss": 3.066201171875,
"step": 61150
},
{
"epoch": 0.12533766417288406,
"grad_norm": 1.0808706283569336,
"learning_rate": 0.00028884364905484784,
"loss": 3.1906118774414063,
"step": 61200
},
{
"epoch": 0.12544006422531287,
"grad_norm": 0.872553825378418,
"learning_rate": 0.00028882534260783765,
"loss": 3.3807113647460936,
"step": 61250
},
{
"epoch": 0.1255424642777417,
"grad_norm": 0.9935702681541443,
"learning_rate": 0.0002888070217347072,
"loss": 3.3980447387695314,
"step": 61300
},
{
"epoch": 0.12564486433017052,
"grad_norm": 0.8990649580955505,
"learning_rate": 0.0002887886864373603,
"loss": 3.4861651611328126,
"step": 61350
},
{
"epoch": 0.12574726438259937,
"grad_norm": 0.8892736434936523,
"learning_rate": 0.0002887703367177023,
"loss": 3.9071136474609376,
"step": 61400
},
{
"epoch": 0.12584966443502818,
"grad_norm": 0.7861908078193665,
"learning_rate": 0.00028875197257763997,
"loss": 3.886827392578125,
"step": 61450
},
{
"epoch": 0.12595206448745702,
"grad_norm": 0.8096019625663757,
"learning_rate": 0.0002887335940190817,
"loss": 2.9763027954101564,
"step": 61500
},
{
"epoch": 0.12605446453988584,
"grad_norm": 0.8015087246894836,
"learning_rate": 0.00028871520104393724,
"loss": 3.5265399169921876,
"step": 61550
},
{
"epoch": 0.12615686459231468,
"grad_norm": 1.0955448150634766,
"learning_rate": 0.00028869679365411786,
"loss": 3.746468811035156,
"step": 61600
},
{
"epoch": 0.1262592646447435,
"grad_norm": 0.9293431043624878,
"learning_rate": 0.00028867837185153654,
"loss": 3.725838317871094,
"step": 61650
},
{
"epoch": 0.12636166469717233,
"grad_norm": 0.881248950958252,
"learning_rate": 0.0002886599356381075,
"loss": 3.873548583984375,
"step": 61700
},
{
"epoch": 0.12646406474960115,
"grad_norm": 0.7995479702949524,
"learning_rate": 0.00028864148501574655,
"loss": 3.55103515625,
"step": 61750
},
{
"epoch": 0.12656646480203,
"grad_norm": 0.7834081053733826,
"learning_rate": 0.00028862301998637096,
"loss": 3.5016546630859375,
"step": 61800
},
{
"epoch": 0.1266688648544588,
"grad_norm": 0.8396415710449219,
"learning_rate": 0.00028860454055189955,
"loss": 3.15347900390625,
"step": 61850
},
{
"epoch": 0.12677126490688764,
"grad_norm": 0.7540357112884521,
"learning_rate": 0.00028858604671425266,
"loss": 3.5248077392578123,
"step": 61900
},
{
"epoch": 0.12687366495931646,
"grad_norm": 1.1297228336334229,
"learning_rate": 0.00028856753847535213,
"loss": 3.4668838500976564,
"step": 61950
},
{
"epoch": 0.1269760650117453,
"grad_norm": 0.7924526929855347,
"learning_rate": 0.0002885490158371212,
"loss": 3.7679620361328126,
"step": 62000
},
{
"epoch": 0.1270784650641741,
"grad_norm": 0.8227761387825012,
"learning_rate": 0.0002885304788014846,
"loss": 3.809046325683594,
"step": 62050
},
{
"epoch": 0.12718086511660295,
"grad_norm": 0.8400523662567139,
"learning_rate": 0.0002885119273703687,
"loss": 3.74009765625,
"step": 62100
},
{
"epoch": 0.12728326516903177,
"grad_norm": 1.1692306995391846,
"learning_rate": 0.0002884933615457012,
"loss": 4.000062866210937,
"step": 62150
},
{
"epoch": 0.12738566522146058,
"grad_norm": 0.7943342328071594,
"learning_rate": 0.00028847478132941153,
"loss": 3.8031546020507814,
"step": 62200
},
{
"epoch": 0.12748806527388942,
"grad_norm": 0.9809468984603882,
"learning_rate": 0.0002884561867234303,
"loss": 3.805234680175781,
"step": 62250
},
{
"epoch": 0.12759046532631824,
"grad_norm": 0.9183539748191833,
"learning_rate": 0.00028843757772968994,
"loss": 4.105808715820313,
"step": 62300
},
{
"epoch": 0.12769286537874708,
"grad_norm": 0.9354544281959534,
"learning_rate": 0.0002884189543501241,
"loss": 3.7343814086914064,
"step": 62350
},
{
"epoch": 0.1277952654311759,
"grad_norm": 0.8216899633407593,
"learning_rate": 0.00028840031658666803,
"loss": 3.678810729980469,
"step": 62400
},
{
"epoch": 0.12789766548360473,
"grad_norm": 0.8827342987060547,
"learning_rate": 0.00028838166444125857,
"loss": 3.634096374511719,
"step": 62450
},
{
"epoch": 0.12800006553603355,
"grad_norm": 0.9468240141868591,
"learning_rate": 0.00028836299791583386,
"loss": 3.0597830200195313,
"step": 62500
},
{
"epoch": 0.1281024655884624,
"grad_norm": 0.9269732236862183,
"learning_rate": 0.00028834431701233376,
"loss": 3.667522277832031,
"step": 62550
},
{
"epoch": 0.1282048656408912,
"grad_norm": 0.7396625280380249,
"learning_rate": 0.0002883256217326994,
"loss": 3.823531494140625,
"step": 62600
},
{
"epoch": 0.12830726569332004,
"grad_norm": 0.653838574886322,
"learning_rate": 0.0002883069120788737,
"loss": 3.5563314819335936,
"step": 62650
},
{
"epoch": 0.12840966574574886,
"grad_norm": 0.8573964834213257,
"learning_rate": 0.0002882881880528006,
"loss": 3.9133944702148438,
"step": 62700
},
{
"epoch": 0.1285120657981777,
"grad_norm": 0.8567407727241516,
"learning_rate": 0.00028826944965642604,
"loss": 3.63771484375,
"step": 62750
},
{
"epoch": 0.1286144658506065,
"grad_norm": 0.8221452832221985,
"learning_rate": 0.00028825069689169706,
"loss": 3.7375106811523438,
"step": 62800
},
{
"epoch": 0.12871686590303535,
"grad_norm": 0.8458483815193176,
"learning_rate": 0.0002882319297605626,
"loss": 3.4764666748046875,
"step": 62850
},
{
"epoch": 0.12881926595546417,
"grad_norm": 0.5829837322235107,
"learning_rate": 0.0002882131482649727,
"loss": 3.4318243408203126,
"step": 62900
},
{
"epoch": 0.128921666007893,
"grad_norm": 0.6864559054374695,
"learning_rate": 0.000288194352406879,
"loss": 2.9922601318359376,
"step": 62950
},
{
"epoch": 0.12902406606032182,
"grad_norm": 0.7200921177864075,
"learning_rate": 0.0002881755421882348,
"loss": 3.436331481933594,
"step": 63000
},
{
"epoch": 0.12912646611275064,
"grad_norm": 0.8018766045570374,
"learning_rate": 0.00028815671761099474,
"loss": 3.8753070068359374,
"step": 63050
},
{
"epoch": 0.12922886616517948,
"grad_norm": 0.7417867183685303,
"learning_rate": 0.00028813787867711495,
"loss": 3.4831881713867188,
"step": 63100
},
{
"epoch": 0.1293312662176083,
"grad_norm": 0.8872492909431458,
"learning_rate": 0.0002881190253885531,
"loss": 3.471279296875,
"step": 63150
},
{
"epoch": 0.12943366627003713,
"grad_norm": 0.9026205539703369,
"learning_rate": 0.00028810015774726844,
"loss": 3.930486755371094,
"step": 63200
},
{
"epoch": 0.12953606632246595,
"grad_norm": 0.8319406509399414,
"learning_rate": 0.0002880812757552215,
"loss": 3.876917419433594,
"step": 63250
},
{
"epoch": 0.1296384663748948,
"grad_norm": 0.7153857946395874,
"learning_rate": 0.00028806237941437444,
"loss": 3.4448760986328124,
"step": 63300
},
{
"epoch": 0.1297408664273236,
"grad_norm": 0.7869312763214111,
"learning_rate": 0.00028804346872669085,
"loss": 3.9350848388671875,
"step": 63350
},
{
"epoch": 0.12984326647975244,
"grad_norm": 0.7719307541847229,
"learning_rate": 0.00028802454369413594,
"loss": 3.8888482666015625,
"step": 63400
},
{
"epoch": 0.12994566653218126,
"grad_norm": 1.150686502456665,
"learning_rate": 0.00028800560431867624,
"loss": 2.990634765625,
"step": 63450
},
{
"epoch": 0.1300480665846101,
"grad_norm": 0.7204848527908325,
"learning_rate": 0.00028798665060227984,
"loss": 2.155850067138672,
"step": 63500
},
{
"epoch": 0.13015046663703891,
"grad_norm": 1.2251051664352417,
"learning_rate": 0.0002879676825469164,
"loss": 3.9703302001953125,
"step": 63550
},
{
"epoch": 0.13025286668946776,
"grad_norm": 0.7243852615356445,
"learning_rate": 0.00028794870015455695,
"loss": 3.6895037841796876,
"step": 63600
},
{
"epoch": 0.13035526674189657,
"grad_norm": 0.8284658193588257,
"learning_rate": 0.00028792970342717407,
"loss": 3.7690008544921874,
"step": 63650
},
{
"epoch": 0.1304576667943254,
"grad_norm": 0.05757651478052139,
"learning_rate": 0.0002879106923667418,
"loss": 1.9905595397949218,
"step": 63700
},
{
"epoch": 0.13056006684675422,
"grad_norm": 0.7437557578086853,
"learning_rate": 0.0002878916669752357,
"loss": 1.6622731018066406,
"step": 63750
},
{
"epoch": 0.13066246689918307,
"grad_norm": 0.8517412543296814,
"learning_rate": 0.0002878726272546328,
"loss": 4.094966430664062,
"step": 63800
},
{
"epoch": 0.13076486695161188,
"grad_norm": 0.8423788547515869,
"learning_rate": 0.00028785357320691154,
"loss": 4.379864196777344,
"step": 63850
},
{
"epoch": 0.1308672670040407,
"grad_norm": 0.853164792060852,
"learning_rate": 0.0002878345048340521,
"loss": 3.3010690307617185,
"step": 63900
},
{
"epoch": 0.13096966705646954,
"grad_norm": 0.7724624872207642,
"learning_rate": 0.00028781542213803587,
"loss": 3.83298095703125,
"step": 63950
},
{
"epoch": 0.13107206710889835,
"grad_norm": 0.599814236164093,
"learning_rate": 0.0002877963251208459,
"loss": 3.6117398071289064,
"step": 64000
},
{
"epoch": 0.1311744671613272,
"grad_norm": 0.87944096326828,
"learning_rate": 0.00028777721378446655,
"loss": 3.77650390625,
"step": 64050
},
{
"epoch": 0.131276867213756,
"grad_norm": 0.8216091990470886,
"learning_rate": 0.000287758088130884,
"loss": 3.966680603027344,
"step": 64100
},
{
"epoch": 0.13137926726618485,
"grad_norm": 0.9879843592643738,
"learning_rate": 0.00028773894816208547,
"loss": 3.368244323730469,
"step": 64150
},
{
"epoch": 0.13148166731861366,
"grad_norm": 0.7889556288719177,
"learning_rate": 0.00028771979388006,
"loss": 3.514817199707031,
"step": 64200
},
{
"epoch": 0.1315840673710425,
"grad_norm": 0.7712281942367554,
"learning_rate": 0.00028770062528679814,
"loss": 3.8969122314453126,
"step": 64250
},
{
"epoch": 0.13168646742347132,
"grad_norm": 0.6825302243232727,
"learning_rate": 0.0002876814423842916,
"loss": 2.702755126953125,
"step": 64300
},
{
"epoch": 0.13178886747590016,
"grad_norm": 0.7740472555160522,
"learning_rate": 0.0002876622451745339,
"loss": 3.027957763671875,
"step": 64350
},
{
"epoch": 0.13189126752832897,
"grad_norm": 0.7272697687149048,
"learning_rate": 0.00028764303365951986,
"loss": 3.3588211059570314,
"step": 64400
},
{
"epoch": 0.1319936675807578,
"grad_norm": 0.8405432105064392,
"learning_rate": 0.00028762380784124597,
"loss": 3.6030569458007813,
"step": 64450
},
{
"epoch": 0.13209606763318663,
"grad_norm": 0.8467888236045837,
"learning_rate": 0.00028760456772171004,
"loss": 3.6072647094726564,
"step": 64500
},
{
"epoch": 0.13219846768561547,
"grad_norm": 0.7771287560462952,
"learning_rate": 0.0002875853133029113,
"loss": 3.8847897338867186,
"step": 64550
},
{
"epoch": 0.13230086773804428,
"grad_norm": 0.9590752720832825,
"learning_rate": 0.0002875660445868507,
"loss": 3.6549798583984376,
"step": 64600
},
{
"epoch": 0.13240326779047312,
"grad_norm": 0.7539810538291931,
"learning_rate": 0.0002875467615755306,
"loss": 3.3866226196289064,
"step": 64650
},
{
"epoch": 0.13250566784290194,
"grad_norm": 0.8308656215667725,
"learning_rate": 0.0002875274642709548,
"loss": 3.721044006347656,
"step": 64700
},
{
"epoch": 0.13260806789533075,
"grad_norm": 0.8056835532188416,
"learning_rate": 0.00028750815267512847,
"loss": 3.9817669677734373,
"step": 64750
},
{
"epoch": 0.1327104679477596,
"grad_norm": 1.1581485271453857,
"learning_rate": 0.0002874888267900585,
"loss": 3.785094909667969,
"step": 64800
},
{
"epoch": 0.1328128680001884,
"grad_norm": 0.6927155256271362,
"learning_rate": 0.0002874694866177531,
"loss": 3.867703857421875,
"step": 64850
},
{
"epoch": 0.13291526805261725,
"grad_norm": 1.1712969541549683,
"learning_rate": 0.00028745013216022197,
"loss": 3.79897705078125,
"step": 64900
},
{
"epoch": 0.13301766810504606,
"grad_norm": 0.7104830741882324,
"learning_rate": 0.0002874307634194765,
"loss": 3.595622863769531,
"step": 64950
},
{
"epoch": 0.1331200681574749,
"grad_norm": 0.8754029273986816,
"learning_rate": 0.00028741138039752923,
"loss": 3.6854147338867187,
"step": 65000
},
{
"epoch": 0.13322246820990372,
"grad_norm": 0.8316354751586914,
"learning_rate": 0.00028739198309639445,
"loss": 3.955341491699219,
"step": 65050
},
{
"epoch": 0.13332486826233256,
"grad_norm": 0.7100203037261963,
"learning_rate": 0.00028737257151808783,
"loss": 3.53195556640625,
"step": 65100
},
{
"epoch": 0.13342726831476137,
"grad_norm": 0.7703724503517151,
"learning_rate": 0.00028735314566462653,
"loss": 3.5027481079101563,
"step": 65150
},
{
"epoch": 0.1335296683671902,
"grad_norm": 0.6825149059295654,
"learning_rate": 0.00028733370553802917,
"loss": 2.5823513793945314,
"step": 65200
},
{
"epoch": 0.13363206841961903,
"grad_norm": 0.7070282101631165,
"learning_rate": 0.00028731425114031595,
"loss": 3.5302462768554688,
"step": 65250
},
{
"epoch": 0.13373446847204787,
"grad_norm": 0.8907217383384705,
"learning_rate": 0.0002872947824735084,
"loss": 3.7343438720703124,
"step": 65300
},
{
"epoch": 0.13383686852447668,
"grad_norm": 0.6310061812400818,
"learning_rate": 0.00028727529953962973,
"loss": 3.046968688964844,
"step": 65350
},
{
"epoch": 0.13393926857690552,
"grad_norm": 0.830430269241333,
"learning_rate": 0.00028725580234070444,
"loss": 3.7792376708984374,
"step": 65400
},
{
"epoch": 0.13404166862933434,
"grad_norm": 0.7595807313919067,
"learning_rate": 0.0002872362908787586,
"loss": 4.01267578125,
"step": 65450
},
{
"epoch": 0.13414406868176318,
"grad_norm": 0.939785897731781,
"learning_rate": 0.00028721676515581975,
"loss": 3.7015313720703125,
"step": 65500
},
{
"epoch": 0.134246468734192,
"grad_norm": 0.7830142378807068,
"learning_rate": 0.00028719722517391694,
"loss": 3.7573947143554687,
"step": 65550
},
{
"epoch": 0.1343488687866208,
"grad_norm": 0.8249261379241943,
"learning_rate": 0.00028717767093508066,
"loss": 2.924357604980469,
"step": 65600
},
{
"epoch": 0.13445126883904965,
"grad_norm": 0.8103399276733398,
"learning_rate": 0.00028715810244134293,
"loss": 3.1508941650390625,
"step": 65650
},
{
"epoch": 0.13455366889147846,
"grad_norm": 1.0751904249191284,
"learning_rate": 0.0002871385196947372,
"loss": 3.2744952392578126,
"step": 65700
},
{
"epoch": 0.1346560689439073,
"grad_norm": 0.8905739188194275,
"learning_rate": 0.0002871189226972984,
"loss": 3.7018252563476564,
"step": 65750
},
{
"epoch": 0.13475846899633612,
"grad_norm": 0.9014281630516052,
"learning_rate": 0.00028709931145106304,
"loss": 3.712538757324219,
"step": 65800
},
{
"epoch": 0.13486086904876496,
"grad_norm": 0.9917147159576416,
"learning_rate": 0.000287079685958069,
"loss": 3.321624755859375,
"step": 65850
},
{
"epoch": 0.13496326910119377,
"grad_norm": 0.9449427127838135,
"learning_rate": 0.0002870600462203556,
"loss": 3.589186096191406,
"step": 65900
},
{
"epoch": 0.13506566915362261,
"grad_norm": 0.8208171725273132,
"learning_rate": 0.00028704039223996383,
"loss": 3.7818731689453124,
"step": 65950
},
{
"epoch": 0.13516806920605143,
"grad_norm": 0.8270769119262695,
"learning_rate": 0.0002870207240189359,
"loss": 3.731416015625,
"step": 66000
},
{
"epoch": 0.13527046925848027,
"grad_norm": 1.045253038406372,
"learning_rate": 0.0002870010415593159,
"loss": 3.7312091064453123,
"step": 66050
},
{
"epoch": 0.13537286931090908,
"grad_norm": 0.7662860155105591,
"learning_rate": 0.00028698134486314884,
"loss": 3.7503961181640624,
"step": 66100
},
{
"epoch": 0.13547526936333792,
"grad_norm": 0.7599702477455139,
"learning_rate": 0.0002869616339324817,
"loss": 2.9531689453125,
"step": 66150
},
{
"epoch": 0.13557766941576674,
"grad_norm": 0.9016150236129761,
"learning_rate": 0.00028694190876936274,
"loss": 3.9108657836914062,
"step": 66200
},
{
"epoch": 0.13568006946819558,
"grad_norm": 0.9253189563751221,
"learning_rate": 0.00028692216937584164,
"loss": 3.645496520996094,
"step": 66250
},
{
"epoch": 0.1357824695206244,
"grad_norm": 0.9780471324920654,
"learning_rate": 0.0002869024157539697,
"loss": 3.7777984619140623,
"step": 66300
},
{
"epoch": 0.13588486957305324,
"grad_norm": 0.7383785843849182,
"learning_rate": 0.00028688264790579956,
"loss": 3.588190002441406,
"step": 66350
},
{
"epoch": 0.13598726962548205,
"grad_norm": 0.8228618502616882,
"learning_rate": 0.00028686286583338554,
"loss": 3.2836099243164063,
"step": 66400
},
{
"epoch": 0.13608966967791086,
"grad_norm": 0.989874541759491,
"learning_rate": 0.00028684306953878316,
"loss": 3.2741690063476563,
"step": 66450
},
{
"epoch": 0.1361920697303397,
"grad_norm": 0.6227463483810425,
"learning_rate": 0.00028682325902404957,
"loss": 3.5655419921875,
"step": 66500
},
{
"epoch": 0.13629446978276852,
"grad_norm": 0.9205330014228821,
"learning_rate": 0.00028680343429124356,
"loss": 3.891072998046875,
"step": 66550
},
{
"epoch": 0.13639686983519736,
"grad_norm": 0.9149171113967896,
"learning_rate": 0.000286783595342425,
"loss": 3.59095458984375,
"step": 66600
},
{
"epoch": 0.13649926988762617,
"grad_norm": 0.9638737440109253,
"learning_rate": 0.00028676374217965567,
"loss": 3.623572998046875,
"step": 66650
},
{
"epoch": 0.13660166994005501,
"grad_norm": 1.3770073652267456,
"learning_rate": 0.0002867438748049985,
"loss": 3.716294250488281,
"step": 66700
},
{
"epoch": 0.13670406999248383,
"grad_norm": 0.7525309324264526,
"learning_rate": 0.000286723993220518,
"loss": 4.217930603027344,
"step": 66750
},
{
"epoch": 0.13680647004491267,
"grad_norm": 0.8076726198196411,
"learning_rate": 0.0002867040974282803,
"loss": 3.8803009033203124,
"step": 66800
},
{
"epoch": 0.13690887009734148,
"grad_norm": 0.6948472261428833,
"learning_rate": 0.00028668418743035275,
"loss": 3.5436331176757814,
"step": 66850
},
{
"epoch": 0.13701127014977033,
"grad_norm": 0.8509873151779175,
"learning_rate": 0.00028666426322880443,
"loss": 3.499276428222656,
"step": 66900
},
{
"epoch": 0.13711367020219914,
"grad_norm": 0.734075665473938,
"learning_rate": 0.0002866443248257057,
"loss": 3.3526876831054686,
"step": 66950
},
{
"epoch": 0.13721607025462798,
"grad_norm": 0.8169065713882446,
"learning_rate": 0.0002866243722231285,
"loss": 3.3126312255859376,
"step": 67000
},
{
"epoch": 0.1373184703070568,
"grad_norm": 0.8438522219657898,
"learning_rate": 0.0002866044054231462,
"loss": 3.438792419433594,
"step": 67050
},
{
"epoch": 0.13742087035948564,
"grad_norm": 0.8047662973403931,
"learning_rate": 0.00028658442442783364,
"loss": 3.4803237915039062,
"step": 67100
},
{
"epoch": 0.13752327041191445,
"grad_norm": 0.7526935338973999,
"learning_rate": 0.00028656442923926723,
"loss": 3.5479522705078126,
"step": 67150
},
{
"epoch": 0.1376256704643433,
"grad_norm": 0.8287502527236938,
"learning_rate": 0.0002865444198595247,
"loss": 3.7390045166015624,
"step": 67200
},
{
"epoch": 0.1377280705167721,
"grad_norm": 0.6148055791854858,
"learning_rate": 0.00028652439629068535,
"loss": 3.7372897338867186,
"step": 67250
},
{
"epoch": 0.13783047056920092,
"grad_norm": 0.8581375479698181,
"learning_rate": 0.00028650435853483006,
"loss": 2.9981643676757814,
"step": 67300
},
{
"epoch": 0.13793287062162976,
"grad_norm": 1.0106624364852905,
"learning_rate": 0.0002864843065940409,
"loss": 3.4451068115234373,
"step": 67350
},
{
"epoch": 0.13803527067405857,
"grad_norm": 0.938605785369873,
"learning_rate": 0.0002864642404704017,
"loss": 3.6765966796875,
"step": 67400
},
{
"epoch": 0.13813767072648742,
"grad_norm": 1.1929186582565308,
"learning_rate": 0.0002864441601659975,
"loss": 3.147588195800781,
"step": 67450
},
{
"epoch": 0.13824007077891623,
"grad_norm": 0.5836741328239441,
"learning_rate": 0.00028642406568291513,
"loss": 2.8205252075195313,
"step": 67500
},
{
"epoch": 0.13834247083134507,
"grad_norm": 0.8532480001449585,
"learning_rate": 0.0002864039570232426,
"loss": 3.003614501953125,
"step": 67550
},
{
"epoch": 0.13844487088377389,
"grad_norm": 0.7523052096366882,
"learning_rate": 0.0002863838341890696,
"loss": 3.6742901611328125,
"step": 67600
},
{
"epoch": 0.13854727093620273,
"grad_norm": 0.8439714908599854,
"learning_rate": 0.0002863636971824872,
"loss": 3.3492770385742188,
"step": 67650
},
{
"epoch": 0.13864967098863154,
"grad_norm": 0.8030802607536316,
"learning_rate": 0.00028634354600558785,
"loss": 3.6775198364257813,
"step": 67700
},
{
"epoch": 0.13875207104106038,
"grad_norm": 0.8020223379135132,
"learning_rate": 0.00028632338066046566,
"loss": 3.622167663574219,
"step": 67750
},
{
"epoch": 0.1388544710934892,
"grad_norm": 0.7629789710044861,
"learning_rate": 0.00028630320114921606,
"loss": 3.1032611083984376,
"step": 67800
},
{
"epoch": 0.13895687114591804,
"grad_norm": 0.8953397274017334,
"learning_rate": 0.0002862830074739361,
"loss": 3.3124514770507814,
"step": 67850
},
{
"epoch": 0.13905927119834685,
"grad_norm": 0.7486206293106079,
"learning_rate": 0.00028626279963672415,
"loss": 3.154571838378906,
"step": 67900
},
{
"epoch": 0.1391616712507757,
"grad_norm": 0.8250375986099243,
"learning_rate": 0.00028624257763968015,
"loss": 3.6296453857421875,
"step": 67950
},
{
"epoch": 0.1392640713032045,
"grad_norm": 1.0587407350540161,
"learning_rate": 0.00028622234148490544,
"loss": 3.5324700927734374,
"step": 68000
},
{
"epoch": 0.13936647135563335,
"grad_norm": 0.7875683903694153,
"learning_rate": 0.00028620209117450295,
"loss": 3.170576477050781,
"step": 68050
},
{
"epoch": 0.13946887140806216,
"grad_norm": 1.1913716793060303,
"learning_rate": 0.00028618182671057694,
"loss": 3.6836483764648436,
"step": 68100
},
{
"epoch": 0.13957127146049098,
"grad_norm": 0.8803089261054993,
"learning_rate": 0.00028616154809523326,
"loss": 3.468567199707031,
"step": 68150
},
{
"epoch": 0.13967367151291982,
"grad_norm": 0.6812267303466797,
"learning_rate": 0.00028614125533057906,
"loss": 3.56980712890625,
"step": 68200
},
{
"epoch": 0.13977607156534863,
"grad_norm": 0.6622804999351501,
"learning_rate": 0.0002861209484187232,
"loss": 3.2988763427734376,
"step": 68250
},
{
"epoch": 0.13987847161777747,
"grad_norm": 0.8914295434951782,
"learning_rate": 0.0002861006273617758,
"loss": 3.1821719360351564,
"step": 68300
},
{
"epoch": 0.13998087167020629,
"grad_norm": 0.9383370876312256,
"learning_rate": 0.00028608029216184867,
"loss": 3.6463201904296874,
"step": 68350
},
{
"epoch": 0.14008327172263513,
"grad_norm": 0.795408308506012,
"learning_rate": 0.0002860599428210548,
"loss": 3.4441323852539063,
"step": 68400
},
{
"epoch": 0.14018567177506394,
"grad_norm": 0.9368188381195068,
"learning_rate": 0.0002860395793415088,
"loss": 3.4534127807617185,
"step": 68450
},
{
"epoch": 0.14028807182749278,
"grad_norm": 0.9888190627098083,
"learning_rate": 0.0002860192017253269,
"loss": 3.839812927246094,
"step": 68500
},
{
"epoch": 0.1403904718799216,
"grad_norm": 1.1791257858276367,
"learning_rate": 0.0002859988099746266,
"loss": 3.30308837890625,
"step": 68550
},
{
"epoch": 0.14049287193235044,
"grad_norm": 0.8144651651382446,
"learning_rate": 0.00028597840409152683,
"loss": 3.5844757080078127,
"step": 68600
},
{
"epoch": 0.14059527198477925,
"grad_norm": 0.8788326382637024,
"learning_rate": 0.00028595798407814817,
"loss": 3.5440103149414064,
"step": 68650
},
{
"epoch": 0.1406976720372081,
"grad_norm": 0.754426121711731,
"learning_rate": 0.00028593754993661247,
"loss": 3.38293701171875,
"step": 68700
},
{
"epoch": 0.1408000720896369,
"grad_norm": 0.8822509050369263,
"learning_rate": 0.0002859171016690433,
"loss": 3.699421691894531,
"step": 68750
},
{
"epoch": 0.14090247214206575,
"grad_norm": 0.6882439255714417,
"learning_rate": 0.00028589663927756546,
"loss": 3.33095947265625,
"step": 68800
},
{
"epoch": 0.14100487219449456,
"grad_norm": 0.8108435273170471,
"learning_rate": 0.00028587616276430536,
"loss": 3.5015853881835937,
"step": 68850
},
{
"epoch": 0.1411072722469234,
"grad_norm": 0.6340552568435669,
"learning_rate": 0.00028585567213139075,
"loss": 3.374276123046875,
"step": 68900
},
{
"epoch": 0.14120967229935222,
"grad_norm": 0.6358705163002014,
"learning_rate": 0.0002858351673809511,
"loss": 3.372686462402344,
"step": 68950
},
{
"epoch": 0.14131207235178103,
"grad_norm": 0.6987962126731873,
"learning_rate": 0.000285814648515117,
"loss": 3.635752868652344,
"step": 69000
},
{
"epoch": 0.14141447240420987,
"grad_norm": 0.9426242113113403,
"learning_rate": 0.0002857941155360207,
"loss": 3.790219421386719,
"step": 69050
},
{
"epoch": 0.1415168724566387,
"grad_norm": 0.5323778986930847,
"learning_rate": 0.000285773568445796,
"loss": 3.5456610107421875,
"step": 69100
},
{
"epoch": 0.14161927250906753,
"grad_norm": 0.7765032052993774,
"learning_rate": 0.000285753007246578,
"loss": 2.8608853149414064,
"step": 69150
},
{
"epoch": 0.14172167256149634,
"grad_norm": 1.0102488994598389,
"learning_rate": 0.0002857324319405033,
"loss": 3.7360980224609377,
"step": 69200
},
{
"epoch": 0.14182407261392518,
"grad_norm": 0.6676150560379028,
"learning_rate": 0.00028571184252971,
"loss": 3.7574533081054686,
"step": 69250
},
{
"epoch": 0.141926472666354,
"grad_norm": 0.8389192223548889,
"learning_rate": 0.00028569123901633773,
"loss": 3.7205816650390626,
"step": 69300
},
{
"epoch": 0.14202887271878284,
"grad_norm": 0.9630427956581116,
"learning_rate": 0.0002856706214025275,
"loss": 3.1625067138671876,
"step": 69350
},
{
"epoch": 0.14213127277121165,
"grad_norm": 0.8320639729499817,
"learning_rate": 0.0002856499896904217,
"loss": 2.9422607421875,
"step": 69400
},
{
"epoch": 0.1422336728236405,
"grad_norm": 0.9393151998519897,
"learning_rate": 0.0002856293438821644,
"loss": 3.5568783569335936,
"step": 69450
},
{
"epoch": 0.1423360728760693,
"grad_norm": 0.8972524404525757,
"learning_rate": 0.000285608683979901,
"loss": 3.5590420532226563,
"step": 69500
},
{
"epoch": 0.14243847292849815,
"grad_norm": 0.5622543096542358,
"learning_rate": 0.00028558800998577835,
"loss": 2.8899127197265626,
"step": 69550
},
{
"epoch": 0.14254087298092696,
"grad_norm": 0.8466945886611938,
"learning_rate": 0.00028556732190194485,
"loss": 3.2979135131835937,
"step": 69600
},
{
"epoch": 0.1426432730333558,
"grad_norm": 1.3375204801559448,
"learning_rate": 0.00028554661973055026,
"loss": 3.5246792602539063,
"step": 69650
},
{
"epoch": 0.14274567308578462,
"grad_norm": 0.7531492114067078,
"learning_rate": 0.00028552590347374586,
"loss": 3.3118746948242186,
"step": 69700
},
{
"epoch": 0.14284807313821346,
"grad_norm": 0.8651145100593567,
"learning_rate": 0.00028550517313368444,
"loss": 3.485458984375,
"step": 69750
},
{
"epoch": 0.14295047319064227,
"grad_norm": 0.625991940498352,
"learning_rate": 0.0002854844287125202,
"loss": 3.164065246582031,
"step": 69800
},
{
"epoch": 0.1430528732430711,
"grad_norm": 2.080458402633667,
"learning_rate": 0.0002854636702124088,
"loss": 3.0976217651367186,
"step": 69850
},
{
"epoch": 0.14315527329549993,
"grad_norm": 0.7514007687568665,
"learning_rate": 0.00028544289763550733,
"loss": 3.78799072265625,
"step": 69900
},
{
"epoch": 0.14325767334792874,
"grad_norm": 0.5652868151664734,
"learning_rate": 0.00028542211098397447,
"loss": 2.5083651733398438,
"step": 69950
},
{
"epoch": 0.14336007340035758,
"grad_norm": 0.7237803339958191,
"learning_rate": 0.0002854013102599702,
"loss": 3.534099426269531,
"step": 70000
},
{
"epoch": 0.1434624734527864,
"grad_norm": 0.8753382563591003,
"learning_rate": 0.00028538049546565603,
"loss": 4.047043762207031,
"step": 70050
},
{
"epoch": 0.14356487350521524,
"grad_norm": 0.8999593257904053,
"learning_rate": 0.000285359666603195,
"loss": 3.6831619262695314,
"step": 70100
},
{
"epoch": 0.14366727355764405,
"grad_norm": 0.7087032794952393,
"learning_rate": 0.00028533882367475156,
"loss": 2.866451416015625,
"step": 70150
},
{
"epoch": 0.1437696736100729,
"grad_norm": 0.6140325665473938,
"learning_rate": 0.0002853179666824916,
"loss": 1.8367611694335937,
"step": 70200
},
{
"epoch": 0.1438720736625017,
"grad_norm": 0.7460519671440125,
"learning_rate": 0.0002852970956285824,
"loss": 2.957001037597656,
"step": 70250
},
{
"epoch": 0.14397447371493055,
"grad_norm": 1.0516009330749512,
"learning_rate": 0.0002852762105151929,
"loss": 2.5348553466796875,
"step": 70300
},
{
"epoch": 0.14407687376735936,
"grad_norm": 0.5429277420043945,
"learning_rate": 0.0002852553113444934,
"loss": 3.53834228515625,
"step": 70350
},
{
"epoch": 0.1441792738197882,
"grad_norm": 0.8015134334564209,
"learning_rate": 0.0002852343981186556,
"loss": 3.64453857421875,
"step": 70400
},
{
"epoch": 0.14428167387221702,
"grad_norm": 0.7445142269134521,
"learning_rate": 0.00028521347083985266,
"loss": 3.6188226318359376,
"step": 70450
},
{
"epoch": 0.14438407392464586,
"grad_norm": 0.9419240355491638,
"learning_rate": 0.00028519252951025935,
"loss": 2.8771868896484376,
"step": 70500
},
{
"epoch": 0.14448647397707468,
"grad_norm": 0.8755508065223694,
"learning_rate": 0.0002851715741320517,
"loss": 3.6672409057617186,
"step": 70550
},
{
"epoch": 0.14458887402950352,
"grad_norm": 0.6970762014389038,
"learning_rate": 0.00028515060470740743,
"loss": 3.7528070068359374,
"step": 70600
},
{
"epoch": 0.14469127408193233,
"grad_norm": 0.5237036943435669,
"learning_rate": 0.0002851296212385055,
"loss": 3.318054504394531,
"step": 70650
},
{
"epoch": 0.14479367413436114,
"grad_norm": 0.7218162417411804,
"learning_rate": 0.0002851086237275264,
"loss": 1.9354142761230468,
"step": 70700
},
{
"epoch": 0.14489607418678999,
"grad_norm": 0.46474677324295044,
"learning_rate": 0.00028508761217665215,
"loss": 1.882958221435547,
"step": 70750
},
{
"epoch": 0.1449984742392188,
"grad_norm": 0.665745198726654,
"learning_rate": 0.0002850665865880662,
"loss": 1.9264730834960937,
"step": 70800
},
{
"epoch": 0.14510087429164764,
"grad_norm": 0.9114018082618713,
"learning_rate": 0.00028504554696395334,
"loss": 2.677998046875,
"step": 70850
},
{
"epoch": 0.14520327434407646,
"grad_norm": 0.729942798614502,
"learning_rate": 0.0002850244933065,
"loss": 3.4295562744140624,
"step": 70900
},
{
"epoch": 0.1453056743965053,
"grad_norm": 1.0335681438446045,
"learning_rate": 0.000285003425617894,
"loss": 3.524678039550781,
"step": 70950
},
{
"epoch": 0.1454080744489341,
"grad_norm": 1.325173258781433,
"learning_rate": 0.00028498234390032453,
"loss": 3.4061398315429687,
"step": 71000
},
{
"epoch": 0.14551047450136295,
"grad_norm": 0.7562994956970215,
"learning_rate": 0.00028496124815598233,
"loss": 3.4216473388671873,
"step": 71050
},
{
"epoch": 0.14561287455379177,
"grad_norm": 0.8231451511383057,
"learning_rate": 0.00028494013838705964,
"loss": 3.4331399536132814,
"step": 71100
},
{
"epoch": 0.1457152746062206,
"grad_norm": 0.9096212387084961,
"learning_rate": 0.00028491901459575,
"loss": 3.2372637939453126,
"step": 71150
},
{
"epoch": 0.14581767465864942,
"grad_norm": 0.8419906497001648,
"learning_rate": 0.00028489787678424855,
"loss": 3.490650329589844,
"step": 71200
},
{
"epoch": 0.14592007471107826,
"grad_norm": 0.9181749224662781,
"learning_rate": 0.00028487672495475187,
"loss": 3.988592224121094,
"step": 71250
},
{
"epoch": 0.14602247476350708,
"grad_norm": 0.7534500360488892,
"learning_rate": 0.0002848555591094579,
"loss": 3.256888427734375,
"step": 71300
},
{
"epoch": 0.14612487481593592,
"grad_norm": 1.1033469438552856,
"learning_rate": 0.00028483437925056615,
"loss": 3.3334320068359373,
"step": 71350
},
{
"epoch": 0.14622727486836473,
"grad_norm": 0.9356803894042969,
"learning_rate": 0.0002848131853802775,
"loss": 2.933785705566406,
"step": 71400
},
{
"epoch": 0.14632967492079357,
"grad_norm": 0.8622822165489197,
"learning_rate": 0.00028479197750079434,
"loss": 3.4190252685546874,
"step": 71450
},
{
"epoch": 0.1464320749732224,
"grad_norm": 0.7642265558242798,
"learning_rate": 0.0002847707556143205,
"loss": 3.637124328613281,
"step": 71500
},
{
"epoch": 0.1465344750256512,
"grad_norm": 0.8362610340118408,
"learning_rate": 0.0002847495197230613,
"loss": 3.78397705078125,
"step": 71550
},
{
"epoch": 0.14663687507808004,
"grad_norm": 0.7937034964561462,
"learning_rate": 0.0002847282698292234,
"loss": 3.7184579467773435,
"step": 71600
},
{
"epoch": 0.14673927513050886,
"grad_norm": 0.8799037933349609,
"learning_rate": 0.0002847070059350151,
"loss": 3.535165100097656,
"step": 71650
},
{
"epoch": 0.1468416751829377,
"grad_norm": 0.7818918824195862,
"learning_rate": 0.000284685728042646,
"loss": 3.6417041015625,
"step": 71700
},
{
"epoch": 0.1469440752353665,
"grad_norm": 0.84147709608078,
"learning_rate": 0.00028466443615432713,
"loss": 3.487315673828125,
"step": 71750
},
{
"epoch": 0.14704647528779535,
"grad_norm": 0.6987602710723877,
"learning_rate": 0.00028464313027227117,
"loss": 3.3947982788085938,
"step": 71800
},
{
"epoch": 0.14714887534022417,
"grad_norm": 0.7040350437164307,
"learning_rate": 0.0002846218103986921,
"loss": 3.48228271484375,
"step": 71850
},
{
"epoch": 0.147251275392653,
"grad_norm": 0.753700852394104,
"learning_rate": 0.0002846004765358053,
"loss": 2.8886663818359377,
"step": 71900
},
{
"epoch": 0.14735367544508182,
"grad_norm": 0.6964828968048096,
"learning_rate": 0.0002845791286858278,
"loss": 3.3485205078125,
"step": 71950
},
{
"epoch": 0.14745607549751066,
"grad_norm": 0.7957376837730408,
"learning_rate": 0.00028455776685097796,
"loss": 4.061175842285156,
"step": 72000
},
{
"epoch": 0.14755847554993948,
"grad_norm": 0.8718839883804321,
"learning_rate": 0.00028453639103347557,
"loss": 3.8466424560546875,
"step": 72050
},
{
"epoch": 0.14766087560236832,
"grad_norm": 0.6680410504341125,
"learning_rate": 0.00028451500123554194,
"loss": 3.4465017700195313,
"step": 72100
},
{
"epoch": 0.14776327565479713,
"grad_norm": 0.7632986903190613,
"learning_rate": 0.0002844935974593998,
"loss": 3.496392822265625,
"step": 72150
},
{
"epoch": 0.14786567570722597,
"grad_norm": 0.8088258504867554,
"learning_rate": 0.0002844721797072733,
"loss": 3.6753024291992187,
"step": 72200
},
{
"epoch": 0.1479680757596548,
"grad_norm": 0.7746132016181946,
"learning_rate": 0.0002844507479813881,
"loss": 3.4665756225585938,
"step": 72250
},
{
"epoch": 0.14807047581208363,
"grad_norm": 0.9574618339538574,
"learning_rate": 0.00028442930228397134,
"loss": 3.6266830444335936,
"step": 72300
},
{
"epoch": 0.14817287586451244,
"grad_norm": 3.6261954307556152,
"learning_rate": 0.0002844078426172515,
"loss": 3.3307794189453124,
"step": 72350
},
{
"epoch": 0.14827527591694126,
"grad_norm": 1.3732120990753174,
"learning_rate": 0.00028438636898345856,
"loss": 3.371138000488281,
"step": 72400
},
{
"epoch": 0.1483776759693701,
"grad_norm": 0.8364174962043762,
"learning_rate": 0.00028436488138482407,
"loss": 3.6167337036132814,
"step": 72450
},
{
"epoch": 0.1484800760217989,
"grad_norm": 0.7466509938240051,
"learning_rate": 0.0002843433798235808,
"loss": 3.7814892578125,
"step": 72500
},
{
"epoch": 0.14858247607422775,
"grad_norm": 0.8802339434623718,
"learning_rate": 0.00028432186430196315,
"loss": 3.364360656738281,
"step": 72550
},
{
"epoch": 0.14868487612665657,
"grad_norm": 0.6437531113624573,
"learning_rate": 0.00028430033482220693,
"loss": 3.211015625,
"step": 72600
},
{
"epoch": 0.1487872761790854,
"grad_norm": 0.7954172492027283,
"learning_rate": 0.0002842787913865494,
"loss": 3.1693716430664063,
"step": 72650
},
{
"epoch": 0.14888967623151422,
"grad_norm": 0.735313892364502,
"learning_rate": 0.0002842572339972292,
"loss": 3.4131680297851563,
"step": 72700
},
{
"epoch": 0.14899207628394306,
"grad_norm": 1.2493815422058105,
"learning_rate": 0.0002842356626564866,
"loss": 2.6316799926757812,
"step": 72750
},
{
"epoch": 0.14909447633637188,
"grad_norm": 0.506629228591919,
"learning_rate": 0.00028421407736656305,
"loss": 2.7435052490234373,
"step": 72800
},
{
"epoch": 0.14919687638880072,
"grad_norm": 0.4948784112930298,
"learning_rate": 0.0002841924781297017,
"loss": 1.9055368041992187,
"step": 72850
},
{
"epoch": 0.14929927644122953,
"grad_norm": 0.9006844162940979,
"learning_rate": 0.000284170864948147,
"loss": 2.6189675903320313,
"step": 72900
},
{
"epoch": 0.14940167649365838,
"grad_norm": 0.6539232134819031,
"learning_rate": 0.00028414923782414496,
"loss": 3.5976483154296877,
"step": 72950
},
{
"epoch": 0.1495040765460872,
"grad_norm": 0.6356167793273926,
"learning_rate": 0.0002841275967599429,
"loss": 3.402171936035156,
"step": 73000
},
{
"epoch": 0.14960647659851603,
"grad_norm": 1.5707745552062988,
"learning_rate": 0.00028410594175778964,
"loss": 3.7100360107421877,
"step": 73050
},
{
"epoch": 0.14970887665094484,
"grad_norm": 0.5473312139511108,
"learning_rate": 0.0002840842728199356,
"loss": 2.539022674560547,
"step": 73100
},
{
"epoch": 0.14981127670337369,
"grad_norm": 0.8895722031593323,
"learning_rate": 0.00028406258994863245,
"loss": 2.5107452392578127,
"step": 73150
},
{
"epoch": 0.1499136767558025,
"grad_norm": 0.5692305564880371,
"learning_rate": 0.00028404089314613333,
"loss": 1.6453628540039062,
"step": 73200
},
{
"epoch": 0.1500160768082313,
"grad_norm": 0.6427550315856934,
"learning_rate": 0.00028401918241469294,
"loss": 1.628760986328125,
"step": 73250
},
{
"epoch": 0.15011847686066015,
"grad_norm": 0.705071747303009,
"learning_rate": 0.0002839974577565674,
"loss": 3.1914212036132814,
"step": 73300
},
{
"epoch": 0.15022087691308897,
"grad_norm": 0.7019383907318115,
"learning_rate": 0.0002839757191740141,
"loss": 3.628504333496094,
"step": 73350
},
{
"epoch": 0.1503232769655178,
"grad_norm": 0.8977711200714111,
"learning_rate": 0.0002839539666692921,
"loss": 3.583207092285156,
"step": 73400
},
{
"epoch": 0.15042567701794662,
"grad_norm": 0.7372389435768127,
"learning_rate": 0.00028393220024466187,
"loss": 2.969400634765625,
"step": 73450
},
{
"epoch": 0.15052807707037547,
"grad_norm": 0.7931883931159973,
"learning_rate": 0.0002839104199023853,
"loss": 3.232490234375,
"step": 73500
},
{
"epoch": 0.15063047712280428,
"grad_norm": 0.6523383259773254,
"learning_rate": 0.0002838886256447256,
"loss": 3.325892028808594,
"step": 73550
},
{
"epoch": 0.15073287717523312,
"grad_norm": 0.6729732155799866,
"learning_rate": 0.00028386681747394755,
"loss": 3.335216064453125,
"step": 73600
},
{
"epoch": 0.15083527722766193,
"grad_norm": 0.818371057510376,
"learning_rate": 0.0002838449953923174,
"loss": 3.518477783203125,
"step": 73650
},
{
"epoch": 0.15093767728009078,
"grad_norm": 0.7028401494026184,
"learning_rate": 0.00028382315940210284,
"loss": 3.509742431640625,
"step": 73700
},
{
"epoch": 0.1510400773325196,
"grad_norm": 1.2517348527908325,
"learning_rate": 0.0002838013095055729,
"loss": 3.626214599609375,
"step": 73750
},
{
"epoch": 0.15114247738494843,
"grad_norm": 0.7776418328285217,
"learning_rate": 0.00028377944570499814,
"loss": 3.3807473754882813,
"step": 73800
},
{
"epoch": 0.15124487743737725,
"grad_norm": 1.246304988861084,
"learning_rate": 0.0002837575680026506,
"loss": 3.671220703125,
"step": 73850
},
{
"epoch": 0.1513472774898061,
"grad_norm": 0.8468489050865173,
"learning_rate": 0.00028373567640080366,
"loss": 3.7939553833007813,
"step": 73900
},
{
"epoch": 0.1514496775422349,
"grad_norm": 0.9071077108383179,
"learning_rate": 0.0002837137709017322,
"loss": 4.0460302734375,
"step": 73950
},
{
"epoch": 0.15155207759466374,
"grad_norm": 0.8705784678459167,
"learning_rate": 0.00028369185150771257,
"loss": 3.0467730712890626,
"step": 74000
},
{
"epoch": 0.15165447764709256,
"grad_norm": 1.1212836503982544,
"learning_rate": 0.00028366991822102256,
"loss": 3.2574063110351563,
"step": 74050
},
{
"epoch": 0.15175687769952137,
"grad_norm": 0.6991548538208008,
"learning_rate": 0.0002836479710439413,
"loss": 3.221210632324219,
"step": 74100
},
{
"epoch": 0.1518592777519502,
"grad_norm": 0.7652693390846252,
"learning_rate": 0.00028362600997874953,
"loss": 3.4262896728515626,
"step": 74150
},
{
"epoch": 0.15196167780437903,
"grad_norm": 0.5328712463378906,
"learning_rate": 0.00028360403502772927,
"loss": 2.2504594421386717,
"step": 74200
},
{
"epoch": 0.15206407785680787,
"grad_norm": 0.623674750328064,
"learning_rate": 0.00028358204619316414,
"loss": 1.6738412475585938,
"step": 74250
},
{
"epoch": 0.15216647790923668,
"grad_norm": 0.7511982321739197,
"learning_rate": 0.0002835600434773391,
"loss": 3.5196023559570313,
"step": 74300
},
{
"epoch": 0.15226887796166552,
"grad_norm": 0.7045626640319824,
"learning_rate": 0.0002835380268825405,
"loss": 3.4125076293945313,
"step": 74350
},
{
"epoch": 0.15237127801409434,
"grad_norm": 0.688127875328064,
"learning_rate": 0.00028351599641105634,
"loss": 3.532620544433594,
"step": 74400
},
{
"epoch": 0.15247367806652318,
"grad_norm": 0.7123726606369019,
"learning_rate": 0.0002834939520651758,
"loss": 3.450240478515625,
"step": 74450
},
{
"epoch": 0.152576078118952,
"grad_norm": 0.6914170980453491,
"learning_rate": 0.0002834718938471897,
"loss": 3.7383859252929685,
"step": 74500
},
{
"epoch": 0.15267847817138083,
"grad_norm": 1.8187841176986694,
"learning_rate": 0.0002834498217593902,
"loss": 3.352877197265625,
"step": 74550
},
{
"epoch": 0.15278087822380965,
"grad_norm": 0.5876966118812561,
"learning_rate": 0.00028342773580407104,
"loss": 3.5138931274414062,
"step": 74600
},
{
"epoch": 0.1528832782762385,
"grad_norm": 0.6575474143028259,
"learning_rate": 0.00028340563598352716,
"loss": 2.800203857421875,
"step": 74650
},
{
"epoch": 0.1529856783286673,
"grad_norm": 0.9087119102478027,
"learning_rate": 0.0002833835223000551,
"loss": 3.22402587890625,
"step": 74700
},
{
"epoch": 0.15308807838109614,
"grad_norm": 0.5269556641578674,
"learning_rate": 0.0002833613947559529,
"loss": 3.43998291015625,
"step": 74750
},
{
"epoch": 0.15319047843352496,
"grad_norm": 0.7771069407463074,
"learning_rate": 0.0002833392533535198,
"loss": 3.7308123779296873,
"step": 74800
},
{
"epoch": 0.1532928784859538,
"grad_norm": 0.9969513416290283,
"learning_rate": 0.00028331709809505687,
"loss": 3.7192803955078126,
"step": 74850
},
{
"epoch": 0.1533952785383826,
"grad_norm": 0.705575704574585,
"learning_rate": 0.00028329492898286623,
"loss": 3.504131164550781,
"step": 74900
},
{
"epoch": 0.15349767859081143,
"grad_norm": 0.5487853288650513,
"learning_rate": 0.0002832727460192516,
"loss": 2.8572216796875,
"step": 74950
},
{
"epoch": 0.15360007864324027,
"grad_norm": 0.7012720108032227,
"learning_rate": 0.00028325054920651813,
"loss": 2.238103485107422,
"step": 75000
},
{
"epoch": 0.15370247869566908,
"grad_norm": 0.7673011422157288,
"learning_rate": 0.00028322833854697247,
"loss": 3.7670169067382813,
"step": 75050
},
{
"epoch": 0.15380487874809792,
"grad_norm": 0.7762806415557861,
"learning_rate": 0.00028320611404292266,
"loss": 3.65732177734375,
"step": 75100
},
{
"epoch": 0.15390727880052674,
"grad_norm": 0.8351573348045349,
"learning_rate": 0.0002831838756966781,
"loss": 3.3902908325195313,
"step": 75150
},
{
"epoch": 0.15400967885295558,
"grad_norm": 0.9058986306190491,
"learning_rate": 0.00028316162351054976,
"loss": 3.2655902099609375,
"step": 75200
},
{
"epoch": 0.1541120789053844,
"grad_norm": 0.7718729376792908,
"learning_rate": 0.0002831393574868499,
"loss": 2.826809997558594,
"step": 75250
},
{
"epoch": 0.15421447895781323,
"grad_norm": 0.8767629861831665,
"learning_rate": 0.00028311707762789255,
"loss": 3.345711975097656,
"step": 75300
},
{
"epoch": 0.15431687901024205,
"grad_norm": 0.7267951369285583,
"learning_rate": 0.00028309478393599263,
"loss": 3.440138244628906,
"step": 75350
},
{
"epoch": 0.1544192790626709,
"grad_norm": 0.8214264512062073,
"learning_rate": 0.000283072476413467,
"loss": 3.795940246582031,
"step": 75400
},
{
"epoch": 0.1545216791150997,
"grad_norm": 0.7978084087371826,
"learning_rate": 0.0002830501550626337,
"loss": 3.481332092285156,
"step": 75450
},
{
"epoch": 0.15462407916752854,
"grad_norm": 0.6108945608139038,
"learning_rate": 0.0002830278198858122,
"loss": 3.0552932739257814,
"step": 75500
},
{
"epoch": 0.15472647921995736,
"grad_norm": 0.7742500901222229,
"learning_rate": 0.0002830054708853236,
"loss": 3.446549072265625,
"step": 75550
},
{
"epoch": 0.1548288792723862,
"grad_norm": 0.8176902532577515,
"learning_rate": 0.0002829831080634903,
"loss": 3.7375308227539064,
"step": 75600
},
{
"epoch": 0.154931279324815,
"grad_norm": 0.7569313645362854,
"learning_rate": 0.00028296073142263596,
"loss": 3.7137493896484375,
"step": 75650
},
{
"epoch": 0.15503367937724385,
"grad_norm": 0.4838169515132904,
"learning_rate": 0.00028293834096508613,
"loss": 3.6915240478515625,
"step": 75700
},
{
"epoch": 0.15513607942967267,
"grad_norm": 0.669226348400116,
"learning_rate": 0.0002829159366931673,
"loss": 3.220513916015625,
"step": 75750
},
{
"epoch": 0.15523847948210148,
"grad_norm": 0.8119651079177856,
"learning_rate": 0.0002828935186092078,
"loss": 3.58036865234375,
"step": 75800
},
{
"epoch": 0.15534087953453032,
"grad_norm": 0.5975949168205261,
"learning_rate": 0.00028287108671553706,
"loss": 3.3334951782226563,
"step": 75850
},
{
"epoch": 0.15544327958695914,
"grad_norm": 0.8672727942466736,
"learning_rate": 0.0002828486410144862,
"loss": 3.343040771484375,
"step": 75900
},
{
"epoch": 0.15554567963938798,
"grad_norm": 0.7106810212135315,
"learning_rate": 0.0002828261815083877,
"loss": 3.3205633544921875,
"step": 75950
},
{
"epoch": 0.1556480796918168,
"grad_norm": 0.6641435623168945,
"learning_rate": 0.0002828037081995754,
"loss": 3.215283203125,
"step": 76000
},
{
"epoch": 0.15575047974424563,
"grad_norm": 0.8929319977760315,
"learning_rate": 0.0002827812210903846,
"loss": 2.9793209838867187,
"step": 76050
},
{
"epoch": 0.15585287979667445,
"grad_norm": 0.9121986627578735,
"learning_rate": 0.0002827587201831522,
"loss": 3.9058187866210936,
"step": 76100
},
{
"epoch": 0.1559552798491033,
"grad_norm": 0.7762316465377808,
"learning_rate": 0.00028273620548021624,
"loss": 3.6440216064453126,
"step": 76150
},
{
"epoch": 0.1560576799015321,
"grad_norm": 0.9442132711410522,
"learning_rate": 0.0002827136769839164,
"loss": 3.2715243530273437,
"step": 76200
},
{
"epoch": 0.15616007995396095,
"grad_norm": 0.8481264710426331,
"learning_rate": 0.00028269113469659373,
"loss": 3.4502252197265624,
"step": 76250
},
{
"epoch": 0.15626248000638976,
"grad_norm": 0.48445141315460205,
"learning_rate": 0.00028266857862059076,
"loss": 2.9071063232421874,
"step": 76300
},
{
"epoch": 0.1563648800588186,
"grad_norm": 0.7879681587219238,
"learning_rate": 0.00028264600875825145,
"loss": 3.08685546875,
"step": 76350
},
{
"epoch": 0.15646728011124741,
"grad_norm": 0.6723935604095459,
"learning_rate": 0.00028262342511192106,
"loss": 3.23456298828125,
"step": 76400
},
{
"epoch": 0.15656968016367626,
"grad_norm": 0.8366503119468689,
"learning_rate": 0.0002826008276839465,
"loss": 3.12440185546875,
"step": 76450
},
{
"epoch": 0.15667208021610507,
"grad_norm": 0.6648255586624146,
"learning_rate": 0.00028257821647667585,
"loss": 3.6282342529296874,
"step": 76500
},
{
"epoch": 0.1567744802685339,
"grad_norm": 0.233867809176445,
"learning_rate": 0.00028255559149245894,
"loss": 2.4692172241210937,
"step": 76550
},
{
"epoch": 0.15687688032096272,
"grad_norm": 0.9552132487297058,
"learning_rate": 0.00028253295273364675,
"loss": 2.3723199462890623,
"step": 76600
},
{
"epoch": 0.15697928037339154,
"grad_norm": 0.9119518995285034,
"learning_rate": 0.00028251030020259177,
"loss": 3.5183111572265626,
"step": 76650
},
{
"epoch": 0.15708168042582038,
"grad_norm": 0.7538524866104126,
"learning_rate": 0.00028248763390164807,
"loss": 3.69493896484375,
"step": 76700
},
{
"epoch": 0.1571840804782492,
"grad_norm": 0.9746021628379822,
"learning_rate": 0.00028246495383317093,
"loss": 2.4459327697753905,
"step": 76750
},
{
"epoch": 0.15728648053067804,
"grad_norm": 1.0355682373046875,
"learning_rate": 0.0002824422599995172,
"loss": 3.1521530151367188,
"step": 76800
},
{
"epoch": 0.15738888058310685,
"grad_norm": 1.0927115678787231,
"learning_rate": 0.00028241955240304513,
"loss": 3.066300048828125,
"step": 76850
},
{
"epoch": 0.1574912806355357,
"grad_norm": 0.9498497247695923,
"learning_rate": 0.00028239683104611433,
"loss": 3.6030181884765624,
"step": 76900
},
{
"epoch": 0.1575936806879645,
"grad_norm": 0.6711775660514832,
"learning_rate": 0.00028237409593108605,
"loss": 2.9892807006835938,
"step": 76950
},
{
"epoch": 0.15769608074039335,
"grad_norm": 0.8505134582519531,
"learning_rate": 0.00028235134706032267,
"loss": 3.558472900390625,
"step": 77000
},
{
"epoch": 0.15779848079282216,
"grad_norm": 0.7711685299873352,
"learning_rate": 0.0002823285844361883,
"loss": 3.2466412353515626,
"step": 77050
},
{
"epoch": 0.157900880845251,
"grad_norm": 0.8435817360877991,
"learning_rate": 0.00028230580806104814,
"loss": 3.2454754638671877,
"step": 77100
},
{
"epoch": 0.15800328089767982,
"grad_norm": 0.7799451947212219,
"learning_rate": 0.00028228301793726916,
"loss": 3.4074356079101564,
"step": 77150
},
{
"epoch": 0.15810568095010866,
"grad_norm": 0.7728955149650574,
"learning_rate": 0.0002822602140672196,
"loss": 3.511580505371094,
"step": 77200
},
{
"epoch": 0.15820808100253747,
"grad_norm": 0.7063544988632202,
"learning_rate": 0.0002822373964532691,
"loss": 3.2738442993164063,
"step": 77250
},
{
"epoch": 0.1583104810549663,
"grad_norm": 0.7023544907569885,
"learning_rate": 0.00028221456509778875,
"loss": 3.7345950317382814,
"step": 77300
},
{
"epoch": 0.15841288110739513,
"grad_norm": 0.9115304946899414,
"learning_rate": 0.0002821917200031511,
"loss": 3.140256042480469,
"step": 77350
},
{
"epoch": 0.15851528115982397,
"grad_norm": 0.7438466548919678,
"learning_rate": 0.00028216886117173013,
"loss": 3.7709716796875,
"step": 77400
},
{
"epoch": 0.15861768121225278,
"grad_norm": 0.7658351063728333,
"learning_rate": 0.0002821459886059013,
"loss": 2.747354736328125,
"step": 77450
},
{
"epoch": 0.1587200812646816,
"grad_norm": 0.6408258676528931,
"learning_rate": 0.0002821231023080412,
"loss": 3.999375915527344,
"step": 77500
},
{
"epoch": 0.15882248131711044,
"grad_norm": 0.772813081741333,
"learning_rate": 0.0002821002022805283,
"loss": 3.42555419921875,
"step": 77550
},
{
"epoch": 0.15892488136953925,
"grad_norm": 0.950205385684967,
"learning_rate": 0.0002820772885257422,
"loss": 3.6866400146484377,
"step": 77600
},
{
"epoch": 0.1590272814219681,
"grad_norm": 0.5234514474868774,
"learning_rate": 0.000282054361046064,
"loss": 2.6636093139648436,
"step": 77650
},
{
"epoch": 0.1591296814743969,
"grad_norm": 0.7165791988372803,
"learning_rate": 0.0002820314198438761,
"loss": 3.308489990234375,
"step": 77700
},
{
"epoch": 0.15923208152682575,
"grad_norm": 0.7389218211174011,
"learning_rate": 0.00028200846492156266,
"loss": 3.701646728515625,
"step": 77750
},
{
"epoch": 0.15933448157925456,
"grad_norm": 0.6821298599243164,
"learning_rate": 0.0002819854962815089,
"loss": 3.6112545776367186,
"step": 77800
},
{
"epoch": 0.1594368816316834,
"grad_norm": 0.6218832731246948,
"learning_rate": 0.00028196251392610173,
"loss": 3.4739862060546876,
"step": 77850
},
{
"epoch": 0.15953928168411222,
"grad_norm": 0.7211093902587891,
"learning_rate": 0.00028193951785772923,
"loss": 3.646156921386719,
"step": 77900
},
{
"epoch": 0.15964168173654106,
"grad_norm": 0.9792724847793579,
"learning_rate": 0.00028191650807878125,
"loss": 2.4151596069335937,
"step": 77950
},
{
"epoch": 0.15974408178896987,
"grad_norm": 0.6804146766662598,
"learning_rate": 0.0002818934845916487,
"loss": 2.791448059082031,
"step": 78000
},
{
"epoch": 0.1598464818413987,
"grad_norm": 0.7963101863861084,
"learning_rate": 0.0002818704473987241,
"loss": 2.887415771484375,
"step": 78050
},
{
"epoch": 0.15994888189382753,
"grad_norm": 1.2759873867034912,
"learning_rate": 0.00028184739650240144,
"loss": 3.3000274658203126,
"step": 78100
},
{
"epoch": 0.16005128194625637,
"grad_norm": 0.9473127126693726,
"learning_rate": 0.0002818243319050761,
"loss": 3.280038146972656,
"step": 78150
},
{
"epoch": 0.16015368199868518,
"grad_norm": 1.025072693824768,
"learning_rate": 0.0002818012536091447,
"loss": 3.278867492675781,
"step": 78200
},
{
"epoch": 0.16025608205111402,
"grad_norm": 0.6705909967422485,
"learning_rate": 0.00028177816161700553,
"loss": 3.659829406738281,
"step": 78250
},
{
"epoch": 0.16035848210354284,
"grad_norm": 0.5486139059066772,
"learning_rate": 0.00028175505593105825,
"loss": 2.785064697265625,
"step": 78300
},
{
"epoch": 0.16046088215597165,
"grad_norm": 0.7707447409629822,
"learning_rate": 0.0002817319365537038,
"loss": 3.8123992919921874,
"step": 78350
},
{
"epoch": 0.1605632822084005,
"grad_norm": 0.8866334557533264,
"learning_rate": 0.0002817088034873448,
"loss": 3.45692626953125,
"step": 78400
},
{
"epoch": 0.1606656822608293,
"grad_norm": 0.6880883574485779,
"learning_rate": 0.0002816856567343849,
"loss": 3.526729736328125,
"step": 78450
},
{
"epoch": 0.16076808231325815,
"grad_norm": 0.6768396496772766,
"learning_rate": 0.00028166249629722956,
"loss": 3.4408908081054688,
"step": 78500
},
{
"epoch": 0.16087048236568696,
"grad_norm": 0.9719237685203552,
"learning_rate": 0.0002816393221782856,
"loss": 2.7149722290039064,
"step": 78550
},
{
"epoch": 0.1609728824181158,
"grad_norm": 0.9855000376701355,
"learning_rate": 0.000281616134379961,
"loss": 3.5189361572265625,
"step": 78600
},
{
"epoch": 0.16107528247054462,
"grad_norm": 0.8920623660087585,
"learning_rate": 0.0002815929329046654,
"loss": 3.5056884765625,
"step": 78650
},
{
"epoch": 0.16117768252297346,
"grad_norm": 1.0610677003860474,
"learning_rate": 0.0002815697177548098,
"loss": 3.097916259765625,
"step": 78700
},
{
"epoch": 0.16128008257540227,
"grad_norm": 0.8963422775268555,
"learning_rate": 0.0002815464889328066,
"loss": 3.181166076660156,
"step": 78750
},
{
"epoch": 0.16138248262783111,
"grad_norm": 0.7184786200523376,
"learning_rate": 0.00028152324644106964,
"loss": 3.4025540161132812,
"step": 78800
},
{
"epoch": 0.16148488268025993,
"grad_norm": 0.8972223401069641,
"learning_rate": 0.00028149999028201426,
"loss": 3.6795730590820312,
"step": 78850
},
{
"epoch": 0.16158728273268877,
"grad_norm": 0.7884389758110046,
"learning_rate": 0.000281476720458057,
"loss": 3.221437683105469,
"step": 78900
},
{
"epoch": 0.16168968278511758,
"grad_norm": 0.5186575651168823,
"learning_rate": 0.00028145343697161604,
"loss": 3.423157043457031,
"step": 78950
},
{
"epoch": 0.16179208283754642,
"grad_norm": 0.6838876605033875,
"learning_rate": 0.0002814301398251109,
"loss": 2.6037109375,
"step": 79000
},
{
"epoch": 0.16189448288997524,
"grad_norm": 0.7351782321929932,
"learning_rate": 0.0002814068290209625,
"loss": 3.258736877441406,
"step": 79050
},
{
"epoch": 0.16199688294240408,
"grad_norm": 0.8734768033027649,
"learning_rate": 0.00028138350456159315,
"loss": 3.6974835205078125,
"step": 79100
},
{
"epoch": 0.1620992829948329,
"grad_norm": 0.7983621954917908,
"learning_rate": 0.00028136016644942665,
"loss": 3.60543701171875,
"step": 79150
},
{
"epoch": 0.1622016830472617,
"grad_norm": 0.6577921509742737,
"learning_rate": 0.0002813368146868883,
"loss": 3.336464538574219,
"step": 79200
},
{
"epoch": 0.16230408309969055,
"grad_norm": 0.6782569289207458,
"learning_rate": 0.0002813134492764046,
"loss": 3.735032043457031,
"step": 79250
},
{
"epoch": 0.16240648315211936,
"grad_norm": 0.8127371072769165,
"learning_rate": 0.0002812900702204036,
"loss": 3.584259033203125,
"step": 79300
},
{
"epoch": 0.1625088832045482,
"grad_norm": 0.8227265477180481,
"learning_rate": 0.00028126667752131473,
"loss": 2.95750244140625,
"step": 79350
},
{
"epoch": 0.16261128325697702,
"grad_norm": 0.8162257671356201,
"learning_rate": 0.00028124327118156893,
"loss": 3.5253372192382812,
"step": 79400
},
{
"epoch": 0.16271368330940586,
"grad_norm": 0.7429577708244324,
"learning_rate": 0.0002812198512035984,
"loss": 3.1853790283203125,
"step": 79450
},
{
"epoch": 0.16281608336183467,
"grad_norm": 0.7135078310966492,
"learning_rate": 0.00028119641758983695,
"loss": 3.3338772583007814,
"step": 79500
},
{
"epoch": 0.16291848341426352,
"grad_norm": 0.8013560175895691,
"learning_rate": 0.00028117297034271953,
"loss": 3.761092224121094,
"step": 79550
},
{
"epoch": 0.16302088346669233,
"grad_norm": 0.678429126739502,
"learning_rate": 0.0002811495094646828,
"loss": 3.4881576538085937,
"step": 79600
},
{
"epoch": 0.16312328351912117,
"grad_norm": 0.9312568306922913,
"learning_rate": 0.0002811260349581647,
"loss": 3.203521728515625,
"step": 79650
},
{
"epoch": 0.16322568357154998,
"grad_norm": 0.8246340751647949,
"learning_rate": 0.0002811025468256046,
"loss": 3.5164459228515623,
"step": 79700
},
{
"epoch": 0.16332808362397883,
"grad_norm": 1.7668465375900269,
"learning_rate": 0.00028107904506944324,
"loss": 2.763003234863281,
"step": 79750
},
{
"epoch": 0.16343048367640764,
"grad_norm": 0.4166733920574188,
"learning_rate": 0.00028105552969212284,
"loss": 2.9914471435546877,
"step": 79800
},
{
"epoch": 0.16353288372883648,
"grad_norm": 0.6907941699028015,
"learning_rate": 0.0002810320006960871,
"loss": 3.1851446533203127,
"step": 79850
},
{
"epoch": 0.1636352837812653,
"grad_norm": 0.6729636192321777,
"learning_rate": 0.00028100845808378083,
"loss": 3.1257308959960937,
"step": 79900
},
{
"epoch": 0.16373768383369414,
"grad_norm": 0.7356630563735962,
"learning_rate": 0.0002809849018576507,
"loss": 3.1083673095703124,
"step": 79950
},
{
"epoch": 0.16384008388612295,
"grad_norm": 0.9909185767173767,
"learning_rate": 0.00028096133202014443,
"loss": 3.5541717529296877,
"step": 80000
},
{
"epoch": 0.16394248393855176,
"grad_norm": 0.4952726662158966,
"learning_rate": 0.00028093774857371146,
"loss": 2.9160995483398438,
"step": 80050
},
{
"epoch": 0.1640448839909806,
"grad_norm": 0.6858778595924377,
"learning_rate": 0.00028091415152080225,
"loss": 3.611160888671875,
"step": 80100
},
{
"epoch": 0.16414728404340942,
"grad_norm": 0.6998670101165771,
"learning_rate": 0.0002808905408638691,
"loss": 3.506941833496094,
"step": 80150
},
{
"epoch": 0.16424968409583826,
"grad_norm": 0.8609181642532349,
"learning_rate": 0.0002808669166053654,
"loss": 3.5502630615234376,
"step": 80200
},
{
"epoch": 0.16435208414826707,
"grad_norm": 0.7192445993423462,
"learning_rate": 0.00028084327874774615,
"loss": 3.56413330078125,
"step": 80250
},
{
"epoch": 0.16445448420069592,
"grad_norm": 0.7595858573913574,
"learning_rate": 0.0002808196272934676,
"loss": 3.339110107421875,
"step": 80300
},
{
"epoch": 0.16455688425312473,
"grad_norm": 0.7111859321594238,
"learning_rate": 0.0002807959622449877,
"loss": 3.051424560546875,
"step": 80350
},
{
"epoch": 0.16465928430555357,
"grad_norm": 0.6211318373680115,
"learning_rate": 0.00028077228360476537,
"loss": 3.5583587646484376,
"step": 80400
},
{
"epoch": 0.16476168435798239,
"grad_norm": 0.783703625202179,
"learning_rate": 0.00028074859137526136,
"loss": 2.740151062011719,
"step": 80450
},
{
"epoch": 0.16486408441041123,
"grad_norm": 0.30799928307533264,
"learning_rate": 0.0002807248855589376,
"loss": 2.932861633300781,
"step": 80500
},
{
"epoch": 0.16496648446284004,
"grad_norm": 0.9074276089668274,
"learning_rate": 0.0002807011661582575,
"loss": 2.6673263549804687,
"step": 80550
},
{
"epoch": 0.16506888451526888,
"grad_norm": 0.680355429649353,
"learning_rate": 0.00028067743317568587,
"loss": 3.8975335693359376,
"step": 80600
},
{
"epoch": 0.1651712845676977,
"grad_norm": 1.1440931558609009,
"learning_rate": 0.000280653686613689,
"loss": 3.4063519287109374,
"step": 80650
},
{
"epoch": 0.16527368462012654,
"grad_norm": 0.7195169925689697,
"learning_rate": 0.00028062992647473445,
"loss": 3.4463735961914064,
"step": 80700
},
{
"epoch": 0.16537608467255535,
"grad_norm": 2.4933488368988037,
"learning_rate": 0.0002806061527612913,
"loss": 2.4636448669433593,
"step": 80750
},
{
"epoch": 0.1654784847249842,
"grad_norm": 0.8146184086799622,
"learning_rate": 0.00028058236547582997,
"loss": 3.016216125488281,
"step": 80800
},
{
"epoch": 0.165580884777413,
"grad_norm": 0.7645831108093262,
"learning_rate": 0.0002805585646208224,
"loss": 3.7815518188476562,
"step": 80850
},
{
"epoch": 0.16568328482984182,
"grad_norm": 0.5321808457374573,
"learning_rate": 0.00028053475019874187,
"loss": 3.1232025146484377,
"step": 80900
},
{
"epoch": 0.16578568488227066,
"grad_norm": 0.5005676746368408,
"learning_rate": 0.000280510922212063,
"loss": 2.5462303161621094,
"step": 80950
},
{
"epoch": 0.16588808493469948,
"grad_norm": 0.5022799372673035,
"learning_rate": 0.00028048708066326193,
"loss": 2.064752502441406,
"step": 81000
},
{
"epoch": 0.16599048498712832,
"grad_norm": 0.40124621987342834,
"learning_rate": 0.0002804632255548162,
"loss": 3.146656799316406,
"step": 81050
},
{
"epoch": 0.16609288503955713,
"grad_norm": 0.45695436000823975,
"learning_rate": 0.00028043935688920466,
"loss": 1.9398663330078125,
"step": 81100
},
{
"epoch": 0.16619528509198597,
"grad_norm": 0.9878008365631104,
"learning_rate": 0.0002804154746689077,
"loss": 3.3690643310546875,
"step": 81150
},
{
"epoch": 0.1662976851444148,
"grad_norm": 0.9054487943649292,
"learning_rate": 0.000280391578896407,
"loss": 3.499583740234375,
"step": 81200
},
{
"epoch": 0.16640008519684363,
"grad_norm": 0.7078624367713928,
"learning_rate": 0.00028036766957418576,
"loss": 3.645855712890625,
"step": 81250
},
{
"epoch": 0.16650248524927244,
"grad_norm": 0.8132025003433228,
"learning_rate": 0.0002803437467047285,
"loss": 2.701116027832031,
"step": 81300
},
{
"epoch": 0.16660488530170128,
"grad_norm": 0.6939849257469177,
"learning_rate": 0.00028031981029052116,
"loss": 3.399428405761719,
"step": 81350
},
{
"epoch": 0.1667072853541301,
"grad_norm": 0.7499716877937317,
"learning_rate": 0.00028029586033405114,
"loss": 3.7939776611328124,
"step": 81400
},
{
"epoch": 0.16680968540655894,
"grad_norm": 0.6567860245704651,
"learning_rate": 0.00028027189683780716,
"loss": 3.5192059326171874,
"step": 81450
},
{
"epoch": 0.16691208545898775,
"grad_norm": 0.5446729063987732,
"learning_rate": 0.0002802479198042795,
"loss": 4.063154602050782,
"step": 81500
},
{
"epoch": 0.1670144855114166,
"grad_norm": 1.0988305807113647,
"learning_rate": 0.00028022392923595973,
"loss": 3.8179937744140626,
"step": 81550
},
{
"epoch": 0.1671168855638454,
"grad_norm": 0.7563129663467407,
"learning_rate": 0.00028019992513534075,
"loss": 3.4095263671875,
"step": 81600
},
{
"epoch": 0.16721928561627425,
"grad_norm": 0.8743115067481995,
"learning_rate": 0.0002801759075049171,
"loss": 3.422995300292969,
"step": 81650
},
{
"epoch": 0.16732168566870306,
"grad_norm": 0.7077412605285645,
"learning_rate": 0.0002801518763471844,
"loss": 3.8168359375,
"step": 81700
},
{
"epoch": 0.16742408572113188,
"grad_norm": 0.5643756985664368,
"learning_rate": 0.00028012783166464,
"loss": 3.531881103515625,
"step": 81750
},
{
"epoch": 0.16752648577356072,
"grad_norm": 0.5498968362808228,
"learning_rate": 0.0002801037734597825,
"loss": 3.502477722167969,
"step": 81800
},
{
"epoch": 0.16762888582598953,
"grad_norm": 0.6478745937347412,
"learning_rate": 0.00028007970173511194,
"loss": 3.3485955810546875,
"step": 81850
},
{
"epoch": 0.16773128587841837,
"grad_norm": 0.837211012840271,
"learning_rate": 0.0002800556164931297,
"loss": 2.738769836425781,
"step": 81900
},
{
"epoch": 0.1678336859308472,
"grad_norm": 0.8199461102485657,
"learning_rate": 0.0002800315177363386,
"loss": 3.374551086425781,
"step": 81950
},
{
"epoch": 0.16793608598327603,
"grad_norm": 0.6247820258140564,
"learning_rate": 0.00028000740546724293,
"loss": 3.48384521484375,
"step": 82000
},
{
"epoch": 0.16803848603570484,
"grad_norm": 0.6754735708236694,
"learning_rate": 0.0002799832796883483,
"loss": 3.0733773803710935,
"step": 82050
},
{
"epoch": 0.16814088608813368,
"grad_norm": 0.7133215069770813,
"learning_rate": 0.0002799591404021617,
"loss": 3.0067816162109375,
"step": 82100
},
{
"epoch": 0.1682432861405625,
"grad_norm": 0.7670831680297852,
"learning_rate": 0.0002799349876111918,
"loss": 3.1390863037109376,
"step": 82150
},
{
"epoch": 0.16834568619299134,
"grad_norm": 0.4578395485877991,
"learning_rate": 0.0002799108213179482,
"loss": 2.519589691162109,
"step": 82200
},
{
"epoch": 0.16844808624542015,
"grad_norm": 0.6592589020729065,
"learning_rate": 0.0002798866415249422,
"loss": 2.3550537109375,
"step": 82250
},
{
"epoch": 0.168550486297849,
"grad_norm": 0.3382033407688141,
"learning_rate": 0.0002798624482346866,
"loss": 3.0852642822265626,
"step": 82300
},
{
"epoch": 0.1686528863502778,
"grad_norm": 0.5380098819732666,
"learning_rate": 0.00027983824144969533,
"loss": 2.1543919372558595,
"step": 82350
},
{
"epoch": 0.16875528640270665,
"grad_norm": 0.7728050351142883,
"learning_rate": 0.0002798140211724839,
"loss": 2.7856259155273437,
"step": 82400
},
{
"epoch": 0.16885768645513546,
"grad_norm": 0.7540838718414307,
"learning_rate": 0.00027978978740556915,
"loss": 3.5116085815429687,
"step": 82450
},
{
"epoch": 0.1689600865075643,
"grad_norm": 1.1563106775283813,
"learning_rate": 0.0002797655401514693,
"loss": 3.811059875488281,
"step": 82500
}
],
"logging_steps": 50,
"max_steps": 488281,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.6106783358976e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}