ContextLM / contextlm_gpt2_large /trainer_state.json
daibeiya's picture
model upload
4d46bcb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999709884243814,
"eval_steps": 1000,
"global_step": 17234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000580231512373437,
"grad_norm": 4.4758100509643555,
"learning_rate": 6.264501160092807e-06,
"loss": 10.4749,
"step": 10
},
{
"epoch": 0.001160463024746874,
"grad_norm": 1.6773627996444702,
"learning_rate": 1.322505800464037e-05,
"loss": 9.159,
"step": 20
},
{
"epoch": 0.001740694537120311,
"grad_norm": 1.5999170541763306,
"learning_rate": 2.018561484918793e-05,
"loss": 8.8189,
"step": 30
},
{
"epoch": 0.002320926049493748,
"grad_norm": 1.9260104894638062,
"learning_rate": 2.7146171693735496e-05,
"loss": 8.4574,
"step": 40
},
{
"epoch": 0.002901157561867185,
"grad_norm": 2.173593282699585,
"learning_rate": 3.410672853828306e-05,
"loss": 8.0835,
"step": 50
},
{
"epoch": 0.003481389074240622,
"grad_norm": 1.5830281972885132,
"learning_rate": 4.1067285382830626e-05,
"loss": 7.7376,
"step": 60
},
{
"epoch": 0.004061620586614059,
"grad_norm": 2.772728443145752,
"learning_rate": 4.802784222737819e-05,
"loss": 7.4168,
"step": 70
},
{
"epoch": 0.004641852098987496,
"grad_norm": 1.511775016784668,
"learning_rate": 5.498839907192575e-05,
"loss": 7.1442,
"step": 80
},
{
"epoch": 0.005222083611360933,
"grad_norm": 1.9058183431625366,
"learning_rate": 6.194895591647331e-05,
"loss": 6.9324,
"step": 90
},
{
"epoch": 0.00580231512373437,
"grad_norm": 1.6976985931396484,
"learning_rate": 6.890951276102087e-05,
"loss": 6.8005,
"step": 100
},
{
"epoch": 0.006382546636107807,
"grad_norm": 1.4346176385879517,
"learning_rate": 7.587006960556844e-05,
"loss": 6.6814,
"step": 110
},
{
"epoch": 0.006962778148481244,
"grad_norm": 1.0364270210266113,
"learning_rate": 8.283062645011599e-05,
"loss": 6.5547,
"step": 120
},
{
"epoch": 0.007543009660854681,
"grad_norm": 0.6528536677360535,
"learning_rate": 8.979118329466357e-05,
"loss": 6.4482,
"step": 130
},
{
"epoch": 0.008123241173228117,
"grad_norm": 1.1468390226364136,
"learning_rate": 9.675174013921112e-05,
"loss": 6.3518,
"step": 140
},
{
"epoch": 0.008703472685601555,
"grad_norm": 0.6249582171440125,
"learning_rate": 0.0001037122969837587,
"loss": 6.2749,
"step": 150
},
{
"epoch": 0.009283704197974993,
"grad_norm": 0.9577043652534485,
"learning_rate": 0.00011067285382830626,
"loss": 6.2026,
"step": 160
},
{
"epoch": 0.009863935710348428,
"grad_norm": 1.156731367111206,
"learning_rate": 0.00011763341067285381,
"loss": 6.1482,
"step": 170
},
{
"epoch": 0.010444167222721866,
"grad_norm": 0.7919487357139587,
"learning_rate": 0.0001245939675174014,
"loss": 6.0907,
"step": 180
},
{
"epoch": 0.011024398735095304,
"grad_norm": 0.5902596712112427,
"learning_rate": 0.00013155452436194894,
"loss": 6.0469,
"step": 190
},
{
"epoch": 0.01160463024746874,
"grad_norm": 0.9712298512458801,
"learning_rate": 0.00013851508120649652,
"loss": 6.0128,
"step": 200
},
{
"epoch": 0.012184861759842177,
"grad_norm": 0.6487208008766174,
"learning_rate": 0.00014547563805104407,
"loss": 5.949,
"step": 210
},
{
"epoch": 0.012765093272215615,
"grad_norm": 0.6659431457519531,
"learning_rate": 0.00015243619489559162,
"loss": 5.9004,
"step": 220
},
{
"epoch": 0.01334532478458905,
"grad_norm": 0.9973188042640686,
"learning_rate": 0.0001593967517401392,
"loss": 5.8727,
"step": 230
},
{
"epoch": 0.013925556296962488,
"grad_norm": 0.592413067817688,
"learning_rate": 0.00016635730858468675,
"loss": 5.8594,
"step": 240
},
{
"epoch": 0.014505787809335926,
"grad_norm": 0.6143619418144226,
"learning_rate": 0.00017331786542923433,
"loss": 5.8114,
"step": 250
},
{
"epoch": 0.015086019321709361,
"grad_norm": 0.5780689120292664,
"learning_rate": 0.00018027842227378188,
"loss": 5.7829,
"step": 260
},
{
"epoch": 0.0156662508340828,
"grad_norm": 0.41307076811790466,
"learning_rate": 0.00018723897911832944,
"loss": 5.7197,
"step": 270
},
{
"epoch": 0.016246482346456235,
"grad_norm": 0.6880993247032166,
"learning_rate": 0.00019419953596287701,
"loss": 5.7168,
"step": 280
},
{
"epoch": 0.016826713858829674,
"grad_norm": 0.4273562431335449,
"learning_rate": 0.0002011600928074246,
"loss": 5.6639,
"step": 290
},
{
"epoch": 0.01740694537120311,
"grad_norm": 0.5025382041931152,
"learning_rate": 0.00020812064965197212,
"loss": 5.6305,
"step": 300
},
{
"epoch": 0.017987176883576546,
"grad_norm": 0.7127647995948792,
"learning_rate": 0.0002150812064965197,
"loss": 5.5991,
"step": 310
},
{
"epoch": 0.018567408395949985,
"grad_norm": 0.6494776010513306,
"learning_rate": 0.00022204176334106727,
"loss": 5.5961,
"step": 320
},
{
"epoch": 0.01914763990832342,
"grad_norm": 0.43809765577316284,
"learning_rate": 0.00022900232018561485,
"loss": 5.5242,
"step": 330
},
{
"epoch": 0.019727871420696857,
"grad_norm": 0.5514947175979614,
"learning_rate": 0.00023596287703016238,
"loss": 5.4885,
"step": 340
},
{
"epoch": 0.020308102933070296,
"grad_norm": 0.7086557745933533,
"learning_rate": 0.00024292343387470995,
"loss": 5.4558,
"step": 350
},
{
"epoch": 0.020888334445443732,
"grad_norm": 0.44333210587501526,
"learning_rate": 0.0002498839907192575,
"loss": 5.4249,
"step": 360
},
{
"epoch": 0.021468565957817168,
"grad_norm": 0.5971847772598267,
"learning_rate": 0.0002568445475638051,
"loss": 5.3896,
"step": 370
},
{
"epoch": 0.022048797470190607,
"grad_norm": 0.5358195900917053,
"learning_rate": 0.0002638051044083526,
"loss": 5.3647,
"step": 380
},
{
"epoch": 0.022629028982564043,
"grad_norm": 0.4231407046318054,
"learning_rate": 0.0002707656612529002,
"loss": 5.3325,
"step": 390
},
{
"epoch": 0.02320926049493748,
"grad_norm": 0.48789191246032715,
"learning_rate": 0.00027772621809744777,
"loss": 5.2922,
"step": 400
},
{
"epoch": 0.023789492007310918,
"grad_norm": 0.46154582500457764,
"learning_rate": 0.0002846867749419953,
"loss": 5.2881,
"step": 410
},
{
"epoch": 0.024369723519684354,
"grad_norm": 0.44972172379493713,
"learning_rate": 0.00029164733178654287,
"loss": 5.2397,
"step": 420
},
{
"epoch": 0.02494995503205779,
"grad_norm": 0.505415678024292,
"learning_rate": 0.0002986078886310905,
"loss": 5.1841,
"step": 430
},
{
"epoch": 0.02553018654443123,
"grad_norm": 0.42717623710632324,
"learning_rate": 0.0003055684454756381,
"loss": 5.1848,
"step": 440
},
{
"epoch": 0.026110418056804665,
"grad_norm": 0.4216056168079376,
"learning_rate": 0.0003125290023201856,
"loss": 5.1447,
"step": 450
},
{
"epoch": 0.0266906495691781,
"grad_norm": 0.5051509141921997,
"learning_rate": 0.00031948955916473313,
"loss": 5.1084,
"step": 460
},
{
"epoch": 0.02727088108155154,
"grad_norm": 0.5205376744270325,
"learning_rate": 0.0003264501160092807,
"loss": 5.0462,
"step": 470
},
{
"epoch": 0.027851112593924976,
"grad_norm": 0.5111084580421448,
"learning_rate": 0.0003334106728538283,
"loss": 5.0225,
"step": 480
},
{
"epoch": 0.028431344106298412,
"grad_norm": 0.4395337402820587,
"learning_rate": 0.00034037122969837584,
"loss": 4.991,
"step": 490
},
{
"epoch": 0.02901157561867185,
"grad_norm": 0.2879785895347595,
"learning_rate": 0.00034733178654292344,
"loss": 4.9628,
"step": 500
},
{
"epoch": 0.029591807131045287,
"grad_norm": 0.3356530964374542,
"learning_rate": 0.000354292343387471,
"loss": 4.9165,
"step": 510
},
{
"epoch": 0.030172038643418723,
"grad_norm": 0.39410287141799927,
"learning_rate": 0.00036125290023201855,
"loss": 4.8802,
"step": 520
},
{
"epoch": 0.030752270155792162,
"grad_norm": 0.4210626184940338,
"learning_rate": 0.00036821345707656604,
"loss": 4.8403,
"step": 530
},
{
"epoch": 0.0313325016681656,
"grad_norm": 0.4170067608356476,
"learning_rate": 0.00037517401392111365,
"loss": 4.8156,
"step": 540
},
{
"epoch": 0.031912733180539034,
"grad_norm": 0.40876781940460205,
"learning_rate": 0.0003821345707656612,
"loss": 4.7932,
"step": 550
},
{
"epoch": 0.03249296469291247,
"grad_norm": 0.3717671036720276,
"learning_rate": 0.0003890951276102088,
"loss": 4.7812,
"step": 560
},
{
"epoch": 0.03307319620528591,
"grad_norm": 0.37275081872940063,
"learning_rate": 0.00039605568445475636,
"loss": 4.7324,
"step": 570
},
{
"epoch": 0.03365342771765935,
"grad_norm": 0.32523536682128906,
"learning_rate": 0.0004030162412993039,
"loss": 4.6891,
"step": 580
},
{
"epoch": 0.034233659230032784,
"grad_norm": 0.2909957468509674,
"learning_rate": 0.0004099767981438515,
"loss": 4.6555,
"step": 590
},
{
"epoch": 0.03481389074240622,
"grad_norm": 0.40268951654434204,
"learning_rate": 0.00041693735498839906,
"loss": 4.622,
"step": 600
},
{
"epoch": 0.035394122254779656,
"grad_norm": 0.433383584022522,
"learning_rate": 0.00042389791183294656,
"loss": 4.6122,
"step": 610
},
{
"epoch": 0.03597435376715309,
"grad_norm": 0.3096088171005249,
"learning_rate": 0.0004308584686774941,
"loss": 4.5976,
"step": 620
},
{
"epoch": 0.036554585279526534,
"grad_norm": 0.30540433526039124,
"learning_rate": 0.0004378190255220417,
"loss": 4.5569,
"step": 630
},
{
"epoch": 0.03713481679189997,
"grad_norm": 0.3136671781539917,
"learning_rate": 0.00044477958236658927,
"loss": 4.5228,
"step": 640
},
{
"epoch": 0.037715048304273406,
"grad_norm": 0.332621693611145,
"learning_rate": 0.0004517401392111369,
"loss": 4.4901,
"step": 650
},
{
"epoch": 0.03829527981664684,
"grad_norm": 0.3817736804485321,
"learning_rate": 0.0004587006960556844,
"loss": 4.475,
"step": 660
},
{
"epoch": 0.03887551132902028,
"grad_norm": 0.458741158246994,
"learning_rate": 0.000465661252900232,
"loss": 4.4545,
"step": 670
},
{
"epoch": 0.039455742841393714,
"grad_norm": 0.27561265230178833,
"learning_rate": 0.0004726218097447796,
"loss": 4.4406,
"step": 680
},
{
"epoch": 0.040035974353767156,
"grad_norm": 0.380633145570755,
"learning_rate": 0.0004795823665893271,
"loss": 4.4027,
"step": 690
},
{
"epoch": 0.04061620586614059,
"grad_norm": 0.3662358820438385,
"learning_rate": 0.00048654292343387463,
"loss": 4.377,
"step": 700
},
{
"epoch": 0.04119643737851403,
"grad_norm": 0.31104594469070435,
"learning_rate": 0.0004935034802784222,
"loss": 4.3399,
"step": 710
},
{
"epoch": 0.041776668890887464,
"grad_norm": 0.43897074460983276,
"learning_rate": 0.0005004640371229698,
"loss": 4.3229,
"step": 720
},
{
"epoch": 0.0423569004032609,
"grad_norm": 0.2685506343841553,
"learning_rate": 0.0005074245939675173,
"loss": 4.302,
"step": 730
},
{
"epoch": 0.042937131915634336,
"grad_norm": 0.2662206292152405,
"learning_rate": 0.0005143851508120649,
"loss": 4.2533,
"step": 740
},
{
"epoch": 0.04351736342800778,
"grad_norm": 0.31665244698524475,
"learning_rate": 0.0005213457076566126,
"loss": 4.2463,
"step": 750
},
{
"epoch": 0.044097594940381214,
"grad_norm": 0.3573771119117737,
"learning_rate": 0.0005283062645011601,
"loss": 4.2177,
"step": 760
},
{
"epoch": 0.04467782645275465,
"grad_norm": 0.3051789402961731,
"learning_rate": 0.0005352668213457077,
"loss": 4.2098,
"step": 770
},
{
"epoch": 0.045258057965128086,
"grad_norm": 0.26946839690208435,
"learning_rate": 0.0005422273781902551,
"loss": 4.1739,
"step": 780
},
{
"epoch": 0.04583828947750152,
"grad_norm": 0.21327945590019226,
"learning_rate": 0.0005491879350348028,
"loss": 4.151,
"step": 790
},
{
"epoch": 0.04641852098987496,
"grad_norm": 0.28413307666778564,
"learning_rate": 0.0005561484918793503,
"loss": 4.1455,
"step": 800
},
{
"epoch": 0.0469987525022484,
"grad_norm": 0.2847752869129181,
"learning_rate": 0.0005631090487238979,
"loss": 4.1166,
"step": 810
},
{
"epoch": 0.047578984014621836,
"grad_norm": 0.25382527709007263,
"learning_rate": 0.0005700696055684454,
"loss": 4.0986,
"step": 820
},
{
"epoch": 0.04815921552699527,
"grad_norm": 0.2375078797340393,
"learning_rate": 0.000577030162412993,
"loss": 4.0765,
"step": 830
},
{
"epoch": 0.04873944703936871,
"grad_norm": 0.3032638430595398,
"learning_rate": 0.0005839907192575406,
"loss": 4.085,
"step": 840
},
{
"epoch": 0.049319678551742144,
"grad_norm": 0.2454582005739212,
"learning_rate": 0.0005909512761020882,
"loss": 4.0505,
"step": 850
},
{
"epoch": 0.04989991006411558,
"grad_norm": 0.23829826712608337,
"learning_rate": 0.0005979118329466356,
"loss": 4.0391,
"step": 860
},
{
"epoch": 0.05048014157648902,
"grad_norm": 0.29694074392318726,
"learning_rate": 0.0005999997293652579,
"loss": 4.0195,
"step": 870
},
{
"epoch": 0.05106037308886246,
"grad_norm": 0.20268426835536957,
"learning_rate": 0.0005999984038085133,
"loss": 4.0023,
"step": 880
},
{
"epoch": 0.051640604601235894,
"grad_norm": 0.2563273310661316,
"learning_rate": 0.000599995973626219,
"loss": 3.98,
"step": 890
},
{
"epoch": 0.05222083611360933,
"grad_norm": 0.26515451073646545,
"learning_rate": 0.0005999924388273229,
"loss": 3.9799,
"step": 900
},
{
"epoch": 0.052801067625982766,
"grad_norm": 0.23011842370033264,
"learning_rate": 0.0005999877994248407,
"loss": 3.9592,
"step": 910
},
{
"epoch": 0.0533812991383562,
"grad_norm": 0.21570523083209991,
"learning_rate": 0.0005999820554358552,
"loss": 3.9366,
"step": 920
},
{
"epoch": 0.053961530650729644,
"grad_norm": 0.24623119831085205,
"learning_rate": 0.0005999752068815162,
"loss": 3.923,
"step": 930
},
{
"epoch": 0.05454176216310308,
"grad_norm": 0.26557642221450806,
"learning_rate": 0.0005999672537870409,
"loss": 3.9114,
"step": 940
},
{
"epoch": 0.055121993675476516,
"grad_norm": 0.23711174726486206,
"learning_rate": 0.0005999581961817135,
"loss": 3.9021,
"step": 950
},
{
"epoch": 0.05570222518784995,
"grad_norm": 0.2636472284793854,
"learning_rate": 0.000599948034098885,
"loss": 3.8945,
"step": 960
},
{
"epoch": 0.05628245670022339,
"grad_norm": 0.2139461785554886,
"learning_rate": 0.000599936767575973,
"loss": 3.8742,
"step": 970
},
{
"epoch": 0.056862688212596824,
"grad_norm": 0.2411975860595703,
"learning_rate": 0.0005999243966544624,
"loss": 3.8627,
"step": 980
},
{
"epoch": 0.057442919724970266,
"grad_norm": 0.22522902488708496,
"learning_rate": 0.000599910921379904,
"loss": 3.8439,
"step": 990
},
{
"epoch": 0.0580231512373437,
"grad_norm": 0.2505146861076355,
"learning_rate": 0.0005998963418019153,
"loss": 3.8376,
"step": 1000
},
{
"epoch": 0.0580231512373437,
"eval_loss": 3.7977514266967773,
"eval_runtime": 3.2666,
"eval_samples_per_second": 1325.524,
"eval_steps_per_second": 2.755,
"step": 1000
},
{
"epoch": 0.05860338274971714,
"grad_norm": 0.21931585669517517,
"learning_rate": 0.0005998806579741798,
"loss": 3.8196,
"step": 1010
},
{
"epoch": 0.059183614262090574,
"grad_norm": 0.19973556697368622,
"learning_rate": 0.0005998638699544469,
"loss": 3.813,
"step": 1020
},
{
"epoch": 0.05976384577446401,
"grad_norm": 0.21615122258663177,
"learning_rate": 0.0005998459778045319,
"loss": 3.7993,
"step": 1030
},
{
"epoch": 0.060344077286837446,
"grad_norm": 0.18904747068881989,
"learning_rate": 0.0005998269815903156,
"loss": 3.8122,
"step": 1040
},
{
"epoch": 0.06092430879921089,
"grad_norm": 0.20379868149757385,
"learning_rate": 0.000599806881381744,
"loss": 3.7891,
"step": 1050
},
{
"epoch": 0.061504540311584324,
"grad_norm": 0.21616701781749725,
"learning_rate": 0.0005997856772528283,
"loss": 3.7768,
"step": 1060
},
{
"epoch": 0.06208477182395776,
"grad_norm": 0.1838783323764801,
"learning_rate": 0.0005997633692816442,
"loss": 3.7744,
"step": 1070
},
{
"epoch": 0.0626650033363312,
"grad_norm": 0.17894767224788666,
"learning_rate": 0.0005997399575503321,
"loss": 3.7667,
"step": 1080
},
{
"epoch": 0.06324523484870463,
"grad_norm": 0.20992882549762726,
"learning_rate": 0.0005997154421450963,
"loss": 3.7449,
"step": 1090
},
{
"epoch": 0.06382546636107807,
"grad_norm": 0.19586902856826782,
"learning_rate": 0.0005996898231562051,
"loss": 3.7423,
"step": 1100
},
{
"epoch": 0.0644056978734515,
"grad_norm": 0.24105612933635712,
"learning_rate": 0.0005996631006779903,
"loss": 3.7223,
"step": 1110
},
{
"epoch": 0.06498592938582494,
"grad_norm": 0.19526907801628113,
"learning_rate": 0.0005996352748088471,
"loss": 3.7189,
"step": 1120
},
{
"epoch": 0.06556616089819838,
"grad_norm": 0.16144131124019623,
"learning_rate": 0.000599606345651233,
"loss": 3.7118,
"step": 1130
},
{
"epoch": 0.06614639241057182,
"grad_norm": 0.167442187666893,
"learning_rate": 0.0005995763133116683,
"loss": 3.6986,
"step": 1140
},
{
"epoch": 0.06672662392294526,
"grad_norm": 0.23503893613815308,
"learning_rate": 0.0005995451779007352,
"loss": 3.7049,
"step": 1150
},
{
"epoch": 0.0673068554353187,
"grad_norm": 0.2096278965473175,
"learning_rate": 0.0005995129395330776,
"loss": 3.6865,
"step": 1160
},
{
"epoch": 0.06788708694769213,
"grad_norm": 0.19825097918510437,
"learning_rate": 0.0005994795983274004,
"loss": 3.6712,
"step": 1170
},
{
"epoch": 0.06846731846006557,
"grad_norm": 0.15405306220054626,
"learning_rate": 0.0005994451544064696,
"loss": 3.6711,
"step": 1180
},
{
"epoch": 0.069047549972439,
"grad_norm": 0.563884437084198,
"learning_rate": 0.0005994096078971111,
"loss": 3.677,
"step": 1190
},
{
"epoch": 0.06962778148481244,
"grad_norm": 0.1655234694480896,
"learning_rate": 0.0005993729589302111,
"loss": 3.7143,
"step": 1200
},
{
"epoch": 0.07020801299718588,
"grad_norm": 0.15598031878471375,
"learning_rate": 0.0005993352076407148,
"loss": 3.6689,
"step": 1210
},
{
"epoch": 0.07078824450955931,
"grad_norm": 0.14992448687553406,
"learning_rate": 0.0005992963541676265,
"loss": 3.6581,
"step": 1220
},
{
"epoch": 0.07136847602193275,
"grad_norm": 0.1618255376815796,
"learning_rate": 0.0005992563986540086,
"loss": 3.642,
"step": 1230
},
{
"epoch": 0.07194870753430618,
"grad_norm": 0.16188852488994598,
"learning_rate": 0.0005992153412469816,
"loss": 3.6399,
"step": 1240
},
{
"epoch": 0.07252893904667962,
"grad_norm": 0.17180649936199188,
"learning_rate": 0.0005991731820977231,
"loss": 3.6252,
"step": 1250
},
{
"epoch": 0.07310917055905307,
"grad_norm": 0.1691058874130249,
"learning_rate": 0.0005991299213614678,
"loss": 3.6244,
"step": 1260
},
{
"epoch": 0.0736894020714265,
"grad_norm": 0.19470703601837158,
"learning_rate": 0.0005990855591975059,
"loss": 3.6199,
"step": 1270
},
{
"epoch": 0.07426963358379994,
"grad_norm": 0.15482653677463531,
"learning_rate": 0.0005990400957691835,
"loss": 3.6176,
"step": 1280
},
{
"epoch": 0.07484986509617338,
"grad_norm": 0.18342998623847961,
"learning_rate": 0.000598993531243902,
"loss": 3.6082,
"step": 1290
},
{
"epoch": 0.07543009660854681,
"grad_norm": 0.17348110675811768,
"learning_rate": 0.0005989458657931167,
"loss": 3.6063,
"step": 1300
},
{
"epoch": 0.07601032812092025,
"grad_norm": 0.1687677949666977,
"learning_rate": 0.0005988970995923368,
"loss": 3.6015,
"step": 1310
},
{
"epoch": 0.07659055963329368,
"grad_norm": 0.19341568648815155,
"learning_rate": 0.0005988472328211246,
"loss": 3.5912,
"step": 1320
},
{
"epoch": 0.07717079114566712,
"grad_norm": 0.15345478057861328,
"learning_rate": 0.0005987962656630947,
"loss": 3.586,
"step": 1330
},
{
"epoch": 0.07775102265804056,
"grad_norm": 0.16126085817813873,
"learning_rate": 0.0005987441983059136,
"loss": 3.5797,
"step": 1340
},
{
"epoch": 0.07833125417041399,
"grad_norm": 0.1716892272233963,
"learning_rate": 0.0005986910309412986,
"loss": 3.5751,
"step": 1350
},
{
"epoch": 0.07891148568278743,
"grad_norm": 0.15669932961463928,
"learning_rate": 0.0005986367637650177,
"loss": 3.5799,
"step": 1360
},
{
"epoch": 0.07949171719516086,
"grad_norm": 0.19878168404102325,
"learning_rate": 0.0005985813969768884,
"loss": 3.572,
"step": 1370
},
{
"epoch": 0.08007194870753431,
"grad_norm": 0.1505119651556015,
"learning_rate": 0.0005985249307807767,
"loss": 3.567,
"step": 1380
},
{
"epoch": 0.08065218021990775,
"grad_norm": 0.1548507809638977,
"learning_rate": 0.0005984673653845972,
"loss": 3.5427,
"step": 1390
},
{
"epoch": 0.08123241173228118,
"grad_norm": 0.15786635875701904,
"learning_rate": 0.0005984087010003119,
"loss": 3.5637,
"step": 1400
},
{
"epoch": 0.08181264324465462,
"grad_norm": 0.15546779334545135,
"learning_rate": 0.0005983489378439289,
"loss": 3.5475,
"step": 1410
},
{
"epoch": 0.08239287475702806,
"grad_norm": 0.17267097532749176,
"learning_rate": 0.0005982880761355026,
"loss": 3.5519,
"step": 1420
},
{
"epoch": 0.08297310626940149,
"grad_norm": 0.2120850831270218,
"learning_rate": 0.0005982261160991321,
"loss": 3.545,
"step": 1430
},
{
"epoch": 0.08355333778177493,
"grad_norm": 0.1541440784931183,
"learning_rate": 0.0005981630579629609,
"loss": 3.5236,
"step": 1440
},
{
"epoch": 0.08413356929414836,
"grad_norm": 0.1610753834247589,
"learning_rate": 0.0005980989019591753,
"loss": 3.5153,
"step": 1450
},
{
"epoch": 0.0847138008065218,
"grad_norm": 0.1872093677520752,
"learning_rate": 0.0005980336483240048,
"loss": 3.5208,
"step": 1460
},
{
"epoch": 0.08529403231889524,
"grad_norm": 0.15793032944202423,
"learning_rate": 0.0005979672972977201,
"loss": 3.5294,
"step": 1470
},
{
"epoch": 0.08587426383126867,
"grad_norm": 0.1738296002149582,
"learning_rate": 0.0005978998491246324,
"loss": 3.5234,
"step": 1480
},
{
"epoch": 0.08645449534364211,
"grad_norm": 0.1644987314939499,
"learning_rate": 0.0005978313040530931,
"loss": 3.515,
"step": 1490
},
{
"epoch": 0.08703472685601556,
"grad_norm": 0.16707918047904968,
"learning_rate": 0.0005977616623354923,
"loss": 3.5014,
"step": 1500
},
{
"epoch": 0.08761495836838899,
"grad_norm": 0.14812146127223969,
"learning_rate": 0.0005976909242282581,
"loss": 3.4923,
"step": 1510
},
{
"epoch": 0.08819518988076243,
"grad_norm": 0.15653282403945923,
"learning_rate": 0.0005976190899918555,
"loss": 3.4899,
"step": 1520
},
{
"epoch": 0.08877542139313586,
"grad_norm": 0.1531265377998352,
"learning_rate": 0.0005975461598907858,
"loss": 3.4939,
"step": 1530
},
{
"epoch": 0.0893556529055093,
"grad_norm": 0.19499650597572327,
"learning_rate": 0.0005974721341935854,
"loss": 3.4776,
"step": 1540
},
{
"epoch": 0.08993588441788274,
"grad_norm": 0.16522051393985748,
"learning_rate": 0.0005973970131728245,
"loss": 3.4843,
"step": 1550
},
{
"epoch": 0.09051611593025617,
"grad_norm": 0.14911240339279175,
"learning_rate": 0.0005973207971051066,
"loss": 3.4854,
"step": 1560
},
{
"epoch": 0.09109634744262961,
"grad_norm": 0.1797751784324646,
"learning_rate": 0.0005972434862710673,
"loss": 3.4814,
"step": 1570
},
{
"epoch": 0.09167657895500304,
"grad_norm": 0.14958298206329346,
"learning_rate": 0.0005971650809553729,
"loss": 3.4791,
"step": 1580
},
{
"epoch": 0.09225681046737648,
"grad_norm": 0.17834265530109406,
"learning_rate": 0.0005970855814467205,
"loss": 3.4633,
"step": 1590
},
{
"epoch": 0.09283704197974992,
"grad_norm": 0.15738125145435333,
"learning_rate": 0.0005970049880378353,
"loss": 3.4676,
"step": 1600
},
{
"epoch": 0.09341727349212335,
"grad_norm": 0.14483994245529175,
"learning_rate": 0.0005969233010254707,
"loss": 3.4661,
"step": 1610
},
{
"epoch": 0.0939975050044968,
"grad_norm": 0.14126789569854736,
"learning_rate": 0.0005968405207104068,
"loss": 3.4571,
"step": 1620
},
{
"epoch": 0.09457773651687024,
"grad_norm": 0.1578633040189743,
"learning_rate": 0.0005967566473974495,
"loss": 3.4558,
"step": 1630
},
{
"epoch": 0.09515796802924367,
"grad_norm": 0.1565486639738083,
"learning_rate": 0.000596671681395429,
"loss": 3.4604,
"step": 1640
},
{
"epoch": 0.09573819954161711,
"grad_norm": 0.13866451382637024,
"learning_rate": 0.0005965856230171993,
"loss": 3.4552,
"step": 1650
},
{
"epoch": 0.09631843105399054,
"grad_norm": 0.2121124267578125,
"learning_rate": 0.0005964984725796359,
"loss": 3.4541,
"step": 1660
},
{
"epoch": 0.09689866256636398,
"grad_norm": 0.17082008719444275,
"learning_rate": 0.0005964102304036363,
"loss": 3.4382,
"step": 1670
},
{
"epoch": 0.09747889407873742,
"grad_norm": 0.20681622624397278,
"learning_rate": 0.0005963208968141172,
"loss": 3.4372,
"step": 1680
},
{
"epoch": 0.09805912559111085,
"grad_norm": 0.1384105086326599,
"learning_rate": 0.0005962304721400142,
"loss": 3.4484,
"step": 1690
},
{
"epoch": 0.09863935710348429,
"grad_norm": 0.16820856928825378,
"learning_rate": 0.0005961389567142806,
"loss": 3.4302,
"step": 1700
},
{
"epoch": 0.09921958861585772,
"grad_norm": 0.16617996990680695,
"learning_rate": 0.0005960463508738855,
"loss": 3.4328,
"step": 1710
},
{
"epoch": 0.09979982012823116,
"grad_norm": 0.16344214975833893,
"learning_rate": 0.0005959526549598137,
"loss": 3.4326,
"step": 1720
},
{
"epoch": 0.1003800516406046,
"grad_norm": 0.16235540807247162,
"learning_rate": 0.000595857869317063,
"loss": 3.4271,
"step": 1730
},
{
"epoch": 0.10096028315297804,
"grad_norm": 0.1524738371372223,
"learning_rate": 0.0005957619942946442,
"loss": 3.424,
"step": 1740
},
{
"epoch": 0.10154051466535148,
"grad_norm": 0.18023791909217834,
"learning_rate": 0.0005956650302455793,
"loss": 3.4266,
"step": 1750
},
{
"epoch": 0.10212074617772492,
"grad_norm": 0.17738115787506104,
"learning_rate": 0.0005955669775268999,
"loss": 3.4046,
"step": 1760
},
{
"epoch": 0.10270097769009835,
"grad_norm": 0.13939271867275238,
"learning_rate": 0.0005954678364996466,
"loss": 3.4177,
"step": 1770
},
{
"epoch": 0.10328120920247179,
"grad_norm": 0.18028447031974792,
"learning_rate": 0.0005953676075288668,
"loss": 3.4113,
"step": 1780
},
{
"epoch": 0.10386144071484522,
"grad_norm": 0.15911422669887543,
"learning_rate": 0.0005952662909836142,
"loss": 3.4191,
"step": 1790
},
{
"epoch": 0.10444167222721866,
"grad_norm": 0.15596607327461243,
"learning_rate": 0.0005951638872369469,
"loss": 3.3993,
"step": 1800
},
{
"epoch": 0.1050219037395921,
"grad_norm": 0.15493981540203094,
"learning_rate": 0.0005950603966659264,
"loss": 3.4043,
"step": 1810
},
{
"epoch": 0.10560213525196553,
"grad_norm": 0.1727568507194519,
"learning_rate": 0.0005949558196516154,
"loss": 3.4028,
"step": 1820
},
{
"epoch": 0.10618236676433897,
"grad_norm": 0.1614874303340912,
"learning_rate": 0.0005948501565790779,
"loss": 3.3998,
"step": 1830
},
{
"epoch": 0.1067625982767124,
"grad_norm": 0.13620299100875854,
"learning_rate": 0.000594743407837376,
"loss": 3.3896,
"step": 1840
},
{
"epoch": 0.10734282978908584,
"grad_norm": 0.15391112864017487,
"learning_rate": 0.0005946355738195701,
"loss": 3.3823,
"step": 1850
},
{
"epoch": 0.10792306130145929,
"grad_norm": 0.15937426686286926,
"learning_rate": 0.0005945266549227162,
"loss": 3.3893,
"step": 1860
},
{
"epoch": 0.10850329281383272,
"grad_norm": 0.16253319382667542,
"learning_rate": 0.0005944166515478649,
"loss": 3.3905,
"step": 1870
},
{
"epoch": 0.10908352432620616,
"grad_norm": 0.14502382278442383,
"learning_rate": 0.0005943055641000604,
"loss": 3.3836,
"step": 1880
},
{
"epoch": 0.1096637558385796,
"grad_norm": 0.14128324389457703,
"learning_rate": 0.0005941933929883384,
"loss": 3.3854,
"step": 1890
},
{
"epoch": 0.11024398735095303,
"grad_norm": 0.19345618784427643,
"learning_rate": 0.0005940801386257244,
"loss": 3.3746,
"step": 1900
},
{
"epoch": 0.11082421886332647,
"grad_norm": 0.1499020904302597,
"learning_rate": 0.000593965801429233,
"loss": 3.3729,
"step": 1910
},
{
"epoch": 0.1114044503756999,
"grad_norm": 0.14975206553936005,
"learning_rate": 0.0005938503818198656,
"loss": 3.3676,
"step": 1920
},
{
"epoch": 0.11198468188807334,
"grad_norm": 0.13726426661014557,
"learning_rate": 0.0005937338802226094,
"loss": 3.373,
"step": 1930
},
{
"epoch": 0.11256491340044678,
"grad_norm": 0.1749139279127121,
"learning_rate": 0.0005936162970664355,
"loss": 3.3761,
"step": 1940
},
{
"epoch": 0.11314514491282021,
"grad_norm": 0.14197006821632385,
"learning_rate": 0.0005934976327842974,
"loss": 3.3513,
"step": 1950
},
{
"epoch": 0.11372537642519365,
"grad_norm": 0.15288510918617249,
"learning_rate": 0.0005933778878131294,
"loss": 3.357,
"step": 1960
},
{
"epoch": 0.11430560793756708,
"grad_norm": 0.1787514090538025,
"learning_rate": 0.000593257062593845,
"loss": 3.3642,
"step": 1970
},
{
"epoch": 0.11488583944994053,
"grad_norm": 0.13630741834640503,
"learning_rate": 0.0005931351575713353,
"loss": 3.3614,
"step": 1980
},
{
"epoch": 0.11546607096231397,
"grad_norm": 0.16102264821529388,
"learning_rate": 0.0005930121731944674,
"loss": 3.3523,
"step": 1990
},
{
"epoch": 0.1160463024746874,
"grad_norm": 0.16226573288440704,
"learning_rate": 0.0005928881099160826,
"loss": 3.3595,
"step": 2000
},
{
"epoch": 0.1160463024746874,
"eval_loss": 3.3178560733795166,
"eval_runtime": 3.2576,
"eval_samples_per_second": 1329.214,
"eval_steps_per_second": 2.763,
"step": 2000
},
{
"epoch": 0.11662653398706084,
"grad_norm": 0.14609858393669128,
"learning_rate": 0.0005927629681929951,
"loss": 3.3585,
"step": 2010
},
{
"epoch": 0.11720676549943428,
"grad_norm": 0.14387281239032745,
"learning_rate": 0.0005926367484859896,
"loss": 3.3517,
"step": 2020
},
{
"epoch": 0.11778699701180771,
"grad_norm": 0.14605766534805298,
"learning_rate": 0.0005925094512598202,
"loss": 3.3524,
"step": 2030
},
{
"epoch": 0.11836722852418115,
"grad_norm": 0.22022885084152222,
"learning_rate": 0.000592381076983209,
"loss": 3.3356,
"step": 2040
},
{
"epoch": 0.11894746003655458,
"grad_norm": 0.1847839504480362,
"learning_rate": 0.0005922516261288431,
"loss": 3.3441,
"step": 2050
},
{
"epoch": 0.11952769154892802,
"grad_norm": 0.13915176689624786,
"learning_rate": 0.0005921210991733745,
"loss": 3.352,
"step": 2060
},
{
"epoch": 0.12010792306130146,
"grad_norm": 0.1398390680551529,
"learning_rate": 0.0005919894965974168,
"loss": 3.3455,
"step": 2070
},
{
"epoch": 0.12068815457367489,
"grad_norm": 0.1368722915649414,
"learning_rate": 0.0005918568188855447,
"loss": 3.3403,
"step": 2080
},
{
"epoch": 0.12126838608604833,
"grad_norm": 0.16239017248153687,
"learning_rate": 0.0005917230665262914,
"loss": 3.3334,
"step": 2090
},
{
"epoch": 0.12184861759842178,
"grad_norm": 0.14380386471748352,
"learning_rate": 0.000591588240012147,
"loss": 3.3294,
"step": 2100
},
{
"epoch": 0.12242884911079521,
"grad_norm": 0.16626037657260895,
"learning_rate": 0.0005914523398395569,
"loss": 3.3425,
"step": 2110
},
{
"epoch": 0.12300908062316865,
"grad_norm": 0.15981921553611755,
"learning_rate": 0.0005913153665089197,
"loss": 3.3403,
"step": 2120
},
{
"epoch": 0.12358931213554208,
"grad_norm": 0.15275150537490845,
"learning_rate": 0.0005911773205245857,
"loss": 3.3261,
"step": 2130
},
{
"epoch": 0.12416954364791552,
"grad_norm": 0.1598198413848877,
"learning_rate": 0.0005910382023948546,
"loss": 3.3264,
"step": 2140
},
{
"epoch": 0.12474977516028896,
"grad_norm": 0.138661190867424,
"learning_rate": 0.0005908980126319739,
"loss": 3.3216,
"step": 2150
},
{
"epoch": 0.1253300066726624,
"grad_norm": 0.15583263337612152,
"learning_rate": 0.000590756751752137,
"loss": 3.3204,
"step": 2160
},
{
"epoch": 0.12591023818503583,
"grad_norm": 0.15883944928646088,
"learning_rate": 0.0005906144202754813,
"loss": 3.3274,
"step": 2170
},
{
"epoch": 0.12649046969740926,
"grad_norm": 0.15031637251377106,
"learning_rate": 0.0005904710187260862,
"loss": 3.3224,
"step": 2180
},
{
"epoch": 0.1270707012097827,
"grad_norm": 0.1994715929031372,
"learning_rate": 0.0005903265476319712,
"loss": 3.3204,
"step": 2190
},
{
"epoch": 0.12765093272215614,
"grad_norm": 0.16986873745918274,
"learning_rate": 0.000590181007525094,
"loss": 3.327,
"step": 2200
},
{
"epoch": 0.12823116423452957,
"grad_norm": 0.147616907954216,
"learning_rate": 0.0005900343989413485,
"loss": 3.3063,
"step": 2210
},
{
"epoch": 0.128811395746903,
"grad_norm": 0.16532088816165924,
"learning_rate": 0.0005898867224205629,
"loss": 3.3198,
"step": 2220
},
{
"epoch": 0.12939162725927644,
"grad_norm": 0.16687408089637756,
"learning_rate": 0.0005897379785064977,
"loss": 3.3193,
"step": 2230
},
{
"epoch": 0.12997185877164988,
"grad_norm": 0.16683116555213928,
"learning_rate": 0.0005895881677468434,
"loss": 3.3078,
"step": 2240
},
{
"epoch": 0.13055209028402331,
"grad_norm": 0.15461483597755432,
"learning_rate": 0.000589437290693219,
"loss": 3.3126,
"step": 2250
},
{
"epoch": 0.13113232179639675,
"grad_norm": 0.1432589441537857,
"learning_rate": 0.0005892853479011696,
"loss": 3.3004,
"step": 2260
},
{
"epoch": 0.13171255330877019,
"grad_norm": 0.1792496293783188,
"learning_rate": 0.0005891323399301646,
"loss": 3.2946,
"step": 2270
},
{
"epoch": 0.13229278482114365,
"grad_norm": 0.15189994871616364,
"learning_rate": 0.0005889782673435952,
"loss": 3.3013,
"step": 2280
},
{
"epoch": 0.13287301633351709,
"grad_norm": 0.15026351809501648,
"learning_rate": 0.0005888231307087728,
"loss": 3.295,
"step": 2290
},
{
"epoch": 0.13345324784589052,
"grad_norm": 0.16199465095996857,
"learning_rate": 0.0005886669305969269,
"loss": 3.2955,
"step": 2300
},
{
"epoch": 0.13403347935826396,
"grad_norm": 0.16704988479614258,
"learning_rate": 0.0005885096675832027,
"loss": 3.3057,
"step": 2310
},
{
"epoch": 0.1346137108706374,
"grad_norm": 0.14401213824748993,
"learning_rate": 0.0005883513422466588,
"loss": 3.2876,
"step": 2320
},
{
"epoch": 0.13519394238301083,
"grad_norm": 0.15336865186691284,
"learning_rate": 0.000588191955170266,
"loss": 3.2903,
"step": 2330
},
{
"epoch": 0.13577417389538426,
"grad_norm": 0.16176366806030273,
"learning_rate": 0.0005880315069409039,
"loss": 3.2873,
"step": 2340
},
{
"epoch": 0.1363544054077577,
"grad_norm": 0.14728406071662903,
"learning_rate": 0.00058786999814936,
"loss": 3.2862,
"step": 2350
},
{
"epoch": 0.13693463692013114,
"grad_norm": 0.14426636695861816,
"learning_rate": 0.0005877074293903264,
"loss": 3.2786,
"step": 2360
},
{
"epoch": 0.13751486843250457,
"grad_norm": 0.15023665130138397,
"learning_rate": 0.0005875438012623984,
"loss": 3.2888,
"step": 2370
},
{
"epoch": 0.138095099944878,
"grad_norm": 0.1882687211036682,
"learning_rate": 0.0005873791143680718,
"loss": 3.2806,
"step": 2380
},
{
"epoch": 0.13867533145725144,
"grad_norm": 0.14847789704799652,
"learning_rate": 0.000587213369313741,
"loss": 3.2698,
"step": 2390
},
{
"epoch": 0.13925556296962488,
"grad_norm": 0.14070352911949158,
"learning_rate": 0.0005870465667096969,
"loss": 3.2782,
"step": 2400
},
{
"epoch": 0.13983579448199832,
"grad_norm": 0.19226056337356567,
"learning_rate": 0.0005868787071701238,
"loss": 3.2639,
"step": 2410
},
{
"epoch": 0.14041602599437175,
"grad_norm": 0.1776312291622162,
"learning_rate": 0.0005867097913130982,
"loss": 3.2792,
"step": 2420
},
{
"epoch": 0.1409962575067452,
"grad_norm": 0.13482613861560822,
"learning_rate": 0.0005865398197605863,
"loss": 3.2834,
"step": 2430
},
{
"epoch": 0.14157648901911862,
"grad_norm": 0.16731715202331543,
"learning_rate": 0.0005863687931384408,
"loss": 3.2773,
"step": 2440
},
{
"epoch": 0.14215672053149206,
"grad_norm": 0.14542406797409058,
"learning_rate": 0.0005861967120763997,
"loss": 3.2676,
"step": 2450
},
{
"epoch": 0.1427369520438655,
"grad_norm": 0.1490476280450821,
"learning_rate": 0.0005860235772080836,
"loss": 3.2783,
"step": 2460
},
{
"epoch": 0.14331718355623893,
"grad_norm": 0.1446717530488968,
"learning_rate": 0.0005858493891709932,
"loss": 3.283,
"step": 2470
},
{
"epoch": 0.14389741506861237,
"grad_norm": 0.1412891447544098,
"learning_rate": 0.0005856741486065071,
"loss": 3.2652,
"step": 2480
},
{
"epoch": 0.1444776465809858,
"grad_norm": 0.14674563705921173,
"learning_rate": 0.0005854978561598794,
"loss": 3.2613,
"step": 2490
},
{
"epoch": 0.14505787809335924,
"grad_norm": 0.14808981120586395,
"learning_rate": 0.0005853205124802374,
"loss": 3.2742,
"step": 2500
},
{
"epoch": 0.14563810960573267,
"grad_norm": 0.14043253660202026,
"learning_rate": 0.0005851421182205789,
"loss": 3.2685,
"step": 2510
},
{
"epoch": 0.14621834111810614,
"grad_norm": 0.1568257212638855,
"learning_rate": 0.0005849626740377705,
"loss": 3.2711,
"step": 2520
},
{
"epoch": 0.14679857263047957,
"grad_norm": 0.13545943796634674,
"learning_rate": 0.0005847821805925444,
"loss": 3.2573,
"step": 2530
},
{
"epoch": 0.147378804142853,
"grad_norm": 0.18863698840141296,
"learning_rate": 0.0005846006385494964,
"loss": 3.2526,
"step": 2540
},
{
"epoch": 0.14795903565522645,
"grad_norm": 0.14628858864307404,
"learning_rate": 0.0005844180485770832,
"loss": 3.2629,
"step": 2550
},
{
"epoch": 0.14853926716759988,
"grad_norm": 0.1624503880739212,
"learning_rate": 0.0005842344113476202,
"loss": 3.2529,
"step": 2560
},
{
"epoch": 0.14911949867997332,
"grad_norm": 0.16218945384025574,
"learning_rate": 0.0005840497275372792,
"loss": 3.2548,
"step": 2570
},
{
"epoch": 0.14969973019234675,
"grad_norm": 0.16516704857349396,
"learning_rate": 0.0005838639978260851,
"loss": 3.2501,
"step": 2580
},
{
"epoch": 0.1502799617047202,
"grad_norm": 0.1366761326789856,
"learning_rate": 0.0005836772228979142,
"loss": 3.2467,
"step": 2590
},
{
"epoch": 0.15086019321709362,
"grad_norm": 0.15526661276817322,
"learning_rate": 0.0005834894034404913,
"loss": 3.242,
"step": 2600
},
{
"epoch": 0.15144042472946706,
"grad_norm": 0.1441916972398758,
"learning_rate": 0.0005833005401453874,
"loss": 3.2399,
"step": 2610
},
{
"epoch": 0.1520206562418405,
"grad_norm": 0.1708252727985382,
"learning_rate": 0.0005831106337080169,
"loss": 3.2427,
"step": 2620
},
{
"epoch": 0.15260088775421393,
"grad_norm": 0.14945155382156372,
"learning_rate": 0.0005829196848276351,
"loss": 3.2449,
"step": 2630
},
{
"epoch": 0.15318111926658737,
"grad_norm": 0.1512700468301773,
"learning_rate": 0.000582727694207336,
"loss": 3.2438,
"step": 2640
},
{
"epoch": 0.1537613507789608,
"grad_norm": 0.15101619064807892,
"learning_rate": 0.0005825346625540491,
"loss": 3.2396,
"step": 2650
},
{
"epoch": 0.15434158229133424,
"grad_norm": 0.13658584654331207,
"learning_rate": 0.000582340590578537,
"loss": 3.2475,
"step": 2660
},
{
"epoch": 0.15492181380370768,
"grad_norm": 0.16723176836967468,
"learning_rate": 0.0005821454789953932,
"loss": 3.2385,
"step": 2670
},
{
"epoch": 0.1555020453160811,
"grad_norm": 0.16236084699630737,
"learning_rate": 0.000581949328523039,
"loss": 3.2287,
"step": 2680
},
{
"epoch": 0.15608227682845455,
"grad_norm": 0.1473713517189026,
"learning_rate": 0.0005817521398837209,
"loss": 3.2335,
"step": 2690
},
{
"epoch": 0.15666250834082798,
"grad_norm": 0.14422966539859772,
"learning_rate": 0.0005815539138035082,
"loss": 3.2217,
"step": 2700
},
{
"epoch": 0.15724273985320142,
"grad_norm": 0.1676100343465805,
"learning_rate": 0.00058135465101229,
"loss": 3.2329,
"step": 2710
},
{
"epoch": 0.15782297136557485,
"grad_norm": 0.14574168622493744,
"learning_rate": 0.000581154352243773,
"loss": 3.2278,
"step": 2720
},
{
"epoch": 0.1584032028779483,
"grad_norm": 0.16981543600559235,
"learning_rate": 0.000580953018235478,
"loss": 3.229,
"step": 2730
},
{
"epoch": 0.15898343439032173,
"grad_norm": 0.13945645093917847,
"learning_rate": 0.0005807506497287379,
"loss": 3.2297,
"step": 2740
},
{
"epoch": 0.15956366590269516,
"grad_norm": 0.17302276194095612,
"learning_rate": 0.0005805472474686949,
"loss": 3.2227,
"step": 2750
},
{
"epoch": 0.16014389741506863,
"grad_norm": 0.15059055387973785,
"learning_rate": 0.0005803428122042974,
"loss": 3.2288,
"step": 2760
},
{
"epoch": 0.16072412892744206,
"grad_norm": 0.14908020198345184,
"learning_rate": 0.0005801373446882973,
"loss": 3.2293,
"step": 2770
},
{
"epoch": 0.1613043604398155,
"grad_norm": 0.1653462052345276,
"learning_rate": 0.0005799308456772478,
"loss": 3.2189,
"step": 2780
},
{
"epoch": 0.16188459195218893,
"grad_norm": 0.14483293890953064,
"learning_rate": 0.0005797233159314997,
"loss": 3.2239,
"step": 2790
},
{
"epoch": 0.16246482346456237,
"grad_norm": 0.15277917683124542,
"learning_rate": 0.0005795147562151992,
"loss": 3.2155,
"step": 2800
},
{
"epoch": 0.1630450549769358,
"grad_norm": 0.13660204410552979,
"learning_rate": 0.0005793051672962852,
"loss": 3.2183,
"step": 2810
},
{
"epoch": 0.16362528648930924,
"grad_norm": 0.15595564246177673,
"learning_rate": 0.0005790945499464861,
"loss": 3.2163,
"step": 2820
},
{
"epoch": 0.16420551800168268,
"grad_norm": 0.14608708024024963,
"learning_rate": 0.0005788829049413167,
"loss": 3.2222,
"step": 2830
},
{
"epoch": 0.1647857495140561,
"grad_norm": 0.14129003882408142,
"learning_rate": 0.0005786702330600764,
"loss": 3.2115,
"step": 2840
},
{
"epoch": 0.16536598102642955,
"grad_norm": 0.13925908505916595,
"learning_rate": 0.0005784565350858453,
"loss": 3.2115,
"step": 2850
},
{
"epoch": 0.16594621253880298,
"grad_norm": 0.15094564855098724,
"learning_rate": 0.0005782418118054816,
"loss": 3.216,
"step": 2860
},
{
"epoch": 0.16652644405117642,
"grad_norm": 0.1384998857975006,
"learning_rate": 0.0005780260640096189,
"loss": 3.2084,
"step": 2870
},
{
"epoch": 0.16710667556354986,
"grad_norm": 0.15442876517772675,
"learning_rate": 0.0005778092924926634,
"loss": 3.2071,
"step": 2880
},
{
"epoch": 0.1676869070759233,
"grad_norm": 0.16494965553283691,
"learning_rate": 0.0005775914980527904,
"loss": 3.2101,
"step": 2890
},
{
"epoch": 0.16826713858829673,
"grad_norm": 0.16855239868164062,
"learning_rate": 0.0005773726814919419,
"loss": 3.2019,
"step": 2900
},
{
"epoch": 0.16884737010067016,
"grad_norm": 0.1579483449459076,
"learning_rate": 0.0005771528436158233,
"loss": 3.209,
"step": 2910
},
{
"epoch": 0.1694276016130436,
"grad_norm": 0.1417829543352127,
"learning_rate": 0.0005769319852339008,
"loss": 3.2019,
"step": 2920
},
{
"epoch": 0.17000783312541703,
"grad_norm": 0.14454993605613708,
"learning_rate": 0.0005767101071593979,
"loss": 3.2047,
"step": 2930
},
{
"epoch": 0.17058806463779047,
"grad_norm": 0.16087666153907776,
"learning_rate": 0.0005764872102092931,
"loss": 3.2062,
"step": 2940
},
{
"epoch": 0.1711682961501639,
"grad_norm": 0.139312744140625,
"learning_rate": 0.0005762632952043163,
"loss": 3.1988,
"step": 2950
},
{
"epoch": 0.17174852766253734,
"grad_norm": 0.15459179878234863,
"learning_rate": 0.000576038362968946,
"loss": 3.2002,
"step": 2960
},
{
"epoch": 0.17232875917491078,
"grad_norm": 0.18820500373840332,
"learning_rate": 0.0005758124143314062,
"loss": 3.2035,
"step": 2970
},
{
"epoch": 0.17290899068728421,
"grad_norm": 0.14626365900039673,
"learning_rate": 0.0005755854501236635,
"loss": 3.194,
"step": 2980
},
{
"epoch": 0.17348922219965765,
"grad_norm": 0.14270606637001038,
"learning_rate": 0.0005753574711814238,
"loss": 3.1879,
"step": 2990
},
{
"epoch": 0.1740694537120311,
"grad_norm": 0.15857936441898346,
"learning_rate": 0.0005751284783441297,
"loss": 3.207,
"step": 3000
},
{
"epoch": 0.1740694537120311,
"eval_loss": 3.158046245574951,
"eval_runtime": 3.2654,
"eval_samples_per_second": 1326.029,
"eval_steps_per_second": 2.756,
"step": 3000
},
{
"epoch": 0.17464968522440455,
"grad_norm": 0.14403465390205383,
"learning_rate": 0.0005748984724549565,
"loss": 3.1895,
"step": 3010
},
{
"epoch": 0.17522991673677799,
"grad_norm": 0.1392756998538971,
"learning_rate": 0.0005746674543608101,
"loss": 3.1942,
"step": 3020
},
{
"epoch": 0.17581014824915142,
"grad_norm": 0.13957557082176208,
"learning_rate": 0.0005744354249123234,
"loss": 3.1969,
"step": 3030
},
{
"epoch": 0.17639037976152486,
"grad_norm": 0.151198148727417,
"learning_rate": 0.0005742023849638531,
"loss": 3.1903,
"step": 3040
},
{
"epoch": 0.1769706112738983,
"grad_norm": 0.14607684314250946,
"learning_rate": 0.0005739683353734766,
"loss": 3.2003,
"step": 3050
},
{
"epoch": 0.17755084278627173,
"grad_norm": 0.13925622403621674,
"learning_rate": 0.0005737332770029891,
"loss": 3.1927,
"step": 3060
},
{
"epoch": 0.17813107429864516,
"grad_norm": 0.13125456869602203,
"learning_rate": 0.0005734972107179001,
"loss": 3.1849,
"step": 3070
},
{
"epoch": 0.1787113058110186,
"grad_norm": 0.16905735433101654,
"learning_rate": 0.0005732601373874306,
"loss": 3.187,
"step": 3080
},
{
"epoch": 0.17929153732339204,
"grad_norm": 0.13563838601112366,
"learning_rate": 0.0005730220578845091,
"loss": 3.1853,
"step": 3090
},
{
"epoch": 0.17987176883576547,
"grad_norm": 0.15470236539840698,
"learning_rate": 0.0005727829730857695,
"loss": 3.1906,
"step": 3100
},
{
"epoch": 0.1804520003481389,
"grad_norm": 0.160013347864151,
"learning_rate": 0.0005725428838715469,
"loss": 3.1705,
"step": 3110
},
{
"epoch": 0.18103223186051234,
"grad_norm": 0.14684250950813293,
"learning_rate": 0.0005723017911258752,
"loss": 3.1825,
"step": 3120
},
{
"epoch": 0.18161246337288578,
"grad_norm": 0.1529027372598648,
"learning_rate": 0.0005720596957364829,
"loss": 3.1817,
"step": 3130
},
{
"epoch": 0.18219269488525922,
"grad_norm": 0.13860736787319183,
"learning_rate": 0.0005718165985947907,
"loss": 3.1844,
"step": 3140
},
{
"epoch": 0.18277292639763265,
"grad_norm": 0.14795511960983276,
"learning_rate": 0.0005715725005959077,
"loss": 3.1741,
"step": 3150
},
{
"epoch": 0.1833531579100061,
"grad_norm": 0.1455545276403427,
"learning_rate": 0.0005713274026386283,
"loss": 3.1869,
"step": 3160
},
{
"epoch": 0.18393338942237952,
"grad_norm": 0.14845995604991913,
"learning_rate": 0.0005710813056254289,
"loss": 3.1735,
"step": 3170
},
{
"epoch": 0.18451362093475296,
"grad_norm": 0.14949209988117218,
"learning_rate": 0.0005708342104624645,
"loss": 3.178,
"step": 3180
},
{
"epoch": 0.1850938524471264,
"grad_norm": 0.16276435554027557,
"learning_rate": 0.0005705861180595653,
"loss": 3.1712,
"step": 3190
},
{
"epoch": 0.18567408395949983,
"grad_norm": 0.14152179658412933,
"learning_rate": 0.0005703370293302335,
"loss": 3.1752,
"step": 3200
},
{
"epoch": 0.18625431547187327,
"grad_norm": 0.1554255187511444,
"learning_rate": 0.00057008694519164,
"loss": 3.169,
"step": 3210
},
{
"epoch": 0.1868345469842467,
"grad_norm": 0.14890237152576447,
"learning_rate": 0.0005698358665646207,
"loss": 3.1706,
"step": 3220
},
{
"epoch": 0.18741477849662014,
"grad_norm": 0.15197904407978058,
"learning_rate": 0.0005695837943736735,
"loss": 3.1691,
"step": 3230
},
{
"epoch": 0.1879950100089936,
"grad_norm": 0.15369053184986115,
"learning_rate": 0.0005693307295469547,
"loss": 3.1678,
"step": 3240
},
{
"epoch": 0.18857524152136704,
"grad_norm": 0.19938114285469055,
"learning_rate": 0.0005690766730162752,
"loss": 3.1706,
"step": 3250
},
{
"epoch": 0.18915547303374047,
"grad_norm": 0.14962078630924225,
"learning_rate": 0.0005688216257170979,
"loss": 3.1665,
"step": 3260
},
{
"epoch": 0.1897357045461139,
"grad_norm": 0.14826686680316925,
"learning_rate": 0.0005685655885885337,
"loss": 3.1478,
"step": 3270
},
{
"epoch": 0.19031593605848734,
"grad_norm": 0.137392058968544,
"learning_rate": 0.0005683085625733382,
"loss": 3.1645,
"step": 3280
},
{
"epoch": 0.19089616757086078,
"grad_norm": 0.15559589862823486,
"learning_rate": 0.000568050548617908,
"loss": 3.1674,
"step": 3290
},
{
"epoch": 0.19147639908323422,
"grad_norm": 0.17506170272827148,
"learning_rate": 0.0005677915476722775,
"loss": 3.1606,
"step": 3300
},
{
"epoch": 0.19205663059560765,
"grad_norm": 0.1602877825498581,
"learning_rate": 0.0005675315606901155,
"loss": 3.1586,
"step": 3310
},
{
"epoch": 0.1926368621079811,
"grad_norm": 0.13343220949172974,
"learning_rate": 0.0005672705886287211,
"loss": 3.1553,
"step": 3320
},
{
"epoch": 0.19321709362035452,
"grad_norm": 0.15390737354755402,
"learning_rate": 0.0005670086324490208,
"loss": 3.1687,
"step": 3330
},
{
"epoch": 0.19379732513272796,
"grad_norm": 0.13513082265853882,
"learning_rate": 0.0005667456931155647,
"loss": 3.1543,
"step": 3340
},
{
"epoch": 0.1943775566451014,
"grad_norm": 0.1489078551530838,
"learning_rate": 0.0005664817715965231,
"loss": 3.1623,
"step": 3350
},
{
"epoch": 0.19495778815747483,
"grad_norm": 0.14149461686611176,
"learning_rate": 0.0005662168688636826,
"loss": 3.1487,
"step": 3360
},
{
"epoch": 0.19553801966984827,
"grad_norm": 0.150479257106781,
"learning_rate": 0.0005659509858924428,
"loss": 3.1588,
"step": 3370
},
{
"epoch": 0.1961182511822217,
"grad_norm": 0.15041102468967438,
"learning_rate": 0.0005656841236618127,
"loss": 3.155,
"step": 3380
},
{
"epoch": 0.19669848269459514,
"grad_norm": 0.14053913950920105,
"learning_rate": 0.0005654162831544068,
"loss": 3.1581,
"step": 3390
},
{
"epoch": 0.19727871420696858,
"grad_norm": 0.15485486388206482,
"learning_rate": 0.0005651474653564421,
"loss": 3.1465,
"step": 3400
},
{
"epoch": 0.197858945719342,
"grad_norm": 0.1425885111093521,
"learning_rate": 0.0005648776712577338,
"loss": 3.1535,
"step": 3410
},
{
"epoch": 0.19843917723171545,
"grad_norm": 0.1361316442489624,
"learning_rate": 0.0005646069018516921,
"loss": 3.1466,
"step": 3420
},
{
"epoch": 0.19901940874408888,
"grad_norm": 0.15521439909934998,
"learning_rate": 0.0005643351581353184,
"loss": 3.1415,
"step": 3430
},
{
"epoch": 0.19959964025646232,
"grad_norm": 0.14644280076026917,
"learning_rate": 0.0005640624411092014,
"loss": 3.1411,
"step": 3440
},
{
"epoch": 0.20017987176883575,
"grad_norm": 0.14116531610488892,
"learning_rate": 0.0005637887517775137,
"loss": 3.1542,
"step": 3450
},
{
"epoch": 0.2007601032812092,
"grad_norm": 0.1301729828119278,
"learning_rate": 0.0005635140911480082,
"loss": 3.1448,
"step": 3460
},
{
"epoch": 0.20134033479358263,
"grad_norm": 0.16307103633880615,
"learning_rate": 0.000563238460232014,
"loss": 3.1397,
"step": 3470
},
{
"epoch": 0.2019205663059561,
"grad_norm": 0.13141117990016937,
"learning_rate": 0.0005629618600444332,
"loss": 3.1469,
"step": 3480
},
{
"epoch": 0.20250079781832953,
"grad_norm": 0.13741467893123627,
"learning_rate": 0.0005626842916037365,
"loss": 3.1419,
"step": 3490
},
{
"epoch": 0.20308102933070296,
"grad_norm": 0.16112880408763885,
"learning_rate": 0.0005624057559319601,
"loss": 3.1449,
"step": 3500
},
{
"epoch": 0.2036612608430764,
"grad_norm": 0.153072327375412,
"learning_rate": 0.0005621262540547015,
"loss": 3.1365,
"step": 3510
},
{
"epoch": 0.20424149235544983,
"grad_norm": 0.1413891613483429,
"learning_rate": 0.0005618457870011158,
"loss": 3.1307,
"step": 3520
},
{
"epoch": 0.20482172386782327,
"grad_norm": 0.15589068830013275,
"learning_rate": 0.0005615643558039121,
"loss": 3.1418,
"step": 3530
},
{
"epoch": 0.2054019553801967,
"grad_norm": 0.12889379262924194,
"learning_rate": 0.0005612819614993496,
"loss": 3.1366,
"step": 3540
},
{
"epoch": 0.20598218689257014,
"grad_norm": 0.14375300705432892,
"learning_rate": 0.0005609986051272336,
"loss": 3.13,
"step": 3550
},
{
"epoch": 0.20656241840494358,
"grad_norm": 0.1587209552526474,
"learning_rate": 0.000560714287730912,
"loss": 3.1338,
"step": 3560
},
{
"epoch": 0.207142649917317,
"grad_norm": 0.15273341536521912,
"learning_rate": 0.0005604290103572714,
"loss": 3.1393,
"step": 3570
},
{
"epoch": 0.20772288142969045,
"grad_norm": 0.13435807824134827,
"learning_rate": 0.0005601427740567328,
"loss": 3.137,
"step": 3580
},
{
"epoch": 0.20830311294206388,
"grad_norm": 0.1391715109348297,
"learning_rate": 0.0005598555798832482,
"loss": 3.1347,
"step": 3590
},
{
"epoch": 0.20888334445443732,
"grad_norm": 0.16318084299564362,
"learning_rate": 0.0005595674288942969,
"loss": 3.1279,
"step": 3600
},
{
"epoch": 0.20946357596681076,
"grad_norm": 0.1386035829782486,
"learning_rate": 0.0005592783221508807,
"loss": 3.1335,
"step": 3610
},
{
"epoch": 0.2100438074791842,
"grad_norm": 0.14639577269554138,
"learning_rate": 0.000558988260717521,
"loss": 3.142,
"step": 3620
},
{
"epoch": 0.21062403899155763,
"grad_norm": 0.13666051626205444,
"learning_rate": 0.0005586972456622546,
"loss": 3.1287,
"step": 3630
},
{
"epoch": 0.21120427050393106,
"grad_norm": 0.14930284023284912,
"learning_rate": 0.0005584052780566293,
"loss": 3.1283,
"step": 3640
},
{
"epoch": 0.2117845020163045,
"grad_norm": 0.13987945020198822,
"learning_rate": 0.0005581123589757002,
"loss": 3.1329,
"step": 3650
},
{
"epoch": 0.21236473352867793,
"grad_norm": 0.1452946811914444,
"learning_rate": 0.0005578184894980263,
"loss": 3.1294,
"step": 3660
},
{
"epoch": 0.21294496504105137,
"grad_norm": 0.15192043781280518,
"learning_rate": 0.0005575236707056657,
"loss": 3.1206,
"step": 3670
},
{
"epoch": 0.2135251965534248,
"grad_norm": 0.16006827354431152,
"learning_rate": 0.0005572279036841721,
"loss": 3.1273,
"step": 3680
},
{
"epoch": 0.21410542806579824,
"grad_norm": 0.18141302466392517,
"learning_rate": 0.0005569311895225906,
"loss": 3.1245,
"step": 3690
},
{
"epoch": 0.21468565957817168,
"grad_norm": 0.14263153076171875,
"learning_rate": 0.0005566335293134539,
"loss": 3.1211,
"step": 3700
},
{
"epoch": 0.21526589109054511,
"grad_norm": 0.1435001790523529,
"learning_rate": 0.0005563349241527781,
"loss": 3.1258,
"step": 3710
},
{
"epoch": 0.21584612260291858,
"grad_norm": 0.15155887603759766,
"learning_rate": 0.0005560353751400585,
"loss": 3.1233,
"step": 3720
},
{
"epoch": 0.216426354115292,
"grad_norm": 0.1545734703540802,
"learning_rate": 0.0005557348833782663,
"loss": 3.1292,
"step": 3730
},
{
"epoch": 0.21700658562766545,
"grad_norm": 0.15549300611019135,
"learning_rate": 0.0005554334499738433,
"loss": 3.1142,
"step": 3740
},
{
"epoch": 0.21758681714003889,
"grad_norm": 0.15990693867206573,
"learning_rate": 0.000555131076036699,
"loss": 3.125,
"step": 3750
},
{
"epoch": 0.21816704865241232,
"grad_norm": 0.16630201041698456,
"learning_rate": 0.0005548277626802058,
"loss": 3.1216,
"step": 3760
},
{
"epoch": 0.21874728016478576,
"grad_norm": 0.1408713161945343,
"learning_rate": 0.0005545235110211954,
"loss": 3.1111,
"step": 3770
},
{
"epoch": 0.2193275116771592,
"grad_norm": 0.1488475650548935,
"learning_rate": 0.0005542183221799544,
"loss": 3.1253,
"step": 3780
},
{
"epoch": 0.21990774318953263,
"grad_norm": 0.14259935915470123,
"learning_rate": 0.0005539121972802198,
"loss": 3.1179,
"step": 3790
},
{
"epoch": 0.22048797470190606,
"grad_norm": 0.14055614173412323,
"learning_rate": 0.0005536051374491757,
"loss": 3.1113,
"step": 3800
},
{
"epoch": 0.2210682062142795,
"grad_norm": 0.1665177196264267,
"learning_rate": 0.0005532971438174485,
"loss": 3.1197,
"step": 3810
},
{
"epoch": 0.22164843772665294,
"grad_norm": 0.15349626541137695,
"learning_rate": 0.0005529882175191031,
"loss": 3.1086,
"step": 3820
},
{
"epoch": 0.22222866923902637,
"grad_norm": 0.14321498572826385,
"learning_rate": 0.0005526783596916385,
"loss": 3.1161,
"step": 3830
},
{
"epoch": 0.2228089007513998,
"grad_norm": 0.14768148958683014,
"learning_rate": 0.0005523675714759835,
"loss": 3.1164,
"step": 3840
},
{
"epoch": 0.22338913226377324,
"grad_norm": 0.1546637862920761,
"learning_rate": 0.000552055854016493,
"loss": 3.1185,
"step": 3850
},
{
"epoch": 0.22396936377614668,
"grad_norm": 0.16114896535873413,
"learning_rate": 0.0005517432084609434,
"loss": 3.1083,
"step": 3860
},
{
"epoch": 0.22454959528852012,
"grad_norm": 0.13796792924404144,
"learning_rate": 0.0005514296359605284,
"loss": 3.102,
"step": 3870
},
{
"epoch": 0.22512982680089355,
"grad_norm": 0.13948635756969452,
"learning_rate": 0.0005511151376698546,
"loss": 3.1079,
"step": 3880
},
{
"epoch": 0.225710058313267,
"grad_norm": 0.13826532661914825,
"learning_rate": 0.0005507997147469378,
"loss": 3.107,
"step": 3890
},
{
"epoch": 0.22629028982564042,
"grad_norm": 0.1437525451183319,
"learning_rate": 0.0005504833683531981,
"loss": 3.1076,
"step": 3900
},
{
"epoch": 0.22687052133801386,
"grad_norm": 0.14256474375724792,
"learning_rate": 0.0005501660996534563,
"loss": 3.1056,
"step": 3910
},
{
"epoch": 0.2274507528503873,
"grad_norm": 0.1531156748533249,
"learning_rate": 0.0005498479098159289,
"loss": 3.101,
"step": 3920
},
{
"epoch": 0.22803098436276073,
"grad_norm": 0.16901366412639618,
"learning_rate": 0.0005495288000122242,
"loss": 3.0981,
"step": 3930
},
{
"epoch": 0.22861121587513417,
"grad_norm": 0.1440243273973465,
"learning_rate": 0.0005492087714173378,
"loss": 3.1052,
"step": 3940
},
{
"epoch": 0.2291914473875076,
"grad_norm": 0.1603139340877533,
"learning_rate": 0.0005488878252096487,
"loss": 3.105,
"step": 3950
},
{
"epoch": 0.22977167889988107,
"grad_norm": 0.1588706523180008,
"learning_rate": 0.0005485659625709144,
"loss": 3.1107,
"step": 3960
},
{
"epoch": 0.2303519104122545,
"grad_norm": 0.1452343761920929,
"learning_rate": 0.0005482431846862667,
"loss": 3.1074,
"step": 3970
},
{
"epoch": 0.23093214192462794,
"grad_norm": 0.15799881517887115,
"learning_rate": 0.0005479194927442078,
"loss": 3.0985,
"step": 3980
},
{
"epoch": 0.23151237343700137,
"grad_norm": 0.12657681107521057,
"learning_rate": 0.0005475948879366053,
"loss": 3.0958,
"step": 3990
},
{
"epoch": 0.2320926049493748,
"grad_norm": 0.13606688380241394,
"learning_rate": 0.000547269371458688,
"loss": 3.0999,
"step": 4000
},
{
"epoch": 0.2320926049493748,
"eval_loss": 3.0630993843078613,
"eval_runtime": 3.264,
"eval_samples_per_second": 1326.576,
"eval_steps_per_second": 2.757,
"step": 4000
},
{
"epoch": 0.23267283646174824,
"grad_norm": 0.16136619448661804,
"learning_rate": 0.0005469429445090417,
"loss": 3.1004,
"step": 4010
},
{
"epoch": 0.23325306797412168,
"grad_norm": 0.14767828583717346,
"learning_rate": 0.0005466156082896047,
"loss": 3.1075,
"step": 4020
},
{
"epoch": 0.23383329948649512,
"grad_norm": 0.1492021530866623,
"learning_rate": 0.0005462873640056632,
"loss": 3.1025,
"step": 4030
},
{
"epoch": 0.23441353099886855,
"grad_norm": 0.14654645323753357,
"learning_rate": 0.000545958212865847,
"loss": 3.0966,
"step": 4040
},
{
"epoch": 0.234993762511242,
"grad_norm": 0.15648731589317322,
"learning_rate": 0.0005456281560821252,
"loss": 3.0937,
"step": 4050
},
{
"epoch": 0.23557399402361542,
"grad_norm": 0.13584694266319275,
"learning_rate": 0.0005452971948698014,
"loss": 3.1052,
"step": 4060
},
{
"epoch": 0.23615422553598886,
"grad_norm": 0.13829472661018372,
"learning_rate": 0.0005449653304475094,
"loss": 3.0933,
"step": 4070
},
{
"epoch": 0.2367344570483623,
"grad_norm": 0.16889816522598267,
"learning_rate": 0.0005446325640372088,
"loss": 3.0949,
"step": 4080
},
{
"epoch": 0.23731468856073573,
"grad_norm": 0.12351599335670471,
"learning_rate": 0.0005442988968641804,
"loss": 3.0914,
"step": 4090
},
{
"epoch": 0.23789492007310917,
"grad_norm": 0.14327877759933472,
"learning_rate": 0.0005439643301570216,
"loss": 3.0814,
"step": 4100
},
{
"epoch": 0.2384751515854826,
"grad_norm": 0.15155468881130219,
"learning_rate": 0.0005436288651476421,
"loss": 3.0849,
"step": 4110
},
{
"epoch": 0.23905538309785604,
"grad_norm": 0.14292922616004944,
"learning_rate": 0.0005432925030712594,
"loss": 3.0887,
"step": 4120
},
{
"epoch": 0.23963561461022947,
"grad_norm": 0.14884264767169952,
"learning_rate": 0.0005429552451663936,
"loss": 3.0911,
"step": 4130
},
{
"epoch": 0.2402158461226029,
"grad_norm": 0.1403530389070511,
"learning_rate": 0.0005426170926748639,
"loss": 3.0926,
"step": 4140
},
{
"epoch": 0.24079607763497635,
"grad_norm": 0.14543718099594116,
"learning_rate": 0.0005422780468417829,
"loss": 3.0897,
"step": 4150
},
{
"epoch": 0.24137630914734978,
"grad_norm": 0.12813718616962433,
"learning_rate": 0.0005419381089155532,
"loss": 3.0902,
"step": 4160
},
{
"epoch": 0.24195654065972322,
"grad_norm": 0.13375824689865112,
"learning_rate": 0.0005415972801478617,
"loss": 3.0915,
"step": 4170
},
{
"epoch": 0.24253677217209665,
"grad_norm": 0.14347635209560394,
"learning_rate": 0.0005412555617936755,
"loss": 3.0892,
"step": 4180
},
{
"epoch": 0.2431170036844701,
"grad_norm": 0.14166522026062012,
"learning_rate": 0.0005409129551112377,
"loss": 3.0808,
"step": 4190
},
{
"epoch": 0.24369723519684355,
"grad_norm": 0.13924048840999603,
"learning_rate": 0.0005405694613620617,
"loss": 3.0854,
"step": 4200
},
{
"epoch": 0.244277466709217,
"grad_norm": 0.13338492810726166,
"learning_rate": 0.0005402250818109276,
"loss": 3.0836,
"step": 4210
},
{
"epoch": 0.24485769822159043,
"grad_norm": 0.14531342685222626,
"learning_rate": 0.0005398798177258768,
"loss": 3.0971,
"step": 4220
},
{
"epoch": 0.24543792973396386,
"grad_norm": 0.1432162970304489,
"learning_rate": 0.0005395336703782082,
"loss": 3.0838,
"step": 4230
},
{
"epoch": 0.2460181612463373,
"grad_norm": 0.15475274622440338,
"learning_rate": 0.0005391866410424722,
"loss": 3.0764,
"step": 4240
},
{
"epoch": 0.24659839275871073,
"grad_norm": 0.15521539747714996,
"learning_rate": 0.0005388387309964675,
"loss": 3.0837,
"step": 4250
},
{
"epoch": 0.24717862427108417,
"grad_norm": 0.1430870145559311,
"learning_rate": 0.0005384899415212351,
"loss": 3.0889,
"step": 4260
},
{
"epoch": 0.2477588557834576,
"grad_norm": 0.14807622134685516,
"learning_rate": 0.0005381402739010545,
"loss": 3.0769,
"step": 4270
},
{
"epoch": 0.24833908729583104,
"grad_norm": 0.1509249359369278,
"learning_rate": 0.0005377897294234385,
"loss": 3.0815,
"step": 4280
},
{
"epoch": 0.24891931880820448,
"grad_norm": 0.1451188027858734,
"learning_rate": 0.0005374383093791287,
"loss": 3.0766,
"step": 4290
},
{
"epoch": 0.2494995503205779,
"grad_norm": 0.130240797996521,
"learning_rate": 0.0005370860150620901,
"loss": 3.0824,
"step": 4300
},
{
"epoch": 0.2500797818329513,
"grad_norm": 0.14696471393108368,
"learning_rate": 0.0005367328477695077,
"loss": 3.0678,
"step": 4310
},
{
"epoch": 0.2506600133453248,
"grad_norm": 0.13198255002498627,
"learning_rate": 0.0005363788088017803,
"loss": 3.0759,
"step": 4320
},
{
"epoch": 0.25124024485769825,
"grad_norm": 0.1413690447807312,
"learning_rate": 0.0005360238994625166,
"loss": 3.0842,
"step": 4330
},
{
"epoch": 0.25182047637007166,
"grad_norm": 0.1560727059841156,
"learning_rate": 0.0005356681210585297,
"loss": 3.074,
"step": 4340
},
{
"epoch": 0.2524007078824451,
"grad_norm": 0.13727669417858124,
"learning_rate": 0.0005353114748998332,
"loss": 3.082,
"step": 4350
},
{
"epoch": 0.2529809393948185,
"grad_norm": 0.1479531228542328,
"learning_rate": 0.0005349539622996356,
"loss": 3.0804,
"step": 4360
},
{
"epoch": 0.253561170907192,
"grad_norm": 0.13756506145000458,
"learning_rate": 0.0005345955845743358,
"loss": 3.0829,
"step": 4370
},
{
"epoch": 0.2541414024195654,
"grad_norm": 0.14778585731983185,
"learning_rate": 0.0005342363430435177,
"loss": 3.0785,
"step": 4380
},
{
"epoch": 0.25472163393193886,
"grad_norm": 0.13227440416812897,
"learning_rate": 0.0005338762390299467,
"loss": 3.0776,
"step": 4390
},
{
"epoch": 0.25530186544431227,
"grad_norm": 0.14178766310214996,
"learning_rate": 0.0005335152738595634,
"loss": 3.0799,
"step": 4400
},
{
"epoch": 0.25588209695668573,
"grad_norm": 0.14833244681358337,
"learning_rate": 0.0005331534488614794,
"loss": 3.0674,
"step": 4410
},
{
"epoch": 0.25646232846905914,
"grad_norm": 0.13829241693019867,
"learning_rate": 0.0005327907653679721,
"loss": 3.0643,
"step": 4420
},
{
"epoch": 0.2570425599814326,
"grad_norm": 0.16908784210681915,
"learning_rate": 0.0005324272247144802,
"loss": 3.0649,
"step": 4430
},
{
"epoch": 0.257622791493806,
"grad_norm": 0.14392369985580444,
"learning_rate": 0.0005320628282395985,
"loss": 3.0761,
"step": 4440
},
{
"epoch": 0.2582030230061795,
"grad_norm": 0.16387993097305298,
"learning_rate": 0.0005316975772850729,
"loss": 3.0666,
"step": 4450
},
{
"epoch": 0.2587832545185529,
"grad_norm": 0.13506962358951569,
"learning_rate": 0.0005313314731957957,
"loss": 3.0672,
"step": 4460
},
{
"epoch": 0.25936348603092635,
"grad_norm": 0.1522989273071289,
"learning_rate": 0.0005309645173198007,
"loss": 3.0607,
"step": 4470
},
{
"epoch": 0.25994371754329976,
"grad_norm": 0.13824021816253662,
"learning_rate": 0.0005305967110082576,
"loss": 3.0627,
"step": 4480
},
{
"epoch": 0.2605239490556732,
"grad_norm": 0.13685718178749084,
"learning_rate": 0.000530228055615468,
"loss": 3.0612,
"step": 4490
},
{
"epoch": 0.26110418056804663,
"grad_norm": 0.13309134542942047,
"learning_rate": 0.0005298585524988594,
"loss": 3.0548,
"step": 4500
},
{
"epoch": 0.2616844120804201,
"grad_norm": 0.17121103405952454,
"learning_rate": 0.0005294882030189812,
"loss": 3.066,
"step": 4510
},
{
"epoch": 0.2622646435927935,
"grad_norm": 0.13467055559158325,
"learning_rate": 0.000529117008539499,
"loss": 3.0606,
"step": 4520
},
{
"epoch": 0.26284487510516696,
"grad_norm": 0.12970523536205292,
"learning_rate": 0.0005287449704271896,
"loss": 3.0553,
"step": 4530
},
{
"epoch": 0.26342510661754037,
"grad_norm": 0.1509917676448822,
"learning_rate": 0.0005283720900519365,
"loss": 3.0571,
"step": 4540
},
{
"epoch": 0.26400533812991384,
"grad_norm": 0.1372883915901184,
"learning_rate": 0.0005279983687867243,
"loss": 3.0635,
"step": 4550
},
{
"epoch": 0.2645855696422873,
"grad_norm": 0.1482354998588562,
"learning_rate": 0.0005276238080076335,
"loss": 3.0619,
"step": 4560
},
{
"epoch": 0.2651658011546607,
"grad_norm": 0.13884900510311127,
"learning_rate": 0.0005272484090938365,
"loss": 3.069,
"step": 4570
},
{
"epoch": 0.26574603266703417,
"grad_norm": 0.14500798285007477,
"learning_rate": 0.0005268721734275914,
"loss": 3.0715,
"step": 4580
},
{
"epoch": 0.2663262641794076,
"grad_norm": 0.1357218474149704,
"learning_rate": 0.000526495102394237,
"loss": 3.0584,
"step": 4590
},
{
"epoch": 0.26690649569178104,
"grad_norm": 0.14025723934173584,
"learning_rate": 0.0005261171973821887,
"loss": 3.0613,
"step": 4600
},
{
"epoch": 0.26748672720415445,
"grad_norm": 0.15253092348575592,
"learning_rate": 0.0005257384597829322,
"loss": 3.0584,
"step": 4610
},
{
"epoch": 0.2680669587165279,
"grad_norm": 0.14573270082473755,
"learning_rate": 0.0005253588909910191,
"loss": 3.0634,
"step": 4620
},
{
"epoch": 0.2686471902289013,
"grad_norm": 0.15005233883857727,
"learning_rate": 0.0005249784924040614,
"loss": 3.0526,
"step": 4630
},
{
"epoch": 0.2692274217412748,
"grad_norm": 0.15314225852489471,
"learning_rate": 0.0005245972654227265,
"loss": 3.0635,
"step": 4640
},
{
"epoch": 0.2698076532536482,
"grad_norm": 0.14412705600261688,
"learning_rate": 0.0005242152114507321,
"loss": 3.055,
"step": 4650
},
{
"epoch": 0.27038788476602166,
"grad_norm": 0.15046367049217224,
"learning_rate": 0.0005238323318948412,
"loss": 3.066,
"step": 4660
},
{
"epoch": 0.27096811627839507,
"grad_norm": 0.12618590891361237,
"learning_rate": 0.0005234486281648559,
"loss": 3.0433,
"step": 4670
},
{
"epoch": 0.27154834779076853,
"grad_norm": 0.14097653329372406,
"learning_rate": 0.000523064101673614,
"loss": 3.0593,
"step": 4680
},
{
"epoch": 0.27212857930314194,
"grad_norm": 0.14015048742294312,
"learning_rate": 0.0005226787538369821,
"loss": 3.057,
"step": 4690
},
{
"epoch": 0.2727088108155154,
"grad_norm": 0.1534152328968048,
"learning_rate": 0.0005222925860738513,
"loss": 3.06,
"step": 4700
},
{
"epoch": 0.2732890423278888,
"grad_norm": 0.1350966989994049,
"learning_rate": 0.0005219055998061319,
"loss": 3.0518,
"step": 4710
},
{
"epoch": 0.2738692738402623,
"grad_norm": 0.15589705109596252,
"learning_rate": 0.0005215177964587478,
"loss": 3.0468,
"step": 4720
},
{
"epoch": 0.2744495053526357,
"grad_norm": 0.14144299924373627,
"learning_rate": 0.0005211291774596316,
"loss": 3.0555,
"step": 4730
},
{
"epoch": 0.27502973686500914,
"grad_norm": 0.14553704857826233,
"learning_rate": 0.000520739744239719,
"loss": 3.0531,
"step": 4740
},
{
"epoch": 0.27560996837738255,
"grad_norm": 0.15157508850097656,
"learning_rate": 0.0005203494982329441,
"loss": 3.0504,
"step": 4750
},
{
"epoch": 0.276190199889756,
"grad_norm": 0.14391539990901947,
"learning_rate": 0.0005199584408762335,
"loss": 3.0512,
"step": 4760
},
{
"epoch": 0.2767704314021294,
"grad_norm": 0.1297539621591568,
"learning_rate": 0.0005195665736095013,
"loss": 3.036,
"step": 4770
},
{
"epoch": 0.2773506629145029,
"grad_norm": 0.13723768293857574,
"learning_rate": 0.0005191738978756439,
"loss": 3.0532,
"step": 4780
},
{
"epoch": 0.2779308944268763,
"grad_norm": 0.1422174870967865,
"learning_rate": 0.0005187804151205345,
"loss": 3.0605,
"step": 4790
},
{
"epoch": 0.27851112593924976,
"grad_norm": 0.137346088886261,
"learning_rate": 0.0005183861267930177,
"loss": 3.0552,
"step": 4800
},
{
"epoch": 0.2790913574516232,
"grad_norm": 0.13471810519695282,
"learning_rate": 0.0005179910343449046,
"loss": 3.0426,
"step": 4810
},
{
"epoch": 0.27967158896399663,
"grad_norm": 0.12727439403533936,
"learning_rate": 0.0005175951392309669,
"loss": 3.0448,
"step": 4820
},
{
"epoch": 0.2802518204763701,
"grad_norm": 0.13242101669311523,
"learning_rate": 0.0005171984429089318,
"loss": 3.0546,
"step": 4830
},
{
"epoch": 0.2808320519887435,
"grad_norm": 0.14276637136936188,
"learning_rate": 0.0005168009468394769,
"loss": 3.0392,
"step": 4840
},
{
"epoch": 0.28141228350111697,
"grad_norm": 0.1340208798646927,
"learning_rate": 0.0005164026524862242,
"loss": 3.0491,
"step": 4850
},
{
"epoch": 0.2819925150134904,
"grad_norm": 0.14000356197357178,
"learning_rate": 0.0005160035613157354,
"loss": 3.0396,
"step": 4860
},
{
"epoch": 0.28257274652586384,
"grad_norm": 0.15974439680576324,
"learning_rate": 0.0005156036747975059,
"loss": 3.0406,
"step": 4870
},
{
"epoch": 0.28315297803823725,
"grad_norm": 0.1382746398448944,
"learning_rate": 0.0005152029944039597,
"loss": 3.0449,
"step": 4880
},
{
"epoch": 0.2837332095506107,
"grad_norm": 0.14049001038074493,
"learning_rate": 0.000514801521610444,
"loss": 3.0463,
"step": 4890
},
{
"epoch": 0.2843134410629841,
"grad_norm": 0.13699445128440857,
"learning_rate": 0.0005143992578952238,
"loss": 3.0393,
"step": 4900
},
{
"epoch": 0.2848936725753576,
"grad_norm": 0.1515870988368988,
"learning_rate": 0.0005139962047394761,
"loss": 3.0399,
"step": 4910
},
{
"epoch": 0.285473904087731,
"grad_norm": 0.1437605917453766,
"learning_rate": 0.0005135923636272849,
"loss": 3.0378,
"step": 4920
},
{
"epoch": 0.28605413560010445,
"grad_norm": 0.13769088685512543,
"learning_rate": 0.0005131877360456355,
"loss": 3.0377,
"step": 4930
},
{
"epoch": 0.28663436711247786,
"grad_norm": 0.15194256603717804,
"learning_rate": 0.000512782323484409,
"loss": 3.0399,
"step": 4940
},
{
"epoch": 0.2872145986248513,
"grad_norm": 0.14672812819480896,
"learning_rate": 0.0005123761274363769,
"loss": 3.04,
"step": 4950
},
{
"epoch": 0.28779483013722473,
"grad_norm": 0.13162557780742645,
"learning_rate": 0.0005119691493971957,
"loss": 3.0317,
"step": 4960
},
{
"epoch": 0.2883750616495982,
"grad_norm": 0.13286751508712769,
"learning_rate": 0.0005115613908654011,
"loss": 3.0486,
"step": 4970
},
{
"epoch": 0.2889552931619716,
"grad_norm": 0.13034851849079132,
"learning_rate": 0.0005111528533424027,
"loss": 3.0399,
"step": 4980
},
{
"epoch": 0.28953552467434507,
"grad_norm": 0.1405908614397049,
"learning_rate": 0.0005107435383324786,
"loss": 3.0372,
"step": 4990
},
{
"epoch": 0.2901157561867185,
"grad_norm": 0.16415055096149445,
"learning_rate": 0.0005103334473427695,
"loss": 3.0333,
"step": 5000
},
{
"epoch": 0.2901157561867185,
"eval_loss": 2.9981322288513184,
"eval_runtime": 3.2581,
"eval_samples_per_second": 1329.001,
"eval_steps_per_second": 2.762,
"step": 5000
},
{
"epoch": 0.29069598769909194,
"grad_norm": 0.12301915884017944,
"learning_rate": 0.0005099225818832731,
"loss": 3.0312,
"step": 5010
},
{
"epoch": 0.29127621921146535,
"grad_norm": 0.16767041385173798,
"learning_rate": 0.0005095109434668395,
"loss": 3.0247,
"step": 5020
},
{
"epoch": 0.2918564507238388,
"grad_norm": 0.13234609365463257,
"learning_rate": 0.0005090985336091642,
"loss": 3.0348,
"step": 5030
},
{
"epoch": 0.2924366822362123,
"grad_norm": 0.14020933210849762,
"learning_rate": 0.0005086853538287835,
"loss": 3.0317,
"step": 5040
},
{
"epoch": 0.2930169137485857,
"grad_norm": 0.14580604434013367,
"learning_rate": 0.0005082714056470687,
"loss": 3.0321,
"step": 5050
},
{
"epoch": 0.29359714526095915,
"grad_norm": 0.13627541065216064,
"learning_rate": 0.0005078566905882205,
"loss": 3.0318,
"step": 5060
},
{
"epoch": 0.29417737677333256,
"grad_norm": 0.12629657983779907,
"learning_rate": 0.0005074412101792631,
"loss": 3.0284,
"step": 5070
},
{
"epoch": 0.294757608285706,
"grad_norm": 0.13409367203712463,
"learning_rate": 0.0005070249659500387,
"loss": 3.0381,
"step": 5080
},
{
"epoch": 0.2953378397980794,
"grad_norm": 0.1341470181941986,
"learning_rate": 0.0005066079594332023,
"loss": 3.0229,
"step": 5090
},
{
"epoch": 0.2959180713104529,
"grad_norm": 0.1630919873714447,
"learning_rate": 0.0005061901921642156,
"loss": 3.0315,
"step": 5100
},
{
"epoch": 0.2964983028228263,
"grad_norm": 0.12825888395309448,
"learning_rate": 0.0005057716656813416,
"loss": 3.0249,
"step": 5110
},
{
"epoch": 0.29707853433519976,
"grad_norm": 0.1613105833530426,
"learning_rate": 0.0005053523815256384,
"loss": 3.0238,
"step": 5120
},
{
"epoch": 0.29765876584757317,
"grad_norm": 0.14038483798503876,
"learning_rate": 0.0005049323412409542,
"loss": 3.0294,
"step": 5130
},
{
"epoch": 0.29823899735994663,
"grad_norm": 0.16509568691253662,
"learning_rate": 0.0005045115463739215,
"loss": 3.0356,
"step": 5140
},
{
"epoch": 0.29881922887232004,
"grad_norm": 0.14289237558841705,
"learning_rate": 0.0005040899984739509,
"loss": 3.0228,
"step": 5150
},
{
"epoch": 0.2993994603846935,
"grad_norm": 0.14584140479564667,
"learning_rate": 0.000503667699093226,
"loss": 3.0294,
"step": 5160
},
{
"epoch": 0.2999796918970669,
"grad_norm": 0.12970221042633057,
"learning_rate": 0.0005032446497866973,
"loss": 3.0321,
"step": 5170
},
{
"epoch": 0.3005599234094404,
"grad_norm": 0.13744401931762695,
"learning_rate": 0.0005028208521120769,
"loss": 3.0236,
"step": 5180
},
{
"epoch": 0.3011401549218138,
"grad_norm": 0.1317235380411148,
"learning_rate": 0.0005023963076298321,
"loss": 3.0254,
"step": 5190
},
{
"epoch": 0.30172038643418725,
"grad_norm": 0.14213494956493378,
"learning_rate": 0.0005019710179031801,
"loss": 3.0275,
"step": 5200
},
{
"epoch": 0.30230061794656066,
"grad_norm": 0.13712069392204285,
"learning_rate": 0.0005015449844980823,
"loss": 3.0249,
"step": 5210
},
{
"epoch": 0.3028808494589341,
"grad_norm": 0.14411009848117828,
"learning_rate": 0.0005011182089832381,
"loss": 3.0215,
"step": 5220
},
{
"epoch": 0.30346108097130753,
"grad_norm": 0.12583871185779572,
"learning_rate": 0.0005006906929300799,
"loss": 3.0275,
"step": 5230
},
{
"epoch": 0.304041312483681,
"grad_norm": 0.14499635994434357,
"learning_rate": 0.0005002624379127666,
"loss": 3.0258,
"step": 5240
},
{
"epoch": 0.3046215439960544,
"grad_norm": 0.14918765425682068,
"learning_rate": 0.0004998334455081779,
"loss": 3.0209,
"step": 5250
},
{
"epoch": 0.30520177550842786,
"grad_norm": 0.13245496153831482,
"learning_rate": 0.0004994037172959089,
"loss": 3.0212,
"step": 5260
},
{
"epoch": 0.3057820070208013,
"grad_norm": 0.12850724160671234,
"learning_rate": 0.0004989732548582638,
"loss": 3.0258,
"step": 5270
},
{
"epoch": 0.30636223853317474,
"grad_norm": 0.1346123367547989,
"learning_rate": 0.0004985420597802503,
"loss": 3.0138,
"step": 5280
},
{
"epoch": 0.3069424700455482,
"grad_norm": 0.14746621251106262,
"learning_rate": 0.0004981101336495741,
"loss": 3.0202,
"step": 5290
},
{
"epoch": 0.3075227015579216,
"grad_norm": 0.140406534075737,
"learning_rate": 0.0004976774780566324,
"loss": 3.0276,
"step": 5300
},
{
"epoch": 0.30810293307029507,
"grad_norm": 0.133416548371315,
"learning_rate": 0.0004972440945945083,
"loss": 3.0228,
"step": 5310
},
{
"epoch": 0.3086831645826685,
"grad_norm": 0.140433207154274,
"learning_rate": 0.0004968099848589651,
"loss": 3.0219,
"step": 5320
},
{
"epoch": 0.30926339609504194,
"grad_norm": 0.14963370561599731,
"learning_rate": 0.0004963751504484403,
"loss": 3.0119,
"step": 5330
},
{
"epoch": 0.30984362760741535,
"grad_norm": 0.12273452430963516,
"learning_rate": 0.0004959395929640401,
"loss": 3.0136,
"step": 5340
},
{
"epoch": 0.3104238591197888,
"grad_norm": 0.14232607185840607,
"learning_rate": 0.0004955033140095322,
"loss": 3.0088,
"step": 5350
},
{
"epoch": 0.3110040906321622,
"grad_norm": 0.15276071429252625,
"learning_rate": 0.0004950663151913419,
"loss": 3.0189,
"step": 5360
},
{
"epoch": 0.3115843221445357,
"grad_norm": 0.14110638201236725,
"learning_rate": 0.0004946285981185446,
"loss": 3.0273,
"step": 5370
},
{
"epoch": 0.3121645536569091,
"grad_norm": 0.12971307337284088,
"learning_rate": 0.0004941901644028601,
"loss": 3.0181,
"step": 5380
},
{
"epoch": 0.31274478516928256,
"grad_norm": 0.12775759398937225,
"learning_rate": 0.0004937510156586474,
"loss": 3.0108,
"step": 5390
},
{
"epoch": 0.31332501668165597,
"grad_norm": 0.15120139718055725,
"learning_rate": 0.0004933111535028983,
"loss": 3.0142,
"step": 5400
},
{
"epoch": 0.31390524819402943,
"grad_norm": 0.14965811371803284,
"learning_rate": 0.0004928705795552312,
"loss": 3.0137,
"step": 5410
},
{
"epoch": 0.31448547970640284,
"grad_norm": 0.1459018588066101,
"learning_rate": 0.0004924292954378856,
"loss": 3.0146,
"step": 5420
},
{
"epoch": 0.3150657112187763,
"grad_norm": 0.1286230981349945,
"learning_rate": 0.0004919873027757159,
"loss": 3.0162,
"step": 5430
},
{
"epoch": 0.3156459427311497,
"grad_norm": 0.13560357689857483,
"learning_rate": 0.0004915446031961854,
"loss": 3.0129,
"step": 5440
},
{
"epoch": 0.3162261742435232,
"grad_norm": 0.1419978141784668,
"learning_rate": 0.0004911011983293601,
"loss": 3.0115,
"step": 5450
},
{
"epoch": 0.3168064057558966,
"grad_norm": 0.12910611927509308,
"learning_rate": 0.0004906570898079032,
"loss": 3.0151,
"step": 5460
},
{
"epoch": 0.31738663726827004,
"grad_norm": 0.15491628646850586,
"learning_rate": 0.0004902122792670692,
"loss": 3.0118,
"step": 5470
},
{
"epoch": 0.31796686878064345,
"grad_norm": 0.12448934465646744,
"learning_rate": 0.0004897667683446967,
"loss": 3.0119,
"step": 5480
},
{
"epoch": 0.3185471002930169,
"grad_norm": 0.1288510411977768,
"learning_rate": 0.0004893205586812036,
"loss": 3.0078,
"step": 5490
},
{
"epoch": 0.3191273318053903,
"grad_norm": 0.12903016805648804,
"learning_rate": 0.000488873651919581,
"loss": 3.0085,
"step": 5500
},
{
"epoch": 0.3197075633177638,
"grad_norm": 0.14042973518371582,
"learning_rate": 0.0004884260497053859,
"loss": 3.0093,
"step": 5510
},
{
"epoch": 0.32028779483013725,
"grad_norm": 0.13995361328125,
"learning_rate": 0.0004879777536867369,
"loss": 3.0009,
"step": 5520
},
{
"epoch": 0.32086802634251066,
"grad_norm": 0.13979199528694153,
"learning_rate": 0.00048752876551430677,
"loss": 3.0089,
"step": 5530
},
{
"epoch": 0.3214482578548841,
"grad_norm": 0.130417600274086,
"learning_rate": 0.0004870790868413171,
"loss": 3.0087,
"step": 5540
},
{
"epoch": 0.32202848936725753,
"grad_norm": 0.13676275312900543,
"learning_rate": 0.00048662871932353164,
"loss": 3.0092,
"step": 5550
},
{
"epoch": 0.322608720879631,
"grad_norm": 0.12869158387184143,
"learning_rate": 0.00048617766461925104,
"loss": 3.0074,
"step": 5560
},
{
"epoch": 0.3231889523920044,
"grad_norm": 0.13846737146377563,
"learning_rate": 0.0004857259243893058,
"loss": 3.0079,
"step": 5570
},
{
"epoch": 0.32376918390437787,
"grad_norm": 0.1349971890449524,
"learning_rate": 0.0004852735002970509,
"loss": 2.9915,
"step": 5580
},
{
"epoch": 0.3243494154167513,
"grad_norm": 0.13398951292037964,
"learning_rate": 0.000484820394008359,
"loss": 2.9982,
"step": 5590
},
{
"epoch": 0.32492964692912474,
"grad_norm": 0.13627557456493378,
"learning_rate": 0.0004843666071916152,
"loss": 3.0019,
"step": 5600
},
{
"epoch": 0.32550987844149815,
"grad_norm": 0.13470283150672913,
"learning_rate": 0.00048391214151771,
"loss": 3.0015,
"step": 5610
},
{
"epoch": 0.3260901099538716,
"grad_norm": 0.14207038283348083,
"learning_rate": 0.0004834569986600336,
"loss": 3.0051,
"step": 5620
},
{
"epoch": 0.326670341466245,
"grad_norm": 0.13324964046478271,
"learning_rate": 0.00048300118029446967,
"loss": 2.9956,
"step": 5630
},
{
"epoch": 0.3272505729786185,
"grad_norm": 0.15288645029067993,
"learning_rate": 0.0004825446880993892,
"loss": 3.0087,
"step": 5640
},
{
"epoch": 0.3278308044909919,
"grad_norm": 0.13744772970676422,
"learning_rate": 0.00048208752375564424,
"loss": 3.0049,
"step": 5650
},
{
"epoch": 0.32841103600336535,
"grad_norm": 0.13114534318447113,
"learning_rate": 0.00048162968894656193,
"loss": 2.9993,
"step": 5660
},
{
"epoch": 0.32899126751573876,
"grad_norm": 0.1254429966211319,
"learning_rate": 0.00048117118535793773,
"loss": 2.9937,
"step": 5670
},
{
"epoch": 0.3295714990281122,
"grad_norm": 0.15155521035194397,
"learning_rate": 0.00048071201467803017,
"loss": 3.0017,
"step": 5680
},
{
"epoch": 0.33015173054048563,
"grad_norm": 0.1420249044895172,
"learning_rate": 0.00048025217859755365,
"loss": 3.017,
"step": 5690
},
{
"epoch": 0.3307319620528591,
"grad_norm": 0.14615775644779205,
"learning_rate": 0.0004797916788096728,
"loss": 3.0052,
"step": 5700
},
{
"epoch": 0.3313121935652325,
"grad_norm": 0.12851493060588837,
"learning_rate": 0.00047933051700999605,
"loss": 3.0041,
"step": 5710
},
{
"epoch": 0.33189242507760597,
"grad_norm": 0.13371190428733826,
"learning_rate": 0.00047886869489656956,
"loss": 2.9879,
"step": 5720
},
{
"epoch": 0.3324726565899794,
"grad_norm": 0.13223771750926971,
"learning_rate": 0.0004784062141698707,
"loss": 2.993,
"step": 5730
},
{
"epoch": 0.33305288810235284,
"grad_norm": 0.13460920751094818,
"learning_rate": 0.00047794307653280184,
"loss": 2.9928,
"step": 5740
},
{
"epoch": 0.3336331196147263,
"grad_norm": 0.12678171694278717,
"learning_rate": 0.0004774792836906844,
"loss": 3.0053,
"step": 5750
},
{
"epoch": 0.3342133511270997,
"grad_norm": 0.14595790207386017,
"learning_rate": 0.0004770148373512522,
"loss": 2.9974,
"step": 5760
},
{
"epoch": 0.3347935826394732,
"grad_norm": 0.1505734771490097,
"learning_rate": 0.00047654973922464525,
"loss": 3.0053,
"step": 5770
},
{
"epoch": 0.3353738141518466,
"grad_norm": 0.13636811077594757,
"learning_rate": 0.00047608399102340367,
"loss": 2.9984,
"step": 5780
},
{
"epoch": 0.33595404566422005,
"grad_norm": 0.14487333595752716,
"learning_rate": 0.000475617594462461,
"loss": 3.0013,
"step": 5790
},
{
"epoch": 0.33653427717659345,
"grad_norm": 0.13392585515975952,
"learning_rate": 0.00047515055125913825,
"loss": 2.9897,
"step": 5800
},
{
"epoch": 0.3371145086889669,
"grad_norm": 0.1241224929690361,
"learning_rate": 0.0004746828631331376,
"loss": 2.9918,
"step": 5810
},
{
"epoch": 0.3376947402013403,
"grad_norm": 0.1381169706583023,
"learning_rate": 0.00047421453180653553,
"loss": 2.9874,
"step": 5820
},
{
"epoch": 0.3382749717137138,
"grad_norm": 0.12413561344146729,
"learning_rate": 0.00047374555900377716,
"loss": 2.9928,
"step": 5830
},
{
"epoch": 0.3388552032260872,
"grad_norm": 0.13286706805229187,
"learning_rate": 0.0004732759464516694,
"loss": 2.9907,
"step": 5840
},
{
"epoch": 0.33943543473846066,
"grad_norm": 0.1558184027671814,
"learning_rate": 0.0004728056958793749,
"loss": 3.0036,
"step": 5850
},
{
"epoch": 0.34001566625083407,
"grad_norm": 0.13220670819282532,
"learning_rate": 0.0004723348090184056,
"loss": 2.9945,
"step": 5860
},
{
"epoch": 0.34059589776320753,
"grad_norm": 0.13015997409820557,
"learning_rate": 0.00047186328760261603,
"loss": 3.0005,
"step": 5870
},
{
"epoch": 0.34117612927558094,
"grad_norm": 0.146441251039505,
"learning_rate": 0.0004713911333681976,
"loss": 2.9984,
"step": 5880
},
{
"epoch": 0.3417563607879544,
"grad_norm": 0.12352869659662247,
"learning_rate": 0.0004709183480536718,
"loss": 2.9946,
"step": 5890
},
{
"epoch": 0.3423365923003278,
"grad_norm": 0.12516902387142181,
"learning_rate": 0.0004704449333998834,
"loss": 2.9918,
"step": 5900
},
{
"epoch": 0.3429168238127013,
"grad_norm": 0.14155182242393494,
"learning_rate": 0.00046997089114999494,
"loss": 2.9937,
"step": 5910
},
{
"epoch": 0.3434970553250747,
"grad_norm": 0.12636148929595947,
"learning_rate": 0.0004694962230494796,
"loss": 2.9869,
"step": 5920
},
{
"epoch": 0.34407728683744815,
"grad_norm": 0.14390048384666443,
"learning_rate": 0.000469020930846115,
"loss": 2.9759,
"step": 5930
},
{
"epoch": 0.34465751834982156,
"grad_norm": 0.14705798029899597,
"learning_rate": 0.0004685450162899768,
"loss": 2.9876,
"step": 5940
},
{
"epoch": 0.345237749862195,
"grad_norm": 0.13937653601169586,
"learning_rate": 0.00046806848113343234,
"loss": 2.9872,
"step": 5950
},
{
"epoch": 0.34581798137456843,
"grad_norm": 0.13351042568683624,
"learning_rate": 0.00046759132713113403,
"loss": 2.986,
"step": 5960
},
{
"epoch": 0.3463982128869419,
"grad_norm": 0.133000910282135,
"learning_rate": 0.0004671135560400127,
"loss": 2.9886,
"step": 5970
},
{
"epoch": 0.3469784443993153,
"grad_norm": 0.1261400580406189,
"learning_rate": 0.0004666351696192718,
"loss": 2.9811,
"step": 5980
},
{
"epoch": 0.34755867591168876,
"grad_norm": 0.13575439155101776,
"learning_rate": 0.00046615616963038007,
"loss": 2.9796,
"step": 5990
},
{
"epoch": 0.3481389074240622,
"grad_norm": 0.13202066719532013,
"learning_rate": 0.0004656765578370657,
"loss": 2.9958,
"step": 6000
},
{
"epoch": 0.3481389074240622,
"eval_loss": 2.949599027633667,
"eval_runtime": 3.2655,
"eval_samples_per_second": 1325.986,
"eval_steps_per_second": 2.756,
"step": 6000
},
{
"epoch": 0.34871913893643564,
"grad_norm": 0.14002783596515656,
"learning_rate": 0.0004651963360053096,
"loss": 2.9811,
"step": 6010
},
{
"epoch": 0.3492993704488091,
"grad_norm": 0.1519598364830017,
"learning_rate": 0.00046471550590333874,
"loss": 2.9884,
"step": 6020
},
{
"epoch": 0.3498796019611825,
"grad_norm": 0.1435564160346985,
"learning_rate": 0.00046423406930162,
"loss": 2.9831,
"step": 6030
},
{
"epoch": 0.35045983347355597,
"grad_norm": 0.1241581067442894,
"learning_rate": 0.0004637520279728534,
"loss": 2.9801,
"step": 6040
},
{
"epoch": 0.3510400649859294,
"grad_norm": 0.124722421169281,
"learning_rate": 0.00046326938369196566,
"loss": 2.9872,
"step": 6050
},
{
"epoch": 0.35162029649830284,
"grad_norm": 0.12400694936513901,
"learning_rate": 0.0004627861382361034,
"loss": 2.9863,
"step": 6060
},
{
"epoch": 0.35220052801067625,
"grad_norm": 0.14388398826122284,
"learning_rate": 0.0004623022933846272,
"loss": 2.973,
"step": 6070
},
{
"epoch": 0.3527807595230497,
"grad_norm": 0.14111004769802094,
"learning_rate": 0.0004618178509191045,
"loss": 2.9902,
"step": 6080
},
{
"epoch": 0.3533609910354231,
"grad_norm": 0.1257510930299759,
"learning_rate": 0.000461332812623303,
"loss": 2.9877,
"step": 6090
},
{
"epoch": 0.3539412225477966,
"grad_norm": 0.1282566338777542,
"learning_rate": 0.00046084718028318466,
"loss": 2.9832,
"step": 6100
},
{
"epoch": 0.35452145406017,
"grad_norm": 0.14325213432312012,
"learning_rate": 0.00046036095568689864,
"loss": 2.9782,
"step": 6110
},
{
"epoch": 0.35510168557254346,
"grad_norm": 0.1563083529472351,
"learning_rate": 0.0004598741406247748,
"loss": 2.9793,
"step": 6120
},
{
"epoch": 0.35568191708491687,
"grad_norm": 0.1327456384897232,
"learning_rate": 0.0004593867368893172,
"loss": 2.9843,
"step": 6130
},
{
"epoch": 0.35626214859729033,
"grad_norm": 0.13930997252464294,
"learning_rate": 0.0004588987462751975,
"loss": 2.976,
"step": 6140
},
{
"epoch": 0.35684238010966374,
"grad_norm": 0.1295255720615387,
"learning_rate": 0.00045841017057924807,
"loss": 2.9801,
"step": 6150
},
{
"epoch": 0.3574226116220372,
"grad_norm": 0.1404607594013214,
"learning_rate": 0.00045792101160045613,
"loss": 2.9788,
"step": 6160
},
{
"epoch": 0.3580028431344106,
"grad_norm": 0.12297389656305313,
"learning_rate": 0.0004574312711399561,
"loss": 2.9853,
"step": 6170
},
{
"epoch": 0.3585830746467841,
"grad_norm": 0.15521986782550812,
"learning_rate": 0.0004569409510010236,
"loss": 2.9825,
"step": 6180
},
{
"epoch": 0.3591633061591575,
"grad_norm": 0.12915629148483276,
"learning_rate": 0.00045645005298906887,
"loss": 2.984,
"step": 6190
},
{
"epoch": 0.35974353767153094,
"grad_norm": 0.12852182984352112,
"learning_rate": 0.00045595857891162964,
"loss": 2.9703,
"step": 6200
},
{
"epoch": 0.36032376918390435,
"grad_norm": 0.1300152987241745,
"learning_rate": 0.00045546653057836517,
"loss": 2.971,
"step": 6210
},
{
"epoch": 0.3609040006962778,
"grad_norm": 0.13348935544490814,
"learning_rate": 0.00045497390980104885,
"loss": 2.9762,
"step": 6220
},
{
"epoch": 0.3614842322086513,
"grad_norm": 0.13476519286632538,
"learning_rate": 0.00045448071839356203,
"loss": 2.9756,
"step": 6230
},
{
"epoch": 0.3620644637210247,
"grad_norm": 0.13884297013282776,
"learning_rate": 0.000453986958171887,
"loss": 2.9829,
"step": 6240
},
{
"epoch": 0.36264469523339815,
"grad_norm": 0.12928573787212372,
"learning_rate": 0.00045349263095410087,
"loss": 2.9752,
"step": 6250
},
{
"epoch": 0.36322492674577156,
"grad_norm": 0.13350141048431396,
"learning_rate": 0.000452997738560368,
"loss": 2.9748,
"step": 6260
},
{
"epoch": 0.363805158258145,
"grad_norm": 0.13747799396514893,
"learning_rate": 0.00045250228281293423,
"loss": 2.9705,
"step": 6270
},
{
"epoch": 0.36438538977051843,
"grad_norm": 0.1344989687204361,
"learning_rate": 0.00045200626553611943,
"loss": 2.9801,
"step": 6280
},
{
"epoch": 0.3649656212828919,
"grad_norm": 0.1321888118982315,
"learning_rate": 0.00045150968855631104,
"loss": 2.9781,
"step": 6290
},
{
"epoch": 0.3655458527952653,
"grad_norm": 0.12561041116714478,
"learning_rate": 0.0004510125537019577,
"loss": 2.973,
"step": 6300
},
{
"epoch": 0.36612608430763877,
"grad_norm": 0.13948814570903778,
"learning_rate": 0.00045051486280356194,
"loss": 2.9731,
"step": 6310
},
{
"epoch": 0.3667063158200122,
"grad_norm": 0.12595129013061523,
"learning_rate": 0.0004500166176936739,
"loss": 2.9659,
"step": 6320
},
{
"epoch": 0.36728654733238564,
"grad_norm": 0.12941335141658783,
"learning_rate": 0.00044951782020688415,
"loss": 2.973,
"step": 6330
},
{
"epoch": 0.36786677884475905,
"grad_norm": 0.14215658605098724,
"learning_rate": 0.00044901847217981736,
"loss": 2.975,
"step": 6340
},
{
"epoch": 0.3684470103571325,
"grad_norm": 0.12309448421001434,
"learning_rate": 0.00044851857545112525,
"loss": 2.9749,
"step": 6350
},
{
"epoch": 0.3690272418695059,
"grad_norm": 0.12824192643165588,
"learning_rate": 0.00044801813186147986,
"loss": 2.9672,
"step": 6360
},
{
"epoch": 0.3696074733818794,
"grad_norm": 0.12063992768526077,
"learning_rate": 0.00044751714325356697,
"loss": 2.9708,
"step": 6370
},
{
"epoch": 0.3701877048942528,
"grad_norm": 0.12898465991020203,
"learning_rate": 0.0004470156114720792,
"loss": 2.9699,
"step": 6380
},
{
"epoch": 0.37076793640662625,
"grad_norm": 0.1321457326412201,
"learning_rate": 0.00044651353836370897,
"loss": 2.9661,
"step": 6390
},
{
"epoch": 0.37134816791899966,
"grad_norm": 0.13804246485233307,
"learning_rate": 0.0004460109257771422,
"loss": 2.9783,
"step": 6400
},
{
"epoch": 0.3719283994313731,
"grad_norm": 0.12447643280029297,
"learning_rate": 0.00044550777556305094,
"loss": 2.9691,
"step": 6410
},
{
"epoch": 0.37250863094374653,
"grad_norm": 0.1610770970582962,
"learning_rate": 0.00044500408957408706,
"loss": 2.972,
"step": 6420
},
{
"epoch": 0.37308886245612,
"grad_norm": 0.1278504580259323,
"learning_rate": 0.00044449986966487527,
"loss": 2.9694,
"step": 6430
},
{
"epoch": 0.3736690939684934,
"grad_norm": 0.13527578115463257,
"learning_rate": 0.0004439951176920059,
"loss": 2.9707,
"step": 6440
},
{
"epoch": 0.37424932548086687,
"grad_norm": 0.14050637185573578,
"learning_rate": 0.0004434898355140287,
"loss": 2.9712,
"step": 6450
},
{
"epoch": 0.3748295569932403,
"grad_norm": 0.1513315588235855,
"learning_rate": 0.00044298402499144554,
"loss": 2.9705,
"step": 6460
},
{
"epoch": 0.37540978850561374,
"grad_norm": 0.1299854964017868,
"learning_rate": 0.00044247768798670367,
"loss": 2.9662,
"step": 6470
},
{
"epoch": 0.3759900200179872,
"grad_norm": 0.1321675330400467,
"learning_rate": 0.00044197082636418907,
"loss": 2.9675,
"step": 6480
},
{
"epoch": 0.3765702515303606,
"grad_norm": 0.1453583687543869,
"learning_rate": 0.00044146344199021934,
"loss": 2.9639,
"step": 6490
},
{
"epoch": 0.3771504830427341,
"grad_norm": 0.13450521230697632,
"learning_rate": 0.00044095553673303685,
"loss": 2.9661,
"step": 6500
},
{
"epoch": 0.3777307145551075,
"grad_norm": 0.13579097390174866,
"learning_rate": 0.00044044711246280215,
"loss": 2.9608,
"step": 6510
},
{
"epoch": 0.37831094606748095,
"grad_norm": 0.1469910442829132,
"learning_rate": 0.00043993817105158627,
"loss": 2.9686,
"step": 6520
},
{
"epoch": 0.37889117757985435,
"grad_norm": 0.1311839371919632,
"learning_rate": 0.00043942871437336527,
"loss": 2.9636,
"step": 6530
},
{
"epoch": 0.3794714090922278,
"grad_norm": 0.15060357749462128,
"learning_rate": 0.0004389187443040116,
"loss": 2.9613,
"step": 6540
},
{
"epoch": 0.3800516406046012,
"grad_norm": 0.13408997654914856,
"learning_rate": 0.00043840826272128873,
"loss": 2.9626,
"step": 6550
},
{
"epoch": 0.3806318721169747,
"grad_norm": 0.1458410769701004,
"learning_rate": 0.0004378972715048434,
"loss": 2.9604,
"step": 6560
},
{
"epoch": 0.3812121036293481,
"grad_norm": 0.13342171907424927,
"learning_rate": 0.0004373857725361984,
"loss": 2.9602,
"step": 6570
},
{
"epoch": 0.38179233514172156,
"grad_norm": 0.12624911963939667,
"learning_rate": 0.00043687376769874686,
"loss": 2.9703,
"step": 6580
},
{
"epoch": 0.38237256665409497,
"grad_norm": 0.13120518624782562,
"learning_rate": 0.0004363612588777442,
"loss": 2.9601,
"step": 6590
},
{
"epoch": 0.38295279816646843,
"grad_norm": 0.1357596516609192,
"learning_rate": 0.00043584824796030145,
"loss": 2.9561,
"step": 6600
},
{
"epoch": 0.38353302967884184,
"grad_norm": 0.1270647495985031,
"learning_rate": 0.00043533473683537863,
"loss": 2.9522,
"step": 6610
},
{
"epoch": 0.3841132611912153,
"grad_norm": 0.1325126439332962,
"learning_rate": 0.0004348207273937776,
"loss": 2.9603,
"step": 6620
},
{
"epoch": 0.3846934927035887,
"grad_norm": 0.13015331327915192,
"learning_rate": 0.0004343062215281347,
"loss": 2.955,
"step": 6630
},
{
"epoch": 0.3852737242159622,
"grad_norm": 0.12867479026317596,
"learning_rate": 0.00043379122113291465,
"loss": 2.9692,
"step": 6640
},
{
"epoch": 0.3858539557283356,
"grad_norm": 0.14423881471157074,
"learning_rate": 0.00043327572810440283,
"loss": 2.9539,
"step": 6650
},
{
"epoch": 0.38643418724070905,
"grad_norm": 0.13097575306892395,
"learning_rate": 0.00043275974434069846,
"loss": 2.9576,
"step": 6660
},
{
"epoch": 0.38701441875308246,
"grad_norm": 0.129910409450531,
"learning_rate": 0.0004322432717417079,
"loss": 2.9617,
"step": 6670
},
{
"epoch": 0.3875946502654559,
"grad_norm": 0.13308489322662354,
"learning_rate": 0.00043172631220913735,
"loss": 2.9514,
"step": 6680
},
{
"epoch": 0.38817488177782933,
"grad_norm": 0.12263292074203491,
"learning_rate": 0.00043120886764648605,
"loss": 2.9557,
"step": 6690
},
{
"epoch": 0.3887551132902028,
"grad_norm": 0.1288110911846161,
"learning_rate": 0.0004306909399590389,
"loss": 2.9558,
"step": 6700
},
{
"epoch": 0.38933534480257626,
"grad_norm": 0.12322728335857391,
"learning_rate": 0.00043017253105386005,
"loss": 2.9551,
"step": 6710
},
{
"epoch": 0.38991557631494966,
"grad_norm": 0.1551227867603302,
"learning_rate": 0.0004296536428397853,
"loss": 2.9583,
"step": 6720
},
{
"epoch": 0.3904958078273231,
"grad_norm": 0.12883497774600983,
"learning_rate": 0.00042913427722741546,
"loss": 2.9495,
"step": 6730
},
{
"epoch": 0.39107603933969654,
"grad_norm": 0.12460558116436005,
"learning_rate": 0.00042861443612910913,
"loss": 2.9597,
"step": 6740
},
{
"epoch": 0.39165627085207,
"grad_norm": 0.122388556599617,
"learning_rate": 0.00042809412145897576,
"loss": 2.9557,
"step": 6750
},
{
"epoch": 0.3922365023644434,
"grad_norm": 0.12150498479604721,
"learning_rate": 0.00042757333513286834,
"loss": 2.9489,
"step": 6760
},
{
"epoch": 0.39281673387681687,
"grad_norm": 0.15273340046405792,
"learning_rate": 0.00042705207906837666,
"loss": 2.9503,
"step": 6770
},
{
"epoch": 0.3933969653891903,
"grad_norm": 0.13954737782478333,
"learning_rate": 0.00042653035518482025,
"loss": 2.9481,
"step": 6780
},
{
"epoch": 0.39397719690156374,
"grad_norm": 0.15386004745960236,
"learning_rate": 0.0004260081654032411,
"loss": 2.9596,
"step": 6790
},
{
"epoch": 0.39455742841393715,
"grad_norm": 0.1319696307182312,
"learning_rate": 0.0004254855116463966,
"loss": 2.9526,
"step": 6800
},
{
"epoch": 0.3951376599263106,
"grad_norm": 0.14486876130104065,
"learning_rate": 0.00042496239583875286,
"loss": 2.9501,
"step": 6810
},
{
"epoch": 0.395717891438684,
"grad_norm": 0.12461838871240616,
"learning_rate": 0.0004244388199064768,
"loss": 2.9519,
"step": 6820
},
{
"epoch": 0.3962981229510575,
"grad_norm": 0.14132647216320038,
"learning_rate": 0.00042391478577743006,
"loss": 2.9533,
"step": 6830
},
{
"epoch": 0.3968783544634309,
"grad_norm": 0.12907026708126068,
"learning_rate": 0.00042339029538116104,
"loss": 2.9451,
"step": 6840
},
{
"epoch": 0.39745858597580436,
"grad_norm": 0.13801275193691254,
"learning_rate": 0.0004228653506488984,
"loss": 2.9382,
"step": 6850
},
{
"epoch": 0.39803881748817777,
"grad_norm": 0.11962810158729553,
"learning_rate": 0.00042233995351354366,
"loss": 2.9501,
"step": 6860
},
{
"epoch": 0.39861904900055123,
"grad_norm": 0.12804014980793,
"learning_rate": 0.00042181410590966413,
"loss": 2.9556,
"step": 6870
},
{
"epoch": 0.39919928051292464,
"grad_norm": 0.1232592836022377,
"learning_rate": 0.0004212878097734857,
"loss": 2.9493,
"step": 6880
},
{
"epoch": 0.3997795120252981,
"grad_norm": 0.12467402964830399,
"learning_rate": 0.0004207610670428859,
"loss": 2.9518,
"step": 6890
},
{
"epoch": 0.4003597435376715,
"grad_norm": 0.13029509782791138,
"learning_rate": 0.0004202338796573866,
"loss": 2.9476,
"step": 6900
},
{
"epoch": 0.40093997505004497,
"grad_norm": 0.13504283130168915,
"learning_rate": 0.0004197062495581471,
"loss": 2.9457,
"step": 6910
},
{
"epoch": 0.4015202065624184,
"grad_norm": 0.12205976992845535,
"learning_rate": 0.00041917817868795666,
"loss": 2.9418,
"step": 6920
},
{
"epoch": 0.40210043807479184,
"grad_norm": 0.14173905551433563,
"learning_rate": 0.0004186496689912275,
"loss": 2.9401,
"step": 6930
},
{
"epoch": 0.40268066958716525,
"grad_norm": 0.131003275513649,
"learning_rate": 0.00041812072241398764,
"loss": 2.9416,
"step": 6940
},
{
"epoch": 0.4032609010995387,
"grad_norm": 0.1430942267179489,
"learning_rate": 0.00041759134090387396,
"loss": 2.9526,
"step": 6950
},
{
"epoch": 0.4038411326119122,
"grad_norm": 0.11908053606748581,
"learning_rate": 0.00041706152641012435,
"loss": 2.9457,
"step": 6960
},
{
"epoch": 0.4044213641242856,
"grad_norm": 0.12189971655607224,
"learning_rate": 0.0004165312808835716,
"loss": 2.9497,
"step": 6970
},
{
"epoch": 0.40500159563665905,
"grad_norm": 0.1238475888967514,
"learning_rate": 0.00041600060627663515,
"loss": 2.9426,
"step": 6980
},
{
"epoch": 0.40558182714903246,
"grad_norm": 0.13269031047821045,
"learning_rate": 0.00041546950454331437,
"loss": 2.9441,
"step": 6990
},
{
"epoch": 0.4061620586614059,
"grad_norm": 0.14216388761997223,
"learning_rate": 0.0004149379776391817,
"loss": 2.9443,
"step": 7000
},
{
"epoch": 0.4061620586614059,
"eval_loss": 2.910210609436035,
"eval_runtime": 3.2597,
"eval_samples_per_second": 1328.339,
"eval_steps_per_second": 2.761,
"step": 7000
},
{
"epoch": 0.40674229017377933,
"grad_norm": 0.13298869132995605,
"learning_rate": 0.0004144060275213747,
"loss": 2.946,
"step": 7010
},
{
"epoch": 0.4073225216861528,
"grad_norm": 0.14648084342479706,
"learning_rate": 0.00041387365614858955,
"loss": 2.9468,
"step": 7020
},
{
"epoch": 0.4079027531985262,
"grad_norm": 0.13918638229370117,
"learning_rate": 0.00041334086548107336,
"loss": 2.9561,
"step": 7030
},
{
"epoch": 0.40848298471089967,
"grad_norm": 0.1421622335910797,
"learning_rate": 0.00041280765748061727,
"loss": 2.9437,
"step": 7040
},
{
"epoch": 0.4090632162232731,
"grad_norm": 0.1364564597606659,
"learning_rate": 0.0004122740341105488,
"loss": 2.9354,
"step": 7050
},
{
"epoch": 0.40964344773564654,
"grad_norm": 0.1310495287179947,
"learning_rate": 0.00041173999733572523,
"loss": 2.9471,
"step": 7060
},
{
"epoch": 0.41022367924801995,
"grad_norm": 0.14024296402931213,
"learning_rate": 0.000411205549122526,
"loss": 2.9372,
"step": 7070
},
{
"epoch": 0.4108039107603934,
"grad_norm": 0.1430574357509613,
"learning_rate": 0.0004106706914388452,
"loss": 2.9468,
"step": 7080
},
{
"epoch": 0.4113841422727668,
"grad_norm": 0.12103896588087082,
"learning_rate": 0.00041013542625408504,
"loss": 2.9463,
"step": 7090
},
{
"epoch": 0.4119643737851403,
"grad_norm": 0.12720054388046265,
"learning_rate": 0.00040959975553914787,
"loss": 2.9427,
"step": 7100
},
{
"epoch": 0.4125446052975137,
"grad_norm": 0.14135150611400604,
"learning_rate": 0.0004090636812664295,
"loss": 2.9407,
"step": 7110
},
{
"epoch": 0.41312483680988715,
"grad_norm": 0.14666588604450226,
"learning_rate": 0.0004085272054098115,
"loss": 2.9435,
"step": 7120
},
{
"epoch": 0.41370506832226056,
"grad_norm": 0.13804596662521362,
"learning_rate": 0.0004079903299446541,
"loss": 2.9365,
"step": 7130
},
{
"epoch": 0.414285299834634,
"grad_norm": 0.1470736414194107,
"learning_rate": 0.00040745305684778907,
"loss": 2.9278,
"step": 7140
},
{
"epoch": 0.41486553134700743,
"grad_norm": 0.12926244735717773,
"learning_rate": 0.00040691538809751234,
"loss": 2.9354,
"step": 7150
},
{
"epoch": 0.4154457628593809,
"grad_norm": 0.1294509321451187,
"learning_rate": 0.00040637732567357635,
"loss": 2.9466,
"step": 7160
},
{
"epoch": 0.4160259943717543,
"grad_norm": 0.12196213006973267,
"learning_rate": 0.0004058388715571835,
"loss": 2.9322,
"step": 7170
},
{
"epoch": 0.41660622588412777,
"grad_norm": 0.15902066230773926,
"learning_rate": 0.00040530002773097825,
"loss": 2.9448,
"step": 7180
},
{
"epoch": 0.41718645739650123,
"grad_norm": 0.11859998106956482,
"learning_rate": 0.0004047607961790399,
"loss": 2.9428,
"step": 7190
},
{
"epoch": 0.41776668890887464,
"grad_norm": 0.13470393419265747,
"learning_rate": 0.00040422117888687555,
"loss": 2.942,
"step": 7200
},
{
"epoch": 0.4183469204212481,
"grad_norm": 0.1288190484046936,
"learning_rate": 0.0004036811778414125,
"loss": 2.9362,
"step": 7210
},
{
"epoch": 0.4189271519336215,
"grad_norm": 0.12759481370449066,
"learning_rate": 0.0004031407950309915,
"loss": 2.9447,
"step": 7220
},
{
"epoch": 0.419507383445995,
"grad_norm": 0.13468439877033234,
"learning_rate": 0.0004026000324453584,
"loss": 2.9313,
"step": 7230
},
{
"epoch": 0.4200876149583684,
"grad_norm": 0.12287794053554535,
"learning_rate": 0.0004020588920756577,
"loss": 2.9369,
"step": 7240
},
{
"epoch": 0.42066784647074185,
"grad_norm": 0.12006892263889313,
"learning_rate": 0.00040151737591442497,
"loss": 2.9329,
"step": 7250
},
{
"epoch": 0.42124807798311525,
"grad_norm": 0.13062633574008942,
"learning_rate": 0.00040097548595557935,
"loss": 2.9474,
"step": 7260
},
{
"epoch": 0.4218283094954887,
"grad_norm": 0.12141095846891403,
"learning_rate": 0.00040043322419441667,
"loss": 2.9386,
"step": 7270
},
{
"epoch": 0.4224085410078621,
"grad_norm": 0.13452979922294617,
"learning_rate": 0.0003998905926276014,
"loss": 2.9203,
"step": 7280
},
{
"epoch": 0.4229887725202356,
"grad_norm": 0.13672851026058197,
"learning_rate": 0.0003993475932531598,
"loss": 2.9353,
"step": 7290
},
{
"epoch": 0.423569004032609,
"grad_norm": 0.1266540139913559,
"learning_rate": 0.0003988042280704724,
"loss": 2.929,
"step": 7300
},
{
"epoch": 0.42414923554498246,
"grad_norm": 0.1192171648144722,
"learning_rate": 0.0003982604990802668,
"loss": 2.9314,
"step": 7310
},
{
"epoch": 0.42472946705735587,
"grad_norm": 0.11528236418962479,
"learning_rate": 0.0003977164082846101,
"loss": 2.9349,
"step": 7320
},
{
"epoch": 0.42530969856972933,
"grad_norm": 0.12837885320186615,
"learning_rate": 0.00039717195768690155,
"loss": 2.9211,
"step": 7330
},
{
"epoch": 0.42588993008210274,
"grad_norm": 0.1254536211490631,
"learning_rate": 0.0003966271492918654,
"loss": 2.9311,
"step": 7340
},
{
"epoch": 0.4264701615944762,
"grad_norm": 0.12365511804819107,
"learning_rate": 0.0003960819851055432,
"loss": 2.9411,
"step": 7350
},
{
"epoch": 0.4270503931068496,
"grad_norm": 0.14178220927715302,
"learning_rate": 0.00039553646713528644,
"loss": 2.9322,
"step": 7360
},
{
"epoch": 0.4276306246192231,
"grad_norm": 0.13220851123332977,
"learning_rate": 0.0003949905973897496,
"loss": 2.9397,
"step": 7370
},
{
"epoch": 0.4282108561315965,
"grad_norm": 0.12264362722635269,
"learning_rate": 0.00039444437787888224,
"loss": 2.9355,
"step": 7380
},
{
"epoch": 0.42879108764396995,
"grad_norm": 0.12907512485980988,
"learning_rate": 0.00039389781061392184,
"loss": 2.9259,
"step": 7390
},
{
"epoch": 0.42937131915634336,
"grad_norm": 0.1319524645805359,
"learning_rate": 0.00039335089760738625,
"loss": 2.9284,
"step": 7400
},
{
"epoch": 0.4299515506687168,
"grad_norm": 0.1404864490032196,
"learning_rate": 0.0003928036408730664,
"loss": 2.932,
"step": 7410
},
{
"epoch": 0.43053178218109023,
"grad_norm": 0.12499509751796722,
"learning_rate": 0.00039225604242601914,
"loss": 2.9313,
"step": 7420
},
{
"epoch": 0.4311120136934637,
"grad_norm": 0.13161097466945648,
"learning_rate": 0.0003917081042825591,
"loss": 2.9261,
"step": 7430
},
{
"epoch": 0.43169224520583716,
"grad_norm": 0.13262121379375458,
"learning_rate": 0.000391159828460252,
"loss": 2.9302,
"step": 7440
},
{
"epoch": 0.43227247671821056,
"grad_norm": 0.13169781863689423,
"learning_rate": 0.0003906112169779069,
"loss": 2.9247,
"step": 7450
},
{
"epoch": 0.432852708230584,
"grad_norm": 0.1297696828842163,
"learning_rate": 0.00039006227185556865,
"loss": 2.9422,
"step": 7460
},
{
"epoch": 0.43343293974295743,
"grad_norm": 0.1292199194431305,
"learning_rate": 0.00038951299511451077,
"loss": 2.9232,
"step": 7470
},
{
"epoch": 0.4340131712553309,
"grad_norm": 0.13055439293384552,
"learning_rate": 0.0003889633887772278,
"loss": 2.9246,
"step": 7480
},
{
"epoch": 0.4345934027677043,
"grad_norm": 0.1166820153594017,
"learning_rate": 0.0003884134548674278,
"loss": 2.9361,
"step": 7490
},
{
"epoch": 0.43517363428007777,
"grad_norm": 0.12382174283266068,
"learning_rate": 0.00038786319541002487,
"loss": 2.9221,
"step": 7500
},
{
"epoch": 0.4357538657924512,
"grad_norm": 0.12510880827903748,
"learning_rate": 0.0003873126124311323,
"loss": 2.9289,
"step": 7510
},
{
"epoch": 0.43633409730482464,
"grad_norm": 0.13196755945682526,
"learning_rate": 0.000386761707958054,
"loss": 2.9203,
"step": 7520
},
{
"epoch": 0.43691432881719805,
"grad_norm": 0.13719266653060913,
"learning_rate": 0.00038621048401927817,
"loss": 2.9319,
"step": 7530
},
{
"epoch": 0.4374945603295715,
"grad_norm": 0.13211804628372192,
"learning_rate": 0.000385658942644469,
"loss": 2.9326,
"step": 7540
},
{
"epoch": 0.4380747918419449,
"grad_norm": 0.12999597191810608,
"learning_rate": 0.0003851070858644596,
"loss": 2.9239,
"step": 7550
},
{
"epoch": 0.4386550233543184,
"grad_norm": 0.13165125250816345,
"learning_rate": 0.0003845549157112445,
"loss": 2.9312,
"step": 7560
},
{
"epoch": 0.4392352548666918,
"grad_norm": 0.13743376731872559,
"learning_rate": 0.00038400243421797206,
"loss": 2.9254,
"step": 7570
},
{
"epoch": 0.43981548637906526,
"grad_norm": 0.12621231377124786,
"learning_rate": 0.00038344964341893684,
"loss": 2.9203,
"step": 7580
},
{
"epoch": 0.44039571789143866,
"grad_norm": 0.12167075276374817,
"learning_rate": 0.00038289654534957266,
"loss": 2.9281,
"step": 7590
},
{
"epoch": 0.44097594940381213,
"grad_norm": 0.13523493707180023,
"learning_rate": 0.0003823431420464444,
"loss": 2.916,
"step": 7600
},
{
"epoch": 0.44155618091618554,
"grad_norm": 0.11718156933784485,
"learning_rate": 0.0003817894355472413,
"loss": 2.9145,
"step": 7610
},
{
"epoch": 0.442136412428559,
"grad_norm": 0.13470205664634705,
"learning_rate": 0.0003812354278907683,
"loss": 2.9173,
"step": 7620
},
{
"epoch": 0.4427166439409324,
"grad_norm": 0.1286102533340454,
"learning_rate": 0.00038068112111693984,
"loss": 2.9249,
"step": 7630
},
{
"epoch": 0.44329687545330587,
"grad_norm": 0.13669750094413757,
"learning_rate": 0.00038012651726677146,
"loss": 2.9239,
"step": 7640
},
{
"epoch": 0.4438771069656793,
"grad_norm": 0.14638318121433258,
"learning_rate": 0.0003795716183823728,
"loss": 2.9306,
"step": 7650
},
{
"epoch": 0.44445733847805274,
"grad_norm": 0.13569045066833496,
"learning_rate": 0.00037901642650693944,
"loss": 2.9168,
"step": 7660
},
{
"epoch": 0.4450375699904262,
"grad_norm": 0.1257532387971878,
"learning_rate": 0.00037846094368474613,
"loss": 2.9242,
"step": 7670
},
{
"epoch": 0.4456178015027996,
"grad_norm": 0.11852803826332092,
"learning_rate": 0.0003779051719611389,
"loss": 2.9209,
"step": 7680
},
{
"epoch": 0.4461980330151731,
"grad_norm": 0.12594154477119446,
"learning_rate": 0.0003773491133825273,
"loss": 2.929,
"step": 7690
},
{
"epoch": 0.4467782645275465,
"grad_norm": 0.12566526234149933,
"learning_rate": 0.00037679276999637746,
"loss": 2.9119,
"step": 7700
},
{
"epoch": 0.44735849603991995,
"grad_norm": 0.13207079470157623,
"learning_rate": 0.0003762361438512038,
"loss": 2.917,
"step": 7710
},
{
"epoch": 0.44793872755229336,
"grad_norm": 0.13788865506649017,
"learning_rate": 0.00037567923699656226,
"loss": 2.92,
"step": 7720
},
{
"epoch": 0.4485189590646668,
"grad_norm": 0.13110986351966858,
"learning_rate": 0.00037512205148304204,
"loss": 2.9249,
"step": 7730
},
{
"epoch": 0.44909919057704023,
"grad_norm": 0.1643168181180954,
"learning_rate": 0.00037456458936225873,
"loss": 2.9232,
"step": 7740
},
{
"epoch": 0.4496794220894137,
"grad_norm": 0.14076946675777435,
"learning_rate": 0.00037400685268684623,
"loss": 2.9252,
"step": 7750
},
{
"epoch": 0.4502596536017871,
"grad_norm": 0.1238834485411644,
"learning_rate": 0.0003734488435104494,
"loss": 2.9093,
"step": 7760
},
{
"epoch": 0.45083988511416057,
"grad_norm": 0.11924099922180176,
"learning_rate": 0.00037289056388771643,
"loss": 2.9324,
"step": 7770
},
{
"epoch": 0.451420116626534,
"grad_norm": 0.13720078766345978,
"learning_rate": 0.0003723320158742914,
"loss": 2.9154,
"step": 7780
},
{
"epoch": 0.45200034813890744,
"grad_norm": 0.12532520294189453,
"learning_rate": 0.00037177320152680663,
"loss": 2.9228,
"step": 7790
},
{
"epoch": 0.45258057965128085,
"grad_norm": 0.129350483417511,
"learning_rate": 0.0003712141229028751,
"loss": 2.9071,
"step": 7800
},
{
"epoch": 0.4531608111636543,
"grad_norm": 0.12484076619148254,
"learning_rate": 0.0003706547820610828,
"loss": 2.9107,
"step": 7810
},
{
"epoch": 0.4537410426760277,
"grad_norm": 0.12527912855148315,
"learning_rate": 0.0003700951810609815,
"loss": 2.9166,
"step": 7820
},
{
"epoch": 0.4543212741884012,
"grad_norm": 0.1453130692243576,
"learning_rate": 0.0003695353219630803,
"loss": 2.9195,
"step": 7830
},
{
"epoch": 0.4549015057007746,
"grad_norm": 0.1291913241147995,
"learning_rate": 0.0003689752068288395,
"loss": 2.9124,
"step": 7840
},
{
"epoch": 0.45548173721314805,
"grad_norm": 0.12470022588968277,
"learning_rate": 0.0003684148377206615,
"loss": 2.9241,
"step": 7850
},
{
"epoch": 0.45606196872552146,
"grad_norm": 0.1276790350675583,
"learning_rate": 0.00036785421670188395,
"loss": 2.9178,
"step": 7860
},
{
"epoch": 0.4566422002378949,
"grad_norm": 0.15164950489997864,
"learning_rate": 0.0003672933458367724,
"loss": 2.9072,
"step": 7870
},
{
"epoch": 0.45722243175026833,
"grad_norm": 0.14891022443771362,
"learning_rate": 0.00036673222719051194,
"loss": 2.9235,
"step": 7880
},
{
"epoch": 0.4578026632626418,
"grad_norm": 0.1266569346189499,
"learning_rate": 0.0003661708628292003,
"loss": 2.9159,
"step": 7890
},
{
"epoch": 0.4583828947750152,
"grad_norm": 0.12030439078807831,
"learning_rate": 0.0003656092548198399,
"loss": 2.912,
"step": 7900
},
{
"epoch": 0.45896312628738867,
"grad_norm": 0.12590278685092926,
"learning_rate": 0.00036504740523033016,
"loss": 2.91,
"step": 7910
},
{
"epoch": 0.45954335779976213,
"grad_norm": 0.1255042403936386,
"learning_rate": 0.0003644853161294601,
"loss": 2.9127,
"step": 7920
},
{
"epoch": 0.46012358931213554,
"grad_norm": 0.1253713071346283,
"learning_rate": 0.0003639229895869009,
"loss": 2.9242,
"step": 7930
},
{
"epoch": 0.460703820824509,
"grad_norm": 0.1254982203245163,
"learning_rate": 0.0003633604276731975,
"loss": 2.9115,
"step": 7940
},
{
"epoch": 0.4612840523368824,
"grad_norm": 0.12157725542783737,
"learning_rate": 0.00036279763245976207,
"loss": 2.9114,
"step": 7950
},
{
"epoch": 0.4618642838492559,
"grad_norm": 0.12421195954084396,
"learning_rate": 0.00036223460601886537,
"loss": 2.9083,
"step": 7960
},
{
"epoch": 0.4624445153616293,
"grad_norm": 0.11870937049388885,
"learning_rate": 0.00036167135042362977,
"loss": 2.907,
"step": 7970
},
{
"epoch": 0.46302474687400275,
"grad_norm": 0.12460967898368835,
"learning_rate": 0.00036110786774802133,
"loss": 2.9088,
"step": 7980
},
{
"epoch": 0.46360497838637615,
"grad_norm": 0.1310334950685501,
"learning_rate": 0.00036054416006684245,
"loss": 2.9102,
"step": 7990
},
{
"epoch": 0.4641852098987496,
"grad_norm": 0.12560488283634186,
"learning_rate": 0.00035998022945572366,
"loss": 2.9097,
"step": 8000
},
{
"epoch": 0.4641852098987496,
"eval_loss": 2.875955820083618,
"eval_runtime": 3.2545,
"eval_samples_per_second": 1330.484,
"eval_steps_per_second": 2.765,
"step": 8000
},
{
"epoch": 0.464765441411123,
"grad_norm": 0.12761953473091125,
"learning_rate": 0.00035941607799111675,
"loss": 2.91,
"step": 8010
},
{
"epoch": 0.4653456729234965,
"grad_norm": 0.1247384324669838,
"learning_rate": 0.0003588517077502864,
"loss": 2.9149,
"step": 8020
},
{
"epoch": 0.4659259044358699,
"grad_norm": 0.14209751784801483,
"learning_rate": 0.00035828712081130296,
"loss": 2.9083,
"step": 8030
},
{
"epoch": 0.46650613594824336,
"grad_norm": 0.12985317409038544,
"learning_rate": 0.00035772231925303464,
"loss": 2.9046,
"step": 8040
},
{
"epoch": 0.46708636746061677,
"grad_norm": 0.14672869443893433,
"learning_rate": 0.00035715730515514,
"loss": 2.9113,
"step": 8050
},
{
"epoch": 0.46766659897299023,
"grad_norm": 0.13361111283302307,
"learning_rate": 0.0003565920805980602,
"loss": 2.913,
"step": 8060
},
{
"epoch": 0.46824683048536364,
"grad_norm": 0.12082985788583755,
"learning_rate": 0.0003560266476630112,
"loss": 2.9138,
"step": 8070
},
{
"epoch": 0.4688270619977371,
"grad_norm": 0.1150035560131073,
"learning_rate": 0.0003554610084319763,
"loss": 2.9048,
"step": 8080
},
{
"epoch": 0.4694072935101105,
"grad_norm": 0.1214471235871315,
"learning_rate": 0.0003548951649876984,
"loss": 2.9123,
"step": 8090
},
{
"epoch": 0.469987525022484,
"grad_norm": 0.12934035062789917,
"learning_rate": 0.0003543291194136723,
"loss": 2.9028,
"step": 8100
},
{
"epoch": 0.4705677565348574,
"grad_norm": 0.15276013314723969,
"learning_rate": 0.00035376287379413723,
"loss": 2.9031,
"step": 8110
},
{
"epoch": 0.47114798804723085,
"grad_norm": 0.1335725337266922,
"learning_rate": 0.00035319643021406886,
"loss": 2.9124,
"step": 8120
},
{
"epoch": 0.47172821955960426,
"grad_norm": 0.12289181351661682,
"learning_rate": 0.00035262979075917166,
"loss": 2.9053,
"step": 8130
},
{
"epoch": 0.4723084510719777,
"grad_norm": 0.11827896535396576,
"learning_rate": 0.0003520629575158715,
"loss": 2.9138,
"step": 8140
},
{
"epoch": 0.4728886825843512,
"grad_norm": 0.12505313754081726,
"learning_rate": 0.0003514959325713078,
"loss": 2.909,
"step": 8150
},
{
"epoch": 0.4734689140967246,
"grad_norm": 0.1321611851453781,
"learning_rate": 0.00035092871801332574,
"loss": 2.9075,
"step": 8160
},
{
"epoch": 0.47404914560909805,
"grad_norm": 0.12144722044467926,
"learning_rate": 0.00035036131593046895,
"loss": 2.9046,
"step": 8170
},
{
"epoch": 0.47462937712147146,
"grad_norm": 0.11893021315336227,
"learning_rate": 0.0003497937284119711,
"loss": 2.9021,
"step": 8180
},
{
"epoch": 0.4752096086338449,
"grad_norm": 0.13043691217899323,
"learning_rate": 0.0003492259575477491,
"loss": 2.9052,
"step": 8190
},
{
"epoch": 0.47578984014621833,
"grad_norm": 0.12443230301141739,
"learning_rate": 0.00034865800542839445,
"loss": 2.9003,
"step": 8200
},
{
"epoch": 0.4763700716585918,
"grad_norm": 0.1350659728050232,
"learning_rate": 0.0003480898741451667,
"loss": 2.9077,
"step": 8210
},
{
"epoch": 0.4769503031709652,
"grad_norm": 0.13212652504444122,
"learning_rate": 0.0003475215657899844,
"loss": 2.8955,
"step": 8220
},
{
"epoch": 0.47753053468333867,
"grad_norm": 0.13865076005458832,
"learning_rate": 0.0003469530824554188,
"loss": 2.9015,
"step": 8230
},
{
"epoch": 0.4781107661957121,
"grad_norm": 0.1313691884279251,
"learning_rate": 0.00034638442623468484,
"loss": 2.9014,
"step": 8240
},
{
"epoch": 0.47869099770808554,
"grad_norm": 0.13368923962116241,
"learning_rate": 0.00034581559922163447,
"loss": 2.8962,
"step": 8250
},
{
"epoch": 0.47927122922045895,
"grad_norm": 0.12228936702013016,
"learning_rate": 0.0003452466035107481,
"loss": 2.8997,
"step": 8260
},
{
"epoch": 0.4798514607328324,
"grad_norm": 0.12648892402648926,
"learning_rate": 0.00034467744119712787,
"loss": 2.9052,
"step": 8270
},
{
"epoch": 0.4804316922452058,
"grad_norm": 0.12937045097351074,
"learning_rate": 0.00034410811437648873,
"loss": 2.9037,
"step": 8280
},
{
"epoch": 0.4810119237575793,
"grad_norm": 0.12095940858125687,
"learning_rate": 0.00034353862514515185,
"loss": 2.9002,
"step": 8290
},
{
"epoch": 0.4815921552699527,
"grad_norm": 0.11992644518613815,
"learning_rate": 0.0003429689756000362,
"loss": 2.9051,
"step": 8300
},
{
"epoch": 0.48217238678232616,
"grad_norm": 0.1110587939620018,
"learning_rate": 0.0003423991678386511,
"loss": 2.9046,
"step": 8310
},
{
"epoch": 0.48275261829469956,
"grad_norm": 0.11831989139318466,
"learning_rate": 0.00034182920395908837,
"loss": 2.9001,
"step": 8320
},
{
"epoch": 0.48333284980707303,
"grad_norm": 0.11492130905389786,
"learning_rate": 0.0003412590860600148,
"loss": 2.8944,
"step": 8330
},
{
"epoch": 0.48391308131944644,
"grad_norm": 0.12855441868305206,
"learning_rate": 0.00034068881624066405,
"loss": 2.8941,
"step": 8340
},
{
"epoch": 0.4844933128318199,
"grad_norm": 0.12829254567623138,
"learning_rate": 0.0003401183966008296,
"loss": 2.8989,
"step": 8350
},
{
"epoch": 0.4850735443441933,
"grad_norm": 0.1167573556303978,
"learning_rate": 0.00033954782924085604,
"loss": 2.9027,
"step": 8360
},
{
"epoch": 0.48565377585656677,
"grad_norm": 0.12906575202941895,
"learning_rate": 0.0003389771162616324,
"loss": 2.893,
"step": 8370
},
{
"epoch": 0.4862340073689402,
"grad_norm": 0.12219451367855072,
"learning_rate": 0.00033840625976458357,
"loss": 2.8971,
"step": 8380
},
{
"epoch": 0.48681423888131364,
"grad_norm": 0.1430503875017166,
"learning_rate": 0.00033783526185166295,
"loss": 2.8945,
"step": 8390
},
{
"epoch": 0.4873944703936871,
"grad_norm": 0.1279267519712448,
"learning_rate": 0.00033726412462534454,
"loss": 2.8969,
"step": 8400
},
{
"epoch": 0.4879747019060605,
"grad_norm": 0.1239406168460846,
"learning_rate": 0.00033669285018861567,
"loss": 2.8994,
"step": 8410
},
{
"epoch": 0.488554933418434,
"grad_norm": 0.1379164159297943,
"learning_rate": 0.00033612144064496853,
"loss": 2.8949,
"step": 8420
},
{
"epoch": 0.4891351649308074,
"grad_norm": 0.12819483876228333,
"learning_rate": 0.00033554989809839294,
"loss": 2.897,
"step": 8430
},
{
"epoch": 0.48971539644318085,
"grad_norm": 0.12451434880495071,
"learning_rate": 0.00033497822465336854,
"loss": 2.903,
"step": 8440
},
{
"epoch": 0.49029562795555426,
"grad_norm": 0.1466275155544281,
"learning_rate": 0.0003344064224148567,
"loss": 2.8912,
"step": 8450
},
{
"epoch": 0.4908758594679277,
"grad_norm": 0.12186205387115479,
"learning_rate": 0.0003338344934882932,
"loss": 2.8998,
"step": 8460
},
{
"epoch": 0.49145609098030113,
"grad_norm": 0.12687867879867554,
"learning_rate": 0.00033326243997958014,
"loss": 2.8983,
"step": 8470
},
{
"epoch": 0.4920363224926746,
"grad_norm": 0.12620693445205688,
"learning_rate": 0.00033269026399507874,
"loss": 2.895,
"step": 8480
},
{
"epoch": 0.492616554005048,
"grad_norm": 0.1362224668264389,
"learning_rate": 0.00033211796764160074,
"loss": 2.9007,
"step": 8490
},
{
"epoch": 0.49319678551742147,
"grad_norm": 0.1300470530986786,
"learning_rate": 0.00033154555302640135,
"loss": 2.8914,
"step": 8500
},
{
"epoch": 0.4937770170297949,
"grad_norm": 0.12057654559612274,
"learning_rate": 0.00033097302225717096,
"loss": 2.8971,
"step": 8510
},
{
"epoch": 0.49435724854216834,
"grad_norm": 0.13263335824012756,
"learning_rate": 0.00033040037744202805,
"loss": 2.8971,
"step": 8520
},
{
"epoch": 0.49493748005454175,
"grad_norm": 0.12660051882266998,
"learning_rate": 0.00032982762068951073,
"loss": 2.8914,
"step": 8530
},
{
"epoch": 0.4955177115669152,
"grad_norm": 0.12398383021354675,
"learning_rate": 0.0003292547541085694,
"loss": 2.8936,
"step": 8540
},
{
"epoch": 0.4960979430792886,
"grad_norm": 0.1229000836610794,
"learning_rate": 0.00032868177980855876,
"loss": 2.888,
"step": 8550
},
{
"epoch": 0.4966781745916621,
"grad_norm": 0.11801040917634964,
"learning_rate": 0.0003281086998992303,
"loss": 2.8909,
"step": 8560
},
{
"epoch": 0.4972584061040355,
"grad_norm": 0.12945981323719025,
"learning_rate": 0.0003275355164907241,
"loss": 2.8878,
"step": 8570
},
{
"epoch": 0.49783863761640895,
"grad_norm": 0.12002068758010864,
"learning_rate": 0.0003269622316935618,
"loss": 2.892,
"step": 8580
},
{
"epoch": 0.49841886912878236,
"grad_norm": 0.12449994683265686,
"learning_rate": 0.0003263888476186377,
"loss": 2.8912,
"step": 8590
},
{
"epoch": 0.4989991006411558,
"grad_norm": 0.13638156652450562,
"learning_rate": 0.0003258153663772124,
"loss": 2.8877,
"step": 8600
},
{
"epoch": 0.49957933215352923,
"grad_norm": 0.12280316650867462,
"learning_rate": 0.0003252417900809038,
"loss": 2.8879,
"step": 8610
},
{
"epoch": 0.5001595636659026,
"grad_norm": 0.12275322526693344,
"learning_rate": 0.0003246681208416797,
"loss": 2.8906,
"step": 8620
},
{
"epoch": 0.5007397951782762,
"grad_norm": 0.1220172718167305,
"learning_rate": 0.0003240943607718506,
"loss": 2.8952,
"step": 8630
},
{
"epoch": 0.5013200266906496,
"grad_norm": 0.11458177119493484,
"learning_rate": 0.00032352051198406104,
"loss": 2.902,
"step": 8640
},
{
"epoch": 0.501900258203023,
"grad_norm": 0.12652765214443207,
"learning_rate": 0.0003229465765912824,
"loss": 2.9038,
"step": 8650
},
{
"epoch": 0.5024804897153965,
"grad_norm": 0.12456042319536209,
"learning_rate": 0.000322372556706805,
"loss": 2.8844,
"step": 8660
},
{
"epoch": 0.5030607212277699,
"grad_norm": 0.13799023628234863,
"learning_rate": 0.0003217984544442301,
"loss": 2.8987,
"step": 8670
},
{
"epoch": 0.5036409527401433,
"grad_norm": 0.12474406510591507,
"learning_rate": 0.00032122427191746234,
"loss": 2.8976,
"step": 8680
},
{
"epoch": 0.5042211842525167,
"grad_norm": 0.12724703550338745,
"learning_rate": 0.00032065001124070207,
"loss": 2.8862,
"step": 8690
},
{
"epoch": 0.5048014157648902,
"grad_norm": 0.11946358531713486,
"learning_rate": 0.0003200756745284371,
"loss": 2.8926,
"step": 8700
},
{
"epoch": 0.5053816472772636,
"grad_norm": 0.1258503645658493,
"learning_rate": 0.0003195012638954354,
"loss": 2.8932,
"step": 8710
},
{
"epoch": 0.505961878789637,
"grad_norm": 0.12079302221536636,
"learning_rate": 0.00031892678145673724,
"loss": 2.8914,
"step": 8720
},
{
"epoch": 0.5065421103020105,
"grad_norm": 0.12168605625629425,
"learning_rate": 0.000318352229327647,
"loss": 2.8867,
"step": 8730
},
{
"epoch": 0.507122341814384,
"grad_norm": 0.13427579402923584,
"learning_rate": 0.00031777760962372584,
"loss": 2.8893,
"step": 8740
},
{
"epoch": 0.5077025733267574,
"grad_norm": 0.1176985576748848,
"learning_rate": 0.00031720292446078374,
"loss": 2.8887,
"step": 8750
},
{
"epoch": 0.5082828048391308,
"grad_norm": 0.12351604551076889,
"learning_rate": 0.00031662817595487166,
"loss": 2.8915,
"step": 8760
},
{
"epoch": 0.5088630363515042,
"grad_norm": 0.1390778124332428,
"learning_rate": 0.00031605336622227365,
"loss": 2.8737,
"step": 8770
},
{
"epoch": 0.5094432678638777,
"grad_norm": 0.11954103410243988,
"learning_rate": 0.00031547849737949957,
"loss": 2.8888,
"step": 8780
},
{
"epoch": 0.5100234993762511,
"grad_norm": 0.12293373793363571,
"learning_rate": 0.00031490357154327674,
"loss": 2.8814,
"step": 8790
},
{
"epoch": 0.5106037308886245,
"grad_norm": 0.12284509837627411,
"learning_rate": 0.0003143285908305422,
"loss": 2.8874,
"step": 8800
},
{
"epoch": 0.511183962400998,
"grad_norm": 0.11924895644187927,
"learning_rate": 0.00031375355735843523,
"loss": 2.8813,
"step": 8810
},
{
"epoch": 0.5117641939133715,
"grad_norm": 0.12003005295991898,
"learning_rate": 0.00031317847324428924,
"loss": 2.8836,
"step": 8820
},
{
"epoch": 0.5123444254257449,
"grad_norm": 0.13070861995220184,
"learning_rate": 0.00031260334060562416,
"loss": 2.8851,
"step": 8830
},
{
"epoch": 0.5129246569381183,
"grad_norm": 0.11900255084037781,
"learning_rate": 0.0003120281615601387,
"loss": 2.8827,
"step": 8840
},
{
"epoch": 0.5135048884504917,
"grad_norm": 0.12470702081918716,
"learning_rate": 0.0003114529382257024,
"loss": 2.8916,
"step": 8850
},
{
"epoch": 0.5140851199628652,
"grad_norm": 0.1312616765499115,
"learning_rate": 0.0003108776727203478,
"loss": 2.897,
"step": 8860
},
{
"epoch": 0.5146653514752386,
"grad_norm": 0.13872870802879333,
"learning_rate": 0.00031030236716226265,
"loss": 2.8836,
"step": 8870
},
{
"epoch": 0.515245582987612,
"grad_norm": 0.11608674377202988,
"learning_rate": 0.00030972702366978237,
"loss": 2.8875,
"step": 8880
},
{
"epoch": 0.5158258144999855,
"grad_norm": 0.12205769121646881,
"learning_rate": 0.000309151644361382,
"loss": 2.8862,
"step": 8890
},
{
"epoch": 0.516406046012359,
"grad_norm": 0.12009671330451965,
"learning_rate": 0.0003085762313556683,
"loss": 2.8797,
"step": 8900
},
{
"epoch": 0.5169862775247324,
"grad_norm": 0.12120591104030609,
"learning_rate": 0.0003080007867713724,
"loss": 2.8905,
"step": 8910
},
{
"epoch": 0.5175665090371058,
"grad_norm": 0.12842518091201782,
"learning_rate": 0.00030742531272734153,
"loss": 2.8747,
"step": 8920
},
{
"epoch": 0.5181467405494793,
"grad_norm": 0.12532438337802887,
"learning_rate": 0.00030684981134253123,
"loss": 2.8892,
"step": 8930
},
{
"epoch": 0.5187269720618527,
"grad_norm": 0.1295221596956253,
"learning_rate": 0.0003062742847359981,
"loss": 2.8842,
"step": 8940
},
{
"epoch": 0.5193072035742261,
"grad_norm": 0.1296953707933426,
"learning_rate": 0.00030569873502689116,
"loss": 2.878,
"step": 8950
},
{
"epoch": 0.5198874350865995,
"grad_norm": 0.14120282232761383,
"learning_rate": 0.00030512316433444495,
"loss": 2.8809,
"step": 8960
},
{
"epoch": 0.520467666598973,
"grad_norm": 0.12610268592834473,
"learning_rate": 0.000304547574777971,
"loss": 2.8794,
"step": 8970
},
{
"epoch": 0.5210478981113464,
"grad_norm": 0.11908390372991562,
"learning_rate": 0.0003039719684768503,
"loss": 2.8839,
"step": 8980
},
{
"epoch": 0.5216281296237198,
"grad_norm": 0.13508306443691254,
"learning_rate": 0.0003033963475505256,
"loss": 2.8782,
"step": 8990
},
{
"epoch": 0.5222083611360933,
"grad_norm": 0.12108524888753891,
"learning_rate": 0.00030282071411849343,
"loss": 2.879,
"step": 9000
},
{
"epoch": 0.5222083611360933,
"eval_loss": 2.845144271850586,
"eval_runtime": 3.2553,
"eval_samples_per_second": 1330.14,
"eval_steps_per_second": 2.765,
"step": 9000
},
{
"epoch": 0.5227885926484668,
"grad_norm": 0.13046176731586456,
"learning_rate": 0.00030224507030029627,
"loss": 2.8809,
"step": 9010
},
{
"epoch": 0.5233688241608402,
"grad_norm": 0.12113803625106812,
"learning_rate": 0.0003016694182155152,
"loss": 2.8839,
"step": 9020
},
{
"epoch": 0.5239490556732136,
"grad_norm": 0.12337899953126907,
"learning_rate": 0.0003010937599837613,
"loss": 2.8821,
"step": 9030
},
{
"epoch": 0.524529287185587,
"grad_norm": 0.11981160938739777,
"learning_rate": 0.0003005180977246686,
"loss": 2.888,
"step": 9040
},
{
"epoch": 0.5251095186979605,
"grad_norm": 0.12357629835605621,
"learning_rate": 0.0002999424335578858,
"loss": 2.8804,
"step": 9050
},
{
"epoch": 0.5256897502103339,
"grad_norm": 0.11688230186700821,
"learning_rate": 0.00029936676960306863,
"loss": 2.8891,
"step": 9060
},
{
"epoch": 0.5262699817227073,
"grad_norm": 0.11743608117103577,
"learning_rate": 0.0002987911079798723,
"loss": 2.8685,
"step": 9070
},
{
"epoch": 0.5268502132350807,
"grad_norm": 0.1338096410036087,
"learning_rate": 0.0002982154508079428,
"loss": 2.8758,
"step": 9080
},
{
"epoch": 0.5274304447474543,
"grad_norm": 0.13182982802391052,
"learning_rate": 0.0002976398002069105,
"loss": 2.882,
"step": 9090
},
{
"epoch": 0.5280106762598277,
"grad_norm": 0.12470164895057678,
"learning_rate": 0.000297064158296381,
"loss": 2.8817,
"step": 9100
},
{
"epoch": 0.5285909077722011,
"grad_norm": 0.11741513013839722,
"learning_rate": 0.0002964885271959282,
"loss": 2.8768,
"step": 9110
},
{
"epoch": 0.5291711392845746,
"grad_norm": 0.1364392340183258,
"learning_rate": 0.0002959129090250863,
"loss": 2.8822,
"step": 9120
},
{
"epoch": 0.529751370796948,
"grad_norm": 0.12005024403333664,
"learning_rate": 0.0002953373059033413,
"loss": 2.8789,
"step": 9130
},
{
"epoch": 0.5303316023093214,
"grad_norm": 0.1239180713891983,
"learning_rate": 0.0002947617199501245,
"loss": 2.8754,
"step": 9140
},
{
"epoch": 0.5309118338216948,
"grad_norm": 0.12774530053138733,
"learning_rate": 0.00029418615328480357,
"loss": 2.8773,
"step": 9150
},
{
"epoch": 0.5314920653340683,
"grad_norm": 0.11815381795167923,
"learning_rate": 0.00029361060802667526,
"loss": 2.8711,
"step": 9160
},
{
"epoch": 0.5320722968464418,
"grad_norm": 0.12450312077999115,
"learning_rate": 0.0002930350862949577,
"loss": 2.8743,
"step": 9170
},
{
"epoch": 0.5326525283588152,
"grad_norm": 0.12741632759571075,
"learning_rate": 0.00029245959020878187,
"loss": 2.8846,
"step": 9180
},
{
"epoch": 0.5332327598711886,
"grad_norm": 0.12712997198104858,
"learning_rate": 0.0002918841218871848,
"loss": 2.8774,
"step": 9190
},
{
"epoch": 0.5338129913835621,
"grad_norm": 0.11238303780555725,
"learning_rate": 0.0002913086834491012,
"loss": 2.8782,
"step": 9200
},
{
"epoch": 0.5343932228959355,
"grad_norm": 0.1266774982213974,
"learning_rate": 0.00029073327701335566,
"loss": 2.883,
"step": 9210
},
{
"epoch": 0.5349734544083089,
"grad_norm": 0.12266207486391068,
"learning_rate": 0.00029015790469865484,
"loss": 2.8735,
"step": 9220
},
{
"epoch": 0.5355536859206823,
"grad_norm": 0.10979332774877548,
"learning_rate": 0.0002895825686235799,
"loss": 2.8791,
"step": 9230
},
{
"epoch": 0.5361339174330558,
"grad_norm": 0.11939531564712524,
"learning_rate": 0.0002890072709065787,
"loss": 2.8745,
"step": 9240
},
{
"epoch": 0.5367141489454292,
"grad_norm": 0.12080537527799606,
"learning_rate": 0.0002884320136659575,
"loss": 2.8775,
"step": 9250
},
{
"epoch": 0.5372943804578026,
"grad_norm": 0.12394317239522934,
"learning_rate": 0.00028785679901987394,
"loss": 2.8734,
"step": 9260
},
{
"epoch": 0.537874611970176,
"grad_norm": 0.12320924550294876,
"learning_rate": 0.0002872816290863283,
"loss": 2.8703,
"step": 9270
},
{
"epoch": 0.5384548434825496,
"grad_norm": 0.12183520197868347,
"learning_rate": 0.0002867065059831568,
"loss": 2.8731,
"step": 9280
},
{
"epoch": 0.539035074994923,
"grad_norm": 0.13638751208782196,
"learning_rate": 0.0002861314318280229,
"loss": 2.8725,
"step": 9290
},
{
"epoch": 0.5396153065072964,
"grad_norm": 0.12684093415737152,
"learning_rate": 0.0002855564087384098,
"loss": 2.8714,
"step": 9300
},
{
"epoch": 0.5401955380196698,
"grad_norm": 0.11322664469480515,
"learning_rate": 0.00028498143883161277,
"loss": 2.8693,
"step": 9310
},
{
"epoch": 0.5407757695320433,
"grad_norm": 0.11759771406650543,
"learning_rate": 0.00028440652422473124,
"loss": 2.8679,
"step": 9320
},
{
"epoch": 0.5413560010444167,
"grad_norm": 0.12511123716831207,
"learning_rate": 0.0002838316670346612,
"loss": 2.8744,
"step": 9330
},
{
"epoch": 0.5419362325567901,
"grad_norm": 0.1160508468747139,
"learning_rate": 0.00028325686937808673,
"loss": 2.874,
"step": 9340
},
{
"epoch": 0.5425164640691637,
"grad_norm": 0.11813979595899582,
"learning_rate": 0.0002826821333714732,
"loss": 2.8691,
"step": 9350
},
{
"epoch": 0.5430966955815371,
"grad_norm": 0.11728700250387192,
"learning_rate": 0.0002821074611310588,
"loss": 2.8717,
"step": 9360
},
{
"epoch": 0.5436769270939105,
"grad_norm": 0.12824493646621704,
"learning_rate": 0.0002815328547728469,
"loss": 2.875,
"step": 9370
},
{
"epoch": 0.5442571586062839,
"grad_norm": 0.12653270363807678,
"learning_rate": 0.0002809583164125983,
"loss": 2.8682,
"step": 9380
},
{
"epoch": 0.5448373901186574,
"grad_norm": 0.13113363087177277,
"learning_rate": 0.00028038384816582337,
"loss": 2.8583,
"step": 9390
},
{
"epoch": 0.5454176216310308,
"grad_norm": 0.11145169287919998,
"learning_rate": 0.0002798094521477744,
"loss": 2.8714,
"step": 9400
},
{
"epoch": 0.5459978531434042,
"grad_norm": 0.12025914341211319,
"learning_rate": 0.0002792351304734378,
"loss": 2.8689,
"step": 9410
},
{
"epoch": 0.5465780846557776,
"grad_norm": 0.1347450315952301,
"learning_rate": 0.000278660885257526,
"loss": 2.8803,
"step": 9420
},
{
"epoch": 0.5471583161681511,
"grad_norm": 0.11728854477405548,
"learning_rate": 0.0002780867186144703,
"loss": 2.8614,
"step": 9430
},
{
"epoch": 0.5477385476805245,
"grad_norm": 0.1399793028831482,
"learning_rate": 0.00027751263265841204,
"loss": 2.8777,
"step": 9440
},
{
"epoch": 0.548318779192898,
"grad_norm": 0.13229645788669586,
"learning_rate": 0.0002769386295031961,
"loss": 2.8723,
"step": 9450
},
{
"epoch": 0.5488990107052714,
"grad_norm": 0.12199070304632187,
"learning_rate": 0.00027636471126236213,
"loss": 2.8577,
"step": 9460
},
{
"epoch": 0.5494792422176449,
"grad_norm": 0.14131730794906616,
"learning_rate": 0.0002757908800491373,
"loss": 2.857,
"step": 9470
},
{
"epoch": 0.5500594737300183,
"grad_norm": 0.1343252956867218,
"learning_rate": 0.0002752171379764283,
"loss": 2.8689,
"step": 9480
},
{
"epoch": 0.5506397052423917,
"grad_norm": 0.1338685154914856,
"learning_rate": 0.0002746434871568133,
"loss": 2.8775,
"step": 9490
},
{
"epoch": 0.5512199367547651,
"grad_norm": 0.12388128787279129,
"learning_rate": 0.00027406992970253506,
"loss": 2.8761,
"step": 9500
},
{
"epoch": 0.5518001682671386,
"grad_norm": 0.12272147834300995,
"learning_rate": 0.0002734964677254918,
"loss": 2.8722,
"step": 9510
},
{
"epoch": 0.552380399779512,
"grad_norm": 0.12000911682844162,
"learning_rate": 0.00027292310333723086,
"loss": 2.8743,
"step": 9520
},
{
"epoch": 0.5529606312918854,
"grad_norm": 0.13635672628879547,
"learning_rate": 0.00027234983864894,
"loss": 2.8657,
"step": 9530
},
{
"epoch": 0.5535408628042588,
"grad_norm": 0.12129581719636917,
"learning_rate": 0.0002717766757714398,
"loss": 2.8661,
"step": 9540
},
{
"epoch": 0.5541210943166324,
"grad_norm": 0.11717355996370316,
"learning_rate": 0.00027120361681517606,
"loss": 2.8707,
"step": 9550
},
{
"epoch": 0.5547013258290058,
"grad_norm": 0.12199341505765915,
"learning_rate": 0.0002706306638902117,
"loss": 2.8555,
"step": 9560
},
{
"epoch": 0.5552815573413792,
"grad_norm": 0.1175154522061348,
"learning_rate": 0.0002700578191062196,
"loss": 2.8721,
"step": 9570
},
{
"epoch": 0.5558617888537526,
"grad_norm": 0.12546683847904205,
"learning_rate": 0.00026948508457247416,
"loss": 2.8689,
"step": 9580
},
{
"epoch": 0.5564420203661261,
"grad_norm": 0.11439734697341919,
"learning_rate": 0.000268912462397844,
"loss": 2.8552,
"step": 9590
},
{
"epoch": 0.5570222518784995,
"grad_norm": 0.13139833509922028,
"learning_rate": 0.00026833995469078404,
"loss": 2.8728,
"step": 9600
},
{
"epoch": 0.5576024833908729,
"grad_norm": 0.14722158014774323,
"learning_rate": 0.00026776756355932743,
"loss": 2.8594,
"step": 9610
},
{
"epoch": 0.5581827149032464,
"grad_norm": 0.12206868082284927,
"learning_rate": 0.00026719529111107846,
"loss": 2.8713,
"step": 9620
},
{
"epoch": 0.5587629464156199,
"grad_norm": 0.11777371913194656,
"learning_rate": 0.00026662313945320404,
"loss": 2.8656,
"step": 9630
},
{
"epoch": 0.5593431779279933,
"grad_norm": 0.12058188021183014,
"learning_rate": 0.00026605111069242664,
"loss": 2.8712,
"step": 9640
},
{
"epoch": 0.5599234094403667,
"grad_norm": 0.1278459131717682,
"learning_rate": 0.00026547920693501616,
"loss": 2.8686,
"step": 9650
},
{
"epoch": 0.5605036409527402,
"grad_norm": 0.12272592633962631,
"learning_rate": 0.00026490743028678194,
"loss": 2.8636,
"step": 9660
},
{
"epoch": 0.5610838724651136,
"grad_norm": 0.11543965339660645,
"learning_rate": 0.00026433578285306567,
"loss": 2.8592,
"step": 9670
},
{
"epoch": 0.561664103977487,
"grad_norm": 0.11765621602535248,
"learning_rate": 0.0002637642667387329,
"loss": 2.867,
"step": 9680
},
{
"epoch": 0.5622443354898604,
"grad_norm": 0.12996822595596313,
"learning_rate": 0.0002631928840481662,
"loss": 2.8669,
"step": 9690
},
{
"epoch": 0.5628245670022339,
"grad_norm": 0.11992313712835312,
"learning_rate": 0.00026262163688525606,
"loss": 2.8576,
"step": 9700
},
{
"epoch": 0.5634047985146073,
"grad_norm": 0.1216612309217453,
"learning_rate": 0.00026205052735339457,
"loss": 2.8656,
"step": 9710
},
{
"epoch": 0.5639850300269807,
"grad_norm": 0.11923664063215256,
"learning_rate": 0.00026147955755546686,
"loss": 2.8625,
"step": 9720
},
{
"epoch": 0.5645652615393542,
"grad_norm": 0.1174679845571518,
"learning_rate": 0.00026090872959384353,
"loss": 2.8589,
"step": 9730
},
{
"epoch": 0.5651454930517277,
"grad_norm": 0.12439408898353577,
"learning_rate": 0.00026033804557037304,
"loss": 2.8573,
"step": 9740
},
{
"epoch": 0.5657257245641011,
"grad_norm": 0.12268688529729843,
"learning_rate": 0.0002597675075863735,
"loss": 2.8612,
"step": 9750
},
{
"epoch": 0.5663059560764745,
"grad_norm": 0.11994469910860062,
"learning_rate": 0.0002591971177426256,
"loss": 2.8667,
"step": 9760
},
{
"epoch": 0.5668861875888479,
"grad_norm": 0.12739793956279755,
"learning_rate": 0.0002586268781393648,
"loss": 2.8657,
"step": 9770
},
{
"epoch": 0.5674664191012214,
"grad_norm": 0.12942016124725342,
"learning_rate": 0.00025805679087627267,
"loss": 2.863,
"step": 9780
},
{
"epoch": 0.5680466506135948,
"grad_norm": 0.12867708504199982,
"learning_rate": 0.00025748685805247046,
"loss": 2.8596,
"step": 9790
},
{
"epoch": 0.5686268821259682,
"grad_norm": 0.1384700983762741,
"learning_rate": 0.00025691708176651034,
"loss": 2.8612,
"step": 9800
},
{
"epoch": 0.5692071136383416,
"grad_norm": 0.11695626378059387,
"learning_rate": 0.0002563474641163686,
"loss": 2.8613,
"step": 9810
},
{
"epoch": 0.5697873451507152,
"grad_norm": 0.12379258126020432,
"learning_rate": 0.0002557780071994367,
"loss": 2.8637,
"step": 9820
},
{
"epoch": 0.5703675766630886,
"grad_norm": 0.13220758736133575,
"learning_rate": 0.00025520871311251493,
"loss": 2.8572,
"step": 9830
},
{
"epoch": 0.570947808175462,
"grad_norm": 0.12004509568214417,
"learning_rate": 0.00025463958395180377,
"loss": 2.8614,
"step": 9840
},
{
"epoch": 0.5715280396878355,
"grad_norm": 0.12457242608070374,
"learning_rate": 0.0002540706218128962,
"loss": 2.8606,
"step": 9850
},
{
"epoch": 0.5721082712002089,
"grad_norm": 0.125260129570961,
"learning_rate": 0.0002535018287907707,
"loss": 2.8606,
"step": 9860
},
{
"epoch": 0.5726885027125823,
"grad_norm": 0.11718660593032837,
"learning_rate": 0.00025293320697978254,
"loss": 2.86,
"step": 9870
},
{
"epoch": 0.5732687342249557,
"grad_norm": 0.1096329316496849,
"learning_rate": 0.0002523647584736568,
"loss": 2.8743,
"step": 9880
},
{
"epoch": 0.5738489657373292,
"grad_norm": 0.11327598243951797,
"learning_rate": 0.0002517964853654806,
"loss": 2.8492,
"step": 9890
},
{
"epoch": 0.5744291972497026,
"grad_norm": 0.1237105280160904,
"learning_rate": 0.0002512283897476949,
"loss": 2.852,
"step": 9900
},
{
"epoch": 0.5750094287620761,
"grad_norm": 0.11739984154701233,
"learning_rate": 0.0002506604737120874,
"loss": 2.8535,
"step": 9910
},
{
"epoch": 0.5755896602744495,
"grad_norm": 0.12682320177555084,
"learning_rate": 0.00025009273934978424,
"loss": 2.8575,
"step": 9920
},
{
"epoch": 0.576169891786823,
"grad_norm": 0.12347414344549179,
"learning_rate": 0.00024952518875124305,
"loss": 2.8596,
"step": 9930
},
{
"epoch": 0.5767501232991964,
"grad_norm": 0.11207421123981476,
"learning_rate": 0.0002489578240062444,
"loss": 2.8563,
"step": 9940
},
{
"epoch": 0.5773303548115698,
"grad_norm": 0.12151192873716354,
"learning_rate": 0.0002483906472038848,
"loss": 2.8513,
"step": 9950
},
{
"epoch": 0.5779105863239432,
"grad_norm": 0.11661417037248611,
"learning_rate": 0.00024782366043256876,
"loss": 2.8538,
"step": 9960
},
{
"epoch": 0.5784908178363167,
"grad_norm": 0.11908597499132156,
"learning_rate": 0.0002472568657800007,
"loss": 2.8549,
"step": 9970
},
{
"epoch": 0.5790710493486901,
"grad_norm": 0.12369140982627869,
"learning_rate": 0.00024669026533317816,
"loss": 2.859,
"step": 9980
},
{
"epoch": 0.5796512808610635,
"grad_norm": 0.12169597297906876,
"learning_rate": 0.0002461238611783832,
"loss": 2.8516,
"step": 9990
},
{
"epoch": 0.580231512373437,
"grad_norm": 0.1137092188000679,
"learning_rate": 0.0002455576554011753,
"loss": 2.8506,
"step": 10000
},
{
"epoch": 0.580231512373437,
"eval_loss": 2.8198139667510986,
"eval_runtime": 3.2544,
"eval_samples_per_second": 1330.504,
"eval_steps_per_second": 2.765,
"step": 10000
},
{
"epoch": 0.5808117438858105,
"grad_norm": 0.11945224553346634,
"learning_rate": 0.00024499165008638355,
"loss": 2.8527,
"step": 10010
},
{
"epoch": 0.5813919753981839,
"grad_norm": 0.12194681167602539,
"learning_rate": 0.0002444258473180986,
"loss": 2.8676,
"step": 10020
},
{
"epoch": 0.5819722069105573,
"grad_norm": 0.12587039172649384,
"learning_rate": 0.00024386024917966563,
"loss": 2.8468,
"step": 10030
},
{
"epoch": 0.5825524384229307,
"grad_norm": 0.12192162871360779,
"learning_rate": 0.0002432948577536762,
"loss": 2.8484,
"step": 10040
},
{
"epoch": 0.5831326699353042,
"grad_norm": 0.11401449888944626,
"learning_rate": 0.00024272967512196093,
"loss": 2.8636,
"step": 10050
},
{
"epoch": 0.5837129014476776,
"grad_norm": 0.12227935343980789,
"learning_rate": 0.0002421647033655812,
"loss": 2.8497,
"step": 10060
},
{
"epoch": 0.584293132960051,
"grad_norm": 0.11773716658353806,
"learning_rate": 0.00024159994456482233,
"loss": 2.857,
"step": 10070
},
{
"epoch": 0.5848733644724246,
"grad_norm": 0.124253049492836,
"learning_rate": 0.00024103540079918555,
"loss": 2.8499,
"step": 10080
},
{
"epoch": 0.585453595984798,
"grad_norm": 0.11704014986753464,
"learning_rate": 0.00024047107414737985,
"loss": 2.8522,
"step": 10090
},
{
"epoch": 0.5860338274971714,
"grad_norm": 0.11885286867618561,
"learning_rate": 0.0002399069666873153,
"loss": 2.855,
"step": 10100
},
{
"epoch": 0.5866140590095448,
"grad_norm": 0.12006965279579163,
"learning_rate": 0.00023934308049609453,
"loss": 2.8488,
"step": 10110
},
{
"epoch": 0.5871942905219183,
"grad_norm": 0.12023113667964935,
"learning_rate": 0.00023877941765000564,
"loss": 2.8542,
"step": 10120
},
{
"epoch": 0.5877745220342917,
"grad_norm": 0.12737338244915009,
"learning_rate": 0.00023821598022451436,
"loss": 2.8588,
"step": 10130
},
{
"epoch": 0.5883547535466651,
"grad_norm": 0.11698620766401291,
"learning_rate": 0.00023765277029425607,
"loss": 2.8544,
"step": 10140
},
{
"epoch": 0.5889349850590385,
"grad_norm": 0.12589864432811737,
"learning_rate": 0.000237089789933029,
"loss": 2.8448,
"step": 10150
},
{
"epoch": 0.589515216571412,
"grad_norm": 0.11532309651374817,
"learning_rate": 0.0002365270412137856,
"loss": 2.8618,
"step": 10160
},
{
"epoch": 0.5900954480837854,
"grad_norm": 0.10937913507223129,
"learning_rate": 0.00023596452620862585,
"loss": 2.8527,
"step": 10170
},
{
"epoch": 0.5906756795961589,
"grad_norm": 0.11980416625738144,
"learning_rate": 0.00023540224698878861,
"loss": 2.8553,
"step": 10180
},
{
"epoch": 0.5912559111085323,
"grad_norm": 0.11810686439275742,
"learning_rate": 0.00023484020562464507,
"loss": 2.8545,
"step": 10190
},
{
"epoch": 0.5918361426209058,
"grad_norm": 0.11651547253131866,
"learning_rate": 0.00023427840418569043,
"loss": 2.8522,
"step": 10200
},
{
"epoch": 0.5924163741332792,
"grad_norm": 0.11145967990159988,
"learning_rate": 0.00023371684474053633,
"loss": 2.8564,
"step": 10210
},
{
"epoch": 0.5929966056456526,
"grad_norm": 0.11742381006479263,
"learning_rate": 0.0002331555293569037,
"loss": 2.8529,
"step": 10220
},
{
"epoch": 0.593576837158026,
"grad_norm": 0.1287650465965271,
"learning_rate": 0.00023259446010161425,
"loss": 2.847,
"step": 10230
},
{
"epoch": 0.5941570686703995,
"grad_norm": 0.12560808658599854,
"learning_rate": 0.00023203363904058394,
"loss": 2.8424,
"step": 10240
},
{
"epoch": 0.5947373001827729,
"grad_norm": 0.13144509494304657,
"learning_rate": 0.0002314730682388147,
"loss": 2.8497,
"step": 10250
},
{
"epoch": 0.5953175316951463,
"grad_norm": 0.11483640223741531,
"learning_rate": 0.00023091274976038686,
"loss": 2.8525,
"step": 10260
},
{
"epoch": 0.5958977632075197,
"grad_norm": 0.12085619568824768,
"learning_rate": 0.0002303526856684519,
"loss": 2.846,
"step": 10270
},
{
"epoch": 0.5964779947198933,
"grad_norm": 0.13581375777721405,
"learning_rate": 0.00022979287802522423,
"loss": 2.8471,
"step": 10280
},
{
"epoch": 0.5970582262322667,
"grad_norm": 0.11522037535905838,
"learning_rate": 0.00022923332889197447,
"loss": 2.841,
"step": 10290
},
{
"epoch": 0.5976384577446401,
"grad_norm": 0.1114853248000145,
"learning_rate": 0.00022867404032902097,
"loss": 2.8507,
"step": 10300
},
{
"epoch": 0.5982186892570136,
"grad_norm": 0.1106984093785286,
"learning_rate": 0.00022811501439572288,
"loss": 2.8501,
"step": 10310
},
{
"epoch": 0.598798920769387,
"grad_norm": 0.12095363438129425,
"learning_rate": 0.0002275562531504724,
"loss": 2.8392,
"step": 10320
},
{
"epoch": 0.5993791522817604,
"grad_norm": 0.11527710407972336,
"learning_rate": 0.00022699775865068667,
"loss": 2.8498,
"step": 10330
},
{
"epoch": 0.5999593837941338,
"grad_norm": 0.11631615459918976,
"learning_rate": 0.00022643953295280127,
"loss": 2.8526,
"step": 10340
},
{
"epoch": 0.6005396153065073,
"grad_norm": 0.1107979491353035,
"learning_rate": 0.0002258815781122614,
"loss": 2.8488,
"step": 10350
},
{
"epoch": 0.6011198468188808,
"grad_norm": 0.1126491129398346,
"learning_rate": 0.00022532389618351532,
"loss": 2.8404,
"step": 10360
},
{
"epoch": 0.6017000783312542,
"grad_norm": 0.11740950495004654,
"learning_rate": 0.00022476648922000646,
"loss": 2.8499,
"step": 10370
},
{
"epoch": 0.6022803098436276,
"grad_norm": 0.11938904970884323,
"learning_rate": 0.00022420935927416547,
"loss": 2.8547,
"step": 10380
},
{
"epoch": 0.6028605413560011,
"grad_norm": 0.11484769731760025,
"learning_rate": 0.00022365250839740338,
"loss": 2.8392,
"step": 10390
},
{
"epoch": 0.6034407728683745,
"grad_norm": 0.12051428109407425,
"learning_rate": 0.0002230959386401032,
"loss": 2.8416,
"step": 10400
},
{
"epoch": 0.6040210043807479,
"grad_norm": 0.12364054471254349,
"learning_rate": 0.00022253965205161326,
"loss": 2.8343,
"step": 10410
},
{
"epoch": 0.6046012358931213,
"grad_norm": 0.1125280112028122,
"learning_rate": 0.00022198365068023892,
"loss": 2.8441,
"step": 10420
},
{
"epoch": 0.6051814674054948,
"grad_norm": 0.11715447157621384,
"learning_rate": 0.00022142793657323558,
"loss": 2.8391,
"step": 10430
},
{
"epoch": 0.6057616989178682,
"grad_norm": 0.11433437466621399,
"learning_rate": 0.00022087251177680086,
"loss": 2.8549,
"step": 10440
},
{
"epoch": 0.6063419304302416,
"grad_norm": 0.1222948208451271,
"learning_rate": 0.00022031737833606686,
"loss": 2.8406,
"step": 10450
},
{
"epoch": 0.6069221619426151,
"grad_norm": 0.11805406212806702,
"learning_rate": 0.0002197625382950932,
"loss": 2.8415,
"step": 10460
},
{
"epoch": 0.6075023934549886,
"grad_norm": 0.13002602756023407,
"learning_rate": 0.00021920799369685892,
"loss": 2.851,
"step": 10470
},
{
"epoch": 0.608082624967362,
"grad_norm": 0.11929357796907425,
"learning_rate": 0.00021865374658325544,
"loss": 2.8437,
"step": 10480
},
{
"epoch": 0.6086628564797354,
"grad_norm": 0.11752030998468399,
"learning_rate": 0.00021809979899507876,
"loss": 2.8532,
"step": 10490
},
{
"epoch": 0.6092430879921088,
"grad_norm": 0.12201694399118423,
"learning_rate": 0.00021754615297202168,
"loss": 2.8474,
"step": 10500
},
{
"epoch": 0.6098233195044823,
"grad_norm": 0.12019883096218109,
"learning_rate": 0.00021699281055266706,
"loss": 2.8422,
"step": 10510
},
{
"epoch": 0.6104035510168557,
"grad_norm": 0.12413442134857178,
"learning_rate": 0.00021643977377447954,
"loss": 2.8316,
"step": 10520
},
{
"epoch": 0.6109837825292291,
"grad_norm": 0.11983013898134232,
"learning_rate": 0.00021588704467379862,
"loss": 2.8448,
"step": 10530
},
{
"epoch": 0.6115640140416027,
"grad_norm": 0.13365738093852997,
"learning_rate": 0.0002153346252858306,
"loss": 2.837,
"step": 10540
},
{
"epoch": 0.6121442455539761,
"grad_norm": 0.13185539841651917,
"learning_rate": 0.00021478251764464148,
"loss": 2.8468,
"step": 10550
},
{
"epoch": 0.6127244770663495,
"grad_norm": 0.1213960349559784,
"learning_rate": 0.00021423072378314964,
"loss": 2.8444,
"step": 10560
},
{
"epoch": 0.6133047085787229,
"grad_norm": 0.12037312239408493,
"learning_rate": 0.00021367924573311773,
"loss": 2.8438,
"step": 10570
},
{
"epoch": 0.6138849400910964,
"grad_norm": 0.12542636692523956,
"learning_rate": 0.00021312808552514592,
"loss": 2.8424,
"step": 10580
},
{
"epoch": 0.6144651716034698,
"grad_norm": 0.14415085315704346,
"learning_rate": 0.00021257724518866352,
"loss": 2.8417,
"step": 10590
},
{
"epoch": 0.6150454031158432,
"grad_norm": 0.1150176003575325,
"learning_rate": 0.00021202672675192248,
"loss": 2.8435,
"step": 10600
},
{
"epoch": 0.6156256346282166,
"grad_norm": 0.11662835627794266,
"learning_rate": 0.00021147653224198951,
"loss": 2.8441,
"step": 10610
},
{
"epoch": 0.6162058661405901,
"grad_norm": 0.11693531274795532,
"learning_rate": 0.00021092666368473817,
"loss": 2.8391,
"step": 10620
},
{
"epoch": 0.6167860976529635,
"grad_norm": 0.11077579110860825,
"learning_rate": 0.0002103771231048423,
"loss": 2.8345,
"step": 10630
},
{
"epoch": 0.617366329165337,
"grad_norm": 0.11653861403465271,
"learning_rate": 0.00020982791252576773,
"loss": 2.8448,
"step": 10640
},
{
"epoch": 0.6179465606777104,
"grad_norm": 0.11749275773763657,
"learning_rate": 0.00020927903396976552,
"loss": 2.8558,
"step": 10650
},
{
"epoch": 0.6185267921900839,
"grad_norm": 0.11677636206150055,
"learning_rate": 0.00020873048945786382,
"loss": 2.8353,
"step": 10660
},
{
"epoch": 0.6191070237024573,
"grad_norm": 0.11745753139257431,
"learning_rate": 0.00020818228100986106,
"loss": 2.8494,
"step": 10670
},
{
"epoch": 0.6196872552148307,
"grad_norm": 0.11747489869594574,
"learning_rate": 0.00020763441064431827,
"loss": 2.8397,
"step": 10680
},
{
"epoch": 0.6202674867272041,
"grad_norm": 0.11356910318136215,
"learning_rate": 0.00020708688037855138,
"loss": 2.8472,
"step": 10690
},
{
"epoch": 0.6208477182395776,
"grad_norm": 0.11063719540834427,
"learning_rate": 0.00020653969222862435,
"loss": 2.8508,
"step": 10700
},
{
"epoch": 0.621427949751951,
"grad_norm": 0.10978058725595474,
"learning_rate": 0.00020599284820934112,
"loss": 2.8308,
"step": 10710
},
{
"epoch": 0.6220081812643244,
"grad_norm": 0.11860186606645584,
"learning_rate": 0.00020544635033423867,
"loss": 2.8263,
"step": 10720
},
{
"epoch": 0.6225884127766979,
"grad_norm": 0.1312050074338913,
"learning_rate": 0.00020490020061557953,
"loss": 2.8455,
"step": 10730
},
{
"epoch": 0.6231686442890714,
"grad_norm": 0.13181331753730774,
"learning_rate": 0.00020435440106434408,
"loss": 2.8489,
"step": 10740
},
{
"epoch": 0.6237488758014448,
"grad_norm": 0.1471181958913803,
"learning_rate": 0.00020380895369022357,
"loss": 2.8285,
"step": 10750
},
{
"epoch": 0.6243291073138182,
"grad_norm": 0.12075991183519363,
"learning_rate": 0.00020326386050161215,
"loss": 2.8402,
"step": 10760
},
{
"epoch": 0.6249093388261916,
"grad_norm": 0.1117480993270874,
"learning_rate": 0.0002027191235056003,
"loss": 2.8426,
"step": 10770
},
{
"epoch": 0.6254895703385651,
"grad_norm": 0.11622477322816849,
"learning_rate": 0.0002021747447079665,
"loss": 2.8423,
"step": 10780
},
{
"epoch": 0.6260698018509385,
"grad_norm": 0.11475232988595963,
"learning_rate": 0.00020163072611317055,
"loss": 2.835,
"step": 10790
},
{
"epoch": 0.6266500333633119,
"grad_norm": 0.12252891808748245,
"learning_rate": 0.00020108706972434606,
"loss": 2.8381,
"step": 10800
},
{
"epoch": 0.6272302648756855,
"grad_norm": 0.11319098621606827,
"learning_rate": 0.00020054377754329258,
"loss": 2.8326,
"step": 10810
},
{
"epoch": 0.6278104963880589,
"grad_norm": 0.11103735119104385,
"learning_rate": 0.00020000085157046902,
"loss": 2.8292,
"step": 10820
},
{
"epoch": 0.6283907279004323,
"grad_norm": 0.12254971265792847,
"learning_rate": 0.00019945829380498556,
"loss": 2.8379,
"step": 10830
},
{
"epoch": 0.6289709594128057,
"grad_norm": 0.1253294050693512,
"learning_rate": 0.00019891610624459674,
"loss": 2.8404,
"step": 10840
},
{
"epoch": 0.6295511909251792,
"grad_norm": 0.12701797485351562,
"learning_rate": 0.0001983742908856942,
"loss": 2.8331,
"step": 10850
},
{
"epoch": 0.6301314224375526,
"grad_norm": 0.1351822167634964,
"learning_rate": 0.00019783284972329845,
"loss": 2.831,
"step": 10860
},
{
"epoch": 0.630711653949926,
"grad_norm": 0.11504077911376953,
"learning_rate": 0.00019729178475105292,
"loss": 2.8397,
"step": 10870
},
{
"epoch": 0.6312918854622994,
"grad_norm": 0.11900710314512253,
"learning_rate": 0.00019675109796121523,
"loss": 2.8328,
"step": 10880
},
{
"epoch": 0.6318721169746729,
"grad_norm": 0.11879398673772812,
"learning_rate": 0.00019621079134465096,
"loss": 2.8275,
"step": 10890
},
{
"epoch": 0.6324523484870463,
"grad_norm": 0.11795203387737274,
"learning_rate": 0.00019567086689082562,
"loss": 2.828,
"step": 10900
},
{
"epoch": 0.6330325799994198,
"grad_norm": 0.1163572296500206,
"learning_rate": 0.00019513132658779758,
"loss": 2.8387,
"step": 10910
},
{
"epoch": 0.6336128115117932,
"grad_norm": 0.11812139302492142,
"learning_rate": 0.00019459217242221092,
"loss": 2.8336,
"step": 10920
},
{
"epoch": 0.6341930430241667,
"grad_norm": 0.11195320636034012,
"learning_rate": 0.00019405340637928755,
"loss": 2.8427,
"step": 10930
},
{
"epoch": 0.6347732745365401,
"grad_norm": 0.11674754321575165,
"learning_rate": 0.0001935150304428206,
"loss": 2.8279,
"step": 10940
},
{
"epoch": 0.6353535060489135,
"grad_norm": 0.11432943493127823,
"learning_rate": 0.00019297704659516655,
"loss": 2.8267,
"step": 10950
},
{
"epoch": 0.6359337375612869,
"grad_norm": 0.12507887184619904,
"learning_rate": 0.0001924394568172384,
"loss": 2.8309,
"step": 10960
},
{
"epoch": 0.6365139690736604,
"grad_norm": 0.12057894468307495,
"learning_rate": 0.0001919022630884981,
"loss": 2.8422,
"step": 10970
},
{
"epoch": 0.6370942005860338,
"grad_norm": 0.11377721279859543,
"learning_rate": 0.000191365467386949,
"loss": 2.8381,
"step": 10980
},
{
"epoch": 0.6376744320984072,
"grad_norm": 0.11800755560398102,
"learning_rate": 0.00019082907168912932,
"loss": 2.8331,
"step": 10990
},
{
"epoch": 0.6382546636107806,
"grad_norm": 0.12301038950681686,
"learning_rate": 0.00019029307797010402,
"loss": 2.831,
"step": 11000
},
{
"epoch": 0.6382546636107806,
"eval_loss": 2.796895742416382,
"eval_runtime": 3.2627,
"eval_samples_per_second": 1327.123,
"eval_steps_per_second": 2.758,
"step": 11000
},
{
"epoch": 0.6388348951231542,
"grad_norm": 0.1179603561758995,
"learning_rate": 0.00018975748820345838,
"loss": 2.8436,
"step": 11010
},
{
"epoch": 0.6394151266355276,
"grad_norm": 0.13155020773410797,
"learning_rate": 0.0001892223043612898,
"loss": 2.8317,
"step": 11020
},
{
"epoch": 0.639995358147901,
"grad_norm": 0.11468763649463654,
"learning_rate": 0.00018868752841420122,
"loss": 2.8284,
"step": 11030
},
{
"epoch": 0.6405755896602745,
"grad_norm": 0.10960279405117035,
"learning_rate": 0.00018815316233129393,
"loss": 2.8286,
"step": 11040
},
{
"epoch": 0.6411558211726479,
"grad_norm": 0.1298363208770752,
"learning_rate": 0.00018761920808015966,
"loss": 2.8326,
"step": 11050
},
{
"epoch": 0.6417360526850213,
"grad_norm": 0.11535240709781647,
"learning_rate": 0.00018708566762687403,
"loss": 2.8281,
"step": 11060
},
{
"epoch": 0.6423162841973947,
"grad_norm": 0.12528617680072784,
"learning_rate": 0.00018655254293598866,
"loss": 2.8179,
"step": 11070
},
{
"epoch": 0.6428965157097682,
"grad_norm": 0.11952237784862518,
"learning_rate": 0.00018601983597052468,
"loss": 2.8294,
"step": 11080
},
{
"epoch": 0.6434767472221417,
"grad_norm": 0.12121649086475372,
"learning_rate": 0.00018548754869196496,
"loss": 2.8336,
"step": 11090
},
{
"epoch": 0.6440569787345151,
"grad_norm": 0.12465447187423706,
"learning_rate": 0.00018495568306024687,
"loss": 2.8314,
"step": 11100
},
{
"epoch": 0.6446372102468885,
"grad_norm": 0.10858411341905594,
"learning_rate": 0.00018442424103375563,
"loss": 2.8191,
"step": 11110
},
{
"epoch": 0.645217441759262,
"grad_norm": 0.1240803673863411,
"learning_rate": 0.00018389322456931616,
"loss": 2.8334,
"step": 11120
},
{
"epoch": 0.6457976732716354,
"grad_norm": 0.11604313552379608,
"learning_rate": 0.00018336263562218695,
"loss": 2.8241,
"step": 11130
},
{
"epoch": 0.6463779047840088,
"grad_norm": 0.10764401406049728,
"learning_rate": 0.00018283247614605185,
"loss": 2.8343,
"step": 11140
},
{
"epoch": 0.6469581362963822,
"grad_norm": 0.11341771483421326,
"learning_rate": 0.00018230274809301377,
"loss": 2.8323,
"step": 11150
},
{
"epoch": 0.6475383678087557,
"grad_norm": 0.11618595570325851,
"learning_rate": 0.00018177345341358699,
"loss": 2.8295,
"step": 11160
},
{
"epoch": 0.6481185993211291,
"grad_norm": 0.11492364853620529,
"learning_rate": 0.00018124459405668967,
"loss": 2.8253,
"step": 11170
},
{
"epoch": 0.6486988308335025,
"grad_norm": 0.12541726231575012,
"learning_rate": 0.0001807161719696377,
"loss": 2.8305,
"step": 11180
},
{
"epoch": 0.649279062345876,
"grad_norm": 0.1240224838256836,
"learning_rate": 0.0001801881890981362,
"loss": 2.832,
"step": 11190
},
{
"epoch": 0.6498592938582495,
"grad_norm": 0.12260005623102188,
"learning_rate": 0.00017966064738627363,
"loss": 2.8274,
"step": 11200
},
{
"epoch": 0.6504395253706229,
"grad_norm": 0.11284399777650833,
"learning_rate": 0.00017913354877651386,
"loss": 2.8291,
"step": 11210
},
{
"epoch": 0.6510197568829963,
"grad_norm": 0.11993937194347382,
"learning_rate": 0.00017860689520968906,
"loss": 2.8357,
"step": 11220
},
{
"epoch": 0.6515999883953697,
"grad_norm": 0.11259515583515167,
"learning_rate": 0.00017808068862499302,
"loss": 2.8134,
"step": 11230
},
{
"epoch": 0.6521802199077432,
"grad_norm": 0.1146656796336174,
"learning_rate": 0.0001775549309599733,
"loss": 2.8275,
"step": 11240
},
{
"epoch": 0.6527604514201166,
"grad_norm": 0.11118417978286743,
"learning_rate": 0.0001770296241505248,
"loss": 2.8276,
"step": 11250
},
{
"epoch": 0.65334068293249,
"grad_norm": 0.1155654564499855,
"learning_rate": 0.00017650477013088218,
"loss": 2.8333,
"step": 11260
},
{
"epoch": 0.6539209144448636,
"grad_norm": 0.12370238453149796,
"learning_rate": 0.000175980370833613,
"loss": 2.8209,
"step": 11270
},
{
"epoch": 0.654501145957237,
"grad_norm": 0.11332956701517105,
"learning_rate": 0.00017545642818961045,
"loss": 2.824,
"step": 11280
},
{
"epoch": 0.6550813774696104,
"grad_norm": 0.11696597188711166,
"learning_rate": 0.00017493294412808603,
"loss": 2.8285,
"step": 11290
},
{
"epoch": 0.6556616089819838,
"grad_norm": 0.11556991934776306,
"learning_rate": 0.00017440992057656302,
"loss": 2.833,
"step": 11300
},
{
"epoch": 0.6562418404943573,
"grad_norm": 0.11072834581136703,
"learning_rate": 0.000173887359460869,
"loss": 2.8202,
"step": 11310
},
{
"epoch": 0.6568220720067307,
"grad_norm": 0.12139474600553513,
"learning_rate": 0.0001733652627051285,
"loss": 2.8323,
"step": 11320
},
{
"epoch": 0.6574023035191041,
"grad_norm": 0.11882605403661728,
"learning_rate": 0.0001728436322317567,
"loss": 2.8325,
"step": 11330
},
{
"epoch": 0.6579825350314775,
"grad_norm": 0.10851707309484482,
"learning_rate": 0.00017232246996145163,
"loss": 2.8304,
"step": 11340
},
{
"epoch": 0.658562766543851,
"grad_norm": 0.11566723883152008,
"learning_rate": 0.0001718017778131873,
"loss": 2.8359,
"step": 11350
},
{
"epoch": 0.6591429980562244,
"grad_norm": 0.1224483922123909,
"learning_rate": 0.00017128155770420673,
"loss": 2.8246,
"step": 11360
},
{
"epoch": 0.6597232295685979,
"grad_norm": 0.11472085118293762,
"learning_rate": 0.00017076181155001492,
"loss": 2.8274,
"step": 11370
},
{
"epoch": 0.6603034610809713,
"grad_norm": 0.11463634669780731,
"learning_rate": 0.00017024254126437149,
"loss": 2.8208,
"step": 11380
},
{
"epoch": 0.6608836925933448,
"grad_norm": 0.11640073359012604,
"learning_rate": 0.00016972374875928427,
"loss": 2.8351,
"step": 11390
},
{
"epoch": 0.6614639241057182,
"grad_norm": 0.12146312743425369,
"learning_rate": 0.00016920543594500147,
"loss": 2.8249,
"step": 11400
},
{
"epoch": 0.6620441556180916,
"grad_norm": 0.11683548241853714,
"learning_rate": 0.00016868760473000524,
"loss": 2.8281,
"step": 11410
},
{
"epoch": 0.662624387130465,
"grad_norm": 0.11443763226270676,
"learning_rate": 0.0001681702570210043,
"loss": 2.8239,
"step": 11420
},
{
"epoch": 0.6632046186428385,
"grad_norm": 0.1136617586016655,
"learning_rate": 0.00016765339472292714,
"loss": 2.827,
"step": 11430
},
{
"epoch": 0.6637848501552119,
"grad_norm": 0.11093004792928696,
"learning_rate": 0.00016713701973891472,
"loss": 2.8359,
"step": 11440
},
{
"epoch": 0.6643650816675853,
"grad_norm": 0.12110643088817596,
"learning_rate": 0.00016662113397031413,
"loss": 2.8164,
"step": 11450
},
{
"epoch": 0.6649453131799588,
"grad_norm": 0.12236957252025604,
"learning_rate": 0.00016610573931667065,
"loss": 2.8295,
"step": 11460
},
{
"epoch": 0.6655255446923323,
"grad_norm": 0.11643628776073456,
"learning_rate": 0.0001655908376757214,
"loss": 2.8199,
"step": 11470
},
{
"epoch": 0.6661057762047057,
"grad_norm": 0.12198419123888016,
"learning_rate": 0.00016507643094338818,
"loss": 2.8234,
"step": 11480
},
{
"epoch": 0.6666860077170791,
"grad_norm": 0.11697736382484436,
"learning_rate": 0.00016456252101377042,
"loss": 2.8309,
"step": 11490
},
{
"epoch": 0.6672662392294526,
"grad_norm": 0.11377154290676117,
"learning_rate": 0.00016404910977913824,
"loss": 2.8174,
"step": 11500
},
{
"epoch": 0.667846470741826,
"grad_norm": 0.1169874370098114,
"learning_rate": 0.0001635361991299258,
"loss": 2.8174,
"step": 11510
},
{
"epoch": 0.6684267022541994,
"grad_norm": 0.11022408306598663,
"learning_rate": 0.00016302379095472374,
"loss": 2.8251,
"step": 11520
},
{
"epoch": 0.6690069337665728,
"grad_norm": 0.11143022775650024,
"learning_rate": 0.00016251188714027265,
"loss": 2.832,
"step": 11530
},
{
"epoch": 0.6695871652789464,
"grad_norm": 0.11829391121864319,
"learning_rate": 0.00016200048957145597,
"loss": 2.8181,
"step": 11540
},
{
"epoch": 0.6701673967913198,
"grad_norm": 0.11668332666158676,
"learning_rate": 0.00016148960013129303,
"loss": 2.8163,
"step": 11550
},
{
"epoch": 0.6707476283036932,
"grad_norm": 0.11444656550884247,
"learning_rate": 0.0001609792207009325,
"loss": 2.8171,
"step": 11560
},
{
"epoch": 0.6713278598160666,
"grad_norm": 0.11538255959749222,
"learning_rate": 0.00016046935315964476,
"loss": 2.8192,
"step": 11570
},
{
"epoch": 0.6719080913284401,
"grad_norm": 0.13890443742275238,
"learning_rate": 0.0001599599993848155,
"loss": 2.814,
"step": 11580
},
{
"epoch": 0.6724883228408135,
"grad_norm": 0.10878733545541763,
"learning_rate": 0.00015945116125193876,
"loss": 2.8161,
"step": 11590
},
{
"epoch": 0.6730685543531869,
"grad_norm": 0.11337769776582718,
"learning_rate": 0.00015894284063460966,
"loss": 2.8161,
"step": 11600
},
{
"epoch": 0.6736487858655603,
"grad_norm": 0.1095629557967186,
"learning_rate": 0.00015843503940451834,
"loss": 2.8087,
"step": 11610
},
{
"epoch": 0.6742290173779338,
"grad_norm": 0.1378069370985031,
"learning_rate": 0.00015792775943144165,
"loss": 2.8151,
"step": 11620
},
{
"epoch": 0.6748092488903072,
"grad_norm": 0.1202809140086174,
"learning_rate": 0.00015742100258323794,
"loss": 2.831,
"step": 11630
},
{
"epoch": 0.6753894804026807,
"grad_norm": 0.12298610061407089,
"learning_rate": 0.00015691477072583894,
"loss": 2.8247,
"step": 11640
},
{
"epoch": 0.6759697119150541,
"grad_norm": 0.11947082728147507,
"learning_rate": 0.00015640906572324319,
"loss": 2.8238,
"step": 11650
},
{
"epoch": 0.6765499434274276,
"grad_norm": 0.11039472371339798,
"learning_rate": 0.00015590388943750988,
"loss": 2.8267,
"step": 11660
},
{
"epoch": 0.677130174939801,
"grad_norm": 0.11807908117771149,
"learning_rate": 0.0001553992437287505,
"loss": 2.8222,
"step": 11670
},
{
"epoch": 0.6777104064521744,
"grad_norm": 0.11934113502502441,
"learning_rate": 0.00015489513045512386,
"loss": 2.8193,
"step": 11680
},
{
"epoch": 0.6782906379645478,
"grad_norm": 0.11163033545017242,
"learning_rate": 0.00015439155147282764,
"loss": 2.8137,
"step": 11690
},
{
"epoch": 0.6788708694769213,
"grad_norm": 0.11381068080663681,
"learning_rate": 0.0001538885086360923,
"loss": 2.8202,
"step": 11700
},
{
"epoch": 0.6794511009892947,
"grad_norm": 0.11011006683111191,
"learning_rate": 0.0001533860037971747,
"loss": 2.8213,
"step": 11710
},
{
"epoch": 0.6800313325016681,
"grad_norm": 0.11611464619636536,
"learning_rate": 0.0001528840388063497,
"loss": 2.8216,
"step": 11720
},
{
"epoch": 0.6806115640140415,
"grad_norm": 0.10734301805496216,
"learning_rate": 0.0001523826155119055,
"loss": 2.8188,
"step": 11730
},
{
"epoch": 0.6811917955264151,
"grad_norm": 0.12189003825187683,
"learning_rate": 0.00015188173576013482,
"loss": 2.8206,
"step": 11740
},
{
"epoch": 0.6817720270387885,
"grad_norm": 0.11146776378154755,
"learning_rate": 0.0001513814013953296,
"loss": 2.8176,
"step": 11750
},
{
"epoch": 0.6823522585511619,
"grad_norm": 0.11531021445989609,
"learning_rate": 0.0001508816142597733,
"loss": 2.8192,
"step": 11760
},
{
"epoch": 0.6829324900635354,
"grad_norm": 0.11541693657636642,
"learning_rate": 0.00015038237619373443,
"loss": 2.8219,
"step": 11770
},
{
"epoch": 0.6835127215759088,
"grad_norm": 0.11345332115888596,
"learning_rate": 0.0001498836890354602,
"loss": 2.8024,
"step": 11780
},
{
"epoch": 0.6840929530882822,
"grad_norm": 0.10796009749174118,
"learning_rate": 0.00014938555462116842,
"loss": 2.8119,
"step": 11790
},
{
"epoch": 0.6846731846006556,
"grad_norm": 0.11463455855846405,
"learning_rate": 0.00014888797478504261,
"loss": 2.8119,
"step": 11800
},
{
"epoch": 0.6852534161130291,
"grad_norm": 0.11192594468593597,
"learning_rate": 0.00014839095135922372,
"loss": 2.8252,
"step": 11810
},
{
"epoch": 0.6858336476254026,
"grad_norm": 0.11805829405784607,
"learning_rate": 0.000147894486173804,
"loss": 2.8095,
"step": 11820
},
{
"epoch": 0.686413879137776,
"grad_norm": 0.11721805483102798,
"learning_rate": 0.00014739858105682053,
"loss": 2.8123,
"step": 11830
},
{
"epoch": 0.6869941106501494,
"grad_norm": 0.11619780957698822,
"learning_rate": 0.0001469032378342475,
"loss": 2.8177,
"step": 11840
},
{
"epoch": 0.6875743421625229,
"grad_norm": 0.10933215916156769,
"learning_rate": 0.00014640845832999087,
"loss": 2.8078,
"step": 11850
},
{
"epoch": 0.6881545736748963,
"grad_norm": 0.11362309753894806,
"learning_rate": 0.0001459142443658805,
"loss": 2.8103,
"step": 11860
},
{
"epoch": 0.6887348051872697,
"grad_norm": 0.10805781930685043,
"learning_rate": 0.00014542059776166382,
"loss": 2.8073,
"step": 11870
},
{
"epoch": 0.6893150366996431,
"grad_norm": 0.124758280813694,
"learning_rate": 0.00014492752033499977,
"loss": 2.8133,
"step": 11880
},
{
"epoch": 0.6898952682120166,
"grad_norm": 0.11096182465553284,
"learning_rate": 0.00014443501390145057,
"loss": 2.8061,
"step": 11890
},
{
"epoch": 0.69047549972439,
"grad_norm": 0.1132817193865776,
"learning_rate": 0.00014394308027447685,
"loss": 2.8209,
"step": 11900
},
{
"epoch": 0.6910557312367634,
"grad_norm": 0.10996360331773758,
"learning_rate": 0.00014345172126542966,
"loss": 2.8161,
"step": 11910
},
{
"epoch": 0.6916359627491369,
"grad_norm": 0.11297384649515152,
"learning_rate": 0.0001429609386835442,
"loss": 2.8116,
"step": 11920
},
{
"epoch": 0.6922161942615104,
"grad_norm": 0.12191120535135269,
"learning_rate": 0.00014247073433593373,
"loss": 2.8156,
"step": 11930
},
{
"epoch": 0.6927964257738838,
"grad_norm": 0.11631318181753159,
"learning_rate": 0.00014198111002758154,
"loss": 2.8225,
"step": 11940
},
{
"epoch": 0.6933766572862572,
"grad_norm": 0.14487071335315704,
"learning_rate": 0.00014149206756133595,
"loss": 2.8153,
"step": 11950
},
{
"epoch": 0.6939568887986306,
"grad_norm": 0.11780226230621338,
"learning_rate": 0.00014100360873790248,
"loss": 2.8163,
"step": 11960
},
{
"epoch": 0.6945371203110041,
"grad_norm": 0.11396613717079163,
"learning_rate": 0.00014051573535583766,
"loss": 2.8101,
"step": 11970
},
{
"epoch": 0.6951173518233775,
"grad_norm": 0.11514125019311905,
"learning_rate": 0.00014002844921154233,
"loss": 2.819,
"step": 11980
},
{
"epoch": 0.6956975833357509,
"grad_norm": 0.11687569320201874,
"learning_rate": 0.00013954175209925513,
"loss": 2.8106,
"step": 11990
},
{
"epoch": 0.6962778148481245,
"grad_norm": 0.11218845099210739,
"learning_rate": 0.00013905564581104607,
"loss": 2.8156,
"step": 12000
},
{
"epoch": 0.6962778148481245,
"eval_loss": 2.778130531311035,
"eval_runtime": 3.2555,
"eval_samples_per_second": 1330.053,
"eval_steps_per_second": 2.765,
"step": 12000
},
{
"epoch": 0.6968580463604979,
"grad_norm": 0.11513704061508179,
"learning_rate": 0.000138570132136809,
"loss": 2.8185,
"step": 12010
},
{
"epoch": 0.6974382778728713,
"grad_norm": 0.12384956330060959,
"learning_rate": 0.00013808521286425644,
"loss": 2.8159,
"step": 12020
},
{
"epoch": 0.6980185093852447,
"grad_norm": 0.11136494576931,
"learning_rate": 0.0001376008897789119,
"loss": 2.8196,
"step": 12030
},
{
"epoch": 0.6985987408976182,
"grad_norm": 0.11704517900943756,
"learning_rate": 0.00013711716466410353,
"loss": 2.8118,
"step": 12040
},
{
"epoch": 0.6991789724099916,
"grad_norm": 0.11521551758050919,
"learning_rate": 0.00013663403930095827,
"loss": 2.8131,
"step": 12050
},
{
"epoch": 0.699759203922365,
"grad_norm": 0.10568945109844208,
"learning_rate": 0.00013615151546839382,
"loss": 2.8098,
"step": 12060
},
{
"epoch": 0.7003394354347384,
"grad_norm": 0.1213884949684143,
"learning_rate": 0.00013566959494311386,
"loss": 2.8091,
"step": 12070
},
{
"epoch": 0.7009196669471119,
"grad_norm": 0.11004059761762619,
"learning_rate": 0.00013518827949960015,
"loss": 2.8238,
"step": 12080
},
{
"epoch": 0.7014998984594853,
"grad_norm": 0.11095508933067322,
"learning_rate": 0.00013470757091010649,
"loss": 2.8116,
"step": 12090
},
{
"epoch": 0.7020801299718588,
"grad_norm": 0.11275944113731384,
"learning_rate": 0.00013422747094465234,
"loss": 2.8109,
"step": 12100
},
{
"epoch": 0.7026603614842322,
"grad_norm": 0.11312493681907654,
"learning_rate": 0.00013374798137101595,
"loss": 2.814,
"step": 12110
},
{
"epoch": 0.7032405929966057,
"grad_norm": 0.10738647729158401,
"learning_rate": 0.00013326910395472833,
"loss": 2.8111,
"step": 12120
},
{
"epoch": 0.7038208245089791,
"grad_norm": 0.11198966205120087,
"learning_rate": 0.00013279084045906623,
"loss": 2.806,
"step": 12130
},
{
"epoch": 0.7044010560213525,
"grad_norm": 0.11718153953552246,
"learning_rate": 0.00013231319264504594,
"loss": 2.8186,
"step": 12140
},
{
"epoch": 0.7049812875337259,
"grad_norm": 0.11054380983114243,
"learning_rate": 0.00013183616227141674,
"loss": 2.8144,
"step": 12150
},
{
"epoch": 0.7055615190460994,
"grad_norm": 0.11579257249832153,
"learning_rate": 0.0001313597510946543,
"loss": 2.8101,
"step": 12160
},
{
"epoch": 0.7061417505584728,
"grad_norm": 0.10710903257131577,
"learning_rate": 0.00013088396086895476,
"loss": 2.8104,
"step": 12170
},
{
"epoch": 0.7067219820708462,
"grad_norm": 0.11220473051071167,
"learning_rate": 0.00013040879334622738,
"loss": 2.8049,
"step": 12180
},
{
"epoch": 0.7073022135832197,
"grad_norm": 0.10872667282819748,
"learning_rate": 0.00012993425027608884,
"loss": 2.8175,
"step": 12190
},
{
"epoch": 0.7078824450955932,
"grad_norm": 0.10861840099096298,
"learning_rate": 0.00012946033340585641,
"loss": 2.8072,
"step": 12200
},
{
"epoch": 0.7084626766079666,
"grad_norm": 0.11558268964290619,
"learning_rate": 0.00012898704448054162,
"loss": 2.8034,
"step": 12210
},
{
"epoch": 0.70904290812034,
"grad_norm": 0.11709378659725189,
"learning_rate": 0.00012851438524284382,
"loss": 2.8047,
"step": 12220
},
{
"epoch": 0.7096231396327135,
"grad_norm": 0.12139759957790375,
"learning_rate": 0.00012804235743314401,
"loss": 2.8056,
"step": 12230
},
{
"epoch": 0.7102033711450869,
"grad_norm": 0.11130308359861374,
"learning_rate": 0.00012757096278949792,
"loss": 2.8138,
"step": 12240
},
{
"epoch": 0.7107836026574603,
"grad_norm": 0.1112653836607933,
"learning_rate": 0.00012710020304763003,
"loss": 2.8004,
"step": 12250
},
{
"epoch": 0.7113638341698337,
"grad_norm": 0.11182957142591476,
"learning_rate": 0.00012663007994092703,
"loss": 2.8064,
"step": 12260
},
{
"epoch": 0.7119440656822072,
"grad_norm": 0.13386094570159912,
"learning_rate": 0.00012616059520043145,
"loss": 2.8148,
"step": 12270
},
{
"epoch": 0.7125242971945807,
"grad_norm": 0.11641652137041092,
"learning_rate": 0.0001256917505548352,
"loss": 2.8102,
"step": 12280
},
{
"epoch": 0.7131045287069541,
"grad_norm": 0.10916447639465332,
"learning_rate": 0.00012522354773047352,
"loss": 2.8148,
"step": 12290
},
{
"epoch": 0.7136847602193275,
"grad_norm": 0.10887318104505539,
"learning_rate": 0.0001247559884513182,
"loss": 2.8047,
"step": 12300
},
{
"epoch": 0.714264991731701,
"grad_norm": 0.11701834946870804,
"learning_rate": 0.0001242890744389715,
"loss": 2.8144,
"step": 12310
},
{
"epoch": 0.7148452232440744,
"grad_norm": 0.10473381727933884,
"learning_rate": 0.00012382280741265968,
"loss": 2.8057,
"step": 12320
},
{
"epoch": 0.7154254547564478,
"grad_norm": 0.10586260259151459,
"learning_rate": 0.00012335718908922685,
"loss": 2.8032,
"step": 12330
},
{
"epoch": 0.7160056862688212,
"grad_norm": 0.10688824206590652,
"learning_rate": 0.00012289222118312822,
"loss": 2.8054,
"step": 12340
},
{
"epoch": 0.7165859177811947,
"grad_norm": 0.11233460903167725,
"learning_rate": 0.0001224279054064247,
"loss": 2.801,
"step": 12350
},
{
"epoch": 0.7171661492935681,
"grad_norm": 0.10600557923316956,
"learning_rate": 0.00012196424346877541,
"loss": 2.8035,
"step": 12360
},
{
"epoch": 0.7177463808059416,
"grad_norm": 0.11300963163375854,
"learning_rate": 0.00012150123707743219,
"loss": 2.8098,
"step": 12370
},
{
"epoch": 0.718326612318315,
"grad_norm": 0.11773265898227692,
"learning_rate": 0.00012103888793723312,
"loss": 2.8103,
"step": 12380
},
{
"epoch": 0.7189068438306885,
"grad_norm": 0.11092250049114227,
"learning_rate": 0.00012057719775059602,
"loss": 2.8028,
"step": 12390
},
{
"epoch": 0.7194870753430619,
"grad_norm": 0.10554751008749008,
"learning_rate": 0.00012011616821751271,
"loss": 2.8044,
"step": 12400
},
{
"epoch": 0.7200673068554353,
"grad_norm": 0.1148175522685051,
"learning_rate": 0.0001196558010355422,
"loss": 2.8099,
"step": 12410
},
{
"epoch": 0.7206475383678087,
"grad_norm": 0.10981535166501999,
"learning_rate": 0.00011919609789980458,
"loss": 2.7991,
"step": 12420
},
{
"epoch": 0.7212277698801822,
"grad_norm": 0.11188452690839767,
"learning_rate": 0.00011873706050297508,
"loss": 2.8067,
"step": 12430
},
{
"epoch": 0.7218080013925556,
"grad_norm": 0.11328940838575363,
"learning_rate": 0.00011827869053527727,
"loss": 2.8049,
"step": 12440
},
{
"epoch": 0.722388232904929,
"grad_norm": 0.11542364954948425,
"learning_rate": 0.00011782098968447774,
"loss": 2.7988,
"step": 12450
},
{
"epoch": 0.7229684644173026,
"grad_norm": 0.11087549477815628,
"learning_rate": 0.00011736395963587857,
"loss": 2.8102,
"step": 12460
},
{
"epoch": 0.723548695929676,
"grad_norm": 0.11298040300607681,
"learning_rate": 0.00011690760207231256,
"loss": 2.8063,
"step": 12470
},
{
"epoch": 0.7241289274420494,
"grad_norm": 0.10775293409824371,
"learning_rate": 0.00011645191867413596,
"loss": 2.8065,
"step": 12480
},
{
"epoch": 0.7247091589544228,
"grad_norm": 0.11240221560001373,
"learning_rate": 0.00011599691111922272,
"loss": 2.8062,
"step": 12490
},
{
"epoch": 0.7252893904667963,
"grad_norm": 0.1069854348897934,
"learning_rate": 0.00011554258108295859,
"loss": 2.79,
"step": 12500
},
{
"epoch": 0.7258696219791697,
"grad_norm": 0.11566832661628723,
"learning_rate": 0.00011508893023823393,
"loss": 2.7977,
"step": 12510
},
{
"epoch": 0.7264498534915431,
"grad_norm": 0.11771980673074722,
"learning_rate": 0.00011463596025543905,
"loss": 2.803,
"step": 12520
},
{
"epoch": 0.7270300850039165,
"grad_norm": 0.11435101926326752,
"learning_rate": 0.0001141836728024567,
"loss": 2.7985,
"step": 12530
},
{
"epoch": 0.72761031651629,
"grad_norm": 0.10902056097984314,
"learning_rate": 0.0001137320695446566,
"loss": 2.8096,
"step": 12540
},
{
"epoch": 0.7281905480286635,
"grad_norm": 0.10939980298280716,
"learning_rate": 0.0001132811521448896,
"loss": 2.8121,
"step": 12550
},
{
"epoch": 0.7287707795410369,
"grad_norm": 0.10922636091709137,
"learning_rate": 0.00011283092226348031,
"loss": 2.8093,
"step": 12560
},
{
"epoch": 0.7293510110534103,
"grad_norm": 0.10520195960998535,
"learning_rate": 0.00011238138155822275,
"loss": 2.8031,
"step": 12570
},
{
"epoch": 0.7299312425657838,
"grad_norm": 0.10655706375837326,
"learning_rate": 0.00011193253168437253,
"loss": 2.8083,
"step": 12580
},
{
"epoch": 0.7305114740781572,
"grad_norm": 0.11627507954835892,
"learning_rate": 0.00011148437429464215,
"loss": 2.7994,
"step": 12590
},
{
"epoch": 0.7310917055905306,
"grad_norm": 0.1093965470790863,
"learning_rate": 0.00011103691103919401,
"loss": 2.8054,
"step": 12600
},
{
"epoch": 0.731671937102904,
"grad_norm": 0.113887257874012,
"learning_rate": 0.00011059014356563458,
"loss": 2.7963,
"step": 12610
},
{
"epoch": 0.7322521686152775,
"grad_norm": 0.10929399728775024,
"learning_rate": 0.00011014407351900879,
"loss": 2.8033,
"step": 12620
},
{
"epoch": 0.7328324001276509,
"grad_norm": 0.11176785826683044,
"learning_rate": 0.00010969870254179285,
"loss": 2.8061,
"step": 12630
},
{
"epoch": 0.7334126316400243,
"grad_norm": 0.10631275177001953,
"learning_rate": 0.00010925403227388973,
"loss": 2.8107,
"step": 12640
},
{
"epoch": 0.7339928631523978,
"grad_norm": 0.11108485609292984,
"learning_rate": 0.00010881006435262179,
"loss": 2.8059,
"step": 12650
},
{
"epoch": 0.7345730946647713,
"grad_norm": 0.10749488323926926,
"learning_rate": 0.00010836680041272536,
"loss": 2.8004,
"step": 12660
},
{
"epoch": 0.7351533261771447,
"grad_norm": 0.10994744300842285,
"learning_rate": 0.00010792424208634495,
"loss": 2.8093,
"step": 12670
},
{
"epoch": 0.7357335576895181,
"grad_norm": 0.10910103470087051,
"learning_rate": 0.00010748239100302627,
"loss": 2.7928,
"step": 12680
},
{
"epoch": 0.7363137892018915,
"grad_norm": 0.10835743695497513,
"learning_rate": 0.0001070412487897117,
"loss": 2.8077,
"step": 12690
},
{
"epoch": 0.736894020714265,
"grad_norm": 0.10580655187368393,
"learning_rate": 0.00010660081707073288,
"loss": 2.7991,
"step": 12700
},
{
"epoch": 0.7374742522266384,
"grad_norm": 0.10928157716989517,
"learning_rate": 0.00010616109746780546,
"loss": 2.7905,
"step": 12710
},
{
"epoch": 0.7380544837390118,
"grad_norm": 0.10654684156179428,
"learning_rate": 0.00010572209160002339,
"loss": 2.8021,
"step": 12720
},
{
"epoch": 0.7386347152513854,
"grad_norm": 0.10834140330553055,
"learning_rate": 0.00010528380108385186,
"loss": 2.805,
"step": 12730
},
{
"epoch": 0.7392149467637588,
"grad_norm": 0.1152142882347107,
"learning_rate": 0.00010484622753312279,
"loss": 2.7916,
"step": 12740
},
{
"epoch": 0.7397951782761322,
"grad_norm": 0.10981319844722748,
"learning_rate": 0.0001044093725590277,
"loss": 2.8029,
"step": 12750
},
{
"epoch": 0.7403754097885056,
"grad_norm": 0.1065368577837944,
"learning_rate": 0.00010397323777011229,
"loss": 2.8048,
"step": 12760
},
{
"epoch": 0.7409556413008791,
"grad_norm": 0.10563939809799194,
"learning_rate": 0.00010353782477227083,
"loss": 2.8058,
"step": 12770
},
{
"epoch": 0.7415358728132525,
"grad_norm": 0.11117275804281235,
"learning_rate": 0.00010310313516873922,
"loss": 2.7985,
"step": 12780
},
{
"epoch": 0.7421161043256259,
"grad_norm": 0.11544723808765411,
"learning_rate": 0.00010266917056009036,
"loss": 2.8001,
"step": 12790
},
{
"epoch": 0.7426963358379993,
"grad_norm": 0.11005005240440369,
"learning_rate": 0.00010223593254422733,
"loss": 2.7954,
"step": 12800
},
{
"epoch": 0.7432765673503728,
"grad_norm": 0.11374104768037796,
"learning_rate": 0.0001018034227163779,
"loss": 2.8053,
"step": 12810
},
{
"epoch": 0.7438567988627462,
"grad_norm": 0.11264318227767944,
"learning_rate": 0.00010137164266908854,
"loss": 2.8029,
"step": 12820
},
{
"epoch": 0.7444370303751197,
"grad_norm": 0.10718287527561188,
"learning_rate": 0.00010094059399221855,
"loss": 2.7964,
"step": 12830
},
{
"epoch": 0.7450172618874931,
"grad_norm": 0.11395127326250076,
"learning_rate": 0.00010051027827293457,
"loss": 2.8057,
"step": 12840
},
{
"epoch": 0.7455974933998666,
"grad_norm": 0.11251317709684372,
"learning_rate": 0.00010008069709570378,
"loss": 2.8036,
"step": 12850
},
{
"epoch": 0.74617772491224,
"grad_norm": 0.1180030032992363,
"learning_rate": 9.965185204228941e-05,
"loss": 2.8016,
"step": 12860
},
{
"epoch": 0.7467579564246134,
"grad_norm": 0.12361141294240952,
"learning_rate": 9.922374469174372e-05,
"loss": 2.7891,
"step": 12870
},
{
"epoch": 0.7473381879369868,
"grad_norm": 0.11456003040075302,
"learning_rate": 9.879637662040275e-05,
"loss": 2.8028,
"step": 12880
},
{
"epoch": 0.7479184194493603,
"grad_norm": 0.11008987575769424,
"learning_rate": 9.83697494018808e-05,
"loss": 2.8093,
"step": 12890
},
{
"epoch": 0.7484986509617337,
"grad_norm": 0.11017616838216782,
"learning_rate": 9.794386460706356e-05,
"loss": 2.8005,
"step": 12900
},
{
"epoch": 0.7490788824741071,
"grad_norm": 0.11627316474914551,
"learning_rate": 9.751872380410378e-05,
"loss": 2.799,
"step": 12910
},
{
"epoch": 0.7496591139864806,
"grad_norm": 0.11369270831346512,
"learning_rate": 9.709432855841436e-05,
"loss": 2.7941,
"step": 12920
},
{
"epoch": 0.7502393454988541,
"grad_norm": 0.10983362793922424,
"learning_rate": 9.667068043266302e-05,
"loss": 2.7996,
"step": 12930
},
{
"epoch": 0.7508195770112275,
"grad_norm": 0.10419350117444992,
"learning_rate": 9.624778098676652e-05,
"loss": 2.8052,
"step": 12940
},
{
"epoch": 0.7513998085236009,
"grad_norm": 0.10500075668096542,
"learning_rate": 9.582563177788487e-05,
"loss": 2.7993,
"step": 12950
},
{
"epoch": 0.7519800400359744,
"grad_norm": 0.10765775293111801,
"learning_rate": 9.540423436041585e-05,
"loss": 2.7964,
"step": 12960
},
{
"epoch": 0.7525602715483478,
"grad_norm": 0.10872151702642441,
"learning_rate": 9.49835902859888e-05,
"loss": 2.7876,
"step": 12970
},
{
"epoch": 0.7531405030607212,
"grad_norm": 0.10935165733098984,
"learning_rate": 9.456370110345927e-05,
"loss": 2.8003,
"step": 12980
},
{
"epoch": 0.7537207345730946,
"grad_norm": 0.1083398386836052,
"learning_rate": 9.414456835890322e-05,
"loss": 2.7945,
"step": 12990
},
{
"epoch": 0.7543009660854681,
"grad_norm": 0.10846253484487534,
"learning_rate": 9.372619359561121e-05,
"loss": 2.799,
"step": 13000
},
{
"epoch": 0.7543009660854681,
"eval_loss": 2.7616169452667236,
"eval_runtime": 3.2768,
"eval_samples_per_second": 1321.408,
"eval_steps_per_second": 2.747,
"step": 13000
},
{
"epoch": 0.7548811975978416,
"grad_norm": 0.10937865823507309,
"learning_rate": 9.330857835408318e-05,
"loss": 2.7962,
"step": 13010
},
{
"epoch": 0.755461429110215,
"grad_norm": 0.10633205622434616,
"learning_rate": 9.289172417202205e-05,
"loss": 2.7989,
"step": 13020
},
{
"epoch": 0.7560416606225884,
"grad_norm": 0.11001235246658325,
"learning_rate": 9.247563258432861e-05,
"loss": 2.7955,
"step": 13030
},
{
"epoch": 0.7566218921349619,
"grad_norm": 0.10847952216863632,
"learning_rate": 9.206030512309566e-05,
"loss": 2.7959,
"step": 13040
},
{
"epoch": 0.7572021236473353,
"grad_norm": 0.10858704149723053,
"learning_rate": 9.164574331760246e-05,
"loss": 2.7965,
"step": 13050
},
{
"epoch": 0.7577823551597087,
"grad_norm": 0.10710106790065765,
"learning_rate": 9.123194869430888e-05,
"loss": 2.7921,
"step": 13060
},
{
"epoch": 0.7583625866720821,
"grad_norm": 0.10932508111000061,
"learning_rate": 9.081892277685026e-05,
"loss": 2.7921,
"step": 13070
},
{
"epoch": 0.7589428181844556,
"grad_norm": 0.11362321674823761,
"learning_rate": 9.040666708603125e-05,
"loss": 2.7981,
"step": 13080
},
{
"epoch": 0.759523049696829,
"grad_norm": 0.10791613906621933,
"learning_rate": 8.999518313982039e-05,
"loss": 2.7993,
"step": 13090
},
{
"epoch": 0.7601032812092025,
"grad_norm": 0.11038652807474136,
"learning_rate": 8.958447245334476e-05,
"loss": 2.7922,
"step": 13100
},
{
"epoch": 0.7606835127215759,
"grad_norm": 0.11153964698314667,
"learning_rate": 8.91745365388841e-05,
"loss": 2.8016,
"step": 13110
},
{
"epoch": 0.7612637442339494,
"grad_norm": 0.10748942941427231,
"learning_rate": 8.876537690586529e-05,
"loss": 2.791,
"step": 13120
},
{
"epoch": 0.7618439757463228,
"grad_norm": 0.1106482520699501,
"learning_rate": 8.83569950608572e-05,
"loss": 2.8008,
"step": 13130
},
{
"epoch": 0.7624242072586962,
"grad_norm": 0.10443028807640076,
"learning_rate": 8.794939250756441e-05,
"loss": 2.7936,
"step": 13140
},
{
"epoch": 0.7630044387710696,
"grad_norm": 0.11383570730686188,
"learning_rate": 8.754257074682222e-05,
"loss": 2.7912,
"step": 13150
},
{
"epoch": 0.7635846702834431,
"grad_norm": 0.10836578160524368,
"learning_rate": 8.713653127659105e-05,
"loss": 2.7939,
"step": 13160
},
{
"epoch": 0.7641649017958165,
"grad_norm": 0.10870825499296188,
"learning_rate": 8.673127559195066e-05,
"loss": 2.7991,
"step": 13170
},
{
"epoch": 0.7647451333081899,
"grad_norm": 0.10718671977519989,
"learning_rate": 8.632680518509492e-05,
"loss": 2.7879,
"step": 13180
},
{
"epoch": 0.7653253648205635,
"grad_norm": 0.11277935653924942,
"learning_rate": 8.592312154532637e-05,
"loss": 2.7947,
"step": 13190
},
{
"epoch": 0.7659055963329369,
"grad_norm": 0.11088382452726364,
"learning_rate": 8.552022615905038e-05,
"loss": 2.7996,
"step": 13200
},
{
"epoch": 0.7664858278453103,
"grad_norm": 0.10912182927131653,
"learning_rate": 8.511812050977003e-05,
"loss": 2.7943,
"step": 13210
},
{
"epoch": 0.7670660593576837,
"grad_norm": 0.10919041931629181,
"learning_rate": 8.471680607808035e-05,
"loss": 2.7992,
"step": 13220
},
{
"epoch": 0.7676462908700572,
"grad_norm": 0.10616286844015121,
"learning_rate": 8.431628434166309e-05,
"loss": 2.7977,
"step": 13230
},
{
"epoch": 0.7682265223824306,
"grad_norm": 0.10572168231010437,
"learning_rate": 8.391655677528143e-05,
"loss": 2.7959,
"step": 13240
},
{
"epoch": 0.768806753894804,
"grad_norm": 0.10937794297933578,
"learning_rate": 8.3517624850774e-05,
"loss": 2.793,
"step": 13250
},
{
"epoch": 0.7693869854071774,
"grad_norm": 0.10820769518613815,
"learning_rate": 8.311949003704996e-05,
"loss": 2.7991,
"step": 13260
},
{
"epoch": 0.769967216919551,
"grad_norm": 0.10802992433309555,
"learning_rate": 8.272215380008343e-05,
"loss": 2.7965,
"step": 13270
},
{
"epoch": 0.7705474484319244,
"grad_norm": 0.10747858881950378,
"learning_rate": 8.232561760290794e-05,
"loss": 2.7957,
"step": 13280
},
{
"epoch": 0.7711276799442978,
"grad_norm": 0.11238089948892593,
"learning_rate": 8.192988290561157e-05,
"loss": 2.7922,
"step": 13290
},
{
"epoch": 0.7717079114566712,
"grad_norm": 0.1034981980919838,
"learning_rate": 8.153495116533056e-05,
"loss": 2.789,
"step": 13300
},
{
"epoch": 0.7722881429690447,
"grad_norm": 0.10910629481077194,
"learning_rate": 8.11408238362453e-05,
"loss": 2.7899,
"step": 13310
},
{
"epoch": 0.7728683744814181,
"grad_norm": 0.11309719830751419,
"learning_rate": 8.07475023695737e-05,
"loss": 2.7978,
"step": 13320
},
{
"epoch": 0.7734486059937915,
"grad_norm": 0.10908596217632294,
"learning_rate": 8.035498821356664e-05,
"loss": 2.7938,
"step": 13330
},
{
"epoch": 0.7740288375061649,
"grad_norm": 0.11714279651641846,
"learning_rate": 7.996328281350252e-05,
"loss": 2.7967,
"step": 13340
},
{
"epoch": 0.7746090690185384,
"grad_norm": 0.10943669080734253,
"learning_rate": 7.957238761168135e-05,
"loss": 2.7803,
"step": 13350
},
{
"epoch": 0.7751893005309118,
"grad_norm": 0.11171719431877136,
"learning_rate": 7.918230404742045e-05,
"loss": 2.7941,
"step": 13360
},
{
"epoch": 0.7757695320432852,
"grad_norm": 0.10363152623176575,
"learning_rate": 7.879303355704834e-05,
"loss": 2.8043,
"step": 13370
},
{
"epoch": 0.7763497635556587,
"grad_norm": 0.1147744432091713,
"learning_rate": 7.840457757389968e-05,
"loss": 2.8022,
"step": 13380
},
{
"epoch": 0.7769299950680322,
"grad_norm": 0.10682083666324615,
"learning_rate": 7.801693752831012e-05,
"loss": 2.7914,
"step": 13390
},
{
"epoch": 0.7775102265804056,
"grad_norm": 0.11352023482322693,
"learning_rate": 7.763011484761082e-05,
"loss": 2.7958,
"step": 13400
},
{
"epoch": 0.778090458092779,
"grad_norm": 0.10785870254039764,
"learning_rate": 7.724411095612366e-05,
"loss": 2.7971,
"step": 13410
},
{
"epoch": 0.7786706896051525,
"grad_norm": 0.10762759298086166,
"learning_rate": 7.68589272751551e-05,
"loss": 2.7916,
"step": 13420
},
{
"epoch": 0.7792509211175259,
"grad_norm": 0.10556434839963913,
"learning_rate": 7.647456522299207e-05,
"loss": 2.784,
"step": 13430
},
{
"epoch": 0.7798311526298993,
"grad_norm": 0.1077750101685524,
"learning_rate": 7.609102621489577e-05,
"loss": 2.7906,
"step": 13440
},
{
"epoch": 0.7804113841422727,
"grad_norm": 0.10472170263528824,
"learning_rate": 7.570831166309693e-05,
"loss": 2.7833,
"step": 13450
},
{
"epoch": 0.7809916156546463,
"grad_norm": 0.1061674952507019,
"learning_rate": 7.532642297679093e-05,
"loss": 2.796,
"step": 13460
},
{
"epoch": 0.7815718471670197,
"grad_norm": 0.10716653615236282,
"learning_rate": 7.494536156213151e-05,
"loss": 2.791,
"step": 13470
},
{
"epoch": 0.7821520786793931,
"grad_norm": 0.11008104681968689,
"learning_rate": 7.456512882222703e-05,
"loss": 2.7874,
"step": 13480
},
{
"epoch": 0.7827323101917665,
"grad_norm": 0.11095033586025238,
"learning_rate": 7.418572615713413e-05,
"loss": 2.7874,
"step": 13490
},
{
"epoch": 0.78331254170414,
"grad_norm": 0.10690274834632874,
"learning_rate": 7.380715496385316e-05,
"loss": 2.7897,
"step": 13500
},
{
"epoch": 0.7838927732165134,
"grad_norm": 0.10463336110115051,
"learning_rate": 7.34294166363231e-05,
"loss": 2.7965,
"step": 13510
},
{
"epoch": 0.7844730047288868,
"grad_norm": 0.10628803819417953,
"learning_rate": 7.30525125654157e-05,
"loss": 2.7878,
"step": 13520
},
{
"epoch": 0.7850532362412602,
"grad_norm": 0.10758186876773834,
"learning_rate": 7.267644413893152e-05,
"loss": 2.7893,
"step": 13530
},
{
"epoch": 0.7856334677536337,
"grad_norm": 0.10785481333732605,
"learning_rate": 7.230121274159384e-05,
"loss": 2.7896,
"step": 13540
},
{
"epoch": 0.7862136992660071,
"grad_norm": 0.10700030624866486,
"learning_rate": 7.192681975504382e-05,
"loss": 2.786,
"step": 13550
},
{
"epoch": 0.7867939307783806,
"grad_norm": 0.10182949900627136,
"learning_rate": 7.155326655783597e-05,
"loss": 2.7889,
"step": 13560
},
{
"epoch": 0.787374162290754,
"grad_norm": 0.10802864283323288,
"learning_rate": 7.118055452543193e-05,
"loss": 2.7946,
"step": 13570
},
{
"epoch": 0.7879543938031275,
"grad_norm": 0.10849913954734802,
"learning_rate": 7.080868503019672e-05,
"loss": 2.786,
"step": 13580
},
{
"epoch": 0.7885346253155009,
"grad_norm": 0.10770730674266815,
"learning_rate": 7.043765944139264e-05,
"loss": 2.7804,
"step": 13590
},
{
"epoch": 0.7891148568278743,
"grad_norm": 0.11441770195960999,
"learning_rate": 7.006747912517475e-05,
"loss": 2.79,
"step": 13600
},
{
"epoch": 0.7896950883402477,
"grad_norm": 0.10908571630716324,
"learning_rate": 6.9698145444586e-05,
"loss": 2.7897,
"step": 13610
},
{
"epoch": 0.7902753198526212,
"grad_norm": 0.10705877095460892,
"learning_rate": 6.932965975955134e-05,
"loss": 2.7857,
"step": 13620
},
{
"epoch": 0.7908555513649946,
"grad_norm": 0.11635982990264893,
"learning_rate": 6.896202342687397e-05,
"loss": 2.7888,
"step": 13630
},
{
"epoch": 0.791435782877368,
"grad_norm": 0.1107436865568161,
"learning_rate": 6.859523780022911e-05,
"loss": 2.7902,
"step": 13640
},
{
"epoch": 0.7920160143897415,
"grad_norm": 0.11131720244884491,
"learning_rate": 6.822930423016003e-05,
"loss": 2.7982,
"step": 13650
},
{
"epoch": 0.792596245902115,
"grad_norm": 0.10535065829753876,
"learning_rate": 6.786422406407247e-05,
"loss": 2.7838,
"step": 13660
},
{
"epoch": 0.7931764774144884,
"grad_norm": 0.10784085094928741,
"learning_rate": 6.749999864622973e-05,
"loss": 2.7778,
"step": 13670
},
{
"epoch": 0.7937567089268618,
"grad_norm": 0.10266363620758057,
"learning_rate": 6.713662931774818e-05,
"loss": 2.7929,
"step": 13680
},
{
"epoch": 0.7943369404392353,
"grad_norm": 0.11121921241283417,
"learning_rate": 6.677411741659145e-05,
"loss": 2.787,
"step": 13690
},
{
"epoch": 0.7949171719516087,
"grad_norm": 0.10687406361103058,
"learning_rate": 6.641246427756657e-05,
"loss": 2.7915,
"step": 13700
},
{
"epoch": 0.7954974034639821,
"grad_norm": 0.10604474693536758,
"learning_rate": 6.605167123231822e-05,
"loss": 2.7816,
"step": 13710
},
{
"epoch": 0.7960776349763555,
"grad_norm": 0.10484491288661957,
"learning_rate": 6.569173960932404e-05,
"loss": 2.7844,
"step": 13720
},
{
"epoch": 0.796657866488729,
"grad_norm": 0.10788851231336594,
"learning_rate": 6.533267073389034e-05,
"loss": 2.7815,
"step": 13730
},
{
"epoch": 0.7972380980011025,
"grad_norm": 0.10421809554100037,
"learning_rate": 6.49744659281459e-05,
"loss": 2.7953,
"step": 13740
},
{
"epoch": 0.7978183295134759,
"grad_norm": 0.10567434132099152,
"learning_rate": 6.461712651103859e-05,
"loss": 2.7898,
"step": 13750
},
{
"epoch": 0.7983985610258493,
"grad_norm": 0.10381162911653519,
"learning_rate": 6.426065379832959e-05,
"loss": 2.7902,
"step": 13760
},
{
"epoch": 0.7989787925382228,
"grad_norm": 0.10707089304924011,
"learning_rate": 6.390504910258867e-05,
"loss": 2.7923,
"step": 13770
},
{
"epoch": 0.7995590240505962,
"grad_norm": 0.10568366944789886,
"learning_rate": 6.355031373318961e-05,
"loss": 2.793,
"step": 13780
},
{
"epoch": 0.8001392555629696,
"grad_norm": 0.10662976652383804,
"learning_rate": 6.319644899630514e-05,
"loss": 2.7954,
"step": 13790
},
{
"epoch": 0.800719487075343,
"grad_norm": 0.10822783410549164,
"learning_rate": 6.28434561949024e-05,
"loss": 2.7875,
"step": 13800
},
{
"epoch": 0.8012997185877165,
"grad_norm": 0.10903995484113693,
"learning_rate": 6.249133662873783e-05,
"loss": 2.7952,
"step": 13810
},
{
"epoch": 0.8018799501000899,
"grad_norm": 0.11016574501991272,
"learning_rate": 6.214009159435254e-05,
"loss": 2.7833,
"step": 13820
},
{
"epoch": 0.8024601816124634,
"grad_norm": 0.10669629275798798,
"learning_rate": 6.178972238506758e-05,
"loss": 2.7966,
"step": 13830
},
{
"epoch": 0.8030404131248368,
"grad_norm": 0.10725666582584381,
"learning_rate": 6.144023029097891e-05,
"loss": 2.781,
"step": 13840
},
{
"epoch": 0.8036206446372103,
"grad_norm": 0.10259473323822021,
"learning_rate": 6.10916165989533e-05,
"loss": 2.7858,
"step": 13850
},
{
"epoch": 0.8042008761495837,
"grad_norm": 0.10819372534751892,
"learning_rate": 6.0743882592622736e-05,
"loss": 2.782,
"step": 13860
},
{
"epoch": 0.8047811076619571,
"grad_norm": 0.09982424229383469,
"learning_rate": 6.039702955238026e-05,
"loss": 2.7767,
"step": 13870
},
{
"epoch": 0.8053613391743305,
"grad_norm": 0.11254626512527466,
"learning_rate": 6.005105875537515e-05,
"loss": 2.7773,
"step": 13880
},
{
"epoch": 0.805941570686704,
"grad_norm": 0.10880761593580246,
"learning_rate": 5.970597147550808e-05,
"loss": 2.7925,
"step": 13890
},
{
"epoch": 0.8065218021990774,
"grad_norm": 0.10454876720905304,
"learning_rate": 5.936176898342649e-05,
"loss": 2.7887,
"step": 13900
},
{
"epoch": 0.8071020337114508,
"grad_norm": 0.10871117562055588,
"learning_rate": 5.9018452546520165e-05,
"loss": 2.7914,
"step": 13910
},
{
"epoch": 0.8076822652238244,
"grad_norm": 0.10645408183336258,
"learning_rate": 5.8676023428916175e-05,
"loss": 2.7946,
"step": 13920
},
{
"epoch": 0.8082624967361978,
"grad_norm": 0.11597729474306107,
"learning_rate": 5.83344828914743e-05,
"loss": 2.7917,
"step": 13930
},
{
"epoch": 0.8088427282485712,
"grad_norm": 0.1034785658121109,
"learning_rate": 5.799383219178264e-05,
"loss": 2.7912,
"step": 13940
},
{
"epoch": 0.8094229597609446,
"grad_norm": 0.10739534348249435,
"learning_rate": 5.7654072584152787e-05,
"loss": 2.7848,
"step": 13950
},
{
"epoch": 0.8100031912733181,
"grad_norm": 0.10825861990451813,
"learning_rate": 5.731520531961505e-05,
"loss": 2.7908,
"step": 13960
},
{
"epoch": 0.8105834227856915,
"grad_norm": 0.10880185663700104,
"learning_rate": 5.697723164591441e-05,
"loss": 2.7904,
"step": 13970
},
{
"epoch": 0.8111636542980649,
"grad_norm": 0.1085624098777771,
"learning_rate": 5.6640152807505236e-05,
"loss": 2.7839,
"step": 13980
},
{
"epoch": 0.8117438858104383,
"grad_norm": 0.10740832984447479,
"learning_rate": 5.630397004554713e-05,
"loss": 2.7858,
"step": 13990
},
{
"epoch": 0.8123241173228118,
"grad_norm": 0.10401804000139236,
"learning_rate": 5.596868459790025e-05,
"loss": 2.7802,
"step": 14000
},
{
"epoch": 0.8123241173228118,
"eval_loss": 2.749423027038574,
"eval_runtime": 3.2586,
"eval_samples_per_second": 1328.792,
"eval_steps_per_second": 2.762,
"step": 14000
},
{
"epoch": 0.8129043488351853,
"grad_norm": 0.10784956812858582,
"learning_rate": 5.563429769912071e-05,
"loss": 2.7852,
"step": 14010
},
{
"epoch": 0.8134845803475587,
"grad_norm": 0.10523492097854614,
"learning_rate": 5.530081058045606e-05,
"loss": 2.7856,
"step": 14020
},
{
"epoch": 0.8140648118599321,
"grad_norm": 0.10354667156934738,
"learning_rate": 5.4968224469840935e-05,
"loss": 2.7826,
"step": 14030
},
{
"epoch": 0.8146450433723056,
"grad_norm": 0.10460636019706726,
"learning_rate": 5.4636540591892164e-05,
"loss": 2.7844,
"step": 14040
},
{
"epoch": 0.815225274884679,
"grad_norm": 0.11116158217191696,
"learning_rate": 5.430576016790453e-05,
"loss": 2.7879,
"step": 14050
},
{
"epoch": 0.8158055063970524,
"grad_norm": 0.11445162445306778,
"learning_rate": 5.3975884415846206e-05,
"loss": 2.7847,
"step": 14060
},
{
"epoch": 0.8163857379094258,
"grad_norm": 0.10757939517498016,
"learning_rate": 5.3646914550354204e-05,
"loss": 2.7884,
"step": 14070
},
{
"epoch": 0.8169659694217993,
"grad_norm": 0.10770777612924576,
"learning_rate": 5.331885178273015e-05,
"loss": 2.775,
"step": 14080
},
{
"epoch": 0.8175462009341727,
"grad_norm": 0.10863149166107178,
"learning_rate": 5.2991697320935486e-05,
"loss": 2.7883,
"step": 14090
},
{
"epoch": 0.8181264324465461,
"grad_norm": 0.10049009323120117,
"learning_rate": 5.266545236958718e-05,
"loss": 2.7878,
"step": 14100
},
{
"epoch": 0.8187066639589196,
"grad_norm": 0.104975625872612,
"learning_rate": 5.2340118129953346e-05,
"loss": 2.7806,
"step": 14110
},
{
"epoch": 0.8192868954712931,
"grad_norm": 0.10563846677541733,
"learning_rate": 5.201569579994865e-05,
"loss": 2.7807,
"step": 14120
},
{
"epoch": 0.8198671269836665,
"grad_norm": 0.10182633996009827,
"learning_rate": 5.1692186574130324e-05,
"loss": 2.7782,
"step": 14130
},
{
"epoch": 0.8204473584960399,
"grad_norm": 0.10903611779212952,
"learning_rate": 5.1369591643692896e-05,
"loss": 2.7792,
"step": 14140
},
{
"epoch": 0.8210275900084134,
"grad_norm": 0.10453125089406967,
"learning_rate": 5.1047912196464944e-05,
"loss": 2.7814,
"step": 14150
},
{
"epoch": 0.8216078215207868,
"grad_norm": 0.11026264727115631,
"learning_rate": 5.072714941690387e-05,
"loss": 2.7847,
"step": 14160
},
{
"epoch": 0.8221880530331602,
"grad_norm": 0.10732634365558624,
"learning_rate": 5.040730448609166e-05,
"loss": 2.7716,
"step": 14170
},
{
"epoch": 0.8227682845455336,
"grad_norm": 0.10351432114839554,
"learning_rate": 5.008837858173113e-05,
"loss": 2.7883,
"step": 14180
},
{
"epoch": 0.8233485160579072,
"grad_norm": 0.10946208238601685,
"learning_rate": 4.9770372878140575e-05,
"loss": 2.786,
"step": 14190
},
{
"epoch": 0.8239287475702806,
"grad_norm": 0.1038416251540184,
"learning_rate": 4.9453288546250494e-05,
"loss": 2.7799,
"step": 14200
},
{
"epoch": 0.824508979082654,
"grad_norm": 0.10568647086620331,
"learning_rate": 4.913712675359861e-05,
"loss": 2.7874,
"step": 14210
},
{
"epoch": 0.8250892105950274,
"grad_norm": 0.10334275662899017,
"learning_rate": 4.882188866432568e-05,
"loss": 2.7835,
"step": 14220
},
{
"epoch": 0.8256694421074009,
"grad_norm": 0.10559739917516708,
"learning_rate": 4.850757543917144e-05,
"loss": 2.7791,
"step": 14230
},
{
"epoch": 0.8262496736197743,
"grad_norm": 0.1026688888669014,
"learning_rate": 4.819418823546999e-05,
"loss": 2.7777,
"step": 14240
},
{
"epoch": 0.8268299051321477,
"grad_norm": 0.10159046947956085,
"learning_rate": 4.788172820714611e-05,
"loss": 2.7876,
"step": 14250
},
{
"epoch": 0.8274101366445211,
"grad_norm": 0.114133320748806,
"learning_rate": 4.7570196504710026e-05,
"loss": 2.7777,
"step": 14260
},
{
"epoch": 0.8279903681568946,
"grad_norm": 0.10327325016260147,
"learning_rate": 4.725959427525432e-05,
"loss": 2.7976,
"step": 14270
},
{
"epoch": 0.828570599669268,
"grad_norm": 0.10618502646684647,
"learning_rate": 4.694992266244889e-05,
"loss": 2.7904,
"step": 14280
},
{
"epoch": 0.8291508311816415,
"grad_norm": 0.10732074081897736,
"learning_rate": 4.6641182806537e-05,
"loss": 2.7724,
"step": 14290
},
{
"epoch": 0.8297310626940149,
"grad_norm": 0.10467931628227234,
"learning_rate": 4.63333758443313e-05,
"loss": 2.7843,
"step": 14300
},
{
"epoch": 0.8303112942063884,
"grad_norm": 0.10281146317720413,
"learning_rate": 4.6026502909209004e-05,
"loss": 2.7842,
"step": 14310
},
{
"epoch": 0.8308915257187618,
"grad_norm": 0.1023208498954773,
"learning_rate": 4.572056513110867e-05,
"loss": 2.774,
"step": 14320
},
{
"epoch": 0.8314717572311352,
"grad_norm": 0.10323374718427658,
"learning_rate": 4.541556363652511e-05,
"loss": 2.7755,
"step": 14330
},
{
"epoch": 0.8320519887435086,
"grad_norm": 0.10136920213699341,
"learning_rate": 4.5111499548505727e-05,
"loss": 2.7814,
"step": 14340
},
{
"epoch": 0.8326322202558821,
"grad_norm": 0.10571028292179108,
"learning_rate": 4.4808373986646565e-05,
"loss": 2.7878,
"step": 14350
},
{
"epoch": 0.8332124517682555,
"grad_norm": 0.10252848267555237,
"learning_rate": 4.45061880670874e-05,
"loss": 2.7754,
"step": 14360
},
{
"epoch": 0.8337926832806289,
"grad_norm": 0.10471548140048981,
"learning_rate": 4.420494290250869e-05,
"loss": 2.7767,
"step": 14370
},
{
"epoch": 0.8343729147930025,
"grad_norm": 0.10701679438352585,
"learning_rate": 4.390463960212658e-05,
"loss": 2.7792,
"step": 14380
},
{
"epoch": 0.8349531463053759,
"grad_norm": 0.10377515107393265,
"learning_rate": 4.3605279271689264e-05,
"loss": 2.7829,
"step": 14390
},
{
"epoch": 0.8355333778177493,
"grad_norm": 0.10350141674280167,
"learning_rate": 4.330686301347298e-05,
"loss": 2.7861,
"step": 14400
},
{
"epoch": 0.8361136093301227,
"grad_norm": 0.10299152880907059,
"learning_rate": 4.300939192627742e-05,
"loss": 2.7891,
"step": 14410
},
{
"epoch": 0.8366938408424962,
"grad_norm": 0.1038345992565155,
"learning_rate": 4.2712867105422465e-05,
"loss": 2.7812,
"step": 14420
},
{
"epoch": 0.8372740723548696,
"grad_norm": 0.10262761265039444,
"learning_rate": 4.241728964274352e-05,
"loss": 2.7784,
"step": 14430
},
{
"epoch": 0.837854303867243,
"grad_norm": 0.10034337639808655,
"learning_rate": 4.212266062658777e-05,
"loss": 2.7857,
"step": 14440
},
{
"epoch": 0.8384345353796164,
"grad_norm": 0.10054679960012436,
"learning_rate": 4.1828981141810104e-05,
"loss": 2.7783,
"step": 14450
},
{
"epoch": 0.83901476689199,
"grad_norm": 0.10352133959531784,
"learning_rate": 4.15362522697691e-05,
"loss": 2.7936,
"step": 14460
},
{
"epoch": 0.8395949984043634,
"grad_norm": 0.10465723276138306,
"learning_rate": 4.124447508832332e-05,
"loss": 2.7692,
"step": 14470
},
{
"epoch": 0.8401752299167368,
"grad_norm": 0.10384640097618103,
"learning_rate": 4.095365067182665e-05,
"loss": 2.781,
"step": 14480
},
{
"epoch": 0.8407554614291102,
"grad_norm": 0.10312188416719437,
"learning_rate": 4.066378009112523e-05,
"loss": 2.7767,
"step": 14490
},
{
"epoch": 0.8413356929414837,
"grad_norm": 0.10447024554014206,
"learning_rate": 4.037486441355288e-05,
"loss": 2.7832,
"step": 14500
},
{
"epoch": 0.8419159244538571,
"grad_norm": 0.10162138938903809,
"learning_rate": 4.008690470292732e-05,
"loss": 2.7786,
"step": 14510
},
{
"epoch": 0.8424961559662305,
"grad_norm": 0.09777431935071945,
"learning_rate": 3.979990201954653e-05,
"loss": 2.7792,
"step": 14520
},
{
"epoch": 0.8430763874786039,
"grad_norm": 0.10050346702337265,
"learning_rate": 3.9513857420184216e-05,
"loss": 2.7866,
"step": 14530
},
{
"epoch": 0.8436566189909774,
"grad_norm": 0.10209480673074722,
"learning_rate": 3.922877195808678e-05,
"loss": 2.7886,
"step": 14540
},
{
"epoch": 0.8442368505033508,
"grad_norm": 0.10496553033590317,
"learning_rate": 3.894464668296864e-05,
"loss": 2.7854,
"step": 14550
},
{
"epoch": 0.8448170820157243,
"grad_norm": 0.10205195099115372,
"learning_rate": 3.8661482641008866e-05,
"loss": 2.7869,
"step": 14560
},
{
"epoch": 0.8453973135280977,
"grad_norm": 0.10940441489219666,
"learning_rate": 3.837928087484711e-05,
"loss": 2.7799,
"step": 14570
},
{
"epoch": 0.8459775450404712,
"grad_norm": 0.10287832468748093,
"learning_rate": 3.8098042423579766e-05,
"loss": 2.7804,
"step": 14580
},
{
"epoch": 0.8465577765528446,
"grad_norm": 0.0999421551823616,
"learning_rate": 3.781776832275639e-05,
"loss": 2.7835,
"step": 14590
},
{
"epoch": 0.847138008065218,
"grad_norm": 0.10340355336666107,
"learning_rate": 3.753845960437557e-05,
"loss": 2.7831,
"step": 14600
},
{
"epoch": 0.8477182395775914,
"grad_norm": 0.10355892032384872,
"learning_rate": 3.72601172968812e-05,
"loss": 2.7749,
"step": 14610
},
{
"epoch": 0.8482984710899649,
"grad_norm": 0.10467097908258438,
"learning_rate": 3.6982742425158886e-05,
"loss": 2.7834,
"step": 14620
},
{
"epoch": 0.8488787026023383,
"grad_norm": 0.1060672402381897,
"learning_rate": 3.670633601053182e-05,
"loss": 2.7801,
"step": 14630
},
{
"epoch": 0.8494589341147117,
"grad_norm": 0.10443491488695145,
"learning_rate": 3.643089907075759e-05,
"loss": 2.7896,
"step": 14640
},
{
"epoch": 0.8500391656270853,
"grad_norm": 0.1023486852645874,
"learning_rate": 3.6156432620023726e-05,
"loss": 2.7691,
"step": 14650
},
{
"epoch": 0.8506193971394587,
"grad_norm": 0.10417921096086502,
"learning_rate": 3.5882937668944476e-05,
"loss": 2.7703,
"step": 14660
},
{
"epoch": 0.8511996286518321,
"grad_norm": 0.10138606280088425,
"learning_rate": 3.561041522455691e-05,
"loss": 2.7885,
"step": 14670
},
{
"epoch": 0.8517798601642055,
"grad_norm": 0.10121186077594757,
"learning_rate": 3.5338866290317204e-05,
"loss": 2.7721,
"step": 14680
},
{
"epoch": 0.852360091676579,
"grad_norm": 0.10391680151224136,
"learning_rate": 3.506829186609691e-05,
"loss": 2.7818,
"step": 14690
},
{
"epoch": 0.8529403231889524,
"grad_norm": 0.10207725316286087,
"learning_rate": 3.479869294817955e-05,
"loss": 2.775,
"step": 14700
},
{
"epoch": 0.8535205547013258,
"grad_norm": 0.10676626861095428,
"learning_rate": 3.4530070529256524e-05,
"loss": 2.7759,
"step": 14710
},
{
"epoch": 0.8541007862136992,
"grad_norm": 0.10105539858341217,
"learning_rate": 3.42624255984237e-05,
"loss": 2.7855,
"step": 14720
},
{
"epoch": 0.8546810177260727,
"grad_norm": 0.10040144622325897,
"learning_rate": 3.399575914117777e-05,
"loss": 2.7736,
"step": 14730
},
{
"epoch": 0.8552612492384462,
"grad_norm": 0.10322125256061554,
"learning_rate": 3.3730072139412456e-05,
"loss": 2.7834,
"step": 14740
},
{
"epoch": 0.8558414807508196,
"grad_norm": 0.10220754891633987,
"learning_rate": 3.3465365571415315e-05,
"loss": 2.7692,
"step": 14750
},
{
"epoch": 0.856421712263193,
"grad_norm": 0.10107099264860153,
"learning_rate": 3.3201640411863584e-05,
"loss": 2.7672,
"step": 14760
},
{
"epoch": 0.8570019437755665,
"grad_norm": 0.10284842550754547,
"learning_rate": 3.293889763182089e-05,
"loss": 2.7851,
"step": 14770
},
{
"epoch": 0.8575821752879399,
"grad_norm": 0.10386528819799423,
"learning_rate": 3.26771381987337e-05,
"loss": 2.7787,
"step": 14780
},
{
"epoch": 0.8581624068003133,
"grad_norm": 0.1039406880736351,
"learning_rate": 3.241636307642769e-05,
"loss": 2.7838,
"step": 14790
},
{
"epoch": 0.8587426383126867,
"grad_norm": 0.1034376472234726,
"learning_rate": 3.2156573225104145e-05,
"loss": 2.7794,
"step": 14800
},
{
"epoch": 0.8593228698250602,
"grad_norm": 0.10199546813964844,
"learning_rate": 3.189776960133645e-05,
"loss": 2.7806,
"step": 14810
},
{
"epoch": 0.8599031013374336,
"grad_norm": 0.10086624324321747,
"learning_rate": 3.163995315806681e-05,
"loss": 2.7666,
"step": 14820
},
{
"epoch": 0.860483332849807,
"grad_norm": 0.10021676123142242,
"learning_rate": 3.138312484460228e-05,
"loss": 2.7738,
"step": 14830
},
{
"epoch": 0.8610635643621805,
"grad_norm": 0.10465867072343826,
"learning_rate": 3.112728560661164e-05,
"loss": 2.7786,
"step": 14840
},
{
"epoch": 0.861643795874554,
"grad_norm": 0.10076703131198883,
"learning_rate": 3.0872436386121776e-05,
"loss": 2.7705,
"step": 14850
},
{
"epoch": 0.8622240273869274,
"grad_norm": 0.10121941566467285,
"learning_rate": 3.061857812151414e-05,
"loss": 2.7737,
"step": 14860
},
{
"epoch": 0.8628042588993008,
"grad_norm": 0.10309196263551712,
"learning_rate": 3.0365711747521538e-05,
"loss": 2.7783,
"step": 14870
},
{
"epoch": 0.8633844904116743,
"grad_norm": 0.10456740111112595,
"learning_rate": 3.011383819522446e-05,
"loss": 2.7809,
"step": 14880
},
{
"epoch": 0.8639647219240477,
"grad_norm": 0.1025143563747406,
"learning_rate": 2.986295839204764e-05,
"loss": 2.7813,
"step": 14890
},
{
"epoch": 0.8645449534364211,
"grad_norm": 0.10585116595029831,
"learning_rate": 2.961307326175688e-05,
"loss": 2.7738,
"step": 14900
},
{
"epoch": 0.8651251849487945,
"grad_norm": 0.10203658789396286,
"learning_rate": 2.936418372445527e-05,
"loss": 2.7777,
"step": 14910
},
{
"epoch": 0.865705416461168,
"grad_norm": 0.10538860410451889,
"learning_rate": 2.911629069658037e-05,
"loss": 2.7757,
"step": 14920
},
{
"epoch": 0.8662856479735415,
"grad_norm": 0.10184674710035324,
"learning_rate": 2.8869395090900037e-05,
"loss": 2.7797,
"step": 14930
},
{
"epoch": 0.8668658794859149,
"grad_norm": 0.10757064819335938,
"learning_rate": 2.862349781650991e-05,
"loss": 2.7837,
"step": 14940
},
{
"epoch": 0.8674461109982883,
"grad_norm": 0.09947676211595535,
"learning_rate": 2.8378599778829492e-05,
"loss": 2.7764,
"step": 14950
},
{
"epoch": 0.8680263425106618,
"grad_norm": 0.0980169028043747,
"learning_rate": 2.8134701879598965e-05,
"loss": 2.7877,
"step": 14960
},
{
"epoch": 0.8686065740230352,
"grad_norm": 0.09837668389081955,
"learning_rate": 2.7891805016876057e-05,
"loss": 2.7806,
"step": 14970
},
{
"epoch": 0.8691868055354086,
"grad_norm": 0.09911120682954788,
"learning_rate": 2.7649910085032277e-05,
"loss": 2.7807,
"step": 14980
},
{
"epoch": 0.869767037047782,
"grad_norm": 0.09837288409471512,
"learning_rate": 2.7409017974750257e-05,
"loss": 2.7677,
"step": 14990
},
{
"epoch": 0.8703472685601555,
"grad_norm": 0.10560393333435059,
"learning_rate": 2.7169129573019943e-05,
"loss": 2.7785,
"step": 15000
},
{
"epoch": 0.8703472685601555,
"eval_loss": 2.7414441108703613,
"eval_runtime": 3.2661,
"eval_samples_per_second": 1325.755,
"eval_steps_per_second": 2.756,
"step": 15000
},
{
"epoch": 0.870927500072529,
"grad_norm": 0.09839779883623123,
"learning_rate": 2.6930245763135504e-05,
"loss": 2.7759,
"step": 15010
},
{
"epoch": 0.8715077315849024,
"grad_norm": 0.09770379960536957,
"learning_rate": 2.6692367424692272e-05,
"loss": 2.787,
"step": 15020
},
{
"epoch": 0.8720879630972758,
"grad_norm": 0.09834130108356476,
"learning_rate": 2.645549543358304e-05,
"loss": 2.7731,
"step": 15030
},
{
"epoch": 0.8726681946096493,
"grad_norm": 0.1047162264585495,
"learning_rate": 2.6219630661995528e-05,
"loss": 2.7832,
"step": 15040
},
{
"epoch": 0.8732484261220227,
"grad_norm": 0.10111907124519348,
"learning_rate": 2.5984773978408257e-05,
"loss": 2.779,
"step": 15050
},
{
"epoch": 0.8738286576343961,
"grad_norm": 0.10093654692173004,
"learning_rate": 2.5750926247588322e-05,
"loss": 2.768,
"step": 15060
},
{
"epoch": 0.8744088891467695,
"grad_norm": 0.10071719437837601,
"learning_rate": 2.551808833058755e-05,
"loss": 2.7867,
"step": 15070
},
{
"epoch": 0.874989120659143,
"grad_norm": 0.10237322747707367,
"learning_rate": 2.5286261084739445e-05,
"loss": 2.7838,
"step": 15080
},
{
"epoch": 0.8755693521715164,
"grad_norm": 0.09815766662359238,
"learning_rate": 2.5055445363656358e-05,
"loss": 2.7839,
"step": 15090
},
{
"epoch": 0.8761495836838898,
"grad_norm": 0.10203532874584198,
"learning_rate": 2.482564201722581e-05,
"loss": 2.7878,
"step": 15100
},
{
"epoch": 0.8767298151962634,
"grad_norm": 0.10766585171222687,
"learning_rate": 2.4596851891607884e-05,
"loss": 2.7823,
"step": 15110
},
{
"epoch": 0.8773100467086368,
"grad_norm": 0.09876078367233276,
"learning_rate": 2.4369075829231766e-05,
"loss": 2.7762,
"step": 15120
},
{
"epoch": 0.8778902782210102,
"grad_norm": 0.10014016181230545,
"learning_rate": 2.414231466879274e-05,
"loss": 2.7733,
"step": 15130
},
{
"epoch": 0.8784705097333836,
"grad_norm": 0.10114018619060516,
"learning_rate": 2.3916569245249306e-05,
"loss": 2.7861,
"step": 15140
},
{
"epoch": 0.8790507412457571,
"grad_norm": 0.10012462735176086,
"learning_rate": 2.3691840389819526e-05,
"loss": 2.7635,
"step": 15150
},
{
"epoch": 0.8796309727581305,
"grad_norm": 0.10367590934038162,
"learning_rate": 2.3468128929978757e-05,
"loss": 2.7727,
"step": 15160
},
{
"epoch": 0.8802112042705039,
"grad_norm": 0.10224179178476334,
"learning_rate": 2.3245435689456015e-05,
"loss": 2.7712,
"step": 15170
},
{
"epoch": 0.8807914357828773,
"grad_norm": 0.0989450216293335,
"learning_rate": 2.302376148823102e-05,
"loss": 2.7761,
"step": 15180
},
{
"epoch": 0.8813716672952508,
"grad_norm": 0.10036759078502655,
"learning_rate": 2.2803107142531617e-05,
"loss": 2.7815,
"step": 15190
},
{
"epoch": 0.8819518988076243,
"grad_norm": 0.10400567203760147,
"learning_rate": 2.2583473464830005e-05,
"loss": 2.7826,
"step": 15200
},
{
"epoch": 0.8825321303199977,
"grad_norm": 0.09990741312503815,
"learning_rate": 2.2364861263840507e-05,
"loss": 2.7869,
"step": 15210
},
{
"epoch": 0.8831123618323711,
"grad_norm": 0.10067487508058548,
"learning_rate": 2.2147271344516128e-05,
"loss": 2.7771,
"step": 15220
},
{
"epoch": 0.8836925933447446,
"grad_norm": 0.10068360716104507,
"learning_rate": 2.1930704508045714e-05,
"loss": 2.781,
"step": 15230
},
{
"epoch": 0.884272824857118,
"grad_norm": 0.10076344013214111,
"learning_rate": 2.171516155185117e-05,
"loss": 2.7793,
"step": 15240
},
{
"epoch": 0.8848530563694914,
"grad_norm": 0.0988764762878418,
"learning_rate": 2.1500643269584027e-05,
"loss": 2.772,
"step": 15250
},
{
"epoch": 0.8854332878818648,
"grad_norm": 0.09937159717082977,
"learning_rate": 2.1287150451123224e-05,
"loss": 2.7786,
"step": 15260
},
{
"epoch": 0.8860135193942383,
"grad_norm": 0.10244645178318024,
"learning_rate": 2.1074683882571675e-05,
"loss": 2.7752,
"step": 15270
},
{
"epoch": 0.8865937509066117,
"grad_norm": 0.09691537171602249,
"learning_rate": 2.0863244346253517e-05,
"loss": 2.7735,
"step": 15280
},
{
"epoch": 0.8871739824189852,
"grad_norm": 0.09877140074968338,
"learning_rate": 2.065283262071128e-05,
"loss": 2.777,
"step": 15290
},
{
"epoch": 0.8877542139313586,
"grad_norm": 0.09832227975130081,
"learning_rate": 2.044344948070289e-05,
"loss": 2.7718,
"step": 15300
},
{
"epoch": 0.8883344454437321,
"grad_norm": 0.09934905916452408,
"learning_rate": 2.02350956971992e-05,
"loss": 2.7725,
"step": 15310
},
{
"epoch": 0.8889146769561055,
"grad_norm": 0.09960002452135086,
"learning_rate": 2.0027772037380463e-05,
"loss": 2.77,
"step": 15320
},
{
"epoch": 0.8894949084684789,
"grad_norm": 0.10142461210489273,
"learning_rate": 1.9821479264634234e-05,
"loss": 2.7781,
"step": 15330
},
{
"epoch": 0.8900751399808524,
"grad_norm": 0.09648580849170685,
"learning_rate": 1.96162181385521e-05,
"loss": 2.7774,
"step": 15340
},
{
"epoch": 0.8906553714932258,
"grad_norm": 0.09822871536016464,
"learning_rate": 1.9411989414926953e-05,
"loss": 2.7718,
"step": 15350
},
{
"epoch": 0.8912356030055992,
"grad_norm": 0.1000954881310463,
"learning_rate": 1.9208793845750504e-05,
"loss": 2.7763,
"step": 15360
},
{
"epoch": 0.8918158345179726,
"grad_norm": 0.10170748084783554,
"learning_rate": 1.9006632179209925e-05,
"loss": 2.78,
"step": 15370
},
{
"epoch": 0.8923960660303462,
"grad_norm": 0.10458207130432129,
"learning_rate": 1.8805505159685807e-05,
"loss": 2.77,
"step": 15380
},
{
"epoch": 0.8929762975427196,
"grad_norm": 0.09986699372529984,
"learning_rate": 1.8605413527748823e-05,
"loss": 2.776,
"step": 15390
},
{
"epoch": 0.893556529055093,
"grad_norm": 0.09813553094863892,
"learning_rate": 1.8406358020157364e-05,
"loss": 2.7711,
"step": 15400
},
{
"epoch": 0.8941367605674664,
"grad_norm": 0.09960541874170303,
"learning_rate": 1.8208339369854663e-05,
"loss": 2.7781,
"step": 15410
},
{
"epoch": 0.8947169920798399,
"grad_norm": 0.09737250953912735,
"learning_rate": 1.801135830596605e-05,
"loss": 2.7657,
"step": 15420
},
{
"epoch": 0.8952972235922133,
"grad_norm": 0.0949782207608223,
"learning_rate": 1.7815415553796575e-05,
"loss": 2.7705,
"step": 15430
},
{
"epoch": 0.8958774551045867,
"grad_norm": 0.09773328900337219,
"learning_rate": 1.762051183482788e-05,
"loss": 2.7684,
"step": 15440
},
{
"epoch": 0.8964576866169601,
"grad_norm": 0.09638100862503052,
"learning_rate": 1.7426647866715925e-05,
"loss": 2.7724,
"step": 15450
},
{
"epoch": 0.8970379181293336,
"grad_norm": 0.09620904177427292,
"learning_rate": 1.7233824363288118e-05,
"loss": 2.7738,
"step": 15460
},
{
"epoch": 0.897618149641707,
"grad_norm": 0.09929810464382172,
"learning_rate": 1.7042042034540783e-05,
"loss": 2.7754,
"step": 15470
},
{
"epoch": 0.8981983811540805,
"grad_norm": 0.09778960049152374,
"learning_rate": 1.6851301586636613e-05,
"loss": 2.7766,
"step": 15480
},
{
"epoch": 0.8987786126664539,
"grad_norm": 0.09684190899133682,
"learning_rate": 1.6661603721901873e-05,
"loss": 2.7777,
"step": 15490
},
{
"epoch": 0.8993588441788274,
"grad_norm": 0.09664195775985718,
"learning_rate": 1.6472949138823967e-05,
"loss": 2.7859,
"step": 15500
},
{
"epoch": 0.8999390756912008,
"grad_norm": 0.10036718100309372,
"learning_rate": 1.628533853204883e-05,
"loss": 2.7713,
"step": 15510
},
{
"epoch": 0.9005193072035742,
"grad_norm": 0.09811628609895706,
"learning_rate": 1.6098772592378417e-05,
"loss": 2.7733,
"step": 15520
},
{
"epoch": 0.9010995387159476,
"grad_norm": 0.09862551838159561,
"learning_rate": 1.591325200676795e-05,
"loss": 2.7701,
"step": 15530
},
{
"epoch": 0.9016797702283211,
"grad_norm": 0.09947618097066879,
"learning_rate": 1.5728777458323803e-05,
"loss": 2.7771,
"step": 15540
},
{
"epoch": 0.9022600017406945,
"grad_norm": 0.09834101796150208,
"learning_rate": 1.554534962630053e-05,
"loss": 2.7768,
"step": 15550
},
{
"epoch": 0.902840233253068,
"grad_norm": 0.10113567858934402,
"learning_rate": 1.5362969186098594e-05,
"loss": 2.7682,
"step": 15560
},
{
"epoch": 0.9034204647654415,
"grad_norm": 0.0977102592587471,
"learning_rate": 1.5181636809261921e-05,
"loss": 2.7769,
"step": 15570
},
{
"epoch": 0.9040006962778149,
"grad_norm": 0.09831026196479797,
"learning_rate": 1.5001353163475283e-05,
"loss": 2.7681,
"step": 15580
},
{
"epoch": 0.9045809277901883,
"grad_norm": 0.09537149965763092,
"learning_rate": 1.4822118912561943e-05,
"loss": 2.7628,
"step": 15590
},
{
"epoch": 0.9051611593025617,
"grad_norm": 0.09654498845338821,
"learning_rate": 1.4643934716481253e-05,
"loss": 2.7676,
"step": 15600
},
{
"epoch": 0.9057413908149352,
"grad_norm": 0.09738855808973312,
"learning_rate": 1.446680123132603e-05,
"loss": 2.7744,
"step": 15610
},
{
"epoch": 0.9063216223273086,
"grad_norm": 0.10082467645406723,
"learning_rate": 1.4290719109320382e-05,
"loss": 2.7706,
"step": 15620
},
{
"epoch": 0.906901853839682,
"grad_norm": 0.10283984988927841,
"learning_rate": 1.4115688998817043e-05,
"loss": 2.7742,
"step": 15630
},
{
"epoch": 0.9074820853520554,
"grad_norm": 0.09994236379861832,
"learning_rate": 1.3941711544295287e-05,
"loss": 2.7638,
"step": 15640
},
{
"epoch": 0.908062316864429,
"grad_norm": 0.09737379103899002,
"learning_rate": 1.3768787386358282e-05,
"loss": 2.7715,
"step": 15650
},
{
"epoch": 0.9086425483768024,
"grad_norm": 0.09915235638618469,
"learning_rate": 1.3596917161730902e-05,
"loss": 2.7694,
"step": 15660
},
{
"epoch": 0.9092227798891758,
"grad_norm": 0.09791626036167145,
"learning_rate": 1.3426101503257358e-05,
"loss": 2.7628,
"step": 15670
},
{
"epoch": 0.9098030114015492,
"grad_norm": 0.09681922197341919,
"learning_rate": 1.3256341039898766e-05,
"loss": 2.7741,
"step": 15680
},
{
"epoch": 0.9103832429139227,
"grad_norm": 0.09645412862300873,
"learning_rate": 1.3087636396730949e-05,
"loss": 2.7704,
"step": 15690
},
{
"epoch": 0.9109634744262961,
"grad_norm": 0.09795381873846054,
"learning_rate": 1.2919988194942011e-05,
"loss": 2.7666,
"step": 15700
},
{
"epoch": 0.9115437059386695,
"grad_norm": 0.09636548161506653,
"learning_rate": 1.2753397051830294e-05,
"loss": 2.7763,
"step": 15710
},
{
"epoch": 0.9121239374510429,
"grad_norm": 0.0992702841758728,
"learning_rate": 1.2587863580801794e-05,
"loss": 2.7693,
"step": 15720
},
{
"epoch": 0.9127041689634164,
"grad_norm": 0.09708980470895767,
"learning_rate": 1.2423388391368083e-05,
"loss": 2.7696,
"step": 15730
},
{
"epoch": 0.9132844004757898,
"grad_norm": 0.09657064080238342,
"learning_rate": 1.2259972089144054e-05,
"loss": 2.7799,
"step": 15740
},
{
"epoch": 0.9138646319881633,
"grad_norm": 0.09743205457925797,
"learning_rate": 1.2097615275845617e-05,
"loss": 2.7683,
"step": 15750
},
{
"epoch": 0.9144448635005367,
"grad_norm": 0.09803003072738647,
"learning_rate": 1.1936318549287638e-05,
"loss": 2.7731,
"step": 15760
},
{
"epoch": 0.9150250950129102,
"grad_norm": 0.0977969542145729,
"learning_rate": 1.1776082503381468e-05,
"loss": 2.778,
"step": 15770
},
{
"epoch": 0.9156053265252836,
"grad_norm": 0.0986003428697586,
"learning_rate": 1.1616907728133084e-05,
"loss": 2.7794,
"step": 15780
},
{
"epoch": 0.916185558037657,
"grad_norm": 0.09887285530567169,
"learning_rate": 1.1458794809640693e-05,
"loss": 2.7743,
"step": 15790
},
{
"epoch": 0.9167657895500304,
"grad_norm": 0.10056151449680328,
"learning_rate": 1.1301744330092522e-05,
"loss": 2.7739,
"step": 15800
},
{
"epoch": 0.9173460210624039,
"grad_norm": 0.09636414051055908,
"learning_rate": 1.1145756867765033e-05,
"loss": 2.7772,
"step": 15810
},
{
"epoch": 0.9179262525747773,
"grad_norm": 0.09793318808078766,
"learning_rate": 1.0990832997020282e-05,
"loss": 2.7729,
"step": 15820
},
{
"epoch": 0.9185064840871507,
"grad_norm": 0.09378232061862946,
"learning_rate": 1.0836973288304229e-05,
"loss": 2.7783,
"step": 15830
},
{
"epoch": 0.9190867155995243,
"grad_norm": 0.09904693067073822,
"learning_rate": 1.0684178308144498e-05,
"loss": 2.7697,
"step": 15840
},
{
"epoch": 0.9196669471118977,
"grad_norm": 0.0982363149523735,
"learning_rate": 1.0532448619148115e-05,
"loss": 2.7712,
"step": 15850
},
{
"epoch": 0.9202471786242711,
"grad_norm": 0.0995451807975769,
"learning_rate": 1.038178477999978e-05,
"loss": 2.7702,
"step": 15860
},
{
"epoch": 0.9208274101366445,
"grad_norm": 0.09749618917703629,
"learning_rate": 1.0232187345459431e-05,
"loss": 2.771,
"step": 15870
},
{
"epoch": 0.921407641649018,
"grad_norm": 0.09808894246816635,
"learning_rate": 1.0083656866360646e-05,
"loss": 2.7706,
"step": 15880
},
{
"epoch": 0.9219878731613914,
"grad_norm": 0.09838584810495377,
"learning_rate": 9.936193889608012e-06,
"loss": 2.7656,
"step": 15890
},
{
"epoch": 0.9225681046737648,
"grad_norm": 0.10016359388828278,
"learning_rate": 9.789798958175832e-06,
"loss": 2.7749,
"step": 15900
},
{
"epoch": 0.9231483361861382,
"grad_norm": 0.09670013934373856,
"learning_rate": 9.64447261110548e-06,
"loss": 2.7693,
"step": 15910
},
{
"epoch": 0.9237285676985117,
"grad_norm": 0.09639087319374084,
"learning_rate": 9.500215383503784e-06,
"loss": 2.7675,
"step": 15920
},
{
"epoch": 0.9243087992108852,
"grad_norm": 0.09851641952991486,
"learning_rate": 9.357027806541084e-06,
"loss": 2.7748,
"step": 15930
},
{
"epoch": 0.9248890307232586,
"grad_norm": 0.10145829617977142,
"learning_rate": 9.214910407448871e-06,
"loss": 2.7841,
"step": 15940
},
{
"epoch": 0.925469262235632,
"grad_norm": 0.09769120067358017,
"learning_rate": 9.073863709518426e-06,
"loss": 2.7703,
"step": 15950
},
{
"epoch": 0.9260494937480055,
"grad_norm": 0.09475893527269363,
"learning_rate": 8.933888232098408e-06,
"loss": 2.7703,
"step": 15960
},
{
"epoch": 0.9266297252603789,
"grad_norm": 0.09624000638723373,
"learning_rate": 8.794984490593171e-06,
"loss": 2.7753,
"step": 15970
},
{
"epoch": 0.9272099567727523,
"grad_norm": 0.09569297730922699,
"learning_rate": 8.657152996460958e-06,
"loss": 2.7635,
"step": 15980
},
{
"epoch": 0.9277901882851257,
"grad_norm": 0.10107609629631042,
"learning_rate": 8.520394257211605e-06,
"loss": 2.7714,
"step": 15990
},
{
"epoch": 0.9283704197974992,
"grad_norm": 0.09753672778606415,
"learning_rate": 8.384708776405236e-06,
"loss": 2.7706,
"step": 16000
},
{
"epoch": 0.9283704197974992,
"eval_loss": 2.7369606494903564,
"eval_runtime": 3.2559,
"eval_samples_per_second": 1329.896,
"eval_steps_per_second": 2.764,
"step": 16000
},
{
"epoch": 0.9289506513098726,
"grad_norm": 0.09548928588628769,
"learning_rate": 8.25009705364994e-06,
"loss": 2.7754,
"step": 16010
},
{
"epoch": 0.929530882822246,
"grad_norm": 0.09287203848361969,
"learning_rate": 8.116559584600201e-06,
"loss": 2.7777,
"step": 16020
},
{
"epoch": 0.9301111143346195,
"grad_norm": 0.0972280502319336,
"learning_rate": 7.984096860955036e-06,
"loss": 2.781,
"step": 16030
},
{
"epoch": 0.930691345846993,
"grad_norm": 0.09617298096418381,
"learning_rate": 7.852709370455922e-06,
"loss": 2.7692,
"step": 16040
},
{
"epoch": 0.9312715773593664,
"grad_norm": 0.09682459384202957,
"learning_rate": 7.72239759688551e-06,
"loss": 2.7742,
"step": 16050
},
{
"epoch": 0.9318518088717398,
"grad_norm": 0.09648177772760391,
"learning_rate": 7.593162020065313e-06,
"loss": 2.7783,
"step": 16060
},
{
"epoch": 0.9324320403841133,
"grad_norm": 0.09511367976665497,
"learning_rate": 7.4650031158542845e-06,
"loss": 2.7706,
"step": 16070
},
{
"epoch": 0.9330122718964867,
"grad_norm": 0.09434488415718079,
"learning_rate": 7.337921356146981e-06,
"loss": 2.7694,
"step": 16080
},
{
"epoch": 0.9335925034088601,
"grad_norm": 0.09737717360258102,
"learning_rate": 7.211917208871665e-06,
"loss": 2.7674,
"step": 16090
},
{
"epoch": 0.9341727349212335,
"grad_norm": 0.09725455194711685,
"learning_rate": 7.086991137988906e-06,
"loss": 2.7639,
"step": 16100
},
{
"epoch": 0.9347529664336071,
"grad_norm": 0.10136746615171432,
"learning_rate": 6.963143603489518e-06,
"loss": 2.7677,
"step": 16110
},
{
"epoch": 0.9353331979459805,
"grad_norm": 0.09756675362586975,
"learning_rate": 6.840375061393122e-06,
"loss": 2.765,
"step": 16120
},
{
"epoch": 0.9359134294583539,
"grad_norm": 0.09939330816268921,
"learning_rate": 6.718685963746318e-06,
"loss": 2.7751,
"step": 16130
},
{
"epoch": 0.9364936609707273,
"grad_norm": 0.09836092591285706,
"learning_rate": 6.598076758621118e-06,
"loss": 2.7828,
"step": 16140
},
{
"epoch": 0.9370738924831008,
"grad_norm": 0.09677501767873764,
"learning_rate": 6.4785478901133506e-06,
"loss": 2.769,
"step": 16150
},
{
"epoch": 0.9376541239954742,
"grad_norm": 0.097322478890419,
"learning_rate": 6.360099798340656e-06,
"loss": 2.7656,
"step": 16160
},
{
"epoch": 0.9382343555078476,
"grad_norm": 0.09472298622131348,
"learning_rate": 6.242732919441462e-06,
"loss": 2.7737,
"step": 16170
},
{
"epoch": 0.938814587020221,
"grad_norm": 0.09517394751310349,
"learning_rate": 6.126447685572844e-06,
"loss": 2.7807,
"step": 16180
},
{
"epoch": 0.9393948185325945,
"grad_norm": 0.09591302275657654,
"learning_rate": 6.011244524909198e-06,
"loss": 2.7774,
"step": 16190
},
{
"epoch": 0.939975050044968,
"grad_norm": 0.09797896444797516,
"learning_rate": 5.8971238616407405e-06,
"loss": 2.7637,
"step": 16200
},
{
"epoch": 0.9405552815573414,
"grad_norm": 0.09744720160961151,
"learning_rate": 5.7840861159715425e-06,
"loss": 2.7773,
"step": 16210
},
{
"epoch": 0.9411355130697148,
"grad_norm": 0.09814444929361343,
"learning_rate": 5.672131704118565e-06,
"loss": 2.7741,
"step": 16220
},
{
"epoch": 0.9417157445820883,
"grad_norm": 0.09604529291391373,
"learning_rate": 5.561261038309628e-06,
"loss": 2.7727,
"step": 16230
},
{
"epoch": 0.9422959760944617,
"grad_norm": 0.09737398475408554,
"learning_rate": 5.4514745267821404e-06,
"loss": 2.7737,
"step": 16240
},
{
"epoch": 0.9428762076068351,
"grad_norm": 0.09697815030813217,
"learning_rate": 5.342772573781507e-06,
"loss": 2.7638,
"step": 16250
},
{
"epoch": 0.9434564391192085,
"grad_norm": 0.09917178004980087,
"learning_rate": 5.235155579559725e-06,
"loss": 2.7709,
"step": 16260
},
{
"epoch": 0.944036670631582,
"grad_norm": 0.096290223300457,
"learning_rate": 5.128623940373888e-06,
"loss": 2.7674,
"step": 16270
},
{
"epoch": 0.9446169021439554,
"grad_norm": 0.09504272043704987,
"learning_rate": 5.023178048484589e-06,
"loss": 2.7694,
"step": 16280
},
{
"epoch": 0.9451971336563288,
"grad_norm": 0.09743209183216095,
"learning_rate": 4.91881829215468e-06,
"loss": 2.781,
"step": 16290
},
{
"epoch": 0.9457773651687024,
"grad_norm": 0.09843679517507553,
"learning_rate": 4.815545055647718e-06,
"loss": 2.776,
"step": 16300
},
{
"epoch": 0.9463575966810758,
"grad_norm": 0.0955999493598938,
"learning_rate": 4.713358719226523e-06,
"loss": 2.7789,
"step": 16310
},
{
"epoch": 0.9469378281934492,
"grad_norm": 0.09576351940631866,
"learning_rate": 4.612259659151984e-06,
"loss": 2.7716,
"step": 16320
},
{
"epoch": 0.9475180597058226,
"grad_norm": 0.09730935841798782,
"learning_rate": 4.512248247681394e-06,
"loss": 2.7802,
"step": 16330
},
{
"epoch": 0.9480982912181961,
"grad_norm": 0.09646177291870117,
"learning_rate": 4.413324853067213e-06,
"loss": 2.7765,
"step": 16340
},
{
"epoch": 0.9486785227305695,
"grad_norm": 0.09553349018096924,
"learning_rate": 4.3154898395557744e-06,
"loss": 2.778,
"step": 16350
},
{
"epoch": 0.9492587542429429,
"grad_norm": 0.09604230523109436,
"learning_rate": 4.218743567385852e-06,
"loss": 2.78,
"step": 16360
},
{
"epoch": 0.9498389857553163,
"grad_norm": 0.09518173336982727,
"learning_rate": 4.123086392787289e-06,
"loss": 2.7695,
"step": 16370
},
{
"epoch": 0.9504192172676899,
"grad_norm": 0.09625556319952011,
"learning_rate": 4.0285186679799406e-06,
"loss": 2.7694,
"step": 16380
},
{
"epoch": 0.9509994487800633,
"grad_norm": 0.09755248576402664,
"learning_rate": 3.935040741171969e-06,
"loss": 2.7625,
"step": 16390
},
{
"epoch": 0.9515796802924367,
"grad_norm": 0.09465952962636948,
"learning_rate": 3.842652956558945e-06,
"loss": 2.7658,
"step": 16400
},
{
"epoch": 0.9521599118048101,
"grad_norm": 0.0960998460650444,
"learning_rate": 3.7513556543223855e-06,
"loss": 2.7846,
"step": 16410
},
{
"epoch": 0.9527401433171836,
"grad_norm": 0.09892145544290543,
"learning_rate": 3.6611491706284856e-06,
"loss": 2.7708,
"step": 16420
},
{
"epoch": 0.953320374829557,
"grad_norm": 0.09714221954345703,
"learning_rate": 3.572033837626953e-06,
"loss": 2.7874,
"step": 16430
},
{
"epoch": 0.9539006063419304,
"grad_norm": 0.09727420657873154,
"learning_rate": 3.484009983449809e-06,
"loss": 2.7834,
"step": 16440
},
{
"epoch": 0.9544808378543038,
"grad_norm": 0.09665530920028687,
"learning_rate": 3.397077932210124e-06,
"loss": 2.7726,
"step": 16450
},
{
"epoch": 0.9550610693666773,
"grad_norm": 0.09558922797441483,
"learning_rate": 3.3112380040008156e-06,
"loss": 2.7723,
"step": 16460
},
{
"epoch": 0.9556413008790507,
"grad_norm": 0.0972527414560318,
"learning_rate": 3.2264905148934208e-06,
"loss": 2.772,
"step": 16470
},
{
"epoch": 0.9562215323914242,
"grad_norm": 0.09882599860429764,
"learning_rate": 3.142835776937158e-06,
"loss": 2.7685,
"step": 16480
},
{
"epoch": 0.9568017639037976,
"grad_norm": 0.09505190700292587,
"learning_rate": 3.060274098157467e-06,
"loss": 2.7694,
"step": 16490
},
{
"epoch": 0.9573819954161711,
"grad_norm": 0.09600254893302917,
"learning_rate": 2.9788057825551714e-06,
"loss": 2.7778,
"step": 16500
},
{
"epoch": 0.9579622269285445,
"grad_norm": 0.09696151316165924,
"learning_rate": 2.8984311301050835e-06,
"loss": 2.784,
"step": 16510
},
{
"epoch": 0.9585424584409179,
"grad_norm": 0.09621264785528183,
"learning_rate": 2.819150436755135e-06,
"loss": 2.7668,
"step": 16520
},
{
"epoch": 0.9591226899532914,
"grad_norm": 0.09673577547073364,
"learning_rate": 2.7409639944251162e-06,
"loss": 2.774,
"step": 16530
},
{
"epoch": 0.9597029214656648,
"grad_norm": 0.09513070434331894,
"learning_rate": 2.6638720910056697e-06,
"loss": 2.7783,
"step": 16540
},
{
"epoch": 0.9602831529780382,
"grad_norm": 0.09311112761497498,
"learning_rate": 2.587875010357332e-06,
"loss": 2.7665,
"step": 16550
},
{
"epoch": 0.9608633844904116,
"grad_norm": 0.09406144171953201,
"learning_rate": 2.5129730323092622e-06,
"loss": 2.7671,
"step": 16560
},
{
"epoch": 0.9614436160027852,
"grad_norm": 0.09770730882883072,
"learning_rate": 2.439166432658446e-06,
"loss": 2.7673,
"step": 16570
},
{
"epoch": 0.9620238475151586,
"grad_norm": 0.09938254207372665,
"learning_rate": 2.366455483168428e-06,
"loss": 2.7637,
"step": 16580
},
{
"epoch": 0.962604079027532,
"grad_norm": 0.09504234790802002,
"learning_rate": 2.2948404515686136e-06,
"loss": 2.7708,
"step": 16590
},
{
"epoch": 0.9631843105399054,
"grad_norm": 0.09619156271219254,
"learning_rate": 2.2243216015530362e-06,
"loss": 2.7716,
"step": 16600
},
{
"epoch": 0.9637645420522789,
"grad_norm": 0.09520803391933441,
"learning_rate": 2.1548991927794244e-06,
"loss": 2.771,
"step": 16610
},
{
"epoch": 0.9643447735646523,
"grad_norm": 0.09521950781345367,
"learning_rate": 2.0865734808684697e-06,
"loss": 2.7679,
"step": 16620
},
{
"epoch": 0.9649250050770257,
"grad_norm": 0.09744451195001602,
"learning_rate": 2.0193447174025268e-06,
"loss": 2.7715,
"step": 16630
},
{
"epoch": 0.9655052365893991,
"grad_norm": 0.09531662613153458,
"learning_rate": 1.953213149924948e-06,
"loss": 2.7824,
"step": 16640
},
{
"epoch": 0.9660854681017726,
"grad_norm": 0.09525689482688904,
"learning_rate": 1.8881790219391512e-06,
"loss": 2.7694,
"step": 16650
},
{
"epoch": 0.9666656996141461,
"grad_norm": 0.09457177668809891,
"learning_rate": 1.8242425729075527e-06,
"loss": 2.7588,
"step": 16660
},
{
"epoch": 0.9672459311265195,
"grad_norm": 0.09685463458299637,
"learning_rate": 1.7614040382508687e-06,
"loss": 2.7714,
"step": 16670
},
{
"epoch": 0.9678261626388929,
"grad_norm": 0.09774652868509293,
"learning_rate": 1.6996636493471494e-06,
"loss": 2.7683,
"step": 16680
},
{
"epoch": 0.9684063941512664,
"grad_norm": 0.09525836259126663,
"learning_rate": 1.6390216335309792e-06,
"loss": 2.77,
"step": 16690
},
{
"epoch": 0.9689866256636398,
"grad_norm": 0.09421420842409134,
"learning_rate": 1.5794782140926775e-06,
"loss": 2.7723,
"step": 16700
},
{
"epoch": 0.9695668571760132,
"grad_norm": 0.09693361073732376,
"learning_rate": 1.5210336102772668e-06,
"loss": 2.772,
"step": 16710
},
{
"epoch": 0.9701470886883866,
"grad_norm": 0.09740012139081955,
"learning_rate": 1.463688037283972e-06,
"loss": 2.7673,
"step": 16720
},
{
"epoch": 0.9707273202007601,
"grad_norm": 0.09596629440784454,
"learning_rate": 1.4074417062651221e-06,
"loss": 2.7878,
"step": 16730
},
{
"epoch": 0.9713075517131335,
"grad_norm": 0.09561031311750412,
"learning_rate": 1.3522948243256503e-06,
"loss": 2.7728,
"step": 16740
},
{
"epoch": 0.971887783225507,
"grad_norm": 0.09793524444103241,
"learning_rate": 1.2982475945221615e-06,
"loss": 2.7718,
"step": 16750
},
{
"epoch": 0.9724680147378804,
"grad_norm": 0.09407012164592743,
"learning_rate": 1.245300215862166e-06,
"loss": 2.7797,
"step": 16760
},
{
"epoch": 0.9730482462502539,
"grad_norm": 0.09444325417280197,
"learning_rate": 1.1934528833035139e-06,
"loss": 2.7725,
"step": 16770
},
{
"epoch": 0.9736284777626273,
"grad_norm": 0.09787797182798386,
"learning_rate": 1.1427057877534951e-06,
"loss": 2.7691,
"step": 16780
},
{
"epoch": 0.9742087092750007,
"grad_norm": 0.09456036239862442,
"learning_rate": 1.09305911606824e-06,
"loss": 2.7766,
"step": 16790
},
{
"epoch": 0.9747889407873742,
"grad_norm": 0.095250204205513,
"learning_rate": 1.044513051051954e-06,
"loss": 2.7701,
"step": 16800
},
{
"epoch": 0.9753691722997476,
"grad_norm": 0.09521818906068802,
"learning_rate": 9.970677714563835e-07,
"loss": 2.7734,
"step": 16810
},
{
"epoch": 0.975949403812121,
"grad_norm": 0.09462135285139084,
"learning_rate": 9.507234519800178e-07,
"loss": 2.7705,
"step": 16820
},
{
"epoch": 0.9765296353244944,
"grad_norm": 0.09560775011777878,
"learning_rate": 9.054802632674551e-07,
"loss": 2.7691,
"step": 16830
},
{
"epoch": 0.977109866836868,
"grad_norm": 0.09410873800516129,
"learning_rate": 8.61338371908904e-07,
"loss": 2.7787,
"step": 16840
},
{
"epoch": 0.9776900983492414,
"grad_norm": 0.09606259316205978,
"learning_rate": 8.18297940439383e-07,
"loss": 2.7766,
"step": 16850
},
{
"epoch": 0.9782703298616148,
"grad_norm": 0.09549134224653244,
"learning_rate": 7.763591273382885e-07,
"loss": 2.7701,
"step": 16860
},
{
"epoch": 0.9788505613739882,
"grad_norm": 0.09225918352603912,
"learning_rate": 7.355220870287615e-07,
"loss": 2.7635,
"step": 16870
},
{
"epoch": 0.9794307928863617,
"grad_norm": 0.09305543452501297,
"learning_rate": 6.95786969876988e-07,
"loss": 2.7659,
"step": 16880
},
{
"epoch": 0.9800110243987351,
"grad_norm": 0.09393244236707687,
"learning_rate": 6.571539221918997e-07,
"loss": 2.7743,
"step": 16890
},
{
"epoch": 0.9805912559111085,
"grad_norm": 0.09278815984725952,
"learning_rate": 6.196230862244078e-07,
"loss": 2.78,
"step": 16900
},
{
"epoch": 0.9811714874234819,
"grad_norm": 0.09347451478242874,
"learning_rate": 5.831946001669697e-07,
"loss": 2.7747,
"step": 16910
},
{
"epoch": 0.9817517189358554,
"grad_norm": 0.09540887176990509,
"learning_rate": 5.478685981530894e-07,
"loss": 2.7758,
"step": 16920
},
{
"epoch": 0.9823319504482289,
"grad_norm": 0.09621070325374603,
"learning_rate": 5.136452102567856e-07,
"loss": 2.7713,
"step": 16930
},
{
"epoch": 0.9829121819606023,
"grad_norm": 0.09409264475107193,
"learning_rate": 4.805245624922238e-07,
"loss": 2.7778,
"step": 16940
},
{
"epoch": 0.9834924134729757,
"grad_norm": 0.09619985520839691,
"learning_rate": 4.4850677681301795e-07,
"loss": 2.7701,
"step": 16950
},
{
"epoch": 0.9840726449853492,
"grad_norm": 0.09401355683803558,
"learning_rate": 4.1759197111206344e-07,
"loss": 2.7689,
"step": 16960
},
{
"epoch": 0.9846528764977226,
"grad_norm": 0.09698129445314407,
"learning_rate": 3.877802592209045e-07,
"loss": 2.7703,
"step": 16970
},
{
"epoch": 0.985233108010096,
"grad_norm": 0.09333529323339462,
"learning_rate": 3.590717509093677e-07,
"loss": 2.7784,
"step": 16980
},
{
"epoch": 0.9858133395224694,
"grad_norm": 0.09353555738925934,
"learning_rate": 3.3146655188519557e-07,
"loss": 2.7687,
"step": 16990
},
{
"epoch": 0.9863935710348429,
"grad_norm": 0.09438835084438324,
"learning_rate": 3.0496476379364697e-07,
"loss": 2.7665,
"step": 17000
},
{
"epoch": 0.9863935710348429,
"eval_loss": 2.735684633255005,
"eval_runtime": 3.2561,
"eval_samples_per_second": 1329.798,
"eval_steps_per_second": 2.764,
"step": 17000
},
{
"epoch": 0.9869738025472163,
"grad_norm": 0.09504197537899017,
"learning_rate": 2.7956648421703087e-07,
"loss": 2.7762,
"step": 17010
},
{
"epoch": 0.9875540340595897,
"grad_norm": 0.09602217376232147,
"learning_rate": 2.5527180667453963e-07,
"loss": 2.7673,
"step": 17020
},
{
"epoch": 0.9881342655719633,
"grad_norm": 0.09483738243579865,
"learning_rate": 2.3208082062168288e-07,
"loss": 2.7705,
"step": 17030
},
{
"epoch": 0.9887144970843367,
"grad_norm": 0.09395676851272583,
"learning_rate": 2.0999361145008775e-07,
"loss": 2.7692,
"step": 17040
},
{
"epoch": 0.9892947285967101,
"grad_norm": 0.09432484954595566,
"learning_rate": 1.8901026048719902e-07,
"loss": 2.7707,
"step": 17050
},
{
"epoch": 0.9898749601090835,
"grad_norm": 0.09382540732622147,
"learning_rate": 1.6913084499587948e-07,
"loss": 2.7788,
"step": 17060
},
{
"epoch": 0.990455191621457,
"grad_norm": 0.09619873762130737,
"learning_rate": 1.5035543817427663e-07,
"loss": 2.7604,
"step": 17070
},
{
"epoch": 0.9910354231338304,
"grad_norm": 0.09365525841712952,
"learning_rate": 1.3268410915532323e-07,
"loss": 2.7785,
"step": 17080
},
{
"epoch": 0.9916156546462038,
"grad_norm": 0.09718578308820724,
"learning_rate": 1.1611692300680376e-07,
"loss": 2.7745,
"step": 17090
},
{
"epoch": 0.9921958861585772,
"grad_norm": 0.0956762507557869,
"learning_rate": 1.0065394073075494e-07,
"loss": 2.7813,
"step": 17100
},
{
"epoch": 0.9927761176709508,
"grad_norm": 0.09347262978553772,
"learning_rate": 8.629521926353244e-08,
"loss": 2.7714,
"step": 17110
},
{
"epoch": 0.9933563491833242,
"grad_norm": 0.09415694326162338,
"learning_rate": 7.304081147544439e-08,
"loss": 2.7837,
"step": 17120
},
{
"epoch": 0.9939365806956976,
"grad_norm": 0.09390881657600403,
"learning_rate": 6.089076617058486e-08,
"loss": 2.7725,
"step": 17130
},
{
"epoch": 0.994516812208071,
"grad_norm": 0.09363935142755508,
"learning_rate": 4.984512808673402e-08,
"loss": 2.776,
"step": 17140
},
{
"epoch": 0.9950970437204445,
"grad_norm": 0.0957217812538147,
"learning_rate": 3.9903937895091606e-08,
"loss": 2.7731,
"step": 17150
},
{
"epoch": 0.9956772752328179,
"grad_norm": 0.09717927128076553,
"learning_rate": 3.1067232200110426e-08,
"loss": 2.7703,
"step": 17160
},
{
"epoch": 0.9962575067451913,
"grad_norm": 0.09413953870534897,
"learning_rate": 2.333504353952964e-08,
"loss": 2.7733,
"step": 17170
},
{
"epoch": 0.9968377382575647,
"grad_norm": 0.09774868190288544,
"learning_rate": 1.670740038400842e-08,
"loss": 2.7658,
"step": 17180
},
{
"epoch": 0.9974179697699382,
"grad_norm": 0.09658750146627426,
"learning_rate": 1.1184327137292448e-08,
"loss": 2.7734,
"step": 17190
},
{
"epoch": 0.9979982012823116,
"grad_norm": 0.0932522714138031,
"learning_rate": 6.765844135847576e-09,
"loss": 2.7708,
"step": 17200
},
{
"epoch": 0.9985784327946851,
"grad_norm": 0.09543392807245255,
"learning_rate": 3.4519676490596393e-09,
"loss": 2.7746,
"step": 17210
},
{
"epoch": 0.9991586643070585,
"grad_norm": 0.09391433745622635,
"learning_rate": 1.2427098789347111e-09,
"loss": 2.7707,
"step": 17220
},
{
"epoch": 0.999738895819432,
"grad_norm": 0.0975637212395668,
"learning_rate": 1.3807896016571064e-10,
"loss": 2.77,
"step": 17230
},
{
"epoch": 0.9999709884243814,
"step": 17234,
"total_flos": 4.402536853133695e+19,
"train_loss": 3.082940493684724,
"train_runtime": 20985.9807,
"train_samples_per_second": 420.462,
"train_steps_per_second": 0.821
}
],
"logging_steps": 10,
"max_steps": 17234,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.402536853133695e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}