{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6185,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016168148746968473,
"grad_norm": 3.2100687490225734,
"learning_rate": 1.6155088852988694e-08,
"loss": 1.6934,
"step": 1
},
{
"epoch": 0.0008084074373484236,
"grad_norm": 3.4649028983783934,
"learning_rate": 8.077544426494346e-08,
"loss": 1.7213,
"step": 5
},
{
"epoch": 0.0016168148746968471,
"grad_norm": 3.221507146420423,
"learning_rate": 1.6155088852988693e-07,
"loss": 1.6956,
"step": 10
},
{
"epoch": 0.002425222312045271,
"grad_norm": 3.3319921152603014,
"learning_rate": 2.4232633279483037e-07,
"loss": 1.7147,
"step": 15
},
{
"epoch": 0.0032336297493936943,
"grad_norm": 3.2098440129054313,
"learning_rate": 3.2310177705977386e-07,
"loss": 1.703,
"step": 20
},
{
"epoch": 0.004042037186742118,
"grad_norm": 3.3447921032483268,
"learning_rate": 4.038772213247173e-07,
"loss": 1.7119,
"step": 25
},
{
"epoch": 0.004850444624090542,
"grad_norm": 3.142745813218036,
"learning_rate": 4.846526655896607e-07,
"loss": 1.7269,
"step": 30
},
{
"epoch": 0.005658852061438965,
"grad_norm": 3.150831163401336,
"learning_rate": 5.654281098546043e-07,
"loss": 1.7258,
"step": 35
},
{
"epoch": 0.0064672594987873885,
"grad_norm": 2.8932615743223953,
"learning_rate": 6.462035541195477e-07,
"loss": 1.6689,
"step": 40
},
{
"epoch": 0.007275666936135812,
"grad_norm": 2.8110189620165857,
"learning_rate": 7.269789983844912e-07,
"loss": 1.6853,
"step": 45
},
{
"epoch": 0.008084074373484237,
"grad_norm": 2.7038882979842787,
"learning_rate": 8.077544426494346e-07,
"loss": 1.6557,
"step": 50
},
{
"epoch": 0.00889248181083266,
"grad_norm": 2.483453858975139,
"learning_rate": 8.885298869143781e-07,
"loss": 1.6717,
"step": 55
},
{
"epoch": 0.009700889248181084,
"grad_norm": 2.52510825170289,
"learning_rate": 9.693053311793215e-07,
"loss": 1.6649,
"step": 60
},
{
"epoch": 0.010509296685529508,
"grad_norm": 2.3691095179729538,
"learning_rate": 1.0500807754442651e-06,
"loss": 1.6265,
"step": 65
},
{
"epoch": 0.01131770412287793,
"grad_norm": 1.9488878151454319,
"learning_rate": 1.1308562197092086e-06,
"loss": 1.6181,
"step": 70
},
{
"epoch": 0.012126111560226353,
"grad_norm": 2.2451551696944954,
"learning_rate": 1.211631663974152e-06,
"loss": 1.5957,
"step": 75
},
{
"epoch": 0.012934518997574777,
"grad_norm": 1.8628223064786595,
"learning_rate": 1.2924071082390954e-06,
"loss": 1.5846,
"step": 80
},
{
"epoch": 0.0137429264349232,
"grad_norm": 1.718641914773257,
"learning_rate": 1.3731825525040387e-06,
"loss": 1.5563,
"step": 85
},
{
"epoch": 0.014551333872271624,
"grad_norm": 1.5468505378881632,
"learning_rate": 1.4539579967689823e-06,
"loss": 1.5123,
"step": 90
},
{
"epoch": 0.015359741309620048,
"grad_norm": 1.3814602687012587,
"learning_rate": 1.5347334410339258e-06,
"loss": 1.5073,
"step": 95
},
{
"epoch": 0.016168148746968473,
"grad_norm": 1.3287767546309797,
"learning_rate": 1.6155088852988692e-06,
"loss": 1.4911,
"step": 100
},
{
"epoch": 0.016976556184316895,
"grad_norm": 1.2392531890824057,
"learning_rate": 1.6962843295638126e-06,
"loss": 1.466,
"step": 105
},
{
"epoch": 0.01778496362166532,
"grad_norm": 1.1786674587956345,
"learning_rate": 1.7770597738287563e-06,
"loss": 1.4641,
"step": 110
},
{
"epoch": 0.018593371059013743,
"grad_norm": 1.131646019707904,
"learning_rate": 1.8578352180936995e-06,
"loss": 1.4205,
"step": 115
},
{
"epoch": 0.019401778496362168,
"grad_norm": 1.1355478936146342,
"learning_rate": 1.938610662358643e-06,
"loss": 1.4253,
"step": 120
},
{
"epoch": 0.02021018593371059,
"grad_norm": 1.0935126326356333,
"learning_rate": 2.0193861066235864e-06,
"loss": 1.4107,
"step": 125
},
{
"epoch": 0.021018593371059015,
"grad_norm": 1.0807157318374936,
"learning_rate": 2.1001615508885302e-06,
"loss": 1.3773,
"step": 130
},
{
"epoch": 0.021827000808407437,
"grad_norm": 1.0813532465125704,
"learning_rate": 2.1809369951534733e-06,
"loss": 1.3614,
"step": 135
},
{
"epoch": 0.02263540824575586,
"grad_norm": 1.0721664075597053,
"learning_rate": 2.261712439418417e-06,
"loss": 1.3454,
"step": 140
},
{
"epoch": 0.023443815683104285,
"grad_norm": 1.1706087580447129,
"learning_rate": 2.34248788368336e-06,
"loss": 1.3002,
"step": 145
},
{
"epoch": 0.024252223120452707,
"grad_norm": 1.2130738107133745,
"learning_rate": 2.423263327948304e-06,
"loss": 1.256,
"step": 150
},
{
"epoch": 0.025060630557801132,
"grad_norm": 1.349088393672931,
"learning_rate": 2.5040387722132474e-06,
"loss": 1.2689,
"step": 155
},
{
"epoch": 0.025869037995149554,
"grad_norm": 1.507002519970385,
"learning_rate": 2.584814216478191e-06,
"loss": 1.1915,
"step": 160
},
{
"epoch": 0.02667744543249798,
"grad_norm": 1.8229173794208704,
"learning_rate": 2.6655896607431343e-06,
"loss": 1.1551,
"step": 165
},
{
"epoch": 0.0274858528698464,
"grad_norm": 1.7143482970110884,
"learning_rate": 2.7463651050080773e-06,
"loss": 1.111,
"step": 170
},
{
"epoch": 0.028294260307194827,
"grad_norm": 1.6459358408927556,
"learning_rate": 2.827140549273021e-06,
"loss": 1.0532,
"step": 175
},
{
"epoch": 0.02910266774454325,
"grad_norm": 1.4413642537677436,
"learning_rate": 2.9079159935379646e-06,
"loss": 1.0274,
"step": 180
},
{
"epoch": 0.029911075181891674,
"grad_norm": 1.1419491679189164,
"learning_rate": 2.988691437802908e-06,
"loss": 1.0013,
"step": 185
},
{
"epoch": 0.030719482619240096,
"grad_norm": 0.9309353953036712,
"learning_rate": 3.0694668820678515e-06,
"loss": 0.9753,
"step": 190
},
{
"epoch": 0.03152789005658852,
"grad_norm": 0.8025838589275166,
"learning_rate": 3.1502423263327954e-06,
"loss": 0.9696,
"step": 195
},
{
"epoch": 0.03233629749393695,
"grad_norm": 0.7733858683355683,
"learning_rate": 3.2310177705977384e-06,
"loss": 0.9494,
"step": 200
},
{
"epoch": 0.033144704931285365,
"grad_norm": 0.7464642252029693,
"learning_rate": 3.311793214862682e-06,
"loss": 0.9462,
"step": 205
},
{
"epoch": 0.03395311236863379,
"grad_norm": 0.7055514389315549,
"learning_rate": 3.3925686591276253e-06,
"loss": 0.927,
"step": 210
},
{
"epoch": 0.034761519805982216,
"grad_norm": 0.7104304189422785,
"learning_rate": 3.473344103392569e-06,
"loss": 0.924,
"step": 215
},
{
"epoch": 0.03556992724333064,
"grad_norm": 0.6574144588785804,
"learning_rate": 3.5541195476575126e-06,
"loss": 0.9327,
"step": 220
},
{
"epoch": 0.03637833468067906,
"grad_norm": 0.7173567285454316,
"learning_rate": 3.6348949919224556e-06,
"loss": 0.9049,
"step": 225
},
{
"epoch": 0.037186742118027485,
"grad_norm": 0.6476430881143054,
"learning_rate": 3.715670436187399e-06,
"loss": 0.8999,
"step": 230
},
{
"epoch": 0.03799514955537591,
"grad_norm": 0.6762537582692184,
"learning_rate": 3.796445880452343e-06,
"loss": 0.8911,
"step": 235
},
{
"epoch": 0.038803556992724336,
"grad_norm": 0.5916230517794187,
"learning_rate": 3.877221324717286e-06,
"loss": 0.9092,
"step": 240
},
{
"epoch": 0.039611964430072755,
"grad_norm": 0.6767171855576188,
"learning_rate": 3.95799676898223e-06,
"loss": 0.9173,
"step": 245
},
{
"epoch": 0.04042037186742118,
"grad_norm": 0.6057387003801464,
"learning_rate": 4.038772213247173e-06,
"loss": 0.9092,
"step": 250
},
{
"epoch": 0.041228779304769606,
"grad_norm": 0.6860012772175927,
"learning_rate": 4.119547657512117e-06,
"loss": 0.9265,
"step": 255
},
{
"epoch": 0.04203718674211803,
"grad_norm": 0.6193353966875188,
"learning_rate": 4.2003231017770605e-06,
"loss": 0.9054,
"step": 260
},
{
"epoch": 0.04284559417946645,
"grad_norm": 0.7553774828322988,
"learning_rate": 4.2810985460420035e-06,
"loss": 0.9065,
"step": 265
},
{
"epoch": 0.043654001616814875,
"grad_norm": 0.6613818499045624,
"learning_rate": 4.3618739903069465e-06,
"loss": 0.918,
"step": 270
},
{
"epoch": 0.0444624090541633,
"grad_norm": 0.6620831207682815,
"learning_rate": 4.44264943457189e-06,
"loss": 0.9037,
"step": 275
},
{
"epoch": 0.04527081649151172,
"grad_norm": 0.7222236926277061,
"learning_rate": 4.523424878836834e-06,
"loss": 0.8979,
"step": 280
},
{
"epoch": 0.046079223928860144,
"grad_norm": 0.6242092234384768,
"learning_rate": 4.604200323101777e-06,
"loss": 0.8859,
"step": 285
},
{
"epoch": 0.04688763136620857,
"grad_norm": 0.6308938354658095,
"learning_rate": 4.68497576736672e-06,
"loss": 0.8841,
"step": 290
},
{
"epoch": 0.047696038803556995,
"grad_norm": 0.60859508951431,
"learning_rate": 4.765751211631664e-06,
"loss": 0.8838,
"step": 295
},
{
"epoch": 0.04850444624090541,
"grad_norm": 0.6652854237730623,
"learning_rate": 4.846526655896608e-06,
"loss": 0.9062,
"step": 300
},
{
"epoch": 0.04931285367825384,
"grad_norm": 0.642163853982114,
"learning_rate": 4.927302100161551e-06,
"loss": 0.9076,
"step": 305
},
{
"epoch": 0.050121261115602264,
"grad_norm": 0.6957211657326403,
"learning_rate": 5.008077544426495e-06,
"loss": 0.9021,
"step": 310
},
{
"epoch": 0.05092966855295069,
"grad_norm": 0.6995520981355653,
"learning_rate": 5.088852988691439e-06,
"loss": 0.8737,
"step": 315
},
{
"epoch": 0.05173807599029911,
"grad_norm": 0.7059034170974082,
"learning_rate": 5.169628432956382e-06,
"loss": 0.8858,
"step": 320
},
{
"epoch": 0.05254648342764753,
"grad_norm": 0.835327047755355,
"learning_rate": 5.250403877221325e-06,
"loss": 0.8537,
"step": 325
},
{
"epoch": 0.05335489086499596,
"grad_norm": 0.6934448269317276,
"learning_rate": 5.331179321486269e-06,
"loss": 0.8777,
"step": 330
},
{
"epoch": 0.054163298302344384,
"grad_norm": 0.6740382005002135,
"learning_rate": 5.411954765751212e-06,
"loss": 0.8776,
"step": 335
},
{
"epoch": 0.0549717057396928,
"grad_norm": 0.6721017017683236,
"learning_rate": 5.492730210016155e-06,
"loss": 0.8596,
"step": 340
},
{
"epoch": 0.05578011317704123,
"grad_norm": 0.6684833482788531,
"learning_rate": 5.573505654281099e-06,
"loss": 0.8802,
"step": 345
},
{
"epoch": 0.056588520614389654,
"grad_norm": 0.6830751371635061,
"learning_rate": 5.654281098546042e-06,
"loss": 0.8847,
"step": 350
},
{
"epoch": 0.05739692805173808,
"grad_norm": 0.6376927762203839,
"learning_rate": 5.735056542810986e-06,
"loss": 0.8952,
"step": 355
},
{
"epoch": 0.0582053354890865,
"grad_norm": 0.7491586670802004,
"learning_rate": 5.815831987075929e-06,
"loss": 0.8696,
"step": 360
},
{
"epoch": 0.05901374292643492,
"grad_norm": 0.7899182711032194,
"learning_rate": 5.896607431340873e-06,
"loss": 0.8708,
"step": 365
},
{
"epoch": 0.05982215036378335,
"grad_norm": 0.6729809655437429,
"learning_rate": 5.977382875605816e-06,
"loss": 0.8788,
"step": 370
},
{
"epoch": 0.060630557801131774,
"grad_norm": 0.7074821523168202,
"learning_rate": 6.058158319870759e-06,
"loss": 0.8774,
"step": 375
},
{
"epoch": 0.06143896523848019,
"grad_norm": 0.7531217874613877,
"learning_rate": 6.138933764135703e-06,
"loss": 0.861,
"step": 380
},
{
"epoch": 0.06224737267582862,
"grad_norm": 0.700415167204527,
"learning_rate": 6.219709208400647e-06,
"loss": 0.8564,
"step": 385
},
{
"epoch": 0.06305578011317704,
"grad_norm": 0.6636924053582388,
"learning_rate": 6.300484652665591e-06,
"loss": 0.8771,
"step": 390
},
{
"epoch": 0.06386418755052546,
"grad_norm": 0.66699758864019,
"learning_rate": 6.381260096930534e-06,
"loss": 0.8562,
"step": 395
},
{
"epoch": 0.0646725949878739,
"grad_norm": 0.6787528779482374,
"learning_rate": 6.462035541195477e-06,
"loss": 0.849,
"step": 400
},
{
"epoch": 0.06548100242522231,
"grad_norm": 0.728449788329189,
"learning_rate": 6.542810985460421e-06,
"loss": 0.8844,
"step": 405
},
{
"epoch": 0.06628940986257073,
"grad_norm": 0.6910443567228122,
"learning_rate": 6.623586429725364e-06,
"loss": 0.8627,
"step": 410
},
{
"epoch": 0.06709781729991916,
"grad_norm": 0.7043536672068673,
"learning_rate": 6.7043618739903075e-06,
"loss": 0.877,
"step": 415
},
{
"epoch": 0.06790622473726758,
"grad_norm": 0.6952950226557627,
"learning_rate": 6.7851373182552505e-06,
"loss": 0.8593,
"step": 420
},
{
"epoch": 0.068714632174616,
"grad_norm": 0.7416956200244156,
"learning_rate": 6.865912762520195e-06,
"loss": 0.8784,
"step": 425
},
{
"epoch": 0.06952303961196443,
"grad_norm": 0.6558940843547532,
"learning_rate": 6.946688206785138e-06,
"loss": 0.8449,
"step": 430
},
{
"epoch": 0.07033144704931285,
"grad_norm": 0.7636013144707001,
"learning_rate": 7.027463651050081e-06,
"loss": 0.8625,
"step": 435
},
{
"epoch": 0.07113985448666128,
"grad_norm": 0.7003307897432925,
"learning_rate": 7.108239095315025e-06,
"loss": 0.8516,
"step": 440
},
{
"epoch": 0.0719482619240097,
"grad_norm": 0.8469481736942237,
"learning_rate": 7.189014539579968e-06,
"loss": 0.8485,
"step": 445
},
{
"epoch": 0.07275666936135812,
"grad_norm": 0.8403326547267631,
"learning_rate": 7.269789983844911e-06,
"loss": 0.8505,
"step": 450
},
{
"epoch": 0.07356507679870655,
"grad_norm": 0.7357863965541986,
"learning_rate": 7.350565428109855e-06,
"loss": 0.8391,
"step": 455
},
{
"epoch": 0.07437348423605497,
"grad_norm": 0.7683662988715164,
"learning_rate": 7.431340872374798e-06,
"loss": 0.8644,
"step": 460
},
{
"epoch": 0.07518189167340339,
"grad_norm": 0.7352096943254265,
"learning_rate": 7.512116316639743e-06,
"loss": 0.8758,
"step": 465
},
{
"epoch": 0.07599029911075182,
"grad_norm": 0.793107329910895,
"learning_rate": 7.592891760904686e-06,
"loss": 0.8548,
"step": 470
},
{
"epoch": 0.07679870654810024,
"grad_norm": 0.7644006635626036,
"learning_rate": 7.673667205169629e-06,
"loss": 0.8569,
"step": 475
},
{
"epoch": 0.07760711398544867,
"grad_norm": 0.7654747646539816,
"learning_rate": 7.754442649434572e-06,
"loss": 0.8513,
"step": 480
},
{
"epoch": 0.07841552142279709,
"grad_norm": 0.7186596171490216,
"learning_rate": 7.835218093699516e-06,
"loss": 0.8694,
"step": 485
},
{
"epoch": 0.07922392886014551,
"grad_norm": 0.7515718660820482,
"learning_rate": 7.91599353796446e-06,
"loss": 0.8497,
"step": 490
},
{
"epoch": 0.08003233629749394,
"grad_norm": 0.7475774619661114,
"learning_rate": 7.996768982229403e-06,
"loss": 0.8577,
"step": 495
},
{
"epoch": 0.08084074373484236,
"grad_norm": 0.742290022489521,
"learning_rate": 8.077544426494346e-06,
"loss": 0.8402,
"step": 500
},
{
"epoch": 0.08164915117219078,
"grad_norm": 0.8472564346824212,
"learning_rate": 8.15831987075929e-06,
"loss": 0.8383,
"step": 505
},
{
"epoch": 0.08245755860953921,
"grad_norm": 0.7770969163651785,
"learning_rate": 8.239095315024233e-06,
"loss": 0.8622,
"step": 510
},
{
"epoch": 0.08326596604688763,
"grad_norm": 0.8440539457306321,
"learning_rate": 8.319870759289176e-06,
"loss": 0.8633,
"step": 515
},
{
"epoch": 0.08407437348423606,
"grad_norm": 0.8914813539424271,
"learning_rate": 8.400646203554121e-06,
"loss": 0.8595,
"step": 520
},
{
"epoch": 0.08488278092158448,
"grad_norm": 0.800403073002308,
"learning_rate": 8.481421647819064e-06,
"loss": 0.848,
"step": 525
},
{
"epoch": 0.0856911883589329,
"grad_norm": 0.9509086588684886,
"learning_rate": 8.562197092084007e-06,
"loss": 0.8474,
"step": 530
},
{
"epoch": 0.08649959579628133,
"grad_norm": 0.8140338179022737,
"learning_rate": 8.64297253634895e-06,
"loss": 0.8373,
"step": 535
},
{
"epoch": 0.08730800323362975,
"grad_norm": 0.7006852263989337,
"learning_rate": 8.723747980613893e-06,
"loss": 0.8482,
"step": 540
},
{
"epoch": 0.08811641067097817,
"grad_norm": 0.7785561327612173,
"learning_rate": 8.804523424878838e-06,
"loss": 0.8305,
"step": 545
},
{
"epoch": 0.0889248181083266,
"grad_norm": 0.7986486796854503,
"learning_rate": 8.88529886914378e-06,
"loss": 0.8648,
"step": 550
},
{
"epoch": 0.08973322554567502,
"grad_norm": 0.7985244858238523,
"learning_rate": 8.966074313408725e-06,
"loss": 0.8559,
"step": 555
},
{
"epoch": 0.09054163298302344,
"grad_norm": 0.7908671911880187,
"learning_rate": 9.046849757673668e-06,
"loss": 0.8352,
"step": 560
},
{
"epoch": 0.09135004042037187,
"grad_norm": 0.7783182513226582,
"learning_rate": 9.127625201938612e-06,
"loss": 0.841,
"step": 565
},
{
"epoch": 0.09215844785772029,
"grad_norm": 0.8114380820943968,
"learning_rate": 9.208400646203555e-06,
"loss": 0.8696,
"step": 570
},
{
"epoch": 0.09296685529506872,
"grad_norm": 0.7244790781977708,
"learning_rate": 9.289176090468498e-06,
"loss": 0.8781,
"step": 575
},
{
"epoch": 0.09377526273241714,
"grad_norm": 0.7757605741684829,
"learning_rate": 9.36995153473344e-06,
"loss": 0.8344,
"step": 580
},
{
"epoch": 0.09458367016976556,
"grad_norm": 0.8251723121959252,
"learning_rate": 9.450726978998385e-06,
"loss": 0.8699,
"step": 585
},
{
"epoch": 0.09539207760711399,
"grad_norm": 0.7770030641428826,
"learning_rate": 9.531502423263328e-06,
"loss": 0.8453,
"step": 590
},
{
"epoch": 0.09620048504446241,
"grad_norm": 0.8828633900844275,
"learning_rate": 9.612277867528273e-06,
"loss": 0.8574,
"step": 595
},
{
"epoch": 0.09700889248181083,
"grad_norm": 0.8430425689960669,
"learning_rate": 9.693053311793216e-06,
"loss": 0.8534,
"step": 600
},
{
"epoch": 0.09781729991915926,
"grad_norm": 0.9081575104833137,
"learning_rate": 9.773828756058159e-06,
"loss": 0.8588,
"step": 605
},
{
"epoch": 0.09862570735650768,
"grad_norm": 0.8428340938970948,
"learning_rate": 9.854604200323102e-06,
"loss": 0.8431,
"step": 610
},
{
"epoch": 0.09943411479385611,
"grad_norm": 0.7107849668872003,
"learning_rate": 9.935379644588045e-06,
"loss": 0.8446,
"step": 615
},
{
"epoch": 0.10024252223120453,
"grad_norm": 0.7895600318895202,
"learning_rate": 9.999999203559496e-06,
"loss": 0.8453,
"step": 620
},
{
"epoch": 0.10105092966855295,
"grad_norm": 0.8083601931655888,
"learning_rate": 9.999971328168497e-06,
"loss": 0.8488,
"step": 625
},
{
"epoch": 0.10185933710590138,
"grad_norm": 0.7985500406230298,
"learning_rate": 9.999903631006022e-06,
"loss": 0.8425,
"step": 630
},
{
"epoch": 0.1026677445432498,
"grad_norm": 0.7746795817816715,
"learning_rate": 9.999796112611239e-06,
"loss": 0.8319,
"step": 635
},
{
"epoch": 0.10347615198059822,
"grad_norm": 0.8389146849754998,
"learning_rate": 9.999648773840469e-06,
"loss": 0.8235,
"step": 640
},
{
"epoch": 0.10428455941794665,
"grad_norm": 0.7186706984947462,
"learning_rate": 9.999461615867176e-06,
"loss": 0.8458,
"step": 645
},
{
"epoch": 0.10509296685529507,
"grad_norm": 0.8396675978961405,
"learning_rate": 9.99923464018196e-06,
"loss": 0.8429,
"step": 650
},
{
"epoch": 0.1059013742926435,
"grad_norm": 0.8459924496521772,
"learning_rate": 9.998967848592548e-06,
"loss": 0.8435,
"step": 655
},
{
"epoch": 0.10670978172999192,
"grad_norm": 0.8256550955754403,
"learning_rate": 9.998661243223772e-06,
"loss": 0.8266,
"step": 660
},
{
"epoch": 0.10751818916734034,
"grad_norm": 0.8794829930536063,
"learning_rate": 9.998314826517564e-06,
"loss": 0.8618,
"step": 665
},
{
"epoch": 0.10832659660468877,
"grad_norm": 0.9329957202025235,
"learning_rate": 9.99792860123292e-06,
"loss": 0.8459,
"step": 670
},
{
"epoch": 0.10913500404203719,
"grad_norm": 0.8109786838523474,
"learning_rate": 9.997502570445898e-06,
"loss": 0.8357,
"step": 675
},
{
"epoch": 0.1099434114793856,
"grad_norm": 0.8368185449034288,
"learning_rate": 9.997036737549573e-06,
"loss": 0.8293,
"step": 680
},
{
"epoch": 0.11075181891673404,
"grad_norm": 0.8442652760477221,
"learning_rate": 9.996531106254027e-06,
"loss": 0.8444,
"step": 685
},
{
"epoch": 0.11156022635408246,
"grad_norm": 0.8400214154235522,
"learning_rate": 9.99598568058631e-06,
"loss": 0.8292,
"step": 690
},
{
"epoch": 0.11236863379143087,
"grad_norm": 0.9180831766991244,
"learning_rate": 9.995400464890409e-06,
"loss": 0.8156,
"step": 695
},
{
"epoch": 0.11317704122877931,
"grad_norm": 0.8609257429862948,
"learning_rate": 9.994775463827218e-06,
"loss": 0.8616,
"step": 700
},
{
"epoch": 0.11398544866612773,
"grad_norm": 0.8666792501707015,
"learning_rate": 9.994110682374491e-06,
"loss": 0.8271,
"step": 705
},
{
"epoch": 0.11479385610347616,
"grad_norm": 0.7912432563155621,
"learning_rate": 9.993406125826818e-06,
"loss": 0.8401,
"step": 710
},
{
"epoch": 0.11560226354082458,
"grad_norm": 0.8597765866247851,
"learning_rate": 9.992661799795568e-06,
"loss": 0.8431,
"step": 715
},
{
"epoch": 0.116410670978173,
"grad_norm": 0.807630693691931,
"learning_rate": 9.991877710208851e-06,
"loss": 0.8373,
"step": 720
},
{
"epoch": 0.11721907841552143,
"grad_norm": 0.817871310017788,
"learning_rate": 9.991053863311468e-06,
"loss": 0.8564,
"step": 725
},
{
"epoch": 0.11802748585286985,
"grad_norm": 0.7613026800591074,
"learning_rate": 9.990190265664868e-06,
"loss": 0.8197,
"step": 730
},
{
"epoch": 0.11883589329021826,
"grad_norm": 0.8187146133464235,
"learning_rate": 9.989286924147085e-06,
"loss": 0.8222,
"step": 735
},
{
"epoch": 0.1196443007275667,
"grad_norm": 0.8489781785397903,
"learning_rate": 9.988343845952697e-06,
"loss": 0.8024,
"step": 740
},
{
"epoch": 0.12045270816491511,
"grad_norm": 0.7613031011178205,
"learning_rate": 9.987361038592751e-06,
"loss": 0.856,
"step": 745
},
{
"epoch": 0.12126111560226355,
"grad_norm": 0.8718470878736602,
"learning_rate": 9.986338509894722e-06,
"loss": 0.8429,
"step": 750
},
{
"epoch": 0.12206952303961197,
"grad_norm": 0.8137535001101496,
"learning_rate": 9.985276268002434e-06,
"loss": 0.8251,
"step": 755
},
{
"epoch": 0.12287793047696038,
"grad_norm": 0.9141558760999264,
"learning_rate": 9.984174321376008e-06,
"loss": 0.8387,
"step": 760
},
{
"epoch": 0.12368633791430882,
"grad_norm": 0.8433283575738159,
"learning_rate": 9.983032678791787e-06,
"loss": 0.8333,
"step": 765
},
{
"epoch": 0.12449474535165723,
"grad_norm": 0.8208485969816852,
"learning_rate": 9.98185134934227e-06,
"loss": 0.8435,
"step": 770
},
{
"epoch": 0.12530315278900567,
"grad_norm": 0.8537472068527471,
"learning_rate": 9.980630342436038e-06,
"loss": 0.8537,
"step": 775
},
{
"epoch": 0.12611156022635409,
"grad_norm": 0.8514401120204336,
"learning_rate": 9.979369667797675e-06,
"loss": 0.8253,
"step": 780
},
{
"epoch": 0.1269199676637025,
"grad_norm": 0.8159339445564436,
"learning_rate": 9.978069335467702e-06,
"loss": 0.8485,
"step": 785
},
{
"epoch": 0.12772837510105092,
"grad_norm": 0.8290585282345666,
"learning_rate": 9.976729355802483e-06,
"loss": 0.8327,
"step": 790
},
{
"epoch": 0.12853678253839934,
"grad_norm": 0.8280894035719621,
"learning_rate": 9.975349739474156e-06,
"loss": 0.8209,
"step": 795
},
{
"epoch": 0.1293451899757478,
"grad_norm": 0.8666934295453808,
"learning_rate": 9.97393049747053e-06,
"loss": 0.8284,
"step": 800
},
{
"epoch": 0.1301535974130962,
"grad_norm": 0.8707107448163076,
"learning_rate": 9.972471641095023e-06,
"loss": 0.8255,
"step": 805
},
{
"epoch": 0.13096200485044462,
"grad_norm": 0.8245531731991685,
"learning_rate": 9.970973181966548e-06,
"loss": 0.8155,
"step": 810
},
{
"epoch": 0.13177041228779304,
"grad_norm": 0.7959320542029523,
"learning_rate": 9.96943513201943e-06,
"loss": 0.8205,
"step": 815
},
{
"epoch": 0.13257881972514146,
"grad_norm": 0.9111851063669996,
"learning_rate": 9.967857503503318e-06,
"loss": 0.8356,
"step": 820
},
{
"epoch": 0.1333872271624899,
"grad_norm": 0.8307380488248262,
"learning_rate": 9.966240308983078e-06,
"loss": 0.8216,
"step": 825
},
{
"epoch": 0.13419563459983833,
"grad_norm": 0.9078436840276765,
"learning_rate": 9.964583561338688e-06,
"loss": 0.8027,
"step": 830
},
{
"epoch": 0.13500404203718674,
"grad_norm": 0.8201106707374893,
"learning_rate": 9.962887273765152e-06,
"loss": 0.8196,
"step": 835
},
{
"epoch": 0.13581244947453516,
"grad_norm": 0.8774372243990994,
"learning_rate": 9.961151459772384e-06,
"loss": 0.8342,
"step": 840
},
{
"epoch": 0.13662085691188358,
"grad_norm": 0.9217986467550336,
"learning_rate": 9.959376133185098e-06,
"loss": 0.8281,
"step": 845
},
{
"epoch": 0.137429264349232,
"grad_norm": 0.8384616776862577,
"learning_rate": 9.95756130814271e-06,
"loss": 0.828,
"step": 850
},
{
"epoch": 0.13823767178658045,
"grad_norm": 0.8506654451968726,
"learning_rate": 9.955706999099207e-06,
"loss": 0.8117,
"step": 855
},
{
"epoch": 0.13904607922392886,
"grad_norm": 0.8389541534284436,
"learning_rate": 9.953813220823048e-06,
"loss": 0.7975,
"step": 860
},
{
"epoch": 0.13985448666127728,
"grad_norm": 0.830733428685578,
"learning_rate": 9.951879988397045e-06,
"loss": 0.8303,
"step": 865
},
{
"epoch": 0.1406628940986257,
"grad_norm": 0.8021435064021754,
"learning_rate": 9.949907317218233e-06,
"loss": 0.828,
"step": 870
},
{
"epoch": 0.14147130153597412,
"grad_norm": 0.8658787105545401,
"learning_rate": 9.94789522299775e-06,
"loss": 0.8097,
"step": 875
},
{
"epoch": 0.14227970897332257,
"grad_norm": 0.9685280853430394,
"learning_rate": 9.945843721760725e-06,
"loss": 0.8232,
"step": 880
},
{
"epoch": 0.14308811641067098,
"grad_norm": 0.8148173552026365,
"learning_rate": 9.943752829846132e-06,
"loss": 0.8227,
"step": 885
},
{
"epoch": 0.1438965238480194,
"grad_norm": 0.8662458438707715,
"learning_rate": 9.941622563906667e-06,
"loss": 0.8292,
"step": 890
},
{
"epoch": 0.14470493128536782,
"grad_norm": 0.8109041012364064,
"learning_rate": 9.939452940908627e-06,
"loss": 0.7983,
"step": 895
},
{
"epoch": 0.14551333872271624,
"grad_norm": 0.7979068544718093,
"learning_rate": 9.937243978131751e-06,
"loss": 0.8109,
"step": 900
},
{
"epoch": 0.1463217461600647,
"grad_norm": 0.9947738130271577,
"learning_rate": 9.934995693169104e-06,
"loss": 0.8112,
"step": 905
},
{
"epoch": 0.1471301535974131,
"grad_norm": 0.8091228479590519,
"learning_rate": 9.932708103926932e-06,
"loss": 0.8282,
"step": 910
},
{
"epoch": 0.14793856103476152,
"grad_norm": 0.8487266612698499,
"learning_rate": 9.930381228624501e-06,
"loss": 0.8195,
"step": 915
},
{
"epoch": 0.14874696847210994,
"grad_norm": 0.8322926755681642,
"learning_rate": 9.928015085793983e-06,
"loss": 0.83,
"step": 920
},
{
"epoch": 0.14955537590945836,
"grad_norm": 0.8200762139699286,
"learning_rate": 9.925609694280284e-06,
"loss": 0.8162,
"step": 925
},
{
"epoch": 0.15036378334680678,
"grad_norm": 0.8102910947287206,
"learning_rate": 9.923165073240905e-06,
"loss": 0.8177,
"step": 930
},
{
"epoch": 0.15117219078415522,
"grad_norm": 0.785366694042482,
"learning_rate": 9.920681242145787e-06,
"loss": 0.8085,
"step": 935
},
{
"epoch": 0.15198059822150364,
"grad_norm": 0.8512969626348545,
"learning_rate": 9.918158220777152e-06,
"loss": 0.8116,
"step": 940
},
{
"epoch": 0.15278900565885206,
"grad_norm": 0.749686550031715,
"learning_rate": 9.91559602922935e-06,
"loss": 0.7995,
"step": 945
},
{
"epoch": 0.15359741309620048,
"grad_norm": 0.8078570006806167,
"learning_rate": 9.912994687908701e-06,
"loss": 0.809,
"step": 950
},
{
"epoch": 0.1544058205335489,
"grad_norm": 0.8901185428475071,
"learning_rate": 9.91035421753333e-06,
"loss": 0.8311,
"step": 955
},
{
"epoch": 0.15521422797089734,
"grad_norm": 0.8271675548109904,
"learning_rate": 9.907674639132995e-06,
"loss": 0.824,
"step": 960
},
{
"epoch": 0.15602263540824576,
"grad_norm": 0.8820995149717548,
"learning_rate": 9.904955974048934e-06,
"loss": 0.8107,
"step": 965
},
{
"epoch": 0.15683104284559418,
"grad_norm": 0.9196878526802965,
"learning_rate": 9.902198243933679e-06,
"loss": 0.8151,
"step": 970
},
{
"epoch": 0.1576394502829426,
"grad_norm": 0.8351142569585079,
"learning_rate": 9.899401470750898e-06,
"loss": 0.8304,
"step": 975
},
{
"epoch": 0.15844785772029102,
"grad_norm": 0.9150380540529176,
"learning_rate": 9.896565676775212e-06,
"loss": 0.8071,
"step": 980
},
{
"epoch": 0.15925626515763944,
"grad_norm": 0.8959222346902678,
"learning_rate": 9.893690884592017e-06,
"loss": 0.8215,
"step": 985
},
{
"epoch": 0.16006467259498788,
"grad_norm": 0.8761786707963919,
"learning_rate": 9.89077711709731e-06,
"loss": 0.8088,
"step": 990
},
{
"epoch": 0.1608730800323363,
"grad_norm": 0.9148814698747098,
"learning_rate": 9.887824397497498e-06,
"loss": 0.8226,
"step": 995
},
{
"epoch": 0.16168148746968472,
"grad_norm": 0.8153076566020756,
"learning_rate": 9.884832749309221e-06,
"loss": 0.8159,
"step": 1000
},
{
"epoch": 0.16248989490703314,
"grad_norm": 0.8770559374607161,
"learning_rate": 9.881802196359162e-06,
"loss": 0.8174,
"step": 1005
},
{
"epoch": 0.16329830234438156,
"grad_norm": 0.8362399592900118,
"learning_rate": 9.87873276278386e-06,
"loss": 0.8345,
"step": 1010
},
{
"epoch": 0.16410670978173,
"grad_norm": 1.0016233711345273,
"learning_rate": 9.875624473029508e-06,
"loss": 0.834,
"step": 1015
},
{
"epoch": 0.16491511721907842,
"grad_norm": 0.8247561849661889,
"learning_rate": 9.87247735185177e-06,
"loss": 0.8247,
"step": 1020
},
{
"epoch": 0.16572352465642684,
"grad_norm": 0.9429399357811139,
"learning_rate": 9.869291424315577e-06,
"loss": 0.8069,
"step": 1025
},
{
"epoch": 0.16653193209377526,
"grad_norm": 0.9563552234329712,
"learning_rate": 9.866066715794932e-06,
"loss": 0.8158,
"step": 1030
},
{
"epoch": 0.16734033953112368,
"grad_norm": 0.8804257837923922,
"learning_rate": 9.862803251972701e-06,
"loss": 0.8203,
"step": 1035
},
{
"epoch": 0.16814874696847212,
"grad_norm": 0.9024236063093718,
"learning_rate": 9.859501058840416e-06,
"loss": 0.8174,
"step": 1040
},
{
"epoch": 0.16895715440582054,
"grad_norm": 0.8479328206922394,
"learning_rate": 9.856160162698068e-06,
"loss": 0.8261,
"step": 1045
},
{
"epoch": 0.16976556184316896,
"grad_norm": 0.8622837966852652,
"learning_rate": 9.852780590153884e-06,
"loss": 0.8253,
"step": 1050
},
{
"epoch": 0.17057396928051738,
"grad_norm": 0.9701332391764099,
"learning_rate": 9.849362368124134e-06,
"loss": 0.8199,
"step": 1055
},
{
"epoch": 0.1713823767178658,
"grad_norm": 0.95228412728592,
"learning_rate": 9.845905523832903e-06,
"loss": 0.7991,
"step": 1060
},
{
"epoch": 0.17219078415521422,
"grad_norm": 0.8976839442354373,
"learning_rate": 9.842410084811888e-06,
"loss": 0.822,
"step": 1065
},
{
"epoch": 0.17299919159256266,
"grad_norm": 0.8593896455734684,
"learning_rate": 9.838876078900158e-06,
"loss": 0.7995,
"step": 1070
},
{
"epoch": 0.17380759902991108,
"grad_norm": 0.9295880765287333,
"learning_rate": 9.83530353424395e-06,
"loss": 0.8128,
"step": 1075
},
{
"epoch": 0.1746160064672595,
"grad_norm": 0.875753400493014,
"learning_rate": 9.83169247929644e-06,
"loss": 0.7784,
"step": 1080
},
{
"epoch": 0.17542441390460792,
"grad_norm": 0.9211393522356855,
"learning_rate": 9.828042942817513e-06,
"loss": 0.813,
"step": 1085
},
{
"epoch": 0.17623282134195634,
"grad_norm": 0.8300443612741539,
"learning_rate": 9.824354953873536e-06,
"loss": 0.8092,
"step": 1090
},
{
"epoch": 0.17704122877930478,
"grad_norm": 0.8401922945789401,
"learning_rate": 9.82062854183713e-06,
"loss": 0.8206,
"step": 1095
},
{
"epoch": 0.1778496362166532,
"grad_norm": 0.910386403590293,
"learning_rate": 9.816863736386934e-06,
"loss": 0.8206,
"step": 1100
},
{
"epoch": 0.17865804365400162,
"grad_norm": 0.9153963356098873,
"learning_rate": 9.813060567507358e-06,
"loss": 0.8233,
"step": 1105
},
{
"epoch": 0.17946645109135004,
"grad_norm": 0.9854523603560855,
"learning_rate": 9.809219065488362e-06,
"loss": 0.8054,
"step": 1110
},
{
"epoch": 0.18027485852869846,
"grad_norm": 0.9518212116514045,
"learning_rate": 9.805339260925209e-06,
"loss": 0.782,
"step": 1115
},
{
"epoch": 0.18108326596604687,
"grad_norm": 0.8628818391355276,
"learning_rate": 9.801421184718207e-06,
"loss": 0.8209,
"step": 1120
},
{
"epoch": 0.18189167340339532,
"grad_norm": 0.8864541634165986,
"learning_rate": 9.797464868072489e-06,
"loss": 0.7954,
"step": 1125
},
{
"epoch": 0.18270008084074374,
"grad_norm": 0.9167346076357672,
"learning_rate": 9.793470342497737e-06,
"loss": 0.8061,
"step": 1130
},
{
"epoch": 0.18350848827809216,
"grad_norm": 0.9927145545390887,
"learning_rate": 9.789437639807956e-06,
"loss": 0.7994,
"step": 1135
},
{
"epoch": 0.18431689571544058,
"grad_norm": 0.8867181379160483,
"learning_rate": 9.785366792121199e-06,
"loss": 0.8105,
"step": 1140
},
{
"epoch": 0.185125303152789,
"grad_norm": 0.8543686171692966,
"learning_rate": 9.781257831859326e-06,
"loss": 0.819,
"step": 1145
},
{
"epoch": 0.18593371059013744,
"grad_norm": 0.881943155622054,
"learning_rate": 9.777110791747741e-06,
"loss": 0.8011,
"step": 1150
},
{
"epoch": 0.18674211802748586,
"grad_norm": 0.9138030348872207,
"learning_rate": 9.77292570481513e-06,
"loss": 0.8161,
"step": 1155
},
{
"epoch": 0.18755052546483428,
"grad_norm": 0.934593231124601,
"learning_rate": 9.7687026043932e-06,
"loss": 0.8162,
"step": 1160
},
{
"epoch": 0.1883589329021827,
"grad_norm": 0.9327906245764378,
"learning_rate": 9.76444152411641e-06,
"loss": 0.8128,
"step": 1165
},
{
"epoch": 0.18916734033953111,
"grad_norm": 0.8269574194887537,
"learning_rate": 9.760142497921708e-06,
"loss": 0.8296,
"step": 1170
},
{
"epoch": 0.18997574777687956,
"grad_norm": 0.8942466721266422,
"learning_rate": 9.755805560048259e-06,
"loss": 0.7915,
"step": 1175
},
{
"epoch": 0.19078415521422798,
"grad_norm": 0.9491654283531196,
"learning_rate": 9.75143074503717e-06,
"loss": 0.8095,
"step": 1180
},
{
"epoch": 0.1915925626515764,
"grad_norm": 0.911028544312517,
"learning_rate": 9.74701808773122e-06,
"loss": 0.7965,
"step": 1185
},
{
"epoch": 0.19240097008892482,
"grad_norm": 0.9665924166790011,
"learning_rate": 9.742567623274571e-06,
"loss": 0.8485,
"step": 1190
},
{
"epoch": 0.19320937752627323,
"grad_norm": 0.8510809811181654,
"learning_rate": 9.738079387112509e-06,
"loss": 0.8127,
"step": 1195
},
{
"epoch": 0.19401778496362165,
"grad_norm": 0.9337458560349892,
"learning_rate": 9.733553414991135e-06,
"loss": 0.8196,
"step": 1200
},
{
"epoch": 0.1948261924009701,
"grad_norm": 0.8504908193588703,
"learning_rate": 9.728989742957107e-06,
"loss": 0.803,
"step": 1205
},
{
"epoch": 0.19563459983831852,
"grad_norm": 0.8952911780784167,
"learning_rate": 9.724388407357333e-06,
"loss": 0.8127,
"step": 1210
},
{
"epoch": 0.19644300727566694,
"grad_norm": 0.8693475240728679,
"learning_rate": 9.719749444838687e-06,
"loss": 0.8161,
"step": 1215
},
{
"epoch": 0.19725141471301536,
"grad_norm": 1.0067254194816264,
"learning_rate": 9.715072892347724e-06,
"loss": 0.8127,
"step": 1220
},
{
"epoch": 0.19805982215036377,
"grad_norm": 0.9913591568953714,
"learning_rate": 9.71035878713038e-06,
"loss": 0.7877,
"step": 1225
},
{
"epoch": 0.19886822958771222,
"grad_norm": 0.930691684815205,
"learning_rate": 9.705607166731673e-06,
"loss": 0.8103,
"step": 1230
},
{
"epoch": 0.19967663702506064,
"grad_norm": 0.9913080323236753,
"learning_rate": 9.700818068995407e-06,
"loss": 0.8248,
"step": 1235
},
{
"epoch": 0.20048504446240906,
"grad_norm": 0.9338686380215503,
"learning_rate": 9.695991532063875e-06,
"loss": 0.804,
"step": 1240
},
{
"epoch": 0.20129345189975748,
"grad_norm": 0.881337013423663,
"learning_rate": 9.691127594377546e-06,
"loss": 0.7993,
"step": 1245
},
{
"epoch": 0.2021018593371059,
"grad_norm": 0.9651413063403884,
"learning_rate": 9.686226294674763e-06,
"loss": 0.8157,
"step": 1250
},
{
"epoch": 0.2029102667744543,
"grad_norm": 0.9142968016401275,
"learning_rate": 9.68128767199144e-06,
"loss": 0.7956,
"step": 1255
},
{
"epoch": 0.20371867421180276,
"grad_norm": 0.9699692592410872,
"learning_rate": 9.676311765660743e-06,
"loss": 0.7878,
"step": 1260
},
{
"epoch": 0.20452708164915118,
"grad_norm": 0.9791128074057168,
"learning_rate": 9.67129861531278e-06,
"loss": 0.7988,
"step": 1265
},
{
"epoch": 0.2053354890864996,
"grad_norm": 0.9461753802895918,
"learning_rate": 9.666248260874283e-06,
"loss": 0.8027,
"step": 1270
},
{
"epoch": 0.206143896523848,
"grad_norm": 0.9557703955773883,
"learning_rate": 9.661160742568298e-06,
"loss": 0.8149,
"step": 1275
},
{
"epoch": 0.20695230396119643,
"grad_norm": 0.9648851489245359,
"learning_rate": 9.656036100913854e-06,
"loss": 0.8156,
"step": 1280
},
{
"epoch": 0.20776071139854488,
"grad_norm": 0.933652528739753,
"learning_rate": 9.65087437672565e-06,
"loss": 0.834,
"step": 1285
},
{
"epoch": 0.2085691188358933,
"grad_norm": 0.952902750681017,
"learning_rate": 9.645675611113715e-06,
"loss": 0.7919,
"step": 1290
},
{
"epoch": 0.20937752627324172,
"grad_norm": 0.9227467408489822,
"learning_rate": 9.640439845483106e-06,
"loss": 0.7791,
"step": 1295
},
{
"epoch": 0.21018593371059013,
"grad_norm": 0.9281667306865055,
"learning_rate": 9.635167121533548e-06,
"loss": 0.8075,
"step": 1300
},
{
"epoch": 0.21099434114793855,
"grad_norm": 0.9491984682902288,
"learning_rate": 9.629857481259128e-06,
"loss": 0.7853,
"step": 1305
},
{
"epoch": 0.211802748585287,
"grad_norm": 0.9590804939597338,
"learning_rate": 9.62451096694794e-06,
"loss": 0.8096,
"step": 1310
},
{
"epoch": 0.21261115602263542,
"grad_norm": 0.9022937078982735,
"learning_rate": 9.619127621181767e-06,
"loss": 0.7615,
"step": 1315
},
{
"epoch": 0.21341956345998384,
"grad_norm": 0.9211905503781073,
"learning_rate": 9.613707486835725e-06,
"loss": 0.8009,
"step": 1320
},
{
"epoch": 0.21422797089733225,
"grad_norm": 0.9224553203069952,
"learning_rate": 9.608250607077933e-06,
"loss": 0.8095,
"step": 1325
},
{
"epoch": 0.21503637833468067,
"grad_norm": 0.936067812857163,
"learning_rate": 9.602757025369165e-06,
"loss": 0.8012,
"step": 1330
},
{
"epoch": 0.2158447857720291,
"grad_norm": 0.9252870381796091,
"learning_rate": 9.597226785462501e-06,
"loss": 0.7986,
"step": 1335
},
{
"epoch": 0.21665319320937754,
"grad_norm": 0.9592213802782941,
"learning_rate": 9.591659931402983e-06,
"loss": 0.805,
"step": 1340
},
{
"epoch": 0.21746160064672596,
"grad_norm": 0.9896013218895608,
"learning_rate": 9.586056507527266e-06,
"loss": 0.7993,
"step": 1345
},
{
"epoch": 0.21827000808407437,
"grad_norm": 0.9440277526851043,
"learning_rate": 9.580416558463257e-06,
"loss": 0.8161,
"step": 1350
},
{
"epoch": 0.2190784155214228,
"grad_norm": 0.9781000660563656,
"learning_rate": 9.574740129129767e-06,
"loss": 0.8046,
"step": 1355
},
{
"epoch": 0.2198868229587712,
"grad_norm": 0.927988233216835,
"learning_rate": 9.569027264736148e-06,
"loss": 0.7956,
"step": 1360
},
{
"epoch": 0.22069523039611966,
"grad_norm": 0.9666122184957214,
"learning_rate": 9.563278010781939e-06,
"loss": 0.7913,
"step": 1365
},
{
"epoch": 0.22150363783346808,
"grad_norm": 0.9291765513048547,
"learning_rate": 9.557492413056497e-06,
"loss": 0.7919,
"step": 1370
},
{
"epoch": 0.2223120452708165,
"grad_norm": 0.9610201773035711,
"learning_rate": 9.551670517638637e-06,
"loss": 0.7902,
"step": 1375
},
{
"epoch": 0.2231204527081649,
"grad_norm": 0.9238229313792048,
"learning_rate": 9.545812370896262e-06,
"loss": 0.8058,
"step": 1380
},
{
"epoch": 0.22392886014551333,
"grad_norm": 0.9054092521932388,
"learning_rate": 9.539918019485995e-06,
"loss": 0.7892,
"step": 1385
},
{
"epoch": 0.22473726758286175,
"grad_norm": 1.0579833418628013,
"learning_rate": 9.53398751035281e-06,
"loss": 0.8148,
"step": 1390
},
{
"epoch": 0.2255456750202102,
"grad_norm": 1.0226788158408266,
"learning_rate": 9.528020890729653e-06,
"loss": 0.8031,
"step": 1395
},
{
"epoch": 0.22635408245755861,
"grad_norm": 1.1479246469449391,
"learning_rate": 9.522018208137066e-06,
"loss": 0.8037,
"step": 1400
},
{
"epoch": 0.22716248989490703,
"grad_norm": 0.9756911102647868,
"learning_rate": 9.51597951038282e-06,
"loss": 0.8097,
"step": 1405
},
{
"epoch": 0.22797089733225545,
"grad_norm": 1.035765200896759,
"learning_rate": 9.509904845561517e-06,
"loss": 0.8077,
"step": 1410
},
{
"epoch": 0.22877930476960387,
"grad_norm": 1.0689274362300878,
"learning_rate": 9.503794262054214e-06,
"loss": 0.7851,
"step": 1415
},
{
"epoch": 0.22958771220695232,
"grad_norm": 0.9132533214587567,
"learning_rate": 9.497647808528045e-06,
"loss": 0.7887,
"step": 1420
},
{
"epoch": 0.23039611964430073,
"grad_norm": 1.050699012001928,
"learning_rate": 9.491465533935824e-06,
"loss": 0.7932,
"step": 1425
},
{
"epoch": 0.23120452708164915,
"grad_norm": 0.9946090130405577,
"learning_rate": 9.485247487515658e-06,
"loss": 0.7722,
"step": 1430
},
{
"epoch": 0.23201293451899757,
"grad_norm": 1.06286243808036,
"learning_rate": 9.478993718790558e-06,
"loss": 0.7939,
"step": 1435
},
{
"epoch": 0.232821341956346,
"grad_norm": 1.0038879730182135,
"learning_rate": 9.472704277568034e-06,
"loss": 0.7963,
"step": 1440
},
{
"epoch": 0.23362974939369444,
"grad_norm": 1.0200294772824388,
"learning_rate": 9.466379213939717e-06,
"loss": 0.7919,
"step": 1445
},
{
"epoch": 0.23443815683104285,
"grad_norm": 1.099962358491636,
"learning_rate": 9.46001857828094e-06,
"loss": 0.784,
"step": 1450
},
{
"epoch": 0.23524656426839127,
"grad_norm": 0.9471318804802602,
"learning_rate": 9.453622421250353e-06,
"loss": 0.7974,
"step": 1455
},
{
"epoch": 0.2360549717057397,
"grad_norm": 0.9961320997864533,
"learning_rate": 9.447190793789504e-06,
"loss": 0.7677,
"step": 1460
},
{
"epoch": 0.2368633791430881,
"grad_norm": 1.1123725273599259,
"learning_rate": 9.44072374712245e-06,
"loss": 0.7877,
"step": 1465
},
{
"epoch": 0.23767178658043653,
"grad_norm": 0.9673736978640572,
"learning_rate": 9.43422133275534e-06,
"loss": 0.7851,
"step": 1470
},
{
"epoch": 0.23848019401778497,
"grad_norm": 1.0263547753632962,
"learning_rate": 9.427683602475994e-06,
"loss": 0.7915,
"step": 1475
},
{
"epoch": 0.2392886014551334,
"grad_norm": 0.9685100408061789,
"learning_rate": 9.42111060835352e-06,
"loss": 0.7887,
"step": 1480
},
{
"epoch": 0.2400970088924818,
"grad_norm": 1.0736685654002145,
"learning_rate": 9.414502402737866e-06,
"loss": 0.8043,
"step": 1485
},
{
"epoch": 0.24090541632983023,
"grad_norm": 1.0421660296064565,
"learning_rate": 9.407859038259428e-06,
"loss": 0.7854,
"step": 1490
},
{
"epoch": 0.24171382376717865,
"grad_norm": 1.0032053987863772,
"learning_rate": 9.401180567828615e-06,
"loss": 0.7746,
"step": 1495
},
{
"epoch": 0.2425222312045271,
"grad_norm": 0.9419030918663129,
"learning_rate": 9.394467044635439e-06,
"loss": 0.7803,
"step": 1500
},
{
"epoch": 0.2433306386418755,
"grad_norm": 1.0221215187873243,
"learning_rate": 9.387718522149084e-06,
"loss": 0.7947,
"step": 1505
},
{
"epoch": 0.24413904607922393,
"grad_norm": 1.0090765896605816,
"learning_rate": 9.38093505411748e-06,
"loss": 0.7791,
"step": 1510
},
{
"epoch": 0.24494745351657235,
"grad_norm": 0.9942329189536906,
"learning_rate": 9.374116694566882e-06,
"loss": 0.8088,
"step": 1515
},
{
"epoch": 0.24575586095392077,
"grad_norm": 1.1192063441058093,
"learning_rate": 9.36726349780143e-06,
"loss": 0.8059,
"step": 1520
},
{
"epoch": 0.2465642683912692,
"grad_norm": 1.05918617263924,
"learning_rate": 9.360375518402728e-06,
"loss": 0.7849,
"step": 1525
},
{
"epoch": 0.24737267582861763,
"grad_norm": 0.9510057659465685,
"learning_rate": 9.353452811229395e-06,
"loss": 0.8025,
"step": 1530
},
{
"epoch": 0.24818108326596605,
"grad_norm": 1.0012384708775823,
"learning_rate": 9.346495431416642e-06,
"loss": 0.785,
"step": 1535
},
{
"epoch": 0.24898949070331447,
"grad_norm": 0.9393516177220199,
"learning_rate": 9.339503434375823e-06,
"loss": 0.789,
"step": 1540
},
{
"epoch": 0.2497978981406629,
"grad_norm": 1.0793839568734547,
"learning_rate": 9.332476875794e-06,
"loss": 0.7923,
"step": 1545
},
{
"epoch": 0.25060630557801133,
"grad_norm": 1.0985732881069987,
"learning_rate": 9.325415811633497e-06,
"loss": 0.7662,
"step": 1550
},
{
"epoch": 0.2514147130153597,
"grad_norm": 1.0071649808511798,
"learning_rate": 9.318320298131452e-06,
"loss": 0.8159,
"step": 1555
},
{
"epoch": 0.25222312045270817,
"grad_norm": 1.0609048611209526,
"learning_rate": 9.31119039179937e-06,
"loss": 0.7745,
"step": 1560
},
{
"epoch": 0.25303152789005656,
"grad_norm": 0.9691770014769086,
"learning_rate": 9.30402614942268e-06,
"loss": 0.7966,
"step": 1565
},
{
"epoch": 0.253839935327405,
"grad_norm": 1.072937339455477,
"learning_rate": 9.296827628060274e-06,
"loss": 0.8029,
"step": 1570
},
{
"epoch": 0.25464834276475345,
"grad_norm": 1.078855254896766,
"learning_rate": 9.289594885044054e-06,
"loss": 0.7839,
"step": 1575
},
{
"epoch": 0.25545675020210185,
"grad_norm": 1.0105792692115017,
"learning_rate": 9.282327977978477e-06,
"loss": 0.7881,
"step": 1580
},
{
"epoch": 0.2562651576394503,
"grad_norm": 1.1055275025950306,
"learning_rate": 9.275026964740101e-06,
"loss": 0.8059,
"step": 1585
},
{
"epoch": 0.2570735650767987,
"grad_norm": 1.0897616544392064,
"learning_rate": 9.267691903477112e-06,
"loss": 0.7973,
"step": 1590
},
{
"epoch": 0.25788197251414713,
"grad_norm": 1.1206071430492686,
"learning_rate": 9.260322852608874e-06,
"loss": 0.7887,
"step": 1595
},
{
"epoch": 0.2586903799514956,
"grad_norm": 1.075935762788828,
"learning_rate": 9.252919870825453e-06,
"loss": 0.7904,
"step": 1600
},
{
"epoch": 0.25949878738884397,
"grad_norm": 1.172946439353908,
"learning_rate": 9.245483017087158e-06,
"loss": 0.794,
"step": 1605
},
{
"epoch": 0.2603071948261924,
"grad_norm": 1.1163921302502833,
"learning_rate": 9.238012350624069e-06,
"loss": 0.7888,
"step": 1610
},
{
"epoch": 0.2611156022635408,
"grad_norm": 0.9856503130011599,
"learning_rate": 9.230507930935559e-06,
"loss": 0.7793,
"step": 1615
},
{
"epoch": 0.26192400970088925,
"grad_norm": 1.0253170497677628,
"learning_rate": 9.222969817789829e-06,
"loss": 0.7732,
"step": 1620
},
{
"epoch": 0.2627324171382377,
"grad_norm": 1.08074229250819,
"learning_rate": 9.215398071223427e-06,
"loss": 0.7967,
"step": 1625
},
{
"epoch": 0.2635408245755861,
"grad_norm": 1.0525950674502662,
"learning_rate": 9.20779275154077e-06,
"loss": 0.802,
"step": 1630
},
{
"epoch": 0.26434923201293453,
"grad_norm": 1.0778232477194654,
"learning_rate": 9.200153919313667e-06,
"loss": 0.7747,
"step": 1635
},
{
"epoch": 0.2651576394502829,
"grad_norm": 1.069258333704327,
"learning_rate": 9.192481635380834e-06,
"loss": 0.7666,
"step": 1640
},
{
"epoch": 0.26596604688763137,
"grad_norm": 0.9963364554248915,
"learning_rate": 9.184775960847405e-06,
"loss": 0.7732,
"step": 1645
},
{
"epoch": 0.2667744543249798,
"grad_norm": 1.1349246014179366,
"learning_rate": 9.177036957084459e-06,
"loss": 0.7953,
"step": 1650
},
{
"epoch": 0.2675828617623282,
"grad_norm": 1.0581160333050574,
"learning_rate": 9.169264685728515e-06,
"loss": 0.7784,
"step": 1655
},
{
"epoch": 0.26839126919967665,
"grad_norm": 1.177618781034446,
"learning_rate": 9.161459208681049e-06,
"loss": 0.7961,
"step": 1660
},
{
"epoch": 0.26919967663702504,
"grad_norm": 1.1579983554909417,
"learning_rate": 9.153620588108006e-06,
"loss": 0.7938,
"step": 1665
},
{
"epoch": 0.2700080840743735,
"grad_norm": 1.0681808089207117,
"learning_rate": 9.14574888643929e-06,
"loss": 0.7998,
"step": 1670
},
{
"epoch": 0.27081649151172194,
"grad_norm": 1.0290242122559143,
"learning_rate": 9.137844166368289e-06,
"loss": 0.7897,
"step": 1675
},
{
"epoch": 0.2716248989490703,
"grad_norm": 1.0757364324046452,
"learning_rate": 9.129906490851348e-06,
"loss": 0.7967,
"step": 1680
},
{
"epoch": 0.27243330638641877,
"grad_norm": 1.174666163846864,
"learning_rate": 9.121935923107293e-06,
"loss": 0.7784,
"step": 1685
},
{
"epoch": 0.27324171382376716,
"grad_norm": 1.0474993963836983,
"learning_rate": 9.113932526616912e-06,
"loss": 0.7932,
"step": 1690
},
{
"epoch": 0.2740501212611156,
"grad_norm": 1.3191834743977484,
"learning_rate": 9.10589636512246e-06,
"loss": 0.803,
"step": 1695
},
{
"epoch": 0.274858528698464,
"grad_norm": 1.1027943580362713,
"learning_rate": 9.097827502627137e-06,
"loss": 0.7937,
"step": 1700
},
{
"epoch": 0.27566693613581245,
"grad_norm": 1.40441127971191,
"learning_rate": 9.089726003394593e-06,
"loss": 0.7784,
"step": 1705
},
{
"epoch": 0.2764753435731609,
"grad_norm": 1.13330113493992,
"learning_rate": 9.081591931948405e-06,
"loss": 0.7873,
"step": 1710
},
{
"epoch": 0.2772837510105093,
"grad_norm": 1.0889710176936576,
"learning_rate": 9.073425353071576e-06,
"loss": 0.7704,
"step": 1715
},
{
"epoch": 0.27809215844785773,
"grad_norm": 1.164862749945273,
"learning_rate": 9.065226331806006e-06,
"loss": 0.7627,
"step": 1720
},
{
"epoch": 0.2789005658852061,
"grad_norm": 1.1263042754286632,
"learning_rate": 9.056994933451975e-06,
"loss": 0.7772,
"step": 1725
},
{
"epoch": 0.27970897332255457,
"grad_norm": 1.1536982085539453,
"learning_rate": 9.048731223567636e-06,
"loss": 0.7753,
"step": 1730
},
{
"epoch": 0.280517380759903,
"grad_norm": 1.040290077185074,
"learning_rate": 9.04043526796848e-06,
"loss": 0.7956,
"step": 1735
},
{
"epoch": 0.2813257881972514,
"grad_norm": 1.1200292709537885,
"learning_rate": 9.032107132726812e-06,
"loss": 0.7746,
"step": 1740
},
{
"epoch": 0.28213419563459985,
"grad_norm": 1.1605146350184878,
"learning_rate": 9.023746884171234e-06,
"loss": 0.7875,
"step": 1745
},
{
"epoch": 0.28294260307194824,
"grad_norm": 1.1126474642028563,
"learning_rate": 9.015354588886112e-06,
"loss": 0.7572,
"step": 1750
},
{
"epoch": 0.2837510105092967,
"grad_norm": 1.1900400445950285,
"learning_rate": 9.006930313711038e-06,
"loss": 0.7537,
"step": 1755
},
{
"epoch": 0.28455941794664513,
"grad_norm": 1.1111691561929655,
"learning_rate": 8.99847412574031e-06,
"loss": 0.777,
"step": 1760
},
{
"epoch": 0.2853678253839935,
"grad_norm": 1.160025861824882,
"learning_rate": 8.989986092322394e-06,
"loss": 0.757,
"step": 1765
},
{
"epoch": 0.28617623282134197,
"grad_norm": 1.1040655040153644,
"learning_rate": 8.981466281059378e-06,
"loss": 0.765,
"step": 1770
},
{
"epoch": 0.28698464025869036,
"grad_norm": 1.030626487972238,
"learning_rate": 8.972914759806453e-06,
"loss": 0.7694,
"step": 1775
},
{
"epoch": 0.2877930476960388,
"grad_norm": 1.1545022043122366,
"learning_rate": 8.964331596671348e-06,
"loss": 0.7799,
"step": 1780
},
{
"epoch": 0.28860145513338725,
"grad_norm": 1.0671347264829774,
"learning_rate": 8.955716860013812e-06,
"loss": 0.785,
"step": 1785
},
{
"epoch": 0.28940986257073564,
"grad_norm": 1.1085542970283513,
"learning_rate": 8.94707061844505e-06,
"loss": 0.7938,
"step": 1790
},
{
"epoch": 0.2902182700080841,
"grad_norm": 1.0340008147017365,
"learning_rate": 8.938392940827191e-06,
"loss": 0.7904,
"step": 1795
},
{
"epoch": 0.2910266774454325,
"grad_norm": 1.1186878316073905,
"learning_rate": 8.929683896272728e-06,
"loss": 0.7847,
"step": 1800
},
{
"epoch": 0.2918350848827809,
"grad_norm": 1.0973963525253956,
"learning_rate": 8.920943554143978e-06,
"loss": 0.7689,
"step": 1805
},
{
"epoch": 0.2926434923201294,
"grad_norm": 1.2283562191922641,
"learning_rate": 8.912171984052517e-06,
"loss": 0.7974,
"step": 1810
},
{
"epoch": 0.29345189975747776,
"grad_norm": 1.077126989544477,
"learning_rate": 8.90336925585864e-06,
"loss": 0.7747,
"step": 1815
},
{
"epoch": 0.2942603071948262,
"grad_norm": 1.1424653162262948,
"learning_rate": 8.894535439670798e-06,
"loss": 0.7701,
"step": 1820
},
{
"epoch": 0.2950687146321746,
"grad_norm": 1.05283219197678,
"learning_rate": 8.885670605845032e-06,
"loss": 0.7922,
"step": 1825
},
{
"epoch": 0.29587712206952305,
"grad_norm": 1.2468236264933765,
"learning_rate": 8.876774824984426e-06,
"loss": 0.766,
"step": 1830
},
{
"epoch": 0.29668552950687144,
"grad_norm": 1.175827633568805,
"learning_rate": 8.867848167938535e-06,
"loss": 0.7861,
"step": 1835
},
{
"epoch": 0.2974939369442199,
"grad_norm": 1.156576419379891,
"learning_rate": 8.85889070580283e-06,
"loss": 0.7848,
"step": 1840
},
{
"epoch": 0.29830234438156833,
"grad_norm": 1.152830758997776,
"learning_rate": 8.849902509918119e-06,
"loss": 0.7643,
"step": 1845
},
{
"epoch": 0.2991107518189167,
"grad_norm": 1.2062288809904451,
"learning_rate": 8.84088365186999e-06,
"loss": 0.7971,
"step": 1850
},
{
"epoch": 0.29991915925626517,
"grad_norm": 1.264589031563977,
"learning_rate": 8.831834203488236e-06,
"loss": 0.7715,
"step": 1855
},
{
"epoch": 0.30072756669361356,
"grad_norm": 1.1659553441185666,
"learning_rate": 8.822754236846283e-06,
"loss": 0.7965,
"step": 1860
},
{
"epoch": 0.301535974130962,
"grad_norm": 1.2155282318642753,
"learning_rate": 8.813643824260616e-06,
"loss": 0.7746,
"step": 1865
},
{
"epoch": 0.30234438156831045,
"grad_norm": 1.1209411583569155,
"learning_rate": 8.804503038290204e-06,
"loss": 0.7803,
"step": 1870
},
{
"epoch": 0.30315278900565884,
"grad_norm": 1.1638621979766686,
"learning_rate": 8.795331951735927e-06,
"loss": 0.7645,
"step": 1875
},
{
"epoch": 0.3039611964430073,
"grad_norm": 1.1721735188433011,
"learning_rate": 8.786130637639983e-06,
"loss": 0.8013,
"step": 1880
},
{
"epoch": 0.3047696038803557,
"grad_norm": 1.227142983021926,
"learning_rate": 8.776899169285318e-06,
"loss": 0.7673,
"step": 1885
},
{
"epoch": 0.3055780113177041,
"grad_norm": 1.1530557801034091,
"learning_rate": 8.767637620195037e-06,
"loss": 0.7827,
"step": 1890
},
{
"epoch": 0.30638641875505257,
"grad_norm": 1.215930807426272,
"learning_rate": 8.758346064131824e-06,
"loss": 0.7849,
"step": 1895
},
{
"epoch": 0.30719482619240096,
"grad_norm": 1.2893329374001863,
"learning_rate": 8.749024575097347e-06,
"loss": 0.7688,
"step": 1900
},
{
"epoch": 0.3080032336297494,
"grad_norm": 1.1224840611348765,
"learning_rate": 8.739673227331671e-06,
"loss": 0.769,
"step": 1905
},
{
"epoch": 0.3088116410670978,
"grad_norm": 1.2783902442834993,
"learning_rate": 8.730292095312672e-06,
"loss": 0.7821,
"step": 1910
},
{
"epoch": 0.30962004850444624,
"grad_norm": 1.1486346702768837,
"learning_rate": 8.720881253755438e-06,
"loss": 0.7875,
"step": 1915
},
{
"epoch": 0.3104284559417947,
"grad_norm": 1.3327269392432468,
"learning_rate": 8.711440777611672e-06,
"loss": 0.7651,
"step": 1920
},
{
"epoch": 0.3112368633791431,
"grad_norm": 1.2580934477997214,
"learning_rate": 8.701970742069104e-06,
"loss": 0.7973,
"step": 1925
},
{
"epoch": 0.3120452708164915,
"grad_norm": 1.4391244366514906,
"learning_rate": 8.692471222550886e-06,
"loss": 0.773,
"step": 1930
},
{
"epoch": 0.3128536782538399,
"grad_norm": 1.1961637113375232,
"learning_rate": 8.68294229471499e-06,
"loss": 0.7892,
"step": 1935
},
{
"epoch": 0.31366208569118836,
"grad_norm": 1.3232907922114132,
"learning_rate": 8.673384034453606e-06,
"loss": 0.7524,
"step": 1940
},
{
"epoch": 0.3144704931285368,
"grad_norm": 1.288089316781721,
"learning_rate": 8.663796517892545e-06,
"loss": 0.7786,
"step": 1945
},
{
"epoch": 0.3152789005658852,
"grad_norm": 1.235383067935505,
"learning_rate": 8.65417982139062e-06,
"loss": 0.774,
"step": 1950
},
{
"epoch": 0.31608730800323365,
"grad_norm": 1.283153495299508,
"learning_rate": 8.644534021539053e-06,
"loss": 0.7825,
"step": 1955
},
{
"epoch": 0.31689571544058204,
"grad_norm": 1.216792856214731,
"learning_rate": 8.63485919516085e-06,
"loss": 0.7521,
"step": 1960
},
{
"epoch": 0.3177041228779305,
"grad_norm": 1.3530498421771213,
"learning_rate": 8.625155419310196e-06,
"loss": 0.7677,
"step": 1965
},
{
"epoch": 0.3185125303152789,
"grad_norm": 1.3306248413123443,
"learning_rate": 8.615422771271846e-06,
"loss": 0.7665,
"step": 1970
},
{
"epoch": 0.3193209377526273,
"grad_norm": 1.1191260051140974,
"learning_rate": 8.6056613285605e-06,
"loss": 0.7803,
"step": 1975
},
{
"epoch": 0.32012934518997577,
"grad_norm": 1.2579595003666681,
"learning_rate": 8.595871168920192e-06,
"loss": 0.7947,
"step": 1980
},
{
"epoch": 0.32093775262732416,
"grad_norm": 1.2493019784221522,
"learning_rate": 8.586052370323668e-06,
"loss": 0.7827,
"step": 1985
},
{
"epoch": 0.3217461600646726,
"grad_norm": 1.379586194222712,
"learning_rate": 8.57620501097177e-06,
"loss": 0.7958,
"step": 1990
},
{
"epoch": 0.322554567502021,
"grad_norm": 1.3364714838240768,
"learning_rate": 8.566329169292805e-06,
"loss": 0.7613,
"step": 1995
},
{
"epoch": 0.32336297493936944,
"grad_norm": 1.2559576604873097,
"learning_rate": 8.556424923941927e-06,
"loss": 0.7761,
"step": 2000
},
{
"epoch": 0.3241713823767179,
"grad_norm": 1.137717977458081,
"learning_rate": 8.546492353800504e-06,
"loss": 0.7714,
"step": 2005
},
{
"epoch": 0.3249797898140663,
"grad_norm": 1.1790974788016992,
"learning_rate": 8.536531537975502e-06,
"loss": 0.763,
"step": 2010
},
{
"epoch": 0.3257881972514147,
"grad_norm": 1.3052435801766198,
"learning_rate": 8.526542555798841e-06,
"loss": 0.7747,
"step": 2015
},
{
"epoch": 0.3265966046887631,
"grad_norm": 1.3641107292304886,
"learning_rate": 8.516525486826766e-06,
"loss": 0.7587,
"step": 2020
},
{
"epoch": 0.32740501212611156,
"grad_norm": 1.293959981732072,
"learning_rate": 8.506480410839226e-06,
"loss": 0.7666,
"step": 2025
},
{
"epoch": 0.32821341956346,
"grad_norm": 1.2669851763928037,
"learning_rate": 8.496407407839222e-06,
"loss": 0.7773,
"step": 2030
},
{
"epoch": 0.3290218270008084,
"grad_norm": 1.6431510329413388,
"learning_rate": 8.486306558052177e-06,
"loss": 0.7525,
"step": 2035
},
{
"epoch": 0.32983023443815684,
"grad_norm": 1.282525872896183,
"learning_rate": 8.476177941925304e-06,
"loss": 0.7761,
"step": 2040
},
{
"epoch": 0.33063864187550523,
"grad_norm": 1.4186889444880473,
"learning_rate": 8.466021640126946e-06,
"loss": 0.7763,
"step": 2045
},
{
"epoch": 0.3314470493128537,
"grad_norm": 1.2342391960980574,
"learning_rate": 8.455837733545958e-06,
"loss": 0.7787,
"step": 2050
},
{
"epoch": 0.3322554567502021,
"grad_norm": 1.1798583685640944,
"learning_rate": 8.445626303291042e-06,
"loss": 0.7651,
"step": 2055
},
{
"epoch": 0.3330638641875505,
"grad_norm": 1.4340728360934016,
"learning_rate": 8.435387430690114e-06,
"loss": 0.7739,
"step": 2060
},
{
"epoch": 0.33387227162489896,
"grad_norm": 1.2875171500390412,
"learning_rate": 8.425121197289651e-06,
"loss": 0.7625,
"step": 2065
},
{
"epoch": 0.33468067906224735,
"grad_norm": 1.36506923501983,
"learning_rate": 8.414827684854043e-06,
"loss": 0.7832,
"step": 2070
},
{
"epoch": 0.3354890864995958,
"grad_norm": 1.2183591792746593,
"learning_rate": 8.404506975364936e-06,
"loss": 0.7774,
"step": 2075
},
{
"epoch": 0.33629749393694425,
"grad_norm": 1.4107910163303898,
"learning_rate": 8.394159151020592e-06,
"loss": 0.7612,
"step": 2080
},
{
"epoch": 0.33710590137429264,
"grad_norm": 1.354922028103792,
"learning_rate": 8.383784294235223e-06,
"loss": 0.7563,
"step": 2085
},
{
"epoch": 0.3379143088116411,
"grad_norm": 1.3755048390820226,
"learning_rate": 8.373382487638336e-06,
"loss": 0.7636,
"step": 2090
},
{
"epoch": 0.3387227162489895,
"grad_norm": 1.2930084423245742,
"learning_rate": 8.36295381407408e-06,
"loss": 0.7693,
"step": 2095
},
{
"epoch": 0.3395311236863379,
"grad_norm": 1.378365146427395,
"learning_rate": 8.352498356600582e-06,
"loss": 0.7746,
"step": 2100
},
{
"epoch": 0.3403395311236863,
"grad_norm": 1.3177858986323527,
"learning_rate": 8.342016198489287e-06,
"loss": 0.7777,
"step": 2105
},
{
"epoch": 0.34114793856103476,
"grad_norm": 1.1946361921703772,
"learning_rate": 8.331507423224297e-06,
"loss": 0.758,
"step": 2110
},
{
"epoch": 0.3419563459983832,
"grad_norm": 1.2196165253989297,
"learning_rate": 8.320972114501698e-06,
"loss": 0.7717,
"step": 2115
},
{
"epoch": 0.3427647534357316,
"grad_norm": 1.5233528090812753,
"learning_rate": 8.310410356228905e-06,
"loss": 0.7643,
"step": 2120
},
{
"epoch": 0.34357316087308004,
"grad_norm": 1.3922351628505543,
"learning_rate": 8.299822232523983e-06,
"loss": 0.7652,
"step": 2125
},
{
"epoch": 0.34438156831042843,
"grad_norm": 1.299229036894897,
"learning_rate": 8.289207827714985e-06,
"loss": 0.7701,
"step": 2130
},
{
"epoch": 0.3451899757477769,
"grad_norm": 1.4076244701750404,
"learning_rate": 8.278567226339278e-06,
"loss": 0.7787,
"step": 2135
},
{
"epoch": 0.3459983831851253,
"grad_norm": 1.3491904132063997,
"learning_rate": 8.267900513142865e-06,
"loss": 0.7761,
"step": 2140
},
{
"epoch": 0.3468067906224737,
"grad_norm": 1.389818329274582,
"learning_rate": 8.257207773079717e-06,
"loss": 0.78,
"step": 2145
},
{
"epoch": 0.34761519805982216,
"grad_norm": 1.2993500493593475,
"learning_rate": 8.246489091311093e-06,
"loss": 0.7534,
"step": 2150
},
{
"epoch": 0.34842360549717055,
"grad_norm": 1.4908211704294125,
"learning_rate": 8.235744553204862e-06,
"loss": 0.7598,
"step": 2155
},
{
"epoch": 0.349232012934519,
"grad_norm": 1.3704141190016572,
"learning_rate": 8.22497424433482e-06,
"loss": 0.7882,
"step": 2160
},
{
"epoch": 0.35004042037186744,
"grad_norm": 1.4318928400255833,
"learning_rate": 8.214178250480018e-06,
"loss": 0.7743,
"step": 2165
},
{
"epoch": 0.35084882780921584,
"grad_norm": 1.3025693765735056,
"learning_rate": 8.20335665762407e-06,
"loss": 0.7513,
"step": 2170
},
{
"epoch": 0.3516572352465643,
"grad_norm": 1.5596677809789021,
"learning_rate": 8.192509551954464e-06,
"loss": 0.7587,
"step": 2175
},
{
"epoch": 0.35246564268391267,
"grad_norm": 1.4384756804728538,
"learning_rate": 8.181637019861894e-06,
"loss": 0.7594,
"step": 2180
},
{
"epoch": 0.3532740501212611,
"grad_norm": 1.323585903657254,
"learning_rate": 8.17073914793955e-06,
"loss": 0.7628,
"step": 2185
},
{
"epoch": 0.35408245755860956,
"grad_norm": 1.421616795580572,
"learning_rate": 8.159816022982448e-06,
"loss": 0.7483,
"step": 2190
},
{
"epoch": 0.35489086499595796,
"grad_norm": 1.64783406542589,
"learning_rate": 8.148867731986719e-06,
"loss": 0.758,
"step": 2195
},
{
"epoch": 0.3556992724333064,
"grad_norm": 1.3954883204221082,
"learning_rate": 8.137894362148932e-06,
"loss": 0.7557,
"step": 2200
},
{
"epoch": 0.3565076798706548,
"grad_norm": 1.4347324336354104,
"learning_rate": 8.126896000865396e-06,
"loss": 0.7727,
"step": 2205
},
{
"epoch": 0.35731608730800324,
"grad_norm": 1.3916989023657484,
"learning_rate": 8.115872735731456e-06,
"loss": 0.7805,
"step": 2210
},
{
"epoch": 0.3581244947453517,
"grad_norm": 1.3896721696095782,
"learning_rate": 8.104824654540808e-06,
"loss": 0.7756,
"step": 2215
},
{
"epoch": 0.3589329021827001,
"grad_norm": 1.5753672301918493,
"learning_rate": 8.093751845284788e-06,
"loss": 0.7444,
"step": 2220
},
{
"epoch": 0.3597413096200485,
"grad_norm": 1.4739360020234273,
"learning_rate": 8.082654396151676e-06,
"loss": 0.7407,
"step": 2225
},
{
"epoch": 0.3605497170573969,
"grad_norm": 1.6455271651692198,
"learning_rate": 8.071532395525997e-06,
"loss": 0.7664,
"step": 2230
},
{
"epoch": 0.36135812449474536,
"grad_norm": 1.3036154952077734,
"learning_rate": 8.060385931987813e-06,
"loss": 0.7829,
"step": 2235
},
{
"epoch": 0.36216653193209375,
"grad_norm": 1.58649380991059,
"learning_rate": 8.049215094312016e-06,
"loss": 0.777,
"step": 2240
},
{
"epoch": 0.3629749393694422,
"grad_norm": 1.426073276884676,
"learning_rate": 8.038019971467627e-06,
"loss": 0.7661,
"step": 2245
},
{
"epoch": 0.36378334680679064,
"grad_norm": 1.37105840839501,
"learning_rate": 8.026800652617082e-06,
"loss": 0.7627,
"step": 2250
},
{
"epoch": 0.36459175424413903,
"grad_norm": 1.3575348993109124,
"learning_rate": 8.01555722711552e-06,
"loss": 0.7595,
"step": 2255
},
{
"epoch": 0.3654001616814875,
"grad_norm": 1.3386365085409289,
"learning_rate": 8.004289784510085e-06,
"loss": 0.7521,
"step": 2260
},
{
"epoch": 0.36620856911883587,
"grad_norm": 1.4824129268204114,
"learning_rate": 7.992998414539192e-06,
"loss": 0.772,
"step": 2265
},
{
"epoch": 0.3670169765561843,
"grad_norm": 1.2895098918031378,
"learning_rate": 7.981683207131828e-06,
"loss": 0.7689,
"step": 2270
},
{
"epoch": 0.36782538399353276,
"grad_norm": 1.3803230665428012,
"learning_rate": 7.970344252406832e-06,
"loss": 0.7602,
"step": 2275
},
{
"epoch": 0.36863379143088115,
"grad_norm": 1.3275993845905512,
"learning_rate": 7.958981640672173e-06,
"loss": 0.7517,
"step": 2280
},
{
"epoch": 0.3694421988682296,
"grad_norm": 1.5336775968372514,
"learning_rate": 7.947595462424237e-06,
"loss": 0.7608,
"step": 2285
},
{
"epoch": 0.370250606305578,
"grad_norm": 1.324781077505621,
"learning_rate": 7.9361858083471e-06,
"loss": 0.7554,
"step": 2290
},
{
"epoch": 0.37105901374292644,
"grad_norm": 1.3538550136393135,
"learning_rate": 7.924752769311812e-06,
"loss": 0.752,
"step": 2295
},
{
"epoch": 0.3718674211802749,
"grad_norm": 1.6085089085439872,
"learning_rate": 7.913296436375669e-06,
"loss": 0.7346,
"step": 2300
},
{
"epoch": 0.3726758286176233,
"grad_norm": 1.8804974173920925,
"learning_rate": 7.901816900781487e-06,
"loss": 0.7623,
"step": 2305
},
{
"epoch": 0.3734842360549717,
"grad_norm": 1.5290904872516846,
"learning_rate": 7.89031425395688e-06,
"loss": 0.7481,
"step": 2310
},
{
"epoch": 0.3742926434923201,
"grad_norm": 1.2645807911489806,
"learning_rate": 7.87878858751353e-06,
"loss": 0.7398,
"step": 2315
},
{
"epoch": 0.37510105092966856,
"grad_norm": 1.430758612094428,
"learning_rate": 7.86723999324645e-06,
"loss": 0.762,
"step": 2320
},
{
"epoch": 0.375909458367017,
"grad_norm": 1.5523221259007942,
"learning_rate": 7.855668563133266e-06,
"loss": 0.7636,
"step": 2325
},
{
"epoch": 0.3767178658043654,
"grad_norm": 1.4039231457429788,
"learning_rate": 7.844074389333475e-06,
"loss": 0.7741,
"step": 2330
},
{
"epoch": 0.37752627324171384,
"grad_norm": 1.5800498692580414,
"learning_rate": 7.832457564187715e-06,
"loss": 0.7584,
"step": 2335
},
{
"epoch": 0.37833468067906223,
"grad_norm": 1.4231018707509073,
"learning_rate": 7.82081818021703e-06,
"loss": 0.7535,
"step": 2340
},
{
"epoch": 0.3791430881164107,
"grad_norm": 1.4043910558336434,
"learning_rate": 7.809156330122126e-06,
"loss": 0.7629,
"step": 2345
},
{
"epoch": 0.3799514955537591,
"grad_norm": 1.3535251605510783,
"learning_rate": 7.79747210678264e-06,
"loss": 0.7611,
"step": 2350
},
{
"epoch": 0.3807599029911075,
"grad_norm": 1.5600412590315658,
"learning_rate": 7.785765603256403e-06,
"loss": 0.7561,
"step": 2355
},
{
"epoch": 0.38156831042845596,
"grad_norm": 1.485858509627278,
"learning_rate": 7.774036912778693e-06,
"loss": 0.7689,
"step": 2360
},
{
"epoch": 0.38237671786580435,
"grad_norm": 1.455361982040585,
"learning_rate": 7.762286128761488e-06,
"loss": 0.7427,
"step": 2365
},
{
"epoch": 0.3831851253031528,
"grad_norm": 1.4379434832935507,
"learning_rate": 7.750513344792735e-06,
"loss": 0.7512,
"step": 2370
},
{
"epoch": 0.3839935327405012,
"grad_norm": 1.4392546100628072,
"learning_rate": 7.738718654635593e-06,
"loss": 0.7707,
"step": 2375
},
{
"epoch": 0.38480194017784963,
"grad_norm": 1.5225174077271784,
"learning_rate": 7.726902152227692e-06,
"loss": 0.7592,
"step": 2380
},
{
"epoch": 0.3856103476151981,
"grad_norm": 1.3542620619226433,
"learning_rate": 7.715063931680382e-06,
"loss": 0.755,
"step": 2385
},
{
"epoch": 0.38641875505254647,
"grad_norm": 1.4675073613366663,
"learning_rate": 7.703204087277989e-06,
"loss": 0.7487,
"step": 2390
},
{
"epoch": 0.3872271624898949,
"grad_norm": 1.525792717822675,
"learning_rate": 7.691322713477055e-06,
"loss": 0.7563,
"step": 2395
},
{
"epoch": 0.3880355699272433,
"grad_norm": 1.5306298024716511,
"learning_rate": 7.679419904905594e-06,
"loss": 0.7647,
"step": 2400
},
{
"epoch": 0.38884397736459175,
"grad_norm": 1.471740454637615,
"learning_rate": 7.667495756362333e-06,
"loss": 0.7466,
"step": 2405
},
{
"epoch": 0.3896523848019402,
"grad_norm": 1.433993552907344,
"learning_rate": 7.655550362815961e-06,
"loss": 0.7723,
"step": 2410
},
{
"epoch": 0.3904607922392886,
"grad_norm": 1.3890048044664887,
"learning_rate": 7.643583819404373e-06,
"loss": 0.7645,
"step": 2415
},
{
"epoch": 0.39126919967663704,
"grad_norm": 1.458717033811816,
"learning_rate": 7.631596221433903e-06,
"loss": 0.7438,
"step": 2420
},
{
"epoch": 0.3920776071139854,
"grad_norm": 1.5403786743607832,
"learning_rate": 7.619587664378576e-06,
"loss": 0.7583,
"step": 2425
},
{
"epoch": 0.3928860145513339,
"grad_norm": 1.5554771708646353,
"learning_rate": 7.607558243879345e-06,
"loss": 0.7568,
"step": 2430
},
{
"epoch": 0.3936944219886823,
"grad_norm": 1.4050877774215123,
"learning_rate": 7.595508055743327e-06,
"loss": 0.7318,
"step": 2435
},
{
"epoch": 0.3945028294260307,
"grad_norm": 1.4349627242310348,
"learning_rate": 7.583437195943038e-06,
"loss": 0.7466,
"step": 2440
},
{
"epoch": 0.39531123686337916,
"grad_norm": 1.362494365903188,
"learning_rate": 7.5713457606156335e-06,
"loss": 0.7541,
"step": 2445
},
{
"epoch": 0.39611964430072755,
"grad_norm": 1.6090479796008157,
"learning_rate": 7.5592338460621414e-06,
"loss": 0.7542,
"step": 2450
},
{
"epoch": 0.396928051738076,
"grad_norm": 1.4498197495892686,
"learning_rate": 7.547101548746694e-06,
"loss": 0.7683,
"step": 2455
},
{
"epoch": 0.39773645917542444,
"grad_norm": 1.5497610550177745,
"learning_rate": 7.534948965295759e-06,
"loss": 0.743,
"step": 2460
},
{
"epoch": 0.39854486661277283,
"grad_norm": 1.633197313139618,
"learning_rate": 7.5227761924973695e-06,
"loss": 0.7619,
"step": 2465
},
{
"epoch": 0.3993532740501213,
"grad_norm": 1.4063313558901078,
"learning_rate": 7.510583327300361e-06,
"loss": 0.757,
"step": 2470
},
{
"epoch": 0.40016168148746967,
"grad_norm": 1.416947436523921,
"learning_rate": 7.498370466813586e-06,
"loss": 0.7473,
"step": 2475
},
{
"epoch": 0.4009700889248181,
"grad_norm": 1.436986729489712,
"learning_rate": 7.4861377083051514e-06,
"loss": 0.7482,
"step": 2480
},
{
"epoch": 0.40177849636216656,
"grad_norm": 1.606273588901474,
"learning_rate": 7.473885149201636e-06,
"loss": 0.7499,
"step": 2485
},
{
"epoch": 0.40258690379951495,
"grad_norm": 1.516840232761339,
"learning_rate": 7.461612887087324e-06,
"loss": 0.7544,
"step": 2490
},
{
"epoch": 0.4033953112368634,
"grad_norm": 1.6762062202796544,
"learning_rate": 7.449321019703419e-06,
"loss": 0.7484,
"step": 2495
},
{
"epoch": 0.4042037186742118,
"grad_norm": 1.4007361159543554,
"learning_rate": 7.437009644947268e-06,
"loss": 0.7531,
"step": 2500
},
{
"epoch": 0.40501212611156023,
"grad_norm": 1.6334162039294893,
"learning_rate": 7.424678860871584e-06,
"loss": 0.7507,
"step": 2505
},
{
"epoch": 0.4058205335489086,
"grad_norm": 1.5489666316051582,
"learning_rate": 7.4123287656836625e-06,
"loss": 0.7466,
"step": 2510
},
{
"epoch": 0.40662894098625707,
"grad_norm": 1.521691255397242,
"learning_rate": 7.399959457744603e-06,
"loss": 0.7441,
"step": 2515
},
{
"epoch": 0.4074373484236055,
"grad_norm": 1.5151056129411498,
"learning_rate": 7.387571035568523e-06,
"loss": 0.7535,
"step": 2520
},
{
"epoch": 0.4082457558609539,
"grad_norm": 1.5961542357400647,
"learning_rate": 7.375163597821766e-06,
"loss": 0.7738,
"step": 2525
},
{
"epoch": 0.40905416329830235,
"grad_norm": 1.747860345661276,
"learning_rate": 7.362737243322132e-06,
"loss": 0.7298,
"step": 2530
},
{
"epoch": 0.40986257073565074,
"grad_norm": 1.3979379187367524,
"learning_rate": 7.350292071038079e-06,
"loss": 0.7421,
"step": 2535
},
{
"epoch": 0.4106709781729992,
"grad_norm": 1.5286299117288702,
"learning_rate": 7.337828180087934e-06,
"loss": 0.7606,
"step": 2540
},
{
"epoch": 0.41147938561034764,
"grad_norm": 1.482169171151244,
"learning_rate": 7.3253456697391145e-06,
"loss": 0.7534,
"step": 2545
},
{
"epoch": 0.412287793047696,
"grad_norm": 1.507882701454434,
"learning_rate": 7.3128446394073216e-06,
"loss": 0.7617,
"step": 2550
},
{
"epoch": 0.4130962004850445,
"grad_norm": 1.3817103348743367,
"learning_rate": 7.300325188655762e-06,
"loss": 0.7612,
"step": 2555
},
{
"epoch": 0.41390460792239286,
"grad_norm": 1.449395598404693,
"learning_rate": 7.287787417194348e-06,
"loss": 0.7467,
"step": 2560
},
{
"epoch": 0.4147130153597413,
"grad_norm": 1.6097878448442098,
"learning_rate": 7.275231424878906e-06,
"loss": 0.7833,
"step": 2565
},
{
"epoch": 0.41552142279708976,
"grad_norm": 1.4604575128666755,
"learning_rate": 7.262657311710383e-06,
"loss": 0.7547,
"step": 2570
},
{
"epoch": 0.41632983023443815,
"grad_norm": 1.4553068247385297,
"learning_rate": 7.2500651778340425e-06,
"loss": 0.7272,
"step": 2575
},
{
"epoch": 0.4171382376717866,
"grad_norm": 1.5580605108332357,
"learning_rate": 7.237455123538678e-06,
"loss": 0.7622,
"step": 2580
},
{
"epoch": 0.417946645109135,
"grad_norm": 1.397575829167641,
"learning_rate": 7.224827249255804e-06,
"loss": 0.7439,
"step": 2585
},
{
"epoch": 0.41875505254648343,
"grad_norm": 1.7002294742720083,
"learning_rate": 7.212181655558863e-06,
"loss": 0.7463,
"step": 2590
},
{
"epoch": 0.4195634599838319,
"grad_norm": 1.5013399985702793,
"learning_rate": 7.199518443162419e-06,
"loss": 0.7527,
"step": 2595
},
{
"epoch": 0.42037186742118027,
"grad_norm": 1.619027591523225,
"learning_rate": 7.186837712921362e-06,
"loss": 0.7536,
"step": 2600
},
{
"epoch": 0.4211802748585287,
"grad_norm": 1.5305697974020676,
"learning_rate": 7.174139565830098e-06,
"loss": 0.7551,
"step": 2605
},
{
"epoch": 0.4219886822958771,
"grad_norm": 1.7269161713804273,
"learning_rate": 7.161424103021752e-06,
"loss": 0.7676,
"step": 2610
},
{
"epoch": 0.42279708973322555,
"grad_norm": 1.5491570246039656,
"learning_rate": 7.148691425767354e-06,
"loss": 0.7314,
"step": 2615
},
{
"epoch": 0.423605497170574,
"grad_norm": 1.7174735150190135,
"learning_rate": 7.1359416354750365e-06,
"loss": 0.7291,
"step": 2620
},
{
"epoch": 0.4244139046079224,
"grad_norm": 1.5540965935630928,
"learning_rate": 7.12317483368923e-06,
"loss": 0.7572,
"step": 2625
},
{
"epoch": 0.42522231204527083,
"grad_norm": 1.449181152489211,
"learning_rate": 7.1103911220898544e-06,
"loss": 0.743,
"step": 2630
},
{
"epoch": 0.4260307194826192,
"grad_norm": 1.652128164913507,
"learning_rate": 7.097590602491495e-06,
"loss": 0.7619,
"step": 2635
},
{
"epoch": 0.42683912691996767,
"grad_norm": 1.5607840252827987,
"learning_rate": 7.084773376842615e-06,
"loss": 0.748,
"step": 2640
},
{
"epoch": 0.42764753435731606,
"grad_norm": 1.6399087632058216,
"learning_rate": 7.0719395472247225e-06,
"loss": 0.7618,
"step": 2645
},
{
"epoch": 0.4284559417946645,
"grad_norm": 1.6180383108334129,
"learning_rate": 7.05908921585157e-06,
"loss": 0.7473,
"step": 2650
},
{
"epoch": 0.42926434923201295,
"grad_norm": 1.8684396820882947,
"learning_rate": 7.046222485068339e-06,
"loss": 0.7198,
"step": 2655
},
{
"epoch": 0.43007275666936134,
"grad_norm": 1.5146248805420086,
"learning_rate": 7.0333394573508185e-06,
"loss": 0.7504,
"step": 2660
},
{
"epoch": 0.4308811641067098,
"grad_norm": 1.6657404552018036,
"learning_rate": 7.020440235304593e-06,
"loss": 0.7469,
"step": 2665
},
{
"epoch": 0.4316895715440582,
"grad_norm": 1.597433899337064,
"learning_rate": 7.007524921664226e-06,
"loss": 0.7218,
"step": 2670
},
{
"epoch": 0.43249797898140663,
"grad_norm": 1.6034447006676202,
"learning_rate": 6.994593619292441e-06,
"loss": 0.7484,
"step": 2675
},
{
"epoch": 0.4333063864187551,
"grad_norm": 1.4825714570898108,
"learning_rate": 6.981646431179304e-06,
"loss": 0.7515,
"step": 2680
},
{
"epoch": 0.43411479385610346,
"grad_norm": 1.6102467057772276,
"learning_rate": 6.968683460441398e-06,
"loss": 0.7426,
"step": 2685
},
{
"epoch": 0.4349232012934519,
"grad_norm": 1.6012582102326722,
"learning_rate": 6.9557048103210065e-06,
"loss": 0.7158,
"step": 2690
},
{
"epoch": 0.4357316087308003,
"grad_norm": 1.5017101591262598,
"learning_rate": 6.942710584185292e-06,
"loss": 0.7265,
"step": 2695
},
{
"epoch": 0.43654001616814875,
"grad_norm": 1.491162425876339,
"learning_rate": 6.929700885525466e-06,
"loss": 0.7296,
"step": 2700
},
{
"epoch": 0.4373484236054972,
"grad_norm": 1.618547335353381,
"learning_rate": 6.916675817955973e-06,
"loss": 0.7587,
"step": 2705
},
{
"epoch": 0.4381568310428456,
"grad_norm": 1.5583303827019015,
"learning_rate": 6.9036354852136625e-06,
"loss": 0.763,
"step": 2710
},
{
"epoch": 0.43896523848019403,
"grad_norm": 1.5580614184472952,
"learning_rate": 6.890579991156958e-06,
"loss": 0.7393,
"step": 2715
},
{
"epoch": 0.4397736459175424,
"grad_norm": 1.6599720531878508,
"learning_rate": 6.8775094397650375e-06,
"loss": 0.7413,
"step": 2720
},
{
"epoch": 0.44058205335489087,
"grad_norm": 1.6475335959819555,
"learning_rate": 6.864423935136999e-06,
"loss": 0.7319,
"step": 2725
},
{
"epoch": 0.4413904607922393,
"grad_norm": 1.6868001230472809,
"learning_rate": 6.851323581491034e-06,
"loss": 0.7317,
"step": 2730
},
{
"epoch": 0.4421988682295877,
"grad_norm": 1.7072560513083037,
"learning_rate": 6.838208483163601e-06,
"loss": 0.7502,
"step": 2735
},
{
"epoch": 0.44300727566693615,
"grad_norm": 1.4703001017567308,
"learning_rate": 6.825078744608589e-06,
"loss": 0.7497,
"step": 2740
},
{
"epoch": 0.44381568310428454,
"grad_norm": 1.6393092026980918,
"learning_rate": 6.811934470396484e-06,
"loss": 0.7306,
"step": 2745
},
{
"epoch": 0.444624090541633,
"grad_norm": 1.6755312874196466,
"learning_rate": 6.7987757652135456e-06,
"loss": 0.739,
"step": 2750
},
{
"epoch": 0.44543249797898143,
"grad_norm": 1.6202018958792113,
"learning_rate": 6.785602733860963e-06,
"loss": 0.7381,
"step": 2755
},
{
"epoch": 0.4462409054163298,
"grad_norm": 1.5932719199258238,
"learning_rate": 6.77241548125403e-06,
"loss": 0.7329,
"step": 2760
},
{
"epoch": 0.44704931285367827,
"grad_norm": 1.7002494620385344,
"learning_rate": 6.759214112421297e-06,
"loss": 0.7509,
"step": 2765
},
{
"epoch": 0.44785772029102666,
"grad_norm": 1.8232932108091804,
"learning_rate": 6.745998732503749e-06,
"loss": 0.7465,
"step": 2770
},
{
"epoch": 0.4486661277283751,
"grad_norm": 1.8027285940055193,
"learning_rate": 6.732769446753954e-06,
"loss": 0.7512,
"step": 2775
},
{
"epoch": 0.4494745351657235,
"grad_norm": 1.476838340418493,
"learning_rate": 6.719526360535238e-06,
"loss": 0.7478,
"step": 2780
},
{
"epoch": 0.45028294260307195,
"grad_norm": 1.6030014859566264,
"learning_rate": 6.706269579320834e-06,
"loss": 0.7491,
"step": 2785
},
{
"epoch": 0.4510913500404204,
"grad_norm": 2.0330037226973148,
"learning_rate": 6.6929992086930515e-06,
"loss": 0.7374,
"step": 2790
},
{
"epoch": 0.4518997574777688,
"grad_norm": 1.5613281088896387,
"learning_rate": 6.6797153543424285e-06,
"loss": 0.7342,
"step": 2795
},
{
"epoch": 0.45270816491511723,
"grad_norm": 1.5249975129037057,
"learning_rate": 6.666418122066896e-06,
"loss": 0.7227,
"step": 2800
},
{
"epoch": 0.4535165723524656,
"grad_norm": 1.6452739375240444,
"learning_rate": 6.653107617770928e-06,
"loss": 0.754,
"step": 2805
},
{
"epoch": 0.45432497978981407,
"grad_norm": 1.6396256738334043,
"learning_rate": 6.639783947464707e-06,
"loss": 0.7337,
"step": 2810
},
{
"epoch": 0.4551333872271625,
"grad_norm": 1.8559299829849996,
"learning_rate": 6.626447217263269e-06,
"loss": 0.7486,
"step": 2815
},
{
"epoch": 0.4559417946645109,
"grad_norm": 1.724861765792677,
"learning_rate": 6.613097533385671e-06,
"loss": 0.729,
"step": 2820
},
{
"epoch": 0.45675020210185935,
"grad_norm": 1.6120913915780044,
"learning_rate": 6.599735002154133e-06,
"loss": 0.7246,
"step": 2825
},
{
"epoch": 0.45755860953920774,
"grad_norm": 1.5561331554910032,
"learning_rate": 6.5863597299932e-06,
"loss": 0.7424,
"step": 2830
},
{
"epoch": 0.4583670169765562,
"grad_norm": 1.5756166020131763,
"learning_rate": 6.572971823428885e-06,
"loss": 0.736,
"step": 2835
},
{
"epoch": 0.45917542441390463,
"grad_norm": 1.5901722292065572,
"learning_rate": 6.559571389087834e-06,
"loss": 0.7277,
"step": 2840
},
{
"epoch": 0.459983831851253,
"grad_norm": 1.8199798497598791,
"learning_rate": 6.546158533696465e-06,
"loss": 0.7521,
"step": 2845
},
{
"epoch": 0.46079223928860147,
"grad_norm": 1.6353472073783677,
"learning_rate": 6.532733364080126e-06,
"loss": 0.7558,
"step": 2850
},
{
"epoch": 0.46160064672594986,
"grad_norm": 1.6335140922701896,
"learning_rate": 6.519295987162232e-06,
"loss": 0.7401,
"step": 2855
},
{
"epoch": 0.4624090541632983,
"grad_norm": 1.751367401482544,
"learning_rate": 6.50584650996343e-06,
"loss": 0.7434,
"step": 2860
},
{
"epoch": 0.46321746160064675,
"grad_norm": 1.566395823390269,
"learning_rate": 6.492385039600735e-06,
"loss": 0.7803,
"step": 2865
},
{
"epoch": 0.46402586903799514,
"grad_norm": 1.5360516049112365,
"learning_rate": 6.4789116832866834e-06,
"loss": 0.7587,
"step": 2870
},
{
"epoch": 0.4648342764753436,
"grad_norm": 1.819847858777574,
"learning_rate": 6.465426548328473e-06,
"loss": 0.7478,
"step": 2875
},
{
"epoch": 0.465642683912692,
"grad_norm": 1.5385023230471138,
"learning_rate": 6.451929742127109e-06,
"loss": 0.7337,
"step": 2880
},
{
"epoch": 0.4664510913500404,
"grad_norm": 1.4847036423315738,
"learning_rate": 6.4384213721765565e-06,
"loss": 0.7367,
"step": 2885
},
{
"epoch": 0.46725949878738887,
"grad_norm": 1.6452154053274373,
"learning_rate": 6.424901546062878e-06,
"loss": 0.7464,
"step": 2890
},
{
"epoch": 0.46806790622473726,
"grad_norm": 1.891586315433217,
"learning_rate": 6.411370371463373e-06,
"loss": 0.7587,
"step": 2895
},
{
"epoch": 0.4688763136620857,
"grad_norm": 1.6847638248373114,
"learning_rate": 6.397827956145732e-06,
"loss": 0.757,
"step": 2900
},
{
"epoch": 0.4696847210994341,
"grad_norm": 1.935186064874233,
"learning_rate": 6.3842744079671634e-06,
"loss": 0.7285,
"step": 2905
},
{
"epoch": 0.47049312853678255,
"grad_norm": 1.6081940113542759,
"learning_rate": 6.370709834873547e-06,
"loss": 0.7466,
"step": 2910
},
{
"epoch": 0.47130153597413094,
"grad_norm": 1.9116295949365476,
"learning_rate": 6.35713434489857e-06,
"loss": 0.72,
"step": 2915
},
{
"epoch": 0.4721099434114794,
"grad_norm": 1.775823518041551,
"learning_rate": 6.343548046162863e-06,
"loss": 0.7538,
"step": 2920
},
{
"epoch": 0.47291835084882783,
"grad_norm": 1.62571587035583,
"learning_rate": 6.329951046873143e-06,
"loss": 0.7426,
"step": 2925
},
{
"epoch": 0.4737267582861762,
"grad_norm": 1.774624905090093,
"learning_rate": 6.31634345532135e-06,
"loss": 0.718,
"step": 2930
},
{
"epoch": 0.47453516572352467,
"grad_norm": 1.6468612905160713,
"learning_rate": 6.302725379883787e-06,
"loss": 0.7293,
"step": 2935
},
{
"epoch": 0.47534357316087306,
"grad_norm": 1.6068150290028567,
"learning_rate": 6.289096929020254e-06,
"loss": 0.7227,
"step": 2940
},
{
"epoch": 0.4761519805982215,
"grad_norm": 1.821341348490976,
"learning_rate": 6.275458211273182e-06,
"loss": 0.7291,
"step": 2945
},
{
"epoch": 0.47696038803556995,
"grad_norm": 1.646392168409669,
"learning_rate": 6.261809335266776e-06,
"loss": 0.7588,
"step": 2950
},
{
"epoch": 0.47776879547291834,
"grad_norm": 1.4998598776984355,
"learning_rate": 6.248150409706144e-06,
"loss": 0.7431,
"step": 2955
},
{
"epoch": 0.4785772029102668,
"grad_norm": 1.6291849374923184,
"learning_rate": 6.234481543376433e-06,
"loss": 0.7494,
"step": 2960
},
{
"epoch": 0.4793856103476152,
"grad_norm": 1.6806295233872666,
"learning_rate": 6.2208028451419575e-06,
"loss": 0.7506,
"step": 2965
},
{
"epoch": 0.4801940177849636,
"grad_norm": 1.788703909711479,
"learning_rate": 6.207114423945346e-06,
"loss": 0.7391,
"step": 2970
},
{
"epoch": 0.48100242522231207,
"grad_norm": 1.7460679090246425,
"learning_rate": 6.193416388806655e-06,
"loss": 0.7512,
"step": 2975
},
{
"epoch": 0.48181083265966046,
"grad_norm": 1.7991177181949694,
"learning_rate": 6.179708848822521e-06,
"loss": 0.7494,
"step": 2980
},
{
"epoch": 0.4826192400970089,
"grad_norm": 1.6195605598787102,
"learning_rate": 6.165991913165271e-06,
"loss": 0.7395,
"step": 2985
},
{
"epoch": 0.4834276475343573,
"grad_norm": 1.9898749874558108,
"learning_rate": 6.152265691082067e-06,
"loss": 0.7169,
"step": 2990
},
{
"epoch": 0.48423605497170574,
"grad_norm": 1.8398403882057845,
"learning_rate": 6.138530291894033e-06,
"loss": 0.7584,
"step": 2995
},
{
"epoch": 0.4850444624090542,
"grad_norm": 1.828005720680138,
"learning_rate": 6.124785824995381e-06,
"loss": 0.7314,
"step": 3000
},
{
"epoch": 0.4858528698464026,
"grad_norm": 1.7421782056931043,
"learning_rate": 6.111032399852542e-06,
"loss": 0.7388,
"step": 3005
},
{
"epoch": 0.486661277283751,
"grad_norm": 1.6022841844735267,
"learning_rate": 6.097270126003297e-06,
"loss": 0.7241,
"step": 3010
},
{
"epoch": 0.4874696847210994,
"grad_norm": 1.743402917972022,
"learning_rate": 6.083499113055897e-06,
"loss": 0.7354,
"step": 3015
},
{
"epoch": 0.48827809215844786,
"grad_norm": 1.4072740337152898,
"learning_rate": 6.069719470688199e-06,
"loss": 0.7334,
"step": 3020
},
{
"epoch": 0.4890864995957963,
"grad_norm": 1.8931792386123252,
"learning_rate": 6.0559313086467854e-06,
"loss": 0.7301,
"step": 3025
},
{
"epoch": 0.4898949070331447,
"grad_norm": 1.5281809673914062,
"learning_rate": 6.042134736746093e-06,
"loss": 0.7324,
"step": 3030
},
{
"epoch": 0.49070331447049315,
"grad_norm": 1.856573916290289,
"learning_rate": 6.028329864867538e-06,
"loss": 0.7324,
"step": 3035
},
{
"epoch": 0.49151172190784154,
"grad_norm": 2.038603374836649,
"learning_rate": 6.0145168029586434e-06,
"loss": 0.7276,
"step": 3040
},
{
"epoch": 0.49232012934519,
"grad_norm": 1.9183080921170146,
"learning_rate": 6.000695661032158e-06,
"loss": 0.7344,
"step": 3045
},
{
"epoch": 0.4931285367825384,
"grad_norm": 1.6918091903565058,
"learning_rate": 5.986866549165185e-06,
"loss": 0.7121,
"step": 3050
},
{
"epoch": 0.4939369442198868,
"grad_norm": 1.467542289658805,
"learning_rate": 5.9730295774983e-06,
"loss": 0.7412,
"step": 3055
},
{
"epoch": 0.49474535165723527,
"grad_norm": 1.6510602277072752,
"learning_rate": 5.959184856234681e-06,
"loss": 0.7089,
"step": 3060
},
{
"epoch": 0.49555375909458366,
"grad_norm": 1.5658481426672775,
"learning_rate": 5.9453324956392264e-06,
"loss": 0.7382,
"step": 3065
},
{
"epoch": 0.4963621665319321,
"grad_norm": 1.810370657979415,
"learning_rate": 5.931472606037677e-06,
"loss": 0.7269,
"step": 3070
},
{
"epoch": 0.4971705739692805,
"grad_norm": 2.1265390970725675,
"learning_rate": 5.917605297815736e-06,
"loss": 0.7319,
"step": 3075
},
{
"epoch": 0.49797898140662894,
"grad_norm": 1.9115609327468914,
"learning_rate": 5.903730681418191e-06,
"loss": 0.7489,
"step": 3080
},
{
"epoch": 0.4987873888439774,
"grad_norm": 1.7058868208634674,
"learning_rate": 5.8898488673480385e-06,
"loss": 0.7291,
"step": 3085
},
{
"epoch": 0.4995957962813258,
"grad_norm": 1.5461578620231866,
"learning_rate": 5.8759599661655975e-06,
"loss": 0.7216,
"step": 3090
},
{
"epoch": 0.5004042037186742,
"grad_norm": 1.7707716709348011,
"learning_rate": 5.862064088487632e-06,
"loss": 0.7209,
"step": 3095
},
{
"epoch": 0.5012126111560227,
"grad_norm": 1.6647808647832354,
"learning_rate": 5.8481613449864695e-06,
"loss": 0.733,
"step": 3100
},
{
"epoch": 0.502021018593371,
"grad_norm": 1.9301899340867452,
"learning_rate": 5.8342518463891195e-06,
"loss": 0.7321,
"step": 3105
},
{
"epoch": 0.5028294260307195,
"grad_norm": 1.8541494136601961,
"learning_rate": 5.820335703476394e-06,
"loss": 0.7195,
"step": 3110
},
{
"epoch": 0.5036378334680679,
"grad_norm": 1.684397088706739,
"learning_rate": 5.806413027082018e-06,
"loss": 0.736,
"step": 3115
},
{
"epoch": 0.5044462409054163,
"grad_norm": 1.838888999222473,
"learning_rate": 5.792483928091759e-06,
"loss": 0.7188,
"step": 3120
},
{
"epoch": 0.5052546483427648,
"grad_norm": 1.6586266130026301,
"learning_rate": 5.7785485174425285e-06,
"loss": 0.7341,
"step": 3125
},
{
"epoch": 0.5060630557801131,
"grad_norm": 1.6053396069937373,
"learning_rate": 5.764606906121513e-06,
"loss": 0.7415,
"step": 3130
},
{
"epoch": 0.5068714632174616,
"grad_norm": 1.8485601800302767,
"learning_rate": 5.75065920516528e-06,
"loss": 0.7358,
"step": 3135
},
{
"epoch": 0.50767987065481,
"grad_norm": 1.7801558687500054,
"learning_rate": 5.7367055256589e-06,
"loss": 0.7389,
"step": 3140
},
{
"epoch": 0.5084882780921585,
"grad_norm": 1.7682829788110874,
"learning_rate": 5.722745978735056e-06,
"loss": 0.7463,
"step": 3145
},
{
"epoch": 0.5092966855295069,
"grad_norm": 2.0310097528031514,
"learning_rate": 5.708780675573163e-06,
"loss": 0.7495,
"step": 3150
},
{
"epoch": 0.5101050929668552,
"grad_norm": 1.6939653960527117,
"learning_rate": 5.694809727398483e-06,
"loss": 0.735,
"step": 3155
},
{
"epoch": 0.5109135004042037,
"grad_norm": 1.4907854480309246,
"learning_rate": 5.680833245481234e-06,
"loss": 0.7112,
"step": 3160
},
{
"epoch": 0.5117219078415521,
"grad_norm": 1.5916645160260514,
"learning_rate": 5.666851341135706e-06,
"loss": 0.7314,
"step": 3165
},
{
"epoch": 0.5125303152789006,
"grad_norm": 1.7263945402809162,
"learning_rate": 5.652864125719382e-06,
"loss": 0.7453,
"step": 3170
},
{
"epoch": 0.513338722716249,
"grad_norm": 1.7250369660017415,
"learning_rate": 5.638871710632037e-06,
"loss": 0.7499,
"step": 3175
},
{
"epoch": 0.5141471301535974,
"grad_norm": 1.66803893376865,
"learning_rate": 5.624874207314861e-06,
"loss": 0.7165,
"step": 3180
},
{
"epoch": 0.5149555375909458,
"grad_norm": 1.9530837325477433,
"learning_rate": 5.61087172724957e-06,
"loss": 0.751,
"step": 3185
},
{
"epoch": 0.5157639450282943,
"grad_norm": 1.7036232716696973,
"learning_rate": 5.596864381957514e-06,
"loss": 0.7072,
"step": 3190
},
{
"epoch": 0.5165723524656427,
"grad_norm": 2.0287938769391585,
"learning_rate": 5.5828522829987965e-06,
"loss": 0.7456,
"step": 3195
},
{
"epoch": 0.5173807599029911,
"grad_norm": 1.961486108448002,
"learning_rate": 5.5688355419713766e-06,
"loss": 0.729,
"step": 3200
},
{
"epoch": 0.5181891673403395,
"grad_norm": 1.747396445262512,
"learning_rate": 5.554814270510185e-06,
"loss": 0.7428,
"step": 3205
},
{
"epoch": 0.5189975747776879,
"grad_norm": 1.9789250758812496,
"learning_rate": 5.540788580286236e-06,
"loss": 0.7216,
"step": 3210
},
{
"epoch": 0.5198059822150364,
"grad_norm": 1.7067424322279225,
"learning_rate": 5.526758583005736e-06,
"loss": 0.7388,
"step": 3215
},
{
"epoch": 0.5206143896523848,
"grad_norm": 1.530137654068918,
"learning_rate": 5.512724390409197e-06,
"loss": 0.7456,
"step": 3220
},
{
"epoch": 0.5214227970897333,
"grad_norm": 1.9188410439442418,
"learning_rate": 5.4986861142705396e-06,
"loss": 0.7257,
"step": 3225
},
{
"epoch": 0.5222312045270816,
"grad_norm": 1.4829883700282378,
"learning_rate": 5.484643866396211e-06,
"loss": 0.7231,
"step": 3230
},
{
"epoch": 0.52303961196443,
"grad_norm": 1.9423854124597335,
"learning_rate": 5.47059775862429e-06,
"loss": 0.7327,
"step": 3235
},
{
"epoch": 0.5238480194017785,
"grad_norm": 1.585692364516504,
"learning_rate": 5.456547902823596e-06,
"loss": 0.7095,
"step": 3240
},
{
"epoch": 0.5246564268391269,
"grad_norm": 1.6935294419570495,
"learning_rate": 5.4424944108928005e-06,
"loss": 0.7176,
"step": 3245
},
{
"epoch": 0.5254648342764754,
"grad_norm": 1.9443703299521362,
"learning_rate": 5.428437394759534e-06,
"loss": 0.7548,
"step": 3250
},
{
"epoch": 0.5262732417138237,
"grad_norm": 1.8920530979032508,
"learning_rate": 5.414376966379494e-06,
"loss": 0.7295,
"step": 3255
},
{
"epoch": 0.5270816491511722,
"grad_norm": 1.742331924607013,
"learning_rate": 5.4003132377355594e-06,
"loss": 0.7507,
"step": 3260
},
{
"epoch": 0.5278900565885206,
"grad_norm": 1.8552434605554495,
"learning_rate": 5.386246320836887e-06,
"loss": 0.7311,
"step": 3265
},
{
"epoch": 0.5286984640258691,
"grad_norm": 1.804743780018075,
"learning_rate": 5.372176327718029e-06,
"loss": 0.7357,
"step": 3270
},
{
"epoch": 0.5295068714632175,
"grad_norm": 1.6662837706239626,
"learning_rate": 5.35810337043804e-06,
"loss": 0.7281,
"step": 3275
},
{
"epoch": 0.5303152789005658,
"grad_norm": 1.6292430108827105,
"learning_rate": 5.34402756107958e-06,
"loss": 0.7355,
"step": 3280
},
{
"epoch": 0.5311236863379143,
"grad_norm": 1.7350237488555562,
"learning_rate": 5.3299490117480245e-06,
"loss": 0.7472,
"step": 3285
},
{
"epoch": 0.5319320937752627,
"grad_norm": 1.7851311336238955,
"learning_rate": 5.315867834570573e-06,
"loss": 0.7263,
"step": 3290
},
{
"epoch": 0.5327405012126112,
"grad_norm": 1.6356935298013957,
"learning_rate": 5.301784141695348e-06,
"loss": 0.7409,
"step": 3295
},
{
"epoch": 0.5335489086499596,
"grad_norm": 1.719781130457011,
"learning_rate": 5.287698045290514e-06,
"loss": 0.7433,
"step": 3300
},
{
"epoch": 0.534357316087308,
"grad_norm": 2.003270367765294,
"learning_rate": 5.2736096575433805e-06,
"loss": 0.7356,
"step": 3305
},
{
"epoch": 0.5351657235246564,
"grad_norm": 1.4439742700866685,
"learning_rate": 5.2595190906595e-06,
"loss": 0.7364,
"step": 3310
},
{
"epoch": 0.5359741309620049,
"grad_norm": 1.740158385555015,
"learning_rate": 5.2454264568617815e-06,
"loss": 0.7312,
"step": 3315
},
{
"epoch": 0.5367825383993533,
"grad_norm": 1.9626910345859885,
"learning_rate": 5.231331868389599e-06,
"loss": 0.7503,
"step": 3320
},
{
"epoch": 0.5375909458367018,
"grad_norm": 1.68124601360505,
"learning_rate": 5.2172354374978905e-06,
"loss": 0.7406,
"step": 3325
},
{
"epoch": 0.5383993532740501,
"grad_norm": 2.1050005093144994,
"learning_rate": 5.203137276456272e-06,
"loss": 0.7235,
"step": 3330
},
{
"epoch": 0.5392077607113985,
"grad_norm": 1.8947511081977626,
"learning_rate": 5.189037497548136e-06,
"loss": 0.7267,
"step": 3335
},
{
"epoch": 0.540016168148747,
"grad_norm": 1.8668010593718123,
"learning_rate": 5.174936213069761e-06,
"loss": 0.7309,
"step": 3340
},
{
"epoch": 0.5408245755860954,
"grad_norm": 1.9553385282357956,
"learning_rate": 5.160833535329417e-06,
"loss": 0.7292,
"step": 3345
},
{
"epoch": 0.5416329830234439,
"grad_norm": 1.5662310478755188,
"learning_rate": 5.146729576646469e-06,
"loss": 0.7083,
"step": 3350
},
{
"epoch": 0.5424413904607922,
"grad_norm": 1.982515442376847,
"learning_rate": 5.132624449350486e-06,
"loss": 0.7473,
"step": 3355
},
{
"epoch": 0.5432497978981407,
"grad_norm": 1.696121107818326,
"learning_rate": 5.118518265780343e-06,
"loss": 0.7127,
"step": 3360
},
{
"epoch": 0.5440582053354891,
"grad_norm": 1.792047111863481,
"learning_rate": 5.1044111382833284e-06,
"loss": 0.7315,
"step": 3365
},
{
"epoch": 0.5448666127728375,
"grad_norm": 1.6410942741564183,
"learning_rate": 5.090303179214248e-06,
"loss": 0.7202,
"step": 3370
},
{
"epoch": 0.5456750202101859,
"grad_norm": 1.7980781288846641,
"learning_rate": 5.0761945009345295e-06,
"loss": 0.708,
"step": 3375
},
{
"epoch": 0.5464834276475343,
"grad_norm": 1.7891594579241739,
"learning_rate": 5.06208521581133e-06,
"loss": 0.739,
"step": 3380
},
{
"epoch": 0.5472918350848828,
"grad_norm": 1.8996907092180035,
"learning_rate": 5.04797543621664e-06,
"loss": 0.7259,
"step": 3385
},
{
"epoch": 0.5481002425222312,
"grad_norm": 1.7441485524528433,
"learning_rate": 5.033865274526388e-06,
"loss": 0.7234,
"step": 3390
},
{
"epoch": 0.5489086499595797,
"grad_norm": 1.8800934694644797,
"learning_rate": 5.019754843119544e-06,
"loss": 0.718,
"step": 3395
},
{
"epoch": 0.549717057396928,
"grad_norm": 1.639197995446039,
"learning_rate": 5.00564425437723e-06,
"loss": 0.7505,
"step": 3400
},
{
"epoch": 0.5505254648342764,
"grad_norm": 1.7066220016466764,
"learning_rate": 4.991533620681814e-06,
"loss": 0.6972,
"step": 3405
},
{
"epoch": 0.5513338722716249,
"grad_norm": 1.6540407961792254,
"learning_rate": 4.977423054416031e-06,
"loss": 0.7369,
"step": 3410
},
{
"epoch": 0.5521422797089733,
"grad_norm": 2.3849302708294666,
"learning_rate": 4.963312667962072e-06,
"loss": 0.737,
"step": 3415
},
{
"epoch": 0.5529506871463218,
"grad_norm": 1.6980580304581847,
"learning_rate": 4.949202573700699e-06,
"loss": 0.7243,
"step": 3420
},
{
"epoch": 0.5537590945836701,
"grad_norm": 1.8442646934282483,
"learning_rate": 4.935092884010347e-06,
"loss": 0.7174,
"step": 3425
},
{
"epoch": 0.5545675020210186,
"grad_norm": 1.7357839718219499,
"learning_rate": 4.920983711266225e-06,
"loss": 0.7252,
"step": 3430
},
{
"epoch": 0.555375909458367,
"grad_norm": 2.068408711770871,
"learning_rate": 4.906875167839433e-06,
"loss": 0.7427,
"step": 3435
},
{
"epoch": 0.5561843168957155,
"grad_norm": 1.6763421869702522,
"learning_rate": 4.89276736609605e-06,
"loss": 0.7285,
"step": 3440
},
{
"epoch": 0.5569927243330639,
"grad_norm": 1.7651505838736554,
"learning_rate": 4.878660418396254e-06,
"loss": 0.7296,
"step": 3445
},
{
"epoch": 0.5578011317704122,
"grad_norm": 1.703327968146327,
"learning_rate": 4.864554437093416e-06,
"loss": 0.7208,
"step": 3450
},
{
"epoch": 0.5586095392077607,
"grad_norm": 1.7704160277811705,
"learning_rate": 4.850449534533213e-06,
"loss": 0.7493,
"step": 3455
},
{
"epoch": 0.5594179466451091,
"grad_norm": 1.907706505426485,
"learning_rate": 4.836345823052735e-06,
"loss": 0.7242,
"step": 3460
},
{
"epoch": 0.5602263540824576,
"grad_norm": 1.8298403134205508,
"learning_rate": 4.822243414979578e-06,
"loss": 0.7126,
"step": 3465
},
{
"epoch": 0.561034761519806,
"grad_norm": 2.0492118497937484,
"learning_rate": 4.8081424226309605e-06,
"loss": 0.7193,
"step": 3470
},
{
"epoch": 0.5618431689571544,
"grad_norm": 1.9413458452075376,
"learning_rate": 4.794042958312824e-06,
"loss": 0.7177,
"step": 3475
},
{
"epoch": 0.5626515763945028,
"grad_norm": 1.5066422062448757,
"learning_rate": 4.779945134318944e-06,
"loss": 0.7048,
"step": 3480
},
{
"epoch": 0.5634599838318513,
"grad_norm": 1.554271039498153,
"learning_rate": 4.765849062930029e-06,
"loss": 0.7344,
"step": 3485
},
{
"epoch": 0.5642683912691997,
"grad_norm": 1.600921934972658,
"learning_rate": 4.75175485641283e-06,
"loss": 0.7101,
"step": 3490
},
{
"epoch": 0.5650767987065481,
"grad_norm": 2.424494189852517,
"learning_rate": 4.737662627019244e-06,
"loss": 0.7251,
"step": 3495
},
{
"epoch": 0.5658852061438965,
"grad_norm": 1.6300840852213847,
"learning_rate": 4.723572486985421e-06,
"loss": 0.728,
"step": 3500
},
{
"epoch": 0.5666936135812449,
"grad_norm": 2.2647753462217683,
"learning_rate": 4.7094845485308735e-06,
"loss": 0.7185,
"step": 3505
},
{
"epoch": 0.5675020210185934,
"grad_norm": 1.7811122307778515,
"learning_rate": 4.695398923857579e-06,
"loss": 0.7331,
"step": 3510
},
{
"epoch": 0.5683104284559418,
"grad_norm": 1.9941270454793851,
"learning_rate": 4.681315725149083e-06,
"loss": 0.7357,
"step": 3515
},
{
"epoch": 0.5691188358932903,
"grad_norm": 1.8422816032944238,
"learning_rate": 4.667235064569616e-06,
"loss": 0.7043,
"step": 3520
},
{
"epoch": 0.5699272433306386,
"grad_norm": 1.7136459947309182,
"learning_rate": 4.6531570542631884e-06,
"loss": 0.7283,
"step": 3525
},
{
"epoch": 0.570735650767987,
"grad_norm": 2.09761849915152,
"learning_rate": 4.639081806352707e-06,
"loss": 0.7309,
"step": 3530
},
{
"epoch": 0.5715440582053355,
"grad_norm": 1.6239566569251038,
"learning_rate": 4.625009432939075e-06,
"loss": 0.7194,
"step": 3535
},
{
"epoch": 0.5723524656426839,
"grad_norm": 1.6412212532766335,
"learning_rate": 4.6109400461003005e-06,
"loss": 0.706,
"step": 3540
},
{
"epoch": 0.5731608730800324,
"grad_norm": 1.8885243243418546,
"learning_rate": 4.596873757890612e-06,
"loss": 0.7402,
"step": 3545
},
{
"epoch": 0.5739692805173807,
"grad_norm": 1.6571244102934766,
"learning_rate": 4.582810680339551e-06,
"loss": 0.7245,
"step": 3550
},
{
"epoch": 0.5747776879547292,
"grad_norm": 1.6702595320617768,
"learning_rate": 4.5687509254510924e-06,
"loss": 0.7219,
"step": 3555
},
{
"epoch": 0.5755860953920776,
"grad_norm": 1.5072913826468763,
"learning_rate": 4.5546946052027505e-06,
"loss": 0.7228,
"step": 3560
},
{
"epoch": 0.5763945028294261,
"grad_norm": 1.9412492977577889,
"learning_rate": 4.540641831544678e-06,
"loss": 0.7209,
"step": 3565
},
{
"epoch": 0.5772029102667745,
"grad_norm": 1.9123748628207098,
"learning_rate": 4.526592716398788e-06,
"loss": 0.7314,
"step": 3570
},
{
"epoch": 0.5780113177041228,
"grad_norm": 1.9224889275166772,
"learning_rate": 4.51254737165785e-06,
"loss": 0.7199,
"step": 3575
},
{
"epoch": 0.5788197251414713,
"grad_norm": 1.648787454331624,
"learning_rate": 4.49850590918461e-06,
"loss": 0.7292,
"step": 3580
},
{
"epoch": 0.5796281325788197,
"grad_norm": 2.2311883887862187,
"learning_rate": 4.484468440810888e-06,
"loss": 0.7138,
"step": 3585
},
{
"epoch": 0.5804365400161682,
"grad_norm": 1.8060045733095076,
"learning_rate": 4.470435078336699e-06,
"loss": 0.723,
"step": 3590
},
{
"epoch": 0.5812449474535166,
"grad_norm": 1.8077856651950424,
"learning_rate": 4.456405933529355e-06,
"loss": 0.7089,
"step": 3595
},
{
"epoch": 0.582053354890865,
"grad_norm": 1.5403196099771954,
"learning_rate": 4.442381118122573e-06,
"loss": 0.7187,
"step": 3600
},
{
"epoch": 0.5828617623282134,
"grad_norm": 1.77289982258147,
"learning_rate": 4.428360743815597e-06,
"loss": 0.7036,
"step": 3605
},
{
"epoch": 0.5836701697655619,
"grad_norm": 1.6034581186012247,
"learning_rate": 4.414344922272292e-06,
"loss": 0.7228,
"step": 3610
},
{
"epoch": 0.5844785772029103,
"grad_norm": 1.6471712911511105,
"learning_rate": 4.400333765120268e-06,
"loss": 0.7317,
"step": 3615
},
{
"epoch": 0.5852869846402587,
"grad_norm": 1.6056924533457961,
"learning_rate": 4.386327383949986e-06,
"loss": 0.7223,
"step": 3620
},
{
"epoch": 0.5860953920776071,
"grad_norm": 1.7550468882082644,
"learning_rate": 4.372325890313864e-06,
"loss": 0.7164,
"step": 3625
},
{
"epoch": 0.5869037995149555,
"grad_norm": 1.7554697237013572,
"learning_rate": 4.358329395725403e-06,
"loss": 0.7177,
"step": 3630
},
{
"epoch": 0.587712206952304,
"grad_norm": 1.891791374124361,
"learning_rate": 4.3443380116582776e-06,
"loss": 0.694,
"step": 3635
},
{
"epoch": 0.5885206143896524,
"grad_norm": 2.103271809218415,
"learning_rate": 4.330351849545471e-06,
"loss": 0.7278,
"step": 3640
},
{
"epoch": 0.5893290218270008,
"grad_norm": 1.9049264666214307,
"learning_rate": 4.316371020778372e-06,
"loss": 0.6899,
"step": 3645
},
{
"epoch": 0.5901374292643492,
"grad_norm": 1.834904512639952,
"learning_rate": 4.302395636705888e-06,
"loss": 0.7336,
"step": 3650
},
{
"epoch": 0.5909458367016976,
"grad_norm": 1.5107909122682632,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.7322,
"step": 3655
},
{
"epoch": 0.5917542441390461,
"grad_norm": 1.6914566375954405,
"learning_rate": 4.274461647822726e-06,
"loss": 0.6987,
"step": 3660
},
{
"epoch": 0.5925626515763945,
"grad_norm": 2.047859565864153,
"learning_rate": 4.260503265489503e-06,
"loss": 0.7284,
"step": 3665
},
{
"epoch": 0.5933710590137429,
"grad_norm": 1.8868258227842758,
"learning_rate": 4.24655077280405e-06,
"loss": 0.7185,
"step": 3670
},
{
"epoch": 0.5941794664510913,
"grad_norm": 2.0502511378713373,
"learning_rate": 4.232604280889593e-06,
"loss": 0.7183,
"step": 3675
},
{
"epoch": 0.5949878738884398,
"grad_norm": 2.0904096614555137,
"learning_rate": 4.218663900821578e-06,
"loss": 0.7386,
"step": 3680
},
{
"epoch": 0.5957962813257882,
"grad_norm": 1.6951556666400156,
"learning_rate": 4.2047297436267635e-06,
"loss": 0.7203,
"step": 3685
},
{
"epoch": 0.5966046887631367,
"grad_norm": 1.858909449068441,
"learning_rate": 4.190801920282349e-06,
"loss": 0.7116,
"step": 3690
},
{
"epoch": 0.597413096200485,
"grad_norm": 2.110239776887249,
"learning_rate": 4.176880541715097e-06,
"loss": 0.7291,
"step": 3695
},
{
"epoch": 0.5982215036378334,
"grad_norm": 1.9230056997299079,
"learning_rate": 4.162965718800428e-06,
"loss": 0.7217,
"step": 3700
},
{
"epoch": 0.5990299110751819,
"grad_norm": 1.8462970903130231,
"learning_rate": 4.149057562361562e-06,
"loss": 0.7365,
"step": 3705
},
{
"epoch": 0.5998383185125303,
"grad_norm": 1.6321519110155143,
"learning_rate": 4.1351561831686136e-06,
"loss": 0.7315,
"step": 3710
},
{
"epoch": 0.6006467259498788,
"grad_norm": 1.6282666293170165,
"learning_rate": 4.121261691937732e-06,
"loss": 0.7213,
"step": 3715
},
{
"epoch": 0.6014551333872271,
"grad_norm": 1.897782087386442,
"learning_rate": 4.1073741993302005e-06,
"loss": 0.7123,
"step": 3720
},
{
"epoch": 0.6022635408245756,
"grad_norm": 1.7494970647862944,
"learning_rate": 4.093493815951566e-06,
"loss": 0.7088,
"step": 3725
},
{
"epoch": 0.603071948261924,
"grad_norm": 1.7243589892541922,
"learning_rate": 4.079620652350754e-06,
"loss": 0.715,
"step": 3730
},
{
"epoch": 0.6038803556992725,
"grad_norm": 1.8316631448911362,
"learning_rate": 4.065754819019183e-06,
"loss": 0.7248,
"step": 3735
},
{
"epoch": 0.6046887631366209,
"grad_norm": 1.6198889183822909,
"learning_rate": 4.051896426389904e-06,
"loss": 0.7189,
"step": 3740
},
{
"epoch": 0.6054971705739692,
"grad_norm": 1.8244917176494815,
"learning_rate": 4.038045584836691e-06,
"loss": 0.7309,
"step": 3745
},
{
"epoch": 0.6063055780113177,
"grad_norm": 1.51923723630652,
"learning_rate": 4.02420240467319e-06,
"loss": 0.7214,
"step": 3750
},
{
"epoch": 0.6071139854486661,
"grad_norm": 1.6915516287756491,
"learning_rate": 4.010366996152025e-06,
"loss": 0.7017,
"step": 3755
},
{
"epoch": 0.6079223928860146,
"grad_norm": 1.8155249672465137,
"learning_rate": 3.99653946946392e-06,
"loss": 0.7436,
"step": 3760
},
{
"epoch": 0.608730800323363,
"grad_norm": 2.2453571071282403,
"learning_rate": 3.982719934736832e-06,
"loss": 0.7281,
"step": 3765
},
{
"epoch": 0.6095392077607114,
"grad_norm": 2.070645865764563,
"learning_rate": 3.96890850203506e-06,
"loss": 0.6972,
"step": 3770
},
{
"epoch": 0.6103476151980598,
"grad_norm": 1.6532992237811048,
"learning_rate": 3.9551052813583776e-06,
"loss": 0.7188,
"step": 3775
},
{
"epoch": 0.6111560226354082,
"grad_norm": 1.757629266525101,
"learning_rate": 3.9413103826411595e-06,
"loss": 0.7095,
"step": 3780
},
{
"epoch": 0.6119644300727567,
"grad_norm": 1.6234603453267293,
"learning_rate": 3.927523915751491e-06,
"loss": 0.7291,
"step": 3785
},
{
"epoch": 0.6127728375101051,
"grad_norm": 1.589881321535228,
"learning_rate": 3.913745990490314e-06,
"loss": 0.694,
"step": 3790
},
{
"epoch": 0.6135812449474535,
"grad_norm": 1.6706437491058566,
"learning_rate": 3.899976716590531e-06,
"loss": 0.7335,
"step": 3795
},
{
"epoch": 0.6143896523848019,
"grad_norm": 1.694799314467617,
"learning_rate": 3.886216203716149e-06,
"loss": 0.721,
"step": 3800
},
{
"epoch": 0.6151980598221504,
"grad_norm": 2.1158395577766274,
"learning_rate": 3.872464561461397e-06,
"loss": 0.7092,
"step": 3805
},
{
"epoch": 0.6160064672594988,
"grad_norm": 1.6236255430260125,
"learning_rate": 3.8587218993498525e-06,
"loss": 0.7313,
"step": 3810
},
{
"epoch": 0.6168148746968473,
"grad_norm": 1.7100289433829274,
"learning_rate": 3.844988326833574e-06,
"loss": 0.7169,
"step": 3815
},
{
"epoch": 0.6176232821341956,
"grad_norm": 1.66125609703824,
"learning_rate": 3.831263953292225e-06,
"loss": 0.741,
"step": 3820
},
{
"epoch": 0.618431689571544,
"grad_norm": 1.8691718045573207,
"learning_rate": 3.817548888032207e-06,
"loss": 0.7092,
"step": 3825
},
{
"epoch": 0.6192400970088925,
"grad_norm": 1.9063385283071115,
"learning_rate": 3.803843240285784e-06,
"loss": 0.724,
"step": 3830
},
{
"epoch": 0.6200485044462409,
"grad_norm": 1.7196545690379716,
"learning_rate": 3.7901471192102173e-06,
"loss": 0.7204,
"step": 3835
},
{
"epoch": 0.6208569118835894,
"grad_norm": 1.8618730408898592,
"learning_rate": 3.7764606338868943e-06,
"loss": 0.7218,
"step": 3840
},
{
"epoch": 0.6216653193209377,
"grad_norm": 1.68489988594914,
"learning_rate": 3.7627838933204547e-06,
"loss": 0.7262,
"step": 3845
},
{
"epoch": 0.6224737267582862,
"grad_norm": 2.0315811113955013,
"learning_rate": 3.7491170064379346e-06,
"loss": 0.7127,
"step": 3850
},
{
"epoch": 0.6232821341956346,
"grad_norm": 1.8075962655722444,
"learning_rate": 3.735460082087884e-06,
"loss": 0.7166,
"step": 3855
},
{
"epoch": 0.624090541632983,
"grad_norm": 1.643356272004426,
"learning_rate": 3.7218132290395125e-06,
"loss": 0.7094,
"step": 3860
},
{
"epoch": 0.6248989490703315,
"grad_norm": 1.8603079492566412,
"learning_rate": 3.7081765559818184e-06,
"loss": 0.7174,
"step": 3865
},
{
"epoch": 0.6257073565076798,
"grad_norm": 2.203684498237761,
"learning_rate": 3.6945501715227146e-06,
"loss": 0.6886,
"step": 3870
},
{
"epoch": 0.6265157639450283,
"grad_norm": 1.6838584000200525,
"learning_rate": 3.680934184188182e-06,
"loss": 0.7029,
"step": 3875
},
{
"epoch": 0.6273241713823767,
"grad_norm": 1.750412853978724,
"learning_rate": 3.6673287024213868e-06,
"loss": 0.7133,
"step": 3880
},
{
"epoch": 0.6281325788197252,
"grad_norm": 1.6786616957245395,
"learning_rate": 3.6537338345818273e-06,
"loss": 0.7208,
"step": 3885
},
{
"epoch": 0.6289409862570736,
"grad_norm": 1.7458825802515532,
"learning_rate": 3.640149688944472e-06,
"loss": 0.695,
"step": 3890
},
{
"epoch": 0.629749393694422,
"grad_norm": 1.787947265054849,
"learning_rate": 3.626576373698885e-06,
"loss": 0.7026,
"step": 3895
},
{
"epoch": 0.6305578011317704,
"grad_norm": 1.839057642052893,
"learning_rate": 3.6130139969483825e-06,
"loss": 0.7226,
"step": 3900
},
{
"epoch": 0.6313662085691188,
"grad_norm": 1.6591662592024992,
"learning_rate": 3.599462666709155e-06,
"loss": 0.7167,
"step": 3905
},
{
"epoch": 0.6321746160064673,
"grad_norm": 1.7026548619494275,
"learning_rate": 3.5859224909094147e-06,
"loss": 0.7306,
"step": 3910
},
{
"epoch": 0.6329830234438156,
"grad_norm": 1.6797177301247779,
"learning_rate": 3.5723935773885414e-06,
"loss": 0.6974,
"step": 3915
},
{
"epoch": 0.6337914308811641,
"grad_norm": 1.623329714146258,
"learning_rate": 3.558876033896211e-06,
"loss": 0.7283,
"step": 3920
},
{
"epoch": 0.6345998383185125,
"grad_norm": 1.863028763705875,
"learning_rate": 3.5453699680915476e-06,
"loss": 0.7356,
"step": 3925
},
{
"epoch": 0.635408245755861,
"grad_norm": 2.274790139096009,
"learning_rate": 3.5318754875422588e-06,
"loss": 0.7042,
"step": 3930
},
{
"epoch": 0.6362166531932094,
"grad_norm": 1.8149361524248941,
"learning_rate": 3.518392699723786e-06,
"loss": 0.7112,
"step": 3935
},
{
"epoch": 0.6370250606305577,
"grad_norm": 1.6623579800376278,
"learning_rate": 3.5049217120184476e-06,
"loss": 0.7007,
"step": 3940
},
{
"epoch": 0.6378334680679062,
"grad_norm": 1.7895960646702003,
"learning_rate": 3.491462631714574e-06,
"loss": 0.7328,
"step": 3945
},
{
"epoch": 0.6386418755052546,
"grad_norm": 1.5179464123128517,
"learning_rate": 3.4780155660056653e-06,
"loss": 0.7212,
"step": 3950
},
{
"epoch": 0.6394502829426031,
"grad_norm": 1.6048128854081531,
"learning_rate": 3.464580621989528e-06,
"loss": 0.7119,
"step": 3955
},
{
"epoch": 0.6402586903799515,
"grad_norm": 1.8256911462243481,
"learning_rate": 3.4511579066674354e-06,
"loss": 0.7139,
"step": 3960
},
{
"epoch": 0.6410670978172999,
"grad_norm": 1.6782507649830145,
"learning_rate": 3.437747526943256e-06,
"loss": 0.7112,
"step": 3965
},
{
"epoch": 0.6418755052546483,
"grad_norm": 1.9042260389204226,
"learning_rate": 3.42434958962262e-06,
"loss": 0.7424,
"step": 3970
},
{
"epoch": 0.6426839126919968,
"grad_norm": 1.5774540955627445,
"learning_rate": 3.410964201412059e-06,
"loss": 0.7023,
"step": 3975
},
{
"epoch": 0.6434923201293452,
"grad_norm": 1.9330271182491578,
"learning_rate": 3.3975914689181565e-06,
"loss": 0.6915,
"step": 3980
},
{
"epoch": 0.6443007275666937,
"grad_norm": 1.729192351032169,
"learning_rate": 3.384231498646706e-06,
"loss": 0.7332,
"step": 3985
},
{
"epoch": 0.645109135004042,
"grad_norm": 1.7304833054051807,
"learning_rate": 3.370884397001851e-06,
"loss": 0.7259,
"step": 3990
},
{
"epoch": 0.6459175424413904,
"grad_norm": 1.668285332848988,
"learning_rate": 3.3575502702852486e-06,
"loss": 0.6954,
"step": 3995
},
{
"epoch": 0.6467259498787389,
"grad_norm": 1.6461002316066478,
"learning_rate": 3.344229224695219e-06,
"loss": 0.7078,
"step": 4000
},
{
"epoch": 0.6475343573160873,
"grad_norm": 2.265529307566932,
"learning_rate": 3.3309213663258933e-06,
"loss": 0.7097,
"step": 4005
},
{
"epoch": 0.6483427647534358,
"grad_norm": 1.8168921242914984,
"learning_rate": 3.3176268011663826e-06,
"loss": 0.7335,
"step": 4010
},
{
"epoch": 0.6491511721907841,
"grad_norm": 1.5972240421560917,
"learning_rate": 3.304345635099918e-06,
"loss": 0.727,
"step": 4015
},
{
"epoch": 0.6499595796281326,
"grad_norm": 1.7262174376471393,
"learning_rate": 3.291077973903018e-06,
"loss": 0.7384,
"step": 4020
},
{
"epoch": 0.650767987065481,
"grad_norm": 1.7197250528022299,
"learning_rate": 3.2778239232446462e-06,
"loss": 0.7212,
"step": 4025
},
{
"epoch": 0.6515763945028294,
"grad_norm": 1.8271752649285784,
"learning_rate": 3.2645835886853604e-06,
"loss": 0.7254,
"step": 4030
},
{
"epoch": 0.6523848019401779,
"grad_norm": 1.6271051386234956,
"learning_rate": 3.251357075676482e-06,
"loss": 0.712,
"step": 4035
},
{
"epoch": 0.6531932093775262,
"grad_norm": 1.6767642174095554,
"learning_rate": 3.2381444895592483e-06,
"loss": 0.7218,
"step": 4040
},
{
"epoch": 0.6540016168148747,
"grad_norm": 1.7632914896547123,
"learning_rate": 3.224945935563982e-06,
"loss": 0.715,
"step": 4045
},
{
"epoch": 0.6548100242522231,
"grad_norm": 1.8702788784549214,
"learning_rate": 3.2117615188092475e-06,
"loss": 0.7367,
"step": 4050
},
{
"epoch": 0.6556184316895716,
"grad_norm": 1.9298182916897795,
"learning_rate": 3.1985913443010106e-06,
"loss": 0.7164,
"step": 4055
},
{
"epoch": 0.65642683912692,
"grad_norm": 2.1333743836535555,
"learning_rate": 3.185435516931811e-06,
"loss": 0.7175,
"step": 4060
},
{
"epoch": 0.6572352465642683,
"grad_norm": 1.9618873764352864,
"learning_rate": 3.1722941414799152e-06,
"loss": 0.7293,
"step": 4065
},
{
"epoch": 0.6580436540016168,
"grad_norm": 1.8406120340882652,
"learning_rate": 3.159167322608498e-06,
"loss": 0.7204,
"step": 4070
},
{
"epoch": 0.6588520614389652,
"grad_norm": 1.8153933978636514,
"learning_rate": 3.146055164864794e-06,
"loss": 0.7096,
"step": 4075
},
{
"epoch": 0.6596604688763137,
"grad_norm": 1.6106430966301524,
"learning_rate": 3.1329577726792705e-06,
"loss": 0.7199,
"step": 4080
},
{
"epoch": 0.6604688763136621,
"grad_norm": 1.6587566226994688,
"learning_rate": 3.1198752503647995e-06,
"loss": 0.7059,
"step": 4085
},
{
"epoch": 0.6612772837510105,
"grad_norm": 1.6152976800316061,
"learning_rate": 3.1068077021158185e-06,
"loss": 0.7155,
"step": 4090
},
{
"epoch": 0.6620856911883589,
"grad_norm": 1.668320625864469,
"learning_rate": 3.0937552320075116e-06,
"loss": 0.6997,
"step": 4095
},
{
"epoch": 0.6628940986257074,
"grad_norm": 1.6309737459520801,
"learning_rate": 3.0807179439949685e-06,
"loss": 0.7242,
"step": 4100
},
{
"epoch": 0.6637025060630558,
"grad_norm": 1.8832810217520193,
"learning_rate": 3.0676959419123666e-06,
"loss": 0.6975,
"step": 4105
},
{
"epoch": 0.6645109135004043,
"grad_norm": 1.750667565585888,
"learning_rate": 3.05468932947214e-06,
"loss": 0.7229,
"step": 4110
},
{
"epoch": 0.6653193209377526,
"grad_norm": 1.7153179028178922,
"learning_rate": 3.041698210264149e-06,
"loss": 0.7051,
"step": 4115
},
{
"epoch": 0.666127728375101,
"grad_norm": 1.644510456041389,
"learning_rate": 3.028722687754867e-06,
"loss": 0.7254,
"step": 4120
},
{
"epoch": 0.6669361358124495,
"grad_norm": 1.7526546920401012,
"learning_rate": 3.0157628652865426e-06,
"loss": 0.725,
"step": 4125
},
{
"epoch": 0.6677445432497979,
"grad_norm": 1.6156156934111832,
"learning_rate": 3.0028188460763853e-06,
"loss": 0.7109,
"step": 4130
},
{
"epoch": 0.6685529506871464,
"grad_norm": 1.377558375885402,
"learning_rate": 2.9898907332157432e-06,
"loss": 0.7234,
"step": 4135
},
{
"epoch": 0.6693613581244947,
"grad_norm": 1.5406583460490257,
"learning_rate": 2.976978629669276e-06,
"loss": 0.6983,
"step": 4140
},
{
"epoch": 0.6701697655618432,
"grad_norm": 1.6610140694054174,
"learning_rate": 2.9640826382741427e-06,
"loss": 0.7082,
"step": 4145
},
{
"epoch": 0.6709781729991916,
"grad_norm": 1.8696909034008466,
"learning_rate": 2.951202861739173e-06,
"loss": 0.7039,
"step": 4150
},
{
"epoch": 0.67178658043654,
"grad_norm": 1.699448437132235,
"learning_rate": 2.938339402644061e-06,
"loss": 0.7069,
"step": 4155
},
{
"epoch": 0.6725949878738885,
"grad_norm": 1.9050926736431988,
"learning_rate": 2.9254923634385425e-06,
"loss": 0.7083,
"step": 4160
},
{
"epoch": 0.6734033953112368,
"grad_norm": 1.6041414448997582,
"learning_rate": 2.912661846441572e-06,
"loss": 0.7154,
"step": 4165
},
{
"epoch": 0.6742118027485853,
"grad_norm": 1.5644306988254755,
"learning_rate": 2.8998479538405218e-06,
"loss": 0.727,
"step": 4170
},
{
"epoch": 0.6750202101859337,
"grad_norm": 1.5805303077306065,
"learning_rate": 2.8870507876903536e-06,
"loss": 0.694,
"step": 4175
},
{
"epoch": 0.6758286176232822,
"grad_norm": 1.5605132401182347,
"learning_rate": 2.87427044991282e-06,
"loss": 0.712,
"step": 4180
},
{
"epoch": 0.6766370250606305,
"grad_norm": 1.634348076934335,
"learning_rate": 2.861507042295644e-06,
"loss": 0.7134,
"step": 4185
},
{
"epoch": 0.677445432497979,
"grad_norm": 1.5331952726149434,
"learning_rate": 2.8487606664917056e-06,
"loss": 0.7311,
"step": 4190
},
{
"epoch": 0.6782538399353274,
"grad_norm": 1.9231338256054877,
"learning_rate": 2.836031424018243e-06,
"loss": 0.7053,
"step": 4195
},
{
"epoch": 0.6790622473726758,
"grad_norm": 1.6648310177555483,
"learning_rate": 2.823319416256033e-06,
"loss": 0.7094,
"step": 4200
},
{
"epoch": 0.6798706548100243,
"grad_norm": 1.7703929497048758,
"learning_rate": 2.810624744448588e-06,
"loss": 0.6877,
"step": 4205
},
{
"epoch": 0.6806790622473726,
"grad_norm": 1.5831872562872369,
"learning_rate": 2.797947509701354e-06,
"loss": 0.7031,
"step": 4210
},
{
"epoch": 0.6814874696847211,
"grad_norm": 1.4317597869567902,
"learning_rate": 2.785287812980898e-06,
"loss": 0.7371,
"step": 4215
},
{
"epoch": 0.6822958771220695,
"grad_norm": 1.7340298885830805,
"learning_rate": 2.7726457551141093e-06,
"loss": 0.7366,
"step": 4220
},
{
"epoch": 0.683104284559418,
"grad_norm": 1.6924476828852524,
"learning_rate": 2.7600214367873913e-06,
"loss": 0.697,
"step": 4225
},
{
"epoch": 0.6839126919967664,
"grad_norm": 1.4262764228242009,
"learning_rate": 2.7474149585458666e-06,
"loss": 0.7228,
"step": 4230
},
{
"epoch": 0.6847210994341147,
"grad_norm": 1.9027434686970166,
"learning_rate": 2.734826420792568e-06,
"loss": 0.7167,
"step": 4235
},
{
"epoch": 0.6855295068714632,
"grad_norm": 1.6470078258301304,
"learning_rate": 2.7222559237876467e-06,
"loss": 0.7287,
"step": 4240
},
{
"epoch": 0.6863379143088116,
"grad_norm": 1.7103358496942485,
"learning_rate": 2.709703567647569e-06,
"loss": 0.6992,
"step": 4245
},
{
"epoch": 0.6871463217461601,
"grad_norm": 1.5289126014423284,
"learning_rate": 2.697169452344316e-06,
"loss": 0.6908,
"step": 4250
},
{
"epoch": 0.6879547291835085,
"grad_norm": 1.5353664972333323,
"learning_rate": 2.6846536777046004e-06,
"loss": 0.7066,
"step": 4255
},
{
"epoch": 0.6887631366208569,
"grad_norm": 1.9300314181188833,
"learning_rate": 2.672156343409053e-06,
"loss": 0.7056,
"step": 4260
},
{
"epoch": 0.6895715440582053,
"grad_norm": 1.7170832117281503,
"learning_rate": 2.659677548991444e-06,
"loss": 0.7065,
"step": 4265
},
{
"epoch": 0.6903799514955538,
"grad_norm": 1.5377665823804576,
"learning_rate": 2.647217393837886e-06,
"loss": 0.7258,
"step": 4270
},
{
"epoch": 0.6911883589329022,
"grad_norm": 1.7401588076713146,
"learning_rate": 2.6347759771860336e-06,
"loss": 0.7026,
"step": 4275
},
{
"epoch": 0.6919967663702506,
"grad_norm": 1.799996686879021,
"learning_rate": 2.62235339812431e-06,
"loss": 0.6998,
"step": 4280
},
{
"epoch": 0.692805173807599,
"grad_norm": 1.7779728222118754,
"learning_rate": 2.6099497555911006e-06,
"loss": 0.6993,
"step": 4285
},
{
"epoch": 0.6936135812449474,
"grad_norm": 1.4924029650433472,
"learning_rate": 2.5975651483739745e-06,
"loss": 0.7161,
"step": 4290
},
{
"epoch": 0.6944219886822959,
"grad_norm": 1.7534120757891243,
"learning_rate": 2.5851996751088997e-06,
"loss": 0.7072,
"step": 4295
},
{
"epoch": 0.6952303961196443,
"grad_norm": 1.9089320497241284,
"learning_rate": 2.5728534342794487e-06,
"loss": 0.7063,
"step": 4300
},
{
"epoch": 0.6960388035569928,
"grad_norm": 1.919875371600586,
"learning_rate": 2.560526524216024e-06,
"loss": 0.7033,
"step": 4305
},
{
"epoch": 0.6968472109943411,
"grad_norm": 1.6478824265950052,
"learning_rate": 2.548219043095064e-06,
"loss": 0.7205,
"step": 4310
},
{
"epoch": 0.6976556184316896,
"grad_norm": 1.684623132888406,
"learning_rate": 2.535931088938274e-06,
"loss": 0.6847,
"step": 4315
},
{
"epoch": 0.698464025869038,
"grad_norm": 1.67134719450202,
"learning_rate": 2.5236627596118362e-06,
"loss": 0.703,
"step": 4320
},
{
"epoch": 0.6992724333063864,
"grad_norm": 1.7627300301593591,
"learning_rate": 2.511414152825631e-06,
"loss": 0.6908,
"step": 4325
},
{
"epoch": 0.7000808407437349,
"grad_norm": 1.5484331606717547,
"learning_rate": 2.499185366132462e-06,
"loss": 0.7235,
"step": 4330
},
{
"epoch": 0.7008892481810832,
"grad_norm": 1.6640413077548424,
"learning_rate": 2.4869764969272757e-06,
"loss": 0.7027,
"step": 4335
},
{
"epoch": 0.7016976556184317,
"grad_norm": 1.578928776955349,
"learning_rate": 2.474787642446393e-06,
"loss": 0.7164,
"step": 4340
},
{
"epoch": 0.7025060630557801,
"grad_norm": 1.9767016634135068,
"learning_rate": 2.4626188997667224e-06,
"loss": 0.7161,
"step": 4345
},
{
"epoch": 0.7033144704931286,
"grad_norm": 1.7184637637432352,
"learning_rate": 2.4504703658049994e-06,
"loss": 0.6947,
"step": 4350
},
{
"epoch": 0.704122877930477,
"grad_norm": 1.6400468293429349,
"learning_rate": 2.43834213731701e-06,
"loss": 0.7072,
"step": 4355
},
{
"epoch": 0.7049312853678253,
"grad_norm": 1.5907149181306826,
"learning_rate": 2.426234310896812e-06,
"loss": 0.7036,
"step": 4360
},
{
"epoch": 0.7057396928051738,
"grad_norm": 1.418762913742883,
"learning_rate": 2.414146982975983e-06,
"loss": 0.7,
"step": 4365
},
{
"epoch": 0.7065481002425222,
"grad_norm": 2.069021252053298,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.7131,
"step": 4370
},
{
"epoch": 0.7073565076798707,
"grad_norm": 1.7124723237801922,
"learning_rate": 2.3900342075416514e-06,
"loss": 0.6877,
"step": 4375
},
{
"epoch": 0.7081649151172191,
"grad_norm": 1.473031145225499,
"learning_rate": 2.37800895207194e-06,
"loss": 0.7242,
"step": 4380
},
{
"epoch": 0.7089733225545675,
"grad_norm": 1.7703727073517148,
"learning_rate": 2.3660045791876386e-06,
"loss": 0.6832,
"step": 4385
},
{
"epoch": 0.7097817299919159,
"grad_norm": 1.7898901246572265,
"learning_rate": 2.3540211844963783e-06,
"loss": 0.7167,
"step": 4390
},
{
"epoch": 0.7105901374292644,
"grad_norm": 1.6018154696798712,
"learning_rate": 2.342058863438704e-06,
"loss": 0.6873,
"step": 4395
},
{
"epoch": 0.7113985448666128,
"grad_norm": 1.7123245457707612,
"learning_rate": 2.330117711287327e-06,
"loss": 0.7074,
"step": 4400
},
{
"epoch": 0.7122069523039612,
"grad_norm": 2.0993312473611363,
"learning_rate": 2.3181978231463604e-06,
"loss": 0.7036,
"step": 4405
},
{
"epoch": 0.7130153597413096,
"grad_norm": 1.5813117767291411,
"learning_rate": 2.306299293950557e-06,
"loss": 0.7153,
"step": 4410
},
{
"epoch": 0.713823767178658,
"grad_norm": 1.6125648806682966,
"learning_rate": 2.294422218464567e-06,
"loss": 0.6898,
"step": 4415
},
{
"epoch": 0.7146321746160065,
"grad_norm": 2.051605303416789,
"learning_rate": 2.2825666912821674e-06,
"loss": 0.7156,
"step": 4420
},
{
"epoch": 0.7154405820533549,
"grad_norm": 1.6595060216820654,
"learning_rate": 2.270732806825517e-06,
"loss": 0.719,
"step": 4425
},
{
"epoch": 0.7162489894907034,
"grad_norm": 1.531575665848643,
"learning_rate": 2.2589206593444084e-06,
"loss": 0.7335,
"step": 4430
},
{
"epoch": 0.7170573969280517,
"grad_norm": 1.4778525732647043,
"learning_rate": 2.2471303429155043e-06,
"loss": 0.7191,
"step": 4435
},
{
"epoch": 0.7178658043654002,
"grad_norm": 1.6929860001853365,
"learning_rate": 2.2353619514416052e-06,
"loss": 0.7216,
"step": 4440
},
{
"epoch": 0.7186742118027486,
"grad_norm": 1.782449408026849,
"learning_rate": 2.223615578650884e-06,
"loss": 0.7009,
"step": 4445
},
{
"epoch": 0.719482619240097,
"grad_norm": 1.4215615691395374,
"learning_rate": 2.2118913180961522e-06,
"loss": 0.6972,
"step": 4450
},
{
"epoch": 0.7202910266774454,
"grad_norm": 1.7771811992721345,
"learning_rate": 2.2001892631541132e-06,
"loss": 0.7133,
"step": 4455
},
{
"epoch": 0.7210994341147938,
"grad_norm": 1.53041881310807,
"learning_rate": 2.1885095070246116e-06,
"loss": 0.6989,
"step": 4460
},
{
"epoch": 0.7219078415521423,
"grad_norm": 1.8037641077557016,
"learning_rate": 2.176852142729895e-06,
"loss": 0.7102,
"step": 4465
},
{
"epoch": 0.7227162489894907,
"grad_norm": 1.7035192300078206,
"learning_rate": 2.165217263113875e-06,
"loss": 0.7106,
"step": 4470
},
{
"epoch": 0.7235246564268392,
"grad_norm": 1.599875721865672,
"learning_rate": 2.153604960841389e-06,
"loss": 0.7055,
"step": 4475
},
{
"epoch": 0.7243330638641875,
"grad_norm": 1.678804995013585,
"learning_rate": 2.142015328397454e-06,
"loss": 0.6962,
"step": 4480
},
{
"epoch": 0.725141471301536,
"grad_norm": 1.7056685395536566,
"learning_rate": 2.130448458086539e-06,
"loss": 0.7177,
"step": 4485
},
{
"epoch": 0.7259498787388844,
"grad_norm": 1.9835575111149595,
"learning_rate": 2.118904442031829e-06,
"loss": 0.7136,
"step": 4490
},
{
"epoch": 0.7267582861762328,
"grad_norm": 1.556702686596829,
"learning_rate": 2.1073833721744796e-06,
"loss": 0.7113,
"step": 4495
},
{
"epoch": 0.7275666936135813,
"grad_norm": 1.7685805077897936,
"learning_rate": 2.095885340272904e-06,
"loss": 0.6973,
"step": 4500
},
{
"epoch": 0.7283751010509296,
"grad_norm": 1.8918140713245204,
"learning_rate": 2.084410437902025e-06,
"loss": 0.7104,
"step": 4505
},
{
"epoch": 0.7291835084882781,
"grad_norm": 1.6842141260766332,
"learning_rate": 2.0729587564525525e-06,
"loss": 0.7058,
"step": 4510
},
{
"epoch": 0.7299919159256265,
"grad_norm": 1.988830288548141,
"learning_rate": 2.0615303871302617e-06,
"loss": 0.6982,
"step": 4515
},
{
"epoch": 0.730800323362975,
"grad_norm": 1.7150008974104203,
"learning_rate": 2.0501254209552536e-06,
"loss": 0.7253,
"step": 4520
},
{
"epoch": 0.7316087308003234,
"grad_norm": 1.6368813181817818,
"learning_rate": 2.038743948761243e-06,
"loss": 0.7251,
"step": 4525
},
{
"epoch": 0.7324171382376717,
"grad_norm": 1.6425874295988543,
"learning_rate": 2.0273860611948244e-06,
"loss": 0.7024,
"step": 4530
},
{
"epoch": 0.7332255456750202,
"grad_norm": 1.5620038329412886,
"learning_rate": 2.016051848714758e-06,
"loss": 0.6972,
"step": 4535
},
{
"epoch": 0.7340339531123686,
"grad_norm": 1.5155991077502002,
"learning_rate": 2.004741401591247e-06,
"loss": 0.6966,
"step": 4540
},
{
"epoch": 0.7348423605497171,
"grad_norm": 1.5087838986819468,
"learning_rate": 1.9934548099052147e-06,
"loss": 0.704,
"step": 4545
},
{
"epoch": 0.7356507679870655,
"grad_norm": 1.540725635553023,
"learning_rate": 1.9821921635475923e-06,
"loss": 0.711,
"step": 4550
},
{
"epoch": 0.7364591754244139,
"grad_norm": 1.7449377277549325,
"learning_rate": 1.9709535522185963e-06,
"loss": 0.7262,
"step": 4555
},
{
"epoch": 0.7372675828617623,
"grad_norm": 1.5400854167352056,
"learning_rate": 1.959739065427026e-06,
"loss": 0.685,
"step": 4560
},
{
"epoch": 0.7380759902991108,
"grad_norm": 1.6573782698805526,
"learning_rate": 1.94854879248954e-06,
"loss": 0.6949,
"step": 4565
},
{
"epoch": 0.7388843977364592,
"grad_norm": 1.444089691451337,
"learning_rate": 1.9373828225299458e-06,
"loss": 0.7192,
"step": 4570
},
{
"epoch": 0.7396928051738076,
"grad_norm": 1.6503277393010736,
"learning_rate": 1.926241244478496e-06,
"loss": 0.7012,
"step": 4575
},
{
"epoch": 0.740501212611156,
"grad_norm": 1.4505458685379473,
"learning_rate": 1.9151241470711725e-06,
"loss": 0.7064,
"step": 4580
},
{
"epoch": 0.7413096200485044,
"grad_norm": 1.6098471332377016,
"learning_rate": 1.904031618848987e-06,
"loss": 0.7168,
"step": 4585
},
{
"epoch": 0.7421180274858529,
"grad_norm": 1.5386973446650158,
"learning_rate": 1.8929637481572715e-06,
"loss": 0.6851,
"step": 4590
},
{
"epoch": 0.7429264349232013,
"grad_norm": 1.6231381655370265,
"learning_rate": 1.8819206231449717e-06,
"loss": 0.6933,
"step": 4595
},
{
"epoch": 0.7437348423605498,
"grad_norm": 1.8771029991361663,
"learning_rate": 1.8709023317639558e-06,
"loss": 0.7155,
"step": 4600
},
{
"epoch": 0.7445432497978981,
"grad_norm": 1.6673306502115761,
"learning_rate": 1.8599089617682997e-06,
"loss": 0.6922,
"step": 4605
},
{
"epoch": 0.7453516572352465,
"grad_norm": 1.7070238047014281,
"learning_rate": 1.848940600713603e-06,
"loss": 0.7036,
"step": 4610
},
{
"epoch": 0.746160064672595,
"grad_norm": 1.5626624796074473,
"learning_rate": 1.8379973359562765e-06,
"loss": 0.7121,
"step": 4615
},
{
"epoch": 0.7469684721099434,
"grad_norm": 1.5529354764314784,
"learning_rate": 1.8270792546528593e-06,
"loss": 0.7194,
"step": 4620
},
{
"epoch": 0.7477768795472919,
"grad_norm": 1.7007088230671603,
"learning_rate": 1.816186443759319e-06,
"loss": 0.7124,
"step": 4625
},
{
"epoch": 0.7485852869846402,
"grad_norm": 1.4923622281091249,
"learning_rate": 1.8053189900303553e-06,
"loss": 0.7166,
"step": 4630
},
{
"epoch": 0.7493936944219887,
"grad_norm": 1.6071608830962205,
"learning_rate": 1.7944769800187201e-06,
"loss": 0.7148,
"step": 4635
},
{
"epoch": 0.7502021018593371,
"grad_norm": 1.4700051890576207,
"learning_rate": 1.7836605000745154e-06,
"loss": 0.7216,
"step": 4640
},
{
"epoch": 0.7510105092966856,
"grad_norm": 1.6606431322123616,
"learning_rate": 1.772869636344512e-06,
"loss": 0.6907,
"step": 4645
},
{
"epoch": 0.751818916734034,
"grad_norm": 1.4813002160453033,
"learning_rate": 1.7621044747714683e-06,
"loss": 0.7098,
"step": 4650
},
{
"epoch": 0.7526273241713823,
"grad_norm": 1.661640433481697,
"learning_rate": 1.751365101093433e-06,
"loss": 0.6964,
"step": 4655
},
{
"epoch": 0.7534357316087308,
"grad_norm": 1.7279822572241663,
"learning_rate": 1.7406516008430774e-06,
"loss": 0.6834,
"step": 4660
},
{
"epoch": 0.7542441390460792,
"grad_norm": 1.8915619961292818,
"learning_rate": 1.729964059346998e-06,
"loss": 0.7122,
"step": 4665
},
{
"epoch": 0.7550525464834277,
"grad_norm": 1.6083179781780426,
"learning_rate": 1.719302561725053e-06,
"loss": 0.6946,
"step": 4670
},
{
"epoch": 0.7558609539207761,
"grad_norm": 1.7461258307464391,
"learning_rate": 1.7086671928896747e-06,
"loss": 0.6846,
"step": 4675
},
{
"epoch": 0.7566693613581245,
"grad_norm": 1.4237466961185907,
"learning_rate": 1.6980580375451928e-06,
"loss": 0.686,
"step": 4680
},
{
"epoch": 0.7574777687954729,
"grad_norm": 1.7755619878515854,
"learning_rate": 1.687475180187163e-06,
"loss": 0.7112,
"step": 4685
},
{
"epoch": 0.7582861762328214,
"grad_norm": 1.61719801995157,
"learning_rate": 1.6769187051016933e-06,
"loss": 0.7094,
"step": 4690
},
{
"epoch": 0.7590945836701698,
"grad_norm": 1.5568841653418648,
"learning_rate": 1.6663886963647753e-06,
"loss": 0.7276,
"step": 4695
},
{
"epoch": 0.7599029911075182,
"grad_norm": 1.7325661061608444,
"learning_rate": 1.6558852378416113e-06,
"loss": 0.7134,
"step": 4700
},
{
"epoch": 0.7607113985448666,
"grad_norm": 1.7935022266405718,
"learning_rate": 1.6454084131859427e-06,
"loss": 0.7126,
"step": 4705
},
{
"epoch": 0.761519805982215,
"grad_norm": 1.644414000022595,
"learning_rate": 1.6349583058393953e-06,
"loss": 0.7072,
"step": 4710
},
{
"epoch": 0.7623282134195635,
"grad_norm": 1.6391071208815438,
"learning_rate": 1.6245349990307997e-06,
"loss": 0.7022,
"step": 4715
},
{
"epoch": 0.7631366208569119,
"grad_norm": 1.693934420854349,
"learning_rate": 1.614138575775544e-06,
"loss": 0.6864,
"step": 4720
},
{
"epoch": 0.7639450282942603,
"grad_norm": 1.538392225632054,
"learning_rate": 1.6037691188748995e-06,
"loss": 0.7145,
"step": 4725
},
{
"epoch": 0.7647534357316087,
"grad_norm": 1.7487697179941406,
"learning_rate": 1.5934267109153667e-06,
"loss": 0.6828,
"step": 4730
},
{
"epoch": 0.7655618431689571,
"grad_norm": 1.4221518439513514,
"learning_rate": 1.5831114342680225e-06,
"loss": 0.6978,
"step": 4735
},
{
"epoch": 0.7663702506063056,
"grad_norm": 1.6852680227123094,
"learning_rate": 1.5728233710878527e-06,
"loss": 0.689,
"step": 4740
},
{
"epoch": 0.767178658043654,
"grad_norm": 1.4910729061389476,
"learning_rate": 1.5625626033131102e-06,
"loss": 0.7148,
"step": 4745
},
{
"epoch": 0.7679870654810024,
"grad_norm": 1.3823498315519551,
"learning_rate": 1.5523292126646505e-06,
"loss": 0.7111,
"step": 4750
},
{
"epoch": 0.7687954729183508,
"grad_norm": 1.5120794248749074,
"learning_rate": 1.542123280645292e-06,
"loss": 0.7169,
"step": 4755
},
{
"epoch": 0.7696038803556993,
"grad_norm": 1.4393108032014577,
"learning_rate": 1.5319448885391596e-06,
"loss": 0.7061,
"step": 4760
},
{
"epoch": 0.7704122877930477,
"grad_norm": 1.579487027567959,
"learning_rate": 1.521794117411039e-06,
"loss": 0.7112,
"step": 4765
},
{
"epoch": 0.7712206952303962,
"grad_norm": 1.7223557601541144,
"learning_rate": 1.5116710481057301e-06,
"loss": 0.712,
"step": 4770
},
{
"epoch": 0.7720291026677445,
"grad_norm": 1.3942438179900838,
"learning_rate": 1.5015757612474048e-06,
"loss": 0.7128,
"step": 4775
},
{
"epoch": 0.7728375101050929,
"grad_norm": 1.614918823377606,
"learning_rate": 1.4915083372389665e-06,
"loss": 0.7,
"step": 4780
},
{
"epoch": 0.7736459175424414,
"grad_norm": 1.565824384446224,
"learning_rate": 1.4814688562614094e-06,
"loss": 0.7168,
"step": 4785
},
{
"epoch": 0.7744543249797898,
"grad_norm": 1.426544859266959,
"learning_rate": 1.4714573982731705e-06,
"loss": 0.6955,
"step": 4790
},
{
"epoch": 0.7752627324171383,
"grad_norm": 1.5620988707197838,
"learning_rate": 1.4614740430095104e-06,
"loss": 0.7234,
"step": 4795
},
{
"epoch": 0.7760711398544866,
"grad_norm": 1.526988525228804,
"learning_rate": 1.451518869981859e-06,
"loss": 0.7241,
"step": 4800
},
{
"epoch": 0.7768795472918351,
"grad_norm": 1.5578195610213377,
"learning_rate": 1.4415919584771999e-06,
"loss": 0.7097,
"step": 4805
},
{
"epoch": 0.7776879547291835,
"grad_norm": 1.4842378634518532,
"learning_rate": 1.431693387557424e-06,
"loss": 0.7054,
"step": 4810
},
{
"epoch": 0.778496362166532,
"grad_norm": 1.5526408435473718,
"learning_rate": 1.4218232360587092e-06,
"loss": 0.6938,
"step": 4815
},
{
"epoch": 0.7793047696038804,
"grad_norm": 1.6055389555966928,
"learning_rate": 1.4119815825908922e-06,
"loss": 0.711,
"step": 4820
},
{
"epoch": 0.7801131770412287,
"grad_norm": 1.5107126562884128,
"learning_rate": 1.4021685055368345e-06,
"loss": 0.7109,
"step": 4825
},
{
"epoch": 0.7809215844785772,
"grad_norm": 1.8143707926782138,
"learning_rate": 1.392384083051808e-06,
"loss": 0.7067,
"step": 4830
},
{
"epoch": 0.7817299919159256,
"grad_norm": 1.5209084571946596,
"learning_rate": 1.3826283930628686e-06,
"loss": 0.7137,
"step": 4835
},
{
"epoch": 0.7825383993532741,
"grad_norm": 1.6615578812614993,
"learning_rate": 1.37290151326823e-06,
"loss": 0.7295,
"step": 4840
},
{
"epoch": 0.7833468067906225,
"grad_norm": 1.4826721800499079,
"learning_rate": 1.3632035211366562e-06,
"loss": 0.6925,
"step": 4845
},
{
"epoch": 0.7841552142279709,
"grad_norm": 1.5444908726157,
"learning_rate": 1.3535344939068347e-06,
"loss": 0.7287,
"step": 4850
},
{
"epoch": 0.7849636216653193,
"grad_norm": 1.6021717100804354,
"learning_rate": 1.3438945085867644e-06,
"loss": 0.6999,
"step": 4855
},
{
"epoch": 0.7857720291026677,
"grad_norm": 1.4010708923873407,
"learning_rate": 1.3342836419531434e-06,
"loss": 0.7173,
"step": 4860
},
{
"epoch": 0.7865804365400162,
"grad_norm": 1.6090770001687282,
"learning_rate": 1.3247019705507596e-06,
"loss": 0.7228,
"step": 4865
},
{
"epoch": 0.7873888439773646,
"grad_norm": 1.424793150875814,
"learning_rate": 1.3151495706918766e-06,
"loss": 0.7151,
"step": 4870
},
{
"epoch": 0.788197251414713,
"grad_norm": 1.4789106548880404,
"learning_rate": 1.3056265184556255e-06,
"loss": 0.7072,
"step": 4875
},
{
"epoch": 0.7890056588520614,
"grad_norm": 1.472129014937297,
"learning_rate": 1.2961328896874053e-06,
"loss": 0.695,
"step": 4880
},
{
"epoch": 0.7898140662894099,
"grad_norm": 1.6637166864071475,
"learning_rate": 1.2866687599982709e-06,
"loss": 0.7001,
"step": 4885
},
{
"epoch": 0.7906224737267583,
"grad_norm": 1.5057877081189113,
"learning_rate": 1.2772342047643365e-06,
"loss": 0.7008,
"step": 4890
},
{
"epoch": 0.7914308811641068,
"grad_norm": 1.548339961776595,
"learning_rate": 1.267829299126176e-06,
"loss": 0.6978,
"step": 4895
},
{
"epoch": 0.7922392886014551,
"grad_norm": 1.464593794442187,
"learning_rate": 1.2584541179882177e-06,
"loss": 0.7177,
"step": 4900
},
{
"epoch": 0.7930476960388035,
"grad_norm": 1.6351654658678574,
"learning_rate": 1.2491087360181542e-06,
"loss": 0.7026,
"step": 4905
},
{
"epoch": 0.793856103476152,
"grad_norm": 1.4879686812861164,
"learning_rate": 1.2397932276463436e-06,
"loss": 0.7392,
"step": 4910
},
{
"epoch": 0.7946645109135004,
"grad_norm": 1.3378650693860414,
"learning_rate": 1.2305076670652223e-06,
"loss": 0.6888,
"step": 4915
},
{
"epoch": 0.7954729183508489,
"grad_norm": 1.522150314193267,
"learning_rate": 1.2212521282287093e-06,
"loss": 0.7076,
"step": 4920
},
{
"epoch": 0.7962813257881972,
"grad_norm": 1.417333869582,
"learning_rate": 1.2120266848516154e-06,
"loss": 0.7037,
"step": 4925
},
{
"epoch": 0.7970897332255457,
"grad_norm": 1.6740080315406156,
"learning_rate": 1.202831410409065e-06,
"loss": 0.7061,
"step": 4930
},
{
"epoch": 0.7978981406628941,
"grad_norm": 1.553227192647926,
"learning_rate": 1.1936663781358977e-06,
"loss": 0.7079,
"step": 4935
},
{
"epoch": 0.7987065481002426,
"grad_norm": 1.8204170112679587,
"learning_rate": 1.1845316610260992e-06,
"loss": 0.7018,
"step": 4940
},
{
"epoch": 0.799514955537591,
"grad_norm": 1.634253956796262,
"learning_rate": 1.1754273318322096e-06,
"loss": 0.6829,
"step": 4945
},
{
"epoch": 0.8003233629749393,
"grad_norm": 1.6598849108252804,
"learning_rate": 1.1663534630647455e-06,
"loss": 0.693,
"step": 4950
},
{
"epoch": 0.8011317704122878,
"grad_norm": 1.4148173842136398,
"learning_rate": 1.1573101269916304e-06,
"loss": 0.7105,
"step": 4955
},
{
"epoch": 0.8019401778496362,
"grad_norm": 1.8151836099791694,
"learning_rate": 1.148297395637607e-06,
"loss": 0.6941,
"step": 4960
},
{
"epoch": 0.8027485852869847,
"grad_norm": 1.6808272789284275,
"learning_rate": 1.1393153407836742e-06,
"loss": 0.7136,
"step": 4965
},
{
"epoch": 0.8035569927243331,
"grad_norm": 1.519780990446839,
"learning_rate": 1.1303640339665106e-06,
"loss": 0.7162,
"step": 4970
},
{
"epoch": 0.8043654001616815,
"grad_norm": 1.8667006592783422,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.7098,
"step": 4975
},
{
"epoch": 0.8051738075990299,
"grad_norm": 1.563431852094894,
"learning_rate": 1.1125539493641774e-06,
"loss": 0.7108,
"step": 4980
},
{
"epoch": 0.8059822150363783,
"grad_norm": 1.4400199022026983,
"learning_rate": 1.1036953134256474e-06,
"loss": 0.7061,
"step": 4985
},
{
"epoch": 0.8067906224737268,
"grad_norm": 1.4832879916845079,
"learning_rate": 1.0948677092160291e-06,
"loss": 0.7221,
"step": 4990
},
{
"epoch": 0.8075990299110751,
"grad_norm": 1.4525799430657014,
"learning_rate": 1.0860712070418933e-06,
"loss": 0.699,
"step": 4995
},
{
"epoch": 0.8084074373484236,
"grad_norm": 1.547946240178616,
"learning_rate": 1.0773058769621015e-06,
"loss": 0.7287,
"step": 5000
},
{
"epoch": 0.809215844785772,
"grad_norm": 1.6138843907647331,
"learning_rate": 1.0685717887872504e-06,
"loss": 0.6947,
"step": 5005
},
{
"epoch": 0.8100242522231205,
"grad_norm": 1.5640680796292272,
"learning_rate": 1.059869012079109e-06,
"loss": 0.7008,
"step": 5010
},
{
"epoch": 0.8108326596604689,
"grad_norm": 1.8041754598860973,
"learning_rate": 1.0511976161500737e-06,
"loss": 0.7132,
"step": 5015
},
{
"epoch": 0.8116410670978172,
"grad_norm": 1.4268846951440264,
"learning_rate": 1.0425576700626084e-06,
"loss": 0.682,
"step": 5020
},
{
"epoch": 0.8124494745351657,
"grad_norm": 1.4889490939202308,
"learning_rate": 1.0339492426287012e-06,
"loss": 0.7013,
"step": 5025
},
{
"epoch": 0.8132578819725141,
"grad_norm": 1.3950850556481698,
"learning_rate": 1.0253724024093103e-06,
"loss": 0.7251,
"step": 5030
},
{
"epoch": 0.8140662894098626,
"grad_norm": 1.4223156108512096,
"learning_rate": 1.01682721771382e-06,
"loss": 0.6944,
"step": 5035
},
{
"epoch": 0.814874696847211,
"grad_norm": 1.5095272095787708,
"learning_rate": 1.008313756599502e-06,
"loss": 0.6973,
"step": 5040
},
{
"epoch": 0.8156831042845594,
"grad_norm": 1.590280282889916,
"learning_rate": 9.998320868709632e-07,
"loss": 0.7052,
"step": 5045
},
{
"epoch": 0.8164915117219078,
"grad_norm": 1.4357330509882016,
"learning_rate": 9.91382276079615e-07,
"loss": 0.7014,
"step": 5050
},
{
"epoch": 0.8172999191592563,
"grad_norm": 1.6143043421499383,
"learning_rate": 9.829643915231308e-07,
"loss": 0.7177,
"step": 5055
},
{
"epoch": 0.8181083265966047,
"grad_norm": 1.6121487430257533,
"learning_rate": 9.745785002449076e-07,
"loss": 0.6849,
"step": 5060
},
{
"epoch": 0.8189167340339532,
"grad_norm": 1.550845575196845,
"learning_rate": 9.662246690335414e-07,
"loss": 0.7213,
"step": 5065
},
{
"epoch": 0.8197251414713015,
"grad_norm": 1.9173659278509716,
"learning_rate": 9.579029644222827e-07,
"loss": 0.7148,
"step": 5070
},
{
"epoch": 0.8205335489086499,
"grad_norm": 1.5661412726726536,
"learning_rate": 9.496134526885142e-07,
"loss": 0.7012,
"step": 5075
},
{
"epoch": 0.8213419563459984,
"grad_norm": 1.4097401722824516,
"learning_rate": 9.413561998532262e-07,
"loss": 0.6902,
"step": 5080
},
{
"epoch": 0.8221503637833468,
"grad_norm": 1.5165033710569626,
"learning_rate": 9.331312716804791e-07,
"loss": 0.7072,
"step": 5085
},
{
"epoch": 0.8229587712206953,
"grad_norm": 1.4294694612217773,
"learning_rate": 9.249387336768944e-07,
"loss": 0.7064,
"step": 5090
},
{
"epoch": 0.8237671786580436,
"grad_norm": 1.579502358096418,
"learning_rate": 9.167786510911186e-07,
"loss": 0.7231,
"step": 5095
},
{
"epoch": 0.824575586095392,
"grad_norm": 1.6220290591033253,
"learning_rate": 9.086510889133154e-07,
"loss": 0.7057,
"step": 5100
},
{
"epoch": 0.8253839935327405,
"grad_norm": 1.3661914009939502,
"learning_rate": 9.005561118746381e-07,
"loss": 0.6835,
"step": 5105
},
{
"epoch": 0.826192400970089,
"grad_norm": 1.4598647759066914,
"learning_rate": 8.92493784446724e-07,
"loss": 0.6836,
"step": 5110
},
{
"epoch": 0.8270008084074374,
"grad_norm": 1.628879143000057,
"learning_rate": 8.844641708411716e-07,
"loss": 0.7071,
"step": 5115
},
{
"epoch": 0.8278092158447857,
"grad_norm": 1.3295496775714484,
"learning_rate": 8.764673350090375e-07,
"loss": 0.7048,
"step": 5120
},
{
"epoch": 0.8286176232821342,
"grad_norm": 1.5416846282683871,
"learning_rate": 8.685033406403193e-07,
"loss": 0.7318,
"step": 5125
},
{
"epoch": 0.8294260307194826,
"grad_norm": 1.4353888828295704,
"learning_rate": 8.605722511634517e-07,
"loss": 0.6864,
"step": 5130
},
{
"epoch": 0.8302344381568311,
"grad_norm": 1.5168677776758168,
"learning_rate": 8.526741297448055e-07,
"loss": 0.7042,
"step": 5135
},
{
"epoch": 0.8310428455941795,
"grad_norm": 1.8851927163642166,
"learning_rate": 8.448090392881797e-07,
"loss": 0.6996,
"step": 5140
},
{
"epoch": 0.8318512530315278,
"grad_norm": 1.5320231668585813,
"learning_rate": 8.369770424342977e-07,
"loss": 0.7029,
"step": 5145
},
{
"epoch": 0.8326596604688763,
"grad_norm": 1.4671336484132966,
"learning_rate": 8.291782015603179e-07,
"loss": 0.7119,
"step": 5150
},
{
"epoch": 0.8334680679062247,
"grad_norm": 1.457906963528756,
"learning_rate": 8.214125787793253e-07,
"loss": 0.6918,
"step": 5155
},
{
"epoch": 0.8342764753435732,
"grad_norm": 1.763667060951844,
"learning_rate": 8.136802359398488e-07,
"loss": 0.7089,
"step": 5160
},
{
"epoch": 0.8350848827809216,
"grad_norm": 1.4454996633987895,
"learning_rate": 8.059812346253576e-07,
"loss": 0.7034,
"step": 5165
},
{
"epoch": 0.83589329021827,
"grad_norm": 1.5476432315521382,
"learning_rate": 7.983156361537764e-07,
"loss": 0.7167,
"step": 5170
},
{
"epoch": 0.8367016976556184,
"grad_norm": 1.5243845322424452,
"learning_rate": 7.906835015770003e-07,
"loss": 0.7141,
"step": 5175
},
{
"epoch": 0.8375101050929669,
"grad_norm": 1.5839952810553182,
"learning_rate": 7.830848916803985e-07,
"loss": 0.7094,
"step": 5180
},
{
"epoch": 0.8383185125303153,
"grad_norm": 1.4970253587506417,
"learning_rate": 7.755198669823416e-07,
"loss": 0.6893,
"step": 5185
},
{
"epoch": 0.8391269199676638,
"grad_norm": 1.6540289854643369,
"learning_rate": 7.679884877337124e-07,
"loss": 0.7106,
"step": 5190
},
{
"epoch": 0.8399353274050121,
"grad_norm": 1.7088190894280755,
"learning_rate": 7.604908139174255e-07,
"loss": 0.7042,
"step": 5195
},
{
"epoch": 0.8407437348423605,
"grad_norm": 1.6508440609892434,
"learning_rate": 7.530269052479561e-07,
"loss": 0.688,
"step": 5200
},
{
"epoch": 0.841552142279709,
"grad_norm": 1.519149972828735,
"learning_rate": 7.455968211708569e-07,
"loss": 0.6955,
"step": 5205
},
{
"epoch": 0.8423605497170574,
"grad_norm": 1.6854832467205276,
"learning_rate": 7.382006208622889e-07,
"loss": 0.7115,
"step": 5210
},
{
"epoch": 0.8431689571544059,
"grad_norm": 1.4128267828965078,
"learning_rate": 7.30838363228551e-07,
"loss": 0.7038,
"step": 5215
},
{
"epoch": 0.8439773645917542,
"grad_norm": 1.461389977118903,
"learning_rate": 7.235101069056061e-07,
"loss": 0.7149,
"step": 5220
},
{
"epoch": 0.8447857720291027,
"grad_norm": 1.5690615776923793,
"learning_rate": 7.162159102586203e-07,
"loss": 0.7015,
"step": 5225
},
{
"epoch": 0.8455941794664511,
"grad_norm": 1.5502555021326645,
"learning_rate": 7.089558313814909e-07,
"loss": 0.7079,
"step": 5230
},
{
"epoch": 0.8464025869037995,
"grad_norm": 1.6274177821627565,
"learning_rate": 7.017299280963918e-07,
"loss": 0.7039,
"step": 5235
},
{
"epoch": 0.847210994341148,
"grad_norm": 1.2857795563254066,
"learning_rate": 6.945382579533061e-07,
"loss": 0.7262,
"step": 5240
},
{
"epoch": 0.8480194017784963,
"grad_norm": 1.7801567985934275,
"learning_rate": 6.873808782295715e-07,
"loss": 0.694,
"step": 5245
},
{
"epoch": 0.8488278092158448,
"grad_norm": 1.465131459024236,
"learning_rate": 6.802578459294235e-07,
"loss": 0.7064,
"step": 5250
},
{
"epoch": 0.8496362166531932,
"grad_norm": 1.4110023215116603,
"learning_rate": 6.731692177835381e-07,
"loss": 0.7042,
"step": 5255
},
{
"epoch": 0.8504446240905417,
"grad_norm": 1.470115925149478,
"learning_rate": 6.661150502485875e-07,
"loss": 0.6949,
"step": 5260
},
{
"epoch": 0.85125303152789,
"grad_norm": 1.5987322730856806,
"learning_rate": 6.590953995067812e-07,
"loss": 0.6898,
"step": 5265
},
{
"epoch": 0.8520614389652384,
"grad_norm": 1.4703074530943865,
"learning_rate": 6.521103214654262e-07,
"loss": 0.7021,
"step": 5270
},
{
"epoch": 0.8528698464025869,
"grad_norm": 1.5583892786451392,
"learning_rate": 6.451598717564794e-07,
"loss": 0.7127,
"step": 5275
},
{
"epoch": 0.8536782538399353,
"grad_norm": 1.6831916055099336,
"learning_rate": 6.382441057361e-07,
"loss": 0.7242,
"step": 5280
},
{
"epoch": 0.8544866612772838,
"grad_norm": 1.7877284298900433,
"learning_rate": 6.313630784842168e-07,
"loss": 0.7057,
"step": 5285
},
{
"epoch": 0.8552950687146321,
"grad_norm": 1.5818124463307648,
"learning_rate": 6.245168448040811e-07,
"loss": 0.6779,
"step": 5290
},
{
"epoch": 0.8561034761519806,
"grad_norm": 1.2767314164369492,
"learning_rate": 6.177054592218363e-07,
"loss": 0.7158,
"step": 5295
},
{
"epoch": 0.856911883589329,
"grad_norm": 1.453861814250626,
"learning_rate": 6.109289759860826e-07,
"loss": 0.7206,
"step": 5300
},
{
"epoch": 0.8577202910266775,
"grad_norm": 1.519729144030008,
"learning_rate": 6.041874490674416e-07,
"loss": 0.6963,
"step": 5305
},
{
"epoch": 0.8585286984640259,
"grad_norm": 1.5493355027852693,
"learning_rate": 5.974809321581315e-07,
"loss": 0.6907,
"step": 5310
},
{
"epoch": 0.8593371059013742,
"grad_norm": 1.4314722635920178,
"learning_rate": 5.908094786715341e-07,
"loss": 0.6837,
"step": 5315
},
{
"epoch": 0.8601455133387227,
"grad_norm": 1.5610865108849659,
"learning_rate": 5.841731417417735e-07,
"loss": 0.6957,
"step": 5320
},
{
"epoch": 0.8609539207760711,
"grad_norm": 1.4724518613233466,
"learning_rate": 5.775719742232927e-07,
"loss": 0.7125,
"step": 5325
},
{
"epoch": 0.8617623282134196,
"grad_norm": 1.3737829099361796,
"learning_rate": 5.71006028690429e-07,
"loss": 0.6942,
"step": 5330
},
{
"epoch": 0.862570735650768,
"grad_norm": 1.5545604898279772,
"learning_rate": 5.644753574369987e-07,
"loss": 0.7006,
"step": 5335
},
{
"epoch": 0.8633791430881164,
"grad_norm": 1.5672128023139609,
"learning_rate": 5.579800124758789e-07,
"loss": 0.6858,
"step": 5340
},
{
"epoch": 0.8641875505254648,
"grad_norm": 1.5144163766376024,
"learning_rate": 5.515200455385955e-07,
"loss": 0.7224,
"step": 5345
},
{
"epoch": 0.8649959579628133,
"grad_norm": 1.7093478444467205,
"learning_rate": 5.450955080749099e-07,
"loss": 0.7012,
"step": 5350
},
{
"epoch": 0.8658043654001617,
"grad_norm": 1.4151660570069253,
"learning_rate": 5.387064512524065e-07,
"loss": 0.6955,
"step": 5355
},
{
"epoch": 0.8666127728375101,
"grad_norm": 1.6373642598729596,
"learning_rate": 5.323529259560911e-07,
"loss": 0.6996,
"step": 5360
},
{
"epoch": 0.8674211802748585,
"grad_norm": 1.4331673797986197,
"learning_rate": 5.260349827879785e-07,
"loss": 0.7088,
"step": 5365
},
{
"epoch": 0.8682295877122069,
"grad_norm": 1.6411349503885173,
"learning_rate": 5.197526720666963e-07,
"loss": 0.686,
"step": 5370
},
{
"epoch": 0.8690379951495554,
"grad_norm": 1.4837007483886673,
"learning_rate": 5.135060438270784e-07,
"loss": 0.6867,
"step": 5375
},
{
"epoch": 0.8698464025869038,
"grad_norm": 1.5770702161207801,
"learning_rate": 5.072951478197724e-07,
"loss": 0.7245,
"step": 5380
},
{
"epoch": 0.8706548100242523,
"grad_norm": 1.552555517411911,
"learning_rate": 5.011200335108379e-07,
"loss": 0.7042,
"step": 5385
},
{
"epoch": 0.8714632174616006,
"grad_norm": 1.7490965534967693,
"learning_rate": 4.94980750081353e-07,
"loss": 0.7021,
"step": 5390
},
{
"epoch": 0.872271624898949,
"grad_norm": 1.6919904276080737,
"learning_rate": 4.888773464270286e-07,
"loss": 0.7054,
"step": 5395
},
{
"epoch": 0.8730800323362975,
"grad_norm": 2.059158063480853,
"learning_rate": 4.828098711578116e-07,
"loss": 0.7055,
"step": 5400
},
{
"epoch": 0.8738884397736459,
"grad_norm": 1.7275320576169928,
"learning_rate": 4.767783725975017e-07,
"loss": 0.71,
"step": 5405
},
{
"epoch": 0.8746968472109944,
"grad_norm": 1.7182854663915335,
"learning_rate": 4.7078289878336737e-07,
"loss": 0.6998,
"step": 5410
},
{
"epoch": 0.8755052546483427,
"grad_norm": 1.5247564407950007,
"learning_rate": 4.6482349746575783e-07,
"loss": 0.6861,
"step": 5415
},
{
"epoch": 0.8763136620856912,
"grad_norm": 1.6432058150970736,
"learning_rate": 4.589002161077305e-07,
"loss": 0.686,
"step": 5420
},
{
"epoch": 0.8771220695230396,
"grad_norm": 1.5847832306944918,
"learning_rate": 4.5301310188466676e-07,
"loss": 0.7039,
"step": 5425
},
{
"epoch": 0.8779304769603881,
"grad_norm": 1.4647113649819359,
"learning_rate": 4.4716220168389777e-07,
"loss": 0.6938,
"step": 5430
},
{
"epoch": 0.8787388843977365,
"grad_norm": 1.5111904344889613,
"learning_rate": 4.4134756210433505e-07,
"loss": 0.6937,
"step": 5435
},
{
"epoch": 0.8795472918350848,
"grad_norm": 1.8253966937758632,
"learning_rate": 4.355692294560915e-07,
"loss": 0.6878,
"step": 5440
},
{
"epoch": 0.8803556992724333,
"grad_norm": 1.426451063493756,
"learning_rate": 4.2982724976012134e-07,
"loss": 0.6902,
"step": 5445
},
{
"epoch": 0.8811641067097817,
"grad_norm": 1.4894657386745687,
"learning_rate": 4.241216687478455e-07,
"loss": 0.6967,
"step": 5450
},
{
"epoch": 0.8819725141471302,
"grad_norm": 1.413659434241133,
"learning_rate": 4.1845253186079513e-07,
"loss": 0.7019,
"step": 5455
},
{
"epoch": 0.8827809215844786,
"grad_norm": 1.4472062769677223,
"learning_rate": 4.12819884250244e-07,
"loss": 0.6845,
"step": 5460
},
{
"epoch": 0.883589329021827,
"grad_norm": 1.6958560319752523,
"learning_rate": 4.0722377077684947e-07,
"loss": 0.6912,
"step": 5465
},
{
"epoch": 0.8843977364591754,
"grad_norm": 1.5193788282930156,
"learning_rate": 4.0166423601029735e-07,
"loss": 0.7096,
"step": 5470
},
{
"epoch": 0.8852061438965239,
"grad_norm": 1.4417388906412087,
"learning_rate": 3.9614132422894637e-07,
"loss": 0.6979,
"step": 5475
},
{
"epoch": 0.8860145513338723,
"grad_norm": 1.6887342097664206,
"learning_rate": 3.9065507941947467e-07,
"loss": 0.711,
"step": 5480
},
{
"epoch": 0.8868229587712207,
"grad_norm": 1.5209079055984485,
"learning_rate": 3.852055452765313e-07,
"loss": 0.7,
"step": 5485
},
{
"epoch": 0.8876313662085691,
"grad_norm": 1.4951444788612174,
"learning_rate": 3.797927652023847e-07,
"loss": 0.7025,
"step": 5490
},
{
"epoch": 0.8884397736459175,
"grad_norm": 1.5994588482558039,
"learning_rate": 3.744167823065814e-07,
"loss": 0.7053,
"step": 5495
},
{
"epoch": 0.889248181083266,
"grad_norm": 1.6223372164127636,
"learning_rate": 3.6907763940559784e-07,
"loss": 0.6903,
"step": 5500
},
{
"epoch": 0.8900565885206144,
"grad_norm": 1.5787830461685994,
"learning_rate": 3.6377537902250573e-07,
"loss": 0.6968,
"step": 5505
},
{
"epoch": 0.8908649959579629,
"grad_norm": 1.5012854464186227,
"learning_rate": 3.5851004338662564e-07,
"loss": 0.7075,
"step": 5510
},
{
"epoch": 0.8916734033953112,
"grad_norm": 1.5816110130647192,
"learning_rate": 3.532816744331963e-07,
"loss": 0.7063,
"step": 5515
},
{
"epoch": 0.8924818108326596,
"grad_norm": 1.6516342043629912,
"learning_rate": 3.4809031380304114e-07,
"loss": 0.7056,
"step": 5520
},
{
"epoch": 0.8932902182700081,
"grad_norm": 1.4324191798602168,
"learning_rate": 3.429360028422307e-07,
"loss": 0.7124,
"step": 5525
},
{
"epoch": 0.8940986257073565,
"grad_norm": 1.5278351089643472,
"learning_rate": 3.378187826017604e-07,
"loss": 0.6951,
"step": 5530
},
{
"epoch": 0.8949070331447049,
"grad_norm": 1.6668491680320379,
"learning_rate": 3.3273869383721734e-07,
"loss": 0.7165,
"step": 5535
},
{
"epoch": 0.8957154405820533,
"grad_norm": 1.4121703345362437,
"learning_rate": 3.276957770084616e-07,
"loss": 0.705,
"step": 5540
},
{
"epoch": 0.8965238480194018,
"grad_norm": 1.295321212557015,
"learning_rate": 3.2269007227930026e-07,
"loss": 0.6945,
"step": 5545
},
{
"epoch": 0.8973322554567502,
"grad_norm": 1.491642893775521,
"learning_rate": 3.177216195171673e-07,
"loss": 0.71,
"step": 5550
},
{
"epoch": 0.8981406628940987,
"grad_norm": 1.5093040697730336,
"learning_rate": 3.1279045829280706e-07,
"loss": 0.7097,
"step": 5555
},
{
"epoch": 0.898949070331447,
"grad_norm": 1.2316504521547396,
"learning_rate": 3.0789662787996e-07,
"loss": 0.6965,
"step": 5560
},
{
"epoch": 0.8997574777687954,
"grad_norm": 1.550192376633313,
"learning_rate": 3.030401672550487e-07,
"loss": 0.6996,
"step": 5565
},
{
"epoch": 0.9005658852061439,
"grad_norm": 1.484003056341841,
"learning_rate": 2.9822111509687e-07,
"loss": 0.7065,
"step": 5570
},
{
"epoch": 0.9013742926434923,
"grad_norm": 1.443207613038234,
"learning_rate": 2.9343950978627965e-07,
"loss": 0.7074,
"step": 5575
},
{
"epoch": 0.9021827000808408,
"grad_norm": 1.3381987402914712,
"learning_rate": 2.88695389405898e-07,
"loss": 0.6988,
"step": 5580
},
{
"epoch": 0.9029911075181891,
"grad_norm": 1.5561292741790862,
"learning_rate": 2.8398879173979434e-07,
"loss": 0.6943,
"step": 5585
},
{
"epoch": 0.9037995149555376,
"grad_norm": 1.5456031956017944,
"learning_rate": 2.7931975427319734e-07,
"loss": 0.7075,
"step": 5590
},
{
"epoch": 0.904607922392886,
"grad_norm": 1.2770024516759357,
"learning_rate": 2.746883141921869e-07,
"loss": 0.7082,
"step": 5595
},
{
"epoch": 0.9054163298302345,
"grad_norm": 1.5503292032340625,
"learning_rate": 2.7009450838340613e-07,
"loss": 0.7019,
"step": 5600
},
{
"epoch": 0.9062247372675829,
"grad_norm": 1.4285805273520935,
"learning_rate": 2.6553837343376023e-07,
"loss": 0.7018,
"step": 5605
},
{
"epoch": 0.9070331447049312,
"grad_norm": 1.3528073503244311,
"learning_rate": 2.61019945630131e-07,
"loss": 0.6897,
"step": 5610
},
{
"epoch": 0.9078415521422797,
"grad_norm": 1.8090163212162127,
"learning_rate": 2.5653926095908446e-07,
"loss": 0.7228,
"step": 5615
},
{
"epoch": 0.9086499595796281,
"grad_norm": 1.3823209097986053,
"learning_rate": 2.520963551065853e-07,
"loss": 0.7024,
"step": 5620
},
{
"epoch": 0.9094583670169766,
"grad_norm": 1.5088907496000867,
"learning_rate": 2.476912634577128e-07,
"loss": 0.689,
"step": 5625
},
{
"epoch": 0.910266774454325,
"grad_norm": 1.4516203595743193,
"learning_rate": 2.4332402109638e-07,
"loss": 0.7139,
"step": 5630
},
{
"epoch": 0.9110751818916734,
"grad_norm": 1.8781761391343346,
"learning_rate": 2.3899466280504936e-07,
"loss": 0.6915,
"step": 5635
},
{
"epoch": 0.9118835893290218,
"grad_norm": 1.563214928880183,
"learning_rate": 2.3470322306446468e-07,
"loss": 0.7289,
"step": 5640
},
{
"epoch": 0.9126919967663703,
"grad_norm": 1.8065914033787347,
"learning_rate": 2.304497360533664e-07,
"loss": 0.6889,
"step": 5645
},
{
"epoch": 0.9135004042037187,
"grad_norm": 1.889183469859966,
"learning_rate": 2.2623423564822666e-07,
"loss": 0.72,
"step": 5650
},
{
"epoch": 0.9143088116410671,
"grad_norm": 1.6535990324640262,
"learning_rate": 2.22056755422978e-07,
"loss": 0.7238,
"step": 5655
},
{
"epoch": 0.9151172190784155,
"grad_norm": 1.3746775282920567,
"learning_rate": 2.1791732864874182e-07,
"loss": 0.7097,
"step": 5660
},
{
"epoch": 0.9159256265157639,
"grad_norm": 1.3780616233800305,
"learning_rate": 2.1381598829357031e-07,
"loss": 0.7201,
"step": 5665
},
{
"epoch": 0.9167340339531124,
"grad_norm": 1.496841353204978,
"learning_rate": 2.0975276702217716e-07,
"loss": 0.7155,
"step": 5670
},
{
"epoch": 0.9175424413904608,
"grad_norm": 1.3354995789369437,
"learning_rate": 2.0572769719568286e-07,
"loss": 0.7035,
"step": 5675
},
{
"epoch": 0.9183508488278093,
"grad_norm": 1.2888588034538615,
"learning_rate": 2.0174081087135312e-07,
"loss": 0.7035,
"step": 5680
},
{
"epoch": 0.9191592562651576,
"grad_norm": 1.3070065889530205,
"learning_rate": 1.9779213980234468e-07,
"loss": 0.6906,
"step": 5685
},
{
"epoch": 0.919967663702506,
"grad_norm": 1.649361503680924,
"learning_rate": 1.9388171543745394e-07,
"loss": 0.6991,
"step": 5690
},
{
"epoch": 0.9207760711398545,
"grad_norm": 1.4618951093045796,
"learning_rate": 1.9000956892086363e-07,
"loss": 0.7114,
"step": 5695
},
{
"epoch": 0.9215844785772029,
"grad_norm": 1.858821992427871,
"learning_rate": 1.861757310918977e-07,
"loss": 0.6981,
"step": 5700
},
{
"epoch": 0.9223928860145514,
"grad_norm": 1.4390333751784132,
"learning_rate": 1.823802324847751e-07,
"loss": 0.6947,
"step": 5705
},
{
"epoch": 0.9232012934518997,
"grad_norm": 1.3446164451118023,
"learning_rate": 1.7862310332836307e-07,
"loss": 0.7165,
"step": 5710
},
{
"epoch": 0.9240097008892482,
"grad_norm": 1.3267887077701044,
"learning_rate": 1.749043735459427e-07,
"loss": 0.6914,
"step": 5715
},
{
"epoch": 0.9248181083265966,
"grad_norm": 1.4452492665400665,
"learning_rate": 1.7122407275496411e-07,
"loss": 0.6994,
"step": 5720
},
{
"epoch": 0.9256265157639451,
"grad_norm": 1.6067816385523603,
"learning_rate": 1.6758223026681507e-07,
"loss": 0.7077,
"step": 5725
},
{
"epoch": 0.9264349232012935,
"grad_norm": 1.3759663515194218,
"learning_rate": 1.639788750865867e-07,
"loss": 0.6867,
"step": 5730
},
{
"epoch": 0.9272433306386418,
"grad_norm": 1.4050656917463036,
"learning_rate": 1.6041403591283866e-07,
"loss": 0.7155,
"step": 5735
},
{
"epoch": 0.9280517380759903,
"grad_norm": 1.2966156059566525,
"learning_rate": 1.5688774113737814e-07,
"loss": 0.6991,
"step": 5740
},
{
"epoch": 0.9288601455133387,
"grad_norm": 1.3415985484676374,
"learning_rate": 1.5340001884502577e-07,
"loss": 0.7077,
"step": 5745
},
{
"epoch": 0.9296685529506872,
"grad_norm": 1.3587674188346928,
"learning_rate": 1.499508968133978e-07,
"loss": 0.6907,
"step": 5750
},
{
"epoch": 0.9304769603880356,
"grad_norm": 1.4836521085637517,
"learning_rate": 1.4654040251268097e-07,
"loss": 0.711,
"step": 5755
},
{
"epoch": 0.931285367825384,
"grad_norm": 1.5356253203571395,
"learning_rate": 1.4316856310541638e-07,
"loss": 0.7027,
"step": 5760
},
{
"epoch": 0.9320937752627324,
"grad_norm": 1.3179793459708602,
"learning_rate": 1.3983540544628138e-07,
"loss": 0.6885,
"step": 5765
},
{
"epoch": 0.9329021827000809,
"grad_norm": 1.4026702983758432,
"learning_rate": 1.3654095608187757e-07,
"loss": 0.681,
"step": 5770
},
{
"epoch": 0.9337105901374293,
"grad_norm": 1.6044541361178586,
"learning_rate": 1.332852412505159e-07,
"loss": 0.7184,
"step": 5775
},
{
"epoch": 0.9345189975747777,
"grad_norm": 1.3569634212248618,
"learning_rate": 1.300682868820119e-07,
"loss": 0.6993,
"step": 5780
},
{
"epoch": 0.9353274050121261,
"grad_norm": 1.4476284125206074,
"learning_rate": 1.2689011859747745e-07,
"loss": 0.699,
"step": 5785
},
{
"epoch": 0.9361358124494745,
"grad_norm": 1.2406212283464149,
"learning_rate": 1.2375076170911604e-07,
"loss": 0.6838,
"step": 5790
},
{
"epoch": 0.936944219886823,
"grad_norm": 1.3821866002119294,
"learning_rate": 1.2065024122002055e-07,
"loss": 0.6936,
"step": 5795
},
{
"epoch": 0.9377526273241714,
"grad_norm": 1.5651244760885679,
"learning_rate": 1.1758858182397692e-07,
"loss": 0.6886,
"step": 5800
},
{
"epoch": 0.9385610347615198,
"grad_norm": 1.4282620996566417,
"learning_rate": 1.1456580790526528e-07,
"loss": 0.7081,
"step": 5805
},
{
"epoch": 0.9393694421988682,
"grad_norm": 1.5025168367086414,
"learning_rate": 1.1158194353846574e-07,
"loss": 0.6859,
"step": 5810
},
{
"epoch": 0.9401778496362166,
"grad_norm": 1.3512739774700069,
"learning_rate": 1.0863701248826797e-07,
"loss": 0.7225,
"step": 5815
},
{
"epoch": 0.9409862570735651,
"grad_norm": 1.4231363820423177,
"learning_rate": 1.0573103820928022e-07,
"loss": 0.706,
"step": 5820
},
{
"epoch": 0.9417946645109135,
"grad_norm": 1.4255699165992657,
"learning_rate": 1.0286404384584448e-07,
"loss": 0.7289,
"step": 5825
},
{
"epoch": 0.9426030719482619,
"grad_norm": 1.3866774534958695,
"learning_rate": 1.0003605223184998e-07,
"loss": 0.6676,
"step": 5830
},
{
"epoch": 0.9434114793856103,
"grad_norm": 1.558561584720674,
"learning_rate": 9.724708589055332e-08,
"loss": 0.6827,
"step": 5835
},
{
"epoch": 0.9442198868229588,
"grad_norm": 1.4814199460791437,
"learning_rate": 9.449716703439805e-08,
"loss": 0.7012,
"step": 5840
},
{
"epoch": 0.9450282942603072,
"grad_norm": 1.4398896590301569,
"learning_rate": 9.178631756483758e-08,
"loss": 0.7222,
"step": 5845
},
{
"epoch": 0.9458367016976557,
"grad_norm": 1.4274832095739076,
"learning_rate": 8.911455907216149e-08,
"loss": 0.6974,
"step": 5850
},
{
"epoch": 0.946645109135004,
"grad_norm": 1.3302674791379907,
"learning_rate": 8.648191283532337e-08,
"loss": 0.7109,
"step": 5855
},
{
"epoch": 0.9474535165723524,
"grad_norm": 1.518044488774888,
"learning_rate": 8.388839982176988e-08,
"loss": 0.6706,
"step": 5860
},
{
"epoch": 0.9482619240097009,
"grad_norm": 1.3913177767364981,
"learning_rate": 8.133404068727702e-08,
"loss": 0.7175,
"step": 5865
},
{
"epoch": 0.9490703314470493,
"grad_norm": 1.340352098968631,
"learning_rate": 7.881885577578185e-08,
"loss": 0.696,
"step": 5870
},
{
"epoch": 0.9498787388843978,
"grad_norm": 1.371861271599684,
"learning_rate": 7.634286511922384e-08,
"loss": 0.7122,
"step": 5875
},
{
"epoch": 0.9506871463217461,
"grad_norm": 1.30296931416904,
"learning_rate": 7.390608843738156e-08,
"loss": 0.6949,
"step": 5880
},
{
"epoch": 0.9514955537590946,
"grad_norm": 1.3002210105122816,
"learning_rate": 7.150854513772009e-08,
"loss": 0.7001,
"step": 5885
},
{
"epoch": 0.952303961196443,
"grad_norm": 1.2765177058238852,
"learning_rate": 6.915025431523282e-08,
"loss": 0.7014,
"step": 5890
},
{
"epoch": 0.9531123686337915,
"grad_norm": 1.5746971313785787,
"learning_rate": 6.683123475229148e-08,
"loss": 0.7083,
"step": 5895
},
{
"epoch": 0.9539207760711399,
"grad_norm": 1.5888397024986494,
"learning_rate": 6.455150491849527e-08,
"loss": 0.6858,
"step": 5900
},
{
"epoch": 0.9547291835084882,
"grad_norm": 1.4426978109483788,
"learning_rate": 6.231108297052424e-08,
"loss": 0.7146,
"step": 5905
},
{
"epoch": 0.9555375909458367,
"grad_norm": 1.5455545862291462,
"learning_rate": 6.010998675199554e-08,
"loss": 0.7077,
"step": 5910
},
{
"epoch": 0.9563459983831851,
"grad_norm": 1.3957709152429087,
"learning_rate": 5.794823379331793e-08,
"loss": 0.7192,
"step": 5915
},
{
"epoch": 0.9571544058205336,
"grad_norm": 1.5204415869547911,
"learning_rate": 5.582584131155866e-08,
"loss": 0.7096,
"step": 5920
},
{
"epoch": 0.957962813257882,
"grad_norm": 1.3205805736575398,
"learning_rate": 5.3742826210299584e-08,
"loss": 0.7033,
"step": 5925
},
{
"epoch": 0.9587712206952304,
"grad_norm": 1.4266292287008944,
"learning_rate": 5.169920507950621e-08,
"loss": 0.6987,
"step": 5930
},
{
"epoch": 0.9595796281325788,
"grad_norm": 1.480234828774731,
"learning_rate": 4.9694994195394474e-08,
"loss": 0.7157,
"step": 5935
},
{
"epoch": 0.9603880355699272,
"grad_norm": 1.7268574237528078,
"learning_rate": 4.773020952030083e-08,
"loss": 0.6952,
"step": 5940
},
{
"epoch": 0.9611964430072757,
"grad_norm": 1.4923064684180711,
"learning_rate": 4.58048667025579e-08,
"loss": 0.7001,
"step": 5945
},
{
"epoch": 0.9620048504446241,
"grad_norm": 1.5378356376814888,
"learning_rate": 4.391898107636461e-08,
"loss": 0.6915,
"step": 5950
},
{
"epoch": 0.9628132578819725,
"grad_norm": 1.3364927348435258,
"learning_rate": 4.207256766166845e-08,
"loss": 0.695,
"step": 5955
},
{
"epoch": 0.9636216653193209,
"grad_norm": 1.3683902840034843,
"learning_rate": 4.0265641164045075e-08,
"loss": 0.6916,
"step": 5960
},
{
"epoch": 0.9644300727566694,
"grad_norm": 1.4925361751549704,
"learning_rate": 3.849821597457892e-08,
"loss": 0.6817,
"step": 5965
},
{
"epoch": 0.9652384801940178,
"grad_norm": 1.5452153453031308,
"learning_rate": 3.677030616975163e-08,
"loss": 0.7034,
"step": 5970
},
{
"epoch": 0.9660468876313663,
"grad_norm": 1.4457585595064018,
"learning_rate": 3.508192551132883e-08,
"loss": 0.6761,
"step": 5975
},
{
"epoch": 0.9668552950687146,
"grad_norm": 1.5607103585541133,
"learning_rate": 3.34330874462474e-08,
"loss": 0.698,
"step": 5980
},
{
"epoch": 0.967663702506063,
"grad_norm": 1.388649382711461,
"learning_rate": 3.182380510651506e-08,
"loss": 0.7104,
"step": 5985
},
{
"epoch": 0.9684721099434115,
"grad_norm": 1.5025532102637633,
"learning_rate": 3.025409130909929e-08,
"loss": 0.6927,
"step": 5990
},
{
"epoch": 0.9692805173807599,
"grad_norm": 1.5115410164451122,
"learning_rate": 2.8723958555827993e-08,
"loss": 0.7021,
"step": 5995
},
{
"epoch": 0.9700889248181084,
"grad_norm": 1.4667370044065153,
"learning_rate": 2.723341903329124e-08,
"loss": 0.7011,
"step": 6000
},
{
"epoch": 0.9708973322554567,
"grad_norm": 1.3729938733212024,
"learning_rate": 2.5782484612741908e-08,
"loss": 0.7062,
"step": 6005
},
{
"epoch": 0.9717057396928052,
"grad_norm": 1.3730064673661673,
"learning_rate": 2.4371166850001292e-08,
"loss": 0.7119,
"step": 6010
},
{
"epoch": 0.9725141471301536,
"grad_norm": 1.4865892461508956,
"learning_rate": 2.2999476985369196e-08,
"loss": 0.6984,
"step": 6015
},
{
"epoch": 0.973322554567502,
"grad_norm": 1.5028651779794058,
"learning_rate": 2.1667425943532884e-08,
"loss": 0.695,
"step": 6020
},
{
"epoch": 0.9741309620048505,
"grad_norm": 1.5599213421453288,
"learning_rate": 2.0375024333478267e-08,
"loss": 0.7174,
"step": 6025
},
{
"epoch": 0.9749393694421988,
"grad_norm": 2.0104805596152,
"learning_rate": 1.9122282448409413e-08,
"loss": 0.7369,
"step": 6030
},
{
"epoch": 0.9757477768795473,
"grad_norm": 1.581779872366385,
"learning_rate": 1.7909210265664167e-08,
"loss": 0.6792,
"step": 6035
},
{
"epoch": 0.9765561843168957,
"grad_norm": 1.3891174140800855,
"learning_rate": 1.6735817446633663e-08,
"loss": 0.7007,
"step": 6040
},
{
"epoch": 0.9773645917542442,
"grad_norm": 1.3798388142109828,
"learning_rate": 1.5602113336688485e-08,
"loss": 0.6959,
"step": 6045
},
{
"epoch": 0.9781729991915926,
"grad_norm": 1.4507674372361408,
"learning_rate": 1.450810696510041e-08,
"loss": 0.7079,
"step": 6050
},
{
"epoch": 0.978981406628941,
"grad_norm": 1.5549948970077827,
"learning_rate": 1.3453807044975232e-08,
"loss": 0.6892,
"step": 6055
},
{
"epoch": 0.9797898140662894,
"grad_norm": 1.3944647878700718,
"learning_rate": 1.2439221973178372e-08,
"loss": 0.6956,
"step": 6060
},
{
"epoch": 0.9805982215036378,
"grad_norm": 1.2971921385179899,
"learning_rate": 1.1464359830271055e-08,
"loss": 0.6756,
"step": 6065
},
{
"epoch": 0.9814066289409863,
"grad_norm": 1.4195361555610082,
"learning_rate": 1.05292283804459e-08,
"loss": 0.7044,
"step": 6070
},
{
"epoch": 0.9822150363783346,
"grad_norm": 1.3029462140916905,
"learning_rate": 9.633835071463094e-09,
"loss": 0.6926,
"step": 6075
},
{
"epoch": 0.9830234438156831,
"grad_norm": 1.3681427565121465,
"learning_rate": 8.778187034593766e-09,
"loss": 0.7141,
"step": 6080
},
{
"epoch": 0.9838318512530315,
"grad_norm": 1.580351014203535,
"learning_rate": 7.962291084560592e-09,
"loss": 0.6982,
"step": 6085
},
{
"epoch": 0.98464025869038,
"grad_norm": 1.402295885158622,
"learning_rate": 7.186153719485056e-09,
"loss": 0.7241,
"step": 6090
},
{
"epoch": 0.9854486661277284,
"grad_norm": 1.4372946746103246,
"learning_rate": 6.449781120836385e-09,
"loss": 0.6943,
"step": 6095
},
{
"epoch": 0.9862570735650767,
"grad_norm": 1.354857429409591,
"learning_rate": 5.753179153379362e-09,
"loss": 0.6893,
"step": 6100
},
{
"epoch": 0.9870654810024252,
"grad_norm": 1.2880968583775416,
"learning_rate": 5.09635336513159e-09,
"loss": 0.7132,
"step": 6105
},
{
"epoch": 0.9878738884397736,
"grad_norm": 1.5205030135158186,
"learning_rate": 4.4793089873162995e-09,
"loss": 0.6998,
"step": 6110
},
{
"epoch": 0.9886822958771221,
"grad_norm": 1.4082262313295961,
"learning_rate": 3.9020509343212775e-09,
"loss": 0.6831,
"step": 6115
},
{
"epoch": 0.9894907033144705,
"grad_norm": 1.3411345047646837,
"learning_rate": 3.3645838036611146e-09,
"loss": 0.7041,
"step": 6120
},
{
"epoch": 0.9902991107518189,
"grad_norm": 1.4474166300255564,
"learning_rate": 2.8669118759383497e-09,
"loss": 0.7057,
"step": 6125
},
{
"epoch": 0.9911075181891673,
"grad_norm": 1.4662855334778262,
"learning_rate": 2.4090391148112734e-09,
"loss": 0.6817,
"step": 6130
},
{
"epoch": 0.9919159256265158,
"grad_norm": 1.4814179796345557,
"learning_rate": 1.9909691669622868e-09,
"loss": 0.6871,
"step": 6135
},
{
"epoch": 0.9927243330638642,
"grad_norm": 1.574608287576024,
"learning_rate": 1.6127053620673683e-09,
"loss": 0.7386,
"step": 6140
},
{
"epoch": 0.9935327405012127,
"grad_norm": 1.3774684096651242,
"learning_rate": 1.2742507127710967e-09,
"loss": 0.695,
"step": 6145
},
{
"epoch": 0.994341147938561,
"grad_norm": 1.557883359993657,
"learning_rate": 9.75607914660559e-10,
"loss": 0.7087,
"step": 6150
},
{
"epoch": 0.9951495553759094,
"grad_norm": 1.382818271691103,
"learning_rate": 7.167793462475869e-10,
"loss": 0.7278,
"step": 6155
},
{
"epoch": 0.9959579628132579,
"grad_norm": 1.6471775994899713,
"learning_rate": 4.977670689459979e-10,
"loss": 0.7092,
"step": 6160
},
{
"epoch": 0.9967663702506063,
"grad_norm": 1.5090005897564358,
"learning_rate": 3.18572827057162e-10,
"loss": 0.7281,
"step": 6165
},
{
"epoch": 0.9975747776879548,
"grad_norm": 1.6213042923583454,
"learning_rate": 1.7919804775612394e-10,
"loss": 0.6912,
"step": 6170
},
{
"epoch": 0.9983831851253031,
"grad_norm": 1.3874750896868153,
"learning_rate": 7.964384107828071e-11,
"loss": 0.6994,
"step": 6175
},
{
"epoch": 0.9991915925626516,
"grad_norm": 1.2804574968031643,
"learning_rate": 1.9910999914385386e-11,
"loss": 0.6947,
"step": 6180
},
{
"epoch": 1.0,
"grad_norm": 1.58598368671411,
"learning_rate": 0.0,
"loss": 0.7102,
"step": 6185
},
{
"epoch": 1.0,
"eval_loss": 0.7062155604362488,
"eval_runtime": 3.4997,
"eval_samples_per_second": 2.857,
"eval_steps_per_second": 0.857,
"step": 6185
},
{
"epoch": 1.0,
"step": 6185,
"total_flos": 1963824696786944.0,
"train_loss": 0.7727144511168101,
"train_runtime": 22636.3522,
"train_samples_per_second": 4.371,
"train_steps_per_second": 0.273
}
],
"logging_steps": 5,
"max_steps": 6185,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1963824696786944.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}