trapoom555's picture
Upload checkpoints
7b65800
raw
history blame
147 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9143553794574825,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001015950421619425,
"grad_norm": 17.625,
"learning_rate": 5e-06,
"loss": 3.4264,
"step": 10
},
{
"epoch": 0.00203190084323885,
"grad_norm": 12.5625,
"learning_rate": 1e-05,
"loss": 3.432,
"step": 20
},
{
"epoch": 0.003047851264858275,
"grad_norm": 14.0625,
"learning_rate": 1.5e-05,
"loss": 3.23,
"step": 30
},
{
"epoch": 0.0040638016864777,
"grad_norm": 12.4375,
"learning_rate": 2e-05,
"loss": 2.9762,
"step": 40
},
{
"epoch": 0.005079752108097125,
"grad_norm": 10.0625,
"learning_rate": 2.5e-05,
"loss": 2.6173,
"step": 50
},
{
"epoch": 0.00609570252971655,
"grad_norm": 10.1875,
"learning_rate": 3e-05,
"loss": 2.2004,
"step": 60
},
{
"epoch": 0.007111652951335975,
"grad_norm": 7.03125,
"learning_rate": 3.5e-05,
"loss": 1.4176,
"step": 70
},
{
"epoch": 0.0081276033729554,
"grad_norm": 4.375,
"learning_rate": 4e-05,
"loss": 1.0122,
"step": 80
},
{
"epoch": 0.009143553794574825,
"grad_norm": 6.5625,
"learning_rate": 4.5e-05,
"loss": 0.9116,
"step": 90
},
{
"epoch": 0.01015950421619425,
"grad_norm": 5.28125,
"learning_rate": 5e-05,
"loss": 0.6832,
"step": 100
},
{
"epoch": 0.011175454637813675,
"grad_norm": 5.5,
"learning_rate": 4.9999870035728426e-05,
"loss": 0.7355,
"step": 110
},
{
"epoch": 0.0121914050594331,
"grad_norm": 5.1875,
"learning_rate": 4.9999480144264944e-05,
"loss": 0.6673,
"step": 120
},
{
"epoch": 0.013207355481052525,
"grad_norm": 4.5,
"learning_rate": 4.9998830329663314e-05,
"loss": 0.6792,
"step": 130
},
{
"epoch": 0.01422330590267195,
"grad_norm": 3.9375,
"learning_rate": 4.9997920598679756e-05,
"loss": 0.6207,
"step": 140
},
{
"epoch": 0.015239256324291375,
"grad_norm": 3.15625,
"learning_rate": 4.999675096077286e-05,
"loss": 0.483,
"step": 150
},
{
"epoch": 0.0162552067459108,
"grad_norm": 5.28125,
"learning_rate": 4.999532142810354e-05,
"loss": 0.5319,
"step": 160
},
{
"epoch": 0.017271157167530225,
"grad_norm": 4.59375,
"learning_rate": 4.999363201553483e-05,
"loss": 0.6052,
"step": 170
},
{
"epoch": 0.01828710758914965,
"grad_norm": 5.03125,
"learning_rate": 4.9991682740631794e-05,
"loss": 0.4258,
"step": 180
},
{
"epoch": 0.019303058010769075,
"grad_norm": 3.859375,
"learning_rate": 4.998947362366133e-05,
"loss": 0.4309,
"step": 190
},
{
"epoch": 0.0203190084323885,
"grad_norm": 3.328125,
"learning_rate": 4.998700468759193e-05,
"loss": 0.3957,
"step": 200
},
{
"epoch": 0.021334958854007924,
"grad_norm": 4.9375,
"learning_rate": 4.9984275958093475e-05,
"loss": 0.4777,
"step": 210
},
{
"epoch": 0.02235090927562735,
"grad_norm": 4.78125,
"learning_rate": 4.998128746353695e-05,
"loss": 0.3549,
"step": 220
},
{
"epoch": 0.023366859697246774,
"grad_norm": 4.0625,
"learning_rate": 4.997803923499417e-05,
"loss": 0.4447,
"step": 230
},
{
"epoch": 0.0243828101188662,
"grad_norm": 6.375,
"learning_rate": 4.99745313062374e-05,
"loss": 0.3808,
"step": 240
},
{
"epoch": 0.025398760540485624,
"grad_norm": 3.59375,
"learning_rate": 4.99707637137391e-05,
"loss": 0.3827,
"step": 250
},
{
"epoch": 0.02641471096210505,
"grad_norm": 3.015625,
"learning_rate": 4.996673649667145e-05,
"loss": 0.3694,
"step": 260
},
{
"epoch": 0.027430661383724474,
"grad_norm": 2.296875,
"learning_rate": 4.9962449696906e-05,
"loss": 0.3586,
"step": 270
},
{
"epoch": 0.0284466118053439,
"grad_norm": 4.125,
"learning_rate": 4.9957903359013214e-05,
"loss": 0.3832,
"step": 280
},
{
"epoch": 0.029462562226963324,
"grad_norm": 3.296875,
"learning_rate": 4.995309753026201e-05,
"loss": 0.328,
"step": 290
},
{
"epoch": 0.03047851264858275,
"grad_norm": 4.5,
"learning_rate": 4.994803226061927e-05,
"loss": 0.3667,
"step": 300
},
{
"epoch": 0.03149446307020217,
"grad_norm": 4.3125,
"learning_rate": 4.994270760274933e-05,
"loss": 0.3811,
"step": 310
},
{
"epoch": 0.0325104134918216,
"grad_norm": 3.421875,
"learning_rate": 4.99371236120134e-05,
"loss": 0.3065,
"step": 320
},
{
"epoch": 0.03352636391344102,
"grad_norm": 4.6875,
"learning_rate": 4.993128034646902e-05,
"loss": 0.4177,
"step": 330
},
{
"epoch": 0.03454231433506045,
"grad_norm": 3.046875,
"learning_rate": 4.992517786686947e-05,
"loss": 0.33,
"step": 340
},
{
"epoch": 0.03555826475667987,
"grad_norm": 1.8828125,
"learning_rate": 4.9918816236663077e-05,
"loss": 0.3287,
"step": 350
},
{
"epoch": 0.0365742151782993,
"grad_norm": 3.8125,
"learning_rate": 4.991219552199262e-05,
"loss": 0.2934,
"step": 360
},
{
"epoch": 0.03759016559991872,
"grad_norm": 4.28125,
"learning_rate": 4.99053157916946e-05,
"loss": 0.3176,
"step": 370
},
{
"epoch": 0.03860611602153815,
"grad_norm": 2.609375,
"learning_rate": 4.989817711729856e-05,
"loss": 0.3318,
"step": 380
},
{
"epoch": 0.03962206644315757,
"grad_norm": 2.375,
"learning_rate": 4.98907795730263e-05,
"loss": 0.3234,
"step": 390
},
{
"epoch": 0.040638016864777,
"grad_norm": 4.46875,
"learning_rate": 4.988312323579114e-05,
"loss": 0.267,
"step": 400
},
{
"epoch": 0.04165396728639642,
"grad_norm": 3.75,
"learning_rate": 4.98752081851971e-05,
"loss": 0.3081,
"step": 410
},
{
"epoch": 0.04266991770801585,
"grad_norm": 2.203125,
"learning_rate": 4.986703450353809e-05,
"loss": 0.2917,
"step": 420
},
{
"epoch": 0.04368586812963527,
"grad_norm": 1.6015625,
"learning_rate": 4.985860227579703e-05,
"loss": 0.2805,
"step": 430
},
{
"epoch": 0.0447018185512547,
"grad_norm": 3.140625,
"learning_rate": 4.984991158964499e-05,
"loss": 0.3534,
"step": 440
},
{
"epoch": 0.04571776897287412,
"grad_norm": 3.296875,
"learning_rate": 4.9840962535440265e-05,
"loss": 0.335,
"step": 450
},
{
"epoch": 0.04673371939449355,
"grad_norm": 3.25,
"learning_rate": 4.983175520622744e-05,
"loss": 0.2544,
"step": 460
},
{
"epoch": 0.04774966981611297,
"grad_norm": 2.25,
"learning_rate": 4.982228969773642e-05,
"loss": 0.3449,
"step": 470
},
{
"epoch": 0.0487656202377324,
"grad_norm": 4.9375,
"learning_rate": 4.9812566108381435e-05,
"loss": 0.2964,
"step": 480
},
{
"epoch": 0.04978157065935182,
"grad_norm": 1.5703125,
"learning_rate": 4.9802584539260035e-05,
"loss": 0.2799,
"step": 490
},
{
"epoch": 0.05079752108097125,
"grad_norm": 2.828125,
"learning_rate": 4.979234509415199e-05,
"loss": 0.3231,
"step": 500
},
{
"epoch": 0.05181347150259067,
"grad_norm": 2.9375,
"learning_rate": 4.978184787951828e-05,
"loss": 0.2943,
"step": 510
},
{
"epoch": 0.0528294219242101,
"grad_norm": 2.34375,
"learning_rate": 4.977109300449992e-05,
"loss": 0.2705,
"step": 520
},
{
"epoch": 0.05384537234582952,
"grad_norm": 3.140625,
"learning_rate": 4.9760080580916876e-05,
"loss": 0.2998,
"step": 530
},
{
"epoch": 0.05486132276744895,
"grad_norm": 3.5625,
"learning_rate": 4.974881072326688e-05,
"loss": 0.2595,
"step": 540
},
{
"epoch": 0.05587727318906837,
"grad_norm": 4.25,
"learning_rate": 4.9737283548724236e-05,
"loss": 0.2803,
"step": 550
},
{
"epoch": 0.0568932236106878,
"grad_norm": 4.0625,
"learning_rate": 4.97254991771386e-05,
"loss": 0.3511,
"step": 560
},
{
"epoch": 0.05790917403230722,
"grad_norm": 2.515625,
"learning_rate": 4.971345773103377e-05,
"loss": 0.312,
"step": 570
},
{
"epoch": 0.05892512445392665,
"grad_norm": 3.21875,
"learning_rate": 4.9701159335606365e-05,
"loss": 0.2482,
"step": 580
},
{
"epoch": 0.05994107487554607,
"grad_norm": 5.5,
"learning_rate": 4.968860411872454e-05,
"loss": 0.2537,
"step": 590
},
{
"epoch": 0.0609570252971655,
"grad_norm": 3.546875,
"learning_rate": 4.967579221092666e-05,
"loss": 0.3125,
"step": 600
},
{
"epoch": 0.06197297571878492,
"grad_norm": 2.984375,
"learning_rate": 4.966272374541996e-05,
"loss": 0.2354,
"step": 610
},
{
"epoch": 0.06298892614040434,
"grad_norm": 3.6875,
"learning_rate": 4.964939885807912e-05,
"loss": 0.3213,
"step": 620
},
{
"epoch": 0.06400487656202378,
"grad_norm": 2.140625,
"learning_rate": 4.9635817687444876e-05,
"loss": 0.3003,
"step": 630
},
{
"epoch": 0.0650208269836432,
"grad_norm": 3.484375,
"learning_rate": 4.962198037472259e-05,
"loss": 0.2996,
"step": 640
},
{
"epoch": 0.06603677740526262,
"grad_norm": 3.21875,
"learning_rate": 4.9607887063780776e-05,
"loss": 0.2257,
"step": 650
},
{
"epoch": 0.06705272782688204,
"grad_norm": 5.375,
"learning_rate": 4.9593537901149564e-05,
"loss": 0.223,
"step": 660
},
{
"epoch": 0.06806867824850148,
"grad_norm": 4.1875,
"learning_rate": 4.957893303601924e-05,
"loss": 0.3407,
"step": 670
},
{
"epoch": 0.0690846286701209,
"grad_norm": 3.328125,
"learning_rate": 4.956407262023866e-05,
"loss": 0.2589,
"step": 680
},
{
"epoch": 0.07010057909174032,
"grad_norm": 2.953125,
"learning_rate": 4.954895680831367e-05,
"loss": 0.2949,
"step": 690
},
{
"epoch": 0.07111652951335974,
"grad_norm": 4.0625,
"learning_rate": 4.9533585757405506e-05,
"loss": 0.2995,
"step": 700
},
{
"epoch": 0.07213247993497918,
"grad_norm": 4.625,
"learning_rate": 4.951795962732917e-05,
"loss": 0.2894,
"step": 710
},
{
"epoch": 0.0731484303565986,
"grad_norm": 3.0,
"learning_rate": 4.9502078580551755e-05,
"loss": 0.3082,
"step": 720
},
{
"epoch": 0.07416438077821802,
"grad_norm": 3.65625,
"learning_rate": 4.9485942782190734e-05,
"loss": 0.2308,
"step": 730
},
{
"epoch": 0.07518033119983744,
"grad_norm": 4.78125,
"learning_rate": 4.9469552400012306e-05,
"loss": 0.2272,
"step": 740
},
{
"epoch": 0.07619628162145688,
"grad_norm": 4.25,
"learning_rate": 4.94529076044296e-05,
"loss": 0.2701,
"step": 750
},
{
"epoch": 0.0772122320430763,
"grad_norm": 3.140625,
"learning_rate": 4.94360085685009e-05,
"loss": 0.2686,
"step": 760
},
{
"epoch": 0.07822818246469572,
"grad_norm": 0.765625,
"learning_rate": 4.9418855467927894e-05,
"loss": 0.2051,
"step": 770
},
{
"epoch": 0.07924413288631514,
"grad_norm": 1.796875,
"learning_rate": 4.940144848105379e-05,
"loss": 0.2267,
"step": 780
},
{
"epoch": 0.08026008330793458,
"grad_norm": 4.5625,
"learning_rate": 4.93837877888615e-05,
"loss": 0.2597,
"step": 790
},
{
"epoch": 0.081276033729554,
"grad_norm": 3.03125,
"learning_rate": 4.9365873574971745e-05,
"loss": 0.3701,
"step": 800
},
{
"epoch": 0.08229198415117342,
"grad_norm": 4.5625,
"learning_rate": 4.9347706025641136e-05,
"loss": 0.2559,
"step": 810
},
{
"epoch": 0.08330793457279284,
"grad_norm": 3.90625,
"learning_rate": 4.9329285329760275e-05,
"loss": 0.2799,
"step": 820
},
{
"epoch": 0.08432388499441228,
"grad_norm": 3.140625,
"learning_rate": 4.9310611678851735e-05,
"loss": 0.2866,
"step": 830
},
{
"epoch": 0.0853398354160317,
"grad_norm": 2.46875,
"learning_rate": 4.929168526706811e-05,
"loss": 0.3105,
"step": 840
},
{
"epoch": 0.08635578583765112,
"grad_norm": 13.625,
"learning_rate": 4.927250629119e-05,
"loss": 0.2454,
"step": 850
},
{
"epoch": 0.08737173625927054,
"grad_norm": 3.921875,
"learning_rate": 4.9253074950623925e-05,
"loss": 0.2424,
"step": 860
},
{
"epoch": 0.08838768668088998,
"grad_norm": 2.90625,
"learning_rate": 4.9233391447400286e-05,
"loss": 0.2481,
"step": 870
},
{
"epoch": 0.0894036371025094,
"grad_norm": 2.96875,
"learning_rate": 4.921345598617125e-05,
"loss": 0.2231,
"step": 880
},
{
"epoch": 0.09041958752412882,
"grad_norm": 5.375,
"learning_rate": 4.9193268774208654e-05,
"loss": 0.3447,
"step": 890
},
{
"epoch": 0.09143553794574824,
"grad_norm": 2.0,
"learning_rate": 4.9172830021401785e-05,
"loss": 0.229,
"step": 900
},
{
"epoch": 0.09245148836736768,
"grad_norm": 3.1875,
"learning_rate": 4.9152139940255245e-05,
"loss": 0.2122,
"step": 910
},
{
"epoch": 0.0934674387889871,
"grad_norm": 3.40625,
"learning_rate": 4.913119874588677e-05,
"loss": 0.2386,
"step": 920
},
{
"epoch": 0.09448338921060652,
"grad_norm": 1.4609375,
"learning_rate": 4.911000665602489e-05,
"loss": 0.1944,
"step": 930
},
{
"epoch": 0.09549933963222594,
"grad_norm": 5.0625,
"learning_rate": 4.9088563891006786e-05,
"loss": 0.2038,
"step": 940
},
{
"epoch": 0.09651529005384538,
"grad_norm": 4.53125,
"learning_rate": 4.906687067377592e-05,
"loss": 0.3122,
"step": 950
},
{
"epoch": 0.0975312404754648,
"grad_norm": 2.84375,
"learning_rate": 4.904492722987976e-05,
"loss": 0.3157,
"step": 960
},
{
"epoch": 0.09854719089708422,
"grad_norm": 2.171875,
"learning_rate": 4.902273378746738e-05,
"loss": 0.3077,
"step": 970
},
{
"epoch": 0.09956314131870364,
"grad_norm": 2.84375,
"learning_rate": 4.9000290577287165e-05,
"loss": 0.2756,
"step": 980
},
{
"epoch": 0.10057909174032308,
"grad_norm": 0.99609375,
"learning_rate": 4.897759783268434e-05,
"loss": 0.2915,
"step": 990
},
{
"epoch": 0.1015950421619425,
"grad_norm": 3.53125,
"learning_rate": 4.895465578959859e-05,
"loss": 0.2052,
"step": 1000
},
{
"epoch": 0.10261099258356192,
"grad_norm": 4.0,
"learning_rate": 4.893146468656159e-05,
"loss": 0.2499,
"step": 1010
},
{
"epoch": 0.10362694300518134,
"grad_norm": 1.65625,
"learning_rate": 4.890802476469452e-05,
"loss": 0.278,
"step": 1020
},
{
"epoch": 0.10464289342680078,
"grad_norm": 3.625,
"learning_rate": 4.888433626770558e-05,
"loss": 0.2143,
"step": 1030
},
{
"epoch": 0.1056588438484202,
"grad_norm": 5.0625,
"learning_rate": 4.886039944188741e-05,
"loss": 0.2878,
"step": 1040
},
{
"epoch": 0.10667479427003962,
"grad_norm": 4.5,
"learning_rate": 4.883621453611461e-05,
"loss": 0.2744,
"step": 1050
},
{
"epoch": 0.10769074469165904,
"grad_norm": 4.5625,
"learning_rate": 4.881178180184106e-05,
"loss": 0.2734,
"step": 1060
},
{
"epoch": 0.10870669511327848,
"grad_norm": 3.125,
"learning_rate": 4.878710149309735e-05,
"loss": 0.3574,
"step": 1070
},
{
"epoch": 0.1097226455348979,
"grad_norm": 3.0625,
"learning_rate": 4.876217386648816e-05,
"loss": 0.2625,
"step": 1080
},
{
"epoch": 0.11073859595651732,
"grad_norm": 4.0625,
"learning_rate": 4.873699918118955e-05,
"loss": 0.2437,
"step": 1090
},
{
"epoch": 0.11175454637813674,
"grad_norm": 1.59375,
"learning_rate": 4.87115776989463e-05,
"loss": 0.2051,
"step": 1100
},
{
"epoch": 0.11277049679975618,
"grad_norm": 4.375,
"learning_rate": 4.8685909684069153e-05,
"loss": 0.1727,
"step": 1110
},
{
"epoch": 0.1137864472213756,
"grad_norm": 2.28125,
"learning_rate": 4.865999540343211e-05,
"loss": 0.2256,
"step": 1120
},
{
"epoch": 0.11480239764299502,
"grad_norm": 2.265625,
"learning_rate": 4.86338351264696e-05,
"loss": 0.3529,
"step": 1130
},
{
"epoch": 0.11581834806461444,
"grad_norm": 2.34375,
"learning_rate": 4.8607429125173754e-05,
"loss": 0.2113,
"step": 1140
},
{
"epoch": 0.11683429848623388,
"grad_norm": 0.7578125,
"learning_rate": 4.858077767409149e-05,
"loss": 0.2759,
"step": 1150
},
{
"epoch": 0.1178502489078533,
"grad_norm": 3.640625,
"learning_rate": 4.855388105032174e-05,
"loss": 0.2482,
"step": 1160
},
{
"epoch": 0.11886619932947272,
"grad_norm": 3.5,
"learning_rate": 4.852673953351249e-05,
"loss": 0.1865,
"step": 1170
},
{
"epoch": 0.11988214975109214,
"grad_norm": 3.75,
"learning_rate": 4.849935340585796e-05,
"loss": 0.2659,
"step": 1180
},
{
"epoch": 0.12089810017271158,
"grad_norm": 3.375,
"learning_rate": 4.8471722952095586e-05,
"loss": 0.1506,
"step": 1190
},
{
"epoch": 0.121914050594331,
"grad_norm": 3.34375,
"learning_rate": 4.844384845950312e-05,
"loss": 0.307,
"step": 1200
},
{
"epoch": 0.12293000101595042,
"grad_norm": 1.578125,
"learning_rate": 4.841573021789561e-05,
"loss": 0.1952,
"step": 1210
},
{
"epoch": 0.12394595143756984,
"grad_norm": 1.2890625,
"learning_rate": 4.838736851962239e-05,
"loss": 0.1779,
"step": 1220
},
{
"epoch": 0.12496190185918928,
"grad_norm": 1.265625,
"learning_rate": 4.835876365956408e-05,
"loss": 0.1235,
"step": 1230
},
{
"epoch": 0.12597785228080868,
"grad_norm": 1.9609375,
"learning_rate": 4.8329915935129436e-05,
"loss": 0.1876,
"step": 1240
},
{
"epoch": 0.12699380270242813,
"grad_norm": 1.6328125,
"learning_rate": 4.830082564625235e-05,
"loss": 0.2188,
"step": 1250
},
{
"epoch": 0.12800975312404755,
"grad_norm": 3.96875,
"learning_rate": 4.8271493095388684e-05,
"loss": 0.2622,
"step": 1260
},
{
"epoch": 0.12902570354566698,
"grad_norm": 3.765625,
"learning_rate": 4.824191858751312e-05,
"loss": 0.2724,
"step": 1270
},
{
"epoch": 0.1300416539672864,
"grad_norm": 5.59375,
"learning_rate": 4.821210243011601e-05,
"loss": 0.2413,
"step": 1280
},
{
"epoch": 0.13105760438890582,
"grad_norm": 3.34375,
"learning_rate": 4.818204493320016e-05,
"loss": 0.2618,
"step": 1290
},
{
"epoch": 0.13207355481052524,
"grad_norm": 2.78125,
"learning_rate": 4.8151746409277634e-05,
"loss": 0.2295,
"step": 1300
},
{
"epoch": 0.13308950523214466,
"grad_norm": 3.1875,
"learning_rate": 4.8121207173366484e-05,
"loss": 0.2733,
"step": 1310
},
{
"epoch": 0.13410545565376408,
"grad_norm": 2.28125,
"learning_rate": 4.809042754298746e-05,
"loss": 0.2311,
"step": 1320
},
{
"epoch": 0.13512140607538353,
"grad_norm": 2.171875,
"learning_rate": 4.805940783816075e-05,
"loss": 0.2059,
"step": 1330
},
{
"epoch": 0.13613735649700295,
"grad_norm": 2.796875,
"learning_rate": 4.8028148381402625e-05,
"loss": 0.2102,
"step": 1340
},
{
"epoch": 0.13715330691862238,
"grad_norm": 2.96875,
"learning_rate": 4.7996649497722084e-05,
"loss": 0.2708,
"step": 1350
},
{
"epoch": 0.1381692573402418,
"grad_norm": 2.4375,
"learning_rate": 4.7964911514617485e-05,
"loss": 0.2429,
"step": 1360
},
{
"epoch": 0.13918520776186122,
"grad_norm": 5.8125,
"learning_rate": 4.793293476207312e-05,
"loss": 0.2725,
"step": 1370
},
{
"epoch": 0.14020115818348064,
"grad_norm": 2.40625,
"learning_rate": 4.790071957255585e-05,
"loss": 0.2098,
"step": 1380
},
{
"epoch": 0.14121710860510006,
"grad_norm": 4.25,
"learning_rate": 4.786826628101154e-05,
"loss": 0.2101,
"step": 1390
},
{
"epoch": 0.14223305902671948,
"grad_norm": 2.578125,
"learning_rate": 4.783557522486167e-05,
"loss": 0.2624,
"step": 1400
},
{
"epoch": 0.14324900944833893,
"grad_norm": 3.125,
"learning_rate": 4.780264674399978e-05,
"loss": 0.2518,
"step": 1410
},
{
"epoch": 0.14426495986995835,
"grad_norm": 3.671875,
"learning_rate": 4.7769481180787966e-05,
"loss": 0.3112,
"step": 1420
},
{
"epoch": 0.14528091029157778,
"grad_norm": 3.984375,
"learning_rate": 4.773607888005327e-05,
"loss": 0.2747,
"step": 1430
},
{
"epoch": 0.1462968607131972,
"grad_norm": 3.234375,
"learning_rate": 4.770244018908416e-05,
"loss": 0.1572,
"step": 1440
},
{
"epoch": 0.14731281113481662,
"grad_norm": 4.09375,
"learning_rate": 4.766856545762687e-05,
"loss": 0.2148,
"step": 1450
},
{
"epoch": 0.14832876155643604,
"grad_norm": 1.6875,
"learning_rate": 4.763445503788178e-05,
"loss": 0.2531,
"step": 1460
},
{
"epoch": 0.14934471197805546,
"grad_norm": 2.375,
"learning_rate": 4.760010928449976e-05,
"loss": 0.199,
"step": 1470
},
{
"epoch": 0.15036066239967488,
"grad_norm": 4.6875,
"learning_rate": 4.7565528554578485e-05,
"loss": 0.2366,
"step": 1480
},
{
"epoch": 0.15137661282129433,
"grad_norm": 5.4375,
"learning_rate": 4.75307132076587e-05,
"loss": 0.1862,
"step": 1490
},
{
"epoch": 0.15239256324291375,
"grad_norm": 2.484375,
"learning_rate": 4.749566360572049e-05,
"loss": 0.2143,
"step": 1500
},
{
"epoch": 0.15340851366453317,
"grad_norm": 2.1875,
"learning_rate": 4.746038011317955e-05,
"loss": 0.1877,
"step": 1510
},
{
"epoch": 0.1544244640861526,
"grad_norm": 2.84375,
"learning_rate": 4.742486309688333e-05,
"loss": 0.2831,
"step": 1520
},
{
"epoch": 0.15544041450777202,
"grad_norm": 2.015625,
"learning_rate": 4.738911292610732e-05,
"loss": 0.1708,
"step": 1530
},
{
"epoch": 0.15645636492939144,
"grad_norm": 3.953125,
"learning_rate": 4.735312997255107e-05,
"loss": 0.192,
"step": 1540
},
{
"epoch": 0.15747231535101086,
"grad_norm": 2.09375,
"learning_rate": 4.7316914610334475e-05,
"loss": 0.2586,
"step": 1550
},
{
"epoch": 0.15848826577263028,
"grad_norm": 3.6875,
"learning_rate": 4.728046721599378e-05,
"loss": 0.2141,
"step": 1560
},
{
"epoch": 0.15950421619424973,
"grad_norm": 2.9375,
"learning_rate": 4.724378816847771e-05,
"loss": 0.193,
"step": 1570
},
{
"epoch": 0.16052016661586915,
"grad_norm": 1.5625,
"learning_rate": 4.720687784914352e-05,
"loss": 0.191,
"step": 1580
},
{
"epoch": 0.16153611703748857,
"grad_norm": 3.75,
"learning_rate": 4.716973664175304e-05,
"loss": 0.2172,
"step": 1590
},
{
"epoch": 0.162552067459108,
"grad_norm": 3.125,
"learning_rate": 4.7132364932468645e-05,
"loss": 0.2134,
"step": 1600
},
{
"epoch": 0.16356801788072742,
"grad_norm": 4.09375,
"learning_rate": 4.709476310984932e-05,
"loss": 0.2055,
"step": 1610
},
{
"epoch": 0.16458396830234684,
"grad_norm": 3.875,
"learning_rate": 4.705693156484652e-05,
"loss": 0.2136,
"step": 1620
},
{
"epoch": 0.16559991872396626,
"grad_norm": 1.1796875,
"learning_rate": 4.7018870690800196e-05,
"loss": 0.1471,
"step": 1630
},
{
"epoch": 0.16661586914558568,
"grad_norm": 2.5,
"learning_rate": 4.698058088343465e-05,
"loss": 0.2308,
"step": 1640
},
{
"epoch": 0.16763181956720513,
"grad_norm": 1.390625,
"learning_rate": 4.6942062540854425e-05,
"loss": 0.2456,
"step": 1650
},
{
"epoch": 0.16864776998882455,
"grad_norm": 3.125,
"learning_rate": 4.69033160635402e-05,
"loss": 0.2654,
"step": 1660
},
{
"epoch": 0.16966372041044397,
"grad_norm": 3.984375,
"learning_rate": 4.6864341854344587e-05,
"loss": 0.2226,
"step": 1670
},
{
"epoch": 0.1706796708320634,
"grad_norm": 2.328125,
"learning_rate": 4.682514031848795e-05,
"loss": 0.2438,
"step": 1680
},
{
"epoch": 0.17169562125368282,
"grad_norm": 3.078125,
"learning_rate": 4.678571186355423e-05,
"loss": 0.1889,
"step": 1690
},
{
"epoch": 0.17271157167530224,
"grad_norm": 3.328125,
"learning_rate": 4.6746056899486644e-05,
"loss": 0.2117,
"step": 1700
},
{
"epoch": 0.17372752209692166,
"grad_norm": 2.78125,
"learning_rate": 4.67061758385835e-05,
"loss": 0.1953,
"step": 1710
},
{
"epoch": 0.17474347251854108,
"grad_norm": 3.09375,
"learning_rate": 4.6666069095493816e-05,
"loss": 0.1844,
"step": 1720
},
{
"epoch": 0.17575942294016053,
"grad_norm": 3.234375,
"learning_rate": 4.662573708721309e-05,
"loss": 0.2774,
"step": 1730
},
{
"epoch": 0.17677537336177995,
"grad_norm": 4.03125,
"learning_rate": 4.658518023307894e-05,
"loss": 0.2527,
"step": 1740
},
{
"epoch": 0.17779132378339937,
"grad_norm": 3.21875,
"learning_rate": 4.654439895476671e-05,
"loss": 0.2164,
"step": 1750
},
{
"epoch": 0.1788072742050188,
"grad_norm": 2.390625,
"learning_rate": 4.6503393676285146e-05,
"loss": 0.2424,
"step": 1760
},
{
"epoch": 0.17982322462663822,
"grad_norm": 1.8359375,
"learning_rate": 4.646216482397192e-05,
"loss": 0.2428,
"step": 1770
},
{
"epoch": 0.18083917504825764,
"grad_norm": 2.796875,
"learning_rate": 4.6420712826489275e-05,
"loss": 0.2155,
"step": 1780
},
{
"epoch": 0.18185512546987706,
"grad_norm": 0.69921875,
"learning_rate": 4.6379038114819485e-05,
"loss": 0.1544,
"step": 1790
},
{
"epoch": 0.18287107589149648,
"grad_norm": 3.40625,
"learning_rate": 4.6337141122260444e-05,
"loss": 0.2029,
"step": 1800
},
{
"epoch": 0.18388702631311593,
"grad_norm": 2.359375,
"learning_rate": 4.629502228442112e-05,
"loss": 0.1489,
"step": 1810
},
{
"epoch": 0.18490297673473535,
"grad_norm": 1.4453125,
"learning_rate": 4.6252682039217045e-05,
"loss": 0.2101,
"step": 1820
},
{
"epoch": 0.18591892715635477,
"grad_norm": 2.71875,
"learning_rate": 4.621012082686573e-05,
"loss": 0.2076,
"step": 1830
},
{
"epoch": 0.1869348775779742,
"grad_norm": 3.0625,
"learning_rate": 4.616733908988216e-05,
"loss": 0.2719,
"step": 1840
},
{
"epoch": 0.18795082799959362,
"grad_norm": 1.953125,
"learning_rate": 4.612433727307409e-05,
"loss": 0.2105,
"step": 1850
},
{
"epoch": 0.18896677842121304,
"grad_norm": 3.46875,
"learning_rate": 4.608111582353751e-05,
"loss": 0.1877,
"step": 1860
},
{
"epoch": 0.18998272884283246,
"grad_norm": 2.546875,
"learning_rate": 4.603767519065197e-05,
"loss": 0.2238,
"step": 1870
},
{
"epoch": 0.19099867926445188,
"grad_norm": 1.5703125,
"learning_rate": 4.599401582607589e-05,
"loss": 0.243,
"step": 1880
},
{
"epoch": 0.19201462968607133,
"grad_norm": 2.4375,
"learning_rate": 4.595013818374185e-05,
"loss": 0.1867,
"step": 1890
},
{
"epoch": 0.19303058010769075,
"grad_norm": 2.203125,
"learning_rate": 4.5906042719851925e-05,
"loss": 0.1994,
"step": 1900
},
{
"epoch": 0.19404653052931017,
"grad_norm": 3.984375,
"learning_rate": 4.586172989287291e-05,
"loss": 0.1899,
"step": 1910
},
{
"epoch": 0.1950624809509296,
"grad_norm": 2.6875,
"learning_rate": 4.5817200163531534e-05,
"loss": 0.2528,
"step": 1920
},
{
"epoch": 0.19607843137254902,
"grad_norm": 2.71875,
"learning_rate": 4.577245399480972e-05,
"loss": 0.2336,
"step": 1930
},
{
"epoch": 0.19709438179416844,
"grad_norm": 2.640625,
"learning_rate": 4.5727491851939715e-05,
"loss": 0.2204,
"step": 1940
},
{
"epoch": 0.19811033221578786,
"grad_norm": 1.78125,
"learning_rate": 4.568231420239929e-05,
"loss": 0.1656,
"step": 1950
},
{
"epoch": 0.19912628263740728,
"grad_norm": 3.15625,
"learning_rate": 4.563692151590687e-05,
"loss": 0.2105,
"step": 1960
},
{
"epoch": 0.20014223305902673,
"grad_norm": 1.3671875,
"learning_rate": 4.5591314264416666e-05,
"loss": 0.1464,
"step": 1970
},
{
"epoch": 0.20115818348064615,
"grad_norm": 4.25,
"learning_rate": 4.554549292211371e-05,
"loss": 0.2103,
"step": 1980
},
{
"epoch": 0.20217413390226557,
"grad_norm": 2.984375,
"learning_rate": 4.549945796540901e-05,
"loss": 0.144,
"step": 1990
},
{
"epoch": 0.203190084323885,
"grad_norm": 1.859375,
"learning_rate": 4.545320987293453e-05,
"loss": 0.1963,
"step": 2000
},
{
"epoch": 0.20420603474550442,
"grad_norm": 1.078125,
"learning_rate": 4.540674912553824e-05,
"loss": 0.2115,
"step": 2010
},
{
"epoch": 0.20522198516712384,
"grad_norm": 4.25,
"learning_rate": 4.536007620627911e-05,
"loss": 0.1682,
"step": 2020
},
{
"epoch": 0.20623793558874326,
"grad_norm": 2.71875,
"learning_rate": 4.531319160042212e-05,
"loss": 0.1992,
"step": 2030
},
{
"epoch": 0.20725388601036268,
"grad_norm": 1.2890625,
"learning_rate": 4.5266095795433126e-05,
"loss": 0.1134,
"step": 2040
},
{
"epoch": 0.20826983643198213,
"grad_norm": 3.296875,
"learning_rate": 4.5218789280973925e-05,
"loss": 0.1474,
"step": 2050
},
{
"epoch": 0.20928578685360155,
"grad_norm": 1.9375,
"learning_rate": 4.5171272548897024e-05,
"loss": 0.1955,
"step": 2060
},
{
"epoch": 0.21030173727522097,
"grad_norm": 2.734375,
"learning_rate": 4.512354609324063e-05,
"loss": 0.2042,
"step": 2070
},
{
"epoch": 0.2113176876968404,
"grad_norm": 2.921875,
"learning_rate": 4.507561041022347e-05,
"loss": 0.2174,
"step": 2080
},
{
"epoch": 0.21233363811845982,
"grad_norm": 2.40625,
"learning_rate": 4.502746599823963e-05,
"loss": 0.2634,
"step": 2090
},
{
"epoch": 0.21334958854007924,
"grad_norm": 1.71875,
"learning_rate": 4.497911335785339e-05,
"loss": 0.1884,
"step": 2100
},
{
"epoch": 0.21436553896169866,
"grad_norm": 0.79296875,
"learning_rate": 4.4930552991794e-05,
"loss": 0.1872,
"step": 2110
},
{
"epoch": 0.21538148938331808,
"grad_norm": 3.171875,
"learning_rate": 4.4881785404950474e-05,
"loss": 0.2233,
"step": 2120
},
{
"epoch": 0.21639743980493753,
"grad_norm": 2.59375,
"learning_rate": 4.483281110436631e-05,
"loss": 0.2374,
"step": 2130
},
{
"epoch": 0.21741339022655695,
"grad_norm": 3.328125,
"learning_rate": 4.478363059923426e-05,
"loss": 0.2545,
"step": 2140
},
{
"epoch": 0.21842934064817637,
"grad_norm": 2.3125,
"learning_rate": 4.4734244400891014e-05,
"loss": 0.2063,
"step": 2150
},
{
"epoch": 0.2194452910697958,
"grad_norm": 3.40625,
"learning_rate": 4.4684653022811865e-05,
"loss": 0.1219,
"step": 2160
},
{
"epoch": 0.22046124149141522,
"grad_norm": 4.1875,
"learning_rate": 4.463485698060541e-05,
"loss": 0.2805,
"step": 2170
},
{
"epoch": 0.22147719191303464,
"grad_norm": 2.3125,
"learning_rate": 4.458485679200814e-05,
"loss": 0.1998,
"step": 2180
},
{
"epoch": 0.22249314233465406,
"grad_norm": 3.578125,
"learning_rate": 4.453465297687912e-05,
"loss": 0.2489,
"step": 2190
},
{
"epoch": 0.22350909275627348,
"grad_norm": 2.59375,
"learning_rate": 4.448424605719452e-05,
"loss": 0.2731,
"step": 2200
},
{
"epoch": 0.22452504317789293,
"grad_norm": 3.28125,
"learning_rate": 4.443363655704224e-05,
"loss": 0.2425,
"step": 2210
},
{
"epoch": 0.22554099359951235,
"grad_norm": 2.78125,
"learning_rate": 4.438282500261641e-05,
"loss": 0.2938,
"step": 2220
},
{
"epoch": 0.22655694402113177,
"grad_norm": 1.1953125,
"learning_rate": 4.433181192221197e-05,
"loss": 0.1728,
"step": 2230
},
{
"epoch": 0.2275728944427512,
"grad_norm": 1.34375,
"learning_rate": 4.4280597846219155e-05,
"loss": 0.216,
"step": 2240
},
{
"epoch": 0.22858884486437062,
"grad_norm": 1.8515625,
"learning_rate": 4.422918330711796e-05,
"loss": 0.1612,
"step": 2250
},
{
"epoch": 0.22960479528599004,
"grad_norm": 1.90625,
"learning_rate": 4.417756883947263e-05,
"loss": 0.107,
"step": 2260
},
{
"epoch": 0.23062074570760946,
"grad_norm": 3.375,
"learning_rate": 4.412575497992611e-05,
"loss": 0.1756,
"step": 2270
},
{
"epoch": 0.23163669612922888,
"grad_norm": 4.375,
"learning_rate": 4.407374226719445e-05,
"loss": 0.234,
"step": 2280
},
{
"epoch": 0.23265264655084833,
"grad_norm": 3.25,
"learning_rate": 4.402153124206119e-05,
"loss": 0.2144,
"step": 2290
},
{
"epoch": 0.23366859697246775,
"grad_norm": 1.703125,
"learning_rate": 4.396912244737173e-05,
"loss": 0.1696,
"step": 2300
},
{
"epoch": 0.23468454739408717,
"grad_norm": 2.84375,
"learning_rate": 4.391651642802778e-05,
"loss": 0.2506,
"step": 2310
},
{
"epoch": 0.2357004978157066,
"grad_norm": 4.5,
"learning_rate": 4.386371373098155e-05,
"loss": 0.1686,
"step": 2320
},
{
"epoch": 0.23671644823732602,
"grad_norm": 2.515625,
"learning_rate": 4.381071490523018e-05,
"loss": 0.2403,
"step": 2330
},
{
"epoch": 0.23773239865894544,
"grad_norm": 4.4375,
"learning_rate": 4.3757520501809955e-05,
"loss": 0.1611,
"step": 2340
},
{
"epoch": 0.23874834908056486,
"grad_norm": 1.609375,
"learning_rate": 4.370413107379065e-05,
"loss": 0.1698,
"step": 2350
},
{
"epoch": 0.23976429950218428,
"grad_norm": 4.96875,
"learning_rate": 4.36505471762697e-05,
"loss": 0.1928,
"step": 2360
},
{
"epoch": 0.24078024992380373,
"grad_norm": 0.8984375,
"learning_rate": 4.3596769366366474e-05,
"loss": 0.2035,
"step": 2370
},
{
"epoch": 0.24179620034542315,
"grad_norm": 5.75,
"learning_rate": 4.354279820321649e-05,
"loss": 0.16,
"step": 2380
},
{
"epoch": 0.24281215076704257,
"grad_norm": 1.9453125,
"learning_rate": 4.34886342479656e-05,
"loss": 0.1851,
"step": 2390
},
{
"epoch": 0.243828101188662,
"grad_norm": 1.015625,
"learning_rate": 4.34342780637641e-05,
"loss": 0.1726,
"step": 2400
},
{
"epoch": 0.24484405161028142,
"grad_norm": 4.59375,
"learning_rate": 4.337973021576095e-05,
"loss": 0.2847,
"step": 2410
},
{
"epoch": 0.24586000203190084,
"grad_norm": 1.03125,
"learning_rate": 4.3324991271097846e-05,
"loss": 0.2528,
"step": 2420
},
{
"epoch": 0.24687595245352026,
"grad_norm": 2.1875,
"learning_rate": 4.3270061798903374e-05,
"loss": 0.1573,
"step": 2430
},
{
"epoch": 0.24789190287513968,
"grad_norm": 0.98046875,
"learning_rate": 4.321494237028701e-05,
"loss": 0.1703,
"step": 2440
},
{
"epoch": 0.24890785329675913,
"grad_norm": 3.8125,
"learning_rate": 4.31596335583333e-05,
"loss": 0.2613,
"step": 2450
},
{
"epoch": 0.24992380371837855,
"grad_norm": 4.0625,
"learning_rate": 4.310413593809579e-05,
"loss": 0.22,
"step": 2460
},
{
"epoch": 0.250939754139998,
"grad_norm": 3.15625,
"learning_rate": 4.304845008659108e-05,
"loss": 0.1263,
"step": 2470
},
{
"epoch": 0.25195570456161737,
"grad_norm": 3.046875,
"learning_rate": 4.2992576582792895e-05,
"loss": 0.1639,
"step": 2480
},
{
"epoch": 0.2529716549832368,
"grad_norm": 9.8125,
"learning_rate": 4.293651600762595e-05,
"loss": 0.2681,
"step": 2490
},
{
"epoch": 0.25398760540485626,
"grad_norm": 3.734375,
"learning_rate": 4.288026894395999e-05,
"loss": 0.2292,
"step": 2500
},
{
"epoch": 0.25500355582647566,
"grad_norm": 0.455078125,
"learning_rate": 4.2823835976603723e-05,
"loss": 0.2324,
"step": 2510
},
{
"epoch": 0.2560195062480951,
"grad_norm": 5.625,
"learning_rate": 4.276721769229869e-05,
"loss": 0.1834,
"step": 2520
},
{
"epoch": 0.2570354566697145,
"grad_norm": 1.3671875,
"learning_rate": 4.271041467971323e-05,
"loss": 0.1826,
"step": 2530
},
{
"epoch": 0.25805140709133395,
"grad_norm": 5.0625,
"learning_rate": 4.265342752943632e-05,
"loss": 0.2463,
"step": 2540
},
{
"epoch": 0.25906735751295334,
"grad_norm": 2.859375,
"learning_rate": 4.2596256833971425e-05,
"loss": 0.2598,
"step": 2550
},
{
"epoch": 0.2600833079345728,
"grad_norm": 1.8515625,
"learning_rate": 4.2538903187730374e-05,
"loss": 0.1148,
"step": 2560
},
{
"epoch": 0.26109925835619224,
"grad_norm": 2.71875,
"learning_rate": 4.248136718702716e-05,
"loss": 0.2123,
"step": 2570
},
{
"epoch": 0.26211520877781164,
"grad_norm": 4.5625,
"learning_rate": 4.242364943007172e-05,
"loss": 0.2369,
"step": 2580
},
{
"epoch": 0.2631311591994311,
"grad_norm": 2.296875,
"learning_rate": 4.236575051696377e-05,
"loss": 0.261,
"step": 2590
},
{
"epoch": 0.2641471096210505,
"grad_norm": 2.75,
"learning_rate": 4.2307671049686514e-05,
"loss": 0.1564,
"step": 2600
},
{
"epoch": 0.26516306004266993,
"grad_norm": 3.5,
"learning_rate": 4.2249411632100396e-05,
"loss": 0.1563,
"step": 2610
},
{
"epoch": 0.2661790104642893,
"grad_norm": 2.84375,
"learning_rate": 4.219097286993684e-05,
"loss": 0.1697,
"step": 2620
},
{
"epoch": 0.26719496088590877,
"grad_norm": 2.125,
"learning_rate": 4.2132355370791946e-05,
"loss": 0.1844,
"step": 2630
},
{
"epoch": 0.26821091130752817,
"grad_norm": 4.03125,
"learning_rate": 4.2073559744120156e-05,
"loss": 0.2144,
"step": 2640
},
{
"epoch": 0.2692268617291476,
"grad_norm": 2.375,
"learning_rate": 4.201458660122793e-05,
"loss": 0.2013,
"step": 2650
},
{
"epoch": 0.27024281215076706,
"grad_norm": 3.625,
"learning_rate": 4.1955436555267393e-05,
"loss": 0.2166,
"step": 2660
},
{
"epoch": 0.27125876257238646,
"grad_norm": 0.328125,
"learning_rate": 4.189611022122997e-05,
"loss": 0.1934,
"step": 2670
},
{
"epoch": 0.2722747129940059,
"grad_norm": 2.75,
"learning_rate": 4.1836608215939944e-05,
"loss": 0.2157,
"step": 2680
},
{
"epoch": 0.2732906634156253,
"grad_norm": 3.5,
"learning_rate": 4.17769311580481e-05,
"loss": 0.18,
"step": 2690
},
{
"epoch": 0.27430661383724475,
"grad_norm": 2.109375,
"learning_rate": 4.171707966802528e-05,
"loss": 0.2178,
"step": 2700
},
{
"epoch": 0.27532256425886414,
"grad_norm": 4.65625,
"learning_rate": 4.16570543681559e-05,
"loss": 0.1896,
"step": 2710
},
{
"epoch": 0.2763385146804836,
"grad_norm": 4.8125,
"learning_rate": 4.159685588253151e-05,
"loss": 0.1322,
"step": 2720
},
{
"epoch": 0.27735446510210304,
"grad_norm": 3.9375,
"learning_rate": 4.153648483704429e-05,
"loss": 0.184,
"step": 2730
},
{
"epoch": 0.27837041552372244,
"grad_norm": 4.53125,
"learning_rate": 4.147594185938057e-05,
"loss": 0.2451,
"step": 2740
},
{
"epoch": 0.2793863659453419,
"grad_norm": 1.0390625,
"learning_rate": 4.141522757901426e-05,
"loss": 0.2367,
"step": 2750
},
{
"epoch": 0.2804023163669613,
"grad_norm": 3.375,
"learning_rate": 4.1354342627200345e-05,
"loss": 0.179,
"step": 2760
},
{
"epoch": 0.28141826678858073,
"grad_norm": 2.953125,
"learning_rate": 4.1293287636968286e-05,
"loss": 0.1396,
"step": 2770
},
{
"epoch": 0.2824342172102001,
"grad_norm": 2.546875,
"learning_rate": 4.1232063243115485e-05,
"loss": 0.1963,
"step": 2780
},
{
"epoch": 0.28345016763181957,
"grad_norm": 5.09375,
"learning_rate": 4.117067008220063e-05,
"loss": 0.2457,
"step": 2790
},
{
"epoch": 0.28446611805343897,
"grad_norm": 2.046875,
"learning_rate": 4.110910879253712e-05,
"loss": 0.2262,
"step": 2800
},
{
"epoch": 0.2854820684750584,
"grad_norm": 2.1875,
"learning_rate": 4.104738001418641e-05,
"loss": 0.2499,
"step": 2810
},
{
"epoch": 0.28649801889667786,
"grad_norm": 2.59375,
"learning_rate": 4.098548438895135e-05,
"loss": 0.1667,
"step": 2820
},
{
"epoch": 0.28751396931829726,
"grad_norm": 2.875,
"learning_rate": 4.092342256036954e-05,
"loss": 0.2288,
"step": 2830
},
{
"epoch": 0.2885299197399167,
"grad_norm": 3.015625,
"learning_rate": 4.086119517370659e-05,
"loss": 0.2038,
"step": 2840
},
{
"epoch": 0.2895458701615361,
"grad_norm": 3.53125,
"learning_rate": 4.0798802875949485e-05,
"loss": 0.181,
"step": 2850
},
{
"epoch": 0.29056182058315555,
"grad_norm": 2.296875,
"learning_rate": 4.073624631579975e-05,
"loss": 0.1886,
"step": 2860
},
{
"epoch": 0.29157777100477494,
"grad_norm": 3.609375,
"learning_rate": 4.067352614366685e-05,
"loss": 0.2053,
"step": 2870
},
{
"epoch": 0.2925937214263944,
"grad_norm": 2.328125,
"learning_rate": 4.061064301166128e-05,
"loss": 0.1409,
"step": 2880
},
{
"epoch": 0.29360967184801384,
"grad_norm": 4.9375,
"learning_rate": 4.054759757358787e-05,
"loss": 0.184,
"step": 2890
},
{
"epoch": 0.29462562226963324,
"grad_norm": 4.6875,
"learning_rate": 4.048439048493898e-05,
"loss": 0.2306,
"step": 2900
},
{
"epoch": 0.2956415726912527,
"grad_norm": 4.09375,
"learning_rate": 4.0421022402887676e-05,
"loss": 0.1914,
"step": 2910
},
{
"epoch": 0.2966575231128721,
"grad_norm": 2.3125,
"learning_rate": 4.035749398628088e-05,
"loss": 0.1653,
"step": 2920
},
{
"epoch": 0.29767347353449153,
"grad_norm": 2.515625,
"learning_rate": 4.029380589563256e-05,
"loss": 0.1941,
"step": 2930
},
{
"epoch": 0.2986894239561109,
"grad_norm": 1.78125,
"learning_rate": 4.02299587931168e-05,
"loss": 0.1117,
"step": 2940
},
{
"epoch": 0.29970537437773037,
"grad_norm": 0.8359375,
"learning_rate": 4.0165953342560974e-05,
"loss": 0.1605,
"step": 2950
},
{
"epoch": 0.30072132479934977,
"grad_norm": 3.046875,
"learning_rate": 4.010179020943884e-05,
"loss": 0.1726,
"step": 2960
},
{
"epoch": 0.3017372752209692,
"grad_norm": 3.453125,
"learning_rate": 4.003747006086357e-05,
"loss": 0.2208,
"step": 2970
},
{
"epoch": 0.30275322564258866,
"grad_norm": 2.515625,
"learning_rate": 3.9972993565580866e-05,
"loss": 0.1325,
"step": 2980
},
{
"epoch": 0.30376917606420806,
"grad_norm": 3.046875,
"learning_rate": 3.9908361393962e-05,
"loss": 0.2014,
"step": 2990
},
{
"epoch": 0.3047851264858275,
"grad_norm": 2.28125,
"learning_rate": 3.984357421799681e-05,
"loss": 0.165,
"step": 3000
},
{
"epoch": 0.3058010769074469,
"grad_norm": 5.09375,
"learning_rate": 3.9778632711286756e-05,
"loss": 0.212,
"step": 3010
},
{
"epoch": 0.30681702732906635,
"grad_norm": 4.25,
"learning_rate": 3.971353754903788e-05,
"loss": 0.2388,
"step": 3020
},
{
"epoch": 0.30783297775068574,
"grad_norm": 2.34375,
"learning_rate": 3.964828940805381e-05,
"loss": 0.2175,
"step": 3030
},
{
"epoch": 0.3088489281723052,
"grad_norm": 4.09375,
"learning_rate": 3.95828889667287e-05,
"loss": 0.2088,
"step": 3040
},
{
"epoch": 0.30986487859392464,
"grad_norm": 2.359375,
"learning_rate": 3.9517336905040244e-05,
"loss": 0.1913,
"step": 3050
},
{
"epoch": 0.31088082901554404,
"grad_norm": 1.1640625,
"learning_rate": 3.9451633904542483e-05,
"loss": 0.2185,
"step": 3060
},
{
"epoch": 0.3118967794371635,
"grad_norm": 2.59375,
"learning_rate": 3.9385780648358846e-05,
"loss": 0.2072,
"step": 3070
},
{
"epoch": 0.3129127298587829,
"grad_norm": 3.015625,
"learning_rate": 3.9319777821174955e-05,
"loss": 0.1902,
"step": 3080
},
{
"epoch": 0.31392868028040233,
"grad_norm": 2.375,
"learning_rate": 3.925362610923158e-05,
"loss": 0.259,
"step": 3090
},
{
"epoch": 0.3149446307020217,
"grad_norm": 4.65625,
"learning_rate": 3.918732620031742e-05,
"loss": 0.2026,
"step": 3100
},
{
"epoch": 0.31596058112364117,
"grad_norm": 2.1875,
"learning_rate": 3.912087878376205e-05,
"loss": 0.1478,
"step": 3110
},
{
"epoch": 0.31697653154526056,
"grad_norm": 2.34375,
"learning_rate": 3.905428455042865e-05,
"loss": 0.167,
"step": 3120
},
{
"epoch": 0.31799248196688,
"grad_norm": 2.390625,
"learning_rate": 3.898754419270693e-05,
"loss": 0.1629,
"step": 3130
},
{
"epoch": 0.31900843238849946,
"grad_norm": 1.546875,
"learning_rate": 3.892065840450583e-05,
"loss": 0.1308,
"step": 3140
},
{
"epoch": 0.32002438281011886,
"grad_norm": 4.625,
"learning_rate": 3.885362788124637e-05,
"loss": 0.2008,
"step": 3150
},
{
"epoch": 0.3210403332317383,
"grad_norm": 3.8125,
"learning_rate": 3.8786453319854396e-05,
"loss": 0.2225,
"step": 3160
},
{
"epoch": 0.3220562836533577,
"grad_norm": 3.015625,
"learning_rate": 3.8719135418753366e-05,
"loss": 0.2243,
"step": 3170
},
{
"epoch": 0.32307223407497715,
"grad_norm": 5.6875,
"learning_rate": 3.865167487785702e-05,
"loss": 0.1981,
"step": 3180
},
{
"epoch": 0.32408818449659654,
"grad_norm": 4.84375,
"learning_rate": 3.8584072398562164e-05,
"loss": 0.2031,
"step": 3190
},
{
"epoch": 0.325104134918216,
"grad_norm": 4.0625,
"learning_rate": 3.851632868374136e-05,
"loss": 0.1621,
"step": 3200
},
{
"epoch": 0.32612008533983544,
"grad_norm": 3.421875,
"learning_rate": 3.844844443773562e-05,
"loss": 0.1674,
"step": 3210
},
{
"epoch": 0.32713603576145484,
"grad_norm": 1.3671875,
"learning_rate": 3.8380420366347046e-05,
"loss": 0.1502,
"step": 3220
},
{
"epoch": 0.3281519861830743,
"grad_norm": 3.734375,
"learning_rate": 3.831225717683157e-05,
"loss": 0.1868,
"step": 3230
},
{
"epoch": 0.3291679366046937,
"grad_norm": 2.703125,
"learning_rate": 3.8243955577891534e-05,
"loss": 0.1818,
"step": 3240
},
{
"epoch": 0.3301838870263131,
"grad_norm": 3.796875,
"learning_rate": 3.8175516279668335e-05,
"loss": 0.2215,
"step": 3250
},
{
"epoch": 0.3311998374479325,
"grad_norm": 3.203125,
"learning_rate": 3.810693999373505e-05,
"loss": 0.2544,
"step": 3260
},
{
"epoch": 0.33221578786955197,
"grad_norm": 4.0,
"learning_rate": 3.8038227433089056e-05,
"loss": 0.1175,
"step": 3270
},
{
"epoch": 0.33323173829117136,
"grad_norm": 3.625,
"learning_rate": 3.796937931214458e-05,
"loss": 0.2213,
"step": 3280
},
{
"epoch": 0.3342476887127908,
"grad_norm": 1.7265625,
"learning_rate": 3.7900396346725296e-05,
"loss": 0.1711,
"step": 3290
},
{
"epoch": 0.33526363913441026,
"grad_norm": 3.140625,
"learning_rate": 3.783127925405686e-05,
"loss": 0.2628,
"step": 3300
},
{
"epoch": 0.33627958955602966,
"grad_norm": 2.1875,
"learning_rate": 3.77620287527595e-05,
"loss": 0.1671,
"step": 3310
},
{
"epoch": 0.3372955399776491,
"grad_norm": 5.28125,
"learning_rate": 3.769264556284048e-05,
"loss": 0.2109,
"step": 3320
},
{
"epoch": 0.3383114903992685,
"grad_norm": 2.875,
"learning_rate": 3.762313040568665e-05,
"loss": 0.1978,
"step": 3330
},
{
"epoch": 0.33932744082088795,
"grad_norm": 2.234375,
"learning_rate": 3.755348400405697e-05,
"loss": 0.1275,
"step": 3340
},
{
"epoch": 0.34034339124250734,
"grad_norm": 1.9453125,
"learning_rate": 3.7483707082074945e-05,
"loss": 0.1482,
"step": 3350
},
{
"epoch": 0.3413593416641268,
"grad_norm": 5.40625,
"learning_rate": 3.741380036522111e-05,
"loss": 0.1933,
"step": 3360
},
{
"epoch": 0.34237529208574624,
"grad_norm": 4.53125,
"learning_rate": 3.734376458032551e-05,
"loss": 0.1925,
"step": 3370
},
{
"epoch": 0.34339124250736563,
"grad_norm": 4.0625,
"learning_rate": 3.727360045556014e-05,
"loss": 0.2297,
"step": 3380
},
{
"epoch": 0.3444071929289851,
"grad_norm": 2.53125,
"learning_rate": 3.7203308720431336e-05,
"loss": 0.1704,
"step": 3390
},
{
"epoch": 0.3454231433506045,
"grad_norm": 1.859375,
"learning_rate": 3.7132890105772234e-05,
"loss": 0.258,
"step": 3400
},
{
"epoch": 0.3464390937722239,
"grad_norm": 3.90625,
"learning_rate": 3.706234534373515e-05,
"loss": 0.2376,
"step": 3410
},
{
"epoch": 0.3474550441938433,
"grad_norm": 1.1015625,
"learning_rate": 3.6991675167783985e-05,
"loss": 0.2403,
"step": 3420
},
{
"epoch": 0.34847099461546277,
"grad_norm": 1.1640625,
"learning_rate": 3.6920880312686556e-05,
"loss": 0.1642,
"step": 3430
},
{
"epoch": 0.34948694503708216,
"grad_norm": 2.875,
"learning_rate": 3.684996151450702e-05,
"loss": 0.1455,
"step": 3440
},
{
"epoch": 0.3505028954587016,
"grad_norm": 0.59765625,
"learning_rate": 3.6778919510598155e-05,
"loss": 0.2175,
"step": 3450
},
{
"epoch": 0.35151884588032106,
"grad_norm": 0.93359375,
"learning_rate": 3.670775503959376e-05,
"loss": 0.1858,
"step": 3460
},
{
"epoch": 0.35253479630194046,
"grad_norm": 4.1875,
"learning_rate": 3.6636468841400917e-05,
"loss": 0.1911,
"step": 3470
},
{
"epoch": 0.3535507467235599,
"grad_norm": 3.734375,
"learning_rate": 3.656506165719233e-05,
"loss": 0.2114,
"step": 3480
},
{
"epoch": 0.3545666971451793,
"grad_norm": 1.171875,
"learning_rate": 3.649353422939863e-05,
"loss": 0.1841,
"step": 3490
},
{
"epoch": 0.35558264756679875,
"grad_norm": 2.53125,
"learning_rate": 3.6421887301700615e-05,
"loss": 0.1505,
"step": 3500
},
{
"epoch": 0.35659859798841814,
"grad_norm": 4.9375,
"learning_rate": 3.6350121619021524e-05,
"loss": 0.2625,
"step": 3510
},
{
"epoch": 0.3576145484100376,
"grad_norm": 5.25,
"learning_rate": 3.627823792751936e-05,
"loss": 0.1676,
"step": 3520
},
{
"epoch": 0.35863049883165704,
"grad_norm": 1.09375,
"learning_rate": 3.620623697457905e-05,
"loss": 0.1963,
"step": 3530
},
{
"epoch": 0.35964644925327643,
"grad_norm": 4.03125,
"learning_rate": 3.613411950880468e-05,
"loss": 0.2048,
"step": 3540
},
{
"epoch": 0.3606623996748959,
"grad_norm": 4.40625,
"learning_rate": 3.606188628001178e-05,
"loss": 0.226,
"step": 3550
},
{
"epoch": 0.3616783500965153,
"grad_norm": 2.375,
"learning_rate": 3.598953803921947e-05,
"loss": 0.1884,
"step": 3560
},
{
"epoch": 0.3626943005181347,
"grad_norm": 3.21875,
"learning_rate": 3.591707553864266e-05,
"loss": 0.224,
"step": 3570
},
{
"epoch": 0.3637102509397541,
"grad_norm": 3.5625,
"learning_rate": 3.584449953168423e-05,
"loss": 0.1866,
"step": 3580
},
{
"epoch": 0.36472620136137357,
"grad_norm": 2.359375,
"learning_rate": 3.577181077292722e-05,
"loss": 0.1663,
"step": 3590
},
{
"epoch": 0.36574215178299296,
"grad_norm": 5.0,
"learning_rate": 3.569901001812696e-05,
"loss": 0.2032,
"step": 3600
},
{
"epoch": 0.3667581022046124,
"grad_norm": 1.953125,
"learning_rate": 3.562609802420321e-05,
"loss": 0.2395,
"step": 3610
},
{
"epoch": 0.36777405262623186,
"grad_norm": 3.796875,
"learning_rate": 3.555307554923229e-05,
"loss": 0.1799,
"step": 3620
},
{
"epoch": 0.36879000304785126,
"grad_norm": 4.4375,
"learning_rate": 3.547994335243925e-05,
"loss": 0.1771,
"step": 3630
},
{
"epoch": 0.3698059534694707,
"grad_norm": 1.890625,
"learning_rate": 3.540670219418989e-05,
"loss": 0.2123,
"step": 3640
},
{
"epoch": 0.3708219038910901,
"grad_norm": 4.03125,
"learning_rate": 3.53333528359829e-05,
"loss": 0.2159,
"step": 3650
},
{
"epoch": 0.37183785431270955,
"grad_norm": 3.265625,
"learning_rate": 3.525989604044198e-05,
"loss": 0.2749,
"step": 3660
},
{
"epoch": 0.37285380473432894,
"grad_norm": 1.4375,
"learning_rate": 3.5186332571307826e-05,
"loss": 0.1613,
"step": 3670
},
{
"epoch": 0.3738697551559484,
"grad_norm": 3.984375,
"learning_rate": 3.511266319343025e-05,
"loss": 0.1877,
"step": 3680
},
{
"epoch": 0.37488570557756784,
"grad_norm": 2.203125,
"learning_rate": 3.503888867276022e-05,
"loss": 0.2185,
"step": 3690
},
{
"epoch": 0.37590165599918723,
"grad_norm": 1.5078125,
"learning_rate": 3.4965009776341894e-05,
"loss": 0.2195,
"step": 3700
},
{
"epoch": 0.3769176064208067,
"grad_norm": 4.375,
"learning_rate": 3.489102727230461e-05,
"loss": 0.2344,
"step": 3710
},
{
"epoch": 0.3779335568424261,
"grad_norm": 2.984375,
"learning_rate": 3.481694192985496e-05,
"loss": 0.1863,
"step": 3720
},
{
"epoch": 0.3789495072640455,
"grad_norm": 1.1328125,
"learning_rate": 3.474275451926875e-05,
"loss": 0.1894,
"step": 3730
},
{
"epoch": 0.3799654576856649,
"grad_norm": 2.265625,
"learning_rate": 3.4668465811883e-05,
"loss": 0.2127,
"step": 3740
},
{
"epoch": 0.38098140810728437,
"grad_norm": 2.921875,
"learning_rate": 3.4594076580087914e-05,
"loss": 0.2125,
"step": 3750
},
{
"epoch": 0.38199735852890376,
"grad_norm": 2.390625,
"learning_rate": 3.451958759731889e-05,
"loss": 0.1801,
"step": 3760
},
{
"epoch": 0.3830133089505232,
"grad_norm": 3.046875,
"learning_rate": 3.4444999638048456e-05,
"loss": 0.1949,
"step": 3770
},
{
"epoch": 0.38402925937214266,
"grad_norm": 2.890625,
"learning_rate": 3.437031347777817e-05,
"loss": 0.2719,
"step": 3780
},
{
"epoch": 0.38504520979376206,
"grad_norm": 3.9375,
"learning_rate": 3.4295529893030634e-05,
"loss": 0.1697,
"step": 3790
},
{
"epoch": 0.3860611602153815,
"grad_norm": 2.0625,
"learning_rate": 3.422064966134138e-05,
"loss": 0.1557,
"step": 3800
},
{
"epoch": 0.3870771106370009,
"grad_norm": 2.234375,
"learning_rate": 3.4145673561250794e-05,
"loss": 0.2129,
"step": 3810
},
{
"epoch": 0.38809306105862035,
"grad_norm": 4.96875,
"learning_rate": 3.4070602372296e-05,
"loss": 0.2068,
"step": 3820
},
{
"epoch": 0.38910901148023974,
"grad_norm": 2.234375,
"learning_rate": 3.39954368750028e-05,
"loss": 0.1634,
"step": 3830
},
{
"epoch": 0.3901249619018592,
"grad_norm": 1.75,
"learning_rate": 3.392017785087752e-05,
"loss": 0.2299,
"step": 3840
},
{
"epoch": 0.39114091232347864,
"grad_norm": 3.90625,
"learning_rate": 3.38448260823989e-05,
"loss": 0.1585,
"step": 3850
},
{
"epoch": 0.39215686274509803,
"grad_norm": 2.8125,
"learning_rate": 3.376938235300996e-05,
"loss": 0.2382,
"step": 3860
},
{
"epoch": 0.3931728131667175,
"grad_norm": 5.375,
"learning_rate": 3.369384744710984e-05,
"loss": 0.1987,
"step": 3870
},
{
"epoch": 0.3941887635883369,
"grad_norm": 2.578125,
"learning_rate": 3.361822215004566e-05,
"loss": 0.2316,
"step": 3880
},
{
"epoch": 0.3952047140099563,
"grad_norm": 2.0,
"learning_rate": 3.354250724810436e-05,
"loss": 0.2019,
"step": 3890
},
{
"epoch": 0.3962206644315757,
"grad_norm": 2.3125,
"learning_rate": 3.34667035285045e-05,
"loss": 0.187,
"step": 3900
},
{
"epoch": 0.39723661485319517,
"grad_norm": 3.53125,
"learning_rate": 3.339081177938811e-05,
"loss": 0.2353,
"step": 3910
},
{
"epoch": 0.39825256527481456,
"grad_norm": 1.9609375,
"learning_rate": 3.331483278981244e-05,
"loss": 0.2078,
"step": 3920
},
{
"epoch": 0.399268515696434,
"grad_norm": 1.2109375,
"learning_rate": 3.323876734974183e-05,
"loss": 0.1761,
"step": 3930
},
{
"epoch": 0.40028446611805346,
"grad_norm": 4.0625,
"learning_rate": 3.316261625003943e-05,
"loss": 0.2081,
"step": 3940
},
{
"epoch": 0.40130041653967286,
"grad_norm": 1.953125,
"learning_rate": 3.308638028245902e-05,
"loss": 0.2087,
"step": 3950
},
{
"epoch": 0.4023163669612923,
"grad_norm": 2.390625,
"learning_rate": 3.301006023963676e-05,
"loss": 0.1579,
"step": 3960
},
{
"epoch": 0.4033323173829117,
"grad_norm": 3.53125,
"learning_rate": 3.293365691508295e-05,
"loss": 0.1904,
"step": 3970
},
{
"epoch": 0.40434826780453115,
"grad_norm": 3.0,
"learning_rate": 3.285717110317379e-05,
"loss": 0.1991,
"step": 3980
},
{
"epoch": 0.40536421822615054,
"grad_norm": 7.21875,
"learning_rate": 3.27806035991431e-05,
"loss": 0.1445,
"step": 3990
},
{
"epoch": 0.40638016864777,
"grad_norm": 1.0859375,
"learning_rate": 3.2703955199074075e-05,
"loss": 0.2393,
"step": 4000
},
{
"epoch": 0.40739611906938944,
"grad_norm": 4.5625,
"learning_rate": 3.262722669989098e-05,
"loss": 0.1789,
"step": 4010
},
{
"epoch": 0.40841206949100883,
"grad_norm": 3.09375,
"learning_rate": 3.255041889935092e-05,
"loss": 0.1511,
"step": 4020
},
{
"epoch": 0.4094280199126283,
"grad_norm": 1.90625,
"learning_rate": 3.247353259603547e-05,
"loss": 0.2066,
"step": 4030
},
{
"epoch": 0.4104439703342477,
"grad_norm": 2.28125,
"learning_rate": 3.239656858934242e-05,
"loss": 0.1564,
"step": 4040
},
{
"epoch": 0.4114599207558671,
"grad_norm": 2.609375,
"learning_rate": 3.231952767947746e-05,
"loss": 0.1503,
"step": 4050
},
{
"epoch": 0.4124758711774865,
"grad_norm": 1.4453125,
"learning_rate": 3.2242410667445844e-05,
"loss": 0.1633,
"step": 4060
},
{
"epoch": 0.41349182159910597,
"grad_norm": 3.015625,
"learning_rate": 3.2165218355044076e-05,
"loss": 0.1492,
"step": 4070
},
{
"epoch": 0.41450777202072536,
"grad_norm": 3.234375,
"learning_rate": 3.2087951544851566e-05,
"loss": 0.3051,
"step": 4080
},
{
"epoch": 0.4155237224423448,
"grad_norm": 2.9375,
"learning_rate": 3.20106110402223e-05,
"loss": 0.2229,
"step": 4090
},
{
"epoch": 0.41653967286396426,
"grad_norm": 3.171875,
"learning_rate": 3.1933197645276455e-05,
"loss": 0.2224,
"step": 4100
},
{
"epoch": 0.41755562328558365,
"grad_norm": 2.09375,
"learning_rate": 3.185571216489209e-05,
"loss": 0.1297,
"step": 4110
},
{
"epoch": 0.4185715737072031,
"grad_norm": 3.625,
"learning_rate": 3.177815540469669e-05,
"loss": 0.2074,
"step": 4120
},
{
"epoch": 0.4195875241288225,
"grad_norm": 2.296875,
"learning_rate": 3.1700528171058916e-05,
"loss": 0.1949,
"step": 4130
},
{
"epoch": 0.42060347455044195,
"grad_norm": 3.8125,
"learning_rate": 3.162283127108011e-05,
"loss": 0.1661,
"step": 4140
},
{
"epoch": 0.42161942497206134,
"grad_norm": 2.5,
"learning_rate": 3.154506551258594e-05,
"loss": 0.2275,
"step": 4150
},
{
"epoch": 0.4226353753936808,
"grad_norm": 2.96875,
"learning_rate": 3.146723170411804e-05,
"loss": 0.2242,
"step": 4160
},
{
"epoch": 0.42365132581530024,
"grad_norm": 6.625,
"learning_rate": 3.138933065492552e-05,
"loss": 0.1897,
"step": 4170
},
{
"epoch": 0.42466727623691963,
"grad_norm": 0.8515625,
"learning_rate": 3.131136317495665e-05,
"loss": 0.1629,
"step": 4180
},
{
"epoch": 0.4256832266585391,
"grad_norm": 0.94140625,
"learning_rate": 3.1233330074850364e-05,
"loss": 0.1535,
"step": 4190
},
{
"epoch": 0.4266991770801585,
"grad_norm": 2.6875,
"learning_rate": 3.115523216592786e-05,
"loss": 0.2494,
"step": 4200
},
{
"epoch": 0.4277151275017779,
"grad_norm": 2.578125,
"learning_rate": 3.107707026018417e-05,
"loss": 0.1705,
"step": 4210
},
{
"epoch": 0.4287310779233973,
"grad_norm": 3.0625,
"learning_rate": 3.09988451702797e-05,
"loss": 0.1507,
"step": 4220
},
{
"epoch": 0.42974702834501677,
"grad_norm": 2.421875,
"learning_rate": 3.0920557709531804e-05,
"loss": 0.3071,
"step": 4230
},
{
"epoch": 0.43076297876663616,
"grad_norm": 3.640625,
"learning_rate": 3.0842208691906306e-05,
"loss": 0.199,
"step": 4240
},
{
"epoch": 0.4317789291882556,
"grad_norm": 3.5625,
"learning_rate": 3.076379893200904e-05,
"loss": 0.1987,
"step": 4250
},
{
"epoch": 0.43279487960987506,
"grad_norm": 3.65625,
"learning_rate": 3.068532924507739e-05,
"loss": 0.1945,
"step": 4260
},
{
"epoch": 0.43381083003149445,
"grad_norm": 5.875,
"learning_rate": 3.060680044697183e-05,
"loss": 0.1937,
"step": 4270
},
{
"epoch": 0.4348267804531139,
"grad_norm": 2.859375,
"learning_rate": 3.052821335416739e-05,
"loss": 0.1643,
"step": 4280
},
{
"epoch": 0.4358427308747333,
"grad_norm": 3.296875,
"learning_rate": 3.0449568783745203e-05,
"loss": 0.1455,
"step": 4290
},
{
"epoch": 0.43685868129635275,
"grad_norm": 0.427734375,
"learning_rate": 3.0370867553384023e-05,
"loss": 0.1891,
"step": 4300
},
{
"epoch": 0.43787463171797214,
"grad_norm": 0.361328125,
"learning_rate": 3.029211048135171e-05,
"loss": 0.1377,
"step": 4310
},
{
"epoch": 0.4388905821395916,
"grad_norm": 1.8203125,
"learning_rate": 3.021329838649668e-05,
"loss": 0.2194,
"step": 4320
},
{
"epoch": 0.43990653256121104,
"grad_norm": 1.8828125,
"learning_rate": 3.0134432088239462e-05,
"loss": 0.1915,
"step": 4330
},
{
"epoch": 0.44092248298283043,
"grad_norm": 2.015625,
"learning_rate": 3.0055512406564146e-05,
"loss": 0.1794,
"step": 4340
},
{
"epoch": 0.4419384334044499,
"grad_norm": 2.546875,
"learning_rate": 2.9976540162009836e-05,
"loss": 0.2154,
"step": 4350
},
{
"epoch": 0.4429543838260693,
"grad_norm": 4.09375,
"learning_rate": 2.9897516175662155e-05,
"loss": 0.1861,
"step": 4360
},
{
"epoch": 0.4439703342476887,
"grad_norm": 3.953125,
"learning_rate": 2.9818441269144693e-05,
"loss": 0.1857,
"step": 4370
},
{
"epoch": 0.4449862846693081,
"grad_norm": 2.234375,
"learning_rate": 2.9739316264610452e-05,
"loss": 0.1493,
"step": 4380
},
{
"epoch": 0.44600223509092757,
"grad_norm": 1.109375,
"learning_rate": 2.966014198473332e-05,
"loss": 0.186,
"step": 4390
},
{
"epoch": 0.44701818551254696,
"grad_norm": 4.5625,
"learning_rate": 2.9580919252699502e-05,
"loss": 0.1963,
"step": 4400
},
{
"epoch": 0.4480341359341664,
"grad_norm": 7.3125,
"learning_rate": 2.9501648892198984e-05,
"loss": 0.2882,
"step": 4410
},
{
"epoch": 0.44905008635578586,
"grad_norm": 3.03125,
"learning_rate": 2.942233172741693e-05,
"loss": 0.2154,
"step": 4420
},
{
"epoch": 0.45006603677740525,
"grad_norm": 2.421875,
"learning_rate": 2.934296858302515e-05,
"loss": 0.2228,
"step": 4430
},
{
"epoch": 0.4510819871990247,
"grad_norm": 1.6015625,
"learning_rate": 2.9263560284173485e-05,
"loss": 0.1637,
"step": 4440
},
{
"epoch": 0.4520979376206441,
"grad_norm": 4.5,
"learning_rate": 2.91841076564813e-05,
"loss": 0.1396,
"step": 4450
},
{
"epoch": 0.45311388804226355,
"grad_norm": 1.9609375,
"learning_rate": 2.9104611526028808e-05,
"loss": 0.186,
"step": 4460
},
{
"epoch": 0.45412983846388294,
"grad_norm": 2.046875,
"learning_rate": 2.902507271934855e-05,
"loss": 0.1706,
"step": 4470
},
{
"epoch": 0.4551457888855024,
"grad_norm": 2.390625,
"learning_rate": 2.8945492063416768e-05,
"loss": 0.2191,
"step": 4480
},
{
"epoch": 0.45616173930712184,
"grad_norm": 2.734375,
"learning_rate": 2.8865870385644823e-05,
"loss": 0.1651,
"step": 4490
},
{
"epoch": 0.45717768972874123,
"grad_norm": 4.4375,
"learning_rate": 2.8786208513870583e-05,
"loss": 0.1907,
"step": 4500
},
{
"epoch": 0.4581936401503607,
"grad_norm": 1.9609375,
"learning_rate": 2.8706507276349815e-05,
"loss": 0.2256,
"step": 4510
},
{
"epoch": 0.4592095905719801,
"grad_norm": 3.375,
"learning_rate": 2.8626767501747588e-05,
"loss": 0.215,
"step": 4520
},
{
"epoch": 0.4602255409935995,
"grad_norm": 2.296875,
"learning_rate": 2.854699001912964e-05,
"loss": 0.2241,
"step": 4530
},
{
"epoch": 0.4612414914152189,
"grad_norm": 2.078125,
"learning_rate": 2.846717565795376e-05,
"loss": 0.1541,
"step": 4540
},
{
"epoch": 0.46225744183683837,
"grad_norm": 0.81640625,
"learning_rate": 2.8387325248061164e-05,
"loss": 0.1718,
"step": 4550
},
{
"epoch": 0.46327339225845776,
"grad_norm": 5.6875,
"learning_rate": 2.8307439619667897e-05,
"loss": 0.259,
"step": 4560
},
{
"epoch": 0.4642893426800772,
"grad_norm": 1.78125,
"learning_rate": 2.8227519603356157e-05,
"loss": 0.2205,
"step": 4570
},
{
"epoch": 0.46530529310169666,
"grad_norm": 4.78125,
"learning_rate": 2.8147566030065677e-05,
"loss": 0.2256,
"step": 4580
},
{
"epoch": 0.46632124352331605,
"grad_norm": 3.296875,
"learning_rate": 2.8067579731085085e-05,
"loss": 0.1671,
"step": 4590
},
{
"epoch": 0.4673371939449355,
"grad_norm": 3.265625,
"learning_rate": 2.7987561538043273e-05,
"loss": 0.2471,
"step": 4600
},
{
"epoch": 0.4683531443665549,
"grad_norm": 3.390625,
"learning_rate": 2.7907512282900727e-05,
"loss": 0.1749,
"step": 4610
},
{
"epoch": 0.46936909478817435,
"grad_norm": 3.140625,
"learning_rate": 2.782743279794091e-05,
"loss": 0.2276,
"step": 4620
},
{
"epoch": 0.47038504520979374,
"grad_norm": 2.921875,
"learning_rate": 2.7747323915761574e-05,
"loss": 0.1971,
"step": 4630
},
{
"epoch": 0.4714009956314132,
"grad_norm": 4.15625,
"learning_rate": 2.7667186469266122e-05,
"loss": 0.1951,
"step": 4640
},
{
"epoch": 0.47241694605303264,
"grad_norm": 2.953125,
"learning_rate": 2.7587021291654924e-05,
"loss": 0.2045,
"step": 4650
},
{
"epoch": 0.47343289647465203,
"grad_norm": 1.6640625,
"learning_rate": 2.750682921641672e-05,
"loss": 0.155,
"step": 4660
},
{
"epoch": 0.4744488468962715,
"grad_norm": 4.375,
"learning_rate": 2.7426611077319864e-05,
"loss": 0.2038,
"step": 4670
},
{
"epoch": 0.4754647973178909,
"grad_norm": 5.5,
"learning_rate": 2.734636770840372e-05,
"loss": 0.159,
"step": 4680
},
{
"epoch": 0.4764807477395103,
"grad_norm": 1.703125,
"learning_rate": 2.7266099943969976e-05,
"loss": 0.1566,
"step": 4690
},
{
"epoch": 0.4774966981611297,
"grad_norm": 0.81640625,
"learning_rate": 2.7185808618573943e-05,
"loss": 0.1927,
"step": 4700
},
{
"epoch": 0.47851264858274917,
"grad_norm": 0.81640625,
"learning_rate": 2.710549456701592e-05,
"loss": 0.1873,
"step": 4710
},
{
"epoch": 0.47952859900436856,
"grad_norm": 3.828125,
"learning_rate": 2.702515862433247e-05,
"loss": 0.2474,
"step": 4720
},
{
"epoch": 0.480544549425988,
"grad_norm": 1.1640625,
"learning_rate": 2.6944801625787795e-05,
"loss": 0.204,
"step": 4730
},
{
"epoch": 0.48156049984760746,
"grad_norm": 2.953125,
"learning_rate": 2.6864424406864984e-05,
"loss": 0.1758,
"step": 4740
},
{
"epoch": 0.48257645026922685,
"grad_norm": 3.265625,
"learning_rate": 2.6784027803257377e-05,
"loss": 0.161,
"step": 4750
},
{
"epoch": 0.4835924006908463,
"grad_norm": 2.046875,
"learning_rate": 2.6703612650859848e-05,
"loss": 0.1469,
"step": 4760
},
{
"epoch": 0.4846083511124657,
"grad_norm": 4.03125,
"learning_rate": 2.6623179785760148e-05,
"loss": 0.1858,
"step": 4770
},
{
"epoch": 0.48562430153408515,
"grad_norm": 2.65625,
"learning_rate": 2.6542730044230175e-05,
"loss": 0.176,
"step": 4780
},
{
"epoch": 0.48664025195570454,
"grad_norm": 2.59375,
"learning_rate": 2.6462264262717278e-05,
"loss": 0.1657,
"step": 4790
},
{
"epoch": 0.487656202377324,
"grad_norm": 4.78125,
"learning_rate": 2.6381783277835605e-05,
"loss": 0.2705,
"step": 4800
},
{
"epoch": 0.48867215279894344,
"grad_norm": 3.65625,
"learning_rate": 2.6301287926357355e-05,
"loss": 0.2252,
"step": 4810
},
{
"epoch": 0.48968810322056283,
"grad_norm": 0.734375,
"learning_rate": 2.622077904520411e-05,
"loss": 0.2141,
"step": 4820
},
{
"epoch": 0.4907040536421823,
"grad_norm": 5.15625,
"learning_rate": 2.6140257471438108e-05,
"loss": 0.1935,
"step": 4830
},
{
"epoch": 0.4917200040638017,
"grad_norm": 3.625,
"learning_rate": 2.6059724042253574e-05,
"loss": 0.2121,
"step": 4840
},
{
"epoch": 0.4927359544854211,
"grad_norm": 1.2890625,
"learning_rate": 2.5979179594967983e-05,
"loss": 0.1221,
"step": 4850
},
{
"epoch": 0.4937519049070405,
"grad_norm": 3.4375,
"learning_rate": 2.5898624967013367e-05,
"loss": 0.2208,
"step": 4860
},
{
"epoch": 0.49476785532865997,
"grad_norm": 2.40625,
"learning_rate": 2.5818060995927607e-05,
"loss": 0.1904,
"step": 4870
},
{
"epoch": 0.49578380575027936,
"grad_norm": 2.921875,
"learning_rate": 2.573748851934574e-05,
"loss": 0.1658,
"step": 4880
},
{
"epoch": 0.4967997561718988,
"grad_norm": 1.6640625,
"learning_rate": 2.5656908374991213e-05,
"loss": 0.1626,
"step": 4890
},
{
"epoch": 0.49781570659351826,
"grad_norm": 1.8046875,
"learning_rate": 2.557632140066721e-05,
"loss": 0.1905,
"step": 4900
},
{
"epoch": 0.49883165701513765,
"grad_norm": 4.875,
"learning_rate": 2.5495728434247917e-05,
"loss": 0.2591,
"step": 4910
},
{
"epoch": 0.4998476074367571,
"grad_norm": 1.4453125,
"learning_rate": 2.5415130313669845e-05,
"loss": 0.1359,
"step": 4920
},
{
"epoch": 0.5008635578583766,
"grad_norm": 2.109375,
"learning_rate": 2.5334527876923063e-05,
"loss": 0.2353,
"step": 4930
},
{
"epoch": 0.501879508279996,
"grad_norm": 3.546875,
"learning_rate": 2.5253921962042525e-05,
"loss": 0.2173,
"step": 4940
},
{
"epoch": 0.5028954587016153,
"grad_norm": 1.8125,
"learning_rate": 2.5173313407099373e-05,
"loss": 0.1631,
"step": 4950
},
{
"epoch": 0.5039114091232347,
"grad_norm": 2.671875,
"learning_rate": 2.5092703050192163e-05,
"loss": 0.1884,
"step": 4960
},
{
"epoch": 0.5049273595448542,
"grad_norm": 2.5625,
"learning_rate": 2.501209172943819e-05,
"loss": 0.217,
"step": 4970
},
{
"epoch": 0.5059433099664736,
"grad_norm": 4.375,
"learning_rate": 2.49314802829648e-05,
"loss": 0.1854,
"step": 4980
},
{
"epoch": 0.506959260388093,
"grad_norm": 2.3125,
"learning_rate": 2.4850869548900628e-05,
"loss": 0.2049,
"step": 4990
},
{
"epoch": 0.5079752108097125,
"grad_norm": 3.859375,
"learning_rate": 2.477026036536688e-05,
"loss": 0.2093,
"step": 5000
},
{
"epoch": 0.5089911612313319,
"grad_norm": 1.09375,
"learning_rate": 2.4689653570468677e-05,
"loss": 0.164,
"step": 5010
},
{
"epoch": 0.5100071116529513,
"grad_norm": 3.40625,
"learning_rate": 2.460905000228628e-05,
"loss": 0.1649,
"step": 5020
},
{
"epoch": 0.5110230620745707,
"grad_norm": 3.546875,
"learning_rate": 2.4528450498866428e-05,
"loss": 0.1777,
"step": 5030
},
{
"epoch": 0.5120390124961902,
"grad_norm": 3.0,
"learning_rate": 2.444785589821356e-05,
"loss": 0.1505,
"step": 5040
},
{
"epoch": 0.5130549629178096,
"grad_norm": 1.6484375,
"learning_rate": 2.436726703828118e-05,
"loss": 0.2672,
"step": 5050
},
{
"epoch": 0.514070913339429,
"grad_norm": 4.34375,
"learning_rate": 2.428668475696308e-05,
"loss": 0.1756,
"step": 5060
},
{
"epoch": 0.5150868637610485,
"grad_norm": 2.78125,
"learning_rate": 2.420610989208465e-05,
"loss": 0.1655,
"step": 5070
},
{
"epoch": 0.5161028141826679,
"grad_norm": 1.4609375,
"learning_rate": 2.412554328139419e-05,
"loss": 0.1579,
"step": 5080
},
{
"epoch": 0.5171187646042873,
"grad_norm": 2.28125,
"learning_rate": 2.404498576255416e-05,
"loss": 0.1599,
"step": 5090
},
{
"epoch": 0.5181347150259067,
"grad_norm": 0.6484375,
"learning_rate": 2.3964438173132522e-05,
"loss": 0.1508,
"step": 5100
},
{
"epoch": 0.5191506654475262,
"grad_norm": 3.390625,
"learning_rate": 2.388390135059395e-05,
"loss": 0.1578,
"step": 5110
},
{
"epoch": 0.5201666158691456,
"grad_norm": 1.21875,
"learning_rate": 2.3803376132291226e-05,
"loss": 0.1374,
"step": 5120
},
{
"epoch": 0.521182566290765,
"grad_norm": 4.0625,
"learning_rate": 2.3722863355456436e-05,
"loss": 0.1854,
"step": 5130
},
{
"epoch": 0.5221985167123845,
"grad_norm": 4.71875,
"learning_rate": 2.364236385719236e-05,
"loss": 0.1391,
"step": 5140
},
{
"epoch": 0.5232144671340039,
"grad_norm": 3.296875,
"learning_rate": 2.356187847446366e-05,
"loss": 0.2106,
"step": 5150
},
{
"epoch": 0.5242304175556233,
"grad_norm": 3.296875,
"learning_rate": 2.348140804408829e-05,
"loss": 0.2383,
"step": 5160
},
{
"epoch": 0.5252463679772427,
"grad_norm": 3.359375,
"learning_rate": 2.3400953402728713e-05,
"loss": 0.1537,
"step": 5170
},
{
"epoch": 0.5262623183988622,
"grad_norm": 1.4921875,
"learning_rate": 2.332051538688322e-05,
"loss": 0.1841,
"step": 5180
},
{
"epoch": 0.5272782688204816,
"grad_norm": 3.25,
"learning_rate": 2.3240094832877287e-05,
"loss": 0.1855,
"step": 5190
},
{
"epoch": 0.528294219242101,
"grad_norm": 3.34375,
"learning_rate": 2.3159692576854793e-05,
"loss": 0.2625,
"step": 5200
},
{
"epoch": 0.5293101696637205,
"grad_norm": 3.6875,
"learning_rate": 2.3079309454769413e-05,
"loss": 0.1292,
"step": 5210
},
{
"epoch": 0.5303261200853399,
"grad_norm": 1.1171875,
"learning_rate": 2.2998946302375827e-05,
"loss": 0.1263,
"step": 5220
},
{
"epoch": 0.5313420705069593,
"grad_norm": 2.71875,
"learning_rate": 2.2918603955221148e-05,
"loss": 0.2296,
"step": 5230
},
{
"epoch": 0.5323580209285786,
"grad_norm": 2.015625,
"learning_rate": 2.283828324863613e-05,
"loss": 0.1231,
"step": 5240
},
{
"epoch": 0.5333739713501982,
"grad_norm": 3.671875,
"learning_rate": 2.2757985017726557e-05,
"loss": 0.1939,
"step": 5250
},
{
"epoch": 0.5343899217718175,
"grad_norm": 1.9765625,
"learning_rate": 2.2677710097364495e-05,
"loss": 0.168,
"step": 5260
},
{
"epoch": 0.5354058721934369,
"grad_norm": 2.609375,
"learning_rate": 2.259745932217969e-05,
"loss": 0.1883,
"step": 5270
},
{
"epoch": 0.5364218226150563,
"grad_norm": 2.8125,
"learning_rate": 2.2517233526550817e-05,
"loss": 0.1898,
"step": 5280
},
{
"epoch": 0.5374377730366758,
"grad_norm": 3.125,
"learning_rate": 2.2437033544596837e-05,
"loss": 0.1838,
"step": 5290
},
{
"epoch": 0.5384537234582952,
"grad_norm": 4.90625,
"learning_rate": 2.2356860210168336e-05,
"loss": 0.1553,
"step": 5300
},
{
"epoch": 0.5394696738799146,
"grad_norm": 3.171875,
"learning_rate": 2.2276714356838824e-05,
"loss": 0.2248,
"step": 5310
},
{
"epoch": 0.5404856243015341,
"grad_norm": 1.34375,
"learning_rate": 2.2196596817896118e-05,
"loss": 0.1421,
"step": 5320
},
{
"epoch": 0.5415015747231535,
"grad_norm": 3.28125,
"learning_rate": 2.2116508426333596e-05,
"loss": 0.1947,
"step": 5330
},
{
"epoch": 0.5425175251447729,
"grad_norm": 1.9296875,
"learning_rate": 2.2036450014841652e-05,
"loss": 0.2207,
"step": 5340
},
{
"epoch": 0.5435334755663923,
"grad_norm": 0.5703125,
"learning_rate": 2.19564224157989e-05,
"loss": 0.2208,
"step": 5350
},
{
"epoch": 0.5445494259880118,
"grad_norm": 7.5625,
"learning_rate": 2.1876426461263654e-05,
"loss": 0.1739,
"step": 5360
},
{
"epoch": 0.5455653764096312,
"grad_norm": 2.15625,
"learning_rate": 2.179646298296519e-05,
"loss": 0.1938,
"step": 5370
},
{
"epoch": 0.5465813268312506,
"grad_norm": 4.1875,
"learning_rate": 2.171653281229511e-05,
"loss": 0.1736,
"step": 5380
},
{
"epoch": 0.5475972772528701,
"grad_norm": 4.65625,
"learning_rate": 2.1636636780298732e-05,
"loss": 0.2167,
"step": 5390
},
{
"epoch": 0.5486132276744895,
"grad_norm": 1.84375,
"learning_rate": 2.1556775717666427e-05,
"loss": 0.1711,
"step": 5400
},
{
"epoch": 0.5496291780961089,
"grad_norm": 5.125,
"learning_rate": 2.147695045472499e-05,
"loss": 0.1789,
"step": 5410
},
{
"epoch": 0.5506451285177283,
"grad_norm": 3.859375,
"learning_rate": 2.1397161821428973e-05,
"loss": 0.2187,
"step": 5420
},
{
"epoch": 0.5516610789393478,
"grad_norm": 2.25,
"learning_rate": 2.131741064735212e-05,
"loss": 0.1367,
"step": 5430
},
{
"epoch": 0.5526770293609672,
"grad_norm": 4.65625,
"learning_rate": 2.1237697761678684e-05,
"loss": 0.1574,
"step": 5440
},
{
"epoch": 0.5536929797825866,
"grad_norm": 1.2265625,
"learning_rate": 2.1158023993194848e-05,
"loss": 0.1301,
"step": 5450
},
{
"epoch": 0.5547089302042061,
"grad_norm": 4.21875,
"learning_rate": 2.107839017028005e-05,
"loss": 0.2782,
"step": 5460
},
{
"epoch": 0.5557248806258255,
"grad_norm": 0.52734375,
"learning_rate": 2.0998797120898457e-05,
"loss": 0.2024,
"step": 5470
},
{
"epoch": 0.5567408310474449,
"grad_norm": 1.46875,
"learning_rate": 2.0919245672590277e-05,
"loss": 0.1755,
"step": 5480
},
{
"epoch": 0.5577567814690643,
"grad_norm": 2.140625,
"learning_rate": 2.083973665246318e-05,
"loss": 0.2058,
"step": 5490
},
{
"epoch": 0.5587727318906838,
"grad_norm": 1.5390625,
"learning_rate": 2.076027088718373e-05,
"loss": 0.2159,
"step": 5500
},
{
"epoch": 0.5597886823123032,
"grad_norm": 1.9921875,
"learning_rate": 2.0680849202968743e-05,
"loss": 0.2139,
"step": 5510
},
{
"epoch": 0.5608046327339226,
"grad_norm": 2.4375,
"learning_rate": 2.060147242557674e-05,
"loss": 0.183,
"step": 5520
},
{
"epoch": 0.5618205831555421,
"grad_norm": 5.5,
"learning_rate": 2.0522141380299308e-05,
"loss": 0.1673,
"step": 5530
},
{
"epoch": 0.5628365335771615,
"grad_norm": 4.25,
"learning_rate": 2.044285689195258e-05,
"loss": 0.1674,
"step": 5540
},
{
"epoch": 0.5638524839987809,
"grad_norm": 2.109375,
"learning_rate": 2.0363619784868604e-05,
"loss": 0.1531,
"step": 5550
},
{
"epoch": 0.5648684344204002,
"grad_norm": 2.59375,
"learning_rate": 2.0284430882886836e-05,
"loss": 0.1665,
"step": 5560
},
{
"epoch": 0.5658843848420197,
"grad_norm": 3.984375,
"learning_rate": 2.020529100934549e-05,
"loss": 0.1717,
"step": 5570
},
{
"epoch": 0.5669003352636391,
"grad_norm": 1.6015625,
"learning_rate": 2.012620098707306e-05,
"loss": 0.1167,
"step": 5580
},
{
"epoch": 0.5679162856852585,
"grad_norm": 6.0625,
"learning_rate": 2.004716163837972e-05,
"loss": 0.2084,
"step": 5590
},
{
"epoch": 0.5689322361068779,
"grad_norm": 2.5625,
"learning_rate": 1.996817378504876e-05,
"loss": 0.1939,
"step": 5600
},
{
"epoch": 0.5699481865284974,
"grad_norm": 3.109375,
"learning_rate": 1.9889238248328108e-05,
"loss": 0.1241,
"step": 5610
},
{
"epoch": 0.5709641369501168,
"grad_norm": 4.875,
"learning_rate": 1.981035584892171e-05,
"loss": 0.1865,
"step": 5620
},
{
"epoch": 0.5719800873717362,
"grad_norm": 2.984375,
"learning_rate": 1.9731527406981072e-05,
"loss": 0.1639,
"step": 5630
},
{
"epoch": 0.5729960377933557,
"grad_norm": 4.4375,
"learning_rate": 1.9652753742096655e-05,
"loss": 0.2019,
"step": 5640
},
{
"epoch": 0.5740119882149751,
"grad_norm": 4.3125,
"learning_rate": 1.9574035673289432e-05,
"loss": 0.1829,
"step": 5650
},
{
"epoch": 0.5750279386365945,
"grad_norm": 3.203125,
"learning_rate": 1.9495374019002312e-05,
"loss": 0.2267,
"step": 5660
},
{
"epoch": 0.5760438890582139,
"grad_norm": 1.765625,
"learning_rate": 1.9416769597091673e-05,
"loss": 0.1411,
"step": 5670
},
{
"epoch": 0.5770598394798334,
"grad_norm": 2.640625,
"learning_rate": 1.9338223224818818e-05,
"loss": 0.1476,
"step": 5680
},
{
"epoch": 0.5780757899014528,
"grad_norm": 4.84375,
"learning_rate": 1.9259735718841524e-05,
"loss": 0.1417,
"step": 5690
},
{
"epoch": 0.5790917403230722,
"grad_norm": 2.421875,
"learning_rate": 1.918130789520551e-05,
"loss": 0.1592,
"step": 5700
},
{
"epoch": 0.5801076907446917,
"grad_norm": 2.984375,
"learning_rate": 1.9102940569335963e-05,
"loss": 0.161,
"step": 5710
},
{
"epoch": 0.5811236411663111,
"grad_norm": 1.0234375,
"learning_rate": 1.9024634556029093e-05,
"loss": 0.1614,
"step": 5720
},
{
"epoch": 0.5821395915879305,
"grad_norm": 2.90625,
"learning_rate": 1.89463906694436e-05,
"loss": 0.1505,
"step": 5730
},
{
"epoch": 0.5831555420095499,
"grad_norm": 2.875,
"learning_rate": 1.8868209723092286e-05,
"loss": 0.1674,
"step": 5740
},
{
"epoch": 0.5841714924311694,
"grad_norm": 0.408203125,
"learning_rate": 1.8790092529833508e-05,
"loss": 0.1468,
"step": 5750
},
{
"epoch": 0.5851874428527888,
"grad_norm": 5.1875,
"learning_rate": 1.871203990186281e-05,
"loss": 0.1903,
"step": 5760
},
{
"epoch": 0.5862033932744082,
"grad_norm": 0.5546875,
"learning_rate": 1.8634052650704415e-05,
"loss": 0.2644,
"step": 5770
},
{
"epoch": 0.5872193436960277,
"grad_norm": 3.203125,
"learning_rate": 1.8556131587202848e-05,
"loss": 0.1968,
"step": 5780
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2.484375,
"learning_rate": 1.8478277521514424e-05,
"loss": 0.2249,
"step": 5790
},
{
"epoch": 0.5892512445392665,
"grad_norm": 4.0,
"learning_rate": 1.8400491263098906e-05,
"loss": 0.1881,
"step": 5800
},
{
"epoch": 0.5902671949608859,
"grad_norm": 1.90625,
"learning_rate": 1.832277362071106e-05,
"loss": 0.1352,
"step": 5810
},
{
"epoch": 0.5912831453825054,
"grad_norm": 2.765625,
"learning_rate": 1.824512540239221e-05,
"loss": 0.2737,
"step": 5820
},
{
"epoch": 0.5922990958041248,
"grad_norm": 2.609375,
"learning_rate": 1.81675474154619e-05,
"loss": 0.1566,
"step": 5830
},
{
"epoch": 0.5933150462257442,
"grad_norm": 2.6875,
"learning_rate": 1.8090040466509444e-05,
"loss": 0.1999,
"step": 5840
},
{
"epoch": 0.5943309966473637,
"grad_norm": 2.609375,
"learning_rate": 1.8012605361385592e-05,
"loss": 0.2372,
"step": 5850
},
{
"epoch": 0.5953469470689831,
"grad_norm": 8.125,
"learning_rate": 1.7935242905194087e-05,
"loss": 0.2411,
"step": 5860
},
{
"epoch": 0.5963628974906025,
"grad_norm": 3.46875,
"learning_rate": 1.785795390228336e-05,
"loss": 0.138,
"step": 5870
},
{
"epoch": 0.5973788479122218,
"grad_norm": 2.3125,
"learning_rate": 1.7780739156238125e-05,
"loss": 0.1867,
"step": 5880
},
{
"epoch": 0.5983947983338413,
"grad_norm": 4.0625,
"learning_rate": 1.770359946987105e-05,
"loss": 0.2091,
"step": 5890
},
{
"epoch": 0.5994107487554607,
"grad_norm": 5.21875,
"learning_rate": 1.7626535645214378e-05,
"loss": 0.2091,
"step": 5900
},
{
"epoch": 0.6004266991770801,
"grad_norm": 3.15625,
"learning_rate": 1.7549548483511614e-05,
"loss": 0.1927,
"step": 5910
},
{
"epoch": 0.6014426495986995,
"grad_norm": 4.71875,
"learning_rate": 1.7472638785209198e-05,
"loss": 0.1893,
"step": 5920
},
{
"epoch": 0.602458600020319,
"grad_norm": 3.015625,
"learning_rate": 1.7395807349948145e-05,
"loss": 0.1557,
"step": 5930
},
{
"epoch": 0.6034745504419384,
"grad_norm": 2.9375,
"learning_rate": 1.73190549765558e-05,
"loss": 0.1717,
"step": 5940
},
{
"epoch": 0.6044905008635578,
"grad_norm": 3.109375,
"learning_rate": 1.724238246303745e-05,
"loss": 0.1879,
"step": 5950
},
{
"epoch": 0.6055064512851773,
"grad_norm": 3.875,
"learning_rate": 1.71657906065681e-05,
"loss": 0.1908,
"step": 5960
},
{
"epoch": 0.6065224017067967,
"grad_norm": 5.09375,
"learning_rate": 1.7089280203484115e-05,
"loss": 0.1712,
"step": 5970
},
{
"epoch": 0.6075383521284161,
"grad_norm": 3.015625,
"learning_rate": 1.701285204927502e-05,
"loss": 0.1454,
"step": 5980
},
{
"epoch": 0.6085543025500355,
"grad_norm": 3.265625,
"learning_rate": 1.693650693857515e-05,
"loss": 0.2283,
"step": 5990
},
{
"epoch": 0.609570252971655,
"grad_norm": 3.40625,
"learning_rate": 1.6860245665155466e-05,
"loss": 0.2188,
"step": 6000
},
{
"epoch": 0.6105862033932744,
"grad_norm": 2.5625,
"learning_rate": 1.678406902191521e-05,
"loss": 0.1605,
"step": 6010
},
{
"epoch": 0.6116021538148938,
"grad_norm": 0.6796875,
"learning_rate": 1.670797780087374e-05,
"loss": 0.1472,
"step": 6020
},
{
"epoch": 0.6126181042365133,
"grad_norm": 2.234375,
"learning_rate": 1.6631972793162288e-05,
"loss": 0.1676,
"step": 6030
},
{
"epoch": 0.6136340546581327,
"grad_norm": 1.25,
"learning_rate": 1.6556054789015662e-05,
"loss": 0.1508,
"step": 6040
},
{
"epoch": 0.6146500050797521,
"grad_norm": 4.78125,
"learning_rate": 1.6480224577764132e-05,
"loss": 0.1981,
"step": 6050
},
{
"epoch": 0.6156659555013715,
"grad_norm": 3.46875,
"learning_rate": 1.6404482947825137e-05,
"loss": 0.2514,
"step": 6060
},
{
"epoch": 0.616681905922991,
"grad_norm": 1.265625,
"learning_rate": 1.6328830686695154e-05,
"loss": 0.2397,
"step": 6070
},
{
"epoch": 0.6176978563446104,
"grad_norm": 1.953125,
"learning_rate": 1.625326858094144e-05,
"loss": 0.1523,
"step": 6080
},
{
"epoch": 0.6187138067662298,
"grad_norm": 3.484375,
"learning_rate": 1.6177797416193953e-05,
"loss": 0.218,
"step": 6090
},
{
"epoch": 0.6197297571878493,
"grad_norm": 3.484375,
"learning_rate": 1.6102417977137052e-05,
"loss": 0.1476,
"step": 6100
},
{
"epoch": 0.6207457076094687,
"grad_norm": 4.90625,
"learning_rate": 1.602713104750147e-05,
"loss": 0.1818,
"step": 6110
},
{
"epoch": 0.6217616580310881,
"grad_norm": 4.375,
"learning_rate": 1.5951937410056087e-05,
"loss": 0.2061,
"step": 6120
},
{
"epoch": 0.6227776084527075,
"grad_norm": 6.3125,
"learning_rate": 1.587683784659979e-05,
"loss": 0.1566,
"step": 6130
},
{
"epoch": 0.623793558874327,
"grad_norm": 2.828125,
"learning_rate": 1.58018331379534e-05,
"loss": 0.1376,
"step": 6140
},
{
"epoch": 0.6248095092959464,
"grad_norm": 2.40625,
"learning_rate": 1.572692406395149e-05,
"loss": 0.1655,
"step": 6150
},
{
"epoch": 0.6258254597175658,
"grad_norm": 4.34375,
"learning_rate": 1.5652111403434338e-05,
"loss": 0.2363,
"step": 6160
},
{
"epoch": 0.6268414101391853,
"grad_norm": 2.453125,
"learning_rate": 1.5577395934239757e-05,
"loss": 0.2464,
"step": 6170
},
{
"epoch": 0.6278573605608047,
"grad_norm": 2.53125,
"learning_rate": 1.5502778433195085e-05,
"loss": 0.1898,
"step": 6180
},
{
"epoch": 0.628873310982424,
"grad_norm": 2.28125,
"learning_rate": 1.5428259676109048e-05,
"loss": 0.1804,
"step": 6190
},
{
"epoch": 0.6298892614040434,
"grad_norm": 4.3125,
"learning_rate": 1.5353840437763732e-05,
"loss": 0.1409,
"step": 6200
},
{
"epoch": 0.630905211825663,
"grad_norm": 2.5625,
"learning_rate": 1.5279521491906496e-05,
"loss": 0.2449,
"step": 6210
},
{
"epoch": 0.6319211622472823,
"grad_norm": 3.0625,
"learning_rate": 1.520530361124195e-05,
"loss": 0.2103,
"step": 6220
},
{
"epoch": 0.6329371126689017,
"grad_norm": 2.609375,
"learning_rate": 1.5131187567423937e-05,
"loss": 0.2156,
"step": 6230
},
{
"epoch": 0.6339530630905211,
"grad_norm": 2.703125,
"learning_rate": 1.5057174131047446e-05,
"loss": 0.161,
"step": 6240
},
{
"epoch": 0.6349690135121406,
"grad_norm": 3.265625,
"learning_rate": 1.4983264071640679e-05,
"loss": 0.1757,
"step": 6250
},
{
"epoch": 0.63598496393376,
"grad_norm": 3.15625,
"learning_rate": 1.490945815765699e-05,
"loss": 0.2011,
"step": 6260
},
{
"epoch": 0.6370009143553794,
"grad_norm": 5.375,
"learning_rate": 1.4835757156466945e-05,
"loss": 0.1658,
"step": 6270
},
{
"epoch": 0.6380168647769989,
"grad_norm": 2.984375,
"learning_rate": 1.4762161834350271e-05,
"loss": 0.1754,
"step": 6280
},
{
"epoch": 0.6390328151986183,
"grad_norm": 2.015625,
"learning_rate": 1.4688672956487987e-05,
"loss": 0.1427,
"step": 6290
},
{
"epoch": 0.6400487656202377,
"grad_norm": 3.78125,
"learning_rate": 1.4615291286954352e-05,
"loss": 0.1517,
"step": 6300
},
{
"epoch": 0.6410647160418571,
"grad_norm": 2.859375,
"learning_rate": 1.4542017588709005e-05,
"loss": 0.2348,
"step": 6310
},
{
"epoch": 0.6420806664634766,
"grad_norm": 2.421875,
"learning_rate": 1.4468852623588961e-05,
"loss": 0.2089,
"step": 6320
},
{
"epoch": 0.643096616885096,
"grad_norm": 2.15625,
"learning_rate": 1.4395797152300719e-05,
"loss": 0.1702,
"step": 6330
},
{
"epoch": 0.6441125673067154,
"grad_norm": 1.53125,
"learning_rate": 1.4322851934412382e-05,
"loss": 0.1017,
"step": 6340
},
{
"epoch": 0.6451285177283349,
"grad_norm": 1.90625,
"learning_rate": 1.4250017728345716e-05,
"loss": 0.1813,
"step": 6350
},
{
"epoch": 0.6461444681499543,
"grad_norm": 2.015625,
"learning_rate": 1.4177295291368292e-05,
"loss": 0.1095,
"step": 6360
},
{
"epoch": 0.6471604185715737,
"grad_norm": 2.625,
"learning_rate": 1.410468537958558e-05,
"loss": 0.2259,
"step": 6370
},
{
"epoch": 0.6481763689931931,
"grad_norm": 3.5,
"learning_rate": 1.4032188747933136e-05,
"loss": 0.1595,
"step": 6380
},
{
"epoch": 0.6491923194148126,
"grad_norm": 5.21875,
"learning_rate": 1.39598061501687e-05,
"loss": 0.2226,
"step": 6390
},
{
"epoch": 0.650208269836432,
"grad_norm": 5.34375,
"learning_rate": 1.388753833886442e-05,
"loss": 0.2132,
"step": 6400
},
{
"epoch": 0.6512242202580514,
"grad_norm": 3.640625,
"learning_rate": 1.3815386065398945e-05,
"loss": 0.1227,
"step": 6410
},
{
"epoch": 0.6522401706796709,
"grad_norm": 1.0,
"learning_rate": 1.3743350079949705e-05,
"loss": 0.1755,
"step": 6420
},
{
"epoch": 0.6532561211012903,
"grad_norm": 2.359375,
"learning_rate": 1.3671431131485057e-05,
"loss": 0.1552,
"step": 6430
},
{
"epoch": 0.6542720715229097,
"grad_norm": 5.3125,
"learning_rate": 1.3599629967756483e-05,
"loss": 0.1917,
"step": 6440
},
{
"epoch": 0.6552880219445291,
"grad_norm": 4.625,
"learning_rate": 1.3527947335290877e-05,
"loss": 0.1812,
"step": 6450
},
{
"epoch": 0.6563039723661486,
"grad_norm": 1.234375,
"learning_rate": 1.3456383979382708e-05,
"loss": 0.1896,
"step": 6460
},
{
"epoch": 0.657319922787768,
"grad_norm": 3.984375,
"learning_rate": 1.3384940644086352e-05,
"loss": 0.1484,
"step": 6470
},
{
"epoch": 0.6583358732093874,
"grad_norm": 2.40625,
"learning_rate": 1.3313618072208268e-05,
"loss": 0.1334,
"step": 6480
},
{
"epoch": 0.6593518236310069,
"grad_norm": 4.375,
"learning_rate": 1.3242417005299357e-05,
"loss": 0.1351,
"step": 6490
},
{
"epoch": 0.6603677740526263,
"grad_norm": 2.640625,
"learning_rate": 1.31713381836472e-05,
"loss": 0.1717,
"step": 6500
},
{
"epoch": 0.6613837244742456,
"grad_norm": 2.640625,
"learning_rate": 1.3100382346268392e-05,
"loss": 0.1867,
"step": 6510
},
{
"epoch": 0.662399674895865,
"grad_norm": 1.734375,
"learning_rate": 1.3029550230900812e-05,
"loss": 0.1997,
"step": 6520
},
{
"epoch": 0.6634156253174845,
"grad_norm": 3.609375,
"learning_rate": 1.2958842573996016e-05,
"loss": 0.1969,
"step": 6530
},
{
"epoch": 0.6644315757391039,
"grad_norm": 3.578125,
"learning_rate": 1.2888260110711525e-05,
"loss": 0.1469,
"step": 6540
},
{
"epoch": 0.6654475261607233,
"grad_norm": 1.3515625,
"learning_rate": 1.2817803574903212e-05,
"loss": 0.1524,
"step": 6550
},
{
"epoch": 0.6664634765823427,
"grad_norm": 2.109375,
"learning_rate": 1.2747473699117668e-05,
"loss": 0.159,
"step": 6560
},
{
"epoch": 0.6674794270039622,
"grad_norm": 1.53125,
"learning_rate": 1.267727121458458e-05,
"loss": 0.1999,
"step": 6570
},
{
"epoch": 0.6684953774255816,
"grad_norm": 1.7265625,
"learning_rate": 1.2607196851209137e-05,
"loss": 0.2216,
"step": 6580
},
{
"epoch": 0.669511327847201,
"grad_norm": 3.125,
"learning_rate": 1.2537251337564412e-05,
"loss": 0.1607,
"step": 6590
},
{
"epoch": 0.6705272782688205,
"grad_norm": 2.421875,
"learning_rate": 1.2467435400883839e-05,
"loss": 0.2187,
"step": 6600
},
{
"epoch": 0.6715432286904399,
"grad_norm": 1.5078125,
"learning_rate": 1.239774976705359e-05,
"loss": 0.1753,
"step": 6610
},
{
"epoch": 0.6725591791120593,
"grad_norm": 1.140625,
"learning_rate": 1.2328195160605092e-05,
"loss": 0.194,
"step": 6620
},
{
"epoch": 0.6735751295336787,
"grad_norm": 4.9375,
"learning_rate": 1.225877230470743e-05,
"loss": 0.1485,
"step": 6630
},
{
"epoch": 0.6745910799552982,
"grad_norm": 3.65625,
"learning_rate": 1.218948192115988e-05,
"loss": 0.1847,
"step": 6640
},
{
"epoch": 0.6756070303769176,
"grad_norm": 3.875,
"learning_rate": 1.21203247303844e-05,
"loss": 0.1874,
"step": 6650
},
{
"epoch": 0.676622980798537,
"grad_norm": 2.65625,
"learning_rate": 1.2051301451418073e-05,
"loss": 0.2377,
"step": 6660
},
{
"epoch": 0.6776389312201565,
"grad_norm": 2.09375,
"learning_rate": 1.198241280190574e-05,
"loss": 0.1508,
"step": 6670
},
{
"epoch": 0.6786548816417759,
"grad_norm": 2.203125,
"learning_rate": 1.1913659498092431e-05,
"loss": 0.1537,
"step": 6680
},
{
"epoch": 0.6796708320633953,
"grad_norm": 2.484375,
"learning_rate": 1.184504225481601e-05,
"loss": 0.2339,
"step": 6690
},
{
"epoch": 0.6806867824850147,
"grad_norm": 5.625,
"learning_rate": 1.177656178549966e-05,
"loss": 0.2102,
"step": 6700
},
{
"epoch": 0.6817027329066342,
"grad_norm": 2.5,
"learning_rate": 1.1708218802144536e-05,
"loss": 0.1435,
"step": 6710
},
{
"epoch": 0.6827186833282536,
"grad_norm": 3.84375,
"learning_rate": 1.1640014015322323e-05,
"loss": 0.1823,
"step": 6720
},
{
"epoch": 0.683734633749873,
"grad_norm": 2.359375,
"learning_rate": 1.1571948134167862e-05,
"loss": 0.1154,
"step": 6730
},
{
"epoch": 0.6847505841714925,
"grad_norm": 2.90625,
"learning_rate": 1.1504021866371761e-05,
"loss": 0.2105,
"step": 6740
},
{
"epoch": 0.6857665345931119,
"grad_norm": 5.46875,
"learning_rate": 1.143623591817304e-05,
"loss": 0.1317,
"step": 6750
},
{
"epoch": 0.6867824850147313,
"grad_norm": 3.34375,
"learning_rate": 1.1368590994351835e-05,
"loss": 0.1406,
"step": 6760
},
{
"epoch": 0.6877984354363507,
"grad_norm": 3.78125,
"learning_rate": 1.130108779822198e-05,
"loss": 0.1425,
"step": 6770
},
{
"epoch": 0.6888143858579702,
"grad_norm": 0.77734375,
"learning_rate": 1.1233727031623783e-05,
"loss": 0.1623,
"step": 6780
},
{
"epoch": 0.6898303362795896,
"grad_norm": 4.625,
"learning_rate": 1.1166509394916682e-05,
"loss": 0.1591,
"step": 6790
},
{
"epoch": 0.690846286701209,
"grad_norm": 3.84375,
"learning_rate": 1.1099435586971982e-05,
"loss": 0.1758,
"step": 6800
},
{
"epoch": 0.6918622371228285,
"grad_norm": 2.4375,
"learning_rate": 1.1032506305165555e-05,
"loss": 0.1018,
"step": 6810
},
{
"epoch": 0.6928781875444479,
"grad_norm": 3.203125,
"learning_rate": 1.0965722245370641e-05,
"loss": 0.1485,
"step": 6820
},
{
"epoch": 0.6938941379660672,
"grad_norm": 0.7109375,
"learning_rate": 1.0899084101950561e-05,
"loss": 0.1762,
"step": 6830
},
{
"epoch": 0.6949100883876866,
"grad_norm": 1.9765625,
"learning_rate": 1.0832592567751555e-05,
"loss": 0.1402,
"step": 6840
},
{
"epoch": 0.6959260388093061,
"grad_norm": 1.4609375,
"learning_rate": 1.0766248334095505e-05,
"loss": 0.2278,
"step": 6850
},
{
"epoch": 0.6969419892309255,
"grad_norm": 3.953125,
"learning_rate": 1.0700052090772828e-05,
"loss": 0.1969,
"step": 6860
},
{
"epoch": 0.6979579396525449,
"grad_norm": 2.453125,
"learning_rate": 1.0634004526035249e-05,
"loss": 0.2073,
"step": 6870
},
{
"epoch": 0.6989738900741643,
"grad_norm": 1.6171875,
"learning_rate": 1.0568106326588645e-05,
"loss": 0.1902,
"step": 6880
},
{
"epoch": 0.6999898404957838,
"grad_norm": 1.2734375,
"learning_rate": 1.0502358177585953e-05,
"loss": 0.2165,
"step": 6890
},
{
"epoch": 0.7010057909174032,
"grad_norm": 1.671875,
"learning_rate": 1.0436760762619977e-05,
"loss": 0.1952,
"step": 6900
},
{
"epoch": 0.7020217413390226,
"grad_norm": 2.8125,
"learning_rate": 1.0371314763716347e-05,
"loss": 0.1422,
"step": 6910
},
{
"epoch": 0.7030376917606421,
"grad_norm": 2.53125,
"learning_rate": 1.0306020861326388e-05,
"loss": 0.0961,
"step": 6920
},
{
"epoch": 0.7040536421822615,
"grad_norm": 3.046875,
"learning_rate": 1.0240879734320068e-05,
"loss": 0.1542,
"step": 6930
},
{
"epoch": 0.7050695926038809,
"grad_norm": 2.859375,
"learning_rate": 1.0175892059978901e-05,
"loss": 0.1748,
"step": 6940
},
{
"epoch": 0.7060855430255003,
"grad_norm": 2.671875,
"learning_rate": 1.0111058513988958e-05,
"loss": 0.0819,
"step": 6950
},
{
"epoch": 0.7071014934471198,
"grad_norm": 3.5625,
"learning_rate": 1.0046379770433803e-05,
"loss": 0.1933,
"step": 6960
},
{
"epoch": 0.7081174438687392,
"grad_norm": 2.859375,
"learning_rate": 9.98185650178749e-06,
"loss": 0.1891,
"step": 6970
},
{
"epoch": 0.7091333942903586,
"grad_norm": 3.15625,
"learning_rate": 9.917489378907591e-06,
"loss": 0.2102,
"step": 6980
},
{
"epoch": 0.7101493447119781,
"grad_norm": 6.40625,
"learning_rate": 9.853279071028212e-06,
"loss": 0.1714,
"step": 6990
},
{
"epoch": 0.7111652951335975,
"grad_norm": 2.375,
"learning_rate": 9.78922624575303e-06,
"loss": 0.1299,
"step": 7000
},
{
"epoch": 0.7121812455552169,
"grad_norm": 2.078125,
"learning_rate": 9.72533156904833e-06,
"loss": 0.1914,
"step": 7010
},
{
"epoch": 0.7131971959768363,
"grad_norm": 3.859375,
"learning_rate": 9.661595705236137e-06,
"loss": 0.2377,
"step": 7020
},
{
"epoch": 0.7142131463984558,
"grad_norm": 1.171875,
"learning_rate": 9.598019316987244e-06,
"loss": 0.1851,
"step": 7030
},
{
"epoch": 0.7152290968200752,
"grad_norm": 1.078125,
"learning_rate": 9.53460306531439e-06,
"loss": 0.2661,
"step": 7040
},
{
"epoch": 0.7162450472416946,
"grad_norm": 1.6484375,
"learning_rate": 9.471347609565311e-06,
"loss": 0.1669,
"step": 7050
},
{
"epoch": 0.7172609976633141,
"grad_norm": 4.59375,
"learning_rate": 9.408253607415957e-06,
"loss": 0.2487,
"step": 7060
},
{
"epoch": 0.7182769480849335,
"grad_norm": 3.09375,
"learning_rate": 9.345321714863614e-06,
"loss": 0.186,
"step": 7070
},
{
"epoch": 0.7192928985065529,
"grad_norm": 6.0625,
"learning_rate": 9.282552586220075e-06,
"loss": 0.2249,
"step": 7080
},
{
"epoch": 0.7203088489281723,
"grad_norm": 1.5703125,
"learning_rate": 9.219946874104885e-06,
"loss": 0.1255,
"step": 7090
},
{
"epoch": 0.7213247993497918,
"grad_norm": 1.9453125,
"learning_rate": 9.157505229438481e-06,
"loss": 0.1999,
"step": 7100
},
{
"epoch": 0.7223407497714112,
"grad_norm": 5.1875,
"learning_rate": 9.095228301435518e-06,
"loss": 0.199,
"step": 7110
},
{
"epoch": 0.7233567001930306,
"grad_norm": 2.078125,
"learning_rate": 9.03311673759802e-06,
"loss": 0.2182,
"step": 7120
},
{
"epoch": 0.7243726506146501,
"grad_norm": 6.46875,
"learning_rate": 8.971171183708733e-06,
"loss": 0.1573,
"step": 7130
},
{
"epoch": 0.7253886010362695,
"grad_norm": 3.015625,
"learning_rate": 8.909392283824353e-06,
"loss": 0.2044,
"step": 7140
},
{
"epoch": 0.7264045514578888,
"grad_norm": 2.921875,
"learning_rate": 8.847780680268872e-06,
"loss": 0.11,
"step": 7150
},
{
"epoch": 0.7274205018795082,
"grad_norm": 2.96875,
"learning_rate": 8.786337013626853e-06,
"loss": 0.1897,
"step": 7160
},
{
"epoch": 0.7284364523011277,
"grad_norm": 1.7578125,
"learning_rate": 8.725061922736799e-06,
"loss": 0.153,
"step": 7170
},
{
"epoch": 0.7294524027227471,
"grad_norm": 1.609375,
"learning_rate": 8.663956044684532e-06,
"loss": 0.1746,
"step": 7180
},
{
"epoch": 0.7304683531443665,
"grad_norm": 1.9375,
"learning_rate": 8.603020014796507e-06,
"loss": 0.2284,
"step": 7190
},
{
"epoch": 0.7314843035659859,
"grad_norm": 1.515625,
"learning_rate": 8.542254466633273e-06,
"loss": 0.1186,
"step": 7200
},
{
"epoch": 0.7325002539876054,
"grad_norm": 1.671875,
"learning_rate": 8.481660031982844e-06,
"loss": 0.1971,
"step": 7210
},
{
"epoch": 0.7335162044092248,
"grad_norm": 1.453125,
"learning_rate": 8.421237340854157e-06,
"loss": 0.196,
"step": 7220
},
{
"epoch": 0.7345321548308442,
"grad_norm": 0.65234375,
"learning_rate": 8.360987021470479e-06,
"loss": 0.1724,
"step": 7230
},
{
"epoch": 0.7355481052524637,
"grad_norm": 2.84375,
"learning_rate": 8.300909700262929e-06,
"loss": 0.175,
"step": 7240
},
{
"epoch": 0.7365640556740831,
"grad_norm": 3.109375,
"learning_rate": 8.241006001863924e-06,
"loss": 0.2276,
"step": 7250
},
{
"epoch": 0.7375800060957025,
"grad_norm": 4.8125,
"learning_rate": 8.181276549100714e-06,
"loss": 0.2029,
"step": 7260
},
{
"epoch": 0.7385959565173219,
"grad_norm": 4.03125,
"learning_rate": 8.12172196298887e-06,
"loss": 0.175,
"step": 7270
},
{
"epoch": 0.7396119069389414,
"grad_norm": 3.046875,
"learning_rate": 8.062342862725878e-06,
"loss": 0.1662,
"step": 7280
},
{
"epoch": 0.7406278573605608,
"grad_norm": 3.375,
"learning_rate": 8.003139865684662e-06,
"loss": 0.1616,
"step": 7290
},
{
"epoch": 0.7416438077821802,
"grad_norm": 2.5625,
"learning_rate": 7.944113587407157e-06,
"loss": 0.2448,
"step": 7300
},
{
"epoch": 0.7426597582037997,
"grad_norm": 4.125,
"learning_rate": 7.885264641597961e-06,
"loss": 0.1618,
"step": 7310
},
{
"epoch": 0.7436757086254191,
"grad_norm": 3.5,
"learning_rate": 7.826593640117889e-06,
"loss": 0.1134,
"step": 7320
},
{
"epoch": 0.7446916590470385,
"grad_norm": 2.6875,
"learning_rate": 7.76810119297767e-06,
"loss": 0.1795,
"step": 7330
},
{
"epoch": 0.7457076094686579,
"grad_norm": 4.34375,
"learning_rate": 7.709787908331556e-06,
"loss": 0.2736,
"step": 7340
},
{
"epoch": 0.7467235598902774,
"grad_norm": 1.21875,
"learning_rate": 7.651654392471038e-06,
"loss": 0.139,
"step": 7350
},
{
"epoch": 0.7477395103118968,
"grad_norm": 3.578125,
"learning_rate": 7.593701249818521e-06,
"loss": 0.2023,
"step": 7360
},
{
"epoch": 0.7487554607335162,
"grad_norm": 2.15625,
"learning_rate": 7.535929082921048e-06,
"loss": 0.1702,
"step": 7370
},
{
"epoch": 0.7497714111551357,
"grad_norm": 1.96875,
"learning_rate": 7.47833849244402e-06,
"loss": 0.1835,
"step": 7380
},
{
"epoch": 0.7507873615767551,
"grad_norm": 2.796875,
"learning_rate": 7.420930077164959e-06,
"loss": 0.1713,
"step": 7390
},
{
"epoch": 0.7518033119983745,
"grad_norm": 4.46875,
"learning_rate": 7.363704433967311e-06,
"loss": 0.1906,
"step": 7400
},
{
"epoch": 0.7528192624199939,
"grad_norm": 1.75,
"learning_rate": 7.306662157834185e-06,
"loss": 0.1421,
"step": 7410
},
{
"epoch": 0.7538352128416134,
"grad_norm": 1.140625,
"learning_rate": 7.2498038418422145e-06,
"loss": 0.1793,
"step": 7420
},
{
"epoch": 0.7548511632632328,
"grad_norm": 2.578125,
"learning_rate": 7.193130077155374e-06,
"loss": 0.1603,
"step": 7430
},
{
"epoch": 0.7558671136848522,
"grad_norm": 4.3125,
"learning_rate": 7.13664145301883e-06,
"loss": 0.2169,
"step": 7440
},
{
"epoch": 0.7568830641064717,
"grad_norm": 3.078125,
"learning_rate": 7.0803385567528025e-06,
"loss": 0.1685,
"step": 7450
},
{
"epoch": 0.757899014528091,
"grad_norm": 3.5625,
"learning_rate": 7.024221973746495e-06,
"loss": 0.2282,
"step": 7460
},
{
"epoch": 0.7589149649497104,
"grad_norm": 2.265625,
"learning_rate": 6.968292287451961e-06,
"loss": 0.1786,
"step": 7470
},
{
"epoch": 0.7599309153713298,
"grad_norm": 4.71875,
"learning_rate": 6.912550079378091e-06,
"loss": 0.1811,
"step": 7480
},
{
"epoch": 0.7609468657929493,
"grad_norm": 2.328125,
"learning_rate": 6.856995929084506e-06,
"loss": 0.1747,
"step": 7490
},
{
"epoch": 0.7619628162145687,
"grad_norm": 5.21875,
"learning_rate": 6.801630414175589e-06,
"loss": 0.2028,
"step": 7500
},
{
"epoch": 0.7629787666361881,
"grad_norm": 3.78125,
"learning_rate": 6.746454110294451e-06,
"loss": 0.2255,
"step": 7510
},
{
"epoch": 0.7639947170578075,
"grad_norm": 1.625,
"learning_rate": 6.691467591116931e-06,
"loss": 0.1604,
"step": 7520
},
{
"epoch": 0.765010667479427,
"grad_norm": 1.7734375,
"learning_rate": 6.6366714283456755e-06,
"loss": 0.2559,
"step": 7530
},
{
"epoch": 0.7660266179010464,
"grad_norm": 4.59375,
"learning_rate": 6.582066191704142e-06,
"loss": 0.2034,
"step": 7540
},
{
"epoch": 0.7670425683226658,
"grad_norm": 1.578125,
"learning_rate": 6.527652448930724e-06,
"loss": 0.148,
"step": 7550
},
{
"epoch": 0.7680585187442853,
"grad_norm": 1.7109375,
"learning_rate": 6.4734307657728e-06,
"loss": 0.1811,
"step": 7560
},
{
"epoch": 0.7690744691659047,
"grad_norm": 1.2734375,
"learning_rate": 6.419401705980924e-06,
"loss": 0.1407,
"step": 7570
},
{
"epoch": 0.7700904195875241,
"grad_norm": 2.25,
"learning_rate": 6.365565831302869e-06,
"loss": 0.1893,
"step": 7580
},
{
"epoch": 0.7711063700091435,
"grad_norm": 1.625,
"learning_rate": 6.311923701477854e-06,
"loss": 0.1835,
"step": 7590
},
{
"epoch": 0.772122320430763,
"grad_norm": 2.375,
"learning_rate": 6.258475874230713e-06,
"loss": 0.1579,
"step": 7600
},
{
"epoch": 0.7731382708523824,
"grad_norm": 4.5,
"learning_rate": 6.205222905266067e-06,
"loss": 0.1794,
"step": 7610
},
{
"epoch": 0.7741542212740018,
"grad_norm": 4.25,
"learning_rate": 6.152165348262598e-06,
"loss": 0.1477,
"step": 7620
},
{
"epoch": 0.7751701716956213,
"grad_norm": 1.9765625,
"learning_rate": 6.0993037548672246e-06,
"loss": 0.2396,
"step": 7630
},
{
"epoch": 0.7761861221172407,
"grad_norm": 2.671875,
"learning_rate": 6.046638674689454e-06,
"loss": 0.1717,
"step": 7640
},
{
"epoch": 0.7772020725388601,
"grad_norm": 3.671875,
"learning_rate": 5.994170655295567e-06,
"loss": 0.2646,
"step": 7650
},
{
"epoch": 0.7782180229604795,
"grad_norm": 1.3046875,
"learning_rate": 5.9419002422030106e-06,
"loss": 0.1553,
"step": 7660
},
{
"epoch": 0.779233973382099,
"grad_norm": 3.734375,
"learning_rate": 5.889827978874665e-06,
"loss": 0.1854,
"step": 7670
},
{
"epoch": 0.7802499238037184,
"grad_norm": 2.140625,
"learning_rate": 5.837954406713245e-06,
"loss": 0.1857,
"step": 7680
},
{
"epoch": 0.7812658742253378,
"grad_norm": 3.34375,
"learning_rate": 5.786280065055619e-06,
"loss": 0.1797,
"step": 7690
},
{
"epoch": 0.7822818246469573,
"grad_norm": 0.97265625,
"learning_rate": 5.734805491167244e-06,
"loss": 0.1488,
"step": 7700
},
{
"epoch": 0.7832977750685767,
"grad_norm": 2.078125,
"learning_rate": 5.683531220236576e-06,
"loss": 0.1688,
"step": 7710
},
{
"epoch": 0.7843137254901961,
"grad_norm": 3.046875,
"learning_rate": 5.632457785369455e-06,
"loss": 0.1503,
"step": 7720
},
{
"epoch": 0.7853296759118155,
"grad_norm": 1.6875,
"learning_rate": 5.581585717583637e-06,
"loss": 0.1658,
"step": 7730
},
{
"epoch": 0.786345626333435,
"grad_norm": 3.421875,
"learning_rate": 5.530915545803209e-06,
"loss": 0.2112,
"step": 7740
},
{
"epoch": 0.7873615767550544,
"grad_norm": 4.1875,
"learning_rate": 5.480447796853141e-06,
"loss": 0.165,
"step": 7750
},
{
"epoch": 0.7883775271766738,
"grad_norm": 5.3125,
"learning_rate": 5.430182995453756e-06,
"loss": 0.1499,
"step": 7760
},
{
"epoch": 0.7893934775982933,
"grad_norm": 2.1875,
"learning_rate": 5.380121664215329e-06,
"loss": 0.1559,
"step": 7770
},
{
"epoch": 0.7904094280199127,
"grad_norm": 1.46875,
"learning_rate": 5.330264323632611e-06,
"loss": 0.2098,
"step": 7780
},
{
"epoch": 0.791425378441532,
"grad_norm": 4.65625,
"learning_rate": 5.280611492079449e-06,
"loss": 0.1776,
"step": 7790
},
{
"epoch": 0.7924413288631514,
"grad_norm": 1.3359375,
"learning_rate": 5.231163685803361e-06,
"loss": 0.1497,
"step": 7800
},
{
"epoch": 0.7934572792847709,
"grad_norm": 2.640625,
"learning_rate": 5.181921418920191e-06,
"loss": 0.12,
"step": 7810
},
{
"epoch": 0.7944732297063903,
"grad_norm": 2.328125,
"learning_rate": 5.13288520340878e-06,
"loss": 0.1981,
"step": 7820
},
{
"epoch": 0.7954891801280097,
"grad_norm": 3.0625,
"learning_rate": 5.084055549105596e-06,
"loss": 0.1389,
"step": 7830
},
{
"epoch": 0.7965051305496291,
"grad_norm": 2.796875,
"learning_rate": 5.035432963699479e-06,
"loss": 0.2293,
"step": 7840
},
{
"epoch": 0.7975210809712486,
"grad_norm": 5.0625,
"learning_rate": 4.98701795272635e-06,
"loss": 0.1618,
"step": 7850
},
{
"epoch": 0.798537031392868,
"grad_norm": 5.09375,
"learning_rate": 4.938811019563938e-06,
"loss": 0.1755,
"step": 7860
},
{
"epoch": 0.7995529818144874,
"grad_norm": 2.140625,
"learning_rate": 4.8908126654265475e-06,
"loss": 0.1565,
"step": 7870
},
{
"epoch": 0.8005689322361069,
"grad_norm": 0.76171875,
"learning_rate": 4.843023389359885e-06,
"loss": 0.2176,
"step": 7880
},
{
"epoch": 0.8015848826577263,
"grad_norm": 2.625,
"learning_rate": 4.79544368823581e-06,
"loss": 0.2013,
"step": 7890
},
{
"epoch": 0.8026008330793457,
"grad_norm": 2.078125,
"learning_rate": 4.748074056747234e-06,
"loss": 0.1246,
"step": 7900
},
{
"epoch": 0.8036167835009651,
"grad_norm": 3.5,
"learning_rate": 4.700914987402919e-06,
"loss": 0.1638,
"step": 7910
},
{
"epoch": 0.8046327339225846,
"grad_norm": 3.4375,
"learning_rate": 4.6539669705223916e-06,
"loss": 0.2213,
"step": 7920
},
{
"epoch": 0.805648684344204,
"grad_norm": 2.96875,
"learning_rate": 4.607230494230849e-06,
"loss": 0.1822,
"step": 7930
},
{
"epoch": 0.8066646347658234,
"grad_norm": 2.359375,
"learning_rate": 4.560706044454047e-06,
"loss": 0.1763,
"step": 7940
},
{
"epoch": 0.8076805851874429,
"grad_norm": 4.59375,
"learning_rate": 4.514394104913291e-06,
"loss": 0.234,
"step": 7950
},
{
"epoch": 0.8086965356090623,
"grad_norm": 1.96875,
"learning_rate": 4.468295157120372e-06,
"loss": 0.1939,
"step": 7960
},
{
"epoch": 0.8097124860306817,
"grad_norm": 2.578125,
"learning_rate": 4.422409680372594e-06,
"loss": 0.174,
"step": 7970
},
{
"epoch": 0.8107284364523011,
"grad_norm": 4.5625,
"learning_rate": 4.3767381517477505e-06,
"loss": 0.2375,
"step": 7980
},
{
"epoch": 0.8117443868739206,
"grad_norm": 0.9609375,
"learning_rate": 4.331281046099203e-06,
"loss": 0.2076,
"step": 7990
},
{
"epoch": 0.81276033729554,
"grad_norm": 6.0625,
"learning_rate": 4.286038836050929e-06,
"loss": 0.2504,
"step": 8000
},
{
"epoch": 0.8137762877171594,
"grad_norm": 3.484375,
"learning_rate": 4.241011991992586e-06,
"loss": 0.2102,
"step": 8010
},
{
"epoch": 0.8147922381387789,
"grad_norm": 1.9765625,
"learning_rate": 4.1962009820746635e-06,
"loss": 0.1846,
"step": 8020
},
{
"epoch": 0.8158081885603983,
"grad_norm": 1.875,
"learning_rate": 4.15160627220357e-06,
"loss": 0.1741,
"step": 8030
},
{
"epoch": 0.8168241389820177,
"grad_norm": 5.5625,
"learning_rate": 4.107228326036838e-06,
"loss": 0.2078,
"step": 8040
},
{
"epoch": 0.8178400894036371,
"grad_norm": 1.7578125,
"learning_rate": 4.063067604978252e-06,
"loss": 0.212,
"step": 8050
},
{
"epoch": 0.8188560398252566,
"grad_norm": 4.09375,
"learning_rate": 4.019124568173094e-06,
"loss": 0.1831,
"step": 8060
},
{
"epoch": 0.819871990246876,
"grad_norm": 6.625,
"learning_rate": 3.975399672503341e-06,
"loss": 0.2196,
"step": 8070
},
{
"epoch": 0.8208879406684954,
"grad_norm": 2.78125,
"learning_rate": 3.931893372582943e-06,
"loss": 0.2002,
"step": 8080
},
{
"epoch": 0.8219038910901149,
"grad_norm": 6.90625,
"learning_rate": 3.888606120753047e-06,
"loss": 0.2138,
"step": 8090
},
{
"epoch": 0.8229198415117343,
"grad_norm": 4.09375,
"learning_rate": 3.845538367077362e-06,
"loss": 0.2593,
"step": 8100
},
{
"epoch": 0.8239357919333536,
"grad_norm": 1.859375,
"learning_rate": 3.8026905593374213e-06,
"loss": 0.2062,
"step": 8110
},
{
"epoch": 0.824951742354973,
"grad_norm": 4.3125,
"learning_rate": 3.760063143027945e-06,
"loss": 0.1343,
"step": 8120
},
{
"epoch": 0.8259676927765925,
"grad_norm": 1.984375,
"learning_rate": 3.7176565613522313e-06,
"loss": 0.2494,
"step": 8130
},
{
"epoch": 0.8269836431982119,
"grad_norm": 3.71875,
"learning_rate": 3.675471255217516e-06,
"loss": 0.1502,
"step": 8140
},
{
"epoch": 0.8279995936198313,
"grad_norm": 2.359375,
"learning_rate": 3.6335076632304175e-06,
"loss": 0.1256,
"step": 8150
},
{
"epoch": 0.8290155440414507,
"grad_norm": 1.46875,
"learning_rate": 3.5917662216923332e-06,
"loss": 0.1709,
"step": 8160
},
{
"epoch": 0.8300314944630702,
"grad_norm": 2.78125,
"learning_rate": 3.550247364594958e-06,
"loss": 0.1881,
"step": 8170
},
{
"epoch": 0.8310474448846896,
"grad_norm": 1.0703125,
"learning_rate": 3.508951523615725e-06,
"loss": 0.1998,
"step": 8180
},
{
"epoch": 0.832063395306309,
"grad_norm": 2.40625,
"learning_rate": 3.467879128113352e-06,
"loss": 0.2429,
"step": 8190
},
{
"epoch": 0.8330793457279285,
"grad_norm": 2.609375,
"learning_rate": 3.427030605123352e-06,
"loss": 0.1942,
"step": 8200
},
{
"epoch": 0.8340952961495479,
"grad_norm": 1.6015625,
"learning_rate": 3.3864063793536043e-06,
"loss": 0.1898,
"step": 8210
},
{
"epoch": 0.8351112465711673,
"grad_norm": 5.375,
"learning_rate": 3.3460068731799577e-06,
"loss": 0.1919,
"step": 8220
},
{
"epoch": 0.8361271969927867,
"grad_norm": 3.3125,
"learning_rate": 3.3058325066417818e-06,
"loss": 0.1516,
"step": 8230
},
{
"epoch": 0.8371431474144062,
"grad_norm": 0.76171875,
"learning_rate": 3.26588369743768e-06,
"loss": 0.1068,
"step": 8240
},
{
"epoch": 0.8381590978360256,
"grad_norm": 3.171875,
"learning_rate": 3.2261608609210653e-06,
"loss": 0.1203,
"step": 8250
},
{
"epoch": 0.839175048257645,
"grad_norm": 2.359375,
"learning_rate": 3.186664410095913e-06,
"loss": 0.2172,
"step": 8260
},
{
"epoch": 0.8401909986792645,
"grad_norm": 3.328125,
"learning_rate": 3.1473947556124093e-06,
"loss": 0.1249,
"step": 8270
},
{
"epoch": 0.8412069491008839,
"grad_norm": 2.484375,
"learning_rate": 3.1083523057627213e-06,
"loss": 0.1744,
"step": 8280
},
{
"epoch": 0.8422228995225033,
"grad_norm": 4.46875,
"learning_rate": 3.0695374664767353e-06,
"loss": 0.1772,
"step": 8290
},
{
"epoch": 0.8432388499441227,
"grad_norm": 0.59375,
"learning_rate": 3.0309506413178397e-06,
"loss": 0.2302,
"step": 8300
},
{
"epoch": 0.8442548003657422,
"grad_norm": 2.390625,
"learning_rate": 2.9925922314787136e-06,
"loss": 0.1635,
"step": 8310
},
{
"epoch": 0.8452707507873616,
"grad_norm": 2.34375,
"learning_rate": 2.954462635777194e-06,
"loss": 0.1573,
"step": 8320
},
{
"epoch": 0.846286701208981,
"grad_norm": 2.015625,
"learning_rate": 2.916562250652083e-06,
"loss": 0.1608,
"step": 8330
},
{
"epoch": 0.8473026516306005,
"grad_norm": 4.125,
"learning_rate": 2.878891470159048e-06,
"loss": 0.184,
"step": 8340
},
{
"epoch": 0.8483186020522199,
"grad_norm": 2.515625,
"learning_rate": 2.8414506859665514e-06,
"loss": 0.2141,
"step": 8350
},
{
"epoch": 0.8493345524738393,
"grad_norm": 3.375,
"learning_rate": 2.8042402873517197e-06,
"loss": 0.1729,
"step": 8360
},
{
"epoch": 0.8503505028954587,
"grad_norm": 3.078125,
"learning_rate": 2.76726066119635e-06,
"loss": 0.2252,
"step": 8370
},
{
"epoch": 0.8513664533170782,
"grad_norm": 1.5390625,
"learning_rate": 2.730512191982845e-06,
"loss": 0.1644,
"step": 8380
},
{
"epoch": 0.8523824037386976,
"grad_norm": 1.9296875,
"learning_rate": 2.693995261790261e-06,
"loss": 0.1822,
"step": 8390
},
{
"epoch": 0.853398354160317,
"grad_norm": 3.3125,
"learning_rate": 2.657710250290285e-06,
"loss": 0.2068,
"step": 8400
},
{
"epoch": 0.8544143045819365,
"grad_norm": 0.640625,
"learning_rate": 2.621657534743327e-06,
"loss": 0.1224,
"step": 8410
},
{
"epoch": 0.8554302550035559,
"grad_norm": 3.421875,
"learning_rate": 2.5858374899945804e-06,
"loss": 0.179,
"step": 8420
},
{
"epoch": 0.8564462054251752,
"grad_norm": 3.484375,
"learning_rate": 2.550250488470135e-06,
"loss": 0.1873,
"step": 8430
},
{
"epoch": 0.8574621558467946,
"grad_norm": 3.984375,
"learning_rate": 2.5148969001730806e-06,
"loss": 0.1799,
"step": 8440
},
{
"epoch": 0.8584781062684141,
"grad_norm": 1.375,
"learning_rate": 2.4797770926796858e-06,
"loss": 0.176,
"step": 8450
},
{
"epoch": 0.8594940566900335,
"grad_norm": 1.8984375,
"learning_rate": 2.444891431135571e-06,
"loss": 0.1664,
"step": 8460
},
{
"epoch": 0.8605100071116529,
"grad_norm": 4.15625,
"learning_rate": 2.4102402782518936e-06,
"loss": 0.1512,
"step": 8470
},
{
"epoch": 0.8615259575332723,
"grad_norm": 1.34375,
"learning_rate": 2.3758239943016096e-06,
"loss": 0.1629,
"step": 8480
},
{
"epoch": 0.8625419079548918,
"grad_norm": 5.3125,
"learning_rate": 2.3416429371157013e-06,
"loss": 0.2099,
"step": 8490
},
{
"epoch": 0.8635578583765112,
"grad_norm": 5.9375,
"learning_rate": 2.307697462079464e-06,
"loss": 0.2221,
"step": 8500
},
{
"epoch": 0.8645738087981306,
"grad_norm": 5.4375,
"learning_rate": 2.273987922128809e-06,
"loss": 0.2191,
"step": 8510
},
{
"epoch": 0.8655897592197501,
"grad_norm": 2.171875,
"learning_rate": 2.240514667746607e-06,
"loss": 0.1843,
"step": 8520
},
{
"epoch": 0.8666057096413695,
"grad_norm": 2.5625,
"learning_rate": 2.2072780469590245e-06,
"loss": 0.2494,
"step": 8530
},
{
"epoch": 0.8676216600629889,
"grad_norm": 2.25,
"learning_rate": 2.1742784053319116e-06,
"loss": 0.1712,
"step": 8540
},
{
"epoch": 0.8686376104846083,
"grad_norm": 4.5625,
"learning_rate": 2.141516085967224e-06,
"loss": 0.1169,
"step": 8550
},
{
"epoch": 0.8696535609062278,
"grad_norm": 4.25,
"learning_rate": 2.1089914294994434e-06,
"loss": 0.1374,
"step": 8560
},
{
"epoch": 0.8706695113278472,
"grad_norm": 3.265625,
"learning_rate": 2.0767047740920336e-06,
"loss": 0.2162,
"step": 8570
},
{
"epoch": 0.8716854617494666,
"grad_norm": 1.8203125,
"learning_rate": 2.0446564554339187e-06,
"loss": 0.1593,
"step": 8580
},
{
"epoch": 0.8727014121710861,
"grad_norm": 2.671875,
"learning_rate": 2.0128468067360185e-06,
"loss": 0.1857,
"step": 8590
},
{
"epoch": 0.8737173625927055,
"grad_norm": 2.765625,
"learning_rate": 1.981276158727749e-06,
"loss": 0.1989,
"step": 8600
},
{
"epoch": 0.8747333130143249,
"grad_norm": 2.65625,
"learning_rate": 1.949944839653625e-06,
"loss": 0.2077,
"step": 8610
},
{
"epoch": 0.8757492634359443,
"grad_norm": 2.625,
"learning_rate": 1.918853175269797e-06,
"loss": 0.2003,
"step": 8620
},
{
"epoch": 0.8767652138575638,
"grad_norm": 0.71875,
"learning_rate": 1.8880014888407127e-06,
"loss": 0.2486,
"step": 8630
},
{
"epoch": 0.8777811642791832,
"grad_norm": 4.71875,
"learning_rate": 1.8573901011357336e-06,
"loss": 0.1896,
"step": 8640
},
{
"epoch": 0.8787971147008026,
"grad_norm": 5.0625,
"learning_rate": 1.8270193304257887e-06,
"loss": 0.1727,
"step": 8650
},
{
"epoch": 0.8798130651224221,
"grad_norm": 1.75,
"learning_rate": 1.7968894924800916e-06,
"loss": 0.1687,
"step": 8660
},
{
"epoch": 0.8808290155440415,
"grad_norm": 2.65625,
"learning_rate": 1.7670009005628291e-06,
"loss": 0.166,
"step": 8670
},
{
"epoch": 0.8818449659656609,
"grad_norm": 4.71875,
"learning_rate": 1.737353865429936e-06,
"loss": 0.1471,
"step": 8680
},
{
"epoch": 0.8828609163872803,
"grad_norm": 0.546875,
"learning_rate": 1.7079486953258283e-06,
"loss": 0.1075,
"step": 8690
},
{
"epoch": 0.8838768668088998,
"grad_norm": 1.640625,
"learning_rate": 1.6787856959802367e-06,
"loss": 0.2113,
"step": 8700
},
{
"epoch": 0.8848928172305192,
"grad_norm": 2.953125,
"learning_rate": 1.6498651706049945e-06,
"loss": 0.1412,
"step": 8710
},
{
"epoch": 0.8859087676521386,
"grad_norm": 3.796875,
"learning_rate": 1.6211874198909072e-06,
"loss": 0.1701,
"step": 8720
},
{
"epoch": 0.8869247180737581,
"grad_norm": 3.734375,
"learning_rate": 1.592752742004605e-06,
"loss": 0.1348,
"step": 8730
},
{
"epoch": 0.8879406684953774,
"grad_norm": 2.21875,
"learning_rate": 1.5645614325854735e-06,
"loss": 0.1931,
"step": 8740
},
{
"epoch": 0.8889566189169968,
"grad_norm": 3.4375,
"learning_rate": 1.5366137847425466e-06,
"loss": 0.1705,
"step": 8750
},
{
"epoch": 0.8899725693386162,
"grad_norm": 3.5625,
"learning_rate": 1.5089100890514769e-06,
"loss": 0.1889,
"step": 8760
},
{
"epoch": 0.8909885197602357,
"grad_norm": 2.65625,
"learning_rate": 1.4814506335515176e-06,
"loss": 0.1837,
"step": 8770
},
{
"epoch": 0.8920044701818551,
"grad_norm": 1.421875,
"learning_rate": 1.4542357037425207e-06,
"loss": 0.1728,
"step": 8780
},
{
"epoch": 0.8930204206034745,
"grad_norm": 1.625,
"learning_rate": 1.4272655825819713e-06,
"loss": 0.1562,
"step": 8790
},
{
"epoch": 0.8940363710250939,
"grad_norm": 4.0625,
"learning_rate": 1.4005405504820351e-06,
"loss": 0.1681,
"step": 8800
},
{
"epoch": 0.8950523214467134,
"grad_norm": 2.328125,
"learning_rate": 1.3740608853066634e-06,
"loss": 0.1449,
"step": 8810
},
{
"epoch": 0.8960682718683328,
"grad_norm": 4.0625,
"learning_rate": 1.347826862368684e-06,
"loss": 0.2418,
"step": 8820
},
{
"epoch": 0.8970842222899522,
"grad_norm": 0.55859375,
"learning_rate": 1.3218387544269545e-06,
"loss": 0.2473,
"step": 8830
},
{
"epoch": 0.8981001727115717,
"grad_norm": 4.78125,
"learning_rate": 1.2960968316835132e-06,
"loss": 0.194,
"step": 8840
},
{
"epoch": 0.8991161231331911,
"grad_norm": 3.921875,
"learning_rate": 1.2706013617807822e-06,
"loss": 0.2109,
"step": 8850
},
{
"epoch": 0.9001320735548105,
"grad_norm": 5.03125,
"learning_rate": 1.2453526097987778e-06,
"loss": 0.151,
"step": 8860
},
{
"epoch": 0.9011480239764299,
"grad_norm": 5.96875,
"learning_rate": 1.2203508382523431e-06,
"loss": 0.1811,
"step": 8870
},
{
"epoch": 0.9021639743980494,
"grad_norm": 3.828125,
"learning_rate": 1.1955963070884534e-06,
"loss": 0.2004,
"step": 8880
},
{
"epoch": 0.9031799248196688,
"grad_norm": 1.9765625,
"learning_rate": 1.171089273683465e-06,
"loss": 0.1395,
"step": 8890
},
{
"epoch": 0.9041958752412882,
"grad_norm": 2.328125,
"learning_rate": 1.1468299928404868e-06,
"loss": 0.1915,
"step": 8900
},
{
"epoch": 0.9052118256629077,
"grad_norm": 1.265625,
"learning_rate": 1.1228187167866943e-06,
"loss": 0.1281,
"step": 8910
},
{
"epoch": 0.9062277760845271,
"grad_norm": 1.4375,
"learning_rate": 1.099055695170728e-06,
"loss": 0.1627,
"step": 8920
},
{
"epoch": 0.9072437265061465,
"grad_norm": 0.6953125,
"learning_rate": 1.0755411750600962e-06,
"loss": 0.1768,
"step": 8930
},
{
"epoch": 0.9082596769277659,
"grad_norm": 1.046875,
"learning_rate": 1.052275400938596e-06,
"loss": 0.1544,
"step": 8940
},
{
"epoch": 0.9092756273493854,
"grad_norm": 2.71875,
"learning_rate": 1.0292586147037764e-06,
"loss": 0.2498,
"step": 8950
},
{
"epoch": 0.9102915777710048,
"grad_norm": 3.0625,
"learning_rate": 1.0064910556644214e-06,
"loss": 0.1918,
"step": 8960
},
{
"epoch": 0.9113075281926242,
"grad_norm": 4.0,
"learning_rate": 9.839729605380766e-07,
"loss": 0.2388,
"step": 8970
},
{
"epoch": 0.9123234786142437,
"grad_norm": 3.765625,
"learning_rate": 9.61704563448565e-07,
"loss": 0.1944,
"step": 8980
},
{
"epoch": 0.9133394290358631,
"grad_norm": 2.90625,
"learning_rate": 9.396860959235671e-07,
"loss": 0.1667,
"step": 8990
},
{
"epoch": 0.9143553794574825,
"grad_norm": 2.4375,
"learning_rate": 9.179177868922085e-07,
"loss": 0.2143,
"step": 9000
}
],
"logging_steps": 10,
"max_steps": 9843,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}