{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9988694177501414,
"eval_steps": 500,
"global_step": 1326,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015074429998115696,
"grad_norm": 2.758618933635527,
"learning_rate": 1.5037593984962406e-07,
"loss": 0.7902,
"step": 1
},
{
"epoch": 0.003014885999623139,
"grad_norm": 2.6991611264758757,
"learning_rate": 3.007518796992481e-07,
"loss": 0.7844,
"step": 2
},
{
"epoch": 0.0045223289994347085,
"grad_norm": 2.7272565236365143,
"learning_rate": 4.511278195488722e-07,
"loss": 0.7626,
"step": 3
},
{
"epoch": 0.006029771999246278,
"grad_norm": 2.7528117182790965,
"learning_rate": 6.015037593984962e-07,
"loss": 0.7858,
"step": 4
},
{
"epoch": 0.007537214999057848,
"grad_norm": 2.775699578303916,
"learning_rate": 7.518796992481203e-07,
"loss": 0.8026,
"step": 5
},
{
"epoch": 0.009044657998869417,
"grad_norm": 2.6767851045683204,
"learning_rate": 9.022556390977444e-07,
"loss": 0.769,
"step": 6
},
{
"epoch": 0.010552100998680987,
"grad_norm": 2.5261120500748224,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.7706,
"step": 7
},
{
"epoch": 0.012059543998492557,
"grad_norm": 2.514977460751076,
"learning_rate": 1.2030075187969925e-06,
"loss": 0.7785,
"step": 8
},
{
"epoch": 0.013566986998304126,
"grad_norm": 2.200241814288396,
"learning_rate": 1.3533834586466167e-06,
"loss": 0.7499,
"step": 9
},
{
"epoch": 0.015074429998115696,
"grad_norm": 2.212492267506947,
"learning_rate": 1.5037593984962406e-06,
"loss": 0.784,
"step": 10
},
{
"epoch": 0.016581872997927266,
"grad_norm": 1.9319703389138259,
"learning_rate": 1.6541353383458648e-06,
"loss": 0.7302,
"step": 11
},
{
"epoch": 0.018089315997738834,
"grad_norm": 1.2550044831716305,
"learning_rate": 1.8045112781954887e-06,
"loss": 0.7056,
"step": 12
},
{
"epoch": 0.019596758997550406,
"grad_norm": 1.2195135372913062,
"learning_rate": 1.9548872180451127e-06,
"loss": 0.7014,
"step": 13
},
{
"epoch": 0.021104201997361974,
"grad_norm": 1.2455313883450765,
"learning_rate": 2.105263157894737e-06,
"loss": 0.7374,
"step": 14
},
{
"epoch": 0.022611644997173545,
"grad_norm": 1.1872107473955416,
"learning_rate": 2.255639097744361e-06,
"loss": 0.7186,
"step": 15
},
{
"epoch": 0.024119087996985113,
"grad_norm": 1.1982009143546264,
"learning_rate": 2.406015037593985e-06,
"loss": 0.6828,
"step": 16
},
{
"epoch": 0.025626530996796685,
"grad_norm": 2.3539969715580384,
"learning_rate": 2.556390977443609e-06,
"loss": 0.6636,
"step": 17
},
{
"epoch": 0.027133973996608253,
"grad_norm": 2.6228143835706765,
"learning_rate": 2.7067669172932333e-06,
"loss": 0.6967,
"step": 18
},
{
"epoch": 0.028641416996419825,
"grad_norm": 2.224580122320562,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.6761,
"step": 19
},
{
"epoch": 0.030148859996231393,
"grad_norm": 1.6908938829069853,
"learning_rate": 3.007518796992481e-06,
"loss": 0.6711,
"step": 20
},
{
"epoch": 0.03165630299604296,
"grad_norm": 1.2313675848377437,
"learning_rate": 3.157894736842105e-06,
"loss": 0.6731,
"step": 21
},
{
"epoch": 0.03316374599585453,
"grad_norm": 1.0688239960942414,
"learning_rate": 3.3082706766917295e-06,
"loss": 0.6766,
"step": 22
},
{
"epoch": 0.034671188995666104,
"grad_norm": 1.0400364411240592,
"learning_rate": 3.4586466165413535e-06,
"loss": 0.6388,
"step": 23
},
{
"epoch": 0.03617863199547767,
"grad_norm": 1.1273406799386165,
"learning_rate": 3.6090225563909775e-06,
"loss": 0.6406,
"step": 24
},
{
"epoch": 0.03768607499528924,
"grad_norm": 1.1097242153227487,
"learning_rate": 3.7593984962406014e-06,
"loss": 0.6316,
"step": 25
},
{
"epoch": 0.03919351799510081,
"grad_norm": 0.9446010057713108,
"learning_rate": 3.909774436090225e-06,
"loss": 0.6023,
"step": 26
},
{
"epoch": 0.04070096099491238,
"grad_norm": 0.8323088497080903,
"learning_rate": 4.06015037593985e-06,
"loss": 0.6183,
"step": 27
},
{
"epoch": 0.04220840399472395,
"grad_norm": 0.7872962129475931,
"learning_rate": 4.210526315789474e-06,
"loss": 0.603,
"step": 28
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.7767297100748087,
"learning_rate": 4.360902255639098e-06,
"loss": 0.6217,
"step": 29
},
{
"epoch": 0.04522328999434709,
"grad_norm": 0.8604923635307716,
"learning_rate": 4.511278195488722e-06,
"loss": 0.5855,
"step": 30
},
{
"epoch": 0.046730732994158655,
"grad_norm": 0.8294236539215625,
"learning_rate": 4.661654135338346e-06,
"loss": 0.5933,
"step": 31
},
{
"epoch": 0.04823817599397023,
"grad_norm": 0.7967164005183986,
"learning_rate": 4.81203007518797e-06,
"loss": 0.6159,
"step": 32
},
{
"epoch": 0.0497456189937818,
"grad_norm": 0.6830740716112117,
"learning_rate": 4.962406015037594e-06,
"loss": 0.5752,
"step": 33
},
{
"epoch": 0.05125306199359337,
"grad_norm": 0.6829489822638658,
"learning_rate": 5.112781954887218e-06,
"loss": 0.5792,
"step": 34
},
{
"epoch": 0.052760504993404934,
"grad_norm": 0.7505911666509206,
"learning_rate": 5.263157894736842e-06,
"loss": 0.602,
"step": 35
},
{
"epoch": 0.054267947993216506,
"grad_norm": 0.7281087618287696,
"learning_rate": 5.413533834586467e-06,
"loss": 0.5994,
"step": 36
},
{
"epoch": 0.05577539099302808,
"grad_norm": 0.7061965815038841,
"learning_rate": 5.56390977443609e-06,
"loss": 0.5734,
"step": 37
},
{
"epoch": 0.05728283399283965,
"grad_norm": 0.6836377512068608,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.5543,
"step": 38
},
{
"epoch": 0.058790276992651214,
"grad_norm": 0.6751595521927064,
"learning_rate": 5.864661654135339e-06,
"loss": 0.5983,
"step": 39
},
{
"epoch": 0.060297719992462785,
"grad_norm": 0.6517307660683558,
"learning_rate": 6.015037593984962e-06,
"loss": 0.5482,
"step": 40
},
{
"epoch": 0.06180516299227436,
"grad_norm": 0.670740477832527,
"learning_rate": 6.165413533834587e-06,
"loss": 0.5771,
"step": 41
},
{
"epoch": 0.06331260599208592,
"grad_norm": 0.6386183426057812,
"learning_rate": 6.31578947368421e-06,
"loss": 0.5338,
"step": 42
},
{
"epoch": 0.0648200489918975,
"grad_norm": 0.6325251598452951,
"learning_rate": 6.466165413533835e-06,
"loss": 0.5409,
"step": 43
},
{
"epoch": 0.06632749199170906,
"grad_norm": 0.6190880971896819,
"learning_rate": 6.616541353383459e-06,
"loss": 0.5386,
"step": 44
},
{
"epoch": 0.06783493499152063,
"grad_norm": 0.6390822260018926,
"learning_rate": 6.766917293233083e-06,
"loss": 0.5824,
"step": 45
},
{
"epoch": 0.06934237799133221,
"grad_norm": 0.6866065901609671,
"learning_rate": 6.917293233082707e-06,
"loss": 0.5661,
"step": 46
},
{
"epoch": 0.07084982099114377,
"grad_norm": 0.6253742924174672,
"learning_rate": 7.067669172932331e-06,
"loss": 0.5371,
"step": 47
},
{
"epoch": 0.07235726399095534,
"grad_norm": 0.6086976797344416,
"learning_rate": 7.218045112781955e-06,
"loss": 0.5394,
"step": 48
},
{
"epoch": 0.07386470699076692,
"grad_norm": 0.6617551336853821,
"learning_rate": 7.368421052631579e-06,
"loss": 0.5481,
"step": 49
},
{
"epoch": 0.07537214999057848,
"grad_norm": 0.6511571841438215,
"learning_rate": 7.518796992481203e-06,
"loss": 0.557,
"step": 50
},
{
"epoch": 0.07687959299039004,
"grad_norm": 0.6424690635636273,
"learning_rate": 7.669172932330828e-06,
"loss": 0.554,
"step": 51
},
{
"epoch": 0.07838703599020162,
"grad_norm": 0.6208252498748196,
"learning_rate": 7.81954887218045e-06,
"loss": 0.5316,
"step": 52
},
{
"epoch": 0.07989447899001319,
"grad_norm": 0.6178927978791646,
"learning_rate": 7.969924812030075e-06,
"loss": 0.5303,
"step": 53
},
{
"epoch": 0.08140192198982477,
"grad_norm": 0.6246216171964205,
"learning_rate": 8.1203007518797e-06,
"loss": 0.5551,
"step": 54
},
{
"epoch": 0.08290936498963633,
"grad_norm": 0.6071119189590479,
"learning_rate": 8.270676691729324e-06,
"loss": 0.5324,
"step": 55
},
{
"epoch": 0.0844168079894479,
"grad_norm": 0.6123261613238393,
"learning_rate": 8.421052631578948e-06,
"loss": 0.5503,
"step": 56
},
{
"epoch": 0.08592425098925947,
"grad_norm": 0.620387110972641,
"learning_rate": 8.571428571428571e-06,
"loss": 0.5353,
"step": 57
},
{
"epoch": 0.08743169398907104,
"grad_norm": 0.6185030072259556,
"learning_rate": 8.721804511278195e-06,
"loss": 0.5544,
"step": 58
},
{
"epoch": 0.0889391369888826,
"grad_norm": 0.6244700607026835,
"learning_rate": 8.87218045112782e-06,
"loss": 0.5768,
"step": 59
},
{
"epoch": 0.09044657998869418,
"grad_norm": 0.655865564676625,
"learning_rate": 9.022556390977444e-06,
"loss": 0.5541,
"step": 60
},
{
"epoch": 0.09195402298850575,
"grad_norm": 0.6355623162806917,
"learning_rate": 9.172932330827068e-06,
"loss": 0.5317,
"step": 61
},
{
"epoch": 0.09346146598831731,
"grad_norm": 0.6620650186277286,
"learning_rate": 9.323308270676693e-06,
"loss": 0.5825,
"step": 62
},
{
"epoch": 0.09496890898812889,
"grad_norm": 0.6544984607034259,
"learning_rate": 9.473684210526315e-06,
"loss": 0.5367,
"step": 63
},
{
"epoch": 0.09647635198794045,
"grad_norm": 0.6566999876216955,
"learning_rate": 9.62406015037594e-06,
"loss": 0.5334,
"step": 64
},
{
"epoch": 0.09798379498775203,
"grad_norm": 0.6538652733227992,
"learning_rate": 9.774436090225564e-06,
"loss": 0.5088,
"step": 65
},
{
"epoch": 0.0994912379875636,
"grad_norm": 0.7184816645886852,
"learning_rate": 9.924812030075189e-06,
"loss": 0.5015,
"step": 66
},
{
"epoch": 0.10099868098737516,
"grad_norm": 0.6287887378220718,
"learning_rate": 1.0075187969924813e-05,
"loss": 0.5171,
"step": 67
},
{
"epoch": 0.10250612398718674,
"grad_norm": 0.7045986205120561,
"learning_rate": 1.0225563909774436e-05,
"loss": 0.5499,
"step": 68
},
{
"epoch": 0.1040135669869983,
"grad_norm": 0.6263524660452249,
"learning_rate": 1.0375939849624062e-05,
"loss": 0.5319,
"step": 69
},
{
"epoch": 0.10552100998680987,
"grad_norm": 0.6340009161866458,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.5295,
"step": 70
},
{
"epoch": 0.10702845298662145,
"grad_norm": 0.6930114141626272,
"learning_rate": 1.0676691729323309e-05,
"loss": 0.543,
"step": 71
},
{
"epoch": 0.10853589598643301,
"grad_norm": 0.6582132112309497,
"learning_rate": 1.0827067669172933e-05,
"loss": 0.5164,
"step": 72
},
{
"epoch": 0.11004333898624458,
"grad_norm": 0.6774079053656817,
"learning_rate": 1.0977443609022558e-05,
"loss": 0.5202,
"step": 73
},
{
"epoch": 0.11155078198605616,
"grad_norm": 0.6435562490480392,
"learning_rate": 1.112781954887218e-05,
"loss": 0.5095,
"step": 74
},
{
"epoch": 0.11305822498586772,
"grad_norm": 0.7112297517037395,
"learning_rate": 1.1278195488721806e-05,
"loss": 0.5316,
"step": 75
},
{
"epoch": 0.1145656679856793,
"grad_norm": 0.709494451956929,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.4935,
"step": 76
},
{
"epoch": 0.11607311098549086,
"grad_norm": 0.6777802836075782,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.5043,
"step": 77
},
{
"epoch": 0.11758055398530243,
"grad_norm": 0.6296151489375509,
"learning_rate": 1.1729323308270678e-05,
"loss": 0.4874,
"step": 78
},
{
"epoch": 0.119087996985114,
"grad_norm": 0.6808431409244452,
"learning_rate": 1.1879699248120302e-05,
"loss": 0.4788,
"step": 79
},
{
"epoch": 0.12059543998492557,
"grad_norm": 0.6704429377361576,
"learning_rate": 1.2030075187969925e-05,
"loss": 0.5011,
"step": 80
},
{
"epoch": 0.12210288298473713,
"grad_norm": 0.6926069766970787,
"learning_rate": 1.2180451127819551e-05,
"loss": 0.496,
"step": 81
},
{
"epoch": 0.12361032598454871,
"grad_norm": 0.639818862010909,
"learning_rate": 1.2330827067669174e-05,
"loss": 0.5308,
"step": 82
},
{
"epoch": 0.12511776898436028,
"grad_norm": 0.6204899572762589,
"learning_rate": 1.2481203007518798e-05,
"loss": 0.5063,
"step": 83
},
{
"epoch": 0.12662521198417184,
"grad_norm": 0.6865925022658576,
"learning_rate": 1.263157894736842e-05,
"loss": 0.507,
"step": 84
},
{
"epoch": 0.1281326549839834,
"grad_norm": 0.7029706975479946,
"learning_rate": 1.2781954887218047e-05,
"loss": 0.5209,
"step": 85
},
{
"epoch": 0.129640097983795,
"grad_norm": 0.6524424672188123,
"learning_rate": 1.293233082706767e-05,
"loss": 0.5527,
"step": 86
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.6074805023838824,
"learning_rate": 1.3082706766917295e-05,
"loss": 0.4873,
"step": 87
},
{
"epoch": 0.13265498398341813,
"grad_norm": 0.6891851394415897,
"learning_rate": 1.3233082706766918e-05,
"loss": 0.5335,
"step": 88
},
{
"epoch": 0.1341624269832297,
"grad_norm": 0.6322157680641546,
"learning_rate": 1.3383458646616543e-05,
"loss": 0.5163,
"step": 89
},
{
"epoch": 0.13566986998304126,
"grad_norm": 0.6652911326311045,
"learning_rate": 1.3533834586466165e-05,
"loss": 0.5227,
"step": 90
},
{
"epoch": 0.13717731298285282,
"grad_norm": 0.7241927650908743,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.5271,
"step": 91
},
{
"epoch": 0.13868475598266441,
"grad_norm": 0.654474241851782,
"learning_rate": 1.3834586466165414e-05,
"loss": 0.5154,
"step": 92
},
{
"epoch": 0.14019219898247598,
"grad_norm": 0.7184414157305912,
"learning_rate": 1.3984962406015038e-05,
"loss": 0.5077,
"step": 93
},
{
"epoch": 0.14169964198228754,
"grad_norm": 0.6449190071052974,
"learning_rate": 1.4135338345864663e-05,
"loss": 0.5038,
"step": 94
},
{
"epoch": 0.1432070849820991,
"grad_norm": 0.691580302982374,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.4667,
"step": 95
},
{
"epoch": 0.14471452798191067,
"grad_norm": 0.6288139344926038,
"learning_rate": 1.443609022556391e-05,
"loss": 0.49,
"step": 96
},
{
"epoch": 0.14622197098172227,
"grad_norm": 0.643873596183986,
"learning_rate": 1.4586466165413536e-05,
"loss": 0.4749,
"step": 97
},
{
"epoch": 0.14772941398153383,
"grad_norm": 0.6755660192421138,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.5075,
"step": 98
},
{
"epoch": 0.1492368569813454,
"grad_norm": 0.646796309870399,
"learning_rate": 1.4887218045112783e-05,
"loss": 0.5296,
"step": 99
},
{
"epoch": 0.15074429998115696,
"grad_norm": 0.7044467423953411,
"learning_rate": 1.5037593984962406e-05,
"loss": 0.5303,
"step": 100
},
{
"epoch": 0.15225174298096852,
"grad_norm": 0.7730989605725896,
"learning_rate": 1.5187969924812032e-05,
"loss": 0.5099,
"step": 101
},
{
"epoch": 0.1537591859807801,
"grad_norm": 0.6620556405595589,
"learning_rate": 1.5338345864661656e-05,
"loss": 0.5189,
"step": 102
},
{
"epoch": 0.15526662898059168,
"grad_norm": 0.8038364131821449,
"learning_rate": 1.548872180451128e-05,
"loss": 0.5152,
"step": 103
},
{
"epoch": 0.15677407198040325,
"grad_norm": 0.6537353177538859,
"learning_rate": 1.56390977443609e-05,
"loss": 0.5012,
"step": 104
},
{
"epoch": 0.1582815149802148,
"grad_norm": 0.7802019309424624,
"learning_rate": 1.578947368421053e-05,
"loss": 0.4851,
"step": 105
},
{
"epoch": 0.15978895798002637,
"grad_norm": 0.7590456113216669,
"learning_rate": 1.593984962406015e-05,
"loss": 0.5077,
"step": 106
},
{
"epoch": 0.16129640097983794,
"grad_norm": 0.6740526095538228,
"learning_rate": 1.6090225563909775e-05,
"loss": 0.4794,
"step": 107
},
{
"epoch": 0.16280384397964953,
"grad_norm": 0.7266285917065574,
"learning_rate": 1.62406015037594e-05,
"loss": 0.5368,
"step": 108
},
{
"epoch": 0.1643112869794611,
"grad_norm": 0.7202106895600753,
"learning_rate": 1.6390977443609023e-05,
"loss": 0.5077,
"step": 109
},
{
"epoch": 0.16581872997927266,
"grad_norm": 0.7646664609937389,
"learning_rate": 1.6541353383458648e-05,
"loss": 0.517,
"step": 110
},
{
"epoch": 0.16732617297908423,
"grad_norm": 0.7090240598112959,
"learning_rate": 1.6691729323308272e-05,
"loss": 0.5217,
"step": 111
},
{
"epoch": 0.1688336159788958,
"grad_norm": 0.7260255784190195,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.4977,
"step": 112
},
{
"epoch": 0.17034105897870735,
"grad_norm": 0.6392143364785348,
"learning_rate": 1.699248120300752e-05,
"loss": 0.4844,
"step": 113
},
{
"epoch": 0.17184850197851895,
"grad_norm": 0.7380625519153193,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.5063,
"step": 114
},
{
"epoch": 0.1733559449783305,
"grad_norm": 0.6999588028799495,
"learning_rate": 1.729323308270677e-05,
"loss": 0.5124,
"step": 115
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.7189938723035283,
"learning_rate": 1.744360902255639e-05,
"loss": 0.4723,
"step": 116
},
{
"epoch": 0.17637083097795364,
"grad_norm": 0.755925477740424,
"learning_rate": 1.7593984962406015e-05,
"loss": 0.5071,
"step": 117
},
{
"epoch": 0.1778782739777652,
"grad_norm": 0.6932179016486248,
"learning_rate": 1.774436090225564e-05,
"loss": 0.4813,
"step": 118
},
{
"epoch": 0.1793857169775768,
"grad_norm": 0.6803984999939205,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.4537,
"step": 119
},
{
"epoch": 0.18089315997738836,
"grad_norm": 0.6943064601614549,
"learning_rate": 1.8045112781954888e-05,
"loss": 0.5302,
"step": 120
},
{
"epoch": 0.18240060297719993,
"grad_norm": 0.7024561418818766,
"learning_rate": 1.8195488721804512e-05,
"loss": 0.5292,
"step": 121
},
{
"epoch": 0.1839080459770115,
"grad_norm": 0.6516091971357849,
"learning_rate": 1.8345864661654137e-05,
"loss": 0.5192,
"step": 122
},
{
"epoch": 0.18541548897682306,
"grad_norm": 0.733121234769519,
"learning_rate": 1.849624060150376e-05,
"loss": 0.5137,
"step": 123
},
{
"epoch": 0.18692293197663462,
"grad_norm": 0.6379606914374305,
"learning_rate": 1.8646616541353386e-05,
"loss": 0.5004,
"step": 124
},
{
"epoch": 0.1884303749764462,
"grad_norm": 0.682116565034204,
"learning_rate": 1.879699248120301e-05,
"loss": 0.4932,
"step": 125
},
{
"epoch": 0.18993781797625778,
"grad_norm": 0.6586357032334851,
"learning_rate": 1.894736842105263e-05,
"loss": 0.4949,
"step": 126
},
{
"epoch": 0.19144526097606934,
"grad_norm": 0.6996866499647233,
"learning_rate": 1.909774436090226e-05,
"loss": 0.4926,
"step": 127
},
{
"epoch": 0.1929527039758809,
"grad_norm": 0.7344954131354208,
"learning_rate": 1.924812030075188e-05,
"loss": 0.477,
"step": 128
},
{
"epoch": 0.19446014697569247,
"grad_norm": 0.6945294612726404,
"learning_rate": 1.9398496240601504e-05,
"loss": 0.4872,
"step": 129
},
{
"epoch": 0.19596758997550406,
"grad_norm": 0.7624604146665339,
"learning_rate": 1.954887218045113e-05,
"loss": 0.5224,
"step": 130
},
{
"epoch": 0.19747503297531563,
"grad_norm": 0.685229042181731,
"learning_rate": 1.9699248120300753e-05,
"loss": 0.4964,
"step": 131
},
{
"epoch": 0.1989824759751272,
"grad_norm": 0.8306163709490333,
"learning_rate": 1.9849624060150377e-05,
"loss": 0.5021,
"step": 132
},
{
"epoch": 0.20048991897493876,
"grad_norm": 0.6752867411242717,
"learning_rate": 2e-05,
"loss": 0.4946,
"step": 133
},
{
"epoch": 0.20199736197475032,
"grad_norm": 0.9473906923308808,
"learning_rate": 1.99999653272242e-05,
"loss": 0.5112,
"step": 134
},
{
"epoch": 0.2035048049745619,
"grad_norm": 0.6355233169612663,
"learning_rate": 1.9999861309137232e-05,
"loss": 0.5318,
"step": 135
},
{
"epoch": 0.20501224797437348,
"grad_norm": 0.8423903087733013,
"learning_rate": 1.999968794646042e-05,
"loss": 0.5148,
"step": 136
},
{
"epoch": 0.20651969097418504,
"grad_norm": 0.6660475408627802,
"learning_rate": 1.9999445240395953e-05,
"loss": 0.5178,
"step": 137
},
{
"epoch": 0.2080271339739966,
"grad_norm": 0.72967331295993,
"learning_rate": 1.9999133192626893e-05,
"loss": 0.5262,
"step": 138
},
{
"epoch": 0.20953457697380817,
"grad_norm": 0.7393548066200798,
"learning_rate": 1.9998751805317152e-05,
"loss": 0.5057,
"step": 139
},
{
"epoch": 0.21104201997361974,
"grad_norm": 0.687138877245702,
"learning_rate": 1.999830108111148e-05,
"loss": 0.4958,
"step": 140
},
{
"epoch": 0.21254946297343133,
"grad_norm": 0.7007673502124087,
"learning_rate": 1.999778102313545e-05,
"loss": 0.4948,
"step": 141
},
{
"epoch": 0.2140569059732429,
"grad_norm": 0.7183800488623966,
"learning_rate": 1.999719163499543e-05,
"loss": 0.5104,
"step": 142
},
{
"epoch": 0.21556434897305446,
"grad_norm": 0.6535866563135689,
"learning_rate": 1.999653292077857e-05,
"loss": 0.5145,
"step": 143
},
{
"epoch": 0.21707179197286602,
"grad_norm": 0.64107430044815,
"learning_rate": 1.999580488505276e-05,
"loss": 0.4659,
"step": 144
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.6779403955139097,
"learning_rate": 1.9995007532866594e-05,
"loss": 0.4964,
"step": 145
},
{
"epoch": 0.22008667797248915,
"grad_norm": 0.6539110005752458,
"learning_rate": 1.9994140869749366e-05,
"loss": 0.5092,
"step": 146
},
{
"epoch": 0.22159412097230075,
"grad_norm": 0.6565635872751927,
"learning_rate": 1.9993204901710995e-05,
"loss": 0.5185,
"step": 147
},
{
"epoch": 0.2231015639721123,
"grad_norm": 0.6372834025735034,
"learning_rate": 1.9992199635241997e-05,
"loss": 0.5152,
"step": 148
},
{
"epoch": 0.22460900697192387,
"grad_norm": 0.622264657968412,
"learning_rate": 1.999112507731346e-05,
"loss": 0.5,
"step": 149
},
{
"epoch": 0.22611644997173544,
"grad_norm": 0.6267246999704467,
"learning_rate": 1.9989981235376956e-05,
"loss": 0.4897,
"step": 150
},
{
"epoch": 0.227623892971547,
"grad_norm": 0.6551262788067906,
"learning_rate": 1.9988768117364526e-05,
"loss": 0.5165,
"step": 151
},
{
"epoch": 0.2291313359713586,
"grad_norm": 0.6664514520384526,
"learning_rate": 1.9987485731688595e-05,
"loss": 0.5002,
"step": 152
},
{
"epoch": 0.23063877897117016,
"grad_norm": 0.6076256276502832,
"learning_rate": 1.998613408724195e-05,
"loss": 0.5084,
"step": 153
},
{
"epoch": 0.23214622197098173,
"grad_norm": 0.6373492462291207,
"learning_rate": 1.998471319339763e-05,
"loss": 0.5026,
"step": 154
},
{
"epoch": 0.2336536649707933,
"grad_norm": 1.745130262060046,
"learning_rate": 1.9983223060008908e-05,
"loss": 0.5034,
"step": 155
},
{
"epoch": 0.23516110797060485,
"grad_norm": 5.046536940100192,
"learning_rate": 1.9981663697409203e-05,
"loss": 0.5424,
"step": 156
},
{
"epoch": 0.23666855097041642,
"grad_norm": 1.5507957638980387,
"learning_rate": 1.998003511641199e-05,
"loss": 0.5301,
"step": 157
},
{
"epoch": 0.238175993970228,
"grad_norm": 0.9453551152302114,
"learning_rate": 1.997833732831076e-05,
"loss": 0.4793,
"step": 158
},
{
"epoch": 0.23968343697003958,
"grad_norm": 1.1864330727309345,
"learning_rate": 1.9976570344878916e-05,
"loss": 0.5125,
"step": 159
},
{
"epoch": 0.24119087996985114,
"grad_norm": 0.8095198214822489,
"learning_rate": 1.9974734178369702e-05,
"loss": 0.4904,
"step": 160
},
{
"epoch": 0.2426983229696627,
"grad_norm": 0.620347427984293,
"learning_rate": 1.997282884151612e-05,
"loss": 0.4611,
"step": 161
},
{
"epoch": 0.24420576596947427,
"grad_norm": 0.7590913383659819,
"learning_rate": 1.9970854347530828e-05,
"loss": 0.5085,
"step": 162
},
{
"epoch": 0.24571320896928586,
"grad_norm": 0.5932187358928716,
"learning_rate": 1.9968810710106065e-05,
"loss": 0.49,
"step": 163
},
{
"epoch": 0.24722065196909743,
"grad_norm": 0.7421744580230403,
"learning_rate": 1.9966697943413548e-05,
"loss": 0.4789,
"step": 164
},
{
"epoch": 0.248728094968909,
"grad_norm": 0.634606964098851,
"learning_rate": 1.9964516062104377e-05,
"loss": 0.5008,
"step": 165
},
{
"epoch": 0.25023553796872056,
"grad_norm": 0.721486256547781,
"learning_rate": 1.996226508130892e-05,
"loss": 0.4546,
"step": 166
},
{
"epoch": 0.25174298096853215,
"grad_norm": 0.6785391250628956,
"learning_rate": 1.995994501663674e-05,
"loss": 0.4892,
"step": 167
},
{
"epoch": 0.2532504239683437,
"grad_norm": 0.5862796281463728,
"learning_rate": 1.995755588417644e-05,
"loss": 0.4736,
"step": 168
},
{
"epoch": 0.2547578669681553,
"grad_norm": 0.6656043784418029,
"learning_rate": 1.99550977004956e-05,
"loss": 0.4749,
"step": 169
},
{
"epoch": 0.2562653099679668,
"grad_norm": 0.6091440270236256,
"learning_rate": 1.9952570482640628e-05,
"loss": 0.4997,
"step": 170
},
{
"epoch": 0.2577727529677784,
"grad_norm": 0.7671131220608588,
"learning_rate": 1.9949974248136655e-05,
"loss": 0.4741,
"step": 171
},
{
"epoch": 0.25928019596759,
"grad_norm": 0.6246356814602296,
"learning_rate": 1.9947309014987414e-05,
"loss": 0.4727,
"step": 172
},
{
"epoch": 0.26078763896740154,
"grad_norm": 0.7874820318511245,
"learning_rate": 1.9944574801675106e-05,
"loss": 0.4965,
"step": 173
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.6727323732082747,
"learning_rate": 1.9941771627160287e-05,
"loss": 0.5361,
"step": 174
},
{
"epoch": 0.26380252496702467,
"grad_norm": 0.6896058530733802,
"learning_rate": 1.9938899510881732e-05,
"loss": 0.4574,
"step": 175
},
{
"epoch": 0.26530996796683626,
"grad_norm": 0.6396080754362474,
"learning_rate": 1.9935958472756283e-05,
"loss": 0.4791,
"step": 176
},
{
"epoch": 0.26681741096664785,
"grad_norm": 0.6978820918898457,
"learning_rate": 1.993294853317873e-05,
"loss": 0.4936,
"step": 177
},
{
"epoch": 0.2683248539664594,
"grad_norm": 0.6200726182474722,
"learning_rate": 1.9929869713021668e-05,
"loss": 0.4809,
"step": 178
},
{
"epoch": 0.269832296966271,
"grad_norm": 0.6621164817055001,
"learning_rate": 1.9926722033635343e-05,
"loss": 0.4833,
"step": 179
},
{
"epoch": 0.2713397399660825,
"grad_norm": 0.6443066814567524,
"learning_rate": 1.9923505516847514e-05,
"loss": 0.452,
"step": 180
},
{
"epoch": 0.2728471829658941,
"grad_norm": 0.6324303832157692,
"learning_rate": 1.9920220184963296e-05,
"loss": 0.4942,
"step": 181
},
{
"epoch": 0.27435462596570565,
"grad_norm": 0.7093590972609833,
"learning_rate": 1.9916866060764994e-05,
"loss": 0.4666,
"step": 182
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.6056554558199854,
"learning_rate": 1.991344316751198e-05,
"loss": 0.48,
"step": 183
},
{
"epoch": 0.27736951196532883,
"grad_norm": 0.6668247301054864,
"learning_rate": 1.9909951528940485e-05,
"loss": 0.4892,
"step": 184
},
{
"epoch": 0.27887695496514037,
"grad_norm": 0.6248269362388523,
"learning_rate": 1.990639116926348e-05,
"loss": 0.4601,
"step": 185
},
{
"epoch": 0.28038439796495196,
"grad_norm": 0.5988280290089756,
"learning_rate": 1.9902762113170467e-05,
"loss": 0.4671,
"step": 186
},
{
"epoch": 0.2818918409647635,
"grad_norm": 0.6183022850194487,
"learning_rate": 1.989906438582734e-05,
"loss": 0.5023,
"step": 187
},
{
"epoch": 0.2833992839645751,
"grad_norm": 0.6345731772578389,
"learning_rate": 1.9895298012876192e-05,
"loss": 0.4749,
"step": 188
},
{
"epoch": 0.2849067269643867,
"grad_norm": 0.6026738883514794,
"learning_rate": 1.9891463020435144e-05,
"loss": 0.4884,
"step": 189
},
{
"epoch": 0.2864141699641982,
"grad_norm": 0.6275566066201014,
"learning_rate": 1.9887559435098162e-05,
"loss": 0.4868,
"step": 190
},
{
"epoch": 0.2879216129640098,
"grad_norm": 0.6830623512458401,
"learning_rate": 1.9883587283934875e-05,
"loss": 0.4797,
"step": 191
},
{
"epoch": 0.28942905596382135,
"grad_norm": 0.621100203862078,
"learning_rate": 1.9879546594490383e-05,
"loss": 0.4781,
"step": 192
},
{
"epoch": 0.29093649896363294,
"grad_norm": 0.7266845450092815,
"learning_rate": 1.987543739478507e-05,
"loss": 0.4838,
"step": 193
},
{
"epoch": 0.29244394196344453,
"grad_norm": 0.5998498411317879,
"learning_rate": 1.987125971331441e-05,
"loss": 0.4809,
"step": 194
},
{
"epoch": 0.29395138496325607,
"grad_norm": 0.7629414665635117,
"learning_rate": 1.9867013579048765e-05,
"loss": 0.4891,
"step": 195
},
{
"epoch": 0.29545882796306766,
"grad_norm": 0.6340989750127195,
"learning_rate": 1.9862699021433186e-05,
"loss": 0.4696,
"step": 196
},
{
"epoch": 0.2969662709628792,
"grad_norm": 0.6600966786500729,
"learning_rate": 1.9858316070387208e-05,
"loss": 0.4568,
"step": 197
},
{
"epoch": 0.2984737139626908,
"grad_norm": 0.6836572646612057,
"learning_rate": 1.9853864756304654e-05,
"loss": 0.4849,
"step": 198
},
{
"epoch": 0.2999811569625024,
"grad_norm": 0.5912116643865833,
"learning_rate": 1.9849345110053405e-05,
"loss": 0.4752,
"step": 199
},
{
"epoch": 0.3014885999623139,
"grad_norm": 0.6202584603281575,
"learning_rate": 1.984475716297519e-05,
"loss": 0.478,
"step": 200
},
{
"epoch": 0.3029960429621255,
"grad_norm": 0.6021031729150327,
"learning_rate": 1.984010094688539e-05,
"loss": 0.4818,
"step": 201
},
{
"epoch": 0.30450348596193705,
"grad_norm": 0.6013263404823498,
"learning_rate": 1.9835376494072788e-05,
"loss": 0.4798,
"step": 202
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.6391395935977097,
"learning_rate": 1.9830583837299363e-05,
"loss": 0.5079,
"step": 203
},
{
"epoch": 0.3075183719615602,
"grad_norm": 0.6403126078695583,
"learning_rate": 1.9825723009800058e-05,
"loss": 0.4994,
"step": 204
},
{
"epoch": 0.30902581496137177,
"grad_norm": 0.6996650791864127,
"learning_rate": 1.9820794045282553e-05,
"loss": 0.458,
"step": 205
},
{
"epoch": 0.31053325796118336,
"grad_norm": 0.6120915229627125,
"learning_rate": 1.9815796977927015e-05,
"loss": 0.4837,
"step": 206
},
{
"epoch": 0.3120407009609949,
"grad_norm": 0.698625059347094,
"learning_rate": 1.9810731842385892e-05,
"loss": 0.4762,
"step": 207
},
{
"epoch": 0.3135481439608065,
"grad_norm": 0.6458152328354264,
"learning_rate": 1.9805598673783644e-05,
"loss": 0.4877,
"step": 208
},
{
"epoch": 0.31505558696061803,
"grad_norm": 0.6183456339468536,
"learning_rate": 1.980039750771651e-05,
"loss": 0.4555,
"step": 209
},
{
"epoch": 0.3165630299604296,
"grad_norm": 0.6625119162294268,
"learning_rate": 1.9795128380252263e-05,
"loss": 0.467,
"step": 210
},
{
"epoch": 0.3180704729602412,
"grad_norm": 0.5634839413053515,
"learning_rate": 1.978979132792996e-05,
"loss": 0.4664,
"step": 211
},
{
"epoch": 0.31957791596005275,
"grad_norm": 0.6026219616185302,
"learning_rate": 1.9784386387759684e-05,
"loss": 0.4774,
"step": 212
},
{
"epoch": 0.32108535895986434,
"grad_norm": 0.6269218843440012,
"learning_rate": 1.977891359722229e-05,
"loss": 0.4432,
"step": 213
},
{
"epoch": 0.3225928019596759,
"grad_norm": 0.5910572611931473,
"learning_rate": 1.9773372994269147e-05,
"loss": 0.4699,
"step": 214
},
{
"epoch": 0.32410024495948747,
"grad_norm": 0.6536939644754692,
"learning_rate": 1.976776461732187e-05,
"loss": 0.4807,
"step": 215
},
{
"epoch": 0.32560768795929906,
"grad_norm": 0.6191726708771672,
"learning_rate": 1.976208850527206e-05,
"loss": 0.4944,
"step": 216
},
{
"epoch": 0.3271151309591106,
"grad_norm": 0.6298298802683915,
"learning_rate": 1.9756344697481027e-05,
"loss": 0.4862,
"step": 217
},
{
"epoch": 0.3286225739589222,
"grad_norm": 0.6539080215758202,
"learning_rate": 1.975053323377952e-05,
"loss": 0.4817,
"step": 218
},
{
"epoch": 0.33013001695873373,
"grad_norm": 0.6146092404035427,
"learning_rate": 1.9744654154467468e-05,
"loss": 0.5422,
"step": 219
},
{
"epoch": 0.3316374599585453,
"grad_norm": 0.6490399293285286,
"learning_rate": 1.9738707500313655e-05,
"loss": 0.4703,
"step": 220
},
{
"epoch": 0.3331449029583569,
"grad_norm": 0.615019483124787,
"learning_rate": 1.9732693312555492e-05,
"loss": 0.4801,
"step": 221
},
{
"epoch": 0.33465234595816845,
"grad_norm": 0.5547372740595196,
"learning_rate": 1.9726611632898693e-05,
"loss": 0.4719,
"step": 222
},
{
"epoch": 0.33615978895798004,
"grad_norm": 0.6076625752065381,
"learning_rate": 1.9720462503517e-05,
"loss": 0.498,
"step": 223
},
{
"epoch": 0.3376672319577916,
"grad_norm": 0.6047398581844834,
"learning_rate": 1.971424596705189e-05,
"loss": 0.4643,
"step": 224
},
{
"epoch": 0.3391746749576032,
"grad_norm": 0.5727445152315086,
"learning_rate": 1.9707962066612278e-05,
"loss": 0.4515,
"step": 225
},
{
"epoch": 0.3406821179574147,
"grad_norm": 0.5573272058322264,
"learning_rate": 1.970161084577422e-05,
"loss": 0.4524,
"step": 226
},
{
"epoch": 0.3421895609572263,
"grad_norm": 0.6257676922974255,
"learning_rate": 1.9695192348580606e-05,
"loss": 0.4815,
"step": 227
},
{
"epoch": 0.3436970039570379,
"grad_norm": 0.5745183403896584,
"learning_rate": 1.9688706619540863e-05,
"loss": 0.4717,
"step": 228
},
{
"epoch": 0.34520444695684943,
"grad_norm": 0.5964564340890054,
"learning_rate": 1.968215370363063e-05,
"loss": 0.4839,
"step": 229
},
{
"epoch": 0.346711889956661,
"grad_norm": 0.5672877352491237,
"learning_rate": 1.9675533646291463e-05,
"loss": 0.4914,
"step": 230
},
{
"epoch": 0.34821933295647256,
"grad_norm": 0.6672213227292868,
"learning_rate": 1.9668846493430522e-05,
"loss": 0.4718,
"step": 231
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.5546791014566226,
"learning_rate": 1.9662092291420233e-05,
"loss": 0.4392,
"step": 232
},
{
"epoch": 0.35123421895609575,
"grad_norm": 0.6546100852352986,
"learning_rate": 1.965527108709798e-05,
"loss": 0.4836,
"step": 233
},
{
"epoch": 0.3527416619559073,
"grad_norm": 0.5992362903479123,
"learning_rate": 1.964838292776579e-05,
"loss": 0.4464,
"step": 234
},
{
"epoch": 0.3542491049557189,
"grad_norm": 0.63523888294575,
"learning_rate": 1.9641427861189973e-05,
"loss": 0.4856,
"step": 235
},
{
"epoch": 0.3557565479555304,
"grad_norm": 0.6032723041133213,
"learning_rate": 1.963440593560083e-05,
"loss": 0.4966,
"step": 236
},
{
"epoch": 0.357263990955342,
"grad_norm": 0.6306498314236755,
"learning_rate": 1.9627317199692287e-05,
"loss": 0.4771,
"step": 237
},
{
"epoch": 0.3587714339551536,
"grad_norm": 0.5865071462782886,
"learning_rate": 1.962016170262157e-05,
"loss": 0.4573,
"step": 238
},
{
"epoch": 0.36027887695496513,
"grad_norm": 0.5665927327271444,
"learning_rate": 1.961293949400888e-05,
"loss": 0.4485,
"step": 239
},
{
"epoch": 0.3617863199547767,
"grad_norm": 0.554220781330076,
"learning_rate": 1.960565062393701e-05,
"loss": 0.4686,
"step": 240
},
{
"epoch": 0.36329376295458826,
"grad_norm": 0.6455923732389204,
"learning_rate": 1.9598295142951035e-05,
"loss": 0.4592,
"step": 241
},
{
"epoch": 0.36480120595439985,
"grad_norm": 0.596721778819204,
"learning_rate": 1.9590873102057948e-05,
"loss": 0.4907,
"step": 242
},
{
"epoch": 0.36630864895421145,
"grad_norm": 0.6716627212373145,
"learning_rate": 1.9583384552726294e-05,
"loss": 0.4799,
"step": 243
},
{
"epoch": 0.367816091954023,
"grad_norm": 0.6229194933798746,
"learning_rate": 1.957582954688584e-05,
"loss": 0.4652,
"step": 244
},
{
"epoch": 0.3693235349538346,
"grad_norm": 0.6298037085236075,
"learning_rate": 1.9568208136927177e-05,
"loss": 0.4717,
"step": 245
},
{
"epoch": 0.3708309779536461,
"grad_norm": 0.5787887978421966,
"learning_rate": 1.9560520375701408e-05,
"loss": 0.4845,
"step": 246
},
{
"epoch": 0.3723384209534577,
"grad_norm": 0.6451526421523999,
"learning_rate": 1.9552766316519726e-05,
"loss": 0.4516,
"step": 247
},
{
"epoch": 0.37384586395326924,
"grad_norm": 0.538692705705553,
"learning_rate": 1.9544946013153093e-05,
"loss": 0.4649,
"step": 248
},
{
"epoch": 0.37535330695308083,
"grad_norm": 0.6399780775437526,
"learning_rate": 1.9537059519831822e-05,
"loss": 0.4594,
"step": 249
},
{
"epoch": 0.3768607499528924,
"grad_norm": 0.6082935211607333,
"learning_rate": 1.9529106891245244e-05,
"loss": 0.4709,
"step": 250
},
{
"epoch": 0.37836819295270396,
"grad_norm": 0.6106738888512755,
"learning_rate": 1.9521088182541298e-05,
"loss": 0.492,
"step": 251
},
{
"epoch": 0.37987563595251556,
"grad_norm": 0.5803041737823633,
"learning_rate": 1.951300344932616e-05,
"loss": 0.4646,
"step": 252
},
{
"epoch": 0.3813830789523271,
"grad_norm": 0.5647638332240319,
"learning_rate": 1.9504852747663862e-05,
"loss": 0.4725,
"step": 253
},
{
"epoch": 0.3828905219521387,
"grad_norm": 0.664315669006426,
"learning_rate": 1.9496636134075894e-05,
"loss": 0.4689,
"step": 254
},
{
"epoch": 0.3843979649519503,
"grad_norm": 0.6019633789641826,
"learning_rate": 1.9488353665540813e-05,
"loss": 0.4613,
"step": 255
},
{
"epoch": 0.3859054079517618,
"grad_norm": 0.5805016640621002,
"learning_rate": 1.9480005399493857e-05,
"loss": 0.4613,
"step": 256
},
{
"epoch": 0.3874128509515734,
"grad_norm": 0.6053466035481387,
"learning_rate": 1.9471591393826536e-05,
"loss": 0.4877,
"step": 257
},
{
"epoch": 0.38892029395138494,
"grad_norm": 0.5443749204002357,
"learning_rate": 1.9463111706886234e-05,
"loss": 0.481,
"step": 258
},
{
"epoch": 0.39042773695119654,
"grad_norm": 0.6422687053592201,
"learning_rate": 1.9454566397475813e-05,
"loss": 0.464,
"step": 259
},
{
"epoch": 0.39193517995100813,
"grad_norm": 0.5911574213296809,
"learning_rate": 1.944595552485319e-05,
"loss": 0.4451,
"step": 260
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.6244696365384524,
"learning_rate": 1.943727914873094e-05,
"loss": 0.465,
"step": 261
},
{
"epoch": 0.39495006595063126,
"grad_norm": 0.6787496907794774,
"learning_rate": 1.9428537329275862e-05,
"loss": 0.4591,
"step": 262
},
{
"epoch": 0.3964575089504428,
"grad_norm": 0.584284155721975,
"learning_rate": 1.941973012710859e-05,
"loss": 0.4835,
"step": 263
},
{
"epoch": 0.3979649519502544,
"grad_norm": 0.6636147745329853,
"learning_rate": 1.941085760330316e-05,
"loss": 0.4558,
"step": 264
},
{
"epoch": 0.3994723949500659,
"grad_norm": 0.580002453326873,
"learning_rate": 1.940191981938657e-05,
"loss": 0.4848,
"step": 265
},
{
"epoch": 0.4009798379498775,
"grad_norm": 0.6067452479296194,
"learning_rate": 1.9392916837338376e-05,
"loss": 0.4783,
"step": 266
},
{
"epoch": 0.4024872809496891,
"grad_norm": 0.6517612748843483,
"learning_rate": 1.9383848719590257e-05,
"loss": 0.4849,
"step": 267
},
{
"epoch": 0.40399472394950064,
"grad_norm": 0.6355304966389256,
"learning_rate": 1.9374715529025575e-05,
"loss": 0.4312,
"step": 268
},
{
"epoch": 0.40550216694931224,
"grad_norm": 0.627744747765263,
"learning_rate": 1.9365517328978943e-05,
"loss": 0.4762,
"step": 269
},
{
"epoch": 0.4070096099491238,
"grad_norm": 0.6640367945419465,
"learning_rate": 1.9356254183235785e-05,
"loss": 0.432,
"step": 270
},
{
"epoch": 0.40851705294893537,
"grad_norm": 0.647008694411896,
"learning_rate": 1.93469261560319e-05,
"loss": 0.4795,
"step": 271
},
{
"epoch": 0.41002449594874696,
"grad_norm": 0.6742117075938286,
"learning_rate": 1.9337533312053002e-05,
"loss": 0.4573,
"step": 272
},
{
"epoch": 0.4115319389485585,
"grad_norm": 0.6000668524451142,
"learning_rate": 1.9328075716434287e-05,
"loss": 0.4474,
"step": 273
},
{
"epoch": 0.4130393819483701,
"grad_norm": 0.6027061587937567,
"learning_rate": 1.931855343475998e-05,
"loss": 0.4283,
"step": 274
},
{
"epoch": 0.4145468249481816,
"grad_norm": 0.56875377174764,
"learning_rate": 1.930896653306286e-05,
"loss": 0.4446,
"step": 275
},
{
"epoch": 0.4160542679479932,
"grad_norm": 0.6494800822344575,
"learning_rate": 1.929931507782383e-05,
"loss": 0.4504,
"step": 276
},
{
"epoch": 0.4175617109478048,
"grad_norm": 0.5925306999643124,
"learning_rate": 1.9289599135971437e-05,
"loss": 0.4993,
"step": 277
},
{
"epoch": 0.41906915394761635,
"grad_norm": 0.5812846521774916,
"learning_rate": 1.9279818774881418e-05,
"loss": 0.4574,
"step": 278
},
{
"epoch": 0.42057659694742794,
"grad_norm": 0.5625417674563119,
"learning_rate": 1.9269974062376224e-05,
"loss": 0.4325,
"step": 279
},
{
"epoch": 0.4220840399472395,
"grad_norm": 0.5839055838922522,
"learning_rate": 1.926006506672456e-05,
"loss": 0.4669,
"step": 280
},
{
"epoch": 0.42359148294705107,
"grad_norm": 0.6042605173402862,
"learning_rate": 1.9250091856640895e-05,
"loss": 0.4224,
"step": 281
},
{
"epoch": 0.42509892594686266,
"grad_norm": 0.5856982708883072,
"learning_rate": 1.9240054501285015e-05,
"loss": 0.4709,
"step": 282
},
{
"epoch": 0.4266063689466742,
"grad_norm": 0.5631263514578662,
"learning_rate": 1.922995307026151e-05,
"loss": 0.4614,
"step": 283
},
{
"epoch": 0.4281138119464858,
"grad_norm": 0.5583569731432177,
"learning_rate": 1.921978763361931e-05,
"loss": 0.4589,
"step": 284
},
{
"epoch": 0.4296212549462973,
"grad_norm": 0.6050421963625475,
"learning_rate": 1.9209558261851194e-05,
"loss": 0.4382,
"step": 285
},
{
"epoch": 0.4311286979461089,
"grad_norm": 0.533785762634786,
"learning_rate": 1.919926502589331e-05,
"loss": 0.4862,
"step": 286
},
{
"epoch": 0.43263614094592046,
"grad_norm": 0.5693448486944194,
"learning_rate": 1.9188907997124666e-05,
"loss": 0.4562,
"step": 287
},
{
"epoch": 0.43414358394573205,
"grad_norm": 0.5654990613672617,
"learning_rate": 1.9178487247366652e-05,
"loss": 0.4492,
"step": 288
},
{
"epoch": 0.43565102694554364,
"grad_norm": 0.5771432152665512,
"learning_rate": 1.916800284888253e-05,
"loss": 0.4478,
"step": 289
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.5734596310020046,
"learning_rate": 1.915745487437694e-05,
"loss": 0.4801,
"step": 290
},
{
"epoch": 0.43866591294516677,
"grad_norm": 0.5832753483996317,
"learning_rate": 1.9146843396995396e-05,
"loss": 0.4563,
"step": 291
},
{
"epoch": 0.4401733559449783,
"grad_norm": 0.5879841082366902,
"learning_rate": 1.9136168490323772e-05,
"loss": 0.4689,
"step": 292
},
{
"epoch": 0.4416807989447899,
"grad_norm": 0.5521570450782258,
"learning_rate": 1.9125430228387794e-05,
"loss": 0.4581,
"step": 293
},
{
"epoch": 0.4431882419446015,
"grad_norm": 0.5673604877581071,
"learning_rate": 1.9114628685652535e-05,
"loss": 0.4668,
"step": 294
},
{
"epoch": 0.444695684944413,
"grad_norm": 0.5866077006525799,
"learning_rate": 1.9103763937021887e-05,
"loss": 0.4588,
"step": 295
},
{
"epoch": 0.4462031279442246,
"grad_norm": 0.5731048741878798,
"learning_rate": 1.909283605783805e-05,
"loss": 0.4774,
"step": 296
},
{
"epoch": 0.44771057094403616,
"grad_norm": 0.6251177027508026,
"learning_rate": 1.9081845123881002e-05,
"loss": 0.4813,
"step": 297
},
{
"epoch": 0.44921801394384775,
"grad_norm": 0.5256954818277138,
"learning_rate": 1.9070791211367984e-05,
"loss": 0.4473,
"step": 298
},
{
"epoch": 0.45072545694365934,
"grad_norm": 0.6199874516009303,
"learning_rate": 1.9059674396952963e-05,
"loss": 0.4629,
"step": 299
},
{
"epoch": 0.4522328999434709,
"grad_norm": 0.5917017492987557,
"learning_rate": 1.90484947577261e-05,
"loss": 0.4979,
"step": 300
},
{
"epoch": 0.45374034294328247,
"grad_norm": 0.6120361922704654,
"learning_rate": 1.903725237121322e-05,
"loss": 0.4831,
"step": 301
},
{
"epoch": 0.455247785943094,
"grad_norm": 0.5514120347682593,
"learning_rate": 1.902594731537527e-05,
"loss": 0.4452,
"step": 302
},
{
"epoch": 0.4567552289429056,
"grad_norm": 0.5767336190747095,
"learning_rate": 1.901457966860779e-05,
"loss": 0.4435,
"step": 303
},
{
"epoch": 0.4582626719427172,
"grad_norm": 0.5868519118956824,
"learning_rate": 1.9003149509740347e-05,
"loss": 0.492,
"step": 304
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.6168191655686016,
"learning_rate": 1.899165691803601e-05,
"loss": 0.4512,
"step": 305
},
{
"epoch": 0.4612775579423403,
"grad_norm": 0.6053359821845329,
"learning_rate": 1.8980101973190787e-05,
"loss": 0.4749,
"step": 306
},
{
"epoch": 0.46278500094215186,
"grad_norm": 0.60634572707715,
"learning_rate": 1.896848475533309e-05,
"loss": 0.4682,
"step": 307
},
{
"epoch": 0.46429244394196345,
"grad_norm": 0.6339199718330278,
"learning_rate": 1.8956805345023145e-05,
"loss": 0.4643,
"step": 308
},
{
"epoch": 0.465799886941775,
"grad_norm": 0.6011561135843241,
"learning_rate": 1.894506382325248e-05,
"loss": 0.435,
"step": 309
},
{
"epoch": 0.4673073299415866,
"grad_norm": 0.6067579490346751,
"learning_rate": 1.8933260271443313e-05,
"loss": 0.4162,
"step": 310
},
{
"epoch": 0.4688147729413982,
"grad_norm": 0.5747986536629459,
"learning_rate": 1.8921394771448032e-05,
"loss": 0.448,
"step": 311
},
{
"epoch": 0.4703222159412097,
"grad_norm": 0.605434367981348,
"learning_rate": 1.89094674055486e-05,
"loss": 0.4264,
"step": 312
},
{
"epoch": 0.4718296589410213,
"grad_norm": 0.6028982875539595,
"learning_rate": 1.889747825645599e-05,
"loss": 0.447,
"step": 313
},
{
"epoch": 0.47333710194083284,
"grad_norm": 0.6024460995063091,
"learning_rate": 1.8885427407309627e-05,
"loss": 0.4689,
"step": 314
},
{
"epoch": 0.47484454494064443,
"grad_norm": 0.6726949468749703,
"learning_rate": 1.887331494167678e-05,
"loss": 0.4562,
"step": 315
},
{
"epoch": 0.476351987940456,
"grad_norm": 0.6108367421924343,
"learning_rate": 1.8861140943552014e-05,
"loss": 0.4574,
"step": 316
},
{
"epoch": 0.47785943094026756,
"grad_norm": 0.6095993211515124,
"learning_rate": 1.884890549735659e-05,
"loss": 0.429,
"step": 317
},
{
"epoch": 0.47936687394007915,
"grad_norm": 0.5708366516060817,
"learning_rate": 1.8836608687937883e-05,
"loss": 0.4494,
"step": 318
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.6319148329863508,
"learning_rate": 1.8824250600568798e-05,
"loss": 0.4457,
"step": 319
},
{
"epoch": 0.4823817599397023,
"grad_norm": 0.5817901717334689,
"learning_rate": 1.8811831320947177e-05,
"loss": 0.4444,
"step": 320
},
{
"epoch": 0.4838892029395139,
"grad_norm": 0.6167253992638152,
"learning_rate": 1.879935093519519e-05,
"loss": 0.4758,
"step": 321
},
{
"epoch": 0.4853966459393254,
"grad_norm": 0.5918299912550289,
"learning_rate": 1.878680952985877e-05,
"loss": 0.4586,
"step": 322
},
{
"epoch": 0.486904088939137,
"grad_norm": 0.5897988696893806,
"learning_rate": 1.8774207191906976e-05,
"loss": 0.4548,
"step": 323
},
{
"epoch": 0.48841153193894854,
"grad_norm": 0.5336492924439385,
"learning_rate": 1.8761544008731426e-05,
"loss": 0.4477,
"step": 324
},
{
"epoch": 0.48991897493876013,
"grad_norm": 0.5969332291879268,
"learning_rate": 1.874882006814565e-05,
"loss": 0.4423,
"step": 325
},
{
"epoch": 0.4914264179385717,
"grad_norm": 0.5894559630672119,
"learning_rate": 1.8736035458384528e-05,
"loss": 0.4681,
"step": 326
},
{
"epoch": 0.49293386093838326,
"grad_norm": 0.583381204713255,
"learning_rate": 1.8723190268103634e-05,
"loss": 0.431,
"step": 327
},
{
"epoch": 0.49444130393819485,
"grad_norm": 0.5501857874739489,
"learning_rate": 1.8710284586378645e-05,
"loss": 0.4501,
"step": 328
},
{
"epoch": 0.4959487469380064,
"grad_norm": 0.5807568427837185,
"learning_rate": 1.8697318502704734e-05,
"loss": 0.446,
"step": 329
},
{
"epoch": 0.497456189937818,
"grad_norm": 0.5344952874232914,
"learning_rate": 1.8684292106995916e-05,
"loss": 0.464,
"step": 330
},
{
"epoch": 0.4989636329376295,
"grad_norm": 0.5875400091192824,
"learning_rate": 1.8671205489584453e-05,
"loss": 0.462,
"step": 331
},
{
"epoch": 0.5004710759374411,
"grad_norm": 0.5898142606962845,
"learning_rate": 1.865805874122021e-05,
"loss": 0.4495,
"step": 332
},
{
"epoch": 0.5019785189372526,
"grad_norm": 0.5383180946864506,
"learning_rate": 1.8644851953070045e-05,
"loss": 0.474,
"step": 333
},
{
"epoch": 0.5034859619370643,
"grad_norm": 0.5701159430118912,
"learning_rate": 1.863158521671716e-05,
"loss": 0.4644,
"step": 334
},
{
"epoch": 0.5049934049368758,
"grad_norm": 0.5456550772582448,
"learning_rate": 1.8618258624160465e-05,
"loss": 0.4426,
"step": 335
},
{
"epoch": 0.5065008479366874,
"grad_norm": 0.5806062450133762,
"learning_rate": 1.8604872267813954e-05,
"loss": 0.4428,
"step": 336
},
{
"epoch": 0.508008290936499,
"grad_norm": 0.5723184224994758,
"learning_rate": 1.859142624050605e-05,
"loss": 0.427,
"step": 337
},
{
"epoch": 0.5095157339363106,
"grad_norm": 0.5503430826330011,
"learning_rate": 1.8577920635478976e-05,
"loss": 0.4863,
"step": 338
},
{
"epoch": 0.5110231769361221,
"grad_norm": 0.5922429005891785,
"learning_rate": 1.8564355546388094e-05,
"loss": 0.472,
"step": 339
},
{
"epoch": 0.5125306199359336,
"grad_norm": 0.5243816217609505,
"learning_rate": 1.855073106730126e-05,
"loss": 0.4563,
"step": 340
},
{
"epoch": 0.5140380629357453,
"grad_norm": 0.571898057341335,
"learning_rate": 1.8537047292698175e-05,
"loss": 0.4686,
"step": 341
},
{
"epoch": 0.5155455059355568,
"grad_norm": 0.5389787797747003,
"learning_rate": 1.852330431746973e-05,
"loss": 0.4044,
"step": 342
},
{
"epoch": 0.5170529489353684,
"grad_norm": 0.5755069679771695,
"learning_rate": 1.8509502236917353e-05,
"loss": 0.4536,
"step": 343
},
{
"epoch": 0.51856039193518,
"grad_norm": 0.5386650306089089,
"learning_rate": 1.8495641146752322e-05,
"loss": 0.4285,
"step": 344
},
{
"epoch": 0.5200678349349915,
"grad_norm": 0.5775045065740545,
"learning_rate": 1.848172114309513e-05,
"loss": 0.4579,
"step": 345
},
{
"epoch": 0.5215752779348031,
"grad_norm": 0.6222104655446267,
"learning_rate": 1.8467742322474822e-05,
"loss": 0.4733,
"step": 346
},
{
"epoch": 0.5230827209346146,
"grad_norm": 0.5869893846228816,
"learning_rate": 1.845370478182829e-05,
"loss": 0.5073,
"step": 347
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.6007295355780623,
"learning_rate": 1.8439608618499637e-05,
"loss": 0.4859,
"step": 348
},
{
"epoch": 0.5260976069342378,
"grad_norm": 0.5715093886190423,
"learning_rate": 1.842545393023949e-05,
"loss": 0.436,
"step": 349
},
{
"epoch": 0.5276050499340493,
"grad_norm": 0.5370655215760771,
"learning_rate": 1.841124081520431e-05,
"loss": 0.4545,
"step": 350
},
{
"epoch": 0.529112492933861,
"grad_norm": 0.5468526752808022,
"learning_rate": 1.8396969371955724e-05,
"loss": 0.4412,
"step": 351
},
{
"epoch": 0.5306199359336725,
"grad_norm": 0.5386055180491347,
"learning_rate": 1.838263969945985e-05,
"loss": 0.455,
"step": 352
},
{
"epoch": 0.532127378933484,
"grad_norm": 0.5273830292324821,
"learning_rate": 1.836825189708659e-05,
"loss": 0.4208,
"step": 353
},
{
"epoch": 0.5336348219332957,
"grad_norm": 0.5324858057392972,
"learning_rate": 1.8353806064608953e-05,
"loss": 0.4259,
"step": 354
},
{
"epoch": 0.5351422649331072,
"grad_norm": 0.5185086851614243,
"learning_rate": 1.833930230220236e-05,
"loss": 0.4506,
"step": 355
},
{
"epoch": 0.5366497079329188,
"grad_norm": 0.5553133756097826,
"learning_rate": 1.8324740710443955e-05,
"loss": 0.4629,
"step": 356
},
{
"epoch": 0.5381571509327303,
"grad_norm": 0.5742120676044152,
"learning_rate": 1.831012139031189e-05,
"loss": 0.4357,
"step": 357
},
{
"epoch": 0.539664593932542,
"grad_norm": 0.5605121444976939,
"learning_rate": 1.829544444318466e-05,
"loss": 0.4606,
"step": 358
},
{
"epoch": 0.5411720369323535,
"grad_norm": 0.6092704764024721,
"learning_rate": 1.8280709970840352e-05,
"loss": 0.4589,
"step": 359
},
{
"epoch": 0.542679479932165,
"grad_norm": 0.5515104498699946,
"learning_rate": 1.8265918075455985e-05,
"loss": 0.4554,
"step": 360
},
{
"epoch": 0.5441869229319767,
"grad_norm": 0.5517752011641777,
"learning_rate": 1.8251068859606777e-05,
"loss": 0.4446,
"step": 361
},
{
"epoch": 0.5456943659317882,
"grad_norm": 0.523313087940014,
"learning_rate": 1.823616242626542e-05,
"loss": 0.4453,
"step": 362
},
{
"epoch": 0.5472018089315998,
"grad_norm": 0.5555090795115328,
"learning_rate": 1.8221198878801415e-05,
"loss": 0.431,
"step": 363
},
{
"epoch": 0.5487092519314113,
"grad_norm": 0.5254077832278897,
"learning_rate": 1.8206178320980295e-05,
"loss": 0.4512,
"step": 364
},
{
"epoch": 0.5502166949312229,
"grad_norm": 0.5382752275452225,
"learning_rate": 1.819110085696295e-05,
"loss": 0.4489,
"step": 365
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.5752845306460045,
"learning_rate": 1.817596659130489e-05,
"loss": 0.4659,
"step": 366
},
{
"epoch": 0.553231580930846,
"grad_norm": 0.534082668899082,
"learning_rate": 1.816077562895551e-05,
"loss": 0.443,
"step": 367
},
{
"epoch": 0.5547390239306577,
"grad_norm": 0.4935673245960411,
"learning_rate": 1.814552807525738e-05,
"loss": 0.4265,
"step": 368
},
{
"epoch": 0.5562464669304692,
"grad_norm": 0.5587086828843211,
"learning_rate": 1.81302240359455e-05,
"loss": 0.4171,
"step": 369
},
{
"epoch": 0.5577539099302807,
"grad_norm": 0.5706799552715889,
"learning_rate": 1.8114863617146576e-05,
"loss": 0.4419,
"step": 370
},
{
"epoch": 0.5592613529300924,
"grad_norm": 0.5559814423377313,
"learning_rate": 1.8099446925378278e-05,
"loss": 0.4646,
"step": 371
},
{
"epoch": 0.5607687959299039,
"grad_norm": 0.6399807563842037,
"learning_rate": 1.8083974067548506e-05,
"loss": 0.4662,
"step": 372
},
{
"epoch": 0.5622762389297155,
"grad_norm": 0.5499667823126643,
"learning_rate": 1.806844515095465e-05,
"loss": 0.4705,
"step": 373
},
{
"epoch": 0.563783681929527,
"grad_norm": 0.5802308318791667,
"learning_rate": 1.8052860283282832e-05,
"loss": 0.4285,
"step": 374
},
{
"epoch": 0.5652911249293386,
"grad_norm": 0.616061675009139,
"learning_rate": 1.8037219572607177e-05,
"loss": 0.4661,
"step": 375
},
{
"epoch": 0.5667985679291502,
"grad_norm": 0.5381388831653736,
"learning_rate": 1.8021523127389066e-05,
"loss": 0.442,
"step": 376
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.5427863037336617,
"learning_rate": 1.800577105647635e-05,
"loss": 0.4737,
"step": 377
},
{
"epoch": 0.5698134539287734,
"grad_norm": 0.647319829296571,
"learning_rate": 1.7989963469102643e-05,
"loss": 0.4597,
"step": 378
},
{
"epoch": 0.5713208969285849,
"grad_norm": 0.5361993689265471,
"learning_rate": 1.797410047488653e-05,
"loss": 0.4515,
"step": 379
},
{
"epoch": 0.5728283399283964,
"grad_norm": 0.5928443348297506,
"learning_rate": 1.7958182183830816e-05,
"loss": 0.4383,
"step": 380
},
{
"epoch": 0.574335782928208,
"grad_norm": 0.5525429424873411,
"learning_rate": 1.794220870632177e-05,
"loss": 0.4676,
"step": 381
},
{
"epoch": 0.5758432259280196,
"grad_norm": 0.6048913598018805,
"learning_rate": 1.7926180153128358e-05,
"loss": 0.4803,
"step": 382
},
{
"epoch": 0.5773506689278312,
"grad_norm": 0.6159208841600681,
"learning_rate": 1.791009663540146e-05,
"loss": 0.4446,
"step": 383
},
{
"epoch": 0.5788581119276427,
"grad_norm": 0.604058916697408,
"learning_rate": 1.789395826467312e-05,
"loss": 0.4406,
"step": 384
},
{
"epoch": 0.5803655549274543,
"grad_norm": 0.6189321454832999,
"learning_rate": 1.7877765152855757e-05,
"loss": 0.4757,
"step": 385
},
{
"epoch": 0.5818729979272659,
"grad_norm": 0.5252310621840579,
"learning_rate": 1.78615174122414e-05,
"loss": 0.4226,
"step": 386
},
{
"epoch": 0.5833804409270774,
"grad_norm": 0.6058698433864601,
"learning_rate": 1.78452151555009e-05,
"loss": 0.4242,
"step": 387
},
{
"epoch": 0.5848878839268891,
"grad_norm": 0.5784597918661724,
"learning_rate": 1.7828858495683162e-05,
"loss": 0.4546,
"step": 388
},
{
"epoch": 0.5863953269267006,
"grad_norm": 0.5778733445604559,
"learning_rate": 1.781244754621434e-05,
"loss": 0.4474,
"step": 389
},
{
"epoch": 0.5879027699265121,
"grad_norm": 0.5574362195371769,
"learning_rate": 1.779598242089707e-05,
"loss": 0.4461,
"step": 390
},
{
"epoch": 0.5894102129263237,
"grad_norm": 0.6035018906117913,
"learning_rate": 1.7779463233909677e-05,
"loss": 0.4647,
"step": 391
},
{
"epoch": 0.5909176559261353,
"grad_norm": 0.5783320653215531,
"learning_rate": 1.7762890099805362e-05,
"loss": 0.4509,
"step": 392
},
{
"epoch": 0.5924250989259469,
"grad_norm": 0.608063697903211,
"learning_rate": 1.774626313351145e-05,
"loss": 0.4496,
"step": 393
},
{
"epoch": 0.5939325419257584,
"grad_norm": 0.5637493289630973,
"learning_rate": 1.7729582450328547e-05,
"loss": 0.4548,
"step": 394
},
{
"epoch": 0.59543998492557,
"grad_norm": 0.5878505952019026,
"learning_rate": 1.771284816592978e-05,
"loss": 0.4025,
"step": 395
},
{
"epoch": 0.5969474279253816,
"grad_norm": 0.5732228081169485,
"learning_rate": 1.7696060396359956e-05,
"loss": 0.4155,
"step": 396
},
{
"epoch": 0.5984548709251931,
"grad_norm": 0.5275574748856542,
"learning_rate": 1.7679219258034798e-05,
"loss": 0.4668,
"step": 397
},
{
"epoch": 0.5999623139250048,
"grad_norm": 0.565193432089848,
"learning_rate": 1.7662324867740102e-05,
"loss": 0.464,
"step": 398
},
{
"epoch": 0.6014697569248163,
"grad_norm": 0.5276065053060457,
"learning_rate": 1.7645377342630956e-05,
"loss": 0.4641,
"step": 399
},
{
"epoch": 0.6029771999246278,
"grad_norm": 0.5504334109425478,
"learning_rate": 1.76283768002309e-05,
"loss": 0.4288,
"step": 400
},
{
"epoch": 0.6044846429244394,
"grad_norm": 0.6059296820868759,
"learning_rate": 1.7611323358431145e-05,
"loss": 0.4961,
"step": 401
},
{
"epoch": 0.605992085924251,
"grad_norm": 0.5077017761738585,
"learning_rate": 1.759421713548971e-05,
"loss": 0.4706,
"step": 402
},
{
"epoch": 0.6074995289240626,
"grad_norm": 0.5590656170710925,
"learning_rate": 1.757705825003065e-05,
"loss": 0.4034,
"step": 403
},
{
"epoch": 0.6090069719238741,
"grad_norm": 0.525709220345065,
"learning_rate": 1.7559846821043205e-05,
"loss": 0.4379,
"step": 404
},
{
"epoch": 0.6105144149236857,
"grad_norm": 0.5538945207929713,
"learning_rate": 1.754258296788097e-05,
"loss": 0.445,
"step": 405
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.5517645766831191,
"learning_rate": 1.7525266810261096e-05,
"loss": 0.4469,
"step": 406
},
{
"epoch": 0.6135293009233088,
"grad_norm": 0.5594555749715797,
"learning_rate": 1.7507898468263422e-05,
"loss": 0.4343,
"step": 407
},
{
"epoch": 0.6150367439231204,
"grad_norm": 0.5530618540850076,
"learning_rate": 1.7490478062329686e-05,
"loss": 0.4625,
"step": 408
},
{
"epoch": 0.616544186922932,
"grad_norm": 0.5948076942836006,
"learning_rate": 1.7473005713262644e-05,
"loss": 0.4497,
"step": 409
},
{
"epoch": 0.6180516299227435,
"grad_norm": 0.5776155556563956,
"learning_rate": 1.7455481542225272e-05,
"loss": 0.3959,
"step": 410
},
{
"epoch": 0.6195590729225551,
"grad_norm": 0.5391682645939875,
"learning_rate": 1.7437905670739893e-05,
"loss": 0.4337,
"step": 411
},
{
"epoch": 0.6210665159223667,
"grad_norm": 0.5918312025262793,
"learning_rate": 1.7420278220687366e-05,
"loss": 0.4749,
"step": 412
},
{
"epoch": 0.6225739589221783,
"grad_norm": 0.5169533403943937,
"learning_rate": 1.7402599314306207e-05,
"loss": 0.4361,
"step": 413
},
{
"epoch": 0.6240814019219898,
"grad_norm": 0.5757476582664114,
"learning_rate": 1.7384869074191777e-05,
"loss": 0.4423,
"step": 414
},
{
"epoch": 0.6255888449218014,
"grad_norm": 0.5789420594237762,
"learning_rate": 1.7367087623295394e-05,
"loss": 0.4493,
"step": 415
},
{
"epoch": 0.627096287921613,
"grad_norm": 0.5146689624027024,
"learning_rate": 1.7349255084923517e-05,
"loss": 0.4128,
"step": 416
},
{
"epoch": 0.6286037309214245,
"grad_norm": 0.5556214483108315,
"learning_rate": 1.7331371582736864e-05,
"loss": 0.4097,
"step": 417
},
{
"epoch": 0.6301111739212361,
"grad_norm": 0.5781033815860408,
"learning_rate": 1.731343724074957e-05,
"loss": 0.4755,
"step": 418
},
{
"epoch": 0.6316186169210477,
"grad_norm": 0.505299705771376,
"learning_rate": 1.7295452183328317e-05,
"loss": 0.423,
"step": 419
},
{
"epoch": 0.6331260599208592,
"grad_norm": 0.6019529322565086,
"learning_rate": 1.7277416535191478e-05,
"loss": 0.4467,
"step": 420
},
{
"epoch": 0.6346335029206708,
"grad_norm": 0.5423258091864472,
"learning_rate": 1.7259330421408247e-05,
"loss": 0.4297,
"step": 421
},
{
"epoch": 0.6361409459204824,
"grad_norm": 0.550859799446333,
"learning_rate": 1.7241193967397784e-05,
"loss": 0.4334,
"step": 422
},
{
"epoch": 0.637648388920294,
"grad_norm": 0.5436505610454662,
"learning_rate": 1.7223007298928322e-05,
"loss": 0.4227,
"step": 423
},
{
"epoch": 0.6391558319201055,
"grad_norm": 0.5265015330498195,
"learning_rate": 1.7204770542116326e-05,
"loss": 0.4407,
"step": 424
},
{
"epoch": 0.640663274919917,
"grad_norm": 0.577557633955233,
"learning_rate": 1.7186483823425582e-05,
"loss": 0.4794,
"step": 425
},
{
"epoch": 0.6421707179197287,
"grad_norm": 0.5304780945155085,
"learning_rate": 1.7168147269666357e-05,
"loss": 0.4306,
"step": 426
},
{
"epoch": 0.6436781609195402,
"grad_norm": 0.5436263482054755,
"learning_rate": 1.714976100799449e-05,
"loss": 0.4505,
"step": 427
},
{
"epoch": 0.6451856039193518,
"grad_norm": 0.5239803005942689,
"learning_rate": 1.713132516591053e-05,
"loss": 0.4204,
"step": 428
},
{
"epoch": 0.6466930469191634,
"grad_norm": 0.5640485363783228,
"learning_rate": 1.7112839871258838e-05,
"loss": 0.4709,
"step": 429
},
{
"epoch": 0.6482004899189749,
"grad_norm": 0.5112413611963181,
"learning_rate": 1.7094305252226713e-05,
"loss": 0.4352,
"step": 430
},
{
"epoch": 0.6497079329187865,
"grad_norm": 0.5839208365283748,
"learning_rate": 1.7075721437343488e-05,
"loss": 0.467,
"step": 431
},
{
"epoch": 0.6512153759185981,
"grad_norm": 0.5264144807133015,
"learning_rate": 1.705708855547966e-05,
"loss": 0.4427,
"step": 432
},
{
"epoch": 0.6527228189184097,
"grad_norm": 0.503285177882026,
"learning_rate": 1.7038406735845967e-05,
"loss": 0.4206,
"step": 433
},
{
"epoch": 0.6542302619182212,
"grad_norm": 0.523921175908132,
"learning_rate": 1.7019676107992523e-05,
"loss": 0.4636,
"step": 434
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.5213012549969936,
"learning_rate": 1.70008968018079e-05,
"loss": 0.4385,
"step": 435
},
{
"epoch": 0.6572451479178444,
"grad_norm": 0.5616975925596913,
"learning_rate": 1.6982068947518235e-05,
"loss": 0.4495,
"step": 436
},
{
"epoch": 0.6587525909176559,
"grad_norm": 0.5094741288290618,
"learning_rate": 1.6963192675686312e-05,
"loss": 0.4354,
"step": 437
},
{
"epoch": 0.6602600339174675,
"grad_norm": 0.5691859599654164,
"learning_rate": 1.694426811721069e-05,
"loss": 0.4121,
"step": 438
},
{
"epoch": 0.6617674769172791,
"grad_norm": 0.565755177059836,
"learning_rate": 1.6925295403324758e-05,
"loss": 0.4291,
"step": 439
},
{
"epoch": 0.6632749199170906,
"grad_norm": 0.5182694692522232,
"learning_rate": 1.6906274665595854e-05,
"loss": 0.4187,
"step": 440
},
{
"epoch": 0.6647823629169022,
"grad_norm": 0.5442306033345655,
"learning_rate": 1.688720603592432e-05,
"loss": 0.4596,
"step": 441
},
{
"epoch": 0.6662898059167138,
"grad_norm": 0.508987211991653,
"learning_rate": 1.6868089646542632e-05,
"loss": 0.4218,
"step": 442
},
{
"epoch": 0.6677972489165254,
"grad_norm": 0.5409018441358341,
"learning_rate": 1.6848925630014445e-05,
"loss": 0.4422,
"step": 443
},
{
"epoch": 0.6693046919163369,
"grad_norm": 0.5332135170482968,
"learning_rate": 1.6829714119233688e-05,
"loss": 0.4742,
"step": 444
},
{
"epoch": 0.6708121349161484,
"grad_norm": 0.510365685539909,
"learning_rate": 1.6810455247423634e-05,
"loss": 0.4308,
"step": 445
},
{
"epoch": 0.6723195779159601,
"grad_norm": 0.5088383566851198,
"learning_rate": 1.6791149148136003e-05,
"loss": 0.4491,
"step": 446
},
{
"epoch": 0.6738270209157716,
"grad_norm": 0.5398522018308489,
"learning_rate": 1.677179595525e-05,
"loss": 0.465,
"step": 447
},
{
"epoch": 0.6753344639155832,
"grad_norm": 0.5312851766133058,
"learning_rate": 1.675239580297141e-05,
"loss": 0.4574,
"step": 448
},
{
"epoch": 0.6768419069153948,
"grad_norm": 0.5377924163432233,
"learning_rate": 1.6732948825831657e-05,
"loss": 0.4282,
"step": 449
},
{
"epoch": 0.6783493499152063,
"grad_norm": 0.5411515105207517,
"learning_rate": 1.671345515868688e-05,
"loss": 0.437,
"step": 450
},
{
"epoch": 0.6798567929150179,
"grad_norm": 0.5061423487479686,
"learning_rate": 1.6693914936716983e-05,
"loss": 0.4244,
"step": 451
},
{
"epoch": 0.6813642359148294,
"grad_norm": 0.5390647508447596,
"learning_rate": 1.6674328295424723e-05,
"loss": 0.4395,
"step": 452
},
{
"epoch": 0.6828716789146411,
"grad_norm": 0.5706362763533134,
"learning_rate": 1.6654695370634738e-05,
"loss": 0.4421,
"step": 453
},
{
"epoch": 0.6843791219144526,
"grad_norm": 0.5330284685793139,
"learning_rate": 1.6635016298492628e-05,
"loss": 0.4303,
"step": 454
},
{
"epoch": 0.6858865649142641,
"grad_norm": 0.5267067326608682,
"learning_rate": 1.6615291215464005e-05,
"loss": 0.4245,
"step": 455
},
{
"epoch": 0.6873940079140758,
"grad_norm": 0.5726680200512305,
"learning_rate": 1.6595520258333545e-05,
"loss": 0.4752,
"step": 456
},
{
"epoch": 0.6889014509138873,
"grad_norm": 0.5183865668680759,
"learning_rate": 1.657570356420404e-05,
"loss": 0.4542,
"step": 457
},
{
"epoch": 0.6904088939136989,
"grad_norm": 0.553551099478117,
"learning_rate": 1.6555841270495456e-05,
"loss": 0.445,
"step": 458
},
{
"epoch": 0.6919163369135105,
"grad_norm": 0.5929224658029257,
"learning_rate": 1.6535933514943955e-05,
"loss": 0.4183,
"step": 459
},
{
"epoch": 0.693423779913322,
"grad_norm": 0.5010271872134405,
"learning_rate": 1.6515980435600965e-05,
"loss": 0.4169,
"step": 460
},
{
"epoch": 0.6949312229131336,
"grad_norm": 0.49068598527278895,
"learning_rate": 1.6495982170832224e-05,
"loss": 0.4122,
"step": 461
},
{
"epoch": 0.6964386659129451,
"grad_norm": 0.5288472547252633,
"learning_rate": 1.6475938859316795e-05,
"loss": 0.4154,
"step": 462
},
{
"epoch": 0.6979461089127568,
"grad_norm": 0.5364001246117184,
"learning_rate": 1.6455850640046134e-05,
"loss": 0.4247,
"step": 463
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.5248089160285507,
"learning_rate": 1.6435717652323097e-05,
"loss": 0.4522,
"step": 464
},
{
"epoch": 0.7009609949123798,
"grad_norm": 0.5871578611838155,
"learning_rate": 1.6415540035761008e-05,
"loss": 0.4477,
"step": 465
},
{
"epoch": 0.7024684379121915,
"grad_norm": 0.531098674787926,
"learning_rate": 1.639531793028265e-05,
"loss": 0.43,
"step": 466
},
{
"epoch": 0.703975880912003,
"grad_norm": 0.6050322359617515,
"learning_rate": 1.637505147611934e-05,
"loss": 0.4533,
"step": 467
},
{
"epoch": 0.7054833239118146,
"grad_norm": 0.5045703819799817,
"learning_rate": 1.6354740813809917e-05,
"loss": 0.4021,
"step": 468
},
{
"epoch": 0.7069907669116261,
"grad_norm": 0.5129545738188582,
"learning_rate": 1.6334386084199787e-05,
"loss": 0.4517,
"step": 469
},
{
"epoch": 0.7084982099114377,
"grad_norm": 0.5736577274561188,
"learning_rate": 1.631398742843995e-05,
"loss": 0.418,
"step": 470
},
{
"epoch": 0.7100056529112493,
"grad_norm": 0.5323460252829038,
"learning_rate": 1.629354498798601e-05,
"loss": 0.4251,
"step": 471
},
{
"epoch": 0.7115130959110608,
"grad_norm": 0.5747199097534378,
"learning_rate": 1.627305890459719e-05,
"loss": 0.4394,
"step": 472
},
{
"epoch": 0.7130205389108725,
"grad_norm": 0.5646262513047455,
"learning_rate": 1.625252932033538e-05,
"loss": 0.4297,
"step": 473
},
{
"epoch": 0.714527981910684,
"grad_norm": 0.49304427786239235,
"learning_rate": 1.6231956377564095e-05,
"loss": 0.4224,
"step": 474
},
{
"epoch": 0.7160354249104955,
"grad_norm": 0.5791416730858486,
"learning_rate": 1.621134021894756e-05,
"loss": 0.4388,
"step": 475
},
{
"epoch": 0.7175428679103072,
"grad_norm": 0.5186150019034591,
"learning_rate": 1.619068098744965e-05,
"loss": 0.4422,
"step": 476
},
{
"epoch": 0.7190503109101187,
"grad_norm": 0.5839335428128258,
"learning_rate": 1.6169978826332955e-05,
"loss": 0.458,
"step": 477
},
{
"epoch": 0.7205577539099303,
"grad_norm": 0.5613046419371709,
"learning_rate": 1.6149233879157747e-05,
"loss": 0.4669,
"step": 478
},
{
"epoch": 0.7220651969097418,
"grad_norm": 0.5154157204007299,
"learning_rate": 1.6128446289781012e-05,
"loss": 0.4372,
"step": 479
},
{
"epoch": 0.7235726399095535,
"grad_norm": 0.5677977726488427,
"learning_rate": 1.610761620235543e-05,
"loss": 0.4731,
"step": 480
},
{
"epoch": 0.725080082909365,
"grad_norm": 0.5375971717165063,
"learning_rate": 1.60867437613284e-05,
"loss": 0.4566,
"step": 481
},
{
"epoch": 0.7265875259091765,
"grad_norm": 0.49724342603457516,
"learning_rate": 1.6065829111441e-05,
"loss": 0.4507,
"step": 482
},
{
"epoch": 0.7280949689089882,
"grad_norm": 0.5827089081742053,
"learning_rate": 1.6044872397727037e-05,
"loss": 0.4564,
"step": 483
},
{
"epoch": 0.7296024119087997,
"grad_norm": 0.5474489228753104,
"learning_rate": 1.6023873765511993e-05,
"loss": 0.4309,
"step": 484
},
{
"epoch": 0.7311098549086112,
"grad_norm": 0.5319969584661621,
"learning_rate": 1.6002833360412044e-05,
"loss": 0.4394,
"step": 485
},
{
"epoch": 0.7326172979084229,
"grad_norm": 0.5521662619957021,
"learning_rate": 1.5981751328333036e-05,
"loss": 0.4568,
"step": 486
},
{
"epoch": 0.7341247409082344,
"grad_norm": 0.4814653766664411,
"learning_rate": 1.5960627815469486e-05,
"loss": 0.4066,
"step": 487
},
{
"epoch": 0.735632183908046,
"grad_norm": 0.5109256400558994,
"learning_rate": 1.5939462968303554e-05,
"loss": 0.4272,
"step": 488
},
{
"epoch": 0.7371396269078575,
"grad_norm": 0.5357957318401174,
"learning_rate": 1.5918256933604047e-05,
"loss": 0.4237,
"step": 489
},
{
"epoch": 0.7386470699076692,
"grad_norm": 0.5396229844011063,
"learning_rate": 1.589700985842538e-05,
"loss": 0.4205,
"step": 490
},
{
"epoch": 0.7401545129074807,
"grad_norm": 0.5056971418930007,
"learning_rate": 1.5875721890106574e-05,
"loss": 0.4558,
"step": 491
},
{
"epoch": 0.7416619559072922,
"grad_norm": 0.5466763607345122,
"learning_rate": 1.5854393176270205e-05,
"loss": 0.4262,
"step": 492
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.5318696480713733,
"learning_rate": 1.5833023864821427e-05,
"loss": 0.4222,
"step": 493
},
{
"epoch": 0.7446768419069154,
"grad_norm": 0.5577732122364522,
"learning_rate": 1.5811614103946905e-05,
"loss": 0.4643,
"step": 494
},
{
"epoch": 0.746184284906727,
"grad_norm": 0.5396811070945262,
"learning_rate": 1.5790164042113805e-05,
"loss": 0.4619,
"step": 495
},
{
"epoch": 0.7476917279065385,
"grad_norm": 0.5116348501037207,
"learning_rate": 1.576867382806877e-05,
"loss": 0.4257,
"step": 496
},
{
"epoch": 0.7491991709063501,
"grad_norm": 0.5376269628887883,
"learning_rate": 1.5747143610836873e-05,
"loss": 0.4431,
"step": 497
},
{
"epoch": 0.7507066139061617,
"grad_norm": 0.5552456121649234,
"learning_rate": 1.5725573539720592e-05,
"loss": 0.4345,
"step": 498
},
{
"epoch": 0.7522140569059732,
"grad_norm": 0.5525594597252514,
"learning_rate": 1.570396376429877e-05,
"loss": 0.4288,
"step": 499
},
{
"epoch": 0.7537214999057849,
"grad_norm": 0.5130914024917077,
"learning_rate": 1.5682314434425593e-05,
"loss": 0.4506,
"step": 500
},
{
"epoch": 0.7552289429055964,
"grad_norm": 0.5438445066019086,
"learning_rate": 1.5660625700229526e-05,
"loss": 0.451,
"step": 501
},
{
"epoch": 0.7567363859054079,
"grad_norm": 0.5393532424898553,
"learning_rate": 1.5638897712112303e-05,
"loss": 0.4339,
"step": 502
},
{
"epoch": 0.7582438289052196,
"grad_norm": 0.5067131473915181,
"learning_rate": 1.561713062074785e-05,
"loss": 0.4452,
"step": 503
},
{
"epoch": 0.7597512719050311,
"grad_norm": 0.511705817056659,
"learning_rate": 1.5595324577081265e-05,
"loss": 0.4227,
"step": 504
},
{
"epoch": 0.7612587149048426,
"grad_norm": 0.5105016396695756,
"learning_rate": 1.5573479732327758e-05,
"loss": 0.4223,
"step": 505
},
{
"epoch": 0.7627661579046542,
"grad_norm": 0.508814615305124,
"learning_rate": 1.555159623797161e-05,
"loss": 0.4649,
"step": 506
},
{
"epoch": 0.7642736009044658,
"grad_norm": 0.5115538447430213,
"learning_rate": 1.552967424576512e-05,
"loss": 0.4257,
"step": 507
},
{
"epoch": 0.7657810439042774,
"grad_norm": 0.519809456344861,
"learning_rate": 1.5507713907727557e-05,
"loss": 0.4393,
"step": 508
},
{
"epoch": 0.7672884869040889,
"grad_norm": 0.5220982867467517,
"learning_rate": 1.5485715376144087e-05,
"loss": 0.4296,
"step": 509
},
{
"epoch": 0.7687959299039006,
"grad_norm": 0.4819994486336346,
"learning_rate": 1.5463678803564753e-05,
"loss": 0.4227,
"step": 510
},
{
"epoch": 0.7703033729037121,
"grad_norm": 0.5721785385849657,
"learning_rate": 1.5441604342803374e-05,
"loss": 0.4446,
"step": 511
},
{
"epoch": 0.7718108159035236,
"grad_norm": 0.5203314012229143,
"learning_rate": 1.5419492146936518e-05,
"loss": 0.4205,
"step": 512
},
{
"epoch": 0.7733182589033352,
"grad_norm": 0.5359755271436466,
"learning_rate": 1.5397342369302425e-05,
"loss": 0.4402,
"step": 513
},
{
"epoch": 0.7748257019031468,
"grad_norm": 0.5233100133672925,
"learning_rate": 1.5375155163499953e-05,
"loss": 0.4177,
"step": 514
},
{
"epoch": 0.7763331449029583,
"grad_norm": 0.5349268255121612,
"learning_rate": 1.5352930683387502e-05,
"loss": 0.4586,
"step": 515
},
{
"epoch": 0.7778405879027699,
"grad_norm": 0.5815705753331589,
"learning_rate": 1.5330669083081956e-05,
"loss": 0.4427,
"step": 516
},
{
"epoch": 0.7793480309025815,
"grad_norm": 0.49665657788094364,
"learning_rate": 1.5308370516957617e-05,
"loss": 0.4201,
"step": 517
},
{
"epoch": 0.7808554739023931,
"grad_norm": 0.5160010880115449,
"learning_rate": 1.528603513964511e-05,
"loss": 0.4261,
"step": 518
},
{
"epoch": 0.7823629169022046,
"grad_norm": 0.5468406227400142,
"learning_rate": 1.5263663106030347e-05,
"loss": 0.4116,
"step": 519
},
{
"epoch": 0.7838703599020163,
"grad_norm": 0.5236112386795565,
"learning_rate": 1.5241254571253433e-05,
"loss": 0.4317,
"step": 520
},
{
"epoch": 0.7853778029018278,
"grad_norm": 0.5715363020786929,
"learning_rate": 1.5218809690707583e-05,
"loss": 0.4288,
"step": 521
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.5191719390295657,
"learning_rate": 1.5196328620038059e-05,
"loss": 0.4126,
"step": 522
},
{
"epoch": 0.7883926889014509,
"grad_norm": 0.5236405890133281,
"learning_rate": 1.5173811515141083e-05,
"loss": 0.4024,
"step": 523
},
{
"epoch": 0.7899001319012625,
"grad_norm": 0.5689433953014548,
"learning_rate": 1.5151258532162771e-05,
"loss": 0.4377,
"step": 524
},
{
"epoch": 0.791407574901074,
"grad_norm": 0.5352416985872532,
"learning_rate": 1.5128669827498024e-05,
"loss": 0.4354,
"step": 525
},
{
"epoch": 0.7929150179008856,
"grad_norm": 0.5274897373659767,
"learning_rate": 1.5106045557789453e-05,
"loss": 0.4391,
"step": 526
},
{
"epoch": 0.7944224609006972,
"grad_norm": 0.5240353462138522,
"learning_rate": 1.5083385879926309e-05,
"loss": 0.4461,
"step": 527
},
{
"epoch": 0.7959299039005088,
"grad_norm": 0.5040339622037668,
"learning_rate": 1.5060690951043385e-05,
"loss": 0.428,
"step": 528
},
{
"epoch": 0.7974373469003203,
"grad_norm": 0.6056664440579997,
"learning_rate": 1.5037960928519902e-05,
"loss": 0.4667,
"step": 529
},
{
"epoch": 0.7989447899001318,
"grad_norm": 0.5064874652403102,
"learning_rate": 1.501519596997847e-05,
"loss": 0.4174,
"step": 530
},
{
"epoch": 0.8004522328999435,
"grad_norm": 0.5178815992344113,
"learning_rate": 1.499239623328394e-05,
"loss": 0.4143,
"step": 531
},
{
"epoch": 0.801959675899755,
"grad_norm": 0.5433275328773004,
"learning_rate": 1.4969561876542348e-05,
"loss": 0.4308,
"step": 532
},
{
"epoch": 0.8034671188995666,
"grad_norm": 0.5067490187395532,
"learning_rate": 1.4946693058099802e-05,
"loss": 0.4383,
"step": 533
},
{
"epoch": 0.8049745618993782,
"grad_norm": 0.49712616081242367,
"learning_rate": 1.4923789936541378e-05,
"loss": 0.423,
"step": 534
},
{
"epoch": 0.8064820048991898,
"grad_norm": 0.5142222567824052,
"learning_rate": 1.4900852670690044e-05,
"loss": 0.4427,
"step": 535
},
{
"epoch": 0.8079894478990013,
"grad_norm": 0.5138167933634391,
"learning_rate": 1.487788141960553e-05,
"loss": 0.426,
"step": 536
},
{
"epoch": 0.8094968908988129,
"grad_norm": 0.49938679145962556,
"learning_rate": 1.4854876342583246e-05,
"loss": 0.4116,
"step": 537
},
{
"epoch": 0.8110043338986245,
"grad_norm": 0.5630302514996013,
"learning_rate": 1.4831837599153165e-05,
"loss": 0.4569,
"step": 538
},
{
"epoch": 0.812511776898436,
"grad_norm": 0.5068845911186761,
"learning_rate": 1.4808765349078729e-05,
"loss": 0.4174,
"step": 539
},
{
"epoch": 0.8140192198982475,
"grad_norm": 0.5402742918446363,
"learning_rate": 1.4785659752355724e-05,
"loss": 0.4046,
"step": 540
},
{
"epoch": 0.8155266628980592,
"grad_norm": 0.5486844481668101,
"learning_rate": 1.4762520969211186e-05,
"loss": 0.4225,
"step": 541
},
{
"epoch": 0.8170341058978707,
"grad_norm": 0.5290035366810187,
"learning_rate": 1.4739349160102285e-05,
"loss": 0.4378,
"step": 542
},
{
"epoch": 0.8185415488976823,
"grad_norm": 0.5374079241254692,
"learning_rate": 1.4716144485715209e-05,
"loss": 0.4299,
"step": 543
},
{
"epoch": 0.8200489918974939,
"grad_norm": 0.4778906030205072,
"learning_rate": 1.4692907106964051e-05,
"loss": 0.3992,
"step": 544
},
{
"epoch": 0.8215564348973055,
"grad_norm": 0.49060078784195343,
"learning_rate": 1.4669637184989696e-05,
"loss": 0.4243,
"step": 545
},
{
"epoch": 0.823063877897117,
"grad_norm": 0.5253862030306666,
"learning_rate": 1.4646334881158704e-05,
"loss": 0.4236,
"step": 546
},
{
"epoch": 0.8245713208969286,
"grad_norm": 0.5215051723939326,
"learning_rate": 1.4623000357062184e-05,
"loss": 0.4274,
"step": 547
},
{
"epoch": 0.8260787638967402,
"grad_norm": 0.5071119070406966,
"learning_rate": 1.459963377451468e-05,
"loss": 0.4081,
"step": 548
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.5180772114309931,
"learning_rate": 1.457623529555305e-05,
"loss": 0.4228,
"step": 549
},
{
"epoch": 0.8290936498963632,
"grad_norm": 0.5198434876057629,
"learning_rate": 1.4552805082435333e-05,
"loss": 0.4328,
"step": 550
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.53696356685593,
"learning_rate": 1.4529343297639638e-05,
"loss": 0.4311,
"step": 551
},
{
"epoch": 0.8321085358959864,
"grad_norm": 0.5522072703618133,
"learning_rate": 1.4505850103863007e-05,
"loss": 0.4441,
"step": 552
},
{
"epoch": 0.833615978895798,
"grad_norm": 0.5022303098504759,
"learning_rate": 1.448232566402028e-05,
"loss": 0.4233,
"step": 553
},
{
"epoch": 0.8351234218956096,
"grad_norm": 0.5522095422296431,
"learning_rate": 1.4458770141242992e-05,
"loss": 0.4333,
"step": 554
},
{
"epoch": 0.8366308648954212,
"grad_norm": 0.5232096502230357,
"learning_rate": 1.4435183698878212e-05,
"loss": 0.4286,
"step": 555
},
{
"epoch": 0.8381383078952327,
"grad_norm": 0.46986995612699417,
"learning_rate": 1.4411566500487425e-05,
"loss": 0.4049,
"step": 556
},
{
"epoch": 0.8396457508950442,
"grad_norm": 0.5077507527784849,
"learning_rate": 1.4387918709845395e-05,
"loss": 0.4144,
"step": 557
},
{
"epoch": 0.8411531938948559,
"grad_norm": 0.5253570052023816,
"learning_rate": 1.4364240490939032e-05,
"loss": 0.4547,
"step": 558
},
{
"epoch": 0.8426606368946674,
"grad_norm": 0.49921819408434215,
"learning_rate": 1.4340532007966252e-05,
"loss": 0.3949,
"step": 559
},
{
"epoch": 0.844168079894479,
"grad_norm": 0.5411234788441551,
"learning_rate": 1.4316793425334836e-05,
"loss": 0.4445,
"step": 560
},
{
"epoch": 0.8456755228942906,
"grad_norm": 0.5264546536830835,
"learning_rate": 1.4293024907661295e-05,
"loss": 0.4117,
"step": 561
},
{
"epoch": 0.8471829658941021,
"grad_norm": 0.518655972625287,
"learning_rate": 1.4269226619769727e-05,
"loss": 0.4159,
"step": 562
},
{
"epoch": 0.8486904088939137,
"grad_norm": 0.537382287002897,
"learning_rate": 1.424539872669067e-05,
"loss": 0.4395,
"step": 563
},
{
"epoch": 0.8501978518937253,
"grad_norm": 0.4871628601960703,
"learning_rate": 1.4221541393659966e-05,
"loss": 0.4244,
"step": 564
},
{
"epoch": 0.8517052948935369,
"grad_norm": 0.5323818502275258,
"learning_rate": 1.4197654786117604e-05,
"loss": 0.442,
"step": 565
},
{
"epoch": 0.8532127378933484,
"grad_norm": 0.49211277864065,
"learning_rate": 1.4173739069706586e-05,
"loss": 0.4333,
"step": 566
},
{
"epoch": 0.8547201808931599,
"grad_norm": 0.5016763716077036,
"learning_rate": 1.414979441027176e-05,
"loss": 0.4223,
"step": 567
},
{
"epoch": 0.8562276238929716,
"grad_norm": 0.5072197589397037,
"learning_rate": 1.4125820973858693e-05,
"loss": 0.4166,
"step": 568
},
{
"epoch": 0.8577350668927831,
"grad_norm": 0.5379841247223495,
"learning_rate": 1.41018189267125e-05,
"loss": 0.4457,
"step": 569
},
{
"epoch": 0.8592425098925947,
"grad_norm": 0.5156171430561991,
"learning_rate": 1.4077788435276701e-05,
"loss": 0.4154,
"step": 570
},
{
"epoch": 0.8607499528924063,
"grad_norm": 0.5377878469372074,
"learning_rate": 1.4053729666192067e-05,
"loss": 0.4437,
"step": 571
},
{
"epoch": 0.8622573958922178,
"grad_norm": 0.5606843337820052,
"learning_rate": 1.4029642786295452e-05,
"loss": 0.4479,
"step": 572
},
{
"epoch": 0.8637648388920294,
"grad_norm": 0.4989731388746451,
"learning_rate": 1.400552796261866e-05,
"loss": 0.407,
"step": 573
},
{
"epoch": 0.8652722818918409,
"grad_norm": 0.5136932503470173,
"learning_rate": 1.3981385362387268e-05,
"loss": 0.4211,
"step": 574
},
{
"epoch": 0.8667797248916526,
"grad_norm": 0.495625389098895,
"learning_rate": 1.3957215153019463e-05,
"loss": 0.4203,
"step": 575
},
{
"epoch": 0.8682871678914641,
"grad_norm": 0.49590492700182753,
"learning_rate": 1.3933017502124897e-05,
"loss": 0.4123,
"step": 576
},
{
"epoch": 0.8697946108912756,
"grad_norm": 0.5389299185456149,
"learning_rate": 1.3908792577503514e-05,
"loss": 0.4309,
"step": 577
},
{
"epoch": 0.8713020538910873,
"grad_norm": 0.5014871721652727,
"learning_rate": 1.3884540547144393e-05,
"loss": 0.4159,
"step": 578
},
{
"epoch": 0.8728094968908988,
"grad_norm": 0.49719473763201644,
"learning_rate": 1.3860261579224574e-05,
"loss": 0.4191,
"step": 579
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.5102002869995407,
"learning_rate": 1.3835955842107897e-05,
"loss": 0.418,
"step": 580
},
{
"epoch": 0.875824382890522,
"grad_norm": 0.497268362475834,
"learning_rate": 1.3811623504343845e-05,
"loss": 0.4092,
"step": 581
},
{
"epoch": 0.8773318258903335,
"grad_norm": 0.49466892349875324,
"learning_rate": 1.378726473466635e-05,
"loss": 0.4154,
"step": 582
},
{
"epoch": 0.8788392688901451,
"grad_norm": 0.5485556900908343,
"learning_rate": 1.3762879701992642e-05,
"loss": 0.4327,
"step": 583
},
{
"epoch": 0.8803467118899566,
"grad_norm": 0.49193915962474927,
"learning_rate": 1.373846857542208e-05,
"loss": 0.4233,
"step": 584
},
{
"epoch": 0.8818541548897683,
"grad_norm": 0.49157440442050665,
"learning_rate": 1.3714031524234965e-05,
"loss": 0.4255,
"step": 585
},
{
"epoch": 0.8833615978895798,
"grad_norm": 0.5153566919676954,
"learning_rate": 1.3689568717891381e-05,
"loss": 0.4433,
"step": 586
},
{
"epoch": 0.8848690408893913,
"grad_norm": 0.5151771531878016,
"learning_rate": 1.3665080326029997e-05,
"loss": 0.4313,
"step": 587
},
{
"epoch": 0.886376483889203,
"grad_norm": 0.5172038128981158,
"learning_rate": 1.364056651846693e-05,
"loss": 0.4025,
"step": 588
},
{
"epoch": 0.8878839268890145,
"grad_norm": 0.5197034910270297,
"learning_rate": 1.3616027465194525e-05,
"loss": 0.432,
"step": 589
},
{
"epoch": 0.889391369888826,
"grad_norm": 0.5280686849313844,
"learning_rate": 1.35914633363802e-05,
"loss": 0.4093,
"step": 590
},
{
"epoch": 0.8908988128886377,
"grad_norm": 0.5192774851448931,
"learning_rate": 1.356687430236526e-05,
"loss": 0.426,
"step": 591
},
{
"epoch": 0.8924062558884492,
"grad_norm": 0.5407059497728999,
"learning_rate": 1.3542260533663723e-05,
"loss": 0.4408,
"step": 592
},
{
"epoch": 0.8939136988882608,
"grad_norm": 0.5029787366533781,
"learning_rate": 1.351762220096112e-05,
"loss": 0.4134,
"step": 593
},
{
"epoch": 0.8954211418880723,
"grad_norm": 0.5557133502339159,
"learning_rate": 1.3492959475113332e-05,
"loss": 0.4247,
"step": 594
},
{
"epoch": 0.896928584887884,
"grad_norm": 0.5446161829977666,
"learning_rate": 1.3468272527145388e-05,
"loss": 0.4133,
"step": 595
},
{
"epoch": 0.8984360278876955,
"grad_norm": 0.5055328441209378,
"learning_rate": 1.3443561528250295e-05,
"loss": 0.3916,
"step": 596
},
{
"epoch": 0.899943470887507,
"grad_norm": 0.5874519416857665,
"learning_rate": 1.3418826649787834e-05,
"loss": 0.4339,
"step": 597
},
{
"epoch": 0.9014509138873187,
"grad_norm": 0.5577170031704589,
"learning_rate": 1.3394068063283387e-05,
"loss": 0.458,
"step": 598
},
{
"epoch": 0.9029583568871302,
"grad_norm": 0.5332814444729285,
"learning_rate": 1.3369285940426737e-05,
"loss": 0.4206,
"step": 599
},
{
"epoch": 0.9044657998869418,
"grad_norm": 0.5654643143753597,
"learning_rate": 1.334448045307088e-05,
"loss": 0.4113,
"step": 600
},
{
"epoch": 0.9059732428867533,
"grad_norm": 0.4979334800098818,
"learning_rate": 1.331965177323084e-05,
"loss": 0.4093,
"step": 601
},
{
"epoch": 0.9074806858865649,
"grad_norm": 0.5415874467915235,
"learning_rate": 1.3294800073082464e-05,
"loss": 0.4366,
"step": 602
},
{
"epoch": 0.9089881288863765,
"grad_norm": 0.5813207766062746,
"learning_rate": 1.3269925524961237e-05,
"loss": 0.4448,
"step": 603
},
{
"epoch": 0.910495571886188,
"grad_norm": 0.5078359282634053,
"learning_rate": 1.3245028301361086e-05,
"loss": 0.4161,
"step": 604
},
{
"epoch": 0.9120030148859997,
"grad_norm": 0.5539022471684321,
"learning_rate": 1.3220108574933185e-05,
"loss": 0.4056,
"step": 605
},
{
"epoch": 0.9135104578858112,
"grad_norm": 0.48460567118259956,
"learning_rate": 1.3195166518484748e-05,
"loss": 0.4009,
"step": 606
},
{
"epoch": 0.9150179008856227,
"grad_norm": 0.4843343744091719,
"learning_rate": 1.317020230497784e-05,
"loss": 0.4231,
"step": 607
},
{
"epoch": 0.9165253438854344,
"grad_norm": 0.5190197613843625,
"learning_rate": 1.3145216107528178e-05,
"loss": 0.4029,
"step": 608
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.4867573763184133,
"learning_rate": 1.3120208099403926e-05,
"loss": 0.3801,
"step": 609
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.48900894299598635,
"learning_rate": 1.3095178454024496e-05,
"loss": 0.4413,
"step": 610
},
{
"epoch": 0.921047672884869,
"grad_norm": 0.5555266761898254,
"learning_rate": 1.3070127344959348e-05,
"loss": 0.4144,
"step": 611
},
{
"epoch": 0.9225551158846806,
"grad_norm": 0.5128649153965221,
"learning_rate": 1.3045054945926775e-05,
"loss": 0.4616,
"step": 612
},
{
"epoch": 0.9240625588844922,
"grad_norm": 0.5194503259126656,
"learning_rate": 1.3019961430792711e-05,
"loss": 0.4229,
"step": 613
},
{
"epoch": 0.9255700018843037,
"grad_norm": 0.49285532678009114,
"learning_rate": 1.2994846973569524e-05,
"loss": 0.4165,
"step": 614
},
{
"epoch": 0.9270774448841154,
"grad_norm": 0.5197963588456296,
"learning_rate": 1.2969711748414804e-05,
"loss": 0.3947,
"step": 615
},
{
"epoch": 0.9285848878839269,
"grad_norm": 0.542725727252665,
"learning_rate": 1.2944555929630152e-05,
"loss": 0.4261,
"step": 616
},
{
"epoch": 0.9300923308837384,
"grad_norm": 0.5068570325444082,
"learning_rate": 1.2919379691659979e-05,
"loss": 0.453,
"step": 617
},
{
"epoch": 0.93159977388355,
"grad_norm": 0.5138431602453551,
"learning_rate": 1.2894183209090304e-05,
"loss": 0.4482,
"step": 618
},
{
"epoch": 0.9331072168833616,
"grad_norm": 0.5098264236378465,
"learning_rate": 1.2868966656647522e-05,
"loss": 0.4344,
"step": 619
},
{
"epoch": 0.9346146598831732,
"grad_norm": 0.4932368518544031,
"learning_rate": 1.2843730209197203e-05,
"loss": 0.4444,
"step": 620
},
{
"epoch": 0.9361221028829847,
"grad_norm": 0.48787838834596486,
"learning_rate": 1.2818474041742885e-05,
"loss": 0.3909,
"step": 621
},
{
"epoch": 0.9376295458827963,
"grad_norm": 0.5042148044417084,
"learning_rate": 1.2793198329424858e-05,
"loss": 0.4114,
"step": 622
},
{
"epoch": 0.9391369888826079,
"grad_norm": 0.5164275014163481,
"learning_rate": 1.2767903247518945e-05,
"loss": 0.4042,
"step": 623
},
{
"epoch": 0.9406444318824194,
"grad_norm": 0.4878553181808082,
"learning_rate": 1.2742588971435276e-05,
"loss": 0.4108,
"step": 624
},
{
"epoch": 0.9421518748822311,
"grad_norm": 0.4953872026297146,
"learning_rate": 1.2717255676717106e-05,
"loss": 0.4227,
"step": 625
},
{
"epoch": 0.9436593178820426,
"grad_norm": 0.5623597137703112,
"learning_rate": 1.2691903539039563e-05,
"loss": 0.4436,
"step": 626
},
{
"epoch": 0.9451667608818541,
"grad_norm": 0.539298059881258,
"learning_rate": 1.2666532734208437e-05,
"loss": 0.4384,
"step": 627
},
{
"epoch": 0.9466742038816657,
"grad_norm": 0.5443120200340641,
"learning_rate": 1.264114343815898e-05,
"loss": 0.4413,
"step": 628
},
{
"epoch": 0.9481816468814773,
"grad_norm": 0.5142650264217846,
"learning_rate": 1.2615735826954664e-05,
"loss": 0.4231,
"step": 629
},
{
"epoch": 0.9496890898812889,
"grad_norm": 0.5566560995617864,
"learning_rate": 1.2590310076785974e-05,
"loss": 0.4458,
"step": 630
},
{
"epoch": 0.9511965328811004,
"grad_norm": 0.484643722468428,
"learning_rate": 1.256486636396917e-05,
"loss": 0.3868,
"step": 631
},
{
"epoch": 0.952703975880912,
"grad_norm": 0.5278211197592041,
"learning_rate": 1.2539404864945087e-05,
"loss": 0.3956,
"step": 632
},
{
"epoch": 0.9542114188807236,
"grad_norm": 0.5339784329738423,
"learning_rate": 1.2513925756277894e-05,
"loss": 0.4065,
"step": 633
},
{
"epoch": 0.9557188618805351,
"grad_norm": 0.4808436521240299,
"learning_rate": 1.2488429214653871e-05,
"loss": 0.3733,
"step": 634
},
{
"epoch": 0.9572263048803467,
"grad_norm": 0.5245674565988473,
"learning_rate": 1.24629154168802e-05,
"loss": 0.4206,
"step": 635
},
{
"epoch": 0.9587337478801583,
"grad_norm": 0.5091922264135481,
"learning_rate": 1.2437384539883715e-05,
"loss": 0.4321,
"step": 636
},
{
"epoch": 0.9602411908799698,
"grad_norm": 0.48729820029525145,
"learning_rate": 1.2411836760709686e-05,
"loss": 0.3961,
"step": 637
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.5224677796102979,
"learning_rate": 1.2386272256520606e-05,
"loss": 0.4094,
"step": 638
},
{
"epoch": 0.963256076879593,
"grad_norm": 0.5291193644566966,
"learning_rate": 1.2360691204594937e-05,
"loss": 0.4202,
"step": 639
},
{
"epoch": 0.9647635198794046,
"grad_norm": 0.5090746283917961,
"learning_rate": 1.2335093782325889e-05,
"loss": 0.4115,
"step": 640
},
{
"epoch": 0.9662709628792161,
"grad_norm": 0.49844277614657384,
"learning_rate": 1.2309480167220203e-05,
"loss": 0.4138,
"step": 641
},
{
"epoch": 0.9677784058790277,
"grad_norm": 0.5085446797250271,
"learning_rate": 1.2283850536896907e-05,
"loss": 0.4403,
"step": 642
},
{
"epoch": 0.9692858488788393,
"grad_norm": 0.48811956114780947,
"learning_rate": 1.2258205069086082e-05,
"loss": 0.4132,
"step": 643
},
{
"epoch": 0.9707932918786508,
"grad_norm": 0.5804699645229868,
"learning_rate": 1.2232543941627641e-05,
"loss": 0.4145,
"step": 644
},
{
"epoch": 0.9723007348784624,
"grad_norm": 0.5223286630706884,
"learning_rate": 1.2206867332470091e-05,
"loss": 0.4451,
"step": 645
},
{
"epoch": 0.973808177878274,
"grad_norm": 0.5431240213202171,
"learning_rate": 1.2181175419669293e-05,
"loss": 0.4106,
"step": 646
},
{
"epoch": 0.9753156208780855,
"grad_norm": 0.4788749668502741,
"learning_rate": 1.215546838138723e-05,
"loss": 0.3947,
"step": 647
},
{
"epoch": 0.9768230638778971,
"grad_norm": 0.4823666614879542,
"learning_rate": 1.212974639589078e-05,
"loss": 0.3805,
"step": 648
},
{
"epoch": 0.9783305068777087,
"grad_norm": 0.5272835049687891,
"learning_rate": 1.2104009641550472e-05,
"loss": 0.4192,
"step": 649
},
{
"epoch": 0.9798379498775203,
"grad_norm": 0.4899435333806439,
"learning_rate": 1.2078258296839245e-05,
"loss": 0.4242,
"step": 650
},
{
"epoch": 0.9813453928773318,
"grad_norm": 0.48267520902055755,
"learning_rate": 1.2052492540331218e-05,
"loss": 0.3819,
"step": 651
},
{
"epoch": 0.9828528358771434,
"grad_norm": 0.5208472855722491,
"learning_rate": 1.2026712550700457e-05,
"loss": 0.4268,
"step": 652
},
{
"epoch": 0.984360278876955,
"grad_norm": 0.5182048450359825,
"learning_rate": 1.200091850671972e-05,
"loss": 0.3833,
"step": 653
},
{
"epoch": 0.9858677218767665,
"grad_norm": 0.5524884939555313,
"learning_rate": 1.1975110587259222e-05,
"loss": 0.4099,
"step": 654
},
{
"epoch": 0.9873751648765781,
"grad_norm": 0.5724743146915252,
"learning_rate": 1.1949288971285411e-05,
"loss": 0.4451,
"step": 655
},
{
"epoch": 0.9888826078763897,
"grad_norm": 0.5560489536019798,
"learning_rate": 1.1923453837859706e-05,
"loss": 0.4245,
"step": 656
},
{
"epoch": 0.9903900508762012,
"grad_norm": 0.5241693566205756,
"learning_rate": 1.1897605366137264e-05,
"loss": 0.426,
"step": 657
},
{
"epoch": 0.9918974938760128,
"grad_norm": 0.5078011601273249,
"learning_rate": 1.1871743735365735e-05,
"loss": 0.4147,
"step": 658
},
{
"epoch": 0.9934049368758244,
"grad_norm": 0.5014207467428378,
"learning_rate": 1.1845869124884027e-05,
"loss": 0.4029,
"step": 659
},
{
"epoch": 0.994912379875636,
"grad_norm": 0.5184450473918536,
"learning_rate": 1.1819981714121054e-05,
"loss": 0.4338,
"step": 660
},
{
"epoch": 0.9964198228754475,
"grad_norm": 0.5218529509897015,
"learning_rate": 1.1794081682594491e-05,
"loss": 0.4001,
"step": 661
},
{
"epoch": 0.997927265875259,
"grad_norm": 0.5277285874094648,
"learning_rate": 1.176816920990954e-05,
"loss": 0.4225,
"step": 662
},
{
"epoch": 0.9994347088750707,
"grad_norm": 0.506018413554039,
"learning_rate": 1.174224447575767e-05,
"loss": 0.4398,
"step": 663
},
{
"epoch": 1.0009421518748822,
"grad_norm": 0.6655724719416495,
"learning_rate": 1.171630765991538e-05,
"loss": 0.377,
"step": 664
},
{
"epoch": 1.0024495948746939,
"grad_norm": 0.5752880840432146,
"learning_rate": 1.169035894224295e-05,
"loss": 0.325,
"step": 665
},
{
"epoch": 1.0039570378745053,
"grad_norm": 0.547046172496627,
"learning_rate": 1.1664398502683194e-05,
"loss": 0.3422,
"step": 666
},
{
"epoch": 1.005464480874317,
"grad_norm": 0.6183956576461548,
"learning_rate": 1.1638426521260211e-05,
"loss": 0.3551,
"step": 667
},
{
"epoch": 1.0069719238741286,
"grad_norm": 0.6272202909322583,
"learning_rate": 1.1612443178078138e-05,
"loss": 0.3293,
"step": 668
},
{
"epoch": 1.00847936687394,
"grad_norm": 0.6733584649632783,
"learning_rate": 1.1586448653319908e-05,
"loss": 0.3495,
"step": 669
},
{
"epoch": 1.0099868098737517,
"grad_norm": 0.5974677434978642,
"learning_rate": 1.156044312724598e-05,
"loss": 0.3339,
"step": 670
},
{
"epoch": 1.0114942528735633,
"grad_norm": 0.5526640261136243,
"learning_rate": 1.153442678019311e-05,
"loss": 0.3629,
"step": 671
},
{
"epoch": 1.0130016958733747,
"grad_norm": 0.5666634240071511,
"learning_rate": 1.1508399792573095e-05,
"loss": 0.3361,
"step": 672
},
{
"epoch": 1.0145091388731864,
"grad_norm": 0.6224882966351409,
"learning_rate": 1.1482362344871514e-05,
"loss": 0.3545,
"step": 673
},
{
"epoch": 1.0160165818729978,
"grad_norm": 0.5615749930186623,
"learning_rate": 1.1456314617646482e-05,
"loss": 0.3208,
"step": 674
},
{
"epoch": 1.0175240248728095,
"grad_norm": 0.548490348465347,
"learning_rate": 1.1430256791527406e-05,
"loss": 0.3278,
"step": 675
},
{
"epoch": 1.019031467872621,
"grad_norm": 0.6137191595237155,
"learning_rate": 1.1404189047213716e-05,
"loss": 0.3684,
"step": 676
},
{
"epoch": 1.0205389108724325,
"grad_norm": 0.6128432091688398,
"learning_rate": 1.137811156547362e-05,
"loss": 0.3479,
"step": 677
},
{
"epoch": 1.0220463538722442,
"grad_norm": 0.5530398492501923,
"learning_rate": 1.1352024527142855e-05,
"loss": 0.3258,
"step": 678
},
{
"epoch": 1.0235537968720558,
"grad_norm": 0.5691801541559598,
"learning_rate": 1.1325928113123431e-05,
"loss": 0.3359,
"step": 679
},
{
"epoch": 1.0250612398718673,
"grad_norm": 0.5996898750429057,
"learning_rate": 1.129982250438237e-05,
"loss": 0.34,
"step": 680
},
{
"epoch": 1.026568682871679,
"grad_norm": 0.5203916917045198,
"learning_rate": 1.1273707881950445e-05,
"loss": 0.3194,
"step": 681
},
{
"epoch": 1.0280761258714906,
"grad_norm": 0.5706678991613441,
"learning_rate": 1.1247584426920962e-05,
"loss": 0.3394,
"step": 682
},
{
"epoch": 1.029583568871302,
"grad_norm": 0.558797423405198,
"learning_rate": 1.1221452320448449e-05,
"loss": 0.3476,
"step": 683
},
{
"epoch": 1.0310910118711136,
"grad_norm": 0.5491796357132722,
"learning_rate": 1.1195311743747445e-05,
"loss": 0.3287,
"step": 684
},
{
"epoch": 1.0325984548709253,
"grad_norm": 0.5423270097914835,
"learning_rate": 1.116916287809122e-05,
"loss": 0.3315,
"step": 685
},
{
"epoch": 1.0341058978707367,
"grad_norm": 0.5440784988767636,
"learning_rate": 1.1143005904810527e-05,
"loss": 0.3409,
"step": 686
},
{
"epoch": 1.0356133408705483,
"grad_norm": 0.5506460404964368,
"learning_rate": 1.1116841005292339e-05,
"loss": 0.3665,
"step": 687
},
{
"epoch": 1.03712078387036,
"grad_norm": 0.5271450898091751,
"learning_rate": 1.1090668360978589e-05,
"loss": 0.3354,
"step": 688
},
{
"epoch": 1.0386282268701714,
"grad_norm": 0.5116723363561022,
"learning_rate": 1.106448815336493e-05,
"loss": 0.3055,
"step": 689
},
{
"epoch": 1.040135669869983,
"grad_norm": 0.5261827472069973,
"learning_rate": 1.1038300563999455e-05,
"loss": 0.3141,
"step": 690
},
{
"epoch": 1.0416431128697947,
"grad_norm": 0.5675715863653521,
"learning_rate": 1.1012105774481446e-05,
"loss": 0.3576,
"step": 691
},
{
"epoch": 1.0431505558696061,
"grad_norm": 0.542765155631167,
"learning_rate": 1.0985903966460115e-05,
"loss": 0.337,
"step": 692
},
{
"epoch": 1.0446579988694178,
"grad_norm": 0.576467518182856,
"learning_rate": 1.0959695321633346e-05,
"loss": 0.3345,
"step": 693
},
{
"epoch": 1.0461654418692292,
"grad_norm": 0.5261227763098979,
"learning_rate": 1.0933480021746432e-05,
"loss": 0.3137,
"step": 694
},
{
"epoch": 1.0476728848690409,
"grad_norm": 0.5529375328569147,
"learning_rate": 1.0907258248590816e-05,
"loss": 0.332,
"step": 695
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.5136240223834705,
"learning_rate": 1.0881030184002827e-05,
"loss": 0.3276,
"step": 696
},
{
"epoch": 1.050687770868664,
"grad_norm": 0.5367848385477425,
"learning_rate": 1.0854796009862434e-05,
"loss": 0.3163,
"step": 697
},
{
"epoch": 1.0521952138684756,
"grad_norm": 0.544930166455388,
"learning_rate": 1.0828555908091958e-05,
"loss": 0.359,
"step": 698
},
{
"epoch": 1.0537026568682872,
"grad_norm": 0.5387564352002492,
"learning_rate": 1.0802310060654832e-05,
"loss": 0.339,
"step": 699
},
{
"epoch": 1.0552100998680987,
"grad_norm": 0.5496802508408758,
"learning_rate": 1.0776058649554336e-05,
"loss": 0.3535,
"step": 700
},
{
"epoch": 1.0567175428679103,
"grad_norm": 0.5348548485090446,
"learning_rate": 1.0749801856832325e-05,
"loss": 0.3368,
"step": 701
},
{
"epoch": 1.058224985867722,
"grad_norm": 0.5794289951348468,
"learning_rate": 1.0723539864567983e-05,
"loss": 0.3596,
"step": 702
},
{
"epoch": 1.0597324288675334,
"grad_norm": 0.5365708234277743,
"learning_rate": 1.0697272854876537e-05,
"loss": 0.3421,
"step": 703
},
{
"epoch": 1.061239871867345,
"grad_norm": 0.5904399117303262,
"learning_rate": 1.0671001009908015e-05,
"loss": 0.3348,
"step": 704
},
{
"epoch": 1.0627473148671567,
"grad_norm": 0.5204976732643493,
"learning_rate": 1.0644724511845976e-05,
"loss": 0.3525,
"step": 705
},
{
"epoch": 1.064254757866968,
"grad_norm": 0.6120309774969117,
"learning_rate": 1.0618443542906251e-05,
"loss": 0.3727,
"step": 706
},
{
"epoch": 1.0657622008667798,
"grad_norm": 0.6091575812702822,
"learning_rate": 1.059215828533566e-05,
"loss": 0.3588,
"step": 707
},
{
"epoch": 1.0672696438665912,
"grad_norm": 0.591151755333861,
"learning_rate": 1.0565868921410776e-05,
"loss": 0.3363,
"step": 708
},
{
"epoch": 1.0687770868664028,
"grad_norm": 0.5984602535754296,
"learning_rate": 1.0539575633436645e-05,
"loss": 0.3616,
"step": 709
},
{
"epoch": 1.0702845298662145,
"grad_norm": 0.5604228857922577,
"learning_rate": 1.0513278603745523e-05,
"loss": 0.3398,
"step": 710
},
{
"epoch": 1.071791972866026,
"grad_norm": 0.5557030870304388,
"learning_rate": 1.0486978014695606e-05,
"loss": 0.338,
"step": 711
},
{
"epoch": 1.0732994158658375,
"grad_norm": 0.5730991612503363,
"learning_rate": 1.0460674048669783e-05,
"loss": 0.3219,
"step": 712
},
{
"epoch": 1.0748068588656492,
"grad_norm": 0.6009828081011681,
"learning_rate": 1.0434366888074363e-05,
"loss": 0.3237,
"step": 713
},
{
"epoch": 1.0763143018654606,
"grad_norm": 0.5386294130513889,
"learning_rate": 1.0408056715337797e-05,
"loss": 0.3391,
"step": 714
},
{
"epoch": 1.0778217448652723,
"grad_norm": 0.5345878263288965,
"learning_rate": 1.0381743712909424e-05,
"loss": 0.3384,
"step": 715
},
{
"epoch": 1.079329187865084,
"grad_norm": 0.6369538253688138,
"learning_rate": 1.0355428063258224e-05,
"loss": 0.35,
"step": 716
},
{
"epoch": 1.0808366308648953,
"grad_norm": 0.5615591275271141,
"learning_rate": 1.0329109948871512e-05,
"loss": 0.3467,
"step": 717
},
{
"epoch": 1.082344073864707,
"grad_norm": 0.6406352309238248,
"learning_rate": 1.0302789552253702e-05,
"loss": 0.3523,
"step": 718
},
{
"epoch": 1.0838515168645186,
"grad_norm": 0.5212977047595297,
"learning_rate": 1.0276467055925044e-05,
"loss": 0.3185,
"step": 719
},
{
"epoch": 1.08535895986433,
"grad_norm": 0.5443802073020193,
"learning_rate": 1.0250142642420335e-05,
"loss": 0.3396,
"step": 720
},
{
"epoch": 1.0868664028641417,
"grad_norm": 0.5516128695838226,
"learning_rate": 1.0223816494287675e-05,
"loss": 0.3199,
"step": 721
},
{
"epoch": 1.0883738458639534,
"grad_norm": 0.5459335385131995,
"learning_rate": 1.0197488794087188e-05,
"loss": 0.2979,
"step": 722
},
{
"epoch": 1.0898812888637648,
"grad_norm": 0.5660471338581954,
"learning_rate": 1.0171159724389766e-05,
"loss": 0.3578,
"step": 723
},
{
"epoch": 1.0913887318635764,
"grad_norm": 0.577383627814168,
"learning_rate": 1.0144829467775794e-05,
"loss": 0.3253,
"step": 724
},
{
"epoch": 1.092896174863388,
"grad_norm": 0.5656943231881854,
"learning_rate": 1.0118498206833886e-05,
"loss": 0.3559,
"step": 725
},
{
"epoch": 1.0944036178631995,
"grad_norm": 0.5427797556871369,
"learning_rate": 1.0092166124159628e-05,
"loss": 0.3299,
"step": 726
},
{
"epoch": 1.0959110608630112,
"grad_norm": 0.5582731085039236,
"learning_rate": 1.0065833402354302e-05,
"loss": 0.342,
"step": 727
},
{
"epoch": 1.0974185038628228,
"grad_norm": 0.5809252708008414,
"learning_rate": 1.003950022402361e-05,
"loss": 0.3553,
"step": 728
},
{
"epoch": 1.0989259468626342,
"grad_norm": 0.5400373499865376,
"learning_rate": 1.0013166771776441e-05,
"loss": 0.3283,
"step": 729
},
{
"epoch": 1.1004333898624459,
"grad_norm": 0.5280335723569519,
"learning_rate": 9.986833228223562e-06,
"loss": 0.3567,
"step": 730
},
{
"epoch": 1.1019408328622573,
"grad_norm": 0.5756207231701386,
"learning_rate": 9.96049977597639e-06,
"loss": 0.3422,
"step": 731
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.5917844968799806,
"learning_rate": 9.934166597645703e-06,
"loss": 0.3803,
"step": 732
},
{
"epoch": 1.1049557188618806,
"grad_norm": 0.5375048549376539,
"learning_rate": 9.907833875840374e-06,
"loss": 0.3421,
"step": 733
},
{
"epoch": 1.106463161861692,
"grad_norm": 0.5685857382900585,
"learning_rate": 9.881501793166117e-06,
"loss": 0.3658,
"step": 734
},
{
"epoch": 1.1079706048615037,
"grad_norm": 0.6069823667279429,
"learning_rate": 9.85517053222421e-06,
"loss": 0.3273,
"step": 735
},
{
"epoch": 1.1094780478613153,
"grad_norm": 0.5196609920900314,
"learning_rate": 9.82884027561024e-06,
"loss": 0.3233,
"step": 736
},
{
"epoch": 1.1109854908611267,
"grad_norm": 0.5702922246540342,
"learning_rate": 9.802511205912815e-06,
"loss": 0.35,
"step": 737
},
{
"epoch": 1.1124929338609384,
"grad_norm": 0.563216447988931,
"learning_rate": 9.776183505712327e-06,
"loss": 0.3578,
"step": 738
},
{
"epoch": 1.11400037686075,
"grad_norm": 0.5835000476343075,
"learning_rate": 9.749857357579667e-06,
"loss": 0.3753,
"step": 739
},
{
"epoch": 1.1155078198605615,
"grad_norm": 0.5822012862085456,
"learning_rate": 9.723532944074961e-06,
"loss": 0.3035,
"step": 740
},
{
"epoch": 1.1170152628603731,
"grad_norm": 0.5625362231656639,
"learning_rate": 9.6972104477463e-06,
"loss": 0.3669,
"step": 741
},
{
"epoch": 1.1185227058601848,
"grad_norm": 0.5816421569187623,
"learning_rate": 9.670890051128493e-06,
"loss": 0.3264,
"step": 742
},
{
"epoch": 1.1200301488599962,
"grad_norm": 0.6076866614497781,
"learning_rate": 9.644571936741778e-06,
"loss": 0.3448,
"step": 743
},
{
"epoch": 1.1215375918598078,
"grad_norm": 0.5868211335333723,
"learning_rate": 9.618256287090576e-06,
"loss": 0.3453,
"step": 744
},
{
"epoch": 1.1230450348596195,
"grad_norm": 0.5784910781884745,
"learning_rate": 9.591943284662206e-06,
"loss": 0.3543,
"step": 745
},
{
"epoch": 1.124552477859431,
"grad_norm": 0.5577968039251089,
"learning_rate": 9.56563311192564e-06,
"loss": 0.356,
"step": 746
},
{
"epoch": 1.1260599208592426,
"grad_norm": 0.5624603535612774,
"learning_rate": 9.53932595133022e-06,
"loss": 0.322,
"step": 747
},
{
"epoch": 1.127567363859054,
"grad_norm": 0.5863194939952109,
"learning_rate": 9.513021985304399e-06,
"loss": 0.341,
"step": 748
},
{
"epoch": 1.1290748068588656,
"grad_norm": 0.5297072497497793,
"learning_rate": 9.486721396254484e-06,
"loss": 0.3263,
"step": 749
},
{
"epoch": 1.1305822498586773,
"grad_norm": 0.5597259831895821,
"learning_rate": 9.460424366563355e-06,
"loss": 0.3243,
"step": 750
},
{
"epoch": 1.1320896928584887,
"grad_norm": 0.5464179018975297,
"learning_rate": 9.434131078589224e-06,
"loss": 0.3206,
"step": 751
},
{
"epoch": 1.1335971358583004,
"grad_norm": 0.5464450895465798,
"learning_rate": 9.407841714664343e-06,
"loss": 0.3387,
"step": 752
},
{
"epoch": 1.135104578858112,
"grad_norm": 0.5546542012199714,
"learning_rate": 9.381556457093752e-06,
"loss": 0.337,
"step": 753
},
{
"epoch": 1.1366120218579234,
"grad_norm": 0.5753540187155672,
"learning_rate": 9.355275488154025e-06,
"loss": 0.3644,
"step": 754
},
{
"epoch": 1.138119464857735,
"grad_norm": 0.571883771055452,
"learning_rate": 9.32899899009199e-06,
"loss": 0.332,
"step": 755
},
{
"epoch": 1.1396269078575467,
"grad_norm": 0.5383157533846678,
"learning_rate": 9.30272714512347e-06,
"loss": 0.3397,
"step": 756
},
{
"epoch": 1.1411343508573581,
"grad_norm": 0.564086061412075,
"learning_rate": 9.276460135432019e-06,
"loss": 0.3592,
"step": 757
},
{
"epoch": 1.1426417938571698,
"grad_norm": 0.510671608636206,
"learning_rate": 9.250198143167675e-06,
"loss": 0.3301,
"step": 758
},
{
"epoch": 1.1441492368569812,
"grad_norm": 0.5323793942216957,
"learning_rate": 9.223941350445666e-06,
"loss": 0.3341,
"step": 759
},
{
"epoch": 1.1456566798567929,
"grad_norm": 0.5359098725485097,
"learning_rate": 9.19768993934517e-06,
"loss": 0.3214,
"step": 760
},
{
"epoch": 1.1471641228566045,
"grad_norm": 0.5451361788825891,
"learning_rate": 9.171444091908046e-06,
"loss": 0.3195,
"step": 761
},
{
"epoch": 1.1486715658564162,
"grad_norm": 0.5246946467273069,
"learning_rate": 9.145203990137571e-06,
"loss": 0.3417,
"step": 762
},
{
"epoch": 1.1501790088562276,
"grad_norm": 0.5919155354849388,
"learning_rate": 9.118969815997174e-06,
"loss": 0.3417,
"step": 763
},
{
"epoch": 1.1516864518560392,
"grad_norm": 0.549339616533448,
"learning_rate": 9.092741751409186e-06,
"loss": 0.3323,
"step": 764
},
{
"epoch": 1.1531938948558507,
"grad_norm": 0.5520952523067345,
"learning_rate": 9.06651997825357e-06,
"loss": 0.325,
"step": 765
},
{
"epoch": 1.1547013378556623,
"grad_norm": 0.5546050111571403,
"learning_rate": 9.040304678366658e-06,
"loss": 0.3798,
"step": 766
},
{
"epoch": 1.156208780855474,
"grad_norm": 0.5355016099382737,
"learning_rate": 9.014096033539889e-06,
"loss": 0.3324,
"step": 767
},
{
"epoch": 1.1577162238552854,
"grad_norm": 0.5063535090434689,
"learning_rate": 8.987894225518556e-06,
"loss": 0.3098,
"step": 768
},
{
"epoch": 1.159223666855097,
"grad_norm": 0.538083701203612,
"learning_rate": 8.961699436000548e-06,
"loss": 0.3378,
"step": 769
},
{
"epoch": 1.1607311098549087,
"grad_norm": 0.5611833134978637,
"learning_rate": 8.93551184663507e-06,
"loss": 0.3291,
"step": 770
},
{
"epoch": 1.16223855285472,
"grad_norm": 0.5334833426063799,
"learning_rate": 8.909331639021414e-06,
"loss": 0.3265,
"step": 771
},
{
"epoch": 1.1637459958545318,
"grad_norm": 0.5370028500892263,
"learning_rate": 8.883158994707666e-06,
"loss": 0.353,
"step": 772
},
{
"epoch": 1.1652534388543434,
"grad_norm": 0.5564851227581507,
"learning_rate": 8.856994095189477e-06,
"loss": 0.314,
"step": 773
},
{
"epoch": 1.1667608818541548,
"grad_norm": 0.5516816420442727,
"learning_rate": 8.830837121908783e-06,
"loss": 0.3459,
"step": 774
},
{
"epoch": 1.1682683248539665,
"grad_norm": 0.546010896691211,
"learning_rate": 8.804688256252557e-06,
"loss": 0.3564,
"step": 775
},
{
"epoch": 1.1697757678537781,
"grad_norm": 0.5204242216440147,
"learning_rate": 8.778547679551555e-06,
"loss": 0.3093,
"step": 776
},
{
"epoch": 1.1712832108535896,
"grad_norm": 0.5530932960461594,
"learning_rate": 8.75241557307904e-06,
"loss": 0.3169,
"step": 777
},
{
"epoch": 1.1727906538534012,
"grad_norm": 0.5433657189299205,
"learning_rate": 8.726292118049555e-06,
"loss": 0.3238,
"step": 778
},
{
"epoch": 1.1742980968532128,
"grad_norm": 0.536612091168906,
"learning_rate": 8.700177495617635e-06,
"loss": 0.3375,
"step": 779
},
{
"epoch": 1.1758055398530243,
"grad_norm": 0.5547355998217709,
"learning_rate": 8.674071886876572e-06,
"loss": 0.3285,
"step": 780
},
{
"epoch": 1.177312982852836,
"grad_norm": 0.6048276095962777,
"learning_rate": 8.647975472857148e-06,
"loss": 0.3704,
"step": 781
},
{
"epoch": 1.1788204258526473,
"grad_norm": 0.571295755561053,
"learning_rate": 8.621888434526382e-06,
"loss": 0.374,
"step": 782
},
{
"epoch": 1.180327868852459,
"grad_norm": 0.5189210618741348,
"learning_rate": 8.595810952786289e-06,
"loss": 0.3247,
"step": 783
},
{
"epoch": 1.1818353118522706,
"grad_norm": 0.5407807768349286,
"learning_rate": 8.569743208472594e-06,
"loss": 0.318,
"step": 784
},
{
"epoch": 1.183342754852082,
"grad_norm": 0.5555658150734397,
"learning_rate": 8.543685382353518e-06,
"loss": 0.342,
"step": 785
},
{
"epoch": 1.1848501978518937,
"grad_norm": 0.5478498420172522,
"learning_rate": 8.51763765512849e-06,
"loss": 0.3386,
"step": 786
},
{
"epoch": 1.1863576408517054,
"grad_norm": 0.5229096665922429,
"learning_rate": 8.491600207426907e-06,
"loss": 0.3218,
"step": 787
},
{
"epoch": 1.1878650838515168,
"grad_norm": 0.5706786350698708,
"learning_rate": 8.465573219806893e-06,
"loss": 0.3491,
"step": 788
},
{
"epoch": 1.1893725268513284,
"grad_norm": 0.5792169640912351,
"learning_rate": 8.439556872754025e-06,
"loss": 0.3482,
"step": 789
},
{
"epoch": 1.19087996985114,
"grad_norm": 0.5219487046954187,
"learning_rate": 8.413551346680095e-06,
"loss": 0.3183,
"step": 790
},
{
"epoch": 1.1923874128509515,
"grad_norm": 0.5680359320055756,
"learning_rate": 8.38755682192186e-06,
"loss": 0.3257,
"step": 791
},
{
"epoch": 1.1938948558507632,
"grad_norm": 0.54324153892485,
"learning_rate": 8.36157347873979e-06,
"loss": 0.3151,
"step": 792
},
{
"epoch": 1.1954022988505748,
"grad_norm": 0.5584145695371312,
"learning_rate": 8.335601497316809e-06,
"loss": 0.3474,
"step": 793
},
{
"epoch": 1.1969097418503862,
"grad_norm": 0.5414937178807059,
"learning_rate": 8.309641057757052e-06,
"loss": 0.3348,
"step": 794
},
{
"epoch": 1.1984171848501979,
"grad_norm": 0.5933495505366142,
"learning_rate": 8.283692340084623e-06,
"loss": 0.3743,
"step": 795
},
{
"epoch": 1.1999246278500095,
"grad_norm": 0.5730820400742883,
"learning_rate": 8.257755524242333e-06,
"loss": 0.3437,
"step": 796
},
{
"epoch": 1.201432070849821,
"grad_norm": 0.5154842086228131,
"learning_rate": 8.231830790090461e-06,
"loss": 0.3271,
"step": 797
},
{
"epoch": 1.2029395138496326,
"grad_norm": 0.5445619151521616,
"learning_rate": 8.205918317405508e-06,
"loss": 0.3229,
"step": 798
},
{
"epoch": 1.204446956849444,
"grad_norm": 0.6016710522110904,
"learning_rate": 8.18001828587895e-06,
"loss": 0.3609,
"step": 799
},
{
"epoch": 1.2059543998492557,
"grad_norm": 0.5457762036159068,
"learning_rate": 8.154130875115978e-06,
"loss": 0.318,
"step": 800
},
{
"epoch": 1.2074618428490673,
"grad_norm": 0.5404902176604001,
"learning_rate": 8.12825626463427e-06,
"loss": 0.3323,
"step": 801
},
{
"epoch": 1.2089692858488787,
"grad_norm": 0.5722847292063646,
"learning_rate": 8.102394633862743e-06,
"loss": 0.3147,
"step": 802
},
{
"epoch": 1.2104767288486904,
"grad_norm": 0.5531842770730636,
"learning_rate": 8.0765461621403e-06,
"loss": 0.331,
"step": 803
},
{
"epoch": 1.211984171848502,
"grad_norm": 0.5256780853712785,
"learning_rate": 8.050711028714589e-06,
"loss": 0.3176,
"step": 804
},
{
"epoch": 1.2134916148483135,
"grad_norm": 0.6144603881477418,
"learning_rate": 8.02488941274078e-06,
"loss": 0.3383,
"step": 805
},
{
"epoch": 1.2149990578481251,
"grad_norm": 0.571788365434139,
"learning_rate": 7.999081493280283e-06,
"loss": 0.3258,
"step": 806
},
{
"epoch": 1.2165065008479368,
"grad_norm": 0.5982762464323738,
"learning_rate": 7.973287449299545e-06,
"loss": 0.3503,
"step": 807
},
{
"epoch": 1.2180139438477482,
"grad_norm": 0.5363356894959806,
"learning_rate": 7.947507459668784e-06,
"loss": 0.3436,
"step": 808
},
{
"epoch": 1.2195213868475598,
"grad_norm": 0.5730894211276505,
"learning_rate": 7.921741703160758e-06,
"loss": 0.3584,
"step": 809
},
{
"epoch": 1.2210288298473715,
"grad_norm": 0.563926690224309,
"learning_rate": 7.895990358449533e-06,
"loss": 0.3291,
"step": 810
},
{
"epoch": 1.222536272847183,
"grad_norm": 0.5254920217508706,
"learning_rate": 7.87025360410922e-06,
"loss": 0.316,
"step": 811
},
{
"epoch": 1.2240437158469946,
"grad_norm": 0.5313261676986573,
"learning_rate": 7.844531618612772e-06,
"loss": 0.3319,
"step": 812
},
{
"epoch": 1.2255511588468062,
"grad_norm": 0.5790168954324271,
"learning_rate": 7.81882458033071e-06,
"loss": 0.3202,
"step": 813
},
{
"epoch": 1.2270586018466176,
"grad_norm": 0.5385942852927429,
"learning_rate": 7.79313266752991e-06,
"loss": 0.3259,
"step": 814
},
{
"epoch": 1.2285660448464293,
"grad_norm": 0.5551972130449111,
"learning_rate": 7.767456058372362e-06,
"loss": 0.3385,
"step": 815
},
{
"epoch": 1.2300734878462407,
"grad_norm": 0.5322043372006761,
"learning_rate": 7.741794930913922e-06,
"loss": 0.3215,
"step": 816
},
{
"epoch": 1.2315809308460524,
"grad_norm": 0.5541120887430956,
"learning_rate": 7.7161494631031e-06,
"loss": 0.3428,
"step": 817
},
{
"epoch": 1.233088373845864,
"grad_norm": 0.5527885462222231,
"learning_rate": 7.690519832779799e-06,
"loss": 0.3389,
"step": 818
},
{
"epoch": 1.2345958168456754,
"grad_norm": 0.5406331170872595,
"learning_rate": 7.664906217674115e-06,
"loss": 0.3112,
"step": 819
},
{
"epoch": 1.236103259845487,
"grad_norm": 0.5055150883042695,
"learning_rate": 7.639308795405066e-06,
"loss": 0.3202,
"step": 820
},
{
"epoch": 1.2376107028452987,
"grad_norm": 0.5563269801825349,
"learning_rate": 7.613727743479395e-06,
"loss": 0.3571,
"step": 821
},
{
"epoch": 1.2391181458451102,
"grad_norm": 0.5792057709615847,
"learning_rate": 7.588163239290316e-06,
"loss": 0.3329,
"step": 822
},
{
"epoch": 1.2406255888449218,
"grad_norm": 0.5666249401867434,
"learning_rate": 7.562615460116289e-06,
"loss": 0.351,
"step": 823
},
{
"epoch": 1.2421330318447334,
"grad_norm": 0.5265355387938444,
"learning_rate": 7.537084583119802e-06,
"loss": 0.3701,
"step": 824
},
{
"epoch": 1.2436404748445449,
"grad_norm": 0.5495841716595921,
"learning_rate": 7.511570785346129e-06,
"loss": 0.329,
"step": 825
},
{
"epoch": 1.2451479178443565,
"grad_norm": 0.5587199026990006,
"learning_rate": 7.486074243722109e-06,
"loss": 0.3252,
"step": 826
},
{
"epoch": 1.2466553608441682,
"grad_norm": 0.5211341468152613,
"learning_rate": 7.460595135054916e-06,
"loss": 0.3311,
"step": 827
},
{
"epoch": 1.2481628038439796,
"grad_norm": 0.5364245497529563,
"learning_rate": 7.435133636030831e-06,
"loss": 0.3208,
"step": 828
},
{
"epoch": 1.2496702468437912,
"grad_norm": 0.5314247345107659,
"learning_rate": 7.4096899232140295e-06,
"loss": 0.3317,
"step": 829
},
{
"epoch": 1.251177689843603,
"grad_norm": 0.5630710895853528,
"learning_rate": 7.384264173045339e-06,
"loss": 0.3351,
"step": 830
},
{
"epoch": 1.2526851328434143,
"grad_norm": 0.5197283769421239,
"learning_rate": 7.358856561841021e-06,
"loss": 0.3065,
"step": 831
},
{
"epoch": 1.254192575843226,
"grad_norm": 0.5568788382198039,
"learning_rate": 7.333467265791563e-06,
"loss": 0.351,
"step": 832
},
{
"epoch": 1.2557000188430374,
"grad_norm": 0.5725220505007355,
"learning_rate": 7.308096460960441e-06,
"loss": 0.3439,
"step": 833
},
{
"epoch": 1.257207461842849,
"grad_norm": 0.5304098730159461,
"learning_rate": 7.282744323282895e-06,
"loss": 0.3188,
"step": 834
},
{
"epoch": 1.2587149048426607,
"grad_norm": 0.5263594760039901,
"learning_rate": 7.2574110285647244e-06,
"loss": 0.3209,
"step": 835
},
{
"epoch": 1.2602223478424723,
"grad_norm": 0.6039158733618325,
"learning_rate": 7.232096752481061e-06,
"loss": 0.3366,
"step": 836
},
{
"epoch": 1.2617297908422838,
"grad_norm": 0.5807414247418556,
"learning_rate": 7.206801670575145e-06,
"loss": 0.3446,
"step": 837
},
{
"epoch": 1.2632372338420954,
"grad_norm": 0.5398549865816707,
"learning_rate": 7.181525958257116e-06,
"loss": 0.2976,
"step": 838
},
{
"epoch": 1.2647446768419068,
"grad_norm": 0.5502842121004295,
"learning_rate": 7.156269790802801e-06,
"loss": 0.3308,
"step": 839
},
{
"epoch": 1.2662521198417185,
"grad_norm": 0.5520318040890088,
"learning_rate": 7.131033343352483e-06,
"loss": 0.3347,
"step": 840
},
{
"epoch": 1.2677595628415301,
"grad_norm": 0.5430821284421434,
"learning_rate": 7.105816790909699e-06,
"loss": 0.3199,
"step": 841
},
{
"epoch": 1.2692670058413416,
"grad_norm": 0.5268656785617308,
"learning_rate": 7.080620308340024e-06,
"loss": 0.3368,
"step": 842
},
{
"epoch": 1.2707744488411532,
"grad_norm": 0.5488558866283424,
"learning_rate": 7.055444070369852e-06,
"loss": 0.3185,
"step": 843
},
{
"epoch": 1.2722818918409646,
"grad_norm": 0.5234636191148432,
"learning_rate": 7.0302882515852025e-06,
"loss": 0.3156,
"step": 844
},
{
"epoch": 1.2737893348407763,
"grad_norm": 0.5571922620156962,
"learning_rate": 7.005153026430476e-06,
"loss": 0.3475,
"step": 845
},
{
"epoch": 1.275296777840588,
"grad_norm": 0.6043172841328527,
"learning_rate": 6.980038569207291e-06,
"loss": 0.3535,
"step": 846
},
{
"epoch": 1.2768042208403996,
"grad_norm": 0.5449462283830545,
"learning_rate": 6.954945054073228e-06,
"loss": 0.3485,
"step": 847
},
{
"epoch": 1.278311663840211,
"grad_norm": 0.553125976275942,
"learning_rate": 6.929872655040655e-06,
"loss": 0.3392,
"step": 848
},
{
"epoch": 1.2798191068400226,
"grad_norm": 0.5313032640250875,
"learning_rate": 6.904821545975507e-06,
"loss": 0.3533,
"step": 849
},
{
"epoch": 1.281326549839834,
"grad_norm": 0.5461530058972931,
"learning_rate": 6.879791900596077e-06,
"loss": 0.3082,
"step": 850
},
{
"epoch": 1.2828339928396457,
"grad_norm": 0.5268975792503748,
"learning_rate": 6.854783892471823e-06,
"loss": 0.3507,
"step": 851
},
{
"epoch": 1.2843414358394574,
"grad_norm": 0.525335909935522,
"learning_rate": 6.829797695022163e-06,
"loss": 0.3137,
"step": 852
},
{
"epoch": 1.285848878839269,
"grad_norm": 0.5439698304073414,
"learning_rate": 6.804833481515256e-06,
"loss": 0.3269,
"step": 853
},
{
"epoch": 1.2873563218390804,
"grad_norm": 0.5426503592650488,
"learning_rate": 6.7798914250668154e-06,
"loss": 0.3255,
"step": 854
},
{
"epoch": 1.288863764838892,
"grad_norm": 0.546578985401071,
"learning_rate": 6.7549716986389146e-06,
"loss": 0.3357,
"step": 855
},
{
"epoch": 1.2903712078387035,
"grad_norm": 0.5433998763126892,
"learning_rate": 6.730074475038766e-06,
"loss": 0.3316,
"step": 856
},
{
"epoch": 1.2918786508385152,
"grad_norm": 0.5364588967630985,
"learning_rate": 6.7051999269175405e-06,
"loss": 0.3305,
"step": 857
},
{
"epoch": 1.2933860938383268,
"grad_norm": 0.5658934841388523,
"learning_rate": 6.680348226769162e-06,
"loss": 0.329,
"step": 858
},
{
"epoch": 1.2948935368381382,
"grad_norm": 0.5643062239325746,
"learning_rate": 6.655519546929121e-06,
"loss": 0.3297,
"step": 859
},
{
"epoch": 1.2964009798379499,
"grad_norm": 0.5371342456598566,
"learning_rate": 6.630714059573267e-06,
"loss": 0.3411,
"step": 860
},
{
"epoch": 1.2979084228377613,
"grad_norm": 0.5429869820067992,
"learning_rate": 6.6059319367166165e-06,
"loss": 0.3162,
"step": 861
},
{
"epoch": 1.299415865837573,
"grad_norm": 0.6163498341710386,
"learning_rate": 6.581173350212169e-06,
"loss": 0.3346,
"step": 862
},
{
"epoch": 1.3009233088373846,
"grad_norm": 0.5249574401357171,
"learning_rate": 6.55643847174971e-06,
"loss": 0.3184,
"step": 863
},
{
"epoch": 1.3024307518371963,
"grad_norm": 0.5652427669527782,
"learning_rate": 6.531727472854617e-06,
"loss": 0.3277,
"step": 864
},
{
"epoch": 1.3039381948370077,
"grad_norm": 0.5499255875094143,
"learning_rate": 6.507040524886672e-06,
"loss": 0.3099,
"step": 865
},
{
"epoch": 1.3054456378368193,
"grad_norm": 0.5395982289283698,
"learning_rate": 6.482377799038882e-06,
"loss": 0.312,
"step": 866
},
{
"epoch": 1.3069530808366308,
"grad_norm": 0.5425266392409812,
"learning_rate": 6.45773946633628e-06,
"loss": 0.3288,
"step": 867
},
{
"epoch": 1.3084605238364424,
"grad_norm": 0.5289252666187554,
"learning_rate": 6.4331256976347434e-06,
"loss": 0.3143,
"step": 868
},
{
"epoch": 1.309967966836254,
"grad_norm": 0.5829209174715098,
"learning_rate": 6.408536663619803e-06,
"loss": 0.3215,
"step": 869
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.5955771972087047,
"learning_rate": 6.383972534805478e-06,
"loss": 0.3232,
"step": 870
},
{
"epoch": 1.3129828528358771,
"grad_norm": 0.5434757595303122,
"learning_rate": 6.359433481533074e-06,
"loss": 0.318,
"step": 871
},
{
"epoch": 1.3144902958356888,
"grad_norm": 0.5313303734643687,
"learning_rate": 6.3349196739700024e-06,
"loss": 0.3159,
"step": 872
},
{
"epoch": 1.3159977388355002,
"grad_norm": 0.5678985167703594,
"learning_rate": 6.310431282108622e-06,
"loss": 0.3757,
"step": 873
},
{
"epoch": 1.3175051818353118,
"grad_norm": 0.5648846998829979,
"learning_rate": 6.2859684757650365e-06,
"loss": 0.3493,
"step": 874
},
{
"epoch": 1.3190126248351235,
"grad_norm": 0.5488508237006199,
"learning_rate": 6.261531424577923e-06,
"loss": 0.3219,
"step": 875
},
{
"epoch": 1.320520067834935,
"grad_norm": 0.5438041497270804,
"learning_rate": 6.2371202980073596e-06,
"loss": 0.3417,
"step": 876
},
{
"epoch": 1.3220275108347466,
"grad_norm": 0.5400748419899576,
"learning_rate": 6.212735265333655e-06,
"loss": 0.3025,
"step": 877
},
{
"epoch": 1.323534953834558,
"grad_norm": 0.5299843425249701,
"learning_rate": 6.188376495656156e-06,
"loss": 0.3374,
"step": 878
},
{
"epoch": 1.3250423968343696,
"grad_norm": 0.5236709426443396,
"learning_rate": 6.164044157892102e-06,
"loss": 0.3221,
"step": 879
},
{
"epoch": 1.3265498398341813,
"grad_norm": 0.5699051867060005,
"learning_rate": 6.13973842077543e-06,
"loss": 0.3226,
"step": 880
},
{
"epoch": 1.328057282833993,
"grad_norm": 0.5655665319377791,
"learning_rate": 6.11545945285561e-06,
"loss": 0.3175,
"step": 881
},
{
"epoch": 1.3295647258338044,
"grad_norm": 0.5387118438674878,
"learning_rate": 6.091207422496489e-06,
"loss": 0.3243,
"step": 882
},
{
"epoch": 1.331072168833616,
"grad_norm": 0.5744706591584219,
"learning_rate": 6.066982497875109e-06,
"loss": 0.3286,
"step": 883
},
{
"epoch": 1.3325796118334274,
"grad_norm": 0.542466249812019,
"learning_rate": 6.042784846980542e-06,
"loss": 0.3225,
"step": 884
},
{
"epoch": 1.334087054833239,
"grad_norm": 0.5515488785701044,
"learning_rate": 6.018614637612733e-06,
"loss": 0.3238,
"step": 885
},
{
"epoch": 1.3355944978330507,
"grad_norm": 0.5349896204088196,
"learning_rate": 5.99447203738134e-06,
"loss": 0.324,
"step": 886
},
{
"epoch": 1.3371019408328624,
"grad_norm": 0.5371367792089301,
"learning_rate": 5.9703572137045495e-06,
"loss": 0.3369,
"step": 887
},
{
"epoch": 1.3386093838326738,
"grad_norm": 0.5615358147993731,
"learning_rate": 5.946270333807937e-06,
"loss": 0.3052,
"step": 888
},
{
"epoch": 1.3401168268324855,
"grad_norm": 0.5743727933679714,
"learning_rate": 5.922211564723302e-06,
"loss": 0.3455,
"step": 889
},
{
"epoch": 1.3416242698322969,
"grad_norm": 0.5336291605723125,
"learning_rate": 5.898181073287504e-06,
"loss": 0.3226,
"step": 890
},
{
"epoch": 1.3431317128321085,
"grad_norm": 0.5722163135210774,
"learning_rate": 5.87417902614131e-06,
"loss": 0.3646,
"step": 891
},
{
"epoch": 1.3446391558319202,
"grad_norm": 0.5524401803992677,
"learning_rate": 5.850205589728239e-06,
"loss": 0.3016,
"step": 892
},
{
"epoch": 1.3461465988317316,
"grad_norm": 0.5311906031167264,
"learning_rate": 5.826260930293417e-06,
"loss": 0.3174,
"step": 893
},
{
"epoch": 1.3476540418315432,
"grad_norm": 0.5385817256109608,
"learning_rate": 5.802345213882396e-06,
"loss": 0.3447,
"step": 894
},
{
"epoch": 1.349161484831355,
"grad_norm": 0.5443226270708521,
"learning_rate": 5.778458606340037e-06,
"loss": 0.3056,
"step": 895
},
{
"epoch": 1.3506689278311663,
"grad_norm": 0.5247659222065347,
"learning_rate": 5.754601273309333e-06,
"loss": 0.3045,
"step": 896
},
{
"epoch": 1.352176370830978,
"grad_norm": 0.5555702487315548,
"learning_rate": 5.730773380230276e-06,
"loss": 0.3186,
"step": 897
},
{
"epoch": 1.3536838138307896,
"grad_norm": 0.5661524854903914,
"learning_rate": 5.70697509233871e-06,
"loss": 0.3248,
"step": 898
},
{
"epoch": 1.355191256830601,
"grad_norm": 0.5284857763614461,
"learning_rate": 5.683206574665165e-06,
"loss": 0.3018,
"step": 899
},
{
"epoch": 1.3566986998304127,
"grad_norm": 0.5740850180912764,
"learning_rate": 5.6594679920337514e-06,
"loss": 0.3529,
"step": 900
},
{
"epoch": 1.3582061428302241,
"grad_norm": 0.5299086867590524,
"learning_rate": 5.635759509060969e-06,
"loss": 0.32,
"step": 901
},
{
"epoch": 1.3597135858300358,
"grad_norm": 0.5330329949433913,
"learning_rate": 5.612081290154607e-06,
"loss": 0.3156,
"step": 902
},
{
"epoch": 1.3612210288298474,
"grad_norm": 0.5347611903592508,
"learning_rate": 5.58843349951258e-06,
"loss": 0.3183,
"step": 903
},
{
"epoch": 1.362728471829659,
"grad_norm": 0.5409021340662399,
"learning_rate": 5.564816301121792e-06,
"loss": 0.3411,
"step": 904
},
{
"epoch": 1.3642359148294705,
"grad_norm": 0.5560565848550149,
"learning_rate": 5.541229858757011e-06,
"loss": 0.3508,
"step": 905
},
{
"epoch": 1.3657433578292821,
"grad_norm": 0.5040665273430834,
"learning_rate": 5.517674335979721e-06,
"loss": 0.3038,
"step": 906
},
{
"epoch": 1.3672508008290936,
"grad_norm": 0.5520505173652595,
"learning_rate": 5.494149896136998e-06,
"loss": 0.3342,
"step": 907
},
{
"epoch": 1.3687582438289052,
"grad_norm": 0.5286100688050495,
"learning_rate": 5.470656702360367e-06,
"loss": 0.3051,
"step": 908
},
{
"epoch": 1.3702656868287169,
"grad_norm": 0.5540464877346475,
"learning_rate": 5.447194917564671e-06,
"loss": 0.3327,
"step": 909
},
{
"epoch": 1.3717731298285283,
"grad_norm": 0.5401690086723988,
"learning_rate": 5.423764704446954e-06,
"loss": 0.332,
"step": 910
},
{
"epoch": 1.37328057282834,
"grad_norm": 0.5440262612621518,
"learning_rate": 5.400366225485326e-06,
"loss": 0.3326,
"step": 911
},
{
"epoch": 1.3747880158281516,
"grad_norm": 0.5291318028597245,
"learning_rate": 5.376999642937817e-06,
"loss": 0.3262,
"step": 912
},
{
"epoch": 1.376295458827963,
"grad_norm": 0.5361093139503608,
"learning_rate": 5.353665118841296e-06,
"loss": 0.3258,
"step": 913
},
{
"epoch": 1.3778029018277747,
"grad_norm": 0.5442991814951846,
"learning_rate": 5.330362815010306e-06,
"loss": 0.3162,
"step": 914
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.5384147655921361,
"learning_rate": 5.307092893035951e-06,
"loss": 0.3381,
"step": 915
},
{
"epoch": 1.3808177878273977,
"grad_norm": 0.539100490777508,
"learning_rate": 5.2838555142847925e-06,
"loss": 0.3423,
"step": 916
},
{
"epoch": 1.3823252308272094,
"grad_norm": 0.5427293981456651,
"learning_rate": 5.260650839897719e-06,
"loss": 0.3217,
"step": 917
},
{
"epoch": 1.3838326738270208,
"grad_norm": 0.5726046303707281,
"learning_rate": 5.237479030788817e-06,
"loss": 0.3507,
"step": 918
},
{
"epoch": 1.3853401168268324,
"grad_norm": 0.5527176893463295,
"learning_rate": 5.214340247644278e-06,
"loss": 0.331,
"step": 919
},
{
"epoch": 1.386847559826644,
"grad_norm": 0.5481340240469819,
"learning_rate": 5.191234650921273e-06,
"loss": 0.318,
"step": 920
},
{
"epoch": 1.3883550028264557,
"grad_norm": 0.5543962320620248,
"learning_rate": 5.168162400846835e-06,
"loss": 0.3155,
"step": 921
},
{
"epoch": 1.3898624458262672,
"grad_norm": 0.5367203136891187,
"learning_rate": 5.145123657416759e-06,
"loss": 0.3326,
"step": 922
},
{
"epoch": 1.3913698888260788,
"grad_norm": 0.5460167764137122,
"learning_rate": 5.122118580394473e-06,
"loss": 0.337,
"step": 923
},
{
"epoch": 1.3928773318258902,
"grad_norm": 0.5195547700814616,
"learning_rate": 5.099147329309959e-06,
"loss": 0.326,
"step": 924
},
{
"epoch": 1.394384774825702,
"grad_norm": 0.5456994955845843,
"learning_rate": 5.076210063458622e-06,
"loss": 0.3322,
"step": 925
},
{
"epoch": 1.3958922178255135,
"grad_norm": 0.5843461937914468,
"learning_rate": 5.0533069419002e-06,
"loss": 0.339,
"step": 926
},
{
"epoch": 1.397399660825325,
"grad_norm": 0.5150644731537505,
"learning_rate": 5.030438123457655e-06,
"loss": 0.2913,
"step": 927
},
{
"epoch": 1.3989071038251366,
"grad_norm": 0.5258159746479392,
"learning_rate": 5.007603766716063e-06,
"loss": 0.3107,
"step": 928
},
{
"epoch": 1.4004145468249483,
"grad_norm": 0.5748165660930256,
"learning_rate": 4.984804030021533e-06,
"loss": 0.3328,
"step": 929
},
{
"epoch": 1.4019219898247597,
"grad_norm": 0.5630833950584739,
"learning_rate": 4.962039071480102e-06,
"loss": 0.3233,
"step": 930
},
{
"epoch": 1.4034294328245713,
"grad_norm": 0.537736675931464,
"learning_rate": 4.939309048956622e-06,
"loss": 0.3451,
"step": 931
},
{
"epoch": 1.404936875824383,
"grad_norm": 0.5230851918523695,
"learning_rate": 4.9166141200736885e-06,
"loss": 0.3389,
"step": 932
},
{
"epoch": 1.4064443188241944,
"grad_norm": 0.5434274109432955,
"learning_rate": 4.89395444221055e-06,
"loss": 0.3189,
"step": 933
},
{
"epoch": 1.407951761824006,
"grad_norm": 0.5467058284642171,
"learning_rate": 4.871330172501979e-06,
"loss": 0.3218,
"step": 934
},
{
"epoch": 1.4094592048238175,
"grad_norm": 0.5643806859737526,
"learning_rate": 4.848741467837228e-06,
"loss": 0.339,
"step": 935
},
{
"epoch": 1.4109666478236291,
"grad_norm": 0.5210588389675654,
"learning_rate": 4.826188484858918e-06,
"loss": 0.2865,
"step": 936
},
{
"epoch": 1.4124740908234408,
"grad_norm": 0.5575173474168307,
"learning_rate": 4.803671379961945e-06,
"loss": 0.3448,
"step": 937
},
{
"epoch": 1.4139815338232524,
"grad_norm": 0.5553202634668,
"learning_rate": 4.781190309292421e-06,
"loss": 0.318,
"step": 938
},
{
"epoch": 1.4154889768230638,
"grad_norm": 0.5277473116633332,
"learning_rate": 4.758745428746569e-06,
"loss": 0.3047,
"step": 939
},
{
"epoch": 1.4169964198228755,
"grad_norm": 0.5308118737562897,
"learning_rate": 4.736336893969652e-06,
"loss": 0.3126,
"step": 940
},
{
"epoch": 1.418503862822687,
"grad_norm": 0.562016412444855,
"learning_rate": 4.7139648603548925e-06,
"loss": 0.3306,
"step": 941
},
{
"epoch": 1.4200113058224986,
"grad_norm": 0.5112809802949265,
"learning_rate": 4.691629483042387e-06,
"loss": 0.2877,
"step": 942
},
{
"epoch": 1.4215187488223102,
"grad_norm": 0.555811318693021,
"learning_rate": 4.669330916918043e-06,
"loss": 0.3346,
"step": 943
},
{
"epoch": 1.4230261918221216,
"grad_norm": 0.5388925234150407,
"learning_rate": 4.647069316612502e-06,
"loss": 0.3137,
"step": 944
},
{
"epoch": 1.4245336348219333,
"grad_norm": 0.5265475799399302,
"learning_rate": 4.624844836500052e-06,
"loss": 0.3162,
"step": 945
},
{
"epoch": 1.426041077821745,
"grad_norm": 0.5112485522585755,
"learning_rate": 4.60265763069758e-06,
"loss": 0.2914,
"step": 946
},
{
"epoch": 1.4275485208215564,
"grad_norm": 0.5285723749640436,
"learning_rate": 4.580507853063487e-06,
"loss": 0.3098,
"step": 947
},
{
"epoch": 1.429055963821368,
"grad_norm": 0.5408160656578395,
"learning_rate": 4.5583956571966295e-06,
"loss": 0.3365,
"step": 948
},
{
"epoch": 1.4305634068211797,
"grad_norm": 0.5598936258222863,
"learning_rate": 4.5363211964352524e-06,
"loss": 0.3292,
"step": 949
},
{
"epoch": 1.432070849820991,
"grad_norm": 0.5180426805197446,
"learning_rate": 4.514284623855915e-06,
"loss": 0.3174,
"step": 950
},
{
"epoch": 1.4335782928208027,
"grad_norm": 0.5639401953538313,
"learning_rate": 4.4922860922724466e-06,
"loss": 0.3617,
"step": 951
},
{
"epoch": 1.4350857358206142,
"grad_norm": 0.5482846937319309,
"learning_rate": 4.470325754234881e-06,
"loss": 0.3256,
"step": 952
},
{
"epoch": 1.4365931788204258,
"grad_norm": 0.530946653125974,
"learning_rate": 4.448403762028391e-06,
"loss": 0.3367,
"step": 953
},
{
"epoch": 1.4381006218202375,
"grad_norm": 0.5630491613208096,
"learning_rate": 4.426520267672244e-06,
"loss": 0.33,
"step": 954
},
{
"epoch": 1.439608064820049,
"grad_norm": 0.5281029541497921,
"learning_rate": 4.40467542291874e-06,
"loss": 0.3266,
"step": 955
},
{
"epoch": 1.4411155078198605,
"grad_norm": 0.5134408808419982,
"learning_rate": 4.382869379252152e-06,
"loss": 0.3002,
"step": 956
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.5379209815628555,
"learning_rate": 4.361102287887698e-06,
"loss": 0.3197,
"step": 957
},
{
"epoch": 1.4441303938194836,
"grad_norm": 0.5118973398445678,
"learning_rate": 4.339374299770477e-06,
"loss": 0.316,
"step": 958
},
{
"epoch": 1.4456378368192953,
"grad_norm": 0.5340927744773125,
"learning_rate": 4.31768556557441e-06,
"loss": 0.2995,
"step": 959
},
{
"epoch": 1.447145279819107,
"grad_norm": 0.5574015013189253,
"learning_rate": 4.296036235701235e-06,
"loss": 0.3214,
"step": 960
},
{
"epoch": 1.4486527228189185,
"grad_norm": 0.544283711827625,
"learning_rate": 4.274426460279412e-06,
"loss": 0.309,
"step": 961
},
{
"epoch": 1.45016016581873,
"grad_norm": 0.5456093505990249,
"learning_rate": 4.252856389163128e-06,
"loss": 0.3056,
"step": 962
},
{
"epoch": 1.4516676088185416,
"grad_norm": 0.5524390480774677,
"learning_rate": 4.231326171931231e-06,
"loss": 0.2988,
"step": 963
},
{
"epoch": 1.453175051818353,
"grad_norm": 0.5357243619653109,
"learning_rate": 4.209835957886196e-06,
"loss": 0.3051,
"step": 964
},
{
"epoch": 1.4546824948181647,
"grad_norm": 0.5567188365205857,
"learning_rate": 4.188385896053098e-06,
"loss": 0.3211,
"step": 965
},
{
"epoch": 1.4561899378179763,
"grad_norm": 0.5571208137663407,
"learning_rate": 4.166976135178575e-06,
"loss": 0.3212,
"step": 966
},
{
"epoch": 1.4576973808177878,
"grad_norm": 0.5288681574805124,
"learning_rate": 4.1456068237297964e-06,
"loss": 0.3247,
"step": 967
},
{
"epoch": 1.4592048238175994,
"grad_norm": 0.5362640814930834,
"learning_rate": 4.124278109893432e-06,
"loss": 0.3206,
"step": 968
},
{
"epoch": 1.4607122668174108,
"grad_norm": 0.5300069434968542,
"learning_rate": 4.10299014157462e-06,
"loss": 0.299,
"step": 969
},
{
"epoch": 1.4622197098172225,
"grad_norm": 0.5207197137299924,
"learning_rate": 4.0817430663959536e-06,
"loss": 0.2872,
"step": 970
},
{
"epoch": 1.4637271528170341,
"grad_norm": 0.5361880777046366,
"learning_rate": 4.06053703169645e-06,
"loss": 0.3432,
"step": 971
},
{
"epoch": 1.4652345958168458,
"grad_norm": 0.5390277129867954,
"learning_rate": 4.039372184530521e-06,
"loss": 0.3121,
"step": 972
},
{
"epoch": 1.4667420388166572,
"grad_norm": 0.5098624467494199,
"learning_rate": 4.0182486716669656e-06,
"loss": 0.3057,
"step": 973
},
{
"epoch": 1.4682494818164689,
"grad_norm": 0.5530264319623549,
"learning_rate": 3.9971666395879605e-06,
"loss": 0.316,
"step": 974
},
{
"epoch": 1.4697569248162803,
"grad_norm": 0.5648165554049958,
"learning_rate": 3.9761262344880096e-06,
"loss": 0.3456,
"step": 975
},
{
"epoch": 1.471264367816092,
"grad_norm": 0.5376597362402104,
"learning_rate": 3.9551276022729644e-06,
"loss": 0.3075,
"step": 976
},
{
"epoch": 1.4727718108159036,
"grad_norm": 0.5207214018679573,
"learning_rate": 3.9341708885590034e-06,
"loss": 0.3043,
"step": 977
},
{
"epoch": 1.4742792538157152,
"grad_norm": 0.5346717454580582,
"learning_rate": 3.913256238671607e-06,
"loss": 0.3187,
"step": 978
},
{
"epoch": 1.4757866968155267,
"grad_norm": 0.5474625757974195,
"learning_rate": 3.89238379764457e-06,
"loss": 0.3341,
"step": 979
},
{
"epoch": 1.4772941398153383,
"grad_norm": 0.553265646517597,
"learning_rate": 3.871553710218988e-06,
"loss": 0.3615,
"step": 980
},
{
"epoch": 1.4788015828151497,
"grad_norm": 0.5178190237398634,
"learning_rate": 3.850766120842252e-06,
"loss": 0.3087,
"step": 981
},
{
"epoch": 1.4803090258149614,
"grad_norm": 0.5676605070232937,
"learning_rate": 3.830021173667048e-06,
"loss": 0.3331,
"step": 982
},
{
"epoch": 1.481816468814773,
"grad_norm": 0.5366490741054173,
"learning_rate": 3.809319012550352e-06,
"loss": 0.3134,
"step": 983
},
{
"epoch": 1.4833239118145845,
"grad_norm": 0.5237338303143243,
"learning_rate": 3.788659781052444e-06,
"loss": 0.3426,
"step": 984
},
{
"epoch": 1.484831354814396,
"grad_norm": 0.5118568891202759,
"learning_rate": 3.7680436224359084e-06,
"loss": 0.3049,
"step": 985
},
{
"epoch": 1.4863387978142075,
"grad_norm": 0.5474188971913913,
"learning_rate": 3.747470679664624e-06,
"loss": 0.3177,
"step": 986
},
{
"epoch": 1.4878462408140192,
"grad_norm": 0.5623337896836218,
"learning_rate": 3.7269410954028107e-06,
"loss": 0.3268,
"step": 987
},
{
"epoch": 1.4893536838138308,
"grad_norm": 0.5320249898828978,
"learning_rate": 3.706455012013994e-06,
"loss": 0.3135,
"step": 988
},
{
"epoch": 1.4908611268136425,
"grad_norm": 0.5258630499196119,
"learning_rate": 3.6860125715600513e-06,
"loss": 0.2922,
"step": 989
},
{
"epoch": 1.492368569813454,
"grad_norm": 0.5488691155290143,
"learning_rate": 3.665613915800217e-06,
"loss": 0.3093,
"step": 990
},
{
"epoch": 1.4938760128132655,
"grad_norm": 0.534561267695021,
"learning_rate": 3.6452591861900886e-06,
"loss": 0.3201,
"step": 991
},
{
"epoch": 1.495383455813077,
"grad_norm": 0.5493934402188156,
"learning_rate": 3.6249485238806637e-06,
"loss": 0.3258,
"step": 992
},
{
"epoch": 1.4968908988128886,
"grad_norm": 0.5213745241077384,
"learning_rate": 3.6046820697173514e-06,
"loss": 0.3206,
"step": 993
},
{
"epoch": 1.4983983418127003,
"grad_norm": 0.5189896345789112,
"learning_rate": 3.5844599642389965e-06,
"loss": 0.3093,
"step": 994
},
{
"epoch": 1.499905784812512,
"grad_norm": 0.535438446636319,
"learning_rate": 3.564282347676903e-06,
"loss": 0.3449,
"step": 995
},
{
"epoch": 1.5014132278123233,
"grad_norm": 0.5151601964534807,
"learning_rate": 3.54414935995387e-06,
"loss": 0.3002,
"step": 996
},
{
"epoch": 1.5029206708121348,
"grad_norm": 0.5669837505751246,
"learning_rate": 3.524061140683206e-06,
"loss": 0.3367,
"step": 997
},
{
"epoch": 1.5044281138119464,
"grad_norm": 0.5376128901605735,
"learning_rate": 3.5040178291677816e-06,
"loss": 0.3362,
"step": 998
},
{
"epoch": 1.505935556811758,
"grad_norm": 0.5816182611830706,
"learning_rate": 3.4840195643990383e-06,
"loss": 0.316,
"step": 999
},
{
"epoch": 1.5074429998115697,
"grad_norm": 0.5333548248485912,
"learning_rate": 3.464066485056048e-06,
"loss": 0.3223,
"step": 1000
},
{
"epoch": 1.5089504428113814,
"grad_norm": 0.5574217586347463,
"learning_rate": 3.444158729504549e-06,
"loss": 0.2994,
"step": 1001
},
{
"epoch": 1.5104578858111928,
"grad_norm": 0.5272699065186945,
"learning_rate": 3.4242964357959597e-06,
"loss": 0.3152,
"step": 1002
},
{
"epoch": 1.5119653288110042,
"grad_norm": 0.5352889213452704,
"learning_rate": 3.4044797416664564e-06,
"loss": 0.3103,
"step": 1003
},
{
"epoch": 1.5134727718108159,
"grad_norm": 0.5602956634920077,
"learning_rate": 3.3847087845359996e-06,
"loss": 0.334,
"step": 1004
},
{
"epoch": 1.5149802148106275,
"grad_norm": 0.5402201680847467,
"learning_rate": 3.364983701507376e-06,
"loss": 0.3291,
"step": 1005
},
{
"epoch": 1.5164876578104391,
"grad_norm": 0.5526297524617138,
"learning_rate": 3.3453046293652657e-06,
"loss": 0.3232,
"step": 1006
},
{
"epoch": 1.5179951008102506,
"grad_norm": 0.5401106392320315,
"learning_rate": 3.3256717045752794e-06,
"loss": 0.3219,
"step": 1007
},
{
"epoch": 1.5195025438100622,
"grad_norm": 0.5589978845369276,
"learning_rate": 3.3060850632830167e-06,
"loss": 0.3215,
"step": 1008
},
{
"epoch": 1.5210099868098736,
"grad_norm": 0.5628427903081042,
"learning_rate": 3.286544841313126e-06,
"loss": 0.3042,
"step": 1009
},
{
"epoch": 1.5225174298096853,
"grad_norm": 0.5291974603976658,
"learning_rate": 3.2670511741683475e-06,
"loss": 0.3039,
"step": 1010
},
{
"epoch": 1.524024872809497,
"grad_norm": 0.5307771510625195,
"learning_rate": 3.2476041970285945e-06,
"loss": 0.3225,
"step": 1011
},
{
"epoch": 1.5255323158093086,
"grad_norm": 0.5798408138665074,
"learning_rate": 3.2282040447500063e-06,
"loss": 0.3574,
"step": 1012
},
{
"epoch": 1.52703975880912,
"grad_norm": 0.5262954379509106,
"learning_rate": 3.208850851863998e-06,
"loss": 0.3074,
"step": 1013
},
{
"epoch": 1.5285472018089314,
"grad_norm": 0.5411329822808086,
"learning_rate": 3.189544752576369e-06,
"loss": 0.3291,
"step": 1014
},
{
"epoch": 1.530054644808743,
"grad_norm": 0.512225504454937,
"learning_rate": 3.1702858807663175e-06,
"loss": 0.2967,
"step": 1015
},
{
"epoch": 1.5315620878085547,
"grad_norm": 0.5358326980021074,
"learning_rate": 3.151074369985556e-06,
"loss": 0.3281,
"step": 1016
},
{
"epoch": 1.5330695308083664,
"grad_norm": 0.5412115741377782,
"learning_rate": 3.131910353457369e-06,
"loss": 0.3321,
"step": 1017
},
{
"epoch": 1.534576973808178,
"grad_norm": 0.5355770866583788,
"learning_rate": 3.112793964075681e-06,
"loss": 0.2999,
"step": 1018
},
{
"epoch": 1.5360844168079895,
"grad_norm": 0.53044658803981,
"learning_rate": 3.0937253344041507e-06,
"loss": 0.3271,
"step": 1019
},
{
"epoch": 1.5375918598078009,
"grad_norm": 0.5274519029189704,
"learning_rate": 3.074704596675242e-06,
"loss": 0.3174,
"step": 1020
},
{
"epoch": 1.5390993028076125,
"grad_norm": 0.5410440028748313,
"learning_rate": 3.055731882789311e-06,
"loss": 0.3268,
"step": 1021
},
{
"epoch": 1.5406067458074242,
"grad_norm": 0.5400373191606019,
"learning_rate": 3.0368073243136874e-06,
"loss": 0.325,
"step": 1022
},
{
"epoch": 1.5421141888072358,
"grad_norm": 0.5520146881220487,
"learning_rate": 3.0179310524817707e-06,
"loss": 0.32,
"step": 1023
},
{
"epoch": 1.5436216318070473,
"grad_norm": 0.5351759755594372,
"learning_rate": 2.9991031981921026e-06,
"loss": 0.3269,
"step": 1024
},
{
"epoch": 1.545129074806859,
"grad_norm": 0.5327689357374978,
"learning_rate": 2.9803238920074784e-06,
"loss": 0.3082,
"step": 1025
},
{
"epoch": 1.5466365178066703,
"grad_norm": 0.5326589519994432,
"learning_rate": 2.961593264154038e-06,
"loss": 0.3157,
"step": 1026
},
{
"epoch": 1.548143960806482,
"grad_norm": 0.5249678984746798,
"learning_rate": 2.9429114445203423e-06,
"loss": 0.3119,
"step": 1027
},
{
"epoch": 1.5496514038062936,
"grad_norm": 0.5606341770532942,
"learning_rate": 2.924278562656514e-06,
"loss": 0.3351,
"step": 1028
},
{
"epoch": 1.5511588468061053,
"grad_norm": 0.5156779633424778,
"learning_rate": 2.90569474777329e-06,
"loss": 0.3256,
"step": 1029
},
{
"epoch": 1.5526662898059167,
"grad_norm": 0.5374626464876353,
"learning_rate": 2.8871601287411634e-06,
"loss": 0.3303,
"step": 1030
},
{
"epoch": 1.5541737328057281,
"grad_norm": 0.5262890780017794,
"learning_rate": 2.8686748340894744e-06,
"loss": 0.3114,
"step": 1031
},
{
"epoch": 1.5556811758055398,
"grad_norm": 0.5260995538471516,
"learning_rate": 2.850238992005514e-06,
"loss": 0.2979,
"step": 1032
},
{
"epoch": 1.5571886188053514,
"grad_norm": 0.5573768187241204,
"learning_rate": 2.8318527303336465e-06,
"loss": 0.3475,
"step": 1033
},
{
"epoch": 1.558696061805163,
"grad_norm": 0.5542071850347167,
"learning_rate": 2.81351617657442e-06,
"loss": 0.3359,
"step": 1034
},
{
"epoch": 1.5602035048049747,
"grad_norm": 0.5376949139111594,
"learning_rate": 2.795229457883678e-06,
"loss": 0.3299,
"step": 1035
},
{
"epoch": 1.5617109478047861,
"grad_norm": 0.5213690515169962,
"learning_rate": 2.7769927010716814e-06,
"loss": 0.3187,
"step": 1036
},
{
"epoch": 1.5632183908045976,
"grad_norm": 0.5282868034684867,
"learning_rate": 2.7588060326022205e-06,
"loss": 0.3124,
"step": 1037
},
{
"epoch": 1.5647258338044092,
"grad_norm": 0.5629003594886647,
"learning_rate": 2.740669578591755e-06,
"loss": 0.3453,
"step": 1038
},
{
"epoch": 1.5662332768042209,
"grad_norm": 0.5452741440289394,
"learning_rate": 2.7225834648085282e-06,
"loss": 0.3148,
"step": 1039
},
{
"epoch": 1.5677407198040325,
"grad_norm": 0.5576848374307647,
"learning_rate": 2.7045478166716843e-06,
"loss": 0.3362,
"step": 1040
},
{
"epoch": 1.569248162803844,
"grad_norm": 0.5230478868120295,
"learning_rate": 2.6865627592504295e-06,
"loss": 0.3074,
"step": 1041
},
{
"epoch": 1.5707556058036556,
"grad_norm": 0.5476529275243367,
"learning_rate": 2.668628417263137e-06,
"loss": 0.314,
"step": 1042
},
{
"epoch": 1.572263048803467,
"grad_norm": 0.5340674210452238,
"learning_rate": 2.6507449150764852e-06,
"loss": 0.3035,
"step": 1043
},
{
"epoch": 1.5737704918032787,
"grad_norm": 0.5245422709481129,
"learning_rate": 2.632912376704607e-06,
"loss": 0.3344,
"step": 1044
},
{
"epoch": 1.5752779348030903,
"grad_norm": 0.52661514981572,
"learning_rate": 2.615130925808228e-06,
"loss": 0.3054,
"step": 1045
},
{
"epoch": 1.576785377802902,
"grad_norm": 0.5147590149467712,
"learning_rate": 2.597400685693795e-06,
"loss": 0.2879,
"step": 1046
},
{
"epoch": 1.5782928208027134,
"grad_norm": 0.5476707768783776,
"learning_rate": 2.5797217793126373e-06,
"loss": 0.3395,
"step": 1047
},
{
"epoch": 1.5798002638025248,
"grad_norm": 0.5338841668417198,
"learning_rate": 2.5620943292601074e-06,
"loss": 0.3211,
"step": 1048
},
{
"epoch": 1.5813077068023365,
"grad_norm": 0.526860653464564,
"learning_rate": 2.5445184577747305e-06,
"loss": 0.3251,
"step": 1049
},
{
"epoch": 1.582815149802148,
"grad_norm": 0.5250152267933532,
"learning_rate": 2.52699428673736e-06,
"loss": 0.3126,
"step": 1050
},
{
"epoch": 1.5843225928019598,
"grad_norm": 0.5470636297967526,
"learning_rate": 2.5095219376703183e-06,
"loss": 0.3063,
"step": 1051
},
{
"epoch": 1.5858300358017714,
"grad_norm": 0.5581707395933467,
"learning_rate": 2.4921015317365794e-06,
"loss": 0.3624,
"step": 1052
},
{
"epoch": 1.5873374788015828,
"grad_norm": 0.5280009933911688,
"learning_rate": 2.4747331897389103e-06,
"loss": 0.3106,
"step": 1053
},
{
"epoch": 1.5888449218013942,
"grad_norm": 0.5234869653748981,
"learning_rate": 2.4574170321190305e-06,
"loss": 0.2956,
"step": 1054
},
{
"epoch": 1.590352364801206,
"grad_norm": 0.546217705596414,
"learning_rate": 2.440153178956798e-06,
"loss": 0.3215,
"step": 1055
},
{
"epoch": 1.5918598078010175,
"grad_norm": 0.5556302525952723,
"learning_rate": 2.42294174996935e-06,
"loss": 0.3204,
"step": 1056
},
{
"epoch": 1.5933672508008292,
"grad_norm": 0.5588880844097838,
"learning_rate": 2.40578286451029e-06,
"loss": 0.3282,
"step": 1057
},
{
"epoch": 1.5948746938006406,
"grad_norm": 0.5241614280996468,
"learning_rate": 2.38867664156886e-06,
"loss": 0.3255,
"step": 1058
},
{
"epoch": 1.5963821368004523,
"grad_norm": 0.5543274849783603,
"learning_rate": 2.3716231997691007e-06,
"loss": 0.3175,
"step": 1059
},
{
"epoch": 1.5978895798002637,
"grad_norm": 0.5306578564545272,
"learning_rate": 2.3546226573690444e-06,
"loss": 0.3211,
"step": 1060
},
{
"epoch": 1.5993970228000753,
"grad_norm": 0.5401209566379707,
"learning_rate": 2.3376751322599e-06,
"loss": 0.3117,
"step": 1061
},
{
"epoch": 1.600904465799887,
"grad_norm": 0.5339229576030943,
"learning_rate": 2.320780741965206e-06,
"loss": 0.3064,
"step": 1062
},
{
"epoch": 1.6024119087996986,
"grad_norm": 0.5291570037477905,
"learning_rate": 2.3039396036400463e-06,
"loss": 0.3001,
"step": 1063
},
{
"epoch": 1.60391935179951,
"grad_norm": 0.5544131085966325,
"learning_rate": 2.287151834070226e-06,
"loss": 0.3173,
"step": 1064
},
{
"epoch": 1.6054267947993215,
"grad_norm": 0.5042273491393638,
"learning_rate": 2.2704175496714552e-06,
"loss": 0.3035,
"step": 1065
},
{
"epoch": 1.6069342377991331,
"grad_norm": 0.5164264296676705,
"learning_rate": 2.2537368664885527e-06,
"loss": 0.306,
"step": 1066
},
{
"epoch": 1.6084416807989448,
"grad_norm": 0.540939444102417,
"learning_rate": 2.2371099001946385e-06,
"loss": 0.3417,
"step": 1067
},
{
"epoch": 1.6099491237987564,
"grad_norm": 0.5349172500611197,
"learning_rate": 2.2205367660903267e-06,
"loss": 0.3155,
"step": 1068
},
{
"epoch": 1.611456566798568,
"grad_norm": 0.5392150017492342,
"learning_rate": 2.2040175791029305e-06,
"loss": 0.334,
"step": 1069
},
{
"epoch": 1.6129640097983795,
"grad_norm": 0.5420224175155496,
"learning_rate": 2.187552453785662e-06,
"loss": 0.2981,
"step": 1070
},
{
"epoch": 1.614471452798191,
"grad_norm": 0.5385758816342323,
"learning_rate": 2.1711415043168395e-06,
"loss": 0.3313,
"step": 1071
},
{
"epoch": 1.6159788957980026,
"grad_norm": 0.5437131207841849,
"learning_rate": 2.1547848444991025e-06,
"loss": 0.3352,
"step": 1072
},
{
"epoch": 1.6174863387978142,
"grad_norm": 0.5395621366503963,
"learning_rate": 2.138482587758605e-06,
"loss": 0.308,
"step": 1073
},
{
"epoch": 1.6189937817976259,
"grad_norm": 0.5255773037738669,
"learning_rate": 2.1222348471442477e-06,
"loss": 0.3014,
"step": 1074
},
{
"epoch": 1.6205012247974373,
"grad_norm": 0.5226567653631905,
"learning_rate": 2.1060417353268845e-06,
"loss": 0.3143,
"step": 1075
},
{
"epoch": 1.622008667797249,
"grad_norm": 0.5449482690998529,
"learning_rate": 2.0899033645985423e-06,
"loss": 0.3091,
"step": 1076
},
{
"epoch": 1.6235161107970604,
"grad_norm": 0.5424238529202222,
"learning_rate": 2.073819846871646e-06,
"loss": 0.3185,
"step": 1077
},
{
"epoch": 1.625023553796872,
"grad_norm": 0.5311225228497766,
"learning_rate": 2.0577912936782317e-06,
"loss": 0.2983,
"step": 1078
},
{
"epoch": 1.6265309967966837,
"grad_norm": 0.5208053420833654,
"learning_rate": 2.041817816169187e-06,
"loss": 0.3295,
"step": 1079
},
{
"epoch": 1.6280384397964953,
"grad_norm": 0.5269145283569221,
"learning_rate": 2.025899525113474e-06,
"loss": 0.3026,
"step": 1080
},
{
"epoch": 1.6295458827963067,
"grad_norm": 0.5371643301644231,
"learning_rate": 2.010036530897359e-06,
"loss": 0.3196,
"step": 1081
},
{
"epoch": 1.6310533257961182,
"grad_norm": 0.5349956576564423,
"learning_rate": 1.9942289435236506e-06,
"loss": 0.3304,
"step": 1082
},
{
"epoch": 1.6325607687959298,
"grad_norm": 0.5604150538269126,
"learning_rate": 1.978476872610939e-06,
"loss": 0.3485,
"step": 1083
},
{
"epoch": 1.6340682117957415,
"grad_norm": 0.5245207862200475,
"learning_rate": 1.962780427392823e-06,
"loss": 0.2882,
"step": 1084
},
{
"epoch": 1.6355756547955531,
"grad_norm": 0.5195585896558206,
"learning_rate": 1.9471397167171714e-06,
"loss": 0.3051,
"step": 1085
},
{
"epoch": 1.6370830977953648,
"grad_norm": 0.5147063039454572,
"learning_rate": 1.931554849045355e-06,
"loss": 0.3078,
"step": 1086
},
{
"epoch": 1.6385905407951762,
"grad_norm": 0.5303520825987951,
"learning_rate": 1.916025932451493e-06,
"loss": 0.3141,
"step": 1087
},
{
"epoch": 1.6400979837949876,
"grad_norm": 0.5154838630662848,
"learning_rate": 1.9005530746217238e-06,
"loss": 0.2971,
"step": 1088
},
{
"epoch": 1.6416054267947993,
"grad_norm": 0.5537432078636199,
"learning_rate": 1.8851363828534253e-06,
"loss": 0.3124,
"step": 1089
},
{
"epoch": 1.643112869794611,
"grad_norm": 0.5634336334894083,
"learning_rate": 1.869775964054501e-06,
"loss": 0.3271,
"step": 1090
},
{
"epoch": 1.6446203127944226,
"grad_norm": 0.5433031560068617,
"learning_rate": 1.8544719247426224e-06,
"loss": 0.3191,
"step": 1091
},
{
"epoch": 1.646127755794234,
"grad_norm": 0.5357448136347239,
"learning_rate": 1.8392243710444911e-06,
"loss": 0.2982,
"step": 1092
},
{
"epoch": 1.6476351987940456,
"grad_norm": 0.5552897165798768,
"learning_rate": 1.8240334086951117e-06,
"loss": 0.3537,
"step": 1093
},
{
"epoch": 1.649142641793857,
"grad_norm": 0.5318934621576651,
"learning_rate": 1.8088991430370506e-06,
"loss": 0.3005,
"step": 1094
},
{
"epoch": 1.6506500847936687,
"grad_norm": 0.5465559179605479,
"learning_rate": 1.7938216790197071e-06,
"loss": 0.3207,
"step": 1095
},
{
"epoch": 1.6521575277934804,
"grad_norm": 0.5641671337079456,
"learning_rate": 1.77880112119859e-06,
"loss": 0.3095,
"step": 1096
},
{
"epoch": 1.653664970793292,
"grad_norm": 0.5270236586496325,
"learning_rate": 1.7638375737345804e-06,
"loss": 0.312,
"step": 1097
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.5192997218773957,
"learning_rate": 1.7489311403932274e-06,
"loss": 0.2937,
"step": 1098
},
{
"epoch": 1.656679856792915,
"grad_norm": 0.5620286825583494,
"learning_rate": 1.7340819245440166e-06,
"loss": 0.3186,
"step": 1099
},
{
"epoch": 1.6581872997927265,
"grad_norm": 0.5445642066374056,
"learning_rate": 1.7192900291596493e-06,
"loss": 0.3222,
"step": 1100
},
{
"epoch": 1.6596947427925381,
"grad_norm": 0.5157869374514513,
"learning_rate": 1.7045555568153415e-06,
"loss": 0.306,
"step": 1101
},
{
"epoch": 1.6612021857923498,
"grad_norm": 0.5138381277919514,
"learning_rate": 1.6898786096881104e-06,
"loss": 0.2715,
"step": 1102
},
{
"epoch": 1.6627096287921614,
"grad_norm": 0.5431676779116132,
"learning_rate": 1.6752592895560493e-06,
"loss": 0.3202,
"step": 1103
},
{
"epoch": 1.6642170717919729,
"grad_norm": 0.5396928068758252,
"learning_rate": 1.6606976977976408e-06,
"loss": 0.3122,
"step": 1104
},
{
"epoch": 1.6657245147917843,
"grad_norm": 0.5332820215409003,
"learning_rate": 1.6461939353910494e-06,
"loss": 0.3083,
"step": 1105
},
{
"epoch": 1.667231957791596,
"grad_norm": 0.5475766503326961,
"learning_rate": 1.631748102913412e-06,
"loss": 0.3166,
"step": 1106
},
{
"epoch": 1.6687394007914076,
"grad_norm": 0.5416380330717204,
"learning_rate": 1.6173603005401505e-06,
"loss": 0.3158,
"step": 1107
},
{
"epoch": 1.6702468437912192,
"grad_norm": 0.5416213794269614,
"learning_rate": 1.6030306280442764e-06,
"loss": 0.3077,
"step": 1108
},
{
"epoch": 1.6717542867910307,
"grad_norm": 0.5153535761957841,
"learning_rate": 1.588759184795694e-06,
"loss": 0.3064,
"step": 1109
},
{
"epoch": 1.6732617297908423,
"grad_norm": 0.5315610226872074,
"learning_rate": 1.574546069760514e-06,
"loss": 0.3241,
"step": 1110
},
{
"epoch": 1.6747691727906537,
"grad_norm": 0.5684878986820251,
"learning_rate": 1.5603913815003634e-06,
"loss": 0.3403,
"step": 1111
},
{
"epoch": 1.6762766157904654,
"grad_norm": 0.5361918937068931,
"learning_rate": 1.5462952181717117e-06,
"loss": 0.3157,
"step": 1112
},
{
"epoch": 1.677784058790277,
"grad_norm": 0.5495570916809654,
"learning_rate": 1.532257677525183e-06,
"loss": 0.3224,
"step": 1113
},
{
"epoch": 1.6792915017900887,
"grad_norm": 0.5281943000093583,
"learning_rate": 1.5182788569048689e-06,
"loss": 0.3209,
"step": 1114
},
{
"epoch": 1.6807989447899,
"grad_norm": 0.5572787989841019,
"learning_rate": 1.5043588532476827e-06,
"loss": 0.3663,
"step": 1115
},
{
"epoch": 1.6823063877897118,
"grad_norm": 0.5228968415248135,
"learning_rate": 1.49049776308265e-06,
"loss": 0.2889,
"step": 1116
},
{
"epoch": 1.6838138307895232,
"grad_norm": 0.5220477714238411,
"learning_rate": 1.476695682530268e-06,
"loss": 0.3031,
"step": 1117
},
{
"epoch": 1.6853212737893348,
"grad_norm": 0.5351071420566498,
"learning_rate": 1.4629527073018267e-06,
"loss": 0.3308,
"step": 1118
},
{
"epoch": 1.6868287167891465,
"grad_norm": 0.5396208371722178,
"learning_rate": 1.449268932698743e-06,
"loss": 0.2971,
"step": 1119
},
{
"epoch": 1.6883361597889581,
"grad_norm": 0.5416569763589452,
"learning_rate": 1.4356444536119085e-06,
"loss": 0.3024,
"step": 1120
},
{
"epoch": 1.6898436027887695,
"grad_norm": 0.5370439069377987,
"learning_rate": 1.422079364521024e-06,
"loss": 0.3169,
"step": 1121
},
{
"epoch": 1.691351045788581,
"grad_norm": 0.5450238281058462,
"learning_rate": 1.4085737594939497e-06,
"loss": 0.333,
"step": 1122
},
{
"epoch": 1.6928584887883926,
"grad_norm": 0.5168406644621856,
"learning_rate": 1.3951277321860468e-06,
"loss": 0.3006,
"step": 1123
},
{
"epoch": 1.6943659317882043,
"grad_norm": 0.5087028192552481,
"learning_rate": 1.381741375839537e-06,
"loss": 0.2664,
"step": 1124
},
{
"epoch": 1.695873374788016,
"grad_norm": 0.5165999383002566,
"learning_rate": 1.3684147832828409e-06,
"loss": 0.281,
"step": 1125
},
{
"epoch": 1.6973808177878273,
"grad_norm": 0.5150348541332692,
"learning_rate": 1.355148046929956e-06,
"loss": 0.307,
"step": 1126
},
{
"epoch": 1.698888260787639,
"grad_norm": 0.5168141041103775,
"learning_rate": 1.3419412587797908e-06,
"loss": 0.293,
"step": 1127
},
{
"epoch": 1.7003957037874504,
"grad_norm": 0.5133973098786774,
"learning_rate": 1.3287945104155487e-06,
"loss": 0.3015,
"step": 1128
},
{
"epoch": 1.701903146787262,
"grad_norm": 0.5513676691974454,
"learning_rate": 1.3157078930040856e-06,
"loss": 0.3179,
"step": 1129
},
{
"epoch": 1.7034105897870737,
"grad_norm": 0.546503387891844,
"learning_rate": 1.3026814972952674e-06,
"loss": 0.3043,
"step": 1130
},
{
"epoch": 1.7049180327868854,
"grad_norm": 0.5133928571679112,
"learning_rate": 1.2897154136213542e-06,
"loss": 0.2938,
"step": 1131
},
{
"epoch": 1.7064254757866968,
"grad_norm": 0.5441429881648664,
"learning_rate": 1.2768097318963701e-06,
"loss": 0.2978,
"step": 1132
},
{
"epoch": 1.7079329187865084,
"grad_norm": 0.5430821498300733,
"learning_rate": 1.2639645416154744e-06,
"loss": 0.3204,
"step": 1133
},
{
"epoch": 1.7094403617863199,
"grad_norm": 0.5536392631462185,
"learning_rate": 1.2511799318543493e-06,
"loss": 0.3196,
"step": 1134
},
{
"epoch": 1.7109478047861315,
"grad_norm": 0.5403946840340975,
"learning_rate": 1.2384559912685768e-06,
"loss": 0.3156,
"step": 1135
},
{
"epoch": 1.7124552477859432,
"grad_norm": 0.5360872549447121,
"learning_rate": 1.2257928080930236e-06,
"loss": 0.3275,
"step": 1136
},
{
"epoch": 1.7139626907857548,
"grad_norm": 0.5482125645711281,
"learning_rate": 1.2131904701412345e-06,
"loss": 0.3041,
"step": 1137
},
{
"epoch": 1.7154701337855662,
"grad_norm": 0.5497152879102077,
"learning_rate": 1.2006490648048118e-06,
"loss": 0.3148,
"step": 1138
},
{
"epoch": 1.7169775767853777,
"grad_norm": 0.5141035384965538,
"learning_rate": 1.1881686790528279e-06,
"loss": 0.3068,
"step": 1139
},
{
"epoch": 1.7184850197851893,
"grad_norm": 0.529010142033819,
"learning_rate": 1.1757493994312052e-06,
"loss": 0.3088,
"step": 1140
},
{
"epoch": 1.719992462785001,
"grad_norm": 0.5495066854090749,
"learning_rate": 1.1633913120621188e-06,
"loss": 0.3236,
"step": 1141
},
{
"epoch": 1.7214999057848126,
"grad_norm": 0.5492447937245492,
"learning_rate": 1.151094502643414e-06,
"loss": 0.3308,
"step": 1142
},
{
"epoch": 1.723007348784624,
"grad_norm": 0.5562630282965828,
"learning_rate": 1.1388590564479895e-06,
"loss": 0.299,
"step": 1143
},
{
"epoch": 1.7245147917844357,
"grad_norm": 0.5231751271166386,
"learning_rate": 1.1266850583232224e-06,
"loss": 0.3053,
"step": 1144
},
{
"epoch": 1.726022234784247,
"grad_norm": 0.5201419013982067,
"learning_rate": 1.1145725926903772e-06,
"loss": 0.3023,
"step": 1145
},
{
"epoch": 1.7275296777840587,
"grad_norm": 0.511856980805046,
"learning_rate": 1.1025217435440116e-06,
"loss": 0.2867,
"step": 1146
},
{
"epoch": 1.7290371207838704,
"grad_norm": 0.565983997374927,
"learning_rate": 1.0905325944514034e-06,
"loss": 0.3232,
"step": 1147
},
{
"epoch": 1.730544563783682,
"grad_norm": 0.5363045072212188,
"learning_rate": 1.078605228551971e-06,
"loss": 0.3182,
"step": 1148
},
{
"epoch": 1.7320520067834935,
"grad_norm": 0.5329886787330583,
"learning_rate": 1.0667397285566893e-06,
"loss": 0.3061,
"step": 1149
},
{
"epoch": 1.7335594497833051,
"grad_norm": 0.5397864180847504,
"learning_rate": 1.0549361767475241e-06,
"loss": 0.2873,
"step": 1150
},
{
"epoch": 1.7350668927831165,
"grad_norm": 0.5436027401118747,
"learning_rate": 1.0431946549768567e-06,
"loss": 0.3213,
"step": 1151
},
{
"epoch": 1.7365743357829282,
"grad_norm": 0.5304426225729307,
"learning_rate": 1.0315152446669142e-06,
"loss": 0.295,
"step": 1152
},
{
"epoch": 1.7380817787827398,
"grad_norm": 0.5119724476906113,
"learning_rate": 1.019898026809214e-06,
"loss": 0.3009,
"step": 1153
},
{
"epoch": 1.7395892217825515,
"grad_norm": 0.5399752438286287,
"learning_rate": 1.0083430819639962e-06,
"loss": 0.3097,
"step": 1154
},
{
"epoch": 1.741096664782363,
"grad_norm": 0.5329124149971953,
"learning_rate": 9.968504902596566e-07,
"loss": 0.3094,
"step": 1155
},
{
"epoch": 1.7426041077821743,
"grad_norm": 0.5265575018375785,
"learning_rate": 9.85420331392214e-07,
"loss": 0.3001,
"step": 1156
},
{
"epoch": 1.744111550781986,
"grad_norm": 0.5415027063140824,
"learning_rate": 9.74052684624731e-07,
"loss": 0.3052,
"step": 1157
},
{
"epoch": 1.7456189937817976,
"grad_norm": 0.5273083269054069,
"learning_rate": 9.62747628786782e-07,
"loss": 0.2918,
"step": 1158
},
{
"epoch": 1.7471264367816093,
"grad_norm": 0.5467675396074031,
"learning_rate": 9.515052422739035e-07,
"loss": 0.3013,
"step": 1159
},
{
"epoch": 1.748633879781421,
"grad_norm": 0.5012647001758278,
"learning_rate": 9.403256030470386e-07,
"loss": 0.2922,
"step": 1160
},
{
"epoch": 1.7501413227812324,
"grad_norm": 0.5473662670334606,
"learning_rate": 9.292087886320166e-07,
"loss": 0.3275,
"step": 1161
},
{
"epoch": 1.7516487657810438,
"grad_norm": 0.5119711645632107,
"learning_rate": 9.181548761189996e-07,
"loss": 0.2997,
"step": 1162
},
{
"epoch": 1.7531562087808554,
"grad_norm": 0.5579153837729429,
"learning_rate": 9.071639421619527e-07,
"loss": 0.3373,
"step": 1163
},
{
"epoch": 1.754663651780667,
"grad_norm": 0.5369066551498206,
"learning_rate": 8.962360629781164e-07,
"loss": 0.3013,
"step": 1164
},
{
"epoch": 1.7561710947804787,
"grad_norm": 0.5299407293801213,
"learning_rate": 8.853713143474685e-07,
"loss": 0.2977,
"step": 1165
},
{
"epoch": 1.7576785377802902,
"grad_norm": 0.5246411563266595,
"learning_rate": 8.745697716122081e-07,
"loss": 0.3192,
"step": 1166
},
{
"epoch": 1.7591859807801018,
"grad_norm": 0.5450557723814945,
"learning_rate": 8.638315096762318e-07,
"loss": 0.3075,
"step": 1167
},
{
"epoch": 1.7606934237799132,
"grad_norm": 0.5392032704205785,
"learning_rate": 8.531566030046035e-07,
"loss": 0.3231,
"step": 1168
},
{
"epoch": 1.7622008667797249,
"grad_norm": 0.5370373229347402,
"learning_rate": 8.425451256230588e-07,
"loss": 0.3012,
"step": 1169
},
{
"epoch": 1.7637083097795365,
"grad_norm": 0.5405109358545394,
"learning_rate": 8.319971511174718e-07,
"loss": 0.3165,
"step": 1170
},
{
"epoch": 1.7652157527793482,
"grad_norm": 0.528010212076121,
"learning_rate": 8.215127526333499e-07,
"loss": 0.3236,
"step": 1171
},
{
"epoch": 1.7667231957791596,
"grad_norm": 0.5226712118154457,
"learning_rate": 8.110920028753355e-07,
"loss": 0.3088,
"step": 1172
},
{
"epoch": 1.768230638778971,
"grad_norm": 0.5574504221106463,
"learning_rate": 8.007349741066939e-07,
"loss": 0.3258,
"step": 1173
},
{
"epoch": 1.7697380817787827,
"grad_norm": 0.5452638755092976,
"learning_rate": 7.904417381488083e-07,
"loss": 0.3167,
"step": 1174
},
{
"epoch": 1.7712455247785943,
"grad_norm": 0.5517680932401637,
"learning_rate": 7.802123663806938e-07,
"loss": 0.3227,
"step": 1175
},
{
"epoch": 1.772752967778406,
"grad_norm": 0.5480833770797733,
"learning_rate": 7.700469297384927e-07,
"loss": 0.3307,
"step": 1176
},
{
"epoch": 1.7742604107782176,
"grad_norm": 0.5430614726412718,
"learning_rate": 7.599454987149868e-07,
"loss": 0.3463,
"step": 1177
},
{
"epoch": 1.775767853778029,
"grad_norm": 0.549255722400039,
"learning_rate": 7.499081433591071e-07,
"loss": 0.3284,
"step": 1178
},
{
"epoch": 1.7772752967778405,
"grad_norm": 0.5410909371721678,
"learning_rate": 7.399349332754458e-07,
"loss": 0.315,
"step": 1179
},
{
"epoch": 1.778782739777652,
"grad_norm": 0.530772119547457,
"learning_rate": 7.300259376237795e-07,
"loss": 0.3101,
"step": 1180
},
{
"epoch": 1.7802901827774638,
"grad_norm": 0.5484763677836378,
"learning_rate": 7.201812251185869e-07,
"loss": 0.3192,
"step": 1181
},
{
"epoch": 1.7817976257772754,
"grad_norm": 0.5299690672941552,
"learning_rate": 7.104008640285642e-07,
"loss": 0.3115,
"step": 1182
},
{
"epoch": 1.7833050687770868,
"grad_norm": 0.5424348226189657,
"learning_rate": 7.006849221761736e-07,
"loss": 0.3119,
"step": 1183
},
{
"epoch": 1.7848125117768985,
"grad_norm": 0.5221149596372863,
"learning_rate": 6.910334669371433e-07,
"loss": 0.3078,
"step": 1184
},
{
"epoch": 1.78631995477671,
"grad_norm": 0.5581110749980865,
"learning_rate": 6.814465652400237e-07,
"loss": 0.3364,
"step": 1185
},
{
"epoch": 1.7878273977765216,
"grad_norm": 0.5227271898985753,
"learning_rate": 6.719242835657147e-07,
"loss": 0.3057,
"step": 1186
},
{
"epoch": 1.7893348407763332,
"grad_norm": 0.5541663205023336,
"learning_rate": 6.62466687947001e-07,
"loss": 0.335,
"step": 1187
},
{
"epoch": 1.7908422837761448,
"grad_norm": 0.5269336686543489,
"learning_rate": 6.530738439681017e-07,
"loss": 0.3151,
"step": 1188
},
{
"epoch": 1.7923497267759563,
"grad_norm": 0.5786593133724774,
"learning_rate": 6.437458167642164e-07,
"loss": 0.3366,
"step": 1189
},
{
"epoch": 1.7938571697757677,
"grad_norm": 0.5253461874134103,
"learning_rate": 6.344826710210584e-07,
"loss": 0.3127,
"step": 1190
},
{
"epoch": 1.7953646127755793,
"grad_norm": 0.5299856089834871,
"learning_rate": 6.252844709744255e-07,
"loss": 0.3135,
"step": 1191
},
{
"epoch": 1.796872055775391,
"grad_norm": 0.521456043233,
"learning_rate": 6.161512804097436e-07,
"loss": 0.2977,
"step": 1192
},
{
"epoch": 1.7983794987752026,
"grad_norm": 0.5469215546867617,
"learning_rate": 6.070831626616236e-07,
"loss": 0.3255,
"step": 1193
},
{
"epoch": 1.7998869417750143,
"grad_norm": 0.5228672904607026,
"learning_rate": 5.980801806134318e-07,
"loss": 0.2866,
"step": 1194
},
{
"epoch": 1.8013943847748257,
"grad_norm": 0.5146912409802952,
"learning_rate": 5.891423966968413e-07,
"loss": 0.2881,
"step": 1195
},
{
"epoch": 1.8029018277746371,
"grad_norm": 0.5474095770320819,
"learning_rate": 5.80269872891408e-07,
"loss": 0.3036,
"step": 1196
},
{
"epoch": 1.8044092707744488,
"grad_norm": 0.5298374503272721,
"learning_rate": 5.714626707241411e-07,
"loss": 0.3032,
"step": 1197
},
{
"epoch": 1.8059167137742604,
"grad_norm": 0.5577685295614593,
"learning_rate": 5.627208512690641e-07,
"loss": 0.3136,
"step": 1198
},
{
"epoch": 1.807424156774072,
"grad_norm": 0.5641313668289314,
"learning_rate": 5.5404447514681e-07,
"loss": 0.3057,
"step": 1199
},
{
"epoch": 1.8089315997738835,
"grad_norm": 0.5295651592847044,
"learning_rate": 5.45433602524188e-07,
"loss": 0.292,
"step": 1200
},
{
"epoch": 1.8104390427736952,
"grad_norm": 0.5164790354946905,
"learning_rate": 5.368882931137675e-07,
"loss": 0.3171,
"step": 1201
},
{
"epoch": 1.8119464857735066,
"grad_norm": 0.5440394178727653,
"learning_rate": 5.284086061734672e-07,
"loss": 0.3389,
"step": 1202
},
{
"epoch": 1.8134539287733182,
"grad_norm": 0.5379140129646219,
"learning_rate": 5.199946005061462e-07,
"loss": 0.3191,
"step": 1203
},
{
"epoch": 1.8149613717731299,
"grad_norm": 0.5586596394798488,
"learning_rate": 5.116463344591893e-07,
"loss": 0.3297,
"step": 1204
},
{
"epoch": 1.8164688147729415,
"grad_norm": 0.5460139307968361,
"learning_rate": 5.033638659241102e-07,
"loss": 0.3179,
"step": 1205
},
{
"epoch": 1.817976257772753,
"grad_norm": 0.5099561737975997,
"learning_rate": 4.951472523361401e-07,
"loss": 0.2881,
"step": 1206
},
{
"epoch": 1.8194837007725644,
"grad_norm": 0.5127038476010487,
"learning_rate": 4.869965506738416e-07,
"loss": 0.301,
"step": 1207
},
{
"epoch": 1.820991143772376,
"grad_norm": 0.5160382306170839,
"learning_rate": 4.789118174587071e-07,
"loss": 0.2951,
"step": 1208
},
{
"epoch": 1.8224985867721877,
"grad_norm": 0.5368235750006268,
"learning_rate": 4.7089310875475856e-07,
"loss": 0.3244,
"step": 1209
},
{
"epoch": 1.8240060297719993,
"grad_norm": 0.5486310091200752,
"learning_rate": 4.6294048016817917e-07,
"loss": 0.3357,
"step": 1210
},
{
"epoch": 1.825513472771811,
"grad_norm": 0.5531427959479509,
"learning_rate": 4.550539868469106e-07,
"loss": 0.3404,
"step": 1211
},
{
"epoch": 1.8270209157716224,
"grad_norm": 0.5715387730686571,
"learning_rate": 4.4723368348027375e-07,
"loss": 0.3172,
"step": 1212
},
{
"epoch": 1.8285283587714338,
"grad_norm": 0.5208566706980682,
"learning_rate": 4.394796242985933e-07,
"loss": 0.3334,
"step": 1213
},
{
"epoch": 1.8300358017712455,
"grad_norm": 0.5088911051543478,
"learning_rate": 4.317918630728235e-07,
"loss": 0.3022,
"step": 1214
},
{
"epoch": 1.8315432447710571,
"grad_norm": 0.5459330178466746,
"learning_rate": 4.241704531141633e-07,
"loss": 0.3192,
"step": 1215
},
{
"epoch": 1.8330506877708688,
"grad_norm": 0.5256076032695434,
"learning_rate": 4.166154472737061e-07,
"loss": 0.2962,
"step": 1216
},
{
"epoch": 1.8345581307706802,
"grad_norm": 0.525111212488327,
"learning_rate": 4.091268979420537e-07,
"loss": 0.3015,
"step": 1217
},
{
"epoch": 1.8360655737704918,
"grad_norm": 0.5131054923837834,
"learning_rate": 4.0170485704896453e-07,
"loss": 0.2984,
"step": 1218
},
{
"epoch": 1.8375730167703033,
"grad_norm": 0.5222731798701179,
"learning_rate": 3.943493760629924e-07,
"loss": 0.3007,
"step": 1219
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.526864271558084,
"learning_rate": 3.8706050599112363e-07,
"loss": 0.3265,
"step": 1220
},
{
"epoch": 1.8405879027699266,
"grad_norm": 0.5429845817805197,
"learning_rate": 3.798382973784298e-07,
"loss": 0.3126,
"step": 1221
},
{
"epoch": 1.8420953457697382,
"grad_norm": 0.5203043666384023,
"learning_rate": 3.7268280030771655e-07,
"loss": 0.3005,
"step": 1222
},
{
"epoch": 1.8436027887695496,
"grad_norm": 0.5553304676785623,
"learning_rate": 3.655940643991718e-07,
"loss": 0.3033,
"step": 1223
},
{
"epoch": 1.845110231769361,
"grad_norm": 0.5453329521030166,
"learning_rate": 3.585721388100283e-07,
"loss": 0.3197,
"step": 1224
},
{
"epoch": 1.8466176747691727,
"grad_norm": 0.5358448339719871,
"learning_rate": 3.516170722342127e-07,
"loss": 0.3123,
"step": 1225
},
{
"epoch": 1.8481251177689844,
"grad_norm": 0.5255670215605667,
"learning_rate": 3.4472891290201927e-07,
"loss": 0.3052,
"step": 1226
},
{
"epoch": 1.849632560768796,
"grad_norm": 0.5558001652650641,
"learning_rate": 3.3790770857976995e-07,
"loss": 0.326,
"step": 1227
},
{
"epoch": 1.8511400037686077,
"grad_norm": 0.5383925013665675,
"learning_rate": 3.3115350656948043e-07,
"loss": 0.3074,
"step": 1228
},
{
"epoch": 1.852647446768419,
"grad_norm": 0.537705868540796,
"learning_rate": 3.2446635370853686e-07,
"loss": 0.3304,
"step": 1229
},
{
"epoch": 1.8541548897682305,
"grad_norm": 0.5382781367285551,
"learning_rate": 3.1784629636937404e-07,
"loss": 0.2883,
"step": 1230
},
{
"epoch": 1.8556623327680422,
"grad_norm": 0.5198656289929648,
"learning_rate": 3.1129338045914004e-07,
"loss": 0.3067,
"step": 1231
},
{
"epoch": 1.8571697757678538,
"grad_norm": 0.5472844326917599,
"learning_rate": 3.0480765141939316e-07,
"loss": 0.2992,
"step": 1232
},
{
"epoch": 1.8586772187676655,
"grad_norm": 0.5507075256404199,
"learning_rate": 2.9838915422578e-07,
"loss": 0.3179,
"step": 1233
},
{
"epoch": 1.8601846617674769,
"grad_norm": 0.5510217424809797,
"learning_rate": 2.920379333877221e-07,
"loss": 0.2994,
"step": 1234
},
{
"epoch": 1.8616921047672885,
"grad_norm": 0.5400374711049234,
"learning_rate": 2.8575403294811123e-07,
"loss": 0.32,
"step": 1235
},
{
"epoch": 1.8631995477671,
"grad_norm": 0.5378450793044806,
"learning_rate": 2.795374964830022e-07,
"loss": 0.2982,
"step": 1236
},
{
"epoch": 1.8647069907669116,
"grad_norm": 0.5537869321901812,
"learning_rate": 2.733883671013082e-07,
"loss": 0.3215,
"step": 1237
},
{
"epoch": 1.8662144337667232,
"grad_norm": 0.5214658307160525,
"learning_rate": 2.673066874445096e-07,
"loss": 0.2967,
"step": 1238
},
{
"epoch": 1.867721876766535,
"grad_norm": 0.5396057264477051,
"learning_rate": 2.612924996863453e-07,
"loss": 0.3323,
"step": 1239
},
{
"epoch": 1.8692293197663463,
"grad_norm": 0.5029335509841266,
"learning_rate": 2.5534584553253526e-07,
"loss": 0.2874,
"step": 1240
},
{
"epoch": 1.8707367627661577,
"grad_norm": 0.5258804019291271,
"learning_rate": 2.494667662204797e-07,
"loss": 0.2899,
"step": 1241
},
{
"epoch": 1.8722442057659694,
"grad_norm": 0.5248481914254555,
"learning_rate": 2.436553025189758e-07,
"loss": 0.3024,
"step": 1242
},
{
"epoch": 1.873751648765781,
"grad_norm": 0.5561247405618174,
"learning_rate": 2.3791149472794373e-07,
"loss": 0.3224,
"step": 1243
},
{
"epoch": 1.8752590917655927,
"grad_norm": 0.5386355445772364,
"learning_rate": 2.3223538267813317e-07,
"loss": 0.3252,
"step": 1244
},
{
"epoch": 1.8767665347654043,
"grad_norm": 0.5387316814949316,
"learning_rate": 2.2662700573085505e-07,
"loss": 0.3188,
"step": 1245
},
{
"epoch": 1.8782739777652158,
"grad_norm": 0.5140491567851894,
"learning_rate": 2.2108640277771153e-07,
"loss": 0.3087,
"step": 1246
},
{
"epoch": 1.8797814207650272,
"grad_norm": 0.5578574961077984,
"learning_rate": 2.156136122403174e-07,
"loss": 0.3339,
"step": 1247
},
{
"epoch": 1.8812888637648388,
"grad_norm": 0.531923059262347,
"learning_rate": 2.1020867207004026e-07,
"loss": 0.302,
"step": 1248
},
{
"epoch": 1.8827963067646505,
"grad_norm": 0.5199091046599704,
"learning_rate": 2.048716197477374e-07,
"loss": 0.3,
"step": 1249
},
{
"epoch": 1.8843037497644621,
"grad_norm": 0.5292062225255757,
"learning_rate": 1.996024922834905e-07,
"loss": 0.3136,
"step": 1250
},
{
"epoch": 1.8858111927642736,
"grad_norm": 0.5116718173644801,
"learning_rate": 1.9440132621635687e-07,
"loss": 0.3022,
"step": 1251
},
{
"epoch": 1.8873186357640852,
"grad_norm": 0.5311851666913588,
"learning_rate": 1.8926815761410867e-07,
"loss": 0.3119,
"step": 1252
},
{
"epoch": 1.8888260787638966,
"grad_norm": 0.5164961460809835,
"learning_rate": 1.8420302207298623e-07,
"loss": 0.3366,
"step": 1253
},
{
"epoch": 1.8903335217637083,
"grad_norm": 0.504315215516738,
"learning_rate": 1.792059547174507e-07,
"loss": 0.2975,
"step": 1254
},
{
"epoch": 1.89184096476352,
"grad_norm": 0.5345352832708335,
"learning_rate": 1.7427699019994415e-07,
"loss": 0.3236,
"step": 1255
},
{
"epoch": 1.8933484077633316,
"grad_norm": 0.5381742517110331,
"learning_rate": 1.6941616270063854e-07,
"loss": 0.3279,
"step": 1256
},
{
"epoch": 1.894855850763143,
"grad_norm": 0.5375566393713683,
"learning_rate": 1.6462350592721498e-07,
"loss": 0.3362,
"step": 1257
},
{
"epoch": 1.8963632937629544,
"grad_norm": 0.5285514266127366,
"learning_rate": 1.5989905311461274e-07,
"loss": 0.3204,
"step": 1258
},
{
"epoch": 1.897870736762766,
"grad_norm": 0.5414536639413304,
"learning_rate": 1.5524283702481158e-07,
"loss": 0.3335,
"step": 1259
},
{
"epoch": 1.8993781797625777,
"grad_norm": 0.512538356667461,
"learning_rate": 1.5065488994659983e-07,
"loss": 0.3053,
"step": 1260
},
{
"epoch": 1.9008856227623894,
"grad_norm": 0.5324212249495981,
"learning_rate": 1.461352436953478e-07,
"loss": 0.3072,
"step": 1261
},
{
"epoch": 1.902393065762201,
"grad_norm": 0.5455697348576503,
"learning_rate": 1.4168392961279254e-07,
"loss": 0.3316,
"step": 1262
},
{
"epoch": 1.9039005087620124,
"grad_norm": 0.5466375519251029,
"learning_rate": 1.3730097856681668e-07,
"loss": 0.3226,
"step": 1263
},
{
"epoch": 1.9054079517618239,
"grad_norm": 0.5312632713929628,
"learning_rate": 1.329864209512377e-07,
"loss": 0.315,
"step": 1264
},
{
"epoch": 1.9069153947616355,
"grad_norm": 0.5425648068314173,
"learning_rate": 1.2874028668559247e-07,
"loss": 0.3235,
"step": 1265
},
{
"epoch": 1.9084228377614472,
"grad_norm": 0.5312642091039448,
"learning_rate": 1.245626052149318e-07,
"loss": 0.3203,
"step": 1266
},
{
"epoch": 1.9099302807612588,
"grad_norm": 0.532495465640754,
"learning_rate": 1.2045340550961958e-07,
"loss": 0.3155,
"step": 1267
},
{
"epoch": 1.9114377237610702,
"grad_norm": 0.5246778980321247,
"learning_rate": 1.164127160651285e-07,
"loss": 0.2926,
"step": 1268
},
{
"epoch": 1.9129451667608819,
"grad_norm": 0.5339514500193528,
"learning_rate": 1.1244056490184008e-07,
"loss": 0.3029,
"step": 1269
},
{
"epoch": 1.9144526097606933,
"grad_norm": 0.520828858822998,
"learning_rate": 1.0853697956485942e-07,
"loss": 0.3065,
"step": 1270
},
{
"epoch": 1.915960052760505,
"grad_norm": 0.520817868672033,
"learning_rate": 1.0470198712381086e-07,
"loss": 0.307,
"step": 1271
},
{
"epoch": 1.9174674957603166,
"grad_norm": 0.516414932582989,
"learning_rate": 1.009356141726614e-07,
"loss": 0.3101,
"step": 1272
},
{
"epoch": 1.9189749387601283,
"grad_norm": 0.549210829131398,
"learning_rate": 9.723788682953539e-08,
"loss": 0.3562,
"step": 1273
},
{
"epoch": 1.9204823817599397,
"grad_norm": 0.5457067373758283,
"learning_rate": 9.360883073652238e-08,
"loss": 0.3179,
"step": 1274
},
{
"epoch": 1.921989824759751,
"grad_norm": 0.5418508804321499,
"learning_rate": 9.004847105951509e-08,
"loss": 0.3159,
"step": 1275
},
{
"epoch": 1.9234972677595628,
"grad_norm": 0.5366441286826634,
"learning_rate": 8.655683248802282e-08,
"loss": 0.2996,
"step": 1276
},
{
"epoch": 1.9250047107593744,
"grad_norm": 0.5442333602669928,
"learning_rate": 8.313393923500613e-08,
"loss": 0.3088,
"step": 1277
},
{
"epoch": 1.926512153759186,
"grad_norm": 0.5391901662166373,
"learning_rate": 7.977981503670795e-08,
"loss": 0.3061,
"step": 1278
},
{
"epoch": 1.9280195967589977,
"grad_norm": 0.5435340810409717,
"learning_rate": 7.64944831524872e-08,
"loss": 0.3285,
"step": 1279
},
{
"epoch": 1.9295270397588091,
"grad_norm": 0.521864945549257,
"learning_rate": 7.327796636465767e-08,
"loss": 0.3076,
"step": 1280
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.5514257916288331,
"learning_rate": 7.01302869783338e-08,
"loss": 0.3094,
"step": 1281
},
{
"epoch": 1.9325419257584322,
"grad_norm": 0.5219787357249853,
"learning_rate": 6.705146682127184e-08,
"loss": 0.2995,
"step": 1282
},
{
"epoch": 1.9340493687582438,
"grad_norm": 0.5276229371813537,
"learning_rate": 6.404152724371892e-08,
"loss": 0.3091,
"step": 1283
},
{
"epoch": 1.9355568117580555,
"grad_norm": 0.5314226178684127,
"learning_rate": 6.110048911826871e-08,
"loss": 0.3158,
"step": 1284
},
{
"epoch": 1.937064254757867,
"grad_norm": 0.5502887577785275,
"learning_rate": 5.82283728397115e-08,
"loss": 0.3215,
"step": 1285
},
{
"epoch": 1.9385716977576786,
"grad_norm": 0.5486357521857442,
"learning_rate": 5.542519832489546e-08,
"loss": 0.3386,
"step": 1286
},
{
"epoch": 1.94007914075749,
"grad_norm": 0.5249846416693436,
"learning_rate": 5.269098501259007e-08,
"loss": 0.3097,
"step": 1287
},
{
"epoch": 1.9415865837573016,
"grad_norm": 0.5254444548298214,
"learning_rate": 5.002575186334735e-08,
"loss": 0.3357,
"step": 1288
},
{
"epoch": 1.9430940267571133,
"grad_norm": 0.5333212942639225,
"learning_rate": 4.742951735937418e-08,
"loss": 0.3051,
"step": 1289
},
{
"epoch": 1.944601469756925,
"grad_norm": 0.551663577132892,
"learning_rate": 4.490229950440239e-08,
"loss": 0.3004,
"step": 1290
},
{
"epoch": 1.9461089127567364,
"grad_norm": 0.5255799475848183,
"learning_rate": 4.2444115823562226e-08,
"loss": 0.2978,
"step": 1291
},
{
"epoch": 1.9476163557565478,
"grad_norm": 0.5122243774948546,
"learning_rate": 4.005498336326463e-08,
"loss": 0.2904,
"step": 1292
},
{
"epoch": 1.9491237987563594,
"grad_norm": 0.5387853567764707,
"learning_rate": 3.773491869108137e-08,
"loss": 0.3186,
"step": 1293
},
{
"epoch": 1.950631241756171,
"grad_norm": 0.5188343179014033,
"learning_rate": 3.548393789562732e-08,
"loss": 0.3009,
"step": 1294
},
{
"epoch": 1.9521386847559827,
"grad_norm": 0.5377374320626865,
"learning_rate": 3.3302056586453916e-08,
"loss": 0.3054,
"step": 1295
},
{
"epoch": 1.9536461277557944,
"grad_norm": 0.535561804417277,
"learning_rate": 3.118928989393699e-08,
"loss": 0.296,
"step": 1296
},
{
"epoch": 1.9551535707556058,
"grad_norm": 0.5298311028053398,
"learning_rate": 2.9145652469174666e-08,
"loss": 0.3049,
"step": 1297
},
{
"epoch": 1.9566610137554172,
"grad_norm": 0.5269215357647239,
"learning_rate": 2.7171158483882963e-08,
"loss": 0.2986,
"step": 1298
},
{
"epoch": 1.9581684567552289,
"grad_norm": 0.5270798678914951,
"learning_rate": 2.5265821630298116e-08,
"loss": 0.3214,
"step": 1299
},
{
"epoch": 1.9596758997550405,
"grad_norm": 0.5488284780441306,
"learning_rate": 2.3429655121085525e-08,
"loss": 0.3293,
"step": 1300
},
{
"epoch": 1.9611833427548522,
"grad_norm": 0.5281296618472574,
"learning_rate": 2.1662671689242076e-08,
"loss": 0.3269,
"step": 1301
},
{
"epoch": 1.9626907857546636,
"grad_norm": 0.5477047385786338,
"learning_rate": 1.996488358801174e-08,
"loss": 0.3116,
"step": 1302
},
{
"epoch": 1.9641982287544753,
"grad_norm": 0.548270877454329,
"learning_rate": 1.8336302590798992e-08,
"loss": 0.3415,
"step": 1303
},
{
"epoch": 1.9657056717542867,
"grad_norm": 0.5385366961987965,
"learning_rate": 1.677693999109109e-08,
"loss": 0.3036,
"step": 1304
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.5125316134927453,
"learning_rate": 1.5286806602372583e-08,
"loss": 0.2899,
"step": 1305
},
{
"epoch": 1.96872055775391,
"grad_norm": 0.5211226615097172,
"learning_rate": 1.3865912758054267e-08,
"loss": 0.3025,
"step": 1306
},
{
"epoch": 1.9702280007537216,
"grad_norm": 0.5265304987884217,
"learning_rate": 1.2514268311405452e-08,
"loss": 0.3005,
"step": 1307
},
{
"epoch": 1.971735443753533,
"grad_norm": 0.5410147654111483,
"learning_rate": 1.1231882635477364e-08,
"loss": 0.3119,
"step": 1308
},
{
"epoch": 1.9732428867533447,
"grad_norm": 0.5170988890501786,
"learning_rate": 1.0018764623045407e-08,
"loss": 0.2958,
"step": 1309
},
{
"epoch": 1.9747503297531561,
"grad_norm": 0.5184171784095163,
"learning_rate": 8.874922686541442e-09,
"loss": 0.2924,
"step": 1310
},
{
"epoch": 1.9762577727529678,
"grad_norm": 0.5140179548472411,
"learning_rate": 7.800364758002721e-09,
"loss": 0.2935,
"step": 1311
},
{
"epoch": 1.9777652157527794,
"grad_norm": 0.5486141459025122,
"learning_rate": 6.795098289008595e-09,
"loss": 0.315,
"step": 1312
},
{
"epoch": 1.979272658752591,
"grad_norm": 0.5311267803536656,
"learning_rate": 5.859130250636113e-09,
"loss": 0.3115,
"step": 1313
},
{
"epoch": 1.9807801017524025,
"grad_norm": 0.5127976418049099,
"learning_rate": 4.992467133406731e-09,
"loss": 0.2853,
"step": 1314
},
{
"epoch": 1.982287544752214,
"grad_norm": 0.5328293684979241,
"learning_rate": 4.195114947244117e-09,
"loss": 0.3117,
"step": 1315
},
{
"epoch": 1.9837949877520256,
"grad_norm": 0.5211491118658048,
"learning_rate": 3.4670792214297476e-09,
"loss": 0.3049,
"step": 1316
},
{
"epoch": 1.9853024307518372,
"grad_norm": 0.532080710582646,
"learning_rate": 2.808365004569602e-09,
"loss": 0.3075,
"step": 1317
},
{
"epoch": 1.9868098737516489,
"grad_norm": 0.530526147652671,
"learning_rate": 2.2189768645519693e-09,
"loss": 0.3158,
"step": 1318
},
{
"epoch": 1.9883173167514603,
"grad_norm": 0.5335931657065038,
"learning_rate": 1.6989188885219165e-09,
"loss": 0.319,
"step": 1319
},
{
"epoch": 1.989824759751272,
"grad_norm": 0.527218281586083,
"learning_rate": 1.2481946828502011e-09,
"loss": 0.2986,
"step": 1320
},
{
"epoch": 1.9913322027510834,
"grad_norm": 0.5444631806162264,
"learning_rate": 8.668073731088467e-10,
"loss": 0.3414,
"step": 1321
},
{
"epoch": 1.992839645750895,
"grad_norm": 0.5458452249259766,
"learning_rate": 5.547596040489378e-10,
"loss": 0.3312,
"step": 1322
},
{
"epoch": 1.9943470887507067,
"grad_norm": 0.5337875506880636,
"learning_rate": 3.1205353958285724e-10,
"loss": 0.3065,
"step": 1323
},
{
"epoch": 1.9958545317505183,
"grad_norm": 0.5477429410153635,
"learning_rate": 1.3869086276985243e-10,
"loss": 0.308,
"step": 1324
},
{
"epoch": 1.9973619747503297,
"grad_norm": 0.5355633680169556,
"learning_rate": 3.467277580271322e-11,
"loss": 0.3114,
"step": 1325
},
{
"epoch": 1.9988694177501414,
"grad_norm": 0.5487135118890082,
"learning_rate": 0.0,
"loss": 0.3367,
"step": 1326
},
{
"epoch": 1.9988694177501414,
"step": 1326,
"total_flos": 5.576345153511096e+17,
"train_loss": 0.3973805017061363,
"train_runtime": 5664.6789,
"train_samples_per_second": 29.975,
"train_steps_per_second": 0.234
}
],
"logging_steps": 1,
"max_steps": 1326,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.576345153511096e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}