{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3879728419010669,
"eval_steps": 500,
"global_step": 525,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.9372913499539586,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.6191,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 1.8879048490473127,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.6982,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 1.857307355920545,
"learning_rate": 7.317073170731707e-06,
"loss": 1.6724,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 1.808250401091683,
"learning_rate": 9.756097560975611e-06,
"loss": 1.647,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 2.5133500505596453,
"learning_rate": 1.2195121951219513e-05,
"loss": 1.6079,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 1.2734146289947597,
"learning_rate": 1.4634146341463415e-05,
"loss": 1.5908,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 1.1812917040861377,
"learning_rate": 1.707317073170732e-05,
"loss": 1.5518,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 1.293637431287248,
"learning_rate": 1.9512195121951222e-05,
"loss": 1.5952,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 1.1620676440097686,
"learning_rate": 2.1951219512195124e-05,
"loss": 1.5493,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 1.3191260666446372,
"learning_rate": 2.4390243902439026e-05,
"loss": 1.5625,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.182981202097366,
"learning_rate": 2.682926829268293e-05,
"loss": 1.5498,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 1.0724491677903074,
"learning_rate": 2.926829268292683e-05,
"loss": 1.5547,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.9434780094091623,
"learning_rate": 3.170731707317073e-05,
"loss": 1.5327,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 1.0202543546064133,
"learning_rate": 3.414634146341464e-05,
"loss": 1.5933,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 0.996865818341891,
"learning_rate": 3.6585365853658535e-05,
"loss": 1.5796,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.9288105887086908,
"learning_rate": 3.9024390243902444e-05,
"loss": 1.4609,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 0.9726608694986103,
"learning_rate": 4.146341463414634e-05,
"loss": 1.5161,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 0.8619245275928736,
"learning_rate": 4.390243902439025e-05,
"loss": 1.5122,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 0.9215398746800475,
"learning_rate": 4.634146341463415e-05,
"loss": 1.5078,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 0.903097203515963,
"learning_rate": 4.878048780487805e-05,
"loss": 1.4502,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.8761498232482394,
"learning_rate": 5.121951219512195e-05,
"loss": 1.4893,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 0.8353385747464918,
"learning_rate": 5.365853658536586e-05,
"loss": 1.4717,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 0.8000291372477917,
"learning_rate": 5.6097560975609764e-05,
"loss": 1.481,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 0.8452088500727898,
"learning_rate": 5.853658536585366e-05,
"loss": 1.4644,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 0.8829309199222577,
"learning_rate": 6.097560975609756e-05,
"loss": 1.4868,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.8162327363449975,
"learning_rate": 6.341463414634146e-05,
"loss": 1.4883,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 0.7987925882960866,
"learning_rate": 6.585365853658538e-05,
"loss": 1.4268,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 0.7909140922467949,
"learning_rate": 6.829268292682928e-05,
"loss": 1.4873,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 0.7560592825415925,
"learning_rate": 7.073170731707317e-05,
"loss": 1.4116,
"step": 29
},
{
"epoch": 0.02,
"grad_norm": 0.7058796878894483,
"learning_rate": 7.317073170731707e-05,
"loss": 1.4023,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.7614550996113684,
"learning_rate": 7.560975609756099e-05,
"loss": 1.4312,
"step": 31
},
{
"epoch": 0.02,
"grad_norm": 0.7531993296256376,
"learning_rate": 7.804878048780489e-05,
"loss": 1.5024,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 0.7475795582718757,
"learning_rate": 8.048780487804879e-05,
"loss": 1.4363,
"step": 33
},
{
"epoch": 0.03,
"grad_norm": 0.7561530704205457,
"learning_rate": 8.292682926829268e-05,
"loss": 1.4873,
"step": 34
},
{
"epoch": 0.03,
"grad_norm": 0.7606234092420118,
"learning_rate": 8.53658536585366e-05,
"loss": 1.4204,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 0.7078849092381325,
"learning_rate": 8.78048780487805e-05,
"loss": 1.418,
"step": 36
},
{
"epoch": 0.03,
"grad_norm": 0.7583459620401868,
"learning_rate": 9.02439024390244e-05,
"loss": 1.4365,
"step": 37
},
{
"epoch": 0.03,
"grad_norm": 0.6479336734201823,
"learning_rate": 9.26829268292683e-05,
"loss": 1.3911,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 0.7138445522030739,
"learning_rate": 9.51219512195122e-05,
"loss": 1.4287,
"step": 39
},
{
"epoch": 0.03,
"grad_norm": 0.6772243082870256,
"learning_rate": 9.75609756097561e-05,
"loss": 1.3779,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.7001769060106223,
"learning_rate": 0.0001,
"loss": 1.3623,
"step": 41
},
{
"epoch": 0.03,
"grad_norm": 0.6593306891481673,
"learning_rate": 9.999985665852258e-05,
"loss": 1.3745,
"step": 42
},
{
"epoch": 0.03,
"grad_norm": 0.7111159325021309,
"learning_rate": 9.999942663491213e-05,
"loss": 1.3799,
"step": 43
},
{
"epoch": 0.03,
"grad_norm": 0.7023696510759943,
"learning_rate": 9.999870993163431e-05,
"loss": 1.4399,
"step": 44
},
{
"epoch": 0.03,
"grad_norm": 0.6736689337950041,
"learning_rate": 9.999770655279843e-05,
"loss": 1.4106,
"step": 45
},
{
"epoch": 0.03,
"grad_norm": 0.6746379997849087,
"learning_rate": 9.999641650415752e-05,
"loss": 1.4409,
"step": 46
},
{
"epoch": 0.03,
"grad_norm": 0.6615592598917496,
"learning_rate": 9.99948397931083e-05,
"loss": 1.3984,
"step": 47
},
{
"epoch": 0.04,
"grad_norm": 0.6538222984665192,
"learning_rate": 9.999297642869105e-05,
"loss": 1.4031,
"step": 48
},
{
"epoch": 0.04,
"grad_norm": 0.6129031974400467,
"learning_rate": 9.999082642158973e-05,
"loss": 1.396,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 0.6148818612628825,
"learning_rate": 9.998838978413168e-05,
"loss": 1.3574,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 0.6869612852614861,
"learning_rate": 9.99856665302878e-05,
"loss": 1.3762,
"step": 51
},
{
"epoch": 0.04,
"grad_norm": 0.7410178778694718,
"learning_rate": 9.998265667567226e-05,
"loss": 1.3481,
"step": 52
},
{
"epoch": 0.04,
"grad_norm": 0.6380516168920353,
"learning_rate": 9.997936023754257e-05,
"loss": 1.3513,
"step": 53
},
{
"epoch": 0.04,
"grad_norm": 0.6192351492724488,
"learning_rate": 9.997577723479938e-05,
"loss": 1.3662,
"step": 54
},
{
"epoch": 0.04,
"grad_norm": 0.633774941417789,
"learning_rate": 9.997190768798639e-05,
"loss": 1.3457,
"step": 55
},
{
"epoch": 0.04,
"grad_norm": 0.6016840416873676,
"learning_rate": 9.996775161929027e-05,
"loss": 1.3877,
"step": 56
},
{
"epoch": 0.04,
"grad_norm": 0.638026596140304,
"learning_rate": 9.99633090525405e-05,
"loss": 1.3892,
"step": 57
},
{
"epoch": 0.04,
"grad_norm": 0.5934027179170136,
"learning_rate": 9.995858001320926e-05,
"loss": 1.3223,
"step": 58
},
{
"epoch": 0.04,
"grad_norm": 0.6143195436309025,
"learning_rate": 9.995356452841122e-05,
"loss": 1.3862,
"step": 59
},
{
"epoch": 0.04,
"grad_norm": 0.6076935190423259,
"learning_rate": 9.994826262690347e-05,
"loss": 1.3584,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 0.6239965555110781,
"learning_rate": 9.994267433908533e-05,
"loss": 1.2771,
"step": 61
},
{
"epoch": 0.05,
"grad_norm": 0.5469871219286494,
"learning_rate": 9.99367996969981e-05,
"loss": 1.3579,
"step": 62
},
{
"epoch": 0.05,
"grad_norm": 0.5975500231663011,
"learning_rate": 9.9930638734325e-05,
"loss": 1.3872,
"step": 63
},
{
"epoch": 0.05,
"grad_norm": 0.6160102854784424,
"learning_rate": 9.992419148639087e-05,
"loss": 1.3831,
"step": 64
},
{
"epoch": 0.05,
"grad_norm": 0.5815474376554662,
"learning_rate": 9.991745799016206e-05,
"loss": 1.3745,
"step": 65
},
{
"epoch": 0.05,
"grad_norm": 0.5994591436721235,
"learning_rate": 9.991043828424612e-05,
"loss": 1.396,
"step": 66
},
{
"epoch": 0.05,
"grad_norm": 0.5896523240727669,
"learning_rate": 9.990313240889167e-05,
"loss": 1.3608,
"step": 67
},
{
"epoch": 0.05,
"grad_norm": 0.6062100949214702,
"learning_rate": 9.989554040598807e-05,
"loss": 1.2996,
"step": 68
},
{
"epoch": 0.05,
"grad_norm": 0.5941049216825265,
"learning_rate": 9.988766231906533e-05,
"loss": 1.4106,
"step": 69
},
{
"epoch": 0.05,
"grad_norm": 0.5604128113953568,
"learning_rate": 9.987949819329365e-05,
"loss": 1.3931,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.5519277490096212,
"learning_rate": 9.98710480754834e-05,
"loss": 1.3691,
"step": 71
},
{
"epoch": 0.05,
"grad_norm": 0.5900021330626725,
"learning_rate": 9.986231201408467e-05,
"loss": 1.4058,
"step": 72
},
{
"epoch": 0.05,
"grad_norm": 0.5699754681306506,
"learning_rate": 9.985329005918702e-05,
"loss": 1.355,
"step": 73
},
{
"epoch": 0.05,
"grad_norm": 0.593149750992695,
"learning_rate": 9.98439822625193e-05,
"loss": 1.3545,
"step": 74
},
{
"epoch": 0.06,
"grad_norm": 0.5824626045065218,
"learning_rate": 9.983438867744923e-05,
"loss": 1.3896,
"step": 75
},
{
"epoch": 0.06,
"grad_norm": 0.5900786393120402,
"learning_rate": 9.982450935898316e-05,
"loss": 1.3716,
"step": 76
},
{
"epoch": 0.06,
"grad_norm": 0.5688141367114475,
"learning_rate": 9.981434436376572e-05,
"loss": 1.3921,
"step": 77
},
{
"epoch": 0.06,
"grad_norm": 0.557565379686218,
"learning_rate": 9.980389375007955e-05,
"loss": 1.3506,
"step": 78
},
{
"epoch": 0.06,
"grad_norm": 0.5740715320740841,
"learning_rate": 9.979315757784488e-05,
"loss": 1.2917,
"step": 79
},
{
"epoch": 0.06,
"grad_norm": 0.5717745274109229,
"learning_rate": 9.97821359086193e-05,
"loss": 1.3154,
"step": 80
},
{
"epoch": 0.06,
"grad_norm": 0.609615875256831,
"learning_rate": 9.977082880559725e-05,
"loss": 1.3328,
"step": 81
},
{
"epoch": 0.06,
"grad_norm": 0.5777864702702744,
"learning_rate": 9.975923633360985e-05,
"loss": 1.3599,
"step": 82
},
{
"epoch": 0.06,
"grad_norm": 0.575948499045498,
"learning_rate": 9.974735855912436e-05,
"loss": 1.4038,
"step": 83
},
{
"epoch": 0.06,
"grad_norm": 0.550693122074238,
"learning_rate": 9.97351955502439e-05,
"loss": 1.3203,
"step": 84
},
{
"epoch": 0.06,
"grad_norm": 0.5561601283605949,
"learning_rate": 9.972274737670701e-05,
"loss": 1.3477,
"step": 85
},
{
"epoch": 0.06,
"grad_norm": 0.5601251180421914,
"learning_rate": 9.971001410988728e-05,
"loss": 1.333,
"step": 86
},
{
"epoch": 0.06,
"grad_norm": 0.6207004745075507,
"learning_rate": 9.969699582279292e-05,
"loss": 1.4048,
"step": 87
},
{
"epoch": 0.07,
"grad_norm": 0.5475040554880181,
"learning_rate": 9.968369259006634e-05,
"loss": 1.3208,
"step": 88
},
{
"epoch": 0.07,
"grad_norm": 0.6054670378552847,
"learning_rate": 9.967010448798375e-05,
"loss": 1.4131,
"step": 89
},
{
"epoch": 0.07,
"grad_norm": 0.5486336748948858,
"learning_rate": 9.965623159445471e-05,
"loss": 1.3843,
"step": 90
},
{
"epoch": 0.07,
"grad_norm": 0.585603864758025,
"learning_rate": 9.964207398902163e-05,
"loss": 1.3186,
"step": 91
},
{
"epoch": 0.07,
"grad_norm": 0.5412960874208915,
"learning_rate": 9.96276317528594e-05,
"loss": 1.2861,
"step": 92
},
{
"epoch": 0.07,
"grad_norm": 0.5442105369162202,
"learning_rate": 9.96129049687749e-05,
"loss": 1.3262,
"step": 93
},
{
"epoch": 0.07,
"grad_norm": 0.5816978676309428,
"learning_rate": 9.959789372120649e-05,
"loss": 1.3279,
"step": 94
},
{
"epoch": 0.07,
"grad_norm": 0.5557519862862452,
"learning_rate": 9.958259809622352e-05,
"loss": 1.3672,
"step": 95
},
{
"epoch": 0.07,
"grad_norm": 0.5666965195077155,
"learning_rate": 9.956701818152591e-05,
"loss": 1.3203,
"step": 96
},
{
"epoch": 0.07,
"grad_norm": 0.5354511291609182,
"learning_rate": 9.955115406644356e-05,
"loss": 1.3081,
"step": 97
},
{
"epoch": 0.07,
"grad_norm": 0.5685729288533676,
"learning_rate": 9.953500584193592e-05,
"loss": 1.3452,
"step": 98
},
{
"epoch": 0.07,
"grad_norm": 0.5922446508548838,
"learning_rate": 9.95185736005914e-05,
"loss": 1.3682,
"step": 99
},
{
"epoch": 0.07,
"grad_norm": 0.5837642463681222,
"learning_rate": 9.950185743662685e-05,
"loss": 1.3691,
"step": 100
},
{
"epoch": 0.07,
"grad_norm": 0.5761448966076219,
"learning_rate": 9.948485744588709e-05,
"loss": 1.3281,
"step": 101
},
{
"epoch": 0.08,
"grad_norm": 0.553490008569796,
"learning_rate": 9.946757372584423e-05,
"loss": 1.292,
"step": 102
},
{
"epoch": 0.08,
"grad_norm": 0.5686477341821499,
"learning_rate": 9.945000637559727e-05,
"loss": 1.3486,
"step": 103
},
{
"epoch": 0.08,
"grad_norm": 0.5772487636958804,
"learning_rate": 9.943215549587138e-05,
"loss": 1.3425,
"step": 104
},
{
"epoch": 0.08,
"grad_norm": 0.5758207849461601,
"learning_rate": 9.941402118901744e-05,
"loss": 1.3701,
"step": 105
},
{
"epoch": 0.08,
"grad_norm": 0.5582099537521159,
"learning_rate": 9.939560355901136e-05,
"loss": 1.3794,
"step": 106
},
{
"epoch": 0.08,
"grad_norm": 0.5336197399728324,
"learning_rate": 9.937690271145354e-05,
"loss": 1.3179,
"step": 107
},
{
"epoch": 0.08,
"grad_norm": 0.539037656457371,
"learning_rate": 9.935791875356832e-05,
"loss": 1.3071,
"step": 108
},
{
"epoch": 0.08,
"grad_norm": 0.5584770963502244,
"learning_rate": 9.933865179420321e-05,
"loss": 1.3945,
"step": 109
},
{
"epoch": 0.08,
"grad_norm": 0.5364047388288558,
"learning_rate": 9.931910194382837e-05,
"loss": 1.3462,
"step": 110
},
{
"epoch": 0.08,
"grad_norm": 0.5956933567804931,
"learning_rate": 9.929926931453599e-05,
"loss": 1.2585,
"step": 111
},
{
"epoch": 0.08,
"grad_norm": 0.5548298244830802,
"learning_rate": 9.927915402003964e-05,
"loss": 1.3765,
"step": 112
},
{
"epoch": 0.08,
"grad_norm": 0.5528131728204222,
"learning_rate": 9.92587561756735e-05,
"loss": 1.3452,
"step": 113
},
{
"epoch": 0.08,
"grad_norm": 0.5181397205586854,
"learning_rate": 9.92380758983919e-05,
"loss": 1.2671,
"step": 114
},
{
"epoch": 0.08,
"grad_norm": 0.5429954425262675,
"learning_rate": 9.921711330676848e-05,
"loss": 1.3574,
"step": 115
},
{
"epoch": 0.09,
"grad_norm": 0.5523231773869766,
"learning_rate": 9.919586852099562e-05,
"loss": 1.3184,
"step": 116
},
{
"epoch": 0.09,
"grad_norm": 0.5583959107787768,
"learning_rate": 9.917434166288364e-05,
"loss": 1.3442,
"step": 117
},
{
"epoch": 0.09,
"grad_norm": 0.5850081526075311,
"learning_rate": 9.915253285586024e-05,
"loss": 1.3477,
"step": 118
},
{
"epoch": 0.09,
"grad_norm": 0.5498743192645993,
"learning_rate": 9.913044222496966e-05,
"loss": 1.3398,
"step": 119
},
{
"epoch": 0.09,
"grad_norm": 0.5853233345937257,
"learning_rate": 9.910806989687206e-05,
"loss": 1.3276,
"step": 120
},
{
"epoch": 0.09,
"grad_norm": 0.559389561256856,
"learning_rate": 9.908541599984276e-05,
"loss": 1.3462,
"step": 121
},
{
"epoch": 0.09,
"grad_norm": 0.5298088621667728,
"learning_rate": 9.906248066377143e-05,
"loss": 1.2568,
"step": 122
},
{
"epoch": 0.09,
"grad_norm": 0.5731884986496186,
"learning_rate": 9.903926402016153e-05,
"loss": 1.3394,
"step": 123
},
{
"epoch": 0.09,
"grad_norm": 0.5549155957971303,
"learning_rate": 9.901576620212933e-05,
"loss": 1.311,
"step": 124
},
{
"epoch": 0.09,
"grad_norm": 0.5620092141236146,
"learning_rate": 9.899198734440335e-05,
"loss": 1.291,
"step": 125
},
{
"epoch": 0.09,
"grad_norm": 0.5405164924320079,
"learning_rate": 9.896792758332341e-05,
"loss": 1.248,
"step": 126
},
{
"epoch": 0.09,
"grad_norm": 0.5602202105737174,
"learning_rate": 9.894358705684002e-05,
"loss": 1.3115,
"step": 127
},
{
"epoch": 0.09,
"grad_norm": 0.5580296998093701,
"learning_rate": 9.891896590451344e-05,
"loss": 1.2947,
"step": 128
},
{
"epoch": 0.1,
"grad_norm": 0.5755635897570144,
"learning_rate": 9.889406426751296e-05,
"loss": 1.3086,
"step": 129
},
{
"epoch": 0.1,
"grad_norm": 0.6025851962917577,
"learning_rate": 9.886888228861608e-05,
"loss": 1.3447,
"step": 130
},
{
"epoch": 0.1,
"grad_norm": 0.5660419268974345,
"learning_rate": 9.88434201122077e-05,
"loss": 1.3232,
"step": 131
},
{
"epoch": 0.1,
"grad_norm": 0.5495648120402916,
"learning_rate": 9.881767788427925e-05,
"loss": 1.3096,
"step": 132
},
{
"epoch": 0.1,
"grad_norm": 0.5577872798163368,
"learning_rate": 9.879165575242787e-05,
"loss": 1.291,
"step": 133
},
{
"epoch": 0.1,
"grad_norm": 0.5540620803629338,
"learning_rate": 9.876535386585561e-05,
"loss": 1.335,
"step": 134
},
{
"epoch": 0.1,
"grad_norm": 0.5573425731012122,
"learning_rate": 9.873877237536853e-05,
"loss": 1.2327,
"step": 135
},
{
"epoch": 0.1,
"grad_norm": 0.5827857038389533,
"learning_rate": 9.871191143337582e-05,
"loss": 1.3333,
"step": 136
},
{
"epoch": 0.1,
"grad_norm": 0.5897883061496167,
"learning_rate": 9.868477119388896e-05,
"loss": 1.3076,
"step": 137
},
{
"epoch": 0.1,
"grad_norm": 0.5800275384221499,
"learning_rate": 9.865735181252085e-05,
"loss": 1.3188,
"step": 138
},
{
"epoch": 0.1,
"grad_norm": 0.5605765677262206,
"learning_rate": 9.862965344648485e-05,
"loss": 1.3086,
"step": 139
},
{
"epoch": 0.1,
"grad_norm": 0.5432447170586258,
"learning_rate": 9.860167625459398e-05,
"loss": 1.2861,
"step": 140
},
{
"epoch": 0.1,
"grad_norm": 0.5687257803544524,
"learning_rate": 9.85734203972599e-05,
"loss": 1.2839,
"step": 141
},
{
"epoch": 0.1,
"grad_norm": 0.5475328993701518,
"learning_rate": 9.854488603649206e-05,
"loss": 1.3169,
"step": 142
},
{
"epoch": 0.11,
"grad_norm": 0.5408143803639806,
"learning_rate": 9.851607333589677e-05,
"loss": 1.3374,
"step": 143
},
{
"epoch": 0.11,
"grad_norm": 0.5350053494827027,
"learning_rate": 9.848698246067623e-05,
"loss": 1.2888,
"step": 144
},
{
"epoch": 0.11,
"grad_norm": 0.5642075781884446,
"learning_rate": 9.84576135776276e-05,
"loss": 1.3105,
"step": 145
},
{
"epoch": 0.11,
"grad_norm": 0.5725161088840623,
"learning_rate": 9.842796685514203e-05,
"loss": 1.3516,
"step": 146
},
{
"epoch": 0.11,
"grad_norm": 0.5837888943455876,
"learning_rate": 9.839804246320375e-05,
"loss": 1.2871,
"step": 147
},
{
"epoch": 0.11,
"grad_norm": 0.5833329842842448,
"learning_rate": 9.836784057338899e-05,
"loss": 1.3232,
"step": 148
},
{
"epoch": 0.11,
"grad_norm": 0.5244172538585695,
"learning_rate": 9.833736135886512e-05,
"loss": 1.2568,
"step": 149
},
{
"epoch": 0.11,
"grad_norm": 0.5163576076330887,
"learning_rate": 9.830660499438955e-05,
"loss": 1.2759,
"step": 150
},
{
"epoch": 0.11,
"grad_norm": 0.5617840717093857,
"learning_rate": 9.827557165630879e-05,
"loss": 1.2524,
"step": 151
},
{
"epoch": 0.11,
"grad_norm": 0.547220410155329,
"learning_rate": 9.824426152255741e-05,
"loss": 1.312,
"step": 152
},
{
"epoch": 0.11,
"grad_norm": 0.5715922980351898,
"learning_rate": 9.821267477265705e-05,
"loss": 1.335,
"step": 153
},
{
"epoch": 0.11,
"grad_norm": 0.5626236612178414,
"learning_rate": 9.818081158771538e-05,
"loss": 1.3633,
"step": 154
},
{
"epoch": 0.11,
"grad_norm": 0.556817713740677,
"learning_rate": 9.814867215042502e-05,
"loss": 1.3345,
"step": 155
},
{
"epoch": 0.12,
"grad_norm": 0.5658424328358594,
"learning_rate": 9.811625664506259e-05,
"loss": 1.3325,
"step": 156
},
{
"epoch": 0.12,
"grad_norm": 0.5518987143292007,
"learning_rate": 9.808356525748748e-05,
"loss": 1.3179,
"step": 157
},
{
"epoch": 0.12,
"grad_norm": 0.5509045139485853,
"learning_rate": 9.805059817514101e-05,
"loss": 1.3276,
"step": 158
},
{
"epoch": 0.12,
"grad_norm": 0.5612999607711056,
"learning_rate": 9.801735558704517e-05,
"loss": 1.2192,
"step": 159
},
{
"epoch": 0.12,
"grad_norm": 0.530326353544212,
"learning_rate": 9.798383768380164e-05,
"loss": 1.2988,
"step": 160
},
{
"epoch": 0.12,
"grad_norm": 0.5524425336112486,
"learning_rate": 9.795004465759065e-05,
"loss": 1.2622,
"step": 161
},
{
"epoch": 0.12,
"grad_norm": 0.5121240819278214,
"learning_rate": 9.791597670216989e-05,
"loss": 1.2603,
"step": 162
},
{
"epoch": 0.12,
"grad_norm": 0.5262701595678754,
"learning_rate": 9.78816340128734e-05,
"loss": 1.22,
"step": 163
},
{
"epoch": 0.12,
"grad_norm": 0.5866254674193113,
"learning_rate": 9.784701678661045e-05,
"loss": 1.311,
"step": 164
},
{
"epoch": 0.12,
"grad_norm": 0.567120419528464,
"learning_rate": 9.781212522186443e-05,
"loss": 1.3145,
"step": 165
},
{
"epoch": 0.12,
"grad_norm": 0.5704512174009239,
"learning_rate": 9.777695951869164e-05,
"loss": 1.2612,
"step": 166
},
{
"epoch": 0.12,
"grad_norm": 0.5359884622353506,
"learning_rate": 9.774151987872027e-05,
"loss": 1.2117,
"step": 167
},
{
"epoch": 0.12,
"grad_norm": 0.5772321074843504,
"learning_rate": 9.770580650514914e-05,
"loss": 1.3525,
"step": 168
},
{
"epoch": 0.12,
"grad_norm": 0.5316876920831217,
"learning_rate": 9.766981960274653e-05,
"loss": 1.3442,
"step": 169
},
{
"epoch": 0.13,
"grad_norm": 0.5622203218145027,
"learning_rate": 9.763355937784909e-05,
"loss": 1.2964,
"step": 170
},
{
"epoch": 0.13,
"grad_norm": 0.5614932814360857,
"learning_rate": 9.759702603836059e-05,
"loss": 1.3389,
"step": 171
},
{
"epoch": 0.13,
"grad_norm": 0.568962837143467,
"learning_rate": 9.756021979375071e-05,
"loss": 1.3174,
"step": 172
},
{
"epoch": 0.13,
"grad_norm": 0.5382419139994956,
"learning_rate": 9.752314085505395e-05,
"loss": 1.3125,
"step": 173
},
{
"epoch": 0.13,
"grad_norm": 0.5677837729549118,
"learning_rate": 9.748578943486828e-05,
"loss": 1.2871,
"step": 174
},
{
"epoch": 0.13,
"grad_norm": 0.5602612877442024,
"learning_rate": 9.744816574735405e-05,
"loss": 1.3438,
"step": 175
},
{
"epoch": 0.13,
"grad_norm": 0.5735194400650546,
"learning_rate": 9.74102700082326e-05,
"loss": 1.3208,
"step": 176
},
{
"epoch": 0.13,
"grad_norm": 0.5670876099448275,
"learning_rate": 9.737210243478521e-05,
"loss": 1.2969,
"step": 177
},
{
"epoch": 0.13,
"grad_norm": 0.5450536272385241,
"learning_rate": 9.733366324585175e-05,
"loss": 1.2673,
"step": 178
},
{
"epoch": 0.13,
"grad_norm": 0.5340701964695135,
"learning_rate": 9.72949526618294e-05,
"loss": 1.3403,
"step": 179
},
{
"epoch": 0.13,
"grad_norm": 0.5422933717116616,
"learning_rate": 9.725597090467144e-05,
"loss": 1.2539,
"step": 180
},
{
"epoch": 0.13,
"grad_norm": 0.5680150103490264,
"learning_rate": 9.721671819788602e-05,
"loss": 1.3149,
"step": 181
},
{
"epoch": 0.13,
"grad_norm": 0.560101859043945,
"learning_rate": 9.717719476653475e-05,
"loss": 1.321,
"step": 182
},
{
"epoch": 0.14,
"grad_norm": 0.5267278121510764,
"learning_rate": 9.71374008372315e-05,
"loss": 1.2227,
"step": 183
},
{
"epoch": 0.14,
"grad_norm": 0.5687530339596342,
"learning_rate": 9.709733663814113e-05,
"loss": 1.3159,
"step": 184
},
{
"epoch": 0.14,
"grad_norm": 0.5321503974993333,
"learning_rate": 9.705700239897809e-05,
"loss": 1.3188,
"step": 185
},
{
"epoch": 0.14,
"grad_norm": 0.5593956329311583,
"learning_rate": 9.701639835100513e-05,
"loss": 1.249,
"step": 186
},
{
"epoch": 0.14,
"grad_norm": 0.5591047172889141,
"learning_rate": 9.697552472703205e-05,
"loss": 1.2756,
"step": 187
},
{
"epoch": 0.14,
"grad_norm": 0.5543029039316694,
"learning_rate": 9.693438176141425e-05,
"loss": 1.2915,
"step": 188
},
{
"epoch": 0.14,
"grad_norm": 0.5494961227055172,
"learning_rate": 9.68929696900515e-05,
"loss": 1.313,
"step": 189
},
{
"epoch": 0.14,
"grad_norm": 0.5541252042617403,
"learning_rate": 9.685128875038647e-05,
"loss": 1.2754,
"step": 190
},
{
"epoch": 0.14,
"grad_norm": 0.5163534781462605,
"learning_rate": 9.680933918140348e-05,
"loss": 1.2681,
"step": 191
},
{
"epoch": 0.14,
"grad_norm": 0.537157272716453,
"learning_rate": 9.676712122362706e-05,
"loss": 1.2551,
"step": 192
},
{
"epoch": 0.14,
"grad_norm": 0.5397175193183968,
"learning_rate": 9.672463511912055e-05,
"loss": 1.2822,
"step": 193
},
{
"epoch": 0.14,
"grad_norm": 0.5488691397441863,
"learning_rate": 9.668188111148484e-05,
"loss": 1.283,
"step": 194
},
{
"epoch": 0.14,
"grad_norm": 0.5905761212464122,
"learning_rate": 9.66388594458568e-05,
"loss": 1.2896,
"step": 195
},
{
"epoch": 0.14,
"grad_norm": 0.580369444338734,
"learning_rate": 9.659557036890801e-05,
"loss": 1.3416,
"step": 196
},
{
"epoch": 0.15,
"grad_norm": 0.5262728809847318,
"learning_rate": 9.655201412884327e-05,
"loss": 1.2554,
"step": 197
},
{
"epoch": 0.15,
"grad_norm": 0.5375550652008795,
"learning_rate": 9.650819097539922e-05,
"loss": 1.2612,
"step": 198
},
{
"epoch": 0.15,
"grad_norm": 0.5208197207069616,
"learning_rate": 9.646410115984289e-05,
"loss": 1.2358,
"step": 199
},
{
"epoch": 0.15,
"grad_norm": 0.5409371788748774,
"learning_rate": 9.641974493497024e-05,
"loss": 1.3262,
"step": 200
},
{
"epoch": 0.15,
"grad_norm": 0.5389211233425135,
"learning_rate": 9.637512255510475e-05,
"loss": 1.2729,
"step": 201
},
{
"epoch": 0.15,
"grad_norm": 0.5501782779153785,
"learning_rate": 9.633023427609591e-05,
"loss": 1.2322,
"step": 202
},
{
"epoch": 0.15,
"grad_norm": 0.5678681105856288,
"learning_rate": 9.628508035531785e-05,
"loss": 1.3721,
"step": 203
},
{
"epoch": 0.15,
"grad_norm": 0.5559621306210715,
"learning_rate": 9.623966105166772e-05,
"loss": 1.3267,
"step": 204
},
{
"epoch": 0.15,
"grad_norm": 0.5417687907113425,
"learning_rate": 9.619397662556435e-05,
"loss": 1.2666,
"step": 205
},
{
"epoch": 0.15,
"grad_norm": 0.5546614199696198,
"learning_rate": 9.614802733894665e-05,
"loss": 1.3389,
"step": 206
},
{
"epoch": 0.15,
"grad_norm": 0.5594799442475286,
"learning_rate": 9.610181345527217e-05,
"loss": 1.2671,
"step": 207
},
{
"epoch": 0.15,
"grad_norm": 0.5852167375394156,
"learning_rate": 9.605533523951558e-05,
"loss": 1.3335,
"step": 208
},
{
"epoch": 0.15,
"grad_norm": 0.5465110917787175,
"learning_rate": 9.600859295816708e-05,
"loss": 1.3096,
"step": 209
},
{
"epoch": 0.16,
"grad_norm": 0.5704616015169348,
"learning_rate": 9.596158687923104e-05,
"loss": 1.3022,
"step": 210
},
{
"epoch": 0.16,
"grad_norm": 0.5617616139462727,
"learning_rate": 9.591431727222424e-05,
"loss": 1.3159,
"step": 211
},
{
"epoch": 0.16,
"grad_norm": 0.5465602681324426,
"learning_rate": 9.586678440817453e-05,
"loss": 1.2708,
"step": 212
},
{
"epoch": 0.16,
"grad_norm": 0.5864421378413351,
"learning_rate": 9.581898855961912e-05,
"loss": 1.2607,
"step": 213
},
{
"epoch": 0.16,
"grad_norm": 0.556548001041405,
"learning_rate": 9.577093000060312e-05,
"loss": 1.3081,
"step": 214
},
{
"epoch": 0.16,
"grad_norm": 0.5642842704902283,
"learning_rate": 9.572260900667794e-05,
"loss": 1.2759,
"step": 215
},
{
"epoch": 0.16,
"grad_norm": 0.5486665255067006,
"learning_rate": 9.567402585489963e-05,
"loss": 1.2104,
"step": 216
},
{
"epoch": 0.16,
"grad_norm": 0.5361207508020517,
"learning_rate": 9.56251808238275e-05,
"loss": 1.2451,
"step": 217
},
{
"epoch": 0.16,
"grad_norm": 0.5149380805556683,
"learning_rate": 9.557607419352226e-05,
"loss": 1.2778,
"step": 218
},
{
"epoch": 0.16,
"grad_norm": 0.5469266902951428,
"learning_rate": 9.552670624554461e-05,
"loss": 1.2617,
"step": 219
},
{
"epoch": 0.16,
"grad_norm": 0.5430295319416,
"learning_rate": 9.54770772629535e-05,
"loss": 1.2915,
"step": 220
},
{
"epoch": 0.16,
"grad_norm": 0.5744217791056692,
"learning_rate": 9.542718753030463e-05,
"loss": 1.3281,
"step": 221
},
{
"epoch": 0.16,
"grad_norm": 0.5587545969611539,
"learning_rate": 9.537703733364871e-05,
"loss": 1.2837,
"step": 222
},
{
"epoch": 0.16,
"grad_norm": 0.5288053303373643,
"learning_rate": 9.532662696052985e-05,
"loss": 1.2949,
"step": 223
},
{
"epoch": 0.17,
"grad_norm": 0.5791175310063906,
"learning_rate": 9.527595669998399e-05,
"loss": 1.2917,
"step": 224
},
{
"epoch": 0.17,
"grad_norm": 0.5250029719207272,
"learning_rate": 9.522502684253709e-05,
"loss": 1.2375,
"step": 225
},
{
"epoch": 0.17,
"grad_norm": 0.5177601049436101,
"learning_rate": 9.517383768020361e-05,
"loss": 1.2695,
"step": 226
},
{
"epoch": 0.17,
"grad_norm": 0.5554993860583297,
"learning_rate": 9.512238950648474e-05,
"loss": 1.2917,
"step": 227
},
{
"epoch": 0.17,
"grad_norm": 0.5738329488665082,
"learning_rate": 9.507068261636679e-05,
"loss": 1.2944,
"step": 228
},
{
"epoch": 0.17,
"grad_norm": 0.5562896023700302,
"learning_rate": 9.501871730631942e-05,
"loss": 1.3296,
"step": 229
},
{
"epoch": 0.17,
"grad_norm": 0.5416347008024398,
"learning_rate": 9.496649387429404e-05,
"loss": 1.2437,
"step": 230
},
{
"epoch": 0.17,
"grad_norm": 0.5699356753997783,
"learning_rate": 9.491401261972195e-05,
"loss": 1.2705,
"step": 231
},
{
"epoch": 0.17,
"grad_norm": 0.5481624625613764,
"learning_rate": 9.486127384351282e-05,
"loss": 1.3779,
"step": 232
},
{
"epoch": 0.17,
"grad_norm": 0.5688206917165098,
"learning_rate": 9.480827784805278e-05,
"loss": 1.2754,
"step": 233
},
{
"epoch": 0.17,
"grad_norm": 0.5490377714658476,
"learning_rate": 9.475502493720283e-05,
"loss": 1.3125,
"step": 234
},
{
"epoch": 0.17,
"grad_norm": 0.5355672804730123,
"learning_rate": 9.470151541629699e-05,
"loss": 1.2627,
"step": 235
},
{
"epoch": 0.17,
"grad_norm": 0.5905840590902287,
"learning_rate": 9.464774959214063e-05,
"loss": 1.3027,
"step": 236
},
{
"epoch": 0.18,
"grad_norm": 0.56064622426517,
"learning_rate": 9.459372777300864e-05,
"loss": 1.2065,
"step": 237
},
{
"epoch": 0.18,
"grad_norm": 0.5568610691565873,
"learning_rate": 9.45394502686437e-05,
"loss": 1.3223,
"step": 238
},
{
"epoch": 0.18,
"grad_norm": 0.5300725401389981,
"learning_rate": 9.448491739025454e-05,
"loss": 1.2805,
"step": 239
},
{
"epoch": 0.18,
"grad_norm": 0.5519662242216672,
"learning_rate": 9.44301294505141e-05,
"loss": 1.2371,
"step": 240
},
{
"epoch": 0.18,
"grad_norm": 0.5402101018249572,
"learning_rate": 9.437508676355773e-05,
"loss": 1.2749,
"step": 241
},
{
"epoch": 0.18,
"grad_norm": 0.5389383005608104,
"learning_rate": 9.431978964498143e-05,
"loss": 1.2876,
"step": 242
},
{
"epoch": 0.18,
"grad_norm": 0.5310718244911751,
"learning_rate": 9.426423841184005e-05,
"loss": 1.3057,
"step": 243
},
{
"epoch": 0.18,
"grad_norm": 0.5454082533825911,
"learning_rate": 9.420843338264542e-05,
"loss": 1.2578,
"step": 244
},
{
"epoch": 0.18,
"grad_norm": 0.565349361879851,
"learning_rate": 9.415237487736452e-05,
"loss": 1.3306,
"step": 245
},
{
"epoch": 0.18,
"grad_norm": 0.5224746893789486,
"learning_rate": 9.409606321741775e-05,
"loss": 1.2598,
"step": 246
},
{
"epoch": 0.18,
"grad_norm": 0.5440997273729092,
"learning_rate": 9.403949872567695e-05,
"loss": 1.2749,
"step": 247
},
{
"epoch": 0.18,
"grad_norm": 0.5668696203741111,
"learning_rate": 9.398268172646365e-05,
"loss": 1.2739,
"step": 248
},
{
"epoch": 0.18,
"grad_norm": 0.538410569856225,
"learning_rate": 9.392561254554713e-05,
"loss": 1.2734,
"step": 249
},
{
"epoch": 0.18,
"grad_norm": 0.5458663263053075,
"learning_rate": 9.386829151014262e-05,
"loss": 1.3101,
"step": 250
},
{
"epoch": 0.19,
"grad_norm": 0.537905713825921,
"learning_rate": 9.381071894890941e-05,
"loss": 1.2666,
"step": 251
},
{
"epoch": 0.19,
"grad_norm": 0.5288916095430457,
"learning_rate": 9.375289519194894e-05,
"loss": 1.2666,
"step": 252
},
{
"epoch": 0.19,
"grad_norm": 0.5335913282729025,
"learning_rate": 9.369482057080292e-05,
"loss": 1.2886,
"step": 253
},
{
"epoch": 0.19,
"grad_norm": 0.5523824410197196,
"learning_rate": 9.363649541845142e-05,
"loss": 1.2571,
"step": 254
},
{
"epoch": 0.19,
"grad_norm": 0.5912264857528259,
"learning_rate": 9.357792006931098e-05,
"loss": 1.261,
"step": 255
},
{
"epoch": 0.19,
"grad_norm": 0.5594499774840426,
"learning_rate": 9.35190948592327e-05,
"loss": 1.3027,
"step": 256
},
{
"epoch": 0.19,
"grad_norm": 0.5379207919206825,
"learning_rate": 9.346002012550027e-05,
"loss": 1.2983,
"step": 257
},
{
"epoch": 0.19,
"grad_norm": 0.5455629199690059,
"learning_rate": 9.340069620682806e-05,
"loss": 1.2695,
"step": 258
},
{
"epoch": 0.19,
"grad_norm": 0.5471737544580354,
"learning_rate": 9.334112344335924e-05,
"loss": 1.3047,
"step": 259
},
{
"epoch": 0.19,
"grad_norm": 0.5397100655209365,
"learning_rate": 9.328130217666366e-05,
"loss": 1.2896,
"step": 260
},
{
"epoch": 0.19,
"grad_norm": 0.5636004509867364,
"learning_rate": 9.322123274973613e-05,
"loss": 1.3501,
"step": 261
},
{
"epoch": 0.19,
"grad_norm": 0.5605154015144495,
"learning_rate": 9.316091550699424e-05,
"loss": 1.2983,
"step": 262
},
{
"epoch": 0.19,
"grad_norm": 0.5461515781521593,
"learning_rate": 9.310035079427651e-05,
"loss": 1.269,
"step": 263
},
{
"epoch": 0.2,
"grad_norm": 0.5175024878789147,
"learning_rate": 9.303953895884033e-05,
"loss": 1.1653,
"step": 264
},
{
"epoch": 0.2,
"grad_norm": 0.5224669601631107,
"learning_rate": 9.297848034936006e-05,
"loss": 1.2554,
"step": 265
},
{
"epoch": 0.2,
"grad_norm": 0.5444106809363777,
"learning_rate": 9.291717531592494e-05,
"loss": 1.293,
"step": 266
},
{
"epoch": 0.2,
"grad_norm": 0.5287552712313793,
"learning_rate": 9.285562421003715e-05,
"loss": 1.2651,
"step": 267
},
{
"epoch": 0.2,
"grad_norm": 0.5381309609110954,
"learning_rate": 9.279382738460971e-05,
"loss": 1.2812,
"step": 268
},
{
"epoch": 0.2,
"grad_norm": 0.5528803396804242,
"learning_rate": 9.273178519396459e-05,
"loss": 1.3149,
"step": 269
},
{
"epoch": 0.2,
"grad_norm": 0.5270531797880375,
"learning_rate": 9.266949799383053e-05,
"loss": 1.2615,
"step": 270
},
{
"epoch": 0.2,
"grad_norm": 0.5488129774725259,
"learning_rate": 9.260696614134114e-05,
"loss": 1.2837,
"step": 271
},
{
"epoch": 0.2,
"grad_norm": 0.5335083589116082,
"learning_rate": 9.254418999503271e-05,
"loss": 1.2339,
"step": 272
},
{
"epoch": 0.2,
"grad_norm": 0.5974061497388541,
"learning_rate": 9.248116991484229e-05,
"loss": 1.2825,
"step": 273
},
{
"epoch": 0.2,
"grad_norm": 0.5381713380415607,
"learning_rate": 9.241790626210549e-05,
"loss": 1.1895,
"step": 274
},
{
"epoch": 0.2,
"grad_norm": 0.5384430847504001,
"learning_rate": 9.235439939955457e-05,
"loss": 1.2358,
"step": 275
},
{
"epoch": 0.2,
"grad_norm": 0.5256588888016233,
"learning_rate": 9.229064969131621e-05,
"loss": 1.2407,
"step": 276
},
{
"epoch": 0.2,
"grad_norm": 0.5242296953154587,
"learning_rate": 9.222665750290953e-05,
"loss": 1.2832,
"step": 277
},
{
"epoch": 0.21,
"grad_norm": 0.5224106607183625,
"learning_rate": 9.216242320124388e-05,
"loss": 1.2388,
"step": 278
},
{
"epoch": 0.21,
"grad_norm": 0.540400861953043,
"learning_rate": 9.20979471546169e-05,
"loss": 1.2695,
"step": 279
},
{
"epoch": 0.21,
"grad_norm": 0.5289483661482471,
"learning_rate": 9.203322973271223e-05,
"loss": 1.2832,
"step": 280
},
{
"epoch": 0.21,
"grad_norm": 0.5376637104674151,
"learning_rate": 9.19682713065975e-05,
"loss": 1.2783,
"step": 281
},
{
"epoch": 0.21,
"grad_norm": 0.5547766359095799,
"learning_rate": 9.19030722487222e-05,
"loss": 1.2515,
"step": 282
},
{
"epoch": 0.21,
"grad_norm": 0.5431030883095361,
"learning_rate": 9.183763293291549e-05,
"loss": 1.2346,
"step": 283
},
{
"epoch": 0.21,
"grad_norm": 0.5767856753870191,
"learning_rate": 9.17719537343841e-05,
"loss": 1.2974,
"step": 284
},
{
"epoch": 0.21,
"grad_norm": 0.5356401648893151,
"learning_rate": 9.170603502971016e-05,
"loss": 1.2532,
"step": 285
},
{
"epoch": 0.21,
"grad_norm": 0.5528695803408737,
"learning_rate": 9.163987719684907e-05,
"loss": 1.3442,
"step": 286
},
{
"epoch": 0.21,
"grad_norm": 0.5356080125920785,
"learning_rate": 9.157348061512727e-05,
"loss": 1.2686,
"step": 287
},
{
"epoch": 0.21,
"grad_norm": 0.5778656916381988,
"learning_rate": 9.150684566524012e-05,
"loss": 1.2041,
"step": 288
},
{
"epoch": 0.21,
"grad_norm": 0.5328749801157324,
"learning_rate": 9.143997272924973e-05,
"loss": 1.2437,
"step": 289
},
{
"epoch": 0.21,
"grad_norm": 0.5656275076768376,
"learning_rate": 9.13728621905827e-05,
"loss": 1.2886,
"step": 290
},
{
"epoch": 0.22,
"grad_norm": 0.5655646337419664,
"learning_rate": 9.130551443402799e-05,
"loss": 1.2783,
"step": 291
},
{
"epoch": 0.22,
"grad_norm": 0.567975953014803,
"learning_rate": 9.123792984573466e-05,
"loss": 1.3223,
"step": 292
},
{
"epoch": 0.22,
"grad_norm": 0.5361585380833186,
"learning_rate": 9.117010881320973e-05,
"loss": 1.2231,
"step": 293
},
{
"epoch": 0.22,
"grad_norm": 0.5527612532950269,
"learning_rate": 9.110205172531585e-05,
"loss": 1.3506,
"step": 294
},
{
"epoch": 0.22,
"grad_norm": 0.5330323483779986,
"learning_rate": 9.103375897226918e-05,
"loss": 1.2974,
"step": 295
},
{
"epoch": 0.22,
"grad_norm": 0.541076058179259,
"learning_rate": 9.096523094563708e-05,
"loss": 1.2617,
"step": 296
},
{
"epoch": 0.22,
"grad_norm": 0.5340836977689315,
"learning_rate": 9.089646803833589e-05,
"loss": 1.2603,
"step": 297
},
{
"epoch": 0.22,
"grad_norm": 0.5383753245320845,
"learning_rate": 9.082747064462867e-05,
"loss": 1.2583,
"step": 298
},
{
"epoch": 0.22,
"grad_norm": 0.5192836861689345,
"learning_rate": 9.075823916012298e-05,
"loss": 1.2568,
"step": 299
},
{
"epoch": 0.22,
"grad_norm": 0.5744817919271316,
"learning_rate": 9.068877398176852e-05,
"loss": 1.2131,
"step": 300
},
{
"epoch": 0.22,
"grad_norm": 0.5323047093147705,
"learning_rate": 9.061907550785498e-05,
"loss": 1.2783,
"step": 301
},
{
"epoch": 0.22,
"grad_norm": 0.5607328564400242,
"learning_rate": 9.054914413800961e-05,
"loss": 1.3398,
"step": 302
},
{
"epoch": 0.22,
"grad_norm": 0.5782257895199574,
"learning_rate": 9.047898027319507e-05,
"loss": 1.2759,
"step": 303
},
{
"epoch": 0.22,
"grad_norm": 0.546644793451931,
"learning_rate": 9.040858431570702e-05,
"loss": 1.2632,
"step": 304
},
{
"epoch": 0.23,
"grad_norm": 0.5535852227341702,
"learning_rate": 9.033795666917191e-05,
"loss": 1.312,
"step": 305
},
{
"epoch": 0.23,
"grad_norm": 0.5371002551511538,
"learning_rate": 9.026709773854457e-05,
"loss": 1.2593,
"step": 306
},
{
"epoch": 0.23,
"grad_norm": 0.5394441228369942,
"learning_rate": 9.019600793010597e-05,
"loss": 1.269,
"step": 307
},
{
"epoch": 0.23,
"grad_norm": 0.5512445550522174,
"learning_rate": 9.012468765146079e-05,
"loss": 1.2686,
"step": 308
},
{
"epoch": 0.23,
"grad_norm": 0.5043850111181398,
"learning_rate": 9.005313731153524e-05,
"loss": 1.2363,
"step": 309
},
{
"epoch": 0.23,
"grad_norm": 0.5294693808157453,
"learning_rate": 8.998135732057458e-05,
"loss": 1.2725,
"step": 310
},
{
"epoch": 0.23,
"grad_norm": 0.5235449664008548,
"learning_rate": 8.990934809014077e-05,
"loss": 1.249,
"step": 311
},
{
"epoch": 0.23,
"grad_norm": 0.5228082226582549,
"learning_rate": 8.983711003311024e-05,
"loss": 1.2153,
"step": 312
},
{
"epoch": 0.23,
"grad_norm": 0.5525620828249341,
"learning_rate": 8.976464356367134e-05,
"loss": 1.2136,
"step": 313
},
{
"epoch": 0.23,
"grad_norm": 0.5605215996168639,
"learning_rate": 8.96919490973221e-05,
"loss": 1.271,
"step": 314
},
{
"epoch": 0.23,
"grad_norm": 0.5277359930208506,
"learning_rate": 8.961902705086785e-05,
"loss": 1.1836,
"step": 315
},
{
"epoch": 0.23,
"grad_norm": 0.5405930304733125,
"learning_rate": 8.954587784241871e-05,
"loss": 1.2705,
"step": 316
},
{
"epoch": 0.23,
"grad_norm": 0.5248476194932483,
"learning_rate": 8.947250189138731e-05,
"loss": 1.2607,
"step": 317
},
{
"epoch": 0.24,
"grad_norm": 0.573678896783169,
"learning_rate": 8.939889961848634e-05,
"loss": 1.2727,
"step": 318
},
{
"epoch": 0.24,
"grad_norm": 0.5773485095137408,
"learning_rate": 8.932507144572616e-05,
"loss": 1.2607,
"step": 319
},
{
"epoch": 0.24,
"grad_norm": 0.5633980526681968,
"learning_rate": 8.925101779641232e-05,
"loss": 1.1917,
"step": 320
},
{
"epoch": 0.24,
"grad_norm": 0.5300371631849218,
"learning_rate": 8.917673909514322e-05,
"loss": 1.3105,
"step": 321
},
{
"epoch": 0.24,
"grad_norm": 0.5310192196200603,
"learning_rate": 8.910223576780758e-05,
"loss": 1.2808,
"step": 322
},
{
"epoch": 0.24,
"grad_norm": 0.5234569464366723,
"learning_rate": 8.902750824158212e-05,
"loss": 1.2468,
"step": 323
},
{
"epoch": 0.24,
"grad_norm": 0.5473770126434013,
"learning_rate": 8.895255694492896e-05,
"loss": 1.2676,
"step": 324
},
{
"epoch": 0.24,
"grad_norm": 0.5670393642092653,
"learning_rate": 8.887738230759333e-05,
"loss": 1.2456,
"step": 325
},
{
"epoch": 0.24,
"grad_norm": 0.5484650752546845,
"learning_rate": 8.880198476060095e-05,
"loss": 1.251,
"step": 326
},
{
"epoch": 0.24,
"grad_norm": 0.5569076336735002,
"learning_rate": 8.872636473625565e-05,
"loss": 1.272,
"step": 327
},
{
"epoch": 0.24,
"grad_norm": 0.5237290090420638,
"learning_rate": 8.865052266813685e-05,
"loss": 1.2822,
"step": 328
},
{
"epoch": 0.24,
"grad_norm": 0.5507489271814671,
"learning_rate": 8.857445899109715e-05,
"loss": 1.2783,
"step": 329
},
{
"epoch": 0.24,
"grad_norm": 0.5527246685898635,
"learning_rate": 8.849817414125973e-05,
"loss": 1.2705,
"step": 330
},
{
"epoch": 0.24,
"grad_norm": 0.5544016696123183,
"learning_rate": 8.84216685560159e-05,
"loss": 1.2856,
"step": 331
},
{
"epoch": 0.25,
"grad_norm": 0.5424146088216879,
"learning_rate": 8.834494267402263e-05,
"loss": 1.2202,
"step": 332
},
{
"epoch": 0.25,
"grad_norm": 0.5323806898987287,
"learning_rate": 8.826799693519996e-05,
"loss": 1.248,
"step": 333
},
{
"epoch": 0.25,
"grad_norm": 0.5595146324987165,
"learning_rate": 8.819083178072852e-05,
"loss": 1.1672,
"step": 334
},
{
"epoch": 0.25,
"grad_norm": 0.5854406580169095,
"learning_rate": 8.811344765304698e-05,
"loss": 1.2146,
"step": 335
},
{
"epoch": 0.25,
"grad_norm": 0.5697562446019094,
"learning_rate": 8.80358449958496e-05,
"loss": 1.2568,
"step": 336
},
{
"epoch": 0.25,
"grad_norm": 0.5538906977604374,
"learning_rate": 8.795802425408352e-05,
"loss": 1.2544,
"step": 337
},
{
"epoch": 0.25,
"grad_norm": 0.5211793067308176,
"learning_rate": 8.787998587394637e-05,
"loss": 1.2183,
"step": 338
},
{
"epoch": 0.25,
"grad_norm": 0.5732446722628473,
"learning_rate": 8.780173030288359e-05,
"loss": 1.3057,
"step": 339
},
{
"epoch": 0.25,
"grad_norm": 0.5352980539739127,
"learning_rate": 8.772325798958597e-05,
"loss": 1.2598,
"step": 340
},
{
"epoch": 0.25,
"grad_norm": 0.5234917926015726,
"learning_rate": 8.7644569383987e-05,
"loss": 1.1982,
"step": 341
},
{
"epoch": 0.25,
"grad_norm": 0.5844314852721842,
"learning_rate": 8.75656649372603e-05,
"loss": 1.2656,
"step": 342
},
{
"epoch": 0.25,
"grad_norm": 0.5646854448914282,
"learning_rate": 8.748654510181709e-05,
"loss": 1.21,
"step": 343
},
{
"epoch": 0.25,
"grad_norm": 0.5216723813831847,
"learning_rate": 8.740721033130352e-05,
"loss": 1.2329,
"step": 344
},
{
"epoch": 0.25,
"grad_norm": 0.5099027314874095,
"learning_rate": 8.732766108059813e-05,
"loss": 1.2236,
"step": 345
},
{
"epoch": 0.26,
"grad_norm": 0.5188769999186538,
"learning_rate": 8.72478978058092e-05,
"loss": 1.2905,
"step": 346
},
{
"epoch": 0.26,
"grad_norm": 0.5245157404984339,
"learning_rate": 8.716792096427217e-05,
"loss": 1.2339,
"step": 347
},
{
"epoch": 0.26,
"grad_norm": 0.5160205485678449,
"learning_rate": 8.708773101454697e-05,
"loss": 1.2524,
"step": 348
},
{
"epoch": 0.26,
"grad_norm": 0.510633107323387,
"learning_rate": 8.700732841641542e-05,
"loss": 1.2756,
"step": 349
},
{
"epoch": 0.26,
"grad_norm": 0.5097028901140956,
"learning_rate": 8.692671363087863e-05,
"loss": 1.2539,
"step": 350
},
{
"epoch": 0.26,
"grad_norm": 0.5506040438253419,
"learning_rate": 8.68458871201543e-05,
"loss": 1.1733,
"step": 351
},
{
"epoch": 0.26,
"grad_norm": 0.5339837805003954,
"learning_rate": 8.676484934767409e-05,
"loss": 1.1919,
"step": 352
},
{
"epoch": 0.26,
"grad_norm": 0.5243053855032012,
"learning_rate": 8.668360077808093e-05,
"loss": 1.2637,
"step": 353
},
{
"epoch": 0.26,
"grad_norm": 0.5475923045103417,
"learning_rate": 8.660214187722646e-05,
"loss": 1.2583,
"step": 354
},
{
"epoch": 0.26,
"grad_norm": 0.5139607250185231,
"learning_rate": 8.652047311216822e-05,
"loss": 1.2939,
"step": 355
},
{
"epoch": 0.26,
"grad_norm": 0.5310090229071474,
"learning_rate": 8.64385949511671e-05,
"loss": 1.2788,
"step": 356
},
{
"epoch": 0.26,
"grad_norm": 0.5531120494965365,
"learning_rate": 8.635650786368452e-05,
"loss": 1.25,
"step": 357
},
{
"epoch": 0.26,
"grad_norm": 0.5315969577054235,
"learning_rate": 8.627421232037989e-05,
"loss": 1.2357,
"step": 358
},
{
"epoch": 0.27,
"grad_norm": 0.5266216921573422,
"learning_rate": 8.619170879310779e-05,
"loss": 1.2729,
"step": 359
},
{
"epoch": 0.27,
"grad_norm": 0.5593055072800345,
"learning_rate": 8.61089977549153e-05,
"loss": 1.2529,
"step": 360
},
{
"epoch": 0.27,
"grad_norm": 0.5596710951308123,
"learning_rate": 8.602607968003935e-05,
"loss": 1.2725,
"step": 361
},
{
"epoch": 0.27,
"grad_norm": 0.5433552854623133,
"learning_rate": 8.59429550439039e-05,
"loss": 1.2446,
"step": 362
},
{
"epoch": 0.27,
"grad_norm": 0.5818949631250041,
"learning_rate": 8.585962432311727e-05,
"loss": 1.2998,
"step": 363
},
{
"epoch": 0.27,
"grad_norm": 0.514243535892493,
"learning_rate": 8.577608799546942e-05,
"loss": 1.23,
"step": 364
},
{
"epoch": 0.27,
"grad_norm": 0.5465838481685172,
"learning_rate": 8.569234653992916e-05,
"loss": 1.2532,
"step": 365
},
{
"epoch": 0.27,
"grad_norm": 0.519563471824199,
"learning_rate": 8.560840043664144e-05,
"loss": 1.2607,
"step": 366
},
{
"epoch": 0.27,
"grad_norm": 0.5334398982863738,
"learning_rate": 8.552425016692464e-05,
"loss": 1.2363,
"step": 367
},
{
"epoch": 0.27,
"grad_norm": 0.5530652812053678,
"learning_rate": 8.543989621326768e-05,
"loss": 1.2681,
"step": 368
},
{
"epoch": 0.27,
"grad_norm": 0.5502954863671434,
"learning_rate": 8.535533905932738e-05,
"loss": 1.1721,
"step": 369
},
{
"epoch": 0.27,
"grad_norm": 0.5180001078920966,
"learning_rate": 8.527057918992565e-05,
"loss": 1.2139,
"step": 370
},
{
"epoch": 0.27,
"grad_norm": 0.5333180911534254,
"learning_rate": 8.518561709104667e-05,
"loss": 1.2461,
"step": 371
},
{
"epoch": 0.27,
"grad_norm": 0.5479350107655593,
"learning_rate": 8.510045324983417e-05,
"loss": 1.2512,
"step": 372
},
{
"epoch": 0.28,
"grad_norm": 0.5246093324411485,
"learning_rate": 8.501508815458855e-05,
"loss": 1.1787,
"step": 373
},
{
"epoch": 0.28,
"grad_norm": 0.50033135264865,
"learning_rate": 8.492952229476421e-05,
"loss": 1.2271,
"step": 374
},
{
"epoch": 0.28,
"grad_norm": 0.5418162221365314,
"learning_rate": 8.484375616096658e-05,
"loss": 1.2383,
"step": 375
},
{
"epoch": 0.28,
"grad_norm": 0.516783670359288,
"learning_rate": 8.475779024494945e-05,
"loss": 1.2681,
"step": 376
},
{
"epoch": 0.28,
"grad_norm": 0.5298750460233759,
"learning_rate": 8.467162503961208e-05,
"loss": 1.2451,
"step": 377
},
{
"epoch": 0.28,
"grad_norm": 0.5149476400550106,
"learning_rate": 8.45852610389964e-05,
"loss": 1.23,
"step": 378
},
{
"epoch": 0.28,
"grad_norm": 0.5268563601419046,
"learning_rate": 8.449869873828411e-05,
"loss": 1.2129,
"step": 379
},
{
"epoch": 0.28,
"grad_norm": 0.5357435202461692,
"learning_rate": 8.441193863379396e-05,
"loss": 1.2881,
"step": 380
},
{
"epoch": 0.28,
"grad_norm": 0.5407114377511073,
"learning_rate": 8.432498122297878e-05,
"loss": 1.2559,
"step": 381
},
{
"epoch": 0.28,
"grad_norm": 0.5376253272809564,
"learning_rate": 8.423782700442277e-05,
"loss": 1.2346,
"step": 382
},
{
"epoch": 0.28,
"grad_norm": 0.5378153063595059,
"learning_rate": 8.415047647783847e-05,
"loss": 1.2031,
"step": 383
},
{
"epoch": 0.28,
"grad_norm": 0.514779002563088,
"learning_rate": 8.406293014406403e-05,
"loss": 1.2056,
"step": 384
},
{
"epoch": 0.28,
"grad_norm": 0.5659231392943161,
"learning_rate": 8.397518850506028e-05,
"loss": 1.2346,
"step": 385
},
{
"epoch": 0.29,
"grad_norm": 0.5483974446090379,
"learning_rate": 8.388725206390788e-05,
"loss": 1.2974,
"step": 386
},
{
"epoch": 0.29,
"grad_norm": 0.5297423113703096,
"learning_rate": 8.379912132480441e-05,
"loss": 1.2427,
"step": 387
},
{
"epoch": 0.29,
"grad_norm": 0.5339239833592698,
"learning_rate": 8.371079679306146e-05,
"loss": 1.2788,
"step": 388
},
{
"epoch": 0.29,
"grad_norm": 0.5346762752364651,
"learning_rate": 8.36222789751018e-05,
"loss": 1.2329,
"step": 389
},
{
"epoch": 0.29,
"grad_norm": 0.5267945253503268,
"learning_rate": 8.353356837845642e-05,
"loss": 1.3101,
"step": 390
},
{
"epoch": 0.29,
"grad_norm": 0.5227678407329124,
"learning_rate": 8.344466551176164e-05,
"loss": 1.2544,
"step": 391
},
{
"epoch": 0.29,
"grad_norm": 0.5351886972585579,
"learning_rate": 8.335557088475618e-05,
"loss": 1.2036,
"step": 392
},
{
"epoch": 0.29,
"grad_norm": 0.547855768363372,
"learning_rate": 8.326628500827826e-05,
"loss": 1.2256,
"step": 393
},
{
"epoch": 0.29,
"grad_norm": 0.5232912428703006,
"learning_rate": 8.31768083942627e-05,
"loss": 1.2524,
"step": 394
},
{
"epoch": 0.29,
"grad_norm": 0.5355407135538937,
"learning_rate": 8.308714155573785e-05,
"loss": 1.1904,
"step": 395
},
{
"epoch": 0.29,
"grad_norm": 0.5398818834520477,
"learning_rate": 8.29972850068228e-05,
"loss": 1.2544,
"step": 396
},
{
"epoch": 0.29,
"grad_norm": 0.5365767973671521,
"learning_rate": 8.290723926272439e-05,
"loss": 1.2378,
"step": 397
},
{
"epoch": 0.29,
"grad_norm": 0.5505960932890972,
"learning_rate": 8.281700483973421e-05,
"loss": 1.2471,
"step": 398
},
{
"epoch": 0.29,
"grad_norm": 0.5479428166637395,
"learning_rate": 8.272658225522569e-05,
"loss": 1.2607,
"step": 399
},
{
"epoch": 0.3,
"grad_norm": 0.5764125413085645,
"learning_rate": 8.263597202765109e-05,
"loss": 1.2888,
"step": 400
},
{
"epoch": 0.3,
"grad_norm": 0.5193462362673806,
"learning_rate": 8.254517467653858e-05,
"loss": 1.1882,
"step": 401
},
{
"epoch": 0.3,
"grad_norm": 0.5374168368793678,
"learning_rate": 8.245419072248919e-05,
"loss": 1.2358,
"step": 402
},
{
"epoch": 0.3,
"grad_norm": 0.5560345573494497,
"learning_rate": 8.236302068717392e-05,
"loss": 1.3,
"step": 403
},
{
"epoch": 0.3,
"grad_norm": 0.5223138605512301,
"learning_rate": 8.227166509333068e-05,
"loss": 1.2559,
"step": 404
},
{
"epoch": 0.3,
"grad_norm": 0.5009208364979428,
"learning_rate": 8.218012446476128e-05,
"loss": 1.2617,
"step": 405
},
{
"epoch": 0.3,
"grad_norm": 0.509867725986647,
"learning_rate": 8.208839932632849e-05,
"loss": 1.2715,
"step": 406
},
{
"epoch": 0.3,
"grad_norm": 0.5190782935920448,
"learning_rate": 8.199649020395298e-05,
"loss": 1.2183,
"step": 407
},
{
"epoch": 0.3,
"grad_norm": 0.551317848502644,
"learning_rate": 8.190439762461033e-05,
"loss": 1.2241,
"step": 408
},
{
"epoch": 0.3,
"grad_norm": 0.5299140869699253,
"learning_rate": 8.181212211632799e-05,
"loss": 1.1746,
"step": 409
},
{
"epoch": 0.3,
"grad_norm": 0.5161200175965883,
"learning_rate": 8.171966420818228e-05,
"loss": 1.2544,
"step": 410
},
{
"epoch": 0.3,
"grad_norm": 0.5368310977870265,
"learning_rate": 8.162702443029531e-05,
"loss": 1.2505,
"step": 411
},
{
"epoch": 0.3,
"grad_norm": 0.5392135585371384,
"learning_rate": 8.153420331383199e-05,
"loss": 1.2378,
"step": 412
},
{
"epoch": 0.31,
"grad_norm": 0.5652426070182841,
"learning_rate": 8.144120139099697e-05,
"loss": 1.2788,
"step": 413
},
{
"epoch": 0.31,
"grad_norm": 0.5264883521440279,
"learning_rate": 8.134801919503154e-05,
"loss": 1.2432,
"step": 414
},
{
"epoch": 0.31,
"grad_norm": 0.5391198787958846,
"learning_rate": 8.125465726021069e-05,
"loss": 1.2642,
"step": 415
},
{
"epoch": 0.31,
"grad_norm": 0.5447234901673647,
"learning_rate": 8.116111612183989e-05,
"loss": 1.2598,
"step": 416
},
{
"epoch": 0.31,
"grad_norm": 0.5239448356746366,
"learning_rate": 8.106739631625217e-05,
"loss": 1.2383,
"step": 417
},
{
"epoch": 0.31,
"grad_norm": 0.522466994953917,
"learning_rate": 8.09734983808049e-05,
"loss": 1.21,
"step": 418
},
{
"epoch": 0.31,
"grad_norm": 0.49320728726020635,
"learning_rate": 8.087942285387688e-05,
"loss": 1.1643,
"step": 419
},
{
"epoch": 0.31,
"grad_norm": 0.538615135680076,
"learning_rate": 8.07851702748651e-05,
"loss": 1.2485,
"step": 420
},
{
"epoch": 0.31,
"grad_norm": 0.5546864636999657,
"learning_rate": 8.06907411841817e-05,
"loss": 1.1887,
"step": 421
},
{
"epoch": 0.31,
"grad_norm": 0.5337150121699967,
"learning_rate": 8.05961361232509e-05,
"loss": 1.2378,
"step": 422
},
{
"epoch": 0.31,
"grad_norm": 0.5548120199862732,
"learning_rate": 8.050135563450587e-05,
"loss": 1.2129,
"step": 423
},
{
"epoch": 0.31,
"grad_norm": 0.5491477319207145,
"learning_rate": 8.040640026138562e-05,
"loss": 1.2615,
"step": 424
},
{
"epoch": 0.31,
"grad_norm": 0.5292609791678348,
"learning_rate": 8.03112705483319e-05,
"loss": 1.1963,
"step": 425
},
{
"epoch": 0.31,
"grad_norm": 0.5386073890465884,
"learning_rate": 8.021596704078605e-05,
"loss": 1.2822,
"step": 426
},
{
"epoch": 0.32,
"grad_norm": 0.5208877771953219,
"learning_rate": 8.012049028518589e-05,
"loss": 1.2468,
"step": 427
},
{
"epoch": 0.32,
"grad_norm": 0.5300893442105213,
"learning_rate": 8.002484082896257e-05,
"loss": 1.2141,
"step": 428
},
{
"epoch": 0.32,
"grad_norm": 0.5426660622332912,
"learning_rate": 7.992901922053752e-05,
"loss": 1.2083,
"step": 429
},
{
"epoch": 0.32,
"grad_norm": 0.5280778314237736,
"learning_rate": 7.983302600931911e-05,
"loss": 1.2556,
"step": 430
},
{
"epoch": 0.32,
"grad_norm": 0.5303015472910759,
"learning_rate": 7.973686174569972e-05,
"loss": 1.2246,
"step": 431
},
{
"epoch": 0.32,
"grad_norm": 0.5385117857553907,
"learning_rate": 7.964052698105247e-05,
"loss": 1.2544,
"step": 432
},
{
"epoch": 0.32,
"grad_norm": 0.5175160927509813,
"learning_rate": 7.954402226772804e-05,
"loss": 1.1724,
"step": 433
},
{
"epoch": 0.32,
"grad_norm": 0.5167307050244405,
"learning_rate": 7.944734815905154e-05,
"loss": 1.228,
"step": 434
},
{
"epoch": 0.32,
"grad_norm": 0.533666702216578,
"learning_rate": 7.93505052093194e-05,
"loss": 1.2349,
"step": 435
},
{
"epoch": 0.32,
"grad_norm": 0.5259498652131873,
"learning_rate": 7.925349397379604e-05,
"loss": 1.2415,
"step": 436
},
{
"epoch": 0.32,
"grad_norm": 0.5445977576017799,
"learning_rate": 7.915631500871083e-05,
"loss": 1.2065,
"step": 437
},
{
"epoch": 0.32,
"grad_norm": 0.5649990455410109,
"learning_rate": 7.905896887125482e-05,
"loss": 1.2417,
"step": 438
},
{
"epoch": 0.32,
"grad_norm": 0.5260513948557283,
"learning_rate": 7.896145611957759e-05,
"loss": 1.1918,
"step": 439
},
{
"epoch": 0.33,
"grad_norm": 0.5258410063287358,
"learning_rate": 7.8863777312784e-05,
"loss": 1.2124,
"step": 440
},
{
"epoch": 0.33,
"grad_norm": 0.5434644442116746,
"learning_rate": 7.876593301093104e-05,
"loss": 1.2349,
"step": 441
},
{
"epoch": 0.33,
"grad_norm": 0.5462561748612222,
"learning_rate": 7.866792377502457e-05,
"loss": 1.2373,
"step": 442
},
{
"epoch": 0.33,
"grad_norm": 0.5661256454549024,
"learning_rate": 7.856975016701615e-05,
"loss": 1.2334,
"step": 443
},
{
"epoch": 0.33,
"grad_norm": 0.5517524055311237,
"learning_rate": 7.847141274979977e-05,
"loss": 1.2549,
"step": 444
},
{
"epoch": 0.33,
"grad_norm": 0.5588533911643465,
"learning_rate": 7.837291208720866e-05,
"loss": 1.248,
"step": 445
},
{
"epoch": 0.33,
"grad_norm": 0.5432341108696274,
"learning_rate": 7.827424874401203e-05,
"loss": 1.207,
"step": 446
},
{
"epoch": 0.33,
"grad_norm": 0.5185655878803792,
"learning_rate": 7.81754232859119e-05,
"loss": 1.2087,
"step": 447
},
{
"epoch": 0.33,
"grad_norm": 0.546989000271988,
"learning_rate": 7.807643627953969e-05,
"loss": 1.2852,
"step": 448
},
{
"epoch": 0.33,
"grad_norm": 0.5609807732483688,
"learning_rate": 7.797728829245321e-05,
"loss": 1.23,
"step": 449
},
{
"epoch": 0.33,
"grad_norm": 0.5290536891546959,
"learning_rate": 7.787797989313317e-05,
"loss": 1.1687,
"step": 450
},
{
"epoch": 0.33,
"grad_norm": 0.527486366572943,
"learning_rate": 7.777851165098012e-05,
"loss": 1.2349,
"step": 451
},
{
"epoch": 0.33,
"grad_norm": 0.5444668761845415,
"learning_rate": 7.767888413631101e-05,
"loss": 1.248,
"step": 452
},
{
"epoch": 0.33,
"grad_norm": 0.5194113505588946,
"learning_rate": 7.757909792035608e-05,
"loss": 1.3081,
"step": 453
},
{
"epoch": 0.34,
"grad_norm": 0.5174613130879753,
"learning_rate": 7.747915357525545e-05,
"loss": 1.2046,
"step": 454
},
{
"epoch": 0.34,
"grad_norm": 0.5535670191712191,
"learning_rate": 7.737905167405595e-05,
"loss": 1.2136,
"step": 455
},
{
"epoch": 0.34,
"grad_norm": 0.546209627520353,
"learning_rate": 7.727879279070773e-05,
"loss": 1.2097,
"step": 456
},
{
"epoch": 0.34,
"grad_norm": 0.5221397456131871,
"learning_rate": 7.717837750006106e-05,
"loss": 1.2832,
"step": 457
},
{
"epoch": 0.34,
"grad_norm": 0.5380906003507856,
"learning_rate": 7.7077806377863e-05,
"loss": 1.1807,
"step": 458
},
{
"epoch": 0.34,
"grad_norm": 0.546159089637007,
"learning_rate": 7.697708000075403e-05,
"loss": 1.262,
"step": 459
},
{
"epoch": 0.34,
"grad_norm": 0.5378903447286532,
"learning_rate": 7.687619894626493e-05,
"loss": 1.2639,
"step": 460
},
{
"epoch": 0.34,
"grad_norm": 0.5183593724417229,
"learning_rate": 7.677516379281321e-05,
"loss": 1.2344,
"step": 461
},
{
"epoch": 0.34,
"grad_norm": 0.5110004203240966,
"learning_rate": 7.667397511970005e-05,
"loss": 1.2144,
"step": 462
},
{
"epoch": 0.34,
"grad_norm": 0.5237401648978784,
"learning_rate": 7.657263350710676e-05,
"loss": 1.1992,
"step": 463
},
{
"epoch": 0.34,
"grad_norm": 0.5458624581753624,
"learning_rate": 7.647113953609163e-05,
"loss": 1.252,
"step": 464
},
{
"epoch": 0.34,
"grad_norm": 0.55612272064723,
"learning_rate": 7.636949378858646e-05,
"loss": 1.188,
"step": 465
},
{
"epoch": 0.34,
"grad_norm": 0.5578526299155908,
"learning_rate": 7.626769684739337e-05,
"loss": 1.1951,
"step": 466
},
{
"epoch": 0.35,
"grad_norm": 0.5092511020982519,
"learning_rate": 7.616574929618125e-05,
"loss": 1.1543,
"step": 467
},
{
"epoch": 0.35,
"grad_norm": 0.5348616024567703,
"learning_rate": 7.606365171948267e-05,
"loss": 1.2368,
"step": 468
},
{
"epoch": 0.35,
"grad_norm": 0.532298079012496,
"learning_rate": 7.596140470269029e-05,
"loss": 1.2107,
"step": 469
},
{
"epoch": 0.35,
"grad_norm": 0.5514395726265122,
"learning_rate": 7.585900883205364e-05,
"loss": 1.241,
"step": 470
},
{
"epoch": 0.35,
"grad_norm": 0.5539874834294591,
"learning_rate": 7.575646469467575e-05,
"loss": 1.2249,
"step": 471
},
{
"epoch": 0.35,
"grad_norm": 0.5141238427544136,
"learning_rate": 7.565377287850977e-05,
"loss": 1.21,
"step": 472
},
{
"epoch": 0.35,
"grad_norm": 0.526119772429715,
"learning_rate": 7.555093397235552e-05,
"loss": 1.2141,
"step": 473
},
{
"epoch": 0.35,
"grad_norm": 0.5239544155150679,
"learning_rate": 7.544794856585626e-05,
"loss": 1.2446,
"step": 474
},
{
"epoch": 0.35,
"grad_norm": 0.5116743183638587,
"learning_rate": 7.53448172494952e-05,
"loss": 1.2251,
"step": 475
},
{
"epoch": 0.35,
"grad_norm": 0.5465278452905271,
"learning_rate": 7.524154061459215e-05,
"loss": 1.1744,
"step": 476
},
{
"epoch": 0.35,
"grad_norm": 0.5242898434746838,
"learning_rate": 7.51381192533001e-05,
"loss": 1.2305,
"step": 477
},
{
"epoch": 0.35,
"grad_norm": 0.5524906450650563,
"learning_rate": 7.503455375860192e-05,
"loss": 1.271,
"step": 478
},
{
"epoch": 0.35,
"grad_norm": 0.5422094091125237,
"learning_rate": 7.493084472430682e-05,
"loss": 1.2983,
"step": 479
},
{
"epoch": 0.35,
"grad_norm": 0.5100606069460412,
"learning_rate": 7.482699274504708e-05,
"loss": 1.1914,
"step": 480
},
{
"epoch": 0.36,
"grad_norm": 0.5258246755815246,
"learning_rate": 7.472299841627451e-05,
"loss": 1.1948,
"step": 481
},
{
"epoch": 0.36,
"grad_norm": 0.5183104456102203,
"learning_rate": 7.461886233425717e-05,
"loss": 1.1658,
"step": 482
},
{
"epoch": 0.36,
"grad_norm": 0.5283305385961874,
"learning_rate": 7.451458509607582e-05,
"loss": 1.2378,
"step": 483
},
{
"epoch": 0.36,
"grad_norm": 0.5552677702446687,
"learning_rate": 7.441016729962064e-05,
"loss": 1.1938,
"step": 484
},
{
"epoch": 0.36,
"grad_norm": 0.5198625616185957,
"learning_rate": 7.430560954358764e-05,
"loss": 1.2515,
"step": 485
},
{
"epoch": 0.36,
"grad_norm": 0.524907115545136,
"learning_rate": 7.420091242747536e-05,
"loss": 1.2437,
"step": 486
},
{
"epoch": 0.36,
"grad_norm": 0.520819742542826,
"learning_rate": 7.409607655158139e-05,
"loss": 1.2764,
"step": 487
},
{
"epoch": 0.36,
"grad_norm": 0.5297968503831433,
"learning_rate": 7.399110251699887e-05,
"loss": 1.2529,
"step": 488
},
{
"epoch": 0.36,
"grad_norm": 0.5214545833543685,
"learning_rate": 7.388599092561315e-05,
"loss": 1.2979,
"step": 489
},
{
"epoch": 0.36,
"grad_norm": 0.5158994351772959,
"learning_rate": 7.378074238009826e-05,
"loss": 1.2363,
"step": 490
},
{
"epoch": 0.36,
"grad_norm": 0.49265767229951024,
"learning_rate": 7.367535748391349e-05,
"loss": 1.228,
"step": 491
},
{
"epoch": 0.36,
"grad_norm": 0.5308141896787576,
"learning_rate": 7.35698368412999e-05,
"loss": 1.2527,
"step": 492
},
{
"epoch": 0.36,
"grad_norm": 0.5185543266636785,
"learning_rate": 7.346418105727686e-05,
"loss": 1.2192,
"step": 493
},
{
"epoch": 0.37,
"grad_norm": 0.5231300605729964,
"learning_rate": 7.335839073763865e-05,
"loss": 1.2065,
"step": 494
},
{
"epoch": 0.37,
"grad_norm": 0.5399567824066669,
"learning_rate": 7.325246648895088e-05,
"loss": 1.2563,
"step": 495
},
{
"epoch": 0.37,
"grad_norm": 0.5239942836551379,
"learning_rate": 7.31464089185471e-05,
"loss": 1.2549,
"step": 496
},
{
"epoch": 0.37,
"grad_norm": 0.5367247940798874,
"learning_rate": 7.304021863452524e-05,
"loss": 1.2061,
"step": 497
},
{
"epoch": 0.37,
"grad_norm": 0.5404506218621764,
"learning_rate": 7.293389624574422e-05,
"loss": 1.2142,
"step": 498
},
{
"epoch": 0.37,
"grad_norm": 0.5055969660442964,
"learning_rate": 7.282744236182034e-05,
"loss": 1.2451,
"step": 499
},
{
"epoch": 0.37,
"grad_norm": 0.5423433133756662,
"learning_rate": 7.27208575931239e-05,
"loss": 1.2012,
"step": 500
},
{
"epoch": 0.37,
"grad_norm": 0.5291351969461193,
"learning_rate": 7.26141425507756e-05,
"loss": 1.1768,
"step": 501
},
{
"epoch": 0.37,
"grad_norm": 0.5217703642849318,
"learning_rate": 7.250729784664316e-05,
"loss": 1.209,
"step": 502
},
{
"epoch": 0.37,
"grad_norm": 0.5201622197991884,
"learning_rate": 7.240032409333764e-05,
"loss": 1.2031,
"step": 503
},
{
"epoch": 0.37,
"grad_norm": 0.5281271991799672,
"learning_rate": 7.22932219042101e-05,
"loss": 1.1987,
"step": 504
},
{
"epoch": 0.37,
"grad_norm": 0.5573441678253518,
"learning_rate": 7.218599189334799e-05,
"loss": 1.2739,
"step": 505
},
{
"epoch": 0.37,
"grad_norm": 0.5665017191299871,
"learning_rate": 7.207863467557162e-05,
"loss": 1.2773,
"step": 506
},
{
"epoch": 0.37,
"grad_norm": 0.5325104774494102,
"learning_rate": 7.19711508664307e-05,
"loss": 1.209,
"step": 507
},
{
"epoch": 0.38,
"grad_norm": 0.518792873366363,
"learning_rate": 7.186354108220072e-05,
"loss": 1.2173,
"step": 508
},
{
"epoch": 0.38,
"grad_norm": 0.530762745727063,
"learning_rate": 7.175580593987951e-05,
"loss": 1.2466,
"step": 509
},
{
"epoch": 0.38,
"grad_norm": 0.5140061528285057,
"learning_rate": 7.164794605718366e-05,
"loss": 1.2139,
"step": 510
},
{
"epoch": 0.38,
"grad_norm": 0.5194168189274216,
"learning_rate": 7.153996205254495e-05,
"loss": 1.2476,
"step": 511
},
{
"epoch": 0.38,
"grad_norm": 0.5487088087238914,
"learning_rate": 7.143185454510686e-05,
"loss": 1.2251,
"step": 512
},
{
"epoch": 0.38,
"grad_norm": 0.49449833617368844,
"learning_rate": 7.1323624154721e-05,
"loss": 1.2021,
"step": 513
},
{
"epoch": 0.38,
"grad_norm": 0.5209680110441622,
"learning_rate": 7.121527150194349e-05,
"loss": 1.229,
"step": 514
},
{
"epoch": 0.38,
"grad_norm": 0.5179658980514732,
"learning_rate": 7.110679720803156e-05,
"loss": 1.2324,
"step": 515
},
{
"epoch": 0.38,
"grad_norm": 0.5237224991500224,
"learning_rate": 7.099820189493977e-05,
"loss": 1.269,
"step": 516
},
{
"epoch": 0.38,
"grad_norm": 0.5302189416292129,
"learning_rate": 7.088948618531667e-05,
"loss": 1.2041,
"step": 517
},
{
"epoch": 0.38,
"grad_norm": 0.5384341108312423,
"learning_rate": 7.078065070250106e-05,
"loss": 1.1746,
"step": 518
},
{
"epoch": 0.38,
"grad_norm": 0.5521437637462966,
"learning_rate": 7.067169607051851e-05,
"loss": 1.2886,
"step": 519
},
{
"epoch": 0.38,
"grad_norm": 0.5328288678743964,
"learning_rate": 7.056262291407772e-05,
"loss": 1.1877,
"step": 520
},
{
"epoch": 0.39,
"grad_norm": 0.5359494830051162,
"learning_rate": 7.045343185856701e-05,
"loss": 1.2202,
"step": 521
},
{
"epoch": 0.39,
"grad_norm": 0.5288532232218185,
"learning_rate": 7.034412353005063e-05,
"loss": 1.21,
"step": 522
},
{
"epoch": 0.39,
"grad_norm": 0.5512085122241619,
"learning_rate": 7.02346985552653e-05,
"loss": 1.2798,
"step": 523
},
{
"epoch": 0.39,
"grad_norm": 0.533944460040126,
"learning_rate": 7.01251575616165e-05,
"loss": 1.2539,
"step": 524
},
{
"epoch": 0.39,
"grad_norm": 0.5837632563221825,
"learning_rate": 7.0015501177175e-05,
"loss": 1.1335,
"step": 525
}
],
"logging_steps": 1.0,
"max_steps": 1353,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"total_flos": 4.4085090777437307e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}