functionary-small-v2.5 / trainer_state.json
khaimai's picture
Upload folder using huggingface_hub
486ee32 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994060582062958,
"eval_steps": 210,
"global_step": 631,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015838447832112453,
"grad_norm": 20.634467679531244,
"learning_rate": 4.2105263157894733e-07,
"loss": 1.0835,
"step": 1
},
{
"epoch": 0.0031676895664224905,
"grad_norm": 14.28313933877416,
"learning_rate": 8.421052631578947e-07,
"loss": 0.9055,
"step": 2
},
{
"epoch": 0.004751534349633736,
"grad_norm": 24.632900629424984,
"learning_rate": 1.263157894736842e-06,
"loss": 0.9363,
"step": 3
},
{
"epoch": 0.006335379132844981,
"grad_norm": 14.378103932792914,
"learning_rate": 1.6842105263157893e-06,
"loss": 0.9769,
"step": 4
},
{
"epoch": 0.007919223916056227,
"grad_norm": 12.06138179201479,
"learning_rate": 2.1052631578947366e-06,
"loss": 0.879,
"step": 5
},
{
"epoch": 0.009503068699267472,
"grad_norm": 8.884749518187892,
"learning_rate": 2.526315789473684e-06,
"loss": 0.8971,
"step": 6
},
{
"epoch": 0.011086913482478717,
"grad_norm": 4.885104408560213,
"learning_rate": 2.9473684210526313e-06,
"loss": 0.8804,
"step": 7
},
{
"epoch": 0.012670758265689962,
"grad_norm": 6.068115898146199,
"learning_rate": 3.3684210526315786e-06,
"loss": 0.7672,
"step": 8
},
{
"epoch": 0.014254603048901207,
"grad_norm": 5.471835730774232,
"learning_rate": 3.789473684210526e-06,
"loss": 0.7564,
"step": 9
},
{
"epoch": 0.015838447832112454,
"grad_norm": 4.785289669398289,
"learning_rate": 4.210526315789473e-06,
"loss": 0.8132,
"step": 10
},
{
"epoch": 0.017422292615323697,
"grad_norm": 6.148192535233664,
"learning_rate": 4.631578947368421e-06,
"loss": 0.788,
"step": 11
},
{
"epoch": 0.019006137398534944,
"grad_norm": 3.1102232110909718,
"learning_rate": 5.052631578947368e-06,
"loss": 0.7744,
"step": 12
},
{
"epoch": 0.020589982181746187,
"grad_norm": 2.2116201137156697,
"learning_rate": 5.473684210526316e-06,
"loss": 0.705,
"step": 13
},
{
"epoch": 0.022173826964957434,
"grad_norm": 2.3891700304125965,
"learning_rate": 5.894736842105263e-06,
"loss": 0.782,
"step": 14
},
{
"epoch": 0.02375767174816868,
"grad_norm": 2.6800168254599552,
"learning_rate": 6.31578947368421e-06,
"loss": 0.6875,
"step": 15
},
{
"epoch": 0.025341516531379924,
"grad_norm": 2.069273801603203,
"learning_rate": 6.736842105263157e-06,
"loss": 0.7177,
"step": 16
},
{
"epoch": 0.02692536131459117,
"grad_norm": 1.9967678968867362,
"learning_rate": 7.157894736842105e-06,
"loss": 0.6317,
"step": 17
},
{
"epoch": 0.028509206097802414,
"grad_norm": 1.8973112856460428,
"learning_rate": 7.578947368421052e-06,
"loss": 0.668,
"step": 18
},
{
"epoch": 0.03009305088101366,
"grad_norm": 1.6427223591398545,
"learning_rate": 8e-06,
"loss": 0.6237,
"step": 19
},
{
"epoch": 0.03167689566422491,
"grad_norm": 1.8126563046343525,
"learning_rate": 7.999947298139988e-06,
"loss": 0.7286,
"step": 20
},
{
"epoch": 0.033260740447436155,
"grad_norm": 2.0321574723423437,
"learning_rate": 7.999789193948692e-06,
"loss": 0.6975,
"step": 21
},
{
"epoch": 0.034844585230647394,
"grad_norm": 1.6806869406461464,
"learning_rate": 7.999525691592307e-06,
"loss": 0.577,
"step": 22
},
{
"epoch": 0.03642843001385864,
"grad_norm": 1.9508098776351803,
"learning_rate": 7.999156798014364e-06,
"loss": 0.7186,
"step": 23
},
{
"epoch": 0.03801227479706989,
"grad_norm": 1.818938048059436,
"learning_rate": 7.998682522935554e-06,
"loss": 0.6235,
"step": 24
},
{
"epoch": 0.039596119580281135,
"grad_norm": 1.763078793834794,
"learning_rate": 7.998102878853464e-06,
"loss": 0.6797,
"step": 25
},
{
"epoch": 0.041179964363492375,
"grad_norm": 1.7024348305084676,
"learning_rate": 7.997417881042254e-06,
"loss": 0.6611,
"step": 26
},
{
"epoch": 0.04276380914670362,
"grad_norm": 2.651929076311587,
"learning_rate": 7.996627547552254e-06,
"loss": 0.6378,
"step": 27
},
{
"epoch": 0.04434765392991487,
"grad_norm": 1.8122638725536788,
"learning_rate": 7.99573189920949e-06,
"loss": 0.6018,
"step": 28
},
{
"epoch": 0.045931498713126115,
"grad_norm": 1.715907567678854,
"learning_rate": 7.994730959615124e-06,
"loss": 0.5851,
"step": 29
},
{
"epoch": 0.04751534349633736,
"grad_norm": 4.091743172227538,
"learning_rate": 7.993624755144846e-06,
"loss": 0.7245,
"step": 30
},
{
"epoch": 0.0490991882795486,
"grad_norm": 2.044771221320041,
"learning_rate": 7.992413314948177e-06,
"loss": 0.5109,
"step": 31
},
{
"epoch": 0.05068303306275985,
"grad_norm": 1.7849876948071914,
"learning_rate": 7.991096670947687e-06,
"loss": 0.669,
"step": 32
},
{
"epoch": 0.052266877845971095,
"grad_norm": 2.0286922346779614,
"learning_rate": 7.989674857838172e-06,
"loss": 0.6604,
"step": 33
},
{
"epoch": 0.05385072262918234,
"grad_norm": 1.5979263059089825,
"learning_rate": 7.988147913085731e-06,
"loss": 0.6031,
"step": 34
},
{
"epoch": 0.05543456741239359,
"grad_norm": 1.8983896550694437,
"learning_rate": 7.986515876926776e-06,
"loss": 0.6479,
"step": 35
},
{
"epoch": 0.05701841219560483,
"grad_norm": 1.5118226263390602,
"learning_rate": 7.984778792366982e-06,
"loss": 0.7121,
"step": 36
},
{
"epoch": 0.058602256978816075,
"grad_norm": 1.654106143113525,
"learning_rate": 7.982936705180138e-06,
"loss": 0.5762,
"step": 37
},
{
"epoch": 0.06018610176202732,
"grad_norm": 1.656696649887629,
"learning_rate": 7.980989663906955e-06,
"loss": 0.5968,
"step": 38
},
{
"epoch": 0.06176994654523857,
"grad_norm": 1.6128252735493993,
"learning_rate": 7.978937719853785e-06,
"loss": 0.6237,
"step": 39
},
{
"epoch": 0.06335379132844982,
"grad_norm": 1.8318236683351248,
"learning_rate": 7.976780927091259e-06,
"loss": 0.5261,
"step": 40
},
{
"epoch": 0.06493763611166106,
"grad_norm": 1.7317276246642117,
"learning_rate": 7.97451934245287e-06,
"loss": 0.6436,
"step": 41
},
{
"epoch": 0.06652148089487231,
"grad_norm": 1.8588379937728277,
"learning_rate": 7.97215302553348e-06,
"loss": 0.6273,
"step": 42
},
{
"epoch": 0.06810532567808354,
"grad_norm": 1.478388678926987,
"learning_rate": 7.969682038687744e-06,
"loss": 0.6108,
"step": 43
},
{
"epoch": 0.06968917046129479,
"grad_norm": 1.6631272451799215,
"learning_rate": 7.967106447028455e-06,
"loss": 0.6865,
"step": 44
},
{
"epoch": 0.07127301524450604,
"grad_norm": 1.736621303615563,
"learning_rate": 7.964426318424854e-06,
"loss": 0.6554,
"step": 45
},
{
"epoch": 0.07285686002771728,
"grad_norm": 1.7271619263651752,
"learning_rate": 7.96164172350082e-06,
"loss": 0.6234,
"step": 46
},
{
"epoch": 0.07444070481092853,
"grad_norm": 1.8475249444720505,
"learning_rate": 7.958752735633022e-06,
"loss": 0.6844,
"step": 47
},
{
"epoch": 0.07602454959413978,
"grad_norm": 1.5407374760971033,
"learning_rate": 7.955759430948973e-06,
"loss": 0.5816,
"step": 48
},
{
"epoch": 0.07760839437735102,
"grad_norm": 1.6776551127492765,
"learning_rate": 7.952661888325037e-06,
"loss": 0.5999,
"step": 49
},
{
"epoch": 0.07919223916056227,
"grad_norm": 1.674394955432216,
"learning_rate": 7.949460189384344e-06,
"loss": 0.6809,
"step": 50
},
{
"epoch": 0.08077608394377352,
"grad_norm": 1.71808445149699,
"learning_rate": 7.946154418494638e-06,
"loss": 0.6433,
"step": 51
},
{
"epoch": 0.08235992872698475,
"grad_norm": 1.5598421305012693,
"learning_rate": 7.942744662766056e-06,
"loss": 0.6166,
"step": 52
},
{
"epoch": 0.083943773510196,
"grad_norm": 2.134433653759547,
"learning_rate": 7.939231012048832e-06,
"loss": 0.6213,
"step": 53
},
{
"epoch": 0.08552761829340724,
"grad_norm": 1.513738891290282,
"learning_rate": 7.935613558930931e-06,
"loss": 0.5797,
"step": 54
},
{
"epoch": 0.08711146307661849,
"grad_norm": 1.8993782752151005,
"learning_rate": 7.931892398735607e-06,
"loss": 0.6062,
"step": 55
},
{
"epoch": 0.08869530785982974,
"grad_norm": 1.6362105443395145,
"learning_rate": 7.92806762951889e-06,
"loss": 0.6701,
"step": 56
},
{
"epoch": 0.09027915264304098,
"grad_norm": 1.6152582877305293,
"learning_rate": 7.92413935206701e-06,
"loss": 0.5999,
"step": 57
},
{
"epoch": 0.09186299742625223,
"grad_norm": 1.5796190225435545,
"learning_rate": 7.920107669893728e-06,
"loss": 0.6414,
"step": 58
},
{
"epoch": 0.09344684220946348,
"grad_norm": 1.5796707579391902,
"learning_rate": 7.915972689237618e-06,
"loss": 0.6168,
"step": 59
},
{
"epoch": 0.09503068699267472,
"grad_norm": 1.4556074881511978,
"learning_rate": 7.911734519059266e-06,
"loss": 0.6623,
"step": 60
},
{
"epoch": 0.09661453177588597,
"grad_norm": 1.5078600739444816,
"learning_rate": 7.907393271038402e-06,
"loss": 0.6546,
"step": 61
},
{
"epoch": 0.0981983765590972,
"grad_norm": 1.590183144064633,
"learning_rate": 7.902949059570945e-06,
"loss": 0.6175,
"step": 62
},
{
"epoch": 0.09978222134230845,
"grad_norm": 1.665920028614456,
"learning_rate": 7.898402001766002e-06,
"loss": 0.6758,
"step": 63
},
{
"epoch": 0.1013660661255197,
"grad_norm": 1.6053295719352823,
"learning_rate": 7.89375221744277e-06,
"loss": 0.5228,
"step": 64
},
{
"epoch": 0.10294991090873094,
"grad_norm": 1.538829090522207,
"learning_rate": 7.888999829127398e-06,
"loss": 0.5839,
"step": 65
},
{
"epoch": 0.10453375569194219,
"grad_norm": 1.4764506284200576,
"learning_rate": 7.884144962049733e-06,
"loss": 0.5371,
"step": 66
},
{
"epoch": 0.10611760047515344,
"grad_norm": 1.7787882173817327,
"learning_rate": 7.879187744140039e-06,
"loss": 0.5687,
"step": 67
},
{
"epoch": 0.10770144525836468,
"grad_norm": 1.6757502009522847,
"learning_rate": 7.874128306025616e-06,
"loss": 0.6602,
"step": 68
},
{
"epoch": 0.10928529004157593,
"grad_norm": 1.5384088321644194,
"learning_rate": 7.868966781027365e-06,
"loss": 0.5848,
"step": 69
},
{
"epoch": 0.11086913482478718,
"grad_norm": 1.3974387424168984,
"learning_rate": 7.863703305156273e-06,
"loss": 0.4492,
"step": 70
},
{
"epoch": 0.11245297960799841,
"grad_norm": 1.4838230417656662,
"learning_rate": 7.858338017109821e-06,
"loss": 0.5937,
"step": 71
},
{
"epoch": 0.11403682439120966,
"grad_norm": 1.4619003743991448,
"learning_rate": 7.852871058268338e-06,
"loss": 0.6154,
"step": 72
},
{
"epoch": 0.1156206691744209,
"grad_norm": 1.4943665339665302,
"learning_rate": 7.847302572691277e-06,
"loss": 0.6561,
"step": 73
},
{
"epoch": 0.11720451395763215,
"grad_norm": 1.4942484366322168,
"learning_rate": 7.841632707113408e-06,
"loss": 0.6133,
"step": 74
},
{
"epoch": 0.1187883587408434,
"grad_norm": 1.6703395416125706,
"learning_rate": 7.835861610940964e-06,
"loss": 0.5878,
"step": 75
},
{
"epoch": 0.12037220352405464,
"grad_norm": 1.5664444643827788,
"learning_rate": 7.829989436247697e-06,
"loss": 0.7375,
"step": 76
},
{
"epoch": 0.12195604830726589,
"grad_norm": 1.4638878839717737,
"learning_rate": 7.824016337770871e-06,
"loss": 0.5211,
"step": 77
},
{
"epoch": 0.12353989309047714,
"grad_norm": 1.787375030347028,
"learning_rate": 7.817942472907183e-06,
"loss": 0.5647,
"step": 78
},
{
"epoch": 0.12512373787368838,
"grad_norm": 1.884380332078967,
"learning_rate": 7.811768001708626e-06,
"loss": 0.6087,
"step": 79
},
{
"epoch": 0.12670758265689963,
"grad_norm": 1.5929236515264444,
"learning_rate": 7.805493086878254e-06,
"loss": 0.6642,
"step": 80
},
{
"epoch": 0.12829142744011088,
"grad_norm": 1.5772424009229602,
"learning_rate": 7.799117893765911e-06,
"loss": 0.5835,
"step": 81
},
{
"epoch": 0.12987527222332212,
"grad_norm": 1.7489491911949495,
"learning_rate": 7.792642590363864e-06,
"loss": 0.6714,
"step": 82
},
{
"epoch": 0.13145911700653337,
"grad_norm": 1.6319871292096388,
"learning_rate": 7.786067347302378e-06,
"loss": 0.6794,
"step": 83
},
{
"epoch": 0.13304296178974462,
"grad_norm": 1.4796412781963182,
"learning_rate": 7.779392337845224e-06,
"loss": 0.5173,
"step": 84
},
{
"epoch": 0.13462680657295584,
"grad_norm": 1.582366953055958,
"learning_rate": 7.772617737885109e-06,
"loss": 0.6008,
"step": 85
},
{
"epoch": 0.13621065135616708,
"grad_norm": 1.4986337444583717,
"learning_rate": 7.765743725939044e-06,
"loss": 0.6157,
"step": 86
},
{
"epoch": 0.13779449613937833,
"grad_norm": 1.577968697496146,
"learning_rate": 7.758770483143633e-06,
"loss": 0.5652,
"step": 87
},
{
"epoch": 0.13937834092258958,
"grad_norm": 1.6508937367068126,
"learning_rate": 7.751698193250313e-06,
"loss": 0.5759,
"step": 88
},
{
"epoch": 0.14096218570580082,
"grad_norm": 1.7727311004928359,
"learning_rate": 7.744527042620495e-06,
"loss": 0.621,
"step": 89
},
{
"epoch": 0.14254603048901207,
"grad_norm": 1.5183748104433379,
"learning_rate": 7.737257220220672e-06,
"loss": 0.6053,
"step": 90
},
{
"epoch": 0.14412987527222332,
"grad_norm": 1.5930246968878465,
"learning_rate": 7.729888917617423e-06,
"loss": 0.5267,
"step": 91
},
{
"epoch": 0.14571372005543456,
"grad_norm": 1.5445708944187808,
"learning_rate": 7.722422328972375e-06,
"loss": 0.5988,
"step": 92
},
{
"epoch": 0.1472975648386458,
"grad_norm": 1.644950232043727,
"learning_rate": 7.71485765103708e-06,
"loss": 0.523,
"step": 93
},
{
"epoch": 0.14888140962185706,
"grad_norm": 1.582899153506888,
"learning_rate": 7.707195083147842e-06,
"loss": 0.5703,
"step": 94
},
{
"epoch": 0.1504652544050683,
"grad_norm": 1.5412622272946352,
"learning_rate": 7.699434827220446e-06,
"loss": 0.6049,
"step": 95
},
{
"epoch": 0.15204909918827955,
"grad_norm": 1.3877844675757782,
"learning_rate": 7.691577087744858e-06,
"loss": 0.6088,
"step": 96
},
{
"epoch": 0.1536329439714908,
"grad_norm": 1.7758770361443341,
"learning_rate": 7.683622071779814e-06,
"loss": 0.5779,
"step": 97
},
{
"epoch": 0.15521678875470205,
"grad_norm": 1.727972351073385,
"learning_rate": 7.675569988947388e-06,
"loss": 0.6189,
"step": 98
},
{
"epoch": 0.1568006335379133,
"grad_norm": 1.726280844012036,
"learning_rate": 7.66742105142745e-06,
"loss": 0.6134,
"step": 99
},
{
"epoch": 0.15838447832112454,
"grad_norm": 1.478777494556154,
"learning_rate": 7.659175473952084e-06,
"loss": 0.5614,
"step": 100
},
{
"epoch": 0.15996832310433579,
"grad_norm": 1.5790584926585527,
"learning_rate": 7.65083347379992e-06,
"loss": 0.659,
"step": 101
},
{
"epoch": 0.16155216788754703,
"grad_norm": 1.5037297538511123,
"learning_rate": 7.642395270790426e-06,
"loss": 0.4981,
"step": 102
},
{
"epoch": 0.16313601267075828,
"grad_norm": 1.5590318704111625,
"learning_rate": 7.633861087278093e-06,
"loss": 0.5807,
"step": 103
},
{
"epoch": 0.1647198574539695,
"grad_norm": 1.4332619998935698,
"learning_rate": 7.6252311481465996e-06,
"loss": 0.6309,
"step": 104
},
{
"epoch": 0.16630370223718074,
"grad_norm": 1.4427352973548397,
"learning_rate": 7.616505680802863e-06,
"loss": 0.6623,
"step": 105
},
{
"epoch": 0.167887547020392,
"grad_norm": 1.6827061244824466,
"learning_rate": 7.607684915171065e-06,
"loss": 0.6589,
"step": 106
},
{
"epoch": 0.16947139180360324,
"grad_norm": 1.5663175326090033,
"learning_rate": 7.598769083686582e-06,
"loss": 0.627,
"step": 107
},
{
"epoch": 0.17105523658681449,
"grad_norm": 1.423604206610221,
"learning_rate": 7.589758421289864e-06,
"loss": 0.6335,
"step": 108
},
{
"epoch": 0.17263908137002573,
"grad_norm": 1.4675532083940528,
"learning_rate": 7.58065316542025e-06,
"loss": 0.59,
"step": 109
},
{
"epoch": 0.17422292615323698,
"grad_norm": 1.8233813273626727,
"learning_rate": 7.571453556009695e-06,
"loss": 0.5213,
"step": 110
},
{
"epoch": 0.17580677093644823,
"grad_norm": 1.8255857113037794,
"learning_rate": 7.562159835476465e-06,
"loss": 0.6255,
"step": 111
},
{
"epoch": 0.17739061571965947,
"grad_norm": 1.4953012811749822,
"learning_rate": 7.552772248718739e-06,
"loss": 0.6206,
"step": 112
},
{
"epoch": 0.17897446050287072,
"grad_norm": 1.56255118528589,
"learning_rate": 7.5432910431081586e-06,
"loss": 0.5783,
"step": 113
},
{
"epoch": 0.18055830528608197,
"grad_norm": 1.664536250355845,
"learning_rate": 7.533716468483311e-06,
"loss": 0.6409,
"step": 114
},
{
"epoch": 0.1821421500692932,
"grad_norm": 1.4958558339075811,
"learning_rate": 7.524048777143137e-06,
"loss": 0.569,
"step": 115
},
{
"epoch": 0.18372599485250446,
"grad_norm": 1.4039476439318943,
"learning_rate": 7.5142882238403e-06,
"loss": 0.6021,
"step": 116
},
{
"epoch": 0.1853098396357157,
"grad_norm": 1.6306064613151194,
"learning_rate": 7.504435065774454e-06,
"loss": 0.7385,
"step": 117
},
{
"epoch": 0.18689368441892695,
"grad_norm": 1.668079082925941,
"learning_rate": 7.494489562585478e-06,
"loss": 0.5724,
"step": 118
},
{
"epoch": 0.1884775292021382,
"grad_norm": 1.434447147714598,
"learning_rate": 7.48445197634663e-06,
"loss": 0.6092,
"step": 119
},
{
"epoch": 0.19006137398534945,
"grad_norm": 1.984109033026855,
"learning_rate": 7.474322571557644e-06,
"loss": 0.6691,
"step": 120
},
{
"epoch": 0.1916452187685607,
"grad_norm": 1.5769599681060413,
"learning_rate": 7.4641016151377545e-06,
"loss": 0.6061,
"step": 121
},
{
"epoch": 0.19322906355177194,
"grad_norm": 1.384122268885966,
"learning_rate": 7.45378937641867e-06,
"loss": 0.5634,
"step": 122
},
{
"epoch": 0.19481290833498316,
"grad_norm": 1.498669953718918,
"learning_rate": 7.44338612713747e-06,
"loss": 0.5627,
"step": 123
},
{
"epoch": 0.1963967531181944,
"grad_norm": 1.4288041695671259,
"learning_rate": 7.43289214142945e-06,
"loss": 0.5726,
"step": 124
},
{
"epoch": 0.19798059790140565,
"grad_norm": 1.5646983954502622,
"learning_rate": 7.422307695820892e-06,
"loss": 0.623,
"step": 125
},
{
"epoch": 0.1995644426846169,
"grad_norm": 1.7202467615332813,
"learning_rate": 7.411633069221782e-06,
"loss": 0.6123,
"step": 126
},
{
"epoch": 0.20114828746782815,
"grad_norm": 1.767901675287711,
"learning_rate": 7.400868542918457e-06,
"loss": 0.5208,
"step": 127
},
{
"epoch": 0.2027321322510394,
"grad_norm": 1.5246478568278783,
"learning_rate": 7.390014400566196e-06,
"loss": 0.5708,
"step": 128
},
{
"epoch": 0.20431597703425064,
"grad_norm": 1.6477524599468607,
"learning_rate": 7.379070928181746e-06,
"loss": 0.5288,
"step": 129
},
{
"epoch": 0.2058998218174619,
"grad_norm": 1.4399111543584902,
"learning_rate": 7.3680384141357805e-06,
"loss": 0.5898,
"step": 130
},
{
"epoch": 0.20748366660067313,
"grad_norm": 1.5830239221298632,
"learning_rate": 7.356917149145307e-06,
"loss": 0.5797,
"step": 131
},
{
"epoch": 0.20906751138388438,
"grad_norm": 1.403523262072001,
"learning_rate": 7.3457074262659974e-06,
"loss": 0.5581,
"step": 132
},
{
"epoch": 0.21065135616709563,
"grad_norm": 1.7026128256384587,
"learning_rate": 7.334409540884478e-06,
"loss": 0.5859,
"step": 133
},
{
"epoch": 0.21223520095030687,
"grad_norm": 1.4478577966897537,
"learning_rate": 7.323023790710534e-06,
"loss": 0.6038,
"step": 134
},
{
"epoch": 0.21381904573351812,
"grad_norm": 1.5371200095544344,
"learning_rate": 7.3115504757692715e-06,
"loss": 0.5528,
"step": 135
},
{
"epoch": 0.21540289051672937,
"grad_norm": 1.4893093530093662,
"learning_rate": 7.299989898393209e-06,
"loss": 0.6717,
"step": 136
},
{
"epoch": 0.21698673529994061,
"grad_norm": 1.3874990168288865,
"learning_rate": 7.288342363214313e-06,
"loss": 0.5586,
"step": 137
},
{
"epoch": 0.21857058008315186,
"grad_norm": 1.5863006868461444,
"learning_rate": 7.276608177155967e-06,
"loss": 0.4951,
"step": 138
},
{
"epoch": 0.2201544248663631,
"grad_norm": 1.489868624576753,
"learning_rate": 7.264787649424887e-06,
"loss": 0.5833,
"step": 139
},
{
"epoch": 0.22173826964957435,
"grad_norm": 1.3377913662186058,
"learning_rate": 7.2528810915029705e-06,
"loss": 0.6079,
"step": 140
},
{
"epoch": 0.22332211443278557,
"grad_norm": 1.4551371038570358,
"learning_rate": 7.240888817139094e-06,
"loss": 0.629,
"step": 141
},
{
"epoch": 0.22490595921599682,
"grad_norm": 1.4800684134742632,
"learning_rate": 7.228811142340838e-06,
"loss": 0.5218,
"step": 142
},
{
"epoch": 0.22648980399920807,
"grad_norm": 1.655894288397452,
"learning_rate": 7.2166483853661666e-06,
"loss": 0.5851,
"step": 143
},
{
"epoch": 0.2280736487824193,
"grad_norm": 1.5072103166941109,
"learning_rate": 7.204400866715038e-06,
"loss": 0.5484,
"step": 144
},
{
"epoch": 0.22965749356563056,
"grad_norm": 1.5118758556170837,
"learning_rate": 7.192068909120959e-06,
"loss": 0.6607,
"step": 145
},
{
"epoch": 0.2312413383488418,
"grad_norm": 1.6979615740477183,
"learning_rate": 7.179652837542479e-06,
"loss": 0.6278,
"step": 146
},
{
"epoch": 0.23282518313205305,
"grad_norm": 1.4816588827641894,
"learning_rate": 7.167152979154632e-06,
"loss": 0.5747,
"step": 147
},
{
"epoch": 0.2344090279152643,
"grad_norm": 1.477778782495537,
"learning_rate": 7.154569663340312e-06,
"loss": 0.6037,
"step": 148
},
{
"epoch": 0.23599287269847555,
"grad_norm": 1.524987409114406,
"learning_rate": 7.141903221681595e-06,
"loss": 0.5202,
"step": 149
},
{
"epoch": 0.2375767174816868,
"grad_norm": 1.4531762635508665,
"learning_rate": 7.1291539879509956e-06,
"loss": 0.6053,
"step": 150
},
{
"epoch": 0.23916056226489804,
"grad_norm": 1.6848330098071658,
"learning_rate": 7.116322298102681e-06,
"loss": 0.5205,
"step": 151
},
{
"epoch": 0.2407444070481093,
"grad_norm": 1.53168338669591,
"learning_rate": 7.1034084902636125e-06,
"loss": 0.5588,
"step": 152
},
{
"epoch": 0.24232825183132053,
"grad_norm": 1.7084343204301633,
"learning_rate": 7.090412904724635e-06,
"loss": 0.6402,
"step": 153
},
{
"epoch": 0.24391209661453178,
"grad_norm": 1.8530857047977658,
"learning_rate": 7.077335883931516e-06,
"loss": 0.5897,
"step": 154
},
{
"epoch": 0.24549594139774303,
"grad_norm": 1.5361296146734253,
"learning_rate": 7.064177772475912e-06,
"loss": 0.542,
"step": 155
},
{
"epoch": 0.24707978618095428,
"grad_norm": 1.6919147409675659,
"learning_rate": 7.050938917086298e-06,
"loss": 0.6055,
"step": 156
},
{
"epoch": 0.24866363096416552,
"grad_norm": 1.5618314491824075,
"learning_rate": 7.037619666618829e-06,
"loss": 0.535,
"step": 157
},
{
"epoch": 0.25024747574737677,
"grad_norm": 1.4872965507690663,
"learning_rate": 7.024220372048137e-06,
"loss": 0.5813,
"step": 158
},
{
"epoch": 0.251831320530588,
"grad_norm": 1.6363093857295996,
"learning_rate": 7.010741386458098e-06,
"loss": 0.4529,
"step": 159
},
{
"epoch": 0.25341516531379926,
"grad_norm": 2.056313824339786,
"learning_rate": 6.997183065032517e-06,
"loss": 0.5332,
"step": 160
},
{
"epoch": 0.2549990100970105,
"grad_norm": 1.5062280307878337,
"learning_rate": 6.983545765045774e-06,
"loss": 0.5586,
"step": 161
},
{
"epoch": 0.25658285488022176,
"grad_norm": 1.7057261325931998,
"learning_rate": 6.969829845853404e-06,
"loss": 0.5615,
"step": 162
},
{
"epoch": 0.258166699663433,
"grad_norm": 1.5223711914805576,
"learning_rate": 6.956035668882636e-06,
"loss": 0.5553,
"step": 163
},
{
"epoch": 0.25975054444664425,
"grad_norm": 1.7269587442848504,
"learning_rate": 6.942163597622862e-06,
"loss": 0.5982,
"step": 164
},
{
"epoch": 0.26133438922985547,
"grad_norm": 1.5775708792423282,
"learning_rate": 6.928213997616058e-06,
"loss": 0.5816,
"step": 165
},
{
"epoch": 0.26291823401306674,
"grad_norm": 1.7646652820246194,
"learning_rate": 6.914187236447161e-06,
"loss": 0.5582,
"step": 166
},
{
"epoch": 0.26450207879627796,
"grad_norm": 1.5836559806147648,
"learning_rate": 6.90008368373437e-06,
"loss": 0.5268,
"step": 167
},
{
"epoch": 0.26608592357948924,
"grad_norm": 1.4526060644749479,
"learning_rate": 6.885903711119417e-06,
"loss": 0.5842,
"step": 168
},
{
"epoch": 0.26766976836270046,
"grad_norm": 1.469565911234753,
"learning_rate": 6.8716476922577676e-06,
"loss": 0.5691,
"step": 169
},
{
"epoch": 0.2692536131459117,
"grad_norm": 1.6734917590165517,
"learning_rate": 6.857316002808776e-06,
"loss": 0.4855,
"step": 170
},
{
"epoch": 0.27083745792912295,
"grad_norm": 1.5732695992638241,
"learning_rate": 6.8429090204257885e-06,
"loss": 0.6122,
"step": 171
},
{
"epoch": 0.27242130271233417,
"grad_norm": 1.4978871767664772,
"learning_rate": 6.82842712474619e-06,
"loss": 0.4655,
"step": 172
},
{
"epoch": 0.27400514749554544,
"grad_norm": 1.503094339164215,
"learning_rate": 6.8138706973813995e-06,
"loss": 0.6282,
"step": 173
},
{
"epoch": 0.27558899227875666,
"grad_norm": 2.1292899427160927,
"learning_rate": 6.799240121906814e-06,
"loss": 0.6792,
"step": 174
},
{
"epoch": 0.27717283706196794,
"grad_norm": 1.4647489604808317,
"learning_rate": 6.784535783851707e-06,
"loss": 0.644,
"step": 175
},
{
"epoch": 0.27875668184517916,
"grad_norm": 1.5780789456446127,
"learning_rate": 6.7697580706890585e-06,
"loss": 0.5134,
"step": 176
},
{
"epoch": 0.28034052662839043,
"grad_norm": 1.754003508034847,
"learning_rate": 6.754907371825354e-06,
"loss": 0.5424,
"step": 177
},
{
"epoch": 0.28192437141160165,
"grad_norm": 1.4864993829740512,
"learning_rate": 6.739984078590322e-06,
"loss": 0.4967,
"step": 178
},
{
"epoch": 0.2835082161948129,
"grad_norm": 1.4504899701696683,
"learning_rate": 6.724988584226616e-06,
"loss": 0.5067,
"step": 179
},
{
"epoch": 0.28509206097802414,
"grad_norm": 1.470233988558896,
"learning_rate": 6.70992128387946e-06,
"loss": 0.5655,
"step": 180
},
{
"epoch": 0.2866759057612354,
"grad_norm": 1.5181769899461575,
"learning_rate": 6.694782574586229e-06,
"loss": 0.5062,
"step": 181
},
{
"epoch": 0.28825975054444664,
"grad_norm": 1.4730340611570927,
"learning_rate": 6.679572855265992e-06,
"loss": 0.5855,
"step": 182
},
{
"epoch": 0.2898435953276579,
"grad_norm": 1.4595330624813205,
"learning_rate": 6.664292526709001e-06,
"loss": 0.4989,
"step": 183
},
{
"epoch": 0.29142744011086913,
"grad_norm": 1.323456475759639,
"learning_rate": 6.648941991566121e-06,
"loss": 0.5448,
"step": 184
},
{
"epoch": 0.2930112848940804,
"grad_norm": 1.6848076983535807,
"learning_rate": 6.633521654338231e-06,
"loss": 0.5494,
"step": 185
},
{
"epoch": 0.2945951296772916,
"grad_norm": 1.5461470851002552,
"learning_rate": 6.618031921365557e-06,
"loss": 0.5979,
"step": 186
},
{
"epoch": 0.2961789744605029,
"grad_norm": 1.4521318914121286,
"learning_rate": 6.602473200816968e-06,
"loss": 0.6329,
"step": 187
},
{
"epoch": 0.2977628192437141,
"grad_norm": 1.4482304443134255,
"learning_rate": 6.586845902679222e-06,
"loss": 0.5603,
"step": 188
},
{
"epoch": 0.29934666402692534,
"grad_norm": 1.729429057098183,
"learning_rate": 6.571150438746157e-06,
"loss": 0.5174,
"step": 189
},
{
"epoch": 0.3009305088101366,
"grad_norm": 1.4522833098019283,
"learning_rate": 6.555387222607845e-06,
"loss": 0.4707,
"step": 190
},
{
"epoch": 0.30251435359334783,
"grad_norm": 1.5694908204548417,
"learning_rate": 6.5395566696396914e-06,
"loss": 0.6268,
"step": 191
},
{
"epoch": 0.3040981983765591,
"grad_norm": 1.4125293528433154,
"learning_rate": 6.523659196991488e-06,
"loss": 0.4955,
"step": 192
},
{
"epoch": 0.3056820431597703,
"grad_norm": 1.5051250957949753,
"learning_rate": 6.507695223576427e-06,
"loss": 0.487,
"step": 193
},
{
"epoch": 0.3072658879429816,
"grad_norm": 1.4094675941552535,
"learning_rate": 6.491665170060049e-06,
"loss": 0.4969,
"step": 194
},
{
"epoch": 0.3088497327261928,
"grad_norm": 1.4384575522029888,
"learning_rate": 6.475569458849178e-06,
"loss": 0.5492,
"step": 195
},
{
"epoch": 0.3104335775094041,
"grad_norm": 1.7678769340548022,
"learning_rate": 6.45940851408077e-06,
"loss": 0.5836,
"step": 196
},
{
"epoch": 0.3120174222926153,
"grad_norm": 2.046226775911686,
"learning_rate": 6.4431827616107514e-06,
"loss": 0.5301,
"step": 197
},
{
"epoch": 0.3136012670758266,
"grad_norm": 1.4943535053844237,
"learning_rate": 6.426892629002788e-06,
"loss": 0.5501,
"step": 198
},
{
"epoch": 0.3151851118590378,
"grad_norm": 1.5251076879939447,
"learning_rate": 6.410538545517026e-06,
"loss": 0.5089,
"step": 199
},
{
"epoch": 0.3167689566422491,
"grad_norm": 1.8164625462077661,
"learning_rate": 6.394120942098772e-06,
"loss": 0.5319,
"step": 200
},
{
"epoch": 0.3183528014254603,
"grad_norm": 1.4408492309184018,
"learning_rate": 6.377640251367147e-06,
"loss": 0.4609,
"step": 201
},
{
"epoch": 0.31993664620867157,
"grad_norm": 1.4200405326569705,
"learning_rate": 6.361096907603678e-06,
"loss": 0.5396,
"step": 202
},
{
"epoch": 0.3215204909918828,
"grad_norm": 1.471578227709618,
"learning_rate": 6.344491346740859e-06,
"loss": 0.546,
"step": 203
},
{
"epoch": 0.32310433577509406,
"grad_norm": 1.4907309711106325,
"learning_rate": 6.3278240063506605e-06,
"loss": 0.5093,
"step": 204
},
{
"epoch": 0.3246881805583053,
"grad_norm": 1.3696992488542155,
"learning_rate": 6.311095325633005e-06,
"loss": 0.4799,
"step": 205
},
{
"epoch": 0.32627202534151656,
"grad_norm": 1.7334631553518818,
"learning_rate": 6.294305745404184e-06,
"loss": 0.5837,
"step": 206
},
{
"epoch": 0.3278558701247278,
"grad_norm": 1.9680923980600566,
"learning_rate": 6.277455708085254e-06,
"loss": 0.6013,
"step": 207
},
{
"epoch": 0.329439714907939,
"grad_norm": 1.575234585905169,
"learning_rate": 6.260545657690367e-06,
"loss": 0.5846,
"step": 208
},
{
"epoch": 0.33102355969115027,
"grad_norm": 1.5428445148256473,
"learning_rate": 6.243576039815079e-06,
"loss": 0.4724,
"step": 209
},
{
"epoch": 0.3326074044743615,
"grad_norm": 1.5051331668280472,
"learning_rate": 6.226547301624601e-06,
"loss": 0.5778,
"step": 210
},
{
"epoch": 0.3326074044743615,
"eval_accuracy": 0.8062780751393809,
"eval_loss": 0.6072185039520264,
"eval_perplexity": 1.2109892812795882,
"eval_runtime": 533.963,
"eval_samples_per_second": 1.425,
"eval_steps_per_second": 1.425,
"step": 210
},
{
"epoch": 0.33419124925757276,
"grad_norm": 1.8036724779367772,
"learning_rate": 6.209459891842023e-06,
"loss": 0.6231,
"step": 211
},
{
"epoch": 0.335775094040784,
"grad_norm": 1.622010722669537,
"learning_rate": 6.192314260736483e-06,
"loss": 0.4884,
"step": 212
},
{
"epoch": 0.33735893882399526,
"grad_norm": 1.4671354082285062,
"learning_rate": 6.1751108601113065e-06,
"loss": 0.4331,
"step": 213
},
{
"epoch": 0.3389427836072065,
"grad_norm": 1.3462750015093699,
"learning_rate": 6.157850143292099e-06,
"loss": 0.5651,
"step": 214
},
{
"epoch": 0.34052662839041775,
"grad_norm": 4.450327389512898,
"learning_rate": 6.140532565114801e-06,
"loss": 0.5063,
"step": 215
},
{
"epoch": 0.34211047317362897,
"grad_norm": 1.4358337913572403,
"learning_rate": 6.123158581913703e-06,
"loss": 0.5133,
"step": 216
},
{
"epoch": 0.34369431795684025,
"grad_norm": 1.5296654139200274,
"learning_rate": 6.105728651509423e-06,
"loss": 0.5617,
"step": 217
},
{
"epoch": 0.34527816274005146,
"grad_norm": 1.7233319268867215,
"learning_rate": 6.088243233196833e-06,
"loss": 0.578,
"step": 218
},
{
"epoch": 0.34686200752326274,
"grad_norm": 1.3545296291875666,
"learning_rate": 6.07070278773297e-06,
"loss": 0.6077,
"step": 219
},
{
"epoch": 0.34844585230647396,
"grad_norm": 1.4071408386524482,
"learning_rate": 6.053107777324882e-06,
"loss": 0.4709,
"step": 220
},
{
"epoch": 0.35002969708968523,
"grad_norm": 1.625931561657156,
"learning_rate": 6.0354586656174594e-06,
"loss": 0.5402,
"step": 221
},
{
"epoch": 0.35161354187289645,
"grad_norm": 1.4770799573957036,
"learning_rate": 6.017755917681208e-06,
"loss": 0.5878,
"step": 222
},
{
"epoch": 0.3531973866561077,
"grad_norm": 1.371967202631883,
"learning_rate": 6e-06,
"loss": 0.5646,
"step": 223
},
{
"epoch": 0.35478123143931894,
"grad_norm": 1.670286600590674,
"learning_rate": 5.982191380458779e-06,
"loss": 0.5459,
"step": 224
},
{
"epoch": 0.3563650762225302,
"grad_norm": 1.657503866936318,
"learning_rate": 5.964330528331233e-06,
"loss": 0.6107,
"step": 225
},
{
"epoch": 0.35794892100574144,
"grad_norm": 1.437798148422675,
"learning_rate": 5.946417914267424e-06,
"loss": 0.5283,
"step": 226
},
{
"epoch": 0.35953276578895266,
"grad_norm": 1.5841028963525154,
"learning_rate": 5.928454010281395e-06,
"loss": 0.5566,
"step": 227
},
{
"epoch": 0.36111661057216393,
"grad_norm": 2.912275328636749,
"learning_rate": 5.91043928973872e-06,
"loss": 0.5524,
"step": 228
},
{
"epoch": 0.36270045535537515,
"grad_norm": 1.39224219742789,
"learning_rate": 5.8923742273440405e-06,
"loss": 0.5018,
"step": 229
},
{
"epoch": 0.3642843001385864,
"grad_norm": 1.531343821785256,
"learning_rate": 5.87425929912855e-06,
"loss": 0.5498,
"step": 230
},
{
"epoch": 0.36586814492179764,
"grad_norm": 1.6907756868854458,
"learning_rate": 5.856094982437453e-06,
"loss": 0.6188,
"step": 231
},
{
"epoch": 0.3674519897050089,
"grad_norm": 1.8925539993976666,
"learning_rate": 5.83788175591739e-06,
"loss": 0.6473,
"step": 232
},
{
"epoch": 0.36903583448822014,
"grad_norm": 1.3770006280258493,
"learning_rate": 5.819620099503818e-06,
"loss": 0.4686,
"step": 233
},
{
"epoch": 0.3706196792714314,
"grad_norm": 1.5431315768010794,
"learning_rate": 5.801310494408365e-06,
"loss": 0.5691,
"step": 234
},
{
"epoch": 0.37220352405464263,
"grad_norm": 1.6551290914393502,
"learning_rate": 5.782953423106153e-06,
"loss": 0.5874,
"step": 235
},
{
"epoch": 0.3737873688378539,
"grad_norm": 1.9002502737234077,
"learning_rate": 5.764549369323084e-06,
"loss": 0.5529,
"step": 236
},
{
"epoch": 0.3753712136210651,
"grad_norm": 1.686985111412407,
"learning_rate": 5.746098818023092e-06,
"loss": 0.5603,
"step": 237
},
{
"epoch": 0.3769550584042764,
"grad_norm": 1.3657672575822837,
"learning_rate": 5.727602255395364e-06,
"loss": 0.5568,
"step": 238
},
{
"epoch": 0.3785389031874876,
"grad_norm": 1.9695232006427876,
"learning_rate": 5.7090601688415235e-06,
"loss": 0.5658,
"step": 239
},
{
"epoch": 0.3801227479706989,
"grad_norm": 1.5461356891055895,
"learning_rate": 5.690473046962798e-06,
"loss": 0.4673,
"step": 240
},
{
"epoch": 0.3817065927539101,
"grad_norm": 2.3698398593391965,
"learning_rate": 5.671841379547133e-06,
"loss": 0.5763,
"step": 241
},
{
"epoch": 0.3832904375371214,
"grad_norm": 1.6233296254529663,
"learning_rate": 5.6531656575562954e-06,
"loss": 0.4775,
"step": 242
},
{
"epoch": 0.3848742823203326,
"grad_norm": 1.5057753839519041,
"learning_rate": 5.634446373112926e-06,
"loss": 0.5759,
"step": 243
},
{
"epoch": 0.3864581271035439,
"grad_norm": 1.569673186402757,
"learning_rate": 5.615684019487579e-06,
"loss": 0.542,
"step": 244
},
{
"epoch": 0.3880419718867551,
"grad_norm": 1.4845831954568514,
"learning_rate": 5.596879091085723e-06,
"loss": 0.4803,
"step": 245
},
{
"epoch": 0.3896258166699663,
"grad_norm": 1.8705825155512474,
"learning_rate": 5.57803208343471e-06,
"loss": 0.5017,
"step": 246
},
{
"epoch": 0.3912096614531776,
"grad_norm": 1.5197093986457457,
"learning_rate": 5.559143493170717e-06,
"loss": 0.541,
"step": 247
},
{
"epoch": 0.3927935062363888,
"grad_norm": 1.5301810316128006,
"learning_rate": 5.540213818025666e-06,
"loss": 0.5427,
"step": 248
},
{
"epoch": 0.3943773510196001,
"grad_norm": 1.8516941433643899,
"learning_rate": 5.5212435568141035e-06,
"loss": 0.5974,
"step": 249
},
{
"epoch": 0.3959611958028113,
"grad_norm": 1.3334174668528216,
"learning_rate": 5.5022332094200505e-06,
"loss": 0.5429,
"step": 250
},
{
"epoch": 0.3975450405860226,
"grad_norm": 1.4702275400288973,
"learning_rate": 5.483183276783843e-06,
"loss": 0.5766,
"step": 251
},
{
"epoch": 0.3991288853692338,
"grad_norm": 1.5459152423352713,
"learning_rate": 5.464094260888924e-06,
"loss": 0.5527,
"step": 252
},
{
"epoch": 0.4007127301524451,
"grad_norm": 1.8640878576069286,
"learning_rate": 5.4449666647486125e-06,
"loss": 0.6205,
"step": 253
},
{
"epoch": 0.4022965749356563,
"grad_norm": 1.9092864583223532,
"learning_rate": 5.425800992392856e-06,
"loss": 0.548,
"step": 254
},
{
"epoch": 0.40388041971886757,
"grad_norm": 1.5276995541859555,
"learning_rate": 5.406597748854947e-06,
"loss": 0.5498,
"step": 255
},
{
"epoch": 0.4054642645020788,
"grad_norm": 1.4771875777008228,
"learning_rate": 5.38735744015821e-06,
"loss": 0.4411,
"step": 256
},
{
"epoch": 0.40704810928529006,
"grad_norm": 1.637711951004362,
"learning_rate": 5.368080573302675e-06,
"loss": 0.5537,
"step": 257
},
{
"epoch": 0.4086319540685013,
"grad_norm": 1.606555141779956,
"learning_rate": 5.348767656251709e-06,
"loss": 0.558,
"step": 258
},
{
"epoch": 0.41021579885171255,
"grad_norm": 1.4771059602395231,
"learning_rate": 5.329419197918638e-06,
"loss": 0.4915,
"step": 259
},
{
"epoch": 0.4117996436349238,
"grad_norm": 1.5943720324164687,
"learning_rate": 5.310035708153335e-06,
"loss": 0.583,
"step": 260
},
{
"epoch": 0.41338348841813505,
"grad_norm": 1.666964957956596,
"learning_rate": 5.2906176977287795e-06,
"loss": 0.5493,
"step": 261
},
{
"epoch": 0.41496733320134627,
"grad_norm": 1.4711154343447124,
"learning_rate": 5.271165678327606e-06,
"loss": 0.5519,
"step": 262
},
{
"epoch": 0.4165511779845575,
"grad_norm": 1.5995792262282518,
"learning_rate": 5.251680162528617e-06,
"loss": 0.5377,
"step": 263
},
{
"epoch": 0.41813502276776876,
"grad_norm": 1.4526790782720835,
"learning_rate": 5.232161663793275e-06,
"loss": 0.5335,
"step": 264
},
{
"epoch": 0.41971886755098,
"grad_norm": 1.5478646380088532,
"learning_rate": 5.212610696452174e-06,
"loss": 0.6434,
"step": 265
},
{
"epoch": 0.42130271233419125,
"grad_norm": 1.4956181853598964,
"learning_rate": 5.193027775691485e-06,
"loss": 0.498,
"step": 266
},
{
"epoch": 0.4228865571174025,
"grad_norm": 1.4974980436315144,
"learning_rate": 5.173413417539384e-06,
"loss": 0.6171,
"step": 267
},
{
"epoch": 0.42447040190061375,
"grad_norm": 1.466644696421707,
"learning_rate": 5.153768138852449e-06,
"loss": 0.501,
"step": 268
},
{
"epoch": 0.42605424668382497,
"grad_norm": 1.627652008126963,
"learning_rate": 5.134092457302043e-06,
"loss": 0.6258,
"step": 269
},
{
"epoch": 0.42763809146703624,
"grad_norm": 1.648118892712504,
"learning_rate": 5.114386891360675e-06,
"loss": 0.5565,
"step": 270
},
{
"epoch": 0.42922193625024746,
"grad_norm": 1.848328087802553,
"learning_rate": 5.094651960288332e-06,
"loss": 0.5803,
"step": 271
},
{
"epoch": 0.43080578103345873,
"grad_norm": 1.3558392557511212,
"learning_rate": 5.074888184118801e-06,
"loss": 0.4598,
"step": 272
},
{
"epoch": 0.43238962581666995,
"grad_norm": 1.4376325696738284,
"learning_rate": 5.055096083645967e-06,
"loss": 0.5144,
"step": 273
},
{
"epoch": 0.43397347059988123,
"grad_norm": 2.013114345583861,
"learning_rate": 5.035276180410083e-06,
"loss": 0.5365,
"step": 274
},
{
"epoch": 0.43555731538309245,
"grad_norm": 1.7720963379244992,
"learning_rate": 5.015428996684031e-06,
"loss": 0.5965,
"step": 275
},
{
"epoch": 0.4371411601663037,
"grad_norm": 1.5956864806503275,
"learning_rate": 4.995555055459562e-06,
"loss": 0.5399,
"step": 276
},
{
"epoch": 0.43872500494951494,
"grad_norm": 1.705695286576096,
"learning_rate": 4.975654880433508e-06,
"loss": 0.5492,
"step": 277
},
{
"epoch": 0.4403088497327262,
"grad_norm": 1.7149353403443588,
"learning_rate": 4.95572899599399e-06,
"loss": 0.6011,
"step": 278
},
{
"epoch": 0.44189269451593743,
"grad_norm": 1.327638735973327,
"learning_rate": 4.935777927206595e-06,
"loss": 0.4993,
"step": 279
},
{
"epoch": 0.4434765392991487,
"grad_norm": 1.5644224950432588,
"learning_rate": 4.915802199800536e-06,
"loss": 0.595,
"step": 280
},
{
"epoch": 0.44506038408235993,
"grad_norm": 1.517553789603623,
"learning_rate": 4.8958023401548124e-06,
"loss": 0.5383,
"step": 281
},
{
"epoch": 0.44664422886557115,
"grad_norm": 1.7980504027520112,
"learning_rate": 4.875778875284322e-06,
"loss": 0.486,
"step": 282
},
{
"epoch": 0.4482280736487824,
"grad_norm": 1.618198969176812,
"learning_rate": 4.855732332825989e-06,
"loss": 0.5041,
"step": 283
},
{
"epoch": 0.44981191843199364,
"grad_norm": 1.6074207482159366,
"learning_rate": 4.8356632410248495e-06,
"loss": 0.5225,
"step": 284
},
{
"epoch": 0.4513957632152049,
"grad_norm": 1.3840009026279627,
"learning_rate": 4.815572128720138e-06,
"loss": 0.4984,
"step": 285
},
{
"epoch": 0.45297960799841613,
"grad_norm": 2.0549483079657644,
"learning_rate": 4.795459525331346e-06,
"loss": 0.7242,
"step": 286
},
{
"epoch": 0.4545634527816274,
"grad_norm": 1.4634854048670438,
"learning_rate": 4.77532596084428e-06,
"loss": 0.5575,
"step": 287
},
{
"epoch": 0.4561472975648386,
"grad_norm": 1.4194408672102914,
"learning_rate": 4.755171965797087e-06,
"loss": 0.5493,
"step": 288
},
{
"epoch": 0.4577311423480499,
"grad_norm": 1.6579210683410825,
"learning_rate": 4.734998071266282e-06,
"loss": 0.4842,
"step": 289
},
{
"epoch": 0.4593149871312611,
"grad_norm": 2.2481565569431003,
"learning_rate": 4.714804808852744e-06,
"loss": 0.5556,
"step": 290
},
{
"epoch": 0.4608988319144724,
"grad_norm": 1.7503586240205726,
"learning_rate": 4.694592710667722e-06,
"loss": 0.5027,
"step": 291
},
{
"epoch": 0.4624826766976836,
"grad_norm": 1.5013077261326937,
"learning_rate": 4.674362309318796e-06,
"loss": 0.5387,
"step": 292
},
{
"epoch": 0.4640665214808949,
"grad_norm": 1.5044654112516254,
"learning_rate": 4.65411413789586e-06,
"loss": 0.4191,
"step": 293
},
{
"epoch": 0.4656503662641061,
"grad_norm": 1.4661324959060025,
"learning_rate": 4.6338487299570605e-06,
"loss": 0.5883,
"step": 294
},
{
"epoch": 0.4672342110473174,
"grad_norm": 1.7266562955889597,
"learning_rate": 4.613566619514742e-06,
"loss": 0.6532,
"step": 295
},
{
"epoch": 0.4688180558305286,
"grad_norm": 1.4130438958548992,
"learning_rate": 4.593268341021378e-06,
"loss": 0.5274,
"step": 296
},
{
"epoch": 0.4704019006137399,
"grad_norm": 1.4628610094368864,
"learning_rate": 4.572954429355486e-06,
"loss": 0.4546,
"step": 297
},
{
"epoch": 0.4719857453969511,
"grad_norm": 2.6067716280602493,
"learning_rate": 4.552625419807529e-06,
"loss": 0.5247,
"step": 298
},
{
"epoch": 0.47356959018016237,
"grad_norm": 1.5391344972032668,
"learning_rate": 4.532281848065815e-06,
"loss": 0.5648,
"step": 299
},
{
"epoch": 0.4751534349633736,
"grad_norm": 1.4085395552662021,
"learning_rate": 4.5119242502023795e-06,
"loss": 0.5333,
"step": 300
},
{
"epoch": 0.4767372797465848,
"grad_norm": 1.4740227736979468,
"learning_rate": 4.4915531626588566e-06,
"loss": 0.4993,
"step": 301
},
{
"epoch": 0.4783211245297961,
"grad_norm": 1.5696976913793732,
"learning_rate": 4.4711691222323505e-06,
"loss": 0.5829,
"step": 302
},
{
"epoch": 0.4799049693130073,
"grad_norm": 1.3762468410600768,
"learning_rate": 4.450772666061285e-06,
"loss": 0.5585,
"step": 303
},
{
"epoch": 0.4814888140962186,
"grad_norm": 1.4028769788977258,
"learning_rate": 4.4303643316112455e-06,
"loss": 0.4688,
"step": 304
},
{
"epoch": 0.4830726588794298,
"grad_norm": 1.6682668294458134,
"learning_rate": 4.409944656660828e-06,
"loss": 0.6571,
"step": 305
},
{
"epoch": 0.48465650366264107,
"grad_norm": 1.413594427095901,
"learning_rate": 4.389514179287455e-06,
"loss": 0.5522,
"step": 306
},
{
"epoch": 0.4862403484458523,
"grad_norm": 1.519057639158845,
"learning_rate": 4.369073437853208e-06,
"loss": 0.5334,
"step": 307
},
{
"epoch": 0.48782419322906356,
"grad_norm": 1.6208921075298082,
"learning_rate": 4.348622970990633e-06,
"loss": 0.5182,
"step": 308
},
{
"epoch": 0.4894080380122748,
"grad_norm": 1.8501674500734415,
"learning_rate": 4.328163317588551e-06,
"loss": 0.6517,
"step": 309
},
{
"epoch": 0.49099188279548606,
"grad_norm": 1.4363229280193845,
"learning_rate": 4.307695016777855e-06,
"loss": 0.5416,
"step": 310
},
{
"epoch": 0.4925757275786973,
"grad_norm": 1.7153136971595235,
"learning_rate": 4.28721860791731e-06,
"loss": 0.588,
"step": 311
},
{
"epoch": 0.49415957236190855,
"grad_norm": 1.7049912502271836,
"learning_rate": 4.2667346305793305e-06,
"loss": 0.5894,
"step": 312
},
{
"epoch": 0.49574341714511977,
"grad_norm": 1.7928491666503086,
"learning_rate": 4.246243624535772e-06,
"loss": 0.5509,
"step": 313
},
{
"epoch": 0.49732726192833104,
"grad_norm": 1.4067335450901148,
"learning_rate": 4.2257461297436975e-06,
"loss": 0.5372,
"step": 314
},
{
"epoch": 0.49891110671154226,
"grad_norm": 1.551109249256133,
"learning_rate": 4.205242686331158e-06,
"loss": 0.4888,
"step": 315
},
{
"epoch": 0.5004949514947535,
"grad_norm": 1.5295528770849849,
"learning_rate": 4.184733834582958e-06,
"loss": 0.5244,
"step": 316
},
{
"epoch": 0.5020787962779648,
"grad_norm": 1.568867534852523,
"learning_rate": 4.164220114926413e-06,
"loss": 0.5243,
"step": 317
},
{
"epoch": 0.503662641061176,
"grad_norm": 1.5276609286295317,
"learning_rate": 4.143702067917114e-06,
"loss": 0.5557,
"step": 318
},
{
"epoch": 0.5052464858443872,
"grad_norm": 1.7692820390689328,
"learning_rate": 4.123180234224682e-06,
"loss": 0.5533,
"step": 319
},
{
"epoch": 0.5068303306275985,
"grad_norm": 1.6282684636897258,
"learning_rate": 4.102655154618519e-06,
"loss": 0.54,
"step": 320
},
{
"epoch": 0.5084141754108097,
"grad_norm": 1.4884338495205554,
"learning_rate": 4.082127369953562e-06,
"loss": 0.5187,
"step": 321
},
{
"epoch": 0.509998020194021,
"grad_norm": 1.4653962436440777,
"learning_rate": 4.061597421156027e-06,
"loss": 0.4915,
"step": 322
},
{
"epoch": 0.5115818649772322,
"grad_norm": 1.395223278506742,
"learning_rate": 4.04106584920916e-06,
"loss": 0.496,
"step": 323
},
{
"epoch": 0.5131657097604435,
"grad_norm": 2.1426438371002487,
"learning_rate": 4.0205331951389745e-06,
"loss": 0.6205,
"step": 324
},
{
"epoch": 0.5147495545436547,
"grad_norm": 1.867498513188697,
"learning_rate": 4e-06,
"loss": 0.4003,
"step": 325
},
{
"epoch": 0.516333399326866,
"grad_norm": 1.614897925155514,
"learning_rate": 3.979466804861026e-06,
"loss": 0.5554,
"step": 326
},
{
"epoch": 0.5179172441100772,
"grad_norm": 1.621850139141135,
"learning_rate": 3.958934150790841e-06,
"loss": 0.4116,
"step": 327
},
{
"epoch": 0.5195010888932885,
"grad_norm": 1.6540960382507581,
"learning_rate": 3.938402578843973e-06,
"loss": 0.4899,
"step": 328
},
{
"epoch": 0.5210849336764997,
"grad_norm": 1.4757999919149907,
"learning_rate": 3.917872630046439e-06,
"loss": 0.4871,
"step": 329
},
{
"epoch": 0.5226687784597109,
"grad_norm": 1.4101743650910379,
"learning_rate": 3.8973448453814815e-06,
"loss": 0.5557,
"step": 330
},
{
"epoch": 0.5242526232429222,
"grad_norm": 1.820304069777086,
"learning_rate": 3.876819765775319e-06,
"loss": 0.5178,
"step": 331
},
{
"epoch": 0.5258364680261335,
"grad_norm": 1.3457029032306118,
"learning_rate": 3.856297932082886e-06,
"loss": 0.5481,
"step": 332
},
{
"epoch": 0.5274203128093446,
"grad_norm": 1.2926453088911345,
"learning_rate": 3.835779885073587e-06,
"loss": 0.47,
"step": 333
},
{
"epoch": 0.5290041575925559,
"grad_norm": 1.7195140710061863,
"learning_rate": 3.815266165417042e-06,
"loss": 0.4018,
"step": 334
},
{
"epoch": 0.5305880023757672,
"grad_norm": 1.5294915569869552,
"learning_rate": 3.7947573136688406e-06,
"loss": 0.4889,
"step": 335
},
{
"epoch": 0.5321718471589785,
"grad_norm": 1.5318098339821966,
"learning_rate": 3.774253870256302e-06,
"loss": 0.429,
"step": 336
},
{
"epoch": 0.5337556919421896,
"grad_norm": 1.5322388550699502,
"learning_rate": 3.7537563754642285e-06,
"loss": 0.5065,
"step": 337
},
{
"epoch": 0.5353395367254009,
"grad_norm": 1.7066209125681409,
"learning_rate": 3.7332653694206683e-06,
"loss": 0.4947,
"step": 338
},
{
"epoch": 0.5369233815086122,
"grad_norm": 1.3425460536457656,
"learning_rate": 3.7127813920826896e-06,
"loss": 0.5448,
"step": 339
},
{
"epoch": 0.5385072262918233,
"grad_norm": 1.3885953993055378,
"learning_rate": 3.6923049832221447e-06,
"loss": 0.5269,
"step": 340
},
{
"epoch": 0.5400910710750346,
"grad_norm": 1.562252798906571,
"learning_rate": 3.6718366824114497e-06,
"loss": 0.5145,
"step": 341
},
{
"epoch": 0.5416749158582459,
"grad_norm": 1.7621418644258409,
"learning_rate": 3.651377029009367e-06,
"loss": 0.5345,
"step": 342
},
{
"epoch": 0.5432587606414572,
"grad_norm": 1.6334983556675244,
"learning_rate": 3.6309265621467923e-06,
"loss": 0.5435,
"step": 343
},
{
"epoch": 0.5448426054246683,
"grad_norm": 1.6029717902895917,
"learning_rate": 3.6104858207125447e-06,
"loss": 0.5734,
"step": 344
},
{
"epoch": 0.5464264502078796,
"grad_norm": 1.7090791562879204,
"learning_rate": 3.590055343339172e-06,
"loss": 0.5325,
"step": 345
},
{
"epoch": 0.5480102949910909,
"grad_norm": 1.5947834681663235,
"learning_rate": 3.5696356683887545e-06,
"loss": 0.4975,
"step": 346
},
{
"epoch": 0.5495941397743022,
"grad_norm": 1.4472742169549009,
"learning_rate": 3.5492273339387156e-06,
"loss": 0.4894,
"step": 347
},
{
"epoch": 0.5511779845575133,
"grad_norm": 2.0434685400743082,
"learning_rate": 3.5288308777676487e-06,
"loss": 0.4684,
"step": 348
},
{
"epoch": 0.5527618293407246,
"grad_norm": 1.9208895400830441,
"learning_rate": 3.508446837341144e-06,
"loss": 0.5969,
"step": 349
},
{
"epoch": 0.5543456741239359,
"grad_norm": 1.4909570903888723,
"learning_rate": 3.488075749797622e-06,
"loss": 0.5022,
"step": 350
},
{
"epoch": 0.5559295189071471,
"grad_norm": 1.6489325052677233,
"learning_rate": 3.4677181519341864e-06,
"loss": 0.5895,
"step": 351
},
{
"epoch": 0.5575133636903583,
"grad_norm": 1.613873880473099,
"learning_rate": 3.447374580192472e-06,
"loss": 0.5907,
"step": 352
},
{
"epoch": 0.5590972084735696,
"grad_norm": 1.7320655964325145,
"learning_rate": 3.427045570644515e-06,
"loss": 0.4408,
"step": 353
},
{
"epoch": 0.5606810532567809,
"grad_norm": 1.5966735398972682,
"learning_rate": 3.406731658978621e-06,
"loss": 0.5518,
"step": 354
},
{
"epoch": 0.5622648980399921,
"grad_norm": 1.507137639160325,
"learning_rate": 3.386433380485258e-06,
"loss": 0.5487,
"step": 355
},
{
"epoch": 0.5638487428232033,
"grad_norm": 1.477023097202977,
"learning_rate": 3.36615127004294e-06,
"loss": 0.5277,
"step": 356
},
{
"epoch": 0.5654325876064146,
"grad_norm": 1.4345842364104429,
"learning_rate": 3.3458858621041395e-06,
"loss": 0.5825,
"step": 357
},
{
"epoch": 0.5670164323896258,
"grad_norm": 1.5249472435401055,
"learning_rate": 3.3256376906812026e-06,
"loss": 0.5686,
"step": 358
},
{
"epoch": 0.568600277172837,
"grad_norm": 1.652799125000088,
"learning_rate": 3.3054072893322785e-06,
"loss": 0.5356,
"step": 359
},
{
"epoch": 0.5701841219560483,
"grad_norm": 1.7523508161150247,
"learning_rate": 3.285195191147255e-06,
"loss": 0.558,
"step": 360
},
{
"epoch": 0.5717679667392596,
"grad_norm": 1.5826594727198133,
"learning_rate": 3.265001928733718e-06,
"loss": 0.5513,
"step": 361
},
{
"epoch": 0.5733518115224708,
"grad_norm": 1.4553905463035222,
"learning_rate": 3.2448280342029128e-06,
"loss": 0.4994,
"step": 362
},
{
"epoch": 0.574935656305682,
"grad_norm": 1.4388765353827664,
"learning_rate": 3.2246740391557196e-06,
"loss": 0.5027,
"step": 363
},
{
"epoch": 0.5765195010888933,
"grad_norm": 1.330505454003212,
"learning_rate": 3.2045404746686542e-06,
"loss": 0.508,
"step": 364
},
{
"epoch": 0.5781033458721045,
"grad_norm": 1.3429439845943842,
"learning_rate": 3.1844278712798626e-06,
"loss": 0.4263,
"step": 365
},
{
"epoch": 0.5796871906553158,
"grad_norm": 1.4396944573827641,
"learning_rate": 3.1643367589751497e-06,
"loss": 0.5179,
"step": 366
},
{
"epoch": 0.581271035438527,
"grad_norm": 1.414735328601168,
"learning_rate": 3.1442676671740113e-06,
"loss": 0.4259,
"step": 367
},
{
"epoch": 0.5828548802217383,
"grad_norm": 1.66831199255592,
"learning_rate": 3.124221124715678e-06,
"loss": 0.4971,
"step": 368
},
{
"epoch": 0.5844387250049495,
"grad_norm": 1.5810788020061641,
"learning_rate": 3.104197659845188e-06,
"loss": 0.5772,
"step": 369
},
{
"epoch": 0.5860225697881608,
"grad_norm": 1.5715742632475282,
"learning_rate": 3.0841978001994645e-06,
"loss": 0.5036,
"step": 370
},
{
"epoch": 0.587606414571372,
"grad_norm": 1.751121812265499,
"learning_rate": 3.0642220727934067e-06,
"loss": 0.5295,
"step": 371
},
{
"epoch": 0.5891902593545832,
"grad_norm": 1.5098119881063428,
"learning_rate": 3.0442710040060098e-06,
"loss": 0.5466,
"step": 372
},
{
"epoch": 0.5907741041377945,
"grad_norm": 1.5207898613100728,
"learning_rate": 3.0243451195664913e-06,
"loss": 0.5579,
"step": 373
},
{
"epoch": 0.5923579489210058,
"grad_norm": 1.4240138870827277,
"learning_rate": 3.004444944540437e-06,
"loss": 0.5507,
"step": 374
},
{
"epoch": 0.593941793704217,
"grad_norm": 1.506087376349933,
"learning_rate": 2.9845710033159684e-06,
"loss": 0.4465,
"step": 375
},
{
"epoch": 0.5955256384874282,
"grad_norm": 1.607368732995549,
"learning_rate": 2.9647238195899164e-06,
"loss": 0.5378,
"step": 376
},
{
"epoch": 0.5971094832706395,
"grad_norm": 1.578131220264289,
"learning_rate": 2.9449039163540316e-06,
"loss": 0.4516,
"step": 377
},
{
"epoch": 0.5986933280538507,
"grad_norm": 1.488407789608305,
"learning_rate": 2.9251118158811984e-06,
"loss": 0.5087,
"step": 378
},
{
"epoch": 0.600277172837062,
"grad_norm": 1.4648982592871571,
"learning_rate": 2.9053480397116684e-06,
"loss": 0.5531,
"step": 379
},
{
"epoch": 0.6018610176202732,
"grad_norm": 1.378206069077312,
"learning_rate": 2.885613108639326e-06,
"loss": 0.437,
"step": 380
},
{
"epoch": 0.6034448624034845,
"grad_norm": 1.4399539996573616,
"learning_rate": 2.865907542697957e-06,
"loss": 0.5327,
"step": 381
},
{
"epoch": 0.6050287071866957,
"grad_norm": 1.4113473201037952,
"learning_rate": 2.846231861147551e-06,
"loss": 0.5414,
"step": 382
},
{
"epoch": 0.6066125519699069,
"grad_norm": 1.3798536815664695,
"learning_rate": 2.8265865824606165e-06,
"loss": 0.5537,
"step": 383
},
{
"epoch": 0.6081963967531182,
"grad_norm": 1.6278939154714502,
"learning_rate": 2.806972224308515e-06,
"loss": 0.5272,
"step": 384
},
{
"epoch": 0.6097802415363295,
"grad_norm": 1.4122415614433053,
"learning_rate": 2.787389303547826e-06,
"loss": 0.5437,
"step": 385
},
{
"epoch": 0.6113640863195406,
"grad_norm": 1.4410982986916903,
"learning_rate": 2.7678383362067257e-06,
"loss": 0.6161,
"step": 386
},
{
"epoch": 0.6129479311027519,
"grad_norm": 1.6053888158206666,
"learning_rate": 2.748319837471383e-06,
"loss": 0.5462,
"step": 387
},
{
"epoch": 0.6145317758859632,
"grad_norm": 1.6416876364791944,
"learning_rate": 2.7288343216723933e-06,
"loss": 0.5041,
"step": 388
},
{
"epoch": 0.6161156206691745,
"grad_norm": 1.6000769765570262,
"learning_rate": 2.7093823022712215e-06,
"loss": 0.6001,
"step": 389
},
{
"epoch": 0.6176994654523856,
"grad_norm": 1.5229151146342543,
"learning_rate": 2.6899642918466656e-06,
"loss": 0.5966,
"step": 390
},
{
"epoch": 0.6192833102355969,
"grad_norm": 1.2082061870063965,
"learning_rate": 2.6705808020813617e-06,
"loss": 0.4404,
"step": 391
},
{
"epoch": 0.6208671550188082,
"grad_norm": 1.7357266576917079,
"learning_rate": 2.6512323437482903e-06,
"loss": 0.509,
"step": 392
},
{
"epoch": 0.6224509998020195,
"grad_norm": 1.4629821094699202,
"learning_rate": 2.631919426697325e-06,
"loss": 0.5477,
"step": 393
},
{
"epoch": 0.6240348445852306,
"grad_norm": 1.6391626920610356,
"learning_rate": 2.612642559841789e-06,
"loss": 0.5424,
"step": 394
},
{
"epoch": 0.6256186893684419,
"grad_norm": 1.6354675394639997,
"learning_rate": 2.5934022511450525e-06,
"loss": 0.4486,
"step": 395
},
{
"epoch": 0.6272025341516532,
"grad_norm": 1.5362602160998997,
"learning_rate": 2.574199007607144e-06,
"loss": 0.452,
"step": 396
},
{
"epoch": 0.6287863789348643,
"grad_norm": 1.4546308476604741,
"learning_rate": 2.5550333352513884e-06,
"loss": 0.5295,
"step": 397
},
{
"epoch": 0.6303702237180756,
"grad_norm": 1.5353573065379114,
"learning_rate": 2.535905739111077e-06,
"loss": 0.4627,
"step": 398
},
{
"epoch": 0.6319540685012869,
"grad_norm": 1.7076979214421926,
"learning_rate": 2.516816723216157e-06,
"loss": 0.5024,
"step": 399
},
{
"epoch": 0.6335379132844982,
"grad_norm": 1.8486022817022656,
"learning_rate": 2.49776679057995e-06,
"loss": 0.5149,
"step": 400
},
{
"epoch": 0.6351217580677093,
"grad_norm": 1.566887589871668,
"learning_rate": 2.4787564431858974e-06,
"loss": 0.5059,
"step": 401
},
{
"epoch": 0.6367056028509206,
"grad_norm": 1.564549295152443,
"learning_rate": 2.4597861819743334e-06,
"loss": 0.4603,
"step": 402
},
{
"epoch": 0.6382894476341319,
"grad_norm": 1.380492154373069,
"learning_rate": 2.4408565068292827e-06,
"loss": 0.4929,
"step": 403
},
{
"epoch": 0.6398732924173431,
"grad_norm": 1.5407455494599405,
"learning_rate": 2.4219679165652902e-06,
"loss": 0.5311,
"step": 404
},
{
"epoch": 0.6414571372005543,
"grad_norm": 1.4174285596906695,
"learning_rate": 2.403120908914277e-06,
"loss": 0.4834,
"step": 405
},
{
"epoch": 0.6430409819837656,
"grad_norm": 1.873920827014659,
"learning_rate": 2.3843159805124203e-06,
"loss": 0.5017,
"step": 406
},
{
"epoch": 0.6446248267669769,
"grad_norm": 1.7835980447150575,
"learning_rate": 2.365553626887074e-06,
"loss": 0.5418,
"step": 407
},
{
"epoch": 0.6462086715501881,
"grad_norm": 1.3633583035406078,
"learning_rate": 2.3468343424437055e-06,
"loss": 0.4608,
"step": 408
},
{
"epoch": 0.6477925163333993,
"grad_norm": 1.8174838090865337,
"learning_rate": 2.3281586204528677e-06,
"loss": 0.5257,
"step": 409
},
{
"epoch": 0.6493763611166106,
"grad_norm": 1.5982388803586878,
"learning_rate": 2.309526953037203e-06,
"loss": 0.5193,
"step": 410
},
{
"epoch": 0.6509602058998218,
"grad_norm": 2.817392677546035,
"learning_rate": 2.2909398311584775e-06,
"loss": 0.4578,
"step": 411
},
{
"epoch": 0.6525440506830331,
"grad_norm": 1.581257330292036,
"learning_rate": 2.272397744604636e-06,
"loss": 0.5021,
"step": 412
},
{
"epoch": 0.6541278954662443,
"grad_norm": 1.6043151167776697,
"learning_rate": 2.253901181976905e-06,
"loss": 0.5405,
"step": 413
},
{
"epoch": 0.6557117402494556,
"grad_norm": 2.1291440852646817,
"learning_rate": 2.2354506306769143e-06,
"loss": 0.5301,
"step": 414
},
{
"epoch": 0.6572955850326668,
"grad_norm": 1.9230439043144794,
"learning_rate": 2.2170465768938473e-06,
"loss": 0.5709,
"step": 415
},
{
"epoch": 0.658879429815878,
"grad_norm": 1.472833355116063,
"learning_rate": 2.1986895055916366e-06,
"loss": 0.4326,
"step": 416
},
{
"epoch": 0.6604632745990893,
"grad_norm": 1.87905372939008,
"learning_rate": 2.1803799004961824e-06,
"loss": 0.462,
"step": 417
},
{
"epoch": 0.6620471193823005,
"grad_norm": 1.4366052509179692,
"learning_rate": 2.1621182440826096e-06,
"loss": 0.5735,
"step": 418
},
{
"epoch": 0.6636309641655118,
"grad_norm": 1.7279130546658286,
"learning_rate": 2.143905017562547e-06,
"loss": 0.408,
"step": 419
},
{
"epoch": 0.665214808948723,
"grad_norm": 1.536940380154465,
"learning_rate": 2.12574070087145e-06,
"loss": 0.5199,
"step": 420
},
{
"epoch": 0.665214808948723,
"eval_accuracy": 0.810934698088188,
"eval_loss": 0.5862451195716858,
"eval_perplexity": 1.2037860680935488,
"eval_runtime": 531.3769,
"eval_samples_per_second": 1.432,
"eval_steps_per_second": 1.432,
"step": 420
},
{
"epoch": 0.6667986537319343,
"grad_norm": 1.565207882157784,
"learning_rate": 2.10762577265596e-06,
"loss": 0.4879,
"step": 421
},
{
"epoch": 0.6683824985151455,
"grad_norm": 1.4691468768515952,
"learning_rate": 2.0895607102612803e-06,
"loss": 0.5024,
"step": 422
},
{
"epoch": 0.6699663432983568,
"grad_norm": 1.3928452284838215,
"learning_rate": 2.0715459897186044e-06,
"loss": 0.5901,
"step": 423
},
{
"epoch": 0.671550188081568,
"grad_norm": 1.4610053346508791,
"learning_rate": 2.0535820857325753e-06,
"loss": 0.5062,
"step": 424
},
{
"epoch": 0.6731340328647792,
"grad_norm": 1.3712954252365843,
"learning_rate": 2.0356694716687682e-06,
"loss": 0.5479,
"step": 425
},
{
"epoch": 0.6747178776479905,
"grad_norm": 1.3758715850017582,
"learning_rate": 2.017808619541221e-06,
"loss": 0.4969,
"step": 426
},
{
"epoch": 0.6763017224312018,
"grad_norm": 1.2871110284961038,
"learning_rate": 2.0000000000000008e-06,
"loss": 0.4832,
"step": 427
},
{
"epoch": 0.677885567214413,
"grad_norm": 1.4668871631682787,
"learning_rate": 1.982244082318793e-06,
"loss": 0.4901,
"step": 428
},
{
"epoch": 0.6794694119976242,
"grad_norm": 1.515067727433541,
"learning_rate": 1.9645413343825406e-06,
"loss": 0.5362,
"step": 429
},
{
"epoch": 0.6810532567808355,
"grad_norm": 1.427224417336294,
"learning_rate": 1.946892222675118e-06,
"loss": 0.467,
"step": 430
},
{
"epoch": 0.6826371015640468,
"grad_norm": 1.3486800429336776,
"learning_rate": 1.92929721226703e-06,
"loss": 0.4614,
"step": 431
},
{
"epoch": 0.6842209463472579,
"grad_norm": 1.7205504786002677,
"learning_rate": 1.9117567668031665e-06,
"loss": 0.45,
"step": 432
},
{
"epoch": 0.6858047911304692,
"grad_norm": 1.6996871872002546,
"learning_rate": 1.8942713484905761e-06,
"loss": 0.5028,
"step": 433
},
{
"epoch": 0.6873886359136805,
"grad_norm": 1.5407143434367043,
"learning_rate": 1.8768414180862956e-06,
"loss": 0.5294,
"step": 434
},
{
"epoch": 0.6889724806968917,
"grad_norm": 1.3480994367623826,
"learning_rate": 1.859467434885199e-06,
"loss": 0.4558,
"step": 435
},
{
"epoch": 0.6905563254801029,
"grad_norm": 1.7334838280078013,
"learning_rate": 1.8421498567079005e-06,
"loss": 0.5249,
"step": 436
},
{
"epoch": 0.6921401702633142,
"grad_norm": 2.254316615478953,
"learning_rate": 1.8248891398886936e-06,
"loss": 0.6142,
"step": 437
},
{
"epoch": 0.6937240150465255,
"grad_norm": 1.5257795875613787,
"learning_rate": 1.8076857392635176e-06,
"loss": 0.4471,
"step": 438
},
{
"epoch": 0.6953078598297366,
"grad_norm": 1.700232100023801,
"learning_rate": 1.7905401081579768e-06,
"loss": 0.52,
"step": 439
},
{
"epoch": 0.6968917046129479,
"grad_norm": 1.33014022877584,
"learning_rate": 1.7734526983753986e-06,
"loss": 0.4591,
"step": 440
},
{
"epoch": 0.6984755493961592,
"grad_norm": 1.7591582976289475,
"learning_rate": 1.7564239601849216e-06,
"loss": 0.556,
"step": 441
},
{
"epoch": 0.7000593941793705,
"grad_norm": 1.5534876715610133,
"learning_rate": 1.7394543423096325e-06,
"loss": 0.4904,
"step": 442
},
{
"epoch": 0.7016432389625816,
"grad_norm": 1.4877228267210632,
"learning_rate": 1.7225442919147465e-06,
"loss": 0.5103,
"step": 443
},
{
"epoch": 0.7032270837457929,
"grad_norm": 1.4261700371364836,
"learning_rate": 1.7056942545958167e-06,
"loss": 0.4619,
"step": 444
},
{
"epoch": 0.7048109285290042,
"grad_norm": 1.5419799312577944,
"learning_rate": 1.6889046743669955e-06,
"loss": 0.5397,
"step": 445
},
{
"epoch": 0.7063947733122155,
"grad_norm": 1.6636001306308572,
"learning_rate": 1.6721759936493398e-06,
"loss": 0.5358,
"step": 446
},
{
"epoch": 0.7079786180954266,
"grad_norm": 1.5271438092360352,
"learning_rate": 1.6555086532591425e-06,
"loss": 0.4629,
"step": 447
},
{
"epoch": 0.7095624628786379,
"grad_norm": 1.6441451799989772,
"learning_rate": 1.6389030923963221e-06,
"loss": 0.4495,
"step": 448
},
{
"epoch": 0.7111463076618492,
"grad_norm": 1.5088562455266363,
"learning_rate": 1.6223597486328533e-06,
"loss": 0.4715,
"step": 449
},
{
"epoch": 0.7127301524450604,
"grad_norm": 1.4463031901473735,
"learning_rate": 1.6058790579012275e-06,
"loss": 0.5491,
"step": 450
},
{
"epoch": 0.7143139972282716,
"grad_norm": 1.471434921379099,
"learning_rate": 1.5894614544829747e-06,
"loss": 0.4864,
"step": 451
},
{
"epoch": 0.7158978420114829,
"grad_norm": 1.6603661388527584,
"learning_rate": 1.5731073709972113e-06,
"loss": 0.4506,
"step": 452
},
{
"epoch": 0.7174816867946942,
"grad_norm": 1.5613545052853812,
"learning_rate": 1.5568172383892488e-06,
"loss": 0.5735,
"step": 453
},
{
"epoch": 0.7190655315779053,
"grad_norm": 1.6697903304969928,
"learning_rate": 1.54059148591923e-06,
"loss": 0.539,
"step": 454
},
{
"epoch": 0.7206493763611166,
"grad_norm": 1.4866099563811799,
"learning_rate": 1.5244305411508215e-06,
"loss": 0.5165,
"step": 455
},
{
"epoch": 0.7222332211443279,
"grad_norm": 1.5688875196040748,
"learning_rate": 1.5083348299399506e-06,
"loss": 0.4431,
"step": 456
},
{
"epoch": 0.7238170659275391,
"grad_norm": 1.470987976048739,
"learning_rate": 1.492304776423575e-06,
"loss": 0.4692,
"step": 457
},
{
"epoch": 0.7254009107107503,
"grad_norm": 1.373015781073825,
"learning_rate": 1.4763408030085112e-06,
"loss": 0.4408,
"step": 458
},
{
"epoch": 0.7269847554939616,
"grad_norm": 1.7939617236990213,
"learning_rate": 1.460443330360309e-06,
"loss": 0.4836,
"step": 459
},
{
"epoch": 0.7285686002771729,
"grad_norm": 1.7961810685183246,
"learning_rate": 1.4446127773921557e-06,
"loss": 0.5373,
"step": 460
},
{
"epoch": 0.7301524450603841,
"grad_norm": 1.8372334481084611,
"learning_rate": 1.4288495612538425e-06,
"loss": 0.4675,
"step": 461
},
{
"epoch": 0.7317362898435953,
"grad_norm": 1.4460691785589137,
"learning_rate": 1.413154097320778e-06,
"loss": 0.4537,
"step": 462
},
{
"epoch": 0.7333201346268066,
"grad_norm": 1.54932404541608,
"learning_rate": 1.3975267991830327e-06,
"loss": 0.4473,
"step": 463
},
{
"epoch": 0.7349039794100178,
"grad_norm": 1.4531601052014125,
"learning_rate": 1.3819680786344434e-06,
"loss": 0.5499,
"step": 464
},
{
"epoch": 0.7364878241932291,
"grad_norm": 1.6713295832702009,
"learning_rate": 1.3664783456617702e-06,
"loss": 0.4369,
"step": 465
},
{
"epoch": 0.7380716689764403,
"grad_norm": 1.4987250866458606,
"learning_rate": 1.3510580084338803e-06,
"loss": 0.5176,
"step": 466
},
{
"epoch": 0.7396555137596516,
"grad_norm": 1.7331331058430652,
"learning_rate": 1.3357074732909995e-06,
"loss": 0.4903,
"step": 467
},
{
"epoch": 0.7412393585428628,
"grad_norm": 1.6659137598021865,
"learning_rate": 1.320427144734008e-06,
"loss": 0.4823,
"step": 468
},
{
"epoch": 0.7428232033260741,
"grad_norm": 1.5499926208087895,
"learning_rate": 1.3052174254137712e-06,
"loss": 0.3442,
"step": 469
},
{
"epoch": 0.7444070481092853,
"grad_norm": 1.92051205057408,
"learning_rate": 1.2900787161205404e-06,
"loss": 0.5399,
"step": 470
},
{
"epoch": 0.7459908928924965,
"grad_norm": 1.3472536176738465,
"learning_rate": 1.2750114157733829e-06,
"loss": 0.4111,
"step": 471
},
{
"epoch": 0.7475747376757078,
"grad_norm": 1.71412692276887,
"learning_rate": 1.2600159214096775e-06,
"loss": 0.5043,
"step": 472
},
{
"epoch": 0.749158582458919,
"grad_norm": 1.3640389981993508,
"learning_rate": 1.2450926281746456e-06,
"loss": 0.5684,
"step": 473
},
{
"epoch": 0.7507424272421303,
"grad_norm": 1.4068613006077029,
"learning_rate": 1.2302419293109414e-06,
"loss": 0.5849,
"step": 474
},
{
"epoch": 0.7523262720253415,
"grad_norm": 1.430129815588007,
"learning_rate": 1.2154642161482937e-06,
"loss": 0.5287,
"step": 475
},
{
"epoch": 0.7539101168085528,
"grad_norm": 1.2831468841594065,
"learning_rate": 1.2007598780931863e-06,
"loss": 0.4214,
"step": 476
},
{
"epoch": 0.755493961591764,
"grad_norm": 1.49905785043907,
"learning_rate": 1.1861293026186006e-06,
"loss": 0.4839,
"step": 477
},
{
"epoch": 0.7570778063749752,
"grad_norm": 1.7028591333085308,
"learning_rate": 1.1715728752538101e-06,
"loss": 0.5414,
"step": 478
},
{
"epoch": 0.7586616511581865,
"grad_norm": 1.4586336430125626,
"learning_rate": 1.1570909795742116e-06,
"loss": 0.4434,
"step": 479
},
{
"epoch": 0.7602454959413978,
"grad_norm": 1.5067616302391904,
"learning_rate": 1.1426839971912236e-06,
"loss": 0.4858,
"step": 480
},
{
"epoch": 0.761829340724609,
"grad_norm": 1.3432603589549863,
"learning_rate": 1.1283523077422325e-06,
"loss": 0.4768,
"step": 481
},
{
"epoch": 0.7634131855078202,
"grad_norm": 1.6178725773986766,
"learning_rate": 1.1140962888805834e-06,
"loss": 0.5535,
"step": 482
},
{
"epoch": 0.7649970302910315,
"grad_norm": 1.4689663997635967,
"learning_rate": 1.0999163162656295e-06,
"loss": 0.5227,
"step": 483
},
{
"epoch": 0.7665808750742428,
"grad_norm": 1.5383027461516126,
"learning_rate": 1.0858127635528394e-06,
"loss": 0.4011,
"step": 484
},
{
"epoch": 0.7681647198574539,
"grad_norm": 1.5160653262215487,
"learning_rate": 1.0717860023839421e-06,
"loss": 0.5865,
"step": 485
},
{
"epoch": 0.7697485646406652,
"grad_norm": 1.667492242744668,
"learning_rate": 1.0578364023771382e-06,
"loss": 0.5631,
"step": 486
},
{
"epoch": 0.7713324094238765,
"grad_norm": 1.5597231965693572,
"learning_rate": 1.043964331117364e-06,
"loss": 0.4257,
"step": 487
},
{
"epoch": 0.7729162542070878,
"grad_norm": 1.4809241793279104,
"learning_rate": 1.0301701541465954e-06,
"loss": 0.4555,
"step": 488
},
{
"epoch": 0.7745000989902989,
"grad_norm": 1.539628448856408,
"learning_rate": 1.016454234954227e-06,
"loss": 0.5371,
"step": 489
},
{
"epoch": 0.7760839437735102,
"grad_norm": 1.507321371340315,
"learning_rate": 1.0028169349674827e-06,
"loss": 0.43,
"step": 490
},
{
"epoch": 0.7776677885567215,
"grad_norm": 1.6940747096415172,
"learning_rate": 9.892586135419021e-07,
"loss": 0.531,
"step": 491
},
{
"epoch": 0.7792516333399326,
"grad_norm": 1.458267308736236,
"learning_rate": 9.757796279518636e-07,
"loss": 0.5206,
"step": 492
},
{
"epoch": 0.7808354781231439,
"grad_norm": 1.510680045767434,
"learning_rate": 9.623803333811712e-07,
"loss": 0.4083,
"step": 493
},
{
"epoch": 0.7824193229063552,
"grad_norm": 1.5237051298755062,
"learning_rate": 9.490610829137007e-07,
"loss": 0.4753,
"step": 494
},
{
"epoch": 0.7840031676895665,
"grad_norm": 1.9004733408315109,
"learning_rate": 9.358222275240884e-07,
"loss": 0.4432,
"step": 495
},
{
"epoch": 0.7855870124727776,
"grad_norm": 1.4568257328395944,
"learning_rate": 9.226641160684842e-07,
"loss": 0.5099,
"step": 496
},
{
"epoch": 0.7871708572559889,
"grad_norm": 1.7318874449726016,
"learning_rate": 9.095870952753646e-07,
"loss": 0.4351,
"step": 497
},
{
"epoch": 0.7887547020392002,
"grad_norm": 1.6137017147533719,
"learning_rate": 8.965915097363881e-07,
"loss": 0.5928,
"step": 498
},
{
"epoch": 0.7903385468224114,
"grad_norm": 1.5650409842133206,
"learning_rate": 8.83677701897318e-07,
"loss": 0.3981,
"step": 499
},
{
"epoch": 0.7919223916056226,
"grad_norm": 1.5782538718757142,
"learning_rate": 8.708460120490037e-07,
"loss": 0.5337,
"step": 500
},
{
"epoch": 0.7935062363888339,
"grad_norm": 1.5981195947747417,
"learning_rate": 8.580967783184055e-07,
"loss": 0.5147,
"step": 501
},
{
"epoch": 0.7950900811720452,
"grad_norm": 1.5440200056277393,
"learning_rate": 8.454303366596866e-07,
"loss": 0.5308,
"step": 502
},
{
"epoch": 0.7966739259552564,
"grad_norm": 1.5571076654650473,
"learning_rate": 8.328470208453682e-07,
"loss": 0.4665,
"step": 503
},
{
"epoch": 0.7982577707384676,
"grad_norm": 1.4953480568816988,
"learning_rate": 8.203471624575224e-07,
"loss": 0.5417,
"step": 504
},
{
"epoch": 0.7998416155216789,
"grad_norm": 1.4524866840898938,
"learning_rate": 8.079310908790419e-07,
"loss": 0.4489,
"step": 505
},
{
"epoch": 0.8014254603048901,
"grad_norm": 2.045243169901501,
"learning_rate": 7.955991332849623e-07,
"loss": 0.6222,
"step": 506
},
{
"epoch": 0.8030093050881013,
"grad_norm": 1.4598656015711513,
"learning_rate": 7.833516146338329e-07,
"loss": 0.4226,
"step": 507
},
{
"epoch": 0.8045931498713126,
"grad_norm": 1.3467480479408616,
"learning_rate": 7.711888576591618e-07,
"loss": 0.4603,
"step": 508
},
{
"epoch": 0.8061769946545239,
"grad_norm": 1.5045994773603517,
"learning_rate": 7.591111828609058e-07,
"loss": 0.4625,
"step": 509
},
{
"epoch": 0.8077608394377351,
"grad_norm": 1.7584779249154874,
"learning_rate": 7.471189084970291e-07,
"loss": 0.4409,
"step": 510
},
{
"epoch": 0.8093446842209463,
"grad_norm": 1.4108080454193086,
"learning_rate": 7.352123505751135e-07,
"loss": 0.4703,
"step": 511
},
{
"epoch": 0.8109285290041576,
"grad_norm": 1.8380506141002417,
"learning_rate": 7.233918228440323e-07,
"loss": 0.449,
"step": 512
},
{
"epoch": 0.8125123737873688,
"grad_norm": 1.58792661941251,
"learning_rate": 7.116576367856871e-07,
"loss": 0.5837,
"step": 513
},
{
"epoch": 0.8140962185705801,
"grad_norm": 1.450768559703693,
"learning_rate": 7.000101016067912e-07,
"loss": 0.4332,
"step": 514
},
{
"epoch": 0.8156800633537913,
"grad_norm": 1.4028521815647677,
"learning_rate": 6.884495242307284e-07,
"loss": 0.4748,
"step": 515
},
{
"epoch": 0.8172639081370026,
"grad_norm": 1.6546175297694308,
"learning_rate": 6.769762092894664e-07,
"loss": 0.5074,
"step": 516
},
{
"epoch": 0.8188477529202138,
"grad_norm": 1.56576785928193,
"learning_rate": 6.655904591155223e-07,
"loss": 0.5381,
"step": 517
},
{
"epoch": 0.8204315977034251,
"grad_norm": 1.387565724939211,
"learning_rate": 6.542925737340019e-07,
"loss": 0.4561,
"step": 518
},
{
"epoch": 0.8220154424866363,
"grad_norm": 1.528956816316375,
"learning_rate": 6.430828508546935e-07,
"loss": 0.4937,
"step": 519
},
{
"epoch": 0.8235992872698475,
"grad_norm": 1.4223442587210688,
"learning_rate": 6.319615858642193e-07,
"loss": 0.5643,
"step": 520
},
{
"epoch": 0.8251831320530588,
"grad_norm": 1.6114797659220879,
"learning_rate": 6.209290718182538e-07,
"loss": 0.4748,
"step": 521
},
{
"epoch": 0.8267669768362701,
"grad_norm": 1.440732361602322,
"learning_rate": 6.09985599433804e-07,
"loss": 0.4529,
"step": 522
},
{
"epoch": 0.8283508216194813,
"grad_norm": 1.7456468155042586,
"learning_rate": 5.99131457081544e-07,
"loss": 0.4569,
"step": 523
},
{
"epoch": 0.8299346664026925,
"grad_norm": 1.5548170916399435,
"learning_rate": 5.883669307782182e-07,
"loss": 0.4917,
"step": 524
},
{
"epoch": 0.8315185111859038,
"grad_norm": 1.5325320661573854,
"learning_rate": 5.776923041791076e-07,
"loss": 0.4514,
"step": 525
},
{
"epoch": 0.833102355969115,
"grad_norm": 1.7221755053166576,
"learning_rate": 5.671078585705489e-07,
"loss": 0.5491,
"step": 526
},
{
"epoch": 0.8346862007523262,
"grad_norm": 1.3704260247894937,
"learning_rate": 5.566138728625293e-07,
"loss": 0.4455,
"step": 527
},
{
"epoch": 0.8362700455355375,
"grad_norm": 1.538307339121543,
"learning_rate": 5.462106235813296e-07,
"loss": 0.5443,
"step": 528
},
{
"epoch": 0.8378538903187488,
"grad_norm": 1.4697584276805509,
"learning_rate": 5.358983848622451e-07,
"loss": 0.4608,
"step": 529
},
{
"epoch": 0.83943773510196,
"grad_norm": 1.8369715460955707,
"learning_rate": 5.256774284423561e-07,
"loss": 0.5062,
"step": 530
},
{
"epoch": 0.8410215798851712,
"grad_norm": 1.4372888694581465,
"learning_rate": 5.155480236533689e-07,
"loss": 0.4203,
"step": 531
},
{
"epoch": 0.8426054246683825,
"grad_norm": 1.4932893467966832,
"learning_rate": 5.055104374145221e-07,
"loss": 0.4823,
"step": 532
},
{
"epoch": 0.8441892694515938,
"grad_norm": 1.4880388898611958,
"learning_rate": 4.955649342255462e-07,
"loss": 0.4552,
"step": 533
},
{
"epoch": 0.845773114234805,
"grad_norm": 1.6481037142268005,
"learning_rate": 4.857117761596994e-07,
"loss": 0.5839,
"step": 534
},
{
"epoch": 0.8473569590180162,
"grad_norm": 1.3437637677463443,
"learning_rate": 4.759512228568621e-07,
"loss": 0.4662,
"step": 535
},
{
"epoch": 0.8489408038012275,
"grad_norm": 1.4761647343726298,
"learning_rate": 4.6628353151668995e-07,
"loss": 0.568,
"step": 536
},
{
"epoch": 0.8505246485844388,
"grad_norm": 1.4720639230416912,
"learning_rate": 4.567089568918403e-07,
"loss": 0.5371,
"step": 537
},
{
"epoch": 0.8521084933676499,
"grad_norm": 1.5747841685012622,
"learning_rate": 4.472277512812606e-07,
"loss": 0.441,
"step": 538
},
{
"epoch": 0.8536923381508612,
"grad_norm": 1.650334451611286,
"learning_rate": 4.378401645235352e-07,
"loss": 0.5202,
"step": 539
},
{
"epoch": 0.8552761829340725,
"grad_norm": 1.5611475616764667,
"learning_rate": 4.2854644399030526e-07,
"loss": 0.4419,
"step": 540
},
{
"epoch": 0.8568600277172838,
"grad_norm": 1.4489741710945974,
"learning_rate": 4.193468345797511e-07,
"loss": 0.4335,
"step": 541
},
{
"epoch": 0.8584438725004949,
"grad_norm": 1.410090306645375,
"learning_rate": 4.1024157871013586e-07,
"loss": 0.5519,
"step": 542
},
{
"epoch": 0.8600277172837062,
"grad_norm": 2.1083504219892966,
"learning_rate": 4.0123091631341933e-07,
"loss": 0.5082,
"step": 543
},
{
"epoch": 0.8616115620669175,
"grad_norm": 1.513783834747692,
"learning_rate": 3.9231508482893584e-07,
"loss": 0.5122,
"step": 544
},
{
"epoch": 0.8631954068501286,
"grad_norm": 1.5251998914887834,
"learning_rate": 3.834943191971365e-07,
"loss": 0.4445,
"step": 545
},
{
"epoch": 0.8647792516333399,
"grad_norm": 1.5084805197178865,
"learning_rate": 3.7476885185340023e-07,
"loss": 0.5231,
"step": 546
},
{
"epoch": 0.8663630964165512,
"grad_norm": 1.6145093162288873,
"learning_rate": 3.66138912721905e-07,
"loss": 0.4943,
"step": 547
},
{
"epoch": 0.8679469411997625,
"grad_norm": 1.6928224805035605,
"learning_rate": 3.5760472920957387e-07,
"loss": 0.4923,
"step": 548
},
{
"epoch": 0.8695307859829736,
"grad_norm": 1.5962608727170904,
"learning_rate": 3.491665262000789e-07,
"loss": 0.4839,
"step": 549
},
{
"epoch": 0.8711146307661849,
"grad_norm": 1.606681077130113,
"learning_rate": 3.4082452604791587e-07,
"loss": 0.5515,
"step": 550
},
{
"epoch": 0.8726984755493962,
"grad_norm": 1.746327383134502,
"learning_rate": 3.3257894857254877e-07,
"loss": 0.4445,
"step": 551
},
{
"epoch": 0.8742823203326074,
"grad_norm": 1.6535392291397746,
"learning_rate": 3.2443001105261127e-07,
"loss": 0.3439,
"step": 552
},
{
"epoch": 0.8758661651158186,
"grad_norm": 1.5448042422189747,
"learning_rate": 3.163779282201853e-07,
"loss": 0.4854,
"step": 553
},
{
"epoch": 0.8774500098990299,
"grad_norm": 1.634766739227388,
"learning_rate": 3.0842291225514314e-07,
"loss": 0.4817,
"step": 554
},
{
"epoch": 0.8790338546822412,
"grad_norm": 1.7014309371471963,
"learning_rate": 3.005651727795535e-07,
"loss": 0.5209,
"step": 555
},
{
"epoch": 0.8806176994654524,
"grad_norm": 1.4990930673709706,
"learning_rate": 2.9280491685215847e-07,
"loss": 0.4922,
"step": 556
},
{
"epoch": 0.8822015442486636,
"grad_norm": 1.6902362872195928,
"learning_rate": 2.85142348962919e-07,
"loss": 0.5189,
"step": 557
},
{
"epoch": 0.8837853890318749,
"grad_norm": 1.5111021907783055,
"learning_rate": 2.7757767102762587e-07,
"loss": 0.5379,
"step": 558
},
{
"epoch": 0.8853692338150861,
"grad_norm": 1.4445290757135123,
"learning_rate": 2.701110823825772e-07,
"loss": 0.464,
"step": 559
},
{
"epoch": 0.8869530785982974,
"grad_norm": 1.4484298586379043,
"learning_rate": 2.62742779779328e-07,
"loss": 0.4532,
"step": 560
},
{
"epoch": 0.8885369233815086,
"grad_norm": 1.8044306708569506,
"learning_rate": 2.5547295737950467e-07,
"loss": 0.4846,
"step": 561
},
{
"epoch": 0.8901207681647199,
"grad_norm": 1.6320763637605389,
"learning_rate": 2.483018067496885e-07,
"loss": 0.4102,
"step": 562
},
{
"epoch": 0.8917046129479311,
"grad_norm": 1.513426697301029,
"learning_rate": 2.412295168563667e-07,
"loss": 0.3977,
"step": 563
},
{
"epoch": 0.8932884577311423,
"grad_norm": 1.739063565024179,
"learning_rate": 2.3425627406095682e-07,
"loss": 0.4883,
"step": 564
},
{
"epoch": 0.8948723025143536,
"grad_norm": 1.4344210490396612,
"learning_rate": 2.273822621148902e-07,
"loss": 0.523,
"step": 565
},
{
"epoch": 0.8964561472975648,
"grad_norm": 1.503570765479746,
"learning_rate": 2.206076621547752e-07,
"loss": 0.4387,
"step": 566
},
{
"epoch": 0.8980399920807761,
"grad_norm": 1.3957138101279774,
"learning_rate": 2.1393265269762194e-07,
"loss": 0.4629,
"step": 567
},
{
"epoch": 0.8996238368639873,
"grad_norm": 1.5135178225024162,
"learning_rate": 2.0735740963613656e-07,
"loss": 0.5019,
"step": 568
},
{
"epoch": 0.9012076816471986,
"grad_norm": 1.7552187868476394,
"learning_rate": 2.0088210623408907e-07,
"loss": 0.5353,
"step": 569
},
{
"epoch": 0.9027915264304098,
"grad_norm": 1.5773847982428775,
"learning_rate": 1.9450691312174538e-07,
"loss": 0.4794,
"step": 570
},
{
"epoch": 0.9043753712136211,
"grad_norm": 1.4347191250584008,
"learning_rate": 1.8823199829137405e-07,
"loss": 0.5398,
"step": 571
},
{
"epoch": 0.9059592159968323,
"grad_norm": 1.4568382476735973,
"learning_rate": 1.8205752709281597e-07,
"loss": 0.4439,
"step": 572
},
{
"epoch": 0.9075430607800435,
"grad_norm": 1.7936484686086873,
"learning_rate": 1.759836622291293e-07,
"loss": 0.4999,
"step": 573
},
{
"epoch": 0.9091269055632548,
"grad_norm": 1.5069741923698488,
"learning_rate": 1.700105637523026e-07,
"loss": 0.4889,
"step": 574
},
{
"epoch": 0.9107107503464661,
"grad_norm": 1.3815954432502042,
"learning_rate": 1.6413838905903554e-07,
"loss": 0.4927,
"step": 575
},
{
"epoch": 0.9122945951296773,
"grad_norm": 1.4408197785033354,
"learning_rate": 1.58367292886592e-07,
"loss": 0.4667,
"step": 576
},
{
"epoch": 0.9138784399128885,
"grad_norm": 1.5272866153375004,
"learning_rate": 1.526974273087238e-07,
"loss": 0.5076,
"step": 577
},
{
"epoch": 0.9154622846960998,
"grad_norm": 1.478729805351153,
"learning_rate": 1.4712894173166192e-07,
"loss": 0.461,
"step": 578
},
{
"epoch": 0.9170461294793111,
"grad_norm": 1.5805538071531569,
"learning_rate": 1.416619828901795e-07,
"loss": 0.5271,
"step": 579
},
{
"epoch": 0.9186299742625222,
"grad_norm": 1.5508552013902985,
"learning_rate": 1.3629669484372718e-07,
"loss": 0.5267,
"step": 580
},
{
"epoch": 0.9202138190457335,
"grad_norm": 1.8225403408379726,
"learning_rate": 1.310332189726342e-07,
"loss": 0.461,
"step": 581
},
{
"epoch": 0.9217976638289448,
"grad_norm": 1.4608984268403977,
"learning_rate": 1.2587169397438425e-07,
"loss": 0.4602,
"step": 582
},
{
"epoch": 0.923381508612156,
"grad_norm": 1.5279726414031836,
"learning_rate": 1.2081225585996246e-07,
"loss": 0.4588,
"step": 583
},
{
"epoch": 0.9249653533953672,
"grad_norm": 1.515611028063376,
"learning_rate": 1.1585503795026718e-07,
"loss": 0.5179,
"step": 584
},
{
"epoch": 0.9265491981785785,
"grad_norm": 1.5467139023737668,
"learning_rate": 1.1100017087260205e-07,
"loss": 0.5355,
"step": 585
},
{
"epoch": 0.9281330429617898,
"grad_norm": 1.4948762852593562,
"learning_rate": 1.0624778255722855e-07,
"loss": 0.4236,
"step": 586
},
{
"epoch": 0.9297168877450009,
"grad_norm": 1.7477468489012902,
"learning_rate": 1.0159799823399939e-07,
"loss": 0.4904,
"step": 587
},
{
"epoch": 0.9313007325282122,
"grad_norm": 1.5893000765049863,
"learning_rate": 9.705094042905492e-08,
"loss": 0.5515,
"step": 588
},
{
"epoch": 0.9328845773114235,
"grad_norm": 1.702578584367335,
"learning_rate": 9.260672896159727e-08,
"loss": 0.4751,
"step": 589
},
{
"epoch": 0.9344684220946348,
"grad_norm": 1.6463785462145422,
"learning_rate": 8.826548094073194e-08,
"loss": 0.4154,
"step": 590
},
{
"epoch": 0.9360522668778459,
"grad_norm": 1.3759922290335456,
"learning_rate": 8.402731076238189e-08,
"loss": 0.4738,
"step": 591
},
{
"epoch": 0.9376361116610572,
"grad_norm": 1.5640512221883043,
"learning_rate": 7.989233010627261e-08,
"loss": 0.416,
"step": 592
},
{
"epoch": 0.9392199564442685,
"grad_norm": 1.6313053854796913,
"learning_rate": 7.586064793298997e-08,
"loss": 0.4193,
"step": 593
},
{
"epoch": 0.9408038012274798,
"grad_norm": 1.7183629342589013,
"learning_rate": 7.193237048110879e-08,
"loss": 0.5383,
"step": 594
},
{
"epoch": 0.9423876460106909,
"grad_norm": 1.8057345431306828,
"learning_rate": 6.810760126439285e-08,
"loss": 0.4636,
"step": 595
},
{
"epoch": 0.9439714907939022,
"grad_norm": 1.581711206380294,
"learning_rate": 6.438644106906866e-08,
"loss": 0.4735,
"step": 596
},
{
"epoch": 0.9455553355771135,
"grad_norm": 2.0406365820491277,
"learning_rate": 6.076898795116792e-08,
"loss": 0.5349,
"step": 597
},
{
"epoch": 0.9471391803603247,
"grad_norm": 1.3665960620474567,
"learning_rate": 5.7255337233944376e-08,
"loss": 0.5118,
"step": 598
},
{
"epoch": 0.9487230251435359,
"grad_norm": 1.4545280151752455,
"learning_rate": 5.3845581505362005e-08,
"loss": 0.4706,
"step": 599
},
{
"epoch": 0.9503068699267472,
"grad_norm": 1.6568106629852815,
"learning_rate": 5.05398106156556e-08,
"loss": 0.4208,
"step": 600
},
{
"epoch": 0.9518907147099585,
"grad_norm": 1.3009128558987804,
"learning_rate": 4.733811167496249e-08,
"loss": 0.4618,
"step": 601
},
{
"epoch": 0.9534745594931696,
"grad_norm": 1.6350507056865986,
"learning_rate": 4.4240569051027466e-08,
"loss": 0.4818,
"step": 602
},
{
"epoch": 0.9550584042763809,
"grad_norm": 1.7692229180589922,
"learning_rate": 4.124726436697878e-08,
"loss": 0.4662,
"step": 603
},
{
"epoch": 0.9566422490595922,
"grad_norm": 1.4494075240429927,
"learning_rate": 3.8358276499179664e-08,
"loss": 0.6283,
"step": 604
},
{
"epoch": 0.9582260938428034,
"grad_norm": 1.5176956748111556,
"learning_rate": 3.557368157514595e-08,
"loss": 0.4618,
"step": 605
},
{
"epoch": 0.9598099386260146,
"grad_norm": 1.5615472024568975,
"learning_rate": 3.2893552971545056e-08,
"loss": 0.3706,
"step": 606
},
{
"epoch": 0.9613937834092259,
"grad_norm": 2.0109994652257055,
"learning_rate": 3.031796131225706e-08,
"loss": 0.4617,
"step": 607
},
{
"epoch": 0.9629776281924372,
"grad_norm": 1.6571285729822172,
"learning_rate": 2.7846974466517957e-08,
"loss": 0.4621,
"step": 608
},
{
"epoch": 0.9645614729756484,
"grad_norm": 1.9648211635566186,
"learning_rate": 2.5480657547129135e-08,
"loss": 0.5031,
"step": 609
},
{
"epoch": 0.9661453177588596,
"grad_norm": 1.3759221221620814,
"learning_rate": 2.3219072908742253e-08,
"loss": 0.4284,
"step": 610
},
{
"epoch": 0.9677291625420709,
"grad_norm": 1.5776287851272248,
"learning_rate": 2.106228014621525e-08,
"loss": 0.4965,
"step": 611
},
{
"epoch": 0.9693130073252821,
"grad_norm": 1.4564588215310144,
"learning_rate": 1.901033609304381e-08,
"loss": 0.5313,
"step": 612
},
{
"epoch": 0.9708968521084934,
"grad_norm": 1.4650178606419144,
"learning_rate": 1.706329481986213e-08,
"loss": 0.4695,
"step": 613
},
{
"epoch": 0.9724806968917046,
"grad_norm": 8.364306128275436,
"learning_rate": 1.522120763301782e-08,
"loss": 0.5742,
"step": 614
},
{
"epoch": 0.9740645416749159,
"grad_norm": 1.5436098149612576,
"learning_rate": 1.348412307322233e-08,
"loss": 0.4163,
"step": 615
},
{
"epoch": 0.9756483864581271,
"grad_norm": 1.4282895143928616,
"learning_rate": 1.1852086914268423e-08,
"loss": 0.5281,
"step": 616
},
{
"epoch": 0.9772322312413384,
"grad_norm": 1.2864356451763799,
"learning_rate": 1.032514216182756e-08,
"loss": 0.6423,
"step": 617
},
{
"epoch": 0.9788160760245496,
"grad_norm": 1.3530070123033577,
"learning_rate": 8.903329052313502e-09,
"loss": 0.4773,
"step": 618
},
{
"epoch": 0.9803999208077608,
"grad_norm": 1.4191474026879363,
"learning_rate": 7.586685051823583e-09,
"loss": 0.5286,
"step": 619
},
{
"epoch": 0.9819837655909721,
"grad_norm": 1.6895691048157127,
"learning_rate": 6.375244855152839e-09,
"loss": 0.5007,
"step": 620
},
{
"epoch": 0.9835676103741833,
"grad_norm": 1.414899383758995,
"learning_rate": 5.269040384876078e-09,
"loss": 0.4556,
"step": 621
},
{
"epoch": 0.9851514551573946,
"grad_norm": 1.8800481913954115,
"learning_rate": 4.2681007905103206e-09,
"loss": 0.5251,
"step": 622
},
{
"epoch": 0.9867352999406058,
"grad_norm": 1.3389313374375587,
"learning_rate": 3.372452447744756e-09,
"loss": 0.3714,
"step": 623
},
{
"epoch": 0.9883191447238171,
"grad_norm": 1.274196273086913,
"learning_rate": 2.582118957745738e-09,
"loss": 0.4538,
"step": 624
},
{
"epoch": 0.9899029895070283,
"grad_norm": 1.5641909121422626,
"learning_rate": 1.8971211465363955e-09,
"loss": 0.445,
"step": 625
},
{
"epoch": 0.9914868342902395,
"grad_norm": 1.431529316600778,
"learning_rate": 1.31747706444596e-09,
"loss": 0.4297,
"step": 626
},
{
"epoch": 0.9930706790734508,
"grad_norm": 1.3641264104243216,
"learning_rate": 8.432019856345896e-10,
"loss": 0.4418,
"step": 627
},
{
"epoch": 0.9946545238566621,
"grad_norm": 1.409407684955624,
"learning_rate": 4.743084076923587e-10,
"loss": 0.5325,
"step": 628
},
{
"epoch": 0.9962383686398733,
"grad_norm": 1.4066497217484986,
"learning_rate": 2.108060513075216e-10,
"loss": 0.4452,
"step": 629
},
{
"epoch": 0.9978222134230845,
"grad_norm": 1.5846124004550377,
"learning_rate": 5.270186001249399e-11,
"loss": 0.3571,
"step": 630
},
{
"epoch": 0.9978222134230845,
"eval_accuracy": 0.8122857271681789,
"eval_loss": 0.5811628699302673,
"eval_perplexity": 1.2019592776188088,
"eval_runtime": 530.5565,
"eval_samples_per_second": 1.434,
"eval_steps_per_second": 1.434,
"step": 630
},
{
"epoch": 0.9994060582062958,
"grad_norm": 1.4491753654503536,
"learning_rate": 0.0,
"loss": 0.5137,
"step": 631
},
{
"epoch": 0.9994060582062958,
"step": 631,
"total_flos": 132105476505600.0,
"train_loss": 0.5424914620757669,
"train_runtime": 50500.2662,
"train_samples_per_second": 0.2,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1.0,
"max_steps": 631,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 132105476505600.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}