|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994060582062958,
  "eval_steps": 210,
  "global_step": 631,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0015838447832112453,
      "grad_norm": 20.634467679531244,
      "learning_rate": 4.2105263157894733e-07,
      "loss": 1.0835,
      "step": 1
    },
    {
      "epoch": 0.0031676895664224905,
      "grad_norm": 14.28313933877416,
      "learning_rate": 8.421052631578947e-07,
      "loss": 0.9055,
      "step": 2
    },
    {
      "epoch": 0.004751534349633736,
      "grad_norm": 24.632900629424984,
      "learning_rate": 1.263157894736842e-06,
      "loss": 0.9363,
      "step": 3
    },
    {
      "epoch": 0.006335379132844981,
      "grad_norm": 14.378103932792914,
      "learning_rate": 1.6842105263157893e-06,
      "loss": 0.9769,
      "step": 4
    },
    {
      "epoch": 0.007919223916056227,
      "grad_norm": 12.06138179201479,
      "learning_rate": 2.1052631578947366e-06,
      "loss": 0.879,
      "step": 5
    },
    {
      "epoch": 0.009503068699267472,
      "grad_norm": 8.884749518187892,
      "learning_rate": 2.526315789473684e-06,
      "loss": 0.8971,
      "step": 6
    },
    {
      "epoch": 0.011086913482478717,
      "grad_norm": 4.885104408560213,
      "learning_rate": 2.9473684210526313e-06,
      "loss": 0.8804,
      "step": 7
    },
    {
      "epoch": 0.012670758265689962,
      "grad_norm": 6.068115898146199,
      "learning_rate": 3.3684210526315786e-06,
      "loss": 0.7672,
      "step": 8
    },
    {
      "epoch": 0.014254603048901207,
      "grad_norm": 5.471835730774232,
      "learning_rate": 3.789473684210526e-06,
      "loss": 0.7564,
      "step": 9
    },
    {
      "epoch": 0.015838447832112454,
      "grad_norm": 4.785289669398289,
      "learning_rate": 4.210526315789473e-06,
      "loss": 0.8132,
      "step": 10
    },
    {
      "epoch": 0.017422292615323697,
      "grad_norm": 6.148192535233664,
      "learning_rate": 4.631578947368421e-06,
      "loss": 0.788,
      "step": 11
    },
    {
      "epoch": 0.019006137398534944,
      "grad_norm": 3.1102232110909718,
      "learning_rate": 5.052631578947368e-06,
      "loss": 0.7744,
      "step": 12
    },
    {
      "epoch": 0.020589982181746187,
      "grad_norm": 2.2116201137156697,
      "learning_rate": 5.473684210526316e-06,
      "loss": 0.705,
      "step": 13
    },
    {
      "epoch": 0.022173826964957434,
      "grad_norm": 2.3891700304125965,
      "learning_rate": 5.894736842105263e-06,
      "loss": 0.782,
      "step": 14
    },
    {
      "epoch": 0.02375767174816868,
      "grad_norm": 2.6800168254599552,
      "learning_rate": 6.31578947368421e-06,
      "loss": 0.6875,
      "step": 15
    },
    {
      "epoch": 0.025341516531379924,
      "grad_norm": 2.069273801603203,
      "learning_rate": 6.736842105263157e-06,
      "loss": 0.7177,
      "step": 16
    },
    {
      "epoch": 0.02692536131459117,
      "grad_norm": 1.9967678968867362,
      "learning_rate": 7.157894736842105e-06,
      "loss": 0.6317,
      "step": 17
    },
    {
      "epoch": 0.028509206097802414,
      "grad_norm": 1.8973112856460428,
      "learning_rate": 7.578947368421052e-06,
      "loss": 0.668,
      "step": 18
    },
    {
      "epoch": 0.03009305088101366,
      "grad_norm": 1.6427223591398545,
      "learning_rate": 8e-06,
      "loss": 0.6237,
      "step": 19
    },
    {
      "epoch": 0.03167689566422491,
      "grad_norm": 1.8126563046343525,
      "learning_rate": 7.999947298139988e-06,
      "loss": 0.7286,
      "step": 20
    },
    {
      "epoch": 0.033260740447436155,
      "grad_norm": 2.0321574723423437,
      "learning_rate": 7.999789193948692e-06,
      "loss": 0.6975,
      "step": 21
    },
    {
      "epoch": 0.034844585230647394,
      "grad_norm": 1.6806869406461464,
      "learning_rate": 7.999525691592307e-06,
      "loss": 0.577,
      "step": 22
    },
    {
      "epoch": 0.03642843001385864,
      "grad_norm": 1.9508098776351803,
      "learning_rate": 7.999156798014364e-06,
      "loss": 0.7186,
      "step": 23
    },
    {
      "epoch": 0.03801227479706989,
      "grad_norm": 1.818938048059436,
      "learning_rate": 7.998682522935554e-06,
      "loss": 0.6235,
      "step": 24
    },
    {
      "epoch": 0.039596119580281135,
      "grad_norm": 1.763078793834794,
      "learning_rate": 7.998102878853464e-06,
      "loss": 0.6797,
      "step": 25
    },
    {
      "epoch": 0.041179964363492375,
      "grad_norm": 1.7024348305084676,
      "learning_rate": 7.997417881042254e-06,
      "loss": 0.6611,
      "step": 26
    },
    {
      "epoch": 0.04276380914670362,
      "grad_norm": 2.651929076311587,
      "learning_rate": 7.996627547552254e-06,
      "loss": 0.6378,
      "step": 27
    },
    {
      "epoch": 0.04434765392991487,
      "grad_norm": 1.8122638725536788,
      "learning_rate": 7.99573189920949e-06,
      "loss": 0.6018,
      "step": 28
    },
    {
      "epoch": 0.045931498713126115,
      "grad_norm": 1.715907567678854,
      "learning_rate": 7.994730959615124e-06,
      "loss": 0.5851,
      "step": 29
    },
    {
      "epoch": 0.04751534349633736,
      "grad_norm": 4.091743172227538,
      "learning_rate": 7.993624755144846e-06,
      "loss": 0.7245,
      "step": 30
    },
    {
      "epoch": 0.0490991882795486,
      "grad_norm": 2.044771221320041,
      "learning_rate": 7.992413314948177e-06,
      "loss": 0.5109,
      "step": 31
    },
    {
      "epoch": 0.05068303306275985,
      "grad_norm": 1.7849876948071914,
      "learning_rate": 7.991096670947687e-06,
      "loss": 0.669,
      "step": 32
    },
    {
      "epoch": 0.052266877845971095,
      "grad_norm": 2.0286922346779614,
      "learning_rate": 7.989674857838172e-06,
      "loss": 0.6604,
      "step": 33
    },
    {
      "epoch": 0.05385072262918234,
      "grad_norm": 1.5979263059089825,
      "learning_rate": 7.988147913085731e-06,
      "loss": 0.6031,
      "step": 34
    },
    {
      "epoch": 0.05543456741239359,
      "grad_norm": 1.8983896550694437,
      "learning_rate": 7.986515876926776e-06,
      "loss": 0.6479,
      "step": 35
    },
    {
      "epoch": 0.05701841219560483,
      "grad_norm": 1.5118226263390602,
      "learning_rate": 7.984778792366982e-06,
      "loss": 0.7121,
      "step": 36
    },
    {
      "epoch": 0.058602256978816075,
      "grad_norm": 1.654106143113525,
      "learning_rate": 7.982936705180138e-06,
      "loss": 0.5762,
      "step": 37
    },
    {
      "epoch": 0.06018610176202732,
      "grad_norm": 1.656696649887629,
      "learning_rate": 7.980989663906955e-06,
      "loss": 0.5968,
      "step": 38
    },
    {
      "epoch": 0.06176994654523857,
      "grad_norm": 1.6128252735493993,
      "learning_rate": 7.978937719853785e-06,
      "loss": 0.6237,
      "step": 39
    },
    {
      "epoch": 0.06335379132844982,
      "grad_norm": 1.8318236683351248,
      "learning_rate": 7.976780927091259e-06,
      "loss": 0.5261,
      "step": 40
    },
    {
      "epoch": 0.06493763611166106,
      "grad_norm": 1.7317276246642117,
      "learning_rate": 7.97451934245287e-06,
      "loss": 0.6436,
      "step": 41
    },
    {
      "epoch": 0.06652148089487231,
      "grad_norm": 1.8588379937728277,
      "learning_rate": 7.97215302553348e-06,
      "loss": 0.6273,
      "step": 42
    },
    {
      "epoch": 0.06810532567808354,
      "grad_norm": 1.478388678926987,
      "learning_rate": 7.969682038687744e-06,
      "loss": 0.6108,
      "step": 43
    },
    {
      "epoch": 0.06968917046129479,
      "grad_norm": 1.6631272451799215,
      "learning_rate": 7.967106447028455e-06,
      "loss": 0.6865,
      "step": 44
    },
    {
      "epoch": 0.07127301524450604,
      "grad_norm": 1.736621303615563,
      "learning_rate": 7.964426318424854e-06,
      "loss": 0.6554,
      "step": 45
    },
    {
      "epoch": 0.07285686002771728,
      "grad_norm": 1.7271619263651752,
      "learning_rate": 7.96164172350082e-06,
      "loss": 0.6234,
      "step": 46
    },
    {
      "epoch": 0.07444070481092853,
      "grad_norm": 1.8475249444720505,
      "learning_rate": 7.958752735633022e-06,
      "loss": 0.6844,
      "step": 47
    },
    {
      "epoch": 0.07602454959413978,
      "grad_norm": 1.5407374760971033,
      "learning_rate": 7.955759430948973e-06,
      "loss": 0.5816,
      "step": 48
    },
    {
      "epoch": 0.07760839437735102,
      "grad_norm": 1.6776551127492765,
      "learning_rate": 7.952661888325037e-06,
      "loss": 0.5999,
      "step": 49
    },
    {
      "epoch": 0.07919223916056227,
      "grad_norm": 1.674394955432216,
      "learning_rate": 7.949460189384344e-06,
      "loss": 0.6809,
      "step": 50
    },
    {
      "epoch": 0.08077608394377352,
      "grad_norm": 1.71808445149699,
      "learning_rate": 7.946154418494638e-06,
      "loss": 0.6433,
      "step": 51
    },
    {
      "epoch": 0.08235992872698475,
      "grad_norm": 1.5598421305012693,
      "learning_rate": 7.942744662766056e-06,
      "loss": 0.6166,
      "step": 52
    },
    {
      "epoch": 0.083943773510196,
      "grad_norm": 2.134433653759547,
      "learning_rate": 7.939231012048832e-06,
      "loss": 0.6213,
      "step": 53
    },
    {
      "epoch": 0.08552761829340724,
      "grad_norm": 1.513738891290282,
      "learning_rate": 7.935613558930931e-06,
      "loss": 0.5797,
      "step": 54
    },
    {
      "epoch": 0.08711146307661849,
      "grad_norm": 1.8993782752151005,
      "learning_rate": 7.931892398735607e-06,
      "loss": 0.6062,
      "step": 55
    },
    {
      "epoch": 0.08869530785982974,
      "grad_norm": 1.6362105443395145,
      "learning_rate": 7.92806762951889e-06,
      "loss": 0.6701,
      "step": 56
    },
    {
      "epoch": 0.09027915264304098,
      "grad_norm": 1.6152582877305293,
      "learning_rate": 7.92413935206701e-06,
      "loss": 0.5999,
      "step": 57
    },
    {
      "epoch": 0.09186299742625223,
      "grad_norm": 1.5796190225435545,
      "learning_rate": 7.920107669893728e-06,
      "loss": 0.6414,
      "step": 58
    },
    {
      "epoch": 0.09344684220946348,
      "grad_norm": 1.5796707579391902,
      "learning_rate": 7.915972689237618e-06,
      "loss": 0.6168,
      "step": 59
    },
    {
      "epoch": 0.09503068699267472,
      "grad_norm": 1.4556074881511978,
      "learning_rate": 7.911734519059266e-06,
      "loss": 0.6623,
      "step": 60
    },
    {
      "epoch": 0.09661453177588597,
      "grad_norm": 1.5078600739444816,
      "learning_rate": 7.907393271038402e-06,
      "loss": 0.6546,
      "step": 61
    },
    {
      "epoch": 0.0981983765590972,
      "grad_norm": 1.590183144064633,
      "learning_rate": 7.902949059570945e-06,
      "loss": 0.6175,
      "step": 62
    },
    {
      "epoch": 0.09978222134230845,
      "grad_norm": 1.665920028614456,
      "learning_rate": 7.898402001766002e-06,
      "loss": 0.6758,
      "step": 63
    },
    {
      "epoch": 0.1013660661255197,
      "grad_norm": 1.6053295719352823,
      "learning_rate": 7.89375221744277e-06,
      "loss": 0.5228,
      "step": 64
    },
    {
      "epoch": 0.10294991090873094,
      "grad_norm": 1.538829090522207,
      "learning_rate": 7.888999829127398e-06,
      "loss": 0.5839,
      "step": 65
    },
    {
      "epoch": 0.10453375569194219,
      "grad_norm": 1.4764506284200576,
      "learning_rate": 7.884144962049733e-06,
      "loss": 0.5371,
      "step": 66
    },
    {
      "epoch": 0.10611760047515344,
      "grad_norm": 1.7787882173817327,
      "learning_rate": 7.879187441400039e-06,
      "loss": 0.5687,
      "step": 67
    },
    {
      "epoch": 0.10770144525836468,
      "grad_norm": 1.6757502009522847,
      "learning_rate": 7.874128306025616e-06,
      "loss": 0.6602,
      "step": 68
    },
    {
      "epoch": 0.10928529004157593,
      "grad_norm": 1.5384088321644194,
      "learning_rate": 7.868966781027365e-06,
      "loss": 0.5848,
      "step": 69
    },
    {
      "epoch": 0.11086913482478718,
      "grad_norm": 1.3974387424168984,
      "learning_rate": 7.863703305156273e-06,
      "loss": 0.4492,
      "step": 70
    },
    {
      "epoch": 0.11245297960799841,
      "grad_norm": 1.4838230417656662,
      "learning_rate": 7.858338017109821e-06,
      "loss": 0.5937,
      "step": 71
    },
    {
      "epoch": 0.11403682439120966,
      "grad_norm": 1.4619003743991448,
      "learning_rate": 7.852871058268338e-06,
      "loss": 0.6154,
      "step": 72
    },
    {
      "epoch": 0.1156206691744209,
      "grad_norm": 1.4943665339665302,
      "learning_rate": 7.847302572691277e-06,
      "loss": 0.6561,
      "step": 73
    },
    {
      "epoch": 0.11720451395763215,
      "grad_norm": 1.4942484366322168,
      "learning_rate": 7.841632707113408e-06,
      "loss": 0.6133,
      "step": 74
    },
    {
      "epoch": 0.1187883587408434,
      "grad_norm": 1.6703395416125706,
      "learning_rate": 7.835861610940964e-06,
      "loss": 0.5878,
      "step": 75
    },
    {
      "epoch": 0.12037220352405464,
      "grad_norm": 1.5664444643827788,
      "learning_rate": 7.829989436247697e-06,
      "loss": 0.7375,
      "step": 76
    },
    {
      "epoch": 0.12195604830726589,
      "grad_norm": 1.4638878839717737,
      "learning_rate": 7.824016337770871e-06,
      "loss": 0.5211,
      "step": 77
    },
    {
      "epoch": 0.12353989309047714,
      "grad_norm": 1.787375030347028,
      "learning_rate": 7.817942472907183e-06,
      "loss": 0.5647,
      "step": 78
    },
    {
      "epoch": 0.12512373787368838,
      "grad_norm": 1.884380332078967,
      "learning_rate": 7.811768001708626e-06,
      "loss": 0.6087,
      "step": 79
    },
    {
      "epoch": 0.12670758265689963,
      "grad_norm": 1.5929236515264444,
      "learning_rate": 7.805493086878254e-06,
      "loss": 0.6642,
      "step": 80
    },
    {
      "epoch": 0.12829142744011088,
      "grad_norm": 1.5772424009229602,
      "learning_rate": 7.799117893765911e-06,
      "loss": 0.5835,
      "step": 81
    },
    {
      "epoch": 0.12987527222332212,
      "grad_norm": 1.7489491911949495,
      "learning_rate": 7.792642590363864e-06,
      "loss": 0.6714,
      "step": 82
    },
    {
      "epoch": 0.13145911700653337,
      "grad_norm": 1.6319871292096388,
      "learning_rate": 7.786067347302378e-06,
      "loss": 0.6794,
      "step": 83
    },
    {
      "epoch": 0.13304296178974462,
      "grad_norm": 1.4796412781963182,
      "learning_rate": 7.779392337845224e-06,
      "loss": 0.5173,
      "step": 84
    },
    {
      "epoch": 0.13462680657295584,
      "grad_norm": 1.582366953055958,
      "learning_rate": 7.772617737885109e-06,
      "loss": 0.6008,
      "step": 85
    },
    {
      "epoch": 0.13621065135616708,
      "grad_norm": 1.4986337444583717,
      "learning_rate": 7.765743725939044e-06,
      "loss": 0.6157,
      "step": 86
    },
    {
      "epoch": 0.13779449613937833,
      "grad_norm": 1.577968697496146,
      "learning_rate": 7.758770483143633e-06,
      "loss": 0.5652,
      "step": 87
    },
    {
      "epoch": 0.13937834092258958,
      "grad_norm": 1.6508937367068126,
      "learning_rate": 7.751698193250313e-06,
      "loss": 0.5759,
      "step": 88
    },
    {
      "epoch": 0.14096218570580082,
      "grad_norm": 1.7727311004928359,
      "learning_rate": 7.744527042620495e-06,
      "loss": 0.621,
      "step": 89
    },
    {
      "epoch": 0.14254603048901207,
      "grad_norm": 1.5183748104433379,
      "learning_rate": 7.737257220220672e-06,
      "loss": 0.6053,
      "step": 90
    },
    {
      "epoch": 0.14412987527222332,
      "grad_norm": 1.5930246968878465,
      "learning_rate": 7.729888917617423e-06,
      "loss": 0.5267,
      "step": 91
    },
    {
      "epoch": 0.14571372005543456,
      "grad_norm": 1.5445708944187808,
      "learning_rate": 7.722422328972375e-06,
      "loss": 0.5988,
      "step": 92
    },
    {
      "epoch": 0.1472975648386458,
      "grad_norm": 1.644950232043727,
      "learning_rate": 7.71485765103708e-06,
      "loss": 0.523,
      "step": 93
    },
    {
      "epoch": 0.14888140962185706,
      "grad_norm": 1.582899153506888,
      "learning_rate": 7.707195083147842e-06,
      "loss": 0.5703,
      "step": 94
    },
    {
      "epoch": 0.1504652544050683,
      "grad_norm": 1.5412622272946352,
      "learning_rate": 7.699434827220446e-06,
      "loss": 0.6049,
      "step": 95
    },
    {
      "epoch": 0.15204909918827955,
      "grad_norm": 1.3877844675757782,
      "learning_rate": 7.691577087744858e-06,
      "loss": 0.6088,
      "step": 96
    },
    {
      "epoch": 0.1536329439714908,
      "grad_norm": 1.7758770361443341,
      "learning_rate": 7.683622071779814e-06,
      "loss": 0.5779,
      "step": 97
    },
    {
      "epoch": 0.15521678875470205,
      "grad_norm": 1.727972351073385,
      "learning_rate": 7.675569988947388e-06,
      "loss": 0.6189,
      "step": 98
    },
    {
      "epoch": 0.1568006335379133,
      "grad_norm": 1.726280844012036,
      "learning_rate": 7.66742105142745e-06,
      "loss": 0.6134,
      "step": 99
    },
    {
      "epoch": 0.15838447832112454,
      "grad_norm": 1.478777494556154,
      "learning_rate": 7.659175473952084e-06,
      "loss": 0.5614,
      "step": 100
    },
    {
      "epoch": 0.15996832310433579,
      "grad_norm": 1.5790584926585527,
      "learning_rate": 7.65083347379992e-06,
      "loss": 0.659,
      "step": 101
    },
    {
      "epoch": 0.16155216788754703,
      "grad_norm": 1.5037297538511123,
      "learning_rate": 7.642395270790426e-06,
      "loss": 0.4981,
      "step": 102
    },
    {
      "epoch": 0.16313601267075828,
      "grad_norm": 1.5590318704111625,
      "learning_rate": 7.633861087278093e-06,
      "loss": 0.5807,
      "step": 103
    },
    {
      "epoch": 0.1647198574539695,
      "grad_norm": 1.4332619998935698,
      "learning_rate": 7.6252311481465996e-06,
      "loss": 0.6309,
      "step": 104
    },
    {
      "epoch": 0.16630370223718074,
      "grad_norm": 1.4427352973548397,
      "learning_rate": 7.616505680802863e-06,
      "loss": 0.6623,
      "step": 105
    },
    {
      "epoch": 0.167887547020392,
      "grad_norm": 1.6827061244824466,
      "learning_rate": 7.607684915171065e-06,
      "loss": 0.6589,
      "step": 106
    },
    {
      "epoch": 0.16947139180360324,
      "grad_norm": 1.5663175326090033,
      "learning_rate": 7.598769083686582e-06,
      "loss": 0.627,
      "step": 107
    },
    {
      "epoch": 0.17105523658681449,
      "grad_norm": 1.423604206610221,
      "learning_rate": 7.589758421289864e-06,
      "loss": 0.6335,
      "step": 108
    },
    {
      "epoch": 0.17263908137002573,
      "grad_norm": 1.4675532083940528,
      "learning_rate": 7.58065316542025e-06,
      "loss": 0.59,
      "step": 109
    },
    {
      "epoch": 0.17422292615323698,
      "grad_norm": 1.8233813273626727,
      "learning_rate": 7.571453556009695e-06,
      "loss": 0.5213,
      "step": 110
    },
    {
      "epoch": 0.17580677093644823,
      "grad_norm": 1.8255857113037794,
      "learning_rate": 7.562159835476465e-06,
      "loss": 0.6255,
      "step": 111
    },
    {
      "epoch": 0.17739061571965947,
      "grad_norm": 1.4953012811749822,
      "learning_rate": 7.552772248718739e-06,
      "loss": 0.6206,
      "step": 112
    },
    {
      "epoch": 0.17897446050287072,
      "grad_norm": 1.56255118528589,
      "learning_rate": 7.5432910431081586e-06,
      "loss": 0.5783,
      "step": 113
    },
    {
      "epoch": 0.18055830528608197,
      "grad_norm": 1.664536250355845,
      "learning_rate": 7.533716468483311e-06,
      "loss": 0.6409,
      "step": 114
    },
    {
      "epoch": 0.1821421500692932,
      "grad_norm": 1.4958558339075811,
      "learning_rate": 7.524048777143137e-06,
      "loss": 0.569,
      "step": 115
    },
    {
      "epoch": 0.18372599485250446,
      "grad_norm": 1.4039476439318943,
      "learning_rate": 7.5142882238403e-06,
      "loss": 0.6021,
      "step": 116
    },
    {
      "epoch": 0.1853098396357157,
      "grad_norm": 1.6306064613151194,
      "learning_rate": 7.504435065774454e-06,
      "loss": 0.7385,
      "step": 117
    },
    {
      "epoch": 0.18689368441892695,
      "grad_norm": 1.668079082925941,
      "learning_rate": 7.494489562585478e-06,
      "loss": 0.5724,
      "step": 118
    },
    {
      "epoch": 0.1884775292021382,
      "grad_norm": 1.434447147714598,
      "learning_rate": 7.48445197634663e-06,
      "loss": 0.6092,
      "step": 119
    },
    {
      "epoch": 0.19006137398534945,
      "grad_norm": 1.984109033026855,
      "learning_rate": 7.474322571557644e-06,
      "loss": 0.6691,
      "step": 120
    },
    {
      "epoch": 0.1916452187685607,
      "grad_norm": 1.5769599681060413,
      "learning_rate": 7.4641016151377545e-06,
      "loss": 0.6061,
      "step": 121
    },
    {
      "epoch": 0.19322906355177194,
      "grad_norm": 1.384122268885966,
      "learning_rate": 7.45378937641867e-06,
      "loss": 0.5634,
      "step": 122
    },
    {
      "epoch": 0.19481290833498316,
      "grad_norm": 1.498669953718918,
      "learning_rate": 7.44338612713747e-06,
      "loss": 0.5627,
      "step": 123
    },
    {
      "epoch": 0.1963967531181944,
      "grad_norm": 1.4288041695671259,
      "learning_rate": 7.43289214142945e-06,
      "loss": 0.5726,
      "step": 124
    },
    {
      "epoch": 0.19798059790140565,
      "grad_norm": 1.5646983954502622,
      "learning_rate": 7.422307695820892e-06,
      "loss": 0.623,
      "step": 125
    },
    {
      "epoch": 0.1995644426846169,
      "grad_norm": 1.7202467615332813,
      "learning_rate": 7.411633069221782e-06,
      "loss": 0.6123,
      "step": 126
    },
    {
      "epoch": 0.20114828746782815,
      "grad_norm": 1.767901675287711,
      "learning_rate": 7.400868542918457e-06,
      "loss": 0.5208,
      "step": 127
    },
    {
      "epoch": 0.2027321322510394,
      "grad_norm": 1.5246478568278783,
      "learning_rate": 7.390014400566196e-06,
      "loss": 0.5708,
      "step": 128
    },
    {
      "epoch": 0.20431597703425064,
      "grad_norm": 1.6477524599468607,
      "learning_rate": 7.379070928181746e-06,
      "loss": 0.5288,
      "step": 129
    },
    {
      "epoch": 0.2058998218174619,
      "grad_norm": 1.4399111543584902,
      "learning_rate": 7.3680384141357805e-06,
      "loss": 0.5898,
      "step": 130
    },
    {
      "epoch": 0.20748366660067313,
      "grad_norm": 1.5830239221298632,
      "learning_rate": 7.356917149145307e-06,
      "loss": 0.5797,
      "step": 131
    },
    {
      "epoch": 0.20906751138388438,
      "grad_norm": 1.403523262072001,
      "learning_rate": 7.3457074262659974e-06,
      "loss": 0.5581,
      "step": 132
    },
    {
      "epoch": 0.21065135616709563,
      "grad_norm": 1.7026128256384587,
      "learning_rate": 7.334409540884478e-06,
      "loss": 0.5859,
      "step": 133
    },
    {
      "epoch": 0.21223520095030687,
      "grad_norm": 1.4478577966897537,
      "learning_rate": 7.323023790710534e-06,
      "loss": 0.6038,
      "step": 134
    },
    {
      "epoch": 0.21381904573351812,
      "grad_norm": 1.5371200095544344,
      "learning_rate": 7.3115504757692715e-06,
      "loss": 0.5528,
      "step": 135
    },
    {
      "epoch": 0.21540289051672937,
      "grad_norm": 1.4893093530093662,
      "learning_rate": 7.299989898393209e-06,
      "loss": 0.6717,
      "step": 136
    },
    {
      "epoch": 0.21698673529994061,
      "grad_norm": 1.3874990168288865,
      "learning_rate": 7.288342363214313e-06,
      "loss": 0.5586,
      "step": 137
    },
    {
      "epoch": 0.21857058008315186,
      "grad_norm": 1.5863006868461444,
      "learning_rate": 7.276608177155967e-06,
      "loss": 0.4951,
      "step": 138
    },
    {
      "epoch": 0.2201544248663631,
      "grad_norm": 1.489868624576753,
      "learning_rate": 7.264787649424887e-06,
      "loss": 0.5833,
      "step": 139
    },
    {
      "epoch": 0.22173826964957435,
      "grad_norm": 1.3377913662186058,
      "learning_rate": 7.2528810915029705e-06,
      "loss": 0.6079,
      "step": 140
    },
    {
      "epoch": 0.22332211443278557,
      "grad_norm": 1.4551371038570358,
      "learning_rate": 7.240888817139094e-06,
      "loss": 0.629,
      "step": 141
    },
    {
      "epoch": 0.22490595921599682,
      "grad_norm": 1.4800684134742632,
      "learning_rate": 7.228811142340838e-06,
      "loss": 0.5218,
      "step": 142
    },
    {
      "epoch": 0.22648980399920807,
      "grad_norm": 1.655894288397452,
      "learning_rate": 7.2166483853661666e-06,
      "loss": 0.5851,
      "step": 143
    },
    {
      "epoch": 0.2280736487824193,
      "grad_norm": 1.5072103166941109,
      "learning_rate": 7.204400866715038e-06,
      "loss": 0.5484,
      "step": 144
    },
    {
      "epoch": 0.22965749356563056,
      "grad_norm": 1.5118758556170837,
      "learning_rate": 7.192068909120959e-06,
      "loss": 0.6607,
      "step": 145
    },
    {
      "epoch": 0.2312413383488418,
      "grad_norm": 1.6979615740477183,
      "learning_rate": 7.179652837542479e-06,
      "loss": 0.6278,
      "step": 146
    },
    {
      "epoch": 0.23282518313205305,
      "grad_norm": 1.4816588827641894,
      "learning_rate": 7.167152979154632e-06,
      "loss": 0.5747,
      "step": 147
    },
    {
      "epoch": 0.2344090279152643,
      "grad_norm": 1.477778782495537,
      "learning_rate": 7.154569663340312e-06,
      "loss": 0.6037,
      "step": 148
    },
    {
      "epoch": 0.23599287269847555,
      "grad_norm": 1.524987409114406,
      "learning_rate": 7.141903221681595e-06,
      "loss": 0.5202,
      "step": 149
    },
    {
      "epoch": 0.2375767174816868,
      "grad_norm": 1.4531762635508665,
      "learning_rate": 7.1291539879509956e-06,
      "loss": 0.6053,
      "step": 150
    },
    {
      "epoch": 0.23916056226489804,
      "grad_norm": 1.6848330098071658,
      "learning_rate": 7.116322298102681e-06,
      "loss": 0.5205,
      "step": 151
    },
    {
      "epoch": 0.2407444070481093,
      "grad_norm": 1.53168338669591,
      "learning_rate": 7.1034084902636125e-06,
      "loss": 0.5588,
      "step": 152
    },
    {
      "epoch": 0.24232825183132053,
      "grad_norm": 1.7084343204301633,
      "learning_rate": 7.090412904724635e-06,
      "loss": 0.6402,
      "step": 153
    },
    {
      "epoch": 0.24391209661453178,
      "grad_norm": 1.8530857047977658,
      "learning_rate": 7.077335883931516e-06,
      "loss": 0.5897,
      "step": 154
    },
    {
      "epoch": 0.24549594139774303,
      "grad_norm": 1.5361296146734253,
      "learning_rate": 7.064177772475912e-06,
      "loss": 0.542,
      "step": 155
    },
    {
      "epoch": 0.24707978618095428,
      "grad_norm": 1.6919147409675659,
      "learning_rate": 7.050938917086298e-06,
      "loss": 0.6055,
      "step": 156
    },
    {
      "epoch": 0.24866363096416552,
      "grad_norm": 1.5618314491824075,
      "learning_rate": 7.037619666618829e-06,
      "loss": 0.535,
      "step": 157
    },
    {
      "epoch": 0.25024747574737677,
      "grad_norm": 1.4872965507690663,
      "learning_rate": 7.024220372048137e-06,
      "loss": 0.5813,
      "step": 158
    },
    {
      "epoch": 0.251831320530588,
      "grad_norm": 1.6363093857295996,
      "learning_rate": 7.010741386458098e-06,
      "loss": 0.4529,
      "step": 159
    },
    {
      "epoch": 0.25341516531379926,
      "grad_norm": 2.056313824339786,
      "learning_rate": 6.997183065032517e-06,
      "loss": 0.5332,
      "step": 160
    },
    {
      "epoch": 0.2549990100970105,
      "grad_norm": 1.5062280307878337,
      "learning_rate": 6.983545765045774e-06,
      "loss": 0.5586,
      "step": 161
    },
    {
      "epoch": 0.25658285488022176,
      "grad_norm": 1.7057261325931998,
      "learning_rate": 6.969829845853404e-06,
      "loss": 0.5615,
      "step": 162
    },
    {
      "epoch": 0.258166699663433,
      "grad_norm": 1.5223711914805576,
      "learning_rate": 6.956035668882636e-06,
      "loss": 0.5553,
      "step": 163
    },
    {
      "epoch": 0.25975054444664425,
      "grad_norm": 1.7269587442848504,
      "learning_rate": 6.942163597622862e-06,
      "loss": 0.5982,
      "step": 164
    },
    {
      "epoch": 0.26133438922985547,
      "grad_norm": 1.5775708792423282,
      "learning_rate": 6.928213997616058e-06,
      "loss": 0.5816,
      "step": 165
    },
    {
      "epoch": 0.26291823401306674,
      "grad_norm": 1.7646652820246194,
      "learning_rate": 6.914187236447161e-06,
      "loss": 0.5582,
      "step": 166
    },
    {
      "epoch": 0.26450207879627796,
      "grad_norm": 1.5836559806147648,
      "learning_rate": 6.90008368373437e-06,
      "loss": 0.5268,
      "step": 167
    },
    {
      "epoch": 0.26608592357948924,
      "grad_norm": 1.4526060644749479,
      "learning_rate": 6.885903711119417e-06,
      "loss": 0.5842,
      "step": 168
    },
    {
      "epoch": 0.26766976836270046,
      "grad_norm": 1.469565911234753,
      "learning_rate": 6.8716476922577676e-06,
      "loss": 0.5691,
      "step": 169
    },
    {
      "epoch": 0.2692536131459117,
      "grad_norm": 1.6734917590165517,
      "learning_rate": 6.857316002808776e-06,
      "loss": 0.4855,
      "step": 170
    },
    {
      "epoch": 0.27083745792912295,
      "grad_norm": 1.5732695992638241,
      "learning_rate": 6.8429090204257885e-06,
      "loss": 0.6122,
      "step": 171
    },
    {
      "epoch": 0.27242130271233417,
      "grad_norm": 1.4978871767664772,
      "learning_rate": 6.82842712474619e-06,
      "loss": 0.4655,
      "step": 172
    },
    {
      "epoch": 0.27400514749554544,
      "grad_norm": 1.503094339164215,
      "learning_rate": 6.8138706973813995e-06,
      "loss": 0.6282,
      "step": 173
    },
    {
      "epoch": 0.27558899227875666,
      "grad_norm": 2.1292899427160927,
      "learning_rate": 6.799240121906814e-06,
      "loss": 0.6792,
      "step": 174
    },
    {
      "epoch": 0.27717283706196794,
      "grad_norm": 1.4647489604808317,
      "learning_rate": 6.784535783851707e-06,
      "loss": 0.644,
      "step": 175
    },
    {
      "epoch": 0.27875668184517916,
      "grad_norm": 1.5780789456446127,
      "learning_rate": 6.7697580706890585e-06,
      "loss": 0.5134,
      "step": 176
    },
    {
      "epoch": 0.28034052662839043,
      "grad_norm": 1.754003508034847,
      "learning_rate": 6.754907371825354e-06,
      "loss": 0.5424,
      "step": 177
    },
    {
      "epoch": 0.28192437141160165,
      "grad_norm": 1.4864993829740512,
      "learning_rate": 6.739984078590322e-06,
      "loss": 0.4967,
      "step": 178
    },
    {
      "epoch": 0.2835082161948129,
      "grad_norm": 1.4504899701696683,
      "learning_rate": 6.724988584226616e-06,
      "loss": 0.5067,
      "step": 179
    },
    {
      "epoch": 0.28509206097802414,
      "grad_norm": 1.470233988558896,
      "learning_rate": 6.70992128387946e-06,
      "loss": 0.5655,
      "step": 180
    },
    {
      "epoch": 0.2866759057612354,
      "grad_norm": 1.5181769899461575,
      "learning_rate": 6.694782574586229e-06,
      "loss": 0.5062,
      "step": 181
    },
    {
      "epoch": 0.28825975054444664,
      "grad_norm": 1.4730340611570927,
      "learning_rate": 6.679572855265992e-06,
      "loss": 0.5855,
      "step": 182
    },
    {
      "epoch": 0.2898435953276579,
      "grad_norm": 1.4595330624813205,
      "learning_rate": 6.664292526709001e-06,
      "loss": 0.4989,
      "step": 183
    },
    {
      "epoch": 0.29142744011086913,
      "grad_norm": 1.323456475759639,
      "learning_rate": 6.648941991566121e-06,
      "loss": 0.5448,
      "step": 184
    },
    {
      "epoch": 0.2930112848940804,
      "grad_norm": 1.6848076983535807,
      "learning_rate": 6.633521654338231e-06,
      "loss": 0.5494,
      "step": 185
    },
    {
      "epoch": 0.2945951296772916,
      "grad_norm": 1.5461470851002552,
      "learning_rate": 6.618031921365557e-06,
      "loss": 0.5979,
      "step": 186
    },
    {
      "epoch": 0.2961789744605029,
      "grad_norm": 1.4521318914121286,
      "learning_rate": 6.602473200816968e-06,
      "loss": 0.6329,
      "step": 187
    },
    {
      "epoch": 0.2977628192437141,
      "grad_norm": 1.4482304443134255,
      "learning_rate": 6.586845902679222e-06,
      "loss": 0.5603,
      "step": 188
    },
    {
      "epoch": 0.29934666402692534,
      "grad_norm": 1.729429057098183,
      "learning_rate": 6.571150438746157e-06,
      "loss": 0.5174,
      "step": 189
    },
    {
      "epoch": 0.3009305088101366,
      "grad_norm": 1.4522833098019283,
      "learning_rate": 6.555387222607845e-06,
      "loss": 0.4707,
      "step": 190
    },
    {
      "epoch": 0.30251435359334783,
      "grad_norm": 1.5694908204548417,
      "learning_rate": 6.5395566696396914e-06,
      "loss": 0.6268,
      "step": 191
    },
    {
      "epoch": 0.3040981983765591,
      "grad_norm": 1.4125293528433154,
      "learning_rate": 6.523659196991488e-06,
      "loss": 0.4955,
      "step": 192
    },
    {
      "epoch": 0.3056820431597703,
      "grad_norm": 1.5051250957949753,
      "learning_rate": 6.507695223576427e-06,
      "loss": 0.487,
      "step": 193
    },
    {
      "epoch": 0.3072658879429816,
      "grad_norm": 1.4094675941552535,
      "learning_rate": 6.491665170060049e-06,
      "loss": 0.4969,
      "step": 194
    },
    {
      "epoch": 0.3088497327261928,
      "grad_norm": 1.4384575522029888,
      "learning_rate": 6.475569458849178e-06,
      "loss": 0.5492,
      "step": 195
    },
    {
      "epoch": 0.3104335775094041,
      "grad_norm": 1.7678769340548022,
      "learning_rate": 6.45940851408077e-06,
      "loss": 0.5836,
      "step": 196
    },
    {
      "epoch": 0.3120174222926153,
      "grad_norm": 2.046226775911686,
      "learning_rate": 6.4431827616107514e-06,
      "loss": 0.5301,
      "step": 197
    },
    {
      "epoch": 0.3136012670758266,
      "grad_norm": 1.4943535053844237,
      "learning_rate": 6.426892629002788e-06,
      "loss": 0.5501,
      "step": 198
    },
    {
      "epoch": 0.3151851118590378,
      "grad_norm": 1.5251076879939447,
      "learning_rate": 6.410538545517026e-06,
      "loss": 0.5089,
      "step": 199
    },
    {
      "epoch": 0.3167689566422491,
      "grad_norm": 1.8164625462077661,
      "learning_rate": 6.394120942098772e-06,
      "loss": 0.5319,
      "step": 200
    },
    {
      "epoch": 0.3183528014254603,
      "grad_norm": 1.4408492309184018,
      "learning_rate": 6.377640251367147e-06,
      "loss": 0.4609,
      "step": 201
    },
    {
      "epoch": 0.31993664620867157,
      "grad_norm": 1.4200405326569705,
      "learning_rate": 6.361096907603678e-06,
      "loss": 0.5396,
      "step": 202
    },
    {
      "epoch": 0.3215204909918828,
      "grad_norm": 1.471578227709618,
      "learning_rate": 6.344491346740859e-06,
      "loss": 0.546,
      "step": 203
    },
    {
      "epoch": 0.32310433577509406,
      "grad_norm": 1.4907309711106325,
      "learning_rate": 6.3278240063506605e-06,
      "loss": 0.5093,
      "step": 204
    },
    {
      "epoch": 0.3246881805583053,
      "grad_norm": 1.3696992488542155,
      "learning_rate": 6.311095325633005e-06,
      "loss": 0.4799,
      "step": 205
    },
    {
      "epoch": 0.32627202534151656,
      "grad_norm": 1.7334631553518818,
      "learning_rate": 6.294305745404184e-06,
      "loss": 0.5837,
      "step": 206
    },
    {
      "epoch": 0.3278558701247278,
      "grad_norm": 1.9680923980600566,
      "learning_rate": 6.277455708085254e-06,
      "loss": 0.6013,
      "step": 207
    },
    {
      "epoch": 0.329439714907939,
      "grad_norm": 1.575234585905169,
      "learning_rate": 6.260545657690367e-06,
      "loss": 0.5846,
      "step": 208
    },
    {
      "epoch": 0.33102355969115027,
      "grad_norm": 1.5428445148256473,
      "learning_rate": 6.243576039815079e-06,
      "loss": 0.4724,
      "step": 209
    },
    {
      "epoch": 0.3326074044743615,
      "grad_norm": 1.5051331668280472,
      "learning_rate": 6.226547301624601e-06,
      "loss": 0.5778,
      "step": 210
    },
    {
      "epoch": 0.3326074044743615,
      "eval_accuracy": 0.8062780751393809,
      "eval_loss": 0.6072185039520264,
      "eval_perplexity": 1.2109892812795882,
      "eval_runtime": 533.963,
      "eval_samples_per_second": 1.425,
      "eval_steps_per_second": 1.425,
      "step": 210
    },
    {
      "epoch": 0.33419124925757276,
      "grad_norm": 1.8036724779367772,
      "learning_rate": 6.209459891842023e-06,
      "loss": 0.6231,
      "step": 211
    },
    {
      "epoch": 0.335775094040784,
      "grad_norm": 1.622010722669537,
      "learning_rate": 6.192314260736483e-06,
      "loss": 0.4884,
      "step": 212
    },
    {
      "epoch": 0.33735893882399526,
      "grad_norm": 1.4671354082285062,
      "learning_rate": 6.1751108601113065e-06,
      "loss": 0.4331,
      "step": 213
    },
    {
      "epoch": 0.3389427836072065,
      "grad_norm": 1.3462750015093699,
      "learning_rate": 6.157850143292099e-06,
      "loss": 0.5651,
      "step": 214
    },
    {
      "epoch": 0.34052662839041775,
      "grad_norm": 4.450327389512898,
      "learning_rate": 6.140532565114801e-06,
      "loss": 0.5063,
      "step": 215
    },
    {
      "epoch": 0.34211047317362897,
      "grad_norm": 1.4358337913572403,
      "learning_rate": 6.123158581913703e-06,
      "loss": 0.5133,
      "step": 216
    },
    {
      "epoch": 0.34369431795684025,
      "grad_norm": 1.5296654139200274,
      "learning_rate": 6.105728651509423e-06,
      "loss": 0.5617,
      "step": 217
    },
    {
      "epoch": 0.34527816274005146,
      "grad_norm": 1.7233319268867215,
      "learning_rate": 6.088243233196833e-06,
      "loss": 0.578,
      "step": 218
    },
    {
      "epoch": 0.34686200752326274,
      "grad_norm": 1.3545296291875666,
      "learning_rate": 6.07070278773297e-06,
      "loss": 0.6077,
      "step": 219
    },
    {
      "epoch": 0.34844585230647396,
      "grad_norm": 1.4071408386524482,
      "learning_rate": 6.053107777324882e-06,
      "loss": 0.4709,
      "step": 220
    },
    {
      "epoch": 0.35002969708968523,
      "grad_norm": 1.625931561657156,
      "learning_rate": 6.0354586656174594e-06,
      "loss": 0.5402,
      "step": 221
    },
    {
      "epoch": 0.35161354187289645,
      "grad_norm": 1.4770799573957036,
      "learning_rate": 6.017755917681208e-06,
      "loss": 0.5878,
      "step": 222
    },
    {
      "epoch": 0.3531973866561077,
      "grad_norm": 1.371967202631883,
      "learning_rate": 6e-06,
      "loss": 0.5646,
      "step": 223
    },
    {
      "epoch": 0.35478123143931894,
      "grad_norm": 1.670286600590674,
      "learning_rate": 5.982191380458779e-06,
      "loss": 0.5459,
      "step": 224
    },
    {
      "epoch": 0.3563650762225302,
      "grad_norm": 1.657503866936318,
      "learning_rate": 5.964330528331233e-06,
      "loss": 0.6107,
      "step": 225
    },
    {
      "epoch": 0.35794892100574144,
      "grad_norm": 1.437798148422675,
      "learning_rate": 5.946417914267424e-06,
      "loss": 0.5283,
      "step": 226
    },
    {
      "epoch": 0.35953276578895266,
      "grad_norm": 1.5841028963525154,
      "learning_rate": 5.928454010281395e-06,
      "loss": 0.5566,
      "step": 227
    },
    {
      "epoch": 0.36111661057216393,
      "grad_norm": 2.912275328636749,
      "learning_rate": 5.91043928973872e-06,
      "loss": 0.5524,
      "step": 228
    },
    {
      "epoch": 0.36270045535537515,
      "grad_norm": 1.39224219742789,
      "learning_rate": 5.8923742273440405e-06,
      "loss": 0.5018,
      "step": 229
    },
    {
      "epoch": 0.3642843001385864,
      "grad_norm": 1.531343821785256,
      "learning_rate": 5.87425929912855e-06,
      "loss": 0.5498,
      "step": 230
    },
    {
      "epoch": 0.36586814492179764,
      "grad_norm": 1.6907756868854458,
      "learning_rate": 5.856094982437453e-06,
      "loss": 0.6188,
      "step": 231
    },
    {
      "epoch": 0.3674519897050089,
      "grad_norm": 1.8925539993976666,
      "learning_rate": 5.83788175591739e-06,
      "loss": 0.6473,
      "step": 232
    },
    {
      "epoch": 0.36903583448822014,
      "grad_norm": 1.3770006280258493,
      "learning_rate": 5.819620099503818e-06,
      "loss": 0.4686,
      "step": 233
    },
    {
      "epoch": 0.3706196792714314,
      "grad_norm": 1.5431315768010794,
      "learning_rate": 5.801310494408365e-06,
      "loss": 0.5691,
      "step": 234
    },
    {
      "epoch": 0.37220352405464263,
      "grad_norm": 1.6551290914393502,
      "learning_rate": 5.782953423106153e-06,
      "loss": 0.5874,
      "step": 235
    },
    {
      "epoch": 0.3737873688378539,
      "grad_norm": 1.9002502737234077,
      "learning_rate": 5.764549369323084e-06,
      "loss": 0.5529,
      "step": 236
    },
    {
      "epoch": 0.3753712136210651,
      "grad_norm": 1.686985111412407,
      "learning_rate": 5.746098818023092e-06,
      "loss": 0.5603,
      "step": 237
    },
    {
      "epoch": 0.3769550584042764,
      "grad_norm": 1.3657672575822837,
      "learning_rate": 5.727602255395364e-06,
      "loss": 0.5568,
      "step": 238
    },
    {
      "epoch": 0.3785389031874876,
      "grad_norm": 1.9695232006427876,
      "learning_rate": 5.7090601688415235e-06,
      "loss": 0.5658,
      "step": 239
    },
    {
      "epoch": 0.3801227479706989,
      "grad_norm": 1.5461356891055895,
      "learning_rate": 5.690473046962798e-06,
      "loss": 0.4673,
      "step": 240
    },
    {
      "epoch": 0.3817065927539101,
      "grad_norm": 2.3698398593391965,
      "learning_rate": 5.671841379547133e-06,
      "loss": 0.5763,
      "step": 241
    },
    {
      "epoch": 0.3832904375371214,
      "grad_norm": 1.6233296254529663,
      "learning_rate": 5.6531656575562954e-06,
      "loss": 0.4775,
      "step": 242
    },
    {
      "epoch": 0.3848742823203326,
      "grad_norm": 1.5057753839519041,
      "learning_rate": 5.634446373112926e-06,
      "loss": 0.5759,
      "step": 243
    },
    {
      "epoch": 0.3864581271035439,
      "grad_norm": 1.569673186402757,
      "learning_rate": 5.615684019487579e-06,
      "loss": 0.542,
      "step": 244
    },
    {
      "epoch": 0.3880419718867551,
      "grad_norm": 1.4845831954568514,
      "learning_rate": 5.596879091085723e-06,
      "loss": 0.4803,
      "step": 245
    },
    {
      "epoch": 0.3896258166699663,
      "grad_norm": 1.8705825155512474,
      "learning_rate": 5.57803208343471e-06,
      "loss": 0.5017,
      "step": 246
    },
    {
      "epoch": 0.3912096614531776,
      "grad_norm": 1.5197093986457457,
      "learning_rate": 5.559143493170717e-06,
      "loss": 0.541,
      "step": 247
    },
    {
      "epoch": 0.3927935062363888,
      "grad_norm": 1.5301810316128006,
      "learning_rate": 5.540213818025666e-06,
      "loss": 0.5427,
      "step": 248
    },
    {
      "epoch": 0.3943773510196001,
      "grad_norm": 1.8516941433643899,
      "learning_rate": 5.5212435568141035e-06,
      "loss": 0.5974,
      "step": 249
    },
    {
      "epoch": 0.3959611958028113,
      "grad_norm": 1.3334174668528216,
      "learning_rate": 5.5022332094200505e-06,
      "loss": 0.5429,
      "step": 250
    },
    {
      "epoch": 0.3975450405860226,
      "grad_norm": 1.4702275400288973,
      "learning_rate": 5.483183276783843e-06,
      "loss": 0.5766,
      "step": 251
    },
    {
      "epoch": 0.3991288853692338,
      "grad_norm": 1.5459152423352713,
      "learning_rate": 5.464094260888924e-06,
      "loss": 0.5527,
      "step": 252
    },
    {
      "epoch": 0.4007127301524451,
      "grad_norm": 1.8640878576069286,
      "learning_rate": 5.4449666647486125e-06,
      "loss": 0.6205,
      "step": 253
    },
    {
      "epoch": 0.4022965749356563,
      "grad_norm": 1.9092864583223532,
      "learning_rate": 5.425800992392856e-06,
      "loss": 0.548,
      "step": 254
    },
    {
      "epoch": 0.40388041971886757,
      "grad_norm": 1.5276995541859555,
      "learning_rate": 5.406597748854947e-06,
      "loss": 0.5498,
      "step": 255
    },
    {
      "epoch": 0.4054642645020788,
      "grad_norm": 1.4771875777008228,
      "learning_rate": 5.38735744015821e-06,
      "loss": 0.4411,
      "step": 256
    },
    {
      "epoch": 0.40704810928529006,
      "grad_norm": 1.637711951004362,
      "learning_rate": 5.368080573302675e-06,
      "loss": 0.5537,
      "step": 257
    },
    {
      "epoch": 0.4086319540685013,
      "grad_norm": 1.606555141779956,
      "learning_rate": 5.348767656251709e-06,
      "loss": 0.558,
      "step": 258
    },
    {
      "epoch": 0.41021579885171255,
      "grad_norm": 1.4771059602395231,
      "learning_rate": 5.329419197918638e-06,
      "loss": 0.4915,
      "step": 259
    },
    {
      "epoch": 0.4117996436349238,
      "grad_norm": 1.5943720324164687,
      "learning_rate": 5.310035708153335e-06,
      "loss": 0.583,
      "step": 260
    },
    {
      "epoch": 0.41338348841813505,
      "grad_norm": 1.666964957956596,
      "learning_rate": 5.2906176977287795e-06,
      "loss": 0.5493,
      "step": 261
    },
    {
      "epoch": 0.41496733320134627,
      "grad_norm": 1.4711154343447124,
      "learning_rate": 5.271165678327606e-06,
      "loss": 0.5519,
      "step": 262
    },
    {
      "epoch": 0.4165511779845575,
      "grad_norm": 1.5995792262282518,
      "learning_rate": 5.251680162528617e-06,
      "loss": 0.5377,
      "step": 263
    },
    {
      "epoch": 0.41813502276776876,
      "grad_norm": 1.4526790782720835,
      "learning_rate": 5.232161663793275e-06,
      "loss": 0.5335,
      "step": 264
    },
    {
      "epoch": 0.41971886755098,
      "grad_norm": 1.5478646380088532,
      "learning_rate": 5.212610696452174e-06,
      "loss": 0.6434,
      "step": 265
    },
    {
      "epoch": 0.42130271233419125,
      "grad_norm": 1.4956181853598964,
      "learning_rate": 5.193027775691485e-06,
      "loss": 0.498,
      "step": 266
    },
    {
      "epoch": 0.4228865571174025,
      "grad_norm": 1.4974980436315144,
      "learning_rate": 5.173413417539384e-06,
      "loss": 0.6171,
      "step": 267
    },
    {
      "epoch": 0.42447040190061375,
      "grad_norm": 1.466644696421707,
      "learning_rate": 5.153768138852449e-06,
      "loss": 0.501,
      "step": 268
    },
    {
      "epoch": 0.42605424668382497,
      "grad_norm": 1.627652008126963,
      "learning_rate": 5.134092457302043e-06,
      "loss": 0.6258,
      "step": 269
    },
    {
      "epoch": 0.42763809146703624,
      "grad_norm": 1.648118892712504,
      "learning_rate": 5.114386891360675e-06,
      "loss": 0.5565,
      "step": 270
    },
    {
      "epoch": 0.42922193625024746,
      "grad_norm": 1.848328087802553,
      "learning_rate": 5.094651960288332e-06,
      "loss": 0.5803,
      "step": 271
    },
    {
      "epoch": 0.43080578103345873,
      "grad_norm": 1.3558392557511212,
      "learning_rate": 5.074888184118801e-06,
      "loss": 0.4598,
      "step": 272
    },
    {
      "epoch": 0.43238962581666995,
      "grad_norm": 1.4376325696738284,
      "learning_rate": 5.055096083645967e-06,
      "loss": 0.5144,
      "step": 273
    },
    {
      "epoch": 0.43397347059988123,
      "grad_norm": 2.013114345583861,
      "learning_rate": 5.035276180410083e-06,
      "loss": 0.5365,
      "step": 274
    },
    {
      "epoch": 0.43555731538309245,
      "grad_norm": 1.7720963379244992,
      "learning_rate": 5.015428996684031e-06,
      "loss": 0.5965,
      "step": 275
    },
    {
      "epoch": 0.4371411601663037,
      "grad_norm": 1.5956864806503275,
      "learning_rate": 4.995555055459562e-06,
      "loss": 0.5399,
      "step": 276
    },
    {
      "epoch": 0.43872500494951494,
      "grad_norm": 1.705695286576096,
      "learning_rate": 4.975654880433508e-06,
      "loss": 0.5492,
      "step": 277
    },
    {
      "epoch": 0.4403088497327262,
      "grad_norm": 1.7149353403443588,
      "learning_rate": 4.95572899599399e-06,
      "loss": 0.6011,
      "step": 278
    },
    {
      "epoch": 0.44189269451593743,
      "grad_norm": 1.327638735973327,
      "learning_rate": 4.935777927206595e-06,
      "loss": 0.4993,
      "step": 279
    },
    {
      "epoch": 0.4434765392991487,
      "grad_norm": 1.5644224950432588,
      "learning_rate": 4.915802199800536e-06,
      "loss": 0.595,
      "step": 280
    },
    {
      "epoch": 0.44506038408235993,
      "grad_norm": 1.517553789603623,
      "learning_rate": 4.8958023401548124e-06,
      "loss": 0.5383,
      "step": 281
    },
    {
      "epoch": 0.44664422886557115,
      "grad_norm": 1.7980504027520112,
      "learning_rate": 4.875778875284322e-06,
      "loss": 0.486,
      "step": 282
    },
    {
      "epoch": 0.4482280736487824,
      "grad_norm": 1.618198969176812,
      "learning_rate": 4.855732332825989e-06,
      "loss": 0.5041,
      "step": 283
    },
    {
      "epoch": 0.44981191843199364,
      "grad_norm": 1.6074207482159366,
      "learning_rate": 4.8356632410248495e-06,
      "loss": 0.5225,
      "step": 284
    },
    {
      "epoch": 0.4513957632152049,
      "grad_norm": 1.3840009026279627,
      "learning_rate": 4.815572128720138e-06,
      "loss": 0.4984,
      "step": 285
    },
    {
      "epoch": 0.45297960799841613,
      "grad_norm": 2.0549483079657644,
      "learning_rate": 4.795459525331346e-06,
      "loss": 0.7242,
      "step": 286
    },
    {
      "epoch": 0.4545634527816274,
      "grad_norm": 1.4634854048670438,
      "learning_rate": 4.77532596084428e-06,
      "loss": 0.5575,
      "step": 287
    },
    {
      "epoch": 0.4561472975648386,
      "grad_norm": 1.4194408672102914,
      "learning_rate": 4.755171965797087e-06,
      "loss": 0.5493,
      "step": 288
    },
    {
      "epoch": 0.4577311423480499,
      "grad_norm": 1.6579210683410825,
      "learning_rate": 4.734998071266282e-06,
      "loss": 0.4842,
      "step": 289
    },
    {
      "epoch": 0.4593149871312611,
      "grad_norm": 2.2481565569431003,
      "learning_rate": 4.714804808852744e-06,
      "loss": 0.5556,
      "step": 290
    },
    {
      "epoch": 0.4608988319144724,
      "grad_norm": 1.7503586240205726,
      "learning_rate": 4.694592710667722e-06,
      "loss": 0.5027,
      "step": 291
    },
    {
      "epoch": 0.4624826766976836,
      "grad_norm": 1.5013077261326937,
      "learning_rate": 4.674362309318796e-06,
      "loss": 0.5387,
      "step": 292
    },
    {
      "epoch": 0.4640665214808949,
      "grad_norm": 1.5044654112516254,
      "learning_rate": 4.65411413789586e-06,
      "loss": 0.4191,
      "step": 293
    },
    {
      "epoch": 0.4656503662641061,
      "grad_norm": 1.4661324959060025,
      "learning_rate": 4.6338487299570605e-06,
      "loss": 0.5883,
      "step": 294
    },
    {
      "epoch": 0.4672342110473174,
      "grad_norm": 1.7266562955889597,
      "learning_rate": 4.613566619514742e-06,
      "loss": 0.6532,
      "step": 295
    },
    {
      "epoch": 0.4688180558305286,
      "grad_norm": 1.4130438958548992,
      "learning_rate": 4.593268341021378e-06,
      "loss": 0.5274,
      "step": 296
    },
    {
      "epoch": 0.4704019006137399,
      "grad_norm": 1.4628610094368864,
      "learning_rate": 4.572954429355486e-06,
      "loss": 0.4546,
      "step": 297
    },
    {
      "epoch": 0.4719857453969511,
      "grad_norm": 2.6067716280602493,
      "learning_rate": 4.552625419807529e-06,
      "loss": 0.5247,
      "step": 298
    },
    {
      "epoch": 0.47356959018016237,
      "grad_norm": 1.5391344972032668,
      "learning_rate": 4.532281848065815e-06,
      "loss": 0.5648,
      "step": 299
    },
    {
      "epoch": 0.4751534349633736,
      "grad_norm": 1.4085395552662021,
      "learning_rate": 4.5119242502023795e-06,
      "loss": 0.5333,
      "step": 300
    },
    {
      "epoch": 0.4767372797465848,
      "grad_norm": 1.4740227736979468,
      "learning_rate": 4.4915531626588566e-06,
      "loss": 0.4993,
      "step": 301
    },
    {
      "epoch": 0.4783211245297961,
      "grad_norm": 1.5696976913793732,
      "learning_rate": 4.4711691222323505e-06,
      "loss": 0.5829,
      "step": 302
    },
    {
      "epoch": 0.4799049693130073,
      "grad_norm": 1.3762468410600768,
      "learning_rate": 4.450772666061285e-06,
      "loss": 0.5585,
      "step": 303
    },
    {
      "epoch": 0.4814888140962186,
      "grad_norm": 1.4028769788977258,
      "learning_rate": 4.4303643316112455e-06,
      "loss": 0.4688,
      "step": 304
    },
    {
      "epoch": 0.4830726588794298,
      "grad_norm": 1.6682668294458134,
      "learning_rate": 4.409944656660828e-06,
      "loss": 0.6571,
      "step": 305
    },
    {
      "epoch": 0.48465650366264107,
      "grad_norm": 1.413594427095901,
      "learning_rate": 4.389514179287455e-06,
      "loss": 0.5522,
      "step": 306
    },
    {
      "epoch": 0.4862403484458523,
      "grad_norm": 1.519057639158845,
      "learning_rate": 4.369073437853208e-06,
      "loss": 0.5334,
      "step": 307
    },
    {
      "epoch": 0.48782419322906356,
      "grad_norm": 1.6208921075298082,
      "learning_rate": 4.348622970990633e-06,
      "loss": 0.5182,
      "step": 308
    },
    {
      "epoch": 0.4894080380122748,
      "grad_norm": 1.8501674500734415,
      "learning_rate": 4.328163317588551e-06,
      "loss": 0.6517,
      "step": 309
    },
    {
      "epoch": 0.49099188279548606,
      "grad_norm": 1.4363229280193845,
      "learning_rate": 4.307695016777855e-06,
      "loss": 0.5416,
      "step": 310
    },
    {
      "epoch": 0.4925757275786973,
      "grad_norm": 1.7153136971595235,
      "learning_rate": 4.28721860791731e-06,
      "loss": 0.588,
      "step": 311
    },
    {
      "epoch": 0.49415957236190855,
      "grad_norm": 1.7049912502271836,
      "learning_rate": 4.2667346305793305e-06,
      "loss": 0.5894,
      "step": 312
    },
    {
      "epoch": 0.49574341714511977,
      "grad_norm": 1.7928491666503086,
      "learning_rate": 4.246243624535772e-06,
      "loss": 0.5509,
      "step": 313
    },
    {
      "epoch": 0.49732726192833104,
      "grad_norm": 1.4067335450901148,
      "learning_rate": 4.2257461297436975e-06,
      "loss": 0.5372,
      "step": 314
    },
    {
      "epoch": 0.49891110671154226,
      "grad_norm": 1.551109249256133,
      "learning_rate": 4.205242686331158e-06,
      "loss": 0.4888,
      "step": 315
    },
    {
      "epoch": 0.5004949514947535,
      "grad_norm": 1.5295528770849849,
      "learning_rate": 4.184733834582958e-06,
      "loss": 0.5244,
      "step": 316
    },
    {
      "epoch": 0.5020787962779648,
      "grad_norm": 1.568867534852523,
      "learning_rate": 4.164220114926413e-06,
      "loss": 0.5243,
      "step": 317
    },
    {
      "epoch": 0.503662641061176,
      "grad_norm": 1.5276609286295317,
      "learning_rate": 4.143702067917114e-06,
      "loss": 0.5557,
      "step": 318
    },
    {
      "epoch": 0.5052464858443872,
      "grad_norm": 1.7692820390689328,
      "learning_rate": 4.123180234224682e-06,
      "loss": 0.5533,
      "step": 319
    },
    {
      "epoch": 0.5068303306275985,
      "grad_norm": 1.6282684636897258,
      "learning_rate": 4.102655154618519e-06,
      "loss": 0.54,
      "step": 320
    },
    {
      "epoch": 0.5084141754108097,
      "grad_norm": 1.4884338495205554,
      "learning_rate": 4.082127369953562e-06,
      "loss": 0.5187,
      "step": 321
    },
    {
      "epoch": 0.509998020194021,
      "grad_norm": 1.4653962436440777,
      "learning_rate": 4.061597421156027e-06,
      "loss": 0.4915,
      "step": 322
    },
    {
      "epoch": 0.5115818649772322,
      "grad_norm": 1.395223278506742,
      "learning_rate": 4.04106584920916e-06,
      "loss": 0.496,
      "step": 323
    },
    {
      "epoch": 0.5131657097604435,
      "grad_norm": 2.1426438371002487,
      "learning_rate": 4.0205331951389745e-06,
      "loss": 0.6205,
      "step": 324
    },
    {
      "epoch": 0.5147495545436547,
      "grad_norm": 1.867498513188697,
      "learning_rate": 4e-06,
      "loss": 0.4003,
      "step": 325
    },
    {
      "epoch": 0.516333399326866,
      "grad_norm": 1.614897925155514,
      "learning_rate": 3.979466804861026e-06,
      "loss": 0.5554,
      "step": 326
    },
    {
      "epoch": 0.5179172441100772,
      "grad_norm": 1.621850139141135,
      "learning_rate": 3.958934150790841e-06,
      "loss": 0.4116,
      "step": 327
    },
    {
      "epoch": 0.5195010888932885,
      "grad_norm": 1.6540960382507581,
      "learning_rate": 3.938402578843973e-06,
      "loss": 0.4899,
      "step": 328
    },
    {
      "epoch": 0.5210849336764997,
      "grad_norm": 1.4757999919149907,
      "learning_rate": 3.917872630046439e-06,
      "loss": 0.4871,
      "step": 329
    },
    {
      "epoch": 0.5226687784597109,
      "grad_norm": 1.4101743650910379,
      "learning_rate": 3.8973448453814815e-06,
      "loss": 0.5557,
      "step": 330
    },
    {
      "epoch": 0.5242526232429222,
      "grad_norm": 1.820304069777086,
      "learning_rate": 3.876819765775319e-06,
      "loss": 0.5178,
      "step": 331
    },
    {
      "epoch": 0.5258364680261335,
      "grad_norm": 1.3457029032306118,
      "learning_rate": 3.856297932082886e-06,
      "loss": 0.5481,
      "step": 332
    },
    {
      "epoch": 0.5274203128093446,
      "grad_norm": 1.2926453088911345,
      "learning_rate": 3.835779885073587e-06,
      "loss": 0.47,
      "step": 333
    },
    {
      "epoch": 0.5290041575925559,
      "grad_norm": 1.7195140710061863,
      "learning_rate": 3.815266165417042e-06,
      "loss": 0.4018,
      "step": 334
    },
    {
      "epoch": 0.5305880023757672,
      "grad_norm": 1.5294915569869552,
      "learning_rate": 3.7947573136688406e-06,
      "loss": 0.4889,
      "step": 335
    },
    {
      "epoch": 0.5321718471589785,
      "grad_norm": 1.5318098339821966,
      "learning_rate": 3.774253870256302e-06,
      "loss": 0.429,
      "step": 336
    },
    {
      "epoch": 0.5337556919421896,
      "grad_norm": 1.5322388550699502,
      "learning_rate": 3.7537563754642285e-06,
      "loss": 0.5065,
      "step": 337
    },
    {
      "epoch": 0.5353395367254009,
      "grad_norm": 1.7066209125681409,
      "learning_rate": 3.7332653694206683e-06,
      "loss": 0.4947,
      "step": 338
    },
    {
      "epoch": 0.5369233815086122,
      "grad_norm": 1.3425460536457656,
      "learning_rate": 3.7127813920826896e-06,
      "loss": 0.5448,
      "step": 339
    },
    {
      "epoch": 0.5385072262918233,
      "grad_norm": 1.3885953993055378,
      "learning_rate": 3.6923049832221447e-06,
      "loss": 0.5269,
      "step": 340
    },
    {
      "epoch": 0.5400910710750346,
      "grad_norm": 1.562252798906571,
      "learning_rate": 3.6718366824114497e-06,
      "loss": 0.5145,
      "step": 341
    },
    {
      "epoch": 0.5416749158582459,
      "grad_norm": 1.7621418644258409,
      "learning_rate": 3.651377029009367e-06,
      "loss": 0.5345,
      "step": 342
    },
    {
      "epoch": 0.5432587606414572,
      "grad_norm": 1.6334983556675244,
      "learning_rate": 3.6309265621467923e-06,
      "loss": 0.5435,
      "step": 343
    },
    {
      "epoch": 0.5448426054246683,
      "grad_norm": 1.6029717902895917,
      "learning_rate": 3.6104858207125447e-06,
      "loss": 0.5734,
      "step": 344
    },
    {
      "epoch": 0.5464264502078796,
      "grad_norm": 1.7090791562879204,
      "learning_rate": 3.590055343339172e-06,
      "loss": 0.5325,
      "step": 345
    },
    {
      "epoch": 0.5480102949910909,
      "grad_norm": 1.5947834681663235,
      "learning_rate": 3.5696356683887545e-06,
      "loss": 0.4975,
      "step": 346
    },
    {
      "epoch": 0.5495941397743022,
      "grad_norm": 1.4472742169549009,
      "learning_rate": 3.5492273339387156e-06,
      "loss": 0.4894,
      "step": 347
    },
    {
      "epoch": 0.5511779845575133,
      "grad_norm": 2.0434685400743082,
      "learning_rate": 3.5288308777676487e-06,
      "loss": 0.4684,
      "step": 348
    },
    {
      "epoch": 0.5527618293407246,
      "grad_norm": 1.9208895400830441,
      "learning_rate": 3.508446837341144e-06,
      "loss": 0.5969,
      "step": 349
    },
    {
      "epoch": 0.5543456741239359,
      "grad_norm": 1.4909570903888723,
      "learning_rate": 3.488075749797622e-06,
      "loss": 0.5022,
      "step": 350
    },
    {
      "epoch": 0.5559295189071471,
      "grad_norm": 1.6489325052677233,
      "learning_rate": 3.4677181519341864e-06,
      "loss": 0.5895,
      "step": 351
    },
    {
      "epoch": 0.5575133636903583,
      "grad_norm": 1.613873880473099,
      "learning_rate": 3.447374580192472e-06,
      "loss": 0.5907,
      "step": 352
    },
    {
      "epoch": 0.5590972084735696,
      "grad_norm": 1.7320655964325145,
      "learning_rate": 3.427045570644515e-06,
      "loss": 0.4408,
      "step": 353
    },
    {
      "epoch": 0.5606810532567809,
      "grad_norm": 1.5966735398972682,
      "learning_rate": 3.406731658978621e-06,
      "loss": 0.5518,
      "step": 354
    },
    {
      "epoch": 0.5622648980399921,
      "grad_norm": 1.507137639160325,
      "learning_rate": 3.386433380485258e-06,
      "loss": 0.5487,
      "step": 355
    },
    {
      "epoch": 0.5638487428232033,
      "grad_norm": 1.477023097202977,
      "learning_rate": 3.36615127004294e-06,
      "loss": 0.5277,
      "step": 356
    },
    {
      "epoch": 0.5654325876064146,
      "grad_norm": 1.4345842364104429,
      "learning_rate": 3.3458858621041395e-06,
      "loss": 0.5825,
      "step": 357
    },
    {
      "epoch": 0.5670164323896258,
      "grad_norm": 1.5249472435401055,
      "learning_rate": 3.3256376906812026e-06,
      "loss": 0.5686,
      "step": 358
    },
    {
      "epoch": 0.568600277172837,
      "grad_norm": 1.652799125000088,
      "learning_rate": 3.3054072893322785e-06,
      "loss": 0.5356,
      "step": 359
    },
    {
      "epoch": 0.5701841219560483,
      "grad_norm": 1.7523508161150247,
      "learning_rate": 3.285195191147255e-06,
      "loss": 0.558,
      "step": 360
    },
    {
      "epoch": 0.5717679667392596,
      "grad_norm": 1.5826594727198133,
      "learning_rate": 3.265001928733718e-06,
      "loss": 0.5513,
      "step": 361
    },
    {
      "epoch": 0.5733518115224708,
      "grad_norm": 1.4553905463035222,
      "learning_rate": 3.2448280342029128e-06,
      "loss": 0.4994,
      "step": 362
    },
    {
      "epoch": 0.574935656305682,
      "grad_norm": 1.4388765353827664,
      "learning_rate": 3.2246740391557196e-06,
      "loss": 0.5027,
      "step": 363
    },
    {
      "epoch": 0.5765195010888933,
      "grad_norm": 1.330505454003212,
      "learning_rate": 3.2045404746686542e-06,
      "loss": 0.508,
      "step": 364
    },
    {
      "epoch": 0.5781033458721045,
      "grad_norm": 1.3429439845943842,
      "learning_rate": 3.1844278712798626e-06,
      "loss": 0.4263,
      "step": 365
    },
    {
      "epoch": 0.5796871906553158,
      "grad_norm": 1.4396944573827641,
      "learning_rate": 3.1643367589751497e-06,
      "loss": 0.5179,
      "step": 366
    },
    {
      "epoch": 0.581271035438527,
      "grad_norm": 1.414735328601168,
      "learning_rate": 3.1442676671740113e-06,
      "loss": 0.4259,
      "step": 367
    },
    {
      "epoch": 0.5828548802217383,
      "grad_norm": 1.66831199255592,
      "learning_rate": 3.124221124715678e-06,
      "loss": 0.4971,
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.5844387250049495, |
|
"grad_norm": 1.5810788020061641, |
|
"learning_rate": 3.104197659845188e-06, |
|
"loss": 0.5772, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.5860225697881608, |
|
"grad_norm": 1.5715742632475282, |
|
"learning_rate": 3.0841978001994645e-06, |
|
"loss": 0.5036, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.587606414571372, |
|
"grad_norm": 1.751121812265499, |
|
"learning_rate": 3.0642220727934067e-06, |
|
"loss": 0.5295, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.5891902593545832, |
|
"grad_norm": 1.5098119881063428, |
|
"learning_rate": 3.0442710040060098e-06, |
|
"loss": 0.5466, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.5907741041377945, |
|
"grad_norm": 1.5207898613100728, |
|
"learning_rate": 3.0243451195664913e-06, |
|
"loss": 0.5579, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.5923579489210058, |
|
"grad_norm": 1.4240138870827277, |
|
"learning_rate": 3.004444944540437e-06, |
|
"loss": 0.5507, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.593941793704217, |
|
"grad_norm": 1.506087376349933, |
|
"learning_rate": 2.9845710033159684e-06, |
|
"loss": 0.4465, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5955256384874282, |
|
"grad_norm": 1.607368732995549, |
|
"learning_rate": 2.9647238195899164e-06, |
|
"loss": 0.5378, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.5971094832706395, |
|
"grad_norm": 1.578131220264289, |
|
"learning_rate": 2.9449039163540316e-06, |
|
"loss": 0.4516, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.5986933280538507, |
|
"grad_norm": 1.488407789608305, |
|
"learning_rate": 2.9251118158811984e-06, |
|
"loss": 0.5087, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.600277172837062, |
|
"grad_norm": 1.4648982592871571, |
|
"learning_rate": 2.9053480397116684e-06, |
|
"loss": 0.5531, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.6018610176202732, |
|
"grad_norm": 1.378206069077312, |
|
"learning_rate": 2.885613108639326e-06, |
|
"loss": 0.437, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6034448624034845, |
|
"grad_norm": 1.4399539996573616, |
|
"learning_rate": 2.865907542697957e-06, |
|
"loss": 0.5327, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.6050287071866957, |
|
"grad_norm": 1.4113473201037952, |
|
"learning_rate": 2.846231861147551e-06, |
|
"loss": 0.5414, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.6066125519699069, |
|
"grad_norm": 1.3798536815664695, |
|
"learning_rate": 2.8265865824606165e-06, |
|
"loss": 0.5537, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.6081963967531182, |
|
"grad_norm": 1.6278939154714502, |
|
"learning_rate": 2.806972224308515e-06, |
|
"loss": 0.5272, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.6097802415363295, |
|
"grad_norm": 1.4122415614433053, |
|
"learning_rate": 2.787389303547826e-06, |
|
"loss": 0.5437, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6113640863195406, |
|
"grad_norm": 1.4410982986916903, |
|
"learning_rate": 2.7678383362067257e-06, |
|
"loss": 0.6161, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.6129479311027519, |
|
"grad_norm": 1.6053888158206666, |
|
"learning_rate": 2.748319837471383e-06, |
|
"loss": 0.5462, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.6145317758859632, |
|
"grad_norm": 1.6416876364791944, |
|
"learning_rate": 2.7288343216723933e-06, |
|
"loss": 0.5041, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.6161156206691745, |
|
"grad_norm": 1.6000769765570262, |
|
"learning_rate": 2.7093823022712215e-06, |
|
"loss": 0.6001, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.6176994654523856, |
|
"grad_norm": 1.5229151146342543, |
|
"learning_rate": 2.6899642918466656e-06, |
|
"loss": 0.5966, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6192833102355969, |
|
"grad_norm": 1.2082061870063965, |
|
"learning_rate": 2.6705808020813617e-06, |
|
"loss": 0.4404, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.6208671550188082, |
|
"grad_norm": 1.7357266576917079, |
|
"learning_rate": 2.6512323437482903e-06, |
|
"loss": 0.509, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.6224509998020195, |
|
"grad_norm": 1.4629821094699202, |
|
"learning_rate": 2.631919426697325e-06, |
|
"loss": 0.5477, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.6240348445852306, |
|
"grad_norm": 1.6391626920610356, |
|
"learning_rate": 2.612642559841789e-06, |
|
"loss": 0.5424, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.6256186893684419, |
|
"grad_norm": 1.6354675394639997, |
|
"learning_rate": 2.5934022511450525e-06, |
|
"loss": 0.4486, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.6272025341516532, |
|
"grad_norm": 1.5362602160998997, |
|
"learning_rate": 2.574199007607144e-06, |
|
"loss": 0.452, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.6287863789348643, |
|
"grad_norm": 1.4546308476604741, |
|
"learning_rate": 2.5550333352513884e-06, |
|
"loss": 0.5295, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.6303702237180756, |
|
"grad_norm": 1.5353573065379114, |
|
"learning_rate": 2.535905739111077e-06, |
|
"loss": 0.4627, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.6319540685012869, |
|
"grad_norm": 1.7076979214421926, |
|
"learning_rate": 2.516816723216157e-06, |
|
"loss": 0.5024, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.6335379132844982, |
|
"grad_norm": 1.8486022817022656, |
|
"learning_rate": 2.49776679057995e-06, |
|
"loss": 0.5149, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6351217580677093, |
|
"grad_norm": 1.566887589871668, |
|
"learning_rate": 2.4787564431858974e-06, |
|
"loss": 0.5059, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.6367056028509206, |
|
"grad_norm": 1.564549295152443, |
|
"learning_rate": 2.4597861819743334e-06, |
|
"loss": 0.4603, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.6382894476341319, |
|
"grad_norm": 1.380492154373069, |
|
"learning_rate": 2.4408565068292827e-06, |
|
"loss": 0.4929, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.6398732924173431, |
|
"grad_norm": 1.5407455494599405, |
|
"learning_rate": 2.4219679165652902e-06, |
|
"loss": 0.5311, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.6414571372005543, |
|
"grad_norm": 1.4174285596906695, |
|
"learning_rate": 2.403120908914277e-06, |
|
"loss": 0.4834, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.6430409819837656, |
|
"grad_norm": 1.873920827014659, |
|
"learning_rate": 2.3843159805124203e-06, |
|
"loss": 0.5017, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.6446248267669769, |
|
"grad_norm": 1.7835980447150575, |
|
"learning_rate": 2.365553626887074e-06, |
|
"loss": 0.5418, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.6462086715501881, |
|
"grad_norm": 1.3633583035406078, |
|
"learning_rate": 2.3468343424437055e-06, |
|
"loss": 0.4608, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.6477925163333993, |
|
"grad_norm": 1.8174838090865337, |
|
"learning_rate": 2.3281586204528677e-06, |
|
"loss": 0.5257, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.6493763611166106, |
|
"grad_norm": 1.5982388803586878, |
|
"learning_rate": 2.309526953037203e-06, |
|
"loss": 0.5193, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6509602058998218, |
|
"grad_norm": 2.817392677546035, |
|
"learning_rate": 2.2909398311584775e-06, |
|
"loss": 0.4578, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.6525440506830331, |
|
"grad_norm": 1.581257330292036, |
|
"learning_rate": 2.272397744604636e-06, |
|
"loss": 0.5021, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.6541278954662443, |
|
"grad_norm": 1.6043151167776697, |
|
"learning_rate": 2.253901181976905e-06, |
|
"loss": 0.5405, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.6557117402494556, |
|
"grad_norm": 2.1291440852646817, |
|
"learning_rate": 2.2354506306769143e-06, |
|
"loss": 0.5301, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.6572955850326668, |
|
"grad_norm": 1.9230439043144794, |
|
"learning_rate": 2.2170465768938473e-06, |
|
"loss": 0.5709, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.658879429815878, |
|
"grad_norm": 1.472833355116063, |
|
"learning_rate": 2.1986895055916366e-06, |
|
"loss": 0.4326, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.6604632745990893, |
|
"grad_norm": 1.87905372939008, |
|
"learning_rate": 2.1803799004961824e-06, |
|
"loss": 0.462, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.6620471193823005, |
|
"grad_norm": 1.4366052509179692, |
|
"learning_rate": 2.1621182440826096e-06, |
|
"loss": 0.5735, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.6636309641655118, |
|
"grad_norm": 1.7279130546658286, |
|
"learning_rate": 2.143905017562547e-06, |
|
"loss": 0.408, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.665214808948723, |
|
"grad_norm": 1.536940380154465, |
|
"learning_rate": 2.12574070087145e-06, |
|
"loss": 0.5199, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.665214808948723, |
|
"eval_accuracy": 0.810934698088188, |
|
"eval_loss": 0.5862451195716858, |
|
"eval_perplexity": 1.2037860680935488, |
|
"eval_runtime": 531.3769, |
|
"eval_samples_per_second": 1.432, |
|
"eval_steps_per_second": 1.432, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6667986537319343, |
|
"grad_norm": 1.565207882157784, |
|
"learning_rate": 2.10762577265596e-06, |
|
"loss": 0.4879, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.6683824985151455, |
|
"grad_norm": 1.4691468768515952, |
|
"learning_rate": 2.0895607102612803e-06, |
|
"loss": 0.5024, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.6699663432983568, |
|
"grad_norm": 1.3928452284838215, |
|
"learning_rate": 2.0715459897186044e-06, |
|
"loss": 0.5901, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.671550188081568, |
|
"grad_norm": 1.4610053346508791, |
|
"learning_rate": 2.0535820857325753e-06, |
|
"loss": 0.5062, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.6731340328647792, |
|
"grad_norm": 1.3712954252365843, |
|
"learning_rate": 2.0356694716687682e-06, |
|
"loss": 0.5479, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6747178776479905, |
|
"grad_norm": 1.3758715850017582, |
|
"learning_rate": 2.017808619541221e-06, |
|
"loss": 0.4969, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.6763017224312018, |
|
"grad_norm": 1.2871110284961038, |
|
"learning_rate": 2.0000000000000008e-06, |
|
"loss": 0.4832, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.677885567214413, |
|
"grad_norm": 1.4668871631682787, |
|
"learning_rate": 1.982244082318793e-06, |
|
"loss": 0.4901, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.6794694119976242, |
|
"grad_norm": 1.515067727433541, |
|
"learning_rate": 1.9645413343825406e-06, |
|
"loss": 0.5362, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.6810532567808355, |
|
"grad_norm": 1.427224417336294, |
|
"learning_rate": 1.946892222675118e-06, |
|
"loss": 0.467, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6826371015640468, |
|
"grad_norm": 1.3486800429336776, |
|
"learning_rate": 1.92929721226703e-06, |
|
"loss": 0.4614, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.6842209463472579, |
|
"grad_norm": 1.7205504786002677, |
|
"learning_rate": 1.9117567668031665e-06, |
|
"loss": 0.45, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.6858047911304692, |
|
"grad_norm": 1.6996871872002546, |
|
"learning_rate": 1.8942713484905761e-06, |
|
"loss": 0.5028, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.6873886359136805, |
|
"grad_norm": 1.5407143434367043, |
|
"learning_rate": 1.8768414180862956e-06, |
|
"loss": 0.5294, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.6889724806968917, |
|
"grad_norm": 1.3480994367623826, |
|
"learning_rate": 1.859467434885199e-06, |
|
"loss": 0.4558, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6905563254801029, |
|
"grad_norm": 1.7334838280078013, |
|
"learning_rate": 1.8421498567079005e-06, |
|
"loss": 0.5249, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.6921401702633142, |
|
"grad_norm": 2.254316615478953, |
|
"learning_rate": 1.8248891398886936e-06, |
|
"loss": 0.6142, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.6937240150465255, |
|
"grad_norm": 1.5257795875613787, |
|
"learning_rate": 1.8076857392635176e-06, |
|
"loss": 0.4471, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.6953078598297366, |
|
"grad_norm": 1.700232100023801, |
|
"learning_rate": 1.7905401081579768e-06, |
|
"loss": 0.52, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.6968917046129479, |
|
"grad_norm": 1.33014022877584, |
|
"learning_rate": 1.7734526983753986e-06, |
|
"loss": 0.4591, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6984755493961592, |
|
"grad_norm": 1.7591582976289475, |
|
"learning_rate": 1.7564239601849216e-06, |
|
"loss": 0.556, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.7000593941793705, |
|
"grad_norm": 1.5534876715610133, |
|
"learning_rate": 1.7394543423096325e-06, |
|
"loss": 0.4904, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.7016432389625816, |
|
"grad_norm": 1.4877228267210632, |
|
"learning_rate": 1.7225442919147465e-06, |
|
"loss": 0.5103, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.7032270837457929, |
|
"grad_norm": 1.4261700371364836, |
|
"learning_rate": 1.7056942545958167e-06, |
|
"loss": 0.4619, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.7048109285290042, |
|
"grad_norm": 1.5419799312577944, |
|
"learning_rate": 1.6889046743669955e-06, |
|
"loss": 0.5397, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.7063947733122155, |
|
"grad_norm": 1.6636001306308572, |
|
"learning_rate": 1.6721759936493398e-06, |
|
"loss": 0.5358, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.7079786180954266, |
|
"grad_norm": 1.5271438092360352, |
|
"learning_rate": 1.6555086532591425e-06, |
|
"loss": 0.4629, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.7095624628786379, |
|
"grad_norm": 1.6441451799989772, |
|
"learning_rate": 1.6389030923963221e-06, |
|
"loss": 0.4495, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.7111463076618492, |
|
"grad_norm": 1.5088562455266363, |
|
"learning_rate": 1.6223597486328533e-06, |
|
"loss": 0.4715, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.7127301524450604, |
|
"grad_norm": 1.4463031901473735, |
|
"learning_rate": 1.6058790579012275e-06, |
|
"loss": 0.5491, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7143139972282716, |
|
"grad_norm": 1.471434921379099, |
|
"learning_rate": 1.5894614544829747e-06, |
|
"loss": 0.4864, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.7158978420114829, |
|
"grad_norm": 1.6603661388527584, |
|
"learning_rate": 1.5731073709972113e-06, |
|
"loss": 0.4506, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.7174816867946942, |
|
"grad_norm": 1.5613545052853812, |
|
"learning_rate": 1.5568172383892488e-06, |
|
"loss": 0.5735, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.7190655315779053, |
|
"grad_norm": 1.6697903304969928, |
|
"learning_rate": 1.54059148591923e-06, |
|
"loss": 0.539, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.7206493763611166, |
|
"grad_norm": 1.4866099563811799, |
|
"learning_rate": 1.5244305411508215e-06, |
|
"loss": 0.5165, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.7222332211443279, |
|
"grad_norm": 1.5688875196040748, |
|
"learning_rate": 1.5083348299399506e-06, |
|
"loss": 0.4431, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.7238170659275391, |
|
"grad_norm": 1.470987976048739, |
|
"learning_rate": 1.492304776423575e-06, |
|
"loss": 0.4692, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.7254009107107503, |
|
"grad_norm": 1.373015781073825, |
|
"learning_rate": 1.4763408030085112e-06, |
|
"loss": 0.4408, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.7269847554939616, |
|
"grad_norm": 1.7939617236990213, |
|
"learning_rate": 1.460443330360309e-06, |
|
"loss": 0.4836, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.7285686002771729, |
|
"grad_norm": 1.7961810685183246, |
|
"learning_rate": 1.4446127773921557e-06, |
|
"loss": 0.5373, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7301524450603841, |
|
"grad_norm": 1.8372334481084611, |
|
"learning_rate": 1.4288495612538425e-06, |
|
"loss": 0.4675, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.7317362898435953, |
|
"grad_norm": 1.4460691785589137, |
|
"learning_rate": 1.413154097320778e-06, |
|
"loss": 0.4537, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.7333201346268066, |
|
"grad_norm": 1.54932404541608, |
|
"learning_rate": 1.3975267991830327e-06, |
|
"loss": 0.4473, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.7349039794100178, |
|
"grad_norm": 1.4531601052014125, |
|
"learning_rate": 1.3819680786344434e-06, |
|
"loss": 0.5499, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.7364878241932291, |
|
"grad_norm": 1.6713295832702009, |
|
"learning_rate": 1.3664783456617702e-06, |
|
"loss": 0.4369, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.7380716689764403, |
|
"grad_norm": 1.4987250866458606, |
|
"learning_rate": 1.3510580084338803e-06, |
|
"loss": 0.5176, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.7396555137596516, |
|
"grad_norm": 1.7331331058430652, |
|
"learning_rate": 1.3357074732909995e-06, |
|
"loss": 0.4903, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.7412393585428628, |
|
"grad_norm": 1.6659137598021865, |
|
"learning_rate": 1.320427144734008e-06, |
|
"loss": 0.4823, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.7428232033260741, |
|
"grad_norm": 1.5499926208087895, |
|
"learning_rate": 1.3052174254137712e-06, |
|
"loss": 0.3442, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.7444070481092853, |
|
"grad_norm": 1.92051205057408, |
|
"learning_rate": 1.2900787161205404e-06, |
|
"loss": 0.5399, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7459908928924965, |
|
"grad_norm": 1.3472536176738465, |
|
"learning_rate": 1.2750114157733829e-06, |
|
"loss": 0.4111, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.7475747376757078, |
|
"grad_norm": 1.71412692276887, |
|
"learning_rate": 1.2600159214096775e-06, |
|
"loss": 0.5043, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.749158582458919, |
|
"grad_norm": 1.3640389981993508, |
|
"learning_rate": 1.2450926281746456e-06, |
|
"loss": 0.5684, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.7507424272421303, |
|
"grad_norm": 1.4068613006077029, |
|
"learning_rate": 1.2302419293109414e-06, |
|
"loss": 0.5849, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.7523262720253415, |
|
"grad_norm": 1.430129815588007, |
|
"learning_rate": 1.2154642161482937e-06, |
|
"loss": 0.5287, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7539101168085528, |
|
"grad_norm": 1.2831468841594065, |
|
"learning_rate": 1.2007598780931863e-06, |
|
"loss": 0.4214, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.755493961591764, |
|
"grad_norm": 1.49905785043907, |
|
"learning_rate": 1.1861293026186006e-06, |
|
"loss": 0.4839, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.7570778063749752, |
|
"grad_norm": 1.7028591333085308, |
|
"learning_rate": 1.1715728752538101e-06, |
|
"loss": 0.5414, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.7586616511581865, |
|
"grad_norm": 1.4586336430125626, |
|
"learning_rate": 1.1570909795742116e-06, |
|
"loss": 0.4434, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.7602454959413978, |
|
"grad_norm": 1.5067616302391904, |
|
"learning_rate": 1.1426839971912236e-06, |
|
"loss": 0.4858, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.761829340724609, |
|
"grad_norm": 1.3432603589549863, |
|
"learning_rate": 1.1283523077422325e-06, |
|
"loss": 0.4768, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.7634131855078202, |
|
"grad_norm": 1.6178725773986766, |
|
"learning_rate": 1.1140962888805834e-06, |
|
"loss": 0.5535, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.7649970302910315, |
|
"grad_norm": 1.4689663997635967, |
|
"learning_rate": 1.0999163162656295e-06, |
|
"loss": 0.5227, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.7665808750742428, |
|
"grad_norm": 1.5383027461516126, |
|
"learning_rate": 1.0858127635528394e-06, |
|
"loss": 0.4011, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.7681647198574539, |
|
"grad_norm": 1.5160653262215487, |
|
"learning_rate": 1.0717860023839421e-06, |
|
"loss": 0.5865, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7697485646406652, |
|
"grad_norm": 1.667492242744668, |
|
"learning_rate": 1.0578364023771382e-06, |
|
"loss": 0.5631, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.7713324094238765, |
|
"grad_norm": 1.5597231965693572, |
|
"learning_rate": 1.043964331117364e-06, |
|
"loss": 0.4257, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.7729162542070878, |
|
"grad_norm": 1.4809241793279104, |
|
"learning_rate": 1.0301701541465954e-06, |
|
"loss": 0.4555, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.7745000989902989, |
|
"grad_norm": 1.539628448856408, |
|
"learning_rate": 1.016454234954227e-06, |
|
"loss": 0.5371, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.7760839437735102, |
|
"grad_norm": 1.507321371340315, |
|
"learning_rate": 1.0028169349674827e-06, |
|
"loss": 0.43, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7776677885567215, |
|
"grad_norm": 1.6940747096415172, |
|
"learning_rate": 9.892586135419021e-07, |
|
"loss": 0.531, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.7792516333399326, |
|
"grad_norm": 1.458267308736236, |
|
"learning_rate": 9.757796279518636e-07, |
|
"loss": 0.5206, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.7808354781231439, |
|
"grad_norm": 1.510680045767434, |
|
"learning_rate": 9.623803333811712e-07, |
|
"loss": 0.4083, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.7824193229063552, |
|
"grad_norm": 1.5237051298755062, |
|
"learning_rate": 9.490610829137007e-07, |
|
"loss": 0.4753, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.7840031676895665, |
|
"grad_norm": 1.9004733408315109, |
|
"learning_rate": 9.358222275240884e-07, |
|
"loss": 0.4432, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7855870124727776, |
|
"grad_norm": 1.4568257328395944, |
|
"learning_rate": 9.226641160684842e-07, |
|
"loss": 0.5099, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.7871708572559889, |
|
"grad_norm": 1.7318874449726016, |
|
"learning_rate": 9.095870952753646e-07, |
|
"loss": 0.4351, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.7887547020392002, |
|
"grad_norm": 1.6137017147533719, |
|
"learning_rate": 8.965915097363881e-07, |
|
"loss": 0.5928, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.7903385468224114, |
|
"grad_norm": 1.5650409842133206, |
|
"learning_rate": 8.83677701897318e-07, |
|
"loss": 0.3981, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.7919223916056226, |
|
"grad_norm": 1.5782538718757142, |
|
"learning_rate": 8.708460120490037e-07, |
|
"loss": 0.5337, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7935062363888339, |
|
"grad_norm": 1.5981195947747417, |
|
"learning_rate": 8.580967783184055e-07, |
|
"loss": 0.5147, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.7950900811720452, |
|
"grad_norm": 1.5440200056277393, |
|
"learning_rate": 8.454303366596866e-07, |
|
"loss": 0.5308, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.7966739259552564, |
|
"grad_norm": 1.5571076654650473, |
|
"learning_rate": 8.328470208453682e-07, |
|
"loss": 0.4665, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.7982577707384676, |
|
"grad_norm": 1.4953480568816988, |
|
"learning_rate": 8.203471624575224e-07, |
|
"loss": 0.5417, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.7998416155216789, |
|
"grad_norm": 1.4524866840898938, |
|
"learning_rate": 8.079310908790419e-07, |
|
"loss": 0.4489, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.8014254603048901, |
|
"grad_norm": 2.045243169901501, |
|
"learning_rate": 7.955991332849623e-07, |
|
"loss": 0.6222, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.8030093050881013, |
|
"grad_norm": 1.4598656015711513, |
|
"learning_rate": 7.833516146338329e-07, |
|
"loss": 0.4226, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.8045931498713126, |
|
"grad_norm": 1.3467480479408616, |
|
"learning_rate": 7.711888576591618e-07, |
|
"loss": 0.4603, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.8061769946545239, |
|
"grad_norm": 1.5045994773603517, |
|
"learning_rate": 7.591111828609058e-07, |
|
"loss": 0.4625, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.8077608394377351, |
|
"grad_norm": 1.7584779249154874, |
|
"learning_rate": 7.471189084970291e-07, |
|
"loss": 0.4409, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8093446842209463, |
|
"grad_norm": 1.4108080454193086, |
|
"learning_rate": 7.352123505751135e-07, |
|
"loss": 0.4703, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.8109285290041576, |
|
"grad_norm": 1.8380506141002417, |
|
"learning_rate": 7.233918228440323e-07, |
|
"loss": 0.449, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.8125123737873688, |
|
"grad_norm": 1.58792661941251, |
|
"learning_rate": 7.116576367856871e-07, |
|
"loss": 0.5837, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.8140962185705801, |
|
"grad_norm": 1.450768559703693, |
|
"learning_rate": 7.000101016067912e-07, |
|
"loss": 0.4332, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.8156800633537913, |
|
"grad_norm": 1.4028521815647677, |
|
"learning_rate": 6.884495242307284e-07, |
|
"loss": 0.4748, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.8172639081370026, |
|
"grad_norm": 1.6546175297694308, |
|
"learning_rate": 6.769762092894664e-07, |
|
"loss": 0.5074, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.8188477529202138, |
|
"grad_norm": 1.56576785928193, |
|
"learning_rate": 6.655904591155223e-07, |
|
"loss": 0.5381, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.8204315977034251, |
|
"grad_norm": 1.387565724939211, |
|
"learning_rate": 6.542925737340019e-07, |
|
"loss": 0.4561, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.8220154424866363, |
|
"grad_norm": 1.528956816316375, |
|
"learning_rate": 6.430828508546935e-07, |
|
"loss": 0.4937, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.8235992872698475, |
|
"grad_norm": 1.4223442587210688, |
|
"learning_rate": 6.319615858642193e-07, |
|
"loss": 0.5643, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8251831320530588, |
|
"grad_norm": 1.6114797659220879, |
|
"learning_rate": 6.209290718182538e-07, |
|
"loss": 0.4748, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.8267669768362701, |
|
"grad_norm": 1.440732361602322, |
|
"learning_rate": 6.09985599433804e-07, |
|
"loss": 0.4529, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.8283508216194813, |
|
"grad_norm": 1.7456468155042586, |
|
"learning_rate": 5.99131457081544e-07, |
|
"loss": 0.4569, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.8299346664026925, |
|
"grad_norm": 1.5548170916399435, |
|
"learning_rate": 5.883669307782182e-07, |
|
"loss": 0.4917, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.8315185111859038, |
|
"grad_norm": 1.5325320661573854, |
|
"learning_rate": 5.776923041791076e-07, |
|
"loss": 0.4514, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.833102355969115, |
|
"grad_norm": 1.7221755053166576, |
|
"learning_rate": 5.671078585705489e-07, |
|
"loss": 0.5491, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.8346862007523262, |
|
"grad_norm": 1.3704260247894937, |
|
"learning_rate": 5.566138728625293e-07, |
|
"loss": 0.4455, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.8362700455355375, |
|
"grad_norm": 1.538307339121543, |
|
"learning_rate": 5.462106235813296e-07, |
|
"loss": 0.5443, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.8378538903187488, |
|
"grad_norm": 1.4697584276805509, |
|
"learning_rate": 5.358983848622451e-07, |
|
"loss": 0.4608, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.83943773510196, |
|
"grad_norm": 1.8369715460955707, |
|
"learning_rate": 5.256774284423561e-07, |
|
"loss": 0.5062, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8410215798851712, |
|
"grad_norm": 1.4372888694581465, |
|
"learning_rate": 5.155480236533689e-07, |
|
"loss": 0.4203, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.8426054246683825, |
|
"grad_norm": 1.4932893467966832, |
|
"learning_rate": 5.055104374145221e-07, |
|
"loss": 0.4823, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.8441892694515938, |
|
"grad_norm": 1.4880388898611958, |
|
"learning_rate": 4.955649342255462e-07, |
|
"loss": 0.4552, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.845773114234805, |
|
"grad_norm": 1.6481037142268005, |
|
"learning_rate": 4.857117761596994e-07, |
|
"loss": 0.5839, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.8473569590180162, |
|
"grad_norm": 1.3437637677463443, |
|
"learning_rate": 4.759512228568621e-07, |
|
"loss": 0.4662, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.8489408038012275, |
|
"grad_norm": 1.4761647343726298, |
|
"learning_rate": 4.6628353151668995e-07, |
|
"loss": 0.568, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.8505246485844388, |
|
"grad_norm": 1.4720639230416912, |
|
"learning_rate": 4.567089568918403e-07, |
|
"loss": 0.5371, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.8521084933676499, |
|
"grad_norm": 1.5747841685012622, |
|
"learning_rate": 4.472277512812606e-07, |
|
"loss": 0.441, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.8536923381508612, |
|
"grad_norm": 1.650334451611286, |
|
"learning_rate": 4.378401645235352e-07, |
|
"loss": 0.5202, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.8552761829340725, |
|
"grad_norm": 1.5611475616764667, |
|
"learning_rate": 4.2854644399030526e-07, |
|
"loss": 0.4419, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8568600277172838, |
|
"grad_norm": 1.4489741710945974, |
|
"learning_rate": 4.193468345797511e-07, |
|
"loss": 0.4335, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.8584438725004949, |
|
"grad_norm": 1.410090306645375, |
|
"learning_rate": 4.1024157871013586e-07, |
|
"loss": 0.5519, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.8600277172837062, |
|
"grad_norm": 2.1083504219892966, |
|
"learning_rate": 4.0123091631341933e-07, |
|
"loss": 0.5082, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.8616115620669175, |
|
"grad_norm": 1.513783834747692, |
|
"learning_rate": 3.9231508482893584e-07, |
|
"loss": 0.5122, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.8631954068501286, |
|
"grad_norm": 1.5251998914887834, |
|
"learning_rate": 3.834943191971365e-07, |
|
"loss": 0.4445, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.8647792516333399, |
|
"grad_norm": 1.5084805197178865, |
|
"learning_rate": 3.7476885185340023e-07, |
|
"loss": 0.5231, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.8663630964165512, |
|
"grad_norm": 1.6145093162288873, |
|
"learning_rate": 3.66138912721905e-07, |
|
"loss": 0.4943, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.8679469411997625, |
|
"grad_norm": 1.6928224805035605, |
|
"learning_rate": 3.5760472920957387e-07, |
|
"loss": 0.4923, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.8695307859829736, |
|
"grad_norm": 1.5962608727170904, |
|
"learning_rate": 3.491665262000789e-07, |
|
"loss": 0.4839, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.8711146307661849, |
|
"grad_norm": 1.606681077130113, |
|
"learning_rate": 3.4082452604791587e-07, |
|
"loss": 0.5515, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8726984755493962, |
|
"grad_norm": 1.746327383134502, |
|
"learning_rate": 3.3257894857254877e-07, |
|
"loss": 0.4445, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.8742823203326074, |
|
"grad_norm": 1.6535392291397746, |
|
"learning_rate": 3.2443001105261127e-07, |
|
"loss": 0.3439, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.8758661651158186, |
|
"grad_norm": 1.5448042422189747, |
|
"learning_rate": 3.163779282201853e-07, |
|
"loss": 0.4854, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.8774500098990299, |
|
"grad_norm": 1.634766739227388, |
|
"learning_rate": 3.0842291225514314e-07, |
|
"loss": 0.4817, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.8790338546822412, |
|
"grad_norm": 1.7014309371471963, |
|
"learning_rate": 3.005651727795535e-07, |
|
"loss": 0.5209, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8806176994654524, |
|
"grad_norm": 1.4990930673709706, |
|
"learning_rate": 2.9280491685215847e-07, |
|
"loss": 0.4922, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.8822015442486636, |
|
"grad_norm": 1.6902362872195928, |
|
"learning_rate": 2.85142348962919e-07, |
|
"loss": 0.5189, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.8837853890318749, |
|
"grad_norm": 1.5111021907783055, |
|
"learning_rate": 2.7757767102762587e-07, |
|
"loss": 0.5379, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.8853692338150861, |
|
"grad_norm": 1.4445290757135123, |
|
"learning_rate": 2.701110823825772e-07, |
|
"loss": 0.464, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.8869530785982974, |
|
"grad_norm": 1.4484298586379043, |
|
"learning_rate": 2.62742779779328e-07, |
|
"loss": 0.4532, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8885369233815086, |
|
"grad_norm": 1.8044306708569506, |
|
"learning_rate": 2.5547295737950467e-07, |
|
"loss": 0.4846, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.8901207681647199, |
|
"grad_norm": 1.6320763637605389, |
|
"learning_rate": 2.483018067496885e-07, |
|
"loss": 0.4102, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.8917046129479311, |
|
"grad_norm": 1.513426697301029, |
|
"learning_rate": 2.412295168563667e-07, |
|
"loss": 0.3977, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.8932884577311423, |
|
"grad_norm": 1.739063565024179, |
|
"learning_rate": 2.3425627406095682e-07, |
|
"loss": 0.4883, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.8948723025143536, |
|
"grad_norm": 1.4344210490396612, |
|
"learning_rate": 2.273822621148902e-07, |
|
"loss": 0.523, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.8964561472975648, |
|
"grad_norm": 1.503570765479746, |
|
"learning_rate": 2.206076621547752e-07, |
|
"loss": 0.4387, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.8980399920807761, |
|
"grad_norm": 1.3957138101279774, |
|
"learning_rate": 2.1393265269762194e-07, |
|
"loss": 0.4629, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.8996238368639873, |
|
"grad_norm": 1.5135178225024162, |
|
"learning_rate": 2.0735740963613656e-07, |
|
"loss": 0.5019, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.9012076816471986, |
|
"grad_norm": 1.7552187868476394, |
|
"learning_rate": 2.0088210623408907e-07, |
|
"loss": 0.5353, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.9027915264304098, |
|
"grad_norm": 1.5773847982428775, |
|
"learning_rate": 1.9450691312174538e-07, |
|
"loss": 0.4794, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.9043753712136211, |
|
"grad_norm": 1.4347191250584008, |
|
"learning_rate": 1.8823199829137405e-07, |
|
"loss": 0.5398, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.9059592159968323, |
|
"grad_norm": 1.4568382476735973, |
|
"learning_rate": 1.8205752709281597e-07, |
|
"loss": 0.4439, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.9075430607800435, |
|
"grad_norm": 1.7936484686086873, |
|
"learning_rate": 1.759836622291293e-07, |
|
"loss": 0.4999, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.9091269055632548, |
|
"grad_norm": 1.5069741923698488, |
|
"learning_rate": 1.700105637523026e-07, |
|
"loss": 0.4889, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.9107107503464661, |
|
"grad_norm": 1.3815954432502042, |
|
"learning_rate": 1.6413838905903554e-07, |
|
"loss": 0.4927, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.9122945951296773, |
|
"grad_norm": 1.4408197785033354, |
|
"learning_rate": 1.58367292886592e-07, |
|
"loss": 0.4667, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.9138784399128885, |
|
"grad_norm": 1.5272866153375004, |
|
"learning_rate": 1.526974273087238e-07, |
|
"loss": 0.5076, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.9154622846960998, |
|
"grad_norm": 1.478729805351153, |
|
"learning_rate": 1.4712894173166192e-07, |
|
"loss": 0.461, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.9170461294793111, |
|
"grad_norm": 1.5805538071531569, |
|
"learning_rate": 1.416619828901795e-07, |
|
"loss": 0.5271, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.9186299742625222, |
|
"grad_norm": 1.5508552013902985, |
|
"learning_rate": 1.3629669484372718e-07, |
|
"loss": 0.5267, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9202138190457335, |
|
"grad_norm": 1.8225403408379726, |
|
"learning_rate": 1.310332189726342e-07, |
|
"loss": 0.461, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.9217976638289448, |
|
"grad_norm": 1.4608984268403977, |
|
"learning_rate": 1.2587169397438425e-07, |
|
"loss": 0.4602, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.923381508612156, |
|
"grad_norm": 1.5279726414031836, |
|
"learning_rate": 1.2081225585996246e-07, |
|
"loss": 0.4588, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.9249653533953672, |
|
"grad_norm": 1.515611028063376, |
|
"learning_rate": 1.1585503795026718e-07, |
|
"loss": 0.5179, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.9265491981785785, |
|
"grad_norm": 1.5467139023737668, |
|
"learning_rate": 1.1100017087260205e-07, |
|
"loss": 0.5355, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.9281330429617898, |
|
"grad_norm": 1.4948762852593562, |
|
"learning_rate": 1.0624778255722855e-07, |
|
"loss": 0.4236, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.9297168877450009, |
|
"grad_norm": 1.7477468489012902, |
|
"learning_rate": 1.0159799823399939e-07, |
|
"loss": 0.4904, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.9313007325282122, |
|
"grad_norm": 1.5893000765049863, |
|
"learning_rate": 9.705094042905492e-08, |
|
"loss": 0.5515, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.9328845773114235, |
|
"grad_norm": 1.702578584367335, |
|
"learning_rate": 9.260672896159727e-08, |
|
"loss": 0.4751, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.9344684220946348, |
|
"grad_norm": 1.6463785462145422, |
|
"learning_rate": 8.826548094073194e-08, |
|
"loss": 0.4154, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9360522668778459, |
|
"grad_norm": 1.3759922290335456, |
|
"learning_rate": 8.402731076238189e-08, |
|
"loss": 0.4738, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.9376361116610572, |
|
"grad_norm": 1.5640512221883043, |
|
"learning_rate": 7.989233010627261e-08, |
|
"loss": 0.416, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.9392199564442685, |
|
"grad_norm": 1.6313053854796913, |
|
"learning_rate": 7.586064793298997e-08, |
|
"loss": 0.4193, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.9408038012274798, |
|
"grad_norm": 1.7183629342589013, |
|
"learning_rate": 7.193237048110879e-08, |
|
"loss": 0.5383, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.9423876460106909, |
|
"grad_norm": 1.8057345431306828, |
|
"learning_rate": 6.810760126439285e-08, |
|
"loss": 0.4636, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.9439714907939022, |
|
"grad_norm": 1.581711206380294, |
|
"learning_rate": 6.438644106906866e-08, |
|
"loss": 0.4735, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.9455553355771135, |
|
"grad_norm": 2.0406365820491277, |
|
"learning_rate": 6.076898795116792e-08, |
|
"loss": 0.5349, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.9471391803603247, |
|
"grad_norm": 1.3665960620474567, |
|
"learning_rate": 5.7255337233944376e-08, |
|
"loss": 0.5118, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.9487230251435359, |
|
"grad_norm": 1.4545280151752455, |
|
"learning_rate": 5.3845581505362005e-08, |
|
"loss": 0.4706, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.9503068699267472, |
|
"grad_norm": 1.6568106629852815, |
|
"learning_rate": 5.05398106156556e-08, |
|
"loss": 0.4208, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9518907147099585, |
|
"grad_norm": 1.3009128558987804, |
|
"learning_rate": 4.733811167496249e-08, |
|
"loss": 0.4618, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.9534745594931696, |
|
"grad_norm": 1.6350507056865986, |
|
"learning_rate": 4.4240569051027466e-08, |
|
"loss": 0.4818, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.9550584042763809, |
|
"grad_norm": 1.7692229180589922, |
|
"learning_rate": 4.124726436697878e-08, |
|
"loss": 0.4662, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.9566422490595922, |
|
"grad_norm": 1.4494075240429927, |
|
"learning_rate": 3.8358276499179664e-08, |
|
"loss": 0.6283, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.9582260938428034, |
|
"grad_norm": 1.5176956748111556, |
|
"learning_rate": 3.557368157514595e-08, |
|
"loss": 0.4618, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.9598099386260146, |
|
"grad_norm": 1.5615472024568975, |
|
"learning_rate": 3.2893552971545056e-08, |
|
"loss": 0.3706, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.9613937834092259, |
|
"grad_norm": 2.0109994652257055, |
|
"learning_rate": 3.031796131225706e-08, |
|
"loss": 0.4617, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.9629776281924372, |
|
"grad_norm": 1.6571285729822172, |
|
"learning_rate": 2.7846974466517957e-08, |
|
"loss": 0.4621, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.9645614729756484, |
|
"grad_norm": 1.9648211635566186, |
|
"learning_rate": 2.5480657547129135e-08, |
|
"loss": 0.5031, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.9661453177588596, |
|
"grad_norm": 1.3759221221620814, |
|
"learning_rate": 2.3219072908742253e-08, |
|
"loss": 0.4284, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9677291625420709, |
|
"grad_norm": 1.5776287851272248, |
|
"learning_rate": 2.106228014621525e-08, |
|
"loss": 0.4965, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.9693130073252821, |
|
"grad_norm": 1.4564588215310144, |
|
"learning_rate": 1.901033609304381e-08, |
|
"loss": 0.5313, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.9708968521084934, |
|
"grad_norm": 1.4650178606419144, |
|
"learning_rate": 1.706329481986213e-08, |
|
"loss": 0.4695, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.9724806968917046, |
|
"grad_norm": 8.364306128275436, |
|
"learning_rate": 1.522120763301782e-08, |
|
"loss": 0.5742, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.9740645416749159, |
|
"grad_norm": 1.5436098149612576, |
|
"learning_rate": 1.348412307322233e-08, |
|
"loss": 0.4163, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.9756483864581271, |
|
"grad_norm": 1.4282895143928616, |
|
"learning_rate": 1.1852086914268423e-08, |
|
"loss": 0.5281, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.9772322312413384, |
|
"grad_norm": 1.2864356451763799, |
|
"learning_rate": 1.032514216182756e-08, |
|
"loss": 0.6423, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.9788160760245496, |
|
"grad_norm": 1.3530070123033577, |
|
"learning_rate": 8.903329052313502e-09, |
|
"loss": 0.4773, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.9803999208077608, |
|
"grad_norm": 1.4191474026879363, |
|
"learning_rate": 7.586685051823583e-09, |
|
"loss": 0.5286, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.9819837655909721, |
|
"grad_norm": 1.6895691048157127, |
|
"learning_rate": 6.375244855152839e-09, |
|
"loss": 0.5007, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9835676103741833, |
|
"grad_norm": 1.414899383758995, |
|
"learning_rate": 5.269040384876078e-09, |
|
"loss": 0.4556, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.9851514551573946, |
|
"grad_norm": 1.8800481913954115, |
|
"learning_rate": 4.2681007905103206e-09, |
|
"loss": 0.5251, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.9867352999406058, |
|
"grad_norm": 1.3389313374375587, |
|
"learning_rate": 3.372452447744756e-09, |
|
"loss": 0.3714, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.9883191447238171, |
|
"grad_norm": 1.274196273086913, |
|
"learning_rate": 2.582118957745738e-09, |
|
"loss": 0.4538, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.9899029895070283, |
|
"grad_norm": 1.5641909121422626, |
|
"learning_rate": 1.8971211465363955e-09, |
|
"loss": 0.445, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9914868342902395, |
|
"grad_norm": 1.431529316600778, |
|
"learning_rate": 1.31747706444596e-09, |
|
"loss": 0.4297, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.9930706790734508, |
|
"grad_norm": 1.3641264104243216, |
|
"learning_rate": 8.432019856345896e-10, |
|
"loss": 0.4418, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.9946545238566621, |
|
"grad_norm": 1.409407684955624, |
|
"learning_rate": 4.743084076923587e-10, |
|
"loss": 0.5325, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.9962383686398733, |
|
"grad_norm": 1.4066497217484986, |
|
"learning_rate": 2.108060513075216e-10, |
|
"loss": 0.4452, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.9978222134230845, |
|
"grad_norm": 1.5846124004550377, |
|
"learning_rate": 5.270186001249399e-11, |
|
"loss": 0.3571, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9978222134230845, |
|
"eval_accuracy": 0.8122857271681789, |
|
"eval_loss": 0.5811628699302673, |
|
"eval_perplexity": 1.2019592776188088, |
|
"eval_runtime": 530.5565, |
|
"eval_samples_per_second": 1.434, |
|
"eval_steps_per_second": 1.434, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9994060582062958, |
|
"grad_norm": 1.4491753654503536, |
|
"learning_rate": 0.0, |
|
"loss": 0.5137, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.9994060582062958, |
|
"step": 631, |
|
"total_flos": 132105476505600.0, |
|
"train_loss": 0.5424914620757669, |
|
"train_runtime": 50500.2662, |
|
"train_samples_per_second": 0.2, |
|
"train_steps_per_second": 0.012 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 631, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 132105476505600.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|