INDIC-TAMIL / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13198192806300124,
"eval_steps": 500,
"global_step": 237060,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 2.6396658420562744,
"learning_rate": 5e-06,
"loss": 11.2923,
"step": 500
},
{
"epoch": 0.0,
"grad_norm": 3.1433205604553223,
"learning_rate": 1e-05,
"loss": 10.5195,
"step": 1000
},
{
"epoch": 0.0,
"grad_norm": 2.825376510620117,
"learning_rate": 1.5e-05,
"loss": 9.4381,
"step": 1500
},
{
"epoch": 0.0,
"grad_norm": 6.385465621948242,
"learning_rate": 2e-05,
"loss": 7.8175,
"step": 2000
},
{
"epoch": 0.0,
"grad_norm": 2.1213717460632324,
"learning_rate": 2.5e-05,
"loss": 5.8245,
"step": 2500
},
{
"epoch": 0.0,
"grad_norm": 1.302095890045166,
"learning_rate": 3e-05,
"loss": 4.455,
"step": 3000
},
{
"epoch": 0.0,
"grad_norm": 3.764803886413574,
"learning_rate": 3.5e-05,
"loss": 4.1131,
"step": 3500
},
{
"epoch": 0.0,
"grad_norm": 1.2695655822753906,
"learning_rate": 4e-05,
"loss": 4.0351,
"step": 4000
},
{
"epoch": 0.0,
"grad_norm": 1.787291169166565,
"learning_rate": 4.5e-05,
"loss": 3.9797,
"step": 4500
},
{
"epoch": 0.0,
"grad_norm": 1.4003510475158691,
"learning_rate": 5e-05,
"loss": 3.9632,
"step": 5000
},
{
"epoch": 0.0,
"grad_norm": 1.7297732830047607,
"learning_rate": 5.500000000000001e-05,
"loss": 3.8884,
"step": 5500
},
{
"epoch": 0.0,
"grad_norm": 1.6848719120025635,
"learning_rate": 6e-05,
"loss": 3.8555,
"step": 6000
},
{
"epoch": 0.0,
"grad_norm": 1.5225064754486084,
"learning_rate": 6.500000000000001e-05,
"loss": 3.8361,
"step": 6500
},
{
"epoch": 0.0,
"grad_norm": 1.369327187538147,
"learning_rate": 7e-05,
"loss": 3.8418,
"step": 7000
},
{
"epoch": 0.0,
"grad_norm": 1.8992069959640503,
"learning_rate": 7.500000000000001e-05,
"loss": 3.7863,
"step": 7500
},
{
"epoch": 0.0,
"grad_norm": 0.6943246722221375,
"learning_rate": 8e-05,
"loss": 3.7756,
"step": 8000
},
{
"epoch": 0.0,
"grad_norm": 0.9557390213012695,
"learning_rate": 8.5e-05,
"loss": 3.7761,
"step": 8500
},
{
"epoch": 0.01,
"grad_norm": 5.63359260559082,
"learning_rate": 9e-05,
"loss": 3.7632,
"step": 9000
},
{
"epoch": 0.01,
"grad_norm": 2.1707723140716553,
"learning_rate": 9.5e-05,
"loss": 3.7302,
"step": 9500
},
{
"epoch": 0.01,
"grad_norm": 1.575547456741333,
"learning_rate": 0.0001,
"loss": 3.6666,
"step": 10000
},
{
"epoch": 0.01,
"grad_norm": 1.7842143774032593,
"learning_rate": 9.999907191972478e-05,
"loss": 3.6669,
"step": 10500
},
{
"epoch": 0.01,
"grad_norm": 3.472308874130249,
"learning_rate": 9.999814383944955e-05,
"loss": 3.6722,
"step": 11000
},
{
"epoch": 0.01,
"grad_norm": 4.764346122741699,
"learning_rate": 9.99972157591743e-05,
"loss": 3.6258,
"step": 11500
},
{
"epoch": 0.01,
"grad_norm": 1.4110618829727173,
"learning_rate": 9.999628767889908e-05,
"loss": 3.6232,
"step": 12000
},
{
"epoch": 0.01,
"grad_norm": 1.8532825708389282,
"learning_rate": 9.999535959862385e-05,
"loss": 3.6201,
"step": 12500
},
{
"epoch": 0.01,
"grad_norm": 3.7213845252990723,
"learning_rate": 9.999443151834862e-05,
"loss": 3.5771,
"step": 13000
},
{
"epoch": 0.01,
"grad_norm": 6.306430816650391,
"learning_rate": 9.999350343807339e-05,
"loss": 3.5676,
"step": 13500
},
{
"epoch": 0.01,
"grad_norm": 3.5671603679656982,
"learning_rate": 9.999257535779816e-05,
"loss": 3.542,
"step": 14000
},
{
"epoch": 0.01,
"grad_norm": 1.9145666360855103,
"learning_rate": 9.999164727752292e-05,
"loss": 3.524,
"step": 14500
},
{
"epoch": 0.01,
"grad_norm": 2.1453213691711426,
"learning_rate": 9.999071919724769e-05,
"loss": 3.507,
"step": 15000
},
{
"epoch": 0.01,
"grad_norm": 2.5378761291503906,
"learning_rate": 9.998979111697245e-05,
"loss": 3.5117,
"step": 15500
},
{
"epoch": 0.01,
"grad_norm": 1.8997902870178223,
"learning_rate": 9.998886303669722e-05,
"loss": 3.4705,
"step": 16000
},
{
"epoch": 0.01,
"grad_norm": 1.9014019966125488,
"learning_rate": 9.998793495642199e-05,
"loss": 3.4625,
"step": 16500
},
{
"epoch": 0.01,
"grad_norm": 4.82717227935791,
"learning_rate": 9.998700687614676e-05,
"loss": 3.462,
"step": 17000
},
{
"epoch": 0.01,
"grad_norm": 4.583270072937012,
"learning_rate": 9.998607879587154e-05,
"loss": 3.434,
"step": 17500
},
{
"epoch": 0.01,
"grad_norm": 2.3539793491363525,
"learning_rate": 9.998515071559629e-05,
"loss": 3.3932,
"step": 18000
},
{
"epoch": 0.01,
"grad_norm": 3.745349884033203,
"learning_rate": 9.998422263532106e-05,
"loss": 3.3875,
"step": 18500
},
{
"epoch": 0.01,
"grad_norm": 3.2298102378845215,
"learning_rate": 9.998329455504584e-05,
"loss": 3.3663,
"step": 19000
},
{
"epoch": 0.01,
"grad_norm": 2.3267478942871094,
"learning_rate": 9.998236647477061e-05,
"loss": 3.3471,
"step": 19500
},
{
"epoch": 0.01,
"grad_norm": 4.170761585235596,
"learning_rate": 9.998143839449538e-05,
"loss": 3.3523,
"step": 20000
},
{
"epoch": 0.01,
"grad_norm": 3.3661534786224365,
"learning_rate": 9.998051031422015e-05,
"loss": 3.3571,
"step": 20500
},
{
"epoch": 0.01,
"grad_norm": 3.1715431213378906,
"learning_rate": 9.997958223394491e-05,
"loss": 3.3627,
"step": 21000
},
{
"epoch": 0.01,
"grad_norm": 12.678780555725098,
"learning_rate": 9.997865415366968e-05,
"loss": 3.3433,
"step": 21500
},
{
"epoch": 0.01,
"grad_norm": 4.594781398773193,
"learning_rate": 9.997772607339445e-05,
"loss": 3.3309,
"step": 22000
},
{
"epoch": 0.01,
"grad_norm": 3.517892599105835,
"learning_rate": 9.997679799311922e-05,
"loss": 3.3274,
"step": 22500
},
{
"epoch": 0.01,
"grad_norm": 2.783637285232544,
"learning_rate": 9.9975869912844e-05,
"loss": 3.3317,
"step": 23000
},
{
"epoch": 0.01,
"grad_norm": 4.124772548675537,
"learning_rate": 9.997494183256876e-05,
"loss": 3.3127,
"step": 23500
},
{
"epoch": 0.01,
"grad_norm": 2.7662994861602783,
"learning_rate": 9.997401375229352e-05,
"loss": 3.3101,
"step": 24000
},
{
"epoch": 0.01,
"grad_norm": 2.652294158935547,
"learning_rate": 9.99730856720183e-05,
"loss": 3.2721,
"step": 24500
},
{
"epoch": 0.01,
"grad_norm": 2.3313260078430176,
"learning_rate": 9.997215759174307e-05,
"loss": 3.2796,
"step": 25000
},
{
"epoch": 0.01,
"grad_norm": 4.2452392578125,
"learning_rate": 9.997122951146782e-05,
"loss": 3.2846,
"step": 25500
},
{
"epoch": 0.01,
"grad_norm": 3.510793924331665,
"learning_rate": 9.99703014311926e-05,
"loss": 3.29,
"step": 26000
},
{
"epoch": 0.01,
"grad_norm": 2.9583752155303955,
"learning_rate": 9.996937335091737e-05,
"loss": 3.2618,
"step": 26500
},
{
"epoch": 0.02,
"grad_norm": 3.7444071769714355,
"learning_rate": 9.996844527064212e-05,
"loss": 3.2348,
"step": 27000
},
{
"epoch": 0.02,
"grad_norm": 2.852471113204956,
"learning_rate": 9.99675171903669e-05,
"loss": 3.2315,
"step": 27500
},
{
"epoch": 0.02,
"grad_norm": 2.542102813720703,
"learning_rate": 9.996658911009167e-05,
"loss": 3.2564,
"step": 28000
},
{
"epoch": 0.02,
"grad_norm": 3.195061445236206,
"learning_rate": 9.996566102981644e-05,
"loss": 3.2477,
"step": 28500
},
{
"epoch": 0.02,
"grad_norm": 4.133997917175293,
"learning_rate": 9.996473294954121e-05,
"loss": 3.2246,
"step": 29000
},
{
"epoch": 0.02,
"grad_norm": 3.147731304168701,
"learning_rate": 9.996380486926598e-05,
"loss": 3.2483,
"step": 29500
},
{
"epoch": 0.02,
"grad_norm": 2.417142868041992,
"learning_rate": 9.996287678899074e-05,
"loss": 3.2084,
"step": 30000
},
{
"epoch": 0.02,
"grad_norm": 2.3051340579986572,
"learning_rate": 9.996194870871551e-05,
"loss": 3.2305,
"step": 30500
},
{
"epoch": 0.02,
"grad_norm": 3.268831491470337,
"learning_rate": 9.996102062844028e-05,
"loss": 3.2198,
"step": 31000
},
{
"epoch": 0.02,
"grad_norm": 2.9484684467315674,
"learning_rate": 9.996009254816505e-05,
"loss": 3.1927,
"step": 31500
},
{
"epoch": 0.02,
"grad_norm": 5.482509613037109,
"learning_rate": 9.995916446788982e-05,
"loss": 3.2018,
"step": 32000
},
{
"epoch": 0.02,
"grad_norm": 3.319070339202881,
"learning_rate": 9.99582363876146e-05,
"loss": 3.2034,
"step": 32500
},
{
"epoch": 0.02,
"grad_norm": 2.9254605770111084,
"learning_rate": 9.995730830733935e-05,
"loss": 3.1935,
"step": 33000
},
{
"epoch": 0.02,
"grad_norm": 3.508344888687134,
"learning_rate": 9.995638022706412e-05,
"loss": 3.1913,
"step": 33500
},
{
"epoch": 0.02,
"grad_norm": 2.0120761394500732,
"learning_rate": 9.99554521467889e-05,
"loss": 3.2221,
"step": 34000
},
{
"epoch": 0.02,
"grad_norm": 2.6379246711730957,
"learning_rate": 9.995452406651367e-05,
"loss": 3.1848,
"step": 34500
},
{
"epoch": 0.02,
"grad_norm": 4.178890705108643,
"learning_rate": 9.995359598623844e-05,
"loss": 3.184,
"step": 35000
},
{
"epoch": 0.02,
"grad_norm": 3.148407459259033,
"learning_rate": 9.995266790596321e-05,
"loss": 3.2119,
"step": 35500
},
{
"epoch": 0.02,
"grad_norm": 2.3678390979766846,
"learning_rate": 9.995173982568797e-05,
"loss": 3.1744,
"step": 36000
},
{
"epoch": 0.02,
"grad_norm": 3.0567054748535156,
"learning_rate": 9.995081174541273e-05,
"loss": 3.1821,
"step": 36500
},
{
"epoch": 0.02,
"grad_norm": 3.4158051013946533,
"learning_rate": 9.99498836651375e-05,
"loss": 3.1615,
"step": 37000
},
{
"epoch": 0.02,
"grad_norm": 3.654043197631836,
"learning_rate": 9.994895558486227e-05,
"loss": 3.174,
"step": 37500
},
{
"epoch": 0.02,
"grad_norm": 3.634075403213501,
"learning_rate": 9.994802750458704e-05,
"loss": 3.1792,
"step": 38000
},
{
"epoch": 0.02,
"grad_norm": 2.8857271671295166,
"learning_rate": 9.994709942431181e-05,
"loss": 3.1441,
"step": 38500
},
{
"epoch": 0.02,
"grad_norm": 3.0216064453125,
"learning_rate": 9.994617134403658e-05,
"loss": 3.1464,
"step": 39000
},
{
"epoch": 0.02,
"grad_norm": 4.89854097366333,
"learning_rate": 9.994524326376134e-05,
"loss": 3.1514,
"step": 39500
},
{
"epoch": 0.02,
"grad_norm": 3.495551586151123,
"learning_rate": 9.994431518348611e-05,
"loss": 3.153,
"step": 40000
},
{
"epoch": 0.02,
"grad_norm": 2.9397456645965576,
"learning_rate": 9.994338710321088e-05,
"loss": 3.1329,
"step": 40500
},
{
"epoch": 0.02,
"grad_norm": 5.4169721603393555,
"learning_rate": 9.994245902293566e-05,
"loss": 3.1485,
"step": 41000
},
{
"epoch": 0.02,
"grad_norm": 2.784404754638672,
"learning_rate": 9.994153094266043e-05,
"loss": 3.1423,
"step": 41500
},
{
"epoch": 0.02,
"grad_norm": 2.435594320297241,
"learning_rate": 9.994060286238518e-05,
"loss": 3.133,
"step": 42000
},
{
"epoch": 0.02,
"grad_norm": 2.478294610977173,
"learning_rate": 9.993967478210996e-05,
"loss": 3.135,
"step": 42500
},
{
"epoch": 0.02,
"grad_norm": 3.871967315673828,
"learning_rate": 9.993874670183473e-05,
"loss": 3.1133,
"step": 43000
},
{
"epoch": 0.02,
"grad_norm": 2.735368490219116,
"learning_rate": 9.99378186215595e-05,
"loss": 3.1333,
"step": 43500
},
{
"epoch": 0.02,
"grad_norm": 3.2882280349731445,
"learning_rate": 9.993689054128427e-05,
"loss": 3.1231,
"step": 44000
},
{
"epoch": 0.02,
"grad_norm": 4.579193115234375,
"learning_rate": 9.993596246100904e-05,
"loss": 3.1051,
"step": 44500
},
{
"epoch": 0.03,
"grad_norm": 4.136692047119141,
"learning_rate": 9.99350343807338e-05,
"loss": 3.1295,
"step": 45000
},
{
"epoch": 0.03,
"grad_norm": 3.8374266624450684,
"learning_rate": 9.993410630045857e-05,
"loss": 3.1182,
"step": 45500
},
{
"epoch": 0.03,
"grad_norm": 3.8597323894500732,
"learning_rate": 9.993317822018334e-05,
"loss": 3.1102,
"step": 46000
},
{
"epoch": 0.03,
"grad_norm": 2.7364611625671387,
"learning_rate": 9.99322501399081e-05,
"loss": 3.1021,
"step": 46500
},
{
"epoch": 0.03,
"grad_norm": 10.505576133728027,
"learning_rate": 9.993132205963287e-05,
"loss": 3.087,
"step": 47000
},
{
"epoch": 0.03,
"grad_norm": 3.8307197093963623,
"learning_rate": 9.993039397935764e-05,
"loss": 3.1033,
"step": 47500
},
{
"epoch": 0.03,
"grad_norm": 3.2991435527801514,
"learning_rate": 9.992946589908241e-05,
"loss": 3.0994,
"step": 48000
},
{
"epoch": 0.03,
"grad_norm": 2.461336612701416,
"learning_rate": 9.992853781880717e-05,
"loss": 3.0768,
"step": 48500
},
{
"epoch": 0.03,
"grad_norm": 4.653237819671631,
"learning_rate": 9.992760973853194e-05,
"loss": 3.0751,
"step": 49000
},
{
"epoch": 0.03,
"grad_norm": 3.9689579010009766,
"learning_rate": 9.992668165825671e-05,
"loss": 3.0845,
"step": 49500
},
{
"epoch": 0.03,
"grad_norm": 3.50116229057312,
"learning_rate": 9.992575357798149e-05,
"loss": 3.0563,
"step": 50000
},
{
"epoch": 0.03,
"grad_norm": 7.10615873336792,
"learning_rate": 9.992482549770626e-05,
"loss": 3.0585,
"step": 50500
},
{
"epoch": 0.03,
"grad_norm": 3.484778642654419,
"learning_rate": 9.992389741743103e-05,
"loss": 3.0657,
"step": 51000
},
{
"epoch": 0.03,
"grad_norm": 4.087871074676514,
"learning_rate": 9.992296933715579e-05,
"loss": 3.0302,
"step": 51500
},
{
"epoch": 0.03,
"grad_norm": 6.728222846984863,
"learning_rate": 9.992204125688056e-05,
"loss": 3.0197,
"step": 52000
},
{
"epoch": 0.03,
"grad_norm": 4.08004093170166,
"learning_rate": 9.992111317660533e-05,
"loss": 3.0261,
"step": 52500
},
{
"epoch": 0.03,
"grad_norm": 4.262966156005859,
"learning_rate": 9.99201850963301e-05,
"loss": 3.0185,
"step": 53000
},
{
"epoch": 0.03,
"grad_norm": 3.3461170196533203,
"learning_rate": 9.991925701605487e-05,
"loss": 2.9916,
"step": 53500
},
{
"epoch": 0.03,
"grad_norm": 5.650246620178223,
"learning_rate": 9.991832893577964e-05,
"loss": 3.0014,
"step": 54000
},
{
"epoch": 0.03,
"grad_norm": 4.820484161376953,
"learning_rate": 9.99174008555044e-05,
"loss": 2.9966,
"step": 54500
},
{
"epoch": 0.03,
"grad_norm": 3.245210886001587,
"learning_rate": 9.991647277522917e-05,
"loss": 2.9829,
"step": 55000
},
{
"epoch": 0.03,
"grad_norm": 3.009565591812134,
"learning_rate": 9.991554469495394e-05,
"loss": 2.9665,
"step": 55500
},
{
"epoch": 0.03,
"grad_norm": 5.277273654937744,
"learning_rate": 9.991461661467872e-05,
"loss": 2.9572,
"step": 56000
},
{
"epoch": 0.03,
"grad_norm": 3.0885508060455322,
"learning_rate": 9.991368853440347e-05,
"loss": 2.953,
"step": 56500
},
{
"epoch": 0.03,
"grad_norm": 3.3525726795196533,
"learning_rate": 9.991276045412824e-05,
"loss": 2.9494,
"step": 57000
},
{
"epoch": 0.03,
"grad_norm": 3.699075937271118,
"learning_rate": 9.9911832373853e-05,
"loss": 2.9731,
"step": 57500
},
{
"epoch": 0.03,
"grad_norm": 2.363311529159546,
"learning_rate": 9.991090429357777e-05,
"loss": 2.9257,
"step": 58000
},
{
"epoch": 0.03,
"grad_norm": 3.144495964050293,
"learning_rate": 9.990997621330255e-05,
"loss": 2.9135,
"step": 58500
},
{
"epoch": 0.03,
"grad_norm": 3.192347288131714,
"learning_rate": 9.990904813302732e-05,
"loss": 2.8925,
"step": 59000
},
{
"epoch": 0.03,
"grad_norm": 3.4419093132019043,
"learning_rate": 9.990812005275209e-05,
"loss": 2.9018,
"step": 59500
},
{
"epoch": 0.03,
"grad_norm": 3.506303071975708,
"learning_rate": 9.990719197247686e-05,
"loss": 2.8921,
"step": 60000
},
{
"epoch": 0.03,
"grad_norm": 2.7846992015838623,
"learning_rate": 9.990626389220162e-05,
"loss": 2.8904,
"step": 60500
},
{
"epoch": 0.03,
"grad_norm": 3.9778714179992676,
"learning_rate": 9.990533581192639e-05,
"loss": 2.854,
"step": 61000
},
{
"epoch": 0.03,
"grad_norm": 3.1654438972473145,
"learning_rate": 9.990440773165116e-05,
"loss": 2.8481,
"step": 61500
},
{
"epoch": 0.03,
"grad_norm": 5.007691860198975,
"learning_rate": 9.990347965137593e-05,
"loss": 2.836,
"step": 62000
},
{
"epoch": 0.03,
"grad_norm": 3.4846861362457275,
"learning_rate": 9.99025515711007e-05,
"loss": 2.8312,
"step": 62500
},
{
"epoch": 0.04,
"grad_norm": 6.449169158935547,
"learning_rate": 9.990162349082547e-05,
"loss": 2.8371,
"step": 63000
},
{
"epoch": 0.04,
"grad_norm": 5.483363628387451,
"learning_rate": 9.990069541055023e-05,
"loss": 2.8074,
"step": 63500
},
{
"epoch": 0.04,
"grad_norm": 4.194338798522949,
"learning_rate": 9.9899767330275e-05,
"loss": 2.8134,
"step": 64000
},
{
"epoch": 0.04,
"grad_norm": 4.445028305053711,
"learning_rate": 9.989883924999978e-05,
"loss": 2.8024,
"step": 64500
},
{
"epoch": 0.04,
"grad_norm": 4.259857177734375,
"learning_rate": 9.989791116972455e-05,
"loss": 2.8063,
"step": 65000
},
{
"epoch": 0.04,
"grad_norm": 15.38962173461914,
"learning_rate": 9.989698308944932e-05,
"loss": 2.7849,
"step": 65500
},
{
"epoch": 0.04,
"grad_norm": 7.087435245513916,
"learning_rate": 9.989605500917409e-05,
"loss": 2.7755,
"step": 66000
},
{
"epoch": 0.04,
"grad_norm": 4.6525044441223145,
"learning_rate": 9.989512692889885e-05,
"loss": 2.7692,
"step": 66500
},
{
"epoch": 0.04,
"grad_norm": 3.657395124435425,
"learning_rate": 9.98941988486236e-05,
"loss": 2.7754,
"step": 67000
},
{
"epoch": 0.04,
"grad_norm": 4.082229137420654,
"learning_rate": 9.989327076834838e-05,
"loss": 2.7462,
"step": 67500
},
{
"epoch": 0.04,
"grad_norm": 5.71895694732666,
"learning_rate": 9.989234268807315e-05,
"loss": 2.7489,
"step": 68000
},
{
"epoch": 0.04,
"grad_norm": 4.07025671005249,
"learning_rate": 9.989141460779792e-05,
"loss": 2.7429,
"step": 68500
},
{
"epoch": 0.04,
"grad_norm": 6.3579888343811035,
"learning_rate": 9.989048652752269e-05,
"loss": 2.7359,
"step": 69000
},
{
"epoch": 0.04,
"grad_norm": 5.608436107635498,
"learning_rate": 9.988955844724746e-05,
"loss": 2.7413,
"step": 69500
},
{
"epoch": 0.04,
"grad_norm": 3.6080968379974365,
"learning_rate": 9.988863036697222e-05,
"loss": 2.7374,
"step": 70000
},
{
"epoch": 0.04,
"grad_norm": 3.0696682929992676,
"learning_rate": 9.988770228669699e-05,
"loss": 2.7024,
"step": 70500
},
{
"epoch": 0.04,
"grad_norm": 3.4028148651123047,
"learning_rate": 9.988677420642176e-05,
"loss": 2.6969,
"step": 71000
},
{
"epoch": 0.04,
"grad_norm": 3.2408607006073,
"learning_rate": 9.988584612614653e-05,
"loss": 2.7142,
"step": 71500
},
{
"epoch": 0.04,
"grad_norm": 4.686740875244141,
"learning_rate": 9.98849180458713e-05,
"loss": 2.6779,
"step": 72000
},
{
"epoch": 0.04,
"grad_norm": 4.289364814758301,
"learning_rate": 9.988398996559606e-05,
"loss": 2.6924,
"step": 72500
},
{
"epoch": 0.04,
"grad_norm": 5.570862293243408,
"learning_rate": 9.988306188532083e-05,
"loss": 2.7066,
"step": 73000
},
{
"epoch": 0.04,
"grad_norm": 6.265756130218506,
"learning_rate": 9.98821338050456e-05,
"loss": 2.6904,
"step": 73500
},
{
"epoch": 0.04,
"grad_norm": 4.149326324462891,
"learning_rate": 9.988120572477038e-05,
"loss": 2.675,
"step": 74000
},
{
"epoch": 0.04,
"grad_norm": 3.304511547088623,
"learning_rate": 9.988027764449515e-05,
"loss": 2.6588,
"step": 74500
},
{
"epoch": 0.04,
"grad_norm": 4.896495342254639,
"learning_rate": 9.987934956421992e-05,
"loss": 2.6543,
"step": 75000
},
{
"epoch": 0.04,
"grad_norm": 4.827699661254883,
"learning_rate": 9.987842148394468e-05,
"loss": 2.6547,
"step": 75500
},
{
"epoch": 0.04,
"grad_norm": 4.508848667144775,
"learning_rate": 9.987749340366945e-05,
"loss": 2.6737,
"step": 76000
},
{
"epoch": 0.04,
"grad_norm": 2.711066722869873,
"learning_rate": 9.987656532339422e-05,
"loss": 2.6596,
"step": 76500
},
{
"epoch": 0.04,
"grad_norm": 4.568380832672119,
"learning_rate": 9.987563724311898e-05,
"loss": 2.6336,
"step": 77000
},
{
"epoch": 0.04,
"grad_norm": 4.230466365814209,
"learning_rate": 9.987470916284375e-05,
"loss": 2.6513,
"step": 77500
},
{
"epoch": 0.04,
"grad_norm": 7.702442646026611,
"learning_rate": 9.987378108256852e-05,
"loss": 2.6448,
"step": 78000
},
{
"epoch": 0.04,
"grad_norm": 5.38634729385376,
"learning_rate": 9.987285300229329e-05,
"loss": 2.6557,
"step": 78500
},
{
"epoch": 0.04,
"grad_norm": 4.267560005187988,
"learning_rate": 9.987192492201805e-05,
"loss": 2.6387,
"step": 79000
},
{
"epoch": 0.04,
"grad_norm": 4.3129801750183105,
"learning_rate": 9.987099684174282e-05,
"loss": 2.6109,
"step": 79500
},
{
"epoch": 0.04,
"grad_norm": 4.636094570159912,
"learning_rate": 9.98700687614676e-05,
"loss": 2.6198,
"step": 80000
},
{
"epoch": 0.04,
"grad_norm": 3.763615846633911,
"learning_rate": 9.986914068119236e-05,
"loss": 2.611,
"step": 80500
},
{
"epoch": 0.05,
"grad_norm": 4.402085781097412,
"learning_rate": 9.986821260091714e-05,
"loss": 2.6057,
"step": 81000
},
{
"epoch": 0.05,
"grad_norm": 4.437091827392578,
"learning_rate": 9.986728452064191e-05,
"loss": 2.6322,
"step": 81500
},
{
"epoch": 0.05,
"grad_norm": 4.0002923011779785,
"learning_rate": 9.986635644036667e-05,
"loss": 2.5895,
"step": 82000
},
{
"epoch": 0.05,
"grad_norm": 7.929385662078857,
"learning_rate": 9.986542836009144e-05,
"loss": 2.5913,
"step": 82500
},
{
"epoch": 0.05,
"grad_norm": 3.028870105743408,
"learning_rate": 9.986450027981621e-05,
"loss": 2.5852,
"step": 83000
},
{
"epoch": 0.05,
"grad_norm": 4.816354274749756,
"learning_rate": 9.986357219954098e-05,
"loss": 2.612,
"step": 83500
},
{
"epoch": 0.05,
"grad_norm": 3.2722418308258057,
"learning_rate": 9.986264411926575e-05,
"loss": 2.5844,
"step": 84000
},
{
"epoch": 0.05,
"grad_norm": 4.4086713790893555,
"learning_rate": 9.986171603899052e-05,
"loss": 2.5766,
"step": 84500
},
{
"epoch": 0.05,
"grad_norm": 4.107178211212158,
"learning_rate": 9.986078795871528e-05,
"loss": 2.5861,
"step": 85000
},
{
"epoch": 0.05,
"grad_norm": 4.0130133628845215,
"learning_rate": 9.985985987844005e-05,
"loss": 2.5946,
"step": 85500
},
{
"epoch": 0.05,
"grad_norm": 4.053595066070557,
"learning_rate": 9.985893179816482e-05,
"loss": 2.5411,
"step": 86000
},
{
"epoch": 0.05,
"grad_norm": 4.136240482330322,
"learning_rate": 9.98580037178896e-05,
"loss": 2.5791,
"step": 86500
},
{
"epoch": 0.05,
"grad_norm": 6.301804542541504,
"learning_rate": 9.985707563761437e-05,
"loss": 2.5621,
"step": 87000
},
{
"epoch": 0.05,
"grad_norm": 4.376827239990234,
"learning_rate": 9.985614755733912e-05,
"loss": 2.5693,
"step": 87500
},
{
"epoch": 0.05,
"grad_norm": 3.8407466411590576,
"learning_rate": 9.985521947706388e-05,
"loss": 2.5354,
"step": 88000
},
{
"epoch": 0.05,
"grad_norm": 6.222740650177002,
"learning_rate": 9.985429139678865e-05,
"loss": 2.5402,
"step": 88500
},
{
"epoch": 0.05,
"grad_norm": 5.93278169631958,
"learning_rate": 9.985336331651342e-05,
"loss": 2.5285,
"step": 89000
},
{
"epoch": 0.05,
"grad_norm": 2.4788284301757812,
"learning_rate": 9.98524352362382e-05,
"loss": 2.511,
"step": 89500
},
{
"epoch": 0.05,
"grad_norm": 3.4369866847991943,
"learning_rate": 9.985150715596297e-05,
"loss": 2.5223,
"step": 90000
},
{
"epoch": 0.05,
"grad_norm": 5.007632732391357,
"learning_rate": 9.985057907568774e-05,
"loss": 2.5231,
"step": 90500
},
{
"epoch": 0.05,
"grad_norm": 3.440267562866211,
"learning_rate": 9.98496509954125e-05,
"loss": 2.5233,
"step": 91000
},
{
"epoch": 0.05,
"grad_norm": 9.757936477661133,
"learning_rate": 9.984872291513727e-05,
"loss": 2.5099,
"step": 91500
},
{
"epoch": 0.05,
"grad_norm": 4.741192817687988,
"learning_rate": 9.984779483486204e-05,
"loss": 2.5212,
"step": 92000
},
{
"epoch": 0.05,
"grad_norm": 4.7662811279296875,
"learning_rate": 9.984686675458681e-05,
"loss": 2.5023,
"step": 92500
},
{
"epoch": 0.05,
"grad_norm": 4.681964874267578,
"learning_rate": 9.984593867431158e-05,
"loss": 2.5055,
"step": 93000
},
{
"epoch": 0.05,
"grad_norm": 3.7245185375213623,
"learning_rate": 9.984501059403635e-05,
"loss": 2.5083,
"step": 93500
},
{
"epoch": 0.05,
"grad_norm": 5.7924418449401855,
"learning_rate": 9.984408251376111e-05,
"loss": 2.5024,
"step": 94000
},
{
"epoch": 0.05,
"grad_norm": 5.420963764190674,
"learning_rate": 9.984315443348588e-05,
"loss": 2.5053,
"step": 94500
},
{
"epoch": 0.05,
"grad_norm": 4.608907699584961,
"learning_rate": 9.984222635321065e-05,
"loss": 2.4765,
"step": 95000
},
{
"epoch": 0.05,
"grad_norm": 5.017517566680908,
"learning_rate": 9.984129827293543e-05,
"loss": 2.5004,
"step": 95500
},
{
"epoch": 0.05,
"grad_norm": 6.300387859344482,
"learning_rate": 9.98403701926602e-05,
"loss": 2.4943,
"step": 96000
},
{
"epoch": 0.05,
"grad_norm": 6.16803503036499,
"learning_rate": 9.983944211238497e-05,
"loss": 2.5001,
"step": 96500
},
{
"epoch": 0.05,
"grad_norm": 4.007481098175049,
"learning_rate": 9.983851403210973e-05,
"loss": 2.4898,
"step": 97000
},
{
"epoch": 0.05,
"grad_norm": 5.498426914215088,
"learning_rate": 9.983758595183448e-05,
"loss": 2.4874,
"step": 97500
},
{
"epoch": 0.05,
"grad_norm": 4.115726470947266,
"learning_rate": 9.983665787155926e-05,
"loss": 2.4714,
"step": 98000
},
{
"epoch": 0.05,
"grad_norm": 4.724228382110596,
"learning_rate": 9.983572979128403e-05,
"loss": 2.4748,
"step": 98500
},
{
"epoch": 0.06,
"grad_norm": 6.842497825622559,
"learning_rate": 9.98348017110088e-05,
"loss": 2.4761,
"step": 99000
},
{
"epoch": 0.06,
"grad_norm": 4.527170181274414,
"learning_rate": 9.983387363073357e-05,
"loss": 2.462,
"step": 99500
},
{
"epoch": 0.06,
"grad_norm": 4.956340789794922,
"learning_rate": 9.983294555045834e-05,
"loss": 2.4568,
"step": 100000
},
{
"epoch": 0.06,
"grad_norm": 4.177946090698242,
"learning_rate": 9.98320174701831e-05,
"loss": 2.4418,
"step": 100500
},
{
"epoch": 0.06,
"grad_norm": 6.623388767242432,
"learning_rate": 9.983108938990787e-05,
"loss": 2.4467,
"step": 101000
},
{
"epoch": 0.06,
"grad_norm": 6.102182865142822,
"learning_rate": 9.983016130963264e-05,
"loss": 2.444,
"step": 101500
},
{
"epoch": 0.06,
"grad_norm": 4.186117649078369,
"learning_rate": 9.982923322935741e-05,
"loss": 2.4396,
"step": 102000
},
{
"epoch": 0.06,
"grad_norm": 3.916994571685791,
"learning_rate": 9.982830514908218e-05,
"loss": 2.4612,
"step": 102500
},
{
"epoch": 0.06,
"grad_norm": 4.213066577911377,
"learning_rate": 9.982737706880694e-05,
"loss": 2.4244,
"step": 103000
},
{
"epoch": 0.06,
"grad_norm": 4.665497779846191,
"learning_rate": 9.982644898853171e-05,
"loss": 2.4364,
"step": 103500
},
{
"epoch": 0.06,
"grad_norm": 4.808815956115723,
"learning_rate": 9.982552090825648e-05,
"loss": 2.4381,
"step": 104000
},
{
"epoch": 0.06,
"grad_norm": 5.384184837341309,
"learning_rate": 9.982459282798126e-05,
"loss": 2.4219,
"step": 104500
},
{
"epoch": 0.06,
"grad_norm": 8.32588005065918,
"learning_rate": 9.982366474770603e-05,
"loss": 2.4207,
"step": 105000
},
{
"epoch": 0.06,
"grad_norm": 6.564486503601074,
"learning_rate": 9.98227366674308e-05,
"loss": 2.4182,
"step": 105500
},
{
"epoch": 0.06,
"grad_norm": 4.123614311218262,
"learning_rate": 9.982180858715556e-05,
"loss": 2.4214,
"step": 106000
},
{
"epoch": 0.06,
"grad_norm": 5.507079124450684,
"learning_rate": 9.982088050688033e-05,
"loss": 2.4096,
"step": 106500
},
{
"epoch": 0.06,
"grad_norm": 5.728691577911377,
"learning_rate": 9.98199524266051e-05,
"loss": 2.4201,
"step": 107000
},
{
"epoch": 0.06,
"grad_norm": 6.611893177032471,
"learning_rate": 9.981902434632987e-05,
"loss": 2.4021,
"step": 107500
},
{
"epoch": 0.06,
"grad_norm": 5.187854766845703,
"learning_rate": 9.981809626605463e-05,
"loss": 2.3956,
"step": 108000
},
{
"epoch": 0.06,
"grad_norm": 6.5400166511535645,
"learning_rate": 9.98171681857794e-05,
"loss": 2.4168,
"step": 108500
},
{
"epoch": 0.06,
"grad_norm": 4.0128173828125,
"learning_rate": 9.981624010550417e-05,
"loss": 2.4017,
"step": 109000
},
{
"epoch": 0.06,
"grad_norm": 3.7369205951690674,
"learning_rate": 9.981531202522893e-05,
"loss": 2.3895,
"step": 109500
},
{
"epoch": 0.06,
"grad_norm": 3.8515870571136475,
"learning_rate": 9.98143839449537e-05,
"loss": 2.3728,
"step": 110000
},
{
"epoch": 0.06,
"grad_norm": 4.47413969039917,
"learning_rate": 9.981345586467847e-05,
"loss": 2.4067,
"step": 110500
},
{
"epoch": 0.06,
"grad_norm": 4.970263481140137,
"learning_rate": 9.981252778440324e-05,
"loss": 2.3775,
"step": 111000
},
{
"epoch": 0.06,
"grad_norm": 4.086507320404053,
"learning_rate": 9.981159970412802e-05,
"loss": 2.3695,
"step": 111500
},
{
"epoch": 0.06,
"grad_norm": 4.484976768493652,
"learning_rate": 9.981067162385279e-05,
"loss": 2.3774,
"step": 112000
},
{
"epoch": 0.06,
"grad_norm": 4.321993350982666,
"learning_rate": 9.980974354357754e-05,
"loss": 2.3563,
"step": 112500
},
{
"epoch": 0.06,
"grad_norm": 5.6485066413879395,
"learning_rate": 9.980881546330232e-05,
"loss": 2.3601,
"step": 113000
},
{
"epoch": 0.06,
"grad_norm": 6.259582996368408,
"learning_rate": 9.980788738302709e-05,
"loss": 2.3409,
"step": 113500
},
{
"epoch": 0.06,
"grad_norm": 5.368774890899658,
"learning_rate": 9.980695930275186e-05,
"loss": 2.3809,
"step": 114000
},
{
"epoch": 0.06,
"grad_norm": 7.465442657470703,
"learning_rate": 9.980603122247663e-05,
"loss": 2.3439,
"step": 114500
},
{
"epoch": 0.06,
"grad_norm": 10.82461929321289,
"learning_rate": 9.98051031422014e-05,
"loss": 2.3738,
"step": 115000
},
{
"epoch": 0.06,
"grad_norm": 3.924154758453369,
"learning_rate": 9.980417506192616e-05,
"loss": 2.344,
"step": 115500
},
{
"epoch": 0.06,
"grad_norm": 3.8393290042877197,
"learning_rate": 9.980324698165093e-05,
"loss": 2.3423,
"step": 116000
},
{
"epoch": 0.06,
"grad_norm": 3.9444777965545654,
"learning_rate": 9.98023189013757e-05,
"loss": 2.3426,
"step": 116500
},
{
"epoch": 0.07,
"grad_norm": 5.0410566329956055,
"learning_rate": 9.980139082110047e-05,
"loss": 2.3129,
"step": 117000
},
{
"epoch": 0.07,
"grad_norm": 5.183213233947754,
"learning_rate": 9.980046274082524e-05,
"loss": 2.3187,
"step": 117500
},
{
"epoch": 0.07,
"grad_norm": 6.824711322784424,
"learning_rate": 9.979953466055e-05,
"loss": 2.3206,
"step": 118000
},
{
"epoch": 0.07,
"grad_norm": 4.512765884399414,
"learning_rate": 9.979860658027476e-05,
"loss": 2.3176,
"step": 118500
},
{
"epoch": 0.07,
"grad_norm": 6.9497480392456055,
"learning_rate": 9.979767849999953e-05,
"loss": 2.3418,
"step": 119000
},
{
"epoch": 0.07,
"grad_norm": 4.003693580627441,
"learning_rate": 9.97967504197243e-05,
"loss": 2.3193,
"step": 119500
},
{
"epoch": 0.07,
"grad_norm": 6.719389915466309,
"learning_rate": 9.979582233944907e-05,
"loss": 2.3244,
"step": 120000
},
{
"epoch": 0.07,
"grad_norm": 4.46160364151001,
"learning_rate": 9.979489425917385e-05,
"loss": 2.3313,
"step": 120500
},
{
"epoch": 0.07,
"grad_norm": 8.168338775634766,
"learning_rate": 9.979396617889862e-05,
"loss": 2.311,
"step": 121000
},
{
"epoch": 0.07,
"grad_norm": 4.333712100982666,
"learning_rate": 9.979303809862338e-05,
"loss": 2.328,
"step": 121500
},
{
"epoch": 0.07,
"grad_norm": 8.339402198791504,
"learning_rate": 9.979211001834815e-05,
"loss": 2.3118,
"step": 122000
},
{
"epoch": 0.07,
"grad_norm": 4.38870906829834,
"learning_rate": 9.979118193807292e-05,
"loss": 2.3037,
"step": 122500
},
{
"epoch": 0.07,
"grad_norm": 4.003161430358887,
"learning_rate": 9.979025385779769e-05,
"loss": 2.2959,
"step": 123000
},
{
"epoch": 0.07,
"grad_norm": 6.8454389572143555,
"learning_rate": 9.978932577752246e-05,
"loss": 2.3192,
"step": 123500
},
{
"epoch": 0.07,
"grad_norm": 6.631998538970947,
"learning_rate": 9.978839769724723e-05,
"loss": 2.2975,
"step": 124000
},
{
"epoch": 0.07,
"grad_norm": 4.8582963943481445,
"learning_rate": 9.978746961697199e-05,
"loss": 2.2983,
"step": 124500
},
{
"epoch": 0.07,
"grad_norm": 3.7582802772521973,
"learning_rate": 9.978654153669676e-05,
"loss": 2.3105,
"step": 125000
},
{
"epoch": 0.07,
"grad_norm": 4.227505683898926,
"learning_rate": 9.978561345642153e-05,
"loss": 2.2866,
"step": 125500
},
{
"epoch": 0.07,
"grad_norm": 5.702460765838623,
"learning_rate": 9.97846853761463e-05,
"loss": 2.2854,
"step": 126000
},
{
"epoch": 0.07,
"grad_norm": 4.512174129486084,
"learning_rate": 9.978375729587108e-05,
"loss": 2.2962,
"step": 126500
},
{
"epoch": 0.07,
"grad_norm": 4.127579212188721,
"learning_rate": 9.978282921559585e-05,
"loss": 2.3017,
"step": 127000
},
{
"epoch": 0.07,
"grad_norm": 4.7636284828186035,
"learning_rate": 9.97819011353206e-05,
"loss": 2.26,
"step": 127500
},
{
"epoch": 0.07,
"grad_norm": 5.8895697593688965,
"learning_rate": 9.978097305504538e-05,
"loss": 2.2851,
"step": 128000
},
{
"epoch": 0.07,
"grad_norm": 5.10247278213501,
"learning_rate": 9.978004497477013e-05,
"loss": 2.2912,
"step": 128500
},
{
"epoch": 0.07,
"grad_norm": 4.03781795501709,
"learning_rate": 9.97791168944949e-05,
"loss": 2.2631,
"step": 129000
},
{
"epoch": 0.07,
"grad_norm": 6.225619316101074,
"learning_rate": 9.977818881421968e-05,
"loss": 2.2959,
"step": 129500
},
{
"epoch": 0.07,
"grad_norm": 8.12752628326416,
"learning_rate": 9.977726073394445e-05,
"loss": 2.2574,
"step": 130000
},
{
"epoch": 0.07,
"grad_norm": 8.98240852355957,
"learning_rate": 9.977633265366922e-05,
"loss": 2.2765,
"step": 130500
},
{
"epoch": 0.07,
"grad_norm": 6.665409088134766,
"learning_rate": 9.977540457339398e-05,
"loss": 2.2831,
"step": 131000
},
{
"epoch": 0.07,
"grad_norm": 5.666757106781006,
"learning_rate": 9.977447649311875e-05,
"loss": 2.2514,
"step": 131500
},
{
"epoch": 0.07,
"grad_norm": 4.039821147918701,
"learning_rate": 9.977354841284352e-05,
"loss": 2.2553,
"step": 132000
},
{
"epoch": 0.07,
"grad_norm": 4.816211223602295,
"learning_rate": 9.977262033256829e-05,
"loss": 2.2512,
"step": 132500
},
{
"epoch": 0.07,
"grad_norm": 7.379537105560303,
"learning_rate": 9.977169225229306e-05,
"loss": 2.256,
"step": 133000
},
{
"epoch": 0.07,
"grad_norm": 3.727262258529663,
"learning_rate": 9.977076417201782e-05,
"loss": 2.2447,
"step": 133500
},
{
"epoch": 0.07,
"grad_norm": 4.287083625793457,
"learning_rate": 9.976983609174259e-05,
"loss": 2.233,
"step": 134000
},
{
"epoch": 0.07,
"grad_norm": 4.225050926208496,
"learning_rate": 9.976890801146736e-05,
"loss": 2.2474,
"step": 134500
},
{
"epoch": 0.08,
"grad_norm": 5.888236999511719,
"learning_rate": 9.976797993119214e-05,
"loss": 2.2187,
"step": 135000
},
{
"epoch": 0.08,
"grad_norm": 5.869006156921387,
"learning_rate": 9.97670518509169e-05,
"loss": 2.2442,
"step": 135500
},
{
"epoch": 0.08,
"grad_norm": 6.458480358123779,
"learning_rate": 9.976612377064168e-05,
"loss": 2.2501,
"step": 136000
},
{
"epoch": 0.08,
"grad_norm": 3.7204341888427734,
"learning_rate": 9.976519569036644e-05,
"loss": 2.2464,
"step": 136500
},
{
"epoch": 0.08,
"grad_norm": 4.325768947601318,
"learning_rate": 9.976426761009121e-05,
"loss": 2.2421,
"step": 137000
},
{
"epoch": 0.08,
"grad_norm": 4.79429292678833,
"learning_rate": 9.976333952981598e-05,
"loss": 2.2466,
"step": 137500
},
{
"epoch": 0.08,
"grad_norm": 8.317536354064941,
"learning_rate": 9.976241144954075e-05,
"loss": 2.2169,
"step": 138000
},
{
"epoch": 0.08,
"grad_norm": 5.129164695739746,
"learning_rate": 9.976148336926552e-05,
"loss": 2.2188,
"step": 138500
},
{
"epoch": 0.08,
"grad_norm": 3.7128493785858154,
"learning_rate": 9.976055528899028e-05,
"loss": 2.2372,
"step": 139000
},
{
"epoch": 0.08,
"grad_norm": 4.394794940948486,
"learning_rate": 9.975962720871505e-05,
"loss": 2.2277,
"step": 139500
},
{
"epoch": 0.08,
"grad_norm": 6.397000789642334,
"learning_rate": 9.975869912843981e-05,
"loss": 2.2509,
"step": 140000
},
{
"epoch": 0.08,
"grad_norm": 5.2059526443481445,
"learning_rate": 9.975777104816458e-05,
"loss": 2.2082,
"step": 140500
},
{
"epoch": 0.08,
"grad_norm": 3.5649585723876953,
"learning_rate": 9.975684296788935e-05,
"loss": 2.2383,
"step": 141000
},
{
"epoch": 0.08,
"grad_norm": 4.717801094055176,
"learning_rate": 9.975591488761412e-05,
"loss": 2.1973,
"step": 141500
},
{
"epoch": 0.08,
"grad_norm": 4.133371829986572,
"learning_rate": 9.97549868073389e-05,
"loss": 2.2214,
"step": 142000
},
{
"epoch": 0.08,
"grad_norm": 5.321709156036377,
"learning_rate": 9.975405872706367e-05,
"loss": 2.2012,
"step": 142500
},
{
"epoch": 0.08,
"grad_norm": 5.209156513214111,
"learning_rate": 9.975313064678842e-05,
"loss": 2.1892,
"step": 143000
},
{
"epoch": 0.08,
"grad_norm": 5.405091285705566,
"learning_rate": 9.97522025665132e-05,
"loss": 2.217,
"step": 143500
},
{
"epoch": 0.08,
"grad_norm": 5.35112190246582,
"learning_rate": 9.975127448623797e-05,
"loss": 2.2128,
"step": 144000
},
{
"epoch": 0.08,
"grad_norm": 4.242053985595703,
"learning_rate": 9.975034640596274e-05,
"loss": 2.191,
"step": 144500
},
{
"epoch": 0.08,
"grad_norm": 4.181004047393799,
"learning_rate": 9.974941832568751e-05,
"loss": 2.1965,
"step": 145000
},
{
"epoch": 0.08,
"grad_norm": 3.5158121585845947,
"learning_rate": 9.974849024541228e-05,
"loss": 2.1692,
"step": 145500
},
{
"epoch": 0.08,
"grad_norm": 4.8536577224731445,
"learning_rate": 9.974756216513704e-05,
"loss": 2.1861,
"step": 146000
},
{
"epoch": 0.08,
"grad_norm": 5.284401893615723,
"learning_rate": 9.974663408486181e-05,
"loss": 2.1801,
"step": 146500
},
{
"epoch": 0.08,
"grad_norm": 4.611184597015381,
"learning_rate": 9.974570600458658e-05,
"loss": 2.1879,
"step": 147000
},
{
"epoch": 0.08,
"grad_norm": 4.935207843780518,
"learning_rate": 9.974477792431135e-05,
"loss": 2.1608,
"step": 147500
},
{
"epoch": 0.08,
"grad_norm": 4.852113246917725,
"learning_rate": 9.974384984403612e-05,
"loss": 2.1788,
"step": 148000
},
{
"epoch": 0.08,
"grad_norm": 5.22275972366333,
"learning_rate": 9.974292176376088e-05,
"loss": 2.1585,
"step": 148500
},
{
"epoch": 0.08,
"grad_norm": 5.024623394012451,
"learning_rate": 9.974199368348564e-05,
"loss": 2.1757,
"step": 149000
},
{
"epoch": 0.08,
"grad_norm": 5.230965614318848,
"learning_rate": 9.974106560321041e-05,
"loss": 2.1685,
"step": 149500
},
{
"epoch": 0.08,
"grad_norm": 4.0713090896606445,
"learning_rate": 9.974013752293518e-05,
"loss": 2.1586,
"step": 150000
},
{
"epoch": 0.08,
"grad_norm": 5.00492000579834,
"learning_rate": 9.973920944265995e-05,
"loss": 2.1538,
"step": 150500
},
{
"epoch": 0.08,
"grad_norm": 3.9473533630371094,
"learning_rate": 9.973828136238473e-05,
"loss": 2.1585,
"step": 151000
},
{
"epoch": 0.08,
"grad_norm": 4.452467918395996,
"learning_rate": 9.97373532821095e-05,
"loss": 2.1579,
"step": 151500
},
{
"epoch": 0.08,
"grad_norm": 4.7615647315979,
"learning_rate": 9.973642520183425e-05,
"loss": 2.1609,
"step": 152000
},
{
"epoch": 0.08,
"grad_norm": 4.646842956542969,
"learning_rate": 9.973549712155903e-05,
"loss": 2.1762,
"step": 152500
},
{
"epoch": 0.09,
"grad_norm": 5.9310526847839355,
"learning_rate": 9.97345690412838e-05,
"loss": 2.1578,
"step": 153000
},
{
"epoch": 0.09,
"grad_norm": 3.432331085205078,
"learning_rate": 9.973364096100857e-05,
"loss": 2.1606,
"step": 153500
},
{
"epoch": 0.09,
"grad_norm": 3.7542684078216553,
"learning_rate": 9.973271288073334e-05,
"loss": 2.1403,
"step": 154000
},
{
"epoch": 0.09,
"grad_norm": 5.2377400398254395,
"learning_rate": 9.973178480045811e-05,
"loss": 2.154,
"step": 154500
},
{
"epoch": 0.09,
"grad_norm": 4.928728103637695,
"learning_rate": 9.973085672018287e-05,
"loss": 2.1478,
"step": 155000
},
{
"epoch": 0.09,
"grad_norm": 7.469805717468262,
"learning_rate": 9.972992863990764e-05,
"loss": 2.1443,
"step": 155500
},
{
"epoch": 0.09,
"grad_norm": 5.656938076019287,
"learning_rate": 9.972900055963241e-05,
"loss": 2.1472,
"step": 156000
},
{
"epoch": 0.09,
"grad_norm": 6.054020404815674,
"learning_rate": 9.972807247935718e-05,
"loss": 2.1766,
"step": 156500
},
{
"epoch": 0.09,
"grad_norm": 4.4340925216674805,
"learning_rate": 9.972714439908195e-05,
"loss": 2.1198,
"step": 157000
},
{
"epoch": 0.09,
"grad_norm": 5.291192531585693,
"learning_rate": 9.972621631880673e-05,
"loss": 2.1446,
"step": 157500
},
{
"epoch": 0.09,
"grad_norm": 5.266712188720703,
"learning_rate": 9.972528823853148e-05,
"loss": 2.1402,
"step": 158000
},
{
"epoch": 0.09,
"grad_norm": 7.1873250007629395,
"learning_rate": 9.972436015825626e-05,
"loss": 2.1163,
"step": 158500
},
{
"epoch": 0.09,
"grad_norm": 8.807872772216797,
"learning_rate": 9.972343207798103e-05,
"loss": 2.087,
"step": 159000
},
{
"epoch": 0.09,
"grad_norm": 4.034912109375,
"learning_rate": 9.972250399770578e-05,
"loss": 2.1372,
"step": 159500
},
{
"epoch": 0.09,
"grad_norm": 4.9043755531311035,
"learning_rate": 9.972157591743056e-05,
"loss": 2.1184,
"step": 160000
},
{
"epoch": 0.09,
"grad_norm": 4.8470458984375,
"learning_rate": 9.972064783715533e-05,
"loss": 2.1304,
"step": 160500
},
{
"epoch": 0.09,
"grad_norm": 4.68010139465332,
"learning_rate": 9.97197197568801e-05,
"loss": 2.1093,
"step": 161000
},
{
"epoch": 0.09,
"grad_norm": 4.731649398803711,
"learning_rate": 9.971879167660486e-05,
"loss": 2.1178,
"step": 161500
},
{
"epoch": 0.09,
"grad_norm": 4.586087226867676,
"learning_rate": 9.971786359632963e-05,
"loss": 2.1154,
"step": 162000
},
{
"epoch": 0.09,
"grad_norm": 4.602361679077148,
"learning_rate": 9.97169355160544e-05,
"loss": 2.0925,
"step": 162500
},
{
"epoch": 0.09,
"grad_norm": 5.405307292938232,
"learning_rate": 9.971600743577917e-05,
"loss": 2.1131,
"step": 163000
},
{
"epoch": 0.09,
"grad_norm": 5.402783393859863,
"learning_rate": 9.971507935550394e-05,
"loss": 2.1112,
"step": 163500
},
{
"epoch": 0.09,
"grad_norm": 3.7548913955688477,
"learning_rate": 9.97141512752287e-05,
"loss": 2.0885,
"step": 164000
},
{
"epoch": 0.09,
"grad_norm": 5.215375900268555,
"learning_rate": 9.971322319495347e-05,
"loss": 2.1016,
"step": 164500
},
{
"epoch": 0.09,
"grad_norm": 5.195767402648926,
"learning_rate": 9.971229511467824e-05,
"loss": 2.0821,
"step": 165000
},
{
"epoch": 0.09,
"grad_norm": 5.254913806915283,
"learning_rate": 9.971136703440301e-05,
"loss": 2.0876,
"step": 165500
},
{
"epoch": 0.09,
"grad_norm": 4.161681652069092,
"learning_rate": 9.971043895412779e-05,
"loss": 2.0771,
"step": 166000
},
{
"epoch": 0.09,
"grad_norm": 7.586195468902588,
"learning_rate": 9.970951087385256e-05,
"loss": 2.0893,
"step": 166500
},
{
"epoch": 0.09,
"grad_norm": 4.742598533630371,
"learning_rate": 9.970858279357731e-05,
"loss": 2.1043,
"step": 167000
},
{
"epoch": 0.09,
"grad_norm": 7.583818435668945,
"learning_rate": 9.970765471330209e-05,
"loss": 2.0643,
"step": 167500
},
{
"epoch": 0.09,
"grad_norm": 15.07123851776123,
"learning_rate": 9.970672663302686e-05,
"loss": 2.0719,
"step": 168000
},
{
"epoch": 0.09,
"grad_norm": 4.9100565910339355,
"learning_rate": 9.970579855275163e-05,
"loss": 2.0645,
"step": 168500
},
{
"epoch": 0.09,
"grad_norm": 4.305502891540527,
"learning_rate": 9.97048704724764e-05,
"loss": 2.0661,
"step": 169000
},
{
"epoch": 0.09,
"grad_norm": 4.631031513214111,
"learning_rate": 9.970394239220116e-05,
"loss": 2.0729,
"step": 169500
},
{
"epoch": 0.09,
"grad_norm": 4.102370738983154,
"learning_rate": 9.970301431192593e-05,
"loss": 2.0621,
"step": 170000
},
{
"epoch": 0.09,
"grad_norm": 9.088913917541504,
"learning_rate": 9.970208623165069e-05,
"loss": 2.0566,
"step": 170500
},
{
"epoch": 0.1,
"grad_norm": 3.1607444286346436,
"learning_rate": 9.970115815137546e-05,
"loss": 2.053,
"step": 171000
},
{
"epoch": 0.1,
"grad_norm": 5.192753791809082,
"learning_rate": 9.970023007110023e-05,
"loss": 2.0443,
"step": 171500
},
{
"epoch": 0.1,
"grad_norm": 5.527139186859131,
"learning_rate": 9.9699301990825e-05,
"loss": 2.0356,
"step": 172000
},
{
"epoch": 0.1,
"grad_norm": 4.856675148010254,
"learning_rate": 9.969837391054977e-05,
"loss": 2.0405,
"step": 172500
},
{
"epoch": 0.1,
"grad_norm": 6.9087724685668945,
"learning_rate": 9.969744583027454e-05,
"loss": 2.0544,
"step": 173000
},
{
"epoch": 0.1,
"grad_norm": 5.108808517456055,
"learning_rate": 9.96965177499993e-05,
"loss": 2.0277,
"step": 173500
},
{
"epoch": 0.1,
"grad_norm": 3.581963300704956,
"learning_rate": 9.969558966972407e-05,
"loss": 2.0569,
"step": 174000
},
{
"epoch": 0.1,
"grad_norm": 2.8868958950042725,
"learning_rate": 9.969466158944885e-05,
"loss": 2.0569,
"step": 174500
},
{
"epoch": 0.1,
"grad_norm": 4.1303510665893555,
"learning_rate": 9.969373350917362e-05,
"loss": 2.0378,
"step": 175000
},
{
"epoch": 0.1,
"grad_norm": 3.9498367309570312,
"learning_rate": 9.969280542889839e-05,
"loss": 2.0487,
"step": 175500
},
{
"epoch": 0.1,
"grad_norm": 4.58161735534668,
"learning_rate": 9.969187734862316e-05,
"loss": 2.0113,
"step": 176000
},
{
"epoch": 0.1,
"grad_norm": 4.350900650024414,
"learning_rate": 9.969094926834792e-05,
"loss": 2.0371,
"step": 176500
},
{
"epoch": 0.1,
"grad_norm": 4.318021297454834,
"learning_rate": 9.969002118807269e-05,
"loss": 2.0379,
"step": 177000
},
{
"epoch": 0.1,
"grad_norm": 5.780760288238525,
"learning_rate": 9.968909310779746e-05,
"loss": 2.0117,
"step": 177500
},
{
"epoch": 0.1,
"grad_norm": 4.759991645812988,
"learning_rate": 9.968816502752223e-05,
"loss": 2.034,
"step": 178000
},
{
"epoch": 0.1,
"grad_norm": 3.5731894969940186,
"learning_rate": 9.9687236947247e-05,
"loss": 2.0386,
"step": 178500
},
{
"epoch": 0.1,
"grad_norm": 5.845817565917969,
"learning_rate": 9.968630886697176e-05,
"loss": 2.0078,
"step": 179000
},
{
"epoch": 0.1,
"grad_norm": 4.691517353057861,
"learning_rate": 9.968538078669653e-05,
"loss": 2.0223,
"step": 179500
},
{
"epoch": 0.1,
"grad_norm": 3.678426742553711,
"learning_rate": 9.968445270642129e-05,
"loss": 2.0235,
"step": 180000
},
{
"epoch": 0.1,
"grad_norm": 4.184782981872559,
"learning_rate": 9.968352462614606e-05,
"loss": 2.0036,
"step": 180500
},
{
"epoch": 0.1,
"grad_norm": 5.912731647491455,
"learning_rate": 9.968259654587083e-05,
"loss": 2.016,
"step": 181000
},
{
"epoch": 0.1,
"grad_norm": 5.004282474517822,
"learning_rate": 9.96816684655956e-05,
"loss": 1.9935,
"step": 181500
},
{
"epoch": 0.1,
"grad_norm": 4.136642932891846,
"learning_rate": 9.968074038532038e-05,
"loss": 2.0201,
"step": 182000
},
{
"epoch": 0.1,
"grad_norm": 4.304795742034912,
"learning_rate": 9.967981230504513e-05,
"loss": 2.0064,
"step": 182500
},
{
"epoch": 0.1,
"grad_norm": 4.177530765533447,
"learning_rate": 9.96788842247699e-05,
"loss": 2.0087,
"step": 183000
},
{
"epoch": 0.1,
"grad_norm": 3.730872869491577,
"learning_rate": 9.967795614449468e-05,
"loss": 2.0008,
"step": 183500
},
{
"epoch": 0.1,
"grad_norm": 4.738286972045898,
"learning_rate": 9.967702806421945e-05,
"loss": 1.9961,
"step": 184000
},
{
"epoch": 0.1,
"grad_norm": 4.57652473449707,
"learning_rate": 9.967609998394422e-05,
"loss": 1.9861,
"step": 184500
},
{
"epoch": 0.1,
"grad_norm": 3.8996493816375732,
"learning_rate": 9.967517190366899e-05,
"loss": 1.9821,
"step": 185000
},
{
"epoch": 0.1,
"grad_norm": 5.269779205322266,
"learning_rate": 9.967424382339375e-05,
"loss": 1.9761,
"step": 185500
},
{
"epoch": 0.1,
"grad_norm": 3.3818140029907227,
"learning_rate": 9.967331574311852e-05,
"loss": 1.9679,
"step": 186000
},
{
"epoch": 0.1,
"grad_norm": 3.7173426151275635,
"learning_rate": 9.967238766284329e-05,
"loss": 1.9992,
"step": 186500
},
{
"epoch": 0.1,
"grad_norm": 3.1090087890625,
"learning_rate": 9.967145958256806e-05,
"loss": 1.9782,
"step": 187000
},
{
"epoch": 0.1,
"grad_norm": 4.101827621459961,
"learning_rate": 9.967053150229283e-05,
"loss": 1.9704,
"step": 187500
},
{
"epoch": 0.1,
"grad_norm": 3.682985782623291,
"learning_rate": 9.96696034220176e-05,
"loss": 2.0032,
"step": 188000
},
{
"epoch": 0.1,
"grad_norm": 19.02726173400879,
"learning_rate": 9.966867534174236e-05,
"loss": 1.9946,
"step": 188500
},
{
"epoch": 0.11,
"grad_norm": 4.69189453125,
"learning_rate": 9.966774726146713e-05,
"loss": 1.9245,
"step": 189000
},
{
"epoch": 0.11,
"grad_norm": 12.00914192199707,
"learning_rate": 9.96668191811919e-05,
"loss": 1.9725,
"step": 189500
},
{
"epoch": 0.11,
"grad_norm": 6.565895080566406,
"learning_rate": 9.966589110091668e-05,
"loss": 1.982,
"step": 190000
},
{
"epoch": 0.11,
"grad_norm": 4.286581993103027,
"learning_rate": 9.966496302064143e-05,
"loss": 1.9978,
"step": 190500
},
{
"epoch": 0.11,
"grad_norm": 5.0877509117126465,
"learning_rate": 9.96640349403662e-05,
"loss": 1.9735,
"step": 191000
},
{
"epoch": 0.11,
"grad_norm": 4.9631242752075195,
"learning_rate": 9.966310686009098e-05,
"loss": 1.9508,
"step": 191500
},
{
"epoch": 0.11,
"grad_norm": 4.4933552742004395,
"learning_rate": 9.966217877981574e-05,
"loss": 1.962,
"step": 192000
},
{
"epoch": 0.11,
"grad_norm": 4.3112874031066895,
"learning_rate": 9.966125069954051e-05,
"loss": 1.9566,
"step": 192500
},
{
"epoch": 0.11,
"grad_norm": 4.248539924621582,
"learning_rate": 9.966032261926528e-05,
"loss": 1.9379,
"step": 193000
},
{
"epoch": 0.11,
"grad_norm": 3.26794171333313,
"learning_rate": 9.965939453899005e-05,
"loss": 1.9297,
"step": 193500
},
{
"epoch": 0.11,
"grad_norm": 3.2197368144989014,
"learning_rate": 9.965846645871482e-05,
"loss": 1.9494,
"step": 194000
},
{
"epoch": 0.11,
"grad_norm": 4.919404983520508,
"learning_rate": 9.965753837843958e-05,
"loss": 1.9529,
"step": 194500
},
{
"epoch": 0.11,
"grad_norm": 4.171731948852539,
"learning_rate": 9.965661029816435e-05,
"loss": 1.9323,
"step": 195000
},
{
"epoch": 0.11,
"grad_norm": 4.448825836181641,
"learning_rate": 9.965568221788912e-05,
"loss": 1.9484,
"step": 195500
},
{
"epoch": 0.11,
"grad_norm": 5.62270975112915,
"learning_rate": 9.965475413761389e-05,
"loss": 1.9269,
"step": 196000
},
{
"epoch": 0.11,
"grad_norm": 6.439905166625977,
"learning_rate": 9.965382605733866e-05,
"loss": 1.9193,
"step": 196500
},
{
"epoch": 0.11,
"grad_norm": 4.133171081542969,
"learning_rate": 9.965289797706344e-05,
"loss": 1.9307,
"step": 197000
},
{
"epoch": 0.11,
"grad_norm": 5.984546184539795,
"learning_rate": 9.96519698967882e-05,
"loss": 1.9042,
"step": 197500
},
{
"epoch": 0.11,
"grad_norm": 4.515778064727783,
"learning_rate": 9.965104181651297e-05,
"loss": 1.9541,
"step": 198000
},
{
"epoch": 0.11,
"grad_norm": 3.409749984741211,
"learning_rate": 9.965011373623774e-05,
"loss": 1.9203,
"step": 198500
},
{
"epoch": 0.11,
"grad_norm": 3.724917411804199,
"learning_rate": 9.964918565596251e-05,
"loss": 1.9262,
"step": 199000
},
{
"epoch": 0.11,
"grad_norm": 3.5863378047943115,
"learning_rate": 9.964825757568728e-05,
"loss": 1.9383,
"step": 199500
},
{
"epoch": 0.11,
"grad_norm": 6.107095718383789,
"learning_rate": 9.964732949541205e-05,
"loss": 1.9263,
"step": 200000
},
{
"epoch": 0.11,
"grad_norm": 3.91813588142395,
"learning_rate": 9.964640141513681e-05,
"loss": 1.9217,
"step": 200500
},
{
"epoch": 0.11,
"grad_norm": 4.080438137054443,
"learning_rate": 9.964547333486157e-05,
"loss": 1.9089,
"step": 201000
},
{
"epoch": 0.11,
"grad_norm": 3.550732374191284,
"learning_rate": 9.964454525458634e-05,
"loss": 1.9029,
"step": 201500
},
{
"epoch": 0.11,
"grad_norm": 6.719958305358887,
"learning_rate": 9.964361717431111e-05,
"loss": 1.9201,
"step": 202000
},
{
"epoch": 0.11,
"grad_norm": 4.790652751922607,
"learning_rate": 9.964268909403588e-05,
"loss": 1.8942,
"step": 202500
},
{
"epoch": 0.11,
"grad_norm": 5.303153991699219,
"learning_rate": 9.964176101376065e-05,
"loss": 1.8945,
"step": 203000
},
{
"epoch": 0.11,
"grad_norm": 6.912900924682617,
"learning_rate": 9.964083293348542e-05,
"loss": 1.9146,
"step": 203500
},
{
"epoch": 0.11,
"grad_norm": 4.400740623474121,
"learning_rate": 9.963990485321018e-05,
"loss": 1.8796,
"step": 204000
},
{
"epoch": 0.11,
"grad_norm": 5.205254554748535,
"learning_rate": 9.963897677293495e-05,
"loss": 1.8949,
"step": 204500
},
{
"epoch": 0.11,
"grad_norm": 5.543479919433594,
"learning_rate": 9.963804869265972e-05,
"loss": 1.8846,
"step": 205000
},
{
"epoch": 0.11,
"grad_norm": 8.72082233428955,
"learning_rate": 9.96371206123845e-05,
"loss": 1.8709,
"step": 205500
},
{
"epoch": 0.11,
"grad_norm": 6.174784183502197,
"learning_rate": 9.963619253210927e-05,
"loss": 1.9181,
"step": 206000
},
{
"epoch": 0.11,
"grad_norm": 5.813675880432129,
"learning_rate": 9.963526445183404e-05,
"loss": 1.8797,
"step": 206500
},
{
"epoch": 0.12,
"grad_norm": 5.238151550292969,
"learning_rate": 9.96343363715588e-05,
"loss": 1.8747,
"step": 207000
},
{
"epoch": 0.12,
"grad_norm": 3.6273601055145264,
"learning_rate": 9.963340829128357e-05,
"loss": 1.8942,
"step": 207500
},
{
"epoch": 0.12,
"grad_norm": 6.024681091308594,
"learning_rate": 9.963248021100834e-05,
"loss": 1.8974,
"step": 208000
},
{
"epoch": 0.12,
"grad_norm": 6.6017746925354,
"learning_rate": 9.963155213073311e-05,
"loss": 1.8878,
"step": 208500
},
{
"epoch": 0.12,
"grad_norm": 4.99990177154541,
"learning_rate": 9.963062405045788e-05,
"loss": 1.8741,
"step": 209000
},
{
"epoch": 0.12,
"grad_norm": 4.064300060272217,
"learning_rate": 9.962969597018264e-05,
"loss": 1.8638,
"step": 209500
},
{
"epoch": 0.12,
"grad_norm": 4.6583685874938965,
"learning_rate": 9.962876788990741e-05,
"loss": 1.8723,
"step": 210000
},
{
"epoch": 0.12,
"grad_norm": 4.108057498931885,
"learning_rate": 9.962783980963218e-05,
"loss": 1.8643,
"step": 210500
},
{
"epoch": 0.12,
"grad_norm": 4.108830451965332,
"learning_rate": 9.962691172935694e-05,
"loss": 1.866,
"step": 211000
},
{
"epoch": 0.12,
"grad_norm": 4.949222564697266,
"learning_rate": 9.962598364908171e-05,
"loss": 1.8723,
"step": 211500
},
{
"epoch": 0.12,
"grad_norm": 3.929126501083374,
"learning_rate": 9.962505556880648e-05,
"loss": 1.85,
"step": 212000
},
{
"epoch": 0.12,
"grad_norm": 4.211127281188965,
"learning_rate": 9.962412748853125e-05,
"loss": 1.8463,
"step": 212500
},
{
"epoch": 0.12,
"grad_norm": 4.776076316833496,
"learning_rate": 9.962319940825601e-05,
"loss": 1.869,
"step": 213000
},
{
"epoch": 0.12,
"grad_norm": 4.429445743560791,
"learning_rate": 9.962227132798078e-05,
"loss": 1.8667,
"step": 213500
},
{
"epoch": 0.12,
"grad_norm": 3.9247748851776123,
"learning_rate": 9.962134324770555e-05,
"loss": 1.8632,
"step": 214000
},
{
"epoch": 0.12,
"grad_norm": 4.082817554473877,
"learning_rate": 9.962041516743033e-05,
"loss": 1.8438,
"step": 214500
},
{
"epoch": 0.12,
"grad_norm": 5.592115879058838,
"learning_rate": 9.96194870871551e-05,
"loss": 1.8395,
"step": 215000
},
{
"epoch": 0.12,
"grad_norm": 3.4248578548431396,
"learning_rate": 9.961855900687987e-05,
"loss": 1.8423,
"step": 215500
},
{
"epoch": 0.12,
"grad_norm": 3.3221466541290283,
"learning_rate": 9.961763092660463e-05,
"loss": 1.8258,
"step": 216000
},
{
"epoch": 0.12,
"grad_norm": 3.8490660190582275,
"learning_rate": 9.96167028463294e-05,
"loss": 1.8574,
"step": 216500
},
{
"epoch": 0.12,
"grad_norm": 5.5256547927856445,
"learning_rate": 9.961577476605417e-05,
"loss": 1.8312,
"step": 217000
},
{
"epoch": 0.12,
"grad_norm": 3.6203320026397705,
"learning_rate": 9.961484668577894e-05,
"loss": 1.8483,
"step": 217500
},
{
"epoch": 0.12,
"grad_norm": 4.3675856590271,
"learning_rate": 9.961391860550371e-05,
"loss": 1.8526,
"step": 218000
},
{
"epoch": 0.12,
"grad_norm": 4.40755558013916,
"learning_rate": 9.961299052522848e-05,
"loss": 1.8617,
"step": 218500
},
{
"epoch": 0.12,
"grad_norm": 12.9111967086792,
"learning_rate": 9.961206244495324e-05,
"loss": 1.8488,
"step": 219000
},
{
"epoch": 0.12,
"grad_norm": 4.0077009201049805,
"learning_rate": 9.961113436467801e-05,
"loss": 1.8336,
"step": 219500
},
{
"epoch": 0.12,
"grad_norm": 16.747453689575195,
"learning_rate": 9.961020628440278e-05,
"loss": 1.8535,
"step": 220000
},
{
"epoch": 0.12,
"grad_norm": 5.349842071533203,
"learning_rate": 9.960927820412756e-05,
"loss": 1.8272,
"step": 220500
},
{
"epoch": 0.12,
"grad_norm": 4.0585808753967285,
"learning_rate": 9.960835012385231e-05,
"loss": 1.8361,
"step": 221000
},
{
"epoch": 0.12,
"grad_norm": 10.322525978088379,
"learning_rate": 9.960742204357709e-05,
"loss": 1.8171,
"step": 221500
},
{
"epoch": 0.12,
"grad_norm": 3.9094197750091553,
"learning_rate": 9.960649396330186e-05,
"loss": 1.8434,
"step": 222000
},
{
"epoch": 0.12,
"grad_norm": 4.250068187713623,
"learning_rate": 9.960556588302661e-05,
"loss": 1.8312,
"step": 222500
},
{
"epoch": 0.12,
"grad_norm": 3.535179853439331,
"learning_rate": 9.960463780275139e-05,
"loss": 1.8263,
"step": 223000
},
{
"epoch": 0.12,
"grad_norm": 3.5814437866210938,
"learning_rate": 9.960370972247616e-05,
"loss": 1.833,
"step": 223500
},
{
"epoch": 0.12,
"grad_norm": 5.914824962615967,
"learning_rate": 9.960278164220093e-05,
"loss": 1.8196,
"step": 224000
},
{
"epoch": 0.12,
"grad_norm": 3.966111660003662,
"learning_rate": 9.96018535619257e-05,
"loss": 1.8219,
"step": 224500
},
{
"epoch": 0.13,
"grad_norm": 3.899972915649414,
"learning_rate": 9.960092548165046e-05,
"loss": 1.8173,
"step": 225000
},
{
"epoch": 0.13,
"grad_norm": 3.0840604305267334,
"learning_rate": 9.959999740137523e-05,
"loss": 1.8036,
"step": 225500
},
{
"epoch": 0.13,
"grad_norm": 4.453856468200684,
"learning_rate": 9.95990693211e-05,
"loss": 1.839,
"step": 226000
},
{
"epoch": 0.13,
"grad_norm": 5.8350443840026855,
"learning_rate": 9.959814124082477e-05,
"loss": 1.7956,
"step": 226500
},
{
"epoch": 0.13,
"grad_norm": 3.109984874725342,
"learning_rate": 9.959721316054954e-05,
"loss": 1.8198,
"step": 227000
},
{
"epoch": 0.13,
"grad_norm": 5.034292221069336,
"learning_rate": 9.959628508027431e-05,
"loss": 1.7978,
"step": 227500
},
{
"epoch": 0.13,
"grad_norm": 5.237682819366455,
"learning_rate": 9.959535699999907e-05,
"loss": 1.8309,
"step": 228000
},
{
"epoch": 0.13,
"grad_norm": 3.0368969440460205,
"learning_rate": 9.959442891972384e-05,
"loss": 1.8012,
"step": 228500
},
{
"epoch": 0.13,
"grad_norm": 3.3586440086364746,
"learning_rate": 9.959350083944862e-05,
"loss": 1.7965,
"step": 229000
},
{
"epoch": 0.13,
"grad_norm": 4.194300651550293,
"learning_rate": 9.959257275917339e-05,
"loss": 1.8158,
"step": 229500
},
{
"epoch": 0.13,
"grad_norm": 5.295820713043213,
"learning_rate": 9.959164467889816e-05,
"loss": 1.8035,
"step": 230000
},
{
"epoch": 0.13,
"grad_norm": 3.1233255863189697,
"learning_rate": 9.959071659862293e-05,
"loss": 1.8076,
"step": 230500
},
{
"epoch": 0.13,
"grad_norm": 4.108482837677002,
"learning_rate": 9.958978851834769e-05,
"loss": 1.8169,
"step": 231000
},
{
"epoch": 0.13,
"grad_norm": 3.3223822116851807,
"learning_rate": 9.958886043807245e-05,
"loss": 1.7914,
"step": 231500
},
{
"epoch": 0.13,
"grad_norm": 3.5875344276428223,
"learning_rate": 9.958793235779722e-05,
"loss": 1.8124,
"step": 232000
},
{
"epoch": 0.13,
"grad_norm": 3.7725822925567627,
"learning_rate": 9.958700427752199e-05,
"loss": 1.7907,
"step": 232500
},
{
"epoch": 0.13,
"grad_norm": 3.5657663345336914,
"learning_rate": 9.958607619724676e-05,
"loss": 1.795,
"step": 233000
},
{
"epoch": 0.13,
"grad_norm": 4.3107404708862305,
"learning_rate": 9.958514811697153e-05,
"loss": 1.7687,
"step": 233500
},
{
"epoch": 0.13,
"grad_norm": 4.887343883514404,
"learning_rate": 9.95842200366963e-05,
"loss": 1.7831,
"step": 234000
},
{
"epoch": 0.13,
"grad_norm": 3.7076539993286133,
"learning_rate": 9.958329195642106e-05,
"loss": 1.7842,
"step": 234500
},
{
"epoch": 0.13,
"grad_norm": 3.3431081771850586,
"learning_rate": 9.958236387614583e-05,
"loss": 1.797,
"step": 235000
},
{
"epoch": 0.13,
"grad_norm": 2.8526740074157715,
"learning_rate": 9.95814357958706e-05,
"loss": 1.7879,
"step": 235500
},
{
"epoch": 0.13,
"grad_norm": 4.57257604598999,
"learning_rate": 9.958050771559537e-05,
"loss": 1.7924,
"step": 236000
},
{
"epoch": 0.13,
"grad_norm": 8.872370719909668,
"learning_rate": 9.957957963532015e-05,
"loss": 1.796,
"step": 236500
},
{
"epoch": 0.13,
"grad_norm": 3.647901773452759,
"learning_rate": 9.957865155504492e-05,
"loss": 1.7801,
"step": 237000
}
],
"logging_steps": 500,
"max_steps": 53884650,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 10,
"total_flos": 1.6575934896869376e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
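
The file above is in the standard Hugging Face Trainer state format: each entry in "log_history" records the training loss, learning rate, and gradient norm at a given "global_step", logged every "logging_steps" (500) steps. As a minimal sketch of how the data structure can be consumed (assuming the file is saved locally as trainer_state.json and matplotlib is available; both are assumptions, not part of the upload), the loss curve can be plotted like this:

import json

import matplotlib.pyplot as plt

# Load the trainer state; the path is an assumption, adjust as needed.
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any, would not).
history = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("INDIC-TAMIL training loss")
plt.show()

The same pattern works for "learning_rate" or "grad_norm" by swapping the key read from each entry.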