{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13198192806300124, "eval_steps": 500, "global_step": 237060, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.6396658420562744, "learning_rate": 5e-06, "loss": 11.2923, "step": 500 }, { "epoch": 0.0, "grad_norm": 3.1433205604553223, "learning_rate": 1e-05, "loss": 10.5195, "step": 1000 }, { "epoch": 0.0, "grad_norm": 2.825376510620117, "learning_rate": 1.5e-05, "loss": 9.4381, "step": 1500 }, { "epoch": 0.0, "grad_norm": 6.385465621948242, "learning_rate": 2e-05, "loss": 7.8175, "step": 2000 }, { "epoch": 0.0, "grad_norm": 2.1213717460632324, "learning_rate": 2.5e-05, "loss": 5.8245, "step": 2500 }, { "epoch": 0.0, "grad_norm": 1.302095890045166, "learning_rate": 3e-05, "loss": 4.455, "step": 3000 }, { "epoch": 0.0, "grad_norm": 3.764803886413574, "learning_rate": 3.5e-05, "loss": 4.1131, "step": 3500 }, { "epoch": 0.0, "grad_norm": 1.2695655822753906, "learning_rate": 4e-05, "loss": 4.0351, "step": 4000 }, { "epoch": 0.0, "grad_norm": 1.787291169166565, "learning_rate": 4.5e-05, "loss": 3.9797, "step": 4500 }, { "epoch": 0.0, "grad_norm": 1.4003510475158691, "learning_rate": 5e-05, "loss": 3.9632, "step": 5000 }, { "epoch": 0.0, "grad_norm": 1.7297732830047607, "learning_rate": 5.500000000000001e-05, "loss": 3.8884, "step": 5500 }, { "epoch": 0.0, "grad_norm": 1.6848719120025635, "learning_rate": 6e-05, "loss": 3.8555, "step": 6000 }, { "epoch": 0.0, "grad_norm": 1.5225064754486084, "learning_rate": 6.500000000000001e-05, "loss": 3.8361, "step": 6500 }, { "epoch": 0.0, "grad_norm": 1.369327187538147, "learning_rate": 7e-05, "loss": 3.8418, "step": 7000 }, { "epoch": 0.0, "grad_norm": 1.8992069959640503, "learning_rate": 7.500000000000001e-05, "loss": 3.7863, "step": 7500 }, { "epoch": 0.0, "grad_norm": 0.6943246722221375, "learning_rate": 8e-05, "loss": 3.7756, "step": 8000 }, { "epoch": 0.0, "grad_norm": 0.9557390213012695, "learning_rate": 8.5e-05, "loss": 3.7761, "step": 8500 }, { "epoch": 0.01, "grad_norm": 5.63359260559082, "learning_rate": 9e-05, "loss": 3.7632, "step": 9000 }, { "epoch": 0.01, "grad_norm": 2.1707723140716553, "learning_rate": 9.5e-05, "loss": 3.7302, "step": 9500 }, { "epoch": 0.01, "grad_norm": 1.575547456741333, "learning_rate": 0.0001, "loss": 3.6666, "step": 10000 }, { "epoch": 0.01, "grad_norm": 1.7842143774032593, "learning_rate": 9.999907191972478e-05, "loss": 3.6669, "step": 10500 }, { "epoch": 0.01, "grad_norm": 3.472308874130249, "learning_rate": 9.999814383944955e-05, "loss": 3.6722, "step": 11000 }, { "epoch": 0.01, "grad_norm": 4.764346122741699, "learning_rate": 9.99972157591743e-05, "loss": 3.6258, "step": 11500 }, { "epoch": 0.01, "grad_norm": 1.4110618829727173, "learning_rate": 9.999628767889908e-05, "loss": 3.6232, "step": 12000 }, { "epoch": 0.01, "grad_norm": 1.8532825708389282, "learning_rate": 9.999535959862385e-05, "loss": 3.6201, "step": 12500 }, { "epoch": 0.01, "grad_norm": 3.7213845252990723, "learning_rate": 9.999443151834862e-05, "loss": 3.5771, "step": 13000 }, { "epoch": 0.01, "grad_norm": 6.306430816650391, "learning_rate": 9.999350343807339e-05, "loss": 3.5676, "step": 13500 }, { "epoch": 0.01, "grad_norm": 3.5671603679656982, "learning_rate": 9.999257535779816e-05, "loss": 3.542, "step": 14000 }, { "epoch": 0.01, "grad_norm": 1.9145666360855103, "learning_rate": 9.999164727752292e-05, "loss": 3.524, "step": 14500 }, { "epoch": 0.01, "grad_norm": 2.1453213691711426, "learning_rate": 9.999071919724769e-05, "loss": 3.507, "step": 15000 }, { "epoch": 0.01, "grad_norm": 2.5378761291503906, "learning_rate": 9.998979111697245e-05, "loss": 3.5117, "step": 15500 }, { "epoch": 0.01, "grad_norm": 1.8997902870178223, "learning_rate": 9.998886303669722e-05, "loss": 3.4705, "step": 16000 }, { "epoch": 0.01, "grad_norm": 1.9014019966125488, "learning_rate": 9.998793495642199e-05, "loss": 3.4625, "step": 16500 }, { "epoch": 0.01, "grad_norm": 4.82717227935791, "learning_rate": 9.998700687614676e-05, "loss": 3.462, "step": 17000 }, { "epoch": 0.01, "grad_norm": 4.583270072937012, "learning_rate": 9.998607879587154e-05, "loss": 3.434, "step": 17500 }, { "epoch": 0.01, "grad_norm": 2.3539793491363525, "learning_rate": 9.998515071559629e-05, "loss": 3.3932, "step": 18000 }, { "epoch": 0.01, "grad_norm": 3.745349884033203, "learning_rate": 9.998422263532106e-05, "loss": 3.3875, "step": 18500 }, { "epoch": 0.01, "grad_norm": 3.2298102378845215, "learning_rate": 9.998329455504584e-05, "loss": 3.3663, "step": 19000 }, { "epoch": 0.01, "grad_norm": 2.3267478942871094, "learning_rate": 9.998236647477061e-05, "loss": 3.3471, "step": 19500 }, { "epoch": 0.01, "grad_norm": 4.170761585235596, "learning_rate": 9.998143839449538e-05, "loss": 3.3523, "step": 20000 }, { "epoch": 0.01, "grad_norm": 3.3661534786224365, "learning_rate": 9.998051031422015e-05, "loss": 3.3571, "step": 20500 }, { "epoch": 0.01, "grad_norm": 3.1715431213378906, "learning_rate": 9.997958223394491e-05, "loss": 3.3627, "step": 21000 }, { "epoch": 0.01, "grad_norm": 12.678780555725098, "learning_rate": 9.997865415366968e-05, "loss": 3.3433, "step": 21500 }, { "epoch": 0.01, "grad_norm": 4.594781398773193, "learning_rate": 9.997772607339445e-05, "loss": 3.3309, "step": 22000 }, { "epoch": 0.01, "grad_norm": 3.517892599105835, "learning_rate": 9.997679799311922e-05, "loss": 3.3274, "step": 22500 }, { "epoch": 0.01, "grad_norm": 2.783637285232544, "learning_rate": 9.9975869912844e-05, "loss": 3.3317, "step": 23000 }, { "epoch": 0.01, "grad_norm": 4.124772548675537, "learning_rate": 9.997494183256876e-05, "loss": 3.3127, "step": 23500 }, { "epoch": 0.01, "grad_norm": 2.7662994861602783, "learning_rate": 9.997401375229352e-05, "loss": 3.3101, "step": 24000 }, { "epoch": 0.01, "grad_norm": 2.652294158935547, "learning_rate": 9.99730856720183e-05, "loss": 3.2721, "step": 24500 }, { "epoch": 0.01, "grad_norm": 2.3313260078430176, "learning_rate": 9.997215759174307e-05, "loss": 3.2796, "step": 25000 }, { "epoch": 0.01, "grad_norm": 4.2452392578125, "learning_rate": 9.997122951146782e-05, "loss": 3.2846, "step": 25500 }, { "epoch": 0.01, "grad_norm": 3.510793924331665, "learning_rate": 9.99703014311926e-05, "loss": 3.29, "step": 26000 }, { "epoch": 0.01, "grad_norm": 2.9583752155303955, "learning_rate": 9.996937335091737e-05, "loss": 3.2618, "step": 26500 }, { "epoch": 0.02, "grad_norm": 3.7444071769714355, "learning_rate": 9.996844527064212e-05, "loss": 3.2348, "step": 27000 }, { "epoch": 0.02, "grad_norm": 2.852471113204956, "learning_rate": 9.99675171903669e-05, "loss": 3.2315, "step": 27500 }, { "epoch": 0.02, "grad_norm": 2.542102813720703, "learning_rate": 9.996658911009167e-05, "loss": 3.2564, "step": 28000 }, { "epoch": 0.02, "grad_norm": 3.195061445236206, "learning_rate": 9.996566102981644e-05, "loss": 3.2477, "step": 28500 }, { "epoch": 0.02, "grad_norm": 4.133997917175293, "learning_rate": 9.996473294954121e-05, "loss": 3.2246, "step": 29000 }, { "epoch": 0.02, "grad_norm": 3.147731304168701, "learning_rate": 9.996380486926598e-05, "loss": 3.2483, "step": 29500 }, { "epoch": 0.02, "grad_norm": 2.417142868041992, "learning_rate": 9.996287678899074e-05, "loss": 3.2084, "step": 30000 }, { "epoch": 0.02, "grad_norm": 2.3051340579986572, "learning_rate": 9.996194870871551e-05, "loss": 3.2305, "step": 30500 }, { "epoch": 0.02, "grad_norm": 3.268831491470337, "learning_rate": 9.996102062844028e-05, "loss": 3.2198, "step": 31000 }, { "epoch": 0.02, "grad_norm": 2.9484684467315674, "learning_rate": 9.996009254816505e-05, "loss": 3.1927, "step": 31500 }, { "epoch": 0.02, "grad_norm": 5.482509613037109, "learning_rate": 9.995916446788982e-05, "loss": 3.2018, "step": 32000 }, { "epoch": 0.02, "grad_norm": 3.319070339202881, "learning_rate": 9.99582363876146e-05, "loss": 3.2034, "step": 32500 }, { "epoch": 0.02, "grad_norm": 2.9254605770111084, "learning_rate": 9.995730830733935e-05, "loss": 3.1935, "step": 33000 }, { "epoch": 0.02, "grad_norm": 3.508344888687134, "learning_rate": 9.995638022706412e-05, "loss": 3.1913, "step": 33500 }, { "epoch": 0.02, "grad_norm": 2.0120761394500732, "learning_rate": 9.99554521467889e-05, "loss": 3.2221, "step": 34000 }, { "epoch": 0.02, "grad_norm": 2.6379246711730957, "learning_rate": 9.995452406651367e-05, "loss": 3.1848, "step": 34500 }, { "epoch": 0.02, "grad_norm": 4.178890705108643, "learning_rate": 9.995359598623844e-05, "loss": 3.184, "step": 35000 }, { "epoch": 0.02, "grad_norm": 3.148407459259033, "learning_rate": 9.995266790596321e-05, "loss": 3.2119, "step": 35500 }, { "epoch": 0.02, "grad_norm": 2.3678390979766846, "learning_rate": 9.995173982568797e-05, "loss": 3.1744, "step": 36000 }, { "epoch": 0.02, "grad_norm": 3.0567054748535156, "learning_rate": 9.995081174541273e-05, "loss": 3.1821, "step": 36500 }, { "epoch": 0.02, "grad_norm": 3.4158051013946533, "learning_rate": 9.99498836651375e-05, "loss": 3.1615, "step": 37000 }, { "epoch": 0.02, "grad_norm": 3.654043197631836, "learning_rate": 9.994895558486227e-05, "loss": 3.174, "step": 37500 }, { "epoch": 0.02, "grad_norm": 3.634075403213501, "learning_rate": 9.994802750458704e-05, "loss": 3.1792, "step": 38000 }, { "epoch": 0.02, "grad_norm": 2.8857271671295166, "learning_rate": 9.994709942431181e-05, "loss": 3.1441, "step": 38500 }, { "epoch": 0.02, "grad_norm": 3.0216064453125, "learning_rate": 9.994617134403658e-05, "loss": 3.1464, "step": 39000 }, { "epoch": 0.02, "grad_norm": 4.89854097366333, "learning_rate": 9.994524326376134e-05, "loss": 3.1514, "step": 39500 }, { "epoch": 0.02, "grad_norm": 3.495551586151123, "learning_rate": 9.994431518348611e-05, "loss": 3.153, "step": 40000 }, { "epoch": 0.02, "grad_norm": 2.9397456645965576, "learning_rate": 9.994338710321088e-05, "loss": 3.1329, "step": 40500 }, { "epoch": 0.02, "grad_norm": 5.4169721603393555, "learning_rate": 9.994245902293566e-05, "loss": 3.1485, "step": 41000 }, { "epoch": 0.02, "grad_norm": 2.784404754638672, "learning_rate": 9.994153094266043e-05, "loss": 3.1423, "step": 41500 }, { "epoch": 0.02, "grad_norm": 2.435594320297241, "learning_rate": 9.994060286238518e-05, "loss": 3.133, "step": 42000 }, { "epoch": 0.02, "grad_norm": 2.478294610977173, "learning_rate": 9.993967478210996e-05, "loss": 3.135, "step": 42500 }, { "epoch": 0.02, "grad_norm": 3.871967315673828, "learning_rate": 9.993874670183473e-05, "loss": 3.1133, "step": 43000 }, { "epoch": 0.02, "grad_norm": 2.735368490219116, "learning_rate": 9.99378186215595e-05, "loss": 3.1333, "step": 43500 }, { "epoch": 0.02, "grad_norm": 3.2882280349731445, "learning_rate": 9.993689054128427e-05, "loss": 3.1231, "step": 44000 }, { "epoch": 0.02, "grad_norm": 4.579193115234375, "learning_rate": 9.993596246100904e-05, "loss": 3.1051, "step": 44500 }, { "epoch": 0.03, "grad_norm": 4.136692047119141, "learning_rate": 9.99350343807338e-05, "loss": 3.1295, "step": 45000 }, { "epoch": 0.03, "grad_norm": 3.8374266624450684, "learning_rate": 9.993410630045857e-05, "loss": 3.1182, "step": 45500 }, { "epoch": 0.03, "grad_norm": 3.8597323894500732, "learning_rate": 9.993317822018334e-05, "loss": 3.1102, "step": 46000 }, { "epoch": 0.03, "grad_norm": 2.7364611625671387, "learning_rate": 9.99322501399081e-05, "loss": 3.1021, "step": 46500 }, { "epoch": 0.03, "grad_norm": 10.505576133728027, "learning_rate": 9.993132205963287e-05, "loss": 3.087, "step": 47000 }, { "epoch": 0.03, "grad_norm": 3.8307197093963623, "learning_rate": 9.993039397935764e-05, "loss": 3.1033, "step": 47500 }, { "epoch": 0.03, "grad_norm": 3.2991435527801514, "learning_rate": 9.992946589908241e-05, "loss": 3.0994, "step": 48000 }, { "epoch": 0.03, "grad_norm": 2.461336612701416, "learning_rate": 9.992853781880717e-05, "loss": 3.0768, "step": 48500 }, { "epoch": 0.03, "grad_norm": 4.653237819671631, "learning_rate": 9.992760973853194e-05, "loss": 3.0751, "step": 49000 }, { "epoch": 0.03, "grad_norm": 3.9689579010009766, "learning_rate": 9.992668165825671e-05, "loss": 3.0845, "step": 49500 }, { "epoch": 0.03, "grad_norm": 3.50116229057312, "learning_rate": 9.992575357798149e-05, "loss": 3.0563, "step": 50000 }, { "epoch": 0.03, "grad_norm": 7.10615873336792, "learning_rate": 9.992482549770626e-05, "loss": 3.0585, "step": 50500 }, { "epoch": 0.03, "grad_norm": 3.484778642654419, "learning_rate": 9.992389741743103e-05, "loss": 3.0657, "step": 51000 }, { "epoch": 0.03, "grad_norm": 4.087871074676514, "learning_rate": 9.992296933715579e-05, "loss": 3.0302, "step": 51500 }, { "epoch": 0.03, "grad_norm": 6.728222846984863, "learning_rate": 9.992204125688056e-05, "loss": 3.0197, "step": 52000 }, { "epoch": 0.03, "grad_norm": 4.08004093170166, "learning_rate": 9.992111317660533e-05, "loss": 3.0261, "step": 52500 }, { "epoch": 0.03, "grad_norm": 4.262966156005859, "learning_rate": 9.99201850963301e-05, "loss": 3.0185, "step": 53000 }, { "epoch": 0.03, "grad_norm": 3.3461170196533203, "learning_rate": 9.991925701605487e-05, "loss": 2.9916, "step": 53500 }, { "epoch": 0.03, "grad_norm": 5.650246620178223, "learning_rate": 9.991832893577964e-05, "loss": 3.0014, "step": 54000 }, { "epoch": 0.03, "grad_norm": 4.820484161376953, "learning_rate": 9.99174008555044e-05, "loss": 2.9966, "step": 54500 }, { "epoch": 0.03, "grad_norm": 3.245210886001587, "learning_rate": 9.991647277522917e-05, "loss": 2.9829, "step": 55000 }, { "epoch": 0.03, "grad_norm": 3.009565591812134, "learning_rate": 9.991554469495394e-05, "loss": 2.9665, "step": 55500 }, { "epoch": 0.03, "grad_norm": 5.277273654937744, "learning_rate": 9.991461661467872e-05, "loss": 2.9572, "step": 56000 }, { "epoch": 0.03, "grad_norm": 3.0885508060455322, "learning_rate": 9.991368853440347e-05, "loss": 2.953, "step": 56500 }, { "epoch": 0.03, "grad_norm": 3.3525726795196533, "learning_rate": 9.991276045412824e-05, "loss": 2.9494, "step": 57000 }, { "epoch": 0.03, "grad_norm": 3.699075937271118, "learning_rate": 9.9911832373853e-05, "loss": 2.9731, "step": 57500 }, { "epoch": 0.03, "grad_norm": 2.363311529159546, "learning_rate": 9.991090429357777e-05, "loss": 2.9257, "step": 58000 }, { "epoch": 0.03, "grad_norm": 3.144495964050293, "learning_rate": 9.990997621330255e-05, "loss": 2.9135, "step": 58500 }, { "epoch": 0.03, "grad_norm": 3.192347288131714, "learning_rate": 9.990904813302732e-05, "loss": 2.8925, "step": 59000 }, { "epoch": 0.03, "grad_norm": 3.4419093132019043, "learning_rate": 9.990812005275209e-05, "loss": 2.9018, "step": 59500 }, { "epoch": 0.03, "grad_norm": 3.506303071975708, "learning_rate": 9.990719197247686e-05, "loss": 2.8921, "step": 60000 }, { "epoch": 0.03, "grad_norm": 2.7846992015838623, "learning_rate": 9.990626389220162e-05, "loss": 2.8904, "step": 60500 }, { "epoch": 0.03, "grad_norm": 3.9778714179992676, "learning_rate": 9.990533581192639e-05, "loss": 2.854, "step": 61000 }, { "epoch": 0.03, "grad_norm": 3.1654438972473145, "learning_rate": 9.990440773165116e-05, "loss": 2.8481, "step": 61500 }, { "epoch": 0.03, "grad_norm": 5.007691860198975, "learning_rate": 9.990347965137593e-05, "loss": 2.836, "step": 62000 }, { "epoch": 0.03, "grad_norm": 3.4846861362457275, "learning_rate": 9.99025515711007e-05, "loss": 2.8312, "step": 62500 }, { "epoch": 0.04, "grad_norm": 6.449169158935547, "learning_rate": 9.990162349082547e-05, "loss": 2.8371, "step": 63000 }, { "epoch": 0.04, "grad_norm": 5.483363628387451, "learning_rate": 9.990069541055023e-05, "loss": 2.8074, "step": 63500 }, { "epoch": 0.04, "grad_norm": 4.194338798522949, "learning_rate": 9.9899767330275e-05, "loss": 2.8134, "step": 64000 }, { "epoch": 0.04, "grad_norm": 4.445028305053711, "learning_rate": 9.989883924999978e-05, "loss": 2.8024, "step": 64500 }, { "epoch": 0.04, "grad_norm": 4.259857177734375, "learning_rate": 9.989791116972455e-05, "loss": 2.8063, "step": 65000 }, { "epoch": 0.04, "grad_norm": 15.38962173461914, "learning_rate": 9.989698308944932e-05, "loss": 2.7849, "step": 65500 }, { "epoch": 0.04, "grad_norm": 7.087435245513916, "learning_rate": 9.989605500917409e-05, "loss": 2.7755, "step": 66000 }, { "epoch": 0.04, "grad_norm": 4.6525044441223145, "learning_rate": 9.989512692889885e-05, "loss": 2.7692, "step": 66500 }, { "epoch": 0.04, "grad_norm": 3.657395124435425, "learning_rate": 9.98941988486236e-05, "loss": 2.7754, "step": 67000 }, { "epoch": 0.04, "grad_norm": 4.082229137420654, "learning_rate": 9.989327076834838e-05, "loss": 2.7462, "step": 67500 }, { "epoch": 0.04, "grad_norm": 5.71895694732666, "learning_rate": 9.989234268807315e-05, "loss": 2.7489, "step": 68000 }, { "epoch": 0.04, "grad_norm": 4.07025671005249, "learning_rate": 9.989141460779792e-05, "loss": 2.7429, "step": 68500 }, { "epoch": 0.04, "grad_norm": 6.3579888343811035, "learning_rate": 9.989048652752269e-05, "loss": 2.7359, "step": 69000 }, { "epoch": 0.04, "grad_norm": 5.608436107635498, "learning_rate": 9.988955844724746e-05, "loss": 2.7413, "step": 69500 }, { "epoch": 0.04, "grad_norm": 3.6080968379974365, "learning_rate": 9.988863036697222e-05, "loss": 2.7374, "step": 70000 }, { "epoch": 0.04, "grad_norm": 3.0696682929992676, "learning_rate": 9.988770228669699e-05, "loss": 2.7024, "step": 70500 }, { "epoch": 0.04, "grad_norm": 3.4028148651123047, "learning_rate": 9.988677420642176e-05, "loss": 2.6969, "step": 71000 }, { "epoch": 0.04, "grad_norm": 3.2408607006073, "learning_rate": 9.988584612614653e-05, "loss": 2.7142, "step": 71500 }, { "epoch": 0.04, "grad_norm": 4.686740875244141, "learning_rate": 9.98849180458713e-05, "loss": 2.6779, "step": 72000 }, { "epoch": 0.04, "grad_norm": 4.289364814758301, "learning_rate": 9.988398996559606e-05, "loss": 2.6924, "step": 72500 }, { "epoch": 0.04, "grad_norm": 5.570862293243408, "learning_rate": 9.988306188532083e-05, "loss": 2.7066, "step": 73000 }, { "epoch": 0.04, "grad_norm": 6.265756130218506, "learning_rate": 9.98821338050456e-05, "loss": 2.6904, "step": 73500 }, { "epoch": 0.04, "grad_norm": 4.149326324462891, "learning_rate": 9.988120572477038e-05, "loss": 2.675, "step": 74000 }, { "epoch": 0.04, "grad_norm": 3.304511547088623, "learning_rate": 9.988027764449515e-05, "loss": 2.6588, "step": 74500 }, { "epoch": 0.04, "grad_norm": 4.896495342254639, "learning_rate": 9.987934956421992e-05, "loss": 2.6543, "step": 75000 }, { "epoch": 0.04, "grad_norm": 4.827699661254883, "learning_rate": 9.987842148394468e-05, "loss": 2.6547, "step": 75500 }, { "epoch": 0.04, "grad_norm": 4.508848667144775, "learning_rate": 9.987749340366945e-05, "loss": 2.6737, "step": 76000 }, { "epoch": 0.04, "grad_norm": 2.711066722869873, "learning_rate": 9.987656532339422e-05, "loss": 2.6596, "step": 76500 }, { "epoch": 0.04, "grad_norm": 4.568380832672119, "learning_rate": 9.987563724311898e-05, "loss": 2.6336, "step": 77000 }, { "epoch": 0.04, "grad_norm": 4.230466365814209, "learning_rate": 9.987470916284375e-05, "loss": 2.6513, "step": 77500 }, { "epoch": 0.04, "grad_norm": 7.702442646026611, "learning_rate": 9.987378108256852e-05, "loss": 2.6448, "step": 78000 }, { "epoch": 0.04, "grad_norm": 5.38634729385376, "learning_rate": 9.987285300229329e-05, "loss": 2.6557, "step": 78500 }, { "epoch": 0.04, "grad_norm": 4.267560005187988, "learning_rate": 9.987192492201805e-05, "loss": 2.6387, "step": 79000 }, { "epoch": 0.04, "grad_norm": 4.3129801750183105, "learning_rate": 9.987099684174282e-05, "loss": 2.6109, "step": 79500 }, { "epoch": 0.04, "grad_norm": 4.636094570159912, "learning_rate": 9.98700687614676e-05, "loss": 2.6198, "step": 80000 }, { "epoch": 0.04, "grad_norm": 3.763615846633911, "learning_rate": 9.986914068119236e-05, "loss": 2.611, "step": 80500 }, { "epoch": 0.05, "grad_norm": 4.402085781097412, "learning_rate": 9.986821260091714e-05, "loss": 2.6057, "step": 81000 }, { "epoch": 0.05, "grad_norm": 4.437091827392578, "learning_rate": 9.986728452064191e-05, "loss": 2.6322, "step": 81500 }, { "epoch": 0.05, "grad_norm": 4.0002923011779785, "learning_rate": 9.986635644036667e-05, "loss": 2.5895, "step": 82000 }, { "epoch": 0.05, "grad_norm": 7.929385662078857, "learning_rate": 9.986542836009144e-05, "loss": 2.5913, "step": 82500 }, { "epoch": 0.05, "grad_norm": 3.028870105743408, "learning_rate": 9.986450027981621e-05, "loss": 2.5852, "step": 83000 }, { "epoch": 0.05, "grad_norm": 4.816354274749756, "learning_rate": 9.986357219954098e-05, "loss": 2.612, "step": 83500 }, { "epoch": 0.05, "grad_norm": 3.2722418308258057, "learning_rate": 9.986264411926575e-05, "loss": 2.5844, "step": 84000 }, { "epoch": 0.05, "grad_norm": 4.4086713790893555, "learning_rate": 9.986171603899052e-05, "loss": 2.5766, "step": 84500 }, { "epoch": 0.05, "grad_norm": 4.107178211212158, "learning_rate": 9.986078795871528e-05, "loss": 2.5861, "step": 85000 }, { "epoch": 0.05, "grad_norm": 4.0130133628845215, "learning_rate": 9.985985987844005e-05, "loss": 2.5946, "step": 85500 }, { "epoch": 0.05, "grad_norm": 4.053595066070557, "learning_rate": 9.985893179816482e-05, "loss": 2.5411, "step": 86000 }, { "epoch": 0.05, "grad_norm": 4.136240482330322, "learning_rate": 9.98580037178896e-05, "loss": 2.5791, "step": 86500 }, { "epoch": 0.05, "grad_norm": 6.301804542541504, "learning_rate": 9.985707563761437e-05, "loss": 2.5621, "step": 87000 }, { "epoch": 0.05, "grad_norm": 4.376827239990234, "learning_rate": 9.985614755733912e-05, "loss": 2.5693, "step": 87500 }, { "epoch": 0.05, "grad_norm": 3.8407466411590576, "learning_rate": 9.985521947706388e-05, "loss": 2.5354, "step": 88000 }, { "epoch": 0.05, "grad_norm": 6.222740650177002, "learning_rate": 9.985429139678865e-05, "loss": 2.5402, "step": 88500 }, { "epoch": 0.05, "grad_norm": 5.93278169631958, "learning_rate": 9.985336331651342e-05, "loss": 2.5285, "step": 89000 }, { "epoch": 0.05, "grad_norm": 2.4788284301757812, "learning_rate": 9.98524352362382e-05, "loss": 2.511, "step": 89500 }, { "epoch": 0.05, "grad_norm": 3.4369866847991943, "learning_rate": 9.985150715596297e-05, "loss": 2.5223, "step": 90000 }, { "epoch": 0.05, "grad_norm": 5.007632732391357, "learning_rate": 9.985057907568774e-05, "loss": 2.5231, "step": 90500 }, { "epoch": 0.05, "grad_norm": 3.440267562866211, "learning_rate": 9.98496509954125e-05, "loss": 2.5233, "step": 91000 }, { "epoch": 0.05, "grad_norm": 9.757936477661133, "learning_rate": 9.984872291513727e-05, "loss": 2.5099, "step": 91500 }, { "epoch": 0.05, "grad_norm": 4.741192817687988, "learning_rate": 9.984779483486204e-05, "loss": 2.5212, "step": 92000 }, { "epoch": 0.05, "grad_norm": 4.7662811279296875, "learning_rate": 9.984686675458681e-05, "loss": 2.5023, "step": 92500 }, { "epoch": 0.05, "grad_norm": 4.681964874267578, "learning_rate": 9.984593867431158e-05, "loss": 2.5055, "step": 93000 }, { "epoch": 0.05, "grad_norm": 3.7245185375213623, "learning_rate": 9.984501059403635e-05, "loss": 2.5083, "step": 93500 }, { "epoch": 0.05, "grad_norm": 5.7924418449401855, "learning_rate": 9.984408251376111e-05, "loss": 2.5024, "step": 94000 }, { "epoch": 0.05, "grad_norm": 5.420963764190674, "learning_rate": 9.984315443348588e-05, "loss": 2.5053, "step": 94500 }, { "epoch": 0.05, "grad_norm": 4.608907699584961, "learning_rate": 9.984222635321065e-05, "loss": 2.4765, "step": 95000 }, { "epoch": 0.05, "grad_norm": 5.017517566680908, "learning_rate": 9.984129827293543e-05, "loss": 2.5004, "step": 95500 }, { "epoch": 0.05, "grad_norm": 6.300387859344482, "learning_rate": 9.98403701926602e-05, "loss": 2.4943, "step": 96000 }, { "epoch": 0.05, "grad_norm": 6.16803503036499, "learning_rate": 9.983944211238497e-05, "loss": 2.5001, "step": 96500 }, { "epoch": 0.05, "grad_norm": 4.007481098175049, "learning_rate": 9.983851403210973e-05, "loss": 2.4898, "step": 97000 }, { "epoch": 0.05, "grad_norm": 5.498426914215088, "learning_rate": 9.983758595183448e-05, "loss": 2.4874, "step": 97500 }, { "epoch": 0.05, "grad_norm": 4.115726470947266, "learning_rate": 9.983665787155926e-05, "loss": 2.4714, "step": 98000 }, { "epoch": 0.05, "grad_norm": 4.724228382110596, "learning_rate": 9.983572979128403e-05, "loss": 2.4748, "step": 98500 }, { "epoch": 0.06, "grad_norm": 6.842497825622559, "learning_rate": 9.98348017110088e-05, "loss": 2.4761, "step": 99000 }, { "epoch": 0.06, "grad_norm": 4.527170181274414, "learning_rate": 9.983387363073357e-05, "loss": 2.462, "step": 99500 }, { "epoch": 0.06, "grad_norm": 4.956340789794922, "learning_rate": 9.983294555045834e-05, "loss": 2.4568, "step": 100000 }, { "epoch": 0.06, "grad_norm": 4.177946090698242, "learning_rate": 9.98320174701831e-05, "loss": 2.4418, "step": 100500 }, { "epoch": 0.06, "grad_norm": 6.623388767242432, "learning_rate": 9.983108938990787e-05, "loss": 2.4467, "step": 101000 }, { "epoch": 0.06, "grad_norm": 6.102182865142822, "learning_rate": 9.983016130963264e-05, "loss": 2.444, "step": 101500 }, { "epoch": 0.06, "grad_norm": 4.186117649078369, "learning_rate": 9.982923322935741e-05, "loss": 2.4396, "step": 102000 }, { "epoch": 0.06, "grad_norm": 3.916994571685791, "learning_rate": 9.982830514908218e-05, "loss": 2.4612, "step": 102500 }, { "epoch": 0.06, "grad_norm": 4.213066577911377, "learning_rate": 9.982737706880694e-05, "loss": 2.4244, "step": 103000 }, { "epoch": 0.06, "grad_norm": 4.665497779846191, "learning_rate": 9.982644898853171e-05, "loss": 2.4364, "step": 103500 }, { "epoch": 0.06, "grad_norm": 4.808815956115723, "learning_rate": 9.982552090825648e-05, "loss": 2.4381, "step": 104000 }, { "epoch": 0.06, "grad_norm": 5.384184837341309, "learning_rate": 9.982459282798126e-05, "loss": 2.4219, "step": 104500 }, { "epoch": 0.06, "grad_norm": 8.32588005065918, "learning_rate": 9.982366474770603e-05, "loss": 2.4207, "step": 105000 }, { "epoch": 0.06, "grad_norm": 6.564486503601074, "learning_rate": 9.98227366674308e-05, "loss": 2.4182, "step": 105500 }, { "epoch": 0.06, "grad_norm": 4.123614311218262, "learning_rate": 9.982180858715556e-05, "loss": 2.4214, "step": 106000 }, { "epoch": 0.06, "grad_norm": 5.507079124450684, "learning_rate": 9.982088050688033e-05, "loss": 2.4096, "step": 106500 }, { "epoch": 0.06, "grad_norm": 5.728691577911377, "learning_rate": 9.98199524266051e-05, "loss": 2.4201, "step": 107000 }, { "epoch": 0.06, "grad_norm": 6.611893177032471, "learning_rate": 9.981902434632987e-05, "loss": 2.4021, "step": 107500 }, { "epoch": 0.06, "grad_norm": 5.187854766845703, "learning_rate": 9.981809626605463e-05, "loss": 2.3956, "step": 108000 }, { "epoch": 0.06, "grad_norm": 6.5400166511535645, "learning_rate": 9.98171681857794e-05, "loss": 2.4168, "step": 108500 }, { "epoch": 0.06, "grad_norm": 4.0128173828125, "learning_rate": 9.981624010550417e-05, "loss": 2.4017, "step": 109000 }, { "epoch": 0.06, "grad_norm": 3.7369205951690674, "learning_rate": 9.981531202522893e-05, "loss": 2.3895, "step": 109500 }, { "epoch": 0.06, "grad_norm": 3.8515870571136475, "learning_rate": 9.98143839449537e-05, "loss": 2.3728, "step": 110000 }, { "epoch": 0.06, "grad_norm": 4.47413969039917, "learning_rate": 9.981345586467847e-05, "loss": 2.4067, "step": 110500 }, { "epoch": 0.06, "grad_norm": 4.970263481140137, "learning_rate": 9.981252778440324e-05, "loss": 2.3775, "step": 111000 }, { "epoch": 0.06, "grad_norm": 4.086507320404053, "learning_rate": 9.981159970412802e-05, "loss": 2.3695, "step": 111500 }, { "epoch": 0.06, "grad_norm": 4.484976768493652, "learning_rate": 9.981067162385279e-05, "loss": 2.3774, "step": 112000 }, { "epoch": 0.06, "grad_norm": 4.321993350982666, "learning_rate": 9.980974354357754e-05, "loss": 2.3563, "step": 112500 }, { "epoch": 0.06, "grad_norm": 5.6485066413879395, "learning_rate": 9.980881546330232e-05, "loss": 2.3601, "step": 113000 }, { "epoch": 0.06, "grad_norm": 6.259582996368408, "learning_rate": 9.980788738302709e-05, "loss": 2.3409, "step": 113500 }, { "epoch": 0.06, "grad_norm": 5.368774890899658, "learning_rate": 9.980695930275186e-05, "loss": 2.3809, "step": 114000 }, { "epoch": 0.06, "grad_norm": 7.465442657470703, "learning_rate": 9.980603122247663e-05, "loss": 2.3439, "step": 114500 }, { "epoch": 0.06, "grad_norm": 10.82461929321289, "learning_rate": 9.98051031422014e-05, "loss": 2.3738, "step": 115000 }, { "epoch": 0.06, "grad_norm": 3.924154758453369, "learning_rate": 9.980417506192616e-05, "loss": 2.344, "step": 115500 }, { "epoch": 0.06, "grad_norm": 3.8393290042877197, "learning_rate": 9.980324698165093e-05, "loss": 2.3423, "step": 116000 }, { "epoch": 0.06, "grad_norm": 3.9444777965545654, "learning_rate": 9.98023189013757e-05, "loss": 2.3426, "step": 116500 }, { "epoch": 0.07, "grad_norm": 5.0410566329956055, "learning_rate": 9.980139082110047e-05, "loss": 2.3129, "step": 117000 }, { "epoch": 0.07, "grad_norm": 5.183213233947754, "learning_rate": 9.980046274082524e-05, "loss": 2.3187, "step": 117500 }, { "epoch": 0.07, "grad_norm": 6.824711322784424, "learning_rate": 9.979953466055e-05, "loss": 2.3206, "step": 118000 }, { "epoch": 0.07, "grad_norm": 4.512765884399414, "learning_rate": 9.979860658027476e-05, "loss": 2.3176, "step": 118500 }, { "epoch": 0.07, "grad_norm": 6.9497480392456055, "learning_rate": 9.979767849999953e-05, "loss": 2.3418, "step": 119000 }, { "epoch": 0.07, "grad_norm": 4.003693580627441, "learning_rate": 9.97967504197243e-05, "loss": 2.3193, "step": 119500 }, { "epoch": 0.07, "grad_norm": 6.719389915466309, "learning_rate": 9.979582233944907e-05, "loss": 2.3244, "step": 120000 }, { "epoch": 0.07, "grad_norm": 4.46160364151001, "learning_rate": 9.979489425917385e-05, "loss": 2.3313, "step": 120500 }, { "epoch": 0.07, "grad_norm": 8.168338775634766, "learning_rate": 9.979396617889862e-05, "loss": 2.311, "step": 121000 }, { "epoch": 0.07, "grad_norm": 4.333712100982666, "learning_rate": 9.979303809862338e-05, "loss": 2.328, "step": 121500 }, { "epoch": 0.07, "grad_norm": 8.339402198791504, "learning_rate": 9.979211001834815e-05, "loss": 2.3118, "step": 122000 }, { "epoch": 0.07, "grad_norm": 4.38870906829834, "learning_rate": 9.979118193807292e-05, "loss": 2.3037, "step": 122500 }, { "epoch": 0.07, "grad_norm": 4.003161430358887, "learning_rate": 9.979025385779769e-05, "loss": 2.2959, "step": 123000 }, { "epoch": 0.07, "grad_norm": 6.8454389572143555, "learning_rate": 9.978932577752246e-05, "loss": 2.3192, "step": 123500 }, { "epoch": 0.07, "grad_norm": 6.631998538970947, "learning_rate": 9.978839769724723e-05, "loss": 2.2975, "step": 124000 }, { "epoch": 0.07, "grad_norm": 4.8582963943481445, "learning_rate": 9.978746961697199e-05, "loss": 2.2983, "step": 124500 }, { "epoch": 0.07, "grad_norm": 3.7582802772521973, "learning_rate": 9.978654153669676e-05, "loss": 2.3105, "step": 125000 }, { "epoch": 0.07, "grad_norm": 4.227505683898926, "learning_rate": 9.978561345642153e-05, "loss": 2.2866, "step": 125500 }, { "epoch": 0.07, "grad_norm": 5.702460765838623, "learning_rate": 9.97846853761463e-05, "loss": 2.2854, "step": 126000 }, { "epoch": 0.07, "grad_norm": 4.512174129486084, "learning_rate": 9.978375729587108e-05, "loss": 2.2962, "step": 126500 }, { "epoch": 0.07, "grad_norm": 4.127579212188721, "learning_rate": 9.978282921559585e-05, "loss": 2.3017, "step": 127000 }, { "epoch": 0.07, "grad_norm": 4.7636284828186035, "learning_rate": 9.97819011353206e-05, "loss": 2.26, "step": 127500 }, { "epoch": 0.07, "grad_norm": 5.8895697593688965, "learning_rate": 9.978097305504538e-05, "loss": 2.2851, "step": 128000 }, { "epoch": 0.07, "grad_norm": 5.10247278213501, "learning_rate": 9.978004497477013e-05, "loss": 2.2912, "step": 128500 }, { "epoch": 0.07, "grad_norm": 4.03781795501709, "learning_rate": 9.97791168944949e-05, "loss": 2.2631, "step": 129000 }, { "epoch": 0.07, "grad_norm": 6.225619316101074, "learning_rate": 9.977818881421968e-05, "loss": 2.2959, "step": 129500 }, { "epoch": 0.07, "grad_norm": 8.12752628326416, "learning_rate": 9.977726073394445e-05, "loss": 2.2574, "step": 130000 }, { "epoch": 0.07, "grad_norm": 8.98240852355957, "learning_rate": 9.977633265366922e-05, "loss": 2.2765, "step": 130500 }, { "epoch": 0.07, "grad_norm": 6.665409088134766, "learning_rate": 9.977540457339398e-05, "loss": 2.2831, "step": 131000 }, { "epoch": 0.07, "grad_norm": 5.666757106781006, "learning_rate": 9.977447649311875e-05, "loss": 2.2514, "step": 131500 }, { "epoch": 0.07, "grad_norm": 4.039821147918701, "learning_rate": 9.977354841284352e-05, "loss": 2.2553, "step": 132000 }, { "epoch": 0.07, "grad_norm": 4.816211223602295, "learning_rate": 9.977262033256829e-05, "loss": 2.2512, "step": 132500 }, { "epoch": 0.07, "grad_norm": 7.379537105560303, "learning_rate": 9.977169225229306e-05, "loss": 2.256, "step": 133000 }, { "epoch": 0.07, "grad_norm": 3.727262258529663, "learning_rate": 9.977076417201782e-05, "loss": 2.2447, "step": 133500 }, { "epoch": 0.07, "grad_norm": 4.287083625793457, "learning_rate": 9.976983609174259e-05, "loss": 2.233, "step": 134000 }, { "epoch": 0.07, "grad_norm": 4.225050926208496, "learning_rate": 9.976890801146736e-05, "loss": 2.2474, "step": 134500 }, { "epoch": 0.08, "grad_norm": 5.888236999511719, "learning_rate": 9.976797993119214e-05, "loss": 2.2187, "step": 135000 }, { "epoch": 0.08, "grad_norm": 5.869006156921387, "learning_rate": 9.97670518509169e-05, "loss": 2.2442, "step": 135500 }, { "epoch": 0.08, "grad_norm": 6.458480358123779, "learning_rate": 9.976612377064168e-05, "loss": 2.2501, "step": 136000 }, { "epoch": 0.08, "grad_norm": 3.7204341888427734, "learning_rate": 9.976519569036644e-05, "loss": 2.2464, "step": 136500 }, { "epoch": 0.08, "grad_norm": 4.325768947601318, "learning_rate": 9.976426761009121e-05, "loss": 2.2421, "step": 137000 }, { "epoch": 0.08, "grad_norm": 4.79429292678833, "learning_rate": 9.976333952981598e-05, "loss": 2.2466, "step": 137500 }, { "epoch": 0.08, "grad_norm": 8.317536354064941, "learning_rate": 9.976241144954075e-05, "loss": 2.2169, "step": 138000 }, { "epoch": 0.08, "grad_norm": 5.129164695739746, "learning_rate": 9.976148336926552e-05, "loss": 2.2188, "step": 138500 }, { "epoch": 0.08, "grad_norm": 3.7128493785858154, "learning_rate": 9.976055528899028e-05, "loss": 2.2372, "step": 139000 }, { "epoch": 0.08, "grad_norm": 4.394794940948486, "learning_rate": 9.975962720871505e-05, "loss": 2.2277, "step": 139500 }, { "epoch": 0.08, "grad_norm": 6.397000789642334, "learning_rate": 9.975869912843981e-05, "loss": 2.2509, "step": 140000 }, { "epoch": 0.08, "grad_norm": 5.2059526443481445, "learning_rate": 9.975777104816458e-05, "loss": 2.2082, "step": 140500 }, { "epoch": 0.08, "grad_norm": 3.5649585723876953, "learning_rate": 9.975684296788935e-05, "loss": 2.2383, "step": 141000 }, { "epoch": 0.08, "grad_norm": 4.717801094055176, "learning_rate": 9.975591488761412e-05, "loss": 2.1973, "step": 141500 }, { "epoch": 0.08, "grad_norm": 4.133371829986572, "learning_rate": 9.97549868073389e-05, "loss": 2.2214, "step": 142000 }, { "epoch": 0.08, "grad_norm": 5.321709156036377, "learning_rate": 9.975405872706367e-05, "loss": 2.2012, "step": 142500 }, { "epoch": 0.08, "grad_norm": 5.209156513214111, "learning_rate": 9.975313064678842e-05, "loss": 2.1892, "step": 143000 }, { "epoch": 0.08, "grad_norm": 5.405091285705566, "learning_rate": 9.97522025665132e-05, "loss": 2.217, "step": 143500 }, { "epoch": 0.08, "grad_norm": 5.35112190246582, "learning_rate": 9.975127448623797e-05, "loss": 2.2128, "step": 144000 }, { "epoch": 0.08, "grad_norm": 4.242053985595703, "learning_rate": 9.975034640596274e-05, "loss": 2.191, "step": 144500 }, { "epoch": 0.08, "grad_norm": 4.181004047393799, "learning_rate": 9.974941832568751e-05, "loss": 2.1965, "step": 145000 }, { "epoch": 0.08, "grad_norm": 3.5158121585845947, "learning_rate": 9.974849024541228e-05, "loss": 2.1692, "step": 145500 }, { "epoch": 0.08, "grad_norm": 4.8536577224731445, "learning_rate": 9.974756216513704e-05, "loss": 2.1861, "step": 146000 }, { "epoch": 0.08, "grad_norm": 5.284401893615723, "learning_rate": 9.974663408486181e-05, "loss": 2.1801, "step": 146500 }, { "epoch": 0.08, "grad_norm": 4.611184597015381, "learning_rate": 9.974570600458658e-05, "loss": 2.1879, "step": 147000 }, { "epoch": 0.08, "grad_norm": 4.935207843780518, "learning_rate": 9.974477792431135e-05, "loss": 2.1608, "step": 147500 }, { "epoch": 0.08, "grad_norm": 4.852113246917725, "learning_rate": 9.974384984403612e-05, "loss": 2.1788, "step": 148000 }, { "epoch": 0.08, "grad_norm": 5.22275972366333, "learning_rate": 9.974292176376088e-05, "loss": 2.1585, "step": 148500 }, { "epoch": 0.08, "grad_norm": 5.024623394012451, "learning_rate": 9.974199368348564e-05, "loss": 2.1757, "step": 149000 }, { "epoch": 0.08, "grad_norm": 5.230965614318848, "learning_rate": 9.974106560321041e-05, "loss": 2.1685, "step": 149500 }, { "epoch": 0.08, "grad_norm": 4.0713090896606445, "learning_rate": 9.974013752293518e-05, "loss": 2.1586, "step": 150000 }, { "epoch": 0.08, "grad_norm": 5.00492000579834, "learning_rate": 9.973920944265995e-05, "loss": 2.1538, "step": 150500 }, { "epoch": 0.08, "grad_norm": 3.9473533630371094, "learning_rate": 9.973828136238473e-05, "loss": 2.1585, "step": 151000 }, { "epoch": 0.08, "grad_norm": 4.452467918395996, "learning_rate": 9.97373532821095e-05, "loss": 2.1579, "step": 151500 }, { "epoch": 0.08, "grad_norm": 4.7615647315979, "learning_rate": 9.973642520183425e-05, "loss": 2.1609, "step": 152000 }, { "epoch": 0.08, "grad_norm": 4.646842956542969, "learning_rate": 9.973549712155903e-05, "loss": 2.1762, "step": 152500 }, { "epoch": 0.09, "grad_norm": 5.9310526847839355, "learning_rate": 9.97345690412838e-05, "loss": 2.1578, "step": 153000 }, { "epoch": 0.09, "grad_norm": 3.432331085205078, "learning_rate": 9.973364096100857e-05, "loss": 2.1606, "step": 153500 }, { "epoch": 0.09, "grad_norm": 3.7542684078216553, "learning_rate": 9.973271288073334e-05, "loss": 2.1403, "step": 154000 }, { "epoch": 0.09, "grad_norm": 5.2377400398254395, "learning_rate": 9.973178480045811e-05, "loss": 2.154, "step": 154500 }, { "epoch": 0.09, "grad_norm": 4.928728103637695, "learning_rate": 9.973085672018287e-05, "loss": 2.1478, "step": 155000 }, { "epoch": 0.09, "grad_norm": 7.469805717468262, "learning_rate": 9.972992863990764e-05, "loss": 2.1443, "step": 155500 }, { "epoch": 0.09, "grad_norm": 5.656938076019287, "learning_rate": 9.972900055963241e-05, "loss": 2.1472, "step": 156000 }, { "epoch": 0.09, "grad_norm": 6.054020404815674, "learning_rate": 9.972807247935718e-05, "loss": 2.1766, "step": 156500 }, { "epoch": 0.09, "grad_norm": 4.4340925216674805, "learning_rate": 9.972714439908195e-05, "loss": 2.1198, "step": 157000 }, { "epoch": 0.09, "grad_norm": 5.291192531585693, "learning_rate": 9.972621631880673e-05, "loss": 2.1446, "step": 157500 }, { "epoch": 0.09, "grad_norm": 5.266712188720703, "learning_rate": 9.972528823853148e-05, "loss": 2.1402, "step": 158000 }, { "epoch": 0.09, "grad_norm": 7.1873250007629395, "learning_rate": 9.972436015825626e-05, "loss": 2.1163, "step": 158500 }, { "epoch": 0.09, "grad_norm": 8.807872772216797, "learning_rate": 9.972343207798103e-05, "loss": 2.087, "step": 159000 }, { "epoch": 0.09, "grad_norm": 4.034912109375, "learning_rate": 9.972250399770578e-05, "loss": 2.1372, "step": 159500 }, { "epoch": 0.09, "grad_norm": 4.9043755531311035, "learning_rate": 9.972157591743056e-05, "loss": 2.1184, "step": 160000 }, { "epoch": 0.09, "grad_norm": 4.8470458984375, "learning_rate": 9.972064783715533e-05, "loss": 2.1304, "step": 160500 }, { "epoch": 0.09, "grad_norm": 4.68010139465332, "learning_rate": 9.97197197568801e-05, "loss": 2.1093, "step": 161000 }, { "epoch": 0.09, "grad_norm": 4.731649398803711, "learning_rate": 9.971879167660486e-05, "loss": 2.1178, "step": 161500 }, { "epoch": 0.09, "grad_norm": 4.586087226867676, "learning_rate": 9.971786359632963e-05, "loss": 2.1154, "step": 162000 }, { "epoch": 0.09, "grad_norm": 4.602361679077148, "learning_rate": 9.97169355160544e-05, "loss": 2.0925, "step": 162500 }, { "epoch": 0.09, "grad_norm": 5.405307292938232, "learning_rate": 9.971600743577917e-05, "loss": 2.1131, "step": 163000 }, { "epoch": 0.09, "grad_norm": 5.402783393859863, "learning_rate": 9.971507935550394e-05, "loss": 2.1112, "step": 163500 }, { "epoch": 0.09, "grad_norm": 3.7548913955688477, "learning_rate": 9.97141512752287e-05, "loss": 2.0885, "step": 164000 }, { "epoch": 0.09, "grad_norm": 5.215375900268555, "learning_rate": 9.971322319495347e-05, "loss": 2.1016, "step": 164500 }, { "epoch": 0.09, "grad_norm": 5.195767402648926, "learning_rate": 9.971229511467824e-05, "loss": 2.0821, "step": 165000 }, { "epoch": 0.09, "grad_norm": 5.254913806915283, "learning_rate": 9.971136703440301e-05, "loss": 2.0876, "step": 165500 }, { "epoch": 0.09, "grad_norm": 4.161681652069092, "learning_rate": 9.971043895412779e-05, "loss": 2.0771, "step": 166000 }, { "epoch": 0.09, "grad_norm": 7.586195468902588, "learning_rate": 9.970951087385256e-05, "loss": 2.0893, "step": 166500 }, { "epoch": 0.09, "grad_norm": 4.742598533630371, "learning_rate": 9.970858279357731e-05, "loss": 2.1043, "step": 167000 }, { "epoch": 0.09, "grad_norm": 7.583818435668945, "learning_rate": 9.970765471330209e-05, "loss": 2.0643, "step": 167500 }, { "epoch": 0.09, "grad_norm": 15.07123851776123, "learning_rate": 9.970672663302686e-05, "loss": 2.0719, "step": 168000 }, { "epoch": 0.09, "grad_norm": 4.9100565910339355, "learning_rate": 9.970579855275163e-05, "loss": 2.0645, "step": 168500 }, { "epoch": 0.09, "grad_norm": 4.305502891540527, "learning_rate": 9.97048704724764e-05, "loss": 2.0661, "step": 169000 }, { "epoch": 0.09, "grad_norm": 4.631031513214111, "learning_rate": 9.970394239220116e-05, "loss": 2.0729, "step": 169500 }, { "epoch": 0.09, "grad_norm": 4.102370738983154, "learning_rate": 9.970301431192593e-05, "loss": 2.0621, "step": 170000 }, { "epoch": 0.09, "grad_norm": 9.088913917541504, "learning_rate": 9.970208623165069e-05, "loss": 2.0566, "step": 170500 }, { "epoch": 0.1, "grad_norm": 3.1607444286346436, "learning_rate": 9.970115815137546e-05, "loss": 2.053, "step": 171000 }, { "epoch": 0.1, "grad_norm": 5.192753791809082, "learning_rate": 9.970023007110023e-05, "loss": 2.0443, "step": 171500 }, { "epoch": 0.1, "grad_norm": 5.527139186859131, "learning_rate": 9.9699301990825e-05, "loss": 2.0356, "step": 172000 }, { "epoch": 0.1, "grad_norm": 4.856675148010254, "learning_rate": 9.969837391054977e-05, "loss": 2.0405, "step": 172500 }, { "epoch": 0.1, "grad_norm": 6.9087724685668945, "learning_rate": 9.969744583027454e-05, "loss": 2.0544, "step": 173000 }, { "epoch": 0.1, "grad_norm": 5.108808517456055, "learning_rate": 9.96965177499993e-05, "loss": 2.0277, "step": 173500 }, { "epoch": 0.1, "grad_norm": 3.581963300704956, "learning_rate": 9.969558966972407e-05, "loss": 2.0569, "step": 174000 }, { "epoch": 0.1, "grad_norm": 2.8868958950042725, "learning_rate": 9.969466158944885e-05, "loss": 2.0569, "step": 174500 }, { "epoch": 0.1, "grad_norm": 4.1303510665893555, "learning_rate": 9.969373350917362e-05, "loss": 2.0378, "step": 175000 }, { "epoch": 0.1, "grad_norm": 3.9498367309570312, "learning_rate": 9.969280542889839e-05, "loss": 2.0487, "step": 175500 }, { "epoch": 0.1, "grad_norm": 4.58161735534668, "learning_rate": 9.969187734862316e-05, "loss": 2.0113, "step": 176000 }, { "epoch": 0.1, "grad_norm": 4.350900650024414, "learning_rate": 9.969094926834792e-05, "loss": 2.0371, "step": 176500 }, { "epoch": 0.1, "grad_norm": 4.318021297454834, "learning_rate": 9.969002118807269e-05, "loss": 2.0379, "step": 177000 }, { "epoch": 0.1, "grad_norm": 5.780760288238525, "learning_rate": 9.968909310779746e-05, "loss": 2.0117, "step": 177500 }, { "epoch": 0.1, "grad_norm": 4.759991645812988, "learning_rate": 9.968816502752223e-05, "loss": 2.034, "step": 178000 }, { "epoch": 0.1, "grad_norm": 3.5731894969940186, "learning_rate": 9.9687236947247e-05, "loss": 2.0386, "step": 178500 }, { "epoch": 0.1, "grad_norm": 5.845817565917969, "learning_rate": 9.968630886697176e-05, "loss": 2.0078, "step": 179000 }, { "epoch": 0.1, "grad_norm": 4.691517353057861, "learning_rate": 9.968538078669653e-05, "loss": 2.0223, "step": 179500 }, { "epoch": 0.1, "grad_norm": 3.678426742553711, "learning_rate": 9.968445270642129e-05, "loss": 2.0235, "step": 180000 }, { "epoch": 0.1, "grad_norm": 4.184782981872559, "learning_rate": 9.968352462614606e-05, "loss": 2.0036, "step": 180500 }, { "epoch": 0.1, "grad_norm": 5.912731647491455, "learning_rate": 9.968259654587083e-05, "loss": 2.016, "step": 181000 }, { "epoch": 0.1, "grad_norm": 5.004282474517822, "learning_rate": 9.96816684655956e-05, "loss": 1.9935, "step": 181500 }, { "epoch": 0.1, "grad_norm": 4.136642932891846, "learning_rate": 9.968074038532038e-05, "loss": 2.0201, "step": 182000 }, { "epoch": 0.1, "grad_norm": 4.304795742034912, "learning_rate": 9.967981230504513e-05, "loss": 2.0064, "step": 182500 }, { "epoch": 0.1, "grad_norm": 4.177530765533447, "learning_rate": 9.96788842247699e-05, "loss": 2.0087, "step": 183000 }, { "epoch": 0.1, "grad_norm": 3.730872869491577, "learning_rate": 9.967795614449468e-05, "loss": 2.0008, "step": 183500 }, { "epoch": 0.1, "grad_norm": 4.738286972045898, "learning_rate": 9.967702806421945e-05, "loss": 1.9961, "step": 184000 }, { "epoch": 0.1, "grad_norm": 4.57652473449707, "learning_rate": 9.967609998394422e-05, "loss": 1.9861, "step": 184500 }, { "epoch": 0.1, "grad_norm": 3.8996493816375732, "learning_rate": 9.967517190366899e-05, "loss": 1.9821, "step": 185000 }, { "epoch": 0.1, "grad_norm": 5.269779205322266, "learning_rate": 9.967424382339375e-05, "loss": 1.9761, "step": 185500 }, { "epoch": 0.1, "grad_norm": 3.3818140029907227, "learning_rate": 9.967331574311852e-05, "loss": 1.9679, "step": 186000 }, { "epoch": 0.1, "grad_norm": 3.7173426151275635, "learning_rate": 9.967238766284329e-05, "loss": 1.9992, "step": 186500 }, { "epoch": 0.1, "grad_norm": 3.1090087890625, "learning_rate": 9.967145958256806e-05, "loss": 1.9782, "step": 187000 }, { "epoch": 0.1, "grad_norm": 4.101827621459961, "learning_rate": 9.967053150229283e-05, "loss": 1.9704, "step": 187500 }, { "epoch": 0.1, "grad_norm": 3.682985782623291, "learning_rate": 9.96696034220176e-05, "loss": 2.0032, "step": 188000 }, { "epoch": 0.1, "grad_norm": 19.02726173400879, "learning_rate": 9.966867534174236e-05, "loss": 1.9946, "step": 188500 }, { "epoch": 0.11, "grad_norm": 4.69189453125, "learning_rate": 9.966774726146713e-05, "loss": 1.9245, "step": 189000 }, { "epoch": 0.11, "grad_norm": 12.00914192199707, "learning_rate": 9.96668191811919e-05, "loss": 1.9725, "step": 189500 }, { "epoch": 0.11, "grad_norm": 6.565895080566406, "learning_rate": 9.966589110091668e-05, "loss": 1.982, "step": 190000 }, { "epoch": 0.11, "grad_norm": 4.286581993103027, "learning_rate": 9.966496302064143e-05, "loss": 1.9978, "step": 190500 }, { "epoch": 0.11, "grad_norm": 5.0877509117126465, "learning_rate": 9.96640349403662e-05, "loss": 1.9735, "step": 191000 }, { "epoch": 0.11, "grad_norm": 4.9631242752075195, "learning_rate": 9.966310686009098e-05, "loss": 1.9508, "step": 191500 }, { "epoch": 0.11, "grad_norm": 4.4933552742004395, "learning_rate": 9.966217877981574e-05, "loss": 1.962, "step": 192000 }, { "epoch": 0.11, "grad_norm": 4.3112874031066895, "learning_rate": 9.966125069954051e-05, "loss": 1.9566, "step": 192500 }, { "epoch": 0.11, "grad_norm": 4.248539924621582, "learning_rate": 9.966032261926528e-05, "loss": 1.9379, "step": 193000 }, { "epoch": 0.11, "grad_norm": 3.26794171333313, "learning_rate": 9.965939453899005e-05, "loss": 1.9297, "step": 193500 }, { "epoch": 0.11, "grad_norm": 3.2197368144989014, "learning_rate": 9.965846645871482e-05, "loss": 1.9494, "step": 194000 }, { "epoch": 0.11, "grad_norm": 4.919404983520508, "learning_rate": 9.965753837843958e-05, "loss": 1.9529, "step": 194500 }, { "epoch": 0.11, "grad_norm": 4.171731948852539, "learning_rate": 9.965661029816435e-05, "loss": 1.9323, "step": 195000 }, { "epoch": 0.11, "grad_norm": 4.448825836181641, "learning_rate": 9.965568221788912e-05, "loss": 1.9484, "step": 195500 }, { "epoch": 0.11, "grad_norm": 5.62270975112915, "learning_rate": 9.965475413761389e-05, "loss": 1.9269, "step": 196000 }, { "epoch": 0.11, "grad_norm": 6.439905166625977, "learning_rate": 9.965382605733866e-05, "loss": 1.9193, "step": 196500 }, { "epoch": 0.11, "grad_norm": 4.133171081542969, "learning_rate": 9.965289797706344e-05, "loss": 1.9307, "step": 197000 }, { "epoch": 0.11, "grad_norm": 5.984546184539795, "learning_rate": 9.96519698967882e-05, "loss": 1.9042, "step": 197500 }, { "epoch": 0.11, "grad_norm": 4.515778064727783, "learning_rate": 9.965104181651297e-05, "loss": 1.9541, "step": 198000 }, { "epoch": 0.11, "grad_norm": 3.409749984741211, "learning_rate": 9.965011373623774e-05, "loss": 1.9203, "step": 198500 }, { "epoch": 0.11, "grad_norm": 3.724917411804199, "learning_rate": 9.964918565596251e-05, "loss": 1.9262, "step": 199000 }, { "epoch": 0.11, "grad_norm": 3.5863378047943115, "learning_rate": 9.964825757568728e-05, "loss": 1.9383, "step": 199500 }, { "epoch": 0.11, "grad_norm": 6.107095718383789, "learning_rate": 9.964732949541205e-05, "loss": 1.9263, "step": 200000 }, { "epoch": 0.11, "grad_norm": 3.91813588142395, "learning_rate": 9.964640141513681e-05, "loss": 1.9217, "step": 200500 }, { "epoch": 0.11, "grad_norm": 4.080438137054443, "learning_rate": 9.964547333486157e-05, "loss": 1.9089, "step": 201000 }, { "epoch": 0.11, "grad_norm": 3.550732374191284, "learning_rate": 9.964454525458634e-05, "loss": 1.9029, "step": 201500 }, { "epoch": 0.11, "grad_norm": 6.719958305358887, "learning_rate": 9.964361717431111e-05, "loss": 1.9201, "step": 202000 }, { "epoch": 0.11, "grad_norm": 4.790652751922607, "learning_rate": 9.964268909403588e-05, "loss": 1.8942, "step": 202500 }, { "epoch": 0.11, "grad_norm": 5.303153991699219, "learning_rate": 9.964176101376065e-05, "loss": 1.8945, "step": 203000 }, { "epoch": 0.11, "grad_norm": 6.912900924682617, "learning_rate": 9.964083293348542e-05, "loss": 1.9146, "step": 203500 }, { "epoch": 0.11, "grad_norm": 4.400740623474121, "learning_rate": 9.963990485321018e-05, "loss": 1.8796, "step": 204000 }, { "epoch": 0.11, "grad_norm": 5.205254554748535, "learning_rate": 9.963897677293495e-05, "loss": 1.8949, "step": 204500 }, { "epoch": 0.11, "grad_norm": 5.543479919433594, "learning_rate": 9.963804869265972e-05, "loss": 1.8846, "step": 205000 }, { "epoch": 0.11, "grad_norm": 8.72082233428955, "learning_rate": 9.96371206123845e-05, "loss": 1.8709, "step": 205500 }, { "epoch": 0.11, "grad_norm": 6.174784183502197, "learning_rate": 9.963619253210927e-05, "loss": 1.9181, "step": 206000 }, { "epoch": 0.11, "grad_norm": 5.813675880432129, "learning_rate": 9.963526445183404e-05, "loss": 1.8797, "step": 206500 }, { "epoch": 0.12, "grad_norm": 5.238151550292969, "learning_rate": 9.96343363715588e-05, "loss": 1.8747, "step": 207000 }, { "epoch": 0.12, "grad_norm": 3.6273601055145264, "learning_rate": 9.963340829128357e-05, "loss": 1.8942, "step": 207500 }, { "epoch": 0.12, "grad_norm": 6.024681091308594, "learning_rate": 9.963248021100834e-05, "loss": 1.8974, "step": 208000 }, { "epoch": 0.12, "grad_norm": 6.6017746925354, "learning_rate": 9.963155213073311e-05, "loss": 1.8878, "step": 208500 }, { "epoch": 0.12, "grad_norm": 4.99990177154541, "learning_rate": 9.963062405045788e-05, "loss": 1.8741, "step": 209000 }, { "epoch": 0.12, "grad_norm": 4.064300060272217, "learning_rate": 9.962969597018264e-05, "loss": 1.8638, "step": 209500 }, { "epoch": 0.12, "grad_norm": 4.6583685874938965, "learning_rate": 9.962876788990741e-05, "loss": 1.8723, "step": 210000 }, { "epoch": 0.12, "grad_norm": 4.108057498931885, "learning_rate": 9.962783980963218e-05, "loss": 1.8643, "step": 210500 }, { "epoch": 0.12, "grad_norm": 4.108830451965332, "learning_rate": 9.962691172935694e-05, "loss": 1.866, "step": 211000 }, { "epoch": 0.12, "grad_norm": 4.949222564697266, "learning_rate": 9.962598364908171e-05, "loss": 1.8723, "step": 211500 }, { "epoch": 0.12, "grad_norm": 3.929126501083374, "learning_rate": 9.962505556880648e-05, "loss": 1.85, "step": 212000 }, { "epoch": 0.12, "grad_norm": 4.211127281188965, "learning_rate": 9.962412748853125e-05, "loss": 1.8463, "step": 212500 }, { "epoch": 0.12, "grad_norm": 4.776076316833496, "learning_rate": 9.962319940825601e-05, "loss": 1.869, "step": 213000 }, { "epoch": 0.12, "grad_norm": 4.429445743560791, "learning_rate": 9.962227132798078e-05, "loss": 1.8667, "step": 213500 }, { "epoch": 0.12, "grad_norm": 3.9247748851776123, "learning_rate": 9.962134324770555e-05, "loss": 1.8632, "step": 214000 }, { "epoch": 0.12, "grad_norm": 4.082817554473877, "learning_rate": 9.962041516743033e-05, "loss": 1.8438, "step": 214500 }, { "epoch": 0.12, "grad_norm": 5.592115879058838, "learning_rate": 9.96194870871551e-05, "loss": 1.8395, "step": 215000 }, { "epoch": 0.12, "grad_norm": 3.4248578548431396, "learning_rate": 9.961855900687987e-05, "loss": 1.8423, "step": 215500 }, { "epoch": 0.12, "grad_norm": 3.3221466541290283, "learning_rate": 9.961763092660463e-05, "loss": 1.8258, "step": 216000 }, { "epoch": 0.12, "grad_norm": 3.8490660190582275, "learning_rate": 9.96167028463294e-05, "loss": 1.8574, "step": 216500 }, { "epoch": 0.12, "grad_norm": 5.5256547927856445, "learning_rate": 9.961577476605417e-05, "loss": 1.8312, "step": 217000 }, { "epoch": 0.12, "grad_norm": 3.6203320026397705, "learning_rate": 9.961484668577894e-05, "loss": 1.8483, "step": 217500 }, { "epoch": 0.12, "grad_norm": 4.3675856590271, "learning_rate": 9.961391860550371e-05, "loss": 1.8526, "step": 218000 }, { "epoch": 0.12, "grad_norm": 4.40755558013916, "learning_rate": 9.961299052522848e-05, "loss": 1.8617, "step": 218500 }, { "epoch": 0.12, "grad_norm": 12.9111967086792, "learning_rate": 9.961206244495324e-05, "loss": 1.8488, "step": 219000 }, { "epoch": 0.12, "grad_norm": 4.0077009201049805, "learning_rate": 9.961113436467801e-05, "loss": 1.8336, "step": 219500 }, { "epoch": 0.12, "grad_norm": 16.747453689575195, "learning_rate": 9.961020628440278e-05, "loss": 1.8535, "step": 220000 }, { "epoch": 0.12, "grad_norm": 5.349842071533203, "learning_rate": 9.960927820412756e-05, "loss": 1.8272, "step": 220500 }, { "epoch": 0.12, "grad_norm": 4.0585808753967285, "learning_rate": 9.960835012385231e-05, "loss": 1.8361, "step": 221000 }, { "epoch": 0.12, "grad_norm": 10.322525978088379, "learning_rate": 9.960742204357709e-05, "loss": 1.8171, "step": 221500 }, { "epoch": 0.12, "grad_norm": 3.9094197750091553, "learning_rate": 9.960649396330186e-05, "loss": 1.8434, "step": 222000 }, { "epoch": 0.12, "grad_norm": 4.250068187713623, "learning_rate": 9.960556588302661e-05, "loss": 1.8312, "step": 222500 }, { "epoch": 0.12, "grad_norm": 3.535179853439331, "learning_rate": 9.960463780275139e-05, "loss": 1.8263, "step": 223000 }, { "epoch": 0.12, "grad_norm": 3.5814437866210938, "learning_rate": 9.960370972247616e-05, "loss": 1.833, "step": 223500 }, { "epoch": 0.12, "grad_norm": 5.914824962615967, "learning_rate": 9.960278164220093e-05, "loss": 1.8196, "step": 224000 }, { "epoch": 0.12, "grad_norm": 3.966111660003662, "learning_rate": 9.96018535619257e-05, "loss": 1.8219, "step": 224500 }, { "epoch": 0.13, "grad_norm": 3.899972915649414, "learning_rate": 9.960092548165046e-05, "loss": 1.8173, "step": 225000 }, { "epoch": 0.13, "grad_norm": 3.0840604305267334, "learning_rate": 9.959999740137523e-05, "loss": 1.8036, "step": 225500 }, { "epoch": 0.13, "grad_norm": 4.453856468200684, "learning_rate": 9.95990693211e-05, "loss": 1.839, "step": 226000 }, { "epoch": 0.13, "grad_norm": 5.8350443840026855, "learning_rate": 9.959814124082477e-05, "loss": 1.7956, "step": 226500 }, { "epoch": 0.13, "grad_norm": 3.109984874725342, "learning_rate": 9.959721316054954e-05, "loss": 1.8198, "step": 227000 }, { "epoch": 0.13, "grad_norm": 5.034292221069336, "learning_rate": 9.959628508027431e-05, "loss": 1.7978, "step": 227500 }, { "epoch": 0.13, "grad_norm": 5.237682819366455, "learning_rate": 9.959535699999907e-05, "loss": 1.8309, "step": 228000 }, { "epoch": 0.13, "grad_norm": 3.0368969440460205, "learning_rate": 9.959442891972384e-05, "loss": 1.8012, "step": 228500 }, { "epoch": 0.13, "grad_norm": 3.3586440086364746, "learning_rate": 9.959350083944862e-05, "loss": 1.7965, "step": 229000 }, { "epoch": 0.13, "grad_norm": 4.194300651550293, "learning_rate": 9.959257275917339e-05, "loss": 1.8158, "step": 229500 }, { "epoch": 0.13, "grad_norm": 5.295820713043213, "learning_rate": 9.959164467889816e-05, "loss": 1.8035, "step": 230000 }, { "epoch": 0.13, "grad_norm": 3.1233255863189697, "learning_rate": 9.959071659862293e-05, "loss": 1.8076, "step": 230500 }, { "epoch": 0.13, "grad_norm": 4.108482837677002, "learning_rate": 9.958978851834769e-05, "loss": 1.8169, "step": 231000 }, { "epoch": 0.13, "grad_norm": 3.3223822116851807, "learning_rate": 9.958886043807245e-05, "loss": 1.7914, "step": 231500 }, { "epoch": 0.13, "grad_norm": 3.5875344276428223, "learning_rate": 9.958793235779722e-05, "loss": 1.8124, "step": 232000 }, { "epoch": 0.13, "grad_norm": 3.7725822925567627, "learning_rate": 9.958700427752199e-05, "loss": 1.7907, "step": 232500 }, { "epoch": 0.13, "grad_norm": 3.5657663345336914, "learning_rate": 9.958607619724676e-05, "loss": 1.795, "step": 233000 }, { "epoch": 0.13, "grad_norm": 4.3107404708862305, "learning_rate": 9.958514811697153e-05, "loss": 1.7687, "step": 233500 }, { "epoch": 0.13, "grad_norm": 4.887343883514404, "learning_rate": 9.95842200366963e-05, "loss": 1.7831, "step": 234000 }, { "epoch": 0.13, "grad_norm": 3.7076539993286133, "learning_rate": 9.958329195642106e-05, "loss": 1.7842, "step": 234500 }, { "epoch": 0.13, "grad_norm": 3.3431081771850586, "learning_rate": 9.958236387614583e-05, "loss": 1.797, "step": 235000 }, { "epoch": 0.13, "grad_norm": 2.8526740074157715, "learning_rate": 9.95814357958706e-05, "loss": 1.7879, "step": 235500 }, { "epoch": 0.13, "grad_norm": 4.57257604598999, "learning_rate": 9.958050771559537e-05, "loss": 1.7924, "step": 236000 }, { "epoch": 0.13, "grad_norm": 8.872370719909668, "learning_rate": 9.957957963532015e-05, "loss": 1.796, "step": 236500 }, { "epoch": 0.13, "grad_norm": 3.647901773452759, "learning_rate": 9.957865155504492e-05, "loss": 1.7801, "step": 237000 } ], "logging_steps": 500, "max_steps": 53884650, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 10, "total_flos": 1.6575934896869376e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }