{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.026143702519938035, "eval_steps": 73, "global_step": 3869, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.270200174029405, "learning_rate": 2e-07, "loss": 1.5886, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.8586688041687012, "eval_runtime": 4.4089, "eval_samples_per_second": 2.041, "eval_steps_per_second": 1.134, "step": 1 }, { "epoch": 0.0, "grad_norm": 7.088138615023864, "learning_rate": 4e-07, "loss": 1.4809, "step": 2 }, { "epoch": 0.0, "grad_norm": 7.409449066839206, "learning_rate": 6e-07, "loss": 1.6585, "step": 3 }, { "epoch": 0.0, "grad_norm": 9.366466325593624, "learning_rate": 8e-07, "loss": 1.6965, "step": 4 }, { "epoch": 0.0, "grad_norm": 7.247338916406195, "learning_rate": 1e-06, "loss": 1.532, "step": 5 }, { "epoch": 0.0, "grad_norm": 8.049902060629968, "learning_rate": 1.2e-06, "loss": 1.596, "step": 6 }, { "epoch": 0.0, "grad_norm": 7.1820303407795745, "learning_rate": 1.4e-06, "loss": 1.5625, "step": 7 }, { "epoch": 0.0, "grad_norm": 6.7480024222126165, "learning_rate": 1.6e-06, "loss": 1.4882, "step": 8 }, { "epoch": 0.0, "grad_norm": 6.09119480827286, "learning_rate": 1.8e-06, "loss": 1.5345, "step": 9 }, { "epoch": 0.0, "grad_norm": 5.701895875897247, "learning_rate": 2e-06, "loss": 1.5403, "step": 10 }, { "epoch": 0.0, "grad_norm": 12.444180689164233, "learning_rate": 1.9999999999859167e-06, "loss": 1.7593, "step": 11 }, { "epoch": 0.0, "grad_norm": 6.671206404255702, "learning_rate": 1.9999999999436667e-06, "loss": 1.5666, "step": 12 }, { "epoch": 0.0, "grad_norm": 5.879938934255732, "learning_rate": 1.99999999987325e-06, "loss": 1.5774, "step": 13 }, { "epoch": 0.0, "grad_norm": 5.598165698155055, "learning_rate": 1.9999999997746663e-06, "loss": 1.3795, "step": 14 }, { "epoch": 0.0, "grad_norm": 5.241997738434293, "learning_rate": 1.9999999996479163e-06, "loss": 1.5449, "step": 15 }, { "epoch": 0.0, "grad_norm": 9.92151785969858, "learning_rate": 1.999999999493e-06, "loss": 1.6951, "step": 16 }, { "epoch": 0.0, "grad_norm": 5.539525947721768, "learning_rate": 1.9999999993099162e-06, "loss": 1.5409, "step": 17 }, { "epoch": 0.0, "grad_norm": 5.351267308329018, "learning_rate": 1.9999999990986662e-06, "loss": 1.426, "step": 18 }, { "epoch": 0.0, "grad_norm": 5.705234009184205, "learning_rate": 1.99999999885925e-06, "loss": 1.5517, "step": 19 }, { "epoch": 0.0, "grad_norm": 5.5021957856088175, "learning_rate": 1.999999998591666e-06, "loss": 1.6403, "step": 20 }, { "epoch": 0.0, "grad_norm": 5.230793498668246, "learning_rate": 1.999999998295916e-06, "loss": 1.3916, "step": 21 }, { "epoch": 0.0, "grad_norm": 7.632016493197849, "learning_rate": 1.9999999979719993e-06, "loss": 1.5029, "step": 22 }, { "epoch": 0.0, "grad_norm": 5.224094987664232, "learning_rate": 1.999999997619916e-06, "loss": 1.5172, "step": 23 }, { "epoch": 0.0, "grad_norm": 4.9052975256388995, "learning_rate": 1.9999999972396656e-06, "loss": 1.2809, "step": 24 }, { "epoch": 0.0, "grad_norm": 5.749261607817126, "learning_rate": 1.9999999968312492e-06, "loss": 1.682, "step": 25 }, { "epoch": 0.0, "grad_norm": 5.255433017333782, "learning_rate": 1.9999999963946656e-06, "loss": 1.5094, "step": 26 }, { "epoch": 0.0, "grad_norm": 5.093790043356135, "learning_rate": 1.9999999959299155e-06, "loss": 1.4338, "step": 27 }, { "epoch": 0.0, "grad_norm": 6.001582030581129, "learning_rate": 1.9999999954369987e-06, "loss": 1.5456, "step": 28 }, { "epoch": 0.0, "grad_norm": 8.863618311056822, "learning_rate": 1.999999994915915e-06, "loss": 1.5252, "step": 29 }, { "epoch": 0.0, "grad_norm": 5.926702449626208, "learning_rate": 1.9999999943666654e-06, "loss": 1.5579, "step": 30 }, { "epoch": 0.0, "grad_norm": 5.798968821403948, "learning_rate": 1.9999999937892486e-06, "loss": 1.5616, "step": 31 }, { "epoch": 0.0, "grad_norm": 5.306082122633555, "learning_rate": 1.999999993183665e-06, "loss": 1.6885, "step": 32 }, { "epoch": 0.0, "grad_norm": 7.324360862135967, "learning_rate": 1.999999992549915e-06, "loss": 1.6009, "step": 33 }, { "epoch": 0.0, "grad_norm": 5.712084687468965, "learning_rate": 1.9999999918879976e-06, "loss": 1.5022, "step": 34 }, { "epoch": 0.0, "grad_norm": 5.562761534033398, "learning_rate": 1.9999999911979143e-06, "loss": 1.7051, "step": 35 }, { "epoch": 0.0, "grad_norm": 11.4469400814171, "learning_rate": 1.9999999904796643e-06, "loss": 1.5599, "step": 36 }, { "epoch": 0.0, "grad_norm": 5.361519499368759, "learning_rate": 1.9999999897332474e-06, "loss": 1.5086, "step": 37 }, { "epoch": 0.0, "grad_norm": 4.9815078648545965, "learning_rate": 1.9999999889586637e-06, "loss": 1.5386, "step": 38 }, { "epoch": 0.0, "grad_norm": 5.162059963508055, "learning_rate": 1.9999999881559136e-06, "loss": 1.4769, "step": 39 }, { "epoch": 0.0, "grad_norm": 5.122293609109329, "learning_rate": 1.9999999873249968e-06, "loss": 1.561, "step": 40 }, { "epoch": 0.0, "grad_norm": 5.152152754881128, "learning_rate": 1.999999986465913e-06, "loss": 1.5329, "step": 41 }, { "epoch": 0.0, "grad_norm": 4.991233383631504, "learning_rate": 1.999999985578663e-06, "loss": 1.6348, "step": 42 }, { "epoch": 0.0, "grad_norm": 5.1180827908562545, "learning_rate": 1.999999984663246e-06, "loss": 1.5922, "step": 43 }, { "epoch": 0.0, "grad_norm": 5.447368695015986, "learning_rate": 1.9999999837196624e-06, "loss": 1.3119, "step": 44 }, { "epoch": 0.0, "grad_norm": 5.058434450512851, "learning_rate": 1.9999999827479124e-06, "loss": 1.4098, "step": 45 }, { "epoch": 0.0, "grad_norm": 6.792299679667143, "learning_rate": 1.9999999817479955e-06, "loss": 1.427, "step": 46 }, { "epoch": 0.0, "grad_norm": 6.86507140594805, "learning_rate": 1.9999999807199118e-06, "loss": 1.6528, "step": 47 }, { "epoch": 0.0, "grad_norm": 5.121726652742493, "learning_rate": 1.9999999796636617e-06, "loss": 1.4907, "step": 48 }, { "epoch": 0.0, "grad_norm": 4.897225770537551, "learning_rate": 1.9999999785792444e-06, "loss": 1.5129, "step": 49 }, { "epoch": 0.0, "grad_norm": 5.447285160025109, "learning_rate": 1.999999977466661e-06, "loss": 1.4732, "step": 50 }, { "epoch": 0.0, "grad_norm": 6.435992424750984, "learning_rate": 1.9999999763259106e-06, "loss": 1.5531, "step": 51 }, { "epoch": 0.0, "grad_norm": 5.126344987263065, "learning_rate": 1.9999999751569937e-06, "loss": 1.4008, "step": 52 }, { "epoch": 0.0, "grad_norm": 6.003394959358567, "learning_rate": 1.99999997395991e-06, "loss": 1.5076, "step": 53 }, { "epoch": 0.0, "grad_norm": 5.375009945600638, "learning_rate": 1.99999997273466e-06, "loss": 1.3866, "step": 54 }, { "epoch": 0.0, "grad_norm": 4.849278700300698, "learning_rate": 1.999999971481243e-06, "loss": 1.427, "step": 55 }, { "epoch": 0.0, "grad_norm": 5.532239141247253, "learning_rate": 1.9999999701996592e-06, "loss": 1.6364, "step": 56 }, { "epoch": 0.0, "grad_norm": 5.438678986247806, "learning_rate": 1.999999968889909e-06, "loss": 1.4648, "step": 57 }, { "epoch": 0.0, "grad_norm": 5.393102110640265, "learning_rate": 1.9999999675519918e-06, "loss": 1.5226, "step": 58 }, { "epoch": 0.0, "grad_norm": 5.602009023037625, "learning_rate": 1.999999966185908e-06, "loss": 1.745, "step": 59 }, { "epoch": 0.0, "grad_norm": 5.30661712292468, "learning_rate": 1.999999964791658e-06, "loss": 1.5059, "step": 60 }, { "epoch": 0.0, "grad_norm": 5.15102423620726, "learning_rate": 1.999999963369241e-06, "loss": 1.3974, "step": 61 }, { "epoch": 0.0, "grad_norm": 5.870501279592248, "learning_rate": 1.9999999619186573e-06, "loss": 1.5026, "step": 62 }, { "epoch": 0.0, "grad_norm": 5.270272741292025, "learning_rate": 1.9999999604399067e-06, "loss": 1.6544, "step": 63 }, { "epoch": 0.0, "grad_norm": 4.886919913741941, "learning_rate": 1.99999995893299e-06, "loss": 1.5181, "step": 64 }, { "epoch": 0.0, "grad_norm": 7.05398621579166, "learning_rate": 1.999999957397906e-06, "loss": 1.5616, "step": 65 }, { "epoch": 0.0, "grad_norm": 4.836134104240407, "learning_rate": 1.9999999558346555e-06, "loss": 1.54, "step": 66 }, { "epoch": 0.0, "grad_norm": 5.082059874604205, "learning_rate": 1.9999999542432386e-06, "loss": 1.5239, "step": 67 }, { "epoch": 0.0, "grad_norm": 4.874373987895782, "learning_rate": 1.999999952623655e-06, "loss": 1.5878, "step": 68 }, { "epoch": 0.0, "grad_norm": 4.946163953606462, "learning_rate": 1.9999999509759043e-06, "loss": 1.4434, "step": 69 }, { "epoch": 0.0, "grad_norm": 4.876405244918954, "learning_rate": 1.9999999492999873e-06, "loss": 1.4608, "step": 70 }, { "epoch": 0.0, "grad_norm": 5.5778213104294005, "learning_rate": 1.9999999475959036e-06, "loss": 1.5291, "step": 71 }, { "epoch": 0.0, "grad_norm": 4.99889754273985, "learning_rate": 1.9999999458636535e-06, "loss": 1.3987, "step": 72 }, { "epoch": 0.0, "grad_norm": 5.130434143656013, "learning_rate": 1.999999944103236e-06, "loss": 1.6093, "step": 73 }, { "epoch": 0.0, "eval_loss": 1.7611451148986816, "eval_runtime": 4.6217, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 73 }, { "epoch": 0.0, "grad_norm": 5.212510949695165, "learning_rate": 1.9999999423146523e-06, "loss": 1.7044, "step": 74 }, { "epoch": 0.0, "grad_norm": 5.2823971109355705, "learning_rate": 1.999999940497902e-06, "loss": 1.7013, "step": 75 }, { "epoch": 0.0, "grad_norm": 5.045619546308826, "learning_rate": 1.999999938652985e-06, "loss": 1.5755, "step": 76 }, { "epoch": 0.0, "grad_norm": 5.87457170524847, "learning_rate": 1.999999936779901e-06, "loss": 1.6232, "step": 77 }, { "epoch": 0.0, "grad_norm": 6.082677620793644, "learning_rate": 1.999999934878651e-06, "loss": 1.6031, "step": 78 }, { "epoch": 0.0, "grad_norm": 5.872910823814863, "learning_rate": 1.9999999329492335e-06, "loss": 1.5715, "step": 79 }, { "epoch": 0.0, "grad_norm": 5.244165093478454, "learning_rate": 1.9999999309916497e-06, "loss": 1.3898, "step": 80 }, { "epoch": 0.0, "grad_norm": 5.113300123505611, "learning_rate": 1.999999929005899e-06, "loss": 1.5425, "step": 81 }, { "epoch": 0.0, "grad_norm": 6.028851212735335, "learning_rate": 1.999999926991982e-06, "loss": 1.5812, "step": 82 }, { "epoch": 0.0, "grad_norm": 5.11348885382359, "learning_rate": 1.9999999249498984e-06, "loss": 1.4328, "step": 83 }, { "epoch": 0.0, "grad_norm": 5.535550818095342, "learning_rate": 1.9999999228796482e-06, "loss": 1.6637, "step": 84 }, { "epoch": 0.0, "grad_norm": 7.007537107122481, "learning_rate": 1.999999920781231e-06, "loss": 1.5714, "step": 85 }, { "epoch": 0.0, "grad_norm": 6.267096315507558, "learning_rate": 1.9999999186546466e-06, "loss": 1.5447, "step": 86 }, { "epoch": 0.0, "grad_norm": 5.796533721972671, "learning_rate": 1.9999999164998964e-06, "loss": 1.5971, "step": 87 }, { "epoch": 0.0, "grad_norm": 6.228074731568103, "learning_rate": 1.999999914316979e-06, "loss": 1.7089, "step": 88 }, { "epoch": 0.0, "grad_norm": 5.7122834140340295, "learning_rate": 1.9999999121058956e-06, "loss": 1.2932, "step": 89 }, { "epoch": 0.0, "grad_norm": 6.2806919781906965, "learning_rate": 1.999999909866645e-06, "loss": 1.653, "step": 90 }, { "epoch": 0.0, "grad_norm": 5.368043140767265, "learning_rate": 1.9999999075992276e-06, "loss": 1.6132, "step": 91 }, { "epoch": 0.0, "grad_norm": 5.76243769319436, "learning_rate": 1.999999905303644e-06, "loss": 1.6384, "step": 92 }, { "epoch": 0.0, "grad_norm": 5.672061728664766, "learning_rate": 1.9999999029798932e-06, "loss": 1.4254, "step": 93 }, { "epoch": 0.0, "grad_norm": 5.002515171971739, "learning_rate": 1.9999999006279762e-06, "loss": 1.4739, "step": 94 }, { "epoch": 0.0, "grad_norm": 5.2077060992013084, "learning_rate": 1.9999998982478924e-06, "loss": 1.4923, "step": 95 }, { "epoch": 0.0, "grad_norm": 5.167170360061274, "learning_rate": 1.999999895839642e-06, "loss": 1.6129, "step": 96 }, { "epoch": 0.0, "grad_norm": 4.886352788159953, "learning_rate": 1.9999998934032244e-06, "loss": 1.4177, "step": 97 }, { "epoch": 0.0, "grad_norm": 5.408894422523236, "learning_rate": 1.999999890938641e-06, "loss": 1.6144, "step": 98 }, { "epoch": 0.0, "grad_norm": 5.490500856061367, "learning_rate": 1.9999998884458904e-06, "loss": 1.5951, "step": 99 }, { "epoch": 0.0, "grad_norm": 5.1356644085000065, "learning_rate": 1.9999998859249733e-06, "loss": 1.587, "step": 100 }, { "epoch": 0.0, "grad_norm": 14.038552559799147, "learning_rate": 1.999999883375889e-06, "loss": 1.2883, "step": 101 }, { "epoch": 0.0, "grad_norm": 5.226960512898749, "learning_rate": 1.999999880798639e-06, "loss": 1.593, "step": 102 }, { "epoch": 0.0, "grad_norm": 6.414801916717659, "learning_rate": 1.9999998781932214e-06, "loss": 1.444, "step": 103 }, { "epoch": 0.0, "grad_norm": 4.966168140745907, "learning_rate": 1.9999998755596376e-06, "loss": 1.4407, "step": 104 }, { "epoch": 0.0, "grad_norm": 6.159957368813785, "learning_rate": 1.999999872897887e-06, "loss": 1.465, "step": 105 }, { "epoch": 0.0, "grad_norm": 5.6281356469376655, "learning_rate": 1.9999998702079695e-06, "loss": 1.5183, "step": 106 }, { "epoch": 0.0, "grad_norm": 4.823355492437726, "learning_rate": 1.9999998674898857e-06, "loss": 1.3937, "step": 107 }, { "epoch": 0.0, "grad_norm": 5.900910155703639, "learning_rate": 1.999999864743635e-06, "loss": 1.5294, "step": 108 }, { "epoch": 0.0, "grad_norm": 5.025636152106037, "learning_rate": 1.999999861969218e-06, "loss": 1.6249, "step": 109 }, { "epoch": 0.0, "grad_norm": 5.116169125449381, "learning_rate": 1.999999859166634e-06, "loss": 1.4079, "step": 110 }, { "epoch": 0.0, "grad_norm": 4.778904603716487, "learning_rate": 1.999999856335883e-06, "loss": 1.5865, "step": 111 }, { "epoch": 0.0, "grad_norm": 5.756222368715884, "learning_rate": 1.999999853476966e-06, "loss": 1.7328, "step": 112 }, { "epoch": 0.0, "grad_norm": 5.412891077813045, "learning_rate": 1.9999998505898822e-06, "loss": 1.6096, "step": 113 }, { "epoch": 0.0, "grad_norm": 4.918651260002986, "learning_rate": 1.9999998476746316e-06, "loss": 1.4487, "step": 114 }, { "epoch": 0.0, "grad_norm": 5.5264820158788845, "learning_rate": 1.9999998447312145e-06, "loss": 1.5762, "step": 115 }, { "epoch": 0.0, "grad_norm": 4.9444017993692375, "learning_rate": 1.9999998417596302e-06, "loss": 1.6108, "step": 116 }, { "epoch": 0.0, "grad_norm": 5.3741051561343065, "learning_rate": 1.9999998387598796e-06, "loss": 1.4945, "step": 117 }, { "epoch": 0.0, "grad_norm": 5.086038907073674, "learning_rate": 1.9999998357319625e-06, "loss": 1.4727, "step": 118 }, { "epoch": 0.0, "grad_norm": 5.374970852866187, "learning_rate": 1.9999998326758787e-06, "loss": 1.5108, "step": 119 }, { "epoch": 0.0, "grad_norm": 5.046226292058247, "learning_rate": 1.999999829591628e-06, "loss": 1.547, "step": 120 }, { "epoch": 0.0, "grad_norm": 4.920252413086519, "learning_rate": 1.9999998264792105e-06, "loss": 1.5885, "step": 121 }, { "epoch": 0.0, "grad_norm": 4.9199662171825, "learning_rate": 1.9999998233386266e-06, "loss": 1.5461, "step": 122 }, { "epoch": 0.0, "grad_norm": 4.849224216338868, "learning_rate": 1.9999998201698764e-06, "loss": 1.3918, "step": 123 }, { "epoch": 0.0, "grad_norm": 5.122690985334921, "learning_rate": 1.999999816972959e-06, "loss": 1.7063, "step": 124 }, { "epoch": 0.0, "grad_norm": 5.4642737814450495, "learning_rate": 1.999999813747875e-06, "loss": 1.4925, "step": 125 }, { "epoch": 0.0, "grad_norm": 6.985722953022008, "learning_rate": 1.9999998104946243e-06, "loss": 1.6928, "step": 126 }, { "epoch": 0.0, "grad_norm": 9.95531201412864, "learning_rate": 1.999999807213207e-06, "loss": 1.3453, "step": 127 }, { "epoch": 0.0, "grad_norm": 4.9615775535071025, "learning_rate": 1.999999803903623e-06, "loss": 1.4873, "step": 128 }, { "epoch": 0.0, "grad_norm": 4.995275322635195, "learning_rate": 1.9999998005658727e-06, "loss": 1.4477, "step": 129 }, { "epoch": 0.0, "grad_norm": 5.042565222809337, "learning_rate": 1.999999797199955e-06, "loss": 1.4494, "step": 130 }, { "epoch": 0.0, "grad_norm": 6.123208053454304, "learning_rate": 1.999999793805871e-06, "loss": 1.6236, "step": 131 }, { "epoch": 0.0, "grad_norm": 5.125753408156544, "learning_rate": 1.9999997903836206e-06, "loss": 1.4867, "step": 132 }, { "epoch": 0.0, "grad_norm": 4.953051829249889, "learning_rate": 1.999999786933203e-06, "loss": 1.4471, "step": 133 }, { "epoch": 0.0, "grad_norm": 5.01487688139561, "learning_rate": 1.999999783454619e-06, "loss": 1.5202, "step": 134 }, { "epoch": 0.0, "grad_norm": 4.9862855831810355, "learning_rate": 1.9999997799478685e-06, "loss": 1.2726, "step": 135 }, { "epoch": 0.0, "grad_norm": 4.664158110626264, "learning_rate": 1.9999997764129514e-06, "loss": 1.453, "step": 136 }, { "epoch": 0.0, "grad_norm": 4.515072838378242, "learning_rate": 1.9999997728498675e-06, "loss": 1.4493, "step": 137 }, { "epoch": 0.0, "grad_norm": 4.638297190169102, "learning_rate": 1.9999997692586168e-06, "loss": 1.4888, "step": 138 }, { "epoch": 0.0, "grad_norm": 4.88971951842883, "learning_rate": 1.9999997656391992e-06, "loss": 1.4683, "step": 139 }, { "epoch": 0.0, "grad_norm": 5.907276772043835, "learning_rate": 1.9999997619916153e-06, "loss": 1.4255, "step": 140 }, { "epoch": 0.0, "grad_norm": 5.130307226886763, "learning_rate": 1.9999997583158646e-06, "loss": 1.4295, "step": 141 }, { "epoch": 0.0, "grad_norm": 5.878724833576819, "learning_rate": 1.9999997546119475e-06, "loss": 1.6857, "step": 142 }, { "epoch": 0.0, "grad_norm": 4.889147537162318, "learning_rate": 1.9999997508798636e-06, "loss": 1.2668, "step": 143 }, { "epoch": 0.0, "grad_norm": 5.079666872019813, "learning_rate": 1.9999997471196124e-06, "loss": 1.4661, "step": 144 }, { "epoch": 0.0, "grad_norm": 4.896421399772935, "learning_rate": 1.9999997433311953e-06, "loss": 1.4462, "step": 145 }, { "epoch": 0.0, "grad_norm": 4.904873841992958, "learning_rate": 1.9999997395146114e-06, "loss": 1.5005, "step": 146 }, { "epoch": 0.0, "eval_loss": 1.7462446689605713, "eval_runtime": 4.5857, "eval_samples_per_second": 1.963, "eval_steps_per_second": 1.09, "step": 146 }, { "epoch": 0.0, "grad_norm": 4.780194658732313, "learning_rate": 1.9999997356698607e-06, "loss": 1.2281, "step": 147 }, { "epoch": 0.0, "grad_norm": 4.947954070905395, "learning_rate": 1.9999997317969435e-06, "loss": 1.5465, "step": 148 }, { "epoch": 0.0, "grad_norm": 5.262663754638768, "learning_rate": 1.9999997278958596e-06, "loss": 1.7203, "step": 149 }, { "epoch": 0.0, "grad_norm": 4.980503880353371, "learning_rate": 1.999999723966609e-06, "loss": 1.5602, "step": 150 }, { "epoch": 0.0, "grad_norm": 5.070738329007044, "learning_rate": 1.9999997200091917e-06, "loss": 1.4116, "step": 151 }, { "epoch": 0.0, "grad_norm": 5.443381763060975, "learning_rate": 1.999999716023608e-06, "loss": 1.6409, "step": 152 }, { "epoch": 0.0, "grad_norm": 4.836870306399049, "learning_rate": 1.999999712009857e-06, "loss": 1.3384, "step": 153 }, { "epoch": 0.0, "grad_norm": 7.427614614695787, "learning_rate": 1.9999997079679395e-06, "loss": 1.4659, "step": 154 }, { "epoch": 0.0, "grad_norm": 5.172785821462023, "learning_rate": 1.999999703897856e-06, "loss": 1.4818, "step": 155 }, { "epoch": 0.0, "grad_norm": 5.300155696147022, "learning_rate": 1.9999996997996052e-06, "loss": 1.4057, "step": 156 }, { "epoch": 0.0, "grad_norm": 4.918600876049223, "learning_rate": 1.9999996956731877e-06, "loss": 1.5059, "step": 157 }, { "epoch": 0.0, "grad_norm": 6.916667098440986, "learning_rate": 1.999999691518604e-06, "loss": 1.4173, "step": 158 }, { "epoch": 0.0, "grad_norm": 4.8728284717219905, "learning_rate": 1.999999687335853e-06, "loss": 1.3795, "step": 159 }, { "epoch": 0.0, "grad_norm": 5.287532891218524, "learning_rate": 1.999999683124936e-06, "loss": 1.4764, "step": 160 }, { "epoch": 0.0, "grad_norm": 4.987191884182559, "learning_rate": 1.999999678885852e-06, "loss": 1.5181, "step": 161 }, { "epoch": 0.0, "grad_norm": 5.186035530995563, "learning_rate": 1.999999674618601e-06, "loss": 1.4576, "step": 162 }, { "epoch": 0.0, "grad_norm": 4.918294071223459, "learning_rate": 1.999999670323184e-06, "loss": 1.4588, "step": 163 }, { "epoch": 0.0, "grad_norm": 5.065760262070265, "learning_rate": 1.9999996659996e-06, "loss": 1.5348, "step": 164 }, { "epoch": 0.0, "grad_norm": 5.017202795011469, "learning_rate": 1.999999661647849e-06, "loss": 1.5338, "step": 165 }, { "epoch": 0.0, "grad_norm": 5.6648484254216225, "learning_rate": 1.999999657267932e-06, "loss": 1.6196, "step": 166 }, { "epoch": 0.0, "grad_norm": 5.4750502865711965, "learning_rate": 1.999999652859848e-06, "loss": 1.5665, "step": 167 }, { "epoch": 0.0, "grad_norm": 5.586077526270329, "learning_rate": 1.9999996484235977e-06, "loss": 1.5768, "step": 168 }, { "epoch": 0.0, "grad_norm": 5.064098969414124, "learning_rate": 1.9999996439591805e-06, "loss": 1.5485, "step": 169 }, { "epoch": 0.0, "grad_norm": 6.831991975556203, "learning_rate": 1.999999639466596e-06, "loss": 1.6605, "step": 170 }, { "epoch": 0.0, "grad_norm": 5.386693105255357, "learning_rate": 1.9999996349458458e-06, "loss": 1.5403, "step": 171 }, { "epoch": 0.0, "grad_norm": 5.383469172835463, "learning_rate": 1.9999996303969286e-06, "loss": 1.4123, "step": 172 }, { "epoch": 0.0, "grad_norm": 6.136249870917203, "learning_rate": 1.9999996258198446e-06, "loss": 1.5223, "step": 173 }, { "epoch": 0.0, "grad_norm": 5.497494715675666, "learning_rate": 1.999999621214594e-06, "loss": 1.4759, "step": 174 }, { "epoch": 0.0, "grad_norm": 5.359298992694515, "learning_rate": 1.9999996165811766e-06, "loss": 1.5216, "step": 175 }, { "epoch": 0.0, "grad_norm": 4.554159062585914, "learning_rate": 1.9999996119195926e-06, "loss": 1.3022, "step": 176 }, { "epoch": 0.0, "grad_norm": 5.655783926270953, "learning_rate": 1.9999996072298423e-06, "loss": 1.6985, "step": 177 }, { "epoch": 0.0, "grad_norm": 6.815355669323773, "learning_rate": 1.999999602511925e-06, "loss": 1.6323, "step": 178 }, { "epoch": 0.0, "grad_norm": 5.090330865005731, "learning_rate": 1.999999597765841e-06, "loss": 1.4405, "step": 179 }, { "epoch": 0.0, "grad_norm": 5.453237104330645, "learning_rate": 1.9999995929915903e-06, "loss": 1.6569, "step": 180 }, { "epoch": 0.0, "grad_norm": 5.140949180379015, "learning_rate": 1.999999588189173e-06, "loss": 1.5043, "step": 181 }, { "epoch": 0.0, "grad_norm": 6.312297253169766, "learning_rate": 1.999999583358589e-06, "loss": 1.4393, "step": 182 }, { "epoch": 0.0, "grad_norm": 5.5602461468607745, "learning_rate": 1.9999995784998387e-06, "loss": 1.5034, "step": 183 }, { "epoch": 0.0, "grad_norm": 4.984563051637153, "learning_rate": 1.9999995736129215e-06, "loss": 1.5951, "step": 184 }, { "epoch": 0.0, "grad_norm": 5.210447188187252, "learning_rate": 1.9999995686978374e-06, "loss": 1.4994, "step": 185 }, { "epoch": 0.0, "grad_norm": 4.986374115612521, "learning_rate": 1.999999563754587e-06, "loss": 1.3567, "step": 186 }, { "epoch": 0.0, "grad_norm": 5.868262699823536, "learning_rate": 1.99999955878317e-06, "loss": 1.5591, "step": 187 }, { "epoch": 0.0, "grad_norm": 5.532695287284847, "learning_rate": 1.999999553783586e-06, "loss": 1.6909, "step": 188 }, { "epoch": 0.0, "grad_norm": 5.064305786007753, "learning_rate": 1.9999995487558354e-06, "loss": 1.6499, "step": 189 }, { "epoch": 0.0, "grad_norm": 5.3615235293142645, "learning_rate": 1.999999543699918e-06, "loss": 1.5116, "step": 190 }, { "epoch": 0.0, "grad_norm": 5.820360691431748, "learning_rate": 1.999999538615834e-06, "loss": 1.585, "step": 191 }, { "epoch": 0.0, "grad_norm": 4.848927324752224, "learning_rate": 1.9999995335035838e-06, "loss": 1.5208, "step": 192 }, { "epoch": 0.0, "grad_norm": 5.008810784644451, "learning_rate": 1.999999528363167e-06, "loss": 1.4648, "step": 193 }, { "epoch": 0.0, "grad_norm": 5.012428143646969, "learning_rate": 1.999999523194583e-06, "loss": 1.5394, "step": 194 }, { "epoch": 0.0, "grad_norm": 5.014801660701698, "learning_rate": 1.9999995179978325e-06, "loss": 1.5681, "step": 195 }, { "epoch": 0.0, "grad_norm": 5.044808653366915, "learning_rate": 1.9999995127729153e-06, "loss": 1.5071, "step": 196 }, { "epoch": 0.0, "grad_norm": 5.138850239659894, "learning_rate": 1.9999995075198317e-06, "loss": 1.5622, "step": 197 }, { "epoch": 0.0, "grad_norm": 4.942376781806205, "learning_rate": 1.999999502238581e-06, "loss": 1.4336, "step": 198 }, { "epoch": 0.0, "grad_norm": 5.454348532164084, "learning_rate": 1.999999496929164e-06, "loss": 1.5379, "step": 199 }, { "epoch": 0.0, "grad_norm": 4.515385692801986, "learning_rate": 1.99999949159158e-06, "loss": 1.3406, "step": 200 }, { "epoch": 0.0, "grad_norm": 5.124129865470126, "learning_rate": 1.9999994862258295e-06, "loss": 1.3091, "step": 201 }, { "epoch": 0.0, "grad_norm": 6.04812044323597, "learning_rate": 1.9999994808319127e-06, "loss": 1.4656, "step": 202 }, { "epoch": 0.0, "grad_norm": 4.832399296186657, "learning_rate": 1.9999994754098286e-06, "loss": 1.4201, "step": 203 }, { "epoch": 0.0, "grad_norm": 5.180488033599416, "learning_rate": 1.999999469959578e-06, "loss": 1.385, "step": 204 }, { "epoch": 0.0, "grad_norm": 5.190369524582554, "learning_rate": 1.9999994644811614e-06, "loss": 1.561, "step": 205 }, { "epoch": 0.0, "grad_norm": 5.554453318862103, "learning_rate": 1.9999994589745773e-06, "loss": 1.67, "step": 206 }, { "epoch": 0.0, "grad_norm": 7.035958799888537, "learning_rate": 1.999999453439827e-06, "loss": 1.3742, "step": 207 }, { "epoch": 0.0, "grad_norm": 5.829678242084621, "learning_rate": 1.99999944787691e-06, "loss": 1.5053, "step": 208 }, { "epoch": 0.0, "grad_norm": 5.67709513251063, "learning_rate": 1.999999442285826e-06, "loss": 1.6848, "step": 209 }, { "epoch": 0.0, "grad_norm": 4.611739029374384, "learning_rate": 1.999999436666576e-06, "loss": 1.534, "step": 210 }, { "epoch": 0.0, "grad_norm": 7.303224421644464, "learning_rate": 1.9999994310191587e-06, "loss": 1.4115, "step": 211 }, { "epoch": 0.0, "grad_norm": 4.739064239711381, "learning_rate": 1.999999425343575e-06, "loss": 1.5154, "step": 212 }, { "epoch": 0.0, "grad_norm": 4.998482123577443, "learning_rate": 1.999999419639825e-06, "loss": 1.4069, "step": 213 }, { "epoch": 0.0, "grad_norm": 4.777605807898373, "learning_rate": 1.9999994139079077e-06, "loss": 1.4424, "step": 214 }, { "epoch": 0.0, "grad_norm": 4.91838714700508, "learning_rate": 1.999999408147824e-06, "loss": 1.4662, "step": 215 }, { "epoch": 0.0, "grad_norm": 5.080194035605747, "learning_rate": 1.999999402359574e-06, "loss": 1.5027, "step": 216 }, { "epoch": 0.0, "grad_norm": 7.028785668898977, "learning_rate": 1.9999993965431567e-06, "loss": 1.4228, "step": 217 }, { "epoch": 0.0, "grad_norm": 4.840444295707605, "learning_rate": 1.999999390698573e-06, "loss": 1.415, "step": 218 }, { "epoch": 0.0, "grad_norm": 5.684443121932812, "learning_rate": 1.9999993848258226e-06, "loss": 1.5323, "step": 219 }, { "epoch": 0.0, "eval_loss": 1.7417716979980469, "eval_runtime": 4.6034, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 219 }, { "epoch": 0.0, "grad_norm": 4.888070245726793, "learning_rate": 1.9999993789249058e-06, "loss": 1.3426, "step": 220 }, { "epoch": 0.0, "grad_norm": 6.874791209837943, "learning_rate": 1.999999372995822e-06, "loss": 1.5123, "step": 221 }, { "epoch": 0.0, "grad_norm": 5.351391729703106, "learning_rate": 1.999999367038572e-06, "loss": 1.511, "step": 222 }, { "epoch": 0.0, "grad_norm": 5.252575209784133, "learning_rate": 1.999999361053155e-06, "loss": 1.5191, "step": 223 }, { "epoch": 0.0, "grad_norm": 5.0084979782736765, "learning_rate": 1.9999993550395715e-06, "loss": 1.4319, "step": 224 }, { "epoch": 0.0, "grad_norm": 5.032795555975619, "learning_rate": 1.999999348997821e-06, "loss": 1.5326, "step": 225 }, { "epoch": 0.0, "grad_norm": 4.882732392699978, "learning_rate": 1.9999993429279045e-06, "loss": 1.5562, "step": 226 }, { "epoch": 0.0, "grad_norm": 10.5857954829159, "learning_rate": 1.999999336829821e-06, "loss": 1.5575, "step": 227 }, { "epoch": 0.0, "grad_norm": 5.368713396037795, "learning_rate": 1.9999993307035704e-06, "loss": 1.5341, "step": 228 }, { "epoch": 0.0, "grad_norm": 6.564926482513187, "learning_rate": 1.999999324549154e-06, "loss": 1.5136, "step": 229 }, { "epoch": 0.0, "grad_norm": 4.87983027120232, "learning_rate": 1.9999993183665702e-06, "loss": 1.5424, "step": 230 }, { "epoch": 0.0, "grad_norm": 5.130333500849336, "learning_rate": 1.99999931215582e-06, "loss": 1.4305, "step": 231 }, { "epoch": 0.0, "grad_norm": 4.810216498710757, "learning_rate": 1.9999993059169032e-06, "loss": 1.3897, "step": 232 }, { "epoch": 0.0, "grad_norm": 4.75131060987687, "learning_rate": 1.99999929964982e-06, "loss": 1.4905, "step": 233 }, { "epoch": 0.0, "grad_norm": 4.9417367696494185, "learning_rate": 1.99999929335457e-06, "loss": 1.5133, "step": 234 }, { "epoch": 0.0, "grad_norm": 5.129642959459062, "learning_rate": 1.999999287031153e-06, "loss": 1.4786, "step": 235 }, { "epoch": 0.0, "grad_norm": 5.589926026863031, "learning_rate": 1.9999992806795697e-06, "loss": 1.4966, "step": 236 }, { "epoch": 0.0, "grad_norm": 5.509054909009049, "learning_rate": 1.999999274299819e-06, "loss": 1.4371, "step": 237 }, { "epoch": 0.0, "grad_norm": 4.6496971486682135, "learning_rate": 1.9999992678919027e-06, "loss": 1.4595, "step": 238 }, { "epoch": 0.0, "grad_norm": 5.832332761745741, "learning_rate": 1.999999261455819e-06, "loss": 1.5185, "step": 239 }, { "epoch": 0.0, "grad_norm": 5.257353906344085, "learning_rate": 1.9999992549915693e-06, "loss": 1.5056, "step": 240 }, { "epoch": 0.0, "grad_norm": 7.37222280076076, "learning_rate": 1.9999992484991524e-06, "loss": 1.504, "step": 241 }, { "epoch": 0.0, "grad_norm": 4.824423532897361, "learning_rate": 1.999999241978569e-06, "loss": 1.4981, "step": 242 }, { "epoch": 0.0, "grad_norm": 4.9091796444197575, "learning_rate": 1.999999235429819e-06, "loss": 1.4832, "step": 243 }, { "epoch": 0.0, "grad_norm": 7.748583674899383, "learning_rate": 1.9999992288529025e-06, "loss": 1.3462, "step": 244 }, { "epoch": 0.0, "grad_norm": 5.125022502509408, "learning_rate": 1.9999992222478192e-06, "loss": 1.4612, "step": 245 }, { "epoch": 0.0, "grad_norm": 32.53860605587701, "learning_rate": 1.999999215614569e-06, "loss": 1.4692, "step": 246 }, { "epoch": 0.0, "grad_norm": 5.052996211809871, "learning_rate": 1.9999992089531526e-06, "loss": 1.4238, "step": 247 }, { "epoch": 0.0, "grad_norm": 5.542095695226945, "learning_rate": 1.9999992022635693e-06, "loss": 1.4174, "step": 248 }, { "epoch": 0.0, "grad_norm": 5.686250494352825, "learning_rate": 1.999999195545819e-06, "loss": 1.6754, "step": 249 }, { "epoch": 0.0, "grad_norm": 5.6359874929300915, "learning_rate": 1.9999991887999027e-06, "loss": 1.5792, "step": 250 }, { "epoch": 0.0, "grad_norm": 5.0936043548140795, "learning_rate": 1.9999991820258194e-06, "loss": 1.4941, "step": 251 }, { "epoch": 0.0, "grad_norm": 6.406141836123647, "learning_rate": 1.9999991752235697e-06, "loss": 1.5123, "step": 252 }, { "epoch": 0.0, "grad_norm": 6.199896874003351, "learning_rate": 1.999999168393153e-06, "loss": 1.476, "step": 253 }, { "epoch": 0.0, "grad_norm": 5.66865264194522, "learning_rate": 1.99999916153457e-06, "loss": 1.5464, "step": 254 }, { "epoch": 0.0, "grad_norm": 5.021383409806155, "learning_rate": 1.99999915464782e-06, "loss": 1.5831, "step": 255 }, { "epoch": 0.0, "grad_norm": 6.615349724765676, "learning_rate": 1.9999991477329036e-06, "loss": 1.6743, "step": 256 }, { "epoch": 0.0, "grad_norm": 5.024216598608762, "learning_rate": 1.9999991407898203e-06, "loss": 1.5263, "step": 257 }, { "epoch": 0.0, "grad_norm": 5.352770335446249, "learning_rate": 1.9999991338185706e-06, "loss": 1.6316, "step": 258 }, { "epoch": 0.0, "grad_norm": 5.472919888558574, "learning_rate": 1.9999991268191545e-06, "loss": 1.4915, "step": 259 }, { "epoch": 0.0, "grad_norm": 5.46803445951313, "learning_rate": 1.999999119791571e-06, "loss": 1.5079, "step": 260 }, { "epoch": 0.0, "grad_norm": 5.154033494012857, "learning_rate": 1.9999991127358214e-06, "loss": 1.5658, "step": 261 }, { "epoch": 0.0, "grad_norm": 5.032995728347807, "learning_rate": 1.9999991056519053e-06, "loss": 1.5069, "step": 262 }, { "epoch": 0.0, "grad_norm": 4.916728694225458, "learning_rate": 1.9999990985398224e-06, "loss": 1.5897, "step": 263 }, { "epoch": 0.0, "grad_norm": 5.075661387114088, "learning_rate": 1.9999990913995727e-06, "loss": 1.5507, "step": 264 }, { "epoch": 0.0, "grad_norm": 10.051445604812477, "learning_rate": 1.999999084231156e-06, "loss": 1.6803, "step": 265 }, { "epoch": 0.0, "grad_norm": 4.971780062105953, "learning_rate": 1.999999077034573e-06, "loss": 1.4367, "step": 266 }, { "epoch": 0.0, "grad_norm": 4.934330515559447, "learning_rate": 1.9999990698098235e-06, "loss": 1.356, "step": 267 }, { "epoch": 0.0, "grad_norm": 6.5083027384301015, "learning_rate": 1.9999990625569073e-06, "loss": 1.4274, "step": 268 }, { "epoch": 0.0, "grad_norm": 5.109968483536287, "learning_rate": 1.9999990552758244e-06, "loss": 1.4728, "step": 269 }, { "epoch": 0.0, "grad_norm": 5.775385612499377, "learning_rate": 1.999999047966575e-06, "loss": 1.5146, "step": 270 }, { "epoch": 0.0, "grad_norm": 4.948468121480936, "learning_rate": 1.9999990406291585e-06, "loss": 1.4915, "step": 271 }, { "epoch": 0.0, "grad_norm": 5.965168560424121, "learning_rate": 1.999999033263576e-06, "loss": 1.4582, "step": 272 }, { "epoch": 0.0, "grad_norm": 5.442431763765304, "learning_rate": 1.9999990258698267e-06, "loss": 1.4613, "step": 273 }, { "epoch": 0.0, "grad_norm": 5.7776067027301545, "learning_rate": 1.99999901844791e-06, "loss": 1.503, "step": 274 }, { "epoch": 0.0, "grad_norm": 5.178892269148893, "learning_rate": 1.999999010997827e-06, "loss": 1.4328, "step": 275 }, { "epoch": 0.0, "grad_norm": 4.939222731616965, "learning_rate": 1.9999990035195783e-06, "loss": 1.4259, "step": 276 }, { "epoch": 0.0, "grad_norm": 4.900888732024022, "learning_rate": 1.9999989960131617e-06, "loss": 1.4883, "step": 277 }, { "epoch": 0.0, "grad_norm": 5.480965646206889, "learning_rate": 1.999998988478579e-06, "loss": 1.4998, "step": 278 }, { "epoch": 0.0, "grad_norm": 5.387613428010502, "learning_rate": 1.99999898091583e-06, "loss": 1.5284, "step": 279 }, { "epoch": 0.0, "grad_norm": 6.93375972130229, "learning_rate": 1.9999989733249137e-06, "loss": 1.5188, "step": 280 }, { "epoch": 0.0, "grad_norm": 4.887766431921025, "learning_rate": 1.999998965705831e-06, "loss": 1.4875, "step": 281 }, { "epoch": 0.0, "grad_norm": 4.872411589272914, "learning_rate": 1.999998958058582e-06, "loss": 1.4924, "step": 282 }, { "epoch": 0.0, "grad_norm": 4.8912427815939825, "learning_rate": 1.999998950383166e-06, "loss": 1.3855, "step": 283 }, { "epoch": 0.0, "grad_norm": 4.849266593984057, "learning_rate": 1.9999989426795835e-06, "loss": 1.252, "step": 284 }, { "epoch": 0.0, "grad_norm": 4.642331959124537, "learning_rate": 1.999998934947834e-06, "loss": 1.4346, "step": 285 }, { "epoch": 0.0, "grad_norm": 4.824479351331792, "learning_rate": 1.999998927187918e-06, "loss": 1.3413, "step": 286 }, { "epoch": 0.0, "grad_norm": 5.088834406666165, "learning_rate": 1.999998919399836e-06, "loss": 1.5249, "step": 287 }, { "epoch": 0.0, "grad_norm": 6.380115793381058, "learning_rate": 1.9999989115835865e-06, "loss": 1.6594, "step": 288 }, { "epoch": 0.0, "grad_norm": 4.560993714860048, "learning_rate": 1.9999989037391708e-06, "loss": 1.378, "step": 289 }, { "epoch": 0.0, "grad_norm": 11.285453090943186, "learning_rate": 1.9999988958665882e-06, "loss": 1.5047, "step": 290 }, { "epoch": 0.0, "grad_norm": 6.939941853112505, "learning_rate": 1.9999988879658393e-06, "loss": 1.4266, "step": 291 }, { "epoch": 0.0, "grad_norm": 4.842983984638459, "learning_rate": 1.9999988800369235e-06, "loss": 1.4714, "step": 292 }, { "epoch": 0.0, "eval_loss": 1.7301859855651855, "eval_runtime": 4.6111, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.084, "step": 292 }, { "epoch": 0.0, "grad_norm": 6.779278420529169, "learning_rate": 1.999998872079841e-06, "loss": 1.59, "step": 293 }, { "epoch": 0.0, "grad_norm": 5.66455555017548, "learning_rate": 1.999998864094592e-06, "loss": 1.5995, "step": 294 }, { "epoch": 0.0, "grad_norm": 5.542518840920991, "learning_rate": 1.9999988560811762e-06, "loss": 1.3785, "step": 295 }, { "epoch": 0.0, "grad_norm": 4.741540663363095, "learning_rate": 1.999998848039594e-06, "loss": 1.3725, "step": 296 }, { "epoch": 0.0, "grad_norm": 4.958319396026163, "learning_rate": 1.999998839969845e-06, "loss": 1.4389, "step": 297 }, { "epoch": 0.0, "grad_norm": 4.858392697584495, "learning_rate": 1.99999883187193e-06, "loss": 1.4766, "step": 298 }, { "epoch": 0.0, "grad_norm": 4.77843632873252, "learning_rate": 1.9999988237458472e-06, "loss": 1.4784, "step": 299 }, { "epoch": 0.0, "grad_norm": 5.166101873536924, "learning_rate": 1.9999988155915983e-06, "loss": 1.5636, "step": 300 }, { "epoch": 0.0, "grad_norm": 5.541352000059288, "learning_rate": 1.999998807409183e-06, "loss": 1.3294, "step": 301 }, { "epoch": 0.0, "grad_norm": 4.738532728807263, "learning_rate": 1.9999987991986007e-06, "loss": 1.3765, "step": 302 }, { "epoch": 0.0, "grad_norm": 6.12811751220863, "learning_rate": 1.9999987909598518e-06, "loss": 1.6032, "step": 303 }, { "epoch": 0.0, "grad_norm": 5.897961031920527, "learning_rate": 1.9999987826929364e-06, "loss": 1.5486, "step": 304 }, { "epoch": 0.0, "grad_norm": 5.108120627298669, "learning_rate": 1.9999987743978542e-06, "loss": 1.5455, "step": 305 }, { "epoch": 0.0, "grad_norm": 6.654624244259197, "learning_rate": 1.9999987660746057e-06, "loss": 1.563, "step": 306 }, { "epoch": 0.0, "grad_norm": 6.32748407900337, "learning_rate": 1.9999987577231903e-06, "loss": 1.5657, "step": 307 }, { "epoch": 0.0, "grad_norm": 6.643090403562411, "learning_rate": 1.9999987493436086e-06, "loss": 1.4312, "step": 308 }, { "epoch": 0.0, "grad_norm": 5.16915443300704, "learning_rate": 1.99999874093586e-06, "loss": 1.4448, "step": 309 }, { "epoch": 0.0, "grad_norm": 5.277599731706892, "learning_rate": 1.9999987324999446e-06, "loss": 1.6571, "step": 310 }, { "epoch": 0.0, "grad_norm": 5.2349500478147215, "learning_rate": 1.9999987240358625e-06, "loss": 1.4609, "step": 311 }, { "epoch": 0.0, "grad_norm": 6.248179668341996, "learning_rate": 1.9999987155436143e-06, "loss": 1.6418, "step": 312 }, { "epoch": 0.0, "grad_norm": 5.223835299607056, "learning_rate": 1.9999987070231985e-06, "loss": 1.495, "step": 313 }, { "epoch": 0.0, "grad_norm": 5.202665887318379, "learning_rate": 1.999998698474617e-06, "loss": 1.5282, "step": 314 }, { "epoch": 0.0, "grad_norm": 4.938213910400009, "learning_rate": 1.9999986898978686e-06, "loss": 1.4169, "step": 315 }, { "epoch": 0.0, "grad_norm": 9.193414540211652, "learning_rate": 1.9999986812929536e-06, "loss": 1.3747, "step": 316 }, { "epoch": 0.0, "grad_norm": 4.870538982226375, "learning_rate": 1.999998672659872e-06, "loss": 1.3901, "step": 317 }, { "epoch": 0.0, "grad_norm": 5.491542785298596, "learning_rate": 1.9999986639986233e-06, "loss": 1.5331, "step": 318 }, { "epoch": 0.0, "grad_norm": 5.500143758119544, "learning_rate": 1.9999986553092083e-06, "loss": 1.6651, "step": 319 }, { "epoch": 0.0, "grad_norm": 10.286451328879394, "learning_rate": 1.9999986465916265e-06, "loss": 1.6562, "step": 320 }, { "epoch": 0.0, "grad_norm": 5.19858420548839, "learning_rate": 1.9999986378458784e-06, "loss": 1.4969, "step": 321 }, { "epoch": 0.0, "grad_norm": 5.524050041906956, "learning_rate": 1.9999986290719634e-06, "loss": 1.4227, "step": 322 }, { "epoch": 0.0, "grad_norm": 7.65324091844115, "learning_rate": 1.999998620269882e-06, "loss": 1.5175, "step": 323 }, { "epoch": 0.0, "grad_norm": 4.955466116973382, "learning_rate": 1.999998611439634e-06, "loss": 1.3881, "step": 324 }, { "epoch": 0.0, "grad_norm": 4.98640055971755, "learning_rate": 1.9999986025812193e-06, "loss": 1.6281, "step": 325 }, { "epoch": 0.0, "grad_norm": 5.372570880498082, "learning_rate": 1.9999985936946375e-06, "loss": 1.4439, "step": 326 }, { "epoch": 0.0, "grad_norm": 5.307836526026473, "learning_rate": 1.9999985847798893e-06, "loss": 1.5315, "step": 327 }, { "epoch": 0.0, "grad_norm": 4.967657152244879, "learning_rate": 1.9999985758369748e-06, "loss": 1.5171, "step": 328 }, { "epoch": 0.0, "grad_norm": 6.189830333449287, "learning_rate": 1.9999985668658934e-06, "loss": 1.5468, "step": 329 }, { "epoch": 0.0, "grad_norm": 5.903849880093316, "learning_rate": 1.9999985578666452e-06, "loss": 1.6242, "step": 330 }, { "epoch": 0.0, "grad_norm": 4.784805252959356, "learning_rate": 1.999998548839231e-06, "loss": 1.4167, "step": 331 }, { "epoch": 0.0, "grad_norm": 5.196004132700368, "learning_rate": 1.9999985397836497e-06, "loss": 1.5647, "step": 332 }, { "epoch": 0.0, "grad_norm": 5.103521406136126, "learning_rate": 1.999998530699902e-06, "loss": 1.4414, "step": 333 }, { "epoch": 0.0, "grad_norm": 5.570598621348803, "learning_rate": 1.9999985215879873e-06, "loss": 1.3089, "step": 334 }, { "epoch": 0.0, "grad_norm": 5.777301480121715, "learning_rate": 1.999998512447906e-06, "loss": 1.5408, "step": 335 }, { "epoch": 0.0, "grad_norm": 4.997876127994012, "learning_rate": 1.9999985032796586e-06, "loss": 1.4277, "step": 336 }, { "epoch": 0.0, "grad_norm": 5.633061500236633, "learning_rate": 1.999998494083244e-06, "loss": 1.6895, "step": 337 }, { "epoch": 0.0, "grad_norm": 4.834617334661858, "learning_rate": 1.999998484858663e-06, "loss": 1.426, "step": 338 }, { "epoch": 0.0, "grad_norm": 6.993606113796682, "learning_rate": 1.9999984756059153e-06, "loss": 1.3854, "step": 339 }, { "epoch": 0.0, "grad_norm": 5.101502771813274, "learning_rate": 1.999998466325001e-06, "loss": 1.6404, "step": 340 }, { "epoch": 0.0, "grad_norm": 5.089236533265877, "learning_rate": 1.9999984570159197e-06, "loss": 1.4611, "step": 341 }, { "epoch": 0.0, "grad_norm": 5.206319847977299, "learning_rate": 1.9999984476786723e-06, "loss": 1.3741, "step": 342 }, { "epoch": 0.0, "grad_norm": 5.331926916079775, "learning_rate": 1.999998438313258e-06, "loss": 1.5426, "step": 343 }, { "epoch": 0.0, "grad_norm": 7.3624137401783445, "learning_rate": 1.9999984289196776e-06, "loss": 1.6399, "step": 344 }, { "epoch": 0.0, "grad_norm": 6.918510892290634, "learning_rate": 1.99999841949793e-06, "loss": 1.4012, "step": 345 }, { "epoch": 0.0, "grad_norm": 4.5972792983175665, "learning_rate": 1.999998410048016e-06, "loss": 1.4264, "step": 346 }, { "epoch": 0.0, "grad_norm": 4.916021369254276, "learning_rate": 1.999998400569935e-06, "loss": 1.5523, "step": 347 }, { "epoch": 0.0, "grad_norm": 5.5315684631864, "learning_rate": 1.9999983910636877e-06, "loss": 1.5335, "step": 348 }, { "epoch": 0.0, "grad_norm": 4.656760353737745, "learning_rate": 1.999998381529274e-06, "loss": 1.4952, "step": 349 }, { "epoch": 0.0, "grad_norm": 5.583073365446635, "learning_rate": 1.9999983719666933e-06, "loss": 1.4312, "step": 350 }, { "epoch": 0.0, "grad_norm": 5.293004596545783, "learning_rate": 1.999998362375946e-06, "loss": 1.5153, "step": 351 }, { "epoch": 0.0, "grad_norm": 4.998189663482702, "learning_rate": 1.999998352757032e-06, "loss": 1.5468, "step": 352 }, { "epoch": 0.0, "grad_norm": 4.704748816779011, "learning_rate": 1.9999983431099516e-06, "loss": 1.4175, "step": 353 }, { "epoch": 0.0, "grad_norm": 4.816706794317481, "learning_rate": 1.9999983334347046e-06, "loss": 1.5202, "step": 354 }, { "epoch": 0.0, "grad_norm": 4.973712391629575, "learning_rate": 1.999998323731291e-06, "loss": 1.4952, "step": 355 }, { "epoch": 0.0, "grad_norm": 5.4799024863000305, "learning_rate": 1.9999983139997107e-06, "loss": 1.6129, "step": 356 }, { "epoch": 0.0, "grad_norm": 5.773895048702807, "learning_rate": 1.9999983042399637e-06, "loss": 1.5075, "step": 357 }, { "epoch": 0.0, "grad_norm": 5.786637205360941, "learning_rate": 1.99999829445205e-06, "loss": 1.5998, "step": 358 }, { "epoch": 0.0, "grad_norm": 5.417325890208001, "learning_rate": 1.9999982846359697e-06, "loss": 1.491, "step": 359 }, { "epoch": 0.0, "grad_norm": 6.441996636074185, "learning_rate": 1.9999982747917228e-06, "loss": 1.452, "step": 360 }, { "epoch": 0.0, "grad_norm": 5.484858685210035, "learning_rate": 1.9999982649193094e-06, "loss": 1.5286, "step": 361 }, { "epoch": 0.0, "grad_norm": 4.304285744838827, "learning_rate": 1.999998255018729e-06, "loss": 1.261, "step": 362 }, { "epoch": 0.0, "grad_norm": 5.00715136108103, "learning_rate": 1.9999982450899826e-06, "loss": 1.5213, "step": 363 }, { "epoch": 0.0, "grad_norm": 7.9421250934625744, "learning_rate": 1.9999982351330693e-06, "loss": 1.5245, "step": 364 }, { "epoch": 0.0, "grad_norm": 7.249784350703182, "learning_rate": 1.999998225147989e-06, "loss": 1.4959, "step": 365 }, { "epoch": 0.0, "eval_loss": 1.7152172327041626, "eval_runtime": 4.5993, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 365 }, { "epoch": 0.0, "grad_norm": 4.950903475262251, "learning_rate": 1.9999982151347425e-06, "loss": 1.5132, "step": 366 }, { "epoch": 0.0, "grad_norm": 5.001256076322384, "learning_rate": 1.999998205093329e-06, "loss": 1.4178, "step": 367 }, { "epoch": 0.0, "grad_norm": 5.145343234500232, "learning_rate": 1.9999981950237494e-06, "loss": 1.6928, "step": 368 }, { "epoch": 0.0, "grad_norm": 4.926796515825282, "learning_rate": 1.999998184926003e-06, "loss": 1.4183, "step": 369 }, { "epoch": 0.0, "grad_norm": 6.205577250811191, "learning_rate": 1.99999817480009e-06, "loss": 1.4067, "step": 370 }, { "epoch": 0.0, "grad_norm": 6.181548092407497, "learning_rate": 1.99999816464601e-06, "loss": 1.3839, "step": 371 }, { "epoch": 0.0, "grad_norm": 4.8304254951229355, "learning_rate": 1.9999981544637634e-06, "loss": 1.3712, "step": 372 }, { "epoch": 0.0, "grad_norm": 12.315569431338385, "learning_rate": 1.9999981442533505e-06, "loss": 1.5337, "step": 373 }, { "epoch": 0.0, "grad_norm": 4.969790738605887, "learning_rate": 1.999998134014771e-06, "loss": 1.5229, "step": 374 }, { "epoch": 0.0, "grad_norm": 5.073608092190098, "learning_rate": 1.999998123748025e-06, "loss": 1.579, "step": 375 }, { "epoch": 0.0, "grad_norm": 5.111176704390247, "learning_rate": 1.999998113453112e-06, "loss": 1.4727, "step": 376 }, { "epoch": 0.0, "grad_norm": 4.404278529357815, "learning_rate": 1.9999981031300326e-06, "loss": 1.3756, "step": 377 }, { "epoch": 0.0, "grad_norm": 4.66133544977496, "learning_rate": 1.9999980927787864e-06, "loss": 1.3958, "step": 378 }, { "epoch": 0.0, "grad_norm": 4.811042205639708, "learning_rate": 1.999998082399374e-06, "loss": 1.4358, "step": 379 }, { "epoch": 0.0, "grad_norm": 5.082316240684894, "learning_rate": 1.9999980719917945e-06, "loss": 1.5925, "step": 380 }, { "epoch": 0.0, "grad_norm": 5.316025491311415, "learning_rate": 1.9999980615560487e-06, "loss": 1.3726, "step": 381 }, { "epoch": 0.0, "grad_norm": 4.922423456498873, "learning_rate": 1.9999980510921357e-06, "loss": 1.3629, "step": 382 }, { "epoch": 0.0, "grad_norm": 4.966088631224505, "learning_rate": 1.9999980406000568e-06, "loss": 1.4834, "step": 383 }, { "epoch": 0.0, "grad_norm": 4.8135477821063475, "learning_rate": 1.999998030079811e-06, "loss": 1.5864, "step": 384 }, { "epoch": 0.0, "grad_norm": 7.2288779061936035, "learning_rate": 1.999998019531399e-06, "loss": 1.5929, "step": 385 }, { "epoch": 0.0, "grad_norm": 7.583909712278632, "learning_rate": 1.9999980089548195e-06, "loss": 1.4818, "step": 386 }, { "epoch": 0.0, "grad_norm": 5.161770592187534, "learning_rate": 1.999997998350074e-06, "loss": 1.7042, "step": 387 }, { "epoch": 0.0, "grad_norm": 5.53786371138208, "learning_rate": 1.999997987717162e-06, "loss": 1.4949, "step": 388 }, { "epoch": 0.0, "grad_norm": 5.547955232044561, "learning_rate": 1.999997977056083e-06, "loss": 1.5075, "step": 389 }, { "epoch": 0.0, "grad_norm": 5.443846222937035, "learning_rate": 1.999997966366837e-06, "loss": 1.5852, "step": 390 }, { "epoch": 0.0, "grad_norm": 4.948645185345369, "learning_rate": 1.9999979556494255e-06, "loss": 1.3367, "step": 391 }, { "epoch": 0.0, "grad_norm": 4.580424596028237, "learning_rate": 1.9999979449038465e-06, "loss": 1.2409, "step": 392 }, { "epoch": 0.0, "grad_norm": 5.704258128824565, "learning_rate": 1.999997934130101e-06, "loss": 1.4568, "step": 393 }, { "epoch": 0.0, "grad_norm": 5.441676548521125, "learning_rate": 1.9999979233281894e-06, "loss": 1.5953, "step": 394 }, { "epoch": 0.0, "grad_norm": 4.912142407168767, "learning_rate": 1.999997912498111e-06, "loss": 1.54, "step": 395 }, { "epoch": 0.0, "grad_norm": 5.1706944951472895, "learning_rate": 1.9999979016398654e-06, "loss": 1.3123, "step": 396 }, { "epoch": 0.0, "grad_norm": 8.26460889745062, "learning_rate": 1.9999978907534537e-06, "loss": 1.5813, "step": 397 }, { "epoch": 0.0, "grad_norm": 4.564397081017155, "learning_rate": 1.999997879838875e-06, "loss": 1.3763, "step": 398 }, { "epoch": 0.0, "grad_norm": 4.9015032586368905, "learning_rate": 1.99999786889613e-06, "loss": 1.4756, "step": 399 }, { "epoch": 0.0, "grad_norm": 4.917803358556267, "learning_rate": 1.9999978579252184e-06, "loss": 1.2537, "step": 400 }, { "epoch": 0.0, "grad_norm": 4.5181742977699555, "learning_rate": 1.9999978469261402e-06, "loss": 1.4328, "step": 401 }, { "epoch": 0.0, "grad_norm": 4.660663659308694, "learning_rate": 1.9999978358988953e-06, "loss": 1.4581, "step": 402 }, { "epoch": 0.0, "grad_norm": 5.0474102698666385, "learning_rate": 1.999997824843484e-06, "loss": 1.4002, "step": 403 }, { "epoch": 0.0, "grad_norm": 9.834707543136538, "learning_rate": 1.9999978137599058e-06, "loss": 1.4236, "step": 404 }, { "epoch": 0.0, "grad_norm": 5.168260756918322, "learning_rate": 1.9999978026481612e-06, "loss": 1.5046, "step": 405 }, { "epoch": 0.0, "grad_norm": 5.035029265083008, "learning_rate": 1.99999779150825e-06, "loss": 1.3636, "step": 406 }, { "epoch": 0.0, "grad_norm": 5.28716001715498, "learning_rate": 1.999997780340172e-06, "loss": 1.4576, "step": 407 }, { "epoch": 0.0, "grad_norm": 4.8852077758822015, "learning_rate": 1.999997769143927e-06, "loss": 1.4481, "step": 408 }, { "epoch": 0.0, "grad_norm": 5.232260975458147, "learning_rate": 1.999997757919516e-06, "loss": 1.5196, "step": 409 }, { "epoch": 0.0, "grad_norm": 5.264068624514251, "learning_rate": 1.9999977466669385e-06, "loss": 1.4934, "step": 410 }, { "epoch": 0.0, "grad_norm": 5.469538493605676, "learning_rate": 1.999997735386194e-06, "loss": 1.505, "step": 411 }, { "epoch": 0.0, "grad_norm": 4.999328912549101, "learning_rate": 1.999997724077283e-06, "loss": 1.4756, "step": 412 }, { "epoch": 0.0, "grad_norm": 5.076217295124521, "learning_rate": 1.9999977127402057e-06, "loss": 1.5694, "step": 413 }, { "epoch": 0.0, "grad_norm": 5.226359452459192, "learning_rate": 1.999997701374961e-06, "loss": 1.4211, "step": 414 }, { "epoch": 0.0, "grad_norm": 4.819718845384241, "learning_rate": 1.99999768998155e-06, "loss": 1.5322, "step": 415 }, { "epoch": 0.0, "grad_norm": 5.122043265582762, "learning_rate": 1.9999976785599732e-06, "loss": 1.4893, "step": 416 }, { "epoch": 0.0, "grad_norm": 8.297154589932777, "learning_rate": 1.9999976671102287e-06, "loss": 1.5449, "step": 417 }, { "epoch": 0.0, "grad_norm": 4.890157121974458, "learning_rate": 1.999997655632318e-06, "loss": 1.4776, "step": 418 }, { "epoch": 0.0, "grad_norm": 5.589475875269356, "learning_rate": 1.9999976441262408e-06, "loss": 1.4896, "step": 419 }, { "epoch": 0.0, "grad_norm": 5.277939766849428, "learning_rate": 1.999997632591997e-06, "loss": 1.5828, "step": 420 }, { "epoch": 0.0, "grad_norm": 4.470441711000115, "learning_rate": 1.9999976210295865e-06, "loss": 1.3217, "step": 421 }, { "epoch": 0.0, "grad_norm": 5.995769226585237, "learning_rate": 1.9999976094390096e-06, "loss": 1.5426, "step": 422 }, { "epoch": 0.0, "grad_norm": 6.160211017382697, "learning_rate": 1.999997597820266e-06, "loss": 1.5534, "step": 423 }, { "epoch": 0.0, "grad_norm": 5.8941787869537325, "learning_rate": 1.9999975861733557e-06, "loss": 1.4683, "step": 424 }, { "epoch": 0.0, "grad_norm": 5.213637265959196, "learning_rate": 1.9999975744982788e-06, "loss": 1.7138, "step": 425 }, { "epoch": 0.0, "grad_norm": 4.62520919093602, "learning_rate": 1.999997562795035e-06, "loss": 1.4748, "step": 426 }, { "epoch": 0.0, "grad_norm": 4.836310097570104, "learning_rate": 1.999997551063625e-06, "loss": 1.5654, "step": 427 }, { "epoch": 0.0, "grad_norm": 4.4455635910745555, "learning_rate": 1.9999975393040484e-06, "loss": 1.3387, "step": 428 }, { "epoch": 0.0, "grad_norm": 6.820553071368411, "learning_rate": 1.999997527516305e-06, "loss": 1.4821, "step": 429 }, { "epoch": 0.0, "grad_norm": 4.97958037440551, "learning_rate": 1.9999975157003953e-06, "loss": 1.5386, "step": 430 }, { "epoch": 0.0, "grad_norm": 5.218479803099335, "learning_rate": 1.9999975038563184e-06, "loss": 1.5253, "step": 431 }, { "epoch": 0.0, "grad_norm": 4.817934555265224, "learning_rate": 1.9999974919840755e-06, "loss": 1.4677, "step": 432 }, { "epoch": 0.0, "grad_norm": 7.04962891837311, "learning_rate": 1.9999974800836658e-06, "loss": 1.4149, "step": 433 }, { "epoch": 0.0, "grad_norm": 4.977731627262668, "learning_rate": 1.9999974681550892e-06, "loss": 1.4752, "step": 434 }, { "epoch": 0.0, "grad_norm": 5.593445830695836, "learning_rate": 1.9999974561983467e-06, "loss": 1.317, "step": 435 }, { "epoch": 0.0, "grad_norm": 5.061091292458903, "learning_rate": 1.999997444213437e-06, "loss": 1.486, "step": 436 }, { "epoch": 0.0, "grad_norm": 4.608804550083991, "learning_rate": 1.999997432200361e-06, "loss": 1.3927, "step": 437 }, { "epoch": 0.0, "grad_norm": 4.670859026053921, "learning_rate": 1.999997420159118e-06, "loss": 1.4257, "step": 438 }, { "epoch": 0.0, "eval_loss": 1.7094385623931885, "eval_runtime": 4.6238, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 438 }, { "epoch": 0.0, "grad_norm": 5.859564981544054, "learning_rate": 1.9999974080897087e-06, "loss": 1.3354, "step": 439 }, { "epoch": 0.0, "grad_norm": 5.52294064206441, "learning_rate": 1.999997395992133e-06, "loss": 1.6065, "step": 440 }, { "epoch": 0.0, "grad_norm": 4.424488305091629, "learning_rate": 1.99999738386639e-06, "loss": 1.4306, "step": 441 }, { "epoch": 0.0, "grad_norm": 4.798968031648101, "learning_rate": 1.999997371712481e-06, "loss": 1.4365, "step": 442 }, { "epoch": 0.0, "grad_norm": 4.932924371225362, "learning_rate": 1.9999973595304054e-06, "loss": 1.3648, "step": 443 }, { "epoch": 0.0, "grad_norm": 5.234896304295093, "learning_rate": 1.999997347320163e-06, "loss": 1.4025, "step": 444 }, { "epoch": 0.0, "grad_norm": 7.0542656681469715, "learning_rate": 1.9999973350817544e-06, "loss": 1.2087, "step": 445 }, { "epoch": 0.0, "grad_norm": 5.129429012048715, "learning_rate": 1.9999973228151787e-06, "loss": 1.4321, "step": 446 }, { "epoch": 0.0, "grad_norm": 4.613105028702654, "learning_rate": 1.999997310520436e-06, "loss": 1.4545, "step": 447 }, { "epoch": 0.0, "grad_norm": 5.0864337050088295, "learning_rate": 1.9999972981975277e-06, "loss": 1.3047, "step": 448 }, { "epoch": 0.0, "grad_norm": 5.123209043678265, "learning_rate": 1.9999972858464524e-06, "loss": 1.4923, "step": 449 }, { "epoch": 0.0, "grad_norm": 4.770437981538585, "learning_rate": 1.9999972734672107e-06, "loss": 1.3795, "step": 450 }, { "epoch": 0.0, "grad_norm": 4.66047161014276, "learning_rate": 1.999997261059802e-06, "loss": 1.4359, "step": 451 }, { "epoch": 0.0, "grad_norm": 5.661063937671921, "learning_rate": 1.999997248624227e-06, "loss": 1.6533, "step": 452 }, { "epoch": 0.0, "grad_norm": 4.631944386384085, "learning_rate": 1.9999972361604853e-06, "loss": 1.4383, "step": 453 }, { "epoch": 0.0, "grad_norm": 4.732909501037933, "learning_rate": 1.9999972236685768e-06, "loss": 1.3426, "step": 454 }, { "epoch": 0.0, "grad_norm": 5.460195121758093, "learning_rate": 1.999997211148502e-06, "loss": 1.5001, "step": 455 }, { "epoch": 0.0, "grad_norm": 4.942983319180353, "learning_rate": 1.99999719860026e-06, "loss": 1.5882, "step": 456 }, { "epoch": 0.0, "grad_norm": 4.541373356599531, "learning_rate": 1.999997186023852e-06, "loss": 1.3362, "step": 457 }, { "epoch": 0.0, "grad_norm": 4.6766662288287515, "learning_rate": 1.9999971734192776e-06, "loss": 1.4117, "step": 458 }, { "epoch": 0.0, "grad_norm": 5.240283985252159, "learning_rate": 1.9999971607865364e-06, "loss": 1.2851, "step": 459 }, { "epoch": 0.0, "grad_norm": 6.6165761980037265, "learning_rate": 1.9999971481256283e-06, "loss": 1.4075, "step": 460 }, { "epoch": 0.0, "grad_norm": 5.356136217997657, "learning_rate": 1.999997135436554e-06, "loss": 1.2614, "step": 461 }, { "epoch": 0.0, "grad_norm": 5.235383295905265, "learning_rate": 1.999997122719313e-06, "loss": 1.5025, "step": 462 }, { "epoch": 0.0, "grad_norm": 5.59435469640636, "learning_rate": 1.999997109973905e-06, "loss": 1.6514, "step": 463 }, { "epoch": 0.0, "grad_norm": 8.674661542805518, "learning_rate": 1.9999970972003308e-06, "loss": 1.5387, "step": 464 }, { "epoch": 0.0, "grad_norm": 5.954501972594385, "learning_rate": 1.99999708439859e-06, "loss": 1.376, "step": 465 }, { "epoch": 0.0, "grad_norm": 5.031043958621965, "learning_rate": 1.9999970715686827e-06, "loss": 1.5981, "step": 466 }, { "epoch": 0.0, "grad_norm": 5.849912677371338, "learning_rate": 1.9999970587106086e-06, "loss": 1.4769, "step": 467 }, { "epoch": 0.0, "grad_norm": 5.18062121080789, "learning_rate": 1.9999970458243677e-06, "loss": 1.5745, "step": 468 }, { "epoch": 0.0, "grad_norm": 6.171389079148798, "learning_rate": 1.999997032909961e-06, "loss": 1.4238, "step": 469 }, { "epoch": 0.0, "grad_norm": 4.888754083257385, "learning_rate": 1.999997019967387e-06, "loss": 1.5669, "step": 470 }, { "epoch": 0.0, "grad_norm": 4.661884685242049, "learning_rate": 1.9999970069966463e-06, "loss": 1.2735, "step": 471 }, { "epoch": 0.0, "grad_norm": 5.231399197062376, "learning_rate": 1.9999969939977395e-06, "loss": 1.4674, "step": 472 }, { "epoch": 0.0, "grad_norm": 4.392315151719007, "learning_rate": 1.999996980970666e-06, "loss": 1.2746, "step": 473 }, { "epoch": 0.0, "grad_norm": 14.143521137726825, "learning_rate": 1.999996967915426e-06, "loss": 1.4949, "step": 474 }, { "epoch": 0.0, "grad_norm": 7.609979863275451, "learning_rate": 1.999996954832019e-06, "loss": 1.5421, "step": 475 }, { "epoch": 0.0, "grad_norm": 4.901316944779091, "learning_rate": 1.9999969417204457e-06, "loss": 1.467, "step": 476 }, { "epoch": 0.0, "grad_norm": 4.9893724431034085, "learning_rate": 1.999996928580706e-06, "loss": 1.5033, "step": 477 }, { "epoch": 0.0, "grad_norm": 7.145370058739066, "learning_rate": 1.9999969154127992e-06, "loss": 1.6459, "step": 478 }, { "epoch": 0.0, "grad_norm": 5.6040849521366045, "learning_rate": 1.999996902216726e-06, "loss": 1.39, "step": 479 }, { "epoch": 0.0, "grad_norm": 4.871162617374601, "learning_rate": 1.9999968889924863e-06, "loss": 1.5008, "step": 480 }, { "epoch": 0.0, "grad_norm": 5.041180910719072, "learning_rate": 1.9999968757400803e-06, "loss": 1.417, "step": 481 }, { "epoch": 0.0, "grad_norm": 7.1416379779540415, "learning_rate": 1.9999968624595075e-06, "loss": 1.3683, "step": 482 }, { "epoch": 0.0, "grad_norm": 4.655432088463851, "learning_rate": 1.999996849150768e-06, "loss": 1.2764, "step": 483 }, { "epoch": 0.0, "grad_norm": 5.390658631829691, "learning_rate": 1.999996835813862e-06, "loss": 1.4183, "step": 484 }, { "epoch": 0.0, "grad_norm": 5.139267857868251, "learning_rate": 1.9999968224487894e-06, "loss": 1.5219, "step": 485 }, { "epoch": 0.0, "grad_norm": 4.98265828385138, "learning_rate": 1.9999968090555498e-06, "loss": 1.5828, "step": 486 }, { "epoch": 0.0, "grad_norm": 4.947025615061189, "learning_rate": 1.999996795634144e-06, "loss": 1.4291, "step": 487 }, { "epoch": 0.0, "grad_norm": 4.8361628814359445, "learning_rate": 1.999996782184572e-06, "loss": 1.5098, "step": 488 }, { "epoch": 0.0, "grad_norm": 6.213733320329145, "learning_rate": 1.999996768706833e-06, "loss": 1.5318, "step": 489 }, { "epoch": 0.0, "grad_norm": 5.354255255395692, "learning_rate": 1.9999967552009273e-06, "loss": 1.5049, "step": 490 }, { "epoch": 0.0, "grad_norm": 5.509817618529389, "learning_rate": 1.9999967416668553e-06, "loss": 1.5207, "step": 491 }, { "epoch": 0.0, "grad_norm": 4.820085190259705, "learning_rate": 1.9999967281046165e-06, "loss": 1.4661, "step": 492 }, { "epoch": 0.0, "grad_norm": 6.5842798335932855, "learning_rate": 1.9999967145142113e-06, "loss": 1.5606, "step": 493 }, { "epoch": 0.0, "grad_norm": 4.832919175129925, "learning_rate": 1.9999967008956397e-06, "loss": 1.5222, "step": 494 }, { "epoch": 0.0, "grad_norm": 5.401711540963192, "learning_rate": 1.9999966872489013e-06, "loss": 1.4282, "step": 495 }, { "epoch": 0.0, "grad_norm": 5.352942988278965, "learning_rate": 1.999996673573996e-06, "loss": 1.4907, "step": 496 }, { "epoch": 0.0, "grad_norm": 4.767236823164161, "learning_rate": 1.9999966598709245e-06, "loss": 1.2979, "step": 497 }, { "epoch": 0.0, "grad_norm": 4.7839487752581995, "learning_rate": 1.999996646139686e-06, "loss": 1.1931, "step": 498 }, { "epoch": 0.0, "grad_norm": 4.526284836165884, "learning_rate": 1.9999966323802813e-06, "loss": 1.4276, "step": 499 }, { "epoch": 0.0, "grad_norm": 4.968983179245585, "learning_rate": 1.99999661859271e-06, "loss": 1.5143, "step": 500 }, { "epoch": 0.0, "grad_norm": 4.929416318558409, "learning_rate": 1.999996604776972e-06, "loss": 1.4274, "step": 501 }, { "epoch": 0.0, "grad_norm": 5.319727005695438, "learning_rate": 1.999996590933068e-06, "loss": 1.4896, "step": 502 }, { "epoch": 0.0, "grad_norm": 5.166912706207141, "learning_rate": 1.9999965770609966e-06, "loss": 1.4971, "step": 503 }, { "epoch": 0.0, "grad_norm": 9.724664850608164, "learning_rate": 1.999996563160759e-06, "loss": 1.314, "step": 504 }, { "epoch": 0.0, "grad_norm": 6.033309554063063, "learning_rate": 1.9999965492323547e-06, "loss": 1.6318, "step": 505 }, { "epoch": 0.0, "grad_norm": 4.693812921757236, "learning_rate": 1.999996535275784e-06, "loss": 1.5998, "step": 506 }, { "epoch": 0.0, "grad_norm": 5.027615373760868, "learning_rate": 1.9999965212910468e-06, "loss": 1.4803, "step": 507 }, { "epoch": 0.0, "grad_norm": 5.150014426528141, "learning_rate": 1.999996507278143e-06, "loss": 1.5533, "step": 508 }, { "epoch": 0.0, "grad_norm": 6.251365830445326, "learning_rate": 1.999996493237072e-06, "loss": 1.5345, "step": 509 }, { "epoch": 0.0, "grad_norm": 5.068491757206887, "learning_rate": 1.999996479167835e-06, "loss": 1.4934, "step": 510 }, { "epoch": 0.0, "grad_norm": 6.510297092824997, "learning_rate": 1.9999964650704313e-06, "loss": 1.4957, "step": 511 }, { "epoch": 0.0, "eval_loss": 1.702190637588501, "eval_runtime": 4.6182, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 511 }, { "epoch": 0.0, "grad_norm": 5.755438253224136, "learning_rate": 1.999996450944861e-06, "loss": 1.4702, "step": 512 }, { "epoch": 0.0, "grad_norm": 4.529822887512446, "learning_rate": 1.9999964367911242e-06, "loss": 1.4433, "step": 513 }, { "epoch": 0.0, "grad_norm": 4.923234715248454, "learning_rate": 1.9999964226092207e-06, "loss": 1.2158, "step": 514 }, { "epoch": 0.0, "grad_norm": 4.99275767811329, "learning_rate": 1.9999964083991507e-06, "loss": 1.4035, "step": 515 }, { "epoch": 0.0, "grad_norm": 5.540700855208618, "learning_rate": 1.9999963941609144e-06, "loss": 1.4089, "step": 516 }, { "epoch": 0.0, "grad_norm": 5.423934034968752, "learning_rate": 1.9999963798945113e-06, "loss": 1.4395, "step": 517 }, { "epoch": 0.0, "grad_norm": 4.938286770911095, "learning_rate": 1.9999963655999413e-06, "loss": 1.5267, "step": 518 }, { "epoch": 0.0, "grad_norm": 6.135159232589015, "learning_rate": 1.9999963512772054e-06, "loss": 1.5788, "step": 519 }, { "epoch": 0.0, "grad_norm": 41.00137108815898, "learning_rate": 1.9999963369263023e-06, "loss": 1.681, "step": 520 }, { "epoch": 0.0, "grad_norm": 4.982490976686388, "learning_rate": 1.9999963225472327e-06, "loss": 1.3849, "step": 521 }, { "epoch": 0.0, "grad_norm": 5.200076278688873, "learning_rate": 1.999996308139997e-06, "loss": 1.4505, "step": 522 }, { "epoch": 0.0, "grad_norm": 5.0374386572637935, "learning_rate": 1.9999962937045945e-06, "loss": 1.6146, "step": 523 }, { "epoch": 0.0, "grad_norm": 5.38514239730456, "learning_rate": 1.9999962792410254e-06, "loss": 1.4862, "step": 524 }, { "epoch": 0.0, "grad_norm": 5.278831951738085, "learning_rate": 1.9999962647492895e-06, "loss": 1.5097, "step": 525 }, { "epoch": 0.0, "grad_norm": 4.988682511151002, "learning_rate": 1.999996250229387e-06, "loss": 1.5579, "step": 526 }, { "epoch": 0.0, "grad_norm": 6.6329832866037854, "learning_rate": 1.999996235681319e-06, "loss": 1.494, "step": 527 }, { "epoch": 0.0, "grad_norm": 6.101238841635504, "learning_rate": 1.999996221105083e-06, "loss": 1.5725, "step": 528 }, { "epoch": 0.0, "grad_norm": 5.618252608407, "learning_rate": 1.999996206500681e-06, "loss": 1.4834, "step": 529 }, { "epoch": 0.0, "grad_norm": 5.783224896235011, "learning_rate": 1.999996191868113e-06, "loss": 1.7878, "step": 530 }, { "epoch": 0.0, "grad_norm": 5.039904911976764, "learning_rate": 1.9999961772073773e-06, "loss": 1.5341, "step": 531 }, { "epoch": 0.0, "grad_norm": 4.749210275476864, "learning_rate": 1.999996162518476e-06, "loss": 1.5232, "step": 532 }, { "epoch": 0.0, "grad_norm": 4.75083641214273, "learning_rate": 1.999996147801408e-06, "loss": 1.4457, "step": 533 }, { "epoch": 0.0, "grad_norm": 4.797512529634022, "learning_rate": 1.999996133056173e-06, "loss": 1.3914, "step": 534 }, { "epoch": 0.0, "grad_norm": 4.996223642108566, "learning_rate": 1.999996118282772e-06, "loss": 1.53, "step": 535 }, { "epoch": 0.0, "grad_norm": 4.7829974351407305, "learning_rate": 1.9999961034812035e-06, "loss": 1.3418, "step": 536 }, { "epoch": 0.0, "grad_norm": 4.718076792623708, "learning_rate": 1.9999960886514693e-06, "loss": 1.3486, "step": 537 }, { "epoch": 0.0, "grad_norm": 5.156179924159171, "learning_rate": 1.9999960737935678e-06, "loss": 1.5375, "step": 538 }, { "epoch": 0.0, "grad_norm": 5.3419660684441705, "learning_rate": 1.9999960589075003e-06, "loss": 1.4394, "step": 539 }, { "epoch": 0.0, "grad_norm": 4.969835171700577, "learning_rate": 1.999996043993266e-06, "loss": 1.4885, "step": 540 }, { "epoch": 0.0, "grad_norm": 5.141114094857767, "learning_rate": 1.9999960290508654e-06, "loss": 1.6062, "step": 541 }, { "epoch": 0.0, "grad_norm": 6.732914023923969, "learning_rate": 1.9999960140802984e-06, "loss": 1.7097, "step": 542 }, { "epoch": 0.0, "grad_norm": 5.405085642943276, "learning_rate": 1.9999959990815645e-06, "loss": 1.5708, "step": 543 }, { "epoch": 0.0, "grad_norm": 4.951768383120494, "learning_rate": 1.9999959840546643e-06, "loss": 1.5517, "step": 544 }, { "epoch": 0.0, "grad_norm": 4.805093975628436, "learning_rate": 1.9999959689995972e-06, "loss": 1.4752, "step": 545 }, { "epoch": 0.0, "grad_norm": 4.475429973625488, "learning_rate": 1.9999959539163634e-06, "loss": 1.3248, "step": 546 }, { "epoch": 0.0, "grad_norm": 5.138477383552902, "learning_rate": 1.9999959388049636e-06, "loss": 1.5739, "step": 547 }, { "epoch": 0.0, "grad_norm": 4.991661833650293, "learning_rate": 1.9999959236653965e-06, "loss": 1.4348, "step": 548 }, { "epoch": 0.0, "grad_norm": 4.701391682646717, "learning_rate": 1.9999959084976635e-06, "loss": 1.3925, "step": 549 }, { "epoch": 0.0, "grad_norm": 4.910083477148781, "learning_rate": 1.9999958933017637e-06, "loss": 1.5543, "step": 550 }, { "epoch": 0.0, "grad_norm": 5.027398384260184, "learning_rate": 1.9999958780776975e-06, "loss": 1.4994, "step": 551 }, { "epoch": 0.0, "grad_norm": 5.5869467738586645, "learning_rate": 1.9999958628254645e-06, "loss": 1.5429, "step": 552 }, { "epoch": 0.0, "grad_norm": 4.951650316590327, "learning_rate": 1.999995847545065e-06, "loss": 1.4204, "step": 553 }, { "epoch": 0.0, "grad_norm": 4.779335727504338, "learning_rate": 1.999995832236499e-06, "loss": 1.5872, "step": 554 }, { "epoch": 0.0, "grad_norm": 4.995816902206547, "learning_rate": 1.9999958168997667e-06, "loss": 1.434, "step": 555 }, { "epoch": 0.0, "grad_norm": 5.2068709341822235, "learning_rate": 1.9999958015348677e-06, "loss": 1.5262, "step": 556 }, { "epoch": 0.0, "grad_norm": 4.993826056871235, "learning_rate": 1.9999957861418014e-06, "loss": 1.4638, "step": 557 }, { "epoch": 0.0, "grad_norm": 6.2251786792625206, "learning_rate": 1.9999957707205693e-06, "loss": 1.508, "step": 558 }, { "epoch": 0.0, "grad_norm": 6.073040511213923, "learning_rate": 1.9999957552711707e-06, "loss": 1.4745, "step": 559 }, { "epoch": 0.0, "grad_norm": 5.738891638073386, "learning_rate": 1.9999957397936053e-06, "loss": 1.6243, "step": 560 }, { "epoch": 0.0, "grad_norm": 5.419662722088751, "learning_rate": 1.9999957242878735e-06, "loss": 1.2774, "step": 561 }, { "epoch": 0.0, "grad_norm": 5.122152529238781, "learning_rate": 1.999995708753975e-06, "loss": 1.4376, "step": 562 }, { "epoch": 0.0, "grad_norm": 5.4113125087183205, "learning_rate": 1.99999569319191e-06, "loss": 1.5806, "step": 563 }, { "epoch": 0.0, "grad_norm": 5.379407688726184, "learning_rate": 1.999995677601678e-06, "loss": 1.4604, "step": 564 }, { "epoch": 0.0, "grad_norm": 5.47622178703681, "learning_rate": 1.9999956619832805e-06, "loss": 1.4344, "step": 565 }, { "epoch": 0.0, "grad_norm": 5.117263096048395, "learning_rate": 1.999995646336716e-06, "loss": 1.4896, "step": 566 }, { "epoch": 0.0, "grad_norm": 5.4922547794799, "learning_rate": 1.9999956306619846e-06, "loss": 1.7709, "step": 567 }, { "epoch": 0.0, "grad_norm": 6.489699330478132, "learning_rate": 1.999995614959087e-06, "loss": 1.5452, "step": 568 }, { "epoch": 0.0, "grad_norm": 11.437005541277756, "learning_rate": 1.9999955992280223e-06, "loss": 1.347, "step": 569 }, { "epoch": 0.0, "grad_norm": 5.224691095393556, "learning_rate": 1.9999955834687917e-06, "loss": 1.5102, "step": 570 }, { "epoch": 0.0, "grad_norm": 5.56916203726456, "learning_rate": 1.999995567681394e-06, "loss": 1.2866, "step": 571 }, { "epoch": 0.0, "grad_norm": 5.203471880109494, "learning_rate": 1.99999555186583e-06, "loss": 1.2939, "step": 572 }, { "epoch": 0.0, "grad_norm": 5.295050199339514, "learning_rate": 1.9999955360220997e-06, "loss": 1.244, "step": 573 }, { "epoch": 0.0, "grad_norm": 5.558252219814004, "learning_rate": 1.9999955201502024e-06, "loss": 1.381, "step": 574 }, { "epoch": 0.0, "grad_norm": 4.693879682065885, "learning_rate": 1.999995504250139e-06, "loss": 1.5172, "step": 575 }, { "epoch": 0.0, "grad_norm": 5.054746659752432, "learning_rate": 1.9999954883219086e-06, "loss": 1.5245, "step": 576 }, { "epoch": 0.0, "grad_norm": 6.288250112135058, "learning_rate": 1.999995472365512e-06, "loss": 1.6557, "step": 577 }, { "epoch": 0.0, "grad_norm": 4.7842439911659005, "learning_rate": 1.9999954563809488e-06, "loss": 1.4294, "step": 578 }, { "epoch": 0.0, "grad_norm": 4.995544020303297, "learning_rate": 1.999995440368219e-06, "loss": 1.3965, "step": 579 }, { "epoch": 0.0, "grad_norm": 4.759630572595074, "learning_rate": 1.9999954243273226e-06, "loss": 1.3996, "step": 580 }, { "epoch": 0.0, "grad_norm": 4.712211700693446, "learning_rate": 1.99999540825826e-06, "loss": 1.4983, "step": 581 }, { "epoch": 0.0, "grad_norm": 5.9470327763676405, "learning_rate": 1.9999953921610304e-06, "loss": 1.355, "step": 582 }, { "epoch": 0.0, "grad_norm": 5.063038997344527, "learning_rate": 1.9999953760356343e-06, "loss": 1.4099, "step": 583 }, { "epoch": 0.0, "grad_norm": 4.844081361582967, "learning_rate": 1.999995359882072e-06, "loss": 1.5044, "step": 584 }, { "epoch": 0.0, "eval_loss": 1.696353793144226, "eval_runtime": 4.6168, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 584 }, { "epoch": 0.0, "grad_norm": 6.493708899953276, "learning_rate": 1.999995343700343e-06, "loss": 1.6627, "step": 585 }, { "epoch": 0.0, "grad_norm": 4.954875007169818, "learning_rate": 1.9999953274904473e-06, "loss": 1.4136, "step": 586 }, { "epoch": 0.0, "grad_norm": 4.910325383229665, "learning_rate": 1.9999953112523853e-06, "loss": 1.4251, "step": 587 }, { "epoch": 0.0, "grad_norm": 4.714961872146683, "learning_rate": 1.9999952949861564e-06, "loss": 1.5491, "step": 588 }, { "epoch": 0.0, "grad_norm": 5.2328265295940435, "learning_rate": 1.999995278691761e-06, "loss": 1.4251, "step": 589 }, { "epoch": 0.0, "grad_norm": 4.533584320037035, "learning_rate": 1.9999952623691995e-06, "loss": 1.4121, "step": 590 }, { "epoch": 0.0, "grad_norm": 4.882067176722944, "learning_rate": 1.999995246018471e-06, "loss": 1.3394, "step": 591 }, { "epoch": 0.0, "grad_norm": 4.9509375278675405, "learning_rate": 1.9999952296395762e-06, "loss": 1.3379, "step": 592 }, { "epoch": 0.0, "grad_norm": 5.653441569963669, "learning_rate": 1.999995213232515e-06, "loss": 1.4337, "step": 593 }, { "epoch": 0.0, "grad_norm": 4.426491174400903, "learning_rate": 1.999995196797287e-06, "loss": 1.2995, "step": 594 }, { "epoch": 0.0, "grad_norm": 5.485353755158814, "learning_rate": 1.999995180333893e-06, "loss": 1.379, "step": 595 }, { "epoch": 0.0, "grad_norm": 4.79710641338462, "learning_rate": 1.9999951638423317e-06, "loss": 1.4514, "step": 596 }, { "epoch": 0.0, "grad_norm": 4.92546100162479, "learning_rate": 1.999995147322604e-06, "loss": 1.4666, "step": 597 }, { "epoch": 0.0, "grad_norm": 5.9766360974271056, "learning_rate": 1.99999513077471e-06, "loss": 1.353, "step": 598 }, { "epoch": 0.0, "grad_norm": 6.442918937982415, "learning_rate": 1.9999951141986493e-06, "loss": 1.6356, "step": 599 }, { "epoch": 0.0, "grad_norm": 5.523537086483564, "learning_rate": 1.9999950975944225e-06, "loss": 1.5245, "step": 600 }, { "epoch": 0.0, "grad_norm": 4.955021932634082, "learning_rate": 1.9999950809620285e-06, "loss": 1.3658, "step": 601 }, { "epoch": 0.0, "grad_norm": 4.8036782108306975, "learning_rate": 1.999995064301468e-06, "loss": 1.4261, "step": 602 }, { "epoch": 0.0, "grad_norm": 4.952441088003973, "learning_rate": 1.9999950476127418e-06, "loss": 1.5592, "step": 603 }, { "epoch": 0.0, "grad_norm": 4.712458785442027, "learning_rate": 1.999995030895848e-06, "loss": 1.6329, "step": 604 }, { "epoch": 0.0, "grad_norm": 6.39848861393844, "learning_rate": 1.9999950141507886e-06, "loss": 1.3637, "step": 605 }, { "epoch": 0.0, "grad_norm": 4.565908788566706, "learning_rate": 1.9999949973775623e-06, "loss": 1.3901, "step": 606 }, { "epoch": 0.0, "grad_norm": 4.839741237419302, "learning_rate": 1.9999949805761695e-06, "loss": 1.5408, "step": 607 }, { "epoch": 0.0, "grad_norm": 5.65666455626493, "learning_rate": 1.99999496374661e-06, "loss": 1.3274, "step": 608 }, { "epoch": 0.0, "grad_norm": 5.003729201972364, "learning_rate": 1.999994946888884e-06, "loss": 1.506, "step": 609 }, { "epoch": 0.0, "grad_norm": 6.133503970487184, "learning_rate": 1.9999949300029917e-06, "loss": 1.5401, "step": 610 }, { "epoch": 0.0, "grad_norm": 5.6024698311473795, "learning_rate": 1.9999949130889325e-06, "loss": 1.6896, "step": 611 }, { "epoch": 0.0, "grad_norm": 4.532215717740302, "learning_rate": 1.999994896146707e-06, "loss": 1.3587, "step": 612 }, { "epoch": 0.0, "grad_norm": 5.70340339038857, "learning_rate": 1.9999948791763146e-06, "loss": 1.577, "step": 613 }, { "epoch": 0.0, "grad_norm": 5.082395639793893, "learning_rate": 1.9999948621777563e-06, "loss": 1.5421, "step": 614 }, { "epoch": 0.0, "grad_norm": 4.988618029308243, "learning_rate": 1.9999948451510312e-06, "loss": 1.5131, "step": 615 }, { "epoch": 0.0, "grad_norm": 6.413069306779731, "learning_rate": 1.9999948280961393e-06, "loss": 1.4763, "step": 616 }, { "epoch": 0.0, "grad_norm": 4.723299915394361, "learning_rate": 1.999994811013081e-06, "loss": 1.4926, "step": 617 }, { "epoch": 0.0, "grad_norm": 4.818607426839722, "learning_rate": 1.9999947939018567e-06, "loss": 1.4112, "step": 618 }, { "epoch": 0.0, "grad_norm": 5.090960497753154, "learning_rate": 1.9999947767624656e-06, "loss": 1.6447, "step": 619 }, { "epoch": 0.0, "grad_norm": 4.869884243493298, "learning_rate": 1.9999947595949073e-06, "loss": 1.4467, "step": 620 }, { "epoch": 0.0, "grad_norm": 4.952829084694713, "learning_rate": 1.999994742399183e-06, "loss": 1.5202, "step": 621 }, { "epoch": 0.0, "grad_norm": 5.000534437688967, "learning_rate": 1.9999947251752923e-06, "loss": 1.569, "step": 622 }, { "epoch": 0.0, "grad_norm": 4.891724084780857, "learning_rate": 1.9999947079232353e-06, "loss": 1.2758, "step": 623 }, { "epoch": 0.0, "grad_norm": 4.720768304907384, "learning_rate": 1.999994690643011e-06, "loss": 1.5005, "step": 624 }, { "epoch": 0.0, "grad_norm": 4.8481849499077, "learning_rate": 1.9999946733346208e-06, "loss": 1.5816, "step": 625 }, { "epoch": 0.0, "grad_norm": 5.153018684172472, "learning_rate": 1.9999946559980637e-06, "loss": 1.6261, "step": 626 }, { "epoch": 0.0, "grad_norm": 5.041326934986502, "learning_rate": 1.9999946386333402e-06, "loss": 1.4737, "step": 627 }, { "epoch": 0.0, "grad_norm": 6.615624017109531, "learning_rate": 1.9999946212404504e-06, "loss": 1.3258, "step": 628 }, { "epoch": 0.0, "grad_norm": 5.5498876909447405, "learning_rate": 1.999994603819394e-06, "loss": 1.6407, "step": 629 }, { "epoch": 0.0, "grad_norm": 5.466395618769352, "learning_rate": 1.999994586370171e-06, "loss": 1.3873, "step": 630 }, { "epoch": 0.0, "grad_norm": 4.84924829351922, "learning_rate": 1.9999945688927813e-06, "loss": 1.4199, "step": 631 }, { "epoch": 0.0, "grad_norm": 5.489922257516625, "learning_rate": 1.999994551387225e-06, "loss": 1.4957, "step": 632 }, { "epoch": 0.0, "grad_norm": 7.178583734354207, "learning_rate": 1.999994533853503e-06, "loss": 1.3008, "step": 633 }, { "epoch": 0.0, "grad_norm": 4.631980034454648, "learning_rate": 1.9999945162916135e-06, "loss": 1.2049, "step": 634 }, { "epoch": 0.0, "grad_norm": 4.715631420481108, "learning_rate": 1.999994498701558e-06, "loss": 1.506, "step": 635 }, { "epoch": 0.0, "grad_norm": 4.866461878636338, "learning_rate": 1.999994481083336e-06, "loss": 1.4913, "step": 636 }, { "epoch": 0.0, "grad_norm": 4.81038769741456, "learning_rate": 1.9999944634369473e-06, "loss": 1.4033, "step": 637 }, { "epoch": 0.0, "grad_norm": 5.08078031950503, "learning_rate": 1.999994445762392e-06, "loss": 1.5048, "step": 638 }, { "epoch": 0.0, "grad_norm": 5.047631225535758, "learning_rate": 1.9999944280596705e-06, "loss": 1.4223, "step": 639 }, { "epoch": 0.0, "grad_norm": 5.040805784015986, "learning_rate": 1.9999944103287824e-06, "loss": 1.5275, "step": 640 }, { "epoch": 0.0, "grad_norm": 5.349390518960892, "learning_rate": 1.9999943925697274e-06, "loss": 1.3653, "step": 641 }, { "epoch": 0.0, "grad_norm": 5.378881266474736, "learning_rate": 1.9999943747825064e-06, "loss": 1.4901, "step": 642 }, { "epoch": 0.0, "grad_norm": 5.959820929884786, "learning_rate": 1.9999943569671187e-06, "loss": 1.5757, "step": 643 }, { "epoch": 0.0, "grad_norm": 4.839111038325676, "learning_rate": 1.999994339123564e-06, "loss": 1.3587, "step": 644 }, { "epoch": 0.0, "grad_norm": 4.487918310261601, "learning_rate": 1.999994321251843e-06, "loss": 1.2875, "step": 645 }, { "epoch": 0.0, "grad_norm": 4.622421469454646, "learning_rate": 1.9999943033519563e-06, "loss": 1.4613, "step": 646 }, { "epoch": 0.0, "grad_norm": 4.851420800533353, "learning_rate": 1.999994285423902e-06, "loss": 1.4116, "step": 647 }, { "epoch": 0.0, "grad_norm": 5.13772278531517, "learning_rate": 1.999994267467682e-06, "loss": 1.503, "step": 648 }, { "epoch": 0.0, "grad_norm": 4.995119073749364, "learning_rate": 1.9999942494832955e-06, "loss": 1.5454, "step": 649 }, { "epoch": 0.0, "grad_norm": 5.260803959027427, "learning_rate": 1.999994231470742e-06, "loss": 1.5893, "step": 650 }, { "epoch": 0.0, "grad_norm": 4.836602513098617, "learning_rate": 1.999994213430022e-06, "loss": 1.5162, "step": 651 }, { "epoch": 0.0, "grad_norm": 5.155960472302761, "learning_rate": 1.9999941953611356e-06, "loss": 1.5372, "step": 652 }, { "epoch": 0.0, "grad_norm": 4.746462550620719, "learning_rate": 1.999994177264083e-06, "loss": 1.4229, "step": 653 }, { "epoch": 0.0, "grad_norm": 5.028762735096975, "learning_rate": 1.9999941591388634e-06, "loss": 1.4108, "step": 654 }, { "epoch": 0.0, "grad_norm": 4.8256605605224, "learning_rate": 1.9999941409854773e-06, "loss": 1.4537, "step": 655 }, { "epoch": 0.0, "grad_norm": 4.976124834899918, "learning_rate": 1.9999941228039253e-06, "loss": 1.4796, "step": 656 }, { "epoch": 0.0, "grad_norm": 8.040837341128276, "learning_rate": 1.999994104594206e-06, "loss": 1.5095, "step": 657 }, { "epoch": 0.0, "eval_loss": 1.6910474300384521, "eval_runtime": 4.6444, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.077, "step": 657 }, { "epoch": 0.0, "grad_norm": 5.4913712932783305, "learning_rate": 1.9999940863563208e-06, "loss": 1.7822, "step": 658 }, { "epoch": 0.0, "grad_norm": 5.723500971046963, "learning_rate": 1.999994068090269e-06, "loss": 1.3798, "step": 659 }, { "epoch": 0.0, "grad_norm": 5.445742218350867, "learning_rate": 1.9999940497960503e-06, "loss": 1.2929, "step": 660 }, { "epoch": 0.0, "grad_norm": 4.422756108863784, "learning_rate": 1.9999940314736654e-06, "loss": 1.3135, "step": 661 }, { "epoch": 0.0, "grad_norm": 4.932728683114205, "learning_rate": 1.9999940131231142e-06, "loss": 1.5933, "step": 662 }, { "epoch": 0.0, "grad_norm": 4.725603325024966, "learning_rate": 1.999993994744396e-06, "loss": 1.4112, "step": 663 }, { "epoch": 0.0, "grad_norm": 4.709408296070619, "learning_rate": 1.9999939763375114e-06, "loss": 1.3498, "step": 664 }, { "epoch": 0.0, "grad_norm": 4.936026121310175, "learning_rate": 1.9999939579024606e-06, "loss": 1.4758, "step": 665 }, { "epoch": 0.0, "grad_norm": 4.71294086987452, "learning_rate": 1.999993939439243e-06, "loss": 1.4319, "step": 666 }, { "epoch": 0.0, "grad_norm": 6.066575513242086, "learning_rate": 1.999993920947859e-06, "loss": 1.6162, "step": 667 }, { "epoch": 0.0, "grad_norm": 5.234786123258312, "learning_rate": 1.9999939024283086e-06, "loss": 1.4918, "step": 668 }, { "epoch": 0.0, "grad_norm": 4.8552625381814565, "learning_rate": 1.999993883880592e-06, "loss": 1.4518, "step": 669 }, { "epoch": 0.0, "grad_norm": 4.497782097949712, "learning_rate": 1.9999938653047082e-06, "loss": 1.394, "step": 670 }, { "epoch": 0.0, "grad_norm": 6.189146286839885, "learning_rate": 1.9999938467006583e-06, "loss": 1.4676, "step": 671 }, { "epoch": 0.0, "grad_norm": 5.318939072932948, "learning_rate": 1.999993828068442e-06, "loss": 1.4845, "step": 672 }, { "epoch": 0.0, "grad_norm": 5.231313892198888, "learning_rate": 1.9999938094080588e-06, "loss": 1.5264, "step": 673 }, { "epoch": 0.0, "grad_norm": 5.190824126212358, "learning_rate": 1.9999937907195092e-06, "loss": 1.5604, "step": 674 }, { "epoch": 0.0, "grad_norm": 4.911434823319875, "learning_rate": 1.9999937720027937e-06, "loss": 1.4599, "step": 675 }, { "epoch": 0.0, "grad_norm": 5.527825329074028, "learning_rate": 1.999993753257911e-06, "loss": 1.4444, "step": 676 }, { "epoch": 0.0, "grad_norm": 4.612727754818132, "learning_rate": 1.9999937344848622e-06, "loss": 1.4543, "step": 677 }, { "epoch": 0.0, "grad_norm": 4.912996568592968, "learning_rate": 1.9999937156836463e-06, "loss": 1.4475, "step": 678 }, { "epoch": 0.0, "grad_norm": 4.899245327794625, "learning_rate": 1.9999936968542644e-06, "loss": 1.4971, "step": 679 }, { "epoch": 0.0, "grad_norm": 5.683408208952821, "learning_rate": 1.999993677996716e-06, "loss": 1.4405, "step": 680 }, { "epoch": 0.0, "grad_norm": 4.868881120149723, "learning_rate": 1.999993659111001e-06, "loss": 1.3217, "step": 681 }, { "epoch": 0.0, "grad_norm": 4.725153463356434, "learning_rate": 1.9999936401971195e-06, "loss": 1.3043, "step": 682 }, { "epoch": 0.0, "grad_norm": 4.909021709681444, "learning_rate": 1.9999936212550716e-06, "loss": 1.4595, "step": 683 }, { "epoch": 0.0, "grad_norm": 4.906414442711592, "learning_rate": 1.9999936022848574e-06, "loss": 1.4195, "step": 684 }, { "epoch": 0.0, "grad_norm": 6.200952942030242, "learning_rate": 1.9999935832864763e-06, "loss": 1.3076, "step": 685 }, { "epoch": 0.0, "grad_norm": 4.601568858946815, "learning_rate": 1.999993564259929e-06, "loss": 1.3687, "step": 686 }, { "epoch": 0.0, "grad_norm": 4.694173785690684, "learning_rate": 1.999993545205215e-06, "loss": 1.3167, "step": 687 }, { "epoch": 0.0, "grad_norm": 5.925517734547198, "learning_rate": 1.9999935261223344e-06, "loss": 1.524, "step": 688 }, { "epoch": 0.0, "grad_norm": 6.058783499529197, "learning_rate": 1.9999935070112877e-06, "loss": 1.3536, "step": 689 }, { "epoch": 0.0, "grad_norm": 4.7403320365456185, "learning_rate": 1.9999934878720743e-06, "loss": 1.45, "step": 690 }, { "epoch": 0.0, "grad_norm": 8.216158970114753, "learning_rate": 1.9999934687046945e-06, "loss": 1.4329, "step": 691 }, { "epoch": 0.0, "grad_norm": 4.907151976261744, "learning_rate": 1.999993449509148e-06, "loss": 1.579, "step": 692 }, { "epoch": 0.0, "grad_norm": 5.26679103924561, "learning_rate": 1.9999934302854353e-06, "loss": 1.6084, "step": 693 }, { "epoch": 0.0, "grad_norm": 5.831000211583377, "learning_rate": 1.999993411033556e-06, "loss": 1.574, "step": 694 }, { "epoch": 0.0, "grad_norm": 4.9068622026627065, "learning_rate": 1.99999339175351e-06, "loss": 1.5251, "step": 695 }, { "epoch": 0.0, "grad_norm": 4.824731944389374, "learning_rate": 1.9999933724452975e-06, "loss": 1.4736, "step": 696 }, { "epoch": 0.0, "grad_norm": 4.737465979855922, "learning_rate": 1.999993353108919e-06, "loss": 1.3993, "step": 697 }, { "epoch": 0.0, "grad_norm": 5.601956191342637, "learning_rate": 1.9999933337443736e-06, "loss": 1.4853, "step": 698 }, { "epoch": 0.0, "grad_norm": 4.891756552820562, "learning_rate": 1.999993314351662e-06, "loss": 1.3733, "step": 699 }, { "epoch": 0.0, "grad_norm": 5.343260811587546, "learning_rate": 1.9999932949307833e-06, "loss": 1.5779, "step": 700 }, { "epoch": 0.0, "grad_norm": 4.646914554700779, "learning_rate": 1.9999932754817387e-06, "loss": 1.4926, "step": 701 }, { "epoch": 0.0, "grad_norm": 5.004390716593502, "learning_rate": 1.9999932560045274e-06, "loss": 1.343, "step": 702 }, { "epoch": 0.0, "grad_norm": 4.8435215224734, "learning_rate": 1.9999932364991497e-06, "loss": 1.5128, "step": 703 }, { "epoch": 0.0, "grad_norm": 5.094980526251057, "learning_rate": 1.9999932169656056e-06, "loss": 1.4424, "step": 704 }, { "epoch": 0.0, "grad_norm": 4.71606625952716, "learning_rate": 1.9999931974038946e-06, "loss": 1.2818, "step": 705 }, { "epoch": 0.0, "grad_norm": 5.019306981145199, "learning_rate": 1.9999931778140178e-06, "loss": 1.3311, "step": 706 }, { "epoch": 0.0, "grad_norm": 4.402365143728885, "learning_rate": 1.999993158195974e-06, "loss": 1.269, "step": 707 }, { "epoch": 0.0, "grad_norm": 5.4490937352418065, "learning_rate": 1.999993138549764e-06, "loss": 1.4177, "step": 708 }, { "epoch": 0.0, "grad_norm": 4.710938425019419, "learning_rate": 1.999993118875387e-06, "loss": 1.3903, "step": 709 }, { "epoch": 0.0, "grad_norm": 4.853647057982472, "learning_rate": 1.999993099172844e-06, "loss": 1.3143, "step": 710 }, { "epoch": 0.0, "grad_norm": 5.583393631131937, "learning_rate": 1.9999930794421346e-06, "loss": 1.5035, "step": 711 }, { "epoch": 0.0, "grad_norm": 4.9723347976661625, "learning_rate": 1.9999930596832585e-06, "loss": 1.4268, "step": 712 }, { "epoch": 0.0, "grad_norm": 5.280480245670652, "learning_rate": 1.999993039896216e-06, "loss": 1.5342, "step": 713 }, { "epoch": 0.0, "grad_norm": 5.594520301277882, "learning_rate": 1.999993020081007e-06, "loss": 1.5057, "step": 714 }, { "epoch": 0.0, "grad_norm": 5.0248961526344385, "learning_rate": 1.9999930002376316e-06, "loss": 1.5489, "step": 715 }, { "epoch": 0.0, "grad_norm": 4.710291069541927, "learning_rate": 1.999992980366089e-06, "loss": 1.4276, "step": 716 }, { "epoch": 0.0, "grad_norm": 5.12715968906524, "learning_rate": 1.999992960466381e-06, "loss": 1.4865, "step": 717 }, { "epoch": 0.0, "grad_norm": 4.916214864105662, "learning_rate": 1.999992940538506e-06, "loss": 1.4147, "step": 718 }, { "epoch": 0.0, "grad_norm": 5.383613097319036, "learning_rate": 1.999992920582465e-06, "loss": 1.3599, "step": 719 }, { "epoch": 0.0, "grad_norm": 5.341289838892921, "learning_rate": 1.999992900598257e-06, "loss": 1.4866, "step": 720 }, { "epoch": 0.0, "grad_norm": 5.032528541347688, "learning_rate": 1.9999928805858824e-06, "loss": 1.5285, "step": 721 }, { "epoch": 0.0, "grad_norm": 5.113668938434738, "learning_rate": 1.9999928605453417e-06, "loss": 1.4797, "step": 722 }, { "epoch": 0.0, "grad_norm": 4.891988446714629, "learning_rate": 1.9999928404766345e-06, "loss": 1.6038, "step": 723 }, { "epoch": 0.0, "grad_norm": 4.679783775035857, "learning_rate": 1.9999928203797606e-06, "loss": 1.4729, "step": 724 }, { "epoch": 0.0, "grad_norm": 4.772986476245937, "learning_rate": 1.9999928002547202e-06, "loss": 1.429, "step": 725 }, { "epoch": 0.0, "grad_norm": 4.468260713060045, "learning_rate": 1.999992780101514e-06, "loss": 1.3496, "step": 726 }, { "epoch": 0.0, "grad_norm": 4.7466257837247365, "learning_rate": 1.9999927599201404e-06, "loss": 1.5408, "step": 727 }, { "epoch": 0.0, "grad_norm": 4.584370484620455, "learning_rate": 1.999992739710601e-06, "loss": 1.4405, "step": 728 }, { "epoch": 0.0, "grad_norm": 4.783272641625955, "learning_rate": 1.9999927194728945e-06, "loss": 1.4221, "step": 729 }, { "epoch": 0.0, "grad_norm": 4.52556643023439, "learning_rate": 1.9999926992070223e-06, "loss": 1.3328, "step": 730 }, { "epoch": 0.0, "eval_loss": 1.6846652030944824, "eval_runtime": 4.641, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 730 }, { "epoch": 0.0, "grad_norm": 4.657217221686939, "learning_rate": 1.999992678912983e-06, "loss": 1.4454, "step": 731 }, { "epoch": 0.0, "grad_norm": 4.590847103289048, "learning_rate": 1.9999926585907777e-06, "loss": 1.3924, "step": 732 }, { "epoch": 0.0, "grad_norm": 5.336722077748262, "learning_rate": 1.9999926382404054e-06, "loss": 1.4235, "step": 733 }, { "epoch": 0.0, "grad_norm": 4.450736230018037, "learning_rate": 1.999992617861867e-06, "loss": 1.3447, "step": 734 }, { "epoch": 0.0, "grad_norm": 5.138005519098043, "learning_rate": 1.9999925974551625e-06, "loss": 1.5023, "step": 735 }, { "epoch": 0.0, "grad_norm": 5.723499031096059, "learning_rate": 1.9999925770202907e-06, "loss": 1.5875, "step": 736 }, { "epoch": 0.0, "grad_norm": 6.103542969210838, "learning_rate": 1.999992556557253e-06, "loss": 1.6389, "step": 737 }, { "epoch": 0.0, "grad_norm": 5.358708666669913, "learning_rate": 1.9999925360660486e-06, "loss": 1.5501, "step": 738 }, { "epoch": 0.0, "grad_norm": 4.8728681154106, "learning_rate": 1.999992515546678e-06, "loss": 1.5199, "step": 739 }, { "epoch": 0.01, "grad_norm": 4.254099594123959, "learning_rate": 1.9999924949991406e-06, "loss": 1.2561, "step": 740 }, { "epoch": 0.01, "grad_norm": 4.841976889554841, "learning_rate": 1.999992474423437e-06, "loss": 1.4955, "step": 741 }, { "epoch": 0.01, "grad_norm": 5.3286918973656965, "learning_rate": 1.999992453819567e-06, "loss": 1.5191, "step": 742 }, { "epoch": 0.01, "grad_norm": 12.100923749893232, "learning_rate": 1.99999243318753e-06, "loss": 1.5339, "step": 743 }, { "epoch": 0.01, "grad_norm": 4.937435968281523, "learning_rate": 1.999992412527327e-06, "loss": 1.5689, "step": 744 }, { "epoch": 0.01, "grad_norm": 4.794383156069311, "learning_rate": 1.9999923918389573e-06, "loss": 1.4932, "step": 745 }, { "epoch": 0.01, "grad_norm": 6.328354730784901, "learning_rate": 1.9999923711224215e-06, "loss": 1.4243, "step": 746 }, { "epoch": 0.01, "grad_norm": 5.585973437130427, "learning_rate": 1.999992350377719e-06, "loss": 1.5275, "step": 747 }, { "epoch": 0.01, "grad_norm": 5.252037658328408, "learning_rate": 1.99999232960485e-06, "loss": 1.6859, "step": 748 }, { "epoch": 0.01, "grad_norm": 4.999962938796536, "learning_rate": 1.9999923088038147e-06, "loss": 1.3763, "step": 749 }, { "epoch": 0.01, "grad_norm": 4.6706037411530135, "learning_rate": 1.999992287974613e-06, "loss": 1.4393, "step": 750 }, { "epoch": 0.01, "grad_norm": 5.644118215700577, "learning_rate": 1.9999922671172445e-06, "loss": 1.3404, "step": 751 }, { "epoch": 0.01, "grad_norm": 4.762262865598859, "learning_rate": 1.9999922462317096e-06, "loss": 1.1912, "step": 752 }, { "epoch": 0.01, "grad_norm": 6.509565742435461, "learning_rate": 1.9999922253180088e-06, "loss": 1.5547, "step": 753 }, { "epoch": 0.01, "grad_norm": 4.906236638847442, "learning_rate": 1.999992204376141e-06, "loss": 1.5497, "step": 754 }, { "epoch": 0.01, "grad_norm": 5.389720327672479, "learning_rate": 1.999992183406107e-06, "loss": 1.5383, "step": 755 }, { "epoch": 0.01, "grad_norm": 6.936731396149539, "learning_rate": 1.9999921624079066e-06, "loss": 1.5598, "step": 756 }, { "epoch": 0.01, "grad_norm": 5.891074624493145, "learning_rate": 1.9999921413815393e-06, "loss": 1.4646, "step": 757 }, { "epoch": 0.01, "grad_norm": 5.931816574873316, "learning_rate": 1.9999921203270057e-06, "loss": 1.5956, "step": 758 }, { "epoch": 0.01, "grad_norm": 4.631245635122659, "learning_rate": 1.9999920992443056e-06, "loss": 1.3381, "step": 759 }, { "epoch": 0.01, "grad_norm": 4.69596344452636, "learning_rate": 1.9999920781334392e-06, "loss": 1.4328, "step": 760 }, { "epoch": 0.01, "grad_norm": 4.630359889510624, "learning_rate": 1.999992056994407e-06, "loss": 1.4445, "step": 761 }, { "epoch": 0.01, "grad_norm": 5.4898331473385795, "learning_rate": 1.9999920358272076e-06, "loss": 1.4693, "step": 762 }, { "epoch": 0.01, "grad_norm": 4.976976788582667, "learning_rate": 1.9999920146318416e-06, "loss": 1.4112, "step": 763 }, { "epoch": 0.01, "grad_norm": 5.642721073810829, "learning_rate": 1.9999919934083092e-06, "loss": 1.507, "step": 764 }, { "epoch": 0.01, "grad_norm": 4.938224038260358, "learning_rate": 1.999991972156611e-06, "loss": 1.4828, "step": 765 }, { "epoch": 0.01, "grad_norm": 5.4124846916190545, "learning_rate": 1.9999919508767457e-06, "loss": 1.5935, "step": 766 }, { "epoch": 0.01, "grad_norm": 5.0298956818677665, "learning_rate": 1.9999919295687146e-06, "loss": 1.5506, "step": 767 }, { "epoch": 0.01, "grad_norm": 4.902554353229615, "learning_rate": 1.9999919082325163e-06, "loss": 1.4076, "step": 768 }, { "epoch": 0.01, "grad_norm": 5.046670733247908, "learning_rate": 1.999991886868152e-06, "loss": 1.5376, "step": 769 }, { "epoch": 0.01, "grad_norm": 5.029796705277875, "learning_rate": 1.999991865475621e-06, "loss": 1.5587, "step": 770 }, { "epoch": 0.01, "grad_norm": 5.392708356030036, "learning_rate": 1.9999918440549237e-06, "loss": 1.5396, "step": 771 }, { "epoch": 0.01, "grad_norm": 6.421457119639919, "learning_rate": 1.9999918226060602e-06, "loss": 1.4929, "step": 772 }, { "epoch": 0.01, "grad_norm": 4.752656726954079, "learning_rate": 1.99999180112903e-06, "loss": 1.1638, "step": 773 }, { "epoch": 0.01, "grad_norm": 4.831954478385221, "learning_rate": 1.9999917796238333e-06, "loss": 1.4306, "step": 774 }, { "epoch": 0.01, "grad_norm": 5.101830458146313, "learning_rate": 1.99999175809047e-06, "loss": 1.4116, "step": 775 }, { "epoch": 0.01, "grad_norm": 4.404228318451748, "learning_rate": 1.9999917365289403e-06, "loss": 1.5313, "step": 776 }, { "epoch": 0.01, "grad_norm": 4.736260085200967, "learning_rate": 1.9999917149392445e-06, "loss": 1.4743, "step": 777 }, { "epoch": 0.01, "grad_norm": 4.839914114082853, "learning_rate": 1.9999916933213823e-06, "loss": 1.4311, "step": 778 }, { "epoch": 0.01, "grad_norm": 4.591247776677874, "learning_rate": 1.9999916716753536e-06, "loss": 1.4233, "step": 779 }, { "epoch": 0.01, "grad_norm": 4.6578387294024335, "learning_rate": 1.9999916500011582e-06, "loss": 1.5085, "step": 780 }, { "epoch": 0.01, "grad_norm": 4.93242923456189, "learning_rate": 1.9999916282987964e-06, "loss": 1.5293, "step": 781 }, { "epoch": 0.01, "grad_norm": 4.84156183065501, "learning_rate": 1.9999916065682682e-06, "loss": 1.4976, "step": 782 }, { "epoch": 0.01, "grad_norm": 4.780256207356861, "learning_rate": 1.9999915848095736e-06, "loss": 1.3376, "step": 783 }, { "epoch": 0.01, "grad_norm": 4.519528759054672, "learning_rate": 1.9999915630227127e-06, "loss": 1.4027, "step": 784 }, { "epoch": 0.01, "grad_norm": 10.363927799642175, "learning_rate": 1.9999915412076853e-06, "loss": 1.4659, "step": 785 }, { "epoch": 0.01, "grad_norm": 4.342877391406383, "learning_rate": 1.9999915193644916e-06, "loss": 1.4013, "step": 786 }, { "epoch": 0.01, "grad_norm": 4.813148676207006, "learning_rate": 1.999991497493131e-06, "loss": 1.4737, "step": 787 }, { "epoch": 0.01, "grad_norm": 5.745668191994972, "learning_rate": 1.999991475593604e-06, "loss": 1.2512, "step": 788 }, { "epoch": 0.01, "grad_norm": 5.796669035814396, "learning_rate": 1.999991453665911e-06, "loss": 1.318, "step": 789 }, { "epoch": 0.01, "grad_norm": 5.807933615414672, "learning_rate": 1.999991431710051e-06, "loss": 1.3862, "step": 790 }, { "epoch": 0.01, "grad_norm": 5.092568125176769, "learning_rate": 1.9999914097260254e-06, "loss": 1.5014, "step": 791 }, { "epoch": 0.01, "grad_norm": 6.824966187432624, "learning_rate": 1.999991387713833e-06, "loss": 1.4623, "step": 792 }, { "epoch": 0.01, "grad_norm": 5.0068907060455485, "learning_rate": 1.9999913656734736e-06, "loss": 1.4753, "step": 793 }, { "epoch": 0.01, "grad_norm": 4.838451440188585, "learning_rate": 1.9999913436049483e-06, "loss": 1.4478, "step": 794 }, { "epoch": 0.01, "grad_norm": 4.553033347591773, "learning_rate": 1.9999913215082567e-06, "loss": 1.5087, "step": 795 }, { "epoch": 0.01, "grad_norm": 4.639914766985077, "learning_rate": 1.9999912993833987e-06, "loss": 1.331, "step": 796 }, { "epoch": 0.01, "grad_norm": 5.035741243327864, "learning_rate": 1.999991277230374e-06, "loss": 1.4473, "step": 797 }, { "epoch": 0.01, "grad_norm": 6.0968663437701975, "learning_rate": 1.9999912550491826e-06, "loss": 1.5815, "step": 798 }, { "epoch": 0.01, "grad_norm": 4.730455270173384, "learning_rate": 1.999991232839825e-06, "loss": 1.4369, "step": 799 }, { "epoch": 0.01, "grad_norm": 5.219089987469455, "learning_rate": 1.9999912106023014e-06, "loss": 1.5169, "step": 800 }, { "epoch": 0.01, "grad_norm": 5.750971191979698, "learning_rate": 1.999991188336611e-06, "loss": 1.4153, "step": 801 }, { "epoch": 0.01, "grad_norm": 5.406182419591723, "learning_rate": 1.9999911660427543e-06, "loss": 1.4437, "step": 802 }, { "epoch": 0.01, "grad_norm": 5.063524970869385, "learning_rate": 1.9999911437207307e-06, "loss": 1.507, "step": 803 }, { "epoch": 0.01, "eval_loss": 1.6881712675094604, "eval_runtime": 4.626, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 803 }, { "epoch": 0.01, "grad_norm": 5.467776894504789, "learning_rate": 1.999991121370541e-06, "loss": 1.522, "step": 804 }, { "epoch": 0.01, "grad_norm": 4.971642369149256, "learning_rate": 1.999991098992185e-06, "loss": 1.5321, "step": 805 }, { "epoch": 0.01, "grad_norm": 4.814938121020308, "learning_rate": 1.999991076585663e-06, "loss": 1.433, "step": 806 }, { "epoch": 0.01, "grad_norm": 5.2209822250293065, "learning_rate": 1.9999910541509737e-06, "loss": 1.335, "step": 807 }, { "epoch": 0.01, "grad_norm": 5.296510549803633, "learning_rate": 1.9999910316881186e-06, "loss": 1.7574, "step": 808 }, { "epoch": 0.01, "grad_norm": 4.913991460057143, "learning_rate": 1.9999910091970967e-06, "loss": 1.5179, "step": 809 }, { "epoch": 0.01, "grad_norm": 4.668407347559244, "learning_rate": 1.9999909866779085e-06, "loss": 1.3712, "step": 810 }, { "epoch": 0.01, "grad_norm": 5.453144611957878, "learning_rate": 1.9999909641305538e-06, "loss": 1.3764, "step": 811 }, { "epoch": 0.01, "grad_norm": 5.2650106360957185, "learning_rate": 1.9999909415550327e-06, "loss": 1.4595, "step": 812 }, { "epoch": 0.01, "grad_norm": 5.250833767951291, "learning_rate": 1.9999909189513457e-06, "loss": 1.3007, "step": 813 }, { "epoch": 0.01, "grad_norm": 5.354917246649793, "learning_rate": 1.999990896319492e-06, "loss": 1.6322, "step": 814 }, { "epoch": 0.01, "grad_norm": 5.012749978414504, "learning_rate": 1.9999908736594716e-06, "loss": 1.5338, "step": 815 }, { "epoch": 0.01, "grad_norm": 4.848442817388502, "learning_rate": 1.999990850971285e-06, "loss": 1.2774, "step": 816 }, { "epoch": 0.01, "grad_norm": 6.984924234695706, "learning_rate": 1.9999908282549316e-06, "loss": 1.4143, "step": 817 }, { "epoch": 0.01, "grad_norm": 4.762256555459132, "learning_rate": 1.9999908055104122e-06, "loss": 1.3228, "step": 818 }, { "epoch": 0.01, "grad_norm": 4.844771964077254, "learning_rate": 1.9999907827377265e-06, "loss": 1.4724, "step": 819 }, { "epoch": 0.01, "grad_norm": 4.952200259787622, "learning_rate": 1.999990759936874e-06, "loss": 1.4344, "step": 820 }, { "epoch": 0.01, "grad_norm": 4.711382910161586, "learning_rate": 1.999990737107855e-06, "loss": 1.5817, "step": 821 }, { "epoch": 0.01, "grad_norm": 5.57192836371835, "learning_rate": 1.99999071425067e-06, "loss": 1.4359, "step": 822 }, { "epoch": 0.01, "grad_norm": 5.2612613709184775, "learning_rate": 1.9999906913653187e-06, "loss": 1.683, "step": 823 }, { "epoch": 0.01, "grad_norm": 4.651207872271117, "learning_rate": 1.9999906684518005e-06, "loss": 1.4202, "step": 824 }, { "epoch": 0.01, "grad_norm": 4.33548743621847, "learning_rate": 1.999990645510116e-06, "loss": 1.2962, "step": 825 }, { "epoch": 0.01, "grad_norm": 4.998357754076746, "learning_rate": 1.9999906225402655e-06, "loss": 1.4576, "step": 826 }, { "epoch": 0.01, "grad_norm": 5.936966406696661, "learning_rate": 1.9999905995422483e-06, "loss": 1.4039, "step": 827 }, { "epoch": 0.01, "grad_norm": 5.330845387260364, "learning_rate": 1.9999905765160646e-06, "loss": 1.5228, "step": 828 }, { "epoch": 0.01, "grad_norm": 4.8056099201876705, "learning_rate": 1.9999905534617145e-06, "loss": 1.3755, "step": 829 }, { "epoch": 0.01, "grad_norm": 5.029573312366041, "learning_rate": 1.999990530379198e-06, "loss": 1.5501, "step": 830 }, { "epoch": 0.01, "grad_norm": 4.539685125822293, "learning_rate": 1.999990507268515e-06, "loss": 1.3357, "step": 831 }, { "epoch": 0.01, "grad_norm": 4.880928277155648, "learning_rate": 1.9999904841296656e-06, "loss": 1.6046, "step": 832 }, { "epoch": 0.01, "grad_norm": 5.503556293375853, "learning_rate": 1.99999046096265e-06, "loss": 1.7223, "step": 833 }, { "epoch": 0.01, "grad_norm": 6.225897163215507, "learning_rate": 1.999990437767468e-06, "loss": 1.3195, "step": 834 }, { "epoch": 0.01, "grad_norm": 7.690064080550938, "learning_rate": 1.9999904145441196e-06, "loss": 1.6465, "step": 835 }, { "epoch": 0.01, "grad_norm": 4.5459726864219725, "learning_rate": 1.999990391292605e-06, "loss": 1.4026, "step": 836 }, { "epoch": 0.01, "grad_norm": 4.680412720191206, "learning_rate": 1.9999903680129237e-06, "loss": 1.4353, "step": 837 }, { "epoch": 0.01, "grad_norm": 7.622275997037577, "learning_rate": 1.9999903447050758e-06, "loss": 1.5267, "step": 838 }, { "epoch": 0.01, "grad_norm": 4.741060382625769, "learning_rate": 1.999990321369062e-06, "loss": 1.4955, "step": 839 }, { "epoch": 0.01, "grad_norm": 4.875869067274173, "learning_rate": 1.999990298004881e-06, "loss": 1.5815, "step": 840 }, { "epoch": 0.01, "grad_norm": 4.6600192764255395, "learning_rate": 1.9999902746125344e-06, "loss": 1.3205, "step": 841 }, { "epoch": 0.01, "grad_norm": 5.551113040685454, "learning_rate": 1.999990251192021e-06, "loss": 1.6559, "step": 842 }, { "epoch": 0.01, "grad_norm": 5.3125600816949365, "learning_rate": 1.9999902277433414e-06, "loss": 1.422, "step": 843 }, { "epoch": 0.01, "grad_norm": 5.0897853002323155, "learning_rate": 1.999990204266495e-06, "loss": 1.429, "step": 844 }, { "epoch": 0.01, "grad_norm": 4.620329234269747, "learning_rate": 1.999990180761483e-06, "loss": 1.3901, "step": 845 }, { "epoch": 0.01, "grad_norm": 4.952006426703027, "learning_rate": 1.999990157228304e-06, "loss": 1.4605, "step": 846 }, { "epoch": 0.01, "grad_norm": 5.051790815462307, "learning_rate": 1.999990133666959e-06, "loss": 1.5361, "step": 847 }, { "epoch": 0.01, "grad_norm": 4.723454559230461, "learning_rate": 1.999990110077447e-06, "loss": 1.4063, "step": 848 }, { "epoch": 0.01, "grad_norm": 4.714623591910726, "learning_rate": 1.999990086459769e-06, "loss": 1.4844, "step": 849 }, { "epoch": 0.01, "grad_norm": 4.866772682439225, "learning_rate": 1.999990062813924e-06, "loss": 1.4243, "step": 850 }, { "epoch": 0.01, "grad_norm": 4.982727315839908, "learning_rate": 1.999990039139913e-06, "loss": 1.5169, "step": 851 }, { "epoch": 0.01, "grad_norm": 4.88323303732355, "learning_rate": 1.9999900154377363e-06, "loss": 1.331, "step": 852 }, { "epoch": 0.01, "grad_norm": 4.98858734123965, "learning_rate": 1.9999899917073925e-06, "loss": 1.4386, "step": 853 }, { "epoch": 0.01, "grad_norm": 5.7732616301555, "learning_rate": 1.9999899679488823e-06, "loss": 1.483, "step": 854 }, { "epoch": 0.01, "grad_norm": 10.195015855643826, "learning_rate": 1.9999899441622062e-06, "loss": 1.489, "step": 855 }, { "epoch": 0.01, "grad_norm": 4.777377833871161, "learning_rate": 1.9999899203473633e-06, "loss": 1.3607, "step": 856 }, { "epoch": 0.01, "grad_norm": 8.851948399516926, "learning_rate": 1.999989896504354e-06, "loss": 1.3547, "step": 857 }, { "epoch": 0.01, "grad_norm": 4.7729850075029905, "learning_rate": 1.9999898726331787e-06, "loss": 1.3561, "step": 858 }, { "epoch": 0.01, "grad_norm": 4.764811136272429, "learning_rate": 1.9999898487338367e-06, "loss": 1.3792, "step": 859 }, { "epoch": 0.01, "grad_norm": 4.889712073966274, "learning_rate": 1.999989824806328e-06, "loss": 1.3774, "step": 860 }, { "epoch": 0.01, "grad_norm": 5.465422723384496, "learning_rate": 1.9999898008506533e-06, "loss": 1.3464, "step": 861 }, { "epoch": 0.01, "grad_norm": 5.480222802088414, "learning_rate": 1.9999897768668125e-06, "loss": 1.5737, "step": 862 }, { "epoch": 0.01, "grad_norm": 5.492784136252579, "learning_rate": 1.999989752854805e-06, "loss": 1.5299, "step": 863 }, { "epoch": 0.01, "grad_norm": 5.194980321543087, "learning_rate": 1.999989728814631e-06, "loss": 1.2601, "step": 864 }, { "epoch": 0.01, "grad_norm": 5.373628545087017, "learning_rate": 1.9999897047462905e-06, "loss": 1.6205, "step": 865 }, { "epoch": 0.01, "grad_norm": 4.834528620174113, "learning_rate": 1.9999896806497837e-06, "loss": 1.3969, "step": 866 }, { "epoch": 0.01, "grad_norm": 5.365382013388016, "learning_rate": 1.999989656525111e-06, "loss": 1.4079, "step": 867 }, { "epoch": 0.01, "grad_norm": 5.10671741683615, "learning_rate": 1.9999896323722714e-06, "loss": 1.5323, "step": 868 }, { "epoch": 0.01, "grad_norm": 4.794974200564266, "learning_rate": 1.9999896081912655e-06, "loss": 1.5685, "step": 869 }, { "epoch": 0.01, "grad_norm": 5.460786387110753, "learning_rate": 1.999989583982093e-06, "loss": 1.4878, "step": 870 }, { "epoch": 0.01, "grad_norm": 4.954812125157006, "learning_rate": 1.999989559744755e-06, "loss": 1.6783, "step": 871 }, { "epoch": 0.01, "grad_norm": 4.893553227272597, "learning_rate": 1.9999895354792497e-06, "loss": 1.5179, "step": 872 }, { "epoch": 0.01, "grad_norm": 4.971125258081157, "learning_rate": 1.9999895111855786e-06, "loss": 1.42, "step": 873 }, { "epoch": 0.01, "grad_norm": 5.466531237826114, "learning_rate": 1.9999894868637408e-06, "loss": 1.4594, "step": 874 }, { "epoch": 0.01, "grad_norm": 4.643918053264017, "learning_rate": 1.9999894625137365e-06, "loss": 1.4087, "step": 875 }, { "epoch": 0.01, "grad_norm": 4.946505174466154, "learning_rate": 1.999989438135566e-06, "loss": 1.5287, "step": 876 }, { "epoch": 0.01, "eval_loss": 1.6835453510284424, "eval_runtime": 4.6442, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.077, "step": 876 }, { "epoch": 0.01, "grad_norm": 5.239412305756631, "learning_rate": 1.9999894137292292e-06, "loss": 1.3614, "step": 877 }, { "epoch": 0.01, "grad_norm": 5.5395595579858075, "learning_rate": 1.999989389294726e-06, "loss": 1.5485, "step": 878 }, { "epoch": 0.01, "grad_norm": 4.712593321081159, "learning_rate": 1.9999893648320564e-06, "loss": 1.4338, "step": 879 }, { "epoch": 0.01, "grad_norm": 4.458574377375978, "learning_rate": 1.9999893403412202e-06, "loss": 1.41, "step": 880 }, { "epoch": 0.01, "grad_norm": 5.551492101493687, "learning_rate": 1.999989315822218e-06, "loss": 1.3577, "step": 881 }, { "epoch": 0.01, "grad_norm": 5.207575524594766, "learning_rate": 1.999989291275049e-06, "loss": 1.5347, "step": 882 }, { "epoch": 0.01, "grad_norm": 5.6630265828087, "learning_rate": 1.999989266699714e-06, "loss": 1.5364, "step": 883 }, { "epoch": 0.01, "grad_norm": 4.879665992047349, "learning_rate": 1.9999892420962124e-06, "loss": 1.4612, "step": 884 }, { "epoch": 0.01, "grad_norm": 5.561642320920106, "learning_rate": 1.9999892174645447e-06, "loss": 1.3238, "step": 885 }, { "epoch": 0.01, "grad_norm": 4.803746896535269, "learning_rate": 1.9999891928047106e-06, "loss": 1.3759, "step": 886 }, { "epoch": 0.01, "grad_norm": 4.881001838381561, "learning_rate": 1.99998916811671e-06, "loss": 1.3259, "step": 887 }, { "epoch": 0.01, "grad_norm": 4.9185588818445165, "learning_rate": 1.9999891434005433e-06, "loss": 1.468, "step": 888 }, { "epoch": 0.01, "grad_norm": 6.4140094573174045, "learning_rate": 1.9999891186562096e-06, "loss": 1.5786, "step": 889 }, { "epoch": 0.01, "grad_norm": 5.4016730660955625, "learning_rate": 1.99998909388371e-06, "loss": 1.5057, "step": 890 }, { "epoch": 0.01, "grad_norm": 5.4186135870000145, "learning_rate": 1.999989069083044e-06, "loss": 1.5624, "step": 891 }, { "epoch": 0.01, "grad_norm": 4.747116381476095, "learning_rate": 1.9999890442542115e-06, "loss": 1.4637, "step": 892 }, { "epoch": 0.01, "grad_norm": 5.40275388365515, "learning_rate": 1.9999890193972127e-06, "loss": 1.5146, "step": 893 }, { "epoch": 0.01, "grad_norm": 5.991497086501674, "learning_rate": 1.9999889945120475e-06, "loss": 1.4213, "step": 894 }, { "epoch": 0.01, "grad_norm": 5.054323720643013, "learning_rate": 1.999988969598716e-06, "loss": 1.523, "step": 895 }, { "epoch": 0.01, "grad_norm": 4.610288116972087, "learning_rate": 1.9999889446572184e-06, "loss": 1.351, "step": 896 }, { "epoch": 0.01, "grad_norm": 5.3918041618620896, "learning_rate": 1.999988919687554e-06, "loss": 1.5724, "step": 897 }, { "epoch": 0.01, "grad_norm": 5.833705802062882, "learning_rate": 1.9999888946897233e-06, "loss": 1.572, "step": 898 }, { "epoch": 0.01, "grad_norm": 5.166823884179871, "learning_rate": 1.999988869663726e-06, "loss": 1.3641, "step": 899 }, { "epoch": 0.01, "grad_norm": 4.444407021344696, "learning_rate": 1.999988844609563e-06, "loss": 1.4219, "step": 900 }, { "epoch": 0.01, "grad_norm": 4.960252287317098, "learning_rate": 1.9999888195272332e-06, "loss": 1.4062, "step": 901 }, { "epoch": 0.01, "grad_norm": 5.074258501650094, "learning_rate": 1.9999887944167374e-06, "loss": 1.5055, "step": 902 }, { "epoch": 0.01, "grad_norm": 4.815381295386099, "learning_rate": 1.9999887692780747e-06, "loss": 1.5434, "step": 903 }, { "epoch": 0.01, "grad_norm": 4.811556472980446, "learning_rate": 1.999988744111246e-06, "loss": 1.4531, "step": 904 }, { "epoch": 0.01, "grad_norm": 6.5338580513110704, "learning_rate": 1.999988718916251e-06, "loss": 1.5654, "step": 905 }, { "epoch": 0.01, "grad_norm": 4.8547503509093985, "learning_rate": 1.9999886936930897e-06, "loss": 1.4963, "step": 906 }, { "epoch": 0.01, "grad_norm": 4.9529825947141655, "learning_rate": 1.9999886684417614e-06, "loss": 1.424, "step": 907 }, { "epoch": 0.01, "grad_norm": 4.992257236115625, "learning_rate": 1.9999886431622673e-06, "loss": 1.4438, "step": 908 }, { "epoch": 0.01, "grad_norm": 4.607129503512642, "learning_rate": 1.999988617854607e-06, "loss": 1.4505, "step": 909 }, { "epoch": 0.01, "grad_norm": 6.157982067067892, "learning_rate": 1.99998859251878e-06, "loss": 1.5385, "step": 910 }, { "epoch": 0.01, "grad_norm": 5.084561010511645, "learning_rate": 1.999988567154787e-06, "loss": 1.4966, "step": 911 }, { "epoch": 0.01, "grad_norm": 4.81877282821209, "learning_rate": 1.999988541762627e-06, "loss": 1.4731, "step": 912 }, { "epoch": 0.01, "grad_norm": 5.009288310745413, "learning_rate": 1.9999885163423014e-06, "loss": 1.432, "step": 913 }, { "epoch": 0.01, "grad_norm": 5.2256414414976184, "learning_rate": 1.999988490893809e-06, "loss": 1.5411, "step": 914 }, { "epoch": 0.01, "grad_norm": 5.261047012186723, "learning_rate": 1.99998846541715e-06, "loss": 1.6397, "step": 915 }, { "epoch": 0.01, "grad_norm": 4.657445695072026, "learning_rate": 1.9999884399123252e-06, "loss": 1.4764, "step": 916 }, { "epoch": 0.01, "grad_norm": 5.668390029076262, "learning_rate": 1.999988414379334e-06, "loss": 1.5984, "step": 917 }, { "epoch": 0.01, "grad_norm": 4.810866617300379, "learning_rate": 1.9999883888181764e-06, "loss": 1.4278, "step": 918 }, { "epoch": 0.01, "grad_norm": 8.825331474808909, "learning_rate": 1.9999883632288524e-06, "loss": 1.5525, "step": 919 }, { "epoch": 0.01, "grad_norm": 6.055755208418581, "learning_rate": 1.999988337611362e-06, "loss": 1.4505, "step": 920 }, { "epoch": 0.01, "grad_norm": 4.750972505525963, "learning_rate": 1.999988311965705e-06, "loss": 1.486, "step": 921 }, { "epoch": 0.01, "grad_norm": 6.217010575239663, "learning_rate": 1.999988286291882e-06, "loss": 1.7214, "step": 922 }, { "epoch": 0.01, "grad_norm": 5.507300171459353, "learning_rate": 1.9999882605898925e-06, "loss": 1.346, "step": 923 }, { "epoch": 0.01, "grad_norm": 5.017321187244063, "learning_rate": 1.999988234859737e-06, "loss": 1.5818, "step": 924 }, { "epoch": 0.01, "grad_norm": 5.028645991367795, "learning_rate": 1.999988209101415e-06, "loss": 1.5805, "step": 925 }, { "epoch": 0.01, "grad_norm": 7.2062677616231765, "learning_rate": 1.9999881833149264e-06, "loss": 1.6537, "step": 926 }, { "epoch": 0.01, "grad_norm": 5.145636588072226, "learning_rate": 1.9999881575002717e-06, "loss": 1.4681, "step": 927 }, { "epoch": 0.01, "grad_norm": 4.828710624451239, "learning_rate": 1.99998813165745e-06, "loss": 1.496, "step": 928 }, { "epoch": 0.01, "grad_norm": 5.549188105899666, "learning_rate": 1.9999881057864627e-06, "loss": 1.3865, "step": 929 }, { "epoch": 0.01, "grad_norm": 5.443520100837284, "learning_rate": 1.9999880798873093e-06, "loss": 1.2829, "step": 930 }, { "epoch": 0.01, "grad_norm": 5.13373771642738, "learning_rate": 1.999988053959989e-06, "loss": 1.5362, "step": 931 }, { "epoch": 0.01, "grad_norm": 4.999494186839008, "learning_rate": 1.9999880280045025e-06, "loss": 1.3462, "step": 932 }, { "epoch": 0.01, "grad_norm": 5.190959037095125, "learning_rate": 1.9999880020208495e-06, "loss": 1.4841, "step": 933 }, { "epoch": 0.01, "grad_norm": 4.780119303450498, "learning_rate": 1.9999879760090306e-06, "loss": 1.5854, "step": 934 }, { "epoch": 0.01, "grad_norm": 5.066755607666266, "learning_rate": 1.9999879499690452e-06, "loss": 1.3359, "step": 935 }, { "epoch": 0.01, "grad_norm": 5.252072684548736, "learning_rate": 1.999987923900893e-06, "loss": 1.4493, "step": 936 }, { "epoch": 0.01, "grad_norm": 4.711513346090706, "learning_rate": 1.999987897804575e-06, "loss": 1.2207, "step": 937 }, { "epoch": 0.01, "grad_norm": 7.728638763732817, "learning_rate": 1.9999878716800904e-06, "loss": 1.3846, "step": 938 }, { "epoch": 0.01, "grad_norm": 5.278677797997787, "learning_rate": 1.9999878455274396e-06, "loss": 1.336, "step": 939 }, { "epoch": 0.01, "grad_norm": 5.0220126825749425, "learning_rate": 1.9999878193466223e-06, "loss": 1.4228, "step": 940 }, { "epoch": 0.01, "grad_norm": 5.034910106052013, "learning_rate": 1.999987793137639e-06, "loss": 1.3907, "step": 941 }, { "epoch": 0.01, "grad_norm": 4.889404163229133, "learning_rate": 1.9999877669004894e-06, "loss": 1.4144, "step": 942 }, { "epoch": 0.01, "grad_norm": 5.162469539125017, "learning_rate": 1.999987740635173e-06, "loss": 1.5852, "step": 943 }, { "epoch": 0.01, "grad_norm": 5.399193356261801, "learning_rate": 1.9999877143416906e-06, "loss": 1.5513, "step": 944 }, { "epoch": 0.01, "grad_norm": 5.138316370644956, "learning_rate": 1.9999876880200418e-06, "loss": 1.467, "step": 945 }, { "epoch": 0.01, "grad_norm": 4.767554379371538, "learning_rate": 1.9999876616702266e-06, "loss": 1.4015, "step": 946 }, { "epoch": 0.01, "grad_norm": 4.432154096584277, "learning_rate": 1.9999876352922455e-06, "loss": 1.3144, "step": 947 }, { "epoch": 0.01, "grad_norm": 5.6901051217643035, "learning_rate": 1.9999876088860975e-06, "loss": 1.6762, "step": 948 }, { "epoch": 0.01, "grad_norm": 4.89608317041777, "learning_rate": 1.999987582451783e-06, "loss": 1.4469, "step": 949 }, { "epoch": 0.01, "eval_loss": 1.6775009632110596, "eval_runtime": 4.6401, "eval_samples_per_second": 1.94, "eval_steps_per_second": 1.078, "step": 949 }, { "epoch": 0.01, "grad_norm": 4.685652449039854, "learning_rate": 1.999987555989303e-06, "loss": 1.3646, "step": 950 }, { "epoch": 0.01, "grad_norm": 5.980010862799234, "learning_rate": 1.9999875294986562e-06, "loss": 1.5188, "step": 951 }, { "epoch": 0.01, "grad_norm": 5.218097124666707, "learning_rate": 1.999987502979843e-06, "loss": 1.4779, "step": 952 }, { "epoch": 0.01, "grad_norm": 4.803466971822877, "learning_rate": 1.9999874764328637e-06, "loss": 1.3129, "step": 953 }, { "epoch": 0.01, "grad_norm": 4.729648111188237, "learning_rate": 1.999987449857718e-06, "loss": 1.4093, "step": 954 }, { "epoch": 0.01, "grad_norm": 5.666417668066331, "learning_rate": 1.999987423254406e-06, "loss": 1.637, "step": 955 }, { "epoch": 0.01, "grad_norm": 4.92053994601316, "learning_rate": 1.999987396622928e-06, "loss": 1.4616, "step": 956 }, { "epoch": 0.01, "grad_norm": 4.729794144404417, "learning_rate": 1.999987369963283e-06, "loss": 1.2795, "step": 957 }, { "epoch": 0.01, "grad_norm": 5.48059643960075, "learning_rate": 1.999987343275472e-06, "loss": 1.4871, "step": 958 }, { "epoch": 0.01, "grad_norm": 4.807955183113965, "learning_rate": 1.9999873165594945e-06, "loss": 1.4542, "step": 959 }, { "epoch": 0.01, "grad_norm": 5.3782702475558475, "learning_rate": 1.999987289815351e-06, "loss": 1.4009, "step": 960 }, { "epoch": 0.01, "grad_norm": 5.465383132137198, "learning_rate": 1.999987263043041e-06, "loss": 1.5234, "step": 961 }, { "epoch": 0.01, "grad_norm": 4.863141270288387, "learning_rate": 1.9999872362425646e-06, "loss": 1.328, "step": 962 }, { "epoch": 0.01, "grad_norm": 5.513281389437473, "learning_rate": 1.999987209413922e-06, "loss": 1.3662, "step": 963 }, { "epoch": 0.01, "grad_norm": 6.182406119541521, "learning_rate": 1.9999871825571133e-06, "loss": 1.4304, "step": 964 }, { "epoch": 0.01, "grad_norm": 4.682777815258765, "learning_rate": 1.999987155672138e-06, "loss": 1.3698, "step": 965 }, { "epoch": 0.01, "grad_norm": 5.698433297833218, "learning_rate": 1.999987128758997e-06, "loss": 1.4941, "step": 966 }, { "epoch": 0.01, "grad_norm": 5.122381225960383, "learning_rate": 1.9999871018176888e-06, "loss": 1.4768, "step": 967 }, { "epoch": 0.01, "grad_norm": 4.72681263242357, "learning_rate": 1.999987074848215e-06, "loss": 1.3749, "step": 968 }, { "epoch": 0.01, "grad_norm": 5.507215592270518, "learning_rate": 1.999987047850574e-06, "loss": 1.5244, "step": 969 }, { "epoch": 0.01, "grad_norm": 4.870388076906783, "learning_rate": 1.9999870208247677e-06, "loss": 1.5131, "step": 970 }, { "epoch": 0.01, "grad_norm": 4.513166729875274, "learning_rate": 1.999986993770794e-06, "loss": 1.4241, "step": 971 }, { "epoch": 0.01, "grad_norm": 4.820438122653338, "learning_rate": 1.999986966688655e-06, "loss": 1.2826, "step": 972 }, { "epoch": 0.01, "grad_norm": 4.839651321923382, "learning_rate": 1.9999869395783495e-06, "loss": 1.2476, "step": 973 }, { "epoch": 0.01, "grad_norm": 5.720429959744313, "learning_rate": 1.999986912439877e-06, "loss": 1.4291, "step": 974 }, { "epoch": 0.01, "grad_norm": 4.4799874056887745, "learning_rate": 1.999986885273239e-06, "loss": 1.3034, "step": 975 }, { "epoch": 0.01, "grad_norm": 5.966740554354801, "learning_rate": 1.9999868580784343e-06, "loss": 1.346, "step": 976 }, { "epoch": 0.01, "grad_norm": 4.748631285522209, "learning_rate": 1.9999868308554637e-06, "loss": 1.3915, "step": 977 }, { "epoch": 0.01, "grad_norm": 5.117205503667891, "learning_rate": 1.9999868036043262e-06, "loss": 1.531, "step": 978 }, { "epoch": 0.01, "grad_norm": 4.761977448039064, "learning_rate": 1.999986776325023e-06, "loss": 1.5092, "step": 979 }, { "epoch": 0.01, "grad_norm": 5.383839203745775, "learning_rate": 1.999986749017553e-06, "loss": 1.4312, "step": 980 }, { "epoch": 0.01, "grad_norm": 4.898589027829305, "learning_rate": 1.999986721681917e-06, "loss": 1.4059, "step": 981 }, { "epoch": 0.01, "grad_norm": 7.05243832236956, "learning_rate": 1.9999866943181144e-06, "loss": 1.4345, "step": 982 }, { "epoch": 0.01, "grad_norm": 8.171888830122464, "learning_rate": 1.999986666926146e-06, "loss": 1.5769, "step": 983 }, { "epoch": 0.01, "grad_norm": 5.268910529823187, "learning_rate": 1.999986639506011e-06, "loss": 1.5645, "step": 984 }, { "epoch": 0.01, "grad_norm": 5.1967337373113525, "learning_rate": 1.9999866120577097e-06, "loss": 1.5157, "step": 985 }, { "epoch": 0.01, "grad_norm": 5.42988030195917, "learning_rate": 1.999986584581242e-06, "loss": 1.5816, "step": 986 }, { "epoch": 0.01, "grad_norm": 4.73138904838603, "learning_rate": 1.999986557076608e-06, "loss": 1.5813, "step": 987 }, { "epoch": 0.01, "grad_norm": 4.753580436572943, "learning_rate": 1.999986529543808e-06, "loss": 1.4126, "step": 988 }, { "epoch": 0.01, "grad_norm": 4.7046114145354565, "learning_rate": 1.999986501982841e-06, "loss": 1.5255, "step": 989 }, { "epoch": 0.01, "grad_norm": 7.691510890894805, "learning_rate": 1.9999864743937083e-06, "loss": 1.4457, "step": 990 }, { "epoch": 0.01, "grad_norm": 5.010108387001166, "learning_rate": 1.9999864467764095e-06, "loss": 1.2545, "step": 991 }, { "epoch": 0.01, "grad_norm": 5.100682664462816, "learning_rate": 1.9999864191309444e-06, "loss": 1.39, "step": 992 }, { "epoch": 0.01, "grad_norm": 5.918040690814786, "learning_rate": 1.9999863914573124e-06, "loss": 1.3931, "step": 993 }, { "epoch": 0.01, "grad_norm": 13.32376088870836, "learning_rate": 1.9999863637555145e-06, "loss": 1.5364, "step": 994 }, { "epoch": 0.01, "grad_norm": 6.404294035760159, "learning_rate": 1.9999863360255502e-06, "loss": 1.6188, "step": 995 }, { "epoch": 0.01, "grad_norm": 4.395708410571946, "learning_rate": 1.99998630826742e-06, "loss": 1.4324, "step": 996 }, { "epoch": 0.01, "grad_norm": 4.64843688225223, "learning_rate": 1.999986280481123e-06, "loss": 1.4221, "step": 997 }, { "epoch": 0.01, "grad_norm": 9.483948754696783, "learning_rate": 1.99998625266666e-06, "loss": 1.4758, "step": 998 }, { "epoch": 0.01, "grad_norm": 6.680984970928925, "learning_rate": 1.999986224824031e-06, "loss": 1.3525, "step": 999 }, { "epoch": 0.01, "grad_norm": 4.5483509269337326, "learning_rate": 1.9999861969532346e-06, "loss": 1.5931, "step": 1000 }, { "epoch": 0.01, "grad_norm": 4.685591189533344, "learning_rate": 1.999986169054273e-06, "loss": 1.4752, "step": 1001 }, { "epoch": 0.01, "grad_norm": 5.027111632819377, "learning_rate": 1.9999861411271447e-06, "loss": 1.4839, "step": 1002 }, { "epoch": 0.01, "grad_norm": 4.813630389301654, "learning_rate": 1.99998611317185e-06, "loss": 1.2419, "step": 1003 }, { "epoch": 0.01, "grad_norm": 4.8555527438063875, "learning_rate": 1.9999860851883896e-06, "loss": 1.5733, "step": 1004 }, { "epoch": 0.01, "grad_norm": 4.863459059811092, "learning_rate": 1.9999860571767623e-06, "loss": 1.4519, "step": 1005 }, { "epoch": 0.01, "grad_norm": 8.069266897105678, "learning_rate": 1.999986029136969e-06, "loss": 1.5143, "step": 1006 }, { "epoch": 0.01, "grad_norm": 6.396817433935851, "learning_rate": 1.9999860010690093e-06, "loss": 1.4442, "step": 1007 }, { "epoch": 0.01, "grad_norm": 4.846992814886912, "learning_rate": 1.9999859729728833e-06, "loss": 1.514, "step": 1008 }, { "epoch": 0.01, "grad_norm": 4.610189787028616, "learning_rate": 1.9999859448485912e-06, "loss": 1.3104, "step": 1009 }, { "epoch": 0.01, "grad_norm": 5.131296471286178, "learning_rate": 1.999985916696133e-06, "loss": 1.3135, "step": 1010 }, { "epoch": 0.01, "grad_norm": 5.091140293402042, "learning_rate": 1.999985888515508e-06, "loss": 1.5342, "step": 1011 }, { "epoch": 0.01, "grad_norm": 5.120127148783947, "learning_rate": 1.999985860306717e-06, "loss": 1.5248, "step": 1012 }, { "epoch": 0.01, "grad_norm": 5.479120892520487, "learning_rate": 1.9999858320697597e-06, "loss": 1.5342, "step": 1013 }, { "epoch": 0.01, "grad_norm": 5.065215097181712, "learning_rate": 1.999985803804636e-06, "loss": 1.5236, "step": 1014 }, { "epoch": 0.01, "grad_norm": 4.790915278833422, "learning_rate": 1.9999857755113463e-06, "loss": 1.5493, "step": 1015 }, { "epoch": 0.01, "grad_norm": 5.245342572510602, "learning_rate": 1.99998574718989e-06, "loss": 1.4241, "step": 1016 }, { "epoch": 0.01, "grad_norm": 5.450685311013884, "learning_rate": 1.9999857188402677e-06, "loss": 1.5392, "step": 1017 }, { "epoch": 0.01, "grad_norm": 5.167929668678754, "learning_rate": 1.9999856904624786e-06, "loss": 1.4839, "step": 1018 }, { "epoch": 0.01, "grad_norm": 4.655577606549074, "learning_rate": 1.999985662056524e-06, "loss": 1.3708, "step": 1019 }, { "epoch": 0.01, "grad_norm": 5.083589051170835, "learning_rate": 1.9999856336224026e-06, "loss": 1.4276, "step": 1020 }, { "epoch": 0.01, "grad_norm": 7.518979837632458, "learning_rate": 1.9999856051601156e-06, "loss": 1.5506, "step": 1021 }, { "epoch": 0.01, "grad_norm": 9.022963635961508, "learning_rate": 1.9999855766696614e-06, "loss": 1.503, "step": 1022 }, { "epoch": 0.01, "eval_loss": 1.6751856803894043, "eval_runtime": 4.6457, "eval_samples_per_second": 1.937, "eval_steps_per_second": 1.076, "step": 1022 }, { "epoch": 0.01, "grad_norm": 4.888363610346068, "learning_rate": 1.9999855481510416e-06, "loss": 1.3999, "step": 1023 }, { "epoch": 0.01, "grad_norm": 6.785844821190048, "learning_rate": 1.999985519604255e-06, "loss": 1.5704, "step": 1024 }, { "epoch": 0.01, "grad_norm": 5.817704365569473, "learning_rate": 1.9999854910293026e-06, "loss": 1.5568, "step": 1025 }, { "epoch": 0.01, "grad_norm": 4.359712088912163, "learning_rate": 1.9999854624261837e-06, "loss": 1.2568, "step": 1026 }, { "epoch": 0.01, "grad_norm": 5.591954237737313, "learning_rate": 1.999985433794899e-06, "loss": 1.4747, "step": 1027 }, { "epoch": 0.01, "grad_norm": 5.102013647933071, "learning_rate": 1.9999854051354476e-06, "loss": 1.4146, "step": 1028 }, { "epoch": 0.01, "grad_norm": 4.931384249700325, "learning_rate": 1.99998537644783e-06, "loss": 1.4324, "step": 1029 }, { "epoch": 0.01, "grad_norm": 4.59036254886226, "learning_rate": 1.999985347732046e-06, "loss": 1.467, "step": 1030 }, { "epoch": 0.01, "grad_norm": 4.994175894124642, "learning_rate": 1.999985318988096e-06, "loss": 1.3865, "step": 1031 }, { "epoch": 0.01, "grad_norm": 4.545053819093002, "learning_rate": 1.999985290215979e-06, "loss": 1.4351, "step": 1032 }, { "epoch": 0.01, "grad_norm": 5.004106893385651, "learning_rate": 1.999985261415697e-06, "loss": 1.4621, "step": 1033 }, { "epoch": 0.01, "grad_norm": 6.279172341870177, "learning_rate": 1.9999852325872476e-06, "loss": 1.7436, "step": 1034 }, { "epoch": 0.01, "grad_norm": 5.343377788723042, "learning_rate": 1.9999852037306325e-06, "loss": 1.5041, "step": 1035 }, { "epoch": 0.01, "grad_norm": 5.268385681335901, "learning_rate": 1.999985174845851e-06, "loss": 1.5105, "step": 1036 }, { "epoch": 0.01, "grad_norm": 6.048875950329858, "learning_rate": 1.9999851459329036e-06, "loss": 1.5769, "step": 1037 }, { "epoch": 0.01, "grad_norm": 5.74228089002837, "learning_rate": 1.9999851169917897e-06, "loss": 1.5507, "step": 1038 }, { "epoch": 0.01, "grad_norm": 4.698397540213852, "learning_rate": 1.9999850880225095e-06, "loss": 1.3009, "step": 1039 }, { "epoch": 0.01, "grad_norm": 4.641564143897353, "learning_rate": 1.999985059025063e-06, "loss": 1.3415, "step": 1040 }, { "epoch": 0.01, "grad_norm": 4.77396271067677, "learning_rate": 1.99998502999945e-06, "loss": 1.4983, "step": 1041 }, { "epoch": 0.01, "grad_norm": 4.772762227532076, "learning_rate": 1.9999850009456713e-06, "loss": 1.3664, "step": 1042 }, { "epoch": 0.01, "grad_norm": 4.914962455888292, "learning_rate": 1.999984971863726e-06, "loss": 1.4483, "step": 1043 }, { "epoch": 0.01, "grad_norm": 5.292636683131925, "learning_rate": 1.9999849427536147e-06, "loss": 1.614, "step": 1044 }, { "epoch": 0.01, "grad_norm": 4.816438676458731, "learning_rate": 1.999984913615337e-06, "loss": 1.4438, "step": 1045 }, { "epoch": 0.01, "grad_norm": 4.61943572442077, "learning_rate": 1.999984884448893e-06, "loss": 1.3256, "step": 1046 }, { "epoch": 0.01, "grad_norm": 16.818243184224574, "learning_rate": 1.999984855254283e-06, "loss": 1.5516, "step": 1047 }, { "epoch": 0.01, "grad_norm": 4.944906131189107, "learning_rate": 1.9999848260315064e-06, "loss": 1.4782, "step": 1048 }, { "epoch": 0.01, "grad_norm": 5.372035876331552, "learning_rate": 1.9999847967805635e-06, "loss": 1.5814, "step": 1049 }, { "epoch": 0.01, "grad_norm": 5.05598468387018, "learning_rate": 1.9999847675014548e-06, "loss": 1.4904, "step": 1050 }, { "epoch": 0.01, "grad_norm": 8.983166835216965, "learning_rate": 1.9999847381941796e-06, "loss": 1.5657, "step": 1051 }, { "epoch": 0.01, "grad_norm": 5.880234134964031, "learning_rate": 1.999984708858738e-06, "loss": 1.2615, "step": 1052 }, { "epoch": 0.01, "grad_norm": 5.383111410300504, "learning_rate": 1.9999846794951305e-06, "loss": 1.5257, "step": 1053 }, { "epoch": 0.01, "grad_norm": 5.319783609516555, "learning_rate": 1.9999846501033566e-06, "loss": 1.3974, "step": 1054 }, { "epoch": 0.01, "grad_norm": 5.276781780830751, "learning_rate": 1.9999846206834163e-06, "loss": 1.5434, "step": 1055 }, { "epoch": 0.01, "grad_norm": 6.704196444791288, "learning_rate": 1.99998459123531e-06, "loss": 1.4933, "step": 1056 }, { "epoch": 0.01, "grad_norm": 7.249291359747204, "learning_rate": 1.9999845617590374e-06, "loss": 1.4506, "step": 1057 }, { "epoch": 0.01, "grad_norm": 4.675801641902775, "learning_rate": 1.9999845322545983e-06, "loss": 1.582, "step": 1058 }, { "epoch": 0.01, "grad_norm": 5.63075829111936, "learning_rate": 1.9999845027219933e-06, "loss": 1.5674, "step": 1059 }, { "epoch": 0.01, "grad_norm": 6.305134362562168, "learning_rate": 1.999984473161222e-06, "loss": 1.1858, "step": 1060 }, { "epoch": 0.01, "grad_norm": 4.923187333888181, "learning_rate": 1.999984443572284e-06, "loss": 1.5338, "step": 1061 }, { "epoch": 0.01, "grad_norm": 5.615589160701651, "learning_rate": 1.9999844139551804e-06, "loss": 1.3402, "step": 1062 }, { "epoch": 0.01, "grad_norm": 5.080506923679196, "learning_rate": 1.9999843843099103e-06, "loss": 1.4556, "step": 1063 }, { "epoch": 0.01, "grad_norm": 4.739895475840536, "learning_rate": 1.9999843546364742e-06, "loss": 1.4932, "step": 1064 }, { "epoch": 0.01, "grad_norm": 4.404861163545689, "learning_rate": 1.9999843249348718e-06, "loss": 1.3194, "step": 1065 }, { "epoch": 0.01, "grad_norm": 6.345308311669248, "learning_rate": 1.999984295205103e-06, "loss": 1.3354, "step": 1066 }, { "epoch": 0.01, "grad_norm": 5.01753123609855, "learning_rate": 1.9999842654471677e-06, "loss": 1.4866, "step": 1067 }, { "epoch": 0.01, "grad_norm": 5.5591172662604595, "learning_rate": 1.9999842356610664e-06, "loss": 1.3546, "step": 1068 }, { "epoch": 0.01, "grad_norm": 4.7557963260332565, "learning_rate": 1.999984205846799e-06, "loss": 1.4518, "step": 1069 }, { "epoch": 0.01, "grad_norm": 4.562175319819017, "learning_rate": 1.9999841760043657e-06, "loss": 1.3072, "step": 1070 }, { "epoch": 0.01, "grad_norm": 4.812788016867621, "learning_rate": 1.9999841461337653e-06, "loss": 1.371, "step": 1071 }, { "epoch": 0.01, "grad_norm": 4.67438764053753, "learning_rate": 1.9999841162349994e-06, "loss": 1.4096, "step": 1072 }, { "epoch": 0.01, "grad_norm": 4.993252181714291, "learning_rate": 1.999984086308067e-06, "loss": 1.4942, "step": 1073 }, { "epoch": 0.01, "grad_norm": 4.910843860455304, "learning_rate": 1.9999840563529685e-06, "loss": 1.4151, "step": 1074 }, { "epoch": 0.01, "grad_norm": 19.5766042127858, "learning_rate": 1.999984026369704e-06, "loss": 1.4966, "step": 1075 }, { "epoch": 0.01, "grad_norm": 5.483294588888528, "learning_rate": 1.9999839963582724e-06, "loss": 1.4772, "step": 1076 }, { "epoch": 0.01, "grad_norm": 4.999652845868447, "learning_rate": 1.9999839663186754e-06, "loss": 1.496, "step": 1077 }, { "epoch": 0.01, "grad_norm": 5.272781233458372, "learning_rate": 1.999983936250912e-06, "loss": 1.4132, "step": 1078 }, { "epoch": 0.01, "grad_norm": 4.980996814545554, "learning_rate": 1.9999839061549822e-06, "loss": 1.2901, "step": 1079 }, { "epoch": 0.01, "grad_norm": 4.953763987949976, "learning_rate": 1.999983876030886e-06, "loss": 1.5155, "step": 1080 }, { "epoch": 0.01, "grad_norm": 5.403058384399181, "learning_rate": 1.9999838458786244e-06, "loss": 1.5269, "step": 1081 }, { "epoch": 0.01, "grad_norm": 6.962939986871904, "learning_rate": 1.999983815698196e-06, "loss": 1.5753, "step": 1082 }, { "epoch": 0.01, "grad_norm": 5.283384276439903, "learning_rate": 1.999983785489601e-06, "loss": 1.5311, "step": 1083 }, { "epoch": 0.01, "grad_norm": 5.614239854059685, "learning_rate": 1.9999837552528406e-06, "loss": 1.5236, "step": 1084 }, { "epoch": 0.01, "grad_norm": 4.5938014662143, "learning_rate": 1.9999837249879133e-06, "loss": 1.449, "step": 1085 }, { "epoch": 0.01, "grad_norm": 4.828053203793281, "learning_rate": 1.99998369469482e-06, "loss": 1.3608, "step": 1086 }, { "epoch": 0.01, "grad_norm": 4.86723210959235, "learning_rate": 1.999983664373561e-06, "loss": 1.3185, "step": 1087 }, { "epoch": 0.01, "grad_norm": 5.15651657506136, "learning_rate": 1.999983634024135e-06, "loss": 1.3037, "step": 1088 }, { "epoch": 0.01, "grad_norm": 4.781841302352801, "learning_rate": 1.999983603646543e-06, "loss": 1.5157, "step": 1089 }, { "epoch": 0.01, "grad_norm": 5.6250573996377256, "learning_rate": 1.999983573240785e-06, "loss": 1.4612, "step": 1090 }, { "epoch": 0.01, "grad_norm": 4.853270627798325, "learning_rate": 1.999983542806861e-06, "loss": 1.3915, "step": 1091 }, { "epoch": 0.01, "grad_norm": 5.416508032931958, "learning_rate": 1.99998351234477e-06, "loss": 1.4846, "step": 1092 }, { "epoch": 0.01, "grad_norm": 5.154167527363172, "learning_rate": 1.9999834818545135e-06, "loss": 1.4909, "step": 1093 }, { "epoch": 0.01, "grad_norm": 5.015726983035818, "learning_rate": 1.9999834513360905e-06, "loss": 1.4462, "step": 1094 }, { "epoch": 0.01, "grad_norm": 4.859626471859029, "learning_rate": 1.9999834207895015e-06, "loss": 1.5644, "step": 1095 }, { "epoch": 0.01, "eval_loss": 1.6735153198242188, "eval_runtime": 4.6256, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 1095 }, { "epoch": 0.01, "grad_norm": 4.566823707743832, "learning_rate": 1.999983390214746e-06, "loss": 1.4629, "step": 1096 }, { "epoch": 0.01, "grad_norm": 4.735335408472524, "learning_rate": 1.9999833596118244e-06, "loss": 1.4391, "step": 1097 }, { "epoch": 0.01, "grad_norm": 4.790636259147894, "learning_rate": 1.999983328980737e-06, "loss": 1.4981, "step": 1098 }, { "epoch": 0.01, "grad_norm": 5.349178643835508, "learning_rate": 1.999983298321483e-06, "loss": 1.3753, "step": 1099 }, { "epoch": 0.01, "grad_norm": 5.263976566555674, "learning_rate": 1.9999832676340625e-06, "loss": 1.55, "step": 1100 }, { "epoch": 0.01, "grad_norm": 4.657912236522682, "learning_rate": 1.9999832369184764e-06, "loss": 1.4841, "step": 1101 }, { "epoch": 0.01, "grad_norm": 4.867208669497604, "learning_rate": 1.999983206174724e-06, "loss": 1.4229, "step": 1102 }, { "epoch": 0.01, "grad_norm": 4.862064804403664, "learning_rate": 1.9999831754028052e-06, "loss": 1.4944, "step": 1103 }, { "epoch": 0.01, "grad_norm": 5.3872647723902585, "learning_rate": 1.99998314460272e-06, "loss": 1.4938, "step": 1104 }, { "epoch": 0.01, "grad_norm": 4.878142365865869, "learning_rate": 1.9999831137744693e-06, "loss": 1.4478, "step": 1105 }, { "epoch": 0.01, "grad_norm": 4.632417620908414, "learning_rate": 1.9999830829180517e-06, "loss": 1.4551, "step": 1106 }, { "epoch": 0.01, "grad_norm": 4.666710624917455, "learning_rate": 1.999983052033468e-06, "loss": 1.4641, "step": 1107 }, { "epoch": 0.01, "grad_norm": 4.732751527383862, "learning_rate": 1.9999830211207183e-06, "loss": 1.3838, "step": 1108 }, { "epoch": 0.01, "grad_norm": 6.192163188860683, "learning_rate": 1.9999829901798025e-06, "loss": 1.5361, "step": 1109 }, { "epoch": 0.01, "grad_norm": 6.029036683181163, "learning_rate": 1.9999829592107202e-06, "loss": 1.5428, "step": 1110 }, { "epoch": 0.01, "grad_norm": 5.235770433822521, "learning_rate": 1.999982928213472e-06, "loss": 1.4727, "step": 1111 }, { "epoch": 0.01, "grad_norm": 17.301123468125486, "learning_rate": 1.9999828971880574e-06, "loss": 1.3757, "step": 1112 }, { "epoch": 0.01, "grad_norm": 4.804238410996875, "learning_rate": 1.9999828661344764e-06, "loss": 1.3589, "step": 1113 }, { "epoch": 0.01, "grad_norm": 4.616566753097097, "learning_rate": 1.9999828350527295e-06, "loss": 1.3691, "step": 1114 }, { "epoch": 0.01, "grad_norm": 5.347116928777432, "learning_rate": 1.999982803942817e-06, "loss": 1.4647, "step": 1115 }, { "epoch": 0.01, "grad_norm": 4.932531917778219, "learning_rate": 1.9999827728047373e-06, "loss": 1.4675, "step": 1116 }, { "epoch": 0.01, "grad_norm": 4.660294529470657, "learning_rate": 1.999982741638492e-06, "loss": 1.4151, "step": 1117 }, { "epoch": 0.01, "grad_norm": 7.620813961955892, "learning_rate": 1.99998271044408e-06, "loss": 1.6348, "step": 1118 }, { "epoch": 0.01, "grad_norm": 5.446077565129274, "learning_rate": 1.9999826792215024e-06, "loss": 1.3873, "step": 1119 }, { "epoch": 0.01, "grad_norm": 5.0629846402349195, "learning_rate": 1.9999826479707584e-06, "loss": 1.4435, "step": 1120 }, { "epoch": 0.01, "grad_norm": 5.390133111401672, "learning_rate": 1.999982616691848e-06, "loss": 1.5374, "step": 1121 }, { "epoch": 0.01, "grad_norm": 4.635104269305596, "learning_rate": 1.9999825853847717e-06, "loss": 1.4144, "step": 1122 }, { "epoch": 0.01, "grad_norm": 4.932450441815705, "learning_rate": 1.999982554049529e-06, "loss": 1.6113, "step": 1123 }, { "epoch": 0.01, "grad_norm": 4.571391891954345, "learning_rate": 1.9999825226861202e-06, "loss": 1.385, "step": 1124 }, { "epoch": 0.01, "grad_norm": 4.463278206524636, "learning_rate": 1.999982491294545e-06, "loss": 1.3112, "step": 1125 }, { "epoch": 0.01, "grad_norm": 5.169524463711344, "learning_rate": 1.999982459874804e-06, "loss": 1.3639, "step": 1126 }, { "epoch": 0.01, "grad_norm": 4.706816189602849, "learning_rate": 1.999982428426897e-06, "loss": 1.4906, "step": 1127 }, { "epoch": 0.01, "grad_norm": 5.4003758033814435, "learning_rate": 1.9999823969508233e-06, "loss": 1.2288, "step": 1128 }, { "epoch": 0.01, "grad_norm": 5.07733560063141, "learning_rate": 1.999982365446584e-06, "loss": 1.4117, "step": 1129 }, { "epoch": 0.01, "grad_norm": 7.969998640669905, "learning_rate": 1.9999823339141778e-06, "loss": 1.3877, "step": 1130 }, { "epoch": 0.01, "grad_norm": 5.142101849234914, "learning_rate": 1.9999823023536056e-06, "loss": 1.4276, "step": 1131 }, { "epoch": 0.01, "grad_norm": 5.223738858689869, "learning_rate": 1.9999822707648675e-06, "loss": 1.4385, "step": 1132 }, { "epoch": 0.01, "grad_norm": 4.955334460812677, "learning_rate": 1.999982239147963e-06, "loss": 1.4981, "step": 1133 }, { "epoch": 0.01, "grad_norm": 5.707569119145383, "learning_rate": 1.9999822075028926e-06, "loss": 1.5537, "step": 1134 }, { "epoch": 0.01, "grad_norm": 6.259173620055015, "learning_rate": 1.999982175829656e-06, "loss": 1.4162, "step": 1135 }, { "epoch": 0.01, "grad_norm": 6.160930511739169, "learning_rate": 1.999982144128253e-06, "loss": 1.6684, "step": 1136 }, { "epoch": 0.01, "grad_norm": 12.436420759166786, "learning_rate": 1.999982112398684e-06, "loss": 1.2647, "step": 1137 }, { "epoch": 0.01, "grad_norm": 5.264705476269943, "learning_rate": 1.9999820806409487e-06, "loss": 1.5094, "step": 1138 }, { "epoch": 0.01, "grad_norm": 4.82448740492418, "learning_rate": 1.9999820488550476e-06, "loss": 1.4738, "step": 1139 }, { "epoch": 0.01, "grad_norm": 5.590049555555406, "learning_rate": 1.9999820170409797e-06, "loss": 1.4759, "step": 1140 }, { "epoch": 0.01, "grad_norm": 4.864218721440189, "learning_rate": 1.9999819851987462e-06, "loss": 1.5362, "step": 1141 }, { "epoch": 0.01, "grad_norm": 5.32497769317765, "learning_rate": 1.9999819533283464e-06, "loss": 1.4001, "step": 1142 }, { "epoch": 0.01, "grad_norm": 4.788543241974733, "learning_rate": 1.99998192142978e-06, "loss": 1.5497, "step": 1143 }, { "epoch": 0.01, "grad_norm": 8.851113964577435, "learning_rate": 1.999981889503048e-06, "loss": 1.4904, "step": 1144 }, { "epoch": 0.01, "grad_norm": 5.107648195516455, "learning_rate": 1.9999818575481494e-06, "loss": 1.5098, "step": 1145 }, { "epoch": 0.01, "grad_norm": 4.583169283696968, "learning_rate": 1.999981825565085e-06, "loss": 1.3993, "step": 1146 }, { "epoch": 0.01, "grad_norm": 4.807776598248037, "learning_rate": 1.999981793553854e-06, "loss": 1.3587, "step": 1147 }, { "epoch": 0.01, "grad_norm": 4.614047081189212, "learning_rate": 1.999981761514457e-06, "loss": 1.3482, "step": 1148 }, { "epoch": 0.01, "grad_norm": 6.35657561717519, "learning_rate": 1.999981729446894e-06, "loss": 1.5398, "step": 1149 }, { "epoch": 0.01, "grad_norm": 4.933553997697057, "learning_rate": 1.999981697351165e-06, "loss": 1.3589, "step": 1150 }, { "epoch": 0.01, "grad_norm": 5.872175454134656, "learning_rate": 1.9999816652272697e-06, "loss": 1.4732, "step": 1151 }, { "epoch": 0.01, "grad_norm": 5.288241183644056, "learning_rate": 1.999981633075208e-06, "loss": 1.3574, "step": 1152 }, { "epoch": 0.01, "grad_norm": 5.009781459825191, "learning_rate": 1.99998160089498e-06, "loss": 1.394, "step": 1153 }, { "epoch": 0.01, "grad_norm": 4.55801046260358, "learning_rate": 1.9999815686865867e-06, "loss": 1.3084, "step": 1154 }, { "epoch": 0.01, "grad_norm": 4.821680285232777, "learning_rate": 1.9999815364500263e-06, "loss": 1.4516, "step": 1155 }, { "epoch": 0.01, "grad_norm": 4.952975805784374, "learning_rate": 1.9999815041853005e-06, "loss": 1.4385, "step": 1156 }, { "epoch": 0.01, "grad_norm": 5.576070368663535, "learning_rate": 1.9999814718924082e-06, "loss": 1.3069, "step": 1157 }, { "epoch": 0.01, "grad_norm": 4.951804621604359, "learning_rate": 1.9999814395713496e-06, "loss": 1.4112, "step": 1158 }, { "epoch": 0.01, "grad_norm": 4.741717751755461, "learning_rate": 1.9999814072221254e-06, "loss": 1.4899, "step": 1159 }, { "epoch": 0.01, "grad_norm": 4.893581963366542, "learning_rate": 1.9999813748447344e-06, "loss": 1.3654, "step": 1160 }, { "epoch": 0.01, "grad_norm": 4.688325242766382, "learning_rate": 1.9999813424391775e-06, "loss": 1.4792, "step": 1161 }, { "epoch": 0.01, "grad_norm": 4.958921548979553, "learning_rate": 1.9999813100054546e-06, "loss": 1.1737, "step": 1162 }, { "epoch": 0.01, "grad_norm": 4.988069853619337, "learning_rate": 1.9999812775435653e-06, "loss": 1.5248, "step": 1163 }, { "epoch": 0.01, "grad_norm": 4.323386669021145, "learning_rate": 1.99998124505351e-06, "loss": 1.3352, "step": 1164 }, { "epoch": 0.01, "grad_norm": 4.863439791281708, "learning_rate": 1.9999812125352883e-06, "loss": 1.4522, "step": 1165 }, { "epoch": 0.01, "grad_norm": 4.50022212533205, "learning_rate": 1.9999811799889007e-06, "loss": 1.4328, "step": 1166 }, { "epoch": 0.01, "grad_norm": 5.283635450013209, "learning_rate": 1.9999811474143467e-06, "loss": 1.4352, "step": 1167 }, { "epoch": 0.01, "grad_norm": 4.613546626420899, "learning_rate": 1.999981114811627e-06, "loss": 1.4733, "step": 1168 }, { "epoch": 0.01, "eval_loss": 1.6666209697723389, "eval_runtime": 4.627, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 1168 }, { "epoch": 0.01, "grad_norm": 5.243713431888752, "learning_rate": 1.999981082180741e-06, "loss": 1.4229, "step": 1169 }, { "epoch": 0.01, "grad_norm": 5.180916470723546, "learning_rate": 1.999981049521689e-06, "loss": 1.5532, "step": 1170 }, { "epoch": 0.01, "grad_norm": 4.634457636781247, "learning_rate": 1.9999810168344707e-06, "loss": 1.4628, "step": 1171 }, { "epoch": 0.01, "grad_norm": 4.841975621960265, "learning_rate": 1.999980984119086e-06, "loss": 1.4714, "step": 1172 }, { "epoch": 0.01, "grad_norm": 4.856407984159355, "learning_rate": 1.9999809513755354e-06, "loss": 1.3953, "step": 1173 }, { "epoch": 0.01, "grad_norm": 4.924593632584216, "learning_rate": 1.9999809186038184e-06, "loss": 1.4777, "step": 1174 }, { "epoch": 0.01, "grad_norm": 10.804521073545573, "learning_rate": 1.999980885803936e-06, "loss": 1.5477, "step": 1175 }, { "epoch": 0.01, "grad_norm": 4.849633875980324, "learning_rate": 1.9999808529758865e-06, "loss": 1.3616, "step": 1176 }, { "epoch": 0.01, "grad_norm": 4.918561776125233, "learning_rate": 1.9999808201196716e-06, "loss": 1.4449, "step": 1177 }, { "epoch": 0.01, "grad_norm": 4.559134203442522, "learning_rate": 1.9999807872352903e-06, "loss": 1.373, "step": 1178 }, { "epoch": 0.01, "grad_norm": 4.7385730138344675, "learning_rate": 1.9999807543227426e-06, "loss": 1.4561, "step": 1179 }, { "epoch": 0.01, "grad_norm": 5.477240093311455, "learning_rate": 1.9999807213820294e-06, "loss": 1.3705, "step": 1180 }, { "epoch": 0.01, "grad_norm": 6.069434481225469, "learning_rate": 1.99998068841315e-06, "loss": 1.2873, "step": 1181 }, { "epoch": 0.01, "grad_norm": 4.697371499630333, "learning_rate": 1.999980655416104e-06, "loss": 1.4569, "step": 1182 }, { "epoch": 0.01, "grad_norm": 4.533455071498911, "learning_rate": 1.999980622390892e-06, "loss": 1.4891, "step": 1183 }, { "epoch": 0.01, "grad_norm": 4.807849990271847, "learning_rate": 1.9999805893375135e-06, "loss": 1.4576, "step": 1184 }, { "epoch": 0.01, "grad_norm": 5.812651120417968, "learning_rate": 1.9999805562559696e-06, "loss": 1.4451, "step": 1185 }, { "epoch": 0.01, "grad_norm": 4.764537937046826, "learning_rate": 1.9999805231462594e-06, "loss": 1.4257, "step": 1186 }, { "epoch": 0.01, "grad_norm": 5.106952053198868, "learning_rate": 1.9999804900083827e-06, "loss": 1.5414, "step": 1187 }, { "epoch": 0.01, "grad_norm": 4.655479441251472, "learning_rate": 1.99998045684234e-06, "loss": 1.4561, "step": 1188 }, { "epoch": 0.01, "grad_norm": 4.473182229797437, "learning_rate": 1.9999804236481316e-06, "loss": 1.4701, "step": 1189 }, { "epoch": 0.01, "grad_norm": 5.065099661197906, "learning_rate": 1.9999803904257566e-06, "loss": 1.4211, "step": 1190 }, { "epoch": 0.01, "grad_norm": 4.538986656004763, "learning_rate": 1.9999803571752157e-06, "loss": 1.5027, "step": 1191 }, { "epoch": 0.01, "grad_norm": 4.4128695506388365, "learning_rate": 1.9999803238965084e-06, "loss": 1.5027, "step": 1192 }, { "epoch": 0.01, "grad_norm": 6.032103944177775, "learning_rate": 1.999980290589635e-06, "loss": 1.5644, "step": 1193 }, { "epoch": 0.01, "grad_norm": 4.682343321057221, "learning_rate": 1.9999802572545963e-06, "loss": 1.5309, "step": 1194 }, { "epoch": 0.01, "grad_norm": 4.343314622774007, "learning_rate": 1.9999802238913906e-06, "loss": 1.3003, "step": 1195 }, { "epoch": 0.01, "grad_norm": 4.630952351276132, "learning_rate": 1.999980190500019e-06, "loss": 1.4954, "step": 1196 }, { "epoch": 0.01, "grad_norm": 5.198619248496427, "learning_rate": 1.9999801570804815e-06, "loss": 1.397, "step": 1197 }, { "epoch": 0.01, "grad_norm": 4.618621734517426, "learning_rate": 1.9999801236327776e-06, "loss": 1.3981, "step": 1198 }, { "epoch": 0.01, "grad_norm": 5.147623660905904, "learning_rate": 1.999980090156908e-06, "loss": 1.3596, "step": 1199 }, { "epoch": 0.01, "grad_norm": 4.946475818289631, "learning_rate": 1.999980056652872e-06, "loss": 1.5295, "step": 1200 }, { "epoch": 0.01, "grad_norm": 4.702884825661346, "learning_rate": 1.9999800231206696e-06, "loss": 1.4685, "step": 1201 }, { "epoch": 0.01, "grad_norm": 7.029321508506549, "learning_rate": 1.9999799895603013e-06, "loss": 1.2344, "step": 1202 }, { "epoch": 0.01, "grad_norm": 5.130362694352546, "learning_rate": 1.999979955971767e-06, "loss": 1.4078, "step": 1203 }, { "epoch": 0.01, "grad_norm": 4.638399765026865, "learning_rate": 1.9999799223550666e-06, "loss": 1.2877, "step": 1204 }, { "epoch": 0.01, "grad_norm": 6.419197466434014, "learning_rate": 1.9999798887102e-06, "loss": 1.5245, "step": 1205 }, { "epoch": 0.01, "grad_norm": 5.048810252708306, "learning_rate": 1.999979855037167e-06, "loss": 1.6089, "step": 1206 }, { "epoch": 0.01, "grad_norm": 5.594986736232871, "learning_rate": 1.9999798213359687e-06, "loss": 1.4686, "step": 1207 }, { "epoch": 0.01, "grad_norm": 4.341694734185055, "learning_rate": 1.999979787606604e-06, "loss": 1.2083, "step": 1208 }, { "epoch": 0.01, "grad_norm": 4.721844492494166, "learning_rate": 1.999979753849073e-06, "loss": 1.3191, "step": 1209 }, { "epoch": 0.01, "grad_norm": 4.660083437293136, "learning_rate": 1.9999797200633755e-06, "loss": 1.405, "step": 1210 }, { "epoch": 0.01, "grad_norm": 7.550917622485102, "learning_rate": 1.9999796862495127e-06, "loss": 1.392, "step": 1211 }, { "epoch": 0.01, "grad_norm": 4.855465281579127, "learning_rate": 1.999979652407483e-06, "loss": 1.2981, "step": 1212 }, { "epoch": 0.01, "grad_norm": 4.7605359185049645, "learning_rate": 1.999979618537288e-06, "loss": 1.475, "step": 1213 }, { "epoch": 0.01, "grad_norm": 4.992826613342054, "learning_rate": 1.9999795846389267e-06, "loss": 1.309, "step": 1214 }, { "epoch": 0.01, "grad_norm": 5.751213987512799, "learning_rate": 1.999979550712399e-06, "loss": 1.3181, "step": 1215 }, { "epoch": 0.01, "grad_norm": 5.56222348172062, "learning_rate": 1.999979516757705e-06, "loss": 1.4476, "step": 1216 }, { "epoch": 0.01, "grad_norm": 7.211498865140055, "learning_rate": 1.9999794827748456e-06, "loss": 1.6613, "step": 1217 }, { "epoch": 0.01, "grad_norm": 4.76843858033082, "learning_rate": 1.9999794487638195e-06, "loss": 1.455, "step": 1218 }, { "epoch": 0.01, "grad_norm": 4.971537042597962, "learning_rate": 1.9999794147246278e-06, "loss": 1.4828, "step": 1219 }, { "epoch": 0.01, "grad_norm": 4.740101198670973, "learning_rate": 1.9999793806572697e-06, "loss": 1.4563, "step": 1220 }, { "epoch": 0.01, "grad_norm": 4.772585640252896, "learning_rate": 1.9999793465617452e-06, "loss": 1.4642, "step": 1221 }, { "epoch": 0.01, "grad_norm": 7.057122514777892, "learning_rate": 1.999979312438055e-06, "loss": 1.4538, "step": 1222 }, { "epoch": 0.01, "grad_norm": 5.39056314627121, "learning_rate": 1.999979278286199e-06, "loss": 1.547, "step": 1223 }, { "epoch": 0.01, "grad_norm": 4.991611184305103, "learning_rate": 1.9999792441061764e-06, "loss": 1.4423, "step": 1224 }, { "epoch": 0.01, "grad_norm": 4.647894296933476, "learning_rate": 1.999979209897988e-06, "loss": 1.4384, "step": 1225 }, { "epoch": 0.01, "grad_norm": 4.99949285749195, "learning_rate": 1.9999791756616334e-06, "loss": 1.5835, "step": 1226 }, { "epoch": 0.01, "grad_norm": 4.567738724749353, "learning_rate": 1.9999791413971127e-06, "loss": 1.3663, "step": 1227 }, { "epoch": 0.01, "grad_norm": 5.394852681906166, "learning_rate": 1.9999791071044256e-06, "loss": 1.3405, "step": 1228 }, { "epoch": 0.01, "grad_norm": 5.3926441113626185, "learning_rate": 1.999979072783573e-06, "loss": 1.4969, "step": 1229 }, { "epoch": 0.01, "grad_norm": 5.406148666652971, "learning_rate": 1.999979038434554e-06, "loss": 1.5782, "step": 1230 }, { "epoch": 0.01, "grad_norm": 4.894426333285634, "learning_rate": 1.999979004057369e-06, "loss": 1.4171, "step": 1231 }, { "epoch": 0.01, "grad_norm": 4.662885913776113, "learning_rate": 1.9999789696520178e-06, "loss": 1.3504, "step": 1232 }, { "epoch": 0.01, "grad_norm": 4.859148631113401, "learning_rate": 1.9999789352185005e-06, "loss": 1.3593, "step": 1233 }, { "epoch": 0.01, "grad_norm": 4.501735921525826, "learning_rate": 1.999978900756817e-06, "loss": 1.497, "step": 1234 }, { "epoch": 0.01, "grad_norm": 4.767981300489952, "learning_rate": 1.999978866266968e-06, "loss": 1.4222, "step": 1235 }, { "epoch": 0.01, "grad_norm": 5.601659778166056, "learning_rate": 1.9999788317489523e-06, "loss": 1.3368, "step": 1236 }, { "epoch": 0.01, "grad_norm": 5.981059658355397, "learning_rate": 1.999978797202771e-06, "loss": 1.3621, "step": 1237 }, { "epoch": 0.01, "grad_norm": 6.128261931608396, "learning_rate": 1.9999787626284236e-06, "loss": 1.6156, "step": 1238 }, { "epoch": 0.01, "grad_norm": 4.881645282270224, "learning_rate": 1.9999787280259097e-06, "loss": 1.3842, "step": 1239 }, { "epoch": 0.01, "grad_norm": 5.893724788852448, "learning_rate": 1.9999786933952302e-06, "loss": 1.6442, "step": 1240 }, { "epoch": 0.01, "grad_norm": 5.222605242350468, "learning_rate": 1.9999786587363844e-06, "loss": 1.373, "step": 1241 }, { "epoch": 0.01, "eval_loss": 1.6639937162399292, "eval_runtime": 4.616, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 1241 }, { "epoch": 0.01, "grad_norm": 4.66908931719967, "learning_rate": 1.999978624049372e-06, "loss": 1.4993, "step": 1242 }, { "epoch": 0.01, "grad_norm": 4.9307281292109035, "learning_rate": 1.9999785893341944e-06, "loss": 1.5608, "step": 1243 }, { "epoch": 0.01, "grad_norm": 4.826571091140564, "learning_rate": 1.99997855459085e-06, "loss": 1.3162, "step": 1244 }, { "epoch": 0.01, "grad_norm": 4.8493801546756465, "learning_rate": 1.9999785198193405e-06, "loss": 1.4282, "step": 1245 }, { "epoch": 0.01, "grad_norm": 4.697208964299474, "learning_rate": 1.999978485019664e-06, "loss": 1.5024, "step": 1246 }, { "epoch": 0.01, "grad_norm": 5.919002989067272, "learning_rate": 1.999978450191822e-06, "loss": 1.5431, "step": 1247 }, { "epoch": 0.01, "grad_norm": 4.99900985273033, "learning_rate": 1.9999784153358135e-06, "loss": 1.4855, "step": 1248 }, { "epoch": 0.01, "grad_norm": 4.617964526314292, "learning_rate": 1.999978380451639e-06, "loss": 1.5545, "step": 1249 }, { "epoch": 0.01, "grad_norm": 5.192495194044367, "learning_rate": 1.9999783455392987e-06, "loss": 1.4625, "step": 1250 }, { "epoch": 0.01, "grad_norm": 5.017628302905397, "learning_rate": 1.9999783105987924e-06, "loss": 1.5347, "step": 1251 }, { "epoch": 0.01, "grad_norm": 4.576487300446409, "learning_rate": 1.9999782756301197e-06, "loss": 1.5079, "step": 1252 }, { "epoch": 0.01, "grad_norm": 5.5622763050535085, "learning_rate": 1.999978240633281e-06, "loss": 1.4778, "step": 1253 }, { "epoch": 0.01, "grad_norm": 4.320462635223293, "learning_rate": 1.9999782056082763e-06, "loss": 1.1527, "step": 1254 }, { "epoch": 0.01, "grad_norm": 4.753621834652253, "learning_rate": 1.9999781705551057e-06, "loss": 1.4019, "step": 1255 }, { "epoch": 0.01, "grad_norm": 4.399262354558424, "learning_rate": 1.9999781354737687e-06, "loss": 1.4149, "step": 1256 }, { "epoch": 0.01, "grad_norm": 4.602639150018978, "learning_rate": 1.9999781003642658e-06, "loss": 1.3364, "step": 1257 }, { "epoch": 0.01, "grad_norm": 4.589898719773274, "learning_rate": 1.999978065226597e-06, "loss": 1.4087, "step": 1258 }, { "epoch": 0.01, "grad_norm": 4.536362353334885, "learning_rate": 1.999978030060762e-06, "loss": 1.4561, "step": 1259 }, { "epoch": 0.01, "grad_norm": 4.565066571845603, "learning_rate": 1.999977994866761e-06, "loss": 1.4477, "step": 1260 }, { "epoch": 0.01, "grad_norm": 4.82724395797132, "learning_rate": 1.999977959644594e-06, "loss": 1.385, "step": 1261 }, { "epoch": 0.01, "grad_norm": 4.485711386546417, "learning_rate": 1.999977924394261e-06, "loss": 1.3381, "step": 1262 }, { "epoch": 0.01, "grad_norm": 5.007143806674197, "learning_rate": 1.9999778891157615e-06, "loss": 1.4557, "step": 1263 }, { "epoch": 0.01, "grad_norm": 5.361949819262946, "learning_rate": 1.9999778538090964e-06, "loss": 1.3389, "step": 1264 }, { "epoch": 0.01, "grad_norm": 4.503528627850973, "learning_rate": 1.999977818474265e-06, "loss": 1.4698, "step": 1265 }, { "epoch": 0.01, "grad_norm": 5.233324651443236, "learning_rate": 1.999977783111268e-06, "loss": 1.3247, "step": 1266 }, { "epoch": 0.01, "grad_norm": 5.5046307792398865, "learning_rate": 1.9999777477201044e-06, "loss": 1.3497, "step": 1267 }, { "epoch": 0.01, "grad_norm": 5.645109391595665, "learning_rate": 1.999977712300775e-06, "loss": 1.3201, "step": 1268 }, { "epoch": 0.01, "grad_norm": 5.1446712059940465, "learning_rate": 1.9999776768532796e-06, "loss": 1.4262, "step": 1269 }, { "epoch": 0.01, "grad_norm": 5.28587028516593, "learning_rate": 1.999977641377618e-06, "loss": 1.6932, "step": 1270 }, { "epoch": 0.01, "grad_norm": 5.598379302979389, "learning_rate": 1.9999776058737906e-06, "loss": 1.4316, "step": 1271 }, { "epoch": 0.01, "grad_norm": 6.016449286637428, "learning_rate": 1.9999775703417965e-06, "loss": 1.5183, "step": 1272 }, { "epoch": 0.01, "grad_norm": 5.415644315937934, "learning_rate": 1.999977534781637e-06, "loss": 1.3129, "step": 1273 }, { "epoch": 0.01, "grad_norm": 4.704232037612598, "learning_rate": 1.9999774991933113e-06, "loss": 1.4044, "step": 1274 }, { "epoch": 0.01, "grad_norm": 4.6149493250979, "learning_rate": 1.9999774635768197e-06, "loss": 1.3827, "step": 1275 }, { "epoch": 0.01, "grad_norm": 5.530098577109759, "learning_rate": 1.9999774279321617e-06, "loss": 1.6327, "step": 1276 }, { "epoch": 0.01, "grad_norm": 4.743687061339907, "learning_rate": 1.9999773922593383e-06, "loss": 1.5278, "step": 1277 }, { "epoch": 0.01, "grad_norm": 4.8692755474083205, "learning_rate": 1.9999773565583484e-06, "loss": 1.5159, "step": 1278 }, { "epoch": 0.01, "grad_norm": 7.0274537662792556, "learning_rate": 1.9999773208291925e-06, "loss": 1.6578, "step": 1279 }, { "epoch": 0.01, "grad_norm": 5.493325452863065, "learning_rate": 1.9999772850718703e-06, "loss": 1.3068, "step": 1280 }, { "epoch": 0.01, "grad_norm": 4.623446755309784, "learning_rate": 1.9999772492863825e-06, "loss": 1.3591, "step": 1281 }, { "epoch": 0.01, "grad_norm": 6.58471292058994, "learning_rate": 1.999977213472729e-06, "loss": 1.2837, "step": 1282 }, { "epoch": 0.01, "grad_norm": 4.718580482751096, "learning_rate": 1.9999771776309083e-06, "loss": 1.5876, "step": 1283 }, { "epoch": 0.01, "grad_norm": 6.699986818479373, "learning_rate": 1.999977141760922e-06, "loss": 1.3901, "step": 1284 }, { "epoch": 0.01, "grad_norm": 5.60582750403323, "learning_rate": 1.99997710586277e-06, "loss": 1.4277, "step": 1285 }, { "epoch": 0.01, "grad_norm": 4.673408978804247, "learning_rate": 1.9999770699364526e-06, "loss": 1.4454, "step": 1286 }, { "epoch": 0.01, "grad_norm": 4.9768592067046695, "learning_rate": 1.999977033981968e-06, "loss": 1.3688, "step": 1287 }, { "epoch": 0.01, "grad_norm": 6.49039104639861, "learning_rate": 1.999976997999318e-06, "loss": 1.3585, "step": 1288 }, { "epoch": 0.01, "grad_norm": 5.441000592991114, "learning_rate": 1.999976961988502e-06, "loss": 1.4013, "step": 1289 }, { "epoch": 0.01, "grad_norm": 5.248224180467002, "learning_rate": 1.9999769259495197e-06, "loss": 1.6231, "step": 1290 }, { "epoch": 0.01, "grad_norm": 4.967205396228199, "learning_rate": 1.9999768898823714e-06, "loss": 1.3776, "step": 1291 }, { "epoch": 0.01, "grad_norm": 4.786827176028182, "learning_rate": 1.999976853787057e-06, "loss": 1.5409, "step": 1292 }, { "epoch": 0.01, "grad_norm": 4.629480539543215, "learning_rate": 1.999976817663577e-06, "loss": 1.4354, "step": 1293 }, { "epoch": 0.01, "grad_norm": 5.100068802921767, "learning_rate": 1.9999767815119305e-06, "loss": 1.4632, "step": 1294 }, { "epoch": 0.01, "grad_norm": 5.134078229009721, "learning_rate": 1.999976745332118e-06, "loss": 1.4747, "step": 1295 }, { "epoch": 0.01, "grad_norm": 5.039774129292199, "learning_rate": 1.99997670912414e-06, "loss": 1.5499, "step": 1296 }, { "epoch": 0.01, "grad_norm": 5.581809616341501, "learning_rate": 1.9999766728879955e-06, "loss": 1.5581, "step": 1297 }, { "epoch": 0.01, "grad_norm": 4.769459899834711, "learning_rate": 1.999976636623685e-06, "loss": 1.6019, "step": 1298 }, { "epoch": 0.01, "grad_norm": 4.789913009365324, "learning_rate": 1.9999766003312087e-06, "loss": 1.4384, "step": 1299 }, { "epoch": 0.01, "grad_norm": 4.948273546122202, "learning_rate": 1.9999765640105663e-06, "loss": 1.4859, "step": 1300 }, { "epoch": 0.01, "grad_norm": 5.046098251037359, "learning_rate": 1.999976527661758e-06, "loss": 1.3215, "step": 1301 }, { "epoch": 0.01, "grad_norm": 4.759646769803917, "learning_rate": 1.9999764912847834e-06, "loss": 1.26, "step": 1302 }, { "epoch": 0.01, "grad_norm": 5.049312397622781, "learning_rate": 1.999976454879643e-06, "loss": 1.2966, "step": 1303 }, { "epoch": 0.01, "grad_norm": 4.7730400364294985, "learning_rate": 1.9999764184463365e-06, "loss": 1.5075, "step": 1304 }, { "epoch": 0.01, "grad_norm": 4.615090990630283, "learning_rate": 1.999976381984864e-06, "loss": 1.3308, "step": 1305 }, { "epoch": 0.01, "grad_norm": 5.050723699922145, "learning_rate": 1.999976345495226e-06, "loss": 1.5391, "step": 1306 }, { "epoch": 0.01, "grad_norm": 5.2292497493973995, "learning_rate": 1.999976308977421e-06, "loss": 1.4581, "step": 1307 }, { "epoch": 0.01, "grad_norm": 4.484046582647355, "learning_rate": 1.9999762724314504e-06, "loss": 1.2275, "step": 1308 }, { "epoch": 0.01, "grad_norm": 4.7418804258275, "learning_rate": 1.9999762358573144e-06, "loss": 1.4074, "step": 1309 }, { "epoch": 0.01, "grad_norm": 4.656737686412653, "learning_rate": 1.9999761992550116e-06, "loss": 1.4102, "step": 1310 }, { "epoch": 0.01, "grad_norm": 4.913690315220175, "learning_rate": 1.9999761626245433e-06, "loss": 1.5153, "step": 1311 }, { "epoch": 0.01, "grad_norm": 4.631259670029744, "learning_rate": 1.9999761259659085e-06, "loss": 1.4168, "step": 1312 }, { "epoch": 0.01, "grad_norm": 6.352234843978522, "learning_rate": 1.9999760892791083e-06, "loss": 1.7813, "step": 1313 }, { "epoch": 0.01, "grad_norm": 4.806086460573851, "learning_rate": 1.9999760525641416e-06, "loss": 1.4341, "step": 1314 }, { "epoch": 0.01, "eval_loss": 1.664663314819336, "eval_runtime": 4.6478, "eval_samples_per_second": 1.936, "eval_steps_per_second": 1.076, "step": 1314 }, { "epoch": 0.01, "grad_norm": 4.865234880117637, "learning_rate": 1.999976015821009e-06, "loss": 1.5092, "step": 1315 }, { "epoch": 0.01, "grad_norm": 4.593356313971813, "learning_rate": 1.999975979049711e-06, "loss": 1.2732, "step": 1316 }, { "epoch": 0.01, "grad_norm": 4.765904676157058, "learning_rate": 1.9999759422502462e-06, "loss": 1.4894, "step": 1317 }, { "epoch": 0.01, "grad_norm": 4.869472003176806, "learning_rate": 1.9999759054226157e-06, "loss": 1.3162, "step": 1318 }, { "epoch": 0.01, "grad_norm": 6.707074378059004, "learning_rate": 1.9999758685668192e-06, "loss": 1.3817, "step": 1319 }, { "epoch": 0.01, "grad_norm": 4.360722245763356, "learning_rate": 1.999975831682857e-06, "loss": 1.3389, "step": 1320 }, { "epoch": 0.01, "grad_norm": 5.432944358728482, "learning_rate": 1.9999757947707284e-06, "loss": 1.4327, "step": 1321 }, { "epoch": 0.01, "grad_norm": 4.82335037337341, "learning_rate": 1.9999757578304336e-06, "loss": 1.6028, "step": 1322 }, { "epoch": 0.01, "grad_norm": 4.585722041095415, "learning_rate": 1.9999757208619733e-06, "loss": 1.4382, "step": 1323 }, { "epoch": 0.01, "grad_norm": 4.668470608848745, "learning_rate": 1.999975683865347e-06, "loss": 1.2558, "step": 1324 }, { "epoch": 0.01, "grad_norm": 4.731181861451524, "learning_rate": 1.9999756468405543e-06, "loss": 1.4787, "step": 1325 }, { "epoch": 0.01, "grad_norm": 5.300289798255382, "learning_rate": 1.999975609787596e-06, "loss": 1.4782, "step": 1326 }, { "epoch": 0.01, "grad_norm": 4.942011068638616, "learning_rate": 1.9999755727064715e-06, "loss": 1.5667, "step": 1327 }, { "epoch": 0.01, "grad_norm": 5.309328004860009, "learning_rate": 1.9999755355971813e-06, "loss": 1.5442, "step": 1328 }, { "epoch": 0.01, "grad_norm": 4.915409373782858, "learning_rate": 1.9999754984597248e-06, "loss": 1.3834, "step": 1329 }, { "epoch": 0.01, "grad_norm": 4.89211942651002, "learning_rate": 1.9999754612941023e-06, "loss": 1.3551, "step": 1330 }, { "epoch": 0.01, "grad_norm": 5.687686388314984, "learning_rate": 1.999975424100314e-06, "loss": 1.2831, "step": 1331 }, { "epoch": 0.01, "grad_norm": 4.582159025430598, "learning_rate": 1.9999753868783594e-06, "loss": 1.4689, "step": 1332 }, { "epoch": 0.01, "grad_norm": 4.8213772730034865, "learning_rate": 1.9999753496282394e-06, "loss": 1.4182, "step": 1333 }, { "epoch": 0.01, "grad_norm": 4.5280013653649105, "learning_rate": 1.999975312349953e-06, "loss": 1.4439, "step": 1334 }, { "epoch": 0.01, "grad_norm": 4.893199334359817, "learning_rate": 1.9999752750435008e-06, "loss": 1.5473, "step": 1335 }, { "epoch": 0.01, "grad_norm": 4.869413876075934, "learning_rate": 1.9999752377088825e-06, "loss": 1.3372, "step": 1336 }, { "epoch": 0.01, "grad_norm": 6.9958240347842695, "learning_rate": 1.9999752003460982e-06, "loss": 1.3613, "step": 1337 }, { "epoch": 0.01, "grad_norm": 5.4921822785420655, "learning_rate": 1.999975162955148e-06, "loss": 1.4645, "step": 1338 }, { "epoch": 0.01, "grad_norm": 5.304921830032534, "learning_rate": 1.999975125536032e-06, "loss": 1.4629, "step": 1339 }, { "epoch": 0.01, "grad_norm": 4.649096231830719, "learning_rate": 1.9999750880887497e-06, "loss": 1.2108, "step": 1340 }, { "epoch": 0.01, "grad_norm": 5.420599578392317, "learning_rate": 1.9999750506133017e-06, "loss": 1.3255, "step": 1341 }, { "epoch": 0.01, "grad_norm": 4.8401246330468695, "learning_rate": 1.9999750131096876e-06, "loss": 1.4344, "step": 1342 }, { "epoch": 0.01, "grad_norm": 5.15766506468985, "learning_rate": 1.9999749755779076e-06, "loss": 1.5095, "step": 1343 }, { "epoch": 0.01, "grad_norm": 4.838423380109326, "learning_rate": 1.999974938017961e-06, "loss": 1.4351, "step": 1344 }, { "epoch": 0.01, "grad_norm": 5.7944677876331205, "learning_rate": 1.9999749004298492e-06, "loss": 1.4865, "step": 1345 }, { "epoch": 0.01, "grad_norm": 5.857475428823912, "learning_rate": 1.9999748628135713e-06, "loss": 1.5694, "step": 1346 }, { "epoch": 0.01, "grad_norm": 5.117298413799976, "learning_rate": 1.9999748251691275e-06, "loss": 1.3554, "step": 1347 }, { "epoch": 0.01, "grad_norm": 4.729812229132196, "learning_rate": 1.9999747874965176e-06, "loss": 1.365, "step": 1348 }, { "epoch": 0.01, "grad_norm": 4.918522074472892, "learning_rate": 1.999974749795742e-06, "loss": 1.5329, "step": 1349 }, { "epoch": 0.01, "grad_norm": 6.128757606688177, "learning_rate": 1.9999747120668e-06, "loss": 1.4496, "step": 1350 }, { "epoch": 0.01, "grad_norm": 5.512588294782333, "learning_rate": 1.9999746743096923e-06, "loss": 1.4654, "step": 1351 }, { "epoch": 0.01, "grad_norm": 5.932892066430468, "learning_rate": 1.9999746365244182e-06, "loss": 1.5322, "step": 1352 }, { "epoch": 0.01, "grad_norm": 4.9351926580424585, "learning_rate": 1.999974598710979e-06, "loss": 1.509, "step": 1353 }, { "epoch": 0.01, "grad_norm": 4.73360746783656, "learning_rate": 1.999974560869373e-06, "loss": 1.4276, "step": 1354 }, { "epoch": 0.01, "grad_norm": 5.323019209743392, "learning_rate": 1.9999745229996014e-06, "loss": 1.4279, "step": 1355 }, { "epoch": 0.01, "grad_norm": 5.121493184418707, "learning_rate": 1.999974485101664e-06, "loss": 1.3617, "step": 1356 }, { "epoch": 0.01, "grad_norm": 4.924408494387465, "learning_rate": 1.9999744471755604e-06, "loss": 1.4852, "step": 1357 }, { "epoch": 0.01, "grad_norm": 4.577577155791868, "learning_rate": 1.999974409221291e-06, "loss": 1.463, "step": 1358 }, { "epoch": 0.01, "grad_norm": 4.677557438554256, "learning_rate": 1.9999743712388555e-06, "loss": 1.4702, "step": 1359 }, { "epoch": 0.01, "grad_norm": 4.869471619730696, "learning_rate": 1.999974333228254e-06, "loss": 1.4982, "step": 1360 }, { "epoch": 0.01, "grad_norm": 5.23329639876476, "learning_rate": 1.9999742951894867e-06, "loss": 1.4118, "step": 1361 }, { "epoch": 0.01, "grad_norm": 4.664165574708151, "learning_rate": 1.9999742571225534e-06, "loss": 1.4708, "step": 1362 }, { "epoch": 0.01, "grad_norm": 4.5907041789083705, "learning_rate": 1.999974219027454e-06, "loss": 1.4604, "step": 1363 }, { "epoch": 0.01, "grad_norm": 4.940547135047092, "learning_rate": 1.999974180904189e-06, "loss": 1.4571, "step": 1364 }, { "epoch": 0.01, "grad_norm": 5.234304088486649, "learning_rate": 1.9999741427527577e-06, "loss": 1.4042, "step": 1365 }, { "epoch": 0.01, "grad_norm": 5.075734957191831, "learning_rate": 1.9999741045731605e-06, "loss": 1.4649, "step": 1366 }, { "epoch": 0.01, "grad_norm": 4.545302713138621, "learning_rate": 1.999974066365398e-06, "loss": 1.3497, "step": 1367 }, { "epoch": 0.01, "grad_norm": 4.55221088039147, "learning_rate": 1.9999740281294687e-06, "loss": 1.342, "step": 1368 }, { "epoch": 0.01, "grad_norm": 4.6125447731949185, "learning_rate": 1.9999739898653736e-06, "loss": 1.4971, "step": 1369 }, { "epoch": 0.01, "grad_norm": 9.07610846685092, "learning_rate": 1.999973951573113e-06, "loss": 1.5226, "step": 1370 }, { "epoch": 0.01, "grad_norm": 5.318700322087232, "learning_rate": 1.999973913252686e-06, "loss": 1.4651, "step": 1371 }, { "epoch": 0.01, "grad_norm": 4.537300685685728, "learning_rate": 1.999973874904093e-06, "loss": 1.2609, "step": 1372 }, { "epoch": 0.01, "grad_norm": 5.193376063176117, "learning_rate": 1.9999738365273346e-06, "loss": 1.5545, "step": 1373 }, { "epoch": 0.01, "grad_norm": 5.273750459431588, "learning_rate": 1.99997379812241e-06, "loss": 1.4439, "step": 1374 }, { "epoch": 0.01, "grad_norm": 4.595241626337541, "learning_rate": 1.9999737596893198e-06, "loss": 1.4768, "step": 1375 }, { "epoch": 0.01, "grad_norm": 5.259906353516755, "learning_rate": 1.9999737212280634e-06, "loss": 1.5442, "step": 1376 }, { "epoch": 0.01, "grad_norm": 7.201372009358403, "learning_rate": 1.999973682738641e-06, "loss": 1.368, "step": 1377 }, { "epoch": 0.01, "grad_norm": 4.977174887944729, "learning_rate": 1.9999736442210523e-06, "loss": 1.469, "step": 1378 }, { "epoch": 0.01, "grad_norm": 4.782568509322288, "learning_rate": 1.999973605675298e-06, "loss": 1.4649, "step": 1379 }, { "epoch": 0.01, "grad_norm": 4.9003324240880515, "learning_rate": 1.999973567101378e-06, "loss": 1.4912, "step": 1380 }, { "epoch": 0.01, "grad_norm": 4.443690275403413, "learning_rate": 1.9999735284992916e-06, "loss": 1.3528, "step": 1381 }, { "epoch": 0.01, "grad_norm": 5.103063174767599, "learning_rate": 1.9999734898690395e-06, "loss": 1.5669, "step": 1382 }, { "epoch": 0.01, "grad_norm": 4.799160062093434, "learning_rate": 1.999973451210622e-06, "loss": 1.4682, "step": 1383 }, { "epoch": 0.01, "grad_norm": 4.89475639433693, "learning_rate": 1.9999734125240377e-06, "loss": 1.5212, "step": 1384 }, { "epoch": 0.01, "grad_norm": 4.458491045381737, "learning_rate": 1.999973373809288e-06, "loss": 1.2801, "step": 1385 }, { "epoch": 0.01, "grad_norm": 5.308111135518119, "learning_rate": 1.999973335066372e-06, "loss": 1.4821, "step": 1386 }, { "epoch": 0.01, "grad_norm": 4.548350666808942, "learning_rate": 1.9999732962952905e-06, "loss": 1.4494, "step": 1387 }, { "epoch": 0.01, "eval_loss": 1.6605433225631714, "eval_runtime": 4.645, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.076, "step": 1387 }, { "epoch": 0.01, "grad_norm": 5.872935272294134, "learning_rate": 1.999973257496043e-06, "loss": 1.5121, "step": 1388 }, { "epoch": 0.01, "grad_norm": 4.421100493500692, "learning_rate": 1.999973218668629e-06, "loss": 1.4706, "step": 1389 }, { "epoch": 0.01, "grad_norm": 4.651724509052579, "learning_rate": 1.99997317981305e-06, "loss": 1.3924, "step": 1390 }, { "epoch": 0.01, "grad_norm": 8.375754147106289, "learning_rate": 1.9999731409293047e-06, "loss": 1.6767, "step": 1391 }, { "epoch": 0.01, "grad_norm": 4.849899617039911, "learning_rate": 1.9999731020173934e-06, "loss": 1.4921, "step": 1392 }, { "epoch": 0.01, "grad_norm": 5.47624020449905, "learning_rate": 1.999973063077316e-06, "loss": 1.4762, "step": 1393 }, { "epoch": 0.01, "grad_norm": 4.357532784726696, "learning_rate": 1.999973024109073e-06, "loss": 1.3477, "step": 1394 }, { "epoch": 0.01, "grad_norm": 4.627544754203488, "learning_rate": 1.999972985112664e-06, "loss": 1.4234, "step": 1395 }, { "epoch": 0.01, "grad_norm": 4.782545342694373, "learning_rate": 1.999972946088089e-06, "loss": 1.3345, "step": 1396 }, { "epoch": 0.01, "grad_norm": 4.918312434237096, "learning_rate": 1.999972907035348e-06, "loss": 1.4585, "step": 1397 }, { "epoch": 0.01, "grad_norm": 4.74711940647535, "learning_rate": 1.9999728679544412e-06, "loss": 1.5588, "step": 1398 }, { "epoch": 0.01, "grad_norm": 6.756574444935555, "learning_rate": 1.999972828845369e-06, "loss": 1.5253, "step": 1399 }, { "epoch": 0.01, "grad_norm": 4.825155321890688, "learning_rate": 1.99997278970813e-06, "loss": 1.4386, "step": 1400 }, { "epoch": 0.01, "grad_norm": 5.893816643811335, "learning_rate": 1.9999727505427257e-06, "loss": 1.2703, "step": 1401 }, { "epoch": 0.01, "grad_norm": 5.271403904383769, "learning_rate": 1.999972711349155e-06, "loss": 1.332, "step": 1402 }, { "epoch": 0.01, "grad_norm": 5.094872014502599, "learning_rate": 1.999972672127419e-06, "loss": 1.4158, "step": 1403 }, { "epoch": 0.01, "grad_norm": 4.785102973962074, "learning_rate": 1.9999726328775166e-06, "loss": 1.4165, "step": 1404 }, { "epoch": 0.01, "grad_norm": 7.7983947530400535, "learning_rate": 1.9999725935994485e-06, "loss": 1.4353, "step": 1405 }, { "epoch": 0.01, "grad_norm": 14.646416220267023, "learning_rate": 1.9999725542932145e-06, "loss": 1.3781, "step": 1406 }, { "epoch": 0.01, "grad_norm": 5.037078769476571, "learning_rate": 1.9999725149588146e-06, "loss": 1.3702, "step": 1407 }, { "epoch": 0.01, "grad_norm": 4.9356881553322545, "learning_rate": 1.9999724755962486e-06, "loss": 1.5146, "step": 1408 }, { "epoch": 0.01, "grad_norm": 8.717638454921318, "learning_rate": 1.999972436205517e-06, "loss": 1.2775, "step": 1409 }, { "epoch": 0.01, "grad_norm": 4.998493967980832, "learning_rate": 1.9999723967866197e-06, "loss": 1.3866, "step": 1410 }, { "epoch": 0.01, "grad_norm": 4.7378104129164464, "learning_rate": 1.9999723573395564e-06, "loss": 1.32, "step": 1411 }, { "epoch": 0.01, "grad_norm": 5.599759309706441, "learning_rate": 1.9999723178643266e-06, "loss": 1.5095, "step": 1412 }, { "epoch": 0.01, "grad_norm": 4.876824835044584, "learning_rate": 1.9999722783609313e-06, "loss": 1.283, "step": 1413 }, { "epoch": 0.01, "grad_norm": 4.37874420839293, "learning_rate": 1.99997223882937e-06, "loss": 1.3672, "step": 1414 }, { "epoch": 0.01, "grad_norm": 6.233951321610936, "learning_rate": 1.999972199269643e-06, "loss": 1.5104, "step": 1415 }, { "epoch": 0.01, "grad_norm": 4.6266662891163195, "learning_rate": 1.99997215968175e-06, "loss": 1.422, "step": 1416 }, { "epoch": 0.01, "grad_norm": 5.264895342929483, "learning_rate": 1.9999721200656912e-06, "loss": 1.3256, "step": 1417 }, { "epoch": 0.01, "grad_norm": 5.126522085971787, "learning_rate": 1.9999720804214665e-06, "loss": 1.5388, "step": 1418 }, { "epoch": 0.01, "grad_norm": 5.0904993436779655, "learning_rate": 1.999972040749076e-06, "loss": 1.6427, "step": 1419 }, { "epoch": 0.01, "grad_norm": 4.557223446909417, "learning_rate": 1.999972001048519e-06, "loss": 1.4148, "step": 1420 }, { "epoch": 0.01, "grad_norm": 4.7803697348815914, "learning_rate": 1.999971961319797e-06, "loss": 1.5555, "step": 1421 }, { "epoch": 0.01, "grad_norm": 4.800921782137216, "learning_rate": 1.9999719215629085e-06, "loss": 1.4876, "step": 1422 }, { "epoch": 0.01, "grad_norm": 6.9994358843439, "learning_rate": 1.999971881777855e-06, "loss": 1.4667, "step": 1423 }, { "epoch": 0.01, "grad_norm": 4.464016639284502, "learning_rate": 1.9999718419646343e-06, "loss": 1.2817, "step": 1424 }, { "epoch": 0.01, "grad_norm": 4.791881059083396, "learning_rate": 1.9999718021232487e-06, "loss": 1.3099, "step": 1425 }, { "epoch": 0.01, "grad_norm": 5.046707728618333, "learning_rate": 1.9999717622536963e-06, "loss": 1.3161, "step": 1426 }, { "epoch": 0.01, "grad_norm": 4.941858775320615, "learning_rate": 1.9999717223559788e-06, "loss": 1.4566, "step": 1427 }, { "epoch": 0.01, "grad_norm": 5.008676930939444, "learning_rate": 1.9999716824300953e-06, "loss": 1.4165, "step": 1428 }, { "epoch": 0.01, "grad_norm": 5.280497134099123, "learning_rate": 1.999971642476046e-06, "loss": 1.4435, "step": 1429 }, { "epoch": 0.01, "grad_norm": 4.899113042250123, "learning_rate": 1.999971602493831e-06, "loss": 1.4571, "step": 1430 }, { "epoch": 0.01, "grad_norm": 5.250624631469121, "learning_rate": 1.9999715624834494e-06, "loss": 1.4758, "step": 1431 }, { "epoch": 0.01, "grad_norm": 4.646881476231068, "learning_rate": 1.999971522444902e-06, "loss": 1.581, "step": 1432 }, { "epoch": 0.01, "grad_norm": 6.1511243697760705, "learning_rate": 1.999971482378189e-06, "loss": 1.3673, "step": 1433 }, { "epoch": 0.01, "grad_norm": 4.963495678210403, "learning_rate": 1.9999714422833108e-06, "loss": 1.4672, "step": 1434 }, { "epoch": 0.01, "grad_norm": 4.879620074849547, "learning_rate": 1.999971402160266e-06, "loss": 1.4164, "step": 1435 }, { "epoch": 0.01, "grad_norm": 4.393239464097148, "learning_rate": 1.9999713620090556e-06, "loss": 1.2775, "step": 1436 }, { "epoch": 0.01, "grad_norm": 5.371614688959383, "learning_rate": 1.999971321829679e-06, "loss": 1.3789, "step": 1437 }, { "epoch": 0.01, "grad_norm": 4.859029231950951, "learning_rate": 1.9999712816221366e-06, "loss": 1.2764, "step": 1438 }, { "epoch": 0.01, "grad_norm": 5.978190883538404, "learning_rate": 1.9999712413864284e-06, "loss": 1.5097, "step": 1439 }, { "epoch": 0.01, "grad_norm": 4.859269754595811, "learning_rate": 1.9999712011225546e-06, "loss": 1.3549, "step": 1440 }, { "epoch": 0.01, "grad_norm": 4.598800701532648, "learning_rate": 1.9999711608305144e-06, "loss": 1.4791, "step": 1441 }, { "epoch": 0.01, "grad_norm": 5.233668406367012, "learning_rate": 1.9999711205103087e-06, "loss": 1.5436, "step": 1442 }, { "epoch": 0.01, "grad_norm": 6.585328618747414, "learning_rate": 1.9999710801619375e-06, "loss": 1.4573, "step": 1443 }, { "epoch": 0.01, "grad_norm": 4.975282503115869, "learning_rate": 1.9999710397854e-06, "loss": 1.3631, "step": 1444 }, { "epoch": 0.01, "grad_norm": 6.060428307246658, "learning_rate": 1.9999709993806967e-06, "loss": 1.4522, "step": 1445 }, { "epoch": 0.01, "grad_norm": 5.344016376470663, "learning_rate": 1.999970958947827e-06, "loss": 1.4702, "step": 1446 }, { "epoch": 0.01, "grad_norm": 6.153848988906273, "learning_rate": 1.999970918486792e-06, "loss": 1.5144, "step": 1447 }, { "epoch": 0.01, "grad_norm": 5.957762192580373, "learning_rate": 1.9999708779975914e-06, "loss": 1.5009, "step": 1448 }, { "epoch": 0.01, "grad_norm": 4.791562507924204, "learning_rate": 1.999970837480225e-06, "loss": 1.3169, "step": 1449 }, { "epoch": 0.01, "grad_norm": 4.757399799969445, "learning_rate": 1.9999707969346922e-06, "loss": 1.4207, "step": 1450 }, { "epoch": 0.01, "grad_norm": 4.939646292701733, "learning_rate": 1.9999707563609937e-06, "loss": 1.5557, "step": 1451 }, { "epoch": 0.01, "grad_norm": 5.213045670839742, "learning_rate": 1.9999707157591296e-06, "loss": 1.4483, "step": 1452 }, { "epoch": 0.01, "grad_norm": 4.262064916528809, "learning_rate": 1.999970675129099e-06, "loss": 1.395, "step": 1453 }, { "epoch": 0.01, "grad_norm": 4.936947311373785, "learning_rate": 1.999970634470903e-06, "loss": 1.4634, "step": 1454 }, { "epoch": 0.01, "grad_norm": 4.362006992850526, "learning_rate": 1.9999705937845413e-06, "loss": 1.2776, "step": 1455 }, { "epoch": 0.01, "grad_norm": 4.643820891960016, "learning_rate": 1.9999705530700138e-06, "loss": 1.2722, "step": 1456 }, { "epoch": 0.01, "grad_norm": 4.949308239528696, "learning_rate": 1.9999705123273203e-06, "loss": 1.5015, "step": 1457 }, { "epoch": 0.01, "grad_norm": 6.259335084505092, "learning_rate": 1.999970471556461e-06, "loss": 1.4663, "step": 1458 }, { "epoch": 0.01, "grad_norm": 5.267704451871602, "learning_rate": 1.9999704307574355e-06, "loss": 1.4835, "step": 1459 }, { "epoch": 0.01, "grad_norm": 5.251718898458907, "learning_rate": 1.9999703899302446e-06, "loss": 1.3878, "step": 1460 }, { "epoch": 0.01, "eval_loss": 1.6609848737716675, "eval_runtime": 4.6315, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 1460 }, { "epoch": 0.01, "grad_norm": 4.908979692543608, "learning_rate": 1.9999703490748877e-06, "loss": 1.4374, "step": 1461 }, { "epoch": 0.01, "grad_norm": 5.071932947528256, "learning_rate": 1.999970308191365e-06, "loss": 1.5086, "step": 1462 }, { "epoch": 0.01, "grad_norm": 4.519660989077838, "learning_rate": 1.9999702672796765e-06, "loss": 1.3081, "step": 1463 }, { "epoch": 0.01, "grad_norm": 5.423290223247818, "learning_rate": 1.999970226339822e-06, "loss": 1.5415, "step": 1464 }, { "epoch": 0.01, "grad_norm": 4.276573152012557, "learning_rate": 1.999970185371802e-06, "loss": 1.3497, "step": 1465 }, { "epoch": 0.01, "grad_norm": 4.873791431302587, "learning_rate": 1.9999701443756155e-06, "loss": 1.3745, "step": 1466 }, { "epoch": 0.01, "grad_norm": 4.9749684436238315, "learning_rate": 1.9999701033512637e-06, "loss": 1.4114, "step": 1467 }, { "epoch": 0.01, "grad_norm": 4.646373215103361, "learning_rate": 1.999970062298746e-06, "loss": 1.4112, "step": 1468 }, { "epoch": 0.01, "grad_norm": 4.864454376294363, "learning_rate": 1.9999700212180626e-06, "loss": 1.3469, "step": 1469 }, { "epoch": 0.01, "grad_norm": 4.937829789499327, "learning_rate": 1.999969980109213e-06, "loss": 1.5815, "step": 1470 }, { "epoch": 0.01, "grad_norm": 4.639905252344842, "learning_rate": 1.9999699389721977e-06, "loss": 1.3537, "step": 1471 }, { "epoch": 0.01, "grad_norm": 4.78299140918665, "learning_rate": 1.999969897807017e-06, "loss": 1.4826, "step": 1472 }, { "epoch": 0.01, "grad_norm": 4.793192549086234, "learning_rate": 1.9999698566136697e-06, "loss": 1.436, "step": 1473 }, { "epoch": 0.01, "grad_norm": 4.792423303861806, "learning_rate": 1.999969815392157e-06, "loss": 1.4389, "step": 1474 }, { "epoch": 0.01, "grad_norm": 8.817682445317553, "learning_rate": 1.9999697741424787e-06, "loss": 1.3081, "step": 1475 }, { "epoch": 0.01, "grad_norm": 4.763447598313762, "learning_rate": 1.9999697328646345e-06, "loss": 1.419, "step": 1476 }, { "epoch": 0.01, "grad_norm": 5.553254419945732, "learning_rate": 1.9999696915586243e-06, "loss": 1.5496, "step": 1477 }, { "epoch": 0.01, "grad_norm": 6.8449155578371785, "learning_rate": 1.999969650224448e-06, "loss": 1.4168, "step": 1478 }, { "epoch": 0.01, "grad_norm": 4.936042392237835, "learning_rate": 1.999969608862106e-06, "loss": 1.4295, "step": 1479 }, { "epoch": 0.01, "grad_norm": 4.692520927812083, "learning_rate": 1.9999695674715985e-06, "loss": 1.4999, "step": 1480 }, { "epoch": 0.01, "grad_norm": 4.618917093841338, "learning_rate": 1.9999695260529253e-06, "loss": 1.4672, "step": 1481 }, { "epoch": 0.01, "grad_norm": 4.540512172018715, "learning_rate": 1.999969484606086e-06, "loss": 1.4142, "step": 1482 }, { "epoch": 0.01, "grad_norm": 6.257964249055953, "learning_rate": 1.999969443131081e-06, "loss": 1.5545, "step": 1483 }, { "epoch": 0.01, "grad_norm": 5.361199744025823, "learning_rate": 1.99996940162791e-06, "loss": 1.4301, "step": 1484 }, { "epoch": 0.01, "grad_norm": 4.762262068823818, "learning_rate": 1.9999693600965733e-06, "loss": 1.3145, "step": 1485 }, { "epoch": 0.01, "grad_norm": 4.507782598355478, "learning_rate": 1.9999693185370708e-06, "loss": 1.346, "step": 1486 }, { "epoch": 0.01, "grad_norm": 5.20616920405698, "learning_rate": 1.9999692769494027e-06, "loss": 1.3654, "step": 1487 }, { "epoch": 0.01, "grad_norm": 4.9199475980705385, "learning_rate": 1.9999692353335686e-06, "loss": 1.4154, "step": 1488 }, { "epoch": 0.01, "grad_norm": 4.789863379301854, "learning_rate": 1.9999691936895685e-06, "loss": 1.4946, "step": 1489 }, { "epoch": 0.01, "grad_norm": 5.2869321337326225, "learning_rate": 1.9999691520174025e-06, "loss": 1.3195, "step": 1490 }, { "epoch": 0.01, "grad_norm": 5.757207256999973, "learning_rate": 1.999969110317071e-06, "loss": 1.6459, "step": 1491 }, { "epoch": 0.01, "grad_norm": 4.856756819873707, "learning_rate": 1.999969068588574e-06, "loss": 1.3793, "step": 1492 }, { "epoch": 0.01, "grad_norm": 5.1406066928499605, "learning_rate": 1.999969026831911e-06, "loss": 1.6314, "step": 1493 }, { "epoch": 0.01, "grad_norm": 4.339460225757653, "learning_rate": 1.9999689850470814e-06, "loss": 1.2897, "step": 1494 }, { "epoch": 0.01, "grad_norm": 5.118439798561242, "learning_rate": 1.999968943234087e-06, "loss": 1.3565, "step": 1495 }, { "epoch": 0.01, "grad_norm": 5.239267360652984, "learning_rate": 1.9999689013929264e-06, "loss": 1.3343, "step": 1496 }, { "epoch": 0.01, "grad_norm": 4.413381580899369, "learning_rate": 1.9999688595236003e-06, "loss": 1.3758, "step": 1497 }, { "epoch": 0.01, "grad_norm": 5.032731231134136, "learning_rate": 1.999968817626108e-06, "loss": 1.4616, "step": 1498 }, { "epoch": 0.01, "grad_norm": 5.181684854242491, "learning_rate": 1.99996877570045e-06, "loss": 1.5541, "step": 1499 }, { "epoch": 0.01, "grad_norm": 4.529939429132134, "learning_rate": 1.9999687337466264e-06, "loss": 1.4291, "step": 1500 }, { "epoch": 0.01, "grad_norm": 6.083882159448674, "learning_rate": 1.9999686917646365e-06, "loss": 1.527, "step": 1501 }, { "epoch": 0.01, "grad_norm": 5.290245063554081, "learning_rate": 1.999968649754481e-06, "loss": 1.4742, "step": 1502 }, { "epoch": 0.01, "grad_norm": 4.717789412166867, "learning_rate": 1.9999686077161605e-06, "loss": 1.4765, "step": 1503 }, { "epoch": 0.01, "grad_norm": 4.545533558985268, "learning_rate": 1.999968565649673e-06, "loss": 1.306, "step": 1504 }, { "epoch": 0.01, "grad_norm": 4.728947694233556, "learning_rate": 1.9999685235550206e-06, "loss": 1.3828, "step": 1505 }, { "epoch": 0.01, "grad_norm": 4.662992982900369, "learning_rate": 1.999968481432202e-06, "loss": 1.4146, "step": 1506 }, { "epoch": 0.01, "grad_norm": 6.039232945794986, "learning_rate": 1.9999684392812178e-06, "loss": 1.5415, "step": 1507 }, { "epoch": 0.01, "grad_norm": 5.039673874673663, "learning_rate": 1.999968397102068e-06, "loss": 1.3505, "step": 1508 }, { "epoch": 0.01, "grad_norm": 5.7877009049623025, "learning_rate": 1.9999683548947523e-06, "loss": 1.4874, "step": 1509 }, { "epoch": 0.01, "grad_norm": 4.641088327797642, "learning_rate": 1.9999683126592705e-06, "loss": 1.4175, "step": 1510 }, { "epoch": 0.01, "grad_norm": 4.831959829153163, "learning_rate": 1.999968270395623e-06, "loss": 1.4567, "step": 1511 }, { "epoch": 0.01, "grad_norm": 5.699928337044365, "learning_rate": 1.99996822810381e-06, "loss": 1.392, "step": 1512 }, { "epoch": 0.01, "grad_norm": 4.608406926276191, "learning_rate": 1.9999681857838308e-06, "loss": 1.4505, "step": 1513 }, { "epoch": 0.01, "grad_norm": 4.867903533500069, "learning_rate": 1.9999681434356863e-06, "loss": 1.3949, "step": 1514 }, { "epoch": 0.01, "grad_norm": 4.944562213455762, "learning_rate": 1.9999681010593754e-06, "loss": 1.2331, "step": 1515 }, { "epoch": 0.01, "grad_norm": 5.2571821063164474, "learning_rate": 1.9999680586548995e-06, "loss": 1.5634, "step": 1516 }, { "epoch": 0.01, "grad_norm": 4.907465548338297, "learning_rate": 1.999968016222257e-06, "loss": 1.3562, "step": 1517 }, { "epoch": 0.01, "grad_norm": 4.276801096528798, "learning_rate": 1.9999679737614493e-06, "loss": 1.1667, "step": 1518 }, { "epoch": 0.01, "grad_norm": 5.073955336024891, "learning_rate": 1.999967931272476e-06, "loss": 1.5431, "step": 1519 }, { "epoch": 0.01, "grad_norm": 5.183148059254548, "learning_rate": 1.9999678887553364e-06, "loss": 1.5844, "step": 1520 }, { "epoch": 0.01, "grad_norm": 4.591593727529927, "learning_rate": 1.9999678462100315e-06, "loss": 1.4007, "step": 1521 }, { "epoch": 0.01, "grad_norm": 4.83807070074554, "learning_rate": 1.99996780363656e-06, "loss": 1.4673, "step": 1522 }, { "epoch": 0.01, "grad_norm": 5.564201157030677, "learning_rate": 1.9999677610349238e-06, "loss": 1.4229, "step": 1523 }, { "epoch": 0.01, "grad_norm": 6.506652623301389, "learning_rate": 1.9999677184051214e-06, "loss": 1.4544, "step": 1524 }, { "epoch": 0.01, "grad_norm": 5.110861074492502, "learning_rate": 1.999967675747153e-06, "loss": 1.4452, "step": 1525 }, { "epoch": 0.01, "grad_norm": 4.550965773070968, "learning_rate": 1.999967633061019e-06, "loss": 1.39, "step": 1526 }, { "epoch": 0.01, "grad_norm": 15.977881886992321, "learning_rate": 1.9999675903467193e-06, "loss": 1.5182, "step": 1527 }, { "epoch": 0.01, "grad_norm": 4.886960525204347, "learning_rate": 1.999967547604254e-06, "loss": 1.3334, "step": 1528 }, { "epoch": 0.01, "grad_norm": 5.035221458256328, "learning_rate": 1.9999675048336225e-06, "loss": 1.5484, "step": 1529 }, { "epoch": 0.01, "grad_norm": 5.127298418854591, "learning_rate": 1.9999674620348256e-06, "loss": 1.5482, "step": 1530 }, { "epoch": 0.01, "grad_norm": 7.393021516760661, "learning_rate": 1.9999674192078627e-06, "loss": 1.3923, "step": 1531 }, { "epoch": 0.01, "grad_norm": 4.597322785666696, "learning_rate": 1.9999673763527343e-06, "loss": 1.4507, "step": 1532 }, { "epoch": 0.01, "grad_norm": 4.759409590128979, "learning_rate": 1.99996733346944e-06, "loss": 1.328, "step": 1533 }, { "epoch": 0.01, "eval_loss": 1.6537013053894043, "eval_runtime": 4.6319, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 1533 }, { "epoch": 0.01, "grad_norm": 5.004922521706707, "learning_rate": 1.99996729055798e-06, "loss": 1.3943, "step": 1534 }, { "epoch": 0.01, "grad_norm": 4.645165024309459, "learning_rate": 1.999967247618354e-06, "loss": 1.3322, "step": 1535 }, { "epoch": 0.01, "grad_norm": 4.999318863504048, "learning_rate": 1.9999672046505628e-06, "loss": 1.5395, "step": 1536 }, { "epoch": 0.01, "grad_norm": 4.86361645206254, "learning_rate": 1.9999671616546054e-06, "loss": 1.5212, "step": 1537 }, { "epoch": 0.01, "grad_norm": 5.153420847923612, "learning_rate": 1.9999671186304825e-06, "loss": 1.5427, "step": 1538 }, { "epoch": 0.01, "grad_norm": 5.0394644534786135, "learning_rate": 1.9999670755781936e-06, "loss": 1.2973, "step": 1539 }, { "epoch": 0.01, "grad_norm": 4.633634743770092, "learning_rate": 1.9999670324977388e-06, "loss": 1.5085, "step": 1540 }, { "epoch": 0.01, "grad_norm": 8.357519744982609, "learning_rate": 1.999966989389119e-06, "loss": 1.3645, "step": 1541 }, { "epoch": 0.01, "grad_norm": 4.802169847384737, "learning_rate": 1.9999669462523325e-06, "loss": 1.5779, "step": 1542 }, { "epoch": 0.01, "grad_norm": 4.668930926325562, "learning_rate": 1.999966903087381e-06, "loss": 1.4067, "step": 1543 }, { "epoch": 0.01, "grad_norm": 4.43999149148277, "learning_rate": 1.9999668598942636e-06, "loss": 1.2107, "step": 1544 }, { "epoch": 0.01, "grad_norm": 4.802567954958664, "learning_rate": 1.99996681667298e-06, "loss": 1.4996, "step": 1545 }, { "epoch": 0.01, "grad_norm": 5.043050576777043, "learning_rate": 1.999966773423531e-06, "loss": 1.4864, "step": 1546 }, { "epoch": 0.01, "grad_norm": 4.838293228729252, "learning_rate": 1.9999667301459164e-06, "loss": 1.5079, "step": 1547 }, { "epoch": 0.01, "grad_norm": 4.8303265370783235, "learning_rate": 1.999966686840136e-06, "loss": 1.6446, "step": 1548 }, { "epoch": 0.01, "grad_norm": 5.4088008516017005, "learning_rate": 1.9999666435061896e-06, "loss": 1.4987, "step": 1549 }, { "epoch": 0.01, "grad_norm": 4.441236329740386, "learning_rate": 1.9999666001440776e-06, "loss": 1.4233, "step": 1550 }, { "epoch": 0.01, "grad_norm": 4.854304337374531, "learning_rate": 1.9999665567538e-06, "loss": 1.5069, "step": 1551 }, { "epoch": 0.01, "grad_norm": 4.651751008584424, "learning_rate": 1.9999665133353567e-06, "loss": 1.4673, "step": 1552 }, { "epoch": 0.01, "grad_norm": 4.650463359683428, "learning_rate": 1.9999664698887473e-06, "loss": 1.4744, "step": 1553 }, { "epoch": 0.01, "grad_norm": 6.294289457136488, "learning_rate": 1.999966426413973e-06, "loss": 1.7254, "step": 1554 }, { "epoch": 0.01, "grad_norm": 5.026568380786172, "learning_rate": 1.9999663829110324e-06, "loss": 1.574, "step": 1555 }, { "epoch": 0.01, "grad_norm": 5.258151411074215, "learning_rate": 1.999966339379926e-06, "loss": 1.3902, "step": 1556 }, { "epoch": 0.01, "grad_norm": 4.636549612306679, "learning_rate": 1.999966295820654e-06, "loss": 1.3695, "step": 1557 }, { "epoch": 0.01, "grad_norm": 5.752515784505921, "learning_rate": 1.999966252233216e-06, "loss": 1.5815, "step": 1558 }, { "epoch": 0.01, "grad_norm": 5.150495198663999, "learning_rate": 1.9999662086176125e-06, "loss": 1.4708, "step": 1559 }, { "epoch": 0.01, "grad_norm": 4.696102556808951, "learning_rate": 1.9999661649738435e-06, "loss": 1.3237, "step": 1560 }, { "epoch": 0.01, "grad_norm": 4.487064828854456, "learning_rate": 1.9999661213019085e-06, "loss": 1.367, "step": 1561 }, { "epoch": 0.01, "grad_norm": 4.674090800118098, "learning_rate": 1.9999660776018076e-06, "loss": 1.3795, "step": 1562 }, { "epoch": 0.01, "grad_norm": 4.884374781206321, "learning_rate": 1.9999660338735415e-06, "loss": 1.4984, "step": 1563 }, { "epoch": 0.01, "grad_norm": 4.8404936570617, "learning_rate": 1.9999659901171095e-06, "loss": 1.4553, "step": 1564 }, { "epoch": 0.01, "grad_norm": 5.0291200173626, "learning_rate": 1.9999659463325115e-06, "loss": 1.4493, "step": 1565 }, { "epoch": 0.01, "grad_norm": 4.489038208651526, "learning_rate": 1.999965902519748e-06, "loss": 1.4916, "step": 1566 }, { "epoch": 0.01, "grad_norm": 5.325129603155862, "learning_rate": 1.999965858678819e-06, "loss": 1.4466, "step": 1567 }, { "epoch": 0.01, "grad_norm": 4.845547041345931, "learning_rate": 1.999965814809724e-06, "loss": 1.4709, "step": 1568 }, { "epoch": 0.01, "grad_norm": 4.6323997434080955, "learning_rate": 1.999965770912463e-06, "loss": 1.2593, "step": 1569 }, { "epoch": 0.01, "grad_norm": 5.263469443114738, "learning_rate": 1.999965726987037e-06, "loss": 1.4309, "step": 1570 }, { "epoch": 0.01, "grad_norm": 4.7842493763504, "learning_rate": 1.9999656830334447e-06, "loss": 1.5784, "step": 1571 }, { "epoch": 0.01, "grad_norm": 5.581499884497734, "learning_rate": 1.999965639051687e-06, "loss": 1.4264, "step": 1572 }, { "epoch": 0.01, "grad_norm": 4.713589565139688, "learning_rate": 1.9999655950417635e-06, "loss": 1.3676, "step": 1573 }, { "epoch": 0.01, "grad_norm": 5.224520331971595, "learning_rate": 1.9999655510036744e-06, "loss": 1.5145, "step": 1574 }, { "epoch": 0.01, "grad_norm": 5.044254687476926, "learning_rate": 1.9999655069374193e-06, "loss": 1.382, "step": 1575 }, { "epoch": 0.01, "grad_norm": 4.998067415237716, "learning_rate": 1.999965462842999e-06, "loss": 1.5149, "step": 1576 }, { "epoch": 0.01, "grad_norm": 5.376451146617778, "learning_rate": 1.9999654187204126e-06, "loss": 1.4605, "step": 1577 }, { "epoch": 0.01, "grad_norm": 5.62147315713685, "learning_rate": 1.9999653745696605e-06, "loss": 1.5111, "step": 1578 }, { "epoch": 0.01, "grad_norm": 5.386593885174143, "learning_rate": 1.999965330390743e-06, "loss": 1.5952, "step": 1579 }, { "epoch": 0.01, "grad_norm": 4.6360487402587784, "learning_rate": 1.999965286183659e-06, "loss": 1.3293, "step": 1580 }, { "epoch": 0.01, "grad_norm": 4.759388015554171, "learning_rate": 1.9999652419484104e-06, "loss": 1.3158, "step": 1581 }, { "epoch": 0.01, "grad_norm": 4.720044228281999, "learning_rate": 1.9999651976849957e-06, "loss": 1.5035, "step": 1582 }, { "epoch": 0.01, "grad_norm": 4.571250672292211, "learning_rate": 1.999965153393415e-06, "loss": 1.3919, "step": 1583 }, { "epoch": 0.01, "grad_norm": 4.512671101809202, "learning_rate": 1.9999651090736693e-06, "loss": 1.3005, "step": 1584 }, { "epoch": 0.01, "grad_norm": 5.311600498717284, "learning_rate": 1.999965064725757e-06, "loss": 1.5076, "step": 1585 }, { "epoch": 0.01, "grad_norm": 5.4380918504811815, "learning_rate": 1.9999650203496794e-06, "loss": 1.5773, "step": 1586 }, { "epoch": 0.01, "grad_norm": 4.821728820130227, "learning_rate": 1.999964975945436e-06, "loss": 1.4923, "step": 1587 }, { "epoch": 0.01, "grad_norm": 4.701896651829335, "learning_rate": 1.9999649315130274e-06, "loss": 1.3324, "step": 1588 }, { "epoch": 0.01, "grad_norm": 4.826124943857317, "learning_rate": 1.9999648870524526e-06, "loss": 1.3942, "step": 1589 }, { "epoch": 0.01, "grad_norm": 5.252549552664715, "learning_rate": 1.9999648425637123e-06, "loss": 1.4413, "step": 1590 }, { "epoch": 0.01, "grad_norm": 4.33848280401789, "learning_rate": 1.9999647980468065e-06, "loss": 1.3627, "step": 1591 }, { "epoch": 0.01, "grad_norm": 6.232495688477331, "learning_rate": 1.999964753501735e-06, "loss": 1.371, "step": 1592 }, { "epoch": 0.01, "grad_norm": 4.559527771002142, "learning_rate": 1.9999647089284974e-06, "loss": 1.4434, "step": 1593 }, { "epoch": 0.01, "grad_norm": 4.70894162267966, "learning_rate": 1.9999646643270945e-06, "loss": 1.4673, "step": 1594 }, { "epoch": 0.01, "grad_norm": 5.676920877976759, "learning_rate": 1.999964619697526e-06, "loss": 1.4621, "step": 1595 }, { "epoch": 0.01, "grad_norm": 6.232169194197823, "learning_rate": 1.9999645750397918e-06, "loss": 1.4599, "step": 1596 }, { "epoch": 0.01, "grad_norm": 4.693130001219432, "learning_rate": 1.9999645303538914e-06, "loss": 1.5755, "step": 1597 }, { "epoch": 0.01, "grad_norm": 4.616061797152542, "learning_rate": 1.9999644856398255e-06, "loss": 1.525, "step": 1598 }, { "epoch": 0.01, "grad_norm": 5.082035414984437, "learning_rate": 1.999964440897594e-06, "loss": 1.4748, "step": 1599 }, { "epoch": 0.01, "grad_norm": 5.530227379151722, "learning_rate": 1.999964396127197e-06, "loss": 1.4081, "step": 1600 }, { "epoch": 0.01, "grad_norm": 11.714257093847134, "learning_rate": 1.9999643513286343e-06, "loss": 1.4166, "step": 1601 }, { "epoch": 0.01, "grad_norm": 5.188422008714217, "learning_rate": 1.999964306501906e-06, "loss": 1.3882, "step": 1602 }, { "epoch": 0.01, "grad_norm": 5.037793578152508, "learning_rate": 1.999964261647012e-06, "loss": 1.4275, "step": 1603 }, { "epoch": 0.01, "grad_norm": 5.775212123850943, "learning_rate": 1.9999642167639523e-06, "loss": 1.6035, "step": 1604 }, { "epoch": 0.01, "grad_norm": 5.478124459351955, "learning_rate": 1.9999641718527268e-06, "loss": 1.3335, "step": 1605 }, { "epoch": 0.01, "grad_norm": 5.055970312783448, "learning_rate": 1.9999641269133357e-06, "loss": 1.5906, "step": 1606 }, { "epoch": 0.01, "eval_loss": 1.6535860300064087, "eval_runtime": 4.6131, "eval_samples_per_second": 1.951, "eval_steps_per_second": 1.084, "step": 1606 }, { "epoch": 0.01, "grad_norm": 4.7695927713899335, "learning_rate": 1.9999640819457787e-06, "loss": 1.4862, "step": 1607 }, { "epoch": 0.01, "grad_norm": 4.631860591367596, "learning_rate": 1.9999640369500566e-06, "loss": 1.3437, "step": 1608 }, { "epoch": 0.01, "grad_norm": 5.184619042074156, "learning_rate": 1.9999639919261685e-06, "loss": 1.3648, "step": 1609 }, { "epoch": 0.01, "grad_norm": 4.273228198303511, "learning_rate": 1.999963946874115e-06, "loss": 1.2881, "step": 1610 }, { "epoch": 0.01, "grad_norm": 5.019924351174469, "learning_rate": 1.9999639017938953e-06, "loss": 1.4094, "step": 1611 }, { "epoch": 0.01, "grad_norm": 6.796825605500974, "learning_rate": 1.99996385668551e-06, "loss": 1.4356, "step": 1612 }, { "epoch": 0.01, "grad_norm": 7.039100317720765, "learning_rate": 1.9999638115489595e-06, "loss": 1.507, "step": 1613 }, { "epoch": 0.01, "grad_norm": 4.760579560458608, "learning_rate": 1.9999637663842433e-06, "loss": 1.2805, "step": 1614 }, { "epoch": 0.01, "grad_norm": 5.289967418587153, "learning_rate": 1.999963721191361e-06, "loss": 1.5173, "step": 1615 }, { "epoch": 0.01, "grad_norm": 4.905881261758454, "learning_rate": 1.9999636759703135e-06, "loss": 1.4773, "step": 1616 }, { "epoch": 0.01, "grad_norm": 4.601006920242404, "learning_rate": 1.9999636307211002e-06, "loss": 1.4564, "step": 1617 }, { "epoch": 0.01, "grad_norm": 4.623950290324913, "learning_rate": 1.9999635854437214e-06, "loss": 1.5243, "step": 1618 }, { "epoch": 0.01, "grad_norm": 5.797135728663394, "learning_rate": 1.9999635401381767e-06, "loss": 1.6152, "step": 1619 }, { "epoch": 0.01, "grad_norm": 4.494921471069805, "learning_rate": 1.9999634948044664e-06, "loss": 1.3623, "step": 1620 }, { "epoch": 0.01, "grad_norm": 5.786655122435106, "learning_rate": 1.9999634494425906e-06, "loss": 1.3719, "step": 1621 }, { "epoch": 0.01, "grad_norm": 6.5772063648225405, "learning_rate": 1.9999634040525492e-06, "loss": 1.6291, "step": 1622 }, { "epoch": 0.01, "grad_norm": 4.797055299395125, "learning_rate": 1.999963358634342e-06, "loss": 1.4054, "step": 1623 }, { "epoch": 0.01, "grad_norm": 4.881210206846928, "learning_rate": 1.999963313187969e-06, "loss": 1.3922, "step": 1624 }, { "epoch": 0.01, "grad_norm": 4.7416575099427405, "learning_rate": 1.9999632677134306e-06, "loss": 1.3498, "step": 1625 }, { "epoch": 0.01, "grad_norm": 5.094353766754562, "learning_rate": 1.9999632222107262e-06, "loss": 1.4907, "step": 1626 }, { "epoch": 0.01, "grad_norm": 5.136116876869801, "learning_rate": 1.9999631766798568e-06, "loss": 1.4637, "step": 1627 }, { "epoch": 0.01, "grad_norm": 4.842724728052074, "learning_rate": 1.9999631311208213e-06, "loss": 1.5065, "step": 1628 }, { "epoch": 0.01, "grad_norm": 4.470556712718268, "learning_rate": 1.9999630855336203e-06, "loss": 1.4163, "step": 1629 }, { "epoch": 0.01, "grad_norm": 5.4357945417775015, "learning_rate": 1.999963039918254e-06, "loss": 1.5127, "step": 1630 }, { "epoch": 0.01, "grad_norm": 5.283784706095175, "learning_rate": 1.9999629942747213e-06, "loss": 1.5162, "step": 1631 }, { "epoch": 0.01, "grad_norm": 6.171430776129922, "learning_rate": 1.9999629486030233e-06, "loss": 1.4975, "step": 1632 }, { "epoch": 0.01, "grad_norm": 4.883035958460619, "learning_rate": 1.9999629029031597e-06, "loss": 1.5646, "step": 1633 }, { "epoch": 0.01, "grad_norm": 4.868003718244226, "learning_rate": 1.999962857175131e-06, "loss": 1.4711, "step": 1634 }, { "epoch": 0.01, "grad_norm": 4.688207116642901, "learning_rate": 1.999962811418936e-06, "loss": 1.235, "step": 1635 }, { "epoch": 0.01, "grad_norm": 4.779397974493194, "learning_rate": 1.9999627656345753e-06, "loss": 1.4816, "step": 1636 }, { "epoch": 0.01, "grad_norm": 4.787958938157594, "learning_rate": 1.9999627198220496e-06, "loss": 1.3246, "step": 1637 }, { "epoch": 0.01, "grad_norm": 5.294916108987848, "learning_rate": 1.999962673981358e-06, "loss": 1.4679, "step": 1638 }, { "epoch": 0.01, "grad_norm": 4.849618123624191, "learning_rate": 1.9999626281125002e-06, "loss": 1.4614, "step": 1639 }, { "epoch": 0.01, "grad_norm": 4.677011611090417, "learning_rate": 1.9999625822154775e-06, "loss": 1.3398, "step": 1640 }, { "epoch": 0.01, "grad_norm": 4.707575677733155, "learning_rate": 1.999962536290289e-06, "loss": 1.4338, "step": 1641 }, { "epoch": 0.01, "grad_norm": 5.1791912413051815, "learning_rate": 1.999962490336935e-06, "loss": 1.6644, "step": 1642 }, { "epoch": 0.01, "grad_norm": 4.636258199528408, "learning_rate": 1.999962444355415e-06, "loss": 1.3262, "step": 1643 }, { "epoch": 0.01, "grad_norm": 9.631481491029284, "learning_rate": 1.9999623983457297e-06, "loss": 1.3487, "step": 1644 }, { "epoch": 0.01, "grad_norm": 5.406573344850598, "learning_rate": 1.9999623523078784e-06, "loss": 1.4415, "step": 1645 }, { "epoch": 0.01, "grad_norm": 4.936420173298707, "learning_rate": 1.999962306241862e-06, "loss": 1.5284, "step": 1646 }, { "epoch": 0.01, "grad_norm": 7.408101415603079, "learning_rate": 1.9999622601476796e-06, "loss": 1.5724, "step": 1647 }, { "epoch": 0.01, "grad_norm": 4.91033489810393, "learning_rate": 1.999962214025332e-06, "loss": 1.4389, "step": 1648 }, { "epoch": 0.01, "grad_norm": 4.7490093326999965, "learning_rate": 1.9999621678748186e-06, "loss": 1.4127, "step": 1649 }, { "epoch": 0.01, "grad_norm": 5.02865055772892, "learning_rate": 1.999962121696139e-06, "loss": 1.5252, "step": 1650 }, { "epoch": 0.01, "grad_norm": 4.941820991121326, "learning_rate": 1.9999620754892946e-06, "loss": 1.5824, "step": 1651 }, { "epoch": 0.01, "grad_norm": 4.584733328893442, "learning_rate": 1.9999620292542845e-06, "loss": 1.4465, "step": 1652 }, { "epoch": 0.01, "grad_norm": 4.668886241818989, "learning_rate": 1.9999619829911084e-06, "loss": 1.5671, "step": 1653 }, { "epoch": 0.01, "grad_norm": 4.8595287693689455, "learning_rate": 1.9999619366997673e-06, "loss": 1.4179, "step": 1654 }, { "epoch": 0.01, "grad_norm": 4.505667922340234, "learning_rate": 1.99996189038026e-06, "loss": 1.3717, "step": 1655 }, { "epoch": 0.01, "grad_norm": 4.854733211903465, "learning_rate": 1.999961844032587e-06, "loss": 1.5526, "step": 1656 }, { "epoch": 0.01, "grad_norm": 6.052863076519862, "learning_rate": 1.999961797656749e-06, "loss": 1.5102, "step": 1657 }, { "epoch": 0.01, "grad_norm": 4.391873652615683, "learning_rate": 1.999961751252745e-06, "loss": 1.313, "step": 1658 }, { "epoch": 0.01, "grad_norm": 4.905334859088026, "learning_rate": 1.9999617048205758e-06, "loss": 1.4915, "step": 1659 }, { "epoch": 0.01, "grad_norm": 4.90511142571514, "learning_rate": 1.9999616583602405e-06, "loss": 1.3705, "step": 1660 }, { "epoch": 0.01, "grad_norm": 5.696590131572682, "learning_rate": 1.99996161187174e-06, "loss": 1.3774, "step": 1661 }, { "epoch": 0.01, "grad_norm": 4.901553847339917, "learning_rate": 1.999961565355074e-06, "loss": 1.4858, "step": 1662 }, { "epoch": 0.01, "grad_norm": 5.080014172214551, "learning_rate": 1.9999615188102415e-06, "loss": 1.3527, "step": 1663 }, { "epoch": 0.01, "grad_norm": 5.286819461235812, "learning_rate": 1.999961472237244e-06, "loss": 1.5443, "step": 1664 }, { "epoch": 0.01, "grad_norm": 5.083890130164741, "learning_rate": 1.999961425636081e-06, "loss": 1.4417, "step": 1665 }, { "epoch": 0.01, "grad_norm": 4.8993271722773235, "learning_rate": 1.9999613790067526e-06, "loss": 1.4137, "step": 1666 }, { "epoch": 0.01, "grad_norm": 5.130884125485172, "learning_rate": 1.9999613323492586e-06, "loss": 1.5135, "step": 1667 }, { "epoch": 0.01, "grad_norm": 4.560714520466941, "learning_rate": 1.9999612856635986e-06, "loss": 1.2326, "step": 1668 }, { "epoch": 0.01, "grad_norm": 4.469864395669994, "learning_rate": 1.9999612389497735e-06, "loss": 1.3952, "step": 1669 }, { "epoch": 0.01, "grad_norm": 5.0576333320329425, "learning_rate": 1.9999611922077824e-06, "loss": 1.4062, "step": 1670 }, { "epoch": 0.01, "grad_norm": 4.550473014043539, "learning_rate": 1.999961145437626e-06, "loss": 1.3792, "step": 1671 }, { "epoch": 0.01, "grad_norm": 4.749509524045929, "learning_rate": 1.9999610986393037e-06, "loss": 1.4487, "step": 1672 }, { "epoch": 0.01, "grad_norm": 5.365020045883757, "learning_rate": 1.999961051812816e-06, "loss": 1.4974, "step": 1673 }, { "epoch": 0.01, "grad_norm": 4.624114515501705, "learning_rate": 1.9999610049581627e-06, "loss": 1.388, "step": 1674 }, { "epoch": 0.01, "grad_norm": 4.510101567218574, "learning_rate": 1.999960958075344e-06, "loss": 1.3547, "step": 1675 }, { "epoch": 0.01, "grad_norm": 5.07080055515291, "learning_rate": 1.9999609111643596e-06, "loss": 1.4748, "step": 1676 }, { "epoch": 0.01, "grad_norm": 4.8404551347295985, "learning_rate": 1.9999608642252094e-06, "loss": 1.4301, "step": 1677 }, { "epoch": 0.01, "grad_norm": 5.636720567872511, "learning_rate": 1.999960817257894e-06, "loss": 1.351, "step": 1678 }, { "epoch": 0.01, "grad_norm": 4.652209867428249, "learning_rate": 1.999960770262413e-06, "loss": 1.4968, "step": 1679 }, { "epoch": 0.01, "eval_loss": 1.6513854265213013, "eval_runtime": 4.6169, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 1679 }, { "epoch": 0.01, "grad_norm": 4.725930228784767, "learning_rate": 1.999960723238766e-06, "loss": 1.409, "step": 1680 }, { "epoch": 0.01, "grad_norm": 4.954953461456659, "learning_rate": 1.9999606761869537e-06, "loss": 1.4448, "step": 1681 }, { "epoch": 0.01, "grad_norm": 5.255376181858065, "learning_rate": 1.9999606291069757e-06, "loss": 1.3616, "step": 1682 }, { "epoch": 0.01, "grad_norm": 4.709113389093899, "learning_rate": 1.9999605819988326e-06, "loss": 1.4076, "step": 1683 }, { "epoch": 0.01, "grad_norm": 5.766680383528532, "learning_rate": 1.9999605348625236e-06, "loss": 1.2462, "step": 1684 }, { "epoch": 0.01, "grad_norm": 4.409591894443744, "learning_rate": 1.999960487698049e-06, "loss": 1.4091, "step": 1685 }, { "epoch": 0.01, "grad_norm": 4.886994914850528, "learning_rate": 1.9999604405054093e-06, "loss": 1.4295, "step": 1686 }, { "epoch": 0.01, "grad_norm": 5.933768715491502, "learning_rate": 1.9999603932846036e-06, "loss": 1.3117, "step": 1687 }, { "epoch": 0.01, "grad_norm": 4.404087310320486, "learning_rate": 1.9999603460356324e-06, "loss": 1.4082, "step": 1688 }, { "epoch": 0.01, "grad_norm": 6.132686826605006, "learning_rate": 1.9999602987584952e-06, "loss": 1.4659, "step": 1689 }, { "epoch": 0.01, "grad_norm": 4.9809185718468205, "learning_rate": 1.999960251453193e-06, "loss": 1.5435, "step": 1690 }, { "epoch": 0.01, "grad_norm": 5.719735058251011, "learning_rate": 1.999960204119725e-06, "loss": 1.3241, "step": 1691 }, { "epoch": 0.01, "grad_norm": 5.016858779358495, "learning_rate": 1.9999601567580917e-06, "loss": 1.3907, "step": 1692 }, { "epoch": 0.01, "grad_norm": 5.518980098450484, "learning_rate": 1.999960109368293e-06, "loss": 1.5071, "step": 1693 }, { "epoch": 0.01, "grad_norm": 4.56680393314666, "learning_rate": 1.9999600619503284e-06, "loss": 1.277, "step": 1694 }, { "epoch": 0.01, "grad_norm": 5.221095667195953, "learning_rate": 1.9999600145041984e-06, "loss": 1.5565, "step": 1695 }, { "epoch": 0.01, "grad_norm": 4.838855837516108, "learning_rate": 1.9999599670299024e-06, "loss": 1.5075, "step": 1696 }, { "epoch": 0.01, "grad_norm": 5.181585087689867, "learning_rate": 1.9999599195274414e-06, "loss": 1.4358, "step": 1697 }, { "epoch": 0.01, "grad_norm": 4.775117548464921, "learning_rate": 1.9999598719968148e-06, "loss": 1.3676, "step": 1698 }, { "epoch": 0.01, "grad_norm": 4.675367411416997, "learning_rate": 1.9999598244380226e-06, "loss": 1.4902, "step": 1699 }, { "epoch": 0.01, "grad_norm": 4.534748298661586, "learning_rate": 1.999959776851065e-06, "loss": 1.3374, "step": 1700 }, { "epoch": 0.01, "grad_norm": 6.092872429354068, "learning_rate": 1.9999597292359413e-06, "loss": 1.3938, "step": 1701 }, { "epoch": 0.01, "grad_norm": 5.1363468139465835, "learning_rate": 1.9999596815926525e-06, "loss": 1.4641, "step": 1702 }, { "epoch": 0.01, "grad_norm": 4.679526003354978, "learning_rate": 1.9999596339211982e-06, "loss": 1.4826, "step": 1703 }, { "epoch": 0.01, "grad_norm": 5.409980918956625, "learning_rate": 1.999959586221578e-06, "loss": 1.459, "step": 1704 }, { "epoch": 0.01, "grad_norm": 4.506106987900174, "learning_rate": 1.9999595384937926e-06, "loss": 1.4694, "step": 1705 }, { "epoch": 0.01, "grad_norm": 5.139999841003741, "learning_rate": 1.9999594907378412e-06, "loss": 1.497, "step": 1706 }, { "epoch": 0.01, "grad_norm": 4.966305341143314, "learning_rate": 1.999959442953725e-06, "loss": 1.4117, "step": 1707 }, { "epoch": 0.01, "grad_norm": 11.81613488843635, "learning_rate": 1.9999593951414427e-06, "loss": 1.6682, "step": 1708 }, { "epoch": 0.01, "grad_norm": 5.377588601757201, "learning_rate": 1.999959347300995e-06, "loss": 1.3065, "step": 1709 }, { "epoch": 0.01, "grad_norm": 6.023397142367646, "learning_rate": 1.999959299432382e-06, "loss": 1.4304, "step": 1710 }, { "epoch": 0.01, "grad_norm": 4.675493936243796, "learning_rate": 1.9999592515356035e-06, "loss": 1.2623, "step": 1711 }, { "epoch": 0.01, "grad_norm": 4.86058388014828, "learning_rate": 1.9999592036106593e-06, "loss": 1.4823, "step": 1712 }, { "epoch": 0.01, "grad_norm": 4.556345518412603, "learning_rate": 1.9999591556575496e-06, "loss": 1.4409, "step": 1713 }, { "epoch": 0.01, "grad_norm": 7.066520926861509, "learning_rate": 1.9999591076762744e-06, "loss": 1.3931, "step": 1714 }, { "epoch": 0.01, "grad_norm": 5.416558606687308, "learning_rate": 1.9999590596668336e-06, "loss": 1.5513, "step": 1715 }, { "epoch": 0.01, "grad_norm": 4.566717284747848, "learning_rate": 1.9999590116292273e-06, "loss": 1.4196, "step": 1716 }, { "epoch": 0.01, "grad_norm": 4.774215759192912, "learning_rate": 1.9999589635634554e-06, "loss": 1.2179, "step": 1717 }, { "epoch": 0.01, "grad_norm": 5.871975120855977, "learning_rate": 1.999958915469518e-06, "loss": 1.3091, "step": 1718 }, { "epoch": 0.01, "grad_norm": 4.667731197924079, "learning_rate": 1.999958867347415e-06, "loss": 1.4477, "step": 1719 }, { "epoch": 0.01, "grad_norm": 8.363688597457996, "learning_rate": 1.9999588191971466e-06, "loss": 1.4968, "step": 1720 }, { "epoch": 0.01, "grad_norm": 6.524755329539832, "learning_rate": 1.999958771018713e-06, "loss": 1.5737, "step": 1721 }, { "epoch": 0.01, "grad_norm": 6.844327612634827, "learning_rate": 1.9999587228121135e-06, "loss": 1.5503, "step": 1722 }, { "epoch": 0.01, "grad_norm": 4.545944405243781, "learning_rate": 1.999958674577349e-06, "loss": 1.227, "step": 1723 }, { "epoch": 0.01, "grad_norm": 5.569019618596105, "learning_rate": 1.999958626314418e-06, "loss": 1.3704, "step": 1724 }, { "epoch": 0.01, "grad_norm": 4.840943336713329, "learning_rate": 1.9999585780233224e-06, "loss": 1.5026, "step": 1725 }, { "epoch": 0.01, "grad_norm": 5.04053411151362, "learning_rate": 1.9999585297040607e-06, "loss": 1.5642, "step": 1726 }, { "epoch": 0.01, "grad_norm": 5.6356258265297985, "learning_rate": 1.999958481356634e-06, "loss": 1.5226, "step": 1727 }, { "epoch": 0.01, "grad_norm": 4.640699731713796, "learning_rate": 1.9999584329810415e-06, "loss": 1.3299, "step": 1728 }, { "epoch": 0.01, "grad_norm": 8.472243462527622, "learning_rate": 1.9999583845772836e-06, "loss": 1.4519, "step": 1729 }, { "epoch": 0.01, "grad_norm": 7.611553303462497, "learning_rate": 1.99995833614536e-06, "loss": 1.4071, "step": 1730 }, { "epoch": 0.01, "grad_norm": 4.944281632533813, "learning_rate": 1.999958287685271e-06, "loss": 1.2544, "step": 1731 }, { "epoch": 0.01, "grad_norm": 4.6772733567861176, "learning_rate": 1.9999582391970166e-06, "loss": 1.3753, "step": 1732 }, { "epoch": 0.01, "grad_norm": 4.263194033542804, "learning_rate": 1.999958190680597e-06, "loss": 1.2437, "step": 1733 }, { "epoch": 0.01, "grad_norm": 6.654617300777728, "learning_rate": 1.9999581421360114e-06, "loss": 1.3443, "step": 1734 }, { "epoch": 0.01, "grad_norm": 5.17256863920859, "learning_rate": 1.9999580935632606e-06, "loss": 1.5056, "step": 1735 }, { "epoch": 0.01, "grad_norm": 5.491095887490403, "learning_rate": 1.999958044962344e-06, "loss": 1.32, "step": 1736 }, { "epoch": 0.01, "grad_norm": 5.391828873655179, "learning_rate": 1.999957996333262e-06, "loss": 1.342, "step": 1737 }, { "epoch": 0.01, "grad_norm": 4.966305374933118, "learning_rate": 1.999957947676015e-06, "loss": 1.3956, "step": 1738 }, { "epoch": 0.01, "grad_norm": 4.995740008299241, "learning_rate": 1.999957898990602e-06, "loss": 1.5289, "step": 1739 }, { "epoch": 0.01, "grad_norm": 4.381812845234866, "learning_rate": 1.9999578502770235e-06, "loss": 1.2572, "step": 1740 }, { "epoch": 0.01, "grad_norm": 4.61281321178712, "learning_rate": 1.9999578015352795e-06, "loss": 1.2066, "step": 1741 }, { "epoch": 0.01, "grad_norm": 4.441293918501137, "learning_rate": 1.99995775276537e-06, "loss": 1.3348, "step": 1742 }, { "epoch": 0.01, "grad_norm": 5.187132615217963, "learning_rate": 1.9999577039672954e-06, "loss": 1.5402, "step": 1743 }, { "epoch": 0.01, "grad_norm": 4.842621185034447, "learning_rate": 1.9999576551410553e-06, "loss": 1.3723, "step": 1744 }, { "epoch": 0.01, "grad_norm": 4.4907842747894895, "learning_rate": 1.9999576062866496e-06, "loss": 1.3577, "step": 1745 }, { "epoch": 0.01, "grad_norm": 4.71227076890854, "learning_rate": 1.999957557404078e-06, "loss": 1.3866, "step": 1746 }, { "epoch": 0.01, "grad_norm": 5.279369312058059, "learning_rate": 1.999957508493341e-06, "loss": 1.4093, "step": 1747 }, { "epoch": 0.01, "grad_norm": 4.646829536444046, "learning_rate": 1.9999574595544392e-06, "loss": 1.4286, "step": 1748 }, { "epoch": 0.01, "grad_norm": 4.593261228259904, "learning_rate": 1.9999574105873714e-06, "loss": 1.4716, "step": 1749 }, { "epoch": 0.01, "grad_norm": 5.0886616589245, "learning_rate": 1.999957361592138e-06, "loss": 1.6598, "step": 1750 }, { "epoch": 0.01, "grad_norm": 4.753829674453155, "learning_rate": 1.9999573125687395e-06, "loss": 1.3023, "step": 1751 }, { "epoch": 0.01, "grad_norm": 7.911294692900917, "learning_rate": 1.9999572635171754e-06, "loss": 1.4902, "step": 1752 }, { "epoch": 0.01, "eval_loss": 1.6478168964385986, "eval_runtime": 4.6461, "eval_samples_per_second": 1.937, "eval_steps_per_second": 1.076, "step": 1752 }, { "epoch": 0.01, "grad_norm": 4.700414649164786, "learning_rate": 1.999957214437446e-06, "loss": 1.482, "step": 1753 }, { "epoch": 0.01, "grad_norm": 4.9281821284726615, "learning_rate": 1.9999571653295507e-06, "loss": 1.306, "step": 1754 }, { "epoch": 0.01, "grad_norm": 4.787957118049158, "learning_rate": 1.99995711619349e-06, "loss": 1.3873, "step": 1755 }, { "epoch": 0.01, "grad_norm": 4.631636647001677, "learning_rate": 1.999957067029264e-06, "loss": 1.4089, "step": 1756 }, { "epoch": 0.01, "grad_norm": 4.754702249860543, "learning_rate": 1.9999570178368725e-06, "loss": 1.4074, "step": 1757 }, { "epoch": 0.01, "grad_norm": 5.069526479435708, "learning_rate": 1.9999569686163157e-06, "loss": 1.4378, "step": 1758 }, { "epoch": 0.01, "grad_norm": 4.9557014533023835, "learning_rate": 1.9999569193675933e-06, "loss": 1.3766, "step": 1759 }, { "epoch": 0.01, "grad_norm": 4.889794648818529, "learning_rate": 1.9999568700907053e-06, "loss": 1.5023, "step": 1760 }, { "epoch": 0.01, "grad_norm": 4.6039304576772, "learning_rate": 1.9999568207856523e-06, "loss": 1.3601, "step": 1761 }, { "epoch": 0.01, "grad_norm": 4.626450487742758, "learning_rate": 1.9999567714524337e-06, "loss": 1.3328, "step": 1762 }, { "epoch": 0.01, "grad_norm": 4.935339476011332, "learning_rate": 1.999956722091049e-06, "loss": 1.4359, "step": 1763 }, { "epoch": 0.01, "grad_norm": 5.059440744798255, "learning_rate": 1.9999566727014994e-06, "loss": 1.4256, "step": 1764 }, { "epoch": 0.01, "grad_norm": 4.856890423684793, "learning_rate": 1.999956623283784e-06, "loss": 1.3171, "step": 1765 }, { "epoch": 0.01, "grad_norm": 5.243931477116334, "learning_rate": 1.999956573837904e-06, "loss": 1.369, "step": 1766 }, { "epoch": 0.01, "grad_norm": 6.998539551416535, "learning_rate": 1.9999565243638576e-06, "loss": 1.439, "step": 1767 }, { "epoch": 0.01, "grad_norm": 4.836947003615344, "learning_rate": 1.999956474861646e-06, "loss": 1.4922, "step": 1768 }, { "epoch": 0.01, "grad_norm": 5.000816070522979, "learning_rate": 1.999956425331269e-06, "loss": 1.5798, "step": 1769 }, { "epoch": 0.01, "grad_norm": 5.162103021341265, "learning_rate": 1.9999563757727267e-06, "loss": 1.4635, "step": 1770 }, { "epoch": 0.01, "grad_norm": 4.754537659429389, "learning_rate": 1.999956326186019e-06, "loss": 1.4111, "step": 1771 }, { "epoch": 0.01, "grad_norm": 5.0189050672383, "learning_rate": 1.999956276571146e-06, "loss": 1.332, "step": 1772 }, { "epoch": 0.01, "grad_norm": 4.489911679774594, "learning_rate": 1.999956226928107e-06, "loss": 1.3697, "step": 1773 }, { "epoch": 0.01, "grad_norm": 5.139806714207968, "learning_rate": 1.999956177256903e-06, "loss": 1.4717, "step": 1774 }, { "epoch": 0.01, "grad_norm": 4.74100968652323, "learning_rate": 1.9999561275575333e-06, "loss": 1.4658, "step": 1775 }, { "epoch": 0.01, "grad_norm": 4.879178648686916, "learning_rate": 1.9999560778299984e-06, "loss": 1.4636, "step": 1776 }, { "epoch": 0.01, "grad_norm": 4.891078192372891, "learning_rate": 1.999956028074298e-06, "loss": 1.5723, "step": 1777 }, { "epoch": 0.01, "grad_norm": 5.653962016600203, "learning_rate": 1.999955978290432e-06, "loss": 1.5495, "step": 1778 }, { "epoch": 0.01, "grad_norm": 5.707164030187875, "learning_rate": 1.9999559284784005e-06, "loss": 1.64, "step": 1779 }, { "epoch": 0.01, "grad_norm": 4.454220280449505, "learning_rate": 1.999955878638204e-06, "loss": 1.336, "step": 1780 }, { "epoch": 0.01, "grad_norm": 4.593524682406172, "learning_rate": 1.9999558287698418e-06, "loss": 1.387, "step": 1781 }, { "epoch": 0.01, "grad_norm": 5.265214222108159, "learning_rate": 1.999955778873314e-06, "loss": 1.5196, "step": 1782 }, { "epoch": 0.01, "grad_norm": 4.8572705429997525, "learning_rate": 1.999955728948621e-06, "loss": 1.3624, "step": 1783 }, { "epoch": 0.01, "grad_norm": 4.789880139543268, "learning_rate": 1.9999556789957625e-06, "loss": 1.4488, "step": 1784 }, { "epoch": 0.01, "grad_norm": 6.621010536150032, "learning_rate": 1.9999556290147386e-06, "loss": 1.4625, "step": 1785 }, { "epoch": 0.01, "grad_norm": 5.383083285855116, "learning_rate": 1.999955579005549e-06, "loss": 1.4843, "step": 1786 }, { "epoch": 0.01, "grad_norm": 4.724436780612743, "learning_rate": 1.9999555289681946e-06, "loss": 1.3203, "step": 1787 }, { "epoch": 0.01, "grad_norm": 5.410873580372611, "learning_rate": 1.999955478902674e-06, "loss": 1.5224, "step": 1788 }, { "epoch": 0.01, "grad_norm": 5.290692087535022, "learning_rate": 1.9999554288089885e-06, "loss": 1.6514, "step": 1789 }, { "epoch": 0.01, "grad_norm": 4.756689362126989, "learning_rate": 1.9999553786871377e-06, "loss": 1.4927, "step": 1790 }, { "epoch": 0.01, "grad_norm": 5.336049328185806, "learning_rate": 1.9999553285371214e-06, "loss": 1.4515, "step": 1791 }, { "epoch": 0.01, "grad_norm": 5.194031618429659, "learning_rate": 1.999955278358939e-06, "loss": 1.5161, "step": 1792 }, { "epoch": 0.01, "grad_norm": 6.0237033121897845, "learning_rate": 1.9999552281525923e-06, "loss": 1.4292, "step": 1793 }, { "epoch": 0.01, "grad_norm": 4.482776055356631, "learning_rate": 1.9999551779180794e-06, "loss": 1.402, "step": 1794 }, { "epoch": 0.01, "grad_norm": 6.737389022108268, "learning_rate": 1.9999551276554014e-06, "loss": 1.5099, "step": 1795 }, { "epoch": 0.01, "grad_norm": 4.878318638241726, "learning_rate": 1.999955077364558e-06, "loss": 1.529, "step": 1796 }, { "epoch": 0.01, "grad_norm": 4.54561426906885, "learning_rate": 1.999955027045549e-06, "loss": 1.451, "step": 1797 }, { "epoch": 0.01, "grad_norm": 4.560824329023515, "learning_rate": 1.999954976698375e-06, "loss": 1.3774, "step": 1798 }, { "epoch": 0.01, "grad_norm": 4.798803868724221, "learning_rate": 1.9999549263230347e-06, "loss": 1.4121, "step": 1799 }, { "epoch": 0.01, "grad_norm": 6.383273482325183, "learning_rate": 1.99995487591953e-06, "loss": 1.4391, "step": 1800 }, { "epoch": 0.01, "grad_norm": 4.4091547065978425, "learning_rate": 1.9999548254878595e-06, "loss": 1.2512, "step": 1801 }, { "epoch": 0.01, "grad_norm": 4.838374021260141, "learning_rate": 1.999954775028023e-06, "loss": 1.4316, "step": 1802 }, { "epoch": 0.01, "grad_norm": 5.461055735973189, "learning_rate": 1.999954724540022e-06, "loss": 1.5318, "step": 1803 }, { "epoch": 0.01, "grad_norm": 5.633136261316013, "learning_rate": 1.9999546740238554e-06, "loss": 1.5676, "step": 1804 }, { "epoch": 0.01, "grad_norm": 4.303894260809806, "learning_rate": 1.9999546234795233e-06, "loss": 1.3127, "step": 1805 }, { "epoch": 0.01, "grad_norm": 5.072682679306013, "learning_rate": 1.9999545729070256e-06, "loss": 1.655, "step": 1806 }, { "epoch": 0.01, "grad_norm": 4.8324040981937815, "learning_rate": 1.999954522306363e-06, "loss": 1.4801, "step": 1807 }, { "epoch": 0.01, "grad_norm": 4.656940444517245, "learning_rate": 1.9999544716775345e-06, "loss": 1.3215, "step": 1808 }, { "epoch": 0.01, "grad_norm": 5.390009789147993, "learning_rate": 1.9999544210205406e-06, "loss": 1.4592, "step": 1809 }, { "epoch": 0.01, "grad_norm": 4.190582805213608, "learning_rate": 1.9999543703353816e-06, "loss": 1.2857, "step": 1810 }, { "epoch": 0.01, "grad_norm": 4.814450823568367, "learning_rate": 1.999954319622057e-06, "loss": 1.3498, "step": 1811 }, { "epoch": 0.01, "grad_norm": 4.868225374359218, "learning_rate": 1.9999542688805674e-06, "loss": 1.414, "step": 1812 }, { "epoch": 0.01, "grad_norm": 4.711417209971132, "learning_rate": 1.9999542181109123e-06, "loss": 1.355, "step": 1813 }, { "epoch": 0.01, "grad_norm": 5.007722725809002, "learning_rate": 1.9999541673130915e-06, "loss": 1.4713, "step": 1814 }, { "epoch": 0.01, "grad_norm": 4.8626655741355, "learning_rate": 1.9999541164871053e-06, "loss": 1.4264, "step": 1815 }, { "epoch": 0.01, "grad_norm": 4.723792462194043, "learning_rate": 1.999954065632954e-06, "loss": 1.4045, "step": 1816 }, { "epoch": 0.01, "grad_norm": 8.00100690699963, "learning_rate": 1.9999540147506374e-06, "loss": 1.4712, "step": 1817 }, { "epoch": 0.01, "grad_norm": 12.253942127148356, "learning_rate": 1.9999539638401553e-06, "loss": 1.4985, "step": 1818 }, { "epoch": 0.01, "grad_norm": 5.246538314584338, "learning_rate": 1.9999539129015073e-06, "loss": 1.4158, "step": 1819 }, { "epoch": 0.01, "grad_norm": 5.063213658151458, "learning_rate": 1.9999538619346947e-06, "loss": 1.4696, "step": 1820 }, { "epoch": 0.01, "grad_norm": 5.3593617360186405, "learning_rate": 1.9999538109397164e-06, "loss": 1.4141, "step": 1821 }, { "epoch": 0.01, "grad_norm": 4.648351531479633, "learning_rate": 1.9999537599165726e-06, "loss": 1.4502, "step": 1822 }, { "epoch": 0.01, "grad_norm": 6.134127084206851, "learning_rate": 1.9999537088652638e-06, "loss": 1.628, "step": 1823 }, { "epoch": 0.01, "grad_norm": 5.20685547745487, "learning_rate": 1.9999536577857893e-06, "loss": 1.2484, "step": 1824 }, { "epoch": 0.01, "grad_norm": 5.404803458184888, "learning_rate": 1.9999536066781494e-06, "loss": 1.4614, "step": 1825 }, { "epoch": 0.01, "eval_loss": 1.6452957391738892, "eval_runtime": 4.6377, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 1825 }, { "epoch": 0.01, "grad_norm": 5.325255851791553, "learning_rate": 1.9999535555423443e-06, "loss": 1.2913, "step": 1826 }, { "epoch": 0.01, "grad_norm": 4.812379984138036, "learning_rate": 1.9999535043783737e-06, "loss": 1.4321, "step": 1827 }, { "epoch": 0.01, "grad_norm": 4.574060048853807, "learning_rate": 1.999953453186238e-06, "loss": 1.4385, "step": 1828 }, { "epoch": 0.01, "grad_norm": 4.690167042783861, "learning_rate": 1.9999534019659366e-06, "loss": 1.4505, "step": 1829 }, { "epoch": 0.01, "grad_norm": 4.963037184486775, "learning_rate": 1.9999533507174702e-06, "loss": 1.3754, "step": 1830 }, { "epoch": 0.01, "grad_norm": 7.202540422032654, "learning_rate": 1.9999532994408383e-06, "loss": 1.5731, "step": 1831 }, { "epoch": 0.01, "grad_norm": 4.747566281161776, "learning_rate": 1.999953248136041e-06, "loss": 1.35, "step": 1832 }, { "epoch": 0.01, "grad_norm": 5.016639574119175, "learning_rate": 1.9999531968030782e-06, "loss": 1.5687, "step": 1833 }, { "epoch": 0.01, "grad_norm": 5.195599580383433, "learning_rate": 1.99995314544195e-06, "loss": 1.3277, "step": 1834 }, { "epoch": 0.01, "grad_norm": 4.71924385337544, "learning_rate": 1.999953094052657e-06, "loss": 1.3144, "step": 1835 }, { "epoch": 0.01, "grad_norm": 4.830368171047517, "learning_rate": 1.999953042635198e-06, "loss": 1.5285, "step": 1836 }, { "epoch": 0.01, "grad_norm": 8.34374594117172, "learning_rate": 1.999952991189574e-06, "loss": 1.4602, "step": 1837 }, { "epoch": 0.01, "grad_norm": 5.041136604255877, "learning_rate": 1.9999529397157847e-06, "loss": 1.4805, "step": 1838 }, { "epoch": 0.01, "grad_norm": 4.619671333702356, "learning_rate": 1.9999528882138298e-06, "loss": 1.4413, "step": 1839 }, { "epoch": 0.01, "grad_norm": 4.975416181507071, "learning_rate": 1.9999528366837097e-06, "loss": 1.395, "step": 1840 }, { "epoch": 0.01, "grad_norm": 5.04709900703668, "learning_rate": 1.9999527851254245e-06, "loss": 1.4027, "step": 1841 }, { "epoch": 0.01, "grad_norm": 5.834514593532968, "learning_rate": 1.9999527335389733e-06, "loss": 1.359, "step": 1842 }, { "epoch": 0.01, "grad_norm": 5.770095193729858, "learning_rate": 1.9999526819243574e-06, "loss": 1.5645, "step": 1843 }, { "epoch": 0.01, "grad_norm": 5.082331923840744, "learning_rate": 1.9999526302815756e-06, "loss": 1.436, "step": 1844 }, { "epoch": 0.01, "grad_norm": 4.516471050914241, "learning_rate": 1.999952578610629e-06, "loss": 1.338, "step": 1845 }, { "epoch": 0.01, "grad_norm": 4.952184920350597, "learning_rate": 1.999952526911517e-06, "loss": 1.4534, "step": 1846 }, { "epoch": 0.01, "grad_norm": 4.797375795198566, "learning_rate": 1.9999524751842394e-06, "loss": 1.3259, "step": 1847 }, { "epoch": 0.01, "grad_norm": 4.909335461688805, "learning_rate": 1.9999524234287963e-06, "loss": 1.464, "step": 1848 }, { "epoch": 0.01, "grad_norm": 4.7122725461586, "learning_rate": 1.9999523716451884e-06, "loss": 1.4756, "step": 1849 }, { "epoch": 0.01, "grad_norm": 4.836041938475466, "learning_rate": 1.9999523198334146e-06, "loss": 1.4444, "step": 1850 }, { "epoch": 0.01, "grad_norm": 4.42950126517274, "learning_rate": 1.999952267993476e-06, "loss": 1.3947, "step": 1851 }, { "epoch": 0.01, "grad_norm": 15.611894041325478, "learning_rate": 1.9999522161253717e-06, "loss": 1.5477, "step": 1852 }, { "epoch": 0.01, "grad_norm": 4.816487768685785, "learning_rate": 1.999952164229102e-06, "loss": 1.4487, "step": 1853 }, { "epoch": 0.01, "grad_norm": 4.705134898369493, "learning_rate": 1.9999521123046674e-06, "loss": 1.2334, "step": 1854 }, { "epoch": 0.01, "grad_norm": 4.9204906496645675, "learning_rate": 1.999952060352067e-06, "loss": 1.367, "step": 1855 }, { "epoch": 0.01, "grad_norm": 5.302614564464106, "learning_rate": 1.999952008371302e-06, "loss": 1.503, "step": 1856 }, { "epoch": 0.01, "grad_norm": 6.290262594106217, "learning_rate": 1.999951956362371e-06, "loss": 1.6158, "step": 1857 }, { "epoch": 0.01, "grad_norm": 4.832322054118164, "learning_rate": 1.999951904325275e-06, "loss": 1.3938, "step": 1858 }, { "epoch": 0.01, "grad_norm": 5.918339792868505, "learning_rate": 1.9999518522600134e-06, "loss": 1.5048, "step": 1859 }, { "epoch": 0.01, "grad_norm": 4.590992917749576, "learning_rate": 1.999951800166587e-06, "loss": 1.3696, "step": 1860 }, { "epoch": 0.01, "grad_norm": 6.614974294086556, "learning_rate": 1.9999517480449946e-06, "loss": 1.5505, "step": 1861 }, { "epoch": 0.01, "grad_norm": 4.7241251284973576, "learning_rate": 1.9999516958952373e-06, "loss": 1.3711, "step": 1862 }, { "epoch": 0.01, "grad_norm": 11.122789301908437, "learning_rate": 1.999951643717315e-06, "loss": 1.5607, "step": 1863 }, { "epoch": 0.01, "grad_norm": 4.849382872781319, "learning_rate": 1.9999515915112265e-06, "loss": 1.1324, "step": 1864 }, { "epoch": 0.01, "grad_norm": 4.873163835881678, "learning_rate": 1.9999515392769734e-06, "loss": 1.5033, "step": 1865 }, { "epoch": 0.01, "grad_norm": 5.173654117397512, "learning_rate": 1.9999514870145548e-06, "loss": 1.5117, "step": 1866 }, { "epoch": 0.01, "grad_norm": 4.642950123093243, "learning_rate": 1.9999514347239706e-06, "loss": 1.4977, "step": 1867 }, { "epoch": 0.01, "grad_norm": 6.846117886376132, "learning_rate": 1.9999513824052214e-06, "loss": 1.4254, "step": 1868 }, { "epoch": 0.01, "grad_norm": 5.101156060458363, "learning_rate": 1.999951330058307e-06, "loss": 1.5828, "step": 1869 }, { "epoch": 0.01, "grad_norm": 4.688953673802245, "learning_rate": 1.999951277683227e-06, "loss": 1.4109, "step": 1870 }, { "epoch": 0.01, "grad_norm": 4.703112399070017, "learning_rate": 1.999951225279982e-06, "loss": 1.4139, "step": 1871 }, { "epoch": 0.01, "grad_norm": 5.310901288329353, "learning_rate": 1.999951172848572e-06, "loss": 1.6931, "step": 1872 }, { "epoch": 0.01, "grad_norm": 5.741193705049953, "learning_rate": 1.9999511203889957e-06, "loss": 1.5699, "step": 1873 }, { "epoch": 0.01, "grad_norm": 4.6035588106756835, "learning_rate": 1.9999510679012545e-06, "loss": 1.4299, "step": 1874 }, { "epoch": 0.01, "grad_norm": 5.662657586420871, "learning_rate": 1.999951015385348e-06, "loss": 1.3702, "step": 1875 }, { "epoch": 0.01, "grad_norm": 4.8990643238694735, "learning_rate": 1.9999509628412766e-06, "loss": 1.4812, "step": 1876 }, { "epoch": 0.01, "grad_norm": 4.948600983425892, "learning_rate": 1.9999509102690396e-06, "loss": 1.4146, "step": 1877 }, { "epoch": 0.01, "grad_norm": 5.635028291248577, "learning_rate": 1.9999508576686375e-06, "loss": 1.4875, "step": 1878 }, { "epoch": 0.01, "grad_norm": 7.294273694280311, "learning_rate": 1.9999508050400702e-06, "loss": 1.332, "step": 1879 }, { "epoch": 0.01, "grad_norm": 5.184690906135398, "learning_rate": 1.999950752383337e-06, "loss": 1.3164, "step": 1880 }, { "epoch": 0.01, "grad_norm": 4.720673987223821, "learning_rate": 1.999950699698439e-06, "loss": 1.455, "step": 1881 }, { "epoch": 0.01, "grad_norm": 4.367468043629218, "learning_rate": 1.9999506469853757e-06, "loss": 1.3821, "step": 1882 }, { "epoch": 0.01, "grad_norm": 5.599591528503925, "learning_rate": 1.999950594244147e-06, "loss": 1.4663, "step": 1883 }, { "epoch": 0.01, "grad_norm": 4.428585648964182, "learning_rate": 1.999950541474753e-06, "loss": 1.3065, "step": 1884 }, { "epoch": 0.01, "grad_norm": 6.224684072029214, "learning_rate": 1.999950488677194e-06, "loss": 1.414, "step": 1885 }, { "epoch": 0.01, "grad_norm": 4.640404642843782, "learning_rate": 1.9999504358514695e-06, "loss": 1.4714, "step": 1886 }, { "epoch": 0.01, "grad_norm": 4.799354311682633, "learning_rate": 1.9999503829975796e-06, "loss": 1.3347, "step": 1887 }, { "epoch": 0.01, "grad_norm": 4.543001580495537, "learning_rate": 1.9999503301155246e-06, "loss": 1.3697, "step": 1888 }, { "epoch": 0.01, "grad_norm": 4.736298613103733, "learning_rate": 1.999950277205304e-06, "loss": 1.3371, "step": 1889 }, { "epoch": 0.01, "grad_norm": 4.571900592136959, "learning_rate": 1.999950224266919e-06, "loss": 1.385, "step": 1890 }, { "epoch": 0.01, "grad_norm": 6.969693151715899, "learning_rate": 1.9999501713003677e-06, "loss": 1.3239, "step": 1891 }, { "epoch": 0.01, "grad_norm": 4.655737378919456, "learning_rate": 1.999950118305652e-06, "loss": 1.5423, "step": 1892 }, { "epoch": 0.01, "grad_norm": 4.714447970923717, "learning_rate": 1.9999500652827704e-06, "loss": 1.5264, "step": 1893 }, { "epoch": 0.01, "grad_norm": 4.780305107625251, "learning_rate": 1.9999500122317235e-06, "loss": 1.3597, "step": 1894 }, { "epoch": 0.01, "grad_norm": 5.133517088409541, "learning_rate": 1.999949959152512e-06, "loss": 1.3169, "step": 1895 }, { "epoch": 0.01, "grad_norm": 4.924450244844968, "learning_rate": 1.9999499060451342e-06, "loss": 1.4864, "step": 1896 }, { "epoch": 0.01, "grad_norm": 6.275295688447123, "learning_rate": 1.9999498529095924e-06, "loss": 1.5597, "step": 1897 }, { "epoch": 0.01, "grad_norm": 4.7042297343124515, "learning_rate": 1.999949799745884e-06, "loss": 1.31, "step": 1898 }, { "epoch": 0.01, "eval_loss": 1.6416113376617432, "eval_runtime": 4.6259, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 1898 }, { "epoch": 0.01, "grad_norm": 6.423106928459506, "learning_rate": 1.9999497465540116e-06, "loss": 1.4495, "step": 1899 }, { "epoch": 0.01, "grad_norm": 5.0399225789912245, "learning_rate": 1.999949693333973e-06, "loss": 1.3536, "step": 1900 }, { "epoch": 0.01, "grad_norm": 4.928948443819752, "learning_rate": 1.9999496400857695e-06, "loss": 1.4797, "step": 1901 }, { "epoch": 0.01, "grad_norm": 4.67479846437279, "learning_rate": 1.9999495868094007e-06, "loss": 1.3, "step": 1902 }, { "epoch": 0.01, "grad_norm": 8.103095669421286, "learning_rate": 1.999949533504867e-06, "loss": 1.4622, "step": 1903 }, { "epoch": 0.01, "grad_norm": 5.134179165961643, "learning_rate": 1.9999494801721675e-06, "loss": 1.5459, "step": 1904 }, { "epoch": 0.01, "grad_norm": 5.025236385305749, "learning_rate": 1.999949426811303e-06, "loss": 1.5205, "step": 1905 }, { "epoch": 0.01, "grad_norm": 5.22119763423394, "learning_rate": 1.9999493734222734e-06, "loss": 1.3004, "step": 1906 }, { "epoch": 0.01, "grad_norm": 4.669191447439257, "learning_rate": 1.9999493200050783e-06, "loss": 1.3201, "step": 1907 }, { "epoch": 0.01, "grad_norm": 4.968754828299794, "learning_rate": 1.999949266559718e-06, "loss": 1.4346, "step": 1908 }, { "epoch": 0.01, "grad_norm": 4.487597166469777, "learning_rate": 1.9999492130861926e-06, "loss": 1.4025, "step": 1909 }, { "epoch": 0.01, "grad_norm": 4.879843563485657, "learning_rate": 1.9999491595845017e-06, "loss": 1.3078, "step": 1910 }, { "epoch": 0.01, "grad_norm": 4.910078993452133, "learning_rate": 1.9999491060546456e-06, "loss": 1.5774, "step": 1911 }, { "epoch": 0.01, "grad_norm": 4.647585620113467, "learning_rate": 1.9999490524966245e-06, "loss": 1.5707, "step": 1912 }, { "epoch": 0.01, "grad_norm": 4.567026839443758, "learning_rate": 1.999948998910438e-06, "loss": 1.3951, "step": 1913 }, { "epoch": 0.01, "grad_norm": 4.687048241254901, "learning_rate": 1.9999489452960864e-06, "loss": 1.4327, "step": 1914 }, { "epoch": 0.01, "grad_norm": 4.902103912285808, "learning_rate": 1.9999488916535695e-06, "loss": 1.477, "step": 1915 }, { "epoch": 0.01, "grad_norm": 4.57359910064146, "learning_rate": 1.9999488379828874e-06, "loss": 1.4144, "step": 1916 }, { "epoch": 0.01, "grad_norm": 5.105713137074651, "learning_rate": 1.99994878428404e-06, "loss": 1.4075, "step": 1917 }, { "epoch": 0.01, "grad_norm": 7.001514955863734, "learning_rate": 1.999948730557027e-06, "loss": 1.5653, "step": 1918 }, { "epoch": 0.01, "grad_norm": 5.65944563546782, "learning_rate": 1.9999486768018493e-06, "loss": 1.162, "step": 1919 }, { "epoch": 0.01, "grad_norm": 6.019156415872438, "learning_rate": 1.999948623018506e-06, "loss": 1.6276, "step": 1920 }, { "epoch": 0.01, "grad_norm": 4.677363975537384, "learning_rate": 1.999948569206998e-06, "loss": 1.4125, "step": 1921 }, { "epoch": 0.01, "grad_norm": 5.929950546593268, "learning_rate": 1.999948515367324e-06, "loss": 1.6226, "step": 1922 }, { "epoch": 0.01, "grad_norm": 4.6654710600977, "learning_rate": 1.9999484614994856e-06, "loss": 1.3902, "step": 1923 }, { "epoch": 0.01, "grad_norm": 5.450030912502321, "learning_rate": 1.9999484076034814e-06, "loss": 1.4725, "step": 1924 }, { "epoch": 0.01, "grad_norm": 4.994408282983105, "learning_rate": 1.9999483536793124e-06, "loss": 1.5852, "step": 1925 }, { "epoch": 0.01, "grad_norm": 4.714476017763944, "learning_rate": 1.999948299726978e-06, "loss": 1.4207, "step": 1926 }, { "epoch": 0.01, "grad_norm": 4.767073689418852, "learning_rate": 1.999948245746478e-06, "loss": 1.5189, "step": 1927 }, { "epoch": 0.01, "grad_norm": 4.392481238520719, "learning_rate": 1.9999481917378133e-06, "loss": 1.2504, "step": 1928 }, { "epoch": 0.01, "grad_norm": 4.354120579996221, "learning_rate": 1.9999481377009834e-06, "loss": 1.3287, "step": 1929 }, { "epoch": 0.01, "grad_norm": 4.643549574305054, "learning_rate": 1.999948083635988e-06, "loss": 1.4252, "step": 1930 }, { "epoch": 0.01, "grad_norm": 4.6243115760185605, "learning_rate": 1.9999480295428276e-06, "loss": 1.4642, "step": 1931 }, { "epoch": 0.01, "grad_norm": 5.261417138349398, "learning_rate": 1.9999479754215016e-06, "loss": 1.4865, "step": 1932 }, { "epoch": 0.01, "grad_norm": 4.618981173105002, "learning_rate": 1.999947921272011e-06, "loss": 1.495, "step": 1933 }, { "epoch": 0.01, "grad_norm": 4.958757287653689, "learning_rate": 1.9999478670943546e-06, "loss": 1.4359, "step": 1934 }, { "epoch": 0.01, "grad_norm": 4.930067872156233, "learning_rate": 1.9999478128885332e-06, "loss": 1.4284, "step": 1935 }, { "epoch": 0.01, "grad_norm": 4.470181062731635, "learning_rate": 1.9999477586545468e-06, "loss": 1.2989, "step": 1936 }, { "epoch": 0.01, "grad_norm": 5.127129338294685, "learning_rate": 1.999947704392395e-06, "loss": 1.443, "step": 1937 }, { "epoch": 0.01, "grad_norm": 5.127632526299122, "learning_rate": 1.999947650102078e-06, "loss": 1.3732, "step": 1938 }, { "epoch": 0.01, "grad_norm": 5.166443772114098, "learning_rate": 1.9999475957835958e-06, "loss": 1.4847, "step": 1939 }, { "epoch": 0.01, "grad_norm": 5.096431897023338, "learning_rate": 1.999947541436949e-06, "loss": 1.4193, "step": 1940 }, { "epoch": 0.01, "grad_norm": 4.836191132604963, "learning_rate": 1.999947487062136e-06, "loss": 1.4965, "step": 1941 }, { "epoch": 0.01, "grad_norm": 4.925828422364088, "learning_rate": 1.9999474326591583e-06, "loss": 1.4444, "step": 1942 }, { "epoch": 0.01, "grad_norm": 4.667390491716781, "learning_rate": 1.999947378228015e-06, "loss": 1.5763, "step": 1943 }, { "epoch": 0.01, "grad_norm": 5.279038785770578, "learning_rate": 1.9999473237687073e-06, "loss": 1.5275, "step": 1944 }, { "epoch": 0.01, "grad_norm": 5.641029202907494, "learning_rate": 1.999947269281234e-06, "loss": 1.5775, "step": 1945 }, { "epoch": 0.01, "grad_norm": 5.332390674887097, "learning_rate": 1.9999472147655955e-06, "loss": 1.5664, "step": 1946 }, { "epoch": 0.01, "grad_norm": 5.622675952825197, "learning_rate": 1.999947160221792e-06, "loss": 1.4099, "step": 1947 }, { "epoch": 0.01, "grad_norm": 4.613192639788906, "learning_rate": 1.9999471056498227e-06, "loss": 1.4579, "step": 1948 }, { "epoch": 0.01, "grad_norm": 4.879403704365523, "learning_rate": 1.999947051049689e-06, "loss": 1.4041, "step": 1949 }, { "epoch": 0.01, "grad_norm": 4.482648258267666, "learning_rate": 1.9999469964213895e-06, "loss": 1.317, "step": 1950 }, { "epoch": 0.01, "grad_norm": 5.108898563911747, "learning_rate": 1.999946941764925e-06, "loss": 1.4241, "step": 1951 }, { "epoch": 0.01, "grad_norm": 5.568831706403162, "learning_rate": 1.9999468870802954e-06, "loss": 1.5296, "step": 1952 }, { "epoch": 0.01, "grad_norm": 4.786414340270627, "learning_rate": 1.9999468323675007e-06, "loss": 1.5818, "step": 1953 }, { "epoch": 0.01, "grad_norm": 4.78919679188355, "learning_rate": 1.999946777626541e-06, "loss": 1.2252, "step": 1954 }, { "epoch": 0.01, "grad_norm": 4.524987858157338, "learning_rate": 1.9999467228574154e-06, "loss": 1.4747, "step": 1955 }, { "epoch": 0.01, "grad_norm": 4.948724773134819, "learning_rate": 1.9999466680601254e-06, "loss": 1.3693, "step": 1956 }, { "epoch": 0.01, "grad_norm": 4.498995965734504, "learning_rate": 1.9999466132346697e-06, "loss": 1.3926, "step": 1957 }, { "epoch": 0.01, "grad_norm": 6.690852300877899, "learning_rate": 1.9999465583810494e-06, "loss": 1.5302, "step": 1958 }, { "epoch": 0.01, "grad_norm": 4.628604279451408, "learning_rate": 1.9999465034992636e-06, "loss": 1.406, "step": 1959 }, { "epoch": 0.01, "grad_norm": 5.079437190752291, "learning_rate": 1.999946448589312e-06, "loss": 1.5523, "step": 1960 }, { "epoch": 0.01, "grad_norm": 4.966397313206571, "learning_rate": 1.9999463936511966e-06, "loss": 1.3872, "step": 1961 }, { "epoch": 0.01, "grad_norm": 4.652429476663953, "learning_rate": 1.999946338684915e-06, "loss": 1.4646, "step": 1962 }, { "epoch": 0.01, "grad_norm": 4.87657353554175, "learning_rate": 1.999946283690468e-06, "loss": 1.392, "step": 1963 }, { "epoch": 0.01, "grad_norm": 4.917643825546739, "learning_rate": 1.999946228667857e-06, "loss": 1.3851, "step": 1964 }, { "epoch": 0.01, "grad_norm": 5.67340721917725, "learning_rate": 1.9999461736170802e-06, "loss": 1.5101, "step": 1965 }, { "epoch": 0.01, "grad_norm": 4.39919847434518, "learning_rate": 1.999946118538138e-06, "loss": 1.3595, "step": 1966 }, { "epoch": 0.01, "grad_norm": 5.203293308343834, "learning_rate": 1.999946063431031e-06, "loss": 1.4727, "step": 1967 }, { "epoch": 0.01, "grad_norm": 4.678751954911345, "learning_rate": 1.9999460082957586e-06, "loss": 1.3947, "step": 1968 }, { "epoch": 0.01, "grad_norm": 4.569173814364991, "learning_rate": 1.999945953132321e-06, "loss": 1.4007, "step": 1969 }, { "epoch": 0.01, "grad_norm": 5.204406855731333, "learning_rate": 1.999945897940719e-06, "loss": 1.3463, "step": 1970 }, { "epoch": 0.01, "grad_norm": 6.853776529480603, "learning_rate": 1.999945842720951e-06, "loss": 1.6373, "step": 1971 }, { "epoch": 0.01, "eval_loss": 1.6432719230651855, "eval_runtime": 4.6393, "eval_samples_per_second": 1.94, "eval_steps_per_second": 1.078, "step": 1971 }, { "epoch": 0.01, "grad_norm": 4.975365230474304, "learning_rate": 1.9999457874730182e-06, "loss": 1.4259, "step": 1972 }, { "epoch": 0.01, "grad_norm": 5.106259928022218, "learning_rate": 1.9999457321969203e-06, "loss": 1.531, "step": 1973 }, { "epoch": 0.01, "grad_norm": 4.805401584156609, "learning_rate": 1.999945676892657e-06, "loss": 1.4088, "step": 1974 }, { "epoch": 0.01, "grad_norm": 6.312695783521927, "learning_rate": 1.9999456215602288e-06, "loss": 1.572, "step": 1975 }, { "epoch": 0.01, "grad_norm": 8.74462546951479, "learning_rate": 1.9999455661996355e-06, "loss": 1.2957, "step": 1976 }, { "epoch": 0.01, "grad_norm": 4.625243006738303, "learning_rate": 1.9999455108108767e-06, "loss": 1.4034, "step": 1977 }, { "epoch": 0.01, "grad_norm": 5.178605492022322, "learning_rate": 1.999945455393953e-06, "loss": 1.5405, "step": 1978 }, { "epoch": 0.01, "grad_norm": 4.655718915504933, "learning_rate": 1.999945399948864e-06, "loss": 1.5224, "step": 1979 }, { "epoch": 0.01, "grad_norm": 4.65830388317099, "learning_rate": 1.99994534447561e-06, "loss": 1.5357, "step": 1980 }, { "epoch": 0.01, "grad_norm": 5.288686031793038, "learning_rate": 1.999945288974191e-06, "loss": 1.5392, "step": 1981 }, { "epoch": 0.01, "grad_norm": 4.43642936625311, "learning_rate": 1.999945233444607e-06, "loss": 1.488, "step": 1982 }, { "epoch": 0.01, "grad_norm": 4.555178280796159, "learning_rate": 1.9999451778868575e-06, "loss": 1.3151, "step": 1983 }, { "epoch": 0.01, "grad_norm": 5.236202376421231, "learning_rate": 1.999945122300943e-06, "loss": 1.4033, "step": 1984 }, { "epoch": 0.01, "grad_norm": 4.887741373566786, "learning_rate": 1.999945066686863e-06, "loss": 1.506, "step": 1985 }, { "epoch": 0.01, "grad_norm": 4.9138979134608975, "learning_rate": 1.9999450110446183e-06, "loss": 1.4881, "step": 1986 }, { "epoch": 0.01, "grad_norm": 4.542007061012194, "learning_rate": 1.9999449553742084e-06, "loss": 1.4365, "step": 1987 }, { "epoch": 0.01, "grad_norm": 5.528814673826827, "learning_rate": 1.9999448996756337e-06, "loss": 1.4825, "step": 1988 }, { "epoch": 0.01, "grad_norm": 4.4557691220675375, "learning_rate": 1.9999448439488936e-06, "loss": 1.4531, "step": 1989 }, { "epoch": 0.01, "grad_norm": 4.758397625075794, "learning_rate": 1.999944788193988e-06, "loss": 1.4119, "step": 1990 }, { "epoch": 0.01, "grad_norm": 4.583454128627948, "learning_rate": 1.999944732410918e-06, "loss": 1.4116, "step": 1991 }, { "epoch": 0.01, "grad_norm": 4.532783999892837, "learning_rate": 1.999944676599682e-06, "loss": 1.2279, "step": 1992 }, { "epoch": 0.01, "grad_norm": 4.906450343559313, "learning_rate": 1.9999446207602813e-06, "loss": 1.4905, "step": 1993 }, { "epoch": 0.01, "grad_norm": 4.346899749377525, "learning_rate": 1.999944564892716e-06, "loss": 1.413, "step": 1994 }, { "epoch": 0.01, "grad_norm": 5.523495511164613, "learning_rate": 1.999944508996985e-06, "loss": 1.5664, "step": 1995 }, { "epoch": 0.01, "grad_norm": 4.959351356739527, "learning_rate": 1.999944453073089e-06, "loss": 1.3644, "step": 1996 }, { "epoch": 0.01, "grad_norm": 4.871410142653787, "learning_rate": 1.999944397121028e-06, "loss": 1.396, "step": 1997 }, { "epoch": 0.01, "grad_norm": 4.76408338524292, "learning_rate": 1.9999443411408018e-06, "loss": 1.4451, "step": 1998 }, { "epoch": 0.01, "grad_norm": 4.498293300971013, "learning_rate": 1.9999442851324104e-06, "loss": 1.397, "step": 1999 }, { "epoch": 0.01, "grad_norm": 4.9463867469913, "learning_rate": 1.999944229095854e-06, "loss": 1.459, "step": 2000 }, { "epoch": 0.01, "grad_norm": 4.4931004236648775, "learning_rate": 1.9999441730311324e-06, "loss": 1.4252, "step": 2001 }, { "epoch": 0.01, "grad_norm": 4.718233309528229, "learning_rate": 1.999944116938246e-06, "loss": 1.4942, "step": 2002 }, { "epoch": 0.01, "grad_norm": 5.889040911882571, "learning_rate": 1.9999440608171944e-06, "loss": 1.5412, "step": 2003 }, { "epoch": 0.01, "grad_norm": 5.014283756243838, "learning_rate": 1.9999440046679775e-06, "loss": 1.5232, "step": 2004 }, { "epoch": 0.01, "grad_norm": 5.395377515573, "learning_rate": 1.999943948490596e-06, "loss": 1.3687, "step": 2005 }, { "epoch": 0.01, "grad_norm": 5.1324825308243005, "learning_rate": 1.9999438922850487e-06, "loss": 1.459, "step": 2006 }, { "epoch": 0.01, "grad_norm": 5.302528839998102, "learning_rate": 1.9999438360513364e-06, "loss": 1.1898, "step": 2007 }, { "epoch": 0.01, "grad_norm": 4.921387927464629, "learning_rate": 1.9999437797894595e-06, "loss": 1.4922, "step": 2008 }, { "epoch": 0.01, "grad_norm": 4.66789838005482, "learning_rate": 1.9999437234994174e-06, "loss": 1.2908, "step": 2009 }, { "epoch": 0.01, "grad_norm": 4.466483592815993, "learning_rate": 1.99994366718121e-06, "loss": 1.3559, "step": 2010 }, { "epoch": 0.01, "grad_norm": 4.989407567640141, "learning_rate": 1.9999436108348375e-06, "loss": 1.3186, "step": 2011 }, { "epoch": 0.01, "grad_norm": 4.7584380055697135, "learning_rate": 1.9999435544602996e-06, "loss": 1.481, "step": 2012 }, { "epoch": 0.01, "grad_norm": 8.410113901804026, "learning_rate": 1.999943498057597e-06, "loss": 1.4516, "step": 2013 }, { "epoch": 0.01, "grad_norm": 5.145181164985424, "learning_rate": 1.99994344162673e-06, "loss": 1.2735, "step": 2014 }, { "epoch": 0.01, "grad_norm": 4.997375983666718, "learning_rate": 1.9999433851676967e-06, "loss": 1.5026, "step": 2015 }, { "epoch": 0.01, "grad_norm": 4.501900393004941, "learning_rate": 1.9999433286804992e-06, "loss": 1.5178, "step": 2016 }, { "epoch": 0.01, "grad_norm": 6.183628468382797, "learning_rate": 1.9999432721651362e-06, "loss": 1.5511, "step": 2017 }, { "epoch": 0.01, "grad_norm": 5.615198435575112, "learning_rate": 1.999943215621608e-06, "loss": 1.4319, "step": 2018 }, { "epoch": 0.01, "grad_norm": 4.91098537314749, "learning_rate": 1.999943159049915e-06, "loss": 1.5055, "step": 2019 }, { "epoch": 0.01, "grad_norm": 5.533623257731385, "learning_rate": 1.999943102450057e-06, "loss": 1.3607, "step": 2020 }, { "epoch": 0.01, "grad_norm": 4.823264533879004, "learning_rate": 1.9999430458220335e-06, "loss": 1.3773, "step": 2021 }, { "epoch": 0.01, "grad_norm": 4.610357246741694, "learning_rate": 1.9999429891658453e-06, "loss": 1.4646, "step": 2022 }, { "epoch": 0.01, "grad_norm": 6.279580425295216, "learning_rate": 1.999942932481492e-06, "loss": 1.3543, "step": 2023 }, { "epoch": 0.01, "grad_norm": 4.920902713497585, "learning_rate": 1.9999428757689737e-06, "loss": 1.4141, "step": 2024 }, { "epoch": 0.01, "grad_norm": 5.301852036011902, "learning_rate": 1.99994281902829e-06, "loss": 1.5213, "step": 2025 }, { "epoch": 0.01, "grad_norm": 5.135724505311549, "learning_rate": 1.9999427622594415e-06, "loss": 1.5948, "step": 2026 }, { "epoch": 0.01, "grad_norm": 4.852010935690981, "learning_rate": 1.9999427054624282e-06, "loss": 1.4197, "step": 2027 }, { "epoch": 0.01, "grad_norm": 4.9165669204888625, "learning_rate": 1.9999426486372494e-06, "loss": 1.5093, "step": 2028 }, { "epoch": 0.01, "grad_norm": 4.575212660119817, "learning_rate": 1.999942591783906e-06, "loss": 1.3396, "step": 2029 }, { "epoch": 0.01, "grad_norm": 5.259797384037423, "learning_rate": 1.9999425349023967e-06, "loss": 1.4153, "step": 2030 }, { "epoch": 0.01, "grad_norm": 5.548169399634219, "learning_rate": 1.999942477992723e-06, "loss": 1.3866, "step": 2031 }, { "epoch": 0.01, "grad_norm": 5.282143331135165, "learning_rate": 1.999942421054884e-06, "loss": 1.4014, "step": 2032 }, { "epoch": 0.01, "grad_norm": 4.574333910357711, "learning_rate": 1.99994236408888e-06, "loss": 1.3933, "step": 2033 }, { "epoch": 0.01, "grad_norm": 6.629247995921187, "learning_rate": 1.9999423070947114e-06, "loss": 1.5145, "step": 2034 }, { "epoch": 0.01, "grad_norm": 4.71772592571776, "learning_rate": 1.999942250072377e-06, "loss": 1.353, "step": 2035 }, { "epoch": 0.01, "grad_norm": 8.908977840581366, "learning_rate": 1.999942193021878e-06, "loss": 1.5628, "step": 2036 }, { "epoch": 0.01, "grad_norm": 6.021790328358759, "learning_rate": 1.9999421359432137e-06, "loss": 1.4148, "step": 2037 }, { "epoch": 0.01, "grad_norm": 5.176490897391262, "learning_rate": 1.999942078836385e-06, "loss": 1.586, "step": 2038 }, { "epoch": 0.01, "grad_norm": 4.99627582571255, "learning_rate": 1.9999420217013907e-06, "loss": 1.5591, "step": 2039 }, { "epoch": 0.01, "grad_norm": 5.02431904656082, "learning_rate": 1.9999419645382313e-06, "loss": 1.4402, "step": 2040 }, { "epoch": 0.01, "grad_norm": 4.713468056290315, "learning_rate": 1.999941907346907e-06, "loss": 1.339, "step": 2041 }, { "epoch": 0.01, "grad_norm": 5.7923489185700126, "learning_rate": 1.9999418501274176e-06, "loss": 1.4056, "step": 2042 }, { "epoch": 0.01, "grad_norm": 4.776165026219362, "learning_rate": 1.9999417928797632e-06, "loss": 1.5542, "step": 2043 }, { "epoch": 0.01, "grad_norm": 4.990681472067374, "learning_rate": 1.999941735603944e-06, "loss": 1.3998, "step": 2044 }, { "epoch": 0.01, "eval_loss": 1.6361159086227417, "eval_runtime": 4.6533, "eval_samples_per_second": 1.934, "eval_steps_per_second": 1.075, "step": 2044 }, { "epoch": 0.01, "grad_norm": 4.903134049101306, "learning_rate": 1.9999416782999592e-06, "loss": 1.4561, "step": 2045 }, { "epoch": 0.01, "grad_norm": 5.5369642093418285, "learning_rate": 1.99994162096781e-06, "loss": 1.4306, "step": 2046 }, { "epoch": 0.01, "grad_norm": 4.674123365349024, "learning_rate": 1.9999415636074956e-06, "loss": 1.397, "step": 2047 }, { "epoch": 0.01, "grad_norm": 5.020382697207387, "learning_rate": 1.9999415062190157e-06, "loss": 1.3635, "step": 2048 }, { "epoch": 0.01, "grad_norm": 5.402239102827217, "learning_rate": 1.9999414488023716e-06, "loss": 1.3843, "step": 2049 }, { "epoch": 0.01, "grad_norm": 5.367641687630019, "learning_rate": 1.999941391357562e-06, "loss": 1.4433, "step": 2050 }, { "epoch": 0.01, "grad_norm": 4.567561738037097, "learning_rate": 1.999941333884587e-06, "loss": 1.4309, "step": 2051 }, { "epoch": 0.01, "grad_norm": 5.525282756054969, "learning_rate": 1.9999412763834475e-06, "loss": 1.3293, "step": 2052 }, { "epoch": 0.01, "grad_norm": 5.908627704395454, "learning_rate": 1.999941218854143e-06, "loss": 1.3566, "step": 2053 }, { "epoch": 0.01, "grad_norm": 7.432919107784135, "learning_rate": 1.999941161296673e-06, "loss": 1.2394, "step": 2054 }, { "epoch": 0.01, "grad_norm": 4.778341393380081, "learning_rate": 1.9999411037110387e-06, "loss": 1.3844, "step": 2055 }, { "epoch": 0.01, "grad_norm": 4.954440841765967, "learning_rate": 1.9999410460972387e-06, "loss": 1.4503, "step": 2056 }, { "epoch": 0.01, "grad_norm": 4.758941261308998, "learning_rate": 1.999940988455274e-06, "loss": 1.4572, "step": 2057 }, { "epoch": 0.01, "grad_norm": 6.564392211755666, "learning_rate": 1.9999409307851446e-06, "loss": 1.1965, "step": 2058 }, { "epoch": 0.01, "grad_norm": 6.725945941216895, "learning_rate": 1.9999408730868497e-06, "loss": 1.3813, "step": 2059 }, { "epoch": 0.01, "grad_norm": 6.292635293917362, "learning_rate": 1.9999408153603897e-06, "loss": 1.4697, "step": 2060 }, { "epoch": 0.01, "grad_norm": 10.371826291437516, "learning_rate": 1.9999407576057654e-06, "loss": 1.4433, "step": 2061 }, { "epoch": 0.01, "grad_norm": 4.624824451917616, "learning_rate": 1.9999406998229756e-06, "loss": 1.5372, "step": 2062 }, { "epoch": 0.01, "grad_norm": 4.828294055435367, "learning_rate": 1.9999406420120206e-06, "loss": 1.4827, "step": 2063 }, { "epoch": 0.01, "grad_norm": 4.927358950382133, "learning_rate": 1.999940584172901e-06, "loss": 1.4385, "step": 2064 }, { "epoch": 0.01, "grad_norm": 5.4353174579637775, "learning_rate": 1.9999405263056162e-06, "loss": 1.4795, "step": 2065 }, { "epoch": 0.01, "grad_norm": 4.675940989803078, "learning_rate": 1.9999404684101663e-06, "loss": 1.4257, "step": 2066 }, { "epoch": 0.01, "grad_norm": 6.0245884345574225, "learning_rate": 1.9999404104865518e-06, "loss": 1.2672, "step": 2067 }, { "epoch": 0.01, "grad_norm": 6.1119365663762055, "learning_rate": 1.999940352534772e-06, "loss": 1.251, "step": 2068 }, { "epoch": 0.01, "grad_norm": 5.0121477273349315, "learning_rate": 1.9999402945548273e-06, "loss": 1.426, "step": 2069 }, { "epoch": 0.01, "grad_norm": 5.128014707129984, "learning_rate": 1.9999402365467174e-06, "loss": 1.3379, "step": 2070 }, { "epoch": 0.01, "grad_norm": 4.862057104969277, "learning_rate": 1.9999401785104428e-06, "loss": 1.424, "step": 2071 }, { "epoch": 0.01, "grad_norm": 4.873753921801018, "learning_rate": 1.999940120446003e-06, "loss": 1.3945, "step": 2072 }, { "epoch": 0.01, "grad_norm": 5.1012581647028945, "learning_rate": 1.999940062353398e-06, "loss": 1.5296, "step": 2073 }, { "epoch": 0.01, "grad_norm": 4.6157521798841055, "learning_rate": 1.9999400042326282e-06, "loss": 1.2629, "step": 2074 }, { "epoch": 0.01, "grad_norm": 4.803622065399791, "learning_rate": 1.999939946083694e-06, "loss": 1.408, "step": 2075 }, { "epoch": 0.01, "grad_norm": 4.740381722750265, "learning_rate": 1.9999398879065943e-06, "loss": 1.3605, "step": 2076 }, { "epoch": 0.01, "grad_norm": 4.831139523992052, "learning_rate": 1.99993982970133e-06, "loss": 1.2995, "step": 2077 }, { "epoch": 0.01, "grad_norm": 4.829459069984273, "learning_rate": 1.9999397714679002e-06, "loss": 1.3909, "step": 2078 }, { "epoch": 0.01, "grad_norm": 4.827491011717043, "learning_rate": 1.9999397132063055e-06, "loss": 1.559, "step": 2079 }, { "epoch": 0.01, "grad_norm": 4.947759137746265, "learning_rate": 1.9999396549165457e-06, "loss": 1.4971, "step": 2080 }, { "epoch": 0.01, "grad_norm": 4.739533384237727, "learning_rate": 1.9999395965986212e-06, "loss": 1.3768, "step": 2081 }, { "epoch": 0.01, "grad_norm": 7.339254038218756, "learning_rate": 1.9999395382525316e-06, "loss": 1.4479, "step": 2082 }, { "epoch": 0.01, "grad_norm": 4.537488781200089, "learning_rate": 1.9999394798782773e-06, "loss": 1.3548, "step": 2083 }, { "epoch": 0.01, "grad_norm": 4.7315607432035005, "learning_rate": 1.999939421475858e-06, "loss": 1.3923, "step": 2084 }, { "epoch": 0.01, "grad_norm": 5.154239758619694, "learning_rate": 1.9999393630452733e-06, "loss": 1.3989, "step": 2085 }, { "epoch": 0.01, "grad_norm": 4.535664302600297, "learning_rate": 1.999939304586524e-06, "loss": 1.3719, "step": 2086 }, { "epoch": 0.01, "grad_norm": 4.655332915728781, "learning_rate": 1.9999392460996098e-06, "loss": 1.3278, "step": 2087 }, { "epoch": 0.01, "grad_norm": 4.806855739816848, "learning_rate": 1.9999391875845303e-06, "loss": 1.5083, "step": 2088 }, { "epoch": 0.01, "grad_norm": 5.213130734185716, "learning_rate": 1.999939129041286e-06, "loss": 1.4663, "step": 2089 }, { "epoch": 0.01, "grad_norm": 6.229568233154151, "learning_rate": 1.999939070469877e-06, "loss": 1.3439, "step": 2090 }, { "epoch": 0.01, "grad_norm": 4.086635517733998, "learning_rate": 1.999939011870303e-06, "loss": 1.2555, "step": 2091 }, { "epoch": 0.01, "grad_norm": 4.727112810450526, "learning_rate": 1.999938953242564e-06, "loss": 1.4361, "step": 2092 }, { "epoch": 0.01, "grad_norm": 5.052258554851617, "learning_rate": 1.9999388945866596e-06, "loss": 1.3848, "step": 2093 }, { "epoch": 0.01, "grad_norm": 4.88161487935287, "learning_rate": 1.9999388359025903e-06, "loss": 1.4832, "step": 2094 }, { "epoch": 0.01, "grad_norm": 4.6739895014787685, "learning_rate": 1.9999387771903563e-06, "loss": 1.478, "step": 2095 }, { "epoch": 0.01, "grad_norm": 4.728323614320543, "learning_rate": 1.9999387184499576e-06, "loss": 1.5456, "step": 2096 }, { "epoch": 0.01, "grad_norm": 4.765256915622586, "learning_rate": 1.999938659681394e-06, "loss": 1.407, "step": 2097 }, { "epoch": 0.01, "grad_norm": 4.431982504309138, "learning_rate": 1.999938600884665e-06, "loss": 1.2764, "step": 2098 }, { "epoch": 0.01, "grad_norm": 4.902178089633812, "learning_rate": 1.999938542059771e-06, "loss": 1.5341, "step": 2099 }, { "epoch": 0.01, "grad_norm": 4.510226095063566, "learning_rate": 1.999938483206712e-06, "loss": 1.3526, "step": 2100 }, { "epoch": 0.01, "grad_norm": 5.074107216841095, "learning_rate": 1.9999384243254887e-06, "loss": 1.3247, "step": 2101 }, { "epoch": 0.01, "grad_norm": 4.667756343723743, "learning_rate": 1.9999383654160997e-06, "loss": 1.4049, "step": 2102 }, { "epoch": 0.01, "grad_norm": 4.752073321972883, "learning_rate": 1.999938306478546e-06, "loss": 1.4703, "step": 2103 }, { "epoch": 0.01, "grad_norm": 4.742868907788746, "learning_rate": 1.9999382475128277e-06, "loss": 1.2977, "step": 2104 }, { "epoch": 0.01, "grad_norm": 4.886382117755776, "learning_rate": 1.999938188518944e-06, "loss": 1.4292, "step": 2105 }, { "epoch": 0.01, "grad_norm": 4.781559805396283, "learning_rate": 1.999938129496896e-06, "loss": 1.3088, "step": 2106 }, { "epoch": 0.01, "grad_norm": 4.593766296713237, "learning_rate": 1.9999380704466828e-06, "loss": 1.3449, "step": 2107 }, { "epoch": 0.01, "grad_norm": 5.524007597424321, "learning_rate": 1.9999380113683048e-06, "loss": 1.4613, "step": 2108 }, { "epoch": 0.01, "grad_norm": 4.622852323760914, "learning_rate": 1.9999379522617613e-06, "loss": 1.4825, "step": 2109 }, { "epoch": 0.01, "grad_norm": 5.616671774319379, "learning_rate": 1.999937893127053e-06, "loss": 1.5888, "step": 2110 }, { "epoch": 0.01, "grad_norm": 4.481567608026818, "learning_rate": 1.99993783396418e-06, "loss": 1.4353, "step": 2111 }, { "epoch": 0.01, "grad_norm": 4.602477850579874, "learning_rate": 1.9999377747731426e-06, "loss": 1.3397, "step": 2112 }, { "epoch": 0.01, "grad_norm": 4.81449808655875, "learning_rate": 1.9999377155539395e-06, "loss": 1.4046, "step": 2113 }, { "epoch": 0.01, "grad_norm": 5.073953878891228, "learning_rate": 1.9999376563065716e-06, "loss": 1.4595, "step": 2114 }, { "epoch": 0.01, "grad_norm": 6.743457594007675, "learning_rate": 1.999937597031039e-06, "loss": 1.3975, "step": 2115 }, { "epoch": 0.01, "grad_norm": 4.530772105704062, "learning_rate": 1.9999375377273415e-06, "loss": 1.5028, "step": 2116 }, { "epoch": 0.01, "grad_norm": 5.257434903260648, "learning_rate": 1.999937478395479e-06, "loss": 1.4187, "step": 2117 }, { "epoch": 0.01, "eval_loss": 1.6335363388061523, "eval_runtime": 4.6337, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 2117 }, { "epoch": 0.01, "grad_norm": 4.722957307355561, "learning_rate": 1.9999374190354517e-06, "loss": 1.4206, "step": 2118 }, { "epoch": 0.01, "grad_norm": 4.473561914847432, "learning_rate": 1.999937359647259e-06, "loss": 1.3537, "step": 2119 }, { "epoch": 0.01, "grad_norm": 4.580086304567218, "learning_rate": 1.999937300230902e-06, "loss": 1.3944, "step": 2120 }, { "epoch": 0.01, "grad_norm": 4.245944086564923, "learning_rate": 1.9999372407863796e-06, "loss": 1.4074, "step": 2121 }, { "epoch": 0.01, "grad_norm": 4.676469646761249, "learning_rate": 1.999937181313693e-06, "loss": 1.4051, "step": 2122 }, { "epoch": 0.01, "grad_norm": 4.922681086761417, "learning_rate": 1.9999371218128408e-06, "loss": 1.4609, "step": 2123 }, { "epoch": 0.01, "grad_norm": 4.654470497689943, "learning_rate": 1.999937062283824e-06, "loss": 1.3944, "step": 2124 }, { "epoch": 0.01, "grad_norm": 4.961121386583948, "learning_rate": 1.9999370027266423e-06, "loss": 1.3877, "step": 2125 }, { "epoch": 0.01, "grad_norm": 5.590317374681009, "learning_rate": 1.9999369431412957e-06, "loss": 1.3661, "step": 2126 }, { "epoch": 0.01, "grad_norm": 5.530242304003091, "learning_rate": 1.9999368835277843e-06, "loss": 1.4709, "step": 2127 }, { "epoch": 0.01, "grad_norm": 4.619061402439089, "learning_rate": 1.999936823886108e-06, "loss": 1.3906, "step": 2128 }, { "epoch": 0.01, "grad_norm": 5.012913828479663, "learning_rate": 1.9999367642162662e-06, "loss": 1.5088, "step": 2129 }, { "epoch": 0.01, "grad_norm": 4.979952662806363, "learning_rate": 1.99993670451826e-06, "loss": 1.3039, "step": 2130 }, { "epoch": 0.01, "grad_norm": 4.5714181931648445, "learning_rate": 1.999936644792089e-06, "loss": 1.3549, "step": 2131 }, { "epoch": 0.01, "grad_norm": 4.770544347901491, "learning_rate": 1.999936585037753e-06, "loss": 1.3643, "step": 2132 }, { "epoch": 0.01, "grad_norm": 5.401807178237035, "learning_rate": 1.999936525255252e-06, "loss": 1.2191, "step": 2133 }, { "epoch": 0.01, "grad_norm": 5.894088878558767, "learning_rate": 1.999936465444586e-06, "loss": 1.5738, "step": 2134 }, { "epoch": 0.01, "grad_norm": 5.829142286770646, "learning_rate": 1.9999364056057555e-06, "loss": 1.5857, "step": 2135 }, { "epoch": 0.01, "grad_norm": 4.585395214505738, "learning_rate": 1.99993634573876e-06, "loss": 1.4653, "step": 2136 }, { "epoch": 0.01, "grad_norm": 4.752105085256273, "learning_rate": 1.9999362858435994e-06, "loss": 1.5093, "step": 2137 }, { "epoch": 0.01, "grad_norm": 4.567527923711839, "learning_rate": 1.9999362259202743e-06, "loss": 1.3605, "step": 2138 }, { "epoch": 0.01, "grad_norm": 4.928571238482233, "learning_rate": 1.999936165968784e-06, "loss": 1.4562, "step": 2139 }, { "epoch": 0.01, "grad_norm": 4.328826801769719, "learning_rate": 1.9999361059891288e-06, "loss": 1.347, "step": 2140 }, { "epoch": 0.01, "grad_norm": 4.886917145275371, "learning_rate": 1.9999360459813087e-06, "loss": 1.4164, "step": 2141 }, { "epoch": 0.01, "grad_norm": 6.291684965837117, "learning_rate": 1.999935985945324e-06, "loss": 1.4878, "step": 2142 }, { "epoch": 0.01, "grad_norm": 4.683286308224392, "learning_rate": 1.999935925881174e-06, "loss": 1.4892, "step": 2143 }, { "epoch": 0.01, "grad_norm": 5.5644324951779875, "learning_rate": 1.9999358657888597e-06, "loss": 1.4277, "step": 2144 }, { "epoch": 0.01, "grad_norm": 4.719071022798878, "learning_rate": 1.99993580566838e-06, "loss": 1.3753, "step": 2145 }, { "epoch": 0.01, "grad_norm": 5.384593412220494, "learning_rate": 1.9999357455197357e-06, "loss": 1.4575, "step": 2146 }, { "epoch": 0.01, "grad_norm": 4.49104950913911, "learning_rate": 1.999935685342927e-06, "loss": 1.4833, "step": 2147 }, { "epoch": 0.01, "grad_norm": 4.386345133556867, "learning_rate": 1.9999356251379525e-06, "loss": 1.2825, "step": 2148 }, { "epoch": 0.01, "grad_norm": 7.084357385014738, "learning_rate": 1.999935564904814e-06, "loss": 1.5971, "step": 2149 }, { "epoch": 0.01, "grad_norm": 4.483804488981741, "learning_rate": 1.99993550464351e-06, "loss": 1.3572, "step": 2150 }, { "epoch": 0.01, "grad_norm": 4.9800344533105045, "learning_rate": 1.999935444354042e-06, "loss": 1.4381, "step": 2151 }, { "epoch": 0.01, "grad_norm": 5.2020100481802025, "learning_rate": 1.999935384036408e-06, "loss": 1.5282, "step": 2152 }, { "epoch": 0.01, "grad_norm": 4.513002376188213, "learning_rate": 1.9999353236906097e-06, "loss": 1.4226, "step": 2153 }, { "epoch": 0.01, "grad_norm": 7.271851960236858, "learning_rate": 1.9999352633166466e-06, "loss": 1.4555, "step": 2154 }, { "epoch": 0.01, "grad_norm": 4.505644532012899, "learning_rate": 1.9999352029145187e-06, "loss": 1.3634, "step": 2155 }, { "epoch": 0.01, "grad_norm": 4.878236341705409, "learning_rate": 1.9999351424842258e-06, "loss": 1.3889, "step": 2156 }, { "epoch": 0.01, "grad_norm": 4.398260948524359, "learning_rate": 1.999935082025768e-06, "loss": 1.2472, "step": 2157 }, { "epoch": 0.01, "grad_norm": 4.83937816831458, "learning_rate": 1.9999350215391454e-06, "loss": 1.3623, "step": 2158 }, { "epoch": 0.01, "grad_norm": 4.865952677064821, "learning_rate": 1.999934961024358e-06, "loss": 1.4206, "step": 2159 }, { "epoch": 0.01, "grad_norm": 4.5376685001437, "learning_rate": 1.9999349004814058e-06, "loss": 1.4301, "step": 2160 }, { "epoch": 0.01, "grad_norm": 5.823420924569888, "learning_rate": 1.999934839910289e-06, "loss": 1.3083, "step": 2161 }, { "epoch": 0.01, "grad_norm": 4.854880352037673, "learning_rate": 1.999934779311007e-06, "loss": 1.4037, "step": 2162 }, { "epoch": 0.01, "grad_norm": 4.888876031412537, "learning_rate": 1.9999347186835603e-06, "loss": 1.4077, "step": 2163 }, { "epoch": 0.01, "grad_norm": 4.311155169710946, "learning_rate": 1.9999346580279485e-06, "loss": 1.2698, "step": 2164 }, { "epoch": 0.01, "grad_norm": 4.665174656087581, "learning_rate": 1.999934597344172e-06, "loss": 1.3712, "step": 2165 }, { "epoch": 0.01, "grad_norm": 5.162416137409393, "learning_rate": 1.999934536632231e-06, "loss": 1.4851, "step": 2166 }, { "epoch": 0.01, "grad_norm": 5.018889909992565, "learning_rate": 1.9999344758921247e-06, "loss": 1.452, "step": 2167 }, { "epoch": 0.01, "grad_norm": 4.793825358446103, "learning_rate": 1.999934415123854e-06, "loss": 1.4716, "step": 2168 }, { "epoch": 0.01, "grad_norm": 5.377258615565779, "learning_rate": 1.999934354327418e-06, "loss": 1.3063, "step": 2169 }, { "epoch": 0.01, "grad_norm": 4.656539358484254, "learning_rate": 1.9999342935028177e-06, "loss": 1.3844, "step": 2170 }, { "epoch": 0.01, "grad_norm": 5.869287289839831, "learning_rate": 1.9999342326500522e-06, "loss": 1.44, "step": 2171 }, { "epoch": 0.01, "grad_norm": 6.642787385567338, "learning_rate": 1.999934171769122e-06, "loss": 1.3566, "step": 2172 }, { "epoch": 0.01, "grad_norm": 4.8759975771352755, "learning_rate": 1.999934110860027e-06, "loss": 1.4363, "step": 2173 }, { "epoch": 0.01, "grad_norm": 5.319719105020677, "learning_rate": 1.999934049922767e-06, "loss": 1.3242, "step": 2174 }, { "epoch": 0.01, "grad_norm": 5.482396020182398, "learning_rate": 1.9999339889573426e-06, "loss": 1.5786, "step": 2175 }, { "epoch": 0.01, "grad_norm": 4.8751202038012265, "learning_rate": 1.999933927963753e-06, "loss": 1.5066, "step": 2176 }, { "epoch": 0.01, "grad_norm": 5.557230982959098, "learning_rate": 1.9999338669419984e-06, "loss": 1.4745, "step": 2177 }, { "epoch": 0.01, "grad_norm": 4.424760214415525, "learning_rate": 1.9999338058920797e-06, "loss": 1.3904, "step": 2178 }, { "epoch": 0.01, "grad_norm": 7.892348644844545, "learning_rate": 1.999933744813996e-06, "loss": 1.4981, "step": 2179 }, { "epoch": 0.01, "grad_norm": 4.751102176031589, "learning_rate": 1.999933683707747e-06, "loss": 1.4348, "step": 2180 }, { "epoch": 0.01, "grad_norm": 5.242736776599296, "learning_rate": 1.9999336225733336e-06, "loss": 1.5039, "step": 2181 }, { "epoch": 0.01, "grad_norm": 4.817718856225954, "learning_rate": 1.9999335614107553e-06, "loss": 1.4643, "step": 2182 }, { "epoch": 0.01, "grad_norm": 4.8671512340556315, "learning_rate": 1.999933500220012e-06, "loss": 1.4266, "step": 2183 }, { "epoch": 0.01, "grad_norm": 5.643685486109339, "learning_rate": 1.999933439001104e-06, "loss": 1.5044, "step": 2184 }, { "epoch": 0.01, "grad_norm": 4.812633898353339, "learning_rate": 1.9999333777540312e-06, "loss": 1.5403, "step": 2185 }, { "epoch": 0.01, "grad_norm": 4.5061691305089235, "learning_rate": 1.9999333164787937e-06, "loss": 1.2902, "step": 2186 }, { "epoch": 0.01, "grad_norm": 4.888665712532743, "learning_rate": 1.9999332551753915e-06, "loss": 1.4128, "step": 2187 }, { "epoch": 0.01, "grad_norm": 4.587451641455223, "learning_rate": 1.9999331938438245e-06, "loss": 1.4712, "step": 2188 }, { "epoch": 0.01, "grad_norm": 6.71997025307305, "learning_rate": 1.9999331324840925e-06, "loss": 1.3803, "step": 2189 }, { "epoch": 0.01, "grad_norm": 4.756832246172219, "learning_rate": 1.9999330710961957e-06, "loss": 1.4978, "step": 2190 }, { "epoch": 0.01, "eval_loss": 1.6319785118103027, "eval_runtime": 4.6267, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 2190 }, { "epoch": 0.01, "grad_norm": 5.050699124286467, "learning_rate": 1.9999330096801343e-06, "loss": 1.4176, "step": 2191 }, { "epoch": 0.01, "grad_norm": 4.898809642977276, "learning_rate": 1.9999329482359078e-06, "loss": 1.3222, "step": 2192 }, { "epoch": 0.01, "grad_norm": 4.704754463426861, "learning_rate": 1.999932886763517e-06, "loss": 1.3086, "step": 2193 }, { "epoch": 0.01, "grad_norm": 4.924235736627156, "learning_rate": 1.999932825262961e-06, "loss": 1.4768, "step": 2194 }, { "epoch": 0.01, "grad_norm": 7.042352873967065, "learning_rate": 1.9999327637342404e-06, "loss": 1.6917, "step": 2195 }, { "epoch": 0.01, "grad_norm": 4.9553596330060925, "learning_rate": 1.999932702177355e-06, "loss": 1.4411, "step": 2196 }, { "epoch": 0.01, "grad_norm": 4.963498435657598, "learning_rate": 1.9999326405923047e-06, "loss": 1.395, "step": 2197 }, { "epoch": 0.01, "grad_norm": 4.506052602096603, "learning_rate": 1.9999325789790895e-06, "loss": 1.247, "step": 2198 }, { "epoch": 0.01, "grad_norm": 5.278628706173458, "learning_rate": 1.99993251733771e-06, "loss": 1.4507, "step": 2199 }, { "epoch": 0.01, "grad_norm": 4.53129051287259, "learning_rate": 1.9999324556681656e-06, "loss": 1.4194, "step": 2200 }, { "epoch": 0.01, "grad_norm": 4.821670837144131, "learning_rate": 1.9999323939704564e-06, "loss": 1.3976, "step": 2201 }, { "epoch": 0.01, "grad_norm": 5.011321422146149, "learning_rate": 1.999932332244582e-06, "loss": 1.6157, "step": 2202 }, { "epoch": 0.01, "grad_norm": 4.951896374917011, "learning_rate": 1.9999322704905435e-06, "loss": 1.3892, "step": 2203 }, { "epoch": 0.01, "grad_norm": 4.653655462890473, "learning_rate": 1.99993220870834e-06, "loss": 1.4186, "step": 2204 }, { "epoch": 0.01, "grad_norm": 4.9073760614579784, "learning_rate": 1.9999321468979714e-06, "loss": 1.3918, "step": 2205 }, { "epoch": 0.01, "grad_norm": 4.615946898949113, "learning_rate": 1.9999320850594383e-06, "loss": 1.4513, "step": 2206 }, { "epoch": 0.01, "grad_norm": 4.781700938808737, "learning_rate": 1.99993202319274e-06, "loss": 1.4084, "step": 2207 }, { "epoch": 0.01, "grad_norm": 4.919243252625909, "learning_rate": 1.9999319612978777e-06, "loss": 1.3552, "step": 2208 }, { "epoch": 0.01, "grad_norm": 11.350832956279241, "learning_rate": 1.9999318993748505e-06, "loss": 1.6017, "step": 2209 }, { "epoch": 0.01, "grad_norm": 5.273831701204187, "learning_rate": 1.9999318374236587e-06, "loss": 1.4666, "step": 2210 }, { "epoch": 0.01, "grad_norm": 5.29032597581334, "learning_rate": 1.9999317754443013e-06, "loss": 1.6475, "step": 2211 }, { "epoch": 0.01, "grad_norm": 4.679461340547445, "learning_rate": 1.99993171343678e-06, "loss": 1.4847, "step": 2212 }, { "epoch": 0.01, "grad_norm": 4.881691601287856, "learning_rate": 1.9999316514010932e-06, "loss": 1.6707, "step": 2213 }, { "epoch": 0.01, "grad_norm": 4.880588356691005, "learning_rate": 1.999931589337242e-06, "loss": 1.4591, "step": 2214 }, { "epoch": 0.01, "grad_norm": 5.198012469615865, "learning_rate": 1.999931527245226e-06, "loss": 1.557, "step": 2215 }, { "epoch": 0.01, "grad_norm": 4.599284399092693, "learning_rate": 1.9999314651250456e-06, "loss": 1.4037, "step": 2216 }, { "epoch": 0.01, "grad_norm": 4.55023727427313, "learning_rate": 1.9999314029767004e-06, "loss": 1.3229, "step": 2217 }, { "epoch": 0.01, "grad_norm": 4.916366612795883, "learning_rate": 1.99993134080019e-06, "loss": 1.4152, "step": 2218 }, { "epoch": 0.01, "grad_norm": 4.272257622372692, "learning_rate": 1.9999312785955153e-06, "loss": 1.2069, "step": 2219 }, { "epoch": 0.02, "grad_norm": 5.793645780553963, "learning_rate": 1.9999312163626756e-06, "loss": 1.3437, "step": 2220 }, { "epoch": 0.02, "grad_norm": 4.765355672753897, "learning_rate": 1.9999311541016713e-06, "loss": 1.3828, "step": 2221 }, { "epoch": 0.02, "grad_norm": 4.609573170614245, "learning_rate": 1.9999310918125023e-06, "loss": 1.3513, "step": 2222 }, { "epoch": 0.02, "grad_norm": 4.705044702022353, "learning_rate": 1.9999310294951686e-06, "loss": 1.4292, "step": 2223 }, { "epoch": 0.02, "grad_norm": 6.549299970897536, "learning_rate": 1.99993096714967e-06, "loss": 1.3031, "step": 2224 }, { "epoch": 0.02, "grad_norm": 5.685926142426188, "learning_rate": 1.9999309047760067e-06, "loss": 1.4263, "step": 2225 }, { "epoch": 0.02, "grad_norm": 4.892295754105913, "learning_rate": 1.999930842374179e-06, "loss": 1.5274, "step": 2226 }, { "epoch": 0.02, "grad_norm": 4.723207695873954, "learning_rate": 1.999930779944186e-06, "loss": 1.2833, "step": 2227 }, { "epoch": 0.02, "grad_norm": 9.43058673722676, "learning_rate": 1.9999307174860284e-06, "loss": 1.3916, "step": 2228 }, { "epoch": 0.02, "grad_norm": 4.442144662758888, "learning_rate": 1.9999306549997065e-06, "loss": 1.3429, "step": 2229 }, { "epoch": 0.02, "grad_norm": 5.263310396091934, "learning_rate": 1.9999305924852196e-06, "loss": 1.4077, "step": 2230 }, { "epoch": 0.02, "grad_norm": 4.693930449937829, "learning_rate": 1.999930529942568e-06, "loss": 1.3103, "step": 2231 }, { "epoch": 0.02, "grad_norm": 4.394685919756182, "learning_rate": 1.9999304673717515e-06, "loss": 1.368, "step": 2232 }, { "epoch": 0.02, "grad_norm": 4.808214571784509, "learning_rate": 1.9999304047727705e-06, "loss": 1.5119, "step": 2233 }, { "epoch": 0.02, "grad_norm": 4.5647319860721165, "learning_rate": 1.9999303421456247e-06, "loss": 1.4842, "step": 2234 }, { "epoch": 0.02, "grad_norm": 5.270790805945803, "learning_rate": 1.9999302794903143e-06, "loss": 1.5416, "step": 2235 }, { "epoch": 0.02, "grad_norm": 5.115712741622331, "learning_rate": 1.999930216806839e-06, "loss": 1.6271, "step": 2236 }, { "epoch": 0.02, "grad_norm": 4.867749398037803, "learning_rate": 1.9999301540951993e-06, "loss": 1.4521, "step": 2237 }, { "epoch": 0.02, "grad_norm": 4.759697223017583, "learning_rate": 1.999930091355395e-06, "loss": 1.4084, "step": 2238 }, { "epoch": 0.02, "grad_norm": 4.910296412113315, "learning_rate": 1.999930028587425e-06, "loss": 1.3314, "step": 2239 }, { "epoch": 0.02, "grad_norm": 4.978260498852945, "learning_rate": 1.9999299657912913e-06, "loss": 1.389, "step": 2240 }, { "epoch": 0.02, "grad_norm": 5.148168402542014, "learning_rate": 1.9999299029669927e-06, "loss": 1.376, "step": 2241 }, { "epoch": 0.02, "grad_norm": 5.290115867763426, "learning_rate": 1.9999298401145294e-06, "loss": 1.6375, "step": 2242 }, { "epoch": 0.02, "grad_norm": 4.97467209793556, "learning_rate": 1.999929777233901e-06, "loss": 1.2882, "step": 2243 }, { "epoch": 0.02, "grad_norm": 5.344906181950449, "learning_rate": 1.9999297143251083e-06, "loss": 1.4118, "step": 2244 }, { "epoch": 0.02, "grad_norm": 4.308318004204632, "learning_rate": 1.999929651388151e-06, "loss": 1.3563, "step": 2245 }, { "epoch": 0.02, "grad_norm": 4.568152176449807, "learning_rate": 1.999929588423029e-06, "loss": 1.5212, "step": 2246 }, { "epoch": 0.02, "grad_norm": 4.601269314744774, "learning_rate": 1.9999295254297417e-06, "loss": 1.4808, "step": 2247 }, { "epoch": 0.02, "grad_norm": 4.854159941498488, "learning_rate": 1.9999294624082903e-06, "loss": 1.3002, "step": 2248 }, { "epoch": 0.02, "grad_norm": 4.718540879766618, "learning_rate": 1.9999293993586737e-06, "loss": 1.4872, "step": 2249 }, { "epoch": 0.02, "grad_norm": 4.7452734133361325, "learning_rate": 1.9999293362808933e-06, "loss": 1.3559, "step": 2250 }, { "epoch": 0.02, "grad_norm": 4.526122852996041, "learning_rate": 1.9999292731749473e-06, "loss": 1.4187, "step": 2251 }, { "epoch": 0.02, "grad_norm": 4.963529673562205, "learning_rate": 1.999929210040837e-06, "loss": 1.4198, "step": 2252 }, { "epoch": 0.02, "grad_norm": 4.77049575346727, "learning_rate": 1.999929146878562e-06, "loss": 1.2526, "step": 2253 }, { "epoch": 0.02, "grad_norm": 4.6993939688150395, "learning_rate": 1.999929083688122e-06, "loss": 1.5895, "step": 2254 }, { "epoch": 0.02, "grad_norm": 12.118619735033707, "learning_rate": 1.999929020469518e-06, "loss": 1.429, "step": 2255 }, { "epoch": 0.02, "grad_norm": 4.6599775134774815, "learning_rate": 1.999928957222749e-06, "loss": 1.5317, "step": 2256 }, { "epoch": 0.02, "grad_norm": 5.7355804324143405, "learning_rate": 1.9999288939478153e-06, "loss": 1.4884, "step": 2257 }, { "epoch": 0.02, "grad_norm": 5.3932795591976275, "learning_rate": 1.999928830644717e-06, "loss": 1.4564, "step": 2258 }, { "epoch": 0.02, "grad_norm": 5.091350488878629, "learning_rate": 1.9999287673134542e-06, "loss": 1.5135, "step": 2259 }, { "epoch": 0.02, "grad_norm": 5.7772806452872265, "learning_rate": 1.999928703954026e-06, "loss": 1.3913, "step": 2260 }, { "epoch": 0.02, "grad_norm": 4.604236988471221, "learning_rate": 1.999928640566434e-06, "loss": 1.5703, "step": 2261 }, { "epoch": 0.02, "grad_norm": 5.270866871119281, "learning_rate": 1.999928577150677e-06, "loss": 1.3042, "step": 2262 }, { "epoch": 0.02, "grad_norm": 4.55553737525265, "learning_rate": 1.9999285137067555e-06, "loss": 1.3917, "step": 2263 }, { "epoch": 0.02, "eval_loss": 1.6318128108978271, "eval_runtime": 4.6236, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.081, "step": 2263 }, { "epoch": 0.02, "grad_norm": 4.57569740895167, "learning_rate": 1.999928450234669e-06, "loss": 1.3676, "step": 2264 }, { "epoch": 0.02, "grad_norm": 5.233907230202161, "learning_rate": 1.9999283867344178e-06, "loss": 1.4965, "step": 2265 }, { "epoch": 0.02, "grad_norm": 4.36719675852659, "learning_rate": 1.9999283232060023e-06, "loss": 1.2795, "step": 2266 }, { "epoch": 0.02, "grad_norm": 6.6618468973225005, "learning_rate": 1.999928259649422e-06, "loss": 1.5947, "step": 2267 }, { "epoch": 0.02, "grad_norm": 5.211675232112218, "learning_rate": 1.999928196064677e-06, "loss": 1.6732, "step": 2268 }, { "epoch": 0.02, "grad_norm": 4.691350080075693, "learning_rate": 1.9999281324517672e-06, "loss": 1.4264, "step": 2269 }, { "epoch": 0.02, "grad_norm": 4.310562396402286, "learning_rate": 1.9999280688106934e-06, "loss": 1.3275, "step": 2270 }, { "epoch": 0.02, "grad_norm": 4.653521933252809, "learning_rate": 1.999928005141454e-06, "loss": 1.2394, "step": 2271 }, { "epoch": 0.02, "grad_norm": 5.251196276782816, "learning_rate": 1.999927941444051e-06, "loss": 1.4138, "step": 2272 }, { "epoch": 0.02, "grad_norm": 4.73612278780909, "learning_rate": 1.9999278777184825e-06, "loss": 1.3869, "step": 2273 }, { "epoch": 0.02, "grad_norm": 4.700582189189819, "learning_rate": 1.99992781396475e-06, "loss": 1.2867, "step": 2274 }, { "epoch": 0.02, "grad_norm": 4.995811821783478, "learning_rate": 1.999927750182852e-06, "loss": 1.4574, "step": 2275 }, { "epoch": 0.02, "grad_norm": 5.7048294811265405, "learning_rate": 1.99992768637279e-06, "loss": 1.5312, "step": 2276 }, { "epoch": 0.02, "grad_norm": 6.015500835378081, "learning_rate": 1.9999276225345635e-06, "loss": 1.4603, "step": 2277 }, { "epoch": 0.02, "grad_norm": 5.038403242723015, "learning_rate": 1.9999275586681717e-06, "loss": 1.4336, "step": 2278 }, { "epoch": 0.02, "grad_norm": 4.94366068107535, "learning_rate": 1.9999274947736156e-06, "loss": 1.4457, "step": 2279 }, { "epoch": 0.02, "grad_norm": 4.853975513709176, "learning_rate": 1.9999274308508953e-06, "loss": 1.4836, "step": 2280 }, { "epoch": 0.02, "grad_norm": 10.250005051833288, "learning_rate": 1.99992736690001e-06, "loss": 1.4746, "step": 2281 }, { "epoch": 0.02, "grad_norm": 4.706101925023441, "learning_rate": 1.9999273029209597e-06, "loss": 1.5153, "step": 2282 }, { "epoch": 0.02, "grad_norm": 8.691916068442643, "learning_rate": 1.9999272389137453e-06, "loss": 1.6037, "step": 2283 }, { "epoch": 0.02, "grad_norm": 4.891521294365354, "learning_rate": 1.999927174878366e-06, "loss": 1.4662, "step": 2284 }, { "epoch": 0.02, "grad_norm": 4.299833570856004, "learning_rate": 1.9999271108148224e-06, "loss": 1.4442, "step": 2285 }, { "epoch": 0.02, "grad_norm": 5.233339122045351, "learning_rate": 1.9999270467231143e-06, "loss": 1.593, "step": 2286 }, { "epoch": 0.02, "grad_norm": 4.876412995521755, "learning_rate": 1.999926982603241e-06, "loss": 1.4954, "step": 2287 }, { "epoch": 0.02, "grad_norm": 4.83398554670689, "learning_rate": 1.9999269184552033e-06, "loss": 1.4952, "step": 2288 }, { "epoch": 0.02, "grad_norm": 4.659977577870166, "learning_rate": 1.999926854279001e-06, "loss": 1.5399, "step": 2289 }, { "epoch": 0.02, "grad_norm": 4.709246532034214, "learning_rate": 1.9999267900746343e-06, "loss": 1.4852, "step": 2290 }, { "epoch": 0.02, "grad_norm": 4.469809734473597, "learning_rate": 1.9999267258421028e-06, "loss": 1.4133, "step": 2291 }, { "epoch": 0.02, "grad_norm": 4.470510103145094, "learning_rate": 1.999926661581407e-06, "loss": 1.393, "step": 2292 }, { "epoch": 0.02, "grad_norm": 5.045637874654517, "learning_rate": 1.999926597292546e-06, "loss": 1.4533, "step": 2293 }, { "epoch": 0.02, "grad_norm": 7.56360911520591, "learning_rate": 1.9999265329755204e-06, "loss": 1.4627, "step": 2294 }, { "epoch": 0.02, "grad_norm": 4.90860484176041, "learning_rate": 1.9999264686303306e-06, "loss": 1.4674, "step": 2295 }, { "epoch": 0.02, "grad_norm": 4.513345359016672, "learning_rate": 1.999926404256976e-06, "loss": 1.2764, "step": 2296 }, { "epoch": 0.02, "grad_norm": 5.223838246271355, "learning_rate": 1.9999263398554567e-06, "loss": 1.5127, "step": 2297 }, { "epoch": 0.02, "grad_norm": 5.195325189394897, "learning_rate": 1.999926275425773e-06, "loss": 1.3677, "step": 2298 }, { "epoch": 0.02, "grad_norm": 5.198912590441455, "learning_rate": 1.999926210967925e-06, "loss": 1.3427, "step": 2299 }, { "epoch": 0.02, "grad_norm": 4.614705914091713, "learning_rate": 1.999926146481912e-06, "loss": 1.4733, "step": 2300 }, { "epoch": 0.02, "grad_norm": 4.975373741434217, "learning_rate": 1.9999260819677345e-06, "loss": 1.4271, "step": 2301 }, { "epoch": 0.02, "grad_norm": 4.970231553749659, "learning_rate": 1.9999260174253926e-06, "loss": 1.52, "step": 2302 }, { "epoch": 0.02, "grad_norm": 5.1470565052529516, "learning_rate": 1.9999259528548856e-06, "loss": 1.4938, "step": 2303 }, { "epoch": 0.02, "grad_norm": 4.804932112180643, "learning_rate": 1.9999258882562143e-06, "loss": 1.3168, "step": 2304 }, { "epoch": 0.02, "grad_norm": 5.117885144420734, "learning_rate": 1.9999258236293784e-06, "loss": 1.4501, "step": 2305 }, { "epoch": 0.02, "grad_norm": 4.891670263126413, "learning_rate": 1.999925758974378e-06, "loss": 1.4911, "step": 2306 }, { "epoch": 0.02, "grad_norm": 5.195863211639049, "learning_rate": 1.9999256942912132e-06, "loss": 1.4502, "step": 2307 }, { "epoch": 0.02, "grad_norm": 5.2574150657434, "learning_rate": 1.9999256295798832e-06, "loss": 1.5026, "step": 2308 }, { "epoch": 0.02, "grad_norm": 5.872600619683232, "learning_rate": 1.9999255648403893e-06, "loss": 1.5434, "step": 2309 }, { "epoch": 0.02, "grad_norm": 4.686714043104333, "learning_rate": 1.9999255000727303e-06, "loss": 1.6448, "step": 2310 }, { "epoch": 0.02, "grad_norm": 4.631001470345563, "learning_rate": 1.999925435276907e-06, "loss": 1.3702, "step": 2311 }, { "epoch": 0.02, "grad_norm": 5.337508348563074, "learning_rate": 1.999925370452919e-06, "loss": 1.3578, "step": 2312 }, { "epoch": 0.02, "grad_norm": 4.65818564635677, "learning_rate": 1.999925305600767e-06, "loss": 1.4024, "step": 2313 }, { "epoch": 0.02, "grad_norm": 4.379822492720928, "learning_rate": 1.99992524072045e-06, "loss": 1.4887, "step": 2314 }, { "epoch": 0.02, "grad_norm": 5.299169016401586, "learning_rate": 1.9999251758119684e-06, "loss": 1.4817, "step": 2315 }, { "epoch": 0.02, "grad_norm": 5.225815129184746, "learning_rate": 1.999925110875322e-06, "loss": 1.4054, "step": 2316 }, { "epoch": 0.02, "grad_norm": 4.568549504456492, "learning_rate": 1.999925045910511e-06, "loss": 1.4551, "step": 2317 }, { "epoch": 0.02, "grad_norm": 5.431025457178463, "learning_rate": 1.999924980917536e-06, "loss": 1.4775, "step": 2318 }, { "epoch": 0.02, "grad_norm": 4.8399275566764235, "learning_rate": 1.999924915896396e-06, "loss": 1.4959, "step": 2319 }, { "epoch": 0.02, "grad_norm": 6.473563523649748, "learning_rate": 1.9999248508470912e-06, "loss": 1.4546, "step": 2320 }, { "epoch": 0.02, "grad_norm": 4.740884057821344, "learning_rate": 1.9999247857696227e-06, "loss": 1.4305, "step": 2321 }, { "epoch": 0.02, "grad_norm": 4.860828036066125, "learning_rate": 1.9999247206639887e-06, "loss": 1.4348, "step": 2322 }, { "epoch": 0.02, "grad_norm": 4.6662009244963, "learning_rate": 1.999924655530191e-06, "loss": 1.55, "step": 2323 }, { "epoch": 0.02, "grad_norm": 5.9234684189906925, "learning_rate": 1.9999245903682283e-06, "loss": 1.4198, "step": 2324 }, { "epoch": 0.02, "grad_norm": 4.492554780845503, "learning_rate": 1.999924525178101e-06, "loss": 1.3904, "step": 2325 }, { "epoch": 0.02, "grad_norm": 6.763408364164771, "learning_rate": 1.9999244599598095e-06, "loss": 1.2435, "step": 2326 }, { "epoch": 0.02, "grad_norm": 5.2225195105349975, "learning_rate": 1.9999243947133532e-06, "loss": 1.4942, "step": 2327 }, { "epoch": 0.02, "grad_norm": 5.06710806212036, "learning_rate": 1.9999243294387323e-06, "loss": 1.3485, "step": 2328 }, { "epoch": 0.02, "grad_norm": 4.97215088823951, "learning_rate": 1.9999242641359467e-06, "loss": 1.4744, "step": 2329 }, { "epoch": 0.02, "grad_norm": 6.085684880572574, "learning_rate": 1.999924198804997e-06, "loss": 1.347, "step": 2330 }, { "epoch": 0.02, "grad_norm": 7.378163474560379, "learning_rate": 1.9999241334458823e-06, "loss": 1.3741, "step": 2331 }, { "epoch": 0.02, "grad_norm": 4.503096219870561, "learning_rate": 1.999924068058603e-06, "loss": 1.5229, "step": 2332 }, { "epoch": 0.02, "grad_norm": 5.36927927637507, "learning_rate": 1.99992400264316e-06, "loss": 1.207, "step": 2333 }, { "epoch": 0.02, "grad_norm": 4.457857920180457, "learning_rate": 1.9999239371995517e-06, "loss": 1.4385, "step": 2334 }, { "epoch": 0.02, "grad_norm": 4.6240106572076956, "learning_rate": 1.999923871727779e-06, "loss": 1.4643, "step": 2335 }, { "epoch": 0.02, "grad_norm": 4.861127113148184, "learning_rate": 1.999923806227842e-06, "loss": 1.423, "step": 2336 }, { "epoch": 0.02, "eval_loss": 1.6273260116577148, "eval_runtime": 4.6368, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 2336 }, { "epoch": 0.02, "grad_norm": 4.9052593670852405, "learning_rate": 1.99992374069974e-06, "loss": 1.4129, "step": 2337 }, { "epoch": 0.02, "grad_norm": 5.35788095136664, "learning_rate": 1.9999236751434744e-06, "loss": 1.5536, "step": 2338 }, { "epoch": 0.02, "grad_norm": 4.5895028769970505, "learning_rate": 1.9999236095590435e-06, "loss": 1.3911, "step": 2339 }, { "epoch": 0.02, "grad_norm": 4.831901014472646, "learning_rate": 1.999923543946448e-06, "loss": 1.4226, "step": 2340 }, { "epoch": 0.02, "grad_norm": 4.898166932033894, "learning_rate": 1.9999234783056886e-06, "loss": 1.5627, "step": 2341 }, { "epoch": 0.02, "grad_norm": 4.579327466598639, "learning_rate": 1.999923412636764e-06, "loss": 1.4576, "step": 2342 }, { "epoch": 0.02, "grad_norm": 5.894443631125458, "learning_rate": 1.9999233469396754e-06, "loss": 1.1901, "step": 2343 }, { "epoch": 0.02, "grad_norm": 5.691429649313104, "learning_rate": 1.999923281214422e-06, "loss": 1.6006, "step": 2344 }, { "epoch": 0.02, "grad_norm": 5.688934834826396, "learning_rate": 1.999923215461004e-06, "loss": 1.562, "step": 2345 }, { "epoch": 0.02, "grad_norm": 5.587544882543127, "learning_rate": 1.999923149679422e-06, "loss": 1.4466, "step": 2346 }, { "epoch": 0.02, "grad_norm": 4.412183277646008, "learning_rate": 1.999923083869675e-06, "loss": 1.4389, "step": 2347 }, { "epoch": 0.02, "grad_norm": 4.939665962691669, "learning_rate": 1.9999230180317637e-06, "loss": 1.3812, "step": 2348 }, { "epoch": 0.02, "grad_norm": 5.20037573679403, "learning_rate": 1.9999229521656876e-06, "loss": 1.3061, "step": 2349 }, { "epoch": 0.02, "grad_norm": 5.219055105472269, "learning_rate": 1.9999228862714473e-06, "loss": 1.5609, "step": 2350 }, { "epoch": 0.02, "grad_norm": 4.346948682121618, "learning_rate": 1.9999228203490427e-06, "loss": 1.3565, "step": 2351 }, { "epoch": 0.02, "grad_norm": 4.644800626615502, "learning_rate": 1.999922754398473e-06, "loss": 1.3957, "step": 2352 }, { "epoch": 0.02, "grad_norm": 4.526521338106829, "learning_rate": 1.9999226884197394e-06, "loss": 1.3943, "step": 2353 }, { "epoch": 0.02, "grad_norm": 4.860775731784836, "learning_rate": 1.999922622412841e-06, "loss": 1.5376, "step": 2354 }, { "epoch": 0.02, "grad_norm": 4.987367583243536, "learning_rate": 1.999922556377778e-06, "loss": 1.1198, "step": 2355 }, { "epoch": 0.02, "grad_norm": 4.734455961994287, "learning_rate": 1.999922490314551e-06, "loss": 1.5315, "step": 2356 }, { "epoch": 0.02, "grad_norm": 4.71775855055711, "learning_rate": 1.999922424223159e-06, "loss": 1.4829, "step": 2357 }, { "epoch": 0.02, "grad_norm": 6.340625782014353, "learning_rate": 1.999922358103603e-06, "loss": 1.3968, "step": 2358 }, { "epoch": 0.02, "grad_norm": 4.624947314913078, "learning_rate": 1.999922291955882e-06, "loss": 1.3363, "step": 2359 }, { "epoch": 0.02, "grad_norm": 4.617825324071575, "learning_rate": 1.9999222257799964e-06, "loss": 1.5, "step": 2360 }, { "epoch": 0.02, "grad_norm": 4.485319822796489, "learning_rate": 1.999922159575947e-06, "loss": 1.4177, "step": 2361 }, { "epoch": 0.02, "grad_norm": 4.850158299358528, "learning_rate": 1.9999220933437324e-06, "loss": 1.3689, "step": 2362 }, { "epoch": 0.02, "grad_norm": 5.219052152565596, "learning_rate": 1.999922027083354e-06, "loss": 1.4092, "step": 2363 }, { "epoch": 0.02, "grad_norm": 4.931613378755357, "learning_rate": 1.999921960794811e-06, "loss": 1.326, "step": 2364 }, { "epoch": 0.02, "grad_norm": 4.992683247482379, "learning_rate": 1.9999218944781032e-06, "loss": 1.4233, "step": 2365 }, { "epoch": 0.02, "grad_norm": 4.349977050699164, "learning_rate": 1.9999218281332308e-06, "loss": 1.3647, "step": 2366 }, { "epoch": 0.02, "grad_norm": 4.701955352762745, "learning_rate": 1.999921761760194e-06, "loss": 1.4519, "step": 2367 }, { "epoch": 0.02, "grad_norm": 4.494673397335141, "learning_rate": 1.999921695358993e-06, "loss": 1.4392, "step": 2368 }, { "epoch": 0.02, "grad_norm": 4.7365593408579345, "learning_rate": 1.9999216289296273e-06, "loss": 1.3825, "step": 2369 }, { "epoch": 0.02, "grad_norm": 4.856324736948551, "learning_rate": 1.9999215624720974e-06, "loss": 1.5478, "step": 2370 }, { "epoch": 0.02, "grad_norm": 4.8131720405427645, "learning_rate": 1.999921495986403e-06, "loss": 1.4974, "step": 2371 }, { "epoch": 0.02, "grad_norm": 4.6424157982225935, "learning_rate": 1.9999214294725442e-06, "loss": 1.5017, "step": 2372 }, { "epoch": 0.02, "grad_norm": 5.7994216756118595, "learning_rate": 1.9999213629305206e-06, "loss": 1.3935, "step": 2373 }, { "epoch": 0.02, "grad_norm": 12.161522599209015, "learning_rate": 1.9999212963603328e-06, "loss": 1.5953, "step": 2374 }, { "epoch": 0.02, "grad_norm": 4.663179583938319, "learning_rate": 1.9999212297619806e-06, "loss": 1.3695, "step": 2375 }, { "epoch": 0.02, "grad_norm": 4.992382036334836, "learning_rate": 1.9999211631354633e-06, "loss": 1.5055, "step": 2376 }, { "epoch": 0.02, "grad_norm": 4.8753351925338695, "learning_rate": 1.9999210964807822e-06, "loss": 1.3948, "step": 2377 }, { "epoch": 0.02, "grad_norm": 6.222563145132559, "learning_rate": 1.9999210297979364e-06, "loss": 1.3826, "step": 2378 }, { "epoch": 0.02, "grad_norm": 4.88643245267929, "learning_rate": 1.9999209630869264e-06, "loss": 1.3931, "step": 2379 }, { "epoch": 0.02, "grad_norm": 5.560779789666663, "learning_rate": 1.999920896347752e-06, "loss": 1.4941, "step": 2380 }, { "epoch": 0.02, "grad_norm": 5.296411387581933, "learning_rate": 1.999920829580413e-06, "loss": 1.5771, "step": 2381 }, { "epoch": 0.02, "grad_norm": 4.739960436369839, "learning_rate": 1.9999207627849093e-06, "loss": 1.402, "step": 2382 }, { "epoch": 0.02, "grad_norm": 4.494606888309802, "learning_rate": 1.9999206959612413e-06, "loss": 1.3666, "step": 2383 }, { "epoch": 0.02, "grad_norm": 6.102337146702119, "learning_rate": 1.999920629109409e-06, "loss": 1.4514, "step": 2384 }, { "epoch": 0.02, "grad_norm": 4.2918281175179365, "learning_rate": 1.999920562229412e-06, "loss": 1.4275, "step": 2385 }, { "epoch": 0.02, "grad_norm": 4.5428032056841845, "learning_rate": 1.999920495321251e-06, "loss": 1.4875, "step": 2386 }, { "epoch": 0.02, "grad_norm": 4.981403898596884, "learning_rate": 1.9999204283849253e-06, "loss": 1.4604, "step": 2387 }, { "epoch": 0.02, "grad_norm": 4.656946904533867, "learning_rate": 1.999920361420435e-06, "loss": 1.3959, "step": 2388 }, { "epoch": 0.02, "grad_norm": 4.69793455074141, "learning_rate": 1.9999202944277807e-06, "loss": 1.3798, "step": 2389 }, { "epoch": 0.02, "grad_norm": 4.8697187270242415, "learning_rate": 1.9999202274069615e-06, "loss": 1.5877, "step": 2390 }, { "epoch": 0.02, "grad_norm": 6.303206163342291, "learning_rate": 1.999920160357978e-06, "loss": 1.3649, "step": 2391 }, { "epoch": 0.02, "grad_norm": 8.428043235458622, "learning_rate": 1.9999200932808304e-06, "loss": 1.4989, "step": 2392 }, { "epoch": 0.02, "grad_norm": 4.69775284325294, "learning_rate": 1.999920026175518e-06, "loss": 1.4359, "step": 2393 }, { "epoch": 0.02, "grad_norm": 7.468054893971967, "learning_rate": 1.9999199590420414e-06, "loss": 1.3861, "step": 2394 }, { "epoch": 0.02, "grad_norm": 5.1096088777638675, "learning_rate": 1.9999198918804005e-06, "loss": 1.3725, "step": 2395 }, { "epoch": 0.02, "grad_norm": 5.09309736334753, "learning_rate": 1.999919824690595e-06, "loss": 1.354, "step": 2396 }, { "epoch": 0.02, "grad_norm": 5.061146836059262, "learning_rate": 1.999919757472625e-06, "loss": 1.4047, "step": 2397 }, { "epoch": 0.02, "grad_norm": 4.698403918335731, "learning_rate": 1.999919690226491e-06, "loss": 1.5047, "step": 2398 }, { "epoch": 0.02, "grad_norm": 4.722675479564567, "learning_rate": 1.999919622952192e-06, "loss": 1.455, "step": 2399 }, { "epoch": 0.02, "grad_norm": 4.676824374912994, "learning_rate": 1.999919555649729e-06, "loss": 1.3603, "step": 2400 }, { "epoch": 0.02, "grad_norm": 4.660790706723943, "learning_rate": 1.999919488319101e-06, "loss": 1.4138, "step": 2401 }, { "epoch": 0.02, "grad_norm": 5.8886919642276006, "learning_rate": 1.9999194209603095e-06, "loss": 1.4662, "step": 2402 }, { "epoch": 0.02, "grad_norm": 4.728150466755214, "learning_rate": 1.999919353573353e-06, "loss": 1.3994, "step": 2403 }, { "epoch": 0.02, "grad_norm": 5.2893263295633, "learning_rate": 1.999919286158232e-06, "loss": 1.5503, "step": 2404 }, { "epoch": 0.02, "grad_norm": 4.462205296842043, "learning_rate": 1.9999192187149468e-06, "loss": 1.4898, "step": 2405 }, { "epoch": 0.02, "grad_norm": 4.171445485236358, "learning_rate": 1.9999191512434972e-06, "loss": 1.2432, "step": 2406 }, { "epoch": 0.02, "grad_norm": 7.065808066331224, "learning_rate": 1.9999190837438834e-06, "loss": 1.6226, "step": 2407 }, { "epoch": 0.02, "grad_norm": 4.94314214884708, "learning_rate": 1.999919016216105e-06, "loss": 1.4387, "step": 2408 }, { "epoch": 0.02, "grad_norm": 5.321472894774982, "learning_rate": 1.9999189486601625e-06, "loss": 1.525, "step": 2409 }, { "epoch": 0.02, "eval_loss": 1.6314270496368408, "eval_runtime": 4.6321, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 2409 }, { "epoch": 0.02, "grad_norm": 5.2667747042009365, "learning_rate": 1.9999188810760554e-06, "loss": 1.4374, "step": 2410 }, { "epoch": 0.02, "grad_norm": 8.33459038400234, "learning_rate": 1.9999188134637836e-06, "loss": 1.5239, "step": 2411 }, { "epoch": 0.02, "grad_norm": 4.682545029673098, "learning_rate": 1.9999187458233476e-06, "loss": 1.503, "step": 2412 }, { "epoch": 0.02, "grad_norm": 5.067629662325673, "learning_rate": 1.9999186781547473e-06, "loss": 1.5611, "step": 2413 }, { "epoch": 0.02, "grad_norm": 4.885518592851602, "learning_rate": 1.9999186104579827e-06, "loss": 1.399, "step": 2414 }, { "epoch": 0.02, "grad_norm": 4.8623966665305245, "learning_rate": 1.9999185427330535e-06, "loss": 1.5801, "step": 2415 }, { "epoch": 0.02, "grad_norm": 5.4540876235305555, "learning_rate": 1.9999184749799604e-06, "loss": 1.5821, "step": 2416 }, { "epoch": 0.02, "grad_norm": 4.705447443475058, "learning_rate": 1.999918407198702e-06, "loss": 1.3744, "step": 2417 }, { "epoch": 0.02, "grad_norm": 4.51004939941681, "learning_rate": 1.99991833938928e-06, "loss": 1.5278, "step": 2418 }, { "epoch": 0.02, "grad_norm": 5.781795445026661, "learning_rate": 1.9999182715516937e-06, "loss": 1.5668, "step": 2419 }, { "epoch": 0.02, "grad_norm": 5.4984493425238945, "learning_rate": 1.9999182036859427e-06, "loss": 1.4154, "step": 2420 }, { "epoch": 0.02, "grad_norm": 5.6682629310048345, "learning_rate": 1.999918135792027e-06, "loss": 1.1895, "step": 2421 }, { "epoch": 0.02, "grad_norm": 5.157730325288316, "learning_rate": 1.9999180678699474e-06, "loss": 1.3339, "step": 2422 }, { "epoch": 0.02, "grad_norm": 4.318416120479912, "learning_rate": 1.999917999919703e-06, "loss": 1.2886, "step": 2423 }, { "epoch": 0.02, "grad_norm": 6.782830315820668, "learning_rate": 1.999917931941295e-06, "loss": 1.3462, "step": 2424 }, { "epoch": 0.02, "grad_norm": 4.9884978695023685, "learning_rate": 1.9999178639347223e-06, "loss": 1.4021, "step": 2425 }, { "epoch": 0.02, "grad_norm": 4.93256932892002, "learning_rate": 1.999917795899985e-06, "loss": 1.493, "step": 2426 }, { "epoch": 0.02, "grad_norm": 4.556244224765811, "learning_rate": 1.9999177278370834e-06, "loss": 1.4557, "step": 2427 }, { "epoch": 0.02, "grad_norm": 5.629046540813747, "learning_rate": 1.999917659746018e-06, "loss": 1.5647, "step": 2428 }, { "epoch": 0.02, "grad_norm": 5.073793267819167, "learning_rate": 1.9999175916267875e-06, "loss": 1.5932, "step": 2429 }, { "epoch": 0.02, "grad_norm": 4.560810629230434, "learning_rate": 1.999917523479393e-06, "loss": 1.4904, "step": 2430 }, { "epoch": 0.02, "grad_norm": 5.097830006203227, "learning_rate": 1.999917455303834e-06, "loss": 1.4545, "step": 2431 }, { "epoch": 0.02, "grad_norm": 4.589161304086774, "learning_rate": 1.999917387100111e-06, "loss": 1.4455, "step": 2432 }, { "epoch": 0.02, "grad_norm": 5.262666403698471, "learning_rate": 1.9999173188682232e-06, "loss": 1.5229, "step": 2433 }, { "epoch": 0.02, "grad_norm": 6.509535961863751, "learning_rate": 1.999917250608171e-06, "loss": 1.4361, "step": 2434 }, { "epoch": 0.02, "grad_norm": 4.508763347593167, "learning_rate": 1.999917182319955e-06, "loss": 1.2921, "step": 2435 }, { "epoch": 0.02, "grad_norm": 5.40129312190739, "learning_rate": 1.999917114003574e-06, "loss": 1.3886, "step": 2436 }, { "epoch": 0.02, "grad_norm": 4.645761685555377, "learning_rate": 1.9999170456590293e-06, "loss": 1.3333, "step": 2437 }, { "epoch": 0.02, "grad_norm": 4.642316987637496, "learning_rate": 1.99991697728632e-06, "loss": 1.4236, "step": 2438 }, { "epoch": 0.02, "grad_norm": 5.516269375769559, "learning_rate": 1.9999169088854464e-06, "loss": 1.4596, "step": 2439 }, { "epoch": 0.02, "grad_norm": 4.96343766345647, "learning_rate": 1.9999168404564082e-06, "loss": 1.3929, "step": 2440 }, { "epoch": 0.02, "grad_norm": 4.69420089914068, "learning_rate": 1.999916771999206e-06, "loss": 1.4285, "step": 2441 }, { "epoch": 0.02, "grad_norm": 4.942122889945423, "learning_rate": 1.9999167035138392e-06, "loss": 1.4459, "step": 2442 }, { "epoch": 0.02, "grad_norm": 4.716647864815381, "learning_rate": 1.9999166350003083e-06, "loss": 1.4375, "step": 2443 }, { "epoch": 0.02, "grad_norm": 4.558659758531409, "learning_rate": 1.999916566458613e-06, "loss": 1.5505, "step": 2444 }, { "epoch": 0.02, "grad_norm": 5.032420136381452, "learning_rate": 1.9999164978887537e-06, "loss": 1.3798, "step": 2445 }, { "epoch": 0.02, "grad_norm": 4.387046415116763, "learning_rate": 1.9999164292907295e-06, "loss": 1.4069, "step": 2446 }, { "epoch": 0.02, "grad_norm": 4.829694624857794, "learning_rate": 1.999916360664541e-06, "loss": 1.4942, "step": 2447 }, { "epoch": 0.02, "grad_norm": 4.304283704762636, "learning_rate": 1.9999162920101885e-06, "loss": 1.3792, "step": 2448 }, { "epoch": 0.02, "grad_norm": 4.957652008482992, "learning_rate": 1.9999162233276715e-06, "loss": 1.5595, "step": 2449 }, { "epoch": 0.02, "grad_norm": 4.881099312639281, "learning_rate": 1.9999161546169903e-06, "loss": 1.3463, "step": 2450 }, { "epoch": 0.02, "grad_norm": 6.55918290195136, "learning_rate": 1.999916085878145e-06, "loss": 1.3157, "step": 2451 }, { "epoch": 0.02, "grad_norm": 4.933187617415705, "learning_rate": 1.999916017111135e-06, "loss": 1.325, "step": 2452 }, { "epoch": 0.02, "grad_norm": 5.522893367254299, "learning_rate": 1.999915948315961e-06, "loss": 1.3697, "step": 2453 }, { "epoch": 0.02, "grad_norm": 4.753016879907898, "learning_rate": 1.9999158794926227e-06, "loss": 1.5241, "step": 2454 }, { "epoch": 0.02, "grad_norm": 4.738367977115928, "learning_rate": 1.9999158106411197e-06, "loss": 1.5124, "step": 2455 }, { "epoch": 0.02, "grad_norm": 5.259436967977112, "learning_rate": 1.999915741761453e-06, "loss": 1.1845, "step": 2456 }, { "epoch": 0.02, "grad_norm": 5.327412482017732, "learning_rate": 1.9999156728536214e-06, "loss": 1.4195, "step": 2457 }, { "epoch": 0.02, "grad_norm": 4.815255649938258, "learning_rate": 1.9999156039176256e-06, "loss": 1.4678, "step": 2458 }, { "epoch": 0.02, "grad_norm": 4.830224872542197, "learning_rate": 1.999915534953466e-06, "loss": 1.4407, "step": 2459 }, { "epoch": 0.02, "grad_norm": 5.055447650223887, "learning_rate": 1.9999154659611416e-06, "loss": 1.3783, "step": 2460 }, { "epoch": 0.02, "grad_norm": 4.582184501347047, "learning_rate": 1.999915396940653e-06, "loss": 1.4415, "step": 2461 }, { "epoch": 0.02, "grad_norm": 4.691006151625649, "learning_rate": 1.999915327892e-06, "loss": 1.5197, "step": 2462 }, { "epoch": 0.02, "grad_norm": 5.591425262583514, "learning_rate": 1.999915258815183e-06, "loss": 1.4104, "step": 2463 }, { "epoch": 0.02, "grad_norm": 4.871239888750451, "learning_rate": 1.9999151897102016e-06, "loss": 1.3372, "step": 2464 }, { "epoch": 0.02, "grad_norm": 4.330668450921265, "learning_rate": 1.999915120577056e-06, "loss": 1.4389, "step": 2465 }, { "epoch": 0.02, "grad_norm": 5.340186565522926, "learning_rate": 1.999915051415746e-06, "loss": 1.4118, "step": 2466 }, { "epoch": 0.02, "grad_norm": 6.261449629532745, "learning_rate": 1.9999149822262718e-06, "loss": 1.7365, "step": 2467 }, { "epoch": 0.02, "grad_norm": 4.742851588374142, "learning_rate": 1.999914913008633e-06, "loss": 1.4563, "step": 2468 }, { "epoch": 0.02, "grad_norm": 5.025387402777135, "learning_rate": 1.99991484376283e-06, "loss": 1.4834, "step": 2469 }, { "epoch": 0.02, "grad_norm": 4.355185254680489, "learning_rate": 1.999914774488863e-06, "loss": 1.443, "step": 2470 }, { "epoch": 0.02, "grad_norm": 4.520118266978153, "learning_rate": 1.999914705186732e-06, "loss": 1.2398, "step": 2471 }, { "epoch": 0.02, "grad_norm": 4.679385582459207, "learning_rate": 1.9999146358564362e-06, "loss": 1.3546, "step": 2472 }, { "epoch": 0.02, "grad_norm": 4.653878537687508, "learning_rate": 1.9999145664979764e-06, "loss": 1.3641, "step": 2473 }, { "epoch": 0.02, "grad_norm": 5.288484525559772, "learning_rate": 1.9999144971113523e-06, "loss": 1.4576, "step": 2474 }, { "epoch": 0.02, "grad_norm": 5.113711451335311, "learning_rate": 1.999914427696564e-06, "loss": 1.1885, "step": 2475 }, { "epoch": 0.02, "grad_norm": 4.792513033095759, "learning_rate": 1.9999143582536113e-06, "loss": 1.49, "step": 2476 }, { "epoch": 0.02, "grad_norm": 4.916268869163306, "learning_rate": 1.999914288782494e-06, "loss": 1.4821, "step": 2477 }, { "epoch": 0.02, "grad_norm": 4.9324603436046415, "learning_rate": 1.999914219283213e-06, "loss": 1.5012, "step": 2478 }, { "epoch": 0.02, "grad_norm": 5.283635481534406, "learning_rate": 1.9999141497557674e-06, "loss": 1.3083, "step": 2479 }, { "epoch": 0.02, "grad_norm": 5.3789031660655935, "learning_rate": 1.9999140802001576e-06, "loss": 1.5692, "step": 2480 }, { "epoch": 0.02, "grad_norm": 4.496253992299153, "learning_rate": 1.9999140106163837e-06, "loss": 1.475, "step": 2481 }, { "epoch": 0.02, "grad_norm": 4.810348760053238, "learning_rate": 1.9999139410044454e-06, "loss": 1.3871, "step": 2482 }, { "epoch": 0.02, "eval_loss": 1.6317201852798462, "eval_runtime": 4.6311, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 2482 }, { "epoch": 0.02, "grad_norm": 4.936385744644064, "learning_rate": 1.999913871364343e-06, "loss": 1.5163, "step": 2483 }, { "epoch": 0.02, "grad_norm": 5.3932940350781315, "learning_rate": 1.999913801696076e-06, "loss": 1.2527, "step": 2484 }, { "epoch": 0.02, "grad_norm": 4.527212771343054, "learning_rate": 1.999913731999645e-06, "loss": 1.3808, "step": 2485 }, { "epoch": 0.02, "grad_norm": 4.6367894135268415, "learning_rate": 1.9999136622750497e-06, "loss": 1.4644, "step": 2486 }, { "epoch": 0.02, "grad_norm": 5.431008065838847, "learning_rate": 1.9999135925222905e-06, "loss": 1.4108, "step": 2487 }, { "epoch": 0.02, "grad_norm": 4.60955842936837, "learning_rate": 1.9999135227413667e-06, "loss": 1.5293, "step": 2488 }, { "epoch": 0.02, "grad_norm": 4.815249149207515, "learning_rate": 1.9999134529322785e-06, "loss": 1.4405, "step": 2489 }, { "epoch": 0.02, "grad_norm": 6.905408881518468, "learning_rate": 1.9999133830950266e-06, "loss": 1.5907, "step": 2490 }, { "epoch": 0.02, "grad_norm": 5.3654121964040815, "learning_rate": 1.99991331322961e-06, "loss": 1.5178, "step": 2491 }, { "epoch": 0.02, "grad_norm": 4.58569484491462, "learning_rate": 1.9999132433360294e-06, "loss": 1.3849, "step": 2492 }, { "epoch": 0.02, "grad_norm": 7.861754836669881, "learning_rate": 1.9999131734142846e-06, "loss": 1.5712, "step": 2493 }, { "epoch": 0.02, "grad_norm": 4.651684688922827, "learning_rate": 1.9999131034643755e-06, "loss": 1.4253, "step": 2494 }, { "epoch": 0.02, "grad_norm": 4.7784530707603885, "learning_rate": 1.9999130334863022e-06, "loss": 1.4367, "step": 2495 }, { "epoch": 0.02, "grad_norm": 4.616133306685504, "learning_rate": 1.9999129634800646e-06, "loss": 1.3498, "step": 2496 }, { "epoch": 0.02, "grad_norm": 5.119118711647458, "learning_rate": 1.9999128934456628e-06, "loss": 1.484, "step": 2497 }, { "epoch": 0.02, "grad_norm": 5.283146448319006, "learning_rate": 1.999912823383097e-06, "loss": 1.3398, "step": 2498 }, { "epoch": 0.02, "grad_norm": 4.571981013911557, "learning_rate": 1.9999127532923667e-06, "loss": 1.2996, "step": 2499 }, { "epoch": 0.02, "grad_norm": 5.332851655290826, "learning_rate": 1.999912683173472e-06, "loss": 1.5005, "step": 2500 }, { "epoch": 0.02, "grad_norm": 10.210017666226172, "learning_rate": 1.9999126130264135e-06, "loss": 1.3147, "step": 2501 }, { "epoch": 0.02, "grad_norm": 8.862337302086877, "learning_rate": 1.9999125428511907e-06, "loss": 1.3428, "step": 2502 }, { "epoch": 0.02, "grad_norm": 4.576195646780718, "learning_rate": 1.9999124726478036e-06, "loss": 1.4371, "step": 2503 }, { "epoch": 0.02, "grad_norm": 5.339924831875166, "learning_rate": 1.9999124024162523e-06, "loss": 1.5637, "step": 2504 }, { "epoch": 0.02, "grad_norm": 4.7192701371337025, "learning_rate": 1.9999123321565367e-06, "loss": 1.4242, "step": 2505 }, { "epoch": 0.02, "grad_norm": 5.163473966004869, "learning_rate": 1.999912261868657e-06, "loss": 1.3637, "step": 2506 }, { "epoch": 0.02, "grad_norm": 4.337334857544459, "learning_rate": 1.999912191552613e-06, "loss": 1.2784, "step": 2507 }, { "epoch": 0.02, "grad_norm": 6.051444830035015, "learning_rate": 1.999912121208405e-06, "loss": 1.6266, "step": 2508 }, { "epoch": 0.02, "grad_norm": 4.479649509721164, "learning_rate": 1.999912050836033e-06, "loss": 1.2662, "step": 2509 }, { "epoch": 0.02, "grad_norm": 4.472943621796013, "learning_rate": 1.9999119804354964e-06, "loss": 1.4028, "step": 2510 }, { "epoch": 0.02, "grad_norm": 4.710024013128741, "learning_rate": 1.9999119100067957e-06, "loss": 1.3498, "step": 2511 }, { "epoch": 0.02, "grad_norm": 4.970475591078188, "learning_rate": 1.9999118395499306e-06, "loss": 1.5096, "step": 2512 }, { "epoch": 0.02, "grad_norm": 4.75371637466236, "learning_rate": 1.9999117690649017e-06, "loss": 1.4432, "step": 2513 }, { "epoch": 0.02, "grad_norm": 4.468537061799282, "learning_rate": 1.9999116985517085e-06, "loss": 1.4252, "step": 2514 }, { "epoch": 0.02, "grad_norm": 4.49951522854794, "learning_rate": 1.999911628010351e-06, "loss": 1.3306, "step": 2515 }, { "epoch": 0.02, "grad_norm": 4.719020079035169, "learning_rate": 1.9999115574408294e-06, "loss": 1.3402, "step": 2516 }, { "epoch": 0.02, "grad_norm": 4.795670467668128, "learning_rate": 1.9999114868431434e-06, "loss": 1.4464, "step": 2517 }, { "epoch": 0.02, "grad_norm": 4.36164451676725, "learning_rate": 1.9999114162172936e-06, "loss": 1.3221, "step": 2518 }, { "epoch": 0.02, "grad_norm": 4.752857588130247, "learning_rate": 1.999911345563279e-06, "loss": 1.2821, "step": 2519 }, { "epoch": 0.02, "grad_norm": 5.279730766268046, "learning_rate": 1.9999112748811008e-06, "loss": 1.4016, "step": 2520 }, { "epoch": 0.02, "grad_norm": 4.695047237818786, "learning_rate": 1.9999112041707586e-06, "loss": 1.4613, "step": 2521 }, { "epoch": 0.02, "grad_norm": 4.692585340555871, "learning_rate": 1.9999111334322517e-06, "loss": 1.4694, "step": 2522 }, { "epoch": 0.02, "grad_norm": 4.771936002396662, "learning_rate": 1.999911062665581e-06, "loss": 1.3274, "step": 2523 }, { "epoch": 0.02, "grad_norm": 5.220174817764933, "learning_rate": 1.999910991870746e-06, "loss": 1.571, "step": 2524 }, { "epoch": 0.02, "grad_norm": 5.672229165440042, "learning_rate": 1.9999109210477466e-06, "loss": 1.5377, "step": 2525 }, { "epoch": 0.02, "grad_norm": 5.1839178319765935, "learning_rate": 1.999910850196583e-06, "loss": 1.3993, "step": 2526 }, { "epoch": 0.02, "grad_norm": 4.681220552283899, "learning_rate": 1.9999107793172557e-06, "loss": 1.4291, "step": 2527 }, { "epoch": 0.02, "grad_norm": 4.972007639821219, "learning_rate": 1.999910708409764e-06, "loss": 1.431, "step": 2528 }, { "epoch": 0.02, "grad_norm": 4.4861243909511295, "learning_rate": 1.999910637474108e-06, "loss": 1.4019, "step": 2529 }, { "epoch": 0.02, "grad_norm": 4.851005767238794, "learning_rate": 1.9999105665102883e-06, "loss": 1.4337, "step": 2530 }, { "epoch": 0.02, "grad_norm": 4.846084074130138, "learning_rate": 1.999910495518304e-06, "loss": 1.5946, "step": 2531 }, { "epoch": 0.02, "grad_norm": 4.536951575081545, "learning_rate": 1.9999104244981556e-06, "loss": 1.4743, "step": 2532 }, { "epoch": 0.02, "grad_norm": 7.248194758577965, "learning_rate": 1.9999103534498434e-06, "loss": 1.3195, "step": 2533 }, { "epoch": 0.02, "grad_norm": 6.056484295400697, "learning_rate": 1.9999102823733665e-06, "loss": 1.2914, "step": 2534 }, { "epoch": 0.02, "grad_norm": 4.505711846073996, "learning_rate": 1.999910211268726e-06, "loss": 1.4096, "step": 2535 }, { "epoch": 0.02, "grad_norm": 6.9063691562807055, "learning_rate": 1.999910140135921e-06, "loss": 1.2676, "step": 2536 }, { "epoch": 0.02, "grad_norm": 4.783198227824001, "learning_rate": 1.9999100689749517e-06, "loss": 1.4266, "step": 2537 }, { "epoch": 0.02, "grad_norm": 4.4803987517358665, "learning_rate": 1.9999099977858186e-06, "loss": 1.3878, "step": 2538 }, { "epoch": 0.02, "grad_norm": 21.125075865811542, "learning_rate": 1.9999099265685212e-06, "loss": 1.481, "step": 2539 }, { "epoch": 0.02, "grad_norm": 4.551735343167499, "learning_rate": 1.99990985532306e-06, "loss": 1.3335, "step": 2540 }, { "epoch": 0.02, "grad_norm": 4.525931993787852, "learning_rate": 1.999909784049434e-06, "loss": 1.4145, "step": 2541 }, { "epoch": 0.02, "grad_norm": 4.723242340525657, "learning_rate": 1.9999097127476444e-06, "loss": 1.425, "step": 2542 }, { "epoch": 0.02, "grad_norm": 11.52159628767104, "learning_rate": 1.9999096414176904e-06, "loss": 1.5459, "step": 2543 }, { "epoch": 0.02, "grad_norm": 4.499238976068291, "learning_rate": 1.9999095700595726e-06, "loss": 1.5052, "step": 2544 }, { "epoch": 0.02, "grad_norm": 4.3943003554534315, "learning_rate": 1.999909498673291e-06, "loss": 1.3532, "step": 2545 }, { "epoch": 0.02, "grad_norm": 5.384405241916104, "learning_rate": 1.9999094272588445e-06, "loss": 1.5439, "step": 2546 }, { "epoch": 0.02, "grad_norm": 5.401136108906954, "learning_rate": 1.999909355816234e-06, "loss": 1.4723, "step": 2547 }, { "epoch": 0.02, "grad_norm": 5.4369310319106114, "learning_rate": 1.9999092843454593e-06, "loss": 1.4131, "step": 2548 }, { "epoch": 0.02, "grad_norm": 4.975825034724867, "learning_rate": 1.999909212846521e-06, "loss": 1.5112, "step": 2549 }, { "epoch": 0.02, "grad_norm": 6.256239787239192, "learning_rate": 1.999909141319418e-06, "loss": 1.5494, "step": 2550 }, { "epoch": 0.02, "grad_norm": 4.563176202007188, "learning_rate": 1.9999090697641515e-06, "loss": 1.3635, "step": 2551 }, { "epoch": 0.02, "grad_norm": 4.431366662952086, "learning_rate": 1.9999089981807203e-06, "loss": 1.3677, "step": 2552 }, { "epoch": 0.02, "grad_norm": 4.189139491517448, "learning_rate": 1.9999089265691253e-06, "loss": 1.3525, "step": 2553 }, { "epoch": 0.02, "grad_norm": 5.080861898020357, "learning_rate": 1.999908854929366e-06, "loss": 1.3792, "step": 2554 }, { "epoch": 0.02, "grad_norm": 6.3571862989208325, "learning_rate": 1.9999087832614425e-06, "loss": 1.5974, "step": 2555 }, { "epoch": 0.02, "eval_loss": 1.626267433166504, "eval_runtime": 4.6223, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 2555 }, { "epoch": 0.02, "grad_norm": 5.82494449812951, "learning_rate": 1.9999087115653556e-06, "loss": 1.4692, "step": 2556 }, { "epoch": 0.02, "grad_norm": 4.929797834611915, "learning_rate": 1.999908639841104e-06, "loss": 1.4674, "step": 2557 }, { "epoch": 0.02, "grad_norm": 5.589160350904509, "learning_rate": 1.9999085680886884e-06, "loss": 1.4853, "step": 2558 }, { "epoch": 0.02, "grad_norm": 4.657411083617213, "learning_rate": 1.9999084963081087e-06, "loss": 1.3349, "step": 2559 }, { "epoch": 0.02, "grad_norm": 5.635155884205405, "learning_rate": 1.999908424499365e-06, "loss": 1.3812, "step": 2560 }, { "epoch": 0.02, "grad_norm": 11.039436368621972, "learning_rate": 1.9999083526624568e-06, "loss": 1.4228, "step": 2561 }, { "epoch": 0.02, "grad_norm": 4.731509118685059, "learning_rate": 1.999908280797385e-06, "loss": 1.5188, "step": 2562 }, { "epoch": 0.02, "grad_norm": 4.845729221450556, "learning_rate": 1.999908208904149e-06, "loss": 1.3457, "step": 2563 }, { "epoch": 0.02, "grad_norm": 4.6098361015362705, "learning_rate": 1.9999081369827488e-06, "loss": 1.4256, "step": 2564 }, { "epoch": 0.02, "grad_norm": 4.916273960056245, "learning_rate": 1.9999080650331847e-06, "loss": 1.3279, "step": 2565 }, { "epoch": 0.02, "grad_norm": 4.438335756166127, "learning_rate": 1.9999079930554563e-06, "loss": 1.241, "step": 2566 }, { "epoch": 0.02, "grad_norm": 4.817882655917825, "learning_rate": 1.999907921049564e-06, "loss": 1.4117, "step": 2567 }, { "epoch": 0.02, "grad_norm": 5.003123275534057, "learning_rate": 1.9999078490155076e-06, "loss": 1.2974, "step": 2568 }, { "epoch": 0.02, "grad_norm": 5.017000975631727, "learning_rate": 1.999907776953287e-06, "loss": 1.363, "step": 2569 }, { "epoch": 0.02, "grad_norm": 4.600005159137907, "learning_rate": 1.9999077048629022e-06, "loss": 1.4458, "step": 2570 }, { "epoch": 0.02, "grad_norm": 4.9265496357161425, "learning_rate": 1.9999076327443538e-06, "loss": 1.3172, "step": 2571 }, { "epoch": 0.02, "grad_norm": 5.824876884767345, "learning_rate": 1.9999075605976406e-06, "loss": 1.4476, "step": 2572 }, { "epoch": 0.02, "grad_norm": 4.361863130512629, "learning_rate": 1.9999074884227636e-06, "loss": 1.3836, "step": 2573 }, { "epoch": 0.02, "grad_norm": 4.7955376481293115, "learning_rate": 1.999907416219723e-06, "loss": 1.3473, "step": 2574 }, { "epoch": 0.02, "grad_norm": 4.666469831511099, "learning_rate": 1.9999073439885177e-06, "loss": 1.4382, "step": 2575 }, { "epoch": 0.02, "grad_norm": 4.73863410034199, "learning_rate": 1.999907271729149e-06, "loss": 1.3452, "step": 2576 }, { "epoch": 0.02, "grad_norm": 4.535702568291937, "learning_rate": 1.999907199441616e-06, "loss": 1.433, "step": 2577 }, { "epoch": 0.02, "grad_norm": 4.577074935520658, "learning_rate": 1.999907127125919e-06, "loss": 1.4831, "step": 2578 }, { "epoch": 0.02, "grad_norm": 5.722599811411352, "learning_rate": 1.9999070547820576e-06, "loss": 1.5383, "step": 2579 }, { "epoch": 0.02, "grad_norm": 22.655065655121753, "learning_rate": 1.999906982410032e-06, "loss": 1.3815, "step": 2580 }, { "epoch": 0.02, "grad_norm": 6.003296650995224, "learning_rate": 1.9999069100098425e-06, "loss": 1.5457, "step": 2581 }, { "epoch": 0.02, "grad_norm": 4.741277967056868, "learning_rate": 1.9999068375814896e-06, "loss": 1.5165, "step": 2582 }, { "epoch": 0.02, "grad_norm": 6.300018975344748, "learning_rate": 1.999906765124972e-06, "loss": 1.4036, "step": 2583 }, { "epoch": 0.02, "grad_norm": 7.552215789433446, "learning_rate": 1.9999066926402907e-06, "loss": 1.3368, "step": 2584 }, { "epoch": 0.02, "grad_norm": 5.470582591235478, "learning_rate": 1.999906620127445e-06, "loss": 1.5863, "step": 2585 }, { "epoch": 0.02, "grad_norm": 6.100878877106844, "learning_rate": 1.9999065475864355e-06, "loss": 1.501, "step": 2586 }, { "epoch": 0.02, "grad_norm": 4.389860076860747, "learning_rate": 1.9999064750172617e-06, "loss": 1.4151, "step": 2587 }, { "epoch": 0.02, "grad_norm": 5.63645147327467, "learning_rate": 1.999906402419924e-06, "loss": 1.5282, "step": 2588 }, { "epoch": 0.02, "grad_norm": 7.229014935819337, "learning_rate": 1.9999063297944226e-06, "loss": 1.5142, "step": 2589 }, { "epoch": 0.02, "grad_norm": 4.590820624310537, "learning_rate": 1.999906257140757e-06, "loss": 1.3774, "step": 2590 }, { "epoch": 0.02, "grad_norm": 4.724381873263039, "learning_rate": 1.9999061844589268e-06, "loss": 1.3214, "step": 2591 }, { "epoch": 0.02, "grad_norm": 4.484001614536028, "learning_rate": 1.999906111748933e-06, "loss": 1.3858, "step": 2592 }, { "epoch": 0.02, "grad_norm": 4.208403452992326, "learning_rate": 1.999906039010775e-06, "loss": 1.3302, "step": 2593 }, { "epoch": 0.02, "grad_norm": 4.505618278783764, "learning_rate": 1.9999059662444537e-06, "loss": 1.4443, "step": 2594 }, { "epoch": 0.02, "grad_norm": 4.591195710426086, "learning_rate": 1.999905893449968e-06, "loss": 1.3943, "step": 2595 }, { "epoch": 0.02, "grad_norm": 4.696805772544854, "learning_rate": 1.9999058206273177e-06, "loss": 1.4307, "step": 2596 }, { "epoch": 0.02, "grad_norm": 5.021073562891561, "learning_rate": 1.999905747776504e-06, "loss": 1.5345, "step": 2597 }, { "epoch": 0.02, "grad_norm": 4.7598018312861194, "learning_rate": 1.999905674897526e-06, "loss": 1.6453, "step": 2598 }, { "epoch": 0.02, "grad_norm": 4.581994132757299, "learning_rate": 1.9999056019903835e-06, "loss": 1.4905, "step": 2599 }, { "epoch": 0.02, "grad_norm": 4.841310271459569, "learning_rate": 1.999905529055078e-06, "loss": 1.4203, "step": 2600 }, { "epoch": 0.02, "grad_norm": 5.189830617523313, "learning_rate": 1.999905456091608e-06, "loss": 1.3274, "step": 2601 }, { "epoch": 0.02, "grad_norm": 4.720146352163821, "learning_rate": 1.999905383099974e-06, "loss": 1.4674, "step": 2602 }, { "epoch": 0.02, "grad_norm": 4.543963809407407, "learning_rate": 1.999905310080176e-06, "loss": 1.4255, "step": 2603 }, { "epoch": 0.02, "grad_norm": 5.18614032345166, "learning_rate": 1.999905237032214e-06, "loss": 1.4297, "step": 2604 }, { "epoch": 0.02, "grad_norm": 4.465791115799426, "learning_rate": 1.999905163956088e-06, "loss": 1.4399, "step": 2605 }, { "epoch": 0.02, "grad_norm": 4.694449197808517, "learning_rate": 1.9999050908517977e-06, "loss": 1.4938, "step": 2606 }, { "epoch": 0.02, "grad_norm": 5.4688966732418365, "learning_rate": 1.9999050177193436e-06, "loss": 1.4578, "step": 2607 }, { "epoch": 0.02, "grad_norm": 5.10844282586031, "learning_rate": 1.9999049445587256e-06, "loss": 1.3347, "step": 2608 }, { "epoch": 0.02, "grad_norm": 4.762680506088013, "learning_rate": 1.999904871369944e-06, "loss": 1.4342, "step": 2609 }, { "epoch": 0.02, "grad_norm": 5.979551926427287, "learning_rate": 1.999904798152998e-06, "loss": 1.5078, "step": 2610 }, { "epoch": 0.02, "grad_norm": 4.533251775082327, "learning_rate": 1.999904724907888e-06, "loss": 1.2065, "step": 2611 }, { "epoch": 0.02, "grad_norm": 5.121922341187516, "learning_rate": 1.9999046516346138e-06, "loss": 1.5454, "step": 2612 }, { "epoch": 0.02, "grad_norm": 5.781680034327118, "learning_rate": 1.9999045783331757e-06, "loss": 1.4578, "step": 2613 }, { "epoch": 0.02, "grad_norm": 5.826495337980005, "learning_rate": 1.999904505003574e-06, "loss": 1.4422, "step": 2614 }, { "epoch": 0.02, "grad_norm": 6.335079927329629, "learning_rate": 1.9999044316458078e-06, "loss": 1.3345, "step": 2615 }, { "epoch": 0.02, "grad_norm": 4.778422328814399, "learning_rate": 1.9999043582598782e-06, "loss": 1.3202, "step": 2616 }, { "epoch": 0.02, "grad_norm": 4.8282702985765145, "learning_rate": 1.999904284845784e-06, "loss": 1.407, "step": 2617 }, { "epoch": 0.02, "grad_norm": 4.867494484747974, "learning_rate": 1.999904211403526e-06, "loss": 1.5508, "step": 2618 }, { "epoch": 0.02, "grad_norm": 4.781917993855085, "learning_rate": 1.9999041379331044e-06, "loss": 1.3034, "step": 2619 }, { "epoch": 0.02, "grad_norm": 4.712392259942616, "learning_rate": 1.9999040644345186e-06, "loss": 1.5253, "step": 2620 }, { "epoch": 0.02, "grad_norm": 5.030532462772803, "learning_rate": 1.9999039909077686e-06, "loss": 1.5149, "step": 2621 }, { "epoch": 0.02, "grad_norm": 6.621139707916859, "learning_rate": 1.999903917352855e-06, "loss": 1.5093, "step": 2622 }, { "epoch": 0.02, "grad_norm": 8.008006829313361, "learning_rate": 1.9999038437697774e-06, "loss": 1.4026, "step": 2623 }, { "epoch": 0.02, "grad_norm": 6.136006728609445, "learning_rate": 1.9999037701585354e-06, "loss": 1.5477, "step": 2624 }, { "epoch": 0.02, "grad_norm": 4.760430548827488, "learning_rate": 1.9999036965191295e-06, "loss": 1.4864, "step": 2625 }, { "epoch": 0.02, "grad_norm": 5.006634707089645, "learning_rate": 1.9999036228515603e-06, "loss": 1.4774, "step": 2626 }, { "epoch": 0.02, "grad_norm": 4.588292407337218, "learning_rate": 1.9999035491558263e-06, "loss": 1.3735, "step": 2627 }, { "epoch": 0.02, "grad_norm": 4.816326208591829, "learning_rate": 1.999903475431929e-06, "loss": 1.2208, "step": 2628 }, { "epoch": 0.02, "eval_loss": 1.6295270919799805, "eval_runtime": 4.6318, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 2628 }, { "epoch": 0.02, "grad_norm": 4.638403311998232, "learning_rate": 1.9999034016798672e-06, "loss": 1.4943, "step": 2629 }, { "epoch": 0.02, "grad_norm": 4.961202684709964, "learning_rate": 1.999903327899642e-06, "loss": 1.4069, "step": 2630 }, { "epoch": 0.02, "grad_norm": 4.348996064319459, "learning_rate": 1.9999032540912524e-06, "loss": 1.1698, "step": 2631 }, { "epoch": 0.02, "grad_norm": 4.444823433064274, "learning_rate": 1.999903180254699e-06, "loss": 1.3074, "step": 2632 }, { "epoch": 0.02, "grad_norm": 4.965888101259268, "learning_rate": 1.9999031063899817e-06, "loss": 1.3823, "step": 2633 }, { "epoch": 0.02, "grad_norm": 6.453473043817252, "learning_rate": 1.999903032497101e-06, "loss": 1.4969, "step": 2634 }, { "epoch": 0.02, "grad_norm": 5.068247346576678, "learning_rate": 1.9999029585760553e-06, "loss": 1.5847, "step": 2635 }, { "epoch": 0.02, "grad_norm": 5.6600389675468294, "learning_rate": 1.9999028846268463e-06, "loss": 1.3724, "step": 2636 }, { "epoch": 0.02, "grad_norm": 4.374383238249181, "learning_rate": 1.9999028106494735e-06, "loss": 1.4313, "step": 2637 }, { "epoch": 0.02, "grad_norm": 4.680029067986695, "learning_rate": 1.999902736643936e-06, "loss": 1.3181, "step": 2638 }, { "epoch": 0.02, "grad_norm": 4.580712621593848, "learning_rate": 1.999902662610235e-06, "loss": 1.3987, "step": 2639 }, { "epoch": 0.02, "grad_norm": 6.092256320436487, "learning_rate": 1.9999025885483706e-06, "loss": 1.6909, "step": 2640 }, { "epoch": 0.02, "grad_norm": 4.7827050109074385, "learning_rate": 1.9999025144583415e-06, "loss": 1.517, "step": 2641 }, { "epoch": 0.02, "grad_norm": 5.011057538279476, "learning_rate": 1.9999024403401486e-06, "loss": 1.4349, "step": 2642 }, { "epoch": 0.02, "grad_norm": 4.8294099232047785, "learning_rate": 1.9999023661937923e-06, "loss": 1.4704, "step": 2643 }, { "epoch": 0.02, "grad_norm": 4.597159590269809, "learning_rate": 1.9999022920192712e-06, "loss": 1.3426, "step": 2644 }, { "epoch": 0.02, "grad_norm": 4.9150712542829496, "learning_rate": 1.9999022178165868e-06, "loss": 1.3268, "step": 2645 }, { "epoch": 0.02, "grad_norm": 5.027583647063347, "learning_rate": 1.9999021435857385e-06, "loss": 1.4659, "step": 2646 }, { "epoch": 0.02, "grad_norm": 4.68442647731963, "learning_rate": 1.9999020693267263e-06, "loss": 1.3447, "step": 2647 }, { "epoch": 0.02, "grad_norm": 4.9605688635740695, "learning_rate": 1.9999019950395503e-06, "loss": 1.4796, "step": 2648 }, { "epoch": 0.02, "grad_norm": 4.809892082871139, "learning_rate": 1.9999019207242097e-06, "loss": 1.4104, "step": 2649 }, { "epoch": 0.02, "grad_norm": 4.743744228034844, "learning_rate": 1.999901846380706e-06, "loss": 1.4046, "step": 2650 }, { "epoch": 0.02, "grad_norm": 4.269118159347582, "learning_rate": 1.999901772009038e-06, "loss": 1.2958, "step": 2651 }, { "epoch": 0.02, "grad_norm": 5.853949838326959, "learning_rate": 1.999901697609206e-06, "loss": 1.4926, "step": 2652 }, { "epoch": 0.02, "grad_norm": 4.776685722649282, "learning_rate": 1.99990162318121e-06, "loss": 1.4355, "step": 2653 }, { "epoch": 0.02, "grad_norm": 6.208703600062207, "learning_rate": 1.9999015487250503e-06, "loss": 1.5673, "step": 2654 }, { "epoch": 0.02, "grad_norm": 6.7039599749841186, "learning_rate": 1.999901474240727e-06, "loss": 1.5676, "step": 2655 }, { "epoch": 0.02, "grad_norm": 4.417347561144367, "learning_rate": 1.9999013997282393e-06, "loss": 1.3797, "step": 2656 }, { "epoch": 0.02, "grad_norm": 4.6270515827313465, "learning_rate": 1.999901325187588e-06, "loss": 1.4198, "step": 2657 }, { "epoch": 0.02, "grad_norm": 4.644490932379133, "learning_rate": 1.9999012506187726e-06, "loss": 1.3909, "step": 2658 }, { "epoch": 0.02, "grad_norm": 4.766952782820096, "learning_rate": 1.9999011760217935e-06, "loss": 1.5499, "step": 2659 }, { "epoch": 0.02, "grad_norm": 5.024704187001623, "learning_rate": 1.99990110139665e-06, "loss": 1.5967, "step": 2660 }, { "epoch": 0.02, "grad_norm": 4.484156877401999, "learning_rate": 1.9999010267433432e-06, "loss": 1.3459, "step": 2661 }, { "epoch": 0.02, "grad_norm": 4.960235392776955, "learning_rate": 1.9999009520618725e-06, "loss": 1.47, "step": 2662 }, { "epoch": 0.02, "grad_norm": 5.013252259638264, "learning_rate": 1.9999008773522376e-06, "loss": 1.2694, "step": 2663 }, { "epoch": 0.02, "grad_norm": 4.796804229686202, "learning_rate": 1.9999008026144392e-06, "loss": 1.4616, "step": 2664 }, { "epoch": 0.02, "grad_norm": 5.1481769265464825, "learning_rate": 1.9999007278484766e-06, "loss": 1.494, "step": 2665 }, { "epoch": 0.02, "grad_norm": 4.895324247068634, "learning_rate": 1.99990065305435e-06, "loss": 1.5384, "step": 2666 }, { "epoch": 0.02, "grad_norm": 4.655033924444686, "learning_rate": 1.99990057823206e-06, "loss": 1.427, "step": 2667 }, { "epoch": 0.02, "grad_norm": 4.7945063377020425, "learning_rate": 1.999900503381606e-06, "loss": 1.4635, "step": 2668 }, { "epoch": 0.02, "grad_norm": 6.282804236505679, "learning_rate": 1.999900428502988e-06, "loss": 1.3777, "step": 2669 }, { "epoch": 0.02, "grad_norm": 5.956119769772323, "learning_rate": 1.9999003535962058e-06, "loss": 1.5537, "step": 2670 }, { "epoch": 0.02, "grad_norm": 4.744533892549758, "learning_rate": 1.99990027866126e-06, "loss": 1.411, "step": 2671 }, { "epoch": 0.02, "grad_norm": 4.685437895155002, "learning_rate": 1.9999002036981505e-06, "loss": 1.3089, "step": 2672 }, { "epoch": 0.02, "grad_norm": 4.679001375905665, "learning_rate": 1.999900128706877e-06, "loss": 1.5004, "step": 2673 }, { "epoch": 0.02, "grad_norm": 4.816836882292446, "learning_rate": 1.99990005368744e-06, "loss": 1.4415, "step": 2674 }, { "epoch": 0.02, "grad_norm": 5.397353037395355, "learning_rate": 1.9998999786398383e-06, "loss": 1.457, "step": 2675 }, { "epoch": 0.02, "grad_norm": 4.836128128995581, "learning_rate": 1.9998999035640734e-06, "loss": 1.3645, "step": 2676 }, { "epoch": 0.02, "grad_norm": 6.345520428845227, "learning_rate": 1.9998998284601446e-06, "loss": 1.5424, "step": 2677 }, { "epoch": 0.02, "grad_norm": 5.582942854801658, "learning_rate": 1.999899753328052e-06, "loss": 1.4421, "step": 2678 }, { "epoch": 0.02, "grad_norm": 4.5571891655235275, "learning_rate": 1.999899678167795e-06, "loss": 1.3543, "step": 2679 }, { "epoch": 0.02, "grad_norm": 5.349049923012939, "learning_rate": 1.999899602979375e-06, "loss": 1.209, "step": 2680 }, { "epoch": 0.02, "grad_norm": 4.584099365206942, "learning_rate": 1.9998995277627906e-06, "loss": 1.4517, "step": 2681 }, { "epoch": 0.02, "grad_norm": 4.851375584879138, "learning_rate": 1.9998994525180426e-06, "loss": 1.3411, "step": 2682 }, { "epoch": 0.02, "grad_norm": 4.6601703251560025, "learning_rate": 1.9998993772451304e-06, "loss": 1.3298, "step": 2683 }, { "epoch": 0.02, "grad_norm": 4.8561771787958365, "learning_rate": 1.9998993019440547e-06, "loss": 1.4937, "step": 2684 }, { "epoch": 0.02, "grad_norm": 5.1790245554459755, "learning_rate": 1.999899226614815e-06, "loss": 1.471, "step": 2685 }, { "epoch": 0.02, "grad_norm": 4.742734422892583, "learning_rate": 1.9998991512574113e-06, "loss": 1.5035, "step": 2686 }, { "epoch": 0.02, "grad_norm": 4.785648242343571, "learning_rate": 1.999899075871844e-06, "loss": 1.5267, "step": 2687 }, { "epoch": 0.02, "grad_norm": 5.057841994753448, "learning_rate": 1.999899000458113e-06, "loss": 1.3789, "step": 2688 }, { "epoch": 0.02, "grad_norm": 5.0118905395912945, "learning_rate": 1.999898925016218e-06, "loss": 1.3579, "step": 2689 }, { "epoch": 0.02, "grad_norm": 5.35192729500763, "learning_rate": 1.999898849546159e-06, "loss": 1.4838, "step": 2690 }, { "epoch": 0.02, "grad_norm": 4.337685327376343, "learning_rate": 1.9998987740479367e-06, "loss": 1.3297, "step": 2691 }, { "epoch": 0.02, "grad_norm": 4.808629946983388, "learning_rate": 1.99989869852155e-06, "loss": 1.4873, "step": 2692 }, { "epoch": 0.02, "grad_norm": 5.2407858594510035, "learning_rate": 1.999898622967e-06, "loss": 1.5086, "step": 2693 }, { "epoch": 0.02, "grad_norm": 4.941322758554874, "learning_rate": 1.9998985473842858e-06, "loss": 1.3952, "step": 2694 }, { "epoch": 0.02, "grad_norm": 4.260790698502375, "learning_rate": 1.9998984717734078e-06, "loss": 1.3652, "step": 2695 }, { "epoch": 0.02, "grad_norm": 5.2005982502957595, "learning_rate": 1.999898396134366e-06, "loss": 1.5483, "step": 2696 }, { "epoch": 0.02, "grad_norm": 4.772216539864026, "learning_rate": 1.9998983204671606e-06, "loss": 1.2309, "step": 2697 }, { "epoch": 0.02, "grad_norm": 4.448621297124078, "learning_rate": 1.999898244771791e-06, "loss": 1.3967, "step": 2698 }, { "epoch": 0.02, "grad_norm": 5.6464418666503295, "learning_rate": 1.9998981690482577e-06, "loss": 1.2706, "step": 2699 }, { "epoch": 0.02, "grad_norm": 5.142174365370682, "learning_rate": 1.999898093296561e-06, "loss": 1.3629, "step": 2700 }, { "epoch": 0.02, "grad_norm": 4.489321589199872, "learning_rate": 1.9998980175167003e-06, "loss": 1.4768, "step": 2701 }, { "epoch": 0.02, "eval_loss": 1.6249644756317139, "eval_runtime": 4.6538, "eval_samples_per_second": 1.934, "eval_steps_per_second": 1.074, "step": 2701 }, { "epoch": 0.02, "grad_norm": 4.728941606346881, "learning_rate": 1.9998979417086758e-06, "loss": 1.4208, "step": 2702 }, { "epoch": 0.02, "grad_norm": 6.721435664278658, "learning_rate": 1.9998978658724874e-06, "loss": 1.6635, "step": 2703 }, { "epoch": 0.02, "grad_norm": 5.672729490886054, "learning_rate": 1.9998977900081353e-06, "loss": 1.3433, "step": 2704 }, { "epoch": 0.02, "grad_norm": 5.476896381988856, "learning_rate": 1.9998977141156192e-06, "loss": 1.4389, "step": 2705 }, { "epoch": 0.02, "grad_norm": 4.783782128937111, "learning_rate": 1.9998976381949394e-06, "loss": 1.4866, "step": 2706 }, { "epoch": 0.02, "grad_norm": 4.724902284051836, "learning_rate": 1.999897562246096e-06, "loss": 1.3447, "step": 2707 }, { "epoch": 0.02, "grad_norm": 4.850028686058834, "learning_rate": 1.999897486269089e-06, "loss": 1.3209, "step": 2708 }, { "epoch": 0.02, "grad_norm": 4.83053553627955, "learning_rate": 1.9998974102639175e-06, "loss": 1.5586, "step": 2709 }, { "epoch": 0.02, "grad_norm": 4.712477754987166, "learning_rate": 1.9998973342305827e-06, "loss": 1.4225, "step": 2710 }, { "epoch": 0.02, "grad_norm": 4.54227448455875, "learning_rate": 1.999897258169084e-06, "loss": 1.3462, "step": 2711 }, { "epoch": 0.02, "grad_norm": 5.5411282379403985, "learning_rate": 1.9998971820794215e-06, "loss": 1.4259, "step": 2712 }, { "epoch": 0.02, "grad_norm": 4.739705320723445, "learning_rate": 1.999897105961595e-06, "loss": 1.457, "step": 2713 }, { "epoch": 0.02, "grad_norm": 4.701603405635517, "learning_rate": 1.9998970298156053e-06, "loss": 1.4297, "step": 2714 }, { "epoch": 0.02, "grad_norm": 4.991641204953335, "learning_rate": 1.9998969536414512e-06, "loss": 1.4822, "step": 2715 }, { "epoch": 0.02, "grad_norm": 5.480036657625506, "learning_rate": 1.9998968774391338e-06, "loss": 1.3061, "step": 2716 }, { "epoch": 0.02, "grad_norm": 4.857570508327938, "learning_rate": 1.9998968012086524e-06, "loss": 1.5446, "step": 2717 }, { "epoch": 0.02, "grad_norm": 4.917848715898165, "learning_rate": 1.9998967249500073e-06, "loss": 1.4604, "step": 2718 }, { "epoch": 0.02, "grad_norm": 4.532991154379635, "learning_rate": 1.9998966486631987e-06, "loss": 1.3497, "step": 2719 }, { "epoch": 0.02, "grad_norm": 4.516960068763919, "learning_rate": 1.999896572348226e-06, "loss": 1.3371, "step": 2720 }, { "epoch": 0.02, "grad_norm": 5.4294845884659875, "learning_rate": 1.9998964960050895e-06, "loss": 1.4655, "step": 2721 }, { "epoch": 0.02, "grad_norm": 6.465778207032482, "learning_rate": 1.9998964196337894e-06, "loss": 1.1239, "step": 2722 }, { "epoch": 0.02, "grad_norm": 4.380579112263108, "learning_rate": 1.9998963432343254e-06, "loss": 1.4072, "step": 2723 }, { "epoch": 0.02, "grad_norm": 5.347121577221788, "learning_rate": 1.999896266806698e-06, "loss": 1.4905, "step": 2724 }, { "epoch": 0.02, "grad_norm": 4.091313845837881, "learning_rate": 1.9998961903509063e-06, "loss": 1.1784, "step": 2725 }, { "epoch": 0.02, "grad_norm": 4.9326180566194155, "learning_rate": 1.9998961138669512e-06, "loss": 1.5945, "step": 2726 }, { "epoch": 0.02, "grad_norm": 12.547464104001262, "learning_rate": 1.9998960373548323e-06, "loss": 1.7427, "step": 2727 }, { "epoch": 0.02, "grad_norm": 4.955919797294654, "learning_rate": 1.9998959608145495e-06, "loss": 1.5829, "step": 2728 }, { "epoch": 0.02, "grad_norm": 4.259954508473689, "learning_rate": 1.9998958842461033e-06, "loss": 1.2882, "step": 2729 }, { "epoch": 0.02, "grad_norm": 4.242648662389775, "learning_rate": 1.9998958076494933e-06, "loss": 1.2997, "step": 2730 }, { "epoch": 0.02, "grad_norm": 4.442253262087044, "learning_rate": 1.9998957310247194e-06, "loss": 1.2864, "step": 2731 }, { "epoch": 0.02, "grad_norm": 4.672333737798939, "learning_rate": 1.9998956543717816e-06, "loss": 1.5532, "step": 2732 }, { "epoch": 0.02, "grad_norm": 4.768920768366485, "learning_rate": 1.9998955776906805e-06, "loss": 1.4113, "step": 2733 }, { "epoch": 0.02, "grad_norm": 5.365298258133692, "learning_rate": 1.9998955009814155e-06, "loss": 1.4779, "step": 2734 }, { "epoch": 0.02, "grad_norm": 4.977640813439696, "learning_rate": 1.9998954242439866e-06, "loss": 1.3955, "step": 2735 }, { "epoch": 0.02, "grad_norm": 4.779169735265989, "learning_rate": 1.999895347478394e-06, "loss": 1.4087, "step": 2736 }, { "epoch": 0.02, "grad_norm": 5.7600337936905275, "learning_rate": 1.9998952706846378e-06, "loss": 1.46, "step": 2737 }, { "epoch": 0.02, "grad_norm": 4.654467318840138, "learning_rate": 1.999895193862718e-06, "loss": 1.3913, "step": 2738 }, { "epoch": 0.02, "grad_norm": 4.704589355819833, "learning_rate": 1.999895117012634e-06, "loss": 1.5544, "step": 2739 }, { "epoch": 0.02, "grad_norm": 5.1941897065206595, "learning_rate": 1.9998950401343863e-06, "loss": 1.273, "step": 2740 }, { "epoch": 0.02, "grad_norm": 4.439869003538217, "learning_rate": 1.9998949632279753e-06, "loss": 1.4237, "step": 2741 }, { "epoch": 0.02, "grad_norm": 4.50147659109475, "learning_rate": 1.9998948862934008e-06, "loss": 1.3462, "step": 2742 }, { "epoch": 0.02, "grad_norm": 5.584850957877264, "learning_rate": 1.999894809330662e-06, "loss": 1.2504, "step": 2743 }, { "epoch": 0.02, "grad_norm": 6.042375090827711, "learning_rate": 1.99989473233976e-06, "loss": 1.3003, "step": 2744 }, { "epoch": 0.02, "grad_norm": 5.023492077637436, "learning_rate": 1.9998946553206938e-06, "loss": 1.4422, "step": 2745 }, { "epoch": 0.02, "grad_norm": 4.748553264507999, "learning_rate": 1.999894578273464e-06, "loss": 1.5238, "step": 2746 }, { "epoch": 0.02, "grad_norm": 4.570660659610173, "learning_rate": 1.9998945011980706e-06, "loss": 1.4269, "step": 2747 }, { "epoch": 0.02, "grad_norm": 4.452679880713028, "learning_rate": 1.9998944240945134e-06, "loss": 1.4164, "step": 2748 }, { "epoch": 0.02, "grad_norm": 5.836656376655193, "learning_rate": 1.999894346962793e-06, "loss": 1.2856, "step": 2749 }, { "epoch": 0.02, "grad_norm": 4.798771542642795, "learning_rate": 1.999894269802908e-06, "loss": 1.3372, "step": 2750 }, { "epoch": 0.02, "grad_norm": 5.170216529268188, "learning_rate": 1.9998941926148597e-06, "loss": 1.4021, "step": 2751 }, { "epoch": 0.02, "grad_norm": 7.167356093869082, "learning_rate": 1.999894115398648e-06, "loss": 1.4013, "step": 2752 }, { "epoch": 0.02, "grad_norm": 4.587890795315358, "learning_rate": 1.9998940381542725e-06, "loss": 1.4176, "step": 2753 }, { "epoch": 0.02, "grad_norm": 5.177127842863576, "learning_rate": 1.999893960881733e-06, "loss": 1.5869, "step": 2754 }, { "epoch": 0.02, "grad_norm": 6.574137972828784, "learning_rate": 1.9998938835810304e-06, "loss": 1.5673, "step": 2755 }, { "epoch": 0.02, "grad_norm": 5.778126829684162, "learning_rate": 1.9998938062521637e-06, "loss": 1.5361, "step": 2756 }, { "epoch": 0.02, "grad_norm": 7.851913885459397, "learning_rate": 1.9998937288951332e-06, "loss": 1.412, "step": 2757 }, { "epoch": 0.02, "grad_norm": 5.122129218110728, "learning_rate": 1.9998936515099393e-06, "loss": 1.4406, "step": 2758 }, { "epoch": 0.02, "grad_norm": 4.775466238362049, "learning_rate": 1.9998935740965815e-06, "loss": 1.3279, "step": 2759 }, { "epoch": 0.02, "grad_norm": 10.097236710699216, "learning_rate": 1.9998934966550604e-06, "loss": 1.5103, "step": 2760 }, { "epoch": 0.02, "grad_norm": 4.852445289044562, "learning_rate": 1.999893419185375e-06, "loss": 1.469, "step": 2761 }, { "epoch": 0.02, "grad_norm": 4.912077857670805, "learning_rate": 1.9998933416875265e-06, "loss": 1.4218, "step": 2762 }, { "epoch": 0.02, "grad_norm": 4.689088161317069, "learning_rate": 1.999893264161514e-06, "loss": 1.3336, "step": 2763 }, { "epoch": 0.02, "grad_norm": 4.766112013484904, "learning_rate": 1.999893186607338e-06, "loss": 1.3066, "step": 2764 }, { "epoch": 0.02, "grad_norm": 5.593303699801557, "learning_rate": 1.9998931090249985e-06, "loss": 1.5657, "step": 2765 }, { "epoch": 0.02, "grad_norm": 5.163779733210183, "learning_rate": 1.9998930314144947e-06, "loss": 1.4395, "step": 2766 }, { "epoch": 0.02, "grad_norm": 5.674315487003727, "learning_rate": 1.999892953775828e-06, "loss": 1.5407, "step": 2767 }, { "epoch": 0.02, "grad_norm": 4.676661266930629, "learning_rate": 1.999892876108997e-06, "loss": 1.5139, "step": 2768 }, { "epoch": 0.02, "grad_norm": 9.128307746492878, "learning_rate": 1.9998927984140026e-06, "loss": 1.3218, "step": 2769 }, { "epoch": 0.02, "grad_norm": 4.240640196066125, "learning_rate": 1.9998927206908447e-06, "loss": 1.2989, "step": 2770 }, { "epoch": 0.02, "grad_norm": 6.27816898885737, "learning_rate": 1.999892642939523e-06, "loss": 1.3487, "step": 2771 }, { "epoch": 0.02, "grad_norm": 5.025010385489242, "learning_rate": 1.9998925651600377e-06, "loss": 1.3706, "step": 2772 }, { "epoch": 0.02, "grad_norm": 4.464981368592326, "learning_rate": 1.9998924873523886e-06, "loss": 1.3258, "step": 2773 }, { "epoch": 0.02, "grad_norm": 4.665177992679757, "learning_rate": 1.999892409516576e-06, "loss": 1.3428, "step": 2774 }, { "epoch": 0.02, "eval_loss": 1.6228926181793213, "eval_runtime": 4.6339, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 2774 }, { "epoch": 0.02, "grad_norm": 4.868974430046027, "learning_rate": 1.9998923316526e-06, "loss": 1.4719, "step": 2775 }, { "epoch": 0.02, "grad_norm": 5.304645907019113, "learning_rate": 1.99989225376046e-06, "loss": 1.5306, "step": 2776 }, { "epoch": 0.02, "grad_norm": 5.387713736296703, "learning_rate": 1.9998921758401565e-06, "loss": 1.3475, "step": 2777 }, { "epoch": 0.02, "grad_norm": 4.442792610487523, "learning_rate": 1.9998920978916895e-06, "loss": 1.36, "step": 2778 }, { "epoch": 0.02, "grad_norm": 5.683582295474543, "learning_rate": 1.9998920199150587e-06, "loss": 1.4342, "step": 2779 }, { "epoch": 0.02, "grad_norm": 5.777504817382186, "learning_rate": 1.9998919419102644e-06, "loss": 1.3633, "step": 2780 }, { "epoch": 0.02, "grad_norm": 4.598408814061361, "learning_rate": 1.9998918638773063e-06, "loss": 1.3561, "step": 2781 }, { "epoch": 0.02, "grad_norm": 4.531887898739241, "learning_rate": 1.9998917858161843e-06, "loss": 1.4105, "step": 2782 }, { "epoch": 0.02, "grad_norm": 4.379373053575446, "learning_rate": 1.999891707726899e-06, "loss": 1.3188, "step": 2783 }, { "epoch": 0.02, "grad_norm": 4.7609097936376195, "learning_rate": 1.99989162960945e-06, "loss": 1.3523, "step": 2784 }, { "epoch": 0.02, "grad_norm": 4.7932273540706545, "learning_rate": 1.9998915514638374e-06, "loss": 1.4839, "step": 2785 }, { "epoch": 0.02, "grad_norm": 4.403408099341777, "learning_rate": 1.9998914732900618e-06, "loss": 1.3651, "step": 2786 }, { "epoch": 0.02, "grad_norm": 5.363493337417038, "learning_rate": 1.9998913950881214e-06, "loss": 1.3903, "step": 2787 }, { "epoch": 0.02, "grad_norm": 4.703872779282757, "learning_rate": 1.999891316858018e-06, "loss": 1.3602, "step": 2788 }, { "epoch": 0.02, "grad_norm": 4.594866601682083, "learning_rate": 1.9998912385997513e-06, "loss": 1.2933, "step": 2789 }, { "epoch": 0.02, "grad_norm": 4.606930331365836, "learning_rate": 1.9998911603133207e-06, "loss": 1.4318, "step": 2790 }, { "epoch": 0.02, "grad_norm": 7.67703505544561, "learning_rate": 1.9998910819987262e-06, "loss": 1.1987, "step": 2791 }, { "epoch": 0.02, "grad_norm": 4.858347802822455, "learning_rate": 1.9998910036559688e-06, "loss": 1.3342, "step": 2792 }, { "epoch": 0.02, "grad_norm": 4.238675085196649, "learning_rate": 1.999890925285047e-06, "loss": 1.1874, "step": 2793 }, { "epoch": 0.02, "grad_norm": 4.636225801921176, "learning_rate": 1.999890846885962e-06, "loss": 1.3584, "step": 2794 }, { "epoch": 0.02, "grad_norm": 4.633817615345395, "learning_rate": 1.9998907684587133e-06, "loss": 1.4063, "step": 2795 }, { "epoch": 0.02, "grad_norm": 4.893305859480175, "learning_rate": 1.999890690003301e-06, "loss": 1.4289, "step": 2796 }, { "epoch": 0.02, "grad_norm": 4.7172189836983796, "learning_rate": 1.999890611519725e-06, "loss": 1.4481, "step": 2797 }, { "epoch": 0.02, "grad_norm": 4.881737643176644, "learning_rate": 1.999890533007986e-06, "loss": 1.4956, "step": 2798 }, { "epoch": 0.02, "grad_norm": 4.986642114604307, "learning_rate": 1.9998904544680827e-06, "loss": 1.6169, "step": 2799 }, { "epoch": 0.02, "grad_norm": 5.2687198972018905, "learning_rate": 1.999890375900016e-06, "loss": 1.3991, "step": 2800 }, { "epoch": 0.02, "grad_norm": 4.692782105304108, "learning_rate": 1.9998902973037858e-06, "loss": 1.3991, "step": 2801 }, { "epoch": 0.02, "grad_norm": 5.509363415701804, "learning_rate": 1.999890218679392e-06, "loss": 1.4011, "step": 2802 }, { "epoch": 0.02, "grad_norm": 6.431731471169426, "learning_rate": 1.9998901400268348e-06, "loss": 1.3621, "step": 2803 }, { "epoch": 0.02, "grad_norm": 5.881655716824346, "learning_rate": 1.9998900613461137e-06, "loss": 1.4308, "step": 2804 }, { "epoch": 0.02, "grad_norm": 5.081486388140841, "learning_rate": 1.9998899826372292e-06, "loss": 1.3531, "step": 2805 }, { "epoch": 0.02, "grad_norm": 5.1521080991642965, "learning_rate": 1.9998899039001813e-06, "loss": 1.31, "step": 2806 }, { "epoch": 0.02, "grad_norm": 5.210301192592582, "learning_rate": 1.9998898251349696e-06, "loss": 1.5408, "step": 2807 }, { "epoch": 0.02, "grad_norm": 4.589178232793303, "learning_rate": 1.9998897463415944e-06, "loss": 1.3702, "step": 2808 }, { "epoch": 0.02, "grad_norm": 4.375935678979782, "learning_rate": 1.999889667520056e-06, "loss": 1.3905, "step": 2809 }, { "epoch": 0.02, "grad_norm": 6.436455510679924, "learning_rate": 1.999889588670353e-06, "loss": 1.3327, "step": 2810 }, { "epoch": 0.02, "grad_norm": 5.29249042088829, "learning_rate": 1.9998895097924875e-06, "loss": 1.4343, "step": 2811 }, { "epoch": 0.02, "grad_norm": 4.784394796780278, "learning_rate": 1.9998894308864578e-06, "loss": 1.4381, "step": 2812 }, { "epoch": 0.02, "grad_norm": 4.946717504980601, "learning_rate": 1.9998893519522646e-06, "loss": 1.4603, "step": 2813 }, { "epoch": 0.02, "grad_norm": 4.766983915055353, "learning_rate": 1.999889272989908e-06, "loss": 1.5148, "step": 2814 }, { "epoch": 0.02, "grad_norm": 4.708307718420187, "learning_rate": 1.9998891939993877e-06, "loss": 1.3662, "step": 2815 }, { "epoch": 0.02, "grad_norm": 5.562866445292407, "learning_rate": 1.999889114980704e-06, "loss": 1.5425, "step": 2816 }, { "epoch": 0.02, "grad_norm": 4.8892392384319745, "learning_rate": 1.9998890359338566e-06, "loss": 1.5102, "step": 2817 }, { "epoch": 0.02, "grad_norm": 6.681445930399619, "learning_rate": 1.999888956858846e-06, "loss": 1.5407, "step": 2818 }, { "epoch": 0.02, "grad_norm": 4.932383185973984, "learning_rate": 1.9998888777556714e-06, "loss": 1.3519, "step": 2819 }, { "epoch": 0.02, "grad_norm": 4.573974144160327, "learning_rate": 1.9998887986243335e-06, "loss": 1.3792, "step": 2820 }, { "epoch": 0.02, "grad_norm": 5.945964809681693, "learning_rate": 1.999888719464832e-06, "loss": 1.3079, "step": 2821 }, { "epoch": 0.02, "grad_norm": 4.6784540645297366, "learning_rate": 1.999888640277167e-06, "loss": 1.3658, "step": 2822 }, { "epoch": 0.02, "grad_norm": 5.4758180832015775, "learning_rate": 1.9998885610613383e-06, "loss": 1.4252, "step": 2823 }, { "epoch": 0.02, "grad_norm": 4.924403479676831, "learning_rate": 1.9998884818173467e-06, "loss": 1.4362, "step": 2824 }, { "epoch": 0.02, "grad_norm": 5.1946491484073745, "learning_rate": 1.999888402545191e-06, "loss": 1.4022, "step": 2825 }, { "epoch": 0.02, "grad_norm": 5.116594932180298, "learning_rate": 1.999888323244872e-06, "loss": 1.458, "step": 2826 }, { "epoch": 0.02, "grad_norm": 4.641238557929875, "learning_rate": 1.999888243916389e-06, "loss": 1.3547, "step": 2827 }, { "epoch": 0.02, "grad_norm": 4.594999462463972, "learning_rate": 1.999888164559743e-06, "loss": 1.4215, "step": 2828 }, { "epoch": 0.02, "grad_norm": 5.047703148420556, "learning_rate": 1.9998880851749335e-06, "loss": 1.4627, "step": 2829 }, { "epoch": 0.02, "grad_norm": 5.422103915078788, "learning_rate": 1.99988800576196e-06, "loss": 1.5064, "step": 2830 }, { "epoch": 0.02, "grad_norm": 4.927470474587449, "learning_rate": 1.999887926320823e-06, "loss": 1.4422, "step": 2831 }, { "epoch": 0.02, "grad_norm": 4.8891825059531095, "learning_rate": 1.999887846851523e-06, "loss": 1.3049, "step": 2832 }, { "epoch": 0.02, "grad_norm": 4.89929973889463, "learning_rate": 1.9998877673540592e-06, "loss": 1.4641, "step": 2833 }, { "epoch": 0.02, "grad_norm": 4.330474087668212, "learning_rate": 1.999887687828432e-06, "loss": 1.2395, "step": 2834 }, { "epoch": 0.02, "grad_norm": 4.689316103906117, "learning_rate": 1.999887608274641e-06, "loss": 1.5746, "step": 2835 }, { "epoch": 0.02, "grad_norm": 30.022522598437778, "learning_rate": 1.999887528692687e-06, "loss": 1.552, "step": 2836 }, { "epoch": 0.02, "grad_norm": 4.979619290046022, "learning_rate": 1.999887449082569e-06, "loss": 1.4355, "step": 2837 }, { "epoch": 0.02, "grad_norm": 4.5639834865515185, "learning_rate": 1.9998873694442878e-06, "loss": 1.383, "step": 2838 }, { "epoch": 0.02, "grad_norm": 4.572884425917966, "learning_rate": 1.9998872897778427e-06, "loss": 1.4367, "step": 2839 }, { "epoch": 0.02, "grad_norm": 4.935183007183054, "learning_rate": 1.9998872100832346e-06, "loss": 1.2201, "step": 2840 }, { "epoch": 0.02, "grad_norm": 6.211909738145334, "learning_rate": 1.9998871303604627e-06, "loss": 1.3496, "step": 2841 }, { "epoch": 0.02, "grad_norm": 4.5122328941008, "learning_rate": 1.9998870506095274e-06, "loss": 1.2845, "step": 2842 }, { "epoch": 0.02, "grad_norm": 4.924447610654198, "learning_rate": 1.9998869708304286e-06, "loss": 1.4161, "step": 2843 }, { "epoch": 0.02, "grad_norm": 5.2054369143001145, "learning_rate": 1.9998868910231665e-06, "loss": 1.5187, "step": 2844 }, { "epoch": 0.02, "grad_norm": 4.861035173442859, "learning_rate": 1.9998868111877404e-06, "loss": 1.3305, "step": 2845 }, { "epoch": 0.02, "grad_norm": 4.63315608539278, "learning_rate": 1.9998867313241514e-06, "loss": 1.3722, "step": 2846 }, { "epoch": 0.02, "grad_norm": 4.566229003342197, "learning_rate": 1.9998866514323985e-06, "loss": 1.3612, "step": 2847 }, { "epoch": 0.02, "eval_loss": 1.6186871528625488, "eval_runtime": 4.6182, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 2847 }, { "epoch": 0.02, "grad_norm": 4.9773867457031615, "learning_rate": 1.9998865715124822e-06, "loss": 1.3684, "step": 2848 }, { "epoch": 0.02, "grad_norm": 4.776322816477926, "learning_rate": 1.9998864915644025e-06, "loss": 1.355, "step": 2849 }, { "epoch": 0.02, "grad_norm": 5.200825295575172, "learning_rate": 1.9998864115881594e-06, "loss": 1.4849, "step": 2850 }, { "epoch": 0.02, "grad_norm": 8.669774310904668, "learning_rate": 1.999886331583753e-06, "loss": 1.439, "step": 2851 }, { "epoch": 0.02, "grad_norm": 4.961641023055851, "learning_rate": 1.9998862515511824e-06, "loss": 1.3667, "step": 2852 }, { "epoch": 0.02, "grad_norm": 4.822249685134192, "learning_rate": 1.999886171490449e-06, "loss": 1.3395, "step": 2853 }, { "epoch": 0.02, "grad_norm": 6.098165282053863, "learning_rate": 1.999886091401552e-06, "loss": 1.4478, "step": 2854 }, { "epoch": 0.02, "grad_norm": 6.297976415034525, "learning_rate": 1.999886011284491e-06, "loss": 1.3084, "step": 2855 }, { "epoch": 0.02, "grad_norm": 4.60213897121254, "learning_rate": 1.9998859311392675e-06, "loss": 1.3949, "step": 2856 }, { "epoch": 0.02, "grad_norm": 5.128426005267793, "learning_rate": 1.99988585096588e-06, "loss": 1.5439, "step": 2857 }, { "epoch": 0.02, "grad_norm": 4.238279682926694, "learning_rate": 1.999885770764329e-06, "loss": 1.3442, "step": 2858 }, { "epoch": 0.02, "grad_norm": 4.454111710242467, "learning_rate": 1.9998856905346142e-06, "loss": 1.2708, "step": 2859 }, { "epoch": 0.02, "grad_norm": 6.597041686087069, "learning_rate": 1.9998856102767365e-06, "loss": 1.4927, "step": 2860 }, { "epoch": 0.02, "grad_norm": 4.9566004997832325, "learning_rate": 1.9998855299906953e-06, "loss": 1.4864, "step": 2861 }, { "epoch": 0.02, "grad_norm": 4.803022534988008, "learning_rate": 1.9998854496764907e-06, "loss": 1.4479, "step": 2862 }, { "epoch": 0.02, "grad_norm": 5.3284227061764025, "learning_rate": 1.9998853693341226e-06, "loss": 1.3775, "step": 2863 }, { "epoch": 0.02, "grad_norm": 4.449051605434009, "learning_rate": 1.9998852889635907e-06, "loss": 1.3944, "step": 2864 }, { "epoch": 0.02, "grad_norm": 4.524264356204093, "learning_rate": 1.999885208564896e-06, "loss": 1.3925, "step": 2865 }, { "epoch": 0.02, "grad_norm": 4.576738634642325, "learning_rate": 1.9998851281380375e-06, "loss": 1.4023, "step": 2866 }, { "epoch": 0.02, "grad_norm": 4.5128730915629305, "learning_rate": 1.999885047683015e-06, "loss": 1.3308, "step": 2867 }, { "epoch": 0.02, "grad_norm": 5.774177800040769, "learning_rate": 1.99988496719983e-06, "loss": 1.3723, "step": 2868 }, { "epoch": 0.02, "grad_norm": 4.841933746799073, "learning_rate": 1.999884886688481e-06, "loss": 1.4242, "step": 2869 }, { "epoch": 0.02, "grad_norm": 7.190890743181518, "learning_rate": 1.999884806148969e-06, "loss": 1.4457, "step": 2870 }, { "epoch": 0.02, "grad_norm": 4.389488660525391, "learning_rate": 1.999884725581293e-06, "loss": 1.3262, "step": 2871 }, { "epoch": 0.02, "grad_norm": 4.9366007381855725, "learning_rate": 1.999884644985454e-06, "loss": 1.4215, "step": 2872 }, { "epoch": 0.02, "grad_norm": 4.51134223312188, "learning_rate": 1.9998845643614514e-06, "loss": 1.4528, "step": 2873 }, { "epoch": 0.02, "grad_norm": 4.5911234789565345, "learning_rate": 1.9998844837092857e-06, "loss": 1.3725, "step": 2874 }, { "epoch": 0.02, "grad_norm": 4.714425532161108, "learning_rate": 1.9998844030289562e-06, "loss": 1.3326, "step": 2875 }, { "epoch": 0.02, "grad_norm": 6.194761253541817, "learning_rate": 1.9998843223204633e-06, "loss": 1.4908, "step": 2876 }, { "epoch": 0.02, "grad_norm": 4.504642312252081, "learning_rate": 1.999884241583807e-06, "loss": 1.3287, "step": 2877 }, { "epoch": 0.02, "grad_norm": 6.794699180375288, "learning_rate": 1.9998841608189875e-06, "loss": 1.4244, "step": 2878 }, { "epoch": 0.02, "grad_norm": 4.751999842845239, "learning_rate": 1.9998840800260043e-06, "loss": 1.454, "step": 2879 }, { "epoch": 0.02, "grad_norm": 4.726264353641071, "learning_rate": 1.999883999204858e-06, "loss": 1.3897, "step": 2880 }, { "epoch": 0.02, "grad_norm": 4.729803974265776, "learning_rate": 1.999883918355548e-06, "loss": 1.5002, "step": 2881 }, { "epoch": 0.02, "grad_norm": 5.285212933171045, "learning_rate": 1.999883837478075e-06, "loss": 1.6833, "step": 2882 }, { "epoch": 0.02, "grad_norm": 4.596625945889018, "learning_rate": 1.999883756572438e-06, "loss": 1.3805, "step": 2883 }, { "epoch": 0.02, "grad_norm": 5.0059586218503105, "learning_rate": 1.999883675638638e-06, "loss": 1.4069, "step": 2884 }, { "epoch": 0.02, "grad_norm": 4.8719439751839575, "learning_rate": 1.999883594676675e-06, "loss": 1.4273, "step": 2885 }, { "epoch": 0.02, "grad_norm": 4.836717800773741, "learning_rate": 1.999883513686548e-06, "loss": 1.5191, "step": 2886 }, { "epoch": 0.02, "grad_norm": 5.448400261385887, "learning_rate": 1.9998834326682575e-06, "loss": 1.5117, "step": 2887 }, { "epoch": 0.02, "grad_norm": 4.341830112851468, "learning_rate": 1.9998833516218043e-06, "loss": 1.3386, "step": 2888 }, { "epoch": 0.02, "grad_norm": 4.330108245622447, "learning_rate": 1.999883270547187e-06, "loss": 1.3865, "step": 2889 }, { "epoch": 0.02, "grad_norm": 5.161889492402766, "learning_rate": 1.9998831894444064e-06, "loss": 1.4923, "step": 2890 }, { "epoch": 0.02, "grad_norm": 4.583340804795185, "learning_rate": 1.9998831083134626e-06, "loss": 1.4128, "step": 2891 }, { "epoch": 0.02, "grad_norm": 4.475403361495518, "learning_rate": 1.9998830271543557e-06, "loss": 1.2633, "step": 2892 }, { "epoch": 0.02, "grad_norm": 4.429544299978376, "learning_rate": 1.999882945967085e-06, "loss": 1.2271, "step": 2893 }, { "epoch": 0.02, "grad_norm": 4.924573918325059, "learning_rate": 1.999882864751651e-06, "loss": 1.5329, "step": 2894 }, { "epoch": 0.02, "grad_norm": 4.72164854751247, "learning_rate": 1.9998827835080538e-06, "loss": 1.3411, "step": 2895 }, { "epoch": 0.02, "grad_norm": 5.513620466728661, "learning_rate": 1.9998827022362932e-06, "loss": 1.557, "step": 2896 }, { "epoch": 0.02, "grad_norm": 4.893569608594087, "learning_rate": 1.9998826209363693e-06, "loss": 1.4545, "step": 2897 }, { "epoch": 0.02, "grad_norm": 5.59988609368051, "learning_rate": 1.999882539608282e-06, "loss": 1.4207, "step": 2898 }, { "epoch": 0.02, "grad_norm": 6.289259835995451, "learning_rate": 1.9998824582520306e-06, "loss": 1.4633, "step": 2899 }, { "epoch": 0.02, "grad_norm": 4.600672131161404, "learning_rate": 1.999882376867617e-06, "loss": 1.2747, "step": 2900 }, { "epoch": 0.02, "grad_norm": 4.73209077596222, "learning_rate": 1.9998822954550396e-06, "loss": 1.4534, "step": 2901 }, { "epoch": 0.02, "grad_norm": 4.649416419045721, "learning_rate": 1.9998822140142985e-06, "loss": 1.423, "step": 2902 }, { "epoch": 0.02, "grad_norm": 4.815284149190514, "learning_rate": 1.999882132545394e-06, "loss": 1.1985, "step": 2903 }, { "epoch": 0.02, "grad_norm": 4.993787069518763, "learning_rate": 1.999882051048327e-06, "loss": 1.3226, "step": 2904 }, { "epoch": 0.02, "grad_norm": 5.30556271239256, "learning_rate": 1.9998819695230956e-06, "loss": 1.6076, "step": 2905 }, { "epoch": 0.02, "grad_norm": 5.111676758762733, "learning_rate": 1.9998818879697013e-06, "loss": 1.4156, "step": 2906 }, { "epoch": 0.02, "grad_norm": 4.57532368590634, "learning_rate": 1.999881806388144e-06, "loss": 1.5127, "step": 2907 }, { "epoch": 0.02, "grad_norm": 5.097665766053441, "learning_rate": 1.9998817247784227e-06, "loss": 1.2153, "step": 2908 }, { "epoch": 0.02, "grad_norm": 4.7664796065004404, "learning_rate": 1.9998816431405386e-06, "loss": 1.4997, "step": 2909 }, { "epoch": 0.02, "grad_norm": 4.707849161238309, "learning_rate": 1.999881561474491e-06, "loss": 1.3614, "step": 2910 }, { "epoch": 0.02, "grad_norm": 4.715418586973402, "learning_rate": 1.99988147978028e-06, "loss": 1.3648, "step": 2911 }, { "epoch": 0.02, "grad_norm": 4.698559700737574, "learning_rate": 1.9998813980579055e-06, "loss": 1.4627, "step": 2912 }, { "epoch": 0.02, "grad_norm": 4.492256458663923, "learning_rate": 1.999881316307368e-06, "loss": 1.3053, "step": 2913 }, { "epoch": 0.02, "grad_norm": 4.154059898049573, "learning_rate": 1.9998812345286667e-06, "loss": 1.1044, "step": 2914 }, { "epoch": 0.02, "grad_norm": 5.62208125851778, "learning_rate": 1.999881152721803e-06, "loss": 1.3578, "step": 2915 }, { "epoch": 0.02, "grad_norm": 4.66847610055548, "learning_rate": 1.999881070886775e-06, "loss": 1.5454, "step": 2916 }, { "epoch": 0.02, "grad_norm": 4.317475104898607, "learning_rate": 1.999880989023584e-06, "loss": 1.3935, "step": 2917 }, { "epoch": 0.02, "grad_norm": 4.527930294875366, "learning_rate": 1.99988090713223e-06, "loss": 1.4274, "step": 2918 }, { "epoch": 0.02, "grad_norm": 4.416272106126581, "learning_rate": 1.9998808252127123e-06, "loss": 1.3224, "step": 2919 }, { "epoch": 0.02, "grad_norm": 5.316728315979006, "learning_rate": 1.9998807432650313e-06, "loss": 1.5215, "step": 2920 }, { "epoch": 0.02, "eval_loss": 1.6199383735656738, "eval_runtime": 4.6314, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 2920 }, { "epoch": 0.02, "grad_norm": 4.490493646901724, "learning_rate": 1.999880661289187e-06, "loss": 1.4416, "step": 2921 }, { "epoch": 0.02, "grad_norm": 4.624580242853857, "learning_rate": 1.9998805792851795e-06, "loss": 1.3179, "step": 2922 }, { "epoch": 0.02, "grad_norm": 8.466914354748946, "learning_rate": 1.9998804972530087e-06, "loss": 1.6161, "step": 2923 }, { "epoch": 0.02, "grad_norm": 4.764398192424773, "learning_rate": 1.9998804151926745e-06, "loss": 1.5612, "step": 2924 }, { "epoch": 0.02, "grad_norm": 6.270011750308185, "learning_rate": 1.9998803331041772e-06, "loss": 1.3574, "step": 2925 }, { "epoch": 0.02, "grad_norm": 5.688420881807707, "learning_rate": 1.999880250987516e-06, "loss": 1.4334, "step": 2926 }, { "epoch": 0.02, "grad_norm": 4.649454207981505, "learning_rate": 1.999880168842692e-06, "loss": 1.3212, "step": 2927 }, { "epoch": 0.02, "grad_norm": 5.156111274716404, "learning_rate": 1.9998800866697046e-06, "loss": 1.1788, "step": 2928 }, { "epoch": 0.02, "grad_norm": 5.189510589727771, "learning_rate": 1.999880004468554e-06, "loss": 1.5181, "step": 2929 }, { "epoch": 0.02, "grad_norm": 5.20778314788957, "learning_rate": 1.99987992223924e-06, "loss": 1.4894, "step": 2930 }, { "epoch": 0.02, "grad_norm": 4.613692977472925, "learning_rate": 1.9998798399817628e-06, "loss": 1.3359, "step": 2931 }, { "epoch": 0.02, "grad_norm": 4.45441252637321, "learning_rate": 1.999879757696122e-06, "loss": 1.3484, "step": 2932 }, { "epoch": 0.02, "grad_norm": 5.84796908759116, "learning_rate": 1.9998796753823182e-06, "loss": 1.3102, "step": 2933 }, { "epoch": 0.02, "grad_norm": 5.057071265216301, "learning_rate": 1.9998795930403515e-06, "loss": 1.4964, "step": 2934 }, { "epoch": 0.02, "grad_norm": 4.65737240395791, "learning_rate": 1.9998795106702213e-06, "loss": 1.3021, "step": 2935 }, { "epoch": 0.02, "grad_norm": 4.466035985587055, "learning_rate": 1.9998794282719273e-06, "loss": 1.5032, "step": 2936 }, { "epoch": 0.02, "grad_norm": 5.856160311135987, "learning_rate": 1.9998793458454702e-06, "loss": 1.5538, "step": 2937 }, { "epoch": 0.02, "grad_norm": 4.610346348605172, "learning_rate": 1.99987926339085e-06, "loss": 1.462, "step": 2938 }, { "epoch": 0.02, "grad_norm": 4.928466235921176, "learning_rate": 1.9998791809080667e-06, "loss": 1.3877, "step": 2939 }, { "epoch": 0.02, "grad_norm": 4.973945408707448, "learning_rate": 1.99987909839712e-06, "loss": 1.3815, "step": 2940 }, { "epoch": 0.02, "grad_norm": 5.625537540163681, "learning_rate": 1.99987901585801e-06, "loss": 1.2582, "step": 2941 }, { "epoch": 0.02, "grad_norm": 4.477355848214243, "learning_rate": 1.9998789332907363e-06, "loss": 1.236, "step": 2942 }, { "epoch": 0.02, "grad_norm": 4.492618763572616, "learning_rate": 1.9998788506953e-06, "loss": 1.3155, "step": 2943 }, { "epoch": 0.02, "grad_norm": 4.602003607577262, "learning_rate": 1.9998787680717002e-06, "loss": 1.2852, "step": 2944 }, { "epoch": 0.02, "grad_norm": 4.567502332448367, "learning_rate": 1.999878685419937e-06, "loss": 1.4841, "step": 2945 }, { "epoch": 0.02, "grad_norm": 4.9405542923073815, "learning_rate": 1.9998786027400105e-06, "loss": 1.3458, "step": 2946 }, { "epoch": 0.02, "grad_norm": 4.6929594602807025, "learning_rate": 1.999878520031921e-06, "loss": 1.4189, "step": 2947 }, { "epoch": 0.02, "grad_norm": 5.197148859419166, "learning_rate": 1.9998784372956684e-06, "loss": 1.3436, "step": 2948 }, { "epoch": 0.02, "grad_norm": 6.599441454159472, "learning_rate": 1.999878354531252e-06, "loss": 1.3806, "step": 2949 }, { "epoch": 0.02, "grad_norm": 4.615989461212095, "learning_rate": 1.9998782717386726e-06, "loss": 1.3673, "step": 2950 }, { "epoch": 0.02, "grad_norm": 4.9156780589895295, "learning_rate": 1.9998781889179302e-06, "loss": 1.3702, "step": 2951 }, { "epoch": 0.02, "grad_norm": 4.29684223795888, "learning_rate": 1.9998781060690244e-06, "loss": 1.3308, "step": 2952 }, { "epoch": 0.02, "grad_norm": 4.517575497946087, "learning_rate": 1.999878023191955e-06, "loss": 1.3706, "step": 2953 }, { "epoch": 0.02, "grad_norm": 4.767296353617181, "learning_rate": 1.999877940286723e-06, "loss": 1.5224, "step": 2954 }, { "epoch": 0.02, "grad_norm": 5.751314082046642, "learning_rate": 1.9998778573533272e-06, "loss": 1.3569, "step": 2955 }, { "epoch": 0.02, "grad_norm": 4.40734821128334, "learning_rate": 1.9998777743917686e-06, "loss": 1.4663, "step": 2956 }, { "epoch": 0.02, "grad_norm": 4.350543171262743, "learning_rate": 1.9998776914020465e-06, "loss": 1.2895, "step": 2957 }, { "epoch": 0.02, "grad_norm": 4.6281115386375395, "learning_rate": 1.9998776083841614e-06, "loss": 1.5124, "step": 2958 }, { "epoch": 0.02, "grad_norm": 5.2232144573948185, "learning_rate": 1.9998775253381125e-06, "loss": 1.2376, "step": 2959 }, { "epoch": 0.02, "grad_norm": 5.689282233407712, "learning_rate": 1.999877442263901e-06, "loss": 1.5472, "step": 2960 }, { "epoch": 0.02, "grad_norm": 4.917409633814714, "learning_rate": 1.999877359161526e-06, "loss": 1.4326, "step": 2961 }, { "epoch": 0.02, "grad_norm": 4.994380191725462, "learning_rate": 1.9998772760309877e-06, "loss": 1.3549, "step": 2962 }, { "epoch": 0.02, "grad_norm": 14.47259236356199, "learning_rate": 1.9998771928722868e-06, "loss": 1.4232, "step": 2963 }, { "epoch": 0.02, "grad_norm": 4.656592137584895, "learning_rate": 1.999877109685422e-06, "loss": 1.3973, "step": 2964 }, { "epoch": 0.02, "grad_norm": 5.6307964918603615, "learning_rate": 1.9998770264703942e-06, "loss": 1.3096, "step": 2965 }, { "epoch": 0.02, "grad_norm": 4.5164633394118825, "learning_rate": 1.9998769432272035e-06, "loss": 1.3215, "step": 2966 }, { "epoch": 0.02, "grad_norm": 4.5103839413771185, "learning_rate": 1.9998768599558493e-06, "loss": 1.3789, "step": 2967 }, { "epoch": 0.02, "grad_norm": 4.2216327904598385, "learning_rate": 1.9998767766563317e-06, "loss": 1.3297, "step": 2968 }, { "epoch": 0.02, "grad_norm": 5.059047029865863, "learning_rate": 1.999876693328651e-06, "loss": 1.3984, "step": 2969 }, { "epoch": 0.02, "grad_norm": 4.700581236103446, "learning_rate": 1.9998766099728075e-06, "loss": 1.5166, "step": 2970 }, { "epoch": 0.02, "grad_norm": 4.854212931369697, "learning_rate": 1.9998765265888e-06, "loss": 1.4143, "step": 2971 }, { "epoch": 0.02, "grad_norm": 7.725119262001126, "learning_rate": 1.99987644317663e-06, "loss": 1.178, "step": 2972 }, { "epoch": 0.02, "grad_norm": 4.370144258521158, "learning_rate": 1.9998763597362965e-06, "loss": 1.3893, "step": 2973 }, { "epoch": 0.02, "grad_norm": 4.638058498542562, "learning_rate": 1.9998762762678e-06, "loss": 1.4689, "step": 2974 }, { "epoch": 0.02, "grad_norm": 5.213106554526003, "learning_rate": 1.9998761927711402e-06, "loss": 1.3459, "step": 2975 }, { "epoch": 0.02, "grad_norm": 4.858477665120681, "learning_rate": 1.9998761092463173e-06, "loss": 1.4884, "step": 2976 }, { "epoch": 0.02, "grad_norm": 4.570024440771522, "learning_rate": 1.999876025693331e-06, "loss": 1.469, "step": 2977 }, { "epoch": 0.02, "grad_norm": 8.16955091971151, "learning_rate": 1.9998759421121818e-06, "loss": 1.4388, "step": 2978 }, { "epoch": 0.02, "grad_norm": 4.903458987120334, "learning_rate": 1.999875858502869e-06, "loss": 1.3786, "step": 2979 }, { "epoch": 0.02, "grad_norm": 4.529203011610496, "learning_rate": 1.9998757748653933e-06, "loss": 1.2684, "step": 2980 }, { "epoch": 0.02, "grad_norm": 4.729523457919252, "learning_rate": 1.9998756911997546e-06, "loss": 1.5905, "step": 2981 }, { "epoch": 0.02, "grad_norm": 5.582684926177018, "learning_rate": 1.9998756075059525e-06, "loss": 1.598, "step": 2982 }, { "epoch": 0.02, "grad_norm": 4.659482522086951, "learning_rate": 1.9998755237839874e-06, "loss": 1.3603, "step": 2983 }, { "epoch": 0.02, "grad_norm": 5.628018340481434, "learning_rate": 1.999875440033859e-06, "loss": 1.5392, "step": 2984 }, { "epoch": 0.02, "grad_norm": 5.03947825852203, "learning_rate": 1.9998753562555673e-06, "loss": 1.47, "step": 2985 }, { "epoch": 0.02, "grad_norm": 4.908146139932012, "learning_rate": 1.9998752724491127e-06, "loss": 1.5795, "step": 2986 }, { "epoch": 0.02, "grad_norm": 4.642150400889997, "learning_rate": 1.9998751886144948e-06, "loss": 1.4156, "step": 2987 }, { "epoch": 0.02, "grad_norm": 5.580016701743678, "learning_rate": 1.999875104751714e-06, "loss": 1.3989, "step": 2988 }, { "epoch": 0.02, "grad_norm": 4.60906478003677, "learning_rate": 1.99987502086077e-06, "loss": 1.4226, "step": 2989 }, { "epoch": 0.02, "grad_norm": 4.563599437751002, "learning_rate": 1.9998749369416624e-06, "loss": 1.5099, "step": 2990 }, { "epoch": 0.02, "grad_norm": 4.223270236357775, "learning_rate": 1.9998748529943916e-06, "loss": 1.3115, "step": 2991 }, { "epoch": 0.02, "grad_norm": 5.633656419315019, "learning_rate": 1.9998747690189583e-06, "loss": 1.4029, "step": 2992 }, { "epoch": 0.02, "grad_norm": 4.630285775575748, "learning_rate": 1.9998746850153614e-06, "loss": 1.3736, "step": 2993 }, { "epoch": 0.02, "eval_loss": 1.6208245754241943, "eval_runtime": 4.6165, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 2993 }, { "epoch": 0.02, "grad_norm": 4.936246851895618, "learning_rate": 1.9998746009836016e-06, "loss": 1.406, "step": 2994 }, { "epoch": 0.02, "grad_norm": 5.149643170506049, "learning_rate": 1.9998745169236784e-06, "loss": 1.3712, "step": 2995 }, { "epoch": 0.02, "grad_norm": 4.788568030128947, "learning_rate": 1.999874432835592e-06, "loss": 1.4386, "step": 2996 }, { "epoch": 0.02, "grad_norm": 4.578352571887644, "learning_rate": 1.999874348719343e-06, "loss": 1.3946, "step": 2997 }, { "epoch": 0.02, "grad_norm": 5.124704832034737, "learning_rate": 1.9998742645749303e-06, "loss": 1.301, "step": 2998 }, { "epoch": 0.02, "grad_norm": 4.405037637799255, "learning_rate": 1.999874180402355e-06, "loss": 1.4424, "step": 2999 }, { "epoch": 0.02, "grad_norm": 4.6389112469188, "learning_rate": 1.999874096201616e-06, "loss": 1.4466, "step": 3000 }, { "epoch": 0.02, "grad_norm": 5.19152230930798, "learning_rate": 1.9998740119727144e-06, "loss": 1.4456, "step": 3001 }, { "epoch": 0.02, "grad_norm": 5.121951342555157, "learning_rate": 1.999873927715649e-06, "loss": 1.3808, "step": 3002 }, { "epoch": 0.02, "grad_norm": 4.607444674665859, "learning_rate": 1.999873843430421e-06, "loss": 1.4636, "step": 3003 }, { "epoch": 0.02, "grad_norm": 4.46848570773085, "learning_rate": 1.9998737591170298e-06, "loss": 1.3631, "step": 3004 }, { "epoch": 0.02, "grad_norm": 4.330823996962009, "learning_rate": 1.9998736747754753e-06, "loss": 1.3945, "step": 3005 }, { "epoch": 0.02, "grad_norm": 5.168688715915758, "learning_rate": 1.999873590405758e-06, "loss": 1.265, "step": 3006 }, { "epoch": 0.02, "grad_norm": 5.21101121326567, "learning_rate": 1.9998735060078773e-06, "loss": 1.4035, "step": 3007 }, { "epoch": 0.02, "grad_norm": 4.9325677986365335, "learning_rate": 1.999873421581834e-06, "loss": 1.5848, "step": 3008 }, { "epoch": 0.02, "grad_norm": 4.726896704178509, "learning_rate": 1.999873337127627e-06, "loss": 1.3209, "step": 3009 }, { "epoch": 0.02, "grad_norm": 5.923881575075372, "learning_rate": 1.999873252645257e-06, "loss": 1.3872, "step": 3010 }, { "epoch": 0.02, "grad_norm": 4.405371972429703, "learning_rate": 1.999873168134724e-06, "loss": 1.3634, "step": 3011 }, { "epoch": 0.02, "grad_norm": 4.953407177301594, "learning_rate": 1.999873083596028e-06, "loss": 1.4459, "step": 3012 }, { "epoch": 0.02, "grad_norm": 4.432312541640287, "learning_rate": 1.9998729990291685e-06, "loss": 1.36, "step": 3013 }, { "epoch": 0.02, "grad_norm": 4.6106671367098855, "learning_rate": 1.9998729144341462e-06, "loss": 1.3693, "step": 3014 }, { "epoch": 0.02, "grad_norm": 4.363880892273669, "learning_rate": 1.999872829810961e-06, "loss": 1.4125, "step": 3015 }, { "epoch": 0.02, "grad_norm": 4.671858356354371, "learning_rate": 1.9998727451596126e-06, "loss": 1.4628, "step": 3016 }, { "epoch": 0.02, "grad_norm": 5.1327904162947835, "learning_rate": 1.999872660480101e-06, "loss": 1.484, "step": 3017 }, { "epoch": 0.02, "grad_norm": 6.726123143448659, "learning_rate": 1.999872575772426e-06, "loss": 1.3706, "step": 3018 }, { "epoch": 0.02, "grad_norm": 4.6201517878177984, "learning_rate": 1.9998724910365884e-06, "loss": 1.3531, "step": 3019 }, { "epoch": 0.02, "grad_norm": 4.7617011200342585, "learning_rate": 1.9998724062725877e-06, "loss": 1.292, "step": 3020 }, { "epoch": 0.02, "grad_norm": 4.684197716362908, "learning_rate": 1.999872321480424e-06, "loss": 1.4344, "step": 3021 }, { "epoch": 0.02, "grad_norm": 4.683892557856816, "learning_rate": 1.9998722366600968e-06, "loss": 1.5146, "step": 3022 }, { "epoch": 0.02, "grad_norm": 4.659566302966707, "learning_rate": 1.9998721518116066e-06, "loss": 1.4766, "step": 3023 }, { "epoch": 0.02, "grad_norm": 4.56223709999532, "learning_rate": 1.9998720669349535e-06, "loss": 1.4421, "step": 3024 }, { "epoch": 0.02, "grad_norm": 6.040663438792937, "learning_rate": 1.9998719820301373e-06, "loss": 1.4359, "step": 3025 }, { "epoch": 0.02, "grad_norm": 4.513110115307069, "learning_rate": 1.9998718970971578e-06, "loss": 1.4452, "step": 3026 }, { "epoch": 0.02, "grad_norm": 4.912192039188993, "learning_rate": 1.9998718121360156e-06, "loss": 1.4882, "step": 3027 }, { "epoch": 0.02, "grad_norm": 4.525564198825131, "learning_rate": 1.99987172714671e-06, "loss": 1.2955, "step": 3028 }, { "epoch": 0.02, "grad_norm": 5.369450073058397, "learning_rate": 1.999871642129242e-06, "loss": 1.4109, "step": 3029 }, { "epoch": 0.02, "grad_norm": 6.399069353447917, "learning_rate": 1.9998715570836104e-06, "loss": 1.5341, "step": 3030 }, { "epoch": 0.02, "grad_norm": 4.863376705625461, "learning_rate": 1.999871472009816e-06, "loss": 1.3178, "step": 3031 }, { "epoch": 0.02, "grad_norm": 5.999004673955154, "learning_rate": 1.999871386907858e-06, "loss": 1.358, "step": 3032 }, { "epoch": 0.02, "grad_norm": 4.667839262748732, "learning_rate": 1.9998713017777373e-06, "loss": 1.4091, "step": 3033 }, { "epoch": 0.02, "grad_norm": 4.5743647889890475, "learning_rate": 1.9998712166194537e-06, "loss": 1.3385, "step": 3034 }, { "epoch": 0.02, "grad_norm": 4.975891971893521, "learning_rate": 1.9998711314330067e-06, "loss": 1.4072, "step": 3035 }, { "epoch": 0.02, "grad_norm": 4.830200195962722, "learning_rate": 1.999871046218397e-06, "loss": 1.4913, "step": 3036 }, { "epoch": 0.02, "grad_norm": 4.805815436884879, "learning_rate": 1.9998709609756238e-06, "loss": 1.4484, "step": 3037 }, { "epoch": 0.02, "grad_norm": 4.600176911297759, "learning_rate": 1.9998708757046882e-06, "loss": 1.2973, "step": 3038 }, { "epoch": 0.02, "grad_norm": 5.060093592105702, "learning_rate": 1.9998707904055893e-06, "loss": 1.1942, "step": 3039 }, { "epoch": 0.02, "grad_norm": 4.802674568723147, "learning_rate": 1.9998707050783273e-06, "loss": 1.5334, "step": 3040 }, { "epoch": 0.02, "grad_norm": 5.426696065645248, "learning_rate": 1.9998706197229023e-06, "loss": 1.4872, "step": 3041 }, { "epoch": 0.02, "grad_norm": 5.898642916611915, "learning_rate": 1.999870534339314e-06, "loss": 1.5561, "step": 3042 }, { "epoch": 0.02, "grad_norm": 4.5979860791423635, "learning_rate": 1.999870448927563e-06, "loss": 1.5231, "step": 3043 }, { "epoch": 0.02, "grad_norm": 4.535954226116485, "learning_rate": 1.999870363487649e-06, "loss": 1.3058, "step": 3044 }, { "epoch": 0.02, "grad_norm": 4.745795440686125, "learning_rate": 1.9998702780195716e-06, "loss": 1.4387, "step": 3045 }, { "epoch": 0.02, "grad_norm": 4.635960415984835, "learning_rate": 1.999870192523332e-06, "loss": 1.4911, "step": 3046 }, { "epoch": 0.02, "grad_norm": 4.759909878239948, "learning_rate": 1.9998701069989287e-06, "loss": 1.4024, "step": 3047 }, { "epoch": 0.02, "grad_norm": 5.802542836315548, "learning_rate": 1.9998700214463623e-06, "loss": 1.4512, "step": 3048 }, { "epoch": 0.02, "grad_norm": 6.378746183379735, "learning_rate": 1.9998699358656334e-06, "loss": 1.5516, "step": 3049 }, { "epoch": 0.02, "grad_norm": 4.618905225926722, "learning_rate": 1.999869850256741e-06, "loss": 1.3609, "step": 3050 }, { "epoch": 0.02, "grad_norm": 4.905191662886102, "learning_rate": 1.9998697646196856e-06, "loss": 1.5205, "step": 3051 }, { "epoch": 0.02, "grad_norm": 4.391956056937744, "learning_rate": 1.9998696789544677e-06, "loss": 1.3854, "step": 3052 }, { "epoch": 0.02, "grad_norm": 5.57834379820915, "learning_rate": 1.9998695932610863e-06, "loss": 1.4971, "step": 3053 }, { "epoch": 0.02, "grad_norm": 4.9358795449257, "learning_rate": 1.9998695075395423e-06, "loss": 1.3831, "step": 3054 }, { "epoch": 0.02, "grad_norm": 5.168823055094264, "learning_rate": 1.999869421789835e-06, "loss": 1.2735, "step": 3055 }, { "epoch": 0.02, "grad_norm": 5.134532360101292, "learning_rate": 1.9998693360119646e-06, "loss": 1.3684, "step": 3056 }, { "epoch": 0.02, "grad_norm": 5.281772250542135, "learning_rate": 1.9998692502059317e-06, "loss": 1.25, "step": 3057 }, { "epoch": 0.02, "grad_norm": 4.862624797364669, "learning_rate": 1.9998691643717353e-06, "loss": 1.4888, "step": 3058 }, { "epoch": 0.02, "grad_norm": 4.5369738915051965, "learning_rate": 1.999869078509376e-06, "loss": 1.3752, "step": 3059 }, { "epoch": 0.02, "grad_norm": 5.264457712237391, "learning_rate": 1.999868992618854e-06, "loss": 1.3389, "step": 3060 }, { "epoch": 0.02, "grad_norm": 5.148997720929055, "learning_rate": 1.9998689067001686e-06, "loss": 1.4634, "step": 3061 }, { "epoch": 0.02, "grad_norm": 5.69514649110256, "learning_rate": 1.9998688207533207e-06, "loss": 1.4781, "step": 3062 }, { "epoch": 0.02, "grad_norm": 6.906133670567751, "learning_rate": 1.9998687347783098e-06, "loss": 1.2841, "step": 3063 }, { "epoch": 0.02, "grad_norm": 4.450305903996531, "learning_rate": 1.9998686487751354e-06, "loss": 1.3008, "step": 3064 }, { "epoch": 0.02, "grad_norm": 4.9761783730067926, "learning_rate": 1.9998685627437985e-06, "loss": 1.1917, "step": 3065 }, { "epoch": 0.02, "grad_norm": 4.90862469089785, "learning_rate": 1.9998684766842985e-06, "loss": 1.3536, "step": 3066 }, { "epoch": 0.02, "eval_loss": 1.614479660987854, "eval_runtime": 4.6281, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.08, "step": 3066 }, { "epoch": 0.02, "grad_norm": 4.514246443648035, "learning_rate": 1.999868390596635e-06, "loss": 1.46, "step": 3067 }, { "epoch": 0.02, "grad_norm": 4.795614462660274, "learning_rate": 1.9998683044808097e-06, "loss": 1.4595, "step": 3068 }, { "epoch": 0.02, "grad_norm": 5.018533958527286, "learning_rate": 1.9998682183368203e-06, "loss": 1.3904, "step": 3069 }, { "epoch": 0.02, "grad_norm": 4.942520732496524, "learning_rate": 1.999868132164669e-06, "loss": 1.4983, "step": 3070 }, { "epoch": 0.02, "grad_norm": 4.777495716431802, "learning_rate": 1.999868045964354e-06, "loss": 1.4747, "step": 3071 }, { "epoch": 0.02, "grad_norm": 4.426971447313713, "learning_rate": 1.999867959735876e-06, "loss": 1.2999, "step": 3072 }, { "epoch": 0.02, "grad_norm": 6.039700517061406, "learning_rate": 1.999867873479235e-06, "loss": 1.3682, "step": 3073 }, { "epoch": 0.02, "grad_norm": 7.018072800299674, "learning_rate": 1.9998677871944316e-06, "loss": 1.4626, "step": 3074 }, { "epoch": 0.02, "grad_norm": 4.620448908678396, "learning_rate": 1.9998677008814647e-06, "loss": 1.3042, "step": 3075 }, { "epoch": 0.02, "grad_norm": 4.309072084405069, "learning_rate": 1.999867614540335e-06, "loss": 1.3497, "step": 3076 }, { "epoch": 0.02, "grad_norm": 6.1018581186592025, "learning_rate": 1.9998675281710427e-06, "loss": 1.333, "step": 3077 }, { "epoch": 0.02, "grad_norm": 4.945567567355229, "learning_rate": 1.999867441773587e-06, "loss": 1.2968, "step": 3078 }, { "epoch": 0.02, "grad_norm": 5.406409232689529, "learning_rate": 1.9998673553479687e-06, "loss": 1.4256, "step": 3079 }, { "epoch": 0.02, "grad_norm": 7.218194975860625, "learning_rate": 1.9998672688941872e-06, "loss": 1.4264, "step": 3080 }, { "epoch": 0.02, "grad_norm": 4.739664546488815, "learning_rate": 1.999867182412243e-06, "loss": 1.355, "step": 3081 }, { "epoch": 0.02, "grad_norm": 4.91149132142301, "learning_rate": 1.9998670959021357e-06, "loss": 1.524, "step": 3082 }, { "epoch": 0.02, "grad_norm": 4.432247494986651, "learning_rate": 1.9998670093638656e-06, "loss": 1.2699, "step": 3083 }, { "epoch": 0.02, "grad_norm": 5.31888360981259, "learning_rate": 1.9998669227974326e-06, "loss": 1.3485, "step": 3084 }, { "epoch": 0.02, "grad_norm": 4.766908478434986, "learning_rate": 1.9998668362028365e-06, "loss": 1.3453, "step": 3085 }, { "epoch": 0.02, "grad_norm": 4.684318593661933, "learning_rate": 1.9998667495800775e-06, "loss": 1.3912, "step": 3086 }, { "epoch": 0.02, "grad_norm": 4.707084972080545, "learning_rate": 1.9998666629291554e-06, "loss": 1.4999, "step": 3087 }, { "epoch": 0.02, "grad_norm": 4.658271583331655, "learning_rate": 1.999866576250071e-06, "loss": 1.3584, "step": 3088 }, { "epoch": 0.02, "grad_norm": 5.8632287556229645, "learning_rate": 1.999866489542823e-06, "loss": 1.4268, "step": 3089 }, { "epoch": 0.02, "grad_norm": 5.305841703345282, "learning_rate": 1.9998664028074126e-06, "loss": 1.425, "step": 3090 }, { "epoch": 0.02, "grad_norm": 4.537852499872765, "learning_rate": 1.999866316043839e-06, "loss": 1.3343, "step": 3091 }, { "epoch": 0.02, "grad_norm": 4.525709370028815, "learning_rate": 1.9998662292521023e-06, "loss": 1.4023, "step": 3092 }, { "epoch": 0.02, "grad_norm": 4.566090943972325, "learning_rate": 1.999866142432203e-06, "loss": 1.4643, "step": 3093 }, { "epoch": 0.02, "grad_norm": 4.2695795542821875, "learning_rate": 1.9998660555841405e-06, "loss": 1.2825, "step": 3094 }, { "epoch": 0.02, "grad_norm": 4.396645953705008, "learning_rate": 1.9998659687079157e-06, "loss": 1.2208, "step": 3095 }, { "epoch": 0.02, "grad_norm": 4.585920026446324, "learning_rate": 1.9998658818035275e-06, "loss": 1.3192, "step": 3096 }, { "epoch": 0.02, "grad_norm": 4.724254453766402, "learning_rate": 1.9998657948709768e-06, "loss": 1.3259, "step": 3097 }, { "epoch": 0.02, "grad_norm": 4.546497072265447, "learning_rate": 1.9998657079102626e-06, "loss": 1.4462, "step": 3098 }, { "epoch": 0.02, "grad_norm": 5.067058100414485, "learning_rate": 1.999865620921386e-06, "loss": 1.371, "step": 3099 }, { "epoch": 0.02, "grad_norm": 4.680817047547335, "learning_rate": 1.9998655339043465e-06, "loss": 1.382, "step": 3100 }, { "epoch": 0.02, "grad_norm": 4.684807070794312, "learning_rate": 1.999865446859144e-06, "loss": 1.3792, "step": 3101 }, { "epoch": 0.02, "grad_norm": 5.108394614810223, "learning_rate": 1.999865359785779e-06, "loss": 1.3695, "step": 3102 }, { "epoch": 0.02, "grad_norm": 4.678775301437812, "learning_rate": 1.99986527268425e-06, "loss": 1.3234, "step": 3103 }, { "epoch": 0.02, "grad_norm": 7.198154854127655, "learning_rate": 1.999865185554559e-06, "loss": 1.3228, "step": 3104 }, { "epoch": 0.02, "grad_norm": 4.954029857664215, "learning_rate": 1.9998650983967053e-06, "loss": 1.2434, "step": 3105 }, { "epoch": 0.02, "grad_norm": 4.716268546944123, "learning_rate": 1.999865011210688e-06, "loss": 1.3835, "step": 3106 }, { "epoch": 0.02, "grad_norm": 4.706544890974926, "learning_rate": 1.9998649239965085e-06, "loss": 1.4839, "step": 3107 }, { "epoch": 0.02, "grad_norm": 5.0878170468870545, "learning_rate": 1.9998648367541656e-06, "loss": 1.5127, "step": 3108 }, { "epoch": 0.02, "grad_norm": 4.546409694838889, "learning_rate": 1.99986474948366e-06, "loss": 1.4668, "step": 3109 }, { "epoch": 0.02, "grad_norm": 4.262790272972708, "learning_rate": 1.9998646621849916e-06, "loss": 1.3487, "step": 3110 }, { "epoch": 0.02, "grad_norm": 5.437228037164254, "learning_rate": 1.9998645748581606e-06, "loss": 1.4362, "step": 3111 }, { "epoch": 0.02, "grad_norm": 4.591524369547653, "learning_rate": 1.9998644875031665e-06, "loss": 1.3518, "step": 3112 }, { "epoch": 0.02, "grad_norm": 4.943081436316897, "learning_rate": 1.9998644001200095e-06, "loss": 1.2977, "step": 3113 }, { "epoch": 0.02, "grad_norm": 4.745398436424019, "learning_rate": 1.9998643127086894e-06, "loss": 1.2858, "step": 3114 }, { "epoch": 0.02, "grad_norm": 4.451084562147302, "learning_rate": 1.9998642252692072e-06, "loss": 1.4795, "step": 3115 }, { "epoch": 0.02, "grad_norm": 4.642520886807483, "learning_rate": 1.9998641378015616e-06, "loss": 1.5245, "step": 3116 }, { "epoch": 0.02, "grad_norm": 4.829976676988873, "learning_rate": 1.9998640503057534e-06, "loss": 1.3036, "step": 3117 }, { "epoch": 0.02, "grad_norm": 4.272610276691716, "learning_rate": 1.9998639627817822e-06, "loss": 1.4307, "step": 3118 }, { "epoch": 0.02, "grad_norm": 4.279012748336821, "learning_rate": 1.999863875229648e-06, "loss": 1.3246, "step": 3119 }, { "epoch": 0.02, "grad_norm": 5.017262661372447, "learning_rate": 1.999863787649351e-06, "loss": 1.4561, "step": 3120 }, { "epoch": 0.02, "grad_norm": 7.159770574845129, "learning_rate": 1.9998637000408915e-06, "loss": 1.4325, "step": 3121 }, { "epoch": 0.02, "grad_norm": 5.138809355786813, "learning_rate": 1.9998636124042688e-06, "loss": 1.3595, "step": 3122 }, { "epoch": 0.02, "grad_norm": 7.411909404561028, "learning_rate": 1.9998635247394834e-06, "loss": 1.2877, "step": 3123 }, { "epoch": 0.02, "grad_norm": 4.8356590420018675, "learning_rate": 1.999863437046535e-06, "loss": 1.382, "step": 3124 }, { "epoch": 0.02, "grad_norm": 5.438539228016791, "learning_rate": 1.999863349325424e-06, "loss": 1.2727, "step": 3125 }, { "epoch": 0.02, "grad_norm": 5.290518205135856, "learning_rate": 1.9998632615761503e-06, "loss": 1.2096, "step": 3126 }, { "epoch": 0.02, "grad_norm": 4.869806082110767, "learning_rate": 1.9998631737987134e-06, "loss": 1.3335, "step": 3127 }, { "epoch": 0.02, "grad_norm": 5.001272070760005, "learning_rate": 1.999863085993114e-06, "loss": 1.48, "step": 3128 }, { "epoch": 0.02, "grad_norm": 4.588174640600946, "learning_rate": 1.999862998159352e-06, "loss": 1.3918, "step": 3129 }, { "epoch": 0.02, "grad_norm": 4.48282878680433, "learning_rate": 1.9998629102974263e-06, "loss": 1.3621, "step": 3130 }, { "epoch": 0.02, "grad_norm": 4.327695042907354, "learning_rate": 1.9998628224073387e-06, "loss": 1.1881, "step": 3131 }, { "epoch": 0.02, "grad_norm": 4.935240520013496, "learning_rate": 1.999862734489088e-06, "loss": 1.4896, "step": 3132 }, { "epoch": 0.02, "grad_norm": 4.762415048958248, "learning_rate": 1.999862646542674e-06, "loss": 1.3664, "step": 3133 }, { "epoch": 0.02, "grad_norm": 4.514983890100395, "learning_rate": 1.999862558568098e-06, "loss": 1.3976, "step": 3134 }, { "epoch": 0.02, "grad_norm": 4.318544973311922, "learning_rate": 1.9998624705653586e-06, "loss": 1.3479, "step": 3135 }, { "epoch": 0.02, "grad_norm": 4.831718076957779, "learning_rate": 1.999862382534457e-06, "loss": 1.4235, "step": 3136 }, { "epoch": 0.02, "grad_norm": 5.289805977486961, "learning_rate": 1.9998622944753917e-06, "loss": 1.2588, "step": 3137 }, { "epoch": 0.02, "grad_norm": 4.49073990213304, "learning_rate": 1.9998622063881643e-06, "loss": 1.4302, "step": 3138 }, { "epoch": 0.02, "grad_norm": 9.548814969360347, "learning_rate": 1.999862118272774e-06, "loss": 1.3936, "step": 3139 }, { "epoch": 0.02, "eval_loss": 1.6162147521972656, "eval_runtime": 4.6224, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 3139 }, { "epoch": 0.02, "grad_norm": 5.563259529239899, "learning_rate": 1.999862030129221e-06, "loss": 1.4137, "step": 3140 }, { "epoch": 0.02, "grad_norm": 4.64213902825873, "learning_rate": 1.999861941957505e-06, "loss": 1.5853, "step": 3141 }, { "epoch": 0.02, "grad_norm": 5.394890330300038, "learning_rate": 1.999861853757626e-06, "loss": 1.4709, "step": 3142 }, { "epoch": 0.02, "grad_norm": 4.911886598072668, "learning_rate": 1.9998617655295847e-06, "loss": 1.4397, "step": 3143 }, { "epoch": 0.02, "grad_norm": 13.65470961214868, "learning_rate": 1.9998616772733802e-06, "loss": 1.199, "step": 3144 }, { "epoch": 0.02, "grad_norm": 4.636379664598463, "learning_rate": 1.999861588989013e-06, "loss": 1.2613, "step": 3145 }, { "epoch": 0.02, "grad_norm": 4.9044581943490275, "learning_rate": 1.9998615006764835e-06, "loss": 1.4446, "step": 3146 }, { "epoch": 0.02, "grad_norm": 4.966547861520061, "learning_rate": 1.999861412335791e-06, "loss": 1.3623, "step": 3147 }, { "epoch": 0.02, "grad_norm": 5.124726053985599, "learning_rate": 1.9998613239669356e-06, "loss": 1.3474, "step": 3148 }, { "epoch": 0.02, "grad_norm": 4.572636835170383, "learning_rate": 1.9998612355699174e-06, "loss": 1.3531, "step": 3149 }, { "epoch": 0.02, "grad_norm": 4.618987542008811, "learning_rate": 1.9998611471447362e-06, "loss": 1.4206, "step": 3150 }, { "epoch": 0.02, "grad_norm": 4.3621396822779746, "learning_rate": 1.999861058691393e-06, "loss": 1.4164, "step": 3151 }, { "epoch": 0.02, "grad_norm": 5.341999064682612, "learning_rate": 1.999860970209886e-06, "loss": 1.5216, "step": 3152 }, { "epoch": 0.02, "grad_norm": 16.061509297130684, "learning_rate": 1.999860881700217e-06, "loss": 1.3329, "step": 3153 }, { "epoch": 0.02, "grad_norm": 4.808958846677696, "learning_rate": 1.999860793162385e-06, "loss": 1.483, "step": 3154 }, { "epoch": 0.02, "grad_norm": 5.114117981524813, "learning_rate": 1.9998607045963903e-06, "loss": 1.5337, "step": 3155 }, { "epoch": 0.02, "grad_norm": 5.295040011290924, "learning_rate": 1.9998606160022328e-06, "loss": 1.3669, "step": 3156 }, { "epoch": 0.02, "grad_norm": 4.704314977626543, "learning_rate": 1.9998605273799127e-06, "loss": 1.3311, "step": 3157 }, { "epoch": 0.02, "grad_norm": 4.488680014172767, "learning_rate": 1.9998604387294296e-06, "loss": 1.4133, "step": 3158 }, { "epoch": 0.02, "grad_norm": 5.104286348143494, "learning_rate": 1.999860350050784e-06, "loss": 1.5682, "step": 3159 }, { "epoch": 0.02, "grad_norm": 4.485660767564034, "learning_rate": 1.9998602613439757e-06, "loss": 1.5762, "step": 3160 }, { "epoch": 0.02, "grad_norm": 4.939741107373614, "learning_rate": 1.9998601726090045e-06, "loss": 1.4258, "step": 3161 }, { "epoch": 0.02, "grad_norm": 4.881029948420793, "learning_rate": 1.9998600838458707e-06, "loss": 1.4878, "step": 3162 }, { "epoch": 0.02, "grad_norm": 5.300753949774209, "learning_rate": 1.999859995054574e-06, "loss": 1.5591, "step": 3163 }, { "epoch": 0.02, "grad_norm": 4.919255455770104, "learning_rate": 1.999859906235114e-06, "loss": 1.5089, "step": 3164 }, { "epoch": 0.02, "grad_norm": 4.903957763395779, "learning_rate": 1.9998598173874925e-06, "loss": 1.3316, "step": 3165 }, { "epoch": 0.02, "grad_norm": 4.786877472597078, "learning_rate": 1.9998597285117076e-06, "loss": 1.4113, "step": 3166 }, { "epoch": 0.02, "grad_norm": 4.642407926930237, "learning_rate": 1.99985963960776e-06, "loss": 1.4368, "step": 3167 }, { "epoch": 0.02, "grad_norm": 4.9722903457303715, "learning_rate": 1.9998595506756495e-06, "loss": 1.5279, "step": 3168 }, { "epoch": 0.02, "grad_norm": 4.735779629402262, "learning_rate": 1.999859461715377e-06, "loss": 1.4509, "step": 3169 }, { "epoch": 0.02, "grad_norm": 6.012513337480798, "learning_rate": 1.999859372726941e-06, "loss": 1.2937, "step": 3170 }, { "epoch": 0.02, "grad_norm": 4.373925924257122, "learning_rate": 1.9998592837103425e-06, "loss": 1.3488, "step": 3171 }, { "epoch": 0.02, "grad_norm": 4.813452991567045, "learning_rate": 1.9998591946655812e-06, "loss": 1.4029, "step": 3172 }, { "epoch": 0.02, "grad_norm": 4.354945741505938, "learning_rate": 1.9998591055926574e-06, "loss": 1.2901, "step": 3173 }, { "epoch": 0.02, "grad_norm": 4.4499436163682775, "learning_rate": 1.999859016491571e-06, "loss": 1.3946, "step": 3174 }, { "epoch": 0.02, "grad_norm": 5.212636696076956, "learning_rate": 1.9998589273623216e-06, "loss": 1.4967, "step": 3175 }, { "epoch": 0.02, "grad_norm": 6.480145318055372, "learning_rate": 1.99985883820491e-06, "loss": 1.3128, "step": 3176 }, { "epoch": 0.02, "grad_norm": 5.7962291969592865, "learning_rate": 1.999858749019335e-06, "loss": 1.3714, "step": 3177 }, { "epoch": 0.02, "grad_norm": 4.699063147883259, "learning_rate": 1.999858659805598e-06, "loss": 1.3319, "step": 3178 }, { "epoch": 0.02, "grad_norm": 5.026292120171505, "learning_rate": 1.999858570563698e-06, "loss": 1.2571, "step": 3179 }, { "epoch": 0.02, "grad_norm": 4.409840960626129, "learning_rate": 1.999858481293635e-06, "loss": 1.2903, "step": 3180 }, { "epoch": 0.02, "grad_norm": 4.625128875585861, "learning_rate": 1.9998583919954095e-06, "loss": 1.3866, "step": 3181 }, { "epoch": 0.02, "grad_norm": 4.5097657585652895, "learning_rate": 1.9998583026690216e-06, "loss": 1.3992, "step": 3182 }, { "epoch": 0.02, "grad_norm": 4.614507760819113, "learning_rate": 1.9998582133144708e-06, "loss": 1.4418, "step": 3183 }, { "epoch": 0.02, "grad_norm": 4.5439263422269685, "learning_rate": 1.9998581239317573e-06, "loss": 1.4107, "step": 3184 }, { "epoch": 0.02, "grad_norm": 5.919848478106863, "learning_rate": 1.9998580345208814e-06, "loss": 1.44, "step": 3185 }, { "epoch": 0.02, "grad_norm": 5.270625301228678, "learning_rate": 1.9998579450818424e-06, "loss": 1.3487, "step": 3186 }, { "epoch": 0.02, "grad_norm": 12.770245132363083, "learning_rate": 1.999857855614641e-06, "loss": 1.2209, "step": 3187 }, { "epoch": 0.02, "grad_norm": 4.851859510913612, "learning_rate": 1.9998577661192766e-06, "loss": 1.5614, "step": 3188 }, { "epoch": 0.02, "grad_norm": 4.675470634925794, "learning_rate": 1.99985767659575e-06, "loss": 1.3947, "step": 3189 }, { "epoch": 0.02, "grad_norm": 7.468219419933821, "learning_rate": 1.9998575870440602e-06, "loss": 1.3805, "step": 3190 }, { "epoch": 0.02, "grad_norm": 4.966019364108259, "learning_rate": 1.9998574974642083e-06, "loss": 1.5527, "step": 3191 }, { "epoch": 0.02, "grad_norm": 9.403779222001704, "learning_rate": 1.9998574078561935e-06, "loss": 1.4134, "step": 3192 }, { "epoch": 0.02, "grad_norm": 6.913592743917321, "learning_rate": 1.999857318220016e-06, "loss": 1.3894, "step": 3193 }, { "epoch": 0.02, "grad_norm": 4.977596675633243, "learning_rate": 1.999857228555676e-06, "loss": 1.5042, "step": 3194 }, { "epoch": 0.02, "grad_norm": 5.206567891787514, "learning_rate": 1.999857138863173e-06, "loss": 1.4468, "step": 3195 }, { "epoch": 0.02, "grad_norm": 5.04153279402594, "learning_rate": 1.999857049142508e-06, "loss": 1.3358, "step": 3196 }, { "epoch": 0.02, "grad_norm": 5.061753008719967, "learning_rate": 1.9998569593936797e-06, "loss": 1.2138, "step": 3197 }, { "epoch": 0.02, "grad_norm": 4.5006757680467615, "learning_rate": 1.999856869616689e-06, "loss": 1.4535, "step": 3198 }, { "epoch": 0.02, "grad_norm": 4.783752399409675, "learning_rate": 1.9998567798115356e-06, "loss": 1.371, "step": 3199 }, { "epoch": 0.02, "grad_norm": 4.316562075427887, "learning_rate": 1.9998566899782197e-06, "loss": 1.242, "step": 3200 }, { "epoch": 0.02, "grad_norm": 5.77063522200799, "learning_rate": 1.999856600116741e-06, "loss": 1.3504, "step": 3201 }, { "epoch": 0.02, "grad_norm": 5.811809642589763, "learning_rate": 1.9998565102271e-06, "loss": 1.244, "step": 3202 }, { "epoch": 0.02, "grad_norm": 4.246394909798491, "learning_rate": 1.9998564203092958e-06, "loss": 1.2308, "step": 3203 }, { "epoch": 0.02, "grad_norm": 5.515471870625349, "learning_rate": 1.999856330363329e-06, "loss": 1.4938, "step": 3204 }, { "epoch": 0.02, "grad_norm": 7.4680360822544944, "learning_rate": 1.9998562403892e-06, "loss": 1.456, "step": 3205 }, { "epoch": 0.02, "grad_norm": 4.505670670091151, "learning_rate": 1.999856150386908e-06, "loss": 1.3296, "step": 3206 }, { "epoch": 0.02, "grad_norm": 4.709421641146748, "learning_rate": 1.999856060356454e-06, "loss": 1.5864, "step": 3207 }, { "epoch": 0.02, "grad_norm": 5.147973376306806, "learning_rate": 1.999855970297837e-06, "loss": 1.5874, "step": 3208 }, { "epoch": 0.02, "grad_norm": 4.382675576687693, "learning_rate": 1.999855880211057e-06, "loss": 1.3383, "step": 3209 }, { "epoch": 0.02, "grad_norm": 5.6100363440541745, "learning_rate": 1.999855790096115e-06, "loss": 1.354, "step": 3210 }, { "epoch": 0.02, "grad_norm": 4.6842429101328005, "learning_rate": 1.99985569995301e-06, "loss": 1.2987, "step": 3211 }, { "epoch": 0.02, "grad_norm": 4.40109444884483, "learning_rate": 1.9998556097817423e-06, "loss": 1.2192, "step": 3212 }, { "epoch": 0.02, "eval_loss": 1.6133770942687988, "eval_runtime": 4.6291, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3212 }, { "epoch": 0.02, "grad_norm": 4.686660224921117, "learning_rate": 1.9998555195823125e-06, "loss": 1.3872, "step": 3213 }, { "epoch": 0.02, "grad_norm": 5.0508756090986076, "learning_rate": 1.9998554293547197e-06, "loss": 1.3356, "step": 3214 }, { "epoch": 0.02, "grad_norm": 4.52397968155206, "learning_rate": 1.9998553390989644e-06, "loss": 1.4718, "step": 3215 }, { "epoch": 0.02, "grad_norm": 4.345795773897772, "learning_rate": 1.9998552488150464e-06, "loss": 1.3888, "step": 3216 }, { "epoch": 0.02, "grad_norm": 4.4749772284400064, "learning_rate": 1.9998551585029663e-06, "loss": 1.4003, "step": 3217 }, { "epoch": 0.02, "grad_norm": 4.894626866747002, "learning_rate": 1.9998550681627233e-06, "loss": 1.3879, "step": 3218 }, { "epoch": 0.02, "grad_norm": 6.358238813293754, "learning_rate": 1.9998549777943176e-06, "loss": 1.3516, "step": 3219 }, { "epoch": 0.02, "grad_norm": 10.113707594899338, "learning_rate": 1.999854887397749e-06, "loss": 1.4023, "step": 3220 }, { "epoch": 0.02, "grad_norm": 5.254325564929292, "learning_rate": 1.999854796973018e-06, "loss": 1.39, "step": 3221 }, { "epoch": 0.02, "grad_norm": 4.960705725899118, "learning_rate": 1.9998547065201247e-06, "loss": 1.5043, "step": 3222 }, { "epoch": 0.02, "grad_norm": 4.358280513258368, "learning_rate": 1.9998546160390688e-06, "loss": 1.334, "step": 3223 }, { "epoch": 0.02, "grad_norm": 5.374786742690436, "learning_rate": 1.9998545255298502e-06, "loss": 1.4208, "step": 3224 }, { "epoch": 0.02, "grad_norm": 5.11312037967647, "learning_rate": 1.999854434992469e-06, "loss": 1.3896, "step": 3225 }, { "epoch": 0.02, "grad_norm": 5.056956101904353, "learning_rate": 1.9998543444269254e-06, "loss": 1.6125, "step": 3226 }, { "epoch": 0.02, "grad_norm": 5.4444491440324345, "learning_rate": 1.999854253833219e-06, "loss": 1.4266, "step": 3227 }, { "epoch": 0.02, "grad_norm": 4.562601418248393, "learning_rate": 1.99985416321135e-06, "loss": 1.3555, "step": 3228 }, { "epoch": 0.02, "grad_norm": 4.494814980020431, "learning_rate": 1.9998540725613185e-06, "loss": 1.2719, "step": 3229 }, { "epoch": 0.02, "grad_norm": 4.463077255851529, "learning_rate": 1.9998539818831245e-06, "loss": 1.4654, "step": 3230 }, { "epoch": 0.02, "grad_norm": 4.6659227436502615, "learning_rate": 1.999853891176768e-06, "loss": 1.4081, "step": 3231 }, { "epoch": 0.02, "grad_norm": 5.27576441173752, "learning_rate": 1.999853800442249e-06, "loss": 1.3004, "step": 3232 }, { "epoch": 0.02, "grad_norm": 7.132500624849133, "learning_rate": 1.9998537096795676e-06, "loss": 1.4915, "step": 3233 }, { "epoch": 0.02, "grad_norm": 5.3773789294030445, "learning_rate": 1.9998536188887233e-06, "loss": 1.2791, "step": 3234 }, { "epoch": 0.02, "grad_norm": 4.608523278365858, "learning_rate": 1.999853528069716e-06, "loss": 1.5425, "step": 3235 }, { "epoch": 0.02, "grad_norm": 5.707153438089712, "learning_rate": 1.999853437222547e-06, "loss": 1.46, "step": 3236 }, { "epoch": 0.02, "grad_norm": 4.354798489424228, "learning_rate": 1.999853346347215e-06, "loss": 1.3055, "step": 3237 }, { "epoch": 0.02, "grad_norm": 4.717858890133739, "learning_rate": 1.999853255443721e-06, "loss": 1.4242, "step": 3238 }, { "epoch": 0.02, "grad_norm": 4.881285175892339, "learning_rate": 1.9998531645120637e-06, "loss": 1.3225, "step": 3239 }, { "epoch": 0.02, "grad_norm": 4.351033621024971, "learning_rate": 1.999853073552244e-06, "loss": 1.4343, "step": 3240 }, { "epoch": 0.02, "grad_norm": 6.092636219063135, "learning_rate": 1.999852982564262e-06, "loss": 1.4519, "step": 3241 }, { "epoch": 0.02, "grad_norm": 4.523318551270846, "learning_rate": 1.9998528915481176e-06, "loss": 1.3111, "step": 3242 }, { "epoch": 0.02, "grad_norm": 5.12499436718178, "learning_rate": 1.9998528005038105e-06, "loss": 1.4857, "step": 3243 }, { "epoch": 0.02, "grad_norm": 4.379826186575379, "learning_rate": 1.999852709431341e-06, "loss": 1.5748, "step": 3244 }, { "epoch": 0.02, "grad_norm": 4.845476579476725, "learning_rate": 1.9998526183307083e-06, "loss": 1.5415, "step": 3245 }, { "epoch": 0.02, "grad_norm": 4.5232667716691735, "learning_rate": 1.999852527201914e-06, "loss": 1.4959, "step": 3246 }, { "epoch": 0.02, "grad_norm": 5.0382078094923335, "learning_rate": 1.9998524360449566e-06, "loss": 1.3551, "step": 3247 }, { "epoch": 0.02, "grad_norm": 7.089730656857905, "learning_rate": 1.999852344859837e-06, "loss": 1.4391, "step": 3248 }, { "epoch": 0.02, "grad_norm": 4.233017204929559, "learning_rate": 1.9998522536465547e-06, "loss": 1.3869, "step": 3249 }, { "epoch": 0.02, "grad_norm": 4.927879269649831, "learning_rate": 1.99985216240511e-06, "loss": 1.3307, "step": 3250 }, { "epoch": 0.02, "grad_norm": 4.8097509464845585, "learning_rate": 1.999852071135503e-06, "loss": 1.5221, "step": 3251 }, { "epoch": 0.02, "grad_norm": 4.199061944873702, "learning_rate": 1.999851979837733e-06, "loss": 1.0908, "step": 3252 }, { "epoch": 0.02, "grad_norm": 4.531291994897648, "learning_rate": 1.9998518885118007e-06, "loss": 1.3583, "step": 3253 }, { "epoch": 0.02, "grad_norm": 4.903810401327965, "learning_rate": 1.999851797157706e-06, "loss": 1.4129, "step": 3254 }, { "epoch": 0.02, "grad_norm": 7.132170562144993, "learning_rate": 1.9998517057754487e-06, "loss": 1.2676, "step": 3255 }, { "epoch": 0.02, "grad_norm": 4.252451485945251, "learning_rate": 1.999851614365029e-06, "loss": 1.3305, "step": 3256 }, { "epoch": 0.02, "grad_norm": 9.063511090876979, "learning_rate": 1.9998515229264468e-06, "loss": 1.5813, "step": 3257 }, { "epoch": 0.02, "grad_norm": 5.235824119971203, "learning_rate": 1.999851431459702e-06, "loss": 1.186, "step": 3258 }, { "epoch": 0.02, "grad_norm": 4.324167091287429, "learning_rate": 1.9998513399647946e-06, "loss": 1.3319, "step": 3259 }, { "epoch": 0.02, "grad_norm": 4.418031249539854, "learning_rate": 1.999851248441725e-06, "loss": 1.3418, "step": 3260 }, { "epoch": 0.02, "grad_norm": 4.669234391667805, "learning_rate": 1.9998511568904925e-06, "loss": 1.4365, "step": 3261 }, { "epoch": 0.02, "grad_norm": 4.883978731284588, "learning_rate": 1.999851065311098e-06, "loss": 1.2247, "step": 3262 }, { "epoch": 0.02, "grad_norm": 4.3548479130386, "learning_rate": 1.9998509737035405e-06, "loss": 1.3808, "step": 3263 }, { "epoch": 0.02, "grad_norm": 4.5380211493875, "learning_rate": 1.999850882067821e-06, "loss": 1.4246, "step": 3264 }, { "epoch": 0.02, "grad_norm": 5.00105285779209, "learning_rate": 1.999850790403939e-06, "loss": 1.1821, "step": 3265 }, { "epoch": 0.02, "grad_norm": 4.375788203793615, "learning_rate": 1.9998506987118946e-06, "loss": 1.2683, "step": 3266 }, { "epoch": 0.02, "grad_norm": 4.435134668590281, "learning_rate": 1.9998506069916874e-06, "loss": 1.3995, "step": 3267 }, { "epoch": 0.02, "grad_norm": 5.011365587056869, "learning_rate": 1.9998505152433177e-06, "loss": 1.4123, "step": 3268 }, { "epoch": 0.02, "grad_norm": 5.3076628146390945, "learning_rate": 1.999850423466786e-06, "loss": 1.5271, "step": 3269 }, { "epoch": 0.02, "grad_norm": 6.518196335665455, "learning_rate": 1.9998503316620914e-06, "loss": 1.4203, "step": 3270 }, { "epoch": 0.02, "grad_norm": 4.862304791483059, "learning_rate": 1.9998502398292344e-06, "loss": 1.3987, "step": 3271 }, { "epoch": 0.02, "grad_norm": 4.78170745299268, "learning_rate": 1.999850147968215e-06, "loss": 1.3068, "step": 3272 }, { "epoch": 0.02, "grad_norm": 7.178380281277487, "learning_rate": 1.999850056079033e-06, "loss": 1.2724, "step": 3273 }, { "epoch": 0.02, "grad_norm": 5.433887746386324, "learning_rate": 1.999849964161689e-06, "loss": 1.1814, "step": 3274 }, { "epoch": 0.02, "grad_norm": 4.528452236866192, "learning_rate": 1.9998498722161823e-06, "loss": 1.3414, "step": 3275 }, { "epoch": 0.02, "grad_norm": 5.177232219836514, "learning_rate": 1.999849780242513e-06, "loss": 1.4879, "step": 3276 }, { "epoch": 0.02, "grad_norm": 7.066602473545536, "learning_rate": 1.9998496882406816e-06, "loss": 1.5043, "step": 3277 }, { "epoch": 0.02, "grad_norm": 4.589177379962832, "learning_rate": 1.9998495962106874e-06, "loss": 1.4113, "step": 3278 }, { "epoch": 0.02, "grad_norm": 5.298228051116629, "learning_rate": 1.999849504152531e-06, "loss": 1.5326, "step": 3279 }, { "epoch": 0.02, "grad_norm": 4.762978250317543, "learning_rate": 1.999849412066212e-06, "loss": 1.3229, "step": 3280 }, { "epoch": 0.02, "grad_norm": 4.719035320919961, "learning_rate": 1.9998493199517306e-06, "loss": 1.495, "step": 3281 }, { "epoch": 0.02, "grad_norm": 4.522910759102749, "learning_rate": 1.999849227809087e-06, "loss": 1.4205, "step": 3282 }, { "epoch": 0.02, "grad_norm": 4.837186220843632, "learning_rate": 1.9998491356382808e-06, "loss": 1.4704, "step": 3283 }, { "epoch": 0.02, "grad_norm": 4.817799074894965, "learning_rate": 1.9998490434393124e-06, "loss": 1.3439, "step": 3284 }, { "epoch": 0.02, "grad_norm": 4.763809137445679, "learning_rate": 1.999848951212181e-06, "loss": 1.3933, "step": 3285 }, { "epoch": 0.02, "eval_loss": 1.6162359714508057, "eval_runtime": 4.6209, "eval_samples_per_second": 1.948, "eval_steps_per_second": 1.082, "step": 3285 }, { "epoch": 0.02, "grad_norm": 4.2310982403645685, "learning_rate": 1.9998488589568875e-06, "loss": 1.3159, "step": 3286 }, { "epoch": 0.02, "grad_norm": 4.751228665002726, "learning_rate": 1.999848766673432e-06, "loss": 1.508, "step": 3287 }, { "epoch": 0.02, "grad_norm": 5.287367204511815, "learning_rate": 1.9998486743618136e-06, "loss": 1.2603, "step": 3288 }, { "epoch": 0.02, "grad_norm": 6.191777133783223, "learning_rate": 1.9998485820220328e-06, "loss": 1.4435, "step": 3289 }, { "epoch": 0.02, "grad_norm": 4.963362551339361, "learning_rate": 1.9998484896540898e-06, "loss": 1.6377, "step": 3290 }, { "epoch": 0.02, "grad_norm": 5.380914773426925, "learning_rate": 1.9998483972579842e-06, "loss": 1.3618, "step": 3291 }, { "epoch": 0.02, "grad_norm": 4.881974168253751, "learning_rate": 1.999848304833716e-06, "loss": 1.4793, "step": 3292 }, { "epoch": 0.02, "grad_norm": 4.35338471574215, "learning_rate": 1.999848212381286e-06, "loss": 1.3686, "step": 3293 }, { "epoch": 0.02, "grad_norm": 4.647483695518025, "learning_rate": 1.999848119900693e-06, "loss": 1.3954, "step": 3294 }, { "epoch": 0.02, "grad_norm": 4.374460008441114, "learning_rate": 1.999848027391938e-06, "loss": 1.3345, "step": 3295 }, { "epoch": 0.02, "grad_norm": 5.811138428730455, "learning_rate": 1.9998479348550204e-06, "loss": 1.5838, "step": 3296 }, { "epoch": 0.02, "grad_norm": 4.982938535210265, "learning_rate": 1.9998478422899407e-06, "loss": 1.3415, "step": 3297 }, { "epoch": 0.02, "grad_norm": 5.428729068613124, "learning_rate": 1.9998477496966984e-06, "loss": 1.5737, "step": 3298 }, { "epoch": 0.02, "grad_norm": 5.419973685315894, "learning_rate": 1.9998476570752935e-06, "loss": 1.4732, "step": 3299 }, { "epoch": 0.02, "grad_norm": 4.763708028938937, "learning_rate": 1.9998475644257264e-06, "loss": 1.3344, "step": 3300 }, { "epoch": 0.02, "grad_norm": 4.49987874526447, "learning_rate": 1.999847471747997e-06, "loss": 1.3842, "step": 3301 }, { "epoch": 0.02, "grad_norm": 6.059868654394915, "learning_rate": 1.999847379042105e-06, "loss": 1.5563, "step": 3302 }, { "epoch": 0.02, "grad_norm": 5.101031294475946, "learning_rate": 1.9998472863080507e-06, "loss": 1.3156, "step": 3303 }, { "epoch": 0.02, "grad_norm": 5.04036355932879, "learning_rate": 1.9998471935458343e-06, "loss": 1.4731, "step": 3304 }, { "epoch": 0.02, "grad_norm": 7.42245956153174, "learning_rate": 1.999847100755455e-06, "loss": 1.3902, "step": 3305 }, { "epoch": 0.02, "grad_norm": 4.774733280681285, "learning_rate": 1.999847007936914e-06, "loss": 1.4806, "step": 3306 }, { "epoch": 0.02, "grad_norm": 5.071391799173309, "learning_rate": 1.99984691509021e-06, "loss": 1.3596, "step": 3307 }, { "epoch": 0.02, "grad_norm": 5.2995003638028235, "learning_rate": 1.9998468222153443e-06, "loss": 1.4474, "step": 3308 }, { "epoch": 0.02, "grad_norm": 5.3213954454836445, "learning_rate": 1.999846729312316e-06, "loss": 1.4512, "step": 3309 }, { "epoch": 0.02, "grad_norm": 5.334198218103061, "learning_rate": 1.999846636381125e-06, "loss": 1.5658, "step": 3310 }, { "epoch": 0.02, "grad_norm": 4.974385468276713, "learning_rate": 1.9998465434217723e-06, "loss": 1.4068, "step": 3311 }, { "epoch": 0.02, "grad_norm": 4.891182503439006, "learning_rate": 1.999846450434257e-06, "loss": 1.321, "step": 3312 }, { "epoch": 0.02, "grad_norm": 5.359459633381728, "learning_rate": 1.999846357418579e-06, "loss": 1.428, "step": 3313 }, { "epoch": 0.02, "grad_norm": 6.035895559789005, "learning_rate": 1.999846264374739e-06, "loss": 1.2147, "step": 3314 }, { "epoch": 0.02, "grad_norm": 4.397355468592714, "learning_rate": 1.999846171302736e-06, "loss": 1.3722, "step": 3315 }, { "epoch": 0.02, "grad_norm": 7.155669309774268, "learning_rate": 1.9998460782025713e-06, "loss": 1.3978, "step": 3316 }, { "epoch": 0.02, "grad_norm": 4.757021430848576, "learning_rate": 1.9998459850742443e-06, "loss": 1.5465, "step": 3317 }, { "epoch": 0.02, "grad_norm": 4.786149548595665, "learning_rate": 1.9998458919177547e-06, "loss": 1.41, "step": 3318 }, { "epoch": 0.02, "grad_norm": 4.289680760432469, "learning_rate": 1.9998457987331026e-06, "loss": 1.3994, "step": 3319 }, { "epoch": 0.02, "grad_norm": 8.633968114607555, "learning_rate": 1.9998457055202887e-06, "loss": 1.5452, "step": 3320 }, { "epoch": 0.02, "grad_norm": 4.513494336305803, "learning_rate": 1.9998456122793123e-06, "loss": 1.3446, "step": 3321 }, { "epoch": 0.02, "grad_norm": 5.396311638031696, "learning_rate": 1.9998455190101732e-06, "loss": 1.4307, "step": 3322 }, { "epoch": 0.02, "grad_norm": 12.224345682379019, "learning_rate": 1.999845425712872e-06, "loss": 1.3998, "step": 3323 }, { "epoch": 0.02, "grad_norm": 5.037124480361044, "learning_rate": 1.9998453323874087e-06, "loss": 1.3464, "step": 3324 }, { "epoch": 0.02, "grad_norm": 4.619878980217735, "learning_rate": 1.999845239033783e-06, "loss": 1.3604, "step": 3325 }, { "epoch": 0.02, "grad_norm": 4.480008801845805, "learning_rate": 1.999845145651995e-06, "loss": 1.3031, "step": 3326 }, { "epoch": 0.02, "grad_norm": 4.454286328620764, "learning_rate": 1.9998450522420446e-06, "loss": 1.3064, "step": 3327 }, { "epoch": 0.02, "grad_norm": 4.57634704126347, "learning_rate": 1.999844958803932e-06, "loss": 1.4419, "step": 3328 }, { "epoch": 0.02, "grad_norm": 4.525977006177297, "learning_rate": 1.9998448653376565e-06, "loss": 1.4284, "step": 3329 }, { "epoch": 0.02, "grad_norm": 4.871951894152188, "learning_rate": 1.9998447718432194e-06, "loss": 1.5416, "step": 3330 }, { "epoch": 0.02, "grad_norm": 5.355722208034374, "learning_rate": 1.9998446783206197e-06, "loss": 1.3012, "step": 3331 }, { "epoch": 0.02, "grad_norm": 4.768559786700981, "learning_rate": 1.999844584769858e-06, "loss": 1.3515, "step": 3332 }, { "epoch": 0.02, "grad_norm": 4.700472874326849, "learning_rate": 1.9998444911909335e-06, "loss": 1.406, "step": 3333 }, { "epoch": 0.02, "grad_norm": 4.6108709253998414, "learning_rate": 1.999844397583847e-06, "loss": 1.2965, "step": 3334 }, { "epoch": 0.02, "grad_norm": 4.119228316339279, "learning_rate": 1.9998443039485983e-06, "loss": 1.2358, "step": 3335 }, { "epoch": 0.02, "grad_norm": 4.6431919347398525, "learning_rate": 1.999844210285187e-06, "loss": 1.3773, "step": 3336 }, { "epoch": 0.02, "grad_norm": 5.036505345315098, "learning_rate": 1.9998441165936137e-06, "loss": 1.426, "step": 3337 }, { "epoch": 0.02, "grad_norm": 4.916331951067264, "learning_rate": 1.999844022873878e-06, "loss": 1.4319, "step": 3338 }, { "epoch": 0.02, "grad_norm": 4.502760853285271, "learning_rate": 1.99984392912598e-06, "loss": 1.3404, "step": 3339 }, { "epoch": 0.02, "grad_norm": 4.674212137830531, "learning_rate": 1.9998438353499197e-06, "loss": 1.3524, "step": 3340 }, { "epoch": 0.02, "grad_norm": 4.969002536880517, "learning_rate": 1.9998437415456972e-06, "loss": 1.4791, "step": 3341 }, { "epoch": 0.02, "grad_norm": 5.336062177765582, "learning_rate": 1.9998436477133126e-06, "loss": 1.5062, "step": 3342 }, { "epoch": 0.02, "grad_norm": 6.803443625291179, "learning_rate": 1.9998435538527655e-06, "loss": 1.5203, "step": 3343 }, { "epoch": 0.02, "grad_norm": 4.519808932086315, "learning_rate": 1.999843459964056e-06, "loss": 1.3434, "step": 3344 }, { "epoch": 0.02, "grad_norm": 4.916582018340864, "learning_rate": 1.9998433660471847e-06, "loss": 1.2779, "step": 3345 }, { "epoch": 0.02, "grad_norm": 4.597694435049481, "learning_rate": 1.9998432721021506e-06, "loss": 1.4044, "step": 3346 }, { "epoch": 0.02, "grad_norm": 4.780005071775639, "learning_rate": 1.9998431781289544e-06, "loss": 1.1947, "step": 3347 }, { "epoch": 0.02, "grad_norm": 4.419392700972962, "learning_rate": 1.999843084127596e-06, "loss": 1.3743, "step": 3348 }, { "epoch": 0.02, "grad_norm": 4.791430996135701, "learning_rate": 1.9998429900980756e-06, "loss": 1.2246, "step": 3349 }, { "epoch": 0.02, "grad_norm": 4.482748079898622, "learning_rate": 1.9998428960403925e-06, "loss": 1.2955, "step": 3350 }, { "epoch": 0.02, "grad_norm": 4.958560367082578, "learning_rate": 1.9998428019545477e-06, "loss": 1.3907, "step": 3351 }, { "epoch": 0.02, "grad_norm": 5.686365686539982, "learning_rate": 1.9998427078405404e-06, "loss": 1.4705, "step": 3352 }, { "epoch": 0.02, "grad_norm": 4.865648897158208, "learning_rate": 1.9998426136983704e-06, "loss": 1.4142, "step": 3353 }, { "epoch": 0.02, "grad_norm": 4.720528487925224, "learning_rate": 1.9998425195280387e-06, "loss": 1.3621, "step": 3354 }, { "epoch": 0.02, "grad_norm": 6.23799082246452, "learning_rate": 1.999842425329545e-06, "loss": 1.4965, "step": 3355 }, { "epoch": 0.02, "grad_norm": 4.338394874264906, "learning_rate": 1.999842331102888e-06, "loss": 1.4025, "step": 3356 }, { "epoch": 0.02, "grad_norm": 22.872377507742062, "learning_rate": 1.99984223684807e-06, "loss": 1.2324, "step": 3357 }, { "epoch": 0.02, "grad_norm": 5.052623717517635, "learning_rate": 1.9998421425650892e-06, "loss": 1.1698, "step": 3358 }, { "epoch": 0.02, "eval_loss": 1.615498661994934, "eval_runtime": 4.6186, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 3358 }, { "epoch": 0.02, "grad_norm": 4.403452284708548, "learning_rate": 1.999842048253946e-06, "loss": 1.4041, "step": 3359 }, { "epoch": 0.02, "grad_norm": 5.612691456341564, "learning_rate": 1.999841953914641e-06, "loss": 1.4215, "step": 3360 }, { "epoch": 0.02, "grad_norm": 4.5286749608866055, "learning_rate": 1.9998418595471733e-06, "loss": 1.2596, "step": 3361 }, { "epoch": 0.02, "grad_norm": 4.675637414315521, "learning_rate": 1.9998417651515436e-06, "loss": 1.4302, "step": 3362 }, { "epoch": 0.02, "grad_norm": 4.99356723093498, "learning_rate": 1.9998416707277517e-06, "loss": 1.5043, "step": 3363 }, { "epoch": 0.02, "grad_norm": 5.346376765762083, "learning_rate": 1.9998415762757977e-06, "loss": 1.4411, "step": 3364 }, { "epoch": 0.02, "grad_norm": 4.465445688844873, "learning_rate": 1.9998414817956815e-06, "loss": 1.3665, "step": 3365 }, { "epoch": 0.02, "grad_norm": 4.947373911092748, "learning_rate": 1.9998413872874027e-06, "loss": 1.4684, "step": 3366 }, { "epoch": 0.02, "grad_norm": 5.336576397977949, "learning_rate": 1.9998412927509622e-06, "loss": 1.4559, "step": 3367 }, { "epoch": 0.02, "grad_norm": 5.554379287052272, "learning_rate": 1.999841198186359e-06, "loss": 1.3741, "step": 3368 }, { "epoch": 0.02, "grad_norm": 4.691693057261446, "learning_rate": 1.999841103593594e-06, "loss": 1.3809, "step": 3369 }, { "epoch": 0.02, "grad_norm": 4.841766853561151, "learning_rate": 1.9998410089726666e-06, "loss": 1.4358, "step": 3370 }, { "epoch": 0.02, "grad_norm": 4.445225429854286, "learning_rate": 1.999840914323577e-06, "loss": 1.2202, "step": 3371 }, { "epoch": 0.02, "grad_norm": 4.587434031150207, "learning_rate": 1.9998408196463254e-06, "loss": 1.3254, "step": 3372 }, { "epoch": 0.02, "grad_norm": 4.635518865319935, "learning_rate": 1.9998407249409115e-06, "loss": 1.4297, "step": 3373 }, { "epoch": 0.02, "grad_norm": 4.693662361831059, "learning_rate": 1.9998406302073356e-06, "loss": 1.5567, "step": 3374 }, { "epoch": 0.02, "grad_norm": 4.86578821971193, "learning_rate": 1.999840535445597e-06, "loss": 1.5298, "step": 3375 }, { "epoch": 0.02, "grad_norm": 5.196254708579967, "learning_rate": 1.9998404406556967e-06, "loss": 1.4366, "step": 3376 }, { "epoch": 0.02, "grad_norm": 5.704264956948557, "learning_rate": 1.999840345837634e-06, "loss": 1.2585, "step": 3377 }, { "epoch": 0.02, "grad_norm": 7.123480430806445, "learning_rate": 1.9998402509914093e-06, "loss": 1.4561, "step": 3378 }, { "epoch": 0.02, "grad_norm": 4.651652795610573, "learning_rate": 1.999840156117022e-06, "loss": 1.422, "step": 3379 }, { "epoch": 0.02, "grad_norm": 4.895531346419424, "learning_rate": 1.999840061214473e-06, "loss": 1.39, "step": 3380 }, { "epoch": 0.02, "grad_norm": 4.431768446645135, "learning_rate": 1.9998399662837614e-06, "loss": 1.2118, "step": 3381 }, { "epoch": 0.02, "grad_norm": 4.73997458177594, "learning_rate": 1.999839871324888e-06, "loss": 1.3665, "step": 3382 }, { "epoch": 0.02, "grad_norm": 4.995671187040337, "learning_rate": 1.9998397763378524e-06, "loss": 1.3142, "step": 3383 }, { "epoch": 0.02, "grad_norm": 5.272798024995659, "learning_rate": 1.9998396813226545e-06, "loss": 1.3689, "step": 3384 }, { "epoch": 0.02, "grad_norm": 4.4670178221368655, "learning_rate": 1.9998395862792944e-06, "loss": 1.4755, "step": 3385 }, { "epoch": 0.02, "grad_norm": 4.832806175109676, "learning_rate": 1.999839491207772e-06, "loss": 1.4044, "step": 3386 }, { "epoch": 0.02, "grad_norm": 4.27660795435882, "learning_rate": 1.9998393961080883e-06, "loss": 1.3548, "step": 3387 }, { "epoch": 0.02, "grad_norm": 4.819442397568239, "learning_rate": 1.9998393009802417e-06, "loss": 1.4295, "step": 3388 }, { "epoch": 0.02, "grad_norm": 4.911584590161657, "learning_rate": 1.999839205824233e-06, "loss": 1.5393, "step": 3389 }, { "epoch": 0.02, "grad_norm": 5.255900800064722, "learning_rate": 1.9998391106400622e-06, "loss": 1.4227, "step": 3390 }, { "epoch": 0.02, "grad_norm": 4.46927240762758, "learning_rate": 1.9998390154277293e-06, "loss": 1.3794, "step": 3391 }, { "epoch": 0.02, "grad_norm": 4.445886168494522, "learning_rate": 1.999838920187234e-06, "loss": 1.3245, "step": 3392 }, { "epoch": 0.02, "grad_norm": 4.605358505461649, "learning_rate": 1.9998388249185773e-06, "loss": 1.4412, "step": 3393 }, { "epoch": 0.02, "grad_norm": 4.535610642306297, "learning_rate": 1.999838729621758e-06, "loss": 1.5244, "step": 3394 }, { "epoch": 0.02, "grad_norm": 7.61293667721256, "learning_rate": 1.9998386342967767e-06, "loss": 1.3762, "step": 3395 }, { "epoch": 0.02, "grad_norm": 5.576615477919866, "learning_rate": 1.999838538943633e-06, "loss": 1.1744, "step": 3396 }, { "epoch": 0.02, "grad_norm": 4.982158065266377, "learning_rate": 1.999838443562327e-06, "loss": 1.4918, "step": 3397 }, { "epoch": 0.02, "grad_norm": 4.380167900793968, "learning_rate": 1.9998383481528595e-06, "loss": 1.4074, "step": 3398 }, { "epoch": 0.02, "grad_norm": 4.950258411072331, "learning_rate": 1.9998382527152297e-06, "loss": 1.4923, "step": 3399 }, { "epoch": 0.02, "grad_norm": 4.592714907688114, "learning_rate": 1.9998381572494374e-06, "loss": 1.3628, "step": 3400 }, { "epoch": 0.02, "grad_norm": 4.424967211816432, "learning_rate": 1.9998380617554833e-06, "loss": 1.4329, "step": 3401 }, { "epoch": 0.02, "grad_norm": 4.525704926126062, "learning_rate": 1.999837966233367e-06, "loss": 1.412, "step": 3402 }, { "epoch": 0.02, "grad_norm": 4.7381010367112735, "learning_rate": 1.9998378706830887e-06, "loss": 1.4563, "step": 3403 }, { "epoch": 0.02, "grad_norm": 5.357820102817708, "learning_rate": 1.999837775104648e-06, "loss": 1.392, "step": 3404 }, { "epoch": 0.02, "grad_norm": 4.877459475025764, "learning_rate": 1.9998376794980455e-06, "loss": 1.446, "step": 3405 }, { "epoch": 0.02, "grad_norm": 4.736389873828967, "learning_rate": 1.999837583863281e-06, "loss": 1.3899, "step": 3406 }, { "epoch": 0.02, "grad_norm": 4.6136258426518655, "learning_rate": 1.999837488200354e-06, "loss": 1.4183, "step": 3407 }, { "epoch": 0.02, "grad_norm": 4.573629869371029, "learning_rate": 1.9998373925092654e-06, "loss": 1.4148, "step": 3408 }, { "epoch": 0.02, "grad_norm": 5.331117554718104, "learning_rate": 1.999837296790014e-06, "loss": 1.4761, "step": 3409 }, { "epoch": 0.02, "grad_norm": 6.167559123390286, "learning_rate": 1.999837201042601e-06, "loss": 1.374, "step": 3410 }, { "epoch": 0.02, "grad_norm": 5.348496307914284, "learning_rate": 1.999837105267026e-06, "loss": 1.3786, "step": 3411 }, { "epoch": 0.02, "grad_norm": 5.130052691622517, "learning_rate": 1.9998370094632887e-06, "loss": 1.4301, "step": 3412 }, { "epoch": 0.02, "grad_norm": 4.497336313396663, "learning_rate": 1.9998369136313892e-06, "loss": 1.24, "step": 3413 }, { "epoch": 0.02, "grad_norm": 5.813744620601688, "learning_rate": 1.999836817771328e-06, "loss": 1.4909, "step": 3414 }, { "epoch": 0.02, "grad_norm": 4.591174887700467, "learning_rate": 1.9998367218831043e-06, "loss": 1.4375, "step": 3415 }, { "epoch": 0.02, "grad_norm": 4.522216046172172, "learning_rate": 1.9998366259667188e-06, "loss": 1.3572, "step": 3416 }, { "epoch": 0.02, "grad_norm": 4.717345593583763, "learning_rate": 1.999836530022171e-06, "loss": 1.4897, "step": 3417 }, { "epoch": 0.02, "grad_norm": 5.321027044092839, "learning_rate": 1.9998364340494614e-06, "loss": 1.4912, "step": 3418 }, { "epoch": 0.02, "grad_norm": 4.7605877409336195, "learning_rate": 1.99983633804859e-06, "loss": 1.4961, "step": 3419 }, { "epoch": 0.02, "grad_norm": 4.4178839583225775, "learning_rate": 1.999836242019556e-06, "loss": 1.2966, "step": 3420 }, { "epoch": 0.02, "grad_norm": 5.056171291505301, "learning_rate": 1.99983614596236e-06, "loss": 1.3807, "step": 3421 }, { "epoch": 0.02, "grad_norm": 5.4340753688988075, "learning_rate": 1.9998360498770024e-06, "loss": 1.3203, "step": 3422 }, { "epoch": 0.02, "grad_norm": 5.057972934945415, "learning_rate": 1.9998359537634822e-06, "loss": 1.4758, "step": 3423 }, { "epoch": 0.02, "grad_norm": 5.829632088605847, "learning_rate": 1.9998358576218004e-06, "loss": 1.517, "step": 3424 }, { "epoch": 0.02, "grad_norm": 4.85482687780751, "learning_rate": 1.999835761451956e-06, "loss": 1.4677, "step": 3425 }, { "epoch": 0.02, "grad_norm": 5.922500655311902, "learning_rate": 1.9998356652539498e-06, "loss": 1.3043, "step": 3426 }, { "epoch": 0.02, "grad_norm": 4.588250525378909, "learning_rate": 1.999835569027782e-06, "loss": 1.3587, "step": 3427 }, { "epoch": 0.02, "grad_norm": 4.382966019888421, "learning_rate": 1.9998354727734514e-06, "loss": 1.255, "step": 3428 }, { "epoch": 0.02, "grad_norm": 4.920889632741727, "learning_rate": 1.999835376490959e-06, "loss": 1.3713, "step": 3429 }, { "epoch": 0.02, "grad_norm": 4.80574963169274, "learning_rate": 1.9998352801803045e-06, "loss": 1.6275, "step": 3430 }, { "epoch": 0.02, "grad_norm": 5.028070166963233, "learning_rate": 1.999835183841488e-06, "loss": 1.4875, "step": 3431 }, { "epoch": 0.02, "eval_loss": 1.6133036613464355, "eval_runtime": 4.6357, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.079, "step": 3431 }, { "epoch": 0.02, "grad_norm": 4.434318252171852, "learning_rate": 1.9998350874745098e-06, "loss": 1.363, "step": 3432 }, { "epoch": 0.02, "grad_norm": 6.623393165669105, "learning_rate": 1.9998349910793694e-06, "loss": 1.5814, "step": 3433 }, { "epoch": 0.02, "grad_norm": 5.456113027521206, "learning_rate": 1.999834894656067e-06, "loss": 1.2424, "step": 3434 }, { "epoch": 0.02, "grad_norm": 5.519733967210898, "learning_rate": 1.9998347982046026e-06, "loss": 1.4818, "step": 3435 }, { "epoch": 0.02, "grad_norm": 4.648419524976711, "learning_rate": 1.9998347017249762e-06, "loss": 1.2958, "step": 3436 }, { "epoch": 0.02, "grad_norm": 4.479293176004785, "learning_rate": 1.9998346052171877e-06, "loss": 1.3048, "step": 3437 }, { "epoch": 0.02, "grad_norm": 4.723612084510464, "learning_rate": 1.999834508681237e-06, "loss": 1.4803, "step": 3438 }, { "epoch": 0.02, "grad_norm": 4.379602061766586, "learning_rate": 1.9998344121171245e-06, "loss": 1.3367, "step": 3439 }, { "epoch": 0.02, "grad_norm": 4.763559064357727, "learning_rate": 1.99983431552485e-06, "loss": 1.5105, "step": 3440 }, { "epoch": 0.02, "grad_norm": 7.923117534883834, "learning_rate": 1.9998342189044136e-06, "loss": 1.4361, "step": 3441 }, { "epoch": 0.02, "grad_norm": 4.714745220308912, "learning_rate": 1.9998341222558148e-06, "loss": 1.5172, "step": 3442 }, { "epoch": 0.02, "grad_norm": 7.225844294787685, "learning_rate": 1.999834025579054e-06, "loss": 1.4366, "step": 3443 }, { "epoch": 0.02, "grad_norm": 4.3720897698041625, "learning_rate": 1.999833928874132e-06, "loss": 1.3114, "step": 3444 }, { "epoch": 0.02, "grad_norm": 5.468815321569939, "learning_rate": 1.9998338321410473e-06, "loss": 1.3577, "step": 3445 }, { "epoch": 0.02, "grad_norm": 5.369771276792973, "learning_rate": 1.9998337353798007e-06, "loss": 1.4633, "step": 3446 }, { "epoch": 0.02, "grad_norm": 5.373295136247841, "learning_rate": 1.9998336385903923e-06, "loss": 1.3569, "step": 3447 }, { "epoch": 0.02, "grad_norm": 4.775224501462429, "learning_rate": 1.9998335417728218e-06, "loss": 1.3241, "step": 3448 }, { "epoch": 0.02, "grad_norm": 5.063351567591072, "learning_rate": 1.999833444927089e-06, "loss": 1.4984, "step": 3449 }, { "epoch": 0.02, "grad_norm": 5.460324129913872, "learning_rate": 1.9998333480531947e-06, "loss": 1.4443, "step": 3450 }, { "epoch": 0.02, "grad_norm": 4.49405386456319, "learning_rate": 1.999833251151138e-06, "loss": 1.3851, "step": 3451 }, { "epoch": 0.02, "grad_norm": 4.561632118793946, "learning_rate": 1.99983315422092e-06, "loss": 1.3756, "step": 3452 }, { "epoch": 0.02, "grad_norm": 5.787428115911711, "learning_rate": 1.9998330572625394e-06, "loss": 1.2516, "step": 3453 }, { "epoch": 0.02, "grad_norm": 4.2654374946244955, "learning_rate": 1.999832960275997e-06, "loss": 1.3323, "step": 3454 }, { "epoch": 0.02, "grad_norm": 4.943112051244451, "learning_rate": 1.9998328632612925e-06, "loss": 1.433, "step": 3455 }, { "epoch": 0.02, "grad_norm": 6.64758931877515, "learning_rate": 1.999832766218426e-06, "loss": 1.3929, "step": 3456 }, { "epoch": 0.02, "grad_norm": 5.004313093362644, "learning_rate": 1.999832669147398e-06, "loss": 1.3248, "step": 3457 }, { "epoch": 0.02, "grad_norm": 5.974142973723598, "learning_rate": 1.9998325720482075e-06, "loss": 1.4423, "step": 3458 }, { "epoch": 0.02, "grad_norm": 6.115923629555795, "learning_rate": 1.9998324749208554e-06, "loss": 1.3141, "step": 3459 }, { "epoch": 0.02, "grad_norm": 5.29889351882071, "learning_rate": 1.999832377765341e-06, "loss": 1.5085, "step": 3460 }, { "epoch": 0.02, "grad_norm": 4.933579769870536, "learning_rate": 1.999832280581665e-06, "loss": 1.3622, "step": 3461 }, { "epoch": 0.02, "grad_norm": 4.54785822775587, "learning_rate": 1.9998321833698267e-06, "loss": 1.4151, "step": 3462 }, { "epoch": 0.02, "grad_norm": 4.464340495701981, "learning_rate": 1.999832086129827e-06, "loss": 1.3599, "step": 3463 }, { "epoch": 0.02, "grad_norm": 5.73588324623857, "learning_rate": 1.999831988861665e-06, "loss": 1.6014, "step": 3464 }, { "epoch": 0.02, "grad_norm": 4.4591831342702966, "learning_rate": 1.9998318915653408e-06, "loss": 1.322, "step": 3465 }, { "epoch": 0.02, "grad_norm": 4.723127658638915, "learning_rate": 1.9998317942408553e-06, "loss": 1.5451, "step": 3466 }, { "epoch": 0.02, "grad_norm": 4.852467128721474, "learning_rate": 1.9998316968882073e-06, "loss": 1.301, "step": 3467 }, { "epoch": 0.02, "grad_norm": 4.810154485536803, "learning_rate": 1.9998315995073976e-06, "loss": 1.421, "step": 3468 }, { "epoch": 0.02, "grad_norm": 4.355440555741922, "learning_rate": 1.999831502098426e-06, "loss": 1.3608, "step": 3469 }, { "epoch": 0.02, "grad_norm": 4.2600779975771434, "learning_rate": 1.9998314046612925e-06, "loss": 1.3282, "step": 3470 }, { "epoch": 0.02, "grad_norm": 4.713840798819233, "learning_rate": 1.9998313071959967e-06, "loss": 1.469, "step": 3471 }, { "epoch": 0.02, "grad_norm": 4.40703476488466, "learning_rate": 1.9998312097025396e-06, "loss": 1.195, "step": 3472 }, { "epoch": 0.02, "grad_norm": 4.707652605308421, "learning_rate": 1.99983111218092e-06, "loss": 1.401, "step": 3473 }, { "epoch": 0.02, "grad_norm": 4.9351410023359135, "learning_rate": 1.9998310146311386e-06, "loss": 1.5518, "step": 3474 }, { "epoch": 0.02, "grad_norm": 7.835268364237103, "learning_rate": 1.999830917053196e-06, "loss": 1.4963, "step": 3475 }, { "epoch": 0.02, "grad_norm": 4.461769170842469, "learning_rate": 1.9998308194470907e-06, "loss": 1.3913, "step": 3476 }, { "epoch": 0.02, "grad_norm": 4.986732482048744, "learning_rate": 1.9998307218128237e-06, "loss": 1.4655, "step": 3477 }, { "epoch": 0.02, "grad_norm": 5.103030642318946, "learning_rate": 1.999830624150395e-06, "loss": 1.4183, "step": 3478 }, { "epoch": 0.02, "grad_norm": 5.347945323080289, "learning_rate": 1.999830526459804e-06, "loss": 1.5263, "step": 3479 }, { "epoch": 0.02, "grad_norm": 4.836255099815674, "learning_rate": 1.999830428741051e-06, "loss": 1.3183, "step": 3480 }, { "epoch": 0.02, "grad_norm": 4.36280200083679, "learning_rate": 1.9998303309941367e-06, "loss": 1.4045, "step": 3481 }, { "epoch": 0.02, "grad_norm": 4.277029361572978, "learning_rate": 1.9998302332190603e-06, "loss": 1.2107, "step": 3482 }, { "epoch": 0.02, "grad_norm": 4.367758928198617, "learning_rate": 1.9998301354158217e-06, "loss": 1.3907, "step": 3483 }, { "epoch": 0.02, "grad_norm": 4.39073373168116, "learning_rate": 1.9998300375844213e-06, "loss": 1.2528, "step": 3484 }, { "epoch": 0.02, "grad_norm": 4.626910873986676, "learning_rate": 1.9998299397248592e-06, "loss": 1.3984, "step": 3485 }, { "epoch": 0.02, "grad_norm": 4.690894198671245, "learning_rate": 1.9998298418371354e-06, "loss": 1.463, "step": 3486 }, { "epoch": 0.02, "grad_norm": 4.2521261877601555, "learning_rate": 1.9998297439212494e-06, "loss": 1.2599, "step": 3487 }, { "epoch": 0.02, "grad_norm": 9.954516216479742, "learning_rate": 1.9998296459772013e-06, "loss": 1.3263, "step": 3488 }, { "epoch": 0.02, "grad_norm": 4.64293902683026, "learning_rate": 1.999829548004992e-06, "loss": 1.4007, "step": 3489 }, { "epoch": 0.02, "grad_norm": 4.7871583563034035, "learning_rate": 1.9998294500046204e-06, "loss": 1.4429, "step": 3490 }, { "epoch": 0.02, "grad_norm": 4.615463851562626, "learning_rate": 1.999829351976087e-06, "loss": 1.5247, "step": 3491 }, { "epoch": 0.02, "grad_norm": 4.704943808755736, "learning_rate": 1.9998292539193916e-06, "loss": 1.576, "step": 3492 }, { "epoch": 0.02, "grad_norm": 5.7783079479655655, "learning_rate": 1.9998291558345344e-06, "loss": 1.4055, "step": 3493 }, { "epoch": 0.02, "grad_norm": 5.441145271123304, "learning_rate": 1.9998290577215155e-06, "loss": 1.5482, "step": 3494 }, { "epoch": 0.02, "grad_norm": 9.36043045218442, "learning_rate": 1.9998289595803345e-06, "loss": 1.3262, "step": 3495 }, { "epoch": 0.02, "grad_norm": 4.745604478290225, "learning_rate": 1.9998288614109917e-06, "loss": 1.5021, "step": 3496 }, { "epoch": 0.02, "grad_norm": 4.4255945297536385, "learning_rate": 1.999828763213487e-06, "loss": 1.4826, "step": 3497 }, { "epoch": 0.02, "grad_norm": 4.268280892701323, "learning_rate": 1.999828664987821e-06, "loss": 1.3277, "step": 3498 }, { "epoch": 0.02, "grad_norm": 4.925308919808803, "learning_rate": 1.9998285667339926e-06, "loss": 1.3895, "step": 3499 }, { "epoch": 0.02, "grad_norm": 4.388627815177559, "learning_rate": 1.9998284684520024e-06, "loss": 1.1406, "step": 3500 }, { "epoch": 0.02, "grad_norm": 4.7990704395075205, "learning_rate": 1.99982837014185e-06, "loss": 1.5664, "step": 3501 }, { "epoch": 0.02, "grad_norm": 5.069150584999445, "learning_rate": 1.9998282718035366e-06, "loss": 1.5177, "step": 3502 }, { "epoch": 0.02, "grad_norm": 4.792701471795762, "learning_rate": 1.999828173437061e-06, "loss": 1.4161, "step": 3503 }, { "epoch": 0.02, "grad_norm": 4.535667453487885, "learning_rate": 1.999828075042424e-06, "loss": 1.3286, "step": 3504 }, { "epoch": 0.02, "eval_loss": 1.612628698348999, "eval_runtime": 4.6365, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 3504 }, { "epoch": 0.02, "grad_norm": 4.836032937210056, "learning_rate": 1.9998279766196242e-06, "loss": 1.4098, "step": 3505 }, { "epoch": 0.02, "grad_norm": 4.780070229048976, "learning_rate": 1.999827878168663e-06, "loss": 1.4433, "step": 3506 }, { "epoch": 0.02, "grad_norm": 5.2390642232027655, "learning_rate": 1.9998277796895403e-06, "loss": 1.3572, "step": 3507 }, { "epoch": 0.02, "grad_norm": 4.973359117911899, "learning_rate": 1.999827681182255e-06, "loss": 1.3459, "step": 3508 }, { "epoch": 0.02, "grad_norm": 4.378947132862404, "learning_rate": 1.9998275826468085e-06, "loss": 1.4072, "step": 3509 }, { "epoch": 0.02, "grad_norm": 5.58519122967338, "learning_rate": 1.9998274840832003e-06, "loss": 1.4547, "step": 3510 }, { "epoch": 0.02, "grad_norm": 4.718146444105029, "learning_rate": 1.99982738549143e-06, "loss": 1.3933, "step": 3511 }, { "epoch": 0.02, "grad_norm": 4.433816849751808, "learning_rate": 1.9998272868714978e-06, "loss": 1.5081, "step": 3512 }, { "epoch": 0.02, "grad_norm": 4.668728068958253, "learning_rate": 1.999827188223404e-06, "loss": 1.4535, "step": 3513 }, { "epoch": 0.02, "grad_norm": 5.205098278485999, "learning_rate": 1.9998270895471484e-06, "loss": 1.3971, "step": 3514 }, { "epoch": 0.02, "grad_norm": 4.2812240461470905, "learning_rate": 1.999826990842731e-06, "loss": 1.2847, "step": 3515 }, { "epoch": 0.02, "grad_norm": 4.421468940513477, "learning_rate": 1.9998268921101516e-06, "loss": 1.2344, "step": 3516 }, { "epoch": 0.02, "grad_norm": 5.240697700729821, "learning_rate": 1.999826793349411e-06, "loss": 1.3846, "step": 3517 }, { "epoch": 0.02, "grad_norm": 4.624113047494641, "learning_rate": 1.999826694560508e-06, "loss": 1.5729, "step": 3518 }, { "epoch": 0.02, "grad_norm": 4.125813300538199, "learning_rate": 1.999826595743443e-06, "loss": 1.3067, "step": 3519 }, { "epoch": 0.02, "grad_norm": 4.72485279895408, "learning_rate": 1.9998264968982165e-06, "loss": 1.4624, "step": 3520 }, { "epoch": 0.02, "grad_norm": 7.440580236377528, "learning_rate": 1.9998263980248284e-06, "loss": 1.3921, "step": 3521 }, { "epoch": 0.02, "grad_norm": 5.506465574269989, "learning_rate": 1.9998262991232786e-06, "loss": 1.3429, "step": 3522 }, { "epoch": 0.02, "grad_norm": 4.892687291895052, "learning_rate": 1.9998262001935666e-06, "loss": 1.4849, "step": 3523 }, { "epoch": 0.02, "grad_norm": 4.714503092123628, "learning_rate": 1.999826101235693e-06, "loss": 1.3495, "step": 3524 }, { "epoch": 0.02, "grad_norm": 6.080297383574053, "learning_rate": 1.9998260022496574e-06, "loss": 1.4098, "step": 3525 }, { "epoch": 0.02, "grad_norm": 4.672363930477558, "learning_rate": 1.9998259032354607e-06, "loss": 1.1808, "step": 3526 }, { "epoch": 0.02, "grad_norm": 4.762254121900801, "learning_rate": 1.999825804193102e-06, "loss": 1.2784, "step": 3527 }, { "epoch": 0.02, "grad_norm": 5.0090886444132465, "learning_rate": 1.999825705122581e-06, "loss": 1.3983, "step": 3528 }, { "epoch": 0.02, "grad_norm": 4.541188950258432, "learning_rate": 1.9998256060238984e-06, "loss": 1.4821, "step": 3529 }, { "epoch": 0.02, "grad_norm": 4.508252601462831, "learning_rate": 1.9998255068970544e-06, "loss": 1.4049, "step": 3530 }, { "epoch": 0.02, "grad_norm": 4.384733739344799, "learning_rate": 1.9998254077420486e-06, "loss": 1.3762, "step": 3531 }, { "epoch": 0.02, "grad_norm": 4.91582726748242, "learning_rate": 1.9998253085588806e-06, "loss": 1.4418, "step": 3532 }, { "epoch": 0.02, "grad_norm": 4.228789208224146, "learning_rate": 1.9998252093475513e-06, "loss": 1.2963, "step": 3533 }, { "epoch": 0.02, "grad_norm": 4.697321495816711, "learning_rate": 1.99982511010806e-06, "loss": 1.4528, "step": 3534 }, { "epoch": 0.02, "grad_norm": 4.803901064568725, "learning_rate": 1.9998250108404072e-06, "loss": 1.569, "step": 3535 }, { "epoch": 0.02, "grad_norm": 4.353829814405607, "learning_rate": 1.9998249115445924e-06, "loss": 1.3503, "step": 3536 }, { "epoch": 0.02, "grad_norm": 4.469444434313929, "learning_rate": 1.9998248122206158e-06, "loss": 1.485, "step": 3537 }, { "epoch": 0.02, "grad_norm": 4.135382837905591, "learning_rate": 1.999824712868478e-06, "loss": 1.3355, "step": 3538 }, { "epoch": 0.02, "grad_norm": 5.198880866777614, "learning_rate": 1.999824613488178e-06, "loss": 1.469, "step": 3539 }, { "epoch": 0.02, "grad_norm": 5.2331303310285415, "learning_rate": 1.999824514079716e-06, "loss": 1.3518, "step": 3540 }, { "epoch": 0.02, "grad_norm": 4.8429386253121125, "learning_rate": 1.9998244146430926e-06, "loss": 1.4942, "step": 3541 }, { "epoch": 0.02, "grad_norm": 4.574611939532659, "learning_rate": 1.9998243151783077e-06, "loss": 1.4268, "step": 3542 }, { "epoch": 0.02, "grad_norm": 4.94932612125061, "learning_rate": 1.9998242156853608e-06, "loss": 1.6209, "step": 3543 }, { "epoch": 0.02, "grad_norm": 6.006103951828245, "learning_rate": 1.999824116164252e-06, "loss": 1.2351, "step": 3544 }, { "epoch": 0.02, "grad_norm": 4.166042167548124, "learning_rate": 1.999824016614982e-06, "loss": 1.142, "step": 3545 }, { "epoch": 0.02, "grad_norm": 5.0620007916305925, "learning_rate": 1.99982391703755e-06, "loss": 1.4374, "step": 3546 }, { "epoch": 0.02, "grad_norm": 4.971313439844006, "learning_rate": 1.999823817431956e-06, "loss": 1.4932, "step": 3547 }, { "epoch": 0.02, "grad_norm": 4.948526977664145, "learning_rate": 1.999823717798201e-06, "loss": 1.3942, "step": 3548 }, { "epoch": 0.02, "grad_norm": 6.433000571492981, "learning_rate": 1.9998236181362836e-06, "loss": 1.6458, "step": 3549 }, { "epoch": 0.02, "grad_norm": 5.7172630835592475, "learning_rate": 1.9998235184462045e-06, "loss": 1.3787, "step": 3550 }, { "epoch": 0.02, "grad_norm": 4.835524125983266, "learning_rate": 1.999823418727964e-06, "loss": 1.4862, "step": 3551 }, { "epoch": 0.02, "grad_norm": 4.538989738684982, "learning_rate": 1.999823318981562e-06, "loss": 1.2833, "step": 3552 }, { "epoch": 0.02, "grad_norm": 4.900851370893367, "learning_rate": 1.999823219206998e-06, "loss": 1.4289, "step": 3553 }, { "epoch": 0.02, "grad_norm": 4.742076041128345, "learning_rate": 1.9998231194042723e-06, "loss": 1.5084, "step": 3554 }, { "epoch": 0.02, "grad_norm": 5.130400288886862, "learning_rate": 1.999823019573385e-06, "loss": 1.251, "step": 3555 }, { "epoch": 0.02, "grad_norm": 5.2268449306817, "learning_rate": 1.999822919714336e-06, "loss": 1.2141, "step": 3556 }, { "epoch": 0.02, "grad_norm": 4.5252607523373936, "learning_rate": 1.9998228198271253e-06, "loss": 1.3066, "step": 3557 }, { "epoch": 0.02, "grad_norm": 7.351860568361683, "learning_rate": 1.999822719911753e-06, "loss": 1.3784, "step": 3558 }, { "epoch": 0.02, "grad_norm": 4.3402345801839415, "learning_rate": 1.9998226199682187e-06, "loss": 1.375, "step": 3559 }, { "epoch": 0.02, "grad_norm": 6.810672354782998, "learning_rate": 1.999822519996523e-06, "loss": 1.4548, "step": 3560 }, { "epoch": 0.02, "grad_norm": 4.856779921376611, "learning_rate": 1.999822419996666e-06, "loss": 1.3662, "step": 3561 }, { "epoch": 0.02, "grad_norm": 4.495518456829636, "learning_rate": 1.9998223199686466e-06, "loss": 1.3658, "step": 3562 }, { "epoch": 0.02, "grad_norm": 4.972175534549607, "learning_rate": 1.999822219912466e-06, "loss": 1.4305, "step": 3563 }, { "epoch": 0.02, "grad_norm": 4.5993830494584484, "learning_rate": 1.9998221198281236e-06, "loss": 1.3633, "step": 3564 }, { "epoch": 0.02, "grad_norm": 4.464312906911071, "learning_rate": 1.9998220197156194e-06, "loss": 1.3512, "step": 3565 }, { "epoch": 0.02, "grad_norm": 4.666030982227595, "learning_rate": 1.9998219195749536e-06, "loss": 1.4271, "step": 3566 }, { "epoch": 0.02, "grad_norm": 4.5017841565719365, "learning_rate": 1.9998218194061264e-06, "loss": 1.2379, "step": 3567 }, { "epoch": 0.02, "grad_norm": 5.160263531672295, "learning_rate": 1.9998217192091375e-06, "loss": 1.4269, "step": 3568 }, { "epoch": 0.02, "grad_norm": 5.394350494166971, "learning_rate": 1.9998216189839865e-06, "loss": 1.5847, "step": 3569 }, { "epoch": 0.02, "grad_norm": 4.543610113197201, "learning_rate": 1.999821518730674e-06, "loss": 1.5249, "step": 3570 }, { "epoch": 0.02, "grad_norm": 4.524606361318028, "learning_rate": 1.9998214184492e-06, "loss": 1.4545, "step": 3571 }, { "epoch": 0.02, "grad_norm": 4.8729914641558425, "learning_rate": 1.9998213181395643e-06, "loss": 1.4856, "step": 3572 }, { "epoch": 0.02, "grad_norm": 6.712818698671254, "learning_rate": 1.999821217801767e-06, "loss": 1.4254, "step": 3573 }, { "epoch": 0.02, "grad_norm": 7.129380023262798, "learning_rate": 1.9998211174358083e-06, "loss": 1.5263, "step": 3574 }, { "epoch": 0.02, "grad_norm": 5.632118655345206, "learning_rate": 1.999821017041688e-06, "loss": 1.2905, "step": 3575 }, { "epoch": 0.02, "grad_norm": 4.713339231139657, "learning_rate": 1.9998209166194055e-06, "loss": 1.4079, "step": 3576 }, { "epoch": 0.02, "grad_norm": 4.761666895797936, "learning_rate": 1.9998208161689615e-06, "loss": 1.348, "step": 3577 }, { "epoch": 0.02, "eval_loss": 1.6088365316390991, "eval_runtime": 4.6298, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3577 }, { "epoch": 0.02, "grad_norm": 5.053550759411962, "learning_rate": 1.999820715690356e-06, "loss": 1.4378, "step": 3578 }, { "epoch": 0.02, "grad_norm": 4.656417963233415, "learning_rate": 1.999820615183589e-06, "loss": 1.4196, "step": 3579 }, { "epoch": 0.02, "grad_norm": 4.720114969577187, "learning_rate": 1.9998205146486604e-06, "loss": 1.4177, "step": 3580 }, { "epoch": 0.02, "grad_norm": 5.1590061020053435, "learning_rate": 1.99982041408557e-06, "loss": 1.3961, "step": 3581 }, { "epoch": 0.02, "grad_norm": 4.609450714516466, "learning_rate": 1.999820313494318e-06, "loss": 1.3561, "step": 3582 }, { "epoch": 0.02, "grad_norm": 6.864818427122029, "learning_rate": 1.9998202128749045e-06, "loss": 1.3521, "step": 3583 }, { "epoch": 0.02, "grad_norm": 4.532234217131842, "learning_rate": 1.9998201122273292e-06, "loss": 1.2275, "step": 3584 }, { "epoch": 0.02, "grad_norm": 5.083685514923316, "learning_rate": 1.9998200115515927e-06, "loss": 1.7183, "step": 3585 }, { "epoch": 0.02, "grad_norm": 5.1648857831860004, "learning_rate": 1.9998199108476944e-06, "loss": 1.4292, "step": 3586 }, { "epoch": 0.02, "grad_norm": 5.404570069932921, "learning_rate": 1.9998198101156344e-06, "loss": 1.4524, "step": 3587 }, { "epoch": 0.02, "grad_norm": 4.4636298910229435, "learning_rate": 1.9998197093554126e-06, "loss": 1.3022, "step": 3588 }, { "epoch": 0.02, "grad_norm": 4.698700968868405, "learning_rate": 1.9998196085670296e-06, "loss": 1.3204, "step": 3589 }, { "epoch": 0.02, "grad_norm": 4.210913435197878, "learning_rate": 1.9998195077504848e-06, "loss": 1.3195, "step": 3590 }, { "epoch": 0.02, "grad_norm": 6.072601657781737, "learning_rate": 1.9998194069057783e-06, "loss": 1.4633, "step": 3591 }, { "epoch": 0.02, "grad_norm": 4.947711709512535, "learning_rate": 1.9998193060329105e-06, "loss": 1.3934, "step": 3592 }, { "epoch": 0.02, "grad_norm": 4.489367907366415, "learning_rate": 1.999819205131881e-06, "loss": 1.3327, "step": 3593 }, { "epoch": 0.02, "grad_norm": 5.046442232090646, "learning_rate": 1.99981910420269e-06, "loss": 1.519, "step": 3594 }, { "epoch": 0.02, "grad_norm": 4.4848572461168015, "learning_rate": 1.999819003245337e-06, "loss": 1.4837, "step": 3595 }, { "epoch": 0.02, "grad_norm": 6.189445025381679, "learning_rate": 1.999818902259823e-06, "loss": 1.322, "step": 3596 }, { "epoch": 0.02, "grad_norm": 5.052080974977652, "learning_rate": 1.999818801246147e-06, "loss": 1.4127, "step": 3597 }, { "epoch": 0.02, "grad_norm": 5.364751238319127, "learning_rate": 1.99981870020431e-06, "loss": 1.2812, "step": 3598 }, { "epoch": 0.02, "grad_norm": 6.0230864070326104, "learning_rate": 1.9998185991343108e-06, "loss": 1.1783, "step": 3599 }, { "epoch": 0.02, "grad_norm": 5.459589611896564, "learning_rate": 1.9998184980361504e-06, "loss": 1.2804, "step": 3600 }, { "epoch": 0.02, "grad_norm": 6.424443515802799, "learning_rate": 1.9998183969098283e-06, "loss": 1.2931, "step": 3601 }, { "epoch": 0.02, "grad_norm": 4.769365934369103, "learning_rate": 1.9998182957553445e-06, "loss": 1.3411, "step": 3602 }, { "epoch": 0.02, "grad_norm": 4.944616559405836, "learning_rate": 1.9998181945726993e-06, "loss": 1.2028, "step": 3603 }, { "epoch": 0.02, "grad_norm": 4.852238644907209, "learning_rate": 1.9998180933618925e-06, "loss": 1.3328, "step": 3604 }, { "epoch": 0.02, "grad_norm": 4.822059635157921, "learning_rate": 1.9998179921229243e-06, "loss": 1.2309, "step": 3605 }, { "epoch": 0.02, "grad_norm": 4.495892687998826, "learning_rate": 1.9998178908557944e-06, "loss": 1.4956, "step": 3606 }, { "epoch": 0.02, "grad_norm": 4.355235617835983, "learning_rate": 1.999817789560503e-06, "loss": 1.181, "step": 3607 }, { "epoch": 0.02, "grad_norm": 4.220700935420473, "learning_rate": 1.99981768823705e-06, "loss": 1.2949, "step": 3608 }, { "epoch": 0.02, "grad_norm": 4.93767906248555, "learning_rate": 1.9998175868854353e-06, "loss": 1.4398, "step": 3609 }, { "epoch": 0.02, "grad_norm": 4.976911864648587, "learning_rate": 1.9998174855056593e-06, "loss": 1.3841, "step": 3610 }, { "epoch": 0.02, "grad_norm": 4.915768062533412, "learning_rate": 1.999817384097722e-06, "loss": 1.3776, "step": 3611 }, { "epoch": 0.02, "grad_norm": 5.400452386799031, "learning_rate": 1.999817282661623e-06, "loss": 1.5242, "step": 3612 }, { "epoch": 0.02, "grad_norm": 9.596731304469015, "learning_rate": 1.999817181197362e-06, "loss": 1.4432, "step": 3613 }, { "epoch": 0.02, "grad_norm": 4.417819290463723, "learning_rate": 1.99981707970494e-06, "loss": 1.3012, "step": 3614 }, { "epoch": 0.02, "grad_norm": 4.908570364549861, "learning_rate": 1.9998169781843566e-06, "loss": 1.4815, "step": 3615 }, { "epoch": 0.02, "grad_norm": 4.538384996779538, "learning_rate": 1.999816876635611e-06, "loss": 1.4016, "step": 3616 }, { "epoch": 0.02, "grad_norm": 4.527777934939574, "learning_rate": 1.9998167750587048e-06, "loss": 1.3748, "step": 3617 }, { "epoch": 0.02, "grad_norm": 5.9438933831368645, "learning_rate": 1.9998166734536362e-06, "loss": 1.3516, "step": 3618 }, { "epoch": 0.02, "grad_norm": 5.111197603829205, "learning_rate": 1.9998165718204064e-06, "loss": 1.4535, "step": 3619 }, { "epoch": 0.02, "grad_norm": 4.3741990248604035, "learning_rate": 1.9998164701590153e-06, "loss": 1.3127, "step": 3620 }, { "epoch": 0.02, "grad_norm": 4.416015259175785, "learning_rate": 1.999816368469463e-06, "loss": 1.2723, "step": 3621 }, { "epoch": 0.02, "grad_norm": 5.045256087161643, "learning_rate": 1.9998162667517488e-06, "loss": 1.4302, "step": 3622 }, { "epoch": 0.02, "grad_norm": 5.026141735704103, "learning_rate": 1.999816165005873e-06, "loss": 1.5235, "step": 3623 }, { "epoch": 0.02, "grad_norm": 4.817827993597373, "learning_rate": 1.9998160632318357e-06, "loss": 1.3734, "step": 3624 }, { "epoch": 0.02, "grad_norm": 5.13290587555347, "learning_rate": 1.999815961429637e-06, "loss": 1.6108, "step": 3625 }, { "epoch": 0.02, "grad_norm": 3.988580619133651, "learning_rate": 1.9998158595992766e-06, "loss": 1.1573, "step": 3626 }, { "epoch": 0.02, "grad_norm": 4.66265731554959, "learning_rate": 1.9998157577407546e-06, "loss": 1.1933, "step": 3627 }, { "epoch": 0.02, "grad_norm": 4.648371974483903, "learning_rate": 1.999815655854072e-06, "loss": 1.4983, "step": 3628 }, { "epoch": 0.02, "grad_norm": 4.62998311734592, "learning_rate": 1.9998155539392273e-06, "loss": 1.3605, "step": 3629 }, { "epoch": 0.02, "grad_norm": 4.650232836970276, "learning_rate": 1.999815451996221e-06, "loss": 1.4764, "step": 3630 }, { "epoch": 0.02, "grad_norm": 4.871641037949958, "learning_rate": 1.9998153500250534e-06, "loss": 1.2777, "step": 3631 }, { "epoch": 0.02, "grad_norm": 4.617802934649518, "learning_rate": 1.999815248025724e-06, "loss": 1.4119, "step": 3632 }, { "epoch": 0.02, "grad_norm": 4.411468513060949, "learning_rate": 1.999815145998234e-06, "loss": 1.2856, "step": 3633 }, { "epoch": 0.02, "grad_norm": 4.4136554607658995, "learning_rate": 1.999815043942582e-06, "loss": 1.3036, "step": 3634 }, { "epoch": 0.02, "grad_norm": 4.696213561907659, "learning_rate": 1.9998149418587684e-06, "loss": 1.4445, "step": 3635 }, { "epoch": 0.02, "grad_norm": 4.707575975813092, "learning_rate": 1.9998148397467934e-06, "loss": 1.4956, "step": 3636 }, { "epoch": 0.02, "grad_norm": 4.477661718215679, "learning_rate": 1.9998147376066567e-06, "loss": 1.4149, "step": 3637 }, { "epoch": 0.02, "grad_norm": 5.242955704509717, "learning_rate": 1.9998146354383588e-06, "loss": 1.3746, "step": 3638 }, { "epoch": 0.02, "grad_norm": 4.407436531750599, "learning_rate": 1.9998145332418995e-06, "loss": 1.3222, "step": 3639 }, { "epoch": 0.02, "grad_norm": 5.4015116251774575, "learning_rate": 1.999814431017279e-06, "loss": 1.4696, "step": 3640 }, { "epoch": 0.02, "grad_norm": 4.230060461672011, "learning_rate": 1.9998143287644966e-06, "loss": 1.1936, "step": 3641 }, { "epoch": 0.02, "grad_norm": 4.832174281020922, "learning_rate": 1.999814226483553e-06, "loss": 1.5139, "step": 3642 }, { "epoch": 0.02, "grad_norm": 4.458935294035869, "learning_rate": 1.999814124174448e-06, "loss": 1.366, "step": 3643 }, { "epoch": 0.02, "grad_norm": 4.858967822250987, "learning_rate": 1.9998140218371814e-06, "loss": 1.4256, "step": 3644 }, { "epoch": 0.02, "grad_norm": 5.384171006945276, "learning_rate": 1.9998139194717534e-06, "loss": 1.5396, "step": 3645 }, { "epoch": 0.02, "grad_norm": 5.362217816203799, "learning_rate": 1.999813817078164e-06, "loss": 1.4846, "step": 3646 }, { "epoch": 0.02, "grad_norm": 4.829886703693458, "learning_rate": 1.999813714656413e-06, "loss": 1.465, "step": 3647 }, { "epoch": 0.02, "grad_norm": 4.300928121079802, "learning_rate": 1.999813612206501e-06, "loss": 1.3776, "step": 3648 }, { "epoch": 0.02, "grad_norm": 4.186443596966576, "learning_rate": 1.9998135097284272e-06, "loss": 1.3409, "step": 3649 }, { "epoch": 0.02, "grad_norm": 4.883455465101393, "learning_rate": 1.9998134072221923e-06, "loss": 1.41, "step": 3650 }, { "epoch": 0.02, "eval_loss": 1.6074836254119873, "eval_runtime": 4.6299, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3650 }, { "epoch": 0.02, "grad_norm": 5.160544967304233, "learning_rate": 1.9998133046877957e-06, "loss": 1.4479, "step": 3651 }, { "epoch": 0.02, "grad_norm": 4.591999173432497, "learning_rate": 1.9998132021252378e-06, "loss": 1.424, "step": 3652 }, { "epoch": 0.02, "grad_norm": 4.334725366745728, "learning_rate": 1.9998130995345185e-06, "loss": 1.3303, "step": 3653 }, { "epoch": 0.02, "grad_norm": 4.667679588938856, "learning_rate": 1.9998129969156376e-06, "loss": 1.3497, "step": 3654 }, { "epoch": 0.02, "grad_norm": 4.648593704664988, "learning_rate": 1.9998128942685953e-06, "loss": 1.5196, "step": 3655 }, { "epoch": 0.02, "grad_norm": 5.228106530732631, "learning_rate": 1.999812791593392e-06, "loss": 1.5034, "step": 3656 }, { "epoch": 0.02, "grad_norm": 4.825079154423852, "learning_rate": 1.9998126888900272e-06, "loss": 1.5114, "step": 3657 }, { "epoch": 0.02, "grad_norm": 5.588839443240046, "learning_rate": 1.9998125861585006e-06, "loss": 1.224, "step": 3658 }, { "epoch": 0.02, "grad_norm": 11.115206705835469, "learning_rate": 1.9998124833988127e-06, "loss": 1.402, "step": 3659 }, { "epoch": 0.02, "grad_norm": 5.028407264356555, "learning_rate": 1.9998123806109635e-06, "loss": 1.3366, "step": 3660 }, { "epoch": 0.02, "grad_norm": 4.883233407550293, "learning_rate": 1.999812277794953e-06, "loss": 1.3908, "step": 3661 }, { "epoch": 0.02, "grad_norm": 4.849560264366864, "learning_rate": 1.999812174950781e-06, "loss": 1.4642, "step": 3662 }, { "epoch": 0.02, "grad_norm": 5.134509693028477, "learning_rate": 1.999812072078448e-06, "loss": 1.338, "step": 3663 }, { "epoch": 0.02, "grad_norm": 4.64736551563639, "learning_rate": 1.999811969177953e-06, "loss": 1.3188, "step": 3664 }, { "epoch": 0.02, "grad_norm": 5.202295612938597, "learning_rate": 1.999811866249297e-06, "loss": 1.5156, "step": 3665 }, { "epoch": 0.02, "grad_norm": 4.864330255262217, "learning_rate": 1.9998117632924795e-06, "loss": 1.4044, "step": 3666 }, { "epoch": 0.02, "grad_norm": 5.095975352302671, "learning_rate": 1.9998116603075008e-06, "loss": 1.4834, "step": 3667 }, { "epoch": 0.02, "grad_norm": 4.401897149251593, "learning_rate": 1.9998115572943607e-06, "loss": 1.3598, "step": 3668 }, { "epoch": 0.02, "grad_norm": 4.516321121878343, "learning_rate": 1.9998114542530593e-06, "loss": 1.3812, "step": 3669 }, { "epoch": 0.02, "grad_norm": 4.582276547112316, "learning_rate": 1.999811351183596e-06, "loss": 1.4511, "step": 3670 }, { "epoch": 0.02, "grad_norm": 4.268406465536788, "learning_rate": 1.9998112480859718e-06, "loss": 1.3777, "step": 3671 }, { "epoch": 0.02, "grad_norm": 5.414725079724578, "learning_rate": 1.999811144960186e-06, "loss": 1.3594, "step": 3672 }, { "epoch": 0.02, "grad_norm": 5.036395550915229, "learning_rate": 1.9998110418062394e-06, "loss": 1.5366, "step": 3673 }, { "epoch": 0.02, "grad_norm": 4.811922967477642, "learning_rate": 1.9998109386241307e-06, "loss": 1.489, "step": 3674 }, { "epoch": 0.02, "grad_norm": 4.909204659000984, "learning_rate": 1.999810835413861e-06, "loss": 1.4438, "step": 3675 }, { "epoch": 0.02, "grad_norm": 4.933064342481729, "learning_rate": 1.99981073217543e-06, "loss": 1.3926, "step": 3676 }, { "epoch": 0.02, "grad_norm": 5.172865172666342, "learning_rate": 1.999810628908838e-06, "loss": 1.3739, "step": 3677 }, { "epoch": 0.02, "grad_norm": 4.581844754398152, "learning_rate": 1.999810525614084e-06, "loss": 1.3825, "step": 3678 }, { "epoch": 0.02, "grad_norm": 4.002706815163878, "learning_rate": 1.999810422291169e-06, "loss": 1.2023, "step": 3679 }, { "epoch": 0.02, "grad_norm": 4.529339089739914, "learning_rate": 1.9998103189400925e-06, "loss": 1.3956, "step": 3680 }, { "epoch": 0.02, "grad_norm": 4.371778302872117, "learning_rate": 1.999810215560855e-06, "loss": 1.0882, "step": 3681 }, { "epoch": 0.02, "grad_norm": 4.604473618690761, "learning_rate": 1.999810112153456e-06, "loss": 1.3797, "step": 3682 }, { "epoch": 0.02, "grad_norm": 4.229913176526649, "learning_rate": 1.9998100087178954e-06, "loss": 1.3024, "step": 3683 }, { "epoch": 0.02, "grad_norm": 4.892279631388166, "learning_rate": 1.9998099052541736e-06, "loss": 1.4261, "step": 3684 }, { "epoch": 0.02, "grad_norm": 4.375988978498727, "learning_rate": 1.9998098017622905e-06, "loss": 1.426, "step": 3685 }, { "epoch": 0.02, "grad_norm": 4.517183513297791, "learning_rate": 1.9998096982422465e-06, "loss": 1.4715, "step": 3686 }, { "epoch": 0.02, "grad_norm": 4.715497920821096, "learning_rate": 1.9998095946940408e-06, "loss": 1.2492, "step": 3687 }, { "epoch": 0.02, "grad_norm": 4.4409337214742814, "learning_rate": 1.9998094911176738e-06, "loss": 1.3908, "step": 3688 }, { "epoch": 0.02, "grad_norm": 4.768774973994264, "learning_rate": 1.9998093875131454e-06, "loss": 1.3043, "step": 3689 }, { "epoch": 0.02, "grad_norm": 4.488443359293641, "learning_rate": 1.999809283880456e-06, "loss": 1.4254, "step": 3690 }, { "epoch": 0.02, "grad_norm": 4.946907576554227, "learning_rate": 1.999809180219605e-06, "loss": 1.3815, "step": 3691 }, { "epoch": 0.02, "grad_norm": 5.024643467089974, "learning_rate": 1.9998090765305927e-06, "loss": 1.3476, "step": 3692 }, { "epoch": 0.02, "grad_norm": 5.375622450304601, "learning_rate": 1.9998089728134195e-06, "loss": 1.575, "step": 3693 }, { "epoch": 0.02, "grad_norm": 4.7741580232890755, "learning_rate": 1.9998088690680847e-06, "loss": 1.3576, "step": 3694 }, { "epoch": 0.02, "grad_norm": 4.690371879763435, "learning_rate": 1.9998087652945886e-06, "loss": 1.3921, "step": 3695 }, { "epoch": 0.02, "grad_norm": 4.895129039414259, "learning_rate": 1.999808661492931e-06, "loss": 1.5504, "step": 3696 }, { "epoch": 0.02, "grad_norm": 4.841292889636168, "learning_rate": 1.9998085576631128e-06, "loss": 1.6215, "step": 3697 }, { "epoch": 0.02, "grad_norm": 4.335373503405417, "learning_rate": 1.9998084538051327e-06, "loss": 1.4034, "step": 3698 }, { "epoch": 0.02, "grad_norm": 4.892579988645074, "learning_rate": 1.999808349918992e-06, "loss": 1.4615, "step": 3699 }, { "epoch": 0.03, "grad_norm": 4.80405901674261, "learning_rate": 1.999808246004689e-06, "loss": 1.3834, "step": 3700 }, { "epoch": 0.03, "grad_norm": 5.015073387547917, "learning_rate": 1.9998081420622256e-06, "loss": 1.5382, "step": 3701 }, { "epoch": 0.03, "grad_norm": 5.340605580327248, "learning_rate": 1.9998080380916003e-06, "loss": 1.5587, "step": 3702 }, { "epoch": 0.03, "grad_norm": 4.6548885345585145, "learning_rate": 1.999807934092814e-06, "loss": 1.4507, "step": 3703 }, { "epoch": 0.03, "grad_norm": 7.988246529171995, "learning_rate": 1.9998078300658667e-06, "loss": 1.2863, "step": 3704 }, { "epoch": 0.03, "grad_norm": 4.366479726955373, "learning_rate": 1.999807726010758e-06, "loss": 1.3532, "step": 3705 }, { "epoch": 0.03, "grad_norm": 4.954787351869969, "learning_rate": 1.999807621927488e-06, "loss": 1.4679, "step": 3706 }, { "epoch": 0.03, "grad_norm": 5.106594076376741, "learning_rate": 1.9998075178160565e-06, "loss": 1.4912, "step": 3707 }, { "epoch": 0.03, "grad_norm": 5.53644272240156, "learning_rate": 1.999807413676464e-06, "loss": 1.3928, "step": 3708 }, { "epoch": 0.03, "grad_norm": 4.3898086275317265, "learning_rate": 1.9998073095087102e-06, "loss": 1.2247, "step": 3709 }, { "epoch": 0.03, "grad_norm": 6.129858409048548, "learning_rate": 1.9998072053127954e-06, "loss": 1.2888, "step": 3710 }, { "epoch": 0.03, "grad_norm": 6.411528618345709, "learning_rate": 1.999807101088719e-06, "loss": 1.4598, "step": 3711 }, { "epoch": 0.03, "grad_norm": 5.245071088827053, "learning_rate": 1.9998069968364813e-06, "loss": 1.3139, "step": 3712 }, { "epoch": 0.03, "grad_norm": 5.001994498250823, "learning_rate": 1.9998068925560825e-06, "loss": 1.4319, "step": 3713 }, { "epoch": 0.03, "grad_norm": 4.5711686332255725, "learning_rate": 1.9998067882475225e-06, "loss": 1.4286, "step": 3714 }, { "epoch": 0.03, "grad_norm": 4.494680754322679, "learning_rate": 1.999806683910801e-06, "loss": 1.3624, "step": 3715 }, { "epoch": 0.03, "grad_norm": 4.920443475726351, "learning_rate": 1.999806579545919e-06, "loss": 1.4834, "step": 3716 }, { "epoch": 0.03, "grad_norm": 9.09972634364239, "learning_rate": 1.9998064751528752e-06, "loss": 1.3838, "step": 3717 }, { "epoch": 0.03, "grad_norm": 4.696129650401296, "learning_rate": 1.99980637073167e-06, "loss": 1.5004, "step": 3718 }, { "epoch": 0.03, "grad_norm": 5.3037260313404415, "learning_rate": 1.999806266282304e-06, "loss": 1.5459, "step": 3719 }, { "epoch": 0.03, "grad_norm": 4.88221476190435, "learning_rate": 1.9998061618047767e-06, "loss": 1.4734, "step": 3720 }, { "epoch": 0.03, "grad_norm": 4.956709505619386, "learning_rate": 1.999806057299088e-06, "loss": 1.3683, "step": 3721 }, { "epoch": 0.03, "grad_norm": 5.1834233457849495, "learning_rate": 1.9998059527652382e-06, "loss": 1.3125, "step": 3722 }, { "epoch": 0.03, "grad_norm": 4.492850793982879, "learning_rate": 1.9998058482032273e-06, "loss": 1.2745, "step": 3723 }, { "epoch": 0.03, "eval_loss": 1.6084189414978027, "eval_runtime": 4.642, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 3723 }, { "epoch": 0.03, "grad_norm": 4.713923973024249, "learning_rate": 1.999805743613055e-06, "loss": 1.4203, "step": 3724 }, { "epoch": 0.03, "grad_norm": 4.441921382155817, "learning_rate": 1.999805638994722e-06, "loss": 1.4163, "step": 3725 }, { "epoch": 0.03, "grad_norm": 5.164638367038176, "learning_rate": 1.9998055343482274e-06, "loss": 1.4703, "step": 3726 }, { "epoch": 0.03, "grad_norm": 4.707717776966889, "learning_rate": 1.999805429673571e-06, "loss": 1.4389, "step": 3727 }, { "epoch": 0.03, "grad_norm": 5.404527912836125, "learning_rate": 1.9998053249707545e-06, "loss": 1.4697, "step": 3728 }, { "epoch": 0.03, "grad_norm": 4.582127757876631, "learning_rate": 1.999805220239776e-06, "loss": 1.4003, "step": 3729 }, { "epoch": 0.03, "grad_norm": 4.82572183924319, "learning_rate": 1.999805115480637e-06, "loss": 1.3832, "step": 3730 }, { "epoch": 0.03, "grad_norm": 5.099262606984633, "learning_rate": 1.9998050106933363e-06, "loss": 1.4593, "step": 3731 }, { "epoch": 0.03, "grad_norm": 4.705269403296674, "learning_rate": 1.9998049058778745e-06, "loss": 1.4539, "step": 3732 }, { "epoch": 0.03, "grad_norm": 5.343650845199192, "learning_rate": 1.9998048010342517e-06, "loss": 1.4299, "step": 3733 }, { "epoch": 0.03, "grad_norm": 4.761090749890984, "learning_rate": 1.9998046961624677e-06, "loss": 1.376, "step": 3734 }, { "epoch": 0.03, "grad_norm": 4.47848246597197, "learning_rate": 1.9998045912625223e-06, "loss": 1.3492, "step": 3735 }, { "epoch": 0.03, "grad_norm": 4.553841135265006, "learning_rate": 1.999804486334416e-06, "loss": 1.4707, "step": 3736 }, { "epoch": 0.03, "grad_norm": 5.112011524717292, "learning_rate": 1.999804381378148e-06, "loss": 1.3998, "step": 3737 }, { "epoch": 0.03, "grad_norm": 5.053186099424353, "learning_rate": 1.9998042763937197e-06, "loss": 1.3719, "step": 3738 }, { "epoch": 0.03, "grad_norm": 4.9280058328899665, "learning_rate": 1.9998041713811296e-06, "loss": 1.4414, "step": 3739 }, { "epoch": 0.03, "grad_norm": 4.927412166137148, "learning_rate": 1.9998040663403785e-06, "loss": 1.3497, "step": 3740 }, { "epoch": 0.03, "grad_norm": 4.868129380898875, "learning_rate": 1.999803961271466e-06, "loss": 1.4854, "step": 3741 }, { "epoch": 0.03, "grad_norm": 5.083726937764924, "learning_rate": 1.999803856174393e-06, "loss": 1.3262, "step": 3742 }, { "epoch": 0.03, "grad_norm": 4.680945628502999, "learning_rate": 1.9998037510491585e-06, "loss": 1.4529, "step": 3743 }, { "epoch": 0.03, "grad_norm": 9.213903521376912, "learning_rate": 1.9998036458957626e-06, "loss": 1.5463, "step": 3744 }, { "epoch": 0.03, "grad_norm": 4.604742089204025, "learning_rate": 1.999803540714206e-06, "loss": 1.4577, "step": 3745 }, { "epoch": 0.03, "grad_norm": 5.36229884657475, "learning_rate": 1.999803435504488e-06, "loss": 1.4056, "step": 3746 }, { "epoch": 0.03, "grad_norm": 4.600285403993327, "learning_rate": 1.9998033302666086e-06, "loss": 1.4862, "step": 3747 }, { "epoch": 0.03, "grad_norm": 5.265400100794315, "learning_rate": 1.9998032250005684e-06, "loss": 1.5089, "step": 3748 }, { "epoch": 0.03, "grad_norm": 5.1969214189492305, "learning_rate": 1.9998031197063673e-06, "loss": 1.4962, "step": 3749 }, { "epoch": 0.03, "grad_norm": 4.320190989350664, "learning_rate": 1.9998030143840045e-06, "loss": 1.3034, "step": 3750 }, { "epoch": 0.03, "grad_norm": 4.579075184163126, "learning_rate": 1.9998029090334813e-06, "loss": 1.4736, "step": 3751 }, { "epoch": 0.03, "grad_norm": 5.007083829002728, "learning_rate": 1.9998028036547963e-06, "loss": 1.3576, "step": 3752 }, { "epoch": 0.03, "grad_norm": 5.20578658851182, "learning_rate": 1.9998026982479504e-06, "loss": 1.3413, "step": 3753 }, { "epoch": 0.03, "grad_norm": 4.940850610449721, "learning_rate": 1.9998025928129432e-06, "loss": 1.5133, "step": 3754 }, { "epoch": 0.03, "grad_norm": 4.822761183175982, "learning_rate": 1.9998024873497756e-06, "loss": 1.2899, "step": 3755 }, { "epoch": 0.03, "grad_norm": 4.545241760698208, "learning_rate": 1.9998023818584462e-06, "loss": 1.3797, "step": 3756 }, { "epoch": 0.03, "grad_norm": 4.742084979843826, "learning_rate": 1.999802276338956e-06, "loss": 1.4692, "step": 3757 }, { "epoch": 0.03, "grad_norm": 4.658419562131754, "learning_rate": 1.9998021707913045e-06, "loss": 1.3793, "step": 3758 }, { "epoch": 0.03, "grad_norm": 5.183021678979285, "learning_rate": 1.999802065215492e-06, "loss": 1.4525, "step": 3759 }, { "epoch": 0.03, "grad_norm": 5.736088379622611, "learning_rate": 1.9998019596115183e-06, "loss": 1.2959, "step": 3760 }, { "epoch": 0.03, "grad_norm": 4.656062094359612, "learning_rate": 1.9998018539793837e-06, "loss": 1.3658, "step": 3761 }, { "epoch": 0.03, "grad_norm": 4.616745476907714, "learning_rate": 1.9998017483190878e-06, "loss": 1.4136, "step": 3762 }, { "epoch": 0.03, "grad_norm": 7.731566326102406, "learning_rate": 1.999801642630631e-06, "loss": 1.3751, "step": 3763 }, { "epoch": 0.03, "grad_norm": 5.432514416044134, "learning_rate": 1.999801536914013e-06, "loss": 1.3943, "step": 3764 }, { "epoch": 0.03, "grad_norm": 4.866033462628277, "learning_rate": 1.999801431169234e-06, "loss": 1.3415, "step": 3765 }, { "epoch": 0.03, "grad_norm": 5.468340002269182, "learning_rate": 1.9998013253962936e-06, "loss": 1.3156, "step": 3766 }, { "epoch": 0.03, "grad_norm": 4.647820771341247, "learning_rate": 1.9998012195951925e-06, "loss": 1.3947, "step": 3767 }, { "epoch": 0.03, "grad_norm": 4.657410377037526, "learning_rate": 1.99980111376593e-06, "loss": 1.4256, "step": 3768 }, { "epoch": 0.03, "grad_norm": 4.8464614615398505, "learning_rate": 1.9998010079085066e-06, "loss": 1.3297, "step": 3769 }, { "epoch": 0.03, "grad_norm": 4.332318170076133, "learning_rate": 1.9998009020229224e-06, "loss": 1.2867, "step": 3770 }, { "epoch": 0.03, "grad_norm": 4.878338350222502, "learning_rate": 1.999800796109177e-06, "loss": 1.38, "step": 3771 }, { "epoch": 0.03, "grad_norm": 4.493111995241049, "learning_rate": 1.9998006901672704e-06, "loss": 1.4075, "step": 3772 }, { "epoch": 0.03, "grad_norm": 4.469116344826484, "learning_rate": 1.9998005841972027e-06, "loss": 1.4129, "step": 3773 }, { "epoch": 0.03, "grad_norm": 4.5213045347853535, "learning_rate": 1.999800478198974e-06, "loss": 1.3233, "step": 3774 }, { "epoch": 0.03, "grad_norm": 4.65101410292212, "learning_rate": 1.999800372172584e-06, "loss": 1.3441, "step": 3775 }, { "epoch": 0.03, "grad_norm": 8.216969674240255, "learning_rate": 1.9998002661180334e-06, "loss": 1.2946, "step": 3776 }, { "epoch": 0.03, "grad_norm": 5.3767982950299755, "learning_rate": 1.9998001600353217e-06, "loss": 1.4547, "step": 3777 }, { "epoch": 0.03, "grad_norm": 6.118813225734277, "learning_rate": 1.9998000539244488e-06, "loss": 1.4563, "step": 3778 }, { "epoch": 0.03, "grad_norm": 4.720418776141597, "learning_rate": 1.999799947785415e-06, "loss": 1.305, "step": 3779 }, { "epoch": 0.03, "grad_norm": 4.516048109136921, "learning_rate": 1.9997998416182198e-06, "loss": 1.3212, "step": 3780 }, { "epoch": 0.03, "grad_norm": 4.62003964568539, "learning_rate": 1.9997997354228637e-06, "loss": 1.3838, "step": 3781 }, { "epoch": 0.03, "grad_norm": 5.0358945130998976, "learning_rate": 1.999799629199347e-06, "loss": 1.4947, "step": 3782 }, { "epoch": 0.03, "grad_norm": 4.5824414800490185, "learning_rate": 1.9997995229476686e-06, "loss": 1.3435, "step": 3783 }, { "epoch": 0.03, "grad_norm": 4.75725254853182, "learning_rate": 1.99979941666783e-06, "loss": 1.3627, "step": 3784 }, { "epoch": 0.03, "grad_norm": 5.14007088469601, "learning_rate": 1.9997993103598295e-06, "loss": 1.4974, "step": 3785 }, { "epoch": 0.03, "grad_norm": 4.60496266899906, "learning_rate": 1.9997992040236686e-06, "loss": 1.4381, "step": 3786 }, { "epoch": 0.03, "grad_norm": 4.287067707408165, "learning_rate": 1.999799097659346e-06, "loss": 1.4361, "step": 3787 }, { "epoch": 0.03, "grad_norm": 4.864411077361251, "learning_rate": 1.999798991266863e-06, "loss": 1.4978, "step": 3788 }, { "epoch": 0.03, "grad_norm": 6.476884055738115, "learning_rate": 1.999798884846219e-06, "loss": 1.5751, "step": 3789 }, { "epoch": 0.03, "grad_norm": 4.607204170305541, "learning_rate": 1.999798778397414e-06, "loss": 1.2613, "step": 3790 }, { "epoch": 0.03, "grad_norm": 4.513574436603101, "learning_rate": 1.9997986719204477e-06, "loss": 1.2507, "step": 3791 }, { "epoch": 0.03, "grad_norm": 4.397374539254503, "learning_rate": 1.9997985654153202e-06, "loss": 1.3785, "step": 3792 }, { "epoch": 0.03, "grad_norm": 4.7647312609589285, "learning_rate": 1.999798458882032e-06, "loss": 1.398, "step": 3793 }, { "epoch": 0.03, "grad_norm": 5.532986119411458, "learning_rate": 1.999798352320583e-06, "loss": 1.473, "step": 3794 }, { "epoch": 0.03, "grad_norm": 4.289540776605135, "learning_rate": 1.9997982457309727e-06, "loss": 1.254, "step": 3795 }, { "epoch": 0.03, "grad_norm": 4.650435124831928, "learning_rate": 1.9997981391132013e-06, "loss": 1.3785, "step": 3796 }, { "epoch": 0.03, "eval_loss": 1.6124725341796875, "eval_runtime": 4.6361, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 3796 }, { "epoch": 0.03, "grad_norm": 4.606995994308827, "learning_rate": 1.9997980324672695e-06, "loss": 1.4598, "step": 3797 }, { "epoch": 0.03, "grad_norm": 4.628886714284138, "learning_rate": 1.9997979257931764e-06, "loss": 1.4204, "step": 3798 }, { "epoch": 0.03, "grad_norm": 4.472096608209409, "learning_rate": 1.9997978190909223e-06, "loss": 1.2646, "step": 3799 }, { "epoch": 0.03, "grad_norm": 6.899108769896611, "learning_rate": 1.999797712360507e-06, "loss": 1.7302, "step": 3800 }, { "epoch": 0.03, "grad_norm": 11.698902110094348, "learning_rate": 1.999797605601931e-06, "loss": 1.2991, "step": 3801 }, { "epoch": 0.03, "grad_norm": 4.678694191733831, "learning_rate": 1.999797498815194e-06, "loss": 1.3871, "step": 3802 }, { "epoch": 0.03, "grad_norm": 4.623073739366768, "learning_rate": 1.999797392000296e-06, "loss": 1.4762, "step": 3803 }, { "epoch": 0.03, "grad_norm": 4.528916340724794, "learning_rate": 1.999797285157237e-06, "loss": 1.2994, "step": 3804 }, { "epoch": 0.03, "grad_norm": 5.120009553724364, "learning_rate": 1.9997971782860172e-06, "loss": 1.4481, "step": 3805 }, { "epoch": 0.03, "grad_norm": 4.646312172610378, "learning_rate": 1.9997970713866358e-06, "loss": 1.4237, "step": 3806 }, { "epoch": 0.03, "grad_norm": 4.824187645447148, "learning_rate": 1.9997969644590943e-06, "loss": 1.3742, "step": 3807 }, { "epoch": 0.03, "grad_norm": 4.707453765528019, "learning_rate": 1.999796857503391e-06, "loss": 1.358, "step": 3808 }, { "epoch": 0.03, "grad_norm": 4.874962378800376, "learning_rate": 1.9997967505195274e-06, "loss": 1.4034, "step": 3809 }, { "epoch": 0.03, "grad_norm": 5.728054711081159, "learning_rate": 1.999796643507503e-06, "loss": 1.4522, "step": 3810 }, { "epoch": 0.03, "grad_norm": 4.488435565656918, "learning_rate": 1.999796536467317e-06, "loss": 1.3188, "step": 3811 }, { "epoch": 0.03, "grad_norm": 5.309632018627234, "learning_rate": 1.9997964293989703e-06, "loss": 1.4794, "step": 3812 }, { "epoch": 0.03, "grad_norm": 5.21029005693669, "learning_rate": 1.9997963223024627e-06, "loss": 1.4637, "step": 3813 }, { "epoch": 0.03, "grad_norm": 4.719975810384763, "learning_rate": 1.999796215177794e-06, "loss": 1.4294, "step": 3814 }, { "epoch": 0.03, "grad_norm": 4.947061511680672, "learning_rate": 1.9997961080249648e-06, "loss": 1.2268, "step": 3815 }, { "epoch": 0.03, "grad_norm": 5.717301549581428, "learning_rate": 1.9997960008439745e-06, "loss": 1.5564, "step": 3816 }, { "epoch": 0.03, "grad_norm": 4.337568360997204, "learning_rate": 1.999795893634823e-06, "loss": 1.3617, "step": 3817 }, { "epoch": 0.03, "grad_norm": 4.714909910098067, "learning_rate": 1.999795786397511e-06, "loss": 1.3707, "step": 3818 }, { "epoch": 0.03, "grad_norm": 4.863348635773201, "learning_rate": 1.999795679132038e-06, "loss": 1.5601, "step": 3819 }, { "epoch": 0.03, "grad_norm": 4.725409416597695, "learning_rate": 1.999795571838404e-06, "loss": 1.5155, "step": 3820 }, { "epoch": 0.03, "grad_norm": 4.390907850020624, "learning_rate": 1.9997954645166087e-06, "loss": 1.3048, "step": 3821 }, { "epoch": 0.03, "grad_norm": 5.204362569456592, "learning_rate": 1.9997953571666528e-06, "loss": 1.4248, "step": 3822 }, { "epoch": 0.03, "grad_norm": 4.373634118732567, "learning_rate": 1.9997952497885363e-06, "loss": 1.4316, "step": 3823 }, { "epoch": 0.03, "grad_norm": 4.830427217746328, "learning_rate": 1.999795142382258e-06, "loss": 1.5032, "step": 3824 }, { "epoch": 0.03, "grad_norm": 5.35064585778059, "learning_rate": 1.99979503494782e-06, "loss": 1.3917, "step": 3825 }, { "epoch": 0.03, "grad_norm": 4.621090262533992, "learning_rate": 1.99979492748522e-06, "loss": 1.4374, "step": 3826 }, { "epoch": 0.03, "grad_norm": 4.486560370946347, "learning_rate": 1.9997948199944597e-06, "loss": 1.3844, "step": 3827 }, { "epoch": 0.03, "grad_norm": 4.89173032023379, "learning_rate": 1.9997947124755385e-06, "loss": 1.4796, "step": 3828 }, { "epoch": 0.03, "grad_norm": 4.968544330306352, "learning_rate": 1.9997946049284563e-06, "loss": 1.5391, "step": 3829 }, { "epoch": 0.03, "grad_norm": 5.432121217331672, "learning_rate": 1.9997944973532133e-06, "loss": 1.3526, "step": 3830 }, { "epoch": 0.03, "grad_norm": 4.433372309371036, "learning_rate": 1.9997943897498094e-06, "loss": 1.3612, "step": 3831 }, { "epoch": 0.03, "grad_norm": 4.388748973282565, "learning_rate": 1.9997942821182442e-06, "loss": 1.2879, "step": 3832 }, { "epoch": 0.03, "grad_norm": 5.522001370681306, "learning_rate": 1.9997941744585186e-06, "loss": 1.3845, "step": 3833 }, { "epoch": 0.03, "grad_norm": 5.064642846647261, "learning_rate": 1.999794066770632e-06, "loss": 1.4568, "step": 3834 }, { "epoch": 0.03, "grad_norm": 4.99394851245948, "learning_rate": 1.9997939590545846e-06, "loss": 1.3925, "step": 3835 }, { "epoch": 0.03, "grad_norm": 6.180590276962394, "learning_rate": 1.9997938513103763e-06, "loss": 1.4198, "step": 3836 }, { "epoch": 0.03, "grad_norm": 4.625626685100316, "learning_rate": 1.999793743538007e-06, "loss": 1.4152, "step": 3837 }, { "epoch": 0.03, "grad_norm": 5.7309361359320725, "learning_rate": 1.999793635737477e-06, "loss": 1.448, "step": 3838 }, { "epoch": 0.03, "grad_norm": 4.700706207946969, "learning_rate": 1.9997935279087857e-06, "loss": 1.4112, "step": 3839 }, { "epoch": 0.03, "grad_norm": 4.344275675891425, "learning_rate": 1.9997934200519343e-06, "loss": 1.3493, "step": 3840 }, { "epoch": 0.03, "grad_norm": 5.112163685600387, "learning_rate": 1.9997933121669216e-06, "loss": 1.4048, "step": 3841 }, { "epoch": 0.03, "grad_norm": 4.620444881119614, "learning_rate": 1.999793204253748e-06, "loss": 1.48, "step": 3842 }, { "epoch": 0.03, "grad_norm": 4.318406868845656, "learning_rate": 1.9997930963124135e-06, "loss": 1.32, "step": 3843 }, { "epoch": 0.03, "grad_norm": 4.786834196466122, "learning_rate": 1.999792988342918e-06, "loss": 1.4818, "step": 3844 }, { "epoch": 0.03, "grad_norm": 4.616685881971367, "learning_rate": 1.9997928803452624e-06, "loss": 1.3424, "step": 3845 }, { "epoch": 0.03, "grad_norm": 10.976292065630325, "learning_rate": 1.9997927723194453e-06, "loss": 1.529, "step": 3846 }, { "epoch": 0.03, "grad_norm": 5.479056535258812, "learning_rate": 1.9997926642654677e-06, "loss": 1.3713, "step": 3847 }, { "epoch": 0.03, "grad_norm": 5.29815155050577, "learning_rate": 1.9997925561833293e-06, "loss": 1.4513, "step": 3848 }, { "epoch": 0.03, "grad_norm": 4.642297981539935, "learning_rate": 1.99979244807303e-06, "loss": 1.2959, "step": 3849 }, { "epoch": 0.03, "grad_norm": 5.1046013534719465, "learning_rate": 1.9997923399345697e-06, "loss": 1.4979, "step": 3850 }, { "epoch": 0.03, "grad_norm": 4.646559943022004, "learning_rate": 1.9997922317679486e-06, "loss": 1.444, "step": 3851 }, { "epoch": 0.03, "grad_norm": 5.21849530959606, "learning_rate": 1.9997921235731667e-06, "loss": 1.5293, "step": 3852 }, { "epoch": 0.03, "grad_norm": 5.091788114990917, "learning_rate": 1.999792015350224e-06, "loss": 1.3336, "step": 3853 }, { "epoch": 0.03, "grad_norm": 5.027116597468565, "learning_rate": 1.9997919070991205e-06, "loss": 1.4953, "step": 3854 }, { "epoch": 0.03, "grad_norm": 4.351333006720373, "learning_rate": 1.999791798819856e-06, "loss": 1.2278, "step": 3855 }, { "epoch": 0.03, "grad_norm": 5.065186193436848, "learning_rate": 1.999791690512431e-06, "loss": 1.5342, "step": 3856 }, { "epoch": 0.03, "grad_norm": 5.055266043001361, "learning_rate": 1.9997915821768453e-06, "loss": 1.4839, "step": 3857 }, { "epoch": 0.03, "grad_norm": 4.4172809483128495, "learning_rate": 1.9997914738130985e-06, "loss": 1.3977, "step": 3858 }, { "epoch": 0.03, "grad_norm": 5.314641655648586, "learning_rate": 1.9997913654211908e-06, "loss": 1.452, "step": 3859 }, { "epoch": 0.03, "grad_norm": 5.308280921675756, "learning_rate": 1.9997912570011226e-06, "loss": 1.4482, "step": 3860 }, { "epoch": 0.03, "grad_norm": 4.735078171685458, "learning_rate": 1.9997911485528935e-06, "loss": 1.3584, "step": 3861 }, { "epoch": 0.03, "grad_norm": 4.8743091116457995, "learning_rate": 1.9997910400765036e-06, "loss": 1.4825, "step": 3862 }, { "epoch": 0.03, "grad_norm": 4.46078710955025, "learning_rate": 1.9997909315719528e-06, "loss": 1.2225, "step": 3863 }, { "epoch": 0.03, "grad_norm": 5.185001019529193, "learning_rate": 1.999790823039241e-06, "loss": 1.1979, "step": 3864 }, { "epoch": 0.03, "grad_norm": 5.476108668212412, "learning_rate": 1.999790714478369e-06, "loss": 1.5096, "step": 3865 }, { "epoch": 0.03, "grad_norm": 5.333965154940147, "learning_rate": 1.999790605889336e-06, "loss": 1.4091, "step": 3866 }, { "epoch": 0.03, "grad_norm": 4.971115470731839, "learning_rate": 1.999790497272142e-06, "loss": 1.3178, "step": 3867 }, { "epoch": 0.03, "grad_norm": 4.595571343395057, "learning_rate": 1.9997903886267876e-06, "loss": 1.4777, "step": 3868 }, { "epoch": 0.03, "grad_norm": 6.804248226375533, "learning_rate": 1.9997902799532724e-06, "loss": 1.5475, "step": 3869 }, { "epoch": 0.03, "eval_loss": 1.609044075012207, "eval_runtime": 4.6325, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 3869 } ], "logging_steps": 1, "max_steps": 591956, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 73, "total_flos": 405018771456000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }