{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 15.0, "global_step": 1575, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019047619047619048, "grad_norm": 6.249958182433422, "learning_rate": 2.0833333333333333e-07, "loss": 1.2548, "step": 1 }, { "epoch": 0.0038095238095238095, "grad_norm": 7.3275358434995885, "learning_rate": 4.1666666666666667e-07, "loss": 1.1995, "step": 2 }, { "epoch": 0.005714285714285714, "grad_norm": 7.962704464287428, "learning_rate": 6.25e-07, "loss": 1.3552, "step": 3 }, { "epoch": 0.007619047619047619, "grad_norm": 6.834862381995382, "learning_rate": 8.333333333333333e-07, "loss": 1.2567, "step": 4 }, { "epoch": 0.009523809523809525, "grad_norm": 6.670482016594432, "learning_rate": 1.0416666666666667e-06, "loss": 1.2178, "step": 5 }, { "epoch": 0.011428571428571429, "grad_norm": 6.228408996779406, "learning_rate": 1.25e-06, "loss": 1.4297, "step": 6 }, { "epoch": 0.013333333333333334, "grad_norm": 7.978127041228856, "learning_rate": 1.4583333333333335e-06, "loss": 1.2592, "step": 7 }, { "epoch": 0.015238095238095238, "grad_norm": 6.521728745252264, "learning_rate": 1.6666666666666667e-06, "loss": 1.5888, "step": 8 }, { "epoch": 0.017142857142857144, "grad_norm": 6.430439812900178, "learning_rate": 1.8750000000000003e-06, "loss": 1.3054, "step": 9 }, { "epoch": 0.01904761904761905, "grad_norm": 5.440801146286208, "learning_rate": 2.0833333333333334e-06, "loss": 1.2474, "step": 10 }, { "epoch": 0.02095238095238095, "grad_norm": 4.484406039167107, "learning_rate": 2.2916666666666666e-06, "loss": 1.2364, "step": 11 }, { "epoch": 0.022857142857142857, "grad_norm": 4.551409690261343, "learning_rate": 2.5e-06, "loss": 1.2083, "step": 12 }, { "epoch": 0.024761904761904763, "grad_norm": 5.807313642046851, "learning_rate": 2.7083333333333334e-06, "loss": 1.4471, "step": 13 }, { "epoch": 0.02666666666666667, "grad_norm": 4.71193646005634, "learning_rate": 2.916666666666667e-06, "loss": 1.1337, "step": 14 }, { "epoch": 0.02857142857142857, "grad_norm": 4.842345764051664, "learning_rate": 3.125e-06, "loss": 1.1057, "step": 15 }, { "epoch": 0.030476190476190476, "grad_norm": 5.126936632402324, "learning_rate": 3.3333333333333333e-06, "loss": 1.1551, "step": 16 }, { "epoch": 0.03238095238095238, "grad_norm": 3.9078888869265036, "learning_rate": 3.5416666666666673e-06, "loss": 1.0285, "step": 17 }, { "epoch": 0.03428571428571429, "grad_norm": 4.298236902242096, "learning_rate": 3.7500000000000005e-06, "loss": 1.0764, "step": 18 }, { "epoch": 0.03619047619047619, "grad_norm": 3.7114863245838463, "learning_rate": 3.958333333333333e-06, "loss": 1.1386, "step": 19 }, { "epoch": 0.0380952380952381, "grad_norm": 3.9206535517036656, "learning_rate": 4.166666666666667e-06, "loss": 1.0228, "step": 20 }, { "epoch": 0.04, "grad_norm": 3.7918216997086343, "learning_rate": 4.3750000000000005e-06, "loss": 1.1698, "step": 21 }, { "epoch": 0.0419047619047619, "grad_norm": 4.133208891764793, "learning_rate": 4.583333333333333e-06, "loss": 1.0701, "step": 22 }, { "epoch": 0.04380952380952381, "grad_norm": 3.3540327254448496, "learning_rate": 4.791666666666668e-06, "loss": 1.1161, "step": 23 }, { "epoch": 0.045714285714285714, "grad_norm": 4.539012815101767, "learning_rate": 5e-06, "loss": 1.1009, "step": 24 }, { "epoch": 0.047619047619047616, "grad_norm": 3.4906605922972735, "learning_rate": 5.208333333333334e-06, "loss": 1.0132, "step": 25 }, { "epoch": 0.049523809523809526, "grad_norm": 3.0938294959210357, "learning_rate": 5.416666666666667e-06, "loss": 1.0779, "step": 26 }, { "epoch": 0.05142857142857143, "grad_norm": 3.706381579068903, "learning_rate": 5.625e-06, "loss": 1.0345, "step": 27 }, { "epoch": 0.05333333333333334, "grad_norm": 3.0374300244693315, "learning_rate": 5.833333333333334e-06, "loss": 1.1701, "step": 28 }, { "epoch": 0.05523809523809524, "grad_norm": 3.6018268851222794, "learning_rate": 6.041666666666667e-06, "loss": 1.0996, "step": 29 }, { "epoch": 0.05714285714285714, "grad_norm": 4.078241688010774, "learning_rate": 6.25e-06, "loss": 1.0101, "step": 30 }, { "epoch": 0.05904761904761905, "grad_norm": 2.5532194989459005, "learning_rate": 6.458333333333334e-06, "loss": 1.0543, "step": 31 }, { "epoch": 0.06095238095238095, "grad_norm": 3.016442373029697, "learning_rate": 6.666666666666667e-06, "loss": 1.1279, "step": 32 }, { "epoch": 0.06285714285714286, "grad_norm": 3.6902114984607413, "learning_rate": 6.875e-06, "loss": 0.9879, "step": 33 }, { "epoch": 0.06476190476190476, "grad_norm": 2.7977555469861115, "learning_rate": 7.083333333333335e-06, "loss": 0.8825, "step": 34 }, { "epoch": 0.06666666666666667, "grad_norm": 3.510142851115604, "learning_rate": 7.291666666666667e-06, "loss": 1.023, "step": 35 }, { "epoch": 0.06857142857142857, "grad_norm": 2.8452663535625113, "learning_rate": 7.500000000000001e-06, "loss": 0.9159, "step": 36 }, { "epoch": 0.07047619047619047, "grad_norm": 3.148967190761688, "learning_rate": 7.708333333333334e-06, "loss": 0.9985, "step": 37 }, { "epoch": 0.07238095238095238, "grad_norm": 4.712123502415102, "learning_rate": 7.916666666666667e-06, "loss": 1.2326, "step": 38 }, { "epoch": 0.07428571428571429, "grad_norm": 3.653286393810381, "learning_rate": 8.125000000000001e-06, "loss": 0.9915, "step": 39 }, { "epoch": 0.0761904761904762, "grad_norm": 3.0858499783164475, "learning_rate": 8.333333333333334e-06, "loss": 1.1663, "step": 40 }, { "epoch": 0.07809523809523809, "grad_norm": 3.5143089698882473, "learning_rate": 8.541666666666666e-06, "loss": 1.0356, "step": 41 }, { "epoch": 0.08, "grad_norm": 3.2811146063203975, "learning_rate": 8.750000000000001e-06, "loss": 1.0228, "step": 42 }, { "epoch": 0.08190476190476191, "grad_norm": 3.3011687607119127, "learning_rate": 8.958333333333334e-06, "loss": 1.1049, "step": 43 }, { "epoch": 0.0838095238095238, "grad_norm": 3.210258584400171, "learning_rate": 9.166666666666666e-06, "loss": 0.8777, "step": 44 }, { "epoch": 0.08571428571428572, "grad_norm": 3.441412832078474, "learning_rate": 9.375000000000001e-06, "loss": 1.137, "step": 45 }, { "epoch": 0.08761904761904762, "grad_norm": 2.5508244468454624, "learning_rate": 9.583333333333335e-06, "loss": 0.998, "step": 46 }, { "epoch": 0.08952380952380952, "grad_norm": 3.093215332044739, "learning_rate": 9.791666666666666e-06, "loss": 1.0617, "step": 47 }, { "epoch": 0.09142857142857143, "grad_norm": 3.2469121174531073, "learning_rate": 1e-05, "loss": 0.8773, "step": 48 }, { "epoch": 0.09333333333333334, "grad_norm": 3.4805042775317765, "learning_rate": 9.99998941815181e-06, "loss": 1.1255, "step": 49 }, { "epoch": 0.09523809523809523, "grad_norm": 3.1642573627760986, "learning_rate": 9.999957672652028e-06, "loss": 1.1809, "step": 50 }, { "epoch": 0.09714285714285714, "grad_norm": 2.586521545518349, "learning_rate": 9.999904763635027e-06, "loss": 0.8819, "step": 51 }, { "epoch": 0.09904761904761905, "grad_norm": 3.1568432074306565, "learning_rate": 9.999830691324755e-06, "loss": 1.0084, "step": 52 }, { "epoch": 0.10095238095238095, "grad_norm": 3.4278518629452703, "learning_rate": 9.99973545603474e-06, "loss": 1.1908, "step": 53 }, { "epoch": 0.10285714285714286, "grad_norm": 3.1248779175425794, "learning_rate": 9.99961905816809e-06, "loss": 1.0985, "step": 54 }, { "epoch": 0.10476190476190476, "grad_norm": 2.7546079451843615, "learning_rate": 9.999481498217488e-06, "loss": 0.9914, "step": 55 }, { "epoch": 0.10666666666666667, "grad_norm": 3.038797998164753, "learning_rate": 9.999322776765187e-06, "loss": 0.9948, "step": 56 }, { "epoch": 0.10857142857142857, "grad_norm": 3.5025783028380375, "learning_rate": 9.999142894483013e-06, "loss": 1.2092, "step": 57 }, { "epoch": 0.11047619047619048, "grad_norm": 2.9501991103847724, "learning_rate": 9.998941852132364e-06, "loss": 1.1009, "step": 58 }, { "epoch": 0.11238095238095239, "grad_norm": 3.0826054414101063, "learning_rate": 9.998719650564197e-06, "loss": 1.089, "step": 59 }, { "epoch": 0.11428571428571428, "grad_norm": 3.3237722878132065, "learning_rate": 9.998476290719035e-06, "loss": 1.0187, "step": 60 }, { "epoch": 0.11619047619047619, "grad_norm": 3.2244788165958256, "learning_rate": 9.998211773626955e-06, "loss": 1.0183, "step": 61 }, { "epoch": 0.1180952380952381, "grad_norm": 3.3388884723048493, "learning_rate": 9.99792610040759e-06, "loss": 0.9636, "step": 62 }, { "epoch": 0.12, "grad_norm": 3.1562420302292793, "learning_rate": 9.99761927227012e-06, "loss": 0.9614, "step": 63 }, { "epoch": 0.1219047619047619, "grad_norm": 3.267673603904302, "learning_rate": 9.997291290513268e-06, "loss": 1.0174, "step": 64 }, { "epoch": 0.12380952380952381, "grad_norm": 2.8273452169820876, "learning_rate": 9.996942156525298e-06, "loss": 1.1283, "step": 65 }, { "epoch": 0.12571428571428572, "grad_norm": 3.8370109921118014, "learning_rate": 9.996571871784e-06, "loss": 0.9912, "step": 66 }, { "epoch": 0.12761904761904763, "grad_norm": 2.700249267077012, "learning_rate": 9.996180437856695e-06, "loss": 1.0158, "step": 67 }, { "epoch": 0.1295238095238095, "grad_norm": 3.3193444159035566, "learning_rate": 9.995767856400218e-06, "loss": 1.1443, "step": 68 }, { "epoch": 0.13142857142857142, "grad_norm": 2.5774813529521414, "learning_rate": 9.99533412916092e-06, "loss": 1.0178, "step": 69 }, { "epoch": 0.13333333333333333, "grad_norm": 2.9479981129016433, "learning_rate": 9.994879257974657e-06, "loss": 1.0722, "step": 70 }, { "epoch": 0.13523809523809524, "grad_norm": 2.886095673232496, "learning_rate": 9.994403244766779e-06, "loss": 0.9772, "step": 71 }, { "epoch": 0.13714285714285715, "grad_norm": 2.660783801859094, "learning_rate": 9.993906091552124e-06, "loss": 1.091, "step": 72 }, { "epoch": 0.13904761904761906, "grad_norm": 3.084097105820264, "learning_rate": 9.993387800435015e-06, "loss": 1.1529, "step": 73 }, { "epoch": 0.14095238095238094, "grad_norm": 4.62363254320876, "learning_rate": 9.992848373609241e-06, "loss": 0.9261, "step": 74 }, { "epoch": 0.14285714285714285, "grad_norm": 2.9860719936761053, "learning_rate": 9.992287813358057e-06, "loss": 1.0594, "step": 75 }, { "epoch": 0.14476190476190476, "grad_norm": 2.691342320793716, "learning_rate": 9.991706122054166e-06, "loss": 1.1088, "step": 76 }, { "epoch": 0.14666666666666667, "grad_norm": 2.9594318822617596, "learning_rate": 9.991103302159717e-06, "loss": 1.0433, "step": 77 }, { "epoch": 0.14857142857142858, "grad_norm": 3.153330979738241, "learning_rate": 9.990479356226289e-06, "loss": 1.2322, "step": 78 }, { "epoch": 0.15047619047619049, "grad_norm": 3.3307200756954303, "learning_rate": 9.989834286894882e-06, "loss": 0.9688, "step": 79 }, { "epoch": 0.1523809523809524, "grad_norm": 2.858366771965065, "learning_rate": 9.989168096895909e-06, "loss": 1.1716, "step": 80 }, { "epoch": 0.15428571428571428, "grad_norm": 3.530158228069321, "learning_rate": 9.988480789049176e-06, "loss": 1.0263, "step": 81 }, { "epoch": 0.15619047619047619, "grad_norm": 3.698678311272925, "learning_rate": 9.987772366263876e-06, "loss": 1.0112, "step": 82 }, { "epoch": 0.1580952380952381, "grad_norm": 3.9611270751486485, "learning_rate": 9.987042831538585e-06, "loss": 1.119, "step": 83 }, { "epoch": 0.16, "grad_norm": 3.3068891014783226, "learning_rate": 9.986292187961227e-06, "loss": 1.083, "step": 84 }, { "epoch": 0.1619047619047619, "grad_norm": 4.071158215091613, "learning_rate": 9.98552043870908e-06, "loss": 1.0478, "step": 85 }, { "epoch": 0.16380952380952382, "grad_norm": 3.7596202230204834, "learning_rate": 9.98472758704876e-06, "loss": 0.9647, "step": 86 }, { "epoch": 0.1657142857142857, "grad_norm": 3.254331891200369, "learning_rate": 9.983913636336204e-06, "loss": 1.215, "step": 87 }, { "epoch": 0.1676190476190476, "grad_norm": 3.607528562832131, "learning_rate": 9.983078590016648e-06, "loss": 0.964, "step": 88 }, { "epoch": 0.16952380952380952, "grad_norm": 3.4095129182391, "learning_rate": 9.98222245162463e-06, "loss": 1.0764, "step": 89 }, { "epoch": 0.17142857142857143, "grad_norm": 3.1120779231959905, "learning_rate": 9.981345224783955e-06, "loss": 1.0883, "step": 90 }, { "epoch": 0.17333333333333334, "grad_norm": 3.4225409802187063, "learning_rate": 9.9804469132077e-06, "loss": 1.0097, "step": 91 }, { "epoch": 0.17523809523809525, "grad_norm": 4.701654702406365, "learning_rate": 9.979527520698182e-06, "loss": 1.0826, "step": 92 }, { "epoch": 0.17714285714285713, "grad_norm": 3.259130195982121, "learning_rate": 9.97858705114695e-06, "loss": 1.0748, "step": 93 }, { "epoch": 0.17904761904761904, "grad_norm": 3.0639511082253965, "learning_rate": 9.977625508534767e-06, "loss": 1.1188, "step": 94 }, { "epoch": 0.18095238095238095, "grad_norm": 2.932982742967428, "learning_rate": 9.976642896931592e-06, "loss": 1.0619, "step": 95 }, { "epoch": 0.18285714285714286, "grad_norm": 2.9226827615732605, "learning_rate": 9.975639220496564e-06, "loss": 1.02, "step": 96 }, { "epoch": 0.18476190476190477, "grad_norm": 2.364194910815137, "learning_rate": 9.974614483477983e-06, "loss": 1.0915, "step": 97 }, { "epoch": 0.18666666666666668, "grad_norm": 2.965291909603285, "learning_rate": 9.973568690213292e-06, "loss": 1.0658, "step": 98 }, { "epoch": 0.18857142857142858, "grad_norm": 3.183868457710933, "learning_rate": 9.972501845129064e-06, "loss": 1.2201, "step": 99 }, { "epoch": 0.19047619047619047, "grad_norm": 3.239515554221448, "learning_rate": 9.971413952740975e-06, "loss": 1.2212, "step": 100 }, { "epoch": 0.19238095238095237, "grad_norm": 4.114193945478856, "learning_rate": 9.97030501765379e-06, "loss": 1.1717, "step": 101 }, { "epoch": 0.19428571428571428, "grad_norm": 2.4733859830692415, "learning_rate": 9.96917504456134e-06, "loss": 1.0785, "step": 102 }, { "epoch": 0.1961904761904762, "grad_norm": 2.6776931241814, "learning_rate": 9.968024038246511e-06, "loss": 1.0503, "step": 103 }, { "epoch": 0.1980952380952381, "grad_norm": 2.8687433329822376, "learning_rate": 9.966852003581211e-06, "loss": 1.1634, "step": 104 }, { "epoch": 0.2, "grad_norm": 2.6125526624258764, "learning_rate": 9.965658945526354e-06, "loss": 0.9906, "step": 105 }, { "epoch": 0.2019047619047619, "grad_norm": 3.344534458296087, "learning_rate": 9.964444869131846e-06, "loss": 1.085, "step": 106 }, { "epoch": 0.2038095238095238, "grad_norm": 3.222299641905838, "learning_rate": 9.963209779536559e-06, "loss": 0.9999, "step": 107 }, { "epoch": 0.2057142857142857, "grad_norm": 3.0703693594229735, "learning_rate": 9.961953681968296e-06, "loss": 0.9924, "step": 108 }, { "epoch": 0.20761904761904762, "grad_norm": 2.8728815593750983, "learning_rate": 9.960676581743802e-06, "loss": 1.0864, "step": 109 }, { "epoch": 0.20952380952380953, "grad_norm": 3.2520040087112676, "learning_rate": 9.959378484268702e-06, "loss": 1.032, "step": 110 }, { "epoch": 0.21142857142857144, "grad_norm": 2.848023660479766, "learning_rate": 9.958059395037505e-06, "loss": 0.9938, "step": 111 }, { "epoch": 0.21333333333333335, "grad_norm": 3.03683212722849, "learning_rate": 9.956719319633574e-06, "loss": 1.052, "step": 112 }, { "epoch": 0.21523809523809523, "grad_norm": 3.534186781153816, "learning_rate": 9.955358263729098e-06, "loss": 1.0652, "step": 113 }, { "epoch": 0.21714285714285714, "grad_norm": 2.8102076591175082, "learning_rate": 9.95397623308507e-06, "loss": 0.9547, "step": 114 }, { "epoch": 0.21904761904761905, "grad_norm": 2.3182734267227882, "learning_rate": 9.952573233551268e-06, "loss": 1.3116, "step": 115 }, { "epoch": 0.22095238095238096, "grad_norm": 3.689992437884337, "learning_rate": 9.951149271066221e-06, "loss": 0.9639, "step": 116 }, { "epoch": 0.22285714285714286, "grad_norm": 3.1117182277237965, "learning_rate": 9.949704351657192e-06, "loss": 1.0475, "step": 117 }, { "epoch": 0.22476190476190477, "grad_norm": 3.2894246016451336, "learning_rate": 9.948238481440149e-06, "loss": 0.996, "step": 118 }, { "epoch": 0.22666666666666666, "grad_norm": 3.095497763452724, "learning_rate": 9.946751666619736e-06, "loss": 0.9967, "step": 119 }, { "epoch": 0.22857142857142856, "grad_norm": 2.5824494132376974, "learning_rate": 9.945243913489255e-06, "loss": 1.0582, "step": 120 }, { "epoch": 0.23047619047619047, "grad_norm": 3.4476576109467914, "learning_rate": 9.943715228430632e-06, "loss": 1.0397, "step": 121 }, { "epoch": 0.23238095238095238, "grad_norm": 4.299143325553733, "learning_rate": 9.94216561791439e-06, "loss": 1.4879, "step": 122 }, { "epoch": 0.2342857142857143, "grad_norm": 2.6941969303087703, "learning_rate": 9.940595088499628e-06, "loss": 1.1719, "step": 123 }, { "epoch": 0.2361904761904762, "grad_norm": 2.9075149270426968, "learning_rate": 9.939003646833985e-06, "loss": 1.0605, "step": 124 }, { "epoch": 0.23809523809523808, "grad_norm": 2.2748068715580665, "learning_rate": 9.937391299653622e-06, "loss": 1.0057, "step": 125 }, { "epoch": 0.24, "grad_norm": 3.8992899892410335, "learning_rate": 9.935758053783183e-06, "loss": 1.0813, "step": 126 }, { "epoch": 0.2419047619047619, "grad_norm": 3.0096302665768326, "learning_rate": 9.934103916135772e-06, "loss": 1.031, "step": 127 }, { "epoch": 0.2438095238095238, "grad_norm": 3.531846236903906, "learning_rate": 9.932428893712923e-06, "loss": 1.1547, "step": 128 }, { "epoch": 0.24571428571428572, "grad_norm": 3.391883343568946, "learning_rate": 9.930732993604566e-06, "loss": 1.0361, "step": 129 }, { "epoch": 0.24761904761904763, "grad_norm": 3.5977577229850755, "learning_rate": 9.929016222989009e-06, "loss": 1.0023, "step": 130 }, { "epoch": 0.24952380952380954, "grad_norm": 3.7571434809125153, "learning_rate": 9.927278589132891e-06, "loss": 1.1339, "step": 131 }, { "epoch": 0.25142857142857145, "grad_norm": 2.5261761857089344, "learning_rate": 9.925520099391164e-06, "loss": 1.105, "step": 132 }, { "epoch": 0.25333333333333335, "grad_norm": 2.518620573737334, "learning_rate": 9.923740761207055e-06, "loss": 0.965, "step": 133 }, { "epoch": 0.25523809523809526, "grad_norm": 3.374631413724398, "learning_rate": 9.921940582112041e-06, "loss": 1.0943, "step": 134 }, { "epoch": 0.2571428571428571, "grad_norm": 2.8350601779277236, "learning_rate": 9.920119569725811e-06, "loss": 1.0307, "step": 135 }, { "epoch": 0.259047619047619, "grad_norm": 3.078359484666857, "learning_rate": 9.918277731756234e-06, "loss": 1.2088, "step": 136 }, { "epoch": 0.26095238095238094, "grad_norm": 2.8683287365781154, "learning_rate": 9.91641507599933e-06, "loss": 0.9529, "step": 137 }, { "epoch": 0.26285714285714284, "grad_norm": 2.3865571133140207, "learning_rate": 9.914531610339236e-06, "loss": 1.1474, "step": 138 }, { "epoch": 0.26476190476190475, "grad_norm": 2.7123759498479436, "learning_rate": 9.912627342748168e-06, "loss": 1.045, "step": 139 }, { "epoch": 0.26666666666666666, "grad_norm": 3.4343711498632015, "learning_rate": 9.910702281286401e-06, "loss": 1.0996, "step": 140 }, { "epoch": 0.26857142857142857, "grad_norm": 3.3983243671309165, "learning_rate": 9.908756434102212e-06, "loss": 1.141, "step": 141 }, { "epoch": 0.2704761904761905, "grad_norm": 2.9951907358912306, "learning_rate": 9.906789809431867e-06, "loss": 0.9749, "step": 142 }, { "epoch": 0.2723809523809524, "grad_norm": 2.548630418636751, "learning_rate": 9.904802415599578e-06, "loss": 1.1229, "step": 143 }, { "epoch": 0.2742857142857143, "grad_norm": 3.1479147398397562, "learning_rate": 9.90279426101746e-06, "loss": 0.9784, "step": 144 }, { "epoch": 0.2761904761904762, "grad_norm": 2.9129195354416386, "learning_rate": 9.900765354185508e-06, "loss": 1.1494, "step": 145 }, { "epoch": 0.2780952380952381, "grad_norm": 3.021162287467609, "learning_rate": 9.898715703691562e-06, "loss": 1.075, "step": 146 }, { "epoch": 0.28, "grad_norm": 2.9927487776012365, "learning_rate": 9.896645318211251e-06, "loss": 1.0392, "step": 147 }, { "epoch": 0.2819047619047619, "grad_norm": 3.2610420257422432, "learning_rate": 9.894554206507982e-06, "loss": 0.9937, "step": 148 }, { "epoch": 0.2838095238095238, "grad_norm": 2.9699607268831065, "learning_rate": 9.892442377432882e-06, "loss": 1.1378, "step": 149 }, { "epoch": 0.2857142857142857, "grad_norm": 2.705524552687109, "learning_rate": 9.890309839924776e-06, "loss": 1.0683, "step": 150 }, { "epoch": 0.2876190476190476, "grad_norm": 3.3715417503889777, "learning_rate": 9.888156603010137e-06, "loss": 1.0635, "step": 151 }, { "epoch": 0.2895238095238095, "grad_norm": 2.739638304166437, "learning_rate": 9.885982675803056e-06, "loss": 0.9561, "step": 152 }, { "epoch": 0.2914285714285714, "grad_norm": 2.4241137147091987, "learning_rate": 9.8837880675052e-06, "loss": 1.1199, "step": 153 }, { "epoch": 0.29333333333333333, "grad_norm": 3.2548663265967885, "learning_rate": 9.881572787405774e-06, "loss": 1.0592, "step": 154 }, { "epoch": 0.29523809523809524, "grad_norm": 2.8847430466560593, "learning_rate": 9.87933684488148e-06, "loss": 1.0127, "step": 155 }, { "epoch": 0.29714285714285715, "grad_norm": 4.163773017576036, "learning_rate": 9.877080249396482e-06, "loss": 1.1959, "step": 156 }, { "epoch": 0.29904761904761906, "grad_norm": 2.43449383239491, "learning_rate": 9.874803010502359e-06, "loss": 0.9392, "step": 157 }, { "epoch": 0.30095238095238097, "grad_norm": 3.192868562187782, "learning_rate": 9.872505137838069e-06, "loss": 1.1568, "step": 158 }, { "epoch": 0.3028571428571429, "grad_norm": 3.4387041337301625, "learning_rate": 9.87018664112991e-06, "loss": 1.0076, "step": 159 }, { "epoch": 0.3047619047619048, "grad_norm": 2.645228572202429, "learning_rate": 9.867847530191472e-06, "loss": 1.1138, "step": 160 }, { "epoch": 0.30666666666666664, "grad_norm": 2.7359173146917413, "learning_rate": 9.8654878149236e-06, "loss": 1.0449, "step": 161 }, { "epoch": 0.30857142857142855, "grad_norm": 2.9870908289014735, "learning_rate": 9.863107505314359e-06, "loss": 1.0193, "step": 162 }, { "epoch": 0.31047619047619046, "grad_norm": 2.574232205962119, "learning_rate": 9.860706611438975e-06, "loss": 1.082, "step": 163 }, { "epoch": 0.31238095238095237, "grad_norm": 2.986482890199796, "learning_rate": 9.858285143459804e-06, "loss": 0.93, "step": 164 }, { "epoch": 0.3142857142857143, "grad_norm": 3.536206243869805, "learning_rate": 9.855843111626295e-06, "loss": 1.1111, "step": 165 }, { "epoch": 0.3161904761904762, "grad_norm": 2.686673444877999, "learning_rate": 9.853380526274928e-06, "loss": 1.2395, "step": 166 }, { "epoch": 0.3180952380952381, "grad_norm": 4.611197609530781, "learning_rate": 9.850897397829182e-06, "loss": 1.1801, "step": 167 }, { "epoch": 0.32, "grad_norm": 3.911101177768665, "learning_rate": 9.848393736799496e-06, "loss": 1.1439, "step": 168 }, { "epoch": 0.3219047619047619, "grad_norm": 2.422871904775949, "learning_rate": 9.845869553783214e-06, "loss": 1.0528, "step": 169 }, { "epoch": 0.3238095238095238, "grad_norm": 2.6621360710125095, "learning_rate": 9.843324859464547e-06, "loss": 1.0423, "step": 170 }, { "epoch": 0.32571428571428573, "grad_norm": 3.7362699560222805, "learning_rate": 9.840759664614516e-06, "loss": 1.1698, "step": 171 }, { "epoch": 0.32761904761904764, "grad_norm": 2.1733989857511644, "learning_rate": 9.838173980090928e-06, "loss": 1.0468, "step": 172 }, { "epoch": 0.3295238095238095, "grad_norm": 3.018576608971308, "learning_rate": 9.835567816838309e-06, "loss": 0.9909, "step": 173 }, { "epoch": 0.3314285714285714, "grad_norm": 3.2360372541399913, "learning_rate": 9.832941185887868e-06, "loss": 1.0334, "step": 174 }, { "epoch": 0.3333333333333333, "grad_norm": 3.5138295004700524, "learning_rate": 9.83029409835745e-06, "loss": 1.1439, "step": 175 }, { "epoch": 0.3352380952380952, "grad_norm": 2.596761327319496, "learning_rate": 9.827626565451487e-06, "loss": 0.9764, "step": 176 }, { "epoch": 0.33714285714285713, "grad_norm": 3.617953130813023, "learning_rate": 9.82493859846095e-06, "loss": 1.0809, "step": 177 }, { "epoch": 0.33904761904761904, "grad_norm": 2.6372949321239822, "learning_rate": 9.8222302087633e-06, "loss": 0.9211, "step": 178 }, { "epoch": 0.34095238095238095, "grad_norm": 3.00182530804867, "learning_rate": 9.819501407822446e-06, "loss": 1.1826, "step": 179 }, { "epoch": 0.34285714285714286, "grad_norm": 2.816028166000998, "learning_rate": 9.816752207188695e-06, "loss": 0.9066, "step": 180 }, { "epoch": 0.34476190476190477, "grad_norm": 3.566149344336227, "learning_rate": 9.81398261849869e-06, "loss": 1.013, "step": 181 }, { "epoch": 0.3466666666666667, "grad_norm": 2.6508129562771887, "learning_rate": 9.811192653475382e-06, "loss": 1.0807, "step": 182 }, { "epoch": 0.3485714285714286, "grad_norm": 2.403667822417291, "learning_rate": 9.808382323927962e-06, "loss": 0.8976, "step": 183 }, { "epoch": 0.3504761904761905, "grad_norm": 2.5863595375751274, "learning_rate": 9.805551641751826e-06, "loss": 1.0593, "step": 184 }, { "epoch": 0.3523809523809524, "grad_norm": 2.49618027877372, "learning_rate": 9.802700618928513e-06, "loss": 1.178, "step": 185 }, { "epoch": 0.35428571428571426, "grad_norm": 3.4940607815837548, "learning_rate": 9.799829267525656e-06, "loss": 0.9083, "step": 186 }, { "epoch": 0.35619047619047617, "grad_norm": 2.4823680559712673, "learning_rate": 9.79693759969694e-06, "loss": 0.9226, "step": 187 }, { "epoch": 0.3580952380952381, "grad_norm": 2.8744244733166857, "learning_rate": 9.794025627682042e-06, "loss": 0.9735, "step": 188 }, { "epoch": 0.36, "grad_norm": 3.3597483861310424, "learning_rate": 9.791093363806576e-06, "loss": 1.0966, "step": 189 }, { "epoch": 0.3619047619047619, "grad_norm": 3.3794514928646207, "learning_rate": 9.788140820482056e-06, "loss": 1.1086, "step": 190 }, { "epoch": 0.3638095238095238, "grad_norm": 2.237115025130188, "learning_rate": 9.785168010205822e-06, "loss": 0.9749, "step": 191 }, { "epoch": 0.3657142857142857, "grad_norm": 2.612924523749599, "learning_rate": 9.782174945561009e-06, "loss": 0.9963, "step": 192 }, { "epoch": 0.3676190476190476, "grad_norm": 3.4133183920646397, "learning_rate": 9.77916163921648e-06, "loss": 1.1557, "step": 193 }, { "epoch": 0.36952380952380953, "grad_norm": 3.1580639931199945, "learning_rate": 9.77612810392677e-06, "loss": 1.2647, "step": 194 }, { "epoch": 0.37142857142857144, "grad_norm": 2.678879647534476, "learning_rate": 9.773074352532048e-06, "loss": 1.0786, "step": 195 }, { "epoch": 0.37333333333333335, "grad_norm": 2.775674376589696, "learning_rate": 9.770000397958045e-06, "loss": 1.0459, "step": 196 }, { "epoch": 0.37523809523809526, "grad_norm": 3.302196763192064, "learning_rate": 9.766906253216011e-06, "loss": 1.0103, "step": 197 }, { "epoch": 0.37714285714285717, "grad_norm": 2.9143701416841172, "learning_rate": 9.763791931402652e-06, "loss": 1.1111, "step": 198 }, { "epoch": 0.379047619047619, "grad_norm": 2.291599732156606, "learning_rate": 9.760657445700082e-06, "loss": 1.0406, "step": 199 }, { "epoch": 0.38095238095238093, "grad_norm": 3.189432428360578, "learning_rate": 9.757502809375761e-06, "loss": 1.1519, "step": 200 }, { "epoch": 0.38285714285714284, "grad_norm": 2.6118089279457126, "learning_rate": 9.754328035782442e-06, "loss": 1.0255, "step": 201 }, { "epoch": 0.38476190476190475, "grad_norm": 3.3669356516916578, "learning_rate": 9.751133138358111e-06, "loss": 1.1824, "step": 202 }, { "epoch": 0.38666666666666666, "grad_norm": 2.791504768510238, "learning_rate": 9.747918130625941e-06, "loss": 1.0649, "step": 203 }, { "epoch": 0.38857142857142857, "grad_norm": 2.557484611681563, "learning_rate": 9.744683026194221e-06, "loss": 0.9955, "step": 204 }, { "epoch": 0.3904761904761905, "grad_norm": 2.9051992741904074, "learning_rate": 9.7414278387563e-06, "loss": 0.9927, "step": 205 }, { "epoch": 0.3923809523809524, "grad_norm": 3.1402167980474696, "learning_rate": 9.738152582090543e-06, "loss": 1.0115, "step": 206 }, { "epoch": 0.3942857142857143, "grad_norm": 2.6748264866511993, "learning_rate": 9.734857270060254e-06, "loss": 1.1955, "step": 207 }, { "epoch": 0.3961904761904762, "grad_norm": 5.605572655010524, "learning_rate": 9.731541916613631e-06, "loss": 0.9863, "step": 208 }, { "epoch": 0.3980952380952381, "grad_norm": 4.49082904136764, "learning_rate": 9.7282065357837e-06, "loss": 1.3237, "step": 209 }, { "epoch": 0.4, "grad_norm": 3.017378218982703, "learning_rate": 9.724851141688258e-06, "loss": 1.2822, "step": 210 }, { "epoch": 0.40190476190476193, "grad_norm": 3.4271261376555286, "learning_rate": 9.721475748529815e-06, "loss": 1.135, "step": 211 }, { "epoch": 0.4038095238095238, "grad_norm": 2.8093618506873317, "learning_rate": 9.71808037059553e-06, "loss": 1.0961, "step": 212 }, { "epoch": 0.4057142857142857, "grad_norm": 3.9587987374950617, "learning_rate": 9.714665022257152e-06, "loss": 1.2021, "step": 213 }, { "epoch": 0.4076190476190476, "grad_norm": 3.235909397764408, "learning_rate": 9.711229717970961e-06, "loss": 1.1709, "step": 214 }, { "epoch": 0.4095238095238095, "grad_norm": 2.9294343889888976, "learning_rate": 9.707774472277702e-06, "loss": 1.1405, "step": 215 }, { "epoch": 0.4114285714285714, "grad_norm": 2.8743119961776658, "learning_rate": 9.704299299802531e-06, "loss": 1.2047, "step": 216 }, { "epoch": 0.41333333333333333, "grad_norm": 3.740667872605965, "learning_rate": 9.700804215254946e-06, "loss": 1.0581, "step": 217 }, { "epoch": 0.41523809523809524, "grad_norm": 3.490121897988836, "learning_rate": 9.69728923342873e-06, "loss": 1.1561, "step": 218 }, { "epoch": 0.41714285714285715, "grad_norm": 3.3639327787089846, "learning_rate": 9.693754369201885e-06, "loss": 1.1634, "step": 219 }, { "epoch": 0.41904761904761906, "grad_norm": 3.0200761314830262, "learning_rate": 9.690199637536566e-06, "loss": 1.1064, "step": 220 }, { "epoch": 0.42095238095238097, "grad_norm": 3.5667974140280463, "learning_rate": 9.68662505347903e-06, "loss": 1.1377, "step": 221 }, { "epoch": 0.4228571428571429, "grad_norm": 3.7220254675880025, "learning_rate": 9.683030632159556e-06, "loss": 1.1443, "step": 222 }, { "epoch": 0.4247619047619048, "grad_norm": 2.595476925697928, "learning_rate": 9.679416388792393e-06, "loss": 1.092, "step": 223 }, { "epoch": 0.4266666666666667, "grad_norm": 3.853813921258193, "learning_rate": 9.675782338675692e-06, "loss": 1.0147, "step": 224 }, { "epoch": 0.42857142857142855, "grad_norm": 3.471695450855635, "learning_rate": 9.67212849719144e-06, "loss": 1.0765, "step": 225 }, { "epoch": 0.43047619047619046, "grad_norm": 2.8103206233957367, "learning_rate": 9.668454879805394e-06, "loss": 1.1322, "step": 226 }, { "epoch": 0.43238095238095237, "grad_norm": 2.8355047244496703, "learning_rate": 9.664761502067019e-06, "loss": 1.1095, "step": 227 }, { "epoch": 0.4342857142857143, "grad_norm": 3.114785788041395, "learning_rate": 9.661048379609419e-06, "loss": 1.1004, "step": 228 }, { "epoch": 0.4361904761904762, "grad_norm": 3.021879404601287, "learning_rate": 9.657315528149275e-06, "loss": 0.9346, "step": 229 }, { "epoch": 0.4380952380952381, "grad_norm": 3.026803250823136, "learning_rate": 9.653562963486774e-06, "loss": 1.0449, "step": 230 }, { "epoch": 0.44, "grad_norm": 3.4206628902791025, "learning_rate": 9.649790701505541e-06, "loss": 1.1101, "step": 231 }, { "epoch": 0.4419047619047619, "grad_norm": 2.6551465295801306, "learning_rate": 9.64599875817258e-06, "loss": 1.0131, "step": 232 }, { "epoch": 0.4438095238095238, "grad_norm": 2.854467341451607, "learning_rate": 9.6421871495382e-06, "loss": 1.1845, "step": 233 }, { "epoch": 0.44571428571428573, "grad_norm": 2.827217189746745, "learning_rate": 9.638355891735944e-06, "loss": 1.0443, "step": 234 }, { "epoch": 0.44761904761904764, "grad_norm": 3.006208737372956, "learning_rate": 9.634505000982529e-06, "loss": 1.0076, "step": 235 }, { "epoch": 0.44952380952380955, "grad_norm": 2.5796022640509086, "learning_rate": 9.630634493577767e-06, "loss": 1.0163, "step": 236 }, { "epoch": 0.4514285714285714, "grad_norm": 2.7106342118537485, "learning_rate": 9.626744385904512e-06, "loss": 0.9616, "step": 237 }, { "epoch": 0.4533333333333333, "grad_norm": 2.2487090446859574, "learning_rate": 9.622834694428574e-06, "loss": 0.953, "step": 238 }, { "epoch": 0.4552380952380952, "grad_norm": 3.3958126730237983, "learning_rate": 9.618905435698658e-06, "loss": 1.0927, "step": 239 }, { "epoch": 0.45714285714285713, "grad_norm": 2.305056273629618, "learning_rate": 9.61495662634629e-06, "loss": 1.0678, "step": 240 }, { "epoch": 0.45904761904761904, "grad_norm": 3.119636116341919, "learning_rate": 9.610988283085752e-06, "loss": 1.0628, "step": 241 }, { "epoch": 0.46095238095238095, "grad_norm": 3.4556491676683976, "learning_rate": 9.607000422714006e-06, "loss": 1.0585, "step": 242 }, { "epoch": 0.46285714285714286, "grad_norm": 3.4346262774329124, "learning_rate": 9.602993062110626e-06, "loss": 0.995, "step": 243 }, { "epoch": 0.46476190476190476, "grad_norm": 3.470708055444413, "learning_rate": 9.598966218237723e-06, "loss": 1.1668, "step": 244 }, { "epoch": 0.4666666666666667, "grad_norm": 2.7204516702659953, "learning_rate": 9.594919908139878e-06, "loss": 1.0181, "step": 245 }, { "epoch": 0.4685714285714286, "grad_norm": 2.978286252335797, "learning_rate": 9.590854148944067e-06, "loss": 1.0025, "step": 246 }, { "epoch": 0.4704761904761905, "grad_norm": 2.3895185947413604, "learning_rate": 9.586768957859589e-06, "loss": 1.0165, "step": 247 }, { "epoch": 0.4723809523809524, "grad_norm": 3.1874291056224178, "learning_rate": 9.58266435217799e-06, "loss": 0.9626, "step": 248 }, { "epoch": 0.4742857142857143, "grad_norm": 2.2712287800543205, "learning_rate": 9.578540349273e-06, "loss": 1.0621, "step": 249 }, { "epoch": 0.47619047619047616, "grad_norm": 2.643855435385986, "learning_rate": 9.574396966600445e-06, "loss": 1.102, "step": 250 }, { "epoch": 0.4780952380952381, "grad_norm": 2.5617281013572866, "learning_rate": 9.570234221698185e-06, "loss": 0.9841, "step": 251 }, { "epoch": 0.48, "grad_norm": 3.1302368487531282, "learning_rate": 9.566052132186032e-06, "loss": 0.9949, "step": 252 }, { "epoch": 0.4819047619047619, "grad_norm": 2.5440685943414665, "learning_rate": 9.561850715765684e-06, "loss": 1.2165, "step": 253 }, { "epoch": 0.4838095238095238, "grad_norm": 3.1267589828104905, "learning_rate": 9.557629990220637e-06, "loss": 1.0531, "step": 254 }, { "epoch": 0.4857142857142857, "grad_norm": 2.4958404703914208, "learning_rate": 9.553389973416125e-06, "loss": 1.028, "step": 255 }, { "epoch": 0.4876190476190476, "grad_norm": 3.2773918473984898, "learning_rate": 9.549130683299032e-06, "loss": 1.1487, "step": 256 }, { "epoch": 0.4895238095238095, "grad_norm": 2.4843863440219383, "learning_rate": 9.544852137897824e-06, "loss": 1.0878, "step": 257 }, { "epoch": 0.49142857142857144, "grad_norm": 3.0899480036220863, "learning_rate": 9.540554355322467e-06, "loss": 1.1547, "step": 258 }, { "epoch": 0.49333333333333335, "grad_norm": 2.4873063573655694, "learning_rate": 9.536237353764354e-06, "loss": 1.0231, "step": 259 }, { "epoch": 0.49523809523809526, "grad_norm": 2.4731566104989335, "learning_rate": 9.531901151496227e-06, "loss": 1.1443, "step": 260 }, { "epoch": 0.49714285714285716, "grad_norm": 3.644355028600177, "learning_rate": 9.527545766872102e-06, "loss": 1.0975, "step": 261 }, { "epoch": 0.4990476190476191, "grad_norm": 2.463223924477943, "learning_rate": 9.523171218327184e-06, "loss": 1.0939, "step": 262 }, { "epoch": 0.5009523809523809, "grad_norm": 3.66085062531903, "learning_rate": 9.518777524377796e-06, "loss": 1.2054, "step": 263 }, { "epoch": 0.5028571428571429, "grad_norm": 3.4454973268157447, "learning_rate": 9.514364703621299e-06, "loss": 1.0681, "step": 264 }, { "epoch": 0.5047619047619047, "grad_norm": 4.078934398255164, "learning_rate": 9.509932774736016e-06, "loss": 0.9327, "step": 265 }, { "epoch": 0.5066666666666667, "grad_norm": 3.6449246722536577, "learning_rate": 9.505481756481145e-06, "loss": 1.0902, "step": 266 }, { "epoch": 0.5085714285714286, "grad_norm": 3.0639601396623344, "learning_rate": 9.501011667696683e-06, "loss": 1.0834, "step": 267 }, { "epoch": 0.5104761904761905, "grad_norm": 2.3245739748017638, "learning_rate": 9.496522527303353e-06, "loss": 1.0887, "step": 268 }, { "epoch": 0.5123809523809524, "grad_norm": 2.649201264888993, "learning_rate": 9.492014354302516e-06, "loss": 1.0926, "step": 269 }, { "epoch": 0.5142857142857142, "grad_norm": 3.3699942327060137, "learning_rate": 9.48748716777609e-06, "loss": 1.1117, "step": 270 }, { "epoch": 0.5161904761904762, "grad_norm": 2.3315268640810807, "learning_rate": 9.482940986886479e-06, "loss": 1.1532, "step": 271 }, { "epoch": 0.518095238095238, "grad_norm": 3.2872520805798287, "learning_rate": 9.47837583087648e-06, "loss": 1.168, "step": 272 }, { "epoch": 0.52, "grad_norm": 3.175259967164933, "learning_rate": 9.47379171906921e-06, "loss": 1.1615, "step": 273 }, { "epoch": 0.5219047619047619, "grad_norm": 2.31119179225052, "learning_rate": 9.469188670868015e-06, "loss": 1.0789, "step": 274 }, { "epoch": 0.5238095238095238, "grad_norm": 3.665629221607055, "learning_rate": 9.4645667057564e-06, "loss": 1.1591, "step": 275 }, { "epoch": 0.5257142857142857, "grad_norm": 2.838895713766497, "learning_rate": 9.459925843297938e-06, "loss": 1.05, "step": 276 }, { "epoch": 0.5276190476190477, "grad_norm": 3.593344894023993, "learning_rate": 9.45526610313619e-06, "loss": 1.0057, "step": 277 }, { "epoch": 0.5295238095238095, "grad_norm": 3.4688559822187073, "learning_rate": 9.450587504994621e-06, "loss": 1.2173, "step": 278 }, { "epoch": 0.5314285714285715, "grad_norm": 3.221192760938516, "learning_rate": 9.445890068676518e-06, "loss": 1.2644, "step": 279 }, { "epoch": 0.5333333333333333, "grad_norm": 2.673413979534444, "learning_rate": 9.441173814064903e-06, "loss": 0.8941, "step": 280 }, { "epoch": 0.5352380952380953, "grad_norm": 2.7644194263380126, "learning_rate": 9.436438761122453e-06, "loss": 1.1562, "step": 281 }, { "epoch": 0.5371428571428571, "grad_norm": 3.415965862086195, "learning_rate": 9.431684929891412e-06, "loss": 1.2217, "step": 282 }, { "epoch": 0.539047619047619, "grad_norm": 3.07491566363796, "learning_rate": 9.42691234049351e-06, "loss": 1.0962, "step": 283 }, { "epoch": 0.540952380952381, "grad_norm": 2.9432367183464327, "learning_rate": 9.42212101312987e-06, "loss": 0.971, "step": 284 }, { "epoch": 0.5428571428571428, "grad_norm": 3.3779697079478272, "learning_rate": 9.417310968080934e-06, "loss": 1.1105, "step": 285 }, { "epoch": 0.5447619047619048, "grad_norm": 3.1745310820670407, "learning_rate": 9.412482225706368e-06, "loss": 0.9272, "step": 286 }, { "epoch": 0.5466666666666666, "grad_norm": 3.0696325027617806, "learning_rate": 9.407634806444981e-06, "loss": 0.9993, "step": 287 }, { "epoch": 0.5485714285714286, "grad_norm": 3.1633035674577417, "learning_rate": 9.402768730814633e-06, "loss": 1.1908, "step": 288 }, { "epoch": 0.5504761904761905, "grad_norm": 3.096994879166552, "learning_rate": 9.397884019412154e-06, "loss": 1.1474, "step": 289 }, { "epoch": 0.5523809523809524, "grad_norm": 2.2278564335531676, "learning_rate": 9.392980692913251e-06, "loss": 0.9674, "step": 290 }, { "epoch": 0.5542857142857143, "grad_norm": 2.578548275724668, "learning_rate": 9.388058772072433e-06, "loss": 0.9822, "step": 291 }, { "epoch": 0.5561904761904762, "grad_norm": 3.1635229335006945, "learning_rate": 9.383118277722902e-06, "loss": 1.1093, "step": 292 }, { "epoch": 0.5580952380952381, "grad_norm": 3.071945428911023, "learning_rate": 9.378159230776487e-06, "loss": 1.213, "step": 293 }, { "epoch": 0.56, "grad_norm": 2.3121878371586146, "learning_rate": 9.373181652223536e-06, "loss": 1.0306, "step": 294 }, { "epoch": 0.5619047619047619, "grad_norm": 3.249984941625326, "learning_rate": 9.368185563132845e-06, "loss": 1.0258, "step": 295 }, { "epoch": 0.5638095238095238, "grad_norm": 2.250049366695023, "learning_rate": 9.363170984651554e-06, "loss": 1.1255, "step": 296 }, { "epoch": 0.5657142857142857, "grad_norm": 3.282853646317834, "learning_rate": 9.358137938005067e-06, "loss": 1.0138, "step": 297 }, { "epoch": 0.5676190476190476, "grad_norm": 4.102435203986516, "learning_rate": 9.35308644449696e-06, "loss": 1.1136, "step": 298 }, { "epoch": 0.5695238095238095, "grad_norm": 3.190359078403304, "learning_rate": 9.348016525508886e-06, "loss": 1.0097, "step": 299 }, { "epoch": 0.5714285714285714, "grad_norm": 2.8055099383728406, "learning_rate": 9.342928202500492e-06, "loss": 1.0414, "step": 300 }, { "epoch": 0.5733333333333334, "grad_norm": 3.226391622091831, "learning_rate": 9.337821497009321e-06, "loss": 1.1383, "step": 301 }, { "epoch": 0.5752380952380952, "grad_norm": 2.7242755307654507, "learning_rate": 9.332696430650726e-06, "loss": 0.9899, "step": 302 }, { "epoch": 0.5771428571428572, "grad_norm": 2.7396710737709706, "learning_rate": 9.327553025117774e-06, "loss": 1.0438, "step": 303 }, { "epoch": 0.579047619047619, "grad_norm": 2.3803496855726767, "learning_rate": 9.322391302181167e-06, "loss": 1.1477, "step": 304 }, { "epoch": 0.580952380952381, "grad_norm": 2.657112971537801, "learning_rate": 9.317211283689126e-06, "loss": 1.1747, "step": 305 }, { "epoch": 0.5828571428571429, "grad_norm": 3.2004740781756698, "learning_rate": 9.312012991567322e-06, "loss": 1.0265, "step": 306 }, { "epoch": 0.5847619047619048, "grad_norm": 2.7345693906589466, "learning_rate": 9.306796447818768e-06, "loss": 0.8831, "step": 307 }, { "epoch": 0.5866666666666667, "grad_norm": 2.581473591303195, "learning_rate": 9.301561674523736e-06, "loss": 1.3194, "step": 308 }, { "epoch": 0.5885714285714285, "grad_norm": 3.0062420990531775, "learning_rate": 9.296308693839652e-06, "loss": 1.0905, "step": 309 }, { "epoch": 0.5904761904761905, "grad_norm": 3.15466633587545, "learning_rate": 9.291037528001021e-06, "loss": 0.8329, "step": 310 }, { "epoch": 0.5923809523809523, "grad_norm": 3.0856341101198232, "learning_rate": 9.285748199319307e-06, "loss": 1.0536, "step": 311 }, { "epoch": 0.5942857142857143, "grad_norm": 2.2216358207731584, "learning_rate": 9.280440730182863e-06, "loss": 0.9541, "step": 312 }, { "epoch": 0.5961904761904762, "grad_norm": 2.842861446009651, "learning_rate": 9.275115143056819e-06, "loss": 1.0248, "step": 313 }, { "epoch": 0.5980952380952381, "grad_norm": 3.8297523835881, "learning_rate": 9.269771460482998e-06, "loss": 1.1637, "step": 314 }, { "epoch": 0.6, "grad_norm": 2.685139474137949, "learning_rate": 9.264409705079819e-06, "loss": 1.1308, "step": 315 }, { "epoch": 0.6019047619047619, "grad_norm": 3.2078014043192438, "learning_rate": 9.25902989954219e-06, "loss": 1.1837, "step": 316 }, { "epoch": 0.6038095238095238, "grad_norm": 3.6262754832843958, "learning_rate": 9.253632066641427e-06, "loss": 1.1233, "step": 317 }, { "epoch": 0.6057142857142858, "grad_norm": 2.8480071200284254, "learning_rate": 9.248216229225148e-06, "loss": 0.997, "step": 318 }, { "epoch": 0.6076190476190476, "grad_norm": 2.965424675109055, "learning_rate": 9.242782410217182e-06, "loss": 1.0191, "step": 319 }, { "epoch": 0.6095238095238096, "grad_norm": 3.4390134855353938, "learning_rate": 9.237330632617469e-06, "loss": 1.0756, "step": 320 }, { "epoch": 0.6114285714285714, "grad_norm": 2.520790848512735, "learning_rate": 9.231860919501958e-06, "loss": 1.1408, "step": 321 }, { "epoch": 0.6133333333333333, "grad_norm": 3.0228482634517806, "learning_rate": 9.226373294022524e-06, "loss": 1.1168, "step": 322 }, { "epoch": 0.6152380952380953, "grad_norm": 2.8478464921869926, "learning_rate": 9.220867779406853e-06, "loss": 1.0273, "step": 323 }, { "epoch": 0.6171428571428571, "grad_norm": 3.403570442777272, "learning_rate": 9.215344398958351e-06, "loss": 1.0575, "step": 324 }, { "epoch": 0.6190476190476191, "grad_norm": 3.599725555546533, "learning_rate": 9.209803176056048e-06, "loss": 1.1933, "step": 325 }, { "epoch": 0.6209523809523809, "grad_norm": 3.276388578728829, "learning_rate": 9.204244134154499e-06, "loss": 1.1533, "step": 326 }, { "epoch": 0.6228571428571429, "grad_norm": 2.608627014756356, "learning_rate": 9.198667296783674e-06, "loss": 1.3496, "step": 327 }, { "epoch": 0.6247619047619047, "grad_norm": 3.300183900694987, "learning_rate": 9.193072687548875e-06, "loss": 1.2422, "step": 328 }, { "epoch": 0.6266666666666667, "grad_norm": 2.77850058830544, "learning_rate": 9.187460330130624e-06, "loss": 1.0319, "step": 329 }, { "epoch": 0.6285714285714286, "grad_norm": 3.376930054441844, "learning_rate": 9.181830248284565e-06, "loss": 1.3129, "step": 330 }, { "epoch": 0.6304761904761905, "grad_norm": 4.122798968695337, "learning_rate": 9.176182465841368e-06, "loss": 1.0245, "step": 331 }, { "epoch": 0.6323809523809524, "grad_norm": 2.7632021808150795, "learning_rate": 9.170517006706623e-06, "loss": 1.2431, "step": 332 }, { "epoch": 0.6342857142857142, "grad_norm": 3.0319102483966196, "learning_rate": 9.164833894860743e-06, "loss": 1.0567, "step": 333 }, { "epoch": 0.6361904761904762, "grad_norm": 2.414195345453208, "learning_rate": 9.159133154358856e-06, "loss": 1.1851, "step": 334 }, { "epoch": 0.638095238095238, "grad_norm": 3.4637316101079736, "learning_rate": 9.153414809330712e-06, "loss": 1.1527, "step": 335 }, { "epoch": 0.64, "grad_norm": 2.458716682265978, "learning_rate": 9.147678883980572e-06, "loss": 1.2253, "step": 336 }, { "epoch": 0.6419047619047619, "grad_norm": 3.0209779102998393, "learning_rate": 9.141925402587118e-06, "loss": 1.0282, "step": 337 }, { "epoch": 0.6438095238095238, "grad_norm": 2.884184752305691, "learning_rate": 9.136154389503332e-06, "loss": 1.082, "step": 338 }, { "epoch": 0.6457142857142857, "grad_norm": 2.8859380779810513, "learning_rate": 9.130365869156408e-06, "loss": 0.9967, "step": 339 }, { "epoch": 0.6476190476190476, "grad_norm": 4.099940868149714, "learning_rate": 9.124559866047646e-06, "loss": 1.1936, "step": 340 }, { "epoch": 0.6495238095238095, "grad_norm": 2.6309654217829745, "learning_rate": 9.118736404752342e-06, "loss": 1.0655, "step": 341 }, { "epoch": 0.6514285714285715, "grad_norm": 3.1773333103707446, "learning_rate": 9.112895509919687e-06, "loss": 1.1259, "step": 342 }, { "epoch": 0.6533333333333333, "grad_norm": 3.1167716935695893, "learning_rate": 9.10703720627267e-06, "loss": 1.1632, "step": 343 }, { "epoch": 0.6552380952380953, "grad_norm": 3.3279493562876197, "learning_rate": 9.101161518607962e-06, "loss": 1.1378, "step": 344 }, { "epoch": 0.6571428571428571, "grad_norm": 3.1981939157817947, "learning_rate": 9.095268471795813e-06, "loss": 1.1373, "step": 345 }, { "epoch": 0.659047619047619, "grad_norm": 2.8603014674607428, "learning_rate": 9.08935809077996e-06, "loss": 1.0392, "step": 346 }, { "epoch": 0.660952380952381, "grad_norm": 2.724259741971536, "learning_rate": 9.0834304005775e-06, "loss": 1.1743, "step": 347 }, { "epoch": 0.6628571428571428, "grad_norm": 2.9885426816774316, "learning_rate": 9.077485426278802e-06, "loss": 1.1454, "step": 348 }, { "epoch": 0.6647619047619048, "grad_norm": 2.504779329263032, "learning_rate": 9.071523193047392e-06, "loss": 1.0482, "step": 349 }, { "epoch": 0.6666666666666666, "grad_norm": 3.044001947744392, "learning_rate": 9.065543726119847e-06, "loss": 1.0667, "step": 350 }, { "epoch": 0.6685714285714286, "grad_norm": 2.412752952896956, "learning_rate": 9.059547050805696e-06, "loss": 1.0208, "step": 351 }, { "epoch": 0.6704761904761904, "grad_norm": 2.7191213066147517, "learning_rate": 9.053533192487298e-06, "loss": 1.0417, "step": 352 }, { "epoch": 0.6723809523809524, "grad_norm": 2.8814004911974176, "learning_rate": 9.047502176619749e-06, "loss": 1.1503, "step": 353 }, { "epoch": 0.6742857142857143, "grad_norm": 2.805131777342699, "learning_rate": 9.041454028730767e-06, "loss": 1.0081, "step": 354 }, { "epoch": 0.6761904761904762, "grad_norm": 2.565200413286613, "learning_rate": 9.035388774420584e-06, "loss": 1.1812, "step": 355 }, { "epoch": 0.6780952380952381, "grad_norm": 2.8238095599448245, "learning_rate": 9.029306439361841e-06, "loss": 1.0914, "step": 356 }, { "epoch": 0.68, "grad_norm": 3.6113097414494755, "learning_rate": 9.023207049299473e-06, "loss": 1.1112, "step": 357 }, { "epoch": 0.6819047619047619, "grad_norm": 3.1869036634410777, "learning_rate": 9.017090630050616e-06, "loss": 1.0125, "step": 358 }, { "epoch": 0.6838095238095238, "grad_norm": 3.0223119368517186, "learning_rate": 9.010957207504471e-06, "loss": 1.0529, "step": 359 }, { "epoch": 0.6857142857142857, "grad_norm": 2.80424218698681, "learning_rate": 9.004806807622219e-06, "loss": 1.009, "step": 360 }, { "epoch": 0.6876190476190476, "grad_norm": 3.259013929402233, "learning_rate": 8.998639456436898e-06, "loss": 1.1255, "step": 361 }, { "epoch": 0.6895238095238095, "grad_norm": 3.460844692910995, "learning_rate": 8.992455180053298e-06, "loss": 1.0553, "step": 362 }, { "epoch": 0.6914285714285714, "grad_norm": 3.663541327578872, "learning_rate": 8.98625400464785e-06, "loss": 1.1177, "step": 363 }, { "epoch": 0.6933333333333334, "grad_norm": 2.8186318281692113, "learning_rate": 8.98003595646851e-06, "loss": 1.0084, "step": 364 }, { "epoch": 0.6952380952380952, "grad_norm": 3.817357178726652, "learning_rate": 8.973801061834657e-06, "loss": 1.1294, "step": 365 }, { "epoch": 0.6971428571428572, "grad_norm": 3.0000571353197762, "learning_rate": 8.967549347136974e-06, "loss": 1.0939, "step": 366 }, { "epoch": 0.699047619047619, "grad_norm": 2.489047116573603, "learning_rate": 8.96128083883734e-06, "loss": 1.0108, "step": 367 }, { "epoch": 0.700952380952381, "grad_norm": 2.2374284799304758, "learning_rate": 8.954995563468713e-06, "loss": 0.9346, "step": 368 }, { "epoch": 0.7028571428571428, "grad_norm": 2.858066130943614, "learning_rate": 8.948693547635029e-06, "loss": 1.1154, "step": 369 }, { "epoch": 0.7047619047619048, "grad_norm": 2.5246337088575377, "learning_rate": 8.942374818011074e-06, "loss": 1.154, "step": 370 }, { "epoch": 0.7066666666666667, "grad_norm": 3.2249688145142525, "learning_rate": 8.936039401342388e-06, "loss": 1.1242, "step": 371 }, { "epoch": 0.7085714285714285, "grad_norm": 3.1953262833622103, "learning_rate": 8.929687324445135e-06, "loss": 1.1208, "step": 372 }, { "epoch": 0.7104761904761905, "grad_norm": 3.4019986996229905, "learning_rate": 8.923318614206e-06, "loss": 1.0769, "step": 373 }, { "epoch": 0.7123809523809523, "grad_norm": 2.9160497148249647, "learning_rate": 8.916933297582073e-06, "loss": 1.045, "step": 374 }, { "epoch": 0.7142857142857143, "grad_norm": 2.560075764013695, "learning_rate": 8.910531401600735e-06, "loss": 1.1377, "step": 375 }, { "epoch": 0.7161904761904762, "grad_norm": 2.8083474146713256, "learning_rate": 8.904112953359542e-06, "loss": 1.1011, "step": 376 }, { "epoch": 0.7180952380952381, "grad_norm": 2.5673621742011026, "learning_rate": 8.897677980026114e-06, "loss": 0.9616, "step": 377 }, { "epoch": 0.72, "grad_norm": 3.7642909388622736, "learning_rate": 8.891226508838013e-06, "loss": 1.0577, "step": 378 }, { "epoch": 0.7219047619047619, "grad_norm": 3.364840838181064, "learning_rate": 8.884758567102636e-06, "loss": 1.075, "step": 379 }, { "epoch": 0.7238095238095238, "grad_norm": 2.6838256528900337, "learning_rate": 8.878274182197092e-06, "loss": 1.025, "step": 380 }, { "epoch": 0.7257142857142858, "grad_norm": 3.0219185424141095, "learning_rate": 8.871773381568093e-06, "loss": 0.9733, "step": 381 }, { "epoch": 0.7276190476190476, "grad_norm": 2.542427915477238, "learning_rate": 8.865256192731835e-06, "loss": 1.0816, "step": 382 }, { "epoch": 0.7295238095238096, "grad_norm": 3.335494765427363, "learning_rate": 8.858722643273877e-06, "loss": 1.2553, "step": 383 }, { "epoch": 0.7314285714285714, "grad_norm": 4.050862992491009, "learning_rate": 8.852172760849029e-06, "loss": 1.1365, "step": 384 }, { "epoch": 0.7333333333333333, "grad_norm": 3.4436727393115127, "learning_rate": 8.845606573181241e-06, "loss": 1.1239, "step": 385 }, { "epoch": 0.7352380952380952, "grad_norm": 2.82839942474682, "learning_rate": 8.839024108063466e-06, "loss": 1.0783, "step": 386 }, { "epoch": 0.7371428571428571, "grad_norm": 2.7206052968291377, "learning_rate": 8.83242539335757e-06, "loss": 1.1512, "step": 387 }, { "epoch": 0.7390476190476191, "grad_norm": 3.510052220359161, "learning_rate": 8.825810456994184e-06, "loss": 1.124, "step": 388 }, { "epoch": 0.7409523809523809, "grad_norm": 2.967050518609841, "learning_rate": 8.819179326972615e-06, "loss": 1.1152, "step": 389 }, { "epoch": 0.7428571428571429, "grad_norm": 3.379005317255521, "learning_rate": 8.812532031360707e-06, "loss": 1.149, "step": 390 }, { "epoch": 0.7447619047619047, "grad_norm": 3.2123171616309336, "learning_rate": 8.805868598294728e-06, "loss": 1.0989, "step": 391 }, { "epoch": 0.7466666666666667, "grad_norm": 2.422084944704448, "learning_rate": 8.79918905597925e-06, "loss": 1.0587, "step": 392 }, { "epoch": 0.7485714285714286, "grad_norm": 3.4402615362346713, "learning_rate": 8.792493432687042e-06, "loss": 1.0559, "step": 393 }, { "epoch": 0.7504761904761905, "grad_norm": 2.427929273885637, "learning_rate": 8.785781756758923e-06, "loss": 1.0436, "step": 394 }, { "epoch": 0.7523809523809524, "grad_norm": 3.282090778146958, "learning_rate": 8.779054056603672e-06, "loss": 1.0019, "step": 395 }, { "epoch": 0.7542857142857143, "grad_norm": 3.5061662461280627, "learning_rate": 8.77231036069789e-06, "loss": 1.0609, "step": 396 }, { "epoch": 0.7561904761904762, "grad_norm": 2.552904246872334, "learning_rate": 8.765550697585879e-06, "loss": 0.9617, "step": 397 }, { "epoch": 0.758095238095238, "grad_norm": 3.3596424805810585, "learning_rate": 8.758775095879535e-06, "loss": 1.2109, "step": 398 }, { "epoch": 0.76, "grad_norm": 3.4348345089003622, "learning_rate": 8.751983584258213e-06, "loss": 1.1735, "step": 399 }, { "epoch": 0.7619047619047619, "grad_norm": 2.231821616463513, "learning_rate": 8.745176191468611e-06, "loss": 0.9655, "step": 400 }, { "epoch": 0.7638095238095238, "grad_norm": 2.6109892200645644, "learning_rate": 8.738352946324646e-06, "loss": 1.0254, "step": 401 }, { "epoch": 0.7657142857142857, "grad_norm": 2.980437739721515, "learning_rate": 8.731513877707336e-06, "loss": 1.1012, "step": 402 }, { "epoch": 0.7676190476190476, "grad_norm": 3.0083473362483533, "learning_rate": 8.724659014564676e-06, "loss": 0.9967, "step": 403 }, { "epoch": 0.7695238095238095, "grad_norm": 2.558923124814242, "learning_rate": 8.717788385911514e-06, "loss": 1.1585, "step": 404 }, { "epoch": 0.7714285714285715, "grad_norm": 2.3051163211822887, "learning_rate": 8.710902020829433e-06, "loss": 1.0719, "step": 405 }, { "epoch": 0.7733333333333333, "grad_norm": 2.41821834923688, "learning_rate": 8.703999948466617e-06, "loss": 1.0255, "step": 406 }, { "epoch": 0.7752380952380953, "grad_norm": 3.14367022632351, "learning_rate": 8.697082198037739e-06, "loss": 1.1382, "step": 407 }, { "epoch": 0.7771428571428571, "grad_norm": 2.9683955939402864, "learning_rate": 8.690148798823835e-06, "loss": 0.9404, "step": 408 }, { "epoch": 0.7790476190476191, "grad_norm": 2.529142172327783, "learning_rate": 8.683199780172175e-06, "loss": 1.2332, "step": 409 }, { "epoch": 0.780952380952381, "grad_norm": 2.4658388877395625, "learning_rate": 8.676235171496144e-06, "loss": 0.9892, "step": 410 }, { "epoch": 0.7828571428571428, "grad_norm": 2.8283641793738896, "learning_rate": 8.669255002275112e-06, "loss": 0.984, "step": 411 }, { "epoch": 0.7847619047619048, "grad_norm": 3.133647096585687, "learning_rate": 8.662259302054319e-06, "loss": 0.9927, "step": 412 }, { "epoch": 0.7866666666666666, "grad_norm": 2.758729735555187, "learning_rate": 8.655248100444737e-06, "loss": 1.1519, "step": 413 }, { "epoch": 0.7885714285714286, "grad_norm": 3.189001386891881, "learning_rate": 8.648221427122957e-06, "loss": 1.2207, "step": 414 }, { "epoch": 0.7904761904761904, "grad_norm": 2.9108592393995343, "learning_rate": 8.641179311831053e-06, "loss": 1.1063, "step": 415 }, { "epoch": 0.7923809523809524, "grad_norm": 3.6297074994123513, "learning_rate": 8.634121784376465e-06, "loss": 1.1972, "step": 416 }, { "epoch": 0.7942857142857143, "grad_norm": 3.0166932780857576, "learning_rate": 8.627048874631868e-06, "loss": 1.0153, "step": 417 }, { "epoch": 0.7961904761904762, "grad_norm": 2.352420567560322, "learning_rate": 8.619960612535038e-06, "loss": 0.8948, "step": 418 }, { "epoch": 0.7980952380952381, "grad_norm": 3.903440171260907, "learning_rate": 8.612857028088748e-06, "loss": 1.1302, "step": 419 }, { "epoch": 0.8, "grad_norm": 2.9690330836261984, "learning_rate": 8.605738151360615e-06, "loss": 1.0636, "step": 420 }, { "epoch": 0.8019047619047619, "grad_norm": 3.242096843945755, "learning_rate": 8.598604012482989e-06, "loss": 1.1882, "step": 421 }, { "epoch": 0.8038095238095239, "grad_norm": 2.601107974340937, "learning_rate": 8.59145464165282e-06, "loss": 0.9328, "step": 422 }, { "epoch": 0.8057142857142857, "grad_norm": 3.236040533371977, "learning_rate": 8.584290069131532e-06, "loss": 1.1176, "step": 423 }, { "epoch": 0.8076190476190476, "grad_norm": 3.1387425308191363, "learning_rate": 8.577110325244889e-06, "loss": 0.9882, "step": 424 }, { "epoch": 0.8095238095238095, "grad_norm": 2.7150796170310576, "learning_rate": 8.569915440382877e-06, "loss": 0.8892, "step": 425 }, { "epoch": 0.8114285714285714, "grad_norm": 3.065969460883179, "learning_rate": 8.562705444999568e-06, "loss": 1.0418, "step": 426 }, { "epoch": 0.8133333333333334, "grad_norm": 3.0573807259121804, "learning_rate": 8.555480369612993e-06, "loss": 1.0592, "step": 427 }, { "epoch": 0.8152380952380952, "grad_norm": 2.3194057375292894, "learning_rate": 8.54824024480501e-06, "loss": 1.1651, "step": 428 }, { "epoch": 0.8171428571428572, "grad_norm": 2.529280513955374, "learning_rate": 8.540985101221182e-06, "loss": 1.1728, "step": 429 }, { "epoch": 0.819047619047619, "grad_norm": 2.8174939455132675, "learning_rate": 8.533714969570642e-06, "loss": 1.0767, "step": 430 }, { "epoch": 0.820952380952381, "grad_norm": 3.4046233317345966, "learning_rate": 8.526429880625957e-06, "loss": 1.1272, "step": 431 }, { "epoch": 0.8228571428571428, "grad_norm": 3.1126606357344992, "learning_rate": 8.51912986522301e-06, "loss": 1.1608, "step": 432 }, { "epoch": 0.8247619047619048, "grad_norm": 2.711030441114635, "learning_rate": 8.511814954260868e-06, "loss": 1.0115, "step": 433 }, { "epoch": 0.8266666666666667, "grad_norm": 2.615811959017416, "learning_rate": 8.504485178701635e-06, "loss": 0.8514, "step": 434 }, { "epoch": 0.8285714285714286, "grad_norm": 3.13353585968966, "learning_rate": 8.497140569570344e-06, "loss": 1.1303, "step": 435 }, { "epoch": 0.8304761904761905, "grad_norm": 3.333661923768568, "learning_rate": 8.48978115795481e-06, "loss": 1.2062, "step": 436 }, { "epoch": 0.8323809523809523, "grad_norm": 3.1005760988695132, "learning_rate": 8.482406975005506e-06, "loss": 0.9391, "step": 437 }, { "epoch": 0.8342857142857143, "grad_norm": 2.9439892789366433, "learning_rate": 8.47501805193542e-06, "loss": 1.0021, "step": 438 }, { "epoch": 0.8361904761904762, "grad_norm": 3.1430386500120466, "learning_rate": 8.46761442001994e-06, "loss": 1.1745, "step": 439 }, { "epoch": 0.8380952380952381, "grad_norm": 3.152245132905031, "learning_rate": 8.46019611059671e-06, "loss": 0.9255, "step": 440 }, { "epoch": 0.84, "grad_norm": 2.6845203586161714, "learning_rate": 8.4527631550655e-06, "loss": 1.0977, "step": 441 }, { "epoch": 0.8419047619047619, "grad_norm": 3.096279054161254, "learning_rate": 8.445315584888073e-06, "loss": 1.1743, "step": 442 }, { "epoch": 0.8438095238095238, "grad_norm": 3.3066419584820257, "learning_rate": 8.43785343158805e-06, "loss": 1.0291, "step": 443 }, { "epoch": 0.8457142857142858, "grad_norm": 2.912561027411646, "learning_rate": 8.43037672675078e-06, "loss": 1.1286, "step": 444 }, { "epoch": 0.8476190476190476, "grad_norm": 3.365607195342652, "learning_rate": 8.422885502023207e-06, "loss": 1.0388, "step": 445 }, { "epoch": 0.8495238095238096, "grad_norm": 2.9397530588051612, "learning_rate": 8.41537978911373e-06, "loss": 1.1248, "step": 446 }, { "epoch": 0.8514285714285714, "grad_norm": 2.524457250295759, "learning_rate": 8.40785961979208e-06, "loss": 1.1046, "step": 447 }, { "epoch": 0.8533333333333334, "grad_norm": 2.5066645738851054, "learning_rate": 8.400325025889167e-06, "loss": 1.0707, "step": 448 }, { "epoch": 0.8552380952380952, "grad_norm": 2.1343550534829556, "learning_rate": 8.392776039296967e-06, "loss": 1.1798, "step": 449 }, { "epoch": 0.8571428571428571, "grad_norm": 2.4107377275946273, "learning_rate": 8.385212691968367e-06, "loss": 1.0357, "step": 450 }, { "epoch": 0.8590476190476191, "grad_norm": 3.225104794505188, "learning_rate": 8.37763501591705e-06, "loss": 1.0195, "step": 451 }, { "epoch": 0.8609523809523809, "grad_norm": 3.923484239965437, "learning_rate": 8.37004304321734e-06, "loss": 0.9942, "step": 452 }, { "epoch": 0.8628571428571429, "grad_norm": 2.5883022229346375, "learning_rate": 8.362436806004077e-06, "loss": 0.9948, "step": 453 }, { "epoch": 0.8647619047619047, "grad_norm": 3.1611212611466124, "learning_rate": 8.354816336472483e-06, "loss": 1.1588, "step": 454 }, { "epoch": 0.8666666666666667, "grad_norm": 2.227224884930765, "learning_rate": 8.347181666878016e-06, "loss": 1.051, "step": 455 }, { "epoch": 0.8685714285714285, "grad_norm": 3.343904034490303, "learning_rate": 8.339532829536243e-06, "loss": 1.1178, "step": 456 }, { "epoch": 0.8704761904761905, "grad_norm": 2.7042791692096837, "learning_rate": 8.331869856822699e-06, "loss": 0.9584, "step": 457 }, { "epoch": 0.8723809523809524, "grad_norm": 2.6112940045283892, "learning_rate": 8.324192781172748e-06, "loss": 1.1043, "step": 458 }, { "epoch": 0.8742857142857143, "grad_norm": 2.9972603993374625, "learning_rate": 8.316501635081452e-06, "loss": 0.9461, "step": 459 }, { "epoch": 0.8761904761904762, "grad_norm": 3.2457696049052602, "learning_rate": 8.308796451103425e-06, "loss": 1.049, "step": 460 }, { "epoch": 0.878095238095238, "grad_norm": 3.537077072294635, "learning_rate": 8.301077261852702e-06, "loss": 1.0805, "step": 461 }, { "epoch": 0.88, "grad_norm": 2.9377251873072283, "learning_rate": 8.293344100002596e-06, "loss": 1.0289, "step": 462 }, { "epoch": 0.8819047619047619, "grad_norm": 3.3156194597464483, "learning_rate": 8.285596998285572e-06, "loss": 1.0314, "step": 463 }, { "epoch": 0.8838095238095238, "grad_norm": 2.6050334252906815, "learning_rate": 8.277835989493086e-06, "loss": 1.269, "step": 464 }, { "epoch": 0.8857142857142857, "grad_norm": 2.615504751195819, "learning_rate": 8.270061106475466e-06, "loss": 1.293, "step": 465 }, { "epoch": 0.8876190476190476, "grad_norm": 3.7859955032929973, "learning_rate": 8.262272382141764e-06, "loss": 1.0463, "step": 466 }, { "epoch": 0.8895238095238095, "grad_norm": 2.9860498670291804, "learning_rate": 8.254469849459623e-06, "loss": 1.0942, "step": 467 }, { "epoch": 0.8914285714285715, "grad_norm": 2.7344778901565, "learning_rate": 8.246653541455125e-06, "loss": 0.9873, "step": 468 }, { "epoch": 0.8933333333333333, "grad_norm": 2.80940900597449, "learning_rate": 8.238823491212666e-06, "loss": 1.067, "step": 469 }, { "epoch": 0.8952380952380953, "grad_norm": 3.4775010761402703, "learning_rate": 8.230979731874809e-06, "loss": 1.1605, "step": 470 }, { "epoch": 0.8971428571428571, "grad_norm": 3.065434576482116, "learning_rate": 8.223122296642139e-06, "loss": 0.9812, "step": 471 }, { "epoch": 0.8990476190476191, "grad_norm": 2.7304169187074074, "learning_rate": 8.21525121877313e-06, "loss": 1.1026, "step": 472 }, { "epoch": 0.900952380952381, "grad_norm": 2.7940523187258823, "learning_rate": 8.207366531584007e-06, "loss": 1.244, "step": 473 }, { "epoch": 0.9028571428571428, "grad_norm": 3.0019016221120047, "learning_rate": 8.199468268448591e-06, "loss": 1.103, "step": 474 }, { "epoch": 0.9047619047619048, "grad_norm": 3.488895058692748, "learning_rate": 8.191556462798171e-06, "loss": 0.9693, "step": 475 }, { "epoch": 0.9066666666666666, "grad_norm": 2.052419927964473, "learning_rate": 8.183631148121362e-06, "loss": 1.0691, "step": 476 }, { "epoch": 0.9085714285714286, "grad_norm": 2.784756056228296, "learning_rate": 8.175692357963949e-06, "loss": 0.9683, "step": 477 }, { "epoch": 0.9104761904761904, "grad_norm": 3.312667416922426, "learning_rate": 8.167740125928764e-06, "loss": 1.0703, "step": 478 }, { "epoch": 0.9123809523809524, "grad_norm": 2.1762321202342987, "learning_rate": 8.15977448567553e-06, "loss": 1.0664, "step": 479 }, { "epoch": 0.9142857142857143, "grad_norm": 2.856371298100056, "learning_rate": 8.151795470920729e-06, "loss": 1.0039, "step": 480 }, { "epoch": 0.9161904761904762, "grad_norm": 3.808057593484791, "learning_rate": 8.143803115437444e-06, "loss": 1.0565, "step": 481 }, { "epoch": 0.9180952380952381, "grad_norm": 2.4083168883883492, "learning_rate": 8.13579745305524e-06, "loss": 0.9804, "step": 482 }, { "epoch": 0.92, "grad_norm": 2.458277457932072, "learning_rate": 8.12777851765999e-06, "loss": 0.9108, "step": 483 }, { "epoch": 0.9219047619047619, "grad_norm": 2.2041917678077008, "learning_rate": 8.119746343193764e-06, "loss": 0.8584, "step": 484 }, { "epoch": 0.9238095238095239, "grad_norm": 3.085155154904451, "learning_rate": 8.111700963654658e-06, "loss": 1.1146, "step": 485 }, { "epoch": 0.9257142857142857, "grad_norm": 2.5494148735504507, "learning_rate": 8.103642413096667e-06, "loss": 1.0932, "step": 486 }, { "epoch": 0.9276190476190476, "grad_norm": 2.516089543568002, "learning_rate": 8.095570725629537e-06, "loss": 1.1081, "step": 487 }, { "epoch": 0.9295238095238095, "grad_norm": 3.3273604855594607, "learning_rate": 8.08748593541861e-06, "loss": 1.0552, "step": 488 }, { "epoch": 0.9314285714285714, "grad_norm": 2.4935896236090453, "learning_rate": 8.079388076684703e-06, "loss": 1.0423, "step": 489 }, { "epoch": 0.9333333333333333, "grad_norm": 3.3186717714278116, "learning_rate": 8.071277183703935e-06, "loss": 1.1353, "step": 490 }, { "epoch": 0.9352380952380952, "grad_norm": 3.031819536929874, "learning_rate": 8.063153290807604e-06, "loss": 0.9664, "step": 491 }, { "epoch": 0.9371428571428572, "grad_norm": 3.3105066660602565, "learning_rate": 8.05501643238203e-06, "loss": 1.1321, "step": 492 }, { "epoch": 0.939047619047619, "grad_norm": 2.7712863621364865, "learning_rate": 8.046866642868414e-06, "loss": 1.0302, "step": 493 }, { "epoch": 0.940952380952381, "grad_norm": 2.9784754051557254, "learning_rate": 8.03870395676269e-06, "loss": 1.0708, "step": 494 }, { "epoch": 0.9428571428571428, "grad_norm": 3.74313100234932, "learning_rate": 8.030528408615377e-06, "loss": 1.304, "step": 495 }, { "epoch": 0.9447619047619048, "grad_norm": 2.34322724592221, "learning_rate": 8.022340033031443e-06, "loss": 0.9794, "step": 496 }, { "epoch": 0.9466666666666667, "grad_norm": 2.7200841196093117, "learning_rate": 8.014138864670144e-06, "loss": 1.1417, "step": 497 }, { "epoch": 0.9485714285714286, "grad_norm": 3.0826039786111097, "learning_rate": 8.005924938244889e-06, "loss": 1.2513, "step": 498 }, { "epoch": 0.9504761904761905, "grad_norm": 2.72441910058809, "learning_rate": 7.997698288523085e-06, "loss": 1.0193, "step": 499 }, { "epoch": 0.9523809523809523, "grad_norm": 2.5720275928169496, "learning_rate": 7.989458950326e-06, "loss": 1.119, "step": 500 }, { "epoch": 0.9542857142857143, "grad_norm": 2.8001662682136987, "learning_rate": 7.9812069585286e-06, "loss": 1.2259, "step": 501 }, { "epoch": 0.9561904761904761, "grad_norm": 2.6252536854215247, "learning_rate": 7.972942348059415e-06, "loss": 1.0859, "step": 502 }, { "epoch": 0.9580952380952381, "grad_norm": 2.892537843168099, "learning_rate": 7.964665153900389e-06, "loss": 1.1428, "step": 503 }, { "epoch": 0.96, "grad_norm": 2.650168201967266, "learning_rate": 7.956375411086723e-06, "loss": 1.242, "step": 504 }, { "epoch": 0.9619047619047619, "grad_norm": 2.4635114767396047, "learning_rate": 7.948073154706741e-06, "loss": 1.031, "step": 505 }, { "epoch": 0.9638095238095238, "grad_norm": 2.61394095575774, "learning_rate": 7.939758419901728e-06, "loss": 0.9719, "step": 506 }, { "epoch": 0.9657142857142857, "grad_norm": 2.8577982697972075, "learning_rate": 7.931431241865787e-06, "loss": 1.0958, "step": 507 }, { "epoch": 0.9676190476190476, "grad_norm": 2.9964565356802186, "learning_rate": 7.923091655845696e-06, "loss": 1.135, "step": 508 }, { "epoch": 0.9695238095238096, "grad_norm": 2.6243394304546706, "learning_rate": 7.914739697140742e-06, "loss": 1.1313, "step": 509 }, { "epoch": 0.9714285714285714, "grad_norm": 4.129503983689747, "learning_rate": 7.906375401102594e-06, "loss": 1.1608, "step": 510 }, { "epoch": 0.9733333333333334, "grad_norm": 3.0240446885326886, "learning_rate": 7.897998803135134e-06, "loss": 1.1815, "step": 511 }, { "epoch": 0.9752380952380952, "grad_norm": 2.7395588200297247, "learning_rate": 7.889609938694317e-06, "loss": 1.0299, "step": 512 }, { "epoch": 0.9771428571428571, "grad_norm": 2.812468027721245, "learning_rate": 7.88120884328802e-06, "loss": 1.0421, "step": 513 }, { "epoch": 0.979047619047619, "grad_norm": 3.2398794565010833, "learning_rate": 7.87279555247589e-06, "loss": 0.9394, "step": 514 }, { "epoch": 0.9809523809523809, "grad_norm": 2.285044181961201, "learning_rate": 7.86437010186919e-06, "loss": 0.933, "step": 515 }, { "epoch": 0.9828571428571429, "grad_norm": 3.162605788900836, "learning_rate": 7.85593252713066e-06, "loss": 1.0602, "step": 516 }, { "epoch": 0.9847619047619047, "grad_norm": 3.6290752229883885, "learning_rate": 7.847482863974351e-06, "loss": 1.1061, "step": 517 }, { "epoch": 0.9866666666666667, "grad_norm": 3.0548556746777455, "learning_rate": 7.839021148165485e-06, "loss": 1.0494, "step": 518 }, { "epoch": 0.9885714285714285, "grad_norm": 3.2934083286725335, "learning_rate": 7.8305474155203e-06, "loss": 1.0653, "step": 519 }, { "epoch": 0.9904761904761905, "grad_norm": 2.7343965449931935, "learning_rate": 7.822061701905896e-06, "loss": 1.1359, "step": 520 }, { "epoch": 0.9923809523809524, "grad_norm": 2.8988997887295374, "learning_rate": 7.813564043240087e-06, "loss": 0.9217, "step": 521 }, { "epoch": 0.9942857142857143, "grad_norm": 3.1683223411795214, "learning_rate": 7.805054475491247e-06, "loss": 1.0363, "step": 522 }, { "epoch": 0.9961904761904762, "grad_norm": 2.3955236534328908, "learning_rate": 7.796533034678155e-06, "loss": 1.174, "step": 523 }, { "epoch": 0.9980952380952381, "grad_norm": 3.368355854935716, "learning_rate": 7.787999756869849e-06, "loss": 1.0466, "step": 524 }, { "epoch": 1.0, "grad_norm": 2.8272177401137437, "learning_rate": 7.779454678185472e-06, "loss": 1.1049, "step": 525 }, { "epoch": 1.0019047619047619, "grad_norm": 3.278351794680604, "learning_rate": 7.770897834794111e-06, "loss": 0.78, "step": 526 }, { "epoch": 1.0038095238095237, "grad_norm": 3.5979306600958822, "learning_rate": 7.762329262914654e-06, "loss": 0.7994, "step": 527 }, { "epoch": 1.0057142857142858, "grad_norm": 2.459774911918089, "learning_rate": 7.753748998815633e-06, "loss": 0.591, "step": 528 }, { "epoch": 1.0076190476190476, "grad_norm": 2.943287897207152, "learning_rate": 7.745157078815068e-06, "loss": 0.7519, "step": 529 }, { "epoch": 1.0095238095238095, "grad_norm": 3.0078645478736665, "learning_rate": 7.736553539280315e-06, "loss": 0.8874, "step": 530 }, { "epoch": 1.0114285714285713, "grad_norm": 3.920537090277711, "learning_rate": 7.727938416627915e-06, "loss": 0.8486, "step": 531 }, { "epoch": 1.0133333333333334, "grad_norm": 3.4362938183059257, "learning_rate": 7.719311747323436e-06, "loss": 0.7709, "step": 532 }, { "epoch": 1.0152380952380953, "grad_norm": 2.8989669198690793, "learning_rate": 7.710673567881322e-06, "loss": 0.6786, "step": 533 }, { "epoch": 1.0171428571428571, "grad_norm": 3.132026672205312, "learning_rate": 7.70202391486473e-06, "loss": 0.6602, "step": 534 }, { "epoch": 1.019047619047619, "grad_norm": 3.2494009229771903, "learning_rate": 7.693362824885388e-06, "loss": 0.6235, "step": 535 }, { "epoch": 1.020952380952381, "grad_norm": 4.816066275924408, "learning_rate": 7.684690334603432e-06, "loss": 0.764, "step": 536 }, { "epoch": 1.022857142857143, "grad_norm": 2.913912105130997, "learning_rate": 7.676006480727255e-06, "loss": 0.7966, "step": 537 }, { "epoch": 1.0247619047619048, "grad_norm": 3.1319662563763835, "learning_rate": 7.667311300013341e-06, "loss": 0.6839, "step": 538 }, { "epoch": 1.0266666666666666, "grad_norm": 2.6749682101403445, "learning_rate": 7.658604829266125e-06, "loss": 0.6437, "step": 539 }, { "epoch": 1.0285714285714285, "grad_norm": 2.7598153061420927, "learning_rate": 7.64988710533783e-06, "loss": 0.8466, "step": 540 }, { "epoch": 1.0304761904761905, "grad_norm": 3.2302960082380268, "learning_rate": 7.641158165128305e-06, "loss": 0.7511, "step": 541 }, { "epoch": 1.0323809523809524, "grad_norm": 3.2816320724084114, "learning_rate": 7.632418045584881e-06, "loss": 0.602, "step": 542 }, { "epoch": 1.0342857142857143, "grad_norm": 3.043399357123019, "learning_rate": 7.623666783702204e-06, "loss": 0.5299, "step": 543 }, { "epoch": 1.036190476190476, "grad_norm": 3.7126576358318895, "learning_rate": 7.614904416522082e-06, "loss": 0.6604, "step": 544 }, { "epoch": 1.0380952380952382, "grad_norm": 3.293538551297259, "learning_rate": 7.6061309811333335e-06, "loss": 0.6533, "step": 545 }, { "epoch": 1.04, "grad_norm": 3.475721598976322, "learning_rate": 7.597346514671624e-06, "loss": 0.8101, "step": 546 }, { "epoch": 1.041904761904762, "grad_norm": 2.8801230420633104, "learning_rate": 7.588551054319306e-06, "loss": 0.7262, "step": 547 }, { "epoch": 1.0438095238095237, "grad_norm": 3.053328328032738, "learning_rate": 7.579744637305273e-06, "loss": 0.6568, "step": 548 }, { "epoch": 1.0457142857142858, "grad_norm": 2.559613550403858, "learning_rate": 7.5709273009047914e-06, "loss": 0.5685, "step": 549 }, { "epoch": 1.0476190476190477, "grad_norm": 2.2234064518033416, "learning_rate": 7.562099082439345e-06, "loss": 0.6424, "step": 550 }, { "epoch": 1.0495238095238095, "grad_norm": 2.686130863329895, "learning_rate": 7.553260019276485e-06, "loss": 0.5465, "step": 551 }, { "epoch": 1.0514285714285714, "grad_norm": 3.121543742475663, "learning_rate": 7.544410148829657e-06, "loss": 0.7379, "step": 552 }, { "epoch": 1.0533333333333332, "grad_norm": 3.075061107546592, "learning_rate": 7.535549508558059e-06, "loss": 0.7253, "step": 553 }, { "epoch": 1.0552380952380953, "grad_norm": 3.183575567094095, "learning_rate": 7.526678135966468e-06, "loss": 0.661, "step": 554 }, { "epoch": 1.0571428571428572, "grad_norm": 3.72039021918403, "learning_rate": 7.517796068605094e-06, "loss": 0.5585, "step": 555 }, { "epoch": 1.059047619047619, "grad_norm": 2.8858758418603156, "learning_rate": 7.508903344069409e-06, "loss": 0.7614, "step": 556 }, { "epoch": 1.0609523809523809, "grad_norm": 3.0022490934253914, "learning_rate": 7.500000000000001e-06, "loss": 0.5634, "step": 557 }, { "epoch": 1.062857142857143, "grad_norm": 2.7282699336617178, "learning_rate": 7.491086074082402e-06, "loss": 0.7474, "step": 558 }, { "epoch": 1.0647619047619048, "grad_norm": 2.9043729096498616, "learning_rate": 7.482161604046936e-06, "loss": 0.6343, "step": 559 }, { "epoch": 1.0666666666666667, "grad_norm": 3.259445858074413, "learning_rate": 7.47322662766856e-06, "loss": 0.5934, "step": 560 }, { "epoch": 1.0685714285714285, "grad_norm": 3.7749218119290853, "learning_rate": 7.4642811827666965e-06, "loss": 0.6366, "step": 561 }, { "epoch": 1.0704761904761906, "grad_norm": 2.954117102936782, "learning_rate": 7.455325307205084e-06, "loss": 0.6264, "step": 562 }, { "epoch": 1.0723809523809524, "grad_norm": 2.7256178855732074, "learning_rate": 7.4463590388916075e-06, "loss": 0.6833, "step": 563 }, { "epoch": 1.0742857142857143, "grad_norm": 2.928500454110158, "learning_rate": 7.4373824157781425e-06, "loss": 0.6062, "step": 564 }, { "epoch": 1.0761904761904761, "grad_norm": 2.6743871619584203, "learning_rate": 7.428395475860398e-06, "loss": 0.7074, "step": 565 }, { "epoch": 1.078095238095238, "grad_norm": 2.4180749186593253, "learning_rate": 7.419398257177741e-06, "loss": 0.7505, "step": 566 }, { "epoch": 1.08, "grad_norm": 2.4380295507604144, "learning_rate": 7.410390797813058e-06, "loss": 0.6839, "step": 567 }, { "epoch": 1.081904761904762, "grad_norm": 3.1902766944170438, "learning_rate": 7.401373135892574e-06, "loss": 0.7149, "step": 568 }, { "epoch": 1.0838095238095238, "grad_norm": 2.9368239492577586, "learning_rate": 7.3923453095857015e-06, "loss": 0.682, "step": 569 }, { "epoch": 1.0857142857142856, "grad_norm": 3.1126220587391202, "learning_rate": 7.383307357104874e-06, "loss": 0.6853, "step": 570 }, { "epoch": 1.0876190476190477, "grad_norm": 3.308310990512469, "learning_rate": 7.374259316705388e-06, "loss": 0.7989, "step": 571 }, { "epoch": 1.0895238095238096, "grad_norm": 2.813660310937022, "learning_rate": 7.365201226685241e-06, "loss": 0.6996, "step": 572 }, { "epoch": 1.0914285714285714, "grad_norm": 2.90914066558398, "learning_rate": 7.356133125384966e-06, "loss": 0.5593, "step": 573 }, { "epoch": 1.0933333333333333, "grad_norm": 3.9102161897815413, "learning_rate": 7.3470550511874704e-06, "loss": 0.7346, "step": 574 }, { "epoch": 1.0952380952380953, "grad_norm": 2.3314594413614906, "learning_rate": 7.337967042517876e-06, "loss": 0.5884, "step": 575 }, { "epoch": 1.0971428571428572, "grad_norm": 3.7294915583346055, "learning_rate": 7.3288691378433556e-06, "loss": 0.8694, "step": 576 }, { "epoch": 1.099047619047619, "grad_norm": 2.241611532747276, "learning_rate": 7.319761375672964e-06, "loss": 0.7215, "step": 577 }, { "epoch": 1.100952380952381, "grad_norm": 3.0065054180640094, "learning_rate": 7.310643794557489e-06, "loss": 0.6533, "step": 578 }, { "epoch": 1.1028571428571428, "grad_norm": 2.5124703183128965, "learning_rate": 7.30151643308927e-06, "loss": 0.7418, "step": 579 }, { "epoch": 1.1047619047619048, "grad_norm": 3.1635163327358726, "learning_rate": 7.292379329902049e-06, "loss": 0.6065, "step": 580 }, { "epoch": 1.1066666666666667, "grad_norm": 3.058968244142787, "learning_rate": 7.283232523670805e-06, "loss": 0.7095, "step": 581 }, { "epoch": 1.1085714285714285, "grad_norm": 3.2744676662799383, "learning_rate": 7.274076053111579e-06, "loss": 0.6943, "step": 582 }, { "epoch": 1.1104761904761904, "grad_norm": 3.150223760389491, "learning_rate": 7.2649099569813264e-06, "loss": 0.6957, "step": 583 }, { "epoch": 1.1123809523809525, "grad_norm": 3.0837214519697955, "learning_rate": 7.255734274077742e-06, "loss": 0.7712, "step": 584 }, { "epoch": 1.1142857142857143, "grad_norm": 3.2997911569532996, "learning_rate": 7.246549043239098e-06, "loss": 0.6497, "step": 585 }, { "epoch": 1.1161904761904762, "grad_norm": 3.629950209092766, "learning_rate": 7.237354303344083e-06, "loss": 0.697, "step": 586 }, { "epoch": 1.118095238095238, "grad_norm": 2.1997562865480624, "learning_rate": 7.228150093311634e-06, "loss": 0.6353, "step": 587 }, { "epoch": 1.12, "grad_norm": 3.35340362360685, "learning_rate": 7.218936452100771e-06, "loss": 0.6252, "step": 588 }, { "epoch": 1.121904761904762, "grad_norm": 2.5596578364560845, "learning_rate": 7.209713418710437e-06, "loss": 1.01, "step": 589 }, { "epoch": 1.1238095238095238, "grad_norm": 3.0831464502774755, "learning_rate": 7.200481032179328e-06, "loss": 0.5934, "step": 590 }, { "epoch": 1.1257142857142857, "grad_norm": 3.1422531731636396, "learning_rate": 7.1912393315857245e-06, "loss": 0.7947, "step": 591 }, { "epoch": 1.1276190476190475, "grad_norm": 4.120755328615173, "learning_rate": 7.1819883560473395e-06, "loss": 0.7441, "step": 592 }, { "epoch": 1.1295238095238096, "grad_norm": 2.782211801179133, "learning_rate": 7.172728144721138e-06, "loss": 0.6375, "step": 593 }, { "epoch": 1.1314285714285715, "grad_norm": 2.951628940328863, "learning_rate": 7.163458736803184e-06, "loss": 0.7329, "step": 594 }, { "epoch": 1.1333333333333333, "grad_norm": 3.9215786491138362, "learning_rate": 7.154180171528462e-06, "loss": 0.6429, "step": 595 }, { "epoch": 1.1352380952380952, "grad_norm": 3.7546044540408747, "learning_rate": 7.144892488170719e-06, "loss": 0.6793, "step": 596 }, { "epoch": 1.1371428571428572, "grad_norm": 3.0558176842479634, "learning_rate": 7.135595726042299e-06, "loss": 0.7436, "step": 597 }, { "epoch": 1.139047619047619, "grad_norm": 3.0034861847596446, "learning_rate": 7.126289924493971e-06, "loss": 0.6186, "step": 598 }, { "epoch": 1.140952380952381, "grad_norm": 3.2711780797566945, "learning_rate": 7.116975122914767e-06, "loss": 0.6456, "step": 599 }, { "epoch": 1.1428571428571428, "grad_norm": 2.76214778973002, "learning_rate": 7.107651360731812e-06, "loss": 0.5644, "step": 600 }, { "epoch": 1.1447619047619049, "grad_norm": 4.036426978694489, "learning_rate": 7.098318677410163e-06, "loss": 0.6523, "step": 601 }, { "epoch": 1.1466666666666667, "grad_norm": 3.028625517033124, "learning_rate": 7.088977112452634e-06, "loss": 0.7852, "step": 602 }, { "epoch": 1.1485714285714286, "grad_norm": 2.5915779061337045, "learning_rate": 7.079626705399636e-06, "loss": 0.6592, "step": 603 }, { "epoch": 1.1504761904761904, "grad_norm": 3.0988196962230634, "learning_rate": 7.070267495829e-06, "loss": 0.7292, "step": 604 }, { "epoch": 1.1523809523809523, "grad_norm": 3.06446152236741, "learning_rate": 7.060899523355822e-06, "loss": 0.6567, "step": 605 }, { "epoch": 1.1542857142857144, "grad_norm": 3.192339444894251, "learning_rate": 7.051522827632288e-06, "loss": 0.715, "step": 606 }, { "epoch": 1.1561904761904762, "grad_norm": 3.1922700149619097, "learning_rate": 7.042137448347504e-06, "loss": 0.7048, "step": 607 }, { "epoch": 1.158095238095238, "grad_norm": 2.914801277992762, "learning_rate": 7.032743425227336e-06, "loss": 0.7237, "step": 608 }, { "epoch": 1.16, "grad_norm": 2.8583540958618325, "learning_rate": 7.023340798034234e-06, "loss": 0.664, "step": 609 }, { "epoch": 1.161904761904762, "grad_norm": 2.8405966340374302, "learning_rate": 7.013929606567067e-06, "loss": 0.8086, "step": 610 }, { "epoch": 1.1638095238095238, "grad_norm": 2.96081352241689, "learning_rate": 7.004509890660955e-06, "loss": 0.7387, "step": 611 }, { "epoch": 1.1657142857142857, "grad_norm": 2.851385859021923, "learning_rate": 6.995081690187097e-06, "loss": 0.7045, "step": 612 }, { "epoch": 1.1676190476190476, "grad_norm": 2.783273572615768, "learning_rate": 6.985645045052611e-06, "loss": 0.5878, "step": 613 }, { "epoch": 1.1695238095238096, "grad_norm": 2.6671554453652178, "learning_rate": 6.976199995200354e-06, "loss": 0.7057, "step": 614 }, { "epoch": 1.1714285714285715, "grad_norm": 2.72720234773299, "learning_rate": 6.966746580608761e-06, "loss": 0.6468, "step": 615 }, { "epoch": 1.1733333333333333, "grad_norm": 2.7363828971637454, "learning_rate": 6.95728484129167e-06, "loss": 0.6577, "step": 616 }, { "epoch": 1.1752380952380952, "grad_norm": 3.0876827077248183, "learning_rate": 6.947814817298155e-06, "loss": 0.7497, "step": 617 }, { "epoch": 1.177142857142857, "grad_norm": 3.330842295767064, "learning_rate": 6.93833654871236e-06, "loss": 0.6806, "step": 618 }, { "epoch": 1.1790476190476191, "grad_norm": 2.9509529527079117, "learning_rate": 6.928850075653326e-06, "loss": 0.682, "step": 619 }, { "epoch": 1.180952380952381, "grad_norm": 2.5296803223351705, "learning_rate": 6.919355438274817e-06, "loss": 0.6394, "step": 620 }, { "epoch": 1.1828571428571428, "grad_norm": 2.6766781705506006, "learning_rate": 6.909852676765161e-06, "loss": 0.6751, "step": 621 }, { "epoch": 1.1847619047619047, "grad_norm": 2.6692385415608926, "learning_rate": 6.900341831347067e-06, "loss": 0.6577, "step": 622 }, { "epoch": 1.1866666666666668, "grad_norm": 2.895449612452324, "learning_rate": 6.8908229422774666e-06, "loss": 0.7271, "step": 623 }, { "epoch": 1.1885714285714286, "grad_norm": 2.572667406001419, "learning_rate": 6.881296049847333e-06, "loss": 0.6749, "step": 624 }, { "epoch": 1.1904761904761905, "grad_norm": 2.3550347849478284, "learning_rate": 6.8717611943815185e-06, "loss": 0.7193, "step": 625 }, { "epoch": 1.1923809523809523, "grad_norm": 2.9313104534606502, "learning_rate": 6.862218416238582e-06, "loss": 0.7198, "step": 626 }, { "epoch": 1.1942857142857144, "grad_norm": 2.684143088959432, "learning_rate": 6.852667755810614e-06, "loss": 0.7345, "step": 627 }, { "epoch": 1.1961904761904762, "grad_norm": 2.57853016776272, "learning_rate": 6.84310925352307e-06, "loss": 0.6601, "step": 628 }, { "epoch": 1.198095238095238, "grad_norm": 3.244096053912211, "learning_rate": 6.8335429498346e-06, "loss": 0.5982, "step": 629 }, { "epoch": 1.2, "grad_norm": 2.904330283712291, "learning_rate": 6.823968885236869e-06, "loss": 0.6148, "step": 630 }, { "epoch": 1.2019047619047618, "grad_norm": 2.813833695416442, "learning_rate": 6.814387100254399e-06, "loss": 0.7887, "step": 631 }, { "epoch": 1.2038095238095239, "grad_norm": 3.013400807925236, "learning_rate": 6.804797635444389e-06, "loss": 0.7622, "step": 632 }, { "epoch": 1.2057142857142857, "grad_norm": 3.247727385129131, "learning_rate": 6.795200531396542e-06, "loss": 0.661, "step": 633 }, { "epoch": 1.2076190476190476, "grad_norm": 3.2811715636074616, "learning_rate": 6.785595828732896e-06, "loss": 0.681, "step": 634 }, { "epoch": 1.2095238095238094, "grad_norm": 2.9902563399511664, "learning_rate": 6.775983568107653e-06, "loss": 0.7117, "step": 635 }, { "epoch": 1.2114285714285715, "grad_norm": 2.243652325070851, "learning_rate": 6.766363790207009e-06, "loss": 0.5765, "step": 636 }, { "epoch": 1.2133333333333334, "grad_norm": 2.5735329670900917, "learning_rate": 6.756736535748974e-06, "loss": 0.7017, "step": 637 }, { "epoch": 1.2152380952380952, "grad_norm": 3.80153409494151, "learning_rate": 6.747101845483206e-06, "loss": 0.7943, "step": 638 }, { "epoch": 1.217142857142857, "grad_norm": 2.6745618332686085, "learning_rate": 6.7374597601908365e-06, "loss": 0.6102, "step": 639 }, { "epoch": 1.2190476190476192, "grad_norm": 3.5009554061622836, "learning_rate": 6.7278103206843005e-06, "loss": 0.7141, "step": 640 }, { "epoch": 1.220952380952381, "grad_norm": 3.296137766731785, "learning_rate": 6.718153567807157e-06, "loss": 0.7761, "step": 641 }, { "epoch": 1.2228571428571429, "grad_norm": 2.3029113743264755, "learning_rate": 6.7084895424339254e-06, "loss": 0.7787, "step": 642 }, { "epoch": 1.2247619047619047, "grad_norm": 2.699693904526732, "learning_rate": 6.6988182854699045e-06, "loss": 0.6739, "step": 643 }, { "epoch": 1.2266666666666666, "grad_norm": 2.7706641633621936, "learning_rate": 6.6891398378510034e-06, "loss": 0.9543, "step": 644 }, { "epoch": 1.2285714285714286, "grad_norm": 3.114602507663263, "learning_rate": 6.679454240543567e-06, "loss": 0.7415, "step": 645 }, { "epoch": 1.2304761904761905, "grad_norm": 3.748018389276035, "learning_rate": 6.669761534544205e-06, "loss": 0.6831, "step": 646 }, { "epoch": 1.2323809523809524, "grad_norm": 2.932045476585695, "learning_rate": 6.6600617608796126e-06, "loss": 0.566, "step": 647 }, { "epoch": 1.2342857142857142, "grad_norm": 3.4254357212361652, "learning_rate": 6.650354960606405e-06, "loss": 0.7355, "step": 648 }, { "epoch": 1.2361904761904763, "grad_norm": 3.2171574185270275, "learning_rate": 6.640641174810934e-06, "loss": 0.7746, "step": 649 }, { "epoch": 1.2380952380952381, "grad_norm": 3.25023258863064, "learning_rate": 6.630920444609127e-06, "loss": 0.5981, "step": 650 }, { "epoch": 1.24, "grad_norm": 3.4511631219101995, "learning_rate": 6.621192811146297e-06, "loss": 0.6065, "step": 651 }, { "epoch": 1.2419047619047618, "grad_norm": 2.4793514657437696, "learning_rate": 6.611458315596979e-06, "loss": 0.7514, "step": 652 }, { "epoch": 1.243809523809524, "grad_norm": 2.842659106653368, "learning_rate": 6.601716999164759e-06, "loss": 0.6749, "step": 653 }, { "epoch": 1.2457142857142858, "grad_norm": 2.447668155196946, "learning_rate": 6.591968903082087e-06, "loss": 0.7702, "step": 654 }, { "epoch": 1.2476190476190476, "grad_norm": 2.4860929729475623, "learning_rate": 6.582214068610111e-06, "loss": 0.6024, "step": 655 }, { "epoch": 1.2495238095238095, "grad_norm": 2.7927351396236917, "learning_rate": 6.572452537038504e-06, "loss": 0.805, "step": 656 }, { "epoch": 1.2514285714285713, "grad_norm": 2.330341372653887, "learning_rate": 6.562684349685283e-06, "loss": 0.5915, "step": 657 }, { "epoch": 1.2533333333333334, "grad_norm": 2.6353598138337517, "learning_rate": 6.55290954789664e-06, "loss": 0.6989, "step": 658 }, { "epoch": 1.2552380952380953, "grad_norm": 3.4826327690358707, "learning_rate": 6.54312817304676e-06, "loss": 0.672, "step": 659 }, { "epoch": 1.2571428571428571, "grad_norm": 3.9266385131697135, "learning_rate": 6.533340266537656e-06, "loss": 0.7152, "step": 660 }, { "epoch": 1.259047619047619, "grad_norm": 3.311353164852456, "learning_rate": 6.52354586979898e-06, "loss": 0.6168, "step": 661 }, { "epoch": 1.2609523809523808, "grad_norm": 2.6540745304501554, "learning_rate": 6.513745024287861e-06, "loss": 0.6757, "step": 662 }, { "epoch": 1.262857142857143, "grad_norm": 2.5763142149127174, "learning_rate": 6.503937771488724e-06, "loss": 0.5506, "step": 663 }, { "epoch": 1.2647619047619048, "grad_norm": 2.7276428205823327, "learning_rate": 6.494124152913113e-06, "loss": 0.6259, "step": 664 }, { "epoch": 1.2666666666666666, "grad_norm": 3.5755199923419405, "learning_rate": 6.484304210099517e-06, "loss": 0.7152, "step": 665 }, { "epoch": 1.2685714285714287, "grad_norm": 3.7130504152492914, "learning_rate": 6.474477984613192e-06, "loss": 0.6839, "step": 666 }, { "epoch": 1.2704761904761905, "grad_norm": 3.1016323384414277, "learning_rate": 6.464645518045991e-06, "loss": 0.8442, "step": 667 }, { "epoch": 1.2723809523809524, "grad_norm": 2.374826299733099, "learning_rate": 6.454806852016179e-06, "loss": 0.6381, "step": 668 }, { "epoch": 1.2742857142857142, "grad_norm": 2.20642420777723, "learning_rate": 6.444962028168266e-06, "loss": 0.7221, "step": 669 }, { "epoch": 1.276190476190476, "grad_norm": 3.4295788673888477, "learning_rate": 6.435111088172823e-06, "loss": 0.617, "step": 670 }, { "epoch": 1.2780952380952382, "grad_norm": 2.655800684712996, "learning_rate": 6.425254073726311e-06, "loss": 0.6748, "step": 671 }, { "epoch": 1.28, "grad_norm": 3.077761933655665, "learning_rate": 6.415391026550903e-06, "loss": 0.6371, "step": 672 }, { "epoch": 1.2819047619047619, "grad_norm": 2.7718500409566946, "learning_rate": 6.405521988394305e-06, "loss": 0.7329, "step": 673 }, { "epoch": 1.2838095238095237, "grad_norm": 3.4275730335541748, "learning_rate": 6.395647001029586e-06, "loss": 0.7096, "step": 674 }, { "epoch": 1.2857142857142856, "grad_norm": 2.9976507369396623, "learning_rate": 6.385766106254987e-06, "loss": 0.6088, "step": 675 }, { "epoch": 1.2876190476190477, "grad_norm": 3.1837382805231202, "learning_rate": 6.375879345893763e-06, "loss": 0.725, "step": 676 }, { "epoch": 1.2895238095238095, "grad_norm": 2.5997684793341698, "learning_rate": 6.365986761793992e-06, "loss": 0.6453, "step": 677 }, { "epoch": 1.2914285714285714, "grad_norm": 2.6334802846490284, "learning_rate": 6.356088395828404e-06, "loss": 0.67, "step": 678 }, { "epoch": 1.2933333333333334, "grad_norm": 2.8475343588361572, "learning_rate": 6.346184289894202e-06, "loss": 0.718, "step": 679 }, { "epoch": 1.2952380952380953, "grad_norm": 3.293427470182712, "learning_rate": 6.336274485912881e-06, "loss": 0.8292, "step": 680 }, { "epoch": 1.2971428571428572, "grad_norm": 3.9640201165292663, "learning_rate": 6.326359025830061e-06, "loss": 0.6829, "step": 681 }, { "epoch": 1.299047619047619, "grad_norm": 2.75330361872787, "learning_rate": 6.3164379516152995e-06, "loss": 0.8595, "step": 682 }, { "epoch": 1.3009523809523809, "grad_norm": 3.3356147097372935, "learning_rate": 6.306511305261914e-06, "loss": 0.6987, "step": 683 }, { "epoch": 1.302857142857143, "grad_norm": 4.349370713917045, "learning_rate": 6.29657912878681e-06, "loss": 0.762, "step": 684 }, { "epoch": 1.3047619047619048, "grad_norm": 3.0295474631604953, "learning_rate": 6.2866414642303054e-06, "loss": 0.5335, "step": 685 }, { "epoch": 1.3066666666666666, "grad_norm": 2.912987585897963, "learning_rate": 6.2766983536559404e-06, "loss": 0.6494, "step": 686 }, { "epoch": 1.3085714285714285, "grad_norm": 3.3460715358833086, "learning_rate": 6.266749839150309e-06, "loss": 0.6154, "step": 687 }, { "epoch": 1.3104761904761904, "grad_norm": 2.965154041904548, "learning_rate": 6.256795962822881e-06, "loss": 0.6863, "step": 688 }, { "epoch": 1.3123809523809524, "grad_norm": 3.554960125513718, "learning_rate": 6.2468367668058195e-06, "loss": 0.8489, "step": 689 }, { "epoch": 1.3142857142857143, "grad_norm": 3.117852691852733, "learning_rate": 6.236872293253803e-06, "loss": 0.8616, "step": 690 }, { "epoch": 1.3161904761904761, "grad_norm": 3.39993417136851, "learning_rate": 6.22690258434385e-06, "loss": 0.6842, "step": 691 }, { "epoch": 1.3180952380952382, "grad_norm": 3.016683033095873, "learning_rate": 6.2169276822751425e-06, "loss": 0.7003, "step": 692 }, { "epoch": 1.32, "grad_norm": 2.535887025186407, "learning_rate": 6.206947629268838e-06, "loss": 0.6624, "step": 693 }, { "epoch": 1.321904761904762, "grad_norm": 3.193173944123266, "learning_rate": 6.196962467567897e-06, "loss": 0.7883, "step": 694 }, { "epoch": 1.3238095238095238, "grad_norm": 2.833232000100724, "learning_rate": 6.186972239436911e-06, "loss": 0.6097, "step": 695 }, { "epoch": 1.3257142857142856, "grad_norm": 3.816531669777505, "learning_rate": 6.1769769871619035e-06, "loss": 0.7582, "step": 696 }, { "epoch": 1.3276190476190477, "grad_norm": 2.727282902700753, "learning_rate": 6.166976753050176e-06, "loss": 0.7223, "step": 697 }, { "epoch": 1.3295238095238096, "grad_norm": 2.47254221153791, "learning_rate": 6.156971579430111e-06, "loss": 0.7326, "step": 698 }, { "epoch": 1.3314285714285714, "grad_norm": 3.3277609741062713, "learning_rate": 6.146961508651002e-06, "loss": 0.6196, "step": 699 }, { "epoch": 1.3333333333333333, "grad_norm": 2.488166322666529, "learning_rate": 6.136946583082866e-06, "loss": 0.5561, "step": 700 }, { "epoch": 1.3352380952380951, "grad_norm": 3.5366781350749448, "learning_rate": 6.126926845116272e-06, "loss": 0.6788, "step": 701 }, { "epoch": 1.3371428571428572, "grad_norm": 2.8607003125373467, "learning_rate": 6.11690233716216e-06, "loss": 0.7793, "step": 702 }, { "epoch": 1.339047619047619, "grad_norm": 3.0435458215778866, "learning_rate": 6.106873101651659e-06, "loss": 0.7998, "step": 703 }, { "epoch": 1.340952380952381, "grad_norm": 3.2575729935155735, "learning_rate": 6.096839181035905e-06, "loss": 0.5916, "step": 704 }, { "epoch": 1.342857142857143, "grad_norm": 2.9063740802818665, "learning_rate": 6.08680061778587e-06, "loss": 0.741, "step": 705 }, { "epoch": 1.3447619047619048, "grad_norm": 3.9752753569114696, "learning_rate": 6.076757454392176e-06, "loss": 0.6629, "step": 706 }, { "epoch": 1.3466666666666667, "grad_norm": 2.7449874531756864, "learning_rate": 6.066709733364914e-06, "loss": 0.7444, "step": 707 }, { "epoch": 1.3485714285714285, "grad_norm": 3.4410814124933573, "learning_rate": 6.056657497233465e-06, "loss": 0.658, "step": 708 }, { "epoch": 1.3504761904761904, "grad_norm": 3.7547841103755286, "learning_rate": 6.046600788546328e-06, "loss": 0.7313, "step": 709 }, { "epoch": 1.3523809523809525, "grad_norm": 2.816106699814712, "learning_rate": 6.036539649870928e-06, "loss": 0.6924, "step": 710 }, { "epoch": 1.3542857142857143, "grad_norm": 2.7898661832286744, "learning_rate": 6.026474123793436e-06, "loss": 0.7206, "step": 711 }, { "epoch": 1.3561904761904762, "grad_norm": 2.9719654540369307, "learning_rate": 6.0164042529186075e-06, "loss": 0.6863, "step": 712 }, { "epoch": 1.358095238095238, "grad_norm": 2.547070736691609, "learning_rate": 6.006330079869577e-06, "loss": 0.6067, "step": 713 }, { "epoch": 1.3599999999999999, "grad_norm": 3.514053199524337, "learning_rate": 5.996251647287692e-06, "loss": 0.9087, "step": 714 }, { "epoch": 1.361904761904762, "grad_norm": 2.6237497788605113, "learning_rate": 5.98616899783233e-06, "loss": 0.5744, "step": 715 }, { "epoch": 1.3638095238095238, "grad_norm": 3.041927925551627, "learning_rate": 5.97608217418072e-06, "loss": 0.7331, "step": 716 }, { "epoch": 1.3657142857142857, "grad_norm": 3.915936199058181, "learning_rate": 5.965991219027753e-06, "loss": 0.6955, "step": 717 }, { "epoch": 1.3676190476190477, "grad_norm": 3.5420297523927675, "learning_rate": 5.9558961750858114e-06, "loss": 0.7517, "step": 718 }, { "epoch": 1.3695238095238096, "grad_norm": 2.9957096252738973, "learning_rate": 5.945797085084587e-06, "loss": 0.6347, "step": 719 }, { "epoch": 1.3714285714285714, "grad_norm": 2.2230170081613587, "learning_rate": 5.9356939917708945e-06, "loss": 0.6186, "step": 720 }, { "epoch": 1.3733333333333333, "grad_norm": 3.042777312528671, "learning_rate": 5.925586937908491e-06, "loss": 0.6627, "step": 721 }, { "epoch": 1.3752380952380951, "grad_norm": 2.8848699148590016, "learning_rate": 5.915475966277901e-06, "loss": 0.6201, "step": 722 }, { "epoch": 1.3771428571428572, "grad_norm": 2.5897752694710703, "learning_rate": 5.905361119676234e-06, "loss": 0.5931, "step": 723 }, { "epoch": 1.379047619047619, "grad_norm": 2.5972834657518744, "learning_rate": 5.895242440916996e-06, "loss": 0.6679, "step": 724 }, { "epoch": 1.380952380952381, "grad_norm": 2.395772503877728, "learning_rate": 5.885119972829915e-06, "loss": 0.7798, "step": 725 }, { "epoch": 1.3828571428571428, "grad_norm": 3.001203393028994, "learning_rate": 5.874993758260762e-06, "loss": 0.6708, "step": 726 }, { "epoch": 1.3847619047619046, "grad_norm": 2.602232348223643, "learning_rate": 5.864863840071162e-06, "loss": 0.7032, "step": 727 }, { "epoch": 1.3866666666666667, "grad_norm": 2.7432460005389565, "learning_rate": 5.854730261138418e-06, "loss": 0.8601, "step": 728 }, { "epoch": 1.3885714285714286, "grad_norm": 4.492597819123363, "learning_rate": 5.844593064355327e-06, "loss": 0.7868, "step": 729 }, { "epoch": 1.3904761904761904, "grad_norm": 3.174561526439116, "learning_rate": 5.834452292629999e-06, "loss": 0.6974, "step": 730 }, { "epoch": 1.3923809523809525, "grad_norm": 2.7381709410077377, "learning_rate": 5.8243079888856806e-06, "loss": 0.7537, "step": 731 }, { "epoch": 1.3942857142857144, "grad_norm": 2.814631088171071, "learning_rate": 5.814160196060559e-06, "loss": 0.6452, "step": 732 }, { "epoch": 1.3961904761904762, "grad_norm": 2.4498587166561796, "learning_rate": 5.8040089571075995e-06, "loss": 0.7442, "step": 733 }, { "epoch": 1.398095238095238, "grad_norm": 3.2496619835380596, "learning_rate": 5.7938543149943495e-06, "loss": 0.6019, "step": 734 }, { "epoch": 1.4, "grad_norm": 4.032238546119449, "learning_rate": 5.78369631270276e-06, "loss": 0.6778, "step": 735 }, { "epoch": 1.401904761904762, "grad_norm": 3.140057835205741, "learning_rate": 5.773534993229007e-06, "loss": 0.7595, "step": 736 }, { "epoch": 1.4038095238095238, "grad_norm": 2.871132731659619, "learning_rate": 5.763370399583307e-06, "loss": 0.5589, "step": 737 }, { "epoch": 1.4057142857142857, "grad_norm": 3.1300745174835147, "learning_rate": 5.753202574789735e-06, "loss": 0.7086, "step": 738 }, { "epoch": 1.4076190476190475, "grad_norm": 2.79437180770245, "learning_rate": 5.743031561886039e-06, "loss": 0.6284, "step": 739 }, { "epoch": 1.4095238095238094, "grad_norm": 2.8627209838146364, "learning_rate": 5.7328574039234696e-06, "loss": 0.7121, "step": 740 }, { "epoch": 1.4114285714285715, "grad_norm": 3.7859419380269035, "learning_rate": 5.722680143966582e-06, "loss": 0.5724, "step": 741 }, { "epoch": 1.4133333333333333, "grad_norm": 2.479898353947677, "learning_rate": 5.7124998250930665e-06, "loss": 0.6805, "step": 742 }, { "epoch": 1.4152380952380952, "grad_norm": 3.3140708190612567, "learning_rate": 5.702316490393556e-06, "loss": 0.6668, "step": 743 }, { "epoch": 1.4171428571428573, "grad_norm": 3.0177328733621747, "learning_rate": 5.6921301829714525e-06, "loss": 0.7792, "step": 744 }, { "epoch": 1.4190476190476191, "grad_norm": 3.0086287709904362, "learning_rate": 5.681940945942739e-06, "loss": 0.5872, "step": 745 }, { "epoch": 1.420952380952381, "grad_norm": 3.519233924180684, "learning_rate": 5.6717488224358e-06, "loss": 0.6653, "step": 746 }, { "epoch": 1.4228571428571428, "grad_norm": 4.638840193856848, "learning_rate": 5.661553855591237e-06, "loss": 0.7945, "step": 747 }, { "epoch": 1.4247619047619047, "grad_norm": 2.9267759360744137, "learning_rate": 5.651356088561685e-06, "loss": 0.6684, "step": 748 }, { "epoch": 1.4266666666666667, "grad_norm": 3.046637456908023, "learning_rate": 5.641155564511634e-06, "loss": 0.7486, "step": 749 }, { "epoch": 1.4285714285714286, "grad_norm": 3.016922422419726, "learning_rate": 5.630952326617243e-06, "loss": 0.7607, "step": 750 }, { "epoch": 1.4304761904761905, "grad_norm": 2.077081545887961, "learning_rate": 5.620746418066158e-06, "loss": 0.6368, "step": 751 }, { "epoch": 1.4323809523809523, "grad_norm": 2.222786815953259, "learning_rate": 5.610537882057329e-06, "loss": 0.7579, "step": 752 }, { "epoch": 1.4342857142857142, "grad_norm": 2.8408310035328674, "learning_rate": 5.600326761800826e-06, "loss": 0.6567, "step": 753 }, { "epoch": 1.4361904761904762, "grad_norm": 3.620772954265577, "learning_rate": 5.59011310051766e-06, "loss": 0.8617, "step": 754 }, { "epoch": 1.438095238095238, "grad_norm": 3.4873578497643583, "learning_rate": 5.579896941439596e-06, "loss": 0.7156, "step": 755 }, { "epoch": 1.44, "grad_norm": 3.361434073720694, "learning_rate": 5.5696783278089724e-06, "loss": 0.6971, "step": 756 }, { "epoch": 1.441904761904762, "grad_norm": 2.412579114731131, "learning_rate": 5.559457302878517e-06, "loss": 0.5188, "step": 757 }, { "epoch": 1.4438095238095239, "grad_norm": 2.813791279931654, "learning_rate": 5.54923390991116e-06, "loss": 0.7952, "step": 758 }, { "epoch": 1.4457142857142857, "grad_norm": 3.2034583719282215, "learning_rate": 5.539008192179862e-06, "loss": 0.5124, "step": 759 }, { "epoch": 1.4476190476190476, "grad_norm": 2.5104234596805384, "learning_rate": 5.528780192967419e-06, "loss": 0.6554, "step": 760 }, { "epoch": 1.4495238095238094, "grad_norm": 4.1105396484207075, "learning_rate": 5.5185499555662845e-06, "loss": 0.7394, "step": 761 }, { "epoch": 1.4514285714285715, "grad_norm": 4.273975163273313, "learning_rate": 5.508317523278386e-06, "loss": 0.7807, "step": 762 }, { "epoch": 1.4533333333333334, "grad_norm": 2.946980421500577, "learning_rate": 5.498082939414943e-06, "loss": 0.6459, "step": 763 }, { "epoch": 1.4552380952380952, "grad_norm": 3.7878732338823076, "learning_rate": 5.487846247296278e-06, "loss": 0.7925, "step": 764 }, { "epoch": 1.457142857142857, "grad_norm": 2.8106610260560903, "learning_rate": 5.477607490251642e-06, "loss": 0.648, "step": 765 }, { "epoch": 1.459047619047619, "grad_norm": 1.986311245912668, "learning_rate": 5.467366711619024e-06, "loss": 0.6146, "step": 766 }, { "epoch": 1.460952380952381, "grad_norm": 2.8887232997693886, "learning_rate": 5.457123954744967e-06, "loss": 0.9805, "step": 767 }, { "epoch": 1.4628571428571429, "grad_norm": 3.2023413077412823, "learning_rate": 5.4468792629843944e-06, "loss": 0.5809, "step": 768 }, { "epoch": 1.4647619047619047, "grad_norm": 3.419380543553438, "learning_rate": 5.436632679700414e-06, "loss": 0.7441, "step": 769 }, { "epoch": 1.4666666666666668, "grad_norm": 3.562089726232499, "learning_rate": 5.42638424826414e-06, "loss": 0.7045, "step": 770 }, { "epoch": 1.4685714285714286, "grad_norm": 2.4486399824970544, "learning_rate": 5.416134012054512e-06, "loss": 0.6379, "step": 771 }, { "epoch": 1.4704761904761905, "grad_norm": 4.0111382244553, "learning_rate": 5.405882014458108e-06, "loss": 0.7735, "step": 772 }, { "epoch": 1.4723809523809523, "grad_norm": 3.1008837360887123, "learning_rate": 5.395628298868959e-06, "loss": 0.6142, "step": 773 }, { "epoch": 1.4742857142857142, "grad_norm": 3.076309986647131, "learning_rate": 5.385372908688371e-06, "loss": 0.6175, "step": 774 }, { "epoch": 1.4761904761904763, "grad_norm": 3.5369815120040684, "learning_rate": 5.375115887324736e-06, "loss": 0.7672, "step": 775 }, { "epoch": 1.4780952380952381, "grad_norm": 2.9278367481439895, "learning_rate": 5.364857278193352e-06, "loss": 0.6826, "step": 776 }, { "epoch": 1.48, "grad_norm": 2.4330230002175353, "learning_rate": 5.3545971247162355e-06, "loss": 0.5997, "step": 777 }, { "epoch": 1.4819047619047618, "grad_norm": 3.3558247094866593, "learning_rate": 5.344335470321943e-06, "loss": 0.72, "step": 778 }, { "epoch": 1.4838095238095237, "grad_norm": 2.645281645076092, "learning_rate": 5.33407235844538e-06, "loss": 0.584, "step": 779 }, { "epoch": 1.4857142857142858, "grad_norm": 2.549284753692442, "learning_rate": 5.3238078325276255e-06, "loss": 0.636, "step": 780 }, { "epoch": 1.4876190476190476, "grad_norm": 2.8779634095757567, "learning_rate": 5.313541936015738e-06, "loss": 0.7058, "step": 781 }, { "epoch": 1.4895238095238095, "grad_norm": 2.8052343478617017, "learning_rate": 5.303274712362585e-06, "loss": 0.6137, "step": 782 }, { "epoch": 1.4914285714285715, "grad_norm": 3.1443738009443525, "learning_rate": 5.293006205026646e-06, "loss": 0.7201, "step": 783 }, { "epoch": 1.4933333333333334, "grad_norm": 3.489118791079987, "learning_rate": 5.2827364574718345e-06, "loss": 0.73, "step": 784 }, { "epoch": 1.4952380952380953, "grad_norm": 2.7852702479871, "learning_rate": 5.272465513167314e-06, "loss": 0.5916, "step": 785 }, { "epoch": 1.497142857142857, "grad_norm": 2.957697607444805, "learning_rate": 5.2621934155873146e-06, "loss": 0.5232, "step": 786 }, { "epoch": 1.499047619047619, "grad_norm": 3.8701519060947946, "learning_rate": 5.2519202082109486e-06, "loss": 0.6723, "step": 787 }, { "epoch": 1.500952380952381, "grad_norm": 3.4906011764517864, "learning_rate": 5.24164593452202e-06, "loss": 0.7324, "step": 788 }, { "epoch": 1.502857142857143, "grad_norm": 3.023484572517283, "learning_rate": 5.231370638008856e-06, "loss": 0.5537, "step": 789 }, { "epoch": 1.5047619047619047, "grad_norm": 3.099329266333818, "learning_rate": 5.221094362164103e-06, "loss": 0.6879, "step": 790 }, { "epoch": 1.5066666666666668, "grad_norm": 3.1294355431433334, "learning_rate": 5.210817150484562e-06, "loss": 0.6488, "step": 791 }, { "epoch": 1.5085714285714285, "grad_norm": 2.7660308401375917, "learning_rate": 5.200539046470986e-06, "loss": 0.6598, "step": 792 }, { "epoch": 1.5104761904761905, "grad_norm": 2.7023455096788016, "learning_rate": 5.190260093627912e-06, "loss": 0.6454, "step": 793 }, { "epoch": 1.5123809523809524, "grad_norm": 2.5508349609780994, "learning_rate": 5.179980335463467e-06, "loss": 0.682, "step": 794 }, { "epoch": 1.5142857142857142, "grad_norm": 3.2152014721060604, "learning_rate": 5.169699815489187e-06, "loss": 0.7798, "step": 795 }, { "epoch": 1.5161904761904763, "grad_norm": 3.1630300513674756, "learning_rate": 5.159418577219832e-06, "loss": 0.5927, "step": 796 }, { "epoch": 1.518095238095238, "grad_norm": 3.5036069767173257, "learning_rate": 5.149136664173205e-06, "loss": 0.8095, "step": 797 }, { "epoch": 1.52, "grad_norm": 2.718578606579851, "learning_rate": 5.138854119869962e-06, "loss": 0.6012, "step": 798 }, { "epoch": 1.5219047619047619, "grad_norm": 3.8660886936224443, "learning_rate": 5.128570987833433e-06, "loss": 0.9095, "step": 799 }, { "epoch": 1.5238095238095237, "grad_norm": 2.8226941473058877, "learning_rate": 5.118287311589435e-06, "loss": 0.5466, "step": 800 }, { "epoch": 1.5257142857142858, "grad_norm": 3.5164132340577336, "learning_rate": 5.108003134666085e-06, "loss": 0.6947, "step": 801 }, { "epoch": 1.5276190476190477, "grad_norm": 3.446538166929701, "learning_rate": 5.0977185005936245e-06, "loss": 0.6454, "step": 802 }, { "epoch": 1.5295238095238095, "grad_norm": 2.8804223530054833, "learning_rate": 5.08743345290423e-06, "loss": 0.6789, "step": 803 }, { "epoch": 1.5314285714285716, "grad_norm": 3.728491017598619, "learning_rate": 5.0771480351318245e-06, "loss": 0.6611, "step": 804 }, { "epoch": 1.5333333333333332, "grad_norm": 2.683746756487477, "learning_rate": 5.066862290811901e-06, "loss": 0.628, "step": 805 }, { "epoch": 1.5352380952380953, "grad_norm": 2.826122839600481, "learning_rate": 5.056576263481333e-06, "loss": 0.6233, "step": 806 }, { "epoch": 1.5371428571428571, "grad_norm": 2.4424899804917395, "learning_rate": 5.046289996678193e-06, "loss": 0.6067, "step": 807 }, { "epoch": 1.539047619047619, "grad_norm": 3.315304635544952, "learning_rate": 5.036003533941566e-06, "loss": 0.5374, "step": 808 }, { "epoch": 1.540952380952381, "grad_norm": 2.5365866522887788, "learning_rate": 5.025716918811365e-06, "loss": 0.6411, "step": 809 }, { "epoch": 1.5428571428571427, "grad_norm": 3.0745905352339853, "learning_rate": 5.015430194828153e-06, "loss": 0.6221, "step": 810 }, { "epoch": 1.5447619047619048, "grad_norm": 3.395566166882911, "learning_rate": 5.005143405532949e-06, "loss": 0.7, "step": 811 }, { "epoch": 1.5466666666666666, "grad_norm": 3.177105463980441, "learning_rate": 4.994856594467052e-06, "loss": 0.7141, "step": 812 }, { "epoch": 1.5485714285714285, "grad_norm": 2.869780556824362, "learning_rate": 4.984569805171848e-06, "loss": 0.6307, "step": 813 }, { "epoch": 1.5504761904761906, "grad_norm": 2.781756018163063, "learning_rate": 4.974283081188637e-06, "loss": 0.6423, "step": 814 }, { "epoch": 1.5523809523809524, "grad_norm": 2.6451418188447886, "learning_rate": 4.963996466058437e-06, "loss": 0.6308, "step": 815 }, { "epoch": 1.5542857142857143, "grad_norm": 2.600168166749201, "learning_rate": 4.9537100033218095e-06, "loss": 0.7036, "step": 816 }, { "epoch": 1.5561904761904763, "grad_norm": 3.7102416903278344, "learning_rate": 4.9434237365186684e-06, "loss": 0.7789, "step": 817 }, { "epoch": 1.558095238095238, "grad_norm": 2.7509058272754063, "learning_rate": 4.933137709188101e-06, "loss": 0.6555, "step": 818 }, { "epoch": 1.56, "grad_norm": 3.693036365858434, "learning_rate": 4.9228519648681755e-06, "loss": 0.7627, "step": 819 }, { "epoch": 1.561904761904762, "grad_norm": 2.509385243026165, "learning_rate": 4.912566547095771e-06, "loss": 0.6935, "step": 820 }, { "epoch": 1.5638095238095238, "grad_norm": 2.835926938956802, "learning_rate": 4.9022814994063755e-06, "loss": 0.7063, "step": 821 }, { "epoch": 1.5657142857142858, "grad_norm": 4.228622983137872, "learning_rate": 4.891996865333916e-06, "loss": 0.8022, "step": 822 }, { "epoch": 1.5676190476190475, "grad_norm": 2.375014267818986, "learning_rate": 4.881712688410567e-06, "loss": 0.7894, "step": 823 }, { "epoch": 1.5695238095238095, "grad_norm": 2.9474220653843735, "learning_rate": 4.8714290121665685e-06, "loss": 0.7133, "step": 824 }, { "epoch": 1.5714285714285714, "grad_norm": 2.7906613623942857, "learning_rate": 4.8611458801300385e-06, "loss": 0.6856, "step": 825 }, { "epoch": 1.5733333333333333, "grad_norm": 2.70156780088855, "learning_rate": 4.850863335826796e-06, "loss": 0.7329, "step": 826 }, { "epoch": 1.5752380952380953, "grad_norm": 2.9695588772186587, "learning_rate": 4.840581422780169e-06, "loss": 0.7742, "step": 827 }, { "epoch": 1.5771428571428572, "grad_norm": 2.719873968817656, "learning_rate": 4.830300184510815e-06, "loss": 0.6049, "step": 828 }, { "epoch": 1.579047619047619, "grad_norm": 2.812046412806861, "learning_rate": 4.820019664536535e-06, "loss": 0.6597, "step": 829 }, { "epoch": 1.580952380952381, "grad_norm": 2.1229560914014574, "learning_rate": 4.809739906372091e-06, "loss": 0.6903, "step": 830 }, { "epoch": 1.5828571428571427, "grad_norm": 2.4094540674992806, "learning_rate": 4.799460953529017e-06, "loss": 0.7547, "step": 831 }, { "epoch": 1.5847619047619048, "grad_norm": 2.6862956326258485, "learning_rate": 4.789182849515441e-06, "loss": 0.5746, "step": 832 }, { "epoch": 1.5866666666666667, "grad_norm": 2.9202442761476366, "learning_rate": 4.778905637835896e-06, "loss": 0.6531, "step": 833 }, { "epoch": 1.5885714285714285, "grad_norm": 3.1979912363852487, "learning_rate": 4.768629361991145e-06, "loss": 0.6923, "step": 834 }, { "epoch": 1.5904761904761906, "grad_norm": 3.7943376500290826, "learning_rate": 4.75835406547798e-06, "loss": 0.7664, "step": 835 }, { "epoch": 1.5923809523809522, "grad_norm": 3.3082326577429453, "learning_rate": 4.748079791789053e-06, "loss": 0.7764, "step": 836 }, { "epoch": 1.5942857142857143, "grad_norm": 3.048345940965857, "learning_rate": 4.737806584412687e-06, "loss": 0.6646, "step": 837 }, { "epoch": 1.5961904761904762, "grad_norm": 2.918550148394755, "learning_rate": 4.727534486832688e-06, "loss": 0.6275, "step": 838 }, { "epoch": 1.598095238095238, "grad_norm": 2.674098077658351, "learning_rate": 4.717263542528168e-06, "loss": 0.738, "step": 839 }, { "epoch": 1.6, "grad_norm": 2.9715429623860703, "learning_rate": 4.706993794973355e-06, "loss": 0.6091, "step": 840 }, { "epoch": 1.601904761904762, "grad_norm": 2.847112291191056, "learning_rate": 4.696725287637416e-06, "loss": 0.6481, "step": 841 }, { "epoch": 1.6038095238095238, "grad_norm": 2.42127343761098, "learning_rate": 4.686458063984262e-06, "loss": 0.6373, "step": 842 }, { "epoch": 1.6057142857142859, "grad_norm": 3.289201855822001, "learning_rate": 4.676192167472377e-06, "loss": 0.6502, "step": 843 }, { "epoch": 1.6076190476190475, "grad_norm": 3.3588879451203244, "learning_rate": 4.665927641554622e-06, "loss": 0.7436, "step": 844 }, { "epoch": 1.6095238095238096, "grad_norm": 3.402550196707435, "learning_rate": 4.65566452967806e-06, "loss": 0.5958, "step": 845 }, { "epoch": 1.6114285714285714, "grad_norm": 3.0956646090286575, "learning_rate": 4.645402875283766e-06, "loss": 0.6677, "step": 846 }, { "epoch": 1.6133333333333333, "grad_norm": 2.7562886994563383, "learning_rate": 4.635142721806648e-06, "loss": 0.7779, "step": 847 }, { "epoch": 1.6152380952380954, "grad_norm": 2.0483098437098137, "learning_rate": 4.624884112675264e-06, "loss": 0.7145, "step": 848 }, { "epoch": 1.617142857142857, "grad_norm": 3.175126134568553, "learning_rate": 4.61462709131163e-06, "loss": 0.7021, "step": 849 }, { "epoch": 1.619047619047619, "grad_norm": 3.104049491157547, "learning_rate": 4.604371701131042e-06, "loss": 0.6795, "step": 850 }, { "epoch": 1.620952380952381, "grad_norm": 3.0202728359535747, "learning_rate": 4.594117985541894e-06, "loss": 0.6878, "step": 851 }, { "epoch": 1.6228571428571428, "grad_norm": 4.1585216294967875, "learning_rate": 4.5838659879454885e-06, "loss": 0.6134, "step": 852 }, { "epoch": 1.6247619047619049, "grad_norm": 2.4888343626712275, "learning_rate": 4.5736157517358605e-06, "loss": 0.5989, "step": 853 }, { "epoch": 1.6266666666666667, "grad_norm": 2.71491706666814, "learning_rate": 4.563367320299587e-06, "loss": 0.6745, "step": 854 }, { "epoch": 1.6285714285714286, "grad_norm": 3.4916761720558505, "learning_rate": 4.553120737015606e-06, "loss": 0.6381, "step": 855 }, { "epoch": 1.6304761904761906, "grad_norm": 2.8324096551846454, "learning_rate": 4.542876045255034e-06, "loss": 0.6739, "step": 856 }, { "epoch": 1.6323809523809523, "grad_norm": 2.2204855147124754, "learning_rate": 4.5326332883809786e-06, "loss": 0.6818, "step": 857 }, { "epoch": 1.6342857142857143, "grad_norm": 2.9580703323692688, "learning_rate": 4.52239250974836e-06, "loss": 0.5558, "step": 858 }, { "epoch": 1.6361904761904762, "grad_norm": 2.5163725808744504, "learning_rate": 4.512153752703724e-06, "loss": 0.7532, "step": 859 }, { "epoch": 1.638095238095238, "grad_norm": 2.7245795479033994, "learning_rate": 4.50191706058506e-06, "loss": 0.5273, "step": 860 }, { "epoch": 1.6400000000000001, "grad_norm": 2.837670571626123, "learning_rate": 4.491682476721614e-06, "loss": 0.6603, "step": 861 }, { "epoch": 1.6419047619047618, "grad_norm": 2.9029427498443545, "learning_rate": 4.481450044433716e-06, "loss": 0.702, "step": 862 }, { "epoch": 1.6438095238095238, "grad_norm": 3.6327705075800547, "learning_rate": 4.471219807032582e-06, "loss": 0.7007, "step": 863 }, { "epoch": 1.6457142857142857, "grad_norm": 2.9520614396209863, "learning_rate": 4.4609918078201386e-06, "loss": 0.6278, "step": 864 }, { "epoch": 1.6476190476190475, "grad_norm": 2.9406888144314323, "learning_rate": 4.4507660900888405e-06, "loss": 0.7473, "step": 865 }, { "epoch": 1.6495238095238096, "grad_norm": 2.589423462875807, "learning_rate": 4.440542697121485e-06, "loss": 0.6206, "step": 866 }, { "epoch": 1.6514285714285715, "grad_norm": 3.7177152584023814, "learning_rate": 4.430321672191028e-06, "loss": 0.6792, "step": 867 }, { "epoch": 1.6533333333333333, "grad_norm": 3.5635519844294645, "learning_rate": 4.420103058560405e-06, "loss": 0.792, "step": 868 }, { "epoch": 1.6552380952380954, "grad_norm": 2.5931410583648846, "learning_rate": 4.409886899482341e-06, "loss": 0.6266, "step": 869 }, { "epoch": 1.657142857142857, "grad_norm": 3.2482541957748174, "learning_rate": 4.399673238199176e-06, "loss": 0.6542, "step": 870 }, { "epoch": 1.659047619047619, "grad_norm": 2.7941492631087548, "learning_rate": 4.389462117942673e-06, "loss": 0.5911, "step": 871 }, { "epoch": 1.660952380952381, "grad_norm": 3.198917735948789, "learning_rate": 4.379253581933844e-06, "loss": 0.7099, "step": 872 }, { "epoch": 1.6628571428571428, "grad_norm": 3.5179040731056395, "learning_rate": 4.369047673382759e-06, "loss": 0.6747, "step": 873 }, { "epoch": 1.6647619047619049, "grad_norm": 2.4504790181751965, "learning_rate": 4.358844435488367e-06, "loss": 0.5693, "step": 874 }, { "epoch": 1.6666666666666665, "grad_norm": 2.9500759019933214, "learning_rate": 4.3486439114383174e-06, "loss": 0.6927, "step": 875 }, { "epoch": 1.6685714285714286, "grad_norm": 2.935824994705467, "learning_rate": 4.338446144408766e-06, "loss": 0.6949, "step": 876 }, { "epoch": 1.6704761904761904, "grad_norm": 2.581849917285868, "learning_rate": 4.328251177564202e-06, "loss": 0.8297, "step": 877 }, { "epoch": 1.6723809523809523, "grad_norm": 3.649626741114842, "learning_rate": 4.318059054057263e-06, "loss": 0.7397, "step": 878 }, { "epoch": 1.6742857142857144, "grad_norm": 3.5574483603650466, "learning_rate": 4.307869817028549e-06, "loss": 0.6174, "step": 879 }, { "epoch": 1.6761904761904762, "grad_norm": 3.278158882599443, "learning_rate": 4.297683509606446e-06, "loss": 0.6851, "step": 880 }, { "epoch": 1.678095238095238, "grad_norm": 3.539550696262606, "learning_rate": 4.287500174906935e-06, "loss": 0.8195, "step": 881 }, { "epoch": 1.6800000000000002, "grad_norm": 3.3932565080563886, "learning_rate": 4.2773198560334186e-06, "loss": 0.7482, "step": 882 }, { "epoch": 1.6819047619047618, "grad_norm": 2.7348422029401833, "learning_rate": 4.267142596076532e-06, "loss": 0.4363, "step": 883 }, { "epoch": 1.6838095238095239, "grad_norm": 2.6544372954428352, "learning_rate": 4.256968438113962e-06, "loss": 0.7329, "step": 884 }, { "epoch": 1.6857142857142857, "grad_norm": 3.4679395965649276, "learning_rate": 4.246797425210268e-06, "loss": 0.7275, "step": 885 }, { "epoch": 1.6876190476190476, "grad_norm": 2.7341568318252447, "learning_rate": 4.236629600416695e-06, "loss": 0.6893, "step": 886 }, { "epoch": 1.6895238095238096, "grad_norm": 2.655452784384076, "learning_rate": 4.226465006770995e-06, "loss": 0.6781, "step": 887 }, { "epoch": 1.6914285714285713, "grad_norm": 3.166798197497907, "learning_rate": 4.216303687297242e-06, "loss": 0.6028, "step": 888 }, { "epoch": 1.6933333333333334, "grad_norm": 2.6579456682906164, "learning_rate": 4.206145685005653e-06, "loss": 0.6088, "step": 889 }, { "epoch": 1.6952380952380952, "grad_norm": 2.978773460797544, "learning_rate": 4.195991042892401e-06, "loss": 0.5404, "step": 890 }, { "epoch": 1.697142857142857, "grad_norm": 3.5451760661992346, "learning_rate": 4.185839803939442e-06, "loss": 0.7008, "step": 891 }, { "epoch": 1.6990476190476191, "grad_norm": 3.219380575339356, "learning_rate": 4.175692011114322e-06, "loss": 0.7501, "step": 892 }, { "epoch": 1.700952380952381, "grad_norm": 7.776458381128656, "learning_rate": 4.1655477073700015e-06, "loss": 0.7529, "step": 893 }, { "epoch": 1.7028571428571428, "grad_norm": 3.3465572345350076, "learning_rate": 4.1554069356446746e-06, "loss": 0.7325, "step": 894 }, { "epoch": 1.704761904761905, "grad_norm": 2.450860718244936, "learning_rate": 4.145269738861583e-06, "loss": 0.581, "step": 895 }, { "epoch": 1.7066666666666666, "grad_norm": 2.7750989329792257, "learning_rate": 4.1351361599288385e-06, "loss": 0.5982, "step": 896 }, { "epoch": 1.7085714285714286, "grad_norm": 2.6798522757105503, "learning_rate": 4.125006241739239e-06, "loss": 0.7034, "step": 897 }, { "epoch": 1.7104761904761905, "grad_norm": 2.973051275506407, "learning_rate": 4.114880027170086e-06, "loss": 0.7019, "step": 898 }, { "epoch": 1.7123809523809523, "grad_norm": 2.764933346748833, "learning_rate": 4.104757559083007e-06, "loss": 0.7546, "step": 899 }, { "epoch": 1.7142857142857144, "grad_norm": 3.026951164354996, "learning_rate": 4.094638880323768e-06, "loss": 0.5869, "step": 900 }, { "epoch": 1.716190476190476, "grad_norm": 3.0933296947341544, "learning_rate": 4.0845240337221005e-06, "loss": 0.639, "step": 901 }, { "epoch": 1.7180952380952381, "grad_norm": 3.186168970973745, "learning_rate": 4.0744130620915105e-06, "loss": 0.6596, "step": 902 }, { "epoch": 1.72, "grad_norm": 3.5668548195343077, "learning_rate": 4.064306008229107e-06, "loss": 0.7901, "step": 903 }, { "epoch": 1.7219047619047618, "grad_norm": 3.2842572628708475, "learning_rate": 4.054202914915414e-06, "loss": 0.6883, "step": 904 }, { "epoch": 1.723809523809524, "grad_norm": 4.67391971111033, "learning_rate": 4.044103824914189e-06, "loss": 0.7509, "step": 905 }, { "epoch": 1.7257142857142858, "grad_norm": 3.5070012137829973, "learning_rate": 4.03400878097225e-06, "loss": 0.7955, "step": 906 }, { "epoch": 1.7276190476190476, "grad_norm": 2.6875978886234226, "learning_rate": 4.023917825819283e-06, "loss": 0.6201, "step": 907 }, { "epoch": 1.7295238095238097, "grad_norm": 3.100726327946747, "learning_rate": 4.013831002167671e-06, "loss": 0.7032, "step": 908 }, { "epoch": 1.7314285714285713, "grad_norm": 3.631607759969553, "learning_rate": 4.00374835271231e-06, "loss": 0.7222, "step": 909 }, { "epoch": 1.7333333333333334, "grad_norm": 2.73818249684235, "learning_rate": 3.993669920130425e-06, "loss": 0.509, "step": 910 }, { "epoch": 1.7352380952380952, "grad_norm": 2.9059609345818407, "learning_rate": 3.983595747081394e-06, "loss": 0.6586, "step": 911 }, { "epoch": 1.737142857142857, "grad_norm": 3.360713371381218, "learning_rate": 3.9735258762065655e-06, "loss": 0.7986, "step": 912 }, { "epoch": 1.7390476190476192, "grad_norm": 3.774601486556661, "learning_rate": 3.963460350129076e-06, "loss": 0.7997, "step": 913 }, { "epoch": 1.7409523809523808, "grad_norm": 2.8127772449564747, "learning_rate": 3.953399211453674e-06, "loss": 0.6729, "step": 914 }, { "epoch": 1.7428571428571429, "grad_norm": 2.4749517512555514, "learning_rate": 3.943342502766536e-06, "loss": 0.6299, "step": 915 }, { "epoch": 1.7447619047619047, "grad_norm": 3.817257079363956, "learning_rate": 3.933290266635088e-06, "loss": 0.6733, "step": 916 }, { "epoch": 1.7466666666666666, "grad_norm": 2.5918427988265598, "learning_rate": 3.9232425456078245e-06, "loss": 0.896, "step": 917 }, { "epoch": 1.7485714285714287, "grad_norm": 2.4734952674166664, "learning_rate": 3.9131993822141305e-06, "loss": 0.6836, "step": 918 }, { "epoch": 1.7504761904761905, "grad_norm": 2.804256649930396, "learning_rate": 3.9031608189640956e-06, "loss": 0.5622, "step": 919 }, { "epoch": 1.7523809523809524, "grad_norm": 2.6027738683533346, "learning_rate": 3.893126898348343e-06, "loss": 0.806, "step": 920 }, { "epoch": 1.7542857142857144, "grad_norm": 3.235966019007431, "learning_rate": 3.8830976628378404e-06, "loss": 0.6003, "step": 921 }, { "epoch": 1.756190476190476, "grad_norm": 3.0276903817709138, "learning_rate": 3.873073154883729e-06, "loss": 0.5287, "step": 922 }, { "epoch": 1.7580952380952382, "grad_norm": 3.4626169524862935, "learning_rate": 3.863053416917136e-06, "loss": 0.539, "step": 923 }, { "epoch": 1.76, "grad_norm": 2.4458581267907276, "learning_rate": 3.853038491349e-06, "loss": 0.6652, "step": 924 }, { "epoch": 1.7619047619047619, "grad_norm": 2.404546447757066, "learning_rate": 3.84302842056989e-06, "loss": 0.6102, "step": 925 }, { "epoch": 1.763809523809524, "grad_norm": 3.764939456234988, "learning_rate": 3.8330232469498266e-06, "loss": 0.6831, "step": 926 }, { "epoch": 1.7657142857142856, "grad_norm": 2.1595889230545358, "learning_rate": 3.823023012838099e-06, "loss": 0.7069, "step": 927 }, { "epoch": 1.7676190476190476, "grad_norm": 3.7038396355014136, "learning_rate": 3.8130277605630928e-06, "loss": 0.6964, "step": 928 }, { "epoch": 1.7695238095238095, "grad_norm": 2.736883383483487, "learning_rate": 3.8030375324321037e-06, "loss": 0.6855, "step": 929 }, { "epoch": 1.7714285714285714, "grad_norm": 3.7513936330334143, "learning_rate": 3.793052370731163e-06, "loss": 0.7708, "step": 930 }, { "epoch": 1.7733333333333334, "grad_norm": 2.8367499337672832, "learning_rate": 3.7830723177248574e-06, "loss": 0.65, "step": 931 }, { "epoch": 1.7752380952380953, "grad_norm": 3.8323012751674588, "learning_rate": 3.7730974156561494e-06, "loss": 0.6391, "step": 932 }, { "epoch": 1.7771428571428571, "grad_norm": 2.7479238941827777, "learning_rate": 3.763127706746198e-06, "loss": 0.6623, "step": 933 }, { "epoch": 1.7790476190476192, "grad_norm": 3.1677388425003787, "learning_rate": 3.7531632331941826e-06, "loss": 0.7287, "step": 934 }, { "epoch": 1.7809523809523808, "grad_norm": 3.3814399371999277, "learning_rate": 3.74320403717712e-06, "loss": 0.6708, "step": 935 }, { "epoch": 1.782857142857143, "grad_norm": 3.596369055589814, "learning_rate": 3.7332501608496917e-06, "loss": 0.6385, "step": 936 }, { "epoch": 1.7847619047619048, "grad_norm": 2.6366411572094997, "learning_rate": 3.7233016463440612e-06, "loss": 0.6257, "step": 937 }, { "epoch": 1.7866666666666666, "grad_norm": 2.836274956577337, "learning_rate": 3.7133585357696954e-06, "loss": 0.5953, "step": 938 }, { "epoch": 1.7885714285714287, "grad_norm": 3.007747276794172, "learning_rate": 3.70342087121319e-06, "loss": 0.8086, "step": 939 }, { "epoch": 1.7904761904761903, "grad_norm": 4.314502666779546, "learning_rate": 3.693488694738089e-06, "loss": 0.8031, "step": 940 }, { "epoch": 1.7923809523809524, "grad_norm": 3.165669381401534, "learning_rate": 3.683562048384703e-06, "loss": 0.6624, "step": 941 }, { "epoch": 1.7942857142857143, "grad_norm": 3.283653265862736, "learning_rate": 3.67364097416994e-06, "loss": 0.6585, "step": 942 }, { "epoch": 1.7961904761904761, "grad_norm": 3.196551561721693, "learning_rate": 3.6637255140871196e-06, "loss": 0.8229, "step": 943 }, { "epoch": 1.7980952380952382, "grad_norm": 3.7514672905574966, "learning_rate": 3.653815710105799e-06, "loss": 0.5775, "step": 944 }, { "epoch": 1.8, "grad_norm": 3.407540122692509, "learning_rate": 3.643911604171596e-06, "loss": 0.6983, "step": 945 }, { "epoch": 1.801904761904762, "grad_norm": 3.2162916453910912, "learning_rate": 3.6340132382060084e-06, "loss": 0.824, "step": 946 }, { "epoch": 1.803809523809524, "grad_norm": 3.2014625669277588, "learning_rate": 3.6241206541062378e-06, "loss": 0.6648, "step": 947 }, { "epoch": 1.8057142857142856, "grad_norm": 4.000883770798457, "learning_rate": 3.6142338937450143e-06, "loss": 0.6828, "step": 948 }, { "epoch": 1.8076190476190477, "grad_norm": 3.0455307406818206, "learning_rate": 3.604352998970416e-06, "loss": 0.8198, "step": 949 }, { "epoch": 1.8095238095238095, "grad_norm": 2.7766421105119043, "learning_rate": 3.5944780116056955e-06, "loss": 0.6788, "step": 950 }, { "epoch": 1.8114285714285714, "grad_norm": 2.1483977793142843, "learning_rate": 3.5846089734490983e-06, "loss": 0.6736, "step": 951 }, { "epoch": 1.8133333333333335, "grad_norm": 2.9116355673970644, "learning_rate": 3.5747459262736905e-06, "loss": 0.6319, "step": 952 }, { "epoch": 1.815238095238095, "grad_norm": 2.292376967765798, "learning_rate": 3.564888911827179e-06, "loss": 0.7351, "step": 953 }, { "epoch": 1.8171428571428572, "grad_norm": 3.7201113111301654, "learning_rate": 3.5550379718317364e-06, "loss": 0.6455, "step": 954 }, { "epoch": 1.819047619047619, "grad_norm": 3.143821334076954, "learning_rate": 3.5451931479838226e-06, "loss": 0.6841, "step": 955 }, { "epoch": 1.8209523809523809, "grad_norm": 2.8356800264594373, "learning_rate": 3.5353544819540107e-06, "loss": 0.6358, "step": 956 }, { "epoch": 1.822857142857143, "grad_norm": 2.455253693710853, "learning_rate": 3.5255220153868093e-06, "loss": 0.6615, "step": 957 }, { "epoch": 1.8247619047619048, "grad_norm": 3.0476554863606578, "learning_rate": 3.515695789900484e-06, "loss": 0.7617, "step": 958 }, { "epoch": 1.8266666666666667, "grad_norm": 3.395291262485949, "learning_rate": 3.505875847086887e-06, "loss": 0.6318, "step": 959 }, { "epoch": 1.8285714285714287, "grad_norm": 2.661352086088814, "learning_rate": 3.4960622285112768e-06, "loss": 0.7293, "step": 960 }, { "epoch": 1.8304761904761904, "grad_norm": 2.7634313031316866, "learning_rate": 3.48625497571214e-06, "loss": 0.683, "step": 961 }, { "epoch": 1.8323809523809524, "grad_norm": 3.4979611469618046, "learning_rate": 3.4764541302010224e-06, "loss": 0.6342, "step": 962 }, { "epoch": 1.8342857142857143, "grad_norm": 3.8328189313722234, "learning_rate": 3.4666597334623463e-06, "loss": 0.8184, "step": 963 }, { "epoch": 1.8361904761904762, "grad_norm": 3.9301064995000257, "learning_rate": 3.4568718269532407e-06, "loss": 0.6645, "step": 964 }, { "epoch": 1.8380952380952382, "grad_norm": 3.2556643627852475, "learning_rate": 3.447090452103361e-06, "loss": 0.6603, "step": 965 }, { "epoch": 1.8399999999999999, "grad_norm": 3.0439338169853785, "learning_rate": 3.4373156503147175e-06, "loss": 0.8471, "step": 966 }, { "epoch": 1.841904761904762, "grad_norm": 2.551604789353069, "learning_rate": 3.4275474629614974e-06, "loss": 0.7113, "step": 967 }, { "epoch": 1.8438095238095238, "grad_norm": 2.8233013118021035, "learning_rate": 3.417785931389891e-06, "loss": 0.8102, "step": 968 }, { "epoch": 1.8457142857142856, "grad_norm": 3.4451629055832087, "learning_rate": 3.408031096917915e-06, "loss": 0.8296, "step": 969 }, { "epoch": 1.8476190476190477, "grad_norm": 2.590423919061846, "learning_rate": 3.3982830008352426e-06, "loss": 0.7947, "step": 970 }, { "epoch": 1.8495238095238096, "grad_norm": 3.2163996343351213, "learning_rate": 3.3885416844030228e-06, "loss": 0.7455, "step": 971 }, { "epoch": 1.8514285714285714, "grad_norm": 3.7606931096655893, "learning_rate": 3.3788071888537046e-06, "loss": 0.7031, "step": 972 }, { "epoch": 1.8533333333333335, "grad_norm": 2.737429417892746, "learning_rate": 3.369079555390874e-06, "loss": 0.6771, "step": 973 }, { "epoch": 1.8552380952380951, "grad_norm": 2.9544444010544204, "learning_rate": 3.3593588251890654e-06, "loss": 0.7257, "step": 974 }, { "epoch": 1.8571428571428572, "grad_norm": 3.705886648160575, "learning_rate": 3.3496450393935964e-06, "loss": 0.6716, "step": 975 }, { "epoch": 1.859047619047619, "grad_norm": 2.66903969876622, "learning_rate": 3.3399382391203883e-06, "loss": 0.6443, "step": 976 }, { "epoch": 1.860952380952381, "grad_norm": 2.44918155965145, "learning_rate": 3.330238465455796e-06, "loss": 0.6651, "step": 977 }, { "epoch": 1.862857142857143, "grad_norm": 3.7074574220776153, "learning_rate": 3.3205457594564337e-06, "loss": 0.7179, "step": 978 }, { "epoch": 1.8647619047619046, "grad_norm": 3.46726522210293, "learning_rate": 3.310860162148998e-06, "loss": 0.6445, "step": 979 }, { "epoch": 1.8666666666666667, "grad_norm": 3.2617819343089125, "learning_rate": 3.301181714530097e-06, "loss": 0.6553, "step": 980 }, { "epoch": 1.8685714285714285, "grad_norm": 3.040505154545407, "learning_rate": 3.2915104575660762e-06, "loss": 0.6299, "step": 981 }, { "epoch": 1.8704761904761904, "grad_norm": 2.3678634776746805, "learning_rate": 3.281846432192844e-06, "loss": 0.6782, "step": 982 }, { "epoch": 1.8723809523809525, "grad_norm": 3.9216230243119417, "learning_rate": 3.272189679315701e-06, "loss": 0.5438, "step": 983 }, { "epoch": 1.8742857142857143, "grad_norm": 3.3827666136105083, "learning_rate": 3.2625402398091656e-06, "loss": 0.6743, "step": 984 }, { "epoch": 1.8761904761904762, "grad_norm": 2.66060656768813, "learning_rate": 3.252898154516797e-06, "loss": 0.7815, "step": 985 }, { "epoch": 1.878095238095238, "grad_norm": 3.614194893411707, "learning_rate": 3.2432634642510262e-06, "loss": 0.6835, "step": 986 }, { "epoch": 1.88, "grad_norm": 3.171225832381452, "learning_rate": 3.233636209792992e-06, "loss": 0.7357, "step": 987 }, { "epoch": 1.881904761904762, "grad_norm": 2.632051058171287, "learning_rate": 3.224016431892347e-06, "loss": 0.6752, "step": 988 }, { "epoch": 1.8838095238095238, "grad_norm": 4.15848478612645, "learning_rate": 3.214404171267106e-06, "loss": 0.7246, "step": 989 }, { "epoch": 1.8857142857142857, "grad_norm": 2.5402689839432124, "learning_rate": 3.20479946860346e-06, "loss": 0.6542, "step": 990 }, { "epoch": 1.8876190476190478, "grad_norm": 2.8793422010297243, "learning_rate": 3.1952023645556126e-06, "loss": 0.5687, "step": 991 }, { "epoch": 1.8895238095238094, "grad_norm": 2.485960071524638, "learning_rate": 3.1856128997456015e-06, "loss": 0.6152, "step": 992 }, { "epoch": 1.8914285714285715, "grad_norm": 2.675451120665098, "learning_rate": 3.176031114763133e-06, "loss": 0.779, "step": 993 }, { "epoch": 1.8933333333333333, "grad_norm": 2.708103339026693, "learning_rate": 3.1664570501654026e-06, "loss": 0.6607, "step": 994 }, { "epoch": 1.8952380952380952, "grad_norm": 4.097481901284391, "learning_rate": 3.1568907464769307e-06, "loss": 0.6029, "step": 995 }, { "epoch": 1.8971428571428572, "grad_norm": 3.1323265468145745, "learning_rate": 3.1473322441893885e-06, "loss": 0.5538, "step": 996 }, { "epoch": 1.899047619047619, "grad_norm": 2.433413823618976, "learning_rate": 3.13778158376142e-06, "loss": 0.6347, "step": 997 }, { "epoch": 1.900952380952381, "grad_norm": 2.4950458606931467, "learning_rate": 3.128238805618483e-06, "loss": 0.6297, "step": 998 }, { "epoch": 1.9028571428571428, "grad_norm": 3.490653170933105, "learning_rate": 3.118703950152667e-06, "loss": 0.8749, "step": 999 }, { "epoch": 1.9047619047619047, "grad_norm": 2.5501564144705413, "learning_rate": 3.1091770577225343e-06, "loss": 0.5847, "step": 1000 }, { "epoch": 1.9066666666666667, "grad_norm": 3.678700551498614, "learning_rate": 3.0996581686529337e-06, "loss": 0.8202, "step": 1001 }, { "epoch": 1.9085714285714286, "grad_norm": 3.0464766067692466, "learning_rate": 3.09014732323484e-06, "loss": 0.815, "step": 1002 }, { "epoch": 1.9104761904761904, "grad_norm": 2.801546039768304, "learning_rate": 3.0806445617251834e-06, "loss": 0.6446, "step": 1003 }, { "epoch": 1.9123809523809525, "grad_norm": 3.45647373721546, "learning_rate": 3.0711499243466757e-06, "loss": 0.7771, "step": 1004 }, { "epoch": 1.9142857142857141, "grad_norm": 2.699534712808293, "learning_rate": 3.061663451287641e-06, "loss": 0.6534, "step": 1005 }, { "epoch": 1.9161904761904762, "grad_norm": 2.7654829962859617, "learning_rate": 3.052185182701847e-06, "loss": 0.5684, "step": 1006 }, { "epoch": 1.918095238095238, "grad_norm": 3.421459596071975, "learning_rate": 3.042715158708332e-06, "loss": 0.6075, "step": 1007 }, { "epoch": 1.92, "grad_norm": 3.5066328280690393, "learning_rate": 3.03325341939124e-06, "loss": 0.5306, "step": 1008 }, { "epoch": 1.921904761904762, "grad_norm": 3.1911134834340853, "learning_rate": 3.0238000047996476e-06, "loss": 0.5241, "step": 1009 }, { "epoch": 1.9238095238095239, "grad_norm": 2.501138139258464, "learning_rate": 3.0143549549473904e-06, "loss": 0.7524, "step": 1010 }, { "epoch": 1.9257142857142857, "grad_norm": 3.647850545631917, "learning_rate": 3.0049183098129048e-06, "loss": 0.658, "step": 1011 }, { "epoch": 1.9276190476190476, "grad_norm": 2.744037371013075, "learning_rate": 2.9954901093390486e-06, "loss": 0.6452, "step": 1012 }, { "epoch": 1.9295238095238094, "grad_norm": 2.8659802053956613, "learning_rate": 2.986070393432934e-06, "loss": 0.7747, "step": 1013 }, { "epoch": 1.9314285714285715, "grad_norm": 2.8519051436590312, "learning_rate": 2.9766592019657666e-06, "loss": 0.6887, "step": 1014 }, { "epoch": 1.9333333333333333, "grad_norm": 2.7105018106037138, "learning_rate": 2.967256574772664e-06, "loss": 0.746, "step": 1015 }, { "epoch": 1.9352380952380952, "grad_norm": 3.5703490064192396, "learning_rate": 2.957862551652496e-06, "loss": 0.7283, "step": 1016 }, { "epoch": 1.9371428571428573, "grad_norm": 3.279438652392567, "learning_rate": 2.948477172367713e-06, "loss": 0.7389, "step": 1017 }, { "epoch": 1.939047619047619, "grad_norm": 2.541490937895215, "learning_rate": 2.9391004766441787e-06, "loss": 0.6688, "step": 1018 }, { "epoch": 1.940952380952381, "grad_norm": 3.616846672630028, "learning_rate": 2.9297325041710014e-06, "loss": 0.7739, "step": 1019 }, { "epoch": 1.9428571428571428, "grad_norm": 2.995718018703318, "learning_rate": 2.920373294600366e-06, "loss": 0.6873, "step": 1020 }, { "epoch": 1.9447619047619047, "grad_norm": 3.611165661455842, "learning_rate": 2.9110228875473674e-06, "loss": 0.6595, "step": 1021 }, { "epoch": 1.9466666666666668, "grad_norm": 2.4968443513012404, "learning_rate": 2.9016813225898376e-06, "loss": 0.7015, "step": 1022 }, { "epoch": 1.9485714285714286, "grad_norm": 3.2779763545269454, "learning_rate": 2.89234863926819e-06, "loss": 0.764, "step": 1023 }, { "epoch": 1.9504761904761905, "grad_norm": 2.5324476225016426, "learning_rate": 2.8830248770852353e-06, "loss": 0.5953, "step": 1024 }, { "epoch": 1.9523809523809523, "grad_norm": 2.7416031392621445, "learning_rate": 2.8737100755060322e-06, "loss": 0.8657, "step": 1025 }, { "epoch": 1.9542857142857142, "grad_norm": 2.4743787009536593, "learning_rate": 2.8644042739577027e-06, "loss": 0.662, "step": 1026 }, { "epoch": 1.9561904761904763, "grad_norm": 2.3127230950622866, "learning_rate": 2.8551075118292815e-06, "loss": 0.4841, "step": 1027 }, { "epoch": 1.958095238095238, "grad_norm": 2.8095548290062053, "learning_rate": 2.84581982847154e-06, "loss": 0.6555, "step": 1028 }, { "epoch": 1.96, "grad_norm": 2.955659645310816, "learning_rate": 2.836541263196817e-06, "loss": 0.6848, "step": 1029 }, { "epoch": 1.961904761904762, "grad_norm": 3.2003943913507573, "learning_rate": 2.8272718552788632e-06, "loss": 0.5073, "step": 1030 }, { "epoch": 1.9638095238095237, "grad_norm": 2.9190579627459377, "learning_rate": 2.818011643952662e-06, "loss": 0.7645, "step": 1031 }, { "epoch": 1.9657142857142857, "grad_norm": 3.519051828501603, "learning_rate": 2.808760668414278e-06, "loss": 0.7284, "step": 1032 }, { "epoch": 1.9676190476190476, "grad_norm": 3.5969210561054075, "learning_rate": 2.7995189678206745e-06, "loss": 0.6626, "step": 1033 }, { "epoch": 1.9695238095238095, "grad_norm": 2.560460669410691, "learning_rate": 2.7902865812895643e-06, "loss": 0.6835, "step": 1034 }, { "epoch": 1.9714285714285715, "grad_norm": 3.279912639171773, "learning_rate": 2.7810635478992288e-06, "loss": 0.559, "step": 1035 }, { "epoch": 1.9733333333333334, "grad_norm": 3.12364393300968, "learning_rate": 2.7718499066883674e-06, "loss": 0.6219, "step": 1036 }, { "epoch": 1.9752380952380952, "grad_norm": 2.522241057158423, "learning_rate": 2.762645696655918e-06, "loss": 0.6325, "step": 1037 }, { "epoch": 1.977142857142857, "grad_norm": 3.139250680280131, "learning_rate": 2.753450956760904e-06, "loss": 0.5558, "step": 1038 }, { "epoch": 1.979047619047619, "grad_norm": 3.6694684757927476, "learning_rate": 2.74426572592226e-06, "loss": 0.737, "step": 1039 }, { "epoch": 1.980952380952381, "grad_norm": 2.344084256099442, "learning_rate": 2.7350900430186765e-06, "loss": 0.5458, "step": 1040 }, { "epoch": 1.9828571428571429, "grad_norm": 2.367817415842074, "learning_rate": 2.7259239468884226e-06, "loss": 0.6367, "step": 1041 }, { "epoch": 1.9847619047619047, "grad_norm": 2.4794917294433096, "learning_rate": 2.716767476329196e-06, "loss": 0.8844, "step": 1042 }, { "epoch": 1.9866666666666668, "grad_norm": 3.35916805510129, "learning_rate": 2.7076206700979513e-06, "loss": 0.7125, "step": 1043 }, { "epoch": 1.9885714285714284, "grad_norm": 2.8537894720704355, "learning_rate": 2.6984835669107306e-06, "loss": 0.6735, "step": 1044 }, { "epoch": 1.9904761904761905, "grad_norm": 3.0004375782493473, "learning_rate": 2.6893562054425128e-06, "loss": 0.802, "step": 1045 }, { "epoch": 1.9923809523809524, "grad_norm": 3.5562355351148645, "learning_rate": 2.680238624327035e-06, "loss": 0.7207, "step": 1046 }, { "epoch": 1.9942857142857142, "grad_norm": 2.883149790302853, "learning_rate": 2.671130862156646e-06, "loss": 0.6391, "step": 1047 }, { "epoch": 1.9961904761904763, "grad_norm": 2.996491262752851, "learning_rate": 2.662032957482124e-06, "loss": 0.7328, "step": 1048 }, { "epoch": 1.9980952380952381, "grad_norm": 3.081309281423561, "learning_rate": 2.6529449488125312e-06, "loss": 0.8187, "step": 1049 }, { "epoch": 2.0, "grad_norm": 2.851740093939044, "learning_rate": 2.6438668746150354e-06, "loss": 0.5597, "step": 1050 }, { "epoch": 2.001904761904762, "grad_norm": 3.637068489397177, "learning_rate": 2.6347987733147607e-06, "loss": 0.385, "step": 1051 }, { "epoch": 2.0038095238095237, "grad_norm": 3.2375252146572073, "learning_rate": 2.625740683294613e-06, "loss": 0.3336, "step": 1052 }, { "epoch": 2.005714285714286, "grad_norm": 3.4134579978259496, "learning_rate": 2.616692642895129e-06, "loss": 0.4053, "step": 1053 }, { "epoch": 2.0076190476190474, "grad_norm": 3.199313958815869, "learning_rate": 2.6076546904143005e-06, "loss": 0.3686, "step": 1054 }, { "epoch": 2.0095238095238095, "grad_norm": 2.844795300700537, "learning_rate": 2.5986268641074263e-06, "loss": 0.5282, "step": 1055 }, { "epoch": 2.0114285714285716, "grad_norm": 4.06889303615382, "learning_rate": 2.589609202186943e-06, "loss": 0.4347, "step": 1056 }, { "epoch": 2.013333333333333, "grad_norm": 4.07097786992572, "learning_rate": 2.5806017428222586e-06, "loss": 0.4167, "step": 1057 }, { "epoch": 2.0152380952380953, "grad_norm": 5.0547059965679635, "learning_rate": 2.571604524139604e-06, "loss": 0.439, "step": 1058 }, { "epoch": 2.0171428571428573, "grad_norm": 3.892356662771511, "learning_rate": 2.562617584221857e-06, "loss": 0.3594, "step": 1059 }, { "epoch": 2.019047619047619, "grad_norm": 4.006910134325121, "learning_rate": 2.5536409611083938e-06, "loss": 0.4314, "step": 1060 }, { "epoch": 2.020952380952381, "grad_norm": 3.2198974596850913, "learning_rate": 2.5446746927949168e-06, "loss": 0.3748, "step": 1061 }, { "epoch": 2.0228571428571427, "grad_norm": 3.0382467482233215, "learning_rate": 2.5357188172333047e-06, "loss": 0.3801, "step": 1062 }, { "epoch": 2.0247619047619048, "grad_norm": 3.0279668403638955, "learning_rate": 2.5267733723314415e-06, "loss": 0.4278, "step": 1063 }, { "epoch": 2.026666666666667, "grad_norm": 3.666895705374802, "learning_rate": 2.517838395953066e-06, "loss": 0.3257, "step": 1064 }, { "epoch": 2.0285714285714285, "grad_norm": 2.882130718796121, "learning_rate": 2.5089139259175995e-06, "loss": 0.3724, "step": 1065 }, { "epoch": 2.0304761904761905, "grad_norm": 3.885836296926442, "learning_rate": 2.5000000000000015e-06, "loss": 0.3703, "step": 1066 }, { "epoch": 2.032380952380952, "grad_norm": 3.3787482844896823, "learning_rate": 2.4910966559305915e-06, "loss": 0.4495, "step": 1067 }, { "epoch": 2.0342857142857143, "grad_norm": 2.6043378408223106, "learning_rate": 2.4822039313949085e-06, "loss": 0.4384, "step": 1068 }, { "epoch": 2.0361904761904763, "grad_norm": 3.4165299550075163, "learning_rate": 2.4733218640335326e-06, "loss": 0.461, "step": 1069 }, { "epoch": 2.038095238095238, "grad_norm": 3.7409614105375035, "learning_rate": 2.4644504914419415e-06, "loss": 0.4579, "step": 1070 }, { "epoch": 2.04, "grad_norm": 2.7901050347361758, "learning_rate": 2.4555898511703442e-06, "loss": 0.3874, "step": 1071 }, { "epoch": 2.041904761904762, "grad_norm": 3.551625845594265, "learning_rate": 2.446739980723516e-06, "loss": 0.3443, "step": 1072 }, { "epoch": 2.0438095238095237, "grad_norm": 3.634716148388194, "learning_rate": 2.437900917560656e-06, "loss": 0.4921, "step": 1073 }, { "epoch": 2.045714285714286, "grad_norm": 3.9765704091420613, "learning_rate": 2.4290726990952102e-06, "loss": 0.4454, "step": 1074 }, { "epoch": 2.0476190476190474, "grad_norm": 4.402686833130976, "learning_rate": 2.4202553626947284e-06, "loss": 0.4732, "step": 1075 }, { "epoch": 2.0495238095238095, "grad_norm": 4.361861112667196, "learning_rate": 2.4114489456806946e-06, "loss": 0.4168, "step": 1076 }, { "epoch": 2.0514285714285716, "grad_norm": 2.929300835921894, "learning_rate": 2.4026534853283778e-06, "loss": 0.3405, "step": 1077 }, { "epoch": 2.0533333333333332, "grad_norm": 2.991802014109771, "learning_rate": 2.3938690188666664e-06, "loss": 0.344, "step": 1078 }, { "epoch": 2.0552380952380953, "grad_norm": 2.6690057769651254, "learning_rate": 2.3850955834779193e-06, "loss": 0.3568, "step": 1079 }, { "epoch": 2.057142857142857, "grad_norm": 2.578287566633522, "learning_rate": 2.376333216297798e-06, "loss": 0.3228, "step": 1080 }, { "epoch": 2.059047619047619, "grad_norm": 3.8783638362187647, "learning_rate": 2.367581954415122e-06, "loss": 0.3392, "step": 1081 }, { "epoch": 2.060952380952381, "grad_norm": 3.6678426369720216, "learning_rate": 2.3588418348716987e-06, "loss": 0.4087, "step": 1082 }, { "epoch": 2.0628571428571427, "grad_norm": 2.946493462555248, "learning_rate": 2.3501128946621715e-06, "loss": 0.466, "step": 1083 }, { "epoch": 2.064761904761905, "grad_norm": 2.6534360876858525, "learning_rate": 2.3413951707338767e-06, "loss": 0.3933, "step": 1084 }, { "epoch": 2.066666666666667, "grad_norm": 2.834606080607265, "learning_rate": 2.3326886999866603e-06, "loss": 0.347, "step": 1085 }, { "epoch": 2.0685714285714285, "grad_norm": 2.686933628880203, "learning_rate": 2.323993519272748e-06, "loss": 0.3219, "step": 1086 }, { "epoch": 2.0704761904761906, "grad_norm": 3.4857052478354826, "learning_rate": 2.315309665396568e-06, "loss": 0.4491, "step": 1087 }, { "epoch": 2.072380952380952, "grad_norm": 5.058133692422829, "learning_rate": 2.3066371751146132e-06, "loss": 0.3538, "step": 1088 }, { "epoch": 2.0742857142857143, "grad_norm": 2.4866568973729075, "learning_rate": 2.297976085135271e-06, "loss": 0.2771, "step": 1089 }, { "epoch": 2.0761904761904764, "grad_norm": 4.351501615409117, "learning_rate": 2.2893264321186803e-06, "loss": 0.4655, "step": 1090 }, { "epoch": 2.078095238095238, "grad_norm": 4.257936336090124, "learning_rate": 2.2806882526765635e-06, "loss": 0.6581, "step": 1091 }, { "epoch": 2.08, "grad_norm": 3.0426623147432075, "learning_rate": 2.2720615833720862e-06, "loss": 0.3481, "step": 1092 }, { "epoch": 2.0819047619047617, "grad_norm": 2.693294807397359, "learning_rate": 2.2634464607196855e-06, "loss": 0.3785, "step": 1093 }, { "epoch": 2.083809523809524, "grad_norm": 2.6583578382142274, "learning_rate": 2.2548429211849337e-06, "loss": 0.2755, "step": 1094 }, { "epoch": 2.085714285714286, "grad_norm": 2.9621505212145784, "learning_rate": 2.246251001184369e-06, "loss": 0.3734, "step": 1095 }, { "epoch": 2.0876190476190475, "grad_norm": 3.66829937704388, "learning_rate": 2.2376707370853473e-06, "loss": 0.4639, "step": 1096 }, { "epoch": 2.0895238095238096, "grad_norm": 3.8231149601138017, "learning_rate": 2.2291021652058897e-06, "loss": 0.4128, "step": 1097 }, { "epoch": 2.0914285714285716, "grad_norm": 2.7921788181447202, "learning_rate": 2.2205453218145286e-06, "loss": 0.3763, "step": 1098 }, { "epoch": 2.0933333333333333, "grad_norm": 3.9865751397877496, "learning_rate": 2.212000243130151e-06, "loss": 0.3469, "step": 1099 }, { "epoch": 2.0952380952380953, "grad_norm": 3.0954143664768914, "learning_rate": 2.203466965321846e-06, "loss": 0.4177, "step": 1100 }, { "epoch": 2.097142857142857, "grad_norm": 3.7135269645904363, "learning_rate": 2.1949455245087554e-06, "loss": 0.4683, "step": 1101 }, { "epoch": 2.099047619047619, "grad_norm": 3.957808295416001, "learning_rate": 2.186435956759913e-06, "loss": 0.5077, "step": 1102 }, { "epoch": 2.100952380952381, "grad_norm": 3.5007930514656644, "learning_rate": 2.1779382980941042e-06, "loss": 0.458, "step": 1103 }, { "epoch": 2.1028571428571428, "grad_norm": 2.9406873637767132, "learning_rate": 2.1694525844797e-06, "loss": 0.4427, "step": 1104 }, { "epoch": 2.104761904761905, "grad_norm": 3.9966184949988306, "learning_rate": 2.160978851834516e-06, "loss": 0.4219, "step": 1105 }, { "epoch": 2.1066666666666665, "grad_norm": 2.514429690188998, "learning_rate": 2.15251713602565e-06, "loss": 0.2744, "step": 1106 }, { "epoch": 2.1085714285714285, "grad_norm": 3.602114295244378, "learning_rate": 2.144067472869342e-06, "loss": 0.5359, "step": 1107 }, { "epoch": 2.1104761904761906, "grad_norm": 3.7462220837495566, "learning_rate": 2.1356298981308126e-06, "loss": 0.4043, "step": 1108 }, { "epoch": 2.1123809523809522, "grad_norm": 3.9329481396503154, "learning_rate": 2.1272044475241134e-06, "loss": 0.5052, "step": 1109 }, { "epoch": 2.1142857142857143, "grad_norm": 2.3815082299779666, "learning_rate": 2.1187911567119812e-06, "loss": 0.3828, "step": 1110 }, { "epoch": 2.1161904761904764, "grad_norm": 2.84877978554825, "learning_rate": 2.110390061305683e-06, "loss": 0.401, "step": 1111 }, { "epoch": 2.118095238095238, "grad_norm": 4.320354472675589, "learning_rate": 2.1020011968648673e-06, "loss": 0.4373, "step": 1112 }, { "epoch": 2.12, "grad_norm": 2.701440585490528, "learning_rate": 2.0936245988974062e-06, "loss": 0.4058, "step": 1113 }, { "epoch": 2.1219047619047617, "grad_norm": 3.3798638958857636, "learning_rate": 2.085260302859258e-06, "loss": 0.5092, "step": 1114 }, { "epoch": 2.123809523809524, "grad_norm": 3.130502295225098, "learning_rate": 2.0769083441543046e-06, "loss": 0.4058, "step": 1115 }, { "epoch": 2.125714285714286, "grad_norm": 3.090050846066777, "learning_rate": 2.0685687581342127e-06, "loss": 0.3739, "step": 1116 }, { "epoch": 2.1276190476190475, "grad_norm": 3.2830486598830486, "learning_rate": 2.060241580098272e-06, "loss": 0.4216, "step": 1117 }, { "epoch": 2.1295238095238096, "grad_norm": 3.3343154112824767, "learning_rate": 2.05192684529326e-06, "loss": 0.3602, "step": 1118 }, { "epoch": 2.1314285714285712, "grad_norm": 3.5671379639458602, "learning_rate": 2.0436245889132765e-06, "loss": 0.4054, "step": 1119 }, { "epoch": 2.1333333333333333, "grad_norm": 3.333639073382267, "learning_rate": 2.035334846099613e-06, "loss": 0.4402, "step": 1120 }, { "epoch": 2.1352380952380954, "grad_norm": 3.3805831725285542, "learning_rate": 2.027057651940587e-06, "loss": 0.3703, "step": 1121 }, { "epoch": 2.137142857142857, "grad_norm": 3.03027582099334, "learning_rate": 2.0187930414714023e-06, "loss": 0.5583, "step": 1122 }, { "epoch": 2.139047619047619, "grad_norm": 2.7024682625681673, "learning_rate": 2.010541049674003e-06, "loss": 0.4717, "step": 1123 }, { "epoch": 2.140952380952381, "grad_norm": 3.2578378613917622, "learning_rate": 2.0023017114769133e-06, "loss": 0.4811, "step": 1124 }, { "epoch": 2.142857142857143, "grad_norm": 3.3474506754750997, "learning_rate": 1.994075061755112e-06, "loss": 0.4186, "step": 1125 }, { "epoch": 2.144761904761905, "grad_norm": 4.22012112067756, "learning_rate": 1.9858611353298563e-06, "loss": 0.4412, "step": 1126 }, { "epoch": 2.1466666666666665, "grad_norm": 2.4212155137342624, "learning_rate": 1.977659966968558e-06, "loss": 0.4716, "step": 1127 }, { "epoch": 2.1485714285714286, "grad_norm": 4.64405232980952, "learning_rate": 1.9694715913846228e-06, "loss": 0.3662, "step": 1128 }, { "epoch": 2.1504761904761907, "grad_norm": 2.7658128060068705, "learning_rate": 1.961296043237312e-06, "loss": 0.4806, "step": 1129 }, { "epoch": 2.1523809523809523, "grad_norm": 3.540213415127392, "learning_rate": 1.953133357131586e-06, "loss": 0.5283, "step": 1130 }, { "epoch": 2.1542857142857144, "grad_norm": 2.969408319601373, "learning_rate": 1.94498356761797e-06, "loss": 0.3982, "step": 1131 }, { "epoch": 2.156190476190476, "grad_norm": 2.872092505122507, "learning_rate": 1.9368467091923978e-06, "loss": 0.3288, "step": 1132 }, { "epoch": 2.158095238095238, "grad_norm": 3.4439979488246877, "learning_rate": 1.928722816296066e-06, "loss": 0.381, "step": 1133 }, { "epoch": 2.16, "grad_norm": 3.4309509220122645, "learning_rate": 1.9206119233152996e-06, "loss": 0.3921, "step": 1134 }, { "epoch": 2.1619047619047618, "grad_norm": 3.1835470436040088, "learning_rate": 1.9125140645813904e-06, "loss": 0.372, "step": 1135 }, { "epoch": 2.163809523809524, "grad_norm": 2.3932633149660867, "learning_rate": 1.9044292743704673e-06, "loss": 0.3702, "step": 1136 }, { "epoch": 2.1657142857142855, "grad_norm": 3.1030703304533698, "learning_rate": 1.8963575869033346e-06, "loss": 0.3868, "step": 1137 }, { "epoch": 2.1676190476190476, "grad_norm": 2.521589719276709, "learning_rate": 1.888299036345343e-06, "loss": 0.3888, "step": 1138 }, { "epoch": 2.1695238095238096, "grad_norm": 4.825811235043827, "learning_rate": 1.8802536568062363e-06, "loss": 0.3248, "step": 1139 }, { "epoch": 2.1714285714285713, "grad_norm": 3.3612064619949007, "learning_rate": 1.8722214823400103e-06, "loss": 0.4765, "step": 1140 }, { "epoch": 2.1733333333333333, "grad_norm": 2.886250636366377, "learning_rate": 1.864202546944761e-06, "loss": 0.2861, "step": 1141 }, { "epoch": 2.1752380952380954, "grad_norm": 2.2837320689744387, "learning_rate": 1.8561968845625555e-06, "loss": 0.428, "step": 1142 }, { "epoch": 2.177142857142857, "grad_norm": 2.8346529566093013, "learning_rate": 1.848204529079272e-06, "loss": 0.4237, "step": 1143 }, { "epoch": 2.179047619047619, "grad_norm": 3.219090745045557, "learning_rate": 1.8402255143244701e-06, "loss": 0.3996, "step": 1144 }, { "epoch": 2.1809523809523808, "grad_norm": 3.192895426645066, "learning_rate": 1.8322598740712384e-06, "loss": 0.512, "step": 1145 }, { "epoch": 2.182857142857143, "grad_norm": 3.629443687291866, "learning_rate": 1.8243076420360523e-06, "loss": 0.4467, "step": 1146 }, { "epoch": 2.184761904761905, "grad_norm": 3.625294884298241, "learning_rate": 1.816368851878641e-06, "loss": 0.3876, "step": 1147 }, { "epoch": 2.1866666666666665, "grad_norm": 4.019880271073956, "learning_rate": 1.8084435372018295e-06, "loss": 0.4984, "step": 1148 }, { "epoch": 2.1885714285714286, "grad_norm": 2.6062029177907773, "learning_rate": 1.8005317315514114e-06, "loss": 0.3756, "step": 1149 }, { "epoch": 2.1904761904761907, "grad_norm": 2.566241646404076, "learning_rate": 1.792633468415995e-06, "loss": 0.2642, "step": 1150 }, { "epoch": 2.1923809523809523, "grad_norm": 3.4474740159911166, "learning_rate": 1.7847487812268721e-06, "loss": 0.4034, "step": 1151 }, { "epoch": 2.1942857142857144, "grad_norm": 2.670620051299626, "learning_rate": 1.776877703357862e-06, "loss": 0.3992, "step": 1152 }, { "epoch": 2.196190476190476, "grad_norm": 3.2849299047974, "learning_rate": 1.7690202681251927e-06, "loss": 0.4459, "step": 1153 }, { "epoch": 2.198095238095238, "grad_norm": 3.66880510649691, "learning_rate": 1.7611765087873333e-06, "loss": 0.4983, "step": 1154 }, { "epoch": 2.2, "grad_norm": 2.2016559419107278, "learning_rate": 1.7533464585448755e-06, "loss": 0.3522, "step": 1155 }, { "epoch": 2.201904761904762, "grad_norm": 3.415633656263071, "learning_rate": 1.7455301505403771e-06, "loss": 0.3607, "step": 1156 }, { "epoch": 2.203809523809524, "grad_norm": 3.511986918826307, "learning_rate": 1.7377276178582358e-06, "loss": 0.3274, "step": 1157 }, { "epoch": 2.2057142857142855, "grad_norm": 3.0218179740679294, "learning_rate": 1.7299388935245358e-06, "loss": 0.3865, "step": 1158 }, { "epoch": 2.2076190476190476, "grad_norm": 3.453936345865144, "learning_rate": 1.7221640105069154e-06, "loss": 0.3142, "step": 1159 }, { "epoch": 2.2095238095238097, "grad_norm": 3.2074621573328543, "learning_rate": 1.7144030017144304e-06, "loss": 0.471, "step": 1160 }, { "epoch": 2.2114285714285713, "grad_norm": 3.661789164085878, "learning_rate": 1.7066558999974043e-06, "loss": 0.4124, "step": 1161 }, { "epoch": 2.2133333333333334, "grad_norm": 3.0883965500037367, "learning_rate": 1.6989227381473016e-06, "loss": 0.3642, "step": 1162 }, { "epoch": 2.215238095238095, "grad_norm": 3.4525578562259582, "learning_rate": 1.6912035488965778e-06, "loss": 0.3326, "step": 1163 }, { "epoch": 2.217142857142857, "grad_norm": 2.930457124153742, "learning_rate": 1.683498364918551e-06, "loss": 0.3066, "step": 1164 }, { "epoch": 2.219047619047619, "grad_norm": 2.5723752034293614, "learning_rate": 1.6758072188272529e-06, "loss": 0.3661, "step": 1165 }, { "epoch": 2.220952380952381, "grad_norm": 4.0659896157792765, "learning_rate": 1.668130143177301e-06, "loss": 0.3379, "step": 1166 }, { "epoch": 2.222857142857143, "grad_norm": 3.737920467163103, "learning_rate": 1.6604671704637564e-06, "loss": 0.3717, "step": 1167 }, { "epoch": 2.224761904761905, "grad_norm": 3.035081661043054, "learning_rate": 1.6528183331219844e-06, "loss": 0.3519, "step": 1168 }, { "epoch": 2.2266666666666666, "grad_norm": 2.841489894580811, "learning_rate": 1.645183663527517e-06, "loss": 0.3696, "step": 1169 }, { "epoch": 2.2285714285714286, "grad_norm": 2.7676880519534763, "learning_rate": 1.6375631939959235e-06, "loss": 0.2515, "step": 1170 }, { "epoch": 2.2304761904761903, "grad_norm": 3.6765967083017186, "learning_rate": 1.6299569567826622e-06, "loss": 0.4866, "step": 1171 }, { "epoch": 2.2323809523809524, "grad_norm": 3.149916698208449, "learning_rate": 1.622364984082951e-06, "loss": 0.403, "step": 1172 }, { "epoch": 2.2342857142857144, "grad_norm": 2.9585551383344866, "learning_rate": 1.614787308031634e-06, "loss": 0.3332, "step": 1173 }, { "epoch": 2.236190476190476, "grad_norm": 2.7192716574200326, "learning_rate": 1.607223960703035e-06, "loss": 0.3447, "step": 1174 }, { "epoch": 2.238095238095238, "grad_norm": 2.682523524597222, "learning_rate": 1.5996749741108347e-06, "loss": 0.3914, "step": 1175 }, { "epoch": 2.24, "grad_norm": 3.8753179641552764, "learning_rate": 1.5921403802079215e-06, "loss": 0.3362, "step": 1176 }, { "epoch": 2.241904761904762, "grad_norm": 3.1454625399575704, "learning_rate": 1.5846202108862708e-06, "loss": 0.4496, "step": 1177 }, { "epoch": 2.243809523809524, "grad_norm": 2.614532609803121, "learning_rate": 1.5771144979767949e-06, "loss": 0.3388, "step": 1178 }, { "epoch": 2.2457142857142856, "grad_norm": 2.6261149744473937, "learning_rate": 1.5696232732492227e-06, "loss": 0.4328, "step": 1179 }, { "epoch": 2.2476190476190476, "grad_norm": 2.670501922851883, "learning_rate": 1.5621465684119513e-06, "loss": 0.43, "step": 1180 }, { "epoch": 2.2495238095238097, "grad_norm": 2.7287552818929766, "learning_rate": 1.5546844151119289e-06, "loss": 0.4136, "step": 1181 }, { "epoch": 2.2514285714285713, "grad_norm": 2.9070167886277685, "learning_rate": 1.547236844934501e-06, "loss": 0.4597, "step": 1182 }, { "epoch": 2.2533333333333334, "grad_norm": 3.283685902232836, "learning_rate": 1.5398038894032903e-06, "loss": 0.4166, "step": 1183 }, { "epoch": 2.255238095238095, "grad_norm": 4.230049290604123, "learning_rate": 1.5323855799800614e-06, "loss": 0.3438, "step": 1184 }, { "epoch": 2.257142857142857, "grad_norm": 3.406372929627461, "learning_rate": 1.5249819480645811e-06, "loss": 0.3863, "step": 1185 }, { "epoch": 2.259047619047619, "grad_norm": 4.1159605855140855, "learning_rate": 1.517593024994497e-06, "loss": 0.3358, "step": 1186 }, { "epoch": 2.260952380952381, "grad_norm": 3.051894376903517, "learning_rate": 1.5102188420451898e-06, "loss": 0.373, "step": 1187 }, { "epoch": 2.262857142857143, "grad_norm": 3.4516579564846133, "learning_rate": 1.5028594304296573e-06, "loss": 0.4894, "step": 1188 }, { "epoch": 2.2647619047619045, "grad_norm": 3.2107967908714405, "learning_rate": 1.495514821298366e-06, "loss": 0.5129, "step": 1189 }, { "epoch": 2.2666666666666666, "grad_norm": 2.338776131432317, "learning_rate": 1.488185045739135e-06, "loss": 0.3153, "step": 1190 }, { "epoch": 2.2685714285714287, "grad_norm": 2.7746390696586265, "learning_rate": 1.4808701347769905e-06, "loss": 0.3746, "step": 1191 }, { "epoch": 2.2704761904761903, "grad_norm": 3.130401139553232, "learning_rate": 1.4735701193740465e-06, "loss": 0.3716, "step": 1192 }, { "epoch": 2.2723809523809524, "grad_norm": 3.6994407092758466, "learning_rate": 1.4662850304293607e-06, "loss": 0.394, "step": 1193 }, { "epoch": 2.2742857142857145, "grad_norm": 5.126454460234536, "learning_rate": 1.4590148987788184e-06, "loss": 0.4805, "step": 1194 }, { "epoch": 2.276190476190476, "grad_norm": 3.0360236806265193, "learning_rate": 1.4517597551949913e-06, "loss": 0.3368, "step": 1195 }, { "epoch": 2.278095238095238, "grad_norm": 3.6170511722106684, "learning_rate": 1.444519630387009e-06, "loss": 0.3136, "step": 1196 }, { "epoch": 2.2800000000000002, "grad_norm": 2.300799446548077, "learning_rate": 1.4372945550004341e-06, "loss": 0.3717, "step": 1197 }, { "epoch": 2.281904761904762, "grad_norm": 3.572842146297453, "learning_rate": 1.4300845596171248e-06, "loss": 0.3624, "step": 1198 }, { "epoch": 2.283809523809524, "grad_norm": 2.952165089843252, "learning_rate": 1.4228896747551136e-06, "loss": 0.3673, "step": 1199 }, { "epoch": 2.2857142857142856, "grad_norm": 3.0322786600825014, "learning_rate": 1.4157099308684702e-06, "loss": 0.3715, "step": 1200 }, { "epoch": 2.2876190476190477, "grad_norm": 3.882708101241063, "learning_rate": 1.4085453583471814e-06, "loss": 0.4364, "step": 1201 }, { "epoch": 2.2895238095238097, "grad_norm": 3.196731937051594, "learning_rate": 1.401395987517012e-06, "loss": 0.3987, "step": 1202 }, { "epoch": 2.2914285714285714, "grad_norm": 3.385773170082968, "learning_rate": 1.3942618486393867e-06, "loss": 0.4438, "step": 1203 }, { "epoch": 2.2933333333333334, "grad_norm": 4.0973836344009245, "learning_rate": 1.3871429719112528e-06, "loss": 0.3532, "step": 1204 }, { "epoch": 2.295238095238095, "grad_norm": 3.219180576999784, "learning_rate": 1.380039387464963e-06, "loss": 0.2788, "step": 1205 }, { "epoch": 2.297142857142857, "grad_norm": 3.0657965191326544, "learning_rate": 1.3729511253681344e-06, "loss": 0.3791, "step": 1206 }, { "epoch": 2.2990476190476192, "grad_norm": 2.464035187586816, "learning_rate": 1.365878215623536e-06, "loss": 0.5005, "step": 1207 }, { "epoch": 2.300952380952381, "grad_norm": 2.5835905340795855, "learning_rate": 1.3588206881689476e-06, "loss": 0.3999, "step": 1208 }, { "epoch": 2.302857142857143, "grad_norm": 3.6957918375657735, "learning_rate": 1.3517785728770432e-06, "loss": 0.3608, "step": 1209 }, { "epoch": 2.3047619047619046, "grad_norm": 3.135304637867425, "learning_rate": 1.3447518995552643e-06, "loss": 0.4139, "step": 1210 }, { "epoch": 2.3066666666666666, "grad_norm": 3.2007051101269792, "learning_rate": 1.3377406979456825e-06, "loss": 0.3954, "step": 1211 }, { "epoch": 2.3085714285714287, "grad_norm": 2.780756756792339, "learning_rate": 1.3307449977248898e-06, "loss": 0.3687, "step": 1212 }, { "epoch": 2.3104761904761904, "grad_norm": 3.1272455954467167, "learning_rate": 1.323764828503858e-06, "loss": 0.3496, "step": 1213 }, { "epoch": 2.3123809523809524, "grad_norm": 2.830798158375888, "learning_rate": 1.3168002198278268e-06, "loss": 0.377, "step": 1214 }, { "epoch": 2.314285714285714, "grad_norm": 4.102361171482372, "learning_rate": 1.309851201176166e-06, "loss": 0.4986, "step": 1215 }, { "epoch": 2.316190476190476, "grad_norm": 2.9593571901729616, "learning_rate": 1.3029178019622624e-06, "loss": 0.5163, "step": 1216 }, { "epoch": 2.318095238095238, "grad_norm": 3.2051134620417785, "learning_rate": 1.2960000515333843e-06, "loss": 0.4319, "step": 1217 }, { "epoch": 2.32, "grad_norm": 3.5559167967252807, "learning_rate": 1.2890979791705688e-06, "loss": 0.4566, "step": 1218 }, { "epoch": 2.321904761904762, "grad_norm": 3.025199221927989, "learning_rate": 1.2822116140884855e-06, "loss": 0.3216, "step": 1219 }, { "epoch": 2.323809523809524, "grad_norm": 2.579715437235281, "learning_rate": 1.2753409854353254e-06, "loss": 0.4014, "step": 1220 }, { "epoch": 2.3257142857142856, "grad_norm": 2.4851024809676825, "learning_rate": 1.2684861222926654e-06, "loss": 0.2797, "step": 1221 }, { "epoch": 2.3276190476190477, "grad_norm": 2.7059064982076895, "learning_rate": 1.2616470536753555e-06, "loss": 0.435, "step": 1222 }, { "epoch": 2.3295238095238093, "grad_norm": 4.3042464570143375, "learning_rate": 1.2548238085313912e-06, "loss": 0.4358, "step": 1223 }, { "epoch": 2.3314285714285714, "grad_norm": 3.1448520627806036, "learning_rate": 1.2480164157417873e-06, "loss": 0.4141, "step": 1224 }, { "epoch": 2.3333333333333335, "grad_norm": 3.449490552266584, "learning_rate": 1.2412249041204654e-06, "loss": 0.3241, "step": 1225 }, { "epoch": 2.335238095238095, "grad_norm": 2.9052101577150333, "learning_rate": 1.2344493024141213e-06, "loss": 0.3339, "step": 1226 }, { "epoch": 2.337142857142857, "grad_norm": 3.785793650108638, "learning_rate": 1.227689639302113e-06, "loss": 0.3466, "step": 1227 }, { "epoch": 2.3390476190476193, "grad_norm": 3.7439990838976636, "learning_rate": 1.220945943396329e-06, "loss": 0.296, "step": 1228 }, { "epoch": 2.340952380952381, "grad_norm": 3.4275093920414337, "learning_rate": 1.2142182432410782e-06, "loss": 0.3064, "step": 1229 }, { "epoch": 2.342857142857143, "grad_norm": 3.0659322263681092, "learning_rate": 1.20750656731296e-06, "loss": 0.3069, "step": 1230 }, { "epoch": 2.3447619047619046, "grad_norm": 4.064174570719443, "learning_rate": 1.2008109440207499e-06, "loss": 0.3406, "step": 1231 }, { "epoch": 2.3466666666666667, "grad_norm": 4.077566287734659, "learning_rate": 1.1941314017052735e-06, "loss": 0.3993, "step": 1232 }, { "epoch": 2.3485714285714288, "grad_norm": 4.797164051684954, "learning_rate": 1.1874679686392942e-06, "loss": 0.4089, "step": 1233 }, { "epoch": 2.3504761904761904, "grad_norm": 3.596740884749646, "learning_rate": 1.1808206730273853e-06, "loss": 0.3792, "step": 1234 }, { "epoch": 2.3523809523809525, "grad_norm": 4.233433309827213, "learning_rate": 1.174189543005816e-06, "loss": 0.3625, "step": 1235 }, { "epoch": 2.354285714285714, "grad_norm": 3.3148035157580704, "learning_rate": 1.1675746066424332e-06, "loss": 0.3794, "step": 1236 }, { "epoch": 2.356190476190476, "grad_norm": 5.458566216130069, "learning_rate": 1.1609758919365343e-06, "loss": 0.4712, "step": 1237 }, { "epoch": 2.3580952380952382, "grad_norm": 3.1954593518316985, "learning_rate": 1.1543934268187617e-06, "loss": 0.3712, "step": 1238 }, { "epoch": 2.36, "grad_norm": 2.9362257307670765, "learning_rate": 1.147827239150971e-06, "loss": 0.4358, "step": 1239 }, { "epoch": 2.361904761904762, "grad_norm": 2.5507383849093395, "learning_rate": 1.141277356726125e-06, "loss": 0.3633, "step": 1240 }, { "epoch": 2.3638095238095236, "grad_norm": 3.930161843172662, "learning_rate": 1.1347438072681654e-06, "loss": 0.4992, "step": 1241 }, { "epoch": 2.3657142857142857, "grad_norm": 3.6442977513860497, "learning_rate": 1.1282266184319074e-06, "loss": 0.431, "step": 1242 }, { "epoch": 2.3676190476190477, "grad_norm": 2.88509525496193, "learning_rate": 1.1217258178029088e-06, "loss": 0.3725, "step": 1243 }, { "epoch": 2.3695238095238094, "grad_norm": 3.2848467649880493, "learning_rate": 1.1152414328973665e-06, "loss": 0.402, "step": 1244 }, { "epoch": 2.3714285714285714, "grad_norm": 3.815313038557439, "learning_rate": 1.1087734911619884e-06, "loss": 0.4339, "step": 1245 }, { "epoch": 2.3733333333333335, "grad_norm": 3.6309379264671815, "learning_rate": 1.1023220199738882e-06, "loss": 0.3348, "step": 1246 }, { "epoch": 2.375238095238095, "grad_norm": 3.1841497089404665, "learning_rate": 1.0958870466404587e-06, "loss": 0.3103, "step": 1247 }, { "epoch": 2.3771428571428572, "grad_norm": 2.3865502853297635, "learning_rate": 1.089468598399267e-06, "loss": 0.36, "step": 1248 }, { "epoch": 2.379047619047619, "grad_norm": 2.4753079239449973, "learning_rate": 1.0830667024179286e-06, "loss": 0.4125, "step": 1249 }, { "epoch": 2.380952380952381, "grad_norm": 3.637229151331397, "learning_rate": 1.0766813857940012e-06, "loss": 0.3054, "step": 1250 }, { "epoch": 2.382857142857143, "grad_norm": 3.5240887449222065, "learning_rate": 1.0703126755548666e-06, "loss": 0.3594, "step": 1251 }, { "epoch": 2.3847619047619046, "grad_norm": 2.8892907297195505, "learning_rate": 1.0639605986576117e-06, "loss": 0.3579, "step": 1252 }, { "epoch": 2.3866666666666667, "grad_norm": 2.7937069485868085, "learning_rate": 1.0576251819889256e-06, "loss": 0.3601, "step": 1253 }, { "epoch": 2.388571428571429, "grad_norm": 2.6891544024815306, "learning_rate": 1.0513064523649718e-06, "loss": 0.4678, "step": 1254 }, { "epoch": 2.3904761904761904, "grad_norm": 3.267206768884853, "learning_rate": 1.0450044365312878e-06, "loss": 0.3913, "step": 1255 }, { "epoch": 2.3923809523809525, "grad_norm": 2.9980974451733062, "learning_rate": 1.0387191611626613e-06, "loss": 0.3417, "step": 1256 }, { "epoch": 2.394285714285714, "grad_norm": 4.961973510439375, "learning_rate": 1.0324506528630275e-06, "loss": 0.57, "step": 1257 }, { "epoch": 2.396190476190476, "grad_norm": 2.2876925610079, "learning_rate": 1.0261989381653436e-06, "loss": 0.4377, "step": 1258 }, { "epoch": 2.3980952380952383, "grad_norm": 4.19103461984004, "learning_rate": 1.0199640435314912e-06, "loss": 0.5733, "step": 1259 }, { "epoch": 2.4, "grad_norm": 3.2220638450965318, "learning_rate": 1.0137459953521516e-06, "loss": 0.3965, "step": 1260 }, { "epoch": 2.401904761904762, "grad_norm": 2.7274877458539626, "learning_rate": 1.0075448199467037e-06, "loss": 0.3123, "step": 1261 }, { "epoch": 2.4038095238095236, "grad_norm": 2.6098326894467747, "learning_rate": 1.0013605435631035e-06, "loss": 0.4197, "step": 1262 }, { "epoch": 2.4057142857142857, "grad_norm": 3.1376814358846947, "learning_rate": 9.951931923777825e-07, "loss": 0.3685, "step": 1263 }, { "epoch": 2.4076190476190478, "grad_norm": 3.892536895474269, "learning_rate": 9.890427924955304e-07, "loss": 0.4284, "step": 1264 }, { "epoch": 2.4095238095238094, "grad_norm": 3.752672645105414, "learning_rate": 9.829093699493847e-07, "loss": 0.4295, "step": 1265 }, { "epoch": 2.4114285714285715, "grad_norm": 3.4976663359860223, "learning_rate": 9.767929507005263e-07, "loss": 0.2839, "step": 1266 }, { "epoch": 2.413333333333333, "grad_norm": 3.4083427867856915, "learning_rate": 9.706935606381606e-07, "loss": 0.4173, "step": 1267 }, { "epoch": 2.415238095238095, "grad_norm": 2.7603969390144716, "learning_rate": 9.646112255794182e-07, "loss": 0.4539, "step": 1268 }, { "epoch": 2.4171428571428573, "grad_norm": 2.86185545317862, "learning_rate": 9.585459712692347e-07, "loss": 0.4081, "step": 1269 }, { "epoch": 2.419047619047619, "grad_norm": 3.5080078245583177, "learning_rate": 9.524978233802529e-07, "loss": 0.3905, "step": 1270 }, { "epoch": 2.420952380952381, "grad_norm": 3.1485365755593784, "learning_rate": 9.464668075127032e-07, "loss": 0.5148, "step": 1271 }, { "epoch": 2.422857142857143, "grad_norm": 3.259521974462558, "learning_rate": 9.40452949194306e-07, "loss": 0.3295, "step": 1272 }, { "epoch": 2.4247619047619047, "grad_norm": 2.647219117161924, "learning_rate": 9.344562738801532e-07, "loss": 0.4309, "step": 1273 }, { "epoch": 2.4266666666666667, "grad_norm": 2.819943962692332, "learning_rate": 9.284768069526106e-07, "loss": 0.3978, "step": 1274 }, { "epoch": 2.4285714285714284, "grad_norm": 3.37110171891748, "learning_rate": 9.225145737211999e-07, "loss": 0.3497, "step": 1275 }, { "epoch": 2.4304761904761905, "grad_norm": 3.2343737650750284, "learning_rate": 9.165695994225027e-07, "loss": 0.484, "step": 1276 }, { "epoch": 2.4323809523809525, "grad_norm": 2.4102341743748266, "learning_rate": 9.106419092200419e-07, "loss": 0.3887, "step": 1277 }, { "epoch": 2.434285714285714, "grad_norm": 2.881380818665366, "learning_rate": 9.047315282041868e-07, "loss": 0.3833, "step": 1278 }, { "epoch": 2.4361904761904762, "grad_norm": 3.586289356745323, "learning_rate": 8.988384813920403e-07, "loss": 0.3013, "step": 1279 }, { "epoch": 2.4380952380952383, "grad_norm": 2.76177799494823, "learning_rate": 8.929627937273306e-07, "loss": 0.384, "step": 1280 }, { "epoch": 2.44, "grad_norm": 2.7208978965938915, "learning_rate": 8.871044900803139e-07, "loss": 0.3112, "step": 1281 }, { "epoch": 2.441904761904762, "grad_norm": 3.8741608271506425, "learning_rate": 8.812635952476595e-07, "loss": 0.4467, "step": 1282 }, { "epoch": 2.4438095238095237, "grad_norm": 2.547711345625711, "learning_rate": 8.754401339523555e-07, "loss": 0.371, "step": 1283 }, { "epoch": 2.4457142857142857, "grad_norm": 3.1248629401369876, "learning_rate": 8.696341308435918e-07, "loss": 0.6343, "step": 1284 }, { "epoch": 2.447619047619048, "grad_norm": 4.534808822420833, "learning_rate": 8.638456104966697e-07, "loss": 0.4842, "step": 1285 }, { "epoch": 2.4495238095238094, "grad_norm": 2.483158317099844, "learning_rate": 8.580745974128829e-07, "loss": 0.501, "step": 1286 }, { "epoch": 2.4514285714285715, "grad_norm": 3.1165792064698055, "learning_rate": 8.523211160194278e-07, "loss": 0.3496, "step": 1287 }, { "epoch": 2.453333333333333, "grad_norm": 2.9019588757409323, "learning_rate": 8.46585190669289e-07, "loss": 0.3665, "step": 1288 }, { "epoch": 2.455238095238095, "grad_norm": 3.585524995120755, "learning_rate": 8.408668456411456e-07, "loss": 0.5054, "step": 1289 }, { "epoch": 2.4571428571428573, "grad_norm": 3.017357271562797, "learning_rate": 8.351661051392584e-07, "loss": 0.3167, "step": 1290 }, { "epoch": 2.459047619047619, "grad_norm": 3.8636051850719144, "learning_rate": 8.294829932933767e-07, "loss": 0.4196, "step": 1291 }, { "epoch": 2.460952380952381, "grad_norm": 4.981089118580047, "learning_rate": 8.238175341586325e-07, "loss": 0.3626, "step": 1292 }, { "epoch": 2.4628571428571426, "grad_norm": 2.5514861598971197, "learning_rate": 8.181697517154352e-07, "loss": 0.4561, "step": 1293 }, { "epoch": 2.4647619047619047, "grad_norm": 2.8188578033727176, "learning_rate": 8.125396698693777e-07, "loss": 0.2887, "step": 1294 }, { "epoch": 2.466666666666667, "grad_norm": 2.478880902222213, "learning_rate": 8.069273124511256e-07, "loss": 0.3192, "step": 1295 }, { "epoch": 2.4685714285714284, "grad_norm": 4.658508362356186, "learning_rate": 8.013327032163277e-07, "loss": 0.4468, "step": 1296 }, { "epoch": 2.4704761904761905, "grad_norm": 2.7622379592389965, "learning_rate": 7.957558658455028e-07, "loss": 0.4443, "step": 1297 }, { "epoch": 2.4723809523809526, "grad_norm": 3.153774592176001, "learning_rate": 7.901968239439528e-07, "loss": 0.3512, "step": 1298 }, { "epoch": 2.474285714285714, "grad_norm": 4.3831048273406035, "learning_rate": 7.846556010416506e-07, "loss": 0.3387, "step": 1299 }, { "epoch": 2.4761904761904763, "grad_norm": 2.531799103099177, "learning_rate": 7.791322205931495e-07, "loss": 0.4368, "step": 1300 }, { "epoch": 2.478095238095238, "grad_norm": 2.6820930155282143, "learning_rate": 7.736267059774765e-07, "loss": 0.4298, "step": 1301 }, { "epoch": 2.48, "grad_norm": 3.6156272420973075, "learning_rate": 7.681390804980427e-07, "loss": 0.2211, "step": 1302 }, { "epoch": 2.481904761904762, "grad_norm": 2.5307331169149045, "learning_rate": 7.626693673825331e-07, "loss": 0.3678, "step": 1303 }, { "epoch": 2.4838095238095237, "grad_norm": 4.493697410671703, "learning_rate": 7.572175897828194e-07, "loss": 0.3768, "step": 1304 }, { "epoch": 2.4857142857142858, "grad_norm": 2.641598284220468, "learning_rate": 7.517837707748532e-07, "loss": 0.3967, "step": 1305 }, { "epoch": 2.487619047619048, "grad_norm": 3.4005298819475467, "learning_rate": 7.463679333585738e-07, "loss": 0.3599, "step": 1306 }, { "epoch": 2.4895238095238095, "grad_norm": 2.253509952531745, "learning_rate": 7.409701004578107e-07, "loss": 0.3859, "step": 1307 }, { "epoch": 2.4914285714285715, "grad_norm": 2.649069855195501, "learning_rate": 7.355902949201815e-07, "loss": 0.3645, "step": 1308 }, { "epoch": 2.493333333333333, "grad_norm": 2.985186218107175, "learning_rate": 7.302285395170012e-07, "loss": 0.3504, "step": 1309 }, { "epoch": 2.4952380952380953, "grad_norm": 3.688843440211025, "learning_rate": 7.248848569431815e-07, "loss": 0.3659, "step": 1310 }, { "epoch": 2.4971428571428573, "grad_norm": 4.05327133401772, "learning_rate": 7.195592698171394e-07, "loss": 0.3845, "step": 1311 }, { "epoch": 2.499047619047619, "grad_norm": 2.3142681128860776, "learning_rate": 7.142518006806936e-07, "loss": 0.3501, "step": 1312 }, { "epoch": 2.500952380952381, "grad_norm": 3.4686533405931126, "learning_rate": 7.089624719989807e-07, "loss": 0.3473, "step": 1313 }, { "epoch": 2.5028571428571427, "grad_norm": 2.9404207774730122, "learning_rate": 7.036913061603473e-07, "loss": 0.3238, "step": 1314 }, { "epoch": 2.5047619047619047, "grad_norm": 4.15485613539608, "learning_rate": 6.984383254762661e-07, "loss": 0.4574, "step": 1315 }, { "epoch": 2.506666666666667, "grad_norm": 2.7557936823792604, "learning_rate": 6.932035521812336e-07, "loss": 0.3371, "step": 1316 }, { "epoch": 2.5085714285714285, "grad_norm": 2.599832908064384, "learning_rate": 6.879870084326795e-07, "loss": 0.3433, "step": 1317 }, { "epoch": 2.5104761904761905, "grad_norm": 3.5663635213742815, "learning_rate": 6.827887163108754e-07, "loss": 0.5258, "step": 1318 }, { "epoch": 2.512380952380952, "grad_norm": 2.843365900601389, "learning_rate": 6.776086978188334e-07, "loss": 0.241, "step": 1319 }, { "epoch": 2.5142857142857142, "grad_norm": 2.536016449353789, "learning_rate": 6.724469748822254e-07, "loss": 0.512, "step": 1320 }, { "epoch": 2.5161904761904763, "grad_norm": 3.5526772582835204, "learning_rate": 6.673035693492758e-07, "loss": 0.3119, "step": 1321 }, { "epoch": 2.518095238095238, "grad_norm": 2.5327402916683863, "learning_rate": 6.621785029906813e-07, "loss": 0.3636, "step": 1322 }, { "epoch": 2.52, "grad_norm": 2.381014927773133, "learning_rate": 6.570717974995094e-07, "loss": 0.479, "step": 1323 }, { "epoch": 2.5219047619047616, "grad_norm": 2.3551915060559887, "learning_rate": 6.519834744911146e-07, "loss": 0.308, "step": 1324 }, { "epoch": 2.5238095238095237, "grad_norm": 3.1540737828322234, "learning_rate": 6.469135555030403e-07, "loss": 0.4138, "step": 1325 }, { "epoch": 2.525714285714286, "grad_norm": 2.6493262273934115, "learning_rate": 6.418620619949334e-07, "loss": 0.3728, "step": 1326 }, { "epoch": 2.527619047619048, "grad_norm": 2.8419165745822736, "learning_rate": 6.368290153484469e-07, "loss": 0.4861, "step": 1327 }, { "epoch": 2.5295238095238095, "grad_norm": 3.8726166413097607, "learning_rate": 6.31814436867157e-07, "loss": 0.4817, "step": 1328 }, { "epoch": 2.5314285714285716, "grad_norm": 3.2432496298782083, "learning_rate": 6.268183477764655e-07, "loss": 0.4095, "step": 1329 }, { "epoch": 2.533333333333333, "grad_norm": 4.032195981982127, "learning_rate": 6.218407692235152e-07, "loss": 0.3235, "step": 1330 }, { "epoch": 2.5352380952380953, "grad_norm": 3.5709363772617597, "learning_rate": 6.168817222770989e-07, "loss": 0.4395, "step": 1331 }, { "epoch": 2.5371428571428574, "grad_norm": 3.5346555515961247, "learning_rate": 6.119412279275683e-07, "loss": 0.3615, "step": 1332 }, { "epoch": 2.539047619047619, "grad_norm": 3.3497917270237303, "learning_rate": 6.070193070867486e-07, "loss": 0.4471, "step": 1333 }, { "epoch": 2.540952380952381, "grad_norm": 2.2466358209673634, "learning_rate": 6.021159805878474e-07, "loss": 0.5183, "step": 1334 }, { "epoch": 2.5428571428571427, "grad_norm": 3.596964073046631, "learning_rate": 5.972312691853688e-07, "loss": 0.4359, "step": 1335 }, { "epoch": 2.544761904761905, "grad_norm": 2.7079095099532067, "learning_rate": 5.923651935550195e-07, "loss": 0.3068, "step": 1336 }, { "epoch": 2.546666666666667, "grad_norm": 2.7033135992266817, "learning_rate": 5.875177742936322e-07, "loss": 0.3709, "step": 1337 }, { "epoch": 2.5485714285714285, "grad_norm": 2.652968881037819, "learning_rate": 5.826890319190665e-07, "loss": 0.4036, "step": 1338 }, { "epoch": 2.5504761904761906, "grad_norm": 2.6371934100581336, "learning_rate": 5.778789868701312e-07, "loss": 0.4442, "step": 1339 }, { "epoch": 2.552380952380952, "grad_norm": 2.8582048423887576, "learning_rate": 5.730876595064916e-07, "loss": 0.5423, "step": 1340 }, { "epoch": 2.5542857142857143, "grad_norm": 3.1528069659987383, "learning_rate": 5.683150701085887e-07, "loss": 0.3365, "step": 1341 }, { "epoch": 2.5561904761904763, "grad_norm": 3.930743161228964, "learning_rate": 5.635612388775486e-07, "loss": 0.5023, "step": 1342 }, { "epoch": 2.558095238095238, "grad_norm": 3.756322935202279, "learning_rate": 5.588261859350985e-07, "loss": 0.3707, "step": 1343 }, { "epoch": 2.56, "grad_norm": 3.405875755916657, "learning_rate": 5.541099313234844e-07, "loss": 0.4591, "step": 1344 }, { "epoch": 2.5619047619047617, "grad_norm": 4.1807164501016825, "learning_rate": 5.494124950053803e-07, "loss": 0.5218, "step": 1345 }, { "epoch": 2.5638095238095238, "grad_norm": 3.458898909701398, "learning_rate": 5.447338968638116e-07, "loss": 0.4553, "step": 1346 }, { "epoch": 2.565714285714286, "grad_norm": 6.058750231115991, "learning_rate": 5.400741567020634e-07, "loss": 0.334, "step": 1347 }, { "epoch": 2.5676190476190475, "grad_norm": 2.6103683471853616, "learning_rate": 5.354332942436019e-07, "loss": 0.3754, "step": 1348 }, { "epoch": 2.5695238095238095, "grad_norm": 2.3986875583973655, "learning_rate": 5.308113291319867e-07, "loss": 0.4138, "step": 1349 }, { "epoch": 2.571428571428571, "grad_norm": 3.4869072096598175, "learning_rate": 5.262082809307922e-07, "loss": 0.4711, "step": 1350 }, { "epoch": 2.5733333333333333, "grad_norm": 2.9784366132074536, "learning_rate": 5.216241691235197e-07, "loss": 0.3354, "step": 1351 }, { "epoch": 2.5752380952380953, "grad_norm": 3.180436288983747, "learning_rate": 5.170590131135211e-07, "loss": 0.3609, "step": 1352 }, { "epoch": 2.5771428571428574, "grad_norm": 3.8361573002155396, "learning_rate": 5.125128322239098e-07, "loss": 0.396, "step": 1353 }, { "epoch": 2.579047619047619, "grad_norm": 2.6090330681331313, "learning_rate": 5.079856456974857e-07, "loss": 0.3995, "step": 1354 }, { "epoch": 2.580952380952381, "grad_norm": 3.732794756755359, "learning_rate": 5.034774726966484e-07, "loss": 0.2656, "step": 1355 }, { "epoch": 2.5828571428571427, "grad_norm": 3.5102566381961995, "learning_rate": 4.989883323033184e-07, "loss": 0.4213, "step": 1356 }, { "epoch": 2.584761904761905, "grad_norm": 2.7909195722765356, "learning_rate": 4.945182435188578e-07, "loss": 0.394, "step": 1357 }, { "epoch": 2.586666666666667, "grad_norm": 2.7819914481638848, "learning_rate": 4.900672252639849e-07, "loss": 0.3475, "step": 1358 }, { "epoch": 2.5885714285714285, "grad_norm": 2.6724145817445946, "learning_rate": 4.85635296378702e-07, "loss": 0.4638, "step": 1359 }, { "epoch": 2.5904761904761906, "grad_norm": 3.5006447584276623, "learning_rate": 4.81222475622205e-07, "loss": 0.4069, "step": 1360 }, { "epoch": 2.5923809523809522, "grad_norm": 3.0844175410211525, "learning_rate": 4.7682878167281843e-07, "loss": 0.3402, "step": 1361 }, { "epoch": 2.5942857142857143, "grad_norm": 3.9376326123065835, "learning_rate": 4.724542331278992e-07, "loss": 0.3444, "step": 1362 }, { "epoch": 2.5961904761904764, "grad_norm": 2.823796348027554, "learning_rate": 4.680988485037735e-07, "loss": 0.2914, "step": 1363 }, { "epoch": 2.598095238095238, "grad_norm": 2.4702037734496, "learning_rate": 4.6376264623564704e-07, "loss": 0.413, "step": 1364 }, { "epoch": 2.6, "grad_norm": 3.849088204366855, "learning_rate": 4.594456446775347e-07, "loss": 0.3225, "step": 1365 }, { "epoch": 2.6019047619047617, "grad_norm": 2.51525711757472, "learning_rate": 4.5514786210217766e-07, "loss": 0.4084, "step": 1366 }, { "epoch": 2.603809523809524, "grad_norm": 2.642093902899198, "learning_rate": 4.5086931670096865e-07, "loss": 0.4074, "step": 1367 }, { "epoch": 2.605714285714286, "grad_norm": 2.9969418485889556, "learning_rate": 4.4661002658387685e-07, "loss": 0.414, "step": 1368 }, { "epoch": 2.6076190476190475, "grad_norm": 3.0151416444159054, "learning_rate": 4.4237000977936374e-07, "loss": 0.3344, "step": 1369 }, { "epoch": 2.6095238095238096, "grad_norm": 3.798310147755509, "learning_rate": 4.3814928423431826e-07, "loss": 0.5015, "step": 1370 }, { "epoch": 2.611428571428571, "grad_norm": 2.7051732198925413, "learning_rate": 4.339478678139686e-07, "loss": 0.3423, "step": 1371 }, { "epoch": 2.6133333333333333, "grad_norm": 3.5429724013247696, "learning_rate": 4.2976577830181663e-07, "loss": 0.3774, "step": 1372 }, { "epoch": 2.6152380952380954, "grad_norm": 3.9592191869302726, "learning_rate": 4.256030333995559e-07, "loss": 0.3812, "step": 1373 }, { "epoch": 2.617142857142857, "grad_norm": 2.282944362851788, "learning_rate": 4.2145965072700077e-07, "loss": 0.4928, "step": 1374 }, { "epoch": 2.619047619047619, "grad_norm": 2.8351895207824356, "learning_rate": 4.1733564782200973e-07, "loss": 0.3266, "step": 1375 }, { "epoch": 2.6209523809523807, "grad_norm": 2.779981276567339, "learning_rate": 4.132310421404123e-07, "loss": 0.3837, "step": 1376 }, { "epoch": 2.6228571428571428, "grad_norm": 2.845737076699493, "learning_rate": 4.091458510559332e-07, "loss": 0.4606, "step": 1377 }, { "epoch": 2.624761904761905, "grad_norm": 2.828519446651989, "learning_rate": 4.0508009186012277e-07, "loss": 0.2951, "step": 1378 }, { "epoch": 2.626666666666667, "grad_norm": 3.3949772921852968, "learning_rate": 4.0103378176227834e-07, "loss": 0.3903, "step": 1379 }, { "epoch": 2.6285714285714286, "grad_norm": 6.074886731771548, "learning_rate": 3.9700693788937537e-07, "loss": 0.436, "step": 1380 }, { "epoch": 2.6304761904761906, "grad_norm": 4.499566828369018, "learning_rate": 3.9299957728599516e-07, "loss": 0.3919, "step": 1381 }, { "epoch": 2.6323809523809523, "grad_norm": 2.9742452007328217, "learning_rate": 3.8901171691424876e-07, "loss": 0.507, "step": 1382 }, { "epoch": 2.6342857142857143, "grad_norm": 3.7095646212873645, "learning_rate": 3.8504337365371125e-07, "loss": 0.314, "step": 1383 }, { "epoch": 2.6361904761904764, "grad_norm": 3.3148733787507125, "learning_rate": 3.8109456430134364e-07, "loss": 0.3758, "step": 1384 }, { "epoch": 2.638095238095238, "grad_norm": 2.917740867278345, "learning_rate": 3.7716530557142695e-07, "loss": 0.3709, "step": 1385 }, { "epoch": 2.64, "grad_norm": 3.5169704814332583, "learning_rate": 3.732556140954885e-07, "loss": 0.4023, "step": 1386 }, { "epoch": 2.6419047619047618, "grad_norm": 4.205817670058243, "learning_rate": 3.69365506422234e-07, "loss": 0.3434, "step": 1387 }, { "epoch": 2.643809523809524, "grad_norm": 3.397887260874195, "learning_rate": 3.6549499901747276e-07, "loss": 0.3697, "step": 1388 }, { "epoch": 2.645714285714286, "grad_norm": 2.649276590805659, "learning_rate": 3.6164410826405704e-07, "loss": 0.3382, "step": 1389 }, { "epoch": 2.6476190476190475, "grad_norm": 3.6697458488411936, "learning_rate": 3.5781285046180005e-07, "loss": 0.4605, "step": 1390 }, { "epoch": 2.6495238095238096, "grad_norm": 4.573104325720787, "learning_rate": 3.5400124182741913e-07, "loss": 0.454, "step": 1391 }, { "epoch": 2.6514285714285712, "grad_norm": 2.742805647411334, "learning_rate": 3.502092984944594e-07, "loss": 0.3325, "step": 1392 }, { "epoch": 2.6533333333333333, "grad_norm": 2.9476397630311078, "learning_rate": 3.4643703651322745e-07, "loss": 0.4399, "step": 1393 }, { "epoch": 2.6552380952380954, "grad_norm": 3.55869789840011, "learning_rate": 3.426844718507261e-07, "loss": 0.4367, "step": 1394 }, { "epoch": 2.657142857142857, "grad_norm": 3.850299601922021, "learning_rate": 3.389516203905818e-07, "loss": 0.4826, "step": 1395 }, { "epoch": 2.659047619047619, "grad_norm": 2.830147432712001, "learning_rate": 3.352384979329826e-07, "loss": 0.374, "step": 1396 }, { "epoch": 2.6609523809523807, "grad_norm": 2.590774281488631, "learning_rate": 3.3154512019460736e-07, "loss": 0.3244, "step": 1397 }, { "epoch": 2.662857142857143, "grad_norm": 2.867865871729688, "learning_rate": 3.2787150280856106e-07, "loss": 0.3995, "step": 1398 }, { "epoch": 2.664761904761905, "grad_norm": 2.612773831567773, "learning_rate": 3.242176613243081e-07, "loss": 0.3918, "step": 1399 }, { "epoch": 2.6666666666666665, "grad_norm": 3.2587468213415027, "learning_rate": 3.205836112076077e-07, "loss": 0.3468, "step": 1400 }, { "epoch": 2.6685714285714286, "grad_norm": 3.106198659399771, "learning_rate": 3.169693678404456e-07, "loss": 0.4179, "step": 1401 }, { "epoch": 2.6704761904761902, "grad_norm": 3.6793058346398184, "learning_rate": 3.133749465209718e-07, "loss": 0.3862, "step": 1402 }, { "epoch": 2.6723809523809523, "grad_norm": 2.457921563894658, "learning_rate": 3.0980036246343435e-07, "loss": 0.4651, "step": 1403 }, { "epoch": 2.6742857142857144, "grad_norm": 3.561474305675107, "learning_rate": 3.0624563079811707e-07, "loss": 0.3603, "step": 1404 }, { "epoch": 2.6761904761904765, "grad_norm": 3.3406922223285735, "learning_rate": 3.0271076657127143e-07, "loss": 0.4338, "step": 1405 }, { "epoch": 2.678095238095238, "grad_norm": 3.209473700097369, "learning_rate": 2.991957847450544e-07, "loss": 0.4381, "step": 1406 }, { "epoch": 2.68, "grad_norm": 3.546067059971776, "learning_rate": 2.957007001974704e-07, "loss": 0.4188, "step": 1407 }, { "epoch": 2.681904761904762, "grad_norm": 3.96011605011883, "learning_rate": 2.922255277222991e-07, "loss": 0.3798, "step": 1408 }, { "epoch": 2.683809523809524, "grad_norm": 3.8059135549293424, "learning_rate": 2.8877028202904113e-07, "loss": 0.274, "step": 1409 }, { "epoch": 2.685714285714286, "grad_norm": 4.560997007280084, "learning_rate": 2.8533497774284824e-07, "loss": 0.4536, "step": 1410 }, { "epoch": 2.6876190476190476, "grad_norm": 2.7252847235298567, "learning_rate": 2.8191962940447073e-07, "loss": 0.4049, "step": 1411 }, { "epoch": 2.6895238095238096, "grad_norm": 2.7343016725312945, "learning_rate": 2.785242514701847e-07, "loss": 0.3729, "step": 1412 }, { "epoch": 2.6914285714285713, "grad_norm": 2.9167279858792137, "learning_rate": 2.7514885831174286e-07, "loss": 0.3592, "step": 1413 }, { "epoch": 2.6933333333333334, "grad_norm": 2.3358141872441576, "learning_rate": 2.7179346421630113e-07, "loss": 0.4178, "step": 1414 }, { "epoch": 2.6952380952380954, "grad_norm": 3.9378285249680385, "learning_rate": 2.684580833863709e-07, "loss": 0.3633, "step": 1415 }, { "epoch": 2.697142857142857, "grad_norm": 3.246397261454276, "learning_rate": 2.651427299397469e-07, "loss": 0.4293, "step": 1416 }, { "epoch": 2.699047619047619, "grad_norm": 2.6996868196263137, "learning_rate": 2.6184741790945767e-07, "loss": 0.3808, "step": 1417 }, { "epoch": 2.7009523809523808, "grad_norm": 3.3285473725669785, "learning_rate": 2.5857216124369977e-07, "loss": 0.3566, "step": 1418 }, { "epoch": 2.702857142857143, "grad_norm": 3.0290915678180266, "learning_rate": 2.553169738057798e-07, "loss": 0.4717, "step": 1419 }, { "epoch": 2.704761904761905, "grad_norm": 2.743377770077459, "learning_rate": 2.5208186937405845e-07, "loss": 0.3877, "step": 1420 }, { "epoch": 2.7066666666666666, "grad_norm": 3.499598182556431, "learning_rate": 2.488668616418888e-07, "loss": 0.3816, "step": 1421 }, { "epoch": 2.7085714285714286, "grad_norm": 3.3128107331165704, "learning_rate": 2.4567196421756034e-07, "loss": 0.4121, "step": 1422 }, { "epoch": 2.7104761904761903, "grad_norm": 2.825163170393998, "learning_rate": 2.424971906242407e-07, "loss": 0.3842, "step": 1423 }, { "epoch": 2.7123809523809523, "grad_norm": 2.6359801000422767, "learning_rate": 2.3934255429991946e-07, "loss": 0.3833, "step": 1424 }, { "epoch": 2.7142857142857144, "grad_norm": 2.7877973861490966, "learning_rate": 2.3620806859734892e-07, "loss": 0.335, "step": 1425 }, { "epoch": 2.716190476190476, "grad_norm": 2.325442852658568, "learning_rate": 2.330937467839911e-07, "loss": 0.4041, "step": 1426 }, { "epoch": 2.718095238095238, "grad_norm": 3.6700218613653393, "learning_rate": 2.2999960204195593e-07, "loss": 0.3112, "step": 1427 }, { "epoch": 2.7199999999999998, "grad_norm": 3.8014301204876633, "learning_rate": 2.2692564746795375e-07, "loss": 0.4377, "step": 1428 }, { "epoch": 2.721904761904762, "grad_norm": 2.4998097908967276, "learning_rate": 2.238718960732311e-07, "loss": 0.339, "step": 1429 }, { "epoch": 2.723809523809524, "grad_norm": 2.9387378035324563, "learning_rate": 2.2083836078352238e-07, "loss": 0.4116, "step": 1430 }, { "epoch": 2.725714285714286, "grad_norm": 3.1794317286718847, "learning_rate": 2.1782505443899205e-07, "loss": 0.4797, "step": 1431 }, { "epoch": 2.7276190476190476, "grad_norm": 2.632631500118067, "learning_rate": 2.1483198979417863e-07, "loss": 0.3901, "step": 1432 }, { "epoch": 2.7295238095238097, "grad_norm": 3.346673856710114, "learning_rate": 2.1185917951794644e-07, "loss": 0.5032, "step": 1433 }, { "epoch": 2.7314285714285713, "grad_norm": 2.8640736427366122, "learning_rate": 2.0890663619342387e-07, "loss": 0.2558, "step": 1434 }, { "epoch": 2.7333333333333334, "grad_norm": 2.363884429861665, "learning_rate": 2.059743723179597e-07, "loss": 0.4956, "step": 1435 }, { "epoch": 2.7352380952380955, "grad_norm": 2.6590632628301374, "learning_rate": 2.0306240030305956e-07, "loss": 0.4531, "step": 1436 }, { "epoch": 2.737142857142857, "grad_norm": 4.5851846282878315, "learning_rate": 2.001707324743446e-07, "loss": 0.3446, "step": 1437 }, { "epoch": 2.739047619047619, "grad_norm": 2.364896724892544, "learning_rate": 1.972993810714885e-07, "loss": 0.341, "step": 1438 }, { "epoch": 2.740952380952381, "grad_norm": 3.394643740963008, "learning_rate": 1.9444835824817488e-07, "loss": 0.4775, "step": 1439 }, { "epoch": 2.742857142857143, "grad_norm": 3.1497123888944936, "learning_rate": 1.916176760720384e-07, "loss": 0.4329, "step": 1440 }, { "epoch": 2.744761904761905, "grad_norm": 3.8255350611127605, "learning_rate": 1.8880734652461985e-07, "loss": 0.3892, "step": 1441 }, { "epoch": 2.7466666666666666, "grad_norm": 3.223193557047404, "learning_rate": 1.860173815013111e-07, "loss": 0.4634, "step": 1442 }, { "epoch": 2.7485714285714287, "grad_norm": 3.1081408487720728, "learning_rate": 1.8324779281130688e-07, "loss": 0.3539, "step": 1443 }, { "epoch": 2.7504761904761903, "grad_norm": 2.9818754324793275, "learning_rate": 1.8049859217755318e-07, "loss": 0.3294, "step": 1444 }, { "epoch": 2.7523809523809524, "grad_norm": 3.5304142104767005, "learning_rate": 1.7776979123670046e-07, "loss": 0.4255, "step": 1445 }, { "epoch": 2.7542857142857144, "grad_norm": 3.148823744356506, "learning_rate": 1.7506140153905172e-07, "loss": 0.3619, "step": 1446 }, { "epoch": 2.756190476190476, "grad_norm": 3.4208616640780805, "learning_rate": 1.723734345485134e-07, "loss": 0.4907, "step": 1447 }, { "epoch": 2.758095238095238, "grad_norm": 2.7842581607279127, "learning_rate": 1.6970590164255063e-07, "loss": 0.3696, "step": 1448 }, { "epoch": 2.76, "grad_norm": 2.474949464107439, "learning_rate": 1.6705881411213266e-07, "loss": 0.3597, "step": 1449 }, { "epoch": 2.761904761904762, "grad_norm": 3.167833960819498, "learning_rate": 1.644321831616924e-07, "loss": 0.2895, "step": 1450 }, { "epoch": 2.763809523809524, "grad_norm": 4.186950007948689, "learning_rate": 1.618260199090732e-07, "loss": 0.5209, "step": 1451 }, { "epoch": 2.7657142857142856, "grad_norm": 3.9017836964297627, "learning_rate": 1.592403353854849e-07, "loss": 0.4623, "step": 1452 }, { "epoch": 2.7676190476190476, "grad_norm": 3.1034275458652743, "learning_rate": 1.5667514053545497e-07, "loss": 0.1982, "step": 1453 }, { "epoch": 2.7695238095238093, "grad_norm": 2.5896371066569643, "learning_rate": 1.5413044621678585e-07, "loss": 0.3693, "step": 1454 }, { "epoch": 2.7714285714285714, "grad_norm": 2.9423138679709706, "learning_rate": 1.5160626320050441e-07, "loss": 0.3612, "step": 1455 }, { "epoch": 2.7733333333333334, "grad_norm": 3.0077242605314054, "learning_rate": 1.4910260217081917e-07, "loss": 0.4115, "step": 1456 }, { "epoch": 2.7752380952380955, "grad_norm": 2.8821019011482774, "learning_rate": 1.4661947372507478e-07, "loss": 0.3463, "step": 1457 }, { "epoch": 2.777142857142857, "grad_norm": 2.791275624534033, "learning_rate": 1.4415688837370602e-07, "loss": 0.3416, "step": 1458 }, { "epoch": 2.779047619047619, "grad_norm": 2.8027088738155257, "learning_rate": 1.4171485654019545e-07, "loss": 0.3663, "step": 1459 }, { "epoch": 2.780952380952381, "grad_norm": 2.871962893395915, "learning_rate": 1.3929338856102648e-07, "loss": 0.2784, "step": 1460 }, { "epoch": 2.782857142857143, "grad_norm": 2.5917801560004246, "learning_rate": 1.3689249468564204e-07, "loss": 0.4239, "step": 1461 }, { "epoch": 2.784761904761905, "grad_norm": 3.0061953368794767, "learning_rate": 1.3451218507639919e-07, "loss": 0.351, "step": 1462 }, { "epoch": 2.7866666666666666, "grad_norm": 3.0760968984821844, "learning_rate": 1.321524698085297e-07, "loss": 0.3419, "step": 1463 }, { "epoch": 2.7885714285714287, "grad_norm": 4.888186011705799, "learning_rate": 1.2981335887009117e-07, "loss": 0.432, "step": 1464 }, { "epoch": 2.7904761904761903, "grad_norm": 3.2387190691494605, "learning_rate": 1.2749486216193096e-07, "loss": 0.2988, "step": 1465 }, { "epoch": 2.7923809523809524, "grad_norm": 2.471741045610758, "learning_rate": 1.2519698949764125e-07, "loss": 0.3658, "step": 1466 }, { "epoch": 2.7942857142857145, "grad_norm": 3.230907017656687, "learning_rate": 1.2291975060351845e-07, "loss": 0.2395, "step": 1467 }, { "epoch": 2.796190476190476, "grad_norm": 2.9442534154426894, "learning_rate": 1.2066315511852e-07, "loss": 0.3982, "step": 1468 }, { "epoch": 2.798095238095238, "grad_norm": 2.985865849547776, "learning_rate": 1.1842721259422762e-07, "loss": 0.4266, "step": 1469 }, { "epoch": 2.8, "grad_norm": 2.4288167755474896, "learning_rate": 1.1621193249480134e-07, "loss": 0.3132, "step": 1470 }, { "epoch": 2.801904761904762, "grad_norm": 3.2452240068078386, "learning_rate": 1.14017324196945e-07, "loss": 0.4057, "step": 1471 }, { "epoch": 2.803809523809524, "grad_norm": 2.6775551540518143, "learning_rate": 1.1184339698986413e-07, "loss": 0.3664, "step": 1472 }, { "epoch": 2.8057142857142856, "grad_norm": 3.8951080894899546, "learning_rate": 1.0969016007522482e-07, "loss": 0.3875, "step": 1473 }, { "epoch": 2.8076190476190477, "grad_norm": 3.05342819085455, "learning_rate": 1.0755762256711821e-07, "loss": 0.3428, "step": 1474 }, { "epoch": 2.8095238095238093, "grad_norm": 2.830622206587528, "learning_rate": 1.0544579349201834e-07, "loss": 0.2576, "step": 1475 }, { "epoch": 2.8114285714285714, "grad_norm": 3.612325869758792, "learning_rate": 1.0335468178874875e-07, "loss": 0.4498, "step": 1476 }, { "epoch": 2.8133333333333335, "grad_norm": 2.251910206558631, "learning_rate": 1.0128429630843928e-07, "loss": 0.4282, "step": 1477 }, { "epoch": 2.815238095238095, "grad_norm": 2.3402149243566064, "learning_rate": 9.92346458144916e-08, "loss": 0.3128, "step": 1478 }, { "epoch": 2.817142857142857, "grad_norm": 3.0457760796060045, "learning_rate": 9.720573898254204e-08, "loss": 0.4435, "step": 1479 }, { "epoch": 2.819047619047619, "grad_norm": 3.2918956223155913, "learning_rate": 9.519758440042437e-08, "loss": 0.4222, "step": 1480 }, { "epoch": 2.820952380952381, "grad_norm": 4.799824062475176, "learning_rate": 9.321019056813319e-08, "loss": 0.3054, "step": 1481 }, { "epoch": 2.822857142857143, "grad_norm": 2.6879575793447126, "learning_rate": 9.124356589778894e-08, "loss": 0.432, "step": 1482 }, { "epoch": 2.824761904761905, "grad_norm": 3.7092648149545964, "learning_rate": 8.929771871360016e-08, "loss": 0.3283, "step": 1483 }, { "epoch": 2.8266666666666667, "grad_norm": 3.720362161985696, "learning_rate": 8.737265725183187e-08, "loss": 0.3941, "step": 1484 }, { "epoch": 2.8285714285714287, "grad_norm": 3.085587086580329, "learning_rate": 8.546838966076609e-08, "loss": 0.3258, "step": 1485 }, { "epoch": 2.8304761904761904, "grad_norm": 2.772409125190257, "learning_rate": 8.358492400067143e-08, "loss": 0.3166, "step": 1486 }, { "epoch": 2.8323809523809524, "grad_norm": 2.834852282659052, "learning_rate": 8.172226824376794e-08, "loss": 0.3456, "step": 1487 }, { "epoch": 2.8342857142857145, "grad_norm": 2.851141595395487, "learning_rate": 7.988043027419012e-08, "loss": 0.3203, "step": 1488 }, { "epoch": 2.836190476190476, "grad_norm": 3.1638348841918185, "learning_rate": 7.805941788795957e-08, "loss": 0.3634, "step": 1489 }, { "epoch": 2.8380952380952382, "grad_norm": 3.4988085681855936, "learning_rate": 7.625923879294562e-08, "loss": 0.3346, "step": 1490 }, { "epoch": 2.84, "grad_norm": 3.932817774645209, "learning_rate": 7.447990060883813e-08, "loss": 0.4711, "step": 1491 }, { "epoch": 2.841904761904762, "grad_norm": 3.314782741051733, "learning_rate": 7.272141086711038e-08, "loss": 0.3805, "step": 1492 }, { "epoch": 2.843809523809524, "grad_norm": 2.6586503316878343, "learning_rate": 7.098377701099224e-08, "loss": 0.3988, "step": 1493 }, { "epoch": 2.8457142857142856, "grad_norm": 3.1274166789629567, "learning_rate": 6.926700639543426e-08, "loss": 0.2704, "step": 1494 }, { "epoch": 2.8476190476190477, "grad_norm": 4.335230393574005, "learning_rate": 6.757110628707875e-08, "loss": 0.3717, "step": 1495 }, { "epoch": 2.8495238095238093, "grad_norm": 3.9943236036288834, "learning_rate": 6.589608386422864e-08, "loss": 0.4777, "step": 1496 }, { "epoch": 2.8514285714285714, "grad_norm": 2.3847339579793982, "learning_rate": 6.424194621681757e-08, "loss": 0.338, "step": 1497 }, { "epoch": 2.8533333333333335, "grad_norm": 5.75275060131789, "learning_rate": 6.260870034637823e-08, "loss": 0.3713, "step": 1498 }, { "epoch": 2.855238095238095, "grad_norm": 2.6313275408387398, "learning_rate": 6.099635316601516e-08, "loss": 0.387, "step": 1499 }, { "epoch": 2.857142857142857, "grad_norm": 2.6378410361597298, "learning_rate": 5.9404911500374195e-08, "loss": 0.3942, "step": 1500 }, { "epoch": 2.859047619047619, "grad_norm": 2.544117154140232, "learning_rate": 5.783438208561143e-08, "loss": 0.279, "step": 1501 }, { "epoch": 2.860952380952381, "grad_norm": 3.09851200759129, "learning_rate": 5.628477156936929e-08, "loss": 0.4811, "step": 1502 }, { "epoch": 2.862857142857143, "grad_norm": 3.3533025620016423, "learning_rate": 5.475608651074493e-08, "loss": 0.3734, "step": 1503 }, { "epoch": 2.8647619047619046, "grad_norm": 2.4971679525564454, "learning_rate": 5.324833338026414e-08, "loss": 0.2796, "step": 1504 }, { "epoch": 2.8666666666666667, "grad_norm": 2.8569636184413403, "learning_rate": 5.1761518559851895e-08, "loss": 0.396, "step": 1505 }, { "epoch": 2.8685714285714283, "grad_norm": 2.5957718140266084, "learning_rate": 5.029564834280854e-08, "loss": 0.3612, "step": 1506 }, { "epoch": 2.8704761904761904, "grad_norm": 2.5932946095313842, "learning_rate": 4.885072893377973e-08, "loss": 0.2873, "step": 1507 }, { "epoch": 2.8723809523809525, "grad_norm": 3.4386666381344955, "learning_rate": 4.742676644873323e-08, "loss": 0.3646, "step": 1508 }, { "epoch": 2.8742857142857146, "grad_norm": 2.7282812718294798, "learning_rate": 4.602376691493049e-08, "loss": 0.3819, "step": 1509 }, { "epoch": 2.876190476190476, "grad_norm": 3.502412241084784, "learning_rate": 4.46417362709034e-08, "loss": 0.4119, "step": 1510 }, { "epoch": 2.878095238095238, "grad_norm": 2.6490198698781304, "learning_rate": 4.328068036642652e-08, "loss": 0.3538, "step": 1511 }, { "epoch": 2.88, "grad_norm": 2.5486078555380267, "learning_rate": 4.1940604962495414e-08, "loss": 0.4733, "step": 1512 }, { "epoch": 2.881904761904762, "grad_norm": 2.876476371352215, "learning_rate": 4.062151573129891e-08, "loss": 0.4285, "step": 1513 }, { "epoch": 2.883809523809524, "grad_norm": 2.9246670664314123, "learning_rate": 3.932341825619912e-08, "loss": 0.4738, "step": 1514 }, { "epoch": 2.8857142857142857, "grad_norm": 5.16920512330396, "learning_rate": 3.804631803170311e-08, "loss": 0.4935, "step": 1515 }, { "epoch": 2.8876190476190478, "grad_norm": 3.555521994274648, "learning_rate": 3.6790220463443494e-08, "loss": 0.4549, "step": 1516 }, { "epoch": 2.8895238095238094, "grad_norm": 3.7071302903675396, "learning_rate": 3.555513086815398e-08, "loss": 0.4049, "step": 1517 }, { "epoch": 2.8914285714285715, "grad_norm": 3.9377888354070283, "learning_rate": 3.434105447364666e-08, "loss": 0.4843, "step": 1518 }, { "epoch": 2.8933333333333335, "grad_norm": 3.6947499162878907, "learning_rate": 3.314799641879029e-08, "loss": 0.4104, "step": 1519 }, { "epoch": 2.895238095238095, "grad_norm": 2.5344718134461997, "learning_rate": 3.197596175348872e-08, "loss": 0.3859, "step": 1520 }, { "epoch": 2.8971428571428572, "grad_norm": 2.968032795769075, "learning_rate": 3.0824955438659155e-08, "loss": 0.6132, "step": 1521 }, { "epoch": 2.899047619047619, "grad_norm": 3.4079549983296635, "learning_rate": 2.9694982346211155e-08, "loss": 0.3593, "step": 1522 }, { "epoch": 2.900952380952381, "grad_norm": 3.5174081383743983, "learning_rate": 2.8586047259026027e-08, "loss": 0.4305, "step": 1523 }, { "epoch": 2.902857142857143, "grad_norm": 2.823236769842876, "learning_rate": 2.7498154870936877e-08, "loss": 0.3946, "step": 1524 }, { "epoch": 2.9047619047619047, "grad_norm": 3.301918701278776, "learning_rate": 2.643130978670916e-08, "loss": 0.3012, "step": 1525 }, { "epoch": 2.9066666666666667, "grad_norm": 4.008765745850195, "learning_rate": 2.5385516522019038e-08, "loss": 0.3526, "step": 1526 }, { "epoch": 2.9085714285714284, "grad_norm": 2.647771121600037, "learning_rate": 2.4360779503436737e-08, "loss": 0.3276, "step": 1527 }, { "epoch": 2.9104761904761904, "grad_norm": 2.8371789333816144, "learning_rate": 2.3357103068408216e-08, "loss": 0.2851, "step": 1528 }, { "epoch": 2.9123809523809525, "grad_norm": 2.9382849895020655, "learning_rate": 2.2374491465232962e-08, "loss": 0.3664, "step": 1529 }, { "epoch": 2.914285714285714, "grad_norm": 3.877471654577267, "learning_rate": 2.1412948853050118e-08, "loss": 0.3694, "step": 1530 }, { "epoch": 2.916190476190476, "grad_norm": 3.0412428757357093, "learning_rate": 2.047247930181795e-08, "loss": 0.313, "step": 1531 }, { "epoch": 2.918095238095238, "grad_norm": 2.610434076947357, "learning_rate": 1.95530867923005e-08, "loss": 0.3205, "step": 1532 }, { "epoch": 2.92, "grad_norm": 2.872911037706269, "learning_rate": 1.8654775216045416e-08, "loss": 0.422, "step": 1533 }, { "epoch": 2.921904761904762, "grad_norm": 2.4915798889589142, "learning_rate": 1.777754837537171e-08, "loss": 0.4449, "step": 1534 }, { "epoch": 2.923809523809524, "grad_norm": 4.022478647232431, "learning_rate": 1.6921409983352012e-08, "loss": 0.3899, "step": 1535 }, { "epoch": 2.9257142857142857, "grad_norm": 2.493714838512802, "learning_rate": 1.6086363663797012e-08, "loss": 0.2982, "step": 1536 }, { "epoch": 2.9276190476190473, "grad_norm": 2.9697157354072816, "learning_rate": 1.527241295123938e-08, "loss": 0.3651, "step": 1537 }, { "epoch": 2.9295238095238094, "grad_norm": 2.7197768930952306, "learning_rate": 1.447956129092043e-08, "loss": 0.3514, "step": 1538 }, { "epoch": 2.9314285714285715, "grad_norm": 3.5781090251386924, "learning_rate": 1.3707812038775692e-08, "loss": 0.5047, "step": 1539 }, { "epoch": 2.9333333333333336, "grad_norm": 3.2730755264424953, "learning_rate": 1.2957168461416591e-08, "loss": 0.4054, "step": 1540 }, { "epoch": 2.935238095238095, "grad_norm": 2.528277959234961, "learning_rate": 1.2227633736123235e-08, "loss": 0.4211, "step": 1541 }, { "epoch": 2.9371428571428573, "grad_norm": 3.4226473693644284, "learning_rate": 1.1519210950825532e-08, "loss": 0.4452, "step": 1542 }, { "epoch": 2.939047619047619, "grad_norm": 2.6766217811106596, "learning_rate": 1.08319031040921e-08, "loss": 0.4472, "step": 1543 }, { "epoch": 2.940952380952381, "grad_norm": 3.4616430384994206, "learning_rate": 1.0165713105118047e-08, "loss": 0.4327, "step": 1544 }, { "epoch": 2.942857142857143, "grad_norm": 4.305382509125797, "learning_rate": 9.520643773712202e-09, "loss": 0.3988, "step": 1545 }, { "epoch": 2.9447619047619047, "grad_norm": 2.896555187828505, "learning_rate": 8.896697840284352e-09, "loss": 0.3547, "step": 1546 }, { "epoch": 2.9466666666666668, "grad_norm": 4.480592353258285, "learning_rate": 8.293877945835804e-09, "loss": 0.3365, "step": 1547 }, { "epoch": 2.9485714285714284, "grad_norm": 3.5724025531135903, "learning_rate": 7.712186641944952e-09, "loss": 0.3971, "step": 1548 }, { "epoch": 2.9504761904761905, "grad_norm": 3.3096316069641993, "learning_rate": 7.151626390760058e-09, "loss": 0.3749, "step": 1549 }, { "epoch": 2.9523809523809526, "grad_norm": 2.559914247262096, "learning_rate": 6.6121995649859326e-09, "loss": 0.3817, "step": 1550 }, { "epoch": 2.954285714285714, "grad_norm": 2.997375555550545, "learning_rate": 6.093908447876162e-09, "loss": 0.382, "step": 1551 }, { "epoch": 2.9561904761904763, "grad_norm": 3.108071839021827, "learning_rate": 5.596755233222561e-09, "loss": 0.4033, "step": 1552 }, { "epoch": 2.958095238095238, "grad_norm": 2.300321365538265, "learning_rate": 5.120742025343517e-09, "loss": 0.4264, "step": 1553 }, { "epoch": 2.96, "grad_norm": 2.6958337209009158, "learning_rate": 4.6658708390801e-09, "loss": 0.4062, "step": 1554 }, { "epoch": 2.961904761904762, "grad_norm": 3.5919435563455657, "learning_rate": 4.232143599782746e-09, "loss": 0.4033, "step": 1555 }, { "epoch": 2.9638095238095237, "grad_norm": 2.6611038770764903, "learning_rate": 3.819562143306255e-09, "loss": 0.3345, "step": 1556 }, { "epoch": 2.9657142857142857, "grad_norm": 3.4092163151172388, "learning_rate": 3.4281282160003594e-09, "loss": 0.4142, "step": 1557 }, { "epoch": 2.9676190476190474, "grad_norm": 3.6347390497508036, "learning_rate": 3.0578434747025042e-09, "loss": 0.4654, "step": 1558 }, { "epoch": 2.9695238095238095, "grad_norm": 2.4751005519357836, "learning_rate": 2.7087094867311868e-09, "loss": 0.4332, "step": 1559 }, { "epoch": 2.9714285714285715, "grad_norm": 3.022900699813934, "learning_rate": 2.3807277298804053e-09, "loss": 0.3647, "step": 1560 }, { "epoch": 2.9733333333333336, "grad_norm": 3.746902190262692, "learning_rate": 2.0738995924107774e-09, "loss": 0.4073, "step": 1561 }, { "epoch": 2.9752380952380952, "grad_norm": 3.1209308325275513, "learning_rate": 1.7882263730462091e-09, "loss": 0.2681, "step": 1562 }, { "epoch": 2.977142857142857, "grad_norm": 4.6229541934062, "learning_rate": 1.5237092809666786e-09, "loss": 0.4218, "step": 1563 }, { "epoch": 2.979047619047619, "grad_norm": 2.461331500764114, "learning_rate": 1.280349435803796e-09, "loss": 0.4157, "step": 1564 }, { "epoch": 2.980952380952381, "grad_norm": 4.0826884736518325, "learning_rate": 1.0581478676369161e-09, "loss": 0.4237, "step": 1565 }, { "epoch": 2.982857142857143, "grad_norm": 2.775846320928122, "learning_rate": 8.571055169870334e-10, "loss": 0.4322, "step": 1566 }, { "epoch": 2.9847619047619047, "grad_norm": 4.287841206088843, "learning_rate": 6.772232348140062e-10, "loss": 0.4391, "step": 1567 }, { "epoch": 2.986666666666667, "grad_norm": 3.237023079895034, "learning_rate": 5.185017825126704e-10, "loss": 0.4706, "step": 1568 }, { "epoch": 2.9885714285714284, "grad_norm": 3.029608142517041, "learning_rate": 3.809418319095093e-10, "loss": 0.4228, "step": 1569 }, { "epoch": 2.9904761904761905, "grad_norm": 4.206662515966221, "learning_rate": 2.64543965259878e-10, "loss": 0.628, "step": 1570 }, { "epoch": 2.9923809523809526, "grad_norm": 2.9028948979191402, "learning_rate": 1.6930867524633798e-10, "loss": 0.4524, "step": 1571 }, { "epoch": 2.994285714285714, "grad_norm": 2.7800671548159284, "learning_rate": 9.523636497366095e-11, "loss": 0.4379, "step": 1572 }, { "epoch": 2.9961904761904763, "grad_norm": 4.067541717640019, "learning_rate": 4.232734797215976e-11, "loss": 0.4566, "step": 1573 }, { "epoch": 2.998095238095238, "grad_norm": 2.445989675628037, "learning_rate": 1.0581848190471811e-11, "loss": 0.3629, "step": 1574 }, { "epoch": 3.0, "grad_norm": 2.71914974410319, "learning_rate": 0.0, "loss": 0.4207, "step": 1575 }, { "epoch": 3.0, "step": 1575, "total_flos": 247329718272000.0, "train_loss": 0.7211369104044778, "train_runtime": 32331.6664, "train_samples_per_second": 2.337, "train_steps_per_second": 0.049 } ], "logging_steps": 1.0, "max_steps": 1575, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100000, "total_flos": 247329718272000.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }