{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02127659574468085, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 3.872, "step": 1 }, { "epoch": 0.0425531914893617, "grad_norm": 0.83203125, "learning_rate": 4e-05, "loss": 3.9714, "step": 2 }, { "epoch": 0.06382978723404255, "grad_norm": 0.82421875, "learning_rate": 6e-05, "loss": 3.9503, "step": 3 }, { "epoch": 0.0851063829787234, "grad_norm": 0.94921875, "learning_rate": 8e-05, "loss": 4.0784, "step": 4 }, { "epoch": 0.10638297872340426, "grad_norm": 0.94921875, "learning_rate": 0.0001, "loss": 3.9539, "step": 5 }, { "epoch": 0.1276595744680851, "grad_norm": 0.66796875, "learning_rate": 0.00012, "loss": 3.8024, "step": 6 }, { "epoch": 0.14893617021276595, "grad_norm": 1.2109375, "learning_rate": 0.00014, "loss": 3.6005, "step": 7 }, { "epoch": 0.1702127659574468, "grad_norm": 0.96484375, "learning_rate": 0.00016, "loss": 3.8633, "step": 8 }, { "epoch": 0.19148936170212766, "grad_norm": 0.64453125, "learning_rate": 0.00018, "loss": 3.6551, "step": 9 }, { "epoch": 0.2127659574468085, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 3.6645, "step": 10 }, { "epoch": 0.23404255319148937, "grad_norm": 0.34375, "learning_rate": 0.000199996316124771, "loss": 3.7208, "step": 11 }, { "epoch": 0.2553191489361702, "grad_norm": 0.345703125, "learning_rate": 0.0001999852647705027, "loss": 3.6193, "step": 12 }, { "epoch": 0.2765957446808511, "grad_norm": 0.283203125, "learning_rate": 0.0001999668467514313, "loss": 3.7057, "step": 13 }, { "epoch": 0.2978723404255319, "grad_norm": 0.234375, "learning_rate": 0.00019994106342455053, "loss": 3.5714, "step": 14 }, { "epoch": 0.3191489361702128, "grad_norm": 0.189453125, "learning_rate": 0.00019990791668951155, "loss": 3.582, "step": 15 }, { "epoch": 0.3404255319148936, "grad_norm": 0.28515625, "learning_rate": 0.00019986740898848306, "loss": 3.5228, "step": 16 }, { "epoch": 0.3617021276595745, "grad_norm": 0.271484375, "learning_rate": 0.00019981954330597143, "loss": 3.5893, "step": 17 }, { "epoch": 0.3829787234042553, "grad_norm": 0.279296875, "learning_rate": 0.00019976432316860067, "loss": 3.5203, "step": 18 }, { "epoch": 0.40425531914893614, "grad_norm": 0.21484375, "learning_rate": 0.00019970175264485266, "loss": 3.5939, "step": 19 }, { "epoch": 0.425531914893617, "grad_norm": 0.1796875, "learning_rate": 0.00019963183634476756, "loss": 3.5296, "step": 20 }, { "epoch": 0.44680851063829785, "grad_norm": 0.2138671875, "learning_rate": 0.00019955457941960383, "loss": 3.6242, "step": 21 }, { "epoch": 0.46808510638297873, "grad_norm": 0.240234375, "learning_rate": 0.0001994699875614589, "loss": 3.517, "step": 22 }, { "epoch": 0.48936170212765956, "grad_norm": 0.2158203125, "learning_rate": 0.00019937806700284986, "loss": 3.5748, "step": 23 }, { "epoch": 0.5106382978723404, "grad_norm": 0.1982421875, "learning_rate": 0.00019927882451625402, "loss": 3.5263, "step": 24 }, { "epoch": 0.5319148936170213, "grad_norm": 0.27734375, "learning_rate": 0.00019917226741361015, "loss": 3.5638, "step": 25 }, { "epoch": 0.5531914893617021, "grad_norm": 0.1748046875, "learning_rate": 0.00019905840354577972, "loss": 3.5424, "step": 26 }, { "epoch": 0.574468085106383, "grad_norm": 0.1982421875, "learning_rate": 0.00019893724130196828, "loss": 3.5726, "step": 27 }, { "epoch": 0.5957446808510638, "grad_norm": 0.2275390625, "learning_rate": 0.00019880878960910772, "loss": 3.5656, "step": 28 }, { "epoch": 0.6170212765957447, "grad_norm": 0.1904296875, "learning_rate": 0.00019867305793119816, "loss": 3.6008, "step": 29 }, { "epoch": 0.6382978723404256, "grad_norm": 0.228515625, "learning_rate": 0.0001985300562686109, "loss": 3.5136, "step": 30 }, { "epoch": 0.6595744680851063, "grad_norm": 0.2255859375, "learning_rate": 0.00019837979515735166, "loss": 3.5632, "step": 31 }, { "epoch": 0.6808510638297872, "grad_norm": 0.1884765625, "learning_rate": 0.0001982222856682841, "loss": 3.6284, "step": 32 }, { "epoch": 0.7021276595744681, "grad_norm": 0.2890625, "learning_rate": 0.0001980575394063143, "loss": 3.4885, "step": 33 }, { "epoch": 0.723404255319149, "grad_norm": 0.232421875, "learning_rate": 0.0001978855685095358, "loss": 3.6102, "step": 34 }, { "epoch": 0.7446808510638298, "grad_norm": 0.328125, "learning_rate": 0.0001977063856483351, "loss": 3.5844, "step": 35 }, { "epoch": 0.7659574468085106, "grad_norm": 0.302734375, "learning_rate": 0.00019752000402445825, "loss": 3.5097, "step": 36 }, { "epoch": 0.7872340425531915, "grad_norm": 0.30078125, "learning_rate": 0.00019732643737003827, "loss": 3.492, "step": 37 }, { "epoch": 0.8085106382978723, "grad_norm": 0.216796875, "learning_rate": 0.00019712569994658315, "loss": 3.6192, "step": 38 }, { "epoch": 0.8297872340425532, "grad_norm": 0.2314453125, "learning_rate": 0.00019691780654392535, "loss": 3.6314, "step": 39 }, { "epoch": 0.851063829787234, "grad_norm": 0.2421875, "learning_rate": 0.00019670277247913205, "loss": 3.6429, "step": 40 }, { "epoch": 0.8723404255319149, "grad_norm": 0.208984375, "learning_rate": 0.00019648061359537646, "loss": 3.5714, "step": 41 }, { "epoch": 0.8936170212765957, "grad_norm": 0.259765625, "learning_rate": 0.00019625134626077083, "loss": 3.574, "step": 42 }, { "epoch": 0.9148936170212766, "grad_norm": 0.224609375, "learning_rate": 0.00019601498736716017, "loss": 3.6269, "step": 43 }, { "epoch": 0.9361702127659575, "grad_norm": 0.349609375, "learning_rate": 0.00019577155432887804, "loss": 3.659, "step": 44 }, { "epoch": 0.9574468085106383, "grad_norm": 0.291015625, "learning_rate": 0.00019552106508146318, "loss": 3.6223, "step": 45 }, { "epoch": 0.9787234042553191, "grad_norm": 0.3828125, "learning_rate": 0.00019526353808033825, "loss": 3.6389, "step": 46 }, { "epoch": 1.0, "grad_norm": 0.5703125, "learning_rate": 0.00019499899229945012, "loss": 3.4551, "step": 47 }, { "epoch": 1.0212765957446808, "grad_norm": 1.203125, "learning_rate": 0.0001947274472298717, "loss": 3.2727, "step": 48 }, { "epoch": 1.0425531914893618, "grad_norm": 0.6015625, "learning_rate": 0.00019444892287836613, "loss": 3.3136, "step": 49 }, { "epoch": 1.0638297872340425, "grad_norm": 0.8125, "learning_rate": 0.00019416343976591261, "loss": 3.3188, "step": 50 }, { "epoch": 1.0851063829787233, "grad_norm": 1.09375, "learning_rate": 0.00019387101892619443, "loss": 3.424, "step": 51 }, { "epoch": 1.1063829787234043, "grad_norm": 0.28515625, "learning_rate": 0.00019357168190404936, "loss": 3.3676, "step": 52 }, { "epoch": 1.127659574468085, "grad_norm": 0.73828125, "learning_rate": 0.00019326545075388225, "loss": 3.3535, "step": 53 }, { "epoch": 1.148936170212766, "grad_norm": 0.73046875, "learning_rate": 0.00019295234803804004, "loss": 3.1686, "step": 54 }, { "epoch": 1.1702127659574468, "grad_norm": 0.234375, "learning_rate": 0.00019263239682514952, "loss": 3.3986, "step": 55 }, { "epoch": 1.1914893617021276, "grad_norm": 0.56640625, "learning_rate": 0.0001923056206884176, "loss": 3.2916, "step": 56 }, { "epoch": 1.2127659574468086, "grad_norm": 0.53515625, "learning_rate": 0.00019197204370389467, "loss": 3.3444, "step": 57 }, { "epoch": 1.2340425531914894, "grad_norm": 0.375, "learning_rate": 0.0001916316904487005, "loss": 3.3603, "step": 58 }, { "epoch": 1.2553191489361701, "grad_norm": 0.44140625, "learning_rate": 0.00019128458599921357, "loss": 3.308, "step": 59 }, { "epoch": 1.2765957446808511, "grad_norm": 0.7265625, "learning_rate": 0.00019093075592922358, "loss": 3.4154, "step": 60 }, { "epoch": 1.297872340425532, "grad_norm": 0.37890625, "learning_rate": 0.00019057022630804716, "loss": 3.3095, "step": 61 }, { "epoch": 1.3191489361702127, "grad_norm": 0.212890625, "learning_rate": 0.00019020302369860708, "loss": 3.3266, "step": 62 }, { "epoch": 1.3404255319148937, "grad_norm": 0.53125, "learning_rate": 0.0001898291751554753, "loss": 3.276, "step": 63 }, { "epoch": 1.3617021276595744, "grad_norm": 0.462890625, "learning_rate": 0.00018944870822287956, "loss": 3.3428, "step": 64 }, { "epoch": 1.3829787234042552, "grad_norm": 0.3125, "learning_rate": 0.00018906165093267405, "loss": 3.2515, "step": 65 }, { "epoch": 1.4042553191489362, "grad_norm": 0.232421875, "learning_rate": 0.00018866803180227402, "loss": 3.3125, "step": 66 }, { "epoch": 1.425531914893617, "grad_norm": 0.443359375, "learning_rate": 0.00018826787983255473, "loss": 3.2968, "step": 67 }, { "epoch": 1.4468085106382977, "grad_norm": 0.380859375, "learning_rate": 0.00018786122450571485, "loss": 3.3705, "step": 68 }, { "epoch": 1.4680851063829787, "grad_norm": 0.220703125, "learning_rate": 0.00018744809578310397, "loss": 3.2878, "step": 69 }, { "epoch": 1.4893617021276595, "grad_norm": 0.287109375, "learning_rate": 0.00018702852410301554, "loss": 3.3214, "step": 70 }, { "epoch": 1.5106382978723403, "grad_norm": 0.41796875, "learning_rate": 0.00018660254037844388, "loss": 3.2708, "step": 71 }, { "epoch": 1.5319148936170213, "grad_norm": 0.255859375, "learning_rate": 0.00018617017599480682, "loss": 3.3087, "step": 72 }, { "epoch": 1.5531914893617023, "grad_norm": 0.2431640625, "learning_rate": 0.00018573146280763324, "loss": 3.3227, "step": 73 }, { "epoch": 1.574468085106383, "grad_norm": 0.4375, "learning_rate": 0.000185286433140216, "loss": 3.3296, "step": 74 }, { "epoch": 1.5957446808510638, "grad_norm": 0.36328125, "learning_rate": 0.0001848351197812304, "loss": 3.3282, "step": 75 }, { "epoch": 1.6170212765957448, "grad_norm": 0.2294921875, "learning_rate": 0.00018437755598231856, "loss": 3.3421, "step": 76 }, { "epoch": 1.6382978723404256, "grad_norm": 0.345703125, "learning_rate": 0.00018391377545563938, "loss": 3.3002, "step": 77 }, { "epoch": 1.6595744680851063, "grad_norm": 0.349609375, "learning_rate": 0.00018344381237138472, "loss": 3.3293, "step": 78 }, { "epoch": 1.6808510638297873, "grad_norm": 0.255859375, "learning_rate": 0.0001829677013552619, "loss": 3.3688, "step": 79 }, { "epoch": 1.702127659574468, "grad_norm": 0.2431640625, "learning_rate": 0.00018248547748594244, "loss": 3.2586, "step": 80 }, { "epoch": 1.7234042553191489, "grad_norm": 0.2734375, "learning_rate": 0.00018199717629247773, "loss": 3.3783, "step": 81 }, { "epoch": 1.7446808510638299, "grad_norm": 0.326171875, "learning_rate": 0.00018150283375168114, "loss": 3.3503, "step": 82 }, { "epoch": 1.7659574468085106, "grad_norm": 0.28515625, "learning_rate": 0.0001810024862854775, "loss": 3.2862, "step": 83 }, { "epoch": 1.7872340425531914, "grad_norm": 0.337890625, "learning_rate": 0.00018049617075821962, "loss": 3.2503, "step": 84 }, { "epoch": 1.8085106382978724, "grad_norm": 0.28125, "learning_rate": 0.00017998392447397197, "loss": 3.3987, "step": 85 }, { "epoch": 1.8297872340425532, "grad_norm": 0.279296875, "learning_rate": 0.0001794657851737625, "loss": 3.3948, "step": 86 }, { "epoch": 1.851063829787234, "grad_norm": 0.29296875, "learning_rate": 0.00017894179103280198, "loss": 3.414, "step": 87 }, { "epoch": 1.872340425531915, "grad_norm": 0.267578125, "learning_rate": 0.00017841198065767107, "loss": 3.3495, "step": 88 }, { "epoch": 1.8936170212765957, "grad_norm": 0.28515625, "learning_rate": 0.00017787639308347608, "loss": 3.3357, "step": 89 }, { "epoch": 1.9148936170212765, "grad_norm": 0.359375, "learning_rate": 0.000177335067770973, "loss": 3.3956, "step": 90 }, { "epoch": 1.9361702127659575, "grad_norm": 0.2890625, "learning_rate": 0.00017678804460366, "loss": 3.4261, "step": 91 }, { "epoch": 1.9574468085106385, "grad_norm": 0.318359375, "learning_rate": 0.00017623536388483905, "loss": 3.3929, "step": 92 }, { "epoch": 1.978723404255319, "grad_norm": 0.33984375, "learning_rate": 0.00017567706633464628, "loss": 3.4055, "step": 93 }, { "epoch": 2.0, "grad_norm": 0.73046875, "learning_rate": 0.00017511319308705198, "loss": 3.0576, "step": 94 }, { "epoch": 2.021276595744681, "grad_norm": 0.6953125, "learning_rate": 0.00017454378568683003, "loss": 3.1095, "step": 95 }, { "epoch": 2.0425531914893615, "grad_norm": 0.68359375, "learning_rate": 0.0001739688860864967, "loss": 3.1669, "step": 96 }, { "epoch": 2.0638297872340425, "grad_norm": 0.306640625, "learning_rate": 0.00017338853664321992, "loss": 3.1293, "step": 97 }, { "epoch": 2.0851063829787235, "grad_norm": 0.86328125, "learning_rate": 0.00017280278011569847, "loss": 3.2461, "step": 98 }, { "epoch": 2.106382978723404, "grad_norm": 0.58203125, "learning_rate": 0.00017221165966101163, "loss": 3.2222, "step": 99 }, { "epoch": 2.127659574468085, "grad_norm": 0.298828125, "learning_rate": 0.00017161521883143934, "loss": 3.1956, "step": 100 }, { "epoch": 2.148936170212766, "grad_norm": 0.73046875, "learning_rate": 0.0001710135015712536, "loss": 3.0099, "step": 101 }, { "epoch": 2.1702127659574466, "grad_norm": 0.53515625, "learning_rate": 0.00017040655221348057, "loss": 3.2425, "step": 102 }, { "epoch": 2.1914893617021276, "grad_norm": 0.353515625, "learning_rate": 0.00016979441547663435, "loss": 3.1365, "step": 103 }, { "epoch": 2.2127659574468086, "grad_norm": 0.55859375, "learning_rate": 0.00016917713646142222, "loss": 3.1903, "step": 104 }, { "epoch": 2.2340425531914896, "grad_norm": 0.6328125, "learning_rate": 0.00016855476064742155, "loss": 3.1938, "step": 105 }, { "epoch": 2.25531914893617, "grad_norm": 0.310546875, "learning_rate": 0.00016792733388972932, "loss": 3.1561, "step": 106 }, { "epoch": 2.276595744680851, "grad_norm": 0.7421875, "learning_rate": 0.0001672949024155833, "loss": 3.259, "step": 107 }, { "epoch": 2.297872340425532, "grad_norm": 0.47265625, "learning_rate": 0.00016665751282095634, "loss": 3.1575, "step": 108 }, { "epoch": 2.3191489361702127, "grad_norm": 0.2578125, "learning_rate": 0.00016601521206712318, "loss": 3.1849, "step": 109 }, { "epoch": 2.3404255319148937, "grad_norm": 0.5703125, "learning_rate": 0.0001653680474772006, "loss": 3.1254, "step": 110 }, { "epoch": 2.3617021276595747, "grad_norm": 0.427734375, "learning_rate": 0.00016471606673266066, "loss": 3.1994, "step": 111 }, { "epoch": 2.382978723404255, "grad_norm": 0.283203125, "learning_rate": 0.00016405931786981755, "loss": 3.101, "step": 112 }, { "epoch": 2.404255319148936, "grad_norm": 0.3984375, "learning_rate": 0.00016339784927628867, "loss": 3.1611, "step": 113 }, { "epoch": 2.425531914893617, "grad_norm": 0.51171875, "learning_rate": 0.0001627317096874294, "loss": 3.1622, "step": 114 }, { "epoch": 2.4468085106382977, "grad_norm": 0.302734375, "learning_rate": 0.00016206094818274229, "loss": 3.2131, "step": 115 }, { "epoch": 2.4680851063829787, "grad_norm": 0.35546875, "learning_rate": 0.0001613856141822612, "loss": 3.1511, "step": 116 }, { "epoch": 2.4893617021276597, "grad_norm": 0.478515625, "learning_rate": 0.00016070575744291004, "loss": 3.1662, "step": 117 }, { "epoch": 2.5106382978723403, "grad_norm": 0.4140625, "learning_rate": 0.00016002142805483685, "loss": 3.1092, "step": 118 }, { "epoch": 2.5319148936170213, "grad_norm": 0.376953125, "learning_rate": 0.0001593326764377232, "loss": 3.1444, "step": 119 }, { "epoch": 2.5531914893617023, "grad_norm": 0.443359375, "learning_rate": 0.00015863955333706957, "loss": 3.1738, "step": 120 }, { "epoch": 2.574468085106383, "grad_norm": 0.470703125, "learning_rate": 0.00015794210982045636, "loss": 3.1766, "step": 121 }, { "epoch": 2.595744680851064, "grad_norm": 0.294921875, "learning_rate": 0.00015724039727378148, "loss": 3.166, "step": 122 }, { "epoch": 2.617021276595745, "grad_norm": 0.50390625, "learning_rate": 0.00015653446739747427, "loss": 3.1837, "step": 123 }, { "epoch": 2.6382978723404253, "grad_norm": 0.484375, "learning_rate": 0.00015582437220268647, "loss": 3.1519, "step": 124 }, { "epoch": 2.6595744680851063, "grad_norm": 0.326171875, "learning_rate": 0.00015511016400746, "loss": 3.165, "step": 125 }, { "epoch": 2.6808510638297873, "grad_norm": 0.431640625, "learning_rate": 0.00015439189543287247, "loss": 3.2062, "step": 126 }, { "epoch": 2.702127659574468, "grad_norm": 0.400390625, "learning_rate": 0.00015366961939916008, "loss": 3.0979, "step": 127 }, { "epoch": 2.723404255319149, "grad_norm": 0.359375, "learning_rate": 0.0001529433891218185, "loss": 3.217, "step": 128 }, { "epoch": 2.74468085106383, "grad_norm": 0.349609375, "learning_rate": 0.0001522132581076825, "loss": 3.1789, "step": 129 }, { "epoch": 2.7659574468085104, "grad_norm": 0.3984375, "learning_rate": 0.0001514792801509831, "loss": 3.1253, "step": 130 }, { "epoch": 2.7872340425531914, "grad_norm": 0.369140625, "learning_rate": 0.00015074150932938455, "loss": 3.0813, "step": 131 }, { "epoch": 2.8085106382978724, "grad_norm": 0.380859375, "learning_rate": 0.00015000000000000001, "loss": 3.2233, "step": 132 }, { "epoch": 2.829787234042553, "grad_norm": 0.404296875, "learning_rate": 0.00014925480679538647, "loss": 3.2241, "step": 133 }, { "epoch": 2.851063829787234, "grad_norm": 0.33984375, "learning_rate": 0.00014850598461951963, "loss": 3.2428, "step": 134 }, { "epoch": 2.872340425531915, "grad_norm": 0.404296875, "learning_rate": 0.00014775358864374885, "loss": 3.1833, "step": 135 }, { "epoch": 2.8936170212765955, "grad_norm": 0.412109375, "learning_rate": 0.000146997674302732, "loss": 3.162, "step": 136 }, { "epoch": 2.9148936170212765, "grad_norm": 0.3828125, "learning_rate": 0.0001462382972903515, "loss": 3.2095, "step": 137 }, { "epoch": 2.9361702127659575, "grad_norm": 0.51171875, "learning_rate": 0.0001454755135556106, "loss": 3.2355, "step": 138 }, { "epoch": 2.9574468085106385, "grad_norm": 0.466796875, "learning_rate": 0.0001447093792985114, "loss": 3.1969, "step": 139 }, { "epoch": 2.978723404255319, "grad_norm": 0.5703125, "learning_rate": 0.00014393995096591416, "loss": 3.2092, "step": 140 }, { "epoch": 3.0, "grad_norm": 0.984375, "learning_rate": 0.0001431672852473784, "loss": 2.7442, "step": 141 }, { "epoch": 3.021276595744681, "grad_norm": 0.8671875, "learning_rate": 0.0001423914390709861, "loss": 2.9817, "step": 142 }, { "epoch": 3.0425531914893615, "grad_norm": 1.03125, "learning_rate": 0.00014161246959914744, "loss": 3.0423, "step": 143 }, { "epoch": 3.0638297872340425, "grad_norm": 0.49609375, "learning_rate": 0.00014083043422438935, "loss": 2.9844, "step": 144 }, { "epoch": 3.0851063829787235, "grad_norm": 0.87890625, "learning_rate": 0.00014004539056512667, "loss": 3.0951, "step": 145 }, { "epoch": 3.106382978723404, "grad_norm": 0.8359375, "learning_rate": 0.0001392573964614172, "loss": 3.089, "step": 146 }, { "epoch": 3.127659574468085, "grad_norm": 0.44140625, "learning_rate": 0.00013846650997070012, "loss": 3.0649, "step": 147 }, { "epoch": 3.148936170212766, "grad_norm": 0.6328125, "learning_rate": 0.00013767278936351854, "loss": 2.8683, "step": 148 }, { "epoch": 3.1702127659574466, "grad_norm": 0.81640625, "learning_rate": 0.00013687629311922602, "loss": 3.1071, "step": 149 }, { "epoch": 3.1914893617021276, "grad_norm": 0.40234375, "learning_rate": 0.00013607707992167834, "loss": 3.0015, "step": 150 }, { "epoch": 3.2127659574468086, "grad_norm": 0.515625, "learning_rate": 0.0001352752086549095, "loss": 3.0506, "step": 151 }, { "epoch": 3.2340425531914896, "grad_norm": 0.80859375, "learning_rate": 0.0001344707383987934, "loss": 3.0533, "step": 152 }, { "epoch": 3.25531914893617, "grad_norm": 0.44140625, "learning_rate": 0.00013366372842469105, "loss": 3.0211, "step": 153 }, { "epoch": 3.276595744680851, "grad_norm": 0.478515625, "learning_rate": 0.0001328542381910835, "loss": 3.1129, "step": 154 }, { "epoch": 3.297872340425532, "grad_norm": 0.6171875, "learning_rate": 0.00013204232733919112, "loss": 3.0158, "step": 155 }, { "epoch": 3.3191489361702127, "grad_norm": 0.482421875, "learning_rate": 0.00013122805568857948, "loss": 3.0605, "step": 156 }, { "epoch": 3.3404255319148937, "grad_norm": 0.400390625, "learning_rate": 0.0001304114832327518, "loss": 2.9792, "step": 157 }, { "epoch": 3.3617021276595747, "grad_norm": 0.439453125, "learning_rate": 0.00012959267013472892, "loss": 3.0647, "step": 158 }, { "epoch": 3.382978723404255, "grad_norm": 0.48046875, "learning_rate": 0.0001287716767226167, "loss": 2.9722, "step": 159 }, { "epoch": 3.404255319148936, "grad_norm": 0.431640625, "learning_rate": 0.00012794856348516095, "loss": 3.0233, "step": 160 }, { "epoch": 3.425531914893617, "grad_norm": 0.458984375, "learning_rate": 0.000127123391067291, "loss": 3.0216, "step": 161 }, { "epoch": 3.4468085106382977, "grad_norm": 0.4921875, "learning_rate": 0.00012629622026565147, "loss": 3.0703, "step": 162 }, { "epoch": 3.4680851063829787, "grad_norm": 0.39453125, "learning_rate": 0.00012546711202412287, "loss": 3.0121, "step": 163 }, { "epoch": 3.4893617021276597, "grad_norm": 0.443359375, "learning_rate": 0.00012463612742933148, "loss": 3.0189, "step": 164 }, { "epoch": 3.5106382978723403, "grad_norm": 0.57421875, "learning_rate": 0.00012380332770614856, "loss": 2.9589, "step": 165 }, { "epoch": 3.5319148936170213, "grad_norm": 0.41015625, "learning_rate": 0.0001229687742131796, "loss": 2.9954, "step": 166 }, { "epoch": 3.5531914893617023, "grad_norm": 0.40234375, "learning_rate": 0.00012213252843824325, "loss": 3.0266, "step": 167 }, { "epoch": 3.574468085106383, "grad_norm": 0.54296875, "learning_rate": 0.00012129465199384157, "loss": 3.0273, "step": 168 }, { "epoch": 3.595744680851064, "grad_norm": 0.5234375, "learning_rate": 0.0001204552066126201, "loss": 3.0214, "step": 169 }, { "epoch": 3.617021276595745, "grad_norm": 0.400390625, "learning_rate": 0.0001196142541428197, "loss": 3.0232, "step": 170 }, { "epoch": 3.6382978723404253, "grad_norm": 0.49609375, "learning_rate": 0.00011877185654371987, "loss": 3.0004, "step": 171 }, { "epoch": 3.6595744680851063, "grad_norm": 0.515625, "learning_rate": 0.00011792807588107357, "loss": 3.0165, "step": 172 }, { "epoch": 3.6808510638297873, "grad_norm": 0.44921875, "learning_rate": 0.00011708297432253444, "loss": 3.0491, "step": 173 }, { "epoch": 3.702127659574468, "grad_norm": 0.41796875, "learning_rate": 0.00011623661413307639, "loss": 2.9456, "step": 174 }, { "epoch": 3.723404255319149, "grad_norm": 0.4921875, "learning_rate": 0.0001153890576704062, "loss": 3.0586, "step": 175 }, { "epoch": 3.74468085106383, "grad_norm": 0.4921875, "learning_rate": 0.00011454036738036899, "loss": 3.0125, "step": 176 }, { "epoch": 3.7659574468085104, "grad_norm": 0.427734375, "learning_rate": 0.00011369060579234754, "loss": 2.9722, "step": 177 }, { "epoch": 3.7872340425531914, "grad_norm": 0.498046875, "learning_rate": 0.00011283983551465511, "loss": 2.9201, "step": 178 }, { "epoch": 3.8085106382978724, "grad_norm": 0.4765625, "learning_rate": 0.00011198811922992274, "loss": 3.0565, "step": 179 }, { "epoch": 3.829787234042553, "grad_norm": 0.462890625, "learning_rate": 0.00011113551969048089, "loss": 3.0615, "step": 180 }, { "epoch": 3.851063829787234, "grad_norm": 0.474609375, "learning_rate": 0.00011028209971373605, "loss": 3.0731, "step": 181 }, { "epoch": 3.872340425531915, "grad_norm": 0.46484375, "learning_rate": 0.00010942792217754245, "loss": 3.0144, "step": 182 }, { "epoch": 3.8936170212765955, "grad_norm": 0.50390625, "learning_rate": 0.00010857305001556944, "loss": 2.9905, "step": 183 }, { "epoch": 3.9148936170212765, "grad_norm": 0.546875, "learning_rate": 0.00010771754621266466, "loss": 3.0232, "step": 184 }, { "epoch": 3.9361702127659575, "grad_norm": 0.53515625, "learning_rate": 0.00010686147380021342, "loss": 3.0408, "step": 185 }, { "epoch": 3.9574468085106385, "grad_norm": 0.609375, "learning_rate": 0.00010600489585149484, "loss": 2.9963, "step": 186 }, { "epoch": 3.978723404255319, "grad_norm": 0.6875, "learning_rate": 0.00010514787547703466, "loss": 3.0049, "step": 187 }, { "epoch": 4.0, "grad_norm": 1.1015625, "learning_rate": 0.00010429047581995546, "loss": 2.4433, "step": 188 }, { "epoch": 4.0212765957446805, "grad_norm": 0.5859375, "learning_rate": 0.00010343276005132436, "loss": 2.8295, "step": 189 }, { "epoch": 4.042553191489362, "grad_norm": 0.8359375, "learning_rate": 0.00010257479136549889, "loss": 2.8904, "step": 190 }, { "epoch": 4.0638297872340425, "grad_norm": 0.796875, "learning_rate": 0.00010171663297547076, "loss": 2.834, "step": 191 }, { "epoch": 4.085106382978723, "grad_norm": 0.498046875, "learning_rate": 0.00010085834810820871, "loss": 2.9309, "step": 192 }, { "epoch": 4.1063829787234045, "grad_norm": 0.5703125, "learning_rate": 0.0001, "loss": 2.9461, "step": 193 }, { "epoch": 4.127659574468085, "grad_norm": 0.65625, "learning_rate": 9.914165189179131e-05, "loss": 2.9405, "step": 194 }, { "epoch": 4.148936170212766, "grad_norm": 0.546875, "learning_rate": 9.828336702452927e-05, "loss": 2.7445, "step": 195 }, { "epoch": 4.170212765957447, "grad_norm": 0.50390625, "learning_rate": 9.742520863450115e-05, "loss": 2.963, "step": 196 }, { "epoch": 4.191489361702128, "grad_norm": 0.59375, "learning_rate": 9.656723994867566e-05, "loss": 2.8778, "step": 197 }, { "epoch": 4.212765957446808, "grad_norm": 0.58984375, "learning_rate": 9.570952418004455e-05, "loss": 2.9148, "step": 198 }, { "epoch": 4.23404255319149, "grad_norm": 0.5546875, "learning_rate": 9.485212452296535e-05, "loss": 2.9028, "step": 199 }, { "epoch": 4.25531914893617, "grad_norm": 0.62109375, "learning_rate": 9.399510414850518e-05, "loss": 2.898, "step": 200 }, { "epoch": 4.276595744680851, "grad_norm": 0.61328125, "learning_rate": 9.313852619978659e-05, "loss": 2.9883, "step": 201 }, { "epoch": 4.297872340425532, "grad_norm": 0.51171875, "learning_rate": 9.228245378733537e-05, "loss": 2.886, "step": 202 }, { "epoch": 4.319148936170213, "grad_norm": 0.5546875, "learning_rate": 9.142694998443056e-05, "loss": 2.9453, "step": 203 }, { "epoch": 4.340425531914893, "grad_norm": 0.58984375, "learning_rate": 9.057207782245757e-05, "loss": 2.8555, "step": 204 }, { "epoch": 4.361702127659575, "grad_norm": 0.56640625, "learning_rate": 8.971790028626395e-05, "loss": 2.9359, "step": 205 }, { "epoch": 4.382978723404255, "grad_norm": 0.45703125, "learning_rate": 8.886448030951912e-05, "loss": 2.8469, "step": 206 }, { "epoch": 4.404255319148936, "grad_norm": 0.6015625, "learning_rate": 8.801188077007728e-05, "loss": 2.8963, "step": 207 }, { "epoch": 4.425531914893617, "grad_norm": 0.54296875, "learning_rate": 8.71601644853449e-05, "loss": 2.8965, "step": 208 }, { "epoch": 4.446808510638298, "grad_norm": 0.57421875, "learning_rate": 8.630939420765247e-05, "loss": 2.9457, "step": 209 }, { "epoch": 4.468085106382979, "grad_norm": 0.5234375, "learning_rate": 8.545963261963102e-05, "loss": 2.8918, "step": 210 }, { "epoch": 4.48936170212766, "grad_norm": 0.60546875, "learning_rate": 8.461094232959381e-05, "loss": 2.8957, "step": 211 }, { "epoch": 4.51063829787234, "grad_norm": 0.6484375, "learning_rate": 8.376338586692366e-05, "loss": 2.8224, "step": 212 }, { "epoch": 4.531914893617021, "grad_norm": 0.5703125, "learning_rate": 8.29170256774656e-05, "loss": 2.859, "step": 213 }, { "epoch": 4.553191489361702, "grad_norm": 0.58203125, "learning_rate": 8.207192411892646e-05, "loss": 2.8896, "step": 214 }, { "epoch": 4.574468085106383, "grad_norm": 0.58203125, "learning_rate": 8.122814345628016e-05, "loss": 2.8874, "step": 215 }, { "epoch": 4.595744680851064, "grad_norm": 0.56640625, "learning_rate": 8.038574585718032e-05, "loss": 2.8869, "step": 216 }, { "epoch": 4.617021276595745, "grad_norm": 0.5703125, "learning_rate": 7.954479338737995e-05, "loss": 2.8923, "step": 217 }, { "epoch": 4.638297872340425, "grad_norm": 0.59765625, "learning_rate": 7.870534800615845e-05, "loss": 2.868, "step": 218 }, { "epoch": 4.659574468085106, "grad_norm": 0.625, "learning_rate": 7.786747156175676e-05, "loss": 2.8831, "step": 219 }, { "epoch": 4.680851063829787, "grad_norm": 0.578125, "learning_rate": 7.703122578682046e-05, "loss": 2.9084, "step": 220 }, { "epoch": 4.702127659574468, "grad_norm": 0.546875, "learning_rate": 7.619667229385146e-05, "loss": 2.8085, "step": 221 }, { "epoch": 4.723404255319149, "grad_norm": 0.6875, "learning_rate": 7.536387257066854e-05, "loss": 2.92, "step": 222 }, { "epoch": 4.74468085106383, "grad_norm": 0.65234375, "learning_rate": 7.453288797587714e-05, "loss": 2.8661, "step": 223 }, { "epoch": 4.76595744680851, "grad_norm": 0.55859375, "learning_rate": 7.370377973434855e-05, "loss": 2.8322, "step": 224 }, { "epoch": 4.787234042553192, "grad_norm": 0.5859375, "learning_rate": 7.2876608932709e-05, "loss": 2.772, "step": 225 }, { "epoch": 4.808510638297872, "grad_norm": 0.71875, "learning_rate": 7.205143651483906e-05, "loss": 2.905, "step": 226 }, { "epoch": 4.829787234042553, "grad_norm": 0.703125, "learning_rate": 7.122832327738331e-05, "loss": 2.9116, "step": 227 }, { "epoch": 4.851063829787234, "grad_norm": 0.61328125, "learning_rate": 7.040732986527108e-05, "loss": 2.9203, "step": 228 }, { "epoch": 4.872340425531915, "grad_norm": 0.6328125, "learning_rate": 6.958851676724823e-05, "loss": 2.8646, "step": 229 }, { "epoch": 4.8936170212765955, "grad_norm": 0.69140625, "learning_rate": 6.877194431142055e-05, "loss": 2.844, "step": 230 }, { "epoch": 4.914893617021277, "grad_norm": 0.73828125, "learning_rate": 6.79576726608089e-05, "loss": 2.8604, "step": 231 }, { "epoch": 4.9361702127659575, "grad_norm": 0.703125, "learning_rate": 6.714576180891654e-05, "loss": 2.8686, "step": 232 }, { "epoch": 4.957446808510638, "grad_norm": 0.72265625, "learning_rate": 6.633627157530899e-05, "loss": 2.8085, "step": 233 }, { "epoch": 4.9787234042553195, "grad_norm": 0.75390625, "learning_rate": 6.552926160120663e-05, "loss": 2.8017, "step": 234 }, { "epoch": 5.0, "grad_norm": 1.328125, "learning_rate": 6.472479134509052e-05, "loss": 2.189, "step": 235 }, { "epoch": 5.0212765957446805, "grad_norm": 0.8125, "learning_rate": 6.392292007832168e-05, "loss": 2.7068, "step": 236 }, { "epoch": 5.042553191489362, "grad_norm": 0.85546875, "learning_rate": 6.312370688077399e-05, "loss": 2.7591, "step": 237 }, { "epoch": 5.0638297872340425, "grad_norm": 0.984375, "learning_rate": 6.232721063648148e-05, "loss": 2.7161, "step": 238 }, { "epoch": 5.085106382978723, "grad_norm": 0.78515625, "learning_rate": 6.153349002929987e-05, "loss": 2.8126, "step": 239 }, { "epoch": 5.1063829787234045, "grad_norm": 0.7265625, "learning_rate": 6.0742603538582835e-05, "loss": 2.8485, "step": 240 }, { "epoch": 5.127659574468085, "grad_norm": 0.578125, "learning_rate": 5.9954609434873344e-05, "loss": 2.8336, "step": 241 }, { "epoch": 5.148936170212766, "grad_norm": 0.6171875, "learning_rate": 5.9169565775610656e-05, "loss": 2.6482, "step": 242 }, { "epoch": 5.170212765957447, "grad_norm": 0.66015625, "learning_rate": 5.838753040085256e-05, "loss": 2.8597, "step": 243 }, { "epoch": 5.191489361702128, "grad_norm": 0.6015625, "learning_rate": 5.7608560929013946e-05, "loss": 2.7875, "step": 244 }, { "epoch": 5.212765957446808, "grad_norm": 0.6015625, "learning_rate": 5.683271475262164e-05, "loss": 2.822, "step": 245 }, { "epoch": 5.23404255319149, "grad_norm": 0.7109375, "learning_rate": 5.6060049034085815e-05, "loss": 2.8034, "step": 246 }, { "epoch": 5.25531914893617, "grad_norm": 0.5859375, "learning_rate": 5.5290620701488594e-05, "loss": 2.7899, "step": 247 }, { "epoch": 5.276595744680851, "grad_norm": 0.65625, "learning_rate": 5.452448644438946e-05, "loss": 2.8848, "step": 248 }, { "epoch": 5.297872340425532, "grad_norm": 0.63671875, "learning_rate": 5.3761702709648556e-05, "loss": 2.7907, "step": 249 }, { "epoch": 5.319148936170213, "grad_norm": 0.65625, "learning_rate": 5.300232569726804e-05, "loss": 2.8616, "step": 250 }, { "epoch": 5.340425531914893, "grad_norm": 0.6953125, "learning_rate": 5.224641135625119e-05, "loss": 2.7745, "step": 251 }, { "epoch": 5.361702127659575, "grad_norm": 0.671875, "learning_rate": 5.1494015380480396e-05, "loss": 2.8555, "step": 252 }, { "epoch": 5.382978723404255, "grad_norm": 0.63671875, "learning_rate": 5.074519320461357e-05, "loss": 2.7636, "step": 253 }, { "epoch": 5.404255319148936, "grad_norm": 0.625, "learning_rate": 5.000000000000002e-05, "loss": 2.8076, "step": 254 }, { "epoch": 5.425531914893617, "grad_norm": 0.56640625, "learning_rate": 4.9258490670615475e-05, "loss": 2.8087, "step": 255 }, { "epoch": 5.446808510638298, "grad_norm": 0.55859375, "learning_rate": 4.852071984901696e-05, "loss": 2.8507, "step": 256 }, { "epoch": 5.468085106382979, "grad_norm": 0.5546875, "learning_rate": 4.778674189231751e-05, "loss": 2.7981, "step": 257 }, { "epoch": 5.48936170212766, "grad_norm": 0.58984375, "learning_rate": 4.7056610878181486e-05, "loss": 2.8039, "step": 258 }, { "epoch": 5.51063829787234, "grad_norm": 0.65234375, "learning_rate": 4.633038060083996e-05, "loss": 2.7239, "step": 259 }, { "epoch": 5.531914893617021, "grad_norm": 0.609375, "learning_rate": 4.560810456712754e-05, "loss": 2.7612, "step": 260 }, { "epoch": 5.553191489361702, "grad_norm": 0.58203125, "learning_rate": 4.488983599254001e-05, "loss": 2.7895, "step": 261 }, { "epoch": 5.574468085106383, "grad_norm": 0.63671875, "learning_rate": 4.417562779731355e-05, "loss": 2.7883, "step": 262 }, { "epoch": 5.595744680851064, "grad_norm": 0.69140625, "learning_rate": 4.346553260252574e-05, "loss": 2.7913, "step": 263 }, { "epoch": 5.617021276595745, "grad_norm": 0.7109375, "learning_rate": 4.275960272621852e-05, "loss": 2.7905, "step": 264 }, { "epoch": 5.638297872340425, "grad_norm": 0.703125, "learning_rate": 4.205789017954364e-05, "loss": 2.7663, "step": 265 }, { "epoch": 5.659574468085106, "grad_norm": 0.69140625, "learning_rate": 4.136044666293044e-05, "loss": 2.7839, "step": 266 }, { "epoch": 5.680851063829787, "grad_norm": 0.6640625, "learning_rate": 4.0667323562276814e-05, "loss": 2.7986, "step": 267 }, { "epoch": 5.702127659574468, "grad_norm": 0.6484375, "learning_rate": 3.997857194516319e-05, "loss": 2.7071, "step": 268 }, { "epoch": 5.723404255319149, "grad_norm": 0.64453125, "learning_rate": 3.929424255708999e-05, "loss": 2.8141, "step": 269 }, { "epoch": 5.74468085106383, "grad_norm": 0.69140625, "learning_rate": 3.8614385817738794e-05, "loss": 2.7508, "step": 270 }, { "epoch": 5.76595744680851, "grad_norm": 0.6484375, "learning_rate": 3.793905181725772e-05, "loss": 2.7273, "step": 271 }, { "epoch": 5.787234042553192, "grad_norm": 0.68359375, "learning_rate": 3.726829031257062e-05, "loss": 2.6695, "step": 272 }, { "epoch": 5.808510638297872, "grad_norm": 0.6796875, "learning_rate": 3.660215072371135e-05, "loss": 2.7872, "step": 273 }, { "epoch": 5.829787234042553, "grad_norm": 0.6875, "learning_rate": 3.594068213018249e-05, "loss": 2.7969, "step": 274 }, { "epoch": 5.851063829787234, "grad_norm": 0.703125, "learning_rate": 3.528393326733941e-05, "loss": 2.8035, "step": 275 }, { "epoch": 5.872340425531915, "grad_norm": 0.71484375, "learning_rate": 3.463195252279939e-05, "loss": 2.7496, "step": 276 }, { "epoch": 5.8936170212765955, "grad_norm": 0.796875, "learning_rate": 3.3984787932876814e-05, "loss": 2.7365, "step": 277 }, { "epoch": 5.914893617021277, "grad_norm": 0.8046875, "learning_rate": 3.334248717904368e-05, "loss": 2.7371, "step": 278 }, { "epoch": 5.9361702127659575, "grad_norm": 0.81640625, "learning_rate": 3.270509758441671e-05, "loss": 2.7465, "step": 279 }, { "epoch": 5.957446808510638, "grad_norm": 0.8828125, "learning_rate": 3.207266611027069e-05, "loss": 2.6859, "step": 280 }, { "epoch": 5.9787234042553195, "grad_norm": 0.9375, "learning_rate": 3.144523935257846e-05, "loss": 2.6722, "step": 281 }, { "epoch": 6.0, "grad_norm": 1.375, "learning_rate": 3.082286353857782e-05, "loss": 2.0584, "step": 282 }, { "epoch": 6.0212765957446805, "grad_norm": 0.69921875, "learning_rate": 3.0205584523365626e-05, "loss": 2.6076, "step": 283 }, { "epoch": 6.042553191489362, "grad_norm": 0.73046875, "learning_rate": 2.9593447786519425e-05, "loss": 2.6513, "step": 284 }, { "epoch": 6.0638297872340425, "grad_norm": 0.859375, "learning_rate": 2.8986498428746444e-05, "loss": 2.6075, "step": 285 }, { "epoch": 6.085106382978723, "grad_norm": 0.94140625, "learning_rate": 2.8384781168560693e-05, "loss": 2.7151, "step": 286 }, { "epoch": 6.1063829787234045, "grad_norm": 1.0390625, "learning_rate": 2.7788340338988385e-05, "loss": 2.7812, "step": 287 }, { "epoch": 6.127659574468085, "grad_norm": 1.1015625, "learning_rate": 2.719721988430153e-05, "loss": 2.7936, "step": 288 }, { "epoch": 6.148936170212766, "grad_norm": 1.0, "learning_rate": 2.6611463356780096e-05, "loss": 2.6086, "step": 289 }, { "epoch": 6.170212765957447, "grad_norm": 0.984375, "learning_rate": 2.6031113913503337e-05, "loss": 2.8151, "step": 290 }, { "epoch": 6.191489361702128, "grad_norm": 0.734375, "learning_rate": 2.5456214313170002e-05, "loss": 2.7246, "step": 291 }, { "epoch": 6.212765957446808, "grad_norm": 0.69140625, "learning_rate": 2.4886806912948035e-05, "loss": 2.7524, "step": 292 }, { "epoch": 6.23404255319149, "grad_norm": 0.75390625, "learning_rate": 2.4322933665353776e-05, "loss": 2.7285, "step": 293 }, { "epoch": 6.25531914893617, "grad_norm": 0.66015625, "learning_rate": 2.3764636115160978e-05, "loss": 2.7237, "step": 294 }, { "epoch": 6.276595744680851, "grad_norm": 0.734375, "learning_rate": 2.3211955396340002e-05, "loss": 2.818, "step": 295 }, { "epoch": 6.297872340425532, "grad_norm": 0.6796875, "learning_rate": 2.2664932229027024e-05, "loss": 2.7163, "step": 296 }, { "epoch": 6.319148936170213, "grad_norm": 0.609375, "learning_rate": 2.2123606916523953e-05, "loss": 2.7859, "step": 297 }, { "epoch": 6.340425531914893, "grad_norm": 0.64453125, "learning_rate": 2.1588019342328968e-05, "loss": 2.6892, "step": 298 }, { "epoch": 6.361702127659575, "grad_norm": 0.6328125, "learning_rate": 2.1058208967198045e-05, "loss": 2.767, "step": 299 }, { "epoch": 6.382978723404255, "grad_norm": 0.62109375, "learning_rate": 2.0534214826237484e-05, "loss": 2.6933, "step": 300 }, { "epoch": 6.404255319148936, "grad_norm": 0.62890625, "learning_rate": 2.0016075526028065e-05, "loss": 2.7303, "step": 301 }, { "epoch": 6.425531914893617, "grad_norm": 0.5703125, "learning_rate": 1.9503829241780412e-05, "loss": 2.7377, "step": 302 }, { "epoch": 6.446808510638298, "grad_norm": 0.6171875, "learning_rate": 1.8997513714522487e-05, "loss": 2.7818, "step": 303 }, { "epoch": 6.468085106382979, "grad_norm": 0.59765625, "learning_rate": 1.8497166248318876e-05, "loss": 2.7335, "step": 304 }, { "epoch": 6.48936170212766, "grad_norm": 0.60546875, "learning_rate": 1.8002823707522297e-05, "loss": 2.733, "step": 305 }, { "epoch": 6.51063829787234, "grad_norm": 0.66796875, "learning_rate": 1.7514522514057553e-05, "loss": 2.6446, "step": 306 }, { "epoch": 6.531914893617021, "grad_norm": 0.6484375, "learning_rate": 1.703229864473811e-05, "loss": 2.6907, "step": 307 }, { "epoch": 6.553191489361702, "grad_norm": 0.609375, "learning_rate": 1.6556187628615273e-05, "loss": 2.7176, "step": 308 }, { "epoch": 6.574468085106383, "grad_norm": 0.6484375, "learning_rate": 1.608622454436062e-05, "loss": 2.7109, "step": 309 }, { "epoch": 6.595744680851064, "grad_norm": 0.65625, "learning_rate": 1.562244401768144e-05, "loss": 2.7085, "step": 310 }, { "epoch": 6.617021276595745, "grad_norm": 0.65625, "learning_rate": 1.5164880218769618e-05, "loss": 2.6987, "step": 311 }, { "epoch": 6.638297872340425, "grad_norm": 0.62890625, "learning_rate": 1.4713566859784045e-05, "loss": 2.6835, "step": 312 }, { "epoch": 6.659574468085106, "grad_norm": 0.6328125, "learning_rate": 1.426853719236676e-05, "loss": 2.6981, "step": 313 }, { "epoch": 6.680851063829787, "grad_norm": 0.640625, "learning_rate": 1.3829824005193181e-05, "loss": 2.7132, "step": 314 }, { "epoch": 6.702127659574468, "grad_norm": 0.62890625, "learning_rate": 1.339745962155613e-05, "loss": 2.6319, "step": 315 }, { "epoch": 6.723404255319149, "grad_norm": 0.63671875, "learning_rate": 1.2971475896984475e-05, "loss": 2.7332, "step": 316 }, { "epoch": 6.74468085106383, "grad_norm": 0.68359375, "learning_rate": 1.2551904216896037e-05, "loss": 2.6649, "step": 317 }, { "epoch": 6.76595744680851, "grad_norm": 0.6484375, "learning_rate": 1.2138775494285182e-05, "loss": 2.6486, "step": 318 }, { "epoch": 6.787234042553192, "grad_norm": 0.69140625, "learning_rate": 1.1732120167445248e-05, "loss": 2.5875, "step": 319 }, { "epoch": 6.808510638297872, "grad_norm": 0.7109375, "learning_rate": 1.1331968197725984e-05, "loss": 2.7079, "step": 320 }, { "epoch": 6.829787234042553, "grad_norm": 0.70703125, "learning_rate": 1.0938349067325959e-05, "loss": 2.7134, "step": 321 }, { "epoch": 6.851063829787234, "grad_norm": 0.71875, "learning_rate": 1.0551291777120464e-05, "loss": 2.7199, "step": 322 }, { "epoch": 6.872340425531915, "grad_norm": 0.703125, "learning_rate": 1.0170824844524728e-05, "loss": 2.6655, "step": 323 }, { "epoch": 6.8936170212765955, "grad_norm": 0.73046875, "learning_rate": 9.796976301392934e-06, "loss": 2.6519, "step": 324 }, { "epoch": 6.914893617021277, "grad_norm": 0.75390625, "learning_rate": 9.429773691952858e-06, "loss": 2.6443, "step": 325 }, { "epoch": 6.9361702127659575, "grad_norm": 0.79296875, "learning_rate": 9.069244070776428e-06, "loss": 2.6531, "step": 326 }, { "epoch": 6.957446808510638, "grad_norm": 0.86328125, "learning_rate": 8.715414000786448e-06, "loss": 2.5897, "step": 327 }, { "epoch": 6.9787234042553195, "grad_norm": 1.03125, "learning_rate": 8.368309551299536e-06, "loss": 2.5772, "step": 328 }, { "epoch": 7.0, "grad_norm": 1.296875, "learning_rate": 8.027956296105354e-06, "loss": 1.9731, "step": 329 }, { "epoch": 7.0212765957446805, "grad_norm": 0.75, "learning_rate": 7.6943793115824e-06, "loss": 2.5669, "step": 330 }, { "epoch": 7.042553191489362, "grad_norm": 0.703125, "learning_rate": 7.367603174850502e-06, "loss": 2.6154, "step": 331 }, { "epoch": 7.0638297872340425, "grad_norm": 0.72265625, "learning_rate": 7.047651961959978e-06, "loss": 2.5542, "step": 332 }, { "epoch": 7.085106382978723, "grad_norm": 0.74609375, "learning_rate": 6.73454924611776e-06, "loss": 2.6428, "step": 333 }, { "epoch": 7.1063829787234045, "grad_norm": 0.64453125, "learning_rate": 6.428318095950647e-06, "loss": 2.6929, "step": 334 }, { "epoch": 7.127659574468085, "grad_norm": 0.63671875, "learning_rate": 6.128981073805584e-06, "loss": 2.6994, "step": 335 }, { "epoch": 7.148936170212766, "grad_norm": 0.67578125, "learning_rate": 5.836560234087418e-06, "loss": 2.5162, "step": 336 }, { "epoch": 7.170212765957447, "grad_norm": 0.76953125, "learning_rate": 5.551077121633874e-06, "loss": 2.7308, "step": 337 }, { "epoch": 7.191489361702128, "grad_norm": 0.734375, "learning_rate": 5.272552770128314e-06, "loss": 2.6655, "step": 338 }, { "epoch": 7.212765957446808, "grad_norm": 0.7578125, "learning_rate": 5.001007700549898e-06, "loss": 2.7014, "step": 339 }, { "epoch": 7.23404255319149, "grad_norm": 0.80078125, "learning_rate": 4.7364619196617495e-06, "loss": 2.6704, "step": 340 }, { "epoch": 7.25531914893617, "grad_norm": 0.734375, "learning_rate": 4.478934918536837e-06, "loss": 2.6756, "step": 341 }, { "epoch": 7.276595744680851, "grad_norm": 0.7578125, "learning_rate": 4.228445671121972e-06, "loss": 2.7574, "step": 342 }, { "epoch": 7.297872340425532, "grad_norm": 0.6640625, "learning_rate": 3.985012632839824e-06, "loss": 2.6565, "step": 343 }, { "epoch": 7.319148936170213, "grad_norm": 0.69140625, "learning_rate": 3.748653739229191e-06, "loss": 2.7389, "step": 344 }, { "epoch": 7.340425531914893, "grad_norm": 0.68359375, "learning_rate": 3.519386404623537e-06, "loss": 2.6382, "step": 345 }, { "epoch": 7.361702127659575, "grad_norm": 0.73828125, "learning_rate": 3.2972275208679625e-06, "loss": 2.7147, "step": 346 }, { "epoch": 7.382978723404255, "grad_norm": 0.7421875, "learning_rate": 3.0821934560746447e-06, "loss": 2.6497, "step": 347 }, { "epoch": 7.404255319148936, "grad_norm": 0.76171875, "learning_rate": 2.8743000534168675e-06, "loss": 2.6844, "step": 348 }, { "epoch": 7.425531914893617, "grad_norm": 0.72265625, "learning_rate": 2.6735626299617457e-06, "loss": 2.6961, "step": 349 }, { "epoch": 7.446808510638298, "grad_norm": 0.7109375, "learning_rate": 2.479995975541749e-06, "loss": 2.7341, "step": 350 }, { "epoch": 7.468085106382979, "grad_norm": 0.7109375, "learning_rate": 2.2936143516649188e-06, "loss": 2.6872, "step": 351 }, { "epoch": 7.48936170212766, "grad_norm": 0.69921875, "learning_rate": 2.1144314904642195e-06, "loss": 2.6879, "step": 352 }, { "epoch": 7.51063829787234, "grad_norm": 0.7109375, "learning_rate": 1.942460593685713e-06, "loss": 2.5916, "step": 353 }, { "epoch": 7.531914893617021, "grad_norm": 0.72265625, "learning_rate": 1.7777143317159406e-06, "loss": 2.643, "step": 354 }, { "epoch": 7.553191489361702, "grad_norm": 0.671875, "learning_rate": 1.6202048426483651e-06, "loss": 2.6724, "step": 355 }, { "epoch": 7.574468085106383, "grad_norm": 0.6875, "learning_rate": 1.4699437313891007e-06, "loss": 2.6634, "step": 356 }, { "epoch": 7.595744680851064, "grad_norm": 0.7265625, "learning_rate": 1.3269420688018508e-06, "loss": 2.6651, "step": 357 }, { "epoch": 7.617021276595745, "grad_norm": 0.70703125, "learning_rate": 1.1912103908922945e-06, "loss": 2.6545, "step": 358 }, { "epoch": 7.638297872340425, "grad_norm": 0.69921875, "learning_rate": 1.0627586980317073e-06, "loss": 2.6455, "step": 359 }, { "epoch": 7.659574468085106, "grad_norm": 0.69921875, "learning_rate": 9.415964542203059e-07, "loss": 2.6622, "step": 360 }, { "epoch": 7.680851063829787, "grad_norm": 0.7265625, "learning_rate": 8.277325863898511e-07, "loss": 2.6787, "step": 361 }, { "epoch": 7.702127659574468, "grad_norm": 0.7109375, "learning_rate": 7.21175483745995e-07, "loss": 2.6004, "step": 362 }, { "epoch": 7.723404255319149, "grad_norm": 0.7265625, "learning_rate": 6.219329971501653e-07, "loss": 2.7023, "step": 363 }, { "epoch": 7.74468085106383, "grad_norm": 0.73828125, "learning_rate": 5.300124385410943e-07, "loss": 2.6309, "step": 364 }, { "epoch": 7.76595744680851, "grad_norm": 0.6953125, "learning_rate": 4.4542058039619417e-07, "loss": 2.6197, "step": 365 }, { "epoch": 7.787234042553192, "grad_norm": 0.75, "learning_rate": 3.681636552324452e-07, "loss": 2.5579, "step": 366 }, { "epoch": 7.808510638297872, "grad_norm": 0.78515625, "learning_rate": 2.9824735514732974e-07, "loss": 2.6765, "step": 367 }, { "epoch": 7.829787234042553, "grad_norm": 0.8203125, "learning_rate": 2.3567683139936735e-07, "loss": 2.687, "step": 368 }, { "epoch": 7.851063829787234, "grad_norm": 0.796875, "learning_rate": 1.8045669402859677e-07, "loss": 2.6924, "step": 369 }, { "epoch": 7.872340425531915, "grad_norm": 0.79296875, "learning_rate": 1.3259101151694708e-07, "loss": 2.6409, "step": 370 }, { "epoch": 7.8936170212765955, "grad_norm": 0.78125, "learning_rate": 9.208331048846663e-08, "loss": 2.6251, "step": 371 }, { "epoch": 7.914893617021277, "grad_norm": 0.77734375, "learning_rate": 5.893657544947528e-08, "loss": 2.616, "step": 372 }, { "epoch": 7.9361702127659575, "grad_norm": 0.8515625, "learning_rate": 3.3153248568695835e-08, "loss": 2.626, "step": 373 }, { "epoch": 7.957446808510638, "grad_norm": 0.81640625, "learning_rate": 1.47352294973091e-08, "loss": 2.5585, "step": 374 }, { "epoch": 7.9787234042553195, "grad_norm": 0.89453125, "learning_rate": 3.6838752290102585e-09, "loss": 2.543, "step": 375 }, { "epoch": 8.0, "grad_norm": 1.046875, "learning_rate": 0.0, "loss": 1.9425, "step": 376 } ], "logging_steps": 1, "max_steps": 376, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 47, "total_flos": 5.59507839123456e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }