|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.0, |
|
"eval_steps": 500, |
|
"global_step": 376, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02127659574468085, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2e-05, |
|
"loss": 3.872, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0425531914893617, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4e-05, |
|
"loss": 3.9714, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 6e-05, |
|
"loss": 3.9503, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 8e-05, |
|
"loss": 4.0784, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001, |
|
"loss": 3.9539, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00012, |
|
"loss": 3.8024, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.14893617021276595, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00014, |
|
"loss": 3.6005, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00016, |
|
"loss": 3.8633, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00018, |
|
"loss": 3.6551, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0002, |
|
"loss": 3.6645, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23404255319148937, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.000199996316124771, |
|
"loss": 3.7208, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001999852647705027, |
|
"loss": 3.6193, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.2765957446808511, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001999668467514313, |
|
"loss": 3.7057, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.2978723404255319, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00019994106342455053, |
|
"loss": 3.5714, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.00019990791668951155, |
|
"loss": 3.582, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019986740898848306, |
|
"loss": 3.5228, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3617021276595745, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00019981954330597143, |
|
"loss": 3.5893, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00019976432316860067, |
|
"loss": 3.5203, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.40425531914893614, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.00019970175264485266, |
|
"loss": 3.5939, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.00019963183634476756, |
|
"loss": 3.5296, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00019955457941960383, |
|
"loss": 3.6242, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.46808510638297873, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0001994699875614589, |
|
"loss": 3.517, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.48936170212765956, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00019937806700284986, |
|
"loss": 3.5748, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00019927882451625402, |
|
"loss": 3.5263, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00019917226741361015, |
|
"loss": 3.5638, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.5531914893617021, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00019905840354577972, |
|
"loss": 3.5424, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00019893724130196828, |
|
"loss": 3.5726, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.5957446808510638, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00019880878960910772, |
|
"loss": 3.5656, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.6170212765957447, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.00019867305793119816, |
|
"loss": 3.6008, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001985300562686109, |
|
"loss": 3.5136, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6595744680851063, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00019837979515735166, |
|
"loss": 3.5632, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0001982222856682841, |
|
"loss": 3.6284, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001980575394063143, |
|
"loss": 3.4885, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.723404255319149, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0001978855685095358, |
|
"loss": 3.6102, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001977063856483351, |
|
"loss": 3.5844, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019752000402445825, |
|
"loss": 3.5097, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.7872340425531915, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019732643737003827, |
|
"loss": 3.492, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.8085106382978723, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00019712569994658315, |
|
"loss": 3.6192, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00019691780654392535, |
|
"loss": 3.6314, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00019670277247913205, |
|
"loss": 3.6429, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8723404255319149, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00019648061359537646, |
|
"loss": 3.5714, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00019625134626077083, |
|
"loss": 3.574, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.9148936170212766, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00019601498736716017, |
|
"loss": 3.6269, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.9361702127659575, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019577155432887804, |
|
"loss": 3.659, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019552106508146318, |
|
"loss": 3.6223, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.9787234042553191, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00019526353808033825, |
|
"loss": 3.6389, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019499899229945012, |
|
"loss": 3.4551, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.0001947274472298717, |
|
"loss": 3.2727, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.0425531914893618, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00019444892287836613, |
|
"loss": 3.3136, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00019416343976591261, |
|
"loss": 3.3188, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019387101892619443, |
|
"loss": 3.424, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.1063829787234043, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019357168190404936, |
|
"loss": 3.3676, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.127659574468085, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00019326545075388225, |
|
"loss": 3.3535, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00019295234803804004, |
|
"loss": 3.1686, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00019263239682514952, |
|
"loss": 3.3986, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.1914893617021276, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001923056206884176, |
|
"loss": 3.2916, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019197204370389467, |
|
"loss": 3.3444, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.2340425531914894, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0001916316904487005, |
|
"loss": 3.3603, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.2553191489361701, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00019128458599921357, |
|
"loss": 3.308, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019093075592922358, |
|
"loss": 3.4154, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.297872340425532, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00019057022630804716, |
|
"loss": 3.3095, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.3191489361702127, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.00019020302369860708, |
|
"loss": 3.3266, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0001898291751554753, |
|
"loss": 3.276, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.3617021276595744, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00018944870822287956, |
|
"loss": 3.3428, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00018906165093267405, |
|
"loss": 3.2515, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.00018866803180227402, |
|
"loss": 3.3125, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.425531914893617, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00018826787983255473, |
|
"loss": 3.2968, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.4468085106382977, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00018786122450571485, |
|
"loss": 3.3705, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.00018744809578310397, |
|
"loss": 3.2878, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00018702852410301554, |
|
"loss": 3.3214, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5106382978723403, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 3.2708, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00018617017599480682, |
|
"loss": 3.3087, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.5531914893617023, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00018573146280763324, |
|
"loss": 3.3227, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.574468085106383, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.000185286433140216, |
|
"loss": 3.3296, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001848351197812304, |
|
"loss": 3.3282, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6170212765957448, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.00018437755598231856, |
|
"loss": 3.3421, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.6382978723404256, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00018391377545563938, |
|
"loss": 3.3002, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00018344381237138472, |
|
"loss": 3.3293, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.6808510638297873, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0001829677013552619, |
|
"loss": 3.3688, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00018248547748594244, |
|
"loss": 3.2586, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00018199717629247773, |
|
"loss": 3.3783, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.7446808510638299, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00018150283375168114, |
|
"loss": 3.3503, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.7659574468085106, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001810024862854775, |
|
"loss": 3.2862, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00018049617075821962, |
|
"loss": 3.2503, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.8085106382978724, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00017998392447397197, |
|
"loss": 3.3987, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.8297872340425532, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001794657851737625, |
|
"loss": 3.3948, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017894179103280198, |
|
"loss": 3.414, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.872340425531915, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00017841198065767107, |
|
"loss": 3.3495, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.8936170212765957, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00017787639308347608, |
|
"loss": 3.3357, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.000177335067770973, |
|
"loss": 3.3956, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9361702127659575, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00017678804460366, |
|
"loss": 3.4261, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.9574468085106385, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00017623536388483905, |
|
"loss": 3.3929, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00017567706633464628, |
|
"loss": 3.4055, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017511319308705198, |
|
"loss": 3.0576, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.021276595744681, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00017454378568683003, |
|
"loss": 3.1095, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001739688860864967, |
|
"loss": 3.1669, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.0638297872340425, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017338853664321992, |
|
"loss": 3.1293, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 2.0851063829787235, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00017280278011569847, |
|
"loss": 3.2461, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00017221165966101163, |
|
"loss": 3.2222, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017161521883143934, |
|
"loss": 3.1956, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.148936170212766, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001710135015712536, |
|
"loss": 3.0099, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00017040655221348057, |
|
"loss": 3.2425, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.1914893617021276, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00016979441547663435, |
|
"loss": 3.1365, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 2.2127659574468086, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00016917713646142222, |
|
"loss": 3.1903, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00016855476064742155, |
|
"loss": 3.1938, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.25531914893617, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016792733388972932, |
|
"loss": 3.1561, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.276595744680851, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0001672949024155833, |
|
"loss": 3.259, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00016665751282095634, |
|
"loss": 3.1575, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.3191489361702127, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00016601521206712318, |
|
"loss": 3.1849, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001653680474772006, |
|
"loss": 3.1254, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00016471606673266066, |
|
"loss": 3.1994, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.382978723404255, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00016405931786981755, |
|
"loss": 3.101, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.404255319148936, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00016339784927628867, |
|
"loss": 3.1611, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001627317096874294, |
|
"loss": 3.1622, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.4468085106382977, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00016206094818274229, |
|
"loss": 3.2131, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.4680851063829787, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0001613856141822612, |
|
"loss": 3.1511, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00016070575744291004, |
|
"loss": 3.1662, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.5106382978723403, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00016002142805483685, |
|
"loss": 3.1092, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.5319148936170213, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001593326764377232, |
|
"loss": 3.1444, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00015863955333706957, |
|
"loss": 3.1738, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.574468085106383, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00015794210982045636, |
|
"loss": 3.1766, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.595744680851064, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00015724039727378148, |
|
"loss": 3.166, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00015653446739747427, |
|
"loss": 3.1837, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.6382978723404253, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00015582437220268647, |
|
"loss": 3.1519, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.6595744680851063, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00015511016400746, |
|
"loss": 3.165, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00015439189543287247, |
|
"loss": 3.2062, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.702127659574468, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00015366961939916008, |
|
"loss": 3.0979, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.723404255319149, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0001529433891218185, |
|
"loss": 3.217, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0001522132581076825, |
|
"loss": 3.1789, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0001514792801509831, |
|
"loss": 3.1253, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.7872340425531914, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00015074150932938455, |
|
"loss": 3.0813, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 3.2233, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.829787234042553, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00014925480679538647, |
|
"loss": 3.2241, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.851063829787234, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00014850598461951963, |
|
"loss": 3.2428, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00014775358864374885, |
|
"loss": 3.1833, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.8936170212765955, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.000146997674302732, |
|
"loss": 3.162, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.9148936170212765, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0001462382972903515, |
|
"loss": 3.2095, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001454755135556106, |
|
"loss": 3.2355, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.9574468085106385, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0001447093792985114, |
|
"loss": 3.1969, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00014393995096591416, |
|
"loss": 3.2092, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0001431672852473784, |
|
"loss": 2.7442, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 3.021276595744681, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0001423914390709861, |
|
"loss": 2.9817, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 3.0425531914893615, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00014161246959914744, |
|
"loss": 3.0423, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 3.0638297872340425, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00014083043422438935, |
|
"loss": 2.9844, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 3.0851063829787235, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00014004539056512667, |
|
"loss": 3.0951, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.106382978723404, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0001392573964614172, |
|
"loss": 3.089, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 3.127659574468085, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00013846650997070012, |
|
"loss": 3.0649, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 3.148936170212766, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00013767278936351854, |
|
"loss": 2.8683, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 3.1702127659574466, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00013687629311922602, |
|
"loss": 3.1071, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 3.1914893617021276, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00013607707992167834, |
|
"loss": 3.0015, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2127659574468086, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001352752086549095, |
|
"loss": 3.0506, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 3.2340425531914896, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.0001344707383987934, |
|
"loss": 3.0533, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 3.25531914893617, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00013366372842469105, |
|
"loss": 3.0211, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 3.276595744680851, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0001328542381910835, |
|
"loss": 3.1129, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 3.297872340425532, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00013204232733919112, |
|
"loss": 3.0158, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.3191489361702127, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00013122805568857948, |
|
"loss": 3.0605, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 3.3404255319148937, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001304114832327518, |
|
"loss": 2.9792, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 3.3617021276595747, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00012959267013472892, |
|
"loss": 3.0647, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 3.382978723404255, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0001287716767226167, |
|
"loss": 2.9722, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 3.404255319148936, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00012794856348516095, |
|
"loss": 3.0233, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.425531914893617, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.000127123391067291, |
|
"loss": 3.0216, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 3.4468085106382977, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00012629622026565147, |
|
"loss": 3.0703, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 3.4680851063829787, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00012546711202412287, |
|
"loss": 3.0121, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 3.4893617021276597, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00012463612742933148, |
|
"loss": 3.0189, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 3.5106382978723403, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00012380332770614856, |
|
"loss": 2.9589, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.5319148936170213, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0001229687742131796, |
|
"loss": 2.9954, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 3.5531914893617023, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00012213252843824325, |
|
"loss": 3.0266, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 3.574468085106383, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00012129465199384157, |
|
"loss": 3.0273, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.595744680851064, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001204552066126201, |
|
"loss": 3.0214, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 3.617021276595745, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001196142541428197, |
|
"loss": 3.0232, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6382978723404253, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00011877185654371987, |
|
"loss": 3.0004, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 3.6595744680851063, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00011792807588107357, |
|
"loss": 3.0165, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 3.6808510638297873, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00011708297432253444, |
|
"loss": 3.0491, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 3.702127659574468, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00011623661413307639, |
|
"loss": 2.9456, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 3.723404255319149, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001153890576704062, |
|
"loss": 3.0586, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.74468085106383, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00011454036738036899, |
|
"loss": 3.0125, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 3.7659574468085104, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00011369060579234754, |
|
"loss": 2.9722, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 3.7872340425531914, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00011283983551465511, |
|
"loss": 2.9201, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 3.8085106382978724, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00011198811922992274, |
|
"loss": 3.0565, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 3.829787234042553, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00011113551969048089, |
|
"loss": 3.0615, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.851063829787234, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00011028209971373605, |
|
"loss": 3.0731, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 3.872340425531915, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00010942792217754245, |
|
"loss": 3.0144, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 3.8936170212765955, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00010857305001556944, |
|
"loss": 2.9905, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 3.9148936170212765, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00010771754621266466, |
|
"loss": 3.0232, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 3.9361702127659575, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00010686147380021342, |
|
"loss": 3.0408, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.9574468085106385, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00010600489585149484, |
|
"loss": 2.9963, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 3.978723404255319, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00010514787547703466, |
|
"loss": 3.0049, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00010429047581995546, |
|
"loss": 2.4433, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 4.0212765957446805, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00010343276005132436, |
|
"loss": 2.8295, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 4.042553191489362, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00010257479136549889, |
|
"loss": 2.8904, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0638297872340425, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00010171663297547076, |
|
"loss": 2.834, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 4.085106382978723, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00010085834810820871, |
|
"loss": 2.9309, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 4.1063829787234045, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.9461, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 4.127659574468085, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.914165189179131e-05, |
|
"loss": 2.9405, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 4.148936170212766, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.828336702452927e-05, |
|
"loss": 2.7445, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 4.170212765957447, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 9.742520863450115e-05, |
|
"loss": 2.963, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 4.191489361702128, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.656723994867566e-05, |
|
"loss": 2.8778, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 4.212765957446808, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.570952418004455e-05, |
|
"loss": 2.9148, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 4.23404255319149, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.485212452296535e-05, |
|
"loss": 2.9028, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 4.25531914893617, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.399510414850518e-05, |
|
"loss": 2.898, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.276595744680851, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.313852619978659e-05, |
|
"loss": 2.9883, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 4.297872340425532, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 9.228245378733537e-05, |
|
"loss": 2.886, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 4.319148936170213, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.142694998443056e-05, |
|
"loss": 2.9453, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 4.340425531914893, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.057207782245757e-05, |
|
"loss": 2.8555, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.361702127659575, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 8.971790028626395e-05, |
|
"loss": 2.9359, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 4.382978723404255, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 8.886448030951912e-05, |
|
"loss": 2.8469, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 4.404255319148936, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 8.801188077007728e-05, |
|
"loss": 2.8963, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 4.425531914893617, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.71601644853449e-05, |
|
"loss": 2.8965, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 4.446808510638298, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 8.630939420765247e-05, |
|
"loss": 2.9457, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 4.468085106382979, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 8.545963261963102e-05, |
|
"loss": 2.8918, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.48936170212766, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.461094232959381e-05, |
|
"loss": 2.8957, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 4.51063829787234, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 8.376338586692366e-05, |
|
"loss": 2.8224, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 4.531914893617021, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 8.29170256774656e-05, |
|
"loss": 2.859, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 4.553191489361702, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.207192411892646e-05, |
|
"loss": 2.8896, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 4.574468085106383, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.122814345628016e-05, |
|
"loss": 2.8874, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.595744680851064, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 8.038574585718032e-05, |
|
"loss": 2.8869, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 4.617021276595745, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 7.954479338737995e-05, |
|
"loss": 2.8923, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 4.638297872340425, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 7.870534800615845e-05, |
|
"loss": 2.868, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 4.659574468085106, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.786747156175676e-05, |
|
"loss": 2.8831, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 4.680851063829787, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.703122578682046e-05, |
|
"loss": 2.9084, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.702127659574468, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 7.619667229385146e-05, |
|
"loss": 2.8085, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 4.723404255319149, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 7.536387257066854e-05, |
|
"loss": 2.92, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 4.74468085106383, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 7.453288797587714e-05, |
|
"loss": 2.8661, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 4.76595744680851, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 7.370377973434855e-05, |
|
"loss": 2.8322, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 4.787234042553192, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.2876608932709e-05, |
|
"loss": 2.772, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.808510638297872, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 7.205143651483906e-05, |
|
"loss": 2.905, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 4.829787234042553, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.122832327738331e-05, |
|
"loss": 2.9116, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 4.851063829787234, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 7.040732986527108e-05, |
|
"loss": 2.9203, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 4.872340425531915, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 6.958851676724823e-05, |
|
"loss": 2.8646, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 4.8936170212765955, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 6.877194431142055e-05, |
|
"loss": 2.844, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.914893617021277, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.79576726608089e-05, |
|
"loss": 2.8604, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 4.9361702127659575, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 6.714576180891654e-05, |
|
"loss": 2.8686, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 4.957446808510638, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 6.633627157530899e-05, |
|
"loss": 2.8085, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 4.9787234042553195, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 6.552926160120663e-05, |
|
"loss": 2.8017, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 6.472479134509052e-05, |
|
"loss": 2.189, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 5.0212765957446805, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 6.392292007832168e-05, |
|
"loss": 2.7068, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 5.042553191489362, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 6.312370688077399e-05, |
|
"loss": 2.7591, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 5.0638297872340425, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 6.232721063648148e-05, |
|
"loss": 2.7161, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 5.085106382978723, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.153349002929987e-05, |
|
"loss": 2.8126, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 5.1063829787234045, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.0742603538582835e-05, |
|
"loss": 2.8485, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.127659574468085, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 5.9954609434873344e-05, |
|
"loss": 2.8336, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 5.148936170212766, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 5.9169565775610656e-05, |
|
"loss": 2.6482, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 5.170212765957447, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 5.838753040085256e-05, |
|
"loss": 2.8597, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 5.191489361702128, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 5.7608560929013946e-05, |
|
"loss": 2.7875, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 5.212765957446808, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 5.683271475262164e-05, |
|
"loss": 2.822, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 5.23404255319149, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 5.6060049034085815e-05, |
|
"loss": 2.8034, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 5.25531914893617, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 5.5290620701488594e-05, |
|
"loss": 2.7899, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 5.276595744680851, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 5.452448644438946e-05, |
|
"loss": 2.8848, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 5.297872340425532, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 5.3761702709648556e-05, |
|
"loss": 2.7907, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 5.319148936170213, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 5.300232569726804e-05, |
|
"loss": 2.8616, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.340425531914893, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.224641135625119e-05, |
|
"loss": 2.7745, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 5.361702127659575, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5.1494015380480396e-05, |
|
"loss": 2.8555, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 5.382978723404255, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 5.074519320461357e-05, |
|
"loss": 2.7636, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 5.404255319148936, |
|
"grad_norm": 0.625, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 2.8076, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 5.425531914893617, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 4.9258490670615475e-05, |
|
"loss": 2.8087, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 5.446808510638298, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 4.852071984901696e-05, |
|
"loss": 2.8507, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 5.468085106382979, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 4.778674189231751e-05, |
|
"loss": 2.7981, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 5.48936170212766, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.7056610878181486e-05, |
|
"loss": 2.8039, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 5.51063829787234, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.633038060083996e-05, |
|
"loss": 2.7239, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 5.531914893617021, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.560810456712754e-05, |
|
"loss": 2.7612, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.553191489361702, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 4.488983599254001e-05, |
|
"loss": 2.7895, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 5.574468085106383, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.417562779731355e-05, |
|
"loss": 2.7883, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 5.595744680851064, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.346553260252574e-05, |
|
"loss": 2.7913, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 5.617021276595745, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.275960272621852e-05, |
|
"loss": 2.7905, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 5.638297872340425, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.205789017954364e-05, |
|
"loss": 2.7663, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 5.659574468085106, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.136044666293044e-05, |
|
"loss": 2.7839, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 5.680851063829787, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.0667323562276814e-05, |
|
"loss": 2.7986, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 5.702127659574468, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 3.997857194516319e-05, |
|
"loss": 2.7071, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 5.723404255319149, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.929424255708999e-05, |
|
"loss": 2.8141, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 5.74468085106383, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.8614385817738794e-05, |
|
"loss": 2.7508, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.76595744680851, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 3.793905181725772e-05, |
|
"loss": 2.7273, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 5.787234042553192, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.726829031257062e-05, |
|
"loss": 2.6695, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 5.808510638297872, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 3.660215072371135e-05, |
|
"loss": 2.7872, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 5.829787234042553, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.594068213018249e-05, |
|
"loss": 2.7969, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 5.851063829787234, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 3.528393326733941e-05, |
|
"loss": 2.8035, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.872340425531915, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 3.463195252279939e-05, |
|
"loss": 2.7496, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 5.8936170212765955, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.3984787932876814e-05, |
|
"loss": 2.7365, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 5.914893617021277, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.334248717904368e-05, |
|
"loss": 2.7371, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 5.9361702127659575, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.270509758441671e-05, |
|
"loss": 2.7465, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 5.957446808510638, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.207266611027069e-05, |
|
"loss": 2.6859, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.9787234042553195, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.144523935257846e-05, |
|
"loss": 2.6722, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.082286353857782e-05, |
|
"loss": 2.0584, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 6.0212765957446805, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.0205584523365626e-05, |
|
"loss": 2.6076, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 6.042553191489362, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.9593447786519425e-05, |
|
"loss": 2.6513, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 6.0638297872340425, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.8986498428746444e-05, |
|
"loss": 2.6075, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 6.085106382978723, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 2.8384781168560693e-05, |
|
"loss": 2.7151, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 6.1063829787234045, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 2.7788340338988385e-05, |
|
"loss": 2.7812, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 6.127659574468085, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.719721988430153e-05, |
|
"loss": 2.7936, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 6.148936170212766, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.6611463356780096e-05, |
|
"loss": 2.6086, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 6.170212765957447, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 2.6031113913503337e-05, |
|
"loss": 2.8151, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.191489361702128, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.5456214313170002e-05, |
|
"loss": 2.7246, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 6.212765957446808, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.4886806912948035e-05, |
|
"loss": 2.7524, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 6.23404255319149, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.4322933665353776e-05, |
|
"loss": 2.7285, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 6.25531914893617, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 2.3764636115160978e-05, |
|
"loss": 2.7237, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 6.276595744680851, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.3211955396340002e-05, |
|
"loss": 2.818, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 6.297872340425532, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 2.2664932229027024e-05, |
|
"loss": 2.7163, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 6.319148936170213, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 2.2123606916523953e-05, |
|
"loss": 2.7859, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 6.340425531914893, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 2.1588019342328968e-05, |
|
"loss": 2.6892, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 6.361702127659575, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 2.1058208967198045e-05, |
|
"loss": 2.767, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 6.382978723404255, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 2.0534214826237484e-05, |
|
"loss": 2.6933, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.404255319148936, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.0016075526028065e-05, |
|
"loss": 2.7303, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 6.425531914893617, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.9503829241780412e-05, |
|
"loss": 2.7377, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 6.446808510638298, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.8997513714522487e-05, |
|
"loss": 2.7818, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 6.468085106382979, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.8497166248318876e-05, |
|
"loss": 2.7335, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 6.48936170212766, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.8002823707522297e-05, |
|
"loss": 2.733, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 6.51063829787234, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.7514522514057553e-05, |
|
"loss": 2.6446, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 6.531914893617021, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.703229864473811e-05, |
|
"loss": 2.6907, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 6.553191489361702, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 1.6556187628615273e-05, |
|
"loss": 2.7176, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 6.574468085106383, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.608622454436062e-05, |
|
"loss": 2.7109, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 6.595744680851064, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.562244401768144e-05, |
|
"loss": 2.7085, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.617021276595745, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.5164880218769618e-05, |
|
"loss": 2.6987, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 6.638297872340425, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 1.4713566859784045e-05, |
|
"loss": 2.6835, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 6.659574468085106, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 1.426853719236676e-05, |
|
"loss": 2.6981, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 6.680851063829787, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.3829824005193181e-05, |
|
"loss": 2.7132, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 6.702127659574468, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 2.6319, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 6.723404255319149, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 1.2971475896984475e-05, |
|
"loss": 2.7332, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 6.74468085106383, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.2551904216896037e-05, |
|
"loss": 2.6649, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 6.76595744680851, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.2138775494285182e-05, |
|
"loss": 2.6486, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 6.787234042553192, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.1732120167445248e-05, |
|
"loss": 2.5875, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 6.808510638297872, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.1331968197725984e-05, |
|
"loss": 2.7079, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.829787234042553, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.0938349067325959e-05, |
|
"loss": 2.7134, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 6.851063829787234, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.0551291777120464e-05, |
|
"loss": 2.7199, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 6.872340425531915, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.0170824844524728e-05, |
|
"loss": 2.6655, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 6.8936170212765955, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.796976301392934e-06, |
|
"loss": 2.6519, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 6.914893617021277, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.429773691952858e-06, |
|
"loss": 2.6443, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 6.9361702127659575, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.069244070776428e-06, |
|
"loss": 2.6531, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 6.957446808510638, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 8.715414000786448e-06, |
|
"loss": 2.5897, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 6.9787234042553195, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 8.368309551299536e-06, |
|
"loss": 2.5772, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 8.027956296105354e-06, |
|
"loss": 1.9731, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 7.0212765957446805, |
|
"grad_norm": 0.75, |
|
"learning_rate": 7.6943793115824e-06, |
|
"loss": 2.5669, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 7.042553191489362, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.367603174850502e-06, |
|
"loss": 2.6154, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 7.0638297872340425, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.047651961959978e-06, |
|
"loss": 2.5542, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 7.085106382978723, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 6.73454924611776e-06, |
|
"loss": 2.6428, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 7.1063829787234045, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 6.428318095950647e-06, |
|
"loss": 2.6929, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 7.127659574468085, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 6.128981073805584e-06, |
|
"loss": 2.6994, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 7.148936170212766, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.836560234087418e-06, |
|
"loss": 2.5162, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 7.170212765957447, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.551077121633874e-06, |
|
"loss": 2.7308, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 7.191489361702128, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 5.272552770128314e-06, |
|
"loss": 2.6655, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 7.212765957446808, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.001007700549898e-06, |
|
"loss": 2.7014, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 7.23404255319149, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.7364619196617495e-06, |
|
"loss": 2.6704, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.25531914893617, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.478934918536837e-06, |
|
"loss": 2.6756, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 7.276595744680851, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.228445671121972e-06, |
|
"loss": 2.7574, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 7.297872340425532, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 3.985012632839824e-06, |
|
"loss": 2.6565, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 7.319148936170213, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.748653739229191e-06, |
|
"loss": 2.7389, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 7.340425531914893, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.519386404623537e-06, |
|
"loss": 2.6382, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 7.361702127659575, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.2972275208679625e-06, |
|
"loss": 2.7147, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 7.382978723404255, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.0821934560746447e-06, |
|
"loss": 2.6497, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 7.404255319148936, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.8743000534168675e-06, |
|
"loss": 2.6844, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 7.425531914893617, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.6735626299617457e-06, |
|
"loss": 2.6961, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 7.446808510638298, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.479995975541749e-06, |
|
"loss": 2.7341, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.468085106382979, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.2936143516649188e-06, |
|
"loss": 2.6872, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 7.48936170212766, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.1144314904642195e-06, |
|
"loss": 2.6879, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 7.51063829787234, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.942460593685713e-06, |
|
"loss": 2.5916, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 7.531914893617021, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.7777143317159406e-06, |
|
"loss": 2.643, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 7.553191489361702, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.6202048426483651e-06, |
|
"loss": 2.6724, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 7.574468085106383, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.4699437313891007e-06, |
|
"loss": 2.6634, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 7.595744680851064, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.3269420688018508e-06, |
|
"loss": 2.6651, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 7.617021276595745, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.1912103908922945e-06, |
|
"loss": 2.6545, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 7.638297872340425, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.0627586980317073e-06, |
|
"loss": 2.6455, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 7.659574468085106, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 9.415964542203059e-07, |
|
"loss": 2.6622, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.680851063829787, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 8.277325863898511e-07, |
|
"loss": 2.6787, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 7.702127659574468, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.21175483745995e-07, |
|
"loss": 2.6004, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 7.723404255319149, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.219329971501653e-07, |
|
"loss": 2.7023, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 7.74468085106383, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 5.300124385410943e-07, |
|
"loss": 2.6309, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 7.76595744680851, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.4542058039619417e-07, |
|
"loss": 2.6197, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 7.787234042553192, |
|
"grad_norm": 0.75, |
|
"learning_rate": 3.681636552324452e-07, |
|
"loss": 2.5579, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 7.808510638297872, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.9824735514732974e-07, |
|
"loss": 2.6765, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 7.829787234042553, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.3567683139936735e-07, |
|
"loss": 2.687, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 7.851063829787234, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.8045669402859677e-07, |
|
"loss": 2.6924, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 7.872340425531915, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.3259101151694708e-07, |
|
"loss": 2.6409, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.8936170212765955, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.208331048846663e-08, |
|
"loss": 2.6251, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 7.914893617021277, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.893657544947528e-08, |
|
"loss": 2.616, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 7.9361702127659575, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.3153248568695835e-08, |
|
"loss": 2.626, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 7.957446808510638, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.47352294973091e-08, |
|
"loss": 2.5585, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 7.9787234042553195, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.6838752290102585e-09, |
|
"loss": 2.543, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0, |
|
"loss": 1.9425, |
|
"step": 376 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 376, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 47, |
|
"total_flos": 5.59507839123456e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|