{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 376,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02127659574468085,
"grad_norm": 0.78515625,
"learning_rate": 2e-05,
"loss": 3.872,
"step": 1
},
{
"epoch": 0.0425531914893617,
"grad_norm": 0.83203125,
"learning_rate": 4e-05,
"loss": 3.9714,
"step": 2
},
{
"epoch": 0.06382978723404255,
"grad_norm": 0.82421875,
"learning_rate": 6e-05,
"loss": 3.9503,
"step": 3
},
{
"epoch": 0.0851063829787234,
"grad_norm": 0.94921875,
"learning_rate": 8e-05,
"loss": 4.0784,
"step": 4
},
{
"epoch": 0.10638297872340426,
"grad_norm": 0.94921875,
"learning_rate": 0.0001,
"loss": 3.9539,
"step": 5
},
{
"epoch": 0.1276595744680851,
"grad_norm": 0.66796875,
"learning_rate": 0.00012,
"loss": 3.8024,
"step": 6
},
{
"epoch": 0.14893617021276595,
"grad_norm": 1.2109375,
"learning_rate": 0.00014,
"loss": 3.6005,
"step": 7
},
{
"epoch": 0.1702127659574468,
"grad_norm": 0.96484375,
"learning_rate": 0.00016,
"loss": 3.8633,
"step": 8
},
{
"epoch": 0.19148936170212766,
"grad_norm": 0.64453125,
"learning_rate": 0.00018,
"loss": 3.6551,
"step": 9
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.34765625,
"learning_rate": 0.0002,
"loss": 3.6645,
"step": 10
},
{
"epoch": 0.23404255319148937,
"grad_norm": 0.34375,
"learning_rate": 0.000199996316124771,
"loss": 3.7208,
"step": 11
},
{
"epoch": 0.2553191489361702,
"grad_norm": 0.345703125,
"learning_rate": 0.0001999852647705027,
"loss": 3.6193,
"step": 12
},
{
"epoch": 0.2765957446808511,
"grad_norm": 0.283203125,
"learning_rate": 0.0001999668467514313,
"loss": 3.7057,
"step": 13
},
{
"epoch": 0.2978723404255319,
"grad_norm": 0.234375,
"learning_rate": 0.00019994106342455053,
"loss": 3.5714,
"step": 14
},
{
"epoch": 0.3191489361702128,
"grad_norm": 0.189453125,
"learning_rate": 0.00019990791668951155,
"loss": 3.582,
"step": 15
},
{
"epoch": 0.3404255319148936,
"grad_norm": 0.28515625,
"learning_rate": 0.00019986740898848306,
"loss": 3.5228,
"step": 16
},
{
"epoch": 0.3617021276595745,
"grad_norm": 0.271484375,
"learning_rate": 0.00019981954330597143,
"loss": 3.5893,
"step": 17
},
{
"epoch": 0.3829787234042553,
"grad_norm": 0.279296875,
"learning_rate": 0.00019976432316860067,
"loss": 3.5203,
"step": 18
},
{
"epoch": 0.40425531914893614,
"grad_norm": 0.21484375,
"learning_rate": 0.00019970175264485266,
"loss": 3.5939,
"step": 19
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.1796875,
"learning_rate": 0.00019963183634476756,
"loss": 3.5296,
"step": 20
},
{
"epoch": 0.44680851063829785,
"grad_norm": 0.2138671875,
"learning_rate": 0.00019955457941960383,
"loss": 3.6242,
"step": 21
},
{
"epoch": 0.46808510638297873,
"grad_norm": 0.240234375,
"learning_rate": 0.0001994699875614589,
"loss": 3.517,
"step": 22
},
{
"epoch": 0.48936170212765956,
"grad_norm": 0.2158203125,
"learning_rate": 0.00019937806700284986,
"loss": 3.5748,
"step": 23
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.1982421875,
"learning_rate": 0.00019927882451625402,
"loss": 3.5263,
"step": 24
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.27734375,
"learning_rate": 0.00019917226741361015,
"loss": 3.5638,
"step": 25
},
{
"epoch": 0.5531914893617021,
"grad_norm": 0.1748046875,
"learning_rate": 0.00019905840354577972,
"loss": 3.5424,
"step": 26
},
{
"epoch": 0.574468085106383,
"grad_norm": 0.1982421875,
"learning_rate": 0.00019893724130196828,
"loss": 3.5726,
"step": 27
},
{
"epoch": 0.5957446808510638,
"grad_norm": 0.2275390625,
"learning_rate": 0.00019880878960910772,
"loss": 3.5656,
"step": 28
},
{
"epoch": 0.6170212765957447,
"grad_norm": 0.1904296875,
"learning_rate": 0.00019867305793119816,
"loss": 3.6008,
"step": 29
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.228515625,
"learning_rate": 0.0001985300562686109,
"loss": 3.5136,
"step": 30
},
{
"epoch": 0.6595744680851063,
"grad_norm": 0.2255859375,
"learning_rate": 0.00019837979515735166,
"loss": 3.5632,
"step": 31
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.1884765625,
"learning_rate": 0.0001982222856682841,
"loss": 3.6284,
"step": 32
},
{
"epoch": 0.7021276595744681,
"grad_norm": 0.2890625,
"learning_rate": 0.0001980575394063143,
"loss": 3.4885,
"step": 33
},
{
"epoch": 0.723404255319149,
"grad_norm": 0.232421875,
"learning_rate": 0.0001978855685095358,
"loss": 3.6102,
"step": 34
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.328125,
"learning_rate": 0.0001977063856483351,
"loss": 3.5844,
"step": 35
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.302734375,
"learning_rate": 0.00019752000402445825,
"loss": 3.5097,
"step": 36
},
{
"epoch": 0.7872340425531915,
"grad_norm": 0.30078125,
"learning_rate": 0.00019732643737003827,
"loss": 3.492,
"step": 37
},
{
"epoch": 0.8085106382978723,
"grad_norm": 0.216796875,
"learning_rate": 0.00019712569994658315,
"loss": 3.6192,
"step": 38
},
{
"epoch": 0.8297872340425532,
"grad_norm": 0.2314453125,
"learning_rate": 0.00019691780654392535,
"loss": 3.6314,
"step": 39
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.2421875,
"learning_rate": 0.00019670277247913205,
"loss": 3.6429,
"step": 40
},
{
"epoch": 0.8723404255319149,
"grad_norm": 0.208984375,
"learning_rate": 0.00019648061359537646,
"loss": 3.5714,
"step": 41
},
{
"epoch": 0.8936170212765957,
"grad_norm": 0.259765625,
"learning_rate": 0.00019625134626077083,
"loss": 3.574,
"step": 42
},
{
"epoch": 0.9148936170212766,
"grad_norm": 0.224609375,
"learning_rate": 0.00019601498736716017,
"loss": 3.6269,
"step": 43
},
{
"epoch": 0.9361702127659575,
"grad_norm": 0.349609375,
"learning_rate": 0.00019577155432887804,
"loss": 3.659,
"step": 44
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.291015625,
"learning_rate": 0.00019552106508146318,
"loss": 3.6223,
"step": 45
},
{
"epoch": 0.9787234042553191,
"grad_norm": 0.3828125,
"learning_rate": 0.00019526353808033825,
"loss": 3.6389,
"step": 46
},
{
"epoch": 1.0,
"grad_norm": 0.5703125,
"learning_rate": 0.00019499899229945012,
"loss": 3.4551,
"step": 47
},
{
"epoch": 1.0212765957446808,
"grad_norm": 1.203125,
"learning_rate": 0.0001947274472298717,
"loss": 3.2727,
"step": 48
},
{
"epoch": 1.0425531914893618,
"grad_norm": 0.6015625,
"learning_rate": 0.00019444892287836613,
"loss": 3.3136,
"step": 49
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.8125,
"learning_rate": 0.00019416343976591261,
"loss": 3.3188,
"step": 50
},
{
"epoch": 1.0851063829787233,
"grad_norm": 1.09375,
"learning_rate": 0.00019387101892619443,
"loss": 3.424,
"step": 51
},
{
"epoch": 1.1063829787234043,
"grad_norm": 0.28515625,
"learning_rate": 0.00019357168190404936,
"loss": 3.3676,
"step": 52
},
{
"epoch": 1.127659574468085,
"grad_norm": 0.73828125,
"learning_rate": 0.00019326545075388225,
"loss": 3.3535,
"step": 53
},
{
"epoch": 1.148936170212766,
"grad_norm": 0.73046875,
"learning_rate": 0.00019295234803804004,
"loss": 3.1686,
"step": 54
},
{
"epoch": 1.1702127659574468,
"grad_norm": 0.234375,
"learning_rate": 0.00019263239682514952,
"loss": 3.3986,
"step": 55
},
{
"epoch": 1.1914893617021276,
"grad_norm": 0.56640625,
"learning_rate": 0.0001923056206884176,
"loss": 3.2916,
"step": 56
},
{
"epoch": 1.2127659574468086,
"grad_norm": 0.53515625,
"learning_rate": 0.00019197204370389467,
"loss": 3.3444,
"step": 57
},
{
"epoch": 1.2340425531914894,
"grad_norm": 0.375,
"learning_rate": 0.0001916316904487005,
"loss": 3.3603,
"step": 58
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.44140625,
"learning_rate": 0.00019128458599921357,
"loss": 3.308,
"step": 59
},
{
"epoch": 1.2765957446808511,
"grad_norm": 0.7265625,
"learning_rate": 0.00019093075592922358,
"loss": 3.4154,
"step": 60
},
{
"epoch": 1.297872340425532,
"grad_norm": 0.37890625,
"learning_rate": 0.00019057022630804716,
"loss": 3.3095,
"step": 61
},
{
"epoch": 1.3191489361702127,
"grad_norm": 0.212890625,
"learning_rate": 0.00019020302369860708,
"loss": 3.3266,
"step": 62
},
{
"epoch": 1.3404255319148937,
"grad_norm": 0.53125,
"learning_rate": 0.0001898291751554753,
"loss": 3.276,
"step": 63
},
{
"epoch": 1.3617021276595744,
"grad_norm": 0.462890625,
"learning_rate": 0.00018944870822287956,
"loss": 3.3428,
"step": 64
},
{
"epoch": 1.3829787234042552,
"grad_norm": 0.3125,
"learning_rate": 0.00018906165093267405,
"loss": 3.2515,
"step": 65
},
{
"epoch": 1.4042553191489362,
"grad_norm": 0.232421875,
"learning_rate": 0.00018866803180227402,
"loss": 3.3125,
"step": 66
},
{
"epoch": 1.425531914893617,
"grad_norm": 0.443359375,
"learning_rate": 0.00018826787983255473,
"loss": 3.2968,
"step": 67
},
{
"epoch": 1.4468085106382977,
"grad_norm": 0.380859375,
"learning_rate": 0.00018786122450571485,
"loss": 3.3705,
"step": 68
},
{
"epoch": 1.4680851063829787,
"grad_norm": 0.220703125,
"learning_rate": 0.00018744809578310397,
"loss": 3.2878,
"step": 69
},
{
"epoch": 1.4893617021276595,
"grad_norm": 0.287109375,
"learning_rate": 0.00018702852410301554,
"loss": 3.3214,
"step": 70
},
{
"epoch": 1.5106382978723403,
"grad_norm": 0.41796875,
"learning_rate": 0.00018660254037844388,
"loss": 3.2708,
"step": 71
},
{
"epoch": 1.5319148936170213,
"grad_norm": 0.255859375,
"learning_rate": 0.00018617017599480682,
"loss": 3.3087,
"step": 72
},
{
"epoch": 1.5531914893617023,
"grad_norm": 0.2431640625,
"learning_rate": 0.00018573146280763324,
"loss": 3.3227,
"step": 73
},
{
"epoch": 1.574468085106383,
"grad_norm": 0.4375,
"learning_rate": 0.000185286433140216,
"loss": 3.3296,
"step": 74
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.36328125,
"learning_rate": 0.0001848351197812304,
"loss": 3.3282,
"step": 75
},
{
"epoch": 1.6170212765957448,
"grad_norm": 0.2294921875,
"learning_rate": 0.00018437755598231856,
"loss": 3.3421,
"step": 76
},
{
"epoch": 1.6382978723404256,
"grad_norm": 0.345703125,
"learning_rate": 0.00018391377545563938,
"loss": 3.3002,
"step": 77
},
{
"epoch": 1.6595744680851063,
"grad_norm": 0.349609375,
"learning_rate": 0.00018344381237138472,
"loss": 3.3293,
"step": 78
},
{
"epoch": 1.6808510638297873,
"grad_norm": 0.255859375,
"learning_rate": 0.0001829677013552619,
"loss": 3.3688,
"step": 79
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.2431640625,
"learning_rate": 0.00018248547748594244,
"loss": 3.2586,
"step": 80
},
{
"epoch": 1.7234042553191489,
"grad_norm": 0.2734375,
"learning_rate": 0.00018199717629247773,
"loss": 3.3783,
"step": 81
},
{
"epoch": 1.7446808510638299,
"grad_norm": 0.326171875,
"learning_rate": 0.00018150283375168114,
"loss": 3.3503,
"step": 82
},
{
"epoch": 1.7659574468085106,
"grad_norm": 0.28515625,
"learning_rate": 0.0001810024862854775,
"loss": 3.2862,
"step": 83
},
{
"epoch": 1.7872340425531914,
"grad_norm": 0.337890625,
"learning_rate": 0.00018049617075821962,
"loss": 3.2503,
"step": 84
},
{
"epoch": 1.8085106382978724,
"grad_norm": 0.28125,
"learning_rate": 0.00017998392447397197,
"loss": 3.3987,
"step": 85
},
{
"epoch": 1.8297872340425532,
"grad_norm": 0.279296875,
"learning_rate": 0.0001794657851737625,
"loss": 3.3948,
"step": 86
},
{
"epoch": 1.851063829787234,
"grad_norm": 0.29296875,
"learning_rate": 0.00017894179103280198,
"loss": 3.414,
"step": 87
},
{
"epoch": 1.872340425531915,
"grad_norm": 0.267578125,
"learning_rate": 0.00017841198065767107,
"loss": 3.3495,
"step": 88
},
{
"epoch": 1.8936170212765957,
"grad_norm": 0.28515625,
"learning_rate": 0.00017787639308347608,
"loss": 3.3357,
"step": 89
},
{
"epoch": 1.9148936170212765,
"grad_norm": 0.359375,
"learning_rate": 0.000177335067770973,
"loss": 3.3956,
"step": 90
},
{
"epoch": 1.9361702127659575,
"grad_norm": 0.2890625,
"learning_rate": 0.00017678804460366,
"loss": 3.4261,
"step": 91
},
{
"epoch": 1.9574468085106385,
"grad_norm": 0.318359375,
"learning_rate": 0.00017623536388483905,
"loss": 3.3929,
"step": 92
},
{
"epoch": 1.978723404255319,
"grad_norm": 0.33984375,
"learning_rate": 0.00017567706633464628,
"loss": 3.4055,
"step": 93
},
{
"epoch": 2.0,
"grad_norm": 0.73046875,
"learning_rate": 0.00017511319308705198,
"loss": 3.0576,
"step": 94
},
{
"epoch": 2.021276595744681,
"grad_norm": 0.6953125,
"learning_rate": 0.00017454378568683003,
"loss": 3.1095,
"step": 95
},
{
"epoch": 2.0425531914893615,
"grad_norm": 0.68359375,
"learning_rate": 0.0001739688860864967,
"loss": 3.1669,
"step": 96
},
{
"epoch": 2.0638297872340425,
"grad_norm": 0.306640625,
"learning_rate": 0.00017338853664321992,
"loss": 3.1293,
"step": 97
},
{
"epoch": 2.0851063829787235,
"grad_norm": 0.86328125,
"learning_rate": 0.00017280278011569847,
"loss": 3.2461,
"step": 98
},
{
"epoch": 2.106382978723404,
"grad_norm": 0.58203125,
"learning_rate": 0.00017221165966101163,
"loss": 3.2222,
"step": 99
},
{
"epoch": 2.127659574468085,
"grad_norm": 0.298828125,
"learning_rate": 0.00017161521883143934,
"loss": 3.1956,
"step": 100
},
{
"epoch": 2.148936170212766,
"grad_norm": 0.73046875,
"learning_rate": 0.0001710135015712536,
"loss": 3.0099,
"step": 101
},
{
"epoch": 2.1702127659574466,
"grad_norm": 0.53515625,
"learning_rate": 0.00017040655221348057,
"loss": 3.2425,
"step": 102
},
{
"epoch": 2.1914893617021276,
"grad_norm": 0.353515625,
"learning_rate": 0.00016979441547663435,
"loss": 3.1365,
"step": 103
},
{
"epoch": 2.2127659574468086,
"grad_norm": 0.55859375,
"learning_rate": 0.00016917713646142222,
"loss": 3.1903,
"step": 104
},
{
"epoch": 2.2340425531914896,
"grad_norm": 0.6328125,
"learning_rate": 0.00016855476064742155,
"loss": 3.1938,
"step": 105
},
{
"epoch": 2.25531914893617,
"grad_norm": 0.310546875,
"learning_rate": 0.00016792733388972932,
"loss": 3.1561,
"step": 106
},
{
"epoch": 2.276595744680851,
"grad_norm": 0.7421875,
"learning_rate": 0.0001672949024155833,
"loss": 3.259,
"step": 107
},
{
"epoch": 2.297872340425532,
"grad_norm": 0.47265625,
"learning_rate": 0.00016665751282095634,
"loss": 3.1575,
"step": 108
},
{
"epoch": 2.3191489361702127,
"grad_norm": 0.2578125,
"learning_rate": 0.00016601521206712318,
"loss": 3.1849,
"step": 109
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.5703125,
"learning_rate": 0.0001653680474772006,
"loss": 3.1254,
"step": 110
},
{
"epoch": 2.3617021276595747,
"grad_norm": 0.427734375,
"learning_rate": 0.00016471606673266066,
"loss": 3.1994,
"step": 111
},
{
"epoch": 2.382978723404255,
"grad_norm": 0.283203125,
"learning_rate": 0.00016405931786981755,
"loss": 3.101,
"step": 112
},
{
"epoch": 2.404255319148936,
"grad_norm": 0.3984375,
"learning_rate": 0.00016339784927628867,
"loss": 3.1611,
"step": 113
},
{
"epoch": 2.425531914893617,
"grad_norm": 0.51171875,
"learning_rate": 0.0001627317096874294,
"loss": 3.1622,
"step": 114
},
{
"epoch": 2.4468085106382977,
"grad_norm": 0.302734375,
"learning_rate": 0.00016206094818274229,
"loss": 3.2131,
"step": 115
},
{
"epoch": 2.4680851063829787,
"grad_norm": 0.35546875,
"learning_rate": 0.0001613856141822612,
"loss": 3.1511,
"step": 116
},
{
"epoch": 2.4893617021276597,
"grad_norm": 0.478515625,
"learning_rate": 0.00016070575744291004,
"loss": 3.1662,
"step": 117
},
{
"epoch": 2.5106382978723403,
"grad_norm": 0.4140625,
"learning_rate": 0.00016002142805483685,
"loss": 3.1092,
"step": 118
},
{
"epoch": 2.5319148936170213,
"grad_norm": 0.376953125,
"learning_rate": 0.0001593326764377232,
"loss": 3.1444,
"step": 119
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.443359375,
"learning_rate": 0.00015863955333706957,
"loss": 3.1738,
"step": 120
},
{
"epoch": 2.574468085106383,
"grad_norm": 0.470703125,
"learning_rate": 0.00015794210982045636,
"loss": 3.1766,
"step": 121
},
{
"epoch": 2.595744680851064,
"grad_norm": 0.294921875,
"learning_rate": 0.00015724039727378148,
"loss": 3.166,
"step": 122
},
{
"epoch": 2.617021276595745,
"grad_norm": 0.50390625,
"learning_rate": 0.00015653446739747427,
"loss": 3.1837,
"step": 123
},
{
"epoch": 2.6382978723404253,
"grad_norm": 0.484375,
"learning_rate": 0.00015582437220268647,
"loss": 3.1519,
"step": 124
},
{
"epoch": 2.6595744680851063,
"grad_norm": 0.326171875,
"learning_rate": 0.00015511016400746,
"loss": 3.165,
"step": 125
},
{
"epoch": 2.6808510638297873,
"grad_norm": 0.431640625,
"learning_rate": 0.00015439189543287247,
"loss": 3.2062,
"step": 126
},
{
"epoch": 2.702127659574468,
"grad_norm": 0.400390625,
"learning_rate": 0.00015366961939916008,
"loss": 3.0979,
"step": 127
},
{
"epoch": 2.723404255319149,
"grad_norm": 0.359375,
"learning_rate": 0.0001529433891218185,
"loss": 3.217,
"step": 128
},
{
"epoch": 2.74468085106383,
"grad_norm": 0.349609375,
"learning_rate": 0.0001522132581076825,
"loss": 3.1789,
"step": 129
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.3984375,
"learning_rate": 0.0001514792801509831,
"loss": 3.1253,
"step": 130
},
{
"epoch": 2.7872340425531914,
"grad_norm": 0.369140625,
"learning_rate": 0.00015074150932938455,
"loss": 3.0813,
"step": 131
},
{
"epoch": 2.8085106382978724,
"grad_norm": 0.380859375,
"learning_rate": 0.00015000000000000001,
"loss": 3.2233,
"step": 132
},
{
"epoch": 2.829787234042553,
"grad_norm": 0.404296875,
"learning_rate": 0.00014925480679538647,
"loss": 3.2241,
"step": 133
},
{
"epoch": 2.851063829787234,
"grad_norm": 0.33984375,
"learning_rate": 0.00014850598461951963,
"loss": 3.2428,
"step": 134
},
{
"epoch": 2.872340425531915,
"grad_norm": 0.404296875,
"learning_rate": 0.00014775358864374885,
"loss": 3.1833,
"step": 135
},
{
"epoch": 2.8936170212765955,
"grad_norm": 0.412109375,
"learning_rate": 0.000146997674302732,
"loss": 3.162,
"step": 136
},
{
"epoch": 2.9148936170212765,
"grad_norm": 0.3828125,
"learning_rate": 0.0001462382972903515,
"loss": 3.2095,
"step": 137
},
{
"epoch": 2.9361702127659575,
"grad_norm": 0.51171875,
"learning_rate": 0.0001454755135556106,
"loss": 3.2355,
"step": 138
},
{
"epoch": 2.9574468085106385,
"grad_norm": 0.466796875,
"learning_rate": 0.0001447093792985114,
"loss": 3.1969,
"step": 139
},
{
"epoch": 2.978723404255319,
"grad_norm": 0.5703125,
"learning_rate": 0.00014393995096591416,
"loss": 3.2092,
"step": 140
},
{
"epoch": 3.0,
"grad_norm": 0.984375,
"learning_rate": 0.0001431672852473784,
"loss": 2.7442,
"step": 141
},
{
"epoch": 3.021276595744681,
"grad_norm": 0.8671875,
"learning_rate": 0.0001423914390709861,
"loss": 2.9817,
"step": 142
},
{
"epoch": 3.0425531914893615,
"grad_norm": 1.03125,
"learning_rate": 0.00014161246959914744,
"loss": 3.0423,
"step": 143
},
{
"epoch": 3.0638297872340425,
"grad_norm": 0.49609375,
"learning_rate": 0.00014083043422438935,
"loss": 2.9844,
"step": 144
},
{
"epoch": 3.0851063829787235,
"grad_norm": 0.87890625,
"learning_rate": 0.00014004539056512667,
"loss": 3.0951,
"step": 145
},
{
"epoch": 3.106382978723404,
"grad_norm": 0.8359375,
"learning_rate": 0.0001392573964614172,
"loss": 3.089,
"step": 146
},
{
"epoch": 3.127659574468085,
"grad_norm": 0.44140625,
"learning_rate": 0.00013846650997070012,
"loss": 3.0649,
"step": 147
},
{
"epoch": 3.148936170212766,
"grad_norm": 0.6328125,
"learning_rate": 0.00013767278936351854,
"loss": 2.8683,
"step": 148
},
{
"epoch": 3.1702127659574466,
"grad_norm": 0.81640625,
"learning_rate": 0.00013687629311922602,
"loss": 3.1071,
"step": 149
},
{
"epoch": 3.1914893617021276,
"grad_norm": 0.40234375,
"learning_rate": 0.00013607707992167834,
"loss": 3.0015,
"step": 150
},
{
"epoch": 3.2127659574468086,
"grad_norm": 0.515625,
"learning_rate": 0.0001352752086549095,
"loss": 3.0506,
"step": 151
},
{
"epoch": 3.2340425531914896,
"grad_norm": 0.80859375,
"learning_rate": 0.0001344707383987934,
"loss": 3.0533,
"step": 152
},
{
"epoch": 3.25531914893617,
"grad_norm": 0.44140625,
"learning_rate": 0.00013366372842469105,
"loss": 3.0211,
"step": 153
},
{
"epoch": 3.276595744680851,
"grad_norm": 0.478515625,
"learning_rate": 0.0001328542381910835,
"loss": 3.1129,
"step": 154
},
{
"epoch": 3.297872340425532,
"grad_norm": 0.6171875,
"learning_rate": 0.00013204232733919112,
"loss": 3.0158,
"step": 155
},
{
"epoch": 3.3191489361702127,
"grad_norm": 0.482421875,
"learning_rate": 0.00013122805568857948,
"loss": 3.0605,
"step": 156
},
{
"epoch": 3.3404255319148937,
"grad_norm": 0.400390625,
"learning_rate": 0.0001304114832327518,
"loss": 2.9792,
"step": 157
},
{
"epoch": 3.3617021276595747,
"grad_norm": 0.439453125,
"learning_rate": 0.00012959267013472892,
"loss": 3.0647,
"step": 158
},
{
"epoch": 3.382978723404255,
"grad_norm": 0.48046875,
"learning_rate": 0.0001287716767226167,
"loss": 2.9722,
"step": 159
},
{
"epoch": 3.404255319148936,
"grad_norm": 0.431640625,
"learning_rate": 0.00012794856348516095,
"loss": 3.0233,
"step": 160
},
{
"epoch": 3.425531914893617,
"grad_norm": 0.458984375,
"learning_rate": 0.000127123391067291,
"loss": 3.0216,
"step": 161
},
{
"epoch": 3.4468085106382977,
"grad_norm": 0.4921875,
"learning_rate": 0.00012629622026565147,
"loss": 3.0703,
"step": 162
},
{
"epoch": 3.4680851063829787,
"grad_norm": 0.39453125,
"learning_rate": 0.00012546711202412287,
"loss": 3.0121,
"step": 163
},
{
"epoch": 3.4893617021276597,
"grad_norm": 0.443359375,
"learning_rate": 0.00012463612742933148,
"loss": 3.0189,
"step": 164
},
{
"epoch": 3.5106382978723403,
"grad_norm": 0.57421875,
"learning_rate": 0.00012380332770614856,
"loss": 2.9589,
"step": 165
},
{
"epoch": 3.5319148936170213,
"grad_norm": 0.41015625,
"learning_rate": 0.0001229687742131796,
"loss": 2.9954,
"step": 166
},
{
"epoch": 3.5531914893617023,
"grad_norm": 0.40234375,
"learning_rate": 0.00012213252843824325,
"loss": 3.0266,
"step": 167
},
{
"epoch": 3.574468085106383,
"grad_norm": 0.54296875,
"learning_rate": 0.00012129465199384157,
"loss": 3.0273,
"step": 168
},
{
"epoch": 3.595744680851064,
"grad_norm": 0.5234375,
"learning_rate": 0.0001204552066126201,
"loss": 3.0214,
"step": 169
},
{
"epoch": 3.617021276595745,
"grad_norm": 0.400390625,
"learning_rate": 0.0001196142541428197,
"loss": 3.0232,
"step": 170
},
{
"epoch": 3.6382978723404253,
"grad_norm": 0.49609375,
"learning_rate": 0.00011877185654371987,
"loss": 3.0004,
"step": 171
},
{
"epoch": 3.6595744680851063,
"grad_norm": 0.515625,
"learning_rate": 0.00011792807588107357,
"loss": 3.0165,
"step": 172
},
{
"epoch": 3.6808510638297873,
"grad_norm": 0.44921875,
"learning_rate": 0.00011708297432253444,
"loss": 3.0491,
"step": 173
},
{
"epoch": 3.702127659574468,
"grad_norm": 0.41796875,
"learning_rate": 0.00011623661413307639,
"loss": 2.9456,
"step": 174
},
{
"epoch": 3.723404255319149,
"grad_norm": 0.4921875,
"learning_rate": 0.0001153890576704062,
"loss": 3.0586,
"step": 175
},
{
"epoch": 3.74468085106383,
"grad_norm": 0.4921875,
"learning_rate": 0.00011454036738036899,
"loss": 3.0125,
"step": 176
},
{
"epoch": 3.7659574468085104,
"grad_norm": 0.427734375,
"learning_rate": 0.00011369060579234754,
"loss": 2.9722,
"step": 177
},
{
"epoch": 3.7872340425531914,
"grad_norm": 0.498046875,
"learning_rate": 0.00011283983551465511,
"loss": 2.9201,
"step": 178
},
{
"epoch": 3.8085106382978724,
"grad_norm": 0.4765625,
"learning_rate": 0.00011198811922992274,
"loss": 3.0565,
"step": 179
},
{
"epoch": 3.829787234042553,
"grad_norm": 0.462890625,
"learning_rate": 0.00011113551969048089,
"loss": 3.0615,
"step": 180
},
{
"epoch": 3.851063829787234,
"grad_norm": 0.474609375,
"learning_rate": 0.00011028209971373605,
"loss": 3.0731,
"step": 181
},
{
"epoch": 3.872340425531915,
"grad_norm": 0.46484375,
"learning_rate": 0.00010942792217754245,
"loss": 3.0144,
"step": 182
},
{
"epoch": 3.8936170212765955,
"grad_norm": 0.50390625,
"learning_rate": 0.00010857305001556944,
"loss": 2.9905,
"step": 183
},
{
"epoch": 3.9148936170212765,
"grad_norm": 0.546875,
"learning_rate": 0.00010771754621266466,
"loss": 3.0232,
"step": 184
},
{
"epoch": 3.9361702127659575,
"grad_norm": 0.53515625,
"learning_rate": 0.00010686147380021342,
"loss": 3.0408,
"step": 185
},
{
"epoch": 3.9574468085106385,
"grad_norm": 0.609375,
"learning_rate": 0.00010600489585149484,
"loss": 2.9963,
"step": 186
},
{
"epoch": 3.978723404255319,
"grad_norm": 0.6875,
"learning_rate": 0.00010514787547703466,
"loss": 3.0049,
"step": 187
},
{
"epoch": 4.0,
"grad_norm": 1.1015625,
"learning_rate": 0.00010429047581995546,
"loss": 2.4433,
"step": 188
},
{
"epoch": 4.0212765957446805,
"grad_norm": 0.5859375,
"learning_rate": 0.00010343276005132436,
"loss": 2.8295,
"step": 189
},
{
"epoch": 4.042553191489362,
"grad_norm": 0.8359375,
"learning_rate": 0.00010257479136549889,
"loss": 2.8904,
"step": 190
},
{
"epoch": 4.0638297872340425,
"grad_norm": 0.796875,
"learning_rate": 0.00010171663297547076,
"loss": 2.834,
"step": 191
},
{
"epoch": 4.085106382978723,
"grad_norm": 0.498046875,
"learning_rate": 0.00010085834810820871,
"loss": 2.9309,
"step": 192
},
{
"epoch": 4.1063829787234045,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 2.9461,
"step": 193
},
{
"epoch": 4.127659574468085,
"grad_norm": 0.65625,
"learning_rate": 9.914165189179131e-05,
"loss": 2.9405,
"step": 194
},
{
"epoch": 4.148936170212766,
"grad_norm": 0.546875,
"learning_rate": 9.828336702452927e-05,
"loss": 2.7445,
"step": 195
},
{
"epoch": 4.170212765957447,
"grad_norm": 0.50390625,
"learning_rate": 9.742520863450115e-05,
"loss": 2.963,
"step": 196
},
{
"epoch": 4.191489361702128,
"grad_norm": 0.59375,
"learning_rate": 9.656723994867566e-05,
"loss": 2.8778,
"step": 197
},
{
"epoch": 4.212765957446808,
"grad_norm": 0.58984375,
"learning_rate": 9.570952418004455e-05,
"loss": 2.9148,
"step": 198
},
{
"epoch": 4.23404255319149,
"grad_norm": 0.5546875,
"learning_rate": 9.485212452296535e-05,
"loss": 2.9028,
"step": 199
},
{
"epoch": 4.25531914893617,
"grad_norm": 0.62109375,
"learning_rate": 9.399510414850518e-05,
"loss": 2.898,
"step": 200
},
{
"epoch": 4.276595744680851,
"grad_norm": 0.61328125,
"learning_rate": 9.313852619978659e-05,
"loss": 2.9883,
"step": 201
},
{
"epoch": 4.297872340425532,
"grad_norm": 0.51171875,
"learning_rate": 9.228245378733537e-05,
"loss": 2.886,
"step": 202
},
{
"epoch": 4.319148936170213,
"grad_norm": 0.5546875,
"learning_rate": 9.142694998443056e-05,
"loss": 2.9453,
"step": 203
},
{
"epoch": 4.340425531914893,
"grad_norm": 0.58984375,
"learning_rate": 9.057207782245757e-05,
"loss": 2.8555,
"step": 204
},
{
"epoch": 4.361702127659575,
"grad_norm": 0.56640625,
"learning_rate": 8.971790028626395e-05,
"loss": 2.9359,
"step": 205
},
{
"epoch": 4.382978723404255,
"grad_norm": 0.45703125,
"learning_rate": 8.886448030951912e-05,
"loss": 2.8469,
"step": 206
},
{
"epoch": 4.404255319148936,
"grad_norm": 0.6015625,
"learning_rate": 8.801188077007728e-05,
"loss": 2.8963,
"step": 207
},
{
"epoch": 4.425531914893617,
"grad_norm": 0.54296875,
"learning_rate": 8.71601644853449e-05,
"loss": 2.8965,
"step": 208
},
{
"epoch": 4.446808510638298,
"grad_norm": 0.57421875,
"learning_rate": 8.630939420765247e-05,
"loss": 2.9457,
"step": 209
},
{
"epoch": 4.468085106382979,
"grad_norm": 0.5234375,
"learning_rate": 8.545963261963102e-05,
"loss": 2.8918,
"step": 210
},
{
"epoch": 4.48936170212766,
"grad_norm": 0.60546875,
"learning_rate": 8.461094232959381e-05,
"loss": 2.8957,
"step": 211
},
{
"epoch": 4.51063829787234,
"grad_norm": 0.6484375,
"learning_rate": 8.376338586692366e-05,
"loss": 2.8224,
"step": 212
},
{
"epoch": 4.531914893617021,
"grad_norm": 0.5703125,
"learning_rate": 8.29170256774656e-05,
"loss": 2.859,
"step": 213
},
{
"epoch": 4.553191489361702,
"grad_norm": 0.58203125,
"learning_rate": 8.207192411892646e-05,
"loss": 2.8896,
"step": 214
},
{
"epoch": 4.574468085106383,
"grad_norm": 0.58203125,
"learning_rate": 8.122814345628016e-05,
"loss": 2.8874,
"step": 215
},
{
"epoch": 4.595744680851064,
"grad_norm": 0.56640625,
"learning_rate": 8.038574585718032e-05,
"loss": 2.8869,
"step": 216
},
{
"epoch": 4.617021276595745,
"grad_norm": 0.5703125,
"learning_rate": 7.954479338737995e-05,
"loss": 2.8923,
"step": 217
},
{
"epoch": 4.638297872340425,
"grad_norm": 0.59765625,
"learning_rate": 7.870534800615845e-05,
"loss": 2.868,
"step": 218
},
{
"epoch": 4.659574468085106,
"grad_norm": 0.625,
"learning_rate": 7.786747156175676e-05,
"loss": 2.8831,
"step": 219
},
{
"epoch": 4.680851063829787,
"grad_norm": 0.578125,
"learning_rate": 7.703122578682046e-05,
"loss": 2.9084,
"step": 220
},
{
"epoch": 4.702127659574468,
"grad_norm": 0.546875,
"learning_rate": 7.619667229385146e-05,
"loss": 2.8085,
"step": 221
},
{
"epoch": 4.723404255319149,
"grad_norm": 0.6875,
"learning_rate": 7.536387257066854e-05,
"loss": 2.92,
"step": 222
},
{
"epoch": 4.74468085106383,
"grad_norm": 0.65234375,
"learning_rate": 7.453288797587714e-05,
"loss": 2.8661,
"step": 223
},
{
"epoch": 4.76595744680851,
"grad_norm": 0.55859375,
"learning_rate": 7.370377973434855e-05,
"loss": 2.8322,
"step": 224
},
{
"epoch": 4.787234042553192,
"grad_norm": 0.5859375,
"learning_rate": 7.2876608932709e-05,
"loss": 2.772,
"step": 225
},
{
"epoch": 4.808510638297872,
"grad_norm": 0.71875,
"learning_rate": 7.205143651483906e-05,
"loss": 2.905,
"step": 226
},
{
"epoch": 4.829787234042553,
"grad_norm": 0.703125,
"learning_rate": 7.122832327738331e-05,
"loss": 2.9116,
"step": 227
},
{
"epoch": 4.851063829787234,
"grad_norm": 0.61328125,
"learning_rate": 7.040732986527108e-05,
"loss": 2.9203,
"step": 228
},
{
"epoch": 4.872340425531915,
"grad_norm": 0.6328125,
"learning_rate": 6.958851676724823e-05,
"loss": 2.8646,
"step": 229
},
{
"epoch": 4.8936170212765955,
"grad_norm": 0.69140625,
"learning_rate": 6.877194431142055e-05,
"loss": 2.844,
"step": 230
},
{
"epoch": 4.914893617021277,
"grad_norm": 0.73828125,
"learning_rate": 6.79576726608089e-05,
"loss": 2.8604,
"step": 231
},
{
"epoch": 4.9361702127659575,
"grad_norm": 0.703125,
"learning_rate": 6.714576180891654e-05,
"loss": 2.8686,
"step": 232
},
{
"epoch": 4.957446808510638,
"grad_norm": 0.72265625,
"learning_rate": 6.633627157530899e-05,
"loss": 2.8085,
"step": 233
},
{
"epoch": 4.9787234042553195,
"grad_norm": 0.75390625,
"learning_rate": 6.552926160120663e-05,
"loss": 2.8017,
"step": 234
},
{
"epoch": 5.0,
"grad_norm": 1.328125,
"learning_rate": 6.472479134509052e-05,
"loss": 2.189,
"step": 235
},
{
"epoch": 5.0212765957446805,
"grad_norm": 0.8125,
"learning_rate": 6.392292007832168e-05,
"loss": 2.7068,
"step": 236
},
{
"epoch": 5.042553191489362,
"grad_norm": 0.85546875,
"learning_rate": 6.312370688077399e-05,
"loss": 2.7591,
"step": 237
},
{
"epoch": 5.0638297872340425,
"grad_norm": 0.984375,
"learning_rate": 6.232721063648148e-05,
"loss": 2.7161,
"step": 238
},
{
"epoch": 5.085106382978723,
"grad_norm": 0.78515625,
"learning_rate": 6.153349002929987e-05,
"loss": 2.8126,
"step": 239
},
{
"epoch": 5.1063829787234045,
"grad_norm": 0.7265625,
"learning_rate": 6.0742603538582835e-05,
"loss": 2.8485,
"step": 240
},
{
"epoch": 5.127659574468085,
"grad_norm": 0.578125,
"learning_rate": 5.9954609434873344e-05,
"loss": 2.8336,
"step": 241
},
{
"epoch": 5.148936170212766,
"grad_norm": 0.6171875,
"learning_rate": 5.9169565775610656e-05,
"loss": 2.6482,
"step": 242
},
{
"epoch": 5.170212765957447,
"grad_norm": 0.66015625,
"learning_rate": 5.838753040085256e-05,
"loss": 2.8597,
"step": 243
},
{
"epoch": 5.191489361702128,
"grad_norm": 0.6015625,
"learning_rate": 5.7608560929013946e-05,
"loss": 2.7875,
"step": 244
},
{
"epoch": 5.212765957446808,
"grad_norm": 0.6015625,
"learning_rate": 5.683271475262164e-05,
"loss": 2.822,
"step": 245
},
{
"epoch": 5.23404255319149,
"grad_norm": 0.7109375,
"learning_rate": 5.6060049034085815e-05,
"loss": 2.8034,
"step": 246
},
{
"epoch": 5.25531914893617,
"grad_norm": 0.5859375,
"learning_rate": 5.5290620701488594e-05,
"loss": 2.7899,
"step": 247
},
{
"epoch": 5.276595744680851,
"grad_norm": 0.65625,
"learning_rate": 5.452448644438946e-05,
"loss": 2.8848,
"step": 248
},
{
"epoch": 5.297872340425532,
"grad_norm": 0.63671875,
"learning_rate": 5.3761702709648556e-05,
"loss": 2.7907,
"step": 249
},
{
"epoch": 5.319148936170213,
"grad_norm": 0.65625,
"learning_rate": 5.300232569726804e-05,
"loss": 2.8616,
"step": 250
},
{
"epoch": 5.340425531914893,
"grad_norm": 0.6953125,
"learning_rate": 5.224641135625119e-05,
"loss": 2.7745,
"step": 251
},
{
"epoch": 5.361702127659575,
"grad_norm": 0.671875,
"learning_rate": 5.1494015380480396e-05,
"loss": 2.8555,
"step": 252
},
{
"epoch": 5.382978723404255,
"grad_norm": 0.63671875,
"learning_rate": 5.074519320461357e-05,
"loss": 2.7636,
"step": 253
},
{
"epoch": 5.404255319148936,
"grad_norm": 0.625,
"learning_rate": 5.000000000000002e-05,
"loss": 2.8076,
"step": 254
},
{
"epoch": 5.425531914893617,
"grad_norm": 0.56640625,
"learning_rate": 4.9258490670615475e-05,
"loss": 2.8087,
"step": 255
},
{
"epoch": 5.446808510638298,
"grad_norm": 0.55859375,
"learning_rate": 4.852071984901696e-05,
"loss": 2.8507,
"step": 256
},
{
"epoch": 5.468085106382979,
"grad_norm": 0.5546875,
"learning_rate": 4.778674189231751e-05,
"loss": 2.7981,
"step": 257
},
{
"epoch": 5.48936170212766,
"grad_norm": 0.58984375,
"learning_rate": 4.7056610878181486e-05,
"loss": 2.8039,
"step": 258
},
{
"epoch": 5.51063829787234,
"grad_norm": 0.65234375,
"learning_rate": 4.633038060083996e-05,
"loss": 2.7239,
"step": 259
},
{
"epoch": 5.531914893617021,
"grad_norm": 0.609375,
"learning_rate": 4.560810456712754e-05,
"loss": 2.7612,
"step": 260
},
{
"epoch": 5.553191489361702,
"grad_norm": 0.58203125,
"learning_rate": 4.488983599254001e-05,
"loss": 2.7895,
"step": 261
},
{
"epoch": 5.574468085106383,
"grad_norm": 0.63671875,
"learning_rate": 4.417562779731355e-05,
"loss": 2.7883,
"step": 262
},
{
"epoch": 5.595744680851064,
"grad_norm": 0.69140625,
"learning_rate": 4.346553260252574e-05,
"loss": 2.7913,
"step": 263
},
{
"epoch": 5.617021276595745,
"grad_norm": 0.7109375,
"learning_rate": 4.275960272621852e-05,
"loss": 2.7905,
"step": 264
},
{
"epoch": 5.638297872340425,
"grad_norm": 0.703125,
"learning_rate": 4.205789017954364e-05,
"loss": 2.7663,
"step": 265
},
{
"epoch": 5.659574468085106,
"grad_norm": 0.69140625,
"learning_rate": 4.136044666293044e-05,
"loss": 2.7839,
"step": 266
},
{
"epoch": 5.680851063829787,
"grad_norm": 0.6640625,
"learning_rate": 4.0667323562276814e-05,
"loss": 2.7986,
"step": 267
},
{
"epoch": 5.702127659574468,
"grad_norm": 0.6484375,
"learning_rate": 3.997857194516319e-05,
"loss": 2.7071,
"step": 268
},
{
"epoch": 5.723404255319149,
"grad_norm": 0.64453125,
"learning_rate": 3.929424255708999e-05,
"loss": 2.8141,
"step": 269
},
{
"epoch": 5.74468085106383,
"grad_norm": 0.69140625,
"learning_rate": 3.8614385817738794e-05,
"loss": 2.7508,
"step": 270
},
{
"epoch": 5.76595744680851,
"grad_norm": 0.6484375,
"learning_rate": 3.793905181725772e-05,
"loss": 2.7273,
"step": 271
},
{
"epoch": 5.787234042553192,
"grad_norm": 0.68359375,
"learning_rate": 3.726829031257062e-05,
"loss": 2.6695,
"step": 272
},
{
"epoch": 5.808510638297872,
"grad_norm": 0.6796875,
"learning_rate": 3.660215072371135e-05,
"loss": 2.7872,
"step": 273
},
{
"epoch": 5.829787234042553,
"grad_norm": 0.6875,
"learning_rate": 3.594068213018249e-05,
"loss": 2.7969,
"step": 274
},
{
"epoch": 5.851063829787234,
"grad_norm": 0.703125,
"learning_rate": 3.528393326733941e-05,
"loss": 2.8035,
"step": 275
},
{
"epoch": 5.872340425531915,
"grad_norm": 0.71484375,
"learning_rate": 3.463195252279939e-05,
"loss": 2.7496,
"step": 276
},
{
"epoch": 5.8936170212765955,
"grad_norm": 0.796875,
"learning_rate": 3.3984787932876814e-05,
"loss": 2.7365,
"step": 277
},
{
"epoch": 5.914893617021277,
"grad_norm": 0.8046875,
"learning_rate": 3.334248717904368e-05,
"loss": 2.7371,
"step": 278
},
{
"epoch": 5.9361702127659575,
"grad_norm": 0.81640625,
"learning_rate": 3.270509758441671e-05,
"loss": 2.7465,
"step": 279
},
{
"epoch": 5.957446808510638,
"grad_norm": 0.8828125,
"learning_rate": 3.207266611027069e-05,
"loss": 2.6859,
"step": 280
},
{
"epoch": 5.9787234042553195,
"grad_norm": 0.9375,
"learning_rate": 3.144523935257846e-05,
"loss": 2.6722,
"step": 281
},
{
"epoch": 6.0,
"grad_norm": 1.375,
"learning_rate": 3.082286353857782e-05,
"loss": 2.0584,
"step": 282
},
{
"epoch": 6.0212765957446805,
"grad_norm": 0.69921875,
"learning_rate": 3.0205584523365626e-05,
"loss": 2.6076,
"step": 283
},
{
"epoch": 6.042553191489362,
"grad_norm": 0.73046875,
"learning_rate": 2.9593447786519425e-05,
"loss": 2.6513,
"step": 284
},
{
"epoch": 6.0638297872340425,
"grad_norm": 0.859375,
"learning_rate": 2.8986498428746444e-05,
"loss": 2.6075,
"step": 285
},
{
"epoch": 6.085106382978723,
"grad_norm": 0.94140625,
"learning_rate": 2.8384781168560693e-05,
"loss": 2.7151,
"step": 286
},
{
"epoch": 6.1063829787234045,
"grad_norm": 1.0390625,
"learning_rate": 2.7788340338988385e-05,
"loss": 2.7812,
"step": 287
},
{
"epoch": 6.127659574468085,
"grad_norm": 1.1015625,
"learning_rate": 2.719721988430153e-05,
"loss": 2.7936,
"step": 288
},
{
"epoch": 6.148936170212766,
"grad_norm": 1.0,
"learning_rate": 2.6611463356780096e-05,
"loss": 2.6086,
"step": 289
},
{
"epoch": 6.170212765957447,
"grad_norm": 0.984375,
"learning_rate": 2.6031113913503337e-05,
"loss": 2.8151,
"step": 290
},
{
"epoch": 6.191489361702128,
"grad_norm": 0.734375,
"learning_rate": 2.5456214313170002e-05,
"loss": 2.7246,
"step": 291
},
{
"epoch": 6.212765957446808,
"grad_norm": 0.69140625,
"learning_rate": 2.4886806912948035e-05,
"loss": 2.7524,
"step": 292
},
{
"epoch": 6.23404255319149,
"grad_norm": 0.75390625,
"learning_rate": 2.4322933665353776e-05,
"loss": 2.7285,
"step": 293
},
{
"epoch": 6.25531914893617,
"grad_norm": 0.66015625,
"learning_rate": 2.3764636115160978e-05,
"loss": 2.7237,
"step": 294
},
{
"epoch": 6.276595744680851,
"grad_norm": 0.734375,
"learning_rate": 2.3211955396340002e-05,
"loss": 2.818,
"step": 295
},
{
"epoch": 6.297872340425532,
"grad_norm": 0.6796875,
"learning_rate": 2.2664932229027024e-05,
"loss": 2.7163,
"step": 296
},
{
"epoch": 6.319148936170213,
"grad_norm": 0.609375,
"learning_rate": 2.2123606916523953e-05,
"loss": 2.7859,
"step": 297
},
{
"epoch": 6.340425531914893,
"grad_norm": 0.64453125,
"learning_rate": 2.1588019342328968e-05,
"loss": 2.6892,
"step": 298
},
{
"epoch": 6.361702127659575,
"grad_norm": 0.6328125,
"learning_rate": 2.1058208967198045e-05,
"loss": 2.767,
"step": 299
},
{
"epoch": 6.382978723404255,
"grad_norm": 0.62109375,
"learning_rate": 2.0534214826237484e-05,
"loss": 2.6933,
"step": 300
},
{
"epoch": 6.404255319148936,
"grad_norm": 0.62890625,
"learning_rate": 2.0016075526028065e-05,
"loss": 2.7303,
"step": 301
},
{
"epoch": 6.425531914893617,
"grad_norm": 0.5703125,
"learning_rate": 1.9503829241780412e-05,
"loss": 2.7377,
"step": 302
},
{
"epoch": 6.446808510638298,
"grad_norm": 0.6171875,
"learning_rate": 1.8997513714522487e-05,
"loss": 2.7818,
"step": 303
},
{
"epoch": 6.468085106382979,
"grad_norm": 0.59765625,
"learning_rate": 1.8497166248318876e-05,
"loss": 2.7335,
"step": 304
},
{
"epoch": 6.48936170212766,
"grad_norm": 0.60546875,
"learning_rate": 1.8002823707522297e-05,
"loss": 2.733,
"step": 305
},
{
"epoch": 6.51063829787234,
"grad_norm": 0.66796875,
"learning_rate": 1.7514522514057553e-05,
"loss": 2.6446,
"step": 306
},
{
"epoch": 6.531914893617021,
"grad_norm": 0.6484375,
"learning_rate": 1.703229864473811e-05,
"loss": 2.6907,
"step": 307
},
{
"epoch": 6.553191489361702,
"grad_norm": 0.609375,
"learning_rate": 1.6556187628615273e-05,
"loss": 2.7176,
"step": 308
},
{
"epoch": 6.574468085106383,
"grad_norm": 0.6484375,
"learning_rate": 1.608622454436062e-05,
"loss": 2.7109,
"step": 309
},
{
"epoch": 6.595744680851064,
"grad_norm": 0.65625,
"learning_rate": 1.562244401768144e-05,
"loss": 2.7085,
"step": 310
},
{
"epoch": 6.617021276595745,
"grad_norm": 0.65625,
"learning_rate": 1.5164880218769618e-05,
"loss": 2.6987,
"step": 311
},
{
"epoch": 6.638297872340425,
"grad_norm": 0.62890625,
"learning_rate": 1.4713566859784045e-05,
"loss": 2.6835,
"step": 312
},
{
"epoch": 6.659574468085106,
"grad_norm": 0.6328125,
"learning_rate": 1.426853719236676e-05,
"loss": 2.6981,
"step": 313
},
{
"epoch": 6.680851063829787,
"grad_norm": 0.640625,
"learning_rate": 1.3829824005193181e-05,
"loss": 2.7132,
"step": 314
},
{
"epoch": 6.702127659574468,
"grad_norm": 0.62890625,
"learning_rate": 1.339745962155613e-05,
"loss": 2.6319,
"step": 315
},
{
"epoch": 6.723404255319149,
"grad_norm": 0.63671875,
"learning_rate": 1.2971475896984475e-05,
"loss": 2.7332,
"step": 316
},
{
"epoch": 6.74468085106383,
"grad_norm": 0.68359375,
"learning_rate": 1.2551904216896037e-05,
"loss": 2.6649,
"step": 317
},
{
"epoch": 6.76595744680851,
"grad_norm": 0.6484375,
"learning_rate": 1.2138775494285182e-05,
"loss": 2.6486,
"step": 318
},
{
"epoch": 6.787234042553192,
"grad_norm": 0.69140625,
"learning_rate": 1.1732120167445248e-05,
"loss": 2.5875,
"step": 319
},
{
"epoch": 6.808510638297872,
"grad_norm": 0.7109375,
"learning_rate": 1.1331968197725984e-05,
"loss": 2.7079,
"step": 320
},
{
"epoch": 6.829787234042553,
"grad_norm": 0.70703125,
"learning_rate": 1.0938349067325959e-05,
"loss": 2.7134,
"step": 321
},
{
"epoch": 6.851063829787234,
"grad_norm": 0.71875,
"learning_rate": 1.0551291777120464e-05,
"loss": 2.7199,
"step": 322
},
{
"epoch": 6.872340425531915,
"grad_norm": 0.703125,
"learning_rate": 1.0170824844524728e-05,
"loss": 2.6655,
"step": 323
},
{
"epoch": 6.8936170212765955,
"grad_norm": 0.73046875,
"learning_rate": 9.796976301392934e-06,
"loss": 2.6519,
"step": 324
},
{
"epoch": 6.914893617021277,
"grad_norm": 0.75390625,
"learning_rate": 9.429773691952858e-06,
"loss": 2.6443,
"step": 325
},
{
"epoch": 6.9361702127659575,
"grad_norm": 0.79296875,
"learning_rate": 9.069244070776428e-06,
"loss": 2.6531,
"step": 326
},
{
"epoch": 6.957446808510638,
"grad_norm": 0.86328125,
"learning_rate": 8.715414000786448e-06,
"loss": 2.5897,
"step": 327
},
{
"epoch": 6.9787234042553195,
"grad_norm": 1.03125,
"learning_rate": 8.368309551299536e-06,
"loss": 2.5772,
"step": 328
},
{
"epoch": 7.0,
"grad_norm": 1.296875,
"learning_rate": 8.027956296105354e-06,
"loss": 1.9731,
"step": 329
},
{
"epoch": 7.0212765957446805,
"grad_norm": 0.75,
"learning_rate": 7.6943793115824e-06,
"loss": 2.5669,
"step": 330
},
{
"epoch": 7.042553191489362,
"grad_norm": 0.703125,
"learning_rate": 7.367603174850502e-06,
"loss": 2.6154,
"step": 331
},
{
"epoch": 7.0638297872340425,
"grad_norm": 0.72265625,
"learning_rate": 7.047651961959978e-06,
"loss": 2.5542,
"step": 332
},
{
"epoch": 7.085106382978723,
"grad_norm": 0.74609375,
"learning_rate": 6.73454924611776e-06,
"loss": 2.6428,
"step": 333
},
{
"epoch": 7.1063829787234045,
"grad_norm": 0.64453125,
"learning_rate": 6.428318095950647e-06,
"loss": 2.6929,
"step": 334
},
{
"epoch": 7.127659574468085,
"grad_norm": 0.63671875,
"learning_rate": 6.128981073805584e-06,
"loss": 2.6994,
"step": 335
},
{
"epoch": 7.148936170212766,
"grad_norm": 0.67578125,
"learning_rate": 5.836560234087418e-06,
"loss": 2.5162,
"step": 336
},
{
"epoch": 7.170212765957447,
"grad_norm": 0.76953125,
"learning_rate": 5.551077121633874e-06,
"loss": 2.7308,
"step": 337
},
{
"epoch": 7.191489361702128,
"grad_norm": 0.734375,
"learning_rate": 5.272552770128314e-06,
"loss": 2.6655,
"step": 338
},
{
"epoch": 7.212765957446808,
"grad_norm": 0.7578125,
"learning_rate": 5.001007700549898e-06,
"loss": 2.7014,
"step": 339
},
{
"epoch": 7.23404255319149,
"grad_norm": 0.80078125,
"learning_rate": 4.7364619196617495e-06,
"loss": 2.6704,
"step": 340
},
{
"epoch": 7.25531914893617,
"grad_norm": 0.734375,
"learning_rate": 4.478934918536837e-06,
"loss": 2.6756,
"step": 341
},
{
"epoch": 7.276595744680851,
"grad_norm": 0.7578125,
"learning_rate": 4.228445671121972e-06,
"loss": 2.7574,
"step": 342
},
{
"epoch": 7.297872340425532,
"grad_norm": 0.6640625,
"learning_rate": 3.985012632839824e-06,
"loss": 2.6565,
"step": 343
},
{
"epoch": 7.319148936170213,
"grad_norm": 0.69140625,
"learning_rate": 3.748653739229191e-06,
"loss": 2.7389,
"step": 344
},
{
"epoch": 7.340425531914893,
"grad_norm": 0.68359375,
"learning_rate": 3.519386404623537e-06,
"loss": 2.6382,
"step": 345
},
{
"epoch": 7.361702127659575,
"grad_norm": 0.73828125,
"learning_rate": 3.2972275208679625e-06,
"loss": 2.7147,
"step": 346
},
{
"epoch": 7.382978723404255,
"grad_norm": 0.7421875,
"learning_rate": 3.0821934560746447e-06,
"loss": 2.6497,
"step": 347
},
{
"epoch": 7.404255319148936,
"grad_norm": 0.76171875,
"learning_rate": 2.8743000534168675e-06,
"loss": 2.6844,
"step": 348
},
{
"epoch": 7.425531914893617,
"grad_norm": 0.72265625,
"learning_rate": 2.6735626299617457e-06,
"loss": 2.6961,
"step": 349
},
{
"epoch": 7.446808510638298,
"grad_norm": 0.7109375,
"learning_rate": 2.479995975541749e-06,
"loss": 2.7341,
"step": 350
},
{
"epoch": 7.468085106382979,
"grad_norm": 0.7109375,
"learning_rate": 2.2936143516649188e-06,
"loss": 2.6872,
"step": 351
},
{
"epoch": 7.48936170212766,
"grad_norm": 0.69921875,
"learning_rate": 2.1144314904642195e-06,
"loss": 2.6879,
"step": 352
},
{
"epoch": 7.51063829787234,
"grad_norm": 0.7109375,
"learning_rate": 1.942460593685713e-06,
"loss": 2.5916,
"step": 353
},
{
"epoch": 7.531914893617021,
"grad_norm": 0.72265625,
"learning_rate": 1.7777143317159406e-06,
"loss": 2.643,
"step": 354
},
{
"epoch": 7.553191489361702,
"grad_norm": 0.671875,
"learning_rate": 1.6202048426483651e-06,
"loss": 2.6724,
"step": 355
},
{
"epoch": 7.574468085106383,
"grad_norm": 0.6875,
"learning_rate": 1.4699437313891007e-06,
"loss": 2.6634,
"step": 356
},
{
"epoch": 7.595744680851064,
"grad_norm": 0.7265625,
"learning_rate": 1.3269420688018508e-06,
"loss": 2.6651,
"step": 357
},
{
"epoch": 7.617021276595745,
"grad_norm": 0.70703125,
"learning_rate": 1.1912103908922945e-06,
"loss": 2.6545,
"step": 358
},
{
"epoch": 7.638297872340425,
"grad_norm": 0.69921875,
"learning_rate": 1.0627586980317073e-06,
"loss": 2.6455,
"step": 359
},
{
"epoch": 7.659574468085106,
"grad_norm": 0.69921875,
"learning_rate": 9.415964542203059e-07,
"loss": 2.6622,
"step": 360
},
{
"epoch": 7.680851063829787,
"grad_norm": 0.7265625,
"learning_rate": 8.277325863898511e-07,
"loss": 2.6787,
"step": 361
},
{
"epoch": 7.702127659574468,
"grad_norm": 0.7109375,
"learning_rate": 7.21175483745995e-07,
"loss": 2.6004,
"step": 362
},
{
"epoch": 7.723404255319149,
"grad_norm": 0.7265625,
"learning_rate": 6.219329971501653e-07,
"loss": 2.7023,
"step": 363
},
{
"epoch": 7.74468085106383,
"grad_norm": 0.73828125,
"learning_rate": 5.300124385410943e-07,
"loss": 2.6309,
"step": 364
},
{
"epoch": 7.76595744680851,
"grad_norm": 0.6953125,
"learning_rate": 4.4542058039619417e-07,
"loss": 2.6197,
"step": 365
},
{
"epoch": 7.787234042553192,
"grad_norm": 0.75,
"learning_rate": 3.681636552324452e-07,
"loss": 2.5579,
"step": 366
},
{
"epoch": 7.808510638297872,
"grad_norm": 0.78515625,
"learning_rate": 2.9824735514732974e-07,
"loss": 2.6765,
"step": 367
},
{
"epoch": 7.829787234042553,
"grad_norm": 0.8203125,
"learning_rate": 2.3567683139936735e-07,
"loss": 2.687,
"step": 368
},
{
"epoch": 7.851063829787234,
"grad_norm": 0.796875,
"learning_rate": 1.8045669402859677e-07,
"loss": 2.6924,
"step": 369
},
{
"epoch": 7.872340425531915,
"grad_norm": 0.79296875,
"learning_rate": 1.3259101151694708e-07,
"loss": 2.6409,
"step": 370
},
{
"epoch": 7.8936170212765955,
"grad_norm": 0.78125,
"learning_rate": 9.208331048846663e-08,
"loss": 2.6251,
"step": 371
},
{
"epoch": 7.914893617021277,
"grad_norm": 0.77734375,
"learning_rate": 5.893657544947528e-08,
"loss": 2.616,
"step": 372
},
{
"epoch": 7.9361702127659575,
"grad_norm": 0.8515625,
"learning_rate": 3.3153248568695835e-08,
"loss": 2.626,
"step": 373
},
{
"epoch": 7.957446808510638,
"grad_norm": 0.81640625,
"learning_rate": 1.47352294973091e-08,
"loss": 2.5585,
"step": 374
},
{
"epoch": 7.9787234042553195,
"grad_norm": 0.89453125,
"learning_rate": 3.6838752290102585e-09,
"loss": 2.543,
"step": 375
},
{
"epoch": 8.0,
"grad_norm": 1.046875,
"learning_rate": 0.0,
"loss": 1.9425,
"step": 376
}
],
"logging_steps": 1,
"max_steps": 376,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 47,
"total_flos": 5.59507839123456e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}