diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1735 +1,3443 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 2.0, "eval_steps": 500, - "global_step": 977, + "global_step": 1954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0040941658137154556, - "grad_norm": 0.37109375, - "learning_rate": 0.00011999999999999999, - "loss": 0.4557, + "grad_norm": 0.59375, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.6558, "step": 4 }, { "epoch": 0.008188331627430911, - "grad_norm": 0.376953125, - "learning_rate": 0.00023999999999999998, - "loss": 0.4567, + "grad_norm": 0.46875, + "learning_rate": 0.00011999999999999999, + "loss": 0.6895, "step": 8 }, { "epoch": 0.012282497441146366, "grad_norm": 0.4296875, - "learning_rate": 0.0002999968335945527, - "loss": 0.4431, + "learning_rate": 0.00017999999999999998, + "loss": 0.641, "step": 12 }, { "epoch": 0.016376663254861822, - "grad_norm": 0.396484375, - "learning_rate": 0.0002999715031530591, - "loss": 0.4587, + "grad_norm": 0.357421875, + "learning_rate": 0.00023999999999999998, + "loss": 0.6635, "step": 16 }, { "epoch": 0.02047082906857728, - "grad_norm": 0.40234375, - "learning_rate": 0.0002999208465477039, - "loss": 0.4759, + "grad_norm": 0.4296875, + "learning_rate": 0.0003, + "loss": 0.6486, "step": 20 }, { "epoch": 0.02456499488229273, - "grad_norm": 0.361328125, - "learning_rate": 0.0002998448723330289, - "loss": 0.4522, + "grad_norm": 0.408203125, + "learning_rate": 0.0002999968335945527, + "loss": 0.6335, "step": 24 }, { "epoch": 0.028659160696008188, - "grad_norm": 0.412109375, - "learning_rate": 0.0002997435933390409, - "loss": 0.5118, + "grad_norm": 0.404296875, + "learning_rate": 0.00029998733451189267, + "loss": 0.6513, "step": 28 }, { "epoch": 0.032753326509723645, - "grad_norm": 0.416015625, - "learning_rate": 0.00029961702666904524, - "loss": 0.4726, + "grad_norm": 0.388671875, + "learning_rate": 0.0002999715031530591, + "loss": 0.6439, "step": 32 }, { "epoch": 0.0368474923234391, - "grad_norm": 0.44140625, - "learning_rate": 0.00029946519369675726, - "loss": 0.4631, + "grad_norm": 0.4296875, + "learning_rate": 0.000299949340186432, + "loss": 0.6304, "step": 36 }, { "epoch": 0.04094165813715456, - "grad_norm": 0.421875, - "learning_rate": 0.000299288120062693, - "loss": 0.5306, + "grad_norm": 0.423828125, + "learning_rate": 0.0002999208465477039, + "loss": 0.6772, "step": 40 }, { "epoch": 0.04503582395087001, - "grad_norm": 0.416015625, - "learning_rate": 0.0002990858356698392, - "loss": 0.4809, + "grad_norm": 0.404296875, + "learning_rate": 0.0002998860234398403, + "loss": 0.6294, "step": 44 }, { "epoch": 0.04912998976458546, - "grad_norm": 0.3984375, - "learning_rate": 0.0002988583746786035, - "loss": 0.4814, + "grad_norm": 0.396484375, + "learning_rate": 0.0002998448723330289, + "loss": 0.6093, "step": 48 }, { "epoch": 0.05322415557830092, - "grad_norm": 0.412109375, - "learning_rate": 0.00029860577550104567, - "loss": 0.518, + "grad_norm": 0.408203125, + "learning_rate": 0.0002997973949646176, + "loss": 0.6516, "step": 52 }, { "epoch": 0.057318321392016376, - "grad_norm": 0.435546875, - "learning_rate": 0.00029832808079439076, - "loss": 0.4904, + "grad_norm": 0.443359375, + "learning_rate": 0.0002997435933390409, + "loss": 0.6261, "step": 56 }, { "epoch": 0.06141248720573183, - "grad_norm": 0.41796875, - "learning_rate": 0.00029802533745382546, - "loss": 0.512, + "grad_norm": 0.390625, + "learning_rate": 0.0002996834697277358, + "loss": 0.6294, "step": 60 }, { "epoch": 0.06550665301944729, - "grad_norm": 0.400390625, - "learning_rate": 0.0002976975966045788, - "loss": 0.4935, + "grad_norm": 0.38671875, + "learning_rate": 0.00029961702666904524, + "loss": 0.615, "step": 64 }, { "epoch": 0.06960081883316274, - "grad_norm": 0.416015625, - "learning_rate": 0.00029734491359328854, - "loss": 0.4664, + "grad_norm": 0.40234375, + "learning_rate": 0.00029954426696811147, + "loss": 0.6036, "step": 68 }, { "epoch": 0.0736949846468782, - "grad_norm": 0.40234375, - "learning_rate": 0.0002969673479786545, - "loss": 0.445, + "grad_norm": 0.3984375, + "learning_rate": 0.00029946519369675726, + "loss": 0.5834, "step": 72 }, { "epoch": 0.07778915046059365, - "grad_norm": 0.3828125, - "learning_rate": 0.00029656496352138066, - "loss": 0.503, + "grad_norm": 0.37109375, + "learning_rate": 0.0002993798101933565, + "loss": 0.628, "step": 76 }, { "epoch": 0.08188331627430911, "grad_norm": 0.39453125, - "learning_rate": 0.0002961378281734078, - "loss": 0.5029, + "learning_rate": 0.000299288120062693, + "loss": 0.6307, "step": 80 }, { "epoch": 0.08597748208802457, - "grad_norm": 0.38671875, - "learning_rate": 0.00029568601406643826, - "loss": 0.5077, + "grad_norm": 0.3984375, + "learning_rate": 0.0002991901271758085, + "loss": 0.6392, "step": 84 }, { "epoch": 0.09007164790174002, - "grad_norm": 0.41015625, - "learning_rate": 0.0002952095974997546, - "loss": 0.4825, + "grad_norm": 0.419921875, + "learning_rate": 0.0002990858356698392, + "loss": 0.6184, "step": 88 }, { "epoch": 0.09416581371545547, - "grad_norm": 0.41796875, - "learning_rate": 0.000294708658927335, - "loss": 0.5425, + "grad_norm": 0.390625, + "learning_rate": 0.00029897524994784095, + "loss": 0.6669, "step": 92 }, { "epoch": 0.09825997952917093, - "grad_norm": 0.380859375, - "learning_rate": 0.00029418328294426643, - "loss": 0.5262, + "grad_norm": 0.353515625, + "learning_rate": 0.0002988583746786035, + "loss": 0.6474, "step": 96 }, { "epoch": 0.1023541453428864, - "grad_norm": 0.404296875, - "learning_rate": 0.00029363355827245925, - "loss": 0.5038, + "grad_norm": 0.41015625, + "learning_rate": 0.0002987352147964534, + "loss": 0.6427, "step": 100 }, { "epoch": 0.10644831115660185, - "grad_norm": 0.373046875, - "learning_rate": 0.000293059577745664, - "loss": 0.4963, + "grad_norm": 0.369140625, + "learning_rate": 0.00029860577550104567, + "loss": 0.6331, "step": 104 }, { "epoch": 0.1105424769703173, - "grad_norm": 0.369140625, - "learning_rate": 0.0002924614382937944, - "loss": 0.5006, + "grad_norm": 0.35546875, + "learning_rate": 0.0002984700622571441, + "loss": 0.6274, "step": 108 }, { "epoch": 0.11463664278403275, - "grad_norm": 0.4140625, - "learning_rate": 0.0002918392409265587, - "loss": 0.4985, + "grad_norm": 0.40234375, + "learning_rate": 0.00029832808079439076, + "loss": 0.6363, "step": 112 }, { "epoch": 0.1187308085977482, - "grad_norm": 0.412109375, - "learning_rate": 0.0002911930907164017, - "loss": 0.5109, + "grad_norm": 0.57421875, + "learning_rate": 0.000298179837107064, + "loss": 0.6413, "step": 116 }, { "epoch": 0.12282497441146366, - "grad_norm": 0.38671875, - "learning_rate": 0.00029052309678076065, - "loss": 0.5216, + "grad_norm": 0.390625, + "learning_rate": 0.00029802533745382546, + "loss": 0.6439, "step": 120 }, { "epoch": 0.1269191402251791, - "grad_norm": 0.43359375, - "learning_rate": 0.0002898293722636386, - "loss": 0.5082, + "grad_norm": 0.3984375, + "learning_rate": 0.00029786458835745564, + "loss": 0.6274, "step": 124 }, { "epoch": 0.13101330603889458, - "grad_norm": 0.3828125, - "learning_rate": 0.0002891120343164972, - "loss": 0.4873, + "grad_norm": 0.37890625, + "learning_rate": 0.0002976975966045788, + "loss": 0.605, "step": 128 }, { "epoch": 0.13510747185261002, - "grad_norm": 0.40234375, - "learning_rate": 0.00028837120407847286, - "loss": 0.5025, + "grad_norm": 0.384765625, + "learning_rate": 0.00029752436924537616, + "loss": 0.6316, "step": 132 }, { "epoch": 0.13920163766632548, - "grad_norm": 0.40625, - "learning_rate": 0.00028760700665591985, - "loss": 0.5645, + "grad_norm": 0.376953125, + "learning_rate": 0.00029734491359328854, + "loss": 0.6802, "step": 136 }, { "epoch": 0.14329580348004095, - "grad_norm": 0.421875, - "learning_rate": 0.00028681957110128313, - "loss": 0.5617, + "grad_norm": 0.373046875, + "learning_rate": 0.00029715923722470724, + "loss": 0.6841, "step": 140 }, { "epoch": 0.1473899692937564, - "grad_norm": 0.408203125, - "learning_rate": 0.0002860090303913048, - "loss": 0.4772, + "grad_norm": 0.396484375, + "learning_rate": 0.0002969673479786545, + "loss": 0.6004, "step": 144 }, { "epoch": 0.15148413510747186, - "grad_norm": 0.396484375, - "learning_rate": 0.0002851755214045676, - "loss": 0.5494, + "grad_norm": 0.380859375, + "learning_rate": 0.00029676925395645233, + "loss": 0.671, "step": 148 }, { "epoch": 0.1555783009211873, - "grad_norm": 0.41015625, - "learning_rate": 0.00028431918489838057, - "loss": 0.5034, + "grad_norm": 0.37890625, + "learning_rate": 0.00029656496352138066, + "loss": 0.6257, "step": 152 }, { "epoch": 0.15967246673490276, - "grad_norm": 0.3984375, - "learning_rate": 0.000283440165485008, - "loss": 0.4995, + "grad_norm": 0.384765625, + "learning_rate": 0.00029635448529832407, + "loss": 0.6141, "step": 156 }, { "epoch": 0.16376663254861823, - "grad_norm": 0.4140625, - "learning_rate": 0.000282538611607249, - "loss": 0.5073, + "grad_norm": 0.390625, + "learning_rate": 0.0002961378281734078, + "loss": 0.6391, "step": 160 }, { "epoch": 0.16786079836233367, - "grad_norm": 0.38671875, - "learning_rate": 0.000281614675513369, - "loss": 0.4984, + "grad_norm": 0.396484375, + "learning_rate": 0.00029591500129362255, + "loss": 0.6211, "step": 164 }, { "epoch": 0.17195496417604914, - "grad_norm": 0.392578125, - "learning_rate": 0.0002806685132313896, - "loss": 0.5475, + "grad_norm": 0.369140625, + "learning_rate": 0.00029568601406643826, + "loss": 0.6644, "step": 168 }, { "epoch": 0.17604912998976457, - "grad_norm": 0.423828125, - "learning_rate": 0.00027970028454273917, - "loss": 0.5389, + "grad_norm": 0.388671875, + "learning_rate": 0.0002954508761594069, + "loss": 0.6667, "step": 172 }, { "epoch": 0.18014329580348004, - "grad_norm": 0.419921875, - "learning_rate": 0.0002787101529552702, - "loss": 0.513, + "grad_norm": 0.412109375, + "learning_rate": 0.0002952095974997546, + "loss": 0.6351, "step": 176 }, { "epoch": 0.1842374616171955, - "grad_norm": 0.416015625, - "learning_rate": 0.0002776982856756473, - "loss": 0.4928, + "grad_norm": 0.390625, + "learning_rate": 0.0002949621882739621, + "loss": 0.6142, "step": 180 }, { "epoch": 0.18833162743091095, - "grad_norm": 0.416015625, - "learning_rate": 0.0002766648535811105, - "loss": 0.5224, + "grad_norm": 0.412109375, + "learning_rate": 0.000294708658927335, + "loss": 0.6569, "step": 184 }, { "epoch": 0.19242579324462641, - "grad_norm": 0.380859375, - "learning_rate": 0.0002756100311906185, - "loss": 0.5163, + "grad_norm": 0.353515625, + "learning_rate": 0.00029444902016356267, + "loss": 0.6272, "step": 188 }, { "epoch": 0.19651995905834185, - "grad_norm": 0.392578125, - "learning_rate": 0.00027453399663537707, - "loss": 0.5496, + "grad_norm": 0.37109375, + "learning_rate": 0.00029418328294426643, + "loss": 0.6742, "step": 192 }, { "epoch": 0.20061412487205732, - "grad_norm": 0.4296875, - "learning_rate": 0.0002734369316287578, - "loss": 0.5266, + "grad_norm": 0.404296875, + "learning_rate": 0.00029391145848853674, + "loss": 0.6513, "step": 196 }, { "epoch": 0.2047082906857728, - "grad_norm": 0.3984375, - "learning_rate": 0.0002723190214356113, - "loss": 0.5196, + "grad_norm": 0.37109375, + "learning_rate": 0.00029363355827245925, + "loss": 0.6369, "step": 200 }, { "epoch": 0.20880245649948823, - "grad_norm": 0.404296875, - "learning_rate": 0.00027118045484098095, - "loss": 0.5135, + "grad_norm": 0.376953125, + "learning_rate": 0.0002933495940286309, + "loss": 0.6371, "step": 204 }, { "epoch": 0.2128966223132037, - "grad_norm": 0.404296875, - "learning_rate": 0.0002700214241182223, - "loss": 0.5549, + "grad_norm": 0.376953125, + "learning_rate": 0.000293059577745664, + "loss": 0.6745, "step": 208 }, { "epoch": 0.21699078812691913, - "grad_norm": 0.40625, - "learning_rate": 0.0002688421249965331, - "loss": 0.5456, + "grad_norm": 0.376953125, + "learning_rate": 0.00029276352166768033, + "loss": 0.6577, "step": 212 }, { "epoch": 0.2210849539406346, - "grad_norm": 0.39453125, - "learning_rate": 0.00026764275662790005, - "loss": 0.4948, + "grad_norm": 0.392578125, + "learning_rate": 0.0002924614382937944, + "loss": 0.6224, "step": 216 }, { "epoch": 0.22517911975435004, - "grad_norm": 0.390625, - "learning_rate": 0.0002664235215534673, - "loss": 0.5228, + "grad_norm": 0.37109375, + "learning_rate": 0.0002921533403775853, + "loss": 0.6471, "step": 220 }, { "epoch": 0.2292732855680655, - "grad_norm": 0.408203125, - "learning_rate": 0.0002651846256693326, - "loss": 0.536, + "grad_norm": 0.3828125, + "learning_rate": 0.0002918392409265587, + "loss": 0.6583, "step": 224 }, { "epoch": 0.23336745138178097, - "grad_norm": 0.40234375, - "learning_rate": 0.0002639262781917771, - "loss": 0.5117, + "grad_norm": 0.390625, + "learning_rate": 0.00029151915320159747, + "loss": 0.6408, "step": 228 }, { "epoch": 0.2374616171954964, - "grad_norm": 0.408203125, - "learning_rate": 0.0002626486916219344, - "loss": 0.5132, + "grad_norm": 0.396484375, + "learning_rate": 0.0002911930907164017, + "loss": 0.6275, "step": 232 }, { "epoch": 0.24155578300921188, - "grad_norm": 0.39453125, - "learning_rate": 0.0002613520817099045, - "loss": 0.4914, + "grad_norm": 0.359375, + "learning_rate": 0.00029086106723691857, + "loss": 0.6083, "step": 236 }, { "epoch": 0.24564994882292732, - "grad_norm": 0.416015625, - "learning_rate": 0.0002600366674183196, - "loss": 0.4681, + "grad_norm": 0.37890625, + "learning_rate": 0.00029052309678076065, + "loss": 0.5966, "step": 240 }, { "epoch": 0.24974411463664278, - "grad_norm": 0.41015625, - "learning_rate": 0.0002587026708853674, - "loss": 0.5087, + "grad_norm": 0.3671875, + "learning_rate": 0.0002901791936166147, + "loss": 0.6294, "step": 244 }, { "epoch": 0.2538382804503582, - "grad_norm": 0.41796875, - "learning_rate": 0.00025735031738727753, - "loss": 0.5334, + "grad_norm": 0.390625, + "learning_rate": 0.0002898293722636386, + "loss": 0.647, "step": 248 }, { "epoch": 0.2579324462640737, - "grad_norm": 0.4765625, - "learning_rate": 0.0002559798353002785, - "loss": 0.531, + "grad_norm": 0.421875, + "learning_rate": 0.00028947364749084897, + "loss": 0.6532, "step": 252 }, { "epoch": 0.26202661207778916, - "grad_norm": 0.3828125, - "learning_rate": 0.0002545914560620313, - "loss": 0.4851, + "grad_norm": 0.38671875, + "learning_rate": 0.0002891120343164972, + "loss": 0.6059, "step": 256 }, { "epoch": 0.2661207778915046, - "grad_norm": 0.408203125, - "learning_rate": 0.00025318541413254587, - "loss": 0.5347, + "grad_norm": 0.39453125, + "learning_rate": 0.00028874454800743556, + "loss": 0.6545, "step": 260 }, { "epoch": 0.27021494370522003, - "grad_norm": 0.390625, - "learning_rate": 0.00025176194695458644, - "loss": 0.5297, + "grad_norm": 0.365234375, + "learning_rate": 0.00028837120407847286, + "loss": 0.6462, "step": 264 }, { "epoch": 0.2743091095189355, - "grad_norm": 0.408203125, - "learning_rate": 0.0002503212949135747, - "loss": 0.5131, + "grad_norm": 0.373046875, + "learning_rate": 0.000287992018291719, + "loss": 0.6337, "step": 268 }, { "epoch": 0.27840327533265097, - "grad_norm": 0.3984375, - "learning_rate": 0.0002488637012969945, - "loss": 0.5414, + "grad_norm": 0.365234375, + "learning_rate": 0.00028760700665591985, + "loss": 0.6431, "step": 272 }, { "epoch": 0.28249744114636643, - "grad_norm": 0.38671875, - "learning_rate": 0.00024738941225330727, - "loss": 0.5579, + "grad_norm": 0.353515625, + "learning_rate": 0.0002872161854257814, + "loss": 0.6797, "step": 276 }, { "epoch": 0.2865916069600819, - "grad_norm": 0.396484375, - "learning_rate": 0.0002458986767503845, - "loss": 0.4888, + "grad_norm": 0.369140625, + "learning_rate": 0.00028681957110128313, + "loss": 0.6191, "step": 280 }, { "epoch": 0.2906857727737973, - "grad_norm": 0.380859375, - "learning_rate": 0.00024439174653346325, - "loss": 0.4883, + "grad_norm": 0.384765625, + "learning_rate": 0.000286417180426982, + "loss": 0.6064, "step": 284 }, { "epoch": 0.2947799385875128, - "grad_norm": 0.384765625, - "learning_rate": 0.0002428688760826334, - "loss": 0.5249, + "grad_norm": 0.369140625, + "learning_rate": 0.0002860090303913048, + "loss": 0.6451, "step": 288 }, { "epoch": 0.29887410440122825, - "grad_norm": 0.404296875, - "learning_rate": 0.00024133032256986274, - "loss": 0.5179, + "grad_norm": 0.380859375, + "learning_rate": 0.00028559513822583153, + "loss": 0.6402, "step": 292 }, { "epoch": 0.3029682702149437, - "grad_norm": 0.451171875, - "learning_rate": 0.00023977634581556743, - "loss": 0.5349, - "step": 296 + "grad_norm": 0.408203125, + "learning_rate": 0.0002851755214045676, + "loss": 0.655, + "step": 296 }, { "epoch": 0.3070624360286592, - "grad_norm": 0.373046875, - "learning_rate": 0.00023820720824473555, - "loss": 0.5369, + "grad_norm": 0.349609375, + "learning_rate": 0.00028475019764320634, + "loss": 0.6627, "step": 300 }, { "epoch": 0.3111566018423746, - "grad_norm": 0.421875, - "learning_rate": 0.00023662317484261038, - "loss": 0.5333, + "grad_norm": 0.390625, + "learning_rate": 0.00028431918489838057, + "loss": 0.6654, "step": 304 }, { "epoch": 0.31525076765609006, - "grad_norm": 0.408203125, - "learning_rate": 0.00023502451310994138, - "loss": 0.4975, + "grad_norm": 0.365234375, + "learning_rate": 0.0002838825013669051, + "loss": 0.6193, "step": 308 }, { "epoch": 0.3193449334698055, - "grad_norm": 0.416015625, - "learning_rate": 0.00023341149301781076, - "loss": 0.52, + "grad_norm": 0.396484375, + "learning_rate": 0.000283440165485008, + "loss": 0.6411, "step": 312 }, { "epoch": 0.323439099283521, - "grad_norm": 0.3828125, - "learning_rate": 0.00023178438696204248, - "loss": 0.4807, + "grad_norm": 0.35546875, + "learning_rate": 0.00028299219592755264, + "loss": 0.5887, "step": 316 }, { "epoch": 0.32753326509723646, - "grad_norm": 0.38671875, - "learning_rate": 0.00023014346971720172, - "loss": 0.4658, + "grad_norm": 0.373046875, + "learning_rate": 0.000282538611607249, + "loss": 0.5907, "step": 320 }, { "epoch": 0.33162743091095187, - "grad_norm": 0.4140625, - "learning_rate": 0.00022848901839019325, - "loss": 0.5232, + "grad_norm": 0.369140625, + "learning_rate": 0.00028207943167385516, + "loss": 0.6408, "step": 324 }, { "epoch": 0.33572159672466734, - "grad_norm": 0.41015625, - "learning_rate": 0.00022682131237346514, - "loss": 0.4358, + "grad_norm": 0.4140625, + "learning_rate": 0.000281614675513369, + "loss": 0.5642, "step": 328 }, { "epoch": 0.3398157625383828, - "grad_norm": 0.427734375, - "learning_rate": 0.00022514063329782702, - "loss": 0.4954, + "grad_norm": 0.392578125, + "learning_rate": 0.0002811443627472098, + "loss": 0.6303, "step": 332 }, { "epoch": 0.34390992835209827, - "grad_norm": 0.40625, - "learning_rate": 0.00022344726498489009, - "loss": 0.5008, + "grad_norm": 0.369140625, + "learning_rate": 0.0002806685132313896, + "loss": 0.6267, "step": 336 }, { "epoch": 0.34800409416581374, - "grad_norm": 0.39453125, - "learning_rate": 0.00022174149339913745, - "loss": 0.5447, + "grad_norm": 0.35546875, + "learning_rate": 0.00028018714705567503, + "loss": 0.6681, "step": 340 }, { "epoch": 0.35209825997952915, - "grad_norm": 0.38671875, - "learning_rate": 0.0002200236065996322, - "loss": 0.5451, + "grad_norm": 0.3671875, + "learning_rate": 0.00027970028454273917, + "loss": 0.6606, "step": 344 }, { "epoch": 0.3561924257932446, - "grad_norm": 0.390625, - "learning_rate": 0.00021829389469137206, - "loss": 0.4629, + "grad_norm": 0.373046875, + "learning_rate": 0.0002792079462473035, + "loss": 0.6027, "step": 348 }, { "epoch": 0.3602865916069601, - "grad_norm": 0.3984375, - "learning_rate": 0.00021655264977629842, - "loss": 0.5172, + "grad_norm": 0.400390625, + "learning_rate": 0.0002787101529552702, + "loss": 0.6485, "step": 352 }, { "epoch": 0.36438075742067555, - "grad_norm": 0.373046875, - "learning_rate": 0.00021480016590396807, - "loss": 0.5204, + "grad_norm": 0.33984375, + "learning_rate": 0.0002782069256828445, + "loss": 0.6345, "step": 356 }, { "epoch": 0.368474923234391, - "grad_norm": 0.388671875, - "learning_rate": 0.00021303673902189636, - "loss": 0.4932, + "grad_norm": 0.38671875, + "learning_rate": 0.0002776982856756473, + "loss": 0.6211, "step": 360 }, { "epoch": 0.3725690890481064, - "grad_norm": 0.392578125, - "learning_rate": 0.00021126266692557917, - "loss": 0.4825, + "grad_norm": 0.384765625, + "learning_rate": 0.0002771842544078187, + "loss": 0.598, "step": 364 }, { "epoch": 0.3766632548618219, - "grad_norm": 0.392578125, - "learning_rate": 0.00020947824920820383, - "loss": 0.5429, + "grad_norm": 0.369140625, + "learning_rate": 0.0002766648535811105, + "loss": 0.6719, "step": 368 }, { "epoch": 0.38075742067553736, - "grad_norm": 0.3671875, - "learning_rate": 0.00020768378721005526, - "loss": 0.4995, + "grad_norm": 0.341796875, + "learning_rate": 0.000276140105123971, + "loss": 0.6142, "step": 372 }, { "epoch": 0.38485158648925283, - "grad_norm": 0.361328125, - "learning_rate": 0.00020587958396762815, - "loss": 0.4946, + "grad_norm": 0.365234375, + "learning_rate": 0.0002756100311906185, + "loss": 0.6187, "step": 376 }, { "epoch": 0.3889457523029683, - "grad_norm": 0.41796875, - "learning_rate": 0.0002040659441624519, - "loss": 0.4983, + "grad_norm": 0.365234375, + "learning_rate": 0.000275074654160106, + "loss": 0.6373, "step": 380 }, { "epoch": 0.3930399181166837, - "grad_norm": 0.41015625, - "learning_rate": 0.00020224317406963835, - "loss": 0.5102, + "grad_norm": 0.361328125, + "learning_rate": 0.00027453399663537707, + "loss": 0.6376, "step": 384 }, { "epoch": 0.3971340839303992, - "grad_norm": 0.37890625, - "learning_rate": 0.00020041158150615996, - "loss": 0.4997, + "grad_norm": 0.365234375, + "learning_rate": 0.0002739880814423106, + "loss": 0.6187, "step": 388 }, { "epoch": 0.40122824974411464, - "grad_norm": 0.369140625, - "learning_rate": 0.0001985714757788677, - "loss": 0.5261, + "grad_norm": 0.357421875, + "learning_rate": 0.0002734369316287578, + "loss": 0.648, "step": 392 }, { "epoch": 0.4053224155578301, - "grad_norm": 0.396484375, - "learning_rate": 0.00019672316763225773, - "loss": 0.5129, + "grad_norm": 0.365234375, + "learning_rate": 0.0002728805704635691, + "loss": 0.6342, "step": 396 }, { "epoch": 0.4094165813715456, - "grad_norm": 0.37890625, - "learning_rate": 0.0001948669691959947, - "loss": 0.47, + "grad_norm": 0.3671875, + "learning_rate": 0.0002723190214356113, + "loss": 0.584, "step": 400 }, { "epoch": 0.413510747185261, - "grad_norm": 0.412109375, - "learning_rate": 0.00019300319393220146, - "loss": 0.5295, + "grad_norm": 0.416015625, + "learning_rate": 0.0002717523082527766, + "loss": 0.6497, "step": 404 }, { "epoch": 0.41760491299897645, - "grad_norm": 0.3828125, - "learning_rate": 0.00019113215658252394, - "loss": 0.4823, + "grad_norm": 0.365234375, + "learning_rate": 0.00027118045484098095, + "loss": 0.6038, "step": 408 }, { "epoch": 0.4216990788126919, - "grad_norm": 0.390625, - "learning_rate": 0.00018925417311497944, - "loss": 0.5365, + "grad_norm": 0.37109375, + "learning_rate": 0.0002706034853431546, + "loss": 0.6665, "step": 412 }, { "epoch": 0.4257932446264074, - "grad_norm": 0.357421875, - "learning_rate": 0.00018736956067059827, - "loss": 0.5187, + "grad_norm": 0.341796875, + "learning_rate": 0.0002700214241182223, + "loss": 0.6422, "step": 416 }, { "epoch": 0.42988741044012285, - "grad_norm": 0.37890625, - "learning_rate": 0.00018547863750986715, - "loss": 0.4677, + "grad_norm": 0.36328125, + "learning_rate": 0.00026943429574007515, + "loss": 0.5954, "step": 420 }, { "epoch": 0.43398157625383826, - "grad_norm": 0.416015625, - "learning_rate": 0.0001835817229589834, - "loss": 0.4611, + "grad_norm": 0.388671875, + "learning_rate": 0.0002688421249965331, + "loss": 0.5899, "step": 424 }, { "epoch": 0.43807574206755373, - "grad_norm": 0.396484375, - "learning_rate": 0.00018167913735592955, - "loss": 0.4595, + "grad_norm": 0.388671875, + "learning_rate": 0.0002682449368882984, + "loss": 0.5858, "step": 428 }, { "epoch": 0.4421699078812692, - "grad_norm": 0.41796875, - "learning_rate": 0.0001797712019963766, - "loss": 0.5063, + "grad_norm": 0.376953125, + "learning_rate": 0.00026764275662790005, + "loss": 0.6247, "step": 432 }, { "epoch": 0.44626407369498466, - "grad_norm": 0.39453125, - "learning_rate": 0.00017785823907942602, - "loss": 0.4639, + "grad_norm": 0.357421875, + "learning_rate": 0.00026703560963862956, + "loss": 0.5961, "step": 436 }, { "epoch": 0.4503582395087001, - "grad_norm": 0.416015625, - "learning_rate": 0.00017594057165319876, - "loss": 0.5193, + "grad_norm": 0.390625, + "learning_rate": 0.0002664235215534673, + "loss": 0.6428, "step": 440 }, { "epoch": 0.45445240532241554, - "grad_norm": 0.408203125, - "learning_rate": 0.00017401852356028124, - "loss": 0.5114, + "grad_norm": 0.376953125, + "learning_rate": 0.00026580651821400057, + "loss": 0.6387, "step": 444 }, { "epoch": 0.458546571136131, - "grad_norm": 0.41015625, - "learning_rate": 0.00017209241938303697, - "loss": 0.4826, + "grad_norm": 0.3828125, + "learning_rate": 0.0002651846256693326, + "loss": 0.6024, "step": 448 }, { "epoch": 0.4626407369498465, - "grad_norm": 0.396484375, - "learning_rate": 0.00017016258438879323, - "loss": 0.5161, + "grad_norm": 0.369140625, + "learning_rate": 0.00026455787017498253, + "loss": 0.6385, "step": 452 }, { "epoch": 0.46673490276356194, - "grad_norm": 0.390625, - "learning_rate": 0.00016822934447491232, - "loss": 0.4837, + "grad_norm": 0.365234375, + "learning_rate": 0.0002639262781917771, + "loss": 0.6228, "step": 456 }, { "epoch": 0.47082906857727735, - "grad_norm": 0.38671875, - "learning_rate": 0.0001662930261137561, - "loss": 0.5136, + "grad_norm": 0.373046875, + "learning_rate": 0.0002632898763847338, + "loss": 0.6307, "step": 460 }, { "epoch": 0.4749232343909928, - "grad_norm": 0.416015625, - "learning_rate": 0.00016435395629755346, - "loss": 0.5118, + "grad_norm": 0.392578125, + "learning_rate": 0.0002626486916219344, + "loss": 0.6465, "step": 464 }, { "epoch": 0.4790174002047083, "grad_norm": 0.384765625, - "learning_rate": 0.0001624124624831805, - "loss": 0.4945, + "learning_rate": 0.0002620027509733914, + "loss": 0.6349, "step": 468 }, { "epoch": 0.48311156601842375, - "grad_norm": 0.390625, - "learning_rate": 0.00016046887253686135, - "loss": 0.4886, + "grad_norm": 0.3671875, + "learning_rate": 0.0002613520817099045, + "loss": 0.6223, "step": 472 }, { "epoch": 0.4872057318321392, - "grad_norm": 0.40625, - "learning_rate": 0.00015852351467880076, - "loss": 0.4682, + "grad_norm": 0.373046875, + "learning_rate": 0.0002606967113019098, + "loss": 0.605, "step": 476 }, { "epoch": 0.49129989764585463, - "grad_norm": 0.404296875, - "learning_rate": 0.00015657671742775613, - "loss": 0.4825, + "grad_norm": 0.40234375, + "learning_rate": 0.0002600366674183196, + "loss": 0.6169, "step": 480 }, { "epoch": 0.4953940634595701, - "grad_norm": 0.3828125, - "learning_rate": 0.00015462880954555998, - "loss": 0.4982, + "grad_norm": 0.3515625, + "learning_rate": 0.0002593719779253548, + "loss": 0.6289, "step": 484 }, { "epoch": 0.49948822927328557, - "grad_norm": 0.37890625, - "learning_rate": 0.00015268011998160048, - "loss": 0.529, + "grad_norm": 0.380859375, + "learning_rate": 0.0002587026708853674, + "loss": 0.6718, "step": 488 }, { "epoch": 0.503582395087001, - "grad_norm": 0.404296875, - "learning_rate": 0.000150730977817271, - "loss": 0.4266, + "grad_norm": 0.396484375, + "learning_rate": 0.0002580287745556572, + "loss": 0.5592, "step": 492 }, { "epoch": 0.5076765609007164, - "grad_norm": 0.3828125, - "learning_rate": 0.00014878171221039676, - "loss": 0.4789, + "grad_norm": 0.392578125, + "learning_rate": 0.00025735031738727753, + "loss": 0.6118, "step": 496 }, { "epoch": 0.5117707267144319, - "grad_norm": 0.392578125, - "learning_rate": 0.00014683265233964937, - "loss": 0.455, + "grad_norm": 0.380859375, + "learning_rate": 0.00025666732802383463, + "loss": 0.5798, "step": 500 }, { "epoch": 0.5158648925281474, - "grad_norm": 0.36328125, - "learning_rate": 0.00014488412734895692, - "loss": 0.5067, + "grad_norm": 0.341796875, + "learning_rate": 0.0002559798353002785, + "loss": 0.6488, "step": 504 }, { "epoch": 0.5199590583418628, - "grad_norm": 0.39453125, - "learning_rate": 0.0001429364662919208, - "loss": 0.5023, + "grad_norm": 0.359375, + "learning_rate": 0.0002552878682416851, + "loss": 0.6363, "step": 508 }, { "epoch": 0.5240532241555783, - "grad_norm": 0.3515625, - "learning_rate": 0.00014098999807624695, - "loss": 0.4871, + "grad_norm": 0.34375, + "learning_rate": 0.0002545914560620313, + "loss": 0.6246, "step": 512 }, { "epoch": 0.5281473899692938, - "grad_norm": 0.412109375, - "learning_rate": 0.00013904505140820264, - "loss": 0.4886, + "grad_norm": 0.37109375, + "learning_rate": 0.00025389062816296153, + "loss": 0.6277, "step": 516 }, { "epoch": 0.5322415557830092, - "grad_norm": 0.3828125, - "learning_rate": 0.00013710195473710636, - "loss": 0.4495, + "grad_norm": 0.357421875, + "learning_rate": 0.00025318541413254587, + "loss": 0.5822, "step": 520 }, { "epoch": 0.5363357215967247, - "grad_norm": 0.396484375, - "learning_rate": 0.00013516103619986192, - "loss": 0.449, + "grad_norm": 0.36328125, + "learning_rate": 0.0002524758437440318, + "loss": 0.581, "step": 524 }, { "epoch": 0.5404298874104401, - "grad_norm": 0.353515625, - "learning_rate": 0.00013322262356554456, - "loss": 0.4998, + "grad_norm": 0.345703125, + "learning_rate": 0.00025176194695458644, + "loss": 0.6365, "step": 528 }, { "epoch": 0.5445240532241555, - "grad_norm": 0.37890625, - "learning_rate": 0.00013128704418004995, - "loss": 0.4705, + "grad_norm": 0.357421875, + "learning_rate": 0.0002510437539040324, + "loss": 0.5974, "step": 532 }, { "epoch": 0.548618219037871, - "grad_norm": 0.376953125, - "learning_rate": 0.00012935462491081391, - "loss": 0.5173, + "grad_norm": 0.3671875, + "learning_rate": 0.0002503212949135747, + "loss": 0.646, "step": 536 }, { "epoch": 0.5527123848515865, - "grad_norm": 0.37890625, - "learning_rate": 0.00012742569209161334, - "loss": 0.5209, + "grad_norm": 0.373046875, + "learning_rate": 0.00024959460048452117, + "loss": 0.6508, "step": 540 }, { "epoch": 0.5568065506653019, - "grad_norm": 0.384765625, - "learning_rate": 0.0001255005714674573, - "loss": 0.4485, + "grad_norm": 0.357421875, + "learning_rate": 0.0002488637012969945, + "loss": 0.5838, "step": 544 }, { "epoch": 0.5609007164790174, - "grad_norm": 0.3515625, - "learning_rate": 0.00012357958813957748, - "loss": 0.4645, + "grad_norm": 0.337890625, + "learning_rate": 0.0002481286282086368, + "loss": 0.597, "step": 548 }, { "epoch": 0.5649948822927329, - "grad_norm": 0.42578125, - "learning_rate": 0.00012166306651052708, - "loss": 0.514, + "grad_norm": 0.380859375, + "learning_rate": 0.00024738941225330727, + "loss": 0.6617, "step": 552 }, { "epoch": 0.5690890481064483, - "grad_norm": 0.380859375, - "learning_rate": 0.00011975133022939816, - "loss": 0.4585, + "grad_norm": 0.3671875, + "learning_rate": 0.00024664608463977164, + "loss": 0.5968, "step": 556 }, { "epoch": 0.5731832139201638, - "grad_norm": 0.373046875, - "learning_rate": 0.00011784470213716574, - "loss": 0.4645, + "grad_norm": 0.3359375, + "learning_rate": 0.0002458986767503845, + "loss": 0.5837, "step": 560 }, { "epoch": 0.5772773797338793, - "grad_norm": 0.392578125, - "learning_rate": 0.00011594350421216891, - "loss": 0.4934, + "grad_norm": 0.3828125, + "learning_rate": 0.00024514722013976485, + "loss": 0.6175, "step": 564 }, { "epoch": 0.5813715455475946, - "grad_norm": 0.37109375, - "learning_rate": 0.00011404805751573712, - "loss": 0.4493, + "grad_norm": 0.4140625, + "learning_rate": 0.00024439174653346325, + "loss": 0.592, "step": 568 }, { "epoch": 0.5854657113613101, - "grad_norm": 0.396484375, - "learning_rate": 0.00011215868213797156, - "loss": 0.5022, + "grad_norm": 0.3515625, + "learning_rate": 0.00024363228782662308, + "loss": 0.6434, "step": 572 }, { "epoch": 0.5895598771750256, - "grad_norm": 0.400390625, - "learning_rate": 0.00011027569714369059, - "loss": 0.5136, + "grad_norm": 0.3671875, + "learning_rate": 0.0002428688760826334, + "loss": 0.6441, "step": 576 }, { "epoch": 0.593654042988741, - "grad_norm": 0.384765625, - "learning_rate": 0.00010839942051854829, - "loss": 0.4565, + "grad_norm": 0.3671875, + "learning_rate": 0.00024210154353177562, + "loss": 0.5881, "step": 580 }, { "epoch": 0.5977482088024565, - "grad_norm": 0.380859375, - "learning_rate": 0.000106530169115335, - "loss": 0.5131, + "grad_norm": 0.349609375, + "learning_rate": 0.00024133032256986274, + "loss": 0.6454, "step": 584 }, { "epoch": 0.601842374616172, - "grad_norm": 0.35546875, - "learning_rate": 0.00010466825860046967, - "loss": 0.4535, + "grad_norm": 0.3203125, + "learning_rate": 0.00024055524575687136, + "loss": 0.5999, "step": 588 }, { "epoch": 0.6059365404298874, - "grad_norm": 0.388671875, - "learning_rate": 0.00010281400340069205, - "loss": 0.4771, + "grad_norm": 0.345703125, + "learning_rate": 0.00023977634581556743, + "loss": 0.6028, "step": 592 }, { "epoch": 0.6100307062436029, - "grad_norm": 0.388671875, - "learning_rate": 0.00010096771664996456, - "loss": 0.4581, + "grad_norm": 0.376953125, + "learning_rate": 0.00023899365563012455, + "loss": 0.5945, "step": 596 }, { "epoch": 0.6141248720573184, - "grad_norm": 0.412109375, - "learning_rate": 9.912971013659232e-05, - "loss": 0.4728, + "grad_norm": 0.38671875, + "learning_rate": 0.00023820720824473555, + "loss": 0.6106, "step": 600 }, { "epoch": 0.6182190378710338, - "grad_norm": 0.36328125, - "learning_rate": 9.730029425057045e-05, - "loss": 0.5323, + "grad_norm": 0.34765625, + "learning_rate": 0.00023741703686221767, + "loss": 0.6626, "step": 604 }, { "epoch": 0.6223132036847492, - "grad_norm": 0.400390625, - "learning_rate": 9.547977793116762e-05, - "loss": 0.474, + "grad_norm": 0.361328125, + "learning_rate": 0.00023662317484261038, + "loss": 0.6107, "step": 608 }, { "epoch": 0.6264073694984647, - "grad_norm": 0.373046875, - "learning_rate": 9.366846861475435e-05, - "loss": 0.4372, + "grad_norm": 0.34765625, + "learning_rate": 0.00023582565570176738, + "loss": 0.5691, "step": 612 }, { "epoch": 0.6305015353121801, - "grad_norm": 0.365234375, - "learning_rate": 9.186667218288549e-05, - "loss": 0.5153, + "grad_norm": 0.330078125, + "learning_rate": 0.00023502451310994138, + "loss": 0.648, "step": 616 }, { "epoch": 0.6345957011258956, - "grad_norm": 0.376953125, - "learning_rate": 9.007469291064467e-05, - "loss": 0.4849, + "grad_norm": 0.359375, + "learning_rate": 0.0002342197808903626, + "loss": 0.6255, "step": 620 }, { "epoch": 0.638689866939611, - "grad_norm": 0.37890625, - "learning_rate": 8.829283341526067e-05, - "loss": 0.506, + "grad_norm": 0.33984375, + "learning_rate": 0.00023341149301781076, + "loss": 0.6423, "step": 624 }, { "epoch": 0.6427840327533265, - "grad_norm": 0.375, - "learning_rate": 8.652139460500359e-05, - "loss": 0.4905, + "grad_norm": 0.353515625, + "learning_rate": 0.00023259968361718093, + "loss": 0.6358, "step": 628 }, { "epoch": 0.646878198567042, - "grad_norm": 0.37109375, - "learning_rate": 8.47606756283691e-05, - "loss": 0.4814, + "grad_norm": 0.328125, + "learning_rate": 0.00023178438696204248, + "loss": 0.6217, "step": 632 }, { "epoch": 0.6509723643807575, - "grad_norm": 0.3984375, - "learning_rate": 8.301097382356067e-05, - "loss": 0.5113, + "grad_norm": 0.380859375, + "learning_rate": 0.0002309656374731923, + "loss": 0.6543, "step": 636 }, { "epoch": 0.6550665301944729, - "grad_norm": 0.3671875, - "learning_rate": 8.127258466827704e-05, - "loss": 0.5092, + "grad_norm": 0.326171875, + "learning_rate": 0.00023014346971720172, + "loss": 0.6438, "step": 640 }, { "epoch": 0.6591606960081884, - "grad_norm": 0.36328125, - "learning_rate": 7.95458017298138e-05, - "loss": 0.4926, + "grad_norm": 0.337890625, + "learning_rate": 0.00022931791840495683, + "loss": 0.6364, "step": 644 }, { "epoch": 0.6632548618219037, - "grad_norm": 0.392578125, - "learning_rate": 7.783091661548789e-05, - "loss": 0.4306, + "grad_norm": 0.375, + "learning_rate": 0.00022848901839019325, + "loss": 0.5709, "step": 648 }, { "epoch": 0.6673490276356192, - "grad_norm": 0.384765625, - "learning_rate": 7.612821892339284e-05, - "loss": 0.4902, + "grad_norm": 0.3671875, + "learning_rate": 0.00022765680466802467, + "loss": 0.6298, "step": 652 }, { "epoch": 0.6714431934493347, - "grad_norm": 0.38671875, - "learning_rate": 7.443799619349374e-05, - "loss": 0.4824, + "grad_norm": 0.380859375, + "learning_rate": 0.00022682131237346514, + "loss": 0.6143, "step": 656 }, { "epoch": 0.6755373592630501, - "grad_norm": 0.375, - "learning_rate": 7.276053385906896e-05, - "loss": 0.4947, + "grad_norm": 0.326171875, + "learning_rate": 0.00022598257677994616, + "loss": 0.64, "step": 660 }, { "epoch": 0.6796315250767656, - "grad_norm": 0.4140625, - "learning_rate": 7.109611519850845e-05, - "loss": 0.5158, + "grad_norm": 0.365234375, + "learning_rate": 0.00022514063329782702, + "loss": 0.6509, "step": 664 }, { "epoch": 0.6837256908904811, - "grad_norm": 0.416015625, - "learning_rate": 6.944502128747558e-05, - "loss": 0.502, + "grad_norm": 0.384765625, + "learning_rate": 0.0002242955174729001, + "loss": 0.6393, "step": 668 }, { "epoch": 0.6878198567041965, - "grad_norm": 0.404296875, - "learning_rate": 6.780753095144086e-05, - "loss": 0.5085, + "grad_norm": 0.357421875, + "learning_rate": 0.00022344726498489009, + "loss": 0.6492, "step": 672 }, { "epoch": 0.691914022517912, - "grad_norm": 0.380859375, - "learning_rate": 6.618392071859612e-05, - "loss": 0.5132, + "grad_norm": 0.384765625, + "learning_rate": 0.0002225959116459477, + "loss": 0.6519, "step": 676 }, { "epoch": 0.6960081883316275, - "grad_norm": 0.384765625, - "learning_rate": 6.457446477315588e-05, - "loss": 0.4436, + "grad_norm": 0.373046875, + "learning_rate": 0.00022174149339913745, + "loss": 0.5808, "step": 680 }, { "epoch": 0.7001023541453428, - "grad_norm": 0.3515625, - "learning_rate": 6.297943490905531e-05, - "loss": 0.5225, + "grad_norm": 0.341796875, + "learning_rate": 0.0002208840463169207, + "loss": 0.6531, "step": 684 }, { "epoch": 0.7041965199590583, - "grad_norm": 0.416015625, - "learning_rate": 6.139910048405134e-05, - "loss": 0.5289, + "grad_norm": 0.37109375, + "learning_rate": 0.0002200236065996322, + "loss": 0.677, "step": 688 }, { "epoch": 0.7082906857727738, - "grad_norm": 0.373046875, - "learning_rate": 5.9833728374235615e-05, - "loss": 0.4555, + "grad_norm": 0.34765625, + "learning_rate": 0.0002191602105739521, + "loss": 0.5903, "step": 692 }, { "epoch": 0.7123848515864892, - "grad_norm": 0.396484375, - "learning_rate": 5.8283582928965986e-05, - "loss": 0.4946, + "grad_norm": 0.40625, + "learning_rate": 0.00021829389469137206, + "loss": 0.6429, "step": 696 }, { "epoch": 0.7164790174002047, - "grad_norm": 0.40625, - "learning_rate": 5.674892592622502e-05, - "loss": 0.4962, + "grad_norm": 0.34375, + "learning_rate": 0.0002174246955266565, + "loss": 0.6316, "step": 700 }, { "epoch": 0.7205731832139202, - "grad_norm": 0.41015625, - "learning_rate": 5.5230016528413076e-05, - "loss": 0.4773, + "grad_norm": 0.3671875, + "learning_rate": 0.00021655264977629842, + "loss": 0.6191, "step": 704 }, { "epoch": 0.7246673490276356, - "grad_norm": 0.404296875, - "learning_rate": 5.37271112385823e-05, - "loss": 0.4523, + "grad_norm": 0.3828125, + "learning_rate": 0.00021567779425696993, + "loss": 0.5909, "step": 708 }, { "epoch": 0.7287615148413511, - "grad_norm": 0.369140625, - "learning_rate": 5.2240463857120365e-05, - "loss": 0.4379, + "grad_norm": 0.353515625, + "learning_rate": 0.00021480016590396807, + "loss": 0.5745, "step": 712 }, { "epoch": 0.7328556806550666, - "grad_norm": 0.392578125, - "learning_rate": 5.0770325438890304e-05, - "loss": 0.4866, + "grad_norm": 0.359375, + "learning_rate": 0.0002139198017696556, + "loss": 0.627, "step": 716 }, { "epoch": 0.736949846468782, - "grad_norm": 0.3828125, - "learning_rate": 4.9316944250834126e-05, - "loss": 0.4499, + "grad_norm": 0.375, + "learning_rate": 0.00021303673902189636, + "loss": 0.5907, "step": 720 }, { "epoch": 0.7410440122824974, - "grad_norm": 0.37109375, - "learning_rate": 4.788056573004726e-05, - "loss": 0.5186, + "grad_norm": 0.34375, + "learning_rate": 0.00021215101494248618, + "loss": 0.6565, "step": 724 }, { "epoch": 0.7451381780962129, - "grad_norm": 0.39453125, - "learning_rate": 4.646143244233068e-05, - "loss": 0.4881, + "grad_norm": 0.37890625, + "learning_rate": 0.00021126266692557917, + "loss": 0.6313, "step": 728 }, { "epoch": 0.7492323439099283, - "grad_norm": 0.35546875, - "learning_rate": 4.505978404122805e-05, - "loss": 0.4754, + "grad_norm": 0.337890625, + "learning_rate": 0.00021037173247610863, + "loss": 0.6126, "step": 732 }, { "epoch": 0.7533265097236438, - "grad_norm": 0.3671875, - "learning_rate": 4.367585722755474e-05, - "loss": 0.4925, + "grad_norm": 0.34765625, + "learning_rate": 0.00020947824920820383, + "loss": 0.6302, "step": 736 }, { "epoch": 0.7574206755373593, - "grad_norm": 0.390625, - "learning_rate": 4.23098857094255e-05, - "loss": 0.536, + "grad_norm": 0.3828125, + "learning_rate": 0.00020858225484360186, + "loss": 0.6709, "step": 740 }, { "epoch": 0.7615148413510747, - "grad_norm": 0.3828125, - "learning_rate": 4.0962100162787195e-05, - "loss": 0.4847, + "grad_norm": 0.36328125, + "learning_rate": 0.00020768378721005526, + "loss": 0.6173, "step": 744 }, { "epoch": 0.7656090071647902, - "grad_norm": 0.396484375, - "learning_rate": 3.9632728192463986e-05, - "loss": 0.4645, + "grad_norm": 0.357421875, + "learning_rate": 0.00020678288423973476, + "loss": 0.5911, "step": 748 }, { "epoch": 0.7697031729785057, - "grad_norm": 0.408203125, - "learning_rate": 3.8321994293720886e-05, - "loss": 0.4737, + "grad_norm": 0.396484375, + "learning_rate": 0.00020587958396762815, + "loss": 0.6153, "step": 752 }, { "epoch": 0.7737973387922211, - "grad_norm": 0.396484375, - "learning_rate": 3.703011981435276e-05, - "loss": 0.4454, + "grad_norm": 0.375, + "learning_rate": 0.00020497392452993395, + "loss": 0.5763, "step": 756 }, { "epoch": 0.7778915046059366, - "grad_norm": 0.396484375, - "learning_rate": 3.575732291730427e-05, - "loss": 0.4476, + "grad_norm": 0.34375, + "learning_rate": 0.0002040659441624519, + "loss": 0.5981, "step": 760 }, { "epoch": 0.781985670419652, - "grad_norm": 0.388671875, - "learning_rate": 3.450381854382825e-05, - "loss": 0.4757, + "grad_norm": 0.35546875, + "learning_rate": 0.00020315568119896846, + "loss": 0.6124, "step": 764 }, { "epoch": 0.7860798362333674, - "grad_norm": 0.38671875, - "learning_rate": 3.3269818377187804e-05, - "loss": 0.4888, + "grad_norm": 0.35546875, + "learning_rate": 0.00020224317406963835, + "loss": 0.6245, "step": 768 }, { "epoch": 0.7901740020470829, - "grad_norm": 0.369140625, - "learning_rate": 3.2055530806908794e-05, - "loss": 0.4818, + "grad_norm": 0.361328125, + "learning_rate": 0.00020132846129936223, + "loss": 0.6093, "step": 772 }, { "epoch": 0.7942681678607983, - "grad_norm": 0.37890625, - "learning_rate": 3.0861160893588323e-05, - "loss": 0.4803, + "grad_norm": 0.359375, + "learning_rate": 0.00020041158150615996, + "loss": 0.6212, "step": 776 }, { "epoch": 0.7983623336745138, - "grad_norm": 0.39453125, - "learning_rate": 2.9686910334265367e-05, - "loss": 0.488, + "grad_norm": 0.376953125, + "learning_rate": 0.00019949257339954056, + "loss": 0.6338, "step": 780 }, { "epoch": 0.8024564994882293, - "grad_norm": 0.35546875, - "learning_rate": 2.8532977428359882e-05, - "loss": 0.4819, + "grad_norm": 0.33203125, + "learning_rate": 0.0001985714757788677, + "loss": 0.6196, "step": 784 }, { "epoch": 0.8065506653019447, - "grad_norm": 0.357421875, - "learning_rate": 2.7399557044185216e-05, - "loss": 0.4281, + "grad_norm": 0.337890625, + "learning_rate": 0.00019764832753172172, + "loss": 0.5654, "step": 788 }, { "epoch": 0.8106448311156602, - "grad_norm": 0.380859375, - "learning_rate": 2.6286840586039965e-05, - "loss": 0.4555, + "grad_norm": 0.47265625, + "learning_rate": 0.00019672316763225773, + "loss": 0.5876, "step": 792 }, { "epoch": 0.8147389969293757, - "grad_norm": 0.390625, - "learning_rate": 2.5195015961885017e-05, - "loss": 0.454, + "grad_norm": 0.369140625, + "learning_rate": 0.0001957960351395604, + "loss": 0.5951, "step": 796 }, { "epoch": 0.8188331627430911, - "grad_norm": 0.392578125, - "learning_rate": 2.4124267551610883e-05, - "loss": 0.4713, + "grad_norm": 0.361328125, + "learning_rate": 0.0001948669691959947, + "loss": 0.5991, "step": 800 }, { "epoch": 0.8229273285568065, - "grad_norm": 0.36328125, - "learning_rate": 2.307477617590108e-05, - "loss": 0.4524, + "grad_norm": 0.353515625, + "learning_rate": 0.0001939360090255535, + "loss": 0.6002, "step": 804 }, { "epoch": 0.827021494370522, - "grad_norm": 0.37890625, - "learning_rate": 2.2046719065696077e-05, - "loss": 0.4492, + "grad_norm": 0.357421875, + "learning_rate": 0.00019300319393220146, + "loss": 0.5882, "step": 808 }, { "epoch": 0.8311156601842374, - "grad_norm": 0.373046875, - "learning_rate": 2.1040269832263895e-05, - "loss": 0.4976, + "grad_norm": 0.365234375, + "learning_rate": 0.00019206856329821595, + "loss": 0.6324, "step": 812 }, { "epoch": 0.8352098259979529, - "grad_norm": 0.380859375, - "learning_rate": 2.0055598437881786e-05, - "loss": 0.4803, + "grad_norm": 0.353515625, + "learning_rate": 0.00019113215658252394, + "loss": 0.608, "step": 816 }, { "epoch": 0.8393039918116684, - "grad_norm": 0.404296875, - "learning_rate": 1.9092871167134304e-05, - "loss": 0.5208, + "grad_norm": 0.37890625, + "learning_rate": 0.0001901940133190365, + "loss": 0.659, "step": 820 }, { "epoch": 0.8433981576253838, - "grad_norm": 0.38671875, - "learning_rate": 1.8152250598832045e-05, - "loss": 0.5138, + "grad_norm": 0.34765625, + "learning_rate": 0.00018925417311497944, + "loss": 0.641, "step": 824 }, { "epoch": 0.8474923234390993, - "grad_norm": 0.388671875, - "learning_rate": 1.7233895578556633e-05, - "loss": 0.4792, + "grad_norm": 0.33984375, + "learning_rate": 0.00018831267564922135, + "loss": 0.6171, "step": 828 }, { "epoch": 0.8515864892528148, - "grad_norm": 0.36328125, - "learning_rate": 1.6337961191835868e-05, - "loss": 0.4711, + "grad_norm": 0.326171875, + "learning_rate": 0.00018736956067059827, + "loss": 0.6022, "step": 832 }, { "epoch": 0.8556806550665302, - "grad_norm": 0.384765625, - "learning_rate": 1.5464598737954086e-05, - "loss": 0.4889, + "grad_norm": 0.34765625, + "learning_rate": 0.00018642486799623563, + "loss": 0.6303, "step": 836 }, { "epoch": 0.8597748208802457, - "grad_norm": 0.376953125, - "learning_rate": 1.4613955704401541e-05, - "loss": 0.5316, + "grad_norm": 0.3515625, + "learning_rate": 0.00018547863750986715, + "loss": 0.6694, "step": 840 }, { "epoch": 0.8638689866939611, - "grad_norm": 0.36328125, - "learning_rate": 1.3786175741967931e-05, - "loss": 0.4474, + "grad_norm": 0.3125, + "learning_rate": 0.000184530909160151, + "loss": 0.5785, "step": 844 }, { "epoch": 0.8679631525076765, - "grad_norm": 0.392578125, - "learning_rate": 1.298139864048348e-05, - "loss": 0.4909, + "grad_norm": 0.373046875, + "learning_rate": 0.0001835817229589834, + "loss": 0.6293, "step": 848 }, { "epoch": 0.872057318321392, - "grad_norm": 0.37890625, - "learning_rate": 1.2199760305212302e-05, - "loss": 0.4603, + "grad_norm": 0.349609375, + "learning_rate": 0.00018263111897980907, + "loss": 0.6031, "step": 852 }, { "epoch": 0.8761514841351075, - "grad_norm": 0.365234375, - "learning_rate": 1.1441392733901628e-05, - "loss": 0.4541, + "grad_norm": 0.326171875, + "learning_rate": 0.00018167913735592955, + "loss": 0.5936, "step": 856 }, { "epoch": 0.8802456499488229, - "grad_norm": 0.40234375, - "learning_rate": 1.0706423994490859e-05, - "loss": 0.4797, + "grad_norm": 0.375, + "learning_rate": 0.00018072581827880885, + "loss": 0.6135, "step": 860 }, { "epoch": 0.8843398157625384, - "grad_norm": 0.373046875, - "learning_rate": 9.994978203484305e-06, - "loss": 0.4451, + "grad_norm": 0.349609375, + "learning_rate": 0.0001797712019963766, + "loss": 0.5845, "step": 864 }, { "epoch": 0.8884339815762539, - "grad_norm": 0.390625, - "learning_rate": 9.307175504991427e-06, - "loss": 0.4595, + "grad_norm": 0.349609375, + "learning_rate": 0.00017881532881132878, + "loss": 0.5956, "step": 868 }, { "epoch": 0.8925281473899693, - "grad_norm": 0.408203125, - "learning_rate": 8.643132050437518e-06, - "loss": 0.5007, + "grad_norm": 0.373046875, + "learning_rate": 0.00017785823907942602, + "loss": 0.6384, "step": 872 }, { "epoch": 0.8966223132036848, - "grad_norm": 0.392578125, - "learning_rate": 8.00295997894893e-06, - "loss": 0.475, + "grad_norm": 0.3671875, + "learning_rate": 0.00017689997320779037, + "loss": 0.6084, "step": 876 }, { "epoch": 0.9007164790174002, - "grad_norm": 0.388671875, - "learning_rate": 7.386767398415738e-06, - "loss": 0.4688, + "grad_norm": 0.390625, + "learning_rate": 0.00017594057165319876, + "loss": 0.6023, "step": 880 }, { "epoch": 0.9048106448311156, - "grad_norm": 0.388671875, - "learning_rate": 6.794658367235356e-06, - "loss": 0.4836, + "grad_norm": 0.365234375, + "learning_rate": 0.00017498007492037536, + "loss": 0.6271, "step": 884 }, { "epoch": 0.9089048106448311, - "grad_norm": 0.37109375, - "learning_rate": 6.226732876739615e-06, - "loss": 0.4641, + "grad_norm": 0.3515625, + "learning_rate": 0.00017401852356028124, + "loss": 0.6071, "step": 888 }, { "epoch": 0.9129989764585466, - "grad_norm": 0.35546875, - "learning_rate": 5.68308683430917e-06, - "loss": 0.4521, + "grad_norm": 0.341796875, + "learning_rate": 0.00017305595816840267, + "loss": 0.5881, "step": 892 }, { "epoch": 0.917093142272262, - "grad_norm": 0.373046875, - "learning_rate": 5.163812047177102e-06, - "loss": 0.4702, + "grad_norm": 0.359375, + "learning_rate": 0.00017209241938303697, + "loss": 0.6022, "step": 896 }, { "epoch": 0.9211873080859775, - "grad_norm": 0.388671875, - "learning_rate": 4.668996206925357e-06, - "loss": 0.4587, + "grad_norm": 0.3671875, + "learning_rate": 0.00017112794788357686, + "loss": 0.5948, "step": 900 }, { "epoch": 0.925281473899693, - "grad_norm": 0.375, - "learning_rate": 4.198722874675742e-06, - "loss": 0.4201, + "grad_norm": 0.345703125, + "learning_rate": 0.00017016258438879323, + "loss": 0.5529, "step": 904 }, { "epoch": 0.9293756397134084, - "grad_norm": 0.376953125, - "learning_rate": 3.753071466978924e-06, - "loss": 0.4947, + "grad_norm": 0.33984375, + "learning_rate": 0.00016919636965511572, + "loss": 0.61, "step": 908 }, { "epoch": 0.9334698055271239, - "grad_norm": 0.37109375, - "learning_rate": 3.3321172424029318e-06, - "loss": 0.4874, + "grad_norm": 0.345703125, + "learning_rate": 0.00016822934447491232, + "loss": 0.6209, "step": 912 }, { "epoch": 0.9375639713408394, - "grad_norm": 0.419921875, - "learning_rate": 2.9359312888241005e-06, - "loss": 0.461, + "grad_norm": 0.396484375, + "learning_rate": 0.000167261549674767, + "loss": 0.5949, "step": 916 }, { "epoch": 0.9416581371545547, - "grad_norm": 0.365234375, - "learning_rate": 2.564580511422226e-06, - "loss": 0.4576, + "grad_norm": 0.3359375, + "learning_rate": 0.0001662930261137561, + "loss": 0.5936, "step": 920 }, { "epoch": 0.9457523029682702, - "grad_norm": 0.39453125, - "learning_rate": 2.2181276213820374e-06, - "loss": 0.474, + "grad_norm": 0.365234375, + "learning_rate": 0.0001653238146817233, + "loss": 0.6019, "step": 924 }, { "epoch": 0.9498464687819856, - "grad_norm": 0.349609375, - "learning_rate": 1.8966311253029497e-06, - "loss": 0.432, + "grad_norm": 0.326171875, + "learning_rate": 0.00016435395629755346, + "loss": 0.5651, "step": 928 }, { "epoch": 0.9539406345957011, - "grad_norm": 0.36328125, - "learning_rate": 1.6001453153189681e-06, - "loss": 0.4922, + "grad_norm": 0.34765625, + "learning_rate": 0.00016338349190744486, + "loss": 0.6279, "step": 932 }, { "epoch": 0.9580348004094166, - "grad_norm": 0.3671875, - "learning_rate": 1.3287202599300883e-06, - "loss": 0.4734, + "grad_norm": 0.365234375, + "learning_rate": 0.0001624124624831805, + "loss": 0.599, "step": 936 }, { "epoch": 0.962128966223132, - "grad_norm": 0.376953125, - "learning_rate": 1.082401795547111e-06, - "loss": 0.4651, + "grad_norm": 0.3671875, + "learning_rate": 0.00016144090902039856, + "loss": 0.593, "step": 940 }, { "epoch": 0.9662231320368475, - "grad_norm": 0.3515625, - "learning_rate": 8.612315187511321e-07, - "loss": 0.4569, + "grad_norm": 0.32421875, + "learning_rate": 0.00016046887253686135, + "loss": 0.5827, "step": 944 }, { "epoch": 0.970317297850563, - "grad_norm": 0.361328125, - "learning_rate": 6.652467792689465e-07, - "loss": 0.5034, + "grad_norm": 0.333984375, + "learning_rate": 0.00015949639407072383, + "loss": 0.6371, "step": 948 }, { "epoch": 0.9744114636642784, - "grad_norm": 0.404296875, - "learning_rate": 4.944806736657426e-07, - "loss": 0.4406, + "grad_norm": 0.37109375, + "learning_rate": 0.00015852351467880076, + "loss": 0.5856, "step": 952 }, { "epoch": 0.9785056294779939, - "grad_norm": 0.38671875, - "learning_rate": 3.4896203975587347e-07, - "loss": 0.4876, + "grad_norm": 0.359375, + "learning_rate": 0.00015755027543483353, + "loss": 0.6166, "step": 956 }, { "epoch": 0.9825997952917093, - "grad_norm": 0.392578125, - "learning_rate": 2.2871545173306915e-07, - "loss": 0.4446, + "grad_norm": 0.349609375, + "learning_rate": 0.00015657671742775613, + "loss": 0.5667, "step": 960 }, { "epoch": 0.9866939611054247, - "grad_norm": 0.373046875, - "learning_rate": 1.3376121602038937e-07, - "loss": 0.4238, + "grad_norm": 0.345703125, + "learning_rate": 0.00015560288175996023, + "loss": 0.5446, "step": 964 }, { "epoch": 0.9907881269191402, - "grad_norm": 0.412109375, - "learning_rate": 6.411536784106663e-08, - "loss": 0.5096, + "grad_norm": 0.380859375, + "learning_rate": 0.00015462880954555998, + "loss": 0.6376, "step": 968 }, { "epoch": 0.9948822927328557, - "grad_norm": 0.38671875, - "learning_rate": 1.978966851062247e-08, - "loss": 0.4813, + "grad_norm": 0.3515625, + "learning_rate": 0.0001536545419086563, + "loss": 0.6071, "step": 972 }, { "epoch": 0.9989764585465711, - "grad_norm": 0.416015625, - "learning_rate": 7.916034505894842e-10, - "loss": 0.4836, + "grad_norm": 0.376953125, + "learning_rate": 0.00015268011998160048, + "loss": 0.6143, "step": 976 }, { - "epoch": 1.0, - "step": 977, - "total_flos": 6.393618015065211e+17, - "train_loss": 0.49131298977252524, - "train_runtime": 3930.4246, + "epoch": 1.0030706243602865, + "grad_norm": 0.302734375, + "learning_rate": 0.00015170558490325793, + "loss": 0.5123, + "step": 980 + }, + { + "epoch": 1.007164790174002, + "grad_norm": 0.337890625, + "learning_rate": 0.000150730977817271, + "loss": 0.5264, + "step": 984 + }, + { + "epoch": 1.0112589559877174, + "grad_norm": 0.34375, + "learning_rate": 0.00014975633987032212, + "loss": 0.4917, + "step": 988 + }, + { + "epoch": 1.015353121801433, + "grad_norm": 0.34375, + "learning_rate": 0.00014878171221039676, + "loss": 0.5258, + "step": 992 + }, + { + "epoch": 1.0194472876151484, + "grad_norm": 0.353515625, + "learning_rate": 0.000147807135985046, + "loss": 0.4466, + "step": 996 + }, + { + "epoch": 1.0235414534288638, + "grad_norm": 0.3671875, + "learning_rate": 0.00014683265233964937, + "loss": 0.5049, + "step": 1000 + }, + { + "epoch": 1.0276356192425793, + "grad_norm": 0.345703125, + "learning_rate": 0.00014585830241567785, + "loss": 0.469, + "step": 1004 + }, + { + "epoch": 1.0317297850562948, + "grad_norm": 0.337890625, + "learning_rate": 0.00014488412734895692, + "loss": 0.4901, + "step": 1008 + }, + { + "epoch": 1.0358239508700102, + "grad_norm": 0.34375, + "learning_rate": 0.00014391016826792972, + "loss": 0.5008, + "step": 1012 + }, + { + "epoch": 1.0399181166837257, + "grad_norm": 0.35546875, + "learning_rate": 0.0001429364662919208, + "loss": 0.5037, + "step": 1016 + }, + { + "epoch": 1.0440122824974412, + "grad_norm": 0.32421875, + "learning_rate": 0.00014196306252939998, + "loss": 0.5418, + "step": 1020 + }, + { + "epoch": 1.0481064483111566, + "grad_norm": 0.373046875, + "learning_rate": 0.00014098999807624695, + "loss": 0.5068, + "step": 1024 + }, + { + "epoch": 1.052200614124872, + "grad_norm": 0.369140625, + "learning_rate": 0.00014001731401401622, + "loss": 0.523, + "step": 1028 + }, + { + "epoch": 1.0562947799385876, + "grad_norm": 0.34765625, + "learning_rate": 0.00013904505140820264, + "loss": 0.486, + "step": 1032 + }, + { + "epoch": 1.060388945752303, + "grad_norm": 0.322265625, + "learning_rate": 0.00013807325130650764, + "loss": 0.4964, + "step": 1036 + }, + { + "epoch": 1.0644831115660185, + "grad_norm": 0.349609375, + "learning_rate": 0.00013710195473710636, + "loss": 0.4921, + "step": 1040 + }, + { + "epoch": 1.068577277379734, + "grad_norm": 0.36328125, + "learning_rate": 0.00013613120270691552, + "loss": 0.5132, + "step": 1044 + }, + { + "epoch": 1.0726714431934494, + "grad_norm": 0.349609375, + "learning_rate": 0.00013516103619986192, + "loss": 0.5205, + "step": 1048 + }, + { + "epoch": 1.076765609007165, + "grad_norm": 0.341796875, + "learning_rate": 0.00013419149617515243, + "loss": 0.5278, + "step": 1052 + }, + { + "epoch": 1.0808597748208801, + "grad_norm": 0.3515625, + "learning_rate": 0.00013322262356554456, + "loss": 0.4682, + "step": 1056 + }, + { + "epoch": 1.0849539406345956, + "grad_norm": 0.35546875, + "learning_rate": 0.0001322544592756185, + "loss": 0.5016, + "step": 1060 + }, + { + "epoch": 1.089048106448311, + "grad_norm": 0.333984375, + "learning_rate": 0.00013128704418004995, + "loss": 0.5081, + "step": 1064 + }, + { + "epoch": 1.0931422722620265, + "grad_norm": 0.33203125, + "learning_rate": 0.00013032041912188467, + "loss": 0.5117, + "step": 1068 + }, + { + "epoch": 1.097236438075742, + "grad_norm": 0.3359375, + "learning_rate": 0.00012935462491081391, + "loss": 0.4805, + "step": 1072 + }, + { + "epoch": 1.1013306038894575, + "grad_norm": 0.369140625, + "learning_rate": 0.00012838970232145172, + "loss": 0.5378, + "step": 1076 + }, + { + "epoch": 1.105424769703173, + "grad_norm": 0.349609375, + "learning_rate": 0.00012742569209161334, + "loss": 0.494, + "step": 1080 + }, + { + "epoch": 1.1095189355168884, + "grad_norm": 0.34375, + "learning_rate": 0.00012646263492059528, + "loss": 0.4742, + "step": 1084 + }, + { + "epoch": 1.1136131013306039, + "grad_norm": 0.357421875, + "learning_rate": 0.0001255005714674573, + "loss": 0.5341, + "step": 1088 + }, + { + "epoch": 1.1177072671443193, + "grad_norm": 0.3515625, + "learning_rate": 0.00012453954234930542, + "loss": 0.5028, + "step": 1092 + }, + { + "epoch": 1.1218014329580348, + "grad_norm": 0.353515625, + "learning_rate": 0.00012357958813957748, + "loss": 0.4893, + "step": 1096 + }, + { + "epoch": 1.1258955987717503, + "grad_norm": 0.33984375, + "learning_rate": 0.00012262074936632994, + "loss": 0.548, + "step": 1100 + }, + { + "epoch": 1.1299897645854657, + "grad_norm": 0.36328125, + "learning_rate": 0.00012166306651052708, + "loss": 0.4871, + "step": 1104 + }, + { + "epoch": 1.1340839303991812, + "grad_norm": 0.349609375, + "learning_rate": 0.00012070658000433166, + "loss": 0.4393, + "step": 1108 + }, + { + "epoch": 1.1381780962128967, + "grad_norm": 0.353515625, + "learning_rate": 0.00011975133022939816, + "loss": 0.5077, + "step": 1112 + }, + { + "epoch": 1.1422722620266121, + "grad_norm": 0.369140625, + "learning_rate": 0.0001187973575151677, + "loss": 0.5037, + "step": 1116 + }, + { + "epoch": 1.1463664278403276, + "grad_norm": 0.341796875, + "learning_rate": 0.00011784470213716574, + "loss": 0.4682, + "step": 1120 + }, + { + "epoch": 1.150460593654043, + "grad_norm": 0.373046875, + "learning_rate": 0.00011689340431530123, + "loss": 0.5539, + "step": 1124 + }, + { + "epoch": 1.1545547594677585, + "grad_norm": 0.33984375, + "learning_rate": 0.00011594350421216891, + "loss": 0.4853, + "step": 1128 + }, + { + "epoch": 1.158648925281474, + "grad_norm": 0.36328125, + "learning_rate": 0.00011499504193135363, + "loss": 0.5045, + "step": 1132 + }, + { + "epoch": 1.1627430910951895, + "grad_norm": 0.388671875, + "learning_rate": 0.00011404805751573712, + "loss": 0.4964, + "step": 1136 + }, + { + "epoch": 1.1668372569089047, + "grad_norm": 0.37109375, + "learning_rate": 0.00011310259094580754, + "loss": 0.4819, + "step": 1140 + }, + { + "epoch": 1.1709314227226202, + "grad_norm": 0.3359375, + "learning_rate": 0.00011215868213797156, + "loss": 0.4805, + "step": 1144 + }, + { + "epoch": 1.1750255885363357, + "grad_norm": 0.361328125, + "learning_rate": 0.00011121637094286903, + "loss": 0.4872, + "step": 1148 + }, + { + "epoch": 1.1791197543500511, + "grad_norm": 0.359375, + "learning_rate": 0.00011027569714369059, + "loss": 0.4955, + "step": 1152 + }, + { + "epoch": 1.1832139201637666, + "grad_norm": 0.384765625, + "learning_rate": 0.00010933670045449822, + "loss": 0.4939, + "step": 1156 + }, + { + "epoch": 1.187308085977482, + "grad_norm": 0.345703125, + "learning_rate": 0.00010839942051854829, + "loss": 0.5074, + "step": 1160 + }, + { + "epoch": 1.1914022517911975, + "grad_norm": 0.384765625, + "learning_rate": 0.00010746389690661808, + "loss": 0.5396, + "step": 1164 + }, + { + "epoch": 1.195496417604913, + "grad_norm": 0.369140625, + "learning_rate": 0.000106530169115335, + "loss": 0.5302, + "step": 1168 + }, + { + "epoch": 1.1995905834186285, + "grad_norm": 0.376953125, + "learning_rate": 0.00010559827656550933, + "loss": 0.5012, + "step": 1172 + }, + { + "epoch": 1.203684749232344, + "grad_norm": 0.357421875, + "learning_rate": 0.00010466825860046967, + "loss": 0.5324, + "step": 1176 + }, + { + "epoch": 1.2077789150460594, + "grad_norm": 0.32421875, + "learning_rate": 0.00010374015448440203, + "loss": 0.498, + "step": 1180 + }, + { + "epoch": 1.2118730808597749, + "grad_norm": 0.365234375, + "learning_rate": 0.00010281400340069205, + "loss": 0.4906, + "step": 1184 + }, + { + "epoch": 1.2159672466734903, + "grad_norm": 0.357421875, + "learning_rate": 0.00010188984445027097, + "loss": 0.4885, + "step": 1188 + }, + { + "epoch": 1.2200614124872058, + "grad_norm": 0.376953125, + "learning_rate": 0.00010096771664996456, + "loss": 0.5133, + "step": 1192 + }, + { + "epoch": 1.2241555783009213, + "grad_norm": 0.349609375, + "learning_rate": 0.00010004765893084603, + "loss": 0.4521, + "step": 1196 + }, + { + "epoch": 1.2282497441146367, + "grad_norm": 0.37109375, + "learning_rate": 9.912971013659232e-05, + "loss": 0.5168, + "step": 1200 + }, + { + "epoch": 1.2323439099283522, + "grad_norm": 0.498046875, + "learning_rate": 9.821390902184426e-05, + "loss": 0.4759, + "step": 1204 + }, + { + "epoch": 1.2364380757420674, + "grad_norm": 0.36328125, + "learning_rate": 9.730029425057045e-05, + "loss": 0.5194, + "step": 1208 + }, + { + "epoch": 1.240532241555783, + "grad_norm": 0.376953125, + "learning_rate": 9.638890439443464e-05, + "loss": 0.4903, + "step": 1212 + }, + { + "epoch": 1.2446264073694984, + "grad_norm": 0.33984375, + "learning_rate": 9.547977793116762e-05, + "loss": 0.5149, + "step": 1216 + }, + { + "epoch": 1.2487205731832138, + "grad_norm": 0.3359375, + "learning_rate": 9.457295324294247e-05, + "loss": 0.4665, + "step": 1220 + }, + { + "epoch": 1.2528147389969293, + "grad_norm": 0.408203125, + "learning_rate": 9.366846861475435e-05, + "loss": 0.4834, + "step": 1224 + }, + { + "epoch": 1.2569089048106448, + "grad_norm": 0.34375, + "learning_rate": 9.276636223280396e-05, + "loss": 0.5027, + "step": 1228 + }, + { + "epoch": 1.2610030706243602, + "grad_norm": 0.36328125, + "learning_rate": 9.186667218288549e-05, + "loss": 0.5093, + "step": 1232 + }, + { + "epoch": 1.2650972364380757, + "grad_norm": 0.35546875, + "learning_rate": 9.096943644877854e-05, + "loss": 0.5105, + "step": 1236 + }, + { + "epoch": 1.2691914022517912, + "grad_norm": 0.380859375, + "learning_rate": 9.007469291064467e-05, + "loss": 0.5212, + "step": 1240 + }, + { + "epoch": 1.2732855680655066, + "grad_norm": 0.353515625, + "learning_rate": 8.918247934342806e-05, + "loss": 0.5191, + "step": 1244 + }, + { + "epoch": 1.277379733879222, + "grad_norm": 0.365234375, + "learning_rate": 8.829283341526067e-05, + "loss": 0.5019, + "step": 1248 + }, + { + "epoch": 1.2814738996929376, + "grad_norm": 0.384765625, + "learning_rate": 8.74057926858721e-05, + "loss": 0.5162, + "step": 1252 + }, + { + "epoch": 1.285568065506653, + "grad_norm": 0.3828125, + "learning_rate": 8.652139460500359e-05, + "loss": 0.5061, + "step": 1256 + }, + { + "epoch": 1.2896622313203685, + "grad_norm": 0.376953125, + "learning_rate": 8.563967651082713e-05, + "loss": 0.5003, + "step": 1260 + }, + { + "epoch": 1.293756397134084, + "grad_norm": 0.3671875, + "learning_rate": 8.47606756283691e-05, + "loss": 0.5196, + "step": 1264 + }, + { + "epoch": 1.2978505629477994, + "grad_norm": 0.34765625, + "learning_rate": 8.388442906793862e-05, + "loss": 0.4932, + "step": 1268 + }, + { + "epoch": 1.301944728761515, + "grad_norm": 0.369140625, + "learning_rate": 8.301097382356067e-05, + "loss": 0.4871, + "step": 1272 + }, + { + "epoch": 1.3060388945752304, + "grad_norm": 0.384765625, + "learning_rate": 8.214034677141465e-05, + "loss": 0.494, + "step": 1276 + }, + { + "epoch": 1.3101330603889458, + "grad_norm": 0.357421875, + "learning_rate": 8.127258466827704e-05, + "loss": 0.5034, + "step": 1280 + }, + { + "epoch": 1.3142272262026613, + "grad_norm": 0.365234375, + "learning_rate": 8.040772414996984e-05, + "loss": 0.5111, + "step": 1284 + }, + { + "epoch": 1.3183213920163768, + "grad_norm": 0.32421875, + "learning_rate": 7.95458017298138e-05, + "loss": 0.5169, + "step": 1288 + }, + { + "epoch": 1.3224155578300922, + "grad_norm": 0.37109375, + "learning_rate": 7.868685379708686e-05, + "loss": 0.4631, + "step": 1292 + }, + { + "epoch": 1.3265097236438077, + "grad_norm": 0.341796875, + "learning_rate": 7.783091661548789e-05, + "loss": 0.4756, + "step": 1296 + }, + { + "epoch": 1.330603889457523, + "grad_norm": 0.34765625, + "learning_rate": 7.697802632160557e-05, + "loss": 0.4705, + "step": 1300 + }, + { + "epoch": 1.3346980552712384, + "grad_norm": 0.357421875, + "learning_rate": 7.612821892339284e-05, + "loss": 0.522, + "step": 1304 + }, + { + "epoch": 1.3387922210849539, + "grad_norm": 0.375, + "learning_rate": 7.528153029864682e-05, + "loss": 0.5192, + "step": 1308 + }, + { + "epoch": 1.3428863868986693, + "grad_norm": 0.341796875, + "learning_rate": 7.443799619349374e-05, + "loss": 0.5183, + "step": 1312 + }, + { + "epoch": 1.3469805527123848, + "grad_norm": 0.35546875, + "learning_rate": 7.359765222088008e-05, + "loss": 0.506, + "step": 1316 + }, + { + "epoch": 1.3510747185261003, + "grad_norm": 0.3828125, + "learning_rate": 7.276053385906896e-05, + "loss": 0.5021, + "step": 1320 + }, + { + "epoch": 1.3551688843398157, + "grad_norm": 0.3515625, + "learning_rate": 7.192667645014223e-05, + "loss": 0.4803, + "step": 1324 + }, + { + "epoch": 1.3592630501535312, + "grad_norm": 0.365234375, + "learning_rate": 7.109611519850845e-05, + "loss": 0.4941, + "step": 1328 + }, + { + "epoch": 1.3633572159672467, + "grad_norm": 0.37890625, + "learning_rate": 7.026888516941658e-05, + "loss": 0.508, + "step": 1332 + }, + { + "epoch": 1.3674513817809621, + "grad_norm": 0.36328125, + "learning_rate": 6.944502128747558e-05, + "loss": 0.5139, + "step": 1336 + }, + { + "epoch": 1.3715455475946776, + "grad_norm": 0.36328125, + "learning_rate": 6.862455833517979e-05, + "loss": 0.4899, + "step": 1340 + }, + { + "epoch": 1.375639713408393, + "grad_norm": 0.38671875, + "learning_rate": 6.780753095144086e-05, + "loss": 0.526, + "step": 1344 + }, + { + "epoch": 1.3797338792221086, + "grad_norm": 0.373046875, + "learning_rate": 6.699397363012482e-05, + "loss": 0.499, + "step": 1348 + }, + { + "epoch": 1.383828045035824, + "grad_norm": 0.39453125, + "learning_rate": 6.618392071859612e-05, + "loss": 0.5155, + "step": 1352 + }, + { + "epoch": 1.3879222108495395, + "grad_norm": 0.384765625, + "learning_rate": 6.537740641626746e-05, + "loss": 0.5165, + "step": 1356 + }, + { + "epoch": 1.3920163766632547, + "grad_norm": 0.357421875, + "learning_rate": 6.457446477315588e-05, + "loss": 0.4815, + "step": 1360 + }, + { + "epoch": 1.3961105424769702, + "grad_norm": 0.3671875, + "learning_rate": 6.377512968844533e-05, + "loss": 0.5091, + "step": 1364 + }, + { + "epoch": 1.4002047082906857, + "grad_norm": 0.3515625, + "learning_rate": 6.297943490905531e-05, + "loss": 0.4868, + "step": 1368 + }, + { + "epoch": 1.4042988741044011, + "grad_norm": 0.373046875, + "learning_rate": 6.218741402821624e-05, + "loss": 0.4928, + "step": 1372 + }, + { + "epoch": 1.4083930399181166, + "grad_norm": 0.369140625, + "learning_rate": 6.139910048405134e-05, + "loss": 0.5173, + "step": 1376 + }, + { + "epoch": 1.412487205731832, + "grad_norm": 0.3515625, + "learning_rate": 6.061452755816451e-05, + "loss": 0.492, + "step": 1380 + }, + { + "epoch": 1.4165813715455475, + "grad_norm": 0.357421875, + "learning_rate": 5.9833728374235615e-05, + "loss": 0.5033, + "step": 1384 + }, + { + "epoch": 1.420675537359263, + "grad_norm": 0.33984375, + "learning_rate": 5.9056735896621796e-05, + "loss": 0.5119, + "step": 1388 + }, + { + "epoch": 1.4247697031729785, + "grad_norm": 0.34375, + "learning_rate": 5.8283582928965986e-05, + "loss": 0.4938, + "step": 1392 + }, + { + "epoch": 1.428863868986694, + "grad_norm": 0.36328125, + "learning_rate": 5.751430211281165e-05, + "loss": 0.4877, + "step": 1396 + }, + { + "epoch": 1.4329580348004094, + "grad_norm": 0.359375, + "learning_rate": 5.674892592622502e-05, + "loss": 0.4866, + "step": 1400 + }, + { + "epoch": 1.4370522006141249, + "grad_norm": 0.369140625, + "learning_rate": 5.5987486682423865e-05, + "loss": 0.4863, + "step": 1404 + }, + { + "epoch": 1.4411463664278403, + "grad_norm": 0.361328125, + "learning_rate": 5.5230016528413076e-05, + "loss": 0.5, + "step": 1408 + }, + { + "epoch": 1.4452405322415558, + "grad_norm": 0.357421875, + "learning_rate": 5.447654744362761e-05, + "loss": 0.4917, + "step": 1412 + }, + { + "epoch": 1.4493346980552713, + "grad_norm": 0.34765625, + "learning_rate": 5.37271112385823e-05, + "loss": 0.4828, + "step": 1416 + }, + { + "epoch": 1.4534288638689867, + "grad_norm": 0.3515625, + "learning_rate": 5.2981739553528944e-05, + "loss": 0.5278, + "step": 1420 + }, + { + "epoch": 1.4575230296827022, + "grad_norm": 0.365234375, + "learning_rate": 5.2240463857120365e-05, + "loss": 0.4959, + "step": 1424 + }, + { + "epoch": 1.4616171954964177, + "grad_norm": 0.384765625, + "learning_rate": 5.1503315445081946e-05, + "loss": 0.4767, + "step": 1428 + }, + { + "epoch": 1.4657113613101331, + "grad_norm": 0.357421875, + "learning_rate": 5.0770325438890304e-05, + "loss": 0.5052, + "step": 1432 + }, + { + "epoch": 1.4698055271238486, + "grad_norm": 0.365234375, + "learning_rate": 5.004152478445939e-05, + "loss": 0.4988, + "step": 1436 + }, + { + "epoch": 1.473899692937564, + "grad_norm": 0.36328125, + "learning_rate": 4.9316944250834126e-05, + "loss": 0.486, + "step": 1440 + }, + { + "epoch": 1.4779938587512795, + "grad_norm": 0.384765625, + "learning_rate": 4.8596614428891094e-05, + "loss": 0.5126, + "step": 1444 + }, + { + "epoch": 1.482088024564995, + "grad_norm": 0.35546875, + "learning_rate": 4.788056573004726e-05, + "loss": 0.5124, + "step": 1448 + }, + { + "epoch": 1.4861821903787105, + "grad_norm": 0.365234375, + "learning_rate": 4.7168828384975985e-05, + "loss": 0.5304, + "step": 1452 + }, + { + "epoch": 1.4902763561924257, + "grad_norm": 0.376953125, + "learning_rate": 4.646143244233068e-05, + "loss": 0.5023, + "step": 1456 + }, + { + "epoch": 1.4943705220061412, + "grad_norm": 0.33203125, + "learning_rate": 4.575840776747621e-05, + "loss": 0.453, + "step": 1460 + }, + { + "epoch": 1.4984646878198566, + "grad_norm": 0.365234375, + "learning_rate": 4.505978404122805e-05, + "loss": 0.4769, + "step": 1464 + }, + { + "epoch": 1.5025588536335721, + "grad_norm": 0.36328125, + "learning_rate": 4.436559075859911e-05, + "loss": 0.54, + "step": 1468 + }, + { + "epoch": 1.5066530194472876, + "grad_norm": 0.365234375, + "learning_rate": 4.367585722755474e-05, + "loss": 0.5083, + "step": 1472 + }, + { + "epoch": 1.510747185261003, + "grad_norm": 0.341796875, + "learning_rate": 4.299061256777498e-05, + "loss": 0.4746, + "step": 1476 + }, + { + "epoch": 1.5148413510747185, + "grad_norm": 0.3671875, + "learning_rate": 4.23098857094255e-05, + "loss": 0.511, + "step": 1480 + }, + { + "epoch": 1.518935516888434, + "grad_norm": 0.380859375, + "learning_rate": 4.163370539193606e-05, + "loss": 0.4853, + "step": 1484 + }, + { + "epoch": 1.5230296827021494, + "grad_norm": 0.35546875, + "learning_rate": 4.0962100162787195e-05, + "loss": 0.4949, + "step": 1488 + }, + { + "epoch": 1.527123848515865, + "grad_norm": 0.37109375, + "learning_rate": 4.029509837630499e-05, + "loss": 0.4859, + "step": 1492 + }, + { + "epoch": 1.5312180143295804, + "grad_norm": 0.353515625, + "learning_rate": 3.9632728192463986e-05, + "loss": 0.5075, + "step": 1496 + }, + { + "epoch": 1.5353121801432958, + "grad_norm": 0.380859375, + "learning_rate": 3.897501757569827e-05, + "loss": 0.5268, + "step": 1500 + }, + { + "epoch": 1.5394063459570113, + "grad_norm": 0.38671875, + "learning_rate": 3.8321994293720886e-05, + "loss": 0.5249, + "step": 1504 + }, + { + "epoch": 1.5435005117707266, + "grad_norm": 0.345703125, + "learning_rate": 3.76736859163516e-05, + "loss": 0.4993, + "step": 1508 + }, + { + "epoch": 1.547594677584442, + "grad_norm": 0.384765625, + "learning_rate": 3.703011981435276e-05, + "loss": 0.5127, + "step": 1512 + }, + { + "epoch": 1.5516888433981575, + "grad_norm": 0.353515625, + "learning_rate": 3.639132315827381e-05, + "loss": 0.48, + "step": 1516 + }, + { + "epoch": 1.555783009211873, + "grad_norm": 0.373046875, + "learning_rate": 3.575732291730427e-05, + "loss": 0.5048, + "step": 1520 + }, + { + "epoch": 1.5598771750255884, + "grad_norm": 0.388671875, + "learning_rate": 3.5128145858135e-05, + "loss": 0.4897, + "step": 1524 + }, + { + "epoch": 1.563971340839304, + "grad_norm": 0.38671875, + "learning_rate": 3.450381854382825e-05, + "loss": 0.4763, + "step": 1528 + }, + { + "epoch": 1.5680655066530194, + "grad_norm": 0.361328125, + "learning_rate": 3.388436733269613e-05, + "loss": 0.5356, + "step": 1532 + }, + { + "epoch": 1.5721596724667348, + "grad_norm": 0.35546875, + "learning_rate": 3.3269818377187804e-05, + "loss": 0.5083, + "step": 1536 + }, + { + "epoch": 1.5762538382804503, + "grad_norm": 0.341796875, + "learning_rate": 3.266019762278547e-05, + "loss": 0.4831, + "step": 1540 + }, + { + "epoch": 1.5803480040941658, + "grad_norm": 0.357421875, + "learning_rate": 3.2055530806908794e-05, + "loss": 0.4942, + "step": 1544 + }, + { + "epoch": 1.5844421699078812, + "grad_norm": 0.380859375, + "learning_rate": 3.1455843457828446e-05, + "loss": 0.4743, + "step": 1548 + }, + { + "epoch": 1.5885363357215967, + "grad_norm": 0.353515625, + "learning_rate": 3.0861160893588323e-05, + "loss": 0.4986, + "step": 1552 + }, + { + "epoch": 1.5926305015353122, + "grad_norm": 0.365234375, + "learning_rate": 3.0271508220936454e-05, + "loss": 0.4733, + "step": 1556 + }, + { + "epoch": 1.5967246673490276, + "grad_norm": 0.33203125, + "learning_rate": 2.9686910334265367e-05, + "loss": 0.4876, + "step": 1560 + }, + { + "epoch": 1.600818833162743, + "grad_norm": 0.365234375, + "learning_rate": 2.910739191456079e-05, + "loss": 0.4959, + "step": 1564 + }, + { + "epoch": 1.6049129989764586, + "grad_norm": 0.35546875, + "learning_rate": 2.8532977428359882e-05, + "loss": 0.4932, + "step": 1568 + }, + { + "epoch": 1.609007164790174, + "grad_norm": 0.36328125, + "learning_rate": 2.796369112671804e-05, + "loss": 0.4872, + "step": 1572 + }, + { + "epoch": 1.6131013306038895, + "grad_norm": 0.369140625, + "learning_rate": 2.7399557044185216e-05, + "loss": 0.4906, + "step": 1576 + }, + { + "epoch": 1.617195496417605, + "grad_norm": 0.34765625, + "learning_rate": 2.684059899779119e-05, + "loss": 0.5091, + "step": 1580 + }, + { + "epoch": 1.6212896622313204, + "grad_norm": 0.353515625, + "learning_rate": 2.6286840586039965e-05, + "loss": 0.487, + "step": 1584 + }, + { + "epoch": 1.625383828045036, + "grad_norm": 0.341796875, + "learning_rate": 2.573830518791359e-05, + "loss": 0.4899, + "step": 1588 + }, + { + "epoch": 1.6294779938587514, + "grad_norm": 0.345703125, + "learning_rate": 2.5195015961885017e-05, + "loss": 0.4985, + "step": 1592 + }, + { + "epoch": 1.6335721596724668, + "grad_norm": 0.330078125, + "learning_rate": 2.4656995844940397e-05, + "loss": 0.5141, + "step": 1596 + }, + { + "epoch": 1.6376663254861823, + "grad_norm": 0.3515625, + "learning_rate": 2.4124267551610883e-05, + "loss": 0.4987, + "step": 1600 + }, + { + "epoch": 1.6417604912998978, + "grad_norm": 0.359375, + "learning_rate": 2.3596853573013356e-05, + "loss": 0.4614, + "step": 1604 + }, + { + "epoch": 1.6458546571136132, + "grad_norm": 0.390625, + "learning_rate": 2.307477617590108e-05, + "loss": 0.5024, + "step": 1608 + }, + { + "epoch": 1.6499488229273287, + "grad_norm": 0.359375, + "learning_rate": 2.255805740172359e-05, + "loss": 0.4936, + "step": 1612 + }, + { + "epoch": 1.6540429887410442, + "grad_norm": 0.375, + "learning_rate": 2.2046719065696077e-05, + "loss": 0.4951, + "step": 1616 + }, + { + "epoch": 1.6581371545547596, + "grad_norm": 0.375, + "learning_rate": 2.1540782755878456e-05, + "loss": 0.485, + "step": 1620 + }, + { + "epoch": 1.6622313203684749, + "grad_norm": 0.369140625, + "learning_rate": 2.1040269832263895e-05, + "loss": 0.5239, + "step": 1624 + }, + { + "epoch": 1.6663254861821903, + "grad_norm": 0.3671875, + "learning_rate": 2.054520142587703e-05, + "loss": 0.5184, + "step": 1628 + }, + { + "epoch": 1.6704196519959058, + "grad_norm": 0.337890625, + "learning_rate": 2.0055598437881786e-05, + "loss": 0.4921, + "step": 1632 + }, + { + "epoch": 1.6745138178096213, + "grad_norm": 0.376953125, + "learning_rate": 1.957148153869918e-05, + "loss": 0.5345, + "step": 1636 + }, + { + "epoch": 1.6786079836233367, + "grad_norm": 0.3359375, + "learning_rate": 1.9092871167134304e-05, + "loss": 0.4352, + "step": 1640 + }, + { + "epoch": 1.6827021494370522, + "grad_norm": 0.3515625, + "learning_rate": 1.8619787529513674e-05, + "loss": 0.5166, + "step": 1644 + }, + { + "epoch": 1.6867963152507677, + "grad_norm": 0.3515625, + "learning_rate": 1.8152250598832045e-05, + "loss": 0.4968, + "step": 1648 + }, + { + "epoch": 1.6908904810644831, + "grad_norm": 0.34765625, + "learning_rate": 1.7690280113909215e-05, + "loss": 0.513, + "step": 1652 + }, + { + "epoch": 1.6949846468781986, + "grad_norm": 0.36328125, + "learning_rate": 1.7233895578556633e-05, + "loss": 0.4676, + "step": 1656 + }, + { + "epoch": 1.699078812691914, + "grad_norm": 0.375, + "learning_rate": 1.6783116260754027e-05, + "loss": 0.4749, + "step": 1660 + }, + { + "epoch": 1.7031729785056293, + "grad_norm": 0.375, + "learning_rate": 1.6337961191835868e-05, + "loss": 0.5206, + "step": 1664 + }, + { + "epoch": 1.7072671443193448, + "grad_norm": 0.349609375, + "learning_rate": 1.589844916568801e-05, + "loss": 0.473, + "step": 1668 + }, + { + "epoch": 1.7113613101330603, + "grad_norm": 0.345703125, + "learning_rate": 1.5464598737954086e-05, + "loss": 0.5221, + "step": 1672 + }, + { + "epoch": 1.7154554759467757, + "grad_norm": 0.353515625, + "learning_rate": 1.5036428225252174e-05, + "loss": 0.5134, + "step": 1676 + }, + { + "epoch": 1.7195496417604912, + "grad_norm": 0.3515625, + "learning_rate": 1.4613955704401541e-05, + "loss": 0.4537, + "step": 1680 + }, + { + "epoch": 1.7236438075742067, + "grad_norm": 0.3671875, + "learning_rate": 1.41971990116594e-05, + "loss": 0.5054, + "step": 1684 + }, + { + "epoch": 1.7277379733879221, + "grad_norm": 0.357421875, + "learning_rate": 1.3786175741967931e-05, + "loss": 0.4559, + "step": 1688 + }, + { + "epoch": 1.7318321392016376, + "grad_norm": 0.33203125, + "learning_rate": 1.3380903248211366e-05, + "loss": 0.525, + "step": 1692 + }, + { + "epoch": 1.735926305015353, + "grad_norm": 0.380859375, + "learning_rate": 1.298139864048348e-05, + "loss": 0.5126, + "step": 1696 + }, + { + "epoch": 1.7400204708290685, + "grad_norm": 0.365234375, + "learning_rate": 1.258767878536514e-05, + "loss": 0.5151, + "step": 1700 + }, + { + "epoch": 1.744114636642784, + "grad_norm": 0.37109375, + "learning_rate": 1.2199760305212302e-05, + "loss": 0.4852, + "step": 1704 + }, + { + "epoch": 1.7482088024564995, + "grad_norm": 0.369140625, + "learning_rate": 1.181765957745413e-05, + "loss": 0.4815, + "step": 1708 + }, + { + "epoch": 1.752302968270215, + "grad_norm": 0.369140625, + "learning_rate": 1.1441392733901628e-05, + "loss": 0.5223, + "step": 1712 + }, + { + "epoch": 1.7563971340839304, + "grad_norm": 0.36328125, + "learning_rate": 1.10709756600666e-05, + "loss": 0.5023, + "step": 1716 + }, + { + "epoch": 1.7604912998976459, + "grad_norm": 0.33984375, + "learning_rate": 1.0706423994490859e-05, + "loss": 0.4518, + "step": 1720 + }, + { + "epoch": 1.7645854657113613, + "grad_norm": 0.341796875, + "learning_rate": 1.0347753128086134e-05, + "loss": 0.4935, + "step": 1724 + }, + { + "epoch": 1.7686796315250768, + "grad_norm": 0.3671875, + "learning_rate": 9.994978203484305e-06, + "loss": 0.4814, + "step": 1728 + }, + { + "epoch": 1.7727737973387923, + "grad_norm": 0.3515625, + "learning_rate": 9.648114114397871e-06, + "loss": 0.4779, + "step": 1732 + }, + { + "epoch": 1.7768679631525077, + "grad_norm": 0.349609375, + "learning_rate": 9.307175504991427e-06, + "loss": 0.5067, + "step": 1736 + }, + { + "epoch": 1.7809621289662232, + "grad_norm": 0.384765625, + "learning_rate": 8.972176769263218e-06, + "loss": 0.4806, + "step": 1740 + }, + { + "epoch": 1.7850562947799387, + "grad_norm": 0.333984375, + "learning_rate": 8.643132050437518e-06, + "loss": 0.5255, + "step": 1744 + }, + { + "epoch": 1.7891504605936541, + "grad_norm": 0.38671875, + "learning_rate": 8.320055240367546e-06, + "loss": 0.4958, + "step": 1748 + }, + { + "epoch": 1.7932446264073696, + "grad_norm": 0.361328125, + "learning_rate": 8.00295997894893e-06, + "loss": 0.4584, + "step": 1752 + }, + { + "epoch": 1.797338792221085, + "grad_norm": 0.357421875, + "learning_rate": 7.691859653543825e-06, + "loss": 0.5105, + "step": 1756 + }, + { + "epoch": 1.8014329580348005, + "grad_norm": 0.34765625, + "learning_rate": 7.386767398415738e-06, + "loss": 0.4786, + "step": 1760 + }, + { + "epoch": 1.805527123848516, + "grad_norm": 0.33984375, + "learning_rate": 7.087696094175116e-06, + "loss": 0.4849, + "step": 1764 + }, + { + "epoch": 1.8096212896622315, + "grad_norm": 0.37109375, + "learning_rate": 6.794658367235356e-06, + "loss": 0.5323, + "step": 1768 + }, + { + "epoch": 1.813715455475947, + "grad_norm": 0.3515625, + "learning_rate": 6.507666589279836e-06, + "loss": 0.4986, + "step": 1772 + }, + { + "epoch": 1.8178096212896624, + "grad_norm": 0.369140625, + "learning_rate": 6.226732876739615e-06, + "loss": 0.4773, + "step": 1776 + }, + { + "epoch": 1.8219037871033776, + "grad_norm": 0.34765625, + "learning_rate": 5.951869090281891e-06, + "loss": 0.5024, + "step": 1780 + }, + { + "epoch": 1.825997952917093, + "grad_norm": 0.341796875, + "learning_rate": 5.68308683430917e-06, + "loss": 0.5089, + "step": 1784 + }, + { + "epoch": 1.8300921187308086, + "grad_norm": 0.36328125, + "learning_rate": 5.420397456469455e-06, + "loss": 0.5036, + "step": 1788 + }, + { + "epoch": 1.834186284544524, + "grad_norm": 0.341796875, + "learning_rate": 5.163812047177102e-06, + "loss": 0.4827, + "step": 1792 + }, + { + "epoch": 1.8382804503582395, + "grad_norm": 0.37109375, + "learning_rate": 4.913341439144636e-06, + "loss": 0.5148, + "step": 1796 + }, + { + "epoch": 1.842374616171955, + "grad_norm": 0.36328125, + "learning_rate": 4.668996206925357e-06, + "loss": 0.495, + "step": 1800 + }, + { + "epoch": 1.8464687819856704, + "grad_norm": 0.34765625, + "learning_rate": 4.430786666466895e-06, + "loss": 0.4838, + "step": 1804 + }, + { + "epoch": 1.850562947799386, + "grad_norm": 0.357421875, + "learning_rate": 4.198722874675742e-06, + "loss": 0.492, + "step": 1808 + }, + { + "epoch": 1.8546571136131014, + "grad_norm": 0.357421875, + "learning_rate": 3.9728146289926544e-06, + "loss": 0.518, + "step": 1812 + }, + { + "epoch": 1.8587512794268168, + "grad_norm": 0.373046875, + "learning_rate": 3.753071466978924e-06, + "loss": 0.5343, + "step": 1816 + }, + { + "epoch": 1.862845445240532, + "grad_norm": 0.365234375, + "learning_rate": 3.539502665913846e-06, + "loss": 0.4834, + "step": 1820 + }, + { + "epoch": 1.8669396110542475, + "grad_norm": 0.376953125, + "learning_rate": 3.3321172424029318e-06, + "loss": 0.5033, + "step": 1824 + }, + { + "epoch": 1.871033776867963, + "grad_norm": 0.369140625, + "learning_rate": 3.130923951997316e-06, + "loss": 0.5278, + "step": 1828 + }, + { + "epoch": 1.8751279426816785, + "grad_norm": 0.361328125, + "learning_rate": 2.9359312888241005e-06, + "loss": 0.5692, + "step": 1832 + }, + { + "epoch": 1.879222108495394, + "grad_norm": 0.36328125, + "learning_rate": 2.747147485227674e-06, + "loss": 0.5026, + "step": 1836 + }, + { + "epoch": 1.8833162743091094, + "grad_norm": 0.357421875, + "learning_rate": 2.564580511422226e-06, + "loss": 0.5102, + "step": 1840 + }, + { + "epoch": 1.8874104401228249, + "grad_norm": 0.361328125, + "learning_rate": 2.38823807515523e-06, + "loss": 0.4746, + "step": 1844 + }, + { + "epoch": 1.8915046059365404, + "grad_norm": 0.3828125, + "learning_rate": 2.2181276213820374e-06, + "loss": 0.4821, + "step": 1848 + }, + { + "epoch": 1.8955987717502558, + "grad_norm": 0.3828125, + "learning_rate": 2.0542563319515315e-06, + "loss": 0.4851, + "step": 1852 + }, + { + "epoch": 1.8996929375639713, + "grad_norm": 0.34375, + "learning_rate": 1.8966311253029497e-06, + "loss": 0.4923, + "step": 1856 + }, + { + "epoch": 1.9037871033776868, + "grad_norm": 0.353515625, + "learning_rate": 1.7452586561738203e-06, + "loss": 0.4947, + "step": 1860 + }, + { + "epoch": 1.9078812691914022, + "grad_norm": 0.349609375, + "learning_rate": 1.6001453153189681e-06, + "loss": 0.4607, + "step": 1864 + }, + { + "epoch": 1.9119754350051177, + "grad_norm": 0.345703125, + "learning_rate": 1.4612972292406655e-06, + "loss": 0.4958, + "step": 1868 + }, + { + "epoch": 1.9160696008188332, + "grad_norm": 0.349609375, + "learning_rate": 1.3287202599300883e-06, + "loss": 0.544, + "step": 1872 + }, + { + "epoch": 1.9201637666325486, + "grad_norm": 0.369140625, + "learning_rate": 1.2024200046197141e-06, + "loss": 0.5157, + "step": 1876 + }, + { + "epoch": 1.924257932446264, + "grad_norm": 0.34375, + "learning_rate": 1.082401795547111e-06, + "loss": 0.4889, + "step": 1880 + }, + { + "epoch": 1.9283520982599796, + "grad_norm": 0.34375, + "learning_rate": 9.686706997297676e-07, + "loss": 0.494, + "step": 1884 + }, + { + "epoch": 1.932446264073695, + "grad_norm": 0.376953125, + "learning_rate": 8.612315187511321e-07, + "loss": 0.5121, + "step": 1888 + }, + { + "epoch": 1.9365404298874105, + "grad_norm": 0.35546875, + "learning_rate": 7.600887885580053e-07, + "loss": 0.4891, + "step": 1892 + }, + { + "epoch": 1.940634595701126, + "grad_norm": 0.392578125, + "learning_rate": 6.652467792689465e-07, + "loss": 0.5027, + "step": 1896 + }, + { + "epoch": 1.9447287615148414, + "grad_norm": 0.359375, + "learning_rate": 5.767094949940154e-07, + "loss": 0.5149, + "step": 1900 + }, + { + "epoch": 1.9488229273285569, + "grad_norm": 0.365234375, + "learning_rate": 4.944806736657426e-07, + "loss": 0.485, + "step": 1904 + }, + { + "epoch": 1.9529170931422724, + "grad_norm": 0.349609375, + "learning_rate": 4.185637868812708e-07, + "loss": 0.4615, + "step": 1908 + }, + { + "epoch": 1.9570112589559878, + "grad_norm": 0.361328125, + "learning_rate": 3.4896203975587347e-07, + "loss": 0.4915, + "step": 1912 + }, + { + "epoch": 1.9611054247697033, + "grad_norm": 0.36328125, + "learning_rate": 2.8567837078756204e-07, + "loss": 0.5072, + "step": 1916 + }, + { + "epoch": 1.9651995905834188, + "grad_norm": 0.369140625, + "learning_rate": 2.2871545173306915e-07, + "loss": 0.509, + "step": 1920 + }, + { + "epoch": 1.9692937563971342, + "grad_norm": 0.3515625, + "learning_rate": 1.7807568749500512e-07, + "loss": 0.4865, + "step": 1924 + }, + { + "epoch": 1.9733879222108497, + "grad_norm": 0.3515625, + "learning_rate": 1.3376121602038937e-07, + "loss": 0.4669, + "step": 1928 + }, + { + "epoch": 1.9774820880245652, + "grad_norm": 0.349609375, + "learning_rate": 9.57739082103226e-08, + "loss": 0.4998, + "step": 1932 + }, + { + "epoch": 1.9815762538382804, + "grad_norm": 0.36328125, + "learning_rate": 6.411536784106663e-08, + "loss": 0.5088, + "step": 1936 + }, + { + "epoch": 1.9856704196519959, + "grad_norm": 0.349609375, + "learning_rate": 3.878693149628187e-08, + "loss": 0.4896, + "step": 1940 + }, + { + "epoch": 1.9897645854657113, + "grad_norm": 0.361328125, + "learning_rate": 1.978966851062247e-08, + "loss": 0.4867, + "step": 1944 + }, + { + "epoch": 1.9938587512794268, + "grad_norm": 0.38671875, + "learning_rate": 7.124380924555806e-09, + "loss": 0.5181, + "step": 1948 + }, + { + "epoch": 1.9979529170931423, + "grad_norm": 0.349609375, + "learning_rate": 7.916034505894842e-10, + "loss": 0.5421, + "step": 1952 + }, + { + "epoch": 2.0, + "step": 1954, + "total_flos": 1.2787236030130422e+18, + "train_loss": 0.5614301155387074, + "train_runtime": 7860.4318, "train_samples_per_second": 7.954, "train_steps_per_second": 0.249 } ], "logging_steps": 4, - "max_steps": 977, + "max_steps": 1954, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -1743,7 +3451,7 @@ "attributes": {} } }, - "total_flos": 6.393618015065211e+17, + "total_flos": 1.2787236030130422e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null