diff --git "a/checkpoint-6500/trainer_state.json" "b/checkpoint-6500/trainer_state.json" --- "a/checkpoint-6500/trainer_state.json" +++ "b/checkpoint-6500/trainer_state.json" @@ -1,7 +1,7 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 6.937033084311633, + "epoch": 6.653019447287615, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, @@ -9,11383 +9,11383 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.004268943436499467, + "epoch": 0.0040941658137154556, "grad_norm": 0.283203125, - "learning_rate": 1.8181818181818182e-05, - "loss": 1.3096, + "learning_rate": 1.7391304347826085e-05, + "loss": 1.2477, "step": 4 }, { - "epoch": 0.008537886872998933, - "grad_norm": 0.291015625, - "learning_rate": 3.6363636363636364e-05, - "loss": 1.2874, + "epoch": 0.008188331627430911, + "grad_norm": 0.2734375, + "learning_rate": 3.478260869565217e-05, + "loss": 1.292, "step": 8 }, { - "epoch": 0.012806830309498399, - "grad_norm": 0.244140625, - "learning_rate": 5.454545454545454e-05, - "loss": 1.2297, + "epoch": 0.012282497441146366, + "grad_norm": 0.212890625, + "learning_rate": 5.2173913043478256e-05, + "loss": 1.201, "step": 12 }, { - "epoch": 0.017075773745997867, - "grad_norm": 0.2197265625, - "learning_rate": 7.272727272727273e-05, - "loss": 1.1818, + "epoch": 0.016376663254861822, + "grad_norm": 0.208984375, + "learning_rate": 6.956521739130434e-05, + "loss": 1.218, "step": 16 }, { - "epoch": 0.021344717182497332, - "grad_norm": 0.244140625, - "learning_rate": 9.09090909090909e-05, - "loss": 1.2125, + "epoch": 0.02047082906857728, + "grad_norm": 0.259765625, + "learning_rate": 8.695652173913043e-05, + "loss": 1.2022, "step": 20 }, { - "epoch": 0.025613660618996798, - "grad_norm": 0.177734375, - "learning_rate": 0.00010909090909090908, - "loss": 1.1827, + "epoch": 0.02456499488229273, + "grad_norm": 0.171875, + "learning_rate": 0.00010434782608695651, + "loss": 1.1587, "step": 24 }, { - "epoch": 0.029882604055496264, - "grad_norm": 0.1884765625, - "learning_rate": 0.00012727272727272725, - "loss": 1.1747, + "epoch": 0.028659160696008188, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012173913043478261, + "loss": 1.1458, "step": 28 }, { - "epoch": 0.03415154749199573, - "grad_norm": 0.1826171875, - "learning_rate": 0.00014545454545454546, - "loss": 1.13, + "epoch": 0.032753326509723645, + "grad_norm": 0.1640625, + "learning_rate": 0.00013913043478260868, + "loss": 1.1289, "step": 32 }, { - "epoch": 0.0384204909284952, - "grad_norm": 0.208984375, - "learning_rate": 0.0001636363636363636, - "loss": 1.1684, + "epoch": 0.0368474923234391, + "grad_norm": 0.2060546875, + "learning_rate": 0.00015652173913043477, + "loss": 1.1263, "step": 36 }, { - "epoch": 0.042689434364994665, - "grad_norm": 0.2275390625, - "learning_rate": 0.0001818181818181818, - "loss": 1.0837, + "epoch": 0.04094165813715456, + "grad_norm": 0.1962890625, + "learning_rate": 0.00017391304347826085, + "loss": 1.1077, "step": 40 }, { - "epoch": 0.04695837780149413, - "grad_norm": 0.1943359375, - "learning_rate": 0.00019999999999999998, - "loss": 1.1134, + "epoch": 0.04503582395087001, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001913043478260869, + "loss": 1.1116, "step": 44 }, { - "epoch": 0.051227321237993596, - "grad_norm": 0.193359375, - "learning_rate": 0.00021818181818181816, - "loss": 1.0833, + "epoch": 0.04912998976458546, + "grad_norm": 0.208984375, + "learning_rate": 0.00020869565217391303, + "loss": 1.0815, "step": 48 }, { - "epoch": 0.05549626467449306, - "grad_norm": 0.2099609375, - "learning_rate": 0.00023636363636363633, - "loss": 1.0918, + "epoch": 0.05322415557830092, + "grad_norm": 0.203125, + "learning_rate": 0.0002260869565217391, + "loss": 1.1132, "step": 52 }, { - "epoch": 0.05976520811099253, - "grad_norm": 0.2177734375, - "learning_rate": 0.0002545454545454545, - "loss": 1.0809, + "epoch": 0.057318321392016376, + "grad_norm": 0.2109375, + "learning_rate": 0.00024347826086956522, + "loss": 1.1032, "step": 56 }, { - "epoch": 0.064034151547492, - "grad_norm": 0.1875, - "learning_rate": 0.0002727272727272727, - "loss": 1.088, + "epoch": 0.06141248720573183, + "grad_norm": 0.197265625, + "learning_rate": 0.0002608695652173913, + "loss": 1.0802, "step": 60 }, { - "epoch": 0.06830309498399147, - "grad_norm": 0.1845703125, - "learning_rate": 0.0002909090909090909, - "loss": 1.0458, + "epoch": 0.06550665301944729, + "grad_norm": 0.1962890625, + "learning_rate": 0.00027826086956521737, + "loss": 1.0692, "step": 64 }, { - "epoch": 0.07257203842049093, - "grad_norm": 0.181640625, - "learning_rate": 0.0002999999297687884, - "loss": 1.0937, + "epoch": 0.06960081883316274, + "grad_norm": 0.212890625, + "learning_rate": 0.00029565217391304345, + "loss": 1.0476, "step": 68 }, { - "epoch": 0.0768409818569904, - "grad_norm": 0.1953125, - "learning_rate": 0.00029999936791949057, - "loss": 1.0496, + "epoch": 0.0736949846468782, + "grad_norm": 0.1943359375, + "learning_rate": 0.00029999985464629347, + "loss": 1.0535, "step": 72 }, { - "epoch": 0.08110992529348986, - "grad_norm": 0.19921875, - "learning_rate": 0.0002999982442229994, - "loss": 1.0712, + "epoch": 0.07778915046059365, + "grad_norm": 0.1787109375, + "learning_rate": 0.00029999920863038815, + "loss": 1.042, "step": 76 }, { - "epoch": 0.08537886872998933, - "grad_norm": 0.2275390625, - "learning_rate": 0.00029999655868352383, - "loss": 1.0374, + "epoch": 0.08188331627430911, + "grad_norm": 0.1875, + "learning_rate": 0.0002999980458040957, + "loss": 1.0121, "step": 80 }, { - "epoch": 0.08964781216648879, - "grad_norm": 0.2177734375, - "learning_rate": 0.0002999943113073774, - "loss": 1.0349, + "epoch": 0.08597748208802457, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002999963661714225, + "loss": 1.0423, "step": 84 }, { - "epoch": 0.09391675560298826, - "grad_norm": 0.1923828125, - "learning_rate": 0.00029999150210297795, - "loss": 1.0417, + "epoch": 0.09007164790174002, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002999941697381556, + "loss": 1.0429, "step": 88 }, { - "epoch": 0.09818569903948772, - "grad_norm": 0.189453125, - "learning_rate": 0.0002999881310808479, - "loss": 1.0053, + "epoch": 0.09416581371545547, + "grad_norm": 0.193359375, + "learning_rate": 0.0002999914565118627, + "loss": 1.0316, "step": 92 }, { - "epoch": 0.10245464247598719, - "grad_norm": 0.203125, - "learning_rate": 0.0002999841982536139, - "loss": 1.0185, + "epoch": 0.09825997952917093, + "grad_norm": 0.1875, + "learning_rate": 0.0002999882265018919, + "loss": 1.0462, "step": 96 }, { - "epoch": 0.10672358591248667, - "grad_norm": 0.2158203125, - "learning_rate": 0.000299979703636007, - "loss": 1.0001, + "epoch": 0.1023541453428864, + "grad_norm": 0.2138671875, + "learning_rate": 0.000299984479719372, + "loss": 1.0391, "step": 100 }, { - "epoch": 0.11099252934898612, - "grad_norm": 0.208984375, - "learning_rate": 0.0002999746472448626, - "loss": 1.0292, + "epoch": 0.10644831115660185, + "grad_norm": 0.2001953125, + "learning_rate": 0.00029998021617721224, + "loss": 1.0167, "step": 104 }, { - "epoch": 0.1152614727854856, - "grad_norm": 0.1787109375, - "learning_rate": 0.00029996902909912017, - "loss": 1.0005, + "epoch": 0.1105424769703173, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002999754358901023, + "loss": 1.001, "step": 108 }, { - "epoch": 0.11953041622198506, - "grad_norm": 0.2236328125, - "learning_rate": 0.0002999628492198234, - "loss": 0.9856, + "epoch": 0.11463664278403275, + "grad_norm": 0.2001953125, + "learning_rate": 0.00029997013887451236, + "loss": 1.0101, "step": 112 }, { - "epoch": 0.12379935965848453, - "grad_norm": 0.2294921875, - "learning_rate": 0.00029995610763012, - "loss": 0.9663, + "epoch": 0.1187308085977482, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002999643251486927, + "loss": 0.9859, "step": 116 }, { - "epoch": 0.128068303094984, - "grad_norm": 0.2421875, - "learning_rate": 0.00029994880435526184, - "loss": 0.9839, + "epoch": 0.12282497441146366, + "grad_norm": 0.205078125, + "learning_rate": 0.0002999579947326742, + "loss": 1.0245, "step": 120 }, { - "epoch": 0.13233724653148346, - "grad_norm": 0.2119140625, - "learning_rate": 0.0002999409394226044, - "loss": 1.0058, + "epoch": 0.1269191402251791, + "grad_norm": 0.22265625, + "learning_rate": 0.0002999511476482678, + "loss": 0.9762, "step": 124 }, { - "epoch": 0.13660618996798293, - "grad_norm": 0.2275390625, - "learning_rate": 0.0002999325128616071, - "loss": 1.0166, + "epoch": 0.13101330603889458, + "grad_norm": 0.2021484375, + "learning_rate": 0.00029994378391906453, + "loss": 0.9698, "step": 128 }, { - "epoch": 0.14087513340448238, - "grad_norm": 0.2080078125, - "learning_rate": 0.00029992352470383307, - "loss": 0.9636, + "epoch": 0.13510747185261002, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002999359035704355, + "loss": 0.9787, "step": 132 }, { - "epoch": 0.14514407684098185, - "grad_norm": 0.2060546875, - "learning_rate": 0.0002999139749829488, - "loss": 0.9827, + "epoch": 0.13920163766632548, + "grad_norm": 0.2041015625, + "learning_rate": 0.00029992750662953196, + "loss": 1.0197, "step": 136 }, { - "epoch": 0.14941302027748132, - "grad_norm": 0.236328125, - "learning_rate": 0.00029990386373472444, - "loss": 0.9433, + "epoch": 0.14329580348004095, + "grad_norm": 0.208984375, + "learning_rate": 0.00029991859312528476, + "loss": 0.9958, "step": 140 }, { - "epoch": 0.1536819637139808, - "grad_norm": 0.2236328125, - "learning_rate": 0.0002998931909970333, - "loss": 0.9327, + "epoch": 0.1473899692937564, + "grad_norm": 0.2216796875, + "learning_rate": 0.00029990916308840476, + "loss": 0.9495, "step": 144 }, { - "epoch": 0.15795090715048027, - "grad_norm": 0.232421875, - "learning_rate": 0.0002998819568098519, - "loss": 0.9729, + "epoch": 0.15148413510747186, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002998992165513824, + "loss": 1.0079, "step": 148 }, { - "epoch": 0.1622198505869797, - "grad_norm": 0.205078125, - "learning_rate": 0.0002998701612152596, - "loss": 0.9278, + "epoch": 0.1555783009211873, + "grad_norm": 0.212890625, + "learning_rate": 0.00029988875354848766, + "loss": 0.9369, "step": 152 }, { - "epoch": 0.1664887940234792, - "grad_norm": 0.23046875, - "learning_rate": 0.00029985780425743886, - "loss": 0.9414, + "epoch": 0.15967246673490276, + "grad_norm": 0.2119140625, + "learning_rate": 0.00029987777411576996, + "loss": 0.9607, "step": 156 }, { - "epoch": 0.17075773745997866, - "grad_norm": 0.21484375, - "learning_rate": 0.0002998448859826747, - "loss": 0.9558, + "epoch": 0.16376663254861823, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002998662782910581, + "loss": 0.9773, "step": 160 }, { - "epoch": 0.17502668089647813, - "grad_norm": 0.2216796875, - "learning_rate": 0.0002998314064393546, - "loss": 0.948, + "epoch": 0.16786079836233367, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002998542661139601, + "loss": 0.9218, "step": 164 }, { - "epoch": 0.17929562433297758, - "grad_norm": 0.2265625, - "learning_rate": 0.0002998173656779683, - "loss": 0.9017, + "epoch": 0.17195496417604914, + "grad_norm": 0.203125, + "learning_rate": 0.0002998417376258628, + "loss": 0.9753, "step": 168 }, { - "epoch": 0.18356456776947705, - "grad_norm": 0.216796875, - "learning_rate": 0.0002998027637511078, - "loss": 0.9413, + "epoch": 0.17604912998976457, + "grad_norm": 0.2294921875, + "learning_rate": 0.00029982869286993225, + "loss": 0.9777, "step": 172 }, { - "epoch": 0.18783351120597652, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002997876007134671, - "loss": 0.9348, + "epoch": 0.18014329580348004, + "grad_norm": 0.265625, + "learning_rate": 0.00029981513189111314, + "loss": 0.9389, "step": 176 }, { - "epoch": 0.192102454642476, - "grad_norm": 0.2333984375, - "learning_rate": 0.0002997718766218419, - "loss": 0.9005, + "epoch": 0.1842374616171955, + "grad_norm": 0.2158203125, + "learning_rate": 0.00029980105473612865, + "loss": 0.9107, "step": 180 }, { - "epoch": 0.19637139807897544, - "grad_norm": 0.232421875, - "learning_rate": 0.00029975559153512917, - "loss": 0.9079, + "epoch": 0.18833162743091095, + "grad_norm": 0.205078125, + "learning_rate": 0.0002997864614534805, + "loss": 0.9503, "step": 184 }, { - "epoch": 0.2006403415154749, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002997387455143275, - "loss": 0.9624, + "epoch": 0.19242579324462641, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029977135209344874, + "loss": 0.9258, "step": 188 }, { - "epoch": 0.20490928495197439, - "grad_norm": 0.2109375, - "learning_rate": 0.0002997213386225363, - "loss": 0.9399, + "epoch": 0.19651995905834185, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002997557267080915, + "loss": 0.9276, "step": 192 }, { - "epoch": 0.20917822838847386, - "grad_norm": 0.25, - "learning_rate": 0.000299703370924956, - "loss": 0.9132, + "epoch": 0.20061412487205732, + "grad_norm": 0.216796875, + "learning_rate": 0.0002997395853512448, + "loss": 0.9303, "step": 196 }, { - "epoch": 0.21344717182497333, - "grad_norm": 0.2109375, - "learning_rate": 0.00029968484248888767, - "loss": 0.9243, + "epoch": 0.2047082906857728, + "grad_norm": 0.228515625, + "learning_rate": 0.00029972292807852233, + "loss": 0.9378, "step": 200 }, { - "epoch": 0.21771611526147278, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002996657533837324, - "loss": 0.8978, + "epoch": 0.20880245649948823, + "grad_norm": 0.2216796875, + "learning_rate": 0.00029970575494731543, + "loss": 0.9271, "step": 204 }, { - "epoch": 0.22198505869797225, - "grad_norm": 0.203125, - "learning_rate": 0.0002996461036809917, - "loss": 0.9, + "epoch": 0.2128966223132037, + "grad_norm": 0.2119140625, + "learning_rate": 0.00029968806601679283, + "loss": 0.9574, "step": 208 }, { - "epoch": 0.22625400213447172, - "grad_norm": 0.22265625, - "learning_rate": 0.0002996258934542667, - "loss": 0.9007, + "epoch": 0.21699078812691913, + "grad_norm": 0.2236328125, + "learning_rate": 0.00029966986134790025, + "loss": 0.9354, "step": 212 }, { - "epoch": 0.2305229455709712, - "grad_norm": 0.232421875, - "learning_rate": 0.00029960512277925816, - "loss": 0.9005, + "epoch": 0.2210849539406346, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002996511410033605, + "loss": 0.9185, "step": 216 }, { - "epoch": 0.23479188900747064, - "grad_norm": 0.216796875, - "learning_rate": 0.0002995837917337659, - "loss": 0.8934, + "epoch": 0.22517911975435004, + "grad_norm": 0.2314453125, + "learning_rate": 0.000299631905047673, + "loss": 0.9327, "step": 220 }, { - "epoch": 0.2390608324439701, - "grad_norm": 0.251953125, - "learning_rate": 0.00029956190039768897, - "loss": 0.9122, + "epoch": 0.2292732855680655, + "grad_norm": 0.2138671875, + "learning_rate": 0.00029961215354711376, + "loss": 0.9141, "step": 224 }, { - "epoch": 0.24332977588046958, - "grad_norm": 0.236328125, - "learning_rate": 0.000299539448853025, - "loss": 0.8902, + "epoch": 0.23336745138178097, + "grad_norm": 0.2421875, + "learning_rate": 0.000299591886569735, + "loss": 0.9369, "step": 228 }, { - "epoch": 0.24759871931696906, - "grad_norm": 0.224609375, - "learning_rate": 0.0002995164371838698, - "loss": 0.9001, + "epoch": 0.2374616171954964, + "grad_norm": 0.234375, + "learning_rate": 0.0002995711041853649, + "loss": 0.9163, "step": 232 }, { - "epoch": 0.2518676627534685, - "grad_norm": 0.236328125, - "learning_rate": 0.00029949286547641737, - "loss": 0.8653, + "epoch": 0.24155578300921188, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029954980646560765, + "loss": 0.8721, "step": 236 }, { - "epoch": 0.256136606189968, - "grad_norm": 0.2431640625, - "learning_rate": 0.00029946873381895943, - "loss": 0.9168, + "epoch": 0.24564994882292732, + "grad_norm": 0.224609375, + "learning_rate": 0.0002995279934838427, + "loss": 0.8767, "step": 240 }, { - "epoch": 0.26040554962646745, - "grad_norm": 0.220703125, - "learning_rate": 0.00029944404230188503, - "loss": 0.9061, + "epoch": 0.24974411463664278, + "grad_norm": 0.240234375, + "learning_rate": 0.00029950566531522515, + "loss": 0.8851, "step": 244 }, { - "epoch": 0.2646744930629669, - "grad_norm": 0.2333984375, - "learning_rate": 0.00029941879101768037, - "loss": 0.896, + "epoch": 0.2538382804503582, + "grad_norm": 0.2197265625, + "learning_rate": 0.00029948282203668477, + "loss": 0.9255, "step": 248 }, { - "epoch": 0.2689434364994664, - "grad_norm": 0.23046875, - "learning_rate": 0.0002993929800609282, - "loss": 0.9208, + "epoch": 0.2579324462640737, + "grad_norm": 0.265625, + "learning_rate": 0.00029945946372692635, + "loss": 0.8865, "step": 252 }, { - "epoch": 0.27321237993596587, - "grad_norm": 0.23828125, - "learning_rate": 0.00029936660952830773, - "loss": 0.8437, + "epoch": 0.26202661207778916, + "grad_norm": 0.2216796875, + "learning_rate": 0.00029943559046642903, + "loss": 0.8514, "step": 256 }, { - "epoch": 0.27748132337246534, - "grad_norm": 0.2314453125, - "learning_rate": 0.00029933967951859404, - "loss": 0.8265, + "epoch": 0.2661207778915046, + "grad_norm": 0.234375, + "learning_rate": 0.00029941120233744625, + "loss": 0.9019, "step": 260 }, { - "epoch": 0.28175026680896476, - "grad_norm": 0.216796875, - "learning_rate": 0.00029931219013265786, - "loss": 0.8543, + "epoch": 0.27021494370522003, + "grad_norm": 0.251953125, + "learning_rate": 0.00029938629942400546, + "loss": 0.9104, "step": 264 }, { - "epoch": 0.28601921024546423, - "grad_norm": 0.2109375, - "learning_rate": 0.00029928414147346535, - "loss": 0.8803, + "epoch": 0.2743091095189355, + "grad_norm": 0.265625, + "learning_rate": 0.00029936088181190754, + "loss": 0.8915, "step": 268 }, { - "epoch": 0.2902881536819637, - "grad_norm": 0.2421875, - "learning_rate": 0.0002992555336460772, - "loss": 0.8672, + "epoch": 0.27840327533265097, + "grad_norm": 0.224609375, + "learning_rate": 0.000299334949588727, + "loss": 0.889, "step": 272 }, { - "epoch": 0.2945570971184632, - "grad_norm": 0.208984375, - "learning_rate": 0.0002992263667576488, - "loss": 0.862, + "epoch": 0.28249744114636643, + "grad_norm": 0.2275390625, + "learning_rate": 0.00029930850284381116, + "loss": 0.9146, "step": 276 }, { - "epoch": 0.29882604055496265, - "grad_norm": 0.248046875, - "learning_rate": 0.0002991966409174295, - "loss": 0.85, + "epoch": 0.2865916069600819, + "grad_norm": 0.23046875, + "learning_rate": 0.00029928154166828025, + "loss": 0.8627, "step": 280 }, { - "epoch": 0.3030949839914621, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002991663562367622, - "loss": 0.8714, + "epoch": 0.2906857727737973, + "grad_norm": 0.25, + "learning_rate": 0.0002992540661550268, + "loss": 0.8735, "step": 284 }, { - "epoch": 0.3073639274279616, - "grad_norm": 0.234375, - "learning_rate": 0.00029913551282908325, - "loss": 0.8543, + "epoch": 0.2947799385875128, + "grad_norm": 0.23046875, + "learning_rate": 0.00029922607639871557, + "loss": 0.8819, "step": 288 }, { - "epoch": 0.31163287086446106, - "grad_norm": 0.2158203125, - "learning_rate": 0.00029910411080992164, - "loss": 0.8684, + "epoch": 0.29887410440122825, + "grad_norm": 0.244140625, + "learning_rate": 0.000299197572495783, + "loss": 0.8979, "step": 292 }, { - "epoch": 0.31590181430096054, - "grad_norm": 0.259765625, - "learning_rate": 0.00029907215029689873, - "loss": 0.8473, + "epoch": 0.3029682702149437, + "grad_norm": 0.244140625, + "learning_rate": 0.00029916855454443706, + "loss": 0.9047, "step": 296 }, { - "epoch": 0.32017075773745995, - "grad_norm": 0.26953125, - "learning_rate": 0.00029903963140972796, - "loss": 0.8555, + "epoch": 0.3070624360286592, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002991390226446568, + "loss": 0.8861, "step": 300 }, { - "epoch": 0.3244397011739594, - "grad_norm": 0.2197265625, - "learning_rate": 0.0002990065542702141, - "loss": 0.9187, + "epoch": 0.3111566018423746, + "grad_norm": 0.2353515625, + "learning_rate": 0.000299108976898192, + "loss": 0.8957, "step": 304 }, { - "epoch": 0.3287086446104589, - "grad_norm": 0.26953125, - "learning_rate": 0.00029897291900225293, - "loss": 0.8884, + "epoch": 0.31525076765609006, + "grad_norm": 0.26171875, + "learning_rate": 0.000299078417408563, + "loss": 0.8565, "step": 308 }, { - "epoch": 0.3329775880469584, - "grad_norm": 0.271484375, - "learning_rate": 0.0002989387257318309, - "loss": 0.8533, + "epoch": 0.3193449334698055, + "grad_norm": 0.244140625, + "learning_rate": 0.00029904734428105997, + "loss": 0.8656, "step": 312 }, { - "epoch": 0.33724653148345785, - "grad_norm": 0.236328125, - "learning_rate": 0.0002989039745870245, - "loss": 0.8292, + "epoch": 0.323439099283521, + "grad_norm": 0.263671875, + "learning_rate": 0.000299015757622743, + "loss": 0.8339, "step": 316 }, { - "epoch": 0.3415154749199573, - "grad_norm": 0.259765625, - "learning_rate": 0.0002988686656979999, - "loss": 0.8913, + "epoch": 0.32753326509723646, + "grad_norm": 0.232421875, + "learning_rate": 0.00029898365754244135, + "loss": 0.813, "step": 320 }, { - "epoch": 0.3457844183564568, - "grad_norm": 0.2333984375, - "learning_rate": 0.00029883279919701224, - "loss": 0.8792, + "epoch": 0.33162743091095187, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029895104415075336, + "loss": 0.8969, "step": 324 }, { - "epoch": 0.35005336179295626, + "epoch": 0.33572159672466734, "grad_norm": 0.248046875, - "learning_rate": 0.0002987963752184054, - "loss": 0.853, + "learning_rate": 0.0002989179175600459, + "loss": 0.7858, "step": 328 }, { - "epoch": 0.35432230522945574, - "grad_norm": 0.2265625, - "learning_rate": 0.0002987593938986113, - "loss": 0.8576, + "epoch": 0.3398157625383828, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002988842778844539, + "loss": 0.8451, "step": 332 }, { - "epoch": 0.35859124866595515, - "grad_norm": 0.2421875, - "learning_rate": 0.00029872185537614963, - "loss": 0.8549, + "epoch": 0.34390992835209827, + "grad_norm": 0.244140625, + "learning_rate": 0.00029885012523988034, + "loss": 0.8592, "step": 336 }, { - "epoch": 0.3628601921024546, - "grad_norm": 0.21484375, - "learning_rate": 0.000298683759791627, - "loss": 0.8209, + "epoch": 0.34800409416581374, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002988154597439954, + "loss": 0.8831, "step": 340 }, { - "epoch": 0.3671291355389541, - "grad_norm": 0.27734375, - "learning_rate": 0.00029864510728773674, - "loss": 0.8402, + "epoch": 0.35209825997952915, + "grad_norm": 0.216796875, + "learning_rate": 0.0002987802815162363, + "loss": 0.8825, "step": 344 }, { - "epoch": 0.37139807897545357, - "grad_norm": 0.21484375, - "learning_rate": 0.00029860589800925796, - "loss": 0.8256, + "epoch": 0.3561924257932446, + "grad_norm": 0.251953125, + "learning_rate": 0.0002987445906778068, + "loss": 0.817, "step": 348 }, { - "epoch": 0.37566702241195304, - "grad_norm": 0.255859375, - "learning_rate": 0.00029856613210305565, - "loss": 0.8581, + "epoch": 0.3602865916069601, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029870838735167684, + "loss": 0.8549, "step": 352 }, { - "epoch": 0.3799359658484525, - "grad_norm": 0.2734375, - "learning_rate": 0.0002985258097180794, - "loss": 0.8795, + "epoch": 0.36438075742067555, + "grad_norm": 0.212890625, + "learning_rate": 0.0002986716716625822, + "loss": 0.8458, "step": 356 }, { - "epoch": 0.384204909284952, - "grad_norm": 0.23046875, - "learning_rate": 0.00029848493100536325, - "loss": 0.8492, + "epoch": 0.368474923234391, + "grad_norm": 0.26171875, + "learning_rate": 0.0002986344437370238, + "loss": 0.8335, "step": 360 }, { - "epoch": 0.38847385272145146, - "grad_norm": 0.279296875, - "learning_rate": 0.00029844349611802526, - "loss": 0.8277, + "epoch": 0.3725690890481064, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029859670370326757, + "loss": 0.8258, "step": 364 }, { - "epoch": 0.3927427961579509, - "grad_norm": 0.259765625, - "learning_rate": 0.0002984015052112665, - "loss": 0.7891, + "epoch": 0.3766632548618219, + "grad_norm": 0.228515625, + "learning_rate": 0.0002985584516913437, + "loss": 0.8816, "step": 368 }, { - "epoch": 0.39701173959445035, - "grad_norm": 0.265625, - "learning_rate": 0.0002983589584423708, - "loss": 0.8472, + "epoch": 0.38075742067553736, + "grad_norm": 0.2265625, + "learning_rate": 0.0002985196878330466, + "loss": 0.8361, "step": 372 }, { - "epoch": 0.4012806830309498, - "grad_norm": 0.255859375, - "learning_rate": 0.00029831585597070404, - "loss": 0.7938, + "epoch": 0.38485158648925283, + "grad_norm": 0.23828125, + "learning_rate": 0.000298480412261934, + "loss": 0.8259, "step": 376 }, { - "epoch": 0.4055496264674493, - "grad_norm": 0.265625, - "learning_rate": 0.0002982721979577136, - "loss": 0.8323, + "epoch": 0.3889457523029683, + "grad_norm": 0.25390625, + "learning_rate": 0.0002984406251133268, + "loss": 0.8277, "step": 380 }, { - "epoch": 0.40981856990394877, - "grad_norm": 0.283203125, - "learning_rate": 0.00029822798456692774, - "loss": 0.8585, + "epoch": 0.3930399181166837, + "grad_norm": 0.248046875, + "learning_rate": 0.0002984003265243084, + "loss": 0.861, "step": 384 }, { - "epoch": 0.41408751334044824, - "grad_norm": 0.2216796875, - "learning_rate": 0.0002981832159639548, - "loss": 0.7854, + "epoch": 0.3971340839303992, + "grad_norm": 0.240234375, + "learning_rate": 0.00029835951663372446, + "loss": 0.8145, "step": 388 }, { - "epoch": 0.4183564567769477, - "grad_norm": 0.2265625, - "learning_rate": 0.0002981378923164832, - "loss": 0.8226, + "epoch": 0.40122824974411464, + "grad_norm": 0.25390625, + "learning_rate": 0.0002983181955821822, + "loss": 0.8501, "step": 392 }, { - "epoch": 0.4226254002134472, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029809201379427986, - "loss": 0.7998, + "epoch": 0.4053224155578301, + "grad_norm": 0.25390625, + "learning_rate": 0.00029827636351205004, + "loss": 0.8362, "step": 396 }, { - "epoch": 0.42689434364994666, - "grad_norm": 0.232421875, - "learning_rate": 0.00029804558056919036, - "loss": 0.8507, + "epoch": 0.4094165813715456, + "grad_norm": 0.25, + "learning_rate": 0.00029823402056745706, + "loss": 0.7834, "step": 400 }, { - "epoch": 0.4311632870864461, - "grad_norm": 0.23046875, - "learning_rate": 0.0002979985928151379, - "loss": 0.8109, + "epoch": 0.413510747185261, + "grad_norm": 0.236328125, + "learning_rate": 0.0002981911668942925, + "loss": 0.8486, "step": 404 }, { - "epoch": 0.43543223052294555, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002979510507081229, - "loss": 0.797, + "epoch": 0.41760491299897645, + "grad_norm": 0.26171875, + "learning_rate": 0.00029814780264020535, + "loss": 0.8006, "step": 408 }, { - "epoch": 0.439701173959445, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029790295442622204, - "loss": 0.7914, + "epoch": 0.4216990788126919, + "grad_norm": 0.263671875, + "learning_rate": 0.00029810392795460365, + "loss": 0.8553, "step": 412 }, { - "epoch": 0.4439701173959445, - "grad_norm": 0.240234375, - "learning_rate": 0.00029785430414958785, - "loss": 0.8193, + "epoch": 0.4257932446264074, + "grad_norm": 0.2119140625, + "learning_rate": 0.00029805954298865413, + "loss": 0.8271, "step": 416 }, { - "epoch": 0.44823906083244397, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029780510006044794, - "loss": 0.8435, + "epoch": 0.42988741044012285, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002980146478952818, + "loss": 0.8035, "step": 420 }, { - "epoch": 0.45250800426894344, - "grad_norm": 0.2490234375, - "learning_rate": 0.00029775534234310414, - "loss": 0.7973, + "epoch": 0.43398157625383826, + "grad_norm": 0.263671875, + "learning_rate": 0.000297969242829169, + "loss": 0.7901, "step": 424 }, { - "epoch": 0.4567769477054429, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002977050311839322, - "loss": 0.8333, + "epoch": 0.43807574206755373, + "grad_norm": 0.267578125, + "learning_rate": 0.0002979233279467554, + "loss": 0.7974, "step": 428 }, { - "epoch": 0.4610458911419424, - "grad_norm": 0.275390625, - "learning_rate": 0.00029765416677138095, - "loss": 0.7915, + "epoch": 0.4421699078812692, + "grad_norm": 0.2421875, + "learning_rate": 0.000297876903406237, + "loss": 0.8294, "step": 432 }, { - "epoch": 0.46531483457844186, - "grad_norm": 0.267578125, - "learning_rate": 0.0002976027492959712, - "loss": 0.8047, + "epoch": 0.44626407369498466, + "grad_norm": 0.22265625, + "learning_rate": 0.000297829969367566, + "loss": 0.7868, "step": 436 }, { - "epoch": 0.4695837780149413, - "grad_norm": 0.232421875, - "learning_rate": 0.0002975507789502956, - "loss": 0.8397, + "epoch": 0.4503582395087001, + "grad_norm": 0.26171875, + "learning_rate": 0.0002977825259924497, + "loss": 0.842, "step": 440 }, { - "epoch": 0.47385272145144075, - "grad_norm": 0.259765625, - "learning_rate": 0.00029749825592901755, - "loss": 0.8664, + "epoch": 0.45445240532241554, + "grad_norm": 0.25390625, + "learning_rate": 0.00029773457344435067, + "loss": 0.8359, "step": 444 }, { - "epoch": 0.4781216648879402, - "grad_norm": 0.259765625, - "learning_rate": 0.0002974451804288706, - "loss": 0.8272, + "epoch": 0.458546571136131, + "grad_norm": 0.2421875, + "learning_rate": 0.0002976861118884856, + "loss": 0.8127, "step": 448 }, { - "epoch": 0.4823906083244397, + "epoch": 0.4626407369498465, "grad_norm": 0.2392578125, - "learning_rate": 0.00029739155264865783, - "loss": 0.8314, + "learning_rate": 0.00029763714149182483, + "loss": 0.8251, "step": 452 }, { - "epoch": 0.48665955176093917, - "grad_norm": 0.2578125, - "learning_rate": 0.0002973373727892508, - "loss": 0.8114, + "epoch": 0.46673490276356194, + "grad_norm": 0.28515625, + "learning_rate": 0.0002975876624230921, + "loss": 0.8145, "step": 456 }, { - "epoch": 0.49092849519743864, - "grad_norm": 0.236328125, - "learning_rate": 0.000297282641053589, - "loss": 0.8129, + "epoch": 0.47082906857727735, + "grad_norm": 0.26171875, + "learning_rate": 0.0002975376748527636, + "loss": 0.814, "step": 460 }, { - "epoch": 0.4951974386339381, - "grad_norm": 0.271484375, - "learning_rate": 0.0002972273576466792, - "loss": 0.8096, + "epoch": 0.4749232343909928, + "grad_norm": 0.255859375, + "learning_rate": 0.00029748717895306746, + "loss": 0.8419, "step": 464 }, { - "epoch": 0.4994663820704376, - "grad_norm": 0.25390625, - "learning_rate": 0.00029717152277559445, - "loss": 0.7984, + "epoch": 0.4790174002047083, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002974361748979834, + "loss": 0.814, "step": 468 }, { - "epoch": 0.503735325506937, - "grad_norm": 0.26953125, - "learning_rate": 0.00029711513664947334, - "loss": 0.816, + "epoch": 0.48311156601842375, + "grad_norm": 0.251953125, + "learning_rate": 0.00029738466286324176, + "loss": 0.8097, "step": 472 }, { - "epoch": 0.5080042689434365, - "grad_norm": 0.23046875, - "learning_rate": 0.0002970581994795194, - "loss": 0.8384, + "epoch": 0.4872057318321392, + "grad_norm": 0.25390625, + "learning_rate": 0.00029733264302632325, + "loss": 0.7909, "step": 476 }, { - "epoch": 0.512273212379936, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002970007114790001, - "loss": 0.7474, + "epoch": 0.49129989764585463, + "grad_norm": 0.265625, + "learning_rate": 0.0002972801155664581, + "loss": 0.8078, "step": 480 }, { - "epoch": 0.5165421558164355, - "grad_norm": 0.26171875, - "learning_rate": 0.00029694267286324604, - "loss": 0.8275, + "epoch": 0.4953940634595701, + "grad_norm": 0.228515625, + "learning_rate": 0.00029722708066462543, + "loss": 0.8108, "step": 484 }, { - "epoch": 0.5208110992529349, - "grad_norm": 0.2451171875, - "learning_rate": 0.00029688408384965056, - "loss": 0.8404, + "epoch": 0.49948822927328557, + "grad_norm": 0.271484375, + "learning_rate": 0.00029717353850355286, + "loss": 0.852, "step": 488 }, { - "epoch": 0.5250800426894343, - "grad_norm": 0.23046875, - "learning_rate": 0.0002968249446576683, - "loss": 0.8155, + "epoch": 0.503582395087001, + "grad_norm": 0.2578125, + "learning_rate": 0.0002971194892677157, + "loss": 0.7383, "step": 492 }, { - "epoch": 0.5293489861259338, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002967652555088148, - "loss": 0.7902, + "epoch": 0.5076765609007164, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002970649331433362, + "loss": 0.7806, "step": 496 }, { - "epoch": 0.5336179295624333, - "grad_norm": 0.2421875, - "learning_rate": 0.00029670501662666546, - "loss": 0.8009, + "epoch": 0.5117707267144319, + "grad_norm": 0.267578125, + "learning_rate": 0.0002970098703183832, + "loss": 0.7578, "step": 500 }, { - "epoch": 0.5378868729989328, - "grad_norm": 0.220703125, - "learning_rate": 0.0002966442282368551, - "loss": 0.8057, + "epoch": 0.5158648925281474, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002969543009825713, + "loss": 0.8208, "step": 504 }, { - "epoch": 0.5421558164354322, - "grad_norm": 0.23828125, - "learning_rate": 0.0002965828905670763, - "loss": 0.8033, + "epoch": 0.5199590583418628, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002968982253273603, + "loss": 0.8233, "step": 508 }, { - "epoch": 0.5464247598719317, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029652100384707956, - "loss": 0.7977, + "epoch": 0.5240532241555783, + "grad_norm": 0.224609375, + "learning_rate": 0.0002968416435459544, + "loss": 0.8092, "step": 512 }, { - "epoch": 0.5506937033084311, - "grad_norm": 0.2490234375, - "learning_rate": 0.00029645856830867165, - "loss": 0.7863, + "epoch": 0.5281473899692938, + "grad_norm": 0.2578125, + "learning_rate": 0.00029678455583330156, + "loss": 0.8246, "step": 516 }, { - "epoch": 0.5549626467449307, - "grad_norm": 0.240234375, - "learning_rate": 0.00029639558418571507, - "loss": 0.7807, + "epoch": 0.5322415557830092, + "grad_norm": 0.25390625, + "learning_rate": 0.0002967269623860931, + "loss": 0.7451, "step": 520 }, { - "epoch": 0.5592315901814301, - "grad_norm": 0.244140625, - "learning_rate": 0.00029633205171412724, - "loss": 0.7969, + "epoch": 0.5363357215967247, + "grad_norm": 0.248046875, + "learning_rate": 0.00029666886340276263, + "loss": 0.759, "step": 524 }, { - "epoch": 0.5635005336179295, - "grad_norm": 0.2392578125, - "learning_rate": 0.00029626797113187935, - "loss": 0.8441, + "epoch": 0.5404298874104401, + "grad_norm": 0.23046875, + "learning_rate": 0.00029661025908348556, + "loss": 0.8068, "step": 528 }, { - "epoch": 0.567769477054429, - "grad_norm": 0.255859375, - "learning_rate": 0.00029620334267899584, - "loss": 0.7523, + "epoch": 0.5445240532241555, + "grad_norm": 0.26171875, + "learning_rate": 0.0002965511496301784, + "loss": 0.7771, "step": 532 }, { - "epoch": 0.5720384204909285, - "grad_norm": 0.248046875, - "learning_rate": 0.000296138166597553, - "loss": 0.7924, + "epoch": 0.548618219037871, + "grad_norm": 0.24609375, + "learning_rate": 0.0002964915352464982, + "loss": 0.8213, "step": 536 }, { - "epoch": 0.576307363927428, - "grad_norm": 0.26171875, - "learning_rate": 0.00029607244313167863, - "loss": 0.7733, + "epoch": 0.5527123848515865, + "grad_norm": 0.263671875, + "learning_rate": 0.0002964314161378415, + "loss": 0.8283, "step": 540 }, { - "epoch": 0.5805763073639274, - "grad_norm": 0.26171875, - "learning_rate": 0.0002960061725275507, + "epoch": 0.5568065506653019, + "grad_norm": 0.271484375, + "learning_rate": 0.000296370792511344, "loss": 0.7587, "step": 544 }, { - "epoch": 0.5848452508004269, - "grad_norm": 0.25390625, - "learning_rate": 0.00029593935503339656, - "loss": 0.807, + "epoch": 0.5609007164790174, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002963096645758795, + "loss": 0.7708, "step": 548 }, { - "epoch": 0.5891141942369263, - "grad_norm": 0.267578125, - "learning_rate": 0.00029587199089949214, - "loss": 0.8088, + "epoch": 0.5649948822927329, + "grad_norm": 0.2578125, + "learning_rate": 0.00029624803254205953, + "loss": 0.8349, "step": 552 }, { - "epoch": 0.5933831376734259, - "grad_norm": 0.240234375, - "learning_rate": 0.0002958040803781609, - "loss": 0.74, + "epoch": 0.5690890481064483, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002961858966222324, + "loss": 0.7805, "step": 556 }, { - "epoch": 0.5976520811099253, - "grad_norm": 0.25, - "learning_rate": 0.0002957356237237726, - "loss": 0.7626, + "epoch": 0.5731832139201638, + "grad_norm": 0.244140625, + "learning_rate": 0.0002961232570304824, + "loss": 0.757, "step": 560 }, { - "epoch": 0.6019210245464247, - "grad_norm": 0.2578125, - "learning_rate": 0.00029566662119274306, - "loss": 0.7676, + "epoch": 0.5772773797338793, + "grad_norm": 0.251953125, + "learning_rate": 0.0002960601139826294, + "loss": 0.7807, "step": 564 }, { - "epoch": 0.6061899679829242, - "grad_norm": 0.26171875, - "learning_rate": 0.00029559707304353247, - "loss": 0.8182, + "epoch": 0.5813715455475946, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029599646769622775, + "loss": 0.7528, "step": 568 }, { - "epoch": 0.6104589114194237, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002955269795366448, - "loss": 0.762, + "epoch": 0.5854657113613101, + "grad_norm": 0.236328125, + "learning_rate": 0.00029593231839056554, + "loss": 0.817, "step": 572 }, { - "epoch": 0.6147278548559232, - "grad_norm": 0.26171875, - "learning_rate": 0.00029545634093462676, - "loss": 0.8269, + "epoch": 0.5895598771750256, + "grad_norm": 0.26953125, + "learning_rate": 0.0002958676662866643, + "loss": 0.8212, "step": 576 }, { - "epoch": 0.6189967982924226, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002953851575020669, - "loss": 0.7654, + "epoch": 0.593654042988741, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029580251160727766, + "loss": 0.7561, "step": 580 }, { - "epoch": 0.6232657417289221, - "grad_norm": 0.244140625, - "learning_rate": 0.0002953134295055943, - "loss": 0.7842, + "epoch": 0.5977482088024565, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029573685457689086, + "loss": 0.8184, "step": 584 }, { - "epoch": 0.6275346851654215, - "grad_norm": 0.255859375, - "learning_rate": 0.0002952411572138779, - "loss": 0.8006, + "epoch": 0.601842374616172, + "grad_norm": 0.236328125, + "learning_rate": 0.00029567069542172004, + "loss": 0.7591, "step": 588 }, { - "epoch": 0.6318036286019211, - "grad_norm": 0.259765625, - "learning_rate": 0.0002951683408976255, - "loss": 0.8125, + "epoch": 0.6059365404298874, + "grad_norm": 0.255859375, + "learning_rate": 0.0002956040343697114, + "loss": 0.7598, "step": 592 }, { - "epoch": 0.6360725720384205, - "grad_norm": 0.2275390625, - "learning_rate": 0.00029509498082958246, - "loss": 0.7812, + "epoch": 0.6100307062436029, + "grad_norm": 0.2578125, + "learning_rate": 0.0002955368716505401, + "loss": 0.7583, "step": 596 }, { - "epoch": 0.6403415154749199, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029502107728453095, - "loss": 0.7902, + "epoch": 0.6141248720573184, + "grad_norm": 0.248046875, + "learning_rate": 0.0002954692074956102, + "loss": 0.7787, "step": 600 }, { - "epoch": 0.6446104589114194, - "grad_norm": 0.23828125, - "learning_rate": 0.0002949466305392889, - "loss": 0.8181, + "epoch": 0.6182190378710338, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029540104213805307, + "loss": 0.8317, "step": 604 }, { - "epoch": 0.6488794023479189, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002948716408727086, - "loss": 0.7977, + "epoch": 0.6223132036847492, + "grad_norm": 0.25, + "learning_rate": 0.00029533237581272706, + "loss": 0.7833, "step": 608 }, { - "epoch": 0.6531483457844184, - "grad_norm": 0.267578125, - "learning_rate": 0.00029479610856567625, - "loss": 0.8335, + "epoch": 0.6264073694984647, + "grad_norm": 0.248046875, + "learning_rate": 0.00029526320875621656, + "loss": 0.7263, "step": 612 }, { - "epoch": 0.6574172892209178, - "grad_norm": 0.23828125, - "learning_rate": 0.0002947200339011105, - "loss": 0.7471, + "epoch": 0.6305015353121801, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029519354120683116, + "loss": 0.8114, "step": 616 }, { - "epoch": 0.6616862326574173, - "grad_norm": 0.251953125, - "learning_rate": 0.0002946434171639613, - "loss": 0.7966, + "epoch": 0.6345957011258956, + "grad_norm": 0.2578125, + "learning_rate": 0.0002951233734046049, + "loss": 0.785, "step": 620 }, { - "epoch": 0.6659551760939167, - "grad_norm": 0.265625, - "learning_rate": 0.0002945662586412093, - "loss": 0.7977, + "epoch": 0.638689866939611, + "grad_norm": 0.23828125, + "learning_rate": 0.0002950527055912955, + "loss": 0.8104, "step": 624 }, { - "epoch": 0.6702241195304163, - "grad_norm": 0.240234375, - "learning_rate": 0.00029448855862186445, - "loss": 0.7886, + "epoch": 0.6427840327533265, + "grad_norm": 0.234375, + "learning_rate": 0.00029498153801038303, + "loss": 0.7885, "step": 628 }, { - "epoch": 0.6744930629669157, - "grad_norm": 0.240234375, - "learning_rate": 0.00029441031739696477, - "loss": 0.8127, + "epoch": 0.646878198567042, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029490987090707, + "loss": 0.7883, "step": 632 }, { - "epoch": 0.6787620064034151, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002943315352595756, - "loss": 0.7775, + "epoch": 0.6509723643807575, + "grad_norm": 0.291015625, + "learning_rate": 0.0002948377045282796, + "loss": 0.8154, "step": 636 }, { - "epoch": 0.6830309498399146, + "epoch": 0.6550665301944729, "grad_norm": 0.25, - "learning_rate": 0.0002942522125047884, - "loss": 0.7769, + "learning_rate": 0.0002947650391226555, + "loss": 0.7979, "step": 640 }, { - "epoch": 0.687299893276414, - "grad_norm": 0.2177734375, - "learning_rate": 0.00029417234942971947, - "loss": 0.7934, + "epoch": 0.6591606960081884, + "grad_norm": 0.236328125, + "learning_rate": 0.00029469187494056046, + "loss": 0.7896, "step": 644 }, { - "epoch": 0.6915688367129136, - "grad_norm": 0.2470703125, - "learning_rate": 0.0002940919463335091, - "loss": 0.7217, + "epoch": 0.6632548618219037, + "grad_norm": 0.255859375, + "learning_rate": 0.0002946182122340759, + "loss": 0.7295, "step": 648 }, { - "epoch": 0.695837780149413, - "grad_norm": 0.232421875, - "learning_rate": 0.0002940110035173201, - "loss": 0.7463, + "epoch": 0.6673490276356192, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002945440512570009, + "loss": 0.7889, "step": 652 }, { - "epoch": 0.7001067235859125, - "grad_norm": 0.248046875, - "learning_rate": 0.00029392952128433727, - "loss": 0.7861, + "epoch": 0.6714431934493347, + "grad_norm": 0.24609375, + "learning_rate": 0.00029446939226485125, + "loss": 0.7931, "step": 656 }, { - "epoch": 0.7043756670224119, - "grad_norm": 0.244140625, - "learning_rate": 0.0002938474999397655, - "loss": 0.7669, + "epoch": 0.6755373592630501, + "grad_norm": 0.2578125, + "learning_rate": 0.00029439423551485844, + "loss": 0.7944, "step": 660 }, { - "epoch": 0.7086446104589115, - "grad_norm": 0.251953125, - "learning_rate": 0.00029376493979082916, - "loss": 0.7686, + "epoch": 0.6796315250767656, + "grad_norm": 0.24609375, + "learning_rate": 0.0002943185812659693, + "loss": 0.8238, "step": 664 }, { - "epoch": 0.7129135538954109, - "grad_norm": 0.236328125, - "learning_rate": 0.0002936818411467709, - "loss": 0.768, + "epoch": 0.6837256908904811, + "grad_norm": 0.263671875, + "learning_rate": 0.00029424242977884436, + "loss": 0.827, "step": 668 }, { - "epoch": 0.7171824973319103, - "grad_norm": 0.244140625, - "learning_rate": 0.00029359820431885025, - "loss": 0.7736, + "epoch": 0.6878198567041965, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029416578131585765, + "loss": 0.7974, "step": 672 }, { - "epoch": 0.7214514407684098, - "grad_norm": 0.234375, - "learning_rate": 0.0002935140296203426, - "loss": 0.8166, + "epoch": 0.691914022517912, + "grad_norm": 0.26171875, + "learning_rate": 0.00029408863614109533, + "loss": 0.8207, "step": 676 }, { - "epoch": 0.7257203842049093, - "grad_norm": 0.2333984375, - "learning_rate": 0.00029342931736653816, - "loss": 0.7935, + "epoch": 0.6960081883316275, + "grad_norm": 0.2490234375, + "learning_rate": 0.000294010994520355, + "loss": 0.7433, "step": 680 }, { - "epoch": 0.7299893276414088, - "grad_norm": 0.25, - "learning_rate": 0.0002933440678747404, - "loss": 0.7525, + "epoch": 0.7001023541453428, + "grad_norm": 0.220703125, + "learning_rate": 0.00029393285672114477, + "loss": 0.8231, "step": 684 }, { - "epoch": 0.7342582710779082, - "grad_norm": 0.2392578125, - "learning_rate": 0.00029325828146426543, - "loss": 0.7382, + "epoch": 0.7041965199590583, + "grad_norm": 0.23828125, + "learning_rate": 0.0002938542230126821, + "loss": 0.8487, "step": 688 }, { - "epoch": 0.7385272145144077, - "grad_norm": 0.267578125, - "learning_rate": 0.0002931719584564402, - "loss": 0.7566, + "epoch": 0.7082906857727738, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002937750936658933, + "loss": 0.7503, "step": 692 }, { - "epoch": 0.7427961579509071, - "grad_norm": 0.265625, - "learning_rate": 0.0002930850991746017, - "loss": 0.7944, + "epoch": 0.7123848515864892, + "grad_norm": 0.2890625, + "learning_rate": 0.00029369546895341225, + "loss": 0.8059, "step": 696 }, { - "epoch": 0.7470651013874067, - "grad_norm": 0.236328125, - "learning_rate": 0.00029299770394409553, - "loss": 0.8058, + "epoch": 0.7164790174002047, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002936153491495796, + "loss": 0.7972, "step": 700 }, { - "epoch": 0.7513340448239061, - "grad_norm": 0.251953125, - "learning_rate": 0.0002929097730922749, - "loss": 0.7256, + "epoch": 0.7205731832139202, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002935347345304417, + "loss": 0.7924, "step": 704 }, { - "epoch": 0.7556029882604055, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002928213069484992, - "loss": 0.8137, + "epoch": 0.7246673490276356, + "grad_norm": 0.255859375, + "learning_rate": 0.00029345362537374996, + "loss": 0.7478, "step": 708 }, { - "epoch": 0.759871931696905, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002927323058441328, - "loss": 0.8153, + "epoch": 0.7287615148413511, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002933720219589595, + "loss": 0.7237, "step": 712 }, { - "epoch": 0.7641408751334045, - "grad_norm": 0.2431640625, - "learning_rate": 0.000292642770112544, - "loss": 0.8005, + "epoch": 0.7328556806550666, + "grad_norm": 0.255859375, + "learning_rate": 0.00029328992456722835, + "loss": 0.7898, "step": 716 }, { - "epoch": 0.768409818569904, - "grad_norm": 0.287109375, - "learning_rate": 0.0002925527000891035, - "loss": 0.7505, + "epoch": 0.736949846468782, + "grad_norm": 0.2578125, + "learning_rate": 0.00029320733348141666, + "loss": 0.7344, "step": 720 }, { - "epoch": 0.7726787620064034, - "grad_norm": 0.2578125, - "learning_rate": 0.00029246209611118336, - "loss": 0.7822, + "epoch": 0.7410440122824974, + "grad_norm": 0.2265625, + "learning_rate": 0.00029312424898608546, + "loss": 0.8047, "step": 724 }, { - "epoch": 0.7769477054429029, - "grad_norm": 0.24609375, - "learning_rate": 0.00029237095851815555, - "loss": 0.8299, + "epoch": 0.7451381780962129, + "grad_norm": 0.251953125, + "learning_rate": 0.0002930406713674957, + "loss": 0.7926, "step": 728 }, { - "epoch": 0.7812166488794023, - "grad_norm": 0.259765625, - "learning_rate": 0.0002922792876513909, - "loss": 0.7657, + "epoch": 0.7492323439099283, + "grad_norm": 0.255859375, + "learning_rate": 0.00029295660091360764, + "loss": 0.7631, "step": 732 }, { - "epoch": 0.7854855923159018, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002921870838542576, - "loss": 0.7274, + "epoch": 0.7533265097236438, + "grad_norm": 0.234375, + "learning_rate": 0.00029287203791407917, + "loss": 0.7931, "step": 736 }, { - "epoch": 0.7897545357524013, - "grad_norm": 0.265625, - "learning_rate": 0.0002920943474721201, - "loss": 0.735, + "epoch": 0.7574206755373593, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029278698266026545, + "loss": 0.8366, "step": 740 }, { - "epoch": 0.7940234791889007, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002920010788523377, - "loss": 0.7685, + "epoch": 0.7615148413510747, + "grad_norm": 0.267578125, + "learning_rate": 0.0002927014354452177, + "loss": 0.7789, "step": 744 }, { - "epoch": 0.7982924226254002, - "grad_norm": 0.234375, - "learning_rate": 0.00029190727834426327, - "loss": 0.7693, + "epoch": 0.7656090071647902, + "grad_norm": 0.23828125, + "learning_rate": 0.000292615396563682, + "loss": 0.7381, "step": 748 }, { - "epoch": 0.8025613660618997, - "grad_norm": 0.2392578125, - "learning_rate": 0.000291812946299242, - "loss": 0.7253, + "epoch": 0.7697031729785057, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029252886631209846, + "loss": 0.7583, "step": 752 }, { - "epoch": 0.8068303094983992, - "grad_norm": 0.2197265625, - "learning_rate": 0.00029171808307061, - "loss": 0.7483, + "epoch": 0.7737973387922211, + "grad_norm": 0.265625, + "learning_rate": 0.0002924418449886003, + "loss": 0.7299, "step": 756 }, { - "epoch": 0.8110992529348986, - "grad_norm": 0.259765625, - "learning_rate": 0.00029162268901369306, - "loss": 0.7539, + "epoch": 0.7778915046059366, + "grad_norm": 0.255859375, + "learning_rate": 0.00029235433289301257, + "loss": 0.753, "step": 760 }, { - "epoch": 0.8153681963713981, - "grad_norm": 0.244140625, - "learning_rate": 0.0002915267644858052, - "loss": 0.7549, + "epoch": 0.781985670419652, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002922663303268512, + "loss": 0.7687, "step": 764 }, { - "epoch": 0.8196371398078975, - "grad_norm": 0.294921875, - "learning_rate": 0.0002914303098462474, - "loss": 0.8225, + "epoch": 0.7860798362333674, + "grad_norm": 0.25390625, + "learning_rate": 0.00029217783759332214, + "loss": 0.7792, "step": 768 }, { - "epoch": 0.823906083244397, - "grad_norm": 0.23828125, - "learning_rate": 0.00029133332545630645, - "loss": 0.7879, + "epoch": 0.7901740020470829, + "grad_norm": 0.263671875, + "learning_rate": 0.00029208885499732004, + "loss": 0.7712, "step": 772 }, { - "epoch": 0.8281750266808965, - "grad_norm": 0.248046875, - "learning_rate": 0.0002912358116792531, - "loss": 0.7922, + "epoch": 0.7942681678607983, + "grad_norm": 0.259765625, + "learning_rate": 0.0002919993828454274, + "loss": 0.7814, "step": 776 }, { - "epoch": 0.8324439701173959, - "grad_norm": 0.25, - "learning_rate": 0.00029113776888034125, - "loss": 0.7629, + "epoch": 0.7983623336745138, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002919094214459134, + "loss": 0.7868, "step": 780 }, { - "epoch": 0.8367129135538954, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002910391974268063, - "loss": 0.7856, + "epoch": 0.8024564994882293, + "grad_norm": 0.23046875, + "learning_rate": 0.000291818971108733, + "loss": 0.759, "step": 784 }, { - "epoch": 0.8409818569903948, - "grad_norm": 0.240234375, - "learning_rate": 0.00029094009768786366, - "loss": 0.7591, + "epoch": 0.8065506653019447, + "grad_norm": 0.25390625, + "learning_rate": 0.0002917280321455255, + "loss": 0.722, "step": 788 }, { - "epoch": 0.8452508004268944, - "grad_norm": 0.251953125, - "learning_rate": 0.00029084047003470766, - "loss": 0.7624, + "epoch": 0.8106448311156602, + "grad_norm": 0.2421875, + "learning_rate": 0.00029163660486961404, + "loss": 0.7269, "step": 792 }, { - "epoch": 0.8495197438633938, - "grad_norm": 0.236328125, - "learning_rate": 0.00029074031484051005, - "loss": 0.7285, + "epoch": 0.8147389969293757, + "grad_norm": 0.26953125, + "learning_rate": 0.0002915446895960041, + "loss": 0.7571, "step": 796 }, { - "epoch": 0.8537886872998933, - "grad_norm": 0.25, - "learning_rate": 0.00029063963248041844, - "loss": 0.7958, + "epoch": 0.8188331627430911, + "grad_norm": 0.26171875, + "learning_rate": 0.0002914522866413823, + "loss": 0.7498, "step": 800 }, { - "epoch": 0.8580576307363927, - "grad_norm": 0.2373046875, - "learning_rate": 0.00029053842333155516, - "loss": 0.8189, + "epoch": 0.8229273285568065, + "grad_norm": 0.298828125, + "learning_rate": 0.00029135939632411576, + "loss": 0.7437, "step": 804 }, { - "epoch": 0.8623265741728922, - "grad_norm": 0.248046875, - "learning_rate": 0.0002904366877730156, - "loss": 0.7676, + "epoch": 0.827021494370522, + "grad_norm": 0.265625, + "learning_rate": 0.00029126601896425084, + "loss": 0.7524, "step": 808 }, { - "epoch": 0.8665955176093917, - "grad_norm": 0.2353515625, - "learning_rate": 0.000290334426185867, - "loss": 0.7937, + "epoch": 0.8311156601842374, + "grad_norm": 0.255859375, + "learning_rate": 0.0002911721548835116, + "loss": 0.783, "step": 812 }, { - "epoch": 0.8708644610458911, - "grad_norm": 0.25, - "learning_rate": 0.00029023163895314685, - "loss": 0.7136, + "epoch": 0.8352098259979529, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002910778044052995, + "loss": 0.7625, "step": 816 }, { - "epoch": 0.8751334044823906, - "grad_norm": 0.236328125, - "learning_rate": 0.00029012832645986154, - "loss": 0.7553, + "epoch": 0.8393039918116684, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029098296785469153, + "loss": 0.8289, "step": 820 }, { - "epoch": 0.87940234791889, - "grad_norm": 0.25390625, - "learning_rate": 0.00029002448909298497, - "loss": 0.7844, + "epoch": 0.8433981576253838, + "grad_norm": 0.23828125, + "learning_rate": 0.00029088764555843953, + "loss": 0.7998, "step": 824 }, { - "epoch": 0.8836712913553896, - "grad_norm": 0.2275390625, - "learning_rate": 0.00028992012724145694, - "loss": 0.7752, + "epoch": 0.8474923234390993, + "grad_norm": 0.3046875, + "learning_rate": 0.0002907918378449689, + "loss": 0.7699, "step": 828 }, { - "epoch": 0.887940234791889, - "grad_norm": 0.232421875, - "learning_rate": 0.0002898152412961819, - "loss": 0.769, + "epoch": 0.8515864892528148, + "grad_norm": 0.24609375, + "learning_rate": 0.00029069554504437757, + "loss": 0.7366, "step": 832 }, { - "epoch": 0.8922091782283885, - "grad_norm": 0.255859375, - "learning_rate": 0.0002897098316500273, - "loss": 0.7243, + "epoch": 0.8556806550665302, + "grad_norm": 0.25390625, + "learning_rate": 0.0002905987674884347, + "loss": 0.7756, "step": 836 }, { - "epoch": 0.8964781216648879, - "grad_norm": 0.2392578125, - "learning_rate": 0.00028960389869782225, - "loss": 0.7912, + "epoch": 0.8597748208802457, + "grad_norm": 0.244140625, + "learning_rate": 0.00029050150551057977, + "loss": 0.8355, "step": 840 }, { - "epoch": 0.9007470651013874, - "grad_norm": 0.2314453125, - "learning_rate": 0.000289497442836356, - "loss": 0.7629, + "epoch": 0.8638689866939611, + "grad_norm": 0.232421875, + "learning_rate": 0.00029040375944592114, + "loss": 0.7178, "step": 844 }, { - "epoch": 0.9050160085378869, - "grad_norm": 0.2373046875, - "learning_rate": 0.00028939046446437635, - "loss": 0.8218, + "epoch": 0.8679631525076765, + "grad_norm": 0.26171875, + "learning_rate": 0.00029030552963123517, + "loss": 0.7798, "step": 848 }, { - "epoch": 0.9092849519743863, - "grad_norm": 0.27734375, - "learning_rate": 0.0002892829639825884, - "loss": 0.7341, + "epoch": 0.872057318321392, + "grad_norm": 0.240234375, + "learning_rate": 0.0002902068164049649, + "loss": 0.7485, "step": 852 }, { - "epoch": 0.9135538954108858, - "grad_norm": 0.271484375, - "learning_rate": 0.00028917494179365273, - "loss": 0.7561, + "epoch": 0.8761514841351075, + "grad_norm": 0.31640625, + "learning_rate": 0.0002901076201072189, + "loss": 0.7272, "step": 856 }, { - "epoch": 0.9178228388473852, - "grad_norm": 0.279296875, - "learning_rate": 0.00028906639830218414, - "loss": 0.7344, + "epoch": 0.8802456499488229, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029000794107977016, + "loss": 0.7575, "step": 860 }, { - "epoch": 0.9220917822838848, + "epoch": 0.8843398157625384, "grad_norm": 0.26171875, - "learning_rate": 0.00028895733391475, - "loss": 0.8057, + "learning_rate": 0.0002899077796660549, + "loss": 0.7391, "step": 864 }, { - "epoch": 0.9263607257203842, - "grad_norm": 0.279296875, - "learning_rate": 0.000288847749039869, - "loss": 0.6914, + "epoch": 0.8884339815762539, + "grad_norm": 0.234375, + "learning_rate": 0.0002898071362111713, + "loss": 0.7357, "step": 868 }, { - "epoch": 0.9306296691568837, - "grad_norm": 0.2314453125, - "learning_rate": 0.00028873764408800907, - "loss": 0.7737, + "epoch": 0.8925281473899693, + "grad_norm": 0.240234375, + "learning_rate": 0.00028970601106187844, + "loss": 0.7975, "step": 872 }, { - "epoch": 0.9348986125933831, - "grad_norm": 0.2578125, - "learning_rate": 0.0002886270194715864, - "loss": 0.7899, + "epoch": 0.8966223132036848, + "grad_norm": 0.263671875, + "learning_rate": 0.000289604404566595, + "loss": 0.7639, "step": 876 }, { - "epoch": 0.9391675560298826, - "grad_norm": 0.2431640625, - "learning_rate": 0.0002885158756049636, - "loss": 0.7628, + "epoch": 0.9007164790174002, + "grad_norm": 0.26953125, + "learning_rate": 0.00028950231707539813, + "loss": 0.7482, "step": 880 }, { - "epoch": 0.9434364994663821, - "grad_norm": 0.2451171875, - "learning_rate": 0.00028840421290444817, - "loss": 0.7288, + "epoch": 0.9048106448311156, + "grad_norm": 0.234375, + "learning_rate": 0.0002893997489400221, + "loss": 0.7782, "step": 884 }, { - "epoch": 0.9477054429028815, - "grad_norm": 0.251953125, - "learning_rate": 0.0002882920317882911, - "loss": 0.7506, + "epoch": 0.9089048106448311, + "grad_norm": 0.25, + "learning_rate": 0.00028929670051385745, + "loss": 0.7509, "step": 888 }, { - "epoch": 0.951974386339381, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002881793326766852, - "loss": 0.743, + "epoch": 0.9129989764585466, + "grad_norm": 0.251953125, + "learning_rate": 0.0002891931721519492, + "loss": 0.7274, "step": 892 }, { - "epoch": 0.9562433297758804, - "grad_norm": 0.232421875, - "learning_rate": 0.00028806611599176325, - "loss": 0.7591, + "epoch": 0.917093142272262, + "grad_norm": 0.240234375, + "learning_rate": 0.0002890891642109962, + "loss": 0.7512, "step": 896 }, { - "epoch": 0.96051227321238, - "grad_norm": 0.2451171875, - "learning_rate": 0.00028795238215759703, - "loss": 0.7597, + "epoch": 0.9211873080859775, + "grad_norm": 0.26953125, + "learning_rate": 0.0002889846770493496, + "loss": 0.7375, "step": 900 }, { - "epoch": 0.9647812166488794, - "grad_norm": 0.2490234375, - "learning_rate": 0.00028783813160019526, - "loss": 0.7295, + "epoch": 0.925281473899693, + "grad_norm": 0.25, + "learning_rate": 0.00028887971102701154, + "loss": 0.6872, "step": 904 }, { - "epoch": 0.9690501600853789, - "grad_norm": 0.2470703125, - "learning_rate": 0.00028772336474750193, - "loss": 0.7862, + "epoch": 0.9293756397134084, + "grad_norm": 0.26171875, + "learning_rate": 0.0002887742665056342, + "loss": 0.7695, "step": 908 }, { - "epoch": 0.9733191035218783, - "grad_norm": 0.2431640625, - "learning_rate": 0.0002876080820293953, - "loss": 0.7436, + "epoch": 0.9334698055271239, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002886683438485183, + "loss": 0.7726, "step": 912 }, { - "epoch": 0.9775880469583778, - "grad_norm": 0.2734375, - "learning_rate": 0.00028749228387768555, - "loss": 0.7865, + "epoch": 0.9375639713408394, + "grad_norm": 0.28515625, + "learning_rate": 0.000288561943420612, + "loss": 0.7426, "step": 916 }, { - "epoch": 0.9818569903948773, - "grad_norm": 0.248046875, - "learning_rate": 0.00028737597072611357, - "loss": 0.8468, + "epoch": 0.9416581371545547, + "grad_norm": 0.25390625, + "learning_rate": 0.0002884550655885095, + "loss": 0.7311, "step": 920 }, { - "epoch": 0.9861259338313767, - "grad_norm": 0.255859375, - "learning_rate": 0.0002872591430103495, - "loss": 0.7507, + "epoch": 0.9457523029682702, + "grad_norm": 0.259765625, + "learning_rate": 0.00028834771072044994, + "loss": 0.7497, "step": 924 }, { - "epoch": 0.9903948772678762, - "grad_norm": 0.25, - "learning_rate": 0.00028714180116799044, - "loss": 0.7358, + "epoch": 0.9498464687819856, + "grad_norm": 0.2421875, + "learning_rate": 0.00028823987918631596, + "loss": 0.7134, "step": 928 }, { - "epoch": 0.9946638207043756, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002870239456385596, - "loss": 0.808, + "epoch": 0.9539406345957011, + "grad_norm": 0.24609375, + "learning_rate": 0.0002881315713576326, + "loss": 0.7818, "step": 932 }, { - "epoch": 0.9989327641408752, - "grad_norm": 0.2314453125, - "learning_rate": 0.00028690557686350424, - "loss": 0.7362, + "epoch": 0.9580348004094166, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002880227876075659, + "loss": 0.7358, "step": 936 }, { - "epoch": 1.0032017075773747, - "grad_norm": 0.24609375, - "learning_rate": 0.00028678669528619383, - "loss": 0.6681, + "epoch": 0.962128966223132, + "grad_norm": 0.271484375, + "learning_rate": 0.00028791352831092164, + "loss": 0.7309, "step": 940 }, { - "epoch": 1.007470651013874, - "grad_norm": 0.244140625, - "learning_rate": 0.00028666730135191884, - "loss": 0.7536, + "epoch": 0.9662231320368475, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002878037938441441, + "loss": 0.7227, "step": 944 }, { - "epoch": 1.0117395944503735, - "grad_norm": 0.267578125, - "learning_rate": 0.00028654739550788887, - "loss": 0.7087, + "epoch": 0.970317297850563, + "grad_norm": 0.255859375, + "learning_rate": 0.0002876935845853146, + "loss": 0.7828, "step": 948 }, { - "epoch": 1.016008537886873, - "grad_norm": 0.25390625, - "learning_rate": 0.0002864269782032308, - "loss": 0.7074, + "epoch": 0.9744114636642784, + "grad_norm": 0.26953125, + "learning_rate": 0.0002875829009141505, + "loss": 0.7156, "step": 952 }, { - "epoch": 1.0202774813233724, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002863060498889874, - "loss": 0.7294, + "epoch": 0.9785056294779939, + "grad_norm": 0.255859375, + "learning_rate": 0.00028747174321200363, + "loss": 0.7602, "step": 956 }, { - "epoch": 1.024546424759872, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002861846110181155, - "loss": 0.6662, + "epoch": 0.9825997952917093, + "grad_norm": 0.248046875, + "learning_rate": 0.000287360111861859, + "loss": 0.7051, "step": 960 }, { - "epoch": 1.0288153681963714, - "grad_norm": 0.259765625, - "learning_rate": 0.0002860626620454842, - "loss": 0.6669, + "epoch": 0.9866939611054247, + "grad_norm": 0.267578125, + "learning_rate": 0.00028724800724833354, + "loss": 0.6861, "step": 964 }, { - "epoch": 1.033084311632871, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002859402034278735, - "loss": 0.6592, + "epoch": 0.9907881269191402, + "grad_norm": 0.251953125, + "learning_rate": 0.00028713542975767486, + "loss": 0.7947, "step": 968 }, { - "epoch": 1.0373532550693703, - "grad_norm": 0.2373046875, - "learning_rate": 0.000285817235623972, - "loss": 0.6988, + "epoch": 0.9948822927328557, + "grad_norm": 0.25, + "learning_rate": 0.0002870223797777598, + "loss": 0.7499, "step": 972 }, { - "epoch": 1.0416221985058698, - "grad_norm": 0.23046875, - "learning_rate": 0.00028569375909437585, - "loss": 0.7218, + "epoch": 0.9989764585465711, + "grad_norm": 0.25390625, + "learning_rate": 0.0002869088576980931, + "loss": 0.7674, "step": 976 }, { - "epoch": 1.0458911419423693, - "grad_norm": 0.267578125, - "learning_rate": 0.0002855697743015866, - "loss": 0.6944, + "epoch": 1.0030706243602865, + "grad_norm": 0.25, + "learning_rate": 0.0002867948639098061, + "loss": 0.6806, "step": 980 }, { - "epoch": 1.0501600853788686, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002854452817100096, - "loss": 0.6644, + "epoch": 1.007164790174002, + "grad_norm": 0.27734375, + "learning_rate": 0.00028668039880565526, + "loss": 0.7057, "step": 984 }, { - "epoch": 1.0544290288153682, - "grad_norm": 0.2333984375, - "learning_rate": 0.0002853202817859522, - "loss": 0.6779, + "epoch": 1.0112589559877174, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002865654627800212, + "loss": 0.6704, "step": 988 }, { - "epoch": 1.0586979722518677, - "grad_norm": 0.251953125, - "learning_rate": 0.00028519477499762213, - "loss": 0.6798, + "epoch": 1.015353121801433, + "grad_norm": 0.2421875, + "learning_rate": 0.00028645005622890673, + "loss": 0.708, "step": 992 }, { - "epoch": 1.0629669156883672, - "grad_norm": 0.267578125, - "learning_rate": 0.0002850687618151256, - "loss": 0.6308, + "epoch": 1.0194472876151484, + "grad_norm": 0.275390625, + "learning_rate": 0.0002863341795499361, + "loss": 0.6166, "step": 996 }, { - "epoch": 1.0672358591248665, - "grad_norm": 0.2578125, - "learning_rate": 0.00028494224271046565, - "loss": 0.6534, + "epoch": 1.0235414534288638, + "grad_norm": 0.255859375, + "learning_rate": 0.00028621783314235314, + "loss": 0.6911, "step": 1000 }, { - "epoch": 1.071504802561366, - "grad_norm": 0.25390625, - "learning_rate": 0.0002848152181575402, - "loss": 0.7001, + "epoch": 1.0276356192425793, + "grad_norm": 0.251953125, + "learning_rate": 0.0002861010174070202, + "loss": 0.6379, "step": 1004 }, { - "epoch": 1.0757737459978656, - "grad_norm": 0.27734375, - "learning_rate": 0.0002846876886321406, - "loss": 0.6383, + "epoch": 1.0317297850562948, + "grad_norm": 0.259765625, + "learning_rate": 0.0002859837327464167, + "loss": 0.691, "step": 1008 }, { - "epoch": 1.0800426894343649, - "grad_norm": 0.25390625, - "learning_rate": 0.0002845596546119496, - "loss": 0.6803, + "epoch": 1.0358239508700102, + "grad_norm": 0.232421875, + "learning_rate": 0.0002858659795646375, + "loss": 0.6792, "step": 1012 }, { - "epoch": 1.0843116328708644, - "grad_norm": 0.2412109375, - "learning_rate": 0.00028443111657653947, - "loss": 0.7376, + "epoch": 1.0399181166837257, + "grad_norm": 0.26953125, + "learning_rate": 0.000285747758267392, + "loss": 0.6846, "step": 1016 }, { - "epoch": 1.088580576307364, - "grad_norm": 0.251953125, - "learning_rate": 0.00028430207500737054, - "loss": 0.6841, + "epoch": 1.0440122824974412, + "grad_norm": 0.23828125, + "learning_rate": 0.00028562906926200225, + "loss": 0.7281, "step": 1020 }, { - "epoch": 1.0928495197438635, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002841725303877892, - "loss": 0.6774, + "epoch": 1.0481064483111566, + "grad_norm": 0.267578125, + "learning_rate": 0.0002855099129574018, + "loss": 0.6851, "step": 1024 }, { - "epoch": 1.0971184631803628, - "grad_norm": 0.263671875, - "learning_rate": 0.00028404248320302585, - "loss": 0.6982, + "epoch": 1.052200614124872, + "grad_norm": 0.265625, + "learning_rate": 0.00028539028976413435, + "loss": 0.7062, "step": 1028 }, { - "epoch": 1.1013874066168623, - "grad_norm": 0.244140625, - "learning_rate": 0.00028391193394019376, - "loss": 0.7055, + "epoch": 1.0562947799385876, + "grad_norm": 0.2373046875, + "learning_rate": 0.000285270200094352, + "loss": 0.6651, "step": 1032 }, { - "epoch": 1.1056563500533618, - "grad_norm": 0.25390625, - "learning_rate": 0.0002837808830882864, - "loss": 0.6633, + "epoch": 1.060388945752303, + "grad_norm": 0.236328125, + "learning_rate": 0.0002851496443618143, + "loss": 0.6657, "step": 1036 }, { - "epoch": 1.1099252934898614, - "grad_norm": 0.267578125, - "learning_rate": 0.00028364933113817615, - "loss": 0.6493, + "epoch": 1.0644831115660185, + "grad_norm": 0.2470703125, + "learning_rate": 0.00028502862298188634, + "loss": 0.6647, "step": 1040 }, { - "epoch": 1.1141942369263607, - "grad_norm": 0.25390625, - "learning_rate": 0.0002835172785826125, - "loss": 0.661, + "epoch": 1.068577277379734, + "grad_norm": 0.2578125, + "learning_rate": 0.00028490713637153786, + "loss": 0.7025, "step": 1044 }, { - "epoch": 1.1184631803628602, - "grad_norm": 0.2470703125, - "learning_rate": 0.0002833847259162199, - "loss": 0.6788, + "epoch": 1.0726714431934494, + "grad_norm": 0.23828125, + "learning_rate": 0.00028478518494934123, + "loss": 0.6967, "step": 1048 }, { - "epoch": 1.1227321237993597, + "epoch": 1.076765609007165, "grad_norm": 0.251953125, - "learning_rate": 0.00028325167363549606, - "loss": 0.7103, + "learning_rate": 0.0002846627691354705, + "loss": 0.7205, "step": 1052 }, { - "epoch": 1.127001067235859, - "grad_norm": 0.25, - "learning_rate": 0.00028311812223881, - "loss": 0.6641, + "epoch": 1.0808597748208801, + "grad_norm": 0.26171875, + "learning_rate": 0.00028453988935169954, + "loss": 0.6407, "step": 1056 }, { - "epoch": 1.1312700106723586, - "grad_norm": 0.263671875, - "learning_rate": 0.00028298407222640055, - "loss": 0.7192, + "epoch": 1.0849539406345956, + "grad_norm": 0.2421875, + "learning_rate": 0.000284416546021401, + "loss": 0.6704, "step": 1060 }, { - "epoch": 1.135538954108858, - "grad_norm": 0.28515625, - "learning_rate": 0.0002828495241003738, - "loss": 0.7105, + "epoch": 1.089048106448311, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002842927395695446, + "loss": 0.692, "step": 1064 }, { - "epoch": 1.1398078975453576, - "grad_norm": 0.26171875, - "learning_rate": 0.00028271447836470174, - "loss": 0.6379, + "epoch": 1.0931422722620265, + "grad_norm": 0.236328125, + "learning_rate": 0.0002841684704226955, + "loss": 0.6846, "step": 1068 }, { - "epoch": 1.144076840981857, - "grad_norm": 0.2734375, - "learning_rate": 0.00028257893552522046, - "loss": 0.6747, + "epoch": 1.097236438075742, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002840437390090133, + "loss": 0.651, "step": 1072 }, { - "epoch": 1.1483457844183564, - "grad_norm": 0.26953125, - "learning_rate": 0.00028244289608962773, - "loss": 0.666, + "epoch": 1.1013306038894575, + "grad_norm": 0.267578125, + "learning_rate": 0.0002839185457582502, + "loss": 0.7293, "step": 1076 }, { - "epoch": 1.152614727854856, - "grad_norm": 0.2578125, - "learning_rate": 0.0002823063605674816, - "loss": 0.6981, + "epoch": 1.105424769703173, + "grad_norm": 0.248046875, + "learning_rate": 0.0002837928911017496, + "loss": 0.6636, "step": 1080 }, { - "epoch": 1.1568836712913555, - "grad_norm": 0.25390625, - "learning_rate": 0.00028216932947019817, - "loss": 0.7096, + "epoch": 1.1095189355168884, + "grad_norm": 0.25, + "learning_rate": 0.0002836667754724447, + "loss": 0.6393, "step": 1084 }, { - "epoch": 1.1611526147278548, - "grad_norm": 0.25390625, - "learning_rate": 0.0002820318033110499, - "loss": 0.724, + "epoch": 1.1136131013306039, + "grad_norm": 0.251953125, + "learning_rate": 0.0002835401993048568, + "loss": 0.7271, "step": 1088 }, { - "epoch": 1.1654215581643543, - "grad_norm": 0.24609375, - "learning_rate": 0.0002818937826051636, - "loss": 0.7105, + "epoch": 1.1177072671443193, + "grad_norm": 0.25, + "learning_rate": 0.0002834131630350942, + "loss": 0.6788, "step": 1092 }, { - "epoch": 1.1696905016008539, - "grad_norm": 0.2578125, - "learning_rate": 0.00028175526786951825, - "loss": 0.6659, + "epoch": 1.1218014329580348, + "grad_norm": 0.267578125, + "learning_rate": 0.00028328566710085024, + "loss": 0.6806, "step": 1096 }, { - "epoch": 1.1739594450373532, - "grad_norm": 0.267578125, - "learning_rate": 0.0002816162596229436, - "loss": 0.6361, + "epoch": 1.1258955987717503, + "grad_norm": 0.2294921875, + "learning_rate": 0.000283157711941402, + "loss": 0.7403, "step": 1100 }, { - "epoch": 1.1782283884738527, - "grad_norm": 0.24609375, - "learning_rate": 0.00028147675838611775, - "loss": 0.6774, + "epoch": 1.1299897645854657, + "grad_norm": 0.248046875, + "learning_rate": 0.000283029297997609, + "loss": 0.6585, "step": 1104 }, { - "epoch": 1.1824973319103522, - "grad_norm": 0.275390625, - "learning_rate": 0.0002813367646815655, - "loss": 0.681, + "epoch": 1.1340839303991812, + "grad_norm": 0.267578125, + "learning_rate": 0.00028290042571191114, + "loss": 0.6178, "step": 1108 }, { - "epoch": 1.1867662753468518, - "grad_norm": 0.2578125, - "learning_rate": 0.0002811962790336561, - "loss": 0.7276, + "epoch": 1.1381780962128967, + "grad_norm": 0.26171875, + "learning_rate": 0.0002827710955283277, + "loss": 0.688, "step": 1112 }, { - "epoch": 1.191035218783351, - "grad_norm": 0.267578125, - "learning_rate": 0.00028105530196860154, - "loss": 0.7113, + "epoch": 1.1422722620266121, + "grad_norm": 0.283203125, + "learning_rate": 0.00028264130789245565, + "loss": 0.6886, "step": 1116 }, { - "epoch": 1.1953041622198506, - "grad_norm": 0.263671875, - "learning_rate": 0.00028091383401445454, - "loss": 0.6788, + "epoch": 1.1463664278403276, + "grad_norm": 0.25390625, + "learning_rate": 0.00028251106325146797, + "loss": 0.6451, "step": 1120 }, { - "epoch": 1.1995731056563501, - "grad_norm": 0.244140625, - "learning_rate": 0.00028077187570110654, - "loss": 0.6695, + "epoch": 1.150460593654043, + "grad_norm": 0.25, + "learning_rate": 0.0002823803620541122, + "loss": 0.7498, "step": 1124 }, { - "epoch": 1.2038420490928496, - "grad_norm": 0.24609375, - "learning_rate": 0.00028062942756028553, - "loss": 0.6908, + "epoch": 1.1545547594677585, + "grad_norm": 0.26171875, + "learning_rate": 0.00028224920475070905, + "loss": 0.6605, "step": 1128 }, { - "epoch": 1.208110992529349, - "grad_norm": 0.251953125, - "learning_rate": 0.0002804864901255545, - "loss": 0.7365, + "epoch": 1.158648925281474, + "grad_norm": 0.2431640625, + "learning_rate": 0.00028211759179315053, + "loss": 0.6827, "step": 1132 }, { - "epoch": 1.2123799359658485, - "grad_norm": 0.26171875, - "learning_rate": 0.0002803430639323089, - "loss": 0.6305, + "epoch": 1.1627430910951895, + "grad_norm": 0.267578125, + "learning_rate": 0.00028198552363489874, + "loss": 0.6841, "step": 1136 }, { - "epoch": 1.216648879402348, - "grad_norm": 0.2451171875, - "learning_rate": 0.00028019914951777497, - "loss": 0.662, + "epoch": 1.1668372569089047, + "grad_norm": 0.27734375, + "learning_rate": 0.000281853000730984, + "loss": 0.6681, "step": 1140 }, { - "epoch": 1.2209178228388473, - "grad_norm": 0.24609375, - "learning_rate": 0.00028005474742100786, - "loss": 0.6763, + "epoch": 1.1709314227226202, + "grad_norm": 0.2578125, + "learning_rate": 0.0002817200235380035, + "loss": 0.6497, "step": 1144 }, { - "epoch": 1.2251867662753468, + "epoch": 1.1750255885363357, "grad_norm": 0.25, - "learning_rate": 0.0002799098581828892, - "loss": 0.6827, + "learning_rate": 0.00028158659251411954, + "loss": 0.6667, "step": 1148 }, { - "epoch": 1.2294557097118464, - "grad_norm": 0.251953125, - "learning_rate": 0.00027976448234612534, - "loss": 0.6805, + "epoch": 1.1791197543500511, + "grad_norm": 0.2578125, + "learning_rate": 0.0002814527081190583, + "loss": 0.6781, "step": 1152 }, { - "epoch": 1.2337246531483457, - "grad_norm": 0.25390625, - "learning_rate": 0.00027961862045524527, - "loss": 0.664, + "epoch": 1.1832139201637666, + "grad_norm": 0.28125, + "learning_rate": 0.0002813183708141077, + "loss": 0.6829, "step": 1156 }, { - "epoch": 1.2379935965848452, - "grad_norm": 0.26953125, - "learning_rate": 0.00027947227305659866, - "loss": 0.7206, + "epoch": 1.187308085977482, + "grad_norm": 0.25390625, + "learning_rate": 0.00028118358106211635, + "loss": 0.6888, "step": 1160 }, { - "epoch": 1.2422625400213447, - "grad_norm": 0.259765625, - "learning_rate": 0.00027932544069835354, - "loss": 0.6668, + "epoch": 1.1914022517911975, + "grad_norm": 0.267578125, + "learning_rate": 0.0002810483393274916, + "loss": 0.7212, "step": 1164 }, { - "epoch": 1.2465314834578443, - "grad_norm": 0.27734375, - "learning_rate": 0.0002791781239304947, - "loss": 0.6946, + "epoch": 1.195496417604913, + "grad_norm": 0.275390625, + "learning_rate": 0.00028091264607619826, + "loss": 0.7186, "step": 1168 }, { - "epoch": 1.2508004268943438, - "grad_norm": 0.26171875, - "learning_rate": 0.00027903032330482106, - "loss": 0.6449, + "epoch": 1.1995905834186285, + "grad_norm": 0.267578125, + "learning_rate": 0.0002807765017757565, + "loss": 0.6889, "step": 1172 }, { - "epoch": 1.255069370330843, - "grad_norm": 0.251953125, - "learning_rate": 0.0002788820393749442, - "loss": 0.6893, + "epoch": 1.203684749232344, + "grad_norm": 0.2578125, + "learning_rate": 0.00028063990689524093, + "loss": 0.7395, "step": 1176 }, { - "epoch": 1.2593383137673426, - "grad_norm": 0.26953125, - "learning_rate": 0.0002787332726962858, - "loss": 0.645, + "epoch": 1.2077789150460594, + "grad_norm": 0.25, + "learning_rate": 0.00028050286190527823, + "loss": 0.6695, "step": 1180 }, { - "epoch": 1.2636072572038421, - "grad_norm": 0.294921875, - "learning_rate": 0.00027858402382607573, - "loss": 0.689, + "epoch": 1.2118730808597749, + "grad_norm": 0.265625, + "learning_rate": 0.00028036536727804606, + "loss": 0.6742, "step": 1184 }, { - "epoch": 1.2678762006403415, - "grad_norm": 0.25, - "learning_rate": 0.0002784342933233502, - "loss": 0.6507, + "epoch": 1.2159672466734903, + "grad_norm": 0.267578125, + "learning_rate": 0.0002802274234872713, + "loss": 0.6568, "step": 1188 }, { - "epoch": 1.272145144076841, - "grad_norm": 0.265625, - "learning_rate": 0.00027828408174894925, - "loss": 0.6714, + "epoch": 1.2200614124872058, + "grad_norm": 0.267578125, + "learning_rate": 0.00028008903100822834, + "loss": 0.7136, "step": 1192 }, { - "epoch": 1.2764140875133405, - "grad_norm": 0.265625, - "learning_rate": 0.000278133389665515, - "loss": 0.695, + "epoch": 1.2241555783009213, + "grad_norm": 0.2734375, + "learning_rate": 0.0002799501903177375, + "loss": 0.631, "step": 1196 }, { - "epoch": 1.2806830309498398, - "grad_norm": 0.251953125, - "learning_rate": 0.0002779822176374892, - "loss": 0.7099, + "epoch": 1.2282497441146367, + "grad_norm": 0.2490234375, + "learning_rate": 0.00027981090189416343, + "loss": 0.7038, "step": 1200 }, { - "epoch": 1.2849519743863393, - "grad_norm": 0.2578125, - "learning_rate": 0.00027783056623111155, - "loss": 0.6568, + "epoch": 1.2323439099283522, + "grad_norm": 0.271484375, + "learning_rate": 0.00027967116621741326, + "loss": 0.6591, "step": 1204 }, { - "epoch": 1.2892209178228389, - "grad_norm": 0.24609375, - "learning_rate": 0.0002776784360144172, - "loss": 0.7091, + "epoch": 1.2364380757420674, + "grad_norm": 0.271484375, + "learning_rate": 0.0002795309837689352, + "loss": 0.7088, "step": 1208 }, { - "epoch": 1.2934898612593382, - "grad_norm": 0.25390625, - "learning_rate": 0.00027752582755723474, - "loss": 0.7185, + "epoch": 1.240532241555783, + "grad_norm": 0.267578125, + "learning_rate": 0.0002793903550317169, + "loss": 0.6818, "step": 1212 }, { - "epoch": 1.2977588046958377, - "grad_norm": 0.251953125, - "learning_rate": 0.0002773727414311842, - "loss": 0.6609, + "epoch": 1.2446264073694984, + "grad_norm": 0.23828125, + "learning_rate": 0.00027924928049028337, + "loss": 0.7158, "step": 1216 }, { - "epoch": 1.3020277481323372, - "grad_norm": 0.25390625, - "learning_rate": 0.00027721917820967465, - "loss": 0.6336, + "epoch": 1.2487205731832138, + "grad_norm": 0.25, + "learning_rate": 0.00027910776063069586, + "loss": 0.6467, "step": 1220 }, { - "epoch": 1.3062966915688368, - "grad_norm": 0.24609375, - "learning_rate": 0.00027706513846790235, - "loss": 0.6845, + "epoch": 1.2528147389969293, + "grad_norm": 0.296875, + "learning_rate": 0.0002789657959405498, + "loss": 0.6674, "step": 1224 }, { - "epoch": 1.3105656350053363, - "grad_norm": 0.265625, - "learning_rate": 0.00027691062278284835, - "loss": 0.6605, + "epoch": 1.2569089048106448, + "grad_norm": 0.23828125, + "learning_rate": 0.00027882338690897327, + "loss": 0.6747, "step": 1228 }, { - "epoch": 1.3148345784418356, - "grad_norm": 0.271484375, - "learning_rate": 0.0002767556317332764, - "loss": 0.6922, + "epoch": 1.2610030706243602, + "grad_norm": 0.2734375, + "learning_rate": 0.00027868053402662534, + "loss": 0.6999, "step": 1232 }, { - "epoch": 1.3191035218783351, - "grad_norm": 0.25, - "learning_rate": 0.00027660016589973097, - "loss": 0.6919, + "epoch": 1.2650972364380757, + "grad_norm": 0.25390625, + "learning_rate": 0.00027853723778569427, + "loss": 0.7059, "step": 1236 }, { - "epoch": 1.3233724653148347, - "grad_norm": 0.26171875, - "learning_rate": 0.0002764442258645347, - "loss": 0.6739, + "epoch": 1.2691914022517912, + "grad_norm": 0.28125, + "learning_rate": 0.00027839349867989587, + "loss": 0.714, "step": 1240 }, { - "epoch": 1.327641408751334, - "grad_norm": 0.28125, - "learning_rate": 0.00027628781221178655, - "loss": 0.7121, + "epoch": 1.2732855680655066, + "grad_norm": 0.267578125, + "learning_rate": 0.00027824931720447194, + "loss": 0.712, "step": 1244 }, { - "epoch": 1.3319103521878335, - "grad_norm": 0.263671875, - "learning_rate": 0.0002761309255273595, - "loss": 0.6966, + "epoch": 1.277379733879222, + "grad_norm": 0.275390625, + "learning_rate": 0.0002781046938561882, + "loss": 0.6872, "step": 1248 }, { - "epoch": 1.336179295624333, - "grad_norm": 0.263671875, - "learning_rate": 0.00027597356639889826, - "loss": 0.6589, + "epoch": 1.2814738996929376, + "grad_norm": 0.251953125, + "learning_rate": 0.00027795962913333304, + "loss": 0.7098, "step": 1252 }, { - "epoch": 1.3404482390608323, - "grad_norm": 0.251953125, - "learning_rate": 0.0002758157354158173, - "loss": 0.6605, + "epoch": 1.285568065506653, + "grad_norm": 0.275390625, + "learning_rate": 0.00027781412353571544, + "loss": 0.6901, "step": 1256 }, { - "epoch": 1.3447171824973319, - "grad_norm": 0.24609375, - "learning_rate": 0.00027565743316929824, - "loss": 0.7141, + "epoch": 1.2896622313203685, + "grad_norm": 0.287109375, + "learning_rate": 0.00027766817756466334, + "loss": 0.6867, "step": 1260 }, { - "epoch": 1.3489861259338314, - "grad_norm": 0.271484375, - "learning_rate": 0.0002754986602522882, - "loss": 0.682, + "epoch": 1.293756397134084, + "grad_norm": 0.25390625, + "learning_rate": 0.00027752179172302213, + "loss": 0.7032, "step": 1264 }, { - "epoch": 1.353255069370331, - "grad_norm": 0.271484375, - "learning_rate": 0.0002753394172594972, - "loss": 0.7279, + "epoch": 1.2978505629477994, + "grad_norm": 0.25390625, + "learning_rate": 0.0002773749665151525, + "loss": 0.6738, "step": 1268 }, { - "epoch": 1.3575240128068304, - "grad_norm": 0.2578125, - "learning_rate": 0.0002751797047873957, - "loss": 0.6412, + "epoch": 1.301944728761515, + "grad_norm": 0.267578125, + "learning_rate": 0.00027722770244692924, + "loss": 0.6769, "step": 1272 }, { - "epoch": 1.3617929562433297, - "grad_norm": 0.25, - "learning_rate": 0.00027501952343421323, - "loss": 0.6337, + "epoch": 1.3060388945752304, + "grad_norm": 0.283203125, + "learning_rate": 0.0002770800000257388, + "loss": 0.6839, "step": 1276 }, { - "epoch": 1.3660618996798293, - "grad_norm": 0.2890625, - "learning_rate": 0.000274858873799935, - "loss": 0.6944, + "epoch": 1.3101330603889458, + "grad_norm": 0.259765625, + "learning_rate": 0.0002769318597604784, + "loss": 0.6871, "step": 1280 }, { - "epoch": 1.3703308431163288, - "grad_norm": 0.25390625, - "learning_rate": 0.0002746977564863007, - "loss": 0.6983, + "epoch": 1.3142272262026613, + "grad_norm": 0.255859375, + "learning_rate": 0.0002767832821615534, + "loss": 0.7099, "step": 1284 }, { - "epoch": 1.374599786552828, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002745361720968016, - "loss": 0.6566, + "epoch": 1.3183213920163768, + "grad_norm": 0.23828125, + "learning_rate": 0.0002766342677408763, + "loss": 0.702, "step": 1288 }, { - "epoch": 1.3788687299893276, - "grad_norm": 0.2470703125, - "learning_rate": 0.00027437412123667833, - "loss": 0.6634, + "epoch": 1.3224155578300922, + "grad_norm": 0.283203125, + "learning_rate": 0.0002764848170118644, + "loss": 0.6578, "step": 1292 }, { - "epoch": 1.3831376734258272, - "grad_norm": 0.251953125, - "learning_rate": 0.00027421160451291906, - "loss": 0.7026, + "epoch": 1.3265097236438077, + "grad_norm": 0.25, + "learning_rate": 0.00027633493048943825, + "loss": 0.6473, "step": 1296 }, { - "epoch": 1.3874066168623265, - "grad_norm": 0.26171875, - "learning_rate": 0.00027404862253425677, - "loss": 0.6845, + "epoch": 1.330603889457523, + "grad_norm": 0.275390625, + "learning_rate": 0.00027618460869002016, + "loss": 0.6512, "step": 1300 }, { - "epoch": 1.391675560298826, - "grad_norm": 0.279296875, - "learning_rate": 0.0002738851759111671, - "loss": 0.652, + "epoch": 1.3346980552712384, + "grad_norm": 0.263671875, + "learning_rate": 0.00027603385213153186, + "loss": 0.7167, "step": 1304 }, { - "epoch": 1.3959445037353255, - "grad_norm": 0.2470703125, - "learning_rate": 0.00027372126525586614, - "loss": 0.6708, + "epoch": 1.3387922210849539, + "grad_norm": 0.259765625, + "learning_rate": 0.0002758826613333932, + "loss": 0.7231, "step": 1308 }, { - "epoch": 1.400213447171825, - "grad_norm": 0.2392578125, - "learning_rate": 0.00027355689118230823, - "loss": 0.6644, + "epoch": 1.3428863868986693, + "grad_norm": 0.248046875, + "learning_rate": 0.00027573103681652, + "loss": 0.7133, "step": 1312 }, { - "epoch": 1.4044823906083244, - "grad_norm": 0.26953125, - "learning_rate": 0.0002733920543061832, - "loss": 0.6562, + "epoch": 1.3469805527123848, + "grad_norm": 0.265625, + "learning_rate": 0.0002755789791033227, + "loss": 0.6972, "step": 1316 }, { - "epoch": 1.4087513340448239, - "grad_norm": 0.275390625, - "learning_rate": 0.0002732267552449146, - "loss": 0.644, + "epoch": 1.3510747185261003, + "grad_norm": 0.27734375, + "learning_rate": 0.00027542648871770384, + "loss": 0.7027, "step": 1320 }, { - "epoch": 1.4130202774813234, - "grad_norm": 0.25390625, - "learning_rate": 0.00027306099461765716, - "loss": 0.6693, + "epoch": 1.3551688843398157, + "grad_norm": 0.259765625, + "learning_rate": 0.00027527356618505715, + "loss": 0.6562, "step": 1324 }, { - "epoch": 1.417289220917823, - "grad_norm": 0.251953125, - "learning_rate": 0.0002728947730452945, - "loss": 0.7215, + "epoch": 1.3592630501535312, + "grad_norm": 0.275390625, + "learning_rate": 0.00027512021203226507, + "loss": 0.6721, "step": 1328 }, { - "epoch": 1.4215581643543223, - "grad_norm": 0.2890625, - "learning_rate": 0.0002727280911504367, - "loss": 0.641, + "epoch": 1.3633572159672467, + "grad_norm": 0.29296875, + "learning_rate": 0.00027496642678769717, + "loss": 0.7029, "step": 1332 }, { - "epoch": 1.4258271077908218, - "grad_norm": 0.291015625, - "learning_rate": 0.000272560949557418, - "loss": 0.6825, + "epoch": 1.3674513817809621, + "grad_norm": 0.25390625, + "learning_rate": 0.0002748122109812083, + "loss": 0.7028, "step": 1336 }, { - "epoch": 1.4300960512273213, - "grad_norm": 0.25, - "learning_rate": 0.00027239334889229467, - "loss": 0.6791, + "epoch": 1.3715455475946776, + "grad_norm": 0.25390625, + "learning_rate": 0.00027465756514413677, + "loss": 0.6865, "step": 1340 }, { - "epoch": 1.4343649946638206, - "grad_norm": 0.291015625, - "learning_rate": 0.00027222528978284254, - "loss": 0.6332, + "epoch": 1.375639713408393, + "grad_norm": 0.275390625, + "learning_rate": 0.00027450248980930264, + "loss": 0.7197, "step": 1344 }, { - "epoch": 1.4386339381003201, - "grad_norm": 0.275390625, - "learning_rate": 0.0002720567728585544, - "loss": 0.6817, + "epoch": 1.3797338792221086, + "grad_norm": 0.2578125, + "learning_rate": 0.00027434698551100567, + "loss": 0.6694, "step": 1348 }, { - "epoch": 1.4429028815368197, - "grad_norm": 0.267578125, - "learning_rate": 0.000271887798750638, - "loss": 0.7123, + "epoch": 1.383828045035824, + "grad_norm": 0.2890625, + "learning_rate": 0.0002741910527850235, + "loss": 0.7057, "step": 1352 }, { - "epoch": 1.447171824973319, - "grad_norm": 0.2734375, - "learning_rate": 0.0002717183680920135, - "loss": 0.6692, + "epoch": 1.3879222108495395, + "grad_norm": 0.279296875, + "learning_rate": 0.0002740346921686101, + "loss": 0.7029, "step": 1356 }, { - "epoch": 1.4514407684098185, - "grad_norm": 0.2578125, - "learning_rate": 0.0002715484815173113, - "loss": 0.7111, + "epoch": 1.3920163766632547, + "grad_norm": 0.26953125, + "learning_rate": 0.00027387790420049357, + "loss": 0.6723, "step": 1360 }, { - "epoch": 1.455709711846318, - "grad_norm": 0.2333984375, - "learning_rate": 0.0002713781396628693, - "loss": 0.6938, + "epoch": 1.3961105424769702, + "grad_norm": 0.255859375, + "learning_rate": 0.0002737206894208744, + "loss": 0.6895, "step": 1364 }, { - "epoch": 1.4599786552828176, - "grad_norm": 0.279296875, - "learning_rate": 0.00027120734316673074, - "loss": 0.6775, + "epoch": 1.4002047082906857, + "grad_norm": 0.267578125, + "learning_rate": 0.0002735630483714236, + "loss": 0.6655, "step": 1368 }, { - "epoch": 1.464247598719317, - "grad_norm": 0.2578125, - "learning_rate": 0.000271036092668642, - "loss": 0.6807, + "epoch": 1.4042988741044011, + "grad_norm": 0.27734375, + "learning_rate": 0.00027340498159528106, + "loss": 0.6728, "step": 1372 }, { - "epoch": 1.4685165421558164, - "grad_norm": 0.283203125, - "learning_rate": 0.00027086438881004975, - "loss": 0.6702, + "epoch": 1.4083930399181166, + "grad_norm": 0.267578125, + "learning_rate": 0.00027324648963705317, + "loss": 0.7074, "step": 1376 }, { - "epoch": 1.472785485592316, - "grad_norm": 0.267578125, - "learning_rate": 0.000270692232234099, - "loss": 0.6432, + "epoch": 1.412487205731832, + "grad_norm": 0.263671875, + "learning_rate": 0.00027308757304281154, + "loss": 0.6722, "step": 1380 }, { - "epoch": 1.4770544290288155, - "grad_norm": 0.271484375, - "learning_rate": 0.0002705196235856303, - "loss": 0.7068, + "epoch": 1.4165813715455475, + "grad_norm": 0.2734375, + "learning_rate": 0.00027292823236009056, + "loss": 0.6882, "step": 1384 }, { - "epoch": 1.4813233724653148, - "grad_norm": 0.2578125, - "learning_rate": 0.0002703465635111777, - "loss": 0.6775, + "epoch": 1.420675537359263, + "grad_norm": 0.373046875, + "learning_rate": 0.0002727684681378861, + "loss": 0.6971, "step": 1388 }, { - "epoch": 1.4855923159018143, - "grad_norm": 0.259765625, - "learning_rate": 0.00027017305265896596, - "loss": 0.6731, + "epoch": 1.4247697031729785, + "grad_norm": 0.265625, + "learning_rate": 0.000272608280926653, + "loss": 0.6815, "step": 1392 }, { - "epoch": 1.4898612593383138, - "grad_norm": 0.271484375, - "learning_rate": 0.0002699990916789084, - "loss": 0.7215, + "epoch": 1.428863868986694, + "grad_norm": 0.279296875, + "learning_rate": 0.00027244767127830366, + "loss": 0.6712, "step": 1396 }, { - "epoch": 1.4941302027748131, - "grad_norm": 0.263671875, - "learning_rate": 0.00026982468122260436, - "loss": 0.6321, + "epoch": 1.4329580348004094, + "grad_norm": 0.322265625, + "learning_rate": 0.00027228663974620583, + "loss": 0.6738, "step": 1400 }, { - "epoch": 1.4983991462113126, - "grad_norm": 0.26171875, - "learning_rate": 0.00026964982194333686, - "loss": 0.6468, + "epoch": 1.4370522006141249, + "grad_norm": 0.294921875, + "learning_rate": 0.000272125186885181, + "loss": 0.6815, "step": 1404 }, { - "epoch": 1.5026680896478122, + "epoch": 1.4411463664278403, "grad_norm": 0.28515625, - "learning_rate": 0.00026947451449606984, - "loss": 0.6713, + "learning_rate": 0.00027196331325150217, + "loss": 0.6807, "step": 1408 }, { - "epoch": 1.5069370330843115, - "grad_norm": 0.265625, - "learning_rate": 0.0002692987595374461, - "loss": 0.6192, + "epoch": 1.4452405322415558, + "grad_norm": 0.26171875, + "learning_rate": 0.00027180101940289206, + "loss": 0.6822, "step": 1412 }, { - "epoch": 1.5112059765208112, - "grad_norm": 0.28125, - "learning_rate": 0.00026912255772578456, - "loss": 0.6668, + "epoch": 1.4493346980552713, + "grad_norm": 0.255859375, + "learning_rate": 0.0002716383058985213, + "loss": 0.6641, "step": 1416 }, { - "epoch": 1.5154749199573105, - "grad_norm": 0.2890625, - "learning_rate": 0.00026894590972107807, - "loss": 0.6811, + "epoch": 1.4534288638689867, + "grad_norm": 0.251953125, + "learning_rate": 0.00027147517329900636, + "loss": 0.7067, "step": 1420 }, { - "epoch": 1.51974386339381, - "grad_norm": 0.255859375, - "learning_rate": 0.0002687688161849906, - "loss": 0.7069, + "epoch": 1.4575230296827022, + "grad_norm": 0.279296875, + "learning_rate": 0.00027131162216640774, + "loss": 0.6732, "step": 1424 }, { - "epoch": 1.5240128068303096, - "grad_norm": 0.28125, - "learning_rate": 0.0002685912777808549, - "loss": 0.69, + "epoch": 1.4616171954964177, + "grad_norm": 0.2890625, + "learning_rate": 0.0002711476530642279, + "loss": 0.6609, "step": 1428 }, { - "epoch": 1.528281750266809, + "epoch": 1.4657113613101331, "grad_norm": 0.267578125, - "learning_rate": 0.00026841329517367023, - "loss": 0.7054, + "learning_rate": 0.0002709832665574093, + "loss": 0.7021, "step": 1432 }, { - "epoch": 1.5325506937033084, - "grad_norm": 0.259765625, - "learning_rate": 0.00026823486903009957, - "loss": 0.6814, + "epoch": 1.4698055271238486, + "grad_norm": 0.2490234375, + "learning_rate": 0.00027081846321233273, + "loss": 0.6793, "step": 1436 }, { - "epoch": 1.536819637139808, - "grad_norm": 0.263671875, - "learning_rate": 0.0002680560000184673, - "loss": 0.6867, + "epoch": 1.473899692937564, + "grad_norm": 0.279296875, + "learning_rate": 0.000270653243596815, + "loss": 0.6538, "step": 1440 }, { - "epoch": 1.5410885805763073, - "grad_norm": 0.26171875, - "learning_rate": 0.0002678766888087565, - "loss": 0.6085, + "epoch": 1.4779938587512795, + "grad_norm": 0.265625, + "learning_rate": 0.00027048760828010725, + "loss": 0.7086, "step": 1444 }, { - "epoch": 1.5453575240128068, - "grad_norm": 0.26171875, - "learning_rate": 0.00026769693607260674, - "loss": 0.6843, + "epoch": 1.482088024564995, + "grad_norm": 0.251953125, + "learning_rate": 0.00027032155783289274, + "loss": 0.6975, "step": 1448 }, { - "epoch": 1.5496264674493063, - "grad_norm": 0.28125, - "learning_rate": 0.00026751674248331125, - "loss": 0.6953, + "epoch": 1.4861821903787105, + "grad_norm": 0.2578125, + "learning_rate": 0.00027015509282728525, + "loss": 0.7226, "step": 1452 }, { - "epoch": 1.5538954108858056, - "grad_norm": 0.28515625, - "learning_rate": 0.0002673361087158147, - "loss": 0.6817, + "epoch": 1.4902763561924257, + "grad_norm": 0.265625, + "learning_rate": 0.00026998821383682664, + "loss": 0.6931, "step": 1456 }, { - "epoch": 1.5581643543223054, - "grad_norm": 0.26171875, - "learning_rate": 0.0002671550354467104, - "loss": 0.7003, + "epoch": 1.4943705220061412, + "grad_norm": 0.248046875, + "learning_rate": 0.00026982092143648537, + "loss": 0.624, "step": 1460 }, { - "epoch": 1.5624332977588047, - "grad_norm": 0.2490234375, - "learning_rate": 0.000266973523354238, - "loss": 0.6627, + "epoch": 1.4984646878198566, + "grad_norm": 0.255859375, + "learning_rate": 0.00026965321620265405, + "loss": 0.6446, "step": 1464 }, { - "epoch": 1.5667022411953042, - "grad_norm": 0.265625, - "learning_rate": 0.0002667915731182806, - "loss": 0.6727, + "epoch": 1.5025588536335721, + "grad_norm": 0.26171875, + "learning_rate": 0.0002694850987131478, + "loss": 0.7135, "step": 1468 }, { - "epoch": 1.5709711846318037, - "grad_norm": 0.26171875, - "learning_rate": 0.0002666091854203628, - "loss": 0.7468, + "epoch": 1.5066530194472876, + "grad_norm": 0.26953125, + "learning_rate": 0.0002693165695472022, + "loss": 0.6878, "step": 1472 }, { - "epoch": 1.575240128068303, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002664263609436474, - "loss": 0.6985, + "epoch": 1.510747185261003, + "grad_norm": 0.263671875, + "learning_rate": 0.00026914762928547097, + "loss": 0.6612, "step": 1476 }, { - "epoch": 1.5795090715048026, - "grad_norm": 0.263671875, - "learning_rate": 0.00026624310037293354, - "loss": 0.7159, + "epoch": 1.5148413510747185, + "grad_norm": 0.265625, + "learning_rate": 0.00026897827851002457, + "loss": 0.6975, "step": 1480 }, { - "epoch": 1.583778014941302, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002660594043946537, - "loss": 0.7057, + "epoch": 1.518935516888434, + "grad_norm": 0.271484375, + "learning_rate": 0.0002688085178043475, + "loss": 0.673, "step": 1484 }, { - "epoch": 1.5880469583778014, - "grad_norm": 0.2392578125, - "learning_rate": 0.00026587527369687115, - "loss": 0.6788, + "epoch": 1.5230296827021494, + "grad_norm": 0.263671875, + "learning_rate": 0.000268638347753337, + "loss": 0.6826, "step": 1488 }, { - "epoch": 1.592315901814301, - "grad_norm": 0.25390625, - "learning_rate": 0.00026569070896927767, - "loss": 0.6561, + "epoch": 1.527123848515865, + "grad_norm": 0.2890625, + "learning_rate": 0.0002684677689433004, + "loss": 0.6603, "step": 1492 }, { - "epoch": 1.5965848452508005, + "epoch": 1.5312180143295804, "grad_norm": 0.2734375, - "learning_rate": 0.0002655057109031906, - "loss": 0.6514, + "learning_rate": 0.0002682967819619535, + "loss": 0.698, "step": 1496 }, { - "epoch": 1.6008537886872998, - "grad_norm": 0.265625, - "learning_rate": 0.0002653202801915505, - "loss": 0.6549, + "epoch": 1.5353121801432958, + "grad_norm": 0.28515625, + "learning_rate": 0.00026812538739841833, + "loss": 0.7188, "step": 1500 }, { - "epoch": 1.6051227321237995, - "grad_norm": 0.244140625, - "learning_rate": 0.0002651344175289185, - "loss": 0.6261, + "epoch": 1.5394063459570113, + "grad_norm": 0.275390625, + "learning_rate": 0.00026795358584322135, + "loss": 0.7101, "step": 1504 }, { - "epoch": 1.6093916755602988, - "grad_norm": 0.265625, - "learning_rate": 0.0002649481236114735, - "loss": 0.6311, + "epoch": 1.5435005117707266, + "grad_norm": 0.255859375, + "learning_rate": 0.0002677813778882911, + "loss": 0.68, "step": 1508 }, { - "epoch": 1.6136606189967981, - "grad_norm": 0.263671875, - "learning_rate": 0.0002647613991370101, - "loss": 0.7317, + "epoch": 1.547594677584442, + "grad_norm": 0.271484375, + "learning_rate": 0.0002676087641269566, + "loss": 0.7013, "step": 1512 }, { - "epoch": 1.6179295624332979, - "grad_norm": 0.2578125, - "learning_rate": 0.0002645742448049352, - "loss": 0.6584, + "epoch": 1.5516888433981575, + "grad_norm": 0.25390625, + "learning_rate": 0.0002674357451539448, + "loss": 0.6477, "step": 1516 }, { - "epoch": 1.6221985058697972, - "grad_norm": 0.251953125, - "learning_rate": 0.0002643866613162661, - "loss": 0.7089, + "epoch": 1.555783009211873, + "grad_norm": 0.2734375, + "learning_rate": 0.00026726232156537886, + "loss": 0.6855, "step": 1520 }, { - "epoch": 1.6264674493062967, - "grad_norm": 0.275390625, - "learning_rate": 0.00026419864937362746, - "loss": 0.6484, + "epoch": 1.5598771750255884, + "grad_norm": 0.27734375, + "learning_rate": 0.000267088493958776, + "loss": 0.6618, "step": 1524 }, { - "epoch": 1.6307363927427962, - "grad_norm": 0.271484375, - "learning_rate": 0.0002640102096812487, - "loss": 0.6893, + "epoch": 1.563971340839304, + "grad_norm": 0.29296875, + "learning_rate": 0.0002669142629330455, + "loss": 0.6477, "step": 1528 }, { - "epoch": 1.6350053361792956, + "epoch": 1.5680655066530194, "grad_norm": 0.255859375, - "learning_rate": 0.0002638213429449617, - "loss": 0.7095, + "learning_rate": 0.00026673962908848654, + "loss": 0.7286, "step": 1532 }, { - "epoch": 1.639274279615795, - "grad_norm": 0.271484375, - "learning_rate": 0.00026363204987219765, - "loss": 0.6641, + "epoch": 1.5721596724667348, + "grad_norm": 0.255859375, + "learning_rate": 0.0002665645930267862, + "loss": 0.6947, "step": 1536 }, { - "epoch": 1.6435432230522946, - "grad_norm": 0.27734375, - "learning_rate": 0.0002634423311719847, - "loss": 0.7056, + "epoch": 1.5762538382804503, + "grad_norm": 0.2734375, + "learning_rate": 0.0002663891553510174, + "loss": 0.6563, "step": 1540 }, { - "epoch": 1.647812166488794, - "grad_norm": 0.263671875, - "learning_rate": 0.00026325218755494543, - "loss": 0.6934, + "epoch": 1.5803480040941658, + "grad_norm": 0.259765625, + "learning_rate": 0.00026621331666563665, + "loss": 0.6722, "step": 1544 }, { - "epoch": 1.6520811099252934, - "grad_norm": 0.283203125, - "learning_rate": 0.00026306161973329374, - "loss": 0.663, + "epoch": 1.5844421699078812, + "grad_norm": 0.298828125, + "learning_rate": 0.0002660370775764822, + "loss": 0.6563, "step": 1548 }, { - "epoch": 1.656350053361793, - "grad_norm": 0.27734375, - "learning_rate": 0.0002628706284208328, - "loss": 0.6673, + "epoch": 1.5885363357215967, + "grad_norm": 0.267578125, + "learning_rate": 0.000265860438690772, + "loss": 0.678, "step": 1552 }, { - "epoch": 1.6606189967982923, - "grad_norm": 0.259765625, - "learning_rate": 0.00026267921433295165, - "loss": 0.7057, + "epoch": 1.5926305015353122, + "grad_norm": 0.462890625, + "learning_rate": 0.00026568340061710124, + "loss": 0.6558, "step": 1556 }, { - "epoch": 1.664887940234792, - "grad_norm": 0.263671875, - "learning_rate": 0.0002624873781866233, - "loss": 0.6915, + "epoch": 1.5967246673490276, + "grad_norm": 0.26171875, + "learning_rate": 0.0002655059639654406, + "loss": 0.6684, "step": 1560 }, { - "epoch": 1.6691568836712913, - "grad_norm": 0.275390625, - "learning_rate": 0.00026229512070040123, - "loss": 0.6858, + "epoch": 1.600818833162743, + "grad_norm": 0.287109375, + "learning_rate": 0.000265328129347134, + "loss": 0.6614, "step": 1564 }, { - "epoch": 1.6734258271077909, - "grad_norm": 0.275390625, - "learning_rate": 0.0002621024425944175, - "loss": 0.6542, + "epoch": 1.6049129989764586, + "grad_norm": 0.271484375, + "learning_rate": 0.00026514989737489646, + "loss": 0.6763, "step": 1568 }, { - "epoch": 1.6776947705442904, - "grad_norm": 0.267578125, - "learning_rate": 0.00026190934459037945, - "loss": 0.7185, + "epoch": 1.609007164790174, + "grad_norm": 0.263671875, + "learning_rate": 0.00026497126866281223, + "loss": 0.6667, "step": 1572 }, { - "epoch": 1.6819637139807897, - "grad_norm": 0.279296875, - "learning_rate": 0.00026171582741156725, - "loss": 0.6859, + "epoch": 1.6131013306038895, + "grad_norm": 0.271484375, + "learning_rate": 0.0002647922438263323, + "loss": 0.6783, "step": 1576 }, { - "epoch": 1.6862326574172892, - "grad_norm": 0.26953125, - "learning_rate": 0.0002615218917828312, - "loss": 0.6788, + "epoch": 1.617195496417605, + "grad_norm": 0.265625, + "learning_rate": 0.00026461282348227267, + "loss": 0.6843, "step": 1580 }, { - "epoch": 1.6905016008537888, - "grad_norm": 0.2490234375, - "learning_rate": 0.00026132753843058896, - "loss": 0.7032, + "epoch": 1.6212896622313204, + "grad_norm": 0.271484375, + "learning_rate": 0.00026443300824881174, + "loss": 0.6728, "step": 1584 }, { - "epoch": 1.694770544290288, - "grad_norm": 0.265625, - "learning_rate": 0.00026113276808282284, - "loss": 0.6972, + "epoch": 1.625383828045036, + "grad_norm": 0.2490234375, + "learning_rate": 0.00026425279874548883, + "loss": 0.6666, "step": 1588 }, { - "epoch": 1.6990394877267876, - "grad_norm": 0.279296875, - "learning_rate": 0.00026093758146907703, - "loss": 0.682, + "epoch": 1.6294779938587514, + "grad_norm": 0.263671875, + "learning_rate": 0.0002640721955932013, + "loss": 0.6979, "step": 1592 }, { - "epoch": 1.7033084311632871, - "grad_norm": 0.271484375, - "learning_rate": 0.00026074197932045496, - "loss": 0.7364, + "epoch": 1.6335721596724668, + "grad_norm": 0.2490234375, + "learning_rate": 0.00026389119941420323, + "loss": 0.6888, "step": 1596 }, { - "epoch": 1.7075773745997864, - "grad_norm": 0.265625, - "learning_rate": 0.00026054596236961655, - "loss": 0.7075, + "epoch": 1.6376663254861823, + "grad_norm": 0.2734375, + "learning_rate": 0.0002637098108321024, + "loss": 0.6875, "step": 1600 }, { - "epoch": 1.7118463180362862, - "grad_norm": 0.25390625, - "learning_rate": 0.00026034953135077533, - "loss": 0.6649, + "epoch": 1.6417604912998978, + "grad_norm": 0.263671875, + "learning_rate": 0.000263528030471859, + "loss": 0.6285, "step": 1604 }, { - "epoch": 1.7161152614727855, - "grad_norm": 0.265625, - "learning_rate": 0.00026015268699969594, - "loss": 0.6433, + "epoch": 1.6458546571136132, + "grad_norm": 0.291015625, + "learning_rate": 0.0002633458589597827, + "loss": 0.6737, "step": 1608 }, { - "epoch": 1.7203842049092848, - "grad_norm": 0.314453125, - "learning_rate": 0.000259955430053691, - "loss": 0.6143, + "epoch": 1.6499488229273287, + "grad_norm": 0.259765625, + "learning_rate": 0.0002631632969235311, + "loss": 0.6747, "step": 1612 }, { - "epoch": 1.7246531483457845, - "grad_norm": 0.25, - "learning_rate": 0.0002597577612516187, - "loss": 0.7164, + "epoch": 1.6540429887410442, + "grad_norm": 0.2734375, + "learning_rate": 0.00026298034499210715, + "loss": 0.6784, "step": 1616 }, { - "epoch": 1.7289220917822838, - "grad_norm": 0.265625, - "learning_rate": 0.0002595596813338801, - "loss": 0.7572, + "epoch": 1.6581371545547596, + "grad_norm": 0.259765625, + "learning_rate": 0.00026279700379585724, + "loss": 0.6657, "step": 1620 }, { - "epoch": 1.7331910352187834, + "epoch": 1.6622313203684749, "grad_norm": 0.275390625, - "learning_rate": 0.00025936119104241577, - "loss": 0.6611, + "learning_rate": 0.000262613273966469, + "loss": 0.705, "step": 1624 }, { - "epoch": 1.737459978655283, - "grad_norm": 0.2578125, - "learning_rate": 0.0002591622911207036, - "loss": 0.656, + "epoch": 1.6663254861821903, + "grad_norm": 0.27734375, + "learning_rate": 0.00026242915613696897, + "loss": 0.7061, "step": 1628 }, { - "epoch": 1.7417289220917822, - "grad_norm": 0.25390625, - "learning_rate": 0.0002589629823137559, - "loss": 0.7315, + "epoch": 1.6704196519959058, + "grad_norm": 0.24609375, + "learning_rate": 0.0002622446509417206, + "loss": 0.6685, "step": 1632 }, { - "epoch": 1.7459978655282817, - "grad_norm": 0.28515625, - "learning_rate": 0.00025876326536811644, - "loss": 0.7098, + "epoch": 1.6745138178096213, + "grad_norm": 0.2734375, + "learning_rate": 0.00026205975901642174, + "loss": 0.7255, "step": 1636 }, { - "epoch": 1.7502668089647813, - "grad_norm": 0.27734375, - "learning_rate": 0.0002585631410318577, - "loss": 0.6563, + "epoch": 1.6786079836233367, + "grad_norm": 0.263671875, + "learning_rate": 0.000261874480998103, + "loss": 0.5933, "step": 1640 }, { - "epoch": 1.7545357524012806, - "grad_norm": 0.2578125, - "learning_rate": 0.0002583626100545782, - "loss": 0.6822, + "epoch": 1.6827021494370522, + "grad_norm": 0.251953125, + "learning_rate": 0.00026168881752512517, + "loss": 0.7004, "step": 1644 }, { - "epoch": 1.75880469583778, - "grad_norm": 0.28125, - "learning_rate": 0.00025816167318739946, - "loss": 0.711, + "epoch": 1.6867963152507677, + "grad_norm": 0.28515625, + "learning_rate": 0.00026150276923717693, + "loss": 0.6795, "step": 1648 }, { - "epoch": 1.7630736392742796, - "grad_norm": 0.283203125, - "learning_rate": 0.0002579603311829635, - "loss": 0.6557, + "epoch": 1.6908904810644831, + "grad_norm": 0.25390625, + "learning_rate": 0.0002613163367752729, + "loss": 0.6927, "step": 1652 }, { - "epoch": 1.767342582710779, - "grad_norm": 0.255859375, - "learning_rate": 0.0002577585847954297, - "loss": 0.6914, + "epoch": 1.6949846468781986, + "grad_norm": 0.302734375, + "learning_rate": 0.00026112952078175146, + "loss": 0.6589, "step": 1656 }, { - "epoch": 1.7716115261472787, - "grad_norm": 0.2578125, - "learning_rate": 0.00025755643478047224, - "loss": 0.6466, + "epoch": 1.699078812691914, + "grad_norm": 0.26953125, + "learning_rate": 0.0002609423219002722, + "loss": 0.6693, "step": 1660 }, { - "epoch": 1.775880469583778, - "grad_norm": 0.27734375, - "learning_rate": 0.0002573538818952771, - "loss": 0.7048, + "epoch": 1.7031729785056293, + "grad_norm": 0.26953125, + "learning_rate": 0.0002607547407758141, + "loss": 0.7185, "step": 1664 }, { - "epoch": 1.7801494130202775, - "grad_norm": 0.2578125, - "learning_rate": 0.00025715092689853925, - "loss": 0.6707, + "epoch": 1.7072671443193448, + "grad_norm": 0.275390625, + "learning_rate": 0.00026056677805467304, + "loss": 0.6349, "step": 1668 }, { - "epoch": 1.784418356456777, - "grad_norm": 0.265625, - "learning_rate": 0.00025694757055045984, - "loss": 0.6981, + "epoch": 1.7113613101330603, + "grad_norm": 0.271484375, + "learning_rate": 0.0002603784343844597, + "loss": 0.7128, "step": 1672 }, { - "epoch": 1.7886872998932764, - "grad_norm": 0.267578125, - "learning_rate": 0.00025674381361274345, - "loss": 0.6725, + "epoch": 1.7154554759467757, + "grad_norm": 0.265625, + "learning_rate": 0.00026018971041409715, + "loss": 0.6974, "step": 1676 }, { - "epoch": 1.7929562433297759, - "grad_norm": 0.2470703125, - "learning_rate": 0.000256539656848595, - "loss": 0.7183, + "epoch": 1.7195496417604912, + "grad_norm": 0.265625, + "learning_rate": 0.0002600006067938191, + "loss": 0.6253, "step": 1680 }, { - "epoch": 1.7972251867662754, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002563351010227171, - "loss": 0.6967, + "epoch": 1.7236438075742067, + "grad_norm": 0.259765625, + "learning_rate": 0.00025981112417516693, + "loss": 0.688, "step": 1684 }, { - "epoch": 1.8014941302027747, - "grad_norm": 0.25390625, - "learning_rate": 0.000256130146901307, - "loss": 0.6597, + "epoch": 1.7277379733879221, + "grad_norm": 0.2734375, + "learning_rate": 0.000259621263210988, + "loss": 0.6281, "step": 1688 }, { - "epoch": 1.8057630736392742, - "grad_norm": 0.2890625, - "learning_rate": 0.0002559247952520541, - "loss": 0.6943, + "epoch": 1.7318321392016376, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002594310245554333, + "loss": 0.7162, "step": 1692 }, { - "epoch": 1.8100320170757738, - "grad_norm": 0.2578125, - "learning_rate": 0.00025571904684413643, - "loss": 0.6596, + "epoch": 1.735926305015353, + "grad_norm": 0.275390625, + "learning_rate": 0.0002592404088639549, + "loss": 0.7011, "step": 1696 }, { - "epoch": 1.814300960512273, - "grad_norm": 0.26171875, - "learning_rate": 0.00025551290244821853, - "loss": 0.6709, + "epoch": 1.7400204708290685, + "grad_norm": 0.25390625, + "learning_rate": 0.0002590494167933042, + "loss": 0.6928, "step": 1700 }, { - "epoch": 1.8185699039487728, - "grad_norm": 0.25390625, - "learning_rate": 0.00025530636283644786, - "loss": 0.6656, + "epoch": 1.744114636642784, + "grad_norm": 0.271484375, + "learning_rate": 0.0002588580490015292, + "loss": 0.6628, "step": 1704 }, { - "epoch": 1.8228388473852721, - "grad_norm": 0.283203125, - "learning_rate": 0.0002550994287824523, - "loss": 0.6754, + "epoch": 1.7482088024564995, + "grad_norm": 0.26953125, + "learning_rate": 0.00025866630614797243, + "loss": 0.6604, "step": 1708 }, { - "epoch": 1.8271077908217717, - "grad_norm": 0.28125, - "learning_rate": 0.0002548921010613374, - "loss": 0.6981, + "epoch": 1.752302968270215, + "grad_norm": 0.26953125, + "learning_rate": 0.00025847418889326867, + "loss": 0.7133, "step": 1712 }, { - "epoch": 1.8313767342582712, - "grad_norm": 0.298828125, - "learning_rate": 0.00025468438044968296, - "loss": 0.6771, + "epoch": 1.7563971340839304, + "grad_norm": 0.27734375, + "learning_rate": 0.0002582816978993428, + "loss": 0.6795, "step": 1716 }, { - "epoch": 1.8356456776947705, - "grad_norm": 0.265625, - "learning_rate": 0.00025447626772554064, - "loss": 0.6968, + "epoch": 1.7604912998976459, + "grad_norm": 0.26953125, + "learning_rate": 0.0002580888338294072, + "loss": 0.6114, "step": 1720 }, { - "epoch": 1.83991462113127, - "grad_norm": 0.265625, - "learning_rate": 0.00025426776366843065, - "loss": 0.7235, + "epoch": 1.7645854657113613, + "grad_norm": 0.2578125, + "learning_rate": 0.00025789559734795984, + "loss": 0.6771, "step": 1724 }, { - "epoch": 1.8441835645677696, + "epoch": 1.7686796315250768, "grad_norm": 0.255859375, - "learning_rate": 0.0002540588690593391, - "loss": 0.6633, + "learning_rate": 0.0002577019891207816, + "loss": 0.6623, "step": 1728 }, { - "epoch": 1.8484525080042689, + "epoch": 1.7727737973387923, "grad_norm": 0.2734375, - "learning_rate": 0.000253849584680715, - "loss": 0.6679, + "learning_rate": 0.00025750800981493434, + "loss": 0.6542, "step": 1732 }, { - "epoch": 1.8527214514407684, - "grad_norm": 0.2734375, - "learning_rate": 0.0002536399113164671, - "loss": 0.711, + "epoch": 1.7768679631525077, + "grad_norm": 0.2578125, + "learning_rate": 0.0002573136600987584, + "loss": 0.6879, "step": 1736 }, { - "epoch": 1.856990394877268, - "grad_norm": 0.275390625, - "learning_rate": 0.0002534298497519615, - "loss": 0.694, + "epoch": 1.7809621289662232, + "grad_norm": 0.298828125, + "learning_rate": 0.0002571189406418702, + "loss": 0.6624, "step": 1740 }, { - "epoch": 1.8612593383137672, - "grad_norm": 0.267578125, - "learning_rate": 0.00025321940077401814, - "loss": 0.661, + "epoch": 1.7850562947799387, + "grad_norm": 0.25390625, + "learning_rate": 0.0002569238521151603, + "loss": 0.7035, "step": 1744 }, { - "epoch": 1.865528281750267, - "grad_norm": 0.267578125, - "learning_rate": 0.00025300856517090805, - "loss": 0.7393, + "epoch": 1.7891504605936541, + "grad_norm": 0.283203125, + "learning_rate": 0.0002567283951907908, + "loss": 0.6695, "step": 1748 }, { - "epoch": 1.8697972251867663, - "grad_norm": 0.275390625, - "learning_rate": 0.00025279734373235055, - "loss": 0.7019, + "epoch": 1.7932446264073696, + "grad_norm": 0.279296875, + "learning_rate": 0.000256532570542193, + "loss": 0.6326, "step": 1752 }, { - "epoch": 1.8740661686232656, - "grad_norm": 0.2890625, - "learning_rate": 0.0002525857372495102, - "loss": 0.6417, + "epoch": 1.797338792221085, + "grad_norm": 0.255859375, + "learning_rate": 0.0002563363788440652, + "loss": 0.6895, "step": 1756 }, { - "epoch": 1.8783351120597653, - "grad_norm": 0.263671875, - "learning_rate": 0.0002523737465149936, - "loss": 0.6431, + "epoch": 1.8014329580348005, + "grad_norm": 0.267578125, + "learning_rate": 0.00025613982077237043, + "loss": 0.6529, "step": 1760 }, { - "epoch": 1.8826040554962646, - "grad_norm": 0.2578125, - "learning_rate": 0.0002521613723228468, - "loss": 0.6928, + "epoch": 1.805527123848516, + "grad_norm": 0.263671875, + "learning_rate": 0.0002559428970043338, + "loss": 0.6558, "step": 1764 }, { - "epoch": 1.8868729989327642, - "grad_norm": 0.2578125, - "learning_rate": 0.0002519486154685521, - "loss": 0.6203, + "epoch": 1.8096212896622315, + "grad_norm": 0.271484375, + "learning_rate": 0.00025574560821844066, + "loss": 0.7197, "step": 1768 }, { - "epoch": 1.8911419423692637, - "grad_norm": 0.2578125, - "learning_rate": 0.00025173547674902524, - "loss": 0.6691, + "epoch": 1.813715455475947, + "grad_norm": 0.2734375, + "learning_rate": 0.0002555479550944338, + "loss": 0.6678, "step": 1772 }, { - "epoch": 1.895410885805763, - "grad_norm": 0.255859375, - "learning_rate": 0.00025152195696261206, - "loss": 0.6797, + "epoch": 1.8178096212896624, + "grad_norm": 0.275390625, + "learning_rate": 0.0002553499383133115, + "loss": 0.6602, "step": 1776 }, { - "epoch": 1.8996798292422625, - "grad_norm": 0.265625, - "learning_rate": 0.0002513080569090859, - "loss": 0.6709, + "epoch": 1.8219037871033776, + "grad_norm": 0.25390625, + "learning_rate": 0.0002551515585573248, + "loss": 0.6831, "step": 1780 }, { - "epoch": 1.903948772678762, - "grad_norm": 0.275390625, - "learning_rate": 0.00025109377738964455, - "loss": 0.6488, + "epoch": 1.825997952917093, + "grad_norm": 0.255859375, + "learning_rate": 0.0002549528165099755, + "loss": 0.6853, "step": 1784 }, { - "epoch": 1.9082177161152614, - "grad_norm": 0.251953125, - "learning_rate": 0.00025087911920690695, - "loss": 0.6707, + "epoch": 1.8300921187308086, + "grad_norm": 0.2734375, + "learning_rate": 0.00025475371285601356, + "loss": 0.6836, "step": 1788 }, { - "epoch": 1.912486659551761, - "grad_norm": 0.318359375, - "learning_rate": 0.0002506640831649104, - "loss": 0.7067, + "epoch": 1.834186284544524, + "grad_norm": 0.275390625, + "learning_rate": 0.00025455424828143473, + "loss": 0.6681, "step": 1792 }, { - "epoch": 1.9167556029882604, - "grad_norm": 0.255859375, - "learning_rate": 0.0002504486700691077, - "loss": 0.6498, + "epoch": 1.8382804503582395, + "grad_norm": 0.275390625, + "learning_rate": 0.0002543544234734786, + "loss": 0.6962, "step": 1796 }, { - "epoch": 1.9210245464247597, - "grad_norm": 0.255859375, - "learning_rate": 0.0002502328807263638, - "loss": 0.6911, + "epoch": 1.842374616171955, + "grad_norm": 0.267578125, + "learning_rate": 0.00025415423912062557, + "loss": 0.6646, "step": 1800 }, { - "epoch": 1.9252934898612595, - "grad_norm": 0.263671875, - "learning_rate": 0.00025001671594495316, - "loss": 0.6766, + "epoch": 1.8464687819856704, + "grad_norm": 0.251953125, + "learning_rate": 0.00025395369591259503, + "loss": 0.647, "step": 1804 }, { - "epoch": 1.9295624332977588, - "grad_norm": 0.259765625, - "learning_rate": 0.0002498001765345563, - "loss": 0.7028, + "epoch": 1.850562947799386, + "grad_norm": 0.27734375, + "learning_rate": 0.00025375279454034264, + "loss": 0.6682, "step": 1808 }, { - "epoch": 1.9338313767342583, - "grad_norm": 0.2431640625, - "learning_rate": 0.00024958326330625695, - "loss": 0.677, + "epoch": 1.8546571136131014, + "grad_norm": 0.259765625, + "learning_rate": 0.00025355153569605823, + "loss": 0.6989, "step": 1812 }, { - "epoch": 1.9381003201707578, - "grad_norm": 0.283203125, - "learning_rate": 0.0002493659770725392, - "loss": 0.6952, + "epoch": 1.8587512794268168, + "grad_norm": 0.25390625, + "learning_rate": 0.00025334992007316315, + "loss": 0.7076, "step": 1816 }, { - "epoch": 1.9423692636072571, + "epoch": 1.862845445240532, "grad_norm": 0.26953125, - "learning_rate": 0.0002491483186472842, - "loss": 0.6376, + "learning_rate": 0.00025314794836630807, + "loss": 0.6588, "step": 1820 }, { - "epoch": 1.9466382070437567, - "grad_norm": 0.271484375, - "learning_rate": 0.00024893028884576725, - "loss": 0.6701, + "epoch": 1.8669396110542475, + "grad_norm": 0.263671875, + "learning_rate": 0.0002529456212713705, + "loss": 0.6762, "step": 1824 }, { - "epoch": 1.9509071504802562, - "grad_norm": 0.287109375, - "learning_rate": 0.0002487118884846546, - "loss": 0.7071, + "epoch": 1.871033776867963, + "grad_norm": 0.255859375, + "learning_rate": 0.0002527429394854524, + "loss": 0.7133, "step": 1828 }, { - "epoch": 1.9551760939167555, + "epoch": 1.8751279426816785, "grad_norm": 0.2578125, - "learning_rate": 0.0002484931183820006, - "loss": 0.6462, + "learning_rate": 0.0002525399037068778, + "loss": 0.7536, "step": 1832 }, { - "epoch": 1.959445037353255, - "grad_norm": 0.271484375, - "learning_rate": 0.0002482739793572445, - "loss": 0.6398, + "epoch": 1.879222108495394, + "grad_norm": 0.259765625, + "learning_rate": 0.00025233651463519045, + "loss": 0.6799, "step": 1836 }, { - "epoch": 1.9637139807897546, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002480544722312074, - "loss": 0.6437, + "epoch": 1.8833162743091094, + "grad_norm": 0.275390625, + "learning_rate": 0.00025213277297115124, + "loss": 0.6846, "step": 1840 }, { - "epoch": 1.9679829242262539, - "grad_norm": 0.26953125, - "learning_rate": 0.0002478345978260892, - "loss": 0.6784, + "epoch": 1.8874104401228249, + "grad_norm": 0.265625, + "learning_rate": 0.0002519286794167359, + "loss": 0.6479, "step": 1844 }, { - "epoch": 1.9722518676627536, - "grad_norm": 0.251953125, - "learning_rate": 0.00024761435696546544, - "loss": 0.7093, + "epoch": 1.8915046059365404, + "grad_norm": 0.279296875, + "learning_rate": 0.00025172423467513267, + "loss": 0.6588, "step": 1848 }, { - "epoch": 1.976520811099253, - "grad_norm": 0.275390625, - "learning_rate": 0.0002473937504742844, - "loss": 0.7183, + "epoch": 1.8955987717502558, + "grad_norm": 0.283203125, + "learning_rate": 0.0002515194394507396, + "loss": 0.6707, "step": 1852 }, { - "epoch": 1.9807897545357525, - "grad_norm": 0.275390625, - "learning_rate": 0.0002471727791788637, - "loss": 0.7027, + "epoch": 1.8996929375639713, + "grad_norm": 0.259765625, + "learning_rate": 0.00025131429444916247, + "loss": 0.6688, "step": 1856 }, { - "epoch": 1.985058697972252, - "grad_norm": 0.296875, - "learning_rate": 0.0002469514439068876, - "loss": 0.7189, + "epoch": 1.9037871033776868, + "grad_norm": 0.267578125, + "learning_rate": 0.00025110880037721215, + "loss": 0.6671, "step": 1860 }, { - "epoch": 1.9893276414087513, - "grad_norm": 0.2734375, - "learning_rate": 0.0002467297454874036, - "loss": 0.6315, + "epoch": 1.9078812691914022, + "grad_norm": 0.26953125, + "learning_rate": 0.00025090295794290214, + "loss": 0.6216, "step": 1864 }, { - "epoch": 1.9935965848452508, - "grad_norm": 0.26171875, - "learning_rate": 0.0002465076847508194, - "loss": 0.7154, + "epoch": 1.9119754350051177, + "grad_norm": 0.2578125, + "learning_rate": 0.00025069676785544623, + "loss": 0.6815, "step": 1868 }, { - "epoch": 1.9978655282817503, - "grad_norm": 0.26171875, - "learning_rate": 0.00024628526252889983, - "loss": 0.6901, + "epoch": 1.9160696008188332, + "grad_norm": 0.251953125, + "learning_rate": 0.00025049023082525607, + "loss": 0.7354, "step": 1872 }, { - "epoch": 2.0021344717182497, - "grad_norm": 0.23828125, - "learning_rate": 0.0002460624796547638, - "loss": 0.6671, + "epoch": 1.9201637666325486, + "grad_norm": 0.2734375, + "learning_rate": 0.0002502833475639386, + "loss": 0.6874, "step": 1876 }, { - "epoch": 2.0064034151547494, - "grad_norm": 0.26953125, - "learning_rate": 0.0002458393369628809, - "loss": 0.5825, + "epoch": 1.924257932446264, + "grad_norm": 0.26171875, + "learning_rate": 0.0002500761187842937, + "loss": 0.6667, "step": 1880 }, { - "epoch": 2.0106723585912487, - "grad_norm": 0.283203125, - "learning_rate": 0.0002456158352890689, - "loss": 0.5568, + "epoch": 1.9283520982599796, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002498685452003118, + "loss": 0.6612, "step": 1884 }, { - "epoch": 2.014941302027748, - "grad_norm": 0.265625, - "learning_rate": 0.00024539197547048967, - "loss": 0.5848, + "epoch": 1.932446264073695, + "grad_norm": 0.287109375, + "learning_rate": 0.0002496606275271711, + "loss": 0.6915, "step": 1888 }, { - "epoch": 2.0192102454642478, - "grad_norm": 0.265625, - "learning_rate": 0.0002451677583456469, - "loss": 0.6178, + "epoch": 1.9365404298874105, + "grad_norm": 0.267578125, + "learning_rate": 0.0002494523664812355, + "loss": 0.6593, "step": 1892 }, { - "epoch": 2.023479188900747, - "grad_norm": 0.287109375, - "learning_rate": 0.00024494318475438254, - "loss": 0.5733, + "epoch": 1.940634595701126, + "grad_norm": 0.275390625, + "learning_rate": 0.00024924376278005197, + "loss": 0.6741, "step": 1896 }, { - "epoch": 2.0277481323372464, - "grad_norm": 0.2734375, - "learning_rate": 0.00024471825553787374, - "loss": 0.6131, - "step": 1900 + "epoch": 1.9447287615148414, + "grad_norm": 0.25390625, + "learning_rate": 0.000249034817142348, + "loss": 0.6915, + "step": 1900 }, { - "epoch": 2.032017075773746, - "grad_norm": 0.275390625, - "learning_rate": 0.0002444929715386297, - "loss": 0.616, + "epoch": 1.9488229273285569, + "grad_norm": 0.3125, + "learning_rate": 0.0002488255302880293, + "loss": 0.6599, "step": 1904 }, { - "epoch": 2.0362860192102454, - "grad_norm": 0.265625, - "learning_rate": 0.00024426733360048854, - "loss": 0.5947, + "epoch": 1.9529170931422724, + "grad_norm": 0.259765625, + "learning_rate": 0.000248615902938177, + "loss": 0.64, "step": 1908 }, { - "epoch": 2.0405549626467447, - "grad_norm": 0.27734375, - "learning_rate": 0.0002440413425686141, - "loss": 0.6151, + "epoch": 1.9570112589559878, + "grad_norm": 0.263671875, + "learning_rate": 0.00024840593581504567, + "loss": 0.6631, "step": 1912 }, { - "epoch": 2.0448239060832445, - "grad_norm": 0.275390625, - "learning_rate": 0.00024381499928949275, - "loss": 0.613, + "epoch": 1.9611054247697033, + "grad_norm": 0.2734375, + "learning_rate": 0.0002481956296420603, + "loss": 0.6774, "step": 1916 }, { - "epoch": 2.049092849519744, - "grad_norm": 0.255859375, - "learning_rate": 0.0002435883046109304, - "loss": 0.596, + "epoch": 1.9651995905834188, + "grad_norm": 0.275390625, + "learning_rate": 0.0002479849851438142, + "loss": 0.6896, "step": 1920 }, { - "epoch": 2.0533617929562435, - "grad_norm": 0.244140625, - "learning_rate": 0.00024336125938204906, - "loss": 0.5233, + "epoch": 1.9692937563971342, + "grad_norm": 0.267578125, + "learning_rate": 0.0002477740030460663, + "loss": 0.6584, "step": 1924 }, { - "epoch": 2.057630736392743, - "grad_norm": 0.259765625, - "learning_rate": 0.00024313386445328375, - "loss": 0.6059, + "epoch": 1.9733879222108497, + "grad_norm": 0.2734375, + "learning_rate": 0.0002475626840757386, + "loss": 0.6339, "step": 1928 }, { - "epoch": 2.061899679829242, - "grad_norm": 0.294921875, - "learning_rate": 0.0002429061206763795, - "loss": 0.6198, + "epoch": 1.9774820880245652, + "grad_norm": 0.26171875, + "learning_rate": 0.000247351028960914, + "loss": 0.6797, "step": 1932 }, { - "epoch": 2.066168623265742, - "grad_norm": 0.26953125, - "learning_rate": 0.00024267802890438783, - "loss": 0.6005, + "epoch": 1.9815762538382804, + "grad_norm": 0.28515625, + "learning_rate": 0.0002471390384308334, + "loss": 0.6773, "step": 1936 }, { - "epoch": 2.070437566702241, - "grad_norm": 0.283203125, - "learning_rate": 0.00024244958999166386, - "loss": 0.6141, + "epoch": 1.9856704196519959, + "grad_norm": 0.271484375, + "learning_rate": 0.00024692671321589326, + "loss": 0.6526, "step": 1940 }, { - "epoch": 2.0747065101387405, - "grad_norm": 0.28515625, - "learning_rate": 0.00024222080479386298, - "loss": 0.5931, + "epoch": 1.9897645854657113, + "grad_norm": 0.279296875, + "learning_rate": 0.00024671405404764335, + "loss": 0.6519, "step": 1944 }, { - "epoch": 2.0789754535752403, - "grad_norm": 0.287109375, - "learning_rate": 0.00024199167416793753, - "loss": 0.5823, + "epoch": 1.9938587512794268, + "grad_norm": 0.28125, + "learning_rate": 0.0002465010616587841, + "loss": 0.6989, "step": 1948 }, { - "epoch": 2.0832443970117396, - "grad_norm": 0.265625, - "learning_rate": 0.0002417621989721338, - "loss": 0.6171, + "epoch": 1.9979529170931423, + "grad_norm": 0.25, + "learning_rate": 0.0002462877367831637, + "loss": 0.7191, "step": 1952 }, { - "epoch": 2.087513340448239, - "grad_norm": 0.275390625, - "learning_rate": 0.00024153238006598872, - "loss": 0.5621, + "epoch": 2.0020470829068575, + "grad_norm": 0.248046875, + "learning_rate": 0.0002460740801557763, + "loss": 0.6218, "step": 1956 }, { - "epoch": 2.0917822838847386, - "grad_norm": 0.27734375, - "learning_rate": 0.00024130221831032667, - "loss": 0.5689, + "epoch": 2.006141248720573, + "grad_norm": 0.279296875, + "learning_rate": 0.0002458600925127587, + "loss": 0.6015, "step": 1960 }, { - "epoch": 2.096051227321238, - "grad_norm": 0.294921875, - "learning_rate": 0.00024107171456725618, - "loss": 0.5939, + "epoch": 2.0102354145342884, + "grad_norm": 0.27734375, + "learning_rate": 0.0002456457745913885, + "loss": 0.5645, "step": 1964 }, { - "epoch": 2.1003201707577372, - "grad_norm": 0.263671875, - "learning_rate": 0.00024084086970016676, - "loss": 0.6377, + "epoch": 2.014329580348004, + "grad_norm": 0.2490234375, + "learning_rate": 0.00024543112713008104, + "loss": 0.6114, "step": 1968 }, { - "epoch": 2.104589114194237, - "grad_norm": 0.314453125, - "learning_rate": 0.00024060968457372561, - "loss": 0.6324, + "epoch": 2.0184237461617194, + "grad_norm": 0.267578125, + "learning_rate": 0.00024521615086838726, + "loss": 0.5825, "step": 1972 }, { - "epoch": 2.1088580576307363, - "grad_norm": 0.28125, - "learning_rate": 0.00024037816005387455, - "loss": 0.5936, + "epoch": 2.022517911975435, + "grad_norm": 0.26171875, + "learning_rate": 0.00024500084654699065, + "loss": 0.6006, "step": 1976 }, { - "epoch": 2.113127001067236, + "epoch": 2.0266120777891503, "grad_norm": 0.296875, - "learning_rate": 0.00024014629700782654, - "loss": 0.5778, + "learning_rate": 0.0002447852149077053, + "loss": 0.5932, "step": 1980 }, { - "epoch": 2.1173959445037354, - "grad_norm": 0.318359375, - "learning_rate": 0.00023991409630406258, - "loss": 0.5956, + "epoch": 2.030706243602866, + "grad_norm": 0.263671875, + "learning_rate": 0.00024456925669347294, + "loss": 0.5681, "step": 1984 }, { - "epoch": 2.1216648879402347, - "grad_norm": 0.271484375, - "learning_rate": 0.00023968155881232844, - "loss": 0.5668, + "epoch": 2.0348004094165812, + "grad_norm": 0.259765625, + "learning_rate": 0.00024435297264836043, + "loss": 0.5712, "step": 1988 }, { - "epoch": 2.1259338313767344, - "grad_norm": 0.279296875, - "learning_rate": 0.00023944868540363135, - "loss": 0.587, + "epoch": 2.0388945752302967, + "grad_norm": 0.291015625, + "learning_rate": 0.00024413636351755736, + "loss": 0.5174, "step": 1992 }, { - "epoch": 2.1302027748132337, - "grad_norm": 0.279296875, - "learning_rate": 0.00023921547695023683, - "loss": 0.5851, + "epoch": 2.042988741044012, + "grad_norm": 0.28515625, + "learning_rate": 0.00024391943004737333, + "loss": 0.6106, "step": 1996 }, { - "epoch": 2.134471718249733, - "grad_norm": 0.267578125, - "learning_rate": 0.0002389819343256653, - "loss": 0.5977, + "epoch": 2.0470829068577276, + "grad_norm": 0.2734375, + "learning_rate": 0.00024370217298523534, + "loss": 0.5328, "step": 2000 }, { - "epoch": 2.138740661686233, + "epoch": 2.051177072671443, "grad_norm": 0.267578125, - "learning_rate": 0.00023874805840468888, - "loss": 0.5932, + "learning_rate": 0.00024348459307968546, + "loss": 0.6033, "step": 2004 }, { - "epoch": 2.143009605122732, - "grad_norm": 0.2890625, - "learning_rate": 0.00023851385006332817, - "loss": 0.5572, + "epoch": 2.0552712384851586, + "grad_norm": 0.279296875, + "learning_rate": 0.00024326669108037802, + "loss": 0.5893, "step": 2008 }, { - "epoch": 2.1472785485592314, - "grad_norm": 0.291015625, - "learning_rate": 0.00023827931017884885, - "loss": 0.5813, + "epoch": 2.059365404298874, + "grad_norm": 0.267578125, + "learning_rate": 0.00024304846773807708, + "loss": 0.6343, "step": 2012 }, { - "epoch": 2.151547491995731, - "grad_norm": 0.28125, - "learning_rate": 0.00023804443962975843, - "loss": 0.5636, + "epoch": 2.0634595701125895, + "grad_norm": 0.302734375, + "learning_rate": 0.0002428299238046538, + "loss": 0.5784, "step": 2016 }, { - "epoch": 2.1558164354322304, - "grad_norm": 0.265625, - "learning_rate": 0.00023780923929580308, - "loss": 0.5689, + "epoch": 2.067553735926305, + "grad_norm": 0.271484375, + "learning_rate": 0.00024261106003308408, + "loss": 0.562, "step": 2020 }, { - "epoch": 2.1600853788687298, - "grad_norm": 0.2734375, - "learning_rate": 0.0002375737100579641, - "loss": 0.5949, + "epoch": 2.0716479017400204, + "grad_norm": 0.279296875, + "learning_rate": 0.00024239187717744567, + "loss": 0.5716, "step": 2024 }, { - "epoch": 2.1643543223052295, - "grad_norm": 0.259765625, - "learning_rate": 0.00023733785279845477, - "loss": 0.5579, + "epoch": 2.075742067553736, + "grad_norm": 0.25390625, + "learning_rate": 0.0002421723759929157, + "loss": 0.6365, "step": 2028 }, { - "epoch": 2.168623265741729, - "grad_norm": 0.2734375, - "learning_rate": 0.00023710166840071724, - "loss": 0.6293, + "epoch": 2.0798362333674514, + "grad_norm": 0.296875, + "learning_rate": 0.0002419525572357682, + "loss": 0.6373, "step": 2032 }, { - "epoch": 2.1728922091782286, - "grad_norm": 0.26171875, - "learning_rate": 0.00023686515774941867, - "loss": 0.556, + "epoch": 2.083930399181167, + "grad_norm": 0.27734375, + "learning_rate": 0.00024173242166337114, + "loss": 0.628, "step": 2036 }, { - "epoch": 2.177161152614728, - "grad_norm": 0.279296875, - "learning_rate": 0.0002366283217304485, - "loss": 0.5949, + "epoch": 2.0880245649948823, + "grad_norm": 0.263671875, + "learning_rate": 0.00024151197003418427, + "loss": 0.5754, "step": 2040 }, { - "epoch": 2.181430096051227, - "grad_norm": 0.28125, - "learning_rate": 0.00023639116123091487, - "loss": 0.6119, + "epoch": 2.092118730808598, + "grad_norm": 0.27734375, + "learning_rate": 0.0002412912031077562, + "loss": 0.584, "step": 2044 }, { - "epoch": 2.185699039487727, - "grad_norm": 0.263671875, - "learning_rate": 0.00023615367713914119, - "loss": 0.6332, + "epoch": 2.0962128966223132, + "grad_norm": 0.291015625, + "learning_rate": 0.0002410701216447219, + "loss": 0.6191, "step": 2048 }, { - "epoch": 2.1899679829242262, - "grad_norm": 0.2890625, - "learning_rate": 0.00023591587034466302, - "loss": 0.6104, + "epoch": 2.1003070624360287, + "grad_norm": 0.265625, + "learning_rate": 0.0002408487264068, + "loss": 0.657, "step": 2052 }, { - "epoch": 2.1942369263607255, - "grad_norm": 0.279296875, - "learning_rate": 0.0002356777417382247, - "loss": 0.5749, + "epoch": 2.104401228249744, + "grad_norm": 0.275390625, + "learning_rate": 0.00024062701815679032, + "loss": 0.595, "step": 2056 }, { - "epoch": 2.1985058697972253, - "grad_norm": 0.314453125, - "learning_rate": 0.0002354392922117758, - "loss": 0.5365, + "epoch": 2.1084953940634596, + "grad_norm": 0.287109375, + "learning_rate": 0.00024040499765857093, + "loss": 0.6026, "step": 2060 }, { - "epoch": 2.2027748132337246, - "grad_norm": 0.28125, - "learning_rate": 0.00023520052265846823, - "loss": 0.6188, + "epoch": 2.112589559877175, + "grad_norm": 0.302734375, + "learning_rate": 0.000240182665677096, + "loss": 0.5219, "step": 2064 }, { - "epoch": 2.207043756670224, - "grad_norm": 0.271484375, - "learning_rate": 0.0002349614339726523, - "loss": 0.5737, + "epoch": 2.1166837256908906, + "grad_norm": 0.29296875, + "learning_rate": 0.00023996002297839268, + "loss": 0.6293, "step": 2068 }, { - "epoch": 2.2113127001067236, - "grad_norm": 0.27734375, - "learning_rate": 0.00023472202704987393, - "loss": 0.6577, + "epoch": 2.120777891504606, + "grad_norm": 0.28515625, + "learning_rate": 0.00023973707032955879, + "loss": 0.6043, "step": 2072 }, { - "epoch": 2.215581643543223, - "grad_norm": 0.2890625, - "learning_rate": 0.0002344823027868709, - "loss": 0.5555, + "epoch": 2.1248720573183215, + "grad_norm": 0.291015625, + "learning_rate": 0.00023951380849875995, + "loss": 0.6114, "step": 2076 }, { - "epoch": 2.2198505869797227, - "grad_norm": 0.28515625, - "learning_rate": 0.00023424226208156967, - "loss": 0.5833, + "epoch": 2.128966223132037, + "grad_norm": 0.271484375, + "learning_rate": 0.00023929023825522715, + "loss": 0.5865, "step": 2080 }, { - "epoch": 2.224119530416222, - "grad_norm": 0.30859375, - "learning_rate": 0.00023400190583308206, - "loss": 0.5973, + "epoch": 2.1330603889457525, + "grad_norm": 0.29296875, + "learning_rate": 0.00023906636036925396, + "loss": 0.6042, "step": 2084 }, { - "epoch": 2.2283884738527213, - "grad_norm": 0.271484375, - "learning_rate": 0.00023376123494170178, - "loss": 0.5994, + "epoch": 2.137154554759468, + "grad_norm": 0.3046875, + "learning_rate": 0.00023884217561219386, + "loss": 0.6083, "step": 2088 }, { - "epoch": 2.232657417289221, - "grad_norm": 0.294921875, - "learning_rate": 0.00023352025030890094, - "loss": 0.6407, + "epoch": 2.1412487205731834, + "grad_norm": 0.28125, + "learning_rate": 0.00023861768475645772, + "loss": 0.6202, "step": 2092 }, { - "epoch": 2.2369263607257204, - "grad_norm": 0.26953125, - "learning_rate": 0.00023327895283732706, - "loss": 0.6193, + "epoch": 2.145342886386899, + "grad_norm": 0.279296875, + "learning_rate": 0.00023839288857551095, + "loss": 0.5945, "step": 2096 }, { - "epoch": 2.2411953041622197, - "grad_norm": 0.302734375, - "learning_rate": 0.00023303734343079927, - "loss": 0.6367, + "epoch": 2.1494370522006143, + "grad_norm": 0.2734375, + "learning_rate": 0.00023816778784387094, + "loss": 0.6022, "step": 2100 }, { - "epoch": 2.2454642475987194, - "grad_norm": 0.287109375, - "learning_rate": 0.00023279542299430513, - "loss": 0.5854, + "epoch": 2.15353121801433, + "grad_norm": 0.271484375, + "learning_rate": 0.00023794238333710454, + "loss": 0.5561, "step": 2104 }, { - "epoch": 2.2497331910352187, - "grad_norm": 0.2734375, - "learning_rate": 0.00023255319243399737, - "loss": 0.5991, + "epoch": 2.1576253838280453, + "grad_norm": 0.27734375, + "learning_rate": 0.00023771667583182498, + "loss": 0.5638, "step": 2108 }, { - "epoch": 2.254002134471718, - "grad_norm": 0.31640625, - "learning_rate": 0.0002323106526571901, - "loss": 0.5852, + "epoch": 2.1617195496417603, + "grad_norm": 0.28125, + "learning_rate": 0.00023749066610568968, + "loss": 0.5325, "step": 2112 }, { - "epoch": 2.258271077908218, - "grad_norm": 0.29296875, - "learning_rate": 0.0002320678045723558, - "loss": 0.5732, + "epoch": 2.1658137154554757, + "grad_norm": 0.28125, + "learning_rate": 0.00023726435493739726, + "loss": 0.5708, "step": 2116 }, { - "epoch": 2.262540021344717, - "grad_norm": 0.32421875, - "learning_rate": 0.00023182464908912177, - "loss": 0.5196, + "epoch": 2.169907881269191, + "grad_norm": 0.302734375, + "learning_rate": 0.00023703774310668483, + "loss": 0.6038, "step": 2120 }, { - "epoch": 2.266808964781217, - "grad_norm": 0.28125, - "learning_rate": 0.00023158118711826662, - "loss": 0.599, + "epoch": 2.1740020470829067, + "grad_norm": 0.2734375, + "learning_rate": 0.00023681083139432549, + "loss": 0.5861, "step": 2124 }, { - "epoch": 2.271077908217716, - "grad_norm": 0.32421875, - "learning_rate": 0.0002313374195717171, - "loss": 0.5866, + "epoch": 2.178096212896622, + "grad_norm": 0.26953125, + "learning_rate": 0.00023658362058212568, + "loss": 0.6176, "step": 2128 }, { - "epoch": 2.2753468516542155, - "grad_norm": 0.26953125, - "learning_rate": 0.0002310933473625444, - "loss": 0.5709, + "epoch": 2.1821903787103376, + "grad_norm": 0.28125, + "learning_rate": 0.00023635611145292213, + "loss": 0.5727, "step": 2132 }, { - "epoch": 2.279615795090715, - "grad_norm": 0.30078125, - "learning_rate": 0.00023084897140496102, - "loss": 0.6166, + "epoch": 2.186284544524053, + "grad_norm": 0.294921875, + "learning_rate": 0.00023612830479057957, + "loss": 0.5516, "step": 2136 }, { - "epoch": 2.2838847385272145, - "grad_norm": 0.291015625, - "learning_rate": 0.00023060429261431703, - "loss": 0.5986, + "epoch": 2.1903787103377685, + "grad_norm": 0.298828125, + "learning_rate": 0.00023590020137998787, + "loss": 0.6297, "step": 2140 }, { - "epoch": 2.288153681963714, - "grad_norm": 0.291015625, - "learning_rate": 0.000230359311907097, - "loss": 0.5848, + "epoch": 2.194472876151484, + "grad_norm": 0.2734375, + "learning_rate": 0.0002356718020070591, + "loss": 0.5994, "step": 2144 }, { - "epoch": 2.2924226254002136, - "grad_norm": 0.275390625, - "learning_rate": 0.00023011403020091622, - "loss": 0.582, + "epoch": 2.1985670419651995, + "grad_norm": 0.271484375, + "learning_rate": 0.00023544310745872532, + "loss": 0.591, "step": 2148 }, { - "epoch": 2.296691568836713, - "grad_norm": 0.283203125, - "learning_rate": 0.00022986844841451747, - "loss": 0.5702, + "epoch": 2.202661207778915, + "grad_norm": 0.2890625, + "learning_rate": 0.00023521411852293545, + "loss": 0.6033, "step": 2152 }, { - "epoch": 2.300960512273212, - "grad_norm": 0.29296875, - "learning_rate": 0.00022962256746776762, - "loss": 0.587, + "epoch": 2.2067553735926304, + "grad_norm": 0.28515625, + "learning_rate": 0.00023498483598865272, + "loss": 0.5993, "step": 2156 }, { - "epoch": 2.305229455709712, - "grad_norm": 0.30859375, - "learning_rate": 0.00022937638828165397, - "loss": 0.5707, + "epoch": 2.210849539406346, + "grad_norm": 0.291015625, + "learning_rate": 0.00023475526064585192, + "loss": 0.6182, "step": 2160 }, { - "epoch": 2.3094983991462112, - "grad_norm": 0.30078125, - "learning_rate": 0.00022912991177828095, - "loss": 0.6133, + "epoch": 2.2149437052200613, + "grad_norm": 0.275390625, + "learning_rate": 0.00023452539328551664, + "loss": 0.6129, "step": 2164 }, { - "epoch": 2.313767342582711, - "grad_norm": 0.271484375, - "learning_rate": 0.00022888313888086672, - "loss": 0.5773, + "epoch": 2.219037871033777, + "grad_norm": 0.2734375, + "learning_rate": 0.00023429523469963682, + "loss": 0.6295, "step": 2168 }, { - "epoch": 2.3180362860192103, - "grad_norm": 0.28515625, - "learning_rate": 0.0002286360705137395, - "loss": 0.602, + "epoch": 2.2231320368474923, + "grad_norm": 0.27734375, + "learning_rate": 0.0002340647856812055, + "loss": 0.5664, "step": 2172 }, { - "epoch": 2.3223052294557096, - "grad_norm": 0.279296875, - "learning_rate": 0.00022838870760233436, - "loss": 0.5929, + "epoch": 2.2272262026612077, + "grad_norm": 0.275390625, + "learning_rate": 0.0002338340470242165, + "loss": 0.6029, "step": 2176 }, { - "epoch": 2.3265741728922094, - "grad_norm": 0.310546875, - "learning_rate": 0.00022814105107318952, - "loss": 0.6213, + "epoch": 2.231320368474923, + "grad_norm": 0.283203125, + "learning_rate": 0.00023360301952366166, + "loss": 0.6056, "step": 2180 }, { - "epoch": 2.3308431163287087, + "epoch": 2.2354145342886387, "grad_norm": 0.29296875, - "learning_rate": 0.00022789310185394315, - "loss": 0.5939, + "learning_rate": 0.0002333717039755279, + "loss": 0.577, "step": 2184 }, { - "epoch": 2.335112059765208, - "grad_norm": 0.271484375, - "learning_rate": 0.00022764486087332967, - "loss": 0.5494, + "epoch": 2.239508700102354, + "grad_norm": 0.28125, + "learning_rate": 0.0002331401011767946, + "loss": 0.6348, "step": 2188 }, { - "epoch": 2.3393810032017077, - "grad_norm": 0.30859375, - "learning_rate": 0.0002273963290611762, - "loss": 0.5768, + "epoch": 2.2436028659160696, + "grad_norm": 0.3046875, + "learning_rate": 0.00023290821192543083, + "loss": 0.6237, "step": 2192 }, { - "epoch": 2.343649946638207, + "epoch": 2.247697031729785, "grad_norm": 0.28125, - "learning_rate": 0.0002271475073483994, - "loss": 0.6515, + "learning_rate": 0.00023267603702039263, + "loss": 0.5623, "step": 2196 }, { - "epoch": 2.3479188900747063, - "grad_norm": 0.27734375, - "learning_rate": 0.0002268983966670018, - "loss": 0.6567, + "epoch": 2.2517911975435005, + "grad_norm": 0.2890625, + "learning_rate": 0.0002324435772616203, + "loss": 0.5739, "step": 2200 }, { - "epoch": 2.352187833511206, - "grad_norm": 0.279296875, - "learning_rate": 0.00022664899795006818, - "loss": 0.608, + "epoch": 2.255885363357216, + "grad_norm": 0.259765625, + "learning_rate": 0.00023221083345003544, + "loss": 0.5992, "step": 2204 }, { - "epoch": 2.3564567769477054, - "grad_norm": 0.294921875, - "learning_rate": 0.00022639931213176226, - "loss": 0.5777, + "epoch": 2.2599795291709315, + "grad_norm": 0.310546875, + "learning_rate": 0.00023197780638753847, + "loss": 0.5831, "step": 2208 }, { - "epoch": 2.360725720384205, - "grad_norm": 0.306640625, - "learning_rate": 0.00022614934014732316, - "loss": 0.5906, + "epoch": 2.264073694984647, + "grad_norm": 0.29296875, + "learning_rate": 0.00023174449687700564, + "loss": 0.6073, "step": 2212 }, { - "epoch": 2.3649946638207044, - "grad_norm": 0.296875, - "learning_rate": 0.00022589908293306187, - "loss": 0.5678, + "epoch": 2.2681678607983624, + "grad_norm": 0.265625, + "learning_rate": 0.00023151090572228635, + "loss": 0.563, "step": 2216 }, { - "epoch": 2.3692636072572038, - "grad_norm": 0.30078125, - "learning_rate": 0.0002256485414263578, - "loss": 0.564, + "epoch": 2.272262026612078, + "grad_norm": 0.3125, + "learning_rate": 0.0002312770337282005, + "loss": 0.5428, "step": 2220 }, { - "epoch": 2.3735325506937035, - "grad_norm": 0.27734375, - "learning_rate": 0.00022539771656565518, - "loss": 0.5967, + "epoch": 2.2763561924257933, + "grad_norm": 0.279296875, + "learning_rate": 0.00023104288170053543, + "loss": 0.5443, "step": 2224 }, { - "epoch": 2.377801494130203, - "grad_norm": 0.283203125, - "learning_rate": 0.00022514660929045963, - "loss": 0.5272, + "epoch": 2.280450358239509, + "grad_norm": 0.259765625, + "learning_rate": 0.0002308084504460435, + "loss": 0.5721, "step": 2228 }, { - "epoch": 2.382070437566702, - "grad_norm": 0.271484375, - "learning_rate": 0.00022489522054133448, - "loss": 0.5527, + "epoch": 2.2845445240532243, + "grad_norm": 0.2890625, + "learning_rate": 0.00023057374077243884, + "loss": 0.5796, "step": 2232 }, { - "epoch": 2.386339381003202, - "grad_norm": 0.27734375, - "learning_rate": 0.0002246435512598975, - "loss": 0.5927, + "epoch": 2.2886386898669397, + "grad_norm": 0.29296875, + "learning_rate": 0.00023033875348839526, + "loss": 0.6163, "step": 2236 }, { - "epoch": 2.390608324439701, - "grad_norm": 0.29296875, - "learning_rate": 0.0002243916023888172, - "loss": 0.6094, + "epoch": 2.292732855680655, + "grad_norm": 0.28515625, + "learning_rate": 0.00023010348940354262, + "loss": 0.5588, "step": 2240 }, { - "epoch": 2.3948772678762005, - "grad_norm": 0.30078125, - "learning_rate": 0.00022413937487180928, - "loss": 0.5965, + "epoch": 2.2968270214943707, + "grad_norm": 0.283203125, + "learning_rate": 0.0002298679493284648, + "loss": 0.6182, "step": 2244 }, { - "epoch": 2.3991462113127002, + "epoch": 2.300921187308086, "grad_norm": 0.28515625, - "learning_rate": 0.00022388686965363328, - "loss": 0.5922, + "learning_rate": 0.00022963213407469643, + "loss": 0.6273, "step": 2248 }, { - "epoch": 2.4034151547491995, - "grad_norm": 0.29296875, - "learning_rate": 0.00022363408768008878, - "loss": 0.6087, + "epoch": 2.3050153531218016, + "grad_norm": 0.279296875, + "learning_rate": 0.00022939604445472027, + "loss": 0.5913, "step": 2252 }, { - "epoch": 2.4076840981856993, - "grad_norm": 0.28125, - "learning_rate": 0.0002233810298980121, - "loss": 0.6428, + "epoch": 2.309109518935517, + "grad_norm": 0.28515625, + "learning_rate": 0.00022915968128196443, + "loss": 0.605, "step": 2256 }, { - "epoch": 2.4119530416221986, - "grad_norm": 0.279296875, - "learning_rate": 0.00022312769725527256, - "loss": 0.6142, + "epoch": 2.313203684749232, + "grad_norm": 0.30859375, + "learning_rate": 0.00022892304537079945, + "loss": 0.5575, "step": 2260 }, { - "epoch": 2.416221985058698, - "grad_norm": 0.279296875, - "learning_rate": 0.00022287409070076905, - "loss": 0.6185, + "epoch": 2.317297850562948, + "grad_norm": 0.30078125, + "learning_rate": 0.00022868613753653565, + "loss": 0.6122, "step": 2264 }, { - "epoch": 2.420490928495197, - "grad_norm": 0.306640625, - "learning_rate": 0.00022262021118442645, - "loss": 0.58, + "epoch": 2.321392016376663, + "grad_norm": 0.318359375, + "learning_rate": 0.00022844895859542016, + "loss": 0.644, "step": 2268 }, { - "epoch": 2.424759871931697, - "grad_norm": 0.29296875, - "learning_rate": 0.0002223660596571921, - "loss": 0.5625, + "epoch": 2.325486182190379, + "grad_norm": 0.30078125, + "learning_rate": 0.00022821150936463427, + "loss": 0.6069, "step": 2272 }, { - "epoch": 2.4290288153681963, - "grad_norm": 0.26171875, - "learning_rate": 0.00022211163707103215, - "loss": 0.5471, + "epoch": 2.329580348004094, + "grad_norm": 0.275390625, + "learning_rate": 0.0002279737906622905, + "loss": 0.5912, "step": 2276 }, { - "epoch": 2.433297758804696, - "grad_norm": 0.29296875, - "learning_rate": 0.000221856944378928, - "loss": 0.5963, + "epoch": 2.3336745138178094, + "grad_norm": 0.2890625, + "learning_rate": 0.00022773580330742973, + "loss": 0.5825, "step": 2280 }, { - "epoch": 2.4375667022411953, - "grad_norm": 0.27734375, - "learning_rate": 0.00022160198253487286, - "loss": 0.5698, + "epoch": 2.337768679631525, + "grad_norm": 0.267578125, + "learning_rate": 0.00022749754812001856, + "loss": 0.6167, "step": 2284 }, { - "epoch": 2.4418356456776946, - "grad_norm": 0.3046875, - "learning_rate": 0.00022134675249386808, - "loss": 0.5675, + "epoch": 2.3418628454452404, + "grad_norm": 0.27734375, + "learning_rate": 0.0002272590259209464, + "loss": 0.5872, "step": 2288 }, { - "epoch": 2.4461045891141944, - "grad_norm": 0.3125, - "learning_rate": 0.00022109125521191955, - "loss": 0.5796, + "epoch": 2.345957011258956, + "grad_norm": 0.298828125, + "learning_rate": 0.00022702023753202257, + "loss": 0.5571, "step": 2292 }, { - "epoch": 2.4503735325506937, - "grad_norm": 0.3125, - "learning_rate": 0.00022083549164603417, - "loss": 0.608, + "epoch": 2.3500511770726713, + "grad_norm": 0.28125, + "learning_rate": 0.0002267811837759735, + "loss": 0.5777, "step": 2296 }, { - "epoch": 2.454642475987193, - "grad_norm": 0.2890625, - "learning_rate": 0.0002205794627542163, - "loss": 0.6231, + "epoch": 2.3541453428863868, + "grad_norm": 0.28515625, + "learning_rate": 0.00022654186547644003, + "loss": 0.5692, "step": 2300 }, { - "epoch": 2.4589114194236927, - "grad_norm": 0.283203125, - "learning_rate": 0.00022032316949546405, - "loss": 0.6001, + "epoch": 2.3582395087001022, + "grad_norm": 0.298828125, + "learning_rate": 0.00022630228345797435, + "loss": 0.5789, "step": 2304 }, { - "epoch": 2.463180362860192, - "grad_norm": 0.279296875, - "learning_rate": 0.00022006661282976576, - "loss": 0.5945, + "epoch": 2.3623336745138177, + "grad_norm": 0.302734375, + "learning_rate": 0.00022606243854603729, + "loss": 0.6044, "step": 2308 }, { - "epoch": 2.4674493062966913, - "grad_norm": 0.310546875, - "learning_rate": 0.0002198097937180965, - "loss": 0.5861, + "epoch": 2.366427840327533, + "grad_norm": 0.294921875, + "learning_rate": 0.0002258223315669956, + "loss": 0.6143, "step": 2312 }, { - "epoch": 2.471718249733191, + "epoch": 2.3705220061412486, "grad_norm": 0.28125, - "learning_rate": 0.0002195527131224142, - "loss": 0.592, + "learning_rate": 0.00022558196334811873, + "loss": 0.5742, "step": 2316 }, { - "epoch": 2.4759871931696904, - "grad_norm": 0.28515625, - "learning_rate": 0.00021929537200565642, - "loss": 0.5731, + "epoch": 2.374616171954964, + "grad_norm": 0.291015625, + "learning_rate": 0.0002253413347175764, + "loss": 0.6163, "step": 2320 }, { - "epoch": 2.48025613660619, - "grad_norm": 0.3203125, - "learning_rate": 0.00021903777133173635, - "loss": 0.6447, + "epoch": 2.3787103377686796, + "grad_norm": 0.30078125, + "learning_rate": 0.00022510044650443547, + "loss": 0.612, "step": 2324 }, { - "epoch": 2.4845250800426895, - "grad_norm": 0.294921875, - "learning_rate": 0.00021877991206553963, - "loss": 0.5882, + "epoch": 2.382804503582395, + "grad_norm": 0.318359375, + "learning_rate": 0.00022485929953865714, + "loss": 0.5886, "step": 2328 }, { - "epoch": 2.4887940234791888, - "grad_norm": 0.283203125, - "learning_rate": 0.00021852179517292027, - "loss": 0.5735, + "epoch": 2.3868986693961105, + "grad_norm": 0.30859375, + "learning_rate": 0.00022461789465109426, + "loss": 0.5864, "step": 2332 }, { - "epoch": 2.4930629669156885, - "grad_norm": 0.2890625, - "learning_rate": 0.00021826342162069737, - "loss": 0.5654, + "epoch": 2.390992835209826, + "grad_norm": 0.29296875, + "learning_rate": 0.00022437623267348823, + "loss": 0.6113, "step": 2336 }, { - "epoch": 2.497331910352188, - "grad_norm": 0.306640625, - "learning_rate": 0.00021800479237665135, - "loss": 0.6594, + "epoch": 2.3950870010235414, + "grad_norm": 0.283203125, + "learning_rate": 0.00022413431443846617, + "loss": 0.6025, "step": 2340 }, { - "epoch": 2.5016008537886876, - "grad_norm": 0.2890625, - "learning_rate": 0.00021774590840952044, - "loss": 0.5994, + "epoch": 2.399181166837257, + "grad_norm": 0.298828125, + "learning_rate": 0.00022389214077953823, + "loss": 0.6034, "step": 2344 }, { - "epoch": 2.505869797225187, - "grad_norm": 0.283203125, - "learning_rate": 0.0002174867706889969, - "loss": 0.611, + "epoch": 2.4032753326509724, + "grad_norm": 0.298828125, + "learning_rate": 0.00022364971253109462, + "loss": 0.666, "step": 2348 }, { - "epoch": 2.510138740661686, - "grad_norm": 0.302734375, - "learning_rate": 0.00021722738018572352, - "loss": 0.6217, + "epoch": 2.407369498464688, + "grad_norm": 0.283203125, + "learning_rate": 0.00022340703052840257, + "loss": 0.5732, "step": 2352 }, { - "epoch": 2.5144076840981855, - "grad_norm": 0.3046875, - "learning_rate": 0.00021696773787128988, - "loss": 0.6403, + "epoch": 2.4114636642784033, + "grad_norm": 0.298828125, + "learning_rate": 0.0002231640956076037, + "loss": 0.6258, "step": 2356 }, { - "epoch": 2.5186766275346852, - "grad_norm": 0.3046875, - "learning_rate": 0.00021670784471822878, - "loss": 0.6053, + "epoch": 2.4155578300921188, + "grad_norm": 0.28515625, + "learning_rate": 0.00022292090860571108, + "loss": 0.5836, "step": 2360 }, { - "epoch": 2.5229455709711845, - "grad_norm": 0.294921875, - "learning_rate": 0.00021644770170001256, - "loss": 0.6154, + "epoch": 2.4196519959058342, + "grad_norm": 0.28125, + "learning_rate": 0.00022267747036060627, + "loss": 0.5722, "step": 2364 }, { - "epoch": 2.5272145144076843, - "grad_norm": 0.302734375, - "learning_rate": 0.00021618730979104954, - "loss": 0.5762, + "epoch": 2.4237461617195497, + "grad_norm": 0.287109375, + "learning_rate": 0.00022243378171103636, + "loss": 0.5837, "step": 2368 }, { - "epoch": 2.5314834578441836, - "grad_norm": 0.294921875, - "learning_rate": 0.00021592666996668022, - "loss": 0.5574, + "epoch": 2.427840327533265, + "grad_norm": 0.2734375, + "learning_rate": 0.00022218984349661134, + "loss": 0.6311, "step": 2372 }, { - "epoch": 2.535752401280683, - "grad_norm": 0.30859375, - "learning_rate": 0.00021566578320317376, - "loss": 0.611, + "epoch": 2.4319344933469806, + "grad_norm": 0.265625, + "learning_rate": 0.00022194565655780102, + "loss": 0.6041, "step": 2376 }, { - "epoch": 2.5400213447171827, - "grad_norm": 0.283203125, - "learning_rate": 0.00021540465047772422, - "loss": 0.6169, + "epoch": 2.436028659160696, + "grad_norm": 0.3046875, + "learning_rate": 0.00022170122173593206, + "loss": 0.5853, "step": 2380 }, { - "epoch": 2.544290288153682, - "grad_norm": 0.287109375, - "learning_rate": 0.000215143272768447, - "loss": 0.522, + "epoch": 2.4401228249744116, + "grad_norm": 0.3125, + "learning_rate": 0.00022145653987318536, + "loss": 0.6081, "step": 2384 }, { - "epoch": 2.5485592315901813, - "grad_norm": 0.3046875, - "learning_rate": 0.00021488165105437514, - "loss": 0.5935, + "epoch": 2.444216990788127, + "grad_norm": 0.294921875, + "learning_rate": 0.0002212116118125929, + "loss": 0.5846, "step": 2388 }, { - "epoch": 2.552828175026681, - "grad_norm": 0.275390625, - "learning_rate": 0.00021461978631545558, - "loss": 0.6303, + "epoch": 2.4483111566018425, + "grad_norm": 0.287109375, + "learning_rate": 0.00022096643839803482, + "loss": 0.5953, "step": 2392 }, { - "epoch": 2.5570971184631803, - "grad_norm": 0.287109375, - "learning_rate": 0.0002143576795325455, - "loss": 0.6188, + "epoch": 2.452405322415558, + "grad_norm": 0.298828125, + "learning_rate": 0.00022072102047423673, + "loss": 0.554, "step": 2396 }, { - "epoch": 2.5613660618996796, - "grad_norm": 0.298828125, - "learning_rate": 0.00021409533168740888, - "loss": 0.5681, + "epoch": 2.4564994882292734, + "grad_norm": 0.29296875, + "learning_rate": 0.0002204753588867667, + "loss": 0.6111, "step": 2400 }, { - "epoch": 2.5656350053361794, - "grad_norm": 0.3125, - "learning_rate": 0.0002138327437627125, - "loss": 0.5694, + "epoch": 2.460593654042989, + "grad_norm": 0.2890625, + "learning_rate": 0.0002202294544820321, + "loss": 0.6074, "step": 2404 }, { - "epoch": 2.5699039487726787, - "grad_norm": 0.302734375, - "learning_rate": 0.00021356991674202245, - "loss": 0.5456, + "epoch": 2.4646878198567044, + "grad_norm": 0.30859375, + "learning_rate": 0.00021998330810727725, + "loss": 0.6092, "step": 2408 }, { - "epoch": 2.5741728922091784, - "grad_norm": 0.302734375, - "learning_rate": 0.00021330685160980033, - "loss": 0.6229, + "epoch": 2.46878198567042, + "grad_norm": 0.3125, + "learning_rate": 0.00021973692061057987, + "loss": 0.5686, "step": 2412 }, { - "epoch": 2.5784418356456777, - "grad_norm": 0.310546875, - "learning_rate": 0.00021304354935139966, - "loss": 0.6189, + "epoch": 2.472876151484135, + "grad_norm": 0.287109375, + "learning_rate": 0.0002194902928408486, + "loss": 0.6034, "step": 2416 }, { - "epoch": 2.582710779082177, - "grad_norm": 0.310546875, - "learning_rate": 0.0002127800109530622, - "loss": 0.5721, + "epoch": 2.4769703172978508, + "grad_norm": 0.28515625, + "learning_rate": 0.0002192434256478199, + "loss": 0.5837, "step": 2420 }, { - "epoch": 2.5869797225186764, - "grad_norm": 0.298828125, - "learning_rate": 0.00021251623740191412, - "loss": 0.609, + "epoch": 2.481064483111566, + "grad_norm": 0.27734375, + "learning_rate": 0.00021899631988205506, + "loss": 0.5938, "step": 2424 }, { - "epoch": 2.591248665955176, - "grad_norm": 0.29296875, - "learning_rate": 0.0002122522296859625, - "loss": 0.6232, + "epoch": 2.4851586489252817, + "grad_norm": 0.296875, + "learning_rate": 0.00021874897639493745, + "loss": 0.5849, "step": 2428 }, { - "epoch": 2.5955176093916754, - "grad_norm": 0.3046875, - "learning_rate": 0.00021198798879409146, - "loss": 0.6303, + "epoch": 2.4892528147389967, + "grad_norm": 0.283203125, + "learning_rate": 0.00021850139603866946, + "loss": 0.6057, "step": 2432 }, { - "epoch": 2.599786552828175, - "grad_norm": 0.291015625, - "learning_rate": 0.00021172351571605851, - "loss": 0.6504, + "epoch": 2.493346980552712, + "grad_norm": 0.279296875, + "learning_rate": 0.0002182535796662696, + "loss": 0.555, "step": 2436 }, { - "epoch": 2.6040554962646745, - "grad_norm": 0.30078125, - "learning_rate": 0.0002114588114424909, - "loss": 0.5888, + "epoch": 2.4974411463664277, + "grad_norm": 0.287109375, + "learning_rate": 0.00021800552813156947, + "loss": 0.5598, "step": 2440 }, { - "epoch": 2.608324439701174, - "grad_norm": 0.265625, - "learning_rate": 0.00021119387696488174, - "loss": 0.5791, + "epoch": 2.501535312180143, + "grad_norm": 0.279296875, + "learning_rate": 0.00021775724228921108, + "loss": 0.6042, "step": 2444 }, { - "epoch": 2.6125933831376735, - "grad_norm": 0.3046875, - "learning_rate": 0.00021092871327558658, - "loss": 0.608, + "epoch": 2.5056294779938586, + "grad_norm": 0.314453125, + "learning_rate": 0.00021750872299464358, + "loss": 0.5955, "step": 2448 }, { - "epoch": 2.616862326574173, - "grad_norm": 0.298828125, - "learning_rate": 0.00021066332136781947, - "loss": 0.636, + "epoch": 2.509723643807574, + "grad_norm": 0.3046875, + "learning_rate": 0.00021725997110412043, + "loss": 0.5756, "step": 2452 }, { - "epoch": 2.6211312700106726, - "grad_norm": 0.3046875, - "learning_rate": 0.00021039770223564913, - "loss": 0.5648, + "epoch": 2.5138178096212895, + "grad_norm": 0.28125, + "learning_rate": 0.0002170109874746967, + "loss": 0.5865, "step": 2456 }, { - "epoch": 2.625400213447172, - "grad_norm": 0.28125, - "learning_rate": 0.00021013185687399565, - "loss": 0.5866, + "epoch": 2.517911975435005, + "grad_norm": 0.287109375, + "learning_rate": 0.00021676177296422566, + "loss": 0.6014, "step": 2460 }, { - "epoch": 2.629669156883671, - "grad_norm": 0.314453125, - "learning_rate": 0.00020986578627862627, - "loss": 0.6756, + "epoch": 2.5220061412487205, + "grad_norm": 0.291015625, + "learning_rate": 0.00021651232843135617, + "loss": 0.5804, "step": 2464 }, { - "epoch": 2.6339381003201705, - "grad_norm": 0.326171875, - "learning_rate": 0.00020959949144615199, - "loss": 0.6176, + "epoch": 2.526100307062436, + "grad_norm": 0.3046875, + "learning_rate": 0.00021626265473552965, + "loss": 0.6267, "step": 2468 }, { - "epoch": 2.6382070437566703, - "grad_norm": 0.296875, - "learning_rate": 0.0002093329733740237, - "loss": 0.5987, + "epoch": 2.5301944728761514, + "grad_norm": 0.30078125, + "learning_rate": 0.00021601275273697696, + "loss": 0.5653, "step": 2472 }, { - "epoch": 2.6424759871931696, - "grad_norm": 0.298828125, - "learning_rate": 0.00020906623306052852, - "loss": 0.5734, + "epoch": 2.534288638689867, + "grad_norm": 0.296875, + "learning_rate": 0.00021576262329671568, + "loss": 0.5678, "step": 2476 }, { - "epoch": 2.6467449306296693, - "grad_norm": 0.302734375, - "learning_rate": 0.00020879927150478588, - "loss": 0.6388, + "epoch": 2.5383828045035823, + "grad_norm": 0.326171875, + "learning_rate": 0.00021551226727654696, + "loss": 0.5694, "step": 2480 }, { - "epoch": 2.6510138740661686, - "grad_norm": 0.32421875, - "learning_rate": 0.0002085320897067441, - "loss": 0.6202, + "epoch": 2.542476970317298, + "grad_norm": 0.287109375, + "learning_rate": 0.00021526168553905265, + "loss": 0.5915, "step": 2484 }, { - "epoch": 2.655282817502668, - "grad_norm": 0.279296875, - "learning_rate": 0.00020826468866717637, - "loss": 0.6261, + "epoch": 2.5465711361310133, + "grad_norm": 0.294921875, + "learning_rate": 0.00021501087894759227, + "loss": 0.5917, "step": 2488 }, { - "epoch": 2.6595517609391677, - "grad_norm": 0.296875, - "learning_rate": 0.00020799706938767697, - "loss": 0.5854, + "epoch": 2.5506653019447287, + "grad_norm": 0.298828125, + "learning_rate": 0.00021475984836629998, + "loss": 0.583, "step": 2492 }, { - "epoch": 2.663820704375667, - "grad_norm": 0.267578125, - "learning_rate": 0.00020772923287065776, - "loss": 0.6162, + "epoch": 2.554759467758444, + "grad_norm": 0.29296875, + "learning_rate": 0.0002145085946600819, + "loss": 0.6088, "step": 2496 }, { - "epoch": 2.6680896478121667, - "grad_norm": 0.3203125, - "learning_rate": 0.00020746118011934428, - "loss": 0.6165, + "epoch": 2.5588536335721597, + "grad_norm": 0.265625, + "learning_rate": 0.00021425711869461266, + "loss": 0.5994, "step": 2500 }, { - "epoch": 2.672358591248666, - "grad_norm": 0.291015625, - "learning_rate": 0.00020719291213777208, - "loss": 0.643, + "epoch": 2.562947799385875, + "grad_norm": 0.2890625, + "learning_rate": 0.00021400542133633276, + "loss": 0.5656, "step": 2504 }, { - "epoch": 2.6766275346851653, - "grad_norm": 0.306640625, - "learning_rate": 0.00020692442993078264, - "loss": 0.6155, + "epoch": 2.5670419651995906, + "grad_norm": 0.2890625, + "learning_rate": 0.00021375350345244557, + "loss": 0.6544, "step": 2508 }, { - "epoch": 2.6808964781216647, - "grad_norm": 0.287109375, - "learning_rate": 0.00020665573450402017, - "loss": 0.6108, + "epoch": 2.571136131013306, + "grad_norm": 0.2890625, + "learning_rate": 0.00021350136591091415, + "loss": 0.5995, "step": 2512 }, { - "epoch": 2.6851654215581644, - "grad_norm": 0.359375, - "learning_rate": 0.00020638682686392734, - "loss": 0.6177, + "epoch": 2.5752302968270215, + "grad_norm": 0.287109375, + "learning_rate": 0.00021324900958045843, + "loss": 0.6408, "step": 2516 }, { - "epoch": 2.6894343649946637, - "grad_norm": 0.298828125, - "learning_rate": 0.00020611770801774168, - "loss": 0.5889, + "epoch": 2.579324462640737, + "grad_norm": 0.302734375, + "learning_rate": 0.00021299643533055214, + "loss": 0.62, "step": 2520 }, { - "epoch": 2.6937033084311635, - "grad_norm": 0.3125, - "learning_rate": 0.00020584837897349203, - "loss": 0.5726, + "epoch": 2.5834186284544525, + "grad_norm": 0.29296875, + "learning_rate": 0.0002127436440314199, + "loss": 0.5754, "step": 2524 }, { - "epoch": 2.6979722518676628, - "grad_norm": 0.28125, - "learning_rate": 0.00020557884073999432, - "loss": 0.5945, + "epoch": 2.587512794268168, + "grad_norm": 0.310546875, + "learning_rate": 0.000212490636554034, + "loss": 0.5731, "step": 2528 }, { - "epoch": 2.702241195304162, - "grad_norm": 0.302734375, - "learning_rate": 0.0002053090943268482, - "loss": 0.6184, + "epoch": 2.5916069600818834, + "grad_norm": 0.30078125, + "learning_rate": 0.00021223741377011178, + "loss": 0.582, "step": 2532 }, { - "epoch": 2.706510138740662, - "grad_norm": 0.310546875, - "learning_rate": 0.00020503914074443302, - "loss": 0.6313, + "epoch": 2.595701125895599, + "grad_norm": 0.287109375, + "learning_rate": 0.00021198397655211216, + "loss": 0.5946, "step": 2536 }, { - "epoch": 2.710779082177161, - "grad_norm": 0.2734375, - "learning_rate": 0.0002047689810039041, - "loss": 0.5739, + "epoch": 2.5997952917093143, + "grad_norm": 0.30859375, + "learning_rate": 0.00021173032577323302, + "loss": 0.6236, "step": 2540 }, { - "epoch": 2.715048025613661, - "grad_norm": 0.302734375, - "learning_rate": 0.00020449861611718896, - "loss": 0.5894, + "epoch": 2.60388945752303, + "grad_norm": 0.298828125, + "learning_rate": 0.00021147646230740814, + "loss": 0.6059, "step": 2544 }, { - "epoch": 2.71931696905016, - "grad_norm": 0.29296875, - "learning_rate": 0.00020422804709698358, - "loss": 0.6277, + "epoch": 2.6079836233367453, + "grad_norm": 0.298828125, + "learning_rate": 0.00021122238702930377, + "loss": 0.5637, "step": 2548 }, { - "epoch": 2.7235859124866595, - "grad_norm": 0.28515625, - "learning_rate": 0.00020395727495674856, - "loss": 0.646, + "epoch": 2.6120777891504607, + "grad_norm": 0.275390625, + "learning_rate": 0.00021096810081431628, + "loss": 0.5897, "step": 2552 }, { - "epoch": 2.727854855923159, + "epoch": 2.616171954964176, "grad_norm": 0.3046875, - "learning_rate": 0.0002036863007107052, - "loss": 0.6199, + "learning_rate": 0.00021071360453856866, + "loss": 0.5732, "step": 2556 }, { - "epoch": 2.7321237993596585, - "grad_norm": 0.30859375, - "learning_rate": 0.000203415125373832, - "loss": 0.5702, + "epoch": 2.6202661207778917, + "grad_norm": 0.28515625, + "learning_rate": 0.00021045889907890763, + "loss": 0.6022, "step": 2560 }, { - "epoch": 2.736392742796158, - "grad_norm": 0.29296875, - "learning_rate": 0.0002031437499618606, - "loss": 0.5644, + "epoch": 2.6243602865916067, + "grad_norm": 0.306640625, + "learning_rate": 0.00021020398531290067, + "loss": 0.6029, "step": 2564 }, { - "epoch": 2.7406616862326576, - "grad_norm": 0.283203125, - "learning_rate": 0.00020287217549127195, - "loss": 0.55, + "epoch": 2.6284544524053226, + "grad_norm": 0.291015625, + "learning_rate": 0.00020994886411883297, + "loss": 0.6171, "step": 2568 }, { - "epoch": 2.744930629669157, - "grad_norm": 0.30078125, - "learning_rate": 0.00020260040297929272, - "loss": 0.6404, + "epoch": 2.6325486182190376, + "grad_norm": 0.267578125, + "learning_rate": 0.00020969353637570443, + "loss": 0.5882, "step": 2572 }, { - "epoch": 2.749199573105656, - "grad_norm": 0.302734375, - "learning_rate": 0.00020232843344389145, - "loss": 0.6179, + "epoch": 2.6366427840327535, + "grad_norm": 0.30859375, + "learning_rate": 0.0002094380029632265, + "loss": 0.6273, "step": 2576 }, { - "epoch": 2.753468516542156, - "grad_norm": 0.30078125, - "learning_rate": 0.0002020562679037744, - "loss": 0.5967, + "epoch": 2.6407369498464686, + "grad_norm": 0.28125, + "learning_rate": 0.00020918226476181935, + "loss": 0.5672, "step": 2580 }, { - "epoch": 2.7577374599786553, - "grad_norm": 0.294921875, - "learning_rate": 0.0002017839073783823, - "loss": 0.596, + "epoch": 2.6448311156601845, + "grad_norm": 0.314453125, + "learning_rate": 0.00020892632265260866, + "loss": 0.6087, "step": 2584 }, { - "epoch": 2.762006403415155, - "grad_norm": 0.330078125, - "learning_rate": 0.00020151135288788607, - "loss": 0.6351, + "epoch": 2.6489252814738995, + "grad_norm": 0.298828125, + "learning_rate": 0.00020867017751742266, + "loss": 0.5993, "step": 2588 }, { - "epoch": 2.7662753468516543, - "grad_norm": 0.3125, - "learning_rate": 0.0002012386054531831, - "loss": 0.611, + "epoch": 2.6530194472876154, + "grad_norm": 0.30078125, + "learning_rate": 0.00020841383023878916, + "loss": 0.5903, "step": 2592 }, { - "epoch": 2.7705442902881536, - "grad_norm": 0.318359375, - "learning_rate": 0.00020096566609589364, - "loss": 0.5849, + "epoch": 2.6571136131013304, + "grad_norm": 0.291015625, + "learning_rate": 0.00020815728169993233, + "loss": 0.5986, "step": 2596 }, { - "epoch": 2.774813233724653, - "grad_norm": 0.287109375, - "learning_rate": 0.00020069253583835677, - "loss": 0.5559, + "epoch": 2.661207778915046, + "grad_norm": 0.28515625, + "learning_rate": 0.0002079005327847699, + "loss": 0.6083, "step": 2600 }, { - "epoch": 2.7790821771611527, - "grad_norm": 0.283203125, - "learning_rate": 0.0002004192157036265, - "loss": 0.6028, + "epoch": 2.6653019447287614, + "grad_norm": 0.298828125, + "learning_rate": 0.00020764358437790994, + "loss": 0.5921, "step": 2604 }, { - "epoch": 2.783351120597652, - "grad_norm": 0.28515625, - "learning_rate": 0.00020014570671546828, - "loss": 0.6117, + "epoch": 2.669396110542477, + "grad_norm": 0.30078125, + "learning_rate": 0.00020738643736464772, + "loss": 0.5782, "step": 2608 }, { - "epoch": 2.7876200640341517, - "grad_norm": 0.302734375, - "learning_rate": 0.00019987200989835468, - "loss": 0.6328, + "epoch": 2.6734902763561923, + "grad_norm": 0.322265625, + "learning_rate": 0.00020712909263096297, + "loss": 0.6629, "step": 2612 }, { - "epoch": 2.791889007470651, - "grad_norm": 0.30078125, - "learning_rate": 0.00019959812627746198, - "loss": 0.5806, + "epoch": 2.6775844421699078, + "grad_norm": 0.291015625, + "learning_rate": 0.00020687155106351661, + "loss": 0.6093, "step": 2616 }, { - "epoch": 2.7961579509071504, + "epoch": 2.6816786079836232, "grad_norm": 0.30859375, - "learning_rate": 0.00019932405687866616, - "loss": 0.6413, + "learning_rate": 0.00020661381354964762, + "loss": 0.5907, "step": 2620 }, { - "epoch": 2.80042689434365, - "grad_norm": 0.28125, - "learning_rate": 0.00019904980272853902, - "loss": 0.5669, + "epoch": 2.6857727737973387, + "grad_norm": 0.30859375, + "learning_rate": 0.00020635588097737015, + "loss": 0.5855, "step": 2624 }, { - "epoch": 2.8046958377801494, - "grad_norm": 0.298828125, - "learning_rate": 0.00019877536485434435, - "loss": 0.5743, + "epoch": 2.689866939611054, + "grad_norm": 0.31640625, + "learning_rate": 0.00020609775423537053, + "loss": 0.6578, "step": 2628 }, { - "epoch": 2.8089647812166487, - "grad_norm": 0.294921875, - "learning_rate": 0.00019850074428403417, - "loss": 0.5599, + "epoch": 2.6939611054247696, + "grad_norm": 0.333984375, + "learning_rate": 0.00020583943421300405, + "loss": 0.5752, "step": 2632 }, { - "epoch": 2.8132337246531485, - "grad_norm": 0.27734375, - "learning_rate": 0.00019822594204624478, - "loss": 0.5746, + "epoch": 2.698055271238485, + "grad_norm": 0.306640625, + "learning_rate": 0.0002055809218002917, + "loss": 0.6257, "step": 2636 }, { - "epoch": 2.8175026680896478, - "grad_norm": 0.306640625, - "learning_rate": 0.000197950959170293, - "loss": 0.5978, + "epoch": 2.7021494370522006, + "grad_norm": 0.3046875, + "learning_rate": 0.00020532221788791767, + "loss": 0.6225, "step": 2640 }, { - "epoch": 2.821771611526147, - "grad_norm": 0.30078125, - "learning_rate": 0.00019767579668617219, - "loss": 0.5769, + "epoch": 2.706243602865916, + "grad_norm": 0.318359375, + "learning_rate": 0.00020506332336722572, + "loss": 0.5765, "step": 2644 }, { - "epoch": 2.826040554962647, - "grad_norm": 0.3046875, - "learning_rate": 0.00019740045562454848, - "loss": 0.6237, + "epoch": 2.7103377686796315, + "grad_norm": 0.3125, + "learning_rate": 0.00020480423913021636, + "loss": 0.6118, "step": 2648 }, { - "epoch": 2.830309498399146, - "grad_norm": 0.310546875, - "learning_rate": 0.0001971249370167569, - "loss": 0.6006, + "epoch": 2.714431934493347, + "grad_norm": 0.30078125, + "learning_rate": 0.0002045449660695439, + "loss": 0.5911, "step": 2652 }, { - "epoch": 2.834578441835646, - "grad_norm": 0.3203125, - "learning_rate": 0.0001968492418947975, - "loss": 0.5673, + "epoch": 2.7185261003070624, + "grad_norm": 0.298828125, + "learning_rate": 0.00020428550507851313, + "loss": 0.5962, "step": 2656 }, { - "epoch": 2.838847385272145, - "grad_norm": 0.291015625, - "learning_rate": 0.00019657337129133155, - "loss": 0.5596, + "epoch": 2.722620266120778, + "grad_norm": 0.27734375, + "learning_rate": 0.00020402585705107617, + "loss": 0.5941, "step": 2660 }, { - "epoch": 2.8431163287086445, - "grad_norm": 0.283203125, - "learning_rate": 0.00019629732623967753, - "loss": 0.5872, + "epoch": 2.7267144319344934, + "grad_norm": 0.306640625, + "learning_rate": 0.00020376602288182992, + "loss": 0.6027, "step": 2664 }, { - "epoch": 2.847385272145144, - "grad_norm": 0.287109375, - "learning_rate": 0.00019602110777380737, - "loss": 0.6229, + "epoch": 2.730808597748209, + "grad_norm": 0.3125, + "learning_rate": 0.0002035060034660123, + "loss": 0.5518, "step": 2668 }, { - "epoch": 2.8516542155816436, - "grad_norm": 0.302734375, - "learning_rate": 0.00019574471692834257, - "loss": 0.6037, + "epoch": 2.7349027635619243, + "grad_norm": 0.29296875, + "learning_rate": 0.00020324579969949964, + "loss": 0.6127, "step": 2672 }, { - "epoch": 2.855923159018143, - "grad_norm": 0.3125, - "learning_rate": 0.0001954681547385503, - "loss": 0.6017, + "epoch": 2.7389969293756398, + "grad_norm": 0.30859375, + "learning_rate": 0.00020298541247880343, + "loss": 0.6011, "step": 2676 }, { - "epoch": 2.8601921024546426, - "grad_norm": 0.30859375, - "learning_rate": 0.00019519142224033956, - "loss": 0.6481, + "epoch": 2.7430910951893552, + "grad_norm": 0.296875, + "learning_rate": 0.00020272484270106712, + "loss": 0.5692, "step": 2680 }, { - "epoch": 2.864461045891142, - "grad_norm": 0.29296875, - "learning_rate": 0.00019491452047025714, - "loss": 0.6017, + "epoch": 2.7471852610030707, + "grad_norm": 0.3046875, + "learning_rate": 0.0002024640912640633, + "loss": 0.6303, "step": 2684 }, { - "epoch": 2.8687299893276412, - "grad_norm": 0.296875, - "learning_rate": 0.0001946374504654841, - "loss": 0.5909, + "epoch": 2.751279426816786, + "grad_norm": 0.302734375, + "learning_rate": 0.0002022031590661904, + "loss": 0.6613, "step": 2688 }, { - "epoch": 2.872998932764141, - "grad_norm": 0.267578125, - "learning_rate": 0.00019436021326383137, - "loss": 0.6528, + "epoch": 2.7553735926305016, + "grad_norm": 0.28125, + "learning_rate": 0.00020194204700646958, + "loss": 0.6369, "step": 2692 }, { - "epoch": 2.8772678762006403, + "epoch": 2.759467758444217, "grad_norm": 0.30078125, - "learning_rate": 0.0001940828099037364, - "loss": 0.6313, + "learning_rate": 0.0002016807559845418, + "loss": 0.6217, "step": 2696 }, { - "epoch": 2.88153681963714, - "grad_norm": 0.287109375, - "learning_rate": 0.0001938052414242588, - "loss": 0.5678, + "epoch": 2.7635619242579326, + "grad_norm": 0.30859375, + "learning_rate": 0.00020141928690066446, + "loss": 0.651, "step": 2700 }, { - "epoch": 2.8858057630736393, - "grad_norm": 0.287109375, - "learning_rate": 0.00019352750886507684, - "loss": 0.5905, + "epoch": 2.767656090071648, + "grad_norm": 0.27734375, + "learning_rate": 0.0002011576406557087, + "loss": 0.5942, "step": 2704 }, { - "epoch": 2.8900747065101386, - "grad_norm": 0.27734375, - "learning_rate": 0.00019324961326648322, - "loss": 0.6025, + "epoch": 2.7717502558853635, + "grad_norm": 0.318359375, + "learning_rate": 0.0002008958181511559, + "loss": 0.6106, "step": 2708 }, { - "epoch": 2.894343649946638, - "grad_norm": 0.294921875, - "learning_rate": 0.00019297155566938142, - "loss": 0.5867, + "epoch": 2.775844421699079, + "grad_norm": 0.3125, + "learning_rate": 0.00020063382028909468, + "loss": 0.5713, "step": 2712 }, { - "epoch": 2.8986125933831377, - "grad_norm": 0.30859375, - "learning_rate": 0.00019269333711528178, - "loss": 0.6047, + "epoch": 2.7799385875127944, + "grad_norm": 0.29296875, + "learning_rate": 0.00020037164797221798, + "loss": 0.6318, "step": 2716 }, { - "epoch": 2.902881536819637, - "grad_norm": 0.30859375, - "learning_rate": 0.00019241495864629737, - "loss": 0.6194, + "epoch": 2.7840327533265095, + "grad_norm": 0.28515625, + "learning_rate": 0.00020010930210381973, + "loss": 0.5837, "step": 2720 }, { - "epoch": 2.9071504802561368, - "grad_norm": 0.3125, - "learning_rate": 0.00019213642130514036, - "loss": 0.6012, + "epoch": 2.7881269191402254, + "grad_norm": 0.294921875, + "learning_rate": 0.00019984678358779182, + "loss": 0.5899, "step": 2724 }, { - "epoch": 2.911419423692636, - "grad_norm": 0.310546875, - "learning_rate": 0.000191857726135118, - "loss": 0.5665, + "epoch": 2.7922210849539404, + "grad_norm": 0.314453125, + "learning_rate": 0.000199584093328621, + "loss": 0.5779, "step": 2728 }, { - "epoch": 2.9156883671291354, - "grad_norm": 0.294921875, - "learning_rate": 0.00019157887418012857, - "loss": 0.6171, + "epoch": 2.7963152507676563, + "grad_norm": 0.328125, + "learning_rate": 0.00019932123223138573, + "loss": 0.6003, "step": 2732 }, { - "epoch": 2.919957310565635, - "grad_norm": 0.31640625, - "learning_rate": 0.00019129986648465784, - "loss": 0.5935, + "epoch": 2.8004094165813713, + "grad_norm": 0.283203125, + "learning_rate": 0.0001990582012017531, + "loss": 0.5879, "step": 2736 }, { - "epoch": 2.9242262540021344, - "grad_norm": 0.287109375, - "learning_rate": 0.00019102070409377474, - "loss": 0.6052, + "epoch": 2.8045035823950872, + "grad_norm": 0.30859375, + "learning_rate": 0.00019879500114597569, + "loss": 0.5881, "step": 2740 }, { - "epoch": 2.928495197438634, - "grad_norm": 0.298828125, - "learning_rate": 0.0001907413880531278, - "loss": 0.5968, + "epoch": 2.8085977482088023, + "grad_norm": 0.283203125, + "learning_rate": 0.00019853163297088843, + "loss": 0.5774, "step": 2744 }, { - "epoch": 2.9327641408751335, - "grad_norm": 0.283203125, - "learning_rate": 0.0001904619194089409, - "loss": 0.6177, + "epoch": 2.812691914022518, + "grad_norm": 0.310546875, + "learning_rate": 0.00019826809758390548, + "loss": 0.6252, "step": 2748 }, { - "epoch": 2.937033084311633, - "grad_norm": 0.306640625, - "learning_rate": 0.00019018229920800966, - "loss": 0.6111, + "epoch": 2.816786079836233, + "grad_norm": 0.287109375, + "learning_rate": 0.00019800439589301715, + "loss": 0.65, "step": 2752 }, { - "epoch": 2.941302027748132, - "grad_norm": 0.294921875, - "learning_rate": 0.00018990252849769733, - "loss": 0.6115, + "epoch": 2.8208802456499487, + "grad_norm": 0.318359375, + "learning_rate": 0.00019774052880678676, + "loss": 0.6235, "step": 2756 }, { - "epoch": 2.945570971184632, - "grad_norm": 0.30078125, - "learning_rate": 0.00018962260832593086, - "loss": 0.5695, + "epoch": 2.824974411463664, + "grad_norm": 0.30859375, + "learning_rate": 0.00019747649723434732, + "loss": 0.5734, "step": 2760 }, { - "epoch": 2.949839914621131, - "grad_norm": 0.298828125, - "learning_rate": 0.00018934253974119716, - "loss": 0.5689, + "epoch": 2.8290685772773796, + "grad_norm": 0.27734375, + "learning_rate": 0.00019721230208539882, + "loss": 0.6161, "step": 2764 }, { - "epoch": 2.954108858057631, - "grad_norm": 0.29296875, - "learning_rate": 0.0001890623237925389, - "loss": 0.6078, + "epoch": 2.833162743091095, + "grad_norm": 0.30859375, + "learning_rate": 0.00019694794427020461, + "loss": 0.5959, "step": 2768 }, { - "epoch": 2.95837780149413, - "grad_norm": 0.296875, - "learning_rate": 0.00018878196152955087, - "loss": 0.64, + "epoch": 2.8372569089048105, + "grad_norm": 0.302734375, + "learning_rate": 0.0001966834246995887, + "loss": 0.5986, "step": 2772 }, { - "epoch": 2.9626467449306295, - "grad_norm": 0.314453125, - "learning_rate": 0.00018850145400237578, - "loss": 0.6209, + "epoch": 2.841351074718526, + "grad_norm": 0.3046875, + "learning_rate": 0.00019641874428493223, + "loss": 0.612, "step": 2776 }, { - "epoch": 2.9669156883671293, - "grad_norm": 0.2890625, - "learning_rate": 0.0001882208022617005, - "loss": 0.5918, + "epoch": 2.8454452405322415, + "grad_norm": 0.296875, + "learning_rate": 0.00019615390393817067, + "loss": 0.5609, "step": 2780 }, { - "epoch": 2.9711846318036286, - "grad_norm": 0.287109375, - "learning_rate": 0.00018794000735875208, - "loss": 0.6031, + "epoch": 2.849539406345957, + "grad_norm": 0.3046875, + "learning_rate": 0.00019588890457179035, + "loss": 0.6219, "step": 2784 }, { - "epoch": 2.9754535752401283, - "grad_norm": 0.306640625, - "learning_rate": 0.0001876590703452939, - "loss": 0.6224, + "epoch": 2.8536335721596724, + "grad_norm": 0.302734375, + "learning_rate": 0.00019562374709882564, + "loss": 0.6171, "step": 2788 }, { - "epoch": 2.9797225186766276, - "grad_norm": 0.287109375, - "learning_rate": 0.0001873779922736214, - "loss": 0.5607, + "epoch": 2.857727737973388, + "grad_norm": 0.310546875, + "learning_rate": 0.00019535843243285566, + "loss": 0.6393, "step": 2792 }, { - "epoch": 2.983991462113127, - "grad_norm": 0.3046875, - "learning_rate": 0.0001870967741965586, - "loss": 0.6302, + "epoch": 2.8618219037871033, + "grad_norm": 0.31640625, + "learning_rate": 0.00019509296148800093, + "loss": 0.5619, "step": 2796 }, { - "epoch": 2.9882604055496262, - "grad_norm": 0.310546875, - "learning_rate": 0.00018681541716745388, - "loss": 0.6344, + "epoch": 2.865916069600819, + "grad_norm": 0.298828125, + "learning_rate": 0.0001948273351789207, + "loss": 0.6155, "step": 2800 }, { - "epoch": 2.992529348986126, - "grad_norm": 0.294921875, - "learning_rate": 0.000186533922240176, - "loss": 0.5858, + "epoch": 2.8700102354145343, + "grad_norm": 0.30859375, + "learning_rate": 0.00019456155442080928, + "loss": 0.6029, "step": 2804 }, { - "epoch": 2.9967982924226253, - "grad_norm": 0.3046875, - "learning_rate": 0.00018625229046911033, - "loss": 0.5743, + "epoch": 2.8741044012282497, + "grad_norm": 0.298828125, + "learning_rate": 0.00019429562012939316, + "loss": 0.593, "step": 2808 }, { - "epoch": 3.001067235859125, - "grad_norm": 0.275390625, - "learning_rate": 0.0001859705229091548, - "loss": 0.6399, + "epoch": 2.878198567041965, + "grad_norm": 0.310546875, + "learning_rate": 0.00019402953322092805, + "loss": 0.6088, "step": 2812 }, { - "epoch": 3.0053361792956244, - "grad_norm": 0.26953125, - "learning_rate": 0.0001856886206157159, - "loss": 0.4803, + "epoch": 2.8822927328556807, + "grad_norm": 0.296875, + "learning_rate": 0.00019376329461219516, + "loss": 0.64, "step": 2816 }, { - "epoch": 3.0096051227321237, - "grad_norm": 0.314453125, - "learning_rate": 0.00018540658464470485, - "loss": 0.539, + "epoch": 2.886386898669396, + "grad_norm": 0.29296875, + "learning_rate": 0.00019349690522049853, + "loss": 0.6028, "step": 2820 }, { - "epoch": 3.0138740661686234, - "grad_norm": 0.306640625, - "learning_rate": 0.00018512441605253357, - "loss": 0.502, + "epoch": 2.8904810644831116, + "grad_norm": 0.287109375, + "learning_rate": 0.00019323036596366174, + "loss": 0.6205, "step": 2824 }, { - "epoch": 3.0181430096051227, - "grad_norm": 0.310546875, - "learning_rate": 0.0001848421158961107, - "loss": 0.5283, + "epoch": 2.894575230296827, + "grad_norm": 0.291015625, + "learning_rate": 0.00019296367776002466, + "loss": 0.6108, "step": 2828 }, { - "epoch": 3.022411953041622, - "grad_norm": 0.302734375, - "learning_rate": 0.00018455968523283775, - "loss": 0.4959, + "epoch": 2.8986693961105425, + "grad_norm": 0.296875, + "learning_rate": 0.00019269684152844037, + "loss": 0.63, "step": 2832 }, { - "epoch": 3.0266808964781218, - "grad_norm": 0.314453125, - "learning_rate": 0.00018427712512060493, - "loss": 0.5063, + "epoch": 2.902763561924258, + "grad_norm": 0.294921875, + "learning_rate": 0.00019242985818827198, + "loss": 0.6686, "step": 2836 }, { - "epoch": 3.030949839914621, - "grad_norm": 0.3046875, - "learning_rate": 0.00018399443661778748, - "loss": 0.5275, + "epoch": 2.9068577277379735, + "grad_norm": 0.318359375, + "learning_rate": 0.0001921627286593894, + "loss": 0.5533, "step": 2840 }, { - "epoch": 3.0352187833511204, - "grad_norm": 0.3046875, - "learning_rate": 0.00018371162078324147, - "loss": 0.4883, + "epoch": 2.910951893551689, + "grad_norm": 0.298828125, + "learning_rate": 0.00019189545386216625, + "loss": 0.5544, "step": 2844 }, { - "epoch": 3.03948772678762, - "grad_norm": 0.3125, - "learning_rate": 0.00018342867867629986, - "loss": 0.4861, + "epoch": 2.9150460593654044, + "grad_norm": 0.306640625, + "learning_rate": 0.00019162803471747667, + "loss": 0.6254, "step": 2848 }, { - "epoch": 3.0437566702241194, - "grad_norm": 0.30078125, - "learning_rate": 0.0001831456113567687, - "loss": 0.4827, + "epoch": 2.91914022517912, + "grad_norm": 0.3125, + "learning_rate": 0.0001913604721466922, + "loss": 0.5852, "step": 2852 }, { - "epoch": 3.048025613660619, - "grad_norm": 0.28125, - "learning_rate": 0.00018286241988492292, - "loss": 0.4858, + "epoch": 2.9232343909928353, + "grad_norm": 0.3125, + "learning_rate": 0.00019109276707167839, + "loss": 0.5763, "step": 2856 }, { - "epoch": 3.0522945570971185, - "grad_norm": 0.306640625, - "learning_rate": 0.0001825791053215026, - "loss": 0.5651, + "epoch": 2.927328556806551, + "grad_norm": 0.28515625, + "learning_rate": 0.00019082492041479188, + "loss": 0.624, "step": 2860 }, { - "epoch": 3.056563500533618, - "grad_norm": 0.298828125, - "learning_rate": 0.00018229566872770886, - "loss": 0.5377, + "epoch": 2.9314227226202663, + "grad_norm": 0.28515625, + "learning_rate": 0.00019055693309887712, + "loss": 0.6114, "step": 2864 }, { - "epoch": 3.0608324439701176, + "epoch": 2.9355168884339817, "grad_norm": 0.296875, - "learning_rate": 0.00018201211116519977, - "loss": 0.4624, + "learning_rate": 0.00019028880604726316, + "loss": 0.6243, "step": 2868 }, { - "epoch": 3.065101387406617, - "grad_norm": 0.29296875, - "learning_rate": 0.00018172843369608673, - "loss": 0.4872, + "epoch": 2.939611054247697, + "grad_norm": 0.306640625, + "learning_rate": 0.00019002054018376052, + "loss": 0.595, "step": 2872 }, { - "epoch": 3.069370330843116, - "grad_norm": 0.31640625, - "learning_rate": 0.00018144463738293005, - "loss": 0.5293, + "epoch": 2.943705220061412, + "grad_norm": 0.296875, + "learning_rate": 0.00018975213643265799, + "loss": 0.5596, "step": 2876 }, { - "epoch": 3.073639274279616, - "grad_norm": 0.31640625, - "learning_rate": 0.00018116072328873536, - "loss": 0.5263, + "epoch": 2.947799385875128, + "grad_norm": 0.306640625, + "learning_rate": 0.00018948359571871936, + "loss": 0.5698, "step": 2880 }, { - "epoch": 3.0779082177161152, - "grad_norm": 0.298828125, - "learning_rate": 0.0001808766924769493, - "loss": 0.5073, + "epoch": 2.951893551688843, + "grad_norm": 0.3125, + "learning_rate": 0.00018921491896718048, + "loss": 0.6207, "step": 2884 }, { - "epoch": 3.0821771611526145, - "grad_norm": 0.2890625, - "learning_rate": 0.00018059254601145586, - "loss": 0.5269, + "epoch": 2.955987717502559, + "grad_norm": 0.306640625, + "learning_rate": 0.00018894610710374574, + "loss": 0.6277, "step": 2888 }, { - "epoch": 3.0864461045891143, - "grad_norm": 0.310546875, - "learning_rate": 0.00018030828495657218, - "loss": 0.4948, + "epoch": 2.960081883316274, + "grad_norm": 0.33203125, + "learning_rate": 0.00018867716105458506, + "loss": 0.5984, "step": 2892 }, { - "epoch": 3.0907150480256136, - "grad_norm": 0.306640625, - "learning_rate": 0.00018002391037704447, - "loss": 0.5062, + "epoch": 2.96417604912999, + "grad_norm": 0.291015625, + "learning_rate": 0.00018840808174633088, + "loss": 0.5292, "step": 2896 }, { - "epoch": 3.0949839914621133, - "grad_norm": 0.326171875, - "learning_rate": 0.00017973942333804436, - "loss": 0.527, + "epoch": 2.968270214943705, + "grad_norm": 0.29296875, + "learning_rate": 0.00018813887010607456, + "loss": 0.5835, "step": 2900 }, { - "epoch": 3.0992529348986126, - "grad_norm": 0.31640625, - "learning_rate": 0.00017945482490516464, - "loss": 0.503, + "epoch": 2.972364380757421, + "grad_norm": 0.302734375, + "learning_rate": 0.00018786952706136343, + "loss": 0.6388, "step": 2904 }, { - "epoch": 3.103521878335112, - "grad_norm": 0.310546875, - "learning_rate": 0.0001791701161444153, - "loss": 0.5259, + "epoch": 2.976458546571136, + "grad_norm": 0.298828125, + "learning_rate": 0.00018760005354019765, + "loss": 0.6434, "step": 2908 }, { - "epoch": 3.1077908217716117, - "grad_norm": 0.337890625, - "learning_rate": 0.00017888529812221964, - "loss": 0.5332, + "epoch": 2.9805527123848514, + "grad_norm": 0.287109375, + "learning_rate": 0.00018733045047102695, + "loss": 0.5781, "step": 2912 }, { - "epoch": 3.112059765208111, - "grad_norm": 0.302734375, - "learning_rate": 0.00017860037190541023, - "loss": 0.5093, + "epoch": 2.984646878198567, + "grad_norm": 0.30859375, + "learning_rate": 0.00018706071878274718, + "loss": 0.6452, "step": 2916 }, { - "epoch": 3.1163287086446103, - "grad_norm": 0.30859375, - "learning_rate": 0.0001783153385612248, - "loss": 0.5255, + "epoch": 2.9887410440122824, + "grad_norm": 0.283203125, + "learning_rate": 0.00018679085940469763, + "loss": 0.6056, "step": 2920 }, { - "epoch": 3.12059765208111, - "grad_norm": 0.302734375, - "learning_rate": 0.00017803019915730252, - "loss": 0.5086, + "epoch": 2.992835209825998, + "grad_norm": 0.291015625, + "learning_rate": 0.00018652087326665741, + "loss": 0.5462, "step": 2924 }, { - "epoch": 3.1248665955176094, - "grad_norm": 0.310546875, - "learning_rate": 0.0001777449547616796, - "loss": 0.4953, + "epoch": 2.9969293756397133, + "grad_norm": 0.291015625, + "learning_rate": 0.00018625076129884233, + "loss": 0.6397, "step": 2928 }, { - "epoch": 3.1291355389541087, - "grad_norm": 0.326171875, - "learning_rate": 0.00017745960644278567, - "loss": 0.4607, + "epoch": 3.0010235414534288, + "grad_norm": 0.2890625, + "learning_rate": 0.0001859805244319017, + "loss": 0.5911, "step": 2932 }, { - "epoch": 3.1334044823906084, - "grad_norm": 0.306640625, - "learning_rate": 0.00017717415526943958, - "loss": 0.4885, + "epoch": 3.0051177072671442, + "grad_norm": 0.28125, + "learning_rate": 0.00018571016359691532, + "loss": 0.5014, "step": 2936 }, { - "epoch": 3.1376734258271077, - "grad_norm": 0.3125, - "learning_rate": 0.0001768886023108455, - "loss": 0.5169, + "epoch": 3.0092118730808597, + "grad_norm": 0.28515625, + "learning_rate": 0.00018543967972539004, + "loss": 0.5192, "step": 2940 }, { - "epoch": 3.1419423692636075, - "grad_norm": 0.31640625, - "learning_rate": 0.0001766029486365887, - "loss": 0.5089, + "epoch": 3.013306038894575, + "grad_norm": 0.30859375, + "learning_rate": 0.00018516907374925651, + "loss": 0.5236, "step": 2944 }, { - "epoch": 3.146211312700107, - "grad_norm": 0.310546875, - "learning_rate": 0.00017631719531663187, - "loss": 0.4862, + "epoch": 3.0174002047082906, + "grad_norm": 0.2890625, + "learning_rate": 0.00018489834660086624, + "loss": 0.5035, "step": 2948 }, { - "epoch": 3.150480256136606, - "grad_norm": 0.330078125, - "learning_rate": 0.00017603134342131078, - "loss": 0.5473, + "epoch": 3.021494370522006, + "grad_norm": 0.29296875, + "learning_rate": 0.00018462749921298817, + "loss": 0.5226, "step": 2952 }, { - "epoch": 3.154749199573106, + "epoch": 3.0255885363357216, "grad_norm": 0.322265625, - "learning_rate": 0.00017574539402133058, - "loss": 0.4956, + "learning_rate": 0.0001843565325188055, + "loss": 0.5353, "step": 2956 }, { - "epoch": 3.159018143009605, - "grad_norm": 0.302734375, - "learning_rate": 0.00017545934818776152, - "loss": 0.5392, + "epoch": 3.029682702149437, + "grad_norm": 0.279296875, + "learning_rate": 0.00018408544745191247, + "loss": 0.4981, "step": 2960 }, { - "epoch": 3.1632870864461045, - "grad_norm": 0.322265625, - "learning_rate": 0.00017517320699203515, - "loss": 0.4986, + "epoch": 3.0337768679631525, + "grad_norm": 0.29296875, + "learning_rate": 0.00018381424494631128, + "loss": 0.4917, "step": 2964 }, { - "epoch": 3.167556029882604, - "grad_norm": 0.306640625, - "learning_rate": 0.0001748869715059401, - "loss": 0.4916, + "epoch": 3.037871033776868, + "grad_norm": 0.302734375, + "learning_rate": 0.00018354292593640857, + "loss": 0.4919, "step": 2968 }, { - "epoch": 3.1718249733191035, - "grad_norm": 0.31640625, - "learning_rate": 0.00017460064280161832, - "loss": 0.5525, + "epoch": 3.0419651995905834, + "grad_norm": 0.322265625, + "learning_rate": 0.0001832714913570126, + "loss": 0.5445, "step": 2972 }, { - "epoch": 3.176093916755603, - "grad_norm": 0.322265625, - "learning_rate": 0.00017431422195156082, - "loss": 0.5064, + "epoch": 3.046059365404299, + "grad_norm": 0.310546875, + "learning_rate": 0.00018299994214332962, + "loss": 0.5512, "step": 2976 }, { - "epoch": 3.1803628601921026, + "epoch": 3.0501535312180144, "grad_norm": 0.314453125, - "learning_rate": 0.00017402771002860383, - "loss": 0.4823, + "learning_rate": 0.00018272827923096095, + "loss": 0.483, "step": 2980 }, { - "epoch": 3.184631803628602, - "grad_norm": 0.328125, - "learning_rate": 0.00017374110810592465, - "loss": 0.5476, + "epoch": 3.05424769703173, + "grad_norm": 0.314453125, + "learning_rate": 0.00018245650355589964, + "loss": 0.4659, "step": 2984 }, { - "epoch": 3.188900747065101, - "grad_norm": 0.322265625, - "learning_rate": 0.00017345441725703767, - "loss": 0.5754, + "epoch": 3.0583418628454453, + "grad_norm": 0.310546875, + "learning_rate": 0.00018218461605452722, + "loss": 0.5279, "step": 2988 }, { - "epoch": 3.193169690501601, - "grad_norm": 0.34765625, - "learning_rate": 0.0001731676385557905, - "loss": 0.4915, + "epoch": 3.0624360286591608, + "grad_norm": 0.29296875, + "learning_rate": 0.00018191261766361053, + "loss": 0.5276, "step": 2992 }, { - "epoch": 3.1974386339381002, - "grad_norm": 0.29296875, - "learning_rate": 0.0001728807730763596, - "loss": 0.489, + "epoch": 3.0665301944728762, + "grad_norm": 0.306640625, + "learning_rate": 0.0001816405093202985, + "loss": 0.5401, "step": 2996 }, { - "epoch": 3.2017075773746, - "grad_norm": 0.31640625, - "learning_rate": 0.0001725938218932467, - "loss": 0.4872, + "epoch": 3.0706243602865917, + "grad_norm": 0.30859375, + "learning_rate": 0.0001813682919621189, + "loss": 0.5575, "step": 3000 }, { - "epoch": 3.2059765208110993, - "grad_norm": 0.322265625, - "learning_rate": 0.00017230678608127437, - "loss": 0.5863, + "epoch": 3.074718526100307, + "grad_norm": 0.314453125, + "learning_rate": 0.00018109596652697495, + "loss": 0.5572, "step": 3004 }, { - "epoch": 3.2102454642475986, - "grad_norm": 0.310546875, - "learning_rate": 0.00017201966671558227, - "loss": 0.568, + "epoch": 3.0788126919140226, + "grad_norm": 0.32421875, + "learning_rate": 0.00018082353395314243, + "loss": 0.4961, "step": 3008 }, { - "epoch": 3.2145144076840984, - "grad_norm": 0.3203125, - "learning_rate": 0.000171732464871623, - "loss": 0.5618, + "epoch": 3.082906857727738, + "grad_norm": 0.33203125, + "learning_rate": 0.00018055099517926625, + "loss": 0.5249, "step": 3012 }, { - "epoch": 3.2187833511205977, - "grad_norm": 0.3203125, - "learning_rate": 0.00017144518162515813, - "loss": 0.4897, + "epoch": 3.0870010235414536, + "grad_norm": 0.30859375, + "learning_rate": 0.00018027835114435702, + "loss": 0.541, "step": 3016 }, { - "epoch": 3.223052294557097, - "grad_norm": 0.322265625, - "learning_rate": 0.00017115781805225404, - "loss": 0.5517, + "epoch": 3.091095189355169, + "grad_norm": 0.318359375, + "learning_rate": 0.00018000560278778825, + "loss": 0.5231, "step": 3020 }, { - "epoch": 3.2273212379935967, - "grad_norm": 0.310546875, - "learning_rate": 0.00017087037522927806, - "loss": 0.5146, + "epoch": 3.0951893551688845, + "grad_norm": 0.330078125, + "learning_rate": 0.00017973275104929276, + "loss": 0.5053, "step": 3024 }, { - "epoch": 3.231590181430096, - "grad_norm": 0.314453125, - "learning_rate": 0.0001705828542328944, - "loss": 0.4868, + "epoch": 3.0992835209826, + "grad_norm": 0.3203125, + "learning_rate": 0.0001794597968689596, + "loss": 0.4864, "step": 3028 }, { - "epoch": 3.2358591248665953, - "grad_norm": 0.341796875, - "learning_rate": 0.00017029525614005995, - "loss": 0.5151, + "epoch": 3.1033776867963154, + "grad_norm": 0.306640625, + "learning_rate": 0.0001791867411872308, + "loss": 0.4932, "step": 3032 }, { - "epoch": 3.240128068303095, - "grad_norm": 0.34375, - "learning_rate": 0.00017000758202802047, - "loss": 0.53, + "epoch": 3.107471852610031, + "grad_norm": 0.3125, + "learning_rate": 0.00017891358494489805, + "loss": 0.5039, "step": 3036 }, { - "epoch": 3.2443970117395944, - "grad_norm": 0.318359375, - "learning_rate": 0.0001697198329743065, - "loss": 0.525, + "epoch": 3.1115660184237464, + "grad_norm": 0.310546875, + "learning_rate": 0.00017864032908309946, + "loss": 0.5224, "step": 3040 }, { - "epoch": 3.2486659551760937, - "grad_norm": 0.32421875, - "learning_rate": 0.00016943201005672917, - "loss": 0.5, + "epoch": 3.115660184237462, + "grad_norm": 0.30078125, + "learning_rate": 0.00017836697454331658, + "loss": 0.5166, "step": 3044 }, { - "epoch": 3.2529348986125934, - "grad_norm": 0.32421875, - "learning_rate": 0.00016914411435337644, - "loss": 0.5364, + "epoch": 3.119754350051177, + "grad_norm": 0.287109375, + "learning_rate": 0.00017809352226737075, + "loss": 0.5291, "step": 3048 }, { - "epoch": 3.2572038420490927, - "grad_norm": 0.3359375, - "learning_rate": 0.00016885614694260877, - "loss": 0.5358, + "epoch": 3.1238485158648923, + "grad_norm": 0.30078125, + "learning_rate": 0.0001778199731974201, + "loss": 0.5146, "step": 3052 }, { - "epoch": 3.2614727854855925, - "grad_norm": 0.3046875, - "learning_rate": 0.0001685681089030552, - "loss": 0.5054, + "epoch": 3.127942681678608, + "grad_norm": 0.3203125, + "learning_rate": 0.0001775463282759563, + "loss": 0.5323, "step": 3056 }, { - "epoch": 3.265741728922092, - "grad_norm": 0.326171875, - "learning_rate": 0.00016828000131360939, - "loss": 0.5387, + "epoch": 3.1320368474923233, + "grad_norm": 0.318359375, + "learning_rate": 0.00017727258844580125, + "loss": 0.5511, "step": 3060 }, { - "epoch": 3.270010672358591, - "grad_norm": 0.318359375, - "learning_rate": 0.00016799182525342553, - "loss": 0.5186, + "epoch": 3.1361310133060387, + "grad_norm": 0.326171875, + "learning_rate": 0.00017699875465010382, + "loss": 0.5502, "step": 3064 }, { - "epoch": 3.274279615795091, - "grad_norm": 0.30859375, - "learning_rate": 0.0001677035818019142, - "loss": 0.4767, + "epoch": 3.140225179119754, + "grad_norm": 0.310546875, + "learning_rate": 0.00017672482783233668, + "loss": 0.5313, "step": 3068 }, { - "epoch": 3.27854855923159, - "grad_norm": 0.328125, - "learning_rate": 0.0001674152720387385, - "loss": 0.5128, + "epoch": 3.1443193449334697, + "grad_norm": 0.294921875, + "learning_rate": 0.00017645080893629298, + "loss": 0.4961, "step": 3072 }, { - "epoch": 3.2828175026680895, - "grad_norm": 0.337890625, - "learning_rate": 0.00016712689704380978, - "loss": 0.5026, + "epoch": 3.148413510747185, + "grad_norm": 0.310546875, + "learning_rate": 0.00017617669890608305, + "loss": 0.5336, "step": 3076 }, { - "epoch": 3.287086446104589, - "grad_norm": 0.3203125, - "learning_rate": 0.00016683845789728383, - "loss": 0.5144, + "epoch": 3.1525076765609006, + "grad_norm": 0.3125, + "learning_rate": 0.00017590249868613137, + "loss": 0.5259, "step": 3080 }, { - "epoch": 3.2913553895410885, - "grad_norm": 0.36328125, - "learning_rate": 0.00016654995567955667, - "loss": 0.5295, + "epoch": 3.156601842374616, + "grad_norm": 0.318359375, + "learning_rate": 0.000175628209221173, + "loss": 0.5445, "step": 3084 }, { - "epoch": 3.295624332977588, + "epoch": 3.1606960081883315, "grad_norm": 0.31640625, - "learning_rate": 0.00016626139147126067, - "loss": 0.5135, + "learning_rate": 0.00017535383145625056, + "loss": 0.5006, "step": 3088 }, { - "epoch": 3.2998932764140876, - "grad_norm": 0.3203125, - "learning_rate": 0.0001659727663532603, - "loss": 0.5232, + "epoch": 3.164790174002047, + "grad_norm": 0.349609375, + "learning_rate": 0.00017507936633671093, + "loss": 0.4755, "step": 3092 }, { - "epoch": 3.304162219850587, - "grad_norm": 0.30078125, - "learning_rate": 0.00016568408140664817, - "loss": 0.5302, + "epoch": 3.1688843398157625, + "grad_norm": 0.3125, + "learning_rate": 0.00017480481480820195, + "loss": 0.5028, "step": 3096 }, { - "epoch": 3.3084311632870866, + "epoch": 3.172978505629478, "grad_norm": 0.32421875, - "learning_rate": 0.00016539533771274103, - "loss": 0.541, + "learning_rate": 0.0001745301778166691, + "loss": 0.4948, "step": 3100 }, { - "epoch": 3.312700106723586, + "epoch": 3.1770726714431934, "grad_norm": 0.330078125, - "learning_rate": 0.00016510653635307566, - "loss": 0.4886, + "learning_rate": 0.00017425545630835238, + "loss": 0.489, "step": 3104 }, { - "epoch": 3.3169690501600853, - "grad_norm": 0.31640625, - "learning_rate": 0.00016481767840940482, - "loss": 0.525, + "epoch": 3.181166837256909, + "grad_norm": 0.369140625, + "learning_rate": 0.000173980651229783, + "loss": 0.518, "step": 3108 }, { - "epoch": 3.321237993596585, - "grad_norm": 0.31640625, - "learning_rate": 0.0001645287649636933, - "loss": 0.5047, + "epoch": 3.1852610030706243, + "grad_norm": 0.314453125, + "learning_rate": 0.00017370576352778008, + "loss": 0.5336, "step": 3112 }, { - "epoch": 3.3255069370330843, - "grad_norm": 0.330078125, - "learning_rate": 0.00016423979709811367, - "loss": 0.5079, + "epoch": 3.18935516888434, + "grad_norm": 0.326171875, + "learning_rate": 0.0001734307941494474, + "loss": 0.5271, "step": 3116 }, { - "epoch": 3.3297758804695836, - "grad_norm": 0.322265625, - "learning_rate": 0.00016395077589504233, - "loss": 0.526, + "epoch": 3.1934493346980553, + "grad_norm": 0.30078125, + "learning_rate": 0.00017315574404217017, + "loss": 0.5106, "step": 3120 }, { - "epoch": 3.3340448239060834, - "grad_norm": 0.328125, - "learning_rate": 0.00016366170243705563, - "loss": 0.5898, + "epoch": 3.1975435005117707, + "grad_norm": 0.310546875, + "learning_rate": 0.00017288061415361174, + "loss": 0.4721, "step": 3124 }, { - "epoch": 3.3383137673425827, - "grad_norm": 0.306640625, - "learning_rate": 0.00016337257780692552, - "loss": 0.5152, + "epoch": 3.201637666325486, + "grad_norm": 0.3359375, + "learning_rate": 0.00017260540543171036, + "loss": 0.5574, "step": 3128 }, { - "epoch": 3.342582710779082, - "grad_norm": 0.330078125, - "learning_rate": 0.00016308340308761557, - "loss": 0.5282, + "epoch": 3.2057318321392017, + "grad_norm": 0.3125, + "learning_rate": 0.00017233011882467582, + "loss": 0.5063, "step": 3132 }, { - "epoch": 3.3468516542155817, - "grad_norm": 0.33984375, - "learning_rate": 0.00016279417936227713, - "loss": 0.5333, + "epoch": 3.209825997952917, + "grad_norm": 0.296875, + "learning_rate": 0.00017205475528098637, + "loss": 0.5305, "step": 3136 }, { - "epoch": 3.351120597652081, - "grad_norm": 0.32421875, - "learning_rate": 0.00016250490771424497, - "loss": 0.5153, + "epoch": 3.2139201637666326, + "grad_norm": 0.333984375, + "learning_rate": 0.0001717793157493852, + "loss": 0.5379, "step": 3140 }, { - "epoch": 3.355389541088581, - "grad_norm": 0.337890625, - "learning_rate": 0.00016221558922703343, - "loss": 0.5634, + "epoch": 3.218014329580348, + "grad_norm": 0.30859375, + "learning_rate": 0.00017150380117887751, + "loss": 0.5002, "step": 3144 }, { - "epoch": 3.35965848452508, - "grad_norm": 0.33984375, - "learning_rate": 0.00016192622498433225, - "loss": 0.5322, + "epoch": 3.2221084953940635, + "grad_norm": 0.302734375, + "learning_rate": 0.00017122821251872684, + "loss": 0.4996, "step": 3148 }, { - "epoch": 3.3639274279615794, + "epoch": 3.226202661207779, "grad_norm": 0.3203125, - "learning_rate": 0.00016163681607000258, - "loss": 0.5333, + "learning_rate": 0.00017095255071845206, + "loss": 0.5536, "step": 3152 }, { - "epoch": 3.368196371398079, - "grad_norm": 0.326171875, - "learning_rate": 0.00016134736356807292, - "loss": 0.5325, + "epoch": 3.2302968270214945, + "grad_norm": 0.3125, + "learning_rate": 0.00017067681672782416, + "loss": 0.5189, "step": 3156 }, { - "epoch": 3.3724653148345785, - "grad_norm": 0.3359375, - "learning_rate": 0.00016105786856273504, - "loss": 0.5468, + "epoch": 3.23439099283521, + "grad_norm": 0.34765625, + "learning_rate": 0.00017040101149686264, + "loss": 0.5405, "step": 3160 }, { - "epoch": 3.3767342582710778, + "epoch": 3.2384851586489254, "grad_norm": 0.3125, - "learning_rate": 0.00016076833213833982, - "loss": 0.4629, + "learning_rate": 0.0001701251359758326, + "loss": 0.5288, "step": 3164 }, { - "epoch": 3.3810032017075775, + "epoch": 3.242579324462641, "grad_norm": 0.31640625, - "learning_rate": 0.0001604787553793934, - "loss": 0.4933, + "learning_rate": 0.00016984919111524136, + "loss": 0.5539, "step": 3168 }, { - "epoch": 3.385272145144077, - "grad_norm": 0.30078125, - "learning_rate": 0.0001601891393705529, - "loss": 0.5063, + "epoch": 3.2466734902763563, + "grad_norm": 0.33203125, + "learning_rate": 0.00016957317786583497, + "loss": 0.5721, "step": 3172 }, { - "epoch": 3.389541088580576, - "grad_norm": 0.322265625, - "learning_rate": 0.00015989948519662254, - "loss": 0.5133, + "epoch": 3.250767656090072, + "grad_norm": 0.314453125, + "learning_rate": 0.00016929709717859525, + "loss": 0.4987, "step": 3176 }, { - "epoch": 3.393810032017076, - "grad_norm": 0.310546875, - "learning_rate": 0.00015960979394254944, - "loss": 0.5361, + "epoch": 3.2548618219037873, + "grad_norm": 0.306640625, + "learning_rate": 0.00016902095000473637, + "loss": 0.4804, "step": 3180 }, { - "epoch": 3.398078975453575, - "grad_norm": 0.3125, - "learning_rate": 0.00015932006669341963, - "loss": 0.5147, + "epoch": 3.2589559877175027, + "grad_norm": 0.326171875, + "learning_rate": 0.00016874473729570148, + "loss": 0.5401, "step": 3184 }, { - "epoch": 3.402347918890075, - "grad_norm": 0.36328125, - "learning_rate": 0.00015903030453445392, - "loss": 0.5254, + "epoch": 3.263050153531218, + "grad_norm": 0.314453125, + "learning_rate": 0.00016846846000315957, + "loss": 0.4868, "step": 3188 }, { - "epoch": 3.4066168623265742, - "grad_norm": 0.330078125, - "learning_rate": 0.00015874050855100398, - "loss": 0.5348, + "epoch": 3.2671443193449337, + "grad_norm": 0.3046875, + "learning_rate": 0.00016819211907900225, + "loss": 0.5596, "step": 3192 }, { - "epoch": 3.4108858057630735, - "grad_norm": 0.302734375, - "learning_rate": 0.00015845067982854809, - "loss": 0.5104, + "epoch": 3.2712384851586487, + "grad_norm": 0.298828125, + "learning_rate": 0.0001679157154753402, + "loss": 0.5044, "step": 3196 }, { - "epoch": 3.4151547491995733, - "grad_norm": 0.298828125, - "learning_rate": 0.0001581608194526872, - "loss": 0.5197, + "epoch": 3.2753326509723646, + "grad_norm": 0.326171875, + "learning_rate": 0.00016763925014450008, + "loss": 0.5377, "step": 3200 }, { - "epoch": 3.4194236926360726, - "grad_norm": 0.328125, - "learning_rate": 0.00015787092850914078, - "loss": 0.5313, + "epoch": 3.2794268167860796, + "grad_norm": 0.318359375, + "learning_rate": 0.0001673627240390214, + "loss": 0.5826, "step": 3204 }, { - "epoch": 3.423692636072572, - "grad_norm": 0.318359375, - "learning_rate": 0.00015758100808374286, - "loss": 0.498, + "epoch": 3.2835209825997955, + "grad_norm": 0.32421875, + "learning_rate": 0.0001670861381116529, + "loss": 0.5723, "step": 3208 }, { - "epoch": 3.4279615795090717, - "grad_norm": 0.314453125, - "learning_rate": 0.0001572910592624379, - "loss": 0.5305, + "epoch": 3.2876151484135105, + "grad_norm": 0.337890625, + "learning_rate": 0.00016680949331534948, + "loss": 0.4922, "step": 3212 }, { - "epoch": 3.432230522945571, - "grad_norm": 0.318359375, - "learning_rate": 0.00015700108313127659, - "loss": 0.5478, + "epoch": 3.291709314227226, + "grad_norm": 0.33203125, + "learning_rate": 0.00016653279060326883, + "loss": 0.5247, "step": 3216 }, { - "epoch": 3.4364994663820703, - "grad_norm": 0.3203125, - "learning_rate": 0.0001567110807764121, - "loss": 0.5547, + "epoch": 3.2958034800409415, + "grad_norm": 0.333984375, + "learning_rate": 0.00016625603092876824, + "loss": 0.5303, "step": 3220 }, { - "epoch": 3.44076840981857, - "grad_norm": 0.31640625, - "learning_rate": 0.00015642105328409565, - "loss": 0.5129, + "epoch": 3.299897645854657, + "grad_norm": 0.333984375, + "learning_rate": 0.00016597921524540125, + "loss": 0.5386, "step": 3224 }, { - "epoch": 3.4450373532550693, - "grad_norm": 0.337890625, - "learning_rate": 0.00015613100174067276, - "loss": 0.5963, + "epoch": 3.3039918116683724, + "grad_norm": 0.310546875, + "learning_rate": 0.00016570234450691436, + "loss": 0.5131, "step": 3228 }, { - "epoch": 3.449306296691569, - "grad_norm": 0.333984375, - "learning_rate": 0.00015584092723257897, - "loss": 0.5061, + "epoch": 3.308085977482088, + "grad_norm": 0.349609375, + "learning_rate": 0.00016542541966724374, + "loss": 0.473, "step": 3232 }, { - "epoch": 3.4535752401280684, - "grad_norm": 0.33984375, - "learning_rate": 0.00015555083084633584, - "loss": 0.563, + "epoch": 3.3121801432958033, + "grad_norm": 0.328125, + "learning_rate": 0.000165148441680512, + "loss": 0.5059, "step": 3236 }, { - "epoch": 3.4578441835645677, - "grad_norm": 0.330078125, - "learning_rate": 0.00015526071366854687, - "loss": 0.5036, + "epoch": 3.316274309109519, + "grad_norm": 0.32421875, + "learning_rate": 0.0001648714115010248, + "loss": 0.4812, "step": 3240 }, { - "epoch": 3.462113127001067, - "grad_norm": 0.3203125, - "learning_rate": 0.00015497057678589348, - "loss": 0.5358, + "epoch": 3.3203684749232343, + "grad_norm": 0.32421875, + "learning_rate": 0.0001645943300832678, + "loss": 0.5404, "step": 3244 }, { - "epoch": 3.4663820704375667, - "grad_norm": 0.341796875, - "learning_rate": 0.00015468042128513086, - "loss": 0.5037, + "epoch": 3.3244626407369497, + "grad_norm": 0.31640625, + "learning_rate": 0.00016431719838190287, + "loss": 0.5238, "step": 3248 }, { - "epoch": 3.470651013874066, - "grad_norm": 0.345703125, - "learning_rate": 0.00015439024825308396, - "loss": 0.5663, + "epoch": 3.328556806550665, + "grad_norm": 0.33984375, + "learning_rate": 0.00016404001735176549, + "loss": 0.4977, "step": 3252 }, { - "epoch": 3.474919957310566, - "grad_norm": 0.326171875, - "learning_rate": 0.00015410005877664336, - "loss": 0.5418, + "epoch": 3.3326509723643807, + "grad_norm": 0.337890625, + "learning_rate": 0.00016376278794786087, + "loss": 0.5405, "step": 3256 }, { - "epoch": 3.479188900747065, - "grad_norm": 0.333984375, - "learning_rate": 0.00015380985394276126, - "loss": 0.5644, + "epoch": 3.336745138178096, + "grad_norm": 0.318359375, + "learning_rate": 0.00016348551112536095, + "loss": 0.5508, "step": 3260 }, { - "epoch": 3.4834578441835644, - "grad_norm": 0.33984375, - "learning_rate": 0.0001535196348384474, - "loss": 0.514, + "epoch": 3.3408393039918116, + "grad_norm": 0.322265625, + "learning_rate": 0.00016320818783960105, + "loss": 0.5301, "step": 3264 }, { - "epoch": 3.487726787620064, - "grad_norm": 0.33203125, - "learning_rate": 0.00015322940255076497, - "loss": 0.5324, + "epoch": 3.344933469805527, + "grad_norm": 0.310546875, + "learning_rate": 0.00016293081904607663, + "loss": 0.4835, "step": 3268 }, { - "epoch": 3.4919957310565635, - "grad_norm": 0.328125, - "learning_rate": 0.00015293915816682645, - "loss": 0.5336, + "epoch": 3.3490276356192425, + "grad_norm": 0.30859375, + "learning_rate": 0.00016265340570043978, + "loss": 0.5369, "step": 3272 }, { - "epoch": 3.4962646744930628, - "grad_norm": 0.3203125, - "learning_rate": 0.0001526489027737898, - "loss": 0.5402, + "epoch": 3.353121801432958, + "grad_norm": 0.3359375, + "learning_rate": 0.00016237594875849628, + "loss": 0.5258, "step": 3276 }, { - "epoch": 3.5005336179295625, - "grad_norm": 0.3515625, - "learning_rate": 0.00015235863745885407, - "loss": 0.5029, + "epoch": 3.3572159672466735, + "grad_norm": 0.33203125, + "learning_rate": 0.00016209844917620207, + "loss": 0.579, "step": 3280 }, { - "epoch": 3.504802561366062, - "grad_norm": 0.3203125, - "learning_rate": 0.00015206836330925556, - "loss": 0.506, + "epoch": 3.361310133060389, + "grad_norm": 0.353515625, + "learning_rate": 0.00016182090790965988, + "loss": 0.5422, "step": 3284 }, { - "epoch": 3.509071504802561, - "grad_norm": 0.314453125, - "learning_rate": 0.00015177808141226358, - "loss": 0.5351, + "epoch": 3.3654042988741044, + "grad_norm": 0.306640625, + "learning_rate": 0.00016154332591511623, + "loss": 0.5292, "step": 3288 }, { - "epoch": 3.513340448239061, - "grad_norm": 0.306640625, - "learning_rate": 0.00015148779285517654, - "loss": 0.5121, + "epoch": 3.36949846468782, + "grad_norm": 0.330078125, + "learning_rate": 0.00016126570414895785, + "loss": 0.5345, "step": 3292 }, { - "epoch": 3.51760939167556, - "grad_norm": 0.341796875, - "learning_rate": 0.00015119749872531774, - "loss": 0.5102, + "epoch": 3.3735926305015353, + "grad_norm": 0.3203125, + "learning_rate": 0.0001609880435677085, + "loss": 0.5141, "step": 3296 }, { - "epoch": 3.52187833511206, - "grad_norm": 0.3359375, - "learning_rate": 0.00015090720011003142, - "loss": 0.5728, + "epoch": 3.377686796315251, + "grad_norm": 0.33203125, + "learning_rate": 0.00016071034512802577, + "loss": 0.4843, "step": 3300 }, { - "epoch": 3.5261472785485592, - "grad_norm": 0.326171875, - "learning_rate": 0.00015061689809667852, - "loss": 0.5172, + "epoch": 3.3817809621289663, + "grad_norm": 0.328125, + "learning_rate": 0.00016043260978669763, + "loss": 0.514, "step": 3304 }, { - "epoch": 3.5304162219850586, - "grad_norm": 0.3203125, - "learning_rate": 0.00015032659377263278, - "loss": 0.5217, + "epoch": 3.3858751279426818, + "grad_norm": 0.32421875, + "learning_rate": 0.00016015483850063912, + "loss": 0.5275, "step": 3308 }, { - "epoch": 3.5346851654215583, - "grad_norm": 0.33984375, - "learning_rate": 0.0001500362882252766, - "loss": 0.5536, + "epoch": 3.389969293756397, + "grad_norm": 0.2890625, + "learning_rate": 0.00015987703222688926, + "loss": 0.5134, "step": 3312 }, { - "epoch": 3.5389541088580576, - "grad_norm": 0.326171875, - "learning_rate": 0.0001497459825419969, - "loss": 0.5247, + "epoch": 3.3940634595701127, + "grad_norm": 0.314453125, + "learning_rate": 0.00015959919192260757, + "loss": 0.5475, "step": 3316 }, { - "epoch": 3.5432230522945574, - "grad_norm": 0.326171875, - "learning_rate": 0.00014945567781018122, - "loss": 0.5181, + "epoch": 3.398157625383828, + "grad_norm": 0.345703125, + "learning_rate": 0.00015932131854507072, + "loss": 0.5782, "step": 3320 }, { - "epoch": 3.5474919957310567, + "epoch": 3.4022517911975436, "grad_norm": 0.322265625, - "learning_rate": 0.00014916537511721343, - "loss": 0.5136, + "learning_rate": 0.0001590434130516695, + "loss": 0.5299, "step": 3324 }, { - "epoch": 3.551760939167556, - "grad_norm": 0.3203125, - "learning_rate": 0.00014887507555046974, - "loss": 0.5113, + "epoch": 3.406345957011259, + "grad_norm": 0.33203125, + "learning_rate": 0.00015876547639990518, + "loss": 0.549, "step": 3328 }, { - "epoch": 3.5560298826040553, - "grad_norm": 0.326171875, - "learning_rate": 0.00014858478019731485, - "loss": 0.5573, + "epoch": 3.4104401228249746, + "grad_norm": 0.3515625, + "learning_rate": 0.0001584875095473865, + "loss": 0.5149, "step": 3332 }, { - "epoch": 3.560298826040555, - "grad_norm": 0.322265625, - "learning_rate": 0.0001482944901450974, - "loss": 0.5207, + "epoch": 3.41453428863869, + "grad_norm": 0.328125, + "learning_rate": 0.0001582095134518263, + "loss": 0.5442, "step": 3336 }, { - "epoch": 3.5645677694770543, - "grad_norm": 0.34375, - "learning_rate": 0.00014800420648114642, - "loss": 0.5296, + "epoch": 3.4186284544524055, + "grad_norm": 0.3203125, + "learning_rate": 0.00015793148907103802, + "loss": 0.5089, "step": 3340 }, { - "epoch": 3.568836712913554, - "grad_norm": 0.32421875, - "learning_rate": 0.00014771393029276678, - "loss": 0.6069, + "epoch": 3.422722620266121, + "grad_norm": 0.337890625, + "learning_rate": 0.00015765343736293263, + "loss": 0.5063, "step": 3344 }, { - "epoch": 3.5731056563500534, - "grad_norm": 0.33203125, - "learning_rate": 0.00014742366266723566, - "loss": 0.556, + "epoch": 3.4268167860798364, + "grad_norm": 0.3203125, + "learning_rate": 0.00015737535928551528, + "loss": 0.508, "step": 3348 }, { - "epoch": 3.5773745997865527, - "grad_norm": 0.345703125, - "learning_rate": 0.00014713340469179776, - "loss": 0.5044, + "epoch": 3.4309109518935514, + "grad_norm": 0.318359375, + "learning_rate": 0.00015709725579688197, + "loss": 0.5439, "step": 3352 }, { - "epoch": 3.5816435432230525, - "grad_norm": 0.326171875, - "learning_rate": 0.00014684315745366205, - "loss": 0.5, + "epoch": 3.4350051177072674, + "grad_norm": 0.310546875, + "learning_rate": 0.00015681912785521618, + "loss": 0.5304, "step": 3356 }, { - "epoch": 3.5859124866595518, - "grad_norm": 0.34765625, - "learning_rate": 0.00014655292203999691, - "loss": 0.5212, + "epoch": 3.4390992835209824, + "grad_norm": 0.314453125, + "learning_rate": 0.0001565409764187857, + "loss": 0.5379, "step": 3360 }, { - "epoch": 3.590181430096051, - "grad_norm": 0.326171875, - "learning_rate": 0.00014626269953792677, - "loss": 0.4651, + "epoch": 3.4431934493346983, + "grad_norm": 0.333984375, + "learning_rate": 0.00015626280244593937, + "loss": 0.5265, "step": 3364 }, { - "epoch": 3.594450373532551, - "grad_norm": 0.3359375, - "learning_rate": 0.0001459724910345274, - "loss": 0.5483, + "epoch": 3.4472876151484133, + "grad_norm": 0.36328125, + "learning_rate": 0.00015598460689510342, + "loss": 0.5461, "step": 3368 }, { - "epoch": 3.59871931696905, - "grad_norm": 0.3359375, - "learning_rate": 0.0001456822976168223, - "loss": 0.5076, + "epoch": 3.4513817809621288, + "grad_norm": 0.33203125, + "learning_rate": 0.00015570639072477865, + "loss": 0.5123, "step": 3372 }, { - "epoch": 3.6029882604055494, - "grad_norm": 0.328125, - "learning_rate": 0.00014539212037177854, - "loss": 0.5367, + "epoch": 3.4554759467758442, + "grad_norm": 0.32421875, + "learning_rate": 0.00015542815489353687, + "loss": 0.5182, "step": 3376 }, { - "epoch": 3.607257203842049, - "grad_norm": 0.318359375, - "learning_rate": 0.00014510196038630232, - "loss": 0.4982, + "epoch": 3.4595701125895597, + "grad_norm": 0.32421875, + "learning_rate": 0.0001551499003600175, + "loss": 0.5118, "step": 3380 }, { - "epoch": 3.6115261472785485, - "grad_norm": 0.333984375, - "learning_rate": 0.00014481181874723557, - "loss": 0.5131, + "epoch": 3.463664278403275, + "grad_norm": 0.30078125, + "learning_rate": 0.00015487162808292454, + "loss": 0.5201, "step": 3384 }, { - "epoch": 3.6157950907150482, - "grad_norm": 0.30078125, - "learning_rate": 0.00014452169654135115, - "loss": 0.5494, + "epoch": 3.4677584442169906, + "grad_norm": 0.31640625, + "learning_rate": 0.00015459333902102302, + "loss": 0.5019, "step": 3388 }, { - "epoch": 3.6200640341515475, - "grad_norm": 0.31640625, - "learning_rate": 0.0001442315948553494, - "loss": 0.5233, + "epoch": 3.471852610030706, + "grad_norm": 0.328125, + "learning_rate": 0.00015431503413313594, + "loss": 0.5194, "step": 3392 }, { - "epoch": 3.624332977588047, + "epoch": 3.4759467758444216, "grad_norm": 0.326171875, - "learning_rate": 0.0001439415147758536, - "loss": 0.5378, + "learning_rate": 0.00015403671437814063, + "loss": 0.5196, "step": 3396 }, { - "epoch": 3.628601921024546, - "grad_norm": 0.326171875, - "learning_rate": 0.00014365145738940623, - "loss": 0.4954, + "epoch": 3.480040941658137, + "grad_norm": 0.3125, + "learning_rate": 0.00015375838071496583, + "loss": 0.5081, "step": 3400 }, { - "epoch": 3.632870864461046, - "grad_norm": 0.34375, - "learning_rate": 0.0001433614237824647, - "loss": 0.5358, + "epoch": 3.4841351074718525, + "grad_norm": 0.3203125, + "learning_rate": 0.00015348003410258813, + "loss": 0.5039, "step": 3404 }, { - "epoch": 3.637139807897545, - "grad_norm": 0.345703125, - "learning_rate": 0.00014307141504139737, - "loss": 0.5616, + "epoch": 3.488229273285568, + "grad_norm": 0.326171875, + "learning_rate": 0.00015320167550002863, + "loss": 0.536, "step": 3408 }, { - "epoch": 3.641408751334045, - "grad_norm": 0.30859375, - "learning_rate": 0.00014278143225247955, - "loss": 0.5346, + "epoch": 3.4923234390992834, + "grad_norm": 0.341796875, + "learning_rate": 0.00015292330586634997, + "loss": 0.5372, "step": 3412 }, { - "epoch": 3.6456776947705443, - "grad_norm": 0.310546875, - "learning_rate": 0.00014249147650188912, - "loss": 0.5462, + "epoch": 3.496417604912999, + "grad_norm": 0.33203125, + "learning_rate": 0.00015264492616065255, + "loss": 0.4794, "step": 3416 }, { - "epoch": 3.6499466382070436, - "grad_norm": 0.3203125, - "learning_rate": 0.00014220154887570295, - "loss": 0.5239, + "epoch": 3.5005117707267144, + "grad_norm": 0.32421875, + "learning_rate": 0.00015236653734207158, + "loss": 0.5065, "step": 3420 }, { - "epoch": 3.6542155816435433, - "grad_norm": 0.318359375, - "learning_rate": 0.00014191165045989235, - "loss": 0.5092, + "epoch": 3.50460593654043, + "grad_norm": 0.3203125, + "learning_rate": 0.0001520881403697738, + "loss": 0.4901, "step": 3424 }, { - "epoch": 3.6584845250800426, - "grad_norm": 0.32421875, - "learning_rate": 0.00014162178234031935, - "loss": 0.5623, + "epoch": 3.5087001023541453, + "grad_norm": 0.326171875, + "learning_rate": 0.00015180973620295383, + "loss": 0.5332, "step": 3428 }, { - "epoch": 3.6627534685165424, - "grad_norm": 0.314453125, - "learning_rate": 0.0001413319456027324, - "loss": 0.5408, + "epoch": 3.512794268167861, + "grad_norm": 0.322265625, + "learning_rate": 0.00015153132580083116, + "loss": 0.4868, "step": 3432 }, { - "epoch": 3.6670224119530417, + "epoch": 3.5168884339815762, "grad_norm": 0.322265625, - "learning_rate": 0.00014104214133276257, - "loss": 0.5285, + "learning_rate": 0.00015125291012264684, + "loss": 0.55, "step": 3436 }, { - "epoch": 3.671291355389541, - "grad_norm": 0.330078125, - "learning_rate": 0.00014075237061591907, - "loss": 0.5002, + "epoch": 3.5209825997952917, + "grad_norm": 0.328125, + "learning_rate": 0.00015097449012765993, + "loss": 0.4989, "step": 3440 }, { - "epoch": 3.6755602988260403, - "grad_norm": 0.3359375, - "learning_rate": 0.00014046263453758559, - "loss": 0.5481, + "epoch": 3.525076765609007, + "grad_norm": 0.31640625, + "learning_rate": 0.0001506960667751445, + "loss": 0.4985, "step": 3444 }, { - "epoch": 3.67982924226254, - "grad_norm": 0.328125, - "learning_rate": 0.0001401729341830162, - "loss": 0.5803, + "epoch": 3.5291709314227226, + "grad_norm": 0.45703125, + "learning_rate": 0.00015041764102438618, + "loss": 0.5499, "step": 3448 }, { - "epoch": 3.6840981856990394, - "grad_norm": 0.333984375, - "learning_rate": 0.0001398832706373308, - "loss": 0.5124, + "epoch": 3.533265097236438, + "grad_norm": 0.328125, + "learning_rate": 0.00015013921383467884, + "loss": 0.5324, "step": 3452 }, { - "epoch": 3.688367129135539, - "grad_norm": 0.337890625, - "learning_rate": 0.00013959364498551183, - "loss": 0.5015, + "epoch": 3.5373592630501536, + "grad_norm": 0.32421875, + "learning_rate": 0.0001498607861653212, + "loss": 0.5165, "step": 3456 }, { - "epoch": 3.6926360725720384, - "grad_norm": 0.3203125, - "learning_rate": 0.00013930405831239943, - "loss": 0.5209, + "epoch": 3.541453428863869, + "grad_norm": 0.3046875, + "learning_rate": 0.0001495823589756138, + "loss": 0.5517, "step": 3460 }, { - "epoch": 3.6969050160085377, - "grad_norm": 0.337890625, - "learning_rate": 0.00013901451170268803, - "loss": 0.5044, + "epoch": 3.5455475946775845, + "grad_norm": 0.341796875, + "learning_rate": 0.00014930393322485548, + "loss": 0.548, "step": 3464 }, { - "epoch": 3.7011739594450375, - "grad_norm": 0.31640625, - "learning_rate": 0.00013872500624092173, - "loss": 0.5419, + "epoch": 3.5496417604913, + "grad_norm": 0.318359375, + "learning_rate": 0.00014902550987234004, + "loss": 0.5582, "step": 3468 }, { - "epoch": 3.7054429028815368, - "grad_norm": 0.310546875, - "learning_rate": 0.0001384355430114908, - "loss": 0.5198, + "epoch": 3.5537359263050154, + "grad_norm": 0.330078125, + "learning_rate": 0.00014874708987735316, + "loss": 0.5943, "step": 3472 }, { - "epoch": 3.7097118463180365, - "grad_norm": 0.318359375, - "learning_rate": 0.000138146123098627, - "loss": 0.5329, + "epoch": 3.557830092118731, + "grad_norm": 0.330078125, + "learning_rate": 0.0001484686741991688, + "loss": 0.512, "step": 3476 }, { - "epoch": 3.713980789754536, - "grad_norm": 0.333984375, - "learning_rate": 0.00013785674758640012, - "loss": 0.4795, + "epoch": 3.5619242579324464, + "grad_norm": 0.33984375, + "learning_rate": 0.00014819026379704614, + "loss": 0.527, "step": 3480 }, { - "epoch": 3.718249733191035, - "grad_norm": 0.337890625, - "learning_rate": 0.00013756741755871352, - "loss": 0.5345, + "epoch": 3.566018423746162, + "grad_norm": 0.328125, + "learning_rate": 0.0001479118596302262, + "loss": 0.5487, "step": 3484 }, { - "epoch": 3.7225186766275344, - "grad_norm": 0.3203125, - "learning_rate": 0.0001372781340993001, - "loss": 0.4707, + "epoch": 3.5701125895598773, + "grad_norm": 0.3046875, + "learning_rate": 0.0001476334626579284, + "loss": 0.5161, "step": 3488 }, { - "epoch": 3.726787620064034, - "grad_norm": 0.3359375, - "learning_rate": 0.0001369888982917185, - "loss": 0.5731, + "epoch": 3.574206755373593, + "grad_norm": 0.333984375, + "learning_rate": 0.00014735507383934748, + "loss": 0.5076, "step": 3492 }, { - "epoch": 3.7310565635005335, - "grad_norm": 0.302734375, - "learning_rate": 0.00013669971121934874, - "loss": 0.5232, + "epoch": 3.5783009211873082, + "grad_norm": 0.3203125, + "learning_rate": 0.00014707669413365008, + "loss": 0.5335, "step": 3496 }, { - "epoch": 3.7353255069370332, - "grad_norm": 0.33984375, - "learning_rate": 0.00013641057396538846, - "loss": 0.5134, + "epoch": 3.5823950870010233, + "grad_norm": 0.322265625, + "learning_rate": 0.0001467983244999714, + "loss": 0.476, "step": 3500 }, { - "epoch": 3.7395944503735326, - "grad_norm": 0.318359375, - "learning_rate": 0.00013612148761284846, - "loss": 0.5263, + "epoch": 3.586489252814739, + "grad_norm": 0.34765625, + "learning_rate": 0.0001465199658974119, + "loss": 0.5597, "step": 3504 }, { - "epoch": 3.743863393810032, + "epoch": 3.590583418628454, "grad_norm": 0.310546875, - "learning_rate": 0.0001358324532445491, - "loss": 0.5561, + "learning_rate": 0.00014624161928503417, + "loss": 0.5411, "step": 3508 }, { - "epoch": 3.7481323372465316, - "grad_norm": 0.310546875, - "learning_rate": 0.00013554347194311587, - "loss": 0.5115, + "epoch": 3.59467758444217, + "grad_norm": 0.328125, + "learning_rate": 0.0001459632856218594, + "loss": 0.5302, "step": 3512 }, { - "epoch": 3.752401280683031, - "grad_norm": 0.318359375, - "learning_rate": 0.00013525454479097556, - "loss": 0.5685, + "epoch": 3.598771750255885, + "grad_norm": 0.34375, + "learning_rate": 0.0001456849658668641, + "loss": 0.5309, "step": 3516 }, { - "epoch": 3.7566702241195307, - "grad_norm": 0.322265625, - "learning_rate": 0.0001349656728703522, - "loss": 0.5418, + "epoch": 3.602865916069601, + "grad_norm": 0.3515625, + "learning_rate": 0.000145406660978977, + "loss": 0.5123, "step": 3520 }, { - "epoch": 3.76093916755603, - "grad_norm": 0.318359375, - "learning_rate": 0.0001346768572632628, - "loss": 0.517, + "epoch": 3.606960081883316, + "grad_norm": 0.59765625, + "learning_rate": 0.00014512837191707552, + "loss": 0.5087, "step": 3524 }, { - "epoch": 3.7652081109925293, - "grad_norm": 0.302734375, - "learning_rate": 0.0001343880990515135, - "loss": 0.5114, + "epoch": 3.611054247697032, + "grad_norm": 0.33203125, + "learning_rate": 0.0001448500996399825, + "loss": 0.5063, "step": 3528 }, { - "epoch": 3.7694770544290286, - "grad_norm": 0.3046875, - "learning_rate": 0.0001340993993166955, - "loss": 0.5065, + "epoch": 3.615148413510747, + "grad_norm": 0.349609375, + "learning_rate": 0.0001445718451064631, + "loss": 0.5133, "step": 3532 }, { - "epoch": 3.7737459978655283, - "grad_norm": 0.333984375, - "learning_rate": 0.00013381075914018097, - "loss": 0.5653, + "epoch": 3.619242579324463, + "grad_norm": 0.328125, + "learning_rate": 0.0001442936092752213, + "loss": 0.5234, "step": 3536 }, { - "epoch": 3.7780149413020276, - "grad_norm": 0.33984375, - "learning_rate": 0.0001335221796031188, - "loss": 0.5156, + "epoch": 3.623336745138178, + "grad_norm": 0.34375, + "learning_rate": 0.00014401539310489656, + "loss": 0.534, "step": 3540 }, { - "epoch": 3.7822838847385274, - "grad_norm": 0.333984375, - "learning_rate": 0.00013323366178643106, - "loss": 0.5307, + "epoch": 3.6274309109518934, + "grad_norm": 0.40234375, + "learning_rate": 0.0001437371975540606, + "loss": 0.4773, "step": 3544 }, { - "epoch": 3.7865528281750267, - "grad_norm": 0.3515625, - "learning_rate": 0.00013294520677080836, - "loss": 0.5329, + "epoch": 3.631525076765609, + "grad_norm": 0.33203125, + "learning_rate": 0.00014345902358121423, + "loss": 0.5007, "step": 3548 }, { - "epoch": 3.790821771611526, - "grad_norm": 0.314453125, - "learning_rate": 0.00013265681563670633, - "loss": 0.4968, + "epoch": 3.6356192425793243, + "grad_norm": 0.40625, + "learning_rate": 0.0001431808721447838, + "loss": 0.5236, "step": 3552 }, { - "epoch": 3.7950907150480258, - "grad_norm": 0.357421875, - "learning_rate": 0.00013236848946434097, - "loss": 0.5156, + "epoch": 3.63971340839304, + "grad_norm": 0.30859375, + "learning_rate": 0.000142902744203118, + "loss": 0.5174, "step": 3556 }, { - "epoch": 3.799359658484525, - "grad_norm": 0.3203125, - "learning_rate": 0.00013208022933368537, - "loss": 0.51, + "epoch": 3.6438075742067553, + "grad_norm": 0.337890625, + "learning_rate": 0.0001426246407144847, + "loss": 0.5517, "step": 3560 }, { - "epoch": 3.803628601921025, - "grad_norm": 0.322265625, - "learning_rate": 0.00013179203632446496, - "loss": 0.5558, + "epoch": 3.6479017400204707, + "grad_norm": 0.31640625, + "learning_rate": 0.00014234656263706737, + "loss": 0.5203, "step": 3564 }, { - "epoch": 3.807897545357524, + "epoch": 3.651995905834186, "grad_norm": 0.3359375, - "learning_rate": 0.00013150391151615386, - "loss": 0.4999, + "learning_rate": 0.00014206851092896195, + "loss": 0.515, "step": 3568 }, { - "epoch": 3.8121664887940234, - "grad_norm": 0.357421875, - "learning_rate": 0.00013121585598797075, - "loss": 0.5827, + "epoch": 3.6560900716479017, + "grad_norm": 0.306640625, + "learning_rate": 0.00014179048654817367, + "loss": 0.5206, "step": 3572 }, { - "epoch": 3.8164354322305227, - "grad_norm": 0.33203125, - "learning_rate": 0.0001309278708188747, - "loss": 0.4998, + "epoch": 3.660184237461617, + "grad_norm": 0.345703125, + "learning_rate": 0.00014151249045261347, + "loss": 0.5397, "step": 3576 }, { - "epoch": 3.8207043756670225, - "grad_norm": 0.30859375, - "learning_rate": 0.00013063995708756138, - "loss": 0.5295, + "epoch": 3.6642784032753326, + "grad_norm": 0.345703125, + "learning_rate": 0.0001412345236000948, + "loss": 0.5156, "step": 3580 }, { - "epoch": 3.824973319103522, - "grad_norm": 0.3203125, - "learning_rate": 0.00013035211587245874, - "loss": 0.5403, + "epoch": 3.668372569089048, + "grad_norm": 0.408203125, + "learning_rate": 0.0001409565869483305, + "loss": 0.5261, "step": 3584 }, { - "epoch": 3.8292422625400215, - "grad_norm": 0.361328125, - "learning_rate": 0.0001300643482517233, - "loss": 0.5708, + "epoch": 3.6724667349027635, + "grad_norm": 0.328125, + "learning_rate": 0.00014067868145492928, + "loss": 0.5615, "step": 3588 }, { - "epoch": 3.833511205976521, - "grad_norm": 0.298828125, - "learning_rate": 0.00012977665530323568, - "loss": 0.4615, + "epoch": 3.676560900716479, + "grad_norm": 0.3359375, + "learning_rate": 0.0001404008080773924, + "loss": 0.4976, "step": 3592 }, { - "epoch": 3.83778014941302, - "grad_norm": 0.2890625, - "learning_rate": 0.00012948903810459694, - "loss": 0.4947, + "epoch": 3.6806550665301945, + "grad_norm": 0.34375, + "learning_rate": 0.0001401229677731107, + "loss": 0.5772, "step": 3596 }, { - "epoch": 3.84204909284952, - "grad_norm": 0.330078125, - "learning_rate": 0.00012920149773312453, - "loss": 0.5208, + "epoch": 3.68474923234391, + "grad_norm": 0.33984375, + "learning_rate": 0.00013984516149936088, + "loss": 0.5169, "step": 3600 }, { - "epoch": 3.846318036286019, - "grad_norm": 0.302734375, - "learning_rate": 0.00012891403526584782, - "loss": 0.4672, + "epoch": 3.6888433981576254, + "grad_norm": 0.314453125, + "learning_rate": 0.00013956739021330234, + "loss": 0.5636, "step": 3604 }, { - "epoch": 3.8505869797225185, - "grad_norm": 0.32421875, - "learning_rate": 0.00012862665177950473, - "loss": 0.5478, + "epoch": 3.692937563971341, + "grad_norm": 0.3203125, + "learning_rate": 0.0001392896548719742, + "loss": 0.5431, "step": 3608 }, { - "epoch": 3.8548559231590183, - "grad_norm": 0.326171875, - "learning_rate": 0.00012833934835053694, - "loss": 0.5584, + "epoch": 3.6970317297850563, + "grad_norm": 0.345703125, + "learning_rate": 0.0001390119564322915, + "loss": 0.5717, "step": 3612 }, { - "epoch": 3.8591248665955176, - "grad_norm": 0.328125, - "learning_rate": 0.0001280521260550867, - "loss": 0.5175, + "epoch": 3.701125895598772, + "grad_norm": 0.33203125, + "learning_rate": 0.00013873429585104215, + "loss": 0.5165, "step": 3616 }, { - "epoch": 3.863393810032017, - "grad_norm": 0.306640625, - "learning_rate": 0.00012776498596899196, - "loss": 0.5283, + "epoch": 3.7052200614124873, + "grad_norm": 0.333984375, + "learning_rate": 0.00013845667408488377, + "loss": 0.5255, "step": 3620 }, { - "epoch": 3.8676627534685166, - "grad_norm": 0.322265625, - "learning_rate": 0.0001274779291677831, - "loss": 0.5144, + "epoch": 3.7093142272262027, + "grad_norm": 0.318359375, + "learning_rate": 0.0001381790920903401, + "loss": 0.5074, "step": 3624 }, { - "epoch": 3.871931696905016, - "grad_norm": 0.330078125, - "learning_rate": 0.00012719095672667822, - "loss": 0.5655, + "epoch": 3.713408393039918, + "grad_norm": 0.328125, + "learning_rate": 0.00013790155082379793, + "loss": 0.5742, "step": 3628 }, { - "epoch": 3.8762006403415157, - "grad_norm": 0.32421875, - "learning_rate": 0.0001269040697205797, - "loss": 0.5192, + "epoch": 3.7175025588536337, + "grad_norm": 0.314453125, + "learning_rate": 0.0001376240512415037, + "loss": 0.5069, "step": 3632 }, { - "epoch": 3.880469583778015, - "grad_norm": 0.328125, - "learning_rate": 0.00012661726922406978, - "loss": 0.5378, + "epoch": 3.721596724667349, + "grad_norm": 0.3203125, + "learning_rate": 0.00013734659429956024, + "loss": 0.5224, "step": 3636 }, { - "epoch": 3.8847385272145143, - "grad_norm": 0.34765625, - "learning_rate": 0.00012633055631140663, - "loss": 0.5993, + "epoch": 3.7256908904810646, + "grad_norm": 0.35546875, + "learning_rate": 0.0001370691809539234, + "loss": 0.6127, "step": 3640 }, { - "epoch": 3.8890074706510136, - "grad_norm": 0.318359375, - "learning_rate": 0.00012604393205652042, - "loss": 0.5803, + "epoch": 3.72978505629478, + "grad_norm": 0.32421875, + "learning_rate": 0.00013679181216039898, + "loss": 0.5235, "step": 3644 }, { - "epoch": 3.8932764140875133, - "grad_norm": 0.33984375, - "learning_rate": 0.00012575739753300927, - "loss": 0.4871, + "epoch": 3.7338792221084955, + "grad_norm": 0.330078125, + "learning_rate": 0.00013651448887463905, + "loss": 0.4979, "step": 3648 }, { - "epoch": 3.8975453575240127, - "grad_norm": 0.328125, - "learning_rate": 0.00012547095381413513, - "loss": 0.5436, + "epoch": 3.737973387922211, + "grad_norm": 0.326171875, + "learning_rate": 0.00013623721205213916, + "loss": 0.5628, "step": 3652 }, { - "epoch": 3.9018143009605124, - "grad_norm": 0.333984375, - "learning_rate": 0.00012518460197281982, - "loss": 0.5496, + "epoch": 3.742067553735926, + "grad_norm": 0.3203125, + "learning_rate": 0.00013595998264823454, + "loss": 0.5107, "step": 3656 }, { - "epoch": 3.9060832443970117, - "grad_norm": 0.318359375, - "learning_rate": 0.00012489834308164114, - "loss": 0.5225, + "epoch": 3.746161719549642, + "grad_norm": 0.333984375, + "learning_rate": 0.00013568280161809713, + "loss": 0.543, "step": 3660 }, { - "epoch": 3.910352187833511, - "grad_norm": 0.31640625, - "learning_rate": 0.0001246121782128285, - "loss": 0.5285, + "epoch": 3.750255885363357, + "grad_norm": 0.33203125, + "learning_rate": 0.00013540566991673226, + "loss": 0.4928, "step": 3664 }, { - "epoch": 3.9146211312700108, - "grad_norm": 0.3515625, - "learning_rate": 0.00012432610843825934, - "loss": 0.4903, + "epoch": 3.754350051177073, + "grad_norm": 0.328125, + "learning_rate": 0.0001351285884989752, + "loss": 0.4854, "step": 3668 }, { - "epoch": 3.91889007470651, - "grad_norm": 0.3359375, - "learning_rate": 0.00012404013482945496, - "loss": 0.5382, + "epoch": 3.758444216990788, + "grad_norm": 0.3203125, + "learning_rate": 0.000134851558319488, + "loss": 0.5305, "step": 3672 }, { - "epoch": 3.92315901814301, - "grad_norm": 0.32421875, - "learning_rate": 0.00012375425845757612, - "loss": 0.5086, + "epoch": 3.762538382804504, + "grad_norm": 0.33984375, + "learning_rate": 0.00013457458033275623, + "loss": 0.5603, "step": 3676 }, { - "epoch": 3.927427961579509, - "grad_norm": 0.328125, - "learning_rate": 0.00012346848039341976, - "loss": 0.4784, + "epoch": 3.766632548618219, + "grad_norm": 0.33203125, + "learning_rate": 0.00013429765549308558, + "loss": 0.5247, "step": 3680 }, { - "epoch": 3.9316969050160084, - "grad_norm": 0.3359375, - "learning_rate": 0.0001231828017074143, - "loss": 0.5401, + "epoch": 3.7707267144319347, + "grad_norm": 0.326171875, + "learning_rate": 0.0001340207847545987, + "loss": 0.5073, "step": 3684 }, { - "epoch": 3.9359658484525077, - "grad_norm": 0.318359375, - "learning_rate": 0.00012289722346961612, - "loss": 0.5282, + "epoch": 3.7748208802456498, + "grad_norm": 0.33203125, + "learning_rate": 0.00013374396907123173, + "loss": 0.5365, "step": 3688 }, { - "epoch": 3.9402347918890075, - "grad_norm": 0.32421875, - "learning_rate": 0.00012261174674970516, - "loss": 0.4625, + "epoch": 3.7789150460593657, + "grad_norm": 0.33984375, + "learning_rate": 0.00013346720939673112, + "loss": 0.5316, "step": 3692 }, { - "epoch": 3.944503735325507, - "grad_norm": 0.3125, - "learning_rate": 0.00012232637261698137, - "loss": 0.5061, + "epoch": 3.7830092118730807, + "grad_norm": 0.33203125, + "learning_rate": 0.0001331905066846505, + "loss": 0.5118, "step": 3696 }, { - "epoch": 3.9487726787620065, - "grad_norm": 0.314453125, - "learning_rate": 0.00012204110214036011, - "loss": 0.5255, + "epoch": 3.787103377686796, + "grad_norm": 0.337890625, + "learning_rate": 0.0001329138618883471, + "loss": 0.5699, "step": 3700 }, { - "epoch": 3.953041622198506, - "grad_norm": 0.3359375, - "learning_rate": 0.00012175593638836878, - "loss": 0.5236, + "epoch": 3.7911975435005116, + "grad_norm": 0.330078125, + "learning_rate": 0.00013263727596097855, + "loss": 0.5433, "step": 3704 }, { - "epoch": 3.957310565635005, - "grad_norm": 0.322265625, - "learning_rate": 0.00012147087642914245, - "loss": 0.4608, + "epoch": 3.795291709314227, + "grad_norm": 0.341796875, + "learning_rate": 0.0001323607498554999, + "loss": 0.4993, "step": 3708 }, { - "epoch": 3.961579509071505, - "grad_norm": 0.318359375, - "learning_rate": 0.00012118592333041974, - "loss": 0.51, + "epoch": 3.7993858751279426, + "grad_norm": 0.349609375, + "learning_rate": 0.0001320842845246598, + "loss": 0.4852, "step": 3712 }, { - "epoch": 3.965848452508004, - "grad_norm": 0.361328125, - "learning_rate": 0.0001209010781595392, - "loss": 0.508, + "epoch": 3.803480040941658, + "grad_norm": 0.333984375, + "learning_rate": 0.00013180788092099775, + "loss": 0.5193, "step": 3716 }, { - "epoch": 3.970117395944504, - "grad_norm": 0.3359375, - "learning_rate": 0.00012061634198343506, - "loss": 0.5114, + "epoch": 3.8075742067553735, + "grad_norm": 0.34375, + "learning_rate": 0.0001315315399968404, + "loss": 0.5161, "step": 3720 }, { - "epoch": 3.9743863393810033, - "grad_norm": 0.318359375, - "learning_rate": 0.00012033171586863328, - "loss": 0.5341, + "epoch": 3.811668372569089, + "grad_norm": 0.3125, + "learning_rate": 0.0001312552627042985, + "loss": 0.4898, "step": 3724 }, { - "epoch": 3.9786552828175026, - "grad_norm": 0.32421875, - "learning_rate": 0.00012004720088124757, - "loss": 0.5253, + "epoch": 3.8157625383828044, + "grad_norm": 0.337890625, + "learning_rate": 0.00013097904999526363, + "loss": 0.5347, "step": 3728 }, { - "epoch": 3.982924226254002, + "epoch": 3.81985670419652, "grad_norm": 0.33203125, - "learning_rate": 0.00011976279808697542, - "loss": 0.4828, + "learning_rate": 0.00013070290282140472, + "loss": 0.5471, "step": 3732 }, { - "epoch": 3.9871931696905016, - "grad_norm": 0.34765625, - "learning_rate": 0.00011947850855109406, - "loss": 0.5087, + "epoch": 3.8239508700102354, + "grad_norm": 0.31640625, + "learning_rate": 0.00013042682213416503, + "loss": 0.4795, "step": 3736 }, { - "epoch": 3.991462113127001, - "grad_norm": 0.33203125, - "learning_rate": 0.00011919433333845649, - "loss": 0.5251, + "epoch": 3.828045035823951, + "grad_norm": 0.333984375, + "learning_rate": 0.00013015080888475865, + "loss": 0.5442, "step": 3740 }, { - "epoch": 3.9957310565635007, - "grad_norm": 0.333984375, - "learning_rate": 0.0001189102735134875, - "loss": 0.5309, + "epoch": 3.8321392016376663, + "grad_norm": 0.330078125, + "learning_rate": 0.00012987486402416738, + "loss": 0.4921, "step": 3744 }, { - "epoch": 4.0, - "grad_norm": 0.8828125, - "learning_rate": 0.00011862633014017965, - "loss": 0.5049, + "epoch": 3.8362333674513818, + "grad_norm": 0.3125, + "learning_rate": 0.00012959898850313736, + "loss": 0.5354, "step": 3748 }, { - "epoch": 4.004268943436499, - "grad_norm": 0.33984375, - "learning_rate": 0.00011834250428208943, - "loss": 0.5173, + "epoch": 3.8403275332650972, + "grad_norm": 0.294921875, + "learning_rate": 0.00012932318327217585, + "loss": 0.4821, "step": 3752 }, { - "epoch": 4.008537886872999, - "grad_norm": 0.328125, - "learning_rate": 0.00011805879700233293, - "loss": 0.431, + "epoch": 3.8444216990788127, + "grad_norm": 0.3125, + "learning_rate": 0.00012904744928154792, + "loss": 0.5164, "step": 3756 }, { - "epoch": 4.012806830309499, - "grad_norm": 0.326171875, - "learning_rate": 0.00011777520936358237, - "loss": 0.441, + "epoch": 3.848515864892528, + "grad_norm": 0.333984375, + "learning_rate": 0.00012877178748127313, + "loss": 0.529, "step": 3760 }, { - "epoch": 4.017075773745998, - "grad_norm": 0.36328125, - "learning_rate": 0.00011749174242806152, - "loss": 0.4575, + "epoch": 3.8526100307062436, + "grad_norm": 0.328125, + "learning_rate": 0.0001284961988211225, + "loss": 0.5858, "step": 3764 }, { - "epoch": 4.021344717182497, - "grad_norm": 0.34375, - "learning_rate": 0.00011720839725754237, - "loss": 0.4469, + "epoch": 3.856704196519959, + "grad_norm": 0.3359375, + "learning_rate": 0.00012822068425061476, + "loss": 0.5239, "step": 3768 }, { - "epoch": 4.025613660618997, - "grad_norm": 0.314453125, - "learning_rate": 0.0001169251749133405, - "loss": 0.4558, + "epoch": 3.8607983623336746, + "grad_norm": 0.330078125, + "learning_rate": 0.00012794524471901363, + "loss": 0.5538, "step": 3772 }, { - "epoch": 4.029882604055496, - "grad_norm": 0.3359375, - "learning_rate": 0.0001166420764563117, - "loss": 0.4459, + "epoch": 3.86489252814739, + "grad_norm": 0.322265625, + "learning_rate": 0.00012766988117532418, + "loss": 0.5425, "step": 3776 }, { - "epoch": 4.034151547491995, - "grad_norm": 0.337890625, - "learning_rate": 0.00011635910294684747, - "loss": 0.4952, + "epoch": 3.8689866939611055, + "grad_norm": 0.3359375, + "learning_rate": 0.00012739459456828967, + "loss": 0.5078, "step": 3780 }, { - "epoch": 4.0384204909284955, - "grad_norm": 0.3125, - "learning_rate": 0.0001160762554448715, - "loss": 0.4439, + "epoch": 3.873080859774821, + "grad_norm": 0.330078125, + "learning_rate": 0.00012711938584638823, + "loss": 0.4744, "step": 3784 }, { - "epoch": 4.042689434364995, - "grad_norm": 0.345703125, - "learning_rate": 0.0001157935350098355, - "loss": 0.4712, + "epoch": 3.8771750255885364, + "grad_norm": 0.341796875, + "learning_rate": 0.00012684425595782984, + "loss": 0.5257, "step": 3788 }, { - "epoch": 4.046958377801494, - "grad_norm": 0.33984375, - "learning_rate": 0.00011551094270071499, - "loss": 0.449, + "epoch": 3.881269191402252, + "grad_norm": 0.326171875, + "learning_rate": 0.00012656920585055263, + "loss": 0.4963, "step": 3792 }, { - "epoch": 4.0512273212379935, - "grad_norm": 0.34765625, - "learning_rate": 0.00011522847957600585, - "loss": 0.4341, + "epoch": 3.8853633572159674, + "grad_norm": 0.33203125, + "learning_rate": 0.00012629423647221992, + "loss": 0.5248, "step": 3796 }, { - "epoch": 4.055496264674493, - "grad_norm": 0.328125, - "learning_rate": 0.00011494614669371991, - "loss": 0.4613, + "epoch": 3.889457523029683, + "grad_norm": 0.326171875, + "learning_rate": 0.000126019348770217, + "loss": 0.5464, "step": 3800 }, { - "epoch": 4.059765208110993, - "grad_norm": 0.30859375, - "learning_rate": 0.00011466394511138123, - "loss": 0.4788, + "epoch": 3.8935516888433983, + "grad_norm": 0.349609375, + "learning_rate": 0.00012574454369164762, + "loss": 0.5515, "step": 3804 }, { - "epoch": 4.064034151547492, - "grad_norm": 0.328125, - "learning_rate": 0.00011438187588602205, - "loss": 0.4853, + "epoch": 3.8976458546571138, + "grad_norm": 0.32421875, + "learning_rate": 0.0001254698221833309, + "loss": 0.5136, "step": 3808 }, { - "epoch": 4.068303094983992, - "grad_norm": 0.341796875, - "learning_rate": 0.00011409994007417881, - "loss": 0.4509, + "epoch": 3.901740020470829, + "grad_norm": 0.337890625, + "learning_rate": 0.00012519518519179805, + "loss": 0.5189, "step": 3812 }, { - "epoch": 4.072572038420491, - "grad_norm": 0.349609375, - "learning_rate": 0.00011381813873188827, - "loss": 0.4562, + "epoch": 3.9058341862845447, + "grad_norm": 0.322265625, + "learning_rate": 0.00012492063366328899, + "loss": 0.561, "step": 3816 }, { - "epoch": 4.07684098185699, - "grad_norm": 0.349609375, - "learning_rate": 0.00011353647291468349, - "loss": 0.4604, + "epoch": 3.9099283520982597, + "grad_norm": 0.30859375, + "learning_rate": 0.0001246461685437494, + "loss": 0.4895, "step": 3820 }, { - "epoch": 4.0811099252934895, - "grad_norm": 0.314453125, - "learning_rate": 0.00011325494367759, - "loss": 0.4389, + "epoch": 3.9140225179119756, + "grad_norm": 0.32421875, + "learning_rate": 0.00012437179077882693, + "loss": 0.5158, "step": 3824 }, { - "epoch": 4.08537886872999, - "grad_norm": 0.3203125, - "learning_rate": 0.00011297355207512149, - "loss": 0.4375, + "epoch": 3.9181166837256907, + "grad_norm": 0.326171875, + "learning_rate": 0.00012409750131386858, + "loss": 0.5381, "step": 3828 }, { - "epoch": 4.089647812166489, - "grad_norm": 0.3359375, - "learning_rate": 0.00011269229916127645, - "loss": 0.4594, + "epoch": 3.9222108495394066, + "grad_norm": 0.326171875, + "learning_rate": 0.0001238233010939169, + "loss": 0.5346, "step": 3832 }, { - "epoch": 4.093916755602988, - "grad_norm": 0.337890625, - "learning_rate": 0.00011241118598953358, - "loss": 0.4653, + "epoch": 3.9263050153531216, + "grad_norm": 0.330078125, + "learning_rate": 0.000123549191063707, + "loss": 0.5082, "step": 3836 }, { - "epoch": 4.098185699039488, - "grad_norm": 0.330078125, - "learning_rate": 0.00011213021361284841, - "loss": 0.4702, + "epoch": 3.9303991811668375, + "grad_norm": 0.333984375, + "learning_rate": 0.00012327517216766327, + "loss": 0.4927, "step": 3840 }, { - "epoch": 4.102454642475987, - "grad_norm": 0.3359375, - "learning_rate": 0.00011184938308364886, - "loss": 0.4141, + "epoch": 3.9344933469805525, + "grad_norm": 0.328125, + "learning_rate": 0.00012300124534989616, + "loss": 0.5242, "step": 3844 }, { - "epoch": 4.106723585912487, - "grad_norm": 0.357421875, - "learning_rate": 0.0001115686954538318, - "loss": 0.5026, + "epoch": 3.9385875127942684, + "grad_norm": 0.318359375, + "learning_rate": 0.0001227274115541987, + "loss": 0.5478, "step": 3848 }, { - "epoch": 4.110992529348986, - "grad_norm": 0.33203125, - "learning_rate": 0.00011128815177475857, - "loss": 0.4381, + "epoch": 3.9426816786079835, + "grad_norm": 0.328125, + "learning_rate": 0.00012245367172404367, + "loss": 0.5329, "step": 3852 }, { - "epoch": 4.115261472785486, - "grad_norm": 0.33984375, - "learning_rate": 0.00011100775309725151, - "loss": 0.4856, + "epoch": 3.946775844421699, + "grad_norm": 0.33203125, + "learning_rate": 0.0001221800268025799, + "loss": 0.5041, "step": 3856 }, { - "epoch": 4.119530416221985, - "grad_norm": 0.326171875, - "learning_rate": 0.00011072750047158987, - "loss": 0.446, + "epoch": 3.9508700102354144, + "grad_norm": 0.330078125, + "learning_rate": 0.00012190647773262923, + "loss": 0.5311, "step": 3860 }, { - "epoch": 4.123799359658484, + "epoch": 3.95496417604913, "grad_norm": 0.328125, - "learning_rate": 0.0001104473949475056, - "loss": 0.4616, + "learning_rate": 0.0001216330254566834, + "loss": 0.5264, "step": 3864 }, { - "epoch": 4.128068303094984, - "grad_norm": 0.341796875, - "learning_rate": 0.00011016743757417995, - "loss": 0.4653, + "epoch": 3.9590583418628453, + "grad_norm": 0.314453125, + "learning_rate": 0.00012135967091690053, + "loss": 0.4574, "step": 3868 }, { - "epoch": 4.132337246531484, - "grad_norm": 0.333984375, - "learning_rate": 0.00010988762940023896, - "loss": 0.4561, + "epoch": 3.963152507676561, + "grad_norm": 0.326171875, + "learning_rate": 0.00012108641505510195, + "loss": 0.5259, "step": 3872 }, { - "epoch": 4.136606189967983, - "grad_norm": 0.333984375, - "learning_rate": 0.00010960797147375007, - "loss": 0.5195, + "epoch": 3.9672466734902763, + "grad_norm": 0.337890625, + "learning_rate": 0.00012081325881276917, + "loss": 0.5375, "step": 3876 }, { - "epoch": 4.140875133404482, - "grad_norm": 0.345703125, - "learning_rate": 0.00010932846484221774, - "loss": 0.4554, + "epoch": 3.9713408393039917, + "grad_norm": 0.3203125, + "learning_rate": 0.00012054020313104037, + "loss": 0.5233, "step": 3880 }, { - "epoch": 4.145144076840982, + "epoch": 3.975435005117707, "grad_norm": 0.3203125, - "learning_rate": 0.00010904911055257994, - "loss": 0.4183, + "learning_rate": 0.0001202672489507072, + "loss": 0.5807, "step": 3884 }, { - "epoch": 4.149413020277481, - "grad_norm": 0.33203125, - "learning_rate": 0.0001087699096512038, - "loss": 0.4471, + "epoch": 3.9795291709314227, + "grad_norm": 0.349609375, + "learning_rate": 0.00011999439721221173, + "loss": 0.493, "step": 3888 }, { - "epoch": 4.15368196371398, - "grad_norm": 0.326171875, - "learning_rate": 0.00010849086318388205, - "loss": 0.4709, + "epoch": 3.983623336745138, + "grad_norm": 0.37109375, + "learning_rate": 0.00011972164885564298, + "loss": 0.5157, "step": 3892 }, { - "epoch": 4.1579509071504805, - "grad_norm": 0.341796875, - "learning_rate": 0.00010821197219582894, - "loss": 0.446, + "epoch": 3.9877175025588536, + "grad_norm": 0.34765625, + "learning_rate": 0.00011944900482073375, + "loss": 0.4709, "step": 3896 }, { - "epoch": 4.16221985058698, - "grad_norm": 0.3359375, - "learning_rate": 0.00010793323773167628, - "loss": 0.4398, + "epoch": 3.991811668372569, + "grad_norm": 0.330078125, + "learning_rate": 0.00011917646604685753, + "loss": 0.5762, "step": 3900 }, { - "epoch": 4.166488794023479, - "grad_norm": 0.341796875, - "learning_rate": 0.00010765466083546973, - "loss": 0.4566, + "epoch": 3.9959058341862845, + "grad_norm": 0.322265625, + "learning_rate": 0.00011890403347302505, + "loss": 0.4861, "step": 3904 }, { - "epoch": 4.1707577374599785, - "grad_norm": 0.369140625, - "learning_rate": 0.00010737624255066458, - "loss": 0.4657, + "epoch": 4.0, + "grad_norm": 0.3359375, + "learning_rate": 0.00011863170803788112, + "loss": 0.495, "step": 3908 }, { - "epoch": 4.175026680896478, + "epoch": 4.004094165813715, "grad_norm": 0.306640625, - "learning_rate": 0.0001070979839201222, - "loss": 0.4301, + "learning_rate": 0.00011835949067970147, + "loss": 0.4461, "step": 3912 }, { - "epoch": 4.179295624332978, - "grad_norm": 0.357421875, - "learning_rate": 0.00010681988598610572, - "loss": 0.4653, + "epoch": 4.008188331627431, + "grad_norm": 0.2890625, + "learning_rate": 0.00011808738233638947, + "loss": 0.4565, "step": 3916 }, { - "epoch": 4.183564567769477, - "grad_norm": 0.314453125, - "learning_rate": 0.00010654194979027664, - "loss": 0.4484, + "epoch": 4.012282497441146, + "grad_norm": 0.33203125, + "learning_rate": 0.00011781538394547278, + "loss": 0.4521, "step": 3920 }, { - "epoch": 4.187833511205977, - "grad_norm": 0.345703125, - "learning_rate": 0.00010626417637369036, - "loss": 0.4923, + "epoch": 4.016376663254862, + "grad_norm": 0.29296875, + "learning_rate": 0.00011754349644410038, + "loss": 0.4518, "step": 3924 }, { - "epoch": 4.192102454642476, - "grad_norm": 0.3515625, - "learning_rate": 0.00010598656677679281, - "loss": 0.4887, + "epoch": 4.020470829068577, + "grad_norm": 0.3359375, + "learning_rate": 0.00011727172076903907, + "loss": 0.5021, "step": 3928 }, { - "epoch": 4.196371398078975, - "grad_norm": 0.328125, - "learning_rate": 0.00010570912203941605, - "loss": 0.4493, + "epoch": 4.024564994882293, + "grad_norm": 0.322265625, + "learning_rate": 0.00011700005785667038, + "loss": 0.4439, "step": 3932 }, { - "epoch": 4.2006403415154745, - "grad_norm": 0.330078125, - "learning_rate": 0.00010543184320077488, - "loss": 0.4353, + "epoch": 4.028659160696008, + "grad_norm": 0.3203125, + "learning_rate": 0.0001167285086429874, + "loss": 0.4796, "step": 3936 }, { - "epoch": 4.204909284951975, - "grad_norm": 0.333984375, - "learning_rate": 0.0001051547312994626, - "loss": 0.4318, + "epoch": 4.032753326509724, + "grad_norm": 0.328125, + "learning_rate": 0.00011645707406359143, + "loss": 0.4299, "step": 3940 }, { - "epoch": 4.209178228388474, - "grad_norm": 0.330078125, - "learning_rate": 0.00010487778737344711, - "loss": 0.4578, + "epoch": 4.036847492323439, + "grad_norm": 0.318359375, + "learning_rate": 0.00011618575505368872, + "loss": 0.4309, "step": 3944 }, { - "epoch": 4.213447171824973, - "grad_norm": 0.3359375, - "learning_rate": 0.00010460101246006738, - "loss": 0.4882, + "epoch": 4.040941658137155, + "grad_norm": 0.349609375, + "learning_rate": 0.00011591455254808753, + "loss": 0.4322, "step": 3948 }, { - "epoch": 4.217716115261473, - "grad_norm": 0.341796875, - "learning_rate": 0.00010432440759602902, - "loss": 0.4627, + "epoch": 4.04503582395087, + "grad_norm": 0.318359375, + "learning_rate": 0.00011564346748119453, + "loss": 0.4437, "step": 3952 }, { - "epoch": 4.221985058697972, - "grad_norm": 0.36328125, - "learning_rate": 0.00010404797381740096, - "loss": 0.4424, + "epoch": 4.049129989764586, + "grad_norm": 0.30859375, + "learning_rate": 0.00011537250078701184, + "loss": 0.4282, "step": 3956 }, { - "epoch": 4.226254002134472, - "grad_norm": 0.341796875, - "learning_rate": 0.00010377171215961106, - "loss": 0.4816, + "epoch": 4.053224155578301, + "grad_norm": 0.345703125, + "learning_rate": 0.0001151016533991337, + "loss": 0.4951, "step": 3960 }, { - "epoch": 4.230522945570971, - "grad_norm": 0.357421875, - "learning_rate": 0.0001034956236574427, - "loss": 0.4705, + "epoch": 4.0573183213920165, + "grad_norm": 0.3359375, + "learning_rate": 0.00011483092625074347, + "loss": 0.4729, "step": 3964 }, { - "epoch": 4.234791889007471, + "epoch": 4.061412487205732, "grad_norm": 0.345703125, - "learning_rate": 0.00010321970934503051, - "loss": 0.4949, + "learning_rate": 0.00011456032027460996, + "loss": 0.4921, "step": 3968 }, { - "epoch": 4.23906083244397, - "grad_norm": 0.34375, - "learning_rate": 0.00010294397025585676, - "loss": 0.4712, + "epoch": 4.0655066530194475, + "grad_norm": 0.337890625, + "learning_rate": 0.00011428983640308463, + "loss": 0.4696, "step": 3972 }, { - "epoch": 4.243329775880469, - "grad_norm": 0.33203125, - "learning_rate": 0.00010266840742274735, - "loss": 0.4776, + "epoch": 4.0696008188331625, + "grad_norm": 0.333984375, + "learning_rate": 0.00011401947556809827, + "loss": 0.4852, "step": 3976 }, { - "epoch": 4.247598719316969, - "grad_norm": 0.345703125, - "learning_rate": 0.00010239302187786795, - "loss": 0.4343, + "epoch": 4.073694984646878, + "grad_norm": 0.357421875, + "learning_rate": 0.00011374923870115769, + "loss": 0.4644, "step": 3980 }, { - "epoch": 4.251867662753469, - "grad_norm": 0.3515625, - "learning_rate": 0.00010211781465272028, - "loss": 0.4418, + "epoch": 4.077789150460593, + "grad_norm": 0.318359375, + "learning_rate": 0.00011347912673334255, + "loss": 0.46, "step": 3984 }, { - "epoch": 4.256136606189968, - "grad_norm": 0.330078125, - "learning_rate": 0.00010184278677813798, - "loss": 0.4419, + "epoch": 4.081883316274309, + "grad_norm": 0.3359375, + "learning_rate": 0.00011320914059530232, + "loss": 0.48, "step": 3988 }, { - "epoch": 4.2604055496264674, - "grad_norm": 0.345703125, - "learning_rate": 0.0001015679392842831, - "loss": 0.4555, + "epoch": 4.085977482088024, + "grad_norm": 0.32421875, + "learning_rate": 0.00011293928121725278, + "loss": 0.4841, "step": 3992 }, { - "epoch": 4.264674493062967, - "grad_norm": 0.359375, - "learning_rate": 0.00010129327320064179, - "loss": 0.4332, + "epoch": 4.09007164790174, + "grad_norm": 0.353515625, + "learning_rate": 0.00011266954952897305, + "loss": 0.4967, "step": 3996 }, { - "epoch": 4.268943436499466, - "grad_norm": 0.333984375, - "learning_rate": 0.000101018789556021, - "loss": 0.4498, + "epoch": 4.094165813715455, + "grad_norm": 0.33203125, + "learning_rate": 0.00011239994645980233, + "loss": 0.4563, "step": 4000 }, { - "epoch": 4.273212379935966, - "grad_norm": 0.34375, - "learning_rate": 0.000100744489378544, - "loss": 0.4756, + "epoch": 4.098259979529171, + "grad_norm": 0.359375, + "learning_rate": 0.00011213047293863659, + "loss": 0.4613, "step": 4004 }, { - "epoch": 4.277481323372466, - "grad_norm": 0.333984375, - "learning_rate": 0.0001004703736956471, - "loss": 0.47, + "epoch": 4.102354145342886, + "grad_norm": 0.322265625, + "learning_rate": 0.00011186112989392545, + "loss": 0.454, "step": 4008 }, { - "epoch": 4.281750266808965, - "grad_norm": 0.34765625, - "learning_rate": 0.00010019644353407556, - "loss": 0.4328, + "epoch": 4.106448311156602, + "grad_norm": 0.330078125, + "learning_rate": 0.00011159191825366912, + "loss": 0.4905, "step": 4012 }, { - "epoch": 4.286019210245464, - "grad_norm": 0.353515625, - "learning_rate": 9.99226999198795e-05, - "loss": 0.4681, + "epoch": 4.110542476970317, + "grad_norm": 0.33984375, + "learning_rate": 0.00011132283894541492, + "loss": 0.4467, "step": 4016 }, { - "epoch": 4.2902881536819635, - "grad_norm": 0.326171875, - "learning_rate": 9.964914387841055e-05, - "loss": 0.4508, + "epoch": 4.114636642784033, + "grad_norm": 0.3203125, + "learning_rate": 0.00011105389289625427, + "loss": 0.4868, "step": 4020 }, { - "epoch": 4.294557097118463, - "grad_norm": 0.349609375, - "learning_rate": 9.93757764343176e-05, - "loss": 0.463, + "epoch": 4.118730808597748, + "grad_norm": 0.373046875, + "learning_rate": 0.00011078508103281952, + "loss": 0.4499, "step": 4024 }, { - "epoch": 4.298826040554963, - "grad_norm": 0.33203125, - "learning_rate": 9.910259861154322e-05, - "loss": 0.4611, + "epoch": 4.122824974411464, + "grad_norm": 0.357421875, + "learning_rate": 0.00011051640428128062, + "loss": 0.4711, "step": 4028 }, { - "epoch": 4.303094983991462, - "grad_norm": 0.328125, - "learning_rate": 9.88296114333196e-05, - "loss": 0.4357, + "epoch": 4.126919140225179, + "grad_norm": 0.35546875, + "learning_rate": 0.00011024786356734199, + "loss": 0.458, "step": 4032 }, { - "epoch": 4.307363927427962, - "grad_norm": 0.3515625, - "learning_rate": 9.855681592216503e-05, - "loss": 0.4493, + "epoch": 4.131013306038895, + "grad_norm": 0.357421875, + "learning_rate": 0.00010997945981623944, + "loss": 0.5013, "step": 4036 }, { - "epoch": 4.311632870864461, - "grad_norm": 0.353515625, - "learning_rate": 9.828421309987966e-05, - "loss": 0.4656, + "epoch": 4.13510747185261, + "grad_norm": 0.349609375, + "learning_rate": 0.00010971119395273683, + "loss": 0.449, "step": 4040 }, { - "epoch": 4.31590181430096, - "grad_norm": 0.3671875, - "learning_rate": 9.801180398754204e-05, - "loss": 0.4473, + "epoch": 4.139201637666326, + "grad_norm": 0.337890625, + "learning_rate": 0.00010944306690112285, + "loss": 0.4601, "step": 4044 }, { - "epoch": 4.3201707577374595, - "grad_norm": 0.318359375, - "learning_rate": 9.773958960550509e-05, - "loss": 0.3999, + "epoch": 4.143295803480041, + "grad_norm": 0.35546875, + "learning_rate": 0.00010917507958520812, + "loss": 0.4678, "step": 4048 }, { - "epoch": 4.32443970117396, - "grad_norm": 0.353515625, - "learning_rate": 9.746757097339238e-05, - "loss": 0.4582, + "epoch": 4.147389969293757, + "grad_norm": 0.333984375, + "learning_rate": 0.00010890723292832163, + "loss": 0.4787, "step": 4052 }, { - "epoch": 4.328708644610459, - "grad_norm": 0.359375, - "learning_rate": 9.719574911009422e-05, - "loss": 0.488, + "epoch": 4.151484135107472, + "grad_norm": 0.3203125, + "learning_rate": 0.00010863952785330779, + "loss": 0.4761, "step": 4056 }, { - "epoch": 4.332977588046958, - "grad_norm": 0.337890625, - "learning_rate": 9.692412503376388e-05, - "loss": 0.4812, + "epoch": 4.155578300921187, + "grad_norm": 0.353515625, + "learning_rate": 0.0001083719652825233, + "loss": 0.4994, "step": 4060 }, { - "epoch": 4.337246531483458, - "grad_norm": 0.337890625, - "learning_rate": 9.665269976181389e-05, - "loss": 0.4514, + "epoch": 4.159672466734903, + "grad_norm": 0.3671875, + "learning_rate": 0.00010810454613783376, + "loss": 0.5027, "step": 4064 }, { - "epoch": 4.341515474919957, - "grad_norm": 0.34375, - "learning_rate": 9.638147431091193e-05, - "loss": 0.4569, + "epoch": 4.163766632548619, + "grad_norm": 0.337890625, + "learning_rate": 0.0001078372713406106, + "loss": 0.4376, "step": 4068 }, { - "epoch": 4.345784418356457, - "grad_norm": 0.330078125, - "learning_rate": 9.611044969697745e-05, - "loss": 0.4836, + "epoch": 4.167860798362334, + "grad_norm": 0.34765625, + "learning_rate": 0.00010757014181172803, + "loss": 0.4762, "step": 4072 }, { - "epoch": 4.350053361792956, - "grad_norm": 0.349609375, - "learning_rate": 9.583962693517744e-05, - "loss": 0.4565, + "epoch": 4.171954964176049, + "grad_norm": 0.3203125, + "learning_rate": 0.00010730315847155966, + "loss": 0.4696, "step": 4076 }, { - "epoch": 4.354322305229456, - "grad_norm": 0.3671875, - "learning_rate": 9.556900703992299e-05, - "loss": 0.4488, + "epoch": 4.176049129989765, + "grad_norm": 0.345703125, + "learning_rate": 0.00010703632223997534, + "loss": 0.4367, "step": 4080 }, { - "epoch": 4.358591248665955, - "grad_norm": 0.330078125, - "learning_rate": 9.529859102486508e-05, - "loss": 0.4646, + "epoch": 4.18014329580348, + "grad_norm": 0.349609375, + "learning_rate": 0.00010676963403633828, + "loss": 0.4768, "step": 4084 }, { - "epoch": 4.362860192102454, - "grad_norm": 0.34765625, - "learning_rate": 9.502837990289129e-05, - "loss": 0.4815, + "epoch": 4.184237461617196, + "grad_norm": 0.326171875, + "learning_rate": 0.00010650309477950149, + "loss": 0.4386, "step": 4088 }, { - "epoch": 4.3671291355389545, - "grad_norm": 0.361328125, - "learning_rate": 9.475837468612162e-05, - "loss": 0.4595, + "epoch": 4.188331627430911, + "grad_norm": 0.33203125, + "learning_rate": 0.00010623670538780487, + "loss": 0.4399, "step": 4092 }, { - "epoch": 4.371398078975454, - "grad_norm": 0.36328125, - "learning_rate": 9.448857638590476e-05, - "loss": 0.4787, + "epoch": 4.1924257932446265, + "grad_norm": 0.341796875, + "learning_rate": 0.00010597046677907198, + "loss": 0.4919, "step": 4096 }, { - "epoch": 4.375667022411953, - "grad_norm": 0.35546875, - "learning_rate": 9.421898601281451e-05, - "loss": 0.4634, + "epoch": 4.1965199590583415, + "grad_norm": 0.36328125, + "learning_rate": 0.0001057043798706068, + "loss": 0.4441, "step": 4100 }, { - "epoch": 4.3799359658484525, - "grad_norm": 0.341796875, - "learning_rate": 9.394960457664564e-05, - "loss": 0.4688, + "epoch": 4.200614124872057, + "grad_norm": 0.32421875, + "learning_rate": 0.00010543844557919073, + "loss": 0.4588, "step": 4104 }, { - "epoch": 4.384204909284952, - "grad_norm": 0.337890625, - "learning_rate": 9.368043308641054e-05, - "loss": 0.4529, + "epoch": 4.2047082906857725, + "grad_norm": 0.35546875, + "learning_rate": 0.00010517266482107927, + "loss": 0.4188, "step": 4108 }, { - "epoch": 4.388473852721451, - "grad_norm": 0.3515625, - "learning_rate": 9.341147255033498e-05, - "loss": 0.4625, + "epoch": 4.208802456499488, + "grad_norm": 0.333984375, + "learning_rate": 0.00010490703851199903, + "loss": 0.5049, "step": 4112 }, { - "epoch": 4.392742796157951, - "grad_norm": 0.337890625, - "learning_rate": 9.314272397585482e-05, - "loss": 0.442, + "epoch": 4.212896622313203, + "grad_norm": 0.345703125, + "learning_rate": 0.00010464156756714434, + "loss": 0.4469, "step": 4116 }, { - "epoch": 4.397011739594451, - "grad_norm": 0.37109375, - "learning_rate": 9.287418836961173e-05, - "loss": 0.4462, + "epoch": 4.216990788126919, + "grad_norm": 0.328125, + "learning_rate": 0.00010437625290117429, + "loss": 0.5029, "step": 4120 }, { - "epoch": 4.40128068303095, - "grad_norm": 0.330078125, - "learning_rate": 9.260586673744986e-05, - "loss": 0.4353, + "epoch": 4.221084953940634, + "grad_norm": 0.3515625, + "learning_rate": 0.00010411109542820963, + "loss": 0.4443, "step": 4124 }, { - "epoch": 4.405549626467449, - "grad_norm": 0.349609375, - "learning_rate": 9.233776008441183e-05, - "loss": 0.4606, + "epoch": 4.22517911975435, + "grad_norm": 0.353515625, + "learning_rate": 0.00010384609606182933, + "loss": 0.4548, "step": 4128 }, { - "epoch": 4.4098185699039485, - "grad_norm": 0.333984375, - "learning_rate": 9.206986941473496e-05, - "loss": 0.4685, + "epoch": 4.229273285568065, + "grad_norm": 0.33203125, + "learning_rate": 0.00010358125571506772, + "loss": 0.427, "step": 4132 }, { - "epoch": 4.414087513340448, - "grad_norm": 0.361328125, - "learning_rate": 9.180219573184765e-05, - "loss": 0.4664, + "epoch": 4.233367451381781, + "grad_norm": 0.337890625, + "learning_rate": 0.00010331657530041128, + "loss": 0.4632, "step": 4136 }, { - "epoch": 4.418356456776948, - "grad_norm": 0.337890625, - "learning_rate": 9.153474003836552e-05, - "loss": 0.4663, + "epoch": 4.237461617195496, + "grad_norm": 0.35546875, + "learning_rate": 0.00010305205572979536, + "loss": 0.48, "step": 4140 }, { - "epoch": 4.422625400213447, - "grad_norm": 0.337890625, - "learning_rate": 9.126750333608762e-05, - "loss": 0.4855, + "epoch": 4.241555783009212, + "grad_norm": 0.33984375, + "learning_rate": 0.00010278769791460118, + "loss": 0.4348, "step": 4144 }, { - "epoch": 4.426894343649947, - "grad_norm": 0.33203125, - "learning_rate": 9.10004866259928e-05, - "loss": 0.4489, + "epoch": 4.245649948822927, + "grad_norm": 0.35546875, + "learning_rate": 0.00010252350276565269, + "loss": 0.4493, "step": 4148 }, { - "epoch": 4.431163287086446, - "grad_norm": 0.369140625, - "learning_rate": 9.07336909082359e-05, - "loss": 0.4741, + "epoch": 4.249744114636643, + "grad_norm": 0.3359375, + "learning_rate": 0.00010225947119321326, + "loss": 0.4539, "step": 4152 }, { - "epoch": 4.435432230522945, - "grad_norm": 0.361328125, - "learning_rate": 9.046711718214386e-05, - "loss": 0.4946, + "epoch": 4.253838280450358, + "grad_norm": 0.35546875, + "learning_rate": 0.00010199560410698284, + "loss": 0.4563, "step": 4156 }, { - "epoch": 4.439701173959445, - "grad_norm": 0.37890625, - "learning_rate": 9.020076644621231e-05, - "loss": 0.4543, + "epoch": 4.257932446264074, + "grad_norm": 0.333984375, + "learning_rate": 0.00010173190241609452, + "loss": 0.4621, "step": 4160 }, { - "epoch": 4.443970117395945, - "grad_norm": 0.33984375, - "learning_rate": 8.993463969810156e-05, - "loss": 0.4739, + "epoch": 4.262026612077789, + "grad_norm": 0.328125, + "learning_rate": 0.00010146836702911154, + "loss": 0.445, "step": 4164 }, { - "epoch": 4.448239060832444, - "grad_norm": 0.34375, - "learning_rate": 8.966873793463286e-05, - "loss": 0.4391, + "epoch": 4.266120777891505, + "grad_norm": 0.328125, + "learning_rate": 0.00010120499885402429, + "loss": 0.4708, "step": 4168 }, { - "epoch": 4.452508004268943, - "grad_norm": 0.34375, - "learning_rate": 8.940306215178488e-05, - "loss": 0.485, + "epoch": 4.27021494370522, + "grad_norm": 0.361328125, + "learning_rate": 0.00010094179879824689, + "loss": 0.4508, "step": 4172 }, { - "epoch": 4.456776947705443, - "grad_norm": 0.33203125, - "learning_rate": 8.913761334468965e-05, - "loss": 0.4527, + "epoch": 4.274309109518936, + "grad_norm": 0.345703125, + "learning_rate": 0.00010067876776861423, + "loss": 0.4957, "step": 4176 }, { - "epoch": 4.461045891141943, - "grad_norm": 0.357421875, - "learning_rate": 8.887239250762926e-05, - "loss": 0.477, + "epoch": 4.278403275332651, + "grad_norm": 0.34375, + "learning_rate": 0.00010041590667137899, + "loss": 0.4882, "step": 4180 }, { - "epoch": 4.465314834578442, - "grad_norm": 0.357421875, - "learning_rate": 8.860740063403169e-05, - "loss": 0.4881, + "epoch": 4.282497441146367, + "grad_norm": 0.337890625, + "learning_rate": 0.00010015321641220816, + "loss": 0.4791, "step": 4184 }, { - "epoch": 4.4695837780149414, - "grad_norm": 0.333984375, - "learning_rate": 8.834263871646747e-05, - "loss": 0.3817, + "epoch": 4.286591606960082, + "grad_norm": 0.34375, + "learning_rate": 9.989069789618023e-05, + "loss": 0.4626, "step": 4188 }, { - "epoch": 4.473852721451441, - "grad_norm": 0.34765625, - "learning_rate": 8.807810774664562e-05, - "loss": 0.4121, + "epoch": 4.290685772773798, + "grad_norm": 0.330078125, + "learning_rate": 9.9628352027782e-05, + "loss": 0.4212, "step": 4192 }, { - "epoch": 4.47812166488794, - "grad_norm": 0.341796875, - "learning_rate": 8.781380871541027e-05, - "loss": 0.416, + "epoch": 4.294779938587513, + "grad_norm": 0.34765625, + "learning_rate": 9.93661797109053e-05, + "loss": 0.4547, "step": 4196 }, { - "epoch": 4.482390608324439, - "grad_norm": 0.373046875, - "learning_rate": 8.754974261273675e-05, - "loss": 0.4839, + "epoch": 4.298874104401229, + "grad_norm": 0.330078125, + "learning_rate": 9.910418184884408e-05, + "loss": 0.457, "step": 4200 }, { - "epoch": 4.48665955176094, - "grad_norm": 0.345703125, - "learning_rate": 8.72859104277278e-05, - "loss": 0.4641, + "epoch": 4.302968270214944, + "grad_norm": 0.326171875, + "learning_rate": 9.884235934429126e-05, + "loss": 0.424, "step": 4204 }, { - "epoch": 4.490928495197439, - "grad_norm": 0.337890625, - "learning_rate": 8.702231314861016e-05, - "loss": 0.476, + "epoch": 4.30706243602866, + "grad_norm": 0.34765625, + "learning_rate": 9.858071309933554e-05, + "loss": 0.4666, "step": 4208 }, { - "epoch": 4.495197438633938, - "grad_norm": 0.345703125, - "learning_rate": 8.675895176273049e-05, - "loss": 0.4549, + "epoch": 4.311156601842375, + "grad_norm": 0.330078125, + "learning_rate": 9.831924401545822e-05, + "loss": 0.4658, "step": 4212 }, { - "epoch": 4.4994663820704375, - "grad_norm": 0.35546875, - "learning_rate": 8.64958272565521e-05, - "loss": 0.5211, + "epoch": 4.3152507676560905, + "grad_norm": 0.353515625, + "learning_rate": 9.805795299353042e-05, + "loss": 0.4881, "step": 4216 }, { - "epoch": 4.503735325506937, - "grad_norm": 0.33984375, - "learning_rate": 8.623294061565081e-05, - "loss": 0.4672, + "epoch": 4.3193449334698055, + "grad_norm": 0.326171875, + "learning_rate": 9.77968409338096e-05, + "loss": 0.4483, "step": 4220 }, { - "epoch": 4.508004268943436, - "grad_norm": 0.375, - "learning_rate": 8.597029282471167e-05, - "loss": 0.4948, + "epoch": 4.3234390992835205, + "grad_norm": 0.359375, + "learning_rate": 9.753590873593667e-05, + "loss": 0.4651, "step": 4224 }, { - "epoch": 4.512273212379936, - "grad_norm": 0.341796875, - "learning_rate": 8.570788486752487e-05, - "loss": 0.4638, + "epoch": 4.3275332650972365, + "grad_norm": 0.345703125, + "learning_rate": 9.727515729893288e-05, + "loss": 0.4345, "step": 4228 }, { - "epoch": 4.516542155816436, - "grad_norm": 0.353515625, - "learning_rate": 8.544571772698247e-05, - "loss": 0.4763, + "epoch": 4.3316274309109515, + "grad_norm": 0.34375, + "learning_rate": 9.701458752119661e-05, + "loss": 0.4293, "step": 4232 }, { - "epoch": 4.520811099252935, - "grad_norm": 0.33984375, - "learning_rate": 8.518379238507444e-05, - "loss": 0.4538, + "epoch": 4.335721596724667, + "grad_norm": 0.34375, + "learning_rate": 9.675420030050035e-05, + "loss": 0.4462, "step": 4236 }, { - "epoch": 4.525080042689434, - "grad_norm": 0.33203125, - "learning_rate": 8.492210982288497e-05, - "loss": 0.4507, + "epoch": 4.339815762538382, + "grad_norm": 0.333984375, + "learning_rate": 9.649399653398771e-05, + "loss": 0.4796, "step": 4240 }, { - "epoch": 4.5293489861259335, - "grad_norm": 0.333984375, - "learning_rate": 8.466067102058902e-05, - "loss": 0.4623, + "epoch": 4.343909928352098, + "grad_norm": 0.337890625, + "learning_rate": 9.623397711817012e-05, + "loss": 0.4458, "step": 4244 }, { - "epoch": 4.533617929562434, - "grad_norm": 0.3359375, - "learning_rate": 8.439947695744837e-05, - "loss": 0.5011, + "epoch": 4.348004094165813, + "grad_norm": 0.341796875, + "learning_rate": 9.597414294892379e-05, + "loss": 0.4275, "step": 4248 }, { - "epoch": 4.537886872998933, - "grad_norm": 0.34765625, - "learning_rate": 8.413852861180813e-05, - "loss": 0.4607, + "epoch": 4.352098259979529, + "grad_norm": 0.365234375, + "learning_rate": 9.571449492148686e-05, + "loss": 0.5074, "step": 4252 }, { - "epoch": 4.542155816435432, - "grad_norm": 0.3515625, - "learning_rate": 8.387782696109313e-05, - "loss": 0.4781, + "epoch": 4.356192425793244, + "grad_norm": 0.357421875, + "learning_rate": 9.545503393045605e-05, + "loss": 0.4946, "step": 4256 }, { - "epoch": 4.546424759871932, - "grad_norm": 0.3671875, - "learning_rate": 8.3617372981804e-05, - "loss": 0.4963, + "epoch": 4.36028659160696, + "grad_norm": 0.375, + "learning_rate": 9.519576086978357e-05, + "loss": 0.5084, "step": 4260 }, { - "epoch": 4.550693703308431, - "grad_norm": 0.3671875, - "learning_rate": 8.335716764951387e-05, - "loss": 0.5218, + "epoch": 4.364380757420675, + "grad_norm": 0.34375, + "learning_rate": 9.493667663277424e-05, + "loss": 0.4977, "step": 4264 }, { - "epoch": 4.554962646744931, - "grad_norm": 0.34375, - "learning_rate": 8.30972119388643e-05, - "loss": 0.4628, + "epoch": 4.368474923234391, + "grad_norm": 0.3515625, + "learning_rate": 9.467778211208231e-05, + "loss": 0.4551, "step": 4268 }, { - "epoch": 4.55923159018143, - "grad_norm": 0.31640625, - "learning_rate": 8.283750682356213e-05, - "loss": 0.4196, + "epoch": 4.372569089048106, + "grad_norm": 0.3359375, + "learning_rate": 9.441907819970826e-05, + "loss": 0.4583, "step": 4272 }, { - "epoch": 4.56350053361793, - "grad_norm": 0.341796875, - "learning_rate": 8.257805327637521e-05, - "loss": 0.4553, + "epoch": 4.376663254861822, + "grad_norm": 0.33984375, + "learning_rate": 9.416056578699593e-05, + "loss": 0.4288, "step": 4276 }, { - "epoch": 4.567769477054429, - "grad_norm": 0.353515625, - "learning_rate": 8.231885226912942e-05, - "loss": 0.4456, + "epoch": 4.380757420675537, + "grad_norm": 0.349609375, + "learning_rate": 9.39022457646294e-05, + "loss": 0.4798, "step": 4280 }, { - "epoch": 4.572038420490928, - "grad_norm": 0.365234375, - "learning_rate": 8.205990477270449e-05, - "loss": 0.4715, + "epoch": 4.384851586489253, + "grad_norm": 0.341796875, + "learning_rate": 9.364411902262982e-05, + "loss": 0.4408, "step": 4284 }, { - "epoch": 4.576307363927428, - "grad_norm": 0.328125, - "learning_rate": 8.180121175703076e-05, - "loss": 0.4911, + "epoch": 4.388945752302968, + "grad_norm": 0.349609375, + "learning_rate": 9.33861864503524e-05, + "loss": 0.4388, "step": 4288 }, { - "epoch": 4.580576307363927, - "grad_norm": 0.33203125, - "learning_rate": 8.154277419108515e-05, - "loss": 0.4513, + "epoch": 4.393039918116684, + "grad_norm": 0.353515625, + "learning_rate": 9.31284489364834e-05, + "loss": 0.48, "step": 4292 }, { - "epoch": 4.584845250800427, + "epoch": 4.397134083930399, "grad_norm": 0.349609375, - "learning_rate": 8.128459304288802e-05, - "loss": 0.4427, + "learning_rate": 9.287090736903701e-05, + "loss": 0.4625, "step": 4296 }, { - "epoch": 4.5891141942369265, - "grad_norm": 0.34375, - "learning_rate": 8.1026669279499e-05, - "loss": 0.4801, + "epoch": 4.401228249744115, + "grad_norm": 0.33984375, + "learning_rate": 9.261356263535225e-05, + "loss": 0.483, "step": 4300 }, { - "epoch": 4.593383137673426, - "grad_norm": 0.37890625, - "learning_rate": 8.076900386701393e-05, - "loss": 0.4279, + "epoch": 4.40532241555783, + "grad_norm": 0.33984375, + "learning_rate": 9.235641562209006e-05, + "loss": 0.4827, "step": 4304 }, { - "epoch": 4.597652081109925, - "grad_norm": 0.330078125, - "learning_rate": 8.051159777056063e-05, - "loss": 0.4912, + "epoch": 4.409416581371546, + "grad_norm": 0.328125, + "learning_rate": 9.209946721523007e-05, + "loss": 0.4702, "step": 4308 }, { - "epoch": 4.601921024546424, - "grad_norm": 0.3359375, - "learning_rate": 8.025445195429587e-05, - "loss": 0.4679, + "epoch": 4.413510747185261, + "grad_norm": 0.33984375, + "learning_rate": 9.184271830006764e-05, + "loss": 0.479, "step": 4312 }, { - "epoch": 4.606189967982925, - "grad_norm": 0.3515625, - "learning_rate": 7.999756738140148e-05, - "loss": 0.4619, + "epoch": 4.417604912998977, + "grad_norm": 0.35546875, + "learning_rate": 9.158616976121085e-05, + "loss": 0.4666, "step": 4316 }, { - "epoch": 4.610458911419424, + "epoch": 4.421699078812692, "grad_norm": 0.353515625, - "learning_rate": 7.974094501408056e-05, - "loss": 0.4972, + "learning_rate": 9.132982248257736e-05, + "loss": 0.4342, "step": 4320 }, { - "epoch": 4.614727854855923, - "grad_norm": 0.359375, - "learning_rate": 7.948458581355431e-05, - "loss": 0.5111, + "epoch": 4.425793244626408, + "grad_norm": 0.34375, + "learning_rate": 9.107367734739135e-05, + "loss": 0.4567, "step": 4324 }, { - "epoch": 4.6189967982924225, - "grad_norm": 0.369140625, - "learning_rate": 7.9228490740058e-05, - "loss": 0.456, - "step": 4328 + "epoch": 4.429887410440123, + "grad_norm": 0.3515625, + "learning_rate": 9.081773523818064e-05, + "loss": 0.4731, + "step": 4328 }, { - "epoch": 4.623265741728922, - "grad_norm": 0.375, - "learning_rate": 7.897266075283774e-05, - "loss": 0.4702, + "epoch": 4.433981576253839, + "grad_norm": 0.337890625, + "learning_rate": 9.05619970367735e-05, + "loss": 0.4324, "step": 4332 }, { - "epoch": 4.627534685165422, - "grad_norm": 0.35546875, - "learning_rate": 7.871709681014658e-05, - "loss": 0.4795, + "epoch": 4.438075742067554, + "grad_norm": 0.3359375, + "learning_rate": 9.030646362429553e-05, + "loss": 0.4642, "step": 4336 }, { - "epoch": 4.631803628601921, - "grad_norm": 0.333984375, - "learning_rate": 7.846179986924117e-05, - "loss": 0.4402, + "epoch": 4.4421699078812695, + "grad_norm": 0.376953125, + "learning_rate": 9.005113588116699e-05, + "loss": 0.5286, "step": 4340 }, { - "epoch": 4.636072572038421, - "grad_norm": 0.34765625, - "learning_rate": 7.820677088637793e-05, - "loss": 0.4795, + "epoch": 4.4462640736949846, + "grad_norm": 0.375, + "learning_rate": 8.979601468709933e-05, + "loss": 0.4724, "step": 4344 }, { - "epoch": 4.64034151547492, - "grad_norm": 0.349609375, - "learning_rate": 7.795201081680972e-05, - "loss": 0.4306, + "epoch": 4.4503582395087005, + "grad_norm": 0.3203125, + "learning_rate": 8.954110092109238e-05, + "loss": 0.4447, "step": 4348 }, { - "epoch": 4.644610458911419, - "grad_norm": 0.349609375, - "learning_rate": 7.769752061478213e-05, - "loss": 0.4304, + "epoch": 4.4544524053224155, + "grad_norm": 0.357421875, + "learning_rate": 8.928639546143135e-05, + "loss": 0.4771, "step": 4352 }, { - "epoch": 4.6488794023479185, - "grad_norm": 0.359375, - "learning_rate": 7.744330123352978e-05, - "loss": 0.4798, + "epoch": 4.458546571136131, + "grad_norm": 0.353515625, + "learning_rate": 8.903189918568372e-05, + "loss": 0.4557, "step": 4356 }, { - "epoch": 4.653148345784419, - "grad_norm": 0.365234375, - "learning_rate": 7.71893536252731e-05, - "loss": 0.4586, + "epoch": 4.462640736949846, + "grad_norm": 0.3515625, + "learning_rate": 8.877761297069622e-05, + "loss": 0.4514, "step": 4360 }, { - "epoch": 4.657417289220918, - "grad_norm": 0.33203125, - "learning_rate": 7.693567874121436e-05, - "loss": 0.4106, + "epoch": 4.466734902763562, + "grad_norm": 0.349609375, + "learning_rate": 8.85235376925919e-05, + "loss": 0.455, "step": 4364 }, { - "epoch": 4.661686232657417, - "grad_norm": 0.3359375, - "learning_rate": 7.668227753153444e-05, - "loss": 0.5, + "epoch": 4.470829068577277, + "grad_norm": 0.33203125, + "learning_rate": 8.826967422676698e-05, + "loss": 0.4382, "step": 4368 }, { - "epoch": 4.665955176093917, - "grad_norm": 0.349609375, - "learning_rate": 7.642915094538898e-05, - "loss": 0.4805, + "epoch": 4.474923234390992, + "grad_norm": 0.359375, + "learning_rate": 8.801602344788783e-05, + "loss": 0.4872, "step": 4372 }, { - "epoch": 4.670224119530416, - "grad_norm": 0.345703125, - "learning_rate": 7.617629993090516e-05, - "loss": 0.4428, + "epoch": 4.479017400204708, + "grad_norm": 0.365234375, + "learning_rate": 8.776258622988823e-05, + "loss": 0.4487, "step": 4376 }, { - "epoch": 4.674493062966915, - "grad_norm": 0.3203125, - "learning_rate": 7.592372543517772e-05, - "loss": 0.4533, + "epoch": 4.483111566018424, + "grad_norm": 0.375, + "learning_rate": 8.750936344596597e-05, + "loss": 0.4817, "step": 4380 }, { - "epoch": 4.678762006403415, - "grad_norm": 0.345703125, - "learning_rate": 7.56714284042659e-05, - "loss": 0.488, + "epoch": 4.487205731832139, + "grad_norm": 0.34375, + "learning_rate": 8.725635596858014e-05, + "loss": 0.4282, "step": 4384 }, { - "epoch": 4.683030949839915, - "grad_norm": 0.357421875, - "learning_rate": 7.541940978318952e-05, - "loss": 0.4891, + "epoch": 4.491299897645854, + "grad_norm": 0.353515625, + "learning_rate": 8.700356466944786e-05, + "loss": 0.4675, "step": 4388 }, { - "epoch": 4.687299893276414, - "grad_norm": 0.349609375, - "learning_rate": 7.516767051592553e-05, - "loss": 0.4062, + "epoch": 4.49539406345957, + "grad_norm": 0.34375, + "learning_rate": 8.675099041954158e-05, + "loss": 0.4866, "step": 4392 }, { - "epoch": 4.691568836712913, - "grad_norm": 0.32421875, - "learning_rate": 7.49162115454047e-05, - "loss": 0.4521, + "epoch": 4.499488229273285, + "grad_norm": 0.353515625, + "learning_rate": 8.649863408908586e-05, + "loss": 0.5158, "step": 4396 }, { - "epoch": 4.695837780149413, - "grad_norm": 0.32421875, - "learning_rate": 7.466503381350762e-05, - "loss": 0.4607, + "epoch": 4.503582395087001, + "grad_norm": 0.375, + "learning_rate": 8.62464965475544e-05, + "loss": 0.427, "step": 4400 }, { - "epoch": 4.700106723585913, - "grad_norm": 0.357421875, - "learning_rate": 7.441413826106176e-05, - "loss": 0.4585, + "epoch": 4.507676560900716, + "grad_norm": 0.3515625, + "learning_rate": 8.599457866366725e-05, + "loss": 0.4479, "step": 4404 }, { - "epoch": 4.704375667022412, - "grad_norm": 0.353515625, - "learning_rate": 7.416352582783738e-05, - "loss": 0.4606, + "epoch": 4.511770726714432, + "grad_norm": 0.33984375, + "learning_rate": 8.574288130538736e-05, + "loss": 0.4869, "step": 4408 }, { - "epoch": 4.7086446104589115, - "grad_norm": 0.34765625, - "learning_rate": 7.391319745254448e-05, - "loss": 0.4655, + "epoch": 4.515864892528147, + "grad_norm": 0.337890625, + "learning_rate": 8.549140533991807e-05, + "loss": 0.4635, "step": 4412 }, { - "epoch": 4.712913553895411, - "grad_norm": 0.384765625, - "learning_rate": 7.366315407282888e-05, - "loss": 0.4625, + "epoch": 4.519959058341863, + "grad_norm": 0.359375, + "learning_rate": 8.524015163369993e-05, + "loss": 0.4592, "step": 4416 }, { - "epoch": 4.71718249733191, - "grad_norm": 0.345703125, - "learning_rate": 7.341339662526908e-05, - "loss": 0.4787, + "epoch": 4.524053224155578, + "grad_norm": 0.330078125, + "learning_rate": 8.498912105240773e-05, + "loss": 0.4623, "step": 4420 }, { - "epoch": 4.72145144076841, + "epoch": 4.528147389969294, "grad_norm": 0.359375, - "learning_rate": 7.316392604537243e-05, - "loss": 0.4552, + "learning_rate": 8.473831446094733e-05, + "loss": 0.4546, "step": 4424 }, { - "epoch": 4.72572038420491, - "grad_norm": 0.37109375, - "learning_rate": 7.291474326757192e-05, - "loss": 0.5086, + "epoch": 4.532241555783009, + "grad_norm": 0.34765625, + "learning_rate": 8.448773272345298e-05, + "loss": 0.5183, "step": 4428 }, { - "epoch": 4.729989327641409, - "grad_norm": 0.35546875, - "learning_rate": 7.26658492252224e-05, - "loss": 0.4603, + "epoch": 4.536335721596725, + "grad_norm": 0.361328125, + "learning_rate": 8.423737670328432e-05, + "loss": 0.4457, "step": 4432 }, { - "epoch": 4.734258271077908, - "grad_norm": 0.36328125, - "learning_rate": 7.241724485059719e-05, - "loss": 0.4938, + "epoch": 4.54042988741044, + "grad_norm": 0.3203125, + "learning_rate": 8.398724726302301e-05, + "loss": 0.4337, "step": 4436 }, { - "epoch": 4.7385272145144075, - "grad_norm": 0.34765625, - "learning_rate": 7.21689310748848e-05, - "loss": 0.4472, + "epoch": 4.544524053224156, + "grad_norm": 0.359375, + "learning_rate": 8.373734526447032e-05, + "loss": 0.4435, "step": 4440 }, { - "epoch": 4.742796157950907, - "grad_norm": 0.33984375, - "learning_rate": 7.192090882818502e-05, - "loss": 0.4911, + "epoch": 4.548618219037871, + "grad_norm": 0.34375, + "learning_rate": 8.348767156864382e-05, + "loss": 0.4753, "step": 4444 }, { - "epoch": 4.747065101387407, - "grad_norm": 0.3359375, - "learning_rate": 7.167317903950591e-05, - "loss": 0.4696, + "epoch": 4.552712384851587, + "grad_norm": 0.35546875, + "learning_rate": 8.323822703577431e-05, + "loss": 0.4746, "step": 4448 }, { - "epoch": 4.751334044823906, - "grad_norm": 0.361328125, - "learning_rate": 7.142574263675983e-05, - "loss": 0.4716, + "epoch": 4.556806550665302, + "grad_norm": 0.37109375, + "learning_rate": 8.298901252530326e-05, + "loss": 0.44, "step": 4452 }, { - "epoch": 4.755602988260406, - "grad_norm": 0.357421875, - "learning_rate": 7.117860054676048e-05, - "loss": 0.4309, + "epoch": 4.560900716479018, + "grad_norm": 0.32421875, + "learning_rate": 8.274002889587954e-05, + "loss": 0.4366, "step": 4456 }, { - "epoch": 4.759871931696905, - "grad_norm": 0.353515625, - "learning_rate": 7.093175369521887e-05, - "loss": 0.4704, + "epoch": 4.564994882292733, + "grad_norm": 0.33984375, + "learning_rate": 8.249127700535643e-05, + "loss": 0.4715, "step": 4460 }, { - "epoch": 4.764140875133404, - "grad_norm": 0.3359375, - "learning_rate": 7.068520300674038e-05, - "loss": 0.4772, + "epoch": 4.569089048106449, + "grad_norm": 0.333984375, + "learning_rate": 8.224275771078889e-05, + "loss": 0.437, "step": 4464 }, { - "epoch": 4.7684098185699035, - "grad_norm": 0.328125, - "learning_rate": 7.043894940482106e-05, - "loss": 0.3981, + "epoch": 4.573183213920164, + "grad_norm": 0.34765625, + "learning_rate": 8.199447186843051e-05, + "loss": 0.4556, "step": 4468 }, { - "epoch": 4.772678762006404, - "grad_norm": 0.33203125, - "learning_rate": 7.019299381184396e-05, - "loss": 0.4228, + "epoch": 4.5772773797338795, + "grad_norm": 0.3671875, + "learning_rate": 8.174642033373037e-05, + "loss": 0.438, "step": 4472 }, { - "epoch": 4.776947705442903, - "grad_norm": 0.34765625, - "learning_rate": 6.994733714907614e-05, - "loss": 0.4329, + "epoch": 4.5813715455475945, + "grad_norm": 0.3515625, + "learning_rate": 8.149860396133048e-05, + "loss": 0.4682, "step": 4476 }, { - "epoch": 4.781216648879402, - "grad_norm": 0.36328125, - "learning_rate": 6.970198033666475e-05, - "loss": 0.4629, + "epoch": 4.58546571136131, + "grad_norm": 0.349609375, + "learning_rate": 8.125102360506255e-05, + "loss": 0.4744, "step": 4480 }, { - "epoch": 4.785485592315902, - "grad_norm": 0.34765625, - "learning_rate": 6.945692429363401e-05, - "loss": 0.4789, + "epoch": 4.5895598771750254, + "grad_norm": 0.361328125, + "learning_rate": 8.100368011794491e-05, + "loss": 0.5337, "step": 4484 }, { - "epoch": 4.789754535752401, - "grad_norm": 0.33984375, - "learning_rate": 6.921216993788137e-05, - "loss": 0.4654, + "epoch": 4.593654042988741, + "grad_norm": 0.337890625, + "learning_rate": 8.075657435218008e-05, + "loss": 0.4905, "step": 4488 }, { - "epoch": 4.794023479188901, - "grad_norm": 0.34375, - "learning_rate": 6.896771818617441e-05, - "loss": 0.4655, + "epoch": 4.597748208802456, + "grad_norm": 0.349609375, + "learning_rate": 8.050970715915138e-05, + "loss": 0.4834, "step": 4492 }, { - "epoch": 4.7982924226254005, - "grad_norm": 0.341796875, - "learning_rate": 6.87235699541471e-05, - "loss": 0.418, + "epoch": 4.601842374616172, + "grad_norm": 0.361328125, + "learning_rate": 8.02630793894201e-05, + "loss": 0.4572, "step": 4496 }, { - "epoch": 4.8025613660619, - "grad_norm": 0.369140625, - "learning_rate": 6.847972615629668e-05, - "loss": 0.4384, + "epoch": 4.605936540429887, + "grad_norm": 0.333984375, + "learning_rate": 8.001669189272272e-05, + "loss": 0.441, "step": 4500 }, { - "epoch": 4.806830309498399, - "grad_norm": 0.333984375, - "learning_rate": 6.823618770598009e-05, - "loss": 0.4601, + "epoch": 4.610030706243603, + "grad_norm": 0.34375, + "learning_rate": 7.977054551796792e-05, + "loss": 0.429, "step": 4504 }, { - "epoch": 4.811099252934898, - "grad_norm": 0.33984375, - "learning_rate": 6.799295551541035e-05, - "loss": 0.4756, + "epoch": 4.614124872057318, + "grad_norm": 0.337890625, + "learning_rate": 7.952464111323335e-05, + "loss": 0.458, "step": 4508 }, { - "epoch": 4.815368196371399, - "grad_norm": 0.34375, - "learning_rate": 6.775003049565355e-05, - "loss": 0.4785, + "epoch": 4.618219037871034, + "grad_norm": 0.3515625, + "learning_rate": 7.927897952576326e-05, + "loss": 0.4573, "step": 4512 }, { - "epoch": 4.819637139807898, - "grad_norm": 0.3359375, - "learning_rate": 6.750741355662502e-05, - "loss": 0.4597, + "epoch": 4.622313203684749, + "grad_norm": 0.3515625, + "learning_rate": 7.903356160196522e-05, + "loss": 0.4269, "step": 4516 }, { - "epoch": 4.823906083244397, - "grad_norm": 0.333984375, - "learning_rate": 6.726510560708639e-05, - "loss": 0.4657, + "epoch": 4.626407369498464, + "grad_norm": 0.3515625, + "learning_rate": 7.878838818740711e-05, + "loss": 0.4751, "step": 4520 }, { - "epoch": 4.8281750266808965, - "grad_norm": 0.337890625, - "learning_rate": 6.70231075546416e-05, - "loss": 0.4471, + "epoch": 4.63050153531218, + "grad_norm": 0.38671875, + "learning_rate": 7.85434601268146e-05, + "loss": 0.4122, "step": 4524 }, { - "epoch": 4.832443970117396, - "grad_norm": 0.341796875, - "learning_rate": 6.67814203057341e-05, - "loss": 0.4173, + "epoch": 4.634595701125896, + "grad_norm": 0.34375, + "learning_rate": 7.829877826406794e-05, + "loss": 0.4739, "step": 4528 }, { - "epoch": 4.836712913553895, - "grad_norm": 0.341796875, - "learning_rate": 6.654004476564297e-05, - "loss": 0.4853, + "epoch": 4.638689866939611, + "grad_norm": 0.361328125, + "learning_rate": 7.805434344219902e-05, + "loss": 0.4686, "step": 4532 }, { - "epoch": 4.840981856990394, - "grad_norm": 0.3671875, - "learning_rate": 6.629898183847983e-05, - "loss": 0.4331, + "epoch": 4.642784032753326, + "grad_norm": 0.34375, + "learning_rate": 7.781015650338865e-05, + "loss": 0.4247, "step": 4536 }, { - "epoch": 4.845250800426895, - "grad_norm": 0.341796875, - "learning_rate": 6.605823242718543e-05, - "loss": 0.447, + "epoch": 4.646878198567042, + "grad_norm": 0.421875, + "learning_rate": 7.756621828896363e-05, + "loss": 0.4744, "step": 4540 }, { - "epoch": 4.849519743863394, - "grad_norm": 0.3515625, - "learning_rate": 6.5817797433526e-05, - "loss": 0.4857, + "epoch": 4.650972364380758, + "grad_norm": 0.357421875, + "learning_rate": 7.732252963939369e-05, + "loss": 0.4481, "step": 4544 }, { - "epoch": 4.853788687299893, - "grad_norm": 0.349609375, - "learning_rate": 6.557767775809026e-05, - "loss": 0.4598, + "epoch": 4.655066530194473, + "grad_norm": 0.3359375, + "learning_rate": 7.707909139428889e-05, + "loss": 0.4948, "step": 4548 }, { - "epoch": 4.8580576307363925, - "grad_norm": 0.357421875, - "learning_rate": 6.53378743002857e-05, - "loss": 0.4475, + "epoch": 4.659160696008188, + "grad_norm": 0.33984375, + "learning_rate": 7.683590439239626e-05, + "loss": 0.5113, "step": 4552 }, { - "epoch": 4.862326574172892, + "epoch": 4.663254861821904, "grad_norm": 0.3515625, - "learning_rate": 6.509838795833553e-05, - "loss": 0.4592, + "learning_rate": 7.65929694715974e-05, + "loss": 0.4743, "step": 4556 }, { - "epoch": 4.866595517609392, - "grad_norm": 0.34375, - "learning_rate": 6.485921962927493e-05, - "loss": 0.4802, + "epoch": 4.667349027635619, + "grad_norm": 0.34765625, + "learning_rate": 7.63502874689054e-05, + "loss": 0.4563, "step": 4560 }, { - "epoch": 4.870864461045891, - "grad_norm": 0.36328125, - "learning_rate": 6.462037020894817e-05, - "loss": 0.4664, + "epoch": 4.671443193449335, + "grad_norm": 0.361328125, + "learning_rate": 7.610785922046173e-05, + "loss": 0.4818, "step": 4564 }, { - "epoch": 4.875133404482391, - "grad_norm": 0.34765625, - "learning_rate": 6.438184059200474e-05, - "loss": 0.4688, + "epoch": 4.67553735926305, + "grad_norm": 0.330078125, + "learning_rate": 7.586568556153378e-05, + "loss": 0.4595, "step": 4568 }, { - "epoch": 4.87940234791889, - "grad_norm": 0.36328125, - "learning_rate": 6.41436316718965e-05, - "loss": 0.5069, + "epoch": 4.679631525076766, + "grad_norm": 0.357421875, + "learning_rate": 7.562376732651177e-05, + "loss": 0.4212, "step": 4572 }, { - "epoch": 4.883671291355389, - "grad_norm": 0.361328125, - "learning_rate": 6.390574434087385e-05, - "loss": 0.433, + "epoch": 4.683725690890481, + "grad_norm": 0.345703125, + "learning_rate": 7.53821053489057e-05, + "loss": 0.4709, "step": 4576 }, { - "epoch": 4.887940234791889, - "grad_norm": 0.337890625, - "learning_rate": 6.366817948998284e-05, - "loss": 0.4681, + "epoch": 4.687819856704197, + "grad_norm": 0.37109375, + "learning_rate": 7.514070046134281e-05, + "loss": 0.5149, "step": 4580 }, { - "epoch": 4.892209178228389, - "grad_norm": 0.359375, - "learning_rate": 6.343093800906142e-05, - "loss": 0.4435, + "epoch": 4.691914022517912, + "grad_norm": 0.337890625, + "learning_rate": 7.489955349556457e-05, + "loss": 0.4564, "step": 4584 }, { - "epoch": 4.896478121664888, - "grad_norm": 0.376953125, - "learning_rate": 6.31940207867365e-05, - "loss": 0.5192, + "epoch": 4.696008188331628, + "grad_norm": 0.365234375, + "learning_rate": 7.465866528242361e-05, + "loss": 0.4646, "step": 4588 }, { - "epoch": 4.900747065101387, - "grad_norm": 0.357421875, - "learning_rate": 6.295742871042025e-05, - "loss": 0.4732, + "epoch": 4.700102354145343, + "grad_norm": 0.35546875, + "learning_rate": 7.441803665188124e-05, + "loss": 0.4564, "step": 4592 }, { - "epoch": 4.905016008537887, - "grad_norm": 0.375, - "learning_rate": 6.27211626663071e-05, - "loss": 0.4491, + "epoch": 4.7041965199590585, + "grad_norm": 0.361328125, + "learning_rate": 7.417766843300443e-05, + "loss": 0.4589, "step": 4596 }, { - "epoch": 4.909284951974386, - "grad_norm": 0.3515625, - "learning_rate": 6.24852235393701e-05, - "loss": 0.4309, + "epoch": 4.7082906857727735, + "grad_norm": 0.322265625, + "learning_rate": 7.393756145396267e-05, + "loss": 0.4152, "step": 4600 }, { - "epoch": 4.913553895410886, - "grad_norm": 0.365234375, - "learning_rate": 6.224961221335802e-05, - "loss": 0.5054, + "epoch": 4.7123848515864895, + "grad_norm": 0.35546875, + "learning_rate": 7.369771654202563e-05, + "loss": 0.4353, "step": 4604 }, { - "epoch": 4.9178228388473855, - "grad_norm": 0.345703125, - "learning_rate": 6.201432957079159e-05, - "loss": 0.4712, + "epoch": 4.7164790174002045, + "grad_norm": 0.3359375, + "learning_rate": 7.345813452355999e-05, + "loss": 0.4508, "step": 4608 }, { - "epoch": 4.922091782283885, - "grad_norm": 0.349609375, - "learning_rate": 6.177937649296042e-05, - "loss": 0.5025, + "epoch": 4.72057318321392, + "grad_norm": 0.37109375, + "learning_rate": 7.321881622402648e-05, + "loss": 0.4222, "step": 4612 }, { - "epoch": 4.926360725720384, - "grad_norm": 0.380859375, - "learning_rate": 6.154475385991978e-05, - "loss": 0.4562, + "epoch": 4.724667349027635, + "grad_norm": 0.353515625, + "learning_rate": 7.297976246797742e-05, + "loss": 0.4815, "step": 4616 }, { - "epoch": 4.930629669156883, - "grad_norm": 0.380859375, - "learning_rate": 6.131046255048722e-05, - "loss": 0.4456, + "epoch": 4.728761514841351, + "grad_norm": 0.373046875, + "learning_rate": 7.274097407905361e-05, + "loss": 0.5069, "step": 4620 }, { - "epoch": 4.934898612593383, - "grad_norm": 0.353515625, - "learning_rate": 6.107650344223908e-05, - "loss": 0.505, + "epoch": 4.732855680655066, + "grad_norm": 0.357421875, + "learning_rate": 7.250245187998141e-05, + "loss": 0.5212, "step": 4624 }, { - "epoch": 4.939167556029883, - "grad_norm": 0.341796875, - "learning_rate": 6.084287741150765e-05, - "loss": 0.4764, + "epoch": 4.736949846468782, + "grad_norm": 0.361328125, + "learning_rate": 7.226419669257027e-05, + "loss": 0.4838, "step": 4628 }, { - "epoch": 4.943436499466382, - "grad_norm": 0.333984375, - "learning_rate": 6.060958533337736e-05, - "loss": 0.4722, + "epoch": 4.741044012282497, + "grad_norm": 0.34375, + "learning_rate": 7.202620933770954e-05, + "loss": 0.5086, "step": 4632 }, { - "epoch": 4.9477054429028815, - "grad_norm": 0.376953125, - "learning_rate": 6.037662808168198e-05, - "loss": 0.4642, + "epoch": 4.745138178096213, + "grad_norm": 0.349609375, + "learning_rate": 7.178849063536572e-05, + "loss": 0.4991, "step": 4636 }, { - "epoch": 4.951974386339381, - "grad_norm": 0.37109375, - "learning_rate": 6.014400652900093e-05, - "loss": 0.4851, + "epoch": 4.749232343909928, + "grad_norm": 0.35546875, + "learning_rate": 7.155104140457982e-05, + "loss": 0.4491, "step": 4640 }, { - "epoch": 4.95624332977588, - "grad_norm": 0.384765625, - "learning_rate": 5.991172154665645e-05, - "loss": 0.4993, + "epoch": 4.753326509723644, + "grad_norm": 0.375, + "learning_rate": 7.131386246346439e-05, + "loss": 0.4606, "step": 4644 }, { - "epoch": 4.96051227321238, - "grad_norm": 0.33984375, - "learning_rate": 5.967977400470988e-05, - "loss": 0.4346, + "epoch": 4.757420675537359, + "grad_norm": 0.326171875, + "learning_rate": 7.107695462920057e-05, + "loss": 0.4968, "step": 4648 }, { - "epoch": 4.96478121664888, - "grad_norm": 0.34765625, - "learning_rate": 5.944816477195874e-05, - "loss": 0.4559, + "epoch": 4.761514841351075, + "grad_norm": 0.35546875, + "learning_rate": 7.084031871803559e-05, + "loss": 0.4662, "step": 4652 }, { - "epoch": 4.969050160085379, - "grad_norm": 0.37109375, - "learning_rate": 5.92168947159334e-05, - "loss": 0.4851, + "epoch": 4.76560900716479, + "grad_norm": 0.36328125, + "learning_rate": 7.060395554527977e-05, + "loss": 0.505, "step": 4656 }, { - "epoch": 4.973319103521878, - "grad_norm": 0.349609375, - "learning_rate": 5.898596470289363e-05, - "loss": 0.5008, + "epoch": 4.769703172978506, + "grad_norm": 0.34375, + "learning_rate": 7.03678659253036e-05, + "loss": 0.476, "step": 4660 }, { - "epoch": 4.9775880469583775, - "grad_norm": 0.361328125, - "learning_rate": 5.875537559782569e-05, - "loss": 0.4251, + "epoch": 4.773797338792221, + "grad_norm": 0.359375, + "learning_rate": 7.013205067153522e-05, + "loss": 0.4489, "step": 4664 }, { - "epoch": 4.981856990394878, - "grad_norm": 0.365234375, - "learning_rate": 5.852512826443875e-05, - "loss": 0.423, + "epoch": 4.777891504605937, + "grad_norm": 0.341796875, + "learning_rate": 6.989651059645743e-05, + "loss": 0.4559, "step": 4668 }, { - "epoch": 4.986125933831377, - "grad_norm": 0.365234375, - "learning_rate": 5.829522356516197e-05, - "loss": 0.5199, + "epoch": 4.781985670419652, + "grad_norm": 0.349609375, + "learning_rate": 6.966124651160479e-05, + "loss": 0.4769, "step": 4672 }, { - "epoch": 4.990394877267876, - "grad_norm": 0.361328125, - "learning_rate": 5.8065662361140965e-05, - "loss": 0.4578, + "epoch": 4.786079836233368, + "grad_norm": 0.3359375, + "learning_rate": 6.942625922756114e-05, + "loss": 0.4543, "step": 4676 }, { - "epoch": 4.994663820704376, - "grad_norm": 0.341796875, - "learning_rate": 5.783644551223491e-05, - "loss": 0.4284, + "epoch": 4.790174002047083, + "grad_norm": 0.3671875, + "learning_rate": 6.91915495539565e-05, + "loss": 0.4677, "step": 4680 }, { - "epoch": 4.998932764140875, - "grad_norm": 0.333984375, - "learning_rate": 5.760757387701295e-05, - "loss": 0.4219, + "epoch": 4.794268167860798, + "grad_norm": 0.34375, + "learning_rate": 6.89571182994645e-05, + "loss": 0.4448, "step": 4684 }, { - "epoch": 5.003201707577374, - "grad_norm": 0.32421875, - "learning_rate": 5.7379048312751295e-05, - "loss": 0.4038, + "epoch": 4.798362333674514, + "grad_norm": 0.373046875, + "learning_rate": 6.872296627179943e-05, + "loss": 0.4758, "step": 4688 }, { - "epoch": 5.0074706510138745, + "epoch": 4.80245649948823, "grad_norm": 0.3515625, - "learning_rate": 5.715086967542995e-05, - "loss": 0.4555, + "learning_rate": 6.848909427771361e-05, + "loss": 0.463, "step": 4692 }, { - "epoch": 5.011739594450374, - "grad_norm": 0.32421875, - "learning_rate": 5.692303881972924e-05, - "loss": 0.4328, + "epoch": 4.806550665301945, + "grad_norm": 0.35546875, + "learning_rate": 6.825550312299432e-05, + "loss": 0.4602, "step": 4696 }, { - "epoch": 5.016008537886873, - "grad_norm": 0.345703125, - "learning_rate": 5.6695556599027046e-05, - "loss": 0.4424, + "epoch": 4.81064483111566, + "grad_norm": 0.357421875, + "learning_rate": 6.802219361246149e-05, + "loss": 0.4589, "step": 4700 }, { - "epoch": 5.020277481323372, - "grad_norm": 0.330078125, - "learning_rate": 5.646842386539517e-05, - "loss": 0.4536, + "epoch": 4.814738996929376, + "grad_norm": 0.3359375, + "learning_rate": 6.778916654996455e-05, + "loss": 0.4715, "step": 4704 }, { - "epoch": 5.024546424759872, - "grad_norm": 0.3203125, - "learning_rate": 5.624164146959656e-05, - "loss": 0.4217, + "epoch": 4.818833162743092, + "grad_norm": 0.36328125, + "learning_rate": 6.755642273837969e-05, + "loss": 0.4401, "step": 4708 }, { - "epoch": 5.028815368196371, - "grad_norm": 0.326171875, - "learning_rate": 5.601521026108172e-05, - "loss": 0.4335, + "epoch": 4.822927328556807, + "grad_norm": 0.328125, + "learning_rate": 6.732396297960732e-05, + "loss": 0.4417, "step": 4712 }, { - "epoch": 5.033084311632871, - "grad_norm": 0.365234375, - "learning_rate": 5.57891310879859e-05, - "loss": 0.4477, + "epoch": 4.827021494370522, + "grad_norm": 0.333984375, + "learning_rate": 6.70917880745692e-05, + "loss": 0.4187, "step": 4716 }, { - "epoch": 5.0373532550693705, - "grad_norm": 0.345703125, - "learning_rate": 5.556340479712558e-05, - "loss": 0.4567, + "epoch": 4.8311156601842375, + "grad_norm": 0.36328125, + "learning_rate": 6.68598988232054e-05, + "loss": 0.4683, "step": 4720 }, { - "epoch": 5.04162219850587, - "grad_norm": 0.337890625, - "learning_rate": 5.5338032233995574e-05, - "loss": 0.4348, + "epoch": 4.835209825997953, + "grad_norm": 0.33203125, + "learning_rate": 6.662829602447207e-05, + "loss": 0.4654, "step": 4724 }, { - "epoch": 5.045891141942369, - "grad_norm": 0.349609375, - "learning_rate": 5.511301424276577e-05, - "loss": 0.4333, + "epoch": 4.8393039918116685, + "grad_norm": 0.341796875, + "learning_rate": 6.639698047633834e-05, + "loss": 0.4565, "step": 4728 }, { - "epoch": 5.050160085378868, - "grad_norm": 0.34375, - "learning_rate": 5.488835166627783e-05, - "loss": 0.368, + "epoch": 4.8433981576253835, + "grad_norm": 0.357421875, + "learning_rate": 6.616595297578346e-05, + "loss": 0.4464, "step": 4732 }, { - "epoch": 5.054429028815369, - "grad_norm": 0.326171875, - "learning_rate": 5.466404534604229e-05, - "loss": 0.3895, + "epoch": 4.847492323439099, + "grad_norm": 0.361328125, + "learning_rate": 6.59352143187945e-05, + "loss": 0.4892, "step": 4736 }, { - "epoch": 5.058697972251868, - "grad_norm": 0.357421875, - "learning_rate": 5.44400961222351e-05, - "loss": 0.4395, + "epoch": 4.851586489252814, + "grad_norm": 0.361328125, + "learning_rate": 6.57047653003632e-05, + "loss": 0.4535, "step": 4740 }, { - "epoch": 5.062966915688367, - "grad_norm": 0.365234375, - "learning_rate": 5.421650483369486e-05, - "loss": 0.4307, + "epoch": 4.85568065506653, + "grad_norm": 0.361328125, + "learning_rate": 6.547460671448333e-05, + "loss": 0.4681, "step": 4744 }, { - "epoch": 5.0672358591248665, - "grad_norm": 0.345703125, - "learning_rate": 5.3993272317919296e-05, - "loss": 0.4373, + "epoch": 4.859774820880245, + "grad_norm": 0.3515625, + "learning_rate": 6.524473935414807e-05, + "loss": 0.4673, "step": 4748 }, { - "epoch": 5.071504802561366, - "grad_norm": 0.34375, - "learning_rate": 5.377039941106229e-05, - "loss": 0.4661, + "epoch": 4.863868986693961, + "grad_norm": 0.37109375, + "learning_rate": 6.50151640113473e-05, + "loss": 0.4545, "step": 4752 }, { - "epoch": 5.075773745997865, - "grad_norm": 0.345703125, - "learning_rate": 5.354788694793093e-05, - "loss": 0.4519, + "epoch": 4.867963152507676, + "grad_norm": 0.353515625, + "learning_rate": 6.478588147706454e-05, + "loss": 0.4575, "step": 4756 }, { - "epoch": 5.080042689434365, - "grad_norm": 0.365234375, - "learning_rate": 5.332573576198194e-05, - "loss": 0.4807, + "epoch": 4.872057318321392, + "grad_norm": 0.353515625, + "learning_rate": 6.455689254127464e-05, + "loss": 0.4646, "step": 4760 }, { - "epoch": 5.084311632870865, - "grad_norm": 0.341796875, - "learning_rate": 5.3103946685319115e-05, - "loss": 0.4407, + "epoch": 4.876151484135107, + "grad_norm": 0.345703125, + "learning_rate": 6.43281979929409e-05, + "loss": 0.4741, "step": 4764 }, { - "epoch": 5.088580576307364, - "grad_norm": 0.353515625, - "learning_rate": 5.2882520548689615e-05, - "loss": 0.4046, + "epoch": 4.880245649948823, + "grad_norm": 0.37890625, + "learning_rate": 6.409979862001215e-05, + "loss": 0.4658, "step": 4768 }, { - "epoch": 5.092849519743863, - "grad_norm": 0.33203125, - "learning_rate": 5.266145818148142e-05, - "loss": 0.3884, + "epoch": 4.884339815762538, + "grad_norm": 0.34375, + "learning_rate": 6.387169520942037e-05, + "loss": 0.5014, "step": 4772 }, { - "epoch": 5.0971184631803625, - "grad_norm": 0.35546875, - "learning_rate": 5.244076041171971e-05, - "loss": 0.4297, + "epoch": 4.888433981576254, + "grad_norm": 0.33984375, + "learning_rate": 6.364388854707786e-05, + "loss": 0.4532, "step": 4776 }, { - "epoch": 5.101387406616863, - "grad_norm": 0.3515625, - "learning_rate": 5.22204280660642e-05, - "loss": 0.4182, + "epoch": 4.892528147389969, + "grad_norm": 0.4296875, + "learning_rate": 6.341637941787433e-05, + "loss": 0.4745, "step": 4780 }, { - "epoch": 5.105656350053362, - "grad_norm": 0.369140625, - "learning_rate": 5.200046196980571e-05, - "loss": 0.4596, + "epoch": 4.896622313203685, + "grad_norm": 0.33984375, + "learning_rate": 6.318916860567447e-05, + "loss": 0.4742, "step": 4784 }, { - "epoch": 5.109925293489861, - "grad_norm": 0.33203125, - "learning_rate": 5.178086294686332e-05, - "loss": 0.4252, + "epoch": 4.9007164790174, + "grad_norm": 0.3359375, + "learning_rate": 6.296225689331523e-05, + "loss": 0.4955, "step": 4788 }, { - "epoch": 5.114194236926361, + "epoch": 4.904810644831116, "grad_norm": 0.345703125, - "learning_rate": 5.156163181978108e-05, - "loss": 0.415, + "learning_rate": 6.273564506260277e-05, + "loss": 0.4486, "step": 4792 }, { - "epoch": 5.11846318036286, - "grad_norm": 0.36328125, - "learning_rate": 5.134276940972513e-05, - "loss": 0.4309, + "epoch": 4.908904810644831, + "grad_norm": 0.33203125, + "learning_rate": 6.250933389431029e-05, + "loss": 0.4609, "step": 4796 }, { - "epoch": 5.122732123799359, + "epoch": 4.912998976458547, "grad_norm": 0.361328125, - "learning_rate": 5.112427653648039e-05, - "loss": 0.4162, + "learning_rate": 6.228332416817504e-05, + "loss": 0.4645, "step": 4800 }, { - "epoch": 5.1270010672358595, - "grad_norm": 0.359375, - "learning_rate": 5.090615401844774e-05, - "loss": 0.443, + "epoch": 4.917093142272262, + "grad_norm": 0.376953125, + "learning_rate": 6.205761666289548e-05, + "loss": 0.4939, "step": 4804 }, { - "epoch": 5.131270010672359, - "grad_norm": 0.369140625, - "learning_rate": 5.068840267264081e-05, - "loss": 0.4277, + "epoch": 4.921187308085978, + "grad_norm": 0.333984375, + "learning_rate": 6.183221215612904e-05, + "loss": 0.4646, "step": 4808 }, { - "epoch": 5.135538954108858, - "grad_norm": 0.35546875, - "learning_rate": 5.047102331468286e-05, - "loss": 0.4332, + "epoch": 4.925281473899693, + "grad_norm": 0.369140625, + "learning_rate": 6.16071114244891e-05, + "loss": 0.4811, "step": 4812 }, { - "epoch": 5.139807897545357, - "grad_norm": 0.357421875, - "learning_rate": 5.025401675880393e-05, - "loss": 0.4439, + "epoch": 4.929375639713409, + "grad_norm": 0.365234375, + "learning_rate": 6.138231524354229e-05, + "loss": 0.4903, "step": 4816 }, { - "epoch": 5.144076840981857, - "grad_norm": 0.359375, - "learning_rate": 5.003738381783754e-05, - "loss": 0.4421, + "epoch": 4.933469805527124, + "grad_norm": 0.373046875, + "learning_rate": 6.115782438780612e-05, + "loss": 0.4538, "step": 4820 }, { - "epoch": 5.148345784418357, - "grad_norm": 0.341796875, - "learning_rate": 4.982112530321791e-05, - "loss": 0.4589, + "epoch": 4.93756397134084, + "grad_norm": 0.365234375, + "learning_rate": 6.093363963074602e-05, + "loss": 0.4825, "step": 4824 }, { - "epoch": 5.152614727854856, - "grad_norm": 0.333984375, - "learning_rate": 4.9605242024976656e-05, - "loss": 0.4492, + "epoch": 4.941658137154555, + "grad_norm": 0.32421875, + "learning_rate": 6.070976174477281e-05, + "loss": 0.4081, "step": 4828 }, { - "epoch": 5.1568836712913555, - "grad_norm": 0.3515625, - "learning_rate": 4.938973479174002e-05, - "loss": 0.4433, + "epoch": 4.94575230296827, + "grad_norm": 0.328125, + "learning_rate": 6.048619150124005e-05, + "loss": 0.4719, "step": 4832 }, { - "epoch": 5.161152614727855, - "grad_norm": 0.349609375, - "learning_rate": 4.917460441072552e-05, - "loss": 0.403, + "epoch": 4.949846468781986, + "grad_norm": 0.3671875, + "learning_rate": 6.026292967044121e-05, + "loss": 0.4626, "step": 4836 }, { - "epoch": 5.165421558164354, - "grad_norm": 0.33203125, - "learning_rate": 4.895985168773931e-05, - "loss": 0.4159, + "epoch": 4.9539406345957016, + "grad_norm": 0.36328125, + "learning_rate": 6.003997702160727e-05, + "loss": 0.4362, "step": 4840 }, { - "epoch": 5.169690501600853, - "grad_norm": 0.333984375, - "learning_rate": 4.8745477427172884e-05, - "loss": 0.4468, + "epoch": 4.958034800409417, + "grad_norm": 0.353515625, + "learning_rate": 5.981733432290399e-05, + "loss": 0.4964, "step": 4844 }, { - "epoch": 5.173959445037354, - "grad_norm": 0.388671875, - "learning_rate": 4.853148243200007e-05, - "loss": 0.4467, + "epoch": 4.962128966223132, + "grad_norm": 0.3515625, + "learning_rate": 5.959500234142904e-05, + "loss": 0.464, "step": 4848 }, { - "epoch": 5.178228388473853, - "grad_norm": 0.35546875, - "learning_rate": 4.831786750377425e-05, - "loss": 0.4175, + "epoch": 4.9662231320368475, + "grad_norm": 0.353515625, + "learning_rate": 5.937298184320967e-05, + "loss": 0.4479, "step": 4852 }, { - "epoch": 5.182497331910352, - "grad_norm": 0.3359375, - "learning_rate": 4.8104633442625026e-05, - "loss": 0.4225, + "epoch": 4.970317297850563, + "grad_norm": 0.3515625, + "learning_rate": 5.9151273593199924e-05, + "loss": 0.468, "step": 4856 }, { - "epoch": 5.1867662753468515, - "grad_norm": 0.326171875, - "learning_rate": 4.7891781047255564e-05, - "loss": 0.372, + "epoch": 4.974411463664278, + "grad_norm": 0.373046875, + "learning_rate": 5.892987835527809e-05, + "loss": 0.4376, "step": 4860 }, { - "epoch": 5.191035218783351, - "grad_norm": 0.359375, - "learning_rate": 4.767931111493928e-05, - "loss": 0.4298, + "epoch": 4.9785056294779935, + "grad_norm": 0.34765625, + "learning_rate": 5.870879689224377e-05, + "loss": 0.441, "step": 4864 }, { - "epoch": 5.19530416221985, - "grad_norm": 0.3515625, - "learning_rate": 4.746722444151718e-05, - "loss": 0.4175, + "epoch": 4.982599795291709, + "grad_norm": 0.333984375, + "learning_rate": 5.84880299658157e-05, + "loss": 0.4317, "step": 4868 }, { - "epoch": 5.19957310565635, - "grad_norm": 0.345703125, - "learning_rate": 4.725552182139455e-05, - "loss": 0.3751, + "epoch": 4.986693961105424, + "grad_norm": 0.357421875, + "learning_rate": 5.8267578336628875e-05, + "loss": 0.4447, "step": 4872 }, { - "epoch": 5.20384204909285, + "epoch": 4.99078812691914, "grad_norm": 0.357421875, - "learning_rate": 4.704420404753825e-05, - "loss": 0.4366, + "learning_rate": 5.804744276423181e-05, + "loss": 0.4438, "step": 4876 }, { - "epoch": 5.208110992529349, - "grad_norm": 0.390625, - "learning_rate": 4.6833271911473674e-05, - "loss": 0.3922, + "epoch": 4.994882292732855, + "grad_norm": 0.330078125, + "learning_rate": 5.782762400708424e-05, + "loss": 0.4339, "step": 4880 }, { - "epoch": 5.212379935965848, - "grad_norm": 0.33203125, - "learning_rate": 4.6622726203281546e-05, - "loss": 0.4245, + "epoch": 4.998976458546571, + "grad_norm": 0.3515625, + "learning_rate": 5.760812282255433e-05, + "loss": 0.4601, "step": 4884 }, { - "epoch": 5.216648879402348, - "grad_norm": 0.376953125, - "learning_rate": 4.641256771159541e-05, - "loss": 0.4335, + "epoch": 5.003070624360286, + "grad_norm": 0.33984375, + "learning_rate": 5.7388939966915894e-05, + "loss": 0.4073, "step": 4888 }, { - "epoch": 5.220917822838848, - "grad_norm": 0.353515625, - "learning_rate": 4.62027972235982e-05, - "loss": 0.3568, + "epoch": 5.007164790174002, + "grad_norm": 0.3203125, + "learning_rate": 5.7170076195346144e-05, + "loss": 0.4749, "step": 4892 }, { - "epoch": 5.225186766275347, - "grad_norm": 0.359375, - "learning_rate": 4.599341552501974e-05, - "loss": 0.4441, + "epoch": 5.011258955987717, + "grad_norm": 0.33203125, + "learning_rate": 5.695153226192293e-05, + "loss": 0.4516, "step": 4896 }, { - "epoch": 5.229455709711846, - "grad_norm": 0.32421875, - "learning_rate": 4.578442340013335e-05, - "loss": 0.4205, + "epoch": 5.015353121801433, + "grad_norm": 0.328125, + "learning_rate": 5.6733308919621946e-05, + "loss": 0.4908, "step": 4900 }, { - "epoch": 5.233724653148346, - "grad_norm": 0.353515625, - "learning_rate": 4.557582163175336e-05, - "loss": 0.455, + "epoch": 5.019447287615148, + "grad_norm": 0.33203125, + "learning_rate": 5.651540692031448e-05, + "loss": 0.4131, "step": 4904 }, { - "epoch": 5.237993596584845, - "grad_norm": 0.357421875, - "learning_rate": 4.5367611001231755e-05, - "loss": 0.4524, + "epoch": 5.023541453428864, + "grad_norm": 0.33984375, + "learning_rate": 5.629782701476464e-05, + "loss": 0.4205, "step": 4908 }, { - "epoch": 5.242262540021345, - "grad_norm": 0.341796875, - "learning_rate": 4.515979228845562e-05, - "loss": 0.4531, + "epoch": 5.027635619242579, + "grad_norm": 0.34765625, + "learning_rate": 5.608056995262668e-05, + "loss": 0.4144, "step": 4912 }, { - "epoch": 5.2465314834578445, - "grad_norm": 0.34765625, - "learning_rate": 4.495236627184391e-05, - "loss": 0.4045, + "epoch": 5.031729785056295, + "grad_norm": 0.3515625, + "learning_rate": 5.586363648244261e-05, + "loss": 0.4081, "step": 4916 }, { - "epoch": 5.250800426894344, - "grad_norm": 0.380859375, - "learning_rate": 4.4745333728344804e-05, - "loss": 0.4356, + "epoch": 5.03582395087001, + "grad_norm": 0.34765625, + "learning_rate": 5.564702735163956e-05, + "loss": 0.4058, "step": 4920 }, { - "epoch": 5.255069370330843, + "epoch": 5.039918116683726, "grad_norm": 0.35546875, - "learning_rate": 4.453869543343257e-05, - "loss": 0.4126, + "learning_rate": 5.543074330652706e-05, + "loss": 0.4239, "step": 4924 }, { - "epoch": 5.259338313767342, - "grad_norm": 0.33984375, - "learning_rate": 4.433245216110468e-05, - "loss": 0.4596, + "epoch": 5.044012282497441, + "grad_norm": 0.337890625, + "learning_rate": 5.521478509229468e-05, + "loss": 0.4397, "step": 4928 }, { - "epoch": 5.263607257203842, - "grad_norm": 0.32421875, - "learning_rate": 4.4126604683879226e-05, - "loss": 0.4338, + "epoch": 5.048106448311157, + "grad_norm": 0.349609375, + "learning_rate": 5.499915345300936e-05, + "loss": 0.4284, "step": 4932 }, { - "epoch": 5.267876200640342, - "grad_norm": 0.33984375, - "learning_rate": 4.3921153772791526e-05, - "loss": 0.389, + "epoch": 5.052200614124872, + "grad_norm": 0.35546875, + "learning_rate": 5.478384913161277e-05, + "loss": 0.4047, "step": 4936 }, { - "epoch": 5.272145144076841, - "grad_norm": 0.353515625, - "learning_rate": 4.3716100197391704e-05, - "loss": 0.423, + "epoch": 5.056294779938588, + "grad_norm": 0.33203125, + "learning_rate": 5.456887286991891e-05, + "loss": 0.4198, "step": 4940 }, { - "epoch": 5.2764140875133405, - "grad_norm": 0.37109375, - "learning_rate": 4.3511444725741406e-05, - "loss": 0.4227, + "epoch": 5.060388945752303, + "grad_norm": 0.3203125, + "learning_rate": 5.435422540861151e-05, + "loss": 0.3665, "step": 4944 }, { - "epoch": 5.28068303094984, - "grad_norm": 0.36328125, - "learning_rate": 4.330718812441125e-05, - "loss": 0.4305, + "epoch": 5.064483111566019, + "grad_norm": 0.375, + "learning_rate": 5.413990748724129e-05, + "loss": 0.4719, "step": 4948 }, { - "epoch": 5.284951974386339, - "grad_norm": 0.353515625, - "learning_rate": 4.3103331158477855e-05, - "loss": 0.4355, + "epoch": 5.068577277379734, + "grad_norm": 0.341796875, + "learning_rate": 5.392591984422371e-05, + "loss": 0.4073, "step": 4952 }, { - "epoch": 5.289220917822838, - "grad_norm": 0.34375, - "learning_rate": 4.289987459152073e-05, - "loss": 0.4461, + "epoch": 5.07267144319345, + "grad_norm": 0.34765625, + "learning_rate": 5.37122632168363e-05, + "loss": 0.3911, "step": 4956 }, { - "epoch": 5.293489861259339, - "grad_norm": 0.361328125, - "learning_rate": 4.2696819185619904e-05, - "loss": 0.4352, + "epoch": 5.076765609007165, + "grad_norm": 0.3828125, + "learning_rate": 5.349893834121593e-05, + "loss": 0.4298, "step": 4960 }, { - "epoch": 5.297758804695838, - "grad_norm": 0.349609375, - "learning_rate": 4.249416570135251e-05, - "loss": 0.4253, + "epoch": 5.080859774820881, + "grad_norm": 0.3359375, + "learning_rate": 5.32859459523566e-05, + "loss": 0.4137, "step": 4964 }, { - "epoch": 5.302027748132337, - "grad_norm": 0.337890625, - "learning_rate": 4.229191489779047e-05, - "loss": 0.4096, + "epoch": 5.084953940634596, + "grad_norm": 0.375, + "learning_rate": 5.3073286784106714e-05, + "loss": 0.439, "step": 4968 }, { - "epoch": 5.3062966915688365, - "grad_norm": 0.365234375, - "learning_rate": 4.20900675324972e-05, - "loss": 0.4336, + "epoch": 5.0890481064483115, + "grad_norm": 0.32421875, + "learning_rate": 5.2860961569166595e-05, + "loss": 0.3966, "step": 4972 }, { - "epoch": 5.310565635005336, + "epoch": 5.0931422722620265, "grad_norm": 0.33984375, - "learning_rate": 4.188862436152513e-05, - "loss": 0.4163, + "learning_rate": 5.264897103908599e-05, + "loss": 0.4607, "step": 4976 }, { - "epoch": 5.314834578441836, - "grad_norm": 0.359375, - "learning_rate": 4.168758613941257e-05, - "loss": 0.4522, + "epoch": 5.0972364380757424, + "grad_norm": 0.3515625, + "learning_rate": 5.243731592426135e-05, + "loss": 0.4216, "step": 4980 }, { - "epoch": 5.319103521878335, - "grad_norm": 0.375, - "learning_rate": 4.148695361918117e-05, - "loss": 0.4418, + "epoch": 5.1013306038894575, + "grad_norm": 0.328125, + "learning_rate": 5.222599695393368e-05, + "loss": 0.4576, "step": 4984 }, { - "epoch": 5.323372465314835, - "grad_norm": 0.3671875, - "learning_rate": 4.1286727552332796e-05, - "loss": 0.4095, + "epoch": 5.105424769703173, + "grad_norm": 0.357421875, + "learning_rate": 5.2015014856185796e-05, + "loss": 0.4706, "step": 4988 }, { - "epoch": 5.327641408751334, - "grad_norm": 0.369140625, - "learning_rate": 4.108690868884704e-05, - "loss": 0.4613, + "epoch": 5.109518935516888, + "grad_norm": 0.3359375, + "learning_rate": 5.1804370357939663e-05, + "loss": 0.4304, "step": 4992 }, { - "epoch": 5.331910352187833, - "grad_norm": 0.333984375, - "learning_rate": 4.088749777717818e-05, - "loss": 0.45, + "epoch": 5.113613101330604, + "grad_norm": 0.359375, + "learning_rate": 5.15940641849543e-05, + "loss": 0.4476, "step": 4996 }, { - "epoch": 5.336179295624333, - "grad_norm": 0.36328125, - "learning_rate": 4.068849556425238e-05, - "loss": 0.4189, + "epoch": 5.117707267144319, + "grad_norm": 0.349609375, + "learning_rate": 5.138409706182299e-05, + "loss": 0.4314, "step": 5000 }, { - "epoch": 5.340448239060833, - "grad_norm": 0.33203125, - "learning_rate": 4.048990279546508e-05, - "loss": 0.4027, + "epoch": 5.121801432958035, + "grad_norm": 0.353515625, + "learning_rate": 5.1174469711970716e-05, + "loss": 0.4327, "step": 5004 }, { - "epoch": 5.344717182497332, - "grad_norm": 0.34375, - "learning_rate": 4.029172021467794e-05, - "loss": 0.4234, + "epoch": 5.12589559877175, + "grad_norm": 0.349609375, + "learning_rate": 5.0965182857651964e-05, + "loss": 0.4191, "step": 5008 }, { - "epoch": 5.348986125933831, - "grad_norm": 0.33984375, - "learning_rate": 4.009394856421634e-05, - "loss": 0.438, + "epoch": 5.129989764585465, + "grad_norm": 0.3671875, + "learning_rate": 5.075623721994806e-05, + "loss": 0.4486, "step": 5012 }, { - "epoch": 5.353255069370331, - "grad_norm": 0.34375, - "learning_rate": 3.989658858486628e-05, - "loss": 0.417, + "epoch": 5.134083930399181, + "grad_norm": 0.34765625, + "learning_rate": 5.05476335187645e-05, + "loss": 0.4605, "step": 5016 }, { - "epoch": 5.35752401280683, - "grad_norm": 0.369140625, - "learning_rate": 3.9699641015872016e-05, - "loss": 0.4407, + "epoch": 5.138178096212896, + "grad_norm": 0.353515625, + "learning_rate": 5.033937247282891e-05, + "loss": 0.4839, "step": 5020 }, { - "epoch": 5.36179295624333, - "grad_norm": 0.341796875, - "learning_rate": 3.9503106594932775e-05, - "loss": 0.4158, + "epoch": 5.142272262026612, + "grad_norm": 0.34765625, + "learning_rate": 5.013145479968824e-05, + "loss": 0.4389, "step": 5024 }, { - "epoch": 5.3660618996798295, - "grad_norm": 0.38671875, - "learning_rate": 3.93069860582005e-05, - "loss": 0.4035, + "epoch": 5.146366427840327, + "grad_norm": 0.349609375, + "learning_rate": 4.992388121570625e-05, + "loss": 0.4528, "step": 5028 }, { - "epoch": 5.370330843116329, - "grad_norm": 0.34375, - "learning_rate": 3.9111280140276765e-05, - "loss": 0.4175, + "epoch": 5.150460593654043, + "grad_norm": 0.349609375, + "learning_rate": 4.9716652436061364e-05, + "loss": 0.4562, "step": 5032 }, { - "epoch": 5.374599786552828, - "grad_norm": 0.357421875, - "learning_rate": 3.89159895742101e-05, - "loss": 0.466, + "epoch": 5.154554759467758, + "grad_norm": 0.353515625, + "learning_rate": 4.950976917474393e-05, + "loss": 0.4521, "step": 5036 }, { - "epoch": 5.378868729989327, - "grad_norm": 0.345703125, - "learning_rate": 3.872111509149334e-05, - "loss": 0.422, + "epoch": 5.158648925281474, + "grad_norm": 0.337890625, + "learning_rate": 4.930323214455374e-05, + "loss": 0.3968, "step": 5040 }, { - "epoch": 5.383137673425827, - "grad_norm": 0.380859375, - "learning_rate": 3.8526657422060696e-05, - "loss": 0.4236, + "epoch": 5.162743091095189, + "grad_norm": 0.3515625, + "learning_rate": 4.909704205709785e-05, + "loss": 0.4224, "step": 5044 }, { - "epoch": 5.387406616862327, - "grad_norm": 0.341796875, - "learning_rate": 3.833261729428531e-05, - "loss": 0.4562, + "epoch": 5.166837256908905, + "grad_norm": 0.34375, + "learning_rate": 4.889119962278786e-05, + "loss": 0.4055, "step": 5048 }, { - "epoch": 5.391675560298826, - "grad_norm": 0.345703125, - "learning_rate": 3.8138995434976147e-05, - "loss": 0.4252, + "epoch": 5.17093142272262, + "grad_norm": 0.3515625, + "learning_rate": 4.868570555083752e-05, + "loss": 0.4874, "step": 5052 }, { - "epoch": 5.3959445037353255, - "grad_norm": 0.330078125, - "learning_rate": 3.794579256937568e-05, - "loss": 0.421, + "epoch": 5.175025588536336, + "grad_norm": 0.345703125, + "learning_rate": 4.8480560549260394e-05, + "loss": 0.4371, "step": 5056 }, { - "epoch": 5.400213447171825, - "grad_norm": 0.3359375, - "learning_rate": 3.7753009421156776e-05, - "loss": 0.4225, + "epoch": 5.179119754350051, + "grad_norm": 0.34765625, + "learning_rate": 4.827576532486737e-05, + "loss": 0.4315, "step": 5060 }, { - "epoch": 5.404482390608324, - "grad_norm": 0.3671875, - "learning_rate": 3.756064671242039e-05, - "loss": 0.4786, + "epoch": 5.183213920163767, + "grad_norm": 0.3515625, + "learning_rate": 4.807132058326409e-05, + "loss": 0.4226, "step": 5064 }, { - "epoch": 5.408751334044824, - "grad_norm": 0.365234375, - "learning_rate": 3.736870516369245e-05, - "loss": 0.4224, + "epoch": 5.187308085977482, + "grad_norm": 0.369140625, + "learning_rate": 4.786722702884874e-05, + "loss": 0.4251, "step": 5068 }, { - "epoch": 5.413020277481324, - "grad_norm": 0.35546875, - "learning_rate": 3.7177185493921553e-05, - "loss": 0.4515, + "epoch": 5.191402251791198, + "grad_norm": 0.34765625, + "learning_rate": 4.766348536480954e-05, + "loss": 0.4411, "step": 5072 }, { - "epoch": 5.417289220917823, - "grad_norm": 0.33203125, - "learning_rate": 3.6986088420475885e-05, - "loss": 0.405, + "epoch": 5.195496417604913, + "grad_norm": 0.33984375, + "learning_rate": 4.7460096293122174e-05, + "loss": 0.3946, "step": 5076 }, { - "epoch": 5.421558164354322, - "grad_norm": 0.341796875, - "learning_rate": 3.6795414659140895e-05, - "loss": 0.4452, + "epoch": 5.199590583418629, + "grad_norm": 0.337890625, + "learning_rate": 4.725706051454759e-05, + "loss": 0.4129, "step": 5080 }, { - "epoch": 5.425827107790822, - "grad_norm": 0.341796875, - "learning_rate": 3.660516492411631e-05, - "loss": 0.4512, + "epoch": 5.203684749232344, + "grad_norm": 0.34375, + "learning_rate": 4.705437872862955e-05, + "loss": 0.4273, "step": 5084 }, { - "epoch": 5.430096051227321, - "grad_norm": 0.3359375, - "learning_rate": 3.641533992801375e-05, - "loss": 0.4299, + "epoch": 5.20777891504606, + "grad_norm": 0.34765625, + "learning_rate": 4.685205163369197e-05, + "loss": 0.4656, "step": 5088 }, { - "epoch": 5.434364994663821, - "grad_norm": 0.349609375, - "learning_rate": 3.6225940381853726e-05, - "loss": 0.4892, + "epoch": 5.211873080859775, + "grad_norm": 0.345703125, + "learning_rate": 4.665007992683687e-05, + "loss": 0.4251, "step": 5092 }, { - "epoch": 5.43863393810032, - "grad_norm": 0.36328125, - "learning_rate": 3.6036966995063354e-05, - "loss": 0.4279, + "epoch": 5.2159672466734905, + "grad_norm": 0.33984375, + "learning_rate": 4.6448464303941824e-05, + "loss": 0.4032, "step": 5096 }, { - "epoch": 5.44290288153682, - "grad_norm": 0.33984375, - "learning_rate": 3.584842047547327e-05, - "loss": 0.4391, + "epoch": 5.220061412487206, + "grad_norm": 0.357421875, + "learning_rate": 4.6247205459657364e-05, + "loss": 0.4569, "step": 5100 }, { - "epoch": 5.447171824973319, - "grad_norm": 0.32421875, - "learning_rate": 3.566030152931547e-05, - "loss": 0.3981, + "epoch": 5.2241555783009215, + "grad_norm": 0.33984375, + "learning_rate": 4.604630408740498e-05, + "loss": 0.4433, "step": 5104 }, { - "epoch": 5.451440768409818, - "grad_norm": 0.37890625, - "learning_rate": 3.547261086122016e-05, - "loss": 0.4155, + "epoch": 5.2282497441146365, + "grad_norm": 0.353515625, + "learning_rate": 4.584576087937445e-05, + "loss": 0.4305, "step": 5108 }, { - "epoch": 5.455709711846318, - "grad_norm": 0.33984375, - "learning_rate": 3.528534917421355e-05, - "loss": 0.406, + "epoch": 5.232343909928352, + "grad_norm": 0.3515625, + "learning_rate": 4.5645576526521355e-05, + "loss": 0.3824, "step": 5112 }, { - "epoch": 5.459978655282818, + "epoch": 5.236438075742067, "grad_norm": 0.341796875, - "learning_rate": 3.509851716971487e-05, - "loss": 0.3866, + "learning_rate": 4.5445751718565165e-05, + "loss": 0.4235, "step": 5116 }, { - "epoch": 5.464247598719317, - "grad_norm": 0.34765625, - "learning_rate": 3.491211554753407e-05, - "loss": 0.4313, + "epoch": 5.240532241555783, + "grad_norm": 0.3671875, + "learning_rate": 4.524628714398645e-05, + "loss": 0.4215, "step": 5120 }, { - "epoch": 5.468516542155816, - "grad_norm": 0.359375, - "learning_rate": 3.472614500586887e-05, - "loss": 0.4399, + "epoch": 5.244626407369498, + "grad_norm": 0.369140625, + "learning_rate": 4.504718349002447e-05, + "loss": 0.4246, "step": 5124 }, { - "epoch": 5.472785485592316, - "grad_norm": 0.365234375, - "learning_rate": 3.4540606241302464e-05, - "loss": 0.4731, + "epoch": 5.248720573183214, + "grad_norm": 0.35546875, + "learning_rate": 4.4848441442675154e-05, + "loss": 0.4044, "step": 5128 }, { - "epoch": 5.477054429028815, - "grad_norm": 0.33984375, - "learning_rate": 3.43554999488006e-05, - "loss": 0.4415, + "epoch": 5.252814738996929, + "grad_norm": 0.34765625, + "learning_rate": 4.4650061686688514e-05, + "loss": 0.4406, "step": 5132 }, { - "epoch": 5.481323372465315, - "grad_norm": 0.3828125, - "learning_rate": 3.4170826821709264e-05, - "loss": 0.4452, + "epoch": 5.256908904810645, + "grad_norm": 0.34765625, + "learning_rate": 4.445204490556618e-05, + "loss": 0.4196, "step": 5136 }, { - "epoch": 5.4855923159018145, - "grad_norm": 0.3671875, - "learning_rate": 3.398658755175183e-05, - "loss": 0.3971, + "epoch": 5.26100307062436, + "grad_norm": 0.345703125, + "learning_rate": 4.4254391781559336e-05, + "loss": 0.4163, "step": 5140 }, { - "epoch": 5.489861259338314, - "grad_norm": 0.359375, - "learning_rate": 3.3802782829026685e-05, - "loss": 0.4456, + "epoch": 5.265097236438076, + "grad_norm": 0.341796875, + "learning_rate": 4.405710299566622e-05, + "loss": 0.454, "step": 5144 }, { - "epoch": 5.494130202774813, - "grad_norm": 0.349609375, - "learning_rate": 3.3619413342004556e-05, - "loss": 0.4326, + "epoch": 5.269191402251791, + "grad_norm": 0.359375, + "learning_rate": 4.386017922762958e-05, + "loss": 0.441, "step": 5148 }, { - "epoch": 5.498399146211312, - "grad_norm": 0.34765625, - "learning_rate": 3.3436479777525796e-05, - "loss": 0.4265, + "epoch": 5.273285568065507, + "grad_norm": 0.3515625, + "learning_rate": 4.3663621155934724e-05, + "loss": 0.4363, "step": 5152 }, { - "epoch": 5.502668089647813, - "grad_norm": 0.3515625, - "learning_rate": 3.3253982820798115e-05, - "loss": 0.4393, + "epoch": 5.277379733879222, + "grad_norm": 0.361328125, + "learning_rate": 4.3467429457806965e-05, + "loss": 0.4327, "step": 5156 }, { - "epoch": 5.506937033084312, - "grad_norm": 0.365234375, - "learning_rate": 3.3071923155393656e-05, - "loss": 0.4317, + "epoch": 5.281473899692937, + "grad_norm": 0.349609375, + "learning_rate": 4.327160480920915e-05, + "loss": 0.4212, "step": 5160 }, { - "epoch": 5.511205976520811, - "grad_norm": 0.32421875, - "learning_rate": 3.289030146324681e-05, - "loss": 0.3967, + "epoch": 5.285568065506653, + "grad_norm": 0.36328125, + "learning_rate": 4.307614788483963e-05, + "loss": 0.4076, "step": 5164 }, { - "epoch": 5.5154749199573105, - "grad_norm": 0.349609375, - "learning_rate": 3.270911842465124e-05, - "loss": 0.4288, + "epoch": 5.289662231320369, + "grad_norm": 0.353515625, + "learning_rate": 4.2881059358129806e-05, + "loss": 0.4527, "step": 5168 }, { - "epoch": 5.51974386339381, - "grad_norm": 0.35546875, - "learning_rate": 3.252837471825779e-05, - "loss": 0.4006, + "epoch": 5.293756397134084, + "grad_norm": 0.330078125, + "learning_rate": 4.268633990124163e-05, + "loss": 0.4017, "step": 5172 }, { - "epoch": 5.524012806830309, - "grad_norm": 0.37109375, - "learning_rate": 3.234807102107153e-05, - "loss": 0.4299, + "epoch": 5.297850562947799, + "grad_norm": 0.328125, + "learning_rate": 4.2491990185065625e-05, + "loss": 0.392, "step": 5176 }, { - "epoch": 5.528281750266809, - "grad_norm": 0.333984375, - "learning_rate": 3.216820800844952e-05, - "loss": 0.454, + "epoch": 5.301944728761515, + "grad_norm": 0.361328125, + "learning_rate": 4.22980108792184e-05, + "loss": 0.4646, "step": 5180 }, { - "epoch": 5.532550693703309, - "grad_norm": 0.353515625, - "learning_rate": 3.198878635409814e-05, - "loss": 0.4166, + "epoch": 5.30603889457523, + "grad_norm": 0.333984375, + "learning_rate": 4.2104402652040144e-05, + "loss": 0.4128, "step": 5184 }, { - "epoch": 5.536819637139808, - "grad_norm": 0.34375, - "learning_rate": 3.180980673007055e-05, - "loss": 0.3965, + "epoch": 5.310133060388946, + "grad_norm": 0.3671875, + "learning_rate": 4.191116617059272e-05, + "loss": 0.4126, "step": 5188 }, { - "epoch": 5.541088580576307, - "grad_norm": 0.328125, - "learning_rate": 3.163126980676425e-05, - "loss": 0.4384, + "epoch": 5.314227226202661, + "grad_norm": 0.3359375, + "learning_rate": 4.1718302100657176e-05, + "loss": 0.4295, "step": 5192 }, { - "epoch": 5.545357524012807, - "grad_norm": 0.3359375, - "learning_rate": 3.1453176252918495e-05, - "loss": 0.4153, + "epoch": 5.318321392016377, + "grad_norm": 0.33984375, + "learning_rate": 4.15258111067313e-05, + "loss": 0.4273, "step": 5196 }, { - "epoch": 5.549626467449306, - "grad_norm": 0.37109375, - "learning_rate": 3.1275526735611896e-05, - "loss": 0.4423, + "epoch": 5.322415557830092, + "grad_norm": 0.34375, + "learning_rate": 4.133369385202756e-05, + "loss": 0.42, "step": 5200 }, { - "epoch": 5.553895410885806, - "grad_norm": 0.37109375, - "learning_rate": 3.1098321920259736e-05, - "loss": 0.4317, + "epoch": 5.326509723643808, + "grad_norm": 0.3515625, + "learning_rate": 4.114195099847083e-05, + "loss": 0.4329, "step": 5204 }, { - "epoch": 5.558164354322305, - "grad_norm": 0.345703125, - "learning_rate": 3.0921562470611765e-05, - "loss": 0.4189, + "epoch": 5.330603889457523, + "grad_norm": 0.341796875, + "learning_rate": 4.0950583206695786e-05, + "loss": 0.4637, "step": 5208 }, { - "epoch": 5.562433297758805, - "grad_norm": 0.365234375, - "learning_rate": 3.074524904874935e-05, - "loss": 0.4512, + "epoch": 5.334698055271239, + "grad_norm": 0.361328125, + "learning_rate": 4.075959113604506e-05, + "loss": 0.4401, "step": 5212 }, { - "epoch": 5.566702241195304, - "grad_norm": 0.369140625, - "learning_rate": 3.056938231508333e-05, - "loss": 0.433, + "epoch": 5.338792221084954, + "grad_norm": 0.353515625, + "learning_rate": 4.056897544456673e-05, + "loss": 0.4225, "step": 5216 }, { - "epoch": 5.570971184631803, - "grad_norm": 0.349609375, - "learning_rate": 3.0393962928351373e-05, - "loss": 0.3729, + "epoch": 5.34288638689867, + "grad_norm": 0.361328125, + "learning_rate": 4.037873678901198e-05, + "loss": 0.4322, "step": 5220 }, { - "epoch": 5.5752401280683035, - "grad_norm": 0.359375, - "learning_rate": 3.021899154561545e-05, - "loss": 0.4289, + "epoch": 5.346980552712385, + "grad_norm": 0.34765625, + "learning_rate": 4.018887582483306e-05, + "loss": 0.4224, "step": 5224 }, { - "epoch": 5.579509071504803, - "grad_norm": 0.361328125, - "learning_rate": 3.0044468822259594e-05, - "loss": 0.4024, + "epoch": 5.3510747185261005, + "grad_norm": 0.375, + "learning_rate": 3.9999393206180914e-05, + "loss": 0.4694, "step": 5228 }, { - "epoch": 5.583778014941302, - "grad_norm": 0.3515625, - "learning_rate": 2.987039541198719e-05, - "loss": 0.4107, + "epoch": 5.3551688843398155, + "grad_norm": 0.34765625, + "learning_rate": 3.98102895859028e-05, + "loss": 0.4165, "step": 5232 }, { - "epoch": 5.588046958377801, - "grad_norm": 0.357421875, - "learning_rate": 2.9696771966818735e-05, - "loss": 0.3989, + "epoch": 5.359263050153531, + "grad_norm": 0.34375, + "learning_rate": 3.9621565615540325e-05, + "loss": 0.4271, "step": 5236 }, { - "epoch": 5.592315901814301, - "grad_norm": 0.35546875, - "learning_rate": 2.952359913708927e-05, - "loss": 0.3772, + "epoch": 5.3633572159672465, + "grad_norm": 0.365234375, + "learning_rate": 3.9433221945326985e-05, + "loss": 0.4569, "step": 5240 }, { - "epoch": 5.596584845250801, - "grad_norm": 0.31640625, - "learning_rate": 2.9350877571445924e-05, - "loss": 0.4425, + "epoch": 5.367451381780962, + "grad_norm": 0.361328125, + "learning_rate": 3.924525922418591e-05, + "loss": 0.4756, "step": 5244 }, { - "epoch": 5.6008537886873, - "grad_norm": 0.3515625, - "learning_rate": 2.91786079168457e-05, - "loss": 0.4352, + "epoch": 5.371545547594677, + "grad_norm": 0.359375, + "learning_rate": 3.905767809972779e-05, + "loss": 0.4734, "step": 5248 }, { - "epoch": 5.6051227321237995, - "grad_norm": 0.359375, - "learning_rate": 2.900679081855268e-05, - "loss": 0.4541, + "epoch": 5.375639713408393, + "grad_norm": 0.35546875, + "learning_rate": 3.887047921824858e-05, + "loss": 0.3928, "step": 5252 }, { - "epoch": 5.609391675560299, - "grad_norm": 0.36328125, - "learning_rate": 2.883542692013606e-05, - "loss": 0.4505, + "epoch": 5.379733879222108, + "grad_norm": 0.353515625, + "learning_rate": 3.868366322472704e-05, + "loss": 0.4826, "step": 5256 }, { - "epoch": 5.613660618996798, - "grad_norm": 0.373046875, - "learning_rate": 2.866451686346729e-05, - "loss": 0.4272, + "epoch": 5.383828045035824, + "grad_norm": 0.37890625, + "learning_rate": 3.849723076282308e-05, + "loss": 0.4614, "step": 5260 }, { - "epoch": 5.617929562433297, - "grad_norm": 0.388671875, - "learning_rate": 2.8494061288718033e-05, - "loss": 0.4172, + "epoch": 5.387922210849539, + "grad_norm": 0.359375, + "learning_rate": 3.831118247487481e-05, + "loss": 0.4696, "step": 5264 }, { - "epoch": 5.622198505869797, - "grad_norm": 0.349609375, - "learning_rate": 2.8324060834357505e-05, - "loss": 0.4151, + "epoch": 5.392016376663255, + "grad_norm": 0.3515625, + "learning_rate": 3.812551900189694e-05, + "loss": 0.4323, "step": 5268 }, { - "epoch": 5.626467449306297, - "grad_norm": 0.359375, - "learning_rate": 2.815451613715029e-05, - "loss": 0.3926, + "epoch": 5.39611054247697, + "grad_norm": 0.3359375, + "learning_rate": 3.794024098357826e-05, + "loss": 0.3972, "step": 5272 }, { - "epoch": 5.630736392742796, - "grad_norm": 0.359375, - "learning_rate": 2.798542783215375e-05, - "loss": 0.463, + "epoch": 5.400204708290686, + "grad_norm": 0.345703125, + "learning_rate": 3.775534905827943e-05, + "loss": 0.4397, "step": 5276 }, { - "epoch": 5.6350053361792956, - "grad_norm": 0.3671875, - "learning_rate": 2.7816796552715893e-05, - "loss": 0.444, + "epoch": 5.404298874104401, + "grad_norm": 0.359375, + "learning_rate": 3.7570843863030995e-05, + "loss": 0.426, "step": 5280 }, { - "epoch": 5.639274279615795, - "grad_norm": 0.33984375, - "learning_rate": 2.7648622930472725e-05, - "loss": 0.3938, + "epoch": 5.408393039918117, + "grad_norm": 0.3359375, + "learning_rate": 3.7386726033530995e-05, + "loss": 0.4399, "step": 5284 }, { - "epoch": 5.643543223052294, - "grad_norm": 0.349609375, - "learning_rate": 2.7480907595346098e-05, - "loss": 0.4214, + "epoch": 5.412487205731832, + "grad_norm": 0.35546875, + "learning_rate": 3.720299620414274e-05, + "loss": 0.4422, "step": 5288 }, { - "epoch": 5.647812166488794, - "grad_norm": 0.369140625, - "learning_rate": 2.7313651175541312e-05, - "loss": 0.4117, + "epoch": 5.416581371545548, + "grad_norm": 0.365234375, + "learning_rate": 3.701965500789287e-05, + "loss": 0.4196, "step": 5292 }, { - "epoch": 5.652081109925294, - "grad_norm": 0.35546875, - "learning_rate": 2.7146854297544602e-05, - "loss": 0.429, + "epoch": 5.420675537359263, + "grad_norm": 0.37890625, + "learning_rate": 3.68367030764689e-05, + "loss": 0.432, "step": 5296 }, { - "epoch": 5.656350053361793, - "grad_norm": 0.328125, - "learning_rate": 2.698051758612107e-05, - "loss": 0.4168, + "epoch": 5.424769703172979, + "grad_norm": 0.3515625, + "learning_rate": 3.665414104021729e-05, + "loss": 0.448, "step": 5300 }, { - "epoch": 5.660618996798292, - "grad_norm": 0.337890625, - "learning_rate": 2.6814641664312004e-05, - "loss": 0.4305, + "epoch": 5.428863868986694, + "grad_norm": 0.359375, + "learning_rate": 3.647196952814099e-05, + "loss": 0.435, "step": 5304 }, { - "epoch": 5.664887940234792, - "grad_norm": 0.341796875, - "learning_rate": 2.6649227153432916e-05, - "loss": 0.4034, + "epoch": 5.432958034800409, + "grad_norm": 0.36328125, + "learning_rate": 3.6290189167897526e-05, + "loss": 0.4299, "step": 5308 }, { - "epoch": 5.669156883671292, - "grad_norm": 0.376953125, - "learning_rate": 2.648427467307086e-05, - "loss": 0.4394, + "epoch": 5.437052200614125, + "grad_norm": 0.36328125, + "learning_rate": 3.6108800585796774e-05, + "loss": 0.4619, "step": 5312 }, { - "epoch": 5.673425827107791, - "grad_norm": 0.328125, - "learning_rate": 2.6319784841082464e-05, - "loss": 0.4339, + "epoch": 5.441146366427841, + "grad_norm": 0.353515625, + "learning_rate": 3.5927804406798655e-05, + "loss": 0.4072, "step": 5316 }, { - "epoch": 5.67769477054429, - "grad_norm": 0.345703125, - "learning_rate": 2.6155758273591215e-05, - "loss": 0.4682, + "epoch": 5.445240532241556, + "grad_norm": 0.361328125, + "learning_rate": 3.574720125451118e-05, + "loss": 0.425, "step": 5320 }, { - "epoch": 5.68196371398079, + "epoch": 5.449334698055271, "grad_norm": 0.3515625, - "learning_rate": 2.599219558498557e-05, - "loss": 0.4467, + "learning_rate": 3.556699175118824e-05, + "loss": 0.3989, "step": 5324 }, { - "epoch": 5.686232657417289, - "grad_norm": 0.375, - "learning_rate": 2.5829097387916294e-05, - "loss": 0.4677, + "epoch": 5.453428863868987, + "grad_norm": 0.361328125, + "learning_rate": 3.538717651772733e-05, + "loss": 0.4356, "step": 5328 }, { - "epoch": 5.690501600853788, - "grad_norm": 0.34765625, - "learning_rate": 2.5666464293294418e-05, - "loss": 0.4077, + "epoch": 5.457523029682702, + "grad_norm": 0.373046875, + "learning_rate": 3.520775617366763e-05, + "loss": 0.4129, "step": 5332 }, { - "epoch": 5.6947705442902885, - "grad_norm": 0.345703125, - "learning_rate": 2.5504296910288868e-05, - "loss": 0.4685, + "epoch": 5.461617195496418, + "grad_norm": 0.349609375, + "learning_rate": 3.502873133718775e-05, + "loss": 0.4549, "step": 5336 }, { - "epoch": 5.699039487726788, - "grad_norm": 0.380859375, - "learning_rate": 2.534259584632405e-05, - "loss": 0.4629, + "epoch": 5.465711361310133, + "grad_norm": 0.345703125, + "learning_rate": 3.4850102625103504e-05, + "loss": 0.436, "step": 5340 }, { - "epoch": 5.703308431163287, - "grad_norm": 0.3359375, - "learning_rate": 2.5181361707077852e-05, - "loss": 0.4192, + "epoch": 5.469805527123849, + "grad_norm": 0.365234375, + "learning_rate": 3.4671870652866e-05, + "loss": 0.4197, "step": 5344 }, { - "epoch": 5.707577374599786, - "grad_norm": 0.39453125, - "learning_rate": 2.5020595096479084e-05, - "loss": 0.4105, + "epoch": 5.473899692937564, + "grad_norm": 0.341796875, + "learning_rate": 3.449403603455941e-05, + "loss": 0.4587, "step": 5348 }, { - "epoch": 5.711846318036286, - "grad_norm": 0.353515625, - "learning_rate": 2.4860296616705457e-05, - "loss": 0.4562, + "epoch": 5.4779938587512795, + "grad_norm": 0.361328125, + "learning_rate": 3.431659938289875e-05, + "loss": 0.409, "step": 5352 }, { - "epoch": 5.716115261472785, - "grad_norm": 0.361328125, - "learning_rate": 2.4700466868181106e-05, - "loss": 0.4119, + "epoch": 5.4820880245649946, + "grad_norm": 0.36328125, + "learning_rate": 3.413956130922797e-05, + "loss": 0.3965, "step": 5356 }, { - "epoch": 5.720384204909285, - "grad_norm": 0.361328125, - "learning_rate": 2.454110644957462e-05, - "loss": 0.4017, + "epoch": 5.4861821903787105, + "grad_norm": 0.369140625, + "learning_rate": 3.396292242351779e-05, + "loss": 0.4204, "step": 5360 }, { - "epoch": 5.7246531483457845, - "grad_norm": 0.345703125, - "learning_rate": 2.4382215957796454e-05, - "loss": 0.4441, + "epoch": 5.4902763561924255, + "grad_norm": 0.330078125, + "learning_rate": 3.3786683334363376e-05, + "loss": 0.4489, "step": 5364 }, { - "epoch": 5.728922091782284, - "grad_norm": 0.373046875, - "learning_rate": 2.4223795987996997e-05, - "loss": 0.4319, + "epoch": 5.494370522006141, + "grad_norm": 0.35546875, + "learning_rate": 3.36108446489826e-05, + "loss": 0.4325, "step": 5368 }, { - "epoch": 5.733191035218783, + "epoch": 5.498464687819856, "grad_norm": 0.33984375, - "learning_rate": 2.406584713356421e-05, - "loss": 0.4403, + "learning_rate": 3.3435406973213777e-05, + "loss": 0.4531, "step": 5372 }, { - "epoch": 5.7374599786552825, - "grad_norm": 0.365234375, - "learning_rate": 2.390836998612134e-05, - "loss": 0.4331, + "epoch": 5.502558853633572, + "grad_norm": 0.3359375, + "learning_rate": 3.326037091151342e-05, + "loss": 0.4556, "step": 5376 }, { - "epoch": 5.741728922091783, - "grad_norm": 0.359375, - "learning_rate": 2.3751365135524887e-05, - "loss": 0.4297, + "epoch": 5.506653019447287, + "grad_norm": 0.365234375, + "learning_rate": 3.308573706695445e-05, + "loss": 0.4505, "step": 5380 }, { - "epoch": 5.745997865528282, - "grad_norm": 0.349609375, - "learning_rate": 2.359483316986216e-05, - "loss": 0.4406, + "epoch": 5.510747185261003, + "grad_norm": 0.337890625, + "learning_rate": 3.2911506041223984e-05, + "loss": 0.4272, "step": 5384 }, { - "epoch": 5.750266808964781, - "grad_norm": 0.3359375, - "learning_rate": 2.343877467544934e-05, - "loss": 0.4002, + "epoch": 5.514841351074718, + "grad_norm": 0.359375, + "learning_rate": 3.273767843462113e-05, + "loss": 0.4392, "step": 5388 }, { - "epoch": 5.754535752401281, - "grad_norm": 0.349609375, - "learning_rate": 2.3283190236828993e-05, - "loss": 0.4169, + "epoch": 5.518935516888434, + "grad_norm": 0.3515625, + "learning_rate": 3.2564254846055177e-05, + "loss": 0.4276, "step": 5392 }, { - "epoch": 5.75880469583778, - "grad_norm": 0.369140625, - "learning_rate": 2.312808043676818e-05, - "loss": 0.4487, + "epoch": 5.523029682702149, + "grad_norm": 0.3671875, + "learning_rate": 3.2391235873043396e-05, + "loss": 0.4145, "step": 5396 }, { - "epoch": 5.76307363927428, - "grad_norm": 0.365234375, - "learning_rate": 2.2973445856255983e-05, - "loss": 0.4472, + "epoch": 5.527123848515865, + "grad_norm": 0.34765625, + "learning_rate": 3.221862211170883e-05, + "loss": 0.3965, "step": 5400 }, { - "epoch": 5.767342582710779, - "grad_norm": 0.341796875, - "learning_rate": 2.2819287074501614e-05, - "loss": 0.4155, + "epoch": 5.53121801432958, + "grad_norm": 0.361328125, + "learning_rate": 3.2046414156778674e-05, + "loss": 0.4354, "step": 5404 }, { - "epoch": 5.771611526147279, - "grad_norm": 0.37109375, - "learning_rate": 2.2665604668931974e-05, - "loss": 0.3958, + "epoch": 5.535312180143296, + "grad_norm": 0.345703125, + "learning_rate": 3.187461260158166e-05, + "loss": 0.4275, "step": 5408 }, { - "epoch": 5.775880469583778, - "grad_norm": 0.36328125, - "learning_rate": 2.2512399215189735e-05, - "loss": 0.4213, + "epoch": 5.539406345957011, + "grad_norm": 0.37890625, + "learning_rate": 3.170321803804649e-05, + "loss": 0.4575, "step": 5412 }, { - "epoch": 5.780149413020277, - "grad_norm": 0.357421875, - "learning_rate": 2.2359671287130975e-05, - "loss": 0.4596, + "epoch": 5.543500511770727, + "grad_norm": 0.3515625, + "learning_rate": 3.153223105669957e-05, + "loss": 0.3868, "step": 5416 }, { - "epoch": 5.784418356456777, - "grad_norm": 0.68359375, - "learning_rate": 2.220742145682312e-05, - "loss": 0.4457, + "epoch": 5.547594677584442, + "grad_norm": 0.361328125, + "learning_rate": 3.1361652246662944e-05, + "loss": 0.4461, "step": 5420 }, { - "epoch": 5.788687299893277, - "grad_norm": 0.345703125, - "learning_rate": 2.205565029454293e-05, - "loss": 0.4362, + "epoch": 5.551688843398158, + "grad_norm": 0.37109375, + "learning_rate": 3.1191482195652405e-05, + "loss": 0.4191, "step": 5424 }, { - "epoch": 5.792956243329776, - "grad_norm": 0.34765625, - "learning_rate": 2.1904358368774034e-05, - "loss": 0.4062, + "epoch": 5.555783009211873, + "grad_norm": 0.369140625, + "learning_rate": 3.102172148997543e-05, + "loss": 0.4842, "step": 5428 }, { - "epoch": 5.797225186766275, - "grad_norm": 0.359375, - "learning_rate": 2.1753546246205224e-05, - "loss": 0.4612, + "epoch": 5.559877175025589, + "grad_norm": 0.333984375, + "learning_rate": 3.085237071452898e-05, + "loss": 0.439, "step": 5432 }, { - "epoch": 5.801494130202775, - "grad_norm": 0.33984375, - "learning_rate": 2.16032144917279e-05, - "loss": 0.422, + "epoch": 5.563971340839304, + "grad_norm": 0.365234375, + "learning_rate": 3.068343045279779e-05, + "loss": 0.4481, "step": 5436 }, { - "epoch": 5.805763073639274, - "grad_norm": 0.330078125, - "learning_rate": 2.1453363668434304e-05, - "loss": 0.4251, + "epoch": 5.56806550665302, + "grad_norm": 0.326171875, + "learning_rate": 3.0514901286852177e-05, + "loss": 0.3919, "step": 5440 }, { - "epoch": 5.810032017075773, - "grad_norm": 0.33203125, - "learning_rate": 2.1303994337615283e-05, - "loss": 0.4051, + "epoch": 5.572159672466735, + "grad_norm": 0.345703125, + "learning_rate": 3.0346783797345936e-05, + "loss": 0.4635, "step": 5444 }, { - "epoch": 5.8143009605122735, - "grad_norm": 0.357421875, - "learning_rate": 2.1155107058758024e-05, - "loss": 0.4795, + "epoch": 5.576253838280451, + "grad_norm": 0.337890625, + "learning_rate": 3.0179078563514625e-05, + "loss": 0.3848, "step": 5448 }, { - "epoch": 5.818569903948773, + "epoch": 5.580348004094166, "grad_norm": 0.353515625, - "learning_rate": 2.1006702389544315e-05, - "loss": 0.4346, + "learning_rate": 3.0011786163173357e-05, + "loss": 0.4465, "step": 5452 }, { - "epoch": 5.822838847385272, - "grad_norm": 0.34375, - "learning_rate": 2.0858780885848035e-05, - "loss": 0.4167, + "epoch": 5.584442169907881, + "grad_norm": 0.349609375, + "learning_rate": 2.9844907172714767e-05, + "loss": 0.4399, "step": 5456 }, { - "epoch": 5.827107790821771, - "grad_norm": 0.365234375, - "learning_rate": 2.0711343101733496e-05, - "loss": 0.4471, + "epoch": 5.588536335721597, + "grad_norm": 0.380859375, + "learning_rate": 2.9678442167107242e-05, + "loss": 0.475, "step": 5460 }, { - "epoch": 5.831376734258271, - "grad_norm": 0.353515625, - "learning_rate": 2.056438958945299e-05, - "loss": 0.4174, + "epoch": 5.592630501535313, + "grad_norm": 0.357421875, + "learning_rate": 2.951239171989278e-05, + "loss": 0.4141, "step": 5464 }, { - "epoch": 5.835645677694771, - "grad_norm": 0.33984375, - "learning_rate": 2.0417920899445013e-05, - "loss": 0.4145, + "epoch": 5.596724667349028, + "grad_norm": 0.373046875, + "learning_rate": 2.9346756403184974e-05, + "loss": 0.4536, "step": 5468 }, { - "epoch": 5.83991462113127, - "grad_norm": 0.3515625, - "learning_rate": 2.027193758033196e-05, - "loss": 0.4621, + "epoch": 5.600818833162743, + "grad_norm": 0.349609375, + "learning_rate": 2.9181536787667237e-05, + "loss": 0.4386, "step": 5472 }, { - "epoch": 5.8441835645677696, - "grad_norm": 0.373046875, - "learning_rate": 2.0126440178918285e-05, - "loss": 0.4449, + "epoch": 5.604912998976459, + "grad_norm": 0.337890625, + "learning_rate": 2.9016733442590683e-05, + "loss": 0.4425, "step": 5476 }, { - "epoch": 5.848452508004269, - "grad_norm": 0.35546875, - "learning_rate": 1.998142924018838e-05, - "loss": 0.4257, + "epoch": 5.6090071647901745, + "grad_norm": 0.345703125, + "learning_rate": 2.8852346935772107e-05, + "loss": 0.4275, "step": 5480 }, { - "epoch": 5.852721451440768, - "grad_norm": 0.34375, - "learning_rate": 1.983690530730439e-05, - "loss": 0.425, + "epoch": 5.6131013306038895, + "grad_norm": 0.392578125, + "learning_rate": 2.868837783359222e-05, + "loss": 0.4468, "step": 5484 }, { - "epoch": 5.856990394877268, - "grad_norm": 0.337890625, - "learning_rate": 1.9692868921604425e-05, - "loss": 0.3803, + "epoch": 5.6171954964176045, + "grad_norm": 0.34375, + "learning_rate": 2.8524826700993625e-05, + "loss": 0.4208, "step": 5488 }, { - "epoch": 5.861259338313768, - "grad_norm": 0.36328125, - "learning_rate": 1.954932062260029e-05, - "loss": 0.4228, + "epoch": 5.62128966223132, + "grad_norm": 0.34375, + "learning_rate": 2.8361694101478704e-05, + "loss": 0.4079, "step": 5492 }, { - "epoch": 5.865528281750267, - "grad_norm": 0.337890625, - "learning_rate": 1.940626094797571e-05, - "loss": 0.3995, + "epoch": 5.6253838280450355, + "grad_norm": 0.345703125, + "learning_rate": 2.8198980597107956e-05, + "loss": 0.4446, "step": 5496 }, { - "epoch": 5.869797225186766, - "grad_norm": 0.34765625, - "learning_rate": 1.9263690433584066e-05, - "loss": 0.4417, + "epoch": 5.629477993858751, + "grad_norm": 0.39453125, + "learning_rate": 2.8036686748497883e-05, + "loss": 0.4611, "step": 5500 }, { - "epoch": 5.874066168623266, - "grad_norm": 0.34765625, - "learning_rate": 1.9121609613446605e-05, - "loss": 0.411, + "epoch": 5.633572159672466, + "grad_norm": 0.337890625, + "learning_rate": 2.7874813114818994e-05, + "loss": 0.4355, "step": 5504 }, { - "epoch": 5.878335112059765, - "grad_norm": 0.359375, - "learning_rate": 1.8980019019750264e-05, - "loss": 0.4317, + "epoch": 5.637666325486182, + "grad_norm": 0.365234375, + "learning_rate": 2.7713360253794143e-05, + "loss": 0.4381, "step": 5508 }, { - "epoch": 5.882604055496264, - "grad_norm": 0.34765625, - "learning_rate": 1.8838919182845847e-05, - "loss": 0.462, + "epoch": 5.641760491299897, + "grad_norm": 0.3359375, + "learning_rate": 2.7552328721696388e-05, + "loss": 0.4526, "step": 5512 }, { - "epoch": 5.886872998932764, - "grad_norm": 0.345703125, - "learning_rate": 1.8698310631245816e-05, - "loss": 0.4091, + "epoch": 5.645854657113613, + "grad_norm": 0.35546875, + "learning_rate": 2.7391719073347018e-05, + "loss": 0.4705, "step": 5516 }, { - "epoch": 5.891141942369264, - "grad_norm": 0.375, - "learning_rate": 1.8558193891622597e-05, - "loss": 0.408, + "epoch": 5.649948822927328, + "grad_norm": 0.36328125, + "learning_rate": 2.723153186211392e-05, + "loss": 0.4437, "step": 5520 }, { - "epoch": 5.895410885805763, - "grad_norm": 0.35546875, - "learning_rate": 1.8418569488806385e-05, - "loss": 0.4194, + "epoch": 5.654042988741044, + "grad_norm": 0.375, + "learning_rate": 2.707176763990944e-05, + "loss": 0.438, "step": 5524 }, { - "epoch": 5.899679829242262, - "grad_norm": 0.361328125, - "learning_rate": 1.8279437945783166e-05, - "loss": 0.4776, + "epoch": 5.658137154554759, + "grad_norm": 0.3515625, + "learning_rate": 2.69124269571885e-05, + "loss": 0.4156, "step": 5528 }, { - "epoch": 5.903948772678762, - "grad_norm": 0.361328125, - "learning_rate": 1.814079978369302e-05, - "loss": 0.4469, + "epoch": 5.662231320368475, + "grad_norm": 0.33984375, + "learning_rate": 2.6753510362946823e-05, + "loss": 0.4496, "step": 5532 }, { - "epoch": 5.908217716115262, - "grad_norm": 0.365234375, - "learning_rate": 1.800265552182778e-05, - "loss": 0.5029, + "epoch": 5.66632548618219, + "grad_norm": 0.3515625, + "learning_rate": 2.659501840471898e-05, + "loss": 0.4761, "step": 5536 }, { - "epoch": 5.912486659551761, - "grad_norm": 0.361328125, - "learning_rate": 1.7865005677629495e-05, - "loss": 0.4646, + "epoch": 5.670419651995906, + "grad_norm": 0.37109375, + "learning_rate": 2.643695162857638e-05, + "loss": 0.4506, "step": 5540 }, { - "epoch": 5.91675560298826, - "grad_norm": 0.357421875, - "learning_rate": 1.7727850766688108e-05, - "loss": 0.4514, + "epoch": 5.674513817809621, + "grad_norm": 0.359375, + "learning_rate": 2.6279310579125562e-05, + "loss": 0.4432, "step": 5544 }, { - "epoch": 5.92102454642476, - "grad_norm": 0.369140625, - "learning_rate": 1.7591191302739866e-05, - "loss": 0.4539, + "epoch": 5.678607983623337, + "grad_norm": 0.34765625, + "learning_rate": 2.6122095799506394e-05, + "loss": 0.4281, "step": 5548 }, { - "epoch": 5.925293489861259, - "grad_norm": 0.365234375, - "learning_rate": 1.745502779766511e-05, - "loss": 0.4462, + "epoch": 5.682702149437052, + "grad_norm": 0.37109375, + "learning_rate": 2.596530783138983e-05, + "loss": 0.4595, "step": 5552 }, { - "epoch": 5.929562433297759, - "grad_norm": 0.341796875, - "learning_rate": 1.7319360761486566e-05, - "loss": 0.4291, + "epoch": 5.686796315250768, + "grad_norm": 0.36328125, + "learning_rate": 2.5808947214976428e-05, + "loss": 0.4131, "step": 5556 }, { - "epoch": 5.9338313767342585, - "grad_norm": 0.34765625, - "learning_rate": 1.7184190702367377e-05, - "loss": 0.4539, + "epoch": 5.690890481064483, + "grad_norm": 0.361328125, + "learning_rate": 2.5653014488994328e-05, + "loss": 0.4694, "step": 5560 }, { - "epoch": 5.938100320170758, - "grad_norm": 0.341796875, - "learning_rate": 1.704951812660908e-05, - "loss": 0.433, + "epoch": 5.694984646878199, + "grad_norm": 0.369140625, + "learning_rate": 2.5497510190697323e-05, + "loss": 0.4834, "step": 5564 }, { - "epoch": 5.942369263607257, - "grad_norm": 0.341796875, - "learning_rate": 1.6915343538649845e-05, - "loss": 0.5016, + "epoch": 5.699078812691914, + "grad_norm": 0.353515625, + "learning_rate": 2.5342434855863187e-05, + "loss": 0.4144, "step": 5568 }, { - "epoch": 5.9466382070437565, - "grad_norm": 0.345703125, - "learning_rate": 1.6781667441062607e-05, - "loss": 0.3977, + "epoch": 5.70317297850563, + "grad_norm": 0.3203125, + "learning_rate": 2.5187789018791703e-05, + "loss": 0.429, "step": 5572 }, { - "epoch": 5.950907150480256, - "grad_norm": 0.353515625, - "learning_rate": 1.6648490334553045e-05, - "loss": 0.4445, + "epoch": 5.707267144319345, + "grad_norm": 0.361328125, + "learning_rate": 2.5033573212302814e-05, + "loss": 0.4901, "step": 5576 }, { - "epoch": 5.955176093916756, - "grad_norm": 0.373046875, - "learning_rate": 1.6515812717957843e-05, - "loss": 0.4533, + "epoch": 5.711361310133061, + "grad_norm": 0.337890625, + "learning_rate": 2.4879787967734878e-05, + "loss": 0.39, "step": 5580 }, { - "epoch": 5.959445037353255, - "grad_norm": 0.34375, - "learning_rate": 1.638363508824271e-05, - "loss": 0.4409, + "epoch": 5.715455475946776, + "grad_norm": 0.345703125, + "learning_rate": 2.4726433814942814e-05, + "loss": 0.3819, "step": 5584 }, { - "epoch": 5.963713980789755, - "grad_norm": 0.353515625, - "learning_rate": 1.6251957940500636e-05, - "loss": 0.4325, + "epoch": 5.719549641760492, + "grad_norm": 0.349609375, + "learning_rate": 2.4573511282296126e-05, + "loss": 0.4443, "step": 5588 }, { - "epoch": 5.967982924226254, - "grad_norm": 0.3515625, - "learning_rate": 1.6120781767949897e-05, - "loss": 0.4231, + "epoch": 5.723643807574207, + "grad_norm": 0.34375, + "learning_rate": 2.4421020896677318e-05, + "loss": 0.4127, "step": 5592 }, { - "epoch": 5.972251867662753, - "grad_norm": 0.3515625, - "learning_rate": 1.5990107061932373e-05, - "loss": 0.396, + "epoch": 5.727737973387923, + "grad_norm": 0.34375, + "learning_rate": 2.4268963183479967e-05, + "loss": 0.488, "step": 5596 }, { - "epoch": 5.9765208110992525, + "epoch": 5.731832139201638, "grad_norm": 0.36328125, - "learning_rate": 1.5859934311911514e-05, - "loss": 0.4448, + "learning_rate": 2.4117338666606796e-05, + "loss": 0.4206, "step": 5600 }, { - "epoch": 5.980789754535753, - "grad_norm": 0.34375, - "learning_rate": 1.5730264005470734e-05, - "loss": 0.4321, + "epoch": 5.7359263050153535, + "grad_norm": 0.345703125, + "learning_rate": 2.39661478684681e-05, + "loss": 0.3882, "step": 5604 }, { - "epoch": 5.985058697972252, - "grad_norm": 0.375, - "learning_rate": 1.560109662831134e-05, - "loss": 0.4532, + "epoch": 5.7400204708290685, + "grad_norm": 0.353515625, + "learning_rate": 2.3815391309979843e-05, + "loss": 0.3997, "step": 5608 }, { - "epoch": 5.989327641408751, - "grad_norm": 0.373046875, - "learning_rate": 1.547243266425095e-05, - "loss": 0.4304, + "epoch": 5.744114636642784, + "grad_norm": 0.3359375, + "learning_rate": 2.366506951056173e-05, + "loss": 0.4571, "step": 5612 }, { - "epoch": 5.993596584845251, - "grad_norm": 0.361328125, - "learning_rate": 1.5344272595221448e-05, - "loss": 0.415, + "epoch": 5.7482088024564995, + "grad_norm": 0.3359375, + "learning_rate": 2.3515182988135618e-05, + "loss": 0.4476, "step": 5616 }, { - "epoch": 5.99786552828175, - "grad_norm": 0.341796875, - "learning_rate": 1.5216616901267398e-05, - "loss": 0.4507, + "epoch": 5.7523029682702145, + "grad_norm": 0.365234375, + "learning_rate": 2.336573225912371e-05, + "loss": 0.4467, "step": 5620 }, { - "epoch": 6.00213447171825, - "grad_norm": 0.322265625, - "learning_rate": 1.5089466060544086e-05, - "loss": 0.4352, + "epoch": 5.75639713408393, + "grad_norm": 0.341796875, + "learning_rate": 2.3216717838446565e-05, + "loss": 0.3617, "step": 5624 }, { - "epoch": 6.006403415154749, - "grad_norm": 0.357421875, - "learning_rate": 1.4962820549315808e-05, - "loss": 0.4194, + "epoch": 5.760491299897646, + "grad_norm": 0.341796875, + "learning_rate": 2.3068140239521588e-05, + "loss": 0.4339, "step": 5628 }, { - "epoch": 6.010672358591249, - "grad_norm": 0.35546875, - "learning_rate": 1.4836680841954085e-05, - "loss": 0.4156, + "epoch": 5.764585465711361, + "grad_norm": 0.357421875, + "learning_rate": 2.2919999974261177e-05, + "loss": 0.4, "step": 5632 }, { - "epoch": 6.014941302027748, - "grad_norm": 0.349609375, - "learning_rate": 1.471104741093581e-05, - "loss": 0.4571, + "epoch": 5.768679631525076, + "grad_norm": 0.3359375, + "learning_rate": 2.2772297553070784e-05, + "loss": 0.4217, "step": 5636 }, { - "epoch": 6.019210245464247, - "grad_norm": 0.349609375, - "learning_rate": 1.4585920726841627e-05, - "loss": 0.4524, + "epoch": 5.772773797338792, + "grad_norm": 0.33984375, + "learning_rate": 2.262503348484745e-05, + "loss": 0.4739, "step": 5640 }, { - "epoch": 6.023479188900747, - "grad_norm": 0.326171875, - "learning_rate": 1.446130125835394e-05, - "loss": 0.4418, + "epoch": 5.776867963152508, + "grad_norm": 0.373046875, + "learning_rate": 2.247820827697789e-05, + "loss": 0.4524, "step": 5644 }, { - "epoch": 6.027748132337247, - "grad_norm": 0.373046875, - "learning_rate": 1.4337189472255434e-05, - "loss": 0.4257, + "epoch": 5.780962128966223, + "grad_norm": 0.349609375, + "learning_rate": 2.2331822435336644e-05, + "loss": 0.418, "step": 5648 }, { - "epoch": 6.032017075773746, - "grad_norm": 0.330078125, - "learning_rate": 1.4213585833427089e-05, - "loss": 0.3953, + "epoch": 5.785056294779938, + "grad_norm": 0.345703125, + "learning_rate": 2.2185876464284554e-05, + "loss": 0.4336, "step": 5652 }, { - "epoch": 6.036286019210245, - "grad_norm": 0.33984375, - "learning_rate": 1.409049080484661e-05, - "loss": 0.497, + "epoch": 5.789150460593654, + "grad_norm": 0.3671875, + "learning_rate": 2.2040370866666945e-05, + "loss": 0.4555, "step": 5656 }, { - "epoch": 6.040554962646745, - "grad_norm": 0.337890625, - "learning_rate": 1.396790484758653e-05, - "loss": 0.4486, + "epoch": 5.793244626407369, + "grad_norm": 0.3671875, + "learning_rate": 2.1895306143811768e-05, + "loss": 0.4541, "step": 5660 }, { - "epoch": 6.044823906083244, - "grad_norm": 0.333984375, - "learning_rate": 1.3845828420812672e-05, - "loss": 0.4221, + "epoch": 5.797338792221085, + "grad_norm": 0.369140625, + "learning_rate": 2.175068279552805e-05, + "loss": 0.4081, "step": 5664 }, { - "epoch": 6.049092849519744, - "grad_norm": 0.359375, - "learning_rate": 1.372426198178228e-05, - "loss": 0.3952, + "epoch": 5.8014329580348, + "grad_norm": 0.328125, + "learning_rate": 2.16065013201041e-05, + "loss": 0.4147, "step": 5668 }, { - "epoch": 6.0533617929562435, - "grad_norm": 0.34765625, - "learning_rate": 1.3603205985842319e-05, - "loss": 0.4418, + "epoch": 5.805527123848516, + "grad_norm": 0.349609375, + "learning_rate": 2.146276221430572e-05, + "loss": 0.423, "step": 5672 }, { - "epoch": 6.057630736392743, - "grad_norm": 0.380859375, - "learning_rate": 1.3482660886427893e-05, - "loss": 0.3987, + "epoch": 5.809621289662231, + "grad_norm": 0.31640625, + "learning_rate": 2.131946597337463e-05, + "loss": 0.4226, "step": 5676 }, { - "epoch": 6.061899679829242, - "grad_norm": 0.6328125, - "learning_rate": 1.3362627135060356e-05, - "loss": 0.4132, + "epoch": 5.813715455475947, + "grad_norm": 0.34765625, + "learning_rate": 2.1176613091026716e-05, + "loss": 0.4253, "step": 5680 }, { - "epoch": 6.0661686232657415, - "grad_norm": 0.3515625, - "learning_rate": 1.3243105181345826e-05, - "loss": 0.4552, + "epoch": 5.817809621289662, + "grad_norm": 0.359375, + "learning_rate": 2.1034204059450193e-05, + "loss": 0.4547, "step": 5684 }, { - "epoch": 6.070437566702241, - "grad_norm": 0.36328125, - "learning_rate": 1.3124095472973306e-05, - "loss": 0.4393, + "epoch": 5.821903787103378, + "grad_norm": 0.341796875, + "learning_rate": 2.0892239369304122e-05, + "loss": 0.4063, "step": 5688 }, { - "epoch": 6.074706510138741, - "grad_norm": 0.36328125, - "learning_rate": 1.3005598455713201e-05, - "loss": 0.4293, + "epoch": 5.825997952917093, + "grad_norm": 0.357421875, + "learning_rate": 2.0750719509716584e-05, + "loss": 0.4275, "step": 5692 }, { - "epoch": 6.07897545357524, - "grad_norm": 0.341796875, - "learning_rate": 1.2887614573415434e-05, - "loss": 0.4416, + "epoch": 5.830092118730809, + "grad_norm": 0.376953125, + "learning_rate": 2.0609644968283068e-05, + "loss": 0.4478, "step": 5696 }, { - "epoch": 6.08324439701174, - "grad_norm": 0.318359375, - "learning_rate": 1.2770144268007998e-05, - "loss": 0.4191, + "epoch": 5.834186284544524, + "grad_norm": 0.375, + "learning_rate": 2.046901623106476e-05, + "loss": 0.3858, "step": 5700 }, { - "epoch": 6.087513340448239, + "epoch": 5.83828045035824, "grad_norm": 0.345703125, - "learning_rate": 1.2653187979495111e-05, - "loss": 0.4396, + "learning_rate": 2.0328833782586735e-05, + "loss": 0.4287, "step": 5704 }, { - "epoch": 6.091782283884738, - "grad_norm": 0.357421875, - "learning_rate": 1.2536746145955744e-05, - "loss": 0.4466, + "epoch": 5.842374616171955, + "grad_norm": 0.419921875, + "learning_rate": 2.018909810583657e-05, + "loss": 0.4537, "step": 5708 }, { - "epoch": 6.096051227321238, - "grad_norm": 0.34765625, - "learning_rate": 1.2420819203541848e-05, - "loss": 0.4069, + "epoch": 5.846468781985671, + "grad_norm": 0.369140625, + "learning_rate": 2.0049809682262485e-05, + "loss": 0.4189, "step": 5712 }, { - "epoch": 6.100320170757738, + "epoch": 5.850562947799386, "grad_norm": 0.3515625, - "learning_rate": 1.2305407586476746e-05, - "loss": 0.3761, + "learning_rate": 1.9910968991771642e-05, + "loss": 0.4401, "step": 5716 }, { - "epoch": 6.104589114194237, + "epoch": 5.854657113613102, "grad_norm": 0.3515625, - "learning_rate": 1.2190511727053571e-05, - "loss": 0.3931, + "learning_rate": 1.977257651272869e-05, + "loss": 0.4498, "step": 5720 }, { - "epoch": 6.108858057630736, - "grad_norm": 0.365234375, - "learning_rate": 1.2076132055633536e-05, - "loss": 0.4263, + "epoch": 5.858751279426817, + "grad_norm": 0.357421875, + "learning_rate": 1.963463272195394e-05, + "loss": 0.4612, "step": 5724 }, { - "epoch": 6.113127001067236, - "grad_norm": 0.3515625, - "learning_rate": 1.1962269000644486e-05, - "loss": 0.3988, + "epoch": 5.8628454452405325, + "grad_norm": 0.353515625, + "learning_rate": 1.949713809472177e-05, + "loss": 0.4135, "step": 5728 }, { - "epoch": 6.117395944503735, - "grad_norm": 0.353515625, - "learning_rate": 1.184892298857908e-05, - "loss": 0.4063, + "epoch": 5.8669396110542475, + "grad_norm": 0.359375, + "learning_rate": 1.9360093104759035e-05, + "loss": 0.4503, "step": 5732 }, { - "epoch": 6.121664887940235, - "grad_norm": 0.353515625, - "learning_rate": 1.1736094443993422e-05, - "loss": 0.4328, + "epoch": 5.8710337768679635, + "grad_norm": 0.37890625, + "learning_rate": 1.9223498224243395e-05, + "loss": 0.4107, "step": 5736 }, { - "epoch": 6.125933831376734, - "grad_norm": 0.357421875, - "learning_rate": 1.162378378950527e-05, - "loss": 0.4695, + "epoch": 5.8751279426816785, + "grad_norm": 0.3671875, + "learning_rate": 1.9087353923801723e-05, + "loss": 0.4364, "step": 5740 }, { - "epoch": 6.130202774813234, - "grad_norm": 0.3671875, - "learning_rate": 1.151199144579255e-05, - "loss": 0.4899, + "epoch": 5.879222108495394, + "grad_norm": 0.369140625, + "learning_rate": 1.8951660672508335e-05, + "loss": 0.4805, "step": 5744 }, { - "epoch": 6.134471718249733, - "grad_norm": 0.337890625, - "learning_rate": 1.1400717831591844e-05, - "loss": 0.4101, + "epoch": 5.883316274309109, + "grad_norm": 0.3828125, + "learning_rate": 1.8816418937883615e-05, + "loss": 0.4737, "step": 5748 }, { - "epoch": 6.138740661686232, - "grad_norm": 0.34765625, - "learning_rate": 1.1289963363696631e-05, - "loss": 0.42, + "epoch": 5.887410440122825, + "grad_norm": 0.3359375, + "learning_rate": 1.86816291858923e-05, + "loss": 0.4202, "step": 5752 }, { - "epoch": 6.1430096051227325, - "grad_norm": 0.337890625, - "learning_rate": 1.117972845695595e-05, - "loss": 0.4035, + "epoch": 5.89150460593654, + "grad_norm": 0.34765625, + "learning_rate": 1.8547291880941702e-05, + "loss": 0.4629, "step": 5756 }, { - "epoch": 6.147278548559232, - "grad_norm": 0.34375, - "learning_rate": 1.1070013524272664e-05, - "loss": 0.4258, + "epoch": 5.895598771750256, + "grad_norm": 0.353515625, + "learning_rate": 1.8413407485880394e-05, + "loss": 0.4427, "step": 5760 }, { - "epoch": 6.151547491995731, - "grad_norm": 0.3515625, - "learning_rate": 1.0960818976602059e-05, - "loss": 0.3922, + "epoch": 5.899692937563971, + "grad_norm": 0.33203125, + "learning_rate": 1.8279976461996505e-05, + "loss": 0.4057, "step": 5764 }, { - "epoch": 6.1558164354322304, - "grad_norm": 0.375, - "learning_rate": 1.0852145222950148e-05, - "loss": 0.4279, + "epoch": 5.903787103377686, + "grad_norm": 0.36328125, + "learning_rate": 1.814699926901597e-05, + "loss": 0.4321, "step": 5768 }, { - "epoch": 6.16008537886873, - "grad_norm": 0.341796875, - "learning_rate": 1.0743992670372304e-05, - "loss": 0.434, + "epoch": 5.907881269191402, + "grad_norm": 0.33984375, + "learning_rate": 1.8014476365101222e-05, + "loss": 0.4007, "step": 5772 }, { - "epoch": 6.164354322305229, - "grad_norm": 0.345703125, - "learning_rate": 1.0636361723971593e-05, - "loss": 0.4193, + "epoch": 5.911975435005118, + "grad_norm": 0.3515625, + "learning_rate": 1.7882408206849446e-05, + "loss": 0.423, "step": 5776 }, { - "epoch": 6.168623265741729, - "grad_norm": 0.359375, - "learning_rate": 1.0529252786897346e-05, - "loss": 0.4083, + "epoch": 5.916069600818833, + "grad_norm": 0.36328125, + "learning_rate": 1.7750795249290944e-05, + "loss": 0.4179, "step": 5780 }, { - "epoch": 6.172892209178229, - "grad_norm": 0.357421875, - "learning_rate": 1.0422666260343675e-05, - "loss": 0.4413, + "epoch": 5.920163766632548, + "grad_norm": 0.35546875, + "learning_rate": 1.7619637945887765e-05, + "loss": 0.4201, "step": 5784 }, { - "epoch": 6.177161152614728, - "grad_norm": 0.36328125, - "learning_rate": 1.0316602543547808e-05, - "loss": 0.4353, + "epoch": 5.924257932446264, + "grad_norm": 0.333984375, + "learning_rate": 1.748893674853205e-05, + "loss": 0.4133, "step": 5788 }, { - "epoch": 6.181430096051227, - "grad_norm": 0.369140625, - "learning_rate": 1.0211062033788819e-05, - "loss": 0.4319, + "epoch": 5.92835209825998, + "grad_norm": 0.341796875, + "learning_rate": 1.7358692107544363e-05, + "loss": 0.4251, "step": 5792 }, { - "epoch": 6.1856990394877265, - "grad_norm": 0.33203125, - "learning_rate": 1.0106045126385904e-05, - "loss": 0.4079, + "epoch": 5.932446264073695, + "grad_norm": 0.376953125, + "learning_rate": 1.7228904471672294e-05, + "loss": 0.4342, "step": 5796 }, { - "epoch": 6.189967982924227, - "grad_norm": 0.359375, - "learning_rate": 1.000155221469714e-05, - "loss": 0.4086, + "epoch": 5.93654042988741, + "grad_norm": 0.4609375, + "learning_rate": 1.7099574288088906e-05, + "loss": 0.4283, "step": 5800 }, { - "epoch": 6.194236926360726, - "grad_norm": 0.361328125, - "learning_rate": 9.89758369011781e-06, - "loss": 0.4405, + "epoch": 5.940634595701126, + "grad_norm": 0.376953125, + "learning_rate": 1.697070200239103e-05, + "loss": 0.4336, "step": 5804 }, { - "epoch": 6.198505869797225, - "grad_norm": 0.36328125, - "learning_rate": 9.794139942079055e-06, - "loss": 0.4013, + "epoch": 5.944728761514841, + "grad_norm": 0.326171875, + "learning_rate": 1.6842288058597946e-05, + "loss": 0.3919, "step": 5808 }, { - "epoch": 6.202774813233725, - "grad_norm": 0.34765625, - "learning_rate": 9.691221358046363e-06, - "loss": 0.4015, + "epoch": 5.948822927328557, + "grad_norm": 0.326171875, + "learning_rate": 1.6714332899149764e-05, + "loss": 0.4539, "step": 5812 }, { - "epoch": 6.207043756670224, - "grad_norm": 0.345703125, - "learning_rate": 9.58882832351814e-06, - "loss": 0.4249, + "epoch": 5.952917093142272, + "grad_norm": 0.373046875, + "learning_rate": 1.6586836964905775e-05, + "loss": 0.447, "step": 5816 }, { - "epoch": 6.211312700106723, - "grad_norm": 0.37890625, - "learning_rate": 9.486961222024287e-06, - "loss": 0.4196, + "epoch": 5.957011258955988, + "grad_norm": 0.36328125, + "learning_rate": 1.6459800695143166e-05, + "loss": 0.4306, "step": 5820 }, { - "epoch": 6.215581643543223, - "grad_norm": 0.333984375, - "learning_rate": 9.385620435124691e-06, - "loss": 0.4236, + "epoch": 5.961105424769703, + "grad_norm": 0.33984375, + "learning_rate": 1.6333224527555332e-05, + "loss": 0.3874, "step": 5824 }, { - "epoch": 6.219850586979723, - "grad_norm": 0.34765625, - "learning_rate": 9.28480634240788e-06, - "loss": 0.4004, + "epoch": 5.965199590583419, + "grad_norm": 0.3359375, + "learning_rate": 1.620710889825039e-05, + "loss": 0.3592, "step": 5828 }, { - "epoch": 6.224119530416222, - "grad_norm": 0.349609375, - "learning_rate": 9.184519321489509e-06, - "loss": 0.4866, + "epoch": 5.969293756397134, + "grad_norm": 0.359375, + "learning_rate": 1.6081454241749782e-05, + "loss": 0.4402, "step": 5832 }, { - "epoch": 6.228388473852721, - "grad_norm": 0.361328125, - "learning_rate": 9.084759748011095e-06, - "loss": 0.4178, + "epoch": 5.97338792221085, + "grad_norm": 0.357421875, + "learning_rate": 1.595626099098667e-05, + "loss": 0.4694, "step": 5836 }, { - "epoch": 6.232657417289221, - "grad_norm": 0.34765625, - "learning_rate": 8.985527995638386e-06, - "loss": 0.418, + "epoch": 5.977482088024565, + "grad_norm": 0.375, + "learning_rate": 1.583152957730447e-05, + "loss": 0.4011, "step": 5840 }, { - "epoch": 6.23692636072572, - "grad_norm": 0.361328125, - "learning_rate": 8.886824436060213e-06, - "loss": 0.4134, + "epoch": 5.981576253838281, + "grad_norm": 0.369140625, + "learning_rate": 1.5707260430455413e-05, + "loss": 0.4348, "step": 5844 }, { - "epoch": 6.24119530416222, - "grad_norm": 0.357421875, - "learning_rate": 8.788649438986888e-06, - "loss": 0.4561, + "epoch": 5.985670419651996, + "grad_norm": 0.333984375, + "learning_rate": 1.558345397859893e-05, + "loss": 0.3799, "step": 5848 }, { - "epoch": 6.245464247598719, - "grad_norm": 0.33984375, - "learning_rate": 8.691003372148959e-06, - "loss": 0.3838, + "epoch": 5.9897645854657116, + "grad_norm": 0.357421875, + "learning_rate": 1.54601106483004e-05, + "loss": 0.4204, "step": 5852 }, { - "epoch": 6.249733191035219, - "grad_norm": 0.361328125, - "learning_rate": 8.593886601295736e-06, - "loss": 0.4071, + "epoch": 5.993858751279427, + "grad_norm": 0.369140625, + "learning_rate": 1.53372308645295e-05, + "loss": 0.4476, "step": 5856 }, { - "epoch": 6.254002134471718, - "grad_norm": 0.330078125, - "learning_rate": 8.497299490194004e-06, - "loss": 0.424, + "epoch": 5.9979529170931425, + "grad_norm": 0.36328125, + "learning_rate": 1.521481505065873e-05, + "loss": 0.4125, "step": 5860 }, { - "epoch": 6.258271077908217, - "grad_norm": 0.36328125, - "learning_rate": 8.401242400626601e-06, - "loss": 0.4589, + "epoch": 6.0020470829068575, + "grad_norm": 0.341796875, + "learning_rate": 1.5092863628462093e-05, + "loss": 0.4496, "step": 5864 }, { - "epoch": 6.2625400213447175, - "grad_norm": 0.341796875, - "learning_rate": 8.305715692391069e-06, - "loss": 0.409, + "epoch": 6.006141248720573, + "grad_norm": 0.357421875, + "learning_rate": 1.4971377018113617e-05, + "loss": 0.4461, "step": 5868 }, { - "epoch": 6.266808964781217, - "grad_norm": 0.349609375, - "learning_rate": 8.21071972329837e-06, - "loss": 0.3921, + "epoch": 6.0102354145342884, + "grad_norm": 0.341796875, + "learning_rate": 1.4850355638185713e-05, + "loss": 0.3945, "step": 5872 }, { - "epoch": 6.271077908217716, - "grad_norm": 0.3359375, - "learning_rate": 8.116254849171427e-06, - "loss": 0.3989, + "epoch": 6.014329580348004, + "grad_norm": 0.328125, + "learning_rate": 1.472979990564797e-05, + "loss": 0.4214, "step": 5876 }, { - "epoch": 6.2753468516542155, - "grad_norm": 0.3515625, - "learning_rate": 8.022321423843903e-06, - "loss": 0.3277, + "epoch": 6.018423746161719, + "grad_norm": 0.33203125, + "learning_rate": 1.460971023586565e-05, + "loss": 0.4268, "step": 5880 }, { - "epoch": 6.279615795090715, - "grad_norm": 0.345703125, - "learning_rate": 7.928919799158818e-06, - "loss": 0.4409, + "epoch": 6.022517911975435, + "grad_norm": 0.337890625, + "learning_rate": 1.4490087042598147e-05, + "loss": 0.4359, "step": 5884 }, { - "epoch": 6.283884738527215, - "grad_norm": 0.341796875, - "learning_rate": 7.83605032496727e-06, - "loss": 0.4155, + "epoch": 6.02661207778915, + "grad_norm": 0.353515625, + "learning_rate": 1.4370930737997722e-05, + "loss": 0.4153, "step": 5888 }, { - "epoch": 6.288153681963714, - "grad_norm": 0.37109375, - "learning_rate": 7.74371334912705e-06, - "loss": 0.4119, + "epoch": 6.030706243602866, + "grad_norm": 0.337890625, + "learning_rate": 1.4252241732608004e-05, + "loss": 0.4195, "step": 5892 }, { - "epoch": 6.292422625400214, - "grad_norm": 0.3515625, - "learning_rate": 7.651909217501467e-06, - "loss": 0.421, + "epoch": 6.034800409416581, + "grad_norm": 0.337890625, + "learning_rate": 1.4134020435362487e-05, + "loss": 0.3509, "step": 5896 }, { - "epoch": 6.296691568836713, - "grad_norm": 0.361328125, - "learning_rate": 7.56063827395787e-06, - "loss": 0.4199, + "epoch": 6.038894575230297, + "grad_norm": 0.359375, + "learning_rate": 1.4016267253583324e-05, + "loss": 0.4191, "step": 5900 }, { - "epoch": 6.300960512273212, - "grad_norm": 0.33203125, - "learning_rate": 7.469900860366573e-06, - "loss": 0.4212, + "epoch": 6.042988741044012, + "grad_norm": 0.35546875, + "learning_rate": 1.3898982592979802e-05, + "loss": 0.4477, "step": 5904 }, { - "epoch": 6.3052294557097115, - "grad_norm": 0.365234375, - "learning_rate": 7.379697316599376e-06, - "loss": 0.4452, + "epoch": 6.047082906857728, + "grad_norm": 0.330078125, + "learning_rate": 1.378216685764686e-05, + "loss": 0.4107, "step": 5908 }, { - "epoch": 6.309498399146212, - "grad_norm": 0.326171875, - "learning_rate": 7.290027980528473e-06, - "loss": 0.3906, + "epoch": 6.051177072671443, + "grad_norm": 0.3671875, + "learning_rate": 1.3665820450063898e-05, + "loss": 0.3961, "step": 5912 }, { - "epoch": 6.313767342582711, - "grad_norm": 0.33984375, - "learning_rate": 7.200893188024998e-06, - "loss": 0.419, + "epoch": 6.055271238485159, + "grad_norm": 0.369140625, + "learning_rate": 1.3549943771093258e-05, + "loss": 0.4052, "step": 5916 }, { - "epoch": 6.31803628601921, - "grad_norm": 0.3515625, - "learning_rate": 7.1122932729579145e-06, - "loss": 0.446, + "epoch": 6.059365404298874, + "grad_norm": 0.328125, + "learning_rate": 1.3434537219978813e-05, + "loss": 0.389, "step": 5920 }, { - "epoch": 6.32230522945571, - "grad_norm": 0.357421875, - "learning_rate": 7.024228567192741e-06, - "loss": 0.4435, + "epoch": 6.06345957011259, + "grad_norm": 0.333984375, + "learning_rate": 1.3319601194344698e-05, + "loss": 0.4063, "step": 5924 }, { - "epoch": 6.326574172892209, - "grad_norm": 0.337890625, - "learning_rate": 6.936699400590201e-06, - "loss": 0.4534, + "epoch": 6.067553735926305, + "grad_norm": 0.341796875, + "learning_rate": 1.3205136090193923e-05, + "loss": 0.436, "step": 5928 }, { - "epoch": 6.330843116328708, - "grad_norm": 0.365234375, - "learning_rate": 6.84970610100517e-06, - "loss": 0.4267, + "epoch": 6.071647901740021, + "grad_norm": 0.37109375, + "learning_rate": 1.3091142301906887e-05, + "loss": 0.4485, "step": 5932 }, { - "epoch": 6.335112059765208, + "epoch": 6.075742067553736, "grad_norm": 0.353515625, - "learning_rate": 6.763248994285297e-06, - "loss": 0.4479, + "learning_rate": 1.2977620222240165e-05, + "loss": 0.4521, "step": 5936 }, { - "epoch": 6.339381003201708, - "grad_norm": 0.353515625, - "learning_rate": 6.677328404269788e-06, - "loss": 0.4158, + "epoch": 6.079836233367452, + "grad_norm": 0.34375, + "learning_rate": 1.2864570242325133e-05, + "loss": 0.4712, "step": 5940 }, { - "epoch": 6.343649946638207, - "grad_norm": 0.34765625, - "learning_rate": 6.591944652788323e-06, - "loss": 0.454, + "epoch": 6.083930399181167, + "grad_norm": 0.3515625, + "learning_rate": 1.2751992751666457e-05, + "loss": 0.4428, "step": 5944 }, { - "epoch": 6.347918890074706, - "grad_norm": 0.369140625, - "learning_rate": 6.507098059659705e-06, - "loss": 0.4538, + "epoch": 6.088024564994882, + "grad_norm": 0.359375, + "learning_rate": 1.2639888138141014e-05, + "loss": 0.4535, "step": 5948 }, { - "epoch": 6.352187833511206, - "grad_norm": 0.36328125, - "learning_rate": 6.422788942690748e-06, - "loss": 0.436, + "epoch": 6.092118730808598, + "grad_norm": 0.337890625, + "learning_rate": 1.2528256787996372e-05, + "loss": 0.4473, "step": 5952 }, { - "epoch": 6.356456776947706, - "grad_norm": 0.34375, - "learning_rate": 6.339017617675046e-06, - "loss": 0.4181, + "epoch": 6.096212896622313, + "grad_norm": 0.373046875, + "learning_rate": 1.2417099085849468e-05, + "loss": 0.3956, "step": 5956 }, { - "epoch": 6.360725720384205, - "grad_norm": 0.3671875, - "learning_rate": 6.255784398391833e-06, - "loss": 0.4432, + "epoch": 6.100307062436029, + "grad_norm": 0.3359375, + "learning_rate": 1.2306415414685366e-05, + "loss": 0.4278, "step": 5960 }, { - "epoch": 6.3649946638207044, - "grad_norm": 0.35546875, - "learning_rate": 6.173089596604724e-06, - "loss": 0.4172, + "epoch": 6.104401228249744, + "grad_norm": 0.392578125, + "learning_rate": 1.219620615585593e-05, + "loss": 0.445, "step": 5964 }, { - "epoch": 6.369263607257204, - "grad_norm": 0.37109375, - "learning_rate": 6.090933522060665e-06, - "loss": 0.4493, + "epoch": 6.10849539406346, + "grad_norm": 0.353515625, + "learning_rate": 1.2086471689078353e-05, + "loss": 0.4198, "step": 5968 }, { - "epoch": 6.373532550693703, - "grad_norm": 0.34765625, - "learning_rate": 6.009316482488696e-06, - "loss": 0.4276, + "epoch": 6.112589559877175, + "grad_norm": 0.37890625, + "learning_rate": 1.1977212392434082e-05, + "loss": 0.4577, "step": 5972 }, { - "epoch": 6.377801494130202, - "grad_norm": 0.322265625, - "learning_rate": 5.92823878359876e-06, - "loss": 0.4382, + "epoch": 6.116683725690891, + "grad_norm": 0.3359375, + "learning_rate": 1.1868428642367378e-05, + "loss": 0.4793, "step": 5976 }, { - "epoch": 6.382070437566703, - "grad_norm": 0.357421875, - "learning_rate": 5.8477007290807285e-06, - "loss": 0.413, + "epoch": 6.120777891504606, + "grad_norm": 0.3515625, + "learning_rate": 1.1760120813684009e-05, + "loss": 0.4424, "step": 5980 }, { - "epoch": 6.386339381003202, + "epoch": 6.1248720573183215, "grad_norm": 0.349609375, - "learning_rate": 5.767702620602976e-06, - "loss": 0.4251, + "learning_rate": 1.1652289279550026e-05, + "loss": 0.4027, "step": 5984 }, { - "epoch": 6.390608324439701, - "grad_norm": 0.37109375, - "learning_rate": 5.6882447578116105e-06, - "loss": 0.4585, + "epoch": 6.1289662231320365, + "grad_norm": 0.34765625, + "learning_rate": 1.1544934411490469e-05, + "loss": 0.3947, "step": 5988 }, { - "epoch": 6.3948772678762005, - "grad_norm": 0.328125, - "learning_rate": 5.609327438328992e-06, - "loss": 0.4004, + "epoch": 6.1330603889457525, + "grad_norm": 0.353515625, + "learning_rate": 1.1438056579387966e-05, + "loss": 0.4369, "step": 5992 }, { - "epoch": 6.3991462113127, - "grad_norm": 0.34765625, - "learning_rate": 5.5309509577529165e-06, - "loss": 0.4432, + "epoch": 6.1371545547594675, + "grad_norm": 0.349609375, + "learning_rate": 1.1331656151481654e-05, + "loss": 0.4243, "step": 5996 }, { - "epoch": 6.4034151547492, + "epoch": 6.141248720573183, "grad_norm": 0.34765625, - "learning_rate": 5.453115609655284e-06, - "loss": 0.4075, + "learning_rate": 1.1225733494365791e-05, + "loss": 0.4472, "step": 6000 }, { - "epoch": 6.407684098185699, + "epoch": 6.145342886386898, "grad_norm": 0.3515625, - "learning_rate": 5.37582168558115e-06, - "loss": 0.4164, + "learning_rate": 1.1120288972988445e-05, + "loss": 0.3859, "step": 6004 }, { - "epoch": 6.411953041622199, - "grad_norm": 0.349609375, - "learning_rate": 5.299069475047591e-06, - "loss": 0.4106, + "epoch": 6.149437052200614, + "grad_norm": 0.330078125, + "learning_rate": 1.1015322950650408e-05, + "loss": 0.4321, "step": 6008 }, { - "epoch": 6.416221985058698, - "grad_norm": 0.357421875, - "learning_rate": 5.222859265542539e-06, - "loss": 0.4265, + "epoch": 6.153531218014329, + "grad_norm": 0.34375, + "learning_rate": 1.0910835789003785e-05, + "loss": 0.4198, "step": 6012 }, { - "epoch": 6.420490928495197, - "grad_norm": 0.3515625, - "learning_rate": 5.147191342523838e-06, - "loss": 0.3976, + "epoch": 6.157625383828045, + "grad_norm": 0.35546875, + "learning_rate": 1.0806827848050791e-05, + "loss": 0.4539, "step": 6016 }, { - "epoch": 6.4247598719316965, - "grad_norm": 0.34765625, - "learning_rate": 5.072065989418067e-06, - "loss": 0.3704, + "epoch": 6.16171954964176, + "grad_norm": 0.341796875, + "learning_rate": 1.0703299486142541e-05, + "loss": 0.4371, "step": 6020 }, { - "epoch": 6.429028815368197, - "grad_norm": 0.33984375, - "learning_rate": 4.997483487619552e-06, - "loss": 0.4541, + "epoch": 6.165813715455476, + "grad_norm": 0.337890625, + "learning_rate": 1.0600251059977854e-05, + "loss": 0.3967, "step": 6024 }, { - "epoch": 6.433297758804696, + "epoch": 6.169907881269191, "grad_norm": 0.341796875, - "learning_rate": 4.923444116489245e-06, - "loss": 0.4111, + "learning_rate": 1.0497682924601841e-05, + "loss": 0.4088, "step": 6028 }, { - "epoch": 6.437566702241195, - "grad_norm": 0.349609375, - "learning_rate": 4.849948153353755e-06, - "loss": 0.3754, + "epoch": 6.174002047082907, + "grad_norm": 0.34765625, + "learning_rate": 1.0395595433404935e-05, + "loss": 0.4137, "step": 6032 }, { - "epoch": 6.441835645677695, - "grad_norm": 0.328125, - "learning_rate": 4.776995873504241e-06, - "loss": 0.4393, + "epoch": 6.178096212896622, + "grad_norm": 0.3671875, + "learning_rate": 1.029398893812151e-05, + "loss": 0.4145, "step": 6036 }, { - "epoch": 6.446104589114194, - "grad_norm": 0.34375, - "learning_rate": 4.704587550195404e-06, - "loss": 0.4263, + "epoch": 6.182190378710338, + "grad_norm": 0.357421875, + "learning_rate": 1.0192863788828654e-05, + "loss": 0.4265, "step": 6040 }, { - "epoch": 6.450373532550694, - "grad_norm": 0.318359375, - "learning_rate": 4.632723454644477e-06, - "loss": 0.3849, + "epoch": 6.186284544524053, + "grad_norm": 0.349609375, + "learning_rate": 1.0092220333945073e-05, + "loss": 0.4127, "step": 6044 }, { - "epoch": 6.454642475987193, - "grad_norm": 0.36328125, - "learning_rate": 4.561403856030188e-06, - "loss": 0.4042, + "epoch": 6.190378710337769, + "grad_norm": 0.359375, + "learning_rate": 9.992058920229823e-06, + "loss": 0.4384, "step": 6048 }, { - "epoch": 6.458911419423693, - "grad_norm": 0.341796875, - "learning_rate": 4.490629021491815e-06, - "loss": 0.4332, + "epoch": 6.194472876151484, + "grad_norm": 0.369140625, + "learning_rate": 9.892379892781088e-06, + "loss": 0.4406, "step": 6052 }, { - "epoch": 6.463180362860192, - "grad_norm": 0.361328125, - "learning_rate": 4.420399216128051e-06, - "loss": 0.4284, + "epoch": 6.1985670419652, + "grad_norm": 0.35546875, + "learning_rate": 9.793183595035082e-06, + "loss": 0.4378, "step": 6056 }, { - "epoch": 6.467449306296691, - "grad_norm": 0.3359375, - "learning_rate": 4.350714702996122e-06, - "loss": 0.4086, + "epoch": 6.202661207778915, + "grad_norm": 0.353515625, + "learning_rate": 9.694470368764812e-06, + "loss": 0.3789, "step": 6060 }, { - "epoch": 6.471718249733191, - "grad_norm": 0.359375, - "learning_rate": 4.281575743110804e-06, - "loss": 0.4516, + "epoch": 6.206755373592631, + "grad_norm": 0.345703125, + "learning_rate": 9.596240554078838e-06, + "loss": 0.3834, "step": 6064 }, { - "epoch": 6.475987193169691, - "grad_norm": 0.33203125, - "learning_rate": 4.212982595443376e-06, - "loss": 0.4195, + "epoch": 6.210849539406346, + "grad_norm": 0.361328125, + "learning_rate": 9.4984944894202e-06, + "loss": 0.4451, "step": 6068 }, { - "epoch": 6.48025613660619, - "grad_norm": 0.328125, - "learning_rate": 4.144935516920733e-06, - "loss": 0.4371, + "epoch": 6.214943705220062, + "grad_norm": 0.373046875, + "learning_rate": 9.40123251156527e-06, + "loss": 0.4477, "step": 6072 }, { - "epoch": 6.4845250800426895, - "grad_norm": 0.353515625, - "learning_rate": 4.077434762424308e-06, - "loss": 0.4639, + "epoch": 6.219037871033777, + "grad_norm": 0.345703125, + "learning_rate": 9.304454955622425e-06, + "loss": 0.401, "step": 6076 }, { - "epoch": 6.488794023479189, - "grad_norm": 0.345703125, - "learning_rate": 4.010480584789289e-06, - "loss": 0.3879, + "epoch": 6.223132036847493, + "grad_norm": 0.375, + "learning_rate": 9.208162155031074e-06, + "loss": 0.4438, "step": 6080 }, { - "epoch": 6.493062966915688, - "grad_norm": 0.333984375, - "learning_rate": 3.9440732348034296e-06, - "loss": 0.463, + "epoch": 6.227226202661208, + "grad_norm": 0.33984375, + "learning_rate": 9.112354441560476e-06, + "loss": 0.4212, "step": 6084 }, { - "epoch": 6.497331910352187, - "grad_norm": 0.359375, - "learning_rate": 3.878212961206412e-06, - "loss": 0.3724, + "epoch": 6.231320368474924, + "grad_norm": 0.3125, + "learning_rate": 9.017032145308483e-06, + "loss": 0.4082, "step": 6088 }, { - "epoch": 6.501600853788688, - "grad_norm": 0.357421875, - "learning_rate": 3.812900010688619e-06, - "loss": 0.3974, + "epoch": 6.235414534288639, + "grad_norm": 0.34765625, + "learning_rate": 8.9221955947005e-06, + "loss": 0.4046, "step": 6092 }, { - "epoch": 6.505869797225187, - "grad_norm": 0.345703125, - "learning_rate": 3.748134627890442e-06, - "loss": 0.4343, + "epoch": 6.239508700102354, + "grad_norm": 0.37890625, + "learning_rate": 8.82784511648838e-06, + "loss": 0.418, "step": 6096 }, { - "epoch": 6.510138740661686, - "grad_norm": 0.357421875, - "learning_rate": 3.6839170554011977e-06, - "loss": 0.4224, + "epoch": 6.24360286591607, + "grad_norm": 0.35546875, + "learning_rate": 8.733981035749193e-06, + "loss": 0.4293, "step": 6100 }, { - "epoch": 6.5144076840981855, - "grad_norm": 0.32421875, - "learning_rate": 3.620247533758325e-06, - "loss": 0.413, + "epoch": 6.247697031729785, + "grad_norm": 0.34375, + "learning_rate": 8.640603675884194e-06, + "loss": 0.4244, "step": 6104 }, { - "epoch": 6.518676627534685, - "grad_norm": 0.341796875, - "learning_rate": 3.557126301446439e-06, - "loss": 0.3969, + "epoch": 6.2517911975435005, + "grad_norm": 0.3671875, + "learning_rate": 8.547713358617714e-06, + "loss": 0.4109, "step": 6108 }, { - "epoch": 6.522945570971185, + "epoch": 6.255885363357216, "grad_norm": 0.357421875, - "learning_rate": 3.494553594896482e-06, - "loss": 0.4197, + "learning_rate": 8.455310403995924e-06, + "loss": 0.4285, "step": 6112 }, { - "epoch": 6.527214514407684, - "grad_norm": 0.37890625, - "learning_rate": 3.432529648484722e-06, - "loss": 0.4544, + "epoch": 6.2599795291709315, + "grad_norm": 0.36328125, + "learning_rate": 8.363395130385908e-06, + "loss": 0.4134, "step": 6116 }, { - "epoch": 6.531483457844184, - "grad_norm": 0.322265625, - "learning_rate": 3.3710546945320383e-06, - "loss": 0.3835, + "epoch": 6.2640736949846465, + "grad_norm": 0.353515625, + "learning_rate": 8.27196785447446e-06, + "loss": 0.438, "step": 6120 }, { - "epoch": 6.535752401280683, - "grad_norm": 0.337890625, - "learning_rate": 3.3101289633029384e-06, - "loss": 0.4282, + "epoch": 6.268167860798362, + "grad_norm": 0.3515625, + "learning_rate": 8.181028891267017e-06, + "loss": 0.4141, "step": 6124 }, { - "epoch": 6.540021344717182, - "grad_norm": 0.345703125, - "learning_rate": 3.2497526830047084e-06, - "loss": 0.3845, + "epoch": 6.272262026612077, + "grad_norm": 0.36328125, + "learning_rate": 8.09057855408658e-06, + "loss": 0.4089, "step": 6128 }, { - "epoch": 6.544290288153682, - "grad_norm": 0.322265625, - "learning_rate": 3.1899260797866145e-06, - "loss": 0.405, + "epoch": 6.276356192425793, + "grad_norm": 0.34765625, + "learning_rate": 8.000617154572597e-06, + "loss": 0.4153, "step": 6132 }, { - "epoch": 6.548559231590182, - "grad_norm": 0.34765625, - "learning_rate": 3.130649377738953e-06, - "loss": 0.4102, + "epoch": 6.280450358239508, + "grad_norm": 0.359375, + "learning_rate": 7.91114500267993e-06, + "loss": 0.4158, "step": 6136 }, { - "epoch": 6.552828175026681, - "grad_norm": 0.34375, - "learning_rate": 3.0719227988923844e-06, - "loss": 0.4326, + "epoch": 6.284544524053224, + "grad_norm": 0.3515625, + "learning_rate": 7.82216240667784e-06, + "loss": 0.4268, "step": 6140 }, { - "epoch": 6.55709711846318, - "grad_norm": 0.349609375, - "learning_rate": 3.013746563216851e-06, - "loss": 0.3952, + "epoch": 6.288638689866939, + "grad_norm": 0.353515625, + "learning_rate": 7.733669673148768e-06, + "loss": 0.4038, "step": 6144 }, { - "epoch": 6.56136606189968, - "grad_norm": 0.337890625, - "learning_rate": 2.9561208886210275e-06, - "loss": 0.4046, + "epoch": 6.292732855680655, + "grad_norm": 0.359375, + "learning_rate": 7.645667106987407e-06, + "loss": 0.4577, "step": 6148 }, { - "epoch": 6.565635005336179, - "grad_norm": 0.33984375, - "learning_rate": 2.899045990951271e-06, - "loss": 0.4215, + "epoch": 6.29682702149437, + "grad_norm": 0.34765625, + "learning_rate": 7.558155011399669e-06, + "loss": 0.412, "step": 6152 }, { - "epoch": 6.569903948772679, - "grad_norm": 0.33203125, - "learning_rate": 2.8425220839909892e-06, - "loss": 0.4504, + "epoch": 6.300921187308086, + "grad_norm": 0.33984375, + "learning_rate": 7.471133687901498e-06, + "loss": 0.3985, "step": 6156 }, { - "epoch": 6.574172892209178, - "grad_norm": 0.349609375, - "learning_rate": 2.7865493794597072e-06, - "loss": 0.4461, + "epoch": 6.305015353121801, + "grad_norm": 0.33984375, + "learning_rate": 7.384603436317993e-06, + "loss": 0.4508, "step": 6160 }, { - "epoch": 6.578441835645678, - "grad_norm": 0.34765625, - "learning_rate": 2.7311280870123687e-06, - "loss": 0.4312, + "epoch": 6.309109518935517, + "grad_norm": 0.3515625, + "learning_rate": 7.298564554782288e-06, + "loss": 0.4141, "step": 6164 }, { - "epoch": 6.582710779082177, - "grad_norm": 0.33984375, - "learning_rate": 2.6762584142385023e-06, - "loss": 0.4268, + "epoch": 6.313203684749232, + "grad_norm": 0.357421875, + "learning_rate": 7.213017339734506e-06, + "loss": 0.4444, "step": 6168 }, { - "epoch": 6.586979722518676, - "grad_norm": 0.341796875, - "learning_rate": 2.6219405666614402e-06, - "loss": 0.4287, + "epoch": 6.317297850562948, + "grad_norm": 0.34375, + "learning_rate": 7.127962085920808e-06, + "loss": 0.4156, "step": 6172 }, { - "epoch": 6.591248665955176, - "grad_norm": 0.3515625, - "learning_rate": 2.5681747477375834e-06, - "loss": 0.4265, + "epoch": 6.321392016376663, + "grad_norm": 0.341796875, + "learning_rate": 7.043399086392343e-06, + "loss": 0.4466, "step": 6176 }, { - "epoch": 6.595517609391676, - "grad_norm": 0.369140625, - "learning_rate": 2.5149611588556053e-06, - "loss": 0.4699, + "epoch": 6.325486182190379, + "grad_norm": 0.345703125, + "learning_rate": 6.9593286325042185e-06, + "loss": 0.4325, "step": 6180 }, { - "epoch": 6.599786552828175, - "grad_norm": 0.357421875, - "learning_rate": 2.462299999335715e-06, - "loss": 0.4402, + "epoch": 6.329580348004094, + "grad_norm": 0.375, + "learning_rate": 6.875751013914516e-06, + "loss": 0.431, "step": 6184 }, { - "epoch": 6.6040554962646745, - "grad_norm": 0.380859375, - "learning_rate": 2.4101914664289125e-06, - "loss": 0.4118, + "epoch": 6.33367451381781, + "grad_norm": 0.341796875, + "learning_rate": 6.79266651858329e-06, + "loss": 0.4705, "step": 6188 }, { - "epoch": 6.608324439701174, - "grad_norm": 0.361328125, - "learning_rate": 2.358635755316218e-06, - "loss": 0.4176, + "epoch": 6.337768679631525, + "grad_norm": 0.3671875, + "learning_rate": 6.710075432771606e-06, + "loss": 0.4316, "step": 6192 }, { - "epoch": 6.612593383137673, - "grad_norm": 0.34375, - "learning_rate": 2.307633059108044e-06, - "loss": 0.396, + "epoch": 6.341862845445241, + "grad_norm": 0.369140625, + "learning_rate": 6.627978041040488e-06, + "loss": 0.4479, "step": 6196 }, { - "epoch": 6.616862326574173, - "grad_norm": 0.349609375, - "learning_rate": 2.2571835688432737e-06, - "loss": 0.4517, + "epoch": 6.345957011258956, + "grad_norm": 0.345703125, + "learning_rate": 6.54637462624999e-06, + "loss": 0.3946, "step": 6200 }, { - "epoch": 6.621131270010673, - "grad_norm": 0.34375, - "learning_rate": 2.2072874734887835e-06, - "loss": 0.4267, + "epoch": 6.350051177072672, + "grad_norm": 0.34765625, + "learning_rate": 6.465265469558256e-06, + "loss": 0.4489, "step": 6204 }, { - "epoch": 6.625400213447172, - "grad_norm": 0.345703125, - "learning_rate": 2.1579449599385403e-06, - "loss": 0.3821, + "epoch": 6.354145342886387, + "grad_norm": 0.328125, + "learning_rate": 6.384650850420397e-06, + "loss": 0.4443, "step": 6208 }, { - "epoch": 6.629669156883671, - "grad_norm": 0.375, - "learning_rate": 2.1091562130130358e-06, - "loss": 0.444, + "epoch": 6.358239508700103, + "grad_norm": 0.35546875, + "learning_rate": 6.304531046587719e-06, + "loss": 0.4218, "step": 6212 }, { - "epoch": 6.6339381003201705, - "grad_norm": 0.345703125, - "learning_rate": 2.0609214154584717e-06, - "loss": 0.4546, + "epoch": 6.362333674513818, + "grad_norm": 0.34765625, + "learning_rate": 6.224906334106689e-06, + "loss": 0.4614, "step": 6216 }, { - "epoch": 6.638207043756671, - "grad_norm": 0.3359375, - "learning_rate": 2.0132407479462097e-06, - "loss": 0.4492, + "epoch": 6.366427840327534, + "grad_norm": 0.345703125, + "learning_rate": 6.145776987317891e-06, + "loss": 0.4615, "step": 6220 }, { - "epoch": 6.64247598719317, - "grad_norm": 0.345703125, - "learning_rate": 1.9661143890719877e-06, - "loss": 0.4657, + "epoch": 6.370522006141249, + "grad_norm": 0.35546875, + "learning_rate": 6.067143278855241e-06, + "loss": 0.4737, "step": 6224 }, { - "epoch": 6.646744930629669, - "grad_norm": 0.3359375, - "learning_rate": 1.9195425153553047e-06, - "loss": 0.4184, + "epoch": 6.3746161719549645, + "grad_norm": 0.341796875, + "learning_rate": 5.9890054796449875e-06, + "loss": 0.4364, "step": 6228 }, { - "epoch": 6.651013874066169, - "grad_norm": 0.33984375, - "learning_rate": 1.8735253012387207e-06, - "loss": 0.3755, + "epoch": 6.37871033776868, + "grad_norm": 0.349609375, + "learning_rate": 5.911363858904661e-06, + "loss": 0.4312, "step": 6232 }, { - "epoch": 6.655282817502668, - "grad_norm": 0.34375, - "learning_rate": 1.8280629190872586e-06, - "loss": 0.3719, + "epoch": 6.3828045035823955, + "grad_norm": 0.375, + "learning_rate": 5.834218684142344e-06, + "loss": 0.4062, "step": 6236 }, { - "epoch": 6.659551760939167, - "grad_norm": 0.369140625, - "learning_rate": 1.783155539187736e-06, - "loss": 0.4165, + "epoch": 6.3868986693961105, + "grad_norm": 0.3359375, + "learning_rate": 5.757570221155638e-06, + "loss": 0.4327, "step": 6240 }, { - "epoch": 6.6638207043756665, - "grad_norm": 0.365234375, - "learning_rate": 1.7388033297480509e-06, - "loss": 0.4176, + "epoch": 6.3909928352098255, + "grad_norm": 0.361328125, + "learning_rate": 5.6814187340307125e-06, + "loss": 0.4012, "step": 6244 }, { - "epoch": 6.668089647812167, - "grad_norm": 0.359375, - "learning_rate": 1.6950064568967137e-06, - "loss": 0.4079, + "epoch": 6.395087001023541, + "grad_norm": 0.349609375, + "learning_rate": 5.605764485141507e-06, + "loss": 0.3789, "step": 6248 }, { - "epoch": 6.672358591248666, - "grad_norm": 0.361328125, - "learning_rate": 1.6517650846820495e-06, - "loss": 0.3964, + "epoch": 6.399181166837257, + "grad_norm": 0.345703125, + "learning_rate": 5.530607735148762e-06, + "loss": 0.4342, "step": 6252 }, { - "epoch": 6.676627534685165, - "grad_norm": 0.345703125, - "learning_rate": 1.6090793750717145e-06, - "loss": 0.4027, + "epoch": 6.403275332650972, + "grad_norm": 0.37890625, + "learning_rate": 5.4559487429990615e-06, + "loss": 0.4502, "step": 6256 }, { - "epoch": 6.680896478121665, - "grad_norm": 0.369140625, - "learning_rate": 1.5669494879520128e-06, - "loss": 0.454, + "epoch": 6.407369498464687, + "grad_norm": 0.353515625, + "learning_rate": 5.381787765924056e-06, + "loss": 0.4161, "step": 6260 }, { - "epoch": 6.685165421558164, - "grad_norm": 0.353515625, - "learning_rate": 1.5253755811273638e-06, - "loss": 0.4305, + "epoch": 6.411463664278403, + "grad_norm": 0.35546875, + "learning_rate": 5.308125059439522e-06, + "loss": 0.4442, "step": 6264 }, { - "epoch": 6.689434364994664, - "grad_norm": 0.357421875, - "learning_rate": 1.4843578103195864e-06, - "loss": 0.4352, + "epoch": 6.415557830092118, + "grad_norm": 0.3359375, + "learning_rate": 5.234960877344491e-06, + "loss": 0.4584, "step": 6268 }, { - "epoch": 6.6937033084311635, - "grad_norm": 0.359375, - "learning_rate": 1.4438963291674988e-06, - "loss": 0.4269, + "epoch": 6.419651995905834, + "grad_norm": 0.357421875, + "learning_rate": 5.1622954717203514e-06, + "loss": 0.4413, "step": 6272 }, { - "epoch": 6.697972251867663, - "grad_norm": 0.357421875, - "learning_rate": 1.4039912892262195e-06, - "loss": 0.4617, + "epoch": 6.423746161719549, + "grad_norm": 0.3828125, + "learning_rate": 5.090129092929984e-06, + "loss": 0.396, "step": 6276 }, { - "epoch": 6.702241195304162, - "grad_norm": 0.341796875, - "learning_rate": 1.3646428399665676e-06, - "loss": 0.397, + "epoch": 6.427840327533265, + "grad_norm": 0.326171875, + "learning_rate": 5.018461989616928e-06, + "loss": 0.4253, "step": 6280 }, { - "epoch": 6.706510138740661, - "grad_norm": 0.31640625, - "learning_rate": 1.3258511287746299e-06, - "loss": 0.4148, + "epoch": 6.43193449334698, + "grad_norm": 0.34375, + "learning_rate": 4.947294408704533e-06, + "loss": 0.4244, "step": 6284 }, { - "epoch": 6.710779082177162, - "grad_norm": 0.34765625, - "learning_rate": 1.2876163009510943e-06, - "loss": 0.4369, + "epoch": 6.436028659160696, + "grad_norm": 0.337890625, + "learning_rate": 4.876626595395039e-06, + "loss": 0.434, "step": 6288 }, { - "epoch": 6.715048025613661, - "grad_norm": 0.34375, - "learning_rate": 1.2499384997107842e-06, - "loss": 0.3841, + "epoch": 6.440122824974411, + "grad_norm": 0.326171875, + "learning_rate": 4.806458793168799e-06, + "loss": 0.4057, "step": 6292 }, { - "epoch": 6.71931696905016, - "grad_norm": 0.337890625, - "learning_rate": 1.2128178661820586e-06, - "loss": 0.4223, + "epoch": 6.444216990788127, + "grad_norm": 0.34375, + "learning_rate": 4.736791243783427e-06, + "loss": 0.4292, "step": 6296 }, { - "epoch": 6.7235859124866595, - "grad_norm": 0.33984375, - "learning_rate": 1.176254539406346e-06, - "loss": 0.426, + "epoch": 6.448311156601842, + "grad_norm": 0.36328125, + "learning_rate": 4.667624187272917e-06, + "loss": 0.4297, "step": 6300 }, { - "epoch": 6.727854855923159, - "grad_norm": 0.3359375, - "learning_rate": 1.1402486563375613e-06, - "loss": 0.4266, + "epoch": 6.452405322415558, + "grad_norm": 0.349609375, + "learning_rate": 4.598957861946906e-06, + "loss": 0.4292, "step": 6304 }, { - "epoch": 6.732123799359658, - "grad_norm": 0.35546875, - "learning_rate": 1.1048003518416394e-06, - "loss": 0.4314, + "epoch": 6.456499488229273, + "grad_norm": 0.349609375, + "learning_rate": 4.53079250438978e-06, + "loss": 0.4307, "step": 6308 }, { - "epoch": 6.736392742796158, - "grad_norm": 0.353515625, - "learning_rate": 1.0699097586960192e-06, - "loss": 0.4188, + "epoch": 6.460593654042989, + "grad_norm": 0.35546875, + "learning_rate": 4.463128349459855e-06, + "loss": 0.4114, "step": 6312 }, { - "epoch": 6.740661686232658, - "grad_norm": 0.33984375, - "learning_rate": 1.0355770075891112e-06, - "loss": 0.3995, + "epoch": 6.464687819856704, + "grad_norm": 0.390625, + "learning_rate": 4.395965630288628e-06, + "loss": 0.4695, "step": 6316 }, { - "epoch": 6.744930629669157, - "grad_norm": 0.37890625, - "learning_rate": 1.0018022271198634e-06, - "loss": 0.4489, + "epoch": 6.46878198567042, + "grad_norm": 0.353515625, + "learning_rate": 4.32930457827993e-06, + "loss": 0.4417, "step": 6320 }, { - "epoch": 6.749199573105656, - "grad_norm": 0.3359375, - "learning_rate": 9.685855437972457e-07, - "loss": 0.4017, + "epoch": 6.472876151484135, + "grad_norm": 0.357421875, + "learning_rate": 4.263145423109121e-06, + "loss": 0.4577, "step": 6324 }, { - "epoch": 6.7534685165421555, - "grad_norm": 0.36328125, - "learning_rate": 9.359270820398002e-07, - "loss": 0.4647, + "epoch": 6.476970317297851, + "grad_norm": 0.333984375, + "learning_rate": 4.197488392722348e-06, + "loss": 0.3964, "step": 6328 }, { - "epoch": 6.757737459978655, - "grad_norm": 0.380859375, - "learning_rate": 9.03826964175125e-07, - "loss": 0.4367, + "epoch": 6.481064483111566, + "grad_norm": 0.322265625, + "learning_rate": 4.132333713335689e-06, + "loss": 0.4551, "step": 6332 }, { - "epoch": 6.762006403415155, - "grad_norm": 0.34765625, - "learning_rate": 8.722853104394578e-07, - "loss": 0.4079, + "epoch": 6.485158648925282, + "grad_norm": 0.388671875, + "learning_rate": 4.067681609434426e-06, + "loss": 0.432, "step": 6336 }, { - "epoch": 6.766275346851654, - "grad_norm": 0.333984375, - "learning_rate": 8.413022389772595e-07, - "loss": 0.3718, + "epoch": 6.489252814738997, + "grad_norm": 0.35546875, + "learning_rate": 4.003532303772256e-06, + "loss": 0.4323, "step": 6340 }, { - "epoch": 6.770544290288154, - "grad_norm": 0.349609375, - "learning_rate": 8.108778658406645e-07, - "loss": 0.4257, + "epoch": 6.493346980552713, + "grad_norm": 0.35546875, + "learning_rate": 3.939886017370564e-06, + "loss": 0.3916, "step": 6344 }, { - "epoch": 6.774813233724653, - "grad_norm": 0.341796875, - "learning_rate": 7.810123049891648e-07, - "loss": 0.407, + "epoch": 6.497441146366428, + "grad_norm": 0.396484375, + "learning_rate": 3.876742969517538e-06, + "loss": 0.4087, "step": 6348 }, { - "epoch": 6.779082177161152, - "grad_norm": 0.353515625, - "learning_rate": 7.517056682891099e-07, - "loss": 0.4074, + "epoch": 6.501535312180144, + "grad_norm": 0.359375, + "learning_rate": 3.8141033777675854e-06, + "loss": 0.4148, "step": 6352 }, { - "epoch": 6.783351120597652, - "grad_norm": 0.357421875, - "learning_rate": 7.229580655132905e-07, - "loss": 0.4029, + "epoch": 6.505629477993859, + "grad_norm": 0.341796875, + "learning_rate": 3.751967457940436e-06, + "loss": 0.4203, "step": 6356 }, { - "epoch": 6.787620064034152, + "epoch": 6.5097236438075745, "grad_norm": 0.357421875, - "learning_rate": 6.947696043405726e-07, - "loss": 0.418, + "learning_rate": 3.6903354241204886e-06, + "loss": 0.4405, "step": 6360 }, { - "epoch": 6.791889007470651, + "epoch": 6.5138178096212895, "grad_norm": 0.353515625, - "learning_rate": 6.671403903554473e-07, - "loss": 0.4573, + "learning_rate": 3.6292074886559995e-06, + "loss": 0.4195, "step": 6364 }, { - "epoch": 6.79615795090715, - "grad_norm": 0.33203125, - "learning_rate": 6.400705270476481e-07, - "loss": 0.3894, + "epoch": 6.5179119754350054, + "grad_norm": 0.361328125, + "learning_rate": 3.5685838621584804e-06, + "loss": 0.4599, "step": 6368 }, { - "epoch": 6.80042689434365, - "grad_norm": 0.3515625, - "learning_rate": 6.135601158118175e-07, - "loss": 0.446, + "epoch": 6.5220061412487205, + "grad_norm": 0.34765625, + "learning_rate": 3.5084647535017996e-06, + "loss": 0.4015, "step": 6372 }, { - "epoch": 6.80469583778015, - "grad_norm": 0.3828125, - "learning_rate": 5.87609255947008e-07, - "loss": 0.4481, + "epoch": 6.526100307062436, + "grad_norm": 0.349609375, + "learning_rate": 3.448850369821565e-06, + "loss": 0.3875, "step": 6376 }, { - "epoch": 6.808964781216649, + "epoch": 6.530194472876151, "grad_norm": 0.36328125, - "learning_rate": 5.622180446564151e-07, - "loss": 0.4419, + "learning_rate": 3.389740916514461e-06, + "loss": 0.4491, "step": 6380 }, { - "epoch": 6.8132337246531485, - "grad_norm": 0.36328125, - "learning_rate": 5.373865770469943e-07, - "loss": 0.4108, + "epoch": 6.534288638689867, + "grad_norm": 0.34765625, + "learning_rate": 3.331136597237377e-06, + "loss": 0.4337, "step": 6384 }, { - "epoch": 6.817502668089648, - "grad_norm": 0.341796875, - "learning_rate": 5.131149461290784e-07, - "loss": 0.4523, + "epoch": 6.538382804503582, + "grad_norm": 0.357421875, + "learning_rate": 3.2730376139068816e-06, + "loss": 0.4579, "step": 6388 }, { - "epoch": 6.821771611526147, - "grad_norm": 0.365234375, - "learning_rate": 4.89403242816011e-07, - "loss": 0.4355, + "epoch": 6.542476970317297, + "grad_norm": 0.359375, + "learning_rate": 3.2154441666984164e-06, + "loss": 0.3983, "step": 6392 }, { - "epoch": 6.826040554962646, - "grad_norm": 0.341796875, - "learning_rate": 4.662515559238633e-07, - "loss": 0.4443, + "epoch": 6.546571136131013, + "grad_norm": 0.34375, + "learning_rate": 3.158356454045602e-06, + "loss": 0.4155, "step": 6396 }, { - "epoch": 6.830309498399147, - "grad_norm": 0.36328125, - "learning_rate": 4.436599721710843e-07, - "loss": 0.4419, + "epoch": 6.550665301944729, + "grad_norm": 0.349609375, + "learning_rate": 3.101774672639684e-06, + "loss": 0.4145, "step": 6400 }, { - "epoch": 6.834578441835646, - "grad_norm": 0.35546875, - "learning_rate": 4.2162857617815127e-07, - "loss": 0.429, + "epoch": 6.554759467758444, + "grad_norm": 0.33984375, + "learning_rate": 3.045699017428671e-06, + "loss": 0.4317, "step": 6404 }, { - "epoch": 6.838847385272145, - "grad_norm": 0.376953125, - "learning_rate": 4.0015745046725336e-07, - "loss": 0.4691, + "epoch": 6.558853633572159, + "grad_norm": 0.369140625, + "learning_rate": 2.990129681616782e-06, + "loss": 0.4389, "step": 6408 }, { - "epoch": 6.8431163287086445, - "grad_norm": 0.34765625, - "learning_rate": 3.7924667546200826e-07, - "loss": 0.4274, + "epoch": 6.562947799385875, + "grad_norm": 0.34375, + "learning_rate": 2.9350668566637982e-06, + "loss": 0.4404, "step": 6412 }, { - "epoch": 6.847385272145144, - "grad_norm": 0.359375, - "learning_rate": 3.5889632948716273e-07, - "loss": 0.4345, + "epoch": 6.567041965199591, + "grad_norm": 0.333984375, + "learning_rate": 2.880510732284297e-06, + "loss": 0.4248, "step": 6416 }, { - "epoch": 6.851654215581643, - "grad_norm": 0.357421875, - "learning_rate": 3.391064887682426e-07, - "loss": 0.4309, + "epoch": 6.571136131013306, + "grad_norm": 0.365234375, + "learning_rate": 2.8264614964470856e-06, + "loss": 0.4753, "step": 6420 }, { - "epoch": 6.855923159018143, - "grad_norm": 0.333984375, - "learning_rate": 3.1987722743135325e-07, - "loss": 0.4297, + "epoch": 6.575230296827021, + "grad_norm": 0.359375, + "learning_rate": 2.7729193353745017e-06, + "loss": 0.4326, "step": 6424 }, { - "epoch": 6.860192102454643, - "grad_norm": 0.365234375, - "learning_rate": 3.012086175028461e-07, - "loss": 0.4073, + "epoch": 6.579324462640737, + "grad_norm": 0.35546875, + "learning_rate": 2.7198844335418637e-06, + "loss": 0.4478, "step": 6428 }, { - "epoch": 6.864461045891142, - "grad_norm": 0.326171875, - "learning_rate": 2.831007289090526e-07, - "loss": 0.4363, + "epoch": 6.583418628454452, + "grad_norm": 0.345703125, + "learning_rate": 2.6673569736766876e-06, + "loss": 0.4379, "step": 6432 }, { - "epoch": 6.868729989327641, - "grad_norm": 0.38671875, - "learning_rate": 2.6555362947601746e-07, - "loss": 0.4251, + "epoch": 6.587512794268168, + "grad_norm": 0.36328125, + "learning_rate": 2.615337136758172e-06, + "loss": 0.4266, "step": 6436 }, { - "epoch": 6.8729989327641405, - "grad_norm": 0.333984375, - "learning_rate": 2.4856738492931573e-07, - "loss": 0.4395, + "epoch": 6.591606960081883, + "grad_norm": 0.3515625, + "learning_rate": 2.5638251020165812e-06, + "loss": 0.3951, "step": 6440 }, { - "epoch": 6.877267876200641, - "grad_norm": 0.33203125, - "learning_rate": 2.3214205889366954e-07, - "loss": 0.3945, + "epoch": 6.595701125895599, + "grad_norm": 0.34375, + "learning_rate": 2.512821046932495e-06, + "loss": 0.4134, "step": 6444 }, { - "epoch": 6.88153681963714, - "grad_norm": 0.349609375, - "learning_rate": 2.162777128928317e-07, - "loss": 0.3737, + "epoch": 6.599795291709314, + "grad_norm": 0.33984375, + "learning_rate": 2.4623251472363937e-06, + "loss": 0.4396, "step": 6448 }, { - "epoch": 6.885805763073639, - "grad_norm": 0.361328125, - "learning_rate": 2.0097440634931906e-07, - "loss": 0.3549, + "epoch": 6.60388945752303, + "grad_norm": 0.359375, + "learning_rate": 2.412337576907858e-06, + "loss": 0.4173, "step": 6452 }, { - "epoch": 6.890074706510139, - "grad_norm": 0.357421875, - "learning_rate": 1.8623219658417953e-07, - "loss": 0.4497, + "epoch": 6.607983623336745, + "grad_norm": 0.36328125, + "learning_rate": 2.362858508175136e-06, + "loss": 0.4338, "step": 6456 }, { - "epoch": 6.894343649946638, + "epoch": 6.612077789150461, "grad_norm": 0.361328125, - "learning_rate": 1.7205113881674226e-07, - "loss": 0.4447, + "learning_rate": 2.313888111514395e-06, + "loss": 0.4044, "step": 6460 }, { - "epoch": 6.898612593383138, - "grad_norm": 0.375, - "learning_rate": 1.5843128616451762e-07, - "loss": 0.4265, + "epoch": 6.616171954964176, + "grad_norm": 0.34765625, + "learning_rate": 2.2654265556493022e-06, + "loss": 0.4584, "step": 6464 }, { - "epoch": 6.9028815368196375, - "grad_norm": 0.37109375, - "learning_rate": 1.4537268964288085e-07, - "loss": 0.4346, + "epoch": 6.620266120777892, + "grad_norm": 0.333984375, + "learning_rate": 2.2174740075502794e-06, + "loss": 0.4268, "step": 6468 }, { - "epoch": 6.907150480256137, - "grad_norm": 0.34765625, - "learning_rate": 1.328753981649222e-07, - "loss": 0.4533, + "epoch": 6.624360286591607, + "grad_norm": 0.34375, + "learning_rate": 2.1700306324340334e-06, + "loss": 0.4085, "step": 6472 }, { - "epoch": 6.911419423692636, - "grad_norm": 0.3671875, - "learning_rate": 1.2093945854133035e-07, - "loss": 0.404, + "epoch": 6.628454452405323, + "grad_norm": 0.361328125, + "learning_rate": 2.123096593762974e-06, + "loss": 0.4266, "step": 6476 }, { - "epoch": 6.915688367129135, - "grad_norm": 0.3359375, - "learning_rate": 1.0956491548009261e-07, - "loss": 0.4195, + "epoch": 6.632548618219038, + "grad_norm": 0.345703125, + "learning_rate": 2.076672053244599e-06, + "loss": 0.4212, "step": 6480 }, { - "epoch": 6.919957310565635, - "grad_norm": 0.34765625, - "learning_rate": 9.87518115864283e-08, - "loss": 0.3697, + "epoch": 6.6366427840327535, + "grad_norm": 0.337890625, + "learning_rate": 2.03075717083096e-06, + "loss": 0.4263, "step": 6484 }, { - "epoch": 6.924226254002134, - "grad_norm": 0.3671875, - "learning_rate": 8.850018736260567e-08, - "loss": 0.4148, + "epoch": 6.640736949846469, + "grad_norm": 0.353515625, + "learning_rate": 1.9853521047181963e-06, + "loss": 0.4296, "step": 6488 }, { - "epoch": 6.928495197438634, - "grad_norm": 0.3671875, - "learning_rate": 7.88100812077419e-08, - "loss": 0.3985, + "epoch": 6.6448311156601845, + "grad_norm": 0.33984375, + "learning_rate": 1.9404570113458197e-06, + "loss": 0.3664, "step": 6492 }, { - "epoch": 6.9327641408751335, - "grad_norm": 0.33984375, - "learning_rate": 6.968152941771999e-08, - "loss": 0.4048, + "epoch": 6.6489252814738995, + "grad_norm": 0.3359375, + "learning_rate": 1.8960720453963295e-06, + "loss": 0.4485, "step": 6496 }, { - "epoch": 6.937033084311633, - "grad_norm": 0.34765625, - "learning_rate": 6.111456618503874e-08, - "loss": 0.4433, + "epoch": 6.653019447287615, + "grad_norm": 0.3828125, + "learning_rate": 1.8521973597946326e-06, + "loss": 0.4253, "step": 6500 } ], "logging_steps": 4, - "max_steps": 6559, + "max_steps": 6839, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, @@ -11401,7 +11401,7 @@ "attributes": {} } }, - "total_flos": 4.2506587099166147e+18, + "total_flos": 4.253726566678266e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null