diff --git "a/checkpoint-4000/trainer_state.json" "b/checkpoint-4000/trainer_state.json" --- "a/checkpoint-4000/trainer_state.json" +++ "b/checkpoint-4000/trainer_state.json" @@ -1,7 +1,7 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 7.648183556405353, + "epoch": 6.4412238325281805, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, @@ -9,7010 +9,7010 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0076481835564053535, - "grad_norm": 0.294921875, - "learning_rate": 2.8571428571428567e-05, - "loss": 1.3014, + "epoch": 0.00644122383252818, + "grad_norm": 0.5078125, + "learning_rate": 2.727272727272727e-05, + "loss": 1.3628, "step": 4 }, { - "epoch": 0.015296367112810707, - "grad_norm": 0.26953125, - "learning_rate": 5.7142857142857135e-05, - "loss": 1.3157, + "epoch": 0.01288244766505636, + "grad_norm": 0.3203125, + "learning_rate": 5.454545454545454e-05, + "loss": 1.3272, "step": 8 }, { - "epoch": 0.022944550669216062, - "grad_norm": 0.201171875, - "learning_rate": 8.57142857142857e-05, - "loss": 1.2369, + "epoch": 0.01932367149758454, + "grad_norm": 0.375, + "learning_rate": 8.18181818181818e-05, + "loss": 1.2626, "step": 12 }, { - "epoch": 0.030592734225621414, - "grad_norm": 0.255859375, - "learning_rate": 0.00011428571428571427, - "loss": 1.207, + "epoch": 0.02576489533011272, + "grad_norm": 0.2353515625, + "learning_rate": 0.00010909090909090908, + "loss": 1.2028, "step": 16 }, { - "epoch": 0.03824091778202677, - "grad_norm": 0.177734375, - "learning_rate": 0.00014285714285714284, - "loss": 1.1666, + "epoch": 0.0322061191626409, + "grad_norm": 0.189453125, + "learning_rate": 0.00013636363636363634, + "loss": 1.1822, "step": 20 }, { - "epoch": 0.045889101338432124, - "grad_norm": 0.1806640625, - "learning_rate": 0.0001714285714285714, - "loss": 1.178, + "epoch": 0.03864734299516908, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001636363636363636, + "loss": 1.2029, "step": 24 }, { - "epoch": 0.05353728489483748, - "grad_norm": 0.1552734375, - "learning_rate": 0.00019999999999999998, - "loss": 1.1533, + "epoch": 0.04508856682769726, + "grad_norm": 0.236328125, + "learning_rate": 0.0001909090909090909, + "loss": 1.1609, "step": 28 }, { - "epoch": 0.06118546845124283, - "grad_norm": 0.1787109375, - "learning_rate": 0.00022857142857142854, - "loss": 1.1508, + "epoch": 0.05152979066022544, + "grad_norm": 0.255859375, + "learning_rate": 0.00021818181818181816, + "loss": 1.1137, "step": 32 }, { - "epoch": 0.06883365200764818, - "grad_norm": 0.1767578125, - "learning_rate": 0.0002571428571428571, - "loss": 1.1415, + "epoch": 0.057971014492753624, + "grad_norm": 0.2431640625, + "learning_rate": 0.00024545454545454545, + "loss": 1.085, "step": 36 }, { - "epoch": 0.07648183556405354, - "grad_norm": 0.1865234375, - "learning_rate": 0.0002857142857142857, - "loss": 1.1255, + "epoch": 0.0644122383252818, + "grad_norm": 0.23828125, + "learning_rate": 0.0002727272727272727, + "loss": 1.1052, "step": 40 }, { - "epoch": 0.0841300191204589, - "grad_norm": 0.17578125, - "learning_rate": 0.0002999998274159216, - "loss": 1.0581, + "epoch": 0.07085346215780998, + "grad_norm": 0.240234375, + "learning_rate": 0.0003, + "loss": 1.0712, "step": 44 }, { - "epoch": 0.09177820267686425, - "grad_norm": 0.197265625, - "learning_rate": 0.00029999844674567734, - "loss": 1.0987, + "epoch": 0.07729468599033816, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029999936035650057, + "loss": 1.0588, "step": 48 }, { - "epoch": 0.0994263862332696, - "grad_norm": 0.185546875, - "learning_rate": 0.0002999956854178972, - "loss": 1.089, + "epoch": 0.08373590982286634, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002999974414314574, + "loss": 1.0531, "step": 52 }, { - "epoch": 0.10707456978967496, - "grad_norm": 0.197265625, - "learning_rate": 0.00029999154345799773, - "loss": 1.0934, + "epoch": 0.09017713365539452, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029999424324123633, + "loss": 0.9953, "step": 56 }, { - "epoch": 0.1147227533460803, - "grad_norm": 0.18359375, - "learning_rate": 0.0002999860209041035, - "loss": 1.0712, + "epoch": 0.0966183574879227, + "grad_norm": 0.25, + "learning_rate": 0.0002999897658131134, + "loss": 0.9887, "step": 60 }, { - "epoch": 0.12237093690248566, - "grad_norm": 0.19140625, - "learning_rate": 0.00029997911780704675, - "loss": 1.0253, + "epoch": 0.10305958132045089, + "grad_norm": 0.236328125, + "learning_rate": 0.0002999840091852746, + "loss": 0.9945, "step": 64 }, { - "epoch": 0.13001912045889102, - "grad_norm": 0.1845703125, - "learning_rate": 0.00029997083423036696, - "loss": 1.0526, + "epoch": 0.10950080515297907, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029997697340681585, + "loss": 0.9306, "step": 68 }, { - "epoch": 0.13766730401529637, - "grad_norm": 0.2080078125, - "learning_rate": 0.00029996117025031, - "loss": 1.0746, + "epoch": 0.11594202898550725, + "grad_norm": 0.25390625, + "learning_rate": 0.00029996865853774236, + "loss": 0.9458, "step": 72 }, { - "epoch": 0.14531548757170173, - "grad_norm": 0.193359375, - "learning_rate": 0.00029995012595582796, - "loss": 1.0502, + "epoch": 0.12238325281803543, + "grad_norm": 0.25390625, + "learning_rate": 0.00029995906464896807, + "loss": 0.9487, "step": 76 }, { - "epoch": 0.15296367112810708, - "grad_norm": 0.1806640625, - "learning_rate": 0.0002999377014485777, - "loss": 1.0461, + "epoch": 0.1288244766505636, + "grad_norm": 0.26171875, + "learning_rate": 0.0002999481918223153, + "loss": 0.9144, "step": 80 }, { - "epoch": 0.16061185468451242, - "grad_norm": 0.1923828125, - "learning_rate": 0.00029992389684292025, - "loss": 1.0223, + "epoch": 0.13526570048309178, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002999360401505139, + "loss": 0.9289, "step": 84 }, { - "epoch": 0.1682600382409178, - "grad_norm": 0.2060546875, - "learning_rate": 0.00029990871226591995, - "loss": 1.0218, + "epoch": 0.14170692431561996, + "grad_norm": 0.265625, + "learning_rate": 0.00029992260973720023, + "loss": 0.882, "step": 88 }, { - "epoch": 0.17590822179732313, - "grad_norm": 0.2041015625, - "learning_rate": 0.00029989214785734286, - "loss": 1.0192, + "epoch": 0.14814814814814814, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029990790069691665, + "loss": 0.9031, "step": 92 }, { - "epoch": 0.1835564053537285, - "grad_norm": 0.2109375, - "learning_rate": 0.00029987420376965577, - "loss": 1.0243, + "epoch": 0.15458937198067632, + "grad_norm": 0.255859375, + "learning_rate": 0.00029989191315511055, + "loss": 0.9127, "step": 96 }, { - "epoch": 0.19120458891013384, - "grad_norm": 0.205078125, - "learning_rate": 0.00029985488016802457, - "loss": 1.0202, + "epoch": 0.1610305958132045, + "grad_norm": 0.341796875, + "learning_rate": 0.0002998746472481328, + "loss": 0.8803, "step": 100 }, { - "epoch": 0.1988527724665392, - "grad_norm": 0.203125, - "learning_rate": 0.00029983417723031307, - "loss": 0.9904, + "epoch": 0.16747181964573268, + "grad_norm": 0.263671875, + "learning_rate": 0.0002998561031232371, + "loss": 0.8764, "step": 104 }, { - "epoch": 0.20650095602294455, - "grad_norm": 0.2236328125, - "learning_rate": 0.0002998120951470809, - "loss": 1.0228, + "epoch": 0.17391304347826086, + "grad_norm": 0.263671875, + "learning_rate": 0.00029983628093857855, + "loss": 0.9189, "step": 108 }, { - "epoch": 0.21414913957934992, - "grad_norm": 0.23828125, - "learning_rate": 0.00029978863412158217, - "loss": 1.004, + "epoch": 0.18035426731078905, + "grad_norm": 0.302734375, + "learning_rate": 0.00029981518086321225, + "loss": 0.8169, "step": 112 }, { - "epoch": 0.22179732313575526, - "grad_norm": 0.232421875, - "learning_rate": 0.0002997637943697635, - "loss": 0.9903, + "epoch": 0.18679549114331723, + "grad_norm": 0.275390625, + "learning_rate": 0.00029979280307709176, + "loss": 0.8672, "step": 116 }, { - "epoch": 0.2294455066921606, - "grad_norm": 0.2109375, - "learning_rate": 0.00029973757612026164, - "loss": 0.969, + "epoch": 0.1932367149758454, + "grad_norm": 0.294921875, + "learning_rate": 0.0002997691477710679, + "loss": 0.8387, "step": 120 }, { - "epoch": 0.23709369024856597, - "grad_norm": 0.232421875, - "learning_rate": 0.00029970997961440213, - "loss": 0.9869, + "epoch": 0.1996779388083736, + "grad_norm": 0.279296875, + "learning_rate": 0.0002997442151468869, + "loss": 0.8039, "step": 124 }, { - "epoch": 0.2447418738049713, - "grad_norm": 0.263671875, - "learning_rate": 0.0002996810051061963, - "loss": 0.9758, + "epoch": 0.20611916264090177, + "grad_norm": 0.267578125, + "learning_rate": 0.00029971800541718854, + "loss": 0.8294, "step": 128 }, { - "epoch": 0.25239005736137665, - "grad_norm": 0.2080078125, - "learning_rate": 0.00029965065286233943, - "loss": 0.9615, + "epoch": 0.21256038647342995, + "grad_norm": 0.291015625, + "learning_rate": 0.0002996905188055046, + "loss": 0.8228, "step": 132 }, { - "epoch": 0.26003824091778205, - "grad_norm": 0.224609375, - "learning_rate": 0.00029961892316220817, - "loss": 0.9541, + "epoch": 0.21900161030595813, + "grad_norm": 0.267578125, + "learning_rate": 0.00029966175554625696, + "loss": 0.8341, "step": 136 }, { - "epoch": 0.2676864244741874, - "grad_norm": 0.21484375, - "learning_rate": 0.0002995858162978577, - "loss": 0.9455, + "epoch": 0.22544283413848631, + "grad_norm": 0.259765625, + "learning_rate": 0.00029963171588475525, + "loss": 0.8095, "step": 140 }, { - "epoch": 0.27533460803059273, - "grad_norm": 0.2236328125, - "learning_rate": 0.0002995513325740197, - "loss": 0.9566, + "epoch": 0.2318840579710145, + "grad_norm": 0.294921875, + "learning_rate": 0.0002996004000771952, + "loss": 0.8285, "step": 144 }, { - "epoch": 0.2829827915869981, - "grad_norm": 0.25390625, - "learning_rate": 0.00029951547230809865, - "loss": 0.9405, + "epoch": 0.23832528180354268, + "grad_norm": 0.265625, + "learning_rate": 0.00029956780839065616, + "loss": 0.8123, "step": 148 }, { - "epoch": 0.29063097514340347, - "grad_norm": 0.2265625, - "learning_rate": 0.00029947823583016973, - "loss": 0.9119, + "epoch": 0.24476650563607086, + "grad_norm": 0.248046875, + "learning_rate": 0.00029953394110309887, + "loss": 0.7612, "step": 152 }, { - "epoch": 0.2982791586998088, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029943962348297535, - "loss": 0.9507, + "epoch": 0.25120772946859904, + "grad_norm": 0.30078125, + "learning_rate": 0.0002994987985033633, + "loss": 0.7723, "step": 156 }, { - "epoch": 0.30592734225621415, - "grad_norm": 0.21484375, - "learning_rate": 0.00029939963562192196, - "loss": 0.9507, + "epoch": 0.2576489533011272, + "grad_norm": 0.27734375, + "learning_rate": 0.0002994623808911659, + "loss": 0.8202, "step": 160 }, { - "epoch": 0.3135755258126195, - "grad_norm": 0.234375, - "learning_rate": 0.000299358272615077, - "loss": 0.9815, + "epoch": 0.2640901771336554, + "grad_norm": 0.326171875, + "learning_rate": 0.00029942468857709715, + "loss": 0.7324, "step": 164 }, { - "epoch": 0.32122370936902483, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002993155348431654, - "loss": 0.9364, + "epoch": 0.27053140096618356, + "grad_norm": 0.255859375, + "learning_rate": 0.000299385721882619, + "loss": 0.7818, "step": 168 }, { - "epoch": 0.32887189292543023, - "grad_norm": 0.21875, - "learning_rate": 0.0002992714226995661, - "loss": 0.9177, + "epoch": 0.27697262479871176, + "grad_norm": 0.298828125, + "learning_rate": 0.000299345481140062, + "loss": 0.7693, "step": 172 }, { - "epoch": 0.3365200764818356, - "grad_norm": 0.21875, - "learning_rate": 0.00029922593659030837, - "loss": 0.9224, + "epoch": 0.2834138486312399, + "grad_norm": 0.27734375, + "learning_rate": 0.00029930396669262255, + "loss": 0.7481, "step": 176 }, { - "epoch": 0.3441682600382409, - "grad_norm": 0.21875, - "learning_rate": 0.00029917907693406817, - "loss": 0.9359, + "epoch": 0.2898550724637681, + "grad_norm": 0.2890625, + "learning_rate": 0.00029926117889435993, + "loss": 0.7478, "step": 180 }, { - "epoch": 0.35181644359464626, - "grad_norm": 0.2265625, - "learning_rate": 0.00029913084416216415, - "loss": 0.9349, + "epoch": 0.2962962962962963, + "grad_norm": 0.302734375, + "learning_rate": 0.00029921711811019334, + "loss": 0.7581, "step": 184 }, { - "epoch": 0.35946462715105165, - "grad_norm": 0.228515625, - "learning_rate": 0.00029908123871855396, - "loss": 0.9033, + "epoch": 0.3027375201288245, + "grad_norm": 0.30859375, + "learning_rate": 0.00029917178471589864, + "loss": 0.7131, "step": 188 }, { - "epoch": 0.367112810707457, - "grad_norm": 0.2294921875, - "learning_rate": 0.0002990302610598297, - "loss": 0.9007, + "epoch": 0.30917874396135264, + "grad_norm": 0.28125, + "learning_rate": 0.0002991251790981053, + "loss": 0.7121, "step": 192 }, { - "epoch": 0.37476099426386233, - "grad_norm": 0.271484375, - "learning_rate": 0.00029897791165521434, - "loss": 0.9243, + "epoch": 0.31561996779388085, + "grad_norm": 0.28125, + "learning_rate": 0.0002990773016542932, + "loss": 0.7385, "step": 196 }, { - "epoch": 0.3824091778202677, - "grad_norm": 0.244140625, - "learning_rate": 0.0002989241909865567, - "loss": 0.9095, + "epoch": 0.322061191626409, + "grad_norm": 0.345703125, + "learning_rate": 0.00029902815279278874, + "loss": 0.743, "step": 200 }, { - "epoch": 0.390057361376673, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002988690995483276, - "loss": 0.8825, + "epoch": 0.3285024154589372, + "grad_norm": 0.28515625, + "learning_rate": 0.00029897773293276214, + "loss": 0.6984, "step": 204 }, { - "epoch": 0.3977055449330784, - "grad_norm": 0.232421875, - "learning_rate": 0.00029881263784761503, - "loss": 0.8929, + "epoch": 0.33494363929146537, + "grad_norm": 0.2890625, + "learning_rate": 0.000298926042504223, + "loss": 0.7278, "step": 208 }, { - "epoch": 0.40535372848948376, - "grad_norm": 0.2353515625, - "learning_rate": 0.00029875480640411957, - "loss": 0.9097, + "epoch": 0.3413848631239936, + "grad_norm": 0.271484375, + "learning_rate": 0.00029887308194801745, + "loss": 0.7043, "step": 212 }, { - "epoch": 0.4130019120458891, - "grad_norm": 0.2392578125, - "learning_rate": 0.00029869560575014945, - "loss": 0.8563, + "epoch": 0.34782608695652173, + "grad_norm": 0.263671875, + "learning_rate": 0.00029881885171582364, + "loss": 0.7455, "step": 216 }, { - "epoch": 0.42065009560229444, - "grad_norm": 0.2041015625, - "learning_rate": 0.00029863503643061585, - "loss": 0.8839, + "epoch": 0.35426731078904994, + "grad_norm": 0.28125, + "learning_rate": 0.0002987633522701486, + "loss": 0.7314, "step": 220 }, { - "epoch": 0.42829827915869984, - "grad_norm": 0.26171875, - "learning_rate": 0.0002985730990030278, - "loss": 0.8635, + "epoch": 0.3607085346215781, + "grad_norm": 0.28125, + "learning_rate": 0.00029870658408432375, + "loss": 0.7344, "step": 224 }, { - "epoch": 0.4359464627151052, - "grad_norm": 0.2451171875, - "learning_rate": 0.00029850979403748705, - "loss": 0.859, + "epoch": 0.3671497584541063, + "grad_norm": 0.2734375, + "learning_rate": 0.0002986485476425011, + "loss": 0.7324, "step": 228 }, { - "epoch": 0.4435946462715105, - "grad_norm": 0.2314453125, - "learning_rate": 0.00029844512211668286, - "loss": 0.8256, + "epoch": 0.37359098228663445, + "grad_norm": 0.28125, + "learning_rate": 0.0002985892434396491, + "loss": 0.7197, "step": 232 }, { - "epoch": 0.45124282982791586, - "grad_norm": 0.21484375, - "learning_rate": 0.00029837908383588646, - "loss": 0.8282, + "epoch": 0.38003220611916266, + "grad_norm": 0.275390625, + "learning_rate": 0.00029852867198154837, + "loss": 0.6616, "step": 236 }, { - "epoch": 0.4588910133843212, - "grad_norm": 0.2373046875, - "learning_rate": 0.0002983116798029459, - "loss": 0.8579, + "epoch": 0.3864734299516908, + "grad_norm": 0.267578125, + "learning_rate": 0.0002984668337847874, + "loss": 0.6325, "step": 240 }, { - "epoch": 0.4665391969407266, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002982429106382801, - "loss": 0.8805, + "epoch": 0.392914653784219, + "grad_norm": 0.28125, + "learning_rate": 0.0002984037293767583, + "loss": 0.6445, "step": 244 }, { - "epoch": 0.47418738049713194, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029817277697487347, - "loss": 0.823, + "epoch": 0.3993558776167472, + "grad_norm": 0.2734375, + "learning_rate": 0.00029833935929565194, + "loss": 0.6846, "step": 248 }, { - "epoch": 0.4818355640535373, - "grad_norm": 0.244140625, - "learning_rate": 0.0002981012794582698, - "loss": 0.8546, + "epoch": 0.4057971014492754, + "grad_norm": 0.26953125, + "learning_rate": 0.00029827372409045377, + "loss": 0.6976, "step": 252 }, { - "epoch": 0.4894837476099426, - "grad_norm": 0.2421875, - "learning_rate": 0.0002980284187465665, - "loss": 0.8533, + "epoch": 0.41223832528180354, + "grad_norm": 0.306640625, + "learning_rate": 0.0002982068243209389, + "loss": 0.7165, "step": 256 }, { - "epoch": 0.497131931166348, - "grad_norm": 0.248046875, - "learning_rate": 0.00029795419551040833, - "loss": 0.8111, + "epoch": 0.41867954911433175, + "grad_norm": 0.275390625, + "learning_rate": 0.00029813866055766736, + "loss": 0.6647, "step": 260 }, { - "epoch": 0.5047801147227533, - "grad_norm": 0.251953125, - "learning_rate": 0.0002978786104329816, - "loss": 0.8823, + "epoch": 0.4251207729468599, + "grad_norm": 0.283203125, + "learning_rate": 0.00029806923338197925, + "loss": 0.6809, "step": 264 }, { - "epoch": 0.5124282982791587, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002978016642100076, - "loss": 0.8839, + "epoch": 0.43156199677938806, + "grad_norm": 0.267578125, + "learning_rate": 0.00029799854338598974, + "loss": 0.7285, "step": 268 }, { - "epoch": 0.5200764818355641, - "grad_norm": 0.2353515625, - "learning_rate": 0.00029772335754973614, - "loss": 0.8512, + "epoch": 0.43800322061191627, + "grad_norm": 0.32421875, + "learning_rate": 0.0002979265911725842, + "loss": 0.6978, "step": 272 }, { - "epoch": 0.5277246653919694, - "grad_norm": 0.2392578125, - "learning_rate": 0.00029764369117293925, - "loss": 0.8557, + "epoch": 0.4444444444444444, + "grad_norm": 0.267578125, + "learning_rate": 0.00029785337735541276, + "loss": 0.6598, "step": 276 }, { - "epoch": 0.5353728489483748, - "grad_norm": 0.25390625, - "learning_rate": 0.0002975626658129044, - "loss": 0.8163, + "epoch": 0.45088566827697263, + "grad_norm": 0.279296875, + "learning_rate": 0.0002977789025588854, + "loss": 0.6534, "step": 280 }, { - "epoch": 0.5430210325047801, - "grad_norm": 0.2333984375, - "learning_rate": 0.0002974802822154278, - "loss": 0.9078, + "epoch": 0.4573268921095008, + "grad_norm": 0.2734375, + "learning_rate": 0.0002977031674181663, + "loss": 0.7261, "step": 284 }, { - "epoch": 0.5506692160611855, - "grad_norm": 0.2314453125, - "learning_rate": 0.00029739654113880755, - "loss": 0.8652, + "epoch": 0.463768115942029, + "grad_norm": 0.27734375, + "learning_rate": 0.00029762617257916873, + "loss": 0.6762, "step": 288 }, { - "epoch": 0.5583173996175909, - "grad_norm": 0.2158203125, - "learning_rate": 0.00029731144335383663, - "loss": 0.8551, + "epoch": 0.47020933977455714, + "grad_norm": 0.306640625, + "learning_rate": 0.0002975479186985493, + "loss": 0.6625, "step": 292 }, { - "epoch": 0.5659655831739961, - "grad_norm": 0.2373046875, - "learning_rate": 0.0002972249896437958, - "loss": 0.8536, + "epoch": 0.47665056360708535, + "grad_norm": 0.291015625, + "learning_rate": 0.0002974684064437025, + "loss": 0.6617, "step": 296 }, { - "epoch": 0.5736137667304015, - "grad_norm": 0.2216796875, - "learning_rate": 0.0002971371808044464, - "loss": 0.825, + "epoch": 0.4830917874396135, + "grad_norm": 0.294921875, + "learning_rate": 0.00029738763649275496, + "loss": 0.6886, "step": 300 }, { - "epoch": 0.5812619502868069, - "grad_norm": 0.244140625, - "learning_rate": 0.000297048017644023, - "loss": 0.8082, + "epoch": 0.4895330112721417, + "grad_norm": 0.265625, + "learning_rate": 0.0002973056095345596, + "loss": 0.6623, "step": 304 }, { - "epoch": 0.5889101338432122, - "grad_norm": 0.251953125, - "learning_rate": 0.0002969575009832261, - "loss": 0.8304, + "epoch": 0.49597423510466987, + "grad_norm": 0.298828125, + "learning_rate": 0.00029722232626869, + "loss": 0.6568, "step": 308 }, { - "epoch": 0.5965583173996176, - "grad_norm": 0.265625, - "learning_rate": 0.00029686563165521435, - "loss": 0.8101, + "epoch": 0.5024154589371981, + "grad_norm": 0.263671875, + "learning_rate": 0.0002971377874054341, + "loss": 0.6281, "step": 312 }, { - "epoch": 0.6042065009560229, - "grad_norm": 0.234375, - "learning_rate": 0.00029677241050559707, - "loss": 0.8535, + "epoch": 0.5088566827697263, + "grad_norm": 0.259765625, + "learning_rate": 0.0002970519936657884, + "loss": 0.6618, "step": 316 }, { - "epoch": 0.6118546845124283, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029667783839242625, - "loss": 0.85, + "epoch": 0.5152979066022544, + "grad_norm": 0.26171875, + "learning_rate": 0.00029696494578145157, + "loss": 0.6797, "step": 320 }, { - "epoch": 0.6195028680688337, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002965819161861891, - "loss": 0.8101, + "epoch": 0.5217391304347826, + "grad_norm": 0.2890625, + "learning_rate": 0.0002968766444948185, + "loss": 0.6756, "step": 324 }, { - "epoch": 0.627151051625239, - "grad_norm": 0.25, - "learning_rate": 0.0002964846447697994, - "loss": 0.8521, + "epoch": 0.5281803542673108, + "grad_norm": 0.271484375, + "learning_rate": 0.0002967870905589739, + "loss": 0.698, "step": 328 }, { - "epoch": 0.6347992351816444, - "grad_norm": 0.21484375, - "learning_rate": 0.00029638602503858995, - "loss": 0.8506, + "epoch": 0.534621578099839, + "grad_norm": 0.28125, + "learning_rate": 0.0002966962847376855, + "loss": 0.6431, "step": 332 }, { - "epoch": 0.6424474187380497, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029628605790030384, - "loss": 0.8044, + "epoch": 0.5410628019323671, + "grad_norm": 0.27734375, + "learning_rate": 0.00029660422780539814, + "loss": 0.6713, "step": 336 }, { - "epoch": 0.6500956022944551, - "grad_norm": 0.2265625, - "learning_rate": 0.0002961847442750866, - "loss": 0.8311, + "epoch": 0.5475040257648953, + "grad_norm": 0.28125, + "learning_rate": 0.00029651092054722665, + "loss": 0.615, "step": 340 }, { - "epoch": 0.6577437858508605, - "grad_norm": 0.2431640625, - "learning_rate": 0.00029608208509547735, - "loss": 0.8705, + "epoch": 0.5539452495974235, + "grad_norm": 0.275390625, + "learning_rate": 0.0002964163637589495, + "loss": 0.7173, "step": 344 }, { - "epoch": 0.6653919694072657, - "grad_norm": 0.240234375, - "learning_rate": 0.00029597808130640027, - "loss": 0.8272, + "epoch": 0.5603864734299517, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002963205582470017, + "loss": 0.6808, "step": 348 }, { - "epoch": 0.6730401529636711, - "grad_norm": 0.2333984375, - "learning_rate": 0.0002958727338651562, - "loss": 0.8035, + "epoch": 0.5668276972624798, + "grad_norm": 0.28515625, + "learning_rate": 0.00029622350482846844, + "loss": 0.6684, "step": 352 }, { - "epoch": 0.6806883365200764, - "grad_norm": 0.25390625, - "learning_rate": 0.00029576604374141315, - "loss": 0.8655, + "epoch": 0.573268921095008, + "grad_norm": 0.251953125, + "learning_rate": 0.00029612520433107734, + "loss": 0.644, "step": 356 }, { - "epoch": 0.6883365200764818, - "grad_norm": 0.2333984375, - "learning_rate": 0.00029565801191719837, - "loss": 0.8585, + "epoch": 0.5797101449275363, + "grad_norm": 0.28125, + "learning_rate": 0.0002960256575931922, + "loss": 0.6599, "step": 360 }, { - "epoch": 0.6959847036328872, - "grad_norm": 0.267578125, - "learning_rate": 0.0002955486393868884, - "loss": 0.8279, + "epoch": 0.5861513687600645, + "grad_norm": 0.298828125, + "learning_rate": 0.0002959248654638053, + "loss": 0.7006, "step": 364 }, { - "epoch": 0.7036328871892925, - "grad_norm": 0.2060546875, - "learning_rate": 0.0002954379271572004, - "loss": 0.8229, + "epoch": 0.5925925925925926, + "grad_norm": 0.265625, + "learning_rate": 0.00029582282880253035, + "loss": 0.625, "step": 368 }, { - "epoch": 0.7112810707456979, - "grad_norm": 0.251953125, - "learning_rate": 0.0002953258762471828, - "loss": 0.8549, + "epoch": 0.5990338164251208, + "grad_norm": 0.26953125, + "learning_rate": 0.0002957195484795952, + "loss": 0.7234, "step": 372 }, { - "epoch": 0.7189292543021033, - "grad_norm": 0.2265625, - "learning_rate": 0.0002952124876882058, - "loss": 0.837, + "epoch": 0.605475040257649, + "grad_norm": 0.291015625, + "learning_rate": 0.0002956150253758344, + "loss": 0.6556, "step": 376 }, { - "epoch": 0.7265774378585086, - "grad_norm": 0.251953125, - "learning_rate": 0.00029509776252395194, - "loss": 0.8084, + "epoch": 0.6119162640901772, + "grad_norm": 0.283203125, + "learning_rate": 0.00029550926038268146, + "loss": 0.6402, "step": 380 }, { - "epoch": 0.734225621414914, - "grad_norm": 0.26953125, - "learning_rate": 0.0002949817018104066, - "loss": 0.8454, + "epoch": 0.6183574879227053, + "grad_norm": 0.265625, + "learning_rate": 0.0002954022544021617, + "loss": 0.6446, "step": 384 }, { - "epoch": 0.7418738049713193, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002948643066158482, - "loss": 0.7603, + "epoch": 0.6247987117552335, + "grad_norm": 0.296875, + "learning_rate": 0.00029529400834688415, + "loss": 0.6379, "step": 388 }, { - "epoch": 0.7495219885277247, - "grad_norm": 0.2353515625, - "learning_rate": 0.00029474557802083834, - "loss": 0.8814, + "epoch": 0.6312399355877617, + "grad_norm": 0.271484375, + "learning_rate": 0.00029518452314003394, + "loss": 0.644, "step": 392 }, { - "epoch": 0.7571701720841301, - "grad_norm": 0.23046875, - "learning_rate": 0.0002946255171182119, - "loss": 0.8678, + "epoch": 0.6376811594202898, + "grad_norm": 0.30078125, + "learning_rate": 0.0002950737997153645, + "loss": 0.6413, "step": 396 }, { - "epoch": 0.7648183556405354, - "grad_norm": 0.236328125, - "learning_rate": 0.00029450412501306675, - "loss": 0.8397, + "epoch": 0.644122383252818, + "grad_norm": 0.267578125, + "learning_rate": 0.00029496183901718927, + "loss": 0.6249, "step": 400 }, { - "epoch": 0.7724665391969407, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029438140282275413, - "loss": 0.797, + "epoch": 0.6505636070853462, + "grad_norm": 0.291015625, + "learning_rate": 0.00029484864200037415, + "loss": 0.5929, "step": 404 }, { - "epoch": 0.780114722753346, - "grad_norm": 0.24609375, - "learning_rate": 0.0002942573516768678, - "loss": 0.7764, + "epoch": 0.6570048309178744, + "grad_norm": 0.263671875, + "learning_rate": 0.0002947342096303289, + "loss": 0.6447, "step": 408 }, { - "epoch": 0.7877629063097514, - "grad_norm": 0.263671875, - "learning_rate": 0.00029413197271723385, - "loss": 0.7856, + "epoch": 0.6634460547504025, + "grad_norm": 0.287109375, + "learning_rate": 0.0002946185428829991, + "loss": 0.641, "step": 412 }, { - "epoch": 0.7954110898661568, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002940052670979004, - "loss": 0.7878, + "epoch": 0.6698872785829307, + "grad_norm": 0.3125, + "learning_rate": 0.0002945016427448579, + "loss": 0.6878, "step": 416 }, { - "epoch": 0.8030592734225621, - "grad_norm": 0.2421875, - "learning_rate": 0.0002938772359851265, - "loss": 0.7971, + "epoch": 0.6763285024154589, + "grad_norm": 0.287109375, + "learning_rate": 0.0002943835102128975, + "loss": 0.6646, "step": 420 }, { - "epoch": 0.8107074569789675, - "grad_norm": 0.240234375, - "learning_rate": 0.00029374788055737194, - "loss": 0.825, + "epoch": 0.6827697262479872, + "grad_norm": 0.26171875, + "learning_rate": 0.0002942641462946206, + "loss": 0.613, "step": 424 }, { - "epoch": 0.8183556405353728, - "grad_norm": 0.2373046875, - "learning_rate": 0.000293617202005286, - "loss": 0.8105, + "epoch": 0.6892109500805152, + "grad_norm": 0.302734375, + "learning_rate": 0.00029414355200803197, + "loss": 0.6135, "step": 428 }, { - "epoch": 0.8260038240917782, - "grad_norm": 0.26171875, - "learning_rate": 0.00029348520153169656, - "loss": 0.8055, + "epoch": 0.6956521739130435, + "grad_norm": 0.283203125, + "learning_rate": 0.0002940217283816296, + "loss": 0.6145, "step": 432 }, { - "epoch": 0.8336520076481836, - "grad_norm": 0.228515625, - "learning_rate": 0.0002933518803515993, - "loss": 0.7616, + "epoch": 0.7020933977455717, + "grad_norm": 0.27734375, + "learning_rate": 0.0002938986764543961, + "loss": 0.6199, "step": 436 }, { - "epoch": 0.8413001912045889, - "grad_norm": 0.265625, - "learning_rate": 0.00029321723969214625, - "loss": 0.7842, + "epoch": 0.7085346215780999, + "grad_norm": 0.267578125, + "learning_rate": 0.0002937743972757895, + "loss": 0.6566, "step": 440 }, { - "epoch": 0.8489483747609943, - "grad_norm": 0.216796875, - "learning_rate": 0.0002930812807926343, - "loss": 0.803, + "epoch": 0.714975845410628, + "grad_norm": 0.27734375, + "learning_rate": 0.0002936488919057349, + "loss": 0.6536, "step": 444 }, { - "epoch": 0.8565965583173997, - "grad_norm": 0.21875, - "learning_rate": 0.0002929440049044945, - "loss": 0.8174, + "epoch": 0.7214170692431562, + "grad_norm": 0.28515625, + "learning_rate": 0.0002935221614146148, + "loss": 0.6586, "step": 448 }, { - "epoch": 0.864244741873805, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002928054132912797, - "loss": 0.836, + "epoch": 0.7278582930756844, + "grad_norm": 0.259765625, + "learning_rate": 0.0002933942068832604, + "loss": 0.6234, "step": 452 }, { - "epoch": 0.8718929254302104, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002926655072286536, - "loss": 0.7552, + "epoch": 0.7342995169082126, + "grad_norm": 0.28515625, + "learning_rate": 0.00029326502940294207, + "loss": 0.6115, "step": 456 }, { - "epoch": 0.8795411089866156, - "grad_norm": 0.248046875, - "learning_rate": 0.00029252428800437854, - "loss": 0.7755, + "epoch": 0.7407407407407407, + "grad_norm": 0.298828125, + "learning_rate": 0.00029313463007536034, + "loss": 0.6205, "step": 460 }, { - "epoch": 0.887189292543021, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029238175691830395, - "loss": 0.8101, + "epoch": 0.7471819645732689, + "grad_norm": 0.271484375, + "learning_rate": 0.0002930030100126363, + "loss": 0.6185, "step": 464 }, { - "epoch": 0.8948374760994264, - "grad_norm": 0.248046875, - "learning_rate": 0.00029223791528235407, - "loss": 0.7662, + "epoch": 0.7536231884057971, + "grad_norm": 0.251953125, + "learning_rate": 0.0002928701703373021, + "loss": 0.6305, "step": 468 }, { - "epoch": 0.9024856596558317, - "grad_norm": 0.2314453125, - "learning_rate": 0.00029209276442051634, - "loss": 0.7772, + "epoch": 0.7600644122383253, + "grad_norm": 0.267578125, + "learning_rate": 0.00029273611218229165, + "loss": 0.6464, "step": 472 }, { - "epoch": 0.9101338432122371, - "grad_norm": 0.2197265625, - "learning_rate": 0.0002919463056688287, - "loss": 0.8135, + "epoch": 0.7665056360708534, + "grad_norm": 0.263671875, + "learning_rate": 0.0002926008366909307, + "loss": 0.6488, "step": 476 }, { - "epoch": 0.9177820267686424, - "grad_norm": 0.236328125, - "learning_rate": 0.00029179854037536773, - "loss": 0.7895, + "epoch": 0.7729468599033816, + "grad_norm": 0.28515625, + "learning_rate": 0.00029246434501692685, + "loss": 0.6148, "step": 480 }, { - "epoch": 0.9254302103250478, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002916494699002358, - "loss": 0.7644, + "epoch": 0.7793880837359098, + "grad_norm": 0.28125, + "learning_rate": 0.00029232663832436047, + "loss": 0.5946, "step": 484 }, { - "epoch": 0.9330783938814532, - "grad_norm": 0.2197265625, - "learning_rate": 0.00029149909561554914, - "loss": 0.7804, + "epoch": 0.785829307568438, + "grad_norm": 0.265625, + "learning_rate": 0.0002921877177876741, + "loss": 0.5898, "step": 488 }, { - "epoch": 0.9407265774378585, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002913474189054246, - "loss": 0.7779, + "epoch": 0.7922705314009661, + "grad_norm": 0.265625, + "learning_rate": 0.0002920475845916626, + "loss": 0.6435, "step": 492 }, { - "epoch": 0.9483747609942639, - "grad_norm": 0.240234375, - "learning_rate": 0.0002911944411659672, - "loss": 0.781, + "epoch": 0.7987117552334944, + "grad_norm": 0.2578125, + "learning_rate": 0.00029190623993146313, + "loss": 0.6605, "step": 496 }, { - "epoch": 0.9560229445506692, - "grad_norm": 0.259765625, - "learning_rate": 0.0002910401638052574, - "loss": 0.8079, + "epoch": 0.8051529790660226, + "grad_norm": 0.287109375, + "learning_rate": 0.0002917636850125449, + "loss": 0.6297, "step": 500 }, { - "epoch": 0.9636711281070746, - "grad_norm": 0.259765625, - "learning_rate": 0.00029088458824333787, - "loss": 0.8332, + "epoch": 0.8115942028985508, + "grad_norm": 0.26953125, + "learning_rate": 0.00029161992105069905, + "loss": 0.6313, "step": 504 }, { - "epoch": 0.97131931166348, - "grad_norm": 0.2275390625, - "learning_rate": 0.00029072771591220057, - "loss": 0.7752, + "epoch": 0.8180354267310789, + "grad_norm": 0.26953125, + "learning_rate": 0.0002914749492720279, + "loss": 0.5953, "step": 508 }, { - "epoch": 0.9789674952198852, - "grad_norm": 0.26171875, - "learning_rate": 0.00029056954825577353, - "loss": 0.8038, + "epoch": 0.8244766505636071, + "grad_norm": 0.267578125, + "learning_rate": 0.00029132877091293493, + "loss": 0.6615, "step": 512 }, { - "epoch": 0.9866156787762906, - "grad_norm": 0.23046875, - "learning_rate": 0.0002904100867299077, - "loss": 0.7362, + "epoch": 0.8309178743961353, + "grad_norm": 0.287109375, + "learning_rate": 0.000291181387220114, + "loss": 0.6771, "step": 516 }, { - "epoch": 0.994263862332696, - "grad_norm": 0.228515625, - "learning_rate": 0.0002902493328023633, - "loss": 0.8137, + "epoch": 0.8373590982286635, + "grad_norm": 0.283203125, + "learning_rate": 0.0002910327994505387, + "loss": 0.5889, "step": 520 }, { - "epoch": 1.0019120458891013, - "grad_norm": 0.21875, - "learning_rate": 0.0002900872879527964, - "loss": 0.7389, + "epoch": 0.8438003220611916, + "grad_norm": 0.275390625, + "learning_rate": 0.0002908830088714516, + "loss": 0.5781, "step": 524 }, { - "epoch": 1.0095602294455066, - "grad_norm": 0.25390625, - "learning_rate": 0.00028992395367274547, - "loss": 0.6994, + "epoch": 0.8502415458937198, + "grad_norm": 0.287109375, + "learning_rate": 0.00029073201676035383, + "loss": 0.6182, "step": 528 }, { - "epoch": 1.0172084130019121, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002897593314656174, - "loss": 0.7461, + "epoch": 0.856682769726248, + "grad_norm": 0.263671875, + "learning_rate": 0.00029057982440499356, + "loss": 0.6226, "step": 532 }, { - "epoch": 1.0248565965583174, - "grad_norm": 0.263671875, - "learning_rate": 0.0002895934228466738, - "loss": 0.6856, + "epoch": 0.8631239935587761, + "grad_norm": 0.27734375, + "learning_rate": 0.00029042643310335547, + "loss": 0.6547, "step": 536 }, { - "epoch": 1.0325047801147227, - "grad_norm": 0.236328125, - "learning_rate": 0.0002894262293430171, - "loss": 0.7405, + "epoch": 0.8695652173913043, + "grad_norm": 0.26953125, + "learning_rate": 0.00029027184416364956, + "loss": 0.6114, "step": 540 }, { - "epoch": 1.0401529636711282, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002892577524935763, - "loss": 0.7906, + "epoch": 0.8760064412238325, + "grad_norm": 0.28515625, + "learning_rate": 0.0002901160589043, + "loss": 0.6491, "step": 544 }, { - "epoch": 1.0478011472275335, - "grad_norm": 0.2578125, - "learning_rate": 0.00028908799384909313, - "loss": 0.7144, + "epoch": 0.8824476650563607, + "grad_norm": 0.275390625, + "learning_rate": 0.00028995907865393385, + "loss": 0.6375, "step": 548 }, { - "epoch": 1.0554493307839388, - "grad_norm": 0.232421875, - "learning_rate": 0.0002889169549721073, - "loss": 0.7233, + "epoch": 0.8888888888888888, + "grad_norm": 0.2578125, + "learning_rate": 0.00028980090475136963, + "loss": 0.6083, "step": 552 }, { - "epoch": 1.063097514340344, - "grad_norm": 0.28125, - "learning_rate": 0.00028874463743694265, - "loss": 0.7068, + "epoch": 0.895330112721417, + "grad_norm": 0.267578125, + "learning_rate": 0.0002896415385456062, + "loss": 0.5879, "step": 556 }, { - "epoch": 1.0707456978967496, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002885710428296922, - "loss": 0.7105, + "epoch": 0.9017713365539453, + "grad_norm": 0.279296875, + "learning_rate": 0.000289480981395811, + "loss": 0.6596, "step": 560 }, { - "epoch": 1.0783938814531548, - "grad_norm": 0.2421875, - "learning_rate": 0.00028839617274820404, - "loss": 0.6563, + "epoch": 0.9082125603864735, + "grad_norm": 0.306640625, + "learning_rate": 0.00028931923467130855, + "loss": 0.5774, "step": 564 }, { - "epoch": 1.0860420650095601, - "grad_norm": 0.244140625, - "learning_rate": 0.00028822002880206593, - "loss": 0.6956, + "epoch": 0.9146537842190016, + "grad_norm": 0.28515625, + "learning_rate": 0.00028915629975156867, + "loss": 0.6118, "step": 568 }, { - "epoch": 1.0936902485659656, - "grad_norm": 0.255859375, - "learning_rate": 0.00028804261261259115, - "loss": 0.6669, + "epoch": 0.9210950080515298, + "grad_norm": 0.298828125, + "learning_rate": 0.0002889921780261949, + "loss": 0.615, "step": 572 }, { - "epoch": 1.101338432122371, - "grad_norm": 0.2578125, - "learning_rate": 0.00028786392581280334, - "loss": 0.6727, + "epoch": 0.927536231884058, + "grad_norm": 0.25, + "learning_rate": 0.00028882687089491234, + "loss": 0.6225, "step": 576 }, { - "epoch": 1.1089866156787762, - "grad_norm": 0.26953125, - "learning_rate": 0.00028768397004742135, - "loss": 0.714, + "epoch": 0.9339774557165862, + "grad_norm": 0.291015625, + "learning_rate": 0.0002886603797675563, + "loss": 0.5626, "step": 580 }, { - "epoch": 1.1166347992351817, - "grad_norm": 0.255859375, - "learning_rate": 0.00028750274697284423, - "loss": 0.666, + "epoch": 0.9404186795491143, + "grad_norm": 0.26953125, + "learning_rate": 0.0002884927060640596, + "loss": 0.5886, "step": 584 }, { - "epoch": 1.124282982791587, - "grad_norm": 0.23828125, - "learning_rate": 0.00028732025825713587, - "loss": 0.6883, + "epoch": 0.9468599033816425, + "grad_norm": 0.328125, + "learning_rate": 0.0002883238512144409, + "loss": 0.6251, "step": 588 }, { - "epoch": 1.1319311663479923, - "grad_norm": 0.26171875, - "learning_rate": 0.00028713650558000983, - "loss": 0.7002, + "epoch": 0.9533011272141707, + "grad_norm": 0.27734375, + "learning_rate": 0.0002881538166587921, + "loss": 0.6326, "step": 592 }, { - "epoch": 1.1395793499043978, - "grad_norm": 0.26171875, - "learning_rate": 0.0002869514906328138, - "loss": 0.7038, + "epoch": 0.9597423510466989, + "grad_norm": 0.271484375, + "learning_rate": 0.0002879826038472667, + "loss": 0.5666, "step": 596 }, { - "epoch": 1.147227533460803, - "grad_norm": 0.2578125, - "learning_rate": 0.00028676521511851395, - "loss": 0.7178, + "epoch": 0.966183574879227, + "grad_norm": 0.279296875, + "learning_rate": 0.00028781021424006677, + "loss": 0.5282, "step": 600 }, { - "epoch": 1.1548757170172084, - "grad_norm": 0.267578125, - "learning_rate": 0.0002865776807516793, - "loss": 0.6752, + "epoch": 0.9726247987117552, + "grad_norm": 0.271484375, + "learning_rate": 0.00028763664930743087, + "loss": 0.6628, "step": 604 }, { - "epoch": 1.1625239005736137, - "grad_norm": 0.2470703125, - "learning_rate": 0.0002863888892584659, - "loss": 0.7204, + "epoch": 0.9790660225442834, + "grad_norm": 0.265625, + "learning_rate": 0.00028746191052962146, + "loss": 0.5669, "step": 608 }, { - "epoch": 1.1701720841300192, - "grad_norm": 0.298828125, - "learning_rate": 0.00028619884237660124, - "loss": 0.6967, + "epoch": 0.9855072463768116, + "grad_norm": 0.267578125, + "learning_rate": 0.00028728599939691215, + "loss": 0.5955, "step": 612 }, { - "epoch": 1.1778202676864244, - "grad_norm": 0.2421875, - "learning_rate": 0.0002860075418553676, - "loss": 0.705, + "epoch": 0.9919484702093397, + "grad_norm": 0.27734375, + "learning_rate": 0.00028710891740957507, + "loss": 0.5995, "step": 616 }, { - "epoch": 1.1854684512428297, - "grad_norm": 0.271484375, - "learning_rate": 0.00028581498945558676, - "loss": 0.7208, + "epoch": 0.998389694041868, + "grad_norm": 0.265625, + "learning_rate": 0.00028693066607786823, + "loss": 0.5813, "step": 620 }, { - "epoch": 1.1931166347992352, - "grad_norm": 0.26953125, - "learning_rate": 0.00028562118694960316, - "loss": 0.6872, + "epoch": 1.0048309178743962, + "grad_norm": 0.251953125, + "learning_rate": 0.0002867512469220222, + "loss": 0.5306, "step": 624 }, { - "epoch": 1.2007648183556405, - "grad_norm": 0.2431640625, - "learning_rate": 0.0002854261361212679, - "loss": 0.7445, + "epoch": 1.0112721417069244, + "grad_norm": 0.275390625, + "learning_rate": 0.00028657066147222773, + "loss": 0.4918, "step": 628 }, { - "epoch": 1.2084130019120458, - "grad_norm": 0.24609375, - "learning_rate": 0.00028522983876592213, - "loss": 0.7267, + "epoch": 1.0177133655394526, + "grad_norm": 0.259765625, + "learning_rate": 0.00028638891126862224, + "loss": 0.5198, "step": 632 }, { - "epoch": 1.2160611854684513, - "grad_norm": 0.26171875, - "learning_rate": 0.0002850322966903808, - "loss": 0.6831, + "epoch": 1.0241545893719808, + "grad_norm": 0.259765625, + "learning_rate": 0.0002862059978612769, + "loss": 0.5673, "step": 636 }, { - "epoch": 1.2237093690248566, - "grad_norm": 0.26171875, - "learning_rate": 0.00028483351171291576, - "loss": 0.7185, + "epoch": 1.0305958132045088, + "grad_norm": 0.2734375, + "learning_rate": 0.00028602192281018327, + "loss": 0.5127, "step": 640 }, { - "epoch": 1.231357552581262, - "grad_norm": 0.26171875, - "learning_rate": 0.00028463348566323913, - "loss": 0.7019, + "epoch": 1.037037037037037, + "grad_norm": 0.283203125, + "learning_rate": 0.0002858366876852403, + "loss": 0.5517, "step": 644 }, { - "epoch": 1.2390057361376674, - "grad_norm": 0.263671875, - "learning_rate": 0.00028443222038248645, - "loss": 0.7699, + "epoch": 1.0434782608695652, + "grad_norm": 0.26171875, + "learning_rate": 0.0002856502940662403, + "loss": 0.5209, "step": 648 }, { - "epoch": 1.2466539196940727, - "grad_norm": 0.24609375, - "learning_rate": 0.00028422971772319977, - "loss": 0.6939, + "epoch": 1.0499194847020934, + "grad_norm": 0.279296875, + "learning_rate": 0.00028546274354285646, + "loss": 0.5362, "step": 652 }, { - "epoch": 1.254302103250478, - "grad_norm": 0.28125, - "learning_rate": 0.00028402597954931046, - "loss": 0.6505, + "epoch": 1.0563607085346216, + "grad_norm": 0.25390625, + "learning_rate": 0.00028527403771462826, + "loss": 0.5256, "step": 656 }, { - "epoch": 1.2619502868068833, - "grad_norm": 0.251953125, - "learning_rate": 0.00028382100773612236, - "loss": 0.6632, + "epoch": 1.0628019323671498, + "grad_norm": 0.361328125, + "learning_rate": 0.00028508417819094844, + "loss": 0.5257, "step": 660 }, { - "epoch": 1.2695984703632888, - "grad_norm": 0.251953125, - "learning_rate": 0.0002836148041702941, - "loss": 0.7787, + "epoch": 1.069243156199678, + "grad_norm": 0.2890625, + "learning_rate": 0.0002848931665910492, + "loss": 0.4971, "step": 664 }, { - "epoch": 1.277246653919694, - "grad_norm": 0.2412109375, - "learning_rate": 0.00028340737074982207, - "loss": 0.7293, + "epoch": 1.075684380032206, + "grad_norm": 0.275390625, + "learning_rate": 0.0002847010045439882, + "loss": 0.5214, "step": 668 }, { - "epoch": 1.2848948374760996, - "grad_norm": 0.23828125, - "learning_rate": 0.0002831987093840229, - "loss": 0.7471, + "epoch": 1.0821256038647342, + "grad_norm": 0.298828125, + "learning_rate": 0.0002845076936886349, + "loss": 0.5283, "step": 672 }, { - "epoch": 1.2925430210325048, - "grad_norm": 0.240234375, - "learning_rate": 0.00028298882199351565, - "loss": 0.7445, + "epoch": 1.0885668276972624, + "grad_norm": 0.271484375, + "learning_rate": 0.0002843132356736563, + "loss": 0.5024, "step": 676 }, { - "epoch": 1.3001912045889101, - "grad_norm": 0.251953125, - "learning_rate": 0.00028277771051020433, - "loss": 0.6997, + "epoch": 1.0950080515297906, + "grad_norm": 0.26171875, + "learning_rate": 0.0002841176321575032, + "loss": 0.5515, "step": 680 }, { - "epoch": 1.3078393881453154, - "grad_norm": 0.2412109375, - "learning_rate": 0.00028256537687726017, - "loss": 0.7389, + "epoch": 1.1014492753623188, + "grad_norm": 0.27734375, + "learning_rate": 0.0002839208848083958, + "loss": 0.5493, "step": 684 }, { - "epoch": 1.3154875717017207, - "grad_norm": 0.26171875, - "learning_rate": 0.0002823518230491036, - "loss": 0.7171, + "epoch": 1.107890499194847, + "grad_norm": 0.2578125, + "learning_rate": 0.0002837229953043096, + "loss": 0.4908, "step": 688 }, { - "epoch": 1.3231357552581262, - "grad_norm": 0.251953125, - "learning_rate": 0.00028213705099138636, - "loss": 0.7054, + "epoch": 1.1143317230273753, + "grad_norm": 0.27734375, + "learning_rate": 0.0002835239653329611, + "loss": 0.5136, "step": 692 }, { - "epoch": 1.3307839388145315, - "grad_norm": 0.267578125, - "learning_rate": 0.00028192106268097334, - "loss": 0.6543, + "epoch": 1.1207729468599035, + "grad_norm": 0.283203125, + "learning_rate": 0.0002833237965917934, + "loss": 0.5379, "step": 696 }, { - "epoch": 1.338432122370937, - "grad_norm": 0.2578125, - "learning_rate": 0.0002817038601059243, - "loss": 0.7012, + "epoch": 1.1272141706924317, + "grad_norm": 0.28515625, + "learning_rate": 0.0002831224907879614, + "loss": 0.5059, "step": 700 }, { - "epoch": 1.3460803059273423, - "grad_norm": 0.259765625, - "learning_rate": 0.0002814854452654758, - "loss": 0.7058, + "epoch": 1.1336553945249597, + "grad_norm": 0.27734375, + "learning_rate": 0.00028292004963831796, + "loss": 0.5231, "step": 704 }, { - "epoch": 1.3537284894837476, - "grad_norm": 0.259765625, - "learning_rate": 0.00028126582017002266, - "loss": 0.7797, + "epoch": 1.1400966183574879, + "grad_norm": 0.279296875, + "learning_rate": 0.00028271647486939855, + "loss": 0.5223, "step": 708 }, { - "epoch": 1.3613766730401529, - "grad_norm": 0.267578125, - "learning_rate": 0.0002810449868410994, - "loss": 0.6898, + "epoch": 1.146537842190016, + "grad_norm": 0.27734375, + "learning_rate": 0.0002825117682174069, + "loss": 0.4907, "step": 712 }, { - "epoch": 1.3690248565965584, - "grad_norm": 0.240234375, - "learning_rate": 0.00028082294731136164, - "loss": 0.6872, + "epoch": 1.1529790660225443, + "grad_norm": 0.267578125, + "learning_rate": 0.0002823059314282, + "loss": 0.4996, "step": 716 }, { - "epoch": 1.3766730401529637, - "grad_norm": 0.2734375, - "learning_rate": 0.00028059970362456776, - "loss": 0.706, + "epoch": 1.1594202898550725, + "grad_norm": 0.25, + "learning_rate": 0.0002820989662572734, + "loss": 0.5084, "step": 720 }, { - "epoch": 1.384321223709369, - "grad_norm": 0.236328125, - "learning_rate": 0.00028037525783555935, - "loss": 0.6971, + "epoch": 1.1658615136876007, + "grad_norm": 0.271484375, + "learning_rate": 0.0002818908744697461, + "loss": 0.4909, "step": 724 }, { - "epoch": 1.3919694072657744, - "grad_norm": 0.251953125, - "learning_rate": 0.00028014961201024304, - "loss": 0.7212, + "epoch": 1.1723027375201287, + "grad_norm": 0.265625, + "learning_rate": 0.00028168165784034566, + "loss": 0.5245, "step": 728 }, { - "epoch": 1.3996175908221797, - "grad_norm": 0.25, - "learning_rate": 0.0002799227682255711, - "loss": 0.702, + "epoch": 1.178743961352657, + "grad_norm": 0.271484375, + "learning_rate": 0.00028147131815339267, + "loss": 0.5307, "step": 732 }, { - "epoch": 1.407265774378585, - "grad_norm": 0.259765625, - "learning_rate": 0.00027969472856952224, - "loss": 0.6604, + "epoch": 1.1851851851851851, + "grad_norm": 0.26953125, + "learning_rate": 0.00028125985720278614, + "loss": 0.5213, "step": 736 }, { - "epoch": 1.4149139579349903, - "grad_norm": 0.265625, - "learning_rate": 0.00027946549514108277, - "loss": 0.7, + "epoch": 1.1916264090177133, + "grad_norm": 0.298828125, + "learning_rate": 0.0002810472767919876, + "loss": 0.5257, "step": 740 }, { - "epoch": 1.4225621414913958, - "grad_norm": 0.263671875, - "learning_rate": 0.00027923507005022687, - "loss": 0.7281, + "epoch": 1.1980676328502415, + "grad_norm": 0.2890625, + "learning_rate": 0.0002808335787340061, + "loss": 0.4913, "step": 744 }, { - "epoch": 1.430210325047801, - "grad_norm": 0.2392578125, - "learning_rate": 0.00027900345541789746, - "loss": 0.6261, + "epoch": 1.2045088566827697, + "grad_norm": 0.263671875, + "learning_rate": 0.00028061876485138264, + "loss": 0.5331, "step": 748 }, { - "epoch": 1.4378585086042066, - "grad_norm": 0.275390625, - "learning_rate": 0.0002787706533759865, - "loss": 0.7131, + "epoch": 1.210950080515298, + "grad_norm": 0.283203125, + "learning_rate": 0.00028040283697617464, + "loss": 0.5055, "step": 752 }, { - "epoch": 1.445506692160612, - "grad_norm": 0.26171875, - "learning_rate": 0.00027853666606731547, - "loss": 0.664, + "epoch": 1.2173913043478262, + "grad_norm": 0.263671875, + "learning_rate": 0.0002801857969499402, + "loss": 0.5318, "step": 756 }, { - "epoch": 1.4531548757170172, - "grad_norm": 0.255859375, - "learning_rate": 0.0002783014956456157, - "loss": 0.6856, + "epoch": 1.2238325281803544, + "grad_norm": 0.29296875, + "learning_rate": 0.0002799676466237225, + "loss": 0.4991, "step": 760 }, { - "epoch": 1.4608030592734225, - "grad_norm": 0.240234375, - "learning_rate": 0.0002780651442755083, - "loss": 0.7422, + "epoch": 1.2302737520128824, + "grad_norm": 0.27734375, + "learning_rate": 0.0002797483878580342, + "loss": 0.5059, "step": 764 }, { - "epoch": 1.468451242829828, - "grad_norm": 0.24609375, - "learning_rate": 0.0002778276141324844, - "loss": 0.6738, + "epoch": 1.2367149758454106, + "grad_norm": 0.28125, + "learning_rate": 0.00027952802252284104, + "loss": 0.5043, "step": 768 }, { - "epoch": 1.4760994263862333, - "grad_norm": 0.271484375, - "learning_rate": 0.0002775889074028853, - "loss": 0.7595, + "epoch": 1.2431561996779388, + "grad_norm": 0.251953125, + "learning_rate": 0.0002793065524975465, + "loss": 0.5747, "step": 772 }, { - "epoch": 1.4837476099426385, - "grad_norm": 0.255859375, - "learning_rate": 0.000277349026283882, - "loss": 0.7132, + "epoch": 1.249597423510467, + "grad_norm": 0.30078125, + "learning_rate": 0.0002790839796709755, + "loss": 0.5082, "step": 776 }, { - "epoch": 1.491395793499044, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002771079729834552, - "loss": 0.7316, + "epoch": 1.2560386473429952, + "grad_norm": 0.287109375, + "learning_rate": 0.00027886030594135805, + "loss": 0.5369, "step": 780 }, { - "epoch": 1.4990439770554493, - "grad_norm": 0.2578125, - "learning_rate": 0.0002768657497203749, - "loss": 0.7314, + "epoch": 1.2624798711755234, + "grad_norm": 0.27734375, + "learning_rate": 0.0002786355332163135, + "loss": 0.5423, "step": 784 }, { - "epoch": 1.5066921606118546, - "grad_norm": 0.25390625, - "learning_rate": 0.00027662235872418005, - "loss": 0.7036, + "epoch": 1.2689210950080514, + "grad_norm": 0.302734375, + "learning_rate": 0.000278409663412834, + "loss": 0.4882, "step": 788 }, { - "epoch": 1.51434034416826, - "grad_norm": 0.25390625, - "learning_rate": 0.00027637780223515793, - "loss": 0.7191, + "epoch": 1.2753623188405796, + "grad_norm": 0.26953125, + "learning_rate": 0.0002781826984572683, + "loss": 0.504, "step": 792 }, { - "epoch": 1.5219885277246654, - "grad_norm": 0.267578125, - "learning_rate": 0.00027613208250432353, - "loss": 0.712, + "epoch": 1.2818035426731078, + "grad_norm": 0.2734375, + "learning_rate": 0.0002779546402853051, + "loss": 0.4872, "step": 796 }, { - "epoch": 1.5296367112810707, - "grad_norm": 0.248046875, - "learning_rate": 0.000275885201793399, - "loss": 0.6918, + "epoch": 1.288244766505636, + "grad_norm": 0.265625, + "learning_rate": 0.00027772549084195675, + "loss": 0.5348, "step": 800 }, { - "epoch": 1.5372848948374762, - "grad_norm": 0.25390625, - "learning_rate": 0.0002756371623747925, - "loss": 0.6822, + "epoch": 1.2946859903381642, + "grad_norm": 0.29296875, + "learning_rate": 0.00027749525208154265, + "loss": 0.5718, "step": 804 }, { - "epoch": 1.5449330783938815, - "grad_norm": 0.271484375, - "learning_rate": 0.0002753879665315778, - "loss": 0.707, + "epoch": 1.3011272141706924, + "grad_norm": 0.294921875, + "learning_rate": 0.0002772639259676726, + "loss": 0.5393, "step": 808 }, { - "epoch": 1.5525812619502868, - "grad_norm": 0.26171875, - "learning_rate": 0.0002751376165574726, - "loss": 0.6921, + "epoch": 1.3075684380032206, + "grad_norm": 0.2734375, + "learning_rate": 0.00027703151447322965, + "loss": 0.5421, "step": 812 }, { - "epoch": 1.560229445506692, - "grad_norm": 0.2734375, - "learning_rate": 0.0002748861147568181, - "loss": 0.7048, + "epoch": 1.3140096618357489, + "grad_norm": 0.275390625, + "learning_rate": 0.0002767980195803539, + "loss": 0.5555, "step": 816 }, { - "epoch": 1.5678776290630974, - "grad_norm": 0.2734375, - "learning_rate": 0.00027463346344455724, - "loss": 0.7171, + "epoch": 1.320450885668277, + "grad_norm": 0.2890625, + "learning_rate": 0.0002765634432804253, + "loss": 0.553, "step": 820 }, { - "epoch": 1.5755258126195029, - "grad_norm": 0.267578125, - "learning_rate": 0.0002743796649462137, - "loss": 0.6839, + "epoch": 1.3268921095008053, + "grad_norm": 0.27734375, + "learning_rate": 0.00027632778757404655, + "loss": 0.5075, "step": 824 }, { - "epoch": 1.5831739961759084, - "grad_norm": 0.25390625, - "learning_rate": 0.00027412472159787037, - "loss": 0.6722, + "epoch": 1.3333333333333333, + "grad_norm": 0.294921875, + "learning_rate": 0.0002760910544710261, + "loss": 0.4933, "step": 828 }, { - "epoch": 1.5908221797323137, - "grad_norm": 0.251953125, - "learning_rate": 0.00027386863574614803, - "loss": 0.6538, + "epoch": 1.3397745571658615, + "grad_norm": 0.283203125, + "learning_rate": 0.00027585324599036133, + "loss": 0.5039, "step": 832 }, { - "epoch": 1.598470363288719, - "grad_norm": 0.2734375, - "learning_rate": 0.0002736114097481833, - "loss": 0.7013, + "epoch": 1.3462157809983897, + "grad_norm": 0.28125, + "learning_rate": 0.00027561436416022073, + "loss": 0.5175, "step": 836 }, { - "epoch": 1.6061185468451242, - "grad_norm": 0.259765625, - "learning_rate": 0.0002733530459716076, - "loss": 0.6967, + "epoch": 1.3526570048309179, + "grad_norm": 0.28125, + "learning_rate": 0.00027537441101792715, + "loss": 0.5375, "step": 840 }, { - "epoch": 1.6137667304015295, - "grad_norm": 0.255859375, - "learning_rate": 0.00027309354679452483, - "loss": 0.7052, + "epoch": 1.359098228663446, + "grad_norm": 0.287109375, + "learning_rate": 0.0002751333886099402, + "loss": 0.5235, "step": 844 }, { - "epoch": 1.621414913957935, - "grad_norm": 0.279296875, - "learning_rate": 0.0002728329146054897, - "loss": 0.6677, + "epoch": 1.3655394524959743, + "grad_norm": 0.28125, + "learning_rate": 0.0002748912989918387, + "loss": 0.4882, "step": 848 }, { - "epoch": 1.6290630975143403, - "grad_norm": 0.267578125, - "learning_rate": 0.00027257115180348557, - "loss": 0.7128, + "epoch": 1.3719806763285023, + "grad_norm": 0.287109375, + "learning_rate": 0.0002746481442283034, + "loss": 0.5032, "step": 852 }, { - "epoch": 1.6367112810707458, - "grad_norm": 0.25, - "learning_rate": 0.0002723082607979028, - "loss": 0.7085, + "epoch": 1.3784219001610305, + "grad_norm": 0.279296875, + "learning_rate": 0.0002744039263930991, + "loss": 0.5052, "step": 856 }, { - "epoch": 1.644359464627151, - "grad_norm": 0.259765625, - "learning_rate": 0.00027204424400851596, - "loss": 0.6719, + "epoch": 1.3848631239935587, + "grad_norm": 0.265625, + "learning_rate": 0.0002741586475690571, + "loss": 0.5538, "step": 860 }, { - "epoch": 1.6520076481835564, - "grad_norm": 0.275390625, - "learning_rate": 0.00027177910386546206, - "loss": 0.6873, + "epoch": 1.391304347826087, + "grad_norm": 0.263671875, + "learning_rate": 0.0002739123098480576, + "loss": 0.5457, "step": 864 }, { - "epoch": 1.6596558317399617, - "grad_norm": 0.248046875, - "learning_rate": 0.00027151284280921794, - "loss": 0.6919, + "epoch": 1.3977455716586151, + "grad_norm": 0.2734375, + "learning_rate": 0.00027366491533101147, + "loss": 0.5111, "step": 868 }, { - "epoch": 1.667304015296367, - "grad_norm": 0.2734375, - "learning_rate": 0.0002712454632905779, - "loss": 0.6967, + "epoch": 1.4041867954911433, + "grad_norm": 0.263671875, + "learning_rate": 0.0002734164661278426, + "loss": 0.4902, "step": 872 }, { - "epoch": 1.6749521988527725, - "grad_norm": 0.271484375, - "learning_rate": 0.00027097696777063113, - "loss": 0.7067, + "epoch": 1.4106280193236715, + "grad_norm": 0.263671875, + "learning_rate": 0.00027316696435747, + "loss": 0.5504, "step": 876 }, { - "epoch": 1.682600382409178, - "grad_norm": 0.2578125, - "learning_rate": 0.00027070735872073885, - "loss": 0.6303, + "epoch": 1.4170692431561998, + "grad_norm": 0.271484375, + "learning_rate": 0.00027291641214778937, + "loss": 0.5234, "step": 880 }, { - "epoch": 1.6902485659655833, + "epoch": 1.423510466988728, "grad_norm": 0.271484375, - "learning_rate": 0.0002704366386225119, - "loss": 0.6828, + "learning_rate": 0.0002726648116356554, + "loss": 0.5052, "step": 884 }, { - "epoch": 1.6978967495219885, - "grad_norm": 0.271484375, - "learning_rate": 0.0002701648099677878, - "loss": 0.6876, + "epoch": 1.4299516908212562, + "grad_norm": 0.28515625, + "learning_rate": 0.000272412164966863, + "loss": 0.5189, "step": 888 }, { - "epoch": 1.7055449330783938, - "grad_norm": 0.26171875, - "learning_rate": 0.0002698918752586075, - "loss": 0.7079, + "epoch": 1.4363929146537842, + "grad_norm": 0.279296875, + "learning_rate": 0.00027215847429612965, + "loss": 0.4982, "step": 892 }, { - "epoch": 1.7131931166347991, - "grad_norm": 0.2890625, - "learning_rate": 0.00026961783700719293, - "loss": 0.6956, + "epoch": 1.4428341384863124, + "grad_norm": 0.275390625, + "learning_rate": 0.0002719037417870765, + "loss": 0.4916, "step": 896 }, { - "epoch": 1.7208413001912046, - "grad_norm": 0.2578125, - "learning_rate": 0.0002693426977359233, - "loss": 0.7143, + "epoch": 1.4492753623188406, + "grad_norm": 0.259765625, + "learning_rate": 0.00027164796961221015, + "loss": 0.5149, "step": 900 }, { - "epoch": 1.72848948374761, - "grad_norm": 0.251953125, - "learning_rate": 0.0002690664599773122, - "loss": 0.6973, + "epoch": 1.4557165861513688, + "grad_norm": 0.287109375, + "learning_rate": 0.0002713911599529039, + "loss": 0.5636, "step": 904 }, { - "epoch": 1.7361376673040154, - "grad_norm": 0.27734375, - "learning_rate": 0.00026878912627398434, - "loss": 0.68, + "epoch": 1.462157809983897, + "grad_norm": 0.275390625, + "learning_rate": 0.00027113331499937967, + "loss": 0.5191, "step": 908 }, { - "epoch": 1.7437858508604207, - "grad_norm": 0.248046875, - "learning_rate": 0.0002685106991786519, - "loss": 0.7011, + "epoch": 1.4685990338164252, + "grad_norm": 0.265625, + "learning_rate": 0.00027087443695068873, + "loss": 0.4786, "step": 912 }, { - "epoch": 1.751434034416826, - "grad_norm": 0.2578125, - "learning_rate": 0.00026823118125409107, - "loss": 0.6862, + "epoch": 1.4750402576489532, + "grad_norm": 0.30078125, + "learning_rate": 0.0002706145280146931, + "loss": 0.5033, "step": 916 }, { - "epoch": 1.7590822179732313, - "grad_norm": 0.271484375, - "learning_rate": 0.0002679505750731189, - "loss": 0.6929, + "epoch": 1.4814814814814814, + "grad_norm": 0.275390625, + "learning_rate": 0.00027035359040804703, + "loss": 0.4753, "step": 920 }, { - "epoch": 1.7667304015296366, - "grad_norm": 0.26953125, - "learning_rate": 0.00026766888321856896, - "loss": 0.6927, + "epoch": 1.4879227053140096, + "grad_norm": 0.279296875, + "learning_rate": 0.0002700916263561778, + "loss": 0.5255, "step": 924 }, { - "epoch": 1.774378585086042, - "grad_norm": 0.259765625, - "learning_rate": 0.000267386108283268, - "loss": 0.6618, + "epoch": 1.4943639291465378, + "grad_norm": 0.298828125, + "learning_rate": 0.0002698286380932667, + "loss": 0.5472, "step": 928 }, { - "epoch": 1.7820267686424476, - "grad_norm": 0.251953125, - "learning_rate": 0.0002671022528700118, - "loss": 0.7173, + "epoch": 1.500805152979066, + "grad_norm": 0.265625, + "learning_rate": 0.0002695646278622302, + "loss": 0.4944, "step": 932 }, { - "epoch": 1.7896749521988529, - "grad_norm": 0.23828125, - "learning_rate": 0.00026681731959154174, - "loss": 0.7314, + "epoch": 1.5072463768115942, + "grad_norm": 0.26953125, + "learning_rate": 0.0002692995979147007, + "loss": 0.4677, "step": 936 }, { - "epoch": 1.7973231357552581, - "grad_norm": 0.255859375, - "learning_rate": 0.00026653131107052, - "loss": 0.7013, + "epoch": 1.5136876006441224, + "grad_norm": 0.28515625, + "learning_rate": 0.00026903355051100734, + "loss": 0.5152, "step": 940 }, { - "epoch": 1.8049713193116634, - "grad_norm": 0.267578125, - "learning_rate": 0.00026624422993950603, - "loss": 0.7591, + "epoch": 1.5201288244766507, + "grad_norm": 0.279296875, + "learning_rate": 0.0002687664879201565, + "loss": 0.5287, "step": 944 }, { - "epoch": 1.8126195028680687, - "grad_norm": 0.279296875, - "learning_rate": 0.0002659560788409321, - "loss": 0.6398, + "epoch": 1.5265700483091789, + "grad_norm": 0.2734375, + "learning_rate": 0.00026849841241981313, + "loss": 0.5185, "step": 948 }, { - "epoch": 1.8202676864244742, - "grad_norm": 0.2890625, - "learning_rate": 0.0002656668604270788, - "loss": 0.6778, + "epoch": 1.533011272141707, + "grad_norm": 0.279296875, + "learning_rate": 0.00026822932629628034, + "loss": 0.4925, "step": 952 }, { - "epoch": 1.8279158699808795, - "grad_norm": 0.265625, - "learning_rate": 0.00026537657736005094, - "loss": 0.6543, + "epoch": 1.539452495974235, + "grad_norm": 0.279296875, + "learning_rate": 0.0002679592318444808, + "loss": 0.4938, "step": 956 }, { - "epoch": 1.835564053537285, - "grad_norm": 0.251953125, - "learning_rate": 0.000265085232311753, - "loss": 0.7096, + "epoch": 1.5458937198067633, + "grad_norm": 0.271484375, + "learning_rate": 0.0002676881313679366, + "loss": 0.4962, "step": 960 }, { - "epoch": 1.8432122370936903, - "grad_norm": 0.2314453125, - "learning_rate": 0.00026479282796386416, - "loss": 0.6939, + "epoch": 1.5523349436392915, + "grad_norm": 0.275390625, + "learning_rate": 0.0002674160271787498, + "loss": 0.4962, "step": 964 }, { - "epoch": 1.8508604206500956, - "grad_norm": 0.26171875, - "learning_rate": 0.00026449936700781413, - "loss": 0.728, + "epoch": 1.5587761674718197, + "grad_norm": 0.26953125, + "learning_rate": 0.0002671429215975828, + "loss": 0.5142, "step": 968 }, { - "epoch": 1.8585086042065009, - "grad_norm": 0.26171875, - "learning_rate": 0.0002642048521447581, - "loss": 0.6862, + "epoch": 1.5652173913043477, + "grad_norm": 0.28515625, + "learning_rate": 0.00026686881695363833, + "loss": 0.5361, "step": 972 }, { - "epoch": 1.8661567877629062, - "grad_norm": 0.25390625, - "learning_rate": 0.00026390928608555195, - "loss": 0.6767, + "epoch": 1.5716586151368759, + "grad_norm": 0.287109375, + "learning_rate": 0.0002665937155846399, + "loss": 0.519, "step": 976 }, { - "epoch": 1.8738049713193117, - "grad_norm": 0.314453125, - "learning_rate": 0.0002636126715507272, - "loss": 0.6229, + "epoch": 1.578099838969404, + "grad_norm": 0.2734375, + "learning_rate": 0.0002663176198368114, + "loss": 0.5055, "step": 980 }, { - "epoch": 1.8814531548757172, - "grad_norm": 0.26171875, - "learning_rate": 0.000263315011270466, - "loss": 0.6941, + "epoch": 1.5845410628019323, + "grad_norm": 0.2578125, + "learning_rate": 0.0002660405320648576, + "loss": 0.5256, "step": 984 }, { - "epoch": 1.8891013384321225, - "grad_norm": 0.267578125, - "learning_rate": 0.00026301630798457613, - "loss": 0.6854, + "epoch": 1.5909822866344605, + "grad_norm": 0.28125, + "learning_rate": 0.0002657624546319437, + "loss": 0.5103, "step": 988 }, { - "epoch": 1.8967495219885278, - "grad_norm": 0.26953125, - "learning_rate": 0.00026271656444246577, - "loss": 0.7136, + "epoch": 1.5974235104669887, + "grad_norm": 0.296875, + "learning_rate": 0.0002654833899096753, + "loss": 0.5249, "step": 992 }, { - "epoch": 1.904397705544933, - "grad_norm": 0.265625, - "learning_rate": 0.000262415783403118, - "loss": 0.696, + "epoch": 1.603864734299517, + "grad_norm": 0.330078125, + "learning_rate": 0.00026520334027807827, + "loss": 0.4895, "step": 996 }, { - "epoch": 1.9120458891013383, - "grad_norm": 0.25390625, - "learning_rate": 0.00026211396763506546, - "loss": 0.6688, + "epoch": 1.6103059581320451, + "grad_norm": 0.28125, + "learning_rate": 0.0002649223081255782, + "loss": 0.5061, "step": 1000 }, { - "epoch": 1.9196940726577438, - "grad_norm": 0.25390625, - "learning_rate": 0.0002618111199163651, - "loss": 0.6953, + "epoch": 1.6167471819645733, + "grad_norm": 0.28515625, + "learning_rate": 0.00026464029584898036, + "loss": 0.4781, "step": 1004 }, { - "epoch": 1.9273422562141491, - "grad_norm": 0.25, - "learning_rate": 0.00026150724303457235, - "loss": 0.7481, + "epoch": 1.6231884057971016, + "grad_norm": 0.275390625, + "learning_rate": 0.00026435730585344896, + "loss": 0.4885, "step": 1008 }, { - "epoch": 1.9349904397705546, - "grad_norm": 0.279296875, - "learning_rate": 0.0002612023397867155, - "loss": 0.675, + "epoch": 1.6296296296296298, + "grad_norm": 0.27734375, + "learning_rate": 0.0002640733405524869, + "loss": 0.5188, "step": 1012 }, { - "epoch": 1.94263862332696, - "grad_norm": 0.2470703125, - "learning_rate": 0.00026089641297927, - "loss": 0.6684, + "epoch": 1.636070853462158, + "grad_norm": 0.296875, + "learning_rate": 0.00026378840236791485, + "loss": 0.5386, "step": 1016 }, { - "epoch": 1.9502868068833652, - "grad_norm": 0.302734375, - "learning_rate": 0.0002605894654281329, - "loss": 0.6547, + "epoch": 1.642512077294686, + "grad_norm": 0.2734375, + "learning_rate": 0.000263502493729851, + "loss": 0.5438, "step": 1020 }, { - "epoch": 1.9579349904397705, - "grad_norm": 0.267578125, - "learning_rate": 0.0002602814999585963, - "loss": 0.7232, + "epoch": 1.6489533011272142, + "grad_norm": 0.279296875, + "learning_rate": 0.00026321561707668995, + "loss": 0.5121, "step": 1024 }, { - "epoch": 1.9655831739961758, - "grad_norm": 0.25390625, - "learning_rate": 0.0002599725194053219, - "loss": 0.7069, + "epoch": 1.6553945249597424, + "grad_norm": 0.26953125, + "learning_rate": 0.0002629277748550823, + "loss": 0.4868, "step": 1028 }, { - "epoch": 1.9732313575525813, - "grad_norm": 0.26171875, - "learning_rate": 0.0002596625266123146, - "loss": 0.7303, + "epoch": 1.6618357487922706, + "grad_norm": 0.287109375, + "learning_rate": 0.0002626389695199134, + "loss": 0.5199, "step": 1032 }, { - "epoch": 1.9808795411089866, - "grad_norm": 0.263671875, - "learning_rate": 0.00025935152443289664, - "loss": 0.7096, + "epoch": 1.6682769726247986, + "grad_norm": 0.283203125, + "learning_rate": 0.0002623492035342826, + "loss": 0.5424, "step": 1036 }, { - "epoch": 1.988527724665392, - "grad_norm": 0.259765625, - "learning_rate": 0.00025903951572968094, - "loss": 0.7055, + "epoch": 1.6747181964573268, + "grad_norm": 0.279296875, + "learning_rate": 0.00026205847936948244, + "loss": 0.4983, "step": 1040 }, { - "epoch": 1.9961759082217974, - "grad_norm": 0.265625, - "learning_rate": 0.00025872650337454504, - "loss": 0.7108, + "epoch": 1.681159420289855, + "grad_norm": 0.27734375, + "learning_rate": 0.00026176679950497706, + "loss": 0.5323, "step": 1044 }, { - "epoch": 2.0038240917782026, - "grad_norm": 0.244140625, - "learning_rate": 0.00025841249024860453, - "loss": 0.5808, + "epoch": 1.6876006441223832, + "grad_norm": 0.294921875, + "learning_rate": 0.0002614741664283816, + "loss": 0.5964, "step": 1048 }, { - "epoch": 2.011472275334608, - "grad_norm": 0.294921875, - "learning_rate": 0.00025809747924218667, - "loss": 0.58, + "epoch": 1.6940418679549114, + "grad_norm": 0.287109375, + "learning_rate": 0.00026118058263544056, + "loss": 0.5227, "step": 1052 }, { - "epoch": 2.019120458891013, - "grad_norm": 0.265625, - "learning_rate": 0.00025778147325480357, - "loss": 0.6208, + "epoch": 1.7004830917874396, + "grad_norm": 0.30859375, + "learning_rate": 0.00026088605063000696, + "loss": 0.464, "step": 1056 }, { - "epoch": 2.026768642447419, - "grad_norm": 0.2578125, - "learning_rate": 0.0002574644751951256, - "loss": 0.5692, + "epoch": 1.7069243156199678, + "grad_norm": 0.267578125, + "learning_rate": 0.0002605905729240205, + "loss": 0.4978, "step": 1060 }, { - "epoch": 2.0344168260038242, - "grad_norm": 0.2734375, - "learning_rate": 0.00025714648798095483, - "loss": 0.5891, + "epoch": 1.713365539452496, + "grad_norm": 0.337890625, + "learning_rate": 0.00026029415203748633, + "loss": 0.4983, "step": 1064 }, { - "epoch": 2.0420650095602295, - "grad_norm": 0.2412109375, - "learning_rate": 0.00025682751453919776, - "loss": 0.6214, + "epoch": 1.7198067632850242, + "grad_norm": 0.3125, + "learning_rate": 0.0002599967904984539, + "loss": 0.5166, "step": 1068 }, { - "epoch": 2.049713193116635, - "grad_norm": 0.279296875, - "learning_rate": 0.0002565075578058388, - "loss": 0.5889, + "epoch": 1.7262479871175525, + "grad_norm": 0.296875, + "learning_rate": 0.00025969849084299466, + "loss": 0.5683, "step": 1072 }, { - "epoch": 2.05736137667304, - "grad_norm": 0.275390625, - "learning_rate": 0.0002561866207259128, - "loss": 0.6099, + "epoch": 1.7326892109500807, + "grad_norm": 0.287109375, + "learning_rate": 0.00025939925561518126, + "loss": 0.486, "step": 1076 }, { - "epoch": 2.0650095602294454, - "grad_norm": 0.271484375, - "learning_rate": 0.0002558647062534785, - "loss": 0.5722, + "epoch": 1.7391304347826086, + "grad_norm": 0.279296875, + "learning_rate": 0.0002590990873670652, + "loss": 0.4655, "step": 1080 }, { - "epoch": 2.0726577437858507, - "grad_norm": 0.279296875, - "learning_rate": 0.0002555418173515908, - "loss": 0.609, + "epoch": 1.7455716586151369, + "grad_norm": 0.26953125, + "learning_rate": 0.00025879798865865533, + "loss": 0.4689, "step": 1084 }, { - "epoch": 2.0803059273422564, - "grad_norm": 0.291015625, - "learning_rate": 0.0002552179569922737, - "loss": 0.6158, + "epoch": 1.752012882447665, + "grad_norm": 0.283203125, + "learning_rate": 0.0002584959620578962, + "loss": 0.424, "step": 1088 }, { - "epoch": 2.0879541108986617, - "grad_norm": 0.2734375, - "learning_rate": 0.00025489312815649314, - "loss": 0.588, + "epoch": 1.7584541062801933, + "grad_norm": 0.279296875, + "learning_rate": 0.00025819301014064574, + "loss": 0.5134, "step": 1092 }, { - "epoch": 2.095602294455067, - "grad_norm": 0.26953125, - "learning_rate": 0.00025456733383412926, - "loss": 0.6278, + "epoch": 1.7648953301127213, + "grad_norm": 0.279296875, + "learning_rate": 0.0002578891354906537, + "loss": 0.4893, "step": 1096 }, { - "epoch": 2.1032504780114722, - "grad_norm": 0.26953125, - "learning_rate": 0.000254240577023949, - "loss": 0.5915, + "epoch": 1.7713365539452495, + "grad_norm": 0.279296875, + "learning_rate": 0.00025758434069953927, + "loss": 0.4887, "step": 1100 }, { - "epoch": 2.1108986615678775, - "grad_norm": 0.26953125, - "learning_rate": 0.00025391286073357856, - "loss": 0.5764, + "epoch": 1.7777777777777777, + "grad_norm": 0.283203125, + "learning_rate": 0.0002572786283667692, + "loss": 0.5153, "step": 1104 }, { - "epoch": 2.118546845124283, - "grad_norm": 0.279296875, - "learning_rate": 0.0002535841879794755, - "loss": 0.6146, + "epoch": 1.7842190016103059, + "grad_norm": 0.28125, + "learning_rate": 0.00025697200109963563, + "loss": 0.5056, "step": 1108 }, { - "epoch": 2.126195028680688, - "grad_norm": 0.287109375, - "learning_rate": 0.0002532545617869014, - "loss": 0.5794, + "epoch": 1.790660225442834, + "grad_norm": 0.275390625, + "learning_rate": 0.0002566644615132337, + "loss": 0.5319, "step": 1112 }, { - "epoch": 2.133843212237094, - "grad_norm": 0.259765625, - "learning_rate": 0.0002529239851898935, - "loss": 0.6412, + "epoch": 1.7971014492753623, + "grad_norm": 0.265625, + "learning_rate": 0.00025635601223043933, + "loss": 0.5182, "step": 1116 }, { - "epoch": 2.141491395793499, - "grad_norm": 0.287109375, - "learning_rate": 0.00025259246123123706, - "loss": 0.6288, + "epoch": 1.8035426731078905, + "grad_norm": 0.28515625, + "learning_rate": 0.000256046655881887, + "loss": 0.5028, "step": 1120 }, { - "epoch": 2.1491395793499044, - "grad_norm": 0.2578125, - "learning_rate": 0.0002522599929624375, - "loss": 0.5644, + "epoch": 1.8099838969404187, + "grad_norm": 0.283203125, + "learning_rate": 0.000255736395105947, + "loss": 0.5006, "step": 1124 }, { - "epoch": 2.1567877629063097, - "grad_norm": 0.27734375, - "learning_rate": 0.00025192658344369193, - "loss": 0.6219, + "epoch": 1.816425120772947, + "grad_norm": 0.279296875, + "learning_rate": 0.0002554252325487032, + "loss": 0.5234, "step": 1128 }, { - "epoch": 2.164435946462715, - "grad_norm": 0.275390625, - "learning_rate": 0.00025159223574386114, - "loss": 0.6015, + "epoch": 1.8228663446054751, + "grad_norm": 0.294921875, + "learning_rate": 0.0002551131708639303, + "loss": 0.5544, "step": 1132 }, { - "epoch": 2.1720841300191203, - "grad_norm": 0.265625, - "learning_rate": 0.00025125695294044156, - "loss": 0.612, + "epoch": 1.8293075684380034, + "grad_norm": 0.28125, + "learning_rate": 0.00025480021271307156, + "loss": 0.4766, "step": 1136 }, { - "epoch": 2.179732313575526, - "grad_norm": 0.263671875, - "learning_rate": 0.0002509207381195366, - "loss": 0.5878, + "epoch": 1.8357487922705316, + "grad_norm": 0.283203125, + "learning_rate": 0.00025448636076521534, + "loss": 0.4615, "step": 1140 }, { - "epoch": 2.1873804971319313, - "grad_norm": 0.26171875, - "learning_rate": 0.0002505835943758286, - "loss": 0.6134, + "epoch": 1.8421900161030595, + "grad_norm": 0.27734375, + "learning_rate": 0.0002541716176970732, + "loss": 0.504, "step": 1144 }, { - "epoch": 2.1950286806883366, - "grad_norm": 0.28125, - "learning_rate": 0.00025024552481254993, - "loss": 0.5663, + "epoch": 1.8486312399355878, + "grad_norm": 0.294921875, + "learning_rate": 0.0002538559861929566, + "loss": 0.5873, "step": 1148 }, { - "epoch": 2.202676864244742, - "grad_norm": 0.271484375, - "learning_rate": 0.0002499065325414547, - "loss": 0.5685, + "epoch": 1.855072463768116, + "grad_norm": 0.275390625, + "learning_rate": 0.000253539468944754, + "loss": 0.5917, "step": 1152 }, { - "epoch": 2.210325047801147, - "grad_norm": 0.28515625, - "learning_rate": 0.00024956662068279027, - "loss": 0.5839, + "epoch": 1.8615136876006442, + "grad_norm": 0.275390625, + "learning_rate": 0.0002532220686519081, + "loss": 0.4924, "step": 1156 }, { - "epoch": 2.2179732313575524, - "grad_norm": 0.306640625, - "learning_rate": 0.00024922579236526807, - "loss": 0.611, + "epoch": 1.8679549114331722, + "grad_norm": 0.296875, + "learning_rate": 0.00025290378802139273, + "loss": 0.4582, "step": 1160 }, { - "epoch": 2.2256214149139577, - "grad_norm": 0.279296875, - "learning_rate": 0.00024888405072603513, - "loss": 0.6218, + "epoch": 1.8743961352657004, + "grad_norm": 0.3203125, + "learning_rate": 0.0002525846297676896, + "loss": 0.5639, "step": 1164 }, { - "epoch": 2.2332695984703634, - "grad_norm": 0.28125, - "learning_rate": 0.0002485413989106452, - "loss": 0.5743, + "epoch": 1.8808373590982286, + "grad_norm": 0.275390625, + "learning_rate": 0.0002522645966127655, + "loss": 0.5198, "step": 1168 }, { - "epoch": 2.2409177820267687, - "grad_norm": 0.29296875, - "learning_rate": 0.00024819784007302966, - "loss": 0.5921, + "epoch": 1.8872785829307568, + "grad_norm": 0.26953125, + "learning_rate": 0.0002519436912860488, + "loss": 0.4766, "step": 1172 }, { - "epoch": 2.248565965583174, - "grad_norm": 0.265625, - "learning_rate": 0.00024785337737546863, - "loss": 0.5444, + "epoch": 1.893719806763285, + "grad_norm": 0.28515625, + "learning_rate": 0.0002516219165244062, + "loss": 0.4583, "step": 1176 }, { - "epoch": 2.2562141491395793, - "grad_norm": 0.271484375, - "learning_rate": 0.0002475080139885617, - "loss": 0.5823, + "epoch": 1.9001610305958132, + "grad_norm": 0.30078125, + "learning_rate": 0.0002512992750721195, + "loss": 0.549, "step": 1180 }, { - "epoch": 2.2638623326959846, - "grad_norm": 0.2734375, - "learning_rate": 0.00024716175309119875, - "loss": 0.5788, + "epoch": 1.9066022544283414, + "grad_norm": 0.287109375, + "learning_rate": 0.0002509757696808622, + "loss": 0.4792, "step": 1184 }, { - "epoch": 2.27151051625239, - "grad_norm": 0.26171875, - "learning_rate": 0.00024681459787053106, - "loss": 0.5666, + "epoch": 1.9130434782608696, + "grad_norm": 0.27734375, + "learning_rate": 0.0002506514031096758, + "loss": 0.4834, "step": 1188 }, { - "epoch": 2.2791586998087956, - "grad_norm": 0.298828125, - "learning_rate": 0.0002464665515219415, - "loss": 0.5988, + "epoch": 1.9194847020933978, + "grad_norm": 0.279296875, + "learning_rate": 0.00025032617812494664, + "loss": 0.4969, "step": 1192 }, { - "epoch": 2.286806883365201, - "grad_norm": 0.3125, - "learning_rate": 0.0002461176172490153, - "loss": 0.5851, + "epoch": 1.925925925925926, + "grad_norm": 0.29296875, + "learning_rate": 0.00025000009750038196, + "loss": 0.5553, "step": 1196 }, { - "epoch": 2.294455066921606, - "grad_norm": 0.310546875, - "learning_rate": 0.0002457677982635107, - "loss": 0.6328, + "epoch": 1.9323671497584543, + "grad_norm": 0.294921875, + "learning_rate": 0.00024967316401698647, + "loss": 0.536, "step": 1200 }, { - "epoch": 2.3021032504780115, - "grad_norm": 0.28515625, - "learning_rate": 0.000245417097785329, - "loss": 0.6264, + "epoch": 1.9388083735909822, + "grad_norm": 0.265625, + "learning_rate": 0.00024934538046303856, + "loss": 0.4848, "step": 1204 }, { - "epoch": 2.3097514340344167, - "grad_norm": 0.26171875, - "learning_rate": 0.00024506551904248546, - "loss": 0.5853, + "epoch": 1.9452495974235104, + "grad_norm": 0.28515625, + "learning_rate": 0.0002490167496340664, + "loss": 0.4984, "step": 1208 }, { - "epoch": 2.317399617590822, - "grad_norm": 0.2890625, - "learning_rate": 0.00024471306527107915, - "loss": 0.571, + "epoch": 1.9516908212560387, + "grad_norm": 0.2734375, + "learning_rate": 0.0002486872743328244, + "loss": 0.4993, "step": 1212 }, { - "epoch": 2.3250478011472273, - "grad_norm": 0.2890625, - "learning_rate": 0.0002443597397152634, - "loss": 0.6076, + "epoch": 1.9581320450885669, + "grad_norm": 0.296875, + "learning_rate": 0.000248356957369269, + "loss": 0.5265, "step": 1216 }, { - "epoch": 2.332695984703633, - "grad_norm": 0.271484375, - "learning_rate": 0.00024400554562721585, - "loss": 0.6026, + "epoch": 1.9645732689210949, + "grad_norm": 0.296875, + "learning_rate": 0.0002480258015605349, + "loss": 0.5287, "step": 1220 }, { - "epoch": 2.3403441682600383, - "grad_norm": 0.279296875, - "learning_rate": 0.00024365048626710843, - "loss": 0.6196, + "epoch": 1.971014492753623, + "grad_norm": 0.287109375, + "learning_rate": 0.0002476938097309108, + "loss": 0.5616, "step": 1224 }, { - "epoch": 2.3479923518164436, - "grad_norm": 0.29296875, - "learning_rate": 0.00024329456490307757, - "loss": 0.5704, + "epoch": 1.9774557165861513, + "grad_norm": 0.291015625, + "learning_rate": 0.0002473609847118156, + "loss": 0.4542, "step": 1228 }, { - "epoch": 2.355640535372849, - "grad_norm": 0.291015625, - "learning_rate": 0.00024293778481119396, - "loss": 0.6217, + "epoch": 1.9838969404186795, + "grad_norm": 0.26171875, + "learning_rate": 0.0002470273293417741, + "loss": 0.4813, "step": 1232 }, { - "epoch": 2.363288718929254, - "grad_norm": 0.27734375, - "learning_rate": 0.0002425801492754324, - "loss": 0.6238, + "epoch": 1.9903381642512077, + "grad_norm": 0.3125, + "learning_rate": 0.00024669284646639287, + "loss": 0.5336, "step": 1236 }, { - "epoch": 2.3709369024856595, - "grad_norm": 0.275390625, - "learning_rate": 0.00024222166158764161, - "loss": 0.5945, + "epoch": 1.996779388083736, + "grad_norm": 0.279296875, + "learning_rate": 0.00024635753893833585, + "loss": 0.5528, "step": 1240 }, { - "epoch": 2.378585086042065, - "grad_norm": 0.294921875, - "learning_rate": 0.00024186232504751397, - "loss": 0.5605, + "epoch": 2.003220611916264, + "grad_norm": 0.2265625, + "learning_rate": 0.00024602140961730006, + "loss": 0.4706, "step": 1244 }, { - "epoch": 2.3862332695984705, - "grad_norm": 0.287109375, - "learning_rate": 0.0002415021429625551, - "loss": 0.6451, + "epoch": 2.0096618357487923, + "grad_norm": 0.279296875, + "learning_rate": 0.00024568446136999134, + "loss": 0.4093, "step": 1248 }, { - "epoch": 2.3938814531548758, - "grad_norm": 0.279296875, - "learning_rate": 0.00024114111864805338, - "loss": 0.5581, + "epoch": 2.0161030595813205, + "grad_norm": 0.275390625, + "learning_rate": 0.00024534669707009974, + "loss": 0.3899, "step": 1252 }, { - "epoch": 2.401529636711281, - "grad_norm": 0.259765625, - "learning_rate": 0.00024077925542704949, - "loss": 0.6212, + "epoch": 2.0225442834138487, + "grad_norm": 0.26953125, + "learning_rate": 0.0002450081195982752, + "loss": 0.361, "step": 1256 }, { - "epoch": 2.4091778202676863, - "grad_norm": 0.287109375, - "learning_rate": 0.0002404165566303057, - "loss": 0.6121, + "epoch": 2.028985507246377, + "grad_norm": 0.267578125, + "learning_rate": 0.00024466873184210273, + "loss": 0.3999, "step": 1260 }, { - "epoch": 2.4168260038240916, - "grad_norm": 0.33203125, - "learning_rate": 0.00024005302559627561, - "loss": 0.578, + "epoch": 2.035426731078905, + "grad_norm": 0.28125, + "learning_rate": 0.00024432853669607786, + "loss": 0.3753, "step": 1264 }, { - "epoch": 2.424474187380497, - "grad_norm": 0.291015625, - "learning_rate": 0.00023968866567107282, - "loss": 0.6353, + "epoch": 2.0418679549114334, + "grad_norm": 0.26953125, + "learning_rate": 0.00024398753706158225, + "loss": 0.3951, "step": 1268 }, { - "epoch": 2.4321223709369026, - "grad_norm": 0.310546875, - "learning_rate": 0.00023932348020844064, - "loss": 0.5811, + "epoch": 2.0483091787439616, + "grad_norm": 0.291015625, + "learning_rate": 0.00024364573584685848, + "loss": 0.3791, "step": 1272 }, { - "epoch": 2.439770554493308, - "grad_norm": 0.302734375, - "learning_rate": 0.00023895747256972083, - "loss": 0.5834, + "epoch": 2.0547504025764893, + "grad_norm": 0.28125, + "learning_rate": 0.00024330313596698553, + "loss": 0.4148, "step": 1276 }, { - "epoch": 2.447418738049713, - "grad_norm": 0.279296875, - "learning_rate": 0.00023859064612382315, - "loss": 0.6234, + "epoch": 2.0611916264090175, + "grad_norm": 0.271484375, + "learning_rate": 0.00024295974034385396, + "loss": 0.3767, "step": 1280 }, { - "epoch": 2.4550669216061185, - "grad_norm": 0.2890625, - "learning_rate": 0.0002382230042471938, - "loss": 0.6298, + "epoch": 2.0676328502415457, + "grad_norm": 0.283203125, + "learning_rate": 0.00024261555190614072, + "loss": 0.3743, "step": 1284 }, { - "epoch": 2.462715105162524, - "grad_norm": 0.326171875, - "learning_rate": 0.0002378545503237846, - "loss": 0.6834, + "epoch": 2.074074074074074, + "grad_norm": 0.287109375, + "learning_rate": 0.00024227057358928452, + "loss": 0.3847, "step": 1288 }, { - "epoch": 2.470363288718929, - "grad_norm": 0.296875, - "learning_rate": 0.00023748528774502194, - "loss": 0.6176, + "epoch": 2.080515297906602, + "grad_norm": 0.279296875, + "learning_rate": 0.00024192480833546044, + "loss": 0.3627, "step": 1292 }, { - "epoch": 2.478011472275335, - "grad_norm": 0.28125, - "learning_rate": 0.00023711521990977554, - "loss": 0.6141, + "epoch": 2.0869565217391304, + "grad_norm": 0.275390625, + "learning_rate": 0.00024157825909355523, + "loss": 0.4324, "step": 1296 }, { - "epoch": 2.48565965583174, - "grad_norm": 0.2734375, - "learning_rate": 0.00023674435022432683, - "loss": 0.5958, + "epoch": 2.0933977455716586, + "grad_norm": 0.29296875, + "learning_rate": 0.0002412309288191417, + "loss": 0.4302, "step": 1300 }, { - "epoch": 2.4933078393881454, - "grad_norm": 0.310546875, - "learning_rate": 0.0002363726821023381, - "loss": 0.6249, + "epoch": 2.099838969404187, + "grad_norm": 0.3046875, + "learning_rate": 0.00024088282047445396, + "loss": 0.3788, "step": 1304 }, { - "epoch": 2.5009560229445507, - "grad_norm": 0.28515625, - "learning_rate": 0.00023600021896482063, - "loss": 0.5865, + "epoch": 2.106280193236715, + "grad_norm": 0.28125, + "learning_rate": 0.00024053393702836185, + "loss": 0.399, "step": 1308 }, { - "epoch": 2.508604206500956, - "grad_norm": 0.267578125, - "learning_rate": 0.0002356269642401036, - "loss": 0.5982, + "epoch": 2.112721417069243, + "grad_norm": 0.2890625, + "learning_rate": 0.0002401842814563457, + "loss": 0.387, "step": 1312 }, { - "epoch": 2.5162523900573612, + "epoch": 2.1191626409017714, "grad_norm": 0.287109375, - "learning_rate": 0.0002352529213638022, - "loss": 0.5722, + "learning_rate": 0.00023983385674047113, + "loss": 0.3905, "step": 1316 }, { - "epoch": 2.5239005736137665, - "grad_norm": 0.296875, - "learning_rate": 0.0002348780937787862, - "loss": 0.6308, + "epoch": 2.1256038647342996, + "grad_norm": 0.298828125, + "learning_rate": 0.00023948266586936324, + "loss": 0.3715, "step": 1320 }, { - "epoch": 2.5315487571701722, - "grad_norm": 0.28515625, - "learning_rate": 0.00023450248493514817, - "loss": 0.6104, + "epoch": 2.132045088566828, + "grad_norm": 0.314453125, + "learning_rate": 0.00023913071183818155, + "loss": 0.4474, "step": 1324 }, { - "epoch": 2.5391969407265775, - "grad_norm": 0.306640625, - "learning_rate": 0.00023412609829017182, - "loss": 0.6053, + "epoch": 2.138486312399356, + "grad_norm": 0.291015625, + "learning_rate": 0.00023877799764859416, + "loss": 0.3759, "step": 1328 }, { - "epoch": 2.546845124282983, - "grad_norm": 0.294921875, - "learning_rate": 0.00023374893730830005, - "loss": 0.5447, + "epoch": 2.1449275362318843, + "grad_norm": 0.30859375, + "learning_rate": 0.00023842452630875216, + "loss": 0.373, "step": 1332 }, { - "epoch": 2.554493307839388, - "grad_norm": 0.294921875, - "learning_rate": 0.00023337100546110313, - "loss": 0.5987, + "epoch": 2.151368760064412, + "grad_norm": 0.296875, + "learning_rate": 0.0002380703008332643, + "loss": 0.4218, "step": 1336 }, { - "epoch": 2.5621414913957934, - "grad_norm": 0.29296875, - "learning_rate": 0.0002329923062272468, - "loss": 0.6037, + "epoch": 2.1578099838969402, + "grad_norm": 0.306640625, + "learning_rate": 0.0002377153242431708, + "loss": 0.4234, "step": 1340 }, { - "epoch": 2.569789674952199, - "grad_norm": 0.298828125, - "learning_rate": 0.0002326128430924602, - "loss": 0.6055, + "epoch": 2.1642512077294684, + "grad_norm": 0.31640625, + "learning_rate": 0.00023735959956591786, + "loss": 0.3971, "step": 1344 }, { - "epoch": 2.5774378585086044, - "grad_norm": 0.291015625, - "learning_rate": 0.00023223261954950363, - "loss": 0.6163, + "epoch": 2.1706924315619966, + "grad_norm": 0.322265625, + "learning_rate": 0.0002370031298353319, + "loss": 0.4211, "step": 1348 }, { - "epoch": 2.5850860420650097, - "grad_norm": 0.298828125, - "learning_rate": 0.00023185163909813678, - "loss": 0.6272, + "epoch": 2.177133655394525, + "grad_norm": 0.2890625, + "learning_rate": 0.00023664591809159353, + "loss": 0.3972, "step": 1352 }, { - "epoch": 2.592734225621415, - "grad_norm": 0.28125, - "learning_rate": 0.00023146990524508613, - "loss": 0.6303, + "epoch": 2.183574879227053, + "grad_norm": 0.306640625, + "learning_rate": 0.00023628796738121169, + "loss": 0.4185, "step": 1356 }, { - "epoch": 2.6003824091778203, - "grad_norm": 0.279296875, - "learning_rate": 0.00023108742150401284, - "loss": 0.5862, + "epoch": 2.1900161030595813, + "grad_norm": 0.29296875, + "learning_rate": 0.00023592928075699763, + "loss": 0.402, "step": 1360 }, { - "epoch": 2.6080305927342256, - "grad_norm": 0.291015625, - "learning_rate": 0.00023070419139548044, - "loss": 0.6083, + "epoch": 2.1964573268921095, + "grad_norm": 0.30078125, + "learning_rate": 0.00023556986127803894, + "loss": 0.4056, "step": 1364 }, { - "epoch": 2.615678776290631, - "grad_norm": 0.298828125, - "learning_rate": 0.00023032021844692242, - "loss": 0.6466, + "epoch": 2.2028985507246377, + "grad_norm": 0.345703125, + "learning_rate": 0.00023520971200967334, + "loss": 0.4506, "step": 1368 }, { - "epoch": 2.623326959847036, + "epoch": 2.209339774557166, "grad_norm": 0.302734375, - "learning_rate": 0.0002299355061926096, - "loss": 0.6236, + "learning_rate": 0.00023484883602346274, + "loss": 0.4093, "step": 1372 }, { - "epoch": 2.6309751434034414, - "grad_norm": 0.28125, - "learning_rate": 0.00022955005817361783, - "loss": 0.6203, + "epoch": 2.215780998389694, + "grad_norm": 0.310546875, + "learning_rate": 0.0002344872363971668, + "loss": 0.4717, "step": 1376 }, { - "epoch": 2.638623326959847, - "grad_norm": 0.283203125, - "learning_rate": 0.00022916387793779533, - "loss": 0.6101, + "epoch": 2.2222222222222223, + "grad_norm": 0.298828125, + "learning_rate": 0.00023412491621471694, + "loss": 0.3948, "step": 1380 }, { - "epoch": 2.6462715105162524, - "grad_norm": 0.2890625, - "learning_rate": 0.00022877696903972984, - "loss": 0.5361, + "epoch": 2.2286634460547505, + "grad_norm": 0.29296875, + "learning_rate": 0.00023376187856618972, + "loss": 0.3925, "step": 1384 }, { - "epoch": 2.6539196940726577, - "grad_norm": 0.2890625, - "learning_rate": 0.00022838933504071618, - "loss": 0.594, + "epoch": 2.2351046698872787, + "grad_norm": 0.283203125, + "learning_rate": 0.00023339812654778083, + "loss": 0.4324, "step": 1388 }, { - "epoch": 2.661567877629063, - "grad_norm": 0.28125, - "learning_rate": 0.0002280009795087233, - "loss": 0.6323, + "epoch": 2.241545893719807, + "grad_norm": 0.314453125, + "learning_rate": 0.0002330336632617784, + "loss": 0.4557, "step": 1392 }, { - "epoch": 2.6692160611854687, - "grad_norm": 0.267578125, - "learning_rate": 0.00022761190601836142, - "loss": 0.5863, + "epoch": 2.247987117552335, + "grad_norm": 0.310546875, + "learning_rate": 0.00023266849181653683, + "loss": 0.4301, "step": 1396 }, { - "epoch": 2.676864244741874, - "grad_norm": 0.30078125, - "learning_rate": 0.00022722211815084944, - "loss": 0.6109, + "epoch": 2.2544283413848634, + "grad_norm": 0.275390625, + "learning_rate": 0.00023230261532644985, + "loss": 0.3799, "step": 1400 }, { - "epoch": 2.6845124282982793, - "grad_norm": 0.3046875, - "learning_rate": 0.0002268316194939815, - "loss": 0.6062, + "epoch": 2.260869565217391, + "grad_norm": 0.287109375, + "learning_rate": 0.0002319360369119245, + "loss": 0.3826, "step": 1404 }, { - "epoch": 2.6921606118546846, - "grad_norm": 0.3046875, - "learning_rate": 0.0002264404136420941, - "loss": 0.5996, + "epoch": 2.2673107890499193, + "grad_norm": 0.291015625, + "learning_rate": 0.00023156875969935405, + "loss": 0.3862, "step": 1408 }, { - "epoch": 2.69980879541109, - "grad_norm": 0.30859375, - "learning_rate": 0.0002260485041960334, - "loss": 0.6301, + "epoch": 2.2737520128824475, + "grad_norm": 0.28125, + "learning_rate": 0.00023120078682109158, + "loss": 0.4269, "step": 1412 }, { - "epoch": 2.707456978967495, - "grad_norm": 0.2890625, - "learning_rate": 0.00022565589476312157, - "loss": 0.5807, + "epoch": 2.2801932367149758, + "grad_norm": 0.302734375, + "learning_rate": 0.00023083212141542328, + "loss": 0.4139, "step": 1416 }, { - "epoch": 2.7151051625239004, - "grad_norm": 0.28515625, - "learning_rate": 0.00022526258895712377, - "loss": 0.6542, + "epoch": 2.286634460547504, + "grad_norm": 0.296875, + "learning_rate": 0.00023046276662654143, + "loss": 0.3579, "step": 1420 }, { - "epoch": 2.7227533460803057, - "grad_norm": 0.283203125, - "learning_rate": 0.00022486859039821513, - "loss": 0.6001, + "epoch": 2.293075684380032, + "grad_norm": 0.287109375, + "learning_rate": 0.00023009272560451803, + "loss": 0.4, "step": 1424 }, { - "epoch": 2.730401529636711, - "grad_norm": 0.30078125, - "learning_rate": 0.00022447390271294697, - "loss": 0.5997, + "epoch": 2.2995169082125604, + "grad_norm": 0.298828125, + "learning_rate": 0.00022972200150527745, + "loss": 0.3937, "step": 1428 }, { - "epoch": 2.7380497131931167, - "grad_norm": 0.2890625, - "learning_rate": 0.00022407852953421382, - "loss": 0.6401, + "epoch": 2.3059581320450886, + "grad_norm": 0.318359375, + "learning_rate": 0.00022935059749056992, + "loss": 0.4553, "step": 1432 }, { - "epoch": 2.745697896749522, - "grad_norm": 0.283203125, - "learning_rate": 0.00022368247450121965, - "loss": 0.5744, + "epoch": 2.312399355877617, + "grad_norm": 0.259765625, + "learning_rate": 0.00022897851672794417, + "loss": 0.396, "step": 1436 }, { - "epoch": 2.7533460803059273, - "grad_norm": 0.28125, - "learning_rate": 0.00022328574125944476, - "loss": 0.5853, + "epoch": 2.318840579710145, + "grad_norm": 0.3046875, + "learning_rate": 0.00022860576239072084, + "loss": 0.5137, "step": 1440 }, { - "epoch": 2.7609942638623326, - "grad_norm": 0.302734375, - "learning_rate": 0.00022288833346061182, - "loss": 0.5861, + "epoch": 2.325281803542673, + "grad_norm": 0.28515625, + "learning_rate": 0.00022823233765796502, + "loss": 0.4085, "step": 1444 }, { - "epoch": 2.768642447418738, - "grad_norm": 0.28515625, - "learning_rate": 0.0002224902547626526, - "loss": 0.6194, + "epoch": 2.3317230273752014, + "grad_norm": 0.302734375, + "learning_rate": 0.0002278582457144595, + "loss": 0.3963, "step": 1448 }, { - "epoch": 2.7762906309751436, - "grad_norm": 0.298828125, - "learning_rate": 0.00022209150882967398, - "loss": 0.604, + "epoch": 2.3381642512077296, + "grad_norm": 0.306640625, + "learning_rate": 0.00022748348975067733, + "loss": 0.4377, "step": 1452 }, { - "epoch": 2.783938814531549, - "grad_norm": 0.29296875, - "learning_rate": 0.00022169209933192458, - "loss": 0.6312, + "epoch": 2.3446054750402574, + "grad_norm": 0.322265625, + "learning_rate": 0.00022710807296275472, + "loss": 0.4275, "step": 1456 }, { - "epoch": 2.791586998087954, - "grad_norm": 0.287109375, - "learning_rate": 0.0002212920299457606, - "loss": 0.6312, + "epoch": 2.3510466988727856, + "grad_norm": 0.310546875, + "learning_rate": 0.0002267319985524637, + "loss": 0.4089, "step": 1460 }, { - "epoch": 2.7992351816443595, - "grad_norm": 0.294921875, - "learning_rate": 0.0002208913043536123, - "loss": 0.6089, + "epoch": 2.357487922705314, + "grad_norm": 0.30859375, + "learning_rate": 0.00022635526972718508, + "loss": 0.4386, "step": 1464 }, { - "epoch": 2.8068833652007648, - "grad_norm": 0.3125, - "learning_rate": 0.00022048992624394988, - "loss": 0.6349, + "epoch": 2.363929146537842, + "grad_norm": 0.294921875, + "learning_rate": 0.0002259778896998807, + "loss": 0.4172, "step": 1468 }, { - "epoch": 2.81453154875717, - "grad_norm": 0.29296875, - "learning_rate": 0.00022008789931124976, - "loss": 0.591, + "epoch": 2.3703703703703702, + "grad_norm": 0.34375, + "learning_rate": 0.00022559986168906637, + "loss": 0.4022, "step": 1472 }, { - "epoch": 2.8221797323135753, - "grad_norm": 0.28515625, - "learning_rate": 0.0002196852272559603, - "loss": 0.6085, + "epoch": 2.3768115942028984, + "grad_norm": 0.2890625, + "learning_rate": 0.00022522118891878418, + "loss": 0.4665, "step": 1476 }, { - "epoch": 2.8298279158699806, - "grad_norm": 0.28125, - "learning_rate": 0.00021928191378446795, - "loss": 0.6284, + "epoch": 2.3832528180354267, + "grad_norm": 0.3125, + "learning_rate": 0.00022484187461857517, + "loss": 0.3916, "step": 1480 }, { - "epoch": 2.8374760994263863, - "grad_norm": 0.283203125, - "learning_rate": 0.00021887796260906304, - "loss": 0.5796, + "epoch": 2.389694041867955, + "grad_norm": 0.306640625, + "learning_rate": 0.00022446192202345156, + "loss": 0.3918, "step": 1484 }, { - "epoch": 2.8451242829827916, - "grad_norm": 0.3046875, - "learning_rate": 0.00021847337744790562, - "loss": 0.5739, + "epoch": 2.396135265700483, + "grad_norm": 0.29296875, + "learning_rate": 0.00022408133437386968, + "loss": 0.4198, "step": 1488 }, { - "epoch": 2.852772466539197, - "grad_norm": 0.294921875, - "learning_rate": 0.0002180681620249913, - "loss": 0.6, + "epoch": 2.4025764895330113, + "grad_norm": 0.306640625, + "learning_rate": 0.00022370011491570162, + "loss": 0.3635, "step": 1492 }, { - "epoch": 2.860420650095602, - "grad_norm": 0.30078125, - "learning_rate": 0.00021766232007011682, - "loss": 0.6113, + "epoch": 2.4090177133655395, + "grad_norm": 0.310546875, + "learning_rate": 0.000223318266900208, + "loss": 0.4297, "step": 1496 }, { - "epoch": 2.8680688336520075, + "epoch": 2.4154589371980677, "grad_norm": 0.29296875, - "learning_rate": 0.0002172558553188459, - "loss": 0.5949, + "learning_rate": 0.00022293579358401023, + "loss": 0.3819, "step": 1500 }, { - "epoch": 2.875717017208413, - "grad_norm": 0.30078125, - "learning_rate": 0.00021684877151247485, - "loss": 0.5929, + "epoch": 2.421900161030596, + "grad_norm": 0.294921875, + "learning_rate": 0.0002225526982290625, + "loss": 0.4068, "step": 1504 }, { - "epoch": 2.8833652007648185, - "grad_norm": 0.283203125, - "learning_rate": 0.00021644107239799786, - "loss": 0.6224, + "epoch": 2.428341384863124, + "grad_norm": 0.328125, + "learning_rate": 0.00022216898410262428, + "loss": 0.3808, "step": 1508 }, { - "epoch": 2.891013384321224, - "grad_norm": 0.306640625, - "learning_rate": 0.00021603276172807288, - "loss": 0.6439, + "epoch": 2.4347826086956523, + "grad_norm": 0.298828125, + "learning_rate": 0.00022178465447723214, + "loss": 0.4037, "step": 1512 }, { - "epoch": 2.898661567877629, - "grad_norm": 0.298828125, - "learning_rate": 0.00021562384326098688, - "loss": 0.5835, + "epoch": 2.4412238325281805, + "grad_norm": 0.33203125, + "learning_rate": 0.000221399712630672, + "loss": 0.452, "step": 1516 }, { - "epoch": 2.9063097514340344, - "grad_norm": 0.298828125, - "learning_rate": 0.0002152143207606211, - "loss": 0.6, + "epoch": 2.4476650563607087, + "grad_norm": 0.296875, + "learning_rate": 0.0002210141618459513, + "loss": 0.4127, "step": 1520 }, { - "epoch": 2.9139579349904396, - "grad_norm": 0.310546875, - "learning_rate": 0.00021480419799641692, - "loss": 0.5959, + "epoch": 2.454106280193237, + "grad_norm": 0.27734375, + "learning_rate": 0.00022062800541127064, + "loss": 0.3894, "step": 1524 }, { - "epoch": 2.921606118546845, - "grad_norm": 0.291015625, - "learning_rate": 0.0002143934787433406, - "loss": 0.6111, + "epoch": 2.4605475040257647, + "grad_norm": 0.296875, + "learning_rate": 0.00022024124661999613, + "loss": 0.4256, "step": 1528 }, { - "epoch": 2.92925430210325, - "grad_norm": 0.27734375, - "learning_rate": 0.00021398216678184884, - "loss": 0.6072, + "epoch": 2.466988727858293, + "grad_norm": 0.318359375, + "learning_rate": 0.00021985388877063104, + "loss": 0.4556, "step": 1532 }, { - "epoch": 2.936902485659656, - "grad_norm": 0.314453125, - "learning_rate": 0.00021357026589785392, - "loss": 0.5744, + "epoch": 2.473429951690821, + "grad_norm": 0.31640625, + "learning_rate": 0.00021946593516678777, + "loss": 0.4504, "step": 1536 }, { - "epoch": 2.9445506692160612, - "grad_norm": 0.29296875, - "learning_rate": 0.00021315777988268876, - "loss": 0.603, + "epoch": 2.4798711755233493, + "grad_norm": 0.322265625, + "learning_rate": 0.00021907738911715964, + "loss": 0.4062, "step": 1540 }, { - "epoch": 2.9521988527724665, - "grad_norm": 0.287109375, - "learning_rate": 0.00021274471253307224, - "loss": 0.6364, + "epoch": 2.4863123993558776, + "grad_norm": 0.298828125, + "learning_rate": 0.00021868825393549275, + "loss": 0.4386, "step": 1544 }, { - "epoch": 2.959847036328872, - "grad_norm": 0.314453125, - "learning_rate": 0.00021233106765107407, - "loss": 0.5766, + "epoch": 2.4927536231884058, + "grad_norm": 0.30859375, + "learning_rate": 0.0002182985329405576, + "loss": 0.3559, "step": 1548 }, { - "epoch": 2.967495219885277, - "grad_norm": 0.287109375, - "learning_rate": 0.00021191684904407976, - "loss": 0.5715, + "epoch": 2.499194847020934, + "grad_norm": 0.28515625, + "learning_rate": 0.00021790822945612088, + "loss": 0.4244, "step": 1552 }, { - "epoch": 2.975143403441683, - "grad_norm": 0.26953125, - "learning_rate": 0.0002115020605247558, - "loss": 0.6015, + "epoch": 2.505636070853462, + "grad_norm": 0.314453125, + "learning_rate": 0.0002175173468109171, + "loss": 0.4028, "step": 1556 }, { - "epoch": 2.982791586998088, - "grad_norm": 0.29296875, - "learning_rate": 0.00021108670591101433, - "loss": 0.6017, + "epoch": 2.5120772946859904, + "grad_norm": 0.265625, + "learning_rate": 0.00021712588833862014, + "loss": 0.3726, "step": 1560 }, { - "epoch": 2.9904397705544934, - "grad_norm": 0.283203125, - "learning_rate": 0.00021067078902597814, - "loss": 0.6201, + "epoch": 2.5185185185185186, + "grad_norm": 0.322265625, + "learning_rate": 0.00021673385737781492, + "loss": 0.481, "step": 1564 }, { - "epoch": 2.9980879541108987, - "grad_norm": 0.26953125, - "learning_rate": 0.0002102543136979454, - "loss": 0.6178, + "epoch": 2.524959742351047, + "grad_norm": 0.30078125, + "learning_rate": 0.00021634125727196883, + "loss": 0.3778, "step": 1568 }, { - "epoch": 3.005736137667304, - "grad_norm": 0.263671875, - "learning_rate": 0.00020983728376035448, - "loss": 0.464, + "epoch": 2.531400966183575, + "grad_norm": 0.296875, + "learning_rate": 0.00021594809136940327, + "loss": 0.4438, "step": 1572 }, { - "epoch": 3.0133843212237093, - "grad_norm": 0.296875, - "learning_rate": 0.00020941970305174862, - "loss": 0.4966, + "epoch": 2.537842190016103, + "grad_norm": 0.328125, + "learning_rate": 0.00021555436302326514, + "loss": 0.4399, "step": 1576 }, { - "epoch": 3.0210325047801145, - "grad_norm": 0.296875, - "learning_rate": 0.00020900157541574066, - "loss": 0.5019, + "epoch": 2.544283413848631, + "grad_norm": 0.3046875, + "learning_rate": 0.00021516007559149803, + "loss": 0.3979, "step": 1580 }, { - "epoch": 3.0286806883365203, - "grad_norm": 0.283203125, - "learning_rate": 0.00020858290470097762, - "loss": 0.477, + "epoch": 2.550724637681159, + "grad_norm": 0.310546875, + "learning_rate": 0.00021476523243681397, + "loss": 0.4085, "step": 1584 }, { - "epoch": 3.0363288718929256, + "epoch": 2.5571658615136874, "grad_norm": 0.291015625, - "learning_rate": 0.00020816369476110512, - "loss": 0.5126, + "learning_rate": 0.0002143698369266643, + "loss": 0.3875, "step": 1588 }, { - "epoch": 3.043977055449331, - "grad_norm": 0.27734375, - "learning_rate": 0.0002077439494547324, - "loss": 0.5319, + "epoch": 2.5636070853462156, + "grad_norm": 0.287109375, + "learning_rate": 0.0002139738924332113, + "loss": 0.4288, "step": 1592 }, { - "epoch": 3.051625239005736, - "grad_norm": 0.302734375, - "learning_rate": 0.00020732367264539612, - "loss": 0.4976, + "epoch": 2.570048309178744, + "grad_norm": 0.3046875, + "learning_rate": 0.0002135774023332992, + "loss": 0.4155, "step": 1596 }, { - "epoch": 3.0592734225621414, - "grad_norm": 0.298828125, - "learning_rate": 0.00020690286820152534, - "loss": 0.4638, + "epoch": 2.576489533011272, + "grad_norm": 0.2890625, + "learning_rate": 0.00021318037000842558, + "loss": 0.377, "step": 1600 }, { - "epoch": 3.0669216061185467, - "grad_norm": 0.298828125, - "learning_rate": 0.0002064815399964057, - "loss": 0.5326, + "epoch": 2.5829307568438002, + "grad_norm": 0.326171875, + "learning_rate": 0.00021278279884471242, + "loss": 0.4134, "step": 1604 }, { - "epoch": 3.0745697896749524, - "grad_norm": 0.30859375, - "learning_rate": 0.00020605969190814374, - "loss": 0.5066, + "epoch": 2.5893719806763285, + "grad_norm": 0.3203125, + "learning_rate": 0.0002123846922328771, + "loss": 0.3668, "step": 1608 }, { - "epoch": 3.0822179732313577, - "grad_norm": 0.279296875, - "learning_rate": 0.0002056373278196313, - "loss": 0.4611, + "epoch": 2.5958132045088567, + "grad_norm": 0.31640625, + "learning_rate": 0.00021198605356820377, + "loss": 0.4207, "step": 1612 }, { - "epoch": 3.089866156787763, - "grad_norm": 0.279296875, - "learning_rate": 0.0002052144516185097, - "loss": 0.511, + "epoch": 2.602254428341385, + "grad_norm": 0.318359375, + "learning_rate": 0.00021158688625051416, + "loss": 0.434, "step": 1616 }, { - "epoch": 3.0975143403441683, - "grad_norm": 0.287109375, - "learning_rate": 0.00020479106719713402, - "loss": 0.5338, + "epoch": 2.608695652173913, + "grad_norm": 0.298828125, + "learning_rate": 0.00021118719368413866, + "loss": 0.3963, "step": 1620 }, { - "epoch": 3.1051625239005736, - "grad_norm": 0.314453125, - "learning_rate": 0.00020436717845253723, - "loss": 0.5384, + "epoch": 2.6151368760064413, + "grad_norm": 0.33984375, + "learning_rate": 0.0002107869792778873, + "loss": 0.4366, "step": 1624 }, { - "epoch": 3.112810707456979, - "grad_norm": 0.30078125, - "learning_rate": 0.0002039427892863943, - "loss": 0.5188, + "epoch": 2.6215780998389695, + "grad_norm": 0.28125, + "learning_rate": 0.00021038624644502063, + "loss": 0.3604, "step": 1628 }, { - "epoch": 3.120458891013384, - "grad_norm": 0.318359375, - "learning_rate": 0.00020351790360498636, - "loss": 0.5045, + "epoch": 2.6280193236714977, + "grad_norm": 0.3046875, + "learning_rate": 0.00020998499860322073, + "loss": 0.4029, "step": 1632 }, { - "epoch": 3.12810707456979, - "grad_norm": 0.3125, - "learning_rate": 0.00020309252531916475, - "loss": 0.4894, + "epoch": 2.634460547504026, + "grad_norm": 0.30859375, + "learning_rate": 0.00020958323917456186, + "loss": 0.429, "step": 1636 }, { - "epoch": 3.135755258126195, - "grad_norm": 0.310546875, - "learning_rate": 0.00020266665834431486, - "loss": 0.5241, + "epoch": 2.640901771336554, + "grad_norm": 0.326171875, + "learning_rate": 0.00020918097158548145, + "loss": 0.426, "step": 1640 }, { - "epoch": 3.1434034416826004, - "grad_norm": 0.33203125, - "learning_rate": 0.00020224030660032023, - "loss": 0.4838, + "epoch": 2.6473429951690823, + "grad_norm": 0.318359375, + "learning_rate": 0.0002087781992667509, + "loss": 0.4129, "step": 1644 }, { - "epoch": 3.1510516252390057, - "grad_norm": 0.30078125, - "learning_rate": 0.00020181347401152652, - "loss": 0.5449, + "epoch": 2.6537842190016105, + "grad_norm": 0.3125, + "learning_rate": 0.000208374925653446, + "loss": 0.3946, "step": 1648 }, { - "epoch": 3.158699808795411, - "grad_norm": 0.30859375, - "learning_rate": 0.0002013861645067054, - "loss": 0.5428, + "epoch": 2.6602254428341388, + "grad_norm": 0.32421875, + "learning_rate": 0.00020797115418491816, + "loss": 0.3564, "step": 1652 }, { - "epoch": 3.1663479923518163, - "grad_norm": 0.3125, - "learning_rate": 0.00020095838201901798, - "loss": 0.5506, + "epoch": 2.6666666666666665, + "grad_norm": 0.32421875, + "learning_rate": 0.00020756688830476453, + "loss": 0.4553, "step": 1656 }, { - "epoch": 3.173996175908222, - "grad_norm": 0.32421875, - "learning_rate": 0.00020053013048597926, - "loss": 0.4983, + "epoch": 2.6731078904991947, + "grad_norm": 0.318359375, + "learning_rate": 0.0002071621314607991, + "loss": 0.4497, "step": 1660 }, { - "epoch": 3.1816443594646273, - "grad_norm": 0.3046875, - "learning_rate": 0.00020010141384942148, - "loss": 0.4941, + "epoch": 2.679549114331723, + "grad_norm": 0.30859375, + "learning_rate": 0.00020675688710502293, + "loss": 0.3987, "step": 1664 }, { - "epoch": 3.1892925430210326, - "grad_norm": 0.31640625, - "learning_rate": 0.0001996722360554577, - "loss": 0.5406, + "epoch": 2.685990338164251, + "grad_norm": 0.306640625, + "learning_rate": 0.00020635115869359498, + "loss": 0.4695, "step": 1668 }, { - "epoch": 3.196940726577438, - "grad_norm": 0.31640625, - "learning_rate": 0.00019924260105444602, - "loss": 0.484, + "epoch": 2.6924315619967794, + "grad_norm": 0.318359375, + "learning_rate": 0.0002059449496868024, + "loss": 0.4566, "step": 1672 }, { - "epoch": 3.204588910133843, - "grad_norm": 0.310546875, - "learning_rate": 0.00019881251280095261, - "loss": 0.5374, + "epoch": 2.6988727858293076, + "grad_norm": 0.314453125, + "learning_rate": 0.00020553826354903121, + "loss": 0.4199, "step": 1676 }, { - "epoch": 3.2122370936902485, - "grad_norm": 0.3046875, - "learning_rate": 0.00019838197525371583, - "loss": 0.4977, + "epoch": 2.7053140096618358, + "grad_norm": 0.30078125, + "learning_rate": 0.00020513110374873676, + "loss": 0.3612, "step": 1680 }, { - "epoch": 3.2198852772466537, - "grad_norm": 0.33203125, - "learning_rate": 0.0001979509923756094, - "loss": 0.5462, + "epoch": 2.711755233494364, + "grad_norm": 0.294921875, + "learning_rate": 0.00020472347375841384, + "loss": 0.383, "step": 1684 }, { - "epoch": 3.2275334608030595, - "grad_norm": 0.296875, - "learning_rate": 0.0001975195681336061, - "loss": 0.4945, + "epoch": 2.718196457326892, + "grad_norm": 0.294921875, + "learning_rate": 0.0002043153770545675, + "loss": 0.4051, "step": 1688 }, { - "epoch": 3.2351816443594648, - "grad_norm": 0.29296875, - "learning_rate": 0.00019708770649874132, - "loss": 0.5042, + "epoch": 2.7246376811594204, + "grad_norm": 0.31640625, + "learning_rate": 0.00020390681711768312, + "loss": 0.4408, "step": 1692 }, { - "epoch": 3.24282982791587, - "grad_norm": 0.328125, - "learning_rate": 0.00019665541144607627, - "loss": 0.5369, + "epoch": 2.7310789049919486, + "grad_norm": 0.3203125, + "learning_rate": 0.00020349779743219682, + "loss": 0.4155, "step": 1696 }, { - "epoch": 3.2504780114722753, - "grad_norm": 0.3125, - "learning_rate": 0.00019622268695466166, - "loss": 0.5121, + "epoch": 2.7375201288244764, + "grad_norm": 0.294921875, + "learning_rate": 0.0002030883214864657, + "loss": 0.4164, "step": 1700 }, { - "epoch": 3.2581261950286806, - "grad_norm": 0.294921875, - "learning_rate": 0.000195789537007501, - "loss": 0.4984, + "epoch": 2.7439613526570046, + "grad_norm": 0.28515625, + "learning_rate": 0.0002026783927727381, + "loss": 0.4013, "step": 1704 }, { - "epoch": 3.265774378585086, - "grad_norm": 0.306640625, - "learning_rate": 0.00019535596559151376, - "loss": 0.5391, + "epoch": 2.750402576489533, + "grad_norm": 0.328125, + "learning_rate": 0.00020226801478712383, + "loss": 0.3839, "step": 1708 }, { - "epoch": 3.2734225621414916, - "grad_norm": 0.3125, - "learning_rate": 0.00019492197669749892, - "loss": 0.4778, + "epoch": 2.756843800322061, + "grad_norm": 0.296875, + "learning_rate": 0.00020185719102956438, + "loss": 0.4691, "step": 1712 }, { - "epoch": 3.281070745697897, - "grad_norm": 0.306640625, - "learning_rate": 0.00019448757432009807, - "loss": 0.5131, + "epoch": 2.763285024154589, + "grad_norm": 0.3125, + "learning_rate": 0.0002014459250038031, + "loss": 0.3949, "step": 1716 }, { - "epoch": 3.288718929254302, - "grad_norm": 0.31640625, - "learning_rate": 0.00019405276245775877, - "loss": 0.4901, + "epoch": 2.7697262479871174, + "grad_norm": 0.298828125, + "learning_rate": 0.00020103422021735507, + "loss": 0.3918, "step": 1720 }, { - "epoch": 3.2963671128107075, + "epoch": 2.7761674718196456, "grad_norm": 0.326171875, - "learning_rate": 0.00019361754511269753, - "loss": 0.5426, + "learning_rate": 0.00020062208018147755, + "loss": 0.4027, "step": 1724 }, { - "epoch": 3.3040152963671128, - "grad_norm": 0.337890625, - "learning_rate": 0.00019318192629086327, - "loss": 0.5105, + "epoch": 2.782608695652174, + "grad_norm": 0.296875, + "learning_rate": 0.00020020950841113984, + "loss": 0.4319, "step": 1728 }, { - "epoch": 3.311663479923518, - "grad_norm": 0.33203125, - "learning_rate": 0.00019274591000190028, - "loss": 0.5448, + "epoch": 2.789049919484702, + "grad_norm": 0.3359375, + "learning_rate": 0.00019979650842499324, + "loss": 0.4255, "step": 1732 }, { - "epoch": 3.3193116634799233, - "grad_norm": 0.314453125, - "learning_rate": 0.00019230950025911123, - "loss": 0.5079, + "epoch": 2.7954911433172303, + "grad_norm": 0.294921875, + "learning_rate": 0.00019938308374534115, + "loss": 0.4403, "step": 1736 }, { - "epoch": 3.3269598470363286, - "grad_norm": 0.296875, - "learning_rate": 0.0001918727010794204, - "loss": 0.5185, + "epoch": 2.8019323671497585, + "grad_norm": 0.306640625, + "learning_rate": 0.00019896923789810905, + "loss": 0.4311, "step": 1740 }, { - "epoch": 3.3346080305927344, - "grad_norm": 0.3125, - "learning_rate": 0.0001914355164833366, - "loss": 0.4918, + "epoch": 2.8083735909822867, + "grad_norm": 0.279296875, + "learning_rate": 0.00019855497441281436, + "loss": 0.407, "step": 1744 }, { - "epoch": 3.3422562141491396, - "grad_norm": 0.33203125, - "learning_rate": 0.00019099795049491621, - "loss": 0.5051, + "epoch": 2.814814814814815, + "grad_norm": 0.287109375, + "learning_rate": 0.00019814029682253644, + "loss": 0.4184, "step": 1748 }, { - "epoch": 3.349904397705545, - "grad_norm": 0.3125, - "learning_rate": 0.00019056000714172617, - "loss": 0.5295, + "epoch": 2.821256038647343, + "grad_norm": 0.283203125, + "learning_rate": 0.00019772520866388605, + "loss": 0.3812, "step": 1752 }, { - "epoch": 3.35755258126195, - "grad_norm": 0.33203125, - "learning_rate": 0.00019012169045480676, - "loss": 0.5455, + "epoch": 2.8276972624798713, + "grad_norm": 0.34375, + "learning_rate": 0.00019730971347697602, + "loss": 0.4228, "step": 1756 }, { - "epoch": 3.3652007648183555, - "grad_norm": 0.29296875, - "learning_rate": 0.00018968300446863478, - "loss": 0.515, + "epoch": 2.8341384863123995, + "grad_norm": 0.306640625, + "learning_rate": 0.00019689381480539014, + "loss": 0.4321, "step": 1760 }, { - "epoch": 3.3728489483747612, - "grad_norm": 0.296875, - "learning_rate": 0.00018924395322108607, - "loss": 0.4868, + "epoch": 2.8405797101449277, + "grad_norm": 0.31640625, + "learning_rate": 0.00019647751619615353, + "loss": 0.4321, "step": 1764 }, { - "epoch": 3.3804971319311665, - "grad_norm": 0.330078125, - "learning_rate": 0.00018880454075339854, - "loss": 0.5087, + "epoch": 2.847020933977456, + "grad_norm": 0.3046875, + "learning_rate": 0.00019606082119970214, + "loss": 0.4502, "step": 1768 }, { - "epoch": 3.388145315487572, - "grad_norm": 0.318359375, - "learning_rate": 0.00018836477111013495, - "loss": 0.5179, + "epoch": 2.853462157809984, + "grad_norm": 0.306640625, + "learning_rate": 0.00019564373336985268, + "loss": 0.4298, "step": 1772 }, { - "epoch": 3.395793499043977, - "grad_norm": 0.349609375, - "learning_rate": 0.00018792464833914576, - "loss": 0.5613, + "epoch": 2.8599033816425123, + "grad_norm": 0.318359375, + "learning_rate": 0.00019522625626377198, + "loss": 0.4469, "step": 1776 }, { - "epoch": 3.4034416826003824, - "grad_norm": 0.333984375, - "learning_rate": 0.0001874841764915317, - "loss": 0.5269, + "epoch": 2.86634460547504, + "grad_norm": 0.34375, + "learning_rate": 0.00019480839344194695, + "loss": 0.4033, "step": 1780 }, { - "epoch": 3.4110898661567877, - "grad_norm": 0.337890625, - "learning_rate": 0.00018704335962160663, - "loss": 0.5024, + "epoch": 2.8727858293075683, + "grad_norm": 0.296875, + "learning_rate": 0.00019439014846815413, + "loss": 0.4381, "step": 1784 }, { - "epoch": 3.418738049713193, - "grad_norm": 0.322265625, - "learning_rate": 0.00018660220178686002, - "loss": 0.5272, + "epoch": 2.8792270531400965, + "grad_norm": 0.296875, + "learning_rate": 0.00019397152490942919, + "loss": 0.4205, "step": 1788 }, { - "epoch": 3.4263862332695982, - "grad_norm": 0.330078125, - "learning_rate": 0.0001861607070479199, - "loss": 0.4757, + "epoch": 2.8856682769726247, + "grad_norm": 0.30078125, + "learning_rate": 0.00019355252633603668, + "loss": 0.4187, "step": 1792 }, { - "epoch": 3.434034416826004, - "grad_norm": 0.34765625, - "learning_rate": 0.00018571887946851535, - "loss": 0.5167, + "epoch": 2.892109500805153, + "grad_norm": 0.33203125, + "learning_rate": 0.00019313315632143944, + "loss": 0.3912, "step": 1796 }, { - "epoch": 3.4416826003824093, - "grad_norm": 0.314453125, - "learning_rate": 0.00018527672311543887, - "loss": 0.5162, + "epoch": 2.898550724637681, + "grad_norm": 0.33203125, + "learning_rate": 0.00019271341844226812, + "loss": 0.4236, "step": 1800 }, { - "epoch": 3.4493307839388145, - "grad_norm": 0.33203125, - "learning_rate": 0.00018483424205850934, - "loss": 0.5711, + "epoch": 2.9049919484702094, + "grad_norm": 0.28125, + "learning_rate": 0.0001922933162782909, + "loss": 0.3677, "step": 1804 }, { - "epoch": 3.45697896749522, - "grad_norm": 0.359375, - "learning_rate": 0.0001843914403705343, - "loss": 0.5373, + "epoch": 2.9114331723027376, + "grad_norm": 0.310546875, + "learning_rate": 0.00019187285341238261, + "loss": 0.3979, "step": 1808 }, { - "epoch": 3.464627151051625, - "grad_norm": 0.31640625, - "learning_rate": 0.00018394832212727252, - "loss": 0.5118, + "epoch": 2.917874396135266, + "grad_norm": 0.298828125, + "learning_rate": 0.00019145203343049453, + "loss": 0.3967, "step": 1812 }, { - "epoch": 3.472275334608031, - "grad_norm": 0.318359375, - "learning_rate": 0.00018350489140739654, - "loss": 0.5283, + "epoch": 2.924315619967794, + "grad_norm": 0.30859375, + "learning_rate": 0.00019103085992162343, + "loss": 0.4128, "step": 1816 }, { - "epoch": 3.479923518164436, + "epoch": 2.930756843800322, "grad_norm": 0.298828125, - "learning_rate": 0.00018306115229245506, - "loss": 0.4733, + "learning_rate": 0.00019060933647778135, + "loss": 0.3968, "step": 1820 }, { - "epoch": 3.4875717017208414, - "grad_norm": 0.322265625, - "learning_rate": 0.00018261710886683538, - "loss": 0.5091, + "epoch": 2.9371980676328504, + "grad_norm": 0.33203125, + "learning_rate": 0.00019018746669396464, + "loss": 0.4208, "step": 1824 }, { - "epoch": 3.4952198852772467, - "grad_norm": 0.333984375, - "learning_rate": 0.0001821727652177258, - "loss": 0.5055, + "epoch": 2.943639291465378, + "grad_norm": 0.30859375, + "learning_rate": 0.00018976525416812358, + "loss": 0.413, "step": 1828 }, { - "epoch": 3.502868068833652, - "grad_norm": 0.31640625, - "learning_rate": 0.00018172812543507813, - "loss": 0.518, + "epoch": 2.9500805152979064, + "grad_norm": 0.298828125, + "learning_rate": 0.00018934270250113135, + "loss": 0.4122, "step": 1832 }, { - "epoch": 3.5105162523900573, - "grad_norm": 0.3125, - "learning_rate": 0.00018128319361156978, - "loss": 0.5309, + "epoch": 2.9565217391304346, + "grad_norm": 0.3359375, + "learning_rate": 0.00018891981529675376, + "loss": 0.3961, "step": 1836 }, { - "epoch": 3.5181644359464626, - "grad_norm": 0.34375, - "learning_rate": 0.0001808379738425664, - "loss": 0.5381, + "epoch": 2.962962962962963, + "grad_norm": 0.31640625, + "learning_rate": 0.00018849659616161808, + "loss": 0.4498, "step": 1840 }, { - "epoch": 3.525812619502868, - "grad_norm": 0.341796875, - "learning_rate": 0.00018039247022608393, - "loss": 0.5596, + "epoch": 2.969404186795491, + "grad_norm": 0.302734375, + "learning_rate": 0.00018807304870518263, + "loss": 0.3935, "step": 1844 }, { - "epoch": 3.5334608030592736, - "grad_norm": 0.333984375, - "learning_rate": 0.00017994668686275092, - "loss": 0.5198, + "epoch": 2.975845410628019, + "grad_norm": 0.3125, + "learning_rate": 0.00018764917653970567, + "loss": 0.4183, "step": 1848 }, { - "epoch": 3.541108986615679, - "grad_norm": 0.3203125, - "learning_rate": 0.00017950062785577104, - "loss": 0.542, + "epoch": 2.9822866344605474, + "grad_norm": 0.326171875, + "learning_rate": 0.000187224983280215, + "loss": 0.4101, "step": 1852 }, { - "epoch": 3.548757170172084, - "grad_norm": 0.341796875, - "learning_rate": 0.00017905429731088497, - "loss": 0.5138, + "epoch": 2.9887278582930756, + "grad_norm": 0.314453125, + "learning_rate": 0.00018680047254447665, + "loss": 0.4587, "step": 1856 }, { - "epoch": 3.5564053537284894, - "grad_norm": 0.3359375, - "learning_rate": 0.0001786076993363328, - "loss": 0.5144, + "epoch": 2.995169082125604, + "grad_norm": 0.294921875, + "learning_rate": 0.0001863756479529644, + "loss": 0.4216, "step": 1860 }, { - "epoch": 3.5640535372848947, - "grad_norm": 0.330078125, - "learning_rate": 0.0001781608380428161, - "loss": 0.5127, + "epoch": 3.001610305958132, + "grad_norm": 0.255859375, + "learning_rate": 0.00018595051312882892, + "loss": 0.3842, "step": 1864 }, { - "epoch": 3.5717017208413004, - "grad_norm": 0.30859375, - "learning_rate": 0.0001777137175434602, - "loss": 0.5018, + "epoch": 3.0080515297906603, + "grad_norm": 0.28515625, + "learning_rate": 0.00018552507169786634, + "loss": 0.3189, "step": 1868 }, { - "epoch": 3.5793499043977057, - "grad_norm": 0.337890625, - "learning_rate": 0.00017726634195377642, - "loss": 0.4387, + "epoch": 3.0144927536231885, + "grad_norm": 0.31640625, + "learning_rate": 0.00018509932728848804, + "loss": 0.3061, "step": 1872 }, { - "epoch": 3.586998087954111, - "grad_norm": 0.33984375, - "learning_rate": 0.00017681871539162382, - "loss": 0.5421, + "epoch": 3.0209339774557167, + "grad_norm": 0.3046875, + "learning_rate": 0.00018467328353168934, + "loss": 0.3166, "step": 1876 }, { - "epoch": 3.5946462715105163, - "grad_norm": 0.353515625, - "learning_rate": 0.00017637084197717163, - "loss": 0.5118, + "epoch": 3.027375201288245, + "grad_norm": 0.28125, + "learning_rate": 0.00018424694406101838, + "loss": 0.3081, "step": 1880 }, { - "epoch": 3.6022944550669216, - "grad_norm": 0.333984375, - "learning_rate": 0.00017592272583286125, - "loss": 0.5017, + "epoch": 3.033816425120773, + "grad_norm": 0.28125, + "learning_rate": 0.0001838203125125455, + "loss": 0.2944, "step": 1884 }, { - "epoch": 3.609942638623327, - "grad_norm": 0.326171875, - "learning_rate": 0.00017547437108336836, - "loss": 0.533, + "epoch": 3.0402576489533013, + "grad_norm": 0.294921875, + "learning_rate": 0.00018339339252483196, + "loss": 0.285, "step": 1888 }, { - "epoch": 3.617590822179732, - "grad_norm": 0.33203125, - "learning_rate": 0.00017502578185556468, - "loss": 0.5954, + "epoch": 3.0466988727858295, + "grad_norm": 0.283203125, + "learning_rate": 0.00018296618773889912, + "loss": 0.2926, "step": 1892 }, { - "epoch": 3.6252390057361374, - "grad_norm": 0.345703125, - "learning_rate": 0.00017457696227848036, - "loss": 0.5098, + "epoch": 3.0531400966183573, + "grad_norm": 0.3125, + "learning_rate": 0.000182538701798197, + "loss": 0.3019, "step": 1896 }, { - "epoch": 3.632887189292543, - "grad_norm": 0.322265625, - "learning_rate": 0.00017412791648326566, - "loss": 0.4859, + "epoch": 3.0595813204508855, + "grad_norm": 0.314453125, + "learning_rate": 0.00018211093834857379, + "loss": 0.2984, "step": 1900 }, { - "epoch": 3.6405353728489485, - "grad_norm": 0.333984375, - "learning_rate": 0.0001736786486031531, - "loss": 0.5466, + "epoch": 3.0660225442834137, + "grad_norm": 0.30859375, + "learning_rate": 0.00018168290103824422, + "loss": 0.3185, "step": 1904 }, { - "epoch": 3.6481835564053537, - "grad_norm": 0.33203125, - "learning_rate": 0.00017322916277341945, - "loss": 0.5258, + "epoch": 3.072463768115942, + "grad_norm": 0.30859375, + "learning_rate": 0.00018125459351775873, + "loss": 0.3192, "step": 1908 }, { - "epoch": 3.655831739961759, - "grad_norm": 0.3515625, - "learning_rate": 0.00017277946313134758, - "loss": 0.5302, + "epoch": 3.07890499194847, + "grad_norm": 0.3125, + "learning_rate": 0.00018082601943997232, + "loss": 0.3459, "step": 1912 }, { - "epoch": 3.6634799235181643, - "grad_norm": 0.31640625, - "learning_rate": 0.00017232955381618826, - "loss": 0.5487, + "epoch": 3.0853462157809983, + "grad_norm": 0.3203125, + "learning_rate": 0.00018039718246001325, + "loss": 0.2837, "step": 1916 }, { - "epoch": 3.67112810707457, - "grad_norm": 0.357421875, - "learning_rate": 0.00017187943896912236, - "loss": 0.497, + "epoch": 3.0917874396135265, + "grad_norm": 0.32421875, + "learning_rate": 0.000179968086235252, + "loss": 0.3134, "step": 1920 }, { - "epoch": 3.6787762906309753, - "grad_norm": 0.314453125, - "learning_rate": 0.0001714291227332224, - "loss": 0.4907, + "epoch": 3.0982286634460547, + "grad_norm": 0.30859375, + "learning_rate": 0.00017953873442527008, + "loss": 0.2907, "step": 1924 }, { - "epoch": 3.6864244741873806, - "grad_norm": 0.322265625, - "learning_rate": 0.00017097860925341472, - "loss": 0.5322, + "epoch": 3.104669887278583, + "grad_norm": 0.314453125, + "learning_rate": 0.00017910913069182872, + "loss": 0.3076, "step": 1928 }, { - "epoch": 3.694072657743786, - "grad_norm": 0.3125, - "learning_rate": 0.00017052790267644112, - "loss": 0.4859, + "epoch": 3.111111111111111, + "grad_norm": 0.302734375, + "learning_rate": 0.00017867927869883775, + "loss": 0.293, "step": 1932 }, { - "epoch": 3.701720841300191, - "grad_norm": 0.330078125, - "learning_rate": 0.00017007700715082077, - "loss": 0.5101, + "epoch": 3.1175523349436394, + "grad_norm": 0.302734375, + "learning_rate": 0.00017824918211232422, + "loss": 0.2775, "step": 1936 }, { - "epoch": 3.7093690248565965, - "grad_norm": 0.31640625, - "learning_rate": 0.00016962592682681206, - "loss": 0.5091, + "epoch": 3.1239935587761676, + "grad_norm": 0.326171875, + "learning_rate": 0.00017781884460040136, + "loss": 0.3037, "step": 1940 }, { - "epoch": 3.7170172084130018, - "grad_norm": 0.3203125, - "learning_rate": 0.00016917466585637426, - "loss": 0.5399, + "epoch": 3.130434782608696, + "grad_norm": 0.34375, + "learning_rate": 0.00017738826983323703, + "loss": 0.3139, "step": 1944 }, { - "epoch": 3.724665391969407, - "grad_norm": 0.349609375, - "learning_rate": 0.0001687232283931294, - "loss": 0.4829, + "epoch": 3.136876006441224, + "grad_norm": 0.328125, + "learning_rate": 0.00017695746148302252, + "loss": 0.3081, "step": 1948 }, { - "epoch": 3.7323135755258128, - "grad_norm": 0.333984375, - "learning_rate": 0.00016827161859232418, - "loss": 0.5355, + "epoch": 3.143317230273752, + "grad_norm": 0.302734375, + "learning_rate": 0.00017652642322394142, + "loss": 0.344, "step": 1952 }, { - "epoch": 3.739961759082218, - "grad_norm": 0.35546875, - "learning_rate": 0.00016781984061079138, - "loss": 0.5442, + "epoch": 3.14975845410628, + "grad_norm": 0.3125, + "learning_rate": 0.00017609515873213787, + "loss": 0.3006, "step": 1956 }, { - "epoch": 3.7476099426386233, - "grad_norm": 0.330078125, - "learning_rate": 0.00016736789860691197, - "loss": 0.5238, + "epoch": 3.156199677938808, + "grad_norm": 0.326171875, + "learning_rate": 0.00017566367168568572, + "loss": 0.2933, "step": 1960 }, { - "epoch": 3.7552581261950286, - "grad_norm": 0.32421875, - "learning_rate": 0.00016691579674057657, - "loss": 0.5004, + "epoch": 3.1626409017713364, + "grad_norm": 0.30078125, + "learning_rate": 0.00017523196576455663, + "loss": 0.2869, "step": 1964 }, { - "epoch": 3.762906309751434, - "grad_norm": 0.3203125, - "learning_rate": 0.00016646353917314726, - "loss": 0.4718, + "epoch": 3.1690821256038646, + "grad_norm": 0.322265625, + "learning_rate": 0.00017480004465058918, + "loss": 0.2935, "step": 1968 }, { - "epoch": 3.7705544933078396, - "grad_norm": 0.33203125, - "learning_rate": 0.00016601113006741916, - "loss": 0.4324, + "epoch": 3.175523349436393, + "grad_norm": 0.291015625, + "learning_rate": 0.00017436791202745706, + "loss": 0.3451, "step": 1972 }, { - "epoch": 3.778202676864245, - "grad_norm": 0.314453125, - "learning_rate": 0.00016555857358758252, - "loss": 0.457, + "epoch": 3.181964573268921, + "grad_norm": 0.3125, + "learning_rate": 0.00017393557158063803, + "loss": 0.3047, "step": 1976 }, { - "epoch": 3.78585086042065, - "grad_norm": 0.337890625, - "learning_rate": 0.00016510587389918373, - "loss": 0.4992, + "epoch": 3.1884057971014492, + "grad_norm": 0.32421875, + "learning_rate": 0.00017350302699738204, + "loss": 0.327, "step": 1980 }, { - "epoch": 3.7934990439770555, - "grad_norm": 0.333984375, - "learning_rate": 0.00016465303516908762, - "loss": 0.4984, + "epoch": 3.1948470209339774, + "grad_norm": 0.337890625, + "learning_rate": 0.00017307028196668028, + "loss": 0.3238, "step": 1984 }, { - "epoch": 3.801147227533461, - "grad_norm": 0.318359375, - "learning_rate": 0.0001642000615654387, - "loss": 0.5278, + "epoch": 3.2012882447665056, + "grad_norm": 0.326171875, + "learning_rate": 0.0001726373401792333, + "loss": 0.2957, "step": 1988 }, { - "epoch": 3.808795411089866, - "grad_norm": 0.333984375, - "learning_rate": 0.0001637469572576229, - "loss": 0.5272, + "epoch": 3.207729468599034, + "grad_norm": 0.296875, + "learning_rate": 0.00017220420532741977, + "loss": 0.3124, "step": 1992 }, { - "epoch": 3.8164435946462714, - "grad_norm": 0.333984375, - "learning_rate": 0.00016329372641622934, - "loss": 0.5019, + "epoch": 3.214170692431562, + "grad_norm": 0.34375, + "learning_rate": 0.00017177088110526486, + "loss": 0.2852, "step": 1996 }, { - "epoch": 3.8240917782026767, - "grad_norm": 0.353515625, - "learning_rate": 0.00016284037321301166, - "loss": 0.4939, + "epoch": 3.2206119162640903, + "grad_norm": 0.29296875, + "learning_rate": 0.00017133737120840907, + "loss": 0.3084, "step": 2000 }, { - "epoch": 3.8317399617590824, - "grad_norm": 0.333984375, - "learning_rate": 0.00016238690182084986, - "loss": 0.5335, + "epoch": 3.2270531400966185, + "grad_norm": 0.310546875, + "learning_rate": 0.000170903679334076, + "loss": 0.2671, "step": 2004 }, { - "epoch": 3.8393881453154877, - "grad_norm": 0.33203125, - "learning_rate": 0.00016193331641371176, - "loss": 0.5396, + "epoch": 3.2334943639291467, + "grad_norm": 0.291015625, + "learning_rate": 0.00017046980918104164, + "loss": 0.2851, "step": 2008 }, { - "epoch": 3.847036328871893, - "grad_norm": 0.34765625, - "learning_rate": 0.00016147962116661472, - "loss": 0.5078, + "epoch": 3.239935587761675, + "grad_norm": 0.314453125, + "learning_rate": 0.0001700357644496022, + "loss": 0.2921, "step": 2012 }, { - "epoch": 3.8546845124282982, - "grad_norm": 0.328125, - "learning_rate": 0.00016102582025558703, - "loss": 0.5286, + "epoch": 3.246376811594203, + "grad_norm": 0.3359375, + "learning_rate": 0.00016960154884154298, + "loss": 0.2898, "step": 2016 }, { - "epoch": 3.8623326959847035, - "grad_norm": 0.345703125, - "learning_rate": 0.00016057191785762964, - "loss": 0.504, + "epoch": 3.2528180354267313, + "grad_norm": 0.3203125, + "learning_rate": 0.00016916716606010646, + "loss": 0.3277, "step": 2020 }, { - "epoch": 3.8699808795411093, - "grad_norm": 0.341796875, - "learning_rate": 0.00016011791815067754, - "loss": 0.5131, + "epoch": 3.259259259259259, + "grad_norm": 0.3046875, + "learning_rate": 0.00016873261980996095, + "loss": 0.3301, "step": 2024 }, { - "epoch": 3.8776290630975145, - "grad_norm": 0.330078125, - "learning_rate": 0.00015966382531356144, - "loss": 0.5068, + "epoch": 3.2657004830917873, + "grad_norm": 0.306640625, + "learning_rate": 0.00016829791379716896, + "loss": 0.3639, "step": 2028 }, { - "epoch": 3.88527724665392, - "grad_norm": 0.326171875, - "learning_rate": 0.00015920964352596927, - "loss": 0.5257, + "epoch": 3.2721417069243155, + "grad_norm": 0.353515625, + "learning_rate": 0.00016786305172915544, + "loss": 0.3492, "step": 2032 }, { - "epoch": 3.892925430210325, - "grad_norm": 0.3203125, - "learning_rate": 0.00015875537696840775, - "loss": 0.5145, + "epoch": 3.2785829307568437, + "grad_norm": 0.3359375, + "learning_rate": 0.0001674280373146763, + "loss": 0.3233, "step": 2036 }, { - "epoch": 3.9005736137667304, - "grad_norm": 0.314453125, - "learning_rate": 0.0001583010298221638, - "loss": 0.5219, + "epoch": 3.285024154589372, + "grad_norm": 0.32421875, + "learning_rate": 0.00016699287426378683, + "loss": 0.3232, "step": 2040 }, { - "epoch": 3.9082217973231357, - "grad_norm": 0.326171875, - "learning_rate": 0.0001578466062692661, - "loss": 0.5548, + "epoch": 3.2914653784219, + "grad_norm": 0.31640625, + "learning_rate": 0.0001665575662878099, + "loss": 0.326, "step": 2044 }, { - "epoch": 3.915869980879541, - "grad_norm": 0.330078125, - "learning_rate": 0.00015739211049244667, - "loss": 0.4981, + "epoch": 3.2979066022544283, + "grad_norm": 0.337890625, + "learning_rate": 0.00016612211709930442, + "loss": 0.3169, "step": 2048 }, { - "epoch": 3.9235181644359463, - "grad_norm": 0.322265625, - "learning_rate": 0.00015693754667510235, - "loss": 0.5184, + "epoch": 3.3043478260869565, + "grad_norm": 0.3046875, + "learning_rate": 0.00016568653041203356, + "loss": 0.3036, "step": 2052 }, { - "epoch": 3.9311663479923515, - "grad_norm": 0.32421875, - "learning_rate": 0.00015648291900125609, - "loss": 0.5207, + "epoch": 3.3107890499194848, + "grad_norm": 0.33203125, + "learning_rate": 0.00016525080994093328, + "loss": 0.2987, "step": 2056 }, { - "epoch": 3.9388145315487573, - "grad_norm": 0.33203125, - "learning_rate": 0.00015602823165551877, - "loss": 0.5194, + "epoch": 3.317230273752013, + "grad_norm": 0.30859375, + "learning_rate": 0.00016481495940208046, + "loss": 0.3108, "step": 2060 }, { - "epoch": 3.9464627151051626, - "grad_norm": 0.318359375, - "learning_rate": 0.0001555734888230505, - "loss": 0.4692, + "epoch": 3.323671497584541, + "grad_norm": 0.330078125, + "learning_rate": 0.0001643789825126613, + "loss": 0.3119, "step": 2064 }, { - "epoch": 3.954110898661568, + "epoch": 3.3301127214170694, "grad_norm": 0.330078125, - "learning_rate": 0.00015511869468952201, - "loss": 0.5247, + "learning_rate": 0.0001639428829909396, + "loss": 0.342, "step": 2068 }, { - "epoch": 3.961759082217973, - "grad_norm": 0.31640625, - "learning_rate": 0.0001546638534410763, - "loss": 0.4561, + "epoch": 3.3365539452495976, + "grad_norm": 0.3046875, + "learning_rate": 0.00016350666455622497, + "loss": 0.3025, "step": 2072 }, { - "epoch": 3.969407265774379, - "grad_norm": 0.328125, - "learning_rate": 0.00015420896926429014, - "loss": 0.5073, + "epoch": 3.342995169082126, + "grad_norm": 0.337890625, + "learning_rate": 0.0001630703309288412, + "loss": 0.3136, "step": 2076 }, { - "epoch": 3.977055449330784, - "grad_norm": 0.341796875, - "learning_rate": 0.00015375404634613524, - "loss": 0.5323, + "epoch": 3.3494363929146536, + "grad_norm": 0.365234375, + "learning_rate": 0.00016263388583009463, + "loss": 0.2957, "step": 2080 }, { - "epoch": 3.9847036328871894, - "grad_norm": 0.333984375, - "learning_rate": 0.00015329908887393992, - "loss": 0.5223, + "epoch": 3.3558776167471818, + "grad_norm": 0.341796875, + "learning_rate": 0.0001621973329822421, + "loss": 0.2948, "step": 2084 }, { - "epoch": 3.9923518164435947, - "grad_norm": 0.359375, - "learning_rate": 0.0001528441010353508, - "loss": 0.5112, + "epoch": 3.36231884057971, + "grad_norm": 0.318359375, + "learning_rate": 0.00016176067610845958, + "loss": 0.3298, "step": 2088 }, { - "epoch": 4.0, - "grad_norm": 0.87109375, - "learning_rate": 0.00015238908701829378, - "loss": 0.5374, + "epoch": 3.368760064412238, + "grad_norm": 0.35546875, + "learning_rate": 0.00016132391893281003, + "loss": 0.327, "step": 2092 }, { - "epoch": 4.007648183556405, - "grad_norm": 0.30859375, - "learning_rate": 0.0001519340510109357, - "loss": 0.446, + "epoch": 3.3752012882447664, + "grad_norm": 0.326171875, + "learning_rate": 0.0001608870651802121, + "loss": 0.3009, "step": 2096 }, { - "epoch": 4.015296367112811, - "grad_norm": 0.3203125, - "learning_rate": 0.00015147899720164594, - "loss": 0.467, + "epoch": 3.3816425120772946, + "grad_norm": 0.3359375, + "learning_rate": 0.00016045011857640783, + "loss": 0.3148, "step": 2100 }, { - "epoch": 4.022944550669216, - "grad_norm": 0.333984375, - "learning_rate": 0.00015102392977895765, - "loss": 0.4112, + "epoch": 3.388083735909823, + "grad_norm": 0.298828125, + "learning_rate": 0.0001600130828479314, + "loss": 0.3282, "step": 2104 }, { - "epoch": 4.030592734225621, - "grad_norm": 0.32421875, - "learning_rate": 0.00015056885293152932, - "loss": 0.4401, + "epoch": 3.394524959742351, + "grad_norm": 0.314453125, + "learning_rate": 0.0001595759617220769, + "loss": 0.3203, "step": 2108 }, { - "epoch": 4.038240917782026, - "grad_norm": 0.333984375, - "learning_rate": 0.00015011377084810624, - "loss": 0.406, + "epoch": 3.4009661835748792, + "grad_norm": 0.32421875, + "learning_rate": 0.00015913875892686685, + "loss": 0.2977, "step": 2112 }, { - "epoch": 4.045889101338432, - "grad_norm": 0.328125, - "learning_rate": 0.00014965868771748178, - "loss": 0.4528, + "epoch": 3.4074074074074074, + "grad_norm": 0.34765625, + "learning_rate": 0.00015870147819102025, + "loss": 0.2806, "step": 2116 }, { - "epoch": 4.053537284894838, + "epoch": 3.4138486312399356, "grad_norm": 0.337890625, - "learning_rate": 0.00014920360772845896, - "loss": 0.4345, + "learning_rate": 0.00015826412324392085, + "loss": 0.3096, "step": 2120 }, { - "epoch": 4.061185468451243, - "grad_norm": 0.345703125, - "learning_rate": 0.00014874853506981206, - "loss": 0.4349, + "epoch": 3.420289855072464, + "grad_norm": 0.3203125, + "learning_rate": 0.00015782669781558528, + "loss": 0.301, "step": 2124 }, { - "epoch": 4.0688336520076485, - "grad_norm": 0.34765625, - "learning_rate": 0.00014829347393024764, - "loss": 0.4494, + "epoch": 3.426731078904992, + "grad_norm": 0.34375, + "learning_rate": 0.00015738920563663136, + "loss": 0.3055, "step": 2128 }, { - "epoch": 4.076481835564054, - "grad_norm": 0.333984375, - "learning_rate": 0.00014783842849836644, - "loss": 0.4159, + "epoch": 3.4331723027375203, + "grad_norm": 0.349609375, + "learning_rate": 0.00015695165043824605, + "loss": 0.3187, "step": 2132 }, { - "epoch": 4.084130019120459, - "grad_norm": 0.32421875, - "learning_rate": 0.00014738340296262443, - "loss": 0.372, + "epoch": 3.4396135265700485, + "grad_norm": 0.322265625, + "learning_rate": 0.00015651403595215392, + "loss": 0.308, "step": 2136 }, { - "epoch": 4.091778202676864, - "grad_norm": 0.3359375, - "learning_rate": 0.00014692840151129467, - "loss": 0.4621, + "epoch": 3.4460547504025767, + "grad_norm": 0.34375, + "learning_rate": 0.00015607636591058506, + "loss": 0.3033, "step": 2140 }, { - "epoch": 4.09942638623327, - "grad_norm": 0.328125, - "learning_rate": 0.00014647342833242827, - "loss": 0.4095, + "epoch": 3.452495974235105, + "grad_norm": 0.32421875, + "learning_rate": 0.0001556386440462435, + "loss": 0.3313, "step": 2144 }, { - "epoch": 4.107074569789675, - "grad_norm": 0.341796875, - "learning_rate": 0.00014601848761381633, - "loss": 0.4172, + "epoch": 3.4589371980676327, + "grad_norm": 0.318359375, + "learning_rate": 0.0001552008740922751, + "loss": 0.2891, "step": 2148 }, { - "epoch": 4.11472275334608, - "grad_norm": 0.31640625, - "learning_rate": 0.00014556358354295113, - "loss": 0.4197, + "epoch": 3.465378421900161, + "grad_norm": 0.345703125, + "learning_rate": 0.00015476305978223606, + "loss": 0.3416, "step": 2152 }, { - "epoch": 4.1223709369024855, - "grad_norm": 0.32421875, - "learning_rate": 0.0001451087203069875, - "loss": 0.4573, + "epoch": 3.471819645732689, + "grad_norm": 0.33203125, + "learning_rate": 0.00015432520485006055, + "loss": 0.2768, "step": 2156 }, { - "epoch": 4.130019120458891, - "grad_norm": 0.33203125, - "learning_rate": 0.00014465390209270456, - "loss": 0.4117, + "epoch": 3.4782608695652173, + "grad_norm": 0.345703125, + "learning_rate": 0.00015388731303002954, + "loss": 0.3216, "step": 2160 }, { - "epoch": 4.137667304015296, - "grad_norm": 0.357421875, - "learning_rate": 0.00014419913308646686, - "loss": 0.4148, + "epoch": 3.4847020933977455, + "grad_norm": 0.33984375, + "learning_rate": 0.0001534493880567384, + "loss": 0.3112, "step": 2164 }, { - "epoch": 4.145315487571701, - "grad_norm": 0.318359375, - "learning_rate": 0.00014374441747418628, - "loss": 0.4251, + "epoch": 3.4911433172302737, + "grad_norm": 0.314453125, + "learning_rate": 0.00015301143366506527, + "loss": 0.323, "step": 2168 }, { - "epoch": 4.1529636711281075, - "grad_norm": 0.33984375, - "learning_rate": 0.00014328975944128292, - "loss": 0.4314, + "epoch": 3.497584541062802, + "grad_norm": 0.330078125, + "learning_rate": 0.00015257345359013928, + "loss": 0.3406, "step": 2172 }, { - "epoch": 4.160611854684513, - "grad_norm": 0.34765625, - "learning_rate": 0.00014283516317264704, - "loss": 0.4386, + "epoch": 3.50402576489533, + "grad_norm": 0.330078125, + "learning_rate": 0.00015213545156730847, + "loss": 0.2904, "step": 2176 }, { - "epoch": 4.168260038240918, - "grad_norm": 0.3359375, - "learning_rate": 0.00014238063285260057, - "loss": 0.392, + "epoch": 3.5104669887278583, + "grad_norm": 0.333984375, + "learning_rate": 0.00015169743133210814, + "loss": 0.3107, "step": 2180 }, { - "epoch": 4.175908221797323, + "epoch": 3.5169082125603865, "grad_norm": 0.357421875, - "learning_rate": 0.00014192617266485803, - "loss": 0.4187, + "learning_rate": 0.0001512593966202289, + "loss": 0.3377, "step": 2184 }, { - "epoch": 4.183556405353729, - "grad_norm": 0.34375, - "learning_rate": 0.0001414717867924888, - "loss": 0.4165, + "epoch": 3.5233494363929148, + "grad_norm": 0.390625, + "learning_rate": 0.00015082135116748483, + "loss": 0.3491, "step": 2188 }, { - "epoch": 4.191204588910134, - "grad_norm": 0.35546875, - "learning_rate": 0.0001410174794178779, - "loss": 0.4191, + "epoch": 3.529790660225443, + "grad_norm": 0.33984375, + "learning_rate": 0.00015038329870978168, + "loss": 0.2865, "step": 2192 }, { - "epoch": 4.198852772466539, - "grad_norm": 0.35546875, - "learning_rate": 0.00014056325472268805, - "loss": 0.436, + "epoch": 3.536231884057971, + "grad_norm": 0.337890625, + "learning_rate": 0.00014994524298308479, + "loss": 0.2913, "step": 2196 }, { - "epoch": 4.2065009560229445, - "grad_norm": 0.349609375, - "learning_rate": 0.0001401091168878209, - "loss": 0.4075, + "epoch": 3.542673107890499, + "grad_norm": 0.35546875, + "learning_rate": 0.0001495071877233875, + "loss": 0.3163, "step": 2200 }, { - "epoch": 4.21414913957935, - "grad_norm": 0.345703125, - "learning_rate": 0.00013965507009337845, - "loss": 0.3995, + "epoch": 3.549114331723027, + "grad_norm": 0.337890625, + "learning_rate": 0.00014906913666667913, + "loss": 0.2722, "step": 2204 }, { - "epoch": 4.221797323135755, - "grad_norm": 0.35546875, - "learning_rate": 0.00013920111851862494, - "loss": 0.4474, + "epoch": 3.5555555555555554, + "grad_norm": 0.302734375, + "learning_rate": 0.00014863109354891317, + "loss": 0.3163, "step": 2208 }, { - "epoch": 4.22944550669216, - "grad_norm": 0.330078125, - "learning_rate": 0.00013874726634194797, - "loss": 0.405, + "epoch": 3.5619967793880836, + "grad_norm": 0.33203125, + "learning_rate": 0.00014819306210597536, + "loss": 0.3735, "step": 2212 }, { - "epoch": 4.237093690248566, - "grad_norm": 0.326171875, - "learning_rate": 0.0001382935177408204, - "loss": 0.4316, + "epoch": 3.5684380032206118, + "grad_norm": 0.35546875, + "learning_rate": 0.00014775504607365196, + "loss": 0.3303, "step": 2216 }, { - "epoch": 4.244741873804971, - "grad_norm": 0.337890625, - "learning_rate": 0.00013783987689176157, - "loss": 0.4299, + "epoch": 3.57487922705314, + "grad_norm": 0.3203125, + "learning_rate": 0.00014731704918759765, + "loss": 0.2946, "step": 2220 }, { - "epoch": 4.252390057361376, - "grad_norm": 0.3515625, - "learning_rate": 0.00013738634797029914, - "loss": 0.4347, + "epoch": 3.581320450885668, + "grad_norm": 0.318359375, + "learning_rate": 0.000146879075183304, + "loss": 0.3434, "step": 2224 }, { - "epoch": 4.260038240917782, - "grad_norm": 0.34375, - "learning_rate": 0.00013693293515093052, - "loss": 0.4393, + "epoch": 3.5877616747181964, + "grad_norm": 0.36328125, + "learning_rate": 0.00014644112779606727, + "loss": 0.3063, "step": 2228 }, { - "epoch": 4.267686424474188, - "grad_norm": 0.375, - "learning_rate": 0.00013647964260708436, - "loss": 0.44, + "epoch": 3.5942028985507246, + "grad_norm": 0.333984375, + "learning_rate": 0.00014600321076095683, + "loss": 0.2962, "step": 2232 }, { - "epoch": 4.275334608030593, - "grad_norm": 0.35546875, - "learning_rate": 0.0001360264745110824, - "loss": 0.4194, + "epoch": 3.600644122383253, + "grad_norm": 0.34765625, + "learning_rate": 0.00014556532781278316, + "loss": 0.3006, "step": 2236 }, { - "epoch": 4.282982791586998, - "grad_norm": 0.349609375, - "learning_rate": 0.0001355734350341007, - "loss": 0.4259, + "epoch": 3.607085346215781, + "grad_norm": 0.341796875, + "learning_rate": 0.00014512748268606592, + "loss": 0.3688, "step": 2240 }, { - "epoch": 4.2906309751434035, - "grad_norm": 0.365234375, - "learning_rate": 0.00013512052834613165, - "loss": 0.4311, + "epoch": 3.6135265700483092, + "grad_norm": 0.33203125, + "learning_rate": 0.00014468967911500242, + "loss": 0.3348, "step": 2244 }, { - "epoch": 4.298279158699809, - "grad_norm": 0.35546875, - "learning_rate": 0.00013466775861594523, - "loss": 0.4097, + "epoch": 3.6199677938808374, + "grad_norm": 0.330078125, + "learning_rate": 0.0001442519208334353, + "loss": 0.3128, "step": 2248 }, { - "epoch": 4.305927342256214, - "grad_norm": 0.31640625, - "learning_rate": 0.0001342151300110509, - "loss": 0.4149, + "epoch": 3.6264090177133657, + "grad_norm": 0.341796875, + "learning_rate": 0.00014381421157482125, + "loss": 0.3488, "step": 2252 }, { - "epoch": 4.313575525812619, - "grad_norm": 0.3203125, - "learning_rate": 0.0001337626466976591, - "loss": 0.3983, + "epoch": 3.632850241545894, + "grad_norm": 0.31640625, + "learning_rate": 0.0001433765550721985, + "loss": 0.2614, "step": 2256 }, { - "epoch": 4.321223709369025, - "grad_norm": 0.36328125, - "learning_rate": 0.0001333103128406429, - "loss": 0.4424, + "epoch": 3.639291465378422, + "grad_norm": 0.34765625, + "learning_rate": 0.00014293895505815575, + "loss": 0.2984, "step": 2260 }, { - "epoch": 4.32887189292543, - "grad_norm": 0.34765625, - "learning_rate": 0.00013285813260349982, - "loss": 0.4352, + "epoch": 3.6457326892109503, + "grad_norm": 0.31640625, + "learning_rate": 0.00014250141526479953, + "loss": 0.3257, "step": 2264 }, { - "epoch": 4.336520076481835, - "grad_norm": 0.341796875, - "learning_rate": 0.0001324061101483132, - "loss": 0.4441, + "epoch": 3.6521739130434785, + "grad_norm": 0.330078125, + "learning_rate": 0.00014206393942372314, + "loss": 0.3235, "step": 2268 }, { - "epoch": 4.3441682600382405, - "grad_norm": 0.361328125, - "learning_rate": 0.00013195424963571424, - "loss": 0.421, + "epoch": 3.6586151368760067, + "grad_norm": 0.357421875, + "learning_rate": 0.0001416265312659741, + "loss": 0.3435, "step": 2272 }, { - "epoch": 4.351816443594647, - "grad_norm": 0.33203125, - "learning_rate": 0.00013150255522484345, - "loss": 0.4131, + "epoch": 3.6650563607085345, + "grad_norm": 0.32421875, + "learning_rate": 0.00014118919452202306, + "loss": 0.3191, "step": 2276 }, { - "epoch": 4.359464627151052, - "grad_norm": 0.34375, - "learning_rate": 0.00013105103107331255, - "loss": 0.3568, + "epoch": 3.6714975845410627, + "grad_norm": 0.318359375, + "learning_rate": 0.00014075193292173126, + "loss": 0.2869, "step": 2280 }, { - "epoch": 4.367112810707457, - "grad_norm": 0.3359375, - "learning_rate": 0.00013059968133716606, - "loss": 0.445, + "epoch": 3.677938808373591, + "grad_norm": 0.318359375, + "learning_rate": 0.00014031475019431934, + "loss": 0.3089, "step": 2284 }, { - "epoch": 4.374760994263863, - "grad_norm": 0.333984375, - "learning_rate": 0.00013014851017084303, - "loss": 0.4267, + "epoch": 3.684380032206119, + "grad_norm": 0.322265625, + "learning_rate": 0.00013987765006833518, + "loss": 0.3332, "step": 2288 }, { - "epoch": 4.382409177820268, - "grad_norm": 0.37890625, - "learning_rate": 0.00012969752172713905, - "loss": 0.4458, + "epoch": 3.6908212560386473, + "grad_norm": 0.30859375, + "learning_rate": 0.0001394406362716221, + "loss": 0.3127, "step": 2292 }, { - "epoch": 4.390057361376673, - "grad_norm": 0.341796875, - "learning_rate": 0.00012924672015716759, - "loss": 0.439, + "epoch": 3.6972624798711755, + "grad_norm": 0.32421875, + "learning_rate": 0.00013900371253128727, + "loss": 0.3177, "step": 2296 }, { - "epoch": 4.397705544933078, - "grad_norm": 0.34765625, - "learning_rate": 0.00012879610961032218, - "loss": 0.4792, + "epoch": 3.7037037037037037, + "grad_norm": 0.337890625, + "learning_rate": 0.0001385668825736697, + "loss": 0.3324, "step": 2300 }, { - "epoch": 4.405353728489484, - "grad_norm": 0.310546875, - "learning_rate": 0.0001283456942342383, - "loss": 0.4113, + "epoch": 3.710144927536232, + "grad_norm": 0.32421875, + "learning_rate": 0.0001381301501243087, + "loss": 0.2785, "step": 2304 }, { - "epoch": 4.413001912045889, - "grad_norm": 0.35546875, - "learning_rate": 0.0001278954781747545, - "loss": 0.4548, + "epoch": 3.71658615136876, + "grad_norm": 0.353515625, + "learning_rate": 0.00013769351890791185, + "loss": 0.3274, "step": 2308 }, { - "epoch": 4.420650095602294, - "grad_norm": 0.359375, - "learning_rate": 0.00012744546557587517, - "loss": 0.4512, + "epoch": 3.7230273752012883, + "grad_norm": 0.3359375, + "learning_rate": 0.00013725699264832344, + "loss": 0.3041, "step": 2312 }, { - "epoch": 4.4282982791587, - "grad_norm": 0.3515625, - "learning_rate": 0.00012699566057973168, - "loss": 0.4211, + "epoch": 3.7294685990338166, + "grad_norm": 0.326171875, + "learning_rate": 0.00013682057506849256, + "loss": 0.3343, "step": 2316 }, { - "epoch": 4.435946462715105, - "grad_norm": 0.35546875, - "learning_rate": 0.00012654606732654468, - "loss": 0.4256, + "epoch": 3.7359098228663448, + "grad_norm": 0.3125, + "learning_rate": 0.00013638426989044148, + "loss": 0.2785, "step": 2320 }, { - "epoch": 4.44359464627151, - "grad_norm": 0.357421875, - "learning_rate": 0.00012609668995458573, - "loss": 0.4451, + "epoch": 3.7423510466988725, + "grad_norm": 0.34375, + "learning_rate": 0.00013594808083523376, + "loss": 0.3454, "step": 2324 }, { - "epoch": 4.451242829827915, - "grad_norm": 0.34375, - "learning_rate": 0.0001256475326001394, - "loss": 0.4703, + "epoch": 3.7487922705314007, + "grad_norm": 0.33203125, + "learning_rate": 0.00013551201162294275, + "loss": 0.312, "step": 2328 }, { - "epoch": 4.458891013384322, - "grad_norm": 0.33984375, - "learning_rate": 0.00012519859939746504, - "loss": 0.4032, + "epoch": 3.755233494363929, + "grad_norm": 0.3359375, + "learning_rate": 0.00013507606597261946, + "loss": 0.2885, "step": 2332 }, { - "epoch": 4.466539196940727, - "grad_norm": 0.384765625, - "learning_rate": 0.00012474989447875886, - "loss": 0.4324, + "epoch": 3.761674718196457, + "grad_norm": 0.337890625, + "learning_rate": 0.00013464024760226142, + "loss": 0.3328, "step": 2336 }, { - "epoch": 4.474187380497132, - "grad_norm": 0.380859375, - "learning_rate": 0.0001243014219741158, - "loss": 0.4671, + "epoch": 3.7681159420289854, + "grad_norm": 0.33203125, + "learning_rate": 0.0001342045602287803, + "loss": 0.3078, "step": 2340 }, { - "epoch": 4.4818355640535374, - "grad_norm": 0.36328125, - "learning_rate": 0.00012385318601149158, - "loss": 0.4463, + "epoch": 3.7745571658615136, + "grad_norm": 0.326171875, + "learning_rate": 0.00013376900756797085, + "loss": 0.3126, "step": 2344 }, { - "epoch": 4.489483747609943, - "grad_norm": 0.365234375, - "learning_rate": 0.00012340519071666467, - "loss": 0.4448, + "epoch": 3.780998389694042, + "grad_norm": 0.3125, + "learning_rate": 0.00013333359333447865, + "loss": 0.2941, "step": 2348 }, { - "epoch": 4.497131931166348, - "grad_norm": 0.326171875, - "learning_rate": 0.0001229574402131982, - "loss": 0.4345, + "epoch": 3.78743961352657, + "grad_norm": 0.353515625, + "learning_rate": 0.0001328983212417689, + "loss": 0.3251, "step": 2352 }, { - "epoch": 4.504780114722753, - "grad_norm": 0.359375, - "learning_rate": 0.00012250993862240227, - "loss": 0.433, + "epoch": 3.793880837359098, + "grad_norm": 0.341796875, + "learning_rate": 0.0001324631950020945, + "loss": 0.3367, "step": 2356 }, { - "epoch": 4.512428298279159, - "grad_norm": 0.34765625, - "learning_rate": 0.00012206269006329593, - "loss": 0.4293, + "epoch": 3.8003220611916264, + "grad_norm": 0.365234375, + "learning_rate": 0.0001320282183264643, + "loss": 0.3164, "step": 2360 }, { - "epoch": 4.520076481835564, - "grad_norm": 0.35546875, - "learning_rate": 0.00012161569865256896, - "loss": 0.4413, + "epoch": 3.8067632850241546, + "grad_norm": 0.353515625, + "learning_rate": 0.00013159339492461176, + "loss": 0.3584, "step": 2364 }, { - "epoch": 4.527724665391969, - "grad_norm": 0.349609375, - "learning_rate": 0.00012116896850454446, - "loss": 0.4446, + "epoch": 3.813204508856683, + "grad_norm": 0.34375, + "learning_rate": 0.00013115872850496293, + "loss": 0.3307, "step": 2368 }, { - "epoch": 4.5353728489483744, - "grad_norm": 0.345703125, - "learning_rate": 0.00012072250373114057, - "loss": 0.4642, + "epoch": 3.819645732689211, + "grad_norm": 0.33984375, + "learning_rate": 0.0001307242227746053, + "loss": 0.3475, "step": 2372 }, { - "epoch": 4.54302103250478, - "grad_norm": 0.34765625, - "learning_rate": 0.00012027630844183288, - "loss": 0.4825, + "epoch": 3.8260869565217392, + "grad_norm": 0.345703125, + "learning_rate": 0.00013028988143925553, + "loss": 0.3058, "step": 2376 }, { - "epoch": 4.550669216061186, - "grad_norm": 0.353515625, - "learning_rate": 0.00011983038674361658, - "loss": 0.4303, + "epoch": 3.8325281803542675, + "grad_norm": 0.345703125, + "learning_rate": 0.00012985570820322868, + "loss": 0.2718, "step": 2380 }, { - "epoch": 4.558317399617591, - "grad_norm": 0.34375, - "learning_rate": 0.00011938474274096844, - "loss": 0.4013, + "epoch": 3.8389694041867957, + "grad_norm": 0.333984375, + "learning_rate": 0.00012942170676940576, + "loss": 0.3074, "step": 2384 }, { - "epoch": 4.5659655831739965, - "grad_norm": 0.330078125, - "learning_rate": 0.00011893938053580933, - "loss": 0.4183, + "epoch": 3.845410628019324, + "grad_norm": 0.32421875, + "learning_rate": 0.00012898788083920282, + "loss": 0.3177, "step": 2388 }, { - "epoch": 4.573613766730402, - "grad_norm": 0.359375, - "learning_rate": 0.00011849430422746624, - "loss": 0.4345, + "epoch": 3.851851851851852, + "grad_norm": 0.341796875, + "learning_rate": 0.0001285542341125389, + "loss": 0.3012, "step": 2392 }, { - "epoch": 4.581261950286807, - "grad_norm": 0.369140625, - "learning_rate": 0.00011804951791263466, - "loss": 0.4253, + "epoch": 3.8582930756843803, + "grad_norm": 0.30859375, + "learning_rate": 0.0001281207702878049, + "loss": 0.3024, "step": 2396 }, { - "epoch": 4.588910133843212, - "grad_norm": 0.359375, - "learning_rate": 0.00011760502568534081, - "loss": 0.473, + "epoch": 3.864734299516908, + "grad_norm": 0.328125, + "learning_rate": 0.00012768749306183165, + "loss": 0.3092, "step": 2400 }, { - "epoch": 4.596558317399618, - "grad_norm": 0.34765625, - "learning_rate": 0.00011716083163690405, - "loss": 0.451, + "epoch": 3.8711755233494363, + "grad_norm": 0.34375, + "learning_rate": 0.00012725440612985868, + "loss": 0.2978, "step": 2404 }, { - "epoch": 4.604206500956023, - "grad_norm": 0.353515625, - "learning_rate": 0.00011671693985589913, - "loss": 0.4522, + "epoch": 3.8776167471819645, + "grad_norm": 0.337890625, + "learning_rate": 0.0001268215131855025, + "loss": 0.3337, "step": 2408 }, { - "epoch": 4.611854684512428, - "grad_norm": 0.361328125, - "learning_rate": 0.00011627335442811846, - "loss": 0.4193, + "epoch": 3.8840579710144927, + "grad_norm": 0.314453125, + "learning_rate": 0.00012638881792072522, + "loss": 0.3278, "step": 2412 }, { - "epoch": 4.6195028680688335, - "grad_norm": 0.349609375, - "learning_rate": 0.00011583007943653494, - "loss": 0.4616, + "epoch": 3.890499194847021, + "grad_norm": 0.318359375, + "learning_rate": 0.00012595632402580305, + "loss": 0.3051, "step": 2416 }, { - "epoch": 4.627151051625239, - "grad_norm": 0.34375, - "learning_rate": 0.00011538711896126369, - "loss": 0.4549, + "epoch": 3.896940418679549, + "grad_norm": 0.310546875, + "learning_rate": 0.00012552403518929472, + "loss": 0.2764, "step": 2420 }, { - "epoch": 4.634799235181644, - "grad_norm": 0.353515625, - "learning_rate": 0.00011494447707952514, - "loss": 0.4119, + "epoch": 3.9033816425120773, + "grad_norm": 0.322265625, + "learning_rate": 0.0001250919550980102, + "loss": 0.3124, "step": 2424 }, { - "epoch": 4.642447418738049, - "grad_norm": 0.359375, - "learning_rate": 0.0001145021578656071, - "loss": 0.4203, + "epoch": 3.9098228663446055, + "grad_norm": 0.322265625, + "learning_rate": 0.00012466008743697906, + "loss": 0.3407, "step": 2428 }, { - "epoch": 4.650095602294455, - "grad_norm": 0.345703125, - "learning_rate": 0.00011406016539082747, - "loss": 0.4199, + "epoch": 3.9162640901771337, + "grad_norm": 0.34375, + "learning_rate": 0.00012422843588941925, + "loss": 0.3336, "step": 2432 }, { - "epoch": 4.657743785850861, - "grad_norm": 0.380859375, - "learning_rate": 0.00011361850372349667, - "loss": 0.4791, + "epoch": 3.922705314009662, + "grad_norm": 0.33203125, + "learning_rate": 0.00012379700413670547, + "loss": 0.2992, "step": 2436 }, { - "epoch": 4.665391969407266, - "grad_norm": 0.3515625, - "learning_rate": 0.00011317717692888012, - "loss": 0.3904, + "epoch": 3.92914653784219, + "grad_norm": 0.32421875, + "learning_rate": 0.00012336579585833798, + "loss": 0.3341, "step": 2440 }, { - "epoch": 4.673040152963671, - "grad_norm": 0.353515625, - "learning_rate": 0.00011273618906916107, - "loss": 0.413, + "epoch": 3.9355877616747184, + "grad_norm": 0.345703125, + "learning_rate": 0.00012293481473191103, + "loss": 0.3153, "step": 2444 }, { - "epoch": 4.680688336520077, - "grad_norm": 0.365234375, - "learning_rate": 0.00011229554420340289, - "loss": 0.4078, + "epoch": 3.942028985507246, + "grad_norm": 0.322265625, + "learning_rate": 0.00012250406443308168, + "loss": 0.2993, "step": 2448 }, { - "epoch": 4.688336520076482, - "grad_norm": 0.375, - "learning_rate": 0.00011185524638751195, - "loss": 0.481, + "epoch": 3.9484702093397743, + "grad_norm": 0.328125, + "learning_rate": 0.00012207354863553825, + "loss": 0.3144, "step": 2452 }, { - "epoch": 4.695984703632887, - "grad_norm": 0.384765625, - "learning_rate": 0.0001114152996742003, - "loss": 0.4649, + "epoch": 3.9549114331723025, + "grad_norm": 0.330078125, + "learning_rate": 0.00012164327101096923, + "loss": 0.3251, "step": 2456 }, { - "epoch": 4.7036328871892925, - "grad_norm": 0.357421875, - "learning_rate": 0.00011097570811294803, - "loss": 0.4758, + "epoch": 3.9613526570048307, + "grad_norm": 0.3125, + "learning_rate": 0.00012121323522903167, + "loss": 0.2799, "step": 2460 }, { - "epoch": 4.711281070745698, - "grad_norm": 0.37109375, - "learning_rate": 0.00011053647574996648, - "loss": 0.3909, + "epoch": 3.967793880837359, + "grad_norm": 0.330078125, + "learning_rate": 0.00012078344495732028, + "loss": 0.3188, "step": 2464 }, { - "epoch": 4.718929254302103, - "grad_norm": 0.353515625, - "learning_rate": 0.0001100976066281606, - "loss": 0.3929, + "epoch": 3.974235104669887, + "grad_norm": 0.333984375, + "learning_rate": 0.00012035390386133558, + "loss": 0.3052, "step": 2468 }, { - "epoch": 4.726577437858508, - "grad_norm": 0.34765625, - "learning_rate": 0.00010965910478709206, - "loss": 0.4572, + "epoch": 3.9806763285024154, + "grad_norm": 0.3203125, + "learning_rate": 0.00011992461560445337, + "loss": 0.2771, "step": 2472 }, { - "epoch": 4.734225621414914, - "grad_norm": 0.376953125, - "learning_rate": 0.00010922097426294166, - "loss": 0.422, + "epoch": 3.9871175523349436, + "grad_norm": 0.326171875, + "learning_rate": 0.00011949558384789271, + "loss": 0.3164, "step": 2476 }, { - "epoch": 4.741873804971319, - "grad_norm": 0.3671875, - "learning_rate": 0.00010878321908847259, - "loss": 0.4397, + "epoch": 3.993558776167472, + "grad_norm": 0.337890625, + "learning_rate": 0.00011906681225068535, + "loss": 0.2902, "step": 2480 }, { - "epoch": 4.749521988527725, - "grad_norm": 0.333984375, - "learning_rate": 0.00010834584329299322, - "loss": 0.4073, + "epoch": 4.0, + "grad_norm": 0.486328125, + "learning_rate": 0.00011863830446964417, + "loss": 0.3142, "step": 2484 }, { - "epoch": 4.75717017208413, - "grad_norm": 0.3515625, - "learning_rate": 0.00010790885090231968, - "loss": 0.4209, + "epoch": 4.006441223832528, + "grad_norm": 0.259765625, + "learning_rate": 0.00011821006415933199, + "loss": 0.2147, "step": 2488 }, { - "epoch": 4.764818355640536, - "grad_norm": 0.361328125, - "learning_rate": 0.00010747224593873933, - "loss": 0.4365, + "epoch": 4.012882447665056, + "grad_norm": 0.322265625, + "learning_rate": 0.00011778209497203062, + "loss": 0.2092, "step": 2492 }, { - "epoch": 4.772466539196941, - "grad_norm": 0.33203125, - "learning_rate": 0.00010703603242097322, - "loss": 0.4213, + "epoch": 4.019323671497585, + "grad_norm": 0.3203125, + "learning_rate": 0.00011735440055770945, + "loss": 0.2548, "step": 2496 }, { - "epoch": 4.780114722753346, - "grad_norm": 0.33984375, - "learning_rate": 0.00010660021436413956, - "loss": 0.4705, + "epoch": 4.025764895330113, + "grad_norm": 0.306640625, + "learning_rate": 0.00011692698456399458, + "loss": 0.2183, "step": 2500 }, { - "epoch": 4.7877629063097515, - "grad_norm": 0.353515625, - "learning_rate": 0.00010616479577971638, - "loss": 0.4171, + "epoch": 4.032206119162641, + "grad_norm": 0.298828125, + "learning_rate": 0.0001164998506361374, + "loss": 0.2009, "step": 2504 }, { - "epoch": 4.795411089866157, - "grad_norm": 0.373046875, - "learning_rate": 0.00010572978067550489, - "loss": 0.4357, + "epoch": 4.038647342995169, + "grad_norm": 0.298828125, + "learning_rate": 0.00011607300241698387, + "loss": 0.218, "step": 2508 }, { - "epoch": 4.803059273422562, - "grad_norm": 0.359375, - "learning_rate": 0.00010529517305559244, - "loss": 0.4225, + "epoch": 4.0450885668276975, + "grad_norm": 0.326171875, + "learning_rate": 0.00011564644354694312, + "loss": 0.2201, "step": 2512 }, { - "epoch": 4.810707456978967, - "grad_norm": 0.33203125, - "learning_rate": 0.00010486097692031566, - "loss": 0.4569, + "epoch": 4.051529790660226, + "grad_norm": 0.333984375, + "learning_rate": 0.00011522017766395665, + "loss": 0.2078, "step": 2516 }, { - "epoch": 4.818355640535373, - "grad_norm": 0.357421875, - "learning_rate": 0.00010442719626622374, - "loss": 0.4801, + "epoch": 4.057971014492754, + "grad_norm": 0.296875, + "learning_rate": 0.00011479420840346706, + "loss": 0.1932, "step": 2520 }, { - "epoch": 4.826003824091778, - "grad_norm": 0.353515625, - "learning_rate": 0.0001039938350860415, - "loss": 0.4476, + "epoch": 4.064412238325282, + "grad_norm": 0.30078125, + "learning_rate": 0.00011436853939838734, + "loss": 0.2217, "step": 2524 }, { - "epoch": 4.833652007648183, - "grad_norm": 0.33203125, - "learning_rate": 0.00010356089736863282, - "loss": 0.4016, + "epoch": 4.07085346215781, + "grad_norm": 0.30859375, + "learning_rate": 0.0001139431742790696, + "loss": 0.2448, "step": 2528 }, { - "epoch": 4.8413001912045885, - "grad_norm": 0.361328125, - "learning_rate": 0.0001031283870989638, - "loss": 0.4467, + "epoch": 4.0772946859903385, + "grad_norm": 0.318359375, + "learning_rate": 0.0001135181166732743, + "loss": 0.2254, "step": 2532 }, { - "epoch": 4.848948374760994, - "grad_norm": 0.35546875, - "learning_rate": 0.00010269630825806597, - "loss": 0.4236, + "epoch": 4.083735909822867, + "grad_norm": 0.330078125, + "learning_rate": 0.00011309337020613922, + "loss": 0.2665, "step": 2536 }, { - "epoch": 4.8565965583174, - "grad_norm": 0.373046875, - "learning_rate": 0.00010226466482300006, - "loss": 0.426, + "epoch": 4.090177133655395, + "grad_norm": 0.32421875, + "learning_rate": 0.0001126689385001486, + "loss": 0.2365, "step": 2540 }, { - "epoch": 4.864244741873805, - "grad_norm": 0.35546875, - "learning_rate": 0.00010183346076681882, - "loss": 0.4452, + "epoch": 4.096618357487923, + "grad_norm": 0.3125, + "learning_rate": 0.00011224482517510224, + "loss": 0.2341, "step": 2544 }, { - "epoch": 4.871892925430211, - "grad_norm": 0.353515625, - "learning_rate": 0.00010140270005853098, - "loss": 0.4182, + "epoch": 4.1030595813204505, + "grad_norm": 0.30078125, + "learning_rate": 0.00011182103384808444, + "loss": 0.2015, "step": 2548 }, { - "epoch": 4.879541108986616, - "grad_norm": 0.37109375, - "learning_rate": 0.00010097238666306427, - "loss": 0.4035, + "epoch": 4.109500805152979, + "grad_norm": 0.318359375, + "learning_rate": 0.00011139756813343359, + "loss": 0.2334, "step": 2552 }, { - "epoch": 4.887189292543021, - "grad_norm": 0.36328125, - "learning_rate": 0.00010054252454122934, - "loss": 0.4187, + "epoch": 4.115942028985507, + "grad_norm": 0.341796875, + "learning_rate": 0.00011097443164271075, + "loss": 0.246, "step": 2556 }, { - "epoch": 4.894837476099426, - "grad_norm": 0.376953125, - "learning_rate": 0.000100113117649683, - "loss": 0.4494, + "epoch": 4.122383252818035, + "grad_norm": 0.34375, + "learning_rate": 0.00011055162798466948, + "loss": 0.2322, "step": 2560 }, { - "epoch": 4.902485659655832, - "grad_norm": 0.361328125, - "learning_rate": 9.968416994089189e-05, - "loss": 0.4461, + "epoch": 4.128824476650563, + "grad_norm": 0.333984375, + "learning_rate": 0.00011012916076522443, + "loss": 0.2178, "step": 2564 }, { - "epoch": 4.910133843212237, - "grad_norm": 0.359375, - "learning_rate": 9.925568536309619e-05, - "loss": 0.4589, + "epoch": 4.1352657004830915, + "grad_norm": 0.291015625, + "learning_rate": 0.00010970703358742127, + "loss": 0.2147, "step": 2568 }, { - "epoch": 4.917782026768642, - "grad_norm": 0.337890625, - "learning_rate": 9.88276678602731e-05, - "loss": 0.4329, + "epoch": 4.14170692431562, + "grad_norm": 0.333984375, + "learning_rate": 0.00010928525005140521, + "loss": 0.2315, "step": 2572 }, { - "epoch": 4.925430210325048, - "grad_norm": 0.35546875, - "learning_rate": 9.840012137210072e-05, - "loss": 0.4048, + "epoch": 4.148148148148148, + "grad_norm": 0.33203125, + "learning_rate": 0.00010886381375439105, + "loss": 0.2284, "step": 2576 }, { - "epoch": 4.933078393881453, - "grad_norm": 0.40234375, - "learning_rate": 9.797304983392164e-05, - "loss": 0.4689, + "epoch": 4.154589371980676, + "grad_norm": 0.3203125, + "learning_rate": 0.0001084427282906318, + "loss": 0.2568, "step": 2580 }, { - "epoch": 4.940726577437858, - "grad_norm": 0.37890625, - "learning_rate": 9.75464571767068e-05, - "loss": 0.4538, + "epoch": 4.161030595813204, + "grad_norm": 0.314453125, + "learning_rate": 0.00010802199725138869, + "loss": 0.2163, "step": 2584 }, { - "epoch": 4.948374760994264, - "grad_norm": 0.349609375, - "learning_rate": 9.712034732701942e-05, - "loss": 0.4251, + "epoch": 4.1674718196457325, + "grad_norm": 0.3203125, + "learning_rate": 0.00010760162422489987, + "loss": 0.2267, "step": 2588 }, { - "epoch": 4.95602294455067, - "grad_norm": 0.3671875, - "learning_rate": 9.669472420697845e-05, - "loss": 0.3808, + "epoch": 4.173913043478261, + "grad_norm": 0.3359375, + "learning_rate": 0.00010718161279635048, + "loss": 0.2263, "step": 2592 }, { - "epoch": 4.963671128107075, + "epoch": 4.180354267310789, "grad_norm": 0.33984375, - "learning_rate": 9.626959173422306e-05, - "loss": 0.4249, + "learning_rate": 0.00010676196654784144, + "loss": 0.2395, "step": 2596 }, { - "epoch": 4.97131931166348, - "grad_norm": 0.36328125, - "learning_rate": 9.5844953821876e-05, - "loss": 0.4491, + "epoch": 4.186795491143317, + "grad_norm": 0.328125, + "learning_rate": 0.00010634268905835949, + "loss": 0.2454, "step": 2600 }, { - "epoch": 4.9789674952198855, - "grad_norm": 0.33203125, - "learning_rate": 9.542081437850801e-05, - "loss": 0.3934, + "epoch": 4.193236714975845, + "grad_norm": 0.310546875, + "learning_rate": 0.00010592378390374612, + "loss": 0.2186, "step": 2604 }, { - "epoch": 4.986615678776291, - "grad_norm": 0.3515625, - "learning_rate": 9.49971773081017e-05, - "loss": 0.4589, + "epoch": 4.199677938808374, + "grad_norm": 0.33984375, + "learning_rate": 0.00010550525465666751, + "loss": 0.2302, "step": 2608 }, { - "epoch": 4.994263862332696, - "grad_norm": 0.3671875, - "learning_rate": 9.457404651001546e-05, - "loss": 0.4418, + "epoch": 4.206119162640902, + "grad_norm": 0.328125, + "learning_rate": 0.00010508710488658385, + "loss": 0.2475, "step": 2612 }, { - "epoch": 5.001912045889101, - "grad_norm": 0.31640625, - "learning_rate": 9.415142587894786e-05, - "loss": 0.3326, + "epoch": 4.21256038647343, + "grad_norm": 0.314453125, + "learning_rate": 0.00010466933815971884, + "loss": 0.1988, "step": 2616 }, { - "epoch": 5.009560229445507, - "grad_norm": 0.314453125, - "learning_rate": 9.372931930490147e-05, - "loss": 0.3716, + "epoch": 4.219001610305958, + "grad_norm": 0.32421875, + "learning_rate": 0.00010425195803902948, + "loss": 0.2137, "step": 2620 }, { - "epoch": 5.017208413001912, - "grad_norm": 0.341796875, - "learning_rate": 9.330773067314747e-05, - "loss": 0.4067, + "epoch": 4.225442834138486, + "grad_norm": 0.345703125, + "learning_rate": 0.00010383496808417547, + "loss": 0.2564, "step": 2624 }, { - "epoch": 5.024856596558317, - "grad_norm": 0.345703125, - "learning_rate": 9.28866638641894e-05, - "loss": 0.3533, + "epoch": 4.231884057971015, + "grad_norm": 0.314453125, + "learning_rate": 0.00010341837185148903, + "loss": 0.2361, "step": 2628 }, { - "epoch": 5.0325047801147225, - "grad_norm": 0.3359375, - "learning_rate": 9.246612275372786e-05, - "loss": 0.3446, + "epoch": 4.238325281803543, + "grad_norm": 0.314453125, + "learning_rate": 0.00010300217289394443, + "loss": 0.2324, "step": 2632 }, { - "epoch": 5.040152963671128, - "grad_norm": 0.353515625, - "learning_rate": 9.204611121262466e-05, - "loss": 0.391, + "epoch": 4.244766505636071, + "grad_norm": 0.302734375, + "learning_rate": 0.00010258637476112782, + "loss": 0.2175, "step": 2636 }, { - "epoch": 5.047801147227533, - "grad_norm": 0.3515625, - "learning_rate": 9.16266331068671e-05, - "loss": 0.3821, + "epoch": 4.251207729468599, + "grad_norm": 0.3203125, + "learning_rate": 0.00010217098099920676, + "loss": 0.2533, "step": 2640 }, { - "epoch": 5.055449330783939, - "grad_norm": 0.35546875, - "learning_rate": 9.120769229753262e-05, - "loss": 0.3813, + "epoch": 4.2576489533011275, + "grad_norm": 0.31640625, + "learning_rate": 0.00010175599515090026, + "loss": 0.2155, "step": 2644 }, { - "epoch": 5.0630975143403445, - "grad_norm": 0.361328125, - "learning_rate": 9.078929264075293e-05, - "loss": 0.3493, + "epoch": 4.264090177133656, + "grad_norm": 0.3203125, + "learning_rate": 0.00010134142075544824, + "loss": 0.2299, "step": 2648 }, { - "epoch": 5.07074569789675, - "grad_norm": 0.3515625, - "learning_rate": 9.0371437987679e-05, - "loss": 0.3346, + "epoch": 4.270531400966184, + "grad_norm": 0.3359375, + "learning_rate": 0.00010092726134858168, + "loss": 0.2776, "step": 2652 }, { - "epoch": 5.078393881453155, - "grad_norm": 0.35546875, - "learning_rate": 8.995413218444502e-05, - "loss": 0.3753, + "epoch": 4.276972624798712, + "grad_norm": 0.345703125, + "learning_rate": 0.00010051352046249213, + "loss": 0.2079, "step": 2656 }, { - "epoch": 5.08604206500956, - "grad_norm": 0.33984375, - "learning_rate": 8.953737907213346e-05, - "loss": 0.3749, + "epoch": 4.28341384863124, + "grad_norm": 0.328125, + "learning_rate": 0.00010010020162580192, + "loss": 0.198, "step": 2660 }, { - "epoch": 5.093690248565966, - "grad_norm": 0.3359375, - "learning_rate": 8.912118248673966e-05, - "loss": 0.3673, + "epoch": 4.2898550724637685, + "grad_norm": 0.328125, + "learning_rate": 9.96873083635337e-05, + "loss": 0.223, "step": 2664 }, { - "epoch": 5.101338432122371, - "grad_norm": 0.34765625, - "learning_rate": 8.870554625913619e-05, - "loss": 0.3591, + "epoch": 4.296296296296296, + "grad_norm": 0.33984375, + "learning_rate": 9.927484419708076e-05, + "loss": 0.187, "step": 2668 }, { - "epoch": 5.108986615678776, - "grad_norm": 0.369140625, - "learning_rate": 8.8290474215038e-05, - "loss": 0.387, + "epoch": 4.302737520128824, + "grad_norm": 0.33984375, + "learning_rate": 9.88628126441768e-05, + "loss": 0.2339, "step": 2672 }, { - "epoch": 5.1166347992351815, - "grad_norm": 0.359375, - "learning_rate": 8.787597017496687e-05, - "loss": 0.3774, + "epoch": 4.309178743961352, + "grad_norm": 0.328125, + "learning_rate": 9.84512172188657e-05, + "loss": 0.2164, "step": 2676 }, { - "epoch": 5.124282982791587, - "grad_norm": 0.330078125, - "learning_rate": 8.74620379542166e-05, - "loss": 0.3711, + "epoch": 4.3156199677938805, + "grad_norm": 0.30859375, + "learning_rate": 9.804006143147212e-05, + "loss": 0.2328, "step": 2680 }, { - "epoch": 5.131931166347992, - "grad_norm": 0.33203125, - "learning_rate": 8.704868136281742e-05, - "loss": 0.3733, + "epoch": 4.322061191626409, + "grad_norm": 0.322265625, + "learning_rate": 9.762934878857105e-05, + "loss": 0.2577, "step": 2684 }, { - "epoch": 5.139579349904397, - "grad_norm": 0.36328125, - "learning_rate": 8.663590420550145e-05, - "loss": 0.3483, + "epoch": 4.328502415458937, + "grad_norm": 0.3203125, + "learning_rate": 9.721908279295812e-05, + "loss": 0.2256, "step": 2688 }, { - "epoch": 5.147227533460803, - "grad_norm": 0.369140625, - "learning_rate": 8.622371028166743e-05, - "loss": 0.3773, + "epoch": 4.334943639291465, + "grad_norm": 0.361328125, + "learning_rate": 9.680926694361964e-05, + "loss": 0.2344, "step": 2692 }, { - "epoch": 5.154875717017209, - "grad_norm": 0.3671875, - "learning_rate": 8.581210338534538e-05, - "loss": 0.3921, + "epoch": 4.341384863123993, + "grad_norm": 0.31640625, + "learning_rate": 9.639990473570294e-05, + "loss": 0.2238, "step": 2696 }, { - "epoch": 5.162523900573614, - "grad_norm": 0.365234375, - "learning_rate": 8.540108730516248e-05, - "loss": 0.355, + "epoch": 4.3478260869565215, + "grad_norm": 0.30859375, + "learning_rate": 9.599099966048627e-05, + "loss": 0.1847, "step": 2700 }, { - "epoch": 5.170172084130019, - "grad_norm": 0.34375, - "learning_rate": 8.499066582430748e-05, - "loss": 0.3805, + "epoch": 4.35426731078905, + "grad_norm": 0.353515625, + "learning_rate": 9.558255520534937e-05, + "loss": 0.2451, "step": 2704 }, { - "epoch": 5.177820267686425, - "grad_norm": 0.333984375, - "learning_rate": 8.45808427204962e-05, - "loss": 0.3575, + "epoch": 4.360708534621578, + "grad_norm": 0.328125, + "learning_rate": 9.517457485374336e-05, + "loss": 0.2112, "step": 2708 }, { - "epoch": 5.18546845124283, - "grad_norm": 0.373046875, - "learning_rate": 8.417162176593686e-05, - "loss": 0.3948, + "epoch": 4.367149758454106, + "grad_norm": 0.30078125, + "learning_rate": 9.476706208516138e-05, + "loss": 0.2048, "step": 2712 }, { - "epoch": 5.193116634799235, - "grad_norm": 0.345703125, - "learning_rate": 8.376300672729504e-05, - "loss": 0.3883, + "epoch": 4.373590982286634, + "grad_norm": 0.32421875, + "learning_rate": 9.43600203751086e-05, + "loss": 0.2036, "step": 2716 }, { - "epoch": 5.2007648183556405, - "grad_norm": 0.35546875, - "learning_rate": 8.335500136565919e-05, - "loss": 0.4049, + "epoch": 4.3800322061191626, + "grad_norm": 0.31640625, + "learning_rate": 9.395345319507287e-05, + "loss": 0.2125, "step": 2720 }, { - "epoch": 5.208413001912046, - "grad_norm": 0.376953125, - "learning_rate": 8.294760943650605e-05, - "loss": 0.3689, + "epoch": 4.386473429951691, + "grad_norm": 0.328125, + "learning_rate": 9.354736401249486e-05, + "loss": 0.2199, "step": 2724 }, { - "epoch": 5.216061185468451, - "grad_norm": 0.384765625, - "learning_rate": 8.254083468966612e-05, - "loss": 0.3568, + "epoch": 4.392914653784219, + "grad_norm": 0.341796875, + "learning_rate": 9.31417562907387e-05, + "loss": 0.2064, "step": 2728 }, { - "epoch": 5.223709369024856, - "grad_norm": 0.361328125, - "learning_rate": 8.213468086928891e-05, - "loss": 0.3961, + "epoch": 4.399355877616747, + "grad_norm": 0.3046875, + "learning_rate": 9.273663348906222e-05, + "loss": 0.2183, "step": 2732 }, { - "epoch": 5.231357552581262, - "grad_norm": 0.3515625, - "learning_rate": 8.172915171380863e-05, - "loss": 0.3587, + "epoch": 4.405797101449275, + "grad_norm": 0.318359375, + "learning_rate": 9.233199906258766e-05, + "loss": 0.2639, "step": 2736 }, { - "epoch": 5.239005736137667, - "grad_norm": 0.357421875, - "learning_rate": 8.132425095590999e-05, - "loss": 0.407, + "epoch": 4.412238325281804, + "grad_norm": 0.3515625, + "learning_rate": 9.192785646227217e-05, + "loss": 0.251, "step": 2740 }, { - "epoch": 5.246653919694072, - "grad_norm": 0.369140625, - "learning_rate": 8.091998232249325e-05, - "loss": 0.4085, + "epoch": 4.418679549114332, + "grad_norm": 0.35546875, + "learning_rate": 9.152420913487814e-05, + "loss": 0.2386, "step": 2744 }, { - "epoch": 5.254302103250478, - "grad_norm": 0.3671875, - "learning_rate": 8.051634953464069e-05, - "loss": 0.366, + "epoch": 4.42512077294686, + "grad_norm": 0.33984375, + "learning_rate": 9.112106052294418e-05, + "loss": 0.217, "step": 2748 }, { - "epoch": 5.261950286806884, - "grad_norm": 0.341796875, - "learning_rate": 8.011335630758169e-05, - "loss": 0.3572, + "epoch": 4.431561996779388, + "grad_norm": 0.337890625, + "learning_rate": 9.071841406475539e-05, + "loss": 0.2102, "step": 2752 }, { - "epoch": 5.269598470363289, - "grad_norm": 0.392578125, - "learning_rate": 7.971100635065894e-05, - "loss": 0.4136, + "epoch": 4.438003220611916, + "grad_norm": 0.34765625, + "learning_rate": 9.03162731943144e-05, + "loss": 0.2282, "step": 2756 }, { - "epoch": 5.277246653919694, - "grad_norm": 0.3671875, - "learning_rate": 7.930930336729406e-05, - "loss": 0.3311, + "epoch": 4.444444444444445, + "grad_norm": 0.302734375, + "learning_rate": 8.991464134131166e-05, + "loss": 0.2395, "step": 2760 }, { - "epoch": 5.2848948374761, - "grad_norm": 0.357421875, - "learning_rate": 7.890825105495376e-05, - "loss": 0.351, + "epoch": 4.450885668276973, + "grad_norm": 0.33203125, + "learning_rate": 8.951352193109673e-05, + "loss": 0.2379, "step": 2764 }, { - "epoch": 5.292543021032505, - "grad_norm": 0.341796875, - "learning_rate": 7.850785310511555e-05, - "loss": 0.3403, + "epoch": 4.457326892109501, + "grad_norm": 0.318359375, + "learning_rate": 8.911291838464838e-05, + "loss": 0.2319, "step": 2768 }, { - "epoch": 5.30019120458891, + "epoch": 4.463768115942029, "grad_norm": 0.3359375, - "learning_rate": 7.810811320323386e-05, - "loss": 0.2974, + "learning_rate": 8.871283411854619e-05, + "loss": 0.2066, "step": 2772 }, { - "epoch": 5.307839388145315, - "grad_norm": 0.36328125, - "learning_rate": 7.770903502870625e-05, - "loss": 0.3635, + "epoch": 4.4702093397745575, + "grad_norm": 0.328125, + "learning_rate": 8.831327254494066e-05, + "loss": 0.2086, "step": 2776 }, { - "epoch": 5.315487571701721, - "grad_norm": 0.341796875, - "learning_rate": 7.731062225483933e-05, - "loss": 0.3669, + "epoch": 4.476650563607086, + "grad_norm": 0.33984375, + "learning_rate": 8.791423707152482e-05, + "loss": 0.2454, "step": 2780 }, { - "epoch": 5.323135755258126, - "grad_norm": 0.330078125, - "learning_rate": 7.6912878548815e-05, - "loss": 0.3823, + "epoch": 4.483091787439614, + "grad_norm": 0.31640625, + "learning_rate": 8.751573110150443e-05, + "loss": 0.2254, "step": 2784 }, { - "epoch": 5.330783938814531, - "grad_norm": 0.37890625, - "learning_rate": 7.651580757165691e-05, - "loss": 0.3644, + "epoch": 4.489533011272142, + "grad_norm": 0.3359375, + "learning_rate": 8.711775803356971e-05, + "loss": 0.2446, "step": 2788 }, { - "epoch": 5.338432122370937, - "grad_norm": 0.361328125, - "learning_rate": 7.611941297819643e-05, - "loss": 0.3505, + "epoch": 4.49597423510467, + "grad_norm": 0.3125, + "learning_rate": 8.672032126186566e-05, + "loss": 0.2154, "step": 2792 }, { - "epoch": 5.346080305927342, - "grad_norm": 0.361328125, - "learning_rate": 7.572369841703924e-05, - "loss": 0.3691, + "epoch": 4.5024154589371985, + "grad_norm": 0.318359375, + "learning_rate": 8.632342417596365e-05, + "loss": 0.2269, "step": 2796 }, { - "epoch": 5.353728489483748, - "grad_norm": 0.357421875, - "learning_rate": 7.532866753053159e-05, - "loss": 0.3229, + "epoch": 4.508856682769727, + "grad_norm": 0.326171875, + "learning_rate": 8.592707016083221e-05, + "loss": 0.2134, "step": 2800 }, { - "epoch": 5.361376673040153, - "grad_norm": 0.39453125, - "learning_rate": 7.493432395472711e-05, - "loss": 0.3849, + "epoch": 4.515297906602254, + "grad_norm": 0.359375, + "learning_rate": 8.553126259680828e-05, + "loss": 0.25, "step": 2804 }, { - "epoch": 5.369024856596559, - "grad_norm": 0.375, - "learning_rate": 7.454067131935269e-05, - "loss": 0.4105, + "epoch": 4.521739130434782, + "grad_norm": 0.3359375, + "learning_rate": 8.513600485956835e-05, + "loss": 0.2262, "step": 2808 }, { - "epoch": 5.376673040152964, - "grad_norm": 0.369140625, - "learning_rate": 7.414771324777579e-05, - "loss": 0.3686, + "epoch": 4.5281803542673105, + "grad_norm": 0.322265625, + "learning_rate": 8.474130032009951e-05, + "loss": 0.2062, "step": 2812 }, { - "epoch": 5.384321223709369, - "grad_norm": 0.357421875, - "learning_rate": 7.375545335697085e-05, - "loss": 0.3293, + "epoch": 4.534621578099839, + "grad_norm": 0.349609375, + "learning_rate": 8.434715234467123e-05, + "loss": 0.2293, "step": 2816 }, { - "epoch": 5.3919694072657744, - "grad_norm": 0.3828125, - "learning_rate": 7.336389525748548e-05, - "loss": 0.3892, + "epoch": 4.541062801932367, + "grad_norm": 0.337890625, + "learning_rate": 8.395356429480587e-05, + "loss": 0.204, "step": 2820 }, { - "epoch": 5.39961759082218, - "grad_norm": 0.3984375, - "learning_rate": 7.29730425534081e-05, - "loss": 0.3562, + "epoch": 4.547504025764895, + "grad_norm": 0.326171875, + "learning_rate": 8.356053952725072e-05, + "loss": 0.2207, "step": 2824 }, { - "epoch": 5.407265774378585, - "grad_norm": 0.376953125, - "learning_rate": 7.258289884233417e-05, - "loss": 0.363, + "epoch": 4.553945249597423, + "grad_norm": 0.3359375, + "learning_rate": 8.316808139394876e-05, + "loss": 0.2231, "step": 2828 }, { - "epoch": 5.41491395793499, - "grad_norm": 0.35546875, - "learning_rate": 7.21934677153332e-05, - "loss": 0.3517, + "epoch": 4.5603864734299515, + "grad_norm": 0.34765625, + "learning_rate": 8.277619324201081e-05, + "loss": 0.2321, "step": 2832 }, { - "epoch": 5.422562141491396, - "grad_norm": 0.353515625, - "learning_rate": 7.180475275691573e-05, - "loss": 0.39, + "epoch": 4.56682769726248, + "grad_norm": 0.330078125, + "learning_rate": 8.238487841368617e-05, + "loss": 0.2298, "step": 2836 }, { - "epoch": 5.430210325047801, - "grad_norm": 0.376953125, - "learning_rate": 7.141675754500049e-05, - "loss": 0.3898, + "epoch": 4.573268921095008, + "grad_norm": 0.33203125, + "learning_rate": 8.199414024633473e-05, + "loss": 0.1997, "step": 2840 }, { - "epoch": 5.437858508604206, - "grad_norm": 0.396484375, - "learning_rate": 7.102948565088116e-05, - "loss": 0.4363, + "epoch": 4.579710144927536, + "grad_norm": 0.3828125, + "learning_rate": 8.160398207239805e-05, + "loss": 0.2359, "step": 2844 }, { - "epoch": 5.4455066921606115, - "grad_norm": 0.373046875, - "learning_rate": 7.064294063919368e-05, - "loss": 0.3494, + "epoch": 4.586151368760064, + "grad_norm": 0.345703125, + "learning_rate": 8.121440721937157e-05, + "loss": 0.216, "step": 2848 }, { - "epoch": 5.453154875717018, - "grad_norm": 0.353515625, - "learning_rate": 7.025712606788362e-05, - "loss": 0.3457, + "epoch": 4.592592592592593, + "grad_norm": 0.349609375, + "learning_rate": 8.082541900977542e-05, + "loss": 0.2374, "step": 2852 }, { - "epoch": 5.460803059273423, - "grad_norm": 0.369140625, - "learning_rate": 6.987204548817278e-05, - "loss": 0.3279, + "epoch": 4.599033816425121, + "grad_norm": 0.328125, + "learning_rate": 8.04370207611267e-05, + "loss": 0.2363, "step": 2856 }, { - "epoch": 5.468451242829828, - "grad_norm": 0.353515625, - "learning_rate": 6.948770244452737e-05, - "loss": 0.3591, + "epoch": 4.605475040257649, + "grad_norm": 0.361328125, + "learning_rate": 8.004921578591091e-05, + "loss": 0.214, "step": 2860 }, { - "epoch": 5.4760994263862335, - "grad_norm": 0.365234375, - "learning_rate": 6.910410047462495e-05, - "loss": 0.3488, + "epoch": 4.611916264090177, + "grad_norm": 0.30859375, + "learning_rate": 7.966200739155389e-05, + "loss": 0.2214, "step": 2864 }, { - "epoch": 5.483747609942639, - "grad_norm": 0.3671875, - "learning_rate": 6.87212431093215e-05, - "loss": 0.3466, + "epoch": 4.618357487922705, + "grad_norm": 0.337890625, + "learning_rate": 7.927539888039339e-05, + "loss": 0.2431, "step": 2868 }, { - "epoch": 5.491395793499044, - "grad_norm": 0.34765625, - "learning_rate": 6.833913387261973e-05, - "loss": 0.4094, + "epoch": 4.624798711755234, + "grad_norm": 0.31640625, + "learning_rate": 7.888939354965093e-05, + "loss": 0.2104, "step": 2872 }, { - "epoch": 5.499043977055449, - "grad_norm": 0.375, - "learning_rate": 6.795777628163599e-05, - "loss": 0.3803, + "epoch": 4.631239935587762, + "grad_norm": 0.314453125, + "learning_rate": 7.850399469140393e-05, + "loss": 0.204, "step": 2876 }, { - "epoch": 5.506692160611855, - "grad_norm": 0.361328125, - "learning_rate": 6.757717384656817e-05, - "loss": 0.3832, + "epoch": 4.63768115942029, + "grad_norm": 0.353515625, + "learning_rate": 7.811920559255736e-05, + "loss": 0.2263, "step": 2880 }, { - "epoch": 5.51434034416826, - "grad_norm": 0.37109375, - "learning_rate": 6.719733007066331e-05, - "loss": 0.3575, + "epoch": 4.644122383252818, + "grad_norm": 0.3359375, + "learning_rate": 7.773502953481585e-05, + "loss": 0.2161, "step": 2884 }, { - "epoch": 5.521988527724665, - "grad_norm": 0.357421875, - "learning_rate": 6.68182484501855e-05, - "loss": 0.3435, + "epoch": 4.650563607085346, + "grad_norm": 0.330078125, + "learning_rate": 7.73514697946556e-05, + "loss": 0.2279, "step": 2888 }, { - "epoch": 5.5296367112810705, - "grad_norm": 0.359375, - "learning_rate": 6.643993247438347e-05, - "loss": 0.3674, + "epoch": 4.657004830917875, + "grad_norm": 0.361328125, + "learning_rate": 7.696852964329655e-05, + "loss": 0.2615, "step": 2892 }, { - "epoch": 5.537284894837476, - "grad_norm": 0.361328125, - "learning_rate": 6.606238562545859e-05, - "loss": 0.3507, + "epoch": 4.663446054750403, + "grad_norm": 0.33203125, + "learning_rate": 7.658621234667443e-05, + "loss": 0.2407, "step": 2896 }, { - "epoch": 5.544933078393882, - "grad_norm": 0.357421875, - "learning_rate": 6.568561137853296e-05, - "loss": 0.3555, + "epoch": 4.669887278582931, + "grad_norm": 0.326171875, + "learning_rate": 7.620452116541291e-05, + "loss": 0.2101, "step": 2900 }, { - "epoch": 5.552581261950287, - "grad_norm": 0.37890625, - "learning_rate": 6.530961320161712e-05, - "loss": 0.3786, + "epoch": 4.676328502415459, + "grad_norm": 0.322265625, + "learning_rate": 7.582345935479569e-05, + "loss": 0.2191, "step": 2904 }, { - "epoch": 5.5602294455066925, - "grad_norm": 0.373046875, - "learning_rate": 6.493439455557835e-05, - "loss": 0.4043, + "epoch": 4.6827697262479875, + "grad_norm": 0.302734375, + "learning_rate": 7.544303016473894e-05, + "loss": 0.2159, "step": 2908 }, { - "epoch": 5.567877629063098, - "grad_norm": 0.3671875, - "learning_rate": 6.455995889410873e-05, - "loss": 0.4086, + "epoch": 4.689210950080515, + "grad_norm": 0.32421875, + "learning_rate": 7.506323683976344e-05, + "loss": 0.2251, "step": 2912 }, { - "epoch": 5.575525812619503, - "grad_norm": 0.365234375, - "learning_rate": 6.418630966369348e-05, - "loss": 0.366, + "epoch": 4.695652173913043, + "grad_norm": 0.328125, + "learning_rate": 7.468408261896701e-05, + "loss": 0.1935, "step": 2916 }, { - "epoch": 5.583173996175908, + "epoch": 4.702093397745571, "grad_norm": 0.341796875, - "learning_rate": 6.381345030357899e-05, - "loss": 0.3801, + "learning_rate": 7.430557073599662e-05, + "loss": 0.2123, "step": 2920 }, { - "epoch": 5.590822179732314, - "grad_norm": 0.3984375, - "learning_rate": 6.344138424574134e-05, - "loss": 0.3985, + "epoch": 4.708534621578099, + "grad_norm": 0.384765625, + "learning_rate": 7.392770441902116e-05, + "loss": 0.2466, "step": 2924 }, { - "epoch": 5.598470363288719, - "grad_norm": 0.390625, - "learning_rate": 6.307011491485484e-05, - "loss": 0.4266, + "epoch": 4.714975845410628, + "grad_norm": 0.337890625, + "learning_rate": 7.355048689070389e-05, + "loss": 0.2332, "step": 2928 }, { - "epoch": 5.606118546845124, - "grad_norm": 0.359375, - "learning_rate": 6.269964572826001e-05, - "loss": 0.3317, + "epoch": 4.721417069243156, + "grad_norm": 0.349609375, + "learning_rate": 7.317392136817453e-05, + "loss": 0.2364, "step": 2932 }, { - "epoch": 5.6137667304015295, - "grad_norm": 0.384765625, - "learning_rate": 6.232998009593275e-05, - "loss": 0.3663, + "epoch": 4.727858293075684, + "grad_norm": 0.37109375, + "learning_rate": 7.279801106300231e-05, + "loss": 0.2662, "step": 2936 }, { - "epoch": 5.621414913957935, - "grad_norm": 0.392578125, - "learning_rate": 6.196112142045268e-05, - "loss": 0.3804, + "epoch": 4.734299516908212, + "grad_norm": 0.31640625, + "learning_rate": 7.242275918116832e-05, + "loss": 0.2174, "step": 2940 }, { - "epoch": 5.62906309751434, - "grad_norm": 0.380859375, - "learning_rate": 6.159307309697149e-05, - "loss": 0.3782, + "epoch": 4.7407407407407405, + "grad_norm": 0.34765625, + "learning_rate": 7.204816892303833e-05, + "loss": 0.2135, "step": 2944 }, { - "epoch": 5.636711281070745, - "grad_norm": 0.400390625, - "learning_rate": 6.122583851318233e-05, - "loss": 0.4047, + "epoch": 4.747181964573269, + "grad_norm": 0.359375, + "learning_rate": 7.16742434833352e-05, + "loss": 0.231, "step": 2948 }, { - "epoch": 5.644359464627151, + "epoch": 4.753623188405797, "grad_norm": 0.34765625, - "learning_rate": 6.085942104928815e-05, - "loss": 0.3875, + "learning_rate": 7.1300986051112e-05, + "loss": 0.2569, "step": 2952 }, { - "epoch": 5.652007648183556, - "grad_norm": 0.388671875, - "learning_rate": 6.049382407797076e-05, - "loss": 0.3649, + "epoch": 4.760064412238325, + "grad_norm": 0.314453125, + "learning_rate": 7.09283998097246e-05, + "loss": 0.2072, "step": 2956 }, { - "epoch": 5.659655831739962, - "grad_norm": 0.3359375, - "learning_rate": 6.012905096435968e-05, - "loss": 0.3813, + "epoch": 4.766505636070853, + "grad_norm": 0.32421875, + "learning_rate": 7.055648793680466e-05, + "loss": 0.2059, "step": 2960 }, { - "epoch": 5.667304015296367, - "grad_norm": 0.396484375, - "learning_rate": 5.976510506600146e-05, - "loss": 0.3955, + "epoch": 4.7729468599033815, + "grad_norm": 0.34765625, + "learning_rate": 7.018525360423217e-05, + "loss": 0.2429, "step": 2964 }, { - "epoch": 5.674952198852773, - "grad_norm": 0.38671875, - "learning_rate": 5.9401989732828384e-05, - "loss": 0.3574, + "epoch": 4.77938808373591, + "grad_norm": 0.330078125, + "learning_rate": 6.981469997810892e-05, + "loss": 0.2203, "step": 2968 }, { - "epoch": 5.682600382409178, - "grad_norm": 0.349609375, - "learning_rate": 5.9039708307127816e-05, - "loss": 0.333, + "epoch": 4.785829307568438, + "grad_norm": 0.3359375, + "learning_rate": 6.944483021873115e-05, + "loss": 0.232, "step": 2972 }, { - "epoch": 5.690248565965583, - "grad_norm": 0.361328125, - "learning_rate": 5.8678264123511626e-05, - "loss": 0.3856, + "epoch": 4.792270531400966, + "grad_norm": 0.318359375, + "learning_rate": 6.907564748056273e-05, + "loss": 0.2124, "step": 2976 }, { - "epoch": 5.6978967495219885, - "grad_norm": 0.345703125, - "learning_rate": 5.8317660508885e-05, - "loss": 0.3931, + "epoch": 4.798711755233494, + "grad_norm": 0.353515625, + "learning_rate": 6.870715491220808e-05, + "loss": 0.2184, "step": 2980 }, { - "epoch": 5.705544933078394, - "grad_norm": 0.375, - "learning_rate": 5.795790078241641e-05, - "loss": 0.3364, + "epoch": 4.805152979066023, + "grad_norm": 0.31640625, + "learning_rate": 6.833935565638559e-05, + "loss": 0.238, "step": 2984 }, { - "epoch": 5.713193116634799, - "grad_norm": 0.349609375, - "learning_rate": 5.7598988255506644e-05, - "loss": 0.3825, + "epoch": 4.811594202898551, + "grad_norm": 0.333984375, + "learning_rate": 6.797225284990064e-05, + "loss": 0.2283, "step": 2988 }, { - "epoch": 5.720841300191204, - "grad_norm": 0.376953125, - "learning_rate": 5.724092623175841e-05, - "loss": 0.3654, + "epoch": 4.818035426731079, + "grad_norm": 0.33203125, + "learning_rate": 6.760584962361888e-05, + "loss": 0.2351, "step": 2992 }, { - "epoch": 5.72848948374761, - "grad_norm": 0.373046875, - "learning_rate": 5.6883718006946146e-05, - "loss": 0.4175, + "epoch": 4.824476650563607, + "grad_norm": 0.298828125, + "learning_rate": 6.72401491024396e-05, + "loss": 0.2019, "step": 2996 }, { - "epoch": 5.736137667304015, - "grad_norm": 0.392578125, - "learning_rate": 5.652736686898537e-05, - "loss": 0.369, + "epoch": 4.830917874396135, + "grad_norm": 0.373046875, + "learning_rate": 6.687515440526882e-05, + "loss": 0.242, "step": 3000 }, { - "epoch": 5.743785850860421, - "grad_norm": 0.3671875, - "learning_rate": 5.61718760979026e-05, - "loss": 0.3961, + "epoch": 4.837359098228664, + "grad_norm": 0.341796875, + "learning_rate": 6.651086864499305e-05, + "loss": 0.2196, "step": 3004 }, { - "epoch": 5.751434034416826, - "grad_norm": 0.353515625, - "learning_rate": 5.5817248965805096e-05, - "loss": 0.3622, + "epoch": 4.843800322061192, + "grad_norm": 0.365234375, + "learning_rate": 6.614729492845258e-05, + "loss": 0.2146, "step": 3008 }, { - "epoch": 5.759082217973232, - "grad_norm": 0.36328125, - "learning_rate": 5.546348873685089e-05, - "loss": 0.3369, + "epoch": 4.85024154589372, + "grad_norm": 0.375, + "learning_rate": 6.578443635641497e-05, + "loss": 0.2232, "step": 3012 }, { - "epoch": 5.766730401529637, - "grad_norm": 0.357421875, - "learning_rate": 5.51105986672185e-05, - "loss": 0.3314, + "epoch": 4.856682769726248, + "grad_norm": 0.35546875, + "learning_rate": 6.542229602354847e-05, + "loss": 0.2319, "step": 3016 }, { - "epoch": 5.774378585086042, - "grad_norm": 0.337890625, - "learning_rate": 5.475858200507708e-05, - "loss": 0.3212, + "epoch": 4.8631239935587764, + "grad_norm": 0.353515625, + "learning_rate": 6.506087701839593e-05, + "loss": 0.2156, "step": 3020 }, { - "epoch": 5.782026768642448, - "grad_norm": 0.375, - "learning_rate": 5.440744199055663e-05, - "loss": 0.3978, + "epoch": 4.869565217391305, + "grad_norm": 0.330078125, + "learning_rate": 6.470018242334825e-05, + "loss": 0.2372, "step": 3024 }, { - "epoch": 5.789674952198853, - "grad_norm": 0.37890625, - "learning_rate": 5.4057181855718e-05, - "loss": 0.3938, + "epoch": 4.876006441223833, + "grad_norm": 0.3203125, + "learning_rate": 6.434021531461818e-05, + "loss": 0.2077, "step": 3028 }, { - "epoch": 5.797323135755258, - "grad_norm": 0.369140625, - "learning_rate": 5.370780482452317e-05, - "loss": 0.3613, + "epoch": 4.882447665056361, + "grad_norm": 0.345703125, + "learning_rate": 6.398097876221385e-05, + "loss": 0.2183, "step": 3032 }, { - "epoch": 5.804971319311663, - "grad_norm": 0.365234375, - "learning_rate": 5.335931411280559e-05, - "loss": 0.3717, + "epoch": 4.888888888888889, + "grad_norm": 0.33984375, + "learning_rate": 6.362247582991317e-05, + "loss": 0.2104, "step": 3036 }, { - "epoch": 5.812619502868069, - "grad_norm": 0.345703125, - "learning_rate": 5.3011712928240787e-05, - "loss": 0.3572, + "epoch": 4.8953301127214175, + "grad_norm": 0.33203125, + "learning_rate": 6.326470957523686e-05, + "loss": 0.2048, "step": 3040 }, { - "epoch": 5.820267686424474, - "grad_norm": 0.359375, - "learning_rate": 5.2665004470316456e-05, - "loss": 0.3993, + "epoch": 4.901771336553946, + "grad_norm": 0.361328125, + "learning_rate": 6.29076830494232e-05, + "loss": 0.2346, "step": 3044 }, { - "epoch": 5.827915869980879, - "grad_norm": 0.365234375, - "learning_rate": 5.231919193030324e-05, - "loss": 0.3708, + "epoch": 4.908212560386474, + "grad_norm": 0.34765625, + "learning_rate": 6.255139929740129e-05, + "loss": 0.2068, "step": 3048 }, { - "epoch": 5.835564053537285, - "grad_norm": 0.37890625, - "learning_rate": 5.197427849122549e-05, - "loss": 0.3877, + "epoch": 4.914653784219001, + "grad_norm": 0.353515625, + "learning_rate": 6.219586135776575e-05, + "loss": 0.239, "step": 3052 }, { - "epoch": 5.84321223709369, - "grad_norm": 0.34375, - "learning_rate": 5.1630267327831494e-05, - "loss": 0.3747, + "epoch": 4.921095008051529, + "grad_norm": 0.30078125, + "learning_rate": 6.184107226275038e-05, + "loss": 0.1814, "step": 3056 }, { - "epoch": 5.850860420650095, - "grad_norm": 0.373046875, - "learning_rate": 5.128716160656489e-05, - "loss": 0.4003, + "epoch": 4.927536231884058, + "grad_norm": 0.34765625, + "learning_rate": 6.148703503820224e-05, + "loss": 0.2272, "step": 3060 }, { - "epoch": 5.858508604206501, - "grad_norm": 0.3515625, - "learning_rate": 5.0944964485534975e-05, - "loss": 0.3654, + "epoch": 4.933977455716586, + "grad_norm": 0.349609375, + "learning_rate": 6.113375270355617e-05, + "loss": 0.2418, "step": 3064 }, { - "epoch": 5.866156787762907, - "grad_norm": 0.39453125, - "learning_rate": 5.06036791144879e-05, - "loss": 0.3837, + "epoch": 4.940418679549114, + "grad_norm": 0.357421875, + "learning_rate": 6.078122827180879e-05, + "loss": 0.2723, "step": 3068 }, { - "epoch": 5.873804971319312, - "grad_norm": 0.40625, - "learning_rate": 5.0263308634777745e-05, - "loss": 0.368, + "epoch": 4.946859903381642, + "grad_norm": 0.328125, + "learning_rate": 6.042946474949302e-05, + "loss": 0.2407, "step": 3072 }, { - "epoch": 5.881453154875717, - "grad_norm": 0.376953125, - "learning_rate": 4.992385617933734e-05, - "loss": 0.3975, + "epoch": 4.9533011272141705, + "grad_norm": 0.3203125, + "learning_rate": 6.007846513665207e-05, + "loss": 0.2153, "step": 3076 }, { - "epoch": 5.8891013384321225, - "grad_norm": 0.365234375, - "learning_rate": 4.958532487264968e-05, - "loss": 0.3837, + "epoch": 4.959742351046699, + "grad_norm": 0.330078125, + "learning_rate": 5.972823242681426e-05, + "loss": 0.2206, "step": 3080 }, { - "epoch": 5.896749521988528, - "grad_norm": 0.333984375, - "learning_rate": 4.924771783071895e-05, - "loss": 0.3744, + "epoch": 4.966183574879227, + "grad_norm": 0.361328125, + "learning_rate": 5.937876960696727e-05, + "loss": 0.2105, "step": 3084 }, { - "epoch": 5.904397705544933, - "grad_norm": 0.357421875, - "learning_rate": 4.8911038161042136e-05, - "loss": 0.3439, + "epoch": 4.972624798711755, + "grad_norm": 0.365234375, + "learning_rate": 5.903007965753279e-05, + "loss": 0.2526, "step": 3088 }, { - "epoch": 5.912045889101338, - "grad_norm": 0.37890625, - "learning_rate": 4.857528896258012e-05, - "loss": 0.4041, + "epoch": 4.979066022544283, + "grad_norm": 0.310546875, + "learning_rate": 5.868216555234081e-05, + "loss": 0.2168, "step": 3092 }, { - "epoch": 5.919694072657744, - "grad_norm": 0.375, - "learning_rate": 4.824047332572924e-05, - "loss": 0.3753, + "epoch": 4.9855072463768115, + "grad_norm": 0.349609375, + "learning_rate": 5.833503025860469e-05, + "loss": 0.2174, "step": 3096 }, { - "epoch": 5.927342256214149, - "grad_norm": 0.359375, - "learning_rate": 4.7906594332293116e-05, - "loss": 0.3822, + "epoch": 4.99194847020934, + "grad_norm": 0.322265625, + "learning_rate": 5.798867673689553e-05, + "loss": 0.2365, "step": 3100 }, { - "epoch": 5.934990439770554, - "grad_norm": 0.3515625, - "learning_rate": 4.75736550554537e-05, - "loss": 0.3535, + "epoch": 4.998389694041868, + "grad_norm": 0.328125, + "learning_rate": 5.764310794111711e-05, + "loss": 0.1766, "step": 3104 }, { - "epoch": 5.9426386233269595, - "grad_norm": 0.35546875, - "learning_rate": 4.724165855974367e-05, - "loss": 0.377, + "epoch": 5.004830917874396, + "grad_norm": 0.28125, + "learning_rate": 5.7298326818480427e-05, + "loss": 0.1998, "step": 3108 }, { - "epoch": 5.950286806883366, - "grad_norm": 0.365234375, - "learning_rate": 4.6910607901017715e-05, - "loss": 0.359, + "epoch": 5.011272141706924, + "grad_norm": 0.267578125, + "learning_rate": 5.695433630947894e-05, + "loss": 0.1838, "step": 3112 }, { - "epoch": 5.957934990439771, - "grad_norm": 0.36328125, - "learning_rate": 4.65805061264246e-05, - "loss": 0.3415, + "epoch": 5.017713365539453, + "grad_norm": 0.328125, + "learning_rate": 5.661113934786321e-05, + "loss": 0.2045, "step": 3116 }, { - "epoch": 5.965583173996176, - "grad_norm": 0.390625, - "learning_rate": 4.625135627437922e-05, - "loss": 0.3968, + "epoch": 5.024154589371981, + "grad_norm": 0.28125, + "learning_rate": 5.626873886061597e-05, + "loss": 0.1917, "step": 3120 }, { - "epoch": 5.9732313575525815, - "grad_norm": 0.357421875, - "learning_rate": 4.592316137453439e-05, - "loss": 0.3983, + "epoch": 5.030595813204509, + "grad_norm": 0.314453125, + "learning_rate": 5.592713776792723e-05, + "loss": 0.204, "step": 3124 }, { - "epoch": 5.980879541108987, - "grad_norm": 0.357421875, - "learning_rate": 4.559592444775315e-05, - "loss": 0.3466, + "epoch": 5.037037037037037, + "grad_norm": 0.2734375, + "learning_rate": 5.5586338983169076e-05, + "loss": 0.1471, "step": 3128 }, { - "epoch": 5.988527724665392, - "grad_norm": 0.3515625, - "learning_rate": 4.5269648506080816e-05, - "loss": 0.3442, + "epoch": 5.043478260869565, + "grad_norm": 0.318359375, + "learning_rate": 5.52463454128714e-05, + "loss": 0.1966, "step": 3132 }, { - "epoch": 5.996175908221797, - "grad_norm": 0.375, - "learning_rate": 4.4944336552717514e-05, - "loss": 0.3768, + "epoch": 5.049919484702094, + "grad_norm": 0.2890625, + "learning_rate": 5.490715995669641e-05, + "loss": 0.1782, "step": 3136 }, { - "epoch": 6.003824091778203, - "grad_norm": 0.3359375, - "learning_rate": 4.461999158199019e-05, - "loss": 0.3533, + "epoch": 5.056360708534622, + "grad_norm": 0.318359375, + "learning_rate": 5.456878550741453e-05, + "loss": 0.1877, "step": 3140 }, { - "epoch": 6.011472275334608, - "grad_norm": 0.34765625, - "learning_rate": 4.429661657932523e-05, - "loss": 0.3247, + "epoch": 5.06280193236715, + "grad_norm": 0.291015625, + "learning_rate": 5.423122495087915e-05, + "loss": 0.1643, "step": 3144 }, { - "epoch": 6.019120458891013, - "grad_norm": 0.34375, - "learning_rate": 4.397421452122114e-05, - "loss": 0.329, + "epoch": 5.069243156199678, + "grad_norm": 0.298828125, + "learning_rate": 5.3894481166002674e-05, + "loss": 0.1792, "step": 3148 }, { - "epoch": 6.0267686424474185, - "grad_norm": 0.36328125, - "learning_rate": 4.3652788375220787e-05, - "loss": 0.3547, + "epoch": 5.0756843800322065, + "grad_norm": 0.310546875, + "learning_rate": 5.355855702473125e-05, + "loss": 0.1567, "step": 3152 }, { - "epoch": 6.034416826003824, - "grad_norm": 0.361328125, - "learning_rate": 4.333234109988434e-05, - "loss": 0.4026, + "epoch": 5.082125603864735, + "grad_norm": 0.341796875, + "learning_rate": 5.322345539202086e-05, + "loss": 0.2051, "step": 3156 }, { - "epoch": 6.042065009560229, - "grad_norm": 0.3828125, - "learning_rate": 4.3012875644761955e-05, - "loss": 0.3534, + "epoch": 5.088566827697263, + "grad_norm": 0.326171875, + "learning_rate": 5.288917912581257e-05, + "loss": 0.1754, "step": 3160 }, { - "epoch": 6.049713193116634, - "grad_norm": 0.36328125, - "learning_rate": 4.269439495036678e-05, - "loss": 0.3035, + "epoch": 5.095008051529791, + "grad_norm": 0.2890625, + "learning_rate": 5.255573107700832e-05, + "loss": 0.1824, "step": 3164 }, { - "epoch": 6.0573613766730405, - "grad_norm": 0.365234375, - "learning_rate": 4.2376901948147465e-05, - "loss": 0.3553, + "epoch": 5.101449275362318, + "grad_norm": 0.34765625, + "learning_rate": 5.222311408944635e-05, + "loss": 0.2092, "step": 3168 }, { - "epoch": 6.065009560229446, - "grad_norm": 0.369140625, - "learning_rate": 4.206039956046176e-05, - "loss": 0.3602, + "epoch": 5.107890499194847, + "grad_norm": 0.28515625, + "learning_rate": 5.189133099987731e-05, + "loss": 0.146, "step": 3172 }, { - "epoch": 6.072657743785851, - "grad_norm": 0.349609375, - "learning_rate": 4.174489070054927e-05, - "loss": 0.3606, + "epoch": 5.114331723027375, + "grad_norm": 0.28515625, + "learning_rate": 5.156038463793981e-05, + "loss": 0.1692, "step": 3176 }, { - "epoch": 6.080305927342256, - "grad_norm": 0.349609375, - "learning_rate": 4.143037827250447e-05, - "loss": 0.3499, + "epoch": 5.120772946859903, + "grad_norm": 0.291015625, + "learning_rate": 5.123027782613636e-05, + "loss": 0.1877, "step": 3180 }, { - "epoch": 6.087954110898662, - "grad_norm": 0.349609375, - "learning_rate": 4.1116865171250496e-05, - "loss": 0.3447, + "epoch": 5.127214170692431, + "grad_norm": 0.30078125, + "learning_rate": 5.09010133798094e-05, + "loss": 0.154, "step": 3184 }, { - "epoch": 6.095602294455067, - "grad_norm": 0.357421875, - "learning_rate": 4.0804354282512016e-05, - "loss": 0.3541, + "epoch": 5.1336553945249594, + "grad_norm": 0.310546875, + "learning_rate": 5.0572594107116974e-05, + "loss": 0.1559, "step": 3188 }, { - "epoch": 6.103250478011472, - "grad_norm": 0.3515625, - "learning_rate": 4.049284848278886e-05, - "loss": 0.3638, + "epoch": 5.140096618357488, + "grad_norm": 0.318359375, + "learning_rate": 5.0245022809009155e-05, + "loss": 0.171, "step": 3192 }, { - "epoch": 6.1108986615678775, - "grad_norm": 0.373046875, - "learning_rate": 4.01823506393297e-05, - "loss": 0.362, + "epoch": 5.146537842190016, + "grad_norm": 0.294921875, + "learning_rate": 4.991830227920398e-05, + "loss": 0.1774, "step": 3196 }, { - "epoch": 6.118546845124283, - "grad_norm": 0.3828125, - "learning_rate": 3.987286361010531e-05, - "loss": 0.3367, + "epoch": 5.152979066022544, + "grad_norm": 0.283203125, + "learning_rate": 4.9592435304163675e-05, + "loss": 0.1813, "step": 3200 }, { - "epoch": 6.126195028680688, - "grad_norm": 0.361328125, - "learning_rate": 3.9564390243782516e-05, - "loss": 0.3249, + "epoch": 5.159420289855072, + "grad_norm": 0.302734375, + "learning_rate": 4.926742466307069e-05, + "loss": 0.1557, "step": 3204 }, { - "epoch": 6.133843212237093, - "grad_norm": 0.396484375, - "learning_rate": 3.925693337969788e-05, - "loss": 0.3328, + "epoch": 5.1658615136876005, + "grad_norm": 0.283203125, + "learning_rate": 4.8943273127804345e-05, + "loss": 0.1574, "step": 3208 }, { - "epoch": 6.141491395793499, - "grad_norm": 0.35546875, - "learning_rate": 3.895049584783173e-05, - "loss": 0.3746, + "epoch": 5.172302737520129, + "grad_norm": 0.306640625, + "learning_rate": 4.8619983462916935e-05, + "loss": 0.1548, "step": 3212 }, { - "epoch": 6.149139579349905, - "grad_norm": 0.34765625, - "learning_rate": 3.8645080468781676e-05, - "loss": 0.3702, + "epoch": 5.178743961352657, + "grad_norm": 0.318359375, + "learning_rate": 4.829755842561025e-05, + "loss": 0.1888, "step": 3216 }, { - "epoch": 6.15678776290631, - "grad_norm": 0.337890625, - "learning_rate": 3.834069005373724e-05, - "loss": 0.3136, + "epoch": 5.185185185185185, + "grad_norm": 0.2734375, + "learning_rate": 4.797600076571194e-05, + "loss": 0.2004, "step": 3220 }, { - "epoch": 6.164435946462715, - "grad_norm": 0.357421875, - "learning_rate": 3.8037327404453634e-05, - "loss": 0.3542, + "epoch": 5.191626409017713, + "grad_norm": 0.2890625, + "learning_rate": 4.7655313225652294e-05, + "loss": 0.1587, "step": 3224 }, { - "epoch": 6.172084130019121, - "grad_norm": 0.369140625, - "learning_rate": 3.77349953132258e-05, - "loss": 0.3572, + "epoch": 5.1980676328502415, + "grad_norm": 0.296875, + "learning_rate": 4.7335498540440606e-05, + "loss": 0.1669, "step": 3228 }, { - "epoch": 6.179732313575526, - "grad_norm": 0.37109375, - "learning_rate": 3.7433696562863215e-05, - "loss": 0.3526, + "epoch": 5.20450885668277, + "grad_norm": 0.3359375, + "learning_rate": 4.7016559437642084e-05, + "loss": 0.171, "step": 3232 }, { - "epoch": 6.187380497131931, - "grad_norm": 0.365234375, - "learning_rate": 3.7133433926663805e-05, - "loss": 0.331, + "epoch": 5.210950080515298, + "grad_norm": 0.30859375, + "learning_rate": 4.6698498637354225e-05, + "loss": 0.1566, "step": 3236 }, { - "epoch": 6.195028680688337, - "grad_norm": 0.376953125, - "learning_rate": 3.6834210168388674e-05, - "loss": 0.3489, + "epoch": 5.217391304347826, + "grad_norm": 0.294921875, + "learning_rate": 4.6381318852184194e-05, + "loss": 0.1936, "step": 3240 }, { - "epoch": 6.202676864244742, - "grad_norm": 0.359375, - "learning_rate": 3.653602804223656e-05, - "loss": 0.3329, + "epoch": 5.223832528180354, + "grad_norm": 0.3359375, + "learning_rate": 4.606502278722503e-05, + "loss": 0.1897, "step": 3244 }, { - "epoch": 6.210325047801147, - "grad_norm": 0.34765625, - "learning_rate": 3.623889029281861e-05, - "loss": 0.2646, + "epoch": 5.230273752012883, + "grad_norm": 0.34375, + "learning_rate": 4.574961314003304e-05, + "loss": 0.1935, "step": 3248 }, { - "epoch": 6.217973231357552, - "grad_norm": 0.373046875, - "learning_rate": 3.5942799655132925e-05, - "loss": 0.3547, + "epoch": 5.236714975845411, + "grad_norm": 0.291015625, + "learning_rate": 4.5435092600604676e-05, + "loss": 0.159, "step": 3252 }, { - "epoch": 6.225621414913958, - "grad_norm": 0.369140625, - "learning_rate": 3.56477588545395e-05, - "loss": 0.3547, + "epoch": 5.243156199677939, + "grad_norm": 0.296875, + "learning_rate": 4.5121463851353476e-05, + "loss": 0.2065, "step": 3256 }, { - "epoch": 6.233269598470363, - "grad_norm": 0.341796875, - "learning_rate": 3.535377060673524e-05, - "loss": 0.3205, + "epoch": 5.249597423510467, + "grad_norm": 0.322265625, + "learning_rate": 4.48087295670874e-05, + "loss": 0.1993, "step": 3260 }, { - "epoch": 6.240917782026768, - "grad_norm": 0.361328125, - "learning_rate": 3.506083761772871e-05, - "loss": 0.3647, + "epoch": 5.256038647342995, + "grad_norm": 0.326171875, + "learning_rate": 4.449689241498569e-05, + "loss": 0.1717, "step": 3264 }, { - "epoch": 6.248565965583174, - "grad_norm": 0.33984375, - "learning_rate": 3.476896258381537e-05, - "loss": 0.3138, + "epoch": 5.262479871175524, + "grad_norm": 0.306640625, + "learning_rate": 4.41859550545765e-05, + "loss": 0.1907, "step": 3268 }, { - "epoch": 6.25621414913958, - "grad_norm": 0.3828125, - "learning_rate": 3.447814819155291e-05, - "loss": 0.3071, + "epoch": 5.268921095008052, + "grad_norm": 0.287109375, + "learning_rate": 4.387592013771396e-05, + "loss": 0.1691, "step": 3272 }, { - "epoch": 6.263862332695985, - "grad_norm": 0.349609375, - "learning_rate": 3.418839711773623e-05, - "loss": 0.3353, + "epoch": 5.27536231884058, + "grad_norm": 0.298828125, + "learning_rate": 4.356679030855573e-05, + "loss": 0.198, "step": 3276 }, { - "epoch": 6.27151051625239, - "grad_norm": 0.384765625, - "learning_rate": 3.389971202937295e-05, - "loss": 0.3497, + "epoch": 5.281803542673108, + "grad_norm": 0.314453125, + "learning_rate": 4.32585682035402e-05, + "loss": 0.2026, "step": 3280 }, { - "epoch": 6.279158699808796, - "grad_norm": 0.3359375, - "learning_rate": 3.361209558365883e-05, - "loss": 0.3125, + "epoch": 5.2882447665056365, + "grad_norm": 0.326171875, + "learning_rate": 4.2951256451364264e-05, + "loss": 0.1975, "step": 3284 }, { - "epoch": 6.286806883365201, - "grad_norm": 0.365234375, - "learning_rate": 3.332555042795349e-05, - "loss": 0.3575, + "epoch": 5.294685990338165, + "grad_norm": 0.283203125, + "learning_rate": 4.264485767296081e-05, + "loss": 0.1686, "step": 3288 }, { - "epoch": 6.294455066921606, - "grad_norm": 0.359375, - "learning_rate": 3.304007919975563e-05, - "loss": 0.3554, + "epoch": 5.301127214170693, + "grad_norm": 0.310546875, + "learning_rate": 4.233937448147635e-05, + "loss": 0.1583, "step": 3292 }, { - "epoch": 6.3021032504780115, - "grad_norm": 0.34375, - "learning_rate": 3.2755684526679196e-05, - "loss": 0.3427, + "epoch": 5.30756843800322, + "grad_norm": 0.296875, + "learning_rate": 4.203480948224866e-05, + "loss": 0.1777, "step": 3296 }, { - "epoch": 6.309751434034417, - "grad_norm": 0.35546875, - "learning_rate": 3.247236902642905e-05, - "loss": 0.3648, + "epoch": 5.314009661835748, + "grad_norm": 0.322265625, + "learning_rate": 4.173116527278471e-05, + "loss": 0.1616, "step": 3300 }, { - "epoch": 6.317399617590822, - "grad_norm": 0.375, - "learning_rate": 3.219013530677655e-05, - "loss": 0.3269, + "epoch": 5.320450885668277, + "grad_norm": 0.31640625, + "learning_rate": 4.142844444273845e-05, + "loss": 0.1731, "step": 3304 }, { - "epoch": 6.325047801147227, + "epoch": 5.326892109500805, "grad_norm": 0.349609375, - "learning_rate": 3.190898596553615e-05, - "loss": 0.3503, + "learning_rate": 4.1126649573888696e-05, + "loss": 0.2219, "step": 3308 }, { - "epoch": 6.332695984703633, - "grad_norm": 0.33984375, - "learning_rate": 3.162892359054098e-05, - "loss": 0.3239, + "epoch": 5.333333333333333, + "grad_norm": 0.298828125, + "learning_rate": 4.082578324011716e-05, + "loss": 0.1937, "step": 3312 }, { - "epoch": 6.340344168260038, - "grad_norm": 0.376953125, - "learning_rate": 3.1349950759619255e-05, - "loss": 0.3253, + "epoch": 5.339774557165861, + "grad_norm": 0.33203125, + "learning_rate": 4.052584800738636e-05, + "loss": 0.1891, "step": 3316 }, { - "epoch": 6.347992351816444, - "grad_norm": 0.361328125, - "learning_rate": 3.107207004057046e-05, - "loss": 0.3642, + "epoch": 5.3462157809983895, + "grad_norm": 0.306640625, + "learning_rate": 4.0226846433717954e-05, + "loss": 0.1811, "step": 3320 }, { - "epoch": 6.355640535372849, - "grad_norm": 0.349609375, - "learning_rate": 3.079528399114189e-05, - "loss": 0.3058, + "epoch": 5.352657004830918, + "grad_norm": 0.279296875, + "learning_rate": 3.992878106917079e-05, + "loss": 0.1768, "step": 3324 }, { - "epoch": 6.363288718929255, - "grad_norm": 0.375, - "learning_rate": 3.0519595159004853e-05, - "loss": 0.3705, + "epoch": 5.359098228663446, + "grad_norm": 0.318359375, + "learning_rate": 3.963165445581922e-05, + "loss": 0.1985, "step": 3328 }, { - "epoch": 6.37093690248566, - "grad_norm": 0.345703125, - "learning_rate": 3.0245006081731367e-05, - "loss": 0.3444, + "epoch": 5.365539452495974, + "grad_norm": 0.318359375, + "learning_rate": 3.933546912773119e-05, + "loss": 0.154, "step": 3332 }, { - "epoch": 6.378585086042065, - "grad_norm": 0.37109375, - "learning_rate": 2.9971519286770883e-05, - "loss": 0.3181, + "epoch": 5.371980676328502, + "grad_norm": 0.2734375, + "learning_rate": 3.904022761094715e-05, + "loss": 0.1788, "step": 3336 }, { - "epoch": 6.3862332695984705, - "grad_norm": 0.359375, - "learning_rate": 2.969913729142668e-05, - "loss": 0.3519, + "epoch": 5.3784219001610305, + "grad_norm": 0.3125, + "learning_rate": 3.874593242345785e-05, + "loss": 0.1964, "step": 3340 }, { - "epoch": 6.393881453154876, - "grad_norm": 0.34765625, - "learning_rate": 2.9427862602833165e-05, - "loss": 0.3261, + "epoch": 5.384863123993559, + "grad_norm": 0.306640625, + "learning_rate": 3.845258607518344e-05, + "loss": 0.1822, "step": 3344 }, { - "epoch": 6.401529636711281, - "grad_norm": 0.373046875, - "learning_rate": 2.915769771793256e-05, - "loss": 0.3554, + "epoch": 5.391304347826087, + "grad_norm": 0.306640625, + "learning_rate": 3.816019106795157e-05, + "loss": 0.1711, "step": 3348 }, { - "epoch": 6.409177820267686, - "grad_norm": 0.337890625, - "learning_rate": 2.8888645123451694e-05, - "loss": 0.3119, + "epoch": 5.397745571658615, + "grad_norm": 0.283203125, + "learning_rate": 3.7868749895476624e-05, + "loss": 0.1785, "step": 3352 }, { - "epoch": 6.416826003824092, - "grad_norm": 0.357421875, - "learning_rate": 2.862070729587959e-05, - "loss": 0.3576, + "epoch": 5.404186795491143, + "grad_norm": 0.361328125, + "learning_rate": 3.7578265043337834e-05, + "loss": 0.1891, "step": 3356 }, { - "epoch": 6.424474187380497, - "grad_norm": 0.3671875, - "learning_rate": 2.8353886701444312e-05, - "loss": 0.3464, + "epoch": 5.4106280193236715, + "grad_norm": 0.29296875, + "learning_rate": 3.72887389889586e-05, + "loss": 0.1766, "step": 3360 }, { - "epoch": 6.432122370936902, - "grad_norm": 0.345703125, - "learning_rate": 2.808818579609037e-05, - "loss": 0.3362, + "epoch": 5.4170692431562, + "grad_norm": 0.322265625, + "learning_rate": 3.700017420158486e-05, + "loss": 0.1733, "step": 3364 }, { - "epoch": 6.4397705544933075, - "grad_norm": 0.359375, - "learning_rate": 2.7823607025456103e-05, - "loss": 0.3556, + "epoch": 5.423510466988728, + "grad_norm": 0.287109375, + "learning_rate": 3.671257314226471e-05, + "loss": 0.1895, "step": 3368 }, { - "epoch": 6.447418738049713, - "grad_norm": 0.353515625, - "learning_rate": 2.7560152824851285e-05, - "loss": 0.2955, + "epoch": 5.429951690821256, + "grad_norm": 0.287109375, + "learning_rate": 3.642593826382663e-05, + "loss": 0.1867, "step": 3372 }, { - "epoch": 6.455066921606119, - "grad_norm": 0.349609375, - "learning_rate": 2.7297825619234515e-05, - "loss": 0.3064, + "epoch": 5.436392914653784, + "grad_norm": 0.33984375, + "learning_rate": 3.6140272010859166e-05, + "loss": 0.1946, "step": 3376 }, { - "epoch": 6.462715105162524, - "grad_norm": 0.373046875, - "learning_rate": 2.7036627823190994e-05, - "loss": 0.3553, + "epoch": 5.442834138486313, + "grad_norm": 0.314453125, + "learning_rate": 3.585557681968979e-05, + "loss": 0.1684, "step": 3380 }, { - "epoch": 6.4703632887189295, - "grad_norm": 0.34375, - "learning_rate": 2.6776561840910367e-05, - "loss": 0.3141, + "epoch": 5.449275362318841, + "grad_norm": 0.298828125, + "learning_rate": 3.5571855118364236e-05, + "loss": 0.1886, "step": 3384 }, { - "epoch": 6.478011472275335, - "grad_norm": 0.3515625, - "learning_rate": 2.6517630066164448e-05, - "loss": 0.3746, + "epoch": 5.455716586151369, + "grad_norm": 0.314453125, + "learning_rate": 3.528910932662577e-05, + "loss": 0.199, "step": 3388 }, { - "epoch": 6.48565965583174, - "grad_norm": 0.365234375, - "learning_rate": 2.6259834882285302e-05, - "loss": 0.3467, + "epoch": 5.462157809983897, + "grad_norm": 0.291015625, + "learning_rate": 3.5007341855894394e-05, + "loss": 0.1877, "step": 3392 }, { - "epoch": 6.493307839388145, - "grad_norm": 0.359375, - "learning_rate": 2.6003178662143214e-05, - "loss": 0.3257, + "epoch": 5.468599033816425, + "grad_norm": 0.337890625, + "learning_rate": 3.472655510924656e-05, + "loss": 0.1856, "step": 3396 }, { - "epoch": 6.500956022944551, - "grad_norm": 0.36328125, - "learning_rate": 2.574766376812502e-05, - "loss": 0.3231, + "epoch": 5.475040257648954, + "grad_norm": 0.3203125, + "learning_rate": 3.4446751481394516e-05, + "loss": 0.1818, "step": 3400 }, { - "epoch": 6.508604206500956, - "grad_norm": 0.333984375, - "learning_rate": 2.5493292552112128e-05, - "loss": 0.3102, + "epoch": 5.481481481481482, + "grad_norm": 0.34375, + "learning_rate": 3.4167933358665936e-05, + "loss": 0.2009, "step": 3404 }, { - "epoch": 6.516252390057361, - "grad_norm": 0.35546875, - "learning_rate": 2.5240067355458978e-05, - "loss": 0.33, + "epoch": 5.48792270531401, + "grad_norm": 0.306640625, + "learning_rate": 3.3890103118983366e-05, + "loss": 0.1824, "step": 3408 }, { - "epoch": 6.5239005736137665, - "grad_norm": 0.373046875, - "learning_rate": 2.4987990508971667e-05, - "loss": 0.3529, + "epoch": 5.494363929146537, + "grad_norm": 0.34375, + "learning_rate": 3.3613263131844294e-05, + "loss": 0.1746, "step": 3412 }, { - "epoch": 6.531548757170172, - "grad_norm": 0.375, - "learning_rate": 2.4737064332886055e-05, - "loss": 0.3285, + "epoch": 5.500805152979066, + "grad_norm": 0.30078125, + "learning_rate": 3.333741575830069e-05, + "loss": 0.1769, "step": 3416 }, { - "epoch": 6.539196940726577, - "grad_norm": 0.361328125, - "learning_rate": 2.4487291136846894e-05, - "loss": 0.3835, + "epoch": 5.507246376811594, + "grad_norm": 0.341796875, + "learning_rate": 3.306256335093898e-05, + "loss": 0.1822, "step": 3420 }, { - "epoch": 6.546845124282983, - "grad_norm": 0.353515625, - "learning_rate": 2.4238673219886384e-05, - "loss": 0.3476, + "epoch": 5.513687600644122, + "grad_norm": 0.302734375, + "learning_rate": 3.278870825385983e-05, + "loss": 0.1925, "step": 3424 }, { - "epoch": 6.5544933078393885, - "grad_norm": 0.369140625, - "learning_rate": 2.399121287040275e-05, - "loss": 0.3209, + "epoch": 5.52012882447665, + "grad_norm": 0.328125, + "learning_rate": 3.251585280265839e-05, + "loss": 0.1923, "step": 3428 }, { - "epoch": 6.562141491395794, - "grad_norm": 0.3671875, - "learning_rate": 2.3744912366139644e-05, - "loss": 0.3498, + "epoch": 5.526570048309178, + "grad_norm": 0.3125, + "learning_rate": 3.224399932440419e-05, + "loss": 0.1815, "step": 3432 }, { - "epoch": 6.569789674952199, - "grad_norm": 0.359375, - "learning_rate": 2.3499773974164825e-05, - "loss": 0.348, + "epoch": 5.533011272141707, + "grad_norm": 0.318359375, + "learning_rate": 3.1973150137621364e-05, + "loss": 0.1738, "step": 3436 }, { - "epoch": 6.577437858508604, - "grad_norm": 0.3671875, - "learning_rate": 2.325579995084946e-05, - "loss": 0.3783, + "epoch": 5.539452495974235, + "grad_norm": 0.296875, + "learning_rate": 3.170330755226893e-05, + "loss": 0.191, "step": 3440 }, { - "epoch": 6.58508604206501, - "grad_norm": 0.359375, - "learning_rate": 2.3012992541847254e-05, - "loss": 0.3855, + "epoch": 5.545893719806763, + "grad_norm": 0.30078125, + "learning_rate": 3.1434473869720804e-05, + "loss": 0.1538, "step": 3444 }, { - "epoch": 6.592734225621415, - "grad_norm": 0.34765625, - "learning_rate": 2.277135398207393e-05, - "loss": 0.3163, + "epoch": 5.552334943639291, + "grad_norm": 0.306640625, + "learning_rate": 3.116665138274676e-05, + "loss": 0.1748, "step": 3448 }, { - "epoch": 6.60038240917782, - "grad_norm": 0.365234375, - "learning_rate": 2.2530886495686506e-05, - "loss": 0.3601, + "epoch": 5.5587761674718195, + "grad_norm": 0.330078125, + "learning_rate": 3.0899842375492145e-05, + "loss": 0.1893, "step": 3452 }, { - "epoch": 6.6080305927342256, - "grad_norm": 0.396484375, - "learning_rate": 2.229159229606281e-05, - "loss": 0.3425, + "epoch": 5.565217391304348, + "grad_norm": 0.328125, + "learning_rate": 3.063404912345897e-05, + "loss": 0.1727, "step": 3456 }, { - "epoch": 6.615678776290631, - "grad_norm": 0.38671875, - "learning_rate": 2.2053473585781377e-05, - "loss": 0.324, + "epoch": 5.571658615136876, + "grad_norm": 0.31640625, + "learning_rate": 3.036927389348625e-05, + "loss": 0.1804, "step": 3460 }, { - "epoch": 6.623326959847036, - "grad_norm": 0.37890625, - "learning_rate": 2.181653255660072e-05, - "loss": 0.3386, + "epoch": 5.578099838969404, + "grad_norm": 0.283203125, + "learning_rate": 3.010551894373075e-05, + "loss": 0.1778, "step": 3464 }, { - "epoch": 6.630975143403441, - "grad_norm": 0.359375, - "learning_rate": 2.1580771389439612e-05, - "loss": 0.3434, + "epoch": 5.584541062801932, + "grad_norm": 0.3125, + "learning_rate": 2.9842786523647582e-05, + "loss": 0.1679, "step": 3468 }, { - "epoch": 6.638623326959847, - "grad_norm": 0.34765625, - "learning_rate": 2.1346192254356737e-05, - "loss": 0.331, + "epoch": 5.5909822866344605, + "grad_norm": 0.291015625, + "learning_rate": 2.9581078873971248e-05, + "loss": 0.1812, "step": 3472 }, { - "epoch": 6.646271510516252, - "grad_norm": 0.365234375, - "learning_rate": 2.1112797310530716e-05, - "loss": 0.3483, + "epoch": 5.597423510466989, + "grad_norm": 0.31640625, + "learning_rate": 2.9320398226696367e-05, + "loss": 0.188, "step": 3476 }, { - "epoch": 6.653919694072657, - "grad_norm": 0.376953125, - "learning_rate": 2.08805887062405e-05, - "loss": 0.3241, + "epoch": 5.603864734299517, + "grad_norm": 0.31640625, + "learning_rate": 2.9060746805058738e-05, + "loss": 0.1541, "step": 3480 }, { - "epoch": 6.661567877629063, - "grad_norm": 0.365234375, - "learning_rate": 2.0649568578845205e-05, - "loss": 0.3269, + "epoch": 5.610305958132045, + "grad_norm": 0.34375, + "learning_rate": 2.8802126823516193e-05, + "loss": 0.1671, "step": 3484 }, { - "epoch": 6.669216061185469, - "grad_norm": 0.359375, - "learning_rate": 2.0419739054764743e-05, - "loss": 0.3636, + "epoch": 5.616747181964573, + "grad_norm": 0.30078125, + "learning_rate": 2.8544540487729984e-05, + "loss": 0.1609, "step": 3488 }, { - "epoch": 6.676864244741874, - "grad_norm": 0.34375, - "learning_rate": 2.019110224946008e-05, - "loss": 0.3549, + "epoch": 5.6231884057971016, + "grad_norm": 0.326171875, + "learning_rate": 2.828798999454577e-05, + "loss": 0.1488, "step": 3492 }, { - "epoch": 6.684512428298279, - "grad_norm": 0.3515625, - "learning_rate": 1.9963660267413913e-05, - "loss": 0.3497, + "epoch": 5.62962962962963, + "grad_norm": 0.326171875, + "learning_rate": 2.8032477531974984e-05, + "loss": 0.2012, "step": 3496 }, { - "epoch": 6.692160611854685, - "grad_norm": 0.36328125, - "learning_rate": 1.9737415202111144e-05, - "loss": 0.3639, + "epoch": 5.636070853462158, + "grad_norm": 0.3203125, + "learning_rate": 2.7778005279176053e-05, + "loss": 0.208, "step": 3500 }, { - "epoch": 6.69980879541109, - "grad_norm": 0.353515625, - "learning_rate": 1.9512369136019663e-05, - "loss": 0.3487, + "epoch": 5.642512077294686, + "grad_norm": 0.328125, + "learning_rate": 2.7524575406435955e-05, + "loss": 0.192, "step": 3504 }, { - "epoch": 6.707456978967495, - "grad_norm": 0.333984375, - "learning_rate": 1.9288524140571286e-05, - "loss": 0.3196, + "epoch": 5.648953301127214, + "grad_norm": 0.302734375, + "learning_rate": 2.7272190075151655e-05, + "loss": 0.1582, "step": 3508 }, { - "epoch": 6.7151051625239, - "grad_norm": 0.361328125, - "learning_rate": 1.906588227614254e-05, - "loss": 0.3398, + "epoch": 5.655394524959743, + "grad_norm": 0.3203125, + "learning_rate": 2.7020851437811608e-05, + "loss": 0.1762, "step": 3512 }, { - "epoch": 6.722753346080306, - "grad_norm": 0.341796875, - "learning_rate": 1.8844445592035767e-05, - "loss": 0.37, + "epoch": 5.661835748792271, + "grad_norm": 0.30859375, + "learning_rate": 2.6770561637977556e-05, + "loss": 0.1678, "step": 3516 }, { - "epoch": 6.730401529636711, - "grad_norm": 0.37109375, - "learning_rate": 1.8624216126460183e-05, - "loss": 0.3466, + "epoch": 5.668276972624799, + "grad_norm": 0.294921875, + "learning_rate": 2.652132281026598e-05, + "loss": 0.1822, "step": 3520 }, { - "epoch": 6.738049713193116, - "grad_norm": 0.353515625, - "learning_rate": 1.8405195906513347e-05, - "loss": 0.3638, + "epoch": 5.674718196457327, + "grad_norm": 0.30859375, + "learning_rate": 2.6273137080330225e-05, + "loss": 0.183, "step": 3524 }, { - "epoch": 6.7456978967495225, - "grad_norm": 0.369140625, - "learning_rate": 1.8187386948162203e-05, - "loss": 0.3894, + "epoch": 5.681159420289855, + "grad_norm": 0.3125, + "learning_rate": 2.6026006564842106e-05, + "loss": 0.2009, "step": 3528 }, { - "epoch": 6.753346080305928, - "grad_norm": 0.341796875, - "learning_rate": 1.797079125622469e-05, - "loss": 0.3109, + "epoch": 5.687600644122384, + "grad_norm": 0.31640625, + "learning_rate": 2.577993337147406e-05, + "loss": 0.1858, "step": 3532 }, { - "epoch": 6.760994263862333, - "grad_norm": 0.345703125, - "learning_rate": 1.7755410824351363e-05, - "loss": 0.3191, + "epoch": 5.694041867954912, + "grad_norm": 0.337890625, + "learning_rate": 2.5534919598880887e-05, + "loss": 0.203, "step": 3536 }, { - "epoch": 6.768642447418738, - "grad_norm": 0.37890625, - "learning_rate": 1.7541247635006756e-05, - "loss": 0.3548, + "epoch": 5.70048309178744, + "grad_norm": 0.294921875, + "learning_rate": 2.5290967336682266e-05, + "loss": 0.1588, "step": 3540 }, { - "epoch": 6.776290630975144, - "grad_norm": 0.369140625, - "learning_rate": 1.7328303659451477e-05, - "loss": 0.3518, + "epoch": 5.706924315619968, + "grad_norm": 0.73828125, + "learning_rate": 2.5048078665444497e-05, + "loss": 0.1622, "step": 3544 }, { - "epoch": 6.783938814531549, - "grad_norm": 0.359375, - "learning_rate": 1.7116580857723872e-05, - "loss": 0.367, + "epoch": 5.713365539452496, + "grad_norm": 0.33203125, + "learning_rate": 2.4806255656663092e-05, + "loss": 0.185, "step": 3548 }, { - "epoch": 6.791586998087954, - "grad_norm": 0.3515625, - "learning_rate": 1.6906081178621917e-05, - "loss": 0.3502, + "epoch": 5.719806763285024, + "grad_norm": 0.30078125, + "learning_rate": 2.4565500372744845e-05, + "loss": 0.1904, "step": 3552 }, { - "epoch": 6.7992351816443595, - "grad_norm": 0.376953125, - "learning_rate": 1.6696806559685553e-05, - "loss": 0.4081, + "epoch": 5.726247987117552, + "grad_norm": 0.314453125, + "learning_rate": 2.4325814866990583e-05, + "loss": 0.175, "step": 3556 }, { - "epoch": 6.806883365200765, - "grad_norm": 0.38671875, - "learning_rate": 1.648875892717857e-05, - "loss": 0.3557, + "epoch": 5.73268921095008, + "grad_norm": 0.30859375, + "learning_rate": 2.4087201183577205e-05, + "loss": 0.1699, "step": 3560 }, { - "epoch": 6.81453154875717, - "grad_norm": 0.337890625, - "learning_rate": 1.628194019607099e-05, - "loss": 0.3765, + "epoch": 5.739130434782608, + "grad_norm": 0.32421875, + "learning_rate": 2.384966135754063e-05, + "loss": 0.1823, "step": 3564 }, { - "epoch": 6.822179732313575, - "grad_norm": 0.3515625, - "learning_rate": 1.6076352270021435e-05, - "loss": 0.3261, + "epoch": 5.745571658615137, + "grad_norm": 0.33203125, + "learning_rate": 2.3613197414758273e-05, + "loss": 0.1788, "step": 3568 }, { - "epoch": 6.829827915869981, - "grad_norm": 0.40625, - "learning_rate": 1.587199704135973e-05, - "loss": 0.3734, + "epoch": 5.752012882447665, + "grad_norm": 0.3125, + "learning_rate": 2.3377811371931793e-05, + "loss": 0.1794, "step": 3572 }, { - "epoch": 6.837476099426386, - "grad_norm": 0.38671875, - "learning_rate": 1.5668876391069107e-05, - "loss": 0.3523, + "epoch": 5.758454106280193, + "grad_norm": 0.3125, + "learning_rate": 2.3143505236569915e-05, + "loss": 0.1684, "step": 3576 }, { - "epoch": 6.845124282982791, - "grad_norm": 0.3671875, - "learning_rate": 1.5466992188769394e-05, - "loss": 0.3465, + "epoch": 5.764895330112721, + "grad_norm": 0.2890625, + "learning_rate": 2.2910281006971164e-05, + "loss": 0.157, "step": 3580 }, { - "epoch": 6.8527724665391965, - "grad_norm": 0.357421875, - "learning_rate": 1.5266346292699522e-05, - "loss": 0.3506, + "epoch": 5.7713365539452495, + "grad_norm": 0.3359375, + "learning_rate": 2.26781406722071e-05, + "loss": 0.1833, "step": 3584 }, { - "epoch": 6.860420650095603, - "grad_norm": 0.359375, - "learning_rate": 1.5066940549700285e-05, - "loss": 0.3112, + "epoch": 5.777777777777778, + "grad_norm": 0.330078125, + "learning_rate": 2.2447086212105143e-05, + "loss": 0.1945, "step": 3588 }, { - "epoch": 6.868068833652008, - "grad_norm": 0.40234375, - "learning_rate": 1.4868776795197712e-05, - "loss": 0.3271, + "epoch": 5.784219001610306, + "grad_norm": 0.310546875, + "learning_rate": 2.2217119597231747e-05, + "loss": 0.1801, "step": 3592 }, { - "epoch": 6.875717017208413, - "grad_norm": 0.35546875, - "learning_rate": 1.4671856853185876e-05, - "loss": 0.371, + "epoch": 5.790660225442834, + "grad_norm": 0.30859375, + "learning_rate": 2.1988242788875532e-05, + "loss": 0.1735, "step": 3596 }, { - "epoch": 6.8833652007648185, - "grad_norm": 0.384765625, - "learning_rate": 1.4476182536210207e-05, - "loss": 0.3448, + "epoch": 5.797101449275362, + "grad_norm": 0.302734375, + "learning_rate": 2.1760457739030695e-05, + "loss": 0.1755, "step": 3600 }, { - "epoch": 6.891013384321224, - "grad_norm": 0.353515625, - "learning_rate": 1.4281755645350873e-05, - "loss": 0.3605, + "epoch": 5.8035426731078905, + "grad_norm": 0.318359375, + "learning_rate": 2.1533766390380254e-05, + "loss": 0.1674, "step": 3604 }, { - "epoch": 6.898661567877629, - "grad_norm": 0.35546875, - "learning_rate": 1.4088577970206044e-05, - "loss": 0.3385, + "epoch": 5.809983896940419, + "grad_norm": 0.302734375, + "learning_rate": 2.1308170676279547e-05, + "loss": 0.138, "step": 3608 }, { - "epoch": 6.906309751434034, - "grad_norm": 0.35546875, - "learning_rate": 1.3896651288875572e-05, - "loss": 0.3285, + "epoch": 5.816425120772947, + "grad_norm": 0.3046875, + "learning_rate": 2.108367252073961e-05, + "loss": 0.1521, "step": 3612 }, { - "epoch": 6.91395793499044, - "grad_norm": 0.34765625, - "learning_rate": 1.3705977367944498e-05, - "loss": 0.3316, + "epoch": 5.822866344605475, + "grad_norm": 0.310546875, + "learning_rate": 2.0860273838410928e-05, + "loss": 0.1771, "step": 3616 }, { - "epoch": 6.921606118546845, - "grad_norm": 0.35546875, - "learning_rate": 1.3516557962466978e-05, - "loss": 0.3636, + "epoch": 5.829307568438003, + "grad_norm": 0.328125, + "learning_rate": 2.0637976534567046e-05, + "loss": 0.1628, "step": 3620 }, { - "epoch": 6.92925430210325, - "grad_norm": 0.380859375, - "learning_rate": 1.3328394815949884e-05, - "loss": 0.3649, + "epoch": 5.835748792270532, + "grad_norm": 0.298828125, + "learning_rate": 2.0416782505088347e-05, + "loss": 0.1631, "step": 3624 }, { - "epoch": 6.9369024856596555, - "grad_norm": 0.35546875, - "learning_rate": 1.3141489660336902e-05, - "loss": 0.3521, + "epoch": 5.84219001610306, + "grad_norm": 0.271484375, + "learning_rate": 2.0196693636445727e-05, + "loss": 0.1731, "step": 3628 }, { - "epoch": 6.944550669216062, - "grad_norm": 0.34765625, - "learning_rate": 1.295584421599265e-05, - "loss": 0.3177, + "epoch": 5.848631239935588, + "grad_norm": 0.310546875, + "learning_rate": 1.9977711805684706e-05, + "loss": 0.1748, "step": 3632 }, { - "epoch": 6.952198852772467, - "grad_norm": 0.341796875, - "learning_rate": 1.2771460191686656e-05, - "loss": 0.3148, + "epoch": 5.855072463768116, + "grad_norm": 0.34375, + "learning_rate": 1.975983888040945e-05, + "loss": 0.1872, "step": 3636 }, { - "epoch": 6.959847036328872, - "grad_norm": 0.3671875, - "learning_rate": 1.2588339284577815e-05, - "loss": 0.3324, + "epoch": 5.861513687600644, + "grad_norm": 0.328125, + "learning_rate": 1.9543076718766538e-05, + "loss": 0.1883, "step": 3640 }, { - "epoch": 6.9674952198852775, - "grad_norm": 0.37109375, - "learning_rate": 1.240648318019859e-05, - "loss": 0.3729, + "epoch": 5.867954911433173, + "grad_norm": 0.322265625, + "learning_rate": 1.932742716942946e-05, + "loss": 0.1543, "step": 3644 }, { - "epoch": 6.975143403441683, - "grad_norm": 0.3671875, - "learning_rate": 1.2225893552439742e-05, - "loss": 0.3616, + "epoch": 5.874396135265701, + "grad_norm": 0.314453125, + "learning_rate": 1.911289207158254e-05, + "loss": 0.1807, "step": 3648 }, { - "epoch": 6.982791586998088, - "grad_norm": 0.345703125, - "learning_rate": 1.2046572063534587e-05, - "loss": 0.3261, + "epoch": 5.880837359098229, + "grad_norm": 0.333984375, + "learning_rate": 1.8899473254905672e-05, + "loss": 0.1775, "step": 3652 }, { - "epoch": 6.990439770554493, - "grad_norm": 0.384765625, - "learning_rate": 1.1868520364044049e-05, - "loss": 0.3542, + "epoch": 5.887278582930757, + "grad_norm": 0.314453125, + "learning_rate": 1.8687172539558208e-05, + "loss": 0.1767, "step": 3656 }, { - "epoch": 6.998087954110899, - "grad_norm": 0.33203125, - "learning_rate": 1.1691740092841228e-05, - "loss": 0.3238, + "epoch": 5.8937198067632846, + "grad_norm": 0.306640625, + "learning_rate": 1.8475991736163835e-05, + "loss": 0.1662, "step": 3660 }, { - "epoch": 7.005736137667304, - "grad_norm": 0.365234375, - "learning_rate": 1.151623287709636e-05, - "loss": 0.4048, + "epoch": 5.900161030595813, + "grad_norm": 0.294921875, + "learning_rate": 1.8265932645794827e-05, + "loss": 0.1575, "step": 3664 }, { - "epoch": 7.013384321223709, - "grad_norm": 0.353515625, - "learning_rate": 1.1342000332261963e-05, - "loss": 0.3609, + "epoch": 5.906602254428341, + "grad_norm": 0.359375, + "learning_rate": 1.805699705995708e-05, + "loss": 0.1778, "step": 3668 }, { - "epoch": 7.0210325047801145, - "grad_norm": 0.326171875, - "learning_rate": 1.1169044062057797e-05, - "loss": 0.3145, + "epoch": 5.913043478260869, + "grad_norm": 0.322265625, + "learning_rate": 1.7849186760574346e-05, + "loss": 0.1661, "step": 3672 }, { - "epoch": 7.02868068833652, - "grad_norm": 0.35546875, - "learning_rate": 1.0997365658456164e-05, - "loss": 0.3618, + "epoch": 5.919484702093397, + "grad_norm": 0.30078125, + "learning_rate": 1.7642503519973432e-05, + "loss": 0.1603, "step": 3676 }, { - "epoch": 7.036328871892925, - "grad_norm": 0.341796875, - "learning_rate": 1.082696670166736e-05, - "loss": 0.3298, + "epoch": 5.925925925925926, + "grad_norm": 0.302734375, + "learning_rate": 1.7436949100868864e-05, + "loss": 0.1603, "step": 3680 }, { - "epoch": 7.04397705544933, - "grad_norm": 0.353515625, - "learning_rate": 1.0657848760124954e-05, - "loss": 0.3249, + "epoch": 5.932367149758454, + "grad_norm": 0.326171875, + "learning_rate": 1.7232525256348013e-05, + "loss": 0.1907, "step": 3684 }, { - "epoch": 7.051625239005737, - "grad_norm": 0.341796875, - "learning_rate": 1.0490013390471474e-05, - "loss": 0.294, + "epoch": 5.938808373590982, + "grad_norm": 0.333984375, + "learning_rate": 1.7029233729855883e-05, + "loss": 0.1848, "step": 3688 }, { - "epoch": 7.059273422562142, - "grad_norm": 0.341796875, - "learning_rate": 1.0323462137543998e-05, - "loss": 0.3264, + "epoch": 5.94524959742351, + "grad_norm": 0.30859375, + "learning_rate": 1.6827076255180593e-05, + "loss": 0.1719, "step": 3692 }, { - "epoch": 7.066921606118547, - "grad_norm": 0.3359375, - "learning_rate": 1.015819653436012e-05, - "loss": 0.3164, + "epoch": 5.951690821256038, + "grad_norm": 0.296875, + "learning_rate": 1.6626054556438322e-05, + "loss": 0.1819, "step": 3696 }, { - "epoch": 7.074569789674952, - "grad_norm": 0.34375, - "learning_rate": 9.994218102103468e-06, - "loss": 0.3317, + "epoch": 5.958132045088567, + "grad_norm": 0.31640625, + "learning_rate": 1.6426170348058703e-05, + "loss": 0.1669, "step": 3700 }, { - "epoch": 7.082217973231358, - "grad_norm": 0.37109375, - "learning_rate": 9.831528350110119e-06, - "loss": 0.3168, + "epoch": 5.964573268921095, + "grad_norm": 0.31640625, + "learning_rate": 1.6227425334770245e-05, + "loss": 0.169, "step": 3704 }, { - "epoch": 7.089866156787763, - "grad_norm": 0.39453125, - "learning_rate": 9.670128775854513e-06, - "loss": 0.3901, + "epoch": 5.971014492753623, + "grad_norm": 0.333984375, + "learning_rate": 1.6029821211585592e-05, + "loss": 0.1723, "step": 3708 }, { - "epoch": 7.097514340344168, - "grad_norm": 0.3359375, - "learning_rate": 9.51002086493553e-06, - "loss": 0.3297, + "epoch": 5.977455716586151, + "grad_norm": 0.337890625, + "learning_rate": 1.5833359663787392e-05, + "loss": 0.2008, "step": 3712 }, { - "epoch": 7.105162523900574, - "grad_norm": 0.353515625, - "learning_rate": 9.351206091063107e-06, - "loss": 0.3484, + "epoch": 5.9838969404186795, + "grad_norm": 0.275390625, + "learning_rate": 1.563804236691364e-05, + "loss": 0.1523, "step": 3716 }, { - "epoch": 7.112810707456979, - "grad_norm": 0.34375, - "learning_rate": 9.193685916044469e-06, - "loss": 0.3255, + "epoch": 5.990338164251208, + "grad_norm": 0.271484375, + "learning_rate": 1.5443870986743562e-05, + "loss": 0.1592, "step": 3720 }, { - "epoch": 7.120458891013384, - "grad_norm": 0.361328125, - "learning_rate": 9.03746178977074e-06, - "loss": 0.3837, + "epoch": 5.996779388083736, + "grad_norm": 0.337890625, + "learning_rate": 1.5250847179283243e-05, + "loss": 0.2154, "step": 3724 }, { - "epoch": 7.128107074569789, - "grad_norm": 0.3515625, - "learning_rate": 8.882535150203567e-06, - "loss": 0.3689, + "epoch": 6.003220611916264, + "grad_norm": 0.30859375, + "learning_rate": 1.505897259075171e-05, + "loss": 0.1917, "step": 3728 }, { - "epoch": 7.135755258126195, - "grad_norm": 0.330078125, - "learning_rate": 8.728907423361991e-06, - "loss": 0.3209, + "epoch": 6.009661835748792, + "grad_norm": 0.287109375, + "learning_rate": 1.4868248857566734e-05, + "loss": 0.1512, "step": 3732 }, { - "epoch": 7.1434034416826, - "grad_norm": 0.326171875, - "learning_rate": 8.576580023309126e-06, - "loss": 0.3297, + "epoch": 6.0161030595813205, + "grad_norm": 0.310546875, + "learning_rate": 1.4678677606330964e-05, + "loss": 0.1889, "step": 3736 }, { - "epoch": 7.151051625239006, - "grad_norm": 0.35546875, - "learning_rate": 8.425554352139313e-06, - "loss": 0.3432, + "epoch": 6.022544283413849, + "grad_norm": 0.2890625, + "learning_rate": 1.4490260453817898e-05, + "loss": 0.1694, "step": 3740 }, { - "epoch": 7.1586998087954115, - "grad_norm": 0.33203125, - "learning_rate": 8.275831799965194e-06, - "loss": 0.3514, + "epoch": 6.028985507246377, + "grad_norm": 0.283203125, + "learning_rate": 1.4302999006958342e-05, + "loss": 0.1365, "step": 3744 }, { - "epoch": 7.166347992351817, - "grad_norm": 0.349609375, - "learning_rate": 8.127413744904804e-06, - "loss": 0.3326, + "epoch": 6.035426731078905, + "grad_norm": 0.267578125, + "learning_rate": 1.411689486282654e-05, + "loss": 0.148, "step": 3748 }, { - "epoch": 7.173996175908222, - "grad_norm": 0.373046875, - "learning_rate": 7.980301553068985e-06, - "loss": 0.3558, + "epoch": 6.041867954911433, + "grad_norm": 0.30859375, + "learning_rate": 1.393194960862657e-05, + "loss": 0.1744, "step": 3752 }, { - "epoch": 7.181644359464627, - "grad_norm": 0.353515625, - "learning_rate": 7.83449657854886e-06, - "loss": 0.3504, + "epoch": 6.048309178743962, + "grad_norm": 0.28515625, + "learning_rate": 1.3748164821678759e-05, + "loss": 0.1642, "step": 3756 }, { - "epoch": 7.189292543021033, - "grad_norm": 0.34375, - "learning_rate": 7.690000163403177e-06, - "loss": 0.3136, + "epoch": 6.05475040257649, + "grad_norm": 0.310546875, + "learning_rate": 1.3565542069406433e-05, + "loss": 0.1826, "step": 3760 }, { - "epoch": 7.196940726577438, - "grad_norm": 0.33984375, - "learning_rate": 7.546813637646182e-06, - "loss": 0.3292, + "epoch": 6.061191626409018, + "grad_norm": 0.314453125, + "learning_rate": 1.3384082909322375e-05, + "loss": 0.1911, "step": 3764 }, { - "epoch": 7.204588910133843, - "grad_norm": 0.365234375, - "learning_rate": 7.404938319235171e-06, - "loss": 0.327, + "epoch": 6.067632850241546, + "grad_norm": 0.283203125, + "learning_rate": 1.320378888901546e-05, + "loss": 0.134, "step": 3768 }, { - "epoch": 7.2122370936902485, - "grad_norm": 0.36328125, - "learning_rate": 7.264375514058607e-06, - "loss": 0.3526, + "epoch": 6.074074074074074, + "grad_norm": 0.3125, + "learning_rate": 1.3024661546137694e-05, + "loss": 0.1778, "step": 3772 }, { - "epoch": 7.219885277246654, - "grad_norm": 0.333984375, - "learning_rate": 7.125126515923752e-06, - "loss": 0.3506, + "epoch": 6.080515297906603, + "grad_norm": 0.271484375, + "learning_rate": 1.2846702408390975e-05, + "loss": 0.1542, "step": 3776 }, { - "epoch": 7.227533460803059, - "grad_norm": 0.349609375, - "learning_rate": 6.987192606545156e-06, - "loss": 0.3355, + "epoch": 6.086956521739131, + "grad_norm": 0.275390625, + "learning_rate": 1.2669912993514036e-05, + "loss": 0.185, "step": 3780 }, { - "epoch": 7.235181644359464, - "grad_norm": 0.353515625, - "learning_rate": 6.850575055532553e-06, - "loss": 0.3484, + "epoch": 6.093397745571659, + "grad_norm": 0.326171875, + "learning_rate": 1.2494294809269512e-05, + "loss": 0.1937, "step": 3784 }, { - "epoch": 7.24282982791587, - "grad_norm": 0.36328125, - "learning_rate": 6.715275120379271e-06, - "loss": 0.366, + "epoch": 6.099838969404187, + "grad_norm": 0.296875, + "learning_rate": 1.2319849353431154e-05, + "loss": 0.1642, "step": 3788 }, { - "epoch": 7.250478011472275, - "grad_norm": 0.33984375, - "learning_rate": 6.581294046450753e-06, - "loss": 0.322, + "epoch": 6.106280193236715, + "grad_norm": 0.28515625, + "learning_rate": 1.2146578113771005e-05, + "loss": 0.156, "step": 3792 }, { - "epoch": 7.258126195028681, - "grad_norm": 0.322265625, - "learning_rate": 6.448633066972953e-06, - "loss": 0.2989, + "epoch": 6.112721417069243, + "grad_norm": 0.28515625, + "learning_rate": 1.1974482568046694e-05, + "loss": 0.172, "step": 3796 }, { - "epoch": 7.265774378585086, - "grad_norm": 0.353515625, - "learning_rate": 6.317293403021029e-06, - "loss": 0.3377, + "epoch": 6.119162640901771, + "grad_norm": 0.302734375, + "learning_rate": 1.1803564183988812e-05, + "loss": 0.1655, "step": 3800 }, { - "epoch": 7.273422562141492, - "grad_norm": 0.326171875, - "learning_rate": 6.187276263508167e-06, - "loss": 0.3001, + "epoch": 6.125603864734299, + "grad_norm": 0.294921875, + "learning_rate": 1.1633824419288474e-05, + "loss": 0.1741, "step": 3804 }, { - "epoch": 7.281070745697897, - "grad_norm": 0.337890625, - "learning_rate": 6.0585828451743925e-06, - "loss": 0.3335, + "epoch": 6.132045088566827, + "grad_norm": 0.3125, + "learning_rate": 1.146526472158487e-05, + "loss": 0.1805, "step": 3808 }, { - "epoch": 7.288718929254302, - "grad_norm": 0.34375, - "learning_rate": 5.93121433257554e-06, - "loss": 0.3458, + "epoch": 6.138486312399356, + "grad_norm": 0.298828125, + "learning_rate": 1.1297886528452882e-05, + "loss": 0.1617, "step": 3812 }, { - "epoch": 7.2963671128107075, - "grad_norm": 0.3671875, - "learning_rate": 5.805171898072369e-06, - "loss": 0.2941, + "epoch": 6.144927536231884, + "grad_norm": 0.291015625, + "learning_rate": 1.1131691267390757e-05, + "loss": 0.1863, "step": 3816 }, { - "epoch": 7.304015296367113, - "grad_norm": 0.359375, - "learning_rate": 5.680456701819885e-06, - "loss": 0.3888, + "epoch": 6.151368760064412, + "grad_norm": 0.33984375, + "learning_rate": 1.0966680355808122e-05, + "loss": 0.2013, "step": 3820 }, { - "epoch": 7.311663479923518, - "grad_norm": 0.36328125, - "learning_rate": 5.5570698917563994e-06, - "loss": 0.3663, + "epoch": 6.15780998389694, + "grad_norm": 0.2734375, + "learning_rate": 1.080285520101371e-05, + "loss": 0.1683, "step": 3824 }, { - "epoch": 7.319311663479923, - "grad_norm": 0.3515625, - "learning_rate": 5.435012603593219e-06, - "loss": 0.292, + "epoch": 6.164251207729468, + "grad_norm": 0.287109375, + "learning_rate": 1.0640217200203466e-05, + "loss": 0.1729, "step": 3828 }, { - "epoch": 7.326959847036329, - "grad_norm": 0.373046875, - "learning_rate": 5.3142859608041265e-06, - "loss": 0.3628, + "epoch": 6.170692431561997, + "grad_norm": 0.30859375, + "learning_rate": 1.047876774044863e-05, + "loss": 0.1736, "step": 3832 }, { - "epoch": 7.334608030592734, - "grad_norm": 0.345703125, - "learning_rate": 5.194891074614899e-06, - "loss": 0.3377, + "epoch": 6.177133655394525, + "grad_norm": 0.302734375, + "learning_rate": 1.0318508198683734e-05, + "loss": 0.1757, "step": 3836 }, { - "epoch": 7.342256214149139, - "grad_norm": 0.353515625, - "learning_rate": 5.076829043993253e-06, - "loss": 0.3322, + "epoch": 6.183574879227053, + "grad_norm": 0.275390625, + "learning_rate": 1.015943994169523e-05, + "loss": 0.1824, "step": 3840 }, { - "epoch": 7.349904397705545, - "grad_norm": 0.37890625, - "learning_rate": 4.960100955638685e-06, - "loss": 0.3607, + "epoch": 6.190016103059581, + "grad_norm": 0.28125, + "learning_rate": 1.0001564326109363e-05, + "loss": 0.1536, "step": 3844 }, { - "epoch": 7.357552581261951, - "grad_norm": 0.345703125, - "learning_rate": 4.844707883972398e-06, - "loss": 0.3303, + "epoch": 6.1964573268921095, + "grad_norm": 0.314453125, + "learning_rate": 9.844882698381013e-06, + "loss": 0.2085, "step": 3848 }, { - "epoch": 7.365200764818356, - "grad_norm": 0.337890625, - "learning_rate": 4.730650891127457e-06, - "loss": 0.3327, + "epoch": 6.202898550724638, + "grad_norm": 0.2890625, + "learning_rate": 9.689396394781923e-06, + "loss": 0.1665, "step": 3852 }, { - "epoch": 7.372848948374761, - "grad_norm": 0.357421875, - "learning_rate": 4.617931026939115e-06, - "loss": 0.3766, + "epoch": 6.209339774557166, + "grad_norm": 0.322265625, + "learning_rate": 9.535106741389542e-06, + "loss": 0.1714, "step": 3856 }, { - "epoch": 7.3804971319311665, - "grad_norm": 0.33203125, - "learning_rate": 4.506549328934916e-06, - "loss": 0.2884, + "epoch": 6.215780998389694, + "grad_norm": 0.3046875, + "learning_rate": 9.382015054075465e-06, + "loss": 0.1639, "step": 3860 }, { - "epoch": 7.388145315487572, - "grad_norm": 0.353515625, - "learning_rate": 4.39650682232538e-06, - "loss": 0.3036, + "epoch": 6.222222222222222, + "grad_norm": 0.265625, + "learning_rate": 9.230122638494408e-06, + "loss": 0.1391, "step": 3864 }, { - "epoch": 7.395793499043977, - "grad_norm": 0.37890625, - "learning_rate": 4.287804519994431e-06, - "loss": 0.3398, + "epoch": 6.2286634460547505, + "grad_norm": 0.30078125, + "learning_rate": 9.079430790072972e-06, + "loss": 0.1514, "step": 3868 }, { - "epoch": 7.403441682600382, - "grad_norm": 0.353515625, - "learning_rate": 4.180443422490115e-06, - "loss": 0.3276, + "epoch": 6.235104669887279, + "grad_norm": 0.26953125, + "learning_rate": 8.92994079399868e-06, + "loss": 0.1324, "step": 3872 }, { - "epoch": 7.411089866156788, - "grad_norm": 0.35546875, - "learning_rate": 4.074424518015384e-06, - "loss": 0.3319, + "epoch": 6.241545893719807, + "grad_norm": 0.306640625, + "learning_rate": 8.781653925208887e-06, + "loss": 0.1629, "step": 3876 }, { - "epoch": 7.418738049713193, - "grad_norm": 0.390625, - "learning_rate": 3.969748782418991e-06, - "loss": 0.3925, + "epoch": 6.247987117552335, + "grad_norm": 0.330078125, + "learning_rate": 8.634571448380056e-06, + "loss": 0.1899, "step": 3880 }, { - "epoch": 7.426386233269598, - "grad_norm": 0.365234375, - "learning_rate": 3.8664171791865765e-06, - "loss": 0.3556, + "epoch": 6.254428341384863, + "grad_norm": 0.3046875, + "learning_rate": 8.488694617916785e-06, + "loss": 0.1696, "step": 3884 }, { - "epoch": 7.4340344168260035, - "grad_norm": 0.384765625, - "learning_rate": 3.764430659431661e-06, - "loss": 0.324, + "epoch": 6.260869565217392, + "grad_norm": 0.296875, + "learning_rate": 8.344024677941346e-06, + "loss": 0.1652, "step": 3888 }, { - "epoch": 7.441682600382409, - "grad_norm": 0.37109375, - "learning_rate": 3.6637901618870203e-06, - "loss": 0.3443, + "epoch": 6.26731078904992, + "grad_norm": 0.28515625, + "learning_rate": 8.200562862282912e-06, + "loss": 0.1642, "step": 3892 }, { - "epoch": 7.449330783938814, - "grad_norm": 0.3828125, - "learning_rate": 3.564496612896006e-06, - "loss": 0.3409, + "epoch": 6.273752012882448, + "grad_norm": 0.30078125, + "learning_rate": 8.058310394466994e-06, + "loss": 0.1458, "step": 3896 }, { - "epoch": 7.45697896749522, - "grad_norm": 0.322265625, - "learning_rate": 3.4665509264039717e-06, - "loss": 0.3106, + "epoch": 6.280193236714976, + "grad_norm": 0.3203125, + "learning_rate": 7.917268487705175e-06, + "loss": 0.1519, "step": 3900 }, { - "epoch": 7.4646271510516256, - "grad_norm": 0.35546875, - "learning_rate": 3.3699540039499263e-06, - "loss": 0.3442, + "epoch": 6.286634460547504, + "grad_norm": 0.287109375, + "learning_rate": 7.777438344884645e-06, + "loss": 0.1745, "step": 3904 }, { - "epoch": 7.472275334608031, - "grad_norm": 0.35546875, - "learning_rate": 3.274706734658228e-06, - "loss": 0.3649, + "epoch": 6.293075684380033, + "grad_norm": 0.265625, + "learning_rate": 7.638821158557962e-06, + "loss": 0.1696, "step": 3908 }, { - "epoch": 7.479923518164436, - "grad_norm": 0.330078125, - "learning_rate": 3.1808099952303045e-06, - "loss": 0.3215, + "epoch": 6.29951690821256, + "grad_norm": 0.255859375, + "learning_rate": 7.501418110932872e-06, + "loss": 0.1634, "step": 3912 }, { - "epoch": 7.487571701720841, - "grad_norm": 0.37109375, - "learning_rate": 3.0882646499367614e-06, - "loss": 0.2973, + "epoch": 6.305958132045088, + "grad_norm": 0.279296875, + "learning_rate": 7.365230373862274e-06, + "loss": 0.1589, "step": 3916 }, { - "epoch": 7.495219885277247, - "grad_norm": 0.375, - "learning_rate": 2.9970715506092534e-06, - "loss": 0.3862, + "epoch": 6.312399355877616, + "grad_norm": 0.3046875, + "learning_rate": 7.2302591088341576e-06, + "loss": 0.1675, "step": 3920 }, { - "epoch": 7.502868068833652, - "grad_norm": 0.3515625, - "learning_rate": 2.9072315366327424e-06, - "loss": 0.3246, + "epoch": 6.318840579710145, + "grad_norm": 0.298828125, + "learning_rate": 7.096505466961794e-06, + "loss": 0.1718, "step": 3924 }, { - "epoch": 7.510516252390057, - "grad_norm": 0.36328125, - "learning_rate": 2.8187454349377193e-06, - "loss": 0.3823, + "epoch": 6.325281803542673, + "grad_norm": 0.271484375, + "learning_rate": 6.963970588973761e-06, + "loss": 0.1412, "step": 3928 }, { - "epoch": 7.5181644359464626, - "grad_norm": 0.369140625, - "learning_rate": 2.731614059992676e-06, - "loss": 0.3697, + "epoch": 6.331723027375201, + "grad_norm": 0.302734375, + "learning_rate": 6.832655605204401e-06, + "loss": 0.165, "step": 3932 }, { - "epoch": 7.525812619502868, - "grad_norm": 0.365234375, - "learning_rate": 2.6458382137964805e-06, - "loss": 0.3312, + "epoch": 6.338164251207729, + "grad_norm": 0.314453125, + "learning_rate": 6.702561635584047e-06, + "loss": 0.1496, "step": 3936 }, { - "epoch": 7.533460803059273, - "grad_norm": 0.353515625, - "learning_rate": 2.5614186858711137e-06, - "loss": 0.3188, + "epoch": 6.344605475040257, + "grad_norm": 0.30859375, + "learning_rate": 6.57368978962956e-06, + "loss": 0.1829, "step": 3940 }, { - "epoch": 7.541108986615678, - "grad_norm": 0.345703125, - "learning_rate": 2.47835625325436e-06, - "loss": 0.322, + "epoch": 6.351046698872786, + "grad_norm": 0.306640625, + "learning_rate": 6.44604116643474e-06, + "loss": 0.182, "step": 3944 }, { - "epoch": 7.548757170172085, - "grad_norm": 0.369140625, - "learning_rate": 2.3966516804925784e-06, - "loss": 0.3401, + "epoch": 6.357487922705314, + "grad_norm": 0.279296875, + "learning_rate": 6.3196168546610634e-06, + "loss": 0.1714, "step": 3948 }, { - "epoch": 7.55640535372849, - "grad_norm": 0.353515625, - "learning_rate": 2.3163057196338096e-06, - "loss": 0.3388, + "epoch": 6.363929146537842, + "grad_norm": 0.35546875, + "learning_rate": 6.194417932528478e-06, + "loss": 0.2102, "step": 3952 }, { - "epoch": 7.564053537284895, - "grad_norm": 0.34375, - "learning_rate": 2.2373191102207646e-06, - "loss": 0.3375, + "epoch": 6.37037037037037, + "grad_norm": 0.306640625, + "learning_rate": 6.070445467805923e-06, + "loss": 0.1761, "step": 3956 }, { - "epoch": 7.5717017208413, - "grad_norm": 0.322265625, - "learning_rate": 2.1596925792839946e-06, - "loss": 0.3125, + "epoch": 6.3768115942028984, + "grad_norm": 0.30859375, + "learning_rate": 5.947700517802523e-06, + "loss": 0.1816, "step": 3960 }, { - "epoch": 7.579349904397706, - "grad_norm": 0.37890625, - "learning_rate": 2.083426841335284e-06, - "loss": 0.3436, + "epoch": 6.383252818035427, + "grad_norm": 0.287109375, + "learning_rate": 5.826184129358358e-06, + "loss": 0.2008, "step": 3964 }, { - "epoch": 7.586998087954111, - "grad_norm": 0.369140625, - "learning_rate": 2.0085225983610177e-06, - "loss": 0.3559, + "epoch": 6.389694041867955, + "grad_norm": 0.296875, + "learning_rate": 5.705897338835724e-06, + "loss": 0.1857, "step": 3968 }, { - "epoch": 7.594646271510516, - "grad_norm": 0.341796875, - "learning_rate": 1.9349805398156893e-06, - "loss": 0.3167, + "epoch": 6.396135265700483, + "grad_norm": 0.2890625, + "learning_rate": 5.58684117211009e-06, + "loss": 0.1681, "step": 3972 }, { - "epoch": 7.602294455066922, - "grad_norm": 0.35546875, - "learning_rate": 1.8628013426156386e-06, - "loss": 0.3654, + "epoch": 6.402576489533011, + "grad_norm": 0.314453125, + "learning_rate": 5.469016644561519e-06, + "loss": 0.1409, "step": 3976 }, { - "epoch": 7.609942638623327, - "grad_norm": 0.359375, - "learning_rate": 1.7919856711327563e-06, - "loss": 0.3534, + "epoch": 6.4090177133655395, + "grad_norm": 0.3046875, + "learning_rate": 5.352424761065926e-06, + "loss": 0.1647, "step": 3980 }, { - "epoch": 7.617590822179732, - "grad_norm": 0.36328125, - "learning_rate": 1.722534177188406e-06, - "loss": 0.3139, + "epoch": 6.415458937198068, + "grad_norm": 0.310546875, + "learning_rate": 5.2370665159865045e-06, + "loss": 0.1556, "step": 3984 }, { - "epoch": 7.625239005736137, - "grad_norm": 0.37109375, - "learning_rate": 1.6544475000473957e-06, - "loss": 0.3578, + "epoch": 6.421900161030596, + "grad_norm": 0.33203125, + "learning_rate": 5.1229428931652775e-06, + "loss": 0.1912, "step": 3988 }, { - "epoch": 7.632887189292543, - "grad_norm": 0.357421875, - "learning_rate": 1.5877262664120983e-06, - "loss": 0.3272, + "epoch": 6.428341384863124, + "grad_norm": 0.275390625, + "learning_rate": 5.010054865914676e-06, + "loss": 0.1521, "step": 3992 }, { - "epoch": 7.640535372848948, - "grad_norm": 0.373046875, - "learning_rate": 1.522371090416724e-06, - "loss": 0.3578, + "epoch": 6.434782608695652, + "grad_norm": 0.29296875, + "learning_rate": 4.898403397009293e-06, + "loss": 0.192, "step": 3996 }, { - "epoch": 7.648183556405353, - "grad_norm": 0.34375, - "learning_rate": 1.4583825736215749e-06, - "loss": 0.3131, + "epoch": 6.4412238325281805, + "grad_norm": 0.28515625, + "learning_rate": 4.787989438677625e-06, + "loss": 0.1464, "step": 4000 } ], "logging_steps": 4, - "max_steps": 4184, + "max_steps": 4347, "num_input_tokens_seen": 0, - "num_train_epochs": 8, + "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -7026,7 +7026,7 @@ "attributes": {} } }, - "total_flos": 2.6138957704404664e+18, + "total_flos": 2.6355937321911583e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null