diff --git "a/checkpoint-4000/trainer_state.json" "b/checkpoint-4000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4000/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.648183556405353, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0076481835564053535, + "grad_norm": 0.294921875, + "learning_rate": 2.8571428571428567e-05, + "loss": 1.3014, + "step": 4 + }, + { + "epoch": 0.015296367112810707, + "grad_norm": 0.26953125, + "learning_rate": 5.7142857142857135e-05, + "loss": 1.3157, + "step": 8 + }, + { + "epoch": 0.022944550669216062, + "grad_norm": 0.201171875, + "learning_rate": 8.57142857142857e-05, + "loss": 1.2369, + "step": 12 + }, + { + "epoch": 0.030592734225621414, + "grad_norm": 0.255859375, + "learning_rate": 0.00011428571428571427, + "loss": 1.207, + "step": 16 + }, + { + "epoch": 0.03824091778202677, + "grad_norm": 0.177734375, + "learning_rate": 0.00014285714285714284, + "loss": 1.1666, + "step": 20 + }, + { + "epoch": 0.045889101338432124, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001714285714285714, + "loss": 1.178, + "step": 24 + }, + { + "epoch": 0.05353728489483748, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019999999999999998, + "loss": 1.1533, + "step": 28 + }, + { + "epoch": 0.06118546845124283, + "grad_norm": 0.1787109375, + "learning_rate": 0.00022857142857142854, + "loss": 1.1508, + "step": 32 + }, + { + "epoch": 0.06883365200764818, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002571428571428571, + "loss": 1.1415, + "step": 36 + }, + { + "epoch": 0.07648183556405354, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002857142857142857, + "loss": 1.1255, + "step": 40 + }, + { + "epoch": 0.0841300191204589, + "grad_norm": 0.17578125, + "learning_rate": 0.0002999998274159216, + "loss": 1.0581, + "step": 44 + }, + { + "epoch": 0.09177820267686425, + "grad_norm": 0.197265625, + "learning_rate": 0.00029999844674567734, + "loss": 1.0987, + "step": 48 + }, + { + "epoch": 0.0994263862332696, + "grad_norm": 0.185546875, + "learning_rate": 0.0002999956854178972, + "loss": 1.089, + "step": 52 + }, + { + "epoch": 0.10707456978967496, + "grad_norm": 0.197265625, + "learning_rate": 0.00029999154345799773, + "loss": 1.0934, + "step": 56 + }, + { + "epoch": 0.1147227533460803, + "grad_norm": 0.18359375, + "learning_rate": 0.0002999860209041035, + "loss": 1.0712, + "step": 60 + }, + { + "epoch": 0.12237093690248566, + "grad_norm": 0.19140625, + "learning_rate": 0.00029997911780704675, + "loss": 1.0253, + "step": 64 + }, + { + "epoch": 0.13001912045889102, + "grad_norm": 0.1845703125, + "learning_rate": 0.00029997083423036696, + "loss": 1.0526, + "step": 68 + }, + { + "epoch": 0.13766730401529637, + "grad_norm": 0.2080078125, + "learning_rate": 0.00029996117025031, + "loss": 1.0746, + "step": 72 + }, + { + "epoch": 0.14531548757170173, + "grad_norm": 0.193359375, + "learning_rate": 0.00029995012595582796, + "loss": 1.0502, + "step": 76 + }, + { + "epoch": 0.15296367112810708, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002999377014485777, + "loss": 1.0461, + "step": 80 + }, + { + "epoch": 0.16061185468451242, + "grad_norm": 0.1923828125, + "learning_rate": 0.00029992389684292025, + "loss": 1.0223, + "step": 84 + }, + { + "epoch": 0.1682600382409178, + "grad_norm": 0.2060546875, + "learning_rate": 0.00029990871226591995, + "loss": 1.0218, + "step": 88 + }, + { + "epoch": 0.17590822179732313, + "grad_norm": 0.2041015625, + "learning_rate": 0.00029989214785734286, + "loss": 1.0192, + "step": 92 + }, + { + "epoch": 0.1835564053537285, + "grad_norm": 0.2109375, + "learning_rate": 0.00029987420376965577, + "loss": 1.0243, + "step": 96 + }, + { + "epoch": 0.19120458891013384, + "grad_norm": 0.205078125, + "learning_rate": 0.00029985488016802457, + "loss": 1.0202, + "step": 100 + }, + { + "epoch": 0.1988527724665392, + "grad_norm": 0.203125, + "learning_rate": 0.00029983417723031307, + "loss": 0.9904, + "step": 104 + }, + { + "epoch": 0.20650095602294455, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002998120951470809, + "loss": 1.0228, + "step": 108 + }, + { + "epoch": 0.21414913957934992, + "grad_norm": 0.23828125, + "learning_rate": 0.00029978863412158217, + "loss": 1.004, + "step": 112 + }, + { + "epoch": 0.22179732313575526, + "grad_norm": 0.232421875, + "learning_rate": 0.0002997637943697635, + "loss": 0.9903, + "step": 116 + }, + { + "epoch": 0.2294455066921606, + "grad_norm": 0.2109375, + "learning_rate": 0.00029973757612026164, + "loss": 0.969, + "step": 120 + }, + { + "epoch": 0.23709369024856597, + "grad_norm": 0.232421875, + "learning_rate": 0.00029970997961440213, + "loss": 0.9869, + "step": 124 + }, + { + "epoch": 0.2447418738049713, + "grad_norm": 0.263671875, + "learning_rate": 0.0002996810051061963, + "loss": 0.9758, + "step": 128 + }, + { + "epoch": 0.25239005736137665, + "grad_norm": 0.2080078125, + "learning_rate": 0.00029965065286233943, + "loss": 0.9615, + "step": 132 + }, + { + "epoch": 0.26003824091778205, + "grad_norm": 0.224609375, + "learning_rate": 0.00029961892316220817, + "loss": 0.9541, + "step": 136 + }, + { + "epoch": 0.2676864244741874, + "grad_norm": 0.21484375, + "learning_rate": 0.0002995858162978577, + "loss": 0.9455, + "step": 140 + }, + { + "epoch": 0.27533460803059273, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002995513325740197, + "loss": 0.9566, + "step": 144 + }, + { + "epoch": 0.2829827915869981, + "grad_norm": 0.25390625, + "learning_rate": 0.00029951547230809865, + "loss": 0.9405, + "step": 148 + }, + { + "epoch": 0.29063097514340347, + "grad_norm": 0.2265625, + "learning_rate": 0.00029947823583016973, + "loss": 0.9119, + "step": 152 + }, + { + "epoch": 0.2982791586998088, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029943962348297535, + "loss": 0.9507, + "step": 156 + }, + { + "epoch": 0.30592734225621415, + "grad_norm": 0.21484375, + "learning_rate": 0.00029939963562192196, + "loss": 0.9507, + "step": 160 + }, + { + "epoch": 0.3135755258126195, + "grad_norm": 0.234375, + "learning_rate": 0.000299358272615077, + "loss": 0.9815, + "step": 164 + }, + { + "epoch": 0.32122370936902483, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002993155348431654, + "loss": 0.9364, + "step": 168 + }, + { + "epoch": 0.32887189292543023, + "grad_norm": 0.21875, + "learning_rate": 0.0002992714226995661, + "loss": 0.9177, + "step": 172 + }, + { + "epoch": 0.3365200764818356, + "grad_norm": 0.21875, + "learning_rate": 0.00029922593659030837, + "loss": 0.9224, + "step": 176 + }, + { + "epoch": 0.3441682600382409, + "grad_norm": 0.21875, + "learning_rate": 0.00029917907693406817, + "loss": 0.9359, + "step": 180 + }, + { + "epoch": 0.35181644359464626, + "grad_norm": 0.2265625, + "learning_rate": 0.00029913084416216415, + "loss": 0.9349, + "step": 184 + }, + { + "epoch": 0.35946462715105165, + "grad_norm": 0.228515625, + "learning_rate": 0.00029908123871855396, + "loss": 0.9033, + "step": 188 + }, + { + "epoch": 0.367112810707457, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002990302610598297, + "loss": 0.9007, + "step": 192 + }, + { + "epoch": 0.37476099426386233, + "grad_norm": 0.271484375, + "learning_rate": 0.00029897791165521434, + "loss": 0.9243, + "step": 196 + }, + { + "epoch": 0.3824091778202677, + "grad_norm": 0.244140625, + "learning_rate": 0.0002989241909865567, + "loss": 0.9095, + "step": 200 + }, + { + "epoch": 0.390057361376673, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002988690995483276, + "loss": 0.8825, + "step": 204 + }, + { + "epoch": 0.3977055449330784, + "grad_norm": 0.232421875, + "learning_rate": 0.00029881263784761503, + "loss": 0.8929, + "step": 208 + }, + { + "epoch": 0.40535372848948376, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029875480640411957, + "loss": 0.9097, + "step": 212 + }, + { + "epoch": 0.4130019120458891, + "grad_norm": 0.2392578125, + "learning_rate": 0.00029869560575014945, + "loss": 0.8563, + "step": 216 + }, + { + "epoch": 0.42065009560229444, + "grad_norm": 0.2041015625, + "learning_rate": 0.00029863503643061585, + "loss": 0.8839, + "step": 220 + }, + { + "epoch": 0.42829827915869984, + "grad_norm": 0.26171875, + "learning_rate": 0.0002985730990030278, + "loss": 0.8635, + "step": 224 + }, + { + "epoch": 0.4359464627151052, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029850979403748705, + "loss": 0.859, + "step": 228 + }, + { + "epoch": 0.4435946462715105, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029844512211668286, + "loss": 0.8256, + "step": 232 + }, + { + "epoch": 0.45124282982791586, + "grad_norm": 0.21484375, + "learning_rate": 0.00029837908383588646, + "loss": 0.8282, + "step": 236 + }, + { + "epoch": 0.4588910133843212, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002983116798029459, + "loss": 0.8579, + "step": 240 + }, + { + "epoch": 0.4665391969407266, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002982429106382801, + "loss": 0.8805, + "step": 244 + }, + { + "epoch": 0.47418738049713194, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029817277697487347, + "loss": 0.823, + "step": 248 + }, + { + "epoch": 0.4818355640535373, + "grad_norm": 0.244140625, + "learning_rate": 0.0002981012794582698, + "loss": 0.8546, + "step": 252 + }, + { + "epoch": 0.4894837476099426, + "grad_norm": 0.2421875, + "learning_rate": 0.0002980284187465665, + "loss": 0.8533, + "step": 256 + }, + { + "epoch": 0.497131931166348, + "grad_norm": 0.248046875, + "learning_rate": 0.00029795419551040833, + "loss": 0.8111, + "step": 260 + }, + { + "epoch": 0.5047801147227533, + "grad_norm": 0.251953125, + "learning_rate": 0.0002978786104329816, + "loss": 0.8823, + "step": 264 + }, + { + "epoch": 0.5124282982791587, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002978016642100076, + "loss": 0.8839, + "step": 268 + }, + { + "epoch": 0.5200764818355641, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029772335754973614, + "loss": 0.8512, + "step": 272 + }, + { + "epoch": 0.5277246653919694, + "grad_norm": 0.2392578125, + "learning_rate": 0.00029764369117293925, + "loss": 0.8557, + "step": 276 + }, + { + "epoch": 0.5353728489483748, + "grad_norm": 0.25390625, + "learning_rate": 0.0002975626658129044, + "loss": 0.8163, + "step": 280 + }, + { + "epoch": 0.5430210325047801, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002974802822154278, + "loss": 0.9078, + "step": 284 + }, + { + "epoch": 0.5506692160611855, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029739654113880755, + "loss": 0.8652, + "step": 288 + }, + { + "epoch": 0.5583173996175909, + "grad_norm": 0.2158203125, + "learning_rate": 0.00029731144335383663, + "loss": 0.8551, + "step": 292 + }, + { + "epoch": 0.5659655831739961, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002972249896437958, + "loss": 0.8536, + "step": 296 + }, + { + "epoch": 0.5736137667304015, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002971371808044464, + "loss": 0.825, + "step": 300 + }, + { + "epoch": 0.5812619502868069, + "grad_norm": 0.244140625, + "learning_rate": 0.000297048017644023, + "loss": 0.8082, + "step": 304 + }, + { + "epoch": 0.5889101338432122, + "grad_norm": 0.251953125, + "learning_rate": 0.0002969575009832261, + "loss": 0.8304, + "step": 308 + }, + { + "epoch": 0.5965583173996176, + "grad_norm": 0.265625, + "learning_rate": 0.00029686563165521435, + "loss": 0.8101, + "step": 312 + }, + { + "epoch": 0.6042065009560229, + "grad_norm": 0.234375, + "learning_rate": 0.00029677241050559707, + "loss": 0.8535, + "step": 316 + }, + { + "epoch": 0.6118546845124283, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029667783839242625, + "loss": 0.85, + "step": 320 + }, + { + "epoch": 0.6195028680688337, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002965819161861891, + "loss": 0.8101, + "step": 324 + }, + { + "epoch": 0.627151051625239, + "grad_norm": 0.25, + "learning_rate": 0.0002964846447697994, + "loss": 0.8521, + "step": 328 + }, + { + "epoch": 0.6347992351816444, + "grad_norm": 0.21484375, + "learning_rate": 0.00029638602503858995, + "loss": 0.8506, + "step": 332 + }, + { + "epoch": 0.6424474187380497, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029628605790030384, + "loss": 0.8044, + "step": 336 + }, + { + "epoch": 0.6500956022944551, + "grad_norm": 0.2265625, + "learning_rate": 0.0002961847442750866, + "loss": 0.8311, + "step": 340 + }, + { + "epoch": 0.6577437858508605, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029608208509547735, + "loss": 0.8705, + "step": 344 + }, + { + "epoch": 0.6653919694072657, + "grad_norm": 0.240234375, + "learning_rate": 0.00029597808130640027, + "loss": 0.8272, + "step": 348 + }, + { + "epoch": 0.6730401529636711, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002958727338651562, + "loss": 0.8035, + "step": 352 + }, + { + "epoch": 0.6806883365200764, + "grad_norm": 0.25390625, + "learning_rate": 0.00029576604374141315, + "loss": 0.8655, + "step": 356 + }, + { + "epoch": 0.6883365200764818, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029565801191719837, + "loss": 0.8585, + "step": 360 + }, + { + "epoch": 0.6959847036328872, + "grad_norm": 0.267578125, + "learning_rate": 0.0002955486393868884, + "loss": 0.8279, + "step": 364 + }, + { + "epoch": 0.7036328871892925, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002954379271572004, + "loss": 0.8229, + "step": 368 + }, + { + "epoch": 0.7112810707456979, + "grad_norm": 0.251953125, + "learning_rate": 0.0002953258762471828, + "loss": 0.8549, + "step": 372 + }, + { + "epoch": 0.7189292543021033, + "grad_norm": 0.2265625, + "learning_rate": 0.0002952124876882058, + "loss": 0.837, + "step": 376 + }, + { + "epoch": 0.7265774378585086, + "grad_norm": 0.251953125, + "learning_rate": 0.00029509776252395194, + "loss": 0.8084, + "step": 380 + }, + { + "epoch": 0.734225621414914, + "grad_norm": 0.26953125, + "learning_rate": 0.0002949817018104066, + "loss": 0.8454, + "step": 384 + }, + { + "epoch": 0.7418738049713193, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002948643066158482, + "loss": 0.7603, + "step": 388 + }, + { + "epoch": 0.7495219885277247, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029474557802083834, + "loss": 0.8814, + "step": 392 + }, + { + "epoch": 0.7571701720841301, + "grad_norm": 0.23046875, + "learning_rate": 0.0002946255171182119, + "loss": 0.8678, + "step": 396 + }, + { + "epoch": 0.7648183556405354, + "grad_norm": 0.236328125, + "learning_rate": 0.00029450412501306675, + "loss": 0.8397, + "step": 400 + }, + { + "epoch": 0.7724665391969407, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029438140282275413, + "loss": 0.797, + "step": 404 + }, + { + "epoch": 0.780114722753346, + "grad_norm": 0.24609375, + "learning_rate": 0.0002942573516768678, + "loss": 0.7764, + "step": 408 + }, + { + "epoch": 0.7877629063097514, + "grad_norm": 0.263671875, + "learning_rate": 0.00029413197271723385, + "loss": 0.7856, + "step": 412 + }, + { + "epoch": 0.7954110898661568, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002940052670979004, + "loss": 0.7878, + "step": 416 + }, + { + "epoch": 0.8030592734225621, + "grad_norm": 0.2421875, + "learning_rate": 0.0002938772359851265, + "loss": 0.7971, + "step": 420 + }, + { + "epoch": 0.8107074569789675, + "grad_norm": 0.240234375, + "learning_rate": 0.00029374788055737194, + "loss": 0.825, + "step": 424 + }, + { + "epoch": 0.8183556405353728, + "grad_norm": 0.2373046875, + "learning_rate": 0.000293617202005286, + "loss": 0.8105, + "step": 428 + }, + { + "epoch": 0.8260038240917782, + "grad_norm": 0.26171875, + "learning_rate": 0.00029348520153169656, + "loss": 0.8055, + "step": 432 + }, + { + "epoch": 0.8336520076481836, + "grad_norm": 0.228515625, + "learning_rate": 0.0002933518803515993, + "loss": 0.7616, + "step": 436 + }, + { + "epoch": 0.8413001912045889, + "grad_norm": 0.265625, + "learning_rate": 0.00029321723969214625, + "loss": 0.7842, + "step": 440 + }, + { + "epoch": 0.8489483747609943, + "grad_norm": 0.216796875, + "learning_rate": 0.0002930812807926343, + "loss": 0.803, + "step": 444 + }, + { + "epoch": 0.8565965583173997, + "grad_norm": 0.21875, + "learning_rate": 0.0002929440049044945, + "loss": 0.8174, + "step": 448 + }, + { + "epoch": 0.864244741873805, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002928054132912797, + "loss": 0.836, + "step": 452 + }, + { + "epoch": 0.8718929254302104, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002926655072286536, + "loss": 0.7552, + "step": 456 + }, + { + "epoch": 0.8795411089866156, + "grad_norm": 0.248046875, + "learning_rate": 0.00029252428800437854, + "loss": 0.7755, + "step": 460 + }, + { + "epoch": 0.887189292543021, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029238175691830395, + "loss": 0.8101, + "step": 464 + }, + { + "epoch": 0.8948374760994264, + "grad_norm": 0.248046875, + "learning_rate": 0.00029223791528235407, + "loss": 0.7662, + "step": 468 + }, + { + "epoch": 0.9024856596558317, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029209276442051634, + "loss": 0.7772, + "step": 472 + }, + { + "epoch": 0.9101338432122371, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002919463056688287, + "loss": 0.8135, + "step": 476 + }, + { + "epoch": 0.9177820267686424, + "grad_norm": 0.236328125, + "learning_rate": 0.00029179854037536773, + "loss": 0.7895, + "step": 480 + }, + { + "epoch": 0.9254302103250478, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002916494699002358, + "loss": 0.7644, + "step": 484 + }, + { + "epoch": 0.9330783938814532, + "grad_norm": 0.2197265625, + "learning_rate": 0.00029149909561554914, + "loss": 0.7804, + "step": 488 + }, + { + "epoch": 0.9407265774378585, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002913474189054246, + "loss": 0.7779, + "step": 492 + }, + { + "epoch": 0.9483747609942639, + "grad_norm": 0.240234375, + "learning_rate": 0.0002911944411659672, + "loss": 0.781, + "step": 496 + }, + { + "epoch": 0.9560229445506692, + "grad_norm": 0.259765625, + "learning_rate": 0.0002910401638052574, + "loss": 0.8079, + "step": 500 + }, + { + "epoch": 0.9636711281070746, + "grad_norm": 0.259765625, + "learning_rate": 0.00029088458824333787, + "loss": 0.8332, + "step": 504 + }, + { + "epoch": 0.97131931166348, + "grad_norm": 0.2275390625, + "learning_rate": 0.00029072771591220057, + "loss": 0.7752, + "step": 508 + }, + { + "epoch": 0.9789674952198852, + "grad_norm": 0.26171875, + "learning_rate": 0.00029056954825577353, + "loss": 0.8038, + "step": 512 + }, + { + "epoch": 0.9866156787762906, + "grad_norm": 0.23046875, + "learning_rate": 0.0002904100867299077, + "loss": 0.7362, + "step": 516 + }, + { + "epoch": 0.994263862332696, + "grad_norm": 0.228515625, + "learning_rate": 0.0002902493328023633, + "loss": 0.8137, + "step": 520 + }, + { + "epoch": 1.0019120458891013, + "grad_norm": 0.21875, + "learning_rate": 0.0002900872879527964, + "loss": 0.7389, + "step": 524 + }, + { + "epoch": 1.0095602294455066, + "grad_norm": 0.25390625, + "learning_rate": 0.00028992395367274547, + "loss": 0.6994, + "step": 528 + }, + { + "epoch": 1.0172084130019121, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002897593314656174, + "loss": 0.7461, + "step": 532 + }, + { + "epoch": 1.0248565965583174, + "grad_norm": 0.263671875, + "learning_rate": 0.0002895934228466738, + "loss": 0.6856, + "step": 536 + }, + { + "epoch": 1.0325047801147227, + "grad_norm": 0.236328125, + "learning_rate": 0.0002894262293430171, + "loss": 0.7405, + "step": 540 + }, + { + "epoch": 1.0401529636711282, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002892577524935763, + "loss": 0.7906, + "step": 544 + }, + { + "epoch": 1.0478011472275335, + "grad_norm": 0.2578125, + "learning_rate": 0.00028908799384909313, + "loss": 0.7144, + "step": 548 + }, + { + "epoch": 1.0554493307839388, + "grad_norm": 0.232421875, + "learning_rate": 0.0002889169549721073, + "loss": 0.7233, + "step": 552 + }, + { + "epoch": 1.063097514340344, + "grad_norm": 0.28125, + "learning_rate": 0.00028874463743694265, + "loss": 0.7068, + "step": 556 + }, + { + "epoch": 1.0707456978967496, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002885710428296922, + "loss": 0.7105, + "step": 560 + }, + { + "epoch": 1.0783938814531548, + "grad_norm": 0.2421875, + "learning_rate": 0.00028839617274820404, + "loss": 0.6563, + "step": 564 + }, + { + "epoch": 1.0860420650095601, + "grad_norm": 0.244140625, + "learning_rate": 0.00028822002880206593, + "loss": 0.6956, + "step": 568 + }, + { + "epoch": 1.0936902485659656, + "grad_norm": 0.255859375, + "learning_rate": 0.00028804261261259115, + "loss": 0.6669, + "step": 572 + }, + { + "epoch": 1.101338432122371, + "grad_norm": 0.2578125, + "learning_rate": 0.00028786392581280334, + "loss": 0.6727, + "step": 576 + }, + { + "epoch": 1.1089866156787762, + "grad_norm": 0.26953125, + "learning_rate": 0.00028768397004742135, + "loss": 0.714, + "step": 580 + }, + { + "epoch": 1.1166347992351817, + "grad_norm": 0.255859375, + "learning_rate": 0.00028750274697284423, + "loss": 0.666, + "step": 584 + }, + { + "epoch": 1.124282982791587, + "grad_norm": 0.23828125, + "learning_rate": 0.00028732025825713587, + "loss": 0.6883, + "step": 588 + }, + { + "epoch": 1.1319311663479923, + "grad_norm": 0.26171875, + "learning_rate": 0.00028713650558000983, + "loss": 0.7002, + "step": 592 + }, + { + "epoch": 1.1395793499043978, + "grad_norm": 0.26171875, + "learning_rate": 0.0002869514906328138, + "loss": 0.7038, + "step": 596 + }, + { + "epoch": 1.147227533460803, + "grad_norm": 0.2578125, + "learning_rate": 0.00028676521511851395, + "loss": 0.7178, + "step": 600 + }, + { + "epoch": 1.1548757170172084, + "grad_norm": 0.267578125, + "learning_rate": 0.0002865776807516793, + "loss": 0.6752, + "step": 604 + }, + { + "epoch": 1.1625239005736137, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002863888892584659, + "loss": 0.7204, + "step": 608 + }, + { + "epoch": 1.1701720841300192, + "grad_norm": 0.298828125, + "learning_rate": 0.00028619884237660124, + "loss": 0.6967, + "step": 612 + }, + { + "epoch": 1.1778202676864244, + "grad_norm": 0.2421875, + "learning_rate": 0.0002860075418553676, + "loss": 0.705, + "step": 616 + }, + { + "epoch": 1.1854684512428297, + "grad_norm": 0.271484375, + "learning_rate": 0.00028581498945558676, + "loss": 0.7208, + "step": 620 + }, + { + "epoch": 1.1931166347992352, + "grad_norm": 0.26953125, + "learning_rate": 0.00028562118694960316, + "loss": 0.6872, + "step": 624 + }, + { + "epoch": 1.2007648183556405, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002854261361212679, + "loss": 0.7445, + "step": 628 + }, + { + "epoch": 1.2084130019120458, + "grad_norm": 0.24609375, + "learning_rate": 0.00028522983876592213, + "loss": 0.7267, + "step": 632 + }, + { + "epoch": 1.2160611854684513, + "grad_norm": 0.26171875, + "learning_rate": 0.0002850322966903808, + "loss": 0.6831, + "step": 636 + }, + { + "epoch": 1.2237093690248566, + "grad_norm": 0.26171875, + "learning_rate": 0.00028483351171291576, + "loss": 0.7185, + "step": 640 + }, + { + "epoch": 1.231357552581262, + "grad_norm": 0.26171875, + "learning_rate": 0.00028463348566323913, + "loss": 0.7019, + "step": 644 + }, + { + "epoch": 1.2390057361376674, + "grad_norm": 0.263671875, + "learning_rate": 0.00028443222038248645, + "loss": 0.7699, + "step": 648 + }, + { + "epoch": 1.2466539196940727, + "grad_norm": 0.24609375, + "learning_rate": 0.00028422971772319977, + "loss": 0.6939, + "step": 652 + }, + { + "epoch": 1.254302103250478, + "grad_norm": 0.28125, + "learning_rate": 0.00028402597954931046, + "loss": 0.6505, + "step": 656 + }, + { + "epoch": 1.2619502868068833, + "grad_norm": 0.251953125, + "learning_rate": 0.00028382100773612236, + "loss": 0.6632, + "step": 660 + }, + { + "epoch": 1.2695984703632888, + "grad_norm": 0.251953125, + "learning_rate": 0.0002836148041702941, + "loss": 0.7787, + "step": 664 + }, + { + "epoch": 1.277246653919694, + "grad_norm": 0.2412109375, + "learning_rate": 0.00028340737074982207, + "loss": 0.7293, + "step": 668 + }, + { + "epoch": 1.2848948374760996, + "grad_norm": 0.23828125, + "learning_rate": 0.0002831987093840229, + "loss": 0.7471, + "step": 672 + }, + { + "epoch": 1.2925430210325048, + "grad_norm": 0.240234375, + "learning_rate": 0.00028298882199351565, + "loss": 0.7445, + "step": 676 + }, + { + "epoch": 1.3001912045889101, + "grad_norm": 0.251953125, + "learning_rate": 0.00028277771051020433, + "loss": 0.6997, + "step": 680 + }, + { + "epoch": 1.3078393881453154, + "grad_norm": 0.2412109375, + "learning_rate": 0.00028256537687726017, + "loss": 0.7389, + "step": 684 + }, + { + "epoch": 1.3154875717017207, + "grad_norm": 0.26171875, + "learning_rate": 0.0002823518230491036, + "loss": 0.7171, + "step": 688 + }, + { + "epoch": 1.3231357552581262, + "grad_norm": 0.251953125, + "learning_rate": 0.00028213705099138636, + "loss": 0.7054, + "step": 692 + }, + { + "epoch": 1.3307839388145315, + "grad_norm": 0.267578125, + "learning_rate": 0.00028192106268097334, + "loss": 0.6543, + "step": 696 + }, + { + "epoch": 1.338432122370937, + "grad_norm": 0.2578125, + "learning_rate": 0.0002817038601059243, + "loss": 0.7012, + "step": 700 + }, + { + "epoch": 1.3460803059273423, + "grad_norm": 0.259765625, + "learning_rate": 0.0002814854452654758, + "loss": 0.7058, + "step": 704 + }, + { + "epoch": 1.3537284894837476, + "grad_norm": 0.259765625, + "learning_rate": 0.00028126582017002266, + "loss": 0.7797, + "step": 708 + }, + { + "epoch": 1.3613766730401529, + "grad_norm": 0.267578125, + "learning_rate": 0.0002810449868410994, + "loss": 0.6898, + "step": 712 + }, + { + "epoch": 1.3690248565965584, + "grad_norm": 0.240234375, + "learning_rate": 0.00028082294731136164, + "loss": 0.6872, + "step": 716 + }, + { + "epoch": 1.3766730401529637, + "grad_norm": 0.2734375, + "learning_rate": 0.00028059970362456776, + "loss": 0.706, + "step": 720 + }, + { + "epoch": 1.384321223709369, + "grad_norm": 0.236328125, + "learning_rate": 0.00028037525783555935, + "loss": 0.6971, + "step": 724 + }, + { + "epoch": 1.3919694072657744, + "grad_norm": 0.251953125, + "learning_rate": 0.00028014961201024304, + "loss": 0.7212, + "step": 728 + }, + { + "epoch": 1.3996175908221797, + "grad_norm": 0.25, + "learning_rate": 0.0002799227682255711, + "loss": 0.702, + "step": 732 + }, + { + "epoch": 1.407265774378585, + "grad_norm": 0.259765625, + "learning_rate": 0.00027969472856952224, + "loss": 0.6604, + "step": 736 + }, + { + "epoch": 1.4149139579349903, + "grad_norm": 0.265625, + "learning_rate": 0.00027946549514108277, + "loss": 0.7, + "step": 740 + }, + { + "epoch": 1.4225621414913958, + "grad_norm": 0.263671875, + "learning_rate": 0.00027923507005022687, + "loss": 0.7281, + "step": 744 + }, + { + "epoch": 1.430210325047801, + "grad_norm": 0.2392578125, + "learning_rate": 0.00027900345541789746, + "loss": 0.6261, + "step": 748 + }, + { + "epoch": 1.4378585086042066, + "grad_norm": 0.275390625, + "learning_rate": 0.0002787706533759865, + "loss": 0.7131, + "step": 752 + }, + { + "epoch": 1.445506692160612, + "grad_norm": 0.26171875, + "learning_rate": 0.00027853666606731547, + "loss": 0.664, + "step": 756 + }, + { + "epoch": 1.4531548757170172, + "grad_norm": 0.255859375, + "learning_rate": 0.0002783014956456157, + "loss": 0.6856, + "step": 760 + }, + { + "epoch": 1.4608030592734225, + "grad_norm": 0.240234375, + "learning_rate": 0.0002780651442755083, + "loss": 0.7422, + "step": 764 + }, + { + "epoch": 1.468451242829828, + "grad_norm": 0.24609375, + "learning_rate": 0.0002778276141324844, + "loss": 0.6738, + "step": 768 + }, + { + "epoch": 1.4760994263862333, + "grad_norm": 0.271484375, + "learning_rate": 0.0002775889074028853, + "loss": 0.7595, + "step": 772 + }, + { + "epoch": 1.4837476099426385, + "grad_norm": 0.255859375, + "learning_rate": 0.000277349026283882, + "loss": 0.7132, + "step": 776 + }, + { + "epoch": 1.491395793499044, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002771079729834552, + "loss": 0.7316, + "step": 780 + }, + { + "epoch": 1.4990439770554493, + "grad_norm": 0.2578125, + "learning_rate": 0.0002768657497203749, + "loss": 0.7314, + "step": 784 + }, + { + "epoch": 1.5066921606118546, + "grad_norm": 0.25390625, + "learning_rate": 0.00027662235872418005, + "loss": 0.7036, + "step": 788 + }, + { + "epoch": 1.51434034416826, + "grad_norm": 0.25390625, + "learning_rate": 0.00027637780223515793, + "loss": 0.7191, + "step": 792 + }, + { + "epoch": 1.5219885277246654, + "grad_norm": 0.267578125, + "learning_rate": 0.00027613208250432353, + "loss": 0.712, + "step": 796 + }, + { + "epoch": 1.5296367112810707, + "grad_norm": 0.248046875, + "learning_rate": 0.000275885201793399, + "loss": 0.6918, + "step": 800 + }, + { + "epoch": 1.5372848948374762, + "grad_norm": 0.25390625, + "learning_rate": 0.0002756371623747925, + "loss": 0.6822, + "step": 804 + }, + { + "epoch": 1.5449330783938815, + "grad_norm": 0.271484375, + "learning_rate": 0.0002753879665315778, + "loss": 0.707, + "step": 808 + }, + { + "epoch": 1.5525812619502868, + "grad_norm": 0.26171875, + "learning_rate": 0.0002751376165574726, + "loss": 0.6921, + "step": 812 + }, + { + "epoch": 1.560229445506692, + "grad_norm": 0.2734375, + "learning_rate": 0.0002748861147568181, + "loss": 0.7048, + "step": 816 + }, + { + "epoch": 1.5678776290630974, + "grad_norm": 0.2734375, + "learning_rate": 0.00027463346344455724, + "loss": 0.7171, + "step": 820 + }, + { + "epoch": 1.5755258126195029, + "grad_norm": 0.267578125, + "learning_rate": 0.0002743796649462137, + "loss": 0.6839, + "step": 824 + }, + { + "epoch": 1.5831739961759084, + "grad_norm": 0.25390625, + "learning_rate": 0.00027412472159787037, + "loss": 0.6722, + "step": 828 + }, + { + "epoch": 1.5908221797323137, + "grad_norm": 0.251953125, + "learning_rate": 0.00027386863574614803, + "loss": 0.6538, + "step": 832 + }, + { + "epoch": 1.598470363288719, + "grad_norm": 0.2734375, + "learning_rate": 0.0002736114097481833, + "loss": 0.7013, + "step": 836 + }, + { + "epoch": 1.6061185468451242, + "grad_norm": 0.259765625, + "learning_rate": 0.0002733530459716076, + "loss": 0.6967, + "step": 840 + }, + { + "epoch": 1.6137667304015295, + "grad_norm": 0.255859375, + "learning_rate": 0.00027309354679452483, + "loss": 0.7052, + "step": 844 + }, + { + "epoch": 1.621414913957935, + "grad_norm": 0.279296875, + "learning_rate": 0.0002728329146054897, + "loss": 0.6677, + "step": 848 + }, + { + "epoch": 1.6290630975143403, + "grad_norm": 0.267578125, + "learning_rate": 0.00027257115180348557, + "loss": 0.7128, + "step": 852 + }, + { + "epoch": 1.6367112810707458, + "grad_norm": 0.25, + "learning_rate": 0.0002723082607979028, + "loss": 0.7085, + "step": 856 + }, + { + "epoch": 1.644359464627151, + "grad_norm": 0.259765625, + "learning_rate": 0.00027204424400851596, + "loss": 0.6719, + "step": 860 + }, + { + "epoch": 1.6520076481835564, + "grad_norm": 0.275390625, + "learning_rate": 0.00027177910386546206, + "loss": 0.6873, + "step": 864 + }, + { + "epoch": 1.6596558317399617, + "grad_norm": 0.248046875, + "learning_rate": 0.00027151284280921794, + "loss": 0.6919, + "step": 868 + }, + { + "epoch": 1.667304015296367, + "grad_norm": 0.2734375, + "learning_rate": 0.0002712454632905779, + "loss": 0.6967, + "step": 872 + }, + { + "epoch": 1.6749521988527725, + "grad_norm": 0.271484375, + "learning_rate": 0.00027097696777063113, + "loss": 0.7067, + "step": 876 + }, + { + "epoch": 1.682600382409178, + "grad_norm": 0.2578125, + "learning_rate": 0.00027070735872073885, + "loss": 0.6303, + "step": 880 + }, + { + "epoch": 1.6902485659655833, + "grad_norm": 0.271484375, + "learning_rate": 0.0002704366386225119, + "loss": 0.6828, + "step": 884 + }, + { + "epoch": 1.6978967495219885, + "grad_norm": 0.271484375, + "learning_rate": 0.0002701648099677878, + "loss": 0.6876, + "step": 888 + }, + { + "epoch": 1.7055449330783938, + "grad_norm": 0.26171875, + "learning_rate": 0.0002698918752586075, + "loss": 0.7079, + "step": 892 + }, + { + "epoch": 1.7131931166347991, + "grad_norm": 0.2890625, + "learning_rate": 0.00026961783700719293, + "loss": 0.6956, + "step": 896 + }, + { + "epoch": 1.7208413001912046, + "grad_norm": 0.2578125, + "learning_rate": 0.0002693426977359233, + "loss": 0.7143, + "step": 900 + }, + { + "epoch": 1.72848948374761, + "grad_norm": 0.251953125, + "learning_rate": 0.0002690664599773122, + "loss": 0.6973, + "step": 904 + }, + { + "epoch": 1.7361376673040154, + "grad_norm": 0.27734375, + "learning_rate": 0.00026878912627398434, + "loss": 0.68, + "step": 908 + }, + { + "epoch": 1.7437858508604207, + "grad_norm": 0.248046875, + "learning_rate": 0.0002685106991786519, + "loss": 0.7011, + "step": 912 + }, + { + "epoch": 1.751434034416826, + "grad_norm": 0.2578125, + "learning_rate": 0.00026823118125409107, + "loss": 0.6862, + "step": 916 + }, + { + "epoch": 1.7590822179732313, + "grad_norm": 0.271484375, + "learning_rate": 0.0002679505750731189, + "loss": 0.6929, + "step": 920 + }, + { + "epoch": 1.7667304015296366, + "grad_norm": 0.26953125, + "learning_rate": 0.00026766888321856896, + "loss": 0.6927, + "step": 924 + }, + { + "epoch": 1.774378585086042, + "grad_norm": 0.259765625, + "learning_rate": 0.000267386108283268, + "loss": 0.6618, + "step": 928 + }, + { + "epoch": 1.7820267686424476, + "grad_norm": 0.251953125, + "learning_rate": 0.0002671022528700118, + "loss": 0.7173, + "step": 932 + }, + { + "epoch": 1.7896749521988529, + "grad_norm": 0.23828125, + "learning_rate": 0.00026681731959154174, + "loss": 0.7314, + "step": 936 + }, + { + "epoch": 1.7973231357552581, + "grad_norm": 0.255859375, + "learning_rate": 0.00026653131107052, + "loss": 0.7013, + "step": 940 + }, + { + "epoch": 1.8049713193116634, + "grad_norm": 0.267578125, + "learning_rate": 0.00026624422993950603, + "loss": 0.7591, + "step": 944 + }, + { + "epoch": 1.8126195028680687, + "grad_norm": 0.279296875, + "learning_rate": 0.0002659560788409321, + "loss": 0.6398, + "step": 948 + }, + { + "epoch": 1.8202676864244742, + "grad_norm": 0.2890625, + "learning_rate": 0.0002656668604270788, + "loss": 0.6778, + "step": 952 + }, + { + "epoch": 1.8279158699808795, + "grad_norm": 0.265625, + "learning_rate": 0.00026537657736005094, + "loss": 0.6543, + "step": 956 + }, + { + "epoch": 1.835564053537285, + "grad_norm": 0.251953125, + "learning_rate": 0.000265085232311753, + "loss": 0.7096, + "step": 960 + }, + { + "epoch": 1.8432122370936903, + "grad_norm": 0.2314453125, + "learning_rate": 0.00026479282796386416, + "loss": 0.6939, + "step": 964 + }, + { + "epoch": 1.8508604206500956, + "grad_norm": 0.26171875, + "learning_rate": 0.00026449936700781413, + "loss": 0.728, + "step": 968 + }, + { + "epoch": 1.8585086042065009, + "grad_norm": 0.26171875, + "learning_rate": 0.0002642048521447581, + "loss": 0.6862, + "step": 972 + }, + { + "epoch": 1.8661567877629062, + "grad_norm": 0.25390625, + "learning_rate": 0.00026390928608555195, + "loss": 0.6767, + "step": 976 + }, + { + "epoch": 1.8738049713193117, + "grad_norm": 0.314453125, + "learning_rate": 0.0002636126715507272, + "loss": 0.6229, + "step": 980 + }, + { + "epoch": 1.8814531548757172, + "grad_norm": 0.26171875, + "learning_rate": 0.000263315011270466, + "loss": 0.6941, + "step": 984 + }, + { + "epoch": 1.8891013384321225, + "grad_norm": 0.267578125, + "learning_rate": 0.00026301630798457613, + "loss": 0.6854, + "step": 988 + }, + { + "epoch": 1.8967495219885278, + "grad_norm": 0.26953125, + "learning_rate": 0.00026271656444246577, + "loss": 0.7136, + "step": 992 + }, + { + "epoch": 1.904397705544933, + "grad_norm": 0.265625, + "learning_rate": 0.000262415783403118, + "loss": 0.696, + "step": 996 + }, + { + "epoch": 1.9120458891013383, + "grad_norm": 0.25390625, + "learning_rate": 0.00026211396763506546, + "loss": 0.6688, + "step": 1000 + }, + { + "epoch": 1.9196940726577438, + "grad_norm": 0.25390625, + "learning_rate": 0.0002618111199163651, + "loss": 0.6953, + "step": 1004 + }, + { + "epoch": 1.9273422562141491, + "grad_norm": 0.25, + "learning_rate": 0.00026150724303457235, + "loss": 0.7481, + "step": 1008 + }, + { + "epoch": 1.9349904397705546, + "grad_norm": 0.279296875, + "learning_rate": 0.0002612023397867155, + "loss": 0.675, + "step": 1012 + }, + { + "epoch": 1.94263862332696, + "grad_norm": 0.2470703125, + "learning_rate": 0.00026089641297927, + "loss": 0.6684, + "step": 1016 + }, + { + "epoch": 1.9502868068833652, + "grad_norm": 0.302734375, + "learning_rate": 0.0002605894654281329, + "loss": 0.6547, + "step": 1020 + }, + { + "epoch": 1.9579349904397705, + "grad_norm": 0.267578125, + "learning_rate": 0.0002602814999585963, + "loss": 0.7232, + "step": 1024 + }, + { + "epoch": 1.9655831739961758, + "grad_norm": 0.25390625, + "learning_rate": 0.0002599725194053219, + "loss": 0.7069, + "step": 1028 + }, + { + "epoch": 1.9732313575525813, + "grad_norm": 0.26171875, + "learning_rate": 0.0002596625266123146, + "loss": 0.7303, + "step": 1032 + }, + { + "epoch": 1.9808795411089866, + "grad_norm": 0.263671875, + "learning_rate": 0.00025935152443289664, + "loss": 0.7096, + "step": 1036 + }, + { + "epoch": 1.988527724665392, + "grad_norm": 0.259765625, + "learning_rate": 0.00025903951572968094, + "loss": 0.7055, + "step": 1040 + }, + { + "epoch": 1.9961759082217974, + "grad_norm": 0.265625, + "learning_rate": 0.00025872650337454504, + "loss": 0.7108, + "step": 1044 + }, + { + "epoch": 2.0038240917782026, + "grad_norm": 0.244140625, + "learning_rate": 0.00025841249024860453, + "loss": 0.5808, + "step": 1048 + }, + { + "epoch": 2.011472275334608, + "grad_norm": 0.294921875, + "learning_rate": 0.00025809747924218667, + "loss": 0.58, + "step": 1052 + }, + { + "epoch": 2.019120458891013, + "grad_norm": 0.265625, + "learning_rate": 0.00025778147325480357, + "loss": 0.6208, + "step": 1056 + }, + { + "epoch": 2.026768642447419, + "grad_norm": 0.2578125, + "learning_rate": 0.0002574644751951256, + "loss": 0.5692, + "step": 1060 + }, + { + "epoch": 2.0344168260038242, + "grad_norm": 0.2734375, + "learning_rate": 0.00025714648798095483, + "loss": 0.5891, + "step": 1064 + }, + { + "epoch": 2.0420650095602295, + "grad_norm": 0.2412109375, + "learning_rate": 0.00025682751453919776, + "loss": 0.6214, + "step": 1068 + }, + { + "epoch": 2.049713193116635, + "grad_norm": 0.279296875, + "learning_rate": 0.0002565075578058388, + "loss": 0.5889, + "step": 1072 + }, + { + "epoch": 2.05736137667304, + "grad_norm": 0.275390625, + "learning_rate": 0.0002561866207259128, + "loss": 0.6099, + "step": 1076 + }, + { + "epoch": 2.0650095602294454, + "grad_norm": 0.271484375, + "learning_rate": 0.0002558647062534785, + "loss": 0.5722, + "step": 1080 + }, + { + "epoch": 2.0726577437858507, + "grad_norm": 0.279296875, + "learning_rate": 0.0002555418173515908, + "loss": 0.609, + "step": 1084 + }, + { + "epoch": 2.0803059273422564, + "grad_norm": 0.291015625, + "learning_rate": 0.0002552179569922737, + "loss": 0.6158, + "step": 1088 + }, + { + "epoch": 2.0879541108986617, + "grad_norm": 0.2734375, + "learning_rate": 0.00025489312815649314, + "loss": 0.588, + "step": 1092 + }, + { + "epoch": 2.095602294455067, + "grad_norm": 0.26953125, + "learning_rate": 0.00025456733383412926, + "loss": 0.6278, + "step": 1096 + }, + { + "epoch": 2.1032504780114722, + "grad_norm": 0.26953125, + "learning_rate": 0.000254240577023949, + "loss": 0.5915, + "step": 1100 + }, + { + "epoch": 2.1108986615678775, + "grad_norm": 0.26953125, + "learning_rate": 0.00025391286073357856, + "loss": 0.5764, + "step": 1104 + }, + { + "epoch": 2.118546845124283, + "grad_norm": 0.279296875, + "learning_rate": 0.0002535841879794755, + "loss": 0.6146, + "step": 1108 + }, + { + "epoch": 2.126195028680688, + "grad_norm": 0.287109375, + "learning_rate": 0.0002532545617869014, + "loss": 0.5794, + "step": 1112 + }, + { + "epoch": 2.133843212237094, + "grad_norm": 0.259765625, + "learning_rate": 0.0002529239851898935, + "loss": 0.6412, + "step": 1116 + }, + { + "epoch": 2.141491395793499, + "grad_norm": 0.287109375, + "learning_rate": 0.00025259246123123706, + "loss": 0.6288, + "step": 1120 + }, + { + "epoch": 2.1491395793499044, + "grad_norm": 0.2578125, + "learning_rate": 0.0002522599929624375, + "loss": 0.5644, + "step": 1124 + }, + { + "epoch": 2.1567877629063097, + "grad_norm": 0.27734375, + "learning_rate": 0.00025192658344369193, + "loss": 0.6219, + "step": 1128 + }, + { + "epoch": 2.164435946462715, + "grad_norm": 0.275390625, + "learning_rate": 0.00025159223574386114, + "loss": 0.6015, + "step": 1132 + }, + { + "epoch": 2.1720841300191203, + "grad_norm": 0.265625, + "learning_rate": 0.00025125695294044156, + "loss": 0.612, + "step": 1136 + }, + { + "epoch": 2.179732313575526, + "grad_norm": 0.263671875, + "learning_rate": 0.0002509207381195366, + "loss": 0.5878, + "step": 1140 + }, + { + "epoch": 2.1873804971319313, + "grad_norm": 0.26171875, + "learning_rate": 0.0002505835943758286, + "loss": 0.6134, + "step": 1144 + }, + { + "epoch": 2.1950286806883366, + "grad_norm": 0.28125, + "learning_rate": 0.00025024552481254993, + "loss": 0.5663, + "step": 1148 + }, + { + "epoch": 2.202676864244742, + "grad_norm": 0.271484375, + "learning_rate": 0.0002499065325414547, + "loss": 0.5685, + "step": 1152 + }, + { + "epoch": 2.210325047801147, + "grad_norm": 0.28515625, + "learning_rate": 0.00024956662068279027, + "loss": 0.5839, + "step": 1156 + }, + { + "epoch": 2.2179732313575524, + "grad_norm": 0.306640625, + "learning_rate": 0.00024922579236526807, + "loss": 0.611, + "step": 1160 + }, + { + "epoch": 2.2256214149139577, + "grad_norm": 0.279296875, + "learning_rate": 0.00024888405072603513, + "loss": 0.6218, + "step": 1164 + }, + { + "epoch": 2.2332695984703634, + "grad_norm": 0.28125, + "learning_rate": 0.0002485413989106452, + "loss": 0.5743, + "step": 1168 + }, + { + "epoch": 2.2409177820267687, + "grad_norm": 0.29296875, + "learning_rate": 0.00024819784007302966, + "loss": 0.5921, + "step": 1172 + }, + { + "epoch": 2.248565965583174, + "grad_norm": 0.265625, + "learning_rate": 0.00024785337737546863, + "loss": 0.5444, + "step": 1176 + }, + { + "epoch": 2.2562141491395793, + "grad_norm": 0.271484375, + "learning_rate": 0.0002475080139885617, + "loss": 0.5823, + "step": 1180 + }, + { + "epoch": 2.2638623326959846, + "grad_norm": 0.2734375, + "learning_rate": 0.00024716175309119875, + "loss": 0.5788, + "step": 1184 + }, + { + "epoch": 2.27151051625239, + "grad_norm": 0.26171875, + "learning_rate": 0.00024681459787053106, + "loss": 0.5666, + "step": 1188 + }, + { + "epoch": 2.2791586998087956, + "grad_norm": 0.298828125, + "learning_rate": 0.0002464665515219415, + "loss": 0.5988, + "step": 1192 + }, + { + "epoch": 2.286806883365201, + "grad_norm": 0.3125, + "learning_rate": 0.0002461176172490153, + "loss": 0.5851, + "step": 1196 + }, + { + "epoch": 2.294455066921606, + "grad_norm": 0.310546875, + "learning_rate": 0.0002457677982635107, + "loss": 0.6328, + "step": 1200 + }, + { + "epoch": 2.3021032504780115, + "grad_norm": 0.28515625, + "learning_rate": 0.000245417097785329, + "loss": 0.6264, + "step": 1204 + }, + { + "epoch": 2.3097514340344167, + "grad_norm": 0.26171875, + "learning_rate": 0.00024506551904248546, + "loss": 0.5853, + "step": 1208 + }, + { + "epoch": 2.317399617590822, + "grad_norm": 0.2890625, + "learning_rate": 0.00024471306527107915, + "loss": 0.571, + "step": 1212 + }, + { + "epoch": 2.3250478011472273, + "grad_norm": 0.2890625, + "learning_rate": 0.0002443597397152634, + "loss": 0.6076, + "step": 1216 + }, + { + "epoch": 2.332695984703633, + "grad_norm": 0.271484375, + "learning_rate": 0.00024400554562721585, + "loss": 0.6026, + "step": 1220 + }, + { + "epoch": 2.3403441682600383, + "grad_norm": 0.279296875, + "learning_rate": 0.00024365048626710843, + "loss": 0.6196, + "step": 1224 + }, + { + "epoch": 2.3479923518164436, + "grad_norm": 0.29296875, + "learning_rate": 0.00024329456490307757, + "loss": 0.5704, + "step": 1228 + }, + { + "epoch": 2.355640535372849, + "grad_norm": 0.291015625, + "learning_rate": 0.00024293778481119396, + "loss": 0.6217, + "step": 1232 + }, + { + "epoch": 2.363288718929254, + "grad_norm": 0.27734375, + "learning_rate": 0.0002425801492754324, + "loss": 0.6238, + "step": 1236 + }, + { + "epoch": 2.3709369024856595, + "grad_norm": 0.275390625, + "learning_rate": 0.00024222166158764161, + "loss": 0.5945, + "step": 1240 + }, + { + "epoch": 2.378585086042065, + "grad_norm": 0.294921875, + "learning_rate": 0.00024186232504751397, + "loss": 0.5605, + "step": 1244 + }, + { + "epoch": 2.3862332695984705, + "grad_norm": 0.287109375, + "learning_rate": 0.0002415021429625551, + "loss": 0.6451, + "step": 1248 + }, + { + "epoch": 2.3938814531548758, + "grad_norm": 0.279296875, + "learning_rate": 0.00024114111864805338, + "loss": 0.5581, + "step": 1252 + }, + { + "epoch": 2.401529636711281, + "grad_norm": 0.259765625, + "learning_rate": 0.00024077925542704949, + "loss": 0.6212, + "step": 1256 + }, + { + "epoch": 2.4091778202676863, + "grad_norm": 0.287109375, + "learning_rate": 0.0002404165566303057, + "loss": 0.6121, + "step": 1260 + }, + { + "epoch": 2.4168260038240916, + "grad_norm": 0.33203125, + "learning_rate": 0.00024005302559627561, + "loss": 0.578, + "step": 1264 + }, + { + "epoch": 2.424474187380497, + "grad_norm": 0.291015625, + "learning_rate": 0.00023968866567107282, + "loss": 0.6353, + "step": 1268 + }, + { + "epoch": 2.4321223709369026, + "grad_norm": 0.310546875, + "learning_rate": 0.00023932348020844064, + "loss": 0.5811, + "step": 1272 + }, + { + "epoch": 2.439770554493308, + "grad_norm": 0.302734375, + "learning_rate": 0.00023895747256972083, + "loss": 0.5834, + "step": 1276 + }, + { + "epoch": 2.447418738049713, + "grad_norm": 0.279296875, + "learning_rate": 0.00023859064612382315, + "loss": 0.6234, + "step": 1280 + }, + { + "epoch": 2.4550669216061185, + "grad_norm": 0.2890625, + "learning_rate": 0.0002382230042471938, + "loss": 0.6298, + "step": 1284 + }, + { + "epoch": 2.462715105162524, + "grad_norm": 0.326171875, + "learning_rate": 0.0002378545503237846, + "loss": 0.6834, + "step": 1288 + }, + { + "epoch": 2.470363288718929, + "grad_norm": 0.296875, + "learning_rate": 0.00023748528774502194, + "loss": 0.6176, + "step": 1292 + }, + { + "epoch": 2.478011472275335, + "grad_norm": 0.28125, + "learning_rate": 0.00023711521990977554, + "loss": 0.6141, + "step": 1296 + }, + { + "epoch": 2.48565965583174, + "grad_norm": 0.2734375, + "learning_rate": 0.00023674435022432683, + "loss": 0.5958, + "step": 1300 + }, + { + "epoch": 2.4933078393881454, + "grad_norm": 0.310546875, + "learning_rate": 0.0002363726821023381, + "loss": 0.6249, + "step": 1304 + }, + { + "epoch": 2.5009560229445507, + "grad_norm": 0.28515625, + "learning_rate": 0.00023600021896482063, + "loss": 0.5865, + "step": 1308 + }, + { + "epoch": 2.508604206500956, + "grad_norm": 0.267578125, + "learning_rate": 0.0002356269642401036, + "loss": 0.5982, + "step": 1312 + }, + { + "epoch": 2.5162523900573612, + "grad_norm": 0.287109375, + "learning_rate": 0.0002352529213638022, + "loss": 0.5722, + "step": 1316 + }, + { + "epoch": 2.5239005736137665, + "grad_norm": 0.296875, + "learning_rate": 0.0002348780937787862, + "loss": 0.6308, + "step": 1320 + }, + { + "epoch": 2.5315487571701722, + "grad_norm": 0.28515625, + "learning_rate": 0.00023450248493514817, + "loss": 0.6104, + "step": 1324 + }, + { + "epoch": 2.5391969407265775, + "grad_norm": 0.306640625, + "learning_rate": 0.00023412609829017182, + "loss": 0.6053, + "step": 1328 + }, + { + "epoch": 2.546845124282983, + "grad_norm": 0.294921875, + "learning_rate": 0.00023374893730830005, + "loss": 0.5447, + "step": 1332 + }, + { + "epoch": 2.554493307839388, + "grad_norm": 0.294921875, + "learning_rate": 0.00023337100546110313, + "loss": 0.5987, + "step": 1336 + }, + { + "epoch": 2.5621414913957934, + "grad_norm": 0.29296875, + "learning_rate": 0.0002329923062272468, + "loss": 0.6037, + "step": 1340 + }, + { + "epoch": 2.569789674952199, + "grad_norm": 0.298828125, + "learning_rate": 0.0002326128430924602, + "loss": 0.6055, + "step": 1344 + }, + { + "epoch": 2.5774378585086044, + "grad_norm": 0.291015625, + "learning_rate": 0.00023223261954950363, + "loss": 0.6163, + "step": 1348 + }, + { + "epoch": 2.5850860420650097, + "grad_norm": 0.298828125, + "learning_rate": 0.00023185163909813678, + "loss": 0.6272, + "step": 1352 + }, + { + "epoch": 2.592734225621415, + "grad_norm": 0.28125, + "learning_rate": 0.00023146990524508613, + "loss": 0.6303, + "step": 1356 + }, + { + "epoch": 2.6003824091778203, + "grad_norm": 0.279296875, + "learning_rate": 0.00023108742150401284, + "loss": 0.5862, + "step": 1360 + }, + { + "epoch": 2.6080305927342256, + "grad_norm": 0.291015625, + "learning_rate": 0.00023070419139548044, + "loss": 0.6083, + "step": 1364 + }, + { + "epoch": 2.615678776290631, + "grad_norm": 0.298828125, + "learning_rate": 0.00023032021844692242, + "loss": 0.6466, + "step": 1368 + }, + { + "epoch": 2.623326959847036, + "grad_norm": 0.302734375, + "learning_rate": 0.0002299355061926096, + "loss": 0.6236, + "step": 1372 + }, + { + "epoch": 2.6309751434034414, + "grad_norm": 0.28125, + "learning_rate": 0.00022955005817361783, + "loss": 0.6203, + "step": 1376 + }, + { + "epoch": 2.638623326959847, + "grad_norm": 0.283203125, + "learning_rate": 0.00022916387793779533, + "loss": 0.6101, + "step": 1380 + }, + { + "epoch": 2.6462715105162524, + "grad_norm": 0.2890625, + "learning_rate": 0.00022877696903972984, + "loss": 0.5361, + "step": 1384 + }, + { + "epoch": 2.6539196940726577, + "grad_norm": 0.2890625, + "learning_rate": 0.00022838933504071618, + "loss": 0.594, + "step": 1388 + }, + { + "epoch": 2.661567877629063, + "grad_norm": 0.28125, + "learning_rate": 0.0002280009795087233, + "loss": 0.6323, + "step": 1392 + }, + { + "epoch": 2.6692160611854687, + "grad_norm": 0.267578125, + "learning_rate": 0.00022761190601836142, + "loss": 0.5863, + "step": 1396 + }, + { + "epoch": 2.676864244741874, + "grad_norm": 0.30078125, + "learning_rate": 0.00022722211815084944, + "loss": 0.6109, + "step": 1400 + }, + { + "epoch": 2.6845124282982793, + "grad_norm": 0.3046875, + "learning_rate": 0.0002268316194939815, + "loss": 0.6062, + "step": 1404 + }, + { + "epoch": 2.6921606118546846, + "grad_norm": 0.3046875, + "learning_rate": 0.0002264404136420941, + "loss": 0.5996, + "step": 1408 + }, + { + "epoch": 2.69980879541109, + "grad_norm": 0.30859375, + "learning_rate": 0.0002260485041960334, + "loss": 0.6301, + "step": 1412 + }, + { + "epoch": 2.707456978967495, + "grad_norm": 0.2890625, + "learning_rate": 0.00022565589476312157, + "loss": 0.5807, + "step": 1416 + }, + { + "epoch": 2.7151051625239004, + "grad_norm": 0.28515625, + "learning_rate": 0.00022526258895712377, + "loss": 0.6542, + "step": 1420 + }, + { + "epoch": 2.7227533460803057, + "grad_norm": 0.283203125, + "learning_rate": 0.00022486859039821513, + "loss": 0.6001, + "step": 1424 + }, + { + "epoch": 2.730401529636711, + "grad_norm": 0.30078125, + "learning_rate": 0.00022447390271294697, + "loss": 0.5997, + "step": 1428 + }, + { + "epoch": 2.7380497131931167, + "grad_norm": 0.2890625, + "learning_rate": 0.00022407852953421382, + "loss": 0.6401, + "step": 1432 + }, + { + "epoch": 2.745697896749522, + "grad_norm": 0.283203125, + "learning_rate": 0.00022368247450121965, + "loss": 0.5744, + "step": 1436 + }, + { + "epoch": 2.7533460803059273, + "grad_norm": 0.28125, + "learning_rate": 0.00022328574125944476, + "loss": 0.5853, + "step": 1440 + }, + { + "epoch": 2.7609942638623326, + "grad_norm": 0.302734375, + "learning_rate": 0.00022288833346061182, + "loss": 0.5861, + "step": 1444 + }, + { + "epoch": 2.768642447418738, + "grad_norm": 0.28515625, + "learning_rate": 0.0002224902547626526, + "loss": 0.6194, + "step": 1448 + }, + { + "epoch": 2.7762906309751436, + "grad_norm": 0.298828125, + "learning_rate": 0.00022209150882967398, + "loss": 0.604, + "step": 1452 + }, + { + "epoch": 2.783938814531549, + "grad_norm": 0.29296875, + "learning_rate": 0.00022169209933192458, + "loss": 0.6312, + "step": 1456 + }, + { + "epoch": 2.791586998087954, + "grad_norm": 0.287109375, + "learning_rate": 0.0002212920299457606, + "loss": 0.6312, + "step": 1460 + }, + { + "epoch": 2.7992351816443595, + "grad_norm": 0.294921875, + "learning_rate": 0.0002208913043536123, + "loss": 0.6089, + "step": 1464 + }, + { + "epoch": 2.8068833652007648, + "grad_norm": 0.3125, + "learning_rate": 0.00022048992624394988, + "loss": 0.6349, + "step": 1468 + }, + { + "epoch": 2.81453154875717, + "grad_norm": 0.29296875, + "learning_rate": 0.00022008789931124976, + "loss": 0.591, + "step": 1472 + }, + { + "epoch": 2.8221797323135753, + "grad_norm": 0.28515625, + "learning_rate": 0.0002196852272559603, + "loss": 0.6085, + "step": 1476 + }, + { + "epoch": 2.8298279158699806, + "grad_norm": 0.28125, + "learning_rate": 0.00021928191378446795, + "loss": 0.6284, + "step": 1480 + }, + { + "epoch": 2.8374760994263863, + "grad_norm": 0.283203125, + "learning_rate": 0.00021887796260906304, + "loss": 0.5796, + "step": 1484 + }, + { + "epoch": 2.8451242829827916, + "grad_norm": 0.3046875, + "learning_rate": 0.00021847337744790562, + "loss": 0.5739, + "step": 1488 + }, + { + "epoch": 2.852772466539197, + "grad_norm": 0.294921875, + "learning_rate": 0.0002180681620249913, + "loss": 0.6, + "step": 1492 + }, + { + "epoch": 2.860420650095602, + "grad_norm": 0.30078125, + "learning_rate": 0.00021766232007011682, + "loss": 0.6113, + "step": 1496 + }, + { + "epoch": 2.8680688336520075, + "grad_norm": 0.29296875, + "learning_rate": 0.0002172558553188459, + "loss": 0.5949, + "step": 1500 + }, + { + "epoch": 2.875717017208413, + "grad_norm": 0.30078125, + "learning_rate": 0.00021684877151247485, + "loss": 0.5929, + "step": 1504 + }, + { + "epoch": 2.8833652007648185, + "grad_norm": 0.283203125, + "learning_rate": 0.00021644107239799786, + "loss": 0.6224, + "step": 1508 + }, + { + "epoch": 2.891013384321224, + "grad_norm": 0.306640625, + "learning_rate": 0.00021603276172807288, + "loss": 0.6439, + "step": 1512 + }, + { + "epoch": 2.898661567877629, + "grad_norm": 0.298828125, + "learning_rate": 0.00021562384326098688, + "loss": 0.5835, + "step": 1516 + }, + { + "epoch": 2.9063097514340344, + "grad_norm": 0.298828125, + "learning_rate": 0.0002152143207606211, + "loss": 0.6, + "step": 1520 + }, + { + "epoch": 2.9139579349904396, + "grad_norm": 0.310546875, + "learning_rate": 0.00021480419799641692, + "loss": 0.5959, + "step": 1524 + }, + { + "epoch": 2.921606118546845, + "grad_norm": 0.291015625, + "learning_rate": 0.0002143934787433406, + "loss": 0.6111, + "step": 1528 + }, + { + "epoch": 2.92925430210325, + "grad_norm": 0.27734375, + "learning_rate": 0.00021398216678184884, + "loss": 0.6072, + "step": 1532 + }, + { + "epoch": 2.936902485659656, + "grad_norm": 0.314453125, + "learning_rate": 0.00021357026589785392, + "loss": 0.5744, + "step": 1536 + }, + { + "epoch": 2.9445506692160612, + "grad_norm": 0.29296875, + "learning_rate": 0.00021315777988268876, + "loss": 0.603, + "step": 1540 + }, + { + "epoch": 2.9521988527724665, + "grad_norm": 0.287109375, + "learning_rate": 0.00021274471253307224, + "loss": 0.6364, + "step": 1544 + }, + { + "epoch": 2.959847036328872, + "grad_norm": 0.314453125, + "learning_rate": 0.00021233106765107407, + "loss": 0.5766, + "step": 1548 + }, + { + "epoch": 2.967495219885277, + "grad_norm": 0.287109375, + "learning_rate": 0.00021191684904407976, + "loss": 0.5715, + "step": 1552 + }, + { + "epoch": 2.975143403441683, + "grad_norm": 0.26953125, + "learning_rate": 0.0002115020605247558, + "loss": 0.6015, + "step": 1556 + }, + { + "epoch": 2.982791586998088, + "grad_norm": 0.29296875, + "learning_rate": 0.00021108670591101433, + "loss": 0.6017, + "step": 1560 + }, + { + "epoch": 2.9904397705544934, + "grad_norm": 0.283203125, + "learning_rate": 0.00021067078902597814, + "loss": 0.6201, + "step": 1564 + }, + { + "epoch": 2.9980879541108987, + "grad_norm": 0.26953125, + "learning_rate": 0.0002102543136979454, + "loss": 0.6178, + "step": 1568 + }, + { + "epoch": 3.005736137667304, + "grad_norm": 0.263671875, + "learning_rate": 0.00020983728376035448, + "loss": 0.464, + "step": 1572 + }, + { + "epoch": 3.0133843212237093, + "grad_norm": 0.296875, + "learning_rate": 0.00020941970305174862, + "loss": 0.4966, + "step": 1576 + }, + { + "epoch": 3.0210325047801145, + "grad_norm": 0.296875, + "learning_rate": 0.00020900157541574066, + "loss": 0.5019, + "step": 1580 + }, + { + "epoch": 3.0286806883365203, + "grad_norm": 0.283203125, + "learning_rate": 0.00020858290470097762, + "loss": 0.477, + "step": 1584 + }, + { + "epoch": 3.0363288718929256, + "grad_norm": 0.291015625, + "learning_rate": 0.00020816369476110512, + "loss": 0.5126, + "step": 1588 + }, + { + "epoch": 3.043977055449331, + "grad_norm": 0.27734375, + "learning_rate": 0.0002077439494547324, + "loss": 0.5319, + "step": 1592 + }, + { + "epoch": 3.051625239005736, + "grad_norm": 0.302734375, + "learning_rate": 0.00020732367264539612, + "loss": 0.4976, + "step": 1596 + }, + { + "epoch": 3.0592734225621414, + "grad_norm": 0.298828125, + "learning_rate": 0.00020690286820152534, + "loss": 0.4638, + "step": 1600 + }, + { + "epoch": 3.0669216061185467, + "grad_norm": 0.298828125, + "learning_rate": 0.0002064815399964057, + "loss": 0.5326, + "step": 1604 + }, + { + "epoch": 3.0745697896749524, + "grad_norm": 0.30859375, + "learning_rate": 0.00020605969190814374, + "loss": 0.5066, + "step": 1608 + }, + { + "epoch": 3.0822179732313577, + "grad_norm": 0.279296875, + "learning_rate": 0.0002056373278196313, + "loss": 0.4611, + "step": 1612 + }, + { + "epoch": 3.089866156787763, + "grad_norm": 0.279296875, + "learning_rate": 0.0002052144516185097, + "loss": 0.511, + "step": 1616 + }, + { + "epoch": 3.0975143403441683, + "grad_norm": 0.287109375, + "learning_rate": 0.00020479106719713402, + "loss": 0.5338, + "step": 1620 + }, + { + "epoch": 3.1051625239005736, + "grad_norm": 0.314453125, + "learning_rate": 0.00020436717845253723, + "loss": 0.5384, + "step": 1624 + }, + { + "epoch": 3.112810707456979, + "grad_norm": 0.30078125, + "learning_rate": 0.0002039427892863943, + "loss": 0.5188, + "step": 1628 + }, + { + "epoch": 3.120458891013384, + "grad_norm": 0.318359375, + "learning_rate": 0.00020351790360498636, + "loss": 0.5045, + "step": 1632 + }, + { + "epoch": 3.12810707456979, + "grad_norm": 0.3125, + "learning_rate": 0.00020309252531916475, + "loss": 0.4894, + "step": 1636 + }, + { + "epoch": 3.135755258126195, + "grad_norm": 0.310546875, + "learning_rate": 0.00020266665834431486, + "loss": 0.5241, + "step": 1640 + }, + { + "epoch": 3.1434034416826004, + "grad_norm": 0.33203125, + "learning_rate": 0.00020224030660032023, + "loss": 0.4838, + "step": 1644 + }, + { + "epoch": 3.1510516252390057, + "grad_norm": 0.30078125, + "learning_rate": 0.00020181347401152652, + "loss": 0.5449, + "step": 1648 + }, + { + "epoch": 3.158699808795411, + "grad_norm": 0.30859375, + "learning_rate": 0.0002013861645067054, + "loss": 0.5428, + "step": 1652 + }, + { + "epoch": 3.1663479923518163, + "grad_norm": 0.3125, + "learning_rate": 0.00020095838201901798, + "loss": 0.5506, + "step": 1656 + }, + { + "epoch": 3.173996175908222, + "grad_norm": 0.32421875, + "learning_rate": 0.00020053013048597926, + "loss": 0.4983, + "step": 1660 + }, + { + "epoch": 3.1816443594646273, + "grad_norm": 0.3046875, + "learning_rate": 0.00020010141384942148, + "loss": 0.4941, + "step": 1664 + }, + { + "epoch": 3.1892925430210326, + "grad_norm": 0.31640625, + "learning_rate": 0.0001996722360554577, + "loss": 0.5406, + "step": 1668 + }, + { + "epoch": 3.196940726577438, + "grad_norm": 0.31640625, + "learning_rate": 0.00019924260105444602, + "loss": 0.484, + "step": 1672 + }, + { + "epoch": 3.204588910133843, + "grad_norm": 0.310546875, + "learning_rate": 0.00019881251280095261, + "loss": 0.5374, + "step": 1676 + }, + { + "epoch": 3.2122370936902485, + "grad_norm": 0.3046875, + "learning_rate": 0.00019838197525371583, + "loss": 0.4977, + "step": 1680 + }, + { + "epoch": 3.2198852772466537, + "grad_norm": 0.33203125, + "learning_rate": 0.0001979509923756094, + "loss": 0.5462, + "step": 1684 + }, + { + "epoch": 3.2275334608030595, + "grad_norm": 0.296875, + "learning_rate": 0.0001975195681336061, + "loss": 0.4945, + "step": 1688 + }, + { + "epoch": 3.2351816443594648, + "grad_norm": 0.29296875, + "learning_rate": 0.00019708770649874132, + "loss": 0.5042, + "step": 1692 + }, + { + "epoch": 3.24282982791587, + "grad_norm": 0.328125, + "learning_rate": 0.00019665541144607627, + "loss": 0.5369, + "step": 1696 + }, + { + "epoch": 3.2504780114722753, + "grad_norm": 0.3125, + "learning_rate": 0.00019622268695466166, + "loss": 0.5121, + "step": 1700 + }, + { + "epoch": 3.2581261950286806, + "grad_norm": 0.294921875, + "learning_rate": 0.000195789537007501, + "loss": 0.4984, + "step": 1704 + }, + { + "epoch": 3.265774378585086, + "grad_norm": 0.306640625, + "learning_rate": 0.00019535596559151376, + "loss": 0.5391, + "step": 1708 + }, + { + "epoch": 3.2734225621414916, + "grad_norm": 0.3125, + "learning_rate": 0.00019492197669749892, + "loss": 0.4778, + "step": 1712 + }, + { + "epoch": 3.281070745697897, + "grad_norm": 0.306640625, + "learning_rate": 0.00019448757432009807, + "loss": 0.5131, + "step": 1716 + }, + { + "epoch": 3.288718929254302, + "grad_norm": 0.31640625, + "learning_rate": 0.00019405276245775877, + "loss": 0.4901, + "step": 1720 + }, + { + "epoch": 3.2963671128107075, + "grad_norm": 0.326171875, + "learning_rate": 0.00019361754511269753, + "loss": 0.5426, + "step": 1724 + }, + { + "epoch": 3.3040152963671128, + "grad_norm": 0.337890625, + "learning_rate": 0.00019318192629086327, + "loss": 0.5105, + "step": 1728 + }, + { + "epoch": 3.311663479923518, + "grad_norm": 0.33203125, + "learning_rate": 0.00019274591000190028, + "loss": 0.5448, + "step": 1732 + }, + { + "epoch": 3.3193116634799233, + "grad_norm": 0.314453125, + "learning_rate": 0.00019230950025911123, + "loss": 0.5079, + "step": 1736 + }, + { + "epoch": 3.3269598470363286, + "grad_norm": 0.296875, + "learning_rate": 0.0001918727010794204, + "loss": 0.5185, + "step": 1740 + }, + { + "epoch": 3.3346080305927344, + "grad_norm": 0.3125, + "learning_rate": 0.0001914355164833366, + "loss": 0.4918, + "step": 1744 + }, + { + "epoch": 3.3422562141491396, + "grad_norm": 0.33203125, + "learning_rate": 0.00019099795049491621, + "loss": 0.5051, + "step": 1748 + }, + { + "epoch": 3.349904397705545, + "grad_norm": 0.3125, + "learning_rate": 0.00019056000714172617, + "loss": 0.5295, + "step": 1752 + }, + { + "epoch": 3.35755258126195, + "grad_norm": 0.33203125, + "learning_rate": 0.00019012169045480676, + "loss": 0.5455, + "step": 1756 + }, + { + "epoch": 3.3652007648183555, + "grad_norm": 0.29296875, + "learning_rate": 0.00018968300446863478, + "loss": 0.515, + "step": 1760 + }, + { + "epoch": 3.3728489483747612, + "grad_norm": 0.296875, + "learning_rate": 0.00018924395322108607, + "loss": 0.4868, + "step": 1764 + }, + { + "epoch": 3.3804971319311665, + "grad_norm": 0.330078125, + "learning_rate": 0.00018880454075339854, + "loss": 0.5087, + "step": 1768 + }, + { + "epoch": 3.388145315487572, + "grad_norm": 0.318359375, + "learning_rate": 0.00018836477111013495, + "loss": 0.5179, + "step": 1772 + }, + { + "epoch": 3.395793499043977, + "grad_norm": 0.349609375, + "learning_rate": 0.00018792464833914576, + "loss": 0.5613, + "step": 1776 + }, + { + "epoch": 3.4034416826003824, + "grad_norm": 0.333984375, + "learning_rate": 0.0001874841764915317, + "loss": 0.5269, + "step": 1780 + }, + { + "epoch": 3.4110898661567877, + "grad_norm": 0.337890625, + "learning_rate": 0.00018704335962160663, + "loss": 0.5024, + "step": 1784 + }, + { + "epoch": 3.418738049713193, + "grad_norm": 0.322265625, + "learning_rate": 0.00018660220178686002, + "loss": 0.5272, + "step": 1788 + }, + { + "epoch": 3.4263862332695982, + "grad_norm": 0.330078125, + "learning_rate": 0.0001861607070479199, + "loss": 0.4757, + "step": 1792 + }, + { + "epoch": 3.434034416826004, + "grad_norm": 0.34765625, + "learning_rate": 0.00018571887946851535, + "loss": 0.5167, + "step": 1796 + }, + { + "epoch": 3.4416826003824093, + "grad_norm": 0.314453125, + "learning_rate": 0.00018527672311543887, + "loss": 0.5162, + "step": 1800 + }, + { + "epoch": 3.4493307839388145, + "grad_norm": 0.33203125, + "learning_rate": 0.00018483424205850934, + "loss": 0.5711, + "step": 1804 + }, + { + "epoch": 3.45697896749522, + "grad_norm": 0.359375, + "learning_rate": 0.0001843914403705343, + "loss": 0.5373, + "step": 1808 + }, + { + "epoch": 3.464627151051625, + "grad_norm": 0.31640625, + "learning_rate": 0.00018394832212727252, + "loss": 0.5118, + "step": 1812 + }, + { + "epoch": 3.472275334608031, + "grad_norm": 0.318359375, + "learning_rate": 0.00018350489140739654, + "loss": 0.5283, + "step": 1816 + }, + { + "epoch": 3.479923518164436, + "grad_norm": 0.298828125, + "learning_rate": 0.00018306115229245506, + "loss": 0.4733, + "step": 1820 + }, + { + "epoch": 3.4875717017208414, + "grad_norm": 0.322265625, + "learning_rate": 0.00018261710886683538, + "loss": 0.5091, + "step": 1824 + }, + { + "epoch": 3.4952198852772467, + "grad_norm": 0.333984375, + "learning_rate": 0.0001821727652177258, + "loss": 0.5055, + "step": 1828 + }, + { + "epoch": 3.502868068833652, + "grad_norm": 0.31640625, + "learning_rate": 0.00018172812543507813, + "loss": 0.518, + "step": 1832 + }, + { + "epoch": 3.5105162523900573, + "grad_norm": 0.3125, + "learning_rate": 0.00018128319361156978, + "loss": 0.5309, + "step": 1836 + }, + { + "epoch": 3.5181644359464626, + "grad_norm": 0.34375, + "learning_rate": 0.0001808379738425664, + "loss": 0.5381, + "step": 1840 + }, + { + "epoch": 3.525812619502868, + "grad_norm": 0.341796875, + "learning_rate": 0.00018039247022608393, + "loss": 0.5596, + "step": 1844 + }, + { + "epoch": 3.5334608030592736, + "grad_norm": 0.333984375, + "learning_rate": 0.00017994668686275092, + "loss": 0.5198, + "step": 1848 + }, + { + "epoch": 3.541108986615679, + "grad_norm": 0.3203125, + "learning_rate": 0.00017950062785577104, + "loss": 0.542, + "step": 1852 + }, + { + "epoch": 3.548757170172084, + "grad_norm": 0.341796875, + "learning_rate": 0.00017905429731088497, + "loss": 0.5138, + "step": 1856 + }, + { + "epoch": 3.5564053537284894, + "grad_norm": 0.3359375, + "learning_rate": 0.0001786076993363328, + "loss": 0.5144, + "step": 1860 + }, + { + "epoch": 3.5640535372848947, + "grad_norm": 0.330078125, + "learning_rate": 0.0001781608380428161, + "loss": 0.5127, + "step": 1864 + }, + { + "epoch": 3.5717017208413004, + "grad_norm": 0.30859375, + "learning_rate": 0.0001777137175434602, + "loss": 0.5018, + "step": 1868 + }, + { + "epoch": 3.5793499043977057, + "grad_norm": 0.337890625, + "learning_rate": 0.00017726634195377642, + "loss": 0.4387, + "step": 1872 + }, + { + "epoch": 3.586998087954111, + "grad_norm": 0.33984375, + "learning_rate": 0.00017681871539162382, + "loss": 0.5421, + "step": 1876 + }, + { + "epoch": 3.5946462715105163, + "grad_norm": 0.353515625, + "learning_rate": 0.00017637084197717163, + "loss": 0.5118, + "step": 1880 + }, + { + "epoch": 3.6022944550669216, + "grad_norm": 0.333984375, + "learning_rate": 0.00017592272583286125, + "loss": 0.5017, + "step": 1884 + }, + { + "epoch": 3.609942638623327, + "grad_norm": 0.326171875, + "learning_rate": 0.00017547437108336836, + "loss": 0.533, + "step": 1888 + }, + { + "epoch": 3.617590822179732, + "grad_norm": 0.33203125, + "learning_rate": 0.00017502578185556468, + "loss": 0.5954, + "step": 1892 + }, + { + "epoch": 3.6252390057361374, + "grad_norm": 0.345703125, + "learning_rate": 0.00017457696227848036, + "loss": 0.5098, + "step": 1896 + }, + { + "epoch": 3.632887189292543, + "grad_norm": 0.322265625, + "learning_rate": 0.00017412791648326566, + "loss": 0.4859, + "step": 1900 + }, + { + "epoch": 3.6405353728489485, + "grad_norm": 0.333984375, + "learning_rate": 0.0001736786486031531, + "loss": 0.5466, + "step": 1904 + }, + { + "epoch": 3.6481835564053537, + "grad_norm": 0.33203125, + "learning_rate": 0.00017322916277341945, + "loss": 0.5258, + "step": 1908 + }, + { + "epoch": 3.655831739961759, + "grad_norm": 0.3515625, + "learning_rate": 0.00017277946313134758, + "loss": 0.5302, + "step": 1912 + }, + { + "epoch": 3.6634799235181643, + "grad_norm": 0.31640625, + "learning_rate": 0.00017232955381618826, + "loss": 0.5487, + "step": 1916 + }, + { + "epoch": 3.67112810707457, + "grad_norm": 0.357421875, + "learning_rate": 0.00017187943896912236, + "loss": 0.497, + "step": 1920 + }, + { + "epoch": 3.6787762906309753, + "grad_norm": 0.314453125, + "learning_rate": 0.0001714291227332224, + "loss": 0.4907, + "step": 1924 + }, + { + "epoch": 3.6864244741873806, + "grad_norm": 0.322265625, + "learning_rate": 0.00017097860925341472, + "loss": 0.5322, + "step": 1928 + }, + { + "epoch": 3.694072657743786, + "grad_norm": 0.3125, + "learning_rate": 0.00017052790267644112, + "loss": 0.4859, + "step": 1932 + }, + { + "epoch": 3.701720841300191, + "grad_norm": 0.330078125, + "learning_rate": 0.00017007700715082077, + "loss": 0.5101, + "step": 1936 + }, + { + "epoch": 3.7093690248565965, + "grad_norm": 0.31640625, + "learning_rate": 0.00016962592682681206, + "loss": 0.5091, + "step": 1940 + }, + { + "epoch": 3.7170172084130018, + "grad_norm": 0.3203125, + "learning_rate": 0.00016917466585637426, + "loss": 0.5399, + "step": 1944 + }, + { + "epoch": 3.724665391969407, + "grad_norm": 0.349609375, + "learning_rate": 0.0001687232283931294, + "loss": 0.4829, + "step": 1948 + }, + { + "epoch": 3.7323135755258128, + "grad_norm": 0.333984375, + "learning_rate": 0.00016827161859232418, + "loss": 0.5355, + "step": 1952 + }, + { + "epoch": 3.739961759082218, + "grad_norm": 0.35546875, + "learning_rate": 0.00016781984061079138, + "loss": 0.5442, + "step": 1956 + }, + { + "epoch": 3.7476099426386233, + "grad_norm": 0.330078125, + "learning_rate": 0.00016736789860691197, + "loss": 0.5238, + "step": 1960 + }, + { + "epoch": 3.7552581261950286, + "grad_norm": 0.32421875, + "learning_rate": 0.00016691579674057657, + "loss": 0.5004, + "step": 1964 + }, + { + "epoch": 3.762906309751434, + "grad_norm": 0.3203125, + "learning_rate": 0.00016646353917314726, + "loss": 0.4718, + "step": 1968 + }, + { + "epoch": 3.7705544933078396, + "grad_norm": 0.33203125, + "learning_rate": 0.00016601113006741916, + "loss": 0.4324, + "step": 1972 + }, + { + "epoch": 3.778202676864245, + "grad_norm": 0.314453125, + "learning_rate": 0.00016555857358758252, + "loss": 0.457, + "step": 1976 + }, + { + "epoch": 3.78585086042065, + "grad_norm": 0.337890625, + "learning_rate": 0.00016510587389918373, + "loss": 0.4992, + "step": 1980 + }, + { + "epoch": 3.7934990439770555, + "grad_norm": 0.333984375, + "learning_rate": 0.00016465303516908762, + "loss": 0.4984, + "step": 1984 + }, + { + "epoch": 3.801147227533461, + "grad_norm": 0.318359375, + "learning_rate": 0.0001642000615654387, + "loss": 0.5278, + "step": 1988 + }, + { + "epoch": 3.808795411089866, + "grad_norm": 0.333984375, + "learning_rate": 0.0001637469572576229, + "loss": 0.5272, + "step": 1992 + }, + { + "epoch": 3.8164435946462714, + "grad_norm": 0.333984375, + "learning_rate": 0.00016329372641622934, + "loss": 0.5019, + "step": 1996 + }, + { + "epoch": 3.8240917782026767, + "grad_norm": 0.353515625, + "learning_rate": 0.00016284037321301166, + "loss": 0.4939, + "step": 2000 + }, + { + "epoch": 3.8317399617590824, + "grad_norm": 0.333984375, + "learning_rate": 0.00016238690182084986, + "loss": 0.5335, + "step": 2004 + }, + { + "epoch": 3.8393881453154877, + "grad_norm": 0.33203125, + "learning_rate": 0.00016193331641371176, + "loss": 0.5396, + "step": 2008 + }, + { + "epoch": 3.847036328871893, + "grad_norm": 0.34765625, + "learning_rate": 0.00016147962116661472, + "loss": 0.5078, + "step": 2012 + }, + { + "epoch": 3.8546845124282982, + "grad_norm": 0.328125, + "learning_rate": 0.00016102582025558703, + "loss": 0.5286, + "step": 2016 + }, + { + "epoch": 3.8623326959847035, + "grad_norm": 0.345703125, + "learning_rate": 0.00016057191785762964, + "loss": 0.504, + "step": 2020 + }, + { + "epoch": 3.8699808795411093, + "grad_norm": 0.341796875, + "learning_rate": 0.00016011791815067754, + "loss": 0.5131, + "step": 2024 + }, + { + "epoch": 3.8776290630975145, + "grad_norm": 0.330078125, + "learning_rate": 0.00015966382531356144, + "loss": 0.5068, + "step": 2028 + }, + { + "epoch": 3.88527724665392, + "grad_norm": 0.326171875, + "learning_rate": 0.00015920964352596927, + "loss": 0.5257, + "step": 2032 + }, + { + "epoch": 3.892925430210325, + "grad_norm": 0.3203125, + "learning_rate": 0.00015875537696840775, + "loss": 0.5145, + "step": 2036 + }, + { + "epoch": 3.9005736137667304, + "grad_norm": 0.314453125, + "learning_rate": 0.0001583010298221638, + "loss": 0.5219, + "step": 2040 + }, + { + "epoch": 3.9082217973231357, + "grad_norm": 0.326171875, + "learning_rate": 0.0001578466062692661, + "loss": 0.5548, + "step": 2044 + }, + { + "epoch": 3.915869980879541, + "grad_norm": 0.330078125, + "learning_rate": 0.00015739211049244667, + "loss": 0.4981, + "step": 2048 + }, + { + "epoch": 3.9235181644359463, + "grad_norm": 0.322265625, + "learning_rate": 0.00015693754667510235, + "loss": 0.5184, + "step": 2052 + }, + { + "epoch": 3.9311663479923515, + "grad_norm": 0.32421875, + "learning_rate": 0.00015648291900125609, + "loss": 0.5207, + "step": 2056 + }, + { + "epoch": 3.9388145315487573, + "grad_norm": 0.33203125, + "learning_rate": 0.00015602823165551877, + "loss": 0.5194, + "step": 2060 + }, + { + "epoch": 3.9464627151051626, + "grad_norm": 0.318359375, + "learning_rate": 0.0001555734888230505, + "loss": 0.4692, + "step": 2064 + }, + { + "epoch": 3.954110898661568, + "grad_norm": 0.330078125, + "learning_rate": 0.00015511869468952201, + "loss": 0.5247, + "step": 2068 + }, + { + "epoch": 3.961759082217973, + "grad_norm": 0.31640625, + "learning_rate": 0.0001546638534410763, + "loss": 0.4561, + "step": 2072 + }, + { + "epoch": 3.969407265774379, + "grad_norm": 0.328125, + "learning_rate": 0.00015420896926429014, + "loss": 0.5073, + "step": 2076 + }, + { + "epoch": 3.977055449330784, + "grad_norm": 0.341796875, + "learning_rate": 0.00015375404634613524, + "loss": 0.5323, + "step": 2080 + }, + { + "epoch": 3.9847036328871894, + "grad_norm": 0.333984375, + "learning_rate": 0.00015329908887393992, + "loss": 0.5223, + "step": 2084 + }, + { + "epoch": 3.9923518164435947, + "grad_norm": 0.359375, + "learning_rate": 0.0001528441010353508, + "loss": 0.5112, + "step": 2088 + }, + { + "epoch": 4.0, + "grad_norm": 0.87109375, + "learning_rate": 0.00015238908701829378, + "loss": 0.5374, + "step": 2092 + }, + { + "epoch": 4.007648183556405, + "grad_norm": 0.30859375, + "learning_rate": 0.0001519340510109357, + "loss": 0.446, + "step": 2096 + }, + { + "epoch": 4.015296367112811, + "grad_norm": 0.3203125, + "learning_rate": 0.00015147899720164594, + "loss": 0.467, + "step": 2100 + }, + { + "epoch": 4.022944550669216, + "grad_norm": 0.333984375, + "learning_rate": 0.00015102392977895765, + "loss": 0.4112, + "step": 2104 + }, + { + "epoch": 4.030592734225621, + "grad_norm": 0.32421875, + "learning_rate": 0.00015056885293152932, + "loss": 0.4401, + "step": 2108 + }, + { + "epoch": 4.038240917782026, + "grad_norm": 0.333984375, + "learning_rate": 0.00015011377084810624, + "loss": 0.406, + "step": 2112 + }, + { + "epoch": 4.045889101338432, + "grad_norm": 0.328125, + "learning_rate": 0.00014965868771748178, + "loss": 0.4528, + "step": 2116 + }, + { + "epoch": 4.053537284894838, + "grad_norm": 0.337890625, + "learning_rate": 0.00014920360772845896, + "loss": 0.4345, + "step": 2120 + }, + { + "epoch": 4.061185468451243, + "grad_norm": 0.345703125, + "learning_rate": 0.00014874853506981206, + "loss": 0.4349, + "step": 2124 + }, + { + "epoch": 4.0688336520076485, + "grad_norm": 0.34765625, + "learning_rate": 0.00014829347393024764, + "loss": 0.4494, + "step": 2128 + }, + { + "epoch": 4.076481835564054, + "grad_norm": 0.333984375, + "learning_rate": 0.00014783842849836644, + "loss": 0.4159, + "step": 2132 + }, + { + "epoch": 4.084130019120459, + "grad_norm": 0.32421875, + "learning_rate": 0.00014738340296262443, + "loss": 0.372, + "step": 2136 + }, + { + "epoch": 4.091778202676864, + "grad_norm": 0.3359375, + "learning_rate": 0.00014692840151129467, + "loss": 0.4621, + "step": 2140 + }, + { + "epoch": 4.09942638623327, + "grad_norm": 0.328125, + "learning_rate": 0.00014647342833242827, + "loss": 0.4095, + "step": 2144 + }, + { + "epoch": 4.107074569789675, + "grad_norm": 0.341796875, + "learning_rate": 0.00014601848761381633, + "loss": 0.4172, + "step": 2148 + }, + { + "epoch": 4.11472275334608, + "grad_norm": 0.31640625, + "learning_rate": 0.00014556358354295113, + "loss": 0.4197, + "step": 2152 + }, + { + "epoch": 4.1223709369024855, + "grad_norm": 0.32421875, + "learning_rate": 0.0001451087203069875, + "loss": 0.4573, + "step": 2156 + }, + { + "epoch": 4.130019120458891, + "grad_norm": 0.33203125, + "learning_rate": 0.00014465390209270456, + "loss": 0.4117, + "step": 2160 + }, + { + "epoch": 4.137667304015296, + "grad_norm": 0.357421875, + "learning_rate": 0.00014419913308646686, + "loss": 0.4148, + "step": 2164 + }, + { + "epoch": 4.145315487571701, + "grad_norm": 0.318359375, + "learning_rate": 0.00014374441747418628, + "loss": 0.4251, + "step": 2168 + }, + { + "epoch": 4.1529636711281075, + "grad_norm": 0.33984375, + "learning_rate": 0.00014328975944128292, + "loss": 0.4314, + "step": 2172 + }, + { + "epoch": 4.160611854684513, + "grad_norm": 0.34765625, + "learning_rate": 0.00014283516317264704, + "loss": 0.4386, + "step": 2176 + }, + { + "epoch": 4.168260038240918, + "grad_norm": 0.3359375, + "learning_rate": 0.00014238063285260057, + "loss": 0.392, + "step": 2180 + }, + { + "epoch": 4.175908221797323, + "grad_norm": 0.357421875, + "learning_rate": 0.00014192617266485803, + "loss": 0.4187, + "step": 2184 + }, + { + "epoch": 4.183556405353729, + "grad_norm": 0.34375, + "learning_rate": 0.0001414717867924888, + "loss": 0.4165, + "step": 2188 + }, + { + "epoch": 4.191204588910134, + "grad_norm": 0.35546875, + "learning_rate": 0.0001410174794178779, + "loss": 0.4191, + "step": 2192 + }, + { + "epoch": 4.198852772466539, + "grad_norm": 0.35546875, + "learning_rate": 0.00014056325472268805, + "loss": 0.436, + "step": 2196 + }, + { + "epoch": 4.2065009560229445, + "grad_norm": 0.349609375, + "learning_rate": 0.0001401091168878209, + "loss": 0.4075, + "step": 2200 + }, + { + "epoch": 4.21414913957935, + "grad_norm": 0.345703125, + "learning_rate": 0.00013965507009337845, + "loss": 0.3995, + "step": 2204 + }, + { + "epoch": 4.221797323135755, + "grad_norm": 0.35546875, + "learning_rate": 0.00013920111851862494, + "loss": 0.4474, + "step": 2208 + }, + { + "epoch": 4.22944550669216, + "grad_norm": 0.330078125, + "learning_rate": 0.00013874726634194797, + "loss": 0.405, + "step": 2212 + }, + { + "epoch": 4.237093690248566, + "grad_norm": 0.326171875, + "learning_rate": 0.0001382935177408204, + "loss": 0.4316, + "step": 2216 + }, + { + "epoch": 4.244741873804971, + "grad_norm": 0.337890625, + "learning_rate": 0.00013783987689176157, + "loss": 0.4299, + "step": 2220 + }, + { + "epoch": 4.252390057361376, + "grad_norm": 0.3515625, + "learning_rate": 0.00013738634797029914, + "loss": 0.4347, + "step": 2224 + }, + { + "epoch": 4.260038240917782, + "grad_norm": 0.34375, + "learning_rate": 0.00013693293515093052, + "loss": 0.4393, + "step": 2228 + }, + { + "epoch": 4.267686424474188, + "grad_norm": 0.375, + "learning_rate": 0.00013647964260708436, + "loss": 0.44, + "step": 2232 + }, + { + "epoch": 4.275334608030593, + "grad_norm": 0.35546875, + "learning_rate": 0.0001360264745110824, + "loss": 0.4194, + "step": 2236 + }, + { + "epoch": 4.282982791586998, + "grad_norm": 0.349609375, + "learning_rate": 0.0001355734350341007, + "loss": 0.4259, + "step": 2240 + }, + { + "epoch": 4.2906309751434035, + "grad_norm": 0.365234375, + "learning_rate": 0.00013512052834613165, + "loss": 0.4311, + "step": 2244 + }, + { + "epoch": 4.298279158699809, + "grad_norm": 0.35546875, + "learning_rate": 0.00013466775861594523, + "loss": 0.4097, + "step": 2248 + }, + { + "epoch": 4.305927342256214, + "grad_norm": 0.31640625, + "learning_rate": 0.0001342151300110509, + "loss": 0.4149, + "step": 2252 + }, + { + "epoch": 4.313575525812619, + "grad_norm": 0.3203125, + "learning_rate": 0.0001337626466976591, + "loss": 0.3983, + "step": 2256 + }, + { + "epoch": 4.321223709369025, + "grad_norm": 0.36328125, + "learning_rate": 0.0001333103128406429, + "loss": 0.4424, + "step": 2260 + }, + { + "epoch": 4.32887189292543, + "grad_norm": 0.34765625, + "learning_rate": 0.00013285813260349982, + "loss": 0.4352, + "step": 2264 + }, + { + "epoch": 4.336520076481835, + "grad_norm": 0.341796875, + "learning_rate": 0.0001324061101483132, + "loss": 0.4441, + "step": 2268 + }, + { + "epoch": 4.3441682600382405, + "grad_norm": 0.361328125, + "learning_rate": 0.00013195424963571424, + "loss": 0.421, + "step": 2272 + }, + { + "epoch": 4.351816443594647, + "grad_norm": 0.33203125, + "learning_rate": 0.00013150255522484345, + "loss": 0.4131, + "step": 2276 + }, + { + "epoch": 4.359464627151052, + "grad_norm": 0.34375, + "learning_rate": 0.00013105103107331255, + "loss": 0.3568, + "step": 2280 + }, + { + "epoch": 4.367112810707457, + "grad_norm": 0.3359375, + "learning_rate": 0.00013059968133716606, + "loss": 0.445, + "step": 2284 + }, + { + "epoch": 4.374760994263863, + "grad_norm": 0.333984375, + "learning_rate": 0.00013014851017084303, + "loss": 0.4267, + "step": 2288 + }, + { + "epoch": 4.382409177820268, + "grad_norm": 0.37890625, + "learning_rate": 0.00012969752172713905, + "loss": 0.4458, + "step": 2292 + }, + { + "epoch": 4.390057361376673, + "grad_norm": 0.341796875, + "learning_rate": 0.00012924672015716759, + "loss": 0.439, + "step": 2296 + }, + { + "epoch": 4.397705544933078, + "grad_norm": 0.34765625, + "learning_rate": 0.00012879610961032218, + "loss": 0.4792, + "step": 2300 + }, + { + "epoch": 4.405353728489484, + "grad_norm": 0.310546875, + "learning_rate": 0.0001283456942342383, + "loss": 0.4113, + "step": 2304 + }, + { + "epoch": 4.413001912045889, + "grad_norm": 0.35546875, + "learning_rate": 0.0001278954781747545, + "loss": 0.4548, + "step": 2308 + }, + { + "epoch": 4.420650095602294, + "grad_norm": 0.359375, + "learning_rate": 0.00012744546557587517, + "loss": 0.4512, + "step": 2312 + }, + { + "epoch": 4.4282982791587, + "grad_norm": 0.3515625, + "learning_rate": 0.00012699566057973168, + "loss": 0.4211, + "step": 2316 + }, + { + "epoch": 4.435946462715105, + "grad_norm": 0.35546875, + "learning_rate": 0.00012654606732654468, + "loss": 0.4256, + "step": 2320 + }, + { + "epoch": 4.44359464627151, + "grad_norm": 0.357421875, + "learning_rate": 0.00012609668995458573, + "loss": 0.4451, + "step": 2324 + }, + { + "epoch": 4.451242829827915, + "grad_norm": 0.34375, + "learning_rate": 0.0001256475326001394, + "loss": 0.4703, + "step": 2328 + }, + { + "epoch": 4.458891013384322, + "grad_norm": 0.33984375, + "learning_rate": 0.00012519859939746504, + "loss": 0.4032, + "step": 2332 + }, + { + "epoch": 4.466539196940727, + "grad_norm": 0.384765625, + "learning_rate": 0.00012474989447875886, + "loss": 0.4324, + "step": 2336 + }, + { + "epoch": 4.474187380497132, + "grad_norm": 0.380859375, + "learning_rate": 0.0001243014219741158, + "loss": 0.4671, + "step": 2340 + }, + { + "epoch": 4.4818355640535374, + "grad_norm": 0.36328125, + "learning_rate": 0.00012385318601149158, + "loss": 0.4463, + "step": 2344 + }, + { + "epoch": 4.489483747609943, + "grad_norm": 0.365234375, + "learning_rate": 0.00012340519071666467, + "loss": 0.4448, + "step": 2348 + }, + { + "epoch": 4.497131931166348, + "grad_norm": 0.326171875, + "learning_rate": 0.0001229574402131982, + "loss": 0.4345, + "step": 2352 + }, + { + "epoch": 4.504780114722753, + "grad_norm": 0.359375, + "learning_rate": 0.00012250993862240227, + "loss": 0.433, + "step": 2356 + }, + { + "epoch": 4.512428298279159, + "grad_norm": 0.34765625, + "learning_rate": 0.00012206269006329593, + "loss": 0.4293, + "step": 2360 + }, + { + "epoch": 4.520076481835564, + "grad_norm": 0.35546875, + "learning_rate": 0.00012161569865256896, + "loss": 0.4413, + "step": 2364 + }, + { + "epoch": 4.527724665391969, + "grad_norm": 0.349609375, + "learning_rate": 0.00012116896850454446, + "loss": 0.4446, + "step": 2368 + }, + { + "epoch": 4.5353728489483744, + "grad_norm": 0.345703125, + "learning_rate": 0.00012072250373114057, + "loss": 0.4642, + "step": 2372 + }, + { + "epoch": 4.54302103250478, + "grad_norm": 0.34765625, + "learning_rate": 0.00012027630844183288, + "loss": 0.4825, + "step": 2376 + }, + { + "epoch": 4.550669216061186, + "grad_norm": 0.353515625, + "learning_rate": 0.00011983038674361658, + "loss": 0.4303, + "step": 2380 + }, + { + "epoch": 4.558317399617591, + "grad_norm": 0.34375, + "learning_rate": 0.00011938474274096844, + "loss": 0.4013, + "step": 2384 + }, + { + "epoch": 4.5659655831739965, + "grad_norm": 0.330078125, + "learning_rate": 0.00011893938053580933, + "loss": 0.4183, + "step": 2388 + }, + { + "epoch": 4.573613766730402, + "grad_norm": 0.359375, + "learning_rate": 0.00011849430422746624, + "loss": 0.4345, + "step": 2392 + }, + { + "epoch": 4.581261950286807, + "grad_norm": 0.369140625, + "learning_rate": 0.00011804951791263466, + "loss": 0.4253, + "step": 2396 + }, + { + "epoch": 4.588910133843212, + "grad_norm": 0.359375, + "learning_rate": 0.00011760502568534081, + "loss": 0.473, + "step": 2400 + }, + { + "epoch": 4.596558317399618, + "grad_norm": 0.34765625, + "learning_rate": 0.00011716083163690405, + "loss": 0.451, + "step": 2404 + }, + { + "epoch": 4.604206500956023, + "grad_norm": 0.353515625, + "learning_rate": 0.00011671693985589913, + "loss": 0.4522, + "step": 2408 + }, + { + "epoch": 4.611854684512428, + "grad_norm": 0.361328125, + "learning_rate": 0.00011627335442811846, + "loss": 0.4193, + "step": 2412 + }, + { + "epoch": 4.6195028680688335, + "grad_norm": 0.349609375, + "learning_rate": 0.00011583007943653494, + "loss": 0.4616, + "step": 2416 + }, + { + "epoch": 4.627151051625239, + "grad_norm": 0.34375, + "learning_rate": 0.00011538711896126369, + "loss": 0.4549, + "step": 2420 + }, + { + "epoch": 4.634799235181644, + "grad_norm": 0.353515625, + "learning_rate": 0.00011494447707952514, + "loss": 0.4119, + "step": 2424 + }, + { + "epoch": 4.642447418738049, + "grad_norm": 0.359375, + "learning_rate": 0.0001145021578656071, + "loss": 0.4203, + "step": 2428 + }, + { + "epoch": 4.650095602294455, + "grad_norm": 0.345703125, + "learning_rate": 0.00011406016539082747, + "loss": 0.4199, + "step": 2432 + }, + { + "epoch": 4.657743785850861, + "grad_norm": 0.380859375, + "learning_rate": 0.00011361850372349667, + "loss": 0.4791, + "step": 2436 + }, + { + "epoch": 4.665391969407266, + "grad_norm": 0.3515625, + "learning_rate": 0.00011317717692888012, + "loss": 0.3904, + "step": 2440 + }, + { + "epoch": 4.673040152963671, + "grad_norm": 0.353515625, + "learning_rate": 0.00011273618906916107, + "loss": 0.413, + "step": 2444 + }, + { + "epoch": 4.680688336520077, + "grad_norm": 0.365234375, + "learning_rate": 0.00011229554420340289, + "loss": 0.4078, + "step": 2448 + }, + { + "epoch": 4.688336520076482, + "grad_norm": 0.375, + "learning_rate": 0.00011185524638751195, + "loss": 0.481, + "step": 2452 + }, + { + "epoch": 4.695984703632887, + "grad_norm": 0.384765625, + "learning_rate": 0.0001114152996742003, + "loss": 0.4649, + "step": 2456 + }, + { + "epoch": 4.7036328871892925, + "grad_norm": 0.357421875, + "learning_rate": 0.00011097570811294803, + "loss": 0.4758, + "step": 2460 + }, + { + "epoch": 4.711281070745698, + "grad_norm": 0.37109375, + "learning_rate": 0.00011053647574996648, + "loss": 0.3909, + "step": 2464 + }, + { + "epoch": 4.718929254302103, + "grad_norm": 0.353515625, + "learning_rate": 0.0001100976066281606, + "loss": 0.3929, + "step": 2468 + }, + { + "epoch": 4.726577437858508, + "grad_norm": 0.34765625, + "learning_rate": 0.00010965910478709206, + "loss": 0.4572, + "step": 2472 + }, + { + "epoch": 4.734225621414914, + "grad_norm": 0.376953125, + "learning_rate": 0.00010922097426294166, + "loss": 0.422, + "step": 2476 + }, + { + "epoch": 4.741873804971319, + "grad_norm": 0.3671875, + "learning_rate": 0.00010878321908847259, + "loss": 0.4397, + "step": 2480 + }, + { + "epoch": 4.749521988527725, + "grad_norm": 0.333984375, + "learning_rate": 0.00010834584329299322, + "loss": 0.4073, + "step": 2484 + }, + { + "epoch": 4.75717017208413, + "grad_norm": 0.3515625, + "learning_rate": 0.00010790885090231968, + "loss": 0.4209, + "step": 2488 + }, + { + "epoch": 4.764818355640536, + "grad_norm": 0.361328125, + "learning_rate": 0.00010747224593873933, + "loss": 0.4365, + "step": 2492 + }, + { + "epoch": 4.772466539196941, + "grad_norm": 0.33203125, + "learning_rate": 0.00010703603242097322, + "loss": 0.4213, + "step": 2496 + }, + { + "epoch": 4.780114722753346, + "grad_norm": 0.33984375, + "learning_rate": 0.00010660021436413956, + "loss": 0.4705, + "step": 2500 + }, + { + "epoch": 4.7877629063097515, + "grad_norm": 0.353515625, + "learning_rate": 0.00010616479577971638, + "loss": 0.4171, + "step": 2504 + }, + { + "epoch": 4.795411089866157, + "grad_norm": 0.373046875, + "learning_rate": 0.00010572978067550489, + "loss": 0.4357, + "step": 2508 + }, + { + "epoch": 4.803059273422562, + "grad_norm": 0.359375, + "learning_rate": 0.00010529517305559244, + "loss": 0.4225, + "step": 2512 + }, + { + "epoch": 4.810707456978967, + "grad_norm": 0.33203125, + "learning_rate": 0.00010486097692031566, + "loss": 0.4569, + "step": 2516 + }, + { + "epoch": 4.818355640535373, + "grad_norm": 0.357421875, + "learning_rate": 0.00010442719626622374, + "loss": 0.4801, + "step": 2520 + }, + { + "epoch": 4.826003824091778, + "grad_norm": 0.353515625, + "learning_rate": 0.0001039938350860415, + "loss": 0.4476, + "step": 2524 + }, + { + "epoch": 4.833652007648183, + "grad_norm": 0.33203125, + "learning_rate": 0.00010356089736863282, + "loss": 0.4016, + "step": 2528 + }, + { + "epoch": 4.8413001912045885, + "grad_norm": 0.361328125, + "learning_rate": 0.0001031283870989638, + "loss": 0.4467, + "step": 2532 + }, + { + "epoch": 4.848948374760994, + "grad_norm": 0.35546875, + "learning_rate": 0.00010269630825806597, + "loss": 0.4236, + "step": 2536 + }, + { + "epoch": 4.8565965583174, + "grad_norm": 0.373046875, + "learning_rate": 0.00010226466482300006, + "loss": 0.426, + "step": 2540 + }, + { + "epoch": 4.864244741873805, + "grad_norm": 0.35546875, + "learning_rate": 0.00010183346076681882, + "loss": 0.4452, + "step": 2544 + }, + { + "epoch": 4.871892925430211, + "grad_norm": 0.353515625, + "learning_rate": 0.00010140270005853098, + "loss": 0.4182, + "step": 2548 + }, + { + "epoch": 4.879541108986616, + "grad_norm": 0.37109375, + "learning_rate": 0.00010097238666306427, + "loss": 0.4035, + "step": 2552 + }, + { + "epoch": 4.887189292543021, + "grad_norm": 0.36328125, + "learning_rate": 0.00010054252454122934, + "loss": 0.4187, + "step": 2556 + }, + { + "epoch": 4.894837476099426, + "grad_norm": 0.376953125, + "learning_rate": 0.000100113117649683, + "loss": 0.4494, + "step": 2560 + }, + { + "epoch": 4.902485659655832, + "grad_norm": 0.361328125, + "learning_rate": 9.968416994089189e-05, + "loss": 0.4461, + "step": 2564 + }, + { + "epoch": 4.910133843212237, + "grad_norm": 0.359375, + "learning_rate": 9.925568536309619e-05, + "loss": 0.4589, + "step": 2568 + }, + { + "epoch": 4.917782026768642, + "grad_norm": 0.337890625, + "learning_rate": 9.88276678602731e-05, + "loss": 0.4329, + "step": 2572 + }, + { + "epoch": 4.925430210325048, + "grad_norm": 0.35546875, + "learning_rate": 9.840012137210072e-05, + "loss": 0.4048, + "step": 2576 + }, + { + "epoch": 4.933078393881453, + "grad_norm": 0.40234375, + "learning_rate": 9.797304983392164e-05, + "loss": 0.4689, + "step": 2580 + }, + { + "epoch": 4.940726577437858, + "grad_norm": 0.37890625, + "learning_rate": 9.75464571767068e-05, + "loss": 0.4538, + "step": 2584 + }, + { + "epoch": 4.948374760994264, + "grad_norm": 0.349609375, + "learning_rate": 9.712034732701942e-05, + "loss": 0.4251, + "step": 2588 + }, + { + "epoch": 4.95602294455067, + "grad_norm": 0.3671875, + "learning_rate": 9.669472420697845e-05, + "loss": 0.3808, + "step": 2592 + }, + { + "epoch": 4.963671128107075, + "grad_norm": 0.33984375, + "learning_rate": 9.626959173422306e-05, + "loss": 0.4249, + "step": 2596 + }, + { + "epoch": 4.97131931166348, + "grad_norm": 0.36328125, + "learning_rate": 9.5844953821876e-05, + "loss": 0.4491, + "step": 2600 + }, + { + "epoch": 4.9789674952198855, + "grad_norm": 0.33203125, + "learning_rate": 9.542081437850801e-05, + "loss": 0.3934, + "step": 2604 + }, + { + "epoch": 4.986615678776291, + "grad_norm": 0.3515625, + "learning_rate": 9.49971773081017e-05, + "loss": 0.4589, + "step": 2608 + }, + { + "epoch": 4.994263862332696, + "grad_norm": 0.3671875, + "learning_rate": 9.457404651001546e-05, + "loss": 0.4418, + "step": 2612 + }, + { + "epoch": 5.001912045889101, + "grad_norm": 0.31640625, + "learning_rate": 9.415142587894786e-05, + "loss": 0.3326, + "step": 2616 + }, + { + "epoch": 5.009560229445507, + "grad_norm": 0.314453125, + "learning_rate": 9.372931930490147e-05, + "loss": 0.3716, + "step": 2620 + }, + { + "epoch": 5.017208413001912, + "grad_norm": 0.341796875, + "learning_rate": 9.330773067314747e-05, + "loss": 0.4067, + "step": 2624 + }, + { + "epoch": 5.024856596558317, + "grad_norm": 0.345703125, + "learning_rate": 9.28866638641894e-05, + "loss": 0.3533, + "step": 2628 + }, + { + "epoch": 5.0325047801147225, + "grad_norm": 0.3359375, + "learning_rate": 9.246612275372786e-05, + "loss": 0.3446, + "step": 2632 + }, + { + "epoch": 5.040152963671128, + "grad_norm": 0.353515625, + "learning_rate": 9.204611121262466e-05, + "loss": 0.391, + "step": 2636 + }, + { + "epoch": 5.047801147227533, + "grad_norm": 0.3515625, + "learning_rate": 9.16266331068671e-05, + "loss": 0.3821, + "step": 2640 + }, + { + "epoch": 5.055449330783939, + "grad_norm": 0.35546875, + "learning_rate": 9.120769229753262e-05, + "loss": 0.3813, + "step": 2644 + }, + { + "epoch": 5.0630975143403445, + "grad_norm": 0.361328125, + "learning_rate": 9.078929264075293e-05, + "loss": 0.3493, + "step": 2648 + }, + { + "epoch": 5.07074569789675, + "grad_norm": 0.3515625, + "learning_rate": 9.0371437987679e-05, + "loss": 0.3346, + "step": 2652 + }, + { + "epoch": 5.078393881453155, + "grad_norm": 0.35546875, + "learning_rate": 8.995413218444502e-05, + "loss": 0.3753, + "step": 2656 + }, + { + "epoch": 5.08604206500956, + "grad_norm": 0.33984375, + "learning_rate": 8.953737907213346e-05, + "loss": 0.3749, + "step": 2660 + }, + { + "epoch": 5.093690248565966, + "grad_norm": 0.3359375, + "learning_rate": 8.912118248673966e-05, + "loss": 0.3673, + "step": 2664 + }, + { + "epoch": 5.101338432122371, + "grad_norm": 0.34765625, + "learning_rate": 8.870554625913619e-05, + "loss": 0.3591, + "step": 2668 + }, + { + "epoch": 5.108986615678776, + "grad_norm": 0.369140625, + "learning_rate": 8.8290474215038e-05, + "loss": 0.387, + "step": 2672 + }, + { + "epoch": 5.1166347992351815, + "grad_norm": 0.359375, + "learning_rate": 8.787597017496687e-05, + "loss": 0.3774, + "step": 2676 + }, + { + "epoch": 5.124282982791587, + "grad_norm": 0.330078125, + "learning_rate": 8.74620379542166e-05, + "loss": 0.3711, + "step": 2680 + }, + { + "epoch": 5.131931166347992, + "grad_norm": 0.33203125, + "learning_rate": 8.704868136281742e-05, + "loss": 0.3733, + "step": 2684 + }, + { + "epoch": 5.139579349904397, + "grad_norm": 0.36328125, + "learning_rate": 8.663590420550145e-05, + "loss": 0.3483, + "step": 2688 + }, + { + "epoch": 5.147227533460803, + "grad_norm": 0.369140625, + "learning_rate": 8.622371028166743e-05, + "loss": 0.3773, + "step": 2692 + }, + { + "epoch": 5.154875717017209, + "grad_norm": 0.3671875, + "learning_rate": 8.581210338534538e-05, + "loss": 0.3921, + "step": 2696 + }, + { + "epoch": 5.162523900573614, + "grad_norm": 0.365234375, + "learning_rate": 8.540108730516248e-05, + "loss": 0.355, + "step": 2700 + }, + { + "epoch": 5.170172084130019, + "grad_norm": 0.34375, + "learning_rate": 8.499066582430748e-05, + "loss": 0.3805, + "step": 2704 + }, + { + "epoch": 5.177820267686425, + "grad_norm": 0.333984375, + "learning_rate": 8.45808427204962e-05, + "loss": 0.3575, + "step": 2708 + }, + { + "epoch": 5.18546845124283, + "grad_norm": 0.373046875, + "learning_rate": 8.417162176593686e-05, + "loss": 0.3948, + "step": 2712 + }, + { + "epoch": 5.193116634799235, + "grad_norm": 0.345703125, + "learning_rate": 8.376300672729504e-05, + "loss": 0.3883, + "step": 2716 + }, + { + "epoch": 5.2007648183556405, + "grad_norm": 0.35546875, + "learning_rate": 8.335500136565919e-05, + "loss": 0.4049, + "step": 2720 + }, + { + "epoch": 5.208413001912046, + "grad_norm": 0.376953125, + "learning_rate": 8.294760943650605e-05, + "loss": 0.3689, + "step": 2724 + }, + { + "epoch": 5.216061185468451, + "grad_norm": 0.384765625, + "learning_rate": 8.254083468966612e-05, + "loss": 0.3568, + "step": 2728 + }, + { + "epoch": 5.223709369024856, + "grad_norm": 0.361328125, + "learning_rate": 8.213468086928891e-05, + "loss": 0.3961, + "step": 2732 + }, + { + "epoch": 5.231357552581262, + "grad_norm": 0.3515625, + "learning_rate": 8.172915171380863e-05, + "loss": 0.3587, + "step": 2736 + }, + { + "epoch": 5.239005736137667, + "grad_norm": 0.357421875, + "learning_rate": 8.132425095590999e-05, + "loss": 0.407, + "step": 2740 + }, + { + "epoch": 5.246653919694072, + "grad_norm": 0.369140625, + "learning_rate": 8.091998232249325e-05, + "loss": 0.4085, + "step": 2744 + }, + { + "epoch": 5.254302103250478, + "grad_norm": 0.3671875, + "learning_rate": 8.051634953464069e-05, + "loss": 0.366, + "step": 2748 + }, + { + "epoch": 5.261950286806884, + "grad_norm": 0.341796875, + "learning_rate": 8.011335630758169e-05, + "loss": 0.3572, + "step": 2752 + }, + { + "epoch": 5.269598470363289, + "grad_norm": 0.392578125, + "learning_rate": 7.971100635065894e-05, + "loss": 0.4136, + "step": 2756 + }, + { + "epoch": 5.277246653919694, + "grad_norm": 0.3671875, + "learning_rate": 7.930930336729406e-05, + "loss": 0.3311, + "step": 2760 + }, + { + "epoch": 5.2848948374761, + "grad_norm": 0.357421875, + "learning_rate": 7.890825105495376e-05, + "loss": 0.351, + "step": 2764 + }, + { + "epoch": 5.292543021032505, + "grad_norm": 0.341796875, + "learning_rate": 7.850785310511555e-05, + "loss": 0.3403, + "step": 2768 + }, + { + "epoch": 5.30019120458891, + "grad_norm": 0.3359375, + "learning_rate": 7.810811320323386e-05, + "loss": 0.2974, + "step": 2772 + }, + { + "epoch": 5.307839388145315, + "grad_norm": 0.36328125, + "learning_rate": 7.770903502870625e-05, + "loss": 0.3635, + "step": 2776 + }, + { + "epoch": 5.315487571701721, + "grad_norm": 0.341796875, + "learning_rate": 7.731062225483933e-05, + "loss": 0.3669, + "step": 2780 + }, + { + "epoch": 5.323135755258126, + "grad_norm": 0.330078125, + "learning_rate": 7.6912878548815e-05, + "loss": 0.3823, + "step": 2784 + }, + { + "epoch": 5.330783938814531, + "grad_norm": 0.37890625, + "learning_rate": 7.651580757165691e-05, + "loss": 0.3644, + "step": 2788 + }, + { + "epoch": 5.338432122370937, + "grad_norm": 0.361328125, + "learning_rate": 7.611941297819643e-05, + "loss": 0.3505, + "step": 2792 + }, + { + "epoch": 5.346080305927342, + "grad_norm": 0.361328125, + "learning_rate": 7.572369841703924e-05, + "loss": 0.3691, + "step": 2796 + }, + { + "epoch": 5.353728489483748, + "grad_norm": 0.357421875, + "learning_rate": 7.532866753053159e-05, + "loss": 0.3229, + "step": 2800 + }, + { + "epoch": 5.361376673040153, + "grad_norm": 0.39453125, + "learning_rate": 7.493432395472711e-05, + "loss": 0.3849, + "step": 2804 + }, + { + "epoch": 5.369024856596559, + "grad_norm": 0.375, + "learning_rate": 7.454067131935269e-05, + "loss": 0.4105, + "step": 2808 + }, + { + "epoch": 5.376673040152964, + "grad_norm": 0.369140625, + "learning_rate": 7.414771324777579e-05, + "loss": 0.3686, + "step": 2812 + }, + { + "epoch": 5.384321223709369, + "grad_norm": 0.357421875, + "learning_rate": 7.375545335697085e-05, + "loss": 0.3293, + "step": 2816 + }, + { + "epoch": 5.3919694072657744, + "grad_norm": 0.3828125, + "learning_rate": 7.336389525748548e-05, + "loss": 0.3892, + "step": 2820 + }, + { + "epoch": 5.39961759082218, + "grad_norm": 0.3984375, + "learning_rate": 7.29730425534081e-05, + "loss": 0.3562, + "step": 2824 + }, + { + "epoch": 5.407265774378585, + "grad_norm": 0.376953125, + "learning_rate": 7.258289884233417e-05, + "loss": 0.363, + "step": 2828 + }, + { + "epoch": 5.41491395793499, + "grad_norm": 0.35546875, + "learning_rate": 7.21934677153332e-05, + "loss": 0.3517, + "step": 2832 + }, + { + "epoch": 5.422562141491396, + "grad_norm": 0.353515625, + "learning_rate": 7.180475275691573e-05, + "loss": 0.39, + "step": 2836 + }, + { + "epoch": 5.430210325047801, + "grad_norm": 0.376953125, + "learning_rate": 7.141675754500049e-05, + "loss": 0.3898, + "step": 2840 + }, + { + "epoch": 5.437858508604206, + "grad_norm": 0.396484375, + "learning_rate": 7.102948565088116e-05, + "loss": 0.4363, + "step": 2844 + }, + { + "epoch": 5.4455066921606115, + "grad_norm": 0.373046875, + "learning_rate": 7.064294063919368e-05, + "loss": 0.3494, + "step": 2848 + }, + { + "epoch": 5.453154875717018, + "grad_norm": 0.353515625, + "learning_rate": 7.025712606788362e-05, + "loss": 0.3457, + "step": 2852 + }, + { + "epoch": 5.460803059273423, + "grad_norm": 0.369140625, + "learning_rate": 6.987204548817278e-05, + "loss": 0.3279, + "step": 2856 + }, + { + "epoch": 5.468451242829828, + "grad_norm": 0.353515625, + "learning_rate": 6.948770244452737e-05, + "loss": 0.3591, + "step": 2860 + }, + { + "epoch": 5.4760994263862335, + "grad_norm": 0.365234375, + "learning_rate": 6.910410047462495e-05, + "loss": 0.3488, + "step": 2864 + }, + { + "epoch": 5.483747609942639, + "grad_norm": 0.3671875, + "learning_rate": 6.87212431093215e-05, + "loss": 0.3466, + "step": 2868 + }, + { + "epoch": 5.491395793499044, + "grad_norm": 0.34765625, + "learning_rate": 6.833913387261973e-05, + "loss": 0.4094, + "step": 2872 + }, + { + "epoch": 5.499043977055449, + "grad_norm": 0.375, + "learning_rate": 6.795777628163599e-05, + "loss": 0.3803, + "step": 2876 + }, + { + "epoch": 5.506692160611855, + "grad_norm": 0.361328125, + "learning_rate": 6.757717384656817e-05, + "loss": 0.3832, + "step": 2880 + }, + { + "epoch": 5.51434034416826, + "grad_norm": 0.37109375, + "learning_rate": 6.719733007066331e-05, + "loss": 0.3575, + "step": 2884 + }, + { + "epoch": 5.521988527724665, + "grad_norm": 0.357421875, + "learning_rate": 6.68182484501855e-05, + "loss": 0.3435, + "step": 2888 + }, + { + "epoch": 5.5296367112810705, + "grad_norm": 0.359375, + "learning_rate": 6.643993247438347e-05, + "loss": 0.3674, + "step": 2892 + }, + { + "epoch": 5.537284894837476, + "grad_norm": 0.361328125, + "learning_rate": 6.606238562545859e-05, + "loss": 0.3507, + "step": 2896 + }, + { + "epoch": 5.544933078393882, + "grad_norm": 0.357421875, + "learning_rate": 6.568561137853296e-05, + "loss": 0.3555, + "step": 2900 + }, + { + "epoch": 5.552581261950287, + "grad_norm": 0.37890625, + "learning_rate": 6.530961320161712e-05, + "loss": 0.3786, + "step": 2904 + }, + { + "epoch": 5.5602294455066925, + "grad_norm": 0.373046875, + "learning_rate": 6.493439455557835e-05, + "loss": 0.4043, + "step": 2908 + }, + { + "epoch": 5.567877629063098, + "grad_norm": 0.3671875, + "learning_rate": 6.455995889410873e-05, + "loss": 0.4086, + "step": 2912 + }, + { + "epoch": 5.575525812619503, + "grad_norm": 0.365234375, + "learning_rate": 6.418630966369348e-05, + "loss": 0.366, + "step": 2916 + }, + { + "epoch": 5.583173996175908, + "grad_norm": 0.341796875, + "learning_rate": 6.381345030357899e-05, + "loss": 0.3801, + "step": 2920 + }, + { + "epoch": 5.590822179732314, + "grad_norm": 0.3984375, + "learning_rate": 6.344138424574134e-05, + "loss": 0.3985, + "step": 2924 + }, + { + "epoch": 5.598470363288719, + "grad_norm": 0.390625, + "learning_rate": 6.307011491485484e-05, + "loss": 0.4266, + "step": 2928 + }, + { + "epoch": 5.606118546845124, + "grad_norm": 0.359375, + "learning_rate": 6.269964572826001e-05, + "loss": 0.3317, + "step": 2932 + }, + { + "epoch": 5.6137667304015295, + "grad_norm": 0.384765625, + "learning_rate": 6.232998009593275e-05, + "loss": 0.3663, + "step": 2936 + }, + { + "epoch": 5.621414913957935, + "grad_norm": 0.392578125, + "learning_rate": 6.196112142045268e-05, + "loss": 0.3804, + "step": 2940 + }, + { + "epoch": 5.62906309751434, + "grad_norm": 0.380859375, + "learning_rate": 6.159307309697149e-05, + "loss": 0.3782, + "step": 2944 + }, + { + "epoch": 5.636711281070745, + "grad_norm": 0.400390625, + "learning_rate": 6.122583851318233e-05, + "loss": 0.4047, + "step": 2948 + }, + { + "epoch": 5.644359464627151, + "grad_norm": 0.34765625, + "learning_rate": 6.085942104928815e-05, + "loss": 0.3875, + "step": 2952 + }, + { + "epoch": 5.652007648183556, + "grad_norm": 0.388671875, + "learning_rate": 6.049382407797076e-05, + "loss": 0.3649, + "step": 2956 + }, + { + "epoch": 5.659655831739962, + "grad_norm": 0.3359375, + "learning_rate": 6.012905096435968e-05, + "loss": 0.3813, + "step": 2960 + }, + { + "epoch": 5.667304015296367, + "grad_norm": 0.396484375, + "learning_rate": 5.976510506600146e-05, + "loss": 0.3955, + "step": 2964 + }, + { + "epoch": 5.674952198852773, + "grad_norm": 0.38671875, + "learning_rate": 5.9401989732828384e-05, + "loss": 0.3574, + "step": 2968 + }, + { + "epoch": 5.682600382409178, + "grad_norm": 0.349609375, + "learning_rate": 5.9039708307127816e-05, + "loss": 0.333, + "step": 2972 + }, + { + "epoch": 5.690248565965583, + "grad_norm": 0.361328125, + "learning_rate": 5.8678264123511626e-05, + "loss": 0.3856, + "step": 2976 + }, + { + "epoch": 5.6978967495219885, + "grad_norm": 0.345703125, + "learning_rate": 5.8317660508885e-05, + "loss": 0.3931, + "step": 2980 + }, + { + "epoch": 5.705544933078394, + "grad_norm": 0.375, + "learning_rate": 5.795790078241641e-05, + "loss": 0.3364, + "step": 2984 + }, + { + "epoch": 5.713193116634799, + "grad_norm": 0.349609375, + "learning_rate": 5.7598988255506644e-05, + "loss": 0.3825, + "step": 2988 + }, + { + "epoch": 5.720841300191204, + "grad_norm": 0.376953125, + "learning_rate": 5.724092623175841e-05, + "loss": 0.3654, + "step": 2992 + }, + { + "epoch": 5.72848948374761, + "grad_norm": 0.373046875, + "learning_rate": 5.6883718006946146e-05, + "loss": 0.4175, + "step": 2996 + }, + { + "epoch": 5.736137667304015, + "grad_norm": 0.392578125, + "learning_rate": 5.652736686898537e-05, + "loss": 0.369, + "step": 3000 + }, + { + "epoch": 5.743785850860421, + "grad_norm": 0.3671875, + "learning_rate": 5.61718760979026e-05, + "loss": 0.3961, + "step": 3004 + }, + { + "epoch": 5.751434034416826, + "grad_norm": 0.353515625, + "learning_rate": 5.5817248965805096e-05, + "loss": 0.3622, + "step": 3008 + }, + { + "epoch": 5.759082217973232, + "grad_norm": 0.36328125, + "learning_rate": 5.546348873685089e-05, + "loss": 0.3369, + "step": 3012 + }, + { + "epoch": 5.766730401529637, + "grad_norm": 0.357421875, + "learning_rate": 5.51105986672185e-05, + "loss": 0.3314, + "step": 3016 + }, + { + "epoch": 5.774378585086042, + "grad_norm": 0.337890625, + "learning_rate": 5.475858200507708e-05, + "loss": 0.3212, + "step": 3020 + }, + { + "epoch": 5.782026768642448, + "grad_norm": 0.375, + "learning_rate": 5.440744199055663e-05, + "loss": 0.3978, + "step": 3024 + }, + { + "epoch": 5.789674952198853, + "grad_norm": 0.37890625, + "learning_rate": 5.4057181855718e-05, + "loss": 0.3938, + "step": 3028 + }, + { + "epoch": 5.797323135755258, + "grad_norm": 0.369140625, + "learning_rate": 5.370780482452317e-05, + "loss": 0.3613, + "step": 3032 + }, + { + "epoch": 5.804971319311663, + "grad_norm": 0.365234375, + "learning_rate": 5.335931411280559e-05, + "loss": 0.3717, + "step": 3036 + }, + { + "epoch": 5.812619502868069, + "grad_norm": 0.345703125, + "learning_rate": 5.3011712928240787e-05, + "loss": 0.3572, + "step": 3040 + }, + { + "epoch": 5.820267686424474, + "grad_norm": 0.359375, + "learning_rate": 5.2665004470316456e-05, + "loss": 0.3993, + "step": 3044 + }, + { + "epoch": 5.827915869980879, + "grad_norm": 0.365234375, + "learning_rate": 5.231919193030324e-05, + "loss": 0.3708, + "step": 3048 + }, + { + "epoch": 5.835564053537285, + "grad_norm": 0.37890625, + "learning_rate": 5.197427849122549e-05, + "loss": 0.3877, + "step": 3052 + }, + { + "epoch": 5.84321223709369, + "grad_norm": 0.34375, + "learning_rate": 5.1630267327831494e-05, + "loss": 0.3747, + "step": 3056 + }, + { + "epoch": 5.850860420650095, + "grad_norm": 0.373046875, + "learning_rate": 5.128716160656489e-05, + "loss": 0.4003, + "step": 3060 + }, + { + "epoch": 5.858508604206501, + "grad_norm": 0.3515625, + "learning_rate": 5.0944964485534975e-05, + "loss": 0.3654, + "step": 3064 + }, + { + "epoch": 5.866156787762907, + "grad_norm": 0.39453125, + "learning_rate": 5.06036791144879e-05, + "loss": 0.3837, + "step": 3068 + }, + { + "epoch": 5.873804971319312, + "grad_norm": 0.40625, + "learning_rate": 5.0263308634777745e-05, + "loss": 0.368, + "step": 3072 + }, + { + "epoch": 5.881453154875717, + "grad_norm": 0.376953125, + "learning_rate": 4.992385617933734e-05, + "loss": 0.3975, + "step": 3076 + }, + { + "epoch": 5.8891013384321225, + "grad_norm": 0.365234375, + "learning_rate": 4.958532487264968e-05, + "loss": 0.3837, + "step": 3080 + }, + { + "epoch": 5.896749521988528, + "grad_norm": 0.333984375, + "learning_rate": 4.924771783071895e-05, + "loss": 0.3744, + "step": 3084 + }, + { + "epoch": 5.904397705544933, + "grad_norm": 0.357421875, + "learning_rate": 4.8911038161042136e-05, + "loss": 0.3439, + "step": 3088 + }, + { + "epoch": 5.912045889101338, + "grad_norm": 0.37890625, + "learning_rate": 4.857528896258012e-05, + "loss": 0.4041, + "step": 3092 + }, + { + "epoch": 5.919694072657744, + "grad_norm": 0.375, + "learning_rate": 4.824047332572924e-05, + "loss": 0.3753, + "step": 3096 + }, + { + "epoch": 5.927342256214149, + "grad_norm": 0.359375, + "learning_rate": 4.7906594332293116e-05, + "loss": 0.3822, + "step": 3100 + }, + { + "epoch": 5.934990439770554, + "grad_norm": 0.3515625, + "learning_rate": 4.75736550554537e-05, + "loss": 0.3535, + "step": 3104 + }, + { + "epoch": 5.9426386233269595, + "grad_norm": 0.35546875, + "learning_rate": 4.724165855974367e-05, + "loss": 0.377, + "step": 3108 + }, + { + "epoch": 5.950286806883366, + "grad_norm": 0.365234375, + "learning_rate": 4.6910607901017715e-05, + "loss": 0.359, + "step": 3112 + }, + { + "epoch": 5.957934990439771, + "grad_norm": 0.36328125, + "learning_rate": 4.65805061264246e-05, + "loss": 0.3415, + "step": 3116 + }, + { + "epoch": 5.965583173996176, + "grad_norm": 0.390625, + "learning_rate": 4.625135627437922e-05, + "loss": 0.3968, + "step": 3120 + }, + { + "epoch": 5.9732313575525815, + "grad_norm": 0.357421875, + "learning_rate": 4.592316137453439e-05, + "loss": 0.3983, + "step": 3124 + }, + { + "epoch": 5.980879541108987, + "grad_norm": 0.357421875, + "learning_rate": 4.559592444775315e-05, + "loss": 0.3466, + "step": 3128 + }, + { + "epoch": 5.988527724665392, + "grad_norm": 0.3515625, + "learning_rate": 4.5269648506080816e-05, + "loss": 0.3442, + "step": 3132 + }, + { + "epoch": 5.996175908221797, + "grad_norm": 0.375, + "learning_rate": 4.4944336552717514e-05, + "loss": 0.3768, + "step": 3136 + }, + { + "epoch": 6.003824091778203, + "grad_norm": 0.3359375, + "learning_rate": 4.461999158199019e-05, + "loss": 0.3533, + "step": 3140 + }, + { + "epoch": 6.011472275334608, + "grad_norm": 0.34765625, + "learning_rate": 4.429661657932523e-05, + "loss": 0.3247, + "step": 3144 + }, + { + "epoch": 6.019120458891013, + "grad_norm": 0.34375, + "learning_rate": 4.397421452122114e-05, + "loss": 0.329, + "step": 3148 + }, + { + "epoch": 6.0267686424474185, + "grad_norm": 0.36328125, + "learning_rate": 4.3652788375220787e-05, + "loss": 0.3547, + "step": 3152 + }, + { + "epoch": 6.034416826003824, + "grad_norm": 0.361328125, + "learning_rate": 4.333234109988434e-05, + "loss": 0.4026, + "step": 3156 + }, + { + "epoch": 6.042065009560229, + "grad_norm": 0.3828125, + "learning_rate": 4.3012875644761955e-05, + "loss": 0.3534, + "step": 3160 + }, + { + "epoch": 6.049713193116634, + "grad_norm": 0.36328125, + "learning_rate": 4.269439495036678e-05, + "loss": 0.3035, + "step": 3164 + }, + { + "epoch": 6.0573613766730405, + "grad_norm": 0.365234375, + "learning_rate": 4.2376901948147465e-05, + "loss": 0.3553, + "step": 3168 + }, + { + "epoch": 6.065009560229446, + "grad_norm": 0.369140625, + "learning_rate": 4.206039956046176e-05, + "loss": 0.3602, + "step": 3172 + }, + { + "epoch": 6.072657743785851, + "grad_norm": 0.349609375, + "learning_rate": 4.174489070054927e-05, + "loss": 0.3606, + "step": 3176 + }, + { + "epoch": 6.080305927342256, + "grad_norm": 0.349609375, + "learning_rate": 4.143037827250447e-05, + "loss": 0.3499, + "step": 3180 + }, + { + "epoch": 6.087954110898662, + "grad_norm": 0.349609375, + "learning_rate": 4.1116865171250496e-05, + "loss": 0.3447, + "step": 3184 + }, + { + "epoch": 6.095602294455067, + "grad_norm": 0.357421875, + "learning_rate": 4.0804354282512016e-05, + "loss": 0.3541, + "step": 3188 + }, + { + "epoch": 6.103250478011472, + "grad_norm": 0.3515625, + "learning_rate": 4.049284848278886e-05, + "loss": 0.3638, + "step": 3192 + }, + { + "epoch": 6.1108986615678775, + "grad_norm": 0.373046875, + "learning_rate": 4.01823506393297e-05, + "loss": 0.362, + "step": 3196 + }, + { + "epoch": 6.118546845124283, + "grad_norm": 0.3828125, + "learning_rate": 3.987286361010531e-05, + "loss": 0.3367, + "step": 3200 + }, + { + "epoch": 6.126195028680688, + "grad_norm": 0.361328125, + "learning_rate": 3.9564390243782516e-05, + "loss": 0.3249, + "step": 3204 + }, + { + "epoch": 6.133843212237093, + "grad_norm": 0.396484375, + "learning_rate": 3.925693337969788e-05, + "loss": 0.3328, + "step": 3208 + }, + { + "epoch": 6.141491395793499, + "grad_norm": 0.35546875, + "learning_rate": 3.895049584783173e-05, + "loss": 0.3746, + "step": 3212 + }, + { + "epoch": 6.149139579349905, + "grad_norm": 0.34765625, + "learning_rate": 3.8645080468781676e-05, + "loss": 0.3702, + "step": 3216 + }, + { + "epoch": 6.15678776290631, + "grad_norm": 0.337890625, + "learning_rate": 3.834069005373724e-05, + "loss": 0.3136, + "step": 3220 + }, + { + "epoch": 6.164435946462715, + "grad_norm": 0.357421875, + "learning_rate": 3.8037327404453634e-05, + "loss": 0.3542, + "step": 3224 + }, + { + "epoch": 6.172084130019121, + "grad_norm": 0.369140625, + "learning_rate": 3.77349953132258e-05, + "loss": 0.3572, + "step": 3228 + }, + { + "epoch": 6.179732313575526, + "grad_norm": 0.37109375, + "learning_rate": 3.7433696562863215e-05, + "loss": 0.3526, + "step": 3232 + }, + { + "epoch": 6.187380497131931, + "grad_norm": 0.365234375, + "learning_rate": 3.7133433926663805e-05, + "loss": 0.331, + "step": 3236 + }, + { + "epoch": 6.195028680688337, + "grad_norm": 0.376953125, + "learning_rate": 3.6834210168388674e-05, + "loss": 0.3489, + "step": 3240 + }, + { + "epoch": 6.202676864244742, + "grad_norm": 0.359375, + "learning_rate": 3.653602804223656e-05, + "loss": 0.3329, + "step": 3244 + }, + { + "epoch": 6.210325047801147, + "grad_norm": 0.34765625, + "learning_rate": 3.623889029281861e-05, + "loss": 0.2646, + "step": 3248 + }, + { + "epoch": 6.217973231357552, + "grad_norm": 0.373046875, + "learning_rate": 3.5942799655132925e-05, + "loss": 0.3547, + "step": 3252 + }, + { + "epoch": 6.225621414913958, + "grad_norm": 0.369140625, + "learning_rate": 3.56477588545395e-05, + "loss": 0.3547, + "step": 3256 + }, + { + "epoch": 6.233269598470363, + "grad_norm": 0.341796875, + "learning_rate": 3.535377060673524e-05, + "loss": 0.3205, + "step": 3260 + }, + { + "epoch": 6.240917782026768, + "grad_norm": 0.361328125, + "learning_rate": 3.506083761772871e-05, + "loss": 0.3647, + "step": 3264 + }, + { + "epoch": 6.248565965583174, + "grad_norm": 0.33984375, + "learning_rate": 3.476896258381537e-05, + "loss": 0.3138, + "step": 3268 + }, + { + "epoch": 6.25621414913958, + "grad_norm": 0.3828125, + "learning_rate": 3.447814819155291e-05, + "loss": 0.3071, + "step": 3272 + }, + { + "epoch": 6.263862332695985, + "grad_norm": 0.349609375, + "learning_rate": 3.418839711773623e-05, + "loss": 0.3353, + "step": 3276 + }, + { + "epoch": 6.27151051625239, + "grad_norm": 0.384765625, + "learning_rate": 3.389971202937295e-05, + "loss": 0.3497, + "step": 3280 + }, + { + "epoch": 6.279158699808796, + "grad_norm": 0.3359375, + "learning_rate": 3.361209558365883e-05, + "loss": 0.3125, + "step": 3284 + }, + { + "epoch": 6.286806883365201, + "grad_norm": 0.365234375, + "learning_rate": 3.332555042795349e-05, + "loss": 0.3575, + "step": 3288 + }, + { + "epoch": 6.294455066921606, + "grad_norm": 0.359375, + "learning_rate": 3.304007919975563e-05, + "loss": 0.3554, + "step": 3292 + }, + { + "epoch": 6.3021032504780115, + "grad_norm": 0.34375, + "learning_rate": 3.2755684526679196e-05, + "loss": 0.3427, + "step": 3296 + }, + { + "epoch": 6.309751434034417, + "grad_norm": 0.35546875, + "learning_rate": 3.247236902642905e-05, + "loss": 0.3648, + "step": 3300 + }, + { + "epoch": 6.317399617590822, + "grad_norm": 0.375, + "learning_rate": 3.219013530677655e-05, + "loss": 0.3269, + "step": 3304 + }, + { + "epoch": 6.325047801147227, + "grad_norm": 0.349609375, + "learning_rate": 3.190898596553615e-05, + "loss": 0.3503, + "step": 3308 + }, + { + "epoch": 6.332695984703633, + "grad_norm": 0.33984375, + "learning_rate": 3.162892359054098e-05, + "loss": 0.3239, + "step": 3312 + }, + { + "epoch": 6.340344168260038, + "grad_norm": 0.376953125, + "learning_rate": 3.1349950759619255e-05, + "loss": 0.3253, + "step": 3316 + }, + { + "epoch": 6.347992351816444, + "grad_norm": 0.361328125, + "learning_rate": 3.107207004057046e-05, + "loss": 0.3642, + "step": 3320 + }, + { + "epoch": 6.355640535372849, + "grad_norm": 0.349609375, + "learning_rate": 3.079528399114189e-05, + "loss": 0.3058, + "step": 3324 + }, + { + "epoch": 6.363288718929255, + "grad_norm": 0.375, + "learning_rate": 3.0519595159004853e-05, + "loss": 0.3705, + "step": 3328 + }, + { + "epoch": 6.37093690248566, + "grad_norm": 0.345703125, + "learning_rate": 3.0245006081731367e-05, + "loss": 0.3444, + "step": 3332 + }, + { + "epoch": 6.378585086042065, + "grad_norm": 0.37109375, + "learning_rate": 2.9971519286770883e-05, + "loss": 0.3181, + "step": 3336 + }, + { + "epoch": 6.3862332695984705, + "grad_norm": 0.359375, + "learning_rate": 2.969913729142668e-05, + "loss": 0.3519, + "step": 3340 + }, + { + "epoch": 6.393881453154876, + "grad_norm": 0.34765625, + "learning_rate": 2.9427862602833165e-05, + "loss": 0.3261, + "step": 3344 + }, + { + "epoch": 6.401529636711281, + "grad_norm": 0.373046875, + "learning_rate": 2.915769771793256e-05, + "loss": 0.3554, + "step": 3348 + }, + { + "epoch": 6.409177820267686, + "grad_norm": 0.337890625, + "learning_rate": 2.8888645123451694e-05, + "loss": 0.3119, + "step": 3352 + }, + { + "epoch": 6.416826003824092, + "grad_norm": 0.357421875, + "learning_rate": 2.862070729587959e-05, + "loss": 0.3576, + "step": 3356 + }, + { + "epoch": 6.424474187380497, + "grad_norm": 0.3671875, + "learning_rate": 2.8353886701444312e-05, + "loss": 0.3464, + "step": 3360 + }, + { + "epoch": 6.432122370936902, + "grad_norm": 0.345703125, + "learning_rate": 2.808818579609037e-05, + "loss": 0.3362, + "step": 3364 + }, + { + "epoch": 6.4397705544933075, + "grad_norm": 0.359375, + "learning_rate": 2.7823607025456103e-05, + "loss": 0.3556, + "step": 3368 + }, + { + "epoch": 6.447418738049713, + "grad_norm": 0.353515625, + "learning_rate": 2.7560152824851285e-05, + "loss": 0.2955, + "step": 3372 + }, + { + "epoch": 6.455066921606119, + "grad_norm": 0.349609375, + "learning_rate": 2.7297825619234515e-05, + "loss": 0.3064, + "step": 3376 + }, + { + "epoch": 6.462715105162524, + "grad_norm": 0.373046875, + "learning_rate": 2.7036627823190994e-05, + "loss": 0.3553, + "step": 3380 + }, + { + "epoch": 6.4703632887189295, + "grad_norm": 0.34375, + "learning_rate": 2.6776561840910367e-05, + "loss": 0.3141, + "step": 3384 + }, + { + "epoch": 6.478011472275335, + "grad_norm": 0.3515625, + "learning_rate": 2.6517630066164448e-05, + "loss": 0.3746, + "step": 3388 + }, + { + "epoch": 6.48565965583174, + "grad_norm": 0.365234375, + "learning_rate": 2.6259834882285302e-05, + "loss": 0.3467, + "step": 3392 + }, + { + "epoch": 6.493307839388145, + "grad_norm": 0.359375, + "learning_rate": 2.6003178662143214e-05, + "loss": 0.3257, + "step": 3396 + }, + { + "epoch": 6.500956022944551, + "grad_norm": 0.36328125, + "learning_rate": 2.574766376812502e-05, + "loss": 0.3231, + "step": 3400 + }, + { + "epoch": 6.508604206500956, + "grad_norm": 0.333984375, + "learning_rate": 2.5493292552112128e-05, + "loss": 0.3102, + "step": 3404 + }, + { + "epoch": 6.516252390057361, + "grad_norm": 0.35546875, + "learning_rate": 2.5240067355458978e-05, + "loss": 0.33, + "step": 3408 + }, + { + "epoch": 6.5239005736137665, + "grad_norm": 0.373046875, + "learning_rate": 2.4987990508971667e-05, + "loss": 0.3529, + "step": 3412 + }, + { + "epoch": 6.531548757170172, + "grad_norm": 0.375, + "learning_rate": 2.4737064332886055e-05, + "loss": 0.3285, + "step": 3416 + }, + { + "epoch": 6.539196940726577, + "grad_norm": 0.361328125, + "learning_rate": 2.4487291136846894e-05, + "loss": 0.3835, + "step": 3420 + }, + { + "epoch": 6.546845124282983, + "grad_norm": 0.353515625, + "learning_rate": 2.4238673219886384e-05, + "loss": 0.3476, + "step": 3424 + }, + { + "epoch": 6.5544933078393885, + "grad_norm": 0.369140625, + "learning_rate": 2.399121287040275e-05, + "loss": 0.3209, + "step": 3428 + }, + { + "epoch": 6.562141491395794, + "grad_norm": 0.3671875, + "learning_rate": 2.3744912366139644e-05, + "loss": 0.3498, + "step": 3432 + }, + { + "epoch": 6.569789674952199, + "grad_norm": 0.359375, + "learning_rate": 2.3499773974164825e-05, + "loss": 0.348, + "step": 3436 + }, + { + "epoch": 6.577437858508604, + "grad_norm": 0.3671875, + "learning_rate": 2.325579995084946e-05, + "loss": 0.3783, + "step": 3440 + }, + { + "epoch": 6.58508604206501, + "grad_norm": 0.359375, + "learning_rate": 2.3012992541847254e-05, + "loss": 0.3855, + "step": 3444 + }, + { + "epoch": 6.592734225621415, + "grad_norm": 0.34765625, + "learning_rate": 2.277135398207393e-05, + "loss": 0.3163, + "step": 3448 + }, + { + "epoch": 6.60038240917782, + "grad_norm": 0.365234375, + "learning_rate": 2.2530886495686506e-05, + "loss": 0.3601, + "step": 3452 + }, + { + "epoch": 6.6080305927342256, + "grad_norm": 0.396484375, + "learning_rate": 2.229159229606281e-05, + "loss": 0.3425, + "step": 3456 + }, + { + "epoch": 6.615678776290631, + "grad_norm": 0.38671875, + "learning_rate": 2.2053473585781377e-05, + "loss": 0.324, + "step": 3460 + }, + { + "epoch": 6.623326959847036, + "grad_norm": 0.37890625, + "learning_rate": 2.181653255660072e-05, + "loss": 0.3386, + "step": 3464 + }, + { + "epoch": 6.630975143403441, + "grad_norm": 0.359375, + "learning_rate": 2.1580771389439612e-05, + "loss": 0.3434, + "step": 3468 + }, + { + "epoch": 6.638623326959847, + "grad_norm": 0.34765625, + "learning_rate": 2.1346192254356737e-05, + "loss": 0.331, + "step": 3472 + }, + { + "epoch": 6.646271510516252, + "grad_norm": 0.365234375, + "learning_rate": 2.1112797310530716e-05, + "loss": 0.3483, + "step": 3476 + }, + { + "epoch": 6.653919694072657, + "grad_norm": 0.376953125, + "learning_rate": 2.08805887062405e-05, + "loss": 0.3241, + "step": 3480 + }, + { + "epoch": 6.661567877629063, + "grad_norm": 0.365234375, + "learning_rate": 2.0649568578845205e-05, + "loss": 0.3269, + "step": 3484 + }, + { + "epoch": 6.669216061185469, + "grad_norm": 0.359375, + "learning_rate": 2.0419739054764743e-05, + "loss": 0.3636, + "step": 3488 + }, + { + "epoch": 6.676864244741874, + "grad_norm": 0.34375, + "learning_rate": 2.019110224946008e-05, + "loss": 0.3549, + "step": 3492 + }, + { + "epoch": 6.684512428298279, + "grad_norm": 0.3515625, + "learning_rate": 1.9963660267413913e-05, + "loss": 0.3497, + "step": 3496 + }, + { + "epoch": 6.692160611854685, + "grad_norm": 0.36328125, + "learning_rate": 1.9737415202111144e-05, + "loss": 0.3639, + "step": 3500 + }, + { + "epoch": 6.69980879541109, + "grad_norm": 0.353515625, + "learning_rate": 1.9512369136019663e-05, + "loss": 0.3487, + "step": 3504 + }, + { + "epoch": 6.707456978967495, + "grad_norm": 0.333984375, + "learning_rate": 1.9288524140571286e-05, + "loss": 0.3196, + "step": 3508 + }, + { + "epoch": 6.7151051625239, + "grad_norm": 0.361328125, + "learning_rate": 1.906588227614254e-05, + "loss": 0.3398, + "step": 3512 + }, + { + "epoch": 6.722753346080306, + "grad_norm": 0.341796875, + "learning_rate": 1.8844445592035767e-05, + "loss": 0.37, + "step": 3516 + }, + { + "epoch": 6.730401529636711, + "grad_norm": 0.37109375, + "learning_rate": 1.8624216126460183e-05, + "loss": 0.3466, + "step": 3520 + }, + { + "epoch": 6.738049713193116, + "grad_norm": 0.353515625, + "learning_rate": 1.8405195906513347e-05, + "loss": 0.3638, + "step": 3524 + }, + { + "epoch": 6.7456978967495225, + "grad_norm": 0.369140625, + "learning_rate": 1.8187386948162203e-05, + "loss": 0.3894, + "step": 3528 + }, + { + "epoch": 6.753346080305928, + "grad_norm": 0.341796875, + "learning_rate": 1.797079125622469e-05, + "loss": 0.3109, + "step": 3532 + }, + { + "epoch": 6.760994263862333, + "grad_norm": 0.345703125, + "learning_rate": 1.7755410824351363e-05, + "loss": 0.3191, + "step": 3536 + }, + { + "epoch": 6.768642447418738, + "grad_norm": 0.37890625, + "learning_rate": 1.7541247635006756e-05, + "loss": 0.3548, + "step": 3540 + }, + { + "epoch": 6.776290630975144, + "grad_norm": 0.369140625, + "learning_rate": 1.7328303659451477e-05, + "loss": 0.3518, + "step": 3544 + }, + { + "epoch": 6.783938814531549, + "grad_norm": 0.359375, + "learning_rate": 1.7116580857723872e-05, + "loss": 0.367, + "step": 3548 + }, + { + "epoch": 6.791586998087954, + "grad_norm": 0.3515625, + "learning_rate": 1.6906081178621917e-05, + "loss": 0.3502, + "step": 3552 + }, + { + "epoch": 6.7992351816443595, + "grad_norm": 0.376953125, + "learning_rate": 1.6696806559685553e-05, + "loss": 0.4081, + "step": 3556 + }, + { + "epoch": 6.806883365200765, + "grad_norm": 0.38671875, + "learning_rate": 1.648875892717857e-05, + "loss": 0.3557, + "step": 3560 + }, + { + "epoch": 6.81453154875717, + "grad_norm": 0.337890625, + "learning_rate": 1.628194019607099e-05, + "loss": 0.3765, + "step": 3564 + }, + { + "epoch": 6.822179732313575, + "grad_norm": 0.3515625, + "learning_rate": 1.6076352270021435e-05, + "loss": 0.3261, + "step": 3568 + }, + { + "epoch": 6.829827915869981, + "grad_norm": 0.40625, + "learning_rate": 1.587199704135973e-05, + "loss": 0.3734, + "step": 3572 + }, + { + "epoch": 6.837476099426386, + "grad_norm": 0.38671875, + "learning_rate": 1.5668876391069107e-05, + "loss": 0.3523, + "step": 3576 + }, + { + "epoch": 6.845124282982791, + "grad_norm": 0.3671875, + "learning_rate": 1.5466992188769394e-05, + "loss": 0.3465, + "step": 3580 + }, + { + "epoch": 6.8527724665391965, + "grad_norm": 0.357421875, + "learning_rate": 1.5266346292699522e-05, + "loss": 0.3506, + "step": 3584 + }, + { + "epoch": 6.860420650095603, + "grad_norm": 0.359375, + "learning_rate": 1.5066940549700285e-05, + "loss": 0.3112, + "step": 3588 + }, + { + "epoch": 6.868068833652008, + "grad_norm": 0.40234375, + "learning_rate": 1.4868776795197712e-05, + "loss": 0.3271, + "step": 3592 + }, + { + "epoch": 6.875717017208413, + "grad_norm": 0.35546875, + "learning_rate": 1.4671856853185876e-05, + "loss": 0.371, + "step": 3596 + }, + { + "epoch": 6.8833652007648185, + "grad_norm": 0.384765625, + "learning_rate": 1.4476182536210207e-05, + "loss": 0.3448, + "step": 3600 + }, + { + "epoch": 6.891013384321224, + "grad_norm": 0.353515625, + "learning_rate": 1.4281755645350873e-05, + "loss": 0.3605, + "step": 3604 + }, + { + "epoch": 6.898661567877629, + "grad_norm": 0.35546875, + "learning_rate": 1.4088577970206044e-05, + "loss": 0.3385, + "step": 3608 + }, + { + "epoch": 6.906309751434034, + "grad_norm": 0.35546875, + "learning_rate": 1.3896651288875572e-05, + "loss": 0.3285, + "step": 3612 + }, + { + "epoch": 6.91395793499044, + "grad_norm": 0.34765625, + "learning_rate": 1.3705977367944498e-05, + "loss": 0.3316, + "step": 3616 + }, + { + "epoch": 6.921606118546845, + "grad_norm": 0.35546875, + "learning_rate": 1.3516557962466978e-05, + "loss": 0.3636, + "step": 3620 + }, + { + "epoch": 6.92925430210325, + "grad_norm": 0.380859375, + "learning_rate": 1.3328394815949884e-05, + "loss": 0.3649, + "step": 3624 + }, + { + "epoch": 6.9369024856596555, + "grad_norm": 0.35546875, + "learning_rate": 1.3141489660336902e-05, + "loss": 0.3521, + "step": 3628 + }, + { + "epoch": 6.944550669216062, + "grad_norm": 0.34765625, + "learning_rate": 1.295584421599265e-05, + "loss": 0.3177, + "step": 3632 + }, + { + "epoch": 6.952198852772467, + "grad_norm": 0.341796875, + "learning_rate": 1.2771460191686656e-05, + "loss": 0.3148, + "step": 3636 + }, + { + "epoch": 6.959847036328872, + "grad_norm": 0.3671875, + "learning_rate": 1.2588339284577815e-05, + "loss": 0.3324, + "step": 3640 + }, + { + "epoch": 6.9674952198852775, + "grad_norm": 0.37109375, + "learning_rate": 1.240648318019859e-05, + "loss": 0.3729, + "step": 3644 + }, + { + "epoch": 6.975143403441683, + "grad_norm": 0.3671875, + "learning_rate": 1.2225893552439742e-05, + "loss": 0.3616, + "step": 3648 + }, + { + "epoch": 6.982791586998088, + "grad_norm": 0.345703125, + "learning_rate": 1.2046572063534587e-05, + "loss": 0.3261, + "step": 3652 + }, + { + "epoch": 6.990439770554493, + "grad_norm": 0.384765625, + "learning_rate": 1.1868520364044049e-05, + "loss": 0.3542, + "step": 3656 + }, + { + "epoch": 6.998087954110899, + "grad_norm": 0.33203125, + "learning_rate": 1.1691740092841228e-05, + "loss": 0.3238, + "step": 3660 + }, + { + "epoch": 7.005736137667304, + "grad_norm": 0.365234375, + "learning_rate": 1.151623287709636e-05, + "loss": 0.4048, + "step": 3664 + }, + { + "epoch": 7.013384321223709, + "grad_norm": 0.353515625, + "learning_rate": 1.1342000332261963e-05, + "loss": 0.3609, + "step": 3668 + }, + { + "epoch": 7.0210325047801145, + "grad_norm": 0.326171875, + "learning_rate": 1.1169044062057797e-05, + "loss": 0.3145, + "step": 3672 + }, + { + "epoch": 7.02868068833652, + "grad_norm": 0.35546875, + "learning_rate": 1.0997365658456164e-05, + "loss": 0.3618, + "step": 3676 + }, + { + "epoch": 7.036328871892925, + "grad_norm": 0.341796875, + "learning_rate": 1.082696670166736e-05, + "loss": 0.3298, + "step": 3680 + }, + { + "epoch": 7.04397705544933, + "grad_norm": 0.353515625, + "learning_rate": 1.0657848760124954e-05, + "loss": 0.3249, + "step": 3684 + }, + { + "epoch": 7.051625239005737, + "grad_norm": 0.341796875, + "learning_rate": 1.0490013390471474e-05, + "loss": 0.294, + "step": 3688 + }, + { + "epoch": 7.059273422562142, + "grad_norm": 0.341796875, + "learning_rate": 1.0323462137543998e-05, + "loss": 0.3264, + "step": 3692 + }, + { + "epoch": 7.066921606118547, + "grad_norm": 0.3359375, + "learning_rate": 1.015819653436012e-05, + "loss": 0.3164, + "step": 3696 + }, + { + "epoch": 7.074569789674952, + "grad_norm": 0.34375, + "learning_rate": 9.994218102103468e-06, + "loss": 0.3317, + "step": 3700 + }, + { + "epoch": 7.082217973231358, + "grad_norm": 0.37109375, + "learning_rate": 9.831528350110119e-06, + "loss": 0.3168, + "step": 3704 + }, + { + "epoch": 7.089866156787763, + "grad_norm": 0.39453125, + "learning_rate": 9.670128775854513e-06, + "loss": 0.3901, + "step": 3708 + }, + { + "epoch": 7.097514340344168, + "grad_norm": 0.3359375, + "learning_rate": 9.51002086493553e-06, + "loss": 0.3297, + "step": 3712 + }, + { + "epoch": 7.105162523900574, + "grad_norm": 0.353515625, + "learning_rate": 9.351206091063107e-06, + "loss": 0.3484, + "step": 3716 + }, + { + "epoch": 7.112810707456979, + "grad_norm": 0.34375, + "learning_rate": 9.193685916044469e-06, + "loss": 0.3255, + "step": 3720 + }, + { + "epoch": 7.120458891013384, + "grad_norm": 0.361328125, + "learning_rate": 9.03746178977074e-06, + "loss": 0.3837, + "step": 3724 + }, + { + "epoch": 7.128107074569789, + "grad_norm": 0.3515625, + "learning_rate": 8.882535150203567e-06, + "loss": 0.3689, + "step": 3728 + }, + { + "epoch": 7.135755258126195, + "grad_norm": 0.330078125, + "learning_rate": 8.728907423361991e-06, + "loss": 0.3209, + "step": 3732 + }, + { + "epoch": 7.1434034416826, + "grad_norm": 0.326171875, + "learning_rate": 8.576580023309126e-06, + "loss": 0.3297, + "step": 3736 + }, + { + "epoch": 7.151051625239006, + "grad_norm": 0.35546875, + "learning_rate": 8.425554352139313e-06, + "loss": 0.3432, + "step": 3740 + }, + { + "epoch": 7.1586998087954115, + "grad_norm": 0.33203125, + "learning_rate": 8.275831799965194e-06, + "loss": 0.3514, + "step": 3744 + }, + { + "epoch": 7.166347992351817, + "grad_norm": 0.349609375, + "learning_rate": 8.127413744904804e-06, + "loss": 0.3326, + "step": 3748 + }, + { + "epoch": 7.173996175908222, + "grad_norm": 0.373046875, + "learning_rate": 7.980301553068985e-06, + "loss": 0.3558, + "step": 3752 + }, + { + "epoch": 7.181644359464627, + "grad_norm": 0.353515625, + "learning_rate": 7.83449657854886e-06, + "loss": 0.3504, + "step": 3756 + }, + { + "epoch": 7.189292543021033, + "grad_norm": 0.34375, + "learning_rate": 7.690000163403177e-06, + "loss": 0.3136, + "step": 3760 + }, + { + "epoch": 7.196940726577438, + "grad_norm": 0.33984375, + "learning_rate": 7.546813637646182e-06, + "loss": 0.3292, + "step": 3764 + }, + { + "epoch": 7.204588910133843, + "grad_norm": 0.365234375, + "learning_rate": 7.404938319235171e-06, + "loss": 0.327, + "step": 3768 + }, + { + "epoch": 7.2122370936902485, + "grad_norm": 0.36328125, + "learning_rate": 7.264375514058607e-06, + "loss": 0.3526, + "step": 3772 + }, + { + "epoch": 7.219885277246654, + "grad_norm": 0.333984375, + "learning_rate": 7.125126515923752e-06, + "loss": 0.3506, + "step": 3776 + }, + { + "epoch": 7.227533460803059, + "grad_norm": 0.349609375, + "learning_rate": 6.987192606545156e-06, + "loss": 0.3355, + "step": 3780 + }, + { + "epoch": 7.235181644359464, + "grad_norm": 0.353515625, + "learning_rate": 6.850575055532553e-06, + "loss": 0.3484, + "step": 3784 + }, + { + "epoch": 7.24282982791587, + "grad_norm": 0.36328125, + "learning_rate": 6.715275120379271e-06, + "loss": 0.366, + "step": 3788 + }, + { + "epoch": 7.250478011472275, + "grad_norm": 0.33984375, + "learning_rate": 6.581294046450753e-06, + "loss": 0.322, + "step": 3792 + }, + { + "epoch": 7.258126195028681, + "grad_norm": 0.322265625, + "learning_rate": 6.448633066972953e-06, + "loss": 0.2989, + "step": 3796 + }, + { + "epoch": 7.265774378585086, + "grad_norm": 0.353515625, + "learning_rate": 6.317293403021029e-06, + "loss": 0.3377, + "step": 3800 + }, + { + "epoch": 7.273422562141492, + "grad_norm": 0.326171875, + "learning_rate": 6.187276263508167e-06, + "loss": 0.3001, + "step": 3804 + }, + { + "epoch": 7.281070745697897, + "grad_norm": 0.337890625, + "learning_rate": 6.0585828451743925e-06, + "loss": 0.3335, + "step": 3808 + }, + { + "epoch": 7.288718929254302, + "grad_norm": 0.34375, + "learning_rate": 5.93121433257554e-06, + "loss": 0.3458, + "step": 3812 + }, + { + "epoch": 7.2963671128107075, + "grad_norm": 0.3671875, + "learning_rate": 5.805171898072369e-06, + "loss": 0.2941, + "step": 3816 + }, + { + "epoch": 7.304015296367113, + "grad_norm": 0.359375, + "learning_rate": 5.680456701819885e-06, + "loss": 0.3888, + "step": 3820 + }, + { + "epoch": 7.311663479923518, + "grad_norm": 0.36328125, + "learning_rate": 5.5570698917563994e-06, + "loss": 0.3663, + "step": 3824 + }, + { + "epoch": 7.319311663479923, + "grad_norm": 0.3515625, + "learning_rate": 5.435012603593219e-06, + "loss": 0.292, + "step": 3828 + }, + { + "epoch": 7.326959847036329, + "grad_norm": 0.373046875, + "learning_rate": 5.3142859608041265e-06, + "loss": 0.3628, + "step": 3832 + }, + { + "epoch": 7.334608030592734, + "grad_norm": 0.345703125, + "learning_rate": 5.194891074614899e-06, + "loss": 0.3377, + "step": 3836 + }, + { + "epoch": 7.342256214149139, + "grad_norm": 0.353515625, + "learning_rate": 5.076829043993253e-06, + "loss": 0.3322, + "step": 3840 + }, + { + "epoch": 7.349904397705545, + "grad_norm": 0.37890625, + "learning_rate": 4.960100955638685e-06, + "loss": 0.3607, + "step": 3844 + }, + { + "epoch": 7.357552581261951, + "grad_norm": 0.345703125, + "learning_rate": 4.844707883972398e-06, + "loss": 0.3303, + "step": 3848 + }, + { + "epoch": 7.365200764818356, + "grad_norm": 0.337890625, + "learning_rate": 4.730650891127457e-06, + "loss": 0.3327, + "step": 3852 + }, + { + "epoch": 7.372848948374761, + "grad_norm": 0.357421875, + "learning_rate": 4.617931026939115e-06, + "loss": 0.3766, + "step": 3856 + }, + { + "epoch": 7.3804971319311665, + "grad_norm": 0.33203125, + "learning_rate": 4.506549328934916e-06, + "loss": 0.2884, + "step": 3860 + }, + { + "epoch": 7.388145315487572, + "grad_norm": 0.353515625, + "learning_rate": 4.39650682232538e-06, + "loss": 0.3036, + "step": 3864 + }, + { + "epoch": 7.395793499043977, + "grad_norm": 0.37890625, + "learning_rate": 4.287804519994431e-06, + "loss": 0.3398, + "step": 3868 + }, + { + "epoch": 7.403441682600382, + "grad_norm": 0.353515625, + "learning_rate": 4.180443422490115e-06, + "loss": 0.3276, + "step": 3872 + }, + { + "epoch": 7.411089866156788, + "grad_norm": 0.35546875, + "learning_rate": 4.074424518015384e-06, + "loss": 0.3319, + "step": 3876 + }, + { + "epoch": 7.418738049713193, + "grad_norm": 0.390625, + "learning_rate": 3.969748782418991e-06, + "loss": 0.3925, + "step": 3880 + }, + { + "epoch": 7.426386233269598, + "grad_norm": 0.365234375, + "learning_rate": 3.8664171791865765e-06, + "loss": 0.3556, + "step": 3884 + }, + { + "epoch": 7.4340344168260035, + "grad_norm": 0.384765625, + "learning_rate": 3.764430659431661e-06, + "loss": 0.324, + "step": 3888 + }, + { + "epoch": 7.441682600382409, + "grad_norm": 0.37109375, + "learning_rate": 3.6637901618870203e-06, + "loss": 0.3443, + "step": 3892 + }, + { + "epoch": 7.449330783938814, + "grad_norm": 0.3828125, + "learning_rate": 3.564496612896006e-06, + "loss": 0.3409, + "step": 3896 + }, + { + "epoch": 7.45697896749522, + "grad_norm": 0.322265625, + "learning_rate": 3.4665509264039717e-06, + "loss": 0.3106, + "step": 3900 + }, + { + "epoch": 7.4646271510516256, + "grad_norm": 0.35546875, + "learning_rate": 3.3699540039499263e-06, + "loss": 0.3442, + "step": 3904 + }, + { + "epoch": 7.472275334608031, + "grad_norm": 0.35546875, + "learning_rate": 3.274706734658228e-06, + "loss": 0.3649, + "step": 3908 + }, + { + "epoch": 7.479923518164436, + "grad_norm": 0.330078125, + "learning_rate": 3.1808099952303045e-06, + "loss": 0.3215, + "step": 3912 + }, + { + "epoch": 7.487571701720841, + "grad_norm": 0.37109375, + "learning_rate": 3.0882646499367614e-06, + "loss": 0.2973, + "step": 3916 + }, + { + "epoch": 7.495219885277247, + "grad_norm": 0.375, + "learning_rate": 2.9970715506092534e-06, + "loss": 0.3862, + "step": 3920 + }, + { + "epoch": 7.502868068833652, + "grad_norm": 0.3515625, + "learning_rate": 2.9072315366327424e-06, + "loss": 0.3246, + "step": 3924 + }, + { + "epoch": 7.510516252390057, + "grad_norm": 0.36328125, + "learning_rate": 2.8187454349377193e-06, + "loss": 0.3823, + "step": 3928 + }, + { + "epoch": 7.5181644359464626, + "grad_norm": 0.369140625, + "learning_rate": 2.731614059992676e-06, + "loss": 0.3697, + "step": 3932 + }, + { + "epoch": 7.525812619502868, + "grad_norm": 0.365234375, + "learning_rate": 2.6458382137964805e-06, + "loss": 0.3312, + "step": 3936 + }, + { + "epoch": 7.533460803059273, + "grad_norm": 0.353515625, + "learning_rate": 2.5614186858711137e-06, + "loss": 0.3188, + "step": 3940 + }, + { + "epoch": 7.541108986615678, + "grad_norm": 0.345703125, + "learning_rate": 2.47835625325436e-06, + "loss": 0.322, + "step": 3944 + }, + { + "epoch": 7.548757170172085, + "grad_norm": 0.369140625, + "learning_rate": 2.3966516804925784e-06, + "loss": 0.3401, + "step": 3948 + }, + { + "epoch": 7.55640535372849, + "grad_norm": 0.353515625, + "learning_rate": 2.3163057196338096e-06, + "loss": 0.3388, + "step": 3952 + }, + { + "epoch": 7.564053537284895, + "grad_norm": 0.34375, + "learning_rate": 2.2373191102207646e-06, + "loss": 0.3375, + "step": 3956 + }, + { + "epoch": 7.5717017208413, + "grad_norm": 0.322265625, + "learning_rate": 2.1596925792839946e-06, + "loss": 0.3125, + "step": 3960 + }, + { + "epoch": 7.579349904397706, + "grad_norm": 0.37890625, + "learning_rate": 2.083426841335284e-06, + "loss": 0.3436, + "step": 3964 + }, + { + "epoch": 7.586998087954111, + "grad_norm": 0.369140625, + "learning_rate": 2.0085225983610177e-06, + "loss": 0.3559, + "step": 3968 + }, + { + "epoch": 7.594646271510516, + "grad_norm": 0.341796875, + "learning_rate": 1.9349805398156893e-06, + "loss": 0.3167, + "step": 3972 + }, + { + "epoch": 7.602294455066922, + "grad_norm": 0.35546875, + "learning_rate": 1.8628013426156386e-06, + "loss": 0.3654, + "step": 3976 + }, + { + "epoch": 7.609942638623327, + "grad_norm": 0.359375, + "learning_rate": 1.7919856711327563e-06, + "loss": 0.3534, + "step": 3980 + }, + { + "epoch": 7.617590822179732, + "grad_norm": 0.36328125, + "learning_rate": 1.722534177188406e-06, + "loss": 0.3139, + "step": 3984 + }, + { + "epoch": 7.625239005736137, + "grad_norm": 0.37109375, + "learning_rate": 1.6544475000473957e-06, + "loss": 0.3578, + "step": 3988 + }, + { + "epoch": 7.632887189292543, + "grad_norm": 0.357421875, + "learning_rate": 1.5877262664120983e-06, + "loss": 0.3272, + "step": 3992 + }, + { + "epoch": 7.640535372848948, + "grad_norm": 0.373046875, + "learning_rate": 1.522371090416724e-06, + "loss": 0.3578, + "step": 3996 + }, + { + "epoch": 7.648183556405353, + "grad_norm": 0.34375, + "learning_rate": 1.4583825736215749e-06, + "loss": 0.3131, + "step": 4000 + } + ], + "logging_steps": 4, + "max_steps": 4184, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6138957704404664e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}