diff --git "a/checkpoint-3500/trainer_state.json" "b/checkpoint-3500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3500/trainer_state.json" @@ -0,0 +1,6158 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.692160611854685, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0076481835564053535, + "grad_norm": 0.291015625, + "learning_rate": 3.243243243243243e-05, + "loss": 1.3011, + "step": 4 + }, + { + "epoch": 0.015296367112810707, + "grad_norm": 0.26171875, + "learning_rate": 6.486486486486486e-05, + "loss": 1.3104, + "step": 8 + }, + { + "epoch": 0.022944550669216062, + "grad_norm": 0.2060546875, + "learning_rate": 9.72972972972973e-05, + "loss": 1.2309, + "step": 12 + }, + { + "epoch": 0.030592734225621414, + "grad_norm": 0.2392578125, + "learning_rate": 0.00012972972972972972, + "loss": 1.2051, + "step": 16 + }, + { + "epoch": 0.03824091778202677, + "grad_norm": 0.169921875, + "learning_rate": 0.00016216216216216215, + "loss": 1.1622, + "step": 20 + }, + { + "epoch": 0.045889101338432124, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001945945945945946, + "loss": 1.1749, + "step": 24 + }, + { + "epoch": 0.05353728489483748, + "grad_norm": 0.154296875, + "learning_rate": 0.00022702702702702703, + "loss": 1.149, + "step": 28 + }, + { + "epoch": 0.06118546845124283, + "grad_norm": 0.1767578125, + "learning_rate": 0.00025945945945945944, + "loss": 1.1455, + "step": 32 + }, + { + "epoch": 0.06883365200764818, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002918918918918919, + "loss": 1.1358, + "step": 36 + }, + { + "epoch": 0.07648183556405354, + "grad_norm": 0.181640625, + "learning_rate": 0.00029999949274434724, + "loss": 1.1201, + "step": 40 + }, + { + "epoch": 0.0841300191204589, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002999972382816974, + "loss": 1.0549, + "step": 44 + }, + { + "epoch": 0.09177820267686425, + "grad_norm": 0.19921875, + "learning_rate": 0.0002999931802773903, + "loss": 1.0946, + "step": 48 + }, + { + "epoch": 0.0994263862332696, + "grad_norm": 0.1826171875, + "learning_rate": 0.00029998731878021884, + "loss": 1.0841, + "step": 52 + }, + { + "epoch": 0.10707456978967496, + "grad_norm": 0.2001953125, + "learning_rate": 0.00029997965386066057, + "loss": 1.0904, + "step": 56 + }, + { + "epoch": 0.1147227533460803, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002999701856108772, + "loss": 1.0673, + "step": 60 + }, + { + "epoch": 0.12237093690248566, + "grad_norm": 0.1826171875, + "learning_rate": 0.00029995891414471334, + "loss": 1.0211, + "step": 64 + }, + { + "epoch": 0.13001912045889102, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002999458395976953, + "loss": 1.0497, + "step": 68 + }, + { + "epoch": 0.13766730401529637, + "grad_norm": 0.208984375, + "learning_rate": 0.0002999309621270293, + "loss": 1.0711, + "step": 72 + }, + { + "epoch": 0.14531548757170173, + "grad_norm": 0.1923828125, + "learning_rate": 0.00029991428191159935, + "loss": 1.047, + "step": 76 + }, + { + "epoch": 0.15296367112810708, + "grad_norm": 0.1962890625, + "learning_rate": 0.00029989579915196574, + "loss": 1.0442, + "step": 80 + }, + { + "epoch": 0.16061185468451242, + "grad_norm": 0.197265625, + "learning_rate": 0.000299875514070362, + "loss": 1.0197, + "step": 84 + }, + { + "epoch": 0.1682600382409178, + "grad_norm": 0.2080078125, + "learning_rate": 0.00029985342691069255, + "loss": 1.0185, + "step": 88 + }, + { + "epoch": 0.17590822179732313, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002998295379385297, + "loss": 1.0168, + "step": 92 + }, + { + "epoch": 0.1835564053537285, + "grad_norm": 0.2099609375, + "learning_rate": 0.00029980384744111047, + "loss": 1.0211, + "step": 96 + }, + { + "epoch": 0.19120458891013384, + "grad_norm": 0.208984375, + "learning_rate": 0.0002997763557273331, + "loss": 1.0178, + "step": 100 + }, + { + "epoch": 0.1988527724665392, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002997470631277533, + "loss": 0.9871, + "step": 104 + }, + { + "epoch": 0.20650095602294455, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002997159699945804, + "loss": 1.0197, + "step": 108 + }, + { + "epoch": 0.21414913957934992, + "grad_norm": 0.240234375, + "learning_rate": 0.0002996830767016731, + "loss": 1.0, + "step": 112 + }, + { + "epoch": 0.22179732313575526, + "grad_norm": 0.236328125, + "learning_rate": 0.0002996483836445347, + "loss": 0.9885, + "step": 116 + }, + { + "epoch": 0.2294455066921606, + "grad_norm": 0.2099609375, + "learning_rate": 0.00029961189124030885, + "loss": 0.9664, + "step": 120 + }, + { + "epoch": 0.23709369024856597, + "grad_norm": 0.236328125, + "learning_rate": 0.00029957359992777404, + "loss": 0.9831, + "step": 124 + }, + { + "epoch": 0.2447418738049713, + "grad_norm": 0.25390625, + "learning_rate": 0.00029953351016733854, + "loss": 0.9718, + "step": 128 + }, + { + "epoch": 0.25239005736137665, + "grad_norm": 0.205078125, + "learning_rate": 0.000299491622441035, + "loss": 0.9592, + "step": 132 + }, + { + "epoch": 0.26003824091778205, + "grad_norm": 0.2109375, + "learning_rate": 0.00029944793725251436, + "loss": 0.9514, + "step": 136 + }, + { + "epoch": 0.2676864244741874, + "grad_norm": 0.220703125, + "learning_rate": 0.00029940245512704005, + "loss": 0.9435, + "step": 140 + }, + { + "epoch": 0.27533460803059273, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002993551766114815, + "loss": 0.9545, + "step": 144 + }, + { + "epoch": 0.2829827915869981, + "grad_norm": 0.25, + "learning_rate": 0.00029930610227430767, + "loss": 0.9378, + "step": 148 + }, + { + "epoch": 0.29063097514340347, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002992552327055802, + "loss": 0.9085, + "step": 152 + }, + { + "epoch": 0.2982791586998088, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002992025685169461, + "loss": 0.9482, + "step": 156 + }, + { + "epoch": 0.30592734225621415, + "grad_norm": 0.216796875, + "learning_rate": 0.00029914811034163096, + "loss": 0.949, + "step": 160 + }, + { + "epoch": 0.3135755258126195, + "grad_norm": 0.2421875, + "learning_rate": 0.00029909185883443063, + "loss": 0.9796, + "step": 164 + }, + { + "epoch": 0.32122370936902483, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002990338146717039, + "loss": 0.9351, + "step": 168 + }, + { + "epoch": 0.32887189292543023, + "grad_norm": 0.220703125, + "learning_rate": 0.0002989739785513639, + "loss": 0.9166, + "step": 172 + }, + { + "epoch": 0.3365200764818356, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002989123511928703, + "loss": 0.9204, + "step": 176 + }, + { + "epoch": 0.3441682600382409, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029884893333722, + "loss": 0.9356, + "step": 180 + }, + { + "epoch": 0.35181644359464626, + "grad_norm": 0.2265625, + "learning_rate": 0.0002987837257469387, + "loss": 0.9342, + "step": 184 + }, + { + "epoch": 0.35946462715105165, + "grad_norm": 0.2265625, + "learning_rate": 0.00029871672920607153, + "loss": 0.9026, + "step": 188 + }, + { + "epoch": 0.367112810707457, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002986479445201737, + "loss": 0.8983, + "step": 192 + }, + { + "epoch": 0.37476099426386233, + "grad_norm": 0.259765625, + "learning_rate": 0.0002985773725163008, + "loss": 0.922, + "step": 196 + }, + { + "epoch": 0.3824091778202677, + "grad_norm": 0.251953125, + "learning_rate": 0.0002985050140429986, + "loss": 0.9099, + "step": 200 + }, + { + "epoch": 0.390057361376673, + "grad_norm": 0.25, + "learning_rate": 0.0002984308699702935, + "loss": 0.8825, + "step": 204 + }, + { + "epoch": 0.3977055449330784, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002983549411896812, + "loss": 0.893, + "step": 208 + }, + { + "epoch": 0.40535372848948376, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002982772286141167, + "loss": 0.9068, + "step": 212 + }, + { + "epoch": 0.4130019120458891, + "grad_norm": 0.2421875, + "learning_rate": 0.000298197733178003, + "loss": 0.8536, + "step": 216 + }, + { + "epoch": 0.42065009560229444, + "grad_norm": 0.2041015625, + "learning_rate": 0.00029811645583717987, + "loss": 0.8843, + "step": 220 + }, + { + "epoch": 0.42829827915869984, + "grad_norm": 0.263671875, + "learning_rate": 0.00029803339756891254, + "loss": 0.8627, + "step": 224 + }, + { + "epoch": 0.4359464627151052, + "grad_norm": 0.263671875, + "learning_rate": 0.00029794855937187963, + "loss": 0.8572, + "step": 228 + }, + { + "epoch": 0.4435946462715105, + "grad_norm": 0.2265625, + "learning_rate": 0.0002978619422661613, + "loss": 0.8255, + "step": 232 + }, + { + "epoch": 0.45124282982791586, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002977735472932273, + "loss": 0.8274, + "step": 236 + }, + { + "epoch": 0.4588910133843212, + "grad_norm": 0.2490234375, + "learning_rate": 0.00029768337551592394, + "loss": 0.8569, + "step": 240 + }, + { + "epoch": 0.4665391969407266, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029759142801846143, + "loss": 0.8791, + "step": 244 + }, + { + "epoch": 0.47418738049713194, + "grad_norm": 0.248046875, + "learning_rate": 0.00029749770590640123, + "loss": 0.8215, + "step": 248 + }, + { + "epoch": 0.4818355640535373, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029740221030664216, + "loss": 0.8546, + "step": 252 + }, + { + "epoch": 0.4894837476099426, + "grad_norm": 0.2421875, + "learning_rate": 0.00029730494236740744, + "loss": 0.8518, + "step": 256 + }, + { + "epoch": 0.497131931166348, + "grad_norm": 0.251953125, + "learning_rate": 0.0002972059032582304, + "loss": 0.8115, + "step": 260 + }, + { + "epoch": 0.5047801147227533, + "grad_norm": 0.251953125, + "learning_rate": 0.0002971050941699407, + "loss": 0.8818, + "step": 264 + }, + { + "epoch": 0.5124282982791587, + "grad_norm": 0.2314453125, + "learning_rate": 0.00029700251631464993, + "loss": 0.8834, + "step": 268 + }, + { + "epoch": 0.5200764818355641, + "grad_norm": 0.23828125, + "learning_rate": 0.000296898170925737, + "loss": 0.8502, + "step": 272 + }, + { + "epoch": 0.5277246653919694, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002967920592578335, + "loss": 0.854, + "step": 276 + }, + { + "epoch": 0.5353728489483748, + "grad_norm": 0.240234375, + "learning_rate": 0.0002966841825868082, + "loss": 0.8164, + "step": 280 + }, + { + "epoch": 0.5430210325047801, + "grad_norm": 0.228515625, + "learning_rate": 0.00029657454220975216, + "loss": 0.9058, + "step": 284 + }, + { + "epoch": 0.5506692160611855, + "grad_norm": 0.2294921875, + "learning_rate": 0.00029646313944496297, + "loss": 0.8646, + "step": 288 + }, + { + "epoch": 0.5583173996175909, + "grad_norm": 0.2158203125, + "learning_rate": 0.00029634997563192866, + "loss": 0.8536, + "step": 292 + }, + { + "epoch": 0.5659655831739961, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002962350521313122, + "loss": 0.8532, + "step": 296 + }, + { + "epoch": 0.5736137667304015, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002961183703249342, + "loss": 0.8228, + "step": 300 + }, + { + "epoch": 0.5812619502868069, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002959999316157573, + "loss": 0.8088, + "step": 304 + }, + { + "epoch": 0.5889101338432122, + "grad_norm": 0.24609375, + "learning_rate": 0.00029587973742786875, + "loss": 0.8291, + "step": 308 + }, + { + "epoch": 0.5965583173996176, + "grad_norm": 0.271484375, + "learning_rate": 0.0002957577892064632, + "loss": 0.8087, + "step": 312 + }, + { + "epoch": 0.6042065009560229, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029563408841782576, + "loss": 0.853, + "step": 316 + }, + { + "epoch": 0.6118546845124283, + "grad_norm": 0.251953125, + "learning_rate": 0.00029550863654931385, + "loss": 0.847, + "step": 320 + }, + { + "epoch": 0.6195028680688337, + "grad_norm": 0.25390625, + "learning_rate": 0.0002953814351093398, + "loss": 0.8087, + "step": 324 + }, + { + "epoch": 0.627151051625239, + "grad_norm": 0.2421875, + "learning_rate": 0.0002952524856273524, + "loss": 0.8514, + "step": 328 + }, + { + "epoch": 0.6347992351816444, + "grad_norm": 0.212890625, + "learning_rate": 0.00029512178965381854, + "loss": 0.8501, + "step": 332 + }, + { + "epoch": 0.6424474187380497, + "grad_norm": 0.2353515625, + "learning_rate": 0.00029498934876020475, + "loss": 0.8029, + "step": 336 + }, + { + "epoch": 0.6500956022944551, + "grad_norm": 0.232421875, + "learning_rate": 0.00029485516453895826, + "loss": 0.8293, + "step": 340 + }, + { + "epoch": 0.6577437858508605, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002947192386034874, + "loss": 0.8695, + "step": 344 + }, + { + "epoch": 0.6653919694072657, + "grad_norm": 0.236328125, + "learning_rate": 0.00029458157258814316, + "loss": 0.8249, + "step": 348 + }, + { + "epoch": 0.6730401529636711, + "grad_norm": 0.234375, + "learning_rate": 0.00029444216814819834, + "loss": 0.8009, + "step": 352 + }, + { + "epoch": 0.6806883365200764, + "grad_norm": 0.2490234375, + "learning_rate": 0.00029430102695982875, + "loss": 0.8642, + "step": 356 + }, + { + "epoch": 0.6883365200764818, + "grad_norm": 0.228515625, + "learning_rate": 0.00029415815072009237, + "loss": 0.8562, + "step": 360 + }, + { + "epoch": 0.6959847036328872, + "grad_norm": 0.240234375, + "learning_rate": 0.00029401354114690905, + "loss": 0.8274, + "step": 364 + }, + { + "epoch": 0.7036328871892925, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002938671999790402, + "loss": 0.8214, + "step": 368 + }, + { + "epoch": 0.7112810707456979, + "grad_norm": 0.24609375, + "learning_rate": 0.00029371912897606736, + "loss": 0.8537, + "step": 372 + }, + { + "epoch": 0.7189292543021033, + "grad_norm": 0.2275390625, + "learning_rate": 0.00029356932991837163, + "loss": 0.8378, + "step": 376 + }, + { + "epoch": 0.7265774378585086, + "grad_norm": 0.251953125, + "learning_rate": 0.0002934178046071116, + "loss": 0.8064, + "step": 380 + }, + { + "epoch": 0.734225621414914, + "grad_norm": 0.259765625, + "learning_rate": 0.0002932645548642024, + "loss": 0.8427, + "step": 384 + }, + { + "epoch": 0.7418738049713193, + "grad_norm": 0.251953125, + "learning_rate": 0.0002931095825322931, + "loss": 0.7602, + "step": 388 + }, + { + "epoch": 0.7495219885277247, + "grad_norm": 0.232421875, + "learning_rate": 0.00029295288947474513, + "loss": 0.881, + "step": 392 + }, + { + "epoch": 0.7571701720841301, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002927944775756095, + "loss": 0.865, + "step": 396 + }, + { + "epoch": 0.7648183556405354, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002926343487396044, + "loss": 0.839, + "step": 400 + }, + { + "epoch": 0.7724665391969407, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029247250489209217, + "loss": 0.7939, + "step": 404 + }, + { + "epoch": 0.780114722753346, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029230894797905595, + "loss": 0.7748, + "step": 408 + }, + { + "epoch": 0.7877629063097514, + "grad_norm": 0.255859375, + "learning_rate": 0.00029214367996707676, + "loss": 0.7829, + "step": 412 + }, + { + "epoch": 0.7954110898661568, + "grad_norm": 0.2392578125, + "learning_rate": 0.00029197670284330954, + "loss": 0.7867, + "step": 416 + }, + { + "epoch": 0.8030592734225621, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029180801861545906, + "loss": 0.7971, + "step": 420 + }, + { + "epoch": 0.8107074569789675, + "grad_norm": 0.25390625, + "learning_rate": 0.0002916376293117564, + "loss": 0.8241, + "step": 424 + }, + { + "epoch": 0.8183556405353728, + "grad_norm": 0.244140625, + "learning_rate": 0.00029146553698093387, + "loss": 0.8119, + "step": 428 + }, + { + "epoch": 0.8260038240917782, + "grad_norm": 0.271484375, + "learning_rate": 0.00029129174369220087, + "loss": 0.8048, + "step": 432 + }, + { + "epoch": 0.8336520076481836, + "grad_norm": 0.232421875, + "learning_rate": 0.00029111625153521877, + "loss": 0.76, + "step": 436 + }, + { + "epoch": 0.8413001912045889, + "grad_norm": 0.271484375, + "learning_rate": 0.00029093906262007583, + "loss": 0.7833, + "step": 440 + }, + { + "epoch": 0.8489483747609943, + "grad_norm": 0.2158203125, + "learning_rate": 0.00029076017907726196, + "loss": 0.8027, + "step": 444 + }, + { + "epoch": 0.8565965583173997, + "grad_norm": 0.21875, + "learning_rate": 0.0002905796030576428, + "loss": 0.818, + "step": 448 + }, + { + "epoch": 0.864244741873805, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029039733673243416, + "loss": 0.8358, + "step": 452 + }, + { + "epoch": 0.8718929254302104, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002902133822931759, + "loss": 0.7543, + "step": 456 + }, + { + "epoch": 0.8795411089866156, + "grad_norm": 0.2421875, + "learning_rate": 0.00029002774195170525, + "loss": 0.7765, + "step": 460 + }, + { + "epoch": 0.887189292543021, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002898404179401306, + "loss": 0.8094, + "step": 464 + }, + { + "epoch": 0.8948374760994264, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002896514125108045, + "loss": 0.7657, + "step": 468 + }, + { + "epoch": 0.9024856596558317, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002894607279362966, + "loss": 0.7774, + "step": 472 + }, + { + "epoch": 0.9101338432122371, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002892683665093662, + "loss": 0.8148, + "step": 476 + }, + { + "epoch": 0.9177820267686424, + "grad_norm": 0.236328125, + "learning_rate": 0.0002890743305429348, + "loss": 0.7882, + "step": 480 + }, + { + "epoch": 0.9254302103250478, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002888786223700585, + "loss": 0.7656, + "step": 484 + }, + { + "epoch": 0.9330783938814532, + "grad_norm": 0.224609375, + "learning_rate": 0.00028868124434389944, + "loss": 0.7802, + "step": 488 + }, + { + "epoch": 0.9407265774378585, + "grad_norm": 0.240234375, + "learning_rate": 0.00028848219883769805, + "loss": 0.7773, + "step": 492 + }, + { + "epoch": 0.9483747609942639, + "grad_norm": 0.2314453125, + "learning_rate": 0.000288281488244744, + "loss": 0.7803, + "step": 496 + }, + { + "epoch": 0.9560229445506692, + "grad_norm": 0.255859375, + "learning_rate": 0.000288079114978348, + "loss": 0.8056, + "step": 500 + }, + { + "epoch": 0.9636711281070746, + "grad_norm": 0.240234375, + "learning_rate": 0.0002878750814718121, + "loss": 0.8309, + "step": 504 + }, + { + "epoch": 0.97131931166348, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028766939017840114, + "loss": 0.7737, + "step": 508 + }, + { + "epoch": 0.9789674952198852, + "grad_norm": 0.26171875, + "learning_rate": 0.00028746204357131273, + "loss": 0.8039, + "step": 512 + }, + { + "epoch": 0.9866156787762906, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002872530441436477, + "loss": 0.7341, + "step": 516 + }, + { + "epoch": 0.994263862332696, + "grad_norm": 0.228515625, + "learning_rate": 0.0002870423944083801, + "loss": 0.8122, + "step": 520 + }, + { + "epoch": 1.0019120458891013, + "grad_norm": 0.216796875, + "learning_rate": 0.0002868300968983271, + "loss": 0.7403, + "step": 524 + }, + { + "epoch": 1.0095602294455066, + "grad_norm": 0.24609375, + "learning_rate": 0.0002866161541661185, + "loss": 0.697, + "step": 528 + }, + { + "epoch": 1.0172084130019121, + "grad_norm": 0.23828125, + "learning_rate": 0.0002864005687841656, + "loss": 0.7442, + "step": 532 + }, + { + "epoch": 1.0248565965583174, + "grad_norm": 0.26953125, + "learning_rate": 0.0002861833433446312, + "loss": 0.6853, + "step": 536 + }, + { + "epoch": 1.0325047801147227, + "grad_norm": 0.23828125, + "learning_rate": 0.00028596448045939735, + "loss": 0.7398, + "step": 540 + }, + { + "epoch": 1.0401529636711282, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002857439827600348, + "loss": 0.7912, + "step": 544 + }, + { + "epoch": 1.0478011472275335, + "grad_norm": 0.259765625, + "learning_rate": 0.0002855218528977709, + "loss": 0.7138, + "step": 548 + }, + { + "epoch": 1.0554493307839388, + "grad_norm": 0.2314453125, + "learning_rate": 0.00028529809354345794, + "loss": 0.726, + "step": 552 + }, + { + "epoch": 1.063097514340344, + "grad_norm": 0.287109375, + "learning_rate": 0.0002850727073875409, + "loss": 0.7058, + "step": 556 + }, + { + "epoch": 1.0707456978967496, + "grad_norm": 0.228515625, + "learning_rate": 0.00028484569714002517, + "loss": 0.7102, + "step": 560 + }, + { + "epoch": 1.0783938814531548, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002846170655304438, + "loss": 0.6534, + "step": 564 + }, + { + "epoch": 1.0860420650095601, + "grad_norm": 0.23828125, + "learning_rate": 0.0002843868153078251, + "loss": 0.6918, + "step": 568 + }, + { + "epoch": 1.0936902485659656, + "grad_norm": 0.251953125, + "learning_rate": 0.000284154949240659, + "loss": 0.6673, + "step": 572 + }, + { + "epoch": 1.101338432122371, + "grad_norm": 0.255859375, + "learning_rate": 0.0002839214701168644, + "loss": 0.6722, + "step": 576 + }, + { + "epoch": 1.1089866156787762, + "grad_norm": 0.251953125, + "learning_rate": 0.00028368638074375516, + "loss": 0.7141, + "step": 580 + }, + { + "epoch": 1.1166347992351817, + "grad_norm": 0.263671875, + "learning_rate": 0.0002834496839480063, + "loss": 0.665, + "step": 584 + }, + { + "epoch": 1.124282982791587, + "grad_norm": 0.2421875, + "learning_rate": 0.00028321138257562066, + "loss": 0.6886, + "step": 588 + }, + { + "epoch": 1.1319311663479923, + "grad_norm": 0.265625, + "learning_rate": 0.00028297147949189386, + "loss": 0.6997, + "step": 592 + }, + { + "epoch": 1.1395793499043978, + "grad_norm": 0.2578125, + "learning_rate": 0.00028272997758138044, + "loss": 0.7051, + "step": 596 + }, + { + "epoch": 1.147227533460803, + "grad_norm": 0.248046875, + "learning_rate": 0.00028248687974785896, + "loss": 0.7188, + "step": 600 + }, + { + "epoch": 1.1548757170172084, + "grad_norm": 0.275390625, + "learning_rate": 0.0002822421889142969, + "loss": 0.6757, + "step": 604 + }, + { + "epoch": 1.1625239005736137, + "grad_norm": 0.25, + "learning_rate": 0.00028199590802281595, + "loss": 0.7203, + "step": 608 + }, + { + "epoch": 1.1701720841300192, + "grad_norm": 0.255859375, + "learning_rate": 0.00028174804003465616, + "loss": 0.6943, + "step": 612 + }, + { + "epoch": 1.1778202676864244, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002814985879301408, + "loss": 0.7037, + "step": 616 + }, + { + "epoch": 1.1854684512428297, + "grad_norm": 0.26953125, + "learning_rate": 0.0002812475547086401, + "loss": 0.7215, + "step": 620 + }, + { + "epoch": 1.1931166347992352, + "grad_norm": 0.263671875, + "learning_rate": 0.00028099494338853554, + "loss": 0.6863, + "step": 624 + }, + { + "epoch": 1.2007648183556405, + "grad_norm": 0.240234375, + "learning_rate": 0.0002807407570071832, + "loss": 0.7432, + "step": 628 + }, + { + "epoch": 1.2084130019120458, + "grad_norm": 0.244140625, + "learning_rate": 0.00028048499862087757, + "loss": 0.7265, + "step": 632 + }, + { + "epoch": 1.2160611854684513, + "grad_norm": 0.259765625, + "learning_rate": 0.00028022767130481466, + "loss": 0.6848, + "step": 636 + }, + { + "epoch": 1.2237093690248566, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002799687781530549, + "loss": 0.717, + "step": 640 + }, + { + "epoch": 1.231357552581262, + "grad_norm": 0.259765625, + "learning_rate": 0.00027970832227848627, + "loss": 0.7011, + "step": 644 + }, + { + "epoch": 1.2390057361376674, + "grad_norm": 0.2578125, + "learning_rate": 0.0002794463068127866, + "loss": 0.7696, + "step": 648 + }, + { + "epoch": 1.2466539196940727, + "grad_norm": 0.2431640625, + "learning_rate": 0.00027918273490638574, + "loss": 0.6922, + "step": 652 + }, + { + "epoch": 1.254302103250478, + "grad_norm": 0.28515625, + "learning_rate": 0.0002789176097284283, + "loss": 0.6521, + "step": 656 + }, + { + "epoch": 1.2619502868068833, + "grad_norm": 0.25390625, + "learning_rate": 0.0002786509344667349, + "loss": 0.6642, + "step": 660 + }, + { + "epoch": 1.2695984703632888, + "grad_norm": 0.25390625, + "learning_rate": 0.0002783827123277643, + "loss": 0.7773, + "step": 664 + }, + { + "epoch": 1.277246653919694, + "grad_norm": 0.2431640625, + "learning_rate": 0.00027811294653657444, + "loss": 0.7314, + "step": 668 + }, + { + "epoch": 1.2848948374760996, + "grad_norm": 0.2373046875, + "learning_rate": 0.000277841640336784, + "loss": 0.7461, + "step": 672 + }, + { + "epoch": 1.2925430210325048, + "grad_norm": 0.236328125, + "learning_rate": 0.00027756879699053337, + "loss": 0.7426, + "step": 676 + }, + { + "epoch": 1.3001912045889101, + "grad_norm": 0.26171875, + "learning_rate": 0.0002772944197784451, + "loss": 0.6986, + "step": 680 + }, + { + "epoch": 1.3078393881453154, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002770185119995849, + "loss": 0.7379, + "step": 684 + }, + { + "epoch": 1.3154875717017207, + "grad_norm": 0.265625, + "learning_rate": 0.0002767410769714216, + "loss": 0.7146, + "step": 688 + }, + { + "epoch": 1.3231357552581262, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002764621180297875, + "loss": 0.7061, + "step": 692 + }, + { + "epoch": 1.3307839388145315, + "grad_norm": 0.271484375, + "learning_rate": 0.0002761816385288382, + "loss": 0.6547, + "step": 696 + }, + { + "epoch": 1.338432122370937, + "grad_norm": 0.271484375, + "learning_rate": 0.0002758996418410122, + "loss": 0.7018, + "step": 700 + }, + { + "epoch": 1.3460803059273423, + "grad_norm": 0.2578125, + "learning_rate": 0.0002756161313569904, + "loss": 0.7062, + "step": 704 + }, + { + "epoch": 1.3537284894837476, + "grad_norm": 0.271484375, + "learning_rate": 0.00027533111048565537, + "loss": 0.778, + "step": 708 + }, + { + "epoch": 1.3613766730401529, + "grad_norm": 0.271484375, + "learning_rate": 0.00027504458265405034, + "loss": 0.6916, + "step": 712 + }, + { + "epoch": 1.3690248565965584, + "grad_norm": 0.2333984375, + "learning_rate": 0.00027475655130733786, + "loss": 0.6862, + "step": 716 + }, + { + "epoch": 1.3766730401529637, + "grad_norm": 0.267578125, + "learning_rate": 0.00027446701990875864, + "loss": 0.7037, + "step": 720 + }, + { + "epoch": 1.384321223709369, + "grad_norm": 0.2431640625, + "learning_rate": 0.00027417599193958964, + "loss": 0.6976, + "step": 724 + }, + { + "epoch": 1.3919694072657744, + "grad_norm": 0.24609375, + "learning_rate": 0.00027388347089910253, + "loss": 0.7209, + "step": 728 + }, + { + "epoch": 1.3996175908221797, + "grad_norm": 0.251953125, + "learning_rate": 0.0002735894603045211, + "loss": 0.7009, + "step": 732 + }, + { + "epoch": 1.407265774378585, + "grad_norm": 0.2578125, + "learning_rate": 0.0002732939636909796, + "loss": 0.6583, + "step": 736 + }, + { + "epoch": 1.4149139579349903, + "grad_norm": 0.255859375, + "learning_rate": 0.00027299698461147966, + "loss": 0.6999, + "step": 740 + }, + { + "epoch": 1.4225621414913958, + "grad_norm": 0.267578125, + "learning_rate": 0.0002726985266368481, + "loss": 0.7269, + "step": 744 + }, + { + "epoch": 1.430210325047801, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002723985933556936, + "loss": 0.6256, + "step": 748 + }, + { + "epoch": 1.4378585086042066, + "grad_norm": 0.271484375, + "learning_rate": 0.00027209718837436353, + "loss": 0.7129, + "step": 752 + }, + { + "epoch": 1.445506692160612, + "grad_norm": 0.271484375, + "learning_rate": 0.000271794315316901, + "loss": 0.6623, + "step": 756 + }, + { + "epoch": 1.4531548757170172, + "grad_norm": 0.259765625, + "learning_rate": 0.00027148997782500085, + "loss": 0.6869, + "step": 760 + }, + { + "epoch": 1.4608030592734225, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002711841795579661, + "loss": 0.7426, + "step": 764 + }, + { + "epoch": 1.468451242829828, + "grad_norm": 0.2578125, + "learning_rate": 0.00027087692419266383, + "loss": 0.6731, + "step": 768 + }, + { + "epoch": 1.4760994263862333, + "grad_norm": 0.275390625, + "learning_rate": 0.00027056821542348114, + "loss": 0.7591, + "step": 772 + }, + { + "epoch": 1.4837476099426385, + "grad_norm": 0.25, + "learning_rate": 0.0002702580569622805, + "loss": 0.7129, + "step": 776 + }, + { + "epoch": 1.491395793499044, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002699464525383552, + "loss": 0.7307, + "step": 780 + }, + { + "epoch": 1.4990439770554493, + "grad_norm": 0.2578125, + "learning_rate": 0.0002696334058983848, + "loss": 0.7317, + "step": 784 + }, + { + "epoch": 1.5066921606118546, + "grad_norm": 0.2578125, + "learning_rate": 0.0002693189208063894, + "loss": 0.6994, + "step": 788 + }, + { + "epoch": 1.51434034416826, + "grad_norm": 0.255859375, + "learning_rate": 0.00026900300104368524, + "loss": 0.72, + "step": 792 + }, + { + "epoch": 1.5219885277246654, + "grad_norm": 0.259765625, + "learning_rate": 0.0002686856504088385, + "loss": 0.7112, + "step": 796 + }, + { + "epoch": 1.5296367112810707, + "grad_norm": 0.2490234375, + "learning_rate": 0.00026836687271762015, + "loss": 0.6912, + "step": 800 + }, + { + "epoch": 1.5372848948374762, + "grad_norm": 0.2421875, + "learning_rate": 0.0002680466718029596, + "loss": 0.6801, + "step": 804 + }, + { + "epoch": 1.5449330783938815, + "grad_norm": 0.265625, + "learning_rate": 0.00026772505151489897, + "loss": 0.7077, + "step": 808 + }, + { + "epoch": 1.5525812619502868, + "grad_norm": 0.283203125, + "learning_rate": 0.00026740201572054685, + "loss": 0.6926, + "step": 812 + }, + { + "epoch": 1.560229445506692, + "grad_norm": 0.27734375, + "learning_rate": 0.00026707756830403144, + "loss": 0.702, + "step": 816 + }, + { + "epoch": 1.5678776290630974, + "grad_norm": 0.271484375, + "learning_rate": 0.00026675171316645403, + "loss": 0.7178, + "step": 820 + }, + { + "epoch": 1.5755258126195029, + "grad_norm": 0.26171875, + "learning_rate": 0.00026642445422584224, + "loss": 0.6843, + "step": 824 + }, + { + "epoch": 1.5831739961759084, + "grad_norm": 0.25, + "learning_rate": 0.0002660957954171028, + "loss": 0.6722, + "step": 828 + }, + { + "epoch": 1.5908221797323137, + "grad_norm": 0.251953125, + "learning_rate": 0.00026576574069197406, + "loss": 0.6518, + "step": 832 + }, + { + "epoch": 1.598470363288719, + "grad_norm": 0.263671875, + "learning_rate": 0.00026543429401897875, + "loss": 0.6998, + "step": 836 + }, + { + "epoch": 1.6061185468451242, + "grad_norm": 0.26171875, + "learning_rate": 0.0002651014593833762, + "loss": 0.6966, + "step": 840 + }, + { + "epoch": 1.6137667304015295, + "grad_norm": 0.25390625, + "learning_rate": 0.00026476724078711416, + "loss": 0.7054, + "step": 844 + }, + { + "epoch": 1.621414913957935, + "grad_norm": 0.279296875, + "learning_rate": 0.00026443164224878115, + "loss": 0.6655, + "step": 848 + }, + { + "epoch": 1.6290630975143403, + "grad_norm": 0.275390625, + "learning_rate": 0.0002640946678035576, + "loss": 0.7098, + "step": 852 + }, + { + "epoch": 1.6367112810707458, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002637563215031679, + "loss": 0.7099, + "step": 856 + }, + { + "epoch": 1.644359464627151, + "grad_norm": 0.2470703125, + "learning_rate": 0.00026341660741583127, + "loss": 0.6704, + "step": 860 + }, + { + "epoch": 1.6520076481835564, + "grad_norm": 0.27734375, + "learning_rate": 0.00026307552962621293, + "loss": 0.686, + "step": 864 + }, + { + "epoch": 1.6596558317399617, + "grad_norm": 0.2470703125, + "learning_rate": 0.00026273309223537507, + "loss": 0.69, + "step": 868 + }, + { + "epoch": 1.667304015296367, + "grad_norm": 0.275390625, + "learning_rate": 0.0002623892993607275, + "loss": 0.6957, + "step": 872 + }, + { + "epoch": 1.6749521988527725, + "grad_norm": 0.267578125, + "learning_rate": 0.00026204415513597813, + "loss": 0.7071, + "step": 876 + }, + { + "epoch": 1.682600382409178, + "grad_norm": 0.26953125, + "learning_rate": 0.0002616976637110832, + "loss": 0.6313, + "step": 880 + }, + { + "epoch": 1.6902485659655833, + "grad_norm": 0.267578125, + "learning_rate": 0.0002613498292521977, + "loss": 0.6809, + "step": 884 + }, + { + "epoch": 1.6978967495219885, + "grad_norm": 0.275390625, + "learning_rate": 0.00026100065594162475, + "loss": 0.6867, + "step": 888 + }, + { + "epoch": 1.7055449330783938, + "grad_norm": 0.26171875, + "learning_rate": 0.00026065014797776575, + "loss": 0.7065, + "step": 892 + }, + { + "epoch": 1.7131931166347991, + "grad_norm": 0.296875, + "learning_rate": 0.0002602983095750698, + "loss": 0.6938, + "step": 896 + }, + { + "epoch": 1.7208413001912046, + "grad_norm": 0.25, + "learning_rate": 0.0002599451449639828, + "loss": 0.7138, + "step": 900 + }, + { + "epoch": 1.72848948374761, + "grad_norm": 0.2421875, + "learning_rate": 0.00025959065839089684, + "loss": 0.6976, + "step": 904 + }, + { + "epoch": 1.7361376673040154, + "grad_norm": 0.2431640625, + "learning_rate": 0.00025923485411809917, + "loss": 0.6792, + "step": 908 + }, + { + "epoch": 1.7437858508604207, + "grad_norm": 0.25390625, + "learning_rate": 0.00025887773642372064, + "loss": 0.7016, + "step": 912 + }, + { + "epoch": 1.751434034416826, + "grad_norm": 0.2578125, + "learning_rate": 0.00025851930960168464, + "loss": 0.6845, + "step": 916 + }, + { + "epoch": 1.7590822179732313, + "grad_norm": 0.267578125, + "learning_rate": 0.0002581595779616552, + "loss": 0.6932, + "step": 920 + }, + { + "epoch": 1.7667304015296366, + "grad_norm": 0.263671875, + "learning_rate": 0.0002577985458289852, + "loss": 0.6911, + "step": 924 + }, + { + "epoch": 1.774378585086042, + "grad_norm": 0.259765625, + "learning_rate": 0.00025743621754466457, + "loss": 0.6611, + "step": 928 + }, + { + "epoch": 1.7820267686424476, + "grad_norm": 0.251953125, + "learning_rate": 0.0002570725974652679, + "loss": 0.7158, + "step": 932 + }, + { + "epoch": 1.7896749521988529, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002567076899629021, + "loss": 0.7176, + "step": 936 + }, + { + "epoch": 1.7973231357552581, + "grad_norm": 0.251953125, + "learning_rate": 0.0002563414994251538, + "loss": 0.7001, + "step": 940 + }, + { + "epoch": 1.8049713193116634, + "grad_norm": 0.267578125, + "learning_rate": 0.0002559740302550366, + "loss": 0.7583, + "step": 944 + }, + { + "epoch": 1.8126195028680687, + "grad_norm": 0.28125, + "learning_rate": 0.0002556052868709383, + "loss": 0.6374, + "step": 948 + }, + { + "epoch": 1.8202676864244742, + "grad_norm": 0.279296875, + "learning_rate": 0.00025523527370656753, + "loss": 0.6771, + "step": 952 + }, + { + "epoch": 1.8279158699808795, + "grad_norm": 0.263671875, + "learning_rate": 0.0002548639952109006, + "loss": 0.6547, + "step": 956 + }, + { + "epoch": 1.835564053537285, + "grad_norm": 0.255859375, + "learning_rate": 0.0002544914558481279, + "loss": 0.7078, + "step": 960 + }, + { + "epoch": 1.8432122370936903, + "grad_norm": 0.23828125, + "learning_rate": 0.00025411766009760027, + "loss": 0.6913, + "step": 964 + }, + { + "epoch": 1.8508604206500956, + "grad_norm": 0.26171875, + "learning_rate": 0.00025374261245377525, + "loss": 0.725, + "step": 968 + }, + { + "epoch": 1.8585086042065009, + "grad_norm": 0.251953125, + "learning_rate": 0.0002533663174261628, + "loss": 0.6871, + "step": 972 + }, + { + "epoch": 1.8661567877629062, + "grad_norm": 0.283203125, + "learning_rate": 0.0002529887795392713, + "loss": 0.6748, + "step": 976 + }, + { + "epoch": 1.8738049713193117, + "grad_norm": 0.28515625, + "learning_rate": 0.00025261000333255305, + "loss": 0.6224, + "step": 980 + }, + { + "epoch": 1.8814531548757172, + "grad_norm": 0.255859375, + "learning_rate": 0.0002522299933603497, + "loss": 0.6914, + "step": 984 + }, + { + "epoch": 1.8891013384321225, + "grad_norm": 0.26953125, + "learning_rate": 0.0002518487541918374, + "loss": 0.684, + "step": 988 + }, + { + "epoch": 1.8967495219885278, + "grad_norm": 0.259765625, + "learning_rate": 0.000251466290410972, + "loss": 0.7131, + "step": 992 + }, + { + "epoch": 1.904397705544933, + "grad_norm": 0.267578125, + "learning_rate": 0.0002510826066164341, + "loss": 0.6956, + "step": 996 + }, + { + "epoch": 1.9120458891013383, + "grad_norm": 0.2470703125, + "learning_rate": 0.00025069770742157317, + "loss": 0.6683, + "step": 1000 + }, + { + "epoch": 1.9196940726577438, + "grad_norm": 0.251953125, + "learning_rate": 0.00025031159745435267, + "loss": 0.6941, + "step": 1004 + }, + { + "epoch": 1.9273422562141491, + "grad_norm": 0.248046875, + "learning_rate": 0.0002499242813572942, + "loss": 0.7458, + "step": 1008 + }, + { + "epoch": 1.9349904397705546, + "grad_norm": 0.2734375, + "learning_rate": 0.0002495357637874215, + "loss": 0.6724, + "step": 1012 + }, + { + "epoch": 1.94263862332696, + "grad_norm": 0.240234375, + "learning_rate": 0.0002491460494162048, + "loss": 0.6662, + "step": 1016 + }, + { + "epoch": 1.9502868068833652, + "grad_norm": 0.279296875, + "learning_rate": 0.00024875514292950447, + "loss": 0.652, + "step": 1020 + }, + { + "epoch": 1.9579349904397705, + "grad_norm": 0.263671875, + "learning_rate": 0.00024836304902751445, + "loss": 0.7223, + "step": 1024 + }, + { + "epoch": 1.9655831739961758, + "grad_norm": 0.255859375, + "learning_rate": 0.0002479697724247062, + "loss": 0.7065, + "step": 1028 + }, + { + "epoch": 1.9732313575525813, + "grad_norm": 0.263671875, + "learning_rate": 0.0002475753178497716, + "loss": 0.7307, + "step": 1032 + }, + { + "epoch": 1.9808795411089866, + "grad_norm": 0.279296875, + "learning_rate": 0.00024717969004556646, + "loss": 0.7086, + "step": 1036 + }, + { + "epoch": 1.988527724665392, + "grad_norm": 0.259765625, + "learning_rate": 0.0002467828937690532, + "loss": 0.7051, + "step": 1040 + }, + { + "epoch": 1.9961759082217974, + "grad_norm": 0.2734375, + "learning_rate": 0.0002463849337912437, + "loss": 0.7084, + "step": 1044 + }, + { + "epoch": 2.0038240917782026, + "grad_norm": 0.236328125, + "learning_rate": 0.00024598581489714206, + "loss": 0.5785, + "step": 1048 + }, + { + "epoch": 2.011472275334608, + "grad_norm": 0.28125, + "learning_rate": 0.0002455855418856869, + "loss": 0.5793, + "step": 1052 + }, + { + "epoch": 2.019120458891013, + "grad_norm": 0.259765625, + "learning_rate": 0.000245184119569694, + "loss": 0.6212, + "step": 1056 + }, + { + "epoch": 2.026768642447419, + "grad_norm": 0.26953125, + "learning_rate": 0.0002447815527757979, + "loss": 0.569, + "step": 1060 + }, + { + "epoch": 2.0344168260038242, + "grad_norm": 0.26953125, + "learning_rate": 0.0002443778463443944, + "loss": 0.5868, + "step": 1064 + }, + { + "epoch": 2.0420650095602295, + "grad_norm": 0.236328125, + "learning_rate": 0.0002439730051295818, + "loss": 0.6195, + "step": 1068 + }, + { + "epoch": 2.049713193116635, + "grad_norm": 0.279296875, + "learning_rate": 0.0002435670339991031, + "loss": 0.5886, + "step": 1072 + }, + { + "epoch": 2.05736137667304, + "grad_norm": 0.28125, + "learning_rate": 0.00024315993783428718, + "loss": 0.6097, + "step": 1076 + }, + { + "epoch": 2.0650095602294454, + "grad_norm": 0.3203125, + "learning_rate": 0.00024275172152999006, + "loss": 0.5727, + "step": 1080 + }, + { + "epoch": 2.0726577437858507, + "grad_norm": 0.275390625, + "learning_rate": 0.00024234238999453614, + "loss": 0.6085, + "step": 1084 + }, + { + "epoch": 2.0803059273422564, + "grad_norm": 0.29296875, + "learning_rate": 0.00024193194814965934, + "loss": 0.6144, + "step": 1088 + }, + { + "epoch": 2.0879541108986617, + "grad_norm": 0.279296875, + "learning_rate": 0.00024152040093044353, + "loss": 0.5868, + "step": 1092 + }, + { + "epoch": 2.095602294455067, + "grad_norm": 0.2734375, + "learning_rate": 0.00024110775328526352, + "loss": 0.6278, + "step": 1096 + }, + { + "epoch": 2.1032504780114722, + "grad_norm": 0.275390625, + "learning_rate": 0.00024069401017572543, + "loss": 0.5923, + "step": 1100 + }, + { + "epoch": 2.1108986615678775, + "grad_norm": 0.265625, + "learning_rate": 0.00024027917657660713, + "loss": 0.5759, + "step": 1104 + }, + { + "epoch": 2.118546845124283, + "grad_norm": 0.2734375, + "learning_rate": 0.00023986325747579824, + "loss": 0.6138, + "step": 1108 + }, + { + "epoch": 2.126195028680688, + "grad_norm": 0.28515625, + "learning_rate": 0.0002394462578742403, + "loss": 0.5786, + "step": 1112 + }, + { + "epoch": 2.133843212237094, + "grad_norm": 0.267578125, + "learning_rate": 0.0002390281827858668, + "loss": 0.64, + "step": 1116 + }, + { + "epoch": 2.141491395793499, + "grad_norm": 0.279296875, + "learning_rate": 0.0002386090372375424, + "loss": 0.6307, + "step": 1120 + }, + { + "epoch": 2.1491395793499044, + "grad_norm": 0.2578125, + "learning_rate": 0.00023818882626900294, + "loss": 0.5641, + "step": 1124 + }, + { + "epoch": 2.1567877629063097, + "grad_norm": 0.28125, + "learning_rate": 0.00023776755493279473, + "loss": 0.623, + "step": 1128 + }, + { + "epoch": 2.164435946462715, + "grad_norm": 0.275390625, + "learning_rate": 0.00023734522829421372, + "loss": 0.6022, + "step": 1132 + }, + { + "epoch": 2.1720841300191203, + "grad_norm": 0.2734375, + "learning_rate": 0.00023692185143124464, + "loss": 0.6121, + "step": 1136 + }, + { + "epoch": 2.179732313575526, + "grad_norm": 0.267578125, + "learning_rate": 0.00023649742943449996, + "loss": 0.5878, + "step": 1140 + }, + { + "epoch": 2.1873804971319313, + "grad_norm": 0.2578125, + "learning_rate": 0.00023607196740715858, + "loss": 0.6143, + "step": 1144 + }, + { + "epoch": 2.1950286806883366, + "grad_norm": 0.28515625, + "learning_rate": 0.00023564547046490468, + "loss": 0.5655, + "step": 1148 + }, + { + "epoch": 2.202676864244742, + "grad_norm": 0.26953125, + "learning_rate": 0.00023521794373586603, + "loss": 0.5685, + "step": 1152 + }, + { + "epoch": 2.210325047801147, + "grad_norm": 0.283203125, + "learning_rate": 0.00023478939236055228, + "loss": 0.5845, + "step": 1156 + }, + { + "epoch": 2.2179732313575524, + "grad_norm": 0.2890625, + "learning_rate": 0.00023435982149179346, + "loss": 0.6108, + "step": 1160 + }, + { + "epoch": 2.2256214149139577, + "grad_norm": 0.28125, + "learning_rate": 0.0002339292362946777, + "loss": 0.6221, + "step": 1164 + }, + { + "epoch": 2.2332695984703634, + "grad_norm": 0.28515625, + "learning_rate": 0.0002334976419464892, + "loss": 0.5739, + "step": 1168 + }, + { + "epoch": 2.2409177820267687, + "grad_norm": 0.28515625, + "learning_rate": 0.00023306504363664613, + "loss": 0.5928, + "step": 1172 + }, + { + "epoch": 2.248565965583174, + "grad_norm": 0.259765625, + "learning_rate": 0.00023263144656663801, + "loss": 0.5422, + "step": 1176 + }, + { + "epoch": 2.2562141491395793, + "grad_norm": 0.271484375, + "learning_rate": 0.00023219685594996347, + "loss": 0.5815, + "step": 1180 + }, + { + "epoch": 2.2638623326959846, + "grad_norm": 0.267578125, + "learning_rate": 0.00023176127701206713, + "loss": 0.5786, + "step": 1184 + }, + { + "epoch": 2.27151051625239, + "grad_norm": 0.263671875, + "learning_rate": 0.00023132471499027717, + "loss": 0.5634, + "step": 1188 + }, + { + "epoch": 2.2791586998087956, + "grad_norm": 0.291015625, + "learning_rate": 0.0002308871751337422, + "loss": 0.5969, + "step": 1192 + }, + { + "epoch": 2.286806883365201, + "grad_norm": 0.279296875, + "learning_rate": 0.00023044866270336822, + "loss": 0.587, + "step": 1196 + }, + { + "epoch": 2.294455066921606, + "grad_norm": 0.310546875, + "learning_rate": 0.00023000918297175506, + "loss": 0.6312, + "step": 1200 + }, + { + "epoch": 2.3021032504780115, + "grad_norm": 0.28515625, + "learning_rate": 0.00022956874122313347, + "loss": 0.6268, + "step": 1204 + }, + { + "epoch": 2.3097514340344167, + "grad_norm": 0.263671875, + "learning_rate": 0.00022912734275330117, + "loss": 0.586, + "step": 1208 + }, + { + "epoch": 2.317399617590822, + "grad_norm": 0.28125, + "learning_rate": 0.00022868499286955943, + "loss": 0.571, + "step": 1212 + }, + { + "epoch": 2.3250478011472273, + "grad_norm": 0.296875, + "learning_rate": 0.00022824169689064915, + "loss": 0.6062, + "step": 1216 + }, + { + "epoch": 2.332695984703633, + "grad_norm": 0.263671875, + "learning_rate": 0.00022779746014668683, + "loss": 0.5991, + "step": 1220 + }, + { + "epoch": 2.3403441682600383, + "grad_norm": 0.27734375, + "learning_rate": 0.00022735228797910066, + "loss": 0.6193, + "step": 1224 + }, + { + "epoch": 2.3479923518164436, + "grad_norm": 0.2890625, + "learning_rate": 0.0002269061857405662, + "loss": 0.5719, + "step": 1228 + }, + { + "epoch": 2.355640535372849, + "grad_norm": 0.27734375, + "learning_rate": 0.00022645915879494202, + "loss": 0.6171, + "step": 1232 + }, + { + "epoch": 2.363288718929254, + "grad_norm": 0.267578125, + "learning_rate": 0.00022601121251720514, + "loss": 0.6213, + "step": 1236 + }, + { + "epoch": 2.3709369024856595, + "grad_norm": 0.271484375, + "learning_rate": 0.0002255623522933866, + "loss": 0.5933, + "step": 1240 + }, + { + "epoch": 2.378585086042065, + "grad_norm": 0.2890625, + "learning_rate": 0.00022511258352050649, + "loss": 0.5598, + "step": 1244 + }, + { + "epoch": 2.3862332695984705, + "grad_norm": 0.2890625, + "learning_rate": 0.00022466191160650916, + "loss": 0.6435, + "step": 1248 + }, + { + "epoch": 2.3938814531548758, + "grad_norm": 0.28515625, + "learning_rate": 0.00022421034197019822, + "loss": 0.5553, + "step": 1252 + }, + { + "epoch": 2.401529636711281, + "grad_norm": 0.25390625, + "learning_rate": 0.00022375788004117128, + "loss": 0.6193, + "step": 1256 + }, + { + "epoch": 2.4091778202676863, + "grad_norm": 0.29296875, + "learning_rate": 0.00022330453125975474, + "loss": 0.6117, + "step": 1260 + }, + { + "epoch": 2.4168260038240916, + "grad_norm": 0.318359375, + "learning_rate": 0.0002228503010769384, + "loss": 0.5761, + "step": 1264 + }, + { + "epoch": 2.424474187380497, + "grad_norm": 0.2890625, + "learning_rate": 0.0002223951949543098, + "loss": 0.6343, + "step": 1268 + }, + { + "epoch": 2.4321223709369026, + "grad_norm": 0.28515625, + "learning_rate": 0.00022193921836398875, + "loss": 0.5763, + "step": 1272 + }, + { + "epoch": 2.439770554493308, + "grad_norm": 0.2890625, + "learning_rate": 0.00022148237678856138, + "loss": 0.5807, + "step": 1276 + }, + { + "epoch": 2.447418738049713, + "grad_norm": 0.279296875, + "learning_rate": 0.0002210246757210142, + "loss": 0.62, + "step": 1280 + }, + { + "epoch": 2.4550669216061185, + "grad_norm": 0.28515625, + "learning_rate": 0.00022056612066466817, + "loss": 0.6255, + "step": 1284 + }, + { + "epoch": 2.462715105162524, + "grad_norm": 0.3203125, + "learning_rate": 0.00022010671713311238, + "loss": 0.6786, + "step": 1288 + }, + { + "epoch": 2.470363288718929, + "grad_norm": 0.2890625, + "learning_rate": 0.000219646470650138, + "loss": 0.6173, + "step": 1292 + }, + { + "epoch": 2.478011472275335, + "grad_norm": 0.2734375, + "learning_rate": 0.00021918538674967156, + "loss": 0.6113, + "step": 1296 + }, + { + "epoch": 2.48565965583174, + "grad_norm": 0.265625, + "learning_rate": 0.0002187234709757087, + "loss": 0.5949, + "step": 1300 + }, + { + "epoch": 2.4933078393881454, + "grad_norm": 0.296875, + "learning_rate": 0.00021826072888224716, + "loss": 0.6248, + "step": 1304 + }, + { + "epoch": 2.5009560229445507, + "grad_norm": 0.283203125, + "learning_rate": 0.00021779716603322034, + "loss": 0.5849, + "step": 1308 + }, + { + "epoch": 2.508604206500956, + "grad_norm": 0.26953125, + "learning_rate": 0.0002173327880024303, + "loss": 0.5947, + "step": 1312 + }, + { + "epoch": 2.5162523900573612, + "grad_norm": 0.294921875, + "learning_rate": 0.00021686760037348065, + "loss": 0.5689, + "step": 1316 + }, + { + "epoch": 2.5239005736137665, + "grad_norm": 0.2890625, + "learning_rate": 0.00021640160873970954, + "loss": 0.6302, + "step": 1320 + }, + { + "epoch": 2.5315487571701722, + "grad_norm": 0.275390625, + "learning_rate": 0.00021593481870412217, + "loss": 0.6117, + "step": 1324 + }, + { + "epoch": 2.5391969407265775, + "grad_norm": 0.298828125, + "learning_rate": 0.0002154672358793238, + "loss": 0.6024, + "step": 1328 + }, + { + "epoch": 2.546845124282983, + "grad_norm": 0.298828125, + "learning_rate": 0.00021499886588745195, + "loss": 0.5417, + "step": 1332 + }, + { + "epoch": 2.554493307839388, + "grad_norm": 0.29296875, + "learning_rate": 0.00021452971436010886, + "loss": 0.5975, + "step": 1336 + }, + { + "epoch": 2.5621414913957934, + "grad_norm": 0.287109375, + "learning_rate": 0.00021405978693829397, + "loss": 0.5997, + "step": 1340 + }, + { + "epoch": 2.569789674952199, + "grad_norm": 0.296875, + "learning_rate": 0.00021358908927233576, + "loss": 0.6047, + "step": 1344 + }, + { + "epoch": 2.5774378585086044, + "grad_norm": 0.283203125, + "learning_rate": 0.00021311762702182414, + "loss": 0.6135, + "step": 1348 + }, + { + "epoch": 2.5850860420650097, + "grad_norm": 0.2890625, + "learning_rate": 0.00021264540585554215, + "loss": 0.6251, + "step": 1352 + }, + { + "epoch": 2.592734225621415, + "grad_norm": 0.275390625, + "learning_rate": 0.00021217243145139802, + "loss": 0.6308, + "step": 1356 + }, + { + "epoch": 2.6003824091778203, + "grad_norm": 0.2890625, + "learning_rate": 0.0002116987094963567, + "loss": 0.5828, + "step": 1360 + }, + { + "epoch": 2.6080305927342256, + "grad_norm": 0.30078125, + "learning_rate": 0.00021122424568637157, + "loss": 0.6057, + "step": 1364 + }, + { + "epoch": 2.615678776290631, + "grad_norm": 0.291015625, + "learning_rate": 0.00021074904572631606, + "loss": 0.6435, + "step": 1368 + }, + { + "epoch": 2.623326959847036, + "grad_norm": 0.294921875, + "learning_rate": 0.00021027311532991475, + "loss": 0.6201, + "step": 1372 + }, + { + "epoch": 2.6309751434034414, + "grad_norm": 0.27734375, + "learning_rate": 0.00020979646021967503, + "loss": 0.6192, + "step": 1376 + }, + { + "epoch": 2.638623326959847, + "grad_norm": 0.287109375, + "learning_rate": 0.00020931908612681805, + "loss": 0.6072, + "step": 1380 + }, + { + "epoch": 2.6462715105162524, + "grad_norm": 0.287109375, + "learning_rate": 0.00020884099879120993, + "loss": 0.5332, + "step": 1384 + }, + { + "epoch": 2.6539196940726577, + "grad_norm": 0.291015625, + "learning_rate": 0.00020836220396129265, + "loss": 0.5923, + "step": 1388 + }, + { + "epoch": 2.661567877629063, + "grad_norm": 0.2734375, + "learning_rate": 0.00020788270739401505, + "loss": 0.6293, + "step": 1392 + }, + { + "epoch": 2.6692160611854687, + "grad_norm": 0.275390625, + "learning_rate": 0.00020740251485476345, + "loss": 0.5851, + "step": 1396 + }, + { + "epoch": 2.676864244741874, + "grad_norm": 0.30078125, + "learning_rate": 0.00020692163211729253, + "loss": 0.6088, + "step": 1400 + }, + { + "epoch": 2.6845124282982793, + "grad_norm": 0.306640625, + "learning_rate": 0.0002064400649636557, + "loss": 0.6033, + "step": 1404 + }, + { + "epoch": 2.6921606118546846, + "grad_norm": 0.3046875, + "learning_rate": 0.0002059578191841357, + "loss": 0.5948, + "step": 1408 + }, + { + "epoch": 2.69980879541109, + "grad_norm": 0.29296875, + "learning_rate": 0.00020547490057717499, + "loss": 0.6287, + "step": 1412 + }, + { + "epoch": 2.707456978967495, + "grad_norm": 0.296875, + "learning_rate": 0.00020499131494930602, + "loss": 0.5736, + "step": 1416 + }, + { + "epoch": 2.7151051625239004, + "grad_norm": 0.287109375, + "learning_rate": 0.0002045070681150813, + "loss": 0.6496, + "step": 1420 + }, + { + "epoch": 2.7227533460803057, + "grad_norm": 0.279296875, + "learning_rate": 0.00020402216589700362, + "loss": 0.5993, + "step": 1424 + }, + { + "epoch": 2.730401529636711, + "grad_norm": 0.296875, + "learning_rate": 0.00020353661412545598, + "loss": 0.596, + "step": 1428 + }, + { + "epoch": 2.7380497131931167, + "grad_norm": 0.28515625, + "learning_rate": 0.00020305041863863152, + "loss": 0.639, + "step": 1432 + }, + { + "epoch": 2.745697896749522, + "grad_norm": 0.27734375, + "learning_rate": 0.00020256358528246334, + "loss": 0.5703, + "step": 1436 + }, + { + "epoch": 2.7533460803059273, + "grad_norm": 0.279296875, + "learning_rate": 0.00020207611991055407, + "loss": 0.5838, + "step": 1440 + }, + { + "epoch": 2.7609942638623326, + "grad_norm": 0.3046875, + "learning_rate": 0.0002015880283841057, + "loss": 0.5845, + "step": 1444 + }, + { + "epoch": 2.768642447418738, + "grad_norm": 0.287109375, + "learning_rate": 0.00020109931657184894, + "loss": 0.6169, + "step": 1448 + }, + { + "epoch": 2.7762906309751436, + "grad_norm": 0.296875, + "learning_rate": 0.0002006099903499727, + "loss": 0.6026, + "step": 1452 + }, + { + "epoch": 2.783938814531549, + "grad_norm": 0.291015625, + "learning_rate": 0.00020012005560205356, + "loss": 0.6278, + "step": 1456 + }, + { + "epoch": 2.791586998087954, + "grad_norm": 0.28515625, + "learning_rate": 0.0001996295182189847, + "loss": 0.6273, + "step": 1460 + }, + { + "epoch": 2.7992351816443595, + "grad_norm": 0.291015625, + "learning_rate": 0.00019913838409890548, + "loss": 0.6084, + "step": 1464 + }, + { + "epoch": 2.8068833652007648, + "grad_norm": 0.30859375, + "learning_rate": 0.00019864665914713024, + "loss": 0.6295, + "step": 1468 + }, + { + "epoch": 2.81453154875717, + "grad_norm": 0.29296875, + "learning_rate": 0.0001981543492760774, + "loss": 0.5889, + "step": 1472 + }, + { + "epoch": 2.8221797323135753, + "grad_norm": 0.287109375, + "learning_rate": 0.00019766146040519836, + "loss": 0.6064, + "step": 1476 + }, + { + "epoch": 2.8298279158699806, + "grad_norm": 0.28515625, + "learning_rate": 0.00019716799846090634, + "loss": 0.6269, + "step": 1480 + }, + { + "epoch": 2.8374760994263863, + "grad_norm": 0.287109375, + "learning_rate": 0.00019667396937650506, + "loss": 0.5742, + "step": 1484 + }, + { + "epoch": 2.8451242829827916, + "grad_norm": 0.33203125, + "learning_rate": 0.0001961793790921174, + "loss": 0.5701, + "step": 1488 + }, + { + "epoch": 2.852772466539197, + "grad_norm": 0.2734375, + "learning_rate": 0.00019568423355461402, + "loss": 0.5973, + "step": 1492 + }, + { + "epoch": 2.860420650095602, + "grad_norm": 0.296875, + "learning_rate": 0.00019518853871754204, + "loss": 0.609, + "step": 1496 + }, + { + "epoch": 2.8680688336520075, + "grad_norm": 0.30078125, + "learning_rate": 0.00019469230054105295, + "loss": 0.5944, + "step": 1500 + }, + { + "epoch": 2.875717017208413, + "grad_norm": 0.302734375, + "learning_rate": 0.0001941955249918315, + "loss": 0.5914, + "step": 1504 + }, + { + "epoch": 2.8833652007648185, + "grad_norm": 0.283203125, + "learning_rate": 0.00019369821804302365, + "loss": 0.6191, + "step": 1508 + }, + { + "epoch": 2.891013384321224, + "grad_norm": 0.296875, + "learning_rate": 0.00019320038567416484, + "loss": 0.6409, + "step": 1512 + }, + { + "epoch": 2.898661567877629, + "grad_norm": 0.294921875, + "learning_rate": 0.00019270203387110798, + "loss": 0.5779, + "step": 1516 + }, + { + "epoch": 2.9063097514340344, + "grad_norm": 0.29296875, + "learning_rate": 0.00019220316862595167, + "loss": 0.5956, + "step": 1520 + }, + { + "epoch": 2.9139579349904396, + "grad_norm": 0.3046875, + "learning_rate": 0.00019170379593696802, + "loss": 0.5916, + "step": 1524 + }, + { + "epoch": 2.921606118546845, + "grad_norm": 0.2890625, + "learning_rate": 0.00019120392180853058, + "loss": 0.6069, + "step": 1528 + }, + { + "epoch": 2.92925430210325, + "grad_norm": 0.279296875, + "learning_rate": 0.0001907035522510421, + "loss": 0.6029, + "step": 1532 + }, + { + "epoch": 2.936902485659656, + "grad_norm": 0.310546875, + "learning_rate": 0.00019020269328086226, + "loss": 0.5706, + "step": 1536 + }, + { + "epoch": 2.9445506692160612, + "grad_norm": 0.29296875, + "learning_rate": 0.0001897013509202354, + "loss": 0.6024, + "step": 1540 + }, + { + "epoch": 2.9521988527724665, + "grad_norm": 0.287109375, + "learning_rate": 0.00018919953119721808, + "loss": 0.6326, + "step": 1544 + }, + { + "epoch": 2.959847036328872, + "grad_norm": 0.296875, + "learning_rate": 0.0001886972401456065, + "loss": 0.5744, + "step": 1548 + }, + { + "epoch": 2.967495219885277, + "grad_norm": 0.28515625, + "learning_rate": 0.00018819448380486413, + "loss": 0.5679, + "step": 1552 + }, + { + "epoch": 2.975143403441683, + "grad_norm": 0.27734375, + "learning_rate": 0.00018769126822004898, + "loss": 0.5992, + "step": 1556 + }, + { + "epoch": 2.982791586998088, + "grad_norm": 0.30078125, + "learning_rate": 0.00018718759944174086, + "loss": 0.5981, + "step": 1560 + }, + { + "epoch": 2.9904397705544934, + "grad_norm": 0.29296875, + "learning_rate": 0.0001866834835259688, + "loss": 0.6188, + "step": 1564 + }, + { + "epoch": 2.9980879541108987, + "grad_norm": 0.33984375, + "learning_rate": 0.0001861789265341381, + "loss": 0.617, + "step": 1568 + }, + { + "epoch": 3.005736137667304, + "grad_norm": 0.322265625, + "learning_rate": 0.00018567393453295742, + "loss": 0.4644, + "step": 1572 + }, + { + "epoch": 3.0133843212237093, + "grad_norm": 0.298828125, + "learning_rate": 0.00018516851359436602, + "loss": 0.4965, + "step": 1576 + }, + { + "epoch": 3.0210325047801145, + "grad_norm": 0.29296875, + "learning_rate": 0.00018466266979546057, + "loss": 0.501, + "step": 1580 + }, + { + "epoch": 3.0286806883365203, + "grad_norm": 0.28125, + "learning_rate": 0.0001841564092184221, + "loss": 0.4787, + "step": 1584 + }, + { + "epoch": 3.0363288718929256, + "grad_norm": 0.298828125, + "learning_rate": 0.00018364973795044294, + "loss": 0.5116, + "step": 1588 + }, + { + "epoch": 3.043977055449331, + "grad_norm": 0.28125, + "learning_rate": 0.00018314266208365357, + "loss": 0.5309, + "step": 1592 + }, + { + "epoch": 3.051625239005736, + "grad_norm": 0.30078125, + "learning_rate": 0.00018263518771504924, + "loss": 0.4979, + "step": 1596 + }, + { + "epoch": 3.0592734225621414, + "grad_norm": 0.296875, + "learning_rate": 0.00018212732094641666, + "loss": 0.4647, + "step": 1600 + }, + { + "epoch": 3.0669216061185467, + "grad_norm": 0.294921875, + "learning_rate": 0.00018161906788426076, + "loss": 0.5367, + "step": 1604 + }, + { + "epoch": 3.0745697896749524, + "grad_norm": 0.30078125, + "learning_rate": 0.00018111043463973122, + "loss": 0.5095, + "step": 1608 + }, + { + "epoch": 3.0822179732313577, + "grad_norm": 0.27734375, + "learning_rate": 0.00018060142732854894, + "loss": 0.4615, + "step": 1612 + }, + { + "epoch": 3.089866156787763, + "grad_norm": 0.267578125, + "learning_rate": 0.00018009205207093252, + "loss": 0.5105, + "step": 1616 + }, + { + "epoch": 3.0975143403441683, + "grad_norm": 0.287109375, + "learning_rate": 0.00017958231499152463, + "loss": 0.5326, + "step": 1620 + }, + { + "epoch": 3.1051625239005736, + "grad_norm": 0.3125, + "learning_rate": 0.0001790722222193186, + "loss": 0.5383, + "step": 1624 + }, + { + "epoch": 3.112810707456979, + "grad_norm": 0.296875, + "learning_rate": 0.00017856177988758438, + "loss": 0.5192, + "step": 1628 + }, + { + "epoch": 3.120458891013384, + "grad_norm": 0.3046875, + "learning_rate": 0.00017805099413379508, + "loss": 0.5029, + "step": 1632 + }, + { + "epoch": 3.12810707456979, + "grad_norm": 0.318359375, + "learning_rate": 0.00017753987109955297, + "loss": 0.4896, + "step": 1636 + }, + { + "epoch": 3.135755258126195, + "grad_norm": 0.310546875, + "learning_rate": 0.00017702841693051577, + "loss": 0.5254, + "step": 1640 + }, + { + "epoch": 3.1434034416826004, + "grad_norm": 0.330078125, + "learning_rate": 0.0001765166377763227, + "loss": 0.4829, + "step": 1644 + }, + { + "epoch": 3.1510516252390057, + "grad_norm": 0.3046875, + "learning_rate": 0.00017600453979052055, + "loss": 0.5461, + "step": 1648 + }, + { + "epoch": 3.158699808795411, + "grad_norm": 0.298828125, + "learning_rate": 0.0001754921291304897, + "loss": 0.5415, + "step": 1652 + }, + { + "epoch": 3.1663479923518163, + "grad_norm": 0.31640625, + "learning_rate": 0.00017497941195737004, + "loss": 0.5501, + "step": 1656 + }, + { + "epoch": 3.173996175908222, + "grad_norm": 0.314453125, + "learning_rate": 0.00017446639443598696, + "loss": 0.4964, + "step": 1660 + }, + { + "epoch": 3.1816443594646273, + "grad_norm": 0.3046875, + "learning_rate": 0.00017395308273477714, + "loss": 0.4938, + "step": 1664 + }, + { + "epoch": 3.1892925430210326, + "grad_norm": 0.30859375, + "learning_rate": 0.00017343948302571446, + "loss": 0.5409, + "step": 1668 + }, + { + "epoch": 3.196940726577438, + "grad_norm": 0.302734375, + "learning_rate": 0.00017292560148423578, + "loss": 0.4844, + "step": 1672 + }, + { + "epoch": 3.204588910133843, + "grad_norm": 0.3046875, + "learning_rate": 0.00017241144428916655, + "loss": 0.539, + "step": 1676 + }, + { + "epoch": 3.2122370936902485, + "grad_norm": 0.3046875, + "learning_rate": 0.00017189701762264687, + "loss": 0.4974, + "step": 1680 + }, + { + "epoch": 3.2198852772466537, + "grad_norm": 0.337890625, + "learning_rate": 0.0001713823276700567, + "loss": 0.5443, + "step": 1684 + }, + { + "epoch": 3.2275334608030595, + "grad_norm": 0.29296875, + "learning_rate": 0.00017086738061994176, + "loss": 0.4936, + "step": 1688 + }, + { + "epoch": 3.2351816443594648, + "grad_norm": 0.298828125, + "learning_rate": 0.00017035218266393918, + "loss": 0.5027, + "step": 1692 + }, + { + "epoch": 3.24282982791587, + "grad_norm": 0.326171875, + "learning_rate": 0.00016983673999670273, + "loss": 0.5352, + "step": 1696 + }, + { + "epoch": 3.2504780114722753, + "grad_norm": 0.310546875, + "learning_rate": 0.0001693210588158287, + "loss": 0.5147, + "step": 1700 + }, + { + "epoch": 3.2581261950286806, + "grad_norm": 0.296875, + "learning_rate": 0.00016880514532178123, + "loss": 0.5013, + "step": 1704 + }, + { + "epoch": 3.265774378585086, + "grad_norm": 0.3125, + "learning_rate": 0.00016828900571781767, + "loss": 0.5408, + "step": 1708 + }, + { + "epoch": 3.2734225621414916, + "grad_norm": 0.3046875, + "learning_rate": 0.00016777264620991414, + "loss": 0.4758, + "step": 1712 + }, + { + "epoch": 3.281070745697897, + "grad_norm": 0.3046875, + "learning_rate": 0.00016725607300669087, + "loss": 0.5154, + "step": 1716 + }, + { + "epoch": 3.288718929254302, + "grad_norm": 0.302734375, + "learning_rate": 0.0001667392923193375, + "loss": 0.4882, + "step": 1720 + }, + { + "epoch": 3.2963671128107075, + "grad_norm": 0.322265625, + "learning_rate": 0.00016622231036153836, + "loss": 0.5423, + "step": 1724 + }, + { + "epoch": 3.3040152963671128, + "grad_norm": 0.328125, + "learning_rate": 0.0001657051333493978, + "loss": 0.509, + "step": 1728 + }, + { + "epoch": 3.311663479923518, + "grad_norm": 0.349609375, + "learning_rate": 0.00016518776750136578, + "loss": 0.5447, + "step": 1732 + }, + { + "epoch": 3.3193116634799233, + "grad_norm": 0.33203125, + "learning_rate": 0.00016467021903816237, + "loss": 0.5048, + "step": 1736 + }, + { + "epoch": 3.3269598470363286, + "grad_norm": 0.287109375, + "learning_rate": 0.00016415249418270364, + "loss": 0.5183, + "step": 1740 + }, + { + "epoch": 3.3346080305927344, + "grad_norm": 0.3125, + "learning_rate": 0.00016363459916002643, + "loss": 0.4915, + "step": 1744 + }, + { + "epoch": 3.3422562141491396, + "grad_norm": 0.349609375, + "learning_rate": 0.00016311654019721377, + "loss": 0.5016, + "step": 1748 + }, + { + "epoch": 3.349904397705545, + "grad_norm": 0.30859375, + "learning_rate": 0.00016259832352331978, + "loss": 0.5276, + "step": 1752 + }, + { + "epoch": 3.35755258126195, + "grad_norm": 0.34375, + "learning_rate": 0.0001620799553692949, + "loss": 0.5436, + "step": 1756 + }, + { + "epoch": 3.3652007648183555, + "grad_norm": 0.29296875, + "learning_rate": 0.00016156144196791103, + "loss": 0.5152, + "step": 1760 + }, + { + "epoch": 3.3728489483747612, + "grad_norm": 0.28515625, + "learning_rate": 0.0001610427895536863, + "loss": 0.4845, + "step": 1764 + }, + { + "epoch": 3.3804971319311665, + "grad_norm": 0.33203125, + "learning_rate": 0.00016052400436281046, + "loss": 0.51, + "step": 1768 + }, + { + "epoch": 3.388145315487572, + "grad_norm": 0.306640625, + "learning_rate": 0.00016000509263306976, + "loss": 0.5163, + "step": 1772 + }, + { + "epoch": 3.395793499043977, + "grad_norm": 0.34765625, + "learning_rate": 0.0001594860606037719, + "loss": 0.559, + "step": 1776 + }, + { + "epoch": 3.4034416826003824, + "grad_norm": 0.3359375, + "learning_rate": 0.0001589669145156709, + "loss": 0.523, + "step": 1780 + }, + { + "epoch": 3.4110898661567877, + "grad_norm": 0.3359375, + "learning_rate": 0.00015844766061089241, + "loss": 0.4994, + "step": 1784 + }, + { + "epoch": 3.418738049713193, + "grad_norm": 0.32421875, + "learning_rate": 0.00015792830513285838, + "loss": 0.5259, + "step": 1788 + }, + { + "epoch": 3.4263862332695982, + "grad_norm": 0.314453125, + "learning_rate": 0.000157408854326212, + "loss": 0.4734, + "step": 1792 + }, + { + "epoch": 3.434034416826004, + "grad_norm": 0.330078125, + "learning_rate": 0.00015688931443674276, + "loss": 0.5163, + "step": 1796 + }, + { + "epoch": 3.4416826003824093, + "grad_norm": 0.337890625, + "learning_rate": 0.0001563696917113112, + "loss": 0.514, + "step": 1800 + }, + { + "epoch": 3.4493307839388145, + "grad_norm": 0.322265625, + "learning_rate": 0.00015584999239777393, + "loss": 0.5691, + "step": 1804 + }, + { + "epoch": 3.45697896749522, + "grad_norm": 0.3515625, + "learning_rate": 0.0001553302227449084, + "loss": 0.5365, + "step": 1808 + }, + { + "epoch": 3.464627151051625, + "grad_norm": 0.318359375, + "learning_rate": 0.0001548103890023378, + "loss": 0.5111, + "step": 1812 + }, + { + "epoch": 3.472275334608031, + "grad_norm": 0.318359375, + "learning_rate": 0.00015429049742045591, + "loss": 0.5272, + "step": 1816 + }, + { + "epoch": 3.479923518164436, + "grad_norm": 0.30078125, + "learning_rate": 0.000153770554250352, + "loss": 0.4746, + "step": 1820 + }, + { + "epoch": 3.4875717017208414, + "grad_norm": 0.328125, + "learning_rate": 0.00015325056574373564, + "loss": 0.5091, + "step": 1824 + }, + { + "epoch": 3.4952198852772467, + "grad_norm": 0.322265625, + "learning_rate": 0.00015273053815286153, + "loss": 0.5043, + "step": 1828 + }, + { + "epoch": 3.502868068833652, + "grad_norm": 0.310546875, + "learning_rate": 0.00015221047773045424, + "loss": 0.5157, + "step": 1832 + }, + { + "epoch": 3.5105162523900573, + "grad_norm": 0.310546875, + "learning_rate": 0.00015169039072963312, + "loss": 0.525, + "step": 1836 + }, + { + "epoch": 3.5181644359464626, + "grad_norm": 0.333984375, + "learning_rate": 0.00015117028340383713, + "loss": 0.536, + "step": 1840 + }, + { + "epoch": 3.525812619502868, + "grad_norm": 0.326171875, + "learning_rate": 0.00015065016200674963, + "loss": 0.556, + "step": 1844 + }, + { + "epoch": 3.5334608030592736, + "grad_norm": 0.345703125, + "learning_rate": 0.00015013003279222312, + "loss": 0.5199, + "step": 1848 + }, + { + "epoch": 3.541108986615679, + "grad_norm": 0.30859375, + "learning_rate": 0.0001496099020142041, + "loss": 0.5381, + "step": 1852 + }, + { + "epoch": 3.548757170172084, + "grad_norm": 0.33203125, + "learning_rate": 0.00014908977592665787, + "loss": 0.5092, + "step": 1856 + }, + { + "epoch": 3.5564053537284894, + "grad_norm": 0.328125, + "learning_rate": 0.00014856966078349339, + "loss": 0.5101, + "step": 1860 + }, + { + "epoch": 3.5640535372848947, + "grad_norm": 0.353515625, + "learning_rate": 0.00014804956283848793, + "loss": 0.5093, + "step": 1864 + }, + { + "epoch": 3.5717017208413004, + "grad_norm": 0.306640625, + "learning_rate": 0.00014752948834521206, + "loss": 0.499, + "step": 1868 + }, + { + "epoch": 3.5793499043977057, + "grad_norm": 0.33984375, + "learning_rate": 0.00014700944355695432, + "loss": 0.4342, + "step": 1872 + }, + { + "epoch": 3.586998087954111, + "grad_norm": 0.333984375, + "learning_rate": 0.00014648943472664612, + "loss": 0.541, + "step": 1876 + }, + { + "epoch": 3.5946462715105163, + "grad_norm": 0.349609375, + "learning_rate": 0.00014596946810678646, + "loss": 0.5089, + "step": 1880 + }, + { + "epoch": 3.6022944550669216, + "grad_norm": 0.30859375, + "learning_rate": 0.00014544954994936689, + "loss": 0.4995, + "step": 1884 + }, + { + "epoch": 3.609942638623327, + "grad_norm": 0.318359375, + "learning_rate": 0.0001449296865057962, + "loss": 0.5299, + "step": 1888 + }, + { + "epoch": 3.617590822179732, + "grad_norm": 0.337890625, + "learning_rate": 0.00014440988402682526, + "loss": 0.5933, + "step": 1892 + }, + { + "epoch": 3.6252390057361374, + "grad_norm": 0.3359375, + "learning_rate": 0.00014389014876247205, + "loss": 0.5045, + "step": 1896 + }, + { + "epoch": 3.632887189292543, + "grad_norm": 0.3125, + "learning_rate": 0.00014337048696194625, + "loss": 0.4814, + "step": 1900 + }, + { + "epoch": 3.6405353728489485, + "grad_norm": 0.32421875, + "learning_rate": 0.00014285090487357427, + "loss": 0.5416, + "step": 1904 + }, + { + "epoch": 3.6481835564053537, + "grad_norm": 0.32421875, + "learning_rate": 0.0001423314087447241, + "loss": 0.5236, + "step": 1908 + }, + { + "epoch": 3.655831739961759, + "grad_norm": 0.34765625, + "learning_rate": 0.00014181200482173015, + "loss": 0.5281, + "step": 1912 + }, + { + "epoch": 3.6634799235181643, + "grad_norm": 0.3046875, + "learning_rate": 0.00014129269934981802, + "loss": 0.5446, + "step": 1916 + }, + { + "epoch": 3.67112810707457, + "grad_norm": 0.359375, + "learning_rate": 0.00014077349857302983, + "loss": 0.4949, + "step": 1920 + }, + { + "epoch": 3.6787762906309753, + "grad_norm": 0.31640625, + "learning_rate": 0.00014025440873414863, + "loss": 0.4875, + "step": 1924 + }, + { + "epoch": 3.6864244741873806, + "grad_norm": 0.3203125, + "learning_rate": 0.0001397354360746237, + "loss": 0.528, + "step": 1928 + }, + { + "epoch": 3.694072657743786, + "grad_norm": 0.298828125, + "learning_rate": 0.0001392165868344953, + "loss": 0.4827, + "step": 1932 + }, + { + "epoch": 3.701720841300191, + "grad_norm": 0.337890625, + "learning_rate": 0.0001386978672523198, + "loss": 0.5073, + "step": 1936 + }, + { + "epoch": 3.7093690248565965, + "grad_norm": 0.3203125, + "learning_rate": 0.0001381792835650945, + "loss": 0.5036, + "step": 1940 + }, + { + "epoch": 3.7170172084130018, + "grad_norm": 0.30859375, + "learning_rate": 0.00013766084200818272, + "loss": 0.5396, + "step": 1944 + }, + { + "epoch": 3.724665391969407, + "grad_norm": 0.345703125, + "learning_rate": 0.0001371425488152389, + "loss": 0.4815, + "step": 1948 + }, + { + "epoch": 3.7323135755258128, + "grad_norm": 0.3203125, + "learning_rate": 0.0001366244102181335, + "loss": 0.5306, + "step": 1952 + }, + { + "epoch": 3.739961759082218, + "grad_norm": 0.34375, + "learning_rate": 0.00013610643244687826, + "loss": 0.5419, + "step": 1956 + }, + { + "epoch": 3.7476099426386233, + "grad_norm": 0.3203125, + "learning_rate": 0.00013558862172955105, + "loss": 0.5204, + "step": 1960 + }, + { + "epoch": 3.7552581261950286, + "grad_norm": 0.3203125, + "learning_rate": 0.00013507098429222115, + "loss": 0.4982, + "step": 1964 + }, + { + "epoch": 3.762906309751434, + "grad_norm": 0.3203125, + "learning_rate": 0.00013455352635887438, + "loss": 0.4667, + "step": 1968 + }, + { + "epoch": 3.7705544933078396, + "grad_norm": 0.34765625, + "learning_rate": 0.00013403625415133824, + "loss": 0.4302, + "step": 1972 + }, + { + "epoch": 3.778202676864245, + "grad_norm": 0.310546875, + "learning_rate": 0.00013351917388920704, + "loss": 0.4545, + "step": 1976 + }, + { + "epoch": 3.78585086042065, + "grad_norm": 0.337890625, + "learning_rate": 0.00013300229178976722, + "loss": 0.4953, + "step": 1980 + }, + { + "epoch": 3.7934990439770555, + "grad_norm": 0.3359375, + "learning_rate": 0.0001324856140679225, + "loss": 0.4966, + "step": 1984 + }, + { + "epoch": 3.801147227533461, + "grad_norm": 0.3203125, + "learning_rate": 0.0001319691469361193, + "loss": 0.5236, + "step": 1988 + }, + { + "epoch": 3.808795411089866, + "grad_norm": 0.328125, + "learning_rate": 0.00013145289660427173, + "loss": 0.5244, + "step": 1992 + }, + { + "epoch": 3.8164435946462714, + "grad_norm": 0.3359375, + "learning_rate": 0.00013093686927968738, + "loss": 0.4982, + "step": 1996 + }, + { + "epoch": 3.8240917782026767, + "grad_norm": 0.345703125, + "learning_rate": 0.00013042107116699228, + "loss": 0.4899, + "step": 2000 + }, + { + "epoch": 3.8317399617590824, + "grad_norm": 0.3125, + "learning_rate": 0.00012990550846805654, + "loss": 0.5296, + "step": 2004 + }, + { + "epoch": 3.8393881453154877, + "grad_norm": 0.322265625, + "learning_rate": 0.0001293901873819196, + "loss": 0.5331, + "step": 2008 + }, + { + "epoch": 3.847036328871893, + "grad_norm": 0.333984375, + "learning_rate": 0.00012887511410471589, + "loss": 0.501, + "step": 2012 + }, + { + "epoch": 3.8546845124282982, + "grad_norm": 0.328125, + "learning_rate": 0.00012836029482960018, + "loss": 0.5254, + "step": 2016 + }, + { + "epoch": 3.8623326959847035, + "grad_norm": 0.337890625, + "learning_rate": 0.00012784573574667316, + "loss": 0.5009, + "step": 2020 + }, + { + "epoch": 3.8699808795411093, + "grad_norm": 0.353515625, + "learning_rate": 0.00012733144304290697, + "loss": 0.5107, + "step": 2024 + }, + { + "epoch": 3.8776290630975145, + "grad_norm": 0.326171875, + "learning_rate": 0.0001268174229020709, + "loss": 0.5025, + "step": 2028 + }, + { + "epoch": 3.88527724665392, + "grad_norm": 0.318359375, + "learning_rate": 0.0001263036815046571, + "loss": 0.5239, + "step": 2032 + }, + { + "epoch": 3.892925430210325, + "grad_norm": 0.333984375, + "learning_rate": 0.00012579022502780596, + "loss": 0.5112, + "step": 2036 + }, + { + "epoch": 3.9005736137667304, + "grad_norm": 0.322265625, + "learning_rate": 0.00012527705964523209, + "loss": 0.5182, + "step": 2040 + }, + { + "epoch": 3.9082217973231357, + "grad_norm": 0.322265625, + "learning_rate": 0.00012476419152715007, + "loss": 0.5505, + "step": 2044 + }, + { + "epoch": 3.915869980879541, + "grad_norm": 0.3125, + "learning_rate": 0.00012425162684020024, + "loss": 0.4957, + "step": 2048 + }, + { + "epoch": 3.9235181644359463, + "grad_norm": 0.322265625, + "learning_rate": 0.0001237393717473745, + "loss": 0.5132, + "step": 2052 + }, + { + "epoch": 3.9311663479923515, + "grad_norm": 0.3125, + "learning_rate": 0.0001232274324079422, + "loss": 0.516, + "step": 2056 + }, + { + "epoch": 3.9388145315487573, + "grad_norm": 0.33203125, + "learning_rate": 0.00012271581497737619, + "loss": 0.5156, + "step": 2060 + }, + { + "epoch": 3.9464627151051626, + "grad_norm": 0.314453125, + "learning_rate": 0.00012220452560727875, + "loss": 0.4621, + "step": 2064 + }, + { + "epoch": 3.954110898661568, + "grad_norm": 0.33203125, + "learning_rate": 0.00012169357044530758, + "loss": 0.5206, + "step": 2068 + }, + { + "epoch": 3.961759082217973, + "grad_norm": 0.322265625, + "learning_rate": 0.0001211829556351019, + "loss": 0.4511, + "step": 2072 + }, + { + "epoch": 3.969407265774379, + "grad_norm": 0.328125, + "learning_rate": 0.00012067268731620861, + "loss": 0.5047, + "step": 2076 + }, + { + "epoch": 3.977055449330784, + "grad_norm": 0.3359375, + "learning_rate": 0.00012016277162400848, + "loss": 0.5295, + "step": 2080 + }, + { + "epoch": 3.9847036328871894, + "grad_norm": 0.333984375, + "learning_rate": 0.00011965321468964237, + "loss": 0.5204, + "step": 2084 + }, + { + "epoch": 3.9923518164435947, + "grad_norm": 0.333984375, + "learning_rate": 0.00011914402263993745, + "loss": 0.5064, + "step": 2088 + }, + { + "epoch": 4.0, + "grad_norm": 0.87109375, + "learning_rate": 0.00011863520159733357, + "loss": 0.53, + "step": 2092 + }, + { + "epoch": 4.007648183556405, + "grad_norm": 0.3046875, + "learning_rate": 0.00011812675767980972, + "loss": 0.4532, + "step": 2096 + }, + { + "epoch": 4.015296367112811, + "grad_norm": 0.3046875, + "learning_rate": 0.00011761869700081036, + "loss": 0.4748, + "step": 2100 + }, + { + "epoch": 4.022944550669216, + "grad_norm": 0.314453125, + "learning_rate": 0.00011711102566917194, + "loss": 0.4188, + "step": 2104 + }, + { + "epoch": 4.030592734225621, + "grad_norm": 0.32421875, + "learning_rate": 0.00011660374978904947, + "loss": 0.4466, + "step": 2108 + }, + { + "epoch": 4.038240917782026, + "grad_norm": 0.32421875, + "learning_rate": 0.00011609687545984315, + "loss": 0.4112, + "step": 2112 + }, + { + "epoch": 4.045889101338432, + "grad_norm": 0.345703125, + "learning_rate": 0.00011559040877612497, + "loss": 0.4566, + "step": 2116 + }, + { + "epoch": 4.053537284894838, + "grad_norm": 0.33203125, + "learning_rate": 0.00011508435582756545, + "loss": 0.4413, + "step": 2120 + }, + { + "epoch": 4.061185468451243, + "grad_norm": 0.34765625, + "learning_rate": 0.00011457872269886043, + "loss": 0.4435, + "step": 2124 + }, + { + "epoch": 4.0688336520076485, + "grad_norm": 0.341796875, + "learning_rate": 0.00011407351546965796, + "loss": 0.4568, + "step": 2128 + }, + { + "epoch": 4.076481835564054, + "grad_norm": 0.328125, + "learning_rate": 0.00011356874021448506, + "loss": 0.4247, + "step": 2132 + }, + { + "epoch": 4.084130019120459, + "grad_norm": 0.302734375, + "learning_rate": 0.00011306440300267482, + "loss": 0.3762, + "step": 2136 + }, + { + "epoch": 4.091778202676864, + "grad_norm": 0.337890625, + "learning_rate": 0.00011256050989829337, + "loss": 0.4713, + "step": 2140 + }, + { + "epoch": 4.09942638623327, + "grad_norm": 0.3203125, + "learning_rate": 0.00011205706696006698, + "loss": 0.4178, + "step": 2144 + }, + { + "epoch": 4.107074569789675, + "grad_norm": 0.353515625, + "learning_rate": 0.00011155408024130921, + "loss": 0.4266, + "step": 2148 + }, + { + "epoch": 4.11472275334608, + "grad_norm": 0.31640625, + "learning_rate": 0.00011105155578984795, + "loss": 0.4242, + "step": 2152 + }, + { + "epoch": 4.1223709369024855, + "grad_norm": 0.310546875, + "learning_rate": 0.00011054949964795307, + "loss": 0.464, + "step": 2156 + }, + { + "epoch": 4.130019120458891, + "grad_norm": 0.31640625, + "learning_rate": 0.00011004791785226347, + "loss": 0.4216, + "step": 2160 + }, + { + "epoch": 4.137667304015296, + "grad_norm": 0.34375, + "learning_rate": 0.00010954681643371462, + "loss": 0.4222, + "step": 2164 + }, + { + "epoch": 4.145315487571701, + "grad_norm": 0.3203125, + "learning_rate": 0.00010904620141746601, + "loss": 0.4321, + "step": 2168 + }, + { + "epoch": 4.1529636711281075, + "grad_norm": 0.330078125, + "learning_rate": 0.0001085460788228287, + "loss": 0.441, + "step": 2172 + }, + { + "epoch": 4.160611854684513, + "grad_norm": 0.330078125, + "learning_rate": 0.00010804645466319292, + "loss": 0.4468, + "step": 2176 + }, + { + "epoch": 4.168260038240918, + "grad_norm": 0.33203125, + "learning_rate": 0.0001075473349459559, + "loss": 0.3948, + "step": 2180 + }, + { + "epoch": 4.175908221797323, + "grad_norm": 0.33203125, + "learning_rate": 0.00010704872567244948, + "loss": 0.4233, + "step": 2184 + }, + { + "epoch": 4.183556405353729, + "grad_norm": 0.34765625, + "learning_rate": 0.00010655063283786795, + "loss": 0.4227, + "step": 2188 + }, + { + "epoch": 4.191204588910134, + "grad_norm": 0.337890625, + "learning_rate": 0.00010605306243119617, + "loss": 0.4242, + "step": 2192 + }, + { + "epoch": 4.198852772466539, + "grad_norm": 0.337890625, + "learning_rate": 0.00010555602043513724, + "loss": 0.4428, + "step": 2196 + }, + { + "epoch": 4.2065009560229445, + "grad_norm": 0.34375, + "learning_rate": 0.00010505951282604088, + "loss": 0.4132, + "step": 2200 + }, + { + "epoch": 4.21414913957935, + "grad_norm": 0.33984375, + "learning_rate": 0.00010456354557383139, + "loss": 0.4046, + "step": 2204 + }, + { + "epoch": 4.221797323135755, + "grad_norm": 0.345703125, + "learning_rate": 0.00010406812464193584, + "loss": 0.4527, + "step": 2208 + }, + { + "epoch": 4.22944550669216, + "grad_norm": 0.33203125, + "learning_rate": 0.00010357325598721255, + "loss": 0.4107, + "step": 2212 + }, + { + "epoch": 4.237093690248566, + "grad_norm": 0.31640625, + "learning_rate": 0.00010307894555987927, + "loss": 0.4375, + "step": 2216 + }, + { + "epoch": 4.244741873804971, + "grad_norm": 0.328125, + "learning_rate": 0.00010258519930344179, + "loss": 0.4328, + "step": 2220 + }, + { + "epoch": 4.252390057361376, + "grad_norm": 0.34375, + "learning_rate": 0.0001020920231546223, + "loss": 0.4388, + "step": 2224 + }, + { + "epoch": 4.260038240917782, + "grad_norm": 0.33984375, + "learning_rate": 0.00010159942304328819, + "loss": 0.4443, + "step": 2228 + }, + { + "epoch": 4.267686424474188, + "grad_norm": 0.361328125, + "learning_rate": 0.00010110740489238066, + "loss": 0.4446, + "step": 2232 + }, + { + "epoch": 4.275334608030593, + "grad_norm": 0.341796875, + "learning_rate": 0.00010061597461784346, + "loss": 0.4269, + "step": 2236 + }, + { + "epoch": 4.282982791586998, + "grad_norm": 0.33984375, + "learning_rate": 0.00010012513812855191, + "loss": 0.4326, + "step": 2240 + }, + { + "epoch": 4.2906309751434035, + "grad_norm": 0.35546875, + "learning_rate": 9.963490132624169e-05, + "loss": 0.4342, + "step": 2244 + }, + { + "epoch": 4.298279158699809, + "grad_norm": 0.3515625, + "learning_rate": 9.914527010543795e-05, + "loss": 0.4157, + "step": 2248 + }, + { + "epoch": 4.305927342256214, + "grad_norm": 0.3359375, + "learning_rate": 9.865625035338447e-05, + "loss": 0.4237, + "step": 2252 + }, + { + "epoch": 4.313575525812619, + "grad_norm": 0.314453125, + "learning_rate": 9.816784794997275e-05, + "loss": 0.4033, + "step": 2256 + }, + { + "epoch": 4.321223709369025, + "grad_norm": 0.359375, + "learning_rate": 9.76800687676715e-05, + "loss": 0.4436, + "step": 2260 + }, + { + "epoch": 4.32887189292543, + "grad_norm": 0.3359375, + "learning_rate": 9.719291867145583e-05, + "loss": 0.4384, + "step": 2264 + }, + { + "epoch": 4.336520076481835, + "grad_norm": 0.341796875, + "learning_rate": 9.670640351873688e-05, + "loss": 0.4512, + "step": 2268 + }, + { + "epoch": 4.3441682600382405, + "grad_norm": 0.34375, + "learning_rate": 9.62205291592913e-05, + "loss": 0.4259, + "step": 2272 + }, + { + "epoch": 4.351816443594647, + "grad_norm": 0.337890625, + "learning_rate": 9.573530143519098e-05, + "loss": 0.4178, + "step": 2276 + }, + { + "epoch": 4.359464627151052, + "grad_norm": 0.3671875, + "learning_rate": 9.525072618073277e-05, + "loss": 0.3608, + "step": 2280 + }, + { + "epoch": 4.367112810707457, + "grad_norm": 0.32421875, + "learning_rate": 9.476680922236831e-05, + "loss": 0.4489, + "step": 2284 + }, + { + "epoch": 4.374760994263863, + "grad_norm": 0.330078125, + "learning_rate": 9.428355637863402e-05, + "loss": 0.4346, + "step": 2288 + }, + { + "epoch": 4.382409177820268, + "grad_norm": 0.33203125, + "learning_rate": 9.380097346008112e-05, + "loss": 0.4542, + "step": 2292 + }, + { + "epoch": 4.390057361376673, + "grad_norm": 0.341796875, + "learning_rate": 9.331906626920576e-05, + "loss": 0.4395, + "step": 2296 + }, + { + "epoch": 4.397705544933078, + "grad_norm": 0.341796875, + "learning_rate": 9.283784060037921e-05, + "loss": 0.4858, + "step": 2300 + }, + { + "epoch": 4.405353728489484, + "grad_norm": 0.30859375, + "learning_rate": 9.235730223977837e-05, + "loss": 0.4148, + "step": 2304 + }, + { + "epoch": 4.413001912045889, + "grad_norm": 0.36328125, + "learning_rate": 9.187745696531584e-05, + "loss": 0.4579, + "step": 2308 + }, + { + "epoch": 4.420650095602294, + "grad_norm": 0.357421875, + "learning_rate": 9.139831054657081e-05, + "loss": 0.457, + "step": 2312 + }, + { + "epoch": 4.4282982791587, + "grad_norm": 0.341796875, + "learning_rate": 9.091986874471956e-05, + "loss": 0.4257, + "step": 2316 + }, + { + "epoch": 4.435946462715105, + "grad_norm": 0.3515625, + "learning_rate": 9.044213731246614e-05, + "loss": 0.4287, + "step": 2320 + }, + { + "epoch": 4.44359464627151, + "grad_norm": 0.3359375, + "learning_rate": 8.99651219939732e-05, + "loss": 0.4482, + "step": 2324 + }, + { + "epoch": 4.451242829827915, + "grad_norm": 0.345703125, + "learning_rate": 8.948882852479305e-05, + "loss": 0.4772, + "step": 2328 + }, + { + "epoch": 4.458891013384322, + "grad_norm": 0.33203125, + "learning_rate": 8.901326263179851e-05, + "loss": 0.4067, + "step": 2332 + }, + { + "epoch": 4.466539196940727, + "grad_norm": 0.373046875, + "learning_rate": 8.85384300331142e-05, + "loss": 0.4399, + "step": 2336 + }, + { + "epoch": 4.474187380497132, + "grad_norm": 0.3671875, + "learning_rate": 8.80643364380477e-05, + "loss": 0.4726, + "step": 2340 + }, + { + "epoch": 4.4818355640535374, + "grad_norm": 0.349609375, + "learning_rate": 8.759098754702099e-05, + "loss": 0.4514, + "step": 2344 + }, + { + "epoch": 4.489483747609943, + "grad_norm": 0.36328125, + "learning_rate": 8.711838905150179e-05, + "loss": 0.4502, + "step": 2348 + }, + { + "epoch": 4.497131931166348, + "grad_norm": 0.328125, + "learning_rate": 8.664654663393516e-05, + "loss": 0.4366, + "step": 2352 + }, + { + "epoch": 4.504780114722753, + "grad_norm": 0.341796875, + "learning_rate": 8.617546596767534e-05, + "loss": 0.437, + "step": 2356 + }, + { + "epoch": 4.512428298279159, + "grad_norm": 0.337890625, + "learning_rate": 8.570515271691723e-05, + "loss": 0.4313, + "step": 2360 + }, + { + "epoch": 4.520076481835564, + "grad_norm": 0.34765625, + "learning_rate": 8.523561253662864e-05, + "loss": 0.447, + "step": 2364 + }, + { + "epoch": 4.527724665391969, + "grad_norm": 0.357421875, + "learning_rate": 8.476685107248197e-05, + "loss": 0.4488, + "step": 2368 + }, + { + "epoch": 4.5353728489483744, + "grad_norm": 0.33203125, + "learning_rate": 8.429887396078655e-05, + "loss": 0.469, + "step": 2372 + }, + { + "epoch": 4.54302103250478, + "grad_norm": 0.3515625, + "learning_rate": 8.38316868284207e-05, + "loss": 0.4886, + "step": 2376 + }, + { + "epoch": 4.550669216061186, + "grad_norm": 0.341796875, + "learning_rate": 8.336529529276421e-05, + "loss": 0.433, + "step": 2380 + }, + { + "epoch": 4.558317399617591, + "grad_norm": 0.33984375, + "learning_rate": 8.289970496163085e-05, + "loss": 0.4029, + "step": 2384 + }, + { + "epoch": 4.5659655831739965, + "grad_norm": 0.322265625, + "learning_rate": 8.243492143320058e-05, + "loss": 0.4198, + "step": 2388 + }, + { + "epoch": 4.573613766730402, + "grad_norm": 0.341796875, + "learning_rate": 8.197095029595276e-05, + "loss": 0.4377, + "step": 2392 + }, + { + "epoch": 4.581261950286807, + "grad_norm": 0.359375, + "learning_rate": 8.150779712859854e-05, + "loss": 0.4263, + "step": 2396 + }, + { + "epoch": 4.588910133843212, + "grad_norm": 0.349609375, + "learning_rate": 8.104546750001402e-05, + "loss": 0.4784, + "step": 2400 + }, + { + "epoch": 4.596558317399618, + "grad_norm": 0.337890625, + "learning_rate": 8.05839669691732e-05, + "loss": 0.4549, + "step": 2404 + }, + { + "epoch": 4.604206500956023, + "grad_norm": 0.33203125, + "learning_rate": 8.01233010850811e-05, + "loss": 0.4519, + "step": 2408 + }, + { + "epoch": 4.611854684512428, + "grad_norm": 0.37109375, + "learning_rate": 7.966347538670712e-05, + "loss": 0.4242, + "step": 2412 + }, + { + "epoch": 4.6195028680688335, + "grad_norm": 0.333984375, + "learning_rate": 7.92044954029184e-05, + "loss": 0.4647, + "step": 2416 + }, + { + "epoch": 4.627151051625239, + "grad_norm": 0.341796875, + "learning_rate": 7.874636665241335e-05, + "loss": 0.46, + "step": 2420 + }, + { + "epoch": 4.634799235181644, + "grad_norm": 0.34375, + "learning_rate": 7.828909464365531e-05, + "loss": 0.4169, + "step": 2424 + }, + { + "epoch": 4.642447418738049, + "grad_norm": 0.349609375, + "learning_rate": 7.783268487480626e-05, + "loss": 0.4251, + "step": 2428 + }, + { + "epoch": 4.650095602294455, + "grad_norm": 0.337890625, + "learning_rate": 7.73771428336608e-05, + "loss": 0.4232, + "step": 2432 + }, + { + "epoch": 4.657743785850861, + "grad_norm": 0.3671875, + "learning_rate": 7.692247399758008e-05, + "loss": 0.4836, + "step": 2436 + }, + { + "epoch": 4.665391969407266, + "grad_norm": 0.341796875, + "learning_rate": 7.6468683833426e-05, + "loss": 0.3917, + "step": 2440 + }, + { + "epoch": 4.673040152963671, + "grad_norm": 0.345703125, + "learning_rate": 7.601577779749545e-05, + "loss": 0.4153, + "step": 2444 + }, + { + "epoch": 4.680688336520077, + "grad_norm": 0.33984375, + "learning_rate": 7.55637613354547e-05, + "loss": 0.4103, + "step": 2448 + }, + { + "epoch": 4.688336520076482, + "grad_norm": 0.34765625, + "learning_rate": 7.511263988227397e-05, + "loss": 0.4858, + "step": 2452 + }, + { + "epoch": 4.695984703632887, + "grad_norm": 0.369140625, + "learning_rate": 7.466241886216198e-05, + "loss": 0.468, + "step": 2456 + }, + { + "epoch": 4.7036328871892925, + "grad_norm": 0.34765625, + "learning_rate": 7.421310368850085e-05, + "loss": 0.4817, + "step": 2460 + }, + { + "epoch": 4.711281070745698, + "grad_norm": 0.35546875, + "learning_rate": 7.376469976378094e-05, + "loss": 0.3923, + "step": 2464 + }, + { + "epoch": 4.718929254302103, + "grad_norm": 0.337890625, + "learning_rate": 7.33172124795359e-05, + "loss": 0.3954, + "step": 2468 + }, + { + "epoch": 4.726577437858508, + "grad_norm": 0.345703125, + "learning_rate": 7.287064721627782e-05, + "loss": 0.4606, + "step": 2472 + }, + { + "epoch": 4.734225621414914, + "grad_norm": 0.357421875, + "learning_rate": 7.242500934343262e-05, + "loss": 0.4253, + "step": 2476 + }, + { + "epoch": 4.741873804971319, + "grad_norm": 0.3671875, + "learning_rate": 7.19803042192754e-05, + "loss": 0.4431, + "step": 2480 + }, + { + "epoch": 4.749521988527725, + "grad_norm": 0.322265625, + "learning_rate": 7.153653719086604e-05, + "loss": 0.4097, + "step": 2484 + }, + { + "epoch": 4.75717017208413, + "grad_norm": 0.322265625, + "learning_rate": 7.109371359398493e-05, + "loss": 0.4208, + "step": 2488 + }, + { + "epoch": 4.764818355640536, + "grad_norm": 0.345703125, + "learning_rate": 7.06518387530688e-05, + "loss": 0.4396, + "step": 2492 + }, + { + "epoch": 4.772466539196941, + "grad_norm": 0.328125, + "learning_rate": 7.021091798114667e-05, + "loss": 0.4212, + "step": 2496 + }, + { + "epoch": 4.780114722753346, + "grad_norm": 0.33203125, + "learning_rate": 6.977095657977603e-05, + "loss": 0.4742, + "step": 2500 + }, + { + "epoch": 4.7877629063097515, + "grad_norm": 0.341796875, + "learning_rate": 6.933195983897905e-05, + "loss": 0.4198, + "step": 2504 + }, + { + "epoch": 4.795411089866157, + "grad_norm": 0.361328125, + "learning_rate": 6.889393303717898e-05, + "loss": 0.4401, + "step": 2508 + }, + { + "epoch": 4.803059273422562, + "grad_norm": 0.34765625, + "learning_rate": 6.845688144113663e-05, + "loss": 0.4273, + "step": 2512 + }, + { + "epoch": 4.810707456978967, + "grad_norm": 0.322265625, + "learning_rate": 6.802081030588722e-05, + "loss": 0.4589, + "step": 2516 + }, + { + "epoch": 4.818355640535373, + "grad_norm": 0.34765625, + "learning_rate": 6.758572487467698e-05, + "loss": 0.4835, + "step": 2520 + }, + { + "epoch": 4.826003824091778, + "grad_norm": 0.34375, + "learning_rate": 6.715163037890021e-05, + "loss": 0.4493, + "step": 2524 + }, + { + "epoch": 4.833652007648183, + "grad_norm": 0.3515625, + "learning_rate": 6.671853203803641e-05, + "loss": 0.4032, + "step": 2528 + }, + { + "epoch": 4.8413001912045885, + "grad_norm": 0.34765625, + "learning_rate": 6.628643505958742e-05, + "loss": 0.4498, + "step": 2532 + }, + { + "epoch": 4.848948374760994, + "grad_norm": 0.34375, + "learning_rate": 6.585534463901493e-05, + "loss": 0.4255, + "step": 2536 + }, + { + "epoch": 4.8565965583174, + "grad_norm": 0.388671875, + "learning_rate": 6.542526595967795e-05, + "loss": 0.4285, + "step": 2540 + }, + { + "epoch": 4.864244741873805, + "grad_norm": 0.337890625, + "learning_rate": 6.499620419277036e-05, + "loss": 0.4507, + "step": 2544 + }, + { + "epoch": 4.871892925430211, + "grad_norm": 0.333984375, + "learning_rate": 6.456816449725892e-05, + "loss": 0.4196, + "step": 2548 + }, + { + "epoch": 4.879541108986616, + "grad_norm": 0.345703125, + "learning_rate": 6.414115201982134e-05, + "loss": 0.4061, + "step": 2552 + }, + { + "epoch": 4.887189292543021, + "grad_norm": 0.33984375, + "learning_rate": 6.371517189478403e-05, + "loss": 0.4199, + "step": 2556 + }, + { + "epoch": 4.894837476099426, + "grad_norm": 0.3515625, + "learning_rate": 6.329022924406061e-05, + "loss": 0.452, + "step": 2560 + }, + { + "epoch": 4.902485659655832, + "grad_norm": 0.345703125, + "learning_rate": 6.286632917709031e-05, + "loss": 0.4516, + "step": 2564 + }, + { + "epoch": 4.910133843212237, + "grad_norm": 0.33984375, + "learning_rate": 6.244347679077651e-05, + "loss": 0.4622, + "step": 2568 + }, + { + "epoch": 4.917782026768642, + "grad_norm": 0.333984375, + "learning_rate": 6.202167716942543e-05, + "loss": 0.4387, + "step": 2572 + }, + { + "epoch": 4.925430210325048, + "grad_norm": 0.35546875, + "learning_rate": 6.160093538468505e-05, + "loss": 0.4064, + "step": 2576 + }, + { + "epoch": 4.933078393881453, + "grad_norm": 0.3671875, + "learning_rate": 6.118125649548405e-05, + "loss": 0.4728, + "step": 2580 + }, + { + "epoch": 4.940726577437858, + "grad_norm": 0.35546875, + "learning_rate": 6.076264554797112e-05, + "loss": 0.4555, + "step": 2584 + }, + { + "epoch": 4.948374760994264, + "grad_norm": 0.337890625, + "learning_rate": 6.0345107575454105e-05, + "loss": 0.427, + "step": 2588 + }, + { + "epoch": 4.95602294455067, + "grad_norm": 0.341796875, + "learning_rate": 5.992864759833963e-05, + "loss": 0.3841, + "step": 2592 + }, + { + "epoch": 4.963671128107075, + "grad_norm": 0.330078125, + "learning_rate": 5.9513270624072655e-05, + "loss": 0.4289, + "step": 2596 + }, + { + "epoch": 4.97131931166348, + "grad_norm": 0.353515625, + "learning_rate": 5.9098981647076345e-05, + "loss": 0.4538, + "step": 2600 + }, + { + "epoch": 4.9789674952198855, + "grad_norm": 0.33203125, + "learning_rate": 5.8685785648691894e-05, + "loss": 0.3954, + "step": 2604 + }, + { + "epoch": 4.986615678776291, + "grad_norm": 0.34765625, + "learning_rate": 5.8273687597118765e-05, + "loss": 0.4614, + "step": 2608 + }, + { + "epoch": 4.994263862332696, + "grad_norm": 0.3359375, + "learning_rate": 5.786269244735488e-05, + "loss": 0.4429, + "step": 2612 + }, + { + "epoch": 5.001912045889101, + "grad_norm": 0.30859375, + "learning_rate": 5.7452805141137034e-05, + "loss": 0.3411, + "step": 2616 + }, + { + "epoch": 5.009560229445507, + "grad_norm": 0.318359375, + "learning_rate": 5.704403060688158e-05, + "loss": 0.3973, + "step": 2620 + }, + { + "epoch": 5.017208413001912, + "grad_norm": 0.314453125, + "learning_rate": 5.663637375962489e-05, + "loss": 0.4319, + "step": 2624 + }, + { + "epoch": 5.024856596558317, + "grad_norm": 0.3203125, + "learning_rate": 5.6229839500964635e-05, + "loss": 0.3768, + "step": 2628 + }, + { + "epoch": 5.0325047801147225, + "grad_norm": 0.314453125, + "learning_rate": 5.582443271900063e-05, + "loss": 0.3683, + "step": 2632 + }, + { + "epoch": 5.040152963671128, + "grad_norm": 0.35546875, + "learning_rate": 5.542015828827609e-05, + "loss": 0.4145, + "step": 2636 + }, + { + "epoch": 5.047801147227533, + "grad_norm": 0.35546875, + "learning_rate": 5.5017021069719014e-05, + "loss": 0.4116, + "step": 2640 + }, + { + "epoch": 5.055449330783939, + "grad_norm": 0.3359375, + "learning_rate": 5.4615025910583756e-05, + "loss": 0.4069, + "step": 2644 + }, + { + "epoch": 5.0630975143403445, + "grad_norm": 0.35546875, + "learning_rate": 5.421417764439276e-05, + "loss": 0.375, + "step": 2648 + }, + { + "epoch": 5.07074569789675, + "grad_norm": 0.333984375, + "learning_rate": 5.3814481090878374e-05, + "loss": 0.3576, + "step": 2652 + }, + { + "epoch": 5.078393881453155, + "grad_norm": 0.337890625, + "learning_rate": 5.3415941055924974e-05, + "loss": 0.4027, + "step": 2656 + }, + { + "epoch": 5.08604206500956, + "grad_norm": 0.328125, + "learning_rate": 5.301856233151123e-05, + "loss": 0.3981, + "step": 2660 + }, + { + "epoch": 5.093690248565966, + "grad_norm": 0.318359375, + "learning_rate": 5.262234969565226e-05, + "loss": 0.3928, + "step": 2664 + }, + { + "epoch": 5.101338432122371, + "grad_norm": 0.33203125, + "learning_rate": 5.222730791234246e-05, + "loss": 0.3813, + "step": 2668 + }, + { + "epoch": 5.108986615678776, + "grad_norm": 0.3671875, + "learning_rate": 5.183344173149798e-05, + "loss": 0.4151, + "step": 2672 + }, + { + "epoch": 5.1166347992351815, + "grad_norm": 0.35546875, + "learning_rate": 5.14407558888998e-05, + "loss": 0.4051, + "step": 2676 + }, + { + "epoch": 5.124282982791587, + "grad_norm": 0.330078125, + "learning_rate": 5.104925510613668e-05, + "loss": 0.3973, + "step": 2680 + }, + { + "epoch": 5.131931166347992, + "grad_norm": 0.3203125, + "learning_rate": 5.0658944090548436e-05, + "loss": 0.3958, + "step": 2684 + }, + { + "epoch": 5.139579349904397, + "grad_norm": 0.34765625, + "learning_rate": 5.0269827535169306e-05, + "loss": 0.3731, + "step": 2688 + }, + { + "epoch": 5.147227533460803, + "grad_norm": 0.33984375, + "learning_rate": 4.988191011867153e-05, + "loss": 0.4028, + "step": 2692 + }, + { + "epoch": 5.154875717017209, + "grad_norm": 0.36328125, + "learning_rate": 4.9495196505309196e-05, + "loss": 0.4201, + "step": 2696 + }, + { + "epoch": 5.162523900573614, + "grad_norm": 0.345703125, + "learning_rate": 4.9109691344861886e-05, + "loss": 0.3815, + "step": 2700 + }, + { + "epoch": 5.170172084130019, + "grad_norm": 0.326171875, + "learning_rate": 4.8725399272579075e-05, + "loss": 0.4043, + "step": 2704 + }, + { + "epoch": 5.177820267686425, + "grad_norm": 0.330078125, + "learning_rate": 4.8342324909124256e-05, + "loss": 0.3828, + "step": 2708 + }, + { + "epoch": 5.18546845124283, + "grad_norm": 0.34765625, + "learning_rate": 4.7960472860519365e-05, + "loss": 0.4199, + "step": 2712 + }, + { + "epoch": 5.193116634799235, + "grad_norm": 0.330078125, + "learning_rate": 4.757984771808947e-05, + "loss": 0.4071, + "step": 2716 + }, + { + "epoch": 5.2007648183556405, + "grad_norm": 0.34765625, + "learning_rate": 4.72004540584075e-05, + "loss": 0.4288, + "step": 2720 + }, + { + "epoch": 5.208413001912046, + "grad_norm": 0.353515625, + "learning_rate": 4.682229644323922e-05, + "loss": 0.3935, + "step": 2724 + }, + { + "epoch": 5.216061185468451, + "grad_norm": 0.37109375, + "learning_rate": 4.6445379419488436e-05, + "loss": 0.3801, + "step": 2728 + }, + { + "epoch": 5.223709369024856, + "grad_norm": 0.33203125, + "learning_rate": 4.606970751914229e-05, + "loss": 0.4181, + "step": 2732 + }, + { + "epoch": 5.231357552581262, + "grad_norm": 0.341796875, + "learning_rate": 4.569528525921672e-05, + "loss": 0.3816, + "step": 2736 + }, + { + "epoch": 5.239005736137667, + "grad_norm": 0.349609375, + "learning_rate": 4.532211714170229e-05, + "loss": 0.4331, + "step": 2740 + }, + { + "epoch": 5.246653919694072, + "grad_norm": 0.365234375, + "learning_rate": 4.495020765350988e-05, + "loss": 0.4331, + "step": 2744 + }, + { + "epoch": 5.254302103250478, + "grad_norm": 0.359375, + "learning_rate": 4.4579561266416855e-05, + "loss": 0.3889, + "step": 2748 + }, + { + "epoch": 5.261950286806884, + "grad_norm": 0.328125, + "learning_rate": 4.421018243701327e-05, + "loss": 0.3804, + "step": 2752 + }, + { + "epoch": 5.269598470363289, + "grad_norm": 0.55859375, + "learning_rate": 4.384207560664825e-05, + "loss": 0.4386, + "step": 2756 + }, + { + "epoch": 5.277246653919694, + "grad_norm": 0.34375, + "learning_rate": 4.347524520137667e-05, + "loss": 0.3547, + "step": 2760 + }, + { + "epoch": 5.2848948374761, + "grad_norm": 0.337890625, + "learning_rate": 4.310969563190578e-05, + "loss": 0.3719, + "step": 2764 + }, + { + "epoch": 5.292543021032505, + "grad_norm": 0.328125, + "learning_rate": 4.274543129354245e-05, + "loss": 0.3611, + "step": 2768 + }, + { + "epoch": 5.30019120458891, + "grad_norm": 0.3125, + "learning_rate": 4.2382456566139985e-05, + "loss": 0.3173, + "step": 2772 + }, + { + "epoch": 5.307839388145315, + "grad_norm": 0.345703125, + "learning_rate": 4.202077581404574e-05, + "loss": 0.3848, + "step": 2776 + }, + { + "epoch": 5.315487571701721, + "grad_norm": 0.330078125, + "learning_rate": 4.166039338604838e-05, + "loss": 0.3921, + "step": 2780 + }, + { + "epoch": 5.323135755258126, + "grad_norm": 0.318359375, + "learning_rate": 4.130131361532586e-05, + "loss": 0.4049, + "step": 2784 + }, + { + "epoch": 5.330783938814531, + "grad_norm": 0.35546875, + "learning_rate": 4.094354081939317e-05, + "loss": 0.3866, + "step": 2788 + }, + { + "epoch": 5.338432122370937, + "grad_norm": 0.365234375, + "learning_rate": 4.058707930005048e-05, + "loss": 0.3727, + "step": 2792 + }, + { + "epoch": 5.346080305927342, + "grad_norm": 0.341796875, + "learning_rate": 4.023193334333132e-05, + "loss": 0.3957, + "step": 2796 + }, + { + "epoch": 5.353728489483748, + "grad_norm": 0.341796875, + "learning_rate": 3.9878107219451206e-05, + "loss": 0.344, + "step": 2800 + }, + { + "epoch": 5.361376673040153, + "grad_norm": 0.375, + "learning_rate": 3.9525605182756134e-05, + "loss": 0.4048, + "step": 2804 + }, + { + "epoch": 5.369024856596559, + "grad_norm": 0.349609375, + "learning_rate": 3.917443147167152e-05, + "loss": 0.4343, + "step": 2808 + }, + { + "epoch": 5.376673040152964, + "grad_norm": 0.345703125, + "learning_rate": 3.882459030865124e-05, + "loss": 0.3908, + "step": 2812 + }, + { + "epoch": 5.384321223709369, + "grad_norm": 0.34765625, + "learning_rate": 3.8476085900126776e-05, + "loss": 0.3491, + "step": 2816 + }, + { + "epoch": 5.3919694072657744, + "grad_norm": 0.3515625, + "learning_rate": 3.8128922436456766e-05, + "loss": 0.4092, + "step": 2820 + }, + { + "epoch": 5.39961759082218, + "grad_norm": 0.375, + "learning_rate": 3.7783104091876524e-05, + "loss": 0.3766, + "step": 2824 + }, + { + "epoch": 5.407265774378585, + "grad_norm": 0.34375, + "learning_rate": 3.743863502444783e-05, + "loss": 0.3835, + "step": 2828 + }, + { + "epoch": 5.41491395793499, + "grad_norm": 0.33203125, + "learning_rate": 3.709551937600909e-05, + "loss": 0.3733, + "step": 2832 + }, + { + "epoch": 5.422562141491396, + "grad_norm": 0.34375, + "learning_rate": 3.675376127212532e-05, + "loss": 0.4127, + "step": 2836 + }, + { + "epoch": 5.430210325047801, + "grad_norm": 0.361328125, + "learning_rate": 3.64133648220387e-05, + "loss": 0.4091, + "step": 2840 + }, + { + "epoch": 5.437858508604206, + "grad_norm": 0.365234375, + "learning_rate": 3.607433411861912e-05, + "loss": 0.4612, + "step": 2844 + }, + { + "epoch": 5.4455066921606115, + "grad_norm": 0.330078125, + "learning_rate": 3.5736673238314914e-05, + "loss": 0.3668, + "step": 2848 + }, + { + "epoch": 5.453154875717018, + "grad_norm": 0.3359375, + "learning_rate": 3.5400386241103946e-05, + "loss": 0.3645, + "step": 2852 + }, + { + "epoch": 5.460803059273423, + "grad_norm": 0.33984375, + "learning_rate": 3.506547717044472e-05, + "loss": 0.348, + "step": 2856 + }, + { + "epoch": 5.468451242829828, + "grad_norm": 0.33984375, + "learning_rate": 3.473195005322776e-05, + "loss": 0.3812, + "step": 2860 + }, + { + "epoch": 5.4760994263862335, + "grad_norm": 0.333984375, + "learning_rate": 3.439980889972723e-05, + "loss": 0.3705, + "step": 2864 + }, + { + "epoch": 5.483747609942639, + "grad_norm": 0.35546875, + "learning_rate": 3.406905770355274e-05, + "loss": 0.3687, + "step": 2868 + }, + { + "epoch": 5.491395793499044, + "grad_norm": 0.32421875, + "learning_rate": 3.373970044160121e-05, + "loss": 0.4348, + "step": 2872 + }, + { + "epoch": 5.499043977055449, + "grad_norm": 0.36328125, + "learning_rate": 3.341174107400916e-05, + "loss": 0.4039, + "step": 2876 + }, + { + "epoch": 5.506692160611855, + "grad_norm": 0.3359375, + "learning_rate": 3.30851835441051e-05, + "loss": 0.4072, + "step": 2880 + }, + { + "epoch": 5.51434034416826, + "grad_norm": 0.353515625, + "learning_rate": 3.276003177836203e-05, + "loss": 0.3796, + "step": 2884 + }, + { + "epoch": 5.521988527724665, + "grad_norm": 0.345703125, + "learning_rate": 3.2436289686350285e-05, + "loss": 0.3647, + "step": 2888 + }, + { + "epoch": 5.5296367112810705, + "grad_norm": 0.341796875, + "learning_rate": 3.211396116069055e-05, + "loss": 0.3895, + "step": 2892 + }, + { + "epoch": 5.537284894837476, + "grad_norm": 0.34375, + "learning_rate": 3.179305007700697e-05, + "loss": 0.3689, + "step": 2896 + }, + { + "epoch": 5.544933078393882, + "grad_norm": 0.349609375, + "learning_rate": 3.147356029388067e-05, + "loss": 0.3782, + "step": 2900 + }, + { + "epoch": 5.552581261950287, + "grad_norm": 0.349609375, + "learning_rate": 3.115549565280325e-05, + "loss": 0.3985, + "step": 2904 + }, + { + "epoch": 5.5602294455066925, + "grad_norm": 0.341796875, + "learning_rate": 3.083885997813066e-05, + "loss": 0.4289, + "step": 2908 + }, + { + "epoch": 5.567877629063098, + "grad_norm": 0.33984375, + "learning_rate": 3.052365707703718e-05, + "loss": 0.436, + "step": 2912 + }, + { + "epoch": 5.575525812619503, + "grad_norm": 0.341796875, + "learning_rate": 3.0209890739469693e-05, + "loss": 0.387, + "step": 2916 + }, + { + "epoch": 5.583173996175908, + "grad_norm": 0.326171875, + "learning_rate": 2.989756473810203e-05, + "loss": 0.4034, + "step": 2920 + }, + { + "epoch": 5.590822179732314, + "grad_norm": 0.3671875, + "learning_rate": 2.9586682828289738e-05, + "loss": 0.4206, + "step": 2924 + }, + { + "epoch": 5.598470363288719, + "grad_norm": 0.36328125, + "learning_rate": 2.9277248748024763e-05, + "loss": 0.4529, + "step": 2928 + }, + { + "epoch": 5.606118546845124, + "grad_norm": 0.330078125, + "learning_rate": 2.8969266217890648e-05, + "loss": 0.3527, + "step": 2932 + }, + { + "epoch": 5.6137667304015295, + "grad_norm": 0.365234375, + "learning_rate": 2.866273894101776e-05, + "loss": 0.389, + "step": 2936 + }, + { + "epoch": 5.621414913957935, + "grad_norm": 0.361328125, + "learning_rate": 2.835767060303865e-05, + "loss": 0.4017, + "step": 2940 + }, + { + "epoch": 5.62906309751434, + "grad_norm": 0.353515625, + "learning_rate": 2.8054064872043917e-05, + "loss": 0.3973, + "step": 2944 + }, + { + "epoch": 5.636711281070745, + "grad_norm": 0.375, + "learning_rate": 2.7751925398537993e-05, + "loss": 0.4281, + "step": 2948 + }, + { + "epoch": 5.644359464627151, + "grad_norm": 0.330078125, + "learning_rate": 2.745125581539523e-05, + "loss": 0.4071, + "step": 2952 + }, + { + "epoch": 5.652007648183556, + "grad_norm": 0.373046875, + "learning_rate": 2.7152059737816395e-05, + "loss": 0.3866, + "step": 2956 + }, + { + "epoch": 5.659655831739962, + "grad_norm": 0.32421875, + "learning_rate": 2.6854340763284954e-05, + "loss": 0.4029, + "step": 2960 + }, + { + "epoch": 5.667304015296367, + "grad_norm": 0.369140625, + "learning_rate": 2.6558102471523975e-05, + "loss": 0.4207, + "step": 2964 + }, + { + "epoch": 5.674952198852773, + "grad_norm": 0.365234375, + "learning_rate": 2.6263348424453012e-05, + "loss": 0.3769, + "step": 2968 + }, + { + "epoch": 5.682600382409178, + "grad_norm": 0.337890625, + "learning_rate": 2.597008216614534e-05, + "loss": 0.3527, + "step": 2972 + }, + { + "epoch": 5.690248565965583, + "grad_norm": 0.345703125, + "learning_rate": 2.5678307222785315e-05, + "loss": 0.4091, + "step": 2976 + }, + { + "epoch": 5.6978967495219885, + "grad_norm": 0.328125, + "learning_rate": 2.5388027102625945e-05, + "loss": 0.4175, + "step": 2980 + }, + { + "epoch": 5.705544933078394, + "grad_norm": 0.349609375, + "learning_rate": 2.5099245295946764e-05, + "loss": 0.3557, + "step": 2984 + }, + { + "epoch": 5.713193116634799, + "grad_norm": 0.328125, + "learning_rate": 2.4811965275011825e-05, + "loss": 0.4021, + "step": 2988 + }, + { + "epoch": 5.720841300191204, + "grad_norm": 0.33203125, + "learning_rate": 2.4526190494027953e-05, + "loss": 0.3868, + "step": 2992 + }, + { + "epoch": 5.72848948374761, + "grad_norm": 0.369140625, + "learning_rate": 2.4241924389103227e-05, + "loss": 0.4399, + "step": 2996 + }, + { + "epoch": 5.736137667304015, + "grad_norm": 0.3359375, + "learning_rate": 2.395917037820566e-05, + "loss": 0.3907, + "step": 3000 + }, + { + "epoch": 5.743785850860421, + "grad_norm": 0.349609375, + "learning_rate": 2.3677931861122084e-05, + "loss": 0.4195, + "step": 3004 + }, + { + "epoch": 5.751434034416826, + "grad_norm": 0.3359375, + "learning_rate": 2.339821221941731e-05, + "loss": 0.3867, + "step": 3008 + }, + { + "epoch": 5.759082217973232, + "grad_norm": 0.3359375, + "learning_rate": 2.312001481639348e-05, + "loss": 0.3583, + "step": 3012 + }, + { + "epoch": 5.766730401529637, + "grad_norm": 0.3359375, + "learning_rate": 2.2843342997049445e-05, + "loss": 0.3527, + "step": 3016 + }, + { + "epoch": 5.774378585086042, + "grad_norm": 0.30859375, + "learning_rate": 2.2568200088040867e-05, + "loss": 0.3393, + "step": 3020 + }, + { + "epoch": 5.782026768642448, + "grad_norm": 0.359375, + "learning_rate": 2.2294589397639978e-05, + "loss": 0.4225, + "step": 3024 + }, + { + "epoch": 5.789674952198853, + "grad_norm": 0.357421875, + "learning_rate": 2.2022514215695842e-05, + "loss": 0.4191, + "step": 3028 + }, + { + "epoch": 5.797323135755258, + "grad_norm": 0.341796875, + "learning_rate": 2.175197781359485e-05, + "loss": 0.3792, + "step": 3032 + }, + { + "epoch": 5.804971319311663, + "grad_norm": 0.337890625, + "learning_rate": 2.1482983444221402e-05, + "loss": 0.3942, + "step": 3036 + }, + { + "epoch": 5.812619502868069, + "grad_norm": 0.326171875, + "learning_rate": 2.1215534341918707e-05, + "loss": 0.3753, + "step": 3040 + }, + { + "epoch": 5.820267686424474, + "grad_norm": 0.33984375, + "learning_rate": 2.0949633722449915e-05, + "loss": 0.4234, + "step": 3044 + }, + { + "epoch": 5.827915869980879, + "grad_norm": 0.357421875, + "learning_rate": 2.0685284782959566e-05, + "loss": 0.3925, + "step": 3048 + }, + { + "epoch": 5.835564053537285, + "grad_norm": 0.349609375, + "learning_rate": 2.0422490701934996e-05, + "loss": 0.412, + "step": 3052 + }, + { + "epoch": 5.84321223709369, + "grad_norm": 0.328125, + "learning_rate": 2.0161254639168183e-05, + "loss": 0.3981, + "step": 3056 + }, + { + "epoch": 5.850860420650095, + "grad_norm": 0.3515625, + "learning_rate": 1.9901579735717743e-05, + "loss": 0.4204, + "step": 3060 + }, + { + "epoch": 5.858508604206501, + "grad_norm": 0.333984375, + "learning_rate": 1.964346911387127e-05, + "loss": 0.3878, + "step": 3064 + }, + { + "epoch": 5.866156787762907, + "grad_norm": 0.373046875, + "learning_rate": 1.9386925877107585e-05, + "loss": 0.4047, + "step": 3068 + }, + { + "epoch": 5.873804971319312, + "grad_norm": 0.3671875, + "learning_rate": 1.913195311005959e-05, + "loss": 0.3899, + "step": 3072 + }, + { + "epoch": 5.881453154875717, + "grad_norm": 0.353515625, + "learning_rate": 1.8878553878477105e-05, + "loss": 0.4179, + "step": 3076 + }, + { + "epoch": 5.8891013384321225, + "grad_norm": 0.341796875, + "learning_rate": 1.8626731229190016e-05, + "loss": 0.403, + "step": 3080 + }, + { + "epoch": 5.896749521988528, + "grad_norm": 0.31640625, + "learning_rate": 1.8376488190071666e-05, + "loss": 0.3925, + "step": 3084 + }, + { + "epoch": 5.904397705544933, + "grad_norm": 0.3359375, + "learning_rate": 1.8127827770002423e-05, + "loss": 0.3647, + "step": 3088 + }, + { + "epoch": 5.912045889101338, + "grad_norm": 0.353515625, + "learning_rate": 1.7880752958833543e-05, + "loss": 0.4301, + "step": 3092 + }, + { + "epoch": 5.919694072657744, + "grad_norm": 0.35546875, + "learning_rate": 1.7635266727351092e-05, + "loss": 0.3988, + "step": 3096 + }, + { + "epoch": 5.927342256214149, + "grad_norm": 0.34375, + "learning_rate": 1.73913720272404e-05, + "loss": 0.4028, + "step": 3100 + }, + { + "epoch": 5.934990439770554, + "grad_norm": 0.328125, + "learning_rate": 1.714907179105049e-05, + "loss": 0.3756, + "step": 3104 + }, + { + "epoch": 5.9426386233269595, + "grad_norm": 0.341796875, + "learning_rate": 1.6908368932158777e-05, + "loss": 0.4023, + "step": 3108 + }, + { + "epoch": 5.950286806883366, + "grad_norm": 0.345703125, + "learning_rate": 1.6669266344736104e-05, + "loss": 0.3784, + "step": 3112 + }, + { + "epoch": 5.957934990439771, + "grad_norm": 0.34375, + "learning_rate": 1.6431766903711914e-05, + "loss": 0.3622, + "step": 3116 + }, + { + "epoch": 5.965583173996176, + "grad_norm": 0.359375, + "learning_rate": 1.6195873464739702e-05, + "loss": 0.416, + "step": 3120 + }, + { + "epoch": 5.9732313575525815, + "grad_norm": 0.3359375, + "learning_rate": 1.5961588864162627e-05, + "loss": 0.4191, + "step": 3124 + }, + { + "epoch": 5.980879541108987, + "grad_norm": 0.328125, + "learning_rate": 1.5728915918979477e-05, + "loss": 0.3683, + "step": 3128 + }, + { + "epoch": 5.988527724665392, + "grad_norm": 0.337890625, + "learning_rate": 1.5497857426810756e-05, + "loss": 0.365, + "step": 3132 + }, + { + "epoch": 5.996175908221797, + "grad_norm": 0.3671875, + "learning_rate": 1.5268416165865055e-05, + "loss": 0.401, + "step": 3136 + }, + { + "epoch": 6.003824091778203, + "grad_norm": 0.33203125, + "learning_rate": 1.5040594894905628e-05, + "loss": 0.3805, + "step": 3140 + }, + { + "epoch": 6.011472275334608, + "grad_norm": 0.34765625, + "learning_rate": 1.481439635321729e-05, + "loss": 0.3635, + "step": 3144 + }, + { + "epoch": 6.019120458891013, + "grad_norm": 0.33984375, + "learning_rate": 1.458982326057338e-05, + "loss": 0.37, + "step": 3148 + }, + { + "epoch": 6.0267686424474185, + "grad_norm": 0.34765625, + "learning_rate": 1.436687831720314e-05, + "loss": 0.3981, + "step": 3152 + }, + { + "epoch": 6.034416826003824, + "grad_norm": 0.345703125, + "learning_rate": 1.4145564203759219e-05, + "loss": 0.4484, + "step": 3156 + }, + { + "epoch": 6.042065009560229, + "grad_norm": 0.3671875, + "learning_rate": 1.3925883581285401e-05, + "loss": 0.3988, + "step": 3160 + }, + { + "epoch": 6.049713193116634, + "grad_norm": 0.357421875, + "learning_rate": 1.3707839091184702e-05, + "loss": 0.3422, + "step": 3164 + }, + { + "epoch": 6.0573613766730405, + "grad_norm": 0.35546875, + "learning_rate": 1.349143335518752e-05, + "loss": 0.3994, + "step": 3168 + }, + { + "epoch": 6.065009560229446, + "grad_norm": 0.341796875, + "learning_rate": 1.3276668975320165e-05, + "loss": 0.4038, + "step": 3172 + }, + { + "epoch": 6.072657743785851, + "grad_norm": 0.328125, + "learning_rate": 1.3063548533873536e-05, + "loss": 0.4065, + "step": 3176 + }, + { + "epoch": 6.080305927342256, + "grad_norm": 0.33984375, + "learning_rate": 1.2852074593372142e-05, + "loss": 0.3902, + "step": 3180 + }, + { + "epoch": 6.087954110898662, + "grad_norm": 0.328125, + "learning_rate": 1.2642249696543178e-05, + "loss": 0.388, + "step": 3184 + }, + { + "epoch": 6.095602294455067, + "grad_norm": 0.34375, + "learning_rate": 1.243407636628605e-05, + "loss": 0.3979, + "step": 3188 + }, + { + "epoch": 6.103250478011472, + "grad_norm": 0.3359375, + "learning_rate": 1.2227557105642e-05, + "loss": 0.404, + "step": 3192 + }, + { + "epoch": 6.1108986615678775, + "grad_norm": 0.34765625, + "learning_rate": 1.2022694397763993e-05, + "loss": 0.4059, + "step": 3196 + }, + { + "epoch": 6.118546845124283, + "grad_norm": 0.359375, + "learning_rate": 1.1819490705886914e-05, + "loss": 0.3766, + "step": 3200 + }, + { + "epoch": 6.126195028680688, + "grad_norm": 0.337890625, + "learning_rate": 1.16179484732979e-05, + "loss": 0.3629, + "step": 3204 + }, + { + "epoch": 6.133843212237093, + "grad_norm": 0.375, + "learning_rate": 1.1418070123306989e-05, + "loss": 0.3733, + "step": 3208 + }, + { + "epoch": 6.141491395793499, + "grad_norm": 0.337890625, + "learning_rate": 1.1219858059217951e-05, + "loss": 0.4169, + "step": 3212 + }, + { + "epoch": 6.149139579349905, + "grad_norm": 0.33984375, + "learning_rate": 1.1023314664299455e-05, + "loss": 0.4131, + "step": 3216 + }, + { + "epoch": 6.15678776290631, + "grad_norm": 0.318359375, + "learning_rate": 1.0828442301756312e-05, + "loss": 0.3505, + "step": 3220 + }, + { + "epoch": 6.164435946462715, + "grad_norm": 0.337890625, + "learning_rate": 1.0635243314701163e-05, + "loss": 0.3978, + "step": 3224 + }, + { + "epoch": 6.172084130019121, + "grad_norm": 0.33984375, + "learning_rate": 1.0443720026126273e-05, + "loss": 0.3982, + "step": 3228 + }, + { + "epoch": 6.179732313575526, + "grad_norm": 0.357421875, + "learning_rate": 1.025387473887554e-05, + "loss": 0.3935, + "step": 3232 + }, + { + "epoch": 6.187380497131931, + "grad_norm": 0.337890625, + "learning_rate": 1.0065709735616917e-05, + "loss": 0.3709, + "step": 3236 + }, + { + "epoch": 6.195028680688337, + "grad_norm": 0.3515625, + "learning_rate": 9.87922727881484e-06, + "loss": 0.3922, + "step": 3240 + }, + { + "epoch": 6.202676864244742, + "grad_norm": 0.328125, + "learning_rate": 9.694429610703153e-06, + "loss": 0.3719, + "step": 3244 + }, + { + "epoch": 6.210325047801147, + "grad_norm": 0.333984375, + "learning_rate": 9.511318953258013e-06, + "loss": 0.2995, + "step": 3248 + }, + { + "epoch": 6.217973231357552, + "grad_norm": 0.34765625, + "learning_rate": 9.329897508171296e-06, + "loss": 0.3932, + "step": 3252 + }, + { + "epoch": 6.225621414913958, + "grad_norm": 0.34765625, + "learning_rate": 9.150167456824065e-06, + "loss": 0.4016, + "step": 3256 + }, + { + "epoch": 6.233269598470363, + "grad_norm": 0.33984375, + "learning_rate": 8.972130960260326e-06, + "loss": 0.3622, + "step": 3260 + }, + { + "epoch": 6.240917782026768, + "grad_norm": 0.337890625, + "learning_rate": 8.795790159161098e-06, + "loss": 0.4053, + "step": 3264 + }, + { + "epoch": 6.248565965583174, + "grad_norm": 0.3203125, + "learning_rate": 8.621147173818587e-06, + "loss": 0.3517, + "step": 3268 + }, + { + "epoch": 6.25621414913958, + "grad_norm": 0.35546875, + "learning_rate": 8.448204104110818e-06, + "loss": 0.3448, + "step": 3272 + }, + { + "epoch": 6.263862332695985, + "grad_norm": 0.349609375, + "learning_rate": 8.276963029476275e-06, + "loss": 0.3758, + "step": 3276 + }, + { + "epoch": 6.27151051625239, + "grad_norm": 0.37109375, + "learning_rate": 8.107426008888934e-06, + "loss": 0.3911, + "step": 3280 + }, + { + "epoch": 6.279158699808796, + "grad_norm": 0.322265625, + "learning_rate": 7.93959508083351e-06, + "loss": 0.3491, + "step": 3284 + }, + { + "epoch": 6.286806883365201, + "grad_norm": 0.34375, + "learning_rate": 7.773472263280977e-06, + "loss": 0.3954, + "step": 3288 + }, + { + "epoch": 6.294455066921606, + "grad_norm": 0.34375, + "learning_rate": 7.609059553664254e-06, + "loss": 0.4018, + "step": 3292 + }, + { + "epoch": 6.3021032504780115, + "grad_norm": 0.330078125, + "learning_rate": 7.446358928854207e-06, + "loss": 0.3823, + "step": 3296 + }, + { + "epoch": 6.309751434034417, + "grad_norm": 0.345703125, + "learning_rate": 7.2853723451358705e-06, + "loss": 0.4097, + "step": 3300 + }, + { + "epoch": 6.317399617590822, + "grad_norm": 0.345703125, + "learning_rate": 7.126101738184964e-06, + "loss": 0.3676, + "step": 3304 + }, + { + "epoch": 6.325047801147227, + "grad_norm": 0.337890625, + "learning_rate": 6.9685490230445615e-06, + "loss": 0.3901, + "step": 3308 + }, + { + "epoch": 6.332695984703633, + "grad_norm": 0.32421875, + "learning_rate": 6.812716094102128e-06, + "loss": 0.3652, + "step": 3312 + }, + { + "epoch": 6.340344168260038, + "grad_norm": 0.35546875, + "learning_rate": 6.658604825066683e-06, + "loss": 0.3631, + "step": 3316 + }, + { + "epoch": 6.347992351816444, + "grad_norm": 0.341796875, + "learning_rate": 6.50621706894629e-06, + "loss": 0.4053, + "step": 3320 + }, + { + "epoch": 6.355640535372849, + "grad_norm": 0.33203125, + "learning_rate": 6.355554658025791e-06, + "loss": 0.3451, + "step": 3324 + }, + { + "epoch": 6.363288718929255, + "grad_norm": 0.34375, + "learning_rate": 6.206619403844804e-06, + "loss": 0.4109, + "step": 3328 + }, + { + "epoch": 6.37093690248566, + "grad_norm": 0.318359375, + "learning_rate": 6.059413097175808e-06, + "loss": 0.3897, + "step": 3332 + }, + { + "epoch": 6.378585086042065, + "grad_norm": 0.341796875, + "learning_rate": 5.913937508002797e-06, + "loss": 0.3573, + "step": 3336 + }, + { + "epoch": 6.3862332695984705, + "grad_norm": 0.341796875, + "learning_rate": 5.770194385499877e-06, + "loss": 0.3928, + "step": 3340 + }, + { + "epoch": 6.393881453154876, + "grad_norm": 0.330078125, + "learning_rate": 5.628185458010248e-06, + "loss": 0.3662, + "step": 3344 + }, + { + "epoch": 6.401529636711281, + "grad_norm": 0.349609375, + "learning_rate": 5.487912433025493e-06, + "loss": 0.3974, + "step": 3348 + }, + { + "epoch": 6.409177820267686, + "grad_norm": 0.32421875, + "learning_rate": 5.349376997164923e-06, + "loss": 0.3477, + "step": 3352 + }, + { + "epoch": 6.416826003824092, + "grad_norm": 0.330078125, + "learning_rate": 5.212580816155426e-06, + "loss": 0.3992, + "step": 3356 + }, + { + "epoch": 6.424474187380497, + "grad_norm": 0.345703125, + "learning_rate": 5.077525534811339e-06, + "loss": 0.3881, + "step": 3360 + }, + { + "epoch": 6.432122370936902, + "grad_norm": 0.322265625, + "learning_rate": 4.9442127770147385e-06, + "loss": 0.3767, + "step": 3364 + }, + { + "epoch": 6.4397705544933075, + "grad_norm": 0.34765625, + "learning_rate": 4.812644145695915e-06, + "loss": 0.3939, + "step": 3368 + }, + { + "epoch": 6.447418738049713, + "grad_norm": 0.3359375, + "learning_rate": 4.682821222813998e-06, + "loss": 0.3322, + "step": 3372 + }, + { + "epoch": 6.455066921606119, + "grad_norm": 0.34375, + "learning_rate": 4.554745569338092e-06, + "loss": 0.3414, + "step": 3376 + }, + { + "epoch": 6.462715105162524, + "grad_norm": 0.35546875, + "learning_rate": 4.428418725228372e-06, + "loss": 0.3982, + "step": 3380 + }, + { + "epoch": 6.4703632887189295, + "grad_norm": 0.33203125, + "learning_rate": 4.303842209417652e-06, + "loss": 0.3522, + "step": 3384 + }, + { + "epoch": 6.478011472275335, + "grad_norm": 0.330078125, + "learning_rate": 4.181017519793079e-06, + "loss": 0.4167, + "step": 3388 + }, + { + "epoch": 6.48565965583174, + "grad_norm": 0.349609375, + "learning_rate": 4.059946133178132e-06, + "loss": 0.3875, + "step": 3392 + }, + { + "epoch": 6.493307839388145, + "grad_norm": 0.33203125, + "learning_rate": 3.94062950531489e-06, + "loss": 0.3662, + "step": 3396 + }, + { + "epoch": 6.500956022944551, + "grad_norm": 0.34375, + "learning_rate": 3.823069070846474e-06, + "loss": 0.3625, + "step": 3400 + }, + { + "epoch": 6.508604206500956, + "grad_norm": 0.318359375, + "learning_rate": 3.707266243299861e-06, + "loss": 0.3445, + "step": 3404 + }, + { + "epoch": 6.516252390057361, + "grad_norm": 0.353515625, + "learning_rate": 3.5932224150688526e-06, + "loss": 0.3705, + "step": 3408 + }, + { + "epoch": 6.5239005736137665, + "grad_norm": 0.35546875, + "learning_rate": 3.4809389573973e-06, + "loss": 0.396, + "step": 3412 + }, + { + "epoch": 6.531548757170172, + "grad_norm": 0.359375, + "learning_rate": 3.3704172203627035e-06, + "loss": 0.3689, + "step": 3416 + }, + { + "epoch": 6.539196940726577, + "grad_norm": 0.33203125, + "learning_rate": 3.2616585328599065e-06, + "loss": 0.4253, + "step": 3420 + }, + { + "epoch": 6.546845124282983, + "grad_norm": 0.3359375, + "learning_rate": 3.154664202585128e-06, + "loss": 0.387, + "step": 3424 + }, + { + "epoch": 6.5544933078393885, + "grad_norm": 0.353515625, + "learning_rate": 3.049435516020271e-06, + "loss": 0.3585, + "step": 3428 + }, + { + "epoch": 6.562141491395794, + "grad_norm": 0.34765625, + "learning_rate": 2.94597373841744e-06, + "loss": 0.3912, + "step": 3432 + }, + { + "epoch": 6.569789674952199, + "grad_norm": 0.337890625, + "learning_rate": 2.844280113783698e-06, + "loss": 0.3863, + "step": 3436 + }, + { + "epoch": 6.577437858508604, + "grad_norm": 0.353515625, + "learning_rate": 2.7443558648661656e-06, + "loss": 0.4214, + "step": 3440 + }, + { + "epoch": 6.58508604206501, + "grad_norm": 0.33203125, + "learning_rate": 2.646202193137248e-06, + "loss": 0.429, + "step": 3444 + }, + { + "epoch": 6.592734225621415, + "grad_norm": 0.330078125, + "learning_rate": 2.549820278780246e-06, + "loss": 0.3523, + "step": 3448 + }, + { + "epoch": 6.60038240917782, + "grad_norm": 0.33984375, + "learning_rate": 2.455211280675168e-06, + "loss": 0.4001, + "step": 3452 + }, + { + "epoch": 6.6080305927342256, + "grad_norm": 0.376953125, + "learning_rate": 2.3623763363847246e-06, + "loss": 0.3819, + "step": 3456 + }, + { + "epoch": 6.615678776290631, + "grad_norm": 0.361328125, + "learning_rate": 2.271316562140757e-06, + "loss": 0.3632, + "step": 3460 + }, + { + "epoch": 6.623326959847036, + "grad_norm": 0.357421875, + "learning_rate": 2.182033052830695e-06, + "loss": 0.3842, + "step": 3464 + }, + { + "epoch": 6.630975143403441, + "grad_norm": 0.34375, + "learning_rate": 2.094526881984521e-06, + "loss": 0.3844, + "step": 3468 + }, + { + "epoch": 6.638623326959847, + "grad_norm": 0.328125, + "learning_rate": 2.0087991017617598e-06, + "loss": 0.3697, + "step": 3472 + }, + { + "epoch": 6.646271510516252, + "grad_norm": 0.3515625, + "learning_rate": 1.924850742938894e-06, + "loss": 0.3886, + "step": 3476 + }, + { + "epoch": 6.653919694072657, + "grad_norm": 0.349609375, + "learning_rate": 1.8426828148969008e-06, + "loss": 0.363, + "step": 3480 + }, + { + "epoch": 6.661567877629063, + "grad_norm": 0.33984375, + "learning_rate": 1.7622963056091843e-06, + "loss": 0.3671, + "step": 3484 + }, + { + "epoch": 6.669216061185469, + "grad_norm": 0.341796875, + "learning_rate": 1.6836921816296644e-06, + "loss": 0.4043, + "step": 3488 + }, + { + "epoch": 6.676864244741874, + "grad_norm": 0.33203125, + "learning_rate": 1.6068713880811546e-06, + "loss": 0.398, + "step": 3492 + }, + { + "epoch": 6.684512428298279, + "grad_norm": 0.330078125, + "learning_rate": 1.531834848643987e-06, + "loss": 0.3847, + "step": 3496 + }, + { + "epoch": 6.692160611854685, + "grad_norm": 0.34765625, + "learning_rate": 1.4585834655449547e-06, + "loss": 0.4041, + "step": 3500 + } + ], + "logging_steps": 4, + "max_steps": 3661, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2872303824598467e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}