{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.447831184056272, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004689331770222743, "grad_norm": 0.3125, "learning_rate": 1.9999999999999998e-05, "loss": 1.2947, "step": 4 }, { "epoch": 0.009378663540445486, "grad_norm": 0.294921875, "learning_rate": 3.9999999999999996e-05, "loss": 1.2923, "step": 8 }, { "epoch": 0.01406799531066823, "grad_norm": 0.1904296875, "learning_rate": 5.9999999999999995e-05, "loss": 1.227, "step": 12 }, { "epoch": 0.01875732708089097, "grad_norm": 0.2216796875, "learning_rate": 7.999999999999999e-05, "loss": 1.2031, "step": 16 }, { "epoch": 0.023446658851113716, "grad_norm": 0.2373046875, "learning_rate": 9.999999999999999e-05, "loss": 1.1748, "step": 20 }, { "epoch": 0.02813599062133646, "grad_norm": 0.1865234375, "learning_rate": 0.00011999999999999999, "loss": 1.1424, "step": 24 }, { "epoch": 0.032825322391559206, "grad_norm": 0.1787109375, "learning_rate": 0.00014, "loss": 1.1835, "step": 28 }, { "epoch": 0.03751465416178194, "grad_norm": 0.1689453125, "learning_rate": 0.00015999999999999999, "loss": 1.1185, "step": 32 }, { "epoch": 0.04220398593200469, "grad_norm": 0.189453125, "learning_rate": 0.00017999999999999998, "loss": 1.139, "step": 36 }, { "epoch": 0.04689331770222743, "grad_norm": 0.19921875, "learning_rate": 0.00019999999999999998, "loss": 1.1278, "step": 40 }, { "epoch": 0.05158264947245018, "grad_norm": 0.2314453125, "learning_rate": 0.00021999999999999995, "loss": 1.0939, "step": 44 }, { "epoch": 0.05627198124267292, "grad_norm": 0.2041015625, "learning_rate": 0.00023999999999999998, "loss": 1.0765, "step": 48 }, { "epoch": 0.06096131301289566, "grad_norm": 0.193359375, "learning_rate": 0.00026, "loss": 1.1053, "step": 52 }, { "epoch": 0.06565064478311841, "grad_norm": 0.1962890625, "learning_rate": 0.00028, "loss": 1.0675, "step": 56 }, { "epoch": 0.07033997655334115, "grad_norm": 0.1923828125, "learning_rate": 0.0003, "loss": 1.0904, "step": 60 }, { "epoch": 0.07502930832356389, "grad_norm": 0.1943359375, "learning_rate": 0.00029999966103183746, "loss": 1.0871, "step": 64 }, { "epoch": 0.07971864009378664, "grad_norm": 0.1982421875, "learning_rate": 0.0002999986441288818, "loss": 1.0505, "step": 68 }, { "epoch": 0.08440797186400938, "grad_norm": 0.212890625, "learning_rate": 0.000299996949295729, "loss": 1.0788, "step": 72 }, { "epoch": 0.08909730363423213, "grad_norm": 0.201171875, "learning_rate": 0.0002999945765400391, "loss": 1.0503, "step": 76 }, { "epoch": 0.09378663540445487, "grad_norm": 0.1904296875, "learning_rate": 0.00029999152587253583, "loss": 1.0564, "step": 80 }, { "epoch": 0.0984759671746776, "grad_norm": 0.2197265625, "learning_rate": 0.0002999877973070069, "loss": 1.0396, "step": 84 }, { "epoch": 0.10316529894490035, "grad_norm": 0.1953125, "learning_rate": 0.00029998339086030395, "loss": 1.0208, "step": 88 }, { "epoch": 0.10785463071512309, "grad_norm": 0.203125, "learning_rate": 0.00029997830655234217, "loss": 1.057, "step": 92 }, { "epoch": 0.11254396248534584, "grad_norm": 0.205078125, "learning_rate": 0.0002999725444061004, "loss": 1.0283, "step": 96 }, { "epoch": 0.11723329425556858, "grad_norm": 0.189453125, "learning_rate": 0.0002999661044476212, "loss": 0.988, "step": 100 }, { "epoch": 0.12192262602579132, "grad_norm": 0.2001953125, "learning_rate": 0.00029995898670601053, "loss": 1.049, "step": 104 }, { "epoch": 0.12661195779601406, "grad_norm": 0.1953125, "learning_rate": 0.0002999511912134374, "loss": 0.9804, "step": 108 }, { "epoch": 0.13130128956623682, "grad_norm": 0.2021484375, "learning_rate": 0.0002999427180051341, "loss": 1.0172, "step": 112 }, { "epoch": 0.13599062133645956, "grad_norm": 0.2177734375, "learning_rate": 0.00029993356711939615, "loss": 0.9863, "step": 116 }, { "epoch": 0.1406799531066823, "grad_norm": 0.236328125, "learning_rate": 0.0002999237385975815, "loss": 0.9799, "step": 120 }, { "epoch": 0.14536928487690504, "grad_norm": 0.2138671875, "learning_rate": 0.0002999132324841109, "loss": 1.0098, "step": 124 }, { "epoch": 0.15005861664712777, "grad_norm": 0.220703125, "learning_rate": 0.0002999020488264676, "loss": 0.9597, "step": 128 }, { "epoch": 0.15474794841735054, "grad_norm": 0.2216796875, "learning_rate": 0.0002998901876751969, "loss": 0.9958, "step": 132 }, { "epoch": 0.15943728018757328, "grad_norm": 0.2333984375, "learning_rate": 0.00029987764908390624, "loss": 0.947, "step": 136 }, { "epoch": 0.16412661195779601, "grad_norm": 0.240234375, "learning_rate": 0.0002998644331092647, "loss": 0.948, "step": 140 }, { "epoch": 0.16881594372801875, "grad_norm": 0.205078125, "learning_rate": 0.00029985053981100286, "loss": 0.9685, "step": 144 }, { "epoch": 0.1735052754982415, "grad_norm": 0.208984375, "learning_rate": 0.00029983596925191265, "loss": 0.975, "step": 148 }, { "epoch": 0.17819460726846426, "grad_norm": 0.224609375, "learning_rate": 0.0002998207214978466, "loss": 0.978, "step": 152 }, { "epoch": 0.182883939038687, "grad_norm": 0.2177734375, "learning_rate": 0.0002998047966177182, "loss": 0.9702, "step": 156 }, { "epoch": 0.18757327080890973, "grad_norm": 0.220703125, "learning_rate": 0.0002997881946835012, "loss": 0.9399, "step": 160 }, { "epoch": 0.19226260257913247, "grad_norm": 0.244140625, "learning_rate": 0.00029977091577022916, "loss": 0.9403, "step": 164 }, { "epoch": 0.1969519343493552, "grad_norm": 0.2109375, "learning_rate": 0.0002997529599559956, "loss": 0.9342, "step": 168 }, { "epoch": 0.20164126611957797, "grad_norm": 0.2138671875, "learning_rate": 0.00029973432732195303, "loss": 0.9186, "step": 172 }, { "epoch": 0.2063305978898007, "grad_norm": 0.22265625, "learning_rate": 0.0002997150179523131, "loss": 0.9377, "step": 176 }, { "epoch": 0.21101992966002345, "grad_norm": 0.2333984375, "learning_rate": 0.00029969503193434606, "loss": 0.9365, "step": 180 }, { "epoch": 0.21570926143024619, "grad_norm": 0.2255859375, "learning_rate": 0.00029967436935838, "loss": 0.921, "step": 184 }, { "epoch": 0.22039859320046892, "grad_norm": 0.2138671875, "learning_rate": 0.00029965303031780126, "loss": 0.9041, "step": 188 }, { "epoch": 0.2250879249706917, "grad_norm": 0.22265625, "learning_rate": 0.00029963101490905307, "loss": 0.9319, "step": 192 }, { "epoch": 0.22977725674091443, "grad_norm": 0.263671875, "learning_rate": 0.0002996083232316358, "loss": 0.9135, "step": 196 }, { "epoch": 0.23446658851113716, "grad_norm": 0.236328125, "learning_rate": 0.0002995849553881061, "loss": 0.8845, "step": 200 }, { "epoch": 0.2391559202813599, "grad_norm": 0.244140625, "learning_rate": 0.00029956091148407684, "loss": 0.8891, "step": 204 }, { "epoch": 0.24384525205158264, "grad_norm": 0.251953125, "learning_rate": 0.00029953619162821616, "loss": 0.8917, "step": 208 }, { "epoch": 0.2485345838218054, "grad_norm": 0.2373046875, "learning_rate": 0.0002995107959322474, "loss": 0.9432, "step": 212 }, { "epoch": 0.2532239155920281, "grad_norm": 0.2255859375, "learning_rate": 0.00029948472451094823, "loss": 0.9197, "step": 216 }, { "epoch": 0.25791324736225085, "grad_norm": 0.267578125, "learning_rate": 0.0002994579774821505, "loss": 0.9129, "step": 220 }, { "epoch": 0.26260257913247365, "grad_norm": 0.2490234375, "learning_rate": 0.0002994305549667394, "loss": 0.8779, "step": 224 }, { "epoch": 0.2672919109026964, "grad_norm": 0.2314453125, "learning_rate": 0.000299402457088653, "loss": 0.9172, "step": 228 }, { "epoch": 0.2719812426729191, "grad_norm": 0.2431640625, "learning_rate": 0.0002993736839748818, "loss": 0.8779, "step": 232 }, { "epoch": 0.27667057444314186, "grad_norm": 0.236328125, "learning_rate": 0.0002993442357554681, "loss": 0.9188, "step": 236 }, { "epoch": 0.2813599062133646, "grad_norm": 0.2197265625, "learning_rate": 0.00029931411256350535, "loss": 0.8902, "step": 240 }, { "epoch": 0.28604923798358733, "grad_norm": 0.2490234375, "learning_rate": 0.0002992833145351376, "loss": 0.8734, "step": 244 }, { "epoch": 0.29073856975381007, "grad_norm": 0.21484375, "learning_rate": 0.0002992518418095588, "loss": 0.9317, "step": 248 }, { "epoch": 0.2954279015240328, "grad_norm": 0.224609375, "learning_rate": 0.00029921969452901235, "loss": 0.8715, "step": 252 }, { "epoch": 0.30011723329425555, "grad_norm": 0.236328125, "learning_rate": 0.0002991868728387903, "loss": 0.8825, "step": 256 }, { "epoch": 0.3048065650644783, "grad_norm": 0.23828125, "learning_rate": 0.00029915337688723277, "loss": 0.926, "step": 260 }, { "epoch": 0.3094958968347011, "grad_norm": 0.23828125, "learning_rate": 0.00029911920682572726, "loss": 0.8774, "step": 264 }, { "epoch": 0.3141852286049238, "grad_norm": 0.216796875, "learning_rate": 0.0002990843628087079, "loss": 0.8844, "step": 268 }, { "epoch": 0.31887456037514655, "grad_norm": 0.2197265625, "learning_rate": 0.0002990488449936549, "loss": 0.8683, "step": 272 }, { "epoch": 0.3235638921453693, "grad_norm": 0.287109375, "learning_rate": 0.00029901265354109367, "loss": 0.796, "step": 276 }, { "epoch": 0.32825322391559203, "grad_norm": 0.251953125, "learning_rate": 0.0002989757886145942, "loss": 0.8603, "step": 280 }, { "epoch": 0.33294255568581477, "grad_norm": 0.263671875, "learning_rate": 0.0002989382503807704, "loss": 0.9065, "step": 284 }, { "epoch": 0.3376318874560375, "grad_norm": 0.240234375, "learning_rate": 0.00029890003900927904, "loss": 0.8713, "step": 288 }, { "epoch": 0.34232121922626024, "grad_norm": 0.24609375, "learning_rate": 0.0002988611546728194, "loss": 0.8532, "step": 292 }, { "epoch": 0.347010550996483, "grad_norm": 0.2255859375, "learning_rate": 0.000298821597547132, "loss": 0.8241, "step": 296 }, { "epoch": 0.3516998827667057, "grad_norm": 0.216796875, "learning_rate": 0.0002987813678109985, "loss": 0.8575, "step": 300 }, { "epoch": 0.3563892145369285, "grad_norm": 0.216796875, "learning_rate": 0.00029874046564624, "loss": 0.8889, "step": 304 }, { "epoch": 0.36107854630715125, "grad_norm": 0.259765625, "learning_rate": 0.0002986988912377171, "loss": 0.8393, "step": 308 }, { "epoch": 0.365767878077374, "grad_norm": 0.236328125, "learning_rate": 0.00029865664477332843, "loss": 0.8675, "step": 312 }, { "epoch": 0.3704572098475967, "grad_norm": 0.21875, "learning_rate": 0.00029861372644401, "loss": 0.8316, "step": 316 }, { "epoch": 0.37514654161781946, "grad_norm": 0.224609375, "learning_rate": 0.0002985701364437345, "loss": 0.8705, "step": 320 }, { "epoch": 0.3798358733880422, "grad_norm": 0.2333984375, "learning_rate": 0.0002985258749695102, "loss": 0.8752, "step": 324 }, { "epoch": 0.38452520515826494, "grad_norm": 0.244140625, "learning_rate": 0.00029848094222138024, "loss": 0.8594, "step": 328 }, { "epoch": 0.3892145369284877, "grad_norm": 0.251953125, "learning_rate": 0.0002984353384024215, "loss": 0.8335, "step": 332 }, { "epoch": 0.3939038686987104, "grad_norm": 0.25, "learning_rate": 0.0002983890637187439, "loss": 0.7962, "step": 336 }, { "epoch": 0.39859320046893315, "grad_norm": 0.2236328125, "learning_rate": 0.0002983421183794893, "loss": 0.8526, "step": 340 }, { "epoch": 0.40328253223915594, "grad_norm": 0.23046875, "learning_rate": 0.00029829450259683085, "loss": 0.8273, "step": 344 }, { "epoch": 0.4079718640093787, "grad_norm": 0.2333984375, "learning_rate": 0.00029824621658597165, "loss": 0.8174, "step": 348 }, { "epoch": 0.4126611957796014, "grad_norm": 0.259765625, "learning_rate": 0.00029819726056514383, "loss": 0.836, "step": 352 }, { "epoch": 0.41735052754982416, "grad_norm": 0.259765625, "learning_rate": 0.00029814763475560796, "loss": 0.814, "step": 356 }, { "epoch": 0.4220398593200469, "grad_norm": 0.23046875, "learning_rate": 0.00029809733938165157, "loss": 0.8317, "step": 360 }, { "epoch": 0.42672919109026963, "grad_norm": 0.25, "learning_rate": 0.0002980463746705884, "loss": 0.8571, "step": 364 }, { "epoch": 0.43141852286049237, "grad_norm": 0.2373046875, "learning_rate": 0.00029799474085275734, "loss": 0.8309, "step": 368 }, { "epoch": 0.4361078546307151, "grad_norm": 0.255859375, "learning_rate": 0.00029794243816152127, "loss": 0.8289, "step": 372 }, { "epoch": 0.44079718640093785, "grad_norm": 0.232421875, "learning_rate": 0.0002978894668332663, "loss": 0.8364, "step": 376 }, { "epoch": 0.4454865181711606, "grad_norm": 0.2421875, "learning_rate": 0.00029783582710740013, "loss": 0.8101, "step": 380 }, { "epoch": 0.4501758499413834, "grad_norm": 0.224609375, "learning_rate": 0.00029778151922635175, "loss": 0.8661, "step": 384 }, { "epoch": 0.4548651817116061, "grad_norm": 0.2294921875, "learning_rate": 0.0002977265434355696, "loss": 0.8419, "step": 388 }, { "epoch": 0.45955451348182885, "grad_norm": 0.259765625, "learning_rate": 0.000297670899983521, "loss": 0.8444, "step": 392 }, { "epoch": 0.4642438452520516, "grad_norm": 0.2373046875, "learning_rate": 0.00029761458912169064, "loss": 0.8413, "step": 396 }, { "epoch": 0.46893317702227433, "grad_norm": 0.2294921875, "learning_rate": 0.0002975576111045798, "loss": 0.7815, "step": 400 }, { "epoch": 0.47362250879249707, "grad_norm": 0.24609375, "learning_rate": 0.0002974999661897049, "loss": 0.8496, "step": 404 }, { "epoch": 0.4783118405627198, "grad_norm": 0.263671875, "learning_rate": 0.0002974416546375965, "loss": 0.842, "step": 408 }, { "epoch": 0.48300117233294254, "grad_norm": 0.234375, "learning_rate": 0.00029738267671179793, "loss": 0.8453, "step": 412 }, { "epoch": 0.4876905041031653, "grad_norm": 0.23046875, "learning_rate": 0.00029732303267886455, "loss": 0.8066, "step": 416 }, { "epoch": 0.492379835873388, "grad_norm": 0.26953125, "learning_rate": 0.00029726272280836206, "loss": 0.749, "step": 420 }, { "epoch": 0.4970691676436108, "grad_norm": 0.2353515625, "learning_rate": 0.0002972017473728654, "loss": 0.8594, "step": 424 }, { "epoch": 0.5017584994138335, "grad_norm": 0.2431640625, "learning_rate": 0.00029714010664795765, "loss": 0.8312, "step": 428 }, { "epoch": 0.5064478311840562, "grad_norm": 0.23828125, "learning_rate": 0.00029707780091222877, "loss": 0.8336, "step": 432 }, { "epoch": 0.511137162954279, "grad_norm": 0.26953125, "learning_rate": 0.0002970148304472742, "loss": 0.8229, "step": 436 }, { "epoch": 0.5158264947245017, "grad_norm": 0.2412109375, "learning_rate": 0.0002969511955376937, "loss": 0.7516, "step": 440 }, { "epoch": 0.5205158264947245, "grad_norm": 0.2373046875, "learning_rate": 0.00029688689647109013, "loss": 0.8158, "step": 444 }, { "epoch": 0.5252051582649473, "grad_norm": 0.2734375, "learning_rate": 0.00029682193353806793, "loss": 0.7859, "step": 448 }, { "epoch": 0.52989449003517, "grad_norm": 0.283203125, "learning_rate": 0.00029675630703223196, "loss": 0.8025, "step": 452 }, { "epoch": 0.5345838218053928, "grad_norm": 0.2451171875, "learning_rate": 0.0002966900172501861, "loss": 0.7937, "step": 456 }, { "epoch": 0.5392731535756154, "grad_norm": 0.2578125, "learning_rate": 0.00029662306449153216, "loss": 0.8076, "step": 460 }, { "epoch": 0.5439624853458382, "grad_norm": 0.2216796875, "learning_rate": 0.00029655544905886816, "loss": 0.8268, "step": 464 }, { "epoch": 0.5486518171160609, "grad_norm": 0.2421875, "learning_rate": 0.0002964871712577871, "loss": 0.7727, "step": 468 }, { "epoch": 0.5533411488862837, "grad_norm": 0.251953125, "learning_rate": 0.0002964182313968757, "loss": 0.809, "step": 472 }, { "epoch": 0.5580304806565064, "grad_norm": 0.2333984375, "learning_rate": 0.0002963486297877128, "loss": 0.8369, "step": 476 }, { "epoch": 0.5627198124267292, "grad_norm": 0.236328125, "learning_rate": 0.00029627836674486817, "loss": 0.7811, "step": 480 }, { "epoch": 0.567409144196952, "grad_norm": 0.2421875, "learning_rate": 0.00029620744258590097, "loss": 0.7798, "step": 484 }, { "epoch": 0.5720984759671747, "grad_norm": 0.2490234375, "learning_rate": 0.0002961358576313583, "loss": 0.809, "step": 488 }, { "epoch": 0.5767878077373975, "grad_norm": 0.255859375, "learning_rate": 0.00029606361220477364, "loss": 0.7677, "step": 492 }, { "epoch": 0.5814771395076201, "grad_norm": 0.2294921875, "learning_rate": 0.0002959907066326658, "loss": 0.7994, "step": 496 }, { "epoch": 0.5861664712778429, "grad_norm": 0.267578125, "learning_rate": 0.00029591714124453693, "loss": 0.8217, "step": 500 }, { "epoch": 0.5908558030480656, "grad_norm": 0.234375, "learning_rate": 0.00029584291637287146, "loss": 0.7913, "step": 504 }, { "epoch": 0.5955451348182884, "grad_norm": 0.2451171875, "learning_rate": 0.00029576803235313413, "loss": 0.7679, "step": 508 }, { "epoch": 0.6002344665885111, "grad_norm": 0.2578125, "learning_rate": 0.00029569248952376903, "loss": 0.7839, "step": 512 }, { "epoch": 0.6049237983587339, "grad_norm": 0.228515625, "learning_rate": 0.00029561628822619775, "loss": 0.8129, "step": 516 }, { "epoch": 0.6096131301289566, "grad_norm": 0.2451171875, "learning_rate": 0.00029553942880481765, "loss": 0.8105, "step": 520 }, { "epoch": 0.6143024618991794, "grad_norm": 0.2578125, "learning_rate": 0.0002954619116070008, "loss": 0.7806, "step": 524 }, { "epoch": 0.6189917936694022, "grad_norm": 0.25, "learning_rate": 0.00029538373698309193, "loss": 0.7739, "step": 528 }, { "epoch": 0.6236811254396248, "grad_norm": 0.2734375, "learning_rate": 0.00029530490528640723, "loss": 0.7882, "step": 532 }, { "epoch": 0.6283704572098476, "grad_norm": 0.2333984375, "learning_rate": 0.00029522541687323253, "loss": 0.7934, "step": 536 }, { "epoch": 0.6330597889800703, "grad_norm": 0.271484375, "learning_rate": 0.00029514527210282163, "loss": 0.8188, "step": 540 }, { "epoch": 0.6377491207502931, "grad_norm": 0.2421875, "learning_rate": 0.00029506447133739494, "loss": 0.79, "step": 544 }, { "epoch": 0.6424384525205158, "grad_norm": 0.2490234375, "learning_rate": 0.0002949830149421376, "loss": 0.7961, "step": 548 }, { "epoch": 0.6471277842907386, "grad_norm": 0.2470703125, "learning_rate": 0.00029490090328519795, "loss": 0.798, "step": 552 }, { "epoch": 0.6518171160609613, "grad_norm": 0.263671875, "learning_rate": 0.00029481813673768576, "loss": 0.7388, "step": 556 }, { "epoch": 0.6565064478311841, "grad_norm": 0.23046875, "learning_rate": 0.0002947347156736708, "loss": 0.8278, "step": 560 }, { "epoch": 0.6611957796014069, "grad_norm": 0.2431640625, "learning_rate": 0.0002946506404701808, "loss": 0.7871, "step": 564 }, { "epoch": 0.6658851113716295, "grad_norm": 0.25390625, "learning_rate": 0.0002945659115071999, "loss": 0.7859, "step": 568 }, { "epoch": 0.6705744431418523, "grad_norm": 0.2392578125, "learning_rate": 0.0002944805291676672, "loss": 0.8397, "step": 572 }, { "epoch": 0.675263774912075, "grad_norm": 0.2314453125, "learning_rate": 0.0002943944938374746, "loss": 0.776, "step": 576 }, { "epoch": 0.6799531066822978, "grad_norm": 0.2578125, "learning_rate": 0.0002943078059054652, "loss": 0.8045, "step": 580 }, { "epoch": 0.6846424384525205, "grad_norm": 0.255859375, "learning_rate": 0.0002942204657634317, "loss": 0.8548, "step": 584 }, { "epoch": 0.6893317702227433, "grad_norm": 0.240234375, "learning_rate": 0.0002941324738061145, "loss": 0.741, "step": 588 }, { "epoch": 0.694021101992966, "grad_norm": 0.2451171875, "learning_rate": 0.00029404383043119984, "loss": 0.7604, "step": 592 }, { "epoch": 0.6987104337631888, "grad_norm": 0.2470703125, "learning_rate": 0.00029395453603931816, "loss": 0.7914, "step": 596 }, { "epoch": 0.7033997655334114, "grad_norm": 0.296875, "learning_rate": 0.00029386459103404215, "loss": 0.7633, "step": 600 }, { "epoch": 0.7080890973036342, "grad_norm": 0.2373046875, "learning_rate": 0.0002937739958218852, "loss": 0.7783, "step": 604 }, { "epoch": 0.712778429073857, "grad_norm": 0.2333984375, "learning_rate": 0.000293682750812299, "loss": 0.7437, "step": 608 }, { "epoch": 0.7174677608440797, "grad_norm": 0.2431640625, "learning_rate": 0.00029359085641767244, "loss": 0.746, "step": 612 }, { "epoch": 0.7221570926143025, "grad_norm": 0.2421875, "learning_rate": 0.00029349831305332914, "loss": 0.774, "step": 616 }, { "epoch": 0.7268464243845252, "grad_norm": 0.271484375, "learning_rate": 0.0002934051211375258, "loss": 0.7411, "step": 620 }, { "epoch": 0.731535756154748, "grad_norm": 0.24609375, "learning_rate": 0.0002933112810914503, "loss": 0.7725, "step": 624 }, { "epoch": 0.7362250879249707, "grad_norm": 0.25, "learning_rate": 0.0002932167933392198, "loss": 0.812, "step": 628 }, { "epoch": 0.7409144196951934, "grad_norm": 0.2470703125, "learning_rate": 0.0002931216583078789, "loss": 0.8059, "step": 632 }, { "epoch": 0.7456037514654161, "grad_norm": 0.2470703125, "learning_rate": 0.0002930258764273975, "loss": 0.7539, "step": 636 }, { "epoch": 0.7502930832356389, "grad_norm": 0.259765625, "learning_rate": 0.000292929448130669, "loss": 0.7813, "step": 640 }, { "epoch": 0.7549824150058617, "grad_norm": 0.26953125, "learning_rate": 0.0002928323738535085, "loss": 0.817, "step": 644 }, { "epoch": 0.7596717467760844, "grad_norm": 0.23828125, "learning_rate": 0.00029273465403465045, "loss": 0.7949, "step": 648 }, { "epoch": 0.7643610785463072, "grad_norm": 0.259765625, "learning_rate": 0.0002926362891157469, "loss": 0.7825, "step": 652 }, { "epoch": 0.7690504103165299, "grad_norm": 0.2421875, "learning_rate": 0.0002925372795413656, "loss": 0.7385, "step": 656 }, { "epoch": 0.7737397420867527, "grad_norm": 0.234375, "learning_rate": 0.00029243762575898775, "loss": 0.8009, "step": 660 }, { "epoch": 0.7784290738569754, "grad_norm": 0.255859375, "learning_rate": 0.0002923373282190062, "loss": 0.7594, "step": 664 }, { "epoch": 0.7831184056271981, "grad_norm": 0.251953125, "learning_rate": 0.00029223638737472325, "loss": 0.7312, "step": 668 }, { "epoch": 0.7878077373974208, "grad_norm": 0.2421875, "learning_rate": 0.00029213480368234853, "loss": 0.7294, "step": 672 }, { "epoch": 0.7924970691676436, "grad_norm": 0.255859375, "learning_rate": 0.00029203257760099737, "loss": 0.7024, "step": 676 }, { "epoch": 0.7971864009378663, "grad_norm": 0.26171875, "learning_rate": 0.0002919297095926883, "loss": 0.7424, "step": 680 }, { "epoch": 0.8018757327080891, "grad_norm": 0.275390625, "learning_rate": 0.0002918262001223408, "loss": 0.7668, "step": 684 }, { "epoch": 0.8065650644783119, "grad_norm": 0.25390625, "learning_rate": 0.000291722049657774, "loss": 0.7923, "step": 688 }, { "epoch": 0.8112543962485346, "grad_norm": 0.2470703125, "learning_rate": 0.0002916172586697036, "loss": 0.8041, "step": 692 }, { "epoch": 0.8159437280187574, "grad_norm": 0.2451171875, "learning_rate": 0.00029151182763174053, "loss": 0.7869, "step": 696 }, { "epoch": 0.82063305978898, "grad_norm": 0.259765625, "learning_rate": 0.0002914057570203882, "loss": 0.7462, "step": 700 }, { "epoch": 0.8253223915592028, "grad_norm": 0.2265625, "learning_rate": 0.0002912990473150409, "loss": 0.7332, "step": 704 }, { "epoch": 0.8300117233294255, "grad_norm": 0.25, "learning_rate": 0.000291191698997981, "loss": 0.7944, "step": 708 }, { "epoch": 0.8347010550996483, "grad_norm": 0.26171875, "learning_rate": 0.0002910837125543775, "loss": 0.7531, "step": 712 }, { "epoch": 0.839390386869871, "grad_norm": 0.26953125, "learning_rate": 0.0002909750884722832, "loss": 0.7751, "step": 716 }, { "epoch": 0.8440797186400938, "grad_norm": 0.2412109375, "learning_rate": 0.00029086582724263286, "loss": 0.7757, "step": 720 }, { "epoch": 0.8487690504103166, "grad_norm": 0.2265625, "learning_rate": 0.00029075592935924084, "loss": 0.7761, "step": 724 }, { "epoch": 0.8534583821805393, "grad_norm": 0.24609375, "learning_rate": 0.00029064539531879893, "loss": 0.8099, "step": 728 }, { "epoch": 0.8581477139507621, "grad_norm": 0.216796875, "learning_rate": 0.0002905342256208741, "loss": 0.759, "step": 732 }, { "epoch": 0.8628370457209847, "grad_norm": 0.2421875, "learning_rate": 0.0002904224207679061, "loss": 0.7609, "step": 736 }, { "epoch": 0.8675263774912075, "grad_norm": 0.263671875, "learning_rate": 0.0002903099812652056, "loss": 0.717, "step": 740 }, { "epoch": 0.8722157092614302, "grad_norm": 0.244140625, "learning_rate": 0.00029019690762095116, "loss": 0.7318, "step": 744 }, { "epoch": 0.876905041031653, "grad_norm": 0.240234375, "learning_rate": 0.00029008320034618784, "loss": 0.7132, "step": 748 }, { "epoch": 0.8815943728018757, "grad_norm": 0.240234375, "learning_rate": 0.00028996885995482424, "loss": 0.7291, "step": 752 }, { "epoch": 0.8862837045720985, "grad_norm": 0.248046875, "learning_rate": 0.0002898538869636303, "loss": 0.7854, "step": 756 }, { "epoch": 0.8909730363423212, "grad_norm": 0.259765625, "learning_rate": 0.0002897382818922352, "loss": 0.7146, "step": 760 }, { "epoch": 0.895662368112544, "grad_norm": 0.2431640625, "learning_rate": 0.0002896220452631247, "loss": 0.8165, "step": 764 }, { "epoch": 0.9003516998827668, "grad_norm": 0.240234375, "learning_rate": 0.0002895051776016392, "loss": 0.7719, "step": 768 }, { "epoch": 0.9050410316529894, "grad_norm": 0.2412109375, "learning_rate": 0.00028938767943597075, "loss": 0.793, "step": 772 }, { "epoch": 0.9097303634232122, "grad_norm": 0.228515625, "learning_rate": 0.0002892695512971613, "loss": 0.7722, "step": 776 }, { "epoch": 0.9144196951934349, "grad_norm": 0.240234375, "learning_rate": 0.0002891507937190998, "loss": 0.664, "step": 780 }, { "epoch": 0.9191090269636577, "grad_norm": 0.240234375, "learning_rate": 0.0002890314072385201, "loss": 0.7462, "step": 784 }, { "epoch": 0.9237983587338804, "grad_norm": 0.22265625, "learning_rate": 0.0002889113923949985, "loss": 0.7702, "step": 788 }, { "epoch": 0.9284876905041032, "grad_norm": 0.2421875, "learning_rate": 0.0002887907497309511, "loss": 0.7545, "step": 792 }, { "epoch": 0.9331770222743259, "grad_norm": 0.248046875, "learning_rate": 0.0002886694797916314, "loss": 0.7855, "step": 796 }, { "epoch": 0.9378663540445487, "grad_norm": 0.2294921875, "learning_rate": 0.00028854758312512826, "loss": 0.7477, "step": 800 }, { "epoch": 0.9425556858147714, "grad_norm": 0.240234375, "learning_rate": 0.00028842506028236274, "loss": 0.7647, "step": 804 }, { "epoch": 0.9472450175849941, "grad_norm": 0.232421875, "learning_rate": 0.0002883019118170861, "loss": 0.7492, "step": 808 }, { "epoch": 0.9519343493552169, "grad_norm": 0.2333984375, "learning_rate": 0.0002881781382858772, "loss": 0.729, "step": 812 }, { "epoch": 0.9566236811254396, "grad_norm": 0.25, "learning_rate": 0.0002880537402481397, "loss": 0.7545, "step": 816 }, { "epoch": 0.9613130128956624, "grad_norm": 0.2421875, "learning_rate": 0.0002879287182661001, "loss": 0.7451, "step": 820 }, { "epoch": 0.9660023446658851, "grad_norm": 0.267578125, "learning_rate": 0.00028780307290480453, "loss": 0.7464, "step": 824 }, { "epoch": 0.9706916764361079, "grad_norm": 0.2265625, "learning_rate": 0.00028767680473211683, "loss": 0.7747, "step": 828 }, { "epoch": 0.9753810082063306, "grad_norm": 0.251953125, "learning_rate": 0.0002875499143187154, "loss": 0.7581, "step": 832 }, { "epoch": 0.9800703399765534, "grad_norm": 0.2421875, "learning_rate": 0.00028742240223809116, "loss": 0.7786, "step": 836 }, { "epoch": 0.984759671746776, "grad_norm": 0.27734375, "learning_rate": 0.0002872942690665445, "loss": 0.8077, "step": 840 }, { "epoch": 0.9894490035169988, "grad_norm": 0.25, "learning_rate": 0.0002871655153831831, "loss": 0.7733, "step": 844 }, { "epoch": 0.9941383352872216, "grad_norm": 0.2490234375, "learning_rate": 0.0002870361417699188, "loss": 0.7462, "step": 848 }, { "epoch": 0.9988276670574443, "grad_norm": 0.255859375, "learning_rate": 0.0002869061488114654, "loss": 0.695, "step": 852 }, { "epoch": 1.003516998827667, "grad_norm": 0.2431640625, "learning_rate": 0.00028677553709533605, "loss": 0.7101, "step": 856 }, { "epoch": 1.0082063305978899, "grad_norm": 0.318359375, "learning_rate": 0.00028664430721184013, "loss": 0.6597, "step": 860 }, { "epoch": 1.0128956623681125, "grad_norm": 0.265625, "learning_rate": 0.0002865124597540811, "loss": 0.6458, "step": 864 }, { "epoch": 1.0175849941383353, "grad_norm": 0.25, "learning_rate": 0.0002863799953179534, "loss": 0.706, "step": 868 }, { "epoch": 1.022274325908558, "grad_norm": 0.23828125, "learning_rate": 0.00028624691450214007, "loss": 0.7264, "step": 872 }, { "epoch": 1.0269636576787808, "grad_norm": 0.25390625, "learning_rate": 0.00028611321790810996, "loss": 0.6963, "step": 876 }, { "epoch": 1.0316529894490034, "grad_norm": 0.244140625, "learning_rate": 0.0002859789061401149, "loss": 0.6838, "step": 880 }, { "epoch": 1.0363423212192262, "grad_norm": 0.2392578125, "learning_rate": 0.00028584397980518705, "loss": 0.7163, "step": 884 }, { "epoch": 1.041031652989449, "grad_norm": 0.265625, "learning_rate": 0.00028570843951313625, "loss": 0.6668, "step": 888 }, { "epoch": 1.0457209847596718, "grad_norm": 0.2333984375, "learning_rate": 0.00028557228587654693, "loss": 0.7022, "step": 892 }, { "epoch": 1.0504103165298946, "grad_norm": 0.25390625, "learning_rate": 0.0002854355195107758, "loss": 0.6669, "step": 896 }, { "epoch": 1.0550996483001172, "grad_norm": 0.2470703125, "learning_rate": 0.00028529814103394886, "loss": 0.7281, "step": 900 }, { "epoch": 1.05978898007034, "grad_norm": 0.2314453125, "learning_rate": 0.00028516015106695833, "loss": 0.714, "step": 904 }, { "epoch": 1.0644783118405627, "grad_norm": 0.2353515625, "learning_rate": 0.0002850215502334605, "loss": 0.6515, "step": 908 }, { "epoch": 1.0691676436107855, "grad_norm": 0.25, "learning_rate": 0.0002848823391598721, "loss": 0.7476, "step": 912 }, { "epoch": 1.073856975381008, "grad_norm": 0.244140625, "learning_rate": 0.00028474251847536826, "loss": 0.6498, "step": 916 }, { "epoch": 1.078546307151231, "grad_norm": 0.25390625, "learning_rate": 0.000284602088811879, "loss": 0.7022, "step": 920 }, { "epoch": 1.0832356389214537, "grad_norm": 0.2578125, "learning_rate": 0.0002844610508040868, "loss": 0.6899, "step": 924 }, { "epoch": 1.0879249706916765, "grad_norm": 0.2490234375, "learning_rate": 0.00028431940508942365, "loss": 0.6563, "step": 928 }, { "epoch": 1.0926143024618993, "grad_norm": 0.240234375, "learning_rate": 0.000284177152308068, "loss": 0.6752, "step": 932 }, { "epoch": 1.0973036342321218, "grad_norm": 0.296875, "learning_rate": 0.000284034293102942, "loss": 0.7024, "step": 936 }, { "epoch": 1.1019929660023446, "grad_norm": 0.2470703125, "learning_rate": 0.00028389082811970873, "loss": 0.6902, "step": 940 }, { "epoch": 1.1066822977725674, "grad_norm": 0.259765625, "learning_rate": 0.00028374675800676893, "loss": 0.6658, "step": 944 }, { "epoch": 1.1113716295427902, "grad_norm": 0.26953125, "learning_rate": 0.00028360208341525836, "loss": 0.7058, "step": 948 }, { "epoch": 1.1160609613130128, "grad_norm": 0.279296875, "learning_rate": 0.0002834568049990447, "loss": 0.7162, "step": 952 }, { "epoch": 1.1207502930832356, "grad_norm": 0.279296875, "learning_rate": 0.0002833109234147249, "loss": 0.6888, "step": 956 }, { "epoch": 1.1254396248534584, "grad_norm": 0.283203125, "learning_rate": 0.0002831644393216216, "loss": 0.6802, "step": 960 }, { "epoch": 1.1301289566236812, "grad_norm": 0.259765625, "learning_rate": 0.00028301735338178086, "loss": 0.6579, "step": 964 }, { "epoch": 1.134818288393904, "grad_norm": 0.271484375, "learning_rate": 0.0002828696662599686, "loss": 0.6941, "step": 968 }, { "epoch": 1.1395076201641265, "grad_norm": 0.259765625, "learning_rate": 0.00028272137862366795, "loss": 0.6928, "step": 972 }, { "epoch": 1.1441969519343493, "grad_norm": 0.240234375, "learning_rate": 0.00028257249114307606, "loss": 0.6596, "step": 976 }, { "epoch": 1.1488862837045721, "grad_norm": 0.25, "learning_rate": 0.00028242300449110114, "loss": 0.6836, "step": 980 }, { "epoch": 1.153575615474795, "grad_norm": 0.23828125, "learning_rate": 0.00028227291934335944, "loss": 0.6556, "step": 984 }, { "epoch": 1.1582649472450175, "grad_norm": 0.24609375, "learning_rate": 0.00028212223637817213, "loss": 0.6663, "step": 988 }, { "epoch": 1.1629542790152403, "grad_norm": 0.2470703125, "learning_rate": 0.00028197095627656215, "loss": 0.6999, "step": 992 }, { "epoch": 1.167643610785463, "grad_norm": 0.2470703125, "learning_rate": 0.0002818190797222514, "loss": 0.6913, "step": 996 }, { "epoch": 1.1723329425556859, "grad_norm": 0.251953125, "learning_rate": 0.0002816666074016575, "loss": 0.6688, "step": 1000 }, { "epoch": 1.1770222743259087, "grad_norm": 0.263671875, "learning_rate": 0.0002815135400038905, "loss": 0.6966, "step": 1004 }, { "epoch": 1.1817116060961312, "grad_norm": 0.259765625, "learning_rate": 0.0002813598782207501, "loss": 0.6789, "step": 1008 }, { "epoch": 1.186400937866354, "grad_norm": 0.25, "learning_rate": 0.00028120562274672233, "loss": 0.7121, "step": 1012 }, { "epoch": 1.1910902696365768, "grad_norm": 0.267578125, "learning_rate": 0.0002810507742789765, "loss": 0.6333, "step": 1016 }, { "epoch": 1.1957796014067996, "grad_norm": 0.26953125, "learning_rate": 0.0002808953335173619, "loss": 0.683, "step": 1020 }, { "epoch": 1.2004689331770222, "grad_norm": 0.2412109375, "learning_rate": 0.00028073930116440484, "loss": 0.6771, "step": 1024 }, { "epoch": 1.205158264947245, "grad_norm": 0.25390625, "learning_rate": 0.0002805826779253052, "loss": 0.6461, "step": 1028 }, { "epoch": 1.2098475967174678, "grad_norm": 0.26953125, "learning_rate": 0.0002804254645079337, "loss": 0.665, "step": 1032 }, { "epoch": 1.2145369284876906, "grad_norm": 0.25390625, "learning_rate": 0.0002802676616228281, "loss": 0.7006, "step": 1036 }, { "epoch": 1.2192262602579134, "grad_norm": 0.2734375, "learning_rate": 0.00028010926998319055, "loss": 0.6557, "step": 1040 }, { "epoch": 1.223915592028136, "grad_norm": 0.255859375, "learning_rate": 0.000279950290304884, "loss": 0.6676, "step": 1044 }, { "epoch": 1.2286049237983587, "grad_norm": 0.28125, "learning_rate": 0.000279790723306429, "loss": 0.6762, "step": 1048 }, { "epoch": 1.2332942555685815, "grad_norm": 0.2490234375, "learning_rate": 0.00027963056970900085, "loss": 0.6742, "step": 1052 }, { "epoch": 1.2379835873388043, "grad_norm": 0.26953125, "learning_rate": 0.0002794698302364257, "loss": 0.6682, "step": 1056 }, { "epoch": 1.2426729191090269, "grad_norm": 0.271484375, "learning_rate": 0.0002793085056151778, "loss": 0.6691, "step": 1060 }, { "epoch": 1.2473622508792497, "grad_norm": 0.25, "learning_rate": 0.00027914659657437586, "loss": 0.7082, "step": 1064 }, { "epoch": 1.2520515826494725, "grad_norm": 0.259765625, "learning_rate": 0.00027898410384578004, "loss": 0.6849, "step": 1068 }, { "epoch": 1.2567409144196953, "grad_norm": 0.27734375, "learning_rate": 0.00027882102816378853, "loss": 0.6864, "step": 1072 }, { "epoch": 1.261430246189918, "grad_norm": 0.267578125, "learning_rate": 0.0002786573702654342, "loss": 0.6555, "step": 1076 }, { "epoch": 1.2661195779601406, "grad_norm": 0.2490234375, "learning_rate": 0.0002784931308903813, "loss": 0.7219, "step": 1080 }, { "epoch": 1.2708089097303634, "grad_norm": 0.267578125, "learning_rate": 0.000278328310780922, "loss": 0.7314, "step": 1084 }, { "epoch": 1.2754982415005862, "grad_norm": 0.271484375, "learning_rate": 0.0002781629106819733, "loss": 0.6988, "step": 1088 }, { "epoch": 1.2801875732708088, "grad_norm": 0.255859375, "learning_rate": 0.0002779969313410733, "loss": 0.6559, "step": 1092 }, { "epoch": 1.2848769050410316, "grad_norm": 0.279296875, "learning_rate": 0.0002778303735083784, "loss": 0.6329, "step": 1096 }, { "epoch": 1.2895662368112544, "grad_norm": 0.255859375, "learning_rate": 0.0002776632379366591, "loss": 0.6737, "step": 1100 }, { "epoch": 1.2942555685814772, "grad_norm": 0.2734375, "learning_rate": 0.0002774955253812973, "loss": 0.6834, "step": 1104 }, { "epoch": 1.2989449003517, "grad_norm": 0.271484375, "learning_rate": 0.00027732723660028256, "loss": 0.6357, "step": 1108 }, { "epoch": 1.3036342321219228, "grad_norm": 0.2578125, "learning_rate": 0.0002771583723542087, "loss": 0.6635, "step": 1112 }, { "epoch": 1.3083235638921453, "grad_norm": 0.259765625, "learning_rate": 0.0002769889334062705, "loss": 0.6871, "step": 1116 }, { "epoch": 1.3130128956623681, "grad_norm": 0.26953125, "learning_rate": 0.00027681892052226005, "loss": 0.7013, "step": 1120 }, { "epoch": 1.317702227432591, "grad_norm": 0.2470703125, "learning_rate": 0.0002766483344705634, "loss": 0.6578, "step": 1124 }, { "epoch": 1.3223915592028135, "grad_norm": 0.267578125, "learning_rate": 0.00027647717602215704, "loss": 0.66, "step": 1128 }, { "epoch": 1.3270808909730363, "grad_norm": 0.271484375, "learning_rate": 0.00027630544595060464, "loss": 0.6884, "step": 1132 }, { "epoch": 1.331770222743259, "grad_norm": 0.26171875, "learning_rate": 0.0002761331450320531, "loss": 0.6845, "step": 1136 }, { "epoch": 1.3364595545134819, "grad_norm": 0.2578125, "learning_rate": 0.00027596027404522944, "loss": 0.6924, "step": 1140 }, { "epoch": 1.3411488862837047, "grad_norm": 0.2578125, "learning_rate": 0.0002757868337714372, "loss": 0.6962, "step": 1144 }, { "epoch": 1.3458382180539274, "grad_norm": 0.267578125, "learning_rate": 0.00027561282499455276, "loss": 0.6869, "step": 1148 }, { "epoch": 1.35052754982415, "grad_norm": 0.25390625, "learning_rate": 0.00027543824850102187, "loss": 0.7182, "step": 1152 }, { "epoch": 1.3552168815943728, "grad_norm": 0.259765625, "learning_rate": 0.00027526310507985626, "loss": 0.6907, "step": 1156 }, { "epoch": 1.3599062133645956, "grad_norm": 0.26953125, "learning_rate": 0.0002750873955226298, "loss": 0.7224, "step": 1160 }, { "epoch": 1.3645955451348182, "grad_norm": 0.251953125, "learning_rate": 0.00027491112062347515, "loss": 0.7173, "step": 1164 }, { "epoch": 1.369284876905041, "grad_norm": 0.298828125, "learning_rate": 0.0002747342811790799, "loss": 0.6719, "step": 1168 }, { "epoch": 1.3739742086752638, "grad_norm": 0.265625, "learning_rate": 0.00027455687798868346, "loss": 0.7309, "step": 1172 }, { "epoch": 1.3786635404454866, "grad_norm": 0.267578125, "learning_rate": 0.0002743789118540728, "loss": 0.7328, "step": 1176 }, { "epoch": 1.3833528722157094, "grad_norm": 0.26171875, "learning_rate": 0.00027420038357957934, "loss": 0.6742, "step": 1180 }, { "epoch": 1.388042203985932, "grad_norm": 0.259765625, "learning_rate": 0.0002740212939720751, "loss": 0.7193, "step": 1184 }, { "epoch": 1.3927315357561547, "grad_norm": 0.267578125, "learning_rate": 0.0002738416438409691, "loss": 0.7321, "step": 1188 }, { "epoch": 1.3974208675263775, "grad_norm": 0.271484375, "learning_rate": 0.0002736614339982036, "loss": 0.7035, "step": 1192 }, { "epoch": 1.4021101992966003, "grad_norm": 0.2734375, "learning_rate": 0.00027348066525825066, "loss": 0.6952, "step": 1196 }, { "epoch": 1.4067995310668229, "grad_norm": 0.2578125, "learning_rate": 0.0002732993384381082, "loss": 0.776, "step": 1200 }, { "epoch": 1.4114888628370457, "grad_norm": 0.25, "learning_rate": 0.00027311745435729655, "loss": 0.7111, "step": 1204 }, { "epoch": 1.4161781946072685, "grad_norm": 0.248046875, "learning_rate": 0.00027293501383785445, "loss": 0.6733, "step": 1208 }, { "epoch": 1.4208675263774913, "grad_norm": 0.26171875, "learning_rate": 0.00027275201770433574, "loss": 0.7015, "step": 1212 }, { "epoch": 1.425556858147714, "grad_norm": 0.26171875, "learning_rate": 0.0002725684667838051, "loss": 0.6968, "step": 1216 }, { "epoch": 1.4302461899179366, "grad_norm": 0.2412109375, "learning_rate": 0.00027238436190583486, "loss": 0.7071, "step": 1220 }, { "epoch": 1.4349355216881594, "grad_norm": 0.298828125, "learning_rate": 0.00027219970390250094, "loss": 0.6828, "step": 1224 }, { "epoch": 1.4396248534583822, "grad_norm": 0.25390625, "learning_rate": 0.000272014493608379, "loss": 0.648, "step": 1228 }, { "epoch": 1.444314185228605, "grad_norm": 0.2490234375, "learning_rate": 0.0002718287318605411, "loss": 0.6828, "step": 1232 }, { "epoch": 1.4490035169988276, "grad_norm": 0.27734375, "learning_rate": 0.0002716424194985514, "loss": 0.6893, "step": 1236 }, { "epoch": 1.4536928487690504, "grad_norm": 0.248046875, "learning_rate": 0.0002714555573644627, "loss": 0.7048, "step": 1240 }, { "epoch": 1.4583821805392732, "grad_norm": 0.251953125, "learning_rate": 0.0002712681463028126, "loss": 0.6076, "step": 1244 }, { "epoch": 1.463071512309496, "grad_norm": 0.2451171875, "learning_rate": 0.00027108018716061945, "loss": 0.7175, "step": 1248 }, { "epoch": 1.4677608440797187, "grad_norm": 0.263671875, "learning_rate": 0.0002708916807873787, "loss": 0.6885, "step": 1252 }, { "epoch": 1.4724501758499413, "grad_norm": 0.2578125, "learning_rate": 0.0002707026280350594, "loss": 0.6368, "step": 1256 }, { "epoch": 1.477139507620164, "grad_norm": 0.26171875, "learning_rate": 0.00027051302975809947, "loss": 0.6759, "step": 1260 }, { "epoch": 1.481828839390387, "grad_norm": 0.26171875, "learning_rate": 0.00027032288681340285, "loss": 0.6619, "step": 1264 }, { "epoch": 1.4865181711606097, "grad_norm": 0.26171875, "learning_rate": 0.0002701322000603347, "loss": 0.7396, "step": 1268 }, { "epoch": 1.4912075029308323, "grad_norm": 0.26953125, "learning_rate": 0.00026994097036071846, "loss": 0.6688, "step": 1272 }, { "epoch": 1.495896834701055, "grad_norm": 0.25390625, "learning_rate": 0.000269749198578831, "loss": 0.6439, "step": 1276 }, { "epoch": 1.5005861664712778, "grad_norm": 0.279296875, "learning_rate": 0.00026955688558139945, "loss": 0.7026, "step": 1280 }, { "epoch": 1.5052754982415006, "grad_norm": 0.271484375, "learning_rate": 0.0002693640322375969, "loss": 0.6609, "step": 1284 }, { "epoch": 1.5099648300117234, "grad_norm": 0.26953125, "learning_rate": 0.0002691706394190386, "loss": 0.6639, "step": 1288 }, { "epoch": 1.5146541617819462, "grad_norm": 0.271484375, "learning_rate": 0.00026897670799977803, "loss": 0.6592, "step": 1292 }, { "epoch": 1.5193434935521688, "grad_norm": 0.29296875, "learning_rate": 0.0002687822388563028, "loss": 0.631, "step": 1296 }, { "epoch": 1.5240328253223916, "grad_norm": 0.248046875, "learning_rate": 0.000268587232867531, "loss": 0.671, "step": 1300 }, { "epoch": 1.5287221570926142, "grad_norm": 0.279296875, "learning_rate": 0.00026839169091480685, "loss": 0.7033, "step": 1304 }, { "epoch": 1.533411488862837, "grad_norm": 0.259765625, "learning_rate": 0.00026819561388189697, "loss": 0.6508, "step": 1308 }, { "epoch": 1.5381008206330598, "grad_norm": 0.259765625, "learning_rate": 0.00026799900265498625, "loss": 0.6661, "step": 1312 }, { "epoch": 1.5427901524032825, "grad_norm": 0.263671875, "learning_rate": 0.0002678018581226741, "loss": 0.6565, "step": 1316 }, { "epoch": 1.5474794841735053, "grad_norm": 0.2578125, "learning_rate": 0.00026760418117597007, "loss": 0.7513, "step": 1320 }, { "epoch": 1.5521688159437281, "grad_norm": 0.267578125, "learning_rate": 0.00026740597270829, "loss": 0.6873, "step": 1324 }, { "epoch": 1.556858147713951, "grad_norm": 0.259765625, "learning_rate": 0.00026720723361545206, "loss": 0.6724, "step": 1328 }, { "epoch": 1.5615474794841735, "grad_norm": 0.2578125, "learning_rate": 0.0002670079647956726, "loss": 0.7035, "step": 1332 }, { "epoch": 1.5662368112543963, "grad_norm": 0.294921875, "learning_rate": 0.00026680816714956215, "loss": 0.6473, "step": 1336 }, { "epoch": 1.5709261430246189, "grad_norm": 0.25390625, "learning_rate": 0.0002666078415801211, "loss": 0.7342, "step": 1340 }, { "epoch": 1.5756154747948417, "grad_norm": 0.267578125, "learning_rate": 0.0002664069889927361, "loss": 0.7155, "step": 1344 }, { "epoch": 1.5803048065650644, "grad_norm": 0.27734375, "learning_rate": 0.00026620561029517555, "loss": 0.7006, "step": 1348 }, { "epoch": 1.5849941383352872, "grad_norm": 0.26171875, "learning_rate": 0.0002660037063975857, "loss": 0.6876, "step": 1352 }, { "epoch": 1.58968347010551, "grad_norm": 0.2451171875, "learning_rate": 0.0002658012782124865, "loss": 0.6475, "step": 1356 }, { "epoch": 1.5943728018757328, "grad_norm": 0.244140625, "learning_rate": 0.0002655983266547673, "loss": 0.6373, "step": 1360 }, { "epoch": 1.5990621336459554, "grad_norm": 0.27734375, "learning_rate": 0.000265394852641683, "loss": 0.6848, "step": 1364 }, { "epoch": 1.6037514654161782, "grad_norm": 0.265625, "learning_rate": 0.0002651908570928498, "loss": 0.7114, "step": 1368 }, { "epoch": 1.608440797186401, "grad_norm": 0.2490234375, "learning_rate": 0.0002649863409302411, "loss": 0.7023, "step": 1372 }, { "epoch": 1.6131301289566236, "grad_norm": 0.259765625, "learning_rate": 0.000264781305078183, "loss": 0.6557, "step": 1376 }, { "epoch": 1.6178194607268463, "grad_norm": 0.263671875, "learning_rate": 0.00026457575046335055, "loss": 0.6688, "step": 1380 }, { "epoch": 1.6225087924970691, "grad_norm": 0.25390625, "learning_rate": 0.00026436967801476334, "loss": 0.6833, "step": 1384 }, { "epoch": 1.627198124267292, "grad_norm": 0.259765625, "learning_rate": 0.0002641630886637814, "loss": 0.6415, "step": 1388 }, { "epoch": 1.6318874560375147, "grad_norm": 0.255859375, "learning_rate": 0.0002639559833441008, "loss": 0.7361, "step": 1392 }, { "epoch": 1.6365767878077375, "grad_norm": 0.259765625, "learning_rate": 0.00026374836299174984, "loss": 0.6997, "step": 1396 }, { "epoch": 1.64126611957796, "grad_norm": 0.26953125, "learning_rate": 0.0002635402285450842, "loss": 0.6023, "step": 1400 }, { "epoch": 1.6459554513481829, "grad_norm": 0.28515625, "learning_rate": 0.00026333158094478333, "loss": 0.7078, "step": 1404 }, { "epoch": 1.6506447831184057, "grad_norm": 0.259765625, "learning_rate": 0.0002631224211338458, "loss": 0.7311, "step": 1408 }, { "epoch": 1.6553341148886282, "grad_norm": 0.27734375, "learning_rate": 0.00026291275005758507, "loss": 0.7061, "step": 1412 }, { "epoch": 1.660023446658851, "grad_norm": 0.267578125, "learning_rate": 0.00026270256866362554, "loss": 0.6862, "step": 1416 }, { "epoch": 1.6647127784290738, "grad_norm": 0.251953125, "learning_rate": 0.0002624918779018979, "loss": 0.6868, "step": 1420 }, { "epoch": 1.6694021101992966, "grad_norm": 0.263671875, "learning_rate": 0.00026228067872463475, "loss": 0.7052, "step": 1424 }, { "epoch": 1.6740914419695194, "grad_norm": 0.2578125, "learning_rate": 0.0002620689720863669, "loss": 0.6386, "step": 1428 }, { "epoch": 1.6787807737397422, "grad_norm": 0.26171875, "learning_rate": 0.0002618567589439185, "loss": 0.6729, "step": 1432 }, { "epoch": 1.6834701055099648, "grad_norm": 0.271484375, "learning_rate": 0.00026164404025640276, "loss": 0.655, "step": 1436 }, { "epoch": 1.6881594372801876, "grad_norm": 0.27734375, "learning_rate": 0.0002614308169852179, "loss": 0.7085, "step": 1440 }, { "epoch": 1.6928487690504102, "grad_norm": 0.265625, "learning_rate": 0.00026121709009404264, "loss": 0.7295, "step": 1444 }, { "epoch": 1.697538100820633, "grad_norm": 0.26171875, "learning_rate": 0.00026100286054883166, "loss": 0.6506, "step": 1448 }, { "epoch": 1.7022274325908557, "grad_norm": 0.263671875, "learning_rate": 0.0002607881293178117, "loss": 0.6513, "step": 1452 }, { "epoch": 1.7069167643610785, "grad_norm": 0.2578125, "learning_rate": 0.00026057289737147675, "loss": 0.6906, "step": 1456 }, { "epoch": 1.7116060961313013, "grad_norm": 0.279296875, "learning_rate": 0.00026035716568258377, "loss": 0.6819, "step": 1460 }, { "epoch": 1.7162954279015241, "grad_norm": 0.265625, "learning_rate": 0.0002601409352261485, "loss": 0.6822, "step": 1464 }, { "epoch": 1.720984759671747, "grad_norm": 0.255859375, "learning_rate": 0.0002599242069794407, "loss": 0.6983, "step": 1468 }, { "epoch": 1.7256740914419695, "grad_norm": 0.263671875, "learning_rate": 0.00025970698192198026, "loss": 0.663, "step": 1472 }, { "epoch": 1.7303634232121923, "grad_norm": 0.248046875, "learning_rate": 0.00025948926103553196, "loss": 0.7066, "step": 1476 }, { "epoch": 1.7350527549824148, "grad_norm": 0.265625, "learning_rate": 0.00025927104530410193, "loss": 0.6464, "step": 1480 }, { "epoch": 1.7397420867526376, "grad_norm": 0.267578125, "learning_rate": 0.0002590523357139327, "loss": 0.6341, "step": 1484 }, { "epoch": 1.7444314185228604, "grad_norm": 0.26953125, "learning_rate": 0.00025883313325349866, "loss": 0.6986, "step": 1488 }, { "epoch": 1.7491207502930832, "grad_norm": 0.2734375, "learning_rate": 0.0002586134389135019, "loss": 0.6552, "step": 1492 }, { "epoch": 1.753810082063306, "grad_norm": 0.25390625, "learning_rate": 0.0002583932536868676, "loss": 0.681, "step": 1496 }, { "epoch": 1.7584994138335288, "grad_norm": 0.2490234375, "learning_rate": 0.00025817257856873956, "loss": 0.6324, "step": 1500 }, { "epoch": 1.7631887456037516, "grad_norm": 0.279296875, "learning_rate": 0.00025795141455647554, "loss": 0.6766, "step": 1504 }, { "epoch": 1.7678780773739742, "grad_norm": 0.271484375, "learning_rate": 0.0002577297626496431, "loss": 0.6905, "step": 1508 }, { "epoch": 1.772567409144197, "grad_norm": 0.275390625, "learning_rate": 0.00025750762385001464, "loss": 0.6752, "step": 1512 }, { "epoch": 1.7772567409144195, "grad_norm": 0.2421875, "learning_rate": 0.0002572849991615633, "loss": 0.6785, "step": 1516 }, { "epoch": 1.7819460726846423, "grad_norm": 0.259765625, "learning_rate": 0.00025706188959045826, "loss": 0.6516, "step": 1520 }, { "epoch": 1.7866354044548651, "grad_norm": 0.255859375, "learning_rate": 0.00025683829614505993, "loss": 0.628, "step": 1524 }, { "epoch": 1.791324736225088, "grad_norm": 0.251953125, "learning_rate": 0.00025661421983591586, "loss": 0.6955, "step": 1528 }, { "epoch": 1.7960140679953107, "grad_norm": 0.287109375, "learning_rate": 0.0002563896616757558, "loss": 0.6969, "step": 1532 }, { "epoch": 1.8007033997655335, "grad_norm": 0.25, "learning_rate": 0.00025616462267948726, "loss": 0.6666, "step": 1536 }, { "epoch": 1.8053927315357563, "grad_norm": 0.263671875, "learning_rate": 0.00025593910386419107, "loss": 0.6589, "step": 1540 }, { "epoch": 1.8100820633059789, "grad_norm": 0.275390625, "learning_rate": 0.0002557131062491165, "loss": 0.6676, "step": 1544 }, { "epoch": 1.8147713950762017, "grad_norm": 0.26171875, "learning_rate": 0.0002554866308556769, "loss": 0.6157, "step": 1548 }, { "epoch": 1.8194607268464242, "grad_norm": 0.2578125, "learning_rate": 0.0002552596787074448, "loss": 0.6262, "step": 1552 }, { "epoch": 1.824150058616647, "grad_norm": 0.275390625, "learning_rate": 0.00025503225083014765, "loss": 0.7328, "step": 1556 }, { "epoch": 1.8288393903868698, "grad_norm": 0.263671875, "learning_rate": 0.0002548043482516629, "loss": 0.6837, "step": 1560 }, { "epoch": 1.8335287221570926, "grad_norm": 0.259765625, "learning_rate": 0.0002545759720020134, "loss": 0.7037, "step": 1564 }, { "epoch": 1.8382180539273154, "grad_norm": 0.27734375, "learning_rate": 0.0002543471231133628, "loss": 0.6798, "step": 1568 }, { "epoch": 1.8429073856975382, "grad_norm": 0.265625, "learning_rate": 0.0002541178026200112, "loss": 0.7137, "step": 1572 }, { "epoch": 1.847596717467761, "grad_norm": 0.26953125, "learning_rate": 0.0002538880115583896, "loss": 0.7179, "step": 1576 }, { "epoch": 1.8522860492379836, "grad_norm": 0.267578125, "learning_rate": 0.0002536577509670562, "loss": 0.6664, "step": 1580 }, { "epoch": 1.8569753810082064, "grad_norm": 0.255859375, "learning_rate": 0.0002534270218866911, "loss": 0.5868, "step": 1584 }, { "epoch": 1.861664712778429, "grad_norm": 0.26953125, "learning_rate": 0.00025319582536009175, "loss": 0.68, "step": 1588 }, { "epoch": 1.8663540445486517, "grad_norm": 0.267578125, "learning_rate": 0.00025296416243216836, "loss": 0.6532, "step": 1592 }, { "epoch": 1.8710433763188745, "grad_norm": 0.271484375, "learning_rate": 0.000252732034149939, "loss": 0.6645, "step": 1596 }, { "epoch": 1.8757327080890973, "grad_norm": 0.26953125, "learning_rate": 0.00025249944156252504, "loss": 0.6842, "step": 1600 }, { "epoch": 1.88042203985932, "grad_norm": 0.263671875, "learning_rate": 0.0002522663857211461, "loss": 0.6603, "step": 1604 }, { "epoch": 1.885111371629543, "grad_norm": 0.25, "learning_rate": 0.00025203286767911575, "loss": 0.704, "step": 1608 }, { "epoch": 1.8898007033997657, "grad_norm": 0.263671875, "learning_rate": 0.0002517988884918364, "loss": 0.7084, "step": 1612 }, { "epoch": 1.8944900351699883, "grad_norm": 0.2470703125, "learning_rate": 0.00025156444921679464, "loss": 0.7358, "step": 1616 }, { "epoch": 1.899179366940211, "grad_norm": 0.26953125, "learning_rate": 0.0002513295509135564, "loss": 0.6261, "step": 1620 }, { "epoch": 1.9038686987104336, "grad_norm": 0.294921875, "learning_rate": 0.0002510941946437625, "loss": 0.7177, "step": 1624 }, { "epoch": 1.9085580304806564, "grad_norm": 0.267578125, "learning_rate": 0.00025085838147112315, "loss": 0.6916, "step": 1628 }, { "epoch": 1.9132473622508792, "grad_norm": 0.267578125, "learning_rate": 0.000250622112461414, "loss": 0.6928, "step": 1632 }, { "epoch": 1.917936694021102, "grad_norm": 0.26953125, "learning_rate": 0.00025038538868247043, "loss": 0.6989, "step": 1636 }, { "epoch": 1.9226260257913248, "grad_norm": 0.283203125, "learning_rate": 0.0002501482112041836, "loss": 0.6828, "step": 1640 }, { "epoch": 1.9273153575615476, "grad_norm": 0.2392578125, "learning_rate": 0.00024991058109849495, "loss": 0.6445, "step": 1644 }, { "epoch": 1.9320046893317704, "grad_norm": 0.294921875, "learning_rate": 0.00024967249943939174, "loss": 0.6642, "step": 1648 }, { "epoch": 1.936694021101993, "grad_norm": 0.2578125, "learning_rate": 0.000249433967302902, "loss": 0.6969, "step": 1652 }, { "epoch": 1.9413833528722158, "grad_norm": 0.251953125, "learning_rate": 0.00024919498576708975, "loss": 0.6826, "step": 1656 }, { "epoch": 1.9460726846424383, "grad_norm": 0.2734375, "learning_rate": 0.00024895555591205004, "loss": 0.6732, "step": 1660 }, { "epoch": 1.9507620164126611, "grad_norm": 0.271484375, "learning_rate": 0.00024871567881990414, "loss": 0.6914, "step": 1664 }, { "epoch": 1.955451348182884, "grad_norm": 0.263671875, "learning_rate": 0.00024847535557479477, "loss": 0.6865, "step": 1668 }, { "epoch": 1.9601406799531067, "grad_norm": 0.26171875, "learning_rate": 0.0002482345872628809, "loss": 0.6773, "step": 1672 }, { "epoch": 1.9648300117233295, "grad_norm": 0.267578125, "learning_rate": 0.0002479933749723332, "loss": 0.6618, "step": 1676 }, { "epoch": 1.9695193434935523, "grad_norm": 0.267578125, "learning_rate": 0.00024775171979332867, "loss": 0.6894, "step": 1680 }, { "epoch": 1.9742086752637749, "grad_norm": 0.2578125, "learning_rate": 0.0002475096228180463, "loss": 0.6323, "step": 1684 }, { "epoch": 1.9788980070339977, "grad_norm": 0.28125, "learning_rate": 0.00024726708514066157, "loss": 0.6447, "step": 1688 }, { "epoch": 1.9835873388042204, "grad_norm": 0.267578125, "learning_rate": 0.0002470241078573418, "loss": 0.6645, "step": 1692 }, { "epoch": 1.988276670574443, "grad_norm": 0.275390625, "learning_rate": 0.00024678069206624117, "loss": 0.6562, "step": 1696 }, { "epoch": 1.9929660023446658, "grad_norm": 0.263671875, "learning_rate": 0.0002465368388674958, "loss": 0.7016, "step": 1700 }, { "epoch": 1.9976553341148886, "grad_norm": 0.265625, "learning_rate": 0.00024629254936321855, "loss": 0.6702, "step": 1704 }, { "epoch": 2.0023446658851114, "grad_norm": 0.244140625, "learning_rate": 0.0002460478246574944, "loss": 0.6856, "step": 1708 }, { "epoch": 2.007033997655334, "grad_norm": 0.279296875, "learning_rate": 0.00024580266585637496, "loss": 0.5902, "step": 1712 }, { "epoch": 2.011723329425557, "grad_norm": 0.302734375, "learning_rate": 0.00024555707406787405, "loss": 0.563, "step": 1716 }, { "epoch": 2.0164126611957798, "grad_norm": 0.2578125, "learning_rate": 0.0002453110504019623, "loss": 0.6143, "step": 1720 }, { "epoch": 2.021101992966002, "grad_norm": 0.259765625, "learning_rate": 0.0002450645959705622, "loss": 0.5521, "step": 1724 }, { "epoch": 2.025791324736225, "grad_norm": 0.302734375, "learning_rate": 0.0002448177118875432, "loss": 0.6465, "step": 1728 }, { "epoch": 2.0304806565064477, "grad_norm": 0.26171875, "learning_rate": 0.00024457039926871656, "loss": 0.5643, "step": 1732 }, { "epoch": 2.0351699882766705, "grad_norm": 0.271484375, "learning_rate": 0.00024432265923183025, "loss": 0.5682, "step": 1736 }, { "epoch": 2.0398593200468933, "grad_norm": 0.27734375, "learning_rate": 0.00024407449289656416, "loss": 0.5971, "step": 1740 }, { "epoch": 2.044548651817116, "grad_norm": 0.2734375, "learning_rate": 0.00024382590138452475, "loss": 0.5766, "step": 1744 }, { "epoch": 2.049237983587339, "grad_norm": 0.25, "learning_rate": 0.00024357688581924013, "loss": 0.5996, "step": 1748 }, { "epoch": 2.0539273153575617, "grad_norm": 0.279296875, "learning_rate": 0.00024332744732615496, "loss": 0.5863, "step": 1752 }, { "epoch": 2.0586166471277845, "grad_norm": 0.271484375, "learning_rate": 0.00024307758703262527, "loss": 0.6162, "step": 1756 }, { "epoch": 2.063305978898007, "grad_norm": 0.271484375, "learning_rate": 0.00024282730606791365, "loss": 0.6311, "step": 1760 }, { "epoch": 2.0679953106682296, "grad_norm": 0.30078125, "learning_rate": 0.00024257660556318373, "loss": 0.566, "step": 1764 }, { "epoch": 2.0726846424384524, "grad_norm": 0.2734375, "learning_rate": 0.00024232548665149533, "loss": 0.5363, "step": 1768 }, { "epoch": 2.077373974208675, "grad_norm": 0.263671875, "learning_rate": 0.00024207395046779945, "loss": 0.591, "step": 1772 }, { "epoch": 2.082063305978898, "grad_norm": 0.271484375, "learning_rate": 0.00024182199814893278, "loss": 0.579, "step": 1776 }, { "epoch": 2.086752637749121, "grad_norm": 0.2734375, "learning_rate": 0.00024156963083361282, "loss": 0.6016, "step": 1780 }, { "epoch": 2.0914419695193436, "grad_norm": 0.271484375, "learning_rate": 0.0002413168496624328, "loss": 0.6139, "step": 1784 }, { "epoch": 2.0961313012895664, "grad_norm": 0.26953125, "learning_rate": 0.00024106365577785625, "loss": 0.6217, "step": 1788 }, { "epoch": 2.100820633059789, "grad_norm": 0.3125, "learning_rate": 0.00024081005032421202, "loss": 0.5849, "step": 1792 }, { "epoch": 2.1055099648300115, "grad_norm": 0.291015625, "learning_rate": 0.0002405560344476892, "loss": 0.5774, "step": 1796 }, { "epoch": 2.1101992966002343, "grad_norm": 0.27734375, "learning_rate": 0.00024030160929633165, "loss": 0.6123, "step": 1800 }, { "epoch": 2.114888628370457, "grad_norm": 0.271484375, "learning_rate": 0.00024004677602003306, "loss": 0.5666, "step": 1804 }, { "epoch": 2.11957796014068, "grad_norm": 0.2890625, "learning_rate": 0.00023979153577053167, "loss": 0.5931, "step": 1808 }, { "epoch": 2.1242672919109027, "grad_norm": 0.28125, "learning_rate": 0.00023953588970140503, "loss": 0.5862, "step": 1812 }, { "epoch": 2.1289566236811255, "grad_norm": 0.275390625, "learning_rate": 0.00023927983896806495, "loss": 0.5796, "step": 1816 }, { "epoch": 2.1336459554513483, "grad_norm": 0.28125, "learning_rate": 0.0002390233847277519, "loss": 0.5914, "step": 1820 }, { "epoch": 2.138335287221571, "grad_norm": 0.26171875, "learning_rate": 0.00023876652813953028, "loss": 0.5639, "step": 1824 }, { "epoch": 2.143024618991794, "grad_norm": 0.265625, "learning_rate": 0.00023850927036428286, "loss": 0.5648, "step": 1828 }, { "epoch": 2.147713950762016, "grad_norm": 0.27734375, "learning_rate": 0.00023825161256470546, "loss": 0.5877, "step": 1832 }, { "epoch": 2.152403282532239, "grad_norm": 0.28515625, "learning_rate": 0.00023799355590530205, "loss": 0.6211, "step": 1836 }, { "epoch": 2.157092614302462, "grad_norm": 0.341796875, "learning_rate": 0.00023773510155237918, "loss": 0.6338, "step": 1840 }, { "epoch": 2.1617819460726846, "grad_norm": 0.2890625, "learning_rate": 0.0002374762506740408, "loss": 0.59, "step": 1844 }, { "epoch": 2.1664712778429074, "grad_norm": 0.28125, "learning_rate": 0.00023721700444018296, "loss": 0.6159, "step": 1848 }, { "epoch": 2.17116060961313, "grad_norm": 0.3125, "learning_rate": 0.00023695736402248865, "loss": 0.6008, "step": 1852 }, { "epoch": 2.175849941383353, "grad_norm": 0.279296875, "learning_rate": 0.00023669733059442238, "loss": 0.5405, "step": 1856 }, { "epoch": 2.1805392731535758, "grad_norm": 0.28515625, "learning_rate": 0.00023643690533122467, "loss": 0.5994, "step": 1860 }, { "epoch": 2.1852286049237986, "grad_norm": 0.318359375, "learning_rate": 0.00023617608940990737, "loss": 0.6262, "step": 1864 }, { "epoch": 2.189917936694021, "grad_norm": 0.27734375, "learning_rate": 0.0002359148840092476, "loss": 0.5902, "step": 1868 }, { "epoch": 2.1946072684642437, "grad_norm": 0.298828125, "learning_rate": 0.00023565329030978297, "loss": 0.5659, "step": 1872 }, { "epoch": 2.1992966002344665, "grad_norm": 0.28125, "learning_rate": 0.00023539130949380585, "loss": 0.6104, "step": 1876 }, { "epoch": 2.2039859320046893, "grad_norm": 0.287109375, "learning_rate": 0.00023512894274535843, "loss": 0.5938, "step": 1880 }, { "epoch": 2.208675263774912, "grad_norm": 0.279296875, "learning_rate": 0.000234866191250227, "loss": 0.5961, "step": 1884 }, { "epoch": 2.213364595545135, "grad_norm": 0.28515625, "learning_rate": 0.00023460305619593674, "loss": 0.6316, "step": 1888 }, { "epoch": 2.2180539273153577, "grad_norm": 0.296875, "learning_rate": 0.00023433953877174645, "loss": 0.5741, "step": 1892 }, { "epoch": 2.2227432590855805, "grad_norm": 0.291015625, "learning_rate": 0.000234075640168643, "loss": 0.5559, "step": 1896 }, { "epoch": 2.2274325908558033, "grad_norm": 0.27734375, "learning_rate": 0.00023381136157933603, "loss": 0.5655, "step": 1900 }, { "epoch": 2.2321219226260256, "grad_norm": 0.294921875, "learning_rate": 0.0002335467041982526, "loss": 0.5871, "step": 1904 }, { "epoch": 2.2368112543962484, "grad_norm": 0.3046875, "learning_rate": 0.0002332816692215318, "loss": 0.5668, "step": 1908 }, { "epoch": 2.241500586166471, "grad_norm": 0.283203125, "learning_rate": 0.00023301625784701905, "loss": 0.6187, "step": 1912 }, { "epoch": 2.246189917936694, "grad_norm": 0.287109375, "learning_rate": 0.0002327504712742612, "loss": 0.6177, "step": 1916 }, { "epoch": 2.2508792497069168, "grad_norm": 0.2890625, "learning_rate": 0.0002324843107045008, "loss": 0.6034, "step": 1920 }, { "epoch": 2.2555685814771396, "grad_norm": 0.283203125, "learning_rate": 0.00023221777734067046, "loss": 0.5928, "step": 1924 }, { "epoch": 2.2602579132473624, "grad_norm": 0.302734375, "learning_rate": 0.000231950872387388, "loss": 0.5952, "step": 1928 }, { "epoch": 2.264947245017585, "grad_norm": 0.30078125, "learning_rate": 0.0002316835970509504, "loss": 0.5139, "step": 1932 }, { "epoch": 2.269636576787808, "grad_norm": 0.296875, "learning_rate": 0.00023141595253932886, "loss": 0.6095, "step": 1936 }, { "epoch": 2.2743259085580303, "grad_norm": 0.291015625, "learning_rate": 0.00023114794006216278, "loss": 0.6051, "step": 1940 }, { "epoch": 2.279015240328253, "grad_norm": 0.265625, "learning_rate": 0.0002308795608307549, "loss": 0.571, "step": 1944 }, { "epoch": 2.283704572098476, "grad_norm": 0.287109375, "learning_rate": 0.0002306108160580654, "loss": 0.6444, "step": 1948 }, { "epoch": 2.2883939038686987, "grad_norm": 0.279296875, "learning_rate": 0.00023034170695870665, "loss": 0.5642, "step": 1952 }, { "epoch": 2.2930832356389215, "grad_norm": 0.287109375, "learning_rate": 0.00023007223474893736, "loss": 0.59, "step": 1956 }, { "epoch": 2.2977725674091443, "grad_norm": 0.28515625, "learning_rate": 0.00022980240064665765, "loss": 0.5728, "step": 1960 }, { "epoch": 2.302461899179367, "grad_norm": 0.287109375, "learning_rate": 0.0002295322058714031, "loss": 0.6141, "step": 1964 }, { "epoch": 2.30715123094959, "grad_norm": 0.29296875, "learning_rate": 0.0002292616516443394, "loss": 0.6183, "step": 1968 }, { "epoch": 2.311840562719812, "grad_norm": 0.294921875, "learning_rate": 0.00022899073918825673, "loss": 0.5835, "step": 1972 }, { "epoch": 2.316529894490035, "grad_norm": 0.2890625, "learning_rate": 0.00022871946972756455, "loss": 0.6463, "step": 1976 }, { "epoch": 2.321219226260258, "grad_norm": 0.28125, "learning_rate": 0.00022844784448828554, "loss": 0.6421, "step": 1980 }, { "epoch": 2.3259085580304806, "grad_norm": 0.28515625, "learning_rate": 0.0002281758646980505, "loss": 0.6004, "step": 1984 }, { "epoch": 2.3305978898007034, "grad_norm": 0.30859375, "learning_rate": 0.0002279035315860926, "loss": 0.6484, "step": 1988 }, { "epoch": 2.335287221570926, "grad_norm": 0.28515625, "learning_rate": 0.00022763084638324202, "loss": 0.5479, "step": 1992 }, { "epoch": 2.339976553341149, "grad_norm": 0.294921875, "learning_rate": 0.00022735781032192, "loss": 0.6289, "step": 1996 }, { "epoch": 2.3446658851113718, "grad_norm": 0.291015625, "learning_rate": 0.00022708442463613367, "loss": 0.5875, "step": 2000 }, { "epoch": 2.3493552168815945, "grad_norm": 0.2890625, "learning_rate": 0.00022681069056147032, "loss": 0.6117, "step": 2004 }, { "epoch": 2.3540445486518173, "grad_norm": 0.291015625, "learning_rate": 0.00022653660933509166, "loss": 0.5951, "step": 2008 }, { "epoch": 2.3587338804220397, "grad_norm": 0.298828125, "learning_rate": 0.00022626218219572858, "loss": 0.6197, "step": 2012 }, { "epoch": 2.3634232121922625, "grad_norm": 0.296875, "learning_rate": 0.00022598741038367523, "loss": 0.5915, "step": 2016 }, { "epoch": 2.3681125439624853, "grad_norm": 0.2890625, "learning_rate": 0.0002257122951407836, "loss": 0.6072, "step": 2020 }, { "epoch": 2.372801875732708, "grad_norm": 0.27734375, "learning_rate": 0.0002254368377104577, "loss": 0.536, "step": 2024 }, { "epoch": 2.377491207502931, "grad_norm": 0.2890625, "learning_rate": 0.0002251610393376483, "loss": 0.6213, "step": 2028 }, { "epoch": 2.3821805392731537, "grad_norm": 0.298828125, "learning_rate": 0.00022488490126884692, "loss": 0.5981, "step": 2032 }, { "epoch": 2.3868698710433764, "grad_norm": 0.29296875, "learning_rate": 0.00022460842475208038, "loss": 0.619, "step": 2036 }, { "epoch": 2.3915592028135992, "grad_norm": 0.3046875, "learning_rate": 0.00022433161103690521, "loss": 0.5714, "step": 2040 }, { "epoch": 2.3962485345838216, "grad_norm": 0.287109375, "learning_rate": 0.00022405446137440185, "loss": 0.6044, "step": 2044 }, { "epoch": 2.4009378663540444, "grad_norm": 0.287109375, "learning_rate": 0.0002237769770171692, "loss": 0.5437, "step": 2048 }, { "epoch": 2.405627198124267, "grad_norm": 0.296875, "learning_rate": 0.00022349915921931866, "loss": 0.5494, "step": 2052 }, { "epoch": 2.41031652989449, "grad_norm": 0.3203125, "learning_rate": 0.0002232210092364689, "loss": 0.6109, "step": 2056 }, { "epoch": 2.4150058616647128, "grad_norm": 0.2890625, "learning_rate": 0.00022294252832573958, "loss": 0.5868, "step": 2060 }, { "epoch": 2.4196951934349356, "grad_norm": 0.27734375, "learning_rate": 0.00022266371774574633, "loss": 0.6001, "step": 2064 }, { "epoch": 2.4243845252051583, "grad_norm": 0.294921875, "learning_rate": 0.00022238457875659455, "loss": 0.5908, "step": 2068 }, { "epoch": 2.429073856975381, "grad_norm": 0.30078125, "learning_rate": 0.000222105112619874, "loss": 0.6001, "step": 2072 }, { "epoch": 2.433763188745604, "grad_norm": 0.28515625, "learning_rate": 0.00022182532059865305, "loss": 0.6076, "step": 2076 }, { "epoch": 2.4384525205158267, "grad_norm": 0.287109375, "learning_rate": 0.00022154520395747279, "loss": 0.5652, "step": 2080 }, { "epoch": 2.443141852286049, "grad_norm": 0.2890625, "learning_rate": 0.0002212647639623415, "loss": 0.5713, "step": 2084 }, { "epoch": 2.447831184056272, "grad_norm": 0.2890625, "learning_rate": 0.000220984001880729, "loss": 0.5742, "step": 2088 }, { "epoch": 2.4525205158264947, "grad_norm": 0.287109375, "learning_rate": 0.00022070291898156064, "loss": 0.6326, "step": 2092 }, { "epoch": 2.4572098475967175, "grad_norm": 0.27734375, "learning_rate": 0.00022042151653521182, "loss": 0.5809, "step": 2096 }, { "epoch": 2.4618991793669402, "grad_norm": 0.279296875, "learning_rate": 0.0002201397958135022, "loss": 0.5758, "step": 2100 }, { "epoch": 2.466588511137163, "grad_norm": 0.318359375, "learning_rate": 0.00021985775808968982, "loss": 0.5594, "step": 2104 }, { "epoch": 2.471277842907386, "grad_norm": 0.296875, "learning_rate": 0.0002195754046384654, "loss": 0.5766, "step": 2108 }, { "epoch": 2.4759671746776086, "grad_norm": 0.287109375, "learning_rate": 0.00021929273673594677, "loss": 0.5754, "step": 2112 }, { "epoch": 2.480656506447831, "grad_norm": 0.296875, "learning_rate": 0.0002190097556596728, "loss": 0.5989, "step": 2116 }, { "epoch": 2.4853458382180538, "grad_norm": 0.306640625, "learning_rate": 0.0002187264626885979, "loss": 0.5721, "step": 2120 }, { "epoch": 2.4900351699882766, "grad_norm": 0.30078125, "learning_rate": 0.00021844285910308593, "loss": 0.6148, "step": 2124 }, { "epoch": 2.4947245017584994, "grad_norm": 0.28515625, "learning_rate": 0.00021815894618490482, "loss": 0.6034, "step": 2128 }, { "epoch": 2.499413833528722, "grad_norm": 0.306640625, "learning_rate": 0.00021787472521722038, "loss": 0.5831, "step": 2132 }, { "epoch": 2.504103165298945, "grad_norm": 0.28515625, "learning_rate": 0.0002175901974845907, "loss": 0.5933, "step": 2136 }, { "epoch": 2.5087924970691677, "grad_norm": 0.287109375, "learning_rate": 0.00021730536427296045, "loss": 0.589, "step": 2140 }, { "epoch": 2.5134818288393905, "grad_norm": 0.283203125, "learning_rate": 0.00021702022686965471, "loss": 0.5794, "step": 2144 }, { "epoch": 2.5181711606096133, "grad_norm": 0.279296875, "learning_rate": 0.00021673478656337365, "loss": 0.6354, "step": 2148 }, { "epoch": 2.522860492379836, "grad_norm": 0.263671875, "learning_rate": 0.00021644904464418618, "loss": 0.6022, "step": 2152 }, { "epoch": 2.5275498241500585, "grad_norm": 0.287109375, "learning_rate": 0.0002161630024035245, "loss": 0.5818, "step": 2156 }, { "epoch": 2.5322391559202813, "grad_norm": 0.310546875, "learning_rate": 0.0002158766611341781, "loss": 0.5831, "step": 2160 }, { "epoch": 2.536928487690504, "grad_norm": 0.2890625, "learning_rate": 0.000215590022130288, "loss": 0.5985, "step": 2164 }, { "epoch": 2.541617819460727, "grad_norm": 0.306640625, "learning_rate": 0.00021530308668734079, "loss": 0.6242, "step": 2168 }, { "epoch": 2.5463071512309496, "grad_norm": 0.30859375, "learning_rate": 0.0002150158561021629, "loss": 0.5457, "step": 2172 }, { "epoch": 2.5509964830011724, "grad_norm": 0.29296875, "learning_rate": 0.00021472833167291458, "loss": 0.604, "step": 2176 }, { "epoch": 2.5556858147713952, "grad_norm": 0.291015625, "learning_rate": 0.00021444051469908426, "loss": 0.6074, "step": 2180 }, { "epoch": 2.5603751465416176, "grad_norm": 0.279296875, "learning_rate": 0.00021415240648148246, "loss": 0.557, "step": 2184 }, { "epoch": 2.5650644783118404, "grad_norm": 0.314453125, "learning_rate": 0.00021386400832223605, "loss": 0.573, "step": 2188 }, { "epoch": 2.569753810082063, "grad_norm": 0.279296875, "learning_rate": 0.0002135753215247822, "loss": 0.5672, "step": 2192 }, { "epoch": 2.574443141852286, "grad_norm": 0.2890625, "learning_rate": 0.00021328634739386279, "loss": 0.5693, "step": 2196 }, { "epoch": 2.5791324736225087, "grad_norm": 0.279296875, "learning_rate": 0.0002129970872355182, "loss": 0.5845, "step": 2200 }, { "epoch": 2.5838218053927315, "grad_norm": 0.310546875, "learning_rate": 0.00021270754235708152, "loss": 0.588, "step": 2204 }, { "epoch": 2.5885111371629543, "grad_norm": 0.2890625, "learning_rate": 0.00021241771406717275, "loss": 0.5666, "step": 2208 }, { "epoch": 2.593200468933177, "grad_norm": 0.291015625, "learning_rate": 0.0002121276036756926, "loss": 0.5667, "step": 2212 }, { "epoch": 2.5978898007034, "grad_norm": 0.3046875, "learning_rate": 0.000211837212493817, "loss": 0.6236, "step": 2216 }, { "epoch": 2.6025791324736227, "grad_norm": 0.298828125, "learning_rate": 0.00021154654183399077, "loss": 0.5384, "step": 2220 }, { "epoch": 2.6072684642438455, "grad_norm": 0.294921875, "learning_rate": 0.00021125559300992197, "loss": 0.6194, "step": 2224 }, { "epoch": 2.611957796014068, "grad_norm": 0.27734375, "learning_rate": 0.00021096436733657572, "loss": 0.6018, "step": 2228 }, { "epoch": 2.6166471277842906, "grad_norm": 0.294921875, "learning_rate": 0.00021067286613016847, "loss": 0.6199, "step": 2232 }, { "epoch": 2.6213364595545134, "grad_norm": 0.271484375, "learning_rate": 0.00021038109070816184, "loss": 0.585, "step": 2236 }, { "epoch": 2.6260257913247362, "grad_norm": 0.306640625, "learning_rate": 0.00021008904238925704, "loss": 0.5525, "step": 2240 }, { "epoch": 2.630715123094959, "grad_norm": 0.294921875, "learning_rate": 0.00020979672249338835, "loss": 0.5492, "step": 2244 }, { "epoch": 2.635404454865182, "grad_norm": 0.314453125, "learning_rate": 0.00020950413234171767, "loss": 0.6457, "step": 2248 }, { "epoch": 2.6400937866354046, "grad_norm": 0.29296875, "learning_rate": 0.00020921127325662826, "loss": 0.5911, "step": 2252 }, { "epoch": 2.644783118405627, "grad_norm": 0.287109375, "learning_rate": 0.00020891814656171895, "loss": 0.5998, "step": 2256 }, { "epoch": 2.6494724501758498, "grad_norm": 0.296875, "learning_rate": 0.00020862475358179787, "loss": 0.6458, "step": 2260 }, { "epoch": 2.6541617819460726, "grad_norm": 0.2890625, "learning_rate": 0.00020833109564287675, "loss": 0.5858, "step": 2264 }, { "epoch": 2.6588511137162953, "grad_norm": 0.28125, "learning_rate": 0.00020803717407216486, "loss": 0.5901, "step": 2268 }, { "epoch": 2.663540445486518, "grad_norm": 0.291015625, "learning_rate": 0.0002077429901980629, "loss": 0.5914, "step": 2272 }, { "epoch": 2.668229777256741, "grad_norm": 0.302734375, "learning_rate": 0.00020744854535015715, "loss": 0.6325, "step": 2276 }, { "epoch": 2.6729191090269637, "grad_norm": 0.28515625, "learning_rate": 0.00020715384085921327, "loss": 0.6216, "step": 2280 }, { "epoch": 2.6776084407971865, "grad_norm": 0.314453125, "learning_rate": 0.00020685887805717046, "loss": 0.6035, "step": 2284 }, { "epoch": 2.6822977725674093, "grad_norm": 0.283203125, "learning_rate": 0.00020656365827713543, "loss": 0.5751, "step": 2288 }, { "epoch": 2.686987104337632, "grad_norm": 0.283203125, "learning_rate": 0.0002062681828533762, "loss": 0.5953, "step": 2292 }, { "epoch": 2.691676436107855, "grad_norm": 0.291015625, "learning_rate": 0.00020597245312131636, "loss": 0.621, "step": 2296 }, { "epoch": 2.6963657678780772, "grad_norm": 0.296875, "learning_rate": 0.00020567647041752862, "loss": 0.6052, "step": 2300 }, { "epoch": 2.7010550996483, "grad_norm": 0.294921875, "learning_rate": 0.0002053802360797292, "loss": 0.6146, "step": 2304 }, { "epoch": 2.705744431418523, "grad_norm": 0.294921875, "learning_rate": 0.00020508375144677167, "loss": 0.6077, "step": 2308 }, { "epoch": 2.7104337631887456, "grad_norm": 0.294921875, "learning_rate": 0.00020478701785864057, "loss": 0.568, "step": 2312 }, { "epoch": 2.7151230949589684, "grad_norm": 0.28125, "learning_rate": 0.0002044900366564458, "loss": 0.6129, "step": 2316 }, { "epoch": 2.719812426729191, "grad_norm": 0.3203125, "learning_rate": 0.00020419280918241632, "loss": 0.6165, "step": 2320 }, { "epoch": 2.7245017584994136, "grad_norm": 0.310546875, "learning_rate": 0.00020389533677989417, "loss": 0.6291, "step": 2324 }, { "epoch": 2.7291910902696364, "grad_norm": 0.29296875, "learning_rate": 0.00020359762079332833, "loss": 0.6276, "step": 2328 }, { "epoch": 2.733880422039859, "grad_norm": 0.298828125, "learning_rate": 0.0002032996625682687, "loss": 0.6048, "step": 2332 }, { "epoch": 2.738569753810082, "grad_norm": 0.294921875, "learning_rate": 0.0002030014634513599, "loss": 0.6189, "step": 2336 }, { "epoch": 2.7432590855803047, "grad_norm": 0.298828125, "learning_rate": 0.00020270302479033538, "loss": 0.5516, "step": 2340 }, { "epoch": 2.7479484173505275, "grad_norm": 0.30859375, "learning_rate": 0.00020240434793401124, "loss": 0.6309, "step": 2344 }, { "epoch": 2.7526377491207503, "grad_norm": 0.3046875, "learning_rate": 0.00020210543423228, "loss": 0.5694, "step": 2348 }, { "epoch": 2.757327080890973, "grad_norm": 0.30078125, "learning_rate": 0.00020180628503610484, "loss": 0.6261, "step": 2352 }, { "epoch": 2.762016412661196, "grad_norm": 0.2890625, "learning_rate": 0.000201506901697513, "loss": 0.6279, "step": 2356 }, { "epoch": 2.7667057444314187, "grad_norm": 0.27734375, "learning_rate": 0.0002012072855695902, "loss": 0.596, "step": 2360 }, { "epoch": 2.7713950762016415, "grad_norm": 0.30078125, "learning_rate": 0.00020090743800647403, "loss": 0.5867, "step": 2364 }, { "epoch": 2.776084407971864, "grad_norm": 0.294921875, "learning_rate": 0.0002006073603633483, "loss": 0.6346, "step": 2368 }, { "epoch": 2.7807737397420866, "grad_norm": 0.3046875, "learning_rate": 0.00020030705399643646, "loss": 0.6264, "step": 2372 }, { "epoch": 2.7854630715123094, "grad_norm": 0.314453125, "learning_rate": 0.00020000652026299593, "loss": 0.655, "step": 2376 }, { "epoch": 2.7901524032825322, "grad_norm": 0.30859375, "learning_rate": 0.0001997057605213115, "loss": 0.606, "step": 2380 }, { "epoch": 2.794841735052755, "grad_norm": 0.302734375, "learning_rate": 0.00019940477613068964, "loss": 0.5661, "step": 2384 }, { "epoch": 2.799531066822978, "grad_norm": 0.294921875, "learning_rate": 0.00019910356845145196, "loss": 0.598, "step": 2388 }, { "epoch": 2.8042203985932006, "grad_norm": 0.2890625, "learning_rate": 0.0001988021388449293, "loss": 0.6477, "step": 2392 }, { "epoch": 2.808909730363423, "grad_norm": 0.287109375, "learning_rate": 0.00019850048867345554, "loss": 0.6005, "step": 2396 }, { "epoch": 2.8135990621336457, "grad_norm": 0.28125, "learning_rate": 0.0001981986193003614, "loss": 0.584, "step": 2400 }, { "epoch": 2.8182883939038685, "grad_norm": 0.2890625, "learning_rate": 0.0001978965320899683, "loss": 0.6111, "step": 2404 }, { "epoch": 2.8229777256740913, "grad_norm": 0.28125, "learning_rate": 0.00019759422840758228, "loss": 0.6175, "step": 2408 }, { "epoch": 2.827667057444314, "grad_norm": 0.30078125, "learning_rate": 0.00019729170961948754, "loss": 0.6083, "step": 2412 }, { "epoch": 2.832356389214537, "grad_norm": 0.3125, "learning_rate": 0.00019698897709294062, "loss": 0.5866, "step": 2416 }, { "epoch": 2.8370457209847597, "grad_norm": 0.294921875, "learning_rate": 0.000196686032196164, "loss": 0.5791, "step": 2420 }, { "epoch": 2.8417350527549825, "grad_norm": 0.306640625, "learning_rate": 0.00019638287629834012, "loss": 0.6533, "step": 2424 }, { "epoch": 2.8464243845252053, "grad_norm": 0.302734375, "learning_rate": 0.0001960795107696048, "loss": 0.5994, "step": 2428 }, { "epoch": 2.851113716295428, "grad_norm": 0.3125, "learning_rate": 0.00019577593698104156, "loss": 0.5925, "step": 2432 }, { "epoch": 2.855803048065651, "grad_norm": 0.2890625, "learning_rate": 0.00019547215630467504, "loss": 0.5999, "step": 2436 }, { "epoch": 2.8604923798358732, "grad_norm": 0.3046875, "learning_rate": 0.00019516817011346494, "loss": 0.6372, "step": 2440 }, { "epoch": 2.865181711606096, "grad_norm": 0.30859375, "learning_rate": 0.00019486397978129977, "loss": 0.5615, "step": 2444 }, { "epoch": 2.869871043376319, "grad_norm": 0.28515625, "learning_rate": 0.00019455958668299075, "loss": 0.5898, "step": 2448 }, { "epoch": 2.8745603751465416, "grad_norm": 0.2890625, "learning_rate": 0.0001942549921942654, "loss": 0.6083, "step": 2452 }, { "epoch": 2.8792497069167644, "grad_norm": 0.3203125, "learning_rate": 0.00019395019769176156, "loss": 0.6447, "step": 2456 }, { "epoch": 2.883939038686987, "grad_norm": 0.275390625, "learning_rate": 0.00019364520455302103, "loss": 0.6103, "step": 2460 }, { "epoch": 2.88862837045721, "grad_norm": 0.310546875, "learning_rate": 0.0001933400141564833, "loss": 0.5962, "step": 2464 }, { "epoch": 2.8933177022274323, "grad_norm": 0.298828125, "learning_rate": 0.00019303462788147933, "loss": 0.636, "step": 2468 }, { "epoch": 2.898007033997655, "grad_norm": 0.302734375, "learning_rate": 0.00019272904710822551, "loss": 0.5804, "step": 2472 }, { "epoch": 2.902696365767878, "grad_norm": 0.296875, "learning_rate": 0.00019242327321781726, "loss": 0.6356, "step": 2476 }, { "epoch": 2.9073856975381007, "grad_norm": 0.28515625, "learning_rate": 0.00019211730759222272, "loss": 0.574, "step": 2480 }, { "epoch": 2.9120750293083235, "grad_norm": 0.28125, "learning_rate": 0.00019181115161427662, "loss": 0.6377, "step": 2484 }, { "epoch": 2.9167643610785463, "grad_norm": 0.298828125, "learning_rate": 0.00019150480666767407, "loss": 0.6016, "step": 2488 }, { "epoch": 2.921453692848769, "grad_norm": 0.29296875, "learning_rate": 0.0001911982741369641, "loss": 0.5675, "step": 2492 }, { "epoch": 2.926143024618992, "grad_norm": 0.29296875, "learning_rate": 0.0001908915554075437, "loss": 0.5886, "step": 2496 }, { "epoch": 2.9308323563892147, "grad_norm": 0.27734375, "learning_rate": 0.00019058465186565132, "loss": 0.6028, "step": 2500 }, { "epoch": 2.9355216881594375, "grad_norm": 0.28515625, "learning_rate": 0.0001902775648983606, "loss": 0.5958, "step": 2504 }, { "epoch": 2.9402110199296603, "grad_norm": 0.302734375, "learning_rate": 0.00018997029589357443, "loss": 0.5895, "step": 2508 }, { "epoch": 2.9449003516998826, "grad_norm": 0.322265625, "learning_rate": 0.00018966284624001814, "loss": 0.6345, "step": 2512 }, { "epoch": 2.9495896834701054, "grad_norm": 0.30078125, "learning_rate": 0.00018935521732723376, "loss": 0.5931, "step": 2516 }, { "epoch": 2.954279015240328, "grad_norm": 0.29296875, "learning_rate": 0.00018904741054557325, "loss": 0.6009, "step": 2520 }, { "epoch": 2.958968347010551, "grad_norm": 0.31640625, "learning_rate": 0.00018873942728619273, "loss": 0.6011, "step": 2524 }, { "epoch": 2.963657678780774, "grad_norm": 0.3125, "learning_rate": 0.00018843126894104573, "loss": 0.5769, "step": 2528 }, { "epoch": 2.9683470105509966, "grad_norm": 0.30078125, "learning_rate": 0.00018812293690287715, "loss": 0.5863, "step": 2532 }, { "epoch": 2.9730363423212194, "grad_norm": 0.296875, "learning_rate": 0.00018781443256521695, "loss": 0.5778, "step": 2536 }, { "epoch": 2.9777256740914417, "grad_norm": 0.30859375, "learning_rate": 0.00018750575732237379, "loss": 0.6134, "step": 2540 }, { "epoch": 2.9824150058616645, "grad_norm": 0.27734375, "learning_rate": 0.00018719691256942868, "loss": 0.558, "step": 2544 }, { "epoch": 2.9871043376318873, "grad_norm": 0.2734375, "learning_rate": 0.00018688789970222882, "loss": 0.5734, "step": 2548 }, { "epoch": 2.99179366940211, "grad_norm": 0.296875, "learning_rate": 0.00018657872011738124, "loss": 0.5789, "step": 2552 }, { "epoch": 2.996483001172333, "grad_norm": 0.291015625, "learning_rate": 0.0001862693752122463, "loss": 0.6177, "step": 2556 }, { "epoch": 3.0011723329425557, "grad_norm": 0.271484375, "learning_rate": 0.0001859598663849318, "loss": 0.5135, "step": 2560 }, { "epoch": 3.0058616647127785, "grad_norm": 0.2734375, "learning_rate": 0.00018565019503428618, "loss": 0.5306, "step": 2564 }, { "epoch": 3.0105509964830013, "grad_norm": 0.3203125, "learning_rate": 0.00018534036255989247, "loss": 0.5166, "step": 2568 }, { "epoch": 3.015240328253224, "grad_norm": 0.302734375, "learning_rate": 0.00018503037036206194, "loss": 0.5773, "step": 2572 }, { "epoch": 3.019929660023447, "grad_norm": 0.298828125, "learning_rate": 0.00018472021984182777, "loss": 0.5381, "step": 2576 }, { "epoch": 3.024618991793669, "grad_norm": 0.306640625, "learning_rate": 0.00018440991240093862, "loss": 0.5158, "step": 2580 }, { "epoch": 3.029308323563892, "grad_norm": 0.330078125, "learning_rate": 0.00018409944944185237, "loss": 0.5442, "step": 2584 }, { "epoch": 3.033997655334115, "grad_norm": 0.294921875, "learning_rate": 0.0001837888323677299, "loss": 0.5519, "step": 2588 }, { "epoch": 3.0386869871043376, "grad_norm": 0.279296875, "learning_rate": 0.0001834780625824285, "loss": 0.525, "step": 2592 }, { "epoch": 3.0433763188745604, "grad_norm": 0.322265625, "learning_rate": 0.0001831671414904956, "loss": 0.5157, "step": 2596 }, { "epoch": 3.048065650644783, "grad_norm": 0.306640625, "learning_rate": 0.00018285607049716256, "loss": 0.5142, "step": 2600 }, { "epoch": 3.052754982415006, "grad_norm": 0.29296875, "learning_rate": 0.0001825448510083383, "loss": 0.4794, "step": 2604 }, { "epoch": 3.0574443141852288, "grad_norm": 0.298828125, "learning_rate": 0.00018223348443060274, "loss": 0.4835, "step": 2608 }, { "epoch": 3.0621336459554516, "grad_norm": 0.318359375, "learning_rate": 0.00018192197217120067, "loss": 0.4776, "step": 2612 }, { "epoch": 3.066822977725674, "grad_norm": 0.29296875, "learning_rate": 0.00018161031563803523, "loss": 0.5137, "step": 2616 }, { "epoch": 3.0715123094958967, "grad_norm": 0.294921875, "learning_rate": 0.00018129851623966168, "loss": 0.5287, "step": 2620 }, { "epoch": 3.0762016412661195, "grad_norm": 0.2890625, "learning_rate": 0.0001809865753852809, "loss": 0.5198, "step": 2624 }, { "epoch": 3.0808909730363423, "grad_norm": 0.294921875, "learning_rate": 0.00018067449448473321, "loss": 0.4759, "step": 2628 }, { "epoch": 3.085580304806565, "grad_norm": 0.306640625, "learning_rate": 0.00018036227494849173, "loss": 0.4228, "step": 2632 }, { "epoch": 3.090269636576788, "grad_norm": 0.296875, "learning_rate": 0.00018004991818765625, "loss": 0.5385, "step": 2636 }, { "epoch": 3.0949589683470107, "grad_norm": 0.30859375, "learning_rate": 0.00017973742561394675, "loss": 0.5078, "step": 2640 }, { "epoch": 3.0996483001172335, "grad_norm": 0.29296875, "learning_rate": 0.000179424798639697, "loss": 0.4977, "step": 2644 }, { "epoch": 3.1043376318874563, "grad_norm": 0.322265625, "learning_rate": 0.00017911203867784819, "loss": 0.5108, "step": 2648 }, { "epoch": 3.1090269636576786, "grad_norm": 0.322265625, "learning_rate": 0.0001787991471419426, "loss": 0.5151, "step": 2652 }, { "epoch": 3.1137162954279014, "grad_norm": 0.302734375, "learning_rate": 0.00017848612544611714, "loss": 0.5332, "step": 2656 }, { "epoch": 3.118405627198124, "grad_norm": 0.306640625, "learning_rate": 0.00017817297500509702, "loss": 0.5177, "step": 2660 }, { "epoch": 3.123094958968347, "grad_norm": 0.30078125, "learning_rate": 0.0001778596972341893, "loss": 0.5166, "step": 2664 }, { "epoch": 3.12778429073857, "grad_norm": 0.30859375, "learning_rate": 0.00017754629354927655, "loss": 0.4979, "step": 2668 }, { "epoch": 3.1324736225087926, "grad_norm": 0.30078125, "learning_rate": 0.00017723276536681025, "loss": 0.5008, "step": 2672 }, { "epoch": 3.1371629542790154, "grad_norm": 0.2890625, "learning_rate": 0.00017691911410380485, "loss": 0.5153, "step": 2676 }, { "epoch": 3.141852286049238, "grad_norm": 0.306640625, "learning_rate": 0.00017660534117783084, "loss": 0.4782, "step": 2680 }, { "epoch": 3.1465416178194605, "grad_norm": 0.30078125, "learning_rate": 0.00017629144800700866, "loss": 0.4542, "step": 2684 }, { "epoch": 3.1512309495896833, "grad_norm": 0.298828125, "learning_rate": 0.00017597743601000218, "loss": 0.5261, "step": 2688 }, { "epoch": 3.155920281359906, "grad_norm": 0.337890625, "learning_rate": 0.00017566330660601236, "loss": 0.4949, "step": 2692 }, { "epoch": 3.160609613130129, "grad_norm": 0.318359375, "learning_rate": 0.0001753490612147707, "loss": 0.5243, "step": 2696 }, { "epoch": 3.1652989449003517, "grad_norm": 0.310546875, "learning_rate": 0.00017503470125653309, "loss": 0.5316, "step": 2700 }, { "epoch": 3.1699882766705745, "grad_norm": 0.3125, "learning_rate": 0.00017472022815207295, "loss": 0.5134, "step": 2704 }, { "epoch": 3.1746776084407973, "grad_norm": 0.306640625, "learning_rate": 0.0001744056433226753, "loss": 0.543, "step": 2708 }, { "epoch": 3.17936694021102, "grad_norm": 0.314453125, "learning_rate": 0.00017409094819013002, "loss": 0.5009, "step": 2712 }, { "epoch": 3.184056271981243, "grad_norm": 0.333984375, "learning_rate": 0.00017377614417672554, "loss": 0.5103, "step": 2716 }, { "epoch": 3.1887456037514657, "grad_norm": 0.318359375, "learning_rate": 0.0001734612327052423, "loss": 0.508, "step": 2720 }, { "epoch": 3.193434935521688, "grad_norm": 0.291015625, "learning_rate": 0.00017314621519894652, "loss": 0.4487, "step": 2724 }, { "epoch": 3.198124267291911, "grad_norm": 0.310546875, "learning_rate": 0.00017283109308158362, "loss": 0.5308, "step": 2728 }, { "epoch": 3.2028135990621336, "grad_norm": 0.3203125, "learning_rate": 0.00017251586777737175, "loss": 0.5029, "step": 2732 }, { "epoch": 3.2075029308323564, "grad_norm": 0.3046875, "learning_rate": 0.00017220054071099555, "loss": 0.5274, "step": 2736 }, { "epoch": 3.212192262602579, "grad_norm": 0.318359375, "learning_rate": 0.0001718851133075994, "loss": 0.5138, "step": 2740 }, { "epoch": 3.216881594372802, "grad_norm": 0.3203125, "learning_rate": 0.00017156958699278134, "loss": 0.5345, "step": 2744 }, { "epoch": 3.2215709261430248, "grad_norm": 0.322265625, "learning_rate": 0.00017125396319258635, "loss": 0.5127, "step": 2748 }, { "epoch": 3.2262602579132476, "grad_norm": 0.298828125, "learning_rate": 0.0001709382433335, "loss": 0.4827, "step": 2752 }, { "epoch": 3.23094958968347, "grad_norm": 0.322265625, "learning_rate": 0.00017062242884244213, "loss": 0.4796, "step": 2756 }, { "epoch": 3.2356389214536927, "grad_norm": 0.314453125, "learning_rate": 0.00017030652114676003, "loss": 0.4927, "step": 2760 }, { "epoch": 3.2403282532239155, "grad_norm": 0.30078125, "learning_rate": 0.00016999052167422247, "loss": 0.5091, "step": 2764 }, { "epoch": 3.2450175849941383, "grad_norm": 0.3203125, "learning_rate": 0.00016967443185301293, "loss": 0.5312, "step": 2768 }, { "epoch": 3.249706916764361, "grad_norm": 0.3203125, "learning_rate": 0.00016935825311172322, "loss": 0.5339, "step": 2772 }, { "epoch": 3.254396248534584, "grad_norm": 0.3359375, "learning_rate": 0.00016904198687934697, "loss": 0.5263, "step": 2776 }, { "epoch": 3.2590855803048067, "grad_norm": 0.31640625, "learning_rate": 0.00016872563458527332, "loss": 0.4754, "step": 2780 }, { "epoch": 3.2637749120750295, "grad_norm": 0.3359375, "learning_rate": 0.0001684091976592804, "loss": 0.5075, "step": 2784 }, { "epoch": 3.2684642438452522, "grad_norm": 0.3203125, "learning_rate": 0.00016809267753152871, "loss": 0.5461, "step": 2788 }, { "epoch": 3.273153575615475, "grad_norm": 0.3125, "learning_rate": 0.00016777607563255498, "loss": 0.5318, "step": 2792 }, { "epoch": 3.2778429073856974, "grad_norm": 0.326171875, "learning_rate": 0.00016745939339326532, "loss": 0.5415, "step": 2796 }, { "epoch": 3.28253223915592, "grad_norm": 0.306640625, "learning_rate": 0.000167142632244929, "loss": 0.5451, "step": 2800 }, { "epoch": 3.287221570926143, "grad_norm": 0.298828125, "learning_rate": 0.00016682579361917196, "loss": 0.5004, "step": 2804 }, { "epoch": 3.2919109026963658, "grad_norm": 0.32421875, "learning_rate": 0.00016650887894797029, "loss": 0.5172, "step": 2808 }, { "epoch": 3.2966002344665886, "grad_norm": 0.3359375, "learning_rate": 0.00016619188966364383, "loss": 0.5349, "step": 2812 }, { "epoch": 3.3012895662368114, "grad_norm": 0.3125, "learning_rate": 0.0001658748271988495, "loss": 0.5499, "step": 2816 }, { "epoch": 3.305978898007034, "grad_norm": 0.318359375, "learning_rate": 0.00016555769298657515, "loss": 0.5422, "step": 2820 }, { "epoch": 3.3106682297772565, "grad_norm": 0.326171875, "learning_rate": 0.00016524048846013265, "loss": 0.5366, "step": 2824 }, { "epoch": 3.3153575615474793, "grad_norm": 0.287109375, "learning_rate": 0.00016492321505315194, "loss": 0.5024, "step": 2828 }, { "epoch": 3.320046893317702, "grad_norm": 0.318359375, "learning_rate": 0.00016460587419957407, "loss": 0.5025, "step": 2832 }, { "epoch": 3.324736225087925, "grad_norm": 0.333984375, "learning_rate": 0.00016428846733364502, "loss": 0.4966, "step": 2836 }, { "epoch": 3.3294255568581477, "grad_norm": 0.310546875, "learning_rate": 0.00016397099588990902, "loss": 0.5281, "step": 2840 }, { "epoch": 3.3341148886283705, "grad_norm": 0.318359375, "learning_rate": 0.00016365346130320233, "loss": 0.4967, "step": 2844 }, { "epoch": 3.3388042203985933, "grad_norm": 0.3046875, "learning_rate": 0.00016333586500864647, "loss": 0.4664, "step": 2848 }, { "epoch": 3.343493552168816, "grad_norm": 0.33203125, "learning_rate": 0.00016301820844164176, "loss": 0.5526, "step": 2852 }, { "epoch": 3.348182883939039, "grad_norm": 0.318359375, "learning_rate": 0.00016270049303786113, "loss": 0.5298, "step": 2856 }, { "epoch": 3.3528722157092616, "grad_norm": 0.318359375, "learning_rate": 0.0001623827202332433, "loss": 0.5579, "step": 2860 }, { "epoch": 3.357561547479484, "grad_norm": 0.30078125, "learning_rate": 0.00016206489146398655, "loss": 0.5597, "step": 2864 }, { "epoch": 3.362250879249707, "grad_norm": 0.318359375, "learning_rate": 0.0001617470081665419, "loss": 0.5362, "step": 2868 }, { "epoch": 3.3669402110199296, "grad_norm": 0.333984375, "learning_rate": 0.0001614290717776069, "loss": 0.5342, "step": 2872 }, { "epoch": 3.3716295427901524, "grad_norm": 0.318359375, "learning_rate": 0.0001611110837341191, "loss": 0.4736, "step": 2876 }, { "epoch": 3.376318874560375, "grad_norm": 0.33203125, "learning_rate": 0.0001607930454732495, "loss": 0.5421, "step": 2880 }, { "epoch": 3.381008206330598, "grad_norm": 0.296875, "learning_rate": 0.000160474958432396, "loss": 0.4857, "step": 2884 }, { "epoch": 3.3856975381008207, "grad_norm": 0.328125, "learning_rate": 0.000160156824049177, "loss": 0.5453, "step": 2888 }, { "epoch": 3.3903868698710435, "grad_norm": 0.3203125, "learning_rate": 0.00015983864376142482, "loss": 0.5494, "step": 2892 }, { "epoch": 3.395076201641266, "grad_norm": 0.337890625, "learning_rate": 0.0001595204190071794, "loss": 0.5328, "step": 2896 }, { "epoch": 3.3997655334114887, "grad_norm": 0.314453125, "learning_rate": 0.00015920215122468146, "loss": 0.4856, "step": 2900 }, { "epoch": 3.4044548651817115, "grad_norm": 0.3203125, "learning_rate": 0.00015888384185236632, "loss": 0.4919, "step": 2904 }, { "epoch": 3.4091441969519343, "grad_norm": 0.3125, "learning_rate": 0.00015856549232885712, "loss": 0.5082, "step": 2908 }, { "epoch": 3.413833528722157, "grad_norm": 0.333984375, "learning_rate": 0.00015824710409295868, "loss": 0.4919, "step": 2912 }, { "epoch": 3.41852286049238, "grad_norm": 0.3125, "learning_rate": 0.0001579286785836506, "loss": 0.5208, "step": 2916 }, { "epoch": 3.4232121922626026, "grad_norm": 0.32421875, "learning_rate": 0.0001576102172400811, "loss": 0.5251, "step": 2920 }, { "epoch": 3.4279015240328254, "grad_norm": 0.33984375, "learning_rate": 0.0001572917215015602, "loss": 0.4927, "step": 2924 }, { "epoch": 3.4325908558030482, "grad_norm": 0.30078125, "learning_rate": 0.00015697319280755343, "loss": 0.5404, "step": 2928 }, { "epoch": 3.437280187573271, "grad_norm": 0.328125, "learning_rate": 0.00015665463259767525, "loss": 0.5198, "step": 2932 }, { "epoch": 3.4419695193434934, "grad_norm": 0.326171875, "learning_rate": 0.00015633604231168264, "loss": 0.5321, "step": 2936 }, { "epoch": 3.446658851113716, "grad_norm": 0.3125, "learning_rate": 0.00015601742338946844, "loss": 0.5492, "step": 2940 }, { "epoch": 3.451348182883939, "grad_norm": 0.328125, "learning_rate": 0.00015569877727105493, "loss": 0.5115, "step": 2944 }, { "epoch": 3.4560375146541618, "grad_norm": 0.31640625, "learning_rate": 0.00015538010539658728, "loss": 0.4897, "step": 2948 }, { "epoch": 3.4607268464243846, "grad_norm": 0.322265625, "learning_rate": 0.00015506140920632707, "loss": 0.5417, "step": 2952 }, { "epoch": 3.4654161781946073, "grad_norm": 0.333984375, "learning_rate": 0.0001547426901406458, "loss": 0.5028, "step": 2956 }, { "epoch": 3.47010550996483, "grad_norm": 0.318359375, "learning_rate": 0.00015442394964001842, "loss": 0.5215, "step": 2960 }, { "epoch": 3.474794841735053, "grad_norm": 0.33984375, "learning_rate": 0.00015410518914501655, "loss": 0.5129, "step": 2964 }, { "epoch": 3.4794841735052753, "grad_norm": 0.33203125, "learning_rate": 0.00015378641009630242, "loss": 0.5582, "step": 2968 }, { "epoch": 3.484173505275498, "grad_norm": 0.341796875, "learning_rate": 0.000153467613934622, "loss": 0.5316, "step": 2972 }, { "epoch": 3.488862837045721, "grad_norm": 0.310546875, "learning_rate": 0.00015314880210079863, "loss": 0.5291, "step": 2976 }, { "epoch": 3.4935521688159437, "grad_norm": 0.333984375, "learning_rate": 0.00015282997603572639, "loss": 0.486, "step": 2980 }, { "epoch": 3.4982415005861665, "grad_norm": 0.318359375, "learning_rate": 0.00015251113718036378, "loss": 0.5083, "step": 2984 }, { "epoch": 3.5029308323563892, "grad_norm": 0.3125, "learning_rate": 0.0001521922869757271, "loss": 0.521, "step": 2988 }, { "epoch": 3.507620164126612, "grad_norm": 0.32421875, "learning_rate": 0.0001518734268628839, "loss": 0.5537, "step": 2992 }, { "epoch": 3.512309495896835, "grad_norm": 0.322265625, "learning_rate": 0.00015155455828294657, "loss": 0.4843, "step": 2996 }, { "epoch": 3.5169988276670576, "grad_norm": 0.32421875, "learning_rate": 0.00015123568267706575, "loss": 0.5133, "step": 3000 }, { "epoch": 3.5216881594372804, "grad_norm": 0.30859375, "learning_rate": 0.00015091680148642371, "loss": 0.4921, "step": 3004 }, { "epoch": 3.5263774912075028, "grad_norm": 0.328125, "learning_rate": 0.00015059791615222817, "loss": 0.5169, "step": 3008 }, { "epoch": 3.5310668229777256, "grad_norm": 0.322265625, "learning_rate": 0.00015027902811570544, "loss": 0.5067, "step": 3012 }, { "epoch": 3.5357561547479484, "grad_norm": 0.2890625, "learning_rate": 0.00014996013881809402, "loss": 0.5611, "step": 3016 }, { "epoch": 3.540445486518171, "grad_norm": 0.328125, "learning_rate": 0.00014964124970063829, "loss": 0.5201, "step": 3020 }, { "epoch": 3.545134818288394, "grad_norm": 0.328125, "learning_rate": 0.0001493223622045816, "loss": 0.5519, "step": 3024 }, { "epoch": 3.5498241500586167, "grad_norm": 0.328125, "learning_rate": 0.00014900347777116007, "loss": 0.5112, "step": 3028 }, { "epoch": 3.5545134818288395, "grad_norm": 0.32421875, "learning_rate": 0.00014868459784159603, "loss": 0.5378, "step": 3032 }, { "epoch": 3.559202813599062, "grad_norm": 0.328125, "learning_rate": 0.0001483657238570913, "loss": 0.4843, "step": 3036 }, { "epoch": 3.5638921453692847, "grad_norm": 0.318359375, "learning_rate": 0.00014804685725882104, "loss": 0.5048, "step": 3040 }, { "epoch": 3.5685814771395075, "grad_norm": 0.3046875, "learning_rate": 0.00014772799948792683, "loss": 0.4974, "step": 3044 }, { "epoch": 3.5732708089097303, "grad_norm": 0.328125, "learning_rate": 0.0001474091519855105, "loss": 0.5482, "step": 3048 }, { "epoch": 3.577960140679953, "grad_norm": 0.330078125, "learning_rate": 0.00014709031619262737, "loss": 0.4867, "step": 3052 }, { "epoch": 3.582649472450176, "grad_norm": 0.34375, "learning_rate": 0.00014677149355027985, "loss": 0.5304, "step": 3056 }, { "epoch": 3.5873388042203986, "grad_norm": 0.34375, "learning_rate": 0.00014645268549941107, "loss": 0.5533, "step": 3060 }, { "epoch": 3.5920281359906214, "grad_norm": 0.33203125, "learning_rate": 0.00014613389348089794, "loss": 0.5024, "step": 3064 }, { "epoch": 3.5967174677608442, "grad_norm": 0.298828125, "learning_rate": 0.0001458151189355451, "loss": 0.5124, "step": 3068 }, { "epoch": 3.601406799531067, "grad_norm": 0.31640625, "learning_rate": 0.00014549636330407823, "loss": 0.5064, "step": 3072 }, { "epoch": 3.60609613130129, "grad_norm": 0.337890625, "learning_rate": 0.0001451776280271374, "loss": 0.5421, "step": 3076 }, { "epoch": 3.610785463071512, "grad_norm": 0.318359375, "learning_rate": 0.00014485891454527083, "loss": 0.4855, "step": 3080 }, { "epoch": 3.615474794841735, "grad_norm": 0.306640625, "learning_rate": 0.0001445402242989281, "loss": 0.4824, "step": 3084 }, { "epoch": 3.6201641266119577, "grad_norm": 0.328125, "learning_rate": 0.00014422155872845387, "loss": 0.5056, "step": 3088 }, { "epoch": 3.6248534583821805, "grad_norm": 0.337890625, "learning_rate": 0.00014390291927408123, "loss": 0.4958, "step": 3092 }, { "epoch": 3.6295427901524033, "grad_norm": 0.3203125, "learning_rate": 0.00014358430737592534, "loss": 0.5354, "step": 3096 }, { "epoch": 3.634232121922626, "grad_norm": 0.34375, "learning_rate": 0.00014326572447397658, "loss": 0.4956, "step": 3100 }, { "epoch": 3.638921453692849, "grad_norm": 0.330078125, "learning_rate": 0.00014294717200809452, "loss": 0.5054, "step": 3104 }, { "epoch": 3.6436107854630713, "grad_norm": 0.326171875, "learning_rate": 0.0001426286514180011, "loss": 0.5112, "step": 3108 }, { "epoch": 3.648300117233294, "grad_norm": 0.3203125, "learning_rate": 0.00014231016414327407, "loss": 0.5005, "step": 3112 }, { "epoch": 3.652989449003517, "grad_norm": 0.306640625, "learning_rate": 0.00014199171162334077, "loss": 0.5444, "step": 3116 }, { "epoch": 3.6576787807737396, "grad_norm": 0.322265625, "learning_rate": 0.00014167329529747146, "loss": 0.5815, "step": 3120 }, { "epoch": 3.6623681125439624, "grad_norm": 0.330078125, "learning_rate": 0.0001413549166047727, "loss": 0.5558, "step": 3124 }, { "epoch": 3.6670574443141852, "grad_norm": 0.33203125, "learning_rate": 0.0001410365769841811, "loss": 0.5045, "step": 3128 }, { "epoch": 3.671746776084408, "grad_norm": 0.337890625, "learning_rate": 0.00014071827787445656, "loss": 0.5228, "step": 3132 }, { "epoch": 3.676436107854631, "grad_norm": 0.33203125, "learning_rate": 0.00014040002071417595, "loss": 0.5277, "step": 3136 }, { "epoch": 3.6811254396248536, "grad_norm": 0.310546875, "learning_rate": 0.00014008180694172645, "loss": 0.5202, "step": 3140 }, { "epoch": 3.6858147713950764, "grad_norm": 0.32421875, "learning_rate": 0.00013976363799529936, "loss": 0.5646, "step": 3144 }, { "epoch": 3.690504103165299, "grad_norm": 0.34765625, "learning_rate": 0.0001394455153128832, "loss": 0.5456, "step": 3148 }, { "epoch": 3.6951934349355215, "grad_norm": 0.37109375, "learning_rate": 0.0001391274403322574, "loss": 0.5319, "step": 3152 }, { "epoch": 3.6998827667057443, "grad_norm": 0.322265625, "learning_rate": 0.00013880941449098596, "loss": 0.5385, "step": 3156 }, { "epoch": 3.704572098475967, "grad_norm": 0.318359375, "learning_rate": 0.0001384914392264106, "loss": 0.5151, "step": 3160 }, { "epoch": 3.70926143024619, "grad_norm": 0.322265625, "learning_rate": 0.00013817351597564457, "loss": 0.5354, "step": 3164 }, { "epoch": 3.7139507620164127, "grad_norm": 0.31640625, "learning_rate": 0.00013785564617556603, "loss": 0.5217, "step": 3168 }, { "epoch": 3.7186400937866355, "grad_norm": 0.34375, "learning_rate": 0.00013753783126281145, "loss": 0.5383, "step": 3172 }, { "epoch": 3.7233294255568583, "grad_norm": 0.330078125, "learning_rate": 0.0001372200726737694, "loss": 0.4918, "step": 3176 }, { "epoch": 3.7280187573270807, "grad_norm": 0.302734375, "learning_rate": 0.00013690237184457377, "loss": 0.4809, "step": 3180 }, { "epoch": 3.7327080890973034, "grad_norm": 0.326171875, "learning_rate": 0.00013658473021109749, "loss": 0.5185, "step": 3184 }, { "epoch": 3.7373974208675262, "grad_norm": 0.337890625, "learning_rate": 0.00013626714920894587, "loss": 0.5666, "step": 3188 }, { "epoch": 3.742086752637749, "grad_norm": 0.349609375, "learning_rate": 0.00013594963027345022, "loss": 0.5398, "step": 3192 }, { "epoch": 3.746776084407972, "grad_norm": 0.3125, "learning_rate": 0.0001356321748396614, "loss": 0.553, "step": 3196 }, { "epoch": 3.7514654161781946, "grad_norm": 0.326171875, "learning_rate": 0.00013531478434234312, "loss": 0.5552, "step": 3200 }, { "epoch": 3.7561547479484174, "grad_norm": 0.326171875, "learning_rate": 0.00013499746021596582, "loss": 0.543, "step": 3204 }, { "epoch": 3.76084407971864, "grad_norm": 0.333984375, "learning_rate": 0.00013468020389469974, "loss": 0.5163, "step": 3208 }, { "epoch": 3.765533411488863, "grad_norm": 0.330078125, "learning_rate": 0.0001343630168124088, "loss": 0.5194, "step": 3212 }, { "epoch": 3.770222743259086, "grad_norm": 0.349609375, "learning_rate": 0.00013404590040264397, "loss": 0.525, "step": 3216 }, { "epoch": 3.7749120750293086, "grad_norm": 0.322265625, "learning_rate": 0.0001337288560986368, "loss": 0.4793, "step": 3220 }, { "epoch": 3.779601406799531, "grad_norm": 0.318359375, "learning_rate": 0.000133411885333293, "loss": 0.5353, "step": 3224 }, { "epoch": 3.7842907385697537, "grad_norm": 0.333984375, "learning_rate": 0.00013309498953918583, "loss": 0.5114, "step": 3228 }, { "epoch": 3.7889800703399765, "grad_norm": 0.345703125, "learning_rate": 0.0001327781701485498, "loss": 0.5858, "step": 3232 }, { "epoch": 3.7936694021101993, "grad_norm": 0.34375, "learning_rate": 0.00013246142859327402, "loss": 0.548, "step": 3236 }, { "epoch": 3.798358733880422, "grad_norm": 0.306640625, "learning_rate": 0.0001321447663048959, "loss": 0.4745, "step": 3240 }, { "epoch": 3.803048065650645, "grad_norm": 0.3359375, "learning_rate": 0.00013182818471459457, "loss": 0.4773, "step": 3244 }, { "epoch": 3.8077373974208673, "grad_norm": 0.326171875, "learning_rate": 0.00013151168525318436, "loss": 0.517, "step": 3248 }, { "epoch": 3.81242672919109, "grad_norm": 0.322265625, "learning_rate": 0.00013119526935110852, "loss": 0.5349, "step": 3252 }, { "epoch": 3.817116060961313, "grad_norm": 0.31640625, "learning_rate": 0.00013087893843843264, "loss": 0.5046, "step": 3256 }, { "epoch": 3.8218053927315356, "grad_norm": 0.33984375, "learning_rate": 0.00013056269394483814, "loss": 0.5361, "step": 3260 }, { "epoch": 3.8264947245017584, "grad_norm": 0.333984375, "learning_rate": 0.00013024653729961586, "loss": 0.5355, "step": 3264 }, { "epoch": 3.831184056271981, "grad_norm": 0.3203125, "learning_rate": 0.00012993046993165966, "loss": 0.5392, "step": 3268 }, { "epoch": 3.835873388042204, "grad_norm": 0.349609375, "learning_rate": 0.00012961449326945985, "loss": 0.5118, "step": 3272 }, { "epoch": 3.840562719812427, "grad_norm": 0.31640625, "learning_rate": 0.00012929860874109683, "loss": 0.5205, "step": 3276 }, { "epoch": 3.8452520515826496, "grad_norm": 0.357421875, "learning_rate": 0.00012898281777423465, "loss": 0.5297, "step": 3280 }, { "epoch": 3.8499413833528724, "grad_norm": 0.314453125, "learning_rate": 0.00012866712179611427, "loss": 0.5043, "step": 3284 }, { "epoch": 3.854630715123095, "grad_norm": 0.326171875, "learning_rate": 0.0001283515222335476, "loss": 0.5284, "step": 3288 }, { "epoch": 3.859320046893318, "grad_norm": 0.333984375, "learning_rate": 0.00012803602051291064, "loss": 0.5118, "step": 3292 }, { "epoch": 3.8640093786635403, "grad_norm": 0.328125, "learning_rate": 0.00012772061806013728, "loss": 0.5621, "step": 3296 }, { "epoch": 3.868698710433763, "grad_norm": 0.314453125, "learning_rate": 0.00012740531630071268, "loss": 0.5587, "step": 3300 }, { "epoch": 3.873388042203986, "grad_norm": 0.322265625, "learning_rate": 0.00012709011665966698, "loss": 0.4888, "step": 3304 }, { "epoch": 3.8780773739742087, "grad_norm": 0.3203125, "learning_rate": 0.00012677502056156878, "loss": 0.4617, "step": 3308 }, { "epoch": 3.8827667057444315, "grad_norm": 0.31640625, "learning_rate": 0.00012646002943051863, "loss": 0.5277, "step": 3312 }, { "epoch": 3.8874560375146543, "grad_norm": 0.310546875, "learning_rate": 0.0001261451446901428, "loss": 0.5163, "step": 3316 }, { "epoch": 3.8921453692848766, "grad_norm": 0.33984375, "learning_rate": 0.00012583036776358652, "loss": 0.4925, "step": 3320 }, { "epoch": 3.8968347010550994, "grad_norm": 0.34375, "learning_rate": 0.00012551570007350796, "loss": 0.5093, "step": 3324 }, { "epoch": 3.9015240328253222, "grad_norm": 0.33984375, "learning_rate": 0.0001252011430420715, "loss": 0.5028, "step": 3328 }, { "epoch": 3.906213364595545, "grad_norm": 0.3203125, "learning_rate": 0.0001248866980909414, "loss": 0.5412, "step": 3332 }, { "epoch": 3.910902696365768, "grad_norm": 0.326171875, "learning_rate": 0.00012457236664127535, "loss": 0.5431, "step": 3336 }, { "epoch": 3.9155920281359906, "grad_norm": 0.310546875, "learning_rate": 0.00012425815011371806, "loss": 0.5178, "step": 3340 }, { "epoch": 3.9202813599062134, "grad_norm": 0.310546875, "learning_rate": 0.00012394404992839485, "loss": 0.5413, "step": 3344 }, { "epoch": 3.924970691676436, "grad_norm": 0.3515625, "learning_rate": 0.0001236300675049052, "loss": 0.5313, "step": 3348 }, { "epoch": 3.929660023446659, "grad_norm": 0.337890625, "learning_rate": 0.0001233162042623165, "loss": 0.5314, "step": 3352 }, { "epoch": 3.934349355216882, "grad_norm": 0.31640625, "learning_rate": 0.0001230024616191572, "loss": 0.5232, "step": 3356 }, { "epoch": 3.9390386869871046, "grad_norm": 0.310546875, "learning_rate": 0.00012268884099341095, "loss": 0.5187, "step": 3360 }, { "epoch": 3.943728018757327, "grad_norm": 0.3125, "learning_rate": 0.00012237534380250985, "loss": 0.5218, "step": 3364 }, { "epoch": 3.9484173505275497, "grad_norm": 0.3359375, "learning_rate": 0.00012206197146332808, "loss": 0.5651, "step": 3368 }, { "epoch": 3.9531066822977725, "grad_norm": 0.353515625, "learning_rate": 0.00012174872539217565, "loss": 0.5458, "step": 3372 }, { "epoch": 3.9577960140679953, "grad_norm": 0.3203125, "learning_rate": 0.00012143560700479177, "loss": 0.5499, "step": 3376 }, { "epoch": 3.962485345838218, "grad_norm": 0.359375, "learning_rate": 0.00012112261771633866, "loss": 0.5396, "step": 3380 }, { "epoch": 3.967174677608441, "grad_norm": 0.328125, "learning_rate": 0.00012080975894139508, "loss": 0.5212, "step": 3384 }, { "epoch": 3.9718640093786637, "grad_norm": 0.3359375, "learning_rate": 0.00012049703209394983, "loss": 0.4536, "step": 3388 }, { "epoch": 3.976553341148886, "grad_norm": 0.333984375, "learning_rate": 0.00012018443858739554, "loss": 0.508, "step": 3392 }, { "epoch": 3.981242672919109, "grad_norm": 0.3359375, "learning_rate": 0.0001198719798345221, "loss": 0.5013, "step": 3396 }, { "epoch": 3.9859320046893316, "grad_norm": 0.361328125, "learning_rate": 0.00011955965724751048, "loss": 0.5184, "step": 3400 }, { "epoch": 3.9906213364595544, "grad_norm": 0.33984375, "learning_rate": 0.00011924747223792619, "loss": 0.5191, "step": 3404 }, { "epoch": 3.995310668229777, "grad_norm": 0.30859375, "learning_rate": 0.00011893542621671296, "loss": 0.5332, "step": 3408 }, { "epoch": 4.0, "grad_norm": 1.765625, "learning_rate": 0.00011862352059418636, "loss": 0.4709, "step": 3412 }, { "epoch": 4.004689331770223, "grad_norm": 0.30078125, "learning_rate": 0.00011831175678002737, "loss": 0.4506, "step": 3416 }, { "epoch": 4.009378663540446, "grad_norm": 0.3203125, "learning_rate": 0.00011800013618327605, "loss": 0.4412, "step": 3420 }, { "epoch": 4.014067995310668, "grad_norm": 0.345703125, "learning_rate": 0.00011768866021232528, "loss": 0.4716, "step": 3424 }, { "epoch": 4.018757327080891, "grad_norm": 0.3203125, "learning_rate": 0.00011737733027491427, "loss": 0.4189, "step": 3428 }, { "epoch": 4.023446658851114, "grad_norm": 0.318359375, "learning_rate": 0.00011706614777812204, "loss": 0.4458, "step": 3432 }, { "epoch": 4.028135990621337, "grad_norm": 0.31640625, "learning_rate": 0.00011675511412836145, "loss": 0.4492, "step": 3436 }, { "epoch": 4.0328253223915596, "grad_norm": 0.318359375, "learning_rate": 0.00011644423073137259, "loss": 0.4557, "step": 3440 }, { "epoch": 4.037514654161782, "grad_norm": 0.3125, "learning_rate": 0.00011613349899221641, "loss": 0.4215, "step": 3444 }, { "epoch": 4.042203985932004, "grad_norm": 0.345703125, "learning_rate": 0.00011582292031526844, "loss": 0.467, "step": 3448 }, { "epoch": 4.046893317702227, "grad_norm": 0.318359375, "learning_rate": 0.00011551249610421252, "loss": 0.4608, "step": 3452 }, { "epoch": 4.05158264947245, "grad_norm": 0.333984375, "learning_rate": 0.00011520222776203428, "loss": 0.487, "step": 3456 }, { "epoch": 4.056271981242673, "grad_norm": 0.34375, "learning_rate": 0.00011489211669101493, "loss": 0.4697, "step": 3460 }, { "epoch": 4.060961313012895, "grad_norm": 0.333984375, "learning_rate": 0.00011458216429272489, "loss": 0.4833, "step": 3464 }, { "epoch": 4.065650644783118, "grad_norm": 0.33203125, "learning_rate": 0.00011427237196801736, "loss": 0.4375, "step": 3468 }, { "epoch": 4.070339976553341, "grad_norm": 0.310546875, "learning_rate": 0.00011396274111702217, "loss": 0.4756, "step": 3472 }, { "epoch": 4.075029308323564, "grad_norm": 0.33203125, "learning_rate": 0.00011365327313913932, "loss": 0.4834, "step": 3476 }, { "epoch": 4.079718640093787, "grad_norm": 0.3359375, "learning_rate": 0.00011334396943303271, "loss": 0.4716, "step": 3480 }, { "epoch": 4.084407971864009, "grad_norm": 0.34375, "learning_rate": 0.00011303483139662382, "loss": 0.4264, "step": 3484 }, { "epoch": 4.089097303634232, "grad_norm": 0.349609375, "learning_rate": 0.00011272586042708535, "loss": 0.4405, "step": 3488 }, { "epoch": 4.093786635404455, "grad_norm": 0.318359375, "learning_rate": 0.00011241705792083484, "loss": 0.3871, "step": 3492 }, { "epoch": 4.098475967174678, "grad_norm": 0.333984375, "learning_rate": 0.00011210842527352861, "loss": 0.4416, "step": 3496 }, { "epoch": 4.103165298944901, "grad_norm": 0.326171875, "learning_rate": 0.00011179996388005524, "loss": 0.4453, "step": 3500 }, { "epoch": 4.107854630715123, "grad_norm": 0.32421875, "learning_rate": 0.0001114916751345292, "loss": 0.4967, "step": 3504 }, { "epoch": 4.112543962485346, "grad_norm": 0.330078125, "learning_rate": 0.00011118356043028476, "loss": 0.465, "step": 3508 }, { "epoch": 4.117233294255569, "grad_norm": 0.33984375, "learning_rate": 0.00011087562115986965, "loss": 0.4547, "step": 3512 }, { "epoch": 4.121922626025792, "grad_norm": 0.32421875, "learning_rate": 0.00011056785871503862, "loss": 0.4575, "step": 3516 }, { "epoch": 4.126611957796014, "grad_norm": 0.32421875, "learning_rate": 0.00011026027448674725, "loss": 0.441, "step": 3520 }, { "epoch": 4.131301289566236, "grad_norm": 0.337890625, "learning_rate": 0.00010995286986514571, "loss": 0.4565, "step": 3524 }, { "epoch": 4.135990621336459, "grad_norm": 0.34375, "learning_rate": 0.00010964564623957239, "loss": 0.4744, "step": 3528 }, { "epoch": 4.140679953106682, "grad_norm": 0.328125, "learning_rate": 0.00010933860499854768, "loss": 0.4447, "step": 3532 }, { "epoch": 4.145369284876905, "grad_norm": 0.359375, "learning_rate": 0.0001090317475297677, "loss": 0.5169, "step": 3536 }, { "epoch": 4.150058616647128, "grad_norm": 0.3359375, "learning_rate": 0.00010872507522009781, "loss": 0.461, "step": 3540 }, { "epoch": 4.15474794841735, "grad_norm": 0.333984375, "learning_rate": 0.00010841858945556677, "loss": 0.452, "step": 3544 }, { "epoch": 4.159437280187573, "grad_norm": 0.318359375, "learning_rate": 0.00010811229162136009, "loss": 0.4486, "step": 3548 }, { "epoch": 4.164126611957796, "grad_norm": 0.337890625, "learning_rate": 0.00010780618310181395, "loss": 0.4286, "step": 3552 }, { "epoch": 4.168815943728019, "grad_norm": 0.333984375, "learning_rate": 0.00010750026528040895, "loss": 0.4743, "step": 3556 }, { "epoch": 4.173505275498242, "grad_norm": 0.34375, "learning_rate": 0.00010719453953976375, "loss": 0.4637, "step": 3560 }, { "epoch": 4.178194607268464, "grad_norm": 0.3046875, "learning_rate": 0.00010688900726162899, "loss": 0.466, "step": 3564 }, { "epoch": 4.182883939038687, "grad_norm": 0.330078125, "learning_rate": 0.00010658366982688076, "loss": 0.4639, "step": 3568 }, { "epoch": 4.18757327080891, "grad_norm": 0.353515625, "learning_rate": 0.00010627852861551479, "loss": 0.4853, "step": 3572 }, { "epoch": 4.192262602579133, "grad_norm": 0.330078125, "learning_rate": 0.00010597358500663966, "loss": 0.4674, "step": 3576 }, { "epoch": 4.1969519343493555, "grad_norm": 0.318359375, "learning_rate": 0.00010566884037847111, "loss": 0.4648, "step": 3580 }, { "epoch": 4.201641266119578, "grad_norm": 0.326171875, "learning_rate": 0.0001053642961083255, "loss": 0.4708, "step": 3584 }, { "epoch": 4.206330597889801, "grad_norm": 0.337890625, "learning_rate": 0.00010505995357261364, "loss": 0.4416, "step": 3588 }, { "epoch": 4.211019929660023, "grad_norm": 0.35546875, "learning_rate": 0.00010475581414683466, "loss": 0.4991, "step": 3592 }, { "epoch": 4.215709261430246, "grad_norm": 0.3359375, "learning_rate": 0.00010445187920556956, "loss": 0.4488, "step": 3596 }, { "epoch": 4.220398593200469, "grad_norm": 0.34375, "learning_rate": 0.00010414815012247529, "loss": 0.4471, "step": 3600 }, { "epoch": 4.225087924970691, "grad_norm": 0.33984375, "learning_rate": 0.00010384462827027838, "loss": 0.4286, "step": 3604 }, { "epoch": 4.229777256740914, "grad_norm": 0.330078125, "learning_rate": 0.00010354131502076875, "loss": 0.4893, "step": 3608 }, { "epoch": 4.234466588511137, "grad_norm": 0.3671875, "learning_rate": 0.00010323821174479363, "loss": 0.4489, "step": 3612 }, { "epoch": 4.23915592028136, "grad_norm": 0.34765625, "learning_rate": 0.000102935319812251, "loss": 0.4349, "step": 3616 }, { "epoch": 4.243845252051583, "grad_norm": 0.33203125, "learning_rate": 0.0001026326405920839, "loss": 0.4235, "step": 3620 }, { "epoch": 4.248534583821805, "grad_norm": 0.3359375, "learning_rate": 0.00010233017545227389, "loss": 0.4619, "step": 3624 }, { "epoch": 4.253223915592028, "grad_norm": 0.33984375, "learning_rate": 0.00010202792575983502, "loss": 0.4251, "step": 3628 }, { "epoch": 4.257913247362251, "grad_norm": 0.328125, "learning_rate": 0.00010172589288080759, "loss": 0.4545, "step": 3632 }, { "epoch": 4.262602579132474, "grad_norm": 0.328125, "learning_rate": 0.00010142407818025201, "loss": 0.425, "step": 3636 }, { "epoch": 4.2672919109026966, "grad_norm": 0.3359375, "learning_rate": 0.00010112248302224263, "loss": 0.4444, "step": 3640 }, { "epoch": 4.271981242672919, "grad_norm": 0.353515625, "learning_rate": 0.00010082110876986147, "loss": 0.4439, "step": 3644 }, { "epoch": 4.276670574443142, "grad_norm": 0.376953125, "learning_rate": 0.00010051995678519231, "loss": 0.4381, "step": 3648 }, { "epoch": 4.281359906213365, "grad_norm": 0.359375, "learning_rate": 0.00010021902842931421, "loss": 0.4599, "step": 3652 }, { "epoch": 4.286049237983588, "grad_norm": 0.3359375, "learning_rate": 9.991832506229558e-05, "loss": 0.4676, "step": 3656 }, { "epoch": 4.29073856975381, "grad_norm": 0.341796875, "learning_rate": 9.961784804318803e-05, "loss": 0.4528, "step": 3660 }, { "epoch": 4.295427901524032, "grad_norm": 0.328125, "learning_rate": 9.931759873002012e-05, "loss": 0.4812, "step": 3664 }, { "epoch": 4.300117233294255, "grad_norm": 0.333984375, "learning_rate": 9.901757847979136e-05, "loss": 0.451, "step": 3668 }, { "epoch": 4.304806565064478, "grad_norm": 0.333984375, "learning_rate": 9.871778864846578e-05, "loss": 0.47, "step": 3672 }, { "epoch": 4.309495896834701, "grad_norm": 0.33203125, "learning_rate": 9.841823059096629e-05, "loss": 0.4477, "step": 3676 }, { "epoch": 4.314185228604924, "grad_norm": 0.33203125, "learning_rate": 9.811890566116806e-05, "loss": 0.4822, "step": 3680 }, { "epoch": 4.318874560375146, "grad_norm": 0.35546875, "learning_rate": 9.781981521189283e-05, "loss": 0.4965, "step": 3684 }, { "epoch": 4.323563892145369, "grad_norm": 0.333984375, "learning_rate": 9.75209605949023e-05, "loss": 0.4346, "step": 3688 }, { "epoch": 4.328253223915592, "grad_norm": 0.365234375, "learning_rate": 9.722234316089256e-05, "loss": 0.4458, "step": 3692 }, { "epoch": 4.332942555685815, "grad_norm": 0.341796875, "learning_rate": 9.692396425948768e-05, "loss": 0.4547, "step": 3696 }, { "epoch": 4.337631887456038, "grad_norm": 0.33203125, "learning_rate": 9.662582523923357e-05, "loss": 0.4411, "step": 3700 }, { "epoch": 4.34232121922626, "grad_norm": 0.330078125, "learning_rate": 9.632792744759207e-05, "loss": 0.4091, "step": 3704 }, { "epoch": 4.347010550996483, "grad_norm": 0.345703125, "learning_rate": 9.603027223093474e-05, "loss": 0.4727, "step": 3708 }, { "epoch": 4.351699882766706, "grad_norm": 0.33984375, "learning_rate": 9.573286093453682e-05, "loss": 0.4794, "step": 3712 }, { "epoch": 4.356389214536929, "grad_norm": 0.361328125, "learning_rate": 9.543569490257111e-05, "loss": 0.4868, "step": 3716 }, { "epoch": 4.3610785463071515, "grad_norm": 0.365234375, "learning_rate": 9.513877547810192e-05, "loss": 0.4537, "step": 3720 }, { "epoch": 4.365767878077374, "grad_norm": 0.3515625, "learning_rate": 9.484210400307903e-05, "loss": 0.414, "step": 3724 }, { "epoch": 4.370457209847597, "grad_norm": 0.322265625, "learning_rate": 9.454568181833151e-05, "loss": 0.445, "step": 3728 }, { "epoch": 4.37514654161782, "grad_norm": 0.345703125, "learning_rate": 9.424951026356183e-05, "loss": 0.4561, "step": 3732 }, { "epoch": 4.379835873388042, "grad_norm": 0.33984375, "learning_rate": 9.395359067733974e-05, "loss": 0.4429, "step": 3736 }, { "epoch": 4.384525205158265, "grad_norm": 0.349609375, "learning_rate": 9.365792439709609e-05, "loss": 0.4832, "step": 3740 }, { "epoch": 4.389214536928487, "grad_norm": 0.361328125, "learning_rate": 9.336251275911702e-05, "loss": 0.4795, "step": 3744 }, { "epoch": 4.39390386869871, "grad_norm": 0.33984375, "learning_rate": 9.306735709853765e-05, "loss": 0.4701, "step": 3748 }, { "epoch": 4.398593200468933, "grad_norm": 0.345703125, "learning_rate": 9.277245874933633e-05, "loss": 0.4642, "step": 3752 }, { "epoch": 4.403282532239156, "grad_norm": 0.37109375, "learning_rate": 9.247781904432847e-05, "loss": 0.4756, "step": 3756 }, { "epoch": 4.407971864009379, "grad_norm": 0.341796875, "learning_rate": 9.218343931516034e-05, "loss": 0.4885, "step": 3760 }, { "epoch": 4.412661195779601, "grad_norm": 0.33984375, "learning_rate": 9.188932089230338e-05, "loss": 0.4598, "step": 3764 }, { "epoch": 4.417350527549824, "grad_norm": 0.419921875, "learning_rate": 9.159546510504807e-05, "loss": 0.4661, "step": 3768 }, { "epoch": 4.422039859320047, "grad_norm": 0.359375, "learning_rate": 9.130187328149779e-05, "loss": 0.4818, "step": 3772 }, { "epoch": 4.42672919109027, "grad_norm": 0.337890625, "learning_rate": 9.100854674856293e-05, "loss": 0.4438, "step": 3776 }, { "epoch": 4.4314185228604925, "grad_norm": 0.3671875, "learning_rate": 9.071548683195495e-05, "loss": 0.4307, "step": 3780 }, { "epoch": 4.436107854630715, "grad_norm": 0.333984375, "learning_rate": 9.042269485618021e-05, "loss": 0.44, "step": 3784 }, { "epoch": 4.440797186400938, "grad_norm": 0.32421875, "learning_rate": 9.013017214453422e-05, "loss": 0.4822, "step": 3788 }, { "epoch": 4.445486518171161, "grad_norm": 0.349609375, "learning_rate": 8.983792001909543e-05, "loss": 0.5057, "step": 3792 }, { "epoch": 4.450175849941384, "grad_norm": 0.3515625, "learning_rate": 8.954593980071941e-05, "loss": 0.4424, "step": 3796 }, { "epoch": 4.4548651817116065, "grad_norm": 0.3671875, "learning_rate": 8.925423280903274e-05, "loss": 0.4472, "step": 3800 }, { "epoch": 4.459554513481828, "grad_norm": 0.34375, "learning_rate": 8.896280036242722e-05, "loss": 0.4449, "step": 3804 }, { "epoch": 4.464243845252051, "grad_norm": 0.34375, "learning_rate": 8.86716437780538e-05, "loss": 0.4707, "step": 3808 }, { "epoch": 4.468933177022274, "grad_norm": 0.345703125, "learning_rate": 8.838076437181663e-05, "loss": 0.4591, "step": 3812 }, { "epoch": 4.473622508792497, "grad_norm": 0.328125, "learning_rate": 8.80901634583672e-05, "loss": 0.4404, "step": 3816 }, { "epoch": 4.47831184056272, "grad_norm": 0.349609375, "learning_rate": 8.779984235109825e-05, "loss": 0.4562, "step": 3820 }, { "epoch": 4.483001172332942, "grad_norm": 0.357421875, "learning_rate": 8.750980236213792e-05, "loss": 0.4657, "step": 3824 }, { "epoch": 4.487690504103165, "grad_norm": 0.345703125, "learning_rate": 8.722004480234381e-05, "loss": 0.4363, "step": 3828 }, { "epoch": 4.492379835873388, "grad_norm": 0.333984375, "learning_rate": 8.693057098129729e-05, "loss": 0.468, "step": 3832 }, { "epoch": 4.497069167643611, "grad_norm": 0.35546875, "learning_rate": 8.664138220729686e-05, "loss": 0.5015, "step": 3836 }, { "epoch": 4.5017584994138335, "grad_norm": 0.361328125, "learning_rate": 8.63524797873532e-05, "loss": 0.4253, "step": 3840 }, { "epoch": 4.506447831184056, "grad_norm": 0.349609375, "learning_rate": 8.606386502718258e-05, "loss": 0.4468, "step": 3844 }, { "epoch": 4.511137162954279, "grad_norm": 0.34375, "learning_rate": 8.577553923120111e-05, "loss": 0.481, "step": 3848 }, { "epoch": 4.515826494724502, "grad_norm": 0.353515625, "learning_rate": 8.548750370251915e-05, "loss": 0.468, "step": 3852 }, { "epoch": 4.520515826494725, "grad_norm": 0.3359375, "learning_rate": 8.519975974293485e-05, "loss": 0.4859, "step": 3856 }, { "epoch": 4.5252051582649475, "grad_norm": 0.322265625, "learning_rate": 8.49123086529289e-05, "loss": 0.3966, "step": 3860 }, { "epoch": 4.52989449003517, "grad_norm": 0.33203125, "learning_rate": 8.462515173165817e-05, "loss": 0.4594, "step": 3864 }, { "epoch": 4.534583821805393, "grad_norm": 0.349609375, "learning_rate": 8.433829027695e-05, "loss": 0.4901, "step": 3868 }, { "epoch": 4.539273153575616, "grad_norm": 0.3359375, "learning_rate": 8.405172558529643e-05, "loss": 0.4108, "step": 3872 }, { "epoch": 4.543962485345839, "grad_norm": 0.328125, "learning_rate": 8.376545895184815e-05, "loss": 0.4007, "step": 3876 }, { "epoch": 4.548651817116061, "grad_norm": 0.345703125, "learning_rate": 8.347949167040894e-05, "loss": 0.4615, "step": 3880 }, { "epoch": 4.553341148886283, "grad_norm": 0.341796875, "learning_rate": 8.319382503342938e-05, "loss": 0.4131, "step": 3884 }, { "epoch": 4.558030480656506, "grad_norm": 0.357421875, "learning_rate": 8.290846033200158e-05, "loss": 0.4706, "step": 3888 }, { "epoch": 4.562719812426729, "grad_norm": 0.3359375, "learning_rate": 8.262339885585274e-05, "loss": 0.4386, "step": 3892 }, { "epoch": 4.567409144196952, "grad_norm": 0.349609375, "learning_rate": 8.233864189333967e-05, "loss": 0.4437, "step": 3896 }, { "epoch": 4.572098475967175, "grad_norm": 0.34765625, "learning_rate": 8.20541907314431e-05, "loss": 0.4487, "step": 3900 }, { "epoch": 4.576787807737397, "grad_norm": 0.33203125, "learning_rate": 8.177004665576147e-05, "loss": 0.4597, "step": 3904 }, { "epoch": 4.58147713950762, "grad_norm": 0.345703125, "learning_rate": 8.148621095050537e-05, "loss": 0.4854, "step": 3908 }, { "epoch": 4.586166471277843, "grad_norm": 0.3359375, "learning_rate": 8.120268489849164e-05, "loss": 0.4578, "step": 3912 }, { "epoch": 4.590855803048066, "grad_norm": 0.353515625, "learning_rate": 8.091946978113782e-05, "loss": 0.4828, "step": 3916 }, { "epoch": 4.5955451348182885, "grad_norm": 0.3515625, "learning_rate": 8.063656687845592e-05, "loss": 0.4351, "step": 3920 }, { "epoch": 4.600234466588511, "grad_norm": 0.349609375, "learning_rate": 8.035397746904695e-05, "loss": 0.4401, "step": 3924 }, { "epoch": 4.604923798358734, "grad_norm": 0.341796875, "learning_rate": 8.007170283009517e-05, "loss": 0.4649, "step": 3928 }, { "epoch": 4.609613130128957, "grad_norm": 0.34375, "learning_rate": 7.978974423736202e-05, "loss": 0.4128, "step": 3932 }, { "epoch": 4.61430246189918, "grad_norm": 0.34765625, "learning_rate": 7.950810296518076e-05, "loss": 0.4591, "step": 3936 }, { "epoch": 4.6189917936694025, "grad_norm": 0.349609375, "learning_rate": 7.922678028645032e-05, "loss": 0.5247, "step": 3940 }, { "epoch": 4.623681125439624, "grad_norm": 0.359375, "learning_rate": 7.89457774726298e-05, "loss": 0.4786, "step": 3944 }, { "epoch": 4.628370457209847, "grad_norm": 0.326171875, "learning_rate": 7.866509579373261e-05, "loss": 0.4466, "step": 3948 }, { "epoch": 4.63305978898007, "grad_norm": 0.34375, "learning_rate": 7.838473651832077e-05, "loss": 0.4672, "step": 3952 }, { "epoch": 4.637749120750293, "grad_norm": 0.341796875, "learning_rate": 7.810470091349925e-05, "loss": 0.454, "step": 3956 }, { "epoch": 4.642438452520516, "grad_norm": 0.32421875, "learning_rate": 7.782499024491004e-05, "loss": 0.4429, "step": 3960 }, { "epoch": 4.647127784290738, "grad_norm": 0.37109375, "learning_rate": 7.754560577672674e-05, "loss": 0.4261, "step": 3964 }, { "epoch": 4.651817116060961, "grad_norm": 0.353515625, "learning_rate": 7.726654877164847e-05, "loss": 0.4814, "step": 3968 }, { "epoch": 4.656506447831184, "grad_norm": 0.328125, "learning_rate": 7.698782049089438e-05, "loss": 0.4318, "step": 3972 }, { "epoch": 4.661195779601407, "grad_norm": 0.33984375, "learning_rate": 7.67094221941981e-05, "loss": 0.4282, "step": 3976 }, { "epoch": 4.6658851113716295, "grad_norm": 0.341796875, "learning_rate": 7.64313551398017e-05, "loss": 0.4523, "step": 3980 }, { "epoch": 4.670574443141852, "grad_norm": 0.34375, "learning_rate": 7.615362058445022e-05, "loss": 0.4633, "step": 3984 }, { "epoch": 4.675263774912075, "grad_norm": 0.337890625, "learning_rate": 7.587621978338586e-05, "loss": 0.4672, "step": 3988 }, { "epoch": 4.679953106682298, "grad_norm": 0.33203125, "learning_rate": 7.559915399034266e-05, "loss": 0.4357, "step": 3992 }, { "epoch": 4.684642438452521, "grad_norm": 0.33984375, "learning_rate": 7.532242445754029e-05, "loss": 0.4189, "step": 3996 }, { "epoch": 4.6893317702227435, "grad_norm": 0.353515625, "learning_rate": 7.504603243567874e-05, "loss": 0.4741, "step": 4000 }, { "epoch": 4.694021101992966, "grad_norm": 0.357421875, "learning_rate": 7.476997917393269e-05, "loss": 0.4782, "step": 4004 }, { "epoch": 4.698710433763189, "grad_norm": 0.337890625, "learning_rate": 7.449426591994565e-05, "loss": 0.4787, "step": 4008 }, { "epoch": 4.703399765533412, "grad_norm": 0.36328125, "learning_rate": 7.421889391982454e-05, "loss": 0.4287, "step": 4012 }, { "epoch": 4.708089097303635, "grad_norm": 0.35546875, "learning_rate": 7.394386441813388e-05, "loss": 0.4493, "step": 4016 }, { "epoch": 4.7127784290738575, "grad_norm": 0.33203125, "learning_rate": 7.366917865789027e-05, "loss": 0.4431, "step": 4020 }, { "epoch": 4.717467760844079, "grad_norm": 0.32421875, "learning_rate": 7.339483788055672e-05, "loss": 0.3989, "step": 4024 }, { "epoch": 4.722157092614302, "grad_norm": 0.359375, "learning_rate": 7.312084332603706e-05, "loss": 0.4511, "step": 4028 }, { "epoch": 4.726846424384525, "grad_norm": 0.375, "learning_rate": 7.284719623267044e-05, "loss": 0.4183, "step": 4032 }, { "epoch": 4.731535756154748, "grad_norm": 0.333984375, "learning_rate": 7.257389783722548e-05, "loss": 0.4115, "step": 4036 }, { "epoch": 4.7362250879249705, "grad_norm": 0.341796875, "learning_rate": 7.2300949374895e-05, "loss": 0.4635, "step": 4040 }, { "epoch": 4.740914419695193, "grad_norm": 0.333984375, "learning_rate": 7.202835207929014e-05, "loss": 0.4566, "step": 4044 }, { "epoch": 4.745603751465416, "grad_norm": 0.32421875, "learning_rate": 7.175610718243493e-05, "loss": 0.4718, "step": 4048 }, { "epoch": 4.750293083235639, "grad_norm": 0.33984375, "learning_rate": 7.148421591476086e-05, "loss": 0.4123, "step": 4052 }, { "epoch": 4.754982415005862, "grad_norm": 0.33984375, "learning_rate": 7.121267950510082e-05, "loss": 0.4439, "step": 4056 }, { "epoch": 4.7596717467760845, "grad_norm": 0.337890625, "learning_rate": 7.094149918068432e-05, "loss": 0.4509, "step": 4060 }, { "epoch": 4.764361078546307, "grad_norm": 0.326171875, "learning_rate": 7.067067616713117e-05, "loss": 0.4291, "step": 4064 }, { "epoch": 4.76905041031653, "grad_norm": 0.32421875, "learning_rate": 7.040021168844653e-05, "loss": 0.4565, "step": 4068 }, { "epoch": 4.773739742086753, "grad_norm": 0.357421875, "learning_rate": 7.013010696701502e-05, "loss": 0.4885, "step": 4072 }, { "epoch": 4.778429073856976, "grad_norm": 0.333984375, "learning_rate": 6.986036322359522e-05, "loss": 0.3944, "step": 4076 }, { "epoch": 4.7831184056271985, "grad_norm": 0.3515625, "learning_rate": 6.959098167731447e-05, "loss": 0.4708, "step": 4080 }, { "epoch": 4.78780773739742, "grad_norm": 0.353515625, "learning_rate": 6.93219635456629e-05, "loss": 0.4989, "step": 4084 }, { "epoch": 4.792497069167643, "grad_norm": 0.3359375, "learning_rate": 6.905331004448843e-05, "loss": 0.4702, "step": 4088 }, { "epoch": 4.797186400937866, "grad_norm": 0.353515625, "learning_rate": 6.878502238799062e-05, "loss": 0.4528, "step": 4092 }, { "epoch": 4.801875732708089, "grad_norm": 0.341796875, "learning_rate": 6.851710178871596e-05, "loss": 0.4384, "step": 4096 }, { "epoch": 4.8065650644783116, "grad_norm": 0.337890625, "learning_rate": 6.824954945755177e-05, "loss": 0.4377, "step": 4100 }, { "epoch": 4.811254396248534, "grad_norm": 0.333984375, "learning_rate": 6.798236660372095e-05, "loss": 0.4406, "step": 4104 }, { "epoch": 4.815943728018757, "grad_norm": 0.345703125, "learning_rate": 6.77155544347767e-05, "loss": 0.4855, "step": 4108 }, { "epoch": 4.82063305978898, "grad_norm": 0.357421875, "learning_rate": 6.744911415659665e-05, "loss": 0.4849, "step": 4112 }, { "epoch": 4.825322391559203, "grad_norm": 0.35546875, "learning_rate": 6.718304697337785e-05, "loss": 0.4904, "step": 4116 }, { "epoch": 4.8300117233294255, "grad_norm": 0.3515625, "learning_rate": 6.691735408763097e-05, "loss": 0.4598, "step": 4120 }, { "epoch": 4.834701055099648, "grad_norm": 0.34375, "learning_rate": 6.66520367001751e-05, "loss": 0.4602, "step": 4124 }, { "epoch": 4.839390386869871, "grad_norm": 0.345703125, "learning_rate": 6.638709601013215e-05, "loss": 0.4598, "step": 4128 }, { "epoch": 4.844079718640094, "grad_norm": 0.365234375, "learning_rate": 6.612253321492157e-05, "loss": 0.438, "step": 4132 }, { "epoch": 4.848769050410317, "grad_norm": 0.349609375, "learning_rate": 6.585834951025496e-05, "loss": 0.4723, "step": 4136 }, { "epoch": 4.8534583821805395, "grad_norm": 0.376953125, "learning_rate": 6.559454609013043e-05, "loss": 0.4737, "step": 4140 }, { "epoch": 4.858147713950762, "grad_norm": 0.359375, "learning_rate": 6.533112414682754e-05, "loss": 0.4772, "step": 4144 }, { "epoch": 4.862837045720985, "grad_norm": 0.34765625, "learning_rate": 6.506808487090163e-05, "loss": 0.4312, "step": 4148 }, { "epoch": 4.867526377491208, "grad_norm": 0.349609375, "learning_rate": 6.48054294511785e-05, "loss": 0.4765, "step": 4152 }, { "epoch": 4.872215709261431, "grad_norm": 0.359375, "learning_rate": 6.454315907474926e-05, "loss": 0.4566, "step": 4156 }, { "epoch": 4.8769050410316535, "grad_norm": 0.337890625, "learning_rate": 6.428127492696454e-05, "loss": 0.4499, "step": 4160 }, { "epoch": 4.881594372801875, "grad_norm": 0.349609375, "learning_rate": 6.401977819142972e-05, "loss": 0.4875, "step": 4164 }, { "epoch": 4.886283704572098, "grad_norm": 0.357421875, "learning_rate": 6.375867004999882e-05, "loss": 0.4595, "step": 4168 }, { "epoch": 4.890973036342321, "grad_norm": 0.353515625, "learning_rate": 6.349795168276994e-05, "loss": 0.4624, "step": 4172 }, { "epoch": 4.895662368112544, "grad_norm": 0.36328125, "learning_rate": 6.323762426807939e-05, "loss": 0.4611, "step": 4176 }, { "epoch": 4.9003516998827665, "grad_norm": 0.328125, "learning_rate": 6.297768898249649e-05, "loss": 0.4109, "step": 4180 }, { "epoch": 4.905041031652989, "grad_norm": 0.33984375, "learning_rate": 6.271814700081852e-05, "loss": 0.4552, "step": 4184 }, { "epoch": 4.909730363423212, "grad_norm": 0.33203125, "learning_rate": 6.245899949606498e-05, "loss": 0.4127, "step": 4188 }, { "epoch": 4.914419695193435, "grad_norm": 0.34765625, "learning_rate": 6.220024763947263e-05, "loss": 0.4686, "step": 4192 }, { "epoch": 4.919109026963658, "grad_norm": 0.365234375, "learning_rate": 6.194189260049003e-05, "loss": 0.4711, "step": 4196 }, { "epoch": 4.9237983587338805, "grad_norm": 0.328125, "learning_rate": 6.168393554677224e-05, "loss": 0.4401, "step": 4200 }, { "epoch": 4.928487690504103, "grad_norm": 0.34765625, "learning_rate": 6.142637764417566e-05, "loss": 0.4706, "step": 4204 }, { "epoch": 4.933177022274326, "grad_norm": 0.36328125, "learning_rate": 6.116922005675262e-05, "loss": 0.4727, "step": 4208 }, { "epoch": 4.937866354044549, "grad_norm": 0.337890625, "learning_rate": 6.0912463946746346e-05, "loss": 0.4737, "step": 4212 }, { "epoch": 4.942555685814772, "grad_norm": 0.337890625, "learning_rate": 6.065611047458538e-05, "loss": 0.4833, "step": 4216 }, { "epoch": 4.9472450175849945, "grad_norm": 0.330078125, "learning_rate": 6.04001607988787e-05, "loss": 0.4372, "step": 4220 }, { "epoch": 4.951934349355217, "grad_norm": 0.35546875, "learning_rate": 6.0144616076410114e-05, "loss": 0.4829, "step": 4224 }, { "epoch": 4.956623681125439, "grad_norm": 0.33984375, "learning_rate": 5.9889477462133234e-05, "loss": 0.428, "step": 4228 }, { "epoch": 4.961313012895662, "grad_norm": 0.361328125, "learning_rate": 5.963474610916643e-05, "loss": 0.4963, "step": 4232 }, { "epoch": 4.966002344665885, "grad_norm": 0.359375, "learning_rate": 5.938042316878719e-05, "loss": 0.5161, "step": 4236 }, { "epoch": 4.9706916764361075, "grad_norm": 0.373046875, "learning_rate": 5.912650979042729e-05, "loss": 0.4917, "step": 4240 }, { "epoch": 4.97538100820633, "grad_norm": 0.36328125, "learning_rate": 5.8873007121667314e-05, "loss": 0.4658, "step": 4244 }, { "epoch": 4.980070339976553, "grad_norm": 0.34765625, "learning_rate": 5.861991630823185e-05, "loss": 0.4614, "step": 4248 }, { "epoch": 4.984759671746776, "grad_norm": 0.337890625, "learning_rate": 5.8367238493983885e-05, "loss": 0.428, "step": 4252 }, { "epoch": 4.989449003516999, "grad_norm": 0.333984375, "learning_rate": 5.81149748209198e-05, "loss": 0.4652, "step": 4256 }, { "epoch": 4.9941383352872215, "grad_norm": 0.357421875, "learning_rate": 5.7863126429164445e-05, "loss": 0.4367, "step": 4260 }, { "epoch": 4.998827667057444, "grad_norm": 0.328125, "learning_rate": 5.761169445696552e-05, "loss": 0.4746, "step": 4264 }, { "epoch": 5.003516998827667, "grad_norm": 0.306640625, "learning_rate": 5.7360680040688915e-05, "loss": 0.5014, "step": 4268 }, { "epoch": 5.00820633059789, "grad_norm": 0.3046875, "learning_rate": 5.711008431481318e-05, "loss": 0.4015, "step": 4272 }, { "epoch": 5.012895662368113, "grad_norm": 0.298828125, "learning_rate": 5.6859908411924634e-05, "loss": 0.3825, "step": 4276 }, { "epoch": 5.0175849941383355, "grad_norm": 0.34765625, "learning_rate": 5.6610153462712144e-05, "loss": 0.4382, "step": 4280 }, { "epoch": 5.022274325908558, "grad_norm": 0.333984375, "learning_rate": 5.6360820595962e-05, "loss": 0.4462, "step": 4284 }, { "epoch": 5.026963657678781, "grad_norm": 0.349609375, "learning_rate": 5.611191093855304e-05, "loss": 0.4038, "step": 4288 }, { "epoch": 5.031652989449004, "grad_norm": 0.345703125, "learning_rate": 5.5863425615451144e-05, "loss": 0.434, "step": 4292 }, { "epoch": 5.036342321219227, "grad_norm": 0.318359375, "learning_rate": 5.5615365749704586e-05, "loss": 0.4023, "step": 4296 }, { "epoch": 5.041031652989449, "grad_norm": 0.341796875, "learning_rate": 5.536773246243861e-05, "loss": 0.3909, "step": 4300 }, { "epoch": 5.045720984759671, "grad_norm": 0.359375, "learning_rate": 5.512052687285052e-05, "loss": 0.4292, "step": 4304 }, { "epoch": 5.050410316529894, "grad_norm": 0.3203125, "learning_rate": 5.487375009820477e-05, "loss": 0.4089, "step": 4308 }, { "epoch": 5.055099648300117, "grad_norm": 0.34765625, "learning_rate": 5.4627403253827436e-05, "loss": 0.442, "step": 4312 }, { "epoch": 5.05978898007034, "grad_norm": 0.3359375, "learning_rate": 5.438148745310182e-05, "loss": 0.4045, "step": 4316 }, { "epoch": 5.0644783118405625, "grad_norm": 0.330078125, "learning_rate": 5.413600380746286e-05, "loss": 0.3942, "step": 4320 }, { "epoch": 5.069167643610785, "grad_norm": 0.3359375, "learning_rate": 5.3890953426392544e-05, "loss": 0.3925, "step": 4324 }, { "epoch": 5.073856975381008, "grad_norm": 0.35546875, "learning_rate": 5.364633741741448e-05, "loss": 0.4398, "step": 4328 }, { "epoch": 5.078546307151231, "grad_norm": 0.359375, "learning_rate": 5.340215688608918e-05, "loss": 0.4871, "step": 4332 }, { "epoch": 5.083235638921454, "grad_norm": 0.365234375, "learning_rate": 5.315841293600906e-05, "loss": 0.4335, "step": 4336 }, { "epoch": 5.0879249706916765, "grad_norm": 0.33203125, "learning_rate": 5.2915106668793214e-05, "loss": 0.4057, "step": 4340 }, { "epoch": 5.092614302461899, "grad_norm": 0.357421875, "learning_rate": 5.2672239184082845e-05, "loss": 0.4184, "step": 4344 }, { "epoch": 5.097303634232122, "grad_norm": 0.3359375, "learning_rate": 5.242981157953567e-05, "loss": 0.399, "step": 4348 }, { "epoch": 5.101992966002345, "grad_norm": 0.341796875, "learning_rate": 5.2187824950821725e-05, "loss": 0.4237, "step": 4352 }, { "epoch": 5.106682297772568, "grad_norm": 0.357421875, "learning_rate": 5.194628039161778e-05, "loss": 0.454, "step": 4356 }, { "epoch": 5.1113716295427905, "grad_norm": 0.3671875, "learning_rate": 5.170517899360267e-05, "loss": 0.4397, "step": 4360 }, { "epoch": 5.116060961313013, "grad_norm": 0.3515625, "learning_rate": 5.1464521846452464e-05, "loss": 0.4265, "step": 4364 }, { "epoch": 5.120750293083236, "grad_norm": 0.35546875, "learning_rate": 5.12243100378352e-05, "loss": 0.4019, "step": 4368 }, { "epoch": 5.125439624853458, "grad_norm": 0.3359375, "learning_rate": 5.098454465340638e-05, "loss": 0.3929, "step": 4372 }, { "epoch": 5.130128956623681, "grad_norm": 0.337890625, "learning_rate": 5.074522677680372e-05, "loss": 0.416, "step": 4376 }, { "epoch": 5.1348182883939035, "grad_norm": 0.34375, "learning_rate": 5.050635748964239e-05, "loss": 0.4315, "step": 4380 }, { "epoch": 5.139507620164126, "grad_norm": 0.341796875, "learning_rate": 5.0267937871510304e-05, "loss": 0.3818, "step": 4384 }, { "epoch": 5.144196951934349, "grad_norm": 0.33984375, "learning_rate": 5.0029968999962726e-05, "loss": 0.3975, "step": 4388 }, { "epoch": 5.148886283704572, "grad_norm": 0.357421875, "learning_rate": 4.97924519505181e-05, "loss": 0.4051, "step": 4392 }, { "epoch": 5.153575615474795, "grad_norm": 0.330078125, "learning_rate": 4.955538779665256e-05, "loss": 0.3967, "step": 4396 }, { "epoch": 5.1582649472450175, "grad_norm": 0.328125, "learning_rate": 4.9318777609795536e-05, "loss": 0.4136, "step": 4400 }, { "epoch": 5.16295427901524, "grad_norm": 0.34765625, "learning_rate": 4.90826224593246e-05, "loss": 0.4419, "step": 4404 }, { "epoch": 5.167643610785463, "grad_norm": 0.349609375, "learning_rate": 4.884692341256072e-05, "loss": 0.45, "step": 4408 }, { "epoch": 5.172332942555686, "grad_norm": 0.330078125, "learning_rate": 4.8611681534763635e-05, "loss": 0.389, "step": 4412 }, { "epoch": 5.177022274325909, "grad_norm": 0.3515625, "learning_rate": 4.837689788912667e-05, "loss": 0.4161, "step": 4416 }, { "epoch": 5.1817116060961315, "grad_norm": 0.34375, "learning_rate": 4.814257353677241e-05, "loss": 0.3891, "step": 4420 }, { "epoch": 5.186400937866354, "grad_norm": 0.35546875, "learning_rate": 4.7908709536747224e-05, "loss": 0.4633, "step": 4424 }, { "epoch": 5.191090269636577, "grad_norm": 0.333984375, "learning_rate": 4.7675306946017296e-05, "loss": 0.4279, "step": 4428 }, { "epoch": 5.1957796014068, "grad_norm": 0.357421875, "learning_rate": 4.74423668194632e-05, "loss": 0.4269, "step": 4432 }, { "epoch": 5.200468933177023, "grad_norm": 0.330078125, "learning_rate": 4.720989020987535e-05, "loss": 0.4207, "step": 4436 }, { "epoch": 5.205158264947245, "grad_norm": 0.3515625, "learning_rate": 4.697787816794947e-05, "loss": 0.492, "step": 4440 }, { "epoch": 5.209847596717467, "grad_norm": 0.369140625, "learning_rate": 4.674633174228138e-05, "loss": 0.4622, "step": 4444 }, { "epoch": 5.21453692848769, "grad_norm": 0.34375, "learning_rate": 4.651525197936275e-05, "loss": 0.4413, "step": 4448 }, { "epoch": 5.219226260257913, "grad_norm": 0.357421875, "learning_rate": 4.6284639923575934e-05, "loss": 0.4275, "step": 4452 }, { "epoch": 5.223915592028136, "grad_norm": 0.341796875, "learning_rate": 4.6054496617189554e-05, "loss": 0.3943, "step": 4456 }, { "epoch": 5.2286049237983585, "grad_norm": 0.345703125, "learning_rate": 4.582482310035365e-05, "loss": 0.4183, "step": 4460 }, { "epoch": 5.233294255568581, "grad_norm": 0.349609375, "learning_rate": 4.559562041109499e-05, "loss": 0.439, "step": 4464 }, { "epoch": 5.237983587338804, "grad_norm": 0.34765625, "learning_rate": 4.53668895853125e-05, "loss": 0.4362, "step": 4468 }, { "epoch": 5.242672919109027, "grad_norm": 0.33203125, "learning_rate": 4.5138631656772346e-05, "loss": 0.4021, "step": 4472 }, { "epoch": 5.24736225087925, "grad_norm": 0.359375, "learning_rate": 4.4910847657103555e-05, "loss": 0.4298, "step": 4476 }, { "epoch": 5.2520515826494725, "grad_norm": 0.353515625, "learning_rate": 4.468353861579306e-05, "loss": 0.3793, "step": 4480 }, { "epoch": 5.256740914419695, "grad_norm": 0.3359375, "learning_rate": 4.44567055601812e-05, "loss": 0.3934, "step": 4484 }, { "epoch": 5.261430246189918, "grad_norm": 0.361328125, "learning_rate": 4.423034951545718e-05, "loss": 0.3962, "step": 4488 }, { "epoch": 5.266119577960141, "grad_norm": 0.341796875, "learning_rate": 4.4004471504654196e-05, "loss": 0.4206, "step": 4492 }, { "epoch": 5.270808909730364, "grad_norm": 0.330078125, "learning_rate": 4.377907254864496e-05, "loss": 0.3844, "step": 4496 }, { "epoch": 5.275498241500586, "grad_norm": 0.345703125, "learning_rate": 4.355415366613702e-05, "loss": 0.4442, "step": 4500 }, { "epoch": 5.280187573270809, "grad_norm": 0.341796875, "learning_rate": 4.332971587366837e-05, "loss": 0.4124, "step": 4504 }, { "epoch": 5.284876905041032, "grad_norm": 0.34375, "learning_rate": 4.3105760185602476e-05, "loss": 0.443, "step": 4508 }, { "epoch": 5.289566236811254, "grad_norm": 0.37890625, "learning_rate": 4.2882287614123965e-05, "loss": 0.4309, "step": 4512 }, { "epoch": 5.294255568581477, "grad_norm": 0.353515625, "learning_rate": 4.2659299169234056e-05, "loss": 0.4248, "step": 4516 }, { "epoch": 5.2989449003516995, "grad_norm": 0.3671875, "learning_rate": 4.24367958587458e-05, "loss": 0.4564, "step": 4520 }, { "epoch": 5.303634232121922, "grad_norm": 0.349609375, "learning_rate": 4.221477868827978e-05, "loss": 0.4004, "step": 4524 }, { "epoch": 5.308323563892145, "grad_norm": 0.36328125, "learning_rate": 4.1993248661259324e-05, "loss": 0.4194, "step": 4528 }, { "epoch": 5.313012895662368, "grad_norm": 0.34375, "learning_rate": 4.1772206778906104e-05, "loss": 0.3892, "step": 4532 }, { "epoch": 5.317702227432591, "grad_norm": 0.359375, "learning_rate": 4.155165404023561e-05, "loss": 0.415, "step": 4536 }, { "epoch": 5.3223915592028135, "grad_norm": 0.365234375, "learning_rate": 4.1331591442052534e-05, "loss": 0.4516, "step": 4540 }, { "epoch": 5.327080890973036, "grad_norm": 0.369140625, "learning_rate": 4.111201997894651e-05, "loss": 0.3757, "step": 4544 }, { "epoch": 5.331770222743259, "grad_norm": 0.3359375, "learning_rate": 4.089294064328725e-05, "loss": 0.3921, "step": 4548 }, { "epoch": 5.336459554513482, "grad_norm": 0.373046875, "learning_rate": 4.067435442522043e-05, "loss": 0.4161, "step": 4552 }, { "epoch": 5.341148886283705, "grad_norm": 0.3671875, "learning_rate": 4.045626231266294e-05, "loss": 0.455, "step": 4556 }, { "epoch": 5.3458382180539274, "grad_norm": 0.33984375, "learning_rate": 4.023866529129848e-05, "loss": 0.424, "step": 4560 }, { "epoch": 5.35052754982415, "grad_norm": 0.345703125, "learning_rate": 4.002156434457333e-05, "loss": 0.4199, "step": 4564 }, { "epoch": 5.355216881594373, "grad_norm": 0.373046875, "learning_rate": 3.980496045369155e-05, "loss": 0.4172, "step": 4568 }, { "epoch": 5.359906213364596, "grad_norm": 0.337890625, "learning_rate": 3.95888545976108e-05, "loss": 0.4208, "step": 4572 }, { "epoch": 5.364595545134819, "grad_norm": 0.3359375, "learning_rate": 3.937324775303773e-05, "loss": 0.3733, "step": 4576 }, { "epoch": 5.369284876905041, "grad_norm": 0.369140625, "learning_rate": 3.915814089442388e-05, "loss": 0.4418, "step": 4580 }, { "epoch": 5.373974208675264, "grad_norm": 0.380859375, "learning_rate": 3.894353499396086e-05, "loss": 0.4322, "step": 4584 }, { "epoch": 5.378663540445486, "grad_norm": 0.337890625, "learning_rate": 3.872943102157622e-05, "loss": 0.4462, "step": 4588 }, { "epoch": 5.383352872215709, "grad_norm": 0.345703125, "learning_rate": 3.851582994492912e-05, "loss": 0.4307, "step": 4592 }, { "epoch": 5.388042203985932, "grad_norm": 0.337890625, "learning_rate": 3.830273272940564e-05, "loss": 0.4003, "step": 4596 }, { "epoch": 5.3927315357561545, "grad_norm": 0.35546875, "learning_rate": 3.8090140338114843e-05, "loss": 0.4188, "step": 4600 }, { "epoch": 5.397420867526377, "grad_norm": 0.349609375, "learning_rate": 3.787805373188405e-05, "loss": 0.4103, "step": 4604 }, { "epoch": 5.4021101992966, "grad_norm": 0.357421875, "learning_rate": 3.766647386925467e-05, "loss": 0.4376, "step": 4608 }, { "epoch": 5.406799531066823, "grad_norm": 0.333984375, "learning_rate": 3.745540170647788e-05, "loss": 0.4164, "step": 4612 }, { "epoch": 5.411488862837046, "grad_norm": 0.369140625, "learning_rate": 3.724483819751022e-05, "loss": 0.4355, "step": 4616 }, { "epoch": 5.4161781946072685, "grad_norm": 0.36328125, "learning_rate": 3.703478429400945e-05, "loss": 0.4503, "step": 4620 }, { "epoch": 5.420867526377491, "grad_norm": 0.326171875, "learning_rate": 3.6825240945329946e-05, "loss": 0.4262, "step": 4624 }, { "epoch": 5.425556858147714, "grad_norm": 0.33984375, "learning_rate": 3.661620909851878e-05, "loss": 0.4333, "step": 4628 }, { "epoch": 5.430246189917937, "grad_norm": 0.349609375, "learning_rate": 3.640768969831113e-05, "loss": 0.4332, "step": 4632 }, { "epoch": 5.43493552168816, "grad_norm": 0.34375, "learning_rate": 3.619968368712613e-05, "loss": 0.3988, "step": 4636 }, { "epoch": 5.439624853458382, "grad_norm": 0.353515625, "learning_rate": 3.599219200506277e-05, "loss": 0.3978, "step": 4640 }, { "epoch": 5.444314185228605, "grad_norm": 0.34765625, "learning_rate": 3.5785215589895224e-05, "loss": 0.4265, "step": 4644 }, { "epoch": 5.449003516998828, "grad_norm": 0.3359375, "learning_rate": 3.557875537706914e-05, "loss": 0.4031, "step": 4648 }, { "epoch": 5.453692848769051, "grad_norm": 0.357421875, "learning_rate": 3.5372812299696934e-05, "loss": 0.4482, "step": 4652 }, { "epoch": 5.458382180539273, "grad_norm": 0.341796875, "learning_rate": 3.5167387288554014e-05, "loss": 0.3847, "step": 4656 }, { "epoch": 5.4630715123094955, "grad_norm": 0.328125, "learning_rate": 3.496248127207415e-05, "loss": 0.4316, "step": 4660 }, { "epoch": 5.467760844079718, "grad_norm": 0.341796875, "learning_rate": 3.475809517634554e-05, "loss": 0.4049, "step": 4664 }, { "epoch": 5.472450175849941, "grad_norm": 0.345703125, "learning_rate": 3.455422992510664e-05, "loss": 0.43, "step": 4668 }, { "epoch": 5.477139507620164, "grad_norm": 0.361328125, "learning_rate": 3.435088643974177e-05, "loss": 0.4267, "step": 4672 }, { "epoch": 5.481828839390387, "grad_norm": 0.3515625, "learning_rate": 3.41480656392773e-05, "loss": 0.4264, "step": 4676 }, { "epoch": 5.4865181711606095, "grad_norm": 0.33984375, "learning_rate": 3.394576844037695e-05, "loss": 0.3992, "step": 4680 }, { "epoch": 5.491207502930832, "grad_norm": 0.369140625, "learning_rate": 3.374399575733835e-05, "loss": 0.4328, "step": 4684 }, { "epoch": 5.495896834701055, "grad_norm": 0.34375, "learning_rate": 3.3542748502088325e-05, "loss": 0.4447, "step": 4688 }, { "epoch": 5.500586166471278, "grad_norm": 0.369140625, "learning_rate": 3.334202758417896e-05, "loss": 0.4483, "step": 4692 }, { "epoch": 5.505275498241501, "grad_norm": 0.373046875, "learning_rate": 3.314183391078373e-05, "loss": 0.4673, "step": 4696 }, { "epoch": 5.509964830011723, "grad_norm": 0.35546875, "learning_rate": 3.294216838669295e-05, "loss": 0.4609, "step": 4700 }, { "epoch": 5.514654161781946, "grad_norm": 0.34765625, "learning_rate": 3.2743031914310104e-05, "loss": 0.4391, "step": 4704 }, { "epoch": 5.519343493552169, "grad_norm": 0.357421875, "learning_rate": 3.254442539364749e-05, "loss": 0.4274, "step": 4708 }, { "epoch": 5.524032825322392, "grad_norm": 0.341796875, "learning_rate": 3.2346349722322274e-05, "loss": 0.3941, "step": 4712 }, { "epoch": 5.528722157092615, "grad_norm": 0.35546875, "learning_rate": 3.2148805795552406e-05, "loss": 0.3779, "step": 4716 }, { "epoch": 5.533411488862837, "grad_norm": 0.353515625, "learning_rate": 3.195179450615252e-05, "loss": 0.3988, "step": 4720 }, { "epoch": 5.53810082063306, "grad_norm": 0.3671875, "learning_rate": 3.175531674453012e-05, "loss": 0.4613, "step": 4724 }, { "epoch": 5.542790152403283, "grad_norm": 0.34765625, "learning_rate": 3.155937339868117e-05, "loss": 0.4266, "step": 4728 }, { "epoch": 5.547479484173505, "grad_norm": 0.373046875, "learning_rate": 3.136396535418653e-05, "loss": 0.4355, "step": 4732 }, { "epoch": 5.552168815943728, "grad_norm": 0.37890625, "learning_rate": 3.1169093494207547e-05, "loss": 0.4324, "step": 4736 }, { "epoch": 5.5568581477139505, "grad_norm": 0.35546875, "learning_rate": 3.097475869948228e-05, "loss": 0.4061, "step": 4740 }, { "epoch": 5.561547479484173, "grad_norm": 0.34375, "learning_rate": 3.078096184832158e-05, "loss": 0.4486, "step": 4744 }, { "epoch": 5.566236811254396, "grad_norm": 0.34765625, "learning_rate": 3.058770381660487e-05, "loss": 0.4335, "step": 4748 }, { "epoch": 5.570926143024619, "grad_norm": 0.34765625, "learning_rate": 3.0394985477776522e-05, "loss": 0.4712, "step": 4752 }, { "epoch": 5.575615474794842, "grad_norm": 0.33203125, "learning_rate": 3.0202807702841493e-05, "loss": 0.4249, "step": 4756 }, { "epoch": 5.5803048065650644, "grad_norm": 0.33203125, "learning_rate": 3.0011171360361815e-05, "loss": 0.4202, "step": 4760 }, { "epoch": 5.584994138335287, "grad_norm": 0.349609375, "learning_rate": 2.9820077316452417e-05, "loss": 0.4492, "step": 4764 }, { "epoch": 5.58968347010551, "grad_norm": 0.357421875, "learning_rate": 2.962952643477718e-05, "loss": 0.4444, "step": 4768 }, { "epoch": 5.594372801875733, "grad_norm": 0.341796875, "learning_rate": 2.9439519576545302e-05, "loss": 0.4112, "step": 4772 }, { "epoch": 5.599062133645956, "grad_norm": 0.353515625, "learning_rate": 2.925005760050704e-05, "loss": 0.4111, "step": 4776 }, { "epoch": 5.603751465416178, "grad_norm": 0.37109375, "learning_rate": 2.906114136295018e-05, "loss": 0.423, "step": 4780 }, { "epoch": 5.608440797186401, "grad_norm": 0.341796875, "learning_rate": 2.8872771717695858e-05, "loss": 0.4275, "step": 4784 }, { "epoch": 5.613130128956624, "grad_norm": 0.35546875, "learning_rate": 2.8684949516094947e-05, "loss": 0.4399, "step": 4788 }, { "epoch": 5.617819460726847, "grad_norm": 0.357421875, "learning_rate": 2.8497675607024046e-05, "loss": 0.4434, "step": 4792 }, { "epoch": 5.622508792497069, "grad_norm": 0.361328125, "learning_rate": 2.831095083688169e-05, "loss": 0.4714, "step": 4796 }, { "epoch": 5.6271981242672915, "grad_norm": 0.365234375, "learning_rate": 2.812477604958465e-05, "loss": 0.4226, "step": 4800 }, { "epoch": 5.631887456037514, "grad_norm": 0.37890625, "learning_rate": 2.793915208656387e-05, "loss": 0.4441, "step": 4804 }, { "epoch": 5.636576787807737, "grad_norm": 0.353515625, "learning_rate": 2.775407978676093e-05, "loss": 0.441, "step": 4808 }, { "epoch": 5.64126611957796, "grad_norm": 0.341796875, "learning_rate": 2.7569559986624023e-05, "loss": 0.425, "step": 4812 }, { "epoch": 5.645955451348183, "grad_norm": 0.33984375, "learning_rate": 2.7385593520104276e-05, "loss": 0.4097, "step": 4816 }, { "epoch": 5.6506447831184055, "grad_norm": 0.34765625, "learning_rate": 2.7202181218652113e-05, "loss": 0.4147, "step": 4820 }, { "epoch": 5.655334114888628, "grad_norm": 0.333984375, "learning_rate": 2.701932391121323e-05, "loss": 0.3897, "step": 4824 }, { "epoch": 5.660023446658851, "grad_norm": 0.359375, "learning_rate": 2.6837022424225048e-05, "loss": 0.4353, "step": 4828 }, { "epoch": 5.664712778429074, "grad_norm": 0.388671875, "learning_rate": 2.6655277581612838e-05, "loss": 0.4483, "step": 4832 }, { "epoch": 5.669402110199297, "grad_norm": 0.37109375, "learning_rate": 2.647409020478623e-05, "loss": 0.3887, "step": 4836 }, { "epoch": 5.674091441969519, "grad_norm": 0.34765625, "learning_rate": 2.629346111263521e-05, "loss": 0.4746, "step": 4840 }, { "epoch": 5.678780773739742, "grad_norm": 0.345703125, "learning_rate": 2.6113391121526573e-05, "loss": 0.4224, "step": 4844 }, { "epoch": 5.683470105509965, "grad_norm": 0.35546875, "learning_rate": 2.593388104530031e-05, "loss": 0.4432, "step": 4848 }, { "epoch": 5.688159437280188, "grad_norm": 0.373046875, "learning_rate": 2.5754931695265674e-05, "loss": 0.4395, "step": 4852 }, { "epoch": 5.692848769050411, "grad_norm": 0.369140625, "learning_rate": 2.5576543880197847e-05, "loss": 0.4717, "step": 4856 }, { "epoch": 5.697538100820633, "grad_norm": 0.3515625, "learning_rate": 2.539871840633399e-05, "loss": 0.4418, "step": 4860 }, { "epoch": 5.702227432590856, "grad_norm": 0.328125, "learning_rate": 2.522145607736976e-05, "loss": 0.4035, "step": 4864 }, { "epoch": 5.706916764361079, "grad_norm": 0.34375, "learning_rate": 2.5044757694455642e-05, "loss": 0.4121, "step": 4868 }, { "epoch": 5.711606096131302, "grad_norm": 0.357421875, "learning_rate": 2.4868624056193264e-05, "loss": 0.4117, "step": 4872 }, { "epoch": 5.716295427901524, "grad_norm": 0.365234375, "learning_rate": 2.469305595863199e-05, "loss": 0.4515, "step": 4876 }, { "epoch": 5.7209847596717465, "grad_norm": 0.359375, "learning_rate": 2.4518054195265024e-05, "loss": 0.4104, "step": 4880 }, { "epoch": 5.725674091441969, "grad_norm": 0.369140625, "learning_rate": 2.4343619557026102e-05, "loss": 0.4226, "step": 4884 }, { "epoch": 5.730363423212192, "grad_norm": 0.341796875, "learning_rate": 2.4169752832285723e-05, "loss": 0.4267, "step": 4888 }, { "epoch": 5.735052754982415, "grad_norm": 0.375, "learning_rate": 2.3996454806847624e-05, "loss": 0.4508, "step": 4892 }, { "epoch": 5.739742086752638, "grad_norm": 0.33984375, "learning_rate": 2.3823726263945442e-05, "loss": 0.4046, "step": 4896 }, { "epoch": 5.74443141852286, "grad_norm": 0.337890625, "learning_rate": 2.3651567984238707e-05, "loss": 0.4372, "step": 4900 }, { "epoch": 5.749120750293083, "grad_norm": 0.345703125, "learning_rate": 2.3479980745809885e-05, "loss": 0.4356, "step": 4904 }, { "epoch": 5.753810082063306, "grad_norm": 0.33984375, "learning_rate": 2.3308965324160374e-05, "loss": 0.4222, "step": 4908 }, { "epoch": 5.758499413833529, "grad_norm": 0.3359375, "learning_rate": 2.313852249220735e-05, "loss": 0.4257, "step": 4912 }, { "epoch": 5.763188745603752, "grad_norm": 0.330078125, "learning_rate": 2.2968653020280036e-05, "loss": 0.387, "step": 4916 }, { "epoch": 5.767878077373974, "grad_norm": 0.330078125, "learning_rate": 2.2799357676116287e-05, "loss": 0.3745, "step": 4920 }, { "epoch": 5.772567409144197, "grad_norm": 0.35546875, "learning_rate": 2.2630637224859283e-05, "loss": 0.4284, "step": 4924 }, { "epoch": 5.77725674091442, "grad_norm": 0.359375, "learning_rate": 2.246249242905377e-05, "loss": 0.4014, "step": 4928 }, { "epoch": 5.781946072684643, "grad_norm": 0.345703125, "learning_rate": 2.2294924048642888e-05, "loss": 0.4091, "step": 4932 }, { "epoch": 5.786635404454865, "grad_norm": 0.359375, "learning_rate": 2.212793284096458e-05, "loss": 0.4428, "step": 4936 }, { "epoch": 5.7913247362250875, "grad_norm": 0.34765625, "learning_rate": 2.196151956074821e-05, "loss": 0.4281, "step": 4940 }, { "epoch": 5.79601406799531, "grad_norm": 0.3359375, "learning_rate": 2.179568496011116e-05, "loss": 0.4555, "step": 4944 }, { "epoch": 5.800703399765533, "grad_norm": 0.384765625, "learning_rate": 2.1630429788555376e-05, "loss": 0.4294, "step": 4948 }, { "epoch": 5.805392731535756, "grad_norm": 0.349609375, "learning_rate": 2.146575479296418e-05, "loss": 0.3904, "step": 4952 }, { "epoch": 5.810082063305979, "grad_norm": 0.33984375, "learning_rate": 2.1301660717598575e-05, "loss": 0.417, "step": 4956 }, { "epoch": 5.814771395076201, "grad_norm": 0.380859375, "learning_rate": 2.1138148304094177e-05, "loss": 0.4715, "step": 4960 }, { "epoch": 5.819460726846424, "grad_norm": 0.357421875, "learning_rate": 2.0975218291457645e-05, "loss": 0.403, "step": 4964 }, { "epoch": 5.824150058616647, "grad_norm": 0.3515625, "learning_rate": 2.0812871416063477e-05, "loss": 0.399, "step": 4968 }, { "epoch": 5.82883939038687, "grad_norm": 0.361328125, "learning_rate": 2.0651108411650685e-05, "loss": 0.4323, "step": 4972 }, { "epoch": 5.833528722157093, "grad_norm": 0.359375, "learning_rate": 2.0489930009319287e-05, "loss": 0.4407, "step": 4976 }, { "epoch": 5.838218053927315, "grad_norm": 0.353515625, "learning_rate": 2.0329336937527312e-05, "loss": 0.4169, "step": 4980 }, { "epoch": 5.842907385697538, "grad_norm": 0.384765625, "learning_rate": 2.0169329922087218e-05, "loss": 0.4336, "step": 4984 }, { "epoch": 5.847596717467761, "grad_norm": 0.34765625, "learning_rate": 2.000990968616287e-05, "loss": 0.4445, "step": 4988 }, { "epoch": 5.852286049237984, "grad_norm": 0.34765625, "learning_rate": 1.985107695026601e-05, "loss": 0.4177, "step": 4992 }, { "epoch": 5.856975381008207, "grad_norm": 0.349609375, "learning_rate": 1.9692832432253154e-05, "loss": 0.4066, "step": 4996 }, { "epoch": 5.861664712778429, "grad_norm": 0.337890625, "learning_rate": 1.9535176847322416e-05, "loss": 0.3923, "step": 5000 }, { "epoch": 5.866354044548652, "grad_norm": 0.36328125, "learning_rate": 1.937811090801004e-05, "loss": 0.3981, "step": 5004 }, { "epoch": 5.871043376318875, "grad_norm": 0.349609375, "learning_rate": 1.9221635324187513e-05, "loss": 0.4235, "step": 5008 }, { "epoch": 5.875732708089098, "grad_norm": 0.373046875, "learning_rate": 1.9065750803057907e-05, "loss": 0.4204, "step": 5012 }, { "epoch": 5.88042203985932, "grad_norm": 0.3671875, "learning_rate": 1.8910458049153173e-05, "loss": 0.4582, "step": 5016 }, { "epoch": 5.8851113716295425, "grad_norm": 0.359375, "learning_rate": 1.8755757764330632e-05, "loss": 0.4433, "step": 5020 }, { "epoch": 5.889800703399765, "grad_norm": 0.3359375, "learning_rate": 1.860165064776985e-05, "loss": 0.4477, "step": 5024 }, { "epoch": 5.894490035169988, "grad_norm": 0.373046875, "learning_rate": 1.8448137395969636e-05, "loss": 0.3817, "step": 5028 }, { "epoch": 5.899179366940211, "grad_norm": 0.3515625, "learning_rate": 1.8295218702744662e-05, "loss": 0.4144, "step": 5032 }, { "epoch": 5.903868698710434, "grad_norm": 0.359375, "learning_rate": 1.8142895259222584e-05, "loss": 0.4391, "step": 5036 }, { "epoch": 5.908558030480656, "grad_norm": 0.361328125, "learning_rate": 1.7991167753840673e-05, "loss": 0.4012, "step": 5040 }, { "epoch": 5.913247362250879, "grad_norm": 0.369140625, "learning_rate": 1.784003687234281e-05, "loss": 0.4313, "step": 5044 }, { "epoch": 5.917936694021102, "grad_norm": 0.349609375, "learning_rate": 1.7689503297776464e-05, "loss": 0.4179, "step": 5048 }, { "epoch": 5.922626025791325, "grad_norm": 0.376953125, "learning_rate": 1.753956771048946e-05, "loss": 0.4241, "step": 5052 }, { "epoch": 5.927315357561548, "grad_norm": 0.369140625, "learning_rate": 1.7390230788127024e-05, "loss": 0.4458, "step": 5056 }, { "epoch": 5.93200468933177, "grad_norm": 0.357421875, "learning_rate": 1.7241493205628644e-05, "loss": 0.3708, "step": 5060 }, { "epoch": 5.936694021101993, "grad_norm": 0.359375, "learning_rate": 1.709335563522507e-05, "loss": 0.4046, "step": 5064 }, { "epoch": 5.941383352872216, "grad_norm": 0.333984375, "learning_rate": 1.6945818746435248e-05, "loss": 0.4018, "step": 5068 }, { "epoch": 5.946072684642439, "grad_norm": 0.349609375, "learning_rate": 1.6798883206063217e-05, "loss": 0.437, "step": 5072 }, { "epoch": 5.950762016412662, "grad_norm": 0.34765625, "learning_rate": 1.665254967819532e-05, "loss": 0.4529, "step": 5076 }, { "epoch": 5.9554513481828835, "grad_norm": 0.34765625, "learning_rate": 1.6506818824196965e-05, "loss": 0.4405, "step": 5080 }, { "epoch": 5.960140679953106, "grad_norm": 0.3515625, "learning_rate": 1.636169130270973e-05, "loss": 0.4241, "step": 5084 }, { "epoch": 5.964830011723329, "grad_norm": 0.357421875, "learning_rate": 1.6217167769648398e-05, "loss": 0.4551, "step": 5088 }, { "epoch": 5.969519343493552, "grad_norm": 0.32421875, "learning_rate": 1.6073248878198032e-05, "loss": 0.427, "step": 5092 }, { "epoch": 5.974208675263775, "grad_norm": 0.359375, "learning_rate": 1.5929935278810883e-05, "loss": 0.3894, "step": 5096 }, { "epoch": 5.978898007033997, "grad_norm": 0.341796875, "learning_rate": 1.578722761920359e-05, "loss": 0.434, "step": 5100 }, { "epoch": 5.98358733880422, "grad_norm": 0.35546875, "learning_rate": 1.5645126544354253e-05, "loss": 0.4446, "step": 5104 }, { "epoch": 5.988276670574443, "grad_norm": 0.33203125, "learning_rate": 1.550363269649932e-05, "loss": 0.4205, "step": 5108 }, { "epoch": 5.992966002344666, "grad_norm": 0.34375, "learning_rate": 1.536274671513098e-05, "loss": 0.4032, "step": 5112 }, { "epoch": 5.997655334114889, "grad_norm": 0.359375, "learning_rate": 1.5222469236994061e-05, "loss": 0.4029, "step": 5116 }, { "epoch": 6.002344665885111, "grad_norm": 0.361328125, "learning_rate": 1.5082800896083186e-05, "loss": 0.3855, "step": 5120 }, { "epoch": 6.007033997655334, "grad_norm": 0.32421875, "learning_rate": 1.4943742323639995e-05, "loss": 0.4209, "step": 5124 }, { "epoch": 6.011723329425557, "grad_norm": 0.353515625, "learning_rate": 1.4805294148150171e-05, "loss": 0.4114, "step": 5128 }, { "epoch": 6.01641266119578, "grad_norm": 0.34375, "learning_rate": 1.4667456995340731e-05, "loss": 0.4657, "step": 5132 }, { "epoch": 6.021101992966003, "grad_norm": 0.341796875, "learning_rate": 1.4530231488177058e-05, "loss": 0.4312, "step": 5136 }, { "epoch": 6.025791324736225, "grad_norm": 0.359375, "learning_rate": 1.4393618246860239e-05, "loss": 0.43, "step": 5140 }, { "epoch": 6.030480656506448, "grad_norm": 0.337890625, "learning_rate": 1.4257617888824096e-05, "loss": 0.4133, "step": 5144 }, { "epoch": 6.035169988276671, "grad_norm": 0.33984375, "learning_rate": 1.4122231028732516e-05, "loss": 0.4065, "step": 5148 }, { "epoch": 6.039859320046894, "grad_norm": 0.3359375, "learning_rate": 1.398745827847667e-05, "loss": 0.4284, "step": 5152 }, { "epoch": 6.044548651817116, "grad_norm": 0.34375, "learning_rate": 1.385330024717215e-05, "loss": 0.3909, "step": 5156 }, { "epoch": 6.049237983587338, "grad_norm": 0.3359375, "learning_rate": 1.3719757541156317e-05, "loss": 0.3842, "step": 5160 }, { "epoch": 6.053927315357561, "grad_norm": 0.32421875, "learning_rate": 1.3586830763985479e-05, "loss": 0.3678, "step": 5164 }, { "epoch": 6.058616647127784, "grad_norm": 0.349609375, "learning_rate": 1.3454520516432282e-05, "loss": 0.4318, "step": 5168 }, { "epoch": 6.063305978898007, "grad_norm": 0.330078125, "learning_rate": 1.3322827396482888e-05, "loss": 0.4121, "step": 5172 }, { "epoch": 6.06799531066823, "grad_norm": 0.359375, "learning_rate": 1.3191751999334237e-05, "loss": 0.4247, "step": 5176 }, { "epoch": 6.072684642438452, "grad_norm": 0.337890625, "learning_rate": 1.3061294917391558e-05, "loss": 0.375, "step": 5180 }, { "epoch": 6.077373974208675, "grad_norm": 0.34765625, "learning_rate": 1.2931456740265406e-05, "loss": 0.4611, "step": 5184 }, { "epoch": 6.082063305978898, "grad_norm": 0.361328125, "learning_rate": 1.2802238054769298e-05, "loss": 0.4174, "step": 5188 }, { "epoch": 6.086752637749121, "grad_norm": 0.349609375, "learning_rate": 1.2673639444916805e-05, "loss": 0.4337, "step": 5192 }, { "epoch": 6.091441969519344, "grad_norm": 0.357421875, "learning_rate": 1.2545661491919057e-05, "loss": 0.3709, "step": 5196 }, { "epoch": 6.096131301289566, "grad_norm": 0.341796875, "learning_rate": 1.2418304774182075e-05, "loss": 0.452, "step": 5200 }, { "epoch": 6.100820633059789, "grad_norm": 0.345703125, "learning_rate": 1.2291569867304112e-05, "loss": 0.435, "step": 5204 }, { "epoch": 6.105509964830012, "grad_norm": 0.34765625, "learning_rate": 1.2165457344073238e-05, "loss": 0.4482, "step": 5208 }, { "epoch": 6.110199296600235, "grad_norm": 0.345703125, "learning_rate": 1.2039967774464448e-05, "loss": 0.3896, "step": 5212 }, { "epoch": 6.1148886283704575, "grad_norm": 0.33984375, "learning_rate": 1.1915101725637383e-05, "loss": 0.4166, "step": 5216 }, { "epoch": 6.11957796014068, "grad_norm": 0.341796875, "learning_rate": 1.1790859761933563e-05, "loss": 0.3914, "step": 5220 }, { "epoch": 6.124267291910903, "grad_norm": 0.357421875, "learning_rate": 1.166724244487387e-05, "loss": 0.4055, "step": 5224 }, { "epoch": 6.128956623681125, "grad_norm": 0.3359375, "learning_rate": 1.1544250333156207e-05, "loss": 0.409, "step": 5228 }, { "epoch": 6.133645955451348, "grad_norm": 0.314453125, "learning_rate": 1.142188398265259e-05, "loss": 0.3903, "step": 5232 }, { "epoch": 6.138335287221571, "grad_norm": 0.34375, "learning_rate": 1.1300143946407064e-05, "loss": 0.397, "step": 5236 }, { "epoch": 6.143024618991793, "grad_norm": 0.326171875, "learning_rate": 1.1179030774632851e-05, "loss": 0.3783, "step": 5240 }, { "epoch": 6.147713950762016, "grad_norm": 0.353515625, "learning_rate": 1.1058545014710146e-05, "loss": 0.4203, "step": 5244 }, { "epoch": 6.152403282532239, "grad_norm": 0.3515625, "learning_rate": 1.093868721118339e-05, "loss": 0.433, "step": 5248 }, { "epoch": 6.157092614302462, "grad_norm": 0.337890625, "learning_rate": 1.0819457905758978e-05, "loss": 0.4351, "step": 5252 }, { "epoch": 6.161781946072685, "grad_norm": 0.33984375, "learning_rate": 1.0700857637302779e-05, "loss": 0.4498, "step": 5256 }, { "epoch": 6.166471277842907, "grad_norm": 0.3515625, "learning_rate": 1.058288694183762e-05, "loss": 0.4151, "step": 5260 }, { "epoch": 6.17116060961313, "grad_norm": 0.333984375, "learning_rate": 1.0465546352541055e-05, "loss": 0.4446, "step": 5264 }, { "epoch": 6.175849941383353, "grad_norm": 0.345703125, "learning_rate": 1.034883639974261e-05, "loss": 0.4044, "step": 5268 }, { "epoch": 6.180539273153576, "grad_norm": 0.341796875, "learning_rate": 1.0232757610921833e-05, "loss": 0.3634, "step": 5272 }, { "epoch": 6.185228604923799, "grad_norm": 0.34765625, "learning_rate": 1.0117310510705528e-05, "loss": 0.4032, "step": 5276 }, { "epoch": 6.189917936694021, "grad_norm": 0.341796875, "learning_rate": 1.0002495620865558e-05, "loss": 0.4418, "step": 5280 }, { "epoch": 6.194607268464244, "grad_norm": 0.349609375, "learning_rate": 9.888313460316549e-06, "loss": 0.4224, "step": 5284 }, { "epoch": 6.199296600234467, "grad_norm": 0.373046875, "learning_rate": 9.77476454511335e-06, "loss": 0.4139, "step": 5288 }, { "epoch": 6.20398593200469, "grad_norm": 0.3515625, "learning_rate": 9.661849388448866e-06, "loss": 0.3725, "step": 5292 }, { "epoch": 6.2086752637749125, "grad_norm": 0.34375, "learning_rate": 9.549568500651695e-06, "loss": 0.4023, "step": 5296 }, { "epoch": 6.213364595545134, "grad_norm": 0.337890625, "learning_rate": 9.437922389183772e-06, "loss": 0.3973, "step": 5300 }, { "epoch": 6.218053927315357, "grad_norm": 0.3515625, "learning_rate": 9.3269115586381e-06, "loss": 0.4492, "step": 5304 }, { "epoch": 6.22274325908558, "grad_norm": 0.349609375, "learning_rate": 9.216536510736528e-06, "loss": 0.3933, "step": 5308 }, { "epoch": 6.227432590855803, "grad_norm": 0.365234375, "learning_rate": 9.106797744327449e-06, "loss": 0.4262, "step": 5312 }, { "epoch": 6.232121922626026, "grad_norm": 0.359375, "learning_rate": 8.997695755383444e-06, "loss": 0.386, "step": 5316 }, { "epoch": 6.236811254396248, "grad_norm": 0.333984375, "learning_rate": 8.889231036999245e-06, "loss": 0.4026, "step": 5320 }, { "epoch": 6.241500586166471, "grad_norm": 0.345703125, "learning_rate": 8.781404079389304e-06, "loss": 0.4596, "step": 5324 }, { "epoch": 6.246189917936694, "grad_norm": 0.326171875, "learning_rate": 8.674215369885695e-06, "loss": 0.4081, "step": 5328 }, { "epoch": 6.250879249706917, "grad_norm": 0.369140625, "learning_rate": 8.567665392935918e-06, "loss": 0.4246, "step": 5332 }, { "epoch": 6.25556858147714, "grad_norm": 0.384765625, "learning_rate": 8.461754630100581e-06, "loss": 0.4221, "step": 5336 }, { "epoch": 6.260257913247362, "grad_norm": 0.375, "learning_rate": 8.356483560051468e-06, "loss": 0.4704, "step": 5340 }, { "epoch": 6.264947245017585, "grad_norm": 0.349609375, "learning_rate": 8.251852658569014e-06, "loss": 0.3896, "step": 5344 }, { "epoch": 6.269636576787808, "grad_norm": 0.333984375, "learning_rate": 8.147862398540545e-06, "loss": 0.4065, "step": 5348 }, { "epoch": 6.274325908558031, "grad_norm": 0.34765625, "learning_rate": 8.044513249957874e-06, "loss": 0.4074, "step": 5352 }, { "epoch": 6.2790152403282535, "grad_norm": 0.337890625, "learning_rate": 7.94180567991528e-06, "loss": 0.4237, "step": 5356 }, { "epoch": 6.283704572098476, "grad_norm": 0.349609375, "learning_rate": 7.839740152607398e-06, "loss": 0.4043, "step": 5360 }, { "epoch": 6.288393903868699, "grad_norm": 0.33984375, "learning_rate": 7.738317129327049e-06, "loss": 0.4098, "step": 5364 }, { "epoch": 6.293083235638921, "grad_norm": 0.32421875, "learning_rate": 7.63753706846329e-06, "loss": 0.4149, "step": 5368 }, { "epoch": 6.297772567409144, "grad_norm": 0.357421875, "learning_rate": 7.537400425499191e-06, "loss": 0.3909, "step": 5372 }, { "epoch": 6.302461899179367, "grad_norm": 0.337890625, "learning_rate": 7.437907653009878e-06, "loss": 0.3893, "step": 5376 }, { "epoch": 6.307151230949589, "grad_norm": 0.359375, "learning_rate": 7.339059200660441e-06, "loss": 0.4365, "step": 5380 }, { "epoch": 6.311840562719812, "grad_norm": 0.37109375, "learning_rate": 7.240855515203897e-06, "loss": 0.4586, "step": 5384 }, { "epoch": 6.316529894490035, "grad_norm": 0.35546875, "learning_rate": 7.143297040479262e-06, "loss": 0.4519, "step": 5388 }, { "epoch": 6.321219226260258, "grad_norm": 0.337890625, "learning_rate": 7.046384217409401e-06, "loss": 0.3872, "step": 5392 }, { "epoch": 6.325908558030481, "grad_norm": 0.34765625, "learning_rate": 6.950117483999145e-06, "loss": 0.4004, "step": 5396 }, { "epoch": 6.330597889800703, "grad_norm": 0.32421875, "learning_rate": 6.854497275333282e-06, "loss": 0.3972, "step": 5400 }, { "epoch": 6.335287221570926, "grad_norm": 0.32421875, "learning_rate": 6.759524023574514e-06, "loss": 0.367, "step": 5404 }, { "epoch": 6.339976553341149, "grad_norm": 0.349609375, "learning_rate": 6.6651981579616705e-06, "loss": 0.4688, "step": 5408 }, { "epoch": 6.344665885111372, "grad_norm": 0.337890625, "learning_rate": 6.57152010480762e-06, "loss": 0.4207, "step": 5412 }, { "epoch": 6.3493552168815945, "grad_norm": 0.337890625, "learning_rate": 6.4784902874973734e-06, "loss": 0.413, "step": 5416 }, { "epoch": 6.354044548651817, "grad_norm": 0.349609375, "learning_rate": 6.3861091264861995e-06, "loss": 0.448, "step": 5420 }, { "epoch": 6.35873388042204, "grad_norm": 0.35546875, "learning_rate": 6.2943770392977826e-06, "loss": 0.4018, "step": 5424 }, { "epoch": 6.363423212192263, "grad_norm": 0.359375, "learning_rate": 6.203294440522183e-06, "loss": 0.416, "step": 5428 }, { "epoch": 6.368112543962486, "grad_norm": 0.3359375, "learning_rate": 6.112861741814063e-06, "loss": 0.4392, "step": 5432 }, { "epoch": 6.3728018757327085, "grad_norm": 0.373046875, "learning_rate": 6.023079351890881e-06, "loss": 0.4195, "step": 5436 }, { "epoch": 6.377491207502931, "grad_norm": 0.333984375, "learning_rate": 5.933947676530881e-06, "loss": 0.4029, "step": 5440 }, { "epoch": 6.382180539273153, "grad_norm": 0.357421875, "learning_rate": 5.845467118571445e-06, "loss": 0.4476, "step": 5444 }, { "epoch": 6.386869871043376, "grad_norm": 0.359375, "learning_rate": 5.757638077907123e-06, "loss": 0.4326, "step": 5448 }, { "epoch": 6.391559202813599, "grad_norm": 0.34375, "learning_rate": 5.670460951487854e-06, "loss": 0.4126, "step": 5452 }, { "epoch": 6.396248534583822, "grad_norm": 0.34765625, "learning_rate": 5.583936133317285e-06, "loss": 0.4201, "step": 5456 }, { "epoch": 6.400937866354044, "grad_norm": 0.353515625, "learning_rate": 5.498064014450837e-06, "loss": 0.4105, "step": 5460 }, { "epoch": 6.405627198124267, "grad_norm": 0.369140625, "learning_rate": 5.4128449829940745e-06, "loss": 0.4393, "step": 5464 }, { "epoch": 6.41031652989449, "grad_norm": 0.36328125, "learning_rate": 5.3282794241007895e-06, "loss": 0.3989, "step": 5468 }, { "epoch": 6.415005861664713, "grad_norm": 0.345703125, "learning_rate": 5.244367719971454e-06, "loss": 0.409, "step": 5472 }, { "epoch": 6.4196951934349356, "grad_norm": 0.357421875, "learning_rate": 5.161110249851353e-06, "loss": 0.4121, "step": 5476 }, { "epoch": 6.424384525205158, "grad_norm": 0.34375, "learning_rate": 5.078507390028852e-06, "loss": 0.4181, "step": 5480 }, { "epoch": 6.429073856975381, "grad_norm": 0.349609375, "learning_rate": 4.996559513833903e-06, "loss": 0.4494, "step": 5484 }, { "epoch": 6.433763188745604, "grad_norm": 0.35546875, "learning_rate": 4.915266991636025e-06, "loss": 0.4472, "step": 5488 }, { "epoch": 6.438452520515827, "grad_norm": 0.345703125, "learning_rate": 4.83463019084297e-06, "loss": 0.3834, "step": 5492 }, { "epoch": 6.4431418522860495, "grad_norm": 0.3359375, "learning_rate": 4.754649475898814e-06, "loss": 0.3881, "step": 5496 }, { "epoch": 6.447831184056272, "grad_norm": 0.3828125, "learning_rate": 4.675325208282471e-06, "loss": 0.4295, "step": 5500 } ], "logging_steps": 4, "max_steps": 5971, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5958144579529605e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }