diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6430 +1,9641 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 7.0, + "epoch": 6.447831184056272, "eval_steps": 500, - "global_step": 3661, + "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0076481835564053535, - "grad_norm": 0.291015625, - "learning_rate": 3.243243243243243e-05, - "loss": 1.3011, + "epoch": 0.004689331770222743, + "grad_norm": 0.3125, + "learning_rate": 1.9999999999999998e-05, + "loss": 1.2947, "step": 4 }, { - "epoch": 0.015296367112810707, - "grad_norm": 0.26171875, - "learning_rate": 6.486486486486486e-05, - "loss": 1.3104, + "epoch": 0.009378663540445486, + "grad_norm": 0.294921875, + "learning_rate": 3.9999999999999996e-05, + "loss": 1.2923, "step": 8 }, { - "epoch": 0.022944550669216062, - "grad_norm": 0.2060546875, - "learning_rate": 9.72972972972973e-05, - "loss": 1.2309, + "epoch": 0.01406799531066823, + "grad_norm": 0.1904296875, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.227, "step": 12 }, { - "epoch": 0.030592734225621414, - "grad_norm": 0.2392578125, - "learning_rate": 0.00012972972972972972, - "loss": 1.2051, + "epoch": 0.01875732708089097, + "grad_norm": 0.2216796875, + "learning_rate": 7.999999999999999e-05, + "loss": 1.2031, "step": 16 }, { - "epoch": 0.03824091778202677, - "grad_norm": 0.169921875, - "learning_rate": 0.00016216216216216215, - "loss": 1.1622, + "epoch": 0.023446658851113716, + "grad_norm": 0.2373046875, + "learning_rate": 9.999999999999999e-05, + "loss": 1.1748, "step": 20 }, { - "epoch": 0.045889101338432124, - "grad_norm": 0.1748046875, - "learning_rate": 0.0001945945945945946, - "loss": 1.1749, + "epoch": 0.02813599062133646, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011999999999999999, + "loss": 1.1424, "step": 24 }, { - "epoch": 0.05353728489483748, - "grad_norm": 0.154296875, - "learning_rate": 0.00022702702702702703, - "loss": 1.149, + "epoch": 0.032825322391559206, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014, + "loss": 1.1835, "step": 28 }, { - "epoch": 0.06118546845124283, - "grad_norm": 0.1767578125, - "learning_rate": 0.00025945945945945944, - "loss": 1.1455, + "epoch": 0.03751465416178194, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015999999999999999, + "loss": 1.1185, "step": 32 }, { - "epoch": 0.06883365200764818, - "grad_norm": 0.1884765625, - "learning_rate": 0.0002918918918918919, - "loss": 1.1358, + "epoch": 0.04220398593200469, + "grad_norm": 0.189453125, + "learning_rate": 0.00017999999999999998, + "loss": 1.139, "step": 36 }, { - "epoch": 0.07648183556405354, - "grad_norm": 0.181640625, - "learning_rate": 0.00029999949274434724, - "loss": 1.1201, + "epoch": 0.04689331770222743, + "grad_norm": 0.19921875, + "learning_rate": 0.00019999999999999998, + "loss": 1.1278, "step": 40 }, { - "epoch": 0.0841300191204589, - "grad_norm": 0.1708984375, - "learning_rate": 0.0002999972382816974, - "loss": 1.0549, + "epoch": 0.05158264947245018, + "grad_norm": 0.2314453125, + "learning_rate": 0.00021999999999999995, + "loss": 1.0939, "step": 44 }, { - "epoch": 0.09177820267686425, - "grad_norm": 0.19921875, - "learning_rate": 0.0002999931802773903, - "loss": 1.0946, + "epoch": 0.05627198124267292, + "grad_norm": 0.2041015625, + "learning_rate": 0.00023999999999999998, + "loss": 1.0765, "step": 48 }, { - "epoch": 0.0994263862332696, - "grad_norm": 0.1826171875, - "learning_rate": 0.00029998731878021884, - "loss": 1.0841, + "epoch": 0.06096131301289566, + "grad_norm": 0.193359375, + "learning_rate": 0.00026, + "loss": 1.1053, "step": 52 }, { - "epoch": 0.10707456978967496, - "grad_norm": 0.2001953125, - "learning_rate": 0.00029997965386066057, - "loss": 1.0904, + "epoch": 0.06565064478311841, + "grad_norm": 0.1962890625, + "learning_rate": 0.00028, + "loss": 1.0675, "step": 56 }, { - "epoch": 0.1147227533460803, - "grad_norm": 0.1826171875, - "learning_rate": 0.0002999701856108772, - "loss": 1.0673, + "epoch": 0.07033997655334115, + "grad_norm": 0.1923828125, + "learning_rate": 0.0003, + "loss": 1.0904, "step": 60 }, { - "epoch": 0.12237093690248566, - "grad_norm": 0.1826171875, - "learning_rate": 0.00029995891414471334, - "loss": 1.0211, + "epoch": 0.07502930832356389, + "grad_norm": 0.1943359375, + "learning_rate": 0.00029999966103183746, + "loss": 1.0871, "step": 64 }, { - "epoch": 0.13001912045889102, - "grad_norm": 0.1865234375, - "learning_rate": 0.0002999458395976953, - "loss": 1.0497, + "epoch": 0.07971864009378664, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002999986441288818, + "loss": 1.0505, "step": 68 }, { - "epoch": 0.13766730401529637, - "grad_norm": 0.208984375, - "learning_rate": 0.0002999309621270293, - "loss": 1.0711, + "epoch": 0.08440797186400938, + "grad_norm": 0.212890625, + "learning_rate": 0.000299996949295729, + "loss": 1.0788, "step": 72 }, { - "epoch": 0.14531548757170173, - "grad_norm": 0.1923828125, - "learning_rate": 0.00029991428191159935, - "loss": 1.047, + "epoch": 0.08909730363423213, + "grad_norm": 0.201171875, + "learning_rate": 0.0002999945765400391, + "loss": 1.0503, "step": 76 }, { - "epoch": 0.15296367112810708, - "grad_norm": 0.1962890625, - "learning_rate": 0.00029989579915196574, - "loss": 1.0442, + "epoch": 0.09378663540445487, + "grad_norm": 0.1904296875, + "learning_rate": 0.00029999152587253583, + "loss": 1.0564, "step": 80 }, { - "epoch": 0.16061185468451242, - "grad_norm": 0.197265625, - "learning_rate": 0.000299875514070362, - "loss": 1.0197, + "epoch": 0.0984759671746776, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002999877973070069, + "loss": 1.0396, "step": 84 }, { - "epoch": 0.1682600382409178, - "grad_norm": 0.2080078125, - "learning_rate": 0.00029985342691069255, - "loss": 1.0185, + "epoch": 0.10316529894490035, + "grad_norm": 0.1953125, + "learning_rate": 0.00029998339086030395, + "loss": 1.0208, "step": 88 }, { - "epoch": 0.17590822179732313, - "grad_norm": 0.2177734375, - "learning_rate": 0.0002998295379385297, - "loss": 1.0168, + "epoch": 0.10785463071512309, + "grad_norm": 0.203125, + "learning_rate": 0.00029997830655234217, + "loss": 1.057, "step": 92 }, { - "epoch": 0.1835564053537285, - "grad_norm": 0.2099609375, - "learning_rate": 0.00029980384744111047, - "loss": 1.0211, + "epoch": 0.11254396248534584, + "grad_norm": 0.205078125, + "learning_rate": 0.0002999725444061004, + "loss": 1.0283, "step": 96 }, { - "epoch": 0.19120458891013384, - "grad_norm": 0.208984375, - "learning_rate": 0.0002997763557273331, - "loss": 1.0178, + "epoch": 0.11723329425556858, + "grad_norm": 0.189453125, + "learning_rate": 0.0002999661044476212, + "loss": 0.988, "step": 100 }, { - "epoch": 0.1988527724665392, - "grad_norm": 0.2060546875, - "learning_rate": 0.0002997470631277533, - "loss": 0.9871, + "epoch": 0.12192262602579132, + "grad_norm": 0.2001953125, + "learning_rate": 0.00029995898670601053, + "loss": 1.049, "step": 104 }, { - "epoch": 0.20650095602294455, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002997159699945804, - "loss": 1.0197, + "epoch": 0.12661195779601406, + "grad_norm": 0.1953125, + "learning_rate": 0.0002999511912134374, + "loss": 0.9804, "step": 108 }, { - "epoch": 0.21414913957934992, - "grad_norm": 0.240234375, - "learning_rate": 0.0002996830767016731, - "loss": 1.0, + "epoch": 0.13130128956623682, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002999427180051341, + "loss": 1.0172, "step": 112 }, { - "epoch": 0.22179732313575526, - "grad_norm": 0.236328125, - "learning_rate": 0.0002996483836445347, - "loss": 0.9885, + "epoch": 0.13599062133645956, + "grad_norm": 0.2177734375, + "learning_rate": 0.00029993356711939615, + "loss": 0.9863, "step": 116 }, { - "epoch": 0.2294455066921606, - "grad_norm": 0.2099609375, - "learning_rate": 0.00029961189124030885, - "loss": 0.9664, + "epoch": 0.1406799531066823, + "grad_norm": 0.236328125, + "learning_rate": 0.0002999237385975815, + "loss": 0.9799, "step": 120 }, { - "epoch": 0.23709369024856597, - "grad_norm": 0.236328125, - "learning_rate": 0.00029957359992777404, - "loss": 0.9831, + "epoch": 0.14536928487690504, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002999132324841109, + "loss": 1.0098, "step": 124 }, { - "epoch": 0.2447418738049713, - "grad_norm": 0.25390625, - "learning_rate": 0.00029953351016733854, - "loss": 0.9718, + "epoch": 0.15005861664712777, + "grad_norm": 0.220703125, + "learning_rate": 0.0002999020488264676, + "loss": 0.9597, "step": 128 }, { - "epoch": 0.25239005736137665, - "grad_norm": 0.205078125, - "learning_rate": 0.000299491622441035, - "loss": 0.9592, + "epoch": 0.15474794841735054, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002998901876751969, + "loss": 0.9958, "step": 132 }, { - "epoch": 0.26003824091778205, - "grad_norm": 0.2109375, - "learning_rate": 0.00029944793725251436, - "loss": 0.9514, + "epoch": 0.15943728018757328, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029987764908390624, + "loss": 0.947, "step": 136 }, { - "epoch": 0.2676864244741874, - "grad_norm": 0.220703125, - "learning_rate": 0.00029940245512704005, - "loss": 0.9435, + "epoch": 0.16412661195779601, + "grad_norm": 0.240234375, + "learning_rate": 0.0002998644331092647, + "loss": 0.948, "step": 140 }, { - "epoch": 0.27533460803059273, - "grad_norm": 0.2138671875, - "learning_rate": 0.0002993551766114815, - "loss": 0.9545, + "epoch": 0.16881594372801875, + "grad_norm": 0.205078125, + "learning_rate": 0.00029985053981100286, + "loss": 0.9685, "step": 144 }, { - "epoch": 0.2829827915869981, - "grad_norm": 0.25, - "learning_rate": 0.00029930610227430767, - "loss": 0.9378, + "epoch": 0.1735052754982415, + "grad_norm": 0.208984375, + "learning_rate": 0.00029983596925191265, + "loss": 0.975, "step": 148 }, { - "epoch": 0.29063097514340347, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002992552327055802, - "loss": 0.9085, + "epoch": 0.17819460726846426, + "grad_norm": 0.224609375, + "learning_rate": 0.0002998207214978466, + "loss": 0.978, "step": 152 }, { - "epoch": 0.2982791586998088, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002992025685169461, - "loss": 0.9482, + "epoch": 0.182883939038687, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002998047966177182, + "loss": 0.9702, "step": 156 }, { - "epoch": 0.30592734225621415, - "grad_norm": 0.216796875, - "learning_rate": 0.00029914811034163096, - "loss": 0.949, + "epoch": 0.18757327080890973, + "grad_norm": 0.220703125, + "learning_rate": 0.0002997881946835012, + "loss": 0.9399, "step": 160 }, { - "epoch": 0.3135755258126195, - "grad_norm": 0.2421875, - "learning_rate": 0.00029909185883443063, - "loss": 0.9796, + "epoch": 0.19226260257913247, + "grad_norm": 0.244140625, + "learning_rate": 0.00029977091577022916, + "loss": 0.9403, "step": 164 }, { - "epoch": 0.32122370936902483, - "grad_norm": 0.2294921875, - "learning_rate": 0.0002990338146717039, - "loss": 0.9351, + "epoch": 0.1969519343493552, + "grad_norm": 0.2109375, + "learning_rate": 0.0002997529599559956, + "loss": 0.9342, "step": 168 }, { - "epoch": 0.32887189292543023, - "grad_norm": 0.220703125, - "learning_rate": 0.0002989739785513639, - "loss": 0.9166, + "epoch": 0.20164126611957797, + "grad_norm": 0.2138671875, + "learning_rate": 0.00029973432732195303, + "loss": 0.9186, "step": 172 }, { - "epoch": 0.3365200764818356, - "grad_norm": 0.2158203125, - "learning_rate": 0.0002989123511928703, - "loss": 0.9204, + "epoch": 0.2063305978898007, + "grad_norm": 0.22265625, + "learning_rate": 0.0002997150179523131, + "loss": 0.9377, "step": 176 }, { - "epoch": 0.3441682600382409, - "grad_norm": 0.2255859375, - "learning_rate": 0.00029884893333722, - "loss": 0.9356, + "epoch": 0.21101992966002345, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029969503193434606, + "loss": 0.9365, "step": 180 }, { - "epoch": 0.35181644359464626, - "grad_norm": 0.2265625, - "learning_rate": 0.0002987837257469387, - "loss": 0.9342, + "epoch": 0.21570926143024619, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029967436935838, + "loss": 0.921, "step": 184 }, { - "epoch": 0.35946462715105165, - "grad_norm": 0.2265625, - "learning_rate": 0.00029871672920607153, - "loss": 0.9026, + "epoch": 0.22039859320046892, + "grad_norm": 0.2138671875, + "learning_rate": 0.00029965303031780126, + "loss": 0.9041, "step": 188 }, { - "epoch": 0.367112810707457, - "grad_norm": 0.2314453125, - "learning_rate": 0.0002986479445201737, - "loss": 0.8983, + "epoch": 0.2250879249706917, + "grad_norm": 0.22265625, + "learning_rate": 0.00029963101490905307, + "loss": 0.9319, "step": 192 }, { - "epoch": 0.37476099426386233, - "grad_norm": 0.259765625, - "learning_rate": 0.0002985773725163008, - "loss": 0.922, + "epoch": 0.22977725674091443, + "grad_norm": 0.263671875, + "learning_rate": 0.0002996083232316358, + "loss": 0.9135, "step": 196 }, { - "epoch": 0.3824091778202677, - "grad_norm": 0.251953125, - "learning_rate": 0.0002985050140429986, - "loss": 0.9099, + "epoch": 0.23446658851113716, + "grad_norm": 0.236328125, + "learning_rate": 0.0002995849553881061, + "loss": 0.8845, "step": 200 }, { - "epoch": 0.390057361376673, - "grad_norm": 0.25, - "learning_rate": 0.0002984308699702935, - "loss": 0.8825, + "epoch": 0.2391559202813599, + "grad_norm": 0.244140625, + "learning_rate": 0.00029956091148407684, + "loss": 0.8891, "step": 204 }, { - "epoch": 0.3977055449330784, - "grad_norm": 0.2373046875, - "learning_rate": 0.0002983549411896812, - "loss": 0.893, + "epoch": 0.24384525205158264, + "grad_norm": 0.251953125, + "learning_rate": 0.00029953619162821616, + "loss": 0.8917, "step": 208 }, { - "epoch": 0.40535372848948376, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002982772286141167, - "loss": 0.9068, + "epoch": 0.2485345838218054, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002995107959322474, + "loss": 0.9432, "step": 212 }, { - "epoch": 0.4130019120458891, - "grad_norm": 0.2421875, - "learning_rate": 0.000298197733178003, - "loss": 0.8536, + "epoch": 0.2532239155920281, + "grad_norm": 0.2255859375, + "learning_rate": 0.00029948472451094823, + "loss": 0.9197, "step": 216 }, { - "epoch": 0.42065009560229444, - "grad_norm": 0.2041015625, - "learning_rate": 0.00029811645583717987, - "loss": 0.8843, + "epoch": 0.25791324736225085, + "grad_norm": 0.267578125, + "learning_rate": 0.0002994579774821505, + "loss": 0.9129, "step": 220 }, { - "epoch": 0.42829827915869984, - "grad_norm": 0.263671875, - "learning_rate": 0.00029803339756891254, - "loss": 0.8627, + "epoch": 0.26260257913247365, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002994305549667394, + "loss": 0.8779, "step": 224 }, { - "epoch": 0.4359464627151052, - "grad_norm": 0.263671875, - "learning_rate": 0.00029794855937187963, - "loss": 0.8572, + "epoch": 0.2672919109026964, + "grad_norm": 0.2314453125, + "learning_rate": 0.000299402457088653, + "loss": 0.9172, "step": 228 }, { - "epoch": 0.4435946462715105, - "grad_norm": 0.2265625, - "learning_rate": 0.0002978619422661613, - "loss": 0.8255, + "epoch": 0.2719812426729191, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002993736839748818, + "loss": 0.8779, "step": 232 }, { - "epoch": 0.45124282982791586, - "grad_norm": 0.2197265625, - "learning_rate": 0.0002977735472932273, - "loss": 0.8274, + "epoch": 0.27667057444314186, + "grad_norm": 0.236328125, + "learning_rate": 0.0002993442357554681, + "loss": 0.9188, "step": 236 }, { - "epoch": 0.4588910133843212, - "grad_norm": 0.2490234375, - "learning_rate": 0.00029768337551592394, - "loss": 0.8569, + "epoch": 0.2813599062133646, + "grad_norm": 0.2197265625, + "learning_rate": 0.00029931411256350535, + "loss": 0.8902, "step": 240 }, { - "epoch": 0.4665391969407266, - "grad_norm": 0.2255859375, - "learning_rate": 0.00029759142801846143, - "loss": 0.8791, + "epoch": 0.28604923798358733, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002992833145351376, + "loss": 0.8734, "step": 244 }, { - "epoch": 0.47418738049713194, - "grad_norm": 0.248046875, - "learning_rate": 0.00029749770590640123, - "loss": 0.8215, + "epoch": 0.29073856975381007, + "grad_norm": 0.21484375, + "learning_rate": 0.0002992518418095588, + "loss": 0.9317, "step": 248 }, { - "epoch": 0.4818355640535373, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029740221030664216, - "loss": 0.8546, + "epoch": 0.2954279015240328, + "grad_norm": 0.224609375, + "learning_rate": 0.00029921969452901235, + "loss": 0.8715, "step": 252 }, { - "epoch": 0.4894837476099426, - "grad_norm": 0.2421875, - "learning_rate": 0.00029730494236740744, - "loss": 0.8518, + "epoch": 0.30011723329425555, + "grad_norm": 0.236328125, + "learning_rate": 0.0002991868728387903, + "loss": 0.8825, "step": 256 }, { - "epoch": 0.497131931166348, - "grad_norm": 0.251953125, - "learning_rate": 0.0002972059032582304, - "loss": 0.8115, + "epoch": 0.3048065650644783, + "grad_norm": 0.23828125, + "learning_rate": 0.00029915337688723277, + "loss": 0.926, "step": 260 }, { - "epoch": 0.5047801147227533, - "grad_norm": 0.251953125, - "learning_rate": 0.0002971050941699407, - "loss": 0.8818, + "epoch": 0.3094958968347011, + "grad_norm": 0.23828125, + "learning_rate": 0.00029911920682572726, + "loss": 0.8774, "step": 264 }, { - "epoch": 0.5124282982791587, - "grad_norm": 0.2314453125, - "learning_rate": 0.00029700251631464993, - "loss": 0.8834, + "epoch": 0.3141852286049238, + "grad_norm": 0.216796875, + "learning_rate": 0.0002990843628087079, + "loss": 0.8844, "step": 268 }, { - "epoch": 0.5200764818355641, - "grad_norm": 0.23828125, - "learning_rate": 0.000296898170925737, - "loss": 0.8502, + "epoch": 0.31887456037514655, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002990488449936549, + "loss": 0.8683, "step": 272 }, { - "epoch": 0.5277246653919694, - "grad_norm": 0.2373046875, - "learning_rate": 0.0002967920592578335, - "loss": 0.854, + "epoch": 0.3235638921453693, + "grad_norm": 0.287109375, + "learning_rate": 0.00029901265354109367, + "loss": 0.796, "step": 276 }, { - "epoch": 0.5353728489483748, - "grad_norm": 0.240234375, - "learning_rate": 0.0002966841825868082, - "loss": 0.8164, + "epoch": 0.32825322391559203, + "grad_norm": 0.251953125, + "learning_rate": 0.0002989757886145942, + "loss": 0.8603, "step": 280 }, { - "epoch": 0.5430210325047801, - "grad_norm": 0.228515625, - "learning_rate": 0.00029657454220975216, - "loss": 0.9058, + "epoch": 0.33294255568581477, + "grad_norm": 0.263671875, + "learning_rate": 0.0002989382503807704, + "loss": 0.9065, "step": 284 }, { - "epoch": 0.5506692160611855, - "grad_norm": 0.2294921875, - "learning_rate": 0.00029646313944496297, - "loss": 0.8646, + "epoch": 0.3376318874560375, + "grad_norm": 0.240234375, + "learning_rate": 0.00029890003900927904, + "loss": 0.8713, "step": 288 }, { - "epoch": 0.5583173996175909, - "grad_norm": 0.2158203125, - "learning_rate": 0.00029634997563192866, - "loss": 0.8536, + "epoch": 0.34232121922626024, + "grad_norm": 0.24609375, + "learning_rate": 0.0002988611546728194, + "loss": 0.8532, "step": 292 }, { - "epoch": 0.5659655831739961, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002962350521313122, - "loss": 0.8532, + "epoch": 0.347010550996483, + "grad_norm": 0.2255859375, + "learning_rate": 0.000298821597547132, + "loss": 0.8241, "step": 296 }, { - "epoch": 0.5736137667304015, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002961183703249342, - "loss": 0.8228, + "epoch": 0.3516998827667057, + "grad_norm": 0.216796875, + "learning_rate": 0.0002987813678109985, + "loss": 0.8575, "step": 300 }, { - "epoch": 0.5812619502868069, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002959999316157573, - "loss": 0.8088, + "epoch": 0.3563892145369285, + "grad_norm": 0.216796875, + "learning_rate": 0.00029874046564624, + "loss": 0.8889, "step": 304 }, { - "epoch": 0.5889101338432122, - "grad_norm": 0.24609375, - "learning_rate": 0.00029587973742786875, - "loss": 0.8291, + "epoch": 0.36107854630715125, + "grad_norm": 0.259765625, + "learning_rate": 0.0002986988912377171, + "loss": 0.8393, "step": 308 }, { - "epoch": 0.5965583173996176, - "grad_norm": 0.271484375, - "learning_rate": 0.0002957577892064632, - "loss": 0.8087, + "epoch": 0.365767878077374, + "grad_norm": 0.236328125, + "learning_rate": 0.00029865664477332843, + "loss": 0.8675, "step": 312 }, { - "epoch": 0.6042065009560229, - "grad_norm": 0.2373046875, - "learning_rate": 0.00029563408841782576, - "loss": 0.853, + "epoch": 0.3704572098475967, + "grad_norm": 0.21875, + "learning_rate": 0.00029861372644401, + "loss": 0.8316, "step": 316 }, { - "epoch": 0.6118546845124283, - "grad_norm": 0.251953125, - "learning_rate": 0.00029550863654931385, - "loss": 0.847, + "epoch": 0.37514654161781946, + "grad_norm": 0.224609375, + "learning_rate": 0.0002985701364437345, + "loss": 0.8705, "step": 320 }, { - "epoch": 0.6195028680688337, - "grad_norm": 0.25390625, - "learning_rate": 0.0002953814351093398, - "loss": 0.8087, + "epoch": 0.3798358733880422, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002985258749695102, + "loss": 0.8752, "step": 324 }, { - "epoch": 0.627151051625239, - "grad_norm": 0.2421875, - "learning_rate": 0.0002952524856273524, - "loss": 0.8514, + "epoch": 0.38452520515826494, + "grad_norm": 0.244140625, + "learning_rate": 0.00029848094222138024, + "loss": 0.8594, "step": 328 }, { - "epoch": 0.6347992351816444, - "grad_norm": 0.212890625, - "learning_rate": 0.00029512178965381854, - "loss": 0.8501, + "epoch": 0.3892145369284877, + "grad_norm": 0.251953125, + "learning_rate": 0.0002984353384024215, + "loss": 0.8335, "step": 332 }, { - "epoch": 0.6424474187380497, - "grad_norm": 0.2353515625, - "learning_rate": 0.00029498934876020475, - "loss": 0.8029, + "epoch": 0.3939038686987104, + "grad_norm": 0.25, + "learning_rate": 0.0002983890637187439, + "loss": 0.7962, "step": 336 }, { - "epoch": 0.6500956022944551, - "grad_norm": 0.232421875, - "learning_rate": 0.00029485516453895826, - "loss": 0.8293, + "epoch": 0.39859320046893315, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002983421183794893, + "loss": 0.8526, "step": 340 }, { - "epoch": 0.6577437858508605, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002947192386034874, - "loss": 0.8695, + "epoch": 0.40328253223915594, + "grad_norm": 0.23046875, + "learning_rate": 0.00029829450259683085, + "loss": 0.8273, "step": 344 }, { - "epoch": 0.6653919694072657, - "grad_norm": 0.236328125, - "learning_rate": 0.00029458157258814316, - "loss": 0.8249, + "epoch": 0.4079718640093787, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029824621658597165, + "loss": 0.8174, "step": 348 }, { - "epoch": 0.6730401529636711, - "grad_norm": 0.234375, - "learning_rate": 0.00029444216814819834, - "loss": 0.8009, + "epoch": 0.4126611957796014, + "grad_norm": 0.259765625, + "learning_rate": 0.00029819726056514383, + "loss": 0.836, "step": 352 }, { - "epoch": 0.6806883365200764, - "grad_norm": 0.2490234375, - "learning_rate": 0.00029430102695982875, - "loss": 0.8642, + "epoch": 0.41735052754982416, + "grad_norm": 0.259765625, + "learning_rate": 0.00029814763475560796, + "loss": 0.814, "step": 356 }, { - "epoch": 0.6883365200764818, - "grad_norm": 0.228515625, - "learning_rate": 0.00029415815072009237, - "loss": 0.8562, + "epoch": 0.4220398593200469, + "grad_norm": 0.23046875, + "learning_rate": 0.00029809733938165157, + "loss": 0.8317, "step": 360 }, { - "epoch": 0.6959847036328872, - "grad_norm": 0.240234375, - "learning_rate": 0.00029401354114690905, - "loss": 0.8274, + "epoch": 0.42672919109026963, + "grad_norm": 0.25, + "learning_rate": 0.0002980463746705884, + "loss": 0.8571, "step": 364 }, { - "epoch": 0.7036328871892925, - "grad_norm": 0.2099609375, - "learning_rate": 0.0002938671999790402, - "loss": 0.8214, + "epoch": 0.43141852286049237, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029799474085275734, + "loss": 0.8309, "step": 368 }, { - "epoch": 0.7112810707456979, - "grad_norm": 0.24609375, - "learning_rate": 0.00029371912897606736, - "loss": 0.8537, + "epoch": 0.4361078546307151, + "grad_norm": 0.255859375, + "learning_rate": 0.00029794243816152127, + "loss": 0.8289, "step": 372 }, { - "epoch": 0.7189292543021033, - "grad_norm": 0.2275390625, - "learning_rate": 0.00029356932991837163, - "loss": 0.8378, + "epoch": 0.44079718640093785, + "grad_norm": 0.232421875, + "learning_rate": 0.0002978894668332663, + "loss": 0.8364, "step": 376 }, { - "epoch": 0.7265774378585086, - "grad_norm": 0.251953125, - "learning_rate": 0.0002934178046071116, - "loss": 0.8064, + "epoch": 0.4454865181711606, + "grad_norm": 0.2421875, + "learning_rate": 0.00029783582710740013, + "loss": 0.8101, "step": 380 }, { - "epoch": 0.734225621414914, - "grad_norm": 0.259765625, - "learning_rate": 0.0002932645548642024, - "loss": 0.8427, + "epoch": 0.4501758499413834, + "grad_norm": 0.224609375, + "learning_rate": 0.00029778151922635175, + "loss": 0.8661, "step": 384 }, { - "epoch": 0.7418738049713193, - "grad_norm": 0.251953125, - "learning_rate": 0.0002931095825322931, - "loss": 0.7602, + "epoch": 0.4548651817116061, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002977265434355696, + "loss": 0.8419, "step": 388 }, { - "epoch": 0.7495219885277247, - "grad_norm": 0.232421875, - "learning_rate": 0.00029295288947474513, - "loss": 0.881, + "epoch": 0.45955451348182885, + "grad_norm": 0.259765625, + "learning_rate": 0.000297670899983521, + "loss": 0.8444, "step": 392 }, { - "epoch": 0.7571701720841301, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002927944775756095, - "loss": 0.865, + "epoch": 0.4642438452520516, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029761458912169064, + "loss": 0.8413, "step": 396 }, { - "epoch": 0.7648183556405354, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002926343487396044, - "loss": 0.839, + "epoch": 0.46893317702227433, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002975576111045798, + "loss": 0.7815, "step": 400 }, { - "epoch": 0.7724665391969407, - "grad_norm": 0.2412109375, - "learning_rate": 0.00029247250489209217, - "loss": 0.7939, + "epoch": 0.47362250879249707, + "grad_norm": 0.24609375, + "learning_rate": 0.0002974999661897049, + "loss": 0.8496, "step": 404 }, { - "epoch": 0.780114722753346, - "grad_norm": 0.2470703125, - "learning_rate": 0.00029230894797905595, - "loss": 0.7748, + "epoch": 0.4783118405627198, + "grad_norm": 0.263671875, + "learning_rate": 0.0002974416546375965, + "loss": 0.842, "step": 408 }, { - "epoch": 0.7877629063097514, - "grad_norm": 0.255859375, - "learning_rate": 0.00029214367996707676, - "loss": 0.7829, + "epoch": 0.48300117233294254, + "grad_norm": 0.234375, + "learning_rate": 0.00029738267671179793, + "loss": 0.8453, "step": 412 }, { - "epoch": 0.7954110898661568, - "grad_norm": 0.2392578125, - "learning_rate": 0.00029197670284330954, - "loss": 0.7867, + "epoch": 0.4876905041031653, + "grad_norm": 0.23046875, + "learning_rate": 0.00029732303267886455, + "loss": 0.8066, "step": 416 }, { - "epoch": 0.8030592734225621, - "grad_norm": 0.2255859375, - "learning_rate": 0.00029180801861545906, - "loss": 0.7971, + "epoch": 0.492379835873388, + "grad_norm": 0.26953125, + "learning_rate": 0.00029726272280836206, + "loss": 0.749, "step": 420 }, { - "epoch": 0.8107074569789675, - "grad_norm": 0.25390625, - "learning_rate": 0.0002916376293117564, - "loss": 0.8241, + "epoch": 0.4970691676436108, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002972017473728654, + "loss": 0.8594, "step": 424 }, { - "epoch": 0.8183556405353728, - "grad_norm": 0.244140625, - "learning_rate": 0.00029146553698093387, - "loss": 0.8119, + "epoch": 0.5017584994138335, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029714010664795765, + "loss": 0.8312, "step": 428 }, { - "epoch": 0.8260038240917782, - "grad_norm": 0.271484375, - "learning_rate": 0.00029129174369220087, - "loss": 0.8048, + "epoch": 0.5064478311840562, + "grad_norm": 0.23828125, + "learning_rate": 0.00029707780091222877, + "loss": 0.8336, "step": 432 }, { - "epoch": 0.8336520076481836, - "grad_norm": 0.232421875, - "learning_rate": 0.00029111625153521877, - "loss": 0.76, + "epoch": 0.511137162954279, + "grad_norm": 0.26953125, + "learning_rate": 0.0002970148304472742, + "loss": 0.8229, "step": 436 }, { - "epoch": 0.8413001912045889, - "grad_norm": 0.271484375, - "learning_rate": 0.00029093906262007583, - "loss": 0.7833, + "epoch": 0.5158264947245017, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002969511955376937, + "loss": 0.7516, "step": 440 }, { - "epoch": 0.8489483747609943, - "grad_norm": 0.2158203125, - "learning_rate": 0.00029076017907726196, - "loss": 0.8027, + "epoch": 0.5205158264947245, + "grad_norm": 0.2373046875, + "learning_rate": 0.00029688689647109013, + "loss": 0.8158, "step": 444 }, { - "epoch": 0.8565965583173997, - "grad_norm": 0.21875, - "learning_rate": 0.0002905796030576428, - "loss": 0.818, + "epoch": 0.5252051582649473, + "grad_norm": 0.2734375, + "learning_rate": 0.00029682193353806793, + "loss": 0.7859, "step": 448 }, { - "epoch": 0.864244741873805, - "grad_norm": 0.2373046875, - "learning_rate": 0.00029039733673243416, - "loss": 0.8358, + "epoch": 0.52989449003517, + "grad_norm": 0.283203125, + "learning_rate": 0.00029675630703223196, + "loss": 0.8025, "step": 452 }, { - "epoch": 0.8718929254302104, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002902133822931759, - "loss": 0.7543, + "epoch": 0.5345838218053928, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002966900172501861, + "loss": 0.7937, "step": 456 }, { - "epoch": 0.8795411089866156, - "grad_norm": 0.2421875, - "learning_rate": 0.00029002774195170525, - "loss": 0.7765, + "epoch": 0.5392731535756154, + "grad_norm": 0.2578125, + "learning_rate": 0.00029662306449153216, + "loss": 0.8076, "step": 460 }, { - "epoch": 0.887189292543021, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002898404179401306, - "loss": 0.8094, + "epoch": 0.5439624853458382, + "grad_norm": 0.2216796875, + "learning_rate": 0.00029655544905886816, + "loss": 0.8268, "step": 464 }, { - "epoch": 0.8948374760994264, - "grad_norm": 0.2431640625, - "learning_rate": 0.0002896514125108045, - "loss": 0.7657, + "epoch": 0.5486518171160609, + "grad_norm": 0.2421875, + "learning_rate": 0.0002964871712577871, + "loss": 0.7727, "step": 468 }, { - "epoch": 0.9024856596558317, - "grad_norm": 0.2353515625, - "learning_rate": 0.0002894607279362966, - "loss": 0.7774, + "epoch": 0.5533411488862837, + "grad_norm": 0.251953125, + "learning_rate": 0.0002964182313968757, + "loss": 0.809, "step": 472 }, { - "epoch": 0.9101338432122371, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002892683665093662, - "loss": 0.8148, + "epoch": 0.5580304806565064, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002963486297877128, + "loss": 0.8369, "step": 476 }, { - "epoch": 0.9177820267686424, + "epoch": 0.5627198124267292, "grad_norm": 0.236328125, - "learning_rate": 0.0002890743305429348, - "loss": 0.7882, + "learning_rate": 0.00029627836674486817, + "loss": 0.7811, "step": 480 }, { - "epoch": 0.9254302103250478, - "grad_norm": 0.2294921875, - "learning_rate": 0.0002888786223700585, - "loss": 0.7656, + "epoch": 0.567409144196952, + "grad_norm": 0.2421875, + "learning_rate": 0.00029620744258590097, + "loss": 0.7798, "step": 484 }, { - "epoch": 0.9330783938814532, - "grad_norm": 0.224609375, - "learning_rate": 0.00028868124434389944, - "loss": 0.7802, + "epoch": 0.5720984759671747, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002961358576313583, + "loss": 0.809, "step": 488 }, { - "epoch": 0.9407265774378585, - "grad_norm": 0.240234375, - "learning_rate": 0.00028848219883769805, - "loss": 0.7773, + "epoch": 0.5767878077373975, + "grad_norm": 0.255859375, + "learning_rate": 0.00029606361220477364, + "loss": 0.7677, "step": 492 }, { - "epoch": 0.9483747609942639, - "grad_norm": 0.2314453125, - "learning_rate": 0.000288281488244744, - "loss": 0.7803, + "epoch": 0.5814771395076201, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002959907066326658, + "loss": 0.7994, "step": 496 }, { - "epoch": 0.9560229445506692, - "grad_norm": 0.255859375, - "learning_rate": 0.000288079114978348, - "loss": 0.8056, + "epoch": 0.5861664712778429, + "grad_norm": 0.267578125, + "learning_rate": 0.00029591714124453693, + "loss": 0.8217, "step": 500 }, { - "epoch": 0.9636711281070746, - "grad_norm": 0.240234375, - "learning_rate": 0.0002878750814718121, - "loss": 0.8309, + "epoch": 0.5908558030480656, + "grad_norm": 0.234375, + "learning_rate": 0.00029584291637287146, + "loss": 0.7913, "step": 504 }, { - "epoch": 0.97131931166348, - "grad_norm": 0.2294921875, - "learning_rate": 0.00028766939017840114, - "loss": 0.7737, + "epoch": 0.5955451348182884, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029576803235313413, + "loss": 0.7679, "step": 508 }, { - "epoch": 0.9789674952198852, - "grad_norm": 0.26171875, - "learning_rate": 0.00028746204357131273, - "loss": 0.8039, + "epoch": 0.6002344665885111, + "grad_norm": 0.2578125, + "learning_rate": 0.00029569248952376903, + "loss": 0.7839, "step": 512 }, { - "epoch": 0.9866156787762906, - "grad_norm": 0.2255859375, - "learning_rate": 0.0002872530441436477, - "loss": 0.7341, + "epoch": 0.6049237983587339, + "grad_norm": 0.228515625, + "learning_rate": 0.00029561628822619775, + "loss": 0.8129, "step": 516 }, { - "epoch": 0.994263862332696, - "grad_norm": 0.228515625, - "learning_rate": 0.0002870423944083801, - "loss": 0.8122, + "epoch": 0.6096131301289566, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029553942880481765, + "loss": 0.8105, "step": 520 }, { - "epoch": 1.0019120458891013, - "grad_norm": 0.216796875, - "learning_rate": 0.0002868300968983271, - "loss": 0.7403, + "epoch": 0.6143024618991794, + "grad_norm": 0.2578125, + "learning_rate": 0.0002954619116070008, + "loss": 0.7806, "step": 524 }, { - "epoch": 1.0095602294455066, - "grad_norm": 0.24609375, - "learning_rate": 0.0002866161541661185, - "loss": 0.697, + "epoch": 0.6189917936694022, + "grad_norm": 0.25, + "learning_rate": 0.00029538373698309193, + "loss": 0.7739, "step": 528 }, { - "epoch": 1.0172084130019121, - "grad_norm": 0.23828125, - "learning_rate": 0.0002864005687841656, - "loss": 0.7442, + "epoch": 0.6236811254396248, + "grad_norm": 0.2734375, + "learning_rate": 0.00029530490528640723, + "loss": 0.7882, "step": 532 }, { - "epoch": 1.0248565965583174, - "grad_norm": 0.26953125, - "learning_rate": 0.0002861833433446312, - "loss": 0.6853, + "epoch": 0.6283704572098476, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029522541687323253, + "loss": 0.7934, "step": 536 }, { - "epoch": 1.0325047801147227, - "grad_norm": 0.23828125, - "learning_rate": 0.00028596448045939735, - "loss": 0.7398, + "epoch": 0.6330597889800703, + "grad_norm": 0.271484375, + "learning_rate": 0.00029514527210282163, + "loss": 0.8188, "step": 540 }, { - "epoch": 1.0401529636711282, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002857439827600348, - "loss": 0.7912, + "epoch": 0.6377491207502931, + "grad_norm": 0.2421875, + "learning_rate": 0.00029506447133739494, + "loss": 0.79, "step": 544 }, { - "epoch": 1.0478011472275335, - "grad_norm": 0.259765625, - "learning_rate": 0.0002855218528977709, - "loss": 0.7138, + "epoch": 0.6424384525205158, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002949830149421376, + "loss": 0.7961, "step": 548 }, { - "epoch": 1.0554493307839388, - "grad_norm": 0.2314453125, - "learning_rate": 0.00028529809354345794, - "loss": 0.726, + "epoch": 0.6471277842907386, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029490090328519795, + "loss": 0.798, "step": 552 }, { - "epoch": 1.063097514340344, - "grad_norm": 0.287109375, - "learning_rate": 0.0002850727073875409, - "loss": 0.7058, + "epoch": 0.6518171160609613, + "grad_norm": 0.263671875, + "learning_rate": 0.00029481813673768576, + "loss": 0.7388, "step": 556 }, { - "epoch": 1.0707456978967496, - "grad_norm": 0.228515625, - "learning_rate": 0.00028484569714002517, - "loss": 0.7102, + "epoch": 0.6565064478311841, + "grad_norm": 0.23046875, + "learning_rate": 0.0002947347156736708, + "loss": 0.8278, "step": 560 }, { - "epoch": 1.0783938814531548, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002846170655304438, - "loss": 0.6534, + "epoch": 0.6611957796014069, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002946506404701808, + "loss": 0.7871, "step": 564 }, { - "epoch": 1.0860420650095601, - "grad_norm": 0.23828125, - "learning_rate": 0.0002843868153078251, - "loss": 0.6918, + "epoch": 0.6658851113716295, + "grad_norm": 0.25390625, + "learning_rate": 0.0002945659115071999, + "loss": 0.7859, "step": 568 }, { - "epoch": 1.0936902485659656, - "grad_norm": 0.251953125, - "learning_rate": 0.000284154949240659, - "loss": 0.6673, + "epoch": 0.6705744431418523, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002944805291676672, + "loss": 0.8397, "step": 572 }, { - "epoch": 1.101338432122371, - "grad_norm": 0.255859375, - "learning_rate": 0.0002839214701168644, - "loss": 0.6722, + "epoch": 0.675263774912075, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002943944938374746, + "loss": 0.776, "step": 576 }, { - "epoch": 1.1089866156787762, - "grad_norm": 0.251953125, - "learning_rate": 0.00028368638074375516, - "loss": 0.7141, + "epoch": 0.6799531066822978, + "grad_norm": 0.2578125, + "learning_rate": 0.0002943078059054652, + "loss": 0.8045, "step": 580 }, { - "epoch": 1.1166347992351817, - "grad_norm": 0.263671875, - "learning_rate": 0.0002834496839480063, - "loss": 0.665, + "epoch": 0.6846424384525205, + "grad_norm": 0.255859375, + "learning_rate": 0.0002942204657634317, + "loss": 0.8548, "step": 584 }, { - "epoch": 1.124282982791587, - "grad_norm": 0.2421875, - "learning_rate": 0.00028321138257562066, - "loss": 0.6886, + "epoch": 0.6893317702227433, + "grad_norm": 0.240234375, + "learning_rate": 0.0002941324738061145, + "loss": 0.741, "step": 588 }, { - "epoch": 1.1319311663479923, - "grad_norm": 0.265625, - "learning_rate": 0.00028297147949189386, - "loss": 0.6997, + "epoch": 0.694021101992966, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029404383043119984, + "loss": 0.7604, "step": 592 }, { - "epoch": 1.1395793499043978, - "grad_norm": 0.2578125, - "learning_rate": 0.00028272997758138044, - "loss": 0.7051, + "epoch": 0.6987104337631888, + "grad_norm": 0.2470703125, + "learning_rate": 0.00029395453603931816, + "loss": 0.7914, "step": 596 }, { - "epoch": 1.147227533460803, - "grad_norm": 0.248046875, - "learning_rate": 0.00028248687974785896, - "loss": 0.7188, + "epoch": 0.7033997655334114, + "grad_norm": 0.296875, + "learning_rate": 0.00029386459103404215, + "loss": 0.7633, "step": 600 }, { - "epoch": 1.1548757170172084, - "grad_norm": 0.275390625, - "learning_rate": 0.0002822421889142969, - "loss": 0.6757, + "epoch": 0.7080890973036342, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002937739958218852, + "loss": 0.7783, "step": 604 }, { - "epoch": 1.1625239005736137, - "grad_norm": 0.25, - "learning_rate": 0.00028199590802281595, - "loss": 0.7203, + "epoch": 0.712778429073857, + "grad_norm": 0.2333984375, + "learning_rate": 0.000293682750812299, + "loss": 0.7437, "step": 608 }, { - "epoch": 1.1701720841300192, - "grad_norm": 0.255859375, - "learning_rate": 0.00028174804003465616, - "loss": 0.6943, + "epoch": 0.7174677608440797, + "grad_norm": 0.2431640625, + "learning_rate": 0.00029359085641767244, + "loss": 0.746, "step": 612 }, { - "epoch": 1.1778202676864244, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002814985879301408, - "loss": 0.7037, + "epoch": 0.7221570926143025, + "grad_norm": 0.2421875, + "learning_rate": 0.00029349831305332914, + "loss": 0.774, "step": 616 }, { - "epoch": 1.1854684512428297, - "grad_norm": 0.26953125, - "learning_rate": 0.0002812475547086401, - "loss": 0.7215, + "epoch": 0.7268464243845252, + "grad_norm": 0.271484375, + "learning_rate": 0.0002934051211375258, + "loss": 0.7411, "step": 620 }, { - "epoch": 1.1931166347992352, - "grad_norm": 0.263671875, - "learning_rate": 0.00028099494338853554, - "loss": 0.6863, + "epoch": 0.731535756154748, + "grad_norm": 0.24609375, + "learning_rate": 0.0002933112810914503, + "loss": 0.7725, "step": 624 }, { - "epoch": 1.2007648183556405, - "grad_norm": 0.240234375, - "learning_rate": 0.0002807407570071832, - "loss": 0.7432, + "epoch": 0.7362250879249707, + "grad_norm": 0.25, + "learning_rate": 0.0002932167933392198, + "loss": 0.812, "step": 628 }, { - "epoch": 1.2084130019120458, - "grad_norm": 0.244140625, - "learning_rate": 0.00028048499862087757, - "loss": 0.7265, + "epoch": 0.7409144196951934, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002931216583078789, + "loss": 0.8059, "step": 632 }, { - "epoch": 1.2160611854684513, - "grad_norm": 0.259765625, - "learning_rate": 0.00028022767130481466, - "loss": 0.6848, + "epoch": 0.7456037514654161, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002930258764273975, + "loss": 0.7539, "step": 636 }, { - "epoch": 1.2237093690248566, - "grad_norm": 0.2470703125, - "learning_rate": 0.0002799687781530549, - "loss": 0.717, + "epoch": 0.7502930832356389, + "grad_norm": 0.259765625, + "learning_rate": 0.000292929448130669, + "loss": 0.7813, "step": 640 }, { - "epoch": 1.231357552581262, - "grad_norm": 0.259765625, - "learning_rate": 0.00027970832227848627, - "loss": 0.7011, + "epoch": 0.7549824150058617, + "grad_norm": 0.26953125, + "learning_rate": 0.0002928323738535085, + "loss": 0.817, "step": 644 }, { - "epoch": 1.2390057361376674, - "grad_norm": 0.2578125, - "learning_rate": 0.0002794463068127866, - "loss": 0.7696, + "epoch": 0.7596717467760844, + "grad_norm": 0.23828125, + "learning_rate": 0.00029273465403465045, + "loss": 0.7949, "step": 648 }, { - "epoch": 1.2466539196940727, - "grad_norm": 0.2431640625, - "learning_rate": 0.00027918273490638574, - "loss": 0.6922, + "epoch": 0.7643610785463072, + "grad_norm": 0.259765625, + "learning_rate": 0.0002926362891157469, + "loss": 0.7825, "step": 652 }, { - "epoch": 1.254302103250478, - "grad_norm": 0.28515625, - "learning_rate": 0.0002789176097284283, - "loss": 0.6521, + "epoch": 0.7690504103165299, + "grad_norm": 0.2421875, + "learning_rate": 0.0002925372795413656, + "loss": 0.7385, "step": 656 }, { - "epoch": 1.2619502868068833, - "grad_norm": 0.25390625, - "learning_rate": 0.0002786509344667349, - "loss": 0.6642, + "epoch": 0.7737397420867527, + "grad_norm": 0.234375, + "learning_rate": 0.00029243762575898775, + "loss": 0.8009, "step": 660 }, { - "epoch": 1.2695984703632888, - "grad_norm": 0.25390625, - "learning_rate": 0.0002783827123277643, - "loss": 0.7773, + "epoch": 0.7784290738569754, + "grad_norm": 0.255859375, + "learning_rate": 0.0002923373282190062, + "loss": 0.7594, "step": 664 }, { - "epoch": 1.277246653919694, - "grad_norm": 0.2431640625, - "learning_rate": 0.00027811294653657444, - "loss": 0.7314, + "epoch": 0.7831184056271981, + "grad_norm": 0.251953125, + "learning_rate": 0.00029223638737472325, + "loss": 0.7312, "step": 668 }, { - "epoch": 1.2848948374760996, - "grad_norm": 0.2373046875, - "learning_rate": 0.000277841640336784, - "loss": 0.7461, + "epoch": 0.7878077373974208, + "grad_norm": 0.2421875, + "learning_rate": 0.00029213480368234853, + "loss": 0.7294, "step": 672 }, { - "epoch": 1.2925430210325048, - "grad_norm": 0.236328125, - "learning_rate": 0.00027756879699053337, - "loss": 0.7426, + "epoch": 0.7924970691676436, + "grad_norm": 0.255859375, + "learning_rate": 0.00029203257760099737, + "loss": 0.7024, "step": 676 }, { - "epoch": 1.3001912045889101, + "epoch": 0.7971864009378663, "grad_norm": 0.26171875, - "learning_rate": 0.0002772944197784451, - "loss": 0.6986, + "learning_rate": 0.0002919297095926883, + "loss": 0.7424, "step": 680 }, { - "epoch": 1.3078393881453154, - "grad_norm": 0.2412109375, - "learning_rate": 0.0002770185119995849, - "loss": 0.7379, + "epoch": 0.8018757327080891, + "grad_norm": 0.275390625, + "learning_rate": 0.0002918262001223408, + "loss": 0.7668, "step": 684 }, { - "epoch": 1.3154875717017207, - "grad_norm": 0.265625, - "learning_rate": 0.0002767410769714216, - "loss": 0.7146, + "epoch": 0.8065650644783119, + "grad_norm": 0.25390625, + "learning_rate": 0.000291722049657774, + "loss": 0.7923, "step": 688 }, { - "epoch": 1.3231357552581262, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002764621180297875, - "loss": 0.7061, + "epoch": 0.8112543962485346, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002916172586697036, + "loss": 0.8041, "step": 692 }, { - "epoch": 1.3307839388145315, - "grad_norm": 0.271484375, - "learning_rate": 0.0002761816385288382, - "loss": 0.6547, + "epoch": 0.8159437280187574, + "grad_norm": 0.2451171875, + "learning_rate": 0.00029151182763174053, + "loss": 0.7869, "step": 696 }, { - "epoch": 1.338432122370937, - "grad_norm": 0.271484375, - "learning_rate": 0.0002758996418410122, - "loss": 0.7018, + "epoch": 0.82063305978898, + "grad_norm": 0.259765625, + "learning_rate": 0.0002914057570203882, + "loss": 0.7462, "step": 700 }, { - "epoch": 1.3460803059273423, - "grad_norm": 0.2578125, - "learning_rate": 0.0002756161313569904, - "loss": 0.7062, + "epoch": 0.8253223915592028, + "grad_norm": 0.2265625, + "learning_rate": 0.0002912990473150409, + "loss": 0.7332, "step": 704 }, { - "epoch": 1.3537284894837476, - "grad_norm": 0.271484375, - "learning_rate": 0.00027533111048565537, - "loss": 0.778, + "epoch": 0.8300117233294255, + "grad_norm": 0.25, + "learning_rate": 0.000291191698997981, + "loss": 0.7944, "step": 708 }, { - "epoch": 1.3613766730401529, - "grad_norm": 0.271484375, - "learning_rate": 0.00027504458265405034, - "loss": 0.6916, + "epoch": 0.8347010550996483, + "grad_norm": 0.26171875, + "learning_rate": 0.0002910837125543775, + "loss": 0.7531, "step": 712 }, { - "epoch": 1.3690248565965584, - "grad_norm": 0.2333984375, - "learning_rate": 0.00027475655130733786, - "loss": 0.6862, + "epoch": 0.839390386869871, + "grad_norm": 0.26953125, + "learning_rate": 0.0002909750884722832, + "loss": 0.7751, "step": 716 }, { - "epoch": 1.3766730401529637, - "grad_norm": 0.267578125, - "learning_rate": 0.00027446701990875864, - "loss": 0.7037, + "epoch": 0.8440797186400938, + "grad_norm": 0.2412109375, + "learning_rate": 0.00029086582724263286, + "loss": 0.7757, "step": 720 }, { - "epoch": 1.384321223709369, - "grad_norm": 0.2431640625, - "learning_rate": 0.00027417599193958964, - "loss": 0.6976, + "epoch": 0.8487690504103166, + "grad_norm": 0.2265625, + "learning_rate": 0.00029075592935924084, + "loss": 0.7761, "step": 724 }, { - "epoch": 1.3919694072657744, + "epoch": 0.8534583821805393, "grad_norm": 0.24609375, - "learning_rate": 0.00027388347089910253, - "loss": 0.7209, + "learning_rate": 0.00029064539531879893, + "loss": 0.8099, "step": 728 }, { - "epoch": 1.3996175908221797, - "grad_norm": 0.251953125, - "learning_rate": 0.0002735894603045211, - "loss": 0.7009, + "epoch": 0.8581477139507621, + "grad_norm": 0.216796875, + "learning_rate": 0.0002905342256208741, + "loss": 0.759, "step": 732 }, { - "epoch": 1.407265774378585, - "grad_norm": 0.2578125, - "learning_rate": 0.0002732939636909796, - "loss": 0.6583, + "epoch": 0.8628370457209847, + "grad_norm": 0.2421875, + "learning_rate": 0.0002904224207679061, + "loss": 0.7609, "step": 736 }, { - "epoch": 1.4149139579349903, - "grad_norm": 0.255859375, - "learning_rate": 0.00027299698461147966, - "loss": 0.6999, + "epoch": 0.8675263774912075, + "grad_norm": 0.263671875, + "learning_rate": 0.0002903099812652056, + "loss": 0.717, "step": 740 }, { - "epoch": 1.4225621414913958, - "grad_norm": 0.267578125, - "learning_rate": 0.0002726985266368481, - "loss": 0.7269, + "epoch": 0.8722157092614302, + "grad_norm": 0.244140625, + "learning_rate": 0.00029019690762095116, + "loss": 0.7318, "step": 744 }, { - "epoch": 1.430210325047801, - "grad_norm": 0.2392578125, - "learning_rate": 0.0002723985933556936, - "loss": 0.6256, + "epoch": 0.876905041031653, + "grad_norm": 0.240234375, + "learning_rate": 0.00029008320034618784, + "loss": 0.7132, "step": 748 }, { - "epoch": 1.4378585086042066, - "grad_norm": 0.271484375, - "learning_rate": 0.00027209718837436353, - "loss": 0.7129, + "epoch": 0.8815943728018757, + "grad_norm": 0.240234375, + "learning_rate": 0.00028996885995482424, + "loss": 0.7291, "step": 752 }, { - "epoch": 1.445506692160612, - "grad_norm": 0.271484375, - "learning_rate": 0.000271794315316901, - "loss": 0.6623, + "epoch": 0.8862837045720985, + "grad_norm": 0.248046875, + "learning_rate": 0.0002898538869636303, + "loss": 0.7854, "step": 756 }, { - "epoch": 1.4531548757170172, + "epoch": 0.8909730363423212, "grad_norm": 0.259765625, - "learning_rate": 0.00027148997782500085, - "loss": 0.6869, + "learning_rate": 0.0002897382818922352, + "loss": 0.7146, "step": 760 }, { - "epoch": 1.4608030592734225, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002711841795579661, - "loss": 0.7426, + "epoch": 0.895662368112544, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002896220452631247, + "loss": 0.8165, "step": 764 }, { - "epoch": 1.468451242829828, - "grad_norm": 0.2578125, - "learning_rate": 0.00027087692419266383, - "loss": 0.6731, + "epoch": 0.9003516998827668, + "grad_norm": 0.240234375, + "learning_rate": 0.0002895051776016392, + "loss": 0.7719, "step": 768 }, { - "epoch": 1.4760994263862333, - "grad_norm": 0.275390625, - "learning_rate": 0.00027056821542348114, - "loss": 0.7591, + "epoch": 0.9050410316529894, + "grad_norm": 0.2412109375, + "learning_rate": 0.00028938767943597075, + "loss": 0.793, "step": 772 }, { - "epoch": 1.4837476099426385, - "grad_norm": 0.25, - "learning_rate": 0.0002702580569622805, - "loss": 0.7129, + "epoch": 0.9097303634232122, + "grad_norm": 0.228515625, + "learning_rate": 0.0002892695512971613, + "loss": 0.7722, "step": 776 }, { - "epoch": 1.491395793499044, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002699464525383552, - "loss": 0.7307, + "epoch": 0.9144196951934349, + "grad_norm": 0.240234375, + "learning_rate": 0.0002891507937190998, + "loss": 0.664, "step": 780 }, { - "epoch": 1.4990439770554493, - "grad_norm": 0.2578125, - "learning_rate": 0.0002696334058983848, - "loss": 0.7317, + "epoch": 0.9191090269636577, + "grad_norm": 0.240234375, + "learning_rate": 0.0002890314072385201, + "loss": 0.7462, "step": 784 }, { - "epoch": 1.5066921606118546, - "grad_norm": 0.2578125, - "learning_rate": 0.0002693189208063894, - "loss": 0.6994, + "epoch": 0.9237983587338804, + "grad_norm": 0.22265625, + "learning_rate": 0.0002889113923949985, + "loss": 0.7702, "step": 788 }, { - "epoch": 1.51434034416826, - "grad_norm": 0.255859375, - "learning_rate": 0.00026900300104368524, - "loss": 0.72, + "epoch": 0.9284876905041032, + "grad_norm": 0.2421875, + "learning_rate": 0.0002887907497309511, + "loss": 0.7545, "step": 792 }, { - "epoch": 1.5219885277246654, - "grad_norm": 0.259765625, - "learning_rate": 0.0002686856504088385, - "loss": 0.7112, + "epoch": 0.9331770222743259, + "grad_norm": 0.248046875, + "learning_rate": 0.0002886694797916314, + "loss": 0.7855, "step": 796 }, { - "epoch": 1.5296367112810707, - "grad_norm": 0.2490234375, - "learning_rate": 0.00026836687271762015, - "loss": 0.6912, + "epoch": 0.9378663540445487, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028854758312512826, + "loss": 0.7477, "step": 800 }, { - "epoch": 1.5372848948374762, - "grad_norm": 0.2421875, - "learning_rate": 0.0002680466718029596, - "loss": 0.6801, + "epoch": 0.9425556858147714, + "grad_norm": 0.240234375, + "learning_rate": 0.00028842506028236274, + "loss": 0.7647, "step": 804 }, { - "epoch": 1.5449330783938815, - "grad_norm": 0.265625, - "learning_rate": 0.00026772505151489897, - "loss": 0.7077, + "epoch": 0.9472450175849941, + "grad_norm": 0.232421875, + "learning_rate": 0.0002883019118170861, + "loss": 0.7492, "step": 808 }, { - "epoch": 1.5525812619502868, - "grad_norm": 0.283203125, - "learning_rate": 0.00026740201572054685, - "loss": 0.6926, + "epoch": 0.9519343493552169, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002881781382858772, + "loss": 0.729, "step": 812 }, { - "epoch": 1.560229445506692, - "grad_norm": 0.27734375, - "learning_rate": 0.00026707756830403144, - "loss": 0.702, + "epoch": 0.9566236811254396, + "grad_norm": 0.25, + "learning_rate": 0.0002880537402481397, + "loss": 0.7545, "step": 816 }, { - "epoch": 1.5678776290630974, - "grad_norm": 0.271484375, - "learning_rate": 0.00026675171316645403, - "loss": 0.7178, + "epoch": 0.9613130128956624, + "grad_norm": 0.2421875, + "learning_rate": 0.0002879287182661001, + "loss": 0.7451, "step": 820 }, { - "epoch": 1.5755258126195029, - "grad_norm": 0.26171875, - "learning_rate": 0.00026642445422584224, - "loss": 0.6843, + "epoch": 0.9660023446658851, + "grad_norm": 0.267578125, + "learning_rate": 0.00028780307290480453, + "loss": 0.7464, "step": 824 }, { - "epoch": 1.5831739961759084, - "grad_norm": 0.25, - "learning_rate": 0.0002660957954171028, - "loss": 0.6722, + "epoch": 0.9706916764361079, + "grad_norm": 0.2265625, + "learning_rate": 0.00028767680473211683, + "loss": 0.7747, "step": 828 }, { - "epoch": 1.5908221797323137, + "epoch": 0.9753810082063306, "grad_norm": 0.251953125, - "learning_rate": 0.00026576574069197406, - "loss": 0.6518, + "learning_rate": 0.0002875499143187154, + "loss": 0.7581, "step": 832 }, { - "epoch": 1.598470363288719, - "grad_norm": 0.263671875, - "learning_rate": 0.00026543429401897875, - "loss": 0.6998, + "epoch": 0.9800703399765534, + "grad_norm": 0.2421875, + "learning_rate": 0.00028742240223809116, + "loss": 0.7786, "step": 836 }, { - "epoch": 1.6061185468451242, - "grad_norm": 0.26171875, - "learning_rate": 0.0002651014593833762, - "loss": 0.6966, + "epoch": 0.984759671746776, + "grad_norm": 0.27734375, + "learning_rate": 0.0002872942690665445, + "loss": 0.8077, "step": 840 }, { - "epoch": 1.6137667304015295, - "grad_norm": 0.25390625, - "learning_rate": 0.00026476724078711416, - "loss": 0.7054, + "epoch": 0.9894490035169988, + "grad_norm": 0.25, + "learning_rate": 0.0002871655153831831, + "loss": 0.7733, "step": 844 }, { - "epoch": 1.621414913957935, - "grad_norm": 0.279296875, - "learning_rate": 0.00026443164224878115, - "loss": 0.6655, + "epoch": 0.9941383352872216, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002870361417699188, + "loss": 0.7462, "step": 848 }, { - "epoch": 1.6290630975143403, - "grad_norm": 0.275390625, - "learning_rate": 0.0002640946678035576, - "loss": 0.7098, + "epoch": 0.9988276670574443, + "grad_norm": 0.255859375, + "learning_rate": 0.0002869061488114654, + "loss": 0.695, "step": 852 }, { - "epoch": 1.6367112810707458, - "grad_norm": 0.2451171875, - "learning_rate": 0.0002637563215031679, - "loss": 0.7099, + "epoch": 1.003516998827667, + "grad_norm": 0.2431640625, + "learning_rate": 0.00028677553709533605, + "loss": 0.7101, "step": 856 }, { - "epoch": 1.644359464627151, - "grad_norm": 0.2470703125, - "learning_rate": 0.00026341660741583127, - "loss": 0.6704, + "epoch": 1.0082063305978899, + "grad_norm": 0.318359375, + "learning_rate": 0.00028664430721184013, + "loss": 0.6597, "step": 860 }, { - "epoch": 1.6520076481835564, - "grad_norm": 0.27734375, - "learning_rate": 0.00026307552962621293, - "loss": 0.686, + "epoch": 1.0128956623681125, + "grad_norm": 0.265625, + "learning_rate": 0.0002865124597540811, + "loss": 0.6458, "step": 864 }, { - "epoch": 1.6596558317399617, - "grad_norm": 0.2470703125, - "learning_rate": 0.00026273309223537507, - "loss": 0.69, + "epoch": 1.0175849941383353, + "grad_norm": 0.25, + "learning_rate": 0.0002863799953179534, + "loss": 0.706, "step": 868 }, { - "epoch": 1.667304015296367, - "grad_norm": 0.275390625, - "learning_rate": 0.0002623892993607275, - "loss": 0.6957, + "epoch": 1.022274325908558, + "grad_norm": 0.23828125, + "learning_rate": 0.00028624691450214007, + "loss": 0.7264, "step": 872 }, { - "epoch": 1.6749521988527725, - "grad_norm": 0.267578125, - "learning_rate": 0.00026204415513597813, - "loss": 0.7071, + "epoch": 1.0269636576787808, + "grad_norm": 0.25390625, + "learning_rate": 0.00028611321790810996, + "loss": 0.6963, "step": 876 }, { - "epoch": 1.682600382409178, - "grad_norm": 0.26953125, - "learning_rate": 0.0002616976637110832, - "loss": 0.6313, + "epoch": 1.0316529894490034, + "grad_norm": 0.244140625, + "learning_rate": 0.0002859789061401149, + "loss": 0.6838, "step": 880 }, { - "epoch": 1.6902485659655833, - "grad_norm": 0.267578125, - "learning_rate": 0.0002613498292521977, - "loss": 0.6809, + "epoch": 1.0363423212192262, + "grad_norm": 0.2392578125, + "learning_rate": 0.00028584397980518705, + "loss": 0.7163, "step": 884 }, { - "epoch": 1.6978967495219885, - "grad_norm": 0.275390625, - "learning_rate": 0.00026100065594162475, - "loss": 0.6867, + "epoch": 1.041031652989449, + "grad_norm": 0.265625, + "learning_rate": 0.00028570843951313625, + "loss": 0.6668, "step": 888 }, { - "epoch": 1.7055449330783938, - "grad_norm": 0.26171875, - "learning_rate": 0.00026065014797776575, - "loss": 0.7065, + "epoch": 1.0457209847596718, + "grad_norm": 0.2333984375, + "learning_rate": 0.00028557228587654693, + "loss": 0.7022, "step": 892 }, { - "epoch": 1.7131931166347991, - "grad_norm": 0.296875, - "learning_rate": 0.0002602983095750698, - "loss": 0.6938, + "epoch": 1.0504103165298946, + "grad_norm": 0.25390625, + "learning_rate": 0.0002854355195107758, + "loss": 0.6669, "step": 896 }, { - "epoch": 1.7208413001912046, - "grad_norm": 0.25, - "learning_rate": 0.0002599451449639828, - "loss": 0.7138, + "epoch": 1.0550996483001172, + "grad_norm": 0.2470703125, + "learning_rate": 0.00028529814103394886, + "loss": 0.7281, "step": 900 }, { - "epoch": 1.72848948374761, - "grad_norm": 0.2421875, - "learning_rate": 0.00025959065839089684, - "loss": 0.6976, + "epoch": 1.05978898007034, + "grad_norm": 0.2314453125, + "learning_rate": 0.00028516015106695833, + "loss": 0.714, "step": 904 }, { - "epoch": 1.7361376673040154, - "grad_norm": 0.2431640625, - "learning_rate": 0.00025923485411809917, - "loss": 0.6792, + "epoch": 1.0644783118405627, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002850215502334605, + "loss": 0.6515, "step": 908 }, { - "epoch": 1.7437858508604207, - "grad_norm": 0.25390625, - "learning_rate": 0.00025887773642372064, - "loss": 0.7016, + "epoch": 1.0691676436107855, + "grad_norm": 0.25, + "learning_rate": 0.0002848823391598721, + "loss": 0.7476, "step": 912 }, { - "epoch": 1.751434034416826, - "grad_norm": 0.2578125, - "learning_rate": 0.00025851930960168464, - "loss": 0.6845, + "epoch": 1.073856975381008, + "grad_norm": 0.244140625, + "learning_rate": 0.00028474251847536826, + "loss": 0.6498, "step": 916 }, { - "epoch": 1.7590822179732313, - "grad_norm": 0.267578125, - "learning_rate": 0.0002581595779616552, - "loss": 0.6932, + "epoch": 1.078546307151231, + "grad_norm": 0.25390625, + "learning_rate": 0.000284602088811879, + "loss": 0.7022, "step": 920 }, { - "epoch": 1.7667304015296366, - "grad_norm": 0.263671875, - "learning_rate": 0.0002577985458289852, - "loss": 0.6911, + "epoch": 1.0832356389214537, + "grad_norm": 0.2578125, + "learning_rate": 0.0002844610508040868, + "loss": 0.6899, "step": 924 }, { - "epoch": 1.774378585086042, - "grad_norm": 0.259765625, - "learning_rate": 0.00025743621754466457, - "loss": 0.6611, + "epoch": 1.0879249706916765, + "grad_norm": 0.2490234375, + "learning_rate": 0.00028431940508942365, + "loss": 0.6563, "step": 928 }, { - "epoch": 1.7820267686424476, - "grad_norm": 0.251953125, - "learning_rate": 0.0002570725974652679, - "loss": 0.7158, + "epoch": 1.0926143024618993, + "grad_norm": 0.240234375, + "learning_rate": 0.000284177152308068, + "loss": 0.6752, "step": 932 }, { - "epoch": 1.7896749521988529, - "grad_norm": 0.2490234375, - "learning_rate": 0.0002567076899629021, - "loss": 0.7176, + "epoch": 1.0973036342321218, + "grad_norm": 0.296875, + "learning_rate": 0.000284034293102942, + "loss": 0.7024, "step": 936 }, { - "epoch": 1.7973231357552581, - "grad_norm": 0.251953125, - "learning_rate": 0.0002563414994251538, - "loss": 0.7001, + "epoch": 1.1019929660023446, + "grad_norm": 0.2470703125, + "learning_rate": 0.00028389082811970873, + "loss": 0.6902, "step": 940 }, { - "epoch": 1.8049713193116634, - "grad_norm": 0.267578125, - "learning_rate": 0.0002559740302550366, - "loss": 0.7583, + "epoch": 1.1066822977725674, + "grad_norm": 0.259765625, + "learning_rate": 0.00028374675800676893, + "loss": 0.6658, "step": 944 }, { - "epoch": 1.8126195028680687, - "grad_norm": 0.28125, - "learning_rate": 0.0002556052868709383, - "loss": 0.6374, + "epoch": 1.1113716295427902, + "grad_norm": 0.26953125, + "learning_rate": 0.00028360208341525836, + "loss": 0.7058, "step": 948 }, { - "epoch": 1.8202676864244742, + "epoch": 1.1160609613130128, "grad_norm": 0.279296875, - "learning_rate": 0.00025523527370656753, - "loss": 0.6771, + "learning_rate": 0.0002834568049990447, + "loss": 0.7162, "step": 952 }, { - "epoch": 1.8279158699808795, - "grad_norm": 0.263671875, - "learning_rate": 0.0002548639952109006, - "loss": 0.6547, + "epoch": 1.1207502930832356, + "grad_norm": 0.279296875, + "learning_rate": 0.0002833109234147249, + "loss": 0.6888, "step": 956 }, { - "epoch": 1.835564053537285, - "grad_norm": 0.255859375, - "learning_rate": 0.0002544914558481279, - "loss": 0.7078, + "epoch": 1.1254396248534584, + "grad_norm": 0.283203125, + "learning_rate": 0.0002831644393216216, + "loss": 0.6802, "step": 960 }, { - "epoch": 1.8432122370936903, - "grad_norm": 0.23828125, - "learning_rate": 0.00025411766009760027, - "loss": 0.6913, + "epoch": 1.1301289566236812, + "grad_norm": 0.259765625, + "learning_rate": 0.00028301735338178086, + "loss": 0.6579, "step": 964 }, { - "epoch": 1.8508604206500956, - "grad_norm": 0.26171875, - "learning_rate": 0.00025374261245377525, - "loss": 0.725, + "epoch": 1.134818288393904, + "grad_norm": 0.271484375, + "learning_rate": 0.0002828696662599686, + "loss": 0.6941, "step": 968 }, { - "epoch": 1.8585086042065009, - "grad_norm": 0.251953125, - "learning_rate": 0.0002533663174261628, - "loss": 0.6871, + "epoch": 1.1395076201641265, + "grad_norm": 0.259765625, + "learning_rate": 0.00028272137862366795, + "loss": 0.6928, "step": 972 }, { - "epoch": 1.8661567877629062, - "grad_norm": 0.283203125, - "learning_rate": 0.0002529887795392713, - "loss": 0.6748, + "epoch": 1.1441969519343493, + "grad_norm": 0.240234375, + "learning_rate": 0.00028257249114307606, + "loss": 0.6596, "step": 976 }, { - "epoch": 1.8738049713193117, - "grad_norm": 0.28515625, - "learning_rate": 0.00025261000333255305, - "loss": 0.6224, + "epoch": 1.1488862837045721, + "grad_norm": 0.25, + "learning_rate": 0.00028242300449110114, + "loss": 0.6836, "step": 980 }, { - "epoch": 1.8814531548757172, - "grad_norm": 0.255859375, - "learning_rate": 0.0002522299933603497, - "loss": 0.6914, + "epoch": 1.153575615474795, + "grad_norm": 0.23828125, + "learning_rate": 0.00028227291934335944, + "loss": 0.6556, "step": 984 }, { - "epoch": 1.8891013384321225, - "grad_norm": 0.26953125, - "learning_rate": 0.0002518487541918374, - "loss": 0.684, + "epoch": 1.1582649472450175, + "grad_norm": 0.24609375, + "learning_rate": 0.00028212223637817213, + "loss": 0.6663, "step": 988 }, { - "epoch": 1.8967495219885278, - "grad_norm": 0.259765625, - "learning_rate": 0.000251466290410972, - "loss": 0.7131, + "epoch": 1.1629542790152403, + "grad_norm": 0.2470703125, + "learning_rate": 0.00028197095627656215, + "loss": 0.6999, "step": 992 }, { - "epoch": 1.904397705544933, - "grad_norm": 0.267578125, - "learning_rate": 0.0002510826066164341, - "loss": 0.6956, + "epoch": 1.167643610785463, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002818190797222514, + "loss": 0.6913, "step": 996 }, { - "epoch": 1.9120458891013383, - "grad_norm": 0.2470703125, - "learning_rate": 0.00025069770742157317, - "loss": 0.6683, + "epoch": 1.1723329425556859, + "grad_norm": 0.251953125, + "learning_rate": 0.0002816666074016575, + "loss": 0.6688, "step": 1000 }, { - "epoch": 1.9196940726577438, - "grad_norm": 0.251953125, - "learning_rate": 0.00025031159745435267, - "loss": 0.6941, + "epoch": 1.1770222743259087, + "grad_norm": 0.263671875, + "learning_rate": 0.0002815135400038905, + "loss": 0.6966, "step": 1004 }, { - "epoch": 1.9273422562141491, - "grad_norm": 0.248046875, - "learning_rate": 0.0002499242813572942, - "loss": 0.7458, + "epoch": 1.1817116060961312, + "grad_norm": 0.259765625, + "learning_rate": 0.0002813598782207501, + "loss": 0.6789, "step": 1008 }, { - "epoch": 1.9349904397705546, - "grad_norm": 0.2734375, - "learning_rate": 0.0002495357637874215, - "loss": 0.6724, + "epoch": 1.186400937866354, + "grad_norm": 0.25, + "learning_rate": 0.00028120562274672233, + "loss": 0.7121, "step": 1012 }, { - "epoch": 1.94263862332696, - "grad_norm": 0.240234375, - "learning_rate": 0.0002491460494162048, - "loss": 0.6662, + "epoch": 1.1910902696365768, + "grad_norm": 0.267578125, + "learning_rate": 0.0002810507742789765, + "loss": 0.6333, "step": 1016 }, { - "epoch": 1.9502868068833652, - "grad_norm": 0.279296875, - "learning_rate": 0.00024875514292950447, - "loss": 0.652, + "epoch": 1.1957796014067996, + "grad_norm": 0.26953125, + "learning_rate": 0.0002808953335173619, + "loss": 0.683, "step": 1020 }, { - "epoch": 1.9579349904397705, - "grad_norm": 0.263671875, - "learning_rate": 0.00024836304902751445, - "loss": 0.7223, + "epoch": 1.2004689331770222, + "grad_norm": 0.2412109375, + "learning_rate": 0.00028073930116440484, + "loss": 0.6771, "step": 1024 }, { - "epoch": 1.9655831739961758, - "grad_norm": 0.255859375, - "learning_rate": 0.0002479697724247062, - "loss": 0.7065, + "epoch": 1.205158264947245, + "grad_norm": 0.25390625, + "learning_rate": 0.0002805826779253052, + "loss": 0.6461, "step": 1028 }, { - "epoch": 1.9732313575525813, - "grad_norm": 0.263671875, - "learning_rate": 0.0002475753178497716, - "loss": 0.7307, + "epoch": 1.2098475967174678, + "grad_norm": 0.26953125, + "learning_rate": 0.0002804254645079337, + "loss": 0.665, "step": 1032 }, { - "epoch": 1.9808795411089866, - "grad_norm": 0.279296875, - "learning_rate": 0.00024717969004556646, - "loss": 0.7086, + "epoch": 1.2145369284876906, + "grad_norm": 0.25390625, + "learning_rate": 0.0002802676616228281, + "loss": 0.7006, "step": 1036 }, { - "epoch": 1.988527724665392, - "grad_norm": 0.259765625, - "learning_rate": 0.0002467828937690532, - "loss": 0.7051, + "epoch": 1.2192262602579134, + "grad_norm": 0.2734375, + "learning_rate": 0.00028010926998319055, + "loss": 0.6557, "step": 1040 }, { - "epoch": 1.9961759082217974, - "grad_norm": 0.2734375, - "learning_rate": 0.0002463849337912437, - "loss": 0.7084, + "epoch": 1.223915592028136, + "grad_norm": 0.255859375, + "learning_rate": 0.000279950290304884, + "loss": 0.6676, "step": 1044 }, { - "epoch": 2.0038240917782026, - "grad_norm": 0.236328125, - "learning_rate": 0.00024598581489714206, - "loss": 0.5785, + "epoch": 1.2286049237983587, + "grad_norm": 0.28125, + "learning_rate": 0.000279790723306429, + "loss": 0.6762, "step": 1048 }, { - "epoch": 2.011472275334608, - "grad_norm": 0.28125, - "learning_rate": 0.0002455855418856869, - "loss": 0.5793, + "epoch": 1.2332942555685815, + "grad_norm": 0.2490234375, + "learning_rate": 0.00027963056970900085, + "loss": 0.6742, "step": 1052 }, { - "epoch": 2.019120458891013, - "grad_norm": 0.259765625, - "learning_rate": 0.000245184119569694, - "loss": 0.6212, + "epoch": 1.2379835873388043, + "grad_norm": 0.26953125, + "learning_rate": 0.0002794698302364257, + "loss": 0.6682, "step": 1056 }, { - "epoch": 2.026768642447419, - "grad_norm": 0.26953125, - "learning_rate": 0.0002447815527757979, - "loss": 0.569, + "epoch": 1.2426729191090269, + "grad_norm": 0.271484375, + "learning_rate": 0.0002793085056151778, + "loss": 0.6691, "step": 1060 }, { - "epoch": 2.0344168260038242, - "grad_norm": 0.26953125, - "learning_rate": 0.0002443778463443944, - "loss": 0.5868, + "epoch": 1.2473622508792497, + "grad_norm": 0.25, + "learning_rate": 0.00027914659657437586, + "loss": 0.7082, "step": 1064 }, { - "epoch": 2.0420650095602295, - "grad_norm": 0.236328125, - "learning_rate": 0.0002439730051295818, - "loss": 0.6195, + "epoch": 1.2520515826494725, + "grad_norm": 0.259765625, + "learning_rate": 0.00027898410384578004, + "loss": 0.6849, "step": 1068 }, { - "epoch": 2.049713193116635, - "grad_norm": 0.279296875, - "learning_rate": 0.0002435670339991031, - "loss": 0.5886, + "epoch": 1.2567409144196953, + "grad_norm": 0.27734375, + "learning_rate": 0.00027882102816378853, + "loss": 0.6864, "step": 1072 }, { - "epoch": 2.05736137667304, - "grad_norm": 0.28125, - "learning_rate": 0.00024315993783428718, - "loss": 0.6097, + "epoch": 1.261430246189918, + "grad_norm": 0.267578125, + "learning_rate": 0.0002786573702654342, + "loss": 0.6555, "step": 1076 }, { - "epoch": 2.0650095602294454, - "grad_norm": 0.3203125, - "learning_rate": 0.00024275172152999006, - "loss": 0.5727, + "epoch": 1.2661195779601406, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002784931308903813, + "loss": 0.7219, "step": 1080 }, { - "epoch": 2.0726577437858507, - "grad_norm": 0.275390625, - "learning_rate": 0.00024234238999453614, - "loss": 0.6085, + "epoch": 1.2708089097303634, + "grad_norm": 0.267578125, + "learning_rate": 0.000278328310780922, + "loss": 0.7314, "step": 1084 }, { - "epoch": 2.0803059273422564, - "grad_norm": 0.29296875, - "learning_rate": 0.00024193194814965934, - "loss": 0.6144, + "epoch": 1.2754982415005862, + "grad_norm": 0.271484375, + "learning_rate": 0.0002781629106819733, + "loss": 0.6988, "step": 1088 }, { - "epoch": 2.0879541108986617, - "grad_norm": 0.279296875, - "learning_rate": 0.00024152040093044353, - "loss": 0.5868, + "epoch": 1.2801875732708088, + "grad_norm": 0.255859375, + "learning_rate": 0.0002779969313410733, + "loss": 0.6559, "step": 1092 }, { - "epoch": 2.095602294455067, - "grad_norm": 0.2734375, - "learning_rate": 0.00024110775328526352, - "loss": 0.6278, + "epoch": 1.2848769050410316, + "grad_norm": 0.279296875, + "learning_rate": 0.0002778303735083784, + "loss": 0.6329, "step": 1096 }, { - "epoch": 2.1032504780114722, - "grad_norm": 0.275390625, - "learning_rate": 0.00024069401017572543, - "loss": 0.5923, + "epoch": 1.2895662368112544, + "grad_norm": 0.255859375, + "learning_rate": 0.0002776632379366591, + "loss": 0.6737, "step": 1100 }, { - "epoch": 2.1108986615678775, - "grad_norm": 0.265625, - "learning_rate": 0.00024027917657660713, - "loss": 0.5759, + "epoch": 1.2942555685814772, + "grad_norm": 0.2734375, + "learning_rate": 0.0002774955253812973, + "loss": 0.6834, "step": 1104 }, { - "epoch": 2.118546845124283, - "grad_norm": 0.2734375, - "learning_rate": 0.00023986325747579824, - "loss": 0.6138, + "epoch": 1.2989449003517, + "grad_norm": 0.271484375, + "learning_rate": 0.00027732723660028256, + "loss": 0.6357, "step": 1108 }, { - "epoch": 2.126195028680688, - "grad_norm": 0.28515625, - "learning_rate": 0.0002394462578742403, - "loss": 0.5786, + "epoch": 1.3036342321219228, + "grad_norm": 0.2578125, + "learning_rate": 0.0002771583723542087, + "loss": 0.6635, "step": 1112 }, { - "epoch": 2.133843212237094, - "grad_norm": 0.267578125, - "learning_rate": 0.0002390281827858668, - "loss": 0.64, + "epoch": 1.3083235638921453, + "grad_norm": 0.259765625, + "learning_rate": 0.0002769889334062705, + "loss": 0.6871, "step": 1116 }, { - "epoch": 2.141491395793499, - "grad_norm": 0.279296875, - "learning_rate": 0.0002386090372375424, - "loss": 0.6307, + "epoch": 1.3130128956623681, + "grad_norm": 0.26953125, + "learning_rate": 0.00027681892052226005, + "loss": 0.7013, "step": 1120 }, { - "epoch": 2.1491395793499044, - "grad_norm": 0.2578125, - "learning_rate": 0.00023818882626900294, - "loss": 0.5641, + "epoch": 1.317702227432591, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002766483344705634, + "loss": 0.6578, "step": 1124 }, { - "epoch": 2.1567877629063097, - "grad_norm": 0.28125, - "learning_rate": 0.00023776755493279473, - "loss": 0.623, + "epoch": 1.3223915592028135, + "grad_norm": 0.267578125, + "learning_rate": 0.00027647717602215704, + "loss": 0.66, "step": 1128 }, { - "epoch": 2.164435946462715, - "grad_norm": 0.275390625, - "learning_rate": 0.00023734522829421372, - "loss": 0.6022, + "epoch": 1.3270808909730363, + "grad_norm": 0.271484375, + "learning_rate": 0.00027630544595060464, + "loss": 0.6884, "step": 1132 }, { - "epoch": 2.1720841300191203, - "grad_norm": 0.2734375, - "learning_rate": 0.00023692185143124464, - "loss": 0.6121, + "epoch": 1.331770222743259, + "grad_norm": 0.26171875, + "learning_rate": 0.0002761331450320531, + "loss": 0.6845, "step": 1136 }, { - "epoch": 2.179732313575526, - "grad_norm": 0.267578125, - "learning_rate": 0.00023649742943449996, - "loss": 0.5878, + "epoch": 1.3364595545134819, + "grad_norm": 0.2578125, + "learning_rate": 0.00027596027404522944, + "loss": 0.6924, "step": 1140 }, { - "epoch": 2.1873804971319313, + "epoch": 1.3411488862837047, "grad_norm": 0.2578125, - "learning_rate": 0.00023607196740715858, - "loss": 0.6143, + "learning_rate": 0.0002757868337714372, + "loss": 0.6962, "step": 1144 }, { - "epoch": 2.1950286806883366, - "grad_norm": 0.28515625, - "learning_rate": 0.00023564547046490468, - "loss": 0.5655, + "epoch": 1.3458382180539274, + "grad_norm": 0.267578125, + "learning_rate": 0.00027561282499455276, + "loss": 0.6869, "step": 1148 }, { - "epoch": 2.202676864244742, - "grad_norm": 0.26953125, - "learning_rate": 0.00023521794373586603, - "loss": 0.5685, + "epoch": 1.35052754982415, + "grad_norm": 0.25390625, + "learning_rate": 0.00027543824850102187, + "loss": 0.7182, "step": 1152 }, { - "epoch": 2.210325047801147, - "grad_norm": 0.283203125, - "learning_rate": 0.00023478939236055228, - "loss": 0.5845, + "epoch": 1.3552168815943728, + "grad_norm": 0.259765625, + "learning_rate": 0.00027526310507985626, + "loss": 0.6907, "step": 1156 }, { - "epoch": 2.2179732313575524, - "grad_norm": 0.2890625, - "learning_rate": 0.00023435982149179346, - "loss": 0.6108, + "epoch": 1.3599062133645956, + "grad_norm": 0.26953125, + "learning_rate": 0.0002750873955226298, + "loss": 0.7224, "step": 1160 }, { - "epoch": 2.2256214149139577, - "grad_norm": 0.28125, - "learning_rate": 0.0002339292362946777, - "loss": 0.6221, + "epoch": 1.3645955451348182, + "grad_norm": 0.251953125, + "learning_rate": 0.00027491112062347515, + "loss": 0.7173, "step": 1164 }, { - "epoch": 2.2332695984703634, - "grad_norm": 0.28515625, - "learning_rate": 0.0002334976419464892, - "loss": 0.5739, + "epoch": 1.369284876905041, + "grad_norm": 0.298828125, + "learning_rate": 0.0002747342811790799, + "loss": 0.6719, "step": 1168 }, { - "epoch": 2.2409177820267687, - "grad_norm": 0.28515625, - "learning_rate": 0.00023306504363664613, - "loss": 0.5928, + "epoch": 1.3739742086752638, + "grad_norm": 0.265625, + "learning_rate": 0.00027455687798868346, + "loss": 0.7309, "step": 1172 }, { - "epoch": 2.248565965583174, - "grad_norm": 0.259765625, - "learning_rate": 0.00023263144656663801, - "loss": 0.5422, + "epoch": 1.3786635404454866, + "grad_norm": 0.267578125, + "learning_rate": 0.0002743789118540728, + "loss": 0.7328, "step": 1176 }, { - "epoch": 2.2562141491395793, - "grad_norm": 0.271484375, - "learning_rate": 0.00023219685594996347, - "loss": 0.5815, + "epoch": 1.3833528722157094, + "grad_norm": 0.26171875, + "learning_rate": 0.00027420038357957934, + "loss": 0.6742, "step": 1180 }, { - "epoch": 2.2638623326959846, - "grad_norm": 0.267578125, - "learning_rate": 0.00023176127701206713, - "loss": 0.5786, + "epoch": 1.388042203985932, + "grad_norm": 0.259765625, + "learning_rate": 0.0002740212939720751, + "loss": 0.7193, "step": 1184 }, { - "epoch": 2.27151051625239, - "grad_norm": 0.263671875, - "learning_rate": 0.00023132471499027717, - "loss": 0.5634, + "epoch": 1.3927315357561547, + "grad_norm": 0.267578125, + "learning_rate": 0.0002738416438409691, + "loss": 0.7321, "step": 1188 }, { - "epoch": 2.2791586998087956, - "grad_norm": 0.291015625, - "learning_rate": 0.0002308871751337422, - "loss": 0.5969, + "epoch": 1.3974208675263775, + "grad_norm": 0.271484375, + "learning_rate": 0.0002736614339982036, + "loss": 0.7035, "step": 1192 }, { - "epoch": 2.286806883365201, - "grad_norm": 0.279296875, - "learning_rate": 0.00023044866270336822, - "loss": 0.587, + "epoch": 1.4021101992966003, + "grad_norm": 0.2734375, + "learning_rate": 0.00027348066525825066, + "loss": 0.6952, "step": 1196 }, { - "epoch": 2.294455066921606, - "grad_norm": 0.310546875, - "learning_rate": 0.00023000918297175506, - "loss": 0.6312, + "epoch": 1.4067995310668229, + "grad_norm": 0.2578125, + "learning_rate": 0.0002732993384381082, + "loss": 0.776, "step": 1200 }, { - "epoch": 2.3021032504780115, - "grad_norm": 0.28515625, - "learning_rate": 0.00022956874122313347, - "loss": 0.6268, + "epoch": 1.4114888628370457, + "grad_norm": 0.25, + "learning_rate": 0.00027311745435729655, + "loss": 0.7111, "step": 1204 }, { - "epoch": 2.3097514340344167, - "grad_norm": 0.263671875, - "learning_rate": 0.00022912734275330117, - "loss": 0.586, + "epoch": 1.4161781946072685, + "grad_norm": 0.248046875, + "learning_rate": 0.00027293501383785445, + "loss": 0.6733, "step": 1208 }, { - "epoch": 2.317399617590822, - "grad_norm": 0.28125, - "learning_rate": 0.00022868499286955943, - "loss": 0.571, + "epoch": 1.4208675263774913, + "grad_norm": 0.26171875, + "learning_rate": 0.00027275201770433574, + "loss": 0.7015, "step": 1212 }, { - "epoch": 2.3250478011472273, - "grad_norm": 0.296875, - "learning_rate": 0.00022824169689064915, - "loss": 0.6062, + "epoch": 1.425556858147714, + "grad_norm": 0.26171875, + "learning_rate": 0.0002725684667838051, + "loss": 0.6968, "step": 1216 }, { - "epoch": 2.332695984703633, - "grad_norm": 0.263671875, - "learning_rate": 0.00022779746014668683, - "loss": 0.5991, + "epoch": 1.4302461899179366, + "grad_norm": 0.2412109375, + "learning_rate": 0.00027238436190583486, + "loss": 0.7071, "step": 1220 }, { - "epoch": 2.3403441682600383, - "grad_norm": 0.27734375, - "learning_rate": 0.00022735228797910066, - "loss": 0.6193, + "epoch": 1.4349355216881594, + "grad_norm": 0.298828125, + "learning_rate": 0.00027219970390250094, + "loss": 0.6828, "step": 1224 }, { - "epoch": 2.3479923518164436, - "grad_norm": 0.2890625, - "learning_rate": 0.0002269061857405662, - "loss": 0.5719, + "epoch": 1.4396248534583822, + "grad_norm": 0.25390625, + "learning_rate": 0.000272014493608379, + "loss": 0.648, "step": 1228 }, { - "epoch": 2.355640535372849, - "grad_norm": 0.27734375, - "learning_rate": 0.00022645915879494202, - "loss": 0.6171, + "epoch": 1.444314185228605, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002718287318605411, + "loss": 0.6828, "step": 1232 }, { - "epoch": 2.363288718929254, - "grad_norm": 0.267578125, - "learning_rate": 0.00022601121251720514, - "loss": 0.6213, + "epoch": 1.4490035169988276, + "grad_norm": 0.27734375, + "learning_rate": 0.0002716424194985514, + "loss": 0.6893, "step": 1236 }, { - "epoch": 2.3709369024856595, - "grad_norm": 0.271484375, - "learning_rate": 0.0002255623522933866, - "loss": 0.5933, + "epoch": 1.4536928487690504, + "grad_norm": 0.248046875, + "learning_rate": 0.0002714555573644627, + "loss": 0.7048, "step": 1240 }, { - "epoch": 2.378585086042065, - "grad_norm": 0.2890625, - "learning_rate": 0.00022511258352050649, - "loss": 0.5598, + "epoch": 1.4583821805392732, + "grad_norm": 0.251953125, + "learning_rate": 0.0002712681463028126, + "loss": 0.6076, "step": 1244 }, { - "epoch": 2.3862332695984705, - "grad_norm": 0.2890625, - "learning_rate": 0.00022466191160650916, - "loss": 0.6435, + "epoch": 1.463071512309496, + "grad_norm": 0.2451171875, + "learning_rate": 0.00027108018716061945, + "loss": 0.7175, "step": 1248 }, { - "epoch": 2.3938814531548758, - "grad_norm": 0.28515625, - "learning_rate": 0.00022421034197019822, - "loss": 0.5553, + "epoch": 1.4677608440797187, + "grad_norm": 0.263671875, + "learning_rate": 0.0002708916807873787, + "loss": 0.6885, "step": 1252 }, { - "epoch": 2.401529636711281, - "grad_norm": 0.25390625, - "learning_rate": 0.00022375788004117128, - "loss": 0.6193, + "epoch": 1.4724501758499413, + "grad_norm": 0.2578125, + "learning_rate": 0.0002707026280350594, + "loss": 0.6368, "step": 1256 }, { - "epoch": 2.4091778202676863, - "grad_norm": 0.29296875, - "learning_rate": 0.00022330453125975474, - "loss": 0.6117, + "epoch": 1.477139507620164, + "grad_norm": 0.26171875, + "learning_rate": 0.00027051302975809947, + "loss": 0.6759, "step": 1260 }, { - "epoch": 2.4168260038240916, - "grad_norm": 0.318359375, - "learning_rate": 0.0002228503010769384, - "loss": 0.5761, + "epoch": 1.481828839390387, + "grad_norm": 0.26171875, + "learning_rate": 0.00027032288681340285, + "loss": 0.6619, "step": 1264 }, { - "epoch": 2.424474187380497, - "grad_norm": 0.2890625, - "learning_rate": 0.0002223951949543098, - "loss": 0.6343, + "epoch": 1.4865181711606097, + "grad_norm": 0.26171875, + "learning_rate": 0.0002701322000603347, + "loss": 0.7396, "step": 1268 }, { - "epoch": 2.4321223709369026, - "grad_norm": 0.28515625, - "learning_rate": 0.00022193921836398875, - "loss": 0.5763, + "epoch": 1.4912075029308323, + "grad_norm": 0.26953125, + "learning_rate": 0.00026994097036071846, + "loss": 0.6688, "step": 1272 }, { - "epoch": 2.439770554493308, - "grad_norm": 0.2890625, - "learning_rate": 0.00022148237678856138, - "loss": 0.5807, + "epoch": 1.495896834701055, + "grad_norm": 0.25390625, + "learning_rate": 0.000269749198578831, + "loss": 0.6439, "step": 1276 }, { - "epoch": 2.447418738049713, + "epoch": 1.5005861664712778, "grad_norm": 0.279296875, - "learning_rate": 0.0002210246757210142, - "loss": 0.62, + "learning_rate": 0.00026955688558139945, + "loss": 0.7026, "step": 1280 }, { - "epoch": 2.4550669216061185, - "grad_norm": 0.28515625, - "learning_rate": 0.00022056612066466817, - "loss": 0.6255, + "epoch": 1.5052754982415006, + "grad_norm": 0.271484375, + "learning_rate": 0.0002693640322375969, + "loss": 0.6609, "step": 1284 }, { - "epoch": 2.462715105162524, - "grad_norm": 0.3203125, - "learning_rate": 0.00022010671713311238, - "loss": 0.6786, + "epoch": 1.5099648300117234, + "grad_norm": 0.26953125, + "learning_rate": 0.0002691706394190386, + "loss": 0.6639, "step": 1288 }, { - "epoch": 2.470363288718929, - "grad_norm": 0.2890625, - "learning_rate": 0.000219646470650138, - "loss": 0.6173, + "epoch": 1.5146541617819462, + "grad_norm": 0.271484375, + "learning_rate": 0.00026897670799977803, + "loss": 0.6592, "step": 1292 }, { - "epoch": 2.478011472275335, - "grad_norm": 0.2734375, - "learning_rate": 0.00021918538674967156, - "loss": 0.6113, + "epoch": 1.5193434935521688, + "grad_norm": 0.29296875, + "learning_rate": 0.0002687822388563028, + "loss": 0.631, "step": 1296 }, { - "epoch": 2.48565965583174, - "grad_norm": 0.265625, - "learning_rate": 0.0002187234709757087, - "loss": 0.5949, + "epoch": 1.5240328253223916, + "grad_norm": 0.248046875, + "learning_rate": 0.000268587232867531, + "loss": 0.671, "step": 1300 }, { - "epoch": 2.4933078393881454, - "grad_norm": 0.296875, - "learning_rate": 0.00021826072888224716, - "loss": 0.6248, + "epoch": 1.5287221570926142, + "grad_norm": 0.279296875, + "learning_rate": 0.00026839169091480685, + "loss": 0.7033, "step": 1304 }, { - "epoch": 2.5009560229445507, - "grad_norm": 0.283203125, - "learning_rate": 0.00021779716603322034, - "loss": 0.5849, + "epoch": 1.533411488862837, + "grad_norm": 0.259765625, + "learning_rate": 0.00026819561388189697, + "loss": 0.6508, "step": 1308 }, { - "epoch": 2.508604206500956, - "grad_norm": 0.26953125, - "learning_rate": 0.0002173327880024303, - "loss": 0.5947, + "epoch": 1.5381008206330598, + "grad_norm": 0.259765625, + "learning_rate": 0.00026799900265498625, + "loss": 0.6661, "step": 1312 }, { - "epoch": 2.5162523900573612, - "grad_norm": 0.294921875, - "learning_rate": 0.00021686760037348065, - "loss": 0.5689, + "epoch": 1.5427901524032825, + "grad_norm": 0.263671875, + "learning_rate": 0.0002678018581226741, + "loss": 0.6565, "step": 1316 }, { - "epoch": 2.5239005736137665, - "grad_norm": 0.2890625, - "learning_rate": 0.00021640160873970954, - "loss": 0.6302, + "epoch": 1.5474794841735053, + "grad_norm": 0.2578125, + "learning_rate": 0.00026760418117597007, + "loss": 0.7513, "step": 1320 }, { - "epoch": 2.5315487571701722, - "grad_norm": 0.275390625, - "learning_rate": 0.00021593481870412217, - "loss": 0.6117, + "epoch": 1.5521688159437281, + "grad_norm": 0.267578125, + "learning_rate": 0.00026740597270829, + "loss": 0.6873, "step": 1324 }, { - "epoch": 2.5391969407265775, - "grad_norm": 0.298828125, - "learning_rate": 0.0002154672358793238, - "loss": 0.6024, + "epoch": 1.556858147713951, + "grad_norm": 0.259765625, + "learning_rate": 0.00026720723361545206, + "loss": 0.6724, "step": 1328 }, { - "epoch": 2.546845124282983, - "grad_norm": 0.298828125, - "learning_rate": 0.00021499886588745195, - "loss": 0.5417, + "epoch": 1.5615474794841735, + "grad_norm": 0.2578125, + "learning_rate": 0.0002670079647956726, + "loss": 0.7035, "step": 1332 }, { - "epoch": 2.554493307839388, - "grad_norm": 0.29296875, - "learning_rate": 0.00021452971436010886, - "loss": 0.5975, + "epoch": 1.5662368112543963, + "grad_norm": 0.294921875, + "learning_rate": 0.00026680816714956215, + "loss": 0.6473, "step": 1336 }, { - "epoch": 2.5621414913957934, - "grad_norm": 0.287109375, - "learning_rate": 0.00021405978693829397, - "loss": 0.5997, + "epoch": 1.5709261430246189, + "grad_norm": 0.25390625, + "learning_rate": 0.0002666078415801211, + "loss": 0.7342, "step": 1340 }, { - "epoch": 2.569789674952199, - "grad_norm": 0.296875, - "learning_rate": 0.00021358908927233576, - "loss": 0.6047, + "epoch": 1.5756154747948417, + "grad_norm": 0.267578125, + "learning_rate": 0.0002664069889927361, + "loss": 0.7155, "step": 1344 }, { - "epoch": 2.5774378585086044, - "grad_norm": 0.283203125, - "learning_rate": 0.00021311762702182414, - "loss": 0.6135, + "epoch": 1.5803048065650644, + "grad_norm": 0.27734375, + "learning_rate": 0.00026620561029517555, + "loss": 0.7006, "step": 1348 }, { - "epoch": 2.5850860420650097, - "grad_norm": 0.2890625, - "learning_rate": 0.00021264540585554215, - "loss": 0.6251, + "epoch": 1.5849941383352872, + "grad_norm": 0.26171875, + "learning_rate": 0.0002660037063975857, + "loss": 0.6876, "step": 1352 }, { - "epoch": 2.592734225621415, - "grad_norm": 0.275390625, - "learning_rate": 0.00021217243145139802, - "loss": 0.6308, + "epoch": 1.58968347010551, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002658012782124865, + "loss": 0.6475, "step": 1356 }, { - "epoch": 2.6003824091778203, - "grad_norm": 0.2890625, - "learning_rate": 0.0002116987094963567, - "loss": 0.5828, + "epoch": 1.5943728018757328, + "grad_norm": 0.244140625, + "learning_rate": 0.0002655983266547673, + "loss": 0.6373, "step": 1360 }, { - "epoch": 2.6080305927342256, - "grad_norm": 0.30078125, - "learning_rate": 0.00021122424568637157, - "loss": 0.6057, + "epoch": 1.5990621336459554, + "grad_norm": 0.27734375, + "learning_rate": 0.000265394852641683, + "loss": 0.6848, "step": 1364 }, { - "epoch": 2.615678776290631, - "grad_norm": 0.291015625, - "learning_rate": 0.00021074904572631606, - "loss": 0.6435, + "epoch": 1.6037514654161782, + "grad_norm": 0.265625, + "learning_rate": 0.0002651908570928498, + "loss": 0.7114, "step": 1368 }, { - "epoch": 2.623326959847036, - "grad_norm": 0.294921875, - "learning_rate": 0.00021027311532991475, - "loss": 0.6201, + "epoch": 1.608440797186401, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002649863409302411, + "loss": 0.7023, "step": 1372 }, { - "epoch": 2.6309751434034414, - "grad_norm": 0.27734375, - "learning_rate": 0.00020979646021967503, - "loss": 0.6192, + "epoch": 1.6131301289566236, + "grad_norm": 0.259765625, + "learning_rate": 0.000264781305078183, + "loss": 0.6557, "step": 1376 }, { - "epoch": 2.638623326959847, - "grad_norm": 0.287109375, - "learning_rate": 0.00020931908612681805, - "loss": 0.6072, + "epoch": 1.6178194607268463, + "grad_norm": 0.263671875, + "learning_rate": 0.00026457575046335055, + "loss": 0.6688, "step": 1380 }, { - "epoch": 2.6462715105162524, - "grad_norm": 0.287109375, - "learning_rate": 0.00020884099879120993, - "loss": 0.5332, + "epoch": 1.6225087924970691, + "grad_norm": 0.25390625, + "learning_rate": 0.00026436967801476334, + "loss": 0.6833, "step": 1384 }, { - "epoch": 2.6539196940726577, - "grad_norm": 0.291015625, - "learning_rate": 0.00020836220396129265, - "loss": 0.5923, + "epoch": 1.627198124267292, + "grad_norm": 0.259765625, + "learning_rate": 0.0002641630886637814, + "loss": 0.6415, "step": 1388 }, { - "epoch": 2.661567877629063, - "grad_norm": 0.2734375, - "learning_rate": 0.00020788270739401505, - "loss": 0.6293, + "epoch": 1.6318874560375147, + "grad_norm": 0.255859375, + "learning_rate": 0.0002639559833441008, + "loss": 0.7361, "step": 1392 }, { - "epoch": 2.6692160611854687, - "grad_norm": 0.275390625, - "learning_rate": 0.00020740251485476345, - "loss": 0.5851, + "epoch": 1.6365767878077375, + "grad_norm": 0.259765625, + "learning_rate": 0.00026374836299174984, + "loss": 0.6997, "step": 1396 }, { - "epoch": 2.676864244741874, - "grad_norm": 0.30078125, - "learning_rate": 0.00020692163211729253, - "loss": 0.6088, + "epoch": 1.64126611957796, + "grad_norm": 0.26953125, + "learning_rate": 0.0002635402285450842, + "loss": 0.6023, "step": 1400 }, { - "epoch": 2.6845124282982793, - "grad_norm": 0.306640625, - "learning_rate": 0.0002064400649636557, - "loss": 0.6033, + "epoch": 1.6459554513481829, + "grad_norm": 0.28515625, + "learning_rate": 0.00026333158094478333, + "loss": 0.7078, "step": 1404 }, { - "epoch": 2.6921606118546846, - "grad_norm": 0.3046875, - "learning_rate": 0.0002059578191841357, - "loss": 0.5948, + "epoch": 1.6506447831184057, + "grad_norm": 0.259765625, + "learning_rate": 0.0002631224211338458, + "loss": 0.7311, "step": 1408 }, { - "epoch": 2.69980879541109, - "grad_norm": 0.29296875, - "learning_rate": 0.00020547490057717499, - "loss": 0.6287, + "epoch": 1.6553341148886282, + "grad_norm": 0.27734375, + "learning_rate": 0.00026291275005758507, + "loss": 0.7061, "step": 1412 }, { - "epoch": 2.707456978967495, - "grad_norm": 0.296875, - "learning_rate": 0.00020499131494930602, - "loss": 0.5736, + "epoch": 1.660023446658851, + "grad_norm": 0.267578125, + "learning_rate": 0.00026270256866362554, + "loss": 0.6862, "step": 1416 }, { - "epoch": 2.7151051625239004, - "grad_norm": 0.287109375, - "learning_rate": 0.0002045070681150813, - "loss": 0.6496, + "epoch": 1.6647127784290738, + "grad_norm": 0.251953125, + "learning_rate": 0.0002624918779018979, + "loss": 0.6868, "step": 1420 }, { - "epoch": 2.7227533460803057, - "grad_norm": 0.279296875, - "learning_rate": 0.00020402216589700362, - "loss": 0.5993, + "epoch": 1.6694021101992966, + "grad_norm": 0.263671875, + "learning_rate": 0.00026228067872463475, + "loss": 0.7052, "step": 1424 }, { - "epoch": 2.730401529636711, - "grad_norm": 0.296875, - "learning_rate": 0.00020353661412545598, - "loss": 0.596, + "epoch": 1.6740914419695194, + "grad_norm": 0.2578125, + "learning_rate": 0.0002620689720863669, + "loss": 0.6386, "step": 1428 }, { - "epoch": 2.7380497131931167, - "grad_norm": 0.28515625, - "learning_rate": 0.00020305041863863152, - "loss": 0.639, + "epoch": 1.6787807737397422, + "grad_norm": 0.26171875, + "learning_rate": 0.0002618567589439185, + "loss": 0.6729, "step": 1432 }, { - "epoch": 2.745697896749522, - "grad_norm": 0.27734375, - "learning_rate": 0.00020256358528246334, - "loss": 0.5703, + "epoch": 1.6834701055099648, + "grad_norm": 0.271484375, + "learning_rate": 0.00026164404025640276, + "loss": 0.655, "step": 1436 }, { - "epoch": 2.7533460803059273, - "grad_norm": 0.279296875, - "learning_rate": 0.00020207611991055407, - "loss": 0.5838, + "epoch": 1.6881594372801876, + "grad_norm": 0.27734375, + "learning_rate": 0.0002614308169852179, + "loss": 0.7085, "step": 1440 }, { - "epoch": 2.7609942638623326, - "grad_norm": 0.3046875, - "learning_rate": 0.0002015880283841057, - "loss": 0.5845, + "epoch": 1.6928487690504102, + "grad_norm": 0.265625, + "learning_rate": 0.00026121709009404264, + "loss": 0.7295, "step": 1444 }, { - "epoch": 2.768642447418738, - "grad_norm": 0.287109375, - "learning_rate": 0.00020109931657184894, - "loss": 0.6169, + "epoch": 1.697538100820633, + "grad_norm": 0.26171875, + "learning_rate": 0.00026100286054883166, + "loss": 0.6506, "step": 1448 }, { - "epoch": 2.7762906309751436, - "grad_norm": 0.296875, - "learning_rate": 0.0002006099903499727, - "loss": 0.6026, + "epoch": 1.7022274325908557, + "grad_norm": 0.263671875, + "learning_rate": 0.0002607881293178117, + "loss": 0.6513, "step": 1452 }, { - "epoch": 2.783938814531549, - "grad_norm": 0.291015625, - "learning_rate": 0.00020012005560205356, - "loss": 0.6278, + "epoch": 1.7069167643610785, + "grad_norm": 0.2578125, + "learning_rate": 0.00026057289737147675, + "loss": 0.6906, "step": 1456 }, { - "epoch": 2.791586998087954, - "grad_norm": 0.28515625, - "learning_rate": 0.0001996295182189847, - "loss": 0.6273, + "epoch": 1.7116060961313013, + "grad_norm": 0.279296875, + "learning_rate": 0.00026035716568258377, + "loss": 0.6819, "step": 1460 }, { - "epoch": 2.7992351816443595, - "grad_norm": 0.291015625, - "learning_rate": 0.00019913838409890548, - "loss": 0.6084, + "epoch": 1.7162954279015241, + "grad_norm": 0.265625, + "learning_rate": 0.0002601409352261485, + "loss": 0.6822, "step": 1464 }, { - "epoch": 2.8068833652007648, - "grad_norm": 0.30859375, - "learning_rate": 0.00019864665914713024, - "loss": 0.6295, + "epoch": 1.720984759671747, + "grad_norm": 0.255859375, + "learning_rate": 0.0002599242069794407, + "loss": 0.6983, "step": 1468 }, { - "epoch": 2.81453154875717, - "grad_norm": 0.29296875, - "learning_rate": 0.0001981543492760774, - "loss": 0.5889, + "epoch": 1.7256740914419695, + "grad_norm": 0.263671875, + "learning_rate": 0.00025970698192198026, + "loss": 0.663, "step": 1472 }, { - "epoch": 2.8221797323135753, - "grad_norm": 0.287109375, - "learning_rate": 0.00019766146040519836, - "loss": 0.6064, + "epoch": 1.7303634232121923, + "grad_norm": 0.248046875, + "learning_rate": 0.00025948926103553196, + "loss": 0.7066, "step": 1476 }, { - "epoch": 2.8298279158699806, - "grad_norm": 0.28515625, - "learning_rate": 0.00019716799846090634, - "loss": 0.6269, + "epoch": 1.7350527549824148, + "grad_norm": 0.265625, + "learning_rate": 0.00025927104530410193, + "loss": 0.6464, "step": 1480 }, { - "epoch": 2.8374760994263863, - "grad_norm": 0.287109375, - "learning_rate": 0.00019667396937650506, - "loss": 0.5742, + "epoch": 1.7397420867526376, + "grad_norm": 0.267578125, + "learning_rate": 0.0002590523357139327, + "loss": 0.6341, "step": 1484 }, { - "epoch": 2.8451242829827916, - "grad_norm": 0.33203125, - "learning_rate": 0.0001961793790921174, - "loss": 0.5701, + "epoch": 1.7444314185228604, + "grad_norm": 0.26953125, + "learning_rate": 0.00025883313325349866, + "loss": 0.6986, "step": 1488 }, { - "epoch": 2.852772466539197, + "epoch": 1.7491207502930832, "grad_norm": 0.2734375, - "learning_rate": 0.00019568423355461402, - "loss": 0.5973, + "learning_rate": 0.0002586134389135019, + "loss": 0.6552, "step": 1492 }, { - "epoch": 2.860420650095602, - "grad_norm": 0.296875, - "learning_rate": 0.00019518853871754204, - "loss": 0.609, + "epoch": 1.753810082063306, + "grad_norm": 0.25390625, + "learning_rate": 0.0002583932536868676, + "loss": 0.681, "step": 1496 }, { - "epoch": 2.8680688336520075, - "grad_norm": 0.30078125, - "learning_rate": 0.00019469230054105295, - "loss": 0.5944, + "epoch": 1.7584994138335288, + "grad_norm": 0.2490234375, + "learning_rate": 0.00025817257856873956, + "loss": 0.6324, "step": 1500 }, { - "epoch": 2.875717017208413, - "grad_norm": 0.302734375, - "learning_rate": 0.0001941955249918315, - "loss": 0.5914, + "epoch": 1.7631887456037516, + "grad_norm": 0.279296875, + "learning_rate": 0.00025795141455647554, + "loss": 0.6766, "step": 1504 }, { - "epoch": 2.8833652007648185, - "grad_norm": 0.283203125, - "learning_rate": 0.00019369821804302365, - "loss": 0.6191, + "epoch": 1.7678780773739742, + "grad_norm": 0.271484375, + "learning_rate": 0.0002577297626496431, + "loss": 0.6905, "step": 1508 }, { - "epoch": 2.891013384321224, - "grad_norm": 0.296875, - "learning_rate": 0.00019320038567416484, - "loss": 0.6409, + "epoch": 1.772567409144197, + "grad_norm": 0.275390625, + "learning_rate": 0.00025750762385001464, + "loss": 0.6752, "step": 1512 }, { - "epoch": 2.898661567877629, - "grad_norm": 0.294921875, - "learning_rate": 0.00019270203387110798, - "loss": 0.5779, + "epoch": 1.7772567409144195, + "grad_norm": 0.2421875, + "learning_rate": 0.0002572849991615633, + "loss": 0.6785, "step": 1516 }, { - "epoch": 2.9063097514340344, - "grad_norm": 0.29296875, - "learning_rate": 0.00019220316862595167, - "loss": 0.5956, + "epoch": 1.7819460726846423, + "grad_norm": 0.259765625, + "learning_rate": 0.00025706188959045826, + "loss": 0.6516, "step": 1520 }, { - "epoch": 2.9139579349904396, - "grad_norm": 0.3046875, - "learning_rate": 0.00019170379593696802, - "loss": 0.5916, + "epoch": 1.7866354044548651, + "grad_norm": 0.255859375, + "learning_rate": 0.00025683829614505993, + "loss": 0.628, "step": 1524 }, { - "epoch": 2.921606118546845, - "grad_norm": 0.2890625, - "learning_rate": 0.00019120392180853058, - "loss": 0.6069, + "epoch": 1.791324736225088, + "grad_norm": 0.251953125, + "learning_rate": 0.00025661421983591586, + "loss": 0.6955, "step": 1528 }, { - "epoch": 2.92925430210325, - "grad_norm": 0.279296875, - "learning_rate": 0.0001907035522510421, - "loss": 0.6029, + "epoch": 1.7960140679953107, + "grad_norm": 0.287109375, + "learning_rate": 0.0002563896616757558, + "loss": 0.6969, "step": 1532 }, { - "epoch": 2.936902485659656, - "grad_norm": 0.310546875, - "learning_rate": 0.00019020269328086226, - "loss": 0.5706, + "epoch": 1.8007033997655335, + "grad_norm": 0.25, + "learning_rate": 0.00025616462267948726, + "loss": 0.6666, "step": 1536 }, { - "epoch": 2.9445506692160612, - "grad_norm": 0.29296875, - "learning_rate": 0.0001897013509202354, - "loss": 0.6024, + "epoch": 1.8053927315357563, + "grad_norm": 0.263671875, + "learning_rate": 0.00025593910386419107, + "loss": 0.6589, "step": 1540 }, { - "epoch": 2.9521988527724665, - "grad_norm": 0.287109375, - "learning_rate": 0.00018919953119721808, - "loss": 0.6326, + "epoch": 1.8100820633059789, + "grad_norm": 0.275390625, + "learning_rate": 0.0002557131062491165, + "loss": 0.6676, "step": 1544 }, { - "epoch": 2.959847036328872, - "grad_norm": 0.296875, - "learning_rate": 0.0001886972401456065, - "loss": 0.5744, + "epoch": 1.8147713950762017, + "grad_norm": 0.26171875, + "learning_rate": 0.0002554866308556769, + "loss": 0.6157, "step": 1548 }, { - "epoch": 2.967495219885277, - "grad_norm": 0.28515625, - "learning_rate": 0.00018819448380486413, - "loss": 0.5679, + "epoch": 1.8194607268464242, + "grad_norm": 0.2578125, + "learning_rate": 0.0002552596787074448, + "loss": 0.6262, "step": 1552 }, { - "epoch": 2.975143403441683, - "grad_norm": 0.27734375, - "learning_rate": 0.00018769126822004898, - "loss": 0.5992, + "epoch": 1.824150058616647, + "grad_norm": 0.275390625, + "learning_rate": 0.00025503225083014765, + "loss": 0.7328, "step": 1556 }, { - "epoch": 2.982791586998088, - "grad_norm": 0.30078125, - "learning_rate": 0.00018718759944174086, - "loss": 0.5981, + "epoch": 1.8288393903868698, + "grad_norm": 0.263671875, + "learning_rate": 0.0002548043482516629, + "loss": 0.6837, "step": 1560 }, { - "epoch": 2.9904397705544934, - "grad_norm": 0.29296875, - "learning_rate": 0.0001866834835259688, - "loss": 0.6188, + "epoch": 1.8335287221570926, + "grad_norm": 0.259765625, + "learning_rate": 0.0002545759720020134, + "loss": 0.7037, "step": 1564 }, { - "epoch": 2.9980879541108987, - "grad_norm": 0.33984375, - "learning_rate": 0.0001861789265341381, - "loss": 0.617, + "epoch": 1.8382180539273154, + "grad_norm": 0.27734375, + "learning_rate": 0.0002543471231133628, + "loss": 0.6798, "step": 1568 }, { - "epoch": 3.005736137667304, - "grad_norm": 0.322265625, - "learning_rate": 0.00018567393453295742, - "loss": 0.4644, + "epoch": 1.8429073856975382, + "grad_norm": 0.265625, + "learning_rate": 0.0002541178026200112, + "loss": 0.7137, "step": 1572 }, { - "epoch": 3.0133843212237093, - "grad_norm": 0.298828125, - "learning_rate": 0.00018516851359436602, - "loss": 0.4965, + "epoch": 1.847596717467761, + "grad_norm": 0.26953125, + "learning_rate": 0.0002538880115583896, + "loss": 0.7179, "step": 1576 }, { - "epoch": 3.0210325047801145, - "grad_norm": 0.29296875, - "learning_rate": 0.00018466266979546057, - "loss": 0.501, + "epoch": 1.8522860492379836, + "grad_norm": 0.267578125, + "learning_rate": 0.0002536577509670562, + "loss": 0.6664, "step": 1580 }, { - "epoch": 3.0286806883365203, - "grad_norm": 0.28125, - "learning_rate": 0.0001841564092184221, - "loss": 0.4787, + "epoch": 1.8569753810082064, + "grad_norm": 0.255859375, + "learning_rate": 0.0002534270218866911, + "loss": 0.5868, "step": 1584 }, { - "epoch": 3.0363288718929256, - "grad_norm": 0.298828125, - "learning_rate": 0.00018364973795044294, - "loss": 0.5116, + "epoch": 1.861664712778429, + "grad_norm": 0.26953125, + "learning_rate": 0.00025319582536009175, + "loss": 0.68, "step": 1588 }, { - "epoch": 3.043977055449331, - "grad_norm": 0.28125, - "learning_rate": 0.00018314266208365357, - "loss": 0.5309, + "epoch": 1.8663540445486517, + "grad_norm": 0.267578125, + "learning_rate": 0.00025296416243216836, + "loss": 0.6532, "step": 1592 }, { - "epoch": 3.051625239005736, - "grad_norm": 0.30078125, - "learning_rate": 0.00018263518771504924, - "loss": 0.4979, + "epoch": 1.8710433763188745, + "grad_norm": 0.271484375, + "learning_rate": 0.000252732034149939, + "loss": 0.6645, "step": 1596 }, { - "epoch": 3.0592734225621414, - "grad_norm": 0.296875, - "learning_rate": 0.00018212732094641666, - "loss": 0.4647, + "epoch": 1.8757327080890973, + "grad_norm": 0.26953125, + "learning_rate": 0.00025249944156252504, + "loss": 0.6842, "step": 1600 }, { - "epoch": 3.0669216061185467, - "grad_norm": 0.294921875, - "learning_rate": 0.00018161906788426076, - "loss": 0.5367, + "epoch": 1.88042203985932, + "grad_norm": 0.263671875, + "learning_rate": 0.0002522663857211461, + "loss": 0.6603, "step": 1604 }, { - "epoch": 3.0745697896749524, - "grad_norm": 0.30078125, - "learning_rate": 0.00018111043463973122, - "loss": 0.5095, + "epoch": 1.885111371629543, + "grad_norm": 0.25, + "learning_rate": 0.00025203286767911575, + "loss": 0.704, "step": 1608 }, { - "epoch": 3.0822179732313577, - "grad_norm": 0.27734375, - "learning_rate": 0.00018060142732854894, - "loss": 0.4615, + "epoch": 1.8898007033997657, + "grad_norm": 0.263671875, + "learning_rate": 0.0002517988884918364, + "loss": 0.7084, "step": 1612 }, { - "epoch": 3.089866156787763, - "grad_norm": 0.267578125, - "learning_rate": 0.00018009205207093252, - "loss": 0.5105, + "epoch": 1.8944900351699883, + "grad_norm": 0.2470703125, + "learning_rate": 0.00025156444921679464, + "loss": 0.7358, "step": 1616 }, { - "epoch": 3.0975143403441683, - "grad_norm": 0.287109375, - "learning_rate": 0.00017958231499152463, - "loss": 0.5326, + "epoch": 1.899179366940211, + "grad_norm": 0.26953125, + "learning_rate": 0.0002513295509135564, + "loss": 0.6261, "step": 1620 }, { - "epoch": 3.1051625239005736, - "grad_norm": 0.3125, - "learning_rate": 0.0001790722222193186, - "loss": 0.5383, + "epoch": 1.9038686987104336, + "grad_norm": 0.294921875, + "learning_rate": 0.0002510941946437625, + "loss": 0.7177, "step": 1624 }, { - "epoch": 3.112810707456979, - "grad_norm": 0.296875, - "learning_rate": 0.00017856177988758438, - "loss": 0.5192, + "epoch": 1.9085580304806564, + "grad_norm": 0.267578125, + "learning_rate": 0.00025085838147112315, + "loss": 0.6916, "step": 1628 }, { - "epoch": 3.120458891013384, - "grad_norm": 0.3046875, - "learning_rate": 0.00017805099413379508, - "loss": 0.5029, + "epoch": 1.9132473622508792, + "grad_norm": 0.267578125, + "learning_rate": 0.000250622112461414, + "loss": 0.6928, "step": 1632 }, { - "epoch": 3.12810707456979, - "grad_norm": 0.318359375, - "learning_rate": 0.00017753987109955297, - "loss": 0.4896, + "epoch": 1.917936694021102, + "grad_norm": 0.26953125, + "learning_rate": 0.00025038538868247043, + "loss": 0.6989, "step": 1636 }, { - "epoch": 3.135755258126195, - "grad_norm": 0.310546875, - "learning_rate": 0.00017702841693051577, - "loss": 0.5254, + "epoch": 1.9226260257913248, + "grad_norm": 0.283203125, + "learning_rate": 0.0002501482112041836, + "loss": 0.6828, "step": 1640 }, { - "epoch": 3.1434034416826004, - "grad_norm": 0.330078125, - "learning_rate": 0.0001765166377763227, - "loss": 0.4829, + "epoch": 1.9273153575615476, + "grad_norm": 0.2392578125, + "learning_rate": 0.00024991058109849495, + "loss": 0.6445, "step": 1644 }, { - "epoch": 3.1510516252390057, - "grad_norm": 0.3046875, - "learning_rate": 0.00017600453979052055, - "loss": 0.5461, + "epoch": 1.9320046893317704, + "grad_norm": 0.294921875, + "learning_rate": 0.00024967249943939174, + "loss": 0.6642, "step": 1648 }, { - "epoch": 3.158699808795411, - "grad_norm": 0.298828125, - "learning_rate": 0.0001754921291304897, - "loss": 0.5415, + "epoch": 1.936694021101993, + "grad_norm": 0.2578125, + "learning_rate": 0.000249433967302902, + "loss": 0.6969, "step": 1652 }, { - "epoch": 3.1663479923518163, - "grad_norm": 0.31640625, - "learning_rate": 0.00017497941195737004, - "loss": 0.5501, + "epoch": 1.9413833528722158, + "grad_norm": 0.251953125, + "learning_rate": 0.00024919498576708975, + "loss": 0.6826, "step": 1656 }, { - "epoch": 3.173996175908222, - "grad_norm": 0.314453125, - "learning_rate": 0.00017446639443598696, - "loss": 0.4964, + "epoch": 1.9460726846424383, + "grad_norm": 0.2734375, + "learning_rate": 0.00024895555591205004, + "loss": 0.6732, "step": 1660 }, { - "epoch": 3.1816443594646273, - "grad_norm": 0.3046875, - "learning_rate": 0.00017395308273477714, - "loss": 0.4938, + "epoch": 1.9507620164126611, + "grad_norm": 0.271484375, + "learning_rate": 0.00024871567881990414, + "loss": 0.6914, "step": 1664 }, { - "epoch": 3.1892925430210326, - "grad_norm": 0.30859375, - "learning_rate": 0.00017343948302571446, - "loss": 0.5409, + "epoch": 1.955451348182884, + "grad_norm": 0.263671875, + "learning_rate": 0.00024847535557479477, + "loss": 0.6865, "step": 1668 }, { - "epoch": 3.196940726577438, - "grad_norm": 0.302734375, - "learning_rate": 0.00017292560148423578, - "loss": 0.4844, + "epoch": 1.9601406799531067, + "grad_norm": 0.26171875, + "learning_rate": 0.0002482345872628809, + "loss": 0.6773, "step": 1672 }, { - "epoch": 3.204588910133843, - "grad_norm": 0.3046875, - "learning_rate": 0.00017241144428916655, - "loss": 0.539, + "epoch": 1.9648300117233295, + "grad_norm": 0.267578125, + "learning_rate": 0.0002479933749723332, + "loss": 0.6618, "step": 1676 }, { - "epoch": 3.2122370936902485, - "grad_norm": 0.3046875, - "learning_rate": 0.00017189701762264687, - "loss": 0.4974, + "epoch": 1.9695193434935523, + "grad_norm": 0.267578125, + "learning_rate": 0.00024775171979332867, + "loss": 0.6894, "step": 1680 }, { - "epoch": 3.2198852772466537, - "grad_norm": 0.337890625, - "learning_rate": 0.0001713823276700567, - "loss": 0.5443, + "epoch": 1.9742086752637749, + "grad_norm": 0.2578125, + "learning_rate": 0.0002475096228180463, + "loss": 0.6323, "step": 1684 }, { - "epoch": 3.2275334608030595, - "grad_norm": 0.29296875, - "learning_rate": 0.00017086738061994176, - "loss": 0.4936, + "epoch": 1.9788980070339977, + "grad_norm": 0.28125, + "learning_rate": 0.00024726708514066157, + "loss": 0.6447, "step": 1688 }, { - "epoch": 3.2351816443594648, - "grad_norm": 0.298828125, - "learning_rate": 0.00017035218266393918, - "loss": 0.5027, + "epoch": 1.9835873388042204, + "grad_norm": 0.267578125, + "learning_rate": 0.0002470241078573418, + "loss": 0.6645, "step": 1692 }, { - "epoch": 3.24282982791587, - "grad_norm": 0.326171875, - "learning_rate": 0.00016983673999670273, - "loss": 0.5352, + "epoch": 1.988276670574443, + "grad_norm": 0.275390625, + "learning_rate": 0.00024678069206624117, + "loss": 0.6562, "step": 1696 }, { - "epoch": 3.2504780114722753, - "grad_norm": 0.310546875, - "learning_rate": 0.0001693210588158287, - "loss": 0.5147, + "epoch": 1.9929660023446658, + "grad_norm": 0.263671875, + "learning_rate": 0.0002465368388674958, + "loss": 0.7016, "step": 1700 }, { - "epoch": 3.2581261950286806, - "grad_norm": 0.296875, - "learning_rate": 0.00016880514532178123, - "loss": 0.5013, + "epoch": 1.9976553341148886, + "grad_norm": 0.265625, + "learning_rate": 0.00024629254936321855, + "loss": 0.6702, "step": 1704 }, { - "epoch": 3.265774378585086, - "grad_norm": 0.3125, - "learning_rate": 0.00016828900571781767, - "loss": 0.5408, + "epoch": 2.0023446658851114, + "grad_norm": 0.244140625, + "learning_rate": 0.0002460478246574944, + "loss": 0.6856, "step": 1708 }, { - "epoch": 3.2734225621414916, - "grad_norm": 0.3046875, - "learning_rate": 0.00016777264620991414, - "loss": 0.4758, + "epoch": 2.007033997655334, + "grad_norm": 0.279296875, + "learning_rate": 0.00024580266585637496, + "loss": 0.5902, "step": 1712 }, { - "epoch": 3.281070745697897, - "grad_norm": 0.3046875, - "learning_rate": 0.00016725607300669087, - "loss": 0.5154, + "epoch": 2.011723329425557, + "grad_norm": 0.302734375, + "learning_rate": 0.00024555707406787405, + "loss": 0.563, "step": 1716 }, { - "epoch": 3.288718929254302, - "grad_norm": 0.302734375, - "learning_rate": 0.0001667392923193375, - "loss": 0.4882, + "epoch": 2.0164126611957798, + "grad_norm": 0.2578125, + "learning_rate": 0.0002453110504019623, + "loss": 0.6143, "step": 1720 }, { - "epoch": 3.2963671128107075, - "grad_norm": 0.322265625, - "learning_rate": 0.00016622231036153836, - "loss": 0.5423, + "epoch": 2.021101992966002, + "grad_norm": 0.259765625, + "learning_rate": 0.0002450645959705622, + "loss": 0.5521, "step": 1724 }, { - "epoch": 3.3040152963671128, - "grad_norm": 0.328125, - "learning_rate": 0.0001657051333493978, - "loss": 0.509, + "epoch": 2.025791324736225, + "grad_norm": 0.302734375, + "learning_rate": 0.0002448177118875432, + "loss": 0.6465, "step": 1728 }, { - "epoch": 3.311663479923518, - "grad_norm": 0.349609375, - "learning_rate": 0.00016518776750136578, - "loss": 0.5447, + "epoch": 2.0304806565064477, + "grad_norm": 0.26171875, + "learning_rate": 0.00024457039926871656, + "loss": 0.5643, "step": 1732 }, { - "epoch": 3.3193116634799233, - "grad_norm": 0.33203125, - "learning_rate": 0.00016467021903816237, - "loss": 0.5048, + "epoch": 2.0351699882766705, + "grad_norm": 0.271484375, + "learning_rate": 0.00024432265923183025, + "loss": 0.5682, "step": 1736 }, { - "epoch": 3.3269598470363286, - "grad_norm": 0.287109375, - "learning_rate": 0.00016415249418270364, - "loss": 0.5183, + "epoch": 2.0398593200468933, + "grad_norm": 0.27734375, + "learning_rate": 0.00024407449289656416, + "loss": 0.5971, "step": 1740 }, { - "epoch": 3.3346080305927344, - "grad_norm": 0.3125, - "learning_rate": 0.00016363459916002643, - "loss": 0.4915, + "epoch": 2.044548651817116, + "grad_norm": 0.2734375, + "learning_rate": 0.00024382590138452475, + "loss": 0.5766, "step": 1744 }, { - "epoch": 3.3422562141491396, - "grad_norm": 0.349609375, - "learning_rate": 0.00016311654019721377, - "loss": 0.5016, + "epoch": 2.049237983587339, + "grad_norm": 0.25, + "learning_rate": 0.00024357688581924013, + "loss": 0.5996, "step": 1748 }, { - "epoch": 3.349904397705545, - "grad_norm": 0.30859375, - "learning_rate": 0.00016259832352331978, - "loss": 0.5276, + "epoch": 2.0539273153575617, + "grad_norm": 0.279296875, + "learning_rate": 0.00024332744732615496, + "loss": 0.5863, "step": 1752 }, { - "epoch": 3.35755258126195, - "grad_norm": 0.34375, - "learning_rate": 0.0001620799553692949, - "loss": 0.5436, + "epoch": 2.0586166471277845, + "grad_norm": 0.271484375, + "learning_rate": 0.00024307758703262527, + "loss": 0.6162, "step": 1756 }, { - "epoch": 3.3652007648183555, - "grad_norm": 0.29296875, - "learning_rate": 0.00016156144196791103, - "loss": 0.5152, + "epoch": 2.063305978898007, + "grad_norm": 0.271484375, + "learning_rate": 0.00024282730606791365, + "loss": 0.6311, "step": 1760 }, { - "epoch": 3.3728489483747612, - "grad_norm": 0.28515625, - "learning_rate": 0.0001610427895536863, - "loss": 0.4845, + "epoch": 2.0679953106682296, + "grad_norm": 0.30078125, + "learning_rate": 0.00024257660556318373, + "loss": 0.566, "step": 1764 }, { - "epoch": 3.3804971319311665, - "grad_norm": 0.33203125, - "learning_rate": 0.00016052400436281046, - "loss": 0.51, + "epoch": 2.0726846424384524, + "grad_norm": 0.2734375, + "learning_rate": 0.00024232548665149533, + "loss": 0.5363, "step": 1768 }, { - "epoch": 3.388145315487572, - "grad_norm": 0.306640625, - "learning_rate": 0.00016000509263306976, - "loss": 0.5163, + "epoch": 2.077373974208675, + "grad_norm": 0.263671875, + "learning_rate": 0.00024207395046779945, + "loss": 0.591, "step": 1772 }, { - "epoch": 3.395793499043977, - "grad_norm": 0.34765625, - "learning_rate": 0.0001594860606037719, - "loss": 0.559, + "epoch": 2.082063305978898, + "grad_norm": 0.271484375, + "learning_rate": 0.00024182199814893278, + "loss": 0.579, "step": 1776 }, { - "epoch": 3.4034416826003824, - "grad_norm": 0.3359375, - "learning_rate": 0.0001589669145156709, - "loss": 0.523, + "epoch": 2.086752637749121, + "grad_norm": 0.2734375, + "learning_rate": 0.00024156963083361282, + "loss": 0.6016, "step": 1780 }, { - "epoch": 3.4110898661567877, - "grad_norm": 0.3359375, - "learning_rate": 0.00015844766061089241, - "loss": 0.4994, + "epoch": 2.0914419695193436, + "grad_norm": 0.271484375, + "learning_rate": 0.0002413168496624328, + "loss": 0.6139, "step": 1784 }, { - "epoch": 3.418738049713193, - "grad_norm": 0.32421875, - "learning_rate": 0.00015792830513285838, - "loss": 0.5259, + "epoch": 2.0961313012895664, + "grad_norm": 0.26953125, + "learning_rate": 0.00024106365577785625, + "loss": 0.6217, "step": 1788 }, { - "epoch": 3.4263862332695982, - "grad_norm": 0.314453125, - "learning_rate": 0.000157408854326212, - "loss": 0.4734, + "epoch": 2.100820633059789, + "grad_norm": 0.3125, + "learning_rate": 0.00024081005032421202, + "loss": 0.5849, "step": 1792 }, { - "epoch": 3.434034416826004, - "grad_norm": 0.330078125, - "learning_rate": 0.00015688931443674276, - "loss": 0.5163, + "epoch": 2.1055099648300115, + "grad_norm": 0.291015625, + "learning_rate": 0.0002405560344476892, + "loss": 0.5774, "step": 1796 }, { - "epoch": 3.4416826003824093, - "grad_norm": 0.337890625, - "learning_rate": 0.0001563696917113112, - "loss": 0.514, + "epoch": 2.1101992966002343, + "grad_norm": 0.27734375, + "learning_rate": 0.00024030160929633165, + "loss": 0.6123, "step": 1800 }, { - "epoch": 3.4493307839388145, - "grad_norm": 0.322265625, - "learning_rate": 0.00015584999239777393, - "loss": 0.5691, + "epoch": 2.114888628370457, + "grad_norm": 0.271484375, + "learning_rate": 0.00024004677602003306, + "loss": 0.5666, "step": 1804 }, { - "epoch": 3.45697896749522, - "grad_norm": 0.3515625, - "learning_rate": 0.0001553302227449084, - "loss": 0.5365, + "epoch": 2.11957796014068, + "grad_norm": 0.2890625, + "learning_rate": 0.00023979153577053167, + "loss": 0.5931, "step": 1808 }, { - "epoch": 3.464627151051625, - "grad_norm": 0.318359375, - "learning_rate": 0.0001548103890023378, - "loss": 0.5111, + "epoch": 2.1242672919109027, + "grad_norm": 0.28125, + "learning_rate": 0.00023953588970140503, + "loss": 0.5862, "step": 1812 }, { - "epoch": 3.472275334608031, - "grad_norm": 0.318359375, - "learning_rate": 0.00015429049742045591, - "loss": 0.5272, + "epoch": 2.1289566236811255, + "grad_norm": 0.275390625, + "learning_rate": 0.00023927983896806495, + "loss": 0.5796, "step": 1816 }, { - "epoch": 3.479923518164436, - "grad_norm": 0.30078125, - "learning_rate": 0.000153770554250352, - "loss": 0.4746, + "epoch": 2.1336459554513483, + "grad_norm": 0.28125, + "learning_rate": 0.0002390233847277519, + "loss": 0.5914, "step": 1820 }, { - "epoch": 3.4875717017208414, - "grad_norm": 0.328125, - "learning_rate": 0.00015325056574373564, - "loss": 0.5091, + "epoch": 2.138335287221571, + "grad_norm": 0.26171875, + "learning_rate": 0.00023876652813953028, + "loss": 0.5639, "step": 1824 }, { - "epoch": 3.4952198852772467, - "grad_norm": 0.322265625, - "learning_rate": 0.00015273053815286153, - "loss": 0.5043, + "epoch": 2.143024618991794, + "grad_norm": 0.265625, + "learning_rate": 0.00023850927036428286, + "loss": 0.5648, "step": 1828 }, { - "epoch": 3.502868068833652, - "grad_norm": 0.310546875, - "learning_rate": 0.00015221047773045424, - "loss": 0.5157, + "epoch": 2.147713950762016, + "grad_norm": 0.27734375, + "learning_rate": 0.00023825161256470546, + "loss": 0.5877, "step": 1832 }, { - "epoch": 3.5105162523900573, - "grad_norm": 0.310546875, - "learning_rate": 0.00015169039072963312, - "loss": 0.525, + "epoch": 2.152403282532239, + "grad_norm": 0.28515625, + "learning_rate": 0.00023799355590530205, + "loss": 0.6211, "step": 1836 }, { - "epoch": 3.5181644359464626, - "grad_norm": 0.333984375, - "learning_rate": 0.00015117028340383713, - "loss": 0.536, + "epoch": 2.157092614302462, + "grad_norm": 0.341796875, + "learning_rate": 0.00023773510155237918, + "loss": 0.6338, "step": 1840 }, { - "epoch": 3.525812619502868, - "grad_norm": 0.326171875, - "learning_rate": 0.00015065016200674963, - "loss": 0.556, + "epoch": 2.1617819460726846, + "grad_norm": 0.2890625, + "learning_rate": 0.0002374762506740408, + "loss": 0.59, "step": 1844 }, { - "epoch": 3.5334608030592736, - "grad_norm": 0.345703125, - "learning_rate": 0.00015013003279222312, - "loss": 0.5199, + "epoch": 2.1664712778429074, + "grad_norm": 0.28125, + "learning_rate": 0.00023721700444018296, + "loss": 0.6159, "step": 1848 }, { - "epoch": 3.541108986615679, - "grad_norm": 0.30859375, - "learning_rate": 0.0001496099020142041, - "loss": 0.5381, + "epoch": 2.17116060961313, + "grad_norm": 0.3125, + "learning_rate": 0.00023695736402248865, + "loss": 0.6008, "step": 1852 }, { - "epoch": 3.548757170172084, - "grad_norm": 0.33203125, - "learning_rate": 0.00014908977592665787, - "loss": 0.5092, + "epoch": 2.175849941383353, + "grad_norm": 0.279296875, + "learning_rate": 0.00023669733059442238, + "loss": 0.5405, "step": 1856 }, { - "epoch": 3.5564053537284894, - "grad_norm": 0.328125, - "learning_rate": 0.00014856966078349339, - "loss": 0.5101, + "epoch": 2.1805392731535758, + "grad_norm": 0.28515625, + "learning_rate": 0.00023643690533122467, + "loss": 0.5994, "step": 1860 }, { - "epoch": 3.5640535372848947, - "grad_norm": 0.353515625, - "learning_rate": 0.00014804956283848793, - "loss": 0.5093, + "epoch": 2.1852286049237986, + "grad_norm": 0.318359375, + "learning_rate": 0.00023617608940990737, + "loss": 0.6262, "step": 1864 }, { - "epoch": 3.5717017208413004, - "grad_norm": 0.306640625, - "learning_rate": 0.00014752948834521206, - "loss": 0.499, + "epoch": 2.189917936694021, + "grad_norm": 0.27734375, + "learning_rate": 0.0002359148840092476, + "loss": 0.5902, "step": 1868 }, { - "epoch": 3.5793499043977057, - "grad_norm": 0.33984375, - "learning_rate": 0.00014700944355695432, - "loss": 0.4342, + "epoch": 2.1946072684642437, + "grad_norm": 0.298828125, + "learning_rate": 0.00023565329030978297, + "loss": 0.5659, "step": 1872 }, { - "epoch": 3.586998087954111, - "grad_norm": 0.333984375, - "learning_rate": 0.00014648943472664612, - "loss": 0.541, + "epoch": 2.1992966002344665, + "grad_norm": 0.28125, + "learning_rate": 0.00023539130949380585, + "loss": 0.6104, "step": 1876 }, { - "epoch": 3.5946462715105163, - "grad_norm": 0.349609375, - "learning_rate": 0.00014596946810678646, - "loss": 0.5089, + "epoch": 2.2039859320046893, + "grad_norm": 0.287109375, + "learning_rate": 0.00023512894274535843, + "loss": 0.5938, "step": 1880 }, { - "epoch": 3.6022944550669216, - "grad_norm": 0.30859375, - "learning_rate": 0.00014544954994936689, - "loss": 0.4995, + "epoch": 2.208675263774912, + "grad_norm": 0.279296875, + "learning_rate": 0.000234866191250227, + "loss": 0.5961, "step": 1884 }, { - "epoch": 3.609942638623327, - "grad_norm": 0.318359375, - "learning_rate": 0.0001449296865057962, - "loss": 0.5299, + "epoch": 2.213364595545135, + "grad_norm": 0.28515625, + "learning_rate": 0.00023460305619593674, + "loss": 0.6316, "step": 1888 }, { - "epoch": 3.617590822179732, - "grad_norm": 0.337890625, - "learning_rate": 0.00014440988402682526, - "loss": 0.5933, + "epoch": 2.2180539273153577, + "grad_norm": 0.296875, + "learning_rate": 0.00023433953877174645, + "loss": 0.5741, "step": 1892 }, { - "epoch": 3.6252390057361374, - "grad_norm": 0.3359375, - "learning_rate": 0.00014389014876247205, - "loss": 0.5045, + "epoch": 2.2227432590855805, + "grad_norm": 0.291015625, + "learning_rate": 0.000234075640168643, + "loss": 0.5559, "step": 1896 }, { - "epoch": 3.632887189292543, - "grad_norm": 0.3125, - "learning_rate": 0.00014337048696194625, - "loss": 0.4814, + "epoch": 2.2274325908558033, + "grad_norm": 0.27734375, + "learning_rate": 0.00023381136157933603, + "loss": 0.5655, "step": 1900 }, { - "epoch": 3.6405353728489485, - "grad_norm": 0.32421875, - "learning_rate": 0.00014285090487357427, - "loss": 0.5416, + "epoch": 2.2321219226260256, + "grad_norm": 0.294921875, + "learning_rate": 0.0002335467041982526, + "loss": 0.5871, "step": 1904 }, { - "epoch": 3.6481835564053537, - "grad_norm": 0.32421875, - "learning_rate": 0.0001423314087447241, - "loss": 0.5236, + "epoch": 2.2368112543962484, + "grad_norm": 0.3046875, + "learning_rate": 0.0002332816692215318, + "loss": 0.5668, "step": 1908 }, { - "epoch": 3.655831739961759, - "grad_norm": 0.34765625, - "learning_rate": 0.00014181200482173015, - "loss": 0.5281, + "epoch": 2.241500586166471, + "grad_norm": 0.283203125, + "learning_rate": 0.00023301625784701905, + "loss": 0.6187, "step": 1912 }, { - "epoch": 3.6634799235181643, - "grad_norm": 0.3046875, - "learning_rate": 0.00014129269934981802, - "loss": 0.5446, + "epoch": 2.246189917936694, + "grad_norm": 0.287109375, + "learning_rate": 0.0002327504712742612, + "loss": 0.6177, "step": 1916 }, { - "epoch": 3.67112810707457, - "grad_norm": 0.359375, - "learning_rate": 0.00014077349857302983, - "loss": 0.4949, + "epoch": 2.2508792497069168, + "grad_norm": 0.2890625, + "learning_rate": 0.0002324843107045008, + "loss": 0.6034, "step": 1920 }, { - "epoch": 3.6787762906309753, - "grad_norm": 0.31640625, - "learning_rate": 0.00014025440873414863, - "loss": 0.4875, + "epoch": 2.2555685814771396, + "grad_norm": 0.283203125, + "learning_rate": 0.00023221777734067046, + "loss": 0.5928, "step": 1924 }, { - "epoch": 3.6864244741873806, - "grad_norm": 0.3203125, - "learning_rate": 0.0001397354360746237, - "loss": 0.528, + "epoch": 2.2602579132473624, + "grad_norm": 0.302734375, + "learning_rate": 0.000231950872387388, + "loss": 0.5952, "step": 1928 }, { - "epoch": 3.694072657743786, - "grad_norm": 0.298828125, - "learning_rate": 0.0001392165868344953, - "loss": 0.4827, + "epoch": 2.264947245017585, + "grad_norm": 0.30078125, + "learning_rate": 0.0002316835970509504, + "loss": 0.5139, "step": 1932 }, { - "epoch": 3.701720841300191, - "grad_norm": 0.337890625, - "learning_rate": 0.0001386978672523198, - "loss": 0.5073, + "epoch": 2.269636576787808, + "grad_norm": 0.296875, + "learning_rate": 0.00023141595253932886, + "loss": 0.6095, "step": 1936 }, { - "epoch": 3.7093690248565965, - "grad_norm": 0.3203125, - "learning_rate": 0.0001381792835650945, - "loss": 0.5036, + "epoch": 2.2743259085580303, + "grad_norm": 0.291015625, + "learning_rate": 0.00023114794006216278, + "loss": 0.6051, "step": 1940 }, { - "epoch": 3.7170172084130018, - "grad_norm": 0.30859375, - "learning_rate": 0.00013766084200818272, - "loss": 0.5396, + "epoch": 2.279015240328253, + "grad_norm": 0.265625, + "learning_rate": 0.0002308795608307549, + "loss": 0.571, "step": 1944 }, { - "epoch": 3.724665391969407, - "grad_norm": 0.345703125, - "learning_rate": 0.0001371425488152389, - "loss": 0.4815, + "epoch": 2.283704572098476, + "grad_norm": 0.287109375, + "learning_rate": 0.0002306108160580654, + "loss": 0.6444, "step": 1948 }, { - "epoch": 3.7323135755258128, - "grad_norm": 0.3203125, - "learning_rate": 0.0001366244102181335, - "loss": 0.5306, + "epoch": 2.2883939038686987, + "grad_norm": 0.279296875, + "learning_rate": 0.00023034170695870665, + "loss": 0.5642, "step": 1952 }, { - "epoch": 3.739961759082218, - "grad_norm": 0.34375, - "learning_rate": 0.00013610643244687826, - "loss": 0.5419, + "epoch": 2.2930832356389215, + "grad_norm": 0.287109375, + "learning_rate": 0.00023007223474893736, + "loss": 0.59, "step": 1956 }, { - "epoch": 3.7476099426386233, - "grad_norm": 0.3203125, - "learning_rate": 0.00013558862172955105, - "loss": 0.5204, + "epoch": 2.2977725674091443, + "grad_norm": 0.28515625, + "learning_rate": 0.00022980240064665765, + "loss": 0.5728, "step": 1960 }, { - "epoch": 3.7552581261950286, - "grad_norm": 0.3203125, - "learning_rate": 0.00013507098429222115, - "loss": 0.4982, + "epoch": 2.302461899179367, + "grad_norm": 0.287109375, + "learning_rate": 0.0002295322058714031, + "loss": 0.6141, "step": 1964 }, { - "epoch": 3.762906309751434, - "grad_norm": 0.3203125, - "learning_rate": 0.00013455352635887438, - "loss": 0.4667, + "epoch": 2.30715123094959, + "grad_norm": 0.29296875, + "learning_rate": 0.0002292616516443394, + "loss": 0.6183, "step": 1968 }, { - "epoch": 3.7705544933078396, - "grad_norm": 0.34765625, - "learning_rate": 0.00013403625415133824, - "loss": 0.4302, + "epoch": 2.311840562719812, + "grad_norm": 0.294921875, + "learning_rate": 0.00022899073918825673, + "loss": 0.5835, "step": 1972 }, { - "epoch": 3.778202676864245, - "grad_norm": 0.310546875, - "learning_rate": 0.00013351917388920704, - "loss": 0.4545, + "epoch": 2.316529894490035, + "grad_norm": 0.2890625, + "learning_rate": 0.00022871946972756455, + "loss": 0.6463, "step": 1976 }, { - "epoch": 3.78585086042065, - "grad_norm": 0.337890625, - "learning_rate": 0.00013300229178976722, - "loss": 0.4953, + "epoch": 2.321219226260258, + "grad_norm": 0.28125, + "learning_rate": 0.00022844784448828554, + "loss": 0.6421, "step": 1980 }, { - "epoch": 3.7934990439770555, - "grad_norm": 0.3359375, - "learning_rate": 0.0001324856140679225, - "loss": 0.4966, + "epoch": 2.3259085580304806, + "grad_norm": 0.28515625, + "learning_rate": 0.0002281758646980505, + "loss": 0.6004, "step": 1984 }, { - "epoch": 3.801147227533461, - "grad_norm": 0.3203125, - "learning_rate": 0.0001319691469361193, - "loss": 0.5236, + "epoch": 2.3305978898007034, + "grad_norm": 0.30859375, + "learning_rate": 0.0002279035315860926, + "loss": 0.6484, "step": 1988 }, { - "epoch": 3.808795411089866, - "grad_norm": 0.328125, - "learning_rate": 0.00013145289660427173, - "loss": 0.5244, + "epoch": 2.335287221570926, + "grad_norm": 0.28515625, + "learning_rate": 0.00022763084638324202, + "loss": 0.5479, "step": 1992 }, { - "epoch": 3.8164435946462714, - "grad_norm": 0.3359375, - "learning_rate": 0.00013093686927968738, - "loss": 0.4982, + "epoch": 2.339976553341149, + "grad_norm": 0.294921875, + "learning_rate": 0.00022735781032192, + "loss": 0.6289, "step": 1996 }, { - "epoch": 3.8240917782026767, - "grad_norm": 0.345703125, - "learning_rate": 0.00013042107116699228, - "loss": 0.4899, + "epoch": 2.3446658851113718, + "grad_norm": 0.291015625, + "learning_rate": 0.00022708442463613367, + "loss": 0.5875, "step": 2000 }, { - "epoch": 3.8317399617590824, - "grad_norm": 0.3125, - "learning_rate": 0.00012990550846805654, - "loss": 0.5296, + "epoch": 2.3493552168815945, + "grad_norm": 0.2890625, + "learning_rate": 0.00022681069056147032, + "loss": 0.6117, "step": 2004 }, { - "epoch": 3.8393881453154877, - "grad_norm": 0.322265625, - "learning_rate": 0.0001293901873819196, - "loss": 0.5331, + "epoch": 2.3540445486518173, + "grad_norm": 0.291015625, + "learning_rate": 0.00022653660933509166, + "loss": 0.5951, "step": 2008 }, { - "epoch": 3.847036328871893, - "grad_norm": 0.333984375, - "learning_rate": 0.00012887511410471589, - "loss": 0.501, + "epoch": 2.3587338804220397, + "grad_norm": 0.298828125, + "learning_rate": 0.00022626218219572858, + "loss": 0.6197, "step": 2012 }, { - "epoch": 3.8546845124282982, - "grad_norm": 0.328125, - "learning_rate": 0.00012836029482960018, - "loss": 0.5254, + "epoch": 2.3634232121922625, + "grad_norm": 0.296875, + "learning_rate": 0.00022598741038367523, + "loss": 0.5915, "step": 2016 }, { - "epoch": 3.8623326959847035, - "grad_norm": 0.337890625, - "learning_rate": 0.00012784573574667316, - "loss": 0.5009, + "epoch": 2.3681125439624853, + "grad_norm": 0.2890625, + "learning_rate": 0.0002257122951407836, + "loss": 0.6072, "step": 2020 }, { - "epoch": 3.8699808795411093, - "grad_norm": 0.353515625, - "learning_rate": 0.00012733144304290697, - "loss": 0.5107, + "epoch": 2.372801875732708, + "grad_norm": 0.27734375, + "learning_rate": 0.0002254368377104577, + "loss": 0.536, "step": 2024 }, { - "epoch": 3.8776290630975145, - "grad_norm": 0.326171875, - "learning_rate": 0.0001268174229020709, - "loss": 0.5025, + "epoch": 2.377491207502931, + "grad_norm": 0.2890625, + "learning_rate": 0.0002251610393376483, + "loss": 0.6213, "step": 2028 }, { - "epoch": 3.88527724665392, - "grad_norm": 0.318359375, - "learning_rate": 0.0001263036815046571, - "loss": 0.5239, + "epoch": 2.3821805392731537, + "grad_norm": 0.298828125, + "learning_rate": 0.00022488490126884692, + "loss": 0.5981, "step": 2032 }, { - "epoch": 3.892925430210325, - "grad_norm": 0.333984375, - "learning_rate": 0.00012579022502780596, - "loss": 0.5112, + "epoch": 2.3868698710433764, + "grad_norm": 0.29296875, + "learning_rate": 0.00022460842475208038, + "loss": 0.619, "step": 2036 }, { - "epoch": 3.9005736137667304, - "grad_norm": 0.322265625, - "learning_rate": 0.00012527705964523209, - "loss": 0.5182, + "epoch": 2.3915592028135992, + "grad_norm": 0.3046875, + "learning_rate": 0.00022433161103690521, + "loss": 0.5714, "step": 2040 }, { - "epoch": 3.9082217973231357, - "grad_norm": 0.322265625, - "learning_rate": 0.00012476419152715007, - "loss": 0.5505, + "epoch": 2.3962485345838216, + "grad_norm": 0.287109375, + "learning_rate": 0.00022405446137440185, + "loss": 0.6044, "step": 2044 }, { - "epoch": 3.915869980879541, - "grad_norm": 0.3125, - "learning_rate": 0.00012425162684020024, - "loss": 0.4957, + "epoch": 2.4009378663540444, + "grad_norm": 0.287109375, + "learning_rate": 0.0002237769770171692, + "loss": 0.5437, "step": 2048 }, { - "epoch": 3.9235181644359463, - "grad_norm": 0.322265625, - "learning_rate": 0.0001237393717473745, - "loss": 0.5132, + "epoch": 2.405627198124267, + "grad_norm": 0.296875, + "learning_rate": 0.00022349915921931866, + "loss": 0.5494, "step": 2052 }, { - "epoch": 3.9311663479923515, - "grad_norm": 0.3125, - "learning_rate": 0.0001232274324079422, - "loss": 0.516, + "epoch": 2.41031652989449, + "grad_norm": 0.3203125, + "learning_rate": 0.0002232210092364689, + "loss": 0.6109, "step": 2056 }, { - "epoch": 3.9388145315487573, - "grad_norm": 0.33203125, - "learning_rate": 0.00012271581497737619, - "loss": 0.5156, + "epoch": 2.4150058616647128, + "grad_norm": 0.2890625, + "learning_rate": 0.00022294252832573958, + "loss": 0.5868, "step": 2060 }, { - "epoch": 3.9464627151051626, - "grad_norm": 0.314453125, - "learning_rate": 0.00012220452560727875, - "loss": 0.4621, + "epoch": 2.4196951934349356, + "grad_norm": 0.27734375, + "learning_rate": 0.00022266371774574633, + "loss": 0.6001, "step": 2064 }, { - "epoch": 3.954110898661568, - "grad_norm": 0.33203125, - "learning_rate": 0.00012169357044530758, - "loss": 0.5206, + "epoch": 2.4243845252051583, + "grad_norm": 0.294921875, + "learning_rate": 0.00022238457875659455, + "loss": 0.5908, "step": 2068 }, { - "epoch": 3.961759082217973, - "grad_norm": 0.322265625, - "learning_rate": 0.0001211829556351019, - "loss": 0.4511, + "epoch": 2.429073856975381, + "grad_norm": 0.30078125, + "learning_rate": 0.000222105112619874, + "loss": 0.6001, "step": 2072 }, { - "epoch": 3.969407265774379, - "grad_norm": 0.328125, - "learning_rate": 0.00012067268731620861, - "loss": 0.5047, + "epoch": 2.433763188745604, + "grad_norm": 0.28515625, + "learning_rate": 0.00022182532059865305, + "loss": 0.6076, "step": 2076 }, { - "epoch": 3.977055449330784, - "grad_norm": 0.3359375, - "learning_rate": 0.00012016277162400848, - "loss": 0.5295, + "epoch": 2.4384525205158267, + "grad_norm": 0.287109375, + "learning_rate": 0.00022154520395747279, + "loss": 0.5652, "step": 2080 }, { - "epoch": 3.9847036328871894, - "grad_norm": 0.333984375, - "learning_rate": 0.00011965321468964237, - "loss": 0.5204, + "epoch": 2.443141852286049, + "grad_norm": 0.2890625, + "learning_rate": 0.0002212647639623415, + "loss": 0.5713, "step": 2084 }, { - "epoch": 3.9923518164435947, - "grad_norm": 0.333984375, - "learning_rate": 0.00011914402263993745, - "loss": 0.5064, + "epoch": 2.447831184056272, + "grad_norm": 0.2890625, + "learning_rate": 0.000220984001880729, + "loss": 0.5742, "step": 2088 }, { - "epoch": 4.0, - "grad_norm": 0.87109375, - "learning_rate": 0.00011863520159733357, - "loss": 0.53, + "epoch": 2.4525205158264947, + "grad_norm": 0.287109375, + "learning_rate": 0.00022070291898156064, + "loss": 0.6326, "step": 2092 }, { - "epoch": 4.007648183556405, - "grad_norm": 0.3046875, - "learning_rate": 0.00011812675767980972, - "loss": 0.4532, + "epoch": 2.4572098475967175, + "grad_norm": 0.27734375, + "learning_rate": 0.00022042151653521182, + "loss": 0.5809, "step": 2096 }, { - "epoch": 4.015296367112811, - "grad_norm": 0.3046875, - "learning_rate": 0.00011761869700081036, - "loss": 0.4748, + "epoch": 2.4618991793669402, + "grad_norm": 0.279296875, + "learning_rate": 0.0002201397958135022, + "loss": 0.5758, "step": 2100 }, { - "epoch": 4.022944550669216, - "grad_norm": 0.314453125, - "learning_rate": 0.00011711102566917194, - "loss": 0.4188, + "epoch": 2.466588511137163, + "grad_norm": 0.318359375, + "learning_rate": 0.00021985775808968982, + "loss": 0.5594, "step": 2104 }, { - "epoch": 4.030592734225621, - "grad_norm": 0.32421875, - "learning_rate": 0.00011660374978904947, - "loss": 0.4466, + "epoch": 2.471277842907386, + "grad_norm": 0.296875, + "learning_rate": 0.0002195754046384654, + "loss": 0.5766, "step": 2108 }, { - "epoch": 4.038240917782026, - "grad_norm": 0.32421875, - "learning_rate": 0.00011609687545984315, - "loss": 0.4112, + "epoch": 2.4759671746776086, + "grad_norm": 0.287109375, + "learning_rate": 0.00021929273673594677, + "loss": 0.5754, "step": 2112 }, { - "epoch": 4.045889101338432, - "grad_norm": 0.345703125, - "learning_rate": 0.00011559040877612497, - "loss": 0.4566, + "epoch": 2.480656506447831, + "grad_norm": 0.296875, + "learning_rate": 0.0002190097556596728, + "loss": 0.5989, "step": 2116 }, { - "epoch": 4.053537284894838, - "grad_norm": 0.33203125, - "learning_rate": 0.00011508435582756545, - "loss": 0.4413, + "epoch": 2.4853458382180538, + "grad_norm": 0.306640625, + "learning_rate": 0.0002187264626885979, + "loss": 0.5721, "step": 2120 }, { - "epoch": 4.061185468451243, - "grad_norm": 0.34765625, - "learning_rate": 0.00011457872269886043, - "loss": 0.4435, + "epoch": 2.4900351699882766, + "grad_norm": 0.30078125, + "learning_rate": 0.00021844285910308593, + "loss": 0.6148, "step": 2124 }, { - "epoch": 4.0688336520076485, - "grad_norm": 0.341796875, - "learning_rate": 0.00011407351546965796, - "loss": 0.4568, + "epoch": 2.4947245017584994, + "grad_norm": 0.28515625, + "learning_rate": 0.00021815894618490482, + "loss": 0.6034, "step": 2128 }, { - "epoch": 4.076481835564054, - "grad_norm": 0.328125, - "learning_rate": 0.00011356874021448506, - "loss": 0.4247, + "epoch": 2.499413833528722, + "grad_norm": 0.306640625, + "learning_rate": 0.00021787472521722038, + "loss": 0.5831, "step": 2132 }, { - "epoch": 4.084130019120459, - "grad_norm": 0.302734375, - "learning_rate": 0.00011306440300267482, - "loss": 0.3762, + "epoch": 2.504103165298945, + "grad_norm": 0.28515625, + "learning_rate": 0.0002175901974845907, + "loss": 0.5933, "step": 2136 }, { - "epoch": 4.091778202676864, - "grad_norm": 0.337890625, - "learning_rate": 0.00011256050989829337, - "loss": 0.4713, + "epoch": 2.5087924970691677, + "grad_norm": 0.287109375, + "learning_rate": 0.00021730536427296045, + "loss": 0.589, "step": 2140 }, { - "epoch": 4.09942638623327, - "grad_norm": 0.3203125, - "learning_rate": 0.00011205706696006698, - "loss": 0.4178, + "epoch": 2.5134818288393905, + "grad_norm": 0.283203125, + "learning_rate": 0.00021702022686965471, + "loss": 0.5794, "step": 2144 }, { - "epoch": 4.107074569789675, - "grad_norm": 0.353515625, - "learning_rate": 0.00011155408024130921, - "loss": 0.4266, + "epoch": 2.5181711606096133, + "grad_norm": 0.279296875, + "learning_rate": 0.00021673478656337365, + "loss": 0.6354, "step": 2148 }, { - "epoch": 4.11472275334608, - "grad_norm": 0.31640625, - "learning_rate": 0.00011105155578984795, - "loss": 0.4242, + "epoch": 2.522860492379836, + "grad_norm": 0.263671875, + "learning_rate": 0.00021644904464418618, + "loss": 0.6022, "step": 2152 }, { - "epoch": 4.1223709369024855, - "grad_norm": 0.310546875, - "learning_rate": 0.00011054949964795307, - "loss": 0.464, + "epoch": 2.5275498241500585, + "grad_norm": 0.287109375, + "learning_rate": 0.0002161630024035245, + "loss": 0.5818, "step": 2156 }, { - "epoch": 4.130019120458891, - "grad_norm": 0.31640625, - "learning_rate": 0.00011004791785226347, - "loss": 0.4216, + "epoch": 2.5322391559202813, + "grad_norm": 0.310546875, + "learning_rate": 0.0002158766611341781, + "loss": 0.5831, "step": 2160 }, { - "epoch": 4.137667304015296, - "grad_norm": 0.34375, - "learning_rate": 0.00010954681643371462, - "loss": 0.4222, + "epoch": 2.536928487690504, + "grad_norm": 0.2890625, + "learning_rate": 0.000215590022130288, + "loss": 0.5985, "step": 2164 }, { - "epoch": 4.145315487571701, - "grad_norm": 0.3203125, - "learning_rate": 0.00010904620141746601, - "loss": 0.4321, + "epoch": 2.541617819460727, + "grad_norm": 0.306640625, + "learning_rate": 0.00021530308668734079, + "loss": 0.6242, "step": 2168 }, { - "epoch": 4.1529636711281075, - "grad_norm": 0.330078125, - "learning_rate": 0.0001085460788228287, - "loss": 0.441, + "epoch": 2.5463071512309496, + "grad_norm": 0.30859375, + "learning_rate": 0.0002150158561021629, + "loss": 0.5457, "step": 2172 }, { - "epoch": 4.160611854684513, - "grad_norm": 0.330078125, - "learning_rate": 0.00010804645466319292, - "loss": 0.4468, + "epoch": 2.5509964830011724, + "grad_norm": 0.29296875, + "learning_rate": 0.00021472833167291458, + "loss": 0.604, "step": 2176 }, { - "epoch": 4.168260038240918, - "grad_norm": 0.33203125, - "learning_rate": 0.0001075473349459559, - "loss": 0.3948, + "epoch": 2.5556858147713952, + "grad_norm": 0.291015625, + "learning_rate": 0.00021444051469908426, + "loss": 0.6074, "step": 2180 }, { - "epoch": 4.175908221797323, - "grad_norm": 0.33203125, - "learning_rate": 0.00010704872567244948, - "loss": 0.4233, + "epoch": 2.5603751465416176, + "grad_norm": 0.279296875, + "learning_rate": 0.00021415240648148246, + "loss": 0.557, "step": 2184 }, { - "epoch": 4.183556405353729, - "grad_norm": 0.34765625, - "learning_rate": 0.00010655063283786795, - "loss": 0.4227, + "epoch": 2.5650644783118404, + "grad_norm": 0.314453125, + "learning_rate": 0.00021386400832223605, + "loss": 0.573, "step": 2188 }, { - "epoch": 4.191204588910134, - "grad_norm": 0.337890625, - "learning_rate": 0.00010605306243119617, - "loss": 0.4242, + "epoch": 2.569753810082063, + "grad_norm": 0.279296875, + "learning_rate": 0.0002135753215247822, + "loss": 0.5672, "step": 2192 }, { - "epoch": 4.198852772466539, - "grad_norm": 0.337890625, - "learning_rate": 0.00010555602043513724, - "loss": 0.4428, + "epoch": 2.574443141852286, + "grad_norm": 0.2890625, + "learning_rate": 0.00021328634739386279, + "loss": 0.5693, "step": 2196 }, { - "epoch": 4.2065009560229445, - "grad_norm": 0.34375, - "learning_rate": 0.00010505951282604088, - "loss": 0.4132, + "epoch": 2.5791324736225087, + "grad_norm": 0.279296875, + "learning_rate": 0.0002129970872355182, + "loss": 0.5845, "step": 2200 }, { - "epoch": 4.21414913957935, - "grad_norm": 0.33984375, - "learning_rate": 0.00010456354557383139, - "loss": 0.4046, + "epoch": 2.5838218053927315, + "grad_norm": 0.310546875, + "learning_rate": 0.00021270754235708152, + "loss": 0.588, "step": 2204 }, { - "epoch": 4.221797323135755, - "grad_norm": 0.345703125, - "learning_rate": 0.00010406812464193584, - "loss": 0.4527, + "epoch": 2.5885111371629543, + "grad_norm": 0.2890625, + "learning_rate": 0.00021241771406717275, + "loss": 0.5666, "step": 2208 }, { - "epoch": 4.22944550669216, - "grad_norm": 0.33203125, - "learning_rate": 0.00010357325598721255, - "loss": 0.4107, + "epoch": 2.593200468933177, + "grad_norm": 0.291015625, + "learning_rate": 0.0002121276036756926, + "loss": 0.5667, "step": 2212 }, { - "epoch": 4.237093690248566, - "grad_norm": 0.31640625, - "learning_rate": 0.00010307894555987927, - "loss": 0.4375, + "epoch": 2.5978898007034, + "grad_norm": 0.3046875, + "learning_rate": 0.000211837212493817, + "loss": 0.6236, "step": 2216 }, { - "epoch": 4.244741873804971, - "grad_norm": 0.328125, - "learning_rate": 0.00010258519930344179, - "loss": 0.4328, + "epoch": 2.6025791324736227, + "grad_norm": 0.298828125, + "learning_rate": 0.00021154654183399077, + "loss": 0.5384, "step": 2220 }, { - "epoch": 4.252390057361376, - "grad_norm": 0.34375, - "learning_rate": 0.0001020920231546223, - "loss": 0.4388, + "epoch": 2.6072684642438455, + "grad_norm": 0.294921875, + "learning_rate": 0.00021125559300992197, + "loss": 0.6194, "step": 2224 }, { - "epoch": 4.260038240917782, - "grad_norm": 0.33984375, - "learning_rate": 0.00010159942304328819, - "loss": 0.4443, + "epoch": 2.611957796014068, + "grad_norm": 0.27734375, + "learning_rate": 0.00021096436733657572, + "loss": 0.6018, "step": 2228 }, { - "epoch": 4.267686424474188, - "grad_norm": 0.361328125, - "learning_rate": 0.00010110740489238066, - "loss": 0.4446, + "epoch": 2.6166471277842906, + "grad_norm": 0.294921875, + "learning_rate": 0.00021067286613016847, + "loss": 0.6199, "step": 2232 }, { - "epoch": 4.275334608030593, - "grad_norm": 0.341796875, - "learning_rate": 0.00010061597461784346, - "loss": 0.4269, + "epoch": 2.6213364595545134, + "grad_norm": 0.271484375, + "learning_rate": 0.00021038109070816184, + "loss": 0.585, "step": 2236 }, { - "epoch": 4.282982791586998, - "grad_norm": 0.33984375, - "learning_rate": 0.00010012513812855191, - "loss": 0.4326, + "epoch": 2.6260257913247362, + "grad_norm": 0.306640625, + "learning_rate": 0.00021008904238925704, + "loss": 0.5525, "step": 2240 }, { - "epoch": 4.2906309751434035, - "grad_norm": 0.35546875, - "learning_rate": 9.963490132624169e-05, - "loss": 0.4342, + "epoch": 2.630715123094959, + "grad_norm": 0.294921875, + "learning_rate": 0.00020979672249338835, + "loss": 0.5492, "step": 2244 }, { - "epoch": 4.298279158699809, - "grad_norm": 0.3515625, - "learning_rate": 9.914527010543795e-05, - "loss": 0.4157, + "epoch": 2.635404454865182, + "grad_norm": 0.314453125, + "learning_rate": 0.00020950413234171767, + "loss": 0.6457, "step": 2248 }, { - "epoch": 4.305927342256214, - "grad_norm": 0.3359375, - "learning_rate": 9.865625035338447e-05, - "loss": 0.4237, + "epoch": 2.6400937866354046, + "grad_norm": 0.29296875, + "learning_rate": 0.00020921127325662826, + "loss": 0.5911, "step": 2252 }, { - "epoch": 4.313575525812619, - "grad_norm": 0.314453125, - "learning_rate": 9.816784794997275e-05, - "loss": 0.4033, + "epoch": 2.644783118405627, + "grad_norm": 0.287109375, + "learning_rate": 0.00020891814656171895, + "loss": 0.5998, "step": 2256 }, { - "epoch": 4.321223709369025, - "grad_norm": 0.359375, - "learning_rate": 9.76800687676715e-05, - "loss": 0.4436, + "epoch": 2.6494724501758498, + "grad_norm": 0.296875, + "learning_rate": 0.00020862475358179787, + "loss": 0.6458, "step": 2260 }, { - "epoch": 4.32887189292543, - "grad_norm": 0.3359375, - "learning_rate": 9.719291867145583e-05, - "loss": 0.4384, + "epoch": 2.6541617819460726, + "grad_norm": 0.2890625, + "learning_rate": 0.00020833109564287675, + "loss": 0.5858, "step": 2264 }, { - "epoch": 4.336520076481835, - "grad_norm": 0.341796875, - "learning_rate": 9.670640351873688e-05, - "loss": 0.4512, + "epoch": 2.6588511137162953, + "grad_norm": 0.28125, + "learning_rate": 0.00020803717407216486, + "loss": 0.5901, "step": 2268 }, { - "epoch": 4.3441682600382405, - "grad_norm": 0.34375, - "learning_rate": 9.62205291592913e-05, - "loss": 0.4259, + "epoch": 2.663540445486518, + "grad_norm": 0.291015625, + "learning_rate": 0.0002077429901980629, + "loss": 0.5914, "step": 2272 }, { - "epoch": 4.351816443594647, - "grad_norm": 0.337890625, - "learning_rate": 9.573530143519098e-05, - "loss": 0.4178, + "epoch": 2.668229777256741, + "grad_norm": 0.302734375, + "learning_rate": 0.00020744854535015715, + "loss": 0.6325, "step": 2276 }, { - "epoch": 4.359464627151052, - "grad_norm": 0.3671875, - "learning_rate": 9.525072618073277e-05, - "loss": 0.3608, + "epoch": 2.6729191090269637, + "grad_norm": 0.28515625, + "learning_rate": 0.00020715384085921327, + "loss": 0.6216, "step": 2280 }, { - "epoch": 4.367112810707457, - "grad_norm": 0.32421875, - "learning_rate": 9.476680922236831e-05, - "loss": 0.4489, + "epoch": 2.6776084407971865, + "grad_norm": 0.314453125, + "learning_rate": 0.00020685887805717046, + "loss": 0.6035, "step": 2284 }, { - "epoch": 4.374760994263863, - "grad_norm": 0.330078125, - "learning_rate": 9.428355637863402e-05, - "loss": 0.4346, + "epoch": 2.6822977725674093, + "grad_norm": 0.283203125, + "learning_rate": 0.00020656365827713543, + "loss": 0.5751, "step": 2288 }, { - "epoch": 4.382409177820268, - "grad_norm": 0.33203125, - "learning_rate": 9.380097346008112e-05, - "loss": 0.4542, + "epoch": 2.686987104337632, + "grad_norm": 0.283203125, + "learning_rate": 0.0002062681828533762, + "loss": 0.5953, "step": 2292 }, { - "epoch": 4.390057361376673, - "grad_norm": 0.341796875, - "learning_rate": 9.331906626920576e-05, - "loss": 0.4395, + "epoch": 2.691676436107855, + "grad_norm": 0.291015625, + "learning_rate": 0.00020597245312131636, + "loss": 0.621, "step": 2296 }, { - "epoch": 4.397705544933078, - "grad_norm": 0.341796875, - "learning_rate": 9.283784060037921e-05, - "loss": 0.4858, + "epoch": 2.6963657678780772, + "grad_norm": 0.296875, + "learning_rate": 0.00020567647041752862, + "loss": 0.6052, "step": 2300 }, { - "epoch": 4.405353728489484, - "grad_norm": 0.30859375, - "learning_rate": 9.235730223977837e-05, - "loss": 0.4148, + "epoch": 2.7010550996483, + "grad_norm": 0.294921875, + "learning_rate": 0.0002053802360797292, + "loss": 0.6146, "step": 2304 }, { - "epoch": 4.413001912045889, - "grad_norm": 0.36328125, - "learning_rate": 9.187745696531584e-05, - "loss": 0.4579, + "epoch": 2.705744431418523, + "grad_norm": 0.294921875, + "learning_rate": 0.00020508375144677167, + "loss": 0.6077, "step": 2308 }, { - "epoch": 4.420650095602294, - "grad_norm": 0.357421875, - "learning_rate": 9.139831054657081e-05, - "loss": 0.457, + "epoch": 2.7104337631887456, + "grad_norm": 0.294921875, + "learning_rate": 0.00020478701785864057, + "loss": 0.568, "step": 2312 }, { - "epoch": 4.4282982791587, - "grad_norm": 0.341796875, - "learning_rate": 9.091986874471956e-05, - "loss": 0.4257, + "epoch": 2.7151230949589684, + "grad_norm": 0.28125, + "learning_rate": 0.0002044900366564458, + "loss": 0.6129, "step": 2316 }, { - "epoch": 4.435946462715105, - "grad_norm": 0.3515625, - "learning_rate": 9.044213731246614e-05, - "loss": 0.4287, + "epoch": 2.719812426729191, + "grad_norm": 0.3203125, + "learning_rate": 0.00020419280918241632, + "loss": 0.6165, "step": 2320 }, { - "epoch": 4.44359464627151, - "grad_norm": 0.3359375, - "learning_rate": 8.99651219939732e-05, - "loss": 0.4482, + "epoch": 2.7245017584994136, + "grad_norm": 0.310546875, + "learning_rate": 0.00020389533677989417, + "loss": 0.6291, "step": 2324 }, { - "epoch": 4.451242829827915, - "grad_norm": 0.345703125, - "learning_rate": 8.948882852479305e-05, - "loss": 0.4772, + "epoch": 2.7291910902696364, + "grad_norm": 0.29296875, + "learning_rate": 0.00020359762079332833, + "loss": 0.6276, "step": 2328 }, { - "epoch": 4.458891013384322, - "grad_norm": 0.33203125, - "learning_rate": 8.901326263179851e-05, - "loss": 0.4067, + "epoch": 2.733880422039859, + "grad_norm": 0.298828125, + "learning_rate": 0.0002032996625682687, + "loss": 0.6048, "step": 2332 }, { - "epoch": 4.466539196940727, - "grad_norm": 0.373046875, - "learning_rate": 8.85384300331142e-05, - "loss": 0.4399, + "epoch": 2.738569753810082, + "grad_norm": 0.294921875, + "learning_rate": 0.0002030014634513599, + "loss": 0.6189, "step": 2336 }, { - "epoch": 4.474187380497132, - "grad_norm": 0.3671875, - "learning_rate": 8.80643364380477e-05, - "loss": 0.4726, + "epoch": 2.7432590855803047, + "grad_norm": 0.298828125, + "learning_rate": 0.00020270302479033538, + "loss": 0.5516, "step": 2340 }, { - "epoch": 4.4818355640535374, - "grad_norm": 0.349609375, - "learning_rate": 8.759098754702099e-05, - "loss": 0.4514, + "epoch": 2.7479484173505275, + "grad_norm": 0.30859375, + "learning_rate": 0.00020240434793401124, + "loss": 0.6309, "step": 2344 }, { - "epoch": 4.489483747609943, - "grad_norm": 0.36328125, - "learning_rate": 8.711838905150179e-05, - "loss": 0.4502, + "epoch": 2.7526377491207503, + "grad_norm": 0.3046875, + "learning_rate": 0.00020210543423228, + "loss": 0.5694, "step": 2348 }, { - "epoch": 4.497131931166348, - "grad_norm": 0.328125, - "learning_rate": 8.664654663393516e-05, - "loss": 0.4366, + "epoch": 2.757327080890973, + "grad_norm": 0.30078125, + "learning_rate": 0.00020180628503610484, + "loss": 0.6261, "step": 2352 }, { - "epoch": 4.504780114722753, - "grad_norm": 0.341796875, - "learning_rate": 8.617546596767534e-05, - "loss": 0.437, + "epoch": 2.762016412661196, + "grad_norm": 0.2890625, + "learning_rate": 0.000201506901697513, + "loss": 0.6279, "step": 2356 }, { - "epoch": 4.512428298279159, - "grad_norm": 0.337890625, - "learning_rate": 8.570515271691723e-05, - "loss": 0.4313, + "epoch": 2.7667057444314187, + "grad_norm": 0.27734375, + "learning_rate": 0.0002012072855695902, + "loss": 0.596, "step": 2360 }, { - "epoch": 4.520076481835564, - "grad_norm": 0.34765625, - "learning_rate": 8.523561253662864e-05, - "loss": 0.447, + "epoch": 2.7713950762016415, + "grad_norm": 0.30078125, + "learning_rate": 0.00020090743800647403, + "loss": 0.5867, "step": 2364 }, { - "epoch": 4.527724665391969, - "grad_norm": 0.357421875, - "learning_rate": 8.476685107248197e-05, - "loss": 0.4488, + "epoch": 2.776084407971864, + "grad_norm": 0.294921875, + "learning_rate": 0.0002006073603633483, + "loss": 0.6346, "step": 2368 }, { - "epoch": 4.5353728489483744, - "grad_norm": 0.33203125, - "learning_rate": 8.429887396078655e-05, - "loss": 0.469, + "epoch": 2.7807737397420866, + "grad_norm": 0.3046875, + "learning_rate": 0.00020030705399643646, + "loss": 0.6264, "step": 2372 }, { - "epoch": 4.54302103250478, - "grad_norm": 0.3515625, - "learning_rate": 8.38316868284207e-05, - "loss": 0.4886, + "epoch": 2.7854630715123094, + "grad_norm": 0.314453125, + "learning_rate": 0.00020000652026299593, + "loss": 0.655, "step": 2376 }, { - "epoch": 4.550669216061186, - "grad_norm": 0.341796875, - "learning_rate": 8.336529529276421e-05, - "loss": 0.433, + "epoch": 2.7901524032825322, + "grad_norm": 0.30859375, + "learning_rate": 0.0001997057605213115, + "loss": 0.606, "step": 2380 }, { - "epoch": 4.558317399617591, - "grad_norm": 0.33984375, - "learning_rate": 8.289970496163085e-05, - "loss": 0.4029, + "epoch": 2.794841735052755, + "grad_norm": 0.302734375, + "learning_rate": 0.00019940477613068964, + "loss": 0.5661, "step": 2384 }, { - "epoch": 4.5659655831739965, - "grad_norm": 0.322265625, - "learning_rate": 8.243492143320058e-05, - "loss": 0.4198, + "epoch": 2.799531066822978, + "grad_norm": 0.294921875, + "learning_rate": 0.00019910356845145196, + "loss": 0.598, "step": 2388 }, { - "epoch": 4.573613766730402, - "grad_norm": 0.341796875, - "learning_rate": 8.197095029595276e-05, - "loss": 0.4377, + "epoch": 2.8042203985932006, + "grad_norm": 0.2890625, + "learning_rate": 0.0001988021388449293, + "loss": 0.6477, "step": 2392 }, { - "epoch": 4.581261950286807, - "grad_norm": 0.359375, - "learning_rate": 8.150779712859854e-05, - "loss": 0.4263, + "epoch": 2.808909730363423, + "grad_norm": 0.287109375, + "learning_rate": 0.00019850048867345554, + "loss": 0.6005, "step": 2396 }, { - "epoch": 4.588910133843212, - "grad_norm": 0.349609375, - "learning_rate": 8.104546750001402e-05, - "loss": 0.4784, + "epoch": 2.8135990621336457, + "grad_norm": 0.28125, + "learning_rate": 0.0001981986193003614, + "loss": 0.584, "step": 2400 }, { - "epoch": 4.596558317399618, - "grad_norm": 0.337890625, - "learning_rate": 8.05839669691732e-05, - "loss": 0.4549, + "epoch": 2.8182883939038685, + "grad_norm": 0.2890625, + "learning_rate": 0.0001978965320899683, + "loss": 0.6111, "step": 2404 }, { - "epoch": 4.604206500956023, - "grad_norm": 0.33203125, - "learning_rate": 8.01233010850811e-05, - "loss": 0.4519, + "epoch": 2.8229777256740913, + "grad_norm": 0.28125, + "learning_rate": 0.00019759422840758228, + "loss": 0.6175, "step": 2408 }, { - "epoch": 4.611854684512428, - "grad_norm": 0.37109375, - "learning_rate": 7.966347538670712e-05, - "loss": 0.4242, + "epoch": 2.827667057444314, + "grad_norm": 0.30078125, + "learning_rate": 0.00019729170961948754, + "loss": 0.6083, "step": 2412 }, { - "epoch": 4.6195028680688335, - "grad_norm": 0.333984375, - "learning_rate": 7.92044954029184e-05, - "loss": 0.4647, + "epoch": 2.832356389214537, + "grad_norm": 0.3125, + "learning_rate": 0.00019698897709294062, + "loss": 0.5866, "step": 2416 }, { - "epoch": 4.627151051625239, - "grad_norm": 0.341796875, - "learning_rate": 7.874636665241335e-05, - "loss": 0.46, + "epoch": 2.8370457209847597, + "grad_norm": 0.294921875, + "learning_rate": 0.000196686032196164, + "loss": 0.5791, "step": 2420 }, { - "epoch": 4.634799235181644, - "grad_norm": 0.34375, - "learning_rate": 7.828909464365531e-05, - "loss": 0.4169, + "epoch": 2.8417350527549825, + "grad_norm": 0.306640625, + "learning_rate": 0.00019638287629834012, + "loss": 0.6533, "step": 2424 }, { - "epoch": 4.642447418738049, - "grad_norm": 0.349609375, - "learning_rate": 7.783268487480626e-05, - "loss": 0.4251, + "epoch": 2.8464243845252053, + "grad_norm": 0.302734375, + "learning_rate": 0.0001960795107696048, + "loss": 0.5994, "step": 2428 }, { - "epoch": 4.650095602294455, - "grad_norm": 0.337890625, - "learning_rate": 7.73771428336608e-05, - "loss": 0.4232, + "epoch": 2.851113716295428, + "grad_norm": 0.3125, + "learning_rate": 0.00019577593698104156, + "loss": 0.5925, "step": 2432 }, { - "epoch": 4.657743785850861, - "grad_norm": 0.3671875, - "learning_rate": 7.692247399758008e-05, - "loss": 0.4836, + "epoch": 2.855803048065651, + "grad_norm": 0.2890625, + "learning_rate": 0.00019547215630467504, + "loss": 0.5999, "step": 2436 }, { - "epoch": 4.665391969407266, - "grad_norm": 0.341796875, - "learning_rate": 7.6468683833426e-05, - "loss": 0.3917, + "epoch": 2.8604923798358732, + "grad_norm": 0.3046875, + "learning_rate": 0.00019516817011346494, + "loss": 0.6372, "step": 2440 }, { - "epoch": 4.673040152963671, - "grad_norm": 0.345703125, - "learning_rate": 7.601577779749545e-05, - "loss": 0.4153, + "epoch": 2.865181711606096, + "grad_norm": 0.30859375, + "learning_rate": 0.00019486397978129977, + "loss": 0.5615, "step": 2444 }, { - "epoch": 4.680688336520077, - "grad_norm": 0.33984375, - "learning_rate": 7.55637613354547e-05, - "loss": 0.4103, + "epoch": 2.869871043376319, + "grad_norm": 0.28515625, + "learning_rate": 0.00019455958668299075, + "loss": 0.5898, "step": 2448 }, { - "epoch": 4.688336520076482, - "grad_norm": 0.34765625, - "learning_rate": 7.511263988227397e-05, - "loss": 0.4858, + "epoch": 2.8745603751465416, + "grad_norm": 0.2890625, + "learning_rate": 0.0001942549921942654, + "loss": 0.6083, "step": 2452 }, { - "epoch": 4.695984703632887, - "grad_norm": 0.369140625, - "learning_rate": 7.466241886216198e-05, - "loss": 0.468, + "epoch": 2.8792497069167644, + "grad_norm": 0.3203125, + "learning_rate": 0.00019395019769176156, + "loss": 0.6447, "step": 2456 }, { - "epoch": 4.7036328871892925, - "grad_norm": 0.34765625, - "learning_rate": 7.421310368850085e-05, - "loss": 0.4817, + "epoch": 2.883939038686987, + "grad_norm": 0.275390625, + "learning_rate": 0.00019364520455302103, + "loss": 0.6103, "step": 2460 }, { - "epoch": 4.711281070745698, - "grad_norm": 0.35546875, - "learning_rate": 7.376469976378094e-05, - "loss": 0.3923, + "epoch": 2.88862837045721, + "grad_norm": 0.310546875, + "learning_rate": 0.0001933400141564833, + "loss": 0.5962, "step": 2464 }, { - "epoch": 4.718929254302103, - "grad_norm": 0.337890625, - "learning_rate": 7.33172124795359e-05, - "loss": 0.3954, + "epoch": 2.8933177022274323, + "grad_norm": 0.298828125, + "learning_rate": 0.00019303462788147933, + "loss": 0.636, "step": 2468 }, { - "epoch": 4.726577437858508, - "grad_norm": 0.345703125, - "learning_rate": 7.287064721627782e-05, - "loss": 0.4606, + "epoch": 2.898007033997655, + "grad_norm": 0.302734375, + "learning_rate": 0.00019272904710822551, + "loss": 0.5804, "step": 2472 }, { - "epoch": 4.734225621414914, - "grad_norm": 0.357421875, - "learning_rate": 7.242500934343262e-05, - "loss": 0.4253, + "epoch": 2.902696365767878, + "grad_norm": 0.296875, + "learning_rate": 0.00019242327321781726, + "loss": 0.6356, "step": 2476 }, { - "epoch": 4.741873804971319, - "grad_norm": 0.3671875, - "learning_rate": 7.19803042192754e-05, - "loss": 0.4431, + "epoch": 2.9073856975381007, + "grad_norm": 0.28515625, + "learning_rate": 0.00019211730759222272, + "loss": 0.574, "step": 2480 }, { - "epoch": 4.749521988527725, - "grad_norm": 0.322265625, - "learning_rate": 7.153653719086604e-05, - "loss": 0.4097, + "epoch": 2.9120750293083235, + "grad_norm": 0.28125, + "learning_rate": 0.00019181115161427662, + "loss": 0.6377, "step": 2484 }, { - "epoch": 4.75717017208413, - "grad_norm": 0.322265625, - "learning_rate": 7.109371359398493e-05, - "loss": 0.4208, + "epoch": 2.9167643610785463, + "grad_norm": 0.298828125, + "learning_rate": 0.00019150480666767407, + "loss": 0.6016, "step": 2488 }, { - "epoch": 4.764818355640536, - "grad_norm": 0.345703125, - "learning_rate": 7.06518387530688e-05, - "loss": 0.4396, + "epoch": 2.921453692848769, + "grad_norm": 0.29296875, + "learning_rate": 0.0001911982741369641, + "loss": 0.5675, "step": 2492 }, { - "epoch": 4.772466539196941, - "grad_norm": 0.328125, - "learning_rate": 7.021091798114667e-05, - "loss": 0.4212, + "epoch": 2.926143024618992, + "grad_norm": 0.29296875, + "learning_rate": 0.0001908915554075437, + "loss": 0.5886, "step": 2496 }, { - "epoch": 4.780114722753346, - "grad_norm": 0.33203125, - "learning_rate": 6.977095657977603e-05, - "loss": 0.4742, + "epoch": 2.9308323563892147, + "grad_norm": 0.27734375, + "learning_rate": 0.00019058465186565132, + "loss": 0.6028, "step": 2500 }, { - "epoch": 4.7877629063097515, - "grad_norm": 0.341796875, - "learning_rate": 6.933195983897905e-05, - "loss": 0.4198, + "epoch": 2.9355216881594375, + "grad_norm": 0.28515625, + "learning_rate": 0.0001902775648983606, + "loss": 0.5958, "step": 2504 }, { - "epoch": 4.795411089866157, - "grad_norm": 0.361328125, - "learning_rate": 6.889393303717898e-05, - "loss": 0.4401, + "epoch": 2.9402110199296603, + "grad_norm": 0.302734375, + "learning_rate": 0.00018997029589357443, + "loss": 0.5895, "step": 2508 }, { - "epoch": 4.803059273422562, - "grad_norm": 0.34765625, - "learning_rate": 6.845688144113663e-05, - "loss": 0.4273, + "epoch": 2.9449003516998826, + "grad_norm": 0.322265625, + "learning_rate": 0.00018966284624001814, + "loss": 0.6345, "step": 2512 }, { - "epoch": 4.810707456978967, - "grad_norm": 0.322265625, - "learning_rate": 6.802081030588722e-05, - "loss": 0.4589, + "epoch": 2.9495896834701054, + "grad_norm": 0.30078125, + "learning_rate": 0.00018935521732723376, + "loss": 0.5931, "step": 2516 }, { - "epoch": 4.818355640535373, - "grad_norm": 0.34765625, - "learning_rate": 6.758572487467698e-05, - "loss": 0.4835, + "epoch": 2.954279015240328, + "grad_norm": 0.29296875, + "learning_rate": 0.00018904741054557325, + "loss": 0.6009, "step": 2520 }, { - "epoch": 4.826003824091778, - "grad_norm": 0.34375, - "learning_rate": 6.715163037890021e-05, - "loss": 0.4493, + "epoch": 2.958968347010551, + "grad_norm": 0.31640625, + "learning_rate": 0.00018873942728619273, + "loss": 0.6011, "step": 2524 }, { - "epoch": 4.833652007648183, - "grad_norm": 0.3515625, - "learning_rate": 6.671853203803641e-05, - "loss": 0.4032, + "epoch": 2.963657678780774, + "grad_norm": 0.3125, + "learning_rate": 0.00018843126894104573, + "loss": 0.5769, "step": 2528 }, { - "epoch": 4.8413001912045885, - "grad_norm": 0.34765625, - "learning_rate": 6.628643505958742e-05, - "loss": 0.4498, + "epoch": 2.9683470105509966, + "grad_norm": 0.30078125, + "learning_rate": 0.00018812293690287715, + "loss": 0.5863, "step": 2532 }, { - "epoch": 4.848948374760994, - "grad_norm": 0.34375, - "learning_rate": 6.585534463901493e-05, - "loss": 0.4255, + "epoch": 2.9730363423212194, + "grad_norm": 0.296875, + "learning_rate": 0.00018781443256521695, + "loss": 0.5778, "step": 2536 }, { - "epoch": 4.8565965583174, - "grad_norm": 0.388671875, - "learning_rate": 6.542526595967795e-05, - "loss": 0.4285, + "epoch": 2.9777256740914417, + "grad_norm": 0.30859375, + "learning_rate": 0.00018750575732237379, + "loss": 0.6134, "step": 2540 }, { - "epoch": 4.864244741873805, - "grad_norm": 0.337890625, - "learning_rate": 6.499620419277036e-05, - "loss": 0.4507, + "epoch": 2.9824150058616645, + "grad_norm": 0.27734375, + "learning_rate": 0.00018719691256942868, + "loss": 0.558, "step": 2544 }, { - "epoch": 4.871892925430211, - "grad_norm": 0.333984375, - "learning_rate": 6.456816449725892e-05, - "loss": 0.4196, + "epoch": 2.9871043376318873, + "grad_norm": 0.2734375, + "learning_rate": 0.00018688789970222882, + "loss": 0.5734, "step": 2548 }, { - "epoch": 4.879541108986616, - "grad_norm": 0.345703125, - "learning_rate": 6.414115201982134e-05, - "loss": 0.4061, + "epoch": 2.99179366940211, + "grad_norm": 0.296875, + "learning_rate": 0.00018657872011738124, + "loss": 0.5789, "step": 2552 }, { - "epoch": 4.887189292543021, - "grad_norm": 0.33984375, - "learning_rate": 6.371517189478403e-05, - "loss": 0.4199, + "epoch": 2.996483001172333, + "grad_norm": 0.291015625, + "learning_rate": 0.0001862693752122463, + "loss": 0.6177, "step": 2556 }, { - "epoch": 4.894837476099426, - "grad_norm": 0.3515625, - "learning_rate": 6.329022924406061e-05, - "loss": 0.452, + "epoch": 3.0011723329425557, + "grad_norm": 0.271484375, + "learning_rate": 0.0001859598663849318, + "loss": 0.5135, "step": 2560 }, { - "epoch": 4.902485659655832, - "grad_norm": 0.345703125, - "learning_rate": 6.286632917709031e-05, - "loss": 0.4516, + "epoch": 3.0058616647127785, + "grad_norm": 0.2734375, + "learning_rate": 0.00018565019503428618, + "loss": 0.5306, "step": 2564 }, { - "epoch": 4.910133843212237, - "grad_norm": 0.33984375, - "learning_rate": 6.244347679077651e-05, - "loss": 0.4622, + "epoch": 3.0105509964830013, + "grad_norm": 0.3203125, + "learning_rate": 0.00018534036255989247, + "loss": 0.5166, "step": 2568 }, { - "epoch": 4.917782026768642, - "grad_norm": 0.333984375, - "learning_rate": 6.202167716942543e-05, - "loss": 0.4387, + "epoch": 3.015240328253224, + "grad_norm": 0.302734375, + "learning_rate": 0.00018503037036206194, + "loss": 0.5773, "step": 2572 }, { - "epoch": 4.925430210325048, - "grad_norm": 0.35546875, - "learning_rate": 6.160093538468505e-05, - "loss": 0.4064, + "epoch": 3.019929660023447, + "grad_norm": 0.298828125, + "learning_rate": 0.00018472021984182777, + "loss": 0.5381, "step": 2576 }, { - "epoch": 4.933078393881453, - "grad_norm": 0.3671875, - "learning_rate": 6.118125649548405e-05, - "loss": 0.4728, + "epoch": 3.024618991793669, + "grad_norm": 0.306640625, + "learning_rate": 0.00018440991240093862, + "loss": 0.5158, "step": 2580 }, { - "epoch": 4.940726577437858, - "grad_norm": 0.35546875, - "learning_rate": 6.076264554797112e-05, - "loss": 0.4555, + "epoch": 3.029308323563892, + "grad_norm": 0.330078125, + "learning_rate": 0.00018409944944185237, + "loss": 0.5442, "step": 2584 }, { - "epoch": 4.948374760994264, - "grad_norm": 0.337890625, - "learning_rate": 6.0345107575454105e-05, - "loss": 0.427, + "epoch": 3.033997655334115, + "grad_norm": 0.294921875, + "learning_rate": 0.0001837888323677299, + "loss": 0.5519, "step": 2588 }, { - "epoch": 4.95602294455067, - "grad_norm": 0.341796875, - "learning_rate": 5.992864759833963e-05, - "loss": 0.3841, + "epoch": 3.0386869871043376, + "grad_norm": 0.279296875, + "learning_rate": 0.0001834780625824285, + "loss": 0.525, "step": 2592 }, { - "epoch": 4.963671128107075, - "grad_norm": 0.330078125, - "learning_rate": 5.9513270624072655e-05, - "loss": 0.4289, + "epoch": 3.0433763188745604, + "grad_norm": 0.322265625, + "learning_rate": 0.0001831671414904956, + "loss": 0.5157, "step": 2596 }, { - "epoch": 4.97131931166348, - "grad_norm": 0.353515625, - "learning_rate": 5.9098981647076345e-05, - "loss": 0.4538, + "epoch": 3.048065650644783, + "grad_norm": 0.306640625, + "learning_rate": 0.00018285607049716256, + "loss": 0.5142, "step": 2600 }, { - "epoch": 4.9789674952198855, - "grad_norm": 0.33203125, - "learning_rate": 5.8685785648691894e-05, - "loss": 0.3954, + "epoch": 3.052754982415006, + "grad_norm": 0.29296875, + "learning_rate": 0.0001825448510083383, + "loss": 0.4794, "step": 2604 }, { - "epoch": 4.986615678776291, - "grad_norm": 0.34765625, - "learning_rate": 5.8273687597118765e-05, - "loss": 0.4614, + "epoch": 3.0574443141852288, + "grad_norm": 0.298828125, + "learning_rate": 0.00018223348443060274, + "loss": 0.4835, "step": 2608 }, { - "epoch": 4.994263862332696, - "grad_norm": 0.3359375, - "learning_rate": 5.786269244735488e-05, - "loss": 0.4429, + "epoch": 3.0621336459554516, + "grad_norm": 0.318359375, + "learning_rate": 0.00018192197217120067, + "loss": 0.4776, "step": 2612 }, { - "epoch": 5.001912045889101, - "grad_norm": 0.30859375, - "learning_rate": 5.7452805141137034e-05, - "loss": 0.3411, + "epoch": 3.066822977725674, + "grad_norm": 0.29296875, + "learning_rate": 0.00018161031563803523, + "loss": 0.5137, "step": 2616 }, { - "epoch": 5.009560229445507, - "grad_norm": 0.318359375, - "learning_rate": 5.704403060688158e-05, - "loss": 0.3973, + "epoch": 3.0715123094958967, + "grad_norm": 0.294921875, + "learning_rate": 0.00018129851623966168, + "loss": 0.5287, "step": 2620 }, { - "epoch": 5.017208413001912, - "grad_norm": 0.314453125, - "learning_rate": 5.663637375962489e-05, - "loss": 0.4319, + "epoch": 3.0762016412661195, + "grad_norm": 0.2890625, + "learning_rate": 0.0001809865753852809, + "loss": 0.5198, "step": 2624 }, { - "epoch": 5.024856596558317, - "grad_norm": 0.3203125, - "learning_rate": 5.6229839500964635e-05, - "loss": 0.3768, + "epoch": 3.0808909730363423, + "grad_norm": 0.294921875, + "learning_rate": 0.00018067449448473321, + "loss": 0.4759, "step": 2628 }, { - "epoch": 5.0325047801147225, - "grad_norm": 0.314453125, - "learning_rate": 5.582443271900063e-05, - "loss": 0.3683, + "epoch": 3.085580304806565, + "grad_norm": 0.306640625, + "learning_rate": 0.00018036227494849173, + "loss": 0.4228, "step": 2632 }, { - "epoch": 5.040152963671128, - "grad_norm": 0.35546875, - "learning_rate": 5.542015828827609e-05, - "loss": 0.4145, + "epoch": 3.090269636576788, + "grad_norm": 0.296875, + "learning_rate": 0.00018004991818765625, + "loss": 0.5385, "step": 2636 }, { - "epoch": 5.047801147227533, - "grad_norm": 0.35546875, - "learning_rate": 5.5017021069719014e-05, - "loss": 0.4116, + "epoch": 3.0949589683470107, + "grad_norm": 0.30859375, + "learning_rate": 0.00017973742561394675, + "loss": 0.5078, "step": 2640 }, { - "epoch": 5.055449330783939, - "grad_norm": 0.3359375, - "learning_rate": 5.4615025910583756e-05, - "loss": 0.4069, + "epoch": 3.0996483001172335, + "grad_norm": 0.29296875, + "learning_rate": 0.000179424798639697, + "loss": 0.4977, "step": 2644 }, { - "epoch": 5.0630975143403445, - "grad_norm": 0.35546875, - "learning_rate": 5.421417764439276e-05, - "loss": 0.375, + "epoch": 3.1043376318874563, + "grad_norm": 0.322265625, + "learning_rate": 0.00017911203867784819, + "loss": 0.5108, "step": 2648 }, { - "epoch": 5.07074569789675, - "grad_norm": 0.333984375, - "learning_rate": 5.3814481090878374e-05, - "loss": 0.3576, + "epoch": 3.1090269636576786, + "grad_norm": 0.322265625, + "learning_rate": 0.0001787991471419426, + "loss": 0.5151, "step": 2652 }, { - "epoch": 5.078393881453155, - "grad_norm": 0.337890625, - "learning_rate": 5.3415941055924974e-05, - "loss": 0.4027, + "epoch": 3.1137162954279014, + "grad_norm": 0.302734375, + "learning_rate": 0.00017848612544611714, + "loss": 0.5332, "step": 2656 }, { - "epoch": 5.08604206500956, - "grad_norm": 0.328125, - "learning_rate": 5.301856233151123e-05, - "loss": 0.3981, + "epoch": 3.118405627198124, + "grad_norm": 0.306640625, + "learning_rate": 0.00017817297500509702, + "loss": 0.5177, "step": 2660 }, { - "epoch": 5.093690248565966, - "grad_norm": 0.318359375, - "learning_rate": 5.262234969565226e-05, - "loss": 0.3928, + "epoch": 3.123094958968347, + "grad_norm": 0.30078125, + "learning_rate": 0.0001778596972341893, + "loss": 0.5166, "step": 2664 }, { - "epoch": 5.101338432122371, - "grad_norm": 0.33203125, - "learning_rate": 5.222730791234246e-05, - "loss": 0.3813, + "epoch": 3.12778429073857, + "grad_norm": 0.30859375, + "learning_rate": 0.00017754629354927655, + "loss": 0.4979, "step": 2668 }, { - "epoch": 5.108986615678776, - "grad_norm": 0.3671875, - "learning_rate": 5.183344173149798e-05, - "loss": 0.4151, + "epoch": 3.1324736225087926, + "grad_norm": 0.30078125, + "learning_rate": 0.00017723276536681025, + "loss": 0.5008, "step": 2672 }, { - "epoch": 5.1166347992351815, - "grad_norm": 0.35546875, - "learning_rate": 5.14407558888998e-05, - "loss": 0.4051, + "epoch": 3.1371629542790154, + "grad_norm": 0.2890625, + "learning_rate": 0.00017691911410380485, + "loss": 0.5153, "step": 2676 }, { - "epoch": 5.124282982791587, - "grad_norm": 0.330078125, - "learning_rate": 5.104925510613668e-05, - "loss": 0.3973, + "epoch": 3.141852286049238, + "grad_norm": 0.306640625, + "learning_rate": 0.00017660534117783084, + "loss": 0.4782, "step": 2680 }, { - "epoch": 5.131931166347992, - "grad_norm": 0.3203125, - "learning_rate": 5.0658944090548436e-05, - "loss": 0.3958, + "epoch": 3.1465416178194605, + "grad_norm": 0.30078125, + "learning_rate": 0.00017629144800700866, + "loss": 0.4542, "step": 2684 }, { - "epoch": 5.139579349904397, - "grad_norm": 0.34765625, - "learning_rate": 5.0269827535169306e-05, - "loss": 0.3731, + "epoch": 3.1512309495896833, + "grad_norm": 0.298828125, + "learning_rate": 0.00017597743601000218, + "loss": 0.5261, "step": 2688 }, { - "epoch": 5.147227533460803, - "grad_norm": 0.33984375, - "learning_rate": 4.988191011867153e-05, - "loss": 0.4028, + "epoch": 3.155920281359906, + "grad_norm": 0.337890625, + "learning_rate": 0.00017566330660601236, + "loss": 0.4949, "step": 2692 }, { - "epoch": 5.154875717017209, - "grad_norm": 0.36328125, - "learning_rate": 4.9495196505309196e-05, - "loss": 0.4201, + "epoch": 3.160609613130129, + "grad_norm": 0.318359375, + "learning_rate": 0.0001753490612147707, + "loss": 0.5243, "step": 2696 }, { - "epoch": 5.162523900573614, - "grad_norm": 0.345703125, - "learning_rate": 4.9109691344861886e-05, - "loss": 0.3815, + "epoch": 3.1652989449003517, + "grad_norm": 0.310546875, + "learning_rate": 0.00017503470125653309, + "loss": 0.5316, "step": 2700 }, { - "epoch": 5.170172084130019, - "grad_norm": 0.326171875, - "learning_rate": 4.8725399272579075e-05, - "loss": 0.4043, + "epoch": 3.1699882766705745, + "grad_norm": 0.3125, + "learning_rate": 0.00017472022815207295, + "loss": 0.5134, "step": 2704 }, { - "epoch": 5.177820267686425, - "grad_norm": 0.330078125, - "learning_rate": 4.8342324909124256e-05, - "loss": 0.3828, + "epoch": 3.1746776084407973, + "grad_norm": 0.306640625, + "learning_rate": 0.0001744056433226753, + "loss": 0.543, "step": 2708 }, { - "epoch": 5.18546845124283, - "grad_norm": 0.34765625, - "learning_rate": 4.7960472860519365e-05, - "loss": 0.4199, + "epoch": 3.17936694021102, + "grad_norm": 0.314453125, + "learning_rate": 0.00017409094819013002, + "loss": 0.5009, "step": 2712 }, { - "epoch": 5.193116634799235, - "grad_norm": 0.330078125, - "learning_rate": 4.757984771808947e-05, - "loss": 0.4071, + "epoch": 3.184056271981243, + "grad_norm": 0.333984375, + "learning_rate": 0.00017377614417672554, + "loss": 0.5103, "step": 2716 }, { - "epoch": 5.2007648183556405, - "grad_norm": 0.34765625, - "learning_rate": 4.72004540584075e-05, - "loss": 0.4288, + "epoch": 3.1887456037514657, + "grad_norm": 0.318359375, + "learning_rate": 0.0001734612327052423, + "loss": 0.508, "step": 2720 }, { - "epoch": 5.208413001912046, - "grad_norm": 0.353515625, - "learning_rate": 4.682229644323922e-05, - "loss": 0.3935, + "epoch": 3.193434935521688, + "grad_norm": 0.291015625, + "learning_rate": 0.00017314621519894652, + "loss": 0.4487, "step": 2724 }, { - "epoch": 5.216061185468451, - "grad_norm": 0.37109375, - "learning_rate": 4.6445379419488436e-05, - "loss": 0.3801, + "epoch": 3.198124267291911, + "grad_norm": 0.310546875, + "learning_rate": 0.00017283109308158362, + "loss": 0.5308, "step": 2728 }, { - "epoch": 5.223709369024856, - "grad_norm": 0.33203125, - "learning_rate": 4.606970751914229e-05, - "loss": 0.4181, + "epoch": 3.2028135990621336, + "grad_norm": 0.3203125, + "learning_rate": 0.00017251586777737175, + "loss": 0.5029, "step": 2732 }, { - "epoch": 5.231357552581262, - "grad_norm": 0.341796875, - "learning_rate": 4.569528525921672e-05, - "loss": 0.3816, + "epoch": 3.2075029308323564, + "grad_norm": 0.3046875, + "learning_rate": 0.00017220054071099555, + "loss": 0.5274, "step": 2736 }, { - "epoch": 5.239005736137667, - "grad_norm": 0.349609375, - "learning_rate": 4.532211714170229e-05, - "loss": 0.4331, + "epoch": 3.212192262602579, + "grad_norm": 0.318359375, + "learning_rate": 0.0001718851133075994, + "loss": 0.5138, "step": 2740 }, { - "epoch": 5.246653919694072, - "grad_norm": 0.365234375, - "learning_rate": 4.495020765350988e-05, - "loss": 0.4331, + "epoch": 3.216881594372802, + "grad_norm": 0.3203125, + "learning_rate": 0.00017156958699278134, + "loss": 0.5345, "step": 2744 }, { - "epoch": 5.254302103250478, - "grad_norm": 0.359375, - "learning_rate": 4.4579561266416855e-05, - "loss": 0.3889, + "epoch": 3.2215709261430248, + "grad_norm": 0.322265625, + "learning_rate": 0.00017125396319258635, + "loss": 0.5127, "step": 2748 }, { - "epoch": 5.261950286806884, - "grad_norm": 0.328125, - "learning_rate": 4.421018243701327e-05, - "loss": 0.3804, + "epoch": 3.2262602579132476, + "grad_norm": 0.298828125, + "learning_rate": 0.0001709382433335, + "loss": 0.4827, "step": 2752 }, { - "epoch": 5.269598470363289, - "grad_norm": 0.55859375, - "learning_rate": 4.384207560664825e-05, - "loss": 0.4386, + "epoch": 3.23094958968347, + "grad_norm": 0.322265625, + "learning_rate": 0.00017062242884244213, + "loss": 0.4796, "step": 2756 }, { - "epoch": 5.277246653919694, - "grad_norm": 0.34375, - "learning_rate": 4.347524520137667e-05, - "loss": 0.3547, + "epoch": 3.2356389214536927, + "grad_norm": 0.314453125, + "learning_rate": 0.00017030652114676003, + "loss": 0.4927, "step": 2760 }, { - "epoch": 5.2848948374761, - "grad_norm": 0.337890625, - "learning_rate": 4.310969563190578e-05, - "loss": 0.3719, + "epoch": 3.2403282532239155, + "grad_norm": 0.30078125, + "learning_rate": 0.00016999052167422247, + "loss": 0.5091, "step": 2764 }, { - "epoch": 5.292543021032505, - "grad_norm": 0.328125, - "learning_rate": 4.274543129354245e-05, - "loss": 0.3611, + "epoch": 3.2450175849941383, + "grad_norm": 0.3203125, + "learning_rate": 0.00016967443185301293, + "loss": 0.5312, "step": 2768 }, { - "epoch": 5.30019120458891, - "grad_norm": 0.3125, - "learning_rate": 4.2382456566139985e-05, - "loss": 0.3173, + "epoch": 3.249706916764361, + "grad_norm": 0.3203125, + "learning_rate": 0.00016935825311172322, + "loss": 0.5339, "step": 2772 }, { - "epoch": 5.307839388145315, - "grad_norm": 0.345703125, - "learning_rate": 4.202077581404574e-05, - "loss": 0.3848, + "epoch": 3.254396248534584, + "grad_norm": 0.3359375, + "learning_rate": 0.00016904198687934697, + "loss": 0.5263, "step": 2776 }, { - "epoch": 5.315487571701721, - "grad_norm": 0.330078125, - "learning_rate": 4.166039338604838e-05, - "loss": 0.3921, + "epoch": 3.2590855803048067, + "grad_norm": 0.31640625, + "learning_rate": 0.00016872563458527332, + "loss": 0.4754, "step": 2780 }, { - "epoch": 5.323135755258126, - "grad_norm": 0.318359375, - "learning_rate": 4.130131361532586e-05, - "loss": 0.4049, + "epoch": 3.2637749120750295, + "grad_norm": 0.3359375, + "learning_rate": 0.0001684091976592804, + "loss": 0.5075, "step": 2784 }, { - "epoch": 5.330783938814531, - "grad_norm": 0.35546875, - "learning_rate": 4.094354081939317e-05, - "loss": 0.3866, + "epoch": 3.2684642438452522, + "grad_norm": 0.3203125, + "learning_rate": 0.00016809267753152871, + "loss": 0.5461, "step": 2788 }, { - "epoch": 5.338432122370937, - "grad_norm": 0.365234375, - "learning_rate": 4.058707930005048e-05, - "loss": 0.3727, + "epoch": 3.273153575615475, + "grad_norm": 0.3125, + "learning_rate": 0.00016777607563255498, + "loss": 0.5318, "step": 2792 }, { - "epoch": 5.346080305927342, - "grad_norm": 0.341796875, - "learning_rate": 4.023193334333132e-05, - "loss": 0.3957, + "epoch": 3.2778429073856974, + "grad_norm": 0.326171875, + "learning_rate": 0.00016745939339326532, + "loss": 0.5415, "step": 2796 }, { - "epoch": 5.353728489483748, - "grad_norm": 0.341796875, - "learning_rate": 3.9878107219451206e-05, - "loss": 0.344, + "epoch": 3.28253223915592, + "grad_norm": 0.306640625, + "learning_rate": 0.000167142632244929, + "loss": 0.5451, "step": 2800 }, { - "epoch": 5.361376673040153, - "grad_norm": 0.375, - "learning_rate": 3.9525605182756134e-05, - "loss": 0.4048, + "epoch": 3.287221570926143, + "grad_norm": 0.298828125, + "learning_rate": 0.00016682579361917196, + "loss": 0.5004, "step": 2804 }, { - "epoch": 5.369024856596559, - "grad_norm": 0.349609375, - "learning_rate": 3.917443147167152e-05, - "loss": 0.4343, + "epoch": 3.2919109026963658, + "grad_norm": 0.32421875, + "learning_rate": 0.00016650887894797029, + "loss": 0.5172, "step": 2808 }, { - "epoch": 5.376673040152964, - "grad_norm": 0.345703125, - "learning_rate": 3.882459030865124e-05, - "loss": 0.3908, + "epoch": 3.2966002344665886, + "grad_norm": 0.3359375, + "learning_rate": 0.00016619188966364383, + "loss": 0.5349, "step": 2812 }, { - "epoch": 5.384321223709369, - "grad_norm": 0.34765625, - "learning_rate": 3.8476085900126776e-05, - "loss": 0.3491, + "epoch": 3.3012895662368114, + "grad_norm": 0.3125, + "learning_rate": 0.0001658748271988495, + "loss": 0.5499, "step": 2816 }, { - "epoch": 5.3919694072657744, - "grad_norm": 0.3515625, - "learning_rate": 3.8128922436456766e-05, - "loss": 0.4092, + "epoch": 3.305978898007034, + "grad_norm": 0.318359375, + "learning_rate": 0.00016555769298657515, + "loss": 0.5422, "step": 2820 }, { - "epoch": 5.39961759082218, - "grad_norm": 0.375, - "learning_rate": 3.7783104091876524e-05, - "loss": 0.3766, + "epoch": 3.3106682297772565, + "grad_norm": 0.326171875, + "learning_rate": 0.00016524048846013265, + "loss": 0.5366, "step": 2824 }, { - "epoch": 5.407265774378585, - "grad_norm": 0.34375, - "learning_rate": 3.743863502444783e-05, - "loss": 0.3835, + "epoch": 3.3153575615474793, + "grad_norm": 0.287109375, + "learning_rate": 0.00016492321505315194, + "loss": 0.5024, "step": 2828 }, { - "epoch": 5.41491395793499, - "grad_norm": 0.33203125, - "learning_rate": 3.709551937600909e-05, - "loss": 0.3733, + "epoch": 3.320046893317702, + "grad_norm": 0.318359375, + "learning_rate": 0.00016460587419957407, + "loss": 0.5025, "step": 2832 }, { - "epoch": 5.422562141491396, - "grad_norm": 0.34375, - "learning_rate": 3.675376127212532e-05, - "loss": 0.4127, + "epoch": 3.324736225087925, + "grad_norm": 0.333984375, + "learning_rate": 0.00016428846733364502, + "loss": 0.4966, "step": 2836 }, { - "epoch": 5.430210325047801, - "grad_norm": 0.361328125, - "learning_rate": 3.64133648220387e-05, - "loss": 0.4091, + "epoch": 3.3294255568581477, + "grad_norm": 0.310546875, + "learning_rate": 0.00016397099588990902, + "loss": 0.5281, "step": 2840 }, { - "epoch": 5.437858508604206, - "grad_norm": 0.365234375, - "learning_rate": 3.607433411861912e-05, - "loss": 0.4612, + "epoch": 3.3341148886283705, + "grad_norm": 0.318359375, + "learning_rate": 0.00016365346130320233, + "loss": 0.4967, "step": 2844 }, { - "epoch": 5.4455066921606115, - "grad_norm": 0.330078125, - "learning_rate": 3.5736673238314914e-05, - "loss": 0.3668, + "epoch": 3.3388042203985933, + "grad_norm": 0.3046875, + "learning_rate": 0.00016333586500864647, + "loss": 0.4664, "step": 2848 }, { - "epoch": 5.453154875717018, - "grad_norm": 0.3359375, - "learning_rate": 3.5400386241103946e-05, - "loss": 0.3645, + "epoch": 3.343493552168816, + "grad_norm": 0.33203125, + "learning_rate": 0.00016301820844164176, + "loss": 0.5526, "step": 2852 }, { - "epoch": 5.460803059273423, - "grad_norm": 0.33984375, - "learning_rate": 3.506547717044472e-05, - "loss": 0.348, + "epoch": 3.348182883939039, + "grad_norm": 0.318359375, + "learning_rate": 0.00016270049303786113, + "loss": 0.5298, "step": 2856 }, { - "epoch": 5.468451242829828, - "grad_norm": 0.33984375, - "learning_rate": 3.473195005322776e-05, - "loss": 0.3812, + "epoch": 3.3528722157092616, + "grad_norm": 0.318359375, + "learning_rate": 0.0001623827202332433, + "loss": 0.5579, "step": 2860 }, { - "epoch": 5.4760994263862335, - "grad_norm": 0.333984375, - "learning_rate": 3.439980889972723e-05, - "loss": 0.3705, + "epoch": 3.357561547479484, + "grad_norm": 0.30078125, + "learning_rate": 0.00016206489146398655, + "loss": 0.5597, "step": 2864 }, { - "epoch": 5.483747609942639, - "grad_norm": 0.35546875, - "learning_rate": 3.406905770355274e-05, - "loss": 0.3687, + "epoch": 3.362250879249707, + "grad_norm": 0.318359375, + "learning_rate": 0.0001617470081665419, + "loss": 0.5362, "step": 2868 }, { - "epoch": 5.491395793499044, - "grad_norm": 0.32421875, - "learning_rate": 3.373970044160121e-05, - "loss": 0.4348, + "epoch": 3.3669402110199296, + "grad_norm": 0.333984375, + "learning_rate": 0.0001614290717776069, + "loss": 0.5342, "step": 2872 }, { - "epoch": 5.499043977055449, - "grad_norm": 0.36328125, - "learning_rate": 3.341174107400916e-05, - "loss": 0.4039, + "epoch": 3.3716295427901524, + "grad_norm": 0.318359375, + "learning_rate": 0.0001611110837341191, + "loss": 0.4736, "step": 2876 }, { - "epoch": 5.506692160611855, - "grad_norm": 0.3359375, - "learning_rate": 3.30851835441051e-05, - "loss": 0.4072, + "epoch": 3.376318874560375, + "grad_norm": 0.33203125, + "learning_rate": 0.0001607930454732495, + "loss": 0.5421, "step": 2880 }, { - "epoch": 5.51434034416826, - "grad_norm": 0.353515625, - "learning_rate": 3.276003177836203e-05, - "loss": 0.3796, + "epoch": 3.381008206330598, + "grad_norm": 0.296875, + "learning_rate": 0.000160474958432396, + "loss": 0.4857, "step": 2884 }, { - "epoch": 5.521988527724665, - "grad_norm": 0.345703125, - "learning_rate": 3.2436289686350285e-05, - "loss": 0.3647, + "epoch": 3.3856975381008207, + "grad_norm": 0.328125, + "learning_rate": 0.000160156824049177, + "loss": 0.5453, "step": 2888 }, { - "epoch": 5.5296367112810705, - "grad_norm": 0.341796875, - "learning_rate": 3.211396116069055e-05, - "loss": 0.3895, + "epoch": 3.3903868698710435, + "grad_norm": 0.3203125, + "learning_rate": 0.00015983864376142482, + "loss": 0.5494, "step": 2892 }, { - "epoch": 5.537284894837476, - "grad_norm": 0.34375, - "learning_rate": 3.179305007700697e-05, - "loss": 0.3689, + "epoch": 3.395076201641266, + "grad_norm": 0.337890625, + "learning_rate": 0.0001595204190071794, + "loss": 0.5328, "step": 2896 }, { - "epoch": 5.544933078393882, - "grad_norm": 0.349609375, - "learning_rate": 3.147356029388067e-05, - "loss": 0.3782, + "epoch": 3.3997655334114887, + "grad_norm": 0.314453125, + "learning_rate": 0.00015920215122468146, + "loss": 0.4856, "step": 2900 }, { - "epoch": 5.552581261950287, - "grad_norm": 0.349609375, - "learning_rate": 3.115549565280325e-05, - "loss": 0.3985, + "epoch": 3.4044548651817115, + "grad_norm": 0.3203125, + "learning_rate": 0.00015888384185236632, + "loss": 0.4919, "step": 2904 }, { - "epoch": 5.5602294455066925, - "grad_norm": 0.341796875, - "learning_rate": 3.083885997813066e-05, - "loss": 0.4289, + "epoch": 3.4091441969519343, + "grad_norm": 0.3125, + "learning_rate": 0.00015856549232885712, + "loss": 0.5082, "step": 2908 }, { - "epoch": 5.567877629063098, - "grad_norm": 0.33984375, - "learning_rate": 3.052365707703718e-05, - "loss": 0.436, + "epoch": 3.413833528722157, + "grad_norm": 0.333984375, + "learning_rate": 0.00015824710409295868, + "loss": 0.4919, "step": 2912 }, { - "epoch": 5.575525812619503, - "grad_norm": 0.341796875, - "learning_rate": 3.0209890739469693e-05, - "loss": 0.387, + "epoch": 3.41852286049238, + "grad_norm": 0.3125, + "learning_rate": 0.0001579286785836506, + "loss": 0.5208, "step": 2916 }, { - "epoch": 5.583173996175908, - "grad_norm": 0.326171875, - "learning_rate": 2.989756473810203e-05, - "loss": 0.4034, + "epoch": 3.4232121922626026, + "grad_norm": 0.32421875, + "learning_rate": 0.0001576102172400811, + "loss": 0.5251, "step": 2920 }, { - "epoch": 5.590822179732314, - "grad_norm": 0.3671875, - "learning_rate": 2.9586682828289738e-05, - "loss": 0.4206, + "epoch": 3.4279015240328254, + "grad_norm": 0.33984375, + "learning_rate": 0.0001572917215015602, + "loss": 0.4927, "step": 2924 }, { - "epoch": 5.598470363288719, - "grad_norm": 0.36328125, - "learning_rate": 2.9277248748024763e-05, - "loss": 0.4529, + "epoch": 3.4325908558030482, + "grad_norm": 0.30078125, + "learning_rate": 0.00015697319280755343, + "loss": 0.5404, "step": 2928 }, { - "epoch": 5.606118546845124, - "grad_norm": 0.330078125, - "learning_rate": 2.8969266217890648e-05, - "loss": 0.3527, + "epoch": 3.437280187573271, + "grad_norm": 0.328125, + "learning_rate": 0.00015665463259767525, + "loss": 0.5198, "step": 2932 }, { - "epoch": 5.6137667304015295, - "grad_norm": 0.365234375, - "learning_rate": 2.866273894101776e-05, - "loss": 0.389, + "epoch": 3.4419695193434934, + "grad_norm": 0.326171875, + "learning_rate": 0.00015633604231168264, + "loss": 0.5321, "step": 2936 }, { - "epoch": 5.621414913957935, - "grad_norm": 0.361328125, - "learning_rate": 2.835767060303865e-05, - "loss": 0.4017, + "epoch": 3.446658851113716, + "grad_norm": 0.3125, + "learning_rate": 0.00015601742338946844, + "loss": 0.5492, "step": 2940 }, { - "epoch": 5.62906309751434, - "grad_norm": 0.353515625, - "learning_rate": 2.8054064872043917e-05, - "loss": 0.3973, + "epoch": 3.451348182883939, + "grad_norm": 0.328125, + "learning_rate": 0.00015569877727105493, + "loss": 0.5115, "step": 2944 }, { - "epoch": 5.636711281070745, - "grad_norm": 0.375, - "learning_rate": 2.7751925398537993e-05, - "loss": 0.4281, + "epoch": 3.4560375146541618, + "grad_norm": 0.31640625, + "learning_rate": 0.00015538010539658728, + "loss": 0.4897, "step": 2948 }, { - "epoch": 5.644359464627151, - "grad_norm": 0.330078125, - "learning_rate": 2.745125581539523e-05, - "loss": 0.4071, + "epoch": 3.4607268464243846, + "grad_norm": 0.322265625, + "learning_rate": 0.00015506140920632707, + "loss": 0.5417, "step": 2952 }, { - "epoch": 5.652007648183556, - "grad_norm": 0.373046875, - "learning_rate": 2.7152059737816395e-05, - "loss": 0.3866, + "epoch": 3.4654161781946073, + "grad_norm": 0.333984375, + "learning_rate": 0.0001547426901406458, + "loss": 0.5028, "step": 2956 }, { - "epoch": 5.659655831739962, - "grad_norm": 0.32421875, - "learning_rate": 2.6854340763284954e-05, - "loss": 0.4029, + "epoch": 3.47010550996483, + "grad_norm": 0.318359375, + "learning_rate": 0.00015442394964001842, + "loss": 0.5215, "step": 2960 }, { - "epoch": 5.667304015296367, - "grad_norm": 0.369140625, - "learning_rate": 2.6558102471523975e-05, - "loss": 0.4207, + "epoch": 3.474794841735053, + "grad_norm": 0.33984375, + "learning_rate": 0.00015410518914501655, + "loss": 0.5129, "step": 2964 }, { - "epoch": 5.674952198852773, - "grad_norm": 0.365234375, - "learning_rate": 2.6263348424453012e-05, - "loss": 0.3769, + "epoch": 3.4794841735052753, + "grad_norm": 0.33203125, + "learning_rate": 0.00015378641009630242, + "loss": 0.5582, "step": 2968 }, { - "epoch": 5.682600382409178, - "grad_norm": 0.337890625, - "learning_rate": 2.597008216614534e-05, - "loss": 0.3527, + "epoch": 3.484173505275498, + "grad_norm": 0.341796875, + "learning_rate": 0.000153467613934622, + "loss": 0.5316, "step": 2972 }, { - "epoch": 5.690248565965583, - "grad_norm": 0.345703125, - "learning_rate": 2.5678307222785315e-05, - "loss": 0.4091, + "epoch": 3.488862837045721, + "grad_norm": 0.310546875, + "learning_rate": 0.00015314880210079863, + "loss": 0.5291, "step": 2976 }, { - "epoch": 5.6978967495219885, - "grad_norm": 0.328125, - "learning_rate": 2.5388027102625945e-05, - "loss": 0.4175, + "epoch": 3.4935521688159437, + "grad_norm": 0.333984375, + "learning_rate": 0.00015282997603572639, + "loss": 0.486, "step": 2980 }, { - "epoch": 5.705544933078394, - "grad_norm": 0.349609375, - "learning_rate": 2.5099245295946764e-05, - "loss": 0.3557, + "epoch": 3.4982415005861665, + "grad_norm": 0.318359375, + "learning_rate": 0.00015251113718036378, + "loss": 0.5083, "step": 2984 }, { - "epoch": 5.713193116634799, - "grad_norm": 0.328125, - "learning_rate": 2.4811965275011825e-05, - "loss": 0.4021, + "epoch": 3.5029308323563892, + "grad_norm": 0.3125, + "learning_rate": 0.0001521922869757271, + "loss": 0.521, "step": 2988 }, { - "epoch": 5.720841300191204, - "grad_norm": 0.33203125, - "learning_rate": 2.4526190494027953e-05, - "loss": 0.3868, + "epoch": 3.507620164126612, + "grad_norm": 0.32421875, + "learning_rate": 0.0001518734268628839, + "loss": 0.5537, "step": 2992 }, { - "epoch": 5.72848948374761, - "grad_norm": 0.369140625, - "learning_rate": 2.4241924389103227e-05, - "loss": 0.4399, + "epoch": 3.512309495896835, + "grad_norm": 0.322265625, + "learning_rate": 0.00015155455828294657, + "loss": 0.4843, "step": 2996 }, { - "epoch": 5.736137667304015, - "grad_norm": 0.3359375, - "learning_rate": 2.395917037820566e-05, - "loss": 0.3907, + "epoch": 3.5169988276670576, + "grad_norm": 0.32421875, + "learning_rate": 0.00015123568267706575, + "loss": 0.5133, "step": 3000 }, { - "epoch": 5.743785850860421, - "grad_norm": 0.349609375, - "learning_rate": 2.3677931861122084e-05, - "loss": 0.4195, + "epoch": 3.5216881594372804, + "grad_norm": 0.30859375, + "learning_rate": 0.00015091680148642371, + "loss": 0.4921, "step": 3004 }, { - "epoch": 5.751434034416826, - "grad_norm": 0.3359375, - "learning_rate": 2.339821221941731e-05, - "loss": 0.3867, + "epoch": 3.5263774912075028, + "grad_norm": 0.328125, + "learning_rate": 0.00015059791615222817, + "loss": 0.5169, "step": 3008 }, { - "epoch": 5.759082217973232, - "grad_norm": 0.3359375, - "learning_rate": 2.312001481639348e-05, - "loss": 0.3583, + "epoch": 3.5310668229777256, + "grad_norm": 0.322265625, + "learning_rate": 0.00015027902811570544, + "loss": 0.5067, "step": 3012 }, { - "epoch": 5.766730401529637, - "grad_norm": 0.3359375, - "learning_rate": 2.2843342997049445e-05, - "loss": 0.3527, + "epoch": 3.5357561547479484, + "grad_norm": 0.2890625, + "learning_rate": 0.00014996013881809402, + "loss": 0.5611, "step": 3016 }, { - "epoch": 5.774378585086042, - "grad_norm": 0.30859375, - "learning_rate": 2.2568200088040867e-05, - "loss": 0.3393, + "epoch": 3.540445486518171, + "grad_norm": 0.328125, + "learning_rate": 0.00014964124970063829, + "loss": 0.5201, "step": 3020 }, { - "epoch": 5.782026768642448, - "grad_norm": 0.359375, - "learning_rate": 2.2294589397639978e-05, - "loss": 0.4225, + "epoch": 3.545134818288394, + "grad_norm": 0.328125, + "learning_rate": 0.0001493223622045816, + "loss": 0.5519, "step": 3024 }, { - "epoch": 5.789674952198853, - "grad_norm": 0.357421875, - "learning_rate": 2.2022514215695842e-05, - "loss": 0.4191, + "epoch": 3.5498241500586167, + "grad_norm": 0.328125, + "learning_rate": 0.00014900347777116007, + "loss": 0.5112, "step": 3028 }, { - "epoch": 5.797323135755258, - "grad_norm": 0.341796875, - "learning_rate": 2.175197781359485e-05, - "loss": 0.3792, + "epoch": 3.5545134818288395, + "grad_norm": 0.32421875, + "learning_rate": 0.00014868459784159603, + "loss": 0.5378, "step": 3032 }, { - "epoch": 5.804971319311663, - "grad_norm": 0.337890625, - "learning_rate": 2.1482983444221402e-05, - "loss": 0.3942, + "epoch": 3.559202813599062, + "grad_norm": 0.328125, + "learning_rate": 0.0001483657238570913, + "loss": 0.4843, "step": 3036 }, { - "epoch": 5.812619502868069, - "grad_norm": 0.326171875, - "learning_rate": 2.1215534341918707e-05, - "loss": 0.3753, + "epoch": 3.5638921453692847, + "grad_norm": 0.318359375, + "learning_rate": 0.00014804685725882104, + "loss": 0.5048, "step": 3040 }, { - "epoch": 5.820267686424474, - "grad_norm": 0.33984375, - "learning_rate": 2.0949633722449915e-05, - "loss": 0.4234, + "epoch": 3.5685814771395075, + "grad_norm": 0.3046875, + "learning_rate": 0.00014772799948792683, + "loss": 0.4974, "step": 3044 }, { - "epoch": 5.827915869980879, - "grad_norm": 0.357421875, - "learning_rate": 2.0685284782959566e-05, - "loss": 0.3925, + "epoch": 3.5732708089097303, + "grad_norm": 0.328125, + "learning_rate": 0.0001474091519855105, + "loss": 0.5482, "step": 3048 }, { - "epoch": 5.835564053537285, - "grad_norm": 0.349609375, - "learning_rate": 2.0422490701934996e-05, - "loss": 0.412, + "epoch": 3.577960140679953, + "grad_norm": 0.330078125, + "learning_rate": 0.00014709031619262737, + "loss": 0.4867, "step": 3052 }, { - "epoch": 5.84321223709369, - "grad_norm": 0.328125, - "learning_rate": 2.0161254639168183e-05, - "loss": 0.3981, + "epoch": 3.582649472450176, + "grad_norm": 0.34375, + "learning_rate": 0.00014677149355027985, + "loss": 0.5304, "step": 3056 }, { - "epoch": 5.850860420650095, - "grad_norm": 0.3515625, - "learning_rate": 1.9901579735717743e-05, - "loss": 0.4204, + "epoch": 3.5873388042203986, + "grad_norm": 0.34375, + "learning_rate": 0.00014645268549941107, + "loss": 0.5533, "step": 3060 }, { - "epoch": 5.858508604206501, - "grad_norm": 0.333984375, - "learning_rate": 1.964346911387127e-05, - "loss": 0.3878, + "epoch": 3.5920281359906214, + "grad_norm": 0.33203125, + "learning_rate": 0.00014613389348089794, + "loss": 0.5024, "step": 3064 }, { - "epoch": 5.866156787762907, - "grad_norm": 0.373046875, - "learning_rate": 1.9386925877107585e-05, - "loss": 0.4047, + "epoch": 3.5967174677608442, + "grad_norm": 0.298828125, + "learning_rate": 0.0001458151189355451, + "loss": 0.5124, "step": 3068 }, { - "epoch": 5.873804971319312, - "grad_norm": 0.3671875, - "learning_rate": 1.913195311005959e-05, - "loss": 0.3899, + "epoch": 3.601406799531067, + "grad_norm": 0.31640625, + "learning_rate": 0.00014549636330407823, + "loss": 0.5064, "step": 3072 }, { - "epoch": 5.881453154875717, - "grad_norm": 0.353515625, - "learning_rate": 1.8878553878477105e-05, - "loss": 0.4179, + "epoch": 3.60609613130129, + "grad_norm": 0.337890625, + "learning_rate": 0.0001451776280271374, + "loss": 0.5421, "step": 3076 }, { - "epoch": 5.8891013384321225, - "grad_norm": 0.341796875, - "learning_rate": 1.8626731229190016e-05, - "loss": 0.403, + "epoch": 3.610785463071512, + "grad_norm": 0.318359375, + "learning_rate": 0.00014485891454527083, + "loss": 0.4855, "step": 3080 }, { - "epoch": 5.896749521988528, - "grad_norm": 0.31640625, - "learning_rate": 1.8376488190071666e-05, - "loss": 0.3925, + "epoch": 3.615474794841735, + "grad_norm": 0.306640625, + "learning_rate": 0.0001445402242989281, + "loss": 0.4824, "step": 3084 }, { - "epoch": 5.904397705544933, - "grad_norm": 0.3359375, - "learning_rate": 1.8127827770002423e-05, - "loss": 0.3647, + "epoch": 3.6201641266119577, + "grad_norm": 0.328125, + "learning_rate": 0.00014422155872845387, + "loss": 0.5056, "step": 3088 }, { - "epoch": 5.912045889101338, - "grad_norm": 0.353515625, - "learning_rate": 1.7880752958833543e-05, - "loss": 0.4301, + "epoch": 3.6248534583821805, + "grad_norm": 0.337890625, + "learning_rate": 0.00014390291927408123, + "loss": 0.4958, "step": 3092 }, { - "epoch": 5.919694072657744, - "grad_norm": 0.35546875, - "learning_rate": 1.7635266727351092e-05, - "loss": 0.3988, + "epoch": 3.6295427901524033, + "grad_norm": 0.3203125, + "learning_rate": 0.00014358430737592534, + "loss": 0.5354, "step": 3096 }, { - "epoch": 5.927342256214149, + "epoch": 3.634232121922626, "grad_norm": 0.34375, - "learning_rate": 1.73913720272404e-05, - "loss": 0.4028, + "learning_rate": 0.00014326572447397658, + "loss": 0.4956, "step": 3100 }, { - "epoch": 5.934990439770554, - "grad_norm": 0.328125, - "learning_rate": 1.714907179105049e-05, - "loss": 0.3756, + "epoch": 3.638921453692849, + "grad_norm": 0.330078125, + "learning_rate": 0.00014294717200809452, + "loss": 0.5054, "step": 3104 }, { - "epoch": 5.9426386233269595, - "grad_norm": 0.341796875, - "learning_rate": 1.6908368932158777e-05, - "loss": 0.4023, + "epoch": 3.6436107854630713, + "grad_norm": 0.326171875, + "learning_rate": 0.0001426286514180011, + "loss": 0.5112, "step": 3108 }, { - "epoch": 5.950286806883366, - "grad_norm": 0.345703125, - "learning_rate": 1.6669266344736104e-05, - "loss": 0.3784, + "epoch": 3.648300117233294, + "grad_norm": 0.3203125, + "learning_rate": 0.00014231016414327407, + "loss": 0.5005, "step": 3112 }, { - "epoch": 5.957934990439771, - "grad_norm": 0.34375, - "learning_rate": 1.6431766903711914e-05, - "loss": 0.3622, + "epoch": 3.652989449003517, + "grad_norm": 0.306640625, + "learning_rate": 0.00014199171162334077, + "loss": 0.5444, "step": 3116 }, { - "epoch": 5.965583173996176, - "grad_norm": 0.359375, - "learning_rate": 1.6195873464739702e-05, - "loss": 0.416, + "epoch": 3.6576787807737396, + "grad_norm": 0.322265625, + "learning_rate": 0.00014167329529747146, + "loss": 0.5815, "step": 3120 }, { - "epoch": 5.9732313575525815, - "grad_norm": 0.3359375, - "learning_rate": 1.5961588864162627e-05, - "loss": 0.4191, + "epoch": 3.6623681125439624, + "grad_norm": 0.330078125, + "learning_rate": 0.0001413549166047727, + "loss": 0.5558, "step": 3124 }, { - "epoch": 5.980879541108987, - "grad_norm": 0.328125, - "learning_rate": 1.5728915918979477e-05, - "loss": 0.3683, + "epoch": 3.6670574443141852, + "grad_norm": 0.33203125, + "learning_rate": 0.0001410365769841811, + "loss": 0.5045, "step": 3128 }, { - "epoch": 5.988527724665392, + "epoch": 3.671746776084408, "grad_norm": 0.337890625, - "learning_rate": 1.5497857426810756e-05, - "loss": 0.365, + "learning_rate": 0.00014071827787445656, + "loss": 0.5228, "step": 3132 }, { - "epoch": 5.996175908221797, - "grad_norm": 0.3671875, - "learning_rate": 1.5268416165865055e-05, - "loss": 0.401, + "epoch": 3.676436107854631, + "grad_norm": 0.33203125, + "learning_rate": 0.00014040002071417595, + "loss": 0.5277, "step": 3136 }, { - "epoch": 6.003824091778203, - "grad_norm": 0.33203125, - "learning_rate": 1.5040594894905628e-05, - "loss": 0.3805, + "epoch": 3.6811254396248536, + "grad_norm": 0.310546875, + "learning_rate": 0.00014008180694172645, + "loss": 0.5202, "step": 3140 }, { - "epoch": 6.011472275334608, - "grad_norm": 0.34765625, - "learning_rate": 1.481439635321729e-05, - "loss": 0.3635, + "epoch": 3.6858147713950764, + "grad_norm": 0.32421875, + "learning_rate": 0.00013976363799529936, + "loss": 0.5646, "step": 3144 }, { - "epoch": 6.019120458891013, - "grad_norm": 0.33984375, - "learning_rate": 1.458982326057338e-05, - "loss": 0.37, + "epoch": 3.690504103165299, + "grad_norm": 0.34765625, + "learning_rate": 0.0001394455153128832, + "loss": 0.5456, "step": 3148 }, { - "epoch": 6.0267686424474185, - "grad_norm": 0.34765625, - "learning_rate": 1.436687831720314e-05, - "loss": 0.3981, + "epoch": 3.6951934349355215, + "grad_norm": 0.37109375, + "learning_rate": 0.0001391274403322574, + "loss": 0.5319, "step": 3152 }, { - "epoch": 6.034416826003824, - "grad_norm": 0.345703125, - "learning_rate": 1.4145564203759219e-05, - "loss": 0.4484, + "epoch": 3.6998827667057443, + "grad_norm": 0.322265625, + "learning_rate": 0.00013880941449098596, + "loss": 0.5385, "step": 3156 }, { - "epoch": 6.042065009560229, - "grad_norm": 0.3671875, - "learning_rate": 1.3925883581285401e-05, - "loss": 0.3988, + "epoch": 3.704572098475967, + "grad_norm": 0.318359375, + "learning_rate": 0.0001384914392264106, + "loss": 0.5151, "step": 3160 }, { - "epoch": 6.049713193116634, - "grad_norm": 0.357421875, - "learning_rate": 1.3707839091184702e-05, - "loss": 0.3422, + "epoch": 3.70926143024619, + "grad_norm": 0.322265625, + "learning_rate": 0.00013817351597564457, + "loss": 0.5354, "step": 3164 }, { - "epoch": 6.0573613766730405, - "grad_norm": 0.35546875, - "learning_rate": 1.349143335518752e-05, - "loss": 0.3994, + "epoch": 3.7139507620164127, + "grad_norm": 0.31640625, + "learning_rate": 0.00013785564617556603, + "loss": 0.5217, "step": 3168 }, { - "epoch": 6.065009560229446, - "grad_norm": 0.341796875, - "learning_rate": 1.3276668975320165e-05, - "loss": 0.4038, + "epoch": 3.7186400937866355, + "grad_norm": 0.34375, + "learning_rate": 0.00013753783126281145, + "loss": 0.5383, "step": 3172 }, { - "epoch": 6.072657743785851, - "grad_norm": 0.328125, - "learning_rate": 1.3063548533873536e-05, - "loss": 0.4065, + "epoch": 3.7233294255568583, + "grad_norm": 0.330078125, + "learning_rate": 0.0001372200726737694, + "loss": 0.4918, "step": 3176 }, { - "epoch": 6.080305927342256, - "grad_norm": 0.33984375, - "learning_rate": 1.2852074593372142e-05, - "loss": 0.3902, + "epoch": 3.7280187573270807, + "grad_norm": 0.302734375, + "learning_rate": 0.00013690237184457377, + "loss": 0.4809, "step": 3180 }, { - "epoch": 6.087954110898662, - "grad_norm": 0.328125, - "learning_rate": 1.2642249696543178e-05, - "loss": 0.388, + "epoch": 3.7327080890973034, + "grad_norm": 0.326171875, + "learning_rate": 0.00013658473021109749, + "loss": 0.5185, "step": 3184 }, { - "epoch": 6.095602294455067, - "grad_norm": 0.34375, - "learning_rate": 1.243407636628605e-05, - "loss": 0.3979, + "epoch": 3.7373974208675262, + "grad_norm": 0.337890625, + "learning_rate": 0.00013626714920894587, + "loss": 0.5666, "step": 3188 }, { - "epoch": 6.103250478011472, - "grad_norm": 0.3359375, - "learning_rate": 1.2227557105642e-05, - "loss": 0.404, + "epoch": 3.742086752637749, + "grad_norm": 0.349609375, + "learning_rate": 0.00013594963027345022, + "loss": 0.5398, "step": 3192 }, { - "epoch": 6.1108986615678775, - "grad_norm": 0.34765625, - "learning_rate": 1.2022694397763993e-05, - "loss": 0.4059, + "epoch": 3.746776084407972, + "grad_norm": 0.3125, + "learning_rate": 0.0001356321748396614, + "loss": 0.553, "step": 3196 }, { - "epoch": 6.118546845124283, - "grad_norm": 0.359375, - "learning_rate": 1.1819490705886914e-05, - "loss": 0.3766, + "epoch": 3.7514654161781946, + "grad_norm": 0.326171875, + "learning_rate": 0.00013531478434234312, + "loss": 0.5552, "step": 3200 }, { - "epoch": 6.126195028680688, - "grad_norm": 0.337890625, - "learning_rate": 1.16179484732979e-05, - "loss": 0.3629, + "epoch": 3.7561547479484174, + "grad_norm": 0.326171875, + "learning_rate": 0.00013499746021596582, + "loss": 0.543, "step": 3204 }, { - "epoch": 6.133843212237093, - "grad_norm": 0.375, - "learning_rate": 1.1418070123306989e-05, - "loss": 0.3733, + "epoch": 3.76084407971864, + "grad_norm": 0.333984375, + "learning_rate": 0.00013468020389469974, + "loss": 0.5163, "step": 3208 }, { - "epoch": 6.141491395793499, - "grad_norm": 0.337890625, - "learning_rate": 1.1219858059217951e-05, - "loss": 0.4169, + "epoch": 3.765533411488863, + "grad_norm": 0.330078125, + "learning_rate": 0.0001343630168124088, + "loss": 0.5194, "step": 3212 }, { - "epoch": 6.149139579349905, - "grad_norm": 0.33984375, - "learning_rate": 1.1023314664299455e-05, - "loss": 0.4131, + "epoch": 3.770222743259086, + "grad_norm": 0.349609375, + "learning_rate": 0.00013404590040264397, + "loss": 0.525, "step": 3216 }, { - "epoch": 6.15678776290631, - "grad_norm": 0.318359375, - "learning_rate": 1.0828442301756312e-05, - "loss": 0.3505, + "epoch": 3.7749120750293086, + "grad_norm": 0.322265625, + "learning_rate": 0.0001337288560986368, + "loss": 0.4793, "step": 3220 }, { - "epoch": 6.164435946462715, - "grad_norm": 0.337890625, - "learning_rate": 1.0635243314701163e-05, - "loss": 0.3978, + "epoch": 3.779601406799531, + "grad_norm": 0.318359375, + "learning_rate": 0.000133411885333293, + "loss": 0.5353, "step": 3224 }, { - "epoch": 6.172084130019121, - "grad_norm": 0.33984375, - "learning_rate": 1.0443720026126273e-05, - "loss": 0.3982, + "epoch": 3.7842907385697537, + "grad_norm": 0.333984375, + "learning_rate": 0.00013309498953918583, + "loss": 0.5114, "step": 3228 }, { - "epoch": 6.179732313575526, - "grad_norm": 0.357421875, - "learning_rate": 1.025387473887554e-05, - "loss": 0.3935, + "epoch": 3.7889800703399765, + "grad_norm": 0.345703125, + "learning_rate": 0.0001327781701485498, + "loss": 0.5858, "step": 3232 }, { - "epoch": 6.187380497131931, - "grad_norm": 0.337890625, - "learning_rate": 1.0065709735616917e-05, - "loss": 0.3709, + "epoch": 3.7936694021101993, + "grad_norm": 0.34375, + "learning_rate": 0.00013246142859327402, + "loss": 0.548, "step": 3236 }, { - "epoch": 6.195028680688337, - "grad_norm": 0.3515625, - "learning_rate": 9.87922727881484e-06, - "loss": 0.3922, + "epoch": 3.798358733880422, + "grad_norm": 0.306640625, + "learning_rate": 0.0001321447663048959, + "loss": 0.4745, "step": 3240 }, { - "epoch": 6.202676864244742, - "grad_norm": 0.328125, - "learning_rate": 9.694429610703153e-06, - "loss": 0.3719, + "epoch": 3.803048065650645, + "grad_norm": 0.3359375, + "learning_rate": 0.00013182818471459457, + "loss": 0.4773, "step": 3244 }, { - "epoch": 6.210325047801147, - "grad_norm": 0.333984375, - "learning_rate": 9.511318953258013e-06, - "loss": 0.2995, + "epoch": 3.8077373974208673, + "grad_norm": 0.326171875, + "learning_rate": 0.00013151168525318436, + "loss": 0.517, "step": 3248 }, { - "epoch": 6.217973231357552, - "grad_norm": 0.34765625, - "learning_rate": 9.329897508171296e-06, - "loss": 0.3932, + "epoch": 3.81242672919109, + "grad_norm": 0.322265625, + "learning_rate": 0.00013119526935110852, + "loss": 0.5349, "step": 3252 }, { - "epoch": 6.225621414913958, - "grad_norm": 0.34765625, - "learning_rate": 9.150167456824065e-06, - "loss": 0.4016, + "epoch": 3.817116060961313, + "grad_norm": 0.31640625, + "learning_rate": 0.00013087893843843264, + "loss": 0.5046, "step": 3256 }, { - "epoch": 6.233269598470363, + "epoch": 3.8218053927315356, "grad_norm": 0.33984375, - "learning_rate": 8.972130960260326e-06, - "loss": 0.3622, + "learning_rate": 0.00013056269394483814, + "loss": 0.5361, "step": 3260 }, { - "epoch": 6.240917782026768, - "grad_norm": 0.337890625, - "learning_rate": 8.795790159161098e-06, - "loss": 0.4053, + "epoch": 3.8264947245017584, + "grad_norm": 0.333984375, + "learning_rate": 0.00013024653729961586, + "loss": 0.5355, "step": 3264 }, { - "epoch": 6.248565965583174, + "epoch": 3.831184056271981, "grad_norm": 0.3203125, - "learning_rate": 8.621147173818587e-06, - "loss": 0.3517, + "learning_rate": 0.00012993046993165966, + "loss": 0.5392, "step": 3268 }, { - "epoch": 6.25621414913958, - "grad_norm": 0.35546875, - "learning_rate": 8.448204104110818e-06, - "loss": 0.3448, + "epoch": 3.835873388042204, + "grad_norm": 0.349609375, + "learning_rate": 0.00012961449326945985, + "loss": 0.5118, "step": 3272 }, { - "epoch": 6.263862332695985, - "grad_norm": 0.349609375, - "learning_rate": 8.276963029476275e-06, - "loss": 0.3758, + "epoch": 3.840562719812427, + "grad_norm": 0.31640625, + "learning_rate": 0.00012929860874109683, + "loss": 0.5205, "step": 3276 }, { - "epoch": 6.27151051625239, - "grad_norm": 0.37109375, - "learning_rate": 8.107426008888934e-06, - "loss": 0.3911, + "epoch": 3.8452520515826496, + "grad_norm": 0.357421875, + "learning_rate": 0.00012898281777423465, + "loss": 0.5297, "step": 3280 }, { - "epoch": 6.279158699808796, - "grad_norm": 0.322265625, - "learning_rate": 7.93959508083351e-06, - "loss": 0.3491, + "epoch": 3.8499413833528724, + "grad_norm": 0.314453125, + "learning_rate": 0.00012866712179611427, + "loss": 0.5043, "step": 3284 }, { - "epoch": 6.286806883365201, - "grad_norm": 0.34375, - "learning_rate": 7.773472263280977e-06, - "loss": 0.3954, + "epoch": 3.854630715123095, + "grad_norm": 0.326171875, + "learning_rate": 0.0001283515222335476, + "loss": 0.5284, "step": 3288 }, { - "epoch": 6.294455066921606, - "grad_norm": 0.34375, - "learning_rate": 7.609059553664254e-06, - "loss": 0.4018, + "epoch": 3.859320046893318, + "grad_norm": 0.333984375, + "learning_rate": 0.00012803602051291064, + "loss": 0.5118, "step": 3292 }, { - "epoch": 6.3021032504780115, - "grad_norm": 0.330078125, - "learning_rate": 7.446358928854207e-06, - "loss": 0.3823, + "epoch": 3.8640093786635403, + "grad_norm": 0.328125, + "learning_rate": 0.00012772061806013728, + "loss": 0.5621, "step": 3296 }, { - "epoch": 6.309751434034417, - "grad_norm": 0.345703125, - "learning_rate": 7.2853723451358705e-06, - "loss": 0.4097, + "epoch": 3.868698710433763, + "grad_norm": 0.314453125, + "learning_rate": 0.00012740531630071268, + "loss": 0.5587, "step": 3300 }, { - "epoch": 6.317399617590822, - "grad_norm": 0.345703125, - "learning_rate": 7.126101738184964e-06, - "loss": 0.3676, + "epoch": 3.873388042203986, + "grad_norm": 0.322265625, + "learning_rate": 0.00012709011665966698, + "loss": 0.4888, "step": 3304 }, { - "epoch": 6.325047801147227, - "grad_norm": 0.337890625, - "learning_rate": 6.9685490230445615e-06, - "loss": 0.3901, + "epoch": 3.8780773739742087, + "grad_norm": 0.3203125, + "learning_rate": 0.00012677502056156878, + "loss": 0.4617, "step": 3308 }, { - "epoch": 6.332695984703633, - "grad_norm": 0.32421875, - "learning_rate": 6.812716094102128e-06, - "loss": 0.3652, + "epoch": 3.8827667057444315, + "grad_norm": 0.31640625, + "learning_rate": 0.00012646002943051863, + "loss": 0.5277, "step": 3312 }, { - "epoch": 6.340344168260038, - "grad_norm": 0.35546875, - "learning_rate": 6.658604825066683e-06, - "loss": 0.3631, + "epoch": 3.8874560375146543, + "grad_norm": 0.310546875, + "learning_rate": 0.0001261451446901428, + "loss": 0.5163, "step": 3316 }, { - "epoch": 6.347992351816444, - "grad_norm": 0.341796875, - "learning_rate": 6.50621706894629e-06, - "loss": 0.4053, + "epoch": 3.8921453692848766, + "grad_norm": 0.33984375, + "learning_rate": 0.00012583036776358652, + "loss": 0.4925, "step": 3320 }, { - "epoch": 6.355640535372849, - "grad_norm": 0.33203125, - "learning_rate": 6.355554658025791e-06, - "loss": 0.3451, + "epoch": 3.8968347010550994, + "grad_norm": 0.34375, + "learning_rate": 0.00012551570007350796, + "loss": 0.5093, "step": 3324 }, { - "epoch": 6.363288718929255, - "grad_norm": 0.34375, - "learning_rate": 6.206619403844804e-06, - "loss": 0.4109, + "epoch": 3.9015240328253222, + "grad_norm": 0.33984375, + "learning_rate": 0.0001252011430420715, + "loss": 0.5028, "step": 3328 }, { - "epoch": 6.37093690248566, - "grad_norm": 0.318359375, - "learning_rate": 6.059413097175808e-06, - "loss": 0.3897, + "epoch": 3.906213364595545, + "grad_norm": 0.3203125, + "learning_rate": 0.0001248866980909414, + "loss": 0.5412, "step": 3332 }, { - "epoch": 6.378585086042065, - "grad_norm": 0.341796875, - "learning_rate": 5.913937508002797e-06, - "loss": 0.3573, + "epoch": 3.910902696365768, + "grad_norm": 0.326171875, + "learning_rate": 0.00012457236664127535, + "loss": 0.5431, "step": 3336 }, { - "epoch": 6.3862332695984705, - "grad_norm": 0.341796875, - "learning_rate": 5.770194385499877e-06, - "loss": 0.3928, + "epoch": 3.9155920281359906, + "grad_norm": 0.310546875, + "learning_rate": 0.00012425815011371806, + "loss": 0.5178, "step": 3340 }, { - "epoch": 6.393881453154876, - "grad_norm": 0.330078125, - "learning_rate": 5.628185458010248e-06, - "loss": 0.3662, + "epoch": 3.9202813599062134, + "grad_norm": 0.310546875, + "learning_rate": 0.00012394404992839485, + "loss": 0.5413, "step": 3344 }, { - "epoch": 6.401529636711281, - "grad_norm": 0.349609375, - "learning_rate": 5.487912433025493e-06, - "loss": 0.3974, + "epoch": 3.924970691676436, + "grad_norm": 0.3515625, + "learning_rate": 0.0001236300675049052, + "loss": 0.5313, "step": 3348 }, { - "epoch": 6.409177820267686, - "grad_norm": 0.32421875, - "learning_rate": 5.349376997164923e-06, - "loss": 0.3477, + "epoch": 3.929660023446659, + "grad_norm": 0.337890625, + "learning_rate": 0.0001233162042623165, + "loss": 0.5314, "step": 3352 }, { - "epoch": 6.416826003824092, - "grad_norm": 0.330078125, - "learning_rate": 5.212580816155426e-06, - "loss": 0.3992, + "epoch": 3.934349355216882, + "grad_norm": 0.31640625, + "learning_rate": 0.0001230024616191572, + "loss": 0.5232, "step": 3356 }, { - "epoch": 6.424474187380497, - "grad_norm": 0.345703125, - "learning_rate": 5.077525534811339e-06, - "loss": 0.3881, + "epoch": 3.9390386869871046, + "grad_norm": 0.310546875, + "learning_rate": 0.00012268884099341095, + "loss": 0.5187, "step": 3360 }, { - "epoch": 6.432122370936902, - "grad_norm": 0.322265625, - "learning_rate": 4.9442127770147385e-06, - "loss": 0.3767, + "epoch": 3.943728018757327, + "grad_norm": 0.3125, + "learning_rate": 0.00012237534380250985, + "loss": 0.5218, "step": 3364 }, { - "epoch": 6.4397705544933075, - "grad_norm": 0.34765625, - "learning_rate": 4.812644145695915e-06, - "loss": 0.3939, + "epoch": 3.9484173505275497, + "grad_norm": 0.3359375, + "learning_rate": 0.00012206197146332808, + "loss": 0.5651, "step": 3368 }, { - "epoch": 6.447418738049713, - "grad_norm": 0.3359375, - "learning_rate": 4.682821222813998e-06, - "loss": 0.3322, + "epoch": 3.9531066822977725, + "grad_norm": 0.353515625, + "learning_rate": 0.00012174872539217565, + "loss": 0.5458, "step": 3372 }, { - "epoch": 6.455066921606119, - "grad_norm": 0.34375, - "learning_rate": 4.554745569338092e-06, - "loss": 0.3414, + "epoch": 3.9577960140679953, + "grad_norm": 0.3203125, + "learning_rate": 0.00012143560700479177, + "loss": 0.5499, "step": 3376 }, { - "epoch": 6.462715105162524, - "grad_norm": 0.35546875, - "learning_rate": 4.428418725228372e-06, - "loss": 0.3982, + "epoch": 3.962485345838218, + "grad_norm": 0.359375, + "learning_rate": 0.00012112261771633866, + "loss": 0.5396, "step": 3380 }, { - "epoch": 6.4703632887189295, - "grad_norm": 0.33203125, - "learning_rate": 4.303842209417652e-06, - "loss": 0.3522, + "epoch": 3.967174677608441, + "grad_norm": 0.328125, + "learning_rate": 0.00012080975894139508, + "loss": 0.5212, "step": 3384 }, { - "epoch": 6.478011472275335, - "grad_norm": 0.330078125, - "learning_rate": 4.181017519793079e-06, - "loss": 0.4167, + "epoch": 3.9718640093786637, + "grad_norm": 0.3359375, + "learning_rate": 0.00012049703209394983, + "loss": 0.4536, "step": 3388 }, { - "epoch": 6.48565965583174, - "grad_norm": 0.349609375, - "learning_rate": 4.059946133178132e-06, - "loss": 0.3875, + "epoch": 3.976553341148886, + "grad_norm": 0.333984375, + "learning_rate": 0.00012018443858739554, + "loss": 0.508, "step": 3392 }, { - "epoch": 6.493307839388145, - "grad_norm": 0.33203125, - "learning_rate": 3.94062950531489e-06, - "loss": 0.3662, + "epoch": 3.981242672919109, + "grad_norm": 0.3359375, + "learning_rate": 0.0001198719798345221, + "loss": 0.5013, "step": 3396 }, { - "epoch": 6.500956022944551, - "grad_norm": 0.34375, - "learning_rate": 3.823069070846474e-06, - "loss": 0.3625, + "epoch": 3.9859320046893316, + "grad_norm": 0.361328125, + "learning_rate": 0.00011955965724751048, + "loss": 0.5184, "step": 3400 }, { - "epoch": 6.508604206500956, - "grad_norm": 0.318359375, - "learning_rate": 3.707266243299861e-06, - "loss": 0.3445, + "epoch": 3.9906213364595544, + "grad_norm": 0.33984375, + "learning_rate": 0.00011924747223792619, + "loss": 0.5191, "step": 3404 }, { - "epoch": 6.516252390057361, - "grad_norm": 0.353515625, - "learning_rate": 3.5932224150688526e-06, - "loss": 0.3705, + "epoch": 3.995310668229777, + "grad_norm": 0.30859375, + "learning_rate": 0.00011893542621671296, + "loss": 0.5332, "step": 3408 }, { - "epoch": 6.5239005736137665, - "grad_norm": 0.35546875, - "learning_rate": 3.4809389573973e-06, - "loss": 0.396, + "epoch": 4.0, + "grad_norm": 1.765625, + "learning_rate": 0.00011862352059418636, + "loss": 0.4709, "step": 3412 }, { - "epoch": 6.531548757170172, - "grad_norm": 0.359375, - "learning_rate": 3.3704172203627035e-06, - "loss": 0.3689, + "epoch": 4.004689331770223, + "grad_norm": 0.30078125, + "learning_rate": 0.00011831175678002737, + "loss": 0.4506, "step": 3416 }, { - "epoch": 6.539196940726577, - "grad_norm": 0.33203125, - "learning_rate": 3.2616585328599065e-06, - "loss": 0.4253, + "epoch": 4.009378663540446, + "grad_norm": 0.3203125, + "learning_rate": 0.00011800013618327605, + "loss": 0.4412, "step": 3420 }, { - "epoch": 6.546845124282983, - "grad_norm": 0.3359375, - "learning_rate": 3.154664202585128e-06, - "loss": 0.387, + "epoch": 4.014067995310668, + "grad_norm": 0.345703125, + "learning_rate": 0.00011768866021232528, + "loss": 0.4716, "step": 3424 }, { - "epoch": 6.5544933078393885, - "grad_norm": 0.353515625, - "learning_rate": 3.049435516020271e-06, - "loss": 0.3585, + "epoch": 4.018757327080891, + "grad_norm": 0.3203125, + "learning_rate": 0.00011737733027491427, + "loss": 0.4189, "step": 3428 }, { - "epoch": 6.562141491395794, - "grad_norm": 0.34765625, - "learning_rate": 2.94597373841744e-06, - "loss": 0.3912, + "epoch": 4.023446658851114, + "grad_norm": 0.318359375, + "learning_rate": 0.00011706614777812204, + "loss": 0.4458, "step": 3432 }, { - "epoch": 6.569789674952199, - "grad_norm": 0.337890625, - "learning_rate": 2.844280113783698e-06, - "loss": 0.3863, + "epoch": 4.028135990621337, + "grad_norm": 0.31640625, + "learning_rate": 0.00011675511412836145, + "loss": 0.4492, "step": 3436 }, { - "epoch": 6.577437858508604, - "grad_norm": 0.353515625, - "learning_rate": 2.7443558648661656e-06, - "loss": 0.4214, + "epoch": 4.0328253223915596, + "grad_norm": 0.318359375, + "learning_rate": 0.00011644423073137259, + "loss": 0.4557, "step": 3440 }, { - "epoch": 6.58508604206501, - "grad_norm": 0.33203125, - "learning_rate": 2.646202193137248e-06, - "loss": 0.429, + "epoch": 4.037514654161782, + "grad_norm": 0.3125, + "learning_rate": 0.00011613349899221641, + "loss": 0.4215, "step": 3444 }, { - "epoch": 6.592734225621415, - "grad_norm": 0.330078125, - "learning_rate": 2.549820278780246e-06, - "loss": 0.3523, + "epoch": 4.042203985932004, + "grad_norm": 0.345703125, + "learning_rate": 0.00011582292031526844, + "loss": 0.467, "step": 3448 }, { - "epoch": 6.60038240917782, - "grad_norm": 0.33984375, - "learning_rate": 2.455211280675168e-06, - "loss": 0.4001, + "epoch": 4.046893317702227, + "grad_norm": 0.318359375, + "learning_rate": 0.00011551249610421252, + "loss": 0.4608, "step": 3452 }, { - "epoch": 6.6080305927342256, - "grad_norm": 0.376953125, - "learning_rate": 2.3623763363847246e-06, - "loss": 0.3819, + "epoch": 4.05158264947245, + "grad_norm": 0.333984375, + "learning_rate": 0.00011520222776203428, + "loss": 0.487, "step": 3456 }, { - "epoch": 6.615678776290631, - "grad_norm": 0.361328125, - "learning_rate": 2.271316562140757e-06, - "loss": 0.3632, + "epoch": 4.056271981242673, + "grad_norm": 0.34375, + "learning_rate": 0.00011489211669101493, + "loss": 0.4697, "step": 3460 }, { - "epoch": 6.623326959847036, - "grad_norm": 0.357421875, - "learning_rate": 2.182033052830695e-06, - "loss": 0.3842, + "epoch": 4.060961313012895, + "grad_norm": 0.333984375, + "learning_rate": 0.00011458216429272489, + "loss": 0.4833, "step": 3464 }, { - "epoch": 6.630975143403441, - "grad_norm": 0.34375, - "learning_rate": 2.094526881984521e-06, - "loss": 0.3844, + "epoch": 4.065650644783118, + "grad_norm": 0.33203125, + "learning_rate": 0.00011427237196801736, + "loss": 0.4375, "step": 3468 }, { - "epoch": 6.638623326959847, - "grad_norm": 0.328125, - "learning_rate": 2.0087991017617598e-06, - "loss": 0.3697, + "epoch": 4.070339976553341, + "grad_norm": 0.310546875, + "learning_rate": 0.00011396274111702217, + "loss": 0.4756, "step": 3472 }, { - "epoch": 6.646271510516252, - "grad_norm": 0.3515625, - "learning_rate": 1.924850742938894e-06, - "loss": 0.3886, + "epoch": 4.075029308323564, + "grad_norm": 0.33203125, + "learning_rate": 0.00011365327313913932, + "loss": 0.4834, "step": 3476 }, { - "epoch": 6.653919694072657, - "grad_norm": 0.349609375, - "learning_rate": 1.8426828148969008e-06, - "loss": 0.363, + "epoch": 4.079718640093787, + "grad_norm": 0.3359375, + "learning_rate": 0.00011334396943303271, + "loss": 0.4716, "step": 3480 }, { - "epoch": 6.661567877629063, - "grad_norm": 0.33984375, - "learning_rate": 1.7622963056091843e-06, - "loss": 0.3671, + "epoch": 4.084407971864009, + "grad_norm": 0.34375, + "learning_rate": 0.00011303483139662382, + "loss": 0.4264, "step": 3484 }, { - "epoch": 6.669216061185469, - "grad_norm": 0.341796875, - "learning_rate": 1.6836921816296644e-06, - "loss": 0.4043, + "epoch": 4.089097303634232, + "grad_norm": 0.349609375, + "learning_rate": 0.00011272586042708535, + "loss": 0.4405, "step": 3488 }, { - "epoch": 6.676864244741874, - "grad_norm": 0.33203125, - "learning_rate": 1.6068713880811546e-06, - "loss": 0.398, + "epoch": 4.093786635404455, + "grad_norm": 0.318359375, + "learning_rate": 0.00011241705792083484, + "loss": 0.3871, "step": 3492 }, { - "epoch": 6.684512428298279, - "grad_norm": 0.330078125, - "learning_rate": 1.531834848643987e-06, - "loss": 0.3847, + "epoch": 4.098475967174678, + "grad_norm": 0.333984375, + "learning_rate": 0.00011210842527352861, + "loss": 0.4416, "step": 3496 }, { - "epoch": 6.692160611854685, - "grad_norm": 0.34765625, - "learning_rate": 1.4585834655449547e-06, - "loss": 0.4041, + "epoch": 4.103165298944901, + "grad_norm": 0.326171875, + "learning_rate": 0.00011179996388005524, + "loss": 0.4453, "step": 3500 }, { - "epoch": 6.69980879541109, - "grad_norm": 0.333984375, - "learning_rate": 1.3871181195464042e-06, - "loss": 0.3924, + "epoch": 4.107854630715123, + "grad_norm": 0.32421875, + "learning_rate": 0.0001114916751345292, + "loss": 0.4967, "step": 3504 }, { - "epoch": 6.707456978967495, - "grad_norm": 0.32421875, - "learning_rate": 1.3174396699356937e-06, - "loss": 0.3557, + "epoch": 4.112543962485346, + "grad_norm": 0.330078125, + "learning_rate": 0.00011118356043028476, + "loss": 0.465, "step": 3508 }, { - "epoch": 6.7151051625239, + "epoch": 4.117233294255569, "grad_norm": 0.33984375, - "learning_rate": 1.2495489545148008e-06, - "loss": 0.3767, + "learning_rate": 0.00011087562115986965, + "loss": 0.4547, "step": 3512 }, { - "epoch": 6.722753346080306, - "grad_norm": 0.31640625, - "learning_rate": 1.1834467895903476e-06, - "loss": 0.4122, + "epoch": 4.121922626025792, + "grad_norm": 0.32421875, + "learning_rate": 0.00011056785871503862, + "loss": 0.4575, "step": 3516 }, { - "epoch": 6.730401529636711, - "grad_norm": 0.3515625, - "learning_rate": 1.1191339699636426e-06, - "loss": 0.3873, + "epoch": 4.126611957796014, + "grad_norm": 0.32421875, + "learning_rate": 0.00011026027448674725, + "loss": 0.441, "step": 3520 }, { - "epoch": 6.738049713193116, - "grad_norm": 0.3515625, - "learning_rate": 1.0566112689213035e-06, - "loss": 0.4039, + "epoch": 4.131301289566236, + "grad_norm": 0.337890625, + "learning_rate": 0.00010995286986514571, + "loss": 0.4565, "step": 3524 }, { - "epoch": 6.7456978967495225, - "grad_norm": 0.341796875, - "learning_rate": 9.958794382257663e-07, - "loss": 0.4338, + "epoch": 4.135990621336459, + "grad_norm": 0.34375, + "learning_rate": 0.00010964564623957239, + "loss": 0.4744, "step": 3528 }, { - "epoch": 6.753346080305928, - "grad_norm": 0.337890625, - "learning_rate": 9.369392081063908e-07, - "loss": 0.3474, + "epoch": 4.140679953106682, + "grad_norm": 0.328125, + "learning_rate": 0.00010933860499854768, + "loss": 0.4447, "step": 3532 }, { - "epoch": 6.760994263862333, - "grad_norm": 0.330078125, - "learning_rate": 8.797912872506529e-07, - "loss": 0.3553, + "epoch": 4.145369284876905, + "grad_norm": 0.359375, + "learning_rate": 0.0001090317475297677, + "loss": 0.5169, "step": 3536 }, { - "epoch": 6.768642447418738, - "grad_norm": 0.35546875, - "learning_rate": 8.244363627955664e-07, - "loss": 0.3955, + "epoch": 4.150058616647128, + "grad_norm": 0.3359375, + "learning_rate": 0.00010872507522009781, + "loss": 0.461, "step": 3540 }, { - "epoch": 6.776290630975144, - "grad_norm": 0.357421875, - "learning_rate": 7.708751003194569e-07, - "loss": 0.3925, + "epoch": 4.15474794841735, + "grad_norm": 0.333984375, + "learning_rate": 0.00010841858945556677, + "loss": 0.452, "step": 3544 }, { - "epoch": 6.783938814531549, - "grad_norm": 0.333984375, - "learning_rate": 7.191081438339685e-07, - "loss": 0.4088, + "epoch": 4.159437280187573, + "grad_norm": 0.318359375, + "learning_rate": 0.00010811229162136009, + "loss": 0.4486, "step": 3548 }, { - "epoch": 6.791586998087954, - "grad_norm": 0.322265625, - "learning_rate": 6.691361157763198e-07, - "loss": 0.3916, + "epoch": 4.164126611957796, + "grad_norm": 0.337890625, + "learning_rate": 0.00010780618310181395, + "loss": 0.4286, "step": 3552 }, { - "epoch": 6.7992351816443595, - "grad_norm": 0.349609375, - "learning_rate": 6.209596170018094e-07, - "loss": 0.4524, + "epoch": 4.168815943728019, + "grad_norm": 0.333984375, + "learning_rate": 0.00010750026528040895, + "loss": 0.4743, "step": 3556 }, { - "epoch": 6.806883365200765, - "grad_norm": 0.357421875, - "learning_rate": 5.745792267765559e-07, - "loss": 0.398, + "epoch": 4.173505275498242, + "grad_norm": 0.34375, + "learning_rate": 0.00010719453953976375, + "loss": 0.4637, "step": 3560 }, { - "epoch": 6.81453154875717, - "grad_norm": 0.32421875, - "learning_rate": 5.29995502770636e-07, - "loss": 0.4172, + "epoch": 4.178194607268464, + "grad_norm": 0.3046875, + "learning_rate": 0.00010688900726162899, + "loss": 0.466, "step": 3564 }, { - "epoch": 6.822179732313575, - "grad_norm": 0.33203125, - "learning_rate": 4.87208981051257e-07, - "loss": 0.3622, + "epoch": 4.182883939038687, + "grad_norm": 0.330078125, + "learning_rate": 0.00010658366982688076, + "loss": 0.4639, "step": 3568 }, { - "epoch": 6.829827915869981, - "grad_norm": 0.365234375, - "learning_rate": 4.462201760763784e-07, - "loss": 0.4181, + "epoch": 4.18757327080891, + "grad_norm": 0.353515625, + "learning_rate": 0.00010627852861551479, + "loss": 0.4853, "step": 3572 }, { - "epoch": 6.837476099426386, - "grad_norm": 0.35546875, - "learning_rate": 4.0702958068853373e-07, - "loss": 0.3955, + "epoch": 4.192262602579133, + "grad_norm": 0.330078125, + "learning_rate": 0.00010597358500663966, + "loss": 0.4674, "step": 3576 }, { - "epoch": 6.845124282982791, - "grad_norm": 0.349609375, - "learning_rate": 3.696376661088685e-07, - "loss": 0.3865, + "epoch": 4.1969519343493555, + "grad_norm": 0.318359375, + "learning_rate": 0.00010566884037847111, + "loss": 0.4648, "step": 3580 }, { - "epoch": 6.8527724665391965, - "grad_norm": 0.333984375, - "learning_rate": 3.340448819315111e-07, - "loss": 0.3909, + "epoch": 4.201641266119578, + "grad_norm": 0.326171875, + "learning_rate": 0.0001053642961083255, + "loss": 0.4708, "step": 3584 }, { - "epoch": 6.860420650095603, - "grad_norm": 0.341796875, - "learning_rate": 3.002516561181112e-07, - "loss": 0.3462, + "epoch": 4.206330597889801, + "grad_norm": 0.337890625, + "learning_rate": 0.00010505995357261364, + "loss": 0.4416, "step": 3588 }, { - "epoch": 6.868068833652008, - "grad_norm": 0.373046875, - "learning_rate": 2.6825839499277634e-07, - "loss": 0.3655, + "epoch": 4.211019929660023, + "grad_norm": 0.35546875, + "learning_rate": 0.00010475581414683466, + "loss": 0.4991, "step": 3592 }, { - "epoch": 6.875717017208413, - "grad_norm": 0.32421875, - "learning_rate": 2.3806548323710984e-07, - "loss": 0.4164, + "epoch": 4.215709261430246, + "grad_norm": 0.3359375, + "learning_rate": 0.00010445187920556956, + "loss": 0.4488, "step": 3596 }, { - "epoch": 6.8833652007648185, - "grad_norm": 0.365234375, - "learning_rate": 2.096732838856141e-07, - "loss": 0.3865, + "epoch": 4.220398593200469, + "grad_norm": 0.34375, + "learning_rate": 0.00010414815012247529, + "loss": 0.4471, "step": 3600 }, { - "epoch": 6.891013384321224, + "epoch": 4.225087924970691, "grad_norm": 0.33984375, - "learning_rate": 1.8308213832134434e-07, - "loss": 0.4012, + "learning_rate": 0.00010384462827027838, + "loss": 0.4286, "step": 3604 }, { - "epoch": 6.898661567877629, - "grad_norm": 0.333984375, - "learning_rate": 1.5829236627177833e-07, - "loss": 0.3761, + "epoch": 4.229777256740914, + "grad_norm": 0.330078125, + "learning_rate": 0.00010354131502076875, + "loss": 0.4893, "step": 3608 }, { - "epoch": 6.906309751434034, - "grad_norm": 0.341796875, - "learning_rate": 1.3530426580496968e-07, - "loss": 0.3674, + "epoch": 4.234466588511137, + "grad_norm": 0.3671875, + "learning_rate": 0.00010323821174479363, + "loss": 0.4489, "step": 3612 }, { - "epoch": 6.91395793499044, - "grad_norm": 0.3359375, - "learning_rate": 1.1411811332600051e-07, - "loss": 0.3708, + "epoch": 4.23915592028136, + "grad_norm": 0.34765625, + "learning_rate": 0.000102935319812251, + "loss": 0.4349, "step": 3616 }, { - "epoch": 6.921606118546845, - "grad_norm": 0.333984375, - "learning_rate": 9.473416357361762e-08, - "loss": 0.4026, + "epoch": 4.243845252051583, + "grad_norm": 0.33203125, + "learning_rate": 0.0001026326405920839, + "loss": 0.4235, "step": 3620 }, { - "epoch": 6.92925430210325, - "grad_norm": 0.35546875, - "learning_rate": 7.715264961718481e-08, - "loss": 0.4073, + "epoch": 4.248534583821805, + "grad_norm": 0.3359375, + "learning_rate": 0.00010233017545227389, + "loss": 0.4619, "step": 3624 }, { - "epoch": 6.9369024856596555, - "grad_norm": 0.3359375, - "learning_rate": 6.137378285386851e-08, - "loss": 0.3944, + "epoch": 4.253223915592028, + "grad_norm": 0.33984375, + "learning_rate": 0.00010202792575983502, + "loss": 0.4251, "step": 3628 }, { - "epoch": 6.944550669216062, - "grad_norm": 0.337890625, - "learning_rate": 4.739775300612314e-08, - "loss": 0.352, + "epoch": 4.257913247362251, + "grad_norm": 0.328125, + "learning_rate": 0.00010172589288080759, + "loss": 0.4545, "step": 3632 }, { - "epoch": 6.952198852772467, - "grad_norm": 0.333984375, - "learning_rate": 3.522472811939292e-08, - "loss": 0.3563, + "epoch": 4.262602579132474, + "grad_norm": 0.328125, + "learning_rate": 0.00010142407818025201, + "loss": 0.425, "step": 3636 }, { - "epoch": 6.959847036328872, - "grad_norm": 0.34375, - "learning_rate": 2.4854854560096837e-08, - "loss": 0.3673, + "epoch": 4.2672919109026966, + "grad_norm": 0.3359375, + "learning_rate": 0.00010112248302224263, + "loss": 0.4444, "step": 3640 }, { - "epoch": 6.9674952198852775, + "epoch": 4.271981242672919, "grad_norm": 0.353515625, - "learning_rate": 1.6288257013830075e-08, - "loss": 0.4149, + "learning_rate": 0.00010082110876986147, + "loss": 0.4439, "step": 3644 }, { - "epoch": 6.975143403441683, - "grad_norm": 0.3515625, - "learning_rate": 9.52503848396513e-09, - "loss": 0.4061, + "epoch": 4.276670574443142, + "grad_norm": 0.376953125, + "learning_rate": 0.00010051995678519231, + "loss": 0.4381, "step": 3648 }, { - "epoch": 6.982791586998088, - "grad_norm": 0.32421875, - "learning_rate": 4.565280290269591e-09, - "loss": 0.3616, + "epoch": 4.281359906213365, + "grad_norm": 0.359375, + "learning_rate": 0.00010021902842931421, + "loss": 0.4599, "step": 3652 }, { - "epoch": 6.990439770554493, - "grad_norm": 0.3671875, - "learning_rate": 1.4090420680734627e-09, - "loss": 0.3985, + "epoch": 4.286049237983588, + "grad_norm": 0.3359375, + "learning_rate": 9.991832506229558e-05, + "loss": 0.4676, "step": 3656 }, { - "epoch": 6.998087954110899, - "grad_norm": 0.3125, - "learning_rate": 5.6361767436507997e-11, - "loss": 0.3654, + "epoch": 4.29073856975381, + "grad_norm": 0.341796875, + "learning_rate": 9.961784804318803e-05, + "loss": 0.4528, "step": 3660 }, { - "epoch": 7.0, - "step": 3661, - "total_flos": 2.3920283694378516e+18, - "train_loss": 0.561195495056344, - "train_runtime": 14692.9794, - "train_samples_per_second": 7.96, - "train_steps_per_second": 0.249 + "epoch": 4.295427901524032, + "grad_norm": 0.328125, + "learning_rate": 9.931759873002012e-05, + "loss": 0.4812, + "step": 3664 + }, + { + "epoch": 4.300117233294255, + "grad_norm": 0.333984375, + "learning_rate": 9.901757847979136e-05, + "loss": 0.451, + "step": 3668 + }, + { + "epoch": 4.304806565064478, + "grad_norm": 0.333984375, + "learning_rate": 9.871778864846578e-05, + "loss": 0.47, + "step": 3672 + }, + { + "epoch": 4.309495896834701, + "grad_norm": 0.33203125, + "learning_rate": 9.841823059096629e-05, + "loss": 0.4477, + "step": 3676 + }, + { + "epoch": 4.314185228604924, + "grad_norm": 0.33203125, + "learning_rate": 9.811890566116806e-05, + "loss": 0.4822, + "step": 3680 + }, + { + "epoch": 4.318874560375146, + "grad_norm": 0.35546875, + "learning_rate": 9.781981521189283e-05, + "loss": 0.4965, + "step": 3684 + }, + { + "epoch": 4.323563892145369, + "grad_norm": 0.333984375, + "learning_rate": 9.75209605949023e-05, + "loss": 0.4346, + "step": 3688 + }, + { + "epoch": 4.328253223915592, + "grad_norm": 0.365234375, + "learning_rate": 9.722234316089256e-05, + "loss": 0.4458, + "step": 3692 + }, + { + "epoch": 4.332942555685815, + "grad_norm": 0.341796875, + "learning_rate": 9.692396425948768e-05, + "loss": 0.4547, + "step": 3696 + }, + { + "epoch": 4.337631887456038, + "grad_norm": 0.33203125, + "learning_rate": 9.662582523923357e-05, + "loss": 0.4411, + "step": 3700 + }, + { + "epoch": 4.34232121922626, + "grad_norm": 0.330078125, + "learning_rate": 9.632792744759207e-05, + "loss": 0.4091, + "step": 3704 + }, + { + "epoch": 4.347010550996483, + "grad_norm": 0.345703125, + "learning_rate": 9.603027223093474e-05, + "loss": 0.4727, + "step": 3708 + }, + { + "epoch": 4.351699882766706, + "grad_norm": 0.33984375, + "learning_rate": 9.573286093453682e-05, + "loss": 0.4794, + "step": 3712 + }, + { + "epoch": 4.356389214536929, + "grad_norm": 0.361328125, + "learning_rate": 9.543569490257111e-05, + "loss": 0.4868, + "step": 3716 + }, + { + "epoch": 4.3610785463071515, + "grad_norm": 0.365234375, + "learning_rate": 9.513877547810192e-05, + "loss": 0.4537, + "step": 3720 + }, + { + "epoch": 4.365767878077374, + "grad_norm": 0.3515625, + "learning_rate": 9.484210400307903e-05, + "loss": 0.414, + "step": 3724 + }, + { + "epoch": 4.370457209847597, + "grad_norm": 0.322265625, + "learning_rate": 9.454568181833151e-05, + "loss": 0.445, + "step": 3728 + }, + { + "epoch": 4.37514654161782, + "grad_norm": 0.345703125, + "learning_rate": 9.424951026356183e-05, + "loss": 0.4561, + "step": 3732 + }, + { + "epoch": 4.379835873388042, + "grad_norm": 0.33984375, + "learning_rate": 9.395359067733974e-05, + "loss": 0.4429, + "step": 3736 + }, + { + "epoch": 4.384525205158265, + "grad_norm": 0.349609375, + "learning_rate": 9.365792439709609e-05, + "loss": 0.4832, + "step": 3740 + }, + { + "epoch": 4.389214536928487, + "grad_norm": 0.361328125, + "learning_rate": 9.336251275911702e-05, + "loss": 0.4795, + "step": 3744 + }, + { + "epoch": 4.39390386869871, + "grad_norm": 0.33984375, + "learning_rate": 9.306735709853765e-05, + "loss": 0.4701, + "step": 3748 + }, + { + "epoch": 4.398593200468933, + "grad_norm": 0.345703125, + "learning_rate": 9.277245874933633e-05, + "loss": 0.4642, + "step": 3752 + }, + { + "epoch": 4.403282532239156, + "grad_norm": 0.37109375, + "learning_rate": 9.247781904432847e-05, + "loss": 0.4756, + "step": 3756 + }, + { + "epoch": 4.407971864009379, + "grad_norm": 0.341796875, + "learning_rate": 9.218343931516034e-05, + "loss": 0.4885, + "step": 3760 + }, + { + "epoch": 4.412661195779601, + "grad_norm": 0.33984375, + "learning_rate": 9.188932089230338e-05, + "loss": 0.4598, + "step": 3764 + }, + { + "epoch": 4.417350527549824, + "grad_norm": 0.419921875, + "learning_rate": 9.159546510504807e-05, + "loss": 0.4661, + "step": 3768 + }, + { + "epoch": 4.422039859320047, + "grad_norm": 0.359375, + "learning_rate": 9.130187328149779e-05, + "loss": 0.4818, + "step": 3772 + }, + { + "epoch": 4.42672919109027, + "grad_norm": 0.337890625, + "learning_rate": 9.100854674856293e-05, + "loss": 0.4438, + "step": 3776 + }, + { + "epoch": 4.4314185228604925, + "grad_norm": 0.3671875, + "learning_rate": 9.071548683195495e-05, + "loss": 0.4307, + "step": 3780 + }, + { + "epoch": 4.436107854630715, + "grad_norm": 0.333984375, + "learning_rate": 9.042269485618021e-05, + "loss": 0.44, + "step": 3784 + }, + { + "epoch": 4.440797186400938, + "grad_norm": 0.32421875, + "learning_rate": 9.013017214453422e-05, + "loss": 0.4822, + "step": 3788 + }, + { + "epoch": 4.445486518171161, + "grad_norm": 0.349609375, + "learning_rate": 8.983792001909543e-05, + "loss": 0.5057, + "step": 3792 + }, + { + "epoch": 4.450175849941384, + "grad_norm": 0.3515625, + "learning_rate": 8.954593980071941e-05, + "loss": 0.4424, + "step": 3796 + }, + { + "epoch": 4.4548651817116065, + "grad_norm": 0.3671875, + "learning_rate": 8.925423280903274e-05, + "loss": 0.4472, + "step": 3800 + }, + { + "epoch": 4.459554513481828, + "grad_norm": 0.34375, + "learning_rate": 8.896280036242722e-05, + "loss": 0.4449, + "step": 3804 + }, + { + "epoch": 4.464243845252051, + "grad_norm": 0.34375, + "learning_rate": 8.86716437780538e-05, + "loss": 0.4707, + "step": 3808 + }, + { + "epoch": 4.468933177022274, + "grad_norm": 0.345703125, + "learning_rate": 8.838076437181663e-05, + "loss": 0.4591, + "step": 3812 + }, + { + "epoch": 4.473622508792497, + "grad_norm": 0.328125, + "learning_rate": 8.80901634583672e-05, + "loss": 0.4404, + "step": 3816 + }, + { + "epoch": 4.47831184056272, + "grad_norm": 0.349609375, + "learning_rate": 8.779984235109825e-05, + "loss": 0.4562, + "step": 3820 + }, + { + "epoch": 4.483001172332942, + "grad_norm": 0.357421875, + "learning_rate": 8.750980236213792e-05, + "loss": 0.4657, + "step": 3824 + }, + { + "epoch": 4.487690504103165, + "grad_norm": 0.345703125, + "learning_rate": 8.722004480234381e-05, + "loss": 0.4363, + "step": 3828 + }, + { + "epoch": 4.492379835873388, + "grad_norm": 0.333984375, + "learning_rate": 8.693057098129729e-05, + "loss": 0.468, + "step": 3832 + }, + { + "epoch": 4.497069167643611, + "grad_norm": 0.35546875, + "learning_rate": 8.664138220729686e-05, + "loss": 0.5015, + "step": 3836 + }, + { + "epoch": 4.5017584994138335, + "grad_norm": 0.361328125, + "learning_rate": 8.63524797873532e-05, + "loss": 0.4253, + "step": 3840 + }, + { + "epoch": 4.506447831184056, + "grad_norm": 0.349609375, + "learning_rate": 8.606386502718258e-05, + "loss": 0.4468, + "step": 3844 + }, + { + "epoch": 4.511137162954279, + "grad_norm": 0.34375, + "learning_rate": 8.577553923120111e-05, + "loss": 0.481, + "step": 3848 + }, + { + "epoch": 4.515826494724502, + "grad_norm": 0.353515625, + "learning_rate": 8.548750370251915e-05, + "loss": 0.468, + "step": 3852 + }, + { + "epoch": 4.520515826494725, + "grad_norm": 0.3359375, + "learning_rate": 8.519975974293485e-05, + "loss": 0.4859, + "step": 3856 + }, + { + "epoch": 4.5252051582649475, + "grad_norm": 0.322265625, + "learning_rate": 8.49123086529289e-05, + "loss": 0.3966, + "step": 3860 + }, + { + "epoch": 4.52989449003517, + "grad_norm": 0.33203125, + "learning_rate": 8.462515173165817e-05, + "loss": 0.4594, + "step": 3864 + }, + { + "epoch": 4.534583821805393, + "grad_norm": 0.349609375, + "learning_rate": 8.433829027695e-05, + "loss": 0.4901, + "step": 3868 + }, + { + "epoch": 4.539273153575616, + "grad_norm": 0.3359375, + "learning_rate": 8.405172558529643e-05, + "loss": 0.4108, + "step": 3872 + }, + { + "epoch": 4.543962485345839, + "grad_norm": 0.328125, + "learning_rate": 8.376545895184815e-05, + "loss": 0.4007, + "step": 3876 + }, + { + "epoch": 4.548651817116061, + "grad_norm": 0.345703125, + "learning_rate": 8.347949167040894e-05, + "loss": 0.4615, + "step": 3880 + }, + { + "epoch": 4.553341148886283, + "grad_norm": 0.341796875, + "learning_rate": 8.319382503342938e-05, + "loss": 0.4131, + "step": 3884 + }, + { + "epoch": 4.558030480656506, + "grad_norm": 0.357421875, + "learning_rate": 8.290846033200158e-05, + "loss": 0.4706, + "step": 3888 + }, + { + "epoch": 4.562719812426729, + "grad_norm": 0.3359375, + "learning_rate": 8.262339885585274e-05, + "loss": 0.4386, + "step": 3892 + }, + { + "epoch": 4.567409144196952, + "grad_norm": 0.349609375, + "learning_rate": 8.233864189333967e-05, + "loss": 0.4437, + "step": 3896 + }, + { + "epoch": 4.572098475967175, + "grad_norm": 0.34765625, + "learning_rate": 8.20541907314431e-05, + "loss": 0.4487, + "step": 3900 + }, + { + "epoch": 4.576787807737397, + "grad_norm": 0.33203125, + "learning_rate": 8.177004665576147e-05, + "loss": 0.4597, + "step": 3904 + }, + { + "epoch": 4.58147713950762, + "grad_norm": 0.345703125, + "learning_rate": 8.148621095050537e-05, + "loss": 0.4854, + "step": 3908 + }, + { + "epoch": 4.586166471277843, + "grad_norm": 0.3359375, + "learning_rate": 8.120268489849164e-05, + "loss": 0.4578, + "step": 3912 + }, + { + "epoch": 4.590855803048066, + "grad_norm": 0.353515625, + "learning_rate": 8.091946978113782e-05, + "loss": 0.4828, + "step": 3916 + }, + { + "epoch": 4.5955451348182885, + "grad_norm": 0.3515625, + "learning_rate": 8.063656687845592e-05, + "loss": 0.4351, + "step": 3920 + }, + { + "epoch": 4.600234466588511, + "grad_norm": 0.349609375, + "learning_rate": 8.035397746904695e-05, + "loss": 0.4401, + "step": 3924 + }, + { + "epoch": 4.604923798358734, + "grad_norm": 0.341796875, + "learning_rate": 8.007170283009517e-05, + "loss": 0.4649, + "step": 3928 + }, + { + "epoch": 4.609613130128957, + "grad_norm": 0.34375, + "learning_rate": 7.978974423736202e-05, + "loss": 0.4128, + "step": 3932 + }, + { + "epoch": 4.61430246189918, + "grad_norm": 0.34765625, + "learning_rate": 7.950810296518076e-05, + "loss": 0.4591, + "step": 3936 + }, + { + "epoch": 4.6189917936694025, + "grad_norm": 0.349609375, + "learning_rate": 7.922678028645032e-05, + "loss": 0.5247, + "step": 3940 + }, + { + "epoch": 4.623681125439624, + "grad_norm": 0.359375, + "learning_rate": 7.89457774726298e-05, + "loss": 0.4786, + "step": 3944 + }, + { + "epoch": 4.628370457209847, + "grad_norm": 0.326171875, + "learning_rate": 7.866509579373261e-05, + "loss": 0.4466, + "step": 3948 + }, + { + "epoch": 4.63305978898007, + "grad_norm": 0.34375, + "learning_rate": 7.838473651832077e-05, + "loss": 0.4672, + "step": 3952 + }, + { + "epoch": 4.637749120750293, + "grad_norm": 0.341796875, + "learning_rate": 7.810470091349925e-05, + "loss": 0.454, + "step": 3956 + }, + { + "epoch": 4.642438452520516, + "grad_norm": 0.32421875, + "learning_rate": 7.782499024491004e-05, + "loss": 0.4429, + "step": 3960 + }, + { + "epoch": 4.647127784290738, + "grad_norm": 0.37109375, + "learning_rate": 7.754560577672674e-05, + "loss": 0.4261, + "step": 3964 + }, + { + "epoch": 4.651817116060961, + "grad_norm": 0.353515625, + "learning_rate": 7.726654877164847e-05, + "loss": 0.4814, + "step": 3968 + }, + { + "epoch": 4.656506447831184, + "grad_norm": 0.328125, + "learning_rate": 7.698782049089438e-05, + "loss": 0.4318, + "step": 3972 + }, + { + "epoch": 4.661195779601407, + "grad_norm": 0.33984375, + "learning_rate": 7.67094221941981e-05, + "loss": 0.4282, + "step": 3976 + }, + { + "epoch": 4.6658851113716295, + "grad_norm": 0.341796875, + "learning_rate": 7.64313551398017e-05, + "loss": 0.4523, + "step": 3980 + }, + { + "epoch": 4.670574443141852, + "grad_norm": 0.34375, + "learning_rate": 7.615362058445022e-05, + "loss": 0.4633, + "step": 3984 + }, + { + "epoch": 4.675263774912075, + "grad_norm": 0.337890625, + "learning_rate": 7.587621978338586e-05, + "loss": 0.4672, + "step": 3988 + }, + { + "epoch": 4.679953106682298, + "grad_norm": 0.33203125, + "learning_rate": 7.559915399034266e-05, + "loss": 0.4357, + "step": 3992 + }, + { + "epoch": 4.684642438452521, + "grad_norm": 0.33984375, + "learning_rate": 7.532242445754029e-05, + "loss": 0.4189, + "step": 3996 + }, + { + "epoch": 4.6893317702227435, + "grad_norm": 0.353515625, + "learning_rate": 7.504603243567874e-05, + "loss": 0.4741, + "step": 4000 + }, + { + "epoch": 4.694021101992966, + "grad_norm": 0.357421875, + "learning_rate": 7.476997917393269e-05, + "loss": 0.4782, + "step": 4004 + }, + { + "epoch": 4.698710433763189, + "grad_norm": 0.337890625, + "learning_rate": 7.449426591994565e-05, + "loss": 0.4787, + "step": 4008 + }, + { + "epoch": 4.703399765533412, + "grad_norm": 0.36328125, + "learning_rate": 7.421889391982454e-05, + "loss": 0.4287, + "step": 4012 + }, + { + "epoch": 4.708089097303635, + "grad_norm": 0.35546875, + "learning_rate": 7.394386441813388e-05, + "loss": 0.4493, + "step": 4016 + }, + { + "epoch": 4.7127784290738575, + "grad_norm": 0.33203125, + "learning_rate": 7.366917865789027e-05, + "loss": 0.4431, + "step": 4020 + }, + { + "epoch": 4.717467760844079, + "grad_norm": 0.32421875, + "learning_rate": 7.339483788055672e-05, + "loss": 0.3989, + "step": 4024 + }, + { + "epoch": 4.722157092614302, + "grad_norm": 0.359375, + "learning_rate": 7.312084332603706e-05, + "loss": 0.4511, + "step": 4028 + }, + { + "epoch": 4.726846424384525, + "grad_norm": 0.375, + "learning_rate": 7.284719623267044e-05, + "loss": 0.4183, + "step": 4032 + }, + { + "epoch": 4.731535756154748, + "grad_norm": 0.333984375, + "learning_rate": 7.257389783722548e-05, + "loss": 0.4115, + "step": 4036 + }, + { + "epoch": 4.7362250879249705, + "grad_norm": 0.341796875, + "learning_rate": 7.2300949374895e-05, + "loss": 0.4635, + "step": 4040 + }, + { + "epoch": 4.740914419695193, + "grad_norm": 0.333984375, + "learning_rate": 7.202835207929014e-05, + "loss": 0.4566, + "step": 4044 + }, + { + "epoch": 4.745603751465416, + "grad_norm": 0.32421875, + "learning_rate": 7.175610718243493e-05, + "loss": 0.4718, + "step": 4048 + }, + { + "epoch": 4.750293083235639, + "grad_norm": 0.33984375, + "learning_rate": 7.148421591476086e-05, + "loss": 0.4123, + "step": 4052 + }, + { + "epoch": 4.754982415005862, + "grad_norm": 0.33984375, + "learning_rate": 7.121267950510082e-05, + "loss": 0.4439, + "step": 4056 + }, + { + "epoch": 4.7596717467760845, + "grad_norm": 0.337890625, + "learning_rate": 7.094149918068432e-05, + "loss": 0.4509, + "step": 4060 + }, + { + "epoch": 4.764361078546307, + "grad_norm": 0.326171875, + "learning_rate": 7.067067616713117e-05, + "loss": 0.4291, + "step": 4064 + }, + { + "epoch": 4.76905041031653, + "grad_norm": 0.32421875, + "learning_rate": 7.040021168844653e-05, + "loss": 0.4565, + "step": 4068 + }, + { + "epoch": 4.773739742086753, + "grad_norm": 0.357421875, + "learning_rate": 7.013010696701502e-05, + "loss": 0.4885, + "step": 4072 + }, + { + "epoch": 4.778429073856976, + "grad_norm": 0.333984375, + "learning_rate": 6.986036322359522e-05, + "loss": 0.3944, + "step": 4076 + }, + { + "epoch": 4.7831184056271985, + "grad_norm": 0.3515625, + "learning_rate": 6.959098167731447e-05, + "loss": 0.4708, + "step": 4080 + }, + { + "epoch": 4.78780773739742, + "grad_norm": 0.353515625, + "learning_rate": 6.93219635456629e-05, + "loss": 0.4989, + "step": 4084 + }, + { + "epoch": 4.792497069167643, + "grad_norm": 0.3359375, + "learning_rate": 6.905331004448843e-05, + "loss": 0.4702, + "step": 4088 + }, + { + "epoch": 4.797186400937866, + "grad_norm": 0.353515625, + "learning_rate": 6.878502238799062e-05, + "loss": 0.4528, + "step": 4092 + }, + { + "epoch": 4.801875732708089, + "grad_norm": 0.341796875, + "learning_rate": 6.851710178871596e-05, + "loss": 0.4384, + "step": 4096 + }, + { + "epoch": 4.8065650644783116, + "grad_norm": 0.337890625, + "learning_rate": 6.824954945755177e-05, + "loss": 0.4377, + "step": 4100 + }, + { + "epoch": 4.811254396248534, + "grad_norm": 0.333984375, + "learning_rate": 6.798236660372095e-05, + "loss": 0.4406, + "step": 4104 + }, + { + "epoch": 4.815943728018757, + "grad_norm": 0.345703125, + "learning_rate": 6.77155544347767e-05, + "loss": 0.4855, + "step": 4108 + }, + { + "epoch": 4.82063305978898, + "grad_norm": 0.357421875, + "learning_rate": 6.744911415659665e-05, + "loss": 0.4849, + "step": 4112 + }, + { + "epoch": 4.825322391559203, + "grad_norm": 0.35546875, + "learning_rate": 6.718304697337785e-05, + "loss": 0.4904, + "step": 4116 + }, + { + "epoch": 4.8300117233294255, + "grad_norm": 0.3515625, + "learning_rate": 6.691735408763097e-05, + "loss": 0.4598, + "step": 4120 + }, + { + "epoch": 4.834701055099648, + "grad_norm": 0.34375, + "learning_rate": 6.66520367001751e-05, + "loss": 0.4602, + "step": 4124 + }, + { + "epoch": 4.839390386869871, + "grad_norm": 0.345703125, + "learning_rate": 6.638709601013215e-05, + "loss": 0.4598, + "step": 4128 + }, + { + "epoch": 4.844079718640094, + "grad_norm": 0.365234375, + "learning_rate": 6.612253321492157e-05, + "loss": 0.438, + "step": 4132 + }, + { + "epoch": 4.848769050410317, + "grad_norm": 0.349609375, + "learning_rate": 6.585834951025496e-05, + "loss": 0.4723, + "step": 4136 + }, + { + "epoch": 4.8534583821805395, + "grad_norm": 0.376953125, + "learning_rate": 6.559454609013043e-05, + "loss": 0.4737, + "step": 4140 + }, + { + "epoch": 4.858147713950762, + "grad_norm": 0.359375, + "learning_rate": 6.533112414682754e-05, + "loss": 0.4772, + "step": 4144 + }, + { + "epoch": 4.862837045720985, + "grad_norm": 0.34765625, + "learning_rate": 6.506808487090163e-05, + "loss": 0.4312, + "step": 4148 + }, + { + "epoch": 4.867526377491208, + "grad_norm": 0.349609375, + "learning_rate": 6.48054294511785e-05, + "loss": 0.4765, + "step": 4152 + }, + { + "epoch": 4.872215709261431, + "grad_norm": 0.359375, + "learning_rate": 6.454315907474926e-05, + "loss": 0.4566, + "step": 4156 + }, + { + "epoch": 4.8769050410316535, + "grad_norm": 0.337890625, + "learning_rate": 6.428127492696454e-05, + "loss": 0.4499, + "step": 4160 + }, + { + "epoch": 4.881594372801875, + "grad_norm": 0.349609375, + "learning_rate": 6.401977819142972e-05, + "loss": 0.4875, + "step": 4164 + }, + { + "epoch": 4.886283704572098, + "grad_norm": 0.357421875, + "learning_rate": 6.375867004999882e-05, + "loss": 0.4595, + "step": 4168 + }, + { + "epoch": 4.890973036342321, + "grad_norm": 0.353515625, + "learning_rate": 6.349795168276994e-05, + "loss": 0.4624, + "step": 4172 + }, + { + "epoch": 4.895662368112544, + "grad_norm": 0.36328125, + "learning_rate": 6.323762426807939e-05, + "loss": 0.4611, + "step": 4176 + }, + { + "epoch": 4.9003516998827665, + "grad_norm": 0.328125, + "learning_rate": 6.297768898249649e-05, + "loss": 0.4109, + "step": 4180 + }, + { + "epoch": 4.905041031652989, + "grad_norm": 0.33984375, + "learning_rate": 6.271814700081852e-05, + "loss": 0.4552, + "step": 4184 + }, + { + "epoch": 4.909730363423212, + "grad_norm": 0.33203125, + "learning_rate": 6.245899949606498e-05, + "loss": 0.4127, + "step": 4188 + }, + { + "epoch": 4.914419695193435, + "grad_norm": 0.34765625, + "learning_rate": 6.220024763947263e-05, + "loss": 0.4686, + "step": 4192 + }, + { + "epoch": 4.919109026963658, + "grad_norm": 0.365234375, + "learning_rate": 6.194189260049003e-05, + "loss": 0.4711, + "step": 4196 + }, + { + "epoch": 4.9237983587338805, + "grad_norm": 0.328125, + "learning_rate": 6.168393554677224e-05, + "loss": 0.4401, + "step": 4200 + }, + { + "epoch": 4.928487690504103, + "grad_norm": 0.34765625, + "learning_rate": 6.142637764417566e-05, + "loss": 0.4706, + "step": 4204 + }, + { + "epoch": 4.933177022274326, + "grad_norm": 0.36328125, + "learning_rate": 6.116922005675262e-05, + "loss": 0.4727, + "step": 4208 + }, + { + "epoch": 4.937866354044549, + "grad_norm": 0.337890625, + "learning_rate": 6.0912463946746346e-05, + "loss": 0.4737, + "step": 4212 + }, + { + "epoch": 4.942555685814772, + "grad_norm": 0.337890625, + "learning_rate": 6.065611047458538e-05, + "loss": 0.4833, + "step": 4216 + }, + { + "epoch": 4.9472450175849945, + "grad_norm": 0.330078125, + "learning_rate": 6.04001607988787e-05, + "loss": 0.4372, + "step": 4220 + }, + { + "epoch": 4.951934349355217, + "grad_norm": 0.35546875, + "learning_rate": 6.0144616076410114e-05, + "loss": 0.4829, + "step": 4224 + }, + { + "epoch": 4.956623681125439, + "grad_norm": 0.33984375, + "learning_rate": 5.9889477462133234e-05, + "loss": 0.428, + "step": 4228 + }, + { + "epoch": 4.961313012895662, + "grad_norm": 0.361328125, + "learning_rate": 5.963474610916643e-05, + "loss": 0.4963, + "step": 4232 + }, + { + "epoch": 4.966002344665885, + "grad_norm": 0.359375, + "learning_rate": 5.938042316878719e-05, + "loss": 0.5161, + "step": 4236 + }, + { + "epoch": 4.9706916764361075, + "grad_norm": 0.373046875, + "learning_rate": 5.912650979042729e-05, + "loss": 0.4917, + "step": 4240 + }, + { + "epoch": 4.97538100820633, + "grad_norm": 0.36328125, + "learning_rate": 5.8873007121667314e-05, + "loss": 0.4658, + "step": 4244 + }, + { + "epoch": 4.980070339976553, + "grad_norm": 0.34765625, + "learning_rate": 5.861991630823185e-05, + "loss": 0.4614, + "step": 4248 + }, + { + "epoch": 4.984759671746776, + "grad_norm": 0.337890625, + "learning_rate": 5.8367238493983885e-05, + "loss": 0.428, + "step": 4252 + }, + { + "epoch": 4.989449003516999, + "grad_norm": 0.333984375, + "learning_rate": 5.81149748209198e-05, + "loss": 0.4652, + "step": 4256 + }, + { + "epoch": 4.9941383352872215, + "grad_norm": 0.357421875, + "learning_rate": 5.7863126429164445e-05, + "loss": 0.4367, + "step": 4260 + }, + { + "epoch": 4.998827667057444, + "grad_norm": 0.328125, + "learning_rate": 5.761169445696552e-05, + "loss": 0.4746, + "step": 4264 + }, + { + "epoch": 5.003516998827667, + "grad_norm": 0.306640625, + "learning_rate": 5.7360680040688915e-05, + "loss": 0.5014, + "step": 4268 + }, + { + "epoch": 5.00820633059789, + "grad_norm": 0.3046875, + "learning_rate": 5.711008431481318e-05, + "loss": 0.4015, + "step": 4272 + }, + { + "epoch": 5.012895662368113, + "grad_norm": 0.298828125, + "learning_rate": 5.6859908411924634e-05, + "loss": 0.3825, + "step": 4276 + }, + { + "epoch": 5.0175849941383355, + "grad_norm": 0.34765625, + "learning_rate": 5.6610153462712144e-05, + "loss": 0.4382, + "step": 4280 + }, + { + "epoch": 5.022274325908558, + "grad_norm": 0.333984375, + "learning_rate": 5.6360820595962e-05, + "loss": 0.4462, + "step": 4284 + }, + { + "epoch": 5.026963657678781, + "grad_norm": 0.349609375, + "learning_rate": 5.611191093855304e-05, + "loss": 0.4038, + "step": 4288 + }, + { + "epoch": 5.031652989449004, + "grad_norm": 0.345703125, + "learning_rate": 5.5863425615451144e-05, + "loss": 0.434, + "step": 4292 + }, + { + "epoch": 5.036342321219227, + "grad_norm": 0.318359375, + "learning_rate": 5.5615365749704586e-05, + "loss": 0.4023, + "step": 4296 + }, + { + "epoch": 5.041031652989449, + "grad_norm": 0.341796875, + "learning_rate": 5.536773246243861e-05, + "loss": 0.3909, + "step": 4300 + }, + { + "epoch": 5.045720984759671, + "grad_norm": 0.359375, + "learning_rate": 5.512052687285052e-05, + "loss": 0.4292, + "step": 4304 + }, + { + "epoch": 5.050410316529894, + "grad_norm": 0.3203125, + "learning_rate": 5.487375009820477e-05, + "loss": 0.4089, + "step": 4308 + }, + { + "epoch": 5.055099648300117, + "grad_norm": 0.34765625, + "learning_rate": 5.4627403253827436e-05, + "loss": 0.442, + "step": 4312 + }, + { + "epoch": 5.05978898007034, + "grad_norm": 0.3359375, + "learning_rate": 5.438148745310182e-05, + "loss": 0.4045, + "step": 4316 + }, + { + "epoch": 5.0644783118405625, + "grad_norm": 0.330078125, + "learning_rate": 5.413600380746286e-05, + "loss": 0.3942, + "step": 4320 + }, + { + "epoch": 5.069167643610785, + "grad_norm": 0.3359375, + "learning_rate": 5.3890953426392544e-05, + "loss": 0.3925, + "step": 4324 + }, + { + "epoch": 5.073856975381008, + "grad_norm": 0.35546875, + "learning_rate": 5.364633741741448e-05, + "loss": 0.4398, + "step": 4328 + }, + { + "epoch": 5.078546307151231, + "grad_norm": 0.359375, + "learning_rate": 5.340215688608918e-05, + "loss": 0.4871, + "step": 4332 + }, + { + "epoch": 5.083235638921454, + "grad_norm": 0.365234375, + "learning_rate": 5.315841293600906e-05, + "loss": 0.4335, + "step": 4336 + }, + { + "epoch": 5.0879249706916765, + "grad_norm": 0.33203125, + "learning_rate": 5.2915106668793214e-05, + "loss": 0.4057, + "step": 4340 + }, + { + "epoch": 5.092614302461899, + "grad_norm": 0.357421875, + "learning_rate": 5.2672239184082845e-05, + "loss": 0.4184, + "step": 4344 + }, + { + "epoch": 5.097303634232122, + "grad_norm": 0.3359375, + "learning_rate": 5.242981157953567e-05, + "loss": 0.399, + "step": 4348 + }, + { + "epoch": 5.101992966002345, + "grad_norm": 0.341796875, + "learning_rate": 5.2187824950821725e-05, + "loss": 0.4237, + "step": 4352 + }, + { + "epoch": 5.106682297772568, + "grad_norm": 0.357421875, + "learning_rate": 5.194628039161778e-05, + "loss": 0.454, + "step": 4356 + }, + { + "epoch": 5.1113716295427905, + "grad_norm": 0.3671875, + "learning_rate": 5.170517899360267e-05, + "loss": 0.4397, + "step": 4360 + }, + { + "epoch": 5.116060961313013, + "grad_norm": 0.3515625, + "learning_rate": 5.1464521846452464e-05, + "loss": 0.4265, + "step": 4364 + }, + { + "epoch": 5.120750293083236, + "grad_norm": 0.35546875, + "learning_rate": 5.12243100378352e-05, + "loss": 0.4019, + "step": 4368 + }, + { + "epoch": 5.125439624853458, + "grad_norm": 0.3359375, + "learning_rate": 5.098454465340638e-05, + "loss": 0.3929, + "step": 4372 + }, + { + "epoch": 5.130128956623681, + "grad_norm": 0.337890625, + "learning_rate": 5.074522677680372e-05, + "loss": 0.416, + "step": 4376 + }, + { + "epoch": 5.1348182883939035, + "grad_norm": 0.34375, + "learning_rate": 5.050635748964239e-05, + "loss": 0.4315, + "step": 4380 + }, + { + "epoch": 5.139507620164126, + "grad_norm": 0.341796875, + "learning_rate": 5.0267937871510304e-05, + "loss": 0.3818, + "step": 4384 + }, + { + "epoch": 5.144196951934349, + "grad_norm": 0.33984375, + "learning_rate": 5.0029968999962726e-05, + "loss": 0.3975, + "step": 4388 + }, + { + "epoch": 5.148886283704572, + "grad_norm": 0.357421875, + "learning_rate": 4.97924519505181e-05, + "loss": 0.4051, + "step": 4392 + }, + { + "epoch": 5.153575615474795, + "grad_norm": 0.330078125, + "learning_rate": 4.955538779665256e-05, + "loss": 0.3967, + "step": 4396 + }, + { + "epoch": 5.1582649472450175, + "grad_norm": 0.328125, + "learning_rate": 4.9318777609795536e-05, + "loss": 0.4136, + "step": 4400 + }, + { + "epoch": 5.16295427901524, + "grad_norm": 0.34765625, + "learning_rate": 4.90826224593246e-05, + "loss": 0.4419, + "step": 4404 + }, + { + "epoch": 5.167643610785463, + "grad_norm": 0.349609375, + "learning_rate": 4.884692341256072e-05, + "loss": 0.45, + "step": 4408 + }, + { + "epoch": 5.172332942555686, + "grad_norm": 0.330078125, + "learning_rate": 4.8611681534763635e-05, + "loss": 0.389, + "step": 4412 + }, + { + "epoch": 5.177022274325909, + "grad_norm": 0.3515625, + "learning_rate": 4.837689788912667e-05, + "loss": 0.4161, + "step": 4416 + }, + { + "epoch": 5.1817116060961315, + "grad_norm": 0.34375, + "learning_rate": 4.814257353677241e-05, + "loss": 0.3891, + "step": 4420 + }, + { + "epoch": 5.186400937866354, + "grad_norm": 0.35546875, + "learning_rate": 4.7908709536747224e-05, + "loss": 0.4633, + "step": 4424 + }, + { + "epoch": 5.191090269636577, + "grad_norm": 0.333984375, + "learning_rate": 4.7675306946017296e-05, + "loss": 0.4279, + "step": 4428 + }, + { + "epoch": 5.1957796014068, + "grad_norm": 0.357421875, + "learning_rate": 4.74423668194632e-05, + "loss": 0.4269, + "step": 4432 + }, + { + "epoch": 5.200468933177023, + "grad_norm": 0.330078125, + "learning_rate": 4.720989020987535e-05, + "loss": 0.4207, + "step": 4436 + }, + { + "epoch": 5.205158264947245, + "grad_norm": 0.3515625, + "learning_rate": 4.697787816794947e-05, + "loss": 0.492, + "step": 4440 + }, + { + "epoch": 5.209847596717467, + "grad_norm": 0.369140625, + "learning_rate": 4.674633174228138e-05, + "loss": 0.4622, + "step": 4444 + }, + { + "epoch": 5.21453692848769, + "grad_norm": 0.34375, + "learning_rate": 4.651525197936275e-05, + "loss": 0.4413, + "step": 4448 + }, + { + "epoch": 5.219226260257913, + "grad_norm": 0.357421875, + "learning_rate": 4.6284639923575934e-05, + "loss": 0.4275, + "step": 4452 + }, + { + "epoch": 5.223915592028136, + "grad_norm": 0.341796875, + "learning_rate": 4.6054496617189554e-05, + "loss": 0.3943, + "step": 4456 + }, + { + "epoch": 5.2286049237983585, + "grad_norm": 0.345703125, + "learning_rate": 4.582482310035365e-05, + "loss": 0.4183, + "step": 4460 + }, + { + "epoch": 5.233294255568581, + "grad_norm": 0.349609375, + "learning_rate": 4.559562041109499e-05, + "loss": 0.439, + "step": 4464 + }, + { + "epoch": 5.237983587338804, + "grad_norm": 0.34765625, + "learning_rate": 4.53668895853125e-05, + "loss": 0.4362, + "step": 4468 + }, + { + "epoch": 5.242672919109027, + "grad_norm": 0.33203125, + "learning_rate": 4.5138631656772346e-05, + "loss": 0.4021, + "step": 4472 + }, + { + "epoch": 5.24736225087925, + "grad_norm": 0.359375, + "learning_rate": 4.4910847657103555e-05, + "loss": 0.4298, + "step": 4476 + }, + { + "epoch": 5.2520515826494725, + "grad_norm": 0.353515625, + "learning_rate": 4.468353861579306e-05, + "loss": 0.3793, + "step": 4480 + }, + { + "epoch": 5.256740914419695, + "grad_norm": 0.3359375, + "learning_rate": 4.44567055601812e-05, + "loss": 0.3934, + "step": 4484 + }, + { + "epoch": 5.261430246189918, + "grad_norm": 0.361328125, + "learning_rate": 4.423034951545718e-05, + "loss": 0.3962, + "step": 4488 + }, + { + "epoch": 5.266119577960141, + "grad_norm": 0.341796875, + "learning_rate": 4.4004471504654196e-05, + "loss": 0.4206, + "step": 4492 + }, + { + "epoch": 5.270808909730364, + "grad_norm": 0.330078125, + "learning_rate": 4.377907254864496e-05, + "loss": 0.3844, + "step": 4496 + }, + { + "epoch": 5.275498241500586, + "grad_norm": 0.345703125, + "learning_rate": 4.355415366613702e-05, + "loss": 0.4442, + "step": 4500 + }, + { + "epoch": 5.280187573270809, + "grad_norm": 0.341796875, + "learning_rate": 4.332971587366837e-05, + "loss": 0.4124, + "step": 4504 + }, + { + "epoch": 5.284876905041032, + "grad_norm": 0.34375, + "learning_rate": 4.3105760185602476e-05, + "loss": 0.443, + "step": 4508 + }, + { + "epoch": 5.289566236811254, + "grad_norm": 0.37890625, + "learning_rate": 4.2882287614123965e-05, + "loss": 0.4309, + "step": 4512 + }, + { + "epoch": 5.294255568581477, + "grad_norm": 0.353515625, + "learning_rate": 4.2659299169234056e-05, + "loss": 0.4248, + "step": 4516 + }, + { + "epoch": 5.2989449003516995, + "grad_norm": 0.3671875, + "learning_rate": 4.24367958587458e-05, + "loss": 0.4564, + "step": 4520 + }, + { + "epoch": 5.303634232121922, + "grad_norm": 0.349609375, + "learning_rate": 4.221477868827978e-05, + "loss": 0.4004, + "step": 4524 + }, + { + "epoch": 5.308323563892145, + "grad_norm": 0.36328125, + "learning_rate": 4.1993248661259324e-05, + "loss": 0.4194, + "step": 4528 + }, + { + "epoch": 5.313012895662368, + "grad_norm": 0.34375, + "learning_rate": 4.1772206778906104e-05, + "loss": 0.3892, + "step": 4532 + }, + { + "epoch": 5.317702227432591, + "grad_norm": 0.359375, + "learning_rate": 4.155165404023561e-05, + "loss": 0.415, + "step": 4536 + }, + { + "epoch": 5.3223915592028135, + "grad_norm": 0.365234375, + "learning_rate": 4.1331591442052534e-05, + "loss": 0.4516, + "step": 4540 + }, + { + "epoch": 5.327080890973036, + "grad_norm": 0.369140625, + "learning_rate": 4.111201997894651e-05, + "loss": 0.3757, + "step": 4544 + }, + { + "epoch": 5.331770222743259, + "grad_norm": 0.3359375, + "learning_rate": 4.089294064328725e-05, + "loss": 0.3921, + "step": 4548 + }, + { + "epoch": 5.336459554513482, + "grad_norm": 0.373046875, + "learning_rate": 4.067435442522043e-05, + "loss": 0.4161, + "step": 4552 + }, + { + "epoch": 5.341148886283705, + "grad_norm": 0.3671875, + "learning_rate": 4.045626231266294e-05, + "loss": 0.455, + "step": 4556 + }, + { + "epoch": 5.3458382180539274, + "grad_norm": 0.33984375, + "learning_rate": 4.023866529129848e-05, + "loss": 0.424, + "step": 4560 + }, + { + "epoch": 5.35052754982415, + "grad_norm": 0.345703125, + "learning_rate": 4.002156434457333e-05, + "loss": 0.4199, + "step": 4564 + }, + { + "epoch": 5.355216881594373, + "grad_norm": 0.373046875, + "learning_rate": 3.980496045369155e-05, + "loss": 0.4172, + "step": 4568 + }, + { + "epoch": 5.359906213364596, + "grad_norm": 0.337890625, + "learning_rate": 3.95888545976108e-05, + "loss": 0.4208, + "step": 4572 + }, + { + "epoch": 5.364595545134819, + "grad_norm": 0.3359375, + "learning_rate": 3.937324775303773e-05, + "loss": 0.3733, + "step": 4576 + }, + { + "epoch": 5.369284876905041, + "grad_norm": 0.369140625, + "learning_rate": 3.915814089442388e-05, + "loss": 0.4418, + "step": 4580 + }, + { + "epoch": 5.373974208675264, + "grad_norm": 0.380859375, + "learning_rate": 3.894353499396086e-05, + "loss": 0.4322, + "step": 4584 + }, + { + "epoch": 5.378663540445486, + "grad_norm": 0.337890625, + "learning_rate": 3.872943102157622e-05, + "loss": 0.4462, + "step": 4588 + }, + { + "epoch": 5.383352872215709, + "grad_norm": 0.345703125, + "learning_rate": 3.851582994492912e-05, + "loss": 0.4307, + "step": 4592 + }, + { + "epoch": 5.388042203985932, + "grad_norm": 0.337890625, + "learning_rate": 3.830273272940564e-05, + "loss": 0.4003, + "step": 4596 + }, + { + "epoch": 5.3927315357561545, + "grad_norm": 0.35546875, + "learning_rate": 3.8090140338114843e-05, + "loss": 0.4188, + "step": 4600 + }, + { + "epoch": 5.397420867526377, + "grad_norm": 0.349609375, + "learning_rate": 3.787805373188405e-05, + "loss": 0.4103, + "step": 4604 + }, + { + "epoch": 5.4021101992966, + "grad_norm": 0.357421875, + "learning_rate": 3.766647386925467e-05, + "loss": 0.4376, + "step": 4608 + }, + { + "epoch": 5.406799531066823, + "grad_norm": 0.333984375, + "learning_rate": 3.745540170647788e-05, + "loss": 0.4164, + "step": 4612 + }, + { + "epoch": 5.411488862837046, + "grad_norm": 0.369140625, + "learning_rate": 3.724483819751022e-05, + "loss": 0.4355, + "step": 4616 + }, + { + "epoch": 5.4161781946072685, + "grad_norm": 0.36328125, + "learning_rate": 3.703478429400945e-05, + "loss": 0.4503, + "step": 4620 + }, + { + "epoch": 5.420867526377491, + "grad_norm": 0.326171875, + "learning_rate": 3.6825240945329946e-05, + "loss": 0.4262, + "step": 4624 + }, + { + "epoch": 5.425556858147714, + "grad_norm": 0.33984375, + "learning_rate": 3.661620909851878e-05, + "loss": 0.4333, + "step": 4628 + }, + { + "epoch": 5.430246189917937, + "grad_norm": 0.349609375, + "learning_rate": 3.640768969831113e-05, + "loss": 0.4332, + "step": 4632 + }, + { + "epoch": 5.43493552168816, + "grad_norm": 0.34375, + "learning_rate": 3.619968368712613e-05, + "loss": 0.3988, + "step": 4636 + }, + { + "epoch": 5.439624853458382, + "grad_norm": 0.353515625, + "learning_rate": 3.599219200506277e-05, + "loss": 0.3978, + "step": 4640 + }, + { + "epoch": 5.444314185228605, + "grad_norm": 0.34765625, + "learning_rate": 3.5785215589895224e-05, + "loss": 0.4265, + "step": 4644 + }, + { + "epoch": 5.449003516998828, + "grad_norm": 0.3359375, + "learning_rate": 3.557875537706914e-05, + "loss": 0.4031, + "step": 4648 + }, + { + "epoch": 5.453692848769051, + "grad_norm": 0.357421875, + "learning_rate": 3.5372812299696934e-05, + "loss": 0.4482, + "step": 4652 + }, + { + "epoch": 5.458382180539273, + "grad_norm": 0.341796875, + "learning_rate": 3.5167387288554014e-05, + "loss": 0.3847, + "step": 4656 + }, + { + "epoch": 5.4630715123094955, + "grad_norm": 0.328125, + "learning_rate": 3.496248127207415e-05, + "loss": 0.4316, + "step": 4660 + }, + { + "epoch": 5.467760844079718, + "grad_norm": 0.341796875, + "learning_rate": 3.475809517634554e-05, + "loss": 0.4049, + "step": 4664 + }, + { + "epoch": 5.472450175849941, + "grad_norm": 0.345703125, + "learning_rate": 3.455422992510664e-05, + "loss": 0.43, + "step": 4668 + }, + { + "epoch": 5.477139507620164, + "grad_norm": 0.361328125, + "learning_rate": 3.435088643974177e-05, + "loss": 0.4267, + "step": 4672 + }, + { + "epoch": 5.481828839390387, + "grad_norm": 0.3515625, + "learning_rate": 3.41480656392773e-05, + "loss": 0.4264, + "step": 4676 + }, + { + "epoch": 5.4865181711606095, + "grad_norm": 0.33984375, + "learning_rate": 3.394576844037695e-05, + "loss": 0.3992, + "step": 4680 + }, + { + "epoch": 5.491207502930832, + "grad_norm": 0.369140625, + "learning_rate": 3.374399575733835e-05, + "loss": 0.4328, + "step": 4684 + }, + { + "epoch": 5.495896834701055, + "grad_norm": 0.34375, + "learning_rate": 3.3542748502088325e-05, + "loss": 0.4447, + "step": 4688 + }, + { + "epoch": 5.500586166471278, + "grad_norm": 0.369140625, + "learning_rate": 3.334202758417896e-05, + "loss": 0.4483, + "step": 4692 + }, + { + "epoch": 5.505275498241501, + "grad_norm": 0.373046875, + "learning_rate": 3.314183391078373e-05, + "loss": 0.4673, + "step": 4696 + }, + { + "epoch": 5.509964830011723, + "grad_norm": 0.35546875, + "learning_rate": 3.294216838669295e-05, + "loss": 0.4609, + "step": 4700 + }, + { + "epoch": 5.514654161781946, + "grad_norm": 0.34765625, + "learning_rate": 3.2743031914310104e-05, + "loss": 0.4391, + "step": 4704 + }, + { + "epoch": 5.519343493552169, + "grad_norm": 0.357421875, + "learning_rate": 3.254442539364749e-05, + "loss": 0.4274, + "step": 4708 + }, + { + "epoch": 5.524032825322392, + "grad_norm": 0.341796875, + "learning_rate": 3.2346349722322274e-05, + "loss": 0.3941, + "step": 4712 + }, + { + "epoch": 5.528722157092615, + "grad_norm": 0.35546875, + "learning_rate": 3.2148805795552406e-05, + "loss": 0.3779, + "step": 4716 + }, + { + "epoch": 5.533411488862837, + "grad_norm": 0.353515625, + "learning_rate": 3.195179450615252e-05, + "loss": 0.3988, + "step": 4720 + }, + { + "epoch": 5.53810082063306, + "grad_norm": 0.3671875, + "learning_rate": 3.175531674453012e-05, + "loss": 0.4613, + "step": 4724 + }, + { + "epoch": 5.542790152403283, + "grad_norm": 0.34765625, + "learning_rate": 3.155937339868117e-05, + "loss": 0.4266, + "step": 4728 + }, + { + "epoch": 5.547479484173505, + "grad_norm": 0.373046875, + "learning_rate": 3.136396535418653e-05, + "loss": 0.4355, + "step": 4732 + }, + { + "epoch": 5.552168815943728, + "grad_norm": 0.37890625, + "learning_rate": 3.1169093494207547e-05, + "loss": 0.4324, + "step": 4736 + }, + { + "epoch": 5.5568581477139505, + "grad_norm": 0.35546875, + "learning_rate": 3.097475869948228e-05, + "loss": 0.4061, + "step": 4740 + }, + { + "epoch": 5.561547479484173, + "grad_norm": 0.34375, + "learning_rate": 3.078096184832158e-05, + "loss": 0.4486, + "step": 4744 + }, + { + "epoch": 5.566236811254396, + "grad_norm": 0.34765625, + "learning_rate": 3.058770381660487e-05, + "loss": 0.4335, + "step": 4748 + }, + { + "epoch": 5.570926143024619, + "grad_norm": 0.34765625, + "learning_rate": 3.0394985477776522e-05, + "loss": 0.4712, + "step": 4752 + }, + { + "epoch": 5.575615474794842, + "grad_norm": 0.33203125, + "learning_rate": 3.0202807702841493e-05, + "loss": 0.4249, + "step": 4756 + }, + { + "epoch": 5.5803048065650644, + "grad_norm": 0.33203125, + "learning_rate": 3.0011171360361815e-05, + "loss": 0.4202, + "step": 4760 + }, + { + "epoch": 5.584994138335287, + "grad_norm": 0.349609375, + "learning_rate": 2.9820077316452417e-05, + "loss": 0.4492, + "step": 4764 + }, + { + "epoch": 5.58968347010551, + "grad_norm": 0.357421875, + "learning_rate": 2.962952643477718e-05, + "loss": 0.4444, + "step": 4768 + }, + { + "epoch": 5.594372801875733, + "grad_norm": 0.341796875, + "learning_rate": 2.9439519576545302e-05, + "loss": 0.4112, + "step": 4772 + }, + { + "epoch": 5.599062133645956, + "grad_norm": 0.353515625, + "learning_rate": 2.925005760050704e-05, + "loss": 0.4111, + "step": 4776 + }, + { + "epoch": 5.603751465416178, + "grad_norm": 0.37109375, + "learning_rate": 2.906114136295018e-05, + "loss": 0.423, + "step": 4780 + }, + { + "epoch": 5.608440797186401, + "grad_norm": 0.341796875, + "learning_rate": 2.8872771717695858e-05, + "loss": 0.4275, + "step": 4784 + }, + { + "epoch": 5.613130128956624, + "grad_norm": 0.35546875, + "learning_rate": 2.8684949516094947e-05, + "loss": 0.4399, + "step": 4788 + }, + { + "epoch": 5.617819460726847, + "grad_norm": 0.357421875, + "learning_rate": 2.8497675607024046e-05, + "loss": 0.4434, + "step": 4792 + }, + { + "epoch": 5.622508792497069, + "grad_norm": 0.361328125, + "learning_rate": 2.831095083688169e-05, + "loss": 0.4714, + "step": 4796 + }, + { + "epoch": 5.6271981242672915, + "grad_norm": 0.365234375, + "learning_rate": 2.812477604958465e-05, + "loss": 0.4226, + "step": 4800 + }, + { + "epoch": 5.631887456037514, + "grad_norm": 0.37890625, + "learning_rate": 2.793915208656387e-05, + "loss": 0.4441, + "step": 4804 + }, + { + "epoch": 5.636576787807737, + "grad_norm": 0.353515625, + "learning_rate": 2.775407978676093e-05, + "loss": 0.441, + "step": 4808 + }, + { + "epoch": 5.64126611957796, + "grad_norm": 0.341796875, + "learning_rate": 2.7569559986624023e-05, + "loss": 0.425, + "step": 4812 + }, + { + "epoch": 5.645955451348183, + "grad_norm": 0.33984375, + "learning_rate": 2.7385593520104276e-05, + "loss": 0.4097, + "step": 4816 + }, + { + "epoch": 5.6506447831184055, + "grad_norm": 0.34765625, + "learning_rate": 2.7202181218652113e-05, + "loss": 0.4147, + "step": 4820 + }, + { + "epoch": 5.655334114888628, + "grad_norm": 0.333984375, + "learning_rate": 2.701932391121323e-05, + "loss": 0.3897, + "step": 4824 + }, + { + "epoch": 5.660023446658851, + "grad_norm": 0.359375, + "learning_rate": 2.6837022424225048e-05, + "loss": 0.4353, + "step": 4828 + }, + { + "epoch": 5.664712778429074, + "grad_norm": 0.388671875, + "learning_rate": 2.6655277581612838e-05, + "loss": 0.4483, + "step": 4832 + }, + { + "epoch": 5.669402110199297, + "grad_norm": 0.37109375, + "learning_rate": 2.647409020478623e-05, + "loss": 0.3887, + "step": 4836 + }, + { + "epoch": 5.674091441969519, + "grad_norm": 0.34765625, + "learning_rate": 2.629346111263521e-05, + "loss": 0.4746, + "step": 4840 + }, + { + "epoch": 5.678780773739742, + "grad_norm": 0.345703125, + "learning_rate": 2.6113391121526573e-05, + "loss": 0.4224, + "step": 4844 + }, + { + "epoch": 5.683470105509965, + "grad_norm": 0.35546875, + "learning_rate": 2.593388104530031e-05, + "loss": 0.4432, + "step": 4848 + }, + { + "epoch": 5.688159437280188, + "grad_norm": 0.373046875, + "learning_rate": 2.5754931695265674e-05, + "loss": 0.4395, + "step": 4852 + }, + { + "epoch": 5.692848769050411, + "grad_norm": 0.369140625, + "learning_rate": 2.5576543880197847e-05, + "loss": 0.4717, + "step": 4856 + }, + { + "epoch": 5.697538100820633, + "grad_norm": 0.3515625, + "learning_rate": 2.539871840633399e-05, + "loss": 0.4418, + "step": 4860 + }, + { + "epoch": 5.702227432590856, + "grad_norm": 0.328125, + "learning_rate": 2.522145607736976e-05, + "loss": 0.4035, + "step": 4864 + }, + { + "epoch": 5.706916764361079, + "grad_norm": 0.34375, + "learning_rate": 2.5044757694455642e-05, + "loss": 0.4121, + "step": 4868 + }, + { + "epoch": 5.711606096131302, + "grad_norm": 0.357421875, + "learning_rate": 2.4868624056193264e-05, + "loss": 0.4117, + "step": 4872 + }, + { + "epoch": 5.716295427901524, + "grad_norm": 0.365234375, + "learning_rate": 2.469305595863199e-05, + "loss": 0.4515, + "step": 4876 + }, + { + "epoch": 5.7209847596717465, + "grad_norm": 0.359375, + "learning_rate": 2.4518054195265024e-05, + "loss": 0.4104, + "step": 4880 + }, + { + "epoch": 5.725674091441969, + "grad_norm": 0.369140625, + "learning_rate": 2.4343619557026102e-05, + "loss": 0.4226, + "step": 4884 + }, + { + "epoch": 5.730363423212192, + "grad_norm": 0.341796875, + "learning_rate": 2.4169752832285723e-05, + "loss": 0.4267, + "step": 4888 + }, + { + "epoch": 5.735052754982415, + "grad_norm": 0.375, + "learning_rate": 2.3996454806847624e-05, + "loss": 0.4508, + "step": 4892 + }, + { + "epoch": 5.739742086752638, + "grad_norm": 0.33984375, + "learning_rate": 2.3823726263945442e-05, + "loss": 0.4046, + "step": 4896 + }, + { + "epoch": 5.74443141852286, + "grad_norm": 0.337890625, + "learning_rate": 2.3651567984238707e-05, + "loss": 0.4372, + "step": 4900 + }, + { + "epoch": 5.749120750293083, + "grad_norm": 0.345703125, + "learning_rate": 2.3479980745809885e-05, + "loss": 0.4356, + "step": 4904 + }, + { + "epoch": 5.753810082063306, + "grad_norm": 0.33984375, + "learning_rate": 2.3308965324160374e-05, + "loss": 0.4222, + "step": 4908 + }, + { + "epoch": 5.758499413833529, + "grad_norm": 0.3359375, + "learning_rate": 2.313852249220735e-05, + "loss": 0.4257, + "step": 4912 + }, + { + "epoch": 5.763188745603752, + "grad_norm": 0.330078125, + "learning_rate": 2.2968653020280036e-05, + "loss": 0.387, + "step": 4916 + }, + { + "epoch": 5.767878077373974, + "grad_norm": 0.330078125, + "learning_rate": 2.2799357676116287e-05, + "loss": 0.3745, + "step": 4920 + }, + { + "epoch": 5.772567409144197, + "grad_norm": 0.35546875, + "learning_rate": 2.2630637224859283e-05, + "loss": 0.4284, + "step": 4924 + }, + { + "epoch": 5.77725674091442, + "grad_norm": 0.359375, + "learning_rate": 2.246249242905377e-05, + "loss": 0.4014, + "step": 4928 + }, + { + "epoch": 5.781946072684643, + "grad_norm": 0.345703125, + "learning_rate": 2.2294924048642888e-05, + "loss": 0.4091, + "step": 4932 + }, + { + "epoch": 5.786635404454865, + "grad_norm": 0.359375, + "learning_rate": 2.212793284096458e-05, + "loss": 0.4428, + "step": 4936 + }, + { + "epoch": 5.7913247362250875, + "grad_norm": 0.34765625, + "learning_rate": 2.196151956074821e-05, + "loss": 0.4281, + "step": 4940 + }, + { + "epoch": 5.79601406799531, + "grad_norm": 0.3359375, + "learning_rate": 2.179568496011116e-05, + "loss": 0.4555, + "step": 4944 + }, + { + "epoch": 5.800703399765533, + "grad_norm": 0.384765625, + "learning_rate": 2.1630429788555376e-05, + "loss": 0.4294, + "step": 4948 + }, + { + "epoch": 5.805392731535756, + "grad_norm": 0.349609375, + "learning_rate": 2.146575479296418e-05, + "loss": 0.3904, + "step": 4952 + }, + { + "epoch": 5.810082063305979, + "grad_norm": 0.33984375, + "learning_rate": 2.1301660717598575e-05, + "loss": 0.417, + "step": 4956 + }, + { + "epoch": 5.814771395076201, + "grad_norm": 0.380859375, + "learning_rate": 2.1138148304094177e-05, + "loss": 0.4715, + "step": 4960 + }, + { + "epoch": 5.819460726846424, + "grad_norm": 0.357421875, + "learning_rate": 2.0975218291457645e-05, + "loss": 0.403, + "step": 4964 + }, + { + "epoch": 5.824150058616647, + "grad_norm": 0.3515625, + "learning_rate": 2.0812871416063477e-05, + "loss": 0.399, + "step": 4968 + }, + { + "epoch": 5.82883939038687, + "grad_norm": 0.361328125, + "learning_rate": 2.0651108411650685e-05, + "loss": 0.4323, + "step": 4972 + }, + { + "epoch": 5.833528722157093, + "grad_norm": 0.359375, + "learning_rate": 2.0489930009319287e-05, + "loss": 0.4407, + "step": 4976 + }, + { + "epoch": 5.838218053927315, + "grad_norm": 0.353515625, + "learning_rate": 2.0329336937527312e-05, + "loss": 0.4169, + "step": 4980 + }, + { + "epoch": 5.842907385697538, + "grad_norm": 0.384765625, + "learning_rate": 2.0169329922087218e-05, + "loss": 0.4336, + "step": 4984 + }, + { + "epoch": 5.847596717467761, + "grad_norm": 0.34765625, + "learning_rate": 2.000990968616287e-05, + "loss": 0.4445, + "step": 4988 + }, + { + "epoch": 5.852286049237984, + "grad_norm": 0.34765625, + "learning_rate": 1.985107695026601e-05, + "loss": 0.4177, + "step": 4992 + }, + { + "epoch": 5.856975381008207, + "grad_norm": 0.349609375, + "learning_rate": 1.9692832432253154e-05, + "loss": 0.4066, + "step": 4996 + }, + { + "epoch": 5.861664712778429, + "grad_norm": 0.337890625, + "learning_rate": 1.9535176847322416e-05, + "loss": 0.3923, + "step": 5000 + }, + { + "epoch": 5.866354044548652, + "grad_norm": 0.36328125, + "learning_rate": 1.937811090801004e-05, + "loss": 0.3981, + "step": 5004 + }, + { + "epoch": 5.871043376318875, + "grad_norm": 0.349609375, + "learning_rate": 1.9221635324187513e-05, + "loss": 0.4235, + "step": 5008 + }, + { + "epoch": 5.875732708089098, + "grad_norm": 0.373046875, + "learning_rate": 1.9065750803057907e-05, + "loss": 0.4204, + "step": 5012 + }, + { + "epoch": 5.88042203985932, + "grad_norm": 0.3671875, + "learning_rate": 1.8910458049153173e-05, + "loss": 0.4582, + "step": 5016 + }, + { + "epoch": 5.8851113716295425, + "grad_norm": 0.359375, + "learning_rate": 1.8755757764330632e-05, + "loss": 0.4433, + "step": 5020 + }, + { + "epoch": 5.889800703399765, + "grad_norm": 0.3359375, + "learning_rate": 1.860165064776985e-05, + "loss": 0.4477, + "step": 5024 + }, + { + "epoch": 5.894490035169988, + "grad_norm": 0.373046875, + "learning_rate": 1.8448137395969636e-05, + "loss": 0.3817, + "step": 5028 + }, + { + "epoch": 5.899179366940211, + "grad_norm": 0.3515625, + "learning_rate": 1.8295218702744662e-05, + "loss": 0.4144, + "step": 5032 + }, + { + "epoch": 5.903868698710434, + "grad_norm": 0.359375, + "learning_rate": 1.8142895259222584e-05, + "loss": 0.4391, + "step": 5036 + }, + { + "epoch": 5.908558030480656, + "grad_norm": 0.361328125, + "learning_rate": 1.7991167753840673e-05, + "loss": 0.4012, + "step": 5040 + }, + { + "epoch": 5.913247362250879, + "grad_norm": 0.369140625, + "learning_rate": 1.784003687234281e-05, + "loss": 0.4313, + "step": 5044 + }, + { + "epoch": 5.917936694021102, + "grad_norm": 0.349609375, + "learning_rate": 1.7689503297776464e-05, + "loss": 0.4179, + "step": 5048 + }, + { + "epoch": 5.922626025791325, + "grad_norm": 0.376953125, + "learning_rate": 1.753956771048946e-05, + "loss": 0.4241, + "step": 5052 + }, + { + "epoch": 5.927315357561548, + "grad_norm": 0.369140625, + "learning_rate": 1.7390230788127024e-05, + "loss": 0.4458, + "step": 5056 + }, + { + "epoch": 5.93200468933177, + "grad_norm": 0.357421875, + "learning_rate": 1.7241493205628644e-05, + "loss": 0.3708, + "step": 5060 + }, + { + "epoch": 5.936694021101993, + "grad_norm": 0.359375, + "learning_rate": 1.709335563522507e-05, + "loss": 0.4046, + "step": 5064 + }, + { + "epoch": 5.941383352872216, + "grad_norm": 0.333984375, + "learning_rate": 1.6945818746435248e-05, + "loss": 0.4018, + "step": 5068 + }, + { + "epoch": 5.946072684642439, + "grad_norm": 0.349609375, + "learning_rate": 1.6798883206063217e-05, + "loss": 0.437, + "step": 5072 + }, + { + "epoch": 5.950762016412662, + "grad_norm": 0.34765625, + "learning_rate": 1.665254967819532e-05, + "loss": 0.4529, + "step": 5076 + }, + { + "epoch": 5.9554513481828835, + "grad_norm": 0.34765625, + "learning_rate": 1.6506818824196965e-05, + "loss": 0.4405, + "step": 5080 + }, + { + "epoch": 5.960140679953106, + "grad_norm": 0.3515625, + "learning_rate": 1.636169130270973e-05, + "loss": 0.4241, + "step": 5084 + }, + { + "epoch": 5.964830011723329, + "grad_norm": 0.357421875, + "learning_rate": 1.6217167769648398e-05, + "loss": 0.4551, + "step": 5088 + }, + { + "epoch": 5.969519343493552, + "grad_norm": 0.32421875, + "learning_rate": 1.6073248878198032e-05, + "loss": 0.427, + "step": 5092 + }, + { + "epoch": 5.974208675263775, + "grad_norm": 0.359375, + "learning_rate": 1.5929935278810883e-05, + "loss": 0.3894, + "step": 5096 + }, + { + "epoch": 5.978898007033997, + "grad_norm": 0.341796875, + "learning_rate": 1.578722761920359e-05, + "loss": 0.434, + "step": 5100 + }, + { + "epoch": 5.98358733880422, + "grad_norm": 0.35546875, + "learning_rate": 1.5645126544354253e-05, + "loss": 0.4446, + "step": 5104 + }, + { + "epoch": 5.988276670574443, + "grad_norm": 0.33203125, + "learning_rate": 1.550363269649932e-05, + "loss": 0.4205, + "step": 5108 + }, + { + "epoch": 5.992966002344666, + "grad_norm": 0.34375, + "learning_rate": 1.536274671513098e-05, + "loss": 0.4032, + "step": 5112 + }, + { + "epoch": 5.997655334114889, + "grad_norm": 0.359375, + "learning_rate": 1.5222469236994061e-05, + "loss": 0.4029, + "step": 5116 + }, + { + "epoch": 6.002344665885111, + "grad_norm": 0.361328125, + "learning_rate": 1.5082800896083186e-05, + "loss": 0.3855, + "step": 5120 + }, + { + "epoch": 6.007033997655334, + "grad_norm": 0.32421875, + "learning_rate": 1.4943742323639995e-05, + "loss": 0.4209, + "step": 5124 + }, + { + "epoch": 6.011723329425557, + "grad_norm": 0.353515625, + "learning_rate": 1.4805294148150171e-05, + "loss": 0.4114, + "step": 5128 + }, + { + "epoch": 6.01641266119578, + "grad_norm": 0.34375, + "learning_rate": 1.4667456995340731e-05, + "loss": 0.4657, + "step": 5132 + }, + { + "epoch": 6.021101992966003, + "grad_norm": 0.341796875, + "learning_rate": 1.4530231488177058e-05, + "loss": 0.4312, + "step": 5136 + }, + { + "epoch": 6.025791324736225, + "grad_norm": 0.359375, + "learning_rate": 1.4393618246860239e-05, + "loss": 0.43, + "step": 5140 + }, + { + "epoch": 6.030480656506448, + "grad_norm": 0.337890625, + "learning_rate": 1.4257617888824096e-05, + "loss": 0.4133, + "step": 5144 + }, + { + "epoch": 6.035169988276671, + "grad_norm": 0.33984375, + "learning_rate": 1.4122231028732516e-05, + "loss": 0.4065, + "step": 5148 + }, + { + "epoch": 6.039859320046894, + "grad_norm": 0.3359375, + "learning_rate": 1.398745827847667e-05, + "loss": 0.4284, + "step": 5152 + }, + { + "epoch": 6.044548651817116, + "grad_norm": 0.34375, + "learning_rate": 1.385330024717215e-05, + "loss": 0.3909, + "step": 5156 + }, + { + "epoch": 6.049237983587338, + "grad_norm": 0.3359375, + "learning_rate": 1.3719757541156317e-05, + "loss": 0.3842, + "step": 5160 + }, + { + "epoch": 6.053927315357561, + "grad_norm": 0.32421875, + "learning_rate": 1.3586830763985479e-05, + "loss": 0.3678, + "step": 5164 + }, + { + "epoch": 6.058616647127784, + "grad_norm": 0.349609375, + "learning_rate": 1.3454520516432282e-05, + "loss": 0.4318, + "step": 5168 + }, + { + "epoch": 6.063305978898007, + "grad_norm": 0.330078125, + "learning_rate": 1.3322827396482888e-05, + "loss": 0.4121, + "step": 5172 + }, + { + "epoch": 6.06799531066823, + "grad_norm": 0.359375, + "learning_rate": 1.3191751999334237e-05, + "loss": 0.4247, + "step": 5176 + }, + { + "epoch": 6.072684642438452, + "grad_norm": 0.337890625, + "learning_rate": 1.3061294917391558e-05, + "loss": 0.375, + "step": 5180 + }, + { + "epoch": 6.077373974208675, + "grad_norm": 0.34765625, + "learning_rate": 1.2931456740265406e-05, + "loss": 0.4611, + "step": 5184 + }, + { + "epoch": 6.082063305978898, + "grad_norm": 0.361328125, + "learning_rate": 1.2802238054769298e-05, + "loss": 0.4174, + "step": 5188 + }, + { + "epoch": 6.086752637749121, + "grad_norm": 0.349609375, + "learning_rate": 1.2673639444916805e-05, + "loss": 0.4337, + "step": 5192 + }, + { + "epoch": 6.091441969519344, + "grad_norm": 0.357421875, + "learning_rate": 1.2545661491919057e-05, + "loss": 0.3709, + "step": 5196 + }, + { + "epoch": 6.096131301289566, + "grad_norm": 0.341796875, + "learning_rate": 1.2418304774182075e-05, + "loss": 0.452, + "step": 5200 + }, + { + "epoch": 6.100820633059789, + "grad_norm": 0.345703125, + "learning_rate": 1.2291569867304112e-05, + "loss": 0.435, + "step": 5204 + }, + { + "epoch": 6.105509964830012, + "grad_norm": 0.34765625, + "learning_rate": 1.2165457344073238e-05, + "loss": 0.4482, + "step": 5208 + }, + { + "epoch": 6.110199296600235, + "grad_norm": 0.345703125, + "learning_rate": 1.2039967774464448e-05, + "loss": 0.3896, + "step": 5212 + }, + { + "epoch": 6.1148886283704575, + "grad_norm": 0.33984375, + "learning_rate": 1.1915101725637383e-05, + "loss": 0.4166, + "step": 5216 + }, + { + "epoch": 6.11957796014068, + "grad_norm": 0.341796875, + "learning_rate": 1.1790859761933563e-05, + "loss": 0.3914, + "step": 5220 + }, + { + "epoch": 6.124267291910903, + "grad_norm": 0.357421875, + "learning_rate": 1.166724244487387e-05, + "loss": 0.4055, + "step": 5224 + }, + { + "epoch": 6.128956623681125, + "grad_norm": 0.3359375, + "learning_rate": 1.1544250333156207e-05, + "loss": 0.409, + "step": 5228 + }, + { + "epoch": 6.133645955451348, + "grad_norm": 0.314453125, + "learning_rate": 1.142188398265259e-05, + "loss": 0.3903, + "step": 5232 + }, + { + "epoch": 6.138335287221571, + "grad_norm": 0.34375, + "learning_rate": 1.1300143946407064e-05, + "loss": 0.397, + "step": 5236 + }, + { + "epoch": 6.143024618991793, + "grad_norm": 0.326171875, + "learning_rate": 1.1179030774632851e-05, + "loss": 0.3783, + "step": 5240 + }, + { + "epoch": 6.147713950762016, + "grad_norm": 0.353515625, + "learning_rate": 1.1058545014710146e-05, + "loss": 0.4203, + "step": 5244 + }, + { + "epoch": 6.152403282532239, + "grad_norm": 0.3515625, + "learning_rate": 1.093868721118339e-05, + "loss": 0.433, + "step": 5248 + }, + { + "epoch": 6.157092614302462, + "grad_norm": 0.337890625, + "learning_rate": 1.0819457905758978e-05, + "loss": 0.4351, + "step": 5252 + }, + { + "epoch": 6.161781946072685, + "grad_norm": 0.33984375, + "learning_rate": 1.0700857637302779e-05, + "loss": 0.4498, + "step": 5256 + }, + { + "epoch": 6.166471277842907, + "grad_norm": 0.3515625, + "learning_rate": 1.058288694183762e-05, + "loss": 0.4151, + "step": 5260 + }, + { + "epoch": 6.17116060961313, + "grad_norm": 0.333984375, + "learning_rate": 1.0465546352541055e-05, + "loss": 0.4446, + "step": 5264 + }, + { + "epoch": 6.175849941383353, + "grad_norm": 0.345703125, + "learning_rate": 1.034883639974261e-05, + "loss": 0.4044, + "step": 5268 + }, + { + "epoch": 6.180539273153576, + "grad_norm": 0.341796875, + "learning_rate": 1.0232757610921833e-05, + "loss": 0.3634, + "step": 5272 + }, + { + "epoch": 6.185228604923799, + "grad_norm": 0.34765625, + "learning_rate": 1.0117310510705528e-05, + "loss": 0.4032, + "step": 5276 + }, + { + "epoch": 6.189917936694021, + "grad_norm": 0.341796875, + "learning_rate": 1.0002495620865558e-05, + "loss": 0.4418, + "step": 5280 + }, + { + "epoch": 6.194607268464244, + "grad_norm": 0.349609375, + "learning_rate": 9.888313460316549e-06, + "loss": 0.4224, + "step": 5284 + }, + { + "epoch": 6.199296600234467, + "grad_norm": 0.373046875, + "learning_rate": 9.77476454511335e-06, + "loss": 0.4139, + "step": 5288 + }, + { + "epoch": 6.20398593200469, + "grad_norm": 0.3515625, + "learning_rate": 9.661849388448866e-06, + "loss": 0.3725, + "step": 5292 + }, + { + "epoch": 6.2086752637749125, + "grad_norm": 0.34375, + "learning_rate": 9.549568500651695e-06, + "loss": 0.4023, + "step": 5296 + }, + { + "epoch": 6.213364595545134, + "grad_norm": 0.337890625, + "learning_rate": 9.437922389183772e-06, + "loss": 0.3973, + "step": 5300 + }, + { + "epoch": 6.218053927315357, + "grad_norm": 0.3515625, + "learning_rate": 9.3269115586381e-06, + "loss": 0.4492, + "step": 5304 + }, + { + "epoch": 6.22274325908558, + "grad_norm": 0.349609375, + "learning_rate": 9.216536510736528e-06, + "loss": 0.3933, + "step": 5308 + }, + { + "epoch": 6.227432590855803, + "grad_norm": 0.365234375, + "learning_rate": 9.106797744327449e-06, + "loss": 0.4262, + "step": 5312 + }, + { + "epoch": 6.232121922626026, + "grad_norm": 0.359375, + "learning_rate": 8.997695755383444e-06, + "loss": 0.386, + "step": 5316 + }, + { + "epoch": 6.236811254396248, + "grad_norm": 0.333984375, + "learning_rate": 8.889231036999245e-06, + "loss": 0.4026, + "step": 5320 + }, + { + "epoch": 6.241500586166471, + "grad_norm": 0.345703125, + "learning_rate": 8.781404079389304e-06, + "loss": 0.4596, + "step": 5324 + }, + { + "epoch": 6.246189917936694, + "grad_norm": 0.326171875, + "learning_rate": 8.674215369885695e-06, + "loss": 0.4081, + "step": 5328 + }, + { + "epoch": 6.250879249706917, + "grad_norm": 0.369140625, + "learning_rate": 8.567665392935918e-06, + "loss": 0.4246, + "step": 5332 + }, + { + "epoch": 6.25556858147714, + "grad_norm": 0.384765625, + "learning_rate": 8.461754630100581e-06, + "loss": 0.4221, + "step": 5336 + }, + { + "epoch": 6.260257913247362, + "grad_norm": 0.375, + "learning_rate": 8.356483560051468e-06, + "loss": 0.4704, + "step": 5340 + }, + { + "epoch": 6.264947245017585, + "grad_norm": 0.349609375, + "learning_rate": 8.251852658569014e-06, + "loss": 0.3896, + "step": 5344 + }, + { + "epoch": 6.269636576787808, + "grad_norm": 0.333984375, + "learning_rate": 8.147862398540545e-06, + "loss": 0.4065, + "step": 5348 + }, + { + "epoch": 6.274325908558031, + "grad_norm": 0.34765625, + "learning_rate": 8.044513249957874e-06, + "loss": 0.4074, + "step": 5352 + }, + { + "epoch": 6.2790152403282535, + "grad_norm": 0.337890625, + "learning_rate": 7.94180567991528e-06, + "loss": 0.4237, + "step": 5356 + }, + { + "epoch": 6.283704572098476, + "grad_norm": 0.349609375, + "learning_rate": 7.839740152607398e-06, + "loss": 0.4043, + "step": 5360 + }, + { + "epoch": 6.288393903868699, + "grad_norm": 0.33984375, + "learning_rate": 7.738317129327049e-06, + "loss": 0.4098, + "step": 5364 + }, + { + "epoch": 6.293083235638921, + "grad_norm": 0.32421875, + "learning_rate": 7.63753706846329e-06, + "loss": 0.4149, + "step": 5368 + }, + { + "epoch": 6.297772567409144, + "grad_norm": 0.357421875, + "learning_rate": 7.537400425499191e-06, + "loss": 0.3909, + "step": 5372 + }, + { + "epoch": 6.302461899179367, + "grad_norm": 0.337890625, + "learning_rate": 7.437907653009878e-06, + "loss": 0.3893, + "step": 5376 + }, + { + "epoch": 6.307151230949589, + "grad_norm": 0.359375, + "learning_rate": 7.339059200660441e-06, + "loss": 0.4365, + "step": 5380 + }, + { + "epoch": 6.311840562719812, + "grad_norm": 0.37109375, + "learning_rate": 7.240855515203897e-06, + "loss": 0.4586, + "step": 5384 + }, + { + "epoch": 6.316529894490035, + "grad_norm": 0.35546875, + "learning_rate": 7.143297040479262e-06, + "loss": 0.4519, + "step": 5388 + }, + { + "epoch": 6.321219226260258, + "grad_norm": 0.337890625, + "learning_rate": 7.046384217409401e-06, + "loss": 0.3872, + "step": 5392 + }, + { + "epoch": 6.325908558030481, + "grad_norm": 0.34765625, + "learning_rate": 6.950117483999145e-06, + "loss": 0.4004, + "step": 5396 + }, + { + "epoch": 6.330597889800703, + "grad_norm": 0.32421875, + "learning_rate": 6.854497275333282e-06, + "loss": 0.3972, + "step": 5400 + }, + { + "epoch": 6.335287221570926, + "grad_norm": 0.32421875, + "learning_rate": 6.759524023574514e-06, + "loss": 0.367, + "step": 5404 + }, + { + "epoch": 6.339976553341149, + "grad_norm": 0.349609375, + "learning_rate": 6.6651981579616705e-06, + "loss": 0.4688, + "step": 5408 + }, + { + "epoch": 6.344665885111372, + "grad_norm": 0.337890625, + "learning_rate": 6.57152010480762e-06, + "loss": 0.4207, + "step": 5412 + }, + { + "epoch": 6.3493552168815945, + "grad_norm": 0.337890625, + "learning_rate": 6.4784902874973734e-06, + "loss": 0.413, + "step": 5416 + }, + { + "epoch": 6.354044548651817, + "grad_norm": 0.349609375, + "learning_rate": 6.3861091264861995e-06, + "loss": 0.448, + "step": 5420 + }, + { + "epoch": 6.35873388042204, + "grad_norm": 0.35546875, + "learning_rate": 6.2943770392977826e-06, + "loss": 0.4018, + "step": 5424 + }, + { + "epoch": 6.363423212192263, + "grad_norm": 0.359375, + "learning_rate": 6.203294440522183e-06, + "loss": 0.416, + "step": 5428 + }, + { + "epoch": 6.368112543962486, + "grad_norm": 0.3359375, + "learning_rate": 6.112861741814063e-06, + "loss": 0.4392, + "step": 5432 + }, + { + "epoch": 6.3728018757327085, + "grad_norm": 0.373046875, + "learning_rate": 6.023079351890881e-06, + "loss": 0.4195, + "step": 5436 + }, + { + "epoch": 6.377491207502931, + "grad_norm": 0.333984375, + "learning_rate": 5.933947676530881e-06, + "loss": 0.4029, + "step": 5440 + }, + { + "epoch": 6.382180539273153, + "grad_norm": 0.357421875, + "learning_rate": 5.845467118571445e-06, + "loss": 0.4476, + "step": 5444 + }, + { + "epoch": 6.386869871043376, + "grad_norm": 0.359375, + "learning_rate": 5.757638077907123e-06, + "loss": 0.4326, + "step": 5448 + }, + { + "epoch": 6.391559202813599, + "grad_norm": 0.34375, + "learning_rate": 5.670460951487854e-06, + "loss": 0.4126, + "step": 5452 + }, + { + "epoch": 6.396248534583822, + "grad_norm": 0.34765625, + "learning_rate": 5.583936133317285e-06, + "loss": 0.4201, + "step": 5456 + }, + { + "epoch": 6.400937866354044, + "grad_norm": 0.353515625, + "learning_rate": 5.498064014450837e-06, + "loss": 0.4105, + "step": 5460 + }, + { + "epoch": 6.405627198124267, + "grad_norm": 0.369140625, + "learning_rate": 5.4128449829940745e-06, + "loss": 0.4393, + "step": 5464 + }, + { + "epoch": 6.41031652989449, + "grad_norm": 0.36328125, + "learning_rate": 5.3282794241007895e-06, + "loss": 0.3989, + "step": 5468 + }, + { + "epoch": 6.415005861664713, + "grad_norm": 0.345703125, + "learning_rate": 5.244367719971454e-06, + "loss": 0.409, + "step": 5472 + }, + { + "epoch": 6.4196951934349356, + "grad_norm": 0.357421875, + "learning_rate": 5.161110249851353e-06, + "loss": 0.4121, + "step": 5476 + }, + { + "epoch": 6.424384525205158, + "grad_norm": 0.34375, + "learning_rate": 5.078507390028852e-06, + "loss": 0.4181, + "step": 5480 + }, + { + "epoch": 6.429073856975381, + "grad_norm": 0.349609375, + "learning_rate": 4.996559513833903e-06, + "loss": 0.4494, + "step": 5484 + }, + { + "epoch": 6.433763188745604, + "grad_norm": 0.35546875, + "learning_rate": 4.915266991636025e-06, + "loss": 0.4472, + "step": 5488 + }, + { + "epoch": 6.438452520515827, + "grad_norm": 0.345703125, + "learning_rate": 4.83463019084297e-06, + "loss": 0.3834, + "step": 5492 + }, + { + "epoch": 6.4431418522860495, + "grad_norm": 0.3359375, + "learning_rate": 4.754649475898814e-06, + "loss": 0.3881, + "step": 5496 + }, + { + "epoch": 6.447831184056272, + "grad_norm": 0.3828125, + "learning_rate": 4.675325208282471e-06, + "loss": 0.4295, + "step": 5500 } ], "logging_steps": 4, - "max_steps": 3661, + "max_steps": 5971, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, @@ -6440,7 +9651,7 @@ "attributes": {} } }, - "total_flos": 2.3920283694378516e+18, + "total_flos": 3.5958144579529605e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null