diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35443 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 20239, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.7912962436676025, + "learning_rate": 4.9990118088838384e-05, + "loss": 10.3461, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 3.3178839683532715, + "learning_rate": 4.9980236177676767e-05, + "loss": 9.7253, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 3.043405055999756, + "learning_rate": 4.997035426651515e-05, + "loss": 9.476, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.0094900131225586, + "learning_rate": 4.9960472355353524e-05, + "loss": 9.2119, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 2.7685112953186035, + "learning_rate": 4.9950590444191906e-05, + "loss": 9.0744, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.816589832305908, + "learning_rate": 4.994070853303029e-05, + "loss": 8.8707, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 3.2649807929992676, + "learning_rate": 4.993082662186867e-05, + "loss": 8.677, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.543574571609497, + "learning_rate": 4.992094471070705e-05, + "loss": 8.5568, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.669053316116333, + "learning_rate": 4.9911062799545434e-05, + "loss": 8.2874, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.4249765872955322, + "learning_rate": 4.9901180888383816e-05, + "loss": 8.2541, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.1649515628814697, + "learning_rate": 4.98912989772222e-05, + "loss": 8.0488, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.1681978702545166, + "learning_rate": 4.9881417066060574e-05, + "loss": 7.9646, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.2630350589752197, + "learning_rate": 4.9871535154898956e-05, + "loss": 7.7305, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.286691904067993, + "learning_rate": 4.986165324373734e-05, + "loss": 7.5245, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 1.9876737594604492, + "learning_rate": 4.9851771332575727e-05, + "loss": 7.4304, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.7167022228240967, + "learning_rate": 4.984188942141411e-05, + "loss": 7.4031, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 1.8193055391311646, + "learning_rate": 4.983200751025249e-05, + "loss": 7.3022, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 1.543209195137024, + "learning_rate": 4.9822125599090866e-05, + "loss": 7.2365, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 1.75989830493927, + "learning_rate": 4.981224368792925e-05, + "loss": 7.0086, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 1.8119884729385376, + "learning_rate": 4.980236177676763e-05, + "loss": 7.1398, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 1.4095029830932617, + "learning_rate": 4.979247986560601e-05, + "loss": 6.9534, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 1.6824373006820679, + "learning_rate": 4.9782597954444394e-05, + "loss": 7.0708, + "step": 88 + }, + { + "epoch": 0.0, + "grad_norm": 1.6116963624954224, + "learning_rate": 4.9772716043282776e-05, + "loss": 6.957, + "step": 92 + }, + { + "epoch": 0.0, + "grad_norm": 1.2887672185897827, + "learning_rate": 4.976283413212116e-05, + "loss": 6.8803, + "step": 96 + }, + { + "epoch": 0.0, + "grad_norm": 1.2640583515167236, + "learning_rate": 4.9752952220959534e-05, + "loss": 6.6778, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.3888378143310547, + "learning_rate": 4.9743070309797916e-05, + "loss": 6.7205, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.4042987823486328, + "learning_rate": 4.97331883986363e-05, + "loss": 6.7003, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.531121015548706, + "learning_rate": 4.972330648747468e-05, + "loss": 6.7113, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.5138936042785645, + "learning_rate": 4.971342457631306e-05, + "loss": 6.6541, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.5610084533691406, + "learning_rate": 4.9703542665151444e-05, + "loss": 6.6966, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.3988789319992065, + "learning_rate": 4.9693660753989826e-05, + "loss": 6.5739, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.511693000793457, + "learning_rate": 4.968377884282821e-05, + "loss": 6.6209, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.4789738655090332, + "learning_rate": 4.967389693166658e-05, + "loss": 6.497, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.312217354774475, + "learning_rate": 4.9664015020504965e-05, + "loss": 6.4966, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.2716199159622192, + "learning_rate": 4.965413310934335e-05, + "loss": 6.6663, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.5896294116973877, + "learning_rate": 4.964425119818173e-05, + "loss": 6.7358, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 3.169236660003662, + "learning_rate": 4.963436928702011e-05, + "loss": 6.7237, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 2.6182782649993896, + "learning_rate": 4.9624487375858494e-05, + "loss": 6.4832, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.5328868627548218, + "learning_rate": 4.9614605464696876e-05, + "loss": 6.5984, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.2593821287155151, + "learning_rate": 4.960472355353525e-05, + "loss": 6.5301, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.7900025844573975, + "learning_rate": 4.959484164237363e-05, + "loss": 6.555, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 2.1635518074035645, + "learning_rate": 4.9584959731212015e-05, + "loss": 6.5686, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.4018573760986328, + "learning_rate": 4.95750778200504e-05, + "loss": 6.4757, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.6255518198013306, + "learning_rate": 4.9565195908888786e-05, + "loss": 6.4482, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.5604472160339355, + "learning_rate": 4.955531399772717e-05, + "loss": 6.448, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.4451005458831787, + "learning_rate": 4.954543208656554e-05, + "loss": 6.5368, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 1.330167531967163, + "learning_rate": 4.9535550175403925e-05, + "loss": 6.473, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 1.9696011543273926, + "learning_rate": 4.952566826424231e-05, + "loss": 6.4951, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 1.7933244705200195, + "learning_rate": 4.951578635308069e-05, + "loss": 6.471, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 1.6044100522994995, + "learning_rate": 4.950590444191907e-05, + "loss": 6.4612, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 1.3888564109802246, + "learning_rate": 4.9496022530757454e-05, + "loss": 6.4879, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 1.2492824792861938, + "learning_rate": 4.9486140619595836e-05, + "loss": 6.3364, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 1.8178552389144897, + "learning_rate": 4.947625870843422e-05, + "loss": 6.5152, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 1.363503098487854, + "learning_rate": 4.946637679727259e-05, + "loss": 6.2865, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 1.319766640663147, + "learning_rate": 4.9456494886110975e-05, + "loss": 6.4775, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 1.3917903900146484, + "learning_rate": 4.944661297494936e-05, + "loss": 6.4147, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 2.465641736984253, + "learning_rate": 4.943673106378774e-05, + "loss": 6.2991, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 1.6081475019454956, + "learning_rate": 4.942684915262612e-05, + "loss": 6.442, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 1.3391867876052856, + "learning_rate": 4.94169672414645e-05, + "loss": 6.3932, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 1.4221361875534058, + "learning_rate": 4.9407085330302885e-05, + "loss": 6.5403, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 1.60570228099823, + "learning_rate": 4.939720341914126e-05, + "loss": 6.397, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 1.3305834531784058, + "learning_rate": 4.938732150797964e-05, + "loss": 6.4258, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 1.617050290107727, + "learning_rate": 4.9377439596818025e-05, + "loss": 6.3044, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 1.6071454286575317, + "learning_rate": 4.936755768565641e-05, + "loss": 6.3836, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 2.0243899822235107, + "learning_rate": 4.935767577449479e-05, + "loss": 6.4785, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 1.4023394584655762, + "learning_rate": 4.934779386333317e-05, + "loss": 6.4129, + "step": 264 + }, + { + "epoch": 0.01, + "grad_norm": 1.6561346054077148, + "learning_rate": 4.933791195217155e-05, + "loss": 6.227, + "step": 268 + }, + { + "epoch": 0.01, + "grad_norm": 1.4024381637573242, + "learning_rate": 4.932803004100993e-05, + "loss": 6.3957, + "step": 272 + }, + { + "epoch": 0.01, + "grad_norm": 1.576090693473816, + "learning_rate": 4.931814812984831e-05, + "loss": 6.359, + "step": 276 + }, + { + "epoch": 0.01, + "grad_norm": 1.2421482801437378, + "learning_rate": 4.930826621868669e-05, + "loss": 6.3735, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 1.453856110572815, + "learning_rate": 4.9298384307525074e-05, + "loss": 6.4156, + "step": 284 + }, + { + "epoch": 0.01, + "grad_norm": 1.426669955253601, + "learning_rate": 4.928850239636346e-05, + "loss": 6.3278, + "step": 288 + }, + { + "epoch": 0.01, + "grad_norm": 1.5175261497497559, + "learning_rate": 4.9278620485201845e-05, + "loss": 6.3645, + "step": 292 + }, + { + "epoch": 0.01, + "grad_norm": 1.3980427980422974, + "learning_rate": 4.926873857404023e-05, + "loss": 6.3977, + "step": 296 + }, + { + "epoch": 0.01, + "grad_norm": 1.6867117881774902, + "learning_rate": 4.92588566628786e-05, + "loss": 6.3506, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.5795122385025024, + "learning_rate": 4.9248974751716985e-05, + "loss": 6.3062, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.5149136781692505, + "learning_rate": 4.923909284055537e-05, + "loss": 6.302, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 1.2091716527938843, + "learning_rate": 4.922921092939375e-05, + "loss": 6.2064, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 1.553070306777954, + "learning_rate": 4.921932901823213e-05, + "loss": 6.3236, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 1.7350945472717285, + "learning_rate": 4.920944710707051e-05, + "loss": 6.3381, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 1.2416383028030396, + "learning_rate": 4.9199565195908895e-05, + "loss": 6.4411, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 1.4091325998306274, + "learning_rate": 4.918968328474727e-05, + "loss": 6.4273, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 1.261334776878357, + "learning_rate": 4.917980137358565e-05, + "loss": 6.2724, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 1.6297985315322876, + "learning_rate": 4.9169919462424034e-05, + "loss": 6.2387, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 1.458993673324585, + "learning_rate": 4.9160037551262417e-05, + "loss": 6.3042, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 1.355808138847351, + "learning_rate": 4.91501556401008e-05, + "loss": 6.2788, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 1.2890819311141968, + "learning_rate": 4.914027372893918e-05, + "loss": 6.3692, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 1.5858672857284546, + "learning_rate": 4.913039181777756e-05, + "loss": 6.2324, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 1.5306330919265747, + "learning_rate": 4.912050990661594e-05, + "loss": 6.4247, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 1.386789083480835, + "learning_rate": 4.911062799545432e-05, + "loss": 6.4219, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 1.288030743598938, + "learning_rate": 4.91007460842927e-05, + "loss": 6.2128, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 1.5249669551849365, + "learning_rate": 4.9090864173131084e-05, + "loss": 6.2724, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 1.787862777709961, + "learning_rate": 4.9080982261969466e-05, + "loss": 6.3139, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 1.7556533813476562, + "learning_rate": 4.907110035080785e-05, + "loss": 6.3006, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 1.5250591039657593, + "learning_rate": 4.906121843964623e-05, + "loss": 6.3109, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 1.591663122177124, + "learning_rate": 4.905133652848461e-05, + "loss": 6.2026, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 1.3261005878448486, + "learning_rate": 4.904145461732299e-05, + "loss": 6.2905, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 1.692723035812378, + "learning_rate": 4.903157270616137e-05, + "loss": 6.3296, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 1.2827457189559937, + "learning_rate": 4.902169079499975e-05, + "loss": 6.229, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 1.3109639883041382, + "learning_rate": 4.9011808883838134e-05, + "loss": 6.2625, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 1.7134902477264404, + "learning_rate": 4.900192697267652e-05, + "loss": 6.1219, + "step": 404 + }, + { + "epoch": 0.02, + "grad_norm": 1.3591091632843018, + "learning_rate": 4.8992045061514905e-05, + "loss": 6.2059, + "step": 408 + }, + { + "epoch": 0.02, + "grad_norm": 1.4868909120559692, + "learning_rate": 4.898216315035328e-05, + "loss": 6.2244, + "step": 412 + }, + { + "epoch": 0.02, + "grad_norm": 1.4216508865356445, + "learning_rate": 4.897228123919166e-05, + "loss": 6.2348, + "step": 416 + }, + { + "epoch": 0.02, + "grad_norm": 1.6655827760696411, + "learning_rate": 4.8962399328030044e-05, + "loss": 6.2708, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 1.2668567895889282, + "learning_rate": 4.8952517416868426e-05, + "loss": 6.1847, + "step": 424 + }, + { + "epoch": 0.02, + "grad_norm": 1.4796233177185059, + "learning_rate": 4.894263550570681e-05, + "loss": 6.2613, + "step": 428 + }, + { + "epoch": 0.02, + "grad_norm": 1.6667245626449585, + "learning_rate": 4.893275359454519e-05, + "loss": 6.3676, + "step": 432 + }, + { + "epoch": 0.02, + "grad_norm": 1.780787467956543, + "learning_rate": 4.892287168338357e-05, + "loss": 6.341, + "step": 436 + }, + { + "epoch": 0.02, + "grad_norm": 1.415461540222168, + "learning_rate": 4.891298977222195e-05, + "loss": 6.2441, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 1.4248474836349487, + "learning_rate": 4.890310786106033e-05, + "loss": 6.282, + "step": 444 + }, + { + "epoch": 0.02, + "grad_norm": 2.0168416500091553, + "learning_rate": 4.889322594989871e-05, + "loss": 6.2457, + "step": 448 + }, + { + "epoch": 0.02, + "grad_norm": 1.983055830001831, + "learning_rate": 4.8883344038737094e-05, + "loss": 6.1607, + "step": 452 + }, + { + "epoch": 0.02, + "grad_norm": 1.9059456586837769, + "learning_rate": 4.8873462127575476e-05, + "loss": 6.1443, + "step": 456 + }, + { + "epoch": 0.02, + "grad_norm": 1.5710817575454712, + "learning_rate": 4.886358021641386e-05, + "loss": 6.2631, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 2.3912734985351562, + "learning_rate": 4.885369830525224e-05, + "loss": 6.2484, + "step": 464 + }, + { + "epoch": 0.02, + "grad_norm": 1.5551170110702515, + "learning_rate": 4.884381639409062e-05, + "loss": 6.3059, + "step": 468 + }, + { + "epoch": 0.02, + "grad_norm": 1.5288927555084229, + "learning_rate": 4.8833934482929e-05, + "loss": 6.0518, + "step": 472 + }, + { + "epoch": 0.02, + "grad_norm": 1.6663211584091187, + "learning_rate": 4.882405257176738e-05, + "loss": 6.2273, + "step": 476 + }, + { + "epoch": 0.02, + "grad_norm": 1.5548433065414429, + "learning_rate": 4.881417066060576e-05, + "loss": 6.1407, + "step": 480 + }, + { + "epoch": 0.02, + "grad_norm": 1.4325834512710571, + "learning_rate": 4.8804288749444144e-05, + "loss": 6.1346, + "step": 484 + }, + { + "epoch": 0.02, + "grad_norm": 1.622158408164978, + "learning_rate": 4.8794406838282526e-05, + "loss": 6.1832, + "step": 488 + }, + { + "epoch": 0.02, + "grad_norm": 1.4777097702026367, + "learning_rate": 4.878452492712091e-05, + "loss": 6.2451, + "step": 492 + }, + { + "epoch": 0.02, + "grad_norm": 1.1623598337173462, + "learning_rate": 4.877464301595929e-05, + "loss": 6.1631, + "step": 496 + }, + { + "epoch": 0.02, + "grad_norm": 1.598676323890686, + "learning_rate": 4.8764761104797665e-05, + "loss": 6.2348, + "step": 500 + }, + { + "epoch": 0.02, + "grad_norm": 1.610736608505249, + "learning_rate": 4.875487919363605e-05, + "loss": 6.3664, + "step": 504 + }, + { + "epoch": 0.03, + "grad_norm": 1.4379361867904663, + "learning_rate": 4.874499728247443e-05, + "loss": 6.202, + "step": 508 + }, + { + "epoch": 0.03, + "grad_norm": 1.3208808898925781, + "learning_rate": 4.873511537131281e-05, + "loss": 6.1286, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 1.3087486028671265, + "learning_rate": 4.872523346015119e-05, + "loss": 6.2657, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 1.3428252935409546, + "learning_rate": 4.871535154898958e-05, + "loss": 6.0752, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 1.5278196334838867, + "learning_rate": 4.870546963782796e-05, + "loss": 6.267, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 1.5938080549240112, + "learning_rate": 4.869558772666634e-05, + "loss": 6.177, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 1.2772130966186523, + "learning_rate": 4.868570581550472e-05, + "loss": 6.2219, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 1.4687914848327637, + "learning_rate": 4.8675823904343104e-05, + "loss": 6.2316, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 1.4582056999206543, + "learning_rate": 4.8665941993181486e-05, + "loss": 6.0915, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 1.3734614849090576, + "learning_rate": 4.865606008201987e-05, + "loss": 6.2694, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 1.426127314567566, + "learning_rate": 4.864617817085825e-05, + "loss": 6.1274, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 1.3772287368774414, + "learning_rate": 4.863629625969663e-05, + "loss": 6.2704, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 1.2581090927124023, + "learning_rate": 4.862641434853501e-05, + "loss": 6.1335, + "step": 556 + }, + { + "epoch": 0.03, + "grad_norm": 1.1419745683670044, + "learning_rate": 4.861653243737339e-05, + "loss": 6.1259, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 1.2435179948806763, + "learning_rate": 4.860665052621177e-05, + "loss": 6.2289, + "step": 564 + }, + { + "epoch": 0.03, + "grad_norm": 1.5236430168151855, + "learning_rate": 4.859676861505015e-05, + "loss": 6.0294, + "step": 568 + }, + { + "epoch": 0.03, + "grad_norm": 2.289353370666504, + "learning_rate": 4.8586886703888535e-05, + "loss": 6.1533, + "step": 572 + }, + { + "epoch": 0.03, + "grad_norm": 1.5420417785644531, + "learning_rate": 4.857700479272692e-05, + "loss": 6.155, + "step": 576 + }, + { + "epoch": 0.03, + "grad_norm": 1.8614426851272583, + "learning_rate": 4.85671228815653e-05, + "loss": 6.2381, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 1.405530571937561, + "learning_rate": 4.8557240970403675e-05, + "loss": 6.2005, + "step": 584 + }, + { + "epoch": 0.03, + "grad_norm": 1.4058623313903809, + "learning_rate": 4.854735905924206e-05, + "loss": 6.1903, + "step": 588 + }, + { + "epoch": 0.03, + "grad_norm": 1.7855048179626465, + "learning_rate": 4.853747714808044e-05, + "loss": 6.2939, + "step": 592 + }, + { + "epoch": 0.03, + "grad_norm": 1.3504676818847656, + "learning_rate": 4.852759523691882e-05, + "loss": 6.0347, + "step": 596 + }, + { + "epoch": 0.03, + "grad_norm": 1.3448097705841064, + "learning_rate": 4.85177133257572e-05, + "loss": 6.1058, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 1.2908382415771484, + "learning_rate": 4.8507831414595585e-05, + "loss": 6.1323, + "step": 604 + }, + { + "epoch": 0.03, + "grad_norm": 1.6946722269058228, + "learning_rate": 4.849794950343397e-05, + "loss": 6.0755, + "step": 608 + }, + { + "epoch": 0.03, + "grad_norm": 1.0505646467208862, + "learning_rate": 4.848806759227235e-05, + "loss": 6.0742, + "step": 612 + }, + { + "epoch": 0.03, + "grad_norm": 1.269295334815979, + "learning_rate": 4.8478185681110724e-05, + "loss": 6.2864, + "step": 616 + }, + { + "epoch": 0.03, + "grad_norm": 1.3444817066192627, + "learning_rate": 4.8468303769949107e-05, + "loss": 6.1599, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 1.5646265745162964, + "learning_rate": 4.845842185878749e-05, + "loss": 6.1745, + "step": 624 + }, + { + "epoch": 0.03, + "grad_norm": 1.2452738285064697, + "learning_rate": 4.844853994762587e-05, + "loss": 6.1796, + "step": 628 + }, + { + "epoch": 0.03, + "grad_norm": 1.9167345762252808, + "learning_rate": 4.843865803646426e-05, + "loss": 6.1854, + "step": 632 + }, + { + "epoch": 0.03, + "grad_norm": 1.3464816808700562, + "learning_rate": 4.842877612530264e-05, + "loss": 6.1394, + "step": 636 + }, + { + "epoch": 0.03, + "grad_norm": 1.5352030992507935, + "learning_rate": 4.841889421414102e-05, + "loss": 6.1579, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 1.2066044807434082, + "learning_rate": 4.84090123029794e-05, + "loss": 6.1284, + "step": 644 + }, + { + "epoch": 0.03, + "grad_norm": 1.3687278032302856, + "learning_rate": 4.839913039181778e-05, + "loss": 6.1768, + "step": 648 + }, + { + "epoch": 0.03, + "grad_norm": 1.4037103652954102, + "learning_rate": 4.838924848065616e-05, + "loss": 6.0552, + "step": 652 + }, + { + "epoch": 0.03, + "grad_norm": 1.8637155294418335, + "learning_rate": 4.8379366569494545e-05, + "loss": 6.126, + "step": 656 + }, + { + "epoch": 0.03, + "grad_norm": 1.8843746185302734, + "learning_rate": 4.836948465833293e-05, + "loss": 6.1839, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 1.2985315322875977, + "learning_rate": 4.835960274717131e-05, + "loss": 6.0767, + "step": 664 + }, + { + "epoch": 0.03, + "grad_norm": 1.5230488777160645, + "learning_rate": 4.8349720836009684e-05, + "loss": 6.1762, + "step": 668 + }, + { + "epoch": 0.03, + "grad_norm": 1.8898823261260986, + "learning_rate": 4.8339838924848067e-05, + "loss": 6.0165, + "step": 672 + }, + { + "epoch": 0.03, + "grad_norm": 1.7203717231750488, + "learning_rate": 4.832995701368645e-05, + "loss": 6.1677, + "step": 676 + }, + { + "epoch": 0.03, + "grad_norm": 1.478527307510376, + "learning_rate": 4.832007510252483e-05, + "loss": 6.2011, + "step": 680 + }, + { + "epoch": 0.03, + "grad_norm": 1.2331160306930542, + "learning_rate": 4.831019319136321e-05, + "loss": 6.1311, + "step": 684 + }, + { + "epoch": 0.03, + "grad_norm": 1.4134057760238647, + "learning_rate": 4.8300311280201595e-05, + "loss": 6.157, + "step": 688 + }, + { + "epoch": 0.03, + "grad_norm": 1.2819682359695435, + "learning_rate": 4.829042936903998e-05, + "loss": 6.0408, + "step": 692 + }, + { + "epoch": 0.03, + "grad_norm": 1.646480917930603, + "learning_rate": 4.828054745787836e-05, + "loss": 6.0527, + "step": 696 + }, + { + "epoch": 0.03, + "grad_norm": 1.3179553747177124, + "learning_rate": 4.8270665546716734e-05, + "loss": 6.1771, + "step": 700 + }, + { + "epoch": 0.03, + "grad_norm": 1.3149442672729492, + "learning_rate": 4.8260783635555116e-05, + "loss": 6.1106, + "step": 704 + }, + { + "epoch": 0.03, + "grad_norm": 1.345340371131897, + "learning_rate": 4.82509017243935e-05, + "loss": 6.0963, + "step": 708 + }, + { + "epoch": 0.04, + "grad_norm": 2.3398826122283936, + "learning_rate": 4.824101981323188e-05, + "loss": 6.075, + "step": 712 + }, + { + "epoch": 0.04, + "grad_norm": 1.1765165328979492, + "learning_rate": 4.823113790207026e-05, + "loss": 6.1648, + "step": 716 + }, + { + "epoch": 0.04, + "grad_norm": 1.5758169889450073, + "learning_rate": 4.8221255990908645e-05, + "loss": 6.206, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 1.3040266036987305, + "learning_rate": 4.8211374079747027e-05, + "loss": 6.0651, + "step": 724 + }, + { + "epoch": 0.04, + "grad_norm": 1.7494781017303467, + "learning_rate": 4.82014921685854e-05, + "loss": 6.0144, + "step": 728 + }, + { + "epoch": 0.04, + "grad_norm": 1.5416646003723145, + "learning_rate": 4.8191610257423784e-05, + "loss": 6.2011, + "step": 732 + }, + { + "epoch": 0.04, + "grad_norm": 1.3276399374008179, + "learning_rate": 4.8181728346262166e-05, + "loss": 6.2073, + "step": 736 + }, + { + "epoch": 0.04, + "grad_norm": 1.3996212482452393, + "learning_rate": 4.817184643510055e-05, + "loss": 6.0768, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 1.3483880758285522, + "learning_rate": 4.816196452393893e-05, + "loss": 5.9309, + "step": 744 + }, + { + "epoch": 0.04, + "grad_norm": 1.1830189228057861, + "learning_rate": 4.815208261277732e-05, + "loss": 6.0258, + "step": 748 + }, + { + "epoch": 0.04, + "grad_norm": 1.3837305307388306, + "learning_rate": 4.8142200701615694e-05, + "loss": 6.0004, + "step": 752 + }, + { + "epoch": 0.04, + "grad_norm": 1.1996792554855347, + "learning_rate": 4.8132318790454076e-05, + "loss": 6.067, + "step": 756 + }, + { + "epoch": 0.04, + "grad_norm": 1.6578378677368164, + "learning_rate": 4.812243687929246e-05, + "loss": 6.1112, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 1.732629418373108, + "learning_rate": 4.811255496813084e-05, + "loss": 6.0361, + "step": 764 + }, + { + "epoch": 0.04, + "grad_norm": 1.3690398931503296, + "learning_rate": 4.810267305696922e-05, + "loss": 6.0105, + "step": 768 + }, + { + "epoch": 0.04, + "grad_norm": 1.551343560218811, + "learning_rate": 4.8092791145807605e-05, + "loss": 5.9922, + "step": 772 + }, + { + "epoch": 0.04, + "grad_norm": 1.222483515739441, + "learning_rate": 4.8082909234645987e-05, + "loss": 6.1032, + "step": 776 + }, + { + "epoch": 0.04, + "grad_norm": 1.4626158475875854, + "learning_rate": 4.807302732348437e-05, + "loss": 6.2455, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 1.4593418836593628, + "learning_rate": 4.8063145412322744e-05, + "loss": 6.104, + "step": 784 + }, + { + "epoch": 0.04, + "grad_norm": 1.4751644134521484, + "learning_rate": 4.8053263501161126e-05, + "loss": 6.0467, + "step": 788 + }, + { + "epoch": 0.04, + "grad_norm": 1.25166916847229, + "learning_rate": 4.804338158999951e-05, + "loss": 6.0525, + "step": 792 + }, + { + "epoch": 0.04, + "grad_norm": 1.3588769435882568, + "learning_rate": 4.803349967883789e-05, + "loss": 6.1355, + "step": 796 + }, + { + "epoch": 0.04, + "grad_norm": 1.2559925317764282, + "learning_rate": 4.802361776767627e-05, + "loss": 6.3282, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 1.3895193338394165, + "learning_rate": 4.8013735856514654e-05, + "loss": 6.0936, + "step": 804 + }, + { + "epoch": 0.04, + "grad_norm": 1.691657543182373, + "learning_rate": 4.8003853945353036e-05, + "loss": 6.1079, + "step": 808 + }, + { + "epoch": 0.04, + "grad_norm": 1.3020676374435425, + "learning_rate": 4.799397203419141e-05, + "loss": 6.0076, + "step": 812 + }, + { + "epoch": 0.04, + "grad_norm": 1.306980848312378, + "learning_rate": 4.7984090123029794e-05, + "loss": 6.1308, + "step": 816 + }, + { + "epoch": 0.04, + "grad_norm": 1.3331334590911865, + "learning_rate": 4.7974208211868176e-05, + "loss": 6.0901, + "step": 820 + }, + { + "epoch": 0.04, + "grad_norm": 1.2504887580871582, + "learning_rate": 4.796432630070656e-05, + "loss": 6.1079, + "step": 824 + }, + { + "epoch": 0.04, + "grad_norm": 1.462392807006836, + "learning_rate": 4.795444438954494e-05, + "loss": 6.0692, + "step": 828 + }, + { + "epoch": 0.04, + "grad_norm": 1.281559944152832, + "learning_rate": 4.794456247838332e-05, + "loss": 6.0399, + "step": 832 + }, + { + "epoch": 0.04, + "grad_norm": 1.2926335334777832, + "learning_rate": 4.7934680567221704e-05, + "loss": 6.0005, + "step": 836 + }, + { + "epoch": 0.04, + "grad_norm": 1.5699244737625122, + "learning_rate": 4.792479865606008e-05, + "loss": 5.9845, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 1.491740345954895, + "learning_rate": 4.791491674489846e-05, + "loss": 6.015, + "step": 844 + }, + { + "epoch": 0.04, + "grad_norm": 1.3208236694335938, + "learning_rate": 4.790503483373684e-05, + "loss": 6.0501, + "step": 848 + }, + { + "epoch": 0.04, + "grad_norm": 1.5497496128082275, + "learning_rate": 4.7895152922575225e-05, + "loss": 6.105, + "step": 852 + }, + { + "epoch": 0.04, + "grad_norm": 1.3851691484451294, + "learning_rate": 4.788527101141361e-05, + "loss": 6.1256, + "step": 856 + }, + { + "epoch": 0.04, + "grad_norm": 1.506717324256897, + "learning_rate": 4.787538910025199e-05, + "loss": 5.9827, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 1.2789814472198486, + "learning_rate": 4.786550718909038e-05, + "loss": 6.0632, + "step": 864 + }, + { + "epoch": 0.04, + "grad_norm": 1.2237069606781006, + "learning_rate": 4.7855625277928754e-05, + "loss": 6.0908, + "step": 868 + }, + { + "epoch": 0.04, + "grad_norm": 1.2778388261795044, + "learning_rate": 4.7845743366767136e-05, + "loss": 6.0442, + "step": 872 + }, + { + "epoch": 0.04, + "grad_norm": 1.534972071647644, + "learning_rate": 4.783586145560552e-05, + "loss": 6.1035, + "step": 876 + }, + { + "epoch": 0.04, + "grad_norm": 1.4843637943267822, + "learning_rate": 4.78259795444439e-05, + "loss": 6.0233, + "step": 880 + }, + { + "epoch": 0.04, + "grad_norm": 1.4751485586166382, + "learning_rate": 4.781609763328228e-05, + "loss": 6.109, + "step": 884 + }, + { + "epoch": 0.04, + "grad_norm": 1.399499535560608, + "learning_rate": 4.7806215722120664e-05, + "loss": 6.0075, + "step": 888 + }, + { + "epoch": 0.04, + "grad_norm": 1.5269832611083984, + "learning_rate": 4.7796333810959046e-05, + "loss": 6.1387, + "step": 892 + }, + { + "epoch": 0.04, + "grad_norm": 1.342552661895752, + "learning_rate": 4.778645189979742e-05, + "loss": 6.0763, + "step": 896 + }, + { + "epoch": 0.04, + "grad_norm": 1.2727460861206055, + "learning_rate": 4.77765699886358e-05, + "loss": 5.9279, + "step": 900 + }, + { + "epoch": 0.04, + "grad_norm": 1.3465628623962402, + "learning_rate": 4.7766688077474185e-05, + "loss": 6.0226, + "step": 904 + }, + { + "epoch": 0.04, + "grad_norm": 1.288557767868042, + "learning_rate": 4.775680616631257e-05, + "loss": 6.0799, + "step": 908 + }, + { + "epoch": 0.05, + "grad_norm": 1.375345230102539, + "learning_rate": 4.774692425515095e-05, + "loss": 6.0868, + "step": 912 + }, + { + "epoch": 0.05, + "grad_norm": 1.440955638885498, + "learning_rate": 4.773704234398933e-05, + "loss": 5.9835, + "step": 916 + }, + { + "epoch": 0.05, + "grad_norm": 1.3870395421981812, + "learning_rate": 4.7727160432827714e-05, + "loss": 6.0711, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 1.4793189764022827, + "learning_rate": 4.771727852166609e-05, + "loss": 6.1001, + "step": 924 + }, + { + "epoch": 0.05, + "grad_norm": 1.2727391719818115, + "learning_rate": 4.770739661050447e-05, + "loss": 6.0782, + "step": 928 + }, + { + "epoch": 0.05, + "grad_norm": 1.3459304571151733, + "learning_rate": 4.769751469934285e-05, + "loss": 5.9659, + "step": 932 + }, + { + "epoch": 0.05, + "grad_norm": 1.6304296255111694, + "learning_rate": 4.7687632788181235e-05, + "loss": 6.0666, + "step": 936 + }, + { + "epoch": 0.05, + "grad_norm": 1.721566081047058, + "learning_rate": 4.767775087701962e-05, + "loss": 6.0233, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 1.486752986907959, + "learning_rate": 4.7667868965858e-05, + "loss": 6.0544, + "step": 944 + }, + { + "epoch": 0.05, + "grad_norm": 1.404415488243103, + "learning_rate": 4.765798705469638e-05, + "loss": 5.8913, + "step": 948 + }, + { + "epoch": 0.05, + "grad_norm": 1.550802230834961, + "learning_rate": 4.764810514353476e-05, + "loss": 6.0449, + "step": 952 + }, + { + "epoch": 0.05, + "grad_norm": 1.5222808122634888, + "learning_rate": 4.763822323237314e-05, + "loss": 6.1524, + "step": 956 + }, + { + "epoch": 0.05, + "grad_norm": 1.2582489252090454, + "learning_rate": 4.762834132121152e-05, + "loss": 6.09, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 1.5326069593429565, + "learning_rate": 4.76184594100499e-05, + "loss": 6.0265, + "step": 964 + }, + { + "epoch": 0.05, + "grad_norm": 1.5986106395721436, + "learning_rate": 4.7608577498888285e-05, + "loss": 6.1111, + "step": 968 + }, + { + "epoch": 0.05, + "grad_norm": 1.2689043283462524, + "learning_rate": 4.759869558772667e-05, + "loss": 6.0827, + "step": 972 + }, + { + "epoch": 0.05, + "grad_norm": 1.2669328451156616, + "learning_rate": 4.758881367656505e-05, + "loss": 6.1393, + "step": 976 + }, + { + "epoch": 0.05, + "grad_norm": 1.2476458549499512, + "learning_rate": 4.757893176540343e-05, + "loss": 6.0072, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 1.4797428846359253, + "learning_rate": 4.756904985424181e-05, + "loss": 6.0025, + "step": 984 + }, + { + "epoch": 0.05, + "grad_norm": 1.9927531480789185, + "learning_rate": 4.7559167943080195e-05, + "loss": 6.087, + "step": 988 + }, + { + "epoch": 0.05, + "grad_norm": 1.6228762865066528, + "learning_rate": 4.754928603191858e-05, + "loss": 6.0355, + "step": 992 + }, + { + "epoch": 0.05, + "grad_norm": 1.5881098508834839, + "learning_rate": 4.753940412075696e-05, + "loss": 6.0462, + "step": 996 + }, + { + "epoch": 0.05, + "grad_norm": 1.944541335105896, + "learning_rate": 4.752952220959534e-05, + "loss": 6.1294, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 1.6126629114151, + "learning_rate": 4.751964029843372e-05, + "loss": 6.1586, + "step": 1004 + }, + { + "epoch": 0.05, + "grad_norm": 1.4057198762893677, + "learning_rate": 4.75097583872721e-05, + "loss": 6.24, + "step": 1008 + }, + { + "epoch": 0.05, + "grad_norm": 1.3622597455978394, + "learning_rate": 4.749987647611048e-05, + "loss": 5.9785, + "step": 1012 + }, + { + "epoch": 0.05, + "grad_norm": 1.3643813133239746, + "learning_rate": 4.748999456494886e-05, + "loss": 6.0392, + "step": 1016 + }, + { + "epoch": 0.05, + "grad_norm": 1.793847918510437, + "learning_rate": 4.7480112653787245e-05, + "loss": 5.9712, + "step": 1020 + }, + { + "epoch": 0.05, + "grad_norm": 1.4792604446411133, + "learning_rate": 4.747023074262563e-05, + "loss": 6.0705, + "step": 1024 + }, + { + "epoch": 0.05, + "grad_norm": 1.35064697265625, + "learning_rate": 4.746034883146401e-05, + "loss": 6.0608, + "step": 1028 + }, + { + "epoch": 0.05, + "grad_norm": 1.3092408180236816, + "learning_rate": 4.745046692030239e-05, + "loss": 5.885, + "step": 1032 + }, + { + "epoch": 0.05, + "grad_norm": 1.560230016708374, + "learning_rate": 4.744058500914077e-05, + "loss": 6.1149, + "step": 1036 + }, + { + "epoch": 0.05, + "grad_norm": 1.5619248151779175, + "learning_rate": 4.743070309797915e-05, + "loss": 6.1887, + "step": 1040 + }, + { + "epoch": 0.05, + "grad_norm": 1.4519466161727905, + "learning_rate": 4.742082118681753e-05, + "loss": 6.0853, + "step": 1044 + }, + { + "epoch": 0.05, + "grad_norm": 1.4475151300430298, + "learning_rate": 4.741093927565591e-05, + "loss": 6.0921, + "step": 1048 + }, + { + "epoch": 0.05, + "grad_norm": 1.2523047924041748, + "learning_rate": 4.7401057364494295e-05, + "loss": 6.1132, + "step": 1052 + }, + { + "epoch": 0.05, + "grad_norm": 1.2426884174346924, + "learning_rate": 4.7391175453332677e-05, + "loss": 5.9939, + "step": 1056 + }, + { + "epoch": 0.05, + "grad_norm": 1.4118828773498535, + "learning_rate": 4.738129354217106e-05, + "loss": 6.082, + "step": 1060 + }, + { + "epoch": 0.05, + "grad_norm": 1.6150298118591309, + "learning_rate": 4.737141163100944e-05, + "loss": 6.0342, + "step": 1064 + }, + { + "epoch": 0.05, + "grad_norm": 1.3286446332931519, + "learning_rate": 4.7361529719847816e-05, + "loss": 5.9456, + "step": 1068 + }, + { + "epoch": 0.05, + "grad_norm": 1.4334193468093872, + "learning_rate": 4.73516478086862e-05, + "loss": 5.9225, + "step": 1072 + }, + { + "epoch": 0.05, + "grad_norm": 1.2666079998016357, + "learning_rate": 4.734176589752458e-05, + "loss": 6.0389, + "step": 1076 + }, + { + "epoch": 0.05, + "grad_norm": 1.2868504524230957, + "learning_rate": 4.733188398636296e-05, + "loss": 6.0719, + "step": 1080 + }, + { + "epoch": 0.05, + "grad_norm": 1.7467691898345947, + "learning_rate": 4.7322002075201344e-05, + "loss": 5.9916, + "step": 1084 + }, + { + "epoch": 0.05, + "grad_norm": 1.4924054145812988, + "learning_rate": 4.7312120164039726e-05, + "loss": 5.9197, + "step": 1088 + }, + { + "epoch": 0.05, + "grad_norm": 1.1824862957000732, + "learning_rate": 4.730223825287811e-05, + "loss": 5.9122, + "step": 1092 + }, + { + "epoch": 0.05, + "grad_norm": 1.435604214668274, + "learning_rate": 4.729235634171649e-05, + "loss": 6.1722, + "step": 1096 + }, + { + "epoch": 0.05, + "grad_norm": 1.6537317037582397, + "learning_rate": 4.728247443055487e-05, + "loss": 6.0557, + "step": 1100 + }, + { + "epoch": 0.05, + "grad_norm": 1.237634539604187, + "learning_rate": 4.7272592519393255e-05, + "loss": 6.1475, + "step": 1104 + }, + { + "epoch": 0.05, + "grad_norm": 1.2861469984054565, + "learning_rate": 4.7262710608231637e-05, + "loss": 6.0261, + "step": 1108 + }, + { + "epoch": 0.05, + "grad_norm": 1.3722236156463623, + "learning_rate": 4.725282869707002e-05, + "loss": 5.9994, + "step": 1112 + }, + { + "epoch": 0.06, + "grad_norm": 1.4114540815353394, + "learning_rate": 4.72429467859084e-05, + "loss": 5.923, + "step": 1116 + }, + { + "epoch": 0.06, + "grad_norm": 1.572077751159668, + "learning_rate": 4.723306487474678e-05, + "loss": 5.9886, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 1.971572995185852, + "learning_rate": 4.722318296358516e-05, + "loss": 6.0632, + "step": 1124 + }, + { + "epoch": 0.06, + "grad_norm": 2.1920833587646484, + "learning_rate": 4.721330105242354e-05, + "loss": 6.066, + "step": 1128 + }, + { + "epoch": 0.06, + "grad_norm": 1.6061211824417114, + "learning_rate": 4.720341914126192e-05, + "loss": 5.9523, + "step": 1132 + }, + { + "epoch": 0.06, + "grad_norm": 1.617601990699768, + "learning_rate": 4.7193537230100304e-05, + "loss": 6.0579, + "step": 1136 + }, + { + "epoch": 0.06, + "grad_norm": 1.674851655960083, + "learning_rate": 4.7183655318938686e-05, + "loss": 5.869, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 1.6465020179748535, + "learning_rate": 4.717377340777707e-05, + "loss": 5.9525, + "step": 1144 + }, + { + "epoch": 0.06, + "grad_norm": 1.4357322454452515, + "learning_rate": 4.716389149661545e-05, + "loss": 5.8798, + "step": 1148 + }, + { + "epoch": 0.06, + "grad_norm": 1.319442868232727, + "learning_rate": 4.7154009585453826e-05, + "loss": 6.0924, + "step": 1152 + }, + { + "epoch": 0.06, + "grad_norm": 1.7044557332992554, + "learning_rate": 4.714412767429221e-05, + "loss": 6.0367, + "step": 1156 + }, + { + "epoch": 0.06, + "grad_norm": 1.6891183853149414, + "learning_rate": 4.713424576313059e-05, + "loss": 5.9983, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 1.340993046760559, + "learning_rate": 4.712436385196897e-05, + "loss": 6.0353, + "step": 1164 + }, + { + "epoch": 0.06, + "grad_norm": 1.3144937753677368, + "learning_rate": 4.7114481940807354e-05, + "loss": 5.9603, + "step": 1168 + }, + { + "epoch": 0.06, + "grad_norm": 1.9154691696166992, + "learning_rate": 4.7104600029645736e-05, + "loss": 6.057, + "step": 1172 + }, + { + "epoch": 0.06, + "grad_norm": 1.2976912260055542, + "learning_rate": 4.709471811848412e-05, + "loss": 5.992, + "step": 1176 + }, + { + "epoch": 0.06, + "grad_norm": 1.4527958631515503, + "learning_rate": 4.70848362073225e-05, + "loss": 5.9253, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 1.6319037675857544, + "learning_rate": 4.7074954296160875e-05, + "loss": 6.1028, + "step": 1184 + }, + { + "epoch": 0.06, + "grad_norm": 1.7407358884811401, + "learning_rate": 4.706507238499926e-05, + "loss": 5.9943, + "step": 1188 + }, + { + "epoch": 0.06, + "grad_norm": 2.127054452896118, + "learning_rate": 4.705519047383764e-05, + "loss": 6.0707, + "step": 1192 + }, + { + "epoch": 0.06, + "grad_norm": 1.761857271194458, + "learning_rate": 4.704530856267602e-05, + "loss": 6.0144, + "step": 1196 + }, + { + "epoch": 0.06, + "grad_norm": 1.4145355224609375, + "learning_rate": 4.7035426651514404e-05, + "loss": 6.0829, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 1.3968502283096313, + "learning_rate": 4.7025544740352786e-05, + "loss": 6.0907, + "step": 1204 + }, + { + "epoch": 0.06, + "grad_norm": 1.406941294670105, + "learning_rate": 4.701566282919117e-05, + "loss": 5.9521, + "step": 1208 + }, + { + "epoch": 0.06, + "grad_norm": 1.3803069591522217, + "learning_rate": 4.700578091802955e-05, + "loss": 5.9801, + "step": 1212 + }, + { + "epoch": 0.06, + "grad_norm": 1.3069123029708862, + "learning_rate": 4.699589900686793e-05, + "loss": 5.9288, + "step": 1216 + }, + { + "epoch": 0.06, + "grad_norm": 1.5246793031692505, + "learning_rate": 4.6986017095706314e-05, + "loss": 6.0128, + "step": 1220 + }, + { + "epoch": 0.06, + "grad_norm": 1.5078657865524292, + "learning_rate": 4.6976135184544696e-05, + "loss": 5.9037, + "step": 1224 + }, + { + "epoch": 0.06, + "grad_norm": 1.9630528688430786, + "learning_rate": 4.696625327338308e-05, + "loss": 5.9762, + "step": 1228 + }, + { + "epoch": 0.06, + "grad_norm": 1.4446896314620972, + "learning_rate": 4.695637136222146e-05, + "loss": 5.9254, + "step": 1232 + }, + { + "epoch": 0.06, + "grad_norm": 1.1959322690963745, + "learning_rate": 4.6946489451059835e-05, + "loss": 5.9263, + "step": 1236 + }, + { + "epoch": 0.06, + "grad_norm": 1.5779132843017578, + "learning_rate": 4.693660753989822e-05, + "loss": 5.9132, + "step": 1240 + }, + { + "epoch": 0.06, + "grad_norm": 1.3477551937103271, + "learning_rate": 4.69267256287366e-05, + "loss": 6.0072, + "step": 1244 + }, + { + "epoch": 0.06, + "grad_norm": 1.950527310371399, + "learning_rate": 4.691684371757498e-05, + "loss": 5.9209, + "step": 1248 + }, + { + "epoch": 0.06, + "grad_norm": 2.0284175872802734, + "learning_rate": 4.6906961806413364e-05, + "loss": 6.0581, + "step": 1252 + }, + { + "epoch": 0.06, + "grad_norm": 1.3213205337524414, + "learning_rate": 4.6897079895251746e-05, + "loss": 5.9722, + "step": 1256 + }, + { + "epoch": 0.06, + "grad_norm": 1.2748960256576538, + "learning_rate": 4.688719798409013e-05, + "loss": 6.0285, + "step": 1260 + }, + { + "epoch": 0.06, + "grad_norm": 1.4361499547958374, + "learning_rate": 4.687731607292851e-05, + "loss": 5.9448, + "step": 1264 + }, + { + "epoch": 0.06, + "grad_norm": 1.5793545246124268, + "learning_rate": 4.6867434161766885e-05, + "loss": 6.0026, + "step": 1268 + }, + { + "epoch": 0.06, + "grad_norm": 1.7214800119400024, + "learning_rate": 4.685755225060527e-05, + "loss": 6.1118, + "step": 1272 + }, + { + "epoch": 0.06, + "grad_norm": 1.3730213642120361, + "learning_rate": 4.684767033944365e-05, + "loss": 5.9988, + "step": 1276 + }, + { + "epoch": 0.06, + "grad_norm": 1.4589751958847046, + "learning_rate": 4.683778842828203e-05, + "loss": 6.0041, + "step": 1280 + }, + { + "epoch": 0.06, + "grad_norm": 1.3940002918243408, + "learning_rate": 4.682790651712041e-05, + "loss": 5.9818, + "step": 1284 + }, + { + "epoch": 0.06, + "grad_norm": 1.3168179988861084, + "learning_rate": 4.6818024605958795e-05, + "loss": 5.9819, + "step": 1288 + }, + { + "epoch": 0.06, + "grad_norm": 1.737804651260376, + "learning_rate": 4.680814269479718e-05, + "loss": 5.9301, + "step": 1292 + }, + { + "epoch": 0.06, + "grad_norm": 1.2958004474639893, + "learning_rate": 4.679826078363555e-05, + "loss": 5.9922, + "step": 1296 + }, + { + "epoch": 0.06, + "grad_norm": 1.7017489671707153, + "learning_rate": 4.6788378872473935e-05, + "loss": 6.0218, + "step": 1300 + }, + { + "epoch": 0.06, + "grad_norm": 1.7935067415237427, + "learning_rate": 4.677849696131232e-05, + "loss": 5.8878, + "step": 1304 + }, + { + "epoch": 0.06, + "grad_norm": 2.293447256088257, + "learning_rate": 4.67686150501507e-05, + "loss": 6.1183, + "step": 1308 + }, + { + "epoch": 0.06, + "grad_norm": 1.470645785331726, + "learning_rate": 4.675873313898908e-05, + "loss": 5.8939, + "step": 1312 + }, + { + "epoch": 0.07, + "grad_norm": 1.3084948062896729, + "learning_rate": 4.674885122782746e-05, + "loss": 5.9267, + "step": 1316 + }, + { + "epoch": 0.07, + "grad_norm": 1.2316402196884155, + "learning_rate": 4.6738969316665845e-05, + "loss": 6.0664, + "step": 1320 + }, + { + "epoch": 0.07, + "grad_norm": 1.3546851873397827, + "learning_rate": 4.672908740550423e-05, + "loss": 5.9785, + "step": 1324 + }, + { + "epoch": 0.07, + "grad_norm": 1.1932753324508667, + "learning_rate": 4.671920549434261e-05, + "loss": 5.8807, + "step": 1328 + }, + { + "epoch": 0.07, + "grad_norm": 1.3659350872039795, + "learning_rate": 4.670932358318099e-05, + "loss": 6.0175, + "step": 1332 + }, + { + "epoch": 0.07, + "grad_norm": 1.316653847694397, + "learning_rate": 4.669944167201937e-05, + "loss": 5.9124, + "step": 1336 + }, + { + "epoch": 0.07, + "grad_norm": 1.4429857730865479, + "learning_rate": 4.6689559760857755e-05, + "loss": 5.9828, + "step": 1340 + }, + { + "epoch": 0.07, + "grad_norm": 1.4865456819534302, + "learning_rate": 4.667967784969614e-05, + "loss": 6.0829, + "step": 1344 + }, + { + "epoch": 0.07, + "grad_norm": 1.453019618988037, + "learning_rate": 4.666979593853452e-05, + "loss": 5.9101, + "step": 1348 + }, + { + "epoch": 0.07, + "grad_norm": 1.2942475080490112, + "learning_rate": 4.6659914027372895e-05, + "loss": 6.0094, + "step": 1352 + }, + { + "epoch": 0.07, + "grad_norm": 1.6731159687042236, + "learning_rate": 4.665003211621128e-05, + "loss": 5.9274, + "step": 1356 + }, + { + "epoch": 0.07, + "grad_norm": 1.5828598737716675, + "learning_rate": 4.664015020504966e-05, + "loss": 6.0568, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 1.3310965299606323, + "learning_rate": 4.663026829388804e-05, + "loss": 5.9873, + "step": 1364 + }, + { + "epoch": 0.07, + "grad_norm": 1.5734776258468628, + "learning_rate": 4.662038638272642e-05, + "loss": 5.9194, + "step": 1368 + }, + { + "epoch": 0.07, + "grad_norm": 1.8752501010894775, + "learning_rate": 4.6610504471564805e-05, + "loss": 5.945, + "step": 1372 + }, + { + "epoch": 0.07, + "grad_norm": 1.5967419147491455, + "learning_rate": 4.660062256040319e-05, + "loss": 5.8828, + "step": 1376 + }, + { + "epoch": 0.07, + "grad_norm": 1.3059850931167603, + "learning_rate": 4.659074064924156e-05, + "loss": 5.9692, + "step": 1380 + }, + { + "epoch": 0.07, + "grad_norm": 1.3276833295822144, + "learning_rate": 4.6580858738079945e-05, + "loss": 5.9998, + "step": 1384 + }, + { + "epoch": 0.07, + "grad_norm": 1.481294870376587, + "learning_rate": 4.6570976826918327e-05, + "loss": 6.0054, + "step": 1388 + }, + { + "epoch": 0.07, + "grad_norm": 1.2492045164108276, + "learning_rate": 4.656109491575671e-05, + "loss": 5.7969, + "step": 1392 + }, + { + "epoch": 0.07, + "grad_norm": 1.3145158290863037, + "learning_rate": 4.655121300459509e-05, + "loss": 6.0625, + "step": 1396 + }, + { + "epoch": 0.07, + "grad_norm": 1.7083948850631714, + "learning_rate": 4.654133109343347e-05, + "loss": 6.0563, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 2.5010440349578857, + "learning_rate": 4.6531449182271855e-05, + "loss": 5.9032, + "step": 1404 + }, + { + "epoch": 0.07, + "grad_norm": 1.3582074642181396, + "learning_rate": 4.652156727111023e-05, + "loss": 5.9933, + "step": 1408 + }, + { + "epoch": 0.07, + "grad_norm": 1.4667145013809204, + "learning_rate": 4.651168535994861e-05, + "loss": 5.8992, + "step": 1412 + }, + { + "epoch": 0.07, + "grad_norm": 1.5343049764633179, + "learning_rate": 4.6501803448786994e-05, + "loss": 5.9115, + "step": 1416 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951204538345337, + "learning_rate": 4.6491921537625376e-05, + "loss": 5.8802, + "step": 1420 + }, + { + "epoch": 0.07, + "grad_norm": 1.566027283668518, + "learning_rate": 4.648203962646376e-05, + "loss": 5.8612, + "step": 1424 + }, + { + "epoch": 0.07, + "grad_norm": 1.2942713499069214, + "learning_rate": 4.647215771530214e-05, + "loss": 5.936, + "step": 1428 + }, + { + "epoch": 0.07, + "grad_norm": 1.5581068992614746, + "learning_rate": 4.646227580414052e-05, + "loss": 5.8286, + "step": 1432 + }, + { + "epoch": 0.07, + "grad_norm": 1.3115367889404297, + "learning_rate": 4.6452393892978905e-05, + "loss": 5.9605, + "step": 1436 + }, + { + "epoch": 0.07, + "grad_norm": 1.3432234525680542, + "learning_rate": 4.6442511981817287e-05, + "loss": 5.8929, + "step": 1440 + }, + { + "epoch": 0.07, + "grad_norm": 1.789021611213684, + "learning_rate": 4.643263007065567e-05, + "loss": 6.0817, + "step": 1444 + }, + { + "epoch": 0.07, + "grad_norm": 1.8558330535888672, + "learning_rate": 4.642274815949405e-05, + "loss": 6.0362, + "step": 1448 + }, + { + "epoch": 0.07, + "grad_norm": 1.2429139614105225, + "learning_rate": 4.641286624833243e-05, + "loss": 6.0426, + "step": 1452 + }, + { + "epoch": 0.07, + "grad_norm": 1.4920710325241089, + "learning_rate": 4.6402984337170815e-05, + "loss": 5.8848, + "step": 1456 + }, + { + "epoch": 0.07, + "grad_norm": 1.456693172454834, + "learning_rate": 4.63931024260092e-05, + "loss": 6.0129, + "step": 1460 + }, + { + "epoch": 0.07, + "grad_norm": 1.2984966039657593, + "learning_rate": 4.638322051484757e-05, + "loss": 5.9807, + "step": 1464 + }, + { + "epoch": 0.07, + "grad_norm": 1.3711044788360596, + "learning_rate": 4.6373338603685954e-05, + "loss": 5.9158, + "step": 1468 + }, + { + "epoch": 0.07, + "grad_norm": 1.432022213935852, + "learning_rate": 4.6363456692524336e-05, + "loss": 6.0205, + "step": 1472 + }, + { + "epoch": 0.07, + "grad_norm": 1.549072504043579, + "learning_rate": 4.635357478136272e-05, + "loss": 5.9811, + "step": 1476 + }, + { + "epoch": 0.07, + "grad_norm": 1.531638741493225, + "learning_rate": 4.63436928702011e-05, + "loss": 5.9643, + "step": 1480 + }, + { + "epoch": 0.07, + "grad_norm": 1.9268430471420288, + "learning_rate": 4.633381095903948e-05, + "loss": 6.022, + "step": 1484 + }, + { + "epoch": 0.07, + "grad_norm": 1.491971731185913, + "learning_rate": 4.6323929047877865e-05, + "loss": 5.9582, + "step": 1488 + }, + { + "epoch": 0.07, + "grad_norm": 1.253057837486267, + "learning_rate": 4.631404713671624e-05, + "loss": 5.9783, + "step": 1492 + }, + { + "epoch": 0.07, + "grad_norm": 1.7390155792236328, + "learning_rate": 4.630416522555462e-05, + "loss": 5.9987, + "step": 1496 + }, + { + "epoch": 0.07, + "grad_norm": 1.605100154876709, + "learning_rate": 4.6294283314393004e-05, + "loss": 6.0706, + "step": 1500 + }, + { + "epoch": 0.07, + "grad_norm": 1.30108642578125, + "learning_rate": 4.6284401403231386e-05, + "loss": 6.0712, + "step": 1504 + }, + { + "epoch": 0.07, + "grad_norm": 1.7230815887451172, + "learning_rate": 4.627451949206977e-05, + "loss": 6.0353, + "step": 1508 + }, + { + "epoch": 0.07, + "grad_norm": 1.7896109819412231, + "learning_rate": 4.626463758090815e-05, + "loss": 5.9901, + "step": 1512 + }, + { + "epoch": 0.07, + "grad_norm": 1.710947871208191, + "learning_rate": 4.625475566974653e-05, + "loss": 5.9141, + "step": 1516 + }, + { + "epoch": 0.08, + "grad_norm": 1.5355948209762573, + "learning_rate": 4.6244873758584914e-05, + "loss": 5.9802, + "step": 1520 + }, + { + "epoch": 0.08, + "grad_norm": 1.4352582693099976, + "learning_rate": 4.623499184742329e-05, + "loss": 6.0397, + "step": 1524 + }, + { + "epoch": 0.08, + "grad_norm": 1.4116641283035278, + "learning_rate": 4.622510993626167e-05, + "loss": 6.0213, + "step": 1528 + }, + { + "epoch": 0.08, + "grad_norm": 1.2762186527252197, + "learning_rate": 4.6215228025100054e-05, + "loss": 5.8323, + "step": 1532 + }, + { + "epoch": 0.08, + "grad_norm": 1.8467053174972534, + "learning_rate": 4.6205346113938436e-05, + "loss": 5.8623, + "step": 1536 + }, + { + "epoch": 0.08, + "grad_norm": 1.5691808462142944, + "learning_rate": 4.619546420277682e-05, + "loss": 6.0056, + "step": 1540 + }, + { + "epoch": 0.08, + "grad_norm": 1.9999240636825562, + "learning_rate": 4.61855822916152e-05, + "loss": 6.1042, + "step": 1544 + }, + { + "epoch": 0.08, + "grad_norm": 1.9879581928253174, + "learning_rate": 4.617570038045358e-05, + "loss": 5.8972, + "step": 1548 + }, + { + "epoch": 0.08, + "grad_norm": 1.6930572986602783, + "learning_rate": 4.6165818469291964e-05, + "loss": 5.8287, + "step": 1552 + }, + { + "epoch": 0.08, + "grad_norm": 2.064516544342041, + "learning_rate": 4.6155936558130346e-05, + "loss": 5.986, + "step": 1556 + }, + { + "epoch": 0.08, + "grad_norm": 1.5297229290008545, + "learning_rate": 4.614605464696873e-05, + "loss": 6.0793, + "step": 1560 + }, + { + "epoch": 0.08, + "grad_norm": 1.3446491956710815, + "learning_rate": 4.613617273580711e-05, + "loss": 5.971, + "step": 1564 + }, + { + "epoch": 0.08, + "grad_norm": 1.3736571073532104, + "learning_rate": 4.612629082464549e-05, + "loss": 6.049, + "step": 1568 + }, + { + "epoch": 0.08, + "grad_norm": 1.439300298690796, + "learning_rate": 4.6116408913483874e-05, + "loss": 6.0757, + "step": 1572 + }, + { + "epoch": 0.08, + "grad_norm": 1.766449213027954, + "learning_rate": 4.610652700232225e-05, + "loss": 5.8482, + "step": 1576 + }, + { + "epoch": 0.08, + "grad_norm": 1.3712018728256226, + "learning_rate": 4.609664509116063e-05, + "loss": 5.9108, + "step": 1580 + }, + { + "epoch": 0.08, + "grad_norm": 1.6353716850280762, + "learning_rate": 4.6086763179999014e-05, + "loss": 5.9461, + "step": 1584 + }, + { + "epoch": 0.08, + "grad_norm": 1.549001932144165, + "learning_rate": 4.6076881268837396e-05, + "loss": 5.9503, + "step": 1588 + }, + { + "epoch": 0.08, + "grad_norm": 1.8643873929977417, + "learning_rate": 4.606699935767578e-05, + "loss": 5.8876, + "step": 1592 + }, + { + "epoch": 0.08, + "grad_norm": 3.1906590461730957, + "learning_rate": 4.605711744651416e-05, + "loss": 5.8155, + "step": 1596 + }, + { + "epoch": 0.08, + "grad_norm": 1.4951763153076172, + "learning_rate": 4.604723553535254e-05, + "loss": 6.0645, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 1.5761932134628296, + "learning_rate": 4.6037353624190924e-05, + "loss": 5.8113, + "step": 1604 + }, + { + "epoch": 0.08, + "grad_norm": 1.858544111251831, + "learning_rate": 4.60274717130293e-05, + "loss": 5.9492, + "step": 1608 + }, + { + "epoch": 0.08, + "grad_norm": 1.2514946460723877, + "learning_rate": 4.601758980186768e-05, + "loss": 6.0415, + "step": 1612 + }, + { + "epoch": 0.08, + "grad_norm": 1.4306968450546265, + "learning_rate": 4.600770789070606e-05, + "loss": 6.0258, + "step": 1616 + }, + { + "epoch": 0.08, + "grad_norm": 1.4032137393951416, + "learning_rate": 4.5997825979544445e-05, + "loss": 5.9323, + "step": 1620 + }, + { + "epoch": 0.08, + "grad_norm": 1.424999713897705, + "learning_rate": 4.598794406838283e-05, + "loss": 5.937, + "step": 1624 + }, + { + "epoch": 0.08, + "grad_norm": 1.3444308042526245, + "learning_rate": 4.597806215722121e-05, + "loss": 5.8967, + "step": 1628 + }, + { + "epoch": 0.08, + "grad_norm": 1.4193394184112549, + "learning_rate": 4.596818024605959e-05, + "loss": 5.9547, + "step": 1632 + }, + { + "epoch": 0.08, + "grad_norm": 1.2748864889144897, + "learning_rate": 4.595829833489797e-05, + "loss": 5.9361, + "step": 1636 + }, + { + "epoch": 0.08, + "grad_norm": 1.469801664352417, + "learning_rate": 4.594841642373635e-05, + "loss": 5.8623, + "step": 1640 + }, + { + "epoch": 0.08, + "grad_norm": 1.563642978668213, + "learning_rate": 4.593853451257473e-05, + "loss": 6.0724, + "step": 1644 + }, + { + "epoch": 0.08, + "grad_norm": 1.504905104637146, + "learning_rate": 4.592865260141311e-05, + "loss": 5.9072, + "step": 1648 + }, + { + "epoch": 0.08, + "grad_norm": 1.543295979499817, + "learning_rate": 4.5918770690251495e-05, + "loss": 5.9527, + "step": 1652 + }, + { + "epoch": 0.08, + "grad_norm": 2.244002103805542, + "learning_rate": 4.590888877908988e-05, + "loss": 6.0182, + "step": 1656 + }, + { + "epoch": 0.08, + "grad_norm": 2.3683278560638428, + "learning_rate": 4.589900686792826e-05, + "loss": 6.022, + "step": 1660 + }, + { + "epoch": 0.08, + "grad_norm": 1.5250694751739502, + "learning_rate": 4.588912495676664e-05, + "loss": 5.9414, + "step": 1664 + }, + { + "epoch": 0.08, + "grad_norm": 1.940976619720459, + "learning_rate": 4.5879243045605023e-05, + "loss": 5.9301, + "step": 1668 + }, + { + "epoch": 0.08, + "grad_norm": 1.503818154335022, + "learning_rate": 4.5869361134443405e-05, + "loss": 5.8361, + "step": 1672 + }, + { + "epoch": 0.08, + "grad_norm": 2.5099518299102783, + "learning_rate": 4.585947922328179e-05, + "loss": 5.8945, + "step": 1676 + }, + { + "epoch": 0.08, + "grad_norm": 1.6731679439544678, + "learning_rate": 4.584959731212017e-05, + "loss": 5.9091, + "step": 1680 + }, + { + "epoch": 0.08, + "grad_norm": 1.2417348623275757, + "learning_rate": 4.583971540095855e-05, + "loss": 5.9407, + "step": 1684 + }, + { + "epoch": 0.08, + "grad_norm": 1.4417905807495117, + "learning_rate": 4.5829833489796934e-05, + "loss": 6.0244, + "step": 1688 + }, + { + "epoch": 0.08, + "grad_norm": 1.8846923112869263, + "learning_rate": 4.581995157863531e-05, + "loss": 5.9297, + "step": 1692 + }, + { + "epoch": 0.08, + "grad_norm": 1.6629383563995361, + "learning_rate": 4.581006966747369e-05, + "loss": 5.845, + "step": 1696 + }, + { + "epoch": 0.08, + "grad_norm": 1.4970371723175049, + "learning_rate": 4.580018775631207e-05, + "loss": 5.9948, + "step": 1700 + }, + { + "epoch": 0.08, + "grad_norm": 2.547971248626709, + "learning_rate": 4.5790305845150455e-05, + "loss": 5.9322, + "step": 1704 + }, + { + "epoch": 0.08, + "grad_norm": 2.2521891593933105, + "learning_rate": 4.578042393398884e-05, + "loss": 5.9448, + "step": 1708 + }, + { + "epoch": 0.08, + "grad_norm": 1.8124247789382935, + "learning_rate": 4.577054202282722e-05, + "loss": 5.9226, + "step": 1712 + }, + { + "epoch": 0.08, + "grad_norm": 2.33345890045166, + "learning_rate": 4.57606601116656e-05, + "loss": 6.0771, + "step": 1716 + }, + { + "epoch": 0.08, + "grad_norm": 2.122553825378418, + "learning_rate": 4.575077820050398e-05, + "loss": 5.8754, + "step": 1720 + }, + { + "epoch": 0.09, + "grad_norm": 1.5283949375152588, + "learning_rate": 4.574089628934236e-05, + "loss": 5.8644, + "step": 1724 + }, + { + "epoch": 0.09, + "grad_norm": 1.6564345359802246, + "learning_rate": 4.573101437818074e-05, + "loss": 5.8691, + "step": 1728 + }, + { + "epoch": 0.09, + "grad_norm": 1.6714503765106201, + "learning_rate": 4.572113246701912e-05, + "loss": 5.9227, + "step": 1732 + }, + { + "epoch": 0.09, + "grad_norm": 1.543869972229004, + "learning_rate": 4.5711250555857505e-05, + "loss": 5.9588, + "step": 1736 + }, + { + "epoch": 0.09, + "grad_norm": 1.248793601989746, + "learning_rate": 4.570136864469589e-05, + "loss": 5.9065, + "step": 1740 + }, + { + "epoch": 0.09, + "grad_norm": 1.3636749982833862, + "learning_rate": 4.569148673353427e-05, + "loss": 5.8001, + "step": 1744 + }, + { + "epoch": 0.09, + "grad_norm": 1.2990843057632446, + "learning_rate": 4.568160482237265e-05, + "loss": 5.9017, + "step": 1748 + }, + { + "epoch": 0.09, + "grad_norm": 1.4160529375076294, + "learning_rate": 4.5671722911211026e-05, + "loss": 5.7955, + "step": 1752 + }, + { + "epoch": 0.09, + "grad_norm": 1.5701916217803955, + "learning_rate": 4.566184100004941e-05, + "loss": 5.9576, + "step": 1756 + }, + { + "epoch": 0.09, + "grad_norm": 1.5187253952026367, + "learning_rate": 4.565195908888779e-05, + "loss": 5.9054, + "step": 1760 + }, + { + "epoch": 0.09, + "grad_norm": 1.6530662775039673, + "learning_rate": 4.564207717772617e-05, + "loss": 5.9385, + "step": 1764 + }, + { + "epoch": 0.09, + "grad_norm": 1.481063723564148, + "learning_rate": 4.5632195266564555e-05, + "loss": 5.8584, + "step": 1768 + }, + { + "epoch": 0.09, + "grad_norm": 1.2179416418075562, + "learning_rate": 4.562231335540294e-05, + "loss": 5.9652, + "step": 1772 + }, + { + "epoch": 0.09, + "grad_norm": 1.4156241416931152, + "learning_rate": 4.561243144424132e-05, + "loss": 5.7772, + "step": 1776 + }, + { + "epoch": 0.09, + "grad_norm": 1.5597620010375977, + "learning_rate": 4.56025495330797e-05, + "loss": 5.9441, + "step": 1780 + }, + { + "epoch": 0.09, + "grad_norm": 1.4729033708572388, + "learning_rate": 4.559266762191808e-05, + "loss": 5.9495, + "step": 1784 + }, + { + "epoch": 0.09, + "grad_norm": 1.4252592325210571, + "learning_rate": 4.5582785710756465e-05, + "loss": 5.8379, + "step": 1788 + }, + { + "epoch": 0.09, + "grad_norm": 1.3946348428726196, + "learning_rate": 4.557290379959485e-05, + "loss": 5.9435, + "step": 1792 + }, + { + "epoch": 0.09, + "grad_norm": 1.366830587387085, + "learning_rate": 4.556302188843323e-05, + "loss": 5.9755, + "step": 1796 + }, + { + "epoch": 0.09, + "grad_norm": 1.2371442317962646, + "learning_rate": 4.555313997727161e-05, + "loss": 5.9288, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 1.3472713232040405, + "learning_rate": 4.5543258066109986e-05, + "loss": 5.9537, + "step": 1804 + }, + { + "epoch": 0.09, + "grad_norm": 1.9578423500061035, + "learning_rate": 4.553337615494837e-05, + "loss": 6.0531, + "step": 1808 + }, + { + "epoch": 0.09, + "grad_norm": 1.4482135772705078, + "learning_rate": 4.552349424378675e-05, + "loss": 5.8764, + "step": 1812 + }, + { + "epoch": 0.09, + "grad_norm": 1.4886963367462158, + "learning_rate": 4.551361233262513e-05, + "loss": 6.0265, + "step": 1816 + }, + { + "epoch": 0.09, + "grad_norm": 1.89201819896698, + "learning_rate": 4.5503730421463515e-05, + "loss": 5.8338, + "step": 1820 + }, + { + "epoch": 0.09, + "grad_norm": 1.207160234451294, + "learning_rate": 4.54938485103019e-05, + "loss": 5.8131, + "step": 1824 + }, + { + "epoch": 0.09, + "grad_norm": 1.4633376598358154, + "learning_rate": 4.548396659914028e-05, + "loss": 5.8665, + "step": 1828 + }, + { + "epoch": 0.09, + "grad_norm": 1.4453964233398438, + "learning_rate": 4.547408468797866e-05, + "loss": 5.963, + "step": 1832 + }, + { + "epoch": 0.09, + "grad_norm": 2.0406370162963867, + "learning_rate": 4.5464202776817036e-05, + "loss": 5.8779, + "step": 1836 + }, + { + "epoch": 0.09, + "grad_norm": 1.6090114116668701, + "learning_rate": 4.545432086565542e-05, + "loss": 5.906, + "step": 1840 + }, + { + "epoch": 0.09, + "grad_norm": 1.6481086015701294, + "learning_rate": 4.54444389544938e-05, + "loss": 6.0581, + "step": 1844 + }, + { + "epoch": 0.09, + "grad_norm": 1.5279346704483032, + "learning_rate": 4.543455704333218e-05, + "loss": 5.937, + "step": 1848 + }, + { + "epoch": 0.09, + "grad_norm": 1.580449104309082, + "learning_rate": 4.5424675132170564e-05, + "loss": 5.8628, + "step": 1852 + }, + { + "epoch": 0.09, + "grad_norm": 1.5227487087249756, + "learning_rate": 4.5414793221008946e-05, + "loss": 5.8597, + "step": 1856 + }, + { + "epoch": 0.09, + "grad_norm": 1.3211569786071777, + "learning_rate": 4.540491130984733e-05, + "loss": 5.9294, + "step": 1860 + }, + { + "epoch": 0.09, + "grad_norm": 1.5065453052520752, + "learning_rate": 4.5395029398685704e-05, + "loss": 5.9311, + "step": 1864 + }, + { + "epoch": 0.09, + "grad_norm": 1.6894686222076416, + "learning_rate": 4.5385147487524086e-05, + "loss": 5.9489, + "step": 1868 + }, + { + "epoch": 0.09, + "grad_norm": 1.4861186742782593, + "learning_rate": 4.537526557636247e-05, + "loss": 5.9022, + "step": 1872 + }, + { + "epoch": 0.09, + "grad_norm": 1.7374814748764038, + "learning_rate": 4.536538366520085e-05, + "loss": 5.9511, + "step": 1876 + }, + { + "epoch": 0.09, + "grad_norm": 1.7111232280731201, + "learning_rate": 4.535550175403923e-05, + "loss": 5.9008, + "step": 1880 + }, + { + "epoch": 0.09, + "grad_norm": 1.887349009513855, + "learning_rate": 4.5345619842877614e-05, + "loss": 5.7785, + "step": 1884 + }, + { + "epoch": 0.09, + "grad_norm": 1.397018313407898, + "learning_rate": 4.5335737931715996e-05, + "loss": 5.8256, + "step": 1888 + }, + { + "epoch": 0.09, + "grad_norm": 1.312874674797058, + "learning_rate": 4.532585602055438e-05, + "loss": 5.7768, + "step": 1892 + }, + { + "epoch": 0.09, + "grad_norm": 1.6667051315307617, + "learning_rate": 4.531597410939276e-05, + "loss": 5.9521, + "step": 1896 + }, + { + "epoch": 0.09, + "grad_norm": 1.3993628025054932, + "learning_rate": 4.530609219823114e-05, + "loss": 5.8285, + "step": 1900 + }, + { + "epoch": 0.09, + "grad_norm": 1.5168507099151611, + "learning_rate": 4.5296210287069524e-05, + "loss": 5.9211, + "step": 1904 + }, + { + "epoch": 0.09, + "grad_norm": 1.9311803579330444, + "learning_rate": 4.5286328375907906e-05, + "loss": 6.015, + "step": 1908 + }, + { + "epoch": 0.09, + "grad_norm": 1.33588707447052, + "learning_rate": 4.527644646474629e-05, + "loss": 5.8528, + "step": 1912 + }, + { + "epoch": 0.09, + "grad_norm": 1.831453561782837, + "learning_rate": 4.526656455358467e-05, + "loss": 5.8254, + "step": 1916 + }, + { + "epoch": 0.09, + "grad_norm": 1.7024390697479248, + "learning_rate": 4.5256682642423046e-05, + "loss": 5.8458, + "step": 1920 + }, + { + "epoch": 0.1, + "grad_norm": 1.5297564268112183, + "learning_rate": 4.524680073126143e-05, + "loss": 5.8749, + "step": 1924 + }, + { + "epoch": 0.1, + "grad_norm": 1.4288631677627563, + "learning_rate": 4.523691882009981e-05, + "loss": 5.8288, + "step": 1928 + }, + { + "epoch": 0.1, + "grad_norm": 1.723061203956604, + "learning_rate": 4.522703690893819e-05, + "loss": 5.8773, + "step": 1932 + }, + { + "epoch": 0.1, + "grad_norm": 1.3699047565460205, + "learning_rate": 4.5217154997776574e-05, + "loss": 5.94, + "step": 1936 + }, + { + "epoch": 0.1, + "grad_norm": 1.883584976196289, + "learning_rate": 4.5207273086614956e-05, + "loss": 5.8437, + "step": 1940 + }, + { + "epoch": 0.1, + "grad_norm": 1.6371957063674927, + "learning_rate": 4.519739117545334e-05, + "loss": 5.9447, + "step": 1944 + }, + { + "epoch": 0.1, + "grad_norm": 1.4362295866012573, + "learning_rate": 4.5187509264291713e-05, + "loss": 6.0218, + "step": 1948 + }, + { + "epoch": 0.1, + "grad_norm": 1.9110528230667114, + "learning_rate": 4.5177627353130095e-05, + "loss": 5.9644, + "step": 1952 + }, + { + "epoch": 0.1, + "grad_norm": 2.587083101272583, + "learning_rate": 4.516774544196848e-05, + "loss": 5.9193, + "step": 1956 + }, + { + "epoch": 0.1, + "grad_norm": 1.5505852699279785, + "learning_rate": 4.515786353080686e-05, + "loss": 5.8284, + "step": 1960 + }, + { + "epoch": 0.1, + "grad_norm": 1.5493978261947632, + "learning_rate": 4.514798161964524e-05, + "loss": 6.0078, + "step": 1964 + }, + { + "epoch": 0.1, + "grad_norm": 1.5519258975982666, + "learning_rate": 4.5138099708483624e-05, + "loss": 5.8343, + "step": 1968 + }, + { + "epoch": 0.1, + "grad_norm": 1.4279431104660034, + "learning_rate": 4.5128217797322006e-05, + "loss": 5.9317, + "step": 1972 + }, + { + "epoch": 0.1, + "grad_norm": 1.8912272453308105, + "learning_rate": 4.511833588616038e-05, + "loss": 5.8355, + "step": 1976 + }, + { + "epoch": 0.1, + "grad_norm": 1.440687656402588, + "learning_rate": 4.510845397499876e-05, + "loss": 5.9027, + "step": 1980 + }, + { + "epoch": 0.1, + "grad_norm": 1.8350549936294556, + "learning_rate": 4.5098572063837145e-05, + "loss": 5.9534, + "step": 1984 + }, + { + "epoch": 0.1, + "grad_norm": 1.4987351894378662, + "learning_rate": 4.508869015267553e-05, + "loss": 6.0618, + "step": 1988 + }, + { + "epoch": 0.1, + "grad_norm": 1.4456908702850342, + "learning_rate": 4.507880824151391e-05, + "loss": 5.9174, + "step": 1992 + }, + { + "epoch": 0.1, + "grad_norm": 1.5561717748641968, + "learning_rate": 4.506892633035229e-05, + "loss": 5.8602, + "step": 1996 + }, + { + "epoch": 0.1, + "grad_norm": 1.976826548576355, + "learning_rate": 4.5059044419190673e-05, + "loss": 5.97, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 1.561604380607605, + "learning_rate": 4.5049162508029055e-05, + "loss": 5.9561, + "step": 2004 + }, + { + "epoch": 0.1, + "grad_norm": 2.241724967956543, + "learning_rate": 4.503928059686743e-05, + "loss": 5.9314, + "step": 2008 + }, + { + "epoch": 0.1, + "grad_norm": 2.012200355529785, + "learning_rate": 4.502939868570582e-05, + "loss": 5.8255, + "step": 2012 + }, + { + "epoch": 0.1, + "grad_norm": 1.6281402111053467, + "learning_rate": 4.50195167745442e-05, + "loss": 5.8459, + "step": 2016 + }, + { + "epoch": 0.1, + "grad_norm": 1.9645634889602661, + "learning_rate": 4.5009634863382584e-05, + "loss": 5.805, + "step": 2020 + }, + { + "epoch": 0.1, + "grad_norm": 1.7598261833190918, + "learning_rate": 4.4999752952220966e-05, + "loss": 5.7636, + "step": 2024 + }, + { + "epoch": 0.1, + "grad_norm": 1.8579994440078735, + "learning_rate": 4.498987104105935e-05, + "loss": 5.9409, + "step": 2028 + }, + { + "epoch": 0.1, + "grad_norm": 1.3588138818740845, + "learning_rate": 4.497998912989772e-05, + "loss": 5.8235, + "step": 2032 + }, + { + "epoch": 0.1, + "grad_norm": 1.471695065498352, + "learning_rate": 4.4970107218736105e-05, + "loss": 5.9183, + "step": 2036 + }, + { + "epoch": 0.1, + "grad_norm": 1.500862717628479, + "learning_rate": 4.496022530757449e-05, + "loss": 5.8246, + "step": 2040 + }, + { + "epoch": 0.1, + "grad_norm": 1.3388968706130981, + "learning_rate": 4.495034339641287e-05, + "loss": 5.9709, + "step": 2044 + }, + { + "epoch": 0.1, + "grad_norm": 1.5488966703414917, + "learning_rate": 4.494046148525125e-05, + "loss": 5.8914, + "step": 2048 + }, + { + "epoch": 0.1, + "grad_norm": 1.5115787982940674, + "learning_rate": 4.4930579574089633e-05, + "loss": 5.8664, + "step": 2052 + }, + { + "epoch": 0.1, + "grad_norm": 1.632888674736023, + "learning_rate": 4.4920697662928015e-05, + "loss": 5.9276, + "step": 2056 + }, + { + "epoch": 0.1, + "grad_norm": 1.3317750692367554, + "learning_rate": 4.491081575176639e-05, + "loss": 5.8649, + "step": 2060 + }, + { + "epoch": 0.1, + "grad_norm": 1.7422010898590088, + "learning_rate": 4.490093384060477e-05, + "loss": 5.8982, + "step": 2064 + }, + { + "epoch": 0.1, + "grad_norm": 2.052384614944458, + "learning_rate": 4.4891051929443155e-05, + "loss": 5.9476, + "step": 2068 + }, + { + "epoch": 0.1, + "grad_norm": 2.492372989654541, + "learning_rate": 4.488117001828154e-05, + "loss": 5.8919, + "step": 2072 + }, + { + "epoch": 0.1, + "grad_norm": 1.3981868028640747, + "learning_rate": 4.487128810711992e-05, + "loss": 5.9484, + "step": 2076 + }, + { + "epoch": 0.1, + "grad_norm": 1.7076584100723267, + "learning_rate": 4.48614061959583e-05, + "loss": 5.7792, + "step": 2080 + }, + { + "epoch": 0.1, + "grad_norm": 1.4366710186004639, + "learning_rate": 4.485152428479668e-05, + "loss": 5.8862, + "step": 2084 + }, + { + "epoch": 0.1, + "grad_norm": 1.2299433946609497, + "learning_rate": 4.4841642373635065e-05, + "loss": 5.7836, + "step": 2088 + }, + { + "epoch": 0.1, + "grad_norm": 1.4261624813079834, + "learning_rate": 4.483176046247344e-05, + "loss": 5.9149, + "step": 2092 + }, + { + "epoch": 0.1, + "grad_norm": 1.5158166885375977, + "learning_rate": 4.482187855131182e-05, + "loss": 5.8517, + "step": 2096 + }, + { + "epoch": 0.1, + "grad_norm": 1.3522121906280518, + "learning_rate": 4.4811996640150205e-05, + "loss": 5.8754, + "step": 2100 + }, + { + "epoch": 0.1, + "grad_norm": 1.343929648399353, + "learning_rate": 4.480211472898859e-05, + "loss": 5.771, + "step": 2104 + }, + { + "epoch": 0.1, + "grad_norm": 1.6901476383209229, + "learning_rate": 4.479223281782697e-05, + "loss": 5.8675, + "step": 2108 + }, + { + "epoch": 0.1, + "grad_norm": 1.438178539276123, + "learning_rate": 4.478235090666535e-05, + "loss": 5.9775, + "step": 2112 + }, + { + "epoch": 0.1, + "grad_norm": 2.2448184490203857, + "learning_rate": 4.477246899550373e-05, + "loss": 5.861, + "step": 2116 + }, + { + "epoch": 0.1, + "grad_norm": 1.3768398761749268, + "learning_rate": 4.476258708434211e-05, + "loss": 5.8436, + "step": 2120 + }, + { + "epoch": 0.1, + "grad_norm": 1.5526635646820068, + "learning_rate": 4.475270517318049e-05, + "loss": 5.8113, + "step": 2124 + }, + { + "epoch": 0.11, + "grad_norm": 2.0470657348632812, + "learning_rate": 4.474282326201888e-05, + "loss": 5.8941, + "step": 2128 + }, + { + "epoch": 0.11, + "grad_norm": 1.404012680053711, + "learning_rate": 4.473294135085726e-05, + "loss": 5.9352, + "step": 2132 + }, + { + "epoch": 0.11, + "grad_norm": 1.3800268173217773, + "learning_rate": 4.472305943969564e-05, + "loss": 5.9212, + "step": 2136 + }, + { + "epoch": 0.11, + "grad_norm": 1.8839443922042847, + "learning_rate": 4.4713177528534025e-05, + "loss": 5.8908, + "step": 2140 + }, + { + "epoch": 0.11, + "grad_norm": 1.8185203075408936, + "learning_rate": 4.47032956173724e-05, + "loss": 5.7962, + "step": 2144 + }, + { + "epoch": 0.11, + "grad_norm": 1.6154285669326782, + "learning_rate": 4.469341370621078e-05, + "loss": 5.8918, + "step": 2148 + }, + { + "epoch": 0.11, + "grad_norm": 2.3525383472442627, + "learning_rate": 4.4683531795049165e-05, + "loss": 5.8528, + "step": 2152 + }, + { + "epoch": 0.11, + "grad_norm": 1.7467882633209229, + "learning_rate": 4.467364988388755e-05, + "loss": 5.8305, + "step": 2156 + }, + { + "epoch": 0.11, + "grad_norm": 1.7372554540634155, + "learning_rate": 4.466376797272593e-05, + "loss": 5.7785, + "step": 2160 + }, + { + "epoch": 0.11, + "grad_norm": 1.7468005418777466, + "learning_rate": 4.465388606156431e-05, + "loss": 6.0067, + "step": 2164 + }, + { + "epoch": 0.11, + "grad_norm": 1.874570608139038, + "learning_rate": 4.464400415040269e-05, + "loss": 5.9846, + "step": 2168 + }, + { + "epoch": 0.11, + "grad_norm": 1.5136998891830444, + "learning_rate": 4.4634122239241075e-05, + "loss": 5.8196, + "step": 2172 + }, + { + "epoch": 0.11, + "grad_norm": 1.5951347351074219, + "learning_rate": 4.462424032807945e-05, + "loss": 5.8991, + "step": 2176 + }, + { + "epoch": 0.11, + "grad_norm": 1.9109349250793457, + "learning_rate": 4.461435841691783e-05, + "loss": 5.8357, + "step": 2180 + }, + { + "epoch": 0.11, + "grad_norm": 1.5938990116119385, + "learning_rate": 4.4604476505756214e-05, + "loss": 5.7961, + "step": 2184 + }, + { + "epoch": 0.11, + "grad_norm": 1.457306981086731, + "learning_rate": 4.4594594594594596e-05, + "loss": 5.8715, + "step": 2188 + }, + { + "epoch": 0.11, + "grad_norm": 1.478857398033142, + "learning_rate": 4.458471268343298e-05, + "loss": 5.9143, + "step": 2192 + }, + { + "epoch": 0.11, + "grad_norm": 1.413000464439392, + "learning_rate": 4.457483077227136e-05, + "loss": 5.8109, + "step": 2196 + }, + { + "epoch": 0.11, + "grad_norm": 1.348897099494934, + "learning_rate": 4.456494886110974e-05, + "loss": 5.8774, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 1.4481480121612549, + "learning_rate": 4.455506694994812e-05, + "loss": 5.7836, + "step": 2204 + }, + { + "epoch": 0.11, + "grad_norm": 2.077380895614624, + "learning_rate": 4.45451850387865e-05, + "loss": 5.8215, + "step": 2208 + }, + { + "epoch": 0.11, + "grad_norm": 1.5154703855514526, + "learning_rate": 4.453530312762488e-05, + "loss": 5.8485, + "step": 2212 + }, + { + "epoch": 0.11, + "grad_norm": 1.3549476861953735, + "learning_rate": 4.4525421216463264e-05, + "loss": 5.9692, + "step": 2216 + }, + { + "epoch": 0.11, + "grad_norm": 1.3713443279266357, + "learning_rate": 4.4515539305301646e-05, + "loss": 5.8609, + "step": 2220 + }, + { + "epoch": 0.11, + "grad_norm": 1.3348569869995117, + "learning_rate": 4.450565739414003e-05, + "loss": 5.831, + "step": 2224 + }, + { + "epoch": 0.11, + "grad_norm": 1.4860191345214844, + "learning_rate": 4.449577548297841e-05, + "loss": 6.0256, + "step": 2228 + }, + { + "epoch": 0.11, + "grad_norm": 1.3161065578460693, + "learning_rate": 4.448589357181679e-05, + "loss": 5.9999, + "step": 2232 + }, + { + "epoch": 0.11, + "grad_norm": 1.8355900049209595, + "learning_rate": 4.447601166065517e-05, + "loss": 5.8118, + "step": 2236 + }, + { + "epoch": 0.11, + "grad_norm": 1.4929054975509644, + "learning_rate": 4.4466129749493556e-05, + "loss": 5.9227, + "step": 2240 + }, + { + "epoch": 0.11, + "grad_norm": 1.6608703136444092, + "learning_rate": 4.445624783833194e-05, + "loss": 5.8468, + "step": 2244 + }, + { + "epoch": 0.11, + "grad_norm": 1.5917619466781616, + "learning_rate": 4.444636592717032e-05, + "loss": 5.9577, + "step": 2248 + }, + { + "epoch": 0.11, + "grad_norm": 1.5820611715316772, + "learning_rate": 4.44364840160087e-05, + "loss": 5.8065, + "step": 2252 + }, + { + "epoch": 0.11, + "grad_norm": 1.6974551677703857, + "learning_rate": 4.4426602104847085e-05, + "loss": 5.963, + "step": 2256 + }, + { + "epoch": 0.11, + "grad_norm": 1.5884884595870972, + "learning_rate": 4.441672019368546e-05, + "loss": 5.7997, + "step": 2260 + }, + { + "epoch": 0.11, + "grad_norm": 1.5706701278686523, + "learning_rate": 4.440683828252384e-05, + "loss": 5.8355, + "step": 2264 + }, + { + "epoch": 0.11, + "grad_norm": 1.5051394701004028, + "learning_rate": 4.4396956371362224e-05, + "loss": 5.9357, + "step": 2268 + }, + { + "epoch": 0.11, + "grad_norm": 1.6415704488754272, + "learning_rate": 4.4387074460200606e-05, + "loss": 5.7755, + "step": 2272 + }, + { + "epoch": 0.11, + "grad_norm": 1.9928330183029175, + "learning_rate": 4.437719254903899e-05, + "loss": 6.0354, + "step": 2276 + }, + { + "epoch": 0.11, + "grad_norm": 1.568490982055664, + "learning_rate": 4.436731063787737e-05, + "loss": 5.9754, + "step": 2280 + }, + { + "epoch": 0.11, + "grad_norm": 1.4449853897094727, + "learning_rate": 4.435742872671575e-05, + "loss": 5.939, + "step": 2284 + }, + { + "epoch": 0.11, + "grad_norm": 2.227518081665039, + "learning_rate": 4.434754681555413e-05, + "loss": 5.8591, + "step": 2288 + }, + { + "epoch": 0.11, + "grad_norm": 1.7351478338241577, + "learning_rate": 4.433766490439251e-05, + "loss": 5.8574, + "step": 2292 + }, + { + "epoch": 0.11, + "grad_norm": 2.8627262115478516, + "learning_rate": 4.432778299323089e-05, + "loss": 5.9735, + "step": 2296 + }, + { + "epoch": 0.11, + "grad_norm": 1.56145179271698, + "learning_rate": 4.4317901082069274e-05, + "loss": 5.9248, + "step": 2300 + }, + { + "epoch": 0.11, + "grad_norm": 1.5302497148513794, + "learning_rate": 4.4308019170907656e-05, + "loss": 5.8347, + "step": 2304 + }, + { + "epoch": 0.11, + "grad_norm": 1.597585678100586, + "learning_rate": 4.429813725974604e-05, + "loss": 5.7803, + "step": 2308 + }, + { + "epoch": 0.11, + "grad_norm": 1.529463768005371, + "learning_rate": 4.428825534858442e-05, + "loss": 5.8968, + "step": 2312 + }, + { + "epoch": 0.11, + "grad_norm": 1.4933751821517944, + "learning_rate": 4.42783734374228e-05, + "loss": 5.9021, + "step": 2316 + }, + { + "epoch": 0.11, + "grad_norm": 1.6076233386993408, + "learning_rate": 4.426849152626118e-05, + "loss": 5.8318, + "step": 2320 + }, + { + "epoch": 0.11, + "grad_norm": 1.7479904890060425, + "learning_rate": 4.425860961509956e-05, + "loss": 5.8686, + "step": 2324 + }, + { + "epoch": 0.12, + "grad_norm": 1.5784941911697388, + "learning_rate": 4.424872770393794e-05, + "loss": 5.8552, + "step": 2328 + }, + { + "epoch": 0.12, + "grad_norm": 2.12491512298584, + "learning_rate": 4.4238845792776323e-05, + "loss": 5.9495, + "step": 2332 + }, + { + "epoch": 0.12, + "grad_norm": 1.6172250509262085, + "learning_rate": 4.4228963881614705e-05, + "loss": 5.7925, + "step": 2336 + }, + { + "epoch": 0.12, + "grad_norm": 1.4207360744476318, + "learning_rate": 4.421908197045309e-05, + "loss": 5.8818, + "step": 2340 + }, + { + "epoch": 0.12, + "grad_norm": 2.528517246246338, + "learning_rate": 4.420920005929147e-05, + "loss": 5.848, + "step": 2344 + }, + { + "epoch": 0.12, + "grad_norm": 1.4071799516677856, + "learning_rate": 4.4199318148129845e-05, + "loss": 5.7346, + "step": 2348 + }, + { + "epoch": 0.12, + "grad_norm": 1.543596863746643, + "learning_rate": 4.418943623696823e-05, + "loss": 5.87, + "step": 2352 + }, + { + "epoch": 0.12, + "grad_norm": 1.697926640510559, + "learning_rate": 4.4179554325806616e-05, + "loss": 5.8042, + "step": 2356 + }, + { + "epoch": 0.12, + "grad_norm": 1.8649811744689941, + "learning_rate": 4.4169672414645e-05, + "loss": 5.8247, + "step": 2360 + }, + { + "epoch": 0.12, + "grad_norm": 1.496875286102295, + "learning_rate": 4.415979050348338e-05, + "loss": 5.9044, + "step": 2364 + }, + { + "epoch": 0.12, + "grad_norm": 1.469252586364746, + "learning_rate": 4.414990859232176e-05, + "loss": 5.8298, + "step": 2368 + }, + { + "epoch": 0.12, + "grad_norm": 1.4524880647659302, + "learning_rate": 4.414002668116014e-05, + "loss": 5.8173, + "step": 2372 + }, + { + "epoch": 0.12, + "grad_norm": 1.2573550939559937, + "learning_rate": 4.413014476999852e-05, + "loss": 5.8825, + "step": 2376 + }, + { + "epoch": 0.12, + "grad_norm": 1.3610568046569824, + "learning_rate": 4.41202628588369e-05, + "loss": 5.9205, + "step": 2380 + }, + { + "epoch": 0.12, + "grad_norm": 1.3914682865142822, + "learning_rate": 4.4110380947675283e-05, + "loss": 5.8465, + "step": 2384 + }, + { + "epoch": 0.12, + "grad_norm": 1.628902792930603, + "learning_rate": 4.4100499036513665e-05, + "loss": 5.9335, + "step": 2388 + }, + { + "epoch": 0.12, + "grad_norm": 2.1272637844085693, + "learning_rate": 4.409061712535205e-05, + "loss": 5.8199, + "step": 2392 + }, + { + "epoch": 0.12, + "grad_norm": 1.497086524963379, + "learning_rate": 4.408073521419043e-05, + "loss": 5.7442, + "step": 2396 + }, + { + "epoch": 0.12, + "grad_norm": 1.6062419414520264, + "learning_rate": 4.407085330302881e-05, + "loss": 5.7353, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 1.6324589252471924, + "learning_rate": 4.406097139186719e-05, + "loss": 5.8851, + "step": 2404 + }, + { + "epoch": 0.12, + "grad_norm": 1.7234266996383667, + "learning_rate": 4.405108948070557e-05, + "loss": 5.9936, + "step": 2408 + }, + { + "epoch": 0.12, + "grad_norm": 1.7293999195098877, + "learning_rate": 4.404120756954395e-05, + "loss": 5.8189, + "step": 2412 + }, + { + "epoch": 0.12, + "grad_norm": 1.554434895515442, + "learning_rate": 4.403132565838233e-05, + "loss": 5.8536, + "step": 2416 + }, + { + "epoch": 0.12, + "grad_norm": 1.773876428604126, + "learning_rate": 4.4021443747220715e-05, + "loss": 5.808, + "step": 2420 + }, + { + "epoch": 0.12, + "grad_norm": 1.341245174407959, + "learning_rate": 4.40115618360591e-05, + "loss": 5.785, + "step": 2424 + }, + { + "epoch": 0.12, + "grad_norm": 1.4888144731521606, + "learning_rate": 4.400167992489748e-05, + "loss": 5.7738, + "step": 2428 + }, + { + "epoch": 0.12, + "grad_norm": 1.269622564315796, + "learning_rate": 4.3991798013735855e-05, + "loss": 5.7985, + "step": 2432 + }, + { + "epoch": 0.12, + "grad_norm": 2.2829160690307617, + "learning_rate": 4.398191610257424e-05, + "loss": 5.9229, + "step": 2436 + }, + { + "epoch": 0.12, + "grad_norm": 1.5251401662826538, + "learning_rate": 4.397203419141262e-05, + "loss": 5.967, + "step": 2440 + }, + { + "epoch": 0.12, + "grad_norm": 1.8288617134094238, + "learning_rate": 4.3962152280251e-05, + "loss": 5.8184, + "step": 2444 + }, + { + "epoch": 0.12, + "grad_norm": 1.5593011379241943, + "learning_rate": 4.395227036908938e-05, + "loss": 5.9089, + "step": 2448 + }, + { + "epoch": 0.12, + "grad_norm": 1.4598174095153809, + "learning_rate": 4.3942388457927765e-05, + "loss": 5.8489, + "step": 2452 + }, + { + "epoch": 0.12, + "grad_norm": 1.364039421081543, + "learning_rate": 4.393250654676615e-05, + "loss": 5.9755, + "step": 2456 + }, + { + "epoch": 0.12, + "grad_norm": 1.418669581413269, + "learning_rate": 4.392262463560453e-05, + "loss": 5.8468, + "step": 2460 + }, + { + "epoch": 0.12, + "grad_norm": 1.7515896558761597, + "learning_rate": 4.3912742724442904e-05, + "loss": 6.0253, + "step": 2464 + }, + { + "epoch": 0.12, + "grad_norm": 1.6947784423828125, + "learning_rate": 4.3902860813281286e-05, + "loss": 5.8654, + "step": 2468 + }, + { + "epoch": 0.12, + "grad_norm": 1.597745656967163, + "learning_rate": 4.3892978902119675e-05, + "loss": 5.78, + "step": 2472 + }, + { + "epoch": 0.12, + "grad_norm": 1.9116005897521973, + "learning_rate": 4.388309699095806e-05, + "loss": 5.871, + "step": 2476 + }, + { + "epoch": 0.12, + "grad_norm": 1.5638071298599243, + "learning_rate": 4.387321507979644e-05, + "loss": 5.8569, + "step": 2480 + }, + { + "epoch": 0.12, + "grad_norm": 2.321582794189453, + "learning_rate": 4.386333316863482e-05, + "loss": 5.7426, + "step": 2484 + }, + { + "epoch": 0.12, + "grad_norm": 1.4646013975143433, + "learning_rate": 4.38534512574732e-05, + "loss": 5.8682, + "step": 2488 + }, + { + "epoch": 0.12, + "grad_norm": 1.777219533920288, + "learning_rate": 4.384356934631158e-05, + "loss": 5.778, + "step": 2492 + }, + { + "epoch": 0.12, + "grad_norm": 1.7113094329833984, + "learning_rate": 4.383368743514996e-05, + "loss": 5.8171, + "step": 2496 + }, + { + "epoch": 0.12, + "grad_norm": 1.6728547811508179, + "learning_rate": 4.382380552398834e-05, + "loss": 5.8537, + "step": 2500 + }, + { + "epoch": 0.12, + "grad_norm": 1.5846984386444092, + "learning_rate": 4.3813923612826725e-05, + "loss": 5.9563, + "step": 2504 + }, + { + "epoch": 0.12, + "grad_norm": 2.277194023132324, + "learning_rate": 4.380404170166511e-05, + "loss": 5.9298, + "step": 2508 + }, + { + "epoch": 0.12, + "grad_norm": 2.1459007263183594, + "learning_rate": 4.379415979050349e-05, + "loss": 5.9069, + "step": 2512 + }, + { + "epoch": 0.12, + "grad_norm": 1.9131797552108765, + "learning_rate": 4.3784277879341864e-05, + "loss": 5.8698, + "step": 2516 + }, + { + "epoch": 0.12, + "grad_norm": 1.5482133626937866, + "learning_rate": 4.3774395968180246e-05, + "loss": 5.8516, + "step": 2520 + }, + { + "epoch": 0.12, + "grad_norm": 1.3043674230575562, + "learning_rate": 4.376451405701863e-05, + "loss": 5.8176, + "step": 2524 + }, + { + "epoch": 0.12, + "grad_norm": 1.6391806602478027, + "learning_rate": 4.375463214585701e-05, + "loss": 5.7712, + "step": 2528 + }, + { + "epoch": 0.13, + "grad_norm": 1.7865711450576782, + "learning_rate": 4.374475023469539e-05, + "loss": 5.9933, + "step": 2532 + }, + { + "epoch": 0.13, + "grad_norm": 1.6309571266174316, + "learning_rate": 4.3734868323533775e-05, + "loss": 5.7993, + "step": 2536 + }, + { + "epoch": 0.13, + "grad_norm": 1.9978580474853516, + "learning_rate": 4.372498641237216e-05, + "loss": 5.8303, + "step": 2540 + }, + { + "epoch": 0.13, + "grad_norm": 1.507794976234436, + "learning_rate": 4.371510450121053e-05, + "loss": 5.7701, + "step": 2544 + }, + { + "epoch": 0.13, + "grad_norm": 1.4123347997665405, + "learning_rate": 4.3705222590048914e-05, + "loss": 5.9114, + "step": 2548 + }, + { + "epoch": 0.13, + "grad_norm": 1.467590570449829, + "learning_rate": 4.3695340678887296e-05, + "loss": 5.893, + "step": 2552 + }, + { + "epoch": 0.13, + "grad_norm": 1.4081734418869019, + "learning_rate": 4.368545876772568e-05, + "loss": 5.8997, + "step": 2556 + }, + { + "epoch": 0.13, + "grad_norm": 1.4744434356689453, + "learning_rate": 4.367557685656406e-05, + "loss": 5.7802, + "step": 2560 + }, + { + "epoch": 0.13, + "grad_norm": 1.405860185623169, + "learning_rate": 4.366569494540244e-05, + "loss": 5.9679, + "step": 2564 + }, + { + "epoch": 0.13, + "grad_norm": 1.8207783699035645, + "learning_rate": 4.3655813034240824e-05, + "loss": 5.9814, + "step": 2568 + }, + { + "epoch": 0.13, + "grad_norm": 1.5685409307479858, + "learning_rate": 4.3645931123079206e-05, + "loss": 5.9399, + "step": 2572 + }, + { + "epoch": 0.13, + "grad_norm": 1.218668818473816, + "learning_rate": 4.363604921191758e-05, + "loss": 5.7926, + "step": 2576 + }, + { + "epoch": 0.13, + "grad_norm": 1.3519160747528076, + "learning_rate": 4.3626167300755964e-05, + "loss": 5.7725, + "step": 2580 + }, + { + "epoch": 0.13, + "grad_norm": 2.0880026817321777, + "learning_rate": 4.361628538959435e-05, + "loss": 5.9561, + "step": 2584 + }, + { + "epoch": 0.13, + "grad_norm": 1.6406432390213013, + "learning_rate": 4.3606403478432735e-05, + "loss": 5.8689, + "step": 2588 + }, + { + "epoch": 0.13, + "grad_norm": 1.4885075092315674, + "learning_rate": 4.359652156727112e-05, + "loss": 5.7307, + "step": 2592 + }, + { + "epoch": 0.13, + "grad_norm": 1.5429643392562866, + "learning_rate": 4.35866396561095e-05, + "loss": 5.8006, + "step": 2596 + }, + { + "epoch": 0.13, + "grad_norm": 1.7911728620529175, + "learning_rate": 4.3576757744947874e-05, + "loss": 5.8635, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 2.0220367908477783, + "learning_rate": 4.3566875833786256e-05, + "loss": 5.8923, + "step": 2604 + }, + { + "epoch": 0.13, + "grad_norm": 1.5731260776519775, + "learning_rate": 4.355699392262464e-05, + "loss": 5.9229, + "step": 2608 + }, + { + "epoch": 0.13, + "grad_norm": 2.0656206607818604, + "learning_rate": 4.354711201146302e-05, + "loss": 5.9055, + "step": 2612 + }, + { + "epoch": 0.13, + "grad_norm": 1.5131778717041016, + "learning_rate": 4.35372301003014e-05, + "loss": 5.8296, + "step": 2616 + }, + { + "epoch": 0.13, + "grad_norm": 1.953755497932434, + "learning_rate": 4.3527348189139784e-05, + "loss": 5.7472, + "step": 2620 + }, + { + "epoch": 0.13, + "grad_norm": 1.496201515197754, + "learning_rate": 4.3517466277978166e-05, + "loss": 5.8703, + "step": 2624 + }, + { + "epoch": 0.13, + "grad_norm": 1.6766773462295532, + "learning_rate": 4.350758436681654e-05, + "loss": 5.8507, + "step": 2628 + }, + { + "epoch": 0.13, + "grad_norm": 1.5544074773788452, + "learning_rate": 4.3497702455654924e-05, + "loss": 5.7533, + "step": 2632 + }, + { + "epoch": 0.13, + "grad_norm": 2.169536828994751, + "learning_rate": 4.3487820544493306e-05, + "loss": 5.9255, + "step": 2636 + }, + { + "epoch": 0.13, + "grad_norm": 1.78980553150177, + "learning_rate": 4.347793863333169e-05, + "loss": 5.8978, + "step": 2640 + }, + { + "epoch": 0.13, + "grad_norm": 1.4349106550216675, + "learning_rate": 4.346805672217007e-05, + "loss": 5.7736, + "step": 2644 + }, + { + "epoch": 0.13, + "grad_norm": 1.5959937572479248, + "learning_rate": 4.345817481100845e-05, + "loss": 5.8393, + "step": 2648 + }, + { + "epoch": 0.13, + "grad_norm": 1.2932758331298828, + "learning_rate": 4.3448292899846834e-05, + "loss": 6.0105, + "step": 2652 + }, + { + "epoch": 0.13, + "grad_norm": 1.8422064781188965, + "learning_rate": 4.3438410988685216e-05, + "loss": 5.7793, + "step": 2656 + }, + { + "epoch": 0.13, + "grad_norm": 1.948499083518982, + "learning_rate": 4.342852907752359e-05, + "loss": 5.9639, + "step": 2660 + }, + { + "epoch": 0.13, + "grad_norm": 1.4169591665267944, + "learning_rate": 4.3418647166361973e-05, + "loss": 5.8024, + "step": 2664 + }, + { + "epoch": 0.13, + "grad_norm": 2.4500057697296143, + "learning_rate": 4.3408765255200356e-05, + "loss": 5.8555, + "step": 2668 + }, + { + "epoch": 0.13, + "grad_norm": 1.5837527513504028, + "learning_rate": 4.339888334403874e-05, + "loss": 5.9194, + "step": 2672 + }, + { + "epoch": 0.13, + "grad_norm": 1.8204997777938843, + "learning_rate": 4.338900143287712e-05, + "loss": 5.9046, + "step": 2676 + }, + { + "epoch": 0.13, + "grad_norm": 1.7072649002075195, + "learning_rate": 4.33791195217155e-05, + "loss": 5.7805, + "step": 2680 + }, + { + "epoch": 0.13, + "grad_norm": 2.0868546962738037, + "learning_rate": 4.3369237610553884e-05, + "loss": 5.8899, + "step": 2684 + }, + { + "epoch": 0.13, + "grad_norm": 1.489852786064148, + "learning_rate": 4.335935569939226e-05, + "loss": 5.8553, + "step": 2688 + }, + { + "epoch": 0.13, + "grad_norm": 1.7851141691207886, + "learning_rate": 4.334947378823064e-05, + "loss": 5.7927, + "step": 2692 + }, + { + "epoch": 0.13, + "grad_norm": 1.4655989408493042, + "learning_rate": 4.333959187706902e-05, + "loss": 5.9079, + "step": 2696 + }, + { + "epoch": 0.13, + "grad_norm": 1.4320762157440186, + "learning_rate": 4.332970996590741e-05, + "loss": 5.9267, + "step": 2700 + }, + { + "epoch": 0.13, + "grad_norm": 1.6071585416793823, + "learning_rate": 4.3319828054745794e-05, + "loss": 5.7468, + "step": 2704 + }, + { + "epoch": 0.13, + "grad_norm": 1.3550992012023926, + "learning_rate": 4.3309946143584176e-05, + "loss": 5.8188, + "step": 2708 + }, + { + "epoch": 0.13, + "grad_norm": 2.1684658527374268, + "learning_rate": 4.330006423242255e-05, + "loss": 5.8959, + "step": 2712 + }, + { + "epoch": 0.13, + "grad_norm": 1.543056607246399, + "learning_rate": 4.3290182321260933e-05, + "loss": 5.9079, + "step": 2716 + }, + { + "epoch": 0.13, + "grad_norm": 1.4535815715789795, + "learning_rate": 4.3280300410099316e-05, + "loss": 5.814, + "step": 2720 + }, + { + "epoch": 0.13, + "grad_norm": 1.6012686491012573, + "learning_rate": 4.32704184989377e-05, + "loss": 5.9343, + "step": 2724 + }, + { + "epoch": 0.13, + "grad_norm": 1.63141667842865, + "learning_rate": 4.326053658777608e-05, + "loss": 5.744, + "step": 2728 + }, + { + "epoch": 0.13, + "grad_norm": 1.621268630027771, + "learning_rate": 4.325065467661446e-05, + "loss": 5.6779, + "step": 2732 + }, + { + "epoch": 0.14, + "grad_norm": 1.809212565422058, + "learning_rate": 4.3240772765452844e-05, + "loss": 5.8384, + "step": 2736 + }, + { + "epoch": 0.14, + "grad_norm": 1.3587805032730103, + "learning_rate": 4.3230890854291226e-05, + "loss": 5.7724, + "step": 2740 + }, + { + "epoch": 0.14, + "grad_norm": 1.7285997867584229, + "learning_rate": 4.32210089431296e-05, + "loss": 5.8804, + "step": 2744 + }, + { + "epoch": 0.14, + "grad_norm": 1.9753797054290771, + "learning_rate": 4.321112703196798e-05, + "loss": 5.8618, + "step": 2748 + }, + { + "epoch": 0.14, + "grad_norm": 1.357673168182373, + "learning_rate": 4.3201245120806365e-05, + "loss": 5.6777, + "step": 2752 + }, + { + "epoch": 0.14, + "grad_norm": 1.5860601663589478, + "learning_rate": 4.319136320964475e-05, + "loss": 5.7696, + "step": 2756 + }, + { + "epoch": 0.14, + "grad_norm": 1.830790638923645, + "learning_rate": 4.318148129848313e-05, + "loss": 5.7376, + "step": 2760 + }, + { + "epoch": 0.14, + "grad_norm": 1.4875571727752686, + "learning_rate": 4.317159938732151e-05, + "loss": 5.8386, + "step": 2764 + }, + { + "epoch": 0.14, + "grad_norm": 1.5012223720550537, + "learning_rate": 4.3161717476159893e-05, + "loss": 5.8406, + "step": 2768 + }, + { + "epoch": 0.14, + "grad_norm": 1.872456669807434, + "learning_rate": 4.315183556499827e-05, + "loss": 5.7977, + "step": 2772 + }, + { + "epoch": 0.14, + "grad_norm": 1.4684159755706787, + "learning_rate": 4.314195365383665e-05, + "loss": 5.6846, + "step": 2776 + }, + { + "epoch": 0.14, + "grad_norm": 1.8385950326919556, + "learning_rate": 4.313207174267503e-05, + "loss": 5.9036, + "step": 2780 + }, + { + "epoch": 0.14, + "grad_norm": 1.6362018585205078, + "learning_rate": 4.3122189831513415e-05, + "loss": 5.8381, + "step": 2784 + }, + { + "epoch": 0.14, + "grad_norm": 1.5058547258377075, + "learning_rate": 4.31123079203518e-05, + "loss": 5.8647, + "step": 2788 + }, + { + "epoch": 0.14, + "grad_norm": 2.060148239135742, + "learning_rate": 4.310242600919018e-05, + "loss": 5.7431, + "step": 2792 + }, + { + "epoch": 0.14, + "grad_norm": 1.9664667844772339, + "learning_rate": 4.309254409802856e-05, + "loss": 5.8879, + "step": 2796 + }, + { + "epoch": 0.14, + "grad_norm": 1.9889500141143799, + "learning_rate": 4.308266218686694e-05, + "loss": 5.8715, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 1.3932100534439087, + "learning_rate": 4.307278027570532e-05, + "loss": 5.6858, + "step": 2804 + }, + { + "epoch": 0.14, + "grad_norm": 1.6131174564361572, + "learning_rate": 4.30628983645437e-05, + "loss": 5.9675, + "step": 2808 + }, + { + "epoch": 0.14, + "grad_norm": 1.9435076713562012, + "learning_rate": 4.305301645338208e-05, + "loss": 5.8918, + "step": 2812 + }, + { + "epoch": 0.14, + "grad_norm": 1.5480365753173828, + "learning_rate": 4.304313454222047e-05, + "loss": 5.7797, + "step": 2816 + }, + { + "epoch": 0.14, + "grad_norm": 1.5039936304092407, + "learning_rate": 4.3033252631058853e-05, + "loss": 5.7075, + "step": 2820 + }, + { + "epoch": 0.14, + "grad_norm": 1.4041931629180908, + "learning_rate": 4.3023370719897236e-05, + "loss": 5.6871, + "step": 2824 + }, + { + "epoch": 0.14, + "grad_norm": 1.9822477102279663, + "learning_rate": 4.301348880873561e-05, + "loss": 5.8225, + "step": 2828 + }, + { + "epoch": 0.14, + "grad_norm": 1.6121152639389038, + "learning_rate": 4.300360689757399e-05, + "loss": 5.7735, + "step": 2832 + }, + { + "epoch": 0.14, + "grad_norm": 1.6461482048034668, + "learning_rate": 4.2993724986412375e-05, + "loss": 5.8867, + "step": 2836 + }, + { + "epoch": 0.14, + "grad_norm": 1.8447121381759644, + "learning_rate": 4.298384307525076e-05, + "loss": 5.7835, + "step": 2840 + }, + { + "epoch": 0.14, + "grad_norm": 1.4918212890625, + "learning_rate": 4.297396116408914e-05, + "loss": 5.6962, + "step": 2844 + }, + { + "epoch": 0.14, + "grad_norm": 1.7599409818649292, + "learning_rate": 4.296407925292752e-05, + "loss": 5.8473, + "step": 2848 + }, + { + "epoch": 0.14, + "grad_norm": 1.9731839895248413, + "learning_rate": 4.29541973417659e-05, + "loss": 5.904, + "step": 2852 + }, + { + "epoch": 0.14, + "grad_norm": 1.9410591125488281, + "learning_rate": 4.294431543060428e-05, + "loss": 5.8863, + "step": 2856 + }, + { + "epoch": 0.14, + "grad_norm": 1.942104458808899, + "learning_rate": 4.293443351944266e-05, + "loss": 5.7006, + "step": 2860 + }, + { + "epoch": 0.14, + "grad_norm": 1.6088027954101562, + "learning_rate": 4.292455160828104e-05, + "loss": 5.7491, + "step": 2864 + }, + { + "epoch": 0.14, + "grad_norm": 1.5795434713363647, + "learning_rate": 4.2914669697119425e-05, + "loss": 5.8635, + "step": 2868 + }, + { + "epoch": 0.14, + "grad_norm": 1.7501459121704102, + "learning_rate": 4.290478778595781e-05, + "loss": 5.8032, + "step": 2872 + }, + { + "epoch": 0.14, + "grad_norm": 1.9903556108474731, + "learning_rate": 4.289490587479619e-05, + "loss": 5.885, + "step": 2876 + }, + { + "epoch": 0.14, + "grad_norm": 1.539027452468872, + "learning_rate": 4.288502396363457e-05, + "loss": 5.6591, + "step": 2880 + }, + { + "epoch": 0.14, + "grad_norm": 1.6801403760910034, + "learning_rate": 4.287514205247295e-05, + "loss": 5.8648, + "step": 2884 + }, + { + "epoch": 0.14, + "grad_norm": 1.8741987943649292, + "learning_rate": 4.286526014131133e-05, + "loss": 5.9236, + "step": 2888 + }, + { + "epoch": 0.14, + "grad_norm": 1.261087417602539, + "learning_rate": 4.285537823014971e-05, + "loss": 5.7465, + "step": 2892 + }, + { + "epoch": 0.14, + "grad_norm": 1.9104219675064087, + "learning_rate": 4.284549631898809e-05, + "loss": 5.7858, + "step": 2896 + }, + { + "epoch": 0.14, + "grad_norm": 1.8742128610610962, + "learning_rate": 4.2835614407826474e-05, + "loss": 5.8395, + "step": 2900 + }, + { + "epoch": 0.14, + "grad_norm": 1.9993197917938232, + "learning_rate": 4.2825732496664856e-05, + "loss": 5.7205, + "step": 2904 + }, + { + "epoch": 0.14, + "grad_norm": 1.668923258781433, + "learning_rate": 4.281585058550324e-05, + "loss": 5.8264, + "step": 2908 + }, + { + "epoch": 0.14, + "grad_norm": 2.509131669998169, + "learning_rate": 4.280596867434162e-05, + "loss": 5.8287, + "step": 2912 + }, + { + "epoch": 0.14, + "grad_norm": 1.6229509115219116, + "learning_rate": 4.2796086763179996e-05, + "loss": 5.8989, + "step": 2916 + }, + { + "epoch": 0.14, + "grad_norm": 1.9508914947509766, + "learning_rate": 4.278620485201838e-05, + "loss": 5.7111, + "step": 2920 + }, + { + "epoch": 0.14, + "grad_norm": 1.3820146322250366, + "learning_rate": 4.277632294085676e-05, + "loss": 5.8116, + "step": 2924 + }, + { + "epoch": 0.14, + "grad_norm": 1.623271107673645, + "learning_rate": 4.276644102969514e-05, + "loss": 5.7792, + "step": 2928 + }, + { + "epoch": 0.14, + "grad_norm": 1.7664297819137573, + "learning_rate": 4.275655911853353e-05, + "loss": 5.8097, + "step": 2932 + }, + { + "epoch": 0.15, + "grad_norm": 1.8661329746246338, + "learning_rate": 4.274667720737191e-05, + "loss": 5.8191, + "step": 2936 + }, + { + "epoch": 0.15, + "grad_norm": 1.6165112257003784, + "learning_rate": 4.273679529621029e-05, + "loss": 5.8802, + "step": 2940 + }, + { + "epoch": 0.15, + "grad_norm": 1.5878552198410034, + "learning_rate": 4.272691338504867e-05, + "loss": 5.8965, + "step": 2944 + }, + { + "epoch": 0.15, + "grad_norm": 3.6278650760650635, + "learning_rate": 4.271703147388705e-05, + "loss": 5.6925, + "step": 2948 + }, + { + "epoch": 0.15, + "grad_norm": 1.6811779737472534, + "learning_rate": 4.2707149562725434e-05, + "loss": 5.7483, + "step": 2952 + }, + { + "epoch": 0.15, + "grad_norm": 1.4222928285598755, + "learning_rate": 4.2697267651563816e-05, + "loss": 5.7322, + "step": 2956 + }, + { + "epoch": 0.15, + "grad_norm": 1.6275368928909302, + "learning_rate": 4.26873857404022e-05, + "loss": 5.803, + "step": 2960 + }, + { + "epoch": 0.15, + "grad_norm": 1.4676285982131958, + "learning_rate": 4.267750382924058e-05, + "loss": 5.8874, + "step": 2964 + }, + { + "epoch": 0.15, + "grad_norm": 1.3239818811416626, + "learning_rate": 4.266762191807896e-05, + "loss": 5.8061, + "step": 2968 + }, + { + "epoch": 0.15, + "grad_norm": 1.5904514789581299, + "learning_rate": 4.265774000691734e-05, + "loss": 5.8226, + "step": 2972 + }, + { + "epoch": 0.15, + "grad_norm": 1.6302812099456787, + "learning_rate": 4.264785809575572e-05, + "loss": 5.8038, + "step": 2976 + }, + { + "epoch": 0.15, + "grad_norm": 1.495665192604065, + "learning_rate": 4.26379761845941e-05, + "loss": 5.8282, + "step": 2980 + }, + { + "epoch": 0.15, + "grad_norm": 1.776206612586975, + "learning_rate": 4.2628094273432484e-05, + "loss": 5.8956, + "step": 2984 + }, + { + "epoch": 0.15, + "grad_norm": 2.3062102794647217, + "learning_rate": 4.2618212362270866e-05, + "loss": 5.7843, + "step": 2988 + }, + { + "epoch": 0.15, + "grad_norm": 1.649765968322754, + "learning_rate": 4.260833045110925e-05, + "loss": 5.7521, + "step": 2992 + }, + { + "epoch": 0.15, + "grad_norm": 1.4618674516677856, + "learning_rate": 4.259844853994763e-05, + "loss": 5.8289, + "step": 2996 + }, + { + "epoch": 0.15, + "grad_norm": 1.6106091737747192, + "learning_rate": 4.2588566628786006e-05, + "loss": 5.8524, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 1.6770355701446533, + "learning_rate": 4.257868471762439e-05, + "loss": 5.8803, + "step": 3004 + }, + { + "epoch": 0.15, + "grad_norm": 1.5975035429000854, + "learning_rate": 4.256880280646277e-05, + "loss": 5.8038, + "step": 3008 + }, + { + "epoch": 0.15, + "grad_norm": 1.5957070589065552, + "learning_rate": 4.255892089530115e-05, + "loss": 5.7577, + "step": 3012 + }, + { + "epoch": 0.15, + "grad_norm": 1.676400065422058, + "learning_rate": 4.2549038984139534e-05, + "loss": 5.7606, + "step": 3016 + }, + { + "epoch": 0.15, + "grad_norm": 1.4680049419403076, + "learning_rate": 4.2539157072977916e-05, + "loss": 5.8903, + "step": 3020 + }, + { + "epoch": 0.15, + "grad_norm": 1.5760340690612793, + "learning_rate": 4.25292751618163e-05, + "loss": 5.7209, + "step": 3024 + }, + { + "epoch": 0.15, + "grad_norm": 1.6681965589523315, + "learning_rate": 4.251939325065468e-05, + "loss": 5.7767, + "step": 3028 + }, + { + "epoch": 0.15, + "grad_norm": 1.4011784791946411, + "learning_rate": 4.2509511339493055e-05, + "loss": 5.7752, + "step": 3032 + }, + { + "epoch": 0.15, + "grad_norm": 1.6614855527877808, + "learning_rate": 4.249962942833144e-05, + "loss": 5.8396, + "step": 3036 + }, + { + "epoch": 0.15, + "grad_norm": 1.5414800643920898, + "learning_rate": 4.248974751716982e-05, + "loss": 5.8238, + "step": 3040 + }, + { + "epoch": 0.15, + "grad_norm": 1.9518810510635376, + "learning_rate": 4.247986560600821e-05, + "loss": 5.9185, + "step": 3044 + }, + { + "epoch": 0.15, + "grad_norm": 1.8648451566696167, + "learning_rate": 4.246998369484659e-05, + "loss": 5.9562, + "step": 3048 + }, + { + "epoch": 0.15, + "grad_norm": 2.052186965942383, + "learning_rate": 4.246010178368497e-05, + "loss": 5.7948, + "step": 3052 + }, + { + "epoch": 0.15, + "grad_norm": 1.8737437725067139, + "learning_rate": 4.245021987252335e-05, + "loss": 5.7977, + "step": 3056 + }, + { + "epoch": 0.15, + "grad_norm": 1.7005298137664795, + "learning_rate": 4.244033796136173e-05, + "loss": 5.7203, + "step": 3060 + }, + { + "epoch": 0.15, + "grad_norm": 1.91958487033844, + "learning_rate": 4.243045605020011e-05, + "loss": 5.798, + "step": 3064 + }, + { + "epoch": 0.15, + "grad_norm": 1.6794556379318237, + "learning_rate": 4.2420574139038494e-05, + "loss": 5.8465, + "step": 3068 + }, + { + "epoch": 0.15, + "grad_norm": 1.8262078762054443, + "learning_rate": 4.2410692227876876e-05, + "loss": 5.7919, + "step": 3072 + }, + { + "epoch": 0.15, + "grad_norm": 1.3758339881896973, + "learning_rate": 4.240081031671526e-05, + "loss": 5.8541, + "step": 3076 + }, + { + "epoch": 0.15, + "grad_norm": 1.6782643795013428, + "learning_rate": 4.239092840555364e-05, + "loss": 5.7396, + "step": 3080 + }, + { + "epoch": 0.15, + "grad_norm": 1.5164580345153809, + "learning_rate": 4.2381046494392015e-05, + "loss": 5.8555, + "step": 3084 + }, + { + "epoch": 0.15, + "grad_norm": 1.664754033088684, + "learning_rate": 4.23711645832304e-05, + "loss": 5.7878, + "step": 3088 + }, + { + "epoch": 0.15, + "grad_norm": 1.615749716758728, + "learning_rate": 4.236128267206878e-05, + "loss": 5.8452, + "step": 3092 + }, + { + "epoch": 0.15, + "grad_norm": 1.699905514717102, + "learning_rate": 4.235140076090716e-05, + "loss": 5.6857, + "step": 3096 + }, + { + "epoch": 0.15, + "grad_norm": 1.5537385940551758, + "learning_rate": 4.2341518849745543e-05, + "loss": 5.7966, + "step": 3100 + }, + { + "epoch": 0.15, + "grad_norm": 1.6133285760879517, + "learning_rate": 4.2331636938583926e-05, + "loss": 5.7937, + "step": 3104 + }, + { + "epoch": 0.15, + "grad_norm": 1.6025242805480957, + "learning_rate": 4.232175502742231e-05, + "loss": 5.847, + "step": 3108 + }, + { + "epoch": 0.15, + "grad_norm": 2.157740831375122, + "learning_rate": 4.231187311626069e-05, + "loss": 5.8554, + "step": 3112 + }, + { + "epoch": 0.15, + "grad_norm": 1.3504477739334106, + "learning_rate": 4.2301991205099065e-05, + "loss": 5.864, + "step": 3116 + }, + { + "epoch": 0.15, + "grad_norm": 1.835391879081726, + "learning_rate": 4.229210929393745e-05, + "loss": 5.8793, + "step": 3120 + }, + { + "epoch": 0.15, + "grad_norm": 1.7732737064361572, + "learning_rate": 4.228222738277583e-05, + "loss": 5.8299, + "step": 3124 + }, + { + "epoch": 0.15, + "grad_norm": 1.3115538358688354, + "learning_rate": 4.227234547161421e-05, + "loss": 5.6581, + "step": 3128 + }, + { + "epoch": 0.15, + "grad_norm": 1.5173206329345703, + "learning_rate": 4.226246356045259e-05, + "loss": 5.6791, + "step": 3132 + }, + { + "epoch": 0.15, + "grad_norm": 1.850712537765503, + "learning_rate": 4.2252581649290975e-05, + "loss": 5.7433, + "step": 3136 + }, + { + "epoch": 0.16, + "grad_norm": 1.6634035110473633, + "learning_rate": 4.224269973812936e-05, + "loss": 5.8331, + "step": 3140 + }, + { + "epoch": 0.16, + "grad_norm": 1.8786728382110596, + "learning_rate": 4.223281782696773e-05, + "loss": 5.6907, + "step": 3144 + }, + { + "epoch": 0.16, + "grad_norm": 1.4625319242477417, + "learning_rate": 4.2222935915806115e-05, + "loss": 5.9035, + "step": 3148 + }, + { + "epoch": 0.16, + "grad_norm": 1.6950637102127075, + "learning_rate": 4.22130540046445e-05, + "loss": 5.7991, + "step": 3152 + }, + { + "epoch": 0.16, + "grad_norm": 1.7324408292770386, + "learning_rate": 4.220317209348288e-05, + "loss": 5.7367, + "step": 3156 + }, + { + "epoch": 0.16, + "grad_norm": 2.2330594062805176, + "learning_rate": 4.219329018232127e-05, + "loss": 5.8253, + "step": 3160 + }, + { + "epoch": 0.16, + "grad_norm": 1.658172607421875, + "learning_rate": 4.218340827115965e-05, + "loss": 5.8604, + "step": 3164 + }, + { + "epoch": 0.16, + "grad_norm": 1.7080429792404175, + "learning_rate": 4.2173526359998025e-05, + "loss": 5.8495, + "step": 3168 + }, + { + "epoch": 0.16, + "grad_norm": 1.8790498971939087, + "learning_rate": 4.216364444883641e-05, + "loss": 5.7018, + "step": 3172 + }, + { + "epoch": 0.16, + "grad_norm": 2.222698211669922, + "learning_rate": 4.215376253767479e-05, + "loss": 5.841, + "step": 3176 + }, + { + "epoch": 0.16, + "grad_norm": 1.7230134010314941, + "learning_rate": 4.214388062651317e-05, + "loss": 5.805, + "step": 3180 + }, + { + "epoch": 0.16, + "grad_norm": 2.234403133392334, + "learning_rate": 4.213399871535155e-05, + "loss": 5.9041, + "step": 3184 + }, + { + "epoch": 0.16, + "grad_norm": 2.1015374660491943, + "learning_rate": 4.2124116804189935e-05, + "loss": 5.9052, + "step": 3188 + }, + { + "epoch": 0.16, + "grad_norm": 1.6418051719665527, + "learning_rate": 4.211423489302832e-05, + "loss": 5.7498, + "step": 3192 + }, + { + "epoch": 0.16, + "grad_norm": 1.524634599685669, + "learning_rate": 4.210435298186669e-05, + "loss": 5.8123, + "step": 3196 + }, + { + "epoch": 0.16, + "grad_norm": 1.4214069843292236, + "learning_rate": 4.2094471070705075e-05, + "loss": 5.7962, + "step": 3200 + }, + { + "epoch": 0.16, + "grad_norm": 2.6622087955474854, + "learning_rate": 4.208458915954346e-05, + "loss": 5.8462, + "step": 3204 + }, + { + "epoch": 0.16, + "grad_norm": 1.3304983377456665, + "learning_rate": 4.207470724838184e-05, + "loss": 5.8222, + "step": 3208 + }, + { + "epoch": 0.16, + "grad_norm": 1.9604005813598633, + "learning_rate": 4.206482533722022e-05, + "loss": 5.8678, + "step": 3212 + }, + { + "epoch": 0.16, + "grad_norm": 1.4757952690124512, + "learning_rate": 4.20549434260586e-05, + "loss": 5.8205, + "step": 3216 + }, + { + "epoch": 0.16, + "grad_norm": 1.6784671545028687, + "learning_rate": 4.2045061514896985e-05, + "loss": 5.8077, + "step": 3220 + }, + { + "epoch": 0.16, + "grad_norm": 2.231874465942383, + "learning_rate": 4.203517960373537e-05, + "loss": 5.7779, + "step": 3224 + }, + { + "epoch": 0.16, + "grad_norm": 2.200303316116333, + "learning_rate": 4.202529769257374e-05, + "loss": 5.8089, + "step": 3228 + }, + { + "epoch": 0.16, + "grad_norm": 1.9842939376831055, + "learning_rate": 4.2015415781412124e-05, + "loss": 5.819, + "step": 3232 + }, + { + "epoch": 0.16, + "grad_norm": 1.8117098808288574, + "learning_rate": 4.2005533870250506e-05, + "loss": 5.9165, + "step": 3236 + }, + { + "epoch": 0.16, + "grad_norm": 1.559996247291565, + "learning_rate": 4.199565195908889e-05, + "loss": 5.7312, + "step": 3240 + }, + { + "epoch": 0.16, + "grad_norm": 1.4291075468063354, + "learning_rate": 4.198577004792727e-05, + "loss": 5.8351, + "step": 3244 + }, + { + "epoch": 0.16, + "grad_norm": 2.1594057083129883, + "learning_rate": 4.197588813676565e-05, + "loss": 5.8667, + "step": 3248 + }, + { + "epoch": 0.16, + "grad_norm": 1.4217649698257446, + "learning_rate": 4.1966006225604035e-05, + "loss": 5.7143, + "step": 3252 + }, + { + "epoch": 0.16, + "grad_norm": 2.619981288909912, + "learning_rate": 4.195612431444241e-05, + "loss": 5.8356, + "step": 3256 + }, + { + "epoch": 0.16, + "grad_norm": 2.002779960632324, + "learning_rate": 4.194624240328079e-05, + "loss": 5.7442, + "step": 3260 + }, + { + "epoch": 0.16, + "grad_norm": 1.639796257019043, + "learning_rate": 4.1936360492119174e-05, + "loss": 5.8934, + "step": 3264 + }, + { + "epoch": 0.16, + "grad_norm": 1.6575497388839722, + "learning_rate": 4.1926478580957556e-05, + "loss": 5.7446, + "step": 3268 + }, + { + "epoch": 0.16, + "grad_norm": 1.411887526512146, + "learning_rate": 4.191659666979594e-05, + "loss": 5.803, + "step": 3272 + }, + { + "epoch": 0.16, + "grad_norm": 1.6861897706985474, + "learning_rate": 4.190671475863433e-05, + "loss": 5.8827, + "step": 3276 + }, + { + "epoch": 0.16, + "grad_norm": 1.5666462182998657, + "learning_rate": 4.18968328474727e-05, + "loss": 5.6994, + "step": 3280 + }, + { + "epoch": 0.16, + "grad_norm": 2.06365704536438, + "learning_rate": 4.1886950936311084e-05, + "loss": 5.7583, + "step": 3284 + }, + { + "epoch": 0.16, + "grad_norm": 1.5888551473617554, + "learning_rate": 4.1877069025149466e-05, + "loss": 5.778, + "step": 3288 + }, + { + "epoch": 0.16, + "grad_norm": 1.8238952159881592, + "learning_rate": 4.186718711398785e-05, + "loss": 5.8059, + "step": 3292 + }, + { + "epoch": 0.16, + "grad_norm": 1.9061448574066162, + "learning_rate": 4.185730520282623e-05, + "loss": 5.7522, + "step": 3296 + }, + { + "epoch": 0.16, + "grad_norm": 2.1226706504821777, + "learning_rate": 4.184742329166461e-05, + "loss": 5.7591, + "step": 3300 + }, + { + "epoch": 0.16, + "grad_norm": 1.4875596761703491, + "learning_rate": 4.1837541380502995e-05, + "loss": 5.8178, + "step": 3304 + }, + { + "epoch": 0.16, + "grad_norm": 1.5928765535354614, + "learning_rate": 4.182765946934138e-05, + "loss": 5.7657, + "step": 3308 + }, + { + "epoch": 0.16, + "grad_norm": 1.4694312810897827, + "learning_rate": 4.181777755817975e-05, + "loss": 5.8106, + "step": 3312 + }, + { + "epoch": 0.16, + "grad_norm": 1.9643967151641846, + "learning_rate": 4.1807895647018134e-05, + "loss": 5.8297, + "step": 3316 + }, + { + "epoch": 0.16, + "grad_norm": 1.6413182020187378, + "learning_rate": 4.1798013735856516e-05, + "loss": 5.8, + "step": 3320 + }, + { + "epoch": 0.16, + "grad_norm": 1.6926419734954834, + "learning_rate": 4.17881318246949e-05, + "loss": 5.7488, + "step": 3324 + }, + { + "epoch": 0.16, + "grad_norm": 2.3814029693603516, + "learning_rate": 4.177824991353328e-05, + "loss": 5.8541, + "step": 3328 + }, + { + "epoch": 0.16, + "grad_norm": 1.9140582084655762, + "learning_rate": 4.176836800237166e-05, + "loss": 5.7962, + "step": 3332 + }, + { + "epoch": 0.16, + "grad_norm": 1.598402976989746, + "learning_rate": 4.1758486091210044e-05, + "loss": 5.6331, + "step": 3336 + }, + { + "epoch": 0.17, + "grad_norm": 1.7428086996078491, + "learning_rate": 4.174860418004842e-05, + "loss": 5.8097, + "step": 3340 + }, + { + "epoch": 0.17, + "grad_norm": 1.6181442737579346, + "learning_rate": 4.17387222688868e-05, + "loss": 5.7108, + "step": 3344 + }, + { + "epoch": 0.17, + "grad_norm": 1.6270121335983276, + "learning_rate": 4.1728840357725184e-05, + "loss": 5.7934, + "step": 3348 + }, + { + "epoch": 0.17, + "grad_norm": 1.677556037902832, + "learning_rate": 4.1718958446563566e-05, + "loss": 5.8193, + "step": 3352 + }, + { + "epoch": 0.17, + "grad_norm": 1.8996258974075317, + "learning_rate": 4.170907653540195e-05, + "loss": 5.8761, + "step": 3356 + }, + { + "epoch": 0.17, + "grad_norm": 1.5995745658874512, + "learning_rate": 4.169919462424033e-05, + "loss": 5.8213, + "step": 3360 + }, + { + "epoch": 0.17, + "grad_norm": 1.6188803911209106, + "learning_rate": 4.168931271307871e-05, + "loss": 5.9193, + "step": 3364 + }, + { + "epoch": 0.17, + "grad_norm": 2.0405361652374268, + "learning_rate": 4.1679430801917094e-05, + "loss": 5.76, + "step": 3368 + }, + { + "epoch": 0.17, + "grad_norm": 1.8527336120605469, + "learning_rate": 4.166954889075547e-05, + "loss": 5.7301, + "step": 3372 + }, + { + "epoch": 0.17, + "grad_norm": 1.9228570461273193, + "learning_rate": 4.165966697959385e-05, + "loss": 5.6206, + "step": 3376 + }, + { + "epoch": 0.17, + "grad_norm": 1.8511258363723755, + "learning_rate": 4.1649785068432233e-05, + "loss": 5.6747, + "step": 3380 + }, + { + "epoch": 0.17, + "grad_norm": 2.077301263809204, + "learning_rate": 4.1639903157270616e-05, + "loss": 5.7165, + "step": 3384 + }, + { + "epoch": 0.17, + "grad_norm": 1.5718106031417847, + "learning_rate": 4.1630021246109004e-05, + "loss": 5.8808, + "step": 3388 + }, + { + "epoch": 0.17, + "grad_norm": 1.6767138242721558, + "learning_rate": 4.1620139334947386e-05, + "loss": 5.6554, + "step": 3392 + }, + { + "epoch": 0.17, + "grad_norm": 1.6757532358169556, + "learning_rate": 4.161025742378576e-05, + "loss": 5.8836, + "step": 3396 + }, + { + "epoch": 0.17, + "grad_norm": 1.7318501472473145, + "learning_rate": 4.1600375512624144e-05, + "loss": 5.7989, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 1.6216315031051636, + "learning_rate": 4.1590493601462526e-05, + "loss": 5.7528, + "step": 3404 + }, + { + "epoch": 0.17, + "grad_norm": 1.916919231414795, + "learning_rate": 4.158061169030091e-05, + "loss": 5.7544, + "step": 3408 + }, + { + "epoch": 0.17, + "grad_norm": 2.1574885845184326, + "learning_rate": 4.157072977913929e-05, + "loss": 5.7093, + "step": 3412 + }, + { + "epoch": 0.17, + "grad_norm": 1.754546880722046, + "learning_rate": 4.156084786797767e-05, + "loss": 5.6852, + "step": 3416 + }, + { + "epoch": 0.17, + "grad_norm": 1.5894404649734497, + "learning_rate": 4.1550965956816054e-05, + "loss": 5.8409, + "step": 3420 + }, + { + "epoch": 0.17, + "grad_norm": 1.5836817026138306, + "learning_rate": 4.154108404565443e-05, + "loss": 5.8131, + "step": 3424 + }, + { + "epoch": 0.17, + "grad_norm": 2.2382681369781494, + "learning_rate": 4.153120213449281e-05, + "loss": 5.7432, + "step": 3428 + }, + { + "epoch": 0.17, + "grad_norm": 1.515018105506897, + "learning_rate": 4.1521320223331194e-05, + "loss": 5.8258, + "step": 3432 + }, + { + "epoch": 0.17, + "grad_norm": 1.6593300104141235, + "learning_rate": 4.1511438312169576e-05, + "loss": 5.6845, + "step": 3436 + }, + { + "epoch": 0.17, + "grad_norm": 1.5710387229919434, + "learning_rate": 4.150155640100796e-05, + "loss": 5.6832, + "step": 3440 + }, + { + "epoch": 0.17, + "grad_norm": 1.8487441539764404, + "learning_rate": 4.149167448984634e-05, + "loss": 5.9216, + "step": 3444 + }, + { + "epoch": 0.17, + "grad_norm": 1.8978421688079834, + "learning_rate": 4.148179257868472e-05, + "loss": 5.8274, + "step": 3448 + }, + { + "epoch": 0.17, + "grad_norm": 1.6334807872772217, + "learning_rate": 4.1471910667523104e-05, + "loss": 5.8029, + "step": 3452 + }, + { + "epoch": 0.17, + "grad_norm": 1.6049920320510864, + "learning_rate": 4.146202875636148e-05, + "loss": 5.898, + "step": 3456 + }, + { + "epoch": 0.17, + "grad_norm": 1.874814510345459, + "learning_rate": 4.145214684519986e-05, + "loss": 5.7016, + "step": 3460 + }, + { + "epoch": 0.17, + "grad_norm": 1.4894344806671143, + "learning_rate": 4.144226493403824e-05, + "loss": 5.8736, + "step": 3464 + }, + { + "epoch": 0.17, + "grad_norm": 1.7190723419189453, + "learning_rate": 4.1432383022876625e-05, + "loss": 5.8398, + "step": 3468 + }, + { + "epoch": 0.17, + "grad_norm": 2.1003429889678955, + "learning_rate": 4.142250111171501e-05, + "loss": 5.725, + "step": 3472 + }, + { + "epoch": 0.17, + "grad_norm": 1.4103918075561523, + "learning_rate": 4.141261920055339e-05, + "loss": 5.8784, + "step": 3476 + }, + { + "epoch": 0.17, + "grad_norm": 1.6862280368804932, + "learning_rate": 4.140273728939177e-05, + "loss": 5.7469, + "step": 3480 + }, + { + "epoch": 0.17, + "grad_norm": 1.669593095779419, + "learning_rate": 4.139285537823015e-05, + "loss": 5.7476, + "step": 3484 + }, + { + "epoch": 0.17, + "grad_norm": 1.798256754875183, + "learning_rate": 4.138297346706853e-05, + "loss": 5.729, + "step": 3488 + }, + { + "epoch": 0.17, + "grad_norm": 1.827129602432251, + "learning_rate": 4.137309155590691e-05, + "loss": 5.7887, + "step": 3492 + }, + { + "epoch": 0.17, + "grad_norm": 1.7160648107528687, + "learning_rate": 4.136320964474529e-05, + "loss": 5.7244, + "step": 3496 + }, + { + "epoch": 0.17, + "grad_norm": 2.105008363723755, + "learning_rate": 4.1353327733583675e-05, + "loss": 5.8301, + "step": 3500 + }, + { + "epoch": 0.17, + "grad_norm": 1.4461700916290283, + "learning_rate": 4.1343445822422064e-05, + "loss": 5.8252, + "step": 3504 + }, + { + "epoch": 0.17, + "grad_norm": 1.478606104850769, + "learning_rate": 4.133356391126044e-05, + "loss": 5.751, + "step": 3508 + }, + { + "epoch": 0.17, + "grad_norm": 1.9824438095092773, + "learning_rate": 4.132368200009882e-05, + "loss": 5.7331, + "step": 3512 + }, + { + "epoch": 0.17, + "grad_norm": 1.8545335531234741, + "learning_rate": 4.13138000889372e-05, + "loss": 5.8547, + "step": 3516 + }, + { + "epoch": 0.17, + "grad_norm": 1.836466908454895, + "learning_rate": 4.1303918177775585e-05, + "loss": 5.7603, + "step": 3520 + }, + { + "epoch": 0.17, + "grad_norm": 1.3290674686431885, + "learning_rate": 4.129403626661397e-05, + "loss": 5.8594, + "step": 3524 + }, + { + "epoch": 0.17, + "grad_norm": 1.4753937721252441, + "learning_rate": 4.128415435545235e-05, + "loss": 5.8027, + "step": 3528 + }, + { + "epoch": 0.17, + "grad_norm": 2.090026617050171, + "learning_rate": 4.127427244429073e-05, + "loss": 5.8225, + "step": 3532 + }, + { + "epoch": 0.17, + "grad_norm": 2.7022199630737305, + "learning_rate": 4.1264390533129114e-05, + "loss": 5.8274, + "step": 3536 + }, + { + "epoch": 0.17, + "grad_norm": 2.0701937675476074, + "learning_rate": 4.125450862196749e-05, + "loss": 5.8273, + "step": 3540 + }, + { + "epoch": 0.18, + "grad_norm": 1.5415451526641846, + "learning_rate": 4.124462671080587e-05, + "loss": 5.7365, + "step": 3544 + }, + { + "epoch": 0.18, + "grad_norm": 1.871156096458435, + "learning_rate": 4.123474479964425e-05, + "loss": 5.8028, + "step": 3548 + }, + { + "epoch": 0.18, + "grad_norm": 1.788110613822937, + "learning_rate": 4.1224862888482635e-05, + "loss": 5.7425, + "step": 3552 + }, + { + "epoch": 0.18, + "grad_norm": 1.4837939739227295, + "learning_rate": 4.121498097732102e-05, + "loss": 5.7204, + "step": 3556 + }, + { + "epoch": 0.18, + "grad_norm": 1.9719536304473877, + "learning_rate": 4.12050990661594e-05, + "loss": 5.6699, + "step": 3560 + }, + { + "epoch": 0.18, + "grad_norm": 1.865614652633667, + "learning_rate": 4.119521715499778e-05, + "loss": 5.7139, + "step": 3564 + }, + { + "epoch": 0.18, + "grad_norm": 1.9451555013656616, + "learning_rate": 4.1185335243836156e-05, + "loss": 5.7739, + "step": 3568 + }, + { + "epoch": 0.18, + "grad_norm": 2.0122992992401123, + "learning_rate": 4.117545333267454e-05, + "loss": 5.9195, + "step": 3572 + }, + { + "epoch": 0.18, + "grad_norm": 1.606345295906067, + "learning_rate": 4.116557142151292e-05, + "loss": 5.8827, + "step": 3576 + }, + { + "epoch": 0.18, + "grad_norm": 1.9131412506103516, + "learning_rate": 4.11556895103513e-05, + "loss": 5.8901, + "step": 3580 + }, + { + "epoch": 0.18, + "grad_norm": 1.780287742614746, + "learning_rate": 4.1145807599189685e-05, + "loss": 5.8709, + "step": 3584 + }, + { + "epoch": 0.18, + "grad_norm": 2.840616226196289, + "learning_rate": 4.113592568802807e-05, + "loss": 5.8292, + "step": 3588 + }, + { + "epoch": 0.18, + "grad_norm": 1.641686201095581, + "learning_rate": 4.112604377686645e-05, + "loss": 5.7915, + "step": 3592 + }, + { + "epoch": 0.18, + "grad_norm": 1.6334573030471802, + "learning_rate": 4.111616186570483e-05, + "loss": 5.8706, + "step": 3596 + }, + { + "epoch": 0.18, + "grad_norm": 2.521855592727661, + "learning_rate": 4.1106279954543206e-05, + "loss": 5.7234, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 1.7530171871185303, + "learning_rate": 4.109639804338159e-05, + "loss": 5.9393, + "step": 3604 + }, + { + "epoch": 0.18, + "grad_norm": 1.5222742557525635, + "learning_rate": 4.108651613221997e-05, + "loss": 5.7107, + "step": 3608 + }, + { + "epoch": 0.18, + "grad_norm": 1.9538359642028809, + "learning_rate": 4.107663422105835e-05, + "loss": 5.837, + "step": 3612 + }, + { + "epoch": 0.18, + "grad_norm": 1.5958921909332275, + "learning_rate": 4.1066752309896734e-05, + "loss": 5.8572, + "step": 3616 + }, + { + "epoch": 0.18, + "grad_norm": 1.4122501611709595, + "learning_rate": 4.105687039873512e-05, + "loss": 5.8037, + "step": 3620 + }, + { + "epoch": 0.18, + "grad_norm": 1.674170732498169, + "learning_rate": 4.10469884875735e-05, + "loss": 5.6393, + "step": 3624 + }, + { + "epoch": 0.18, + "grad_norm": 1.989425778388977, + "learning_rate": 4.103710657641188e-05, + "loss": 5.7025, + "step": 3628 + }, + { + "epoch": 0.18, + "grad_norm": 1.4842572212219238, + "learning_rate": 4.102722466525026e-05, + "loss": 5.7045, + "step": 3632 + }, + { + "epoch": 0.18, + "grad_norm": 1.6306571960449219, + "learning_rate": 4.1017342754088645e-05, + "loss": 5.9058, + "step": 3636 + }, + { + "epoch": 0.18, + "grad_norm": 1.7082542181015015, + "learning_rate": 4.100746084292703e-05, + "loss": 5.7987, + "step": 3640 + }, + { + "epoch": 0.18, + "grad_norm": 1.4293181896209717, + "learning_rate": 4.099757893176541e-05, + "loss": 5.7266, + "step": 3644 + }, + { + "epoch": 0.18, + "grad_norm": 1.4269033670425415, + "learning_rate": 4.098769702060379e-05, + "loss": 5.7295, + "step": 3648 + }, + { + "epoch": 0.18, + "grad_norm": 1.6524076461791992, + "learning_rate": 4.0977815109442166e-05, + "loss": 5.7705, + "step": 3652 + }, + { + "epoch": 0.18, + "grad_norm": 1.667291522026062, + "learning_rate": 4.096793319828055e-05, + "loss": 5.631, + "step": 3656 + }, + { + "epoch": 0.18, + "grad_norm": 1.8146156072616577, + "learning_rate": 4.095805128711893e-05, + "loss": 5.8973, + "step": 3660 + }, + { + "epoch": 0.18, + "grad_norm": 1.884358286857605, + "learning_rate": 4.094816937595731e-05, + "loss": 5.7693, + "step": 3664 + }, + { + "epoch": 0.18, + "grad_norm": 1.5343087911605835, + "learning_rate": 4.0938287464795694e-05, + "loss": 5.7735, + "step": 3668 + }, + { + "epoch": 0.18, + "grad_norm": 1.6283289194107056, + "learning_rate": 4.0928405553634076e-05, + "loss": 5.788, + "step": 3672 + }, + { + "epoch": 0.18, + "grad_norm": 1.5237561464309692, + "learning_rate": 4.091852364247246e-05, + "loss": 5.6661, + "step": 3676 + }, + { + "epoch": 0.18, + "grad_norm": 1.8047362565994263, + "learning_rate": 4.090864173131084e-05, + "loss": 5.6598, + "step": 3680 + }, + { + "epoch": 0.18, + "grad_norm": 1.6127629280090332, + "learning_rate": 4.0898759820149216e-05, + "loss": 5.6893, + "step": 3684 + }, + { + "epoch": 0.18, + "grad_norm": 2.2976841926574707, + "learning_rate": 4.08888779089876e-05, + "loss": 5.7673, + "step": 3688 + }, + { + "epoch": 0.18, + "grad_norm": 2.025730609893799, + "learning_rate": 4.087899599782598e-05, + "loss": 5.7116, + "step": 3692 + }, + { + "epoch": 0.18, + "grad_norm": 1.5258173942565918, + "learning_rate": 4.086911408666436e-05, + "loss": 5.8727, + "step": 3696 + }, + { + "epoch": 0.18, + "grad_norm": 1.983288049697876, + "learning_rate": 4.0859232175502744e-05, + "loss": 5.8148, + "step": 3700 + }, + { + "epoch": 0.18, + "grad_norm": 1.6084184646606445, + "learning_rate": 4.0849350264341126e-05, + "loss": 5.816, + "step": 3704 + }, + { + "epoch": 0.18, + "grad_norm": 1.739123821258545, + "learning_rate": 4.083946835317951e-05, + "loss": 5.8155, + "step": 3708 + }, + { + "epoch": 0.18, + "grad_norm": 1.8195106983184814, + "learning_rate": 4.0829586442017884e-05, + "loss": 5.7962, + "step": 3712 + }, + { + "epoch": 0.18, + "grad_norm": 1.5186692476272583, + "learning_rate": 4.0819704530856266e-05, + "loss": 5.8265, + "step": 3716 + }, + { + "epoch": 0.18, + "grad_norm": 2.1515772342681885, + "learning_rate": 4.080982261969465e-05, + "loss": 5.734, + "step": 3720 + }, + { + "epoch": 0.18, + "grad_norm": 1.9751808643341064, + "learning_rate": 4.079994070853303e-05, + "loss": 5.7605, + "step": 3724 + }, + { + "epoch": 0.18, + "grad_norm": 1.5598946809768677, + "learning_rate": 4.079005879737141e-05, + "loss": 5.6552, + "step": 3728 + }, + { + "epoch": 0.18, + "grad_norm": 1.642751693725586, + "learning_rate": 4.0780176886209794e-05, + "loss": 5.7607, + "step": 3732 + }, + { + "epoch": 0.18, + "grad_norm": 2.0432446002960205, + "learning_rate": 4.0770294975048176e-05, + "loss": 5.7103, + "step": 3736 + }, + { + "epoch": 0.18, + "grad_norm": 1.8500975370407104, + "learning_rate": 4.076041306388656e-05, + "loss": 5.7195, + "step": 3740 + }, + { + "epoch": 0.18, + "grad_norm": 1.5115128755569458, + "learning_rate": 4.075053115272494e-05, + "loss": 5.8491, + "step": 3744 + }, + { + "epoch": 0.19, + "grad_norm": 2.241079092025757, + "learning_rate": 4.074064924156332e-05, + "loss": 5.8183, + "step": 3748 + }, + { + "epoch": 0.19, + "grad_norm": 1.3165819644927979, + "learning_rate": 4.0730767330401704e-05, + "loss": 5.7614, + "step": 3752 + }, + { + "epoch": 0.19, + "grad_norm": 1.4908742904663086, + "learning_rate": 4.0720885419240086e-05, + "loss": 5.7343, + "step": 3756 + }, + { + "epoch": 0.19, + "grad_norm": 1.761338710784912, + "learning_rate": 4.071100350807847e-05, + "loss": 5.7702, + "step": 3760 + }, + { + "epoch": 0.19, + "grad_norm": 1.7084866762161255, + "learning_rate": 4.0701121596916844e-05, + "loss": 5.7966, + "step": 3764 + }, + { + "epoch": 0.19, + "grad_norm": 2.0340840816497803, + "learning_rate": 4.0691239685755226e-05, + "loss": 5.7835, + "step": 3768 + }, + { + "epoch": 0.19, + "grad_norm": 2.2564029693603516, + "learning_rate": 4.068135777459361e-05, + "loss": 5.7911, + "step": 3772 + }, + { + "epoch": 0.19, + "grad_norm": 2.2431206703186035, + "learning_rate": 4.067147586343199e-05, + "loss": 5.7461, + "step": 3776 + }, + { + "epoch": 0.19, + "grad_norm": 1.6614986658096313, + "learning_rate": 4.066159395227037e-05, + "loss": 5.8013, + "step": 3780 + }, + { + "epoch": 0.19, + "grad_norm": 1.8505353927612305, + "learning_rate": 4.0651712041108754e-05, + "loss": 5.7317, + "step": 3784 + }, + { + "epoch": 0.19, + "grad_norm": 3.03542160987854, + "learning_rate": 4.0641830129947136e-05, + "loss": 5.822, + "step": 3788 + }, + { + "epoch": 0.19, + "grad_norm": 1.5652294158935547, + "learning_rate": 4.063194821878552e-05, + "loss": 5.8001, + "step": 3792 + }, + { + "epoch": 0.19, + "grad_norm": 1.7387696504592896, + "learning_rate": 4.062206630762389e-05, + "loss": 5.7575, + "step": 3796 + }, + { + "epoch": 0.19, + "grad_norm": 1.7505065202713013, + "learning_rate": 4.0612184396462275e-05, + "loss": 5.6909, + "step": 3800 + }, + { + "epoch": 0.19, + "grad_norm": 1.7365105152130127, + "learning_rate": 4.060230248530066e-05, + "loss": 5.7282, + "step": 3804 + }, + { + "epoch": 0.19, + "grad_norm": 1.965735912322998, + "learning_rate": 4.059242057413904e-05, + "loss": 5.7692, + "step": 3808 + }, + { + "epoch": 0.19, + "grad_norm": 1.409680962562561, + "learning_rate": 4.058253866297742e-05, + "loss": 5.8235, + "step": 3812 + }, + { + "epoch": 0.19, + "grad_norm": 1.8310275077819824, + "learning_rate": 4.0572656751815804e-05, + "loss": 5.7516, + "step": 3816 + }, + { + "epoch": 0.19, + "grad_norm": 1.537810206413269, + "learning_rate": 4.0562774840654186e-05, + "loss": 5.6683, + "step": 3820 + }, + { + "epoch": 0.19, + "grad_norm": 1.7618563175201416, + "learning_rate": 4.055289292949256e-05, + "loss": 5.7643, + "step": 3824 + }, + { + "epoch": 0.19, + "grad_norm": 1.603681206703186, + "learning_rate": 4.054301101833094e-05, + "loss": 5.6578, + "step": 3828 + }, + { + "epoch": 0.19, + "grad_norm": 2.3885388374328613, + "learning_rate": 4.0533129107169325e-05, + "loss": 5.8278, + "step": 3832 + }, + { + "epoch": 0.19, + "grad_norm": 1.6048877239227295, + "learning_rate": 4.052324719600771e-05, + "loss": 5.8245, + "step": 3836 + }, + { + "epoch": 0.19, + "grad_norm": 1.443018913269043, + "learning_rate": 4.051336528484609e-05, + "loss": 5.5956, + "step": 3840 + }, + { + "epoch": 0.19, + "grad_norm": 1.6389518976211548, + "learning_rate": 4.050348337368447e-05, + "loss": 5.7129, + "step": 3844 + }, + { + "epoch": 0.19, + "grad_norm": 1.4876172542572021, + "learning_rate": 4.049360146252285e-05, + "loss": 5.7235, + "step": 3848 + }, + { + "epoch": 0.19, + "grad_norm": 1.8396450281143188, + "learning_rate": 4.0483719551361235e-05, + "loss": 5.8035, + "step": 3852 + }, + { + "epoch": 0.19, + "grad_norm": 1.4481145143508911, + "learning_rate": 4.047383764019962e-05, + "loss": 5.793, + "step": 3856 + }, + { + "epoch": 0.19, + "grad_norm": 1.8424584865570068, + "learning_rate": 4.0463955729038e-05, + "loss": 5.7926, + "step": 3860 + }, + { + "epoch": 0.19, + "grad_norm": 2.1682097911834717, + "learning_rate": 4.045407381787638e-05, + "loss": 5.7651, + "step": 3864 + }, + { + "epoch": 0.19, + "grad_norm": 1.371147871017456, + "learning_rate": 4.0444191906714764e-05, + "loss": 5.7553, + "step": 3868 + }, + { + "epoch": 0.19, + "grad_norm": 1.8529584407806396, + "learning_rate": 4.0434309995553146e-05, + "loss": 5.738, + "step": 3872 + }, + { + "epoch": 0.19, + "grad_norm": 2.445864677429199, + "learning_rate": 4.042442808439153e-05, + "loss": 5.6886, + "step": 3876 + }, + { + "epoch": 0.19, + "grad_norm": 1.6438809633255005, + "learning_rate": 4.04145461732299e-05, + "loss": 5.7889, + "step": 3880 + }, + { + "epoch": 0.19, + "grad_norm": 2.3593320846557617, + "learning_rate": 4.0404664262068285e-05, + "loss": 5.7318, + "step": 3884 + }, + { + "epoch": 0.19, + "grad_norm": 1.7897164821624756, + "learning_rate": 4.039478235090667e-05, + "loss": 5.7785, + "step": 3888 + }, + { + "epoch": 0.19, + "grad_norm": 1.6130775213241577, + "learning_rate": 4.038490043974505e-05, + "loss": 5.7774, + "step": 3892 + }, + { + "epoch": 0.19, + "grad_norm": 1.5768321752548218, + "learning_rate": 4.037501852858343e-05, + "loss": 5.8277, + "step": 3896 + }, + { + "epoch": 0.19, + "grad_norm": 1.9530898332595825, + "learning_rate": 4.036513661742181e-05, + "loss": 5.8019, + "step": 3900 + }, + { + "epoch": 0.19, + "grad_norm": 1.9102182388305664, + "learning_rate": 4.0355254706260195e-05, + "loss": 5.7975, + "step": 3904 + }, + { + "epoch": 0.19, + "grad_norm": 1.7103482484817505, + "learning_rate": 4.034537279509857e-05, + "loss": 5.7725, + "step": 3908 + }, + { + "epoch": 0.19, + "grad_norm": 1.5741090774536133, + "learning_rate": 4.033549088393695e-05, + "loss": 5.7367, + "step": 3912 + }, + { + "epoch": 0.19, + "grad_norm": 2.2126150131225586, + "learning_rate": 4.0325608972775335e-05, + "loss": 5.9088, + "step": 3916 + }, + { + "epoch": 0.19, + "grad_norm": 1.7655351161956787, + "learning_rate": 4.031572706161372e-05, + "loss": 5.847, + "step": 3920 + }, + { + "epoch": 0.19, + "grad_norm": 1.7152812480926514, + "learning_rate": 4.03058451504521e-05, + "loss": 5.7307, + "step": 3924 + }, + { + "epoch": 0.19, + "grad_norm": 1.767104983329773, + "learning_rate": 4.029596323929048e-05, + "loss": 5.8074, + "step": 3928 + }, + { + "epoch": 0.19, + "grad_norm": 1.9230767488479614, + "learning_rate": 4.028608132812886e-05, + "loss": 5.7124, + "step": 3932 + }, + { + "epoch": 0.19, + "grad_norm": 1.6367123126983643, + "learning_rate": 4.0276199416967245e-05, + "loss": 5.7474, + "step": 3936 + }, + { + "epoch": 0.19, + "grad_norm": 2.0680441856384277, + "learning_rate": 4.026631750580562e-05, + "loss": 5.8713, + "step": 3940 + }, + { + "epoch": 0.19, + "grad_norm": 1.7396044731140137, + "learning_rate": 4.0256435594644e-05, + "loss": 5.7464, + "step": 3944 + }, + { + "epoch": 0.2, + "grad_norm": 2.2416396141052246, + "learning_rate": 4.0246553683482384e-05, + "loss": 5.9287, + "step": 3948 + }, + { + "epoch": 0.2, + "grad_norm": 2.192002534866333, + "learning_rate": 4.0236671772320766e-05, + "loss": 5.8617, + "step": 3952 + }, + { + "epoch": 0.2, + "grad_norm": 2.6008896827697754, + "learning_rate": 4.022678986115915e-05, + "loss": 5.7929, + "step": 3956 + }, + { + "epoch": 0.2, + "grad_norm": 1.7184627056121826, + "learning_rate": 4.021690794999753e-05, + "loss": 5.7198, + "step": 3960 + }, + { + "epoch": 0.2, + "grad_norm": 2.4607460498809814, + "learning_rate": 4.020702603883591e-05, + "loss": 5.6012, + "step": 3964 + }, + { + "epoch": 0.2, + "grad_norm": 3.070399761199951, + "learning_rate": 4.0197144127674295e-05, + "loss": 5.6176, + "step": 3968 + }, + { + "epoch": 0.2, + "grad_norm": 1.7792775630950928, + "learning_rate": 4.018726221651268e-05, + "loss": 5.8348, + "step": 3972 + }, + { + "epoch": 0.2, + "grad_norm": 2.0791399478912354, + "learning_rate": 4.017738030535106e-05, + "loss": 5.8916, + "step": 3976 + }, + { + "epoch": 0.2, + "grad_norm": 1.7796858549118042, + "learning_rate": 4.016749839418944e-05, + "loss": 5.8424, + "step": 3980 + }, + { + "epoch": 0.2, + "grad_norm": 2.2471671104431152, + "learning_rate": 4.015761648302782e-05, + "loss": 5.832, + "step": 3984 + }, + { + "epoch": 0.2, + "grad_norm": 1.6249337196350098, + "learning_rate": 4.0147734571866205e-05, + "loss": 5.72, + "step": 3988 + }, + { + "epoch": 0.2, + "grad_norm": 1.6861313581466675, + "learning_rate": 4.013785266070458e-05, + "loss": 5.7362, + "step": 3992 + }, + { + "epoch": 0.2, + "grad_norm": 1.8561558723449707, + "learning_rate": 4.012797074954296e-05, + "loss": 5.7032, + "step": 3996 + }, + { + "epoch": 0.2, + "grad_norm": 1.6355276107788086, + "learning_rate": 4.0118088838381344e-05, + "loss": 5.7695, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 2.0652620792388916, + "learning_rate": 4.0108206927219726e-05, + "loss": 5.8284, + "step": 4004 + }, + { + "epoch": 0.2, + "grad_norm": 1.5604512691497803, + "learning_rate": 4.009832501605811e-05, + "loss": 5.8084, + "step": 4008 + }, + { + "epoch": 0.2, + "grad_norm": 1.8975857496261597, + "learning_rate": 4.008844310489649e-05, + "loss": 5.7599, + "step": 4012 + }, + { + "epoch": 0.2, + "grad_norm": 1.7917417287826538, + "learning_rate": 4.007856119373487e-05, + "loss": 5.7057, + "step": 4016 + }, + { + "epoch": 0.2, + "grad_norm": 1.6700019836425781, + "learning_rate": 4.0068679282573255e-05, + "loss": 5.7124, + "step": 4020 + }, + { + "epoch": 0.2, + "grad_norm": 1.7579398155212402, + "learning_rate": 4.005879737141163e-05, + "loss": 5.7449, + "step": 4024 + }, + { + "epoch": 0.2, + "grad_norm": 2.5005486011505127, + "learning_rate": 4.004891546025001e-05, + "loss": 5.6679, + "step": 4028 + }, + { + "epoch": 0.2, + "grad_norm": 2.033926010131836, + "learning_rate": 4.0039033549088394e-05, + "loss": 5.8639, + "step": 4032 + }, + { + "epoch": 0.2, + "grad_norm": 2.0436344146728516, + "learning_rate": 4.0029151637926776e-05, + "loss": 5.7561, + "step": 4036 + }, + { + "epoch": 0.2, + "grad_norm": 1.592089295387268, + "learning_rate": 4.001926972676516e-05, + "loss": 5.7275, + "step": 4040 + }, + { + "epoch": 0.2, + "grad_norm": 1.860810399055481, + "learning_rate": 4.000938781560354e-05, + "loss": 5.829, + "step": 4044 + }, + { + "epoch": 0.2, + "grad_norm": 1.988085150718689, + "learning_rate": 3.999950590444192e-05, + "loss": 5.7726, + "step": 4048 + }, + { + "epoch": 0.2, + "grad_norm": 1.7819504737854004, + "learning_rate": 3.99896239932803e-05, + "loss": 5.6843, + "step": 4052 + }, + { + "epoch": 0.2, + "grad_norm": 2.2862792015075684, + "learning_rate": 3.997974208211868e-05, + "loss": 5.7483, + "step": 4056 + }, + { + "epoch": 0.2, + "grad_norm": 1.5969876050949097, + "learning_rate": 3.996986017095706e-05, + "loss": 5.7825, + "step": 4060 + }, + { + "epoch": 0.2, + "grad_norm": 1.5458773374557495, + "learning_rate": 3.9959978259795444e-05, + "loss": 5.7123, + "step": 4064 + }, + { + "epoch": 0.2, + "grad_norm": 1.516552209854126, + "learning_rate": 3.9950096348633826e-05, + "loss": 5.7107, + "step": 4068 + }, + { + "epoch": 0.2, + "grad_norm": 1.6216940879821777, + "learning_rate": 3.994021443747221e-05, + "loss": 5.7065, + "step": 4072 + }, + { + "epoch": 0.2, + "grad_norm": 2.020914077758789, + "learning_rate": 3.993033252631059e-05, + "loss": 5.938, + "step": 4076 + }, + { + "epoch": 0.2, + "grad_norm": 1.9217082262039185, + "learning_rate": 3.992045061514897e-05, + "loss": 5.8464, + "step": 4080 + }, + { + "epoch": 0.2, + "grad_norm": 1.8421107530593872, + "learning_rate": 3.9910568703987354e-05, + "loss": 5.672, + "step": 4084 + }, + { + "epoch": 0.2, + "grad_norm": 2.107970952987671, + "learning_rate": 3.9900686792825736e-05, + "loss": 5.787, + "step": 4088 + }, + { + "epoch": 0.2, + "grad_norm": 2.8105857372283936, + "learning_rate": 3.989080488166412e-05, + "loss": 5.8617, + "step": 4092 + }, + { + "epoch": 0.2, + "grad_norm": 1.9893261194229126, + "learning_rate": 3.98809229705025e-05, + "loss": 5.766, + "step": 4096 + }, + { + "epoch": 0.2, + "grad_norm": 2.125051498413086, + "learning_rate": 3.987104105934088e-05, + "loss": 5.8332, + "step": 4100 + }, + { + "epoch": 0.2, + "grad_norm": 1.6055108308792114, + "learning_rate": 3.9861159148179264e-05, + "loss": 5.7197, + "step": 4104 + }, + { + "epoch": 0.2, + "grad_norm": 1.727008581161499, + "learning_rate": 3.985127723701764e-05, + "loss": 5.7569, + "step": 4108 + }, + { + "epoch": 0.2, + "grad_norm": 1.6093586683273315, + "learning_rate": 3.984139532585602e-05, + "loss": 5.6682, + "step": 4112 + }, + { + "epoch": 0.2, + "grad_norm": 2.0254461765289307, + "learning_rate": 3.9831513414694404e-05, + "loss": 5.7424, + "step": 4116 + }, + { + "epoch": 0.2, + "grad_norm": 1.8866037130355835, + "learning_rate": 3.9821631503532786e-05, + "loss": 5.8635, + "step": 4120 + }, + { + "epoch": 0.2, + "grad_norm": 1.5677746534347534, + "learning_rate": 3.981174959237117e-05, + "loss": 5.8245, + "step": 4124 + }, + { + "epoch": 0.2, + "grad_norm": 1.763741374015808, + "learning_rate": 3.980186768120955e-05, + "loss": 5.7215, + "step": 4128 + }, + { + "epoch": 0.2, + "grad_norm": 1.9562249183654785, + "learning_rate": 3.979198577004793e-05, + "loss": 5.7907, + "step": 4132 + }, + { + "epoch": 0.2, + "grad_norm": 1.8377013206481934, + "learning_rate": 3.978210385888631e-05, + "loss": 5.6803, + "step": 4136 + }, + { + "epoch": 0.2, + "grad_norm": 1.684601902961731, + "learning_rate": 3.977222194772469e-05, + "loss": 5.772, + "step": 4140 + }, + { + "epoch": 0.2, + "grad_norm": 1.5692558288574219, + "learning_rate": 3.976234003656307e-05, + "loss": 5.8362, + "step": 4144 + }, + { + "epoch": 0.2, + "grad_norm": 1.6108436584472656, + "learning_rate": 3.9752458125401454e-05, + "loss": 5.7818, + "step": 4148 + }, + { + "epoch": 0.21, + "grad_norm": 1.75248384475708, + "learning_rate": 3.9742576214239836e-05, + "loss": 5.8399, + "step": 4152 + }, + { + "epoch": 0.21, + "grad_norm": 1.5307252407073975, + "learning_rate": 3.973269430307822e-05, + "loss": 5.7638, + "step": 4156 + }, + { + "epoch": 0.21, + "grad_norm": 1.4091581106185913, + "learning_rate": 3.97228123919166e-05, + "loss": 5.7551, + "step": 4160 + }, + { + "epoch": 0.21, + "grad_norm": 1.6635953187942505, + "learning_rate": 3.971293048075498e-05, + "loss": 5.7285, + "step": 4164 + }, + { + "epoch": 0.21, + "grad_norm": 1.8554000854492188, + "learning_rate": 3.970304856959336e-05, + "loss": 5.5641, + "step": 4168 + }, + { + "epoch": 0.21, + "grad_norm": 1.6862932443618774, + "learning_rate": 3.969316665843174e-05, + "loss": 5.8047, + "step": 4172 + }, + { + "epoch": 0.21, + "grad_norm": 1.9776607751846313, + "learning_rate": 3.968328474727012e-05, + "loss": 5.7278, + "step": 4176 + }, + { + "epoch": 0.21, + "grad_norm": 1.6065359115600586, + "learning_rate": 3.96734028361085e-05, + "loss": 5.7115, + "step": 4180 + }, + { + "epoch": 0.21, + "grad_norm": 2.3388211727142334, + "learning_rate": 3.9663520924946885e-05, + "loss": 5.6954, + "step": 4184 + }, + { + "epoch": 0.21, + "grad_norm": 1.7472275495529175, + "learning_rate": 3.965363901378527e-05, + "loss": 5.834, + "step": 4188 + }, + { + "epoch": 0.21, + "grad_norm": 1.7328163385391235, + "learning_rate": 3.964375710262365e-05, + "loss": 5.6081, + "step": 4192 + }, + { + "epoch": 0.21, + "grad_norm": 2.0004336833953857, + "learning_rate": 3.963387519146203e-05, + "loss": 5.7344, + "step": 4196 + }, + { + "epoch": 0.21, + "grad_norm": 2.126734733581543, + "learning_rate": 3.9623993280300414e-05, + "loss": 5.7677, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 1.693859338760376, + "learning_rate": 3.9614111369138796e-05, + "loss": 5.7142, + "step": 4204 + }, + { + "epoch": 0.21, + "grad_norm": 1.758854627609253, + "learning_rate": 3.960422945797718e-05, + "loss": 5.7381, + "step": 4208 + }, + { + "epoch": 0.21, + "grad_norm": 2.1982223987579346, + "learning_rate": 3.959434754681556e-05, + "loss": 5.9032, + "step": 4212 + }, + { + "epoch": 0.21, + "grad_norm": 1.588758111000061, + "learning_rate": 3.958446563565394e-05, + "loss": 5.6668, + "step": 4216 + }, + { + "epoch": 0.21, + "grad_norm": 1.8513824939727783, + "learning_rate": 3.957458372449232e-05, + "loss": 5.5942, + "step": 4220 + }, + { + "epoch": 0.21, + "grad_norm": 2.1672956943511963, + "learning_rate": 3.95647018133307e-05, + "loss": 5.8085, + "step": 4224 + }, + { + "epoch": 0.21, + "grad_norm": 1.9962481260299683, + "learning_rate": 3.955481990216908e-05, + "loss": 5.7762, + "step": 4228 + }, + { + "epoch": 0.21, + "grad_norm": 1.5460338592529297, + "learning_rate": 3.954493799100746e-05, + "loss": 5.7283, + "step": 4232 + }, + { + "epoch": 0.21, + "grad_norm": 1.5077719688415527, + "learning_rate": 3.9535056079845845e-05, + "loss": 5.7272, + "step": 4236 + }, + { + "epoch": 0.21, + "grad_norm": 2.141517400741577, + "learning_rate": 3.952517416868423e-05, + "loss": 5.7612, + "step": 4240 + }, + { + "epoch": 0.21, + "grad_norm": 1.6528626680374146, + "learning_rate": 3.951529225752261e-05, + "loss": 5.6216, + "step": 4244 + }, + { + "epoch": 0.21, + "grad_norm": 1.8636984825134277, + "learning_rate": 3.950541034636099e-05, + "loss": 5.8001, + "step": 4248 + }, + { + "epoch": 0.21, + "grad_norm": 1.4790719747543335, + "learning_rate": 3.949552843519937e-05, + "loss": 5.7866, + "step": 4252 + }, + { + "epoch": 0.21, + "grad_norm": 1.534711241722107, + "learning_rate": 3.948564652403775e-05, + "loss": 5.708, + "step": 4256 + }, + { + "epoch": 0.21, + "grad_norm": 2.083681344985962, + "learning_rate": 3.947576461287613e-05, + "loss": 5.6374, + "step": 4260 + }, + { + "epoch": 0.21, + "grad_norm": 1.5164477825164795, + "learning_rate": 3.946588270171451e-05, + "loss": 5.7652, + "step": 4264 + }, + { + "epoch": 0.21, + "grad_norm": 1.6196374893188477, + "learning_rate": 3.9456000790552895e-05, + "loss": 5.7647, + "step": 4268 + }, + { + "epoch": 0.21, + "grad_norm": 2.348536491394043, + "learning_rate": 3.944611887939128e-05, + "loss": 5.6887, + "step": 4272 + }, + { + "epoch": 0.21, + "grad_norm": 1.8124507665634155, + "learning_rate": 3.943623696822966e-05, + "loss": 5.8996, + "step": 4276 + }, + { + "epoch": 0.21, + "grad_norm": 1.919488787651062, + "learning_rate": 3.9426355057068034e-05, + "loss": 5.8877, + "step": 4280 + }, + { + "epoch": 0.21, + "grad_norm": 2.190864086151123, + "learning_rate": 3.9416473145906416e-05, + "loss": 5.6733, + "step": 4284 + }, + { + "epoch": 0.21, + "grad_norm": 1.941540241241455, + "learning_rate": 3.94065912347448e-05, + "loss": 5.6711, + "step": 4288 + }, + { + "epoch": 0.21, + "grad_norm": 1.46822190284729, + "learning_rate": 3.939670932358318e-05, + "loss": 5.7721, + "step": 4292 + }, + { + "epoch": 0.21, + "grad_norm": 1.7537801265716553, + "learning_rate": 3.938682741242156e-05, + "loss": 5.663, + "step": 4296 + }, + { + "epoch": 0.21, + "grad_norm": 1.6122314929962158, + "learning_rate": 3.9376945501259945e-05, + "loss": 5.6916, + "step": 4300 + }, + { + "epoch": 0.21, + "grad_norm": 1.9700013399124146, + "learning_rate": 3.936706359009833e-05, + "loss": 5.7515, + "step": 4304 + }, + { + "epoch": 0.21, + "grad_norm": 1.7138198614120483, + "learning_rate": 3.935718167893671e-05, + "loss": 5.6894, + "step": 4308 + }, + { + "epoch": 0.21, + "grad_norm": 1.5922127962112427, + "learning_rate": 3.934729976777509e-05, + "loss": 5.6651, + "step": 4312 + }, + { + "epoch": 0.21, + "grad_norm": 1.7139687538146973, + "learning_rate": 3.933741785661347e-05, + "loss": 5.8207, + "step": 4316 + }, + { + "epoch": 0.21, + "grad_norm": 1.8785958290100098, + "learning_rate": 3.9327535945451855e-05, + "loss": 5.6558, + "step": 4320 + }, + { + "epoch": 0.21, + "grad_norm": 1.6499451398849487, + "learning_rate": 3.931765403429024e-05, + "loss": 5.7528, + "step": 4324 + }, + { + "epoch": 0.21, + "grad_norm": 1.9110082387924194, + "learning_rate": 3.930777212312862e-05, + "loss": 5.7717, + "step": 4328 + }, + { + "epoch": 0.21, + "grad_norm": 1.643752098083496, + "learning_rate": 3.9297890211966994e-05, + "loss": 5.7846, + "step": 4332 + }, + { + "epoch": 0.21, + "grad_norm": 2.1606407165527344, + "learning_rate": 3.9288008300805377e-05, + "loss": 5.7187, + "step": 4336 + }, + { + "epoch": 0.21, + "grad_norm": 1.6892307996749878, + "learning_rate": 3.927812638964376e-05, + "loss": 5.7547, + "step": 4340 + }, + { + "epoch": 0.21, + "grad_norm": 1.863633632659912, + "learning_rate": 3.926824447848214e-05, + "loss": 5.6712, + "step": 4344 + }, + { + "epoch": 0.21, + "grad_norm": 1.9984108209609985, + "learning_rate": 3.925836256732052e-05, + "loss": 5.6589, + "step": 4348 + }, + { + "epoch": 0.22, + "grad_norm": 1.6699365377426147, + "learning_rate": 3.9248480656158905e-05, + "loss": 5.7613, + "step": 4352 + }, + { + "epoch": 0.22, + "grad_norm": 1.7803456783294678, + "learning_rate": 3.923859874499729e-05, + "loss": 5.7216, + "step": 4356 + }, + { + "epoch": 0.22, + "grad_norm": 1.8623511791229248, + "learning_rate": 3.922871683383567e-05, + "loss": 5.6227, + "step": 4360 + }, + { + "epoch": 0.22, + "grad_norm": 1.5841137170791626, + "learning_rate": 3.9218834922674044e-05, + "loss": 5.6657, + "step": 4364 + }, + { + "epoch": 0.22, + "grad_norm": 1.5960050821304321, + "learning_rate": 3.9208953011512426e-05, + "loss": 5.6565, + "step": 4368 + }, + { + "epoch": 0.22, + "grad_norm": 1.5138053894042969, + "learning_rate": 3.919907110035081e-05, + "loss": 5.7149, + "step": 4372 + }, + { + "epoch": 0.22, + "grad_norm": 2.044208288192749, + "learning_rate": 3.918918918918919e-05, + "loss": 5.659, + "step": 4376 + }, + { + "epoch": 0.22, + "grad_norm": 1.7930463552474976, + "learning_rate": 3.917930727802757e-05, + "loss": 5.6993, + "step": 4380 + }, + { + "epoch": 0.22, + "grad_norm": 1.9749689102172852, + "learning_rate": 3.9169425366865954e-05, + "loss": 5.6016, + "step": 4384 + }, + { + "epoch": 0.22, + "grad_norm": 1.737301230430603, + "learning_rate": 3.9159543455704337e-05, + "loss": 5.651, + "step": 4388 + }, + { + "epoch": 0.22, + "grad_norm": 1.506861925125122, + "learning_rate": 3.914966154454271e-05, + "loss": 5.7291, + "step": 4392 + }, + { + "epoch": 0.22, + "grad_norm": 1.6121779680252075, + "learning_rate": 3.9139779633381094e-05, + "loss": 5.6529, + "step": 4396 + }, + { + "epoch": 0.22, + "grad_norm": 1.6343719959259033, + "learning_rate": 3.9129897722219476e-05, + "loss": 5.7278, + "step": 4400 + }, + { + "epoch": 0.22, + "grad_norm": 1.9690921306610107, + "learning_rate": 3.912001581105786e-05, + "loss": 5.636, + "step": 4404 + }, + { + "epoch": 0.22, + "grad_norm": 1.6430257558822632, + "learning_rate": 3.911013389989624e-05, + "loss": 5.7014, + "step": 4408 + }, + { + "epoch": 0.22, + "grad_norm": 1.506526231765747, + "learning_rate": 3.910025198873462e-05, + "loss": 5.8489, + "step": 4412 + }, + { + "epoch": 0.22, + "grad_norm": 1.704164743423462, + "learning_rate": 3.9090370077573004e-05, + "loss": 5.8284, + "step": 4416 + }, + { + "epoch": 0.22, + "grad_norm": 1.7875306606292725, + "learning_rate": 3.9080488166411386e-05, + "loss": 5.633, + "step": 4420 + }, + { + "epoch": 0.22, + "grad_norm": 1.9106501340866089, + "learning_rate": 3.907060625524977e-05, + "loss": 5.775, + "step": 4424 + }, + { + "epoch": 0.22, + "grad_norm": 1.9393559694290161, + "learning_rate": 3.906072434408815e-05, + "loss": 5.689, + "step": 4428 + }, + { + "epoch": 0.22, + "grad_norm": 1.8752541542053223, + "learning_rate": 3.905084243292653e-05, + "loss": 5.7892, + "step": 4432 + }, + { + "epoch": 0.22, + "grad_norm": 1.5603359937667847, + "learning_rate": 3.9040960521764914e-05, + "loss": 5.8323, + "step": 4436 + }, + { + "epoch": 0.22, + "grad_norm": 1.6264493465423584, + "learning_rate": 3.9031078610603297e-05, + "loss": 5.7913, + "step": 4440 + }, + { + "epoch": 0.22, + "grad_norm": 1.8390896320343018, + "learning_rate": 3.902119669944168e-05, + "loss": 5.7695, + "step": 4444 + }, + { + "epoch": 0.22, + "grad_norm": 1.5323280096054077, + "learning_rate": 3.9011314788280054e-05, + "loss": 5.7638, + "step": 4448 + }, + { + "epoch": 0.22, + "grad_norm": 1.8048474788665771, + "learning_rate": 3.9001432877118436e-05, + "loss": 5.7824, + "step": 4452 + }, + { + "epoch": 0.22, + "grad_norm": 1.657486915588379, + "learning_rate": 3.899155096595682e-05, + "loss": 5.7037, + "step": 4456 + }, + { + "epoch": 0.22, + "grad_norm": 1.7981828451156616, + "learning_rate": 3.89816690547952e-05, + "loss": 5.6465, + "step": 4460 + }, + { + "epoch": 0.22, + "grad_norm": 1.5225144624710083, + "learning_rate": 3.897178714363358e-05, + "loss": 5.8237, + "step": 4464 + }, + { + "epoch": 0.22, + "grad_norm": 1.4845541715621948, + "learning_rate": 3.8961905232471964e-05, + "loss": 5.7549, + "step": 4468 + }, + { + "epoch": 0.22, + "grad_norm": 1.5387517213821411, + "learning_rate": 3.8952023321310346e-05, + "loss": 5.7666, + "step": 4472 + }, + { + "epoch": 0.22, + "grad_norm": 1.7411534786224365, + "learning_rate": 3.894214141014872e-05, + "loss": 5.7204, + "step": 4476 + }, + { + "epoch": 0.22, + "grad_norm": 1.622908592224121, + "learning_rate": 3.8932259498987104e-05, + "loss": 5.6133, + "step": 4480 + }, + { + "epoch": 0.22, + "grad_norm": 1.8925143480300903, + "learning_rate": 3.8922377587825486e-05, + "loss": 5.7133, + "step": 4484 + }, + { + "epoch": 0.22, + "grad_norm": 1.7361855506896973, + "learning_rate": 3.891249567666387e-05, + "loss": 5.7737, + "step": 4488 + }, + { + "epoch": 0.22, + "grad_norm": 1.588912010192871, + "learning_rate": 3.890261376550225e-05, + "loss": 5.7253, + "step": 4492 + }, + { + "epoch": 0.22, + "grad_norm": 1.7690430879592896, + "learning_rate": 3.889273185434063e-05, + "loss": 5.7693, + "step": 4496 + }, + { + "epoch": 0.22, + "grad_norm": 2.1007232666015625, + "learning_rate": 3.8882849943179014e-05, + "loss": 5.733, + "step": 4500 + }, + { + "epoch": 0.22, + "grad_norm": 2.1150882244110107, + "learning_rate": 3.8872968032017396e-05, + "loss": 5.6447, + "step": 4504 + }, + { + "epoch": 0.22, + "grad_norm": 1.85843825340271, + "learning_rate": 3.886308612085577e-05, + "loss": 5.7652, + "step": 4508 + }, + { + "epoch": 0.22, + "grad_norm": 1.6900382041931152, + "learning_rate": 3.885320420969415e-05, + "loss": 5.7937, + "step": 4512 + }, + { + "epoch": 0.22, + "grad_norm": 1.7769362926483154, + "learning_rate": 3.8843322298532535e-05, + "loss": 5.7276, + "step": 4516 + }, + { + "epoch": 0.22, + "grad_norm": 2.092283010482788, + "learning_rate": 3.883344038737092e-05, + "loss": 5.6712, + "step": 4520 + }, + { + "epoch": 0.22, + "grad_norm": 1.7213283777236938, + "learning_rate": 3.88235584762093e-05, + "loss": 5.7979, + "step": 4524 + }, + { + "epoch": 0.22, + "grad_norm": 1.5792101621627808, + "learning_rate": 3.881367656504768e-05, + "loss": 5.7273, + "step": 4528 + }, + { + "epoch": 0.22, + "grad_norm": 1.8439412117004395, + "learning_rate": 3.8803794653886064e-05, + "loss": 5.8053, + "step": 4532 + }, + { + "epoch": 0.22, + "grad_norm": 2.0028135776519775, + "learning_rate": 3.8793912742724446e-05, + "loss": 5.7714, + "step": 4536 + }, + { + "epoch": 0.22, + "grad_norm": 1.7748719453811646, + "learning_rate": 3.878403083156283e-05, + "loss": 5.8676, + "step": 4540 + }, + { + "epoch": 0.22, + "grad_norm": 1.7069579362869263, + "learning_rate": 3.877414892040121e-05, + "loss": 5.5816, + "step": 4544 + }, + { + "epoch": 0.22, + "grad_norm": 1.695499062538147, + "learning_rate": 3.876426700923959e-05, + "loss": 5.8817, + "step": 4548 + }, + { + "epoch": 0.22, + "grad_norm": 1.573487639427185, + "learning_rate": 3.8754385098077974e-05, + "loss": 5.792, + "step": 4552 + }, + { + "epoch": 0.23, + "grad_norm": 1.576565146446228, + "learning_rate": 3.8744503186916356e-05, + "loss": 5.6382, + "step": 4556 + }, + { + "epoch": 0.23, + "grad_norm": 2.081883668899536, + "learning_rate": 3.873462127575473e-05, + "loss": 5.7344, + "step": 4560 + }, + { + "epoch": 0.23, + "grad_norm": 1.9463318586349487, + "learning_rate": 3.872473936459311e-05, + "loss": 5.6469, + "step": 4564 + }, + { + "epoch": 0.23, + "grad_norm": 1.8385138511657715, + "learning_rate": 3.8714857453431495e-05, + "loss": 5.8192, + "step": 4568 + }, + { + "epoch": 0.23, + "grad_norm": 1.7394839525222778, + "learning_rate": 3.870497554226988e-05, + "loss": 5.5372, + "step": 4572 + }, + { + "epoch": 0.23, + "grad_norm": 1.578692078590393, + "learning_rate": 3.869509363110826e-05, + "loss": 5.7304, + "step": 4576 + }, + { + "epoch": 0.23, + "grad_norm": 1.551023006439209, + "learning_rate": 3.868521171994664e-05, + "loss": 5.8348, + "step": 4580 + }, + { + "epoch": 0.23, + "grad_norm": 1.7546929121017456, + "learning_rate": 3.8675329808785024e-05, + "loss": 5.7532, + "step": 4584 + }, + { + "epoch": 0.23, + "grad_norm": 1.7034144401550293, + "learning_rate": 3.8665447897623406e-05, + "loss": 5.6995, + "step": 4588 + }, + { + "epoch": 0.23, + "grad_norm": 1.4249943494796753, + "learning_rate": 3.865556598646178e-05, + "loss": 5.7219, + "step": 4592 + }, + { + "epoch": 0.23, + "grad_norm": 1.5602226257324219, + "learning_rate": 3.864568407530016e-05, + "loss": 5.6544, + "step": 4596 + }, + { + "epoch": 0.23, + "grad_norm": 1.4957911968231201, + "learning_rate": 3.8635802164138545e-05, + "loss": 5.7934, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 1.6986734867095947, + "learning_rate": 3.862592025297693e-05, + "loss": 5.7564, + "step": 4604 + }, + { + "epoch": 0.23, + "grad_norm": 1.6847269535064697, + "learning_rate": 3.861603834181531e-05, + "loss": 5.583, + "step": 4608 + }, + { + "epoch": 0.23, + "grad_norm": 1.73738431930542, + "learning_rate": 3.860615643065369e-05, + "loss": 5.6771, + "step": 4612 + }, + { + "epoch": 0.23, + "grad_norm": 1.5951757431030273, + "learning_rate": 3.859627451949207e-05, + "loss": 5.6885, + "step": 4616 + }, + { + "epoch": 0.23, + "grad_norm": 1.6416071653366089, + "learning_rate": 3.858639260833045e-05, + "loss": 5.7239, + "step": 4620 + }, + { + "epoch": 0.23, + "grad_norm": 2.0191540718078613, + "learning_rate": 3.857651069716883e-05, + "loss": 5.6335, + "step": 4624 + }, + { + "epoch": 0.23, + "grad_norm": 1.677107810974121, + "learning_rate": 3.856662878600721e-05, + "loss": 5.6885, + "step": 4628 + }, + { + "epoch": 0.23, + "grad_norm": 1.619813323020935, + "learning_rate": 3.8556746874845595e-05, + "loss": 5.6838, + "step": 4632 + }, + { + "epoch": 0.23, + "grad_norm": 1.8103737831115723, + "learning_rate": 3.854686496368398e-05, + "loss": 5.6114, + "step": 4636 + }, + { + "epoch": 0.23, + "grad_norm": 1.6023643016815186, + "learning_rate": 3.853698305252236e-05, + "loss": 5.718, + "step": 4640 + }, + { + "epoch": 0.23, + "grad_norm": 1.3481965065002441, + "learning_rate": 3.852710114136074e-05, + "loss": 5.6867, + "step": 4644 + }, + { + "epoch": 0.23, + "grad_norm": 1.6693611145019531, + "learning_rate": 3.851721923019912e-05, + "loss": 5.8371, + "step": 4648 + }, + { + "epoch": 0.23, + "grad_norm": 1.7420861721038818, + "learning_rate": 3.8507337319037505e-05, + "loss": 5.7511, + "step": 4652 + }, + { + "epoch": 0.23, + "grad_norm": 1.6563256978988647, + "learning_rate": 3.849745540787589e-05, + "loss": 5.64, + "step": 4656 + }, + { + "epoch": 0.23, + "grad_norm": 1.7449144124984741, + "learning_rate": 3.848757349671427e-05, + "loss": 5.7528, + "step": 4660 + }, + { + "epoch": 0.23, + "grad_norm": 1.9364055395126343, + "learning_rate": 3.847769158555265e-05, + "loss": 5.7674, + "step": 4664 + }, + { + "epoch": 0.23, + "grad_norm": 1.8317818641662598, + "learning_rate": 3.846780967439103e-05, + "loss": 5.7468, + "step": 4668 + }, + { + "epoch": 0.23, + "grad_norm": 1.949170470237732, + "learning_rate": 3.8457927763229415e-05, + "loss": 5.7701, + "step": 4672 + }, + { + "epoch": 0.23, + "grad_norm": 1.7268710136413574, + "learning_rate": 3.844804585206779e-05, + "loss": 5.7698, + "step": 4676 + }, + { + "epoch": 0.23, + "grad_norm": 1.8129808902740479, + "learning_rate": 3.843816394090617e-05, + "loss": 5.7412, + "step": 4680 + }, + { + "epoch": 0.23, + "grad_norm": 1.815779209136963, + "learning_rate": 3.8428282029744555e-05, + "loss": 5.7174, + "step": 4684 + }, + { + "epoch": 0.23, + "grad_norm": 1.8246920108795166, + "learning_rate": 3.841840011858294e-05, + "loss": 5.7602, + "step": 4688 + }, + { + "epoch": 0.23, + "grad_norm": 1.7265750169754028, + "learning_rate": 3.840851820742132e-05, + "loss": 5.7137, + "step": 4692 + }, + { + "epoch": 0.23, + "grad_norm": 2.4503543376922607, + "learning_rate": 3.83986362962597e-05, + "loss": 5.78, + "step": 4696 + }, + { + "epoch": 0.23, + "grad_norm": 2.0876848697662354, + "learning_rate": 3.838875438509808e-05, + "loss": 5.67, + "step": 4700 + }, + { + "epoch": 0.23, + "grad_norm": 1.5135231018066406, + "learning_rate": 3.837887247393646e-05, + "loss": 5.7372, + "step": 4704 + }, + { + "epoch": 0.23, + "grad_norm": 1.9444183111190796, + "learning_rate": 3.836899056277484e-05, + "loss": 5.6755, + "step": 4708 + }, + { + "epoch": 0.23, + "grad_norm": 1.8341116905212402, + "learning_rate": 3.835910865161322e-05, + "loss": 5.7714, + "step": 4712 + }, + { + "epoch": 0.23, + "grad_norm": 1.6584597826004028, + "learning_rate": 3.8349226740451604e-05, + "loss": 5.6811, + "step": 4716 + }, + { + "epoch": 0.23, + "grad_norm": 1.7305800914764404, + "learning_rate": 3.8339344829289987e-05, + "loss": 5.6905, + "step": 4720 + }, + { + "epoch": 0.23, + "grad_norm": 1.826827883720398, + "learning_rate": 3.832946291812837e-05, + "loss": 5.6126, + "step": 4724 + }, + { + "epoch": 0.23, + "grad_norm": 1.5062451362609863, + "learning_rate": 3.831958100696675e-05, + "loss": 5.8465, + "step": 4728 + }, + { + "epoch": 0.23, + "grad_norm": 1.7500250339508057, + "learning_rate": 3.830969909580513e-05, + "loss": 5.6855, + "step": 4732 + }, + { + "epoch": 0.23, + "grad_norm": 2.0079612731933594, + "learning_rate": 3.829981718464351e-05, + "loss": 5.729, + "step": 4736 + }, + { + "epoch": 0.23, + "grad_norm": 1.8388514518737793, + "learning_rate": 3.828993527348189e-05, + "loss": 5.7142, + "step": 4740 + }, + { + "epoch": 0.23, + "grad_norm": 1.8375358581542969, + "learning_rate": 3.828005336232027e-05, + "loss": 5.6261, + "step": 4744 + }, + { + "epoch": 0.23, + "grad_norm": 1.785122275352478, + "learning_rate": 3.8270171451158654e-05, + "loss": 5.7206, + "step": 4748 + }, + { + "epoch": 0.23, + "grad_norm": 1.7444349527359009, + "learning_rate": 3.8260289539997036e-05, + "loss": 5.711, + "step": 4752 + }, + { + "epoch": 0.23, + "grad_norm": 2.0970168113708496, + "learning_rate": 3.825040762883542e-05, + "loss": 5.7049, + "step": 4756 + }, + { + "epoch": 0.24, + "grad_norm": 1.8633285760879517, + "learning_rate": 3.82405257176738e-05, + "loss": 5.7639, + "step": 4760 + }, + { + "epoch": 0.24, + "grad_norm": 1.7935363054275513, + "learning_rate": 3.8230643806512176e-05, + "loss": 5.7429, + "step": 4764 + }, + { + "epoch": 0.24, + "grad_norm": 2.099822521209717, + "learning_rate": 3.8220761895350564e-05, + "loss": 5.6922, + "step": 4768 + }, + { + "epoch": 0.24, + "grad_norm": 1.7735134363174438, + "learning_rate": 3.8210879984188947e-05, + "loss": 5.777, + "step": 4772 + }, + { + "epoch": 0.24, + "grad_norm": 1.62099027633667, + "learning_rate": 3.820099807302733e-05, + "loss": 5.7513, + "step": 4776 + }, + { + "epoch": 0.24, + "grad_norm": 2.413672924041748, + "learning_rate": 3.819111616186571e-05, + "loss": 5.7412, + "step": 4780 + }, + { + "epoch": 0.24, + "grad_norm": 2.0373451709747314, + "learning_rate": 3.818123425070409e-05, + "loss": 5.8356, + "step": 4784 + }, + { + "epoch": 0.24, + "grad_norm": 1.7266334295272827, + "learning_rate": 3.817135233954247e-05, + "loss": 5.7796, + "step": 4788 + }, + { + "epoch": 0.24, + "grad_norm": 1.5502921342849731, + "learning_rate": 3.816147042838085e-05, + "loss": 5.6788, + "step": 4792 + }, + { + "epoch": 0.24, + "grad_norm": 1.559435248374939, + "learning_rate": 3.815158851721923e-05, + "loss": 5.6855, + "step": 4796 + }, + { + "epoch": 0.24, + "grad_norm": 1.8041141033172607, + "learning_rate": 3.8141706606057614e-05, + "loss": 5.712, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 1.526065468788147, + "learning_rate": 3.8131824694895996e-05, + "loss": 5.8179, + "step": 4804 + }, + { + "epoch": 0.24, + "grad_norm": 1.73811674118042, + "learning_rate": 3.812194278373438e-05, + "loss": 5.6526, + "step": 4808 + }, + { + "epoch": 0.24, + "grad_norm": 1.927445650100708, + "learning_rate": 3.811206087257276e-05, + "loss": 5.7575, + "step": 4812 + }, + { + "epoch": 0.24, + "grad_norm": 1.8947980403900146, + "learning_rate": 3.810217896141114e-05, + "loss": 5.6713, + "step": 4816 + }, + { + "epoch": 0.24, + "grad_norm": 1.586437702178955, + "learning_rate": 3.809229705024952e-05, + "loss": 5.7355, + "step": 4820 + }, + { + "epoch": 0.24, + "grad_norm": 1.980283498764038, + "learning_rate": 3.80824151390879e-05, + "loss": 5.6553, + "step": 4824 + }, + { + "epoch": 0.24, + "grad_norm": 1.615174651145935, + "learning_rate": 3.807253322792628e-05, + "loss": 5.7573, + "step": 4828 + }, + { + "epoch": 0.24, + "grad_norm": 2.1818222999572754, + "learning_rate": 3.8062651316764664e-05, + "loss": 5.6662, + "step": 4832 + }, + { + "epoch": 0.24, + "grad_norm": 2.164574146270752, + "learning_rate": 3.8052769405603046e-05, + "loss": 5.6212, + "step": 4836 + }, + { + "epoch": 0.24, + "grad_norm": 2.612293243408203, + "learning_rate": 3.804288749444143e-05, + "loss": 5.6983, + "step": 4840 + }, + { + "epoch": 0.24, + "grad_norm": 1.661539912223816, + "learning_rate": 3.803300558327981e-05, + "loss": 5.6968, + "step": 4844 + }, + { + "epoch": 0.24, + "grad_norm": 1.7480438947677612, + "learning_rate": 3.8023123672118185e-05, + "loss": 5.6364, + "step": 4848 + }, + { + "epoch": 0.24, + "grad_norm": 1.5679854154586792, + "learning_rate": 3.801324176095657e-05, + "loss": 5.7243, + "step": 4852 + }, + { + "epoch": 0.24, + "grad_norm": 1.688027024269104, + "learning_rate": 3.800335984979495e-05, + "loss": 5.7444, + "step": 4856 + }, + { + "epoch": 0.24, + "grad_norm": 2.0255541801452637, + "learning_rate": 3.799347793863333e-05, + "loss": 5.7882, + "step": 4860 + }, + { + "epoch": 0.24, + "grad_norm": 1.664778232574463, + "learning_rate": 3.7983596027471714e-05, + "loss": 5.8742, + "step": 4864 + }, + { + "epoch": 0.24, + "grad_norm": 1.7059557437896729, + "learning_rate": 3.7973714116310096e-05, + "loss": 5.6959, + "step": 4868 + }, + { + "epoch": 0.24, + "grad_norm": 1.6631444692611694, + "learning_rate": 3.796383220514848e-05, + "loss": 5.6701, + "step": 4872 + }, + { + "epoch": 0.24, + "grad_norm": 2.2237651348114014, + "learning_rate": 3.795395029398685e-05, + "loss": 5.7526, + "step": 4876 + }, + { + "epoch": 0.24, + "grad_norm": 1.5631194114685059, + "learning_rate": 3.7944068382825235e-05, + "loss": 5.6439, + "step": 4880 + }, + { + "epoch": 0.24, + "grad_norm": 1.7338947057724, + "learning_rate": 3.7934186471663624e-05, + "loss": 5.699, + "step": 4884 + }, + { + "epoch": 0.24, + "grad_norm": 1.682901382446289, + "learning_rate": 3.7924304560502006e-05, + "loss": 5.7413, + "step": 4888 + }, + { + "epoch": 0.24, + "grad_norm": 1.9311175346374512, + "learning_rate": 3.791442264934039e-05, + "loss": 5.8328, + "step": 4892 + }, + { + "epoch": 0.24, + "grad_norm": 1.4761751890182495, + "learning_rate": 3.790454073817877e-05, + "loss": 5.6389, + "step": 4896 + }, + { + "epoch": 0.24, + "grad_norm": 1.6680078506469727, + "learning_rate": 3.7894658827017145e-05, + "loss": 5.81, + "step": 4900 + }, + { + "epoch": 0.24, + "grad_norm": 2.0462136268615723, + "learning_rate": 3.788477691585553e-05, + "loss": 5.7153, + "step": 4904 + }, + { + "epoch": 0.24, + "grad_norm": 1.5725479125976562, + "learning_rate": 3.787489500469391e-05, + "loss": 5.5877, + "step": 4908 + }, + { + "epoch": 0.24, + "grad_norm": 1.8569021224975586, + "learning_rate": 3.786501309353229e-05, + "loss": 5.6218, + "step": 4912 + }, + { + "epoch": 0.24, + "grad_norm": 1.4288854598999023, + "learning_rate": 3.7855131182370674e-05, + "loss": 5.6984, + "step": 4916 + }, + { + "epoch": 0.24, + "grad_norm": 1.483189582824707, + "learning_rate": 3.7845249271209056e-05, + "loss": 5.7004, + "step": 4920 + }, + { + "epoch": 0.24, + "grad_norm": 1.9790661334991455, + "learning_rate": 3.783536736004744e-05, + "loss": 5.6083, + "step": 4924 + }, + { + "epoch": 0.24, + "grad_norm": 1.6783969402313232, + "learning_rate": 3.782548544888582e-05, + "loss": 5.7083, + "step": 4928 + }, + { + "epoch": 0.24, + "grad_norm": 1.763135552406311, + "learning_rate": 3.7815603537724195e-05, + "loss": 5.6475, + "step": 4932 + }, + { + "epoch": 0.24, + "grad_norm": 1.8823894262313843, + "learning_rate": 3.780572162656258e-05, + "loss": 5.741, + "step": 4936 + }, + { + "epoch": 0.24, + "grad_norm": 1.9821616411209106, + "learning_rate": 3.779583971540096e-05, + "loss": 5.7479, + "step": 4940 + }, + { + "epoch": 0.24, + "grad_norm": 1.6714750528335571, + "learning_rate": 3.778595780423934e-05, + "loss": 5.7442, + "step": 4944 + }, + { + "epoch": 0.24, + "grad_norm": 1.7158372402191162, + "learning_rate": 3.777607589307772e-05, + "loss": 5.7097, + "step": 4948 + }, + { + "epoch": 0.24, + "grad_norm": 1.8387162685394287, + "learning_rate": 3.7766193981916105e-05, + "loss": 5.534, + "step": 4952 + }, + { + "epoch": 0.24, + "grad_norm": 1.7854334115982056, + "learning_rate": 3.775631207075449e-05, + "loss": 5.7142, + "step": 4956 + }, + { + "epoch": 0.25, + "grad_norm": 1.736302137374878, + "learning_rate": 3.774643015959286e-05, + "loss": 5.7211, + "step": 4960 + }, + { + "epoch": 0.25, + "grad_norm": 2.1947836875915527, + "learning_rate": 3.7736548248431245e-05, + "loss": 5.6862, + "step": 4964 + }, + { + "epoch": 0.25, + "grad_norm": 1.9635826349258423, + "learning_rate": 3.772666633726963e-05, + "loss": 5.7067, + "step": 4968 + }, + { + "epoch": 0.25, + "grad_norm": 1.4694422483444214, + "learning_rate": 3.771678442610801e-05, + "loss": 5.6205, + "step": 4972 + }, + { + "epoch": 0.25, + "grad_norm": 2.0147790908813477, + "learning_rate": 3.770690251494639e-05, + "loss": 5.7691, + "step": 4976 + }, + { + "epoch": 0.25, + "grad_norm": 1.6146892309188843, + "learning_rate": 3.769702060378477e-05, + "loss": 5.7103, + "step": 4980 + }, + { + "epoch": 0.25, + "grad_norm": 2.098802089691162, + "learning_rate": 3.7687138692623155e-05, + "loss": 5.759, + "step": 4984 + }, + { + "epoch": 0.25, + "grad_norm": 1.7809268236160278, + "learning_rate": 3.767725678146154e-05, + "loss": 5.7146, + "step": 4988 + }, + { + "epoch": 0.25, + "grad_norm": 1.8064768314361572, + "learning_rate": 3.766737487029991e-05, + "loss": 5.6203, + "step": 4992 + }, + { + "epoch": 0.25, + "grad_norm": 1.6928107738494873, + "learning_rate": 3.76574929591383e-05, + "loss": 5.8565, + "step": 4996 + }, + { + "epoch": 0.25, + "grad_norm": 1.65586256980896, + "learning_rate": 3.764761104797668e-05, + "loss": 5.7671, + "step": 5000 + }, + { + "epoch": 0.25, + "grad_norm": 2.0193347930908203, + "learning_rate": 3.7637729136815065e-05, + "loss": 5.8168, + "step": 5004 + }, + { + "epoch": 0.25, + "grad_norm": 1.571082592010498, + "learning_rate": 3.762784722565345e-05, + "loss": 5.6741, + "step": 5008 + }, + { + "epoch": 0.25, + "grad_norm": 1.5919872522354126, + "learning_rate": 3.761796531449183e-05, + "loss": 5.7805, + "step": 5012 + }, + { + "epoch": 0.25, + "grad_norm": 1.8394559621810913, + "learning_rate": 3.7608083403330205e-05, + "loss": 5.7168, + "step": 5016 + }, + { + "epoch": 0.25, + "grad_norm": 2.177563190460205, + "learning_rate": 3.759820149216859e-05, + "loss": 5.8527, + "step": 5020 + }, + { + "epoch": 0.25, + "grad_norm": 1.9234048128128052, + "learning_rate": 3.758831958100697e-05, + "loss": 5.6813, + "step": 5024 + }, + { + "epoch": 0.25, + "grad_norm": 1.7297863960266113, + "learning_rate": 3.757843766984535e-05, + "loss": 5.5673, + "step": 5028 + }, + { + "epoch": 0.25, + "grad_norm": 1.6289762258529663, + "learning_rate": 3.756855575868373e-05, + "loss": 5.6836, + "step": 5032 + }, + { + "epoch": 0.25, + "grad_norm": 1.9951887130737305, + "learning_rate": 3.7558673847522115e-05, + "loss": 5.7777, + "step": 5036 + }, + { + "epoch": 0.25, + "grad_norm": 1.6676596403121948, + "learning_rate": 3.75487919363605e-05, + "loss": 5.731, + "step": 5040 + }, + { + "epoch": 0.25, + "grad_norm": 1.914063811302185, + "learning_rate": 3.753891002519887e-05, + "loss": 5.747, + "step": 5044 + }, + { + "epoch": 0.25, + "grad_norm": 1.8118635416030884, + "learning_rate": 3.7529028114037254e-05, + "loss": 5.5851, + "step": 5048 + }, + { + "epoch": 0.25, + "grad_norm": 1.7621952295303345, + "learning_rate": 3.7519146202875637e-05, + "loss": 5.8087, + "step": 5052 + }, + { + "epoch": 0.25, + "grad_norm": 2.0561869144439697, + "learning_rate": 3.750926429171402e-05, + "loss": 5.7153, + "step": 5056 + }, + { + "epoch": 0.25, + "grad_norm": 1.6286967992782593, + "learning_rate": 3.74993823805524e-05, + "loss": 5.7822, + "step": 5060 + }, + { + "epoch": 0.25, + "grad_norm": 2.714860439300537, + "learning_rate": 3.748950046939078e-05, + "loss": 5.7273, + "step": 5064 + }, + { + "epoch": 0.25, + "grad_norm": 1.5165153741836548, + "learning_rate": 3.7479618558229165e-05, + "loss": 5.6865, + "step": 5068 + }, + { + "epoch": 0.25, + "grad_norm": 1.6314889192581177, + "learning_rate": 3.746973664706755e-05, + "loss": 5.91, + "step": 5072 + }, + { + "epoch": 0.25, + "grad_norm": 2.141998529434204, + "learning_rate": 3.745985473590592e-05, + "loss": 5.796, + "step": 5076 + }, + { + "epoch": 0.25, + "grad_norm": 1.727159857749939, + "learning_rate": 3.7449972824744304e-05, + "loss": 5.7056, + "step": 5080 + }, + { + "epoch": 0.25, + "grad_norm": 1.6345195770263672, + "learning_rate": 3.7440090913582686e-05, + "loss": 5.6393, + "step": 5084 + }, + { + "epoch": 0.25, + "grad_norm": 1.8869349956512451, + "learning_rate": 3.743020900242107e-05, + "loss": 5.6482, + "step": 5088 + }, + { + "epoch": 0.25, + "grad_norm": 2.2009177207946777, + "learning_rate": 3.742032709125945e-05, + "loss": 5.762, + "step": 5092 + }, + { + "epoch": 0.25, + "grad_norm": 1.64715576171875, + "learning_rate": 3.741044518009783e-05, + "loss": 5.6038, + "step": 5096 + }, + { + "epoch": 0.25, + "grad_norm": 1.730360984802246, + "learning_rate": 3.7400563268936215e-05, + "loss": 5.6776, + "step": 5100 + }, + { + "epoch": 0.25, + "grad_norm": 1.7169520854949951, + "learning_rate": 3.739068135777459e-05, + "loss": 5.7149, + "step": 5104 + }, + { + "epoch": 0.25, + "grad_norm": 1.6068758964538574, + "learning_rate": 3.738079944661297e-05, + "loss": 5.7772, + "step": 5108 + }, + { + "epoch": 0.25, + "grad_norm": 2.215623378753662, + "learning_rate": 3.737091753545136e-05, + "loss": 5.697, + "step": 5112 + }, + { + "epoch": 0.25, + "grad_norm": 2.080639123916626, + "learning_rate": 3.736103562428974e-05, + "loss": 5.6851, + "step": 5116 + }, + { + "epoch": 0.25, + "grad_norm": 2.041546583175659, + "learning_rate": 3.7351153713128125e-05, + "loss": 5.7803, + "step": 5120 + }, + { + "epoch": 0.25, + "grad_norm": 1.9273161888122559, + "learning_rate": 3.734127180196651e-05, + "loss": 5.6727, + "step": 5124 + }, + { + "epoch": 0.25, + "grad_norm": 1.5295966863632202, + "learning_rate": 3.733138989080488e-05, + "loss": 5.7002, + "step": 5128 + }, + { + "epoch": 0.25, + "grad_norm": 1.5543110370635986, + "learning_rate": 3.7321507979643264e-05, + "loss": 5.7539, + "step": 5132 + }, + { + "epoch": 0.25, + "grad_norm": 1.867628574371338, + "learning_rate": 3.7311626068481646e-05, + "loss": 5.6902, + "step": 5136 + }, + { + "epoch": 0.25, + "grad_norm": 1.7172714471817017, + "learning_rate": 3.730174415732003e-05, + "loss": 5.7929, + "step": 5140 + }, + { + "epoch": 0.25, + "grad_norm": 1.8065274953842163, + "learning_rate": 3.729186224615841e-05, + "loss": 5.7148, + "step": 5144 + }, + { + "epoch": 0.25, + "grad_norm": 1.6474413871765137, + "learning_rate": 3.728198033499679e-05, + "loss": 5.6105, + "step": 5148 + }, + { + "epoch": 0.25, + "grad_norm": 2.2249326705932617, + "learning_rate": 3.7272098423835175e-05, + "loss": 5.7129, + "step": 5152 + }, + { + "epoch": 0.25, + "grad_norm": 1.8622581958770752, + "learning_rate": 3.7262216512673557e-05, + "loss": 5.6788, + "step": 5156 + }, + { + "epoch": 0.25, + "grad_norm": 1.7544127702713013, + "learning_rate": 3.725233460151193e-05, + "loss": 5.6447, + "step": 5160 + }, + { + "epoch": 0.26, + "grad_norm": 1.9091047048568726, + "learning_rate": 3.7242452690350314e-05, + "loss": 5.6228, + "step": 5164 + }, + { + "epoch": 0.26, + "grad_norm": 1.5641545057296753, + "learning_rate": 3.7232570779188696e-05, + "loss": 5.7202, + "step": 5168 + }, + { + "epoch": 0.26, + "grad_norm": 1.8679791688919067, + "learning_rate": 3.722268886802708e-05, + "loss": 5.7362, + "step": 5172 + }, + { + "epoch": 0.26, + "grad_norm": 1.9691587686538696, + "learning_rate": 3.721280695686546e-05, + "loss": 5.6811, + "step": 5176 + }, + { + "epoch": 0.26, + "grad_norm": 2.0612921714782715, + "learning_rate": 3.720292504570384e-05, + "loss": 5.7, + "step": 5180 + }, + { + "epoch": 0.26, + "grad_norm": 1.4250149726867676, + "learning_rate": 3.7193043134542224e-05, + "loss": 5.7469, + "step": 5184 + }, + { + "epoch": 0.26, + "grad_norm": 2.013101577758789, + "learning_rate": 3.71831612233806e-05, + "loss": 5.6312, + "step": 5188 + }, + { + "epoch": 0.26, + "grad_norm": 1.8997466564178467, + "learning_rate": 3.717327931221898e-05, + "loss": 5.6744, + "step": 5192 + }, + { + "epoch": 0.26, + "grad_norm": 1.4980552196502686, + "learning_rate": 3.7163397401057364e-05, + "loss": 5.5909, + "step": 5196 + }, + { + "epoch": 0.26, + "grad_norm": 1.8174835443496704, + "learning_rate": 3.7153515489895746e-05, + "loss": 5.7095, + "step": 5200 + }, + { + "epoch": 0.26, + "grad_norm": 1.4909565448760986, + "learning_rate": 3.714363357873413e-05, + "loss": 5.7022, + "step": 5204 + }, + { + "epoch": 0.26, + "grad_norm": 1.5424959659576416, + "learning_rate": 3.713375166757251e-05, + "loss": 5.6704, + "step": 5208 + }, + { + "epoch": 0.26, + "grad_norm": 1.5841034650802612, + "learning_rate": 3.712386975641089e-05, + "loss": 5.6546, + "step": 5212 + }, + { + "epoch": 0.26, + "grad_norm": 1.7105164527893066, + "learning_rate": 3.7113987845249274e-05, + "loss": 5.6281, + "step": 5216 + }, + { + "epoch": 0.26, + "grad_norm": 2.422480821609497, + "learning_rate": 3.710410593408765e-05, + "loss": 5.7145, + "step": 5220 + }, + { + "epoch": 0.26, + "grad_norm": 1.8925801515579224, + "learning_rate": 3.709422402292603e-05, + "loss": 5.6512, + "step": 5224 + }, + { + "epoch": 0.26, + "grad_norm": 1.7350256443023682, + "learning_rate": 3.708434211176442e-05, + "loss": 5.7257, + "step": 5228 + }, + { + "epoch": 0.26, + "grad_norm": 1.9060580730438232, + "learning_rate": 3.70744602006028e-05, + "loss": 5.7056, + "step": 5232 + }, + { + "epoch": 0.26, + "grad_norm": 1.6758359670639038, + "learning_rate": 3.7064578289441184e-05, + "loss": 5.795, + "step": 5236 + }, + { + "epoch": 0.26, + "grad_norm": 1.7977421283721924, + "learning_rate": 3.7054696378279566e-05, + "loss": 5.6278, + "step": 5240 + }, + { + "epoch": 0.26, + "grad_norm": 1.7420555353164673, + "learning_rate": 3.704481446711794e-05, + "loss": 5.619, + "step": 5244 + }, + { + "epoch": 0.26, + "grad_norm": 1.7721624374389648, + "learning_rate": 3.7034932555956324e-05, + "loss": 5.5908, + "step": 5248 + }, + { + "epoch": 0.26, + "grad_norm": 1.9039374589920044, + "learning_rate": 3.7025050644794706e-05, + "loss": 5.7308, + "step": 5252 + }, + { + "epoch": 0.26, + "grad_norm": 1.9757115840911865, + "learning_rate": 3.701516873363309e-05, + "loss": 5.7986, + "step": 5256 + }, + { + "epoch": 0.26, + "grad_norm": 1.6613463163375854, + "learning_rate": 3.700528682247147e-05, + "loss": 5.6056, + "step": 5260 + }, + { + "epoch": 0.26, + "grad_norm": 1.9101234674453735, + "learning_rate": 3.699540491130985e-05, + "loss": 5.7109, + "step": 5264 + }, + { + "epoch": 0.26, + "grad_norm": 2.04067325592041, + "learning_rate": 3.6985523000148234e-05, + "loss": 5.7513, + "step": 5268 + }, + { + "epoch": 0.26, + "grad_norm": 1.8164728879928589, + "learning_rate": 3.697564108898661e-05, + "loss": 5.5739, + "step": 5272 + }, + { + "epoch": 0.26, + "grad_norm": 1.9348748922348022, + "learning_rate": 3.696575917782499e-05, + "loss": 5.6177, + "step": 5276 + }, + { + "epoch": 0.26, + "grad_norm": 2.20129132270813, + "learning_rate": 3.695587726666337e-05, + "loss": 5.7036, + "step": 5280 + }, + { + "epoch": 0.26, + "grad_norm": 2.0877761840820312, + "learning_rate": 3.6945995355501755e-05, + "loss": 5.6192, + "step": 5284 + }, + { + "epoch": 0.26, + "grad_norm": 1.6023980379104614, + "learning_rate": 3.693611344434014e-05, + "loss": 5.6557, + "step": 5288 + }, + { + "epoch": 0.26, + "grad_norm": 1.4598197937011719, + "learning_rate": 3.692623153317852e-05, + "loss": 5.6821, + "step": 5292 + }, + { + "epoch": 0.26, + "grad_norm": 1.9218881130218506, + "learning_rate": 3.69163496220169e-05, + "loss": 5.7097, + "step": 5296 + }, + { + "epoch": 0.26, + "grad_norm": 1.91704261302948, + "learning_rate": 3.6906467710855284e-05, + "loss": 5.7038, + "step": 5300 + }, + { + "epoch": 0.26, + "grad_norm": 1.594611406326294, + "learning_rate": 3.689658579969366e-05, + "loss": 5.5904, + "step": 5304 + }, + { + "epoch": 0.26, + "grad_norm": 1.570440649986267, + "learning_rate": 3.688670388853204e-05, + "loss": 5.5548, + "step": 5308 + }, + { + "epoch": 0.26, + "grad_norm": 1.8532248735427856, + "learning_rate": 3.687682197737042e-05, + "loss": 5.691, + "step": 5312 + }, + { + "epoch": 0.26, + "grad_norm": 1.8759549856185913, + "learning_rate": 3.6866940066208805e-05, + "loss": 5.783, + "step": 5316 + }, + { + "epoch": 0.26, + "grad_norm": 2.0743143558502197, + "learning_rate": 3.685705815504719e-05, + "loss": 5.8346, + "step": 5320 + }, + { + "epoch": 0.26, + "grad_norm": 1.7164572477340698, + "learning_rate": 3.684717624388557e-05, + "loss": 5.6427, + "step": 5324 + }, + { + "epoch": 0.26, + "grad_norm": 1.8269315958023071, + "learning_rate": 3.683729433272395e-05, + "loss": 5.7506, + "step": 5328 + }, + { + "epoch": 0.26, + "grad_norm": 1.8456995487213135, + "learning_rate": 3.6827412421562327e-05, + "loss": 5.6247, + "step": 5332 + }, + { + "epoch": 0.26, + "grad_norm": 1.7062360048294067, + "learning_rate": 3.681753051040071e-05, + "loss": 5.6777, + "step": 5336 + }, + { + "epoch": 0.26, + "grad_norm": 1.861578106880188, + "learning_rate": 3.68076485992391e-05, + "loss": 5.7011, + "step": 5340 + }, + { + "epoch": 0.26, + "grad_norm": 1.5446622371673584, + "learning_rate": 3.679776668807748e-05, + "loss": 5.6793, + "step": 5344 + }, + { + "epoch": 0.26, + "grad_norm": 1.8392596244812012, + "learning_rate": 3.678788477691586e-05, + "loss": 5.7193, + "step": 5348 + }, + { + "epoch": 0.26, + "grad_norm": 2.0166003704071045, + "learning_rate": 3.6778002865754244e-05, + "loss": 5.6509, + "step": 5352 + }, + { + "epoch": 0.26, + "grad_norm": 1.791098952293396, + "learning_rate": 3.676812095459262e-05, + "loss": 5.7176, + "step": 5356 + }, + { + "epoch": 0.26, + "grad_norm": 1.9136872291564941, + "learning_rate": 3.6758239043431e-05, + "loss": 5.8457, + "step": 5360 + }, + { + "epoch": 0.27, + "grad_norm": 2.2667651176452637, + "learning_rate": 3.674835713226938e-05, + "loss": 5.6658, + "step": 5364 + }, + { + "epoch": 0.27, + "grad_norm": 1.8071796894073486, + "learning_rate": 3.6738475221107765e-05, + "loss": 5.7392, + "step": 5368 + }, + { + "epoch": 0.27, + "grad_norm": 1.8109067678451538, + "learning_rate": 3.672859330994615e-05, + "loss": 5.6318, + "step": 5372 + }, + { + "epoch": 0.27, + "grad_norm": 1.8517425060272217, + "learning_rate": 3.671871139878453e-05, + "loss": 5.8122, + "step": 5376 + }, + { + "epoch": 0.27, + "grad_norm": 1.8264243602752686, + "learning_rate": 3.670882948762291e-05, + "loss": 5.5755, + "step": 5380 + }, + { + "epoch": 0.27, + "grad_norm": 1.98093581199646, + "learning_rate": 3.669894757646129e-05, + "loss": 5.7236, + "step": 5384 + }, + { + "epoch": 0.27, + "grad_norm": 1.6828770637512207, + "learning_rate": 3.668906566529967e-05, + "loss": 5.7345, + "step": 5388 + }, + { + "epoch": 0.27, + "grad_norm": 1.671663761138916, + "learning_rate": 3.667918375413805e-05, + "loss": 5.5266, + "step": 5392 + }, + { + "epoch": 0.27, + "grad_norm": 2.236020565032959, + "learning_rate": 3.666930184297643e-05, + "loss": 5.6844, + "step": 5396 + }, + { + "epoch": 0.27, + "grad_norm": 2.147383213043213, + "learning_rate": 3.6659419931814815e-05, + "loss": 5.6782, + "step": 5400 + }, + { + "epoch": 0.27, + "grad_norm": 2.0428617000579834, + "learning_rate": 3.66495380206532e-05, + "loss": 5.6924, + "step": 5404 + }, + { + "epoch": 0.27, + "grad_norm": 1.5944437980651855, + "learning_rate": 3.663965610949158e-05, + "loss": 5.5994, + "step": 5408 + }, + { + "epoch": 0.27, + "grad_norm": 1.8434429168701172, + "learning_rate": 3.662977419832996e-05, + "loss": 5.6628, + "step": 5412 + }, + { + "epoch": 0.27, + "grad_norm": 1.6614712476730347, + "learning_rate": 3.6619892287168336e-05, + "loss": 5.7582, + "step": 5416 + }, + { + "epoch": 0.27, + "grad_norm": 1.9906529188156128, + "learning_rate": 3.661001037600672e-05, + "loss": 5.8577, + "step": 5420 + }, + { + "epoch": 0.27, + "grad_norm": 1.6746339797973633, + "learning_rate": 3.66001284648451e-05, + "loss": 5.6674, + "step": 5424 + }, + { + "epoch": 0.27, + "grad_norm": 2.0132009983062744, + "learning_rate": 3.659024655368348e-05, + "loss": 5.75, + "step": 5428 + }, + { + "epoch": 0.27, + "grad_norm": 1.6676075458526611, + "learning_rate": 3.6580364642521865e-05, + "loss": 5.7463, + "step": 5432 + }, + { + "epoch": 0.27, + "grad_norm": 1.644707441329956, + "learning_rate": 3.6570482731360247e-05, + "loss": 5.5246, + "step": 5436 + }, + { + "epoch": 0.27, + "grad_norm": 1.6207093000411987, + "learning_rate": 3.656060082019863e-05, + "loss": 5.6253, + "step": 5440 + }, + { + "epoch": 0.27, + "grad_norm": 1.9052598476409912, + "learning_rate": 3.6550718909037004e-05, + "loss": 5.6262, + "step": 5444 + }, + { + "epoch": 0.27, + "grad_norm": 1.976206660270691, + "learning_rate": 3.6540836997875386e-05, + "loss": 5.6091, + "step": 5448 + }, + { + "epoch": 0.27, + "grad_norm": 1.858864665031433, + "learning_rate": 3.653095508671377e-05, + "loss": 5.6392, + "step": 5452 + }, + { + "epoch": 0.27, + "grad_norm": 1.7667744159698486, + "learning_rate": 3.652107317555216e-05, + "loss": 5.7207, + "step": 5456 + }, + { + "epoch": 0.27, + "grad_norm": 1.4944398403167725, + "learning_rate": 3.651119126439054e-05, + "loss": 5.8588, + "step": 5460 + }, + { + "epoch": 0.27, + "grad_norm": 2.0682997703552246, + "learning_rate": 3.650130935322892e-05, + "loss": 5.7068, + "step": 5464 + }, + { + "epoch": 0.27, + "grad_norm": 1.81681489944458, + "learning_rate": 3.64914274420673e-05, + "loss": 5.6977, + "step": 5468 + }, + { + "epoch": 0.27, + "grad_norm": 1.6798757314682007, + "learning_rate": 3.648154553090568e-05, + "loss": 5.7233, + "step": 5472 + }, + { + "epoch": 0.27, + "grad_norm": 1.8230525255203247, + "learning_rate": 3.647166361974406e-05, + "loss": 5.7243, + "step": 5476 + }, + { + "epoch": 0.27, + "grad_norm": 1.4133111238479614, + "learning_rate": 3.646178170858244e-05, + "loss": 5.598, + "step": 5480 + }, + { + "epoch": 0.27, + "grad_norm": 1.9329841136932373, + "learning_rate": 3.6451899797420825e-05, + "loss": 5.5917, + "step": 5484 + }, + { + "epoch": 0.27, + "grad_norm": 1.5919127464294434, + "learning_rate": 3.6442017886259207e-05, + "loss": 5.6524, + "step": 5488 + }, + { + "epoch": 0.27, + "grad_norm": 1.8038116693496704, + "learning_rate": 3.643213597509759e-05, + "loss": 5.6005, + "step": 5492 + }, + { + "epoch": 0.27, + "grad_norm": 1.781088948249817, + "learning_rate": 3.642225406393597e-05, + "loss": 5.6194, + "step": 5496 + }, + { + "epoch": 0.27, + "grad_norm": 2.1184487342834473, + "learning_rate": 3.6412372152774346e-05, + "loss": 5.7654, + "step": 5500 + }, + { + "epoch": 0.27, + "grad_norm": 1.6980026960372925, + "learning_rate": 3.640249024161273e-05, + "loss": 5.6293, + "step": 5504 + }, + { + "epoch": 0.27, + "grad_norm": 1.9863158464431763, + "learning_rate": 3.639260833045111e-05, + "loss": 5.852, + "step": 5508 + }, + { + "epoch": 0.27, + "grad_norm": 2.160196542739868, + "learning_rate": 3.638272641928949e-05, + "loss": 5.6083, + "step": 5512 + }, + { + "epoch": 0.27, + "grad_norm": 1.8680537939071655, + "learning_rate": 3.6372844508127874e-05, + "loss": 5.6863, + "step": 5516 + }, + { + "epoch": 0.27, + "grad_norm": 2.162466526031494, + "learning_rate": 3.6362962596966256e-05, + "loss": 5.7776, + "step": 5520 + }, + { + "epoch": 0.27, + "grad_norm": 1.7698131799697876, + "learning_rate": 3.635308068580464e-05, + "loss": 5.5958, + "step": 5524 + }, + { + "epoch": 0.27, + "grad_norm": 1.8697733879089355, + "learning_rate": 3.6343198774643014e-05, + "loss": 5.6393, + "step": 5528 + }, + { + "epoch": 0.27, + "grad_norm": 1.4883641004562378, + "learning_rate": 3.6333316863481396e-05, + "loss": 5.6008, + "step": 5532 + }, + { + "epoch": 0.27, + "grad_norm": 1.7563178539276123, + "learning_rate": 3.632343495231978e-05, + "loss": 5.7821, + "step": 5536 + }, + { + "epoch": 0.27, + "grad_norm": 1.8179692029953003, + "learning_rate": 3.631355304115816e-05, + "loss": 5.6816, + "step": 5540 + }, + { + "epoch": 0.27, + "grad_norm": 1.5550754070281982, + "learning_rate": 3.630367112999654e-05, + "loss": 5.6527, + "step": 5544 + }, + { + "epoch": 0.27, + "grad_norm": 1.9609134197235107, + "learning_rate": 3.6293789218834924e-05, + "loss": 5.7238, + "step": 5548 + }, + { + "epoch": 0.27, + "grad_norm": 1.6006996631622314, + "learning_rate": 3.6283907307673306e-05, + "loss": 5.6861, + "step": 5552 + }, + { + "epoch": 0.27, + "grad_norm": 1.9726330041885376, + "learning_rate": 3.627402539651169e-05, + "loss": 5.7777, + "step": 5556 + }, + { + "epoch": 0.27, + "grad_norm": 1.6247719526290894, + "learning_rate": 3.626414348535006e-05, + "loss": 5.7374, + "step": 5560 + }, + { + "epoch": 0.27, + "grad_norm": 1.60383141040802, + "learning_rate": 3.6254261574188445e-05, + "loss": 5.7191, + "step": 5564 + }, + { + "epoch": 0.28, + "grad_norm": 1.6703567504882812, + "learning_rate": 3.624437966302683e-05, + "loss": 5.5841, + "step": 5568 + }, + { + "epoch": 0.28, + "grad_norm": 1.7297996282577515, + "learning_rate": 3.6234497751865216e-05, + "loss": 5.7305, + "step": 5572 + }, + { + "epoch": 0.28, + "grad_norm": 1.7804654836654663, + "learning_rate": 3.62246158407036e-05, + "loss": 5.6864, + "step": 5576 + }, + { + "epoch": 0.28, + "grad_norm": 1.5437934398651123, + "learning_rate": 3.621473392954198e-05, + "loss": 5.7737, + "step": 5580 + }, + { + "epoch": 0.28, + "grad_norm": 1.8865420818328857, + "learning_rate": 3.6204852018380356e-05, + "loss": 5.6338, + "step": 5584 + }, + { + "epoch": 0.28, + "grad_norm": 2.0955584049224854, + "learning_rate": 3.619497010721874e-05, + "loss": 5.7677, + "step": 5588 + }, + { + "epoch": 0.28, + "grad_norm": 1.8487645387649536, + "learning_rate": 3.618508819605712e-05, + "loss": 5.6992, + "step": 5592 + }, + { + "epoch": 0.28, + "grad_norm": 1.652840495109558, + "learning_rate": 3.61752062848955e-05, + "loss": 5.5231, + "step": 5596 + }, + { + "epoch": 0.28, + "grad_norm": 1.615386724472046, + "learning_rate": 3.6165324373733884e-05, + "loss": 5.721, + "step": 5600 + }, + { + "epoch": 0.28, + "grad_norm": 1.860023856163025, + "learning_rate": 3.6155442462572266e-05, + "loss": 5.7149, + "step": 5604 + }, + { + "epoch": 0.28, + "grad_norm": 1.7823337316513062, + "learning_rate": 3.614556055141065e-05, + "loss": 5.7699, + "step": 5608 + }, + { + "epoch": 0.28, + "grad_norm": 1.9639289379119873, + "learning_rate": 3.613567864024902e-05, + "loss": 5.6564, + "step": 5612 + }, + { + "epoch": 0.28, + "grad_norm": 1.6741266250610352, + "learning_rate": 3.6125796729087405e-05, + "loss": 5.6792, + "step": 5616 + }, + { + "epoch": 0.28, + "grad_norm": 1.6273597478866577, + "learning_rate": 3.611591481792579e-05, + "loss": 5.6275, + "step": 5620 + }, + { + "epoch": 0.28, + "grad_norm": 1.663702368736267, + "learning_rate": 3.610603290676417e-05, + "loss": 5.6499, + "step": 5624 + }, + { + "epoch": 0.28, + "grad_norm": 1.3998336791992188, + "learning_rate": 3.609615099560255e-05, + "loss": 5.5426, + "step": 5628 + }, + { + "epoch": 0.28, + "grad_norm": 1.5529056787490845, + "learning_rate": 3.6086269084440934e-05, + "loss": 5.7016, + "step": 5632 + }, + { + "epoch": 0.28, + "grad_norm": 1.6349724531173706, + "learning_rate": 3.6076387173279316e-05, + "loss": 5.6359, + "step": 5636 + }, + { + "epoch": 0.28, + "grad_norm": 1.6268702745437622, + "learning_rate": 3.60665052621177e-05, + "loss": 5.68, + "step": 5640 + }, + { + "epoch": 0.28, + "grad_norm": 1.83133065700531, + "learning_rate": 3.605662335095607e-05, + "loss": 5.644, + "step": 5644 + }, + { + "epoch": 0.28, + "grad_norm": 1.7102351188659668, + "learning_rate": 3.6046741439794455e-05, + "loss": 5.7675, + "step": 5648 + }, + { + "epoch": 0.28, + "grad_norm": 1.901228666305542, + "learning_rate": 3.603685952863284e-05, + "loss": 5.7194, + "step": 5652 + }, + { + "epoch": 0.28, + "grad_norm": 1.9542912244796753, + "learning_rate": 3.602697761747122e-05, + "loss": 5.7244, + "step": 5656 + }, + { + "epoch": 0.28, + "grad_norm": 1.9861336946487427, + "learning_rate": 3.60170957063096e-05, + "loss": 5.5597, + "step": 5660 + }, + { + "epoch": 0.28, + "grad_norm": 1.6458784341812134, + "learning_rate": 3.600721379514798e-05, + "loss": 5.6276, + "step": 5664 + }, + { + "epoch": 0.28, + "grad_norm": 2.3147518634796143, + "learning_rate": 3.5997331883986365e-05, + "loss": 5.623, + "step": 5668 + }, + { + "epoch": 0.28, + "grad_norm": 1.6608690023422241, + "learning_rate": 3.598744997282474e-05, + "loss": 5.7577, + "step": 5672 + }, + { + "epoch": 0.28, + "grad_norm": 1.4289793968200684, + "learning_rate": 3.597756806166312e-05, + "loss": 5.6755, + "step": 5676 + }, + { + "epoch": 0.28, + "grad_norm": 1.8258533477783203, + "learning_rate": 3.5967686150501505e-05, + "loss": 5.6546, + "step": 5680 + }, + { + "epoch": 0.28, + "grad_norm": 1.589067816734314, + "learning_rate": 3.595780423933989e-05, + "loss": 5.6725, + "step": 5684 + }, + { + "epoch": 0.28, + "grad_norm": 1.7325801849365234, + "learning_rate": 3.5947922328178276e-05, + "loss": 5.7298, + "step": 5688 + }, + { + "epoch": 0.28, + "grad_norm": 1.7161133289337158, + "learning_rate": 3.593804041701666e-05, + "loss": 5.5812, + "step": 5692 + }, + { + "epoch": 0.28, + "grad_norm": 1.6219887733459473, + "learning_rate": 3.592815850585503e-05, + "loss": 5.5354, + "step": 5696 + }, + { + "epoch": 0.28, + "grad_norm": 2.099241256713867, + "learning_rate": 3.5918276594693415e-05, + "loss": 5.7611, + "step": 5700 + }, + { + "epoch": 0.28, + "grad_norm": 1.576979160308838, + "learning_rate": 3.59083946835318e-05, + "loss": 5.7276, + "step": 5704 + }, + { + "epoch": 0.28, + "grad_norm": 1.8135298490524292, + "learning_rate": 3.589851277237018e-05, + "loss": 5.6826, + "step": 5708 + }, + { + "epoch": 0.28, + "grad_norm": 1.8552438020706177, + "learning_rate": 3.588863086120856e-05, + "loss": 5.6452, + "step": 5712 + }, + { + "epoch": 0.28, + "grad_norm": 1.7755975723266602, + "learning_rate": 3.587874895004694e-05, + "loss": 5.7432, + "step": 5716 + }, + { + "epoch": 0.28, + "grad_norm": 1.728273868560791, + "learning_rate": 3.5868867038885325e-05, + "loss": 5.8236, + "step": 5720 + }, + { + "epoch": 0.28, + "grad_norm": 2.304983139038086, + "learning_rate": 3.585898512772371e-05, + "loss": 5.6947, + "step": 5724 + }, + { + "epoch": 0.28, + "grad_norm": 1.9431263208389282, + "learning_rate": 3.584910321656208e-05, + "loss": 5.6718, + "step": 5728 + }, + { + "epoch": 0.28, + "grad_norm": 1.6405879259109497, + "learning_rate": 3.5839221305400465e-05, + "loss": 5.7134, + "step": 5732 + }, + { + "epoch": 0.28, + "grad_norm": 1.6948162317276, + "learning_rate": 3.582933939423885e-05, + "loss": 5.6881, + "step": 5736 + }, + { + "epoch": 0.28, + "grad_norm": 1.9962084293365479, + "learning_rate": 3.581945748307723e-05, + "loss": 5.6576, + "step": 5740 + }, + { + "epoch": 0.28, + "grad_norm": 2.2845137119293213, + "learning_rate": 3.580957557191561e-05, + "loss": 5.6479, + "step": 5744 + }, + { + "epoch": 0.28, + "grad_norm": 1.9110279083251953, + "learning_rate": 3.579969366075399e-05, + "loss": 5.7157, + "step": 5748 + }, + { + "epoch": 0.28, + "grad_norm": 1.9124606847763062, + "learning_rate": 3.5789811749592375e-05, + "loss": 5.8124, + "step": 5752 + }, + { + "epoch": 0.28, + "grad_norm": 1.4642013311386108, + "learning_rate": 3.577992983843075e-05, + "loss": 5.7739, + "step": 5756 + }, + { + "epoch": 0.28, + "grad_norm": 1.943257451057434, + "learning_rate": 3.577004792726913e-05, + "loss": 5.6817, + "step": 5760 + }, + { + "epoch": 0.28, + "grad_norm": 1.9670331478118896, + "learning_rate": 3.5760166016107515e-05, + "loss": 5.5489, + "step": 5764 + }, + { + "epoch": 0.28, + "grad_norm": 1.63418447971344, + "learning_rate": 3.5750284104945897e-05, + "loss": 5.5785, + "step": 5768 + }, + { + "epoch": 0.29, + "grad_norm": 1.7041114568710327, + "learning_rate": 3.574040219378428e-05, + "loss": 5.7375, + "step": 5772 + }, + { + "epoch": 0.29, + "grad_norm": 2.0340771675109863, + "learning_rate": 3.573052028262266e-05, + "loss": 5.6921, + "step": 5776 + }, + { + "epoch": 0.29, + "grad_norm": 1.7758631706237793, + "learning_rate": 3.572063837146104e-05, + "loss": 5.688, + "step": 5780 + }, + { + "epoch": 0.29, + "grad_norm": 1.6245428323745728, + "learning_rate": 3.5710756460299425e-05, + "loss": 5.7504, + "step": 5784 + }, + { + "epoch": 0.29, + "grad_norm": 1.6857564449310303, + "learning_rate": 3.57008745491378e-05, + "loss": 5.5688, + "step": 5788 + }, + { + "epoch": 0.29, + "grad_norm": 1.740413784980774, + "learning_rate": 3.569099263797618e-05, + "loss": 5.6504, + "step": 5792 + }, + { + "epoch": 0.29, + "grad_norm": 1.8830409049987793, + "learning_rate": 3.5681110726814564e-05, + "loss": 5.6287, + "step": 5796 + }, + { + "epoch": 0.29, + "grad_norm": 1.8001620769500732, + "learning_rate": 3.567122881565295e-05, + "loss": 5.8408, + "step": 5800 + }, + { + "epoch": 0.29, + "grad_norm": 1.518395185470581, + "learning_rate": 3.5661346904491335e-05, + "loss": 5.6564, + "step": 5804 + }, + { + "epoch": 0.29, + "grad_norm": 1.641120433807373, + "learning_rate": 3.565146499332972e-05, + "loss": 5.6351, + "step": 5808 + }, + { + "epoch": 0.29, + "grad_norm": 2.126620292663574, + "learning_rate": 3.564158308216809e-05, + "loss": 5.6653, + "step": 5812 + }, + { + "epoch": 0.29, + "grad_norm": 2.0950162410736084, + "learning_rate": 3.5631701171006475e-05, + "loss": 5.63, + "step": 5816 + }, + { + "epoch": 0.29, + "grad_norm": 1.8449667692184448, + "learning_rate": 3.5621819259844857e-05, + "loss": 5.6507, + "step": 5820 + }, + { + "epoch": 0.29, + "grad_norm": 1.8015673160552979, + "learning_rate": 3.561193734868324e-05, + "loss": 5.6993, + "step": 5824 + }, + { + "epoch": 0.29, + "grad_norm": 1.9131306409835815, + "learning_rate": 3.560205543752162e-05, + "loss": 5.8073, + "step": 5828 + }, + { + "epoch": 0.29, + "grad_norm": 1.5919278860092163, + "learning_rate": 3.559217352636e-05, + "loss": 5.5955, + "step": 5832 + }, + { + "epoch": 0.29, + "grad_norm": 1.8440625667572021, + "learning_rate": 3.5582291615198385e-05, + "loss": 5.7592, + "step": 5836 + }, + { + "epoch": 0.29, + "grad_norm": 1.8236738443374634, + "learning_rate": 3.557240970403676e-05, + "loss": 5.7716, + "step": 5840 + }, + { + "epoch": 0.29, + "grad_norm": 1.5875698328018188, + "learning_rate": 3.556252779287514e-05, + "loss": 5.6328, + "step": 5844 + }, + { + "epoch": 0.29, + "grad_norm": 1.722981572151184, + "learning_rate": 3.5552645881713524e-05, + "loss": 5.576, + "step": 5848 + }, + { + "epoch": 0.29, + "grad_norm": 1.9150844812393188, + "learning_rate": 3.5542763970551906e-05, + "loss": 5.6884, + "step": 5852 + }, + { + "epoch": 0.29, + "grad_norm": 2.071272373199463, + "learning_rate": 3.553288205939029e-05, + "loss": 5.6853, + "step": 5856 + }, + { + "epoch": 0.29, + "grad_norm": 1.585911512374878, + "learning_rate": 3.552300014822867e-05, + "loss": 5.8361, + "step": 5860 + }, + { + "epoch": 0.29, + "grad_norm": 1.6303049325942993, + "learning_rate": 3.551311823706705e-05, + "loss": 5.6433, + "step": 5864 + }, + { + "epoch": 0.29, + "grad_norm": 1.914119839668274, + "learning_rate": 3.5503236325905435e-05, + "loss": 5.6434, + "step": 5868 + }, + { + "epoch": 0.29, + "grad_norm": 1.6995179653167725, + "learning_rate": 3.549335441474381e-05, + "loss": 5.517, + "step": 5872 + }, + { + "epoch": 0.29, + "grad_norm": 1.6506705284118652, + "learning_rate": 3.548347250358219e-05, + "loss": 5.7451, + "step": 5876 + }, + { + "epoch": 0.29, + "grad_norm": 1.7888357639312744, + "learning_rate": 3.5473590592420574e-05, + "loss": 5.7428, + "step": 5880 + }, + { + "epoch": 0.29, + "grad_norm": 1.5513304471969604, + "learning_rate": 3.5463708681258956e-05, + "loss": 5.6993, + "step": 5884 + }, + { + "epoch": 0.29, + "grad_norm": 1.9307150840759277, + "learning_rate": 3.545382677009734e-05, + "loss": 5.7173, + "step": 5888 + }, + { + "epoch": 0.29, + "grad_norm": 1.4725440740585327, + "learning_rate": 3.544394485893572e-05, + "loss": 5.7675, + "step": 5892 + }, + { + "epoch": 0.29, + "grad_norm": 2.1401283740997314, + "learning_rate": 3.54340629477741e-05, + "loss": 5.6074, + "step": 5896 + }, + { + "epoch": 0.29, + "grad_norm": 1.7221059799194336, + "learning_rate": 3.542418103661248e-05, + "loss": 5.5691, + "step": 5900 + }, + { + "epoch": 0.29, + "grad_norm": 2.4511234760284424, + "learning_rate": 3.541429912545086e-05, + "loss": 5.8371, + "step": 5904 + }, + { + "epoch": 0.29, + "grad_norm": 1.6160163879394531, + "learning_rate": 3.540441721428924e-05, + "loss": 5.5454, + "step": 5908 + }, + { + "epoch": 0.29, + "grad_norm": 1.6199833154678345, + "learning_rate": 3.5394535303127624e-05, + "loss": 5.618, + "step": 5912 + }, + { + "epoch": 0.29, + "grad_norm": 1.7704286575317383, + "learning_rate": 3.538465339196601e-05, + "loss": 5.6579, + "step": 5916 + }, + { + "epoch": 0.29, + "grad_norm": 2.315131425857544, + "learning_rate": 3.5374771480804395e-05, + "loss": 5.6309, + "step": 5920 + }, + { + "epoch": 0.29, + "grad_norm": 1.7877010107040405, + "learning_rate": 3.536488956964277e-05, + "loss": 5.7664, + "step": 5924 + }, + { + "epoch": 0.29, + "grad_norm": 1.655348777770996, + "learning_rate": 3.535500765848115e-05, + "loss": 5.7179, + "step": 5928 + }, + { + "epoch": 0.29, + "grad_norm": 2.0273170471191406, + "learning_rate": 3.5345125747319534e-05, + "loss": 5.6274, + "step": 5932 + }, + { + "epoch": 0.29, + "grad_norm": 1.6776584386825562, + "learning_rate": 3.5335243836157916e-05, + "loss": 5.722, + "step": 5936 + }, + { + "epoch": 0.29, + "grad_norm": 1.862793207168579, + "learning_rate": 3.53253619249963e-05, + "loss": 5.6436, + "step": 5940 + }, + { + "epoch": 0.29, + "grad_norm": 1.7735410928726196, + "learning_rate": 3.531548001383468e-05, + "loss": 5.5571, + "step": 5944 + }, + { + "epoch": 0.29, + "grad_norm": 1.8442201614379883, + "learning_rate": 3.530559810267306e-05, + "loss": 5.5682, + "step": 5948 + }, + { + "epoch": 0.29, + "grad_norm": 1.9948362112045288, + "learning_rate": 3.5295716191511444e-05, + "loss": 5.6108, + "step": 5952 + }, + { + "epoch": 0.29, + "grad_norm": 1.7667597532272339, + "learning_rate": 3.528583428034982e-05, + "loss": 5.6617, + "step": 5956 + }, + { + "epoch": 0.29, + "grad_norm": 1.569806456565857, + "learning_rate": 3.52759523691882e-05, + "loss": 5.7098, + "step": 5960 + }, + { + "epoch": 0.29, + "grad_norm": 1.666054129600525, + "learning_rate": 3.5266070458026584e-05, + "loss": 5.6734, + "step": 5964 + }, + { + "epoch": 0.29, + "grad_norm": 1.6367340087890625, + "learning_rate": 3.5256188546864966e-05, + "loss": 5.6654, + "step": 5968 + }, + { + "epoch": 0.3, + "grad_norm": 1.5181485414505005, + "learning_rate": 3.524630663570335e-05, + "loss": 5.6392, + "step": 5972 + }, + { + "epoch": 0.3, + "grad_norm": 2.067699432373047, + "learning_rate": 3.523642472454173e-05, + "loss": 5.6753, + "step": 5976 + }, + { + "epoch": 0.3, + "grad_norm": 1.6809829473495483, + "learning_rate": 3.522654281338011e-05, + "loss": 5.6829, + "step": 5980 + }, + { + "epoch": 0.3, + "grad_norm": 1.5863231420516968, + "learning_rate": 3.521666090221849e-05, + "loss": 5.5417, + "step": 5984 + }, + { + "epoch": 0.3, + "grad_norm": 1.9677083492279053, + "learning_rate": 3.520677899105687e-05, + "loss": 5.6524, + "step": 5988 + }, + { + "epoch": 0.3, + "grad_norm": 1.5074371099472046, + "learning_rate": 3.519689707989525e-05, + "loss": 5.7254, + "step": 5992 + }, + { + "epoch": 0.3, + "grad_norm": 1.552756667137146, + "learning_rate": 3.518701516873363e-05, + "loss": 5.5925, + "step": 5996 + }, + { + "epoch": 0.3, + "grad_norm": 1.689927101135254, + "learning_rate": 3.5177133257572015e-05, + "loss": 5.629, + "step": 6000 + }, + { + "epoch": 0.3, + "grad_norm": 1.6513864994049072, + "learning_rate": 3.51672513464104e-05, + "loss": 5.7166, + "step": 6004 + }, + { + "epoch": 0.3, + "grad_norm": 1.8767340183258057, + "learning_rate": 3.515736943524878e-05, + "loss": 5.7523, + "step": 6008 + }, + { + "epoch": 0.3, + "grad_norm": 2.079411029815674, + "learning_rate": 3.5147487524087155e-05, + "loss": 5.7882, + "step": 6012 + }, + { + "epoch": 0.3, + "grad_norm": 1.992756962776184, + "learning_rate": 3.513760561292554e-05, + "loss": 5.587, + "step": 6016 + }, + { + "epoch": 0.3, + "grad_norm": 1.477186679840088, + "learning_rate": 3.512772370176392e-05, + "loss": 5.7238, + "step": 6020 + }, + { + "epoch": 0.3, + "grad_norm": 1.6832951307296753, + "learning_rate": 3.51178417906023e-05, + "loss": 5.627, + "step": 6024 + }, + { + "epoch": 0.3, + "grad_norm": 1.8202719688415527, + "learning_rate": 3.510795987944068e-05, + "loss": 5.6409, + "step": 6028 + }, + { + "epoch": 0.3, + "grad_norm": 1.7379246950149536, + "learning_rate": 3.509807796827907e-05, + "loss": 5.6045, + "step": 6032 + }, + { + "epoch": 0.3, + "grad_norm": 1.7630070447921753, + "learning_rate": 3.5088196057117454e-05, + "loss": 5.6474, + "step": 6036 + }, + { + "epoch": 0.3, + "grad_norm": 1.8432222604751587, + "learning_rate": 3.507831414595583e-05, + "loss": 5.694, + "step": 6040 + }, + { + "epoch": 0.3, + "grad_norm": 2.1126575469970703, + "learning_rate": 3.506843223479421e-05, + "loss": 5.5952, + "step": 6044 + }, + { + "epoch": 0.3, + "grad_norm": 1.8852741718292236, + "learning_rate": 3.5058550323632593e-05, + "loss": 5.4868, + "step": 6048 + }, + { + "epoch": 0.3, + "grad_norm": 2.05767560005188, + "learning_rate": 3.5048668412470975e-05, + "loss": 5.6139, + "step": 6052 + }, + { + "epoch": 0.3, + "grad_norm": 1.8169785737991333, + "learning_rate": 3.503878650130936e-05, + "loss": 5.6821, + "step": 6056 + }, + { + "epoch": 0.3, + "grad_norm": 1.8394947052001953, + "learning_rate": 3.502890459014774e-05, + "loss": 5.7144, + "step": 6060 + }, + { + "epoch": 0.3, + "grad_norm": 1.533893346786499, + "learning_rate": 3.501902267898612e-05, + "loss": 5.6372, + "step": 6064 + }, + { + "epoch": 0.3, + "grad_norm": 1.7650554180145264, + "learning_rate": 3.50091407678245e-05, + "loss": 5.7372, + "step": 6068 + }, + { + "epoch": 0.3, + "grad_norm": 1.7776150703430176, + "learning_rate": 3.499925885666288e-05, + "loss": 5.5772, + "step": 6072 + }, + { + "epoch": 0.3, + "grad_norm": 1.804993987083435, + "learning_rate": 3.498937694550126e-05, + "loss": 5.6967, + "step": 6076 + }, + { + "epoch": 0.3, + "grad_norm": 2.0517046451568604, + "learning_rate": 3.497949503433964e-05, + "loss": 5.7905, + "step": 6080 + }, + { + "epoch": 0.3, + "grad_norm": 1.6444928646087646, + "learning_rate": 3.4969613123178025e-05, + "loss": 5.5937, + "step": 6084 + }, + { + "epoch": 0.3, + "grad_norm": 2.079911231994629, + "learning_rate": 3.495973121201641e-05, + "loss": 5.6773, + "step": 6088 + }, + { + "epoch": 0.3, + "grad_norm": 1.929654598236084, + "learning_rate": 3.494984930085479e-05, + "loss": 5.7453, + "step": 6092 + }, + { + "epoch": 0.3, + "grad_norm": 2.0268120765686035, + "learning_rate": 3.4939967389693165e-05, + "loss": 5.6351, + "step": 6096 + }, + { + "epoch": 0.3, + "grad_norm": 1.7415413856506348, + "learning_rate": 3.493008547853155e-05, + "loss": 5.5666, + "step": 6100 + }, + { + "epoch": 0.3, + "grad_norm": 1.8862464427947998, + "learning_rate": 3.492020356736993e-05, + "loss": 5.7066, + "step": 6104 + }, + { + "epoch": 0.3, + "grad_norm": 1.8561365604400635, + "learning_rate": 3.491032165620831e-05, + "loss": 5.6259, + "step": 6108 + }, + { + "epoch": 0.3, + "grad_norm": 1.681982398033142, + "learning_rate": 3.490043974504669e-05, + "loss": 5.5725, + "step": 6112 + }, + { + "epoch": 0.3, + "grad_norm": 1.4180033206939697, + "learning_rate": 3.4890557833885075e-05, + "loss": 5.6196, + "step": 6116 + }, + { + "epoch": 0.3, + "grad_norm": 1.7663787603378296, + "learning_rate": 3.488067592272346e-05, + "loss": 5.617, + "step": 6120 + }, + { + "epoch": 0.3, + "grad_norm": 1.6531801223754883, + "learning_rate": 3.487079401156184e-05, + "loss": 5.5927, + "step": 6124 + }, + { + "epoch": 0.3, + "grad_norm": 1.739673376083374, + "learning_rate": 3.4860912100400214e-05, + "loss": 5.6717, + "step": 6128 + }, + { + "epoch": 0.3, + "grad_norm": 2.0703117847442627, + "learning_rate": 3.4851030189238596e-05, + "loss": 5.6708, + "step": 6132 + }, + { + "epoch": 0.3, + "grad_norm": 1.5977106094360352, + "learning_rate": 3.484114827807698e-05, + "loss": 5.6917, + "step": 6136 + }, + { + "epoch": 0.3, + "grad_norm": 1.7111542224884033, + "learning_rate": 3.483126636691536e-05, + "loss": 5.5998, + "step": 6140 + }, + { + "epoch": 0.3, + "grad_norm": 2.138233184814453, + "learning_rate": 3.482138445575375e-05, + "loss": 5.6539, + "step": 6144 + }, + { + "epoch": 0.3, + "grad_norm": 1.5501015186309814, + "learning_rate": 3.481150254459213e-05, + "loss": 5.6418, + "step": 6148 + }, + { + "epoch": 0.3, + "grad_norm": 1.7464877367019653, + "learning_rate": 3.480162063343051e-05, + "loss": 5.5607, + "step": 6152 + }, + { + "epoch": 0.3, + "grad_norm": 2.0313351154327393, + "learning_rate": 3.479173872226889e-05, + "loss": 5.7242, + "step": 6156 + }, + { + "epoch": 0.3, + "grad_norm": 2.1652917861938477, + "learning_rate": 3.478185681110727e-05, + "loss": 5.6722, + "step": 6160 + }, + { + "epoch": 0.3, + "grad_norm": 2.6227622032165527, + "learning_rate": 3.477197489994565e-05, + "loss": 5.5609, + "step": 6164 + }, + { + "epoch": 0.3, + "grad_norm": 2.144803047180176, + "learning_rate": 3.4762092988784035e-05, + "loss": 5.6724, + "step": 6168 + }, + { + "epoch": 0.3, + "grad_norm": 2.0227320194244385, + "learning_rate": 3.475221107762242e-05, + "loss": 5.6686, + "step": 6172 + }, + { + "epoch": 0.31, + "grad_norm": 1.794042944908142, + "learning_rate": 3.47423291664608e-05, + "loss": 5.6792, + "step": 6176 + }, + { + "epoch": 0.31, + "grad_norm": 1.6213692426681519, + "learning_rate": 3.4732447255299174e-05, + "loss": 5.6446, + "step": 6180 + }, + { + "epoch": 0.31, + "grad_norm": 1.9633342027664185, + "learning_rate": 3.4722565344137556e-05, + "loss": 5.6725, + "step": 6184 + }, + { + "epoch": 0.31, + "grad_norm": 1.4858800172805786, + "learning_rate": 3.471268343297594e-05, + "loss": 5.7152, + "step": 6188 + }, + { + "epoch": 0.31, + "grad_norm": 1.4575623273849487, + "learning_rate": 3.470280152181432e-05, + "loss": 5.6749, + "step": 6192 + }, + { + "epoch": 0.31, + "grad_norm": 2.1202147006988525, + "learning_rate": 3.46929196106527e-05, + "loss": 5.7791, + "step": 6196 + }, + { + "epoch": 0.31, + "grad_norm": 1.9096757173538208, + "learning_rate": 3.4683037699491085e-05, + "loss": 5.4813, + "step": 6200 + }, + { + "epoch": 0.31, + "grad_norm": 2.282806873321533, + "learning_rate": 3.467315578832947e-05, + "loss": 5.6761, + "step": 6204 + }, + { + "epoch": 0.31, + "grad_norm": 1.6041266918182373, + "learning_rate": 3.466327387716785e-05, + "loss": 5.7067, + "step": 6208 + }, + { + "epoch": 0.31, + "grad_norm": 1.9719665050506592, + "learning_rate": 3.4653391966006224e-05, + "loss": 5.6565, + "step": 6212 + }, + { + "epoch": 0.31, + "grad_norm": 1.8574167490005493, + "learning_rate": 3.4643510054844606e-05, + "loss": 5.6941, + "step": 6216 + }, + { + "epoch": 0.31, + "grad_norm": 1.8125001192092896, + "learning_rate": 3.463362814368299e-05, + "loss": 5.6066, + "step": 6220 + }, + { + "epoch": 0.31, + "grad_norm": 1.7597132921218872, + "learning_rate": 3.462374623252137e-05, + "loss": 5.7592, + "step": 6224 + }, + { + "epoch": 0.31, + "grad_norm": 2.0065159797668457, + "learning_rate": 3.461386432135975e-05, + "loss": 5.7911, + "step": 6228 + }, + { + "epoch": 0.31, + "grad_norm": 2.03913950920105, + "learning_rate": 3.4603982410198134e-05, + "loss": 5.6571, + "step": 6232 + }, + { + "epoch": 0.31, + "grad_norm": 2.114107608795166, + "learning_rate": 3.4594100499036516e-05, + "loss": 5.7014, + "step": 6236 + }, + { + "epoch": 0.31, + "grad_norm": 1.7447437047958374, + "learning_rate": 3.458421858787489e-05, + "loss": 5.712, + "step": 6240 + }, + { + "epoch": 0.31, + "grad_norm": 1.8154895305633545, + "learning_rate": 3.4574336676713274e-05, + "loss": 5.7241, + "step": 6244 + }, + { + "epoch": 0.31, + "grad_norm": 2.1502370834350586, + "learning_rate": 3.4564454765551656e-05, + "loss": 5.6274, + "step": 6248 + }, + { + "epoch": 0.31, + "grad_norm": 1.6285412311553955, + "learning_rate": 3.455457285439004e-05, + "loss": 5.6201, + "step": 6252 + }, + { + "epoch": 0.31, + "grad_norm": 1.825987696647644, + "learning_rate": 3.454469094322842e-05, + "loss": 5.5851, + "step": 6256 + }, + { + "epoch": 0.31, + "grad_norm": 2.224214553833008, + "learning_rate": 3.453480903206681e-05, + "loss": 5.7344, + "step": 6260 + }, + { + "epoch": 0.31, + "grad_norm": 1.8334683179855347, + "learning_rate": 3.4524927120905184e-05, + "loss": 5.6966, + "step": 6264 + }, + { + "epoch": 0.31, + "grad_norm": 1.599949598312378, + "learning_rate": 3.4515045209743566e-05, + "loss": 5.6679, + "step": 6268 + }, + { + "epoch": 0.31, + "grad_norm": 1.9811068773269653, + "learning_rate": 3.450516329858195e-05, + "loss": 5.7008, + "step": 6272 + }, + { + "epoch": 0.31, + "grad_norm": 2.3280673027038574, + "learning_rate": 3.449528138742033e-05, + "loss": 5.7588, + "step": 6276 + }, + { + "epoch": 0.31, + "grad_norm": 1.8196971416473389, + "learning_rate": 3.448539947625871e-05, + "loss": 5.6388, + "step": 6280 + }, + { + "epoch": 0.31, + "grad_norm": 1.4781452417373657, + "learning_rate": 3.4475517565097094e-05, + "loss": 5.6588, + "step": 6284 + }, + { + "epoch": 0.31, + "grad_norm": 2.2682340145111084, + "learning_rate": 3.4465635653935476e-05, + "loss": 5.6705, + "step": 6288 + }, + { + "epoch": 0.31, + "grad_norm": 1.7862980365753174, + "learning_rate": 3.445575374277386e-05, + "loss": 5.6001, + "step": 6292 + }, + { + "epoch": 0.31, + "grad_norm": 1.870069980621338, + "learning_rate": 3.4445871831612234e-05, + "loss": 5.6938, + "step": 6296 + }, + { + "epoch": 0.31, + "grad_norm": 2.120589017868042, + "learning_rate": 3.4435989920450616e-05, + "loss": 5.633, + "step": 6300 + }, + { + "epoch": 0.31, + "grad_norm": 1.6841241121292114, + "learning_rate": 3.4426108009289e-05, + "loss": 5.6739, + "step": 6304 + }, + { + "epoch": 0.31, + "grad_norm": 1.8620730638504028, + "learning_rate": 3.441622609812738e-05, + "loss": 5.6794, + "step": 6308 + }, + { + "epoch": 0.31, + "grad_norm": 1.8764095306396484, + "learning_rate": 3.440634418696576e-05, + "loss": 5.724, + "step": 6312 + }, + { + "epoch": 0.31, + "grad_norm": 1.4684795141220093, + "learning_rate": 3.4396462275804144e-05, + "loss": 5.6413, + "step": 6316 + }, + { + "epoch": 0.31, + "grad_norm": 1.5952019691467285, + "learning_rate": 3.4386580364642526e-05, + "loss": 5.6075, + "step": 6320 + }, + { + "epoch": 0.31, + "grad_norm": 1.8536378145217896, + "learning_rate": 3.43766984534809e-05, + "loss": 5.6342, + "step": 6324 + }, + { + "epoch": 0.31, + "grad_norm": 1.8524278402328491, + "learning_rate": 3.4366816542319283e-05, + "loss": 5.6754, + "step": 6328 + }, + { + "epoch": 0.31, + "grad_norm": 1.715560793876648, + "learning_rate": 3.4356934631157665e-05, + "loss": 5.7763, + "step": 6332 + }, + { + "epoch": 0.31, + "grad_norm": 1.8335529565811157, + "learning_rate": 3.434705271999605e-05, + "loss": 5.6376, + "step": 6336 + }, + { + "epoch": 0.31, + "grad_norm": 1.4588648080825806, + "learning_rate": 3.433717080883443e-05, + "loss": 5.6589, + "step": 6340 + }, + { + "epoch": 0.31, + "grad_norm": 2.085669755935669, + "learning_rate": 3.432728889767281e-05, + "loss": 5.6664, + "step": 6344 + }, + { + "epoch": 0.31, + "grad_norm": 1.9969209432601929, + "learning_rate": 3.4317406986511194e-05, + "loss": 5.6533, + "step": 6348 + }, + { + "epoch": 0.31, + "grad_norm": 2.169795513153076, + "learning_rate": 3.4307525075349576e-05, + "loss": 5.7483, + "step": 6352 + }, + { + "epoch": 0.31, + "grad_norm": 1.6342527866363525, + "learning_rate": 3.429764316418795e-05, + "loss": 5.6761, + "step": 6356 + }, + { + "epoch": 0.31, + "grad_norm": 1.7355843782424927, + "learning_rate": 3.428776125302633e-05, + "loss": 5.7214, + "step": 6360 + }, + { + "epoch": 0.31, + "grad_norm": 1.6192421913146973, + "learning_rate": 3.4277879341864715e-05, + "loss": 5.629, + "step": 6364 + }, + { + "epoch": 0.31, + "grad_norm": 1.8149397373199463, + "learning_rate": 3.42679974307031e-05, + "loss": 5.6567, + "step": 6368 + }, + { + "epoch": 0.31, + "grad_norm": 1.9818474054336548, + "learning_rate": 3.425811551954148e-05, + "loss": 5.6311, + "step": 6372 + }, + { + "epoch": 0.32, + "grad_norm": 1.8166816234588623, + "learning_rate": 3.424823360837987e-05, + "loss": 5.6253, + "step": 6376 + }, + { + "epoch": 0.32, + "grad_norm": 1.760532021522522, + "learning_rate": 3.4238351697218243e-05, + "loss": 5.5845, + "step": 6380 + }, + { + "epoch": 0.32, + "grad_norm": 2.153517961502075, + "learning_rate": 3.4228469786056625e-05, + "loss": 5.5968, + "step": 6384 + }, + { + "epoch": 0.32, + "grad_norm": 1.7822462320327759, + "learning_rate": 3.421858787489501e-05, + "loss": 5.547, + "step": 6388 + }, + { + "epoch": 0.32, + "grad_norm": 1.9486993551254272, + "learning_rate": 3.420870596373339e-05, + "loss": 5.7527, + "step": 6392 + }, + { + "epoch": 0.32, + "grad_norm": 1.8310834169387817, + "learning_rate": 3.419882405257177e-05, + "loss": 5.6288, + "step": 6396 + }, + { + "epoch": 0.32, + "grad_norm": 1.9566413164138794, + "learning_rate": 3.4188942141410154e-05, + "loss": 5.5835, + "step": 6400 + }, + { + "epoch": 0.32, + "grad_norm": 1.8529417514801025, + "learning_rate": 3.4179060230248536e-05, + "loss": 5.6979, + "step": 6404 + }, + { + "epoch": 0.32, + "grad_norm": 1.4912470579147339, + "learning_rate": 3.416917831908691e-05, + "loss": 5.6043, + "step": 6408 + }, + { + "epoch": 0.32, + "grad_norm": 1.7634273767471313, + "learning_rate": 3.415929640792529e-05, + "loss": 5.7066, + "step": 6412 + }, + { + "epoch": 0.32, + "grad_norm": 1.8041954040527344, + "learning_rate": 3.4149414496763675e-05, + "loss": 5.6787, + "step": 6416 + }, + { + "epoch": 0.32, + "grad_norm": 1.9450712203979492, + "learning_rate": 3.413953258560206e-05, + "loss": 5.6414, + "step": 6420 + }, + { + "epoch": 0.32, + "grad_norm": 2.1971383094787598, + "learning_rate": 3.412965067444044e-05, + "loss": 5.6731, + "step": 6424 + }, + { + "epoch": 0.32, + "grad_norm": 1.7693517208099365, + "learning_rate": 3.411976876327882e-05, + "loss": 5.6599, + "step": 6428 + }, + { + "epoch": 0.32, + "grad_norm": 2.282921075820923, + "learning_rate": 3.4109886852117203e-05, + "loss": 5.6397, + "step": 6432 + }, + { + "epoch": 0.32, + "grad_norm": 1.8509403467178345, + "learning_rate": 3.4100004940955585e-05, + "loss": 5.6985, + "step": 6436 + }, + { + "epoch": 0.32, + "grad_norm": 1.8916041851043701, + "learning_rate": 3.409012302979396e-05, + "loss": 5.6531, + "step": 6440 + }, + { + "epoch": 0.32, + "grad_norm": 1.615857481956482, + "learning_rate": 3.408024111863234e-05, + "loss": 5.605, + "step": 6444 + }, + { + "epoch": 0.32, + "grad_norm": 1.6781501770019531, + "learning_rate": 3.4070359207470725e-05, + "loss": 5.6553, + "step": 6448 + }, + { + "epoch": 0.32, + "grad_norm": 1.7929623126983643, + "learning_rate": 3.406047729630911e-05, + "loss": 5.6905, + "step": 6452 + }, + { + "epoch": 0.32, + "grad_norm": 1.9167546033859253, + "learning_rate": 3.405059538514749e-05, + "loss": 5.614, + "step": 6456 + }, + { + "epoch": 0.32, + "grad_norm": 1.7820324897766113, + "learning_rate": 3.404071347398587e-05, + "loss": 5.6548, + "step": 6460 + }, + { + "epoch": 0.32, + "grad_norm": 2.117344379425049, + "learning_rate": 3.403083156282425e-05, + "loss": 5.6407, + "step": 6464 + }, + { + "epoch": 0.32, + "grad_norm": 1.829023838043213, + "learning_rate": 3.402094965166263e-05, + "loss": 5.5939, + "step": 6468 + }, + { + "epoch": 0.32, + "grad_norm": 1.7248526811599731, + "learning_rate": 3.401106774050101e-05, + "loss": 5.8424, + "step": 6472 + }, + { + "epoch": 0.32, + "grad_norm": 1.576296329498291, + "learning_rate": 3.400118582933939e-05, + "loss": 5.5122, + "step": 6476 + }, + { + "epoch": 0.32, + "grad_norm": 1.5682412385940552, + "learning_rate": 3.3991303918177775e-05, + "loss": 5.6437, + "step": 6480 + }, + { + "epoch": 0.32, + "grad_norm": 1.5419074296951294, + "learning_rate": 3.398142200701616e-05, + "loss": 5.6232, + "step": 6484 + }, + { + "epoch": 0.32, + "grad_norm": 2.059434652328491, + "learning_rate": 3.3971540095854545e-05, + "loss": 5.6276, + "step": 6488 + }, + { + "epoch": 0.32, + "grad_norm": 1.67318856716156, + "learning_rate": 3.396165818469292e-05, + "loss": 5.6936, + "step": 6492 + }, + { + "epoch": 0.32, + "grad_norm": 1.6248462200164795, + "learning_rate": 3.39517762735313e-05, + "loss": 5.6305, + "step": 6496 + }, + { + "epoch": 0.32, + "grad_norm": 1.914443016052246, + "learning_rate": 3.3941894362369685e-05, + "loss": 5.5594, + "step": 6500 + }, + { + "epoch": 0.32, + "grad_norm": 1.6861499547958374, + "learning_rate": 3.393201245120807e-05, + "loss": 5.6522, + "step": 6504 + }, + { + "epoch": 0.32, + "grad_norm": 1.8825210332870483, + "learning_rate": 3.392213054004645e-05, + "loss": 5.7009, + "step": 6508 + }, + { + "epoch": 0.32, + "grad_norm": 1.8397966623306274, + "learning_rate": 3.391224862888483e-05, + "loss": 5.5723, + "step": 6512 + }, + { + "epoch": 0.32, + "grad_norm": 1.7531960010528564, + "learning_rate": 3.390236671772321e-05, + "loss": 5.7406, + "step": 6516 + }, + { + "epoch": 0.32, + "grad_norm": 1.8257700204849243, + "learning_rate": 3.3892484806561595e-05, + "loss": 5.6363, + "step": 6520 + }, + { + "epoch": 0.32, + "grad_norm": 1.8056398630142212, + "learning_rate": 3.388260289539997e-05, + "loss": 5.6686, + "step": 6524 + }, + { + "epoch": 0.32, + "grad_norm": 1.9961930513381958, + "learning_rate": 3.387272098423835e-05, + "loss": 5.5821, + "step": 6528 + }, + { + "epoch": 0.32, + "grad_norm": 1.7457338571548462, + "learning_rate": 3.3862839073076735e-05, + "loss": 5.7391, + "step": 6532 + }, + { + "epoch": 0.32, + "grad_norm": 1.8142015933990479, + "learning_rate": 3.385295716191512e-05, + "loss": 5.6761, + "step": 6536 + }, + { + "epoch": 0.32, + "grad_norm": 1.6556874513626099, + "learning_rate": 3.38430752507535e-05, + "loss": 5.7818, + "step": 6540 + }, + { + "epoch": 0.32, + "grad_norm": 1.9555789232254028, + "learning_rate": 3.383319333959188e-05, + "loss": 5.7287, + "step": 6544 + }, + { + "epoch": 0.32, + "grad_norm": 1.5307425260543823, + "learning_rate": 3.382331142843026e-05, + "loss": 5.6996, + "step": 6548 + }, + { + "epoch": 0.32, + "grad_norm": 2.0899040699005127, + "learning_rate": 3.381342951726864e-05, + "loss": 5.6487, + "step": 6552 + }, + { + "epoch": 0.32, + "grad_norm": 1.913406491279602, + "learning_rate": 3.380354760610702e-05, + "loss": 5.5607, + "step": 6556 + }, + { + "epoch": 0.32, + "grad_norm": 1.7522472143173218, + "learning_rate": 3.37936656949454e-05, + "loss": 5.5921, + "step": 6560 + }, + { + "epoch": 0.32, + "grad_norm": 1.8658267259597778, + "learning_rate": 3.3783783783783784e-05, + "loss": 5.6933, + "step": 6564 + }, + { + "epoch": 0.32, + "grad_norm": 1.596774697303772, + "learning_rate": 3.3773901872622166e-05, + "loss": 5.6883, + "step": 6568 + }, + { + "epoch": 0.32, + "grad_norm": 1.769968032836914, + "learning_rate": 3.376401996146055e-05, + "loss": 5.6984, + "step": 6572 + }, + { + "epoch": 0.32, + "grad_norm": 2.154975175857544, + "learning_rate": 3.375413805029893e-05, + "loss": 5.654, + "step": 6576 + }, + { + "epoch": 0.33, + "grad_norm": 1.5026295185089111, + "learning_rate": 3.3744256139137306e-05, + "loss": 5.536, + "step": 6580 + }, + { + "epoch": 0.33, + "grad_norm": 1.9100069999694824, + "learning_rate": 3.373437422797569e-05, + "loss": 5.7633, + "step": 6584 + }, + { + "epoch": 0.33, + "grad_norm": 1.7092385292053223, + "learning_rate": 3.372449231681407e-05, + "loss": 5.6036, + "step": 6588 + }, + { + "epoch": 0.33, + "grad_norm": 1.4627431631088257, + "learning_rate": 3.371461040565245e-05, + "loss": 5.6126, + "step": 6592 + }, + { + "epoch": 0.33, + "grad_norm": 1.7545133829116821, + "learning_rate": 3.3704728494490834e-05, + "loss": 5.6248, + "step": 6596 + }, + { + "epoch": 0.33, + "grad_norm": 1.8125765323638916, + "learning_rate": 3.3694846583329216e-05, + "loss": 5.6628, + "step": 6600 + }, + { + "epoch": 0.33, + "grad_norm": 1.6120585203170776, + "learning_rate": 3.3684964672167605e-05, + "loss": 5.595, + "step": 6604 + }, + { + "epoch": 0.33, + "grad_norm": 1.8188247680664062, + "learning_rate": 3.367508276100598e-05, + "loss": 5.5898, + "step": 6608 + }, + { + "epoch": 0.33, + "grad_norm": 1.7864772081375122, + "learning_rate": 3.366520084984436e-05, + "loss": 5.6559, + "step": 6612 + }, + { + "epoch": 0.33, + "grad_norm": 2.448668956756592, + "learning_rate": 3.3655318938682744e-05, + "loss": 5.7192, + "step": 6616 + }, + { + "epoch": 0.33, + "grad_norm": 1.7164603471755981, + "learning_rate": 3.3645437027521126e-05, + "loss": 5.613, + "step": 6620 + }, + { + "epoch": 0.33, + "grad_norm": 1.6809614896774292, + "learning_rate": 3.363555511635951e-05, + "loss": 5.7096, + "step": 6624 + }, + { + "epoch": 0.33, + "grad_norm": 1.6097911596298218, + "learning_rate": 3.362567320519789e-05, + "loss": 5.7016, + "step": 6628 + }, + { + "epoch": 0.33, + "grad_norm": 2.04894757270813, + "learning_rate": 3.361579129403627e-05, + "loss": 5.6882, + "step": 6632 + }, + { + "epoch": 0.33, + "grad_norm": 1.7453163862228394, + "learning_rate": 3.360590938287465e-05, + "loss": 5.7116, + "step": 6636 + }, + { + "epoch": 0.33, + "grad_norm": 2.0974841117858887, + "learning_rate": 3.359602747171303e-05, + "loss": 5.685, + "step": 6640 + }, + { + "epoch": 0.33, + "grad_norm": 1.907728672027588, + "learning_rate": 3.358614556055141e-05, + "loss": 5.6337, + "step": 6644 + }, + { + "epoch": 0.33, + "grad_norm": 1.708357334136963, + "learning_rate": 3.3576263649389794e-05, + "loss": 5.7255, + "step": 6648 + }, + { + "epoch": 0.33, + "grad_norm": 1.9592260122299194, + "learning_rate": 3.3566381738228176e-05, + "loss": 5.7785, + "step": 6652 + }, + { + "epoch": 0.33, + "grad_norm": 1.7609105110168457, + "learning_rate": 3.355649982706656e-05, + "loss": 5.6758, + "step": 6656 + }, + { + "epoch": 0.33, + "grad_norm": 1.6743985414505005, + "learning_rate": 3.354661791590494e-05, + "loss": 5.5952, + "step": 6660 + }, + { + "epoch": 0.33, + "grad_norm": 1.8798311948776245, + "learning_rate": 3.3536736004743315e-05, + "loss": 5.7004, + "step": 6664 + }, + { + "epoch": 0.33, + "grad_norm": 2.2910819053649902, + "learning_rate": 3.35268540935817e-05, + "loss": 5.5182, + "step": 6668 + }, + { + "epoch": 0.33, + "grad_norm": 1.6029813289642334, + "learning_rate": 3.351697218242008e-05, + "loss": 5.7538, + "step": 6672 + }, + { + "epoch": 0.33, + "grad_norm": 1.7451902627944946, + "learning_rate": 3.350709027125846e-05, + "loss": 5.7265, + "step": 6676 + }, + { + "epoch": 0.33, + "grad_norm": 1.8161840438842773, + "learning_rate": 3.3497208360096844e-05, + "loss": 5.5846, + "step": 6680 + }, + { + "epoch": 0.33, + "grad_norm": 2.196413993835449, + "learning_rate": 3.3487326448935226e-05, + "loss": 5.7234, + "step": 6684 + }, + { + "epoch": 0.33, + "grad_norm": 1.707971215248108, + "learning_rate": 3.347744453777361e-05, + "loss": 5.6536, + "step": 6688 + }, + { + "epoch": 0.33, + "grad_norm": 1.8692013025283813, + "learning_rate": 3.346756262661199e-05, + "loss": 5.6981, + "step": 6692 + }, + { + "epoch": 0.33, + "grad_norm": 1.7000290155410767, + "learning_rate": 3.3457680715450365e-05, + "loss": 5.5349, + "step": 6696 + }, + { + "epoch": 0.33, + "grad_norm": 1.7592395544052124, + "learning_rate": 3.344779880428875e-05, + "loss": 5.5299, + "step": 6700 + }, + { + "epoch": 0.33, + "grad_norm": 1.9749822616577148, + "learning_rate": 3.343791689312713e-05, + "loss": 5.6149, + "step": 6704 + }, + { + "epoch": 0.33, + "grad_norm": 1.9164543151855469, + "learning_rate": 3.342803498196551e-05, + "loss": 5.6748, + "step": 6708 + }, + { + "epoch": 0.33, + "grad_norm": 1.7371500730514526, + "learning_rate": 3.3418153070803893e-05, + "loss": 5.632, + "step": 6712 + }, + { + "epoch": 0.33, + "grad_norm": 2.007580041885376, + "learning_rate": 3.3408271159642275e-05, + "loss": 5.6878, + "step": 6716 + }, + { + "epoch": 0.33, + "grad_norm": 2.049675703048706, + "learning_rate": 3.339838924848066e-05, + "loss": 5.6347, + "step": 6720 + }, + { + "epoch": 0.33, + "grad_norm": 1.5192993879318237, + "learning_rate": 3.338850733731904e-05, + "loss": 5.5949, + "step": 6724 + }, + { + "epoch": 0.33, + "grad_norm": 1.7811031341552734, + "learning_rate": 3.337862542615742e-05, + "loss": 5.5796, + "step": 6728 + }, + { + "epoch": 0.33, + "grad_norm": 2.0031299591064453, + "learning_rate": 3.3368743514995804e-05, + "loss": 5.6231, + "step": 6732 + }, + { + "epoch": 0.33, + "grad_norm": 1.9391804933547974, + "learning_rate": 3.3358861603834186e-05, + "loss": 5.6029, + "step": 6736 + }, + { + "epoch": 0.33, + "grad_norm": 1.847928762435913, + "learning_rate": 3.334897969267257e-05, + "loss": 5.6513, + "step": 6740 + }, + { + "epoch": 0.33, + "grad_norm": 2.11073637008667, + "learning_rate": 3.333909778151095e-05, + "loss": 5.7021, + "step": 6744 + }, + { + "epoch": 0.33, + "grad_norm": 1.9884241819381714, + "learning_rate": 3.3329215870349325e-05, + "loss": 5.6343, + "step": 6748 + }, + { + "epoch": 0.33, + "grad_norm": 2.0201947689056396, + "learning_rate": 3.331933395918771e-05, + "loss": 5.6233, + "step": 6752 + }, + { + "epoch": 0.33, + "grad_norm": 1.783299207687378, + "learning_rate": 3.330945204802609e-05, + "loss": 5.6181, + "step": 6756 + }, + { + "epoch": 0.33, + "grad_norm": 1.9971978664398193, + "learning_rate": 3.329957013686447e-05, + "loss": 5.6573, + "step": 6760 + }, + { + "epoch": 0.33, + "grad_norm": 2.188537120819092, + "learning_rate": 3.3289688225702853e-05, + "loss": 5.5245, + "step": 6764 + }, + { + "epoch": 0.33, + "grad_norm": 1.8785967826843262, + "learning_rate": 3.3279806314541235e-05, + "loss": 5.6986, + "step": 6768 + }, + { + "epoch": 0.33, + "grad_norm": 2.19769024848938, + "learning_rate": 3.326992440337962e-05, + "loss": 5.5032, + "step": 6772 + }, + { + "epoch": 0.33, + "grad_norm": 2.074648857116699, + "learning_rate": 3.3260042492218e-05, + "loss": 5.7612, + "step": 6776 + }, + { + "epoch": 0.33, + "grad_norm": 1.9131178855895996, + "learning_rate": 3.3250160581056375e-05, + "loss": 5.6805, + "step": 6780 + }, + { + "epoch": 0.34, + "grad_norm": 2.2468371391296387, + "learning_rate": 3.324027866989476e-05, + "loss": 5.6507, + "step": 6784 + }, + { + "epoch": 0.34, + "grad_norm": 2.0636277198791504, + "learning_rate": 3.323039675873314e-05, + "loss": 5.7031, + "step": 6788 + }, + { + "epoch": 0.34, + "grad_norm": 1.6496247053146362, + "learning_rate": 3.322051484757152e-05, + "loss": 5.5691, + "step": 6792 + }, + { + "epoch": 0.34, + "grad_norm": 1.7620829343795776, + "learning_rate": 3.32106329364099e-05, + "loss": 5.6188, + "step": 6796 + }, + { + "epoch": 0.34, + "grad_norm": 1.8972188234329224, + "learning_rate": 3.3200751025248285e-05, + "loss": 5.6746, + "step": 6800 + }, + { + "epoch": 0.34, + "grad_norm": 1.8980793952941895, + "learning_rate": 3.319086911408667e-05, + "loss": 5.696, + "step": 6804 + }, + { + "epoch": 0.34, + "grad_norm": 1.681355357170105, + "learning_rate": 3.318098720292504e-05, + "loss": 5.629, + "step": 6808 + }, + { + "epoch": 0.34, + "grad_norm": 1.9827635288238525, + "learning_rate": 3.3171105291763425e-05, + "loss": 5.6046, + "step": 6812 + }, + { + "epoch": 0.34, + "grad_norm": 1.6929621696472168, + "learning_rate": 3.316122338060181e-05, + "loss": 5.7076, + "step": 6816 + }, + { + "epoch": 0.34, + "grad_norm": 1.706438660621643, + "learning_rate": 3.315134146944019e-05, + "loss": 5.6982, + "step": 6820 + }, + { + "epoch": 0.34, + "grad_norm": 1.9626370668411255, + "learning_rate": 3.314145955827857e-05, + "loss": 5.6642, + "step": 6824 + }, + { + "epoch": 0.34, + "grad_norm": 1.926810383796692, + "learning_rate": 3.313157764711695e-05, + "loss": 5.6876, + "step": 6828 + }, + { + "epoch": 0.34, + "grad_norm": 2.1014394760131836, + "learning_rate": 3.3121695735955335e-05, + "loss": 5.6713, + "step": 6832 + }, + { + "epoch": 0.34, + "grad_norm": 1.9832298755645752, + "learning_rate": 3.311181382479372e-05, + "loss": 5.5695, + "step": 6836 + }, + { + "epoch": 0.34, + "grad_norm": 1.867805004119873, + "learning_rate": 3.31019319136321e-05, + "loss": 5.598, + "step": 6840 + }, + { + "epoch": 0.34, + "grad_norm": 1.7277621030807495, + "learning_rate": 3.309205000247048e-05, + "loss": 5.6008, + "step": 6844 + }, + { + "epoch": 0.34, + "grad_norm": 2.0435431003570557, + "learning_rate": 3.308216809130886e-05, + "loss": 5.5801, + "step": 6848 + }, + { + "epoch": 0.34, + "grad_norm": 1.6969705820083618, + "learning_rate": 3.3072286180147245e-05, + "loss": 5.5302, + "step": 6852 + }, + { + "epoch": 0.34, + "grad_norm": 1.8054941892623901, + "learning_rate": 3.306240426898563e-05, + "loss": 5.6352, + "step": 6856 + }, + { + "epoch": 0.34, + "grad_norm": 1.7074542045593262, + "learning_rate": 3.305252235782401e-05, + "loss": 5.5968, + "step": 6860 + }, + { + "epoch": 0.34, + "grad_norm": 1.8238881826400757, + "learning_rate": 3.3042640446662385e-05, + "loss": 5.6272, + "step": 6864 + }, + { + "epoch": 0.34, + "grad_norm": 1.9226677417755127, + "learning_rate": 3.303275853550077e-05, + "loss": 5.6381, + "step": 6868 + }, + { + "epoch": 0.34, + "grad_norm": 1.5835906267166138, + "learning_rate": 3.302287662433915e-05, + "loss": 5.489, + "step": 6872 + }, + { + "epoch": 0.34, + "grad_norm": 1.7852261066436768, + "learning_rate": 3.301299471317753e-05, + "loss": 5.7686, + "step": 6876 + }, + { + "epoch": 0.34, + "grad_norm": 2.1643717288970947, + "learning_rate": 3.300311280201591e-05, + "loss": 5.7131, + "step": 6880 + }, + { + "epoch": 0.34, + "grad_norm": 1.8683335781097412, + "learning_rate": 3.2993230890854295e-05, + "loss": 5.6893, + "step": 6884 + }, + { + "epoch": 0.34, + "grad_norm": 1.8597195148468018, + "learning_rate": 3.298334897969268e-05, + "loss": 5.5943, + "step": 6888 + }, + { + "epoch": 0.34, + "grad_norm": 1.9217149019241333, + "learning_rate": 3.297346706853105e-05, + "loss": 5.6866, + "step": 6892 + }, + { + "epoch": 0.34, + "grad_norm": 1.7497360706329346, + "learning_rate": 3.2963585157369434e-05, + "loss": 5.6797, + "step": 6896 + }, + { + "epoch": 0.34, + "grad_norm": 1.8368375301361084, + "learning_rate": 3.2953703246207816e-05, + "loss": 5.5535, + "step": 6900 + }, + { + "epoch": 0.34, + "grad_norm": 1.6737070083618164, + "learning_rate": 3.29438213350462e-05, + "loss": 5.5282, + "step": 6904 + }, + { + "epoch": 0.34, + "grad_norm": 1.5021297931671143, + "learning_rate": 3.293393942388458e-05, + "loss": 5.6176, + "step": 6908 + }, + { + "epoch": 0.34, + "grad_norm": 1.836517572402954, + "learning_rate": 3.292405751272296e-05, + "loss": 5.6304, + "step": 6912 + }, + { + "epoch": 0.34, + "grad_norm": 1.9427759647369385, + "learning_rate": 3.2914175601561345e-05, + "loss": 5.6843, + "step": 6916 + }, + { + "epoch": 0.34, + "grad_norm": 2.1088333129882812, + "learning_rate": 3.290429369039973e-05, + "loss": 5.6674, + "step": 6920 + }, + { + "epoch": 0.34, + "grad_norm": 1.9784494638442993, + "learning_rate": 3.28944117792381e-05, + "loss": 5.6965, + "step": 6924 + }, + { + "epoch": 0.34, + "grad_norm": 1.666114091873169, + "learning_rate": 3.2884529868076484e-05, + "loss": 5.5627, + "step": 6928 + }, + { + "epoch": 0.34, + "grad_norm": 1.7081410884857178, + "learning_rate": 3.2874647956914866e-05, + "loss": 5.7073, + "step": 6932 + }, + { + "epoch": 0.34, + "grad_norm": 1.857577919960022, + "learning_rate": 3.286476604575325e-05, + "loss": 5.6804, + "step": 6936 + }, + { + "epoch": 0.34, + "grad_norm": 1.952774167060852, + "learning_rate": 3.285488413459163e-05, + "loss": 5.6911, + "step": 6940 + }, + { + "epoch": 0.34, + "grad_norm": 1.695110559463501, + "learning_rate": 3.284500222343001e-05, + "loss": 5.5463, + "step": 6944 + }, + { + "epoch": 0.34, + "grad_norm": 1.7788817882537842, + "learning_rate": 3.2835120312268394e-05, + "loss": 5.7336, + "step": 6948 + }, + { + "epoch": 0.34, + "grad_norm": 2.170694589614868, + "learning_rate": 3.2825238401106776e-05, + "loss": 5.6662, + "step": 6952 + }, + { + "epoch": 0.34, + "grad_norm": 1.9008727073669434, + "learning_rate": 3.281535648994516e-05, + "loss": 5.6276, + "step": 6956 + }, + { + "epoch": 0.34, + "grad_norm": 2.079054355621338, + "learning_rate": 3.280547457878354e-05, + "loss": 5.6058, + "step": 6960 + }, + { + "epoch": 0.34, + "grad_norm": 1.6386624574661255, + "learning_rate": 3.279559266762192e-05, + "loss": 5.5782, + "step": 6964 + }, + { + "epoch": 0.34, + "grad_norm": 1.9276303052902222, + "learning_rate": 3.2785710756460305e-05, + "loss": 5.6403, + "step": 6968 + }, + { + "epoch": 0.34, + "grad_norm": 1.7339591979980469, + "learning_rate": 3.277582884529869e-05, + "loss": 5.6689, + "step": 6972 + }, + { + "epoch": 0.34, + "grad_norm": 1.9973163604736328, + "learning_rate": 3.276594693413706e-05, + "loss": 5.6529, + "step": 6976 + }, + { + "epoch": 0.34, + "grad_norm": 1.5551352500915527, + "learning_rate": 3.2756065022975444e-05, + "loss": 5.609, + "step": 6980 + }, + { + "epoch": 0.35, + "grad_norm": 1.8106021881103516, + "learning_rate": 3.2746183111813826e-05, + "loss": 5.5107, + "step": 6984 + }, + { + "epoch": 0.35, + "grad_norm": 1.6934738159179688, + "learning_rate": 3.273630120065221e-05, + "loss": 5.6064, + "step": 6988 + }, + { + "epoch": 0.35, + "grad_norm": 1.6493241786956787, + "learning_rate": 3.272641928949059e-05, + "loss": 5.6641, + "step": 6992 + }, + { + "epoch": 0.35, + "grad_norm": 1.817825436592102, + "learning_rate": 3.271653737832897e-05, + "loss": 5.5306, + "step": 6996 + }, + { + "epoch": 0.35, + "grad_norm": 1.5873377323150635, + "learning_rate": 3.2706655467167354e-05, + "loss": 5.6717, + "step": 7000 + }, + { + "epoch": 0.35, + "grad_norm": 1.584267497062683, + "learning_rate": 3.2696773556005736e-05, + "loss": 5.5343, + "step": 7004 + }, + { + "epoch": 0.35, + "grad_norm": 1.52151620388031, + "learning_rate": 3.268689164484411e-05, + "loss": 5.612, + "step": 7008 + }, + { + "epoch": 0.35, + "grad_norm": 1.8319940567016602, + "learning_rate": 3.2677009733682494e-05, + "loss": 5.611, + "step": 7012 + }, + { + "epoch": 0.35, + "grad_norm": 1.779436469078064, + "learning_rate": 3.2667127822520876e-05, + "loss": 5.5905, + "step": 7016 + }, + { + "epoch": 0.35, + "grad_norm": 1.5194777250289917, + "learning_rate": 3.265724591135926e-05, + "loss": 5.6275, + "step": 7020 + }, + { + "epoch": 0.35, + "grad_norm": 1.6653060913085938, + "learning_rate": 3.264736400019764e-05, + "loss": 5.6553, + "step": 7024 + }, + { + "epoch": 0.35, + "grad_norm": 1.6240752935409546, + "learning_rate": 3.263748208903602e-05, + "loss": 5.6956, + "step": 7028 + }, + { + "epoch": 0.35, + "grad_norm": 1.6351805925369263, + "learning_rate": 3.2627600177874404e-05, + "loss": 5.6165, + "step": 7032 + }, + { + "epoch": 0.35, + "grad_norm": 1.8824491500854492, + "learning_rate": 3.261771826671278e-05, + "loss": 5.5225, + "step": 7036 + }, + { + "epoch": 0.35, + "grad_norm": 1.713706612586975, + "learning_rate": 3.260783635555116e-05, + "loss": 5.6428, + "step": 7040 + }, + { + "epoch": 0.35, + "grad_norm": 2.0658748149871826, + "learning_rate": 3.2597954444389543e-05, + "loss": 5.6175, + "step": 7044 + }, + { + "epoch": 0.35, + "grad_norm": 1.7719210386276245, + "learning_rate": 3.2588072533227926e-05, + "loss": 5.6819, + "step": 7048 + }, + { + "epoch": 0.35, + "grad_norm": 2.1328206062316895, + "learning_rate": 3.257819062206631e-05, + "loss": 5.5427, + "step": 7052 + }, + { + "epoch": 0.35, + "grad_norm": 1.5215253829956055, + "learning_rate": 3.256830871090469e-05, + "loss": 5.7679, + "step": 7056 + }, + { + "epoch": 0.35, + "grad_norm": 2.001835584640503, + "learning_rate": 3.255842679974307e-05, + "loss": 5.748, + "step": 7060 + }, + { + "epoch": 0.35, + "grad_norm": 1.5775867700576782, + "learning_rate": 3.2548544888581454e-05, + "loss": 5.5609, + "step": 7064 + }, + { + "epoch": 0.35, + "grad_norm": 1.7867364883422852, + "learning_rate": 3.2538662977419836e-05, + "loss": 5.6232, + "step": 7068 + }, + { + "epoch": 0.35, + "grad_norm": 2.010613203048706, + "learning_rate": 3.252878106625822e-05, + "loss": 5.6611, + "step": 7072 + }, + { + "epoch": 0.35, + "grad_norm": 2.0896737575531006, + "learning_rate": 3.25188991550966e-05, + "loss": 5.6961, + "step": 7076 + }, + { + "epoch": 0.35, + "grad_norm": 1.7435530424118042, + "learning_rate": 3.250901724393498e-05, + "loss": 5.816, + "step": 7080 + }, + { + "epoch": 0.35, + "grad_norm": 2.452756643295288, + "learning_rate": 3.2499135332773364e-05, + "loss": 5.6278, + "step": 7084 + }, + { + "epoch": 0.35, + "grad_norm": 1.5467629432678223, + "learning_rate": 3.2489253421611746e-05, + "loss": 5.7114, + "step": 7088 + }, + { + "epoch": 0.35, + "grad_norm": 1.8225852251052856, + "learning_rate": 3.247937151045012e-05, + "loss": 5.5523, + "step": 7092 + }, + { + "epoch": 0.35, + "grad_norm": 1.9562902450561523, + "learning_rate": 3.2469489599288503e-05, + "loss": 5.7284, + "step": 7096 + }, + { + "epoch": 0.35, + "grad_norm": 2.379361152648926, + "learning_rate": 3.2459607688126886e-05, + "loss": 5.5628, + "step": 7100 + }, + { + "epoch": 0.35, + "grad_norm": 1.9397469758987427, + "learning_rate": 3.244972577696527e-05, + "loss": 5.6596, + "step": 7104 + }, + { + "epoch": 0.35, + "grad_norm": 1.7778300046920776, + "learning_rate": 3.243984386580365e-05, + "loss": 5.7102, + "step": 7108 + }, + { + "epoch": 0.35, + "grad_norm": 2.671886920928955, + "learning_rate": 3.242996195464203e-05, + "loss": 5.7244, + "step": 7112 + }, + { + "epoch": 0.35, + "grad_norm": 1.7230006456375122, + "learning_rate": 3.2420080043480414e-05, + "loss": 5.7147, + "step": 7116 + }, + { + "epoch": 0.35, + "grad_norm": 1.751182198524475, + "learning_rate": 3.241019813231879e-05, + "loss": 5.7059, + "step": 7120 + }, + { + "epoch": 0.35, + "grad_norm": 1.7081093788146973, + "learning_rate": 3.240031622115717e-05, + "loss": 5.5171, + "step": 7124 + }, + { + "epoch": 0.35, + "grad_norm": 1.7573639154434204, + "learning_rate": 3.239043430999555e-05, + "loss": 5.6948, + "step": 7128 + }, + { + "epoch": 0.35, + "grad_norm": 1.7177603244781494, + "learning_rate": 3.2380552398833935e-05, + "loss": 5.5412, + "step": 7132 + }, + { + "epoch": 0.35, + "grad_norm": 1.8538342714309692, + "learning_rate": 3.237067048767232e-05, + "loss": 5.645, + "step": 7136 + }, + { + "epoch": 0.35, + "grad_norm": 2.174427032470703, + "learning_rate": 3.23607885765107e-05, + "loss": 5.6742, + "step": 7140 + }, + { + "epoch": 0.35, + "grad_norm": 2.461571455001831, + "learning_rate": 3.235090666534908e-05, + "loss": 5.6642, + "step": 7144 + }, + { + "epoch": 0.35, + "grad_norm": 1.8825712203979492, + "learning_rate": 3.234102475418746e-05, + "loss": 5.6553, + "step": 7148 + }, + { + "epoch": 0.35, + "grad_norm": 1.5318057537078857, + "learning_rate": 3.233114284302584e-05, + "loss": 5.6551, + "step": 7152 + }, + { + "epoch": 0.35, + "grad_norm": 2.0473759174346924, + "learning_rate": 3.232126093186422e-05, + "loss": 5.6036, + "step": 7156 + }, + { + "epoch": 0.35, + "grad_norm": 2.186314821243286, + "learning_rate": 3.23113790207026e-05, + "loss": 5.762, + "step": 7160 + }, + { + "epoch": 0.35, + "grad_norm": 1.6777757406234741, + "learning_rate": 3.2301497109540985e-05, + "loss": 5.6549, + "step": 7164 + }, + { + "epoch": 0.35, + "grad_norm": 1.951010823249817, + "learning_rate": 3.229161519837937e-05, + "loss": 5.7491, + "step": 7168 + }, + { + "epoch": 0.35, + "grad_norm": 1.590847134590149, + "learning_rate": 3.228173328721775e-05, + "loss": 5.52, + "step": 7172 + }, + { + "epoch": 0.35, + "grad_norm": 2.1618316173553467, + "learning_rate": 3.227185137605613e-05, + "loss": 5.5362, + "step": 7176 + }, + { + "epoch": 0.35, + "grad_norm": 1.874100685119629, + "learning_rate": 3.226196946489451e-05, + "loss": 5.5656, + "step": 7180 + }, + { + "epoch": 0.35, + "grad_norm": 1.7269055843353271, + "learning_rate": 3.2252087553732895e-05, + "loss": 5.5793, + "step": 7184 + }, + { + "epoch": 0.36, + "grad_norm": 1.7733286619186401, + "learning_rate": 3.224220564257128e-05, + "loss": 5.6867, + "step": 7188 + }, + { + "epoch": 0.36, + "grad_norm": 1.6358729600906372, + "learning_rate": 3.223232373140966e-05, + "loss": 5.8119, + "step": 7192 + }, + { + "epoch": 0.36, + "grad_norm": 1.663542628288269, + "learning_rate": 3.222244182024804e-05, + "loss": 5.585, + "step": 7196 + }, + { + "epoch": 0.36, + "grad_norm": 2.0216832160949707, + "learning_rate": 3.2212559909086423e-05, + "loss": 5.5331, + "step": 7200 + }, + { + "epoch": 0.36, + "grad_norm": 1.9756247997283936, + "learning_rate": 3.22026779979248e-05, + "loss": 5.7117, + "step": 7204 + }, + { + "epoch": 0.36, + "grad_norm": 2.0371248722076416, + "learning_rate": 3.219279608676318e-05, + "loss": 5.6412, + "step": 7208 + }, + { + "epoch": 0.36, + "grad_norm": 1.8078463077545166, + "learning_rate": 3.218291417560156e-05, + "loss": 5.5826, + "step": 7212 + }, + { + "epoch": 0.36, + "grad_norm": 1.6436288356781006, + "learning_rate": 3.2173032264439945e-05, + "loss": 5.5825, + "step": 7216 + }, + { + "epoch": 0.36, + "grad_norm": 2.0301754474639893, + "learning_rate": 3.216315035327833e-05, + "loss": 5.7317, + "step": 7220 + }, + { + "epoch": 0.36, + "grad_norm": 2.0184414386749268, + "learning_rate": 3.215326844211671e-05, + "loss": 5.5583, + "step": 7224 + }, + { + "epoch": 0.36, + "grad_norm": 1.9053844213485718, + "learning_rate": 3.214338653095509e-05, + "loss": 5.6326, + "step": 7228 + }, + { + "epoch": 0.36, + "grad_norm": 1.7837705612182617, + "learning_rate": 3.2133504619793466e-05, + "loss": 5.6659, + "step": 7232 + }, + { + "epoch": 0.36, + "grad_norm": 2.1691172122955322, + "learning_rate": 3.212362270863185e-05, + "loss": 5.6534, + "step": 7236 + }, + { + "epoch": 0.36, + "grad_norm": 1.6231029033660889, + "learning_rate": 3.211374079747023e-05, + "loss": 5.5752, + "step": 7240 + }, + { + "epoch": 0.36, + "grad_norm": 1.6797888278961182, + "learning_rate": 3.210385888630861e-05, + "loss": 5.5594, + "step": 7244 + }, + { + "epoch": 0.36, + "grad_norm": 2.0142295360565186, + "learning_rate": 3.2093976975146995e-05, + "loss": 5.5551, + "step": 7248 + }, + { + "epoch": 0.36, + "grad_norm": 1.6823694705963135, + "learning_rate": 3.208409506398538e-05, + "loss": 5.5654, + "step": 7252 + }, + { + "epoch": 0.36, + "grad_norm": 2.035011053085327, + "learning_rate": 3.207421315282376e-05, + "loss": 5.753, + "step": 7256 + }, + { + "epoch": 0.36, + "grad_norm": 2.0539419651031494, + "learning_rate": 3.206433124166214e-05, + "loss": 5.481, + "step": 7260 + }, + { + "epoch": 0.36, + "grad_norm": 1.8874220848083496, + "learning_rate": 3.2054449330500516e-05, + "loss": 5.5776, + "step": 7264 + }, + { + "epoch": 0.36, + "grad_norm": 2.067695140838623, + "learning_rate": 3.20445674193389e-05, + "loss": 5.7252, + "step": 7268 + }, + { + "epoch": 0.36, + "grad_norm": 1.9559959173202515, + "learning_rate": 3.203468550817728e-05, + "loss": 5.7496, + "step": 7272 + }, + { + "epoch": 0.36, + "grad_norm": 1.7679195404052734, + "learning_rate": 3.202480359701566e-05, + "loss": 5.5574, + "step": 7276 + }, + { + "epoch": 0.36, + "grad_norm": 1.7871371507644653, + "learning_rate": 3.2014921685854044e-05, + "loss": 5.611, + "step": 7280 + }, + { + "epoch": 0.36, + "grad_norm": 2.027100086212158, + "learning_rate": 3.2005039774692426e-05, + "loss": 5.6949, + "step": 7284 + }, + { + "epoch": 0.36, + "grad_norm": 2.3536198139190674, + "learning_rate": 3.199515786353081e-05, + "loss": 5.5937, + "step": 7288 + }, + { + "epoch": 0.36, + "grad_norm": 1.9089040756225586, + "learning_rate": 3.198527595236919e-05, + "loss": 5.742, + "step": 7292 + }, + { + "epoch": 0.36, + "grad_norm": 1.9175152778625488, + "learning_rate": 3.197539404120757e-05, + "loss": 5.7345, + "step": 7296 + }, + { + "epoch": 0.36, + "grad_norm": 2.3115925788879395, + "learning_rate": 3.1965512130045955e-05, + "loss": 5.5774, + "step": 7300 + }, + { + "epoch": 0.36, + "grad_norm": 1.8105931282043457, + "learning_rate": 3.195563021888434e-05, + "loss": 5.5753, + "step": 7304 + }, + { + "epoch": 0.36, + "grad_norm": 2.1161036491394043, + "learning_rate": 3.194574830772272e-05, + "loss": 5.6975, + "step": 7308 + }, + { + "epoch": 0.36, + "grad_norm": 1.7692967653274536, + "learning_rate": 3.19358663965611e-05, + "loss": 5.6869, + "step": 7312 + }, + { + "epoch": 0.36, + "grad_norm": 1.7916195392608643, + "learning_rate": 3.1925984485399476e-05, + "loss": 5.7502, + "step": 7316 + }, + { + "epoch": 0.36, + "grad_norm": 1.5649669170379639, + "learning_rate": 3.191610257423786e-05, + "loss": 5.6049, + "step": 7320 + }, + { + "epoch": 0.36, + "grad_norm": 1.5286930799484253, + "learning_rate": 3.190622066307624e-05, + "loss": 5.5598, + "step": 7324 + }, + { + "epoch": 0.36, + "grad_norm": 1.7816998958587646, + "learning_rate": 3.189633875191462e-05, + "loss": 5.5383, + "step": 7328 + }, + { + "epoch": 0.36, + "grad_norm": 1.7937431335449219, + "learning_rate": 3.1886456840753004e-05, + "loss": 5.6448, + "step": 7332 + }, + { + "epoch": 0.36, + "grad_norm": 1.801894187927246, + "learning_rate": 3.1876574929591386e-05, + "loss": 5.638, + "step": 7336 + }, + { + "epoch": 0.36, + "grad_norm": 1.8095769882202148, + "learning_rate": 3.186669301842977e-05, + "loss": 5.7017, + "step": 7340 + }, + { + "epoch": 0.36, + "grad_norm": 1.600150465965271, + "learning_rate": 3.185681110726815e-05, + "loss": 5.639, + "step": 7344 + }, + { + "epoch": 0.36, + "grad_norm": 1.827073574066162, + "learning_rate": 3.1846929196106526e-05, + "loss": 5.6148, + "step": 7348 + }, + { + "epoch": 0.36, + "grad_norm": 1.7427455186843872, + "learning_rate": 3.183704728494491e-05, + "loss": 5.6816, + "step": 7352 + }, + { + "epoch": 0.36, + "grad_norm": 1.5865864753723145, + "learning_rate": 3.182716537378329e-05, + "loss": 5.6173, + "step": 7356 + }, + { + "epoch": 0.36, + "grad_norm": 2.4280364513397217, + "learning_rate": 3.181728346262167e-05, + "loss": 5.7189, + "step": 7360 + }, + { + "epoch": 0.36, + "grad_norm": 1.7864285707473755, + "learning_rate": 3.1807401551460054e-05, + "loss": 5.7009, + "step": 7364 + }, + { + "epoch": 0.36, + "grad_norm": 1.9539587497711182, + "learning_rate": 3.1797519640298436e-05, + "loss": 5.6526, + "step": 7368 + }, + { + "epoch": 0.36, + "grad_norm": 1.8817050457000732, + "learning_rate": 3.178763772913682e-05, + "loss": 5.5829, + "step": 7372 + }, + { + "epoch": 0.36, + "grad_norm": 1.9259823560714722, + "learning_rate": 3.1777755817975193e-05, + "loss": 5.5862, + "step": 7376 + }, + { + "epoch": 0.36, + "grad_norm": 1.8345321416854858, + "learning_rate": 3.1767873906813576e-05, + "loss": 5.689, + "step": 7380 + }, + { + "epoch": 0.36, + "grad_norm": 1.9445204734802246, + "learning_rate": 3.175799199565196e-05, + "loss": 5.5377, + "step": 7384 + }, + { + "epoch": 0.37, + "grad_norm": 1.9036486148834229, + "learning_rate": 3.174811008449034e-05, + "loss": 5.6994, + "step": 7388 + }, + { + "epoch": 0.37, + "grad_norm": 1.7880631685256958, + "learning_rate": 3.173822817332872e-05, + "loss": 5.5478, + "step": 7392 + }, + { + "epoch": 0.37, + "grad_norm": 1.7411099672317505, + "learning_rate": 3.1728346262167104e-05, + "loss": 5.6932, + "step": 7396 + }, + { + "epoch": 0.37, + "grad_norm": 2.0032172203063965, + "learning_rate": 3.1718464351005486e-05, + "loss": 5.6742, + "step": 7400 + }, + { + "epoch": 0.37, + "grad_norm": 1.963858723640442, + "learning_rate": 3.170858243984387e-05, + "loss": 5.6343, + "step": 7404 + }, + { + "epoch": 0.37, + "grad_norm": 1.7364962100982666, + "learning_rate": 3.169870052868225e-05, + "loss": 5.6262, + "step": 7408 + }, + { + "epoch": 0.37, + "grad_norm": 1.885438084602356, + "learning_rate": 3.168881861752063e-05, + "loss": 5.6402, + "step": 7412 + }, + { + "epoch": 0.37, + "grad_norm": 1.7210749387741089, + "learning_rate": 3.1678936706359014e-05, + "loss": 5.6839, + "step": 7416 + }, + { + "epoch": 0.37, + "grad_norm": 1.7860082387924194, + "learning_rate": 3.1669054795197396e-05, + "loss": 5.5595, + "step": 7420 + }, + { + "epoch": 0.37, + "grad_norm": 1.878787636756897, + "learning_rate": 3.165917288403578e-05, + "loss": 5.6312, + "step": 7424 + }, + { + "epoch": 0.37, + "grad_norm": 1.6183253526687622, + "learning_rate": 3.164929097287416e-05, + "loss": 5.6786, + "step": 7428 + }, + { + "epoch": 0.37, + "grad_norm": 1.8205534219741821, + "learning_rate": 3.1639409061712536e-05, + "loss": 5.6006, + "step": 7432 + }, + { + "epoch": 0.37, + "grad_norm": 1.9625846147537231, + "learning_rate": 3.162952715055092e-05, + "loss": 5.5243, + "step": 7436 + }, + { + "epoch": 0.37, + "grad_norm": 1.967149257659912, + "learning_rate": 3.16196452393893e-05, + "loss": 5.599, + "step": 7440 + }, + { + "epoch": 0.37, + "grad_norm": 2.0951106548309326, + "learning_rate": 3.160976332822768e-05, + "loss": 5.6613, + "step": 7444 + }, + { + "epoch": 0.37, + "grad_norm": 1.8848724365234375, + "learning_rate": 3.1599881417066064e-05, + "loss": 5.7463, + "step": 7448 + }, + { + "epoch": 0.37, + "grad_norm": 1.793150544166565, + "learning_rate": 3.1589999505904446e-05, + "loss": 5.7087, + "step": 7452 + }, + { + "epoch": 0.37, + "grad_norm": 1.5212810039520264, + "learning_rate": 3.158011759474283e-05, + "loss": 5.6169, + "step": 7456 + }, + { + "epoch": 0.37, + "grad_norm": 1.9898124933242798, + "learning_rate": 3.15702356835812e-05, + "loss": 5.5823, + "step": 7460 + }, + { + "epoch": 0.37, + "grad_norm": 1.9081612825393677, + "learning_rate": 3.1560353772419585e-05, + "loss": 5.673, + "step": 7464 + }, + { + "epoch": 0.37, + "grad_norm": 2.049330234527588, + "learning_rate": 3.155047186125797e-05, + "loss": 5.6275, + "step": 7468 + }, + { + "epoch": 0.37, + "grad_norm": 1.874483346939087, + "learning_rate": 3.154058995009635e-05, + "loss": 5.7012, + "step": 7472 + }, + { + "epoch": 0.37, + "grad_norm": 1.674302101135254, + "learning_rate": 3.153070803893473e-05, + "loss": 5.6507, + "step": 7476 + }, + { + "epoch": 0.37, + "grad_norm": 2.1794581413269043, + "learning_rate": 3.1520826127773113e-05, + "loss": 5.6312, + "step": 7480 + }, + { + "epoch": 0.37, + "grad_norm": 1.8025480508804321, + "learning_rate": 3.1510944216611496e-05, + "loss": 5.5184, + "step": 7484 + }, + { + "epoch": 0.37, + "grad_norm": 1.8189462423324585, + "learning_rate": 3.150106230544988e-05, + "loss": 5.558, + "step": 7488 + }, + { + "epoch": 0.37, + "grad_norm": 1.8474452495574951, + "learning_rate": 3.149118039428825e-05, + "loss": 5.5896, + "step": 7492 + }, + { + "epoch": 0.37, + "grad_norm": 1.8259671926498413, + "learning_rate": 3.1481298483126635e-05, + "loss": 5.6092, + "step": 7496 + }, + { + "epoch": 0.37, + "grad_norm": 1.675440788269043, + "learning_rate": 3.147141657196502e-05, + "loss": 5.5084, + "step": 7500 + }, + { + "epoch": 0.37, + "grad_norm": 2.073160409927368, + "learning_rate": 3.14615346608034e-05, + "loss": 5.6257, + "step": 7504 + }, + { + "epoch": 0.37, + "grad_norm": 1.9725701808929443, + "learning_rate": 3.145165274964178e-05, + "loss": 5.5622, + "step": 7508 + }, + { + "epoch": 0.37, + "grad_norm": 2.097548007965088, + "learning_rate": 3.144177083848016e-05, + "loss": 5.5948, + "step": 7512 + }, + { + "epoch": 0.37, + "grad_norm": 1.8091487884521484, + "learning_rate": 3.1431888927318545e-05, + "loss": 5.6698, + "step": 7516 + }, + { + "epoch": 0.37, + "grad_norm": 1.8227310180664062, + "learning_rate": 3.142200701615692e-05, + "loss": 5.5319, + "step": 7520 + }, + { + "epoch": 0.37, + "grad_norm": 1.9433192014694214, + "learning_rate": 3.141212510499531e-05, + "loss": 5.6118, + "step": 7524 + }, + { + "epoch": 0.37, + "grad_norm": 1.7683522701263428, + "learning_rate": 3.140224319383369e-05, + "loss": 5.5843, + "step": 7528 + }, + { + "epoch": 0.37, + "grad_norm": 1.8720110654830933, + "learning_rate": 3.1392361282672073e-05, + "loss": 5.626, + "step": 7532 + }, + { + "epoch": 0.37, + "grad_norm": 1.7826591730117798, + "learning_rate": 3.1382479371510456e-05, + "loss": 5.6833, + "step": 7536 + }, + { + "epoch": 0.37, + "grad_norm": 1.870161533355713, + "learning_rate": 3.137259746034884e-05, + "loss": 5.7177, + "step": 7540 + }, + { + "epoch": 0.37, + "grad_norm": 2.076082706451416, + "learning_rate": 3.136271554918721e-05, + "loss": 5.563, + "step": 7544 + }, + { + "epoch": 0.37, + "grad_norm": 1.7204762697219849, + "learning_rate": 3.1352833638025595e-05, + "loss": 5.5931, + "step": 7548 + }, + { + "epoch": 0.37, + "grad_norm": 2.109579563140869, + "learning_rate": 3.134295172686398e-05, + "loss": 5.6927, + "step": 7552 + }, + { + "epoch": 0.37, + "grad_norm": 1.7823752164840698, + "learning_rate": 3.133306981570236e-05, + "loss": 5.501, + "step": 7556 + }, + { + "epoch": 0.37, + "grad_norm": 1.9205490350723267, + "learning_rate": 3.132318790454074e-05, + "loss": 5.5822, + "step": 7560 + }, + { + "epoch": 0.37, + "grad_norm": 2.1453466415405273, + "learning_rate": 3.131330599337912e-05, + "loss": 5.6234, + "step": 7564 + }, + { + "epoch": 0.37, + "grad_norm": 2.079413652420044, + "learning_rate": 3.1303424082217505e-05, + "loss": 5.5851, + "step": 7568 + }, + { + "epoch": 0.37, + "grad_norm": 1.7375316619873047, + "learning_rate": 3.129354217105589e-05, + "loss": 5.51, + "step": 7572 + }, + { + "epoch": 0.37, + "grad_norm": 1.7538070678710938, + "learning_rate": 3.128366025989426e-05, + "loss": 5.5565, + "step": 7576 + }, + { + "epoch": 0.37, + "grad_norm": 1.8971534967422485, + "learning_rate": 3.1273778348732645e-05, + "loss": 5.6077, + "step": 7580 + }, + { + "epoch": 0.37, + "grad_norm": 1.975240707397461, + "learning_rate": 3.126389643757103e-05, + "loss": 5.8038, + "step": 7584 + }, + { + "epoch": 0.37, + "grad_norm": 1.9399492740631104, + "learning_rate": 3.125401452640941e-05, + "loss": 5.6554, + "step": 7588 + }, + { + "epoch": 0.38, + "grad_norm": 2.004110336303711, + "learning_rate": 3.124413261524779e-05, + "loss": 5.6242, + "step": 7592 + }, + { + "epoch": 0.38, + "grad_norm": 1.8853540420532227, + "learning_rate": 3.123425070408617e-05, + "loss": 5.7207, + "step": 7596 + }, + { + "epoch": 0.38, + "grad_norm": 1.765731930732727, + "learning_rate": 3.1224368792924555e-05, + "loss": 5.5813, + "step": 7600 + }, + { + "epoch": 0.38, + "grad_norm": 1.8963857889175415, + "learning_rate": 3.121448688176293e-05, + "loss": 5.6913, + "step": 7604 + }, + { + "epoch": 0.38, + "grad_norm": 1.8706163167953491, + "learning_rate": 3.120460497060131e-05, + "loss": 5.5406, + "step": 7608 + }, + { + "epoch": 0.38, + "grad_norm": 1.6852182149887085, + "learning_rate": 3.1194723059439694e-05, + "loss": 5.582, + "step": 7612 + }, + { + "epoch": 0.38, + "grad_norm": 1.8383783102035522, + "learning_rate": 3.1184841148278076e-05, + "loss": 5.664, + "step": 7616 + }, + { + "epoch": 0.38, + "grad_norm": 1.8068432807922363, + "learning_rate": 3.117495923711646e-05, + "loss": 5.6969, + "step": 7620 + }, + { + "epoch": 0.38, + "grad_norm": 2.020507335662842, + "learning_rate": 3.116507732595484e-05, + "loss": 5.5544, + "step": 7624 + }, + { + "epoch": 0.38, + "grad_norm": 1.7080477476119995, + "learning_rate": 3.115519541479322e-05, + "loss": 5.7007, + "step": 7628 + }, + { + "epoch": 0.38, + "grad_norm": 1.6896196603775024, + "learning_rate": 3.11453135036316e-05, + "loss": 5.5952, + "step": 7632 + }, + { + "epoch": 0.38, + "grad_norm": 2.0468990802764893, + "learning_rate": 3.113543159246998e-05, + "loss": 5.735, + "step": 7636 + }, + { + "epoch": 0.38, + "grad_norm": 1.9977657794952393, + "learning_rate": 3.112554968130837e-05, + "loss": 5.7005, + "step": 7640 + }, + { + "epoch": 0.38, + "grad_norm": 2.3371498584747314, + "learning_rate": 3.111566777014675e-05, + "loss": 5.6963, + "step": 7644 + }, + { + "epoch": 0.38, + "grad_norm": 1.8989230394363403, + "learning_rate": 3.110578585898513e-05, + "loss": 5.5717, + "step": 7648 + }, + { + "epoch": 0.38, + "grad_norm": 2.1724061965942383, + "learning_rate": 3.1095903947823515e-05, + "loss": 5.5531, + "step": 7652 + }, + { + "epoch": 0.38, + "grad_norm": 1.7669955492019653, + "learning_rate": 3.10860220366619e-05, + "loss": 5.6975, + "step": 7656 + }, + { + "epoch": 0.38, + "grad_norm": 2.1776323318481445, + "learning_rate": 3.107614012550027e-05, + "loss": 5.7217, + "step": 7660 + }, + { + "epoch": 0.38, + "grad_norm": 1.5925564765930176, + "learning_rate": 3.1066258214338654e-05, + "loss": 5.6475, + "step": 7664 + }, + { + "epoch": 0.38, + "grad_norm": 1.862136721611023, + "learning_rate": 3.1056376303177036e-05, + "loss": 5.6022, + "step": 7668 + }, + { + "epoch": 0.38, + "grad_norm": 1.893431305885315, + "learning_rate": 3.104649439201542e-05, + "loss": 5.5991, + "step": 7672 + }, + { + "epoch": 0.38, + "grad_norm": 1.9323973655700684, + "learning_rate": 3.10366124808538e-05, + "loss": 5.737, + "step": 7676 + }, + { + "epoch": 0.38, + "grad_norm": 1.630057454109192, + "learning_rate": 3.102673056969218e-05, + "loss": 5.5885, + "step": 7680 + }, + { + "epoch": 0.38, + "grad_norm": 2.0866503715515137, + "learning_rate": 3.1016848658530565e-05, + "loss": 5.5873, + "step": 7684 + }, + { + "epoch": 0.38, + "grad_norm": 1.6265943050384521, + "learning_rate": 3.100696674736894e-05, + "loss": 5.6085, + "step": 7688 + }, + { + "epoch": 0.38, + "grad_norm": 2.116450309753418, + "learning_rate": 3.099708483620732e-05, + "loss": 5.6814, + "step": 7692 + }, + { + "epoch": 0.38, + "grad_norm": 1.8399298191070557, + "learning_rate": 3.0987202925045704e-05, + "loss": 5.6584, + "step": 7696 + }, + { + "epoch": 0.38, + "grad_norm": 1.8756877183914185, + "learning_rate": 3.0977321013884086e-05, + "loss": 5.7231, + "step": 7700 + }, + { + "epoch": 0.38, + "grad_norm": 1.9492945671081543, + "learning_rate": 3.096743910272247e-05, + "loss": 5.6126, + "step": 7704 + }, + { + "epoch": 0.38, + "grad_norm": 1.8977458477020264, + "learning_rate": 3.095755719156085e-05, + "loss": 5.4845, + "step": 7708 + }, + { + "epoch": 0.38, + "grad_norm": 1.920137643814087, + "learning_rate": 3.094767528039923e-05, + "loss": 5.6911, + "step": 7712 + }, + { + "epoch": 0.38, + "grad_norm": 1.9091439247131348, + "learning_rate": 3.093779336923761e-05, + "loss": 5.7215, + "step": 7716 + }, + { + "epoch": 0.38, + "grad_norm": 1.9468705654144287, + "learning_rate": 3.092791145807599e-05, + "loss": 5.5602, + "step": 7720 + }, + { + "epoch": 0.38, + "grad_norm": 2.171674966812134, + "learning_rate": 3.091802954691437e-05, + "loss": 5.6432, + "step": 7724 + }, + { + "epoch": 0.38, + "grad_norm": 1.7514827251434326, + "learning_rate": 3.0908147635752754e-05, + "loss": 5.7022, + "step": 7728 + }, + { + "epoch": 0.38, + "grad_norm": 1.6173917055130005, + "learning_rate": 3.0898265724591136e-05, + "loss": 5.6705, + "step": 7732 + }, + { + "epoch": 0.38, + "grad_norm": 1.7290568351745605, + "learning_rate": 3.088838381342952e-05, + "loss": 5.5759, + "step": 7736 + }, + { + "epoch": 0.38, + "grad_norm": 1.6534110307693481, + "learning_rate": 3.08785019022679e-05, + "loss": 5.7897, + "step": 7740 + }, + { + "epoch": 0.38, + "grad_norm": 2.1814112663269043, + "learning_rate": 3.086861999110628e-05, + "loss": 5.6196, + "step": 7744 + }, + { + "epoch": 0.38, + "grad_norm": 1.8534756898880005, + "learning_rate": 3.085873807994466e-05, + "loss": 5.5227, + "step": 7748 + }, + { + "epoch": 0.38, + "grad_norm": 1.9489648342132568, + "learning_rate": 3.0848856168783046e-05, + "loss": 5.647, + "step": 7752 + }, + { + "epoch": 0.38, + "grad_norm": 1.976659893989563, + "learning_rate": 3.083897425762143e-05, + "loss": 5.5386, + "step": 7756 + }, + { + "epoch": 0.38, + "grad_norm": 2.00985050201416, + "learning_rate": 3.082909234645981e-05, + "loss": 5.723, + "step": 7760 + }, + { + "epoch": 0.38, + "grad_norm": 1.8619848489761353, + "learning_rate": 3.081921043529819e-05, + "loss": 5.7385, + "step": 7764 + }, + { + "epoch": 0.38, + "grad_norm": 1.6431546211242676, + "learning_rate": 3.0809328524136574e-05, + "loss": 5.5918, + "step": 7768 + }, + { + "epoch": 0.38, + "grad_norm": 1.8953239917755127, + "learning_rate": 3.079944661297495e-05, + "loss": 5.6693, + "step": 7772 + }, + { + "epoch": 0.38, + "grad_norm": 2.1630382537841797, + "learning_rate": 3.078956470181333e-05, + "loss": 5.5721, + "step": 7776 + }, + { + "epoch": 0.38, + "grad_norm": 1.921631932258606, + "learning_rate": 3.0779682790651714e-05, + "loss": 5.7353, + "step": 7780 + }, + { + "epoch": 0.38, + "grad_norm": 1.9565848112106323, + "learning_rate": 3.0769800879490096e-05, + "loss": 5.6735, + "step": 7784 + }, + { + "epoch": 0.38, + "grad_norm": 2.029181718826294, + "learning_rate": 3.075991896832848e-05, + "loss": 5.5432, + "step": 7788 + }, + { + "epoch": 0.38, + "grad_norm": 1.7901182174682617, + "learning_rate": 3.075003705716686e-05, + "loss": 5.6048, + "step": 7792 + }, + { + "epoch": 0.39, + "grad_norm": 1.7717857360839844, + "learning_rate": 3.074015514600524e-05, + "loss": 5.5689, + "step": 7796 + }, + { + "epoch": 0.39, + "grad_norm": 1.8414852619171143, + "learning_rate": 3.073027323484362e-05, + "loss": 5.5628, + "step": 7800 + }, + { + "epoch": 0.39, + "grad_norm": 2.446169376373291, + "learning_rate": 3.0720391323682e-05, + "loss": 5.6246, + "step": 7804 + }, + { + "epoch": 0.39, + "grad_norm": 1.7307089567184448, + "learning_rate": 3.071050941252038e-05, + "loss": 5.6322, + "step": 7808 + }, + { + "epoch": 0.39, + "grad_norm": 2.0720410346984863, + "learning_rate": 3.0700627501358764e-05, + "loss": 5.6957, + "step": 7812 + }, + { + "epoch": 0.39, + "grad_norm": 1.8278499841690063, + "learning_rate": 3.0690745590197146e-05, + "loss": 5.6488, + "step": 7816 + }, + { + "epoch": 0.39, + "grad_norm": 1.9019477367401123, + "learning_rate": 3.068086367903553e-05, + "loss": 5.671, + "step": 7820 + }, + { + "epoch": 0.39, + "grad_norm": 2.330875873565674, + "learning_rate": 3.067098176787391e-05, + "loss": 5.7233, + "step": 7824 + }, + { + "epoch": 0.39, + "grad_norm": 1.958566427230835, + "learning_rate": 3.066109985671229e-05, + "loss": 5.5485, + "step": 7828 + }, + { + "epoch": 0.39, + "grad_norm": 1.9183526039123535, + "learning_rate": 3.065121794555067e-05, + "loss": 5.627, + "step": 7832 + }, + { + "epoch": 0.39, + "grad_norm": 2.0147671699523926, + "learning_rate": 3.064133603438905e-05, + "loss": 5.5493, + "step": 7836 + }, + { + "epoch": 0.39, + "grad_norm": 1.8255884647369385, + "learning_rate": 3.063145412322743e-05, + "loss": 5.618, + "step": 7840 + }, + { + "epoch": 0.39, + "grad_norm": 2.188514471054077, + "learning_rate": 3.062157221206581e-05, + "loss": 5.6623, + "step": 7844 + }, + { + "epoch": 0.39, + "grad_norm": 1.7055175304412842, + "learning_rate": 3.0611690300904195e-05, + "loss": 5.5607, + "step": 7848 + }, + { + "epoch": 0.39, + "grad_norm": 1.6150466203689575, + "learning_rate": 3.060180838974258e-05, + "loss": 5.5892, + "step": 7852 + }, + { + "epoch": 0.39, + "grad_norm": 1.7597399950027466, + "learning_rate": 3.059192647858096e-05, + "loss": 5.6028, + "step": 7856 + }, + { + "epoch": 0.39, + "grad_norm": 2.0006275177001953, + "learning_rate": 3.0582044567419335e-05, + "loss": 5.5031, + "step": 7860 + }, + { + "epoch": 0.39, + "grad_norm": 1.8812612295150757, + "learning_rate": 3.057216265625772e-05, + "loss": 5.5484, + "step": 7864 + }, + { + "epoch": 0.39, + "grad_norm": 1.7753983736038208, + "learning_rate": 3.0562280745096106e-05, + "loss": 5.661, + "step": 7868 + }, + { + "epoch": 0.39, + "grad_norm": 1.891542673110962, + "learning_rate": 3.055239883393449e-05, + "loss": 5.5273, + "step": 7872 + }, + { + "epoch": 0.39, + "grad_norm": 2.026078939437866, + "learning_rate": 3.054251692277287e-05, + "loss": 5.5718, + "step": 7876 + }, + { + "epoch": 0.39, + "grad_norm": 2.0182738304138184, + "learning_rate": 3.053263501161125e-05, + "loss": 5.694, + "step": 7880 + }, + { + "epoch": 0.39, + "grad_norm": 1.9878616333007812, + "learning_rate": 3.052275310044963e-05, + "loss": 5.6561, + "step": 7884 + }, + { + "epoch": 0.39, + "grad_norm": 1.9343925714492798, + "learning_rate": 3.0512871189288012e-05, + "loss": 5.6317, + "step": 7888 + }, + { + "epoch": 0.39, + "grad_norm": 1.8456355333328247, + "learning_rate": 3.050298927812639e-05, + "loss": 5.5619, + "step": 7892 + }, + { + "epoch": 0.39, + "grad_norm": 1.6567593812942505, + "learning_rate": 3.0493107366964773e-05, + "loss": 5.5108, + "step": 7896 + }, + { + "epoch": 0.39, + "grad_norm": 1.7682963609695435, + "learning_rate": 3.0483225455803155e-05, + "loss": 5.8091, + "step": 7900 + }, + { + "epoch": 0.39, + "grad_norm": 1.653235912322998, + "learning_rate": 3.0473343544641537e-05, + "loss": 5.5357, + "step": 7904 + }, + { + "epoch": 0.39, + "grad_norm": 1.7630318403244019, + "learning_rate": 3.0463461633479916e-05, + "loss": 5.6776, + "step": 7908 + }, + { + "epoch": 0.39, + "grad_norm": 2.0381624698638916, + "learning_rate": 3.0453579722318298e-05, + "loss": 5.6855, + "step": 7912 + }, + { + "epoch": 0.39, + "grad_norm": 1.6931108236312866, + "learning_rate": 3.044369781115668e-05, + "loss": 5.6519, + "step": 7916 + }, + { + "epoch": 0.39, + "grad_norm": 1.7193225622177124, + "learning_rate": 3.0433815899995062e-05, + "loss": 5.5361, + "step": 7920 + }, + { + "epoch": 0.39, + "grad_norm": 1.7203538417816162, + "learning_rate": 3.042393398883344e-05, + "loss": 5.5749, + "step": 7924 + }, + { + "epoch": 0.39, + "grad_norm": 2.0116119384765625, + "learning_rate": 3.0414052077671823e-05, + "loss": 5.478, + "step": 7928 + }, + { + "epoch": 0.39, + "grad_norm": 1.5771147012710571, + "learning_rate": 3.0404170166510205e-05, + "loss": 5.6352, + "step": 7932 + }, + { + "epoch": 0.39, + "grad_norm": 1.8036284446716309, + "learning_rate": 3.0394288255348584e-05, + "loss": 5.5748, + "step": 7936 + }, + { + "epoch": 0.39, + "grad_norm": 1.650652289390564, + "learning_rate": 3.0384406344186966e-05, + "loss": 5.6925, + "step": 7940 + }, + { + "epoch": 0.39, + "grad_norm": 1.871989369392395, + "learning_rate": 3.0374524433025348e-05, + "loss": 5.4965, + "step": 7944 + }, + { + "epoch": 0.39, + "grad_norm": 1.7644288539886475, + "learning_rate": 3.036464252186373e-05, + "loss": 5.521, + "step": 7948 + }, + { + "epoch": 0.39, + "grad_norm": 1.7584574222564697, + "learning_rate": 3.035476061070211e-05, + "loss": 5.4997, + "step": 7952 + }, + { + "epoch": 0.39, + "grad_norm": 1.6933388710021973, + "learning_rate": 3.034487869954049e-05, + "loss": 5.6415, + "step": 7956 + }, + { + "epoch": 0.39, + "grad_norm": 1.6390060186386108, + "learning_rate": 3.0334996788378873e-05, + "loss": 5.6538, + "step": 7960 + }, + { + "epoch": 0.39, + "grad_norm": 1.8131755590438843, + "learning_rate": 3.0325114877217255e-05, + "loss": 5.6138, + "step": 7964 + }, + { + "epoch": 0.39, + "grad_norm": 2.0270628929138184, + "learning_rate": 3.0315232966055633e-05, + "loss": 5.5461, + "step": 7968 + }, + { + "epoch": 0.39, + "grad_norm": 1.8326473236083984, + "learning_rate": 3.0305351054894015e-05, + "loss": 5.5499, + "step": 7972 + }, + { + "epoch": 0.39, + "grad_norm": 1.9747962951660156, + "learning_rate": 3.0295469143732397e-05, + "loss": 5.5334, + "step": 7976 + }, + { + "epoch": 0.39, + "grad_norm": 1.95182204246521, + "learning_rate": 3.028558723257078e-05, + "loss": 5.5317, + "step": 7980 + }, + { + "epoch": 0.39, + "grad_norm": 2.177966356277466, + "learning_rate": 3.0275705321409165e-05, + "loss": 5.5488, + "step": 7984 + }, + { + "epoch": 0.39, + "grad_norm": 2.0180881023406982, + "learning_rate": 3.0265823410247547e-05, + "loss": 5.5304, + "step": 7988 + }, + { + "epoch": 0.39, + "grad_norm": 1.7427732944488525, + "learning_rate": 3.0255941499085926e-05, + "loss": 5.6641, + "step": 7992 + }, + { + "epoch": 0.4, + "grad_norm": 1.7346999645233154, + "learning_rate": 3.0246059587924308e-05, + "loss": 5.6368, + "step": 7996 + }, + { + "epoch": 0.4, + "grad_norm": 2.121185302734375, + "learning_rate": 3.023617767676269e-05, + "loss": 5.5946, + "step": 8000 + }, + { + "epoch": 0.4, + "grad_norm": 1.9402865171432495, + "learning_rate": 3.0226295765601072e-05, + "loss": 5.679, + "step": 8004 + }, + { + "epoch": 0.4, + "grad_norm": 2.0164012908935547, + "learning_rate": 3.021641385443945e-05, + "loss": 5.5902, + "step": 8008 + }, + { + "epoch": 0.4, + "grad_norm": 1.8733190298080444, + "learning_rate": 3.0209002421068237e-05, + "loss": 5.532, + "step": 8012 + }, + { + "epoch": 0.4, + "grad_norm": 1.7538553476333618, + "learning_rate": 3.019912050990662e-05, + "loss": 5.6426, + "step": 8016 + }, + { + "epoch": 0.4, + "grad_norm": 1.886910319328308, + "learning_rate": 3.0189238598745e-05, + "loss": 5.5693, + "step": 8020 + }, + { + "epoch": 0.4, + "grad_norm": 1.9799367189407349, + "learning_rate": 3.0179356687583383e-05, + "loss": 5.7278, + "step": 8024 + }, + { + "epoch": 0.4, + "grad_norm": 1.9877070188522339, + "learning_rate": 3.0169474776421762e-05, + "loss": 5.5553, + "step": 8028 + }, + { + "epoch": 0.4, + "grad_norm": 2.2819747924804688, + "learning_rate": 3.0159592865260144e-05, + "loss": 5.5856, + "step": 8032 + }, + { + "epoch": 0.4, + "grad_norm": 1.671416997909546, + "learning_rate": 3.0149710954098526e-05, + "loss": 5.588, + "step": 8036 + }, + { + "epoch": 0.4, + "grad_norm": 1.724274754524231, + "learning_rate": 3.0139829042936908e-05, + "loss": 5.5928, + "step": 8040 + }, + { + "epoch": 0.4, + "grad_norm": 1.6406031847000122, + "learning_rate": 3.0129947131775287e-05, + "loss": 5.6456, + "step": 8044 + }, + { + "epoch": 0.4, + "grad_norm": 1.9587162733078003, + "learning_rate": 3.012006522061367e-05, + "loss": 5.6093, + "step": 8048 + }, + { + "epoch": 0.4, + "grad_norm": 1.987953543663025, + "learning_rate": 3.011018330945205e-05, + "loss": 5.8002, + "step": 8052 + }, + { + "epoch": 0.4, + "grad_norm": 1.800836443901062, + "learning_rate": 3.0100301398290433e-05, + "loss": 5.7093, + "step": 8056 + }, + { + "epoch": 0.4, + "grad_norm": 1.9985570907592773, + "learning_rate": 3.009041948712881e-05, + "loss": 5.6896, + "step": 8060 + }, + { + "epoch": 0.4, + "grad_norm": 2.0569660663604736, + "learning_rate": 3.0080537575967194e-05, + "loss": 5.7986, + "step": 8064 + }, + { + "epoch": 0.4, + "grad_norm": 1.8365398645401, + "learning_rate": 3.0070655664805576e-05, + "loss": 5.4791, + "step": 8068 + }, + { + "epoch": 0.4, + "grad_norm": 1.890994668006897, + "learning_rate": 3.0060773753643954e-05, + "loss": 5.5919, + "step": 8072 + }, + { + "epoch": 0.4, + "grad_norm": 1.8914673328399658, + "learning_rate": 3.0050891842482337e-05, + "loss": 5.6298, + "step": 8076 + }, + { + "epoch": 0.4, + "grad_norm": 2.059929132461548, + "learning_rate": 3.004100993132072e-05, + "loss": 5.589, + "step": 8080 + }, + { + "epoch": 0.4, + "grad_norm": 1.552819013595581, + "learning_rate": 3.00311280201591e-05, + "loss": 5.6104, + "step": 8084 + }, + { + "epoch": 0.4, + "grad_norm": 2.081338882446289, + "learning_rate": 3.002124610899748e-05, + "loss": 5.6061, + "step": 8088 + }, + { + "epoch": 0.4, + "grad_norm": 1.625524640083313, + "learning_rate": 3.001136419783586e-05, + "loss": 5.6501, + "step": 8092 + }, + { + "epoch": 0.4, + "grad_norm": 1.818798303604126, + "learning_rate": 3.0001482286674243e-05, + "loss": 5.6345, + "step": 8096 + }, + { + "epoch": 0.4, + "grad_norm": 1.9189039468765259, + "learning_rate": 2.9991600375512626e-05, + "loss": 5.6594, + "step": 8100 + }, + { + "epoch": 0.4, + "grad_norm": 1.9323093891143799, + "learning_rate": 2.9981718464351004e-05, + "loss": 5.6704, + "step": 8104 + }, + { + "epoch": 0.4, + "grad_norm": 1.9228005409240723, + "learning_rate": 2.9971836553189386e-05, + "loss": 5.7182, + "step": 8108 + }, + { + "epoch": 0.4, + "grad_norm": 1.8709877729415894, + "learning_rate": 2.9961954642027768e-05, + "loss": 5.6675, + "step": 8112 + }, + { + "epoch": 0.4, + "grad_norm": 2.015131711959839, + "learning_rate": 2.9952072730866147e-05, + "loss": 5.5459, + "step": 8116 + }, + { + "epoch": 0.4, + "grad_norm": 2.3515000343322754, + "learning_rate": 2.994219081970453e-05, + "loss": 5.6607, + "step": 8120 + }, + { + "epoch": 0.4, + "grad_norm": 1.6969714164733887, + "learning_rate": 2.993230890854291e-05, + "loss": 5.5014, + "step": 8124 + }, + { + "epoch": 0.4, + "grad_norm": 1.8755816221237183, + "learning_rate": 2.9922426997381297e-05, + "loss": 5.6539, + "step": 8128 + }, + { + "epoch": 0.4, + "grad_norm": 1.724385380744934, + "learning_rate": 2.991254508621968e-05, + "loss": 5.6172, + "step": 8132 + }, + { + "epoch": 0.4, + "grad_norm": 1.9022701978683472, + "learning_rate": 2.990266317505806e-05, + "loss": 5.6263, + "step": 8136 + }, + { + "epoch": 0.4, + "grad_norm": 1.7855650186538696, + "learning_rate": 2.9892781263896443e-05, + "loss": 5.558, + "step": 8140 + }, + { + "epoch": 0.4, + "grad_norm": 1.6109340190887451, + "learning_rate": 2.988289935273482e-05, + "loss": 5.5173, + "step": 8144 + }, + { + "epoch": 0.4, + "grad_norm": 1.7251406908035278, + "learning_rate": 2.9873017441573203e-05, + "loss": 5.5703, + "step": 8148 + }, + { + "epoch": 0.4, + "grad_norm": 1.8107340335845947, + "learning_rate": 2.9863135530411586e-05, + "loss": 5.6168, + "step": 8152 + }, + { + "epoch": 0.4, + "grad_norm": 1.5449600219726562, + "learning_rate": 2.9853253619249964e-05, + "loss": 5.4385, + "step": 8156 + }, + { + "epoch": 0.4, + "grad_norm": 1.950967788696289, + "learning_rate": 2.9843371708088346e-05, + "loss": 5.6621, + "step": 8160 + }, + { + "epoch": 0.4, + "grad_norm": 2.0183985233306885, + "learning_rate": 2.983348979692673e-05, + "loss": 5.5622, + "step": 8164 + }, + { + "epoch": 0.4, + "grad_norm": 2.118438243865967, + "learning_rate": 2.982360788576511e-05, + "loss": 5.7116, + "step": 8168 + }, + { + "epoch": 0.4, + "grad_norm": 2.065899610519409, + "learning_rate": 2.981372597460349e-05, + "loss": 5.7335, + "step": 8172 + }, + { + "epoch": 0.4, + "grad_norm": 2.075225830078125, + "learning_rate": 2.980384406344187e-05, + "loss": 5.4978, + "step": 8176 + }, + { + "epoch": 0.4, + "grad_norm": 1.7782163619995117, + "learning_rate": 2.9793962152280253e-05, + "loss": 5.6495, + "step": 8180 + }, + { + "epoch": 0.4, + "grad_norm": 1.8186616897583008, + "learning_rate": 2.9784080241118635e-05, + "loss": 5.5176, + "step": 8184 + }, + { + "epoch": 0.4, + "grad_norm": 1.926069736480713, + "learning_rate": 2.9774198329957014e-05, + "loss": 5.7383, + "step": 8188 + }, + { + "epoch": 0.4, + "grad_norm": 1.8002229928970337, + "learning_rate": 2.9764316418795396e-05, + "loss": 5.6145, + "step": 8192 + }, + { + "epoch": 0.4, + "grad_norm": 1.5681378841400146, + "learning_rate": 2.9754434507633778e-05, + "loss": 5.634, + "step": 8196 + }, + { + "epoch": 0.41, + "grad_norm": 2.045126438140869, + "learning_rate": 2.9744552596472157e-05, + "loss": 5.5229, + "step": 8200 + }, + { + "epoch": 0.41, + "grad_norm": 2.116232395172119, + "learning_rate": 2.973467068531054e-05, + "loss": 5.6309, + "step": 8204 + }, + { + "epoch": 0.41, + "grad_norm": 1.5766971111297607, + "learning_rate": 2.972478877414892e-05, + "loss": 5.5052, + "step": 8208 + }, + { + "epoch": 0.41, + "grad_norm": 1.6305807828903198, + "learning_rate": 2.9714906862987303e-05, + "loss": 5.5024, + "step": 8212 + }, + { + "epoch": 0.41, + "grad_norm": 1.7745639085769653, + "learning_rate": 2.970502495182568e-05, + "loss": 5.6619, + "step": 8216 + }, + { + "epoch": 0.41, + "grad_norm": 1.708950400352478, + "learning_rate": 2.9695143040664064e-05, + "loss": 5.634, + "step": 8220 + }, + { + "epoch": 0.41, + "grad_norm": 1.8508306741714478, + "learning_rate": 2.9685261129502446e-05, + "loss": 5.7091, + "step": 8224 + }, + { + "epoch": 0.41, + "grad_norm": 1.9124394655227661, + "learning_rate": 2.9675379218340828e-05, + "loss": 5.6441, + "step": 8228 + }, + { + "epoch": 0.41, + "grad_norm": 1.735961675643921, + "learning_rate": 2.9665497307179206e-05, + "loss": 5.5941, + "step": 8232 + }, + { + "epoch": 0.41, + "grad_norm": 2.0475118160247803, + "learning_rate": 2.965561539601759e-05, + "loss": 5.5674, + "step": 8236 + }, + { + "epoch": 0.41, + "grad_norm": 1.8113858699798584, + "learning_rate": 2.964573348485597e-05, + "loss": 5.6697, + "step": 8240 + }, + { + "epoch": 0.41, + "grad_norm": 1.9009729623794556, + "learning_rate": 2.9635851573694356e-05, + "loss": 5.5668, + "step": 8244 + }, + { + "epoch": 0.41, + "grad_norm": 1.8220778703689575, + "learning_rate": 2.9625969662532738e-05, + "loss": 5.6735, + "step": 8248 + }, + { + "epoch": 0.41, + "grad_norm": 1.7952344417572021, + "learning_rate": 2.961608775137112e-05, + "loss": 5.5436, + "step": 8252 + }, + { + "epoch": 0.41, + "grad_norm": 1.8756263256072998, + "learning_rate": 2.96062058402095e-05, + "loss": 5.5825, + "step": 8256 + }, + { + "epoch": 0.41, + "grad_norm": 1.8858979940414429, + "learning_rate": 2.959632392904788e-05, + "loss": 5.6039, + "step": 8260 + }, + { + "epoch": 0.41, + "grad_norm": 1.780319333076477, + "learning_rate": 2.9586442017886263e-05, + "loss": 5.6614, + "step": 8264 + }, + { + "epoch": 0.41, + "grad_norm": 2.561244487762451, + "learning_rate": 2.9576560106724645e-05, + "loss": 5.6054, + "step": 8268 + }, + { + "epoch": 0.41, + "grad_norm": 1.9080662727355957, + "learning_rate": 2.9566678195563024e-05, + "loss": 5.5438, + "step": 8272 + }, + { + "epoch": 0.41, + "grad_norm": 1.8043930530548096, + "learning_rate": 2.9556796284401406e-05, + "loss": 5.5615, + "step": 8276 + }, + { + "epoch": 0.41, + "grad_norm": 1.8665918111801147, + "learning_rate": 2.9546914373239788e-05, + "loss": 5.6224, + "step": 8280 + }, + { + "epoch": 0.41, + "grad_norm": 1.7628755569458008, + "learning_rate": 2.9537032462078166e-05, + "loss": 5.648, + "step": 8284 + }, + { + "epoch": 0.41, + "grad_norm": 1.5806515216827393, + "learning_rate": 2.952715055091655e-05, + "loss": 5.6113, + "step": 8288 + }, + { + "epoch": 0.41, + "grad_norm": 1.7797776460647583, + "learning_rate": 2.951726863975493e-05, + "loss": 5.6718, + "step": 8292 + }, + { + "epoch": 0.41, + "grad_norm": 1.85147225856781, + "learning_rate": 2.9507386728593313e-05, + "loss": 5.571, + "step": 8296 + }, + { + "epoch": 0.41, + "grad_norm": 1.8725066184997559, + "learning_rate": 2.949750481743169e-05, + "loss": 5.6677, + "step": 8300 + }, + { + "epoch": 0.41, + "grad_norm": 1.6381113529205322, + "learning_rate": 2.9487622906270073e-05, + "loss": 5.5314, + "step": 8304 + }, + { + "epoch": 0.41, + "grad_norm": 1.7837311029434204, + "learning_rate": 2.9477740995108455e-05, + "loss": 5.6066, + "step": 8308 + }, + { + "epoch": 0.41, + "grad_norm": 1.7460732460021973, + "learning_rate": 2.9467859083946837e-05, + "loss": 5.5304, + "step": 8312 + }, + { + "epoch": 0.41, + "grad_norm": 1.7592207193374634, + "learning_rate": 2.9457977172785216e-05, + "loss": 5.5576, + "step": 8316 + }, + { + "epoch": 0.41, + "grad_norm": 1.7295989990234375, + "learning_rate": 2.9448095261623598e-05, + "loss": 5.6019, + "step": 8320 + }, + { + "epoch": 0.41, + "grad_norm": 2.4635212421417236, + "learning_rate": 2.943821335046198e-05, + "loss": 5.6384, + "step": 8324 + }, + { + "epoch": 0.41, + "grad_norm": 1.829713225364685, + "learning_rate": 2.9428331439300362e-05, + "loss": 5.6704, + "step": 8328 + }, + { + "epoch": 0.41, + "grad_norm": 2.121614694595337, + "learning_rate": 2.941844952813874e-05, + "loss": 5.6339, + "step": 8332 + }, + { + "epoch": 0.41, + "grad_norm": 1.783610224723816, + "learning_rate": 2.9408567616977123e-05, + "loss": 5.6805, + "step": 8336 + }, + { + "epoch": 0.41, + "grad_norm": 2.0255215167999268, + "learning_rate": 2.9398685705815505e-05, + "loss": 5.667, + "step": 8340 + }, + { + "epoch": 0.41, + "grad_norm": 1.8871128559112549, + "learning_rate": 2.9388803794653884e-05, + "loss": 5.6883, + "step": 8344 + }, + { + "epoch": 0.41, + "grad_norm": 1.9648714065551758, + "learning_rate": 2.9378921883492266e-05, + "loss": 5.6178, + "step": 8348 + }, + { + "epoch": 0.41, + "grad_norm": 1.8247488737106323, + "learning_rate": 2.9369039972330648e-05, + "loss": 5.675, + "step": 8352 + }, + { + "epoch": 0.41, + "grad_norm": 1.8563957214355469, + "learning_rate": 2.9359158061169033e-05, + "loss": 5.6308, + "step": 8356 + }, + { + "epoch": 0.41, + "grad_norm": 1.8070034980773926, + "learning_rate": 2.9349276150007415e-05, + "loss": 5.6328, + "step": 8360 + }, + { + "epoch": 0.41, + "grad_norm": 1.8198412656784058, + "learning_rate": 2.9339394238845797e-05, + "loss": 5.6216, + "step": 8364 + }, + { + "epoch": 0.41, + "grad_norm": 1.7104226350784302, + "learning_rate": 2.9329512327684176e-05, + "loss": 5.6402, + "step": 8368 + }, + { + "epoch": 0.41, + "grad_norm": 1.6858009099960327, + "learning_rate": 2.9319630416522558e-05, + "loss": 5.6666, + "step": 8372 + }, + { + "epoch": 0.41, + "grad_norm": 1.978938102722168, + "learning_rate": 2.930974850536094e-05, + "loss": 5.4985, + "step": 8376 + }, + { + "epoch": 0.41, + "grad_norm": 1.6124422550201416, + "learning_rate": 2.9299866594199322e-05, + "loss": 5.478, + "step": 8380 + }, + { + "epoch": 0.41, + "grad_norm": 2.06709623336792, + "learning_rate": 2.92899846830377e-05, + "loss": 5.5993, + "step": 8384 + }, + { + "epoch": 0.41, + "grad_norm": 2.138789415359497, + "learning_rate": 2.9280102771876083e-05, + "loss": 5.6381, + "step": 8388 + }, + { + "epoch": 0.41, + "grad_norm": 1.7755143642425537, + "learning_rate": 2.9270220860714465e-05, + "loss": 5.6309, + "step": 8392 + }, + { + "epoch": 0.41, + "grad_norm": 2.0836946964263916, + "learning_rate": 2.9260338949552847e-05, + "loss": 5.5439, + "step": 8396 + }, + { + "epoch": 0.42, + "grad_norm": 1.8185824155807495, + "learning_rate": 2.9250457038391226e-05, + "loss": 5.5819, + "step": 8400 + }, + { + "epoch": 0.42, + "grad_norm": 1.9818052053451538, + "learning_rate": 2.9240575127229608e-05, + "loss": 5.6349, + "step": 8404 + }, + { + "epoch": 0.42, + "grad_norm": 2.246522903442383, + "learning_rate": 2.923069321606799e-05, + "loss": 5.6007, + "step": 8408 + }, + { + "epoch": 0.42, + "grad_norm": 1.9953023195266724, + "learning_rate": 2.922081130490637e-05, + "loss": 5.643, + "step": 8412 + }, + { + "epoch": 0.42, + "grad_norm": 1.9882394075393677, + "learning_rate": 2.921092939374475e-05, + "loss": 5.6082, + "step": 8416 + }, + { + "epoch": 0.42, + "grad_norm": 1.7919508218765259, + "learning_rate": 2.9201047482583133e-05, + "loss": 5.6575, + "step": 8420 + }, + { + "epoch": 0.42, + "grad_norm": 1.6816109418869019, + "learning_rate": 2.9191165571421515e-05, + "loss": 5.6128, + "step": 8424 + }, + { + "epoch": 0.42, + "grad_norm": 1.9026116132736206, + "learning_rate": 2.9181283660259893e-05, + "loss": 5.5993, + "step": 8428 + }, + { + "epoch": 0.42, + "grad_norm": 2.1336424350738525, + "learning_rate": 2.9171401749098276e-05, + "loss": 5.5573, + "step": 8432 + }, + { + "epoch": 0.42, + "grad_norm": 1.6572264432907104, + "learning_rate": 2.9161519837936658e-05, + "loss": 5.7065, + "step": 8436 + }, + { + "epoch": 0.42, + "grad_norm": 2.1240315437316895, + "learning_rate": 2.915163792677504e-05, + "loss": 5.5141, + "step": 8440 + }, + { + "epoch": 0.42, + "grad_norm": 1.728661298751831, + "learning_rate": 2.914175601561342e-05, + "loss": 5.621, + "step": 8444 + }, + { + "epoch": 0.42, + "grad_norm": 2.108468770980835, + "learning_rate": 2.91318741044518e-05, + "loss": 5.5439, + "step": 8448 + }, + { + "epoch": 0.42, + "grad_norm": 1.912477731704712, + "learning_rate": 2.9121992193290182e-05, + "loss": 5.6222, + "step": 8452 + }, + { + "epoch": 0.42, + "grad_norm": 1.753421425819397, + "learning_rate": 2.9112110282128565e-05, + "loss": 5.6209, + "step": 8456 + }, + { + "epoch": 0.42, + "grad_norm": 1.7683593034744263, + "learning_rate": 2.9102228370966943e-05, + "loss": 5.6178, + "step": 8460 + }, + { + "epoch": 0.42, + "grad_norm": 1.8550388813018799, + "learning_rate": 2.9092346459805325e-05, + "loss": 5.6337, + "step": 8464 + }, + { + "epoch": 0.42, + "grad_norm": 1.6444224119186401, + "learning_rate": 2.9082464548643707e-05, + "loss": 5.7698, + "step": 8468 + }, + { + "epoch": 0.42, + "grad_norm": 1.868353247642517, + "learning_rate": 2.9072582637482093e-05, + "loss": 5.8062, + "step": 8472 + }, + { + "epoch": 0.42, + "grad_norm": 1.8483880758285522, + "learning_rate": 2.9062700726320475e-05, + "loss": 5.5449, + "step": 8476 + }, + { + "epoch": 0.42, + "grad_norm": 2.059861660003662, + "learning_rate": 2.9052818815158857e-05, + "loss": 5.5988, + "step": 8480 + }, + { + "epoch": 0.42, + "grad_norm": 1.7102926969528198, + "learning_rate": 2.9042936903997236e-05, + "loss": 5.5771, + "step": 8484 + }, + { + "epoch": 0.42, + "grad_norm": 1.8607144355773926, + "learning_rate": 2.9033054992835618e-05, + "loss": 5.4982, + "step": 8488 + }, + { + "epoch": 0.42, + "grad_norm": 1.9258161783218384, + "learning_rate": 2.9023173081674e-05, + "loss": 5.6881, + "step": 8492 + }, + { + "epoch": 0.42, + "grad_norm": 1.688467264175415, + "learning_rate": 2.901329117051238e-05, + "loss": 5.6507, + "step": 8496 + }, + { + "epoch": 0.42, + "grad_norm": 1.7011233568191528, + "learning_rate": 2.900340925935076e-05, + "loss": 5.59, + "step": 8500 + }, + { + "epoch": 0.42, + "grad_norm": 1.6969784498214722, + "learning_rate": 2.8993527348189142e-05, + "loss": 5.6226, + "step": 8504 + }, + { + "epoch": 0.42, + "grad_norm": 1.669765830039978, + "learning_rate": 2.8983645437027525e-05, + "loss": 5.5824, + "step": 8508 + }, + { + "epoch": 0.42, + "grad_norm": 1.9257421493530273, + "learning_rate": 2.8973763525865903e-05, + "loss": 5.5598, + "step": 8512 + }, + { + "epoch": 0.42, + "grad_norm": 2.1687793731689453, + "learning_rate": 2.8963881614704285e-05, + "loss": 5.7037, + "step": 8516 + }, + { + "epoch": 0.42, + "grad_norm": 1.5989969968795776, + "learning_rate": 2.8953999703542667e-05, + "loss": 5.5294, + "step": 8520 + }, + { + "epoch": 0.42, + "grad_norm": 1.8674973249435425, + "learning_rate": 2.894411779238105e-05, + "loss": 5.4714, + "step": 8524 + }, + { + "epoch": 0.42, + "grad_norm": 1.83799409866333, + "learning_rate": 2.8934235881219428e-05, + "loss": 5.6442, + "step": 8528 + }, + { + "epoch": 0.42, + "grad_norm": 2.0756680965423584, + "learning_rate": 2.892435397005781e-05, + "loss": 5.6252, + "step": 8532 + }, + { + "epoch": 0.42, + "grad_norm": 2.1030049324035645, + "learning_rate": 2.8914472058896192e-05, + "loss": 5.5707, + "step": 8536 + }, + { + "epoch": 0.42, + "grad_norm": 1.740062952041626, + "learning_rate": 2.8904590147734574e-05, + "loss": 5.6083, + "step": 8540 + }, + { + "epoch": 0.42, + "grad_norm": 1.9821360111236572, + "learning_rate": 2.8894708236572953e-05, + "loss": 5.5575, + "step": 8544 + }, + { + "epoch": 0.42, + "grad_norm": 2.0628159046173096, + "learning_rate": 2.8884826325411335e-05, + "loss": 5.5519, + "step": 8548 + }, + { + "epoch": 0.42, + "grad_norm": 2.3252205848693848, + "learning_rate": 2.8874944414249717e-05, + "loss": 5.6204, + "step": 8552 + }, + { + "epoch": 0.42, + "grad_norm": 1.7173527479171753, + "learning_rate": 2.8865062503088096e-05, + "loss": 5.5685, + "step": 8556 + }, + { + "epoch": 0.42, + "grad_norm": 1.7327337265014648, + "learning_rate": 2.8855180591926478e-05, + "loss": 5.6208, + "step": 8560 + }, + { + "epoch": 0.42, + "grad_norm": 1.7945717573165894, + "learning_rate": 2.884529868076486e-05, + "loss": 5.6034, + "step": 8564 + }, + { + "epoch": 0.42, + "grad_norm": 1.8818703889846802, + "learning_rate": 2.8835416769603242e-05, + "loss": 5.5966, + "step": 8568 + }, + { + "epoch": 0.42, + "grad_norm": 1.780110478401184, + "learning_rate": 2.882553485844162e-05, + "loss": 5.6438, + "step": 8572 + }, + { + "epoch": 0.42, + "grad_norm": 1.9500248432159424, + "learning_rate": 2.8815652947280003e-05, + "loss": 5.6569, + "step": 8576 + }, + { + "epoch": 0.42, + "grad_norm": 2.0697216987609863, + "learning_rate": 2.8805771036118385e-05, + "loss": 5.5631, + "step": 8580 + }, + { + "epoch": 0.42, + "grad_norm": 1.8026626110076904, + "learning_rate": 2.8795889124956767e-05, + "loss": 5.6308, + "step": 8584 + }, + { + "epoch": 0.42, + "grad_norm": 1.6089578866958618, + "learning_rate": 2.8786007213795152e-05, + "loss": 5.5297, + "step": 8588 + }, + { + "epoch": 0.42, + "grad_norm": 1.8671869039535522, + "learning_rate": 2.8776125302633534e-05, + "loss": 5.5388, + "step": 8592 + }, + { + "epoch": 0.42, + "grad_norm": 1.7763539552688599, + "learning_rate": 2.8766243391471913e-05, + "loss": 5.6698, + "step": 8596 + }, + { + "epoch": 0.42, + "grad_norm": 1.6824060678482056, + "learning_rate": 2.8756361480310295e-05, + "loss": 5.6613, + "step": 8600 + }, + { + "epoch": 0.43, + "grad_norm": 1.9802032709121704, + "learning_rate": 2.8746479569148677e-05, + "loss": 5.6713, + "step": 8604 + }, + { + "epoch": 0.43, + "grad_norm": 1.8129801750183105, + "learning_rate": 2.873659765798706e-05, + "loss": 5.5339, + "step": 8608 + }, + { + "epoch": 0.43, + "grad_norm": 2.0134878158569336, + "learning_rate": 2.8726715746825438e-05, + "loss": 5.5781, + "step": 8612 + }, + { + "epoch": 0.43, + "grad_norm": 1.7483782768249512, + "learning_rate": 2.871683383566382e-05, + "loss": 5.5785, + "step": 8616 + }, + { + "epoch": 0.43, + "grad_norm": 2.1585028171539307, + "learning_rate": 2.8706951924502202e-05, + "loss": 5.5063, + "step": 8620 + }, + { + "epoch": 0.43, + "grad_norm": 2.0951032638549805, + "learning_rate": 2.8697070013340584e-05, + "loss": 5.5559, + "step": 8624 + }, + { + "epoch": 0.43, + "grad_norm": 1.7235571146011353, + "learning_rate": 2.8687188102178963e-05, + "loss": 5.6181, + "step": 8628 + }, + { + "epoch": 0.43, + "grad_norm": 2.0943381786346436, + "learning_rate": 2.8677306191017345e-05, + "loss": 5.5952, + "step": 8632 + }, + { + "epoch": 0.43, + "grad_norm": 1.9712262153625488, + "learning_rate": 2.8667424279855727e-05, + "loss": 5.6672, + "step": 8636 + }, + { + "epoch": 0.43, + "grad_norm": 1.837053894996643, + "learning_rate": 2.8657542368694105e-05, + "loss": 5.6023, + "step": 8640 + }, + { + "epoch": 0.43, + "grad_norm": 1.9455965757369995, + "learning_rate": 2.8647660457532487e-05, + "loss": 5.6813, + "step": 8644 + }, + { + "epoch": 0.43, + "grad_norm": 1.9085052013397217, + "learning_rate": 2.863777854637087e-05, + "loss": 5.5879, + "step": 8648 + }, + { + "epoch": 0.43, + "grad_norm": 2.043121814727783, + "learning_rate": 2.862789663520925e-05, + "loss": 5.484, + "step": 8652 + }, + { + "epoch": 0.43, + "grad_norm": 2.043996572494507, + "learning_rate": 2.861801472404763e-05, + "loss": 5.5774, + "step": 8656 + }, + { + "epoch": 0.43, + "grad_norm": 2.140770673751831, + "learning_rate": 2.8608132812886012e-05, + "loss": 5.5304, + "step": 8660 + }, + { + "epoch": 0.43, + "grad_norm": 2.25034499168396, + "learning_rate": 2.8598250901724394e-05, + "loss": 5.4963, + "step": 8664 + }, + { + "epoch": 0.43, + "grad_norm": 1.623706340789795, + "learning_rate": 2.8588368990562776e-05, + "loss": 5.5669, + "step": 8668 + }, + { + "epoch": 0.43, + "grad_norm": 1.8031169176101685, + "learning_rate": 2.8578487079401155e-05, + "loss": 5.5762, + "step": 8672 + }, + { + "epoch": 0.43, + "grad_norm": 2.0662543773651123, + "learning_rate": 2.8568605168239537e-05, + "loss": 5.4806, + "step": 8676 + }, + { + "epoch": 0.43, + "grad_norm": 2.1524598598480225, + "learning_rate": 2.855872325707792e-05, + "loss": 5.6503, + "step": 8680 + }, + { + "epoch": 0.43, + "grad_norm": 1.9902993440628052, + "learning_rate": 2.8548841345916298e-05, + "loss": 5.4978, + "step": 8684 + }, + { + "epoch": 0.43, + "grad_norm": 1.8020424842834473, + "learning_rate": 2.853895943475468e-05, + "loss": 5.6205, + "step": 8688 + }, + { + "epoch": 0.43, + "grad_norm": 1.732035517692566, + "learning_rate": 2.8529077523593062e-05, + "loss": 5.4912, + "step": 8692 + }, + { + "epoch": 0.43, + "grad_norm": 1.8738574981689453, + "learning_rate": 2.8519195612431444e-05, + "loss": 5.5621, + "step": 8696 + }, + { + "epoch": 0.43, + "grad_norm": 1.9714980125427246, + "learning_rate": 2.850931370126983e-05, + "loss": 5.6087, + "step": 8700 + }, + { + "epoch": 0.43, + "grad_norm": 2.0756115913391113, + "learning_rate": 2.849943179010821e-05, + "loss": 5.65, + "step": 8704 + }, + { + "epoch": 0.43, + "grad_norm": 2.104881763458252, + "learning_rate": 2.8489549878946594e-05, + "loss": 5.6053, + "step": 8708 + }, + { + "epoch": 0.43, + "grad_norm": 1.7442312240600586, + "learning_rate": 2.8479667967784972e-05, + "loss": 5.6199, + "step": 8712 + }, + { + "epoch": 0.43, + "grad_norm": 2.0308890342712402, + "learning_rate": 2.8469786056623354e-05, + "loss": 5.7298, + "step": 8716 + }, + { + "epoch": 0.43, + "grad_norm": 1.7256563901901245, + "learning_rate": 2.8459904145461736e-05, + "loss": 5.4541, + "step": 8720 + }, + { + "epoch": 0.43, + "grad_norm": 2.112795352935791, + "learning_rate": 2.8450022234300115e-05, + "loss": 5.5393, + "step": 8724 + }, + { + "epoch": 0.43, + "grad_norm": 1.886513113975525, + "learning_rate": 2.8440140323138497e-05, + "loss": 5.6014, + "step": 8728 + }, + { + "epoch": 0.43, + "grad_norm": 1.9806932210922241, + "learning_rate": 2.843025841197688e-05, + "loss": 5.6828, + "step": 8732 + }, + { + "epoch": 0.43, + "grad_norm": 2.23341965675354, + "learning_rate": 2.842037650081526e-05, + "loss": 5.7338, + "step": 8736 + }, + { + "epoch": 0.43, + "grad_norm": 1.7140839099884033, + "learning_rate": 2.841049458965364e-05, + "loss": 5.6041, + "step": 8740 + }, + { + "epoch": 0.43, + "grad_norm": 1.7378863096237183, + "learning_rate": 2.8400612678492022e-05, + "loss": 5.5925, + "step": 8744 + }, + { + "epoch": 0.43, + "grad_norm": 1.7445762157440186, + "learning_rate": 2.8390730767330404e-05, + "loss": 5.5118, + "step": 8748 + }, + { + "epoch": 0.43, + "grad_norm": 1.9341727495193481, + "learning_rate": 2.8380848856168786e-05, + "loss": 5.6059, + "step": 8752 + }, + { + "epoch": 0.43, + "grad_norm": 2.1164705753326416, + "learning_rate": 2.8370966945007165e-05, + "loss": 5.7005, + "step": 8756 + }, + { + "epoch": 0.43, + "grad_norm": 1.7062251567840576, + "learning_rate": 2.8361085033845547e-05, + "loss": 5.5896, + "step": 8760 + }, + { + "epoch": 0.43, + "grad_norm": 1.8553556203842163, + "learning_rate": 2.835120312268393e-05, + "loss": 5.6167, + "step": 8764 + }, + { + "epoch": 0.43, + "grad_norm": 1.7515733242034912, + "learning_rate": 2.8341321211522308e-05, + "loss": 5.5941, + "step": 8768 + }, + { + "epoch": 0.43, + "grad_norm": 1.8316551446914673, + "learning_rate": 2.833143930036069e-05, + "loss": 5.4873, + "step": 8772 + }, + { + "epoch": 0.43, + "grad_norm": 1.9083267450332642, + "learning_rate": 2.8321557389199072e-05, + "loss": 5.6866, + "step": 8776 + }, + { + "epoch": 0.43, + "grad_norm": 1.4909923076629639, + "learning_rate": 2.8311675478037454e-05, + "loss": 5.5345, + "step": 8780 + }, + { + "epoch": 0.43, + "grad_norm": 1.7641175985336304, + "learning_rate": 2.8301793566875832e-05, + "loss": 5.6514, + "step": 8784 + }, + { + "epoch": 0.43, + "grad_norm": 1.8059598207473755, + "learning_rate": 2.8291911655714215e-05, + "loss": 5.4931, + "step": 8788 + }, + { + "epoch": 0.43, + "grad_norm": 1.8962539434432983, + "learning_rate": 2.8282029744552597e-05, + "loss": 5.5765, + "step": 8792 + }, + { + "epoch": 0.43, + "grad_norm": 2.188370943069458, + "learning_rate": 2.827214783339098e-05, + "loss": 5.6586, + "step": 8796 + }, + { + "epoch": 0.43, + "grad_norm": 1.9519965648651123, + "learning_rate": 2.8262265922229357e-05, + "loss": 5.6691, + "step": 8800 + }, + { + "epoch": 0.44, + "grad_norm": 1.9207426309585571, + "learning_rate": 2.825238401106774e-05, + "loss": 5.7145, + "step": 8804 + }, + { + "epoch": 0.44, + "grad_norm": 1.8392800092697144, + "learning_rate": 2.824250209990612e-05, + "loss": 5.5793, + "step": 8808 + }, + { + "epoch": 0.44, + "grad_norm": 2.228625774383545, + "learning_rate": 2.8232620188744503e-05, + "loss": 5.6521, + "step": 8812 + }, + { + "epoch": 0.44, + "grad_norm": 2.18316650390625, + "learning_rate": 2.822273827758289e-05, + "loss": 5.7501, + "step": 8816 + }, + { + "epoch": 0.44, + "grad_norm": 2.011134147644043, + "learning_rate": 2.821285636642127e-05, + "loss": 5.6124, + "step": 8820 + }, + { + "epoch": 0.44, + "grad_norm": 1.9527933597564697, + "learning_rate": 2.820297445525965e-05, + "loss": 5.4269, + "step": 8824 + }, + { + "epoch": 0.44, + "grad_norm": 1.6352803707122803, + "learning_rate": 2.8193092544098032e-05, + "loss": 5.7361, + "step": 8828 + }, + { + "epoch": 0.44, + "grad_norm": 1.889510154724121, + "learning_rate": 2.8183210632936414e-05, + "loss": 5.5663, + "step": 8832 + }, + { + "epoch": 0.44, + "grad_norm": 1.8308645486831665, + "learning_rate": 2.8173328721774796e-05, + "loss": 5.6488, + "step": 8836 + }, + { + "epoch": 0.44, + "grad_norm": 1.8273881673812866, + "learning_rate": 2.8163446810613175e-05, + "loss": 5.6201, + "step": 8840 + }, + { + "epoch": 0.44, + "grad_norm": 1.8046854734420776, + "learning_rate": 2.8153564899451557e-05, + "loss": 5.6918, + "step": 8844 + }, + { + "epoch": 0.44, + "grad_norm": 2.0153470039367676, + "learning_rate": 2.814368298828994e-05, + "loss": 5.567, + "step": 8848 + }, + { + "epoch": 0.44, + "grad_norm": 1.6771526336669922, + "learning_rate": 2.8133801077128317e-05, + "loss": 5.5549, + "step": 8852 + }, + { + "epoch": 0.44, + "grad_norm": 1.917395830154419, + "learning_rate": 2.81239191659667e-05, + "loss": 5.564, + "step": 8856 + }, + { + "epoch": 0.44, + "grad_norm": 1.7779862880706787, + "learning_rate": 2.811403725480508e-05, + "loss": 5.6795, + "step": 8860 + }, + { + "epoch": 0.44, + "grad_norm": 1.9387528896331787, + "learning_rate": 2.8104155343643464e-05, + "loss": 5.5145, + "step": 8864 + }, + { + "epoch": 0.44, + "grad_norm": 2.0087499618530273, + "learning_rate": 2.8094273432481842e-05, + "loss": 5.548, + "step": 8868 + }, + { + "epoch": 0.44, + "grad_norm": 2.05503249168396, + "learning_rate": 2.8084391521320224e-05, + "loss": 5.6076, + "step": 8872 + }, + { + "epoch": 0.44, + "grad_norm": 1.6153981685638428, + "learning_rate": 2.8074509610158606e-05, + "loss": 5.6318, + "step": 8876 + }, + { + "epoch": 0.44, + "grad_norm": 1.6110368967056274, + "learning_rate": 2.806462769899699e-05, + "loss": 5.596, + "step": 8880 + }, + { + "epoch": 0.44, + "grad_norm": 1.7750228643417358, + "learning_rate": 2.8054745787835367e-05, + "loss": 5.528, + "step": 8884 + }, + { + "epoch": 0.44, + "grad_norm": 2.0455288887023926, + "learning_rate": 2.804486387667375e-05, + "loss": 5.4797, + "step": 8888 + }, + { + "epoch": 0.44, + "grad_norm": 1.8901207447052002, + "learning_rate": 2.803498196551213e-05, + "loss": 5.6263, + "step": 8892 + }, + { + "epoch": 0.44, + "grad_norm": 1.78607177734375, + "learning_rate": 2.8025100054350513e-05, + "loss": 5.6771, + "step": 8896 + }, + { + "epoch": 0.44, + "grad_norm": 1.7866969108581543, + "learning_rate": 2.8015218143188892e-05, + "loss": 5.6625, + "step": 8900 + }, + { + "epoch": 0.44, + "grad_norm": 1.713319182395935, + "learning_rate": 2.8005336232027274e-05, + "loss": 5.5529, + "step": 8904 + }, + { + "epoch": 0.44, + "grad_norm": 1.7033851146697998, + "learning_rate": 2.7995454320865656e-05, + "loss": 5.6241, + "step": 8908 + }, + { + "epoch": 0.44, + "grad_norm": 1.626865029335022, + "learning_rate": 2.7985572409704035e-05, + "loss": 5.623, + "step": 8912 + }, + { + "epoch": 0.44, + "grad_norm": 1.8610364198684692, + "learning_rate": 2.7975690498542417e-05, + "loss": 5.5305, + "step": 8916 + }, + { + "epoch": 0.44, + "grad_norm": 2.474336624145508, + "learning_rate": 2.79658085873808e-05, + "loss": 5.5765, + "step": 8920 + }, + { + "epoch": 0.44, + "grad_norm": 2.0331220626831055, + "learning_rate": 2.795592667621918e-05, + "loss": 5.5884, + "step": 8924 + }, + { + "epoch": 0.44, + "grad_norm": 1.7288097143173218, + "learning_rate": 2.794604476505756e-05, + "loss": 5.5913, + "step": 8928 + }, + { + "epoch": 0.44, + "grad_norm": 1.5526161193847656, + "learning_rate": 2.793616285389595e-05, + "loss": 5.5849, + "step": 8932 + }, + { + "epoch": 0.44, + "grad_norm": 1.636829137802124, + "learning_rate": 2.7926280942734327e-05, + "loss": 5.5373, + "step": 8936 + }, + { + "epoch": 0.44, + "grad_norm": 1.8936233520507812, + "learning_rate": 2.791639903157271e-05, + "loss": 5.7408, + "step": 8940 + }, + { + "epoch": 0.44, + "grad_norm": 1.8193585872650146, + "learning_rate": 2.790651712041109e-05, + "loss": 5.6536, + "step": 8944 + }, + { + "epoch": 0.44, + "grad_norm": 2.0485775470733643, + "learning_rate": 2.7896635209249473e-05, + "loss": 5.6758, + "step": 8948 + }, + { + "epoch": 0.44, + "grad_norm": 1.8670713901519775, + "learning_rate": 2.7886753298087852e-05, + "loss": 5.5534, + "step": 8952 + }, + { + "epoch": 0.44, + "grad_norm": 1.8386154174804688, + "learning_rate": 2.7876871386926234e-05, + "loss": 5.6744, + "step": 8956 + }, + { + "epoch": 0.44, + "grad_norm": 2.060434103012085, + "learning_rate": 2.7866989475764616e-05, + "loss": 5.6095, + "step": 8960 + }, + { + "epoch": 0.44, + "grad_norm": 1.5825614929199219, + "learning_rate": 2.7857107564602998e-05, + "loss": 5.5813, + "step": 8964 + }, + { + "epoch": 0.44, + "grad_norm": 1.6022027730941772, + "learning_rate": 2.7847225653441377e-05, + "loss": 5.5509, + "step": 8968 + }, + { + "epoch": 0.44, + "grad_norm": 1.892376184463501, + "learning_rate": 2.783734374227976e-05, + "loss": 5.5502, + "step": 8972 + }, + { + "epoch": 0.44, + "grad_norm": 1.8178150653839111, + "learning_rate": 2.782746183111814e-05, + "loss": 5.5913, + "step": 8976 + }, + { + "epoch": 0.44, + "grad_norm": 1.899789571762085, + "learning_rate": 2.7817579919956523e-05, + "loss": 5.6232, + "step": 8980 + }, + { + "epoch": 0.44, + "grad_norm": 1.5999733209609985, + "learning_rate": 2.78076980087949e-05, + "loss": 5.5976, + "step": 8984 + }, + { + "epoch": 0.44, + "grad_norm": 1.885884404182434, + "learning_rate": 2.7797816097633284e-05, + "loss": 5.5364, + "step": 8988 + }, + { + "epoch": 0.44, + "grad_norm": 1.7063078880310059, + "learning_rate": 2.7787934186471666e-05, + "loss": 5.5669, + "step": 8992 + }, + { + "epoch": 0.44, + "grad_norm": 1.8398665189743042, + "learning_rate": 2.7778052275310044e-05, + "loss": 5.5482, + "step": 8996 + }, + { + "epoch": 0.44, + "grad_norm": 1.7803176641464233, + "learning_rate": 2.7768170364148426e-05, + "loss": 5.6153, + "step": 9000 + }, + { + "epoch": 0.44, + "grad_norm": 1.9626309871673584, + "learning_rate": 2.775828845298681e-05, + "loss": 5.5867, + "step": 9004 + }, + { + "epoch": 0.45, + "grad_norm": 2.0748071670532227, + "learning_rate": 2.774840654182519e-05, + "loss": 5.5583, + "step": 9008 + }, + { + "epoch": 0.45, + "grad_norm": 1.8239414691925049, + "learning_rate": 2.773852463066357e-05, + "loss": 5.5866, + "step": 9012 + }, + { + "epoch": 0.45, + "grad_norm": 1.6811037063598633, + "learning_rate": 2.772864271950195e-05, + "loss": 5.4177, + "step": 9016 + }, + { + "epoch": 0.45, + "grad_norm": 2.1083476543426514, + "learning_rate": 2.7718760808340333e-05, + "loss": 5.5789, + "step": 9020 + }, + { + "epoch": 0.45, + "grad_norm": 1.621522068977356, + "learning_rate": 2.7708878897178715e-05, + "loss": 5.5616, + "step": 9024 + }, + { + "epoch": 0.45, + "grad_norm": 2.0952842235565186, + "learning_rate": 2.7698996986017094e-05, + "loss": 5.5983, + "step": 9028 + }, + { + "epoch": 0.45, + "grad_norm": 1.8348654508590698, + "learning_rate": 2.7689115074855476e-05, + "loss": 5.4559, + "step": 9032 + }, + { + "epoch": 0.45, + "grad_norm": 2.1072843074798584, + "learning_rate": 2.7679233163693858e-05, + "loss": 5.6188, + "step": 9036 + }, + { + "epoch": 0.45, + "grad_norm": 1.5936877727508545, + "learning_rate": 2.7669351252532237e-05, + "loss": 5.5372, + "step": 9040 + }, + { + "epoch": 0.45, + "grad_norm": 1.8489831686019897, + "learning_rate": 2.7659469341370626e-05, + "loss": 5.6375, + "step": 9044 + }, + { + "epoch": 0.45, + "grad_norm": 2.1420514583587646, + "learning_rate": 2.7649587430209008e-05, + "loss": 5.5546, + "step": 9048 + }, + { + "epoch": 0.45, + "grad_norm": 1.9371378421783447, + "learning_rate": 2.7639705519047386e-05, + "loss": 5.4121, + "step": 9052 + }, + { + "epoch": 0.45, + "grad_norm": 2.1076085567474365, + "learning_rate": 2.762982360788577e-05, + "loss": 5.6349, + "step": 9056 + }, + { + "epoch": 0.45, + "grad_norm": 2.1002743244171143, + "learning_rate": 2.761994169672415e-05, + "loss": 5.7135, + "step": 9060 + }, + { + "epoch": 0.45, + "grad_norm": 1.5791290998458862, + "learning_rate": 2.761005978556253e-05, + "loss": 5.5199, + "step": 9064 + }, + { + "epoch": 0.45, + "grad_norm": 1.9685577154159546, + "learning_rate": 2.760017787440091e-05, + "loss": 5.6298, + "step": 9068 + }, + { + "epoch": 0.45, + "grad_norm": 1.6860204935073853, + "learning_rate": 2.7590295963239293e-05, + "loss": 5.67, + "step": 9072 + }, + { + "epoch": 0.45, + "grad_norm": 2.2700107097625732, + "learning_rate": 2.7580414052077675e-05, + "loss": 5.661, + "step": 9076 + }, + { + "epoch": 0.45, + "grad_norm": 1.9949781894683838, + "learning_rate": 2.7570532140916054e-05, + "loss": 5.6058, + "step": 9080 + }, + { + "epoch": 0.45, + "grad_norm": 1.8357362747192383, + "learning_rate": 2.7560650229754436e-05, + "loss": 5.6563, + "step": 9084 + }, + { + "epoch": 0.45, + "grad_norm": 2.157716751098633, + "learning_rate": 2.7550768318592818e-05, + "loss": 5.5362, + "step": 9088 + }, + { + "epoch": 0.45, + "grad_norm": 2.0846590995788574, + "learning_rate": 2.75408864074312e-05, + "loss": 5.4562, + "step": 9092 + }, + { + "epoch": 0.45, + "grad_norm": 1.8942608833312988, + "learning_rate": 2.753100449626958e-05, + "loss": 5.6858, + "step": 9096 + }, + { + "epoch": 0.45, + "grad_norm": 1.882866621017456, + "learning_rate": 2.752112258510796e-05, + "loss": 5.5599, + "step": 9100 + }, + { + "epoch": 0.45, + "grad_norm": 2.1065609455108643, + "learning_rate": 2.7511240673946343e-05, + "loss": 5.6257, + "step": 9104 + }, + { + "epoch": 0.45, + "grad_norm": 1.8394254446029663, + "learning_rate": 2.7501358762784725e-05, + "loss": 5.548, + "step": 9108 + }, + { + "epoch": 0.45, + "grad_norm": 1.759247899055481, + "learning_rate": 2.7491476851623104e-05, + "loss": 5.5753, + "step": 9112 + }, + { + "epoch": 0.45, + "grad_norm": 1.773542881011963, + "learning_rate": 2.7481594940461486e-05, + "loss": 5.6327, + "step": 9116 + }, + { + "epoch": 0.45, + "grad_norm": 2.0030620098114014, + "learning_rate": 2.7471713029299868e-05, + "loss": 5.5694, + "step": 9120 + }, + { + "epoch": 0.45, + "grad_norm": 1.7507848739624023, + "learning_rate": 2.7461831118138247e-05, + "loss": 5.6386, + "step": 9124 + }, + { + "epoch": 0.45, + "grad_norm": 1.7345106601715088, + "learning_rate": 2.745194920697663e-05, + "loss": 5.5398, + "step": 9128 + }, + { + "epoch": 0.45, + "grad_norm": 2.1354246139526367, + "learning_rate": 2.744206729581501e-05, + "loss": 5.5165, + "step": 9132 + }, + { + "epoch": 0.45, + "grad_norm": 1.6289561986923218, + "learning_rate": 2.7432185384653393e-05, + "loss": 5.6551, + "step": 9136 + }, + { + "epoch": 0.45, + "grad_norm": 1.850968599319458, + "learning_rate": 2.742230347349177e-05, + "loss": 5.5557, + "step": 9140 + }, + { + "epoch": 0.45, + "grad_norm": 1.6815087795257568, + "learning_rate": 2.7412421562330154e-05, + "loss": 5.5728, + "step": 9144 + }, + { + "epoch": 0.45, + "grad_norm": 1.868223786354065, + "learning_rate": 2.7402539651168536e-05, + "loss": 5.5449, + "step": 9148 + }, + { + "epoch": 0.45, + "grad_norm": 1.6773638725280762, + "learning_rate": 2.7392657740006918e-05, + "loss": 5.588, + "step": 9152 + }, + { + "epoch": 0.45, + "grad_norm": 1.8860238790512085, + "learning_rate": 2.7382775828845296e-05, + "loss": 5.537, + "step": 9156 + }, + { + "epoch": 0.45, + "grad_norm": 1.8454649448394775, + "learning_rate": 2.7372893917683685e-05, + "loss": 5.5351, + "step": 9160 + }, + { + "epoch": 0.45, + "grad_norm": 1.8175948858261108, + "learning_rate": 2.7363012006522064e-05, + "loss": 5.5808, + "step": 9164 + }, + { + "epoch": 0.45, + "grad_norm": 1.8350155353546143, + "learning_rate": 2.7353130095360446e-05, + "loss": 5.5217, + "step": 9168 + }, + { + "epoch": 0.45, + "grad_norm": 1.6785465478897095, + "learning_rate": 2.7343248184198828e-05, + "loss": 5.5363, + "step": 9172 + }, + { + "epoch": 0.45, + "grad_norm": 2.043757200241089, + "learning_rate": 2.733336627303721e-05, + "loss": 5.3924, + "step": 9176 + }, + { + "epoch": 0.45, + "grad_norm": 1.7272677421569824, + "learning_rate": 2.732348436187559e-05, + "loss": 5.5548, + "step": 9180 + }, + { + "epoch": 0.45, + "grad_norm": 1.7841123342514038, + "learning_rate": 2.731360245071397e-05, + "loss": 5.5787, + "step": 9184 + }, + { + "epoch": 0.45, + "grad_norm": 2.2284867763519287, + "learning_rate": 2.7303720539552353e-05, + "loss": 5.6589, + "step": 9188 + }, + { + "epoch": 0.45, + "grad_norm": 1.8492909669876099, + "learning_rate": 2.7293838628390735e-05, + "loss": 5.6947, + "step": 9192 + }, + { + "epoch": 0.45, + "grad_norm": 1.870833396911621, + "learning_rate": 2.7283956717229114e-05, + "loss": 5.5846, + "step": 9196 + }, + { + "epoch": 0.45, + "grad_norm": 2.007213592529297, + "learning_rate": 2.7274074806067496e-05, + "loss": 5.4946, + "step": 9200 + }, + { + "epoch": 0.45, + "grad_norm": 2.178415536880493, + "learning_rate": 2.7264192894905878e-05, + "loss": 5.6511, + "step": 9204 + }, + { + "epoch": 0.45, + "grad_norm": 1.777798056602478, + "learning_rate": 2.7254310983744256e-05, + "loss": 5.5872, + "step": 9208 + }, + { + "epoch": 0.46, + "grad_norm": 2.0451252460479736, + "learning_rate": 2.724442907258264e-05, + "loss": 5.5361, + "step": 9212 + }, + { + "epoch": 0.46, + "grad_norm": 1.672156572341919, + "learning_rate": 2.723454716142102e-05, + "loss": 5.5812, + "step": 9216 + }, + { + "epoch": 0.46, + "grad_norm": 1.7018766403198242, + "learning_rate": 2.7224665250259402e-05, + "loss": 5.5854, + "step": 9220 + }, + { + "epoch": 0.46, + "grad_norm": 1.8152602910995483, + "learning_rate": 2.721478333909778e-05, + "loss": 5.5784, + "step": 9224 + }, + { + "epoch": 0.46, + "grad_norm": 1.722531795501709, + "learning_rate": 2.7204901427936163e-05, + "loss": 5.5214, + "step": 9228 + }, + { + "epoch": 0.46, + "grad_norm": 1.9717679023742676, + "learning_rate": 2.7195019516774545e-05, + "loss": 5.5436, + "step": 9232 + }, + { + "epoch": 0.46, + "grad_norm": 1.6937377452850342, + "learning_rate": 2.7185137605612927e-05, + "loss": 5.5609, + "step": 9236 + }, + { + "epoch": 0.46, + "grad_norm": 1.6418997049331665, + "learning_rate": 2.7175255694451306e-05, + "loss": 5.5576, + "step": 9240 + }, + { + "epoch": 0.46, + "grad_norm": 1.9024310111999512, + "learning_rate": 2.7165373783289688e-05, + "loss": 5.6251, + "step": 9244 + }, + { + "epoch": 0.46, + "grad_norm": 1.9837607145309448, + "learning_rate": 2.715549187212807e-05, + "loss": 5.6149, + "step": 9248 + }, + { + "epoch": 0.46, + "grad_norm": 1.5828512907028198, + "learning_rate": 2.714560996096645e-05, + "loss": 5.4367, + "step": 9252 + }, + { + "epoch": 0.46, + "grad_norm": 2.0914130210876465, + "learning_rate": 2.713572804980483e-05, + "loss": 5.632, + "step": 9256 + }, + { + "epoch": 0.46, + "grad_norm": 2.2636313438415527, + "learning_rate": 2.7125846138643213e-05, + "loss": 5.6487, + "step": 9260 + }, + { + "epoch": 0.46, + "grad_norm": 2.0330276489257812, + "learning_rate": 2.7115964227481595e-05, + "loss": 5.5618, + "step": 9264 + }, + { + "epoch": 0.46, + "grad_norm": 1.7501007318496704, + "learning_rate": 2.7106082316319974e-05, + "loss": 5.4962, + "step": 9268 + }, + { + "epoch": 0.46, + "grad_norm": 2.0401079654693604, + "learning_rate": 2.7096200405158356e-05, + "loss": 5.5273, + "step": 9272 + }, + { + "epoch": 0.46, + "grad_norm": 1.885672688484192, + "learning_rate": 2.7086318493996745e-05, + "loss": 5.5909, + "step": 9276 + }, + { + "epoch": 0.46, + "grad_norm": 1.8399213552474976, + "learning_rate": 2.7076436582835123e-05, + "loss": 5.5688, + "step": 9280 + }, + { + "epoch": 0.46, + "grad_norm": 1.6890549659729004, + "learning_rate": 2.7066554671673505e-05, + "loss": 5.5704, + "step": 9284 + }, + { + "epoch": 0.46, + "grad_norm": 2.0120296478271484, + "learning_rate": 2.7056672760511887e-05, + "loss": 5.6876, + "step": 9288 + }, + { + "epoch": 0.46, + "grad_norm": 1.8612735271453857, + "learning_rate": 2.7046790849350266e-05, + "loss": 5.5703, + "step": 9292 + }, + { + "epoch": 0.46, + "grad_norm": 1.8374227285385132, + "learning_rate": 2.7036908938188648e-05, + "loss": 5.4759, + "step": 9296 + }, + { + "epoch": 0.46, + "grad_norm": 1.8448069095611572, + "learning_rate": 2.702702702702703e-05, + "loss": 5.5785, + "step": 9300 + }, + { + "epoch": 0.46, + "grad_norm": 1.6031235456466675, + "learning_rate": 2.7017145115865412e-05, + "loss": 5.5566, + "step": 9304 + }, + { + "epoch": 0.46, + "grad_norm": 1.8467074632644653, + "learning_rate": 2.700726320470379e-05, + "loss": 5.6011, + "step": 9308 + }, + { + "epoch": 0.46, + "grad_norm": 2.152393102645874, + "learning_rate": 2.6997381293542173e-05, + "loss": 5.4838, + "step": 9312 + }, + { + "epoch": 0.46, + "grad_norm": 1.6905425786972046, + "learning_rate": 2.6987499382380555e-05, + "loss": 5.6356, + "step": 9316 + }, + { + "epoch": 0.46, + "grad_norm": 2.052292823791504, + "learning_rate": 2.6977617471218937e-05, + "loss": 5.5832, + "step": 9320 + }, + { + "epoch": 0.46, + "grad_norm": 1.9417749643325806, + "learning_rate": 2.6967735560057316e-05, + "loss": 5.6308, + "step": 9324 + }, + { + "epoch": 0.46, + "grad_norm": 1.7844290733337402, + "learning_rate": 2.6957853648895698e-05, + "loss": 5.5116, + "step": 9328 + }, + { + "epoch": 0.46, + "grad_norm": 1.945826768875122, + "learning_rate": 2.694797173773408e-05, + "loss": 5.5016, + "step": 9332 + }, + { + "epoch": 0.46, + "grad_norm": 2.079559087753296, + "learning_rate": 2.693808982657246e-05, + "loss": 5.631, + "step": 9336 + }, + { + "epoch": 0.46, + "grad_norm": 1.7609906196594238, + "learning_rate": 2.692820791541084e-05, + "loss": 5.6115, + "step": 9340 + }, + { + "epoch": 0.46, + "grad_norm": 2.221853733062744, + "learning_rate": 2.6918326004249223e-05, + "loss": 5.6196, + "step": 9344 + }, + { + "epoch": 0.46, + "grad_norm": 1.8510643243789673, + "learning_rate": 2.6908444093087605e-05, + "loss": 5.4623, + "step": 9348 + }, + { + "epoch": 0.46, + "grad_norm": 2.1202962398529053, + "learning_rate": 2.6898562181925983e-05, + "loss": 5.6011, + "step": 9352 + }, + { + "epoch": 0.46, + "grad_norm": 1.9215221405029297, + "learning_rate": 2.6888680270764365e-05, + "loss": 5.6439, + "step": 9356 + }, + { + "epoch": 0.46, + "grad_norm": 1.8922703266143799, + "learning_rate": 2.6878798359602748e-05, + "loss": 5.6081, + "step": 9360 + }, + { + "epoch": 0.46, + "grad_norm": 2.219496726989746, + "learning_rate": 2.686891644844113e-05, + "loss": 5.7311, + "step": 9364 + }, + { + "epoch": 0.46, + "grad_norm": 1.926804542541504, + "learning_rate": 2.6859034537279508e-05, + "loss": 5.5689, + "step": 9368 + }, + { + "epoch": 0.46, + "grad_norm": 2.2841477394104004, + "learning_rate": 2.684915262611789e-05, + "loss": 5.6086, + "step": 9372 + }, + { + "epoch": 0.46, + "grad_norm": 1.9708247184753418, + "learning_rate": 2.6839270714956272e-05, + "loss": 5.6008, + "step": 9376 + }, + { + "epoch": 0.46, + "grad_norm": 1.7928872108459473, + "learning_rate": 2.6829388803794654e-05, + "loss": 5.6374, + "step": 9380 + }, + { + "epoch": 0.46, + "grad_norm": 1.7741127014160156, + "learning_rate": 2.6819506892633033e-05, + "loss": 5.611, + "step": 9384 + }, + { + "epoch": 0.46, + "grad_norm": 1.63377845287323, + "learning_rate": 2.6809624981471415e-05, + "loss": 5.6611, + "step": 9388 + }, + { + "epoch": 0.46, + "grad_norm": 1.8570857048034668, + "learning_rate": 2.67997430703098e-05, + "loss": 5.5389, + "step": 9392 + }, + { + "epoch": 0.46, + "grad_norm": 1.8265759944915771, + "learning_rate": 2.6789861159148183e-05, + "loss": 5.6445, + "step": 9396 + }, + { + "epoch": 0.46, + "grad_norm": 1.8934123516082764, + "learning_rate": 2.6779979247986565e-05, + "loss": 5.6222, + "step": 9400 + }, + { + "epoch": 0.46, + "grad_norm": 1.8162897825241089, + "learning_rate": 2.6770097336824947e-05, + "loss": 5.474, + "step": 9404 + }, + { + "epoch": 0.46, + "grad_norm": 2.1060986518859863, + "learning_rate": 2.6760215425663325e-05, + "loss": 5.6903, + "step": 9408 + }, + { + "epoch": 0.47, + "grad_norm": 1.9316869974136353, + "learning_rate": 2.6750333514501708e-05, + "loss": 5.4954, + "step": 9412 + }, + { + "epoch": 0.47, + "grad_norm": 1.7672759294509888, + "learning_rate": 2.674045160334009e-05, + "loss": 5.604, + "step": 9416 + }, + { + "epoch": 0.47, + "grad_norm": 1.6622782945632935, + "learning_rate": 2.6730569692178468e-05, + "loss": 5.5769, + "step": 9420 + }, + { + "epoch": 0.47, + "grad_norm": 1.6604558229446411, + "learning_rate": 2.672068778101685e-05, + "loss": 5.5286, + "step": 9424 + }, + { + "epoch": 0.47, + "grad_norm": 1.779937744140625, + "learning_rate": 2.6710805869855232e-05, + "loss": 5.6161, + "step": 9428 + }, + { + "epoch": 0.47, + "grad_norm": 1.9902960062026978, + "learning_rate": 2.6700923958693614e-05, + "loss": 5.6306, + "step": 9432 + }, + { + "epoch": 0.47, + "grad_norm": 2.084911823272705, + "learning_rate": 2.6691042047531993e-05, + "loss": 5.5941, + "step": 9436 + }, + { + "epoch": 0.47, + "grad_norm": 1.6512478590011597, + "learning_rate": 2.6681160136370375e-05, + "loss": 5.5051, + "step": 9440 + }, + { + "epoch": 0.47, + "grad_norm": 2.100574016571045, + "learning_rate": 2.6671278225208757e-05, + "loss": 5.5, + "step": 9444 + }, + { + "epoch": 0.47, + "grad_norm": 1.922301173210144, + "learning_rate": 2.666139631404714e-05, + "loss": 5.7458, + "step": 9448 + }, + { + "epoch": 0.47, + "grad_norm": 2.0800986289978027, + "learning_rate": 2.6651514402885518e-05, + "loss": 5.4861, + "step": 9452 + }, + { + "epoch": 0.47, + "grad_norm": 1.9604395627975464, + "learning_rate": 2.66416324917239e-05, + "loss": 5.6116, + "step": 9456 + }, + { + "epoch": 0.47, + "grad_norm": 1.9513726234436035, + "learning_rate": 2.6631750580562282e-05, + "loss": 5.6, + "step": 9460 + }, + { + "epoch": 0.47, + "grad_norm": 1.6587291955947876, + "learning_rate": 2.6621868669400664e-05, + "loss": 5.5058, + "step": 9464 + }, + { + "epoch": 0.47, + "grad_norm": 1.724055290222168, + "learning_rate": 2.6611986758239043e-05, + "loss": 5.6968, + "step": 9468 + }, + { + "epoch": 0.47, + "grad_norm": 1.8488603830337524, + "learning_rate": 2.6602104847077425e-05, + "loss": 5.4925, + "step": 9472 + }, + { + "epoch": 0.47, + "grad_norm": 1.9515416622161865, + "learning_rate": 2.6592222935915807e-05, + "loss": 5.5739, + "step": 9476 + }, + { + "epoch": 0.47, + "grad_norm": 1.8610438108444214, + "learning_rate": 2.6582341024754186e-05, + "loss": 5.6003, + "step": 9480 + }, + { + "epoch": 0.47, + "grad_norm": 1.7559446096420288, + "learning_rate": 2.6572459113592568e-05, + "loss": 5.6684, + "step": 9484 + }, + { + "epoch": 0.47, + "grad_norm": 1.95654296875, + "learning_rate": 2.656257720243095e-05, + "loss": 5.5024, + "step": 9488 + }, + { + "epoch": 0.47, + "grad_norm": 1.9304943084716797, + "learning_rate": 2.6552695291269332e-05, + "loss": 5.5273, + "step": 9492 + }, + { + "epoch": 0.47, + "grad_norm": 1.9279637336730957, + "learning_rate": 2.654281338010771e-05, + "loss": 5.503, + "step": 9496 + }, + { + "epoch": 0.47, + "grad_norm": 1.975297212600708, + "learning_rate": 2.6532931468946093e-05, + "loss": 5.4805, + "step": 9500 + }, + { + "epoch": 0.47, + "grad_norm": 1.8157209157943726, + "learning_rate": 2.6523049557784478e-05, + "loss": 5.5161, + "step": 9504 + }, + { + "epoch": 0.47, + "grad_norm": 1.9559147357940674, + "learning_rate": 2.651316764662286e-05, + "loss": 5.5609, + "step": 9508 + }, + { + "epoch": 0.47, + "grad_norm": 2.0093133449554443, + "learning_rate": 2.6503285735461242e-05, + "loss": 5.5939, + "step": 9512 + }, + { + "epoch": 0.47, + "grad_norm": 1.8078559637069702, + "learning_rate": 2.6493403824299624e-05, + "loss": 5.4802, + "step": 9516 + }, + { + "epoch": 0.47, + "grad_norm": 1.8105264902114868, + "learning_rate": 2.6483521913138003e-05, + "loss": 5.6028, + "step": 9520 + }, + { + "epoch": 0.47, + "grad_norm": 2.1344854831695557, + "learning_rate": 2.6473640001976385e-05, + "loss": 5.6429, + "step": 9524 + }, + { + "epoch": 0.47, + "grad_norm": 1.7309706211090088, + "learning_rate": 2.6463758090814767e-05, + "loss": 5.3914, + "step": 9528 + }, + { + "epoch": 0.47, + "grad_norm": 2.0924670696258545, + "learning_rate": 2.645387617965315e-05, + "loss": 5.5516, + "step": 9532 + }, + { + "epoch": 0.47, + "grad_norm": 2.076195001602173, + "learning_rate": 2.6443994268491528e-05, + "loss": 5.5816, + "step": 9536 + }, + { + "epoch": 0.47, + "grad_norm": 2.286782741546631, + "learning_rate": 2.643411235732991e-05, + "loss": 5.633, + "step": 9540 + }, + { + "epoch": 0.47, + "grad_norm": 2.0521233081817627, + "learning_rate": 2.6424230446168292e-05, + "loss": 5.5289, + "step": 9544 + }, + { + "epoch": 0.47, + "grad_norm": 1.7153042554855347, + "learning_rate": 2.6414348535006674e-05, + "loss": 5.5659, + "step": 9548 + }, + { + "epoch": 0.47, + "grad_norm": 1.8992023468017578, + "learning_rate": 2.6404466623845053e-05, + "loss": 5.5774, + "step": 9552 + }, + { + "epoch": 0.47, + "grad_norm": 2.049445390701294, + "learning_rate": 2.6394584712683435e-05, + "loss": 5.5893, + "step": 9556 + }, + { + "epoch": 0.47, + "grad_norm": 1.7689001560211182, + "learning_rate": 2.6384702801521817e-05, + "loss": 5.4599, + "step": 9560 + }, + { + "epoch": 0.47, + "grad_norm": 1.7153732776641846, + "learning_rate": 2.6374820890360195e-05, + "loss": 5.4882, + "step": 9564 + }, + { + "epoch": 0.47, + "grad_norm": 1.8958677053451538, + "learning_rate": 2.6364938979198577e-05, + "loss": 5.4936, + "step": 9568 + }, + { + "epoch": 0.47, + "grad_norm": 1.7627476453781128, + "learning_rate": 2.635505706803696e-05, + "loss": 5.4682, + "step": 9572 + }, + { + "epoch": 0.47, + "grad_norm": 1.768936276435852, + "learning_rate": 2.634517515687534e-05, + "loss": 5.5129, + "step": 9576 + }, + { + "epoch": 0.47, + "grad_norm": 1.9146132469177246, + "learning_rate": 2.633529324571372e-05, + "loss": 5.7007, + "step": 9580 + }, + { + "epoch": 0.47, + "grad_norm": 1.9375848770141602, + "learning_rate": 2.6325411334552102e-05, + "loss": 5.6438, + "step": 9584 + }, + { + "epoch": 0.47, + "grad_norm": 1.8867487907409668, + "learning_rate": 2.6315529423390484e-05, + "loss": 5.5024, + "step": 9588 + }, + { + "epoch": 0.47, + "grad_norm": 1.8224083185195923, + "learning_rate": 2.6305647512228866e-05, + "loss": 5.4406, + "step": 9592 + }, + { + "epoch": 0.47, + "grad_norm": 1.8230053186416626, + "learning_rate": 2.6295765601067245e-05, + "loss": 5.6339, + "step": 9596 + }, + { + "epoch": 0.47, + "grad_norm": 1.9611897468566895, + "learning_rate": 2.6285883689905627e-05, + "loss": 5.377, + "step": 9600 + }, + { + "epoch": 0.47, + "grad_norm": 1.842227816581726, + "learning_rate": 2.627600177874401e-05, + "loss": 5.61, + "step": 9604 + }, + { + "epoch": 0.47, + "grad_norm": 1.8939987421035767, + "learning_rate": 2.6266119867582388e-05, + "loss": 5.6035, + "step": 9608 + }, + { + "epoch": 0.47, + "grad_norm": 1.800186276435852, + "learning_rate": 2.625623795642077e-05, + "loss": 5.6362, + "step": 9612 + }, + { + "epoch": 0.48, + "grad_norm": 2.1012604236602783, + "learning_rate": 2.6246356045259152e-05, + "loss": 5.7597, + "step": 9616 + }, + { + "epoch": 0.48, + "grad_norm": 1.9128572940826416, + "learning_rate": 2.6236474134097537e-05, + "loss": 5.5882, + "step": 9620 + }, + { + "epoch": 0.48, + "grad_norm": 1.9831055402755737, + "learning_rate": 2.622659222293592e-05, + "loss": 5.6463, + "step": 9624 + }, + { + "epoch": 0.48, + "grad_norm": 1.7044267654418945, + "learning_rate": 2.62167103117743e-05, + "loss": 5.4153, + "step": 9628 + }, + { + "epoch": 0.48, + "grad_norm": 1.859174132347107, + "learning_rate": 2.620682840061268e-05, + "loss": 5.4704, + "step": 9632 + }, + { + "epoch": 0.48, + "grad_norm": 1.8365576267242432, + "learning_rate": 2.6196946489451062e-05, + "loss": 5.6485, + "step": 9636 + }, + { + "epoch": 0.48, + "grad_norm": 1.8253902196884155, + "learning_rate": 2.6187064578289444e-05, + "loss": 5.6777, + "step": 9640 + }, + { + "epoch": 0.48, + "grad_norm": 1.8828972578048706, + "learning_rate": 2.6177182667127826e-05, + "loss": 5.4909, + "step": 9644 + }, + { + "epoch": 0.48, + "grad_norm": 1.7956063747406006, + "learning_rate": 2.6167300755966205e-05, + "loss": 5.5966, + "step": 9648 + }, + { + "epoch": 0.48, + "grad_norm": 2.1321167945861816, + "learning_rate": 2.6157418844804587e-05, + "loss": 5.6254, + "step": 9652 + }, + { + "epoch": 0.48, + "grad_norm": 1.7192585468292236, + "learning_rate": 2.614753693364297e-05, + "loss": 5.4451, + "step": 9656 + }, + { + "epoch": 0.48, + "grad_norm": 1.9337345361709595, + "learning_rate": 2.613765502248135e-05, + "loss": 5.6618, + "step": 9660 + }, + { + "epoch": 0.48, + "grad_norm": 2.0022127628326416, + "learning_rate": 2.612777311131973e-05, + "loss": 5.5478, + "step": 9664 + }, + { + "epoch": 0.48, + "grad_norm": 1.6713944673538208, + "learning_rate": 2.6117891200158112e-05, + "loss": 5.5078, + "step": 9668 + }, + { + "epoch": 0.48, + "grad_norm": 1.902187466621399, + "learning_rate": 2.6108009288996494e-05, + "loss": 5.6425, + "step": 9672 + }, + { + "epoch": 0.48, + "grad_norm": 2.1489810943603516, + "learning_rate": 2.6098127377834876e-05, + "loss": 5.6363, + "step": 9676 + }, + { + "epoch": 0.48, + "grad_norm": 1.7200430631637573, + "learning_rate": 2.6088245466673255e-05, + "loss": 5.6002, + "step": 9680 + }, + { + "epoch": 0.48, + "grad_norm": 1.9988877773284912, + "learning_rate": 2.6078363555511637e-05, + "loss": 5.6169, + "step": 9684 + }, + { + "epoch": 0.48, + "grad_norm": 1.99684476852417, + "learning_rate": 2.606848164435002e-05, + "loss": 5.4686, + "step": 9688 + }, + { + "epoch": 0.48, + "grad_norm": 1.6895607709884644, + "learning_rate": 2.6058599733188398e-05, + "loss": 5.566, + "step": 9692 + }, + { + "epoch": 0.48, + "grad_norm": 1.6524453163146973, + "learning_rate": 2.604871782202678e-05, + "loss": 5.5477, + "step": 9696 + }, + { + "epoch": 0.48, + "grad_norm": 1.7908339500427246, + "learning_rate": 2.603883591086516e-05, + "loss": 5.7, + "step": 9700 + }, + { + "epoch": 0.48, + "grad_norm": 1.8683209419250488, + "learning_rate": 2.6028953999703544e-05, + "loss": 5.4894, + "step": 9704 + }, + { + "epoch": 0.48, + "grad_norm": 1.883863925933838, + "learning_rate": 2.6019072088541922e-05, + "loss": 5.6895, + "step": 9708 + }, + { + "epoch": 0.48, + "grad_norm": 1.8453763723373413, + "learning_rate": 2.6009190177380304e-05, + "loss": 5.4554, + "step": 9712 + }, + { + "epoch": 0.48, + "grad_norm": 1.7885375022888184, + "learning_rate": 2.5999308266218686e-05, + "loss": 5.6365, + "step": 9716 + }, + { + "epoch": 0.48, + "grad_norm": 1.9146047830581665, + "learning_rate": 2.598942635505707e-05, + "loss": 5.5714, + "step": 9720 + }, + { + "epoch": 0.48, + "grad_norm": 1.9622520208358765, + "learning_rate": 2.5979544443895447e-05, + "loss": 5.6337, + "step": 9724 + }, + { + "epoch": 0.48, + "grad_norm": 2.236665725708008, + "learning_rate": 2.596966253273383e-05, + "loss": 5.5048, + "step": 9728 + }, + { + "epoch": 0.48, + "grad_norm": 1.8453449010849, + "learning_rate": 2.595978062157221e-05, + "loss": 5.6236, + "step": 9732 + }, + { + "epoch": 0.48, + "grad_norm": 1.9595757722854614, + "learning_rate": 2.5949898710410597e-05, + "loss": 5.6084, + "step": 9736 + }, + { + "epoch": 0.48, + "grad_norm": 2.1944825649261475, + "learning_rate": 2.594001679924898e-05, + "loss": 5.4383, + "step": 9740 + }, + { + "epoch": 0.48, + "grad_norm": 1.821465015411377, + "learning_rate": 2.593013488808736e-05, + "loss": 5.4977, + "step": 9744 + }, + { + "epoch": 0.48, + "grad_norm": 1.7616369724273682, + "learning_rate": 2.592025297692574e-05, + "loss": 5.5552, + "step": 9748 + }, + { + "epoch": 0.48, + "grad_norm": 1.8618212938308716, + "learning_rate": 2.591037106576412e-05, + "loss": 5.6036, + "step": 9752 + }, + { + "epoch": 0.48, + "grad_norm": 2.120798110961914, + "learning_rate": 2.5900489154602504e-05, + "loss": 5.4679, + "step": 9756 + }, + { + "epoch": 0.48, + "grad_norm": 2.2172162532806396, + "learning_rate": 2.5890607243440886e-05, + "loss": 5.583, + "step": 9760 + }, + { + "epoch": 0.48, + "grad_norm": 1.9342153072357178, + "learning_rate": 2.5880725332279264e-05, + "loss": 5.5783, + "step": 9764 + }, + { + "epoch": 0.48, + "grad_norm": 1.6747381687164307, + "learning_rate": 2.5870843421117647e-05, + "loss": 5.6047, + "step": 9768 + }, + { + "epoch": 0.48, + "grad_norm": 2.0254557132720947, + "learning_rate": 2.586096150995603e-05, + "loss": 5.6368, + "step": 9772 + }, + { + "epoch": 0.48, + "grad_norm": 1.9324589967727661, + "learning_rate": 2.5851079598794407e-05, + "loss": 5.6284, + "step": 9776 + }, + { + "epoch": 0.48, + "grad_norm": 1.6605185270309448, + "learning_rate": 2.584119768763279e-05, + "loss": 5.5681, + "step": 9780 + }, + { + "epoch": 0.48, + "grad_norm": 1.6846225261688232, + "learning_rate": 2.583131577647117e-05, + "loss": 5.5159, + "step": 9784 + }, + { + "epoch": 0.48, + "grad_norm": 2.0729458332061768, + "learning_rate": 2.5821433865309553e-05, + "loss": 5.5701, + "step": 9788 + }, + { + "epoch": 0.48, + "grad_norm": 1.831186056137085, + "learning_rate": 2.5811551954147932e-05, + "loss": 5.5996, + "step": 9792 + }, + { + "epoch": 0.48, + "grad_norm": 1.744837999343872, + "learning_rate": 2.5801670042986314e-05, + "loss": 5.5798, + "step": 9796 + }, + { + "epoch": 0.48, + "grad_norm": 1.7464189529418945, + "learning_rate": 2.5791788131824696e-05, + "loss": 5.4147, + "step": 9800 + }, + { + "epoch": 0.48, + "grad_norm": 2.083294630050659, + "learning_rate": 2.5781906220663078e-05, + "loss": 5.5884, + "step": 9804 + }, + { + "epoch": 0.48, + "grad_norm": 1.848488211631775, + "learning_rate": 2.5772024309501457e-05, + "loss": 5.4192, + "step": 9808 + }, + { + "epoch": 0.48, + "grad_norm": 1.8457578420639038, + "learning_rate": 2.576214239833984e-05, + "loss": 5.6189, + "step": 9812 + }, + { + "epoch": 0.49, + "grad_norm": 1.9450173377990723, + "learning_rate": 2.575226048717822e-05, + "loss": 5.5665, + "step": 9816 + }, + { + "epoch": 0.49, + "grad_norm": 1.7913298606872559, + "learning_rate": 2.57423785760166e-05, + "loss": 5.5333, + "step": 9820 + }, + { + "epoch": 0.49, + "grad_norm": 1.8808374404907227, + "learning_rate": 2.5732496664854982e-05, + "loss": 5.4615, + "step": 9824 + }, + { + "epoch": 0.49, + "grad_norm": 2.1443545818328857, + "learning_rate": 2.5722614753693364e-05, + "loss": 5.599, + "step": 9828 + }, + { + "epoch": 0.49, + "grad_norm": 1.867958903312683, + "learning_rate": 2.5712732842531746e-05, + "loss": 5.6383, + "step": 9832 + }, + { + "epoch": 0.49, + "grad_norm": 1.9032071828842163, + "learning_rate": 2.5702850931370125e-05, + "loss": 5.6756, + "step": 9836 + }, + { + "epoch": 0.49, + "grad_norm": 2.0405712127685547, + "learning_rate": 2.5692969020208507e-05, + "loss": 5.5741, + "step": 9840 + }, + { + "epoch": 0.49, + "grad_norm": 1.9483815431594849, + "learning_rate": 2.568308710904689e-05, + "loss": 5.5603, + "step": 9844 + }, + { + "epoch": 0.49, + "grad_norm": 1.6286249160766602, + "learning_rate": 2.5673205197885274e-05, + "loss": 5.5213, + "step": 9848 + }, + { + "epoch": 0.49, + "grad_norm": 2.009946346282959, + "learning_rate": 2.5663323286723656e-05, + "loss": 5.6207, + "step": 9852 + }, + { + "epoch": 0.49, + "grad_norm": 1.7582067251205444, + "learning_rate": 2.5653441375562038e-05, + "loss": 5.6453, + "step": 9856 + }, + { + "epoch": 0.49, + "grad_norm": 1.7018730640411377, + "learning_rate": 2.5643559464400417e-05, + "loss": 5.6713, + "step": 9860 + }, + { + "epoch": 0.49, + "grad_norm": 1.920355200767517, + "learning_rate": 2.56336775532388e-05, + "loss": 5.6109, + "step": 9864 + }, + { + "epoch": 0.49, + "grad_norm": 1.79157555103302, + "learning_rate": 2.562379564207718e-05, + "loss": 5.5102, + "step": 9868 + }, + { + "epoch": 0.49, + "grad_norm": 1.7163673639297485, + "learning_rate": 2.5613913730915563e-05, + "loss": 5.6472, + "step": 9872 + }, + { + "epoch": 0.49, + "grad_norm": 1.8523849248886108, + "learning_rate": 2.5604031819753942e-05, + "loss": 5.6162, + "step": 9876 + }, + { + "epoch": 0.49, + "grad_norm": 2.006044626235962, + "learning_rate": 2.5594149908592324e-05, + "loss": 5.6793, + "step": 9880 + }, + { + "epoch": 0.49, + "grad_norm": 1.745415449142456, + "learning_rate": 2.558673847522111e-05, + "loss": 5.5417, + "step": 9884 + }, + { + "epoch": 0.49, + "grad_norm": 1.9379210472106934, + "learning_rate": 2.5576856564059492e-05, + "loss": 5.6047, + "step": 9888 + }, + { + "epoch": 0.49, + "grad_norm": 1.8513517379760742, + "learning_rate": 2.5566974652897875e-05, + "loss": 5.6515, + "step": 9892 + }, + { + "epoch": 0.49, + "grad_norm": 1.8587697744369507, + "learning_rate": 2.5557092741736253e-05, + "loss": 5.5598, + "step": 9896 + }, + { + "epoch": 0.49, + "grad_norm": 1.9905014038085938, + "learning_rate": 2.5547210830574635e-05, + "loss": 5.6137, + "step": 9900 + }, + { + "epoch": 0.49, + "grad_norm": 1.83003568649292, + "learning_rate": 2.5537328919413017e-05, + "loss": 5.6503, + "step": 9904 + }, + { + "epoch": 0.49, + "grad_norm": 1.71113121509552, + "learning_rate": 2.55274470082514e-05, + "loss": 5.561, + "step": 9908 + }, + { + "epoch": 0.49, + "grad_norm": 1.6225950717926025, + "learning_rate": 2.5517565097089778e-05, + "loss": 5.4564, + "step": 9912 + }, + { + "epoch": 0.49, + "grad_norm": 1.8159841299057007, + "learning_rate": 2.550768318592816e-05, + "loss": 5.5984, + "step": 9916 + }, + { + "epoch": 0.49, + "grad_norm": 2.003652572631836, + "learning_rate": 2.5497801274766542e-05, + "loss": 5.4597, + "step": 9920 + }, + { + "epoch": 0.49, + "grad_norm": 1.8188750743865967, + "learning_rate": 2.5487919363604924e-05, + "loss": 5.46, + "step": 9924 + }, + { + "epoch": 0.49, + "grad_norm": 1.832154631614685, + "learning_rate": 2.5478037452443303e-05, + "loss": 5.5748, + "step": 9928 + }, + { + "epoch": 0.49, + "grad_norm": 1.9353375434875488, + "learning_rate": 2.5468155541281685e-05, + "loss": 5.5672, + "step": 9932 + }, + { + "epoch": 0.49, + "grad_norm": 1.9940686225891113, + "learning_rate": 2.5458273630120067e-05, + "loss": 5.5623, + "step": 9936 + }, + { + "epoch": 0.49, + "grad_norm": 1.8885998725891113, + "learning_rate": 2.544839171895845e-05, + "loss": 5.6176, + "step": 9940 + }, + { + "epoch": 0.49, + "grad_norm": 1.8526244163513184, + "learning_rate": 2.5438509807796828e-05, + "loss": 5.6358, + "step": 9944 + }, + { + "epoch": 0.49, + "grad_norm": 1.8423300981521606, + "learning_rate": 2.542862789663521e-05, + "loss": 5.5085, + "step": 9948 + }, + { + "epoch": 0.49, + "grad_norm": 1.6946587562561035, + "learning_rate": 2.5418745985473592e-05, + "loss": 5.452, + "step": 9952 + }, + { + "epoch": 0.49, + "grad_norm": 1.8842428922653198, + "learning_rate": 2.540886407431197e-05, + "loss": 5.4547, + "step": 9956 + }, + { + "epoch": 0.49, + "grad_norm": 1.8062175512313843, + "learning_rate": 2.5398982163150353e-05, + "loss": 5.5232, + "step": 9960 + }, + { + "epoch": 0.49, + "grad_norm": 1.968763828277588, + "learning_rate": 2.5389100251988735e-05, + "loss": 5.6634, + "step": 9964 + }, + { + "epoch": 0.49, + "grad_norm": 1.782151460647583, + "learning_rate": 2.5379218340827117e-05, + "loss": 5.4746, + "step": 9968 + }, + { + "epoch": 0.49, + "grad_norm": 2.0645010471343994, + "learning_rate": 2.5369336429665495e-05, + "loss": 5.5755, + "step": 9972 + }, + { + "epoch": 0.49, + "grad_norm": 2.081439733505249, + "learning_rate": 2.5359454518503877e-05, + "loss": 5.5264, + "step": 9976 + }, + { + "epoch": 0.49, + "grad_norm": 1.9738670587539673, + "learning_rate": 2.534957260734226e-05, + "loss": 5.5369, + "step": 9980 + }, + { + "epoch": 0.49, + "grad_norm": 2.08581805229187, + "learning_rate": 2.533969069618064e-05, + "loss": 5.6849, + "step": 9984 + }, + { + "epoch": 0.49, + "grad_norm": 1.998647928237915, + "learning_rate": 2.532980878501902e-05, + "loss": 5.628, + "step": 9988 + }, + { + "epoch": 0.49, + "grad_norm": 1.9414081573486328, + "learning_rate": 2.5319926873857402e-05, + "loss": 5.5865, + "step": 9992 + }, + { + "epoch": 0.49, + "grad_norm": 1.7478708028793335, + "learning_rate": 2.5310044962695788e-05, + "loss": 5.4073, + "step": 9996 + }, + { + "epoch": 0.49, + "grad_norm": 2.031409740447998, + "learning_rate": 2.530016305153417e-05, + "loss": 5.6781, + "step": 10000 + }, + { + "epoch": 0.49, + "grad_norm": 1.8171534538269043, + "learning_rate": 2.5290281140372552e-05, + "loss": 5.6865, + "step": 10004 + }, + { + "epoch": 0.49, + "grad_norm": 2.128012180328369, + "learning_rate": 2.5280399229210934e-05, + "loss": 5.6194, + "step": 10008 + }, + { + "epoch": 0.49, + "grad_norm": 2.1675913333892822, + "learning_rate": 2.5270517318049313e-05, + "loss": 5.4726, + "step": 10012 + }, + { + "epoch": 0.49, + "grad_norm": 2.229329824447632, + "learning_rate": 2.5260635406887695e-05, + "loss": 5.564, + "step": 10016 + }, + { + "epoch": 0.5, + "grad_norm": 1.8510156869888306, + "learning_rate": 2.5250753495726077e-05, + "loss": 5.4116, + "step": 10020 + }, + { + "epoch": 0.5, + "grad_norm": 1.9130499362945557, + "learning_rate": 2.524087158456446e-05, + "loss": 5.5758, + "step": 10024 + }, + { + "epoch": 0.5, + "grad_norm": 2.073378562927246, + "learning_rate": 2.5230989673402837e-05, + "loss": 5.5724, + "step": 10028 + }, + { + "epoch": 0.5, + "grad_norm": 2.248762845993042, + "learning_rate": 2.522110776224122e-05, + "loss": 5.6893, + "step": 10032 + }, + { + "epoch": 0.5, + "grad_norm": 2.0180504322052, + "learning_rate": 2.52112258510796e-05, + "loss": 5.642, + "step": 10036 + }, + { + "epoch": 0.5, + "grad_norm": 1.786154866218567, + "learning_rate": 2.520134393991798e-05, + "loss": 5.6359, + "step": 10040 + }, + { + "epoch": 0.5, + "grad_norm": 1.8328882455825806, + "learning_rate": 2.5191462028756362e-05, + "loss": 5.5998, + "step": 10044 + }, + { + "epoch": 0.5, + "grad_norm": 2.1407294273376465, + "learning_rate": 2.5181580117594744e-05, + "loss": 5.7289, + "step": 10048 + }, + { + "epoch": 0.5, + "grad_norm": 1.6847882270812988, + "learning_rate": 2.5171698206433126e-05, + "loss": 5.6441, + "step": 10052 + }, + { + "epoch": 0.5, + "grad_norm": 1.7295290231704712, + "learning_rate": 2.5161816295271505e-05, + "loss": 5.5917, + "step": 10056 + }, + { + "epoch": 0.5, + "grad_norm": 2.069204807281494, + "learning_rate": 2.5151934384109887e-05, + "loss": 5.3987, + "step": 10060 + }, + { + "epoch": 0.5, + "grad_norm": 1.8884251117706299, + "learning_rate": 2.514205247294827e-05, + "loss": 5.5554, + "step": 10064 + }, + { + "epoch": 0.5, + "grad_norm": 1.767871618270874, + "learning_rate": 2.513217056178665e-05, + "loss": 5.6294, + "step": 10068 + }, + { + "epoch": 0.5, + "grad_norm": 1.8906821012496948, + "learning_rate": 2.512228865062503e-05, + "loss": 5.5464, + "step": 10072 + }, + { + "epoch": 0.5, + "grad_norm": 1.6515437364578247, + "learning_rate": 2.5112406739463412e-05, + "loss": 5.4848, + "step": 10076 + }, + { + "epoch": 0.5, + "grad_norm": 2.0448694229125977, + "learning_rate": 2.5102524828301794e-05, + "loss": 5.5193, + "step": 10080 + }, + { + "epoch": 0.5, + "grad_norm": 1.867287278175354, + "learning_rate": 2.5092642917140173e-05, + "loss": 5.5843, + "step": 10084 + }, + { + "epoch": 0.5, + "grad_norm": 1.833117961883545, + "learning_rate": 2.5082761005978555e-05, + "loss": 5.4566, + "step": 10088 + }, + { + "epoch": 0.5, + "grad_norm": 1.842322826385498, + "learning_rate": 2.5072879094816937e-05, + "loss": 5.5783, + "step": 10092 + }, + { + "epoch": 0.5, + "grad_norm": 1.7041923999786377, + "learning_rate": 2.506299718365532e-05, + "loss": 5.6291, + "step": 10096 + }, + { + "epoch": 0.5, + "grad_norm": 1.9583604335784912, + "learning_rate": 2.5053115272493698e-05, + "loss": 5.5434, + "step": 10100 + }, + { + "epoch": 0.5, + "grad_norm": 1.8357032537460327, + "learning_rate": 2.504323336133208e-05, + "loss": 5.5751, + "step": 10104 + }, + { + "epoch": 0.5, + "grad_norm": 1.9538915157318115, + "learning_rate": 2.503335145017047e-05, + "loss": 5.7138, + "step": 10108 + }, + { + "epoch": 0.5, + "grad_norm": 2.0937674045562744, + "learning_rate": 2.5023469539008847e-05, + "loss": 5.5237, + "step": 10112 + }, + { + "epoch": 0.5, + "grad_norm": 1.9279645681381226, + "learning_rate": 2.501358762784723e-05, + "loss": 5.6076, + "step": 10116 + }, + { + "epoch": 0.5, + "grad_norm": 1.8550045490264893, + "learning_rate": 2.500370571668561e-05, + "loss": 5.5574, + "step": 10120 + }, + { + "epoch": 0.5, + "grad_norm": 2.0821774005889893, + "learning_rate": 2.499382380552399e-05, + "loss": 5.532, + "step": 10124 + }, + { + "epoch": 0.5, + "grad_norm": 2.088787078857422, + "learning_rate": 2.498394189436237e-05, + "loss": 5.6185, + "step": 10128 + }, + { + "epoch": 0.5, + "grad_norm": 1.7520018815994263, + "learning_rate": 2.497405998320075e-05, + "loss": 5.5648, + "step": 10132 + }, + { + "epoch": 0.5, + "grad_norm": 1.963576078414917, + "learning_rate": 2.4964178072039136e-05, + "loss": 5.6652, + "step": 10136 + }, + { + "epoch": 0.5, + "grad_norm": 1.9875547885894775, + "learning_rate": 2.4954296160877515e-05, + "loss": 5.5179, + "step": 10140 + }, + { + "epoch": 0.5, + "grad_norm": 1.9994404315948486, + "learning_rate": 2.4944414249715897e-05, + "loss": 5.6485, + "step": 10144 + }, + { + "epoch": 0.5, + "grad_norm": 2.0803356170654297, + "learning_rate": 2.493453233855428e-05, + "loss": 5.4617, + "step": 10148 + }, + { + "epoch": 0.5, + "grad_norm": 1.6625994443893433, + "learning_rate": 2.492465042739266e-05, + "loss": 5.5363, + "step": 10152 + }, + { + "epoch": 0.5, + "grad_norm": 1.8753883838653564, + "learning_rate": 2.491476851623104e-05, + "loss": 5.3869, + "step": 10156 + }, + { + "epoch": 0.5, + "grad_norm": 1.644220232963562, + "learning_rate": 2.4904886605069422e-05, + "loss": 5.5971, + "step": 10160 + }, + { + "epoch": 0.5, + "grad_norm": 2.017249584197998, + "learning_rate": 2.4895004693907804e-05, + "loss": 5.506, + "step": 10164 + }, + { + "epoch": 0.5, + "grad_norm": 2.240140914916992, + "learning_rate": 2.4885122782746182e-05, + "loss": 5.6691, + "step": 10168 + }, + { + "epoch": 0.5, + "grad_norm": 1.842436671257019, + "learning_rate": 2.4875240871584565e-05, + "loss": 5.5673, + "step": 10172 + }, + { + "epoch": 0.5, + "grad_norm": 1.8828648328781128, + "learning_rate": 2.4865358960422947e-05, + "loss": 5.5275, + "step": 10176 + }, + { + "epoch": 0.5, + "grad_norm": 2.1830368041992188, + "learning_rate": 2.485547704926133e-05, + "loss": 5.6258, + "step": 10180 + }, + { + "epoch": 0.5, + "grad_norm": 2.012813091278076, + "learning_rate": 2.4845595138099707e-05, + "loss": 5.5947, + "step": 10184 + }, + { + "epoch": 0.5, + "grad_norm": 1.7969881296157837, + "learning_rate": 2.483571322693809e-05, + "loss": 5.564, + "step": 10188 + }, + { + "epoch": 0.5, + "grad_norm": 1.8949358463287354, + "learning_rate": 2.482583131577647e-05, + "loss": 5.6213, + "step": 10192 + }, + { + "epoch": 0.5, + "grad_norm": 1.8343627452850342, + "learning_rate": 2.4815949404614854e-05, + "loss": 5.5869, + "step": 10196 + }, + { + "epoch": 0.5, + "grad_norm": 1.8885095119476318, + "learning_rate": 2.4806067493453236e-05, + "loss": 5.586, + "step": 10200 + }, + { + "epoch": 0.5, + "grad_norm": 2.0102386474609375, + "learning_rate": 2.4796185582291618e-05, + "loss": 5.5219, + "step": 10204 + }, + { + "epoch": 0.5, + "grad_norm": 2.127244472503662, + "learning_rate": 2.478630367113e-05, + "loss": 5.6196, + "step": 10208 + }, + { + "epoch": 0.5, + "grad_norm": 1.7955610752105713, + "learning_rate": 2.477642175996838e-05, + "loss": 5.5848, + "step": 10212 + }, + { + "epoch": 0.5, + "grad_norm": 1.9604214429855347, + "learning_rate": 2.476653984880676e-05, + "loss": 5.6014, + "step": 10216 + }, + { + "epoch": 0.5, + "grad_norm": 2.11737322807312, + "learning_rate": 2.4756657937645142e-05, + "loss": 5.5377, + "step": 10220 + }, + { + "epoch": 0.51, + "grad_norm": 2.0900325775146484, + "learning_rate": 2.4746776026483525e-05, + "loss": 5.5901, + "step": 10224 + }, + { + "epoch": 0.51, + "grad_norm": 1.7925187349319458, + "learning_rate": 2.4736894115321903e-05, + "loss": 5.4441, + "step": 10228 + }, + { + "epoch": 0.51, + "grad_norm": 2.107675790786743, + "learning_rate": 2.4727012204160285e-05, + "loss": 5.5537, + "step": 10232 + }, + { + "epoch": 0.51, + "grad_norm": 2.0693657398223877, + "learning_rate": 2.4717130292998667e-05, + "loss": 5.5913, + "step": 10236 + }, + { + "epoch": 0.51, + "grad_norm": 2.158755302429199, + "learning_rate": 2.4707248381837046e-05, + "loss": 5.7313, + "step": 10240 + }, + { + "epoch": 0.51, + "grad_norm": 1.9274017810821533, + "learning_rate": 2.4697366470675428e-05, + "loss": 5.5108, + "step": 10244 + }, + { + "epoch": 0.51, + "grad_norm": 1.8750487565994263, + "learning_rate": 2.468748455951381e-05, + "loss": 5.5994, + "step": 10248 + }, + { + "epoch": 0.51, + "grad_norm": 1.6671373844146729, + "learning_rate": 2.4677602648352192e-05, + "loss": 5.6319, + "step": 10252 + }, + { + "epoch": 0.51, + "grad_norm": 1.721057415008545, + "learning_rate": 2.4667720737190574e-05, + "loss": 5.5567, + "step": 10256 + }, + { + "epoch": 0.51, + "grad_norm": 1.7472023963928223, + "learning_rate": 2.4657838826028956e-05, + "loss": 5.5035, + "step": 10260 + }, + { + "epoch": 0.51, + "grad_norm": 1.897161602973938, + "learning_rate": 2.464795691486734e-05, + "loss": 5.5944, + "step": 10264 + }, + { + "epoch": 0.51, + "grad_norm": 1.9217979907989502, + "learning_rate": 2.4638075003705717e-05, + "loss": 5.5253, + "step": 10268 + }, + { + "epoch": 0.51, + "grad_norm": 2.010005474090576, + "learning_rate": 2.46281930925441e-05, + "loss": 5.7227, + "step": 10272 + }, + { + "epoch": 0.51, + "grad_norm": 1.7467890977859497, + "learning_rate": 2.461831118138248e-05, + "loss": 5.5426, + "step": 10276 + }, + { + "epoch": 0.51, + "grad_norm": 1.9532781839370728, + "learning_rate": 2.4608429270220863e-05, + "loss": 5.5081, + "step": 10280 + }, + { + "epoch": 0.51, + "grad_norm": 1.979059100151062, + "learning_rate": 2.4598547359059242e-05, + "loss": 5.5655, + "step": 10284 + }, + { + "epoch": 0.51, + "grad_norm": 1.7593183517456055, + "learning_rate": 2.4588665447897624e-05, + "loss": 5.5235, + "step": 10288 + }, + { + "epoch": 0.51, + "grad_norm": 1.8140133619308472, + "learning_rate": 2.4578783536736006e-05, + "loss": 5.5346, + "step": 10292 + }, + { + "epoch": 0.51, + "grad_norm": 1.9852521419525146, + "learning_rate": 2.4568901625574388e-05, + "loss": 5.5365, + "step": 10296 + }, + { + "epoch": 0.51, + "grad_norm": 2.00048828125, + "learning_rate": 2.4559019714412767e-05, + "loss": 5.5772, + "step": 10300 + }, + { + "epoch": 0.51, + "grad_norm": 1.9229919910430908, + "learning_rate": 2.454913780325115e-05, + "loss": 5.4598, + "step": 10304 + }, + { + "epoch": 0.51, + "grad_norm": 1.9021358489990234, + "learning_rate": 2.453925589208953e-05, + "loss": 5.4991, + "step": 10308 + }, + { + "epoch": 0.51, + "grad_norm": 1.9008359909057617, + "learning_rate": 2.4529373980927913e-05, + "loss": 5.5531, + "step": 10312 + }, + { + "epoch": 0.51, + "grad_norm": 1.8930054903030396, + "learning_rate": 2.4519492069766295e-05, + "loss": 5.5356, + "step": 10316 + }, + { + "epoch": 0.51, + "grad_norm": 1.8766874074935913, + "learning_rate": 2.4509610158604677e-05, + "loss": 5.6213, + "step": 10320 + }, + { + "epoch": 0.51, + "grad_norm": 1.8489048480987549, + "learning_rate": 2.4499728247443056e-05, + "loss": 5.5647, + "step": 10324 + }, + { + "epoch": 0.51, + "grad_norm": 2.0344486236572266, + "learning_rate": 2.4489846336281438e-05, + "loss": 5.5734, + "step": 10328 + }, + { + "epoch": 0.51, + "grad_norm": 1.83133864402771, + "learning_rate": 2.447996442511982e-05, + "loss": 5.4536, + "step": 10332 + }, + { + "epoch": 0.51, + "grad_norm": 1.6086121797561646, + "learning_rate": 2.4470082513958202e-05, + "loss": 5.5275, + "step": 10336 + }, + { + "epoch": 0.51, + "grad_norm": 2.0024595260620117, + "learning_rate": 2.446020060279658e-05, + "loss": 5.5924, + "step": 10340 + }, + { + "epoch": 0.51, + "grad_norm": 1.9737377166748047, + "learning_rate": 2.4450318691634963e-05, + "loss": 5.5466, + "step": 10344 + }, + { + "epoch": 0.51, + "grad_norm": 1.9475603103637695, + "learning_rate": 2.4440436780473345e-05, + "loss": 5.5415, + "step": 10348 + }, + { + "epoch": 0.51, + "grad_norm": 2.1475892066955566, + "learning_rate": 2.4430554869311727e-05, + "loss": 5.6749, + "step": 10352 + }, + { + "epoch": 0.51, + "grad_norm": 1.7354161739349365, + "learning_rate": 2.4420672958150105e-05, + "loss": 5.4942, + "step": 10356 + }, + { + "epoch": 0.51, + "grad_norm": 1.9726040363311768, + "learning_rate": 2.4410791046988487e-05, + "loss": 5.5258, + "step": 10360 + }, + { + "epoch": 0.51, + "grad_norm": 1.7719279527664185, + "learning_rate": 2.440090913582687e-05, + "loss": 5.5169, + "step": 10364 + }, + { + "epoch": 0.51, + "grad_norm": 1.8121774196624756, + "learning_rate": 2.439102722466525e-05, + "loss": 5.5811, + "step": 10368 + }, + { + "epoch": 0.51, + "grad_norm": 1.656872034072876, + "learning_rate": 2.4381145313503634e-05, + "loss": 5.6121, + "step": 10372 + }, + { + "epoch": 0.51, + "grad_norm": 1.9030474424362183, + "learning_rate": 2.4371263402342016e-05, + "loss": 5.5099, + "step": 10376 + }, + { + "epoch": 0.51, + "grad_norm": 1.8425421714782715, + "learning_rate": 2.4361381491180398e-05, + "loss": 5.5167, + "step": 10380 + }, + { + "epoch": 0.51, + "grad_norm": 2.0104122161865234, + "learning_rate": 2.4351499580018776e-05, + "loss": 5.629, + "step": 10384 + }, + { + "epoch": 0.51, + "grad_norm": 1.7906516790390015, + "learning_rate": 2.434161766885716e-05, + "loss": 5.632, + "step": 10388 + }, + { + "epoch": 0.51, + "grad_norm": 1.966325044631958, + "learning_rate": 2.433173575769554e-05, + "loss": 5.6271, + "step": 10392 + }, + { + "epoch": 0.51, + "grad_norm": 1.9197744131088257, + "learning_rate": 2.432185384653392e-05, + "loss": 5.5729, + "step": 10396 + }, + { + "epoch": 0.51, + "grad_norm": 1.9084253311157227, + "learning_rate": 2.43119719353723e-05, + "loss": 5.5213, + "step": 10400 + }, + { + "epoch": 0.51, + "grad_norm": 1.743037223815918, + "learning_rate": 2.4302090024210683e-05, + "loss": 5.6202, + "step": 10404 + }, + { + "epoch": 0.51, + "grad_norm": 1.6884185075759888, + "learning_rate": 2.4292208113049065e-05, + "loss": 5.52, + "step": 10408 + }, + { + "epoch": 0.51, + "grad_norm": 1.9082211256027222, + "learning_rate": 2.4282326201887444e-05, + "loss": 5.6913, + "step": 10412 + }, + { + "epoch": 0.51, + "grad_norm": 1.7120208740234375, + "learning_rate": 2.4272444290725826e-05, + "loss": 5.536, + "step": 10416 + }, + { + "epoch": 0.51, + "grad_norm": 2.0013391971588135, + "learning_rate": 2.4262562379564208e-05, + "loss": 5.6444, + "step": 10420 + }, + { + "epoch": 0.52, + "grad_norm": 1.751233458518982, + "learning_rate": 2.425268046840259e-05, + "loss": 5.4514, + "step": 10424 + }, + { + "epoch": 0.52, + "grad_norm": 1.7701613903045654, + "learning_rate": 2.4242798557240972e-05, + "loss": 5.5492, + "step": 10428 + }, + { + "epoch": 0.52, + "grad_norm": 1.672849416732788, + "learning_rate": 2.4232916646079354e-05, + "loss": 5.5202, + "step": 10432 + }, + { + "epoch": 0.52, + "grad_norm": 1.9248921871185303, + "learning_rate": 2.4223034734917736e-05, + "loss": 5.5876, + "step": 10436 + }, + { + "epoch": 0.52, + "grad_norm": 1.7727632522583008, + "learning_rate": 2.4213152823756115e-05, + "loss": 5.5529, + "step": 10440 + }, + { + "epoch": 0.52, + "grad_norm": 1.9755526781082153, + "learning_rate": 2.4203270912594497e-05, + "loss": 5.6183, + "step": 10444 + }, + { + "epoch": 0.52, + "grad_norm": 1.7922430038452148, + "learning_rate": 2.419338900143288e-05, + "loss": 5.5369, + "step": 10448 + }, + { + "epoch": 0.52, + "grad_norm": 2.185535430908203, + "learning_rate": 2.4183507090271258e-05, + "loss": 5.6702, + "step": 10452 + }, + { + "epoch": 0.52, + "grad_norm": 1.659045934677124, + "learning_rate": 2.417362517910964e-05, + "loss": 5.4956, + "step": 10456 + }, + { + "epoch": 0.52, + "grad_norm": 1.8726792335510254, + "learning_rate": 2.4163743267948022e-05, + "loss": 5.5971, + "step": 10460 + }, + { + "epoch": 0.52, + "grad_norm": 1.8656306266784668, + "learning_rate": 2.4153861356786404e-05, + "loss": 5.6505, + "step": 10464 + }, + { + "epoch": 0.52, + "grad_norm": 2.1114795207977295, + "learning_rate": 2.4143979445624783e-05, + "loss": 5.6579, + "step": 10468 + }, + { + "epoch": 0.52, + "grad_norm": 1.7665212154388428, + "learning_rate": 2.4134097534463165e-05, + "loss": 5.5182, + "step": 10472 + }, + { + "epoch": 0.52, + "grad_norm": 2.0020570755004883, + "learning_rate": 2.4124215623301547e-05, + "loss": 5.6146, + "step": 10476 + }, + { + "epoch": 0.52, + "grad_norm": 1.81464684009552, + "learning_rate": 2.411433371213993e-05, + "loss": 5.6334, + "step": 10480 + }, + { + "epoch": 0.52, + "grad_norm": 2.0478081703186035, + "learning_rate": 2.410445180097831e-05, + "loss": 5.6295, + "step": 10484 + }, + { + "epoch": 0.52, + "grad_norm": 1.9547245502471924, + "learning_rate": 2.4094569889816693e-05, + "loss": 5.604, + "step": 10488 + }, + { + "epoch": 0.52, + "grad_norm": 2.018515110015869, + "learning_rate": 2.4084687978655075e-05, + "loss": 5.498, + "step": 10492 + }, + { + "epoch": 0.52, + "grad_norm": 1.7030364274978638, + "learning_rate": 2.4074806067493454e-05, + "loss": 5.654, + "step": 10496 + }, + { + "epoch": 0.52, + "grad_norm": 2.0866682529449463, + "learning_rate": 2.4064924156331836e-05, + "loss": 5.5155, + "step": 10500 + }, + { + "epoch": 0.52, + "grad_norm": 2.075389862060547, + "learning_rate": 2.4055042245170218e-05, + "loss": 5.5227, + "step": 10504 + }, + { + "epoch": 0.52, + "grad_norm": 1.9765968322753906, + "learning_rate": 2.40451603340086e-05, + "loss": 5.6732, + "step": 10508 + }, + { + "epoch": 0.52, + "grad_norm": 2.0013983249664307, + "learning_rate": 2.403527842284698e-05, + "loss": 5.6551, + "step": 10512 + }, + { + "epoch": 0.52, + "grad_norm": 2.044642448425293, + "learning_rate": 2.402539651168536e-05, + "loss": 5.5395, + "step": 10516 + }, + { + "epoch": 0.52, + "grad_norm": 1.6758370399475098, + "learning_rate": 2.4015514600523743e-05, + "loss": 5.5563, + "step": 10520 + }, + { + "epoch": 0.52, + "grad_norm": 1.8717275857925415, + "learning_rate": 2.400563268936212e-05, + "loss": 5.5039, + "step": 10524 + }, + { + "epoch": 0.52, + "grad_norm": 1.9705373048782349, + "learning_rate": 2.3995750778200504e-05, + "loss": 5.6172, + "step": 10528 + }, + { + "epoch": 0.52, + "grad_norm": 1.7623977661132812, + "learning_rate": 2.3985868867038886e-05, + "loss": 5.4925, + "step": 10532 + }, + { + "epoch": 0.52, + "grad_norm": 1.9180505275726318, + "learning_rate": 2.3975986955877268e-05, + "loss": 5.6497, + "step": 10536 + }, + { + "epoch": 0.52, + "grad_norm": 1.940900206565857, + "learning_rate": 2.396610504471565e-05, + "loss": 5.5216, + "step": 10540 + }, + { + "epoch": 0.52, + "grad_norm": 1.7929134368896484, + "learning_rate": 2.3956223133554032e-05, + "loss": 5.483, + "step": 10544 + }, + { + "epoch": 0.52, + "grad_norm": 1.8497039079666138, + "learning_rate": 2.3946341222392414e-05, + "loss": 5.5895, + "step": 10548 + }, + { + "epoch": 0.52, + "grad_norm": 1.8685697317123413, + "learning_rate": 2.3936459311230793e-05, + "loss": 5.5565, + "step": 10552 + }, + { + "epoch": 0.52, + "grad_norm": 2.2542967796325684, + "learning_rate": 2.3926577400069175e-05, + "loss": 5.5895, + "step": 10556 + }, + { + "epoch": 0.52, + "grad_norm": 2.043692111968994, + "learning_rate": 2.3916695488907557e-05, + "loss": 5.4708, + "step": 10560 + }, + { + "epoch": 0.52, + "grad_norm": 2.1556544303894043, + "learning_rate": 2.390681357774594e-05, + "loss": 5.5257, + "step": 10564 + }, + { + "epoch": 0.52, + "grad_norm": 2.0351712703704834, + "learning_rate": 2.3896931666584317e-05, + "loss": 5.5247, + "step": 10568 + }, + { + "epoch": 0.52, + "grad_norm": 1.7244350910186768, + "learning_rate": 2.38870497554227e-05, + "loss": 5.4927, + "step": 10572 + }, + { + "epoch": 0.52, + "grad_norm": 2.0928471088409424, + "learning_rate": 2.387716784426108e-05, + "loss": 5.4176, + "step": 10576 + }, + { + "epoch": 0.52, + "grad_norm": 2.0530073642730713, + "learning_rate": 2.3867285933099464e-05, + "loss": 5.6525, + "step": 10580 + }, + { + "epoch": 0.52, + "grad_norm": 1.8700250387191772, + "learning_rate": 2.3857404021937842e-05, + "loss": 5.5591, + "step": 10584 + }, + { + "epoch": 0.52, + "grad_norm": 1.6774561405181885, + "learning_rate": 2.3847522110776224e-05, + "loss": 5.5925, + "step": 10588 + }, + { + "epoch": 0.52, + "grad_norm": 2.1128244400024414, + "learning_rate": 2.3837640199614606e-05, + "loss": 5.5118, + "step": 10592 + }, + { + "epoch": 0.52, + "grad_norm": 1.9600874185562134, + "learning_rate": 2.382775828845299e-05, + "loss": 5.563, + "step": 10596 + }, + { + "epoch": 0.52, + "grad_norm": 1.9983313083648682, + "learning_rate": 2.381787637729137e-05, + "loss": 5.4649, + "step": 10600 + }, + { + "epoch": 0.52, + "grad_norm": 1.9052096605300903, + "learning_rate": 2.3807994466129753e-05, + "loss": 5.5366, + "step": 10604 + }, + { + "epoch": 0.52, + "grad_norm": 1.9954668283462524, + "learning_rate": 2.379811255496813e-05, + "loss": 5.5479, + "step": 10608 + }, + { + "epoch": 0.52, + "grad_norm": 1.9341228008270264, + "learning_rate": 2.3788230643806513e-05, + "loss": 5.4649, + "step": 10612 + }, + { + "epoch": 0.52, + "grad_norm": 1.9938849210739136, + "learning_rate": 2.3778348732644895e-05, + "loss": 5.5791, + "step": 10616 + }, + { + "epoch": 0.52, + "grad_norm": 1.6775513887405396, + "learning_rate": 2.3768466821483277e-05, + "loss": 5.61, + "step": 10620 + }, + { + "epoch": 0.52, + "grad_norm": 1.9239006042480469, + "learning_rate": 2.3758584910321656e-05, + "loss": 5.5689, + "step": 10624 + }, + { + "epoch": 0.53, + "grad_norm": 2.060796022415161, + "learning_rate": 2.3748702999160038e-05, + "loss": 5.5823, + "step": 10628 + }, + { + "epoch": 0.53, + "grad_norm": 1.95215904712677, + "learning_rate": 2.373882108799842e-05, + "loss": 5.4581, + "step": 10632 + }, + { + "epoch": 0.53, + "grad_norm": 1.9908874034881592, + "learning_rate": 2.3728939176836802e-05, + "loss": 5.5968, + "step": 10636 + }, + { + "epoch": 0.53, + "grad_norm": 1.6556943655014038, + "learning_rate": 2.371905726567518e-05, + "loss": 5.5399, + "step": 10640 + }, + { + "epoch": 0.53, + "grad_norm": 1.9152753353118896, + "learning_rate": 2.3709175354513563e-05, + "loss": 5.6018, + "step": 10644 + }, + { + "epoch": 0.53, + "grad_norm": 2.3656210899353027, + "learning_rate": 2.3699293443351945e-05, + "loss": 5.4332, + "step": 10648 + }, + { + "epoch": 0.53, + "grad_norm": 1.9801435470581055, + "learning_rate": 2.3689411532190324e-05, + "loss": 5.5684, + "step": 10652 + }, + { + "epoch": 0.53, + "grad_norm": 2.1441659927368164, + "learning_rate": 2.367952962102871e-05, + "loss": 5.3962, + "step": 10656 + }, + { + "epoch": 0.53, + "grad_norm": 2.2612667083740234, + "learning_rate": 2.366964770986709e-05, + "loss": 5.5301, + "step": 10660 + }, + { + "epoch": 0.53, + "grad_norm": 2.1340534687042236, + "learning_rate": 2.3659765798705473e-05, + "loss": 5.6125, + "step": 10664 + }, + { + "epoch": 0.53, + "grad_norm": 2.0850844383239746, + "learning_rate": 2.3649883887543852e-05, + "loss": 5.4433, + "step": 10668 + }, + { + "epoch": 0.53, + "grad_norm": 1.7150609493255615, + "learning_rate": 2.3640001976382234e-05, + "loss": 5.5438, + "step": 10672 + }, + { + "epoch": 0.53, + "grad_norm": 1.9754505157470703, + "learning_rate": 2.3630120065220616e-05, + "loss": 5.6475, + "step": 10676 + }, + { + "epoch": 0.53, + "grad_norm": 1.9348641633987427, + "learning_rate": 2.3620238154058995e-05, + "loss": 5.6183, + "step": 10680 + }, + { + "epoch": 0.53, + "grad_norm": 1.9712179899215698, + "learning_rate": 2.3610356242897377e-05, + "loss": 5.5499, + "step": 10684 + }, + { + "epoch": 0.53, + "grad_norm": 1.9912450313568115, + "learning_rate": 2.360047433173576e-05, + "loss": 5.534, + "step": 10688 + }, + { + "epoch": 0.53, + "grad_norm": 1.8730937242507935, + "learning_rate": 2.359059242057414e-05, + "loss": 5.6078, + "step": 10692 + }, + { + "epoch": 0.53, + "grad_norm": 2.082500696182251, + "learning_rate": 2.358071050941252e-05, + "loss": 5.5638, + "step": 10696 + }, + { + "epoch": 0.53, + "grad_norm": 1.990276575088501, + "learning_rate": 2.35708285982509e-05, + "loss": 5.4854, + "step": 10700 + }, + { + "epoch": 0.53, + "grad_norm": 1.8696178197860718, + "learning_rate": 2.3560946687089284e-05, + "loss": 5.5041, + "step": 10704 + }, + { + "epoch": 0.53, + "grad_norm": 2.1446194648742676, + "learning_rate": 2.3551064775927666e-05, + "loss": 5.6358, + "step": 10708 + }, + { + "epoch": 0.53, + "grad_norm": 1.7713229656219482, + "learning_rate": 2.3541182864766048e-05, + "loss": 5.5635, + "step": 10712 + }, + { + "epoch": 0.53, + "grad_norm": 1.876599907875061, + "learning_rate": 2.353130095360443e-05, + "loss": 5.4398, + "step": 10716 + }, + { + "epoch": 0.53, + "grad_norm": 1.7540398836135864, + "learning_rate": 2.3521419042442812e-05, + "loss": 5.4983, + "step": 10720 + }, + { + "epoch": 0.53, + "grad_norm": 2.0557737350463867, + "learning_rate": 2.351153713128119e-05, + "loss": 5.4517, + "step": 10724 + }, + { + "epoch": 0.53, + "grad_norm": 1.7068967819213867, + "learning_rate": 2.3501655220119573e-05, + "loss": 5.4742, + "step": 10728 + }, + { + "epoch": 0.53, + "grad_norm": 1.9416307210922241, + "learning_rate": 2.3491773308957955e-05, + "loss": 5.6298, + "step": 10732 + }, + { + "epoch": 0.53, + "grad_norm": 1.8731729984283447, + "learning_rate": 2.3481891397796333e-05, + "loss": 5.5531, + "step": 10736 + }, + { + "epoch": 0.53, + "grad_norm": 2.060716390609741, + "learning_rate": 2.3472009486634715e-05, + "loss": 5.6034, + "step": 10740 + }, + { + "epoch": 0.53, + "grad_norm": 2.003988742828369, + "learning_rate": 2.3462127575473098e-05, + "loss": 5.5876, + "step": 10744 + }, + { + "epoch": 0.53, + "grad_norm": 2.0887482166290283, + "learning_rate": 2.345224566431148e-05, + "loss": 5.5417, + "step": 10748 + }, + { + "epoch": 0.53, + "grad_norm": 2.017455577850342, + "learning_rate": 2.3442363753149858e-05, + "loss": 5.546, + "step": 10752 + }, + { + "epoch": 0.53, + "grad_norm": 1.8705204725265503, + "learning_rate": 2.343248184198824e-05, + "loss": 5.6274, + "step": 10756 + }, + { + "epoch": 0.53, + "grad_norm": 1.9942293167114258, + "learning_rate": 2.3422599930826622e-05, + "loss": 5.4287, + "step": 10760 + }, + { + "epoch": 0.53, + "grad_norm": 1.894716739654541, + "learning_rate": 2.3412718019665004e-05, + "loss": 5.5343, + "step": 10764 + }, + { + "epoch": 0.53, + "grad_norm": 2.020045042037964, + "learning_rate": 2.3402836108503387e-05, + "loss": 5.6098, + "step": 10768 + }, + { + "epoch": 0.53, + "grad_norm": 2.01450252532959, + "learning_rate": 2.339295419734177e-05, + "loss": 5.5875, + "step": 10772 + }, + { + "epoch": 0.53, + "grad_norm": 1.9330328702926636, + "learning_rate": 2.338307228618015e-05, + "loss": 5.5679, + "step": 10776 + }, + { + "epoch": 0.53, + "grad_norm": 1.871109962463379, + "learning_rate": 2.337319037501853e-05, + "loss": 5.5312, + "step": 10780 + }, + { + "epoch": 0.53, + "grad_norm": 2.02812123298645, + "learning_rate": 2.336330846385691e-05, + "loss": 5.3512, + "step": 10784 + }, + { + "epoch": 0.53, + "grad_norm": 1.9817901849746704, + "learning_rate": 2.3353426552695293e-05, + "loss": 5.4152, + "step": 10788 + }, + { + "epoch": 0.53, + "grad_norm": 1.9336769580841064, + "learning_rate": 2.3343544641533675e-05, + "loss": 5.5362, + "step": 10792 + }, + { + "epoch": 0.53, + "grad_norm": 1.8532609939575195, + "learning_rate": 2.3333662730372054e-05, + "loss": 5.4603, + "step": 10796 + }, + { + "epoch": 0.53, + "grad_norm": 1.8806451559066772, + "learning_rate": 2.3323780819210436e-05, + "loss": 5.3589, + "step": 10800 + }, + { + "epoch": 0.53, + "grad_norm": 1.8947298526763916, + "learning_rate": 2.3313898908048818e-05, + "loss": 5.5674, + "step": 10804 + }, + { + "epoch": 0.53, + "grad_norm": 1.9279778003692627, + "learning_rate": 2.3304016996887197e-05, + "loss": 5.5096, + "step": 10808 + }, + { + "epoch": 0.53, + "grad_norm": 2.0926554203033447, + "learning_rate": 2.329413508572558e-05, + "loss": 5.6147, + "step": 10812 + }, + { + "epoch": 0.53, + "grad_norm": 2.0202767848968506, + "learning_rate": 2.328425317456396e-05, + "loss": 5.6283, + "step": 10816 + }, + { + "epoch": 0.53, + "grad_norm": 1.9681588411331177, + "learning_rate": 2.3274371263402343e-05, + "loss": 5.711, + "step": 10820 + }, + { + "epoch": 0.53, + "grad_norm": 2.059591054916382, + "learning_rate": 2.3264489352240722e-05, + "loss": 5.5387, + "step": 10824 + }, + { + "epoch": 0.54, + "grad_norm": 1.7772566080093384, + "learning_rate": 2.3254607441079107e-05, + "loss": 5.5673, + "step": 10828 + }, + { + "epoch": 0.54, + "grad_norm": 2.169037103652954, + "learning_rate": 2.324472552991749e-05, + "loss": 5.4922, + "step": 10832 + }, + { + "epoch": 0.54, + "grad_norm": 1.7867958545684814, + "learning_rate": 2.3234843618755868e-05, + "loss": 5.4829, + "step": 10836 + }, + { + "epoch": 0.54, + "grad_norm": 2.0761024951934814, + "learning_rate": 2.322496170759425e-05, + "loss": 5.57, + "step": 10840 + }, + { + "epoch": 0.54, + "grad_norm": 1.8901103734970093, + "learning_rate": 2.3215079796432632e-05, + "loss": 5.6031, + "step": 10844 + }, + { + "epoch": 0.54, + "grad_norm": 1.8197063207626343, + "learning_rate": 2.3205197885271014e-05, + "loss": 5.6221, + "step": 10848 + }, + { + "epoch": 0.54, + "grad_norm": 1.8209093809127808, + "learning_rate": 2.3195315974109393e-05, + "loss": 5.4747, + "step": 10852 + }, + { + "epoch": 0.54, + "grad_norm": 2.1222522258758545, + "learning_rate": 2.3185434062947775e-05, + "loss": 5.5274, + "step": 10856 + }, + { + "epoch": 0.54, + "grad_norm": 1.7312264442443848, + "learning_rate": 2.3175552151786157e-05, + "loss": 5.5181, + "step": 10860 + }, + { + "epoch": 0.54, + "grad_norm": 1.9499653577804565, + "learning_rate": 2.316567024062454e-05, + "loss": 5.5653, + "step": 10864 + }, + { + "epoch": 0.54, + "grad_norm": 1.7755954265594482, + "learning_rate": 2.3155788329462918e-05, + "loss": 5.5213, + "step": 10868 + }, + { + "epoch": 0.54, + "grad_norm": 1.6512964963912964, + "learning_rate": 2.31459064183013e-05, + "loss": 5.4163, + "step": 10872 + }, + { + "epoch": 0.54, + "grad_norm": 1.9016536474227905, + "learning_rate": 2.3136024507139682e-05, + "loss": 5.4877, + "step": 10876 + }, + { + "epoch": 0.54, + "grad_norm": 1.9362504482269287, + "learning_rate": 2.312614259597806e-05, + "loss": 5.5776, + "step": 10880 + }, + { + "epoch": 0.54, + "grad_norm": 2.0821099281311035, + "learning_rate": 2.3116260684816446e-05, + "loss": 5.4961, + "step": 10884 + }, + { + "epoch": 0.54, + "grad_norm": 1.6689753532409668, + "learning_rate": 2.3106378773654828e-05, + "loss": 5.5967, + "step": 10888 + }, + { + "epoch": 0.54, + "grad_norm": 1.8034151792526245, + "learning_rate": 2.3096496862493207e-05, + "loss": 5.5646, + "step": 10892 + }, + { + "epoch": 0.54, + "grad_norm": 1.8747068643569946, + "learning_rate": 2.308661495133159e-05, + "loss": 5.599, + "step": 10896 + }, + { + "epoch": 0.54, + "grad_norm": 2.044126033782959, + "learning_rate": 2.307673304016997e-05, + "loss": 5.4739, + "step": 10900 + }, + { + "epoch": 0.54, + "grad_norm": 2.0283472537994385, + "learning_rate": 2.3066851129008353e-05, + "loss": 5.5447, + "step": 10904 + }, + { + "epoch": 0.54, + "grad_norm": 2.0088658332824707, + "learning_rate": 2.305696921784673e-05, + "loss": 5.6111, + "step": 10908 + }, + { + "epoch": 0.54, + "grad_norm": 1.8219811916351318, + "learning_rate": 2.3047087306685114e-05, + "loss": 5.6571, + "step": 10912 + }, + { + "epoch": 0.54, + "grad_norm": 2.010409116744995, + "learning_rate": 2.3037205395523496e-05, + "loss": 5.5405, + "step": 10916 + }, + { + "epoch": 0.54, + "grad_norm": 2.03324031829834, + "learning_rate": 2.3027323484361878e-05, + "loss": 5.633, + "step": 10920 + }, + { + "epoch": 0.54, + "grad_norm": 1.9077966213226318, + "learning_rate": 2.3017441573200256e-05, + "loss": 5.5292, + "step": 10924 + }, + { + "epoch": 0.54, + "grad_norm": 1.7984883785247803, + "learning_rate": 2.300755966203864e-05, + "loss": 5.5423, + "step": 10928 + }, + { + "epoch": 0.54, + "grad_norm": 2.038520336151123, + "learning_rate": 2.299767775087702e-05, + "loss": 5.5662, + "step": 10932 + }, + { + "epoch": 0.54, + "grad_norm": 2.0444748401641846, + "learning_rate": 2.2987795839715403e-05, + "loss": 5.5273, + "step": 10936 + }, + { + "epoch": 0.54, + "grad_norm": 2.0163471698760986, + "learning_rate": 2.2977913928553785e-05, + "loss": 5.5836, + "step": 10940 + }, + { + "epoch": 0.54, + "grad_norm": 2.246277093887329, + "learning_rate": 2.2968032017392167e-05, + "loss": 5.717, + "step": 10944 + }, + { + "epoch": 0.54, + "grad_norm": 2.040771484375, + "learning_rate": 2.295815010623055e-05, + "loss": 5.5738, + "step": 10948 + }, + { + "epoch": 0.54, + "grad_norm": 1.9808095693588257, + "learning_rate": 2.2948268195068927e-05, + "loss": 5.6387, + "step": 10952 + }, + { + "epoch": 0.54, + "grad_norm": 2.1175899505615234, + "learning_rate": 2.293838628390731e-05, + "loss": 5.5871, + "step": 10956 + }, + { + "epoch": 0.54, + "grad_norm": 1.99501633644104, + "learning_rate": 2.292850437274569e-05, + "loss": 5.5695, + "step": 10960 + }, + { + "epoch": 0.54, + "grad_norm": 1.9630918502807617, + "learning_rate": 2.291862246158407e-05, + "loss": 5.5771, + "step": 10964 + }, + { + "epoch": 0.54, + "grad_norm": 1.638632893562317, + "learning_rate": 2.2908740550422452e-05, + "loss": 5.551, + "step": 10968 + }, + { + "epoch": 0.54, + "grad_norm": 1.9305760860443115, + "learning_rate": 2.2898858639260834e-05, + "loss": 5.5803, + "step": 10972 + }, + { + "epoch": 0.54, + "grad_norm": 1.7646735906600952, + "learning_rate": 2.2888976728099216e-05, + "loss": 5.5042, + "step": 10976 + }, + { + "epoch": 0.54, + "grad_norm": 2.2676877975463867, + "learning_rate": 2.2879094816937595e-05, + "loss": 5.5913, + "step": 10980 + }, + { + "epoch": 0.54, + "grad_norm": 1.974547028541565, + "learning_rate": 2.2869212905775977e-05, + "loss": 5.5148, + "step": 10984 + }, + { + "epoch": 0.54, + "grad_norm": 1.830438494682312, + "learning_rate": 2.285933099461436e-05, + "loss": 5.5472, + "step": 10988 + }, + { + "epoch": 0.54, + "grad_norm": 2.166577100753784, + "learning_rate": 2.284944908345274e-05, + "loss": 5.4449, + "step": 10992 + }, + { + "epoch": 0.54, + "grad_norm": 2.0472869873046875, + "learning_rate": 2.283956717229112e-05, + "loss": 5.6287, + "step": 10996 + }, + { + "epoch": 0.54, + "grad_norm": 1.8216519355773926, + "learning_rate": 2.2829685261129505e-05, + "loss": 5.5427, + "step": 11000 + }, + { + "epoch": 0.54, + "grad_norm": 2.0221810340881348, + "learning_rate": 2.2819803349967887e-05, + "loss": 5.6037, + "step": 11004 + }, + { + "epoch": 0.54, + "grad_norm": 1.8711715936660767, + "learning_rate": 2.2809921438806266e-05, + "loss": 5.5891, + "step": 11008 + }, + { + "epoch": 0.54, + "grad_norm": 1.8653993606567383, + "learning_rate": 2.2800039527644648e-05, + "loss": 5.4277, + "step": 11012 + }, + { + "epoch": 0.54, + "grad_norm": 1.9809545278549194, + "learning_rate": 2.279015761648303e-05, + "loss": 5.4548, + "step": 11016 + }, + { + "epoch": 0.54, + "grad_norm": 1.9581102132797241, + "learning_rate": 2.278027570532141e-05, + "loss": 5.5037, + "step": 11020 + }, + { + "epoch": 0.54, + "grad_norm": 2.251185417175293, + "learning_rate": 2.277039379415979e-05, + "loss": 5.5988, + "step": 11024 + }, + { + "epoch": 0.54, + "grad_norm": 1.7871466875076294, + "learning_rate": 2.2760511882998173e-05, + "loss": 5.3987, + "step": 11028 + }, + { + "epoch": 0.55, + "grad_norm": 1.8406583070755005, + "learning_rate": 2.2750629971836555e-05, + "loss": 5.6034, + "step": 11032 + }, + { + "epoch": 0.55, + "grad_norm": 2.1242032051086426, + "learning_rate": 2.2740748060674934e-05, + "loss": 5.512, + "step": 11036 + }, + { + "epoch": 0.55, + "grad_norm": 2.062807083129883, + "learning_rate": 2.2730866149513316e-05, + "loss": 5.5253, + "step": 11040 + }, + { + "epoch": 0.55, + "grad_norm": 2.1405367851257324, + "learning_rate": 2.2720984238351698e-05, + "loss": 5.579, + "step": 11044 + }, + { + "epoch": 0.55, + "grad_norm": 1.989423155784607, + "learning_rate": 2.271110232719008e-05, + "loss": 5.5023, + "step": 11048 + }, + { + "epoch": 0.55, + "grad_norm": 1.7983574867248535, + "learning_rate": 2.270122041602846e-05, + "loss": 5.6174, + "step": 11052 + }, + { + "epoch": 0.55, + "grad_norm": 1.9673739671707153, + "learning_rate": 2.2691338504866844e-05, + "loss": 5.4921, + "step": 11056 + }, + { + "epoch": 0.55, + "grad_norm": 2.0545785427093506, + "learning_rate": 2.2681456593705226e-05, + "loss": 5.5915, + "step": 11060 + }, + { + "epoch": 0.55, + "grad_norm": 1.947089433670044, + "learning_rate": 2.2671574682543605e-05, + "loss": 5.6147, + "step": 11064 + }, + { + "epoch": 0.55, + "grad_norm": 1.6919525861740112, + "learning_rate": 2.2661692771381987e-05, + "loss": 5.5268, + "step": 11068 + }, + { + "epoch": 0.55, + "grad_norm": 1.9140796661376953, + "learning_rate": 2.265181086022037e-05, + "loss": 5.6551, + "step": 11072 + }, + { + "epoch": 0.55, + "grad_norm": 2.034980297088623, + "learning_rate": 2.264192894905875e-05, + "loss": 5.5495, + "step": 11076 + }, + { + "epoch": 0.55, + "grad_norm": 2.2609176635742188, + "learning_rate": 2.263204703789713e-05, + "loss": 5.52, + "step": 11080 + }, + { + "epoch": 0.55, + "grad_norm": 2.154770612716675, + "learning_rate": 2.262216512673551e-05, + "loss": 5.5406, + "step": 11084 + }, + { + "epoch": 0.55, + "grad_norm": 1.8771696090698242, + "learning_rate": 2.2612283215573894e-05, + "loss": 5.4876, + "step": 11088 + }, + { + "epoch": 0.55, + "grad_norm": 1.8587368726730347, + "learning_rate": 2.2602401304412272e-05, + "loss": 5.6025, + "step": 11092 + }, + { + "epoch": 0.55, + "grad_norm": 1.8150922060012817, + "learning_rate": 2.2592519393250654e-05, + "loss": 5.4589, + "step": 11096 + }, + { + "epoch": 0.55, + "grad_norm": 1.94694185256958, + "learning_rate": 2.2582637482089037e-05, + "loss": 5.4608, + "step": 11100 + }, + { + "epoch": 0.55, + "grad_norm": 1.919480800628662, + "learning_rate": 2.257275557092742e-05, + "loss": 5.4808, + "step": 11104 + }, + { + "epoch": 0.55, + "grad_norm": 1.9377970695495605, + "learning_rate": 2.2562873659765797e-05, + "loss": 5.5321, + "step": 11108 + }, + { + "epoch": 0.55, + "grad_norm": 1.8229963779449463, + "learning_rate": 2.2552991748604183e-05, + "loss": 5.5011, + "step": 11112 + }, + { + "epoch": 0.55, + "grad_norm": 1.7934753894805908, + "learning_rate": 2.2543109837442565e-05, + "loss": 5.4912, + "step": 11116 + }, + { + "epoch": 0.55, + "grad_norm": 2.294386386871338, + "learning_rate": 2.2533227926280943e-05, + "loss": 5.5593, + "step": 11120 + }, + { + "epoch": 0.55, + "grad_norm": 1.9534809589385986, + "learning_rate": 2.2523346015119325e-05, + "loss": 5.5784, + "step": 11124 + }, + { + "epoch": 0.55, + "grad_norm": 2.121579885482788, + "learning_rate": 2.2513464103957708e-05, + "loss": 5.6106, + "step": 11128 + }, + { + "epoch": 0.55, + "grad_norm": 1.8341236114501953, + "learning_rate": 2.250358219279609e-05, + "loss": 5.5102, + "step": 11132 + }, + { + "epoch": 0.55, + "grad_norm": 1.8656764030456543, + "learning_rate": 2.2493700281634468e-05, + "loss": 5.5725, + "step": 11136 + }, + { + "epoch": 0.55, + "grad_norm": 2.026597738265991, + "learning_rate": 2.248381837047285e-05, + "loss": 5.5864, + "step": 11140 + }, + { + "epoch": 0.55, + "grad_norm": 1.7924002408981323, + "learning_rate": 2.2473936459311232e-05, + "loss": 5.5194, + "step": 11144 + }, + { + "epoch": 0.55, + "grad_norm": 1.9699978828430176, + "learning_rate": 2.2464054548149614e-05, + "loss": 5.5491, + "step": 11148 + }, + { + "epoch": 0.55, + "grad_norm": 2.0057380199432373, + "learning_rate": 2.2454172636987993e-05, + "loss": 5.5209, + "step": 11152 + }, + { + "epoch": 0.55, + "grad_norm": 1.9208364486694336, + "learning_rate": 2.2444290725826375e-05, + "loss": 5.6376, + "step": 11156 + }, + { + "epoch": 0.55, + "grad_norm": 2.1530115604400635, + "learning_rate": 2.2434408814664757e-05, + "loss": 5.5286, + "step": 11160 + }, + { + "epoch": 0.55, + "grad_norm": 2.0784969329833984, + "learning_rate": 2.2424526903503136e-05, + "loss": 5.629, + "step": 11164 + }, + { + "epoch": 0.55, + "grad_norm": 1.7239807844161987, + "learning_rate": 2.2414644992341518e-05, + "loss": 5.4354, + "step": 11168 + }, + { + "epoch": 0.55, + "grad_norm": 2.1020421981811523, + "learning_rate": 2.2404763081179903e-05, + "loss": 5.4923, + "step": 11172 + }, + { + "epoch": 0.55, + "grad_norm": 1.8407636880874634, + "learning_rate": 2.2394881170018282e-05, + "loss": 5.5506, + "step": 11176 + }, + { + "epoch": 0.55, + "grad_norm": 1.8488260507583618, + "learning_rate": 2.2384999258856664e-05, + "loss": 5.6145, + "step": 11180 + }, + { + "epoch": 0.55, + "grad_norm": 2.0274198055267334, + "learning_rate": 2.2375117347695046e-05, + "loss": 5.5115, + "step": 11184 + }, + { + "epoch": 0.55, + "grad_norm": 2.122840642929077, + "learning_rate": 2.2365235436533428e-05, + "loss": 5.4538, + "step": 11188 + }, + { + "epoch": 0.55, + "grad_norm": 2.121507406234741, + "learning_rate": 2.2355353525371807e-05, + "loss": 5.4959, + "step": 11192 + }, + { + "epoch": 0.55, + "grad_norm": 2.1240291595458984, + "learning_rate": 2.234547161421019e-05, + "loss": 5.3912, + "step": 11196 + }, + { + "epoch": 0.55, + "grad_norm": 1.7324938774108887, + "learning_rate": 2.233558970304857e-05, + "loss": 5.5702, + "step": 11200 + }, + { + "epoch": 0.55, + "grad_norm": 1.7895362377166748, + "learning_rate": 2.2325707791886953e-05, + "loss": 5.5114, + "step": 11204 + }, + { + "epoch": 0.55, + "grad_norm": 1.8272205591201782, + "learning_rate": 2.2315825880725332e-05, + "loss": 5.5404, + "step": 11208 + }, + { + "epoch": 0.55, + "grad_norm": 2.2934823036193848, + "learning_rate": 2.2305943969563714e-05, + "loss": 5.6429, + "step": 11212 + }, + { + "epoch": 0.55, + "grad_norm": 2.1152985095977783, + "learning_rate": 2.2296062058402096e-05, + "loss": 5.5438, + "step": 11216 + }, + { + "epoch": 0.55, + "grad_norm": 1.9162452220916748, + "learning_rate": 2.2286180147240478e-05, + "loss": 5.574, + "step": 11220 + }, + { + "epoch": 0.55, + "grad_norm": 1.9226047992706299, + "learning_rate": 2.2276298236078857e-05, + "loss": 5.5964, + "step": 11224 + }, + { + "epoch": 0.55, + "grad_norm": 2.2039763927459717, + "learning_rate": 2.2266416324917242e-05, + "loss": 5.5645, + "step": 11228 + }, + { + "epoch": 0.55, + "grad_norm": 1.8182283639907837, + "learning_rate": 2.2256534413755624e-05, + "loss": 5.5464, + "step": 11232 + }, + { + "epoch": 0.56, + "grad_norm": 1.8267743587493896, + "learning_rate": 2.2246652502594003e-05, + "loss": 5.5826, + "step": 11236 + }, + { + "epoch": 0.56, + "grad_norm": 1.933274269104004, + "learning_rate": 2.2236770591432385e-05, + "loss": 5.5138, + "step": 11240 + }, + { + "epoch": 0.56, + "grad_norm": 2.160489320755005, + "learning_rate": 2.2226888680270767e-05, + "loss": 5.5879, + "step": 11244 + }, + { + "epoch": 0.56, + "grad_norm": 1.8666636943817139, + "learning_rate": 2.2217006769109146e-05, + "loss": 5.5751, + "step": 11248 + }, + { + "epoch": 0.56, + "grad_norm": 2.0974009037017822, + "learning_rate": 2.2207124857947528e-05, + "loss": 5.4429, + "step": 11252 + }, + { + "epoch": 0.56, + "grad_norm": 1.6826304197311401, + "learning_rate": 2.219724294678591e-05, + "loss": 5.5061, + "step": 11256 + }, + { + "epoch": 0.56, + "grad_norm": 1.9517406225204468, + "learning_rate": 2.2187361035624292e-05, + "loss": 5.5793, + "step": 11260 + }, + { + "epoch": 0.56, + "grad_norm": 1.9135072231292725, + "learning_rate": 2.217747912446267e-05, + "loss": 5.4732, + "step": 11264 + }, + { + "epoch": 0.56, + "grad_norm": 1.7441056966781616, + "learning_rate": 2.2167597213301053e-05, + "loss": 5.4808, + "step": 11268 + }, + { + "epoch": 0.56, + "grad_norm": 1.851120114326477, + "learning_rate": 2.2157715302139435e-05, + "loss": 5.6432, + "step": 11272 + }, + { + "epoch": 0.56, + "grad_norm": 1.8004862070083618, + "learning_rate": 2.2147833390977817e-05, + "loss": 5.5154, + "step": 11276 + }, + { + "epoch": 0.56, + "grad_norm": 1.6254485845565796, + "learning_rate": 2.2137951479816195e-05, + "loss": 5.6375, + "step": 11280 + }, + { + "epoch": 0.56, + "grad_norm": 1.9577946662902832, + "learning_rate": 2.2128069568654577e-05, + "loss": 5.559, + "step": 11284 + }, + { + "epoch": 0.56, + "grad_norm": 2.018347978591919, + "learning_rate": 2.2118187657492963e-05, + "loss": 5.484, + "step": 11288 + }, + { + "epoch": 0.56, + "grad_norm": 1.8874802589416504, + "learning_rate": 2.210830574633134e-05, + "loss": 5.577, + "step": 11292 + }, + { + "epoch": 0.56, + "grad_norm": 1.764642357826233, + "learning_rate": 2.2098423835169724e-05, + "loss": 5.45, + "step": 11296 + }, + { + "epoch": 0.56, + "grad_norm": 2.156693458557129, + "learning_rate": 2.2088541924008106e-05, + "loss": 5.5669, + "step": 11300 + }, + { + "epoch": 0.56, + "grad_norm": 1.8049863576889038, + "learning_rate": 2.2078660012846484e-05, + "loss": 5.5686, + "step": 11304 + }, + { + "epoch": 0.56, + "grad_norm": 1.7862818241119385, + "learning_rate": 2.2068778101684866e-05, + "loss": 5.4679, + "step": 11308 + }, + { + "epoch": 0.56, + "grad_norm": 1.8136225938796997, + "learning_rate": 2.205889619052325e-05, + "loss": 5.5579, + "step": 11312 + }, + { + "epoch": 0.56, + "grad_norm": 1.744735598564148, + "learning_rate": 2.204901427936163e-05, + "loss": 5.5756, + "step": 11316 + }, + { + "epoch": 0.56, + "grad_norm": 2.0922114849090576, + "learning_rate": 2.203913236820001e-05, + "loss": 5.3664, + "step": 11320 + }, + { + "epoch": 0.56, + "grad_norm": 1.8184114694595337, + "learning_rate": 2.202925045703839e-05, + "loss": 5.3773, + "step": 11324 + }, + { + "epoch": 0.56, + "grad_norm": 1.75265371799469, + "learning_rate": 2.2019368545876773e-05, + "loss": 5.5344, + "step": 11328 + }, + { + "epoch": 0.56, + "grad_norm": 2.045360803604126, + "learning_rate": 2.2009486634715155e-05, + "loss": 5.5896, + "step": 11332 + }, + { + "epoch": 0.56, + "grad_norm": 1.7742873430252075, + "learning_rate": 2.1999604723553534e-05, + "loss": 5.5624, + "step": 11336 + }, + { + "epoch": 0.56, + "grad_norm": 2.0210537910461426, + "learning_rate": 2.1989722812391916e-05, + "loss": 5.6381, + "step": 11340 + }, + { + "epoch": 0.56, + "grad_norm": 1.8388046026229858, + "learning_rate": 2.19798409012303e-05, + "loss": 5.5318, + "step": 11344 + }, + { + "epoch": 0.56, + "grad_norm": 1.9268219470977783, + "learning_rate": 2.196995899006868e-05, + "loss": 5.5328, + "step": 11348 + }, + { + "epoch": 0.56, + "grad_norm": 2.0054502487182617, + "learning_rate": 2.1960077078907062e-05, + "loss": 5.5936, + "step": 11352 + }, + { + "epoch": 0.56, + "grad_norm": 1.741058349609375, + "learning_rate": 2.1950195167745444e-05, + "loss": 5.5389, + "step": 11356 + }, + { + "epoch": 0.56, + "grad_norm": 1.9942952394485474, + "learning_rate": 2.1940313256583826e-05, + "loss": 5.5587, + "step": 11360 + }, + { + "epoch": 0.56, + "grad_norm": 1.9131739139556885, + "learning_rate": 2.1930431345422205e-05, + "loss": 5.417, + "step": 11364 + }, + { + "epoch": 0.56, + "grad_norm": 1.7850251197814941, + "learning_rate": 2.1920549434260587e-05, + "loss": 5.5102, + "step": 11368 + }, + { + "epoch": 0.56, + "grad_norm": 2.221616744995117, + "learning_rate": 2.191066752309897e-05, + "loss": 5.5262, + "step": 11372 + }, + { + "epoch": 0.56, + "grad_norm": 1.910980463027954, + "learning_rate": 2.1900785611937348e-05, + "loss": 5.7343, + "step": 11376 + }, + { + "epoch": 0.56, + "grad_norm": 1.8210395574569702, + "learning_rate": 2.189090370077573e-05, + "loss": 5.5456, + "step": 11380 + }, + { + "epoch": 0.56, + "grad_norm": 1.9524825811386108, + "learning_rate": 2.1881021789614112e-05, + "loss": 5.5558, + "step": 11384 + }, + { + "epoch": 0.56, + "grad_norm": 1.5711989402770996, + "learning_rate": 2.1871139878452494e-05, + "loss": 5.4329, + "step": 11388 + }, + { + "epoch": 0.56, + "grad_norm": 2.0921053886413574, + "learning_rate": 2.1861257967290873e-05, + "loss": 5.6219, + "step": 11392 + }, + { + "epoch": 0.56, + "grad_norm": 2.04020357131958, + "learning_rate": 2.1851376056129255e-05, + "loss": 5.4476, + "step": 11396 + }, + { + "epoch": 0.56, + "grad_norm": 1.7985339164733887, + "learning_rate": 2.184149414496764e-05, + "loss": 5.5278, + "step": 11400 + }, + { + "epoch": 0.56, + "grad_norm": 1.8650219440460205, + "learning_rate": 2.183161223380602e-05, + "loss": 5.5657, + "step": 11404 + }, + { + "epoch": 0.56, + "grad_norm": 1.8160661458969116, + "learning_rate": 2.18217303226444e-05, + "loss": 5.525, + "step": 11408 + }, + { + "epoch": 0.56, + "grad_norm": 2.0409529209136963, + "learning_rate": 2.1811848411482783e-05, + "loss": 5.4098, + "step": 11412 + }, + { + "epoch": 0.56, + "grad_norm": 1.731351375579834, + "learning_rate": 2.1801966500321165e-05, + "loss": 5.588, + "step": 11416 + }, + { + "epoch": 0.56, + "grad_norm": 1.9625564813613892, + "learning_rate": 2.1792084589159544e-05, + "loss": 5.4872, + "step": 11420 + }, + { + "epoch": 0.56, + "grad_norm": 2.0399959087371826, + "learning_rate": 2.1782202677997926e-05, + "loss": 5.5627, + "step": 11424 + }, + { + "epoch": 0.56, + "grad_norm": 1.684001088142395, + "learning_rate": 2.1772320766836308e-05, + "loss": 5.6156, + "step": 11428 + }, + { + "epoch": 0.56, + "grad_norm": 1.7688792943954468, + "learning_rate": 2.176243885567469e-05, + "loss": 5.4949, + "step": 11432 + }, + { + "epoch": 0.57, + "grad_norm": 2.069566011428833, + "learning_rate": 2.175255694451307e-05, + "loss": 5.6452, + "step": 11436 + }, + { + "epoch": 0.57, + "grad_norm": 2.030831813812256, + "learning_rate": 2.174267503335145e-05, + "loss": 5.3957, + "step": 11440 + }, + { + "epoch": 0.57, + "grad_norm": 1.9303979873657227, + "learning_rate": 2.1732793122189833e-05, + "loss": 5.4431, + "step": 11444 + }, + { + "epoch": 0.57, + "grad_norm": 1.6101690530776978, + "learning_rate": 2.172291121102821e-05, + "loss": 5.5681, + "step": 11448 + }, + { + "epoch": 0.57, + "grad_norm": 1.9189858436584473, + "learning_rate": 2.1713029299866593e-05, + "loss": 5.4847, + "step": 11452 + }, + { + "epoch": 0.57, + "grad_norm": 2.09655499458313, + "learning_rate": 2.1703147388704976e-05, + "loss": 5.5774, + "step": 11456 + }, + { + "epoch": 0.57, + "grad_norm": 2.018876791000366, + "learning_rate": 2.1693265477543358e-05, + "loss": 5.614, + "step": 11460 + }, + { + "epoch": 0.57, + "grad_norm": 1.754034161567688, + "learning_rate": 2.168338356638174e-05, + "loss": 5.443, + "step": 11464 + }, + { + "epoch": 0.57, + "grad_norm": 1.950484275817871, + "learning_rate": 2.1673501655220122e-05, + "loss": 5.4982, + "step": 11468 + }, + { + "epoch": 0.57, + "grad_norm": 1.8960100412368774, + "learning_rate": 2.1663619744058504e-05, + "loss": 5.478, + "step": 11472 + }, + { + "epoch": 0.57, + "grad_norm": 1.7579559087753296, + "learning_rate": 2.1653737832896882e-05, + "loss": 5.5021, + "step": 11476 + }, + { + "epoch": 0.57, + "grad_norm": 2.0521066188812256, + "learning_rate": 2.1643855921735264e-05, + "loss": 5.5329, + "step": 11480 + }, + { + "epoch": 0.57, + "grad_norm": 1.999937891960144, + "learning_rate": 2.1633974010573647e-05, + "loss": 5.4377, + "step": 11484 + }, + { + "epoch": 0.57, + "grad_norm": 2.099349021911621, + "learning_rate": 2.162409209941203e-05, + "loss": 5.562, + "step": 11488 + }, + { + "epoch": 0.57, + "grad_norm": 2.0938682556152344, + "learning_rate": 2.1614210188250407e-05, + "loss": 5.577, + "step": 11492 + }, + { + "epoch": 0.57, + "grad_norm": 1.6756892204284668, + "learning_rate": 2.160432827708879e-05, + "loss": 5.6301, + "step": 11496 + }, + { + "epoch": 0.57, + "grad_norm": 1.7941629886627197, + "learning_rate": 2.159444636592717e-05, + "loss": 5.5144, + "step": 11500 + }, + { + "epoch": 0.57, + "grad_norm": 1.9828251600265503, + "learning_rate": 2.1584564454765553e-05, + "loss": 5.674, + "step": 11504 + }, + { + "epoch": 0.57, + "grad_norm": 1.8105112314224243, + "learning_rate": 2.1574682543603932e-05, + "loss": 5.5155, + "step": 11508 + }, + { + "epoch": 0.57, + "grad_norm": 1.7346547842025757, + "learning_rate": 2.1564800632442314e-05, + "loss": 5.6869, + "step": 11512 + }, + { + "epoch": 0.57, + "grad_norm": 1.7930268049240112, + "learning_rate": 2.15549187212807e-05, + "loss": 5.4631, + "step": 11516 + }, + { + "epoch": 0.57, + "grad_norm": 1.799404263496399, + "learning_rate": 2.154503681011908e-05, + "loss": 5.5324, + "step": 11520 + }, + { + "epoch": 0.57, + "grad_norm": 1.948190450668335, + "learning_rate": 2.153515489895746e-05, + "loss": 5.6236, + "step": 11524 + }, + { + "epoch": 0.57, + "grad_norm": 2.0511763095855713, + "learning_rate": 2.1525272987795842e-05, + "loss": 5.6085, + "step": 11528 + }, + { + "epoch": 0.57, + "grad_norm": 2.1785106658935547, + "learning_rate": 2.151539107663422e-05, + "loss": 5.4754, + "step": 11532 + }, + { + "epoch": 0.57, + "grad_norm": 1.9326204061508179, + "learning_rate": 2.1505509165472603e-05, + "loss": 5.5052, + "step": 11536 + }, + { + "epoch": 0.57, + "grad_norm": 1.9747314453125, + "learning_rate": 2.1495627254310985e-05, + "loss": 5.4398, + "step": 11540 + }, + { + "epoch": 0.57, + "grad_norm": 1.743662714958191, + "learning_rate": 2.1485745343149367e-05, + "loss": 5.4521, + "step": 11544 + }, + { + "epoch": 0.57, + "grad_norm": 1.823280930519104, + "learning_rate": 2.1475863431987746e-05, + "loss": 5.4957, + "step": 11548 + }, + { + "epoch": 0.57, + "grad_norm": 2.047112464904785, + "learning_rate": 2.1465981520826128e-05, + "loss": 5.584, + "step": 11552 + }, + { + "epoch": 0.57, + "grad_norm": 1.7637749910354614, + "learning_rate": 2.145609960966451e-05, + "loss": 5.6439, + "step": 11556 + }, + { + "epoch": 0.57, + "grad_norm": 1.7925045490264893, + "learning_rate": 2.1446217698502892e-05, + "loss": 5.4745, + "step": 11560 + }, + { + "epoch": 0.57, + "grad_norm": 1.8895896673202515, + "learning_rate": 2.143633578734127e-05, + "loss": 5.4082, + "step": 11564 + }, + { + "epoch": 0.57, + "grad_norm": 1.956284761428833, + "learning_rate": 2.1426453876179653e-05, + "loss": 5.3854, + "step": 11568 + }, + { + "epoch": 0.57, + "grad_norm": 1.9955167770385742, + "learning_rate": 2.141657196501804e-05, + "loss": 5.4486, + "step": 11572 + }, + { + "epoch": 0.57, + "grad_norm": 2.0636212825775146, + "learning_rate": 2.1406690053856417e-05, + "loss": 5.6201, + "step": 11576 + }, + { + "epoch": 0.57, + "grad_norm": 2.0052077770233154, + "learning_rate": 2.13968081426948e-05, + "loss": 5.5525, + "step": 11580 + }, + { + "epoch": 0.57, + "grad_norm": 1.9207104444503784, + "learning_rate": 2.138692623153318e-05, + "loss": 5.6011, + "step": 11584 + }, + { + "epoch": 0.57, + "grad_norm": 1.9215471744537354, + "learning_rate": 2.137704432037156e-05, + "loss": 5.556, + "step": 11588 + }, + { + "epoch": 0.57, + "grad_norm": 1.6822532415390015, + "learning_rate": 2.1367162409209942e-05, + "loss": 5.521, + "step": 11592 + }, + { + "epoch": 0.57, + "grad_norm": 2.1839683055877686, + "learning_rate": 2.1357280498048324e-05, + "loss": 5.6367, + "step": 11596 + }, + { + "epoch": 0.57, + "grad_norm": 1.855774998664856, + "learning_rate": 2.1347398586886706e-05, + "loss": 5.518, + "step": 11600 + }, + { + "epoch": 0.57, + "grad_norm": 2.0112996101379395, + "learning_rate": 2.1337516675725085e-05, + "loss": 5.6125, + "step": 11604 + }, + { + "epoch": 0.57, + "grad_norm": 2.0093090534210205, + "learning_rate": 2.1327634764563467e-05, + "loss": 5.6358, + "step": 11608 + }, + { + "epoch": 0.57, + "grad_norm": 1.6822510957717896, + "learning_rate": 2.131775285340185e-05, + "loss": 5.5177, + "step": 11612 + }, + { + "epoch": 0.57, + "grad_norm": 1.9259730577468872, + "learning_rate": 2.130787094224023e-05, + "loss": 5.5029, + "step": 11616 + }, + { + "epoch": 0.57, + "grad_norm": 1.9150243997573853, + "learning_rate": 2.129798903107861e-05, + "loss": 5.5636, + "step": 11620 + }, + { + "epoch": 0.57, + "grad_norm": 1.8998494148254395, + "learning_rate": 2.128810711991699e-05, + "loss": 5.5628, + "step": 11624 + }, + { + "epoch": 0.57, + "grad_norm": 1.9951376914978027, + "learning_rate": 2.1278225208755374e-05, + "loss": 5.5671, + "step": 11628 + }, + { + "epoch": 0.57, + "grad_norm": 1.7849557399749756, + "learning_rate": 2.1268343297593756e-05, + "loss": 5.5127, + "step": 11632 + }, + { + "epoch": 0.57, + "grad_norm": 2.0039825439453125, + "learning_rate": 2.1258461386432138e-05, + "loss": 5.5114, + "step": 11636 + }, + { + "epoch": 0.58, + "grad_norm": 2.007443904876709, + "learning_rate": 2.124857947527052e-05, + "loss": 5.5814, + "step": 11640 + }, + { + "epoch": 0.58, + "grad_norm": 1.7764946222305298, + "learning_rate": 2.1238697564108902e-05, + "loss": 5.5585, + "step": 11644 + }, + { + "epoch": 0.58, + "grad_norm": 2.218045473098755, + "learning_rate": 2.122881565294728e-05, + "loss": 5.4461, + "step": 11648 + }, + { + "epoch": 0.58, + "grad_norm": 2.088454484939575, + "learning_rate": 2.1218933741785663e-05, + "loss": 5.492, + "step": 11652 + }, + { + "epoch": 0.58, + "grad_norm": 1.8053348064422607, + "learning_rate": 2.1209051830624045e-05, + "loss": 5.5221, + "step": 11656 + }, + { + "epoch": 0.58, + "grad_norm": 2.035414218902588, + "learning_rate": 2.1199169919462423e-05, + "loss": 5.6862, + "step": 11660 + }, + { + "epoch": 0.58, + "grad_norm": 2.0980472564697266, + "learning_rate": 2.1189288008300805e-05, + "loss": 5.6149, + "step": 11664 + }, + { + "epoch": 0.58, + "grad_norm": 2.033268690109253, + "learning_rate": 2.1179406097139187e-05, + "loss": 5.4371, + "step": 11668 + }, + { + "epoch": 0.58, + "grad_norm": 1.8143877983093262, + "learning_rate": 2.116952418597757e-05, + "loss": 5.5271, + "step": 11672 + }, + { + "epoch": 0.58, + "grad_norm": 1.7776703834533691, + "learning_rate": 2.1159642274815948e-05, + "loss": 5.6312, + "step": 11676 + }, + { + "epoch": 0.58, + "grad_norm": 1.8278911113739014, + "learning_rate": 2.114976036365433e-05, + "loss": 5.5128, + "step": 11680 + }, + { + "epoch": 0.58, + "grad_norm": 1.8950855731964111, + "learning_rate": 2.1139878452492712e-05, + "loss": 5.4134, + "step": 11684 + }, + { + "epoch": 0.58, + "grad_norm": 1.828503966331482, + "learning_rate": 2.1129996541331094e-05, + "loss": 5.5917, + "step": 11688 + }, + { + "epoch": 0.58, + "grad_norm": 1.9912033081054688, + "learning_rate": 2.1120114630169476e-05, + "loss": 5.4005, + "step": 11692 + }, + { + "epoch": 0.58, + "grad_norm": 1.871800422668457, + "learning_rate": 2.111023271900786e-05, + "loss": 5.5202, + "step": 11696 + }, + { + "epoch": 0.58, + "grad_norm": 1.8771202564239502, + "learning_rate": 2.110035080784624e-05, + "loss": 5.5294, + "step": 11700 + }, + { + "epoch": 0.58, + "grad_norm": 1.8930245637893677, + "learning_rate": 2.109046889668462e-05, + "loss": 5.482, + "step": 11704 + }, + { + "epoch": 0.58, + "grad_norm": 1.910056471824646, + "learning_rate": 2.1080586985523e-05, + "loss": 5.625, + "step": 11708 + }, + { + "epoch": 0.58, + "grad_norm": 1.7149245738983154, + "learning_rate": 2.1070705074361383e-05, + "loss": 5.494, + "step": 11712 + }, + { + "epoch": 0.58, + "grad_norm": 1.8306865692138672, + "learning_rate": 2.1060823163199765e-05, + "loss": 5.5143, + "step": 11716 + }, + { + "epoch": 0.58, + "grad_norm": 1.9364176988601685, + "learning_rate": 2.1050941252038144e-05, + "loss": 5.5469, + "step": 11720 + }, + { + "epoch": 0.58, + "grad_norm": 1.9254311323165894, + "learning_rate": 2.1041059340876526e-05, + "loss": 5.6064, + "step": 11724 + }, + { + "epoch": 0.58, + "grad_norm": 1.8167616128921509, + "learning_rate": 2.1031177429714908e-05, + "loss": 5.6055, + "step": 11728 + }, + { + "epoch": 0.58, + "grad_norm": 1.9327070713043213, + "learning_rate": 2.1021295518553287e-05, + "loss": 5.6255, + "step": 11732 + }, + { + "epoch": 0.58, + "grad_norm": 2.0709402561187744, + "learning_rate": 2.101141360739167e-05, + "loss": 5.5823, + "step": 11736 + }, + { + "epoch": 0.58, + "grad_norm": 1.6243914365768433, + "learning_rate": 2.100153169623005e-05, + "loss": 5.5357, + "step": 11740 + }, + { + "epoch": 0.58, + "grad_norm": 2.0544323921203613, + "learning_rate": 2.0991649785068433e-05, + "loss": 5.4751, + "step": 11744 + }, + { + "epoch": 0.58, + "grad_norm": 1.8204375505447388, + "learning_rate": 2.0981767873906815e-05, + "loss": 5.4421, + "step": 11748 + }, + { + "epoch": 0.58, + "grad_norm": 1.8007164001464844, + "learning_rate": 2.0971885962745197e-05, + "loss": 5.5578, + "step": 11752 + }, + { + "epoch": 0.58, + "grad_norm": 2.0843026638031006, + "learning_rate": 2.096200405158358e-05, + "loss": 5.5082, + "step": 11756 + }, + { + "epoch": 0.58, + "grad_norm": 1.7756987810134888, + "learning_rate": 2.0952122140421958e-05, + "loss": 5.4814, + "step": 11760 + }, + { + "epoch": 0.58, + "grad_norm": 1.9955602884292603, + "learning_rate": 2.094224022926034e-05, + "loss": 5.6497, + "step": 11764 + }, + { + "epoch": 0.58, + "grad_norm": 2.000737190246582, + "learning_rate": 2.0932358318098722e-05, + "loss": 5.5241, + "step": 11768 + }, + { + "epoch": 0.58, + "grad_norm": 1.7721540927886963, + "learning_rate": 2.0922476406937104e-05, + "loss": 5.4961, + "step": 11772 + }, + { + "epoch": 0.58, + "grad_norm": 1.9845120906829834, + "learning_rate": 2.0912594495775483e-05, + "loss": 5.6158, + "step": 11776 + }, + { + "epoch": 0.58, + "grad_norm": 1.7041538953781128, + "learning_rate": 2.0902712584613865e-05, + "loss": 5.4892, + "step": 11780 + }, + { + "epoch": 0.58, + "grad_norm": 1.8320084810256958, + "learning_rate": 2.0892830673452247e-05, + "loss": 5.6435, + "step": 11784 + }, + { + "epoch": 0.58, + "grad_norm": 1.7904802560806274, + "learning_rate": 2.088294876229063e-05, + "loss": 5.4774, + "step": 11788 + }, + { + "epoch": 0.58, + "grad_norm": 1.894283413887024, + "learning_rate": 2.0873066851129008e-05, + "loss": 5.4308, + "step": 11792 + }, + { + "epoch": 0.58, + "grad_norm": 1.8176664113998413, + "learning_rate": 2.086318493996739e-05, + "loss": 5.4719, + "step": 11796 + }, + { + "epoch": 0.58, + "grad_norm": 1.9829813241958618, + "learning_rate": 2.0853303028805772e-05, + "loss": 5.4217, + "step": 11800 + }, + { + "epoch": 0.58, + "grad_norm": 1.8619272708892822, + "learning_rate": 2.0843421117644154e-05, + "loss": 5.5205, + "step": 11804 + }, + { + "epoch": 0.58, + "grad_norm": 1.8735342025756836, + "learning_rate": 2.0833539206482536e-05, + "loss": 5.5359, + "step": 11808 + }, + { + "epoch": 0.58, + "grad_norm": 1.860768437385559, + "learning_rate": 2.0823657295320918e-05, + "loss": 5.5111, + "step": 11812 + }, + { + "epoch": 0.58, + "grad_norm": 2.0333621501922607, + "learning_rate": 2.0813775384159297e-05, + "loss": 5.5294, + "step": 11816 + }, + { + "epoch": 0.58, + "grad_norm": 1.9842480421066284, + "learning_rate": 2.080389347299768e-05, + "loss": 5.521, + "step": 11820 + }, + { + "epoch": 0.58, + "grad_norm": 1.7381541728973389, + "learning_rate": 2.079401156183606e-05, + "loss": 5.5587, + "step": 11824 + }, + { + "epoch": 0.58, + "grad_norm": 1.6994056701660156, + "learning_rate": 2.0784129650674443e-05, + "loss": 5.5389, + "step": 11828 + }, + { + "epoch": 0.58, + "grad_norm": 1.7459512948989868, + "learning_rate": 2.077424773951282e-05, + "loss": 5.6567, + "step": 11832 + }, + { + "epoch": 0.58, + "grad_norm": 1.9870339632034302, + "learning_rate": 2.0764365828351203e-05, + "loss": 5.5713, + "step": 11836 + }, + { + "epoch": 0.59, + "grad_norm": 2.1633095741271973, + "learning_rate": 2.0754483917189586e-05, + "loss": 5.4663, + "step": 11840 + }, + { + "epoch": 0.59, + "grad_norm": 1.8806403875350952, + "learning_rate": 2.0744602006027968e-05, + "loss": 5.489, + "step": 11844 + }, + { + "epoch": 0.59, + "grad_norm": 2.007995367050171, + "learning_rate": 2.0734720094866346e-05, + "loss": 5.669, + "step": 11848 + }, + { + "epoch": 0.59, + "grad_norm": 2.024402141571045, + "learning_rate": 2.072483818370473e-05, + "loss": 5.5401, + "step": 11852 + }, + { + "epoch": 0.59, + "grad_norm": 1.9132201671600342, + "learning_rate": 2.071495627254311e-05, + "loss": 5.6524, + "step": 11856 + }, + { + "epoch": 0.59, + "grad_norm": 2.0311427116394043, + "learning_rate": 2.0705074361381492e-05, + "loss": 5.5418, + "step": 11860 + }, + { + "epoch": 0.59, + "grad_norm": 2.0080606937408447, + "learning_rate": 2.0695192450219875e-05, + "loss": 5.5101, + "step": 11864 + }, + { + "epoch": 0.59, + "grad_norm": 1.698283076286316, + "learning_rate": 2.0685310539058257e-05, + "loss": 5.6118, + "step": 11868 + }, + { + "epoch": 0.59, + "grad_norm": 1.9465456008911133, + "learning_rate": 2.0675428627896635e-05, + "loss": 5.545, + "step": 11872 + }, + { + "epoch": 0.59, + "grad_norm": 1.7435317039489746, + "learning_rate": 2.0665546716735017e-05, + "loss": 5.4387, + "step": 11876 + }, + { + "epoch": 0.59, + "grad_norm": 2.106904983520508, + "learning_rate": 2.06556648055734e-05, + "loss": 5.6137, + "step": 11880 + }, + { + "epoch": 0.59, + "grad_norm": 2.1040077209472656, + "learning_rate": 2.064578289441178e-05, + "loss": 5.5537, + "step": 11884 + }, + { + "epoch": 0.59, + "grad_norm": 1.9074550867080688, + "learning_rate": 2.063590098325016e-05, + "loss": 5.5837, + "step": 11888 + }, + { + "epoch": 0.59, + "grad_norm": 2.0350706577301025, + "learning_rate": 2.0626019072088542e-05, + "loss": 5.6268, + "step": 11892 + }, + { + "epoch": 0.59, + "grad_norm": 2.2680063247680664, + "learning_rate": 2.0616137160926924e-05, + "loss": 5.4717, + "step": 11896 + }, + { + "epoch": 0.59, + "grad_norm": 1.8204529285430908, + "learning_rate": 2.0606255249765306e-05, + "loss": 5.5205, + "step": 11900 + }, + { + "epoch": 0.59, + "grad_norm": 2.01955246925354, + "learning_rate": 2.0596373338603685e-05, + "loss": 5.4586, + "step": 11904 + }, + { + "epoch": 0.59, + "grad_norm": 2.047470808029175, + "learning_rate": 2.0586491427442067e-05, + "loss": 5.5496, + "step": 11908 + }, + { + "epoch": 0.59, + "grad_norm": 1.9742119312286377, + "learning_rate": 2.057660951628045e-05, + "loss": 5.5521, + "step": 11912 + }, + { + "epoch": 0.59, + "grad_norm": 1.9781845808029175, + "learning_rate": 2.056672760511883e-05, + "loss": 5.4699, + "step": 11916 + }, + { + "epoch": 0.59, + "grad_norm": 1.7147916555404663, + "learning_rate": 2.0556845693957213e-05, + "loss": 5.5772, + "step": 11920 + }, + { + "epoch": 0.59, + "grad_norm": 1.8772165775299072, + "learning_rate": 2.0546963782795595e-05, + "loss": 5.4976, + "step": 11924 + }, + { + "epoch": 0.59, + "grad_norm": 1.9535537958145142, + "learning_rate": 2.0537081871633977e-05, + "loss": 5.4462, + "step": 11928 + }, + { + "epoch": 0.59, + "grad_norm": 1.8837189674377441, + "learning_rate": 2.0527199960472356e-05, + "loss": 5.5724, + "step": 11932 + }, + { + "epoch": 0.59, + "grad_norm": 1.9887135028839111, + "learning_rate": 2.0517318049310738e-05, + "loss": 5.6435, + "step": 11936 + }, + { + "epoch": 0.59, + "grad_norm": 1.9106159210205078, + "learning_rate": 2.050743613814912e-05, + "loss": 5.6322, + "step": 11940 + }, + { + "epoch": 0.59, + "grad_norm": 1.9000262022018433, + "learning_rate": 2.04975542269875e-05, + "loss": 5.5926, + "step": 11944 + }, + { + "epoch": 0.59, + "grad_norm": 1.8226341009140015, + "learning_rate": 2.048767231582588e-05, + "loss": 5.5135, + "step": 11948 + }, + { + "epoch": 0.59, + "grad_norm": 1.8902435302734375, + "learning_rate": 2.0477790404664263e-05, + "loss": 5.4756, + "step": 11952 + }, + { + "epoch": 0.59, + "grad_norm": 2.058504819869995, + "learning_rate": 2.0467908493502645e-05, + "loss": 5.6203, + "step": 11956 + }, + { + "epoch": 0.59, + "grad_norm": 1.977247714996338, + "learning_rate": 2.0458026582341024e-05, + "loss": 5.5297, + "step": 11960 + }, + { + "epoch": 0.59, + "grad_norm": 1.6700700521469116, + "learning_rate": 2.0448144671179406e-05, + "loss": 5.5585, + "step": 11964 + }, + { + "epoch": 0.59, + "grad_norm": 1.7290418148040771, + "learning_rate": 2.0438262760017788e-05, + "loss": 5.498, + "step": 11968 + }, + { + "epoch": 0.59, + "grad_norm": 1.7892743349075317, + "learning_rate": 2.042838084885617e-05, + "loss": 5.4366, + "step": 11972 + }, + { + "epoch": 0.59, + "grad_norm": 2.1159372329711914, + "learning_rate": 2.0418498937694552e-05, + "loss": 5.5055, + "step": 11976 + }, + { + "epoch": 0.59, + "grad_norm": 2.0965609550476074, + "learning_rate": 2.0408617026532934e-05, + "loss": 5.6062, + "step": 11980 + }, + { + "epoch": 0.59, + "grad_norm": 1.8796151876449585, + "learning_rate": 2.0398735115371316e-05, + "loss": 5.5831, + "step": 11984 + }, + { + "epoch": 0.59, + "grad_norm": 1.8930494785308838, + "learning_rate": 2.0388853204209695e-05, + "loss": 5.5128, + "step": 11988 + }, + { + "epoch": 0.59, + "grad_norm": 2.1627285480499268, + "learning_rate": 2.0378971293048077e-05, + "loss": 5.5642, + "step": 11992 + }, + { + "epoch": 0.59, + "grad_norm": 1.7648162841796875, + "learning_rate": 2.036908938188646e-05, + "loss": 5.4877, + "step": 11996 + }, + { + "epoch": 0.59, + "grad_norm": 2.2223527431488037, + "learning_rate": 2.035920747072484e-05, + "loss": 5.5078, + "step": 12000 + }, + { + "epoch": 0.59, + "grad_norm": 2.301929473876953, + "learning_rate": 2.034932555956322e-05, + "loss": 5.5219, + "step": 12004 + }, + { + "epoch": 0.59, + "grad_norm": 1.9763400554656982, + "learning_rate": 2.03394436484016e-05, + "loss": 5.5175, + "step": 12008 + }, + { + "epoch": 0.59, + "grad_norm": 2.0059380531311035, + "learning_rate": 2.0329561737239984e-05, + "loss": 5.5533, + "step": 12012 + }, + { + "epoch": 0.59, + "grad_norm": 2.3033435344696045, + "learning_rate": 2.0319679826078362e-05, + "loss": 5.5641, + "step": 12016 + }, + { + "epoch": 0.59, + "grad_norm": 1.9601203203201294, + "learning_rate": 2.0309797914916744e-05, + "loss": 5.4841, + "step": 12020 + }, + { + "epoch": 0.59, + "grad_norm": 1.893579363822937, + "learning_rate": 2.0299916003755126e-05, + "loss": 5.418, + "step": 12024 + }, + { + "epoch": 0.59, + "grad_norm": 1.857035517692566, + "learning_rate": 2.029003409259351e-05, + "loss": 5.5092, + "step": 12028 + }, + { + "epoch": 0.59, + "grad_norm": 2.068701982498169, + "learning_rate": 2.028015218143189e-05, + "loss": 5.4992, + "step": 12032 + }, + { + "epoch": 0.59, + "grad_norm": 1.8823144435882568, + "learning_rate": 2.0270270270270273e-05, + "loss": 5.4709, + "step": 12036 + }, + { + "epoch": 0.59, + "grad_norm": 1.873325228691101, + "learning_rate": 2.0260388359108655e-05, + "loss": 5.4815, + "step": 12040 + }, + { + "epoch": 0.6, + "grad_norm": 1.88863205909729, + "learning_rate": 2.0250506447947033e-05, + "loss": 5.6217, + "step": 12044 + }, + { + "epoch": 0.6, + "grad_norm": 1.813723087310791, + "learning_rate": 2.0240624536785415e-05, + "loss": 5.5736, + "step": 12048 + }, + { + "epoch": 0.6, + "grad_norm": 1.8674362897872925, + "learning_rate": 2.0230742625623797e-05, + "loss": 5.4345, + "step": 12052 + }, + { + "epoch": 0.6, + "grad_norm": 1.840463399887085, + "learning_rate": 2.022086071446218e-05, + "loss": 5.4454, + "step": 12056 + }, + { + "epoch": 0.6, + "grad_norm": 1.8874510526657104, + "learning_rate": 2.0210978803300558e-05, + "loss": 5.4834, + "step": 12060 + }, + { + "epoch": 0.6, + "grad_norm": 1.8396440744400024, + "learning_rate": 2.020109689213894e-05, + "loss": 5.5029, + "step": 12064 + }, + { + "epoch": 0.6, + "grad_norm": 1.950766921043396, + "learning_rate": 2.0191214980977322e-05, + "loss": 5.4637, + "step": 12068 + }, + { + "epoch": 0.6, + "grad_norm": 2.1061501502990723, + "learning_rate": 2.0181333069815704e-05, + "loss": 5.4081, + "step": 12072 + }, + { + "epoch": 0.6, + "grad_norm": 1.9160481691360474, + "learning_rate": 2.0171451158654083e-05, + "loss": 5.484, + "step": 12076 + }, + { + "epoch": 0.6, + "grad_norm": 2.151904344558716, + "learning_rate": 2.0161569247492465e-05, + "loss": 5.5189, + "step": 12080 + }, + { + "epoch": 0.6, + "grad_norm": 1.8847110271453857, + "learning_rate": 2.0151687336330847e-05, + "loss": 5.4264, + "step": 12084 + }, + { + "epoch": 0.6, + "grad_norm": 1.914305329322815, + "learning_rate": 2.014180542516923e-05, + "loss": 5.551, + "step": 12088 + }, + { + "epoch": 0.6, + "grad_norm": 2.034773111343384, + "learning_rate": 2.013192351400761e-05, + "loss": 5.5728, + "step": 12092 + }, + { + "epoch": 0.6, + "grad_norm": 1.7983940839767456, + "learning_rate": 2.0122041602845993e-05, + "loss": 5.4991, + "step": 12096 + }, + { + "epoch": 0.6, + "grad_norm": 2.019416570663452, + "learning_rate": 2.0112159691684372e-05, + "loss": 5.6503, + "step": 12100 + }, + { + "epoch": 0.6, + "grad_norm": 1.9403501749038696, + "learning_rate": 2.0102277780522754e-05, + "loss": 5.5295, + "step": 12104 + }, + { + "epoch": 0.6, + "grad_norm": 2.006972074508667, + "learning_rate": 2.0092395869361136e-05, + "loss": 5.4612, + "step": 12108 + }, + { + "epoch": 0.6, + "grad_norm": 2.0223989486694336, + "learning_rate": 2.0082513958199518e-05, + "loss": 5.4774, + "step": 12112 + }, + { + "epoch": 0.6, + "grad_norm": 1.7617640495300293, + "learning_rate": 2.0072632047037897e-05, + "loss": 5.5604, + "step": 12116 + }, + { + "epoch": 0.6, + "grad_norm": 1.961089849472046, + "learning_rate": 2.006275013587628e-05, + "loss": 5.4945, + "step": 12120 + }, + { + "epoch": 0.6, + "grad_norm": 2.054691791534424, + "learning_rate": 2.005286822471466e-05, + "loss": 5.4596, + "step": 12124 + }, + { + "epoch": 0.6, + "grad_norm": 1.8954715728759766, + "learning_rate": 2.0042986313553043e-05, + "loss": 5.5597, + "step": 12128 + }, + { + "epoch": 0.6, + "grad_norm": 1.9412333965301514, + "learning_rate": 2.0033104402391422e-05, + "loss": 5.5437, + "step": 12132 + }, + { + "epoch": 0.6, + "grad_norm": 1.7679616212844849, + "learning_rate": 2.0023222491229804e-05, + "loss": 5.3381, + "step": 12136 + }, + { + "epoch": 0.6, + "grad_norm": 1.884602665901184, + "learning_rate": 2.0013340580068186e-05, + "loss": 5.5533, + "step": 12140 + }, + { + "epoch": 0.6, + "grad_norm": 2.0963213443756104, + "learning_rate": 2.0003458668906565e-05, + "loss": 5.5015, + "step": 12144 + }, + { + "epoch": 0.6, + "grad_norm": 1.796938180923462, + "learning_rate": 1.999357675774495e-05, + "loss": 5.4519, + "step": 12148 + }, + { + "epoch": 0.6, + "grad_norm": 2.2093305587768555, + "learning_rate": 1.9983694846583332e-05, + "loss": 5.6164, + "step": 12152 + }, + { + "epoch": 0.6, + "grad_norm": 1.8350774049758911, + "learning_rate": 1.997381293542171e-05, + "loss": 5.583, + "step": 12156 + }, + { + "epoch": 0.6, + "grad_norm": 1.8653216361999512, + "learning_rate": 1.9963931024260093e-05, + "loss": 5.656, + "step": 12160 + }, + { + "epoch": 0.6, + "grad_norm": 1.7206474542617798, + "learning_rate": 1.9954049113098475e-05, + "loss": 5.5226, + "step": 12164 + }, + { + "epoch": 0.6, + "grad_norm": 1.8878382444381714, + "learning_rate": 1.9944167201936857e-05, + "loss": 5.6363, + "step": 12168 + }, + { + "epoch": 0.6, + "grad_norm": 2.0668506622314453, + "learning_rate": 1.9934285290775236e-05, + "loss": 5.5106, + "step": 12172 + }, + { + "epoch": 0.6, + "grad_norm": 2.0027477741241455, + "learning_rate": 1.9924403379613618e-05, + "loss": 5.5217, + "step": 12176 + }, + { + "epoch": 0.6, + "grad_norm": 2.1123950481414795, + "learning_rate": 1.9914521468452e-05, + "loss": 5.6275, + "step": 12180 + }, + { + "epoch": 0.6, + "grad_norm": 2.1055514812469482, + "learning_rate": 1.9904639557290382e-05, + "loss": 5.6186, + "step": 12184 + }, + { + "epoch": 0.6, + "grad_norm": 2.0140175819396973, + "learning_rate": 1.989475764612876e-05, + "loss": 5.5965, + "step": 12188 + }, + { + "epoch": 0.6, + "grad_norm": 1.902254343032837, + "learning_rate": 1.9884875734967142e-05, + "loss": 5.5557, + "step": 12192 + }, + { + "epoch": 0.6, + "grad_norm": 1.9510375261306763, + "learning_rate": 1.9874993823805525e-05, + "loss": 5.4804, + "step": 12196 + }, + { + "epoch": 0.6, + "grad_norm": 1.950716495513916, + "learning_rate": 1.9865111912643907e-05, + "loss": 5.5887, + "step": 12200 + }, + { + "epoch": 0.6, + "grad_norm": 2.0936696529388428, + "learning_rate": 1.985523000148229e-05, + "loss": 5.5463, + "step": 12204 + }, + { + "epoch": 0.6, + "grad_norm": 1.6361807584762573, + "learning_rate": 1.984534809032067e-05, + "loss": 5.5516, + "step": 12208 + }, + { + "epoch": 0.6, + "grad_norm": 2.1499600410461426, + "learning_rate": 1.9835466179159053e-05, + "loss": 5.5467, + "step": 12212 + }, + { + "epoch": 0.6, + "grad_norm": 1.9551507234573364, + "learning_rate": 1.982558426799743e-05, + "loss": 5.549, + "step": 12216 + }, + { + "epoch": 0.6, + "grad_norm": 1.9426565170288086, + "learning_rate": 1.9815702356835814e-05, + "loss": 5.5726, + "step": 12220 + }, + { + "epoch": 0.6, + "grad_norm": 1.9303926229476929, + "learning_rate": 1.9805820445674196e-05, + "loss": 5.5571, + "step": 12224 + }, + { + "epoch": 0.6, + "grad_norm": 1.859390139579773, + "learning_rate": 1.9795938534512574e-05, + "loss": 5.499, + "step": 12228 + }, + { + "epoch": 0.6, + "grad_norm": 1.8791084289550781, + "learning_rate": 1.9786056623350956e-05, + "loss": 5.5617, + "step": 12232 + }, + { + "epoch": 0.6, + "grad_norm": 2.1466195583343506, + "learning_rate": 1.977617471218934e-05, + "loss": 5.3845, + "step": 12236 + }, + { + "epoch": 0.6, + "grad_norm": 1.7159942388534546, + "learning_rate": 1.976629280102772e-05, + "loss": 5.551, + "step": 12240 + }, + { + "epoch": 0.6, + "grad_norm": 1.8822176456451416, + "learning_rate": 1.97564108898661e-05, + "loss": 5.4277, + "step": 12244 + }, + { + "epoch": 0.61, + "grad_norm": 1.798642635345459, + "learning_rate": 1.974652897870448e-05, + "loss": 5.6463, + "step": 12248 + }, + { + "epoch": 0.61, + "grad_norm": 1.9936386346817017, + "learning_rate": 1.9736647067542863e-05, + "loss": 5.6342, + "step": 12252 + }, + { + "epoch": 0.61, + "grad_norm": 2.1285390853881836, + "learning_rate": 1.9726765156381245e-05, + "loss": 5.5005, + "step": 12256 + }, + { + "epoch": 0.61, + "grad_norm": 1.88554048538208, + "learning_rate": 1.9716883245219624e-05, + "loss": 5.5671, + "step": 12260 + }, + { + "epoch": 0.61, + "grad_norm": 2.1367435455322266, + "learning_rate": 1.970700133405801e-05, + "loss": 5.4724, + "step": 12264 + }, + { + "epoch": 0.61, + "grad_norm": 1.9615224599838257, + "learning_rate": 1.969711942289639e-05, + "loss": 5.4797, + "step": 12268 + }, + { + "epoch": 0.61, + "grad_norm": 2.0672008991241455, + "learning_rate": 1.968723751173477e-05, + "loss": 5.4903, + "step": 12272 + }, + { + "epoch": 0.61, + "grad_norm": 1.8822599649429321, + "learning_rate": 1.9677355600573152e-05, + "loss": 5.4196, + "step": 12276 + }, + { + "epoch": 0.61, + "grad_norm": 1.9987417459487915, + "learning_rate": 1.9667473689411534e-05, + "loss": 5.5575, + "step": 12280 + }, + { + "epoch": 0.61, + "grad_norm": 1.8995615243911743, + "learning_rate": 1.9657591778249916e-05, + "loss": 5.5343, + "step": 12284 + }, + { + "epoch": 0.61, + "grad_norm": 1.9003994464874268, + "learning_rate": 1.9647709867088295e-05, + "loss": 5.5341, + "step": 12288 + }, + { + "epoch": 0.61, + "grad_norm": 1.9870491027832031, + "learning_rate": 1.9637827955926677e-05, + "loss": 5.5928, + "step": 12292 + }, + { + "epoch": 0.61, + "grad_norm": 2.072319269180298, + "learning_rate": 1.962794604476506e-05, + "loss": 5.4821, + "step": 12296 + }, + { + "epoch": 0.61, + "grad_norm": 1.9005805253982544, + "learning_rate": 1.9618064133603438e-05, + "loss": 5.5296, + "step": 12300 + }, + { + "epoch": 0.61, + "grad_norm": 1.8590092658996582, + "learning_rate": 1.960818222244182e-05, + "loss": 5.568, + "step": 12304 + }, + { + "epoch": 0.61, + "grad_norm": 2.030334949493408, + "learning_rate": 1.9598300311280202e-05, + "loss": 5.4239, + "step": 12308 + }, + { + "epoch": 0.61, + "grad_norm": 1.9241563081741333, + "learning_rate": 1.9588418400118584e-05, + "loss": 5.5618, + "step": 12312 + }, + { + "epoch": 0.61, + "grad_norm": 1.9295763969421387, + "learning_rate": 1.9578536488956963e-05, + "loss": 5.6011, + "step": 12316 + }, + { + "epoch": 0.61, + "grad_norm": 1.9351352453231812, + "learning_rate": 1.9568654577795348e-05, + "loss": 5.5549, + "step": 12320 + }, + { + "epoch": 0.61, + "grad_norm": 2.0557003021240234, + "learning_rate": 1.955877266663373e-05, + "loss": 5.531, + "step": 12324 + }, + { + "epoch": 0.61, + "grad_norm": 2.0774848461151123, + "learning_rate": 1.954889075547211e-05, + "loss": 5.5031, + "step": 12328 + }, + { + "epoch": 0.61, + "grad_norm": 1.9636493921279907, + "learning_rate": 1.953900884431049e-05, + "loss": 5.5733, + "step": 12332 + }, + { + "epoch": 0.61, + "grad_norm": 2.006387710571289, + "learning_rate": 1.9529126933148873e-05, + "loss": 5.5184, + "step": 12336 + }, + { + "epoch": 0.61, + "grad_norm": 1.8912975788116455, + "learning_rate": 1.9519245021987255e-05, + "loss": 5.4529, + "step": 12340 + }, + { + "epoch": 0.61, + "grad_norm": 2.028090715408325, + "learning_rate": 1.9509363110825634e-05, + "loss": 5.4242, + "step": 12344 + }, + { + "epoch": 0.61, + "grad_norm": 2.042482376098633, + "learning_rate": 1.9499481199664016e-05, + "loss": 5.5455, + "step": 12348 + }, + { + "epoch": 0.61, + "grad_norm": 2.1823337078094482, + "learning_rate": 1.9489599288502398e-05, + "loss": 5.4165, + "step": 12352 + }, + { + "epoch": 0.61, + "grad_norm": 1.8578866720199585, + "learning_rate": 1.947971737734078e-05, + "loss": 5.4873, + "step": 12356 + }, + { + "epoch": 0.61, + "grad_norm": 1.964311957359314, + "learning_rate": 1.946983546617916e-05, + "loss": 5.5457, + "step": 12360 + }, + { + "epoch": 0.61, + "grad_norm": 2.030364513397217, + "learning_rate": 1.945995355501754e-05, + "loss": 5.474, + "step": 12364 + }, + { + "epoch": 0.61, + "grad_norm": 2.0638349056243896, + "learning_rate": 1.9450071643855923e-05, + "loss": 5.5748, + "step": 12368 + }, + { + "epoch": 0.61, + "grad_norm": 2.002610445022583, + "learning_rate": 1.94401897326943e-05, + "loss": 5.546, + "step": 12372 + }, + { + "epoch": 0.61, + "grad_norm": 1.8753662109375, + "learning_rate": 1.9430307821532687e-05, + "loss": 5.5053, + "step": 12376 + }, + { + "epoch": 0.61, + "grad_norm": 2.1950864791870117, + "learning_rate": 1.942042591037107e-05, + "loss": 5.4716, + "step": 12380 + }, + { + "epoch": 0.61, + "grad_norm": 2.067065715789795, + "learning_rate": 1.9410543999209447e-05, + "loss": 5.5028, + "step": 12384 + }, + { + "epoch": 0.61, + "grad_norm": 2.0129029750823975, + "learning_rate": 1.940066208804783e-05, + "loss": 5.522, + "step": 12388 + }, + { + "epoch": 0.61, + "grad_norm": 1.5361533164978027, + "learning_rate": 1.939078017688621e-05, + "loss": 5.4896, + "step": 12392 + }, + { + "epoch": 0.61, + "grad_norm": 1.945295810699463, + "learning_rate": 1.9380898265724594e-05, + "loss": 5.5306, + "step": 12396 + }, + { + "epoch": 0.61, + "grad_norm": 1.7914199829101562, + "learning_rate": 1.9371016354562972e-05, + "loss": 5.5514, + "step": 12400 + }, + { + "epoch": 0.61, + "grad_norm": 2.061509609222412, + "learning_rate": 1.9361134443401354e-05, + "loss": 5.5803, + "step": 12404 + }, + { + "epoch": 0.61, + "grad_norm": 1.9697644710540771, + "learning_rate": 1.9351252532239736e-05, + "loss": 5.4989, + "step": 12408 + }, + { + "epoch": 0.61, + "grad_norm": 1.801199197769165, + "learning_rate": 1.934137062107812e-05, + "loss": 5.6458, + "step": 12412 + }, + { + "epoch": 0.61, + "grad_norm": 1.985129475593567, + "learning_rate": 1.9331488709916497e-05, + "loss": 5.4291, + "step": 12416 + }, + { + "epoch": 0.61, + "grad_norm": 1.8330814838409424, + "learning_rate": 1.932160679875488e-05, + "loss": 5.3954, + "step": 12420 + }, + { + "epoch": 0.61, + "grad_norm": 2.2082693576812744, + "learning_rate": 1.931172488759326e-05, + "loss": 5.5882, + "step": 12424 + }, + { + "epoch": 0.61, + "grad_norm": 1.6590445041656494, + "learning_rate": 1.930184297643164e-05, + "loss": 5.4728, + "step": 12428 + }, + { + "epoch": 0.61, + "grad_norm": 2.0365848541259766, + "learning_rate": 1.9291961065270022e-05, + "loss": 5.5032, + "step": 12432 + }, + { + "epoch": 0.61, + "grad_norm": 1.9523799419403076, + "learning_rate": 1.9282079154108407e-05, + "loss": 5.4974, + "step": 12436 + }, + { + "epoch": 0.61, + "grad_norm": 1.9144923686981201, + "learning_rate": 1.9272197242946786e-05, + "loss": 5.5348, + "step": 12440 + }, + { + "epoch": 0.61, + "grad_norm": 1.7671104669570923, + "learning_rate": 1.9262315331785168e-05, + "loss": 5.5292, + "step": 12444 + }, + { + "epoch": 0.62, + "grad_norm": 2.0515549182891846, + "learning_rate": 1.925243342062355e-05, + "loss": 5.4845, + "step": 12448 + }, + { + "epoch": 0.62, + "grad_norm": 1.9381266832351685, + "learning_rate": 1.9242551509461932e-05, + "loss": 5.5455, + "step": 12452 + }, + { + "epoch": 0.62, + "grad_norm": 2.017817497253418, + "learning_rate": 1.923266959830031e-05, + "loss": 5.3844, + "step": 12456 + }, + { + "epoch": 0.62, + "grad_norm": 1.8400537967681885, + "learning_rate": 1.9222787687138693e-05, + "loss": 5.4878, + "step": 12460 + }, + { + "epoch": 0.62, + "grad_norm": 2.263641119003296, + "learning_rate": 1.9212905775977075e-05, + "loss": 5.5841, + "step": 12464 + }, + { + "epoch": 0.62, + "grad_norm": 1.8722437620162964, + "learning_rate": 1.9203023864815457e-05, + "loss": 5.4782, + "step": 12468 + }, + { + "epoch": 0.62, + "grad_norm": 2.020585060119629, + "learning_rate": 1.9193141953653836e-05, + "loss": 5.4668, + "step": 12472 + }, + { + "epoch": 0.62, + "grad_norm": 1.7479088306427002, + "learning_rate": 1.9183260042492218e-05, + "loss": 5.5514, + "step": 12476 + }, + { + "epoch": 0.62, + "grad_norm": 1.9156551361083984, + "learning_rate": 1.91733781313306e-05, + "loss": 5.4994, + "step": 12480 + }, + { + "epoch": 0.62, + "grad_norm": 1.882408618927002, + "learning_rate": 1.9163496220168982e-05, + "loss": 5.4834, + "step": 12484 + }, + { + "epoch": 0.62, + "grad_norm": 2.083282232284546, + "learning_rate": 1.915361430900736e-05, + "loss": 5.5116, + "step": 12488 + }, + { + "epoch": 0.62, + "grad_norm": 1.9320555925369263, + "learning_rate": 1.9143732397845746e-05, + "loss": 5.5378, + "step": 12492 + }, + { + "epoch": 0.62, + "grad_norm": 2.215940475463867, + "learning_rate": 1.9133850486684128e-05, + "loss": 5.4937, + "step": 12496 + }, + { + "epoch": 0.62, + "grad_norm": 2.308119297027588, + "learning_rate": 1.9123968575522507e-05, + "loss": 5.555, + "step": 12500 + }, + { + "epoch": 0.62, + "grad_norm": 2.178675413131714, + "learning_rate": 1.911408666436089e-05, + "loss": 5.487, + "step": 12504 + }, + { + "epoch": 0.62, + "grad_norm": 2.004458427429199, + "learning_rate": 1.910420475319927e-05, + "loss": 5.5416, + "step": 12508 + }, + { + "epoch": 0.62, + "grad_norm": 2.0435168743133545, + "learning_rate": 1.909432284203765e-05, + "loss": 5.4122, + "step": 12512 + }, + { + "epoch": 0.62, + "grad_norm": 2.0281968116760254, + "learning_rate": 1.9084440930876032e-05, + "loss": 5.5032, + "step": 12516 + }, + { + "epoch": 0.62, + "grad_norm": 1.9901241064071655, + "learning_rate": 1.9074559019714414e-05, + "loss": 5.5346, + "step": 12520 + }, + { + "epoch": 0.62, + "grad_norm": 2.0608649253845215, + "learning_rate": 1.9064677108552796e-05, + "loss": 5.5318, + "step": 12524 + }, + { + "epoch": 0.62, + "grad_norm": 2.1132655143737793, + "learning_rate": 1.9054795197391175e-05, + "loss": 5.5227, + "step": 12528 + }, + { + "epoch": 0.62, + "grad_norm": 2.1006295680999756, + "learning_rate": 1.9044913286229557e-05, + "loss": 5.4704, + "step": 12532 + }, + { + "epoch": 0.62, + "grad_norm": 1.8386894464492798, + "learning_rate": 1.903503137506794e-05, + "loss": 5.4159, + "step": 12536 + }, + { + "epoch": 0.62, + "grad_norm": 1.9647696018218994, + "learning_rate": 1.902514946390632e-05, + "loss": 5.6805, + "step": 12540 + }, + { + "epoch": 0.62, + "grad_norm": 2.1188244819641113, + "learning_rate": 1.90152675527447e-05, + "loss": 5.4004, + "step": 12544 + }, + { + "epoch": 0.62, + "grad_norm": 2.0802998542785645, + "learning_rate": 1.9005385641583085e-05, + "loss": 5.4416, + "step": 12548 + }, + { + "epoch": 0.62, + "grad_norm": 1.834084153175354, + "learning_rate": 1.8995503730421467e-05, + "loss": 5.6306, + "step": 12552 + }, + { + "epoch": 0.62, + "grad_norm": 1.8112331628799438, + "learning_rate": 1.8985621819259846e-05, + "loss": 5.5333, + "step": 12556 + }, + { + "epoch": 0.62, + "grad_norm": 1.899707317352295, + "learning_rate": 1.8975739908098228e-05, + "loss": 5.4976, + "step": 12560 + }, + { + "epoch": 0.62, + "grad_norm": 2.298161506652832, + "learning_rate": 1.896585799693661e-05, + "loss": 5.594, + "step": 12564 + }, + { + "epoch": 0.62, + "grad_norm": 2.173597574234009, + "learning_rate": 1.8955976085774992e-05, + "loss": 5.572, + "step": 12568 + }, + { + "epoch": 0.62, + "grad_norm": 1.8348718881607056, + "learning_rate": 1.894609417461337e-05, + "loss": 5.4743, + "step": 12572 + }, + { + "epoch": 0.62, + "grad_norm": 1.7967536449432373, + "learning_rate": 1.8936212263451753e-05, + "loss": 5.5196, + "step": 12576 + }, + { + "epoch": 0.62, + "grad_norm": 1.9703553915023804, + "learning_rate": 1.8926330352290135e-05, + "loss": 5.532, + "step": 12580 + }, + { + "epoch": 0.62, + "grad_norm": 1.7581743001937866, + "learning_rate": 1.8916448441128513e-05, + "loss": 5.5114, + "step": 12584 + }, + { + "epoch": 0.62, + "grad_norm": 2.0711758136749268, + "learning_rate": 1.8906566529966895e-05, + "loss": 5.4094, + "step": 12588 + }, + { + "epoch": 0.62, + "grad_norm": 1.7996923923492432, + "learning_rate": 1.8896684618805277e-05, + "loss": 5.5836, + "step": 12592 + }, + { + "epoch": 0.62, + "grad_norm": 1.97800612449646, + "learning_rate": 1.888680270764366e-05, + "loss": 5.4898, + "step": 12596 + }, + { + "epoch": 0.62, + "grad_norm": 1.7940218448638916, + "learning_rate": 1.8876920796482038e-05, + "loss": 5.6005, + "step": 12600 + }, + { + "epoch": 0.62, + "grad_norm": 1.8298521041870117, + "learning_rate": 1.886703888532042e-05, + "loss": 5.55, + "step": 12604 + }, + { + "epoch": 0.62, + "grad_norm": 2.040109872817993, + "learning_rate": 1.8857156974158806e-05, + "loss": 5.5685, + "step": 12608 + }, + { + "epoch": 0.62, + "grad_norm": 1.8531662225723267, + "learning_rate": 1.8847275062997184e-05, + "loss": 5.5343, + "step": 12612 + }, + { + "epoch": 0.62, + "grad_norm": 2.1842970848083496, + "learning_rate": 1.8837393151835566e-05, + "loss": 5.4704, + "step": 12616 + }, + { + "epoch": 0.62, + "grad_norm": 1.876779556274414, + "learning_rate": 1.882751124067395e-05, + "loss": 5.5272, + "step": 12620 + }, + { + "epoch": 0.62, + "grad_norm": 1.9100033044815063, + "learning_rate": 1.881762932951233e-05, + "loss": 5.4669, + "step": 12624 + }, + { + "epoch": 0.62, + "grad_norm": 2.233772039413452, + "learning_rate": 1.880774741835071e-05, + "loss": 5.5786, + "step": 12628 + }, + { + "epoch": 0.62, + "grad_norm": 2.021141767501831, + "learning_rate": 1.879786550718909e-05, + "loss": 5.5469, + "step": 12632 + }, + { + "epoch": 0.62, + "grad_norm": 1.8748712539672852, + "learning_rate": 1.8787983596027473e-05, + "loss": 5.4186, + "step": 12636 + }, + { + "epoch": 0.62, + "grad_norm": 2.0556745529174805, + "learning_rate": 1.8778101684865855e-05, + "loss": 5.6404, + "step": 12640 + }, + { + "epoch": 0.62, + "grad_norm": 2.089085102081299, + "learning_rate": 1.8768219773704234e-05, + "loss": 5.6056, + "step": 12644 + }, + { + "epoch": 0.62, + "grad_norm": 1.8434518575668335, + "learning_rate": 1.8758337862542616e-05, + "loss": 5.6488, + "step": 12648 + }, + { + "epoch": 0.63, + "grad_norm": 2.003434658050537, + "learning_rate": 1.8748455951380998e-05, + "loss": 5.5057, + "step": 12652 + }, + { + "epoch": 0.63, + "grad_norm": 2.292663335800171, + "learning_rate": 1.8738574040219377e-05, + "loss": 5.5022, + "step": 12656 + }, + { + "epoch": 0.63, + "grad_norm": 1.9476063251495361, + "learning_rate": 1.872869212905776e-05, + "loss": 5.5844, + "step": 12660 + }, + { + "epoch": 0.63, + "grad_norm": 2.138032913208008, + "learning_rate": 1.8718810217896144e-05, + "loss": 5.4383, + "step": 12664 + }, + { + "epoch": 0.63, + "grad_norm": 1.8477308750152588, + "learning_rate": 1.8708928306734523e-05, + "loss": 5.5665, + "step": 12668 + }, + { + "epoch": 0.63, + "grad_norm": 1.9181241989135742, + "learning_rate": 1.8699046395572905e-05, + "loss": 5.5661, + "step": 12672 + }, + { + "epoch": 0.63, + "grad_norm": 1.928312063217163, + "learning_rate": 1.8689164484411287e-05, + "loss": 5.5366, + "step": 12676 + }, + { + "epoch": 0.63, + "grad_norm": 2.210855484008789, + "learning_rate": 1.867928257324967e-05, + "loss": 5.5136, + "step": 12680 + }, + { + "epoch": 0.63, + "grad_norm": 2.030755043029785, + "learning_rate": 1.8669400662088048e-05, + "loss": 5.4813, + "step": 12684 + }, + { + "epoch": 0.63, + "grad_norm": 2.1717166900634766, + "learning_rate": 1.865951875092643e-05, + "loss": 5.579, + "step": 12688 + }, + { + "epoch": 0.63, + "grad_norm": 2.068718671798706, + "learning_rate": 1.8649636839764812e-05, + "loss": 5.4381, + "step": 12692 + }, + { + "epoch": 0.63, + "grad_norm": 2.0134997367858887, + "learning_rate": 1.8639754928603194e-05, + "loss": 5.6513, + "step": 12696 + }, + { + "epoch": 0.63, + "grad_norm": 2.061288356781006, + "learning_rate": 1.8629873017441573e-05, + "loss": 5.4894, + "step": 12700 + }, + { + "epoch": 0.63, + "grad_norm": 2.0297648906707764, + "learning_rate": 1.8619991106279955e-05, + "loss": 5.5485, + "step": 12704 + }, + { + "epoch": 0.63, + "grad_norm": 2.0792784690856934, + "learning_rate": 1.8610109195118337e-05, + "loss": 5.5622, + "step": 12708 + }, + { + "epoch": 0.63, + "grad_norm": 2.003371238708496, + "learning_rate": 1.8600227283956715e-05, + "loss": 5.5616, + "step": 12712 + }, + { + "epoch": 0.63, + "grad_norm": 2.0913472175598145, + "learning_rate": 1.8590345372795098e-05, + "loss": 5.6751, + "step": 12716 + }, + { + "epoch": 0.63, + "grad_norm": 1.9094600677490234, + "learning_rate": 1.8580463461633483e-05, + "loss": 5.5047, + "step": 12720 + }, + { + "epoch": 0.63, + "grad_norm": 2.1928985118865967, + "learning_rate": 1.857058155047186e-05, + "loss": 5.6044, + "step": 12724 + }, + { + "epoch": 0.63, + "grad_norm": 1.9102288484573364, + "learning_rate": 1.8560699639310244e-05, + "loss": 5.4843, + "step": 12728 + }, + { + "epoch": 0.63, + "grad_norm": 1.908156156539917, + "learning_rate": 1.8550817728148626e-05, + "loss": 5.5217, + "step": 12732 + }, + { + "epoch": 0.63, + "grad_norm": 1.8940017223358154, + "learning_rate": 1.8540935816987008e-05, + "loss": 5.5211, + "step": 12736 + }, + { + "epoch": 0.63, + "grad_norm": 1.6733046770095825, + "learning_rate": 1.8531053905825386e-05, + "loss": 5.6507, + "step": 12740 + }, + { + "epoch": 0.63, + "grad_norm": 1.9494881629943848, + "learning_rate": 1.852117199466377e-05, + "loss": 5.6028, + "step": 12744 + }, + { + "epoch": 0.63, + "grad_norm": 2.361642360687256, + "learning_rate": 1.851129008350215e-05, + "loss": 5.5678, + "step": 12748 + }, + { + "epoch": 0.63, + "grad_norm": 1.9810757637023926, + "learning_rate": 1.8501408172340533e-05, + "loss": 5.5055, + "step": 12752 + }, + { + "epoch": 0.63, + "grad_norm": 2.196544885635376, + "learning_rate": 1.849152626117891e-05, + "loss": 5.4622, + "step": 12756 + }, + { + "epoch": 0.63, + "grad_norm": 1.841874599456787, + "learning_rate": 1.8481644350017293e-05, + "loss": 5.525, + "step": 12760 + }, + { + "epoch": 0.63, + "grad_norm": 1.982703685760498, + "learning_rate": 1.8471762438855675e-05, + "loss": 5.5797, + "step": 12764 + }, + { + "epoch": 0.63, + "grad_norm": 2.193528413772583, + "learning_rate": 1.8461880527694058e-05, + "loss": 5.4744, + "step": 12768 + }, + { + "epoch": 0.63, + "grad_norm": 1.6755714416503906, + "learning_rate": 1.8451998616532436e-05, + "loss": 5.4944, + "step": 12772 + }, + { + "epoch": 0.63, + "grad_norm": 1.9214802980422974, + "learning_rate": 1.8442116705370818e-05, + "loss": 5.4525, + "step": 12776 + }, + { + "epoch": 0.63, + "grad_norm": 2.1332216262817383, + "learning_rate": 1.8432234794209204e-05, + "loss": 5.4974, + "step": 12780 + }, + { + "epoch": 0.63, + "grad_norm": 1.9983022212982178, + "learning_rate": 1.8422352883047582e-05, + "loss": 5.4737, + "step": 12784 + }, + { + "epoch": 0.63, + "grad_norm": 2.090367555618286, + "learning_rate": 1.8412470971885964e-05, + "loss": 5.4413, + "step": 12788 + }, + { + "epoch": 0.63, + "grad_norm": 1.903393030166626, + "learning_rate": 1.8402589060724346e-05, + "loss": 5.5338, + "step": 12792 + }, + { + "epoch": 0.63, + "grad_norm": 1.8425928354263306, + "learning_rate": 1.8392707149562725e-05, + "loss": 5.5461, + "step": 12796 + }, + { + "epoch": 0.63, + "grad_norm": 1.7911487817764282, + "learning_rate": 1.8382825238401107e-05, + "loss": 5.5144, + "step": 12800 + }, + { + "epoch": 0.63, + "grad_norm": 1.9388378858566284, + "learning_rate": 1.837294332723949e-05, + "loss": 5.4777, + "step": 12804 + }, + { + "epoch": 0.63, + "grad_norm": 1.9651098251342773, + "learning_rate": 1.836306141607787e-05, + "loss": 5.583, + "step": 12808 + }, + { + "epoch": 0.63, + "grad_norm": 2.097846269607544, + "learning_rate": 1.835317950491625e-05, + "loss": 5.4835, + "step": 12812 + }, + { + "epoch": 0.63, + "grad_norm": 1.8683522939682007, + "learning_rate": 1.8343297593754632e-05, + "loss": 5.542, + "step": 12816 + }, + { + "epoch": 0.63, + "grad_norm": 1.9476234912872314, + "learning_rate": 1.8333415682593014e-05, + "loss": 5.5376, + "step": 12820 + }, + { + "epoch": 0.63, + "grad_norm": 2.049328088760376, + "learning_rate": 1.8323533771431396e-05, + "loss": 5.4911, + "step": 12824 + }, + { + "epoch": 0.63, + "grad_norm": 1.8876453638076782, + "learning_rate": 1.8313651860269775e-05, + "loss": 5.5226, + "step": 12828 + }, + { + "epoch": 0.63, + "grad_norm": 1.9597487449645996, + "learning_rate": 1.8303769949108157e-05, + "loss": 5.4785, + "step": 12832 + }, + { + "epoch": 0.63, + "grad_norm": 2.0029759407043457, + "learning_rate": 1.8293888037946542e-05, + "loss": 5.5109, + "step": 12836 + }, + { + "epoch": 0.63, + "grad_norm": 1.9219965934753418, + "learning_rate": 1.828400612678492e-05, + "loss": 5.508, + "step": 12840 + }, + { + "epoch": 0.63, + "grad_norm": 1.9963774681091309, + "learning_rate": 1.8274124215623303e-05, + "loss": 5.664, + "step": 12844 + }, + { + "epoch": 0.63, + "grad_norm": 2.181628942489624, + "learning_rate": 1.8264242304461685e-05, + "loss": 5.4549, + "step": 12848 + }, + { + "epoch": 0.64, + "grad_norm": 1.9105952978134155, + "learning_rate": 1.8254360393300067e-05, + "loss": 5.6288, + "step": 12852 + }, + { + "epoch": 0.64, + "grad_norm": 1.8461229801177979, + "learning_rate": 1.8244478482138446e-05, + "loss": 5.5195, + "step": 12856 + }, + { + "epoch": 0.64, + "grad_norm": 1.9446773529052734, + "learning_rate": 1.8234596570976828e-05, + "loss": 5.4313, + "step": 12860 + }, + { + "epoch": 0.64, + "grad_norm": 2.007297992706299, + "learning_rate": 1.822471465981521e-05, + "loss": 5.6664, + "step": 12864 + }, + { + "epoch": 0.64, + "grad_norm": 2.0537166595458984, + "learning_rate": 1.821483274865359e-05, + "loss": 5.5447, + "step": 12868 + }, + { + "epoch": 0.64, + "grad_norm": 2.0639407634735107, + "learning_rate": 1.820495083749197e-05, + "loss": 5.4532, + "step": 12872 + }, + { + "epoch": 0.64, + "grad_norm": 1.9469093084335327, + "learning_rate": 1.8195068926330353e-05, + "loss": 5.5461, + "step": 12876 + }, + { + "epoch": 0.64, + "grad_norm": 1.766298770904541, + "learning_rate": 1.8185187015168735e-05, + "loss": 5.4775, + "step": 12880 + }, + { + "epoch": 0.64, + "grad_norm": 1.7954472303390503, + "learning_rate": 1.8175305104007114e-05, + "loss": 5.5326, + "step": 12884 + }, + { + "epoch": 0.64, + "grad_norm": 1.8361804485321045, + "learning_rate": 1.8165423192845496e-05, + "loss": 5.5217, + "step": 12888 + }, + { + "epoch": 0.64, + "grad_norm": 2.2192041873931885, + "learning_rate": 1.815554128168388e-05, + "loss": 5.4166, + "step": 12892 + }, + { + "epoch": 0.64, + "grad_norm": 2.092569351196289, + "learning_rate": 1.814565937052226e-05, + "loss": 5.5973, + "step": 12896 + }, + { + "epoch": 0.64, + "grad_norm": 1.8529601097106934, + "learning_rate": 1.8135777459360642e-05, + "loss": 5.4088, + "step": 12900 + }, + { + "epoch": 0.64, + "grad_norm": 1.7156609296798706, + "learning_rate": 1.8125895548199024e-05, + "loss": 5.5506, + "step": 12904 + }, + { + "epoch": 0.64, + "grad_norm": 2.3277995586395264, + "learning_rate": 1.8116013637037406e-05, + "loss": 5.4852, + "step": 12908 + }, + { + "epoch": 0.64, + "grad_norm": 2.039177894592285, + "learning_rate": 1.8106131725875785e-05, + "loss": 5.509, + "step": 12912 + }, + { + "epoch": 0.64, + "grad_norm": 2.2541489601135254, + "learning_rate": 1.8096249814714167e-05, + "loss": 5.5551, + "step": 12916 + }, + { + "epoch": 0.64, + "grad_norm": 1.9068487882614136, + "learning_rate": 1.808636790355255e-05, + "loss": 5.5223, + "step": 12920 + }, + { + "epoch": 0.64, + "grad_norm": 1.8464723825454712, + "learning_rate": 1.807648599239093e-05, + "loss": 5.4688, + "step": 12924 + }, + { + "epoch": 0.64, + "grad_norm": 1.8630855083465576, + "learning_rate": 1.806660408122931e-05, + "loss": 5.521, + "step": 12928 + }, + { + "epoch": 0.64, + "grad_norm": 1.9354201555252075, + "learning_rate": 1.805672217006769e-05, + "loss": 5.5234, + "step": 12932 + }, + { + "epoch": 0.64, + "grad_norm": 2.2278544902801514, + "learning_rate": 1.8046840258906074e-05, + "loss": 5.563, + "step": 12936 + }, + { + "epoch": 0.64, + "grad_norm": 2.0102896690368652, + "learning_rate": 1.8036958347744452e-05, + "loss": 5.6071, + "step": 12940 + }, + { + "epoch": 0.64, + "grad_norm": 2.12506103515625, + "learning_rate": 1.8027076436582834e-05, + "loss": 5.5518, + "step": 12944 + }, + { + "epoch": 0.64, + "grad_norm": 2.134568929672241, + "learning_rate": 1.8017194525421216e-05, + "loss": 5.4289, + "step": 12948 + }, + { + "epoch": 0.64, + "grad_norm": 1.866938829421997, + "learning_rate": 1.80073126142596e-05, + "loss": 5.4047, + "step": 12952 + }, + { + "epoch": 0.64, + "grad_norm": 2.0489606857299805, + "learning_rate": 1.799743070309798e-05, + "loss": 5.4668, + "step": 12956 + }, + { + "epoch": 0.64, + "grad_norm": 2.130350351333618, + "learning_rate": 1.7987548791936363e-05, + "loss": 5.599, + "step": 12960 + }, + { + "epoch": 0.64, + "grad_norm": 2.1276466846466064, + "learning_rate": 1.7977666880774745e-05, + "loss": 5.3925, + "step": 12964 + }, + { + "epoch": 0.64, + "grad_norm": 1.8620883226394653, + "learning_rate": 1.7967784969613123e-05, + "loss": 5.4918, + "step": 12968 + }, + { + "epoch": 0.64, + "grad_norm": 1.7407227754592896, + "learning_rate": 1.7957903058451505e-05, + "loss": 5.5406, + "step": 12972 + }, + { + "epoch": 0.64, + "grad_norm": 1.9221688508987427, + "learning_rate": 1.7948021147289887e-05, + "loss": 5.5111, + "step": 12976 + }, + { + "epoch": 0.64, + "grad_norm": 2.0765016078948975, + "learning_rate": 1.793813923612827e-05, + "loss": 5.6063, + "step": 12980 + }, + { + "epoch": 0.64, + "grad_norm": 2.040132999420166, + "learning_rate": 1.7928257324966648e-05, + "loss": 5.5904, + "step": 12984 + }, + { + "epoch": 0.64, + "grad_norm": 1.8433319330215454, + "learning_rate": 1.791837541380503e-05, + "loss": 5.4806, + "step": 12988 + }, + { + "epoch": 0.64, + "grad_norm": 2.065800428390503, + "learning_rate": 1.7908493502643412e-05, + "loss": 5.563, + "step": 12992 + }, + { + "epoch": 0.64, + "grad_norm": 2.199831485748291, + "learning_rate": 1.789861159148179e-05, + "loss": 5.5233, + "step": 12996 + }, + { + "epoch": 0.64, + "grad_norm": 1.8309836387634277, + "learning_rate": 1.7888729680320173e-05, + "loss": 5.4911, + "step": 13000 + }, + { + "epoch": 0.64, + "grad_norm": 1.900347113609314, + "learning_rate": 1.7878847769158555e-05, + "loss": 5.5083, + "step": 13004 + }, + { + "epoch": 0.64, + "grad_norm": 1.8298702239990234, + "learning_rate": 1.786896585799694e-05, + "loss": 5.5197, + "step": 13008 + }, + { + "epoch": 0.64, + "grad_norm": 1.8966771364212036, + "learning_rate": 1.785908394683532e-05, + "loss": 5.4034, + "step": 13012 + }, + { + "epoch": 0.64, + "grad_norm": 1.9861708879470825, + "learning_rate": 1.78492020356737e-05, + "loss": 5.3773, + "step": 13016 + }, + { + "epoch": 0.64, + "grad_norm": 2.1633896827697754, + "learning_rate": 1.7839320124512083e-05, + "loss": 5.5735, + "step": 13020 + }, + { + "epoch": 0.64, + "grad_norm": 1.8944169282913208, + "learning_rate": 1.7829438213350462e-05, + "loss": 5.4982, + "step": 13024 + }, + { + "epoch": 0.64, + "grad_norm": 2.358996868133545, + "learning_rate": 1.7819556302188844e-05, + "loss": 5.5851, + "step": 13028 + }, + { + "epoch": 0.64, + "grad_norm": 1.8002705574035645, + "learning_rate": 1.7809674391027226e-05, + "loss": 5.5903, + "step": 13032 + }, + { + "epoch": 0.64, + "grad_norm": 2.174081325531006, + "learning_rate": 1.7799792479865608e-05, + "loss": 5.5327, + "step": 13036 + }, + { + "epoch": 0.64, + "grad_norm": 2.115267515182495, + "learning_rate": 1.7789910568703987e-05, + "loss": 5.6085, + "step": 13040 + }, + { + "epoch": 0.64, + "grad_norm": 2.197908878326416, + "learning_rate": 1.778002865754237e-05, + "loss": 5.5499, + "step": 13044 + }, + { + "epoch": 0.64, + "grad_norm": 2.2714781761169434, + "learning_rate": 1.777014674638075e-05, + "loss": 5.6221, + "step": 13048 + }, + { + "epoch": 0.64, + "grad_norm": 2.1368441581726074, + "learning_rate": 1.7760264835219133e-05, + "loss": 5.5613, + "step": 13052 + }, + { + "epoch": 0.65, + "grad_norm": 2.0283219814300537, + "learning_rate": 1.775038292405751e-05, + "loss": 5.4734, + "step": 13056 + }, + { + "epoch": 0.65, + "grad_norm": 1.7801271677017212, + "learning_rate": 1.7740501012895894e-05, + "loss": 5.4928, + "step": 13060 + }, + { + "epoch": 0.65, + "grad_norm": 1.8141615390777588, + "learning_rate": 1.7730619101734276e-05, + "loss": 5.4943, + "step": 13064 + }, + { + "epoch": 0.65, + "grad_norm": 1.9424062967300415, + "learning_rate": 1.7720737190572658e-05, + "loss": 5.4888, + "step": 13068 + }, + { + "epoch": 0.65, + "grad_norm": 1.7956730127334595, + "learning_rate": 1.771085527941104e-05, + "loss": 5.3367, + "step": 13072 + }, + { + "epoch": 0.65, + "grad_norm": 1.8769028186798096, + "learning_rate": 1.7700973368249422e-05, + "loss": 5.5365, + "step": 13076 + }, + { + "epoch": 0.65, + "grad_norm": 1.8470765352249146, + "learning_rate": 1.76910914570878e-05, + "loss": 5.6281, + "step": 13080 + }, + { + "epoch": 0.65, + "grad_norm": 1.9143494367599487, + "learning_rate": 1.7681209545926183e-05, + "loss": 5.403, + "step": 13084 + }, + { + "epoch": 0.65, + "grad_norm": 1.8906290531158447, + "learning_rate": 1.7671327634764565e-05, + "loss": 5.4201, + "step": 13088 + }, + { + "epoch": 0.65, + "grad_norm": 1.9759694337844849, + "learning_rate": 1.7661445723602947e-05, + "loss": 5.5, + "step": 13092 + }, + { + "epoch": 0.65, + "grad_norm": 2.16597580909729, + "learning_rate": 1.7651563812441325e-05, + "loss": 5.4831, + "step": 13096 + }, + { + "epoch": 0.65, + "grad_norm": 2.142273187637329, + "learning_rate": 1.7641681901279708e-05, + "loss": 5.5608, + "step": 13100 + }, + { + "epoch": 0.65, + "grad_norm": 2.0143542289733887, + "learning_rate": 1.763179999011809e-05, + "loss": 5.4914, + "step": 13104 + }, + { + "epoch": 0.65, + "grad_norm": 2.0241010189056396, + "learning_rate": 1.762191807895647e-05, + "loss": 5.5402, + "step": 13108 + }, + { + "epoch": 0.65, + "grad_norm": 2.111691951751709, + "learning_rate": 1.761203616779485e-05, + "loss": 5.5417, + "step": 13112 + }, + { + "epoch": 0.65, + "grad_norm": 2.0966546535491943, + "learning_rate": 1.7602154256633232e-05, + "loss": 5.4544, + "step": 13116 + }, + { + "epoch": 0.65, + "grad_norm": 2.2063889503479004, + "learning_rate": 1.7592272345471614e-05, + "loss": 5.5447, + "step": 13120 + }, + { + "epoch": 0.65, + "grad_norm": 2.0416488647460938, + "learning_rate": 1.7582390434309997e-05, + "loss": 5.56, + "step": 13124 + }, + { + "epoch": 0.65, + "grad_norm": 2.1015825271606445, + "learning_rate": 1.757250852314838e-05, + "loss": 5.5632, + "step": 13128 + }, + { + "epoch": 0.65, + "grad_norm": 2.154283046722412, + "learning_rate": 1.756262661198676e-05, + "loss": 5.4369, + "step": 13132 + }, + { + "epoch": 0.65, + "grad_norm": 1.9613300561904907, + "learning_rate": 1.7552744700825143e-05, + "loss": 5.4493, + "step": 13136 + }, + { + "epoch": 0.65, + "grad_norm": 1.936897873878479, + "learning_rate": 1.754286278966352e-05, + "loss": 5.6429, + "step": 13140 + }, + { + "epoch": 0.65, + "grad_norm": 2.1174933910369873, + "learning_rate": 1.7532980878501903e-05, + "loss": 5.5222, + "step": 13144 + }, + { + "epoch": 0.65, + "grad_norm": 1.8853952884674072, + "learning_rate": 1.7523098967340285e-05, + "loss": 5.5823, + "step": 13148 + }, + { + "epoch": 0.65, + "grad_norm": 1.7702313661575317, + "learning_rate": 1.7513217056178664e-05, + "loss": 5.5205, + "step": 13152 + }, + { + "epoch": 0.65, + "grad_norm": 1.8901150226593018, + "learning_rate": 1.7503335145017046e-05, + "loss": 5.5227, + "step": 13156 + }, + { + "epoch": 0.65, + "grad_norm": 1.9898940324783325, + "learning_rate": 1.7493453233855428e-05, + "loss": 5.5064, + "step": 13160 + }, + { + "epoch": 0.65, + "grad_norm": 1.8458938598632812, + "learning_rate": 1.748357132269381e-05, + "loss": 5.4032, + "step": 13164 + }, + { + "epoch": 0.65, + "grad_norm": 2.1321139335632324, + "learning_rate": 1.747368941153219e-05, + "loss": 5.5491, + "step": 13168 + }, + { + "epoch": 0.65, + "grad_norm": 2.055555820465088, + "learning_rate": 1.746380750037057e-05, + "loss": 5.4821, + "step": 13172 + }, + { + "epoch": 0.65, + "grad_norm": 2.0164570808410645, + "learning_rate": 1.7453925589208953e-05, + "loss": 5.4772, + "step": 13176 + }, + { + "epoch": 0.65, + "grad_norm": 1.8798964023590088, + "learning_rate": 1.7444043678047335e-05, + "loss": 5.6292, + "step": 13180 + }, + { + "epoch": 0.65, + "grad_norm": 1.8820830583572388, + "learning_rate": 1.7434161766885717e-05, + "loss": 5.5503, + "step": 13184 + }, + { + "epoch": 0.65, + "grad_norm": 1.768707513809204, + "learning_rate": 1.74242798557241e-05, + "loss": 5.5907, + "step": 13188 + }, + { + "epoch": 0.65, + "grad_norm": 1.7819797992706299, + "learning_rate": 1.741439794456248e-05, + "loss": 5.3631, + "step": 13192 + }, + { + "epoch": 0.65, + "grad_norm": 1.9888780117034912, + "learning_rate": 1.740451603340086e-05, + "loss": 5.5318, + "step": 13196 + }, + { + "epoch": 0.65, + "grad_norm": 1.7733293771743774, + "learning_rate": 1.7394634122239242e-05, + "loss": 5.4794, + "step": 13200 + }, + { + "epoch": 0.65, + "grad_norm": 2.08803653717041, + "learning_rate": 1.7384752211077624e-05, + "loss": 5.5392, + "step": 13204 + }, + { + "epoch": 0.65, + "grad_norm": 1.7837083339691162, + "learning_rate": 1.7374870299916006e-05, + "loss": 5.5612, + "step": 13208 + }, + { + "epoch": 0.65, + "grad_norm": 2.200601100921631, + "learning_rate": 1.7364988388754385e-05, + "loss": 5.4618, + "step": 13212 + }, + { + "epoch": 0.65, + "grad_norm": 2.062946319580078, + "learning_rate": 1.7355106477592767e-05, + "loss": 5.4031, + "step": 13216 + }, + { + "epoch": 0.65, + "grad_norm": 1.8929904699325562, + "learning_rate": 1.734522456643115e-05, + "loss": 5.6011, + "step": 13220 + }, + { + "epoch": 0.65, + "grad_norm": 2.175036907196045, + "learning_rate": 1.7335342655269528e-05, + "loss": 5.5038, + "step": 13224 + }, + { + "epoch": 0.65, + "grad_norm": 2.1107425689697266, + "learning_rate": 1.732546074410791e-05, + "loss": 5.4624, + "step": 13228 + }, + { + "epoch": 0.65, + "grad_norm": 1.8881957530975342, + "learning_rate": 1.7315578832946292e-05, + "loss": 5.4579, + "step": 13232 + }, + { + "epoch": 0.65, + "grad_norm": 1.9419997930526733, + "learning_rate": 1.7305696921784674e-05, + "loss": 5.5691, + "step": 13236 + }, + { + "epoch": 0.65, + "grad_norm": 2.006504535675049, + "learning_rate": 1.7295815010623056e-05, + "loss": 5.604, + "step": 13240 + }, + { + "epoch": 0.65, + "grad_norm": 1.9548697471618652, + "learning_rate": 1.7285933099461438e-05, + "loss": 5.5202, + "step": 13244 + }, + { + "epoch": 0.65, + "grad_norm": 1.903361201286316, + "learning_rate": 1.727605118829982e-05, + "loss": 5.5459, + "step": 13248 + }, + { + "epoch": 0.65, + "grad_norm": 1.9035142660140991, + "learning_rate": 1.72661692771382e-05, + "loss": 5.5067, + "step": 13252 + }, + { + "epoch": 0.65, + "grad_norm": 1.9801918268203735, + "learning_rate": 1.725628736597658e-05, + "loss": 5.502, + "step": 13256 + }, + { + "epoch": 0.66, + "grad_norm": 1.8996813297271729, + "learning_rate": 1.7246405454814963e-05, + "loss": 5.4709, + "step": 13260 + }, + { + "epoch": 0.66, + "grad_norm": 1.903206467628479, + "learning_rate": 1.7236523543653345e-05, + "loss": 5.4252, + "step": 13264 + }, + { + "epoch": 0.66, + "grad_norm": 2.280048370361328, + "learning_rate": 1.7226641632491724e-05, + "loss": 5.6236, + "step": 13268 + }, + { + "epoch": 0.66, + "grad_norm": 1.916698932647705, + "learning_rate": 1.7216759721330106e-05, + "loss": 5.5417, + "step": 13272 + }, + { + "epoch": 0.66, + "grad_norm": 1.8944514989852905, + "learning_rate": 1.7206877810168488e-05, + "loss": 5.5273, + "step": 13276 + }, + { + "epoch": 0.66, + "grad_norm": 2.160426139831543, + "learning_rate": 1.7196995899006866e-05, + "loss": 5.4925, + "step": 13280 + }, + { + "epoch": 0.66, + "grad_norm": 1.9661693572998047, + "learning_rate": 1.718711398784525e-05, + "loss": 5.5566, + "step": 13284 + }, + { + "epoch": 0.66, + "grad_norm": 2.1579537391662598, + "learning_rate": 1.717723207668363e-05, + "loss": 5.5, + "step": 13288 + }, + { + "epoch": 0.66, + "grad_norm": 1.8311907052993774, + "learning_rate": 1.7167350165522013e-05, + "loss": 5.4402, + "step": 13292 + }, + { + "epoch": 0.66, + "grad_norm": 2.067732810974121, + "learning_rate": 1.7157468254360395e-05, + "loss": 5.5876, + "step": 13296 + }, + { + "epoch": 0.66, + "grad_norm": 1.9030920267105103, + "learning_rate": 1.7147586343198777e-05, + "loss": 5.5544, + "step": 13300 + }, + { + "epoch": 0.66, + "grad_norm": 1.9689793586730957, + "learning_rate": 1.713770443203716e-05, + "loss": 5.5876, + "step": 13304 + }, + { + "epoch": 0.66, + "grad_norm": 2.319972038269043, + "learning_rate": 1.7127822520875537e-05, + "loss": 5.6179, + "step": 13308 + }, + { + "epoch": 0.66, + "grad_norm": 2.05124568939209, + "learning_rate": 1.711794060971392e-05, + "loss": 5.4808, + "step": 13312 + }, + { + "epoch": 0.66, + "grad_norm": 2.054259777069092, + "learning_rate": 1.71080586985523e-05, + "loss": 5.571, + "step": 13316 + }, + { + "epoch": 0.66, + "grad_norm": 2.1697633266448975, + "learning_rate": 1.7098176787390684e-05, + "loss": 5.5687, + "step": 13320 + }, + { + "epoch": 0.66, + "grad_norm": 2.158599853515625, + "learning_rate": 1.7088294876229062e-05, + "loss": 5.4535, + "step": 13324 + }, + { + "epoch": 0.66, + "grad_norm": 2.263106107711792, + "learning_rate": 1.7078412965067444e-05, + "loss": 5.5662, + "step": 13328 + }, + { + "epoch": 0.66, + "grad_norm": 1.9761734008789062, + "learning_rate": 1.7068531053905826e-05, + "loss": 5.4655, + "step": 13332 + }, + { + "epoch": 0.66, + "grad_norm": 2.1491572856903076, + "learning_rate": 1.705864914274421e-05, + "loss": 5.5403, + "step": 13336 + }, + { + "epoch": 0.66, + "grad_norm": 1.9614084959030151, + "learning_rate": 1.7048767231582587e-05, + "loss": 5.4883, + "step": 13340 + }, + { + "epoch": 0.66, + "grad_norm": 2.1084208488464355, + "learning_rate": 1.703888532042097e-05, + "loss": 5.4915, + "step": 13344 + }, + { + "epoch": 0.66, + "grad_norm": 1.9315608739852905, + "learning_rate": 1.702900340925935e-05, + "loss": 5.531, + "step": 13348 + }, + { + "epoch": 0.66, + "grad_norm": 1.9300886392593384, + "learning_rate": 1.7019121498097733e-05, + "loss": 5.5294, + "step": 13352 + }, + { + "epoch": 0.66, + "grad_norm": 1.9439579248428345, + "learning_rate": 1.7009239586936115e-05, + "loss": 5.5372, + "step": 13356 + }, + { + "epoch": 0.66, + "grad_norm": 1.8347561359405518, + "learning_rate": 1.6999357675774497e-05, + "loss": 5.5304, + "step": 13360 + }, + { + "epoch": 0.66, + "grad_norm": 1.8642383813858032, + "learning_rate": 1.6989475764612876e-05, + "loss": 5.3163, + "step": 13364 + }, + { + "epoch": 0.66, + "grad_norm": 1.9651672840118408, + "learning_rate": 1.6979593853451258e-05, + "loss": 5.4271, + "step": 13368 + }, + { + "epoch": 0.66, + "grad_norm": 2.146197557449341, + "learning_rate": 1.696971194228964e-05, + "loss": 5.6012, + "step": 13372 + }, + { + "epoch": 0.66, + "grad_norm": 1.9717754125595093, + "learning_rate": 1.6959830031128022e-05, + "loss": 5.4703, + "step": 13376 + }, + { + "epoch": 0.66, + "grad_norm": 1.8049124479293823, + "learning_rate": 1.69499481199664e-05, + "loss": 5.4398, + "step": 13380 + }, + { + "epoch": 0.66, + "grad_norm": 1.8642454147338867, + "learning_rate": 1.6940066208804783e-05, + "loss": 5.5652, + "step": 13384 + }, + { + "epoch": 0.66, + "grad_norm": 1.8307348489761353, + "learning_rate": 1.6930184297643165e-05, + "loss": 5.6079, + "step": 13388 + }, + { + "epoch": 0.66, + "grad_norm": 1.8231866359710693, + "learning_rate": 1.6920302386481547e-05, + "loss": 5.4477, + "step": 13392 + }, + { + "epoch": 0.66, + "grad_norm": 1.7777388095855713, + "learning_rate": 1.6910420475319926e-05, + "loss": 5.5014, + "step": 13396 + }, + { + "epoch": 0.66, + "grad_norm": 2.1190943717956543, + "learning_rate": 1.6900538564158308e-05, + "loss": 5.512, + "step": 13400 + }, + { + "epoch": 0.66, + "grad_norm": 1.7859139442443848, + "learning_rate": 1.689065665299669e-05, + "loss": 5.4176, + "step": 13404 + }, + { + "epoch": 0.66, + "grad_norm": 1.937577486038208, + "learning_rate": 1.6880774741835072e-05, + "loss": 5.5242, + "step": 13408 + }, + { + "epoch": 0.66, + "grad_norm": 1.9069881439208984, + "learning_rate": 1.6870892830673454e-05, + "loss": 5.5248, + "step": 13412 + }, + { + "epoch": 0.66, + "grad_norm": 2.3434953689575195, + "learning_rate": 1.6861010919511836e-05, + "loss": 5.5887, + "step": 13416 + }, + { + "epoch": 0.66, + "grad_norm": 2.0236103534698486, + "learning_rate": 1.6851129008350218e-05, + "loss": 5.5241, + "step": 13420 + }, + { + "epoch": 0.66, + "grad_norm": 1.807458519935608, + "learning_rate": 1.6841247097188597e-05, + "loss": 5.5018, + "step": 13424 + }, + { + "epoch": 0.66, + "grad_norm": 1.7654306888580322, + "learning_rate": 1.683136518602698e-05, + "loss": 5.5346, + "step": 13428 + }, + { + "epoch": 0.66, + "grad_norm": 1.796311378479004, + "learning_rate": 1.682148327486536e-05, + "loss": 5.4943, + "step": 13432 + }, + { + "epoch": 0.66, + "grad_norm": 2.1759448051452637, + "learning_rate": 1.681160136370374e-05, + "loss": 5.5745, + "step": 13436 + }, + { + "epoch": 0.66, + "grad_norm": 1.9799151420593262, + "learning_rate": 1.680171945254212e-05, + "loss": 5.5721, + "step": 13440 + }, + { + "epoch": 0.66, + "grad_norm": 1.8375133275985718, + "learning_rate": 1.6791837541380504e-05, + "loss": 5.5058, + "step": 13444 + }, + { + "epoch": 0.66, + "grad_norm": 2.0697977542877197, + "learning_rate": 1.6781955630218886e-05, + "loss": 5.49, + "step": 13448 + }, + { + "epoch": 0.66, + "grad_norm": 1.968781590461731, + "learning_rate": 1.6772073719057264e-05, + "loss": 5.414, + "step": 13452 + }, + { + "epoch": 0.66, + "grad_norm": 2.182166814804077, + "learning_rate": 1.6762191807895647e-05, + "loss": 5.6226, + "step": 13456 + }, + { + "epoch": 0.67, + "grad_norm": 2.110377073287964, + "learning_rate": 1.675230989673403e-05, + "loss": 5.5626, + "step": 13460 + }, + { + "epoch": 0.67, + "grad_norm": 1.979601263999939, + "learning_rate": 1.674242798557241e-05, + "loss": 5.62, + "step": 13464 + }, + { + "epoch": 0.67, + "grad_norm": 2.2129158973693848, + "learning_rate": 1.6732546074410793e-05, + "loss": 5.5565, + "step": 13468 + }, + { + "epoch": 0.67, + "grad_norm": 2.0401570796966553, + "learning_rate": 1.6722664163249175e-05, + "loss": 5.4781, + "step": 13472 + }, + { + "epoch": 0.67, + "grad_norm": 1.7625924348831177, + "learning_rate": 1.6712782252087557e-05, + "loss": 5.4104, + "step": 13476 + }, + { + "epoch": 0.67, + "grad_norm": 1.8981072902679443, + "learning_rate": 1.6702900340925936e-05, + "loss": 5.5208, + "step": 13480 + }, + { + "epoch": 0.67, + "grad_norm": 2.141097068786621, + "learning_rate": 1.6693018429764318e-05, + "loss": 5.505, + "step": 13484 + }, + { + "epoch": 0.67, + "grad_norm": 2.0228660106658936, + "learning_rate": 1.66831365186027e-05, + "loss": 5.5653, + "step": 13488 + }, + { + "epoch": 0.67, + "grad_norm": 1.9627779722213745, + "learning_rate": 1.667325460744108e-05, + "loss": 5.5383, + "step": 13492 + }, + { + "epoch": 0.67, + "grad_norm": 1.80388605594635, + "learning_rate": 1.666337269627946e-05, + "loss": 5.4095, + "step": 13496 + }, + { + "epoch": 0.67, + "grad_norm": 2.125562906265259, + "learning_rate": 1.6653490785117842e-05, + "loss": 5.5157, + "step": 13500 + }, + { + "epoch": 0.67, + "grad_norm": 2.3643898963928223, + "learning_rate": 1.6643608873956224e-05, + "loss": 5.4778, + "step": 13504 + }, + { + "epoch": 0.67, + "grad_norm": 1.9573434591293335, + "learning_rate": 1.6633726962794603e-05, + "loss": 5.3951, + "step": 13508 + }, + { + "epoch": 0.67, + "grad_norm": 1.767749309539795, + "learning_rate": 1.6623845051632985e-05, + "loss": 5.5775, + "step": 13512 + }, + { + "epoch": 0.67, + "grad_norm": 1.7242648601531982, + "learning_rate": 1.6613963140471367e-05, + "loss": 5.5681, + "step": 13516 + }, + { + "epoch": 0.67, + "grad_norm": 2.098914384841919, + "learning_rate": 1.660408122930975e-05, + "loss": 5.4892, + "step": 13520 + }, + { + "epoch": 0.67, + "grad_norm": 1.9471126794815063, + "learning_rate": 1.659419931814813e-05, + "loss": 5.5672, + "step": 13524 + }, + { + "epoch": 0.67, + "grad_norm": 1.9518860578536987, + "learning_rate": 1.6584317406986513e-05, + "loss": 5.5318, + "step": 13528 + }, + { + "epoch": 0.67, + "grad_norm": 1.9697927236557007, + "learning_rate": 1.6574435495824896e-05, + "loss": 5.5275, + "step": 13532 + }, + { + "epoch": 0.67, + "grad_norm": 2.005502700805664, + "learning_rate": 1.6564553584663274e-05, + "loss": 5.6122, + "step": 13536 + }, + { + "epoch": 0.67, + "grad_norm": 2.06354022026062, + "learning_rate": 1.6554671673501656e-05, + "loss": 5.3431, + "step": 13540 + }, + { + "epoch": 0.67, + "grad_norm": 1.9697012901306152, + "learning_rate": 1.6544789762340038e-05, + "loss": 5.5376, + "step": 13544 + }, + { + "epoch": 0.67, + "grad_norm": 1.9158987998962402, + "learning_rate": 1.653490785117842e-05, + "loss": 5.5176, + "step": 13548 + }, + { + "epoch": 0.67, + "grad_norm": 2.0514771938323975, + "learning_rate": 1.65250259400168e-05, + "loss": 5.5724, + "step": 13552 + }, + { + "epoch": 0.67, + "grad_norm": 1.8229410648345947, + "learning_rate": 1.651514402885518e-05, + "loss": 5.4933, + "step": 13556 + }, + { + "epoch": 0.67, + "grad_norm": 1.6895649433135986, + "learning_rate": 1.6505262117693563e-05, + "loss": 5.5462, + "step": 13560 + }, + { + "epoch": 0.67, + "grad_norm": 1.9636627435684204, + "learning_rate": 1.6495380206531942e-05, + "loss": 5.4918, + "step": 13564 + }, + { + "epoch": 0.67, + "grad_norm": 1.728353500366211, + "learning_rate": 1.6485498295370324e-05, + "loss": 5.4126, + "step": 13568 + }, + { + "epoch": 0.67, + "grad_norm": 1.8992080688476562, + "learning_rate": 1.6475616384208706e-05, + "loss": 5.4954, + "step": 13572 + }, + { + "epoch": 0.67, + "grad_norm": 1.8971209526062012, + "learning_rate": 1.6465734473047088e-05, + "loss": 5.5205, + "step": 13576 + }, + { + "epoch": 0.67, + "grad_norm": 1.9099971055984497, + "learning_rate": 1.6455852561885467e-05, + "loss": 5.5401, + "step": 13580 + }, + { + "epoch": 0.67, + "grad_norm": 1.7198883295059204, + "learning_rate": 1.6445970650723852e-05, + "loss": 5.5187, + "step": 13584 + }, + { + "epoch": 0.67, + "grad_norm": 1.854477882385254, + "learning_rate": 1.6436088739562234e-05, + "loss": 5.5558, + "step": 13588 + }, + { + "epoch": 0.67, + "grad_norm": 1.9261908531188965, + "learning_rate": 1.6426206828400613e-05, + "loss": 5.5156, + "step": 13592 + }, + { + "epoch": 0.67, + "grad_norm": 2.166208505630493, + "learning_rate": 1.6416324917238995e-05, + "loss": 5.5002, + "step": 13596 + }, + { + "epoch": 0.67, + "grad_norm": 1.8934109210968018, + "learning_rate": 1.6406443006077377e-05, + "loss": 5.5092, + "step": 13600 + }, + { + "epoch": 0.67, + "grad_norm": 1.830947756767273, + "learning_rate": 1.639656109491576e-05, + "loss": 5.5542, + "step": 13604 + }, + { + "epoch": 0.67, + "grad_norm": 1.86097252368927, + "learning_rate": 1.6386679183754138e-05, + "loss": 5.502, + "step": 13608 + }, + { + "epoch": 0.67, + "grad_norm": 1.9578551054000854, + "learning_rate": 1.637679727259252e-05, + "loss": 5.5351, + "step": 13612 + }, + { + "epoch": 0.67, + "grad_norm": 1.9985218048095703, + "learning_rate": 1.6366915361430902e-05, + "loss": 5.403, + "step": 13616 + }, + { + "epoch": 0.67, + "grad_norm": 2.346946954727173, + "learning_rate": 1.6357033450269284e-05, + "loss": 5.4314, + "step": 13620 + }, + { + "epoch": 0.67, + "grad_norm": 1.9613852500915527, + "learning_rate": 1.6347151539107663e-05, + "loss": 5.4851, + "step": 13624 + }, + { + "epoch": 0.67, + "grad_norm": 2.2844598293304443, + "learning_rate": 1.6337269627946045e-05, + "loss": 5.4646, + "step": 13628 + }, + { + "epoch": 0.67, + "grad_norm": 2.0212109088897705, + "learning_rate": 1.6327387716784427e-05, + "loss": 5.5153, + "step": 13632 + }, + { + "epoch": 0.67, + "grad_norm": 2.0245635509490967, + "learning_rate": 1.6317505805622805e-05, + "loss": 5.5516, + "step": 13636 + }, + { + "epoch": 0.67, + "grad_norm": 2.051738977432251, + "learning_rate": 1.630762389446119e-05, + "loss": 5.6327, + "step": 13640 + }, + { + "epoch": 0.67, + "grad_norm": 2.0657126903533936, + "learning_rate": 1.6297741983299573e-05, + "loss": 5.5882, + "step": 13644 + }, + { + "epoch": 0.67, + "grad_norm": 1.979304313659668, + "learning_rate": 1.628786007213795e-05, + "loss": 5.3553, + "step": 13648 + }, + { + "epoch": 0.67, + "grad_norm": 2.10304856300354, + "learning_rate": 1.6277978160976334e-05, + "loss": 5.5474, + "step": 13652 + }, + { + "epoch": 0.67, + "grad_norm": 2.0482354164123535, + "learning_rate": 1.6268096249814716e-05, + "loss": 5.5004, + "step": 13656 + }, + { + "epoch": 0.67, + "grad_norm": 2.0741443634033203, + "learning_rate": 1.6258214338653098e-05, + "loss": 5.4887, + "step": 13660 + }, + { + "epoch": 0.68, + "grad_norm": 1.9850010871887207, + "learning_rate": 1.6248332427491476e-05, + "loss": 5.4449, + "step": 13664 + }, + { + "epoch": 0.68, + "grad_norm": 2.456059217453003, + "learning_rate": 1.623845051632986e-05, + "loss": 5.4471, + "step": 13668 + }, + { + "epoch": 0.68, + "grad_norm": 1.8275492191314697, + "learning_rate": 1.622856860516824e-05, + "loss": 5.5139, + "step": 13672 + }, + { + "epoch": 0.68, + "grad_norm": 2.166975736618042, + "learning_rate": 1.6218686694006623e-05, + "loss": 5.5475, + "step": 13676 + }, + { + "epoch": 0.68, + "grad_norm": 2.0723702907562256, + "learning_rate": 1.6208804782845e-05, + "loss": 5.6062, + "step": 13680 + }, + { + "epoch": 0.68, + "grad_norm": 2.227518320083618, + "learning_rate": 1.6198922871683383e-05, + "loss": 5.4955, + "step": 13684 + }, + { + "epoch": 0.68, + "grad_norm": 1.9052515029907227, + "learning_rate": 1.6189040960521765e-05, + "loss": 5.5553, + "step": 13688 + }, + { + "epoch": 0.68, + "grad_norm": 1.945573329925537, + "learning_rate": 1.6179159049360147e-05, + "loss": 5.3597, + "step": 13692 + }, + { + "epoch": 0.68, + "grad_norm": 1.874603271484375, + "learning_rate": 1.616927713819853e-05, + "loss": 5.391, + "step": 13696 + }, + { + "epoch": 0.68, + "grad_norm": 2.339505672454834, + "learning_rate": 1.615939522703691e-05, + "loss": 5.4194, + "step": 13700 + }, + { + "epoch": 0.68, + "grad_norm": 1.7062286138534546, + "learning_rate": 1.6149513315875294e-05, + "loss": 5.4586, + "step": 13704 + }, + { + "epoch": 0.68, + "grad_norm": 1.8526628017425537, + "learning_rate": 1.6139631404713672e-05, + "loss": 5.3614, + "step": 13708 + }, + { + "epoch": 0.68, + "grad_norm": 1.8179652690887451, + "learning_rate": 1.6129749493552054e-05, + "loss": 5.4977, + "step": 13712 + }, + { + "epoch": 0.68, + "grad_norm": 1.889673113822937, + "learning_rate": 1.6119867582390436e-05, + "loss": 5.5591, + "step": 13716 + }, + { + "epoch": 0.68, + "grad_norm": 2.039731740951538, + "learning_rate": 1.6109985671228815e-05, + "loss": 5.4809, + "step": 13720 + }, + { + "epoch": 0.68, + "grad_norm": 2.3024516105651855, + "learning_rate": 1.6100103760067197e-05, + "loss": 5.4518, + "step": 13724 + }, + { + "epoch": 0.68, + "grad_norm": 1.9836472272872925, + "learning_rate": 1.609022184890558e-05, + "loss": 5.4812, + "step": 13728 + }, + { + "epoch": 0.68, + "grad_norm": 1.8883979320526123, + "learning_rate": 1.608033993774396e-05, + "loss": 5.4914, + "step": 13732 + }, + { + "epoch": 0.68, + "grad_norm": 1.9850894212722778, + "learning_rate": 1.607045802658234e-05, + "loss": 5.6323, + "step": 13736 + }, + { + "epoch": 0.68, + "grad_norm": 2.124187707901001, + "learning_rate": 1.6060576115420722e-05, + "loss": 5.5609, + "step": 13740 + }, + { + "epoch": 0.68, + "grad_norm": 1.9509978294372559, + "learning_rate": 1.6050694204259104e-05, + "loss": 5.5234, + "step": 13744 + }, + { + "epoch": 0.68, + "grad_norm": 1.9340142011642456, + "learning_rate": 1.6040812293097486e-05, + "loss": 5.491, + "step": 13748 + }, + { + "epoch": 0.68, + "grad_norm": 2.0254647731781006, + "learning_rate": 1.6030930381935865e-05, + "loss": 5.4735, + "step": 13752 + }, + { + "epoch": 0.68, + "grad_norm": 2.143260955810547, + "learning_rate": 1.602104847077425e-05, + "loss": 5.5477, + "step": 13756 + }, + { + "epoch": 0.68, + "grad_norm": 1.6931935548782349, + "learning_rate": 1.6011166559612632e-05, + "loss": 5.5821, + "step": 13760 + }, + { + "epoch": 0.68, + "grad_norm": 1.9342801570892334, + "learning_rate": 1.600128464845101e-05, + "loss": 5.4517, + "step": 13764 + }, + { + "epoch": 0.68, + "grad_norm": 1.8290477991104126, + "learning_rate": 1.5991402737289393e-05, + "loss": 5.4992, + "step": 13768 + }, + { + "epoch": 0.68, + "grad_norm": 2.020256757736206, + "learning_rate": 1.5981520826127775e-05, + "loss": 5.5175, + "step": 13772 + }, + { + "epoch": 0.68, + "grad_norm": 2.106531858444214, + "learning_rate": 1.5971638914966157e-05, + "loss": 5.4482, + "step": 13776 + }, + { + "epoch": 0.68, + "grad_norm": 1.9939652681350708, + "learning_rate": 1.5961757003804536e-05, + "loss": 5.4952, + "step": 13780 + }, + { + "epoch": 0.68, + "grad_norm": 2.0970144271850586, + "learning_rate": 1.5951875092642918e-05, + "loss": 5.5438, + "step": 13784 + }, + { + "epoch": 0.68, + "grad_norm": 1.9521361589431763, + "learning_rate": 1.59419931814813e-05, + "loss": 5.5225, + "step": 13788 + }, + { + "epoch": 0.68, + "grad_norm": 2.0148961544036865, + "learning_rate": 1.593211127031968e-05, + "loss": 5.4785, + "step": 13792 + }, + { + "epoch": 0.68, + "grad_norm": 2.1378774642944336, + "learning_rate": 1.592222935915806e-05, + "loss": 5.5553, + "step": 13796 + }, + { + "epoch": 0.68, + "grad_norm": 2.312819719314575, + "learning_rate": 1.5912347447996443e-05, + "loss": 5.4033, + "step": 13800 + }, + { + "epoch": 0.68, + "grad_norm": 2.2060418128967285, + "learning_rate": 1.5902465536834825e-05, + "loss": 5.4713, + "step": 13804 + }, + { + "epoch": 0.68, + "grad_norm": 1.8953157663345337, + "learning_rate": 1.5892583625673203e-05, + "loss": 5.5035, + "step": 13808 + }, + { + "epoch": 0.68, + "grad_norm": 1.7217421531677246, + "learning_rate": 1.588270171451159e-05, + "loss": 5.6139, + "step": 13812 + }, + { + "epoch": 0.68, + "grad_norm": 1.9969755411148071, + "learning_rate": 1.587281980334997e-05, + "loss": 5.44, + "step": 13816 + }, + { + "epoch": 0.68, + "grad_norm": 2.071129560470581, + "learning_rate": 1.586293789218835e-05, + "loss": 5.4202, + "step": 13820 + }, + { + "epoch": 0.68, + "grad_norm": 1.8004354238510132, + "learning_rate": 1.5853055981026732e-05, + "loss": 5.5286, + "step": 13824 + }, + { + "epoch": 0.68, + "grad_norm": 2.0194168090820312, + "learning_rate": 1.5843174069865114e-05, + "loss": 5.426, + "step": 13828 + }, + { + "epoch": 0.68, + "grad_norm": 1.7910535335540771, + "learning_rate": 1.5833292158703496e-05, + "loss": 5.529, + "step": 13832 + }, + { + "epoch": 0.68, + "grad_norm": 2.0648398399353027, + "learning_rate": 1.5823410247541874e-05, + "loss": 5.5632, + "step": 13836 + }, + { + "epoch": 0.68, + "grad_norm": 1.881971001625061, + "learning_rate": 1.5813528336380257e-05, + "loss": 5.6164, + "step": 13840 + }, + { + "epoch": 0.68, + "grad_norm": 2.088383913040161, + "learning_rate": 1.580364642521864e-05, + "loss": 5.5221, + "step": 13844 + }, + { + "epoch": 0.68, + "grad_norm": 1.9117428064346313, + "learning_rate": 1.5793764514057017e-05, + "loss": 5.6231, + "step": 13848 + }, + { + "epoch": 0.68, + "grad_norm": 2.0459342002868652, + "learning_rate": 1.57838826028954e-05, + "loss": 5.3971, + "step": 13852 + }, + { + "epoch": 0.68, + "grad_norm": 1.8177950382232666, + "learning_rate": 1.577400069173378e-05, + "loss": 5.4996, + "step": 13856 + }, + { + "epoch": 0.68, + "grad_norm": 2.090265989303589, + "learning_rate": 1.5764118780572163e-05, + "loss": 5.5479, + "step": 13860 + }, + { + "epoch": 0.69, + "grad_norm": 2.1461987495422363, + "learning_rate": 1.5754236869410542e-05, + "loss": 5.4959, + "step": 13864 + }, + { + "epoch": 0.69, + "grad_norm": 2.008942127227783, + "learning_rate": 1.5744354958248928e-05, + "loss": 5.3582, + "step": 13868 + }, + { + "epoch": 0.69, + "grad_norm": 1.7721225023269653, + "learning_rate": 1.573447304708731e-05, + "loss": 5.4618, + "step": 13872 + }, + { + "epoch": 0.69, + "grad_norm": 2.161746025085449, + "learning_rate": 1.572459113592569e-05, + "loss": 5.5163, + "step": 13876 + }, + { + "epoch": 0.69, + "grad_norm": 2.0746963024139404, + "learning_rate": 1.571470922476407e-05, + "loss": 5.4898, + "step": 13880 + }, + { + "epoch": 0.69, + "grad_norm": Infinity, + "learning_rate": 1.5707297791392857e-05, + "loss": 5.5354, + "step": 13884 + }, + { + "epoch": 0.69, + "grad_norm": 2.0238466262817383, + "learning_rate": 1.5697415880231236e-05, + "loss": 5.5099, + "step": 13888 + }, + { + "epoch": 0.69, + "grad_norm": 1.9300113916397095, + "learning_rate": 1.5687533969069618e-05, + "loss": 5.682, + "step": 13892 + }, + { + "epoch": 0.69, + "grad_norm": 1.9683018922805786, + "learning_rate": 1.5677652057908e-05, + "loss": 5.5058, + "step": 13896 + }, + { + "epoch": 0.69, + "grad_norm": 1.975710391998291, + "learning_rate": 1.5667770146746382e-05, + "loss": 5.5139, + "step": 13900 + }, + { + "epoch": 0.69, + "grad_norm": 1.9731028079986572, + "learning_rate": 1.5657888235584764e-05, + "loss": 5.565, + "step": 13904 + }, + { + "epoch": 0.69, + "grad_norm": 2.0347113609313965, + "learning_rate": 1.5648006324423146e-05, + "loss": 5.6587, + "step": 13908 + }, + { + "epoch": 0.69, + "grad_norm": 1.9666558504104614, + "learning_rate": 1.5638124413261525e-05, + "loss": 5.6072, + "step": 13912 + }, + { + "epoch": 0.69, + "grad_norm": 1.808203935623169, + "learning_rate": 1.5628242502099907e-05, + "loss": 5.4888, + "step": 13916 + }, + { + "epoch": 0.69, + "grad_norm": 2.0783326625823975, + "learning_rate": 1.561836059093829e-05, + "loss": 5.6407, + "step": 13920 + }, + { + "epoch": 0.69, + "grad_norm": 2.006601572036743, + "learning_rate": 1.560847867977667e-05, + "loss": 5.5085, + "step": 13924 + }, + { + "epoch": 0.69, + "grad_norm": 1.8633679151535034, + "learning_rate": 1.559859676861505e-05, + "loss": 5.4967, + "step": 13928 + }, + { + "epoch": 0.69, + "grad_norm": 1.9505640268325806, + "learning_rate": 1.558871485745343e-05, + "loss": 5.4678, + "step": 13932 + }, + { + "epoch": 0.69, + "grad_norm": 2.0986247062683105, + "learning_rate": 1.5578832946291814e-05, + "loss": 5.4932, + "step": 13936 + }, + { + "epoch": 0.69, + "grad_norm": 2.256755828857422, + "learning_rate": 1.5568951035130196e-05, + "loss": 5.5958, + "step": 13940 + }, + { + "epoch": 0.69, + "grad_norm": 2.2147109508514404, + "learning_rate": 1.5559069123968574e-05, + "loss": 5.5194, + "step": 13944 + }, + { + "epoch": 0.69, + "grad_norm": 1.88095223903656, + "learning_rate": 1.5549187212806956e-05, + "loss": 5.4609, + "step": 13948 + }, + { + "epoch": 0.69, + "grad_norm": 1.867007851600647, + "learning_rate": 1.553930530164534e-05, + "loss": 5.4802, + "step": 13952 + }, + { + "epoch": 0.69, + "grad_norm": 2.029303789138794, + "learning_rate": 1.552942339048372e-05, + "loss": 5.5337, + "step": 13956 + }, + { + "epoch": 0.69, + "grad_norm": 1.9142879247665405, + "learning_rate": 1.5519541479322103e-05, + "loss": 5.4932, + "step": 13960 + }, + { + "epoch": 0.69, + "grad_norm": 1.8500617742538452, + "learning_rate": 1.5509659568160485e-05, + "loss": 5.4916, + "step": 13964 + }, + { + "epoch": 0.69, + "grad_norm": 2.0694692134857178, + "learning_rate": 1.5499777656998867e-05, + "loss": 5.4766, + "step": 13968 + }, + { + "epoch": 0.69, + "grad_norm": 2.122901201248169, + "learning_rate": 1.5489895745837245e-05, + "loss": 5.6467, + "step": 13972 + }, + { + "epoch": 0.69, + "grad_norm": 2.108297109603882, + "learning_rate": 1.5480013834675627e-05, + "loss": 5.5902, + "step": 13976 + }, + { + "epoch": 0.69, + "grad_norm": 1.9898920059204102, + "learning_rate": 1.547013192351401e-05, + "loss": 5.5772, + "step": 13980 + }, + { + "epoch": 0.69, + "grad_norm": 2.2552645206451416, + "learning_rate": 1.5460250012352388e-05, + "loss": 5.507, + "step": 13984 + }, + { + "epoch": 0.69, + "grad_norm": 1.9198163747787476, + "learning_rate": 1.545036810119077e-05, + "loss": 5.4749, + "step": 13988 + }, + { + "epoch": 0.69, + "grad_norm": 2.101717233657837, + "learning_rate": 1.5440486190029152e-05, + "loss": 5.444, + "step": 13992 + }, + { + "epoch": 0.69, + "grad_norm": 2.1184744834899902, + "learning_rate": 1.5430604278867534e-05, + "loss": 5.6017, + "step": 13996 + }, + { + "epoch": 0.69, + "grad_norm": 1.7170560359954834, + "learning_rate": 1.5420722367705913e-05, + "loss": 5.4235, + "step": 14000 + }, + { + "epoch": 0.69, + "grad_norm": 2.1068732738494873, + "learning_rate": 1.5410840456544295e-05, + "loss": 5.504, + "step": 14004 + }, + { + "epoch": 0.69, + "grad_norm": 2.275245189666748, + "learning_rate": 1.5400958545382677e-05, + "loss": 5.5617, + "step": 14008 + }, + { + "epoch": 0.69, + "grad_norm": 2.2603347301483154, + "learning_rate": 1.539107663422106e-05, + "loss": 5.4309, + "step": 14012 + }, + { + "epoch": 0.69, + "grad_norm": 2.082984685897827, + "learning_rate": 1.538119472305944e-05, + "loss": 5.408, + "step": 14016 + }, + { + "epoch": 0.69, + "grad_norm": 1.8457398414611816, + "learning_rate": 1.5371312811897823e-05, + "loss": 5.4611, + "step": 14020 + }, + { + "epoch": 0.69, + "grad_norm": 2.0834155082702637, + "learning_rate": 1.5361430900736205e-05, + "loss": 5.4901, + "step": 14024 + }, + { + "epoch": 0.69, + "grad_norm": 1.9145028591156006, + "learning_rate": 1.5351548989574584e-05, + "loss": 5.5115, + "step": 14028 + }, + { + "epoch": 0.69, + "grad_norm": 1.7850406169891357, + "learning_rate": 1.5341667078412966e-05, + "loss": 5.4384, + "step": 14032 + }, + { + "epoch": 0.69, + "grad_norm": 1.964521884918213, + "learning_rate": 1.5331785167251348e-05, + "loss": 5.3728, + "step": 14036 + }, + { + "epoch": 0.69, + "grad_norm": 2.125605821609497, + "learning_rate": 1.532190325608973e-05, + "loss": 5.4972, + "step": 14040 + }, + { + "epoch": 0.69, + "grad_norm": 1.7855336666107178, + "learning_rate": 1.531202134492811e-05, + "loss": 5.5624, + "step": 14044 + }, + { + "epoch": 0.69, + "grad_norm": 1.9751635789871216, + "learning_rate": 1.530213943376649e-05, + "loss": 5.4388, + "step": 14048 + }, + { + "epoch": 0.69, + "grad_norm": 1.943023681640625, + "learning_rate": 1.5292257522604873e-05, + "loss": 5.4861, + "step": 14052 + }, + { + "epoch": 0.69, + "grad_norm": 1.7670375108718872, + "learning_rate": 1.528237561144325e-05, + "loss": 5.5438, + "step": 14056 + }, + { + "epoch": 0.69, + "grad_norm": 2.040696859359741, + "learning_rate": 1.5272493700281634e-05, + "loss": 5.4905, + "step": 14060 + }, + { + "epoch": 0.69, + "grad_norm": 2.2104179859161377, + "learning_rate": 1.5262611789120016e-05, + "loss": 5.4197, + "step": 14064 + }, + { + "epoch": 0.7, + "grad_norm": 2.070598840713501, + "learning_rate": 1.5252729877958396e-05, + "loss": 5.3342, + "step": 14068 + }, + { + "epoch": 0.7, + "grad_norm": 2.1541106700897217, + "learning_rate": 1.524284796679678e-05, + "loss": 5.5594, + "step": 14072 + }, + { + "epoch": 0.7, + "grad_norm": 1.8858451843261719, + "learning_rate": 1.5232966055635162e-05, + "loss": 5.5494, + "step": 14076 + }, + { + "epoch": 0.7, + "grad_norm": 2.0127689838409424, + "learning_rate": 1.5223084144473542e-05, + "loss": 5.5005, + "step": 14080 + }, + { + "epoch": 0.7, + "grad_norm": 1.70602548122406, + "learning_rate": 1.5213202233311924e-05, + "loss": 5.4694, + "step": 14084 + }, + { + "epoch": 0.7, + "grad_norm": 2.1480917930603027, + "learning_rate": 1.5203320322150305e-05, + "loss": 5.5891, + "step": 14088 + }, + { + "epoch": 0.7, + "grad_norm": 1.8127115964889526, + "learning_rate": 1.5193438410988687e-05, + "loss": 5.497, + "step": 14092 + }, + { + "epoch": 0.7, + "grad_norm": 1.768389105796814, + "learning_rate": 1.5183556499827067e-05, + "loss": 5.5075, + "step": 14096 + }, + { + "epoch": 0.7, + "grad_norm": 2.2110512256622314, + "learning_rate": 1.517367458866545e-05, + "loss": 5.4944, + "step": 14100 + }, + { + "epoch": 0.7, + "grad_norm": 2.2322652339935303, + "learning_rate": 1.516379267750383e-05, + "loss": 5.5072, + "step": 14104 + }, + { + "epoch": 0.7, + "grad_norm": 2.103665351867676, + "learning_rate": 1.5153910766342212e-05, + "loss": 5.5093, + "step": 14108 + }, + { + "epoch": 0.7, + "grad_norm": 2.0604758262634277, + "learning_rate": 1.5144028855180592e-05, + "loss": 5.4539, + "step": 14112 + }, + { + "epoch": 0.7, + "grad_norm": 1.8728346824645996, + "learning_rate": 1.5134146944018972e-05, + "loss": 5.401, + "step": 14116 + }, + { + "epoch": 0.7, + "grad_norm": 2.031858444213867, + "learning_rate": 1.5124265032857354e-05, + "loss": 5.499, + "step": 14120 + }, + { + "epoch": 0.7, + "grad_norm": 1.828974962234497, + "learning_rate": 1.5114383121695735e-05, + "loss": 5.4467, + "step": 14124 + }, + { + "epoch": 0.7, + "grad_norm": 1.8760355710983276, + "learning_rate": 1.5104501210534119e-05, + "loss": 5.3958, + "step": 14128 + }, + { + "epoch": 0.7, + "grad_norm": 1.8751137256622314, + "learning_rate": 1.50946192993725e-05, + "loss": 5.4515, + "step": 14132 + }, + { + "epoch": 0.7, + "grad_norm": 1.9187356233596802, + "learning_rate": 1.5084737388210881e-05, + "loss": 5.5581, + "step": 14136 + }, + { + "epoch": 0.7, + "grad_norm": 2.1077044010162354, + "learning_rate": 1.5074855477049263e-05, + "loss": 5.5018, + "step": 14140 + }, + { + "epoch": 0.7, + "grad_norm": 1.9304084777832031, + "learning_rate": 1.5064973565887643e-05, + "loss": 5.6017, + "step": 14144 + }, + { + "epoch": 0.7, + "grad_norm": 1.7740778923034668, + "learning_rate": 1.5055091654726025e-05, + "loss": 5.5123, + "step": 14148 + }, + { + "epoch": 0.7, + "grad_norm": 1.9732328653335571, + "learning_rate": 1.5045209743564406e-05, + "loss": 5.4148, + "step": 14152 + }, + { + "epoch": 0.7, + "grad_norm": 1.98763906955719, + "learning_rate": 1.5035327832402788e-05, + "loss": 5.5671, + "step": 14156 + }, + { + "epoch": 0.7, + "grad_norm": 2.025677442550659, + "learning_rate": 1.5025445921241168e-05, + "loss": 5.5388, + "step": 14160 + }, + { + "epoch": 0.7, + "grad_norm": 2.1304235458374023, + "learning_rate": 1.501556401007955e-05, + "loss": 5.549, + "step": 14164 + }, + { + "epoch": 0.7, + "grad_norm": 1.8666431903839111, + "learning_rate": 1.500568209891793e-05, + "loss": 5.5133, + "step": 14168 + }, + { + "epoch": 0.7, + "grad_norm": 2.001925468444824, + "learning_rate": 1.4995800187756313e-05, + "loss": 5.4878, + "step": 14172 + }, + { + "epoch": 0.7, + "grad_norm": 2.0786337852478027, + "learning_rate": 1.4985918276594693e-05, + "loss": 5.5475, + "step": 14176 + }, + { + "epoch": 0.7, + "grad_norm": 2.0157644748687744, + "learning_rate": 1.4976036365433073e-05, + "loss": 5.4686, + "step": 14180 + }, + { + "epoch": 0.7, + "grad_norm": 2.1091387271881104, + "learning_rate": 1.4966154454271456e-05, + "loss": 5.5175, + "step": 14184 + }, + { + "epoch": 0.7, + "grad_norm": 1.8892532587051392, + "learning_rate": 1.495627254310984e-05, + "loss": 5.57, + "step": 14188 + }, + { + "epoch": 0.7, + "grad_norm": 2.0681941509246826, + "learning_rate": 1.4946390631948221e-05, + "loss": 5.4503, + "step": 14192 + }, + { + "epoch": 0.7, + "grad_norm": 2.028324842453003, + "learning_rate": 1.4936508720786602e-05, + "loss": 5.5231, + "step": 14196 + }, + { + "epoch": 0.7, + "grad_norm": 1.8590697050094604, + "learning_rate": 1.4926626809624982e-05, + "loss": 5.4046, + "step": 14200 + }, + { + "epoch": 0.7, + "grad_norm": 2.1684091091156006, + "learning_rate": 1.4916744898463364e-05, + "loss": 5.5428, + "step": 14204 + }, + { + "epoch": 0.7, + "grad_norm": 2.4600932598114014, + "learning_rate": 1.4906862987301745e-05, + "loss": 5.4973, + "step": 14208 + }, + { + "epoch": 0.7, + "grad_norm": 2.182546615600586, + "learning_rate": 1.4896981076140127e-05, + "loss": 5.5168, + "step": 14212 + }, + { + "epoch": 0.7, + "grad_norm": 2.0207765102386475, + "learning_rate": 1.4887099164978507e-05, + "loss": 5.4892, + "step": 14216 + }, + { + "epoch": 0.7, + "grad_norm": 2.07978892326355, + "learning_rate": 1.4877217253816889e-05, + "loss": 5.5074, + "step": 14220 + }, + { + "epoch": 0.7, + "grad_norm": 2.0122103691101074, + "learning_rate": 1.486733534265527e-05, + "loss": 5.5007, + "step": 14224 + }, + { + "epoch": 0.7, + "grad_norm": 1.951704502105713, + "learning_rate": 1.4857453431493651e-05, + "loss": 5.5281, + "step": 14228 + }, + { + "epoch": 0.7, + "grad_norm": 1.903686761856079, + "learning_rate": 1.4847571520332032e-05, + "loss": 5.483, + "step": 14232 + }, + { + "epoch": 0.7, + "grad_norm": 1.7512837648391724, + "learning_rate": 1.4837689609170414e-05, + "loss": 5.5026, + "step": 14236 + }, + { + "epoch": 0.7, + "grad_norm": 1.9716506004333496, + "learning_rate": 1.4827807698008794e-05, + "loss": 5.5547, + "step": 14240 + }, + { + "epoch": 0.7, + "grad_norm": 2.1064813137054443, + "learning_rate": 1.4817925786847178e-05, + "loss": 5.6868, + "step": 14244 + }, + { + "epoch": 0.7, + "grad_norm": 1.920245885848999, + "learning_rate": 1.480804387568556e-05, + "loss": 5.5195, + "step": 14248 + }, + { + "epoch": 0.7, + "grad_norm": 1.8807624578475952, + "learning_rate": 1.479816196452394e-05, + "loss": 5.5605, + "step": 14252 + }, + { + "epoch": 0.7, + "grad_norm": 2.1558022499084473, + "learning_rate": 1.4788280053362322e-05, + "loss": 5.54, + "step": 14256 + }, + { + "epoch": 0.7, + "grad_norm": 1.9825403690338135, + "learning_rate": 1.4778398142200703e-05, + "loss": 5.5888, + "step": 14260 + }, + { + "epoch": 0.7, + "grad_norm": 1.974697232246399, + "learning_rate": 1.4768516231039083e-05, + "loss": 5.5791, + "step": 14264 + }, + { + "epoch": 0.7, + "grad_norm": 2.0667243003845215, + "learning_rate": 1.4758634319877465e-05, + "loss": 5.4354, + "step": 14268 + }, + { + "epoch": 0.71, + "grad_norm": 1.8902255296707153, + "learning_rate": 1.4748752408715846e-05, + "loss": 5.5167, + "step": 14272 + }, + { + "epoch": 0.71, + "grad_norm": 1.882251501083374, + "learning_rate": 1.4738870497554228e-05, + "loss": 5.4551, + "step": 14276 + }, + { + "epoch": 0.71, + "grad_norm": 2.0265276432037354, + "learning_rate": 1.4728988586392608e-05, + "loss": 5.3989, + "step": 14280 + }, + { + "epoch": 0.71, + "grad_norm": 2.1499834060668945, + "learning_rate": 1.471910667523099e-05, + "loss": 5.3913, + "step": 14284 + }, + { + "epoch": 0.71, + "grad_norm": 1.9374068975448608, + "learning_rate": 1.470922476406937e-05, + "loss": 5.4951, + "step": 14288 + }, + { + "epoch": 0.71, + "grad_norm": 2.04496693611145, + "learning_rate": 1.4699342852907753e-05, + "loss": 5.5568, + "step": 14292 + }, + { + "epoch": 0.71, + "grad_norm": 2.1718828678131104, + "learning_rate": 1.4689460941746133e-05, + "loss": 5.4003, + "step": 14296 + }, + { + "epoch": 0.71, + "grad_norm": 2.170172929763794, + "learning_rate": 1.4679579030584517e-05, + "loss": 5.4535, + "step": 14300 + }, + { + "epoch": 0.71, + "grad_norm": 1.9548550844192505, + "learning_rate": 1.4669697119422899e-05, + "loss": 5.5077, + "step": 14304 + }, + { + "epoch": 0.71, + "grad_norm": 1.7335338592529297, + "learning_rate": 1.4659815208261279e-05, + "loss": 5.4265, + "step": 14308 + }, + { + "epoch": 0.71, + "grad_norm": 2.0178778171539307, + "learning_rate": 1.4649933297099661e-05, + "loss": 5.3215, + "step": 14312 + }, + { + "epoch": 0.71, + "grad_norm": 2.198841094970703, + "learning_rate": 1.4640051385938042e-05, + "loss": 5.5392, + "step": 14316 + }, + { + "epoch": 0.71, + "grad_norm": 2.093581438064575, + "learning_rate": 1.4630169474776424e-05, + "loss": 5.4572, + "step": 14320 + }, + { + "epoch": 0.71, + "grad_norm": 1.7876406908035278, + "learning_rate": 1.4620287563614804e-05, + "loss": 5.3976, + "step": 14324 + }, + { + "epoch": 0.71, + "grad_norm": 2.3379602432250977, + "learning_rate": 1.4610405652453184e-05, + "loss": 5.5972, + "step": 14328 + }, + { + "epoch": 0.71, + "grad_norm": 2.0745928287506104, + "learning_rate": 1.4600523741291566e-05, + "loss": 5.4489, + "step": 14332 + }, + { + "epoch": 0.71, + "grad_norm": 1.9868686199188232, + "learning_rate": 1.4590641830129947e-05, + "loss": 5.5921, + "step": 14336 + }, + { + "epoch": 0.71, + "grad_norm": 2.052441358566284, + "learning_rate": 1.4580759918968329e-05, + "loss": 5.5621, + "step": 14340 + }, + { + "epoch": 0.71, + "grad_norm": 2.250218629837036, + "learning_rate": 1.457087800780671e-05, + "loss": 5.3768, + "step": 14344 + }, + { + "epoch": 0.71, + "grad_norm": 2.0471935272216797, + "learning_rate": 1.4560996096645091e-05, + "loss": 5.5863, + "step": 14348 + }, + { + "epoch": 0.71, + "grad_norm": 2.0607481002807617, + "learning_rate": 1.4551114185483472e-05, + "loss": 5.5404, + "step": 14352 + }, + { + "epoch": 0.71, + "grad_norm": 1.7724336385726929, + "learning_rate": 1.4541232274321854e-05, + "loss": 5.4719, + "step": 14356 + }, + { + "epoch": 0.71, + "grad_norm": 2.1439015865325928, + "learning_rate": 1.4531350363160237e-05, + "loss": 5.5045, + "step": 14360 + }, + { + "epoch": 0.71, + "grad_norm": 2.067152500152588, + "learning_rate": 1.4521468451998618e-05, + "loss": 5.5518, + "step": 14364 + }, + { + "epoch": 0.71, + "grad_norm": 1.8666131496429443, + "learning_rate": 1.4511586540837e-05, + "loss": 5.4791, + "step": 14368 + }, + { + "epoch": 0.71, + "grad_norm": 1.9369677305221558, + "learning_rate": 1.450170462967538e-05, + "loss": 5.4379, + "step": 14372 + }, + { + "epoch": 0.71, + "grad_norm": 2.0378055572509766, + "learning_rate": 1.4491822718513762e-05, + "loss": 5.4874, + "step": 14376 + }, + { + "epoch": 0.71, + "grad_norm": 1.8923200368881226, + "learning_rate": 1.4481940807352143e-05, + "loss": 5.5813, + "step": 14380 + }, + { + "epoch": 0.71, + "grad_norm": 1.9572350978851318, + "learning_rate": 1.4472058896190525e-05, + "loss": 5.5161, + "step": 14384 + }, + { + "epoch": 0.71, + "grad_norm": 1.9131194353103638, + "learning_rate": 1.4462176985028905e-05, + "loss": 5.6163, + "step": 14388 + }, + { + "epoch": 0.71, + "grad_norm": 2.5276191234588623, + "learning_rate": 1.4452295073867287e-05, + "loss": 5.5442, + "step": 14392 + }, + { + "epoch": 0.71, + "grad_norm": 2.0798611640930176, + "learning_rate": 1.4442413162705667e-05, + "loss": 5.6086, + "step": 14396 + }, + { + "epoch": 0.71, + "grad_norm": 2.1294023990631104, + "learning_rate": 1.4432531251544048e-05, + "loss": 5.3953, + "step": 14400 + }, + { + "epoch": 0.71, + "grad_norm": 2.01753568649292, + "learning_rate": 1.442264934038243e-05, + "loss": 5.537, + "step": 14404 + }, + { + "epoch": 0.71, + "grad_norm": 2.0679104328155518, + "learning_rate": 1.441276742922081e-05, + "loss": 5.4632, + "step": 14408 + }, + { + "epoch": 0.71, + "grad_norm": 1.921613097190857, + "learning_rate": 1.4402885518059192e-05, + "loss": 5.2852, + "step": 14412 + }, + { + "epoch": 0.71, + "grad_norm": 1.9771466255187988, + "learning_rate": 1.4393003606897576e-05, + "loss": 5.5808, + "step": 14416 + }, + { + "epoch": 0.71, + "grad_norm": 1.8879215717315674, + "learning_rate": 1.4383121695735956e-05, + "loss": 5.4957, + "step": 14420 + }, + { + "epoch": 0.71, + "grad_norm": 2.0186474323272705, + "learning_rate": 1.4373239784574339e-05, + "loss": 5.4864, + "step": 14424 + }, + { + "epoch": 0.71, + "grad_norm": 2.147420883178711, + "learning_rate": 1.4363357873412719e-05, + "loss": 5.5098, + "step": 14428 + }, + { + "epoch": 0.71, + "grad_norm": 1.9963911771774292, + "learning_rate": 1.4353475962251101e-05, + "loss": 5.3886, + "step": 14432 + }, + { + "epoch": 0.71, + "grad_norm": 2.03368878364563, + "learning_rate": 1.4343594051089481e-05, + "loss": 5.4533, + "step": 14436 + }, + { + "epoch": 0.71, + "grad_norm": 2.274022340774536, + "learning_rate": 1.4333712139927863e-05, + "loss": 5.5222, + "step": 14440 + }, + { + "epoch": 0.71, + "grad_norm": 2.0925748348236084, + "learning_rate": 1.4323830228766244e-05, + "loss": 5.3865, + "step": 14444 + }, + { + "epoch": 0.71, + "grad_norm": 2.25508451461792, + "learning_rate": 1.4313948317604626e-05, + "loss": 5.388, + "step": 14448 + }, + { + "epoch": 0.71, + "grad_norm": 1.9655667543411255, + "learning_rate": 1.4304066406443006e-05, + "loss": 5.5274, + "step": 14452 + }, + { + "epoch": 0.71, + "grad_norm": 1.7187186479568481, + "learning_rate": 1.4294184495281388e-05, + "loss": 5.5024, + "step": 14456 + }, + { + "epoch": 0.71, + "grad_norm": 1.9573662281036377, + "learning_rate": 1.4284302584119769e-05, + "loss": 5.5709, + "step": 14460 + }, + { + "epoch": 0.71, + "grad_norm": 1.9070782661437988, + "learning_rate": 1.4274420672958149e-05, + "loss": 5.5638, + "step": 14464 + }, + { + "epoch": 0.71, + "grad_norm": 1.99477219581604, + "learning_rate": 1.4264538761796531e-05, + "loss": 5.554, + "step": 14468 + }, + { + "epoch": 0.72, + "grad_norm": 2.1351230144500732, + "learning_rate": 1.4254656850634915e-05, + "loss": 5.4401, + "step": 14472 + }, + { + "epoch": 0.72, + "grad_norm": 2.12768292427063, + "learning_rate": 1.4244774939473297e-05, + "loss": 5.5931, + "step": 14476 + }, + { + "epoch": 0.72, + "grad_norm": 1.834502100944519, + "learning_rate": 1.4234893028311677e-05, + "loss": 5.3383, + "step": 14480 + }, + { + "epoch": 0.72, + "grad_norm": 1.9401111602783203, + "learning_rate": 1.4225011117150058e-05, + "loss": 5.5206, + "step": 14484 + }, + { + "epoch": 0.72, + "grad_norm": 2.1304268836975098, + "learning_rate": 1.421512920598844e-05, + "loss": 5.4791, + "step": 14488 + }, + { + "epoch": 0.72, + "grad_norm": 1.8855656385421753, + "learning_rate": 1.420524729482682e-05, + "loss": 5.4404, + "step": 14492 + }, + { + "epoch": 0.72, + "grad_norm": 2.1601414680480957, + "learning_rate": 1.4195365383665202e-05, + "loss": 5.509, + "step": 14496 + }, + { + "epoch": 0.72, + "grad_norm": 2.054011821746826, + "learning_rate": 1.4185483472503582e-05, + "loss": 5.5097, + "step": 14500 + }, + { + "epoch": 0.72, + "grad_norm": 2.2468581199645996, + "learning_rate": 1.4175601561341964e-05, + "loss": 5.5658, + "step": 14504 + }, + { + "epoch": 0.72, + "grad_norm": 1.8028780221939087, + "learning_rate": 1.4165719650180345e-05, + "loss": 5.4528, + "step": 14508 + }, + { + "epoch": 0.72, + "grad_norm": 1.7557624578475952, + "learning_rate": 1.4155837739018727e-05, + "loss": 5.4267, + "step": 14512 + }, + { + "epoch": 0.72, + "grad_norm": 2.2908835411071777, + "learning_rate": 1.4145955827857107e-05, + "loss": 5.5676, + "step": 14516 + }, + { + "epoch": 0.72, + "grad_norm": 1.8863370418548584, + "learning_rate": 1.413607391669549e-05, + "loss": 5.5873, + "step": 14520 + }, + { + "epoch": 0.72, + "grad_norm": 1.9381186962127686, + "learning_rate": 1.412619200553387e-05, + "loss": 5.5573, + "step": 14524 + }, + { + "epoch": 0.72, + "grad_norm": 1.9330207109451294, + "learning_rate": 1.4116310094372252e-05, + "loss": 5.3673, + "step": 14528 + }, + { + "epoch": 0.72, + "grad_norm": 2.0471458435058594, + "learning_rate": 1.4106428183210636e-05, + "loss": 5.4435, + "step": 14532 + }, + { + "epoch": 0.72, + "grad_norm": 1.922804594039917, + "learning_rate": 1.4096546272049016e-05, + "loss": 5.4511, + "step": 14536 + }, + { + "epoch": 0.72, + "grad_norm": 1.946153998374939, + "learning_rate": 1.4086664360887398e-05, + "loss": 5.4641, + "step": 14540 + }, + { + "epoch": 0.72, + "grad_norm": 2.0340843200683594, + "learning_rate": 1.4076782449725778e-05, + "loss": 5.5205, + "step": 14544 + }, + { + "epoch": 0.72, + "grad_norm": 2.218997001647949, + "learning_rate": 1.4066900538564159e-05, + "loss": 5.4572, + "step": 14548 + }, + { + "epoch": 0.72, + "grad_norm": 1.8929450511932373, + "learning_rate": 1.405701862740254e-05, + "loss": 5.4428, + "step": 14552 + }, + { + "epoch": 0.72, + "grad_norm": 1.828822374343872, + "learning_rate": 1.4047136716240921e-05, + "loss": 5.4737, + "step": 14556 + }, + { + "epoch": 0.72, + "grad_norm": 2.1129636764526367, + "learning_rate": 1.4037254805079303e-05, + "loss": 5.4952, + "step": 14560 + }, + { + "epoch": 0.72, + "grad_norm": 2.0485777854919434, + "learning_rate": 1.4027372893917684e-05, + "loss": 5.5007, + "step": 14564 + }, + { + "epoch": 0.72, + "grad_norm": 2.2085585594177246, + "learning_rate": 1.4017490982756066e-05, + "loss": 5.5036, + "step": 14568 + }, + { + "epoch": 0.72, + "grad_norm": 2.0381059646606445, + "learning_rate": 1.4007609071594446e-05, + "loss": 5.5335, + "step": 14572 + }, + { + "epoch": 0.72, + "grad_norm": 1.994460940361023, + "learning_rate": 1.3997727160432828e-05, + "loss": 5.5262, + "step": 14576 + }, + { + "epoch": 0.72, + "grad_norm": 1.9527006149291992, + "learning_rate": 1.3987845249271208e-05, + "loss": 5.4994, + "step": 14580 + }, + { + "epoch": 0.72, + "grad_norm": 2.07161808013916, + "learning_rate": 1.397796333810959e-05, + "loss": 5.5022, + "step": 14584 + }, + { + "epoch": 0.72, + "grad_norm": 2.083160638809204, + "learning_rate": 1.3968081426947974e-05, + "loss": 5.46, + "step": 14588 + }, + { + "epoch": 0.72, + "grad_norm": 1.9139795303344727, + "learning_rate": 1.3958199515786355e-05, + "loss": 5.3918, + "step": 14592 + }, + { + "epoch": 0.72, + "grad_norm": 2.016361951828003, + "learning_rate": 1.3948317604624737e-05, + "loss": 5.3577, + "step": 14596 + }, + { + "epoch": 0.72, + "grad_norm": 1.9085208177566528, + "learning_rate": 1.3938435693463117e-05, + "loss": 5.5672, + "step": 14600 + }, + { + "epoch": 0.72, + "grad_norm": 2.156883716583252, + "learning_rate": 1.3928553782301499e-05, + "loss": 5.4402, + "step": 14604 + }, + { + "epoch": 0.72, + "grad_norm": 2.1130199432373047, + "learning_rate": 1.391867187113988e-05, + "loss": 5.5375, + "step": 14608 + }, + { + "epoch": 0.72, + "grad_norm": 2.0779714584350586, + "learning_rate": 1.3908789959978261e-05, + "loss": 5.4568, + "step": 14612 + }, + { + "epoch": 0.72, + "grad_norm": 2.0961239337921143, + "learning_rate": 1.3898908048816642e-05, + "loss": 5.5331, + "step": 14616 + }, + { + "epoch": 0.72, + "grad_norm": 1.9321538209915161, + "learning_rate": 1.3889026137655022e-05, + "loss": 5.4607, + "step": 14620 + }, + { + "epoch": 0.72, + "grad_norm": 2.171811103820801, + "learning_rate": 1.3879144226493404e-05, + "loss": 5.5705, + "step": 14624 + }, + { + "epoch": 0.72, + "grad_norm": 2.0111582279205322, + "learning_rate": 1.3869262315331785e-05, + "loss": 5.5365, + "step": 14628 + }, + { + "epoch": 0.72, + "grad_norm": 2.2230725288391113, + "learning_rate": 1.3859380404170167e-05, + "loss": 5.4544, + "step": 14632 + }, + { + "epoch": 0.72, + "grad_norm": 2.1781952381134033, + "learning_rate": 1.3849498493008547e-05, + "loss": 5.5207, + "step": 14636 + }, + { + "epoch": 0.72, + "grad_norm": 2.1648108959198, + "learning_rate": 1.3839616581846929e-05, + "loss": 5.542, + "step": 14640 + }, + { + "epoch": 0.72, + "grad_norm": 2.008898973464966, + "learning_rate": 1.3829734670685313e-05, + "loss": 5.4819, + "step": 14644 + }, + { + "epoch": 0.72, + "grad_norm": 1.8579167127609253, + "learning_rate": 1.3819852759523693e-05, + "loss": 5.5307, + "step": 14648 + }, + { + "epoch": 0.72, + "grad_norm": 1.951011300086975, + "learning_rate": 1.3809970848362075e-05, + "loss": 5.5083, + "step": 14652 + }, + { + "epoch": 0.72, + "grad_norm": 2.622732639312744, + "learning_rate": 1.3800088937200456e-05, + "loss": 5.4687, + "step": 14656 + }, + { + "epoch": 0.72, + "grad_norm": 1.8241809606552124, + "learning_rate": 1.3790207026038838e-05, + "loss": 5.5702, + "step": 14660 + }, + { + "epoch": 0.72, + "grad_norm": 2.2202184200286865, + "learning_rate": 1.3780325114877218e-05, + "loss": 5.5717, + "step": 14664 + }, + { + "epoch": 0.72, + "grad_norm": 2.125215530395508, + "learning_rate": 1.37704432037156e-05, + "loss": 5.4724, + "step": 14668 + }, + { + "epoch": 0.72, + "grad_norm": 2.0341951847076416, + "learning_rate": 1.376056129255398e-05, + "loss": 5.4859, + "step": 14672 + }, + { + "epoch": 0.73, + "grad_norm": 1.9950518608093262, + "learning_rate": 1.3750679381392363e-05, + "loss": 5.4983, + "step": 14676 + }, + { + "epoch": 0.73, + "grad_norm": 1.9778791666030884, + "learning_rate": 1.3740797470230743e-05, + "loss": 5.3464, + "step": 14680 + }, + { + "epoch": 0.73, + "grad_norm": 2.052849054336548, + "learning_rate": 1.3730915559069123e-05, + "loss": 5.4766, + "step": 14684 + }, + { + "epoch": 0.73, + "grad_norm": 2.0208585262298584, + "learning_rate": 1.3721033647907505e-05, + "loss": 5.4662, + "step": 14688 + }, + { + "epoch": 0.73, + "grad_norm": 2.0381686687469482, + "learning_rate": 1.3711151736745886e-05, + "loss": 5.4048, + "step": 14692 + }, + { + "epoch": 0.73, + "grad_norm": 2.0504868030548096, + "learning_rate": 1.3701269825584268e-05, + "loss": 5.5422, + "step": 14696 + }, + { + "epoch": 0.73, + "grad_norm": 1.9056216478347778, + "learning_rate": 1.3691387914422648e-05, + "loss": 5.5829, + "step": 14700 + }, + { + "epoch": 0.73, + "grad_norm": 2.1584672927856445, + "learning_rate": 1.3681506003261032e-05, + "loss": 5.4341, + "step": 14704 + }, + { + "epoch": 0.73, + "grad_norm": 1.8544187545776367, + "learning_rate": 1.3671624092099414e-05, + "loss": 5.5552, + "step": 14708 + }, + { + "epoch": 0.73, + "grad_norm": 1.9360532760620117, + "learning_rate": 1.3661742180937794e-05, + "loss": 5.413, + "step": 14712 + }, + { + "epoch": 0.73, + "grad_norm": 1.7825835943222046, + "learning_rate": 1.3651860269776176e-05, + "loss": 5.4916, + "step": 14716 + }, + { + "epoch": 0.73, + "grad_norm": 2.0297420024871826, + "learning_rate": 1.3641978358614557e-05, + "loss": 5.4825, + "step": 14720 + }, + { + "epoch": 0.73, + "grad_norm": 2.0158278942108154, + "learning_rate": 1.3632096447452939e-05, + "loss": 5.4502, + "step": 14724 + }, + { + "epoch": 0.73, + "grad_norm": 2.265287160873413, + "learning_rate": 1.362221453629132e-05, + "loss": 5.5644, + "step": 14728 + }, + { + "epoch": 0.73, + "grad_norm": 2.0918755531311035, + "learning_rate": 1.3612332625129701e-05, + "loss": 5.6084, + "step": 14732 + }, + { + "epoch": 0.73, + "grad_norm": 2.141103744506836, + "learning_rate": 1.3602450713968082e-05, + "loss": 5.4327, + "step": 14736 + }, + { + "epoch": 0.73, + "grad_norm": 2.070760488510132, + "learning_rate": 1.3592568802806464e-05, + "loss": 5.5944, + "step": 14740 + }, + { + "epoch": 0.73, + "grad_norm": 1.9426864385604858, + "learning_rate": 1.3582686891644844e-05, + "loss": 5.4326, + "step": 14744 + }, + { + "epoch": 0.73, + "grad_norm": 1.8541244268417358, + "learning_rate": 1.3572804980483224e-05, + "loss": 5.5438, + "step": 14748 + }, + { + "epoch": 0.73, + "grad_norm": 2.081083297729492, + "learning_rate": 1.3562923069321606e-05, + "loss": 5.5656, + "step": 14752 + }, + { + "epoch": 0.73, + "grad_norm": 1.923309326171875, + "learning_rate": 1.3553041158159987e-05, + "loss": 5.5546, + "step": 14756 + }, + { + "epoch": 0.73, + "grad_norm": 1.746881365776062, + "learning_rate": 1.3543159246998372e-05, + "loss": 5.4799, + "step": 14760 + }, + { + "epoch": 0.73, + "grad_norm": 1.91403067111969, + "learning_rate": 1.3533277335836753e-05, + "loss": 5.4892, + "step": 14764 + }, + { + "epoch": 0.73, + "grad_norm": 2.0929675102233887, + "learning_rate": 1.3523395424675133e-05, + "loss": 5.5852, + "step": 14768 + }, + { + "epoch": 0.73, + "grad_norm": 1.9422674179077148, + "learning_rate": 1.3513513513513515e-05, + "loss": 5.5581, + "step": 14772 + }, + { + "epoch": 0.73, + "grad_norm": 1.684131383895874, + "learning_rate": 1.3503631602351895e-05, + "loss": 5.553, + "step": 14776 + }, + { + "epoch": 0.73, + "grad_norm": 1.8600718975067139, + "learning_rate": 1.3493749691190278e-05, + "loss": 5.5774, + "step": 14780 + }, + { + "epoch": 0.73, + "grad_norm": 1.7786033153533936, + "learning_rate": 1.3483867780028658e-05, + "loss": 5.4997, + "step": 14784 + }, + { + "epoch": 0.73, + "grad_norm": 2.239494562149048, + "learning_rate": 1.347398586886704e-05, + "loss": 5.6186, + "step": 14788 + }, + { + "epoch": 0.73, + "grad_norm": 2.078833818435669, + "learning_rate": 1.346410395770542e-05, + "loss": 5.5609, + "step": 14792 + }, + { + "epoch": 0.73, + "grad_norm": 1.867530107498169, + "learning_rate": 1.3454222046543802e-05, + "loss": 5.5624, + "step": 14796 + }, + { + "epoch": 0.73, + "grad_norm": 1.8725926876068115, + "learning_rate": 1.3444340135382183e-05, + "loss": 5.5675, + "step": 14800 + }, + { + "epoch": 0.73, + "grad_norm": 1.9592565298080444, + "learning_rate": 1.3434458224220565e-05, + "loss": 5.4278, + "step": 14804 + }, + { + "epoch": 0.73, + "grad_norm": 2.0653085708618164, + "learning_rate": 1.3424576313058945e-05, + "loss": 5.4732, + "step": 14808 + }, + { + "epoch": 0.73, + "grad_norm": 2.0164620876312256, + "learning_rate": 1.3414694401897327e-05, + "loss": 5.4636, + "step": 14812 + }, + { + "epoch": 0.73, + "grad_norm": 1.9894605875015259, + "learning_rate": 1.3404812490735708e-05, + "loss": 5.5102, + "step": 14816 + }, + { + "epoch": 0.73, + "grad_norm": 2.0743892192840576, + "learning_rate": 1.3394930579574091e-05, + "loss": 5.4852, + "step": 14820 + }, + { + "epoch": 0.73, + "grad_norm": 2.000945806503296, + "learning_rate": 1.3385048668412473e-05, + "loss": 5.4698, + "step": 14824 + }, + { + "epoch": 0.73, + "grad_norm": 2.219153881072998, + "learning_rate": 1.3375166757250854e-05, + "loss": 5.5966, + "step": 14828 + }, + { + "epoch": 0.73, + "grad_norm": 2.0266921520233154, + "learning_rate": 1.3365284846089234e-05, + "loss": 5.5036, + "step": 14832 + }, + { + "epoch": 0.73, + "grad_norm": 2.1432504653930664, + "learning_rate": 1.3355402934927616e-05, + "loss": 5.5122, + "step": 14836 + }, + { + "epoch": 0.73, + "grad_norm": 1.975110411643982, + "learning_rate": 1.3345521023765997e-05, + "loss": 5.4677, + "step": 14840 + }, + { + "epoch": 0.73, + "grad_norm": 2.1612823009490967, + "learning_rate": 1.3335639112604379e-05, + "loss": 5.544, + "step": 14844 + }, + { + "epoch": 0.73, + "grad_norm": 2.0813982486724854, + "learning_rate": 1.3325757201442759e-05, + "loss": 5.3651, + "step": 14848 + }, + { + "epoch": 0.73, + "grad_norm": 1.8772978782653809, + "learning_rate": 1.3315875290281141e-05, + "loss": 5.4319, + "step": 14852 + }, + { + "epoch": 0.73, + "grad_norm": 1.8763823509216309, + "learning_rate": 1.3305993379119521e-05, + "loss": 5.6508, + "step": 14856 + }, + { + "epoch": 0.73, + "grad_norm": 2.0054590702056885, + "learning_rate": 1.3296111467957903e-05, + "loss": 5.4899, + "step": 14860 + }, + { + "epoch": 0.73, + "grad_norm": 2.0155956745147705, + "learning_rate": 1.3286229556796284e-05, + "loss": 5.4597, + "step": 14864 + }, + { + "epoch": 0.73, + "grad_norm": 1.9215456247329712, + "learning_rate": 1.3276347645634666e-05, + "loss": 5.5587, + "step": 14868 + }, + { + "epoch": 0.73, + "grad_norm": 2.2192320823669434, + "learning_rate": 1.3266465734473046e-05, + "loss": 5.5397, + "step": 14872 + }, + { + "epoch": 0.74, + "grad_norm": 1.8037538528442383, + "learning_rate": 1.325658382331143e-05, + "loss": 5.503, + "step": 14876 + }, + { + "epoch": 0.74, + "grad_norm": 2.096147298812866, + "learning_rate": 1.3246701912149812e-05, + "loss": 5.4808, + "step": 14880 + }, + { + "epoch": 0.74, + "grad_norm": 2.0961217880249023, + "learning_rate": 1.3236820000988192e-05, + "loss": 5.5546, + "step": 14884 + }, + { + "epoch": 0.74, + "grad_norm": 2.1718828678131104, + "learning_rate": 1.3226938089826574e-05, + "loss": 5.4201, + "step": 14888 + }, + { + "epoch": 0.74, + "grad_norm": 2.062138795852661, + "learning_rate": 1.3217056178664955e-05, + "loss": 5.46, + "step": 14892 + }, + { + "epoch": 0.74, + "grad_norm": 2.0585410594940186, + "learning_rate": 1.3207174267503337e-05, + "loss": 5.5545, + "step": 14896 + }, + { + "epoch": 0.74, + "grad_norm": 1.8028877973556519, + "learning_rate": 1.3197292356341717e-05, + "loss": 5.4255, + "step": 14900 + }, + { + "epoch": 0.74, + "grad_norm": 2.002005100250244, + "learning_rate": 1.3187410445180098e-05, + "loss": 5.5847, + "step": 14904 + }, + { + "epoch": 0.74, + "grad_norm": 2.1855287551879883, + "learning_rate": 1.317752853401848e-05, + "loss": 5.536, + "step": 14908 + }, + { + "epoch": 0.74, + "grad_norm": 2.193171977996826, + "learning_rate": 1.316764662285686e-05, + "loss": 5.4059, + "step": 14912 + }, + { + "epoch": 0.74, + "grad_norm": 2.0406908988952637, + "learning_rate": 1.3157764711695242e-05, + "loss": 5.5851, + "step": 14916 + }, + { + "epoch": 0.74, + "grad_norm": 2.342682123184204, + "learning_rate": 1.3147882800533623e-05, + "loss": 5.5068, + "step": 14920 + }, + { + "epoch": 0.74, + "grad_norm": 2.0584092140197754, + "learning_rate": 1.3138000889372005e-05, + "loss": 5.3973, + "step": 14924 + }, + { + "epoch": 0.74, + "grad_norm": 1.7771698236465454, + "learning_rate": 1.3128118978210385e-05, + "loss": 5.4812, + "step": 14928 + }, + { + "epoch": 0.74, + "grad_norm": 1.923229694366455, + "learning_rate": 1.3118237067048769e-05, + "loss": 5.4176, + "step": 14932 + }, + { + "epoch": 0.74, + "grad_norm": 1.9571110010147095, + "learning_rate": 1.310835515588715e-05, + "loss": 5.5085, + "step": 14936 + }, + { + "epoch": 0.74, + "grad_norm": 1.9827457666397095, + "learning_rate": 1.3098473244725531e-05, + "loss": 5.4454, + "step": 14940 + }, + { + "epoch": 0.74, + "grad_norm": 2.2986981868743896, + "learning_rate": 1.3088591333563913e-05, + "loss": 5.562, + "step": 14944 + }, + { + "epoch": 0.74, + "grad_norm": 2.087019920349121, + "learning_rate": 1.3078709422402294e-05, + "loss": 5.5164, + "step": 14948 + }, + { + "epoch": 0.74, + "grad_norm": 1.9258410930633545, + "learning_rate": 1.3068827511240676e-05, + "loss": 5.5252, + "step": 14952 + }, + { + "epoch": 0.74, + "grad_norm": 1.833736538887024, + "learning_rate": 1.3058945600079056e-05, + "loss": 5.4423, + "step": 14956 + }, + { + "epoch": 0.74, + "grad_norm": 2.1392202377319336, + "learning_rate": 1.3049063688917438e-05, + "loss": 5.5637, + "step": 14960 + }, + { + "epoch": 0.74, + "grad_norm": 1.9656065702438354, + "learning_rate": 1.3039181777755818e-05, + "loss": 5.474, + "step": 14964 + }, + { + "epoch": 0.74, + "grad_norm": 1.8016414642333984, + "learning_rate": 1.3029299866594199e-05, + "loss": 5.4101, + "step": 14968 + }, + { + "epoch": 0.74, + "grad_norm": 2.141220808029175, + "learning_rate": 1.301941795543258e-05, + "loss": 5.4747, + "step": 14972 + }, + { + "epoch": 0.74, + "grad_norm": 2.1080868244171143, + "learning_rate": 1.3009536044270961e-05, + "loss": 5.3842, + "step": 14976 + }, + { + "epoch": 0.74, + "grad_norm": 1.9357255697250366, + "learning_rate": 1.2999654133109343e-05, + "loss": 5.5716, + "step": 14980 + }, + { + "epoch": 0.74, + "grad_norm": 2.3347742557525635, + "learning_rate": 1.2989772221947724e-05, + "loss": 5.5935, + "step": 14984 + }, + { + "epoch": 0.74, + "grad_norm": 1.999286413192749, + "learning_rate": 1.2979890310786106e-05, + "loss": 5.4976, + "step": 14988 + }, + { + "epoch": 0.74, + "grad_norm": 2.040515184402466, + "learning_rate": 1.297000839962449e-05, + "loss": 5.6029, + "step": 14992 + }, + { + "epoch": 0.74, + "grad_norm": 1.9321497678756714, + "learning_rate": 1.296012648846287e-05, + "loss": 5.503, + "step": 14996 + }, + { + "epoch": 0.74, + "grad_norm": 1.834435224533081, + "learning_rate": 1.2950244577301252e-05, + "loss": 5.5076, + "step": 15000 + }, + { + "epoch": 0.74, + "grad_norm": 2.410569906234741, + "learning_rate": 1.2940362666139632e-05, + "loss": 5.6112, + "step": 15004 + }, + { + "epoch": 0.74, + "grad_norm": 2.110943555831909, + "learning_rate": 1.2930480754978014e-05, + "loss": 5.4589, + "step": 15008 + }, + { + "epoch": 0.74, + "grad_norm": 2.2104272842407227, + "learning_rate": 1.2920598843816395e-05, + "loss": 5.4494, + "step": 15012 + }, + { + "epoch": 0.74, + "grad_norm": 2.0038228034973145, + "learning_rate": 1.2910716932654777e-05, + "loss": 5.4229, + "step": 15016 + }, + { + "epoch": 0.74, + "grad_norm": 2.0080738067626953, + "learning_rate": 1.2900835021493157e-05, + "loss": 5.5645, + "step": 15020 + }, + { + "epoch": 0.74, + "grad_norm": 2.075934886932373, + "learning_rate": 1.2890953110331539e-05, + "loss": 5.6104, + "step": 15024 + }, + { + "epoch": 0.74, + "grad_norm": 1.9630203247070312, + "learning_rate": 1.288107119916992e-05, + "loss": 5.4888, + "step": 15028 + }, + { + "epoch": 0.74, + "grad_norm": 2.1975886821746826, + "learning_rate": 1.28711892880083e-05, + "loss": 5.5283, + "step": 15032 + }, + { + "epoch": 0.74, + "grad_norm": 1.9507403373718262, + "learning_rate": 1.2861307376846682e-05, + "loss": 5.6203, + "step": 15036 + }, + { + "epoch": 0.74, + "grad_norm": 2.0065085887908936, + "learning_rate": 1.2851425465685062e-05, + "loss": 5.4493, + "step": 15040 + }, + { + "epoch": 0.74, + "grad_norm": 2.170250177383423, + "learning_rate": 1.2841543554523444e-05, + "loss": 5.5062, + "step": 15044 + }, + { + "epoch": 0.74, + "grad_norm": 1.9755939245224, + "learning_rate": 1.2831661643361828e-05, + "loss": 5.4901, + "step": 15048 + }, + { + "epoch": 0.74, + "grad_norm": 2.0391461849212646, + "learning_rate": 1.2821779732200208e-05, + "loss": 5.4896, + "step": 15052 + }, + { + "epoch": 0.74, + "grad_norm": 1.8779929876327515, + "learning_rate": 1.281189782103859e-05, + "loss": 5.4311, + "step": 15056 + }, + { + "epoch": 0.74, + "grad_norm": 2.123504400253296, + "learning_rate": 1.2802015909876971e-05, + "loss": 5.5117, + "step": 15060 + }, + { + "epoch": 0.74, + "grad_norm": 1.885321021080017, + "learning_rate": 1.2792133998715353e-05, + "loss": 5.515, + "step": 15064 + }, + { + "epoch": 0.74, + "grad_norm": 2.281585454940796, + "learning_rate": 1.2782252087553733e-05, + "loss": 5.4979, + "step": 15068 + }, + { + "epoch": 0.74, + "grad_norm": 2.1277706623077393, + "learning_rate": 1.2772370176392115e-05, + "loss": 5.4285, + "step": 15072 + }, + { + "epoch": 0.74, + "grad_norm": 2.0085339546203613, + "learning_rate": 1.2762488265230496e-05, + "loss": 5.4741, + "step": 15076 + }, + { + "epoch": 0.75, + "grad_norm": 2.0528817176818848, + "learning_rate": 1.2752606354068878e-05, + "loss": 5.5861, + "step": 15080 + }, + { + "epoch": 0.75, + "grad_norm": 2.0713131427764893, + "learning_rate": 1.2742724442907258e-05, + "loss": 5.5236, + "step": 15084 + }, + { + "epoch": 0.75, + "grad_norm": 1.9570958614349365, + "learning_rate": 1.273284253174564e-05, + "loss": 5.4983, + "step": 15088 + }, + { + "epoch": 0.75, + "grad_norm": 1.8373630046844482, + "learning_rate": 1.272296062058402e-05, + "loss": 5.4913, + "step": 15092 + }, + { + "epoch": 0.75, + "grad_norm": 2.0442957878112793, + "learning_rate": 1.2713078709422403e-05, + "loss": 5.4757, + "step": 15096 + }, + { + "epoch": 0.75, + "grad_norm": 2.090407609939575, + "learning_rate": 1.2703196798260783e-05, + "loss": 5.4915, + "step": 15100 + }, + { + "epoch": 0.75, + "grad_norm": 2.1005630493164062, + "learning_rate": 1.2693314887099167e-05, + "loss": 5.5734, + "step": 15104 + }, + { + "epoch": 0.75, + "grad_norm": 2.062366008758545, + "learning_rate": 1.2683432975937549e-05, + "loss": 5.5959, + "step": 15108 + }, + { + "epoch": 0.75, + "grad_norm": 1.9203088283538818, + "learning_rate": 1.267355106477593e-05, + "loss": 5.4388, + "step": 15112 + }, + { + "epoch": 0.75, + "grad_norm": 2.0604312419891357, + "learning_rate": 1.266366915361431e-05, + "loss": 5.5115, + "step": 15116 + }, + { + "epoch": 0.75, + "grad_norm": 2.230130672454834, + "learning_rate": 1.2653787242452692e-05, + "loss": 5.596, + "step": 15120 + }, + { + "epoch": 0.75, + "grad_norm": 2.0392181873321533, + "learning_rate": 1.2643905331291072e-05, + "loss": 5.57, + "step": 15124 + }, + { + "epoch": 0.75, + "grad_norm": 2.253793954849243, + "learning_rate": 1.2634023420129454e-05, + "loss": 5.5402, + "step": 15128 + }, + { + "epoch": 0.75, + "grad_norm": 1.9613460302352905, + "learning_rate": 1.2624141508967834e-05, + "loss": 5.3467, + "step": 15132 + }, + { + "epoch": 0.75, + "grad_norm": 1.8354169130325317, + "learning_rate": 1.2614259597806216e-05, + "loss": 5.5377, + "step": 15136 + }, + { + "epoch": 0.75, + "grad_norm": 1.8670376539230347, + "learning_rate": 1.2604377686644597e-05, + "loss": 5.4526, + "step": 15140 + }, + { + "epoch": 0.75, + "grad_norm": 2.1439993381500244, + "learning_rate": 1.2594495775482979e-05, + "loss": 5.4915, + "step": 15144 + }, + { + "epoch": 0.75, + "grad_norm": 1.9621843099594116, + "learning_rate": 1.258461386432136e-05, + "loss": 5.415, + "step": 15148 + }, + { + "epoch": 0.75, + "grad_norm": 2.0130698680877686, + "learning_rate": 1.2574731953159741e-05, + "loss": 5.5646, + "step": 15152 + }, + { + "epoch": 0.75, + "grad_norm": 2.1963608264923096, + "learning_rate": 1.2564850041998122e-05, + "loss": 5.5678, + "step": 15156 + }, + { + "epoch": 0.75, + "grad_norm": 2.007394790649414, + "learning_rate": 1.2554968130836504e-05, + "loss": 5.4874, + "step": 15160 + }, + { + "epoch": 0.75, + "grad_norm": 2.03348708152771, + "learning_rate": 1.2545086219674888e-05, + "loss": 5.3595, + "step": 15164 + }, + { + "epoch": 0.75, + "grad_norm": 2.1427929401397705, + "learning_rate": 1.2535204308513268e-05, + "loss": 5.5416, + "step": 15168 + }, + { + "epoch": 0.75, + "grad_norm": 2.0188114643096924, + "learning_rate": 1.252532239735165e-05, + "loss": 5.4923, + "step": 15172 + }, + { + "epoch": 0.75, + "grad_norm": 1.9621504545211792, + "learning_rate": 1.251544048619003e-05, + "loss": 5.5167, + "step": 15176 + }, + { + "epoch": 0.75, + "grad_norm": 2.1401383876800537, + "learning_rate": 1.2505558575028412e-05, + "loss": 5.3883, + "step": 15180 + }, + { + "epoch": 0.75, + "grad_norm": 2.0091779232025146, + "learning_rate": 1.2495676663866793e-05, + "loss": 5.5855, + "step": 15184 + }, + { + "epoch": 0.75, + "grad_norm": 2.0588550567626953, + "learning_rate": 1.2485794752705173e-05, + "loss": 5.4961, + "step": 15188 + }, + { + "epoch": 0.75, + "grad_norm": 2.0023372173309326, + "learning_rate": 1.2475912841543555e-05, + "loss": 5.3909, + "step": 15192 + }, + { + "epoch": 0.75, + "grad_norm": 2.395747184753418, + "learning_rate": 1.2466030930381936e-05, + "loss": 5.4807, + "step": 15196 + }, + { + "epoch": 0.75, + "grad_norm": 1.8098511695861816, + "learning_rate": 1.2456149019220318e-05, + "loss": 5.4387, + "step": 15200 + }, + { + "epoch": 0.75, + "grad_norm": 1.9918867349624634, + "learning_rate": 1.24462671080587e-05, + "loss": 5.396, + "step": 15204 + }, + { + "epoch": 0.75, + "grad_norm": 2.0140202045440674, + "learning_rate": 1.2436385196897082e-05, + "loss": 5.4098, + "step": 15208 + }, + { + "epoch": 0.75, + "grad_norm": 1.9749282598495483, + "learning_rate": 1.2426503285735462e-05, + "loss": 5.4362, + "step": 15212 + }, + { + "epoch": 0.75, + "grad_norm": 2.1992135047912598, + "learning_rate": 1.2416621374573842e-05, + "loss": 5.5986, + "step": 15216 + }, + { + "epoch": 0.75, + "grad_norm": 2.3077468872070312, + "learning_rate": 1.2406739463412225e-05, + "loss": 5.5058, + "step": 15220 + }, + { + "epoch": 0.75, + "grad_norm": 1.964931845664978, + "learning_rate": 1.2396857552250605e-05, + "loss": 5.3997, + "step": 15224 + }, + { + "epoch": 0.75, + "grad_norm": 1.9411903619766235, + "learning_rate": 1.2386975641088987e-05, + "loss": 5.4008, + "step": 15228 + }, + { + "epoch": 0.75, + "grad_norm": 2.01727557182312, + "learning_rate": 1.2377093729927367e-05, + "loss": 5.6187, + "step": 15232 + }, + { + "epoch": 0.75, + "grad_norm": 1.9451463222503662, + "learning_rate": 1.2367211818765751e-05, + "loss": 5.5272, + "step": 15236 + }, + { + "epoch": 0.75, + "grad_norm": 2.1012978553771973, + "learning_rate": 1.2357329907604131e-05, + "loss": 5.3657, + "step": 15240 + }, + { + "epoch": 0.75, + "grad_norm": 2.219510078430176, + "learning_rate": 1.2347447996442513e-05, + "loss": 5.526, + "step": 15244 + }, + { + "epoch": 0.75, + "grad_norm": 2.013444185256958, + "learning_rate": 1.2337566085280894e-05, + "loss": 5.4749, + "step": 15248 + }, + { + "epoch": 0.75, + "grad_norm": 1.7549456357955933, + "learning_rate": 1.2327684174119274e-05, + "loss": 5.3478, + "step": 15252 + }, + { + "epoch": 0.75, + "grad_norm": 1.8471717834472656, + "learning_rate": 1.2317802262957656e-05, + "loss": 5.5201, + "step": 15256 + }, + { + "epoch": 0.75, + "grad_norm": 2.324028491973877, + "learning_rate": 1.2307920351796037e-05, + "loss": 5.5589, + "step": 15260 + }, + { + "epoch": 0.75, + "grad_norm": 2.0255486965179443, + "learning_rate": 1.229803844063442e-05, + "loss": 5.5575, + "step": 15264 + }, + { + "epoch": 0.75, + "grad_norm": 2.1873011589050293, + "learning_rate": 1.22881565294728e-05, + "loss": 5.5165, + "step": 15268 + }, + { + "epoch": 0.75, + "grad_norm": 2.1346065998077393, + "learning_rate": 1.2278274618311183e-05, + "loss": 5.5053, + "step": 15272 + }, + { + "epoch": 0.75, + "grad_norm": 2.1076815128326416, + "learning_rate": 1.2268392707149563e-05, + "loss": 5.4105, + "step": 15276 + }, + { + "epoch": 0.75, + "grad_norm": 1.9122978448867798, + "learning_rate": 1.2258510795987945e-05, + "loss": 5.4235, + "step": 15280 + }, + { + "epoch": 0.76, + "grad_norm": 2.054979085922241, + "learning_rate": 1.2248628884826326e-05, + "loss": 5.5838, + "step": 15284 + }, + { + "epoch": 0.76, + "grad_norm": 1.9995005130767822, + "learning_rate": 1.2238746973664706e-05, + "loss": 5.4584, + "step": 15288 + }, + { + "epoch": 0.76, + "grad_norm": 2.0757248401641846, + "learning_rate": 1.222886506250309e-05, + "loss": 5.5064, + "step": 15292 + }, + { + "epoch": 0.76, + "grad_norm": 1.831465721130371, + "learning_rate": 1.221898315134147e-05, + "loss": 5.5416, + "step": 15296 + }, + { + "epoch": 0.76, + "grad_norm": 2.3364105224609375, + "learning_rate": 1.2209101240179852e-05, + "loss": 5.5332, + "step": 15300 + }, + { + "epoch": 0.76, + "grad_norm": 1.9546422958374023, + "learning_rate": 1.2199219329018233e-05, + "loss": 5.4726, + "step": 15304 + }, + { + "epoch": 0.76, + "grad_norm": 1.8395370244979858, + "learning_rate": 1.2189337417856615e-05, + "loss": 5.4443, + "step": 15308 + }, + { + "epoch": 0.76, + "grad_norm": 2.080458402633667, + "learning_rate": 1.2179455506694995e-05, + "loss": 5.5154, + "step": 15312 + }, + { + "epoch": 0.76, + "grad_norm": 2.017116069793701, + "learning_rate": 1.2169573595533375e-05, + "loss": 5.5048, + "step": 15316 + }, + { + "epoch": 0.76, + "grad_norm": 2.0325398445129395, + "learning_rate": 1.2159691684371759e-05, + "loss": 5.3668, + "step": 15320 + }, + { + "epoch": 0.76, + "grad_norm": 2.246100664138794, + "learning_rate": 1.214980977321014e-05, + "loss": 5.5176, + "step": 15324 + }, + { + "epoch": 0.76, + "grad_norm": 2.0287086963653564, + "learning_rate": 1.2139927862048522e-05, + "loss": 5.5352, + "step": 15328 + }, + { + "epoch": 0.76, + "grad_norm": 2.278211832046509, + "learning_rate": 1.2130045950886902e-05, + "loss": 5.441, + "step": 15332 + }, + { + "epoch": 0.76, + "grad_norm": 2.143902063369751, + "learning_rate": 1.2120164039725284e-05, + "loss": 5.5263, + "step": 15336 + }, + { + "epoch": 0.76, + "grad_norm": 2.0122079849243164, + "learning_rate": 1.2110282128563664e-05, + "loss": 5.3925, + "step": 15340 + }, + { + "epoch": 0.76, + "grad_norm": 2.094726324081421, + "learning_rate": 1.2100400217402046e-05, + "loss": 5.4837, + "step": 15344 + }, + { + "epoch": 0.76, + "grad_norm": 2.0904650688171387, + "learning_rate": 1.2090518306240428e-05, + "loss": 5.4732, + "step": 15348 + }, + { + "epoch": 0.76, + "grad_norm": 1.869597315788269, + "learning_rate": 1.2080636395078809e-05, + "loss": 5.4292, + "step": 15352 + }, + { + "epoch": 0.76, + "grad_norm": 2.0959110260009766, + "learning_rate": 1.2070754483917191e-05, + "loss": 5.5033, + "step": 15356 + }, + { + "epoch": 0.76, + "grad_norm": 2.153909921646118, + "learning_rate": 1.2060872572755571e-05, + "loss": 5.4692, + "step": 15360 + }, + { + "epoch": 0.76, + "grad_norm": 2.0445823669433594, + "learning_rate": 1.2050990661593953e-05, + "loss": 5.3999, + "step": 15364 + }, + { + "epoch": 0.76, + "grad_norm": 2.0181336402893066, + "learning_rate": 1.2041108750432334e-05, + "loss": 5.4045, + "step": 15368 + }, + { + "epoch": 0.76, + "grad_norm": 2.008654832839966, + "learning_rate": 1.2031226839270716e-05, + "loss": 5.6017, + "step": 15372 + }, + { + "epoch": 0.76, + "grad_norm": 2.126469850540161, + "learning_rate": 1.2021344928109096e-05, + "loss": 5.4578, + "step": 15376 + }, + { + "epoch": 0.76, + "grad_norm": 2.031398057937622, + "learning_rate": 1.2011463016947478e-05, + "loss": 5.5091, + "step": 15380 + }, + { + "epoch": 0.76, + "grad_norm": 2.102151870727539, + "learning_rate": 1.200158110578586e-05, + "loss": 5.4637, + "step": 15384 + }, + { + "epoch": 0.76, + "grad_norm": 1.9239962100982666, + "learning_rate": 1.199169919462424e-05, + "loss": 5.5014, + "step": 15388 + }, + { + "epoch": 0.76, + "grad_norm": 1.9613525867462158, + "learning_rate": 1.1981817283462623e-05, + "loss": 5.5283, + "step": 15392 + }, + { + "epoch": 0.76, + "grad_norm": 1.973003625869751, + "learning_rate": 1.1971935372301003e-05, + "loss": 5.4865, + "step": 15396 + }, + { + "epoch": 0.76, + "grad_norm": 2.3486452102661133, + "learning_rate": 1.1962053461139385e-05, + "loss": 5.5125, + "step": 15400 + }, + { + "epoch": 0.76, + "grad_norm": 2.088740825653076, + "learning_rate": 1.1952171549977765e-05, + "loss": 5.4416, + "step": 15404 + }, + { + "epoch": 0.76, + "grad_norm": 1.9701464176177979, + "learning_rate": 1.1942289638816147e-05, + "loss": 5.2871, + "step": 15408 + }, + { + "epoch": 0.76, + "grad_norm": 2.2388153076171875, + "learning_rate": 1.193240772765453e-05, + "loss": 5.551, + "step": 15412 + }, + { + "epoch": 0.76, + "grad_norm": 2.2453413009643555, + "learning_rate": 1.192252581649291e-05, + "loss": 5.4372, + "step": 15416 + }, + { + "epoch": 0.76, + "grad_norm": 2.1105456352233887, + "learning_rate": 1.1912643905331292e-05, + "loss": 5.5284, + "step": 15420 + }, + { + "epoch": 0.76, + "grad_norm": 2.197547197341919, + "learning_rate": 1.1902761994169672e-05, + "loss": 5.5067, + "step": 15424 + }, + { + "epoch": 0.76, + "grad_norm": 1.9585908651351929, + "learning_rate": 1.1892880083008054e-05, + "loss": 5.4292, + "step": 15428 + }, + { + "epoch": 0.76, + "grad_norm": 2.037917137145996, + "learning_rate": 1.1882998171846435e-05, + "loss": 5.455, + "step": 15432 + }, + { + "epoch": 0.76, + "grad_norm": 1.797452449798584, + "learning_rate": 1.1873116260684817e-05, + "loss": 5.4833, + "step": 15436 + }, + { + "epoch": 0.76, + "grad_norm": 1.8614048957824707, + "learning_rate": 1.1863234349523199e-05, + "loss": 5.5127, + "step": 15440 + }, + { + "epoch": 0.76, + "grad_norm": 1.9821441173553467, + "learning_rate": 1.185335243836158e-05, + "loss": 5.5038, + "step": 15444 + }, + { + "epoch": 0.76, + "grad_norm": 2.3709757328033447, + "learning_rate": 1.1843470527199961e-05, + "loss": 5.4745, + "step": 15448 + }, + { + "epoch": 0.76, + "grad_norm": 1.8125073909759521, + "learning_rate": 1.1833588616038342e-05, + "loss": 5.4642, + "step": 15452 + }, + { + "epoch": 0.76, + "grad_norm": 2.0087850093841553, + "learning_rate": 1.1823706704876724e-05, + "loss": 5.4628, + "step": 15456 + }, + { + "epoch": 0.76, + "grad_norm": 1.9169509410858154, + "learning_rate": 1.1813824793715104e-05, + "loss": 5.4821, + "step": 15460 + }, + { + "epoch": 0.76, + "grad_norm": 2.01556396484375, + "learning_rate": 1.1803942882553488e-05, + "loss": 5.4142, + "step": 15464 + }, + { + "epoch": 0.76, + "grad_norm": 1.9918155670166016, + "learning_rate": 1.1794060971391868e-05, + "loss": 5.6412, + "step": 15468 + }, + { + "epoch": 0.76, + "grad_norm": 2.052454710006714, + "learning_rate": 1.1784179060230249e-05, + "loss": 5.4347, + "step": 15472 + }, + { + "epoch": 0.76, + "grad_norm": 1.8537468910217285, + "learning_rate": 1.177429714906863e-05, + "loss": 5.5036, + "step": 15476 + }, + { + "epoch": 0.76, + "grad_norm": 1.9864877462387085, + "learning_rate": 1.1764415237907011e-05, + "loss": 5.4979, + "step": 15480 + }, + { + "epoch": 0.77, + "grad_norm": 2.200070858001709, + "learning_rate": 1.1754533326745393e-05, + "loss": 5.5684, + "step": 15484 + }, + { + "epoch": 0.77, + "grad_norm": 1.7679542303085327, + "learning_rate": 1.1744651415583773e-05, + "loss": 5.4283, + "step": 15488 + }, + { + "epoch": 0.77, + "grad_norm": 1.8458738327026367, + "learning_rate": 1.1734769504422157e-05, + "loss": 5.5566, + "step": 15492 + }, + { + "epoch": 0.77, + "grad_norm": 2.3362741470336914, + "learning_rate": 1.1724887593260538e-05, + "loss": 5.4377, + "step": 15496 + }, + { + "epoch": 0.77, + "grad_norm": 1.7595479488372803, + "learning_rate": 1.1715005682098918e-05, + "loss": 5.4785, + "step": 15500 + }, + { + "epoch": 0.77, + "grad_norm": 2.1757829189300537, + "learning_rate": 1.17051237709373e-05, + "loss": 5.5964, + "step": 15504 + }, + { + "epoch": 0.77, + "grad_norm": 2.0845088958740234, + "learning_rate": 1.169524185977568e-05, + "loss": 5.5294, + "step": 15508 + }, + { + "epoch": 0.77, + "grad_norm": 1.937070608139038, + "learning_rate": 1.1685359948614062e-05, + "loss": 5.5338, + "step": 15512 + }, + { + "epoch": 0.77, + "grad_norm": 2.137470006942749, + "learning_rate": 1.1675478037452443e-05, + "loss": 5.4788, + "step": 15516 + }, + { + "epoch": 0.77, + "grad_norm": 2.1320180892944336, + "learning_rate": 1.1665596126290827e-05, + "loss": 5.6173, + "step": 15520 + }, + { + "epoch": 0.77, + "grad_norm": 2.0288352966308594, + "learning_rate": 1.1655714215129207e-05, + "loss": 5.3012, + "step": 15524 + }, + { + "epoch": 0.77, + "grad_norm": 1.784881830215454, + "learning_rate": 1.1645832303967589e-05, + "loss": 5.5805, + "step": 15528 + }, + { + "epoch": 0.77, + "grad_norm": 2.08077073097229, + "learning_rate": 1.163595039280597e-05, + "loss": 5.4992, + "step": 15532 + }, + { + "epoch": 0.77, + "grad_norm": 1.7977101802825928, + "learning_rate": 1.162606848164435e-05, + "loss": 5.4872, + "step": 15536 + }, + { + "epoch": 0.77, + "grad_norm": 2.186459541320801, + "learning_rate": 1.1616186570482732e-05, + "loss": 5.578, + "step": 15540 + }, + { + "epoch": 0.77, + "grad_norm": 2.183048725128174, + "learning_rate": 1.1606304659321112e-05, + "loss": 5.4738, + "step": 15544 + }, + { + "epoch": 0.77, + "grad_norm": 1.8500030040740967, + "learning_rate": 1.1596422748159494e-05, + "loss": 5.5345, + "step": 15548 + }, + { + "epoch": 0.77, + "grad_norm": 1.8691611289978027, + "learning_rate": 1.1586540836997876e-05, + "loss": 5.5283, + "step": 15552 + }, + { + "epoch": 0.77, + "grad_norm": 2.1216142177581787, + "learning_rate": 1.1576658925836258e-05, + "loss": 5.581, + "step": 15556 + }, + { + "epoch": 0.77, + "grad_norm": 2.0468342304229736, + "learning_rate": 1.1566777014674639e-05, + "loss": 5.5649, + "step": 15560 + }, + { + "epoch": 0.77, + "grad_norm": 2.060667037963867, + "learning_rate": 1.155689510351302e-05, + "loss": 5.4901, + "step": 15564 + }, + { + "epoch": 0.77, + "grad_norm": 1.803126573562622, + "learning_rate": 1.1547013192351401e-05, + "loss": 5.4285, + "step": 15568 + }, + { + "epoch": 0.77, + "grad_norm": 2.05906081199646, + "learning_rate": 1.1537131281189781e-05, + "loss": 5.4268, + "step": 15572 + }, + { + "epoch": 0.77, + "grad_norm": 2.0089757442474365, + "learning_rate": 1.1527249370028164e-05, + "loss": 5.5307, + "step": 15576 + }, + { + "epoch": 0.77, + "grad_norm": 1.9152356386184692, + "learning_rate": 1.1517367458866546e-05, + "loss": 5.4297, + "step": 15580 + }, + { + "epoch": 0.77, + "grad_norm": 1.9012370109558105, + "learning_rate": 1.1507485547704928e-05, + "loss": 5.5271, + "step": 15584 + }, + { + "epoch": 0.77, + "grad_norm": 2.0786564350128174, + "learning_rate": 1.1497603636543308e-05, + "loss": 5.5957, + "step": 15588 + }, + { + "epoch": 0.77, + "grad_norm": 2.03715443611145, + "learning_rate": 1.148772172538169e-05, + "loss": 5.5501, + "step": 15592 + }, + { + "epoch": 0.77, + "grad_norm": 2.1424834728240967, + "learning_rate": 1.147783981422007e-05, + "loss": 5.5153, + "step": 15596 + }, + { + "epoch": 0.77, + "grad_norm": 1.9324986934661865, + "learning_rate": 1.146795790305845e-05, + "loss": 5.5928, + "step": 15600 + }, + { + "epoch": 0.77, + "grad_norm": 1.986244559288025, + "learning_rate": 1.1458075991896833e-05, + "loss": 5.4466, + "step": 15604 + }, + { + "epoch": 0.77, + "grad_norm": 2.3215315341949463, + "learning_rate": 1.1448194080735215e-05, + "loss": 5.4948, + "step": 15608 + }, + { + "epoch": 0.77, + "grad_norm": 2.2035694122314453, + "learning_rate": 1.1438312169573597e-05, + "loss": 5.6002, + "step": 15612 + }, + { + "epoch": 0.77, + "grad_norm": 2.0092086791992188, + "learning_rate": 1.1428430258411977e-05, + "loss": 5.4083, + "step": 15616 + }, + { + "epoch": 0.77, + "grad_norm": 2.008274555206299, + "learning_rate": 1.141854834725036e-05, + "loss": 5.3977, + "step": 15620 + }, + { + "epoch": 0.77, + "grad_norm": 2.084843873977661, + "learning_rate": 1.140866643608874e-05, + "loss": 5.4896, + "step": 15624 + }, + { + "epoch": 0.77, + "grad_norm": 2.138129949569702, + "learning_rate": 1.1398784524927122e-05, + "loss": 5.556, + "step": 15628 + }, + { + "epoch": 0.77, + "grad_norm": 2.161590814590454, + "learning_rate": 1.1388902613765502e-05, + "loss": 5.4909, + "step": 15632 + }, + { + "epoch": 0.77, + "grad_norm": 2.1914632320404053, + "learning_rate": 1.1379020702603884e-05, + "loss": 5.5578, + "step": 15636 + }, + { + "epoch": 0.77, + "grad_norm": 2.254403829574585, + "learning_rate": 1.1369138791442266e-05, + "loss": 5.6521, + "step": 15640 + }, + { + "epoch": 0.77, + "grad_norm": 1.9875483512878418, + "learning_rate": 1.1359256880280647e-05, + "loss": 5.4972, + "step": 15644 + }, + { + "epoch": 0.77, + "grad_norm": 2.2201669216156006, + "learning_rate": 1.1349374969119029e-05, + "loss": 5.6261, + "step": 15648 + }, + { + "epoch": 0.77, + "grad_norm": 2.054435968399048, + "learning_rate": 1.1339493057957409e-05, + "loss": 5.5603, + "step": 15652 + }, + { + "epoch": 0.77, + "grad_norm": 1.902565598487854, + "learning_rate": 1.1329611146795791e-05, + "loss": 5.6618, + "step": 15656 + }, + { + "epoch": 0.77, + "grad_norm": 2.2018725872039795, + "learning_rate": 1.1319729235634172e-05, + "loss": 5.429, + "step": 15660 + }, + { + "epoch": 0.77, + "grad_norm": 1.9127063751220703, + "learning_rate": 1.1309847324472554e-05, + "loss": 5.5171, + "step": 15664 + }, + { + "epoch": 0.77, + "grad_norm": 1.8942164182662964, + "learning_rate": 1.1299965413310936e-05, + "loss": 5.5357, + "step": 15668 + }, + { + "epoch": 0.77, + "grad_norm": 2.2651684284210205, + "learning_rate": 1.1290083502149316e-05, + "loss": 5.4591, + "step": 15672 + }, + { + "epoch": 0.77, + "grad_norm": 2.189774990081787, + "learning_rate": 1.1280201590987698e-05, + "loss": 5.4338, + "step": 15676 + }, + { + "epoch": 0.77, + "grad_norm": 2.1134681701660156, + "learning_rate": 1.1270319679826078e-05, + "loss": 5.5294, + "step": 15680 + }, + { + "epoch": 0.77, + "grad_norm": 2.2125091552734375, + "learning_rate": 1.126043776866446e-05, + "loss": 5.5269, + "step": 15684 + }, + { + "epoch": 0.78, + "grad_norm": 1.9919397830963135, + "learning_rate": 1.1250555857502841e-05, + "loss": 5.4072, + "step": 15688 + }, + { + "epoch": 0.78, + "grad_norm": 1.958975911140442, + "learning_rate": 1.1240673946341223e-05, + "loss": 5.4421, + "step": 15692 + }, + { + "epoch": 0.78, + "grad_norm": 2.033118724822998, + "learning_rate": 1.1230792035179605e-05, + "loss": 5.5095, + "step": 15696 + }, + { + "epoch": 0.78, + "grad_norm": 2.1201207637786865, + "learning_rate": 1.1220910124017985e-05, + "loss": 5.4767, + "step": 15700 + }, + { + "epoch": 0.78, + "grad_norm": 1.9773985147476196, + "learning_rate": 1.1211028212856367e-05, + "loss": 5.424, + "step": 15704 + }, + { + "epoch": 0.78, + "grad_norm": 1.841871976852417, + "learning_rate": 1.1201146301694748e-05, + "loss": 5.521, + "step": 15708 + }, + { + "epoch": 0.78, + "grad_norm": 1.9351176023483276, + "learning_rate": 1.119126439053313e-05, + "loss": 5.5646, + "step": 15712 + }, + { + "epoch": 0.78, + "grad_norm": 2.0061535835266113, + "learning_rate": 1.118138247937151e-05, + "loss": 5.36, + "step": 15716 + }, + { + "epoch": 0.78, + "grad_norm": 2.4020817279815674, + "learning_rate": 1.1171500568209892e-05, + "loss": 5.4701, + "step": 15720 + }, + { + "epoch": 0.78, + "grad_norm": 1.9213863611221313, + "learning_rate": 1.1161618657048274e-05, + "loss": 5.4668, + "step": 15724 + }, + { + "epoch": 0.78, + "grad_norm": 2.3538432121276855, + "learning_rate": 1.1151736745886655e-05, + "loss": 5.5128, + "step": 15728 + }, + { + "epoch": 0.78, + "grad_norm": 2.147163152694702, + "learning_rate": 1.1141854834725037e-05, + "loss": 5.4579, + "step": 15732 + }, + { + "epoch": 0.78, + "grad_norm": 2.1335911750793457, + "learning_rate": 1.1131972923563417e-05, + "loss": 5.4277, + "step": 15736 + }, + { + "epoch": 0.78, + "grad_norm": 2.1882131099700928, + "learning_rate": 1.11220910124018e-05, + "loss": 5.6003, + "step": 15740 + }, + { + "epoch": 0.78, + "grad_norm": 1.9644808769226074, + "learning_rate": 1.111220910124018e-05, + "loss": 5.5355, + "step": 15744 + }, + { + "epoch": 0.78, + "grad_norm": 1.7521815299987793, + "learning_rate": 1.1102327190078562e-05, + "loss": 5.4743, + "step": 15748 + }, + { + "epoch": 0.78, + "grad_norm": 2.1638858318328857, + "learning_rate": 1.1092445278916944e-05, + "loss": 5.3701, + "step": 15752 + }, + { + "epoch": 0.78, + "grad_norm": 1.8966166973114014, + "learning_rate": 1.1082563367755324e-05, + "loss": 5.5355, + "step": 15756 + }, + { + "epoch": 0.78, + "grad_norm": 1.981217861175537, + "learning_rate": 1.1072681456593706e-05, + "loss": 5.4366, + "step": 15760 + }, + { + "epoch": 0.78, + "grad_norm": 2.0059125423431396, + "learning_rate": 1.1062799545432086e-05, + "loss": 5.4749, + "step": 15764 + }, + { + "epoch": 0.78, + "grad_norm": 2.042475461959839, + "learning_rate": 1.1052917634270469e-05, + "loss": 5.4066, + "step": 15768 + }, + { + "epoch": 0.78, + "grad_norm": 2.1431596279144287, + "learning_rate": 1.1043035723108849e-05, + "loss": 5.4831, + "step": 15772 + }, + { + "epoch": 0.78, + "grad_norm": 1.903968095779419, + "learning_rate": 1.1033153811947231e-05, + "loss": 5.4749, + "step": 15776 + }, + { + "epoch": 0.78, + "grad_norm": 2.2087221145629883, + "learning_rate": 1.1023271900785613e-05, + "loss": 5.5542, + "step": 15780 + }, + { + "epoch": 0.78, + "grad_norm": 1.990248680114746, + "learning_rate": 1.1013389989623993e-05, + "loss": 5.4592, + "step": 15784 + }, + { + "epoch": 0.78, + "grad_norm": 1.9822028875350952, + "learning_rate": 1.1003508078462375e-05, + "loss": 5.5245, + "step": 15788 + }, + { + "epoch": 0.78, + "grad_norm": 2.0607571601867676, + "learning_rate": 1.0993626167300756e-05, + "loss": 5.5105, + "step": 15792 + }, + { + "epoch": 0.78, + "grad_norm": 2.2510719299316406, + "learning_rate": 1.0983744256139138e-05, + "loss": 5.4485, + "step": 15796 + }, + { + "epoch": 0.78, + "grad_norm": 1.9407929182052612, + "learning_rate": 1.0973862344977518e-05, + "loss": 5.4835, + "step": 15800 + }, + { + "epoch": 0.78, + "grad_norm": 2.0228731632232666, + "learning_rate": 1.09639804338159e-05, + "loss": 5.4121, + "step": 15804 + }, + { + "epoch": 0.78, + "grad_norm": 2.0110514163970947, + "learning_rate": 1.0954098522654282e-05, + "loss": 5.4097, + "step": 15808 + }, + { + "epoch": 0.78, + "grad_norm": 2.005176305770874, + "learning_rate": 1.0944216611492664e-05, + "loss": 5.5651, + "step": 15812 + }, + { + "epoch": 0.78, + "grad_norm": 2.08233380317688, + "learning_rate": 1.0934334700331045e-05, + "loss": 5.5942, + "step": 15816 + }, + { + "epoch": 0.78, + "grad_norm": 1.76272714138031, + "learning_rate": 1.0924452789169425e-05, + "loss": 5.3883, + "step": 15820 + }, + { + "epoch": 0.78, + "grad_norm": 1.8395804166793823, + "learning_rate": 1.0914570878007807e-05, + "loss": 5.372, + "step": 15824 + }, + { + "epoch": 0.78, + "grad_norm": 2.197016954421997, + "learning_rate": 1.0904688966846188e-05, + "loss": 5.4239, + "step": 15828 + }, + { + "epoch": 0.78, + "grad_norm": 2.1127779483795166, + "learning_rate": 1.089480705568457e-05, + "loss": 5.358, + "step": 15832 + }, + { + "epoch": 0.78, + "grad_norm": 2.1906561851501465, + "learning_rate": 1.0884925144522952e-05, + "loss": 5.528, + "step": 15836 + }, + { + "epoch": 0.78, + "grad_norm": 2.2963180541992188, + "learning_rate": 1.0875043233361334e-05, + "loss": 5.6399, + "step": 15840 + }, + { + "epoch": 0.78, + "grad_norm": 2.1306509971618652, + "learning_rate": 1.0865161322199714e-05, + "loss": 5.4986, + "step": 15844 + }, + { + "epoch": 0.78, + "grad_norm": 1.8235065937042236, + "learning_rate": 1.0855279411038096e-05, + "loss": 5.5502, + "step": 15848 + }, + { + "epoch": 0.78, + "grad_norm": 2.058922529220581, + "learning_rate": 1.0845397499876477e-05, + "loss": 5.5531, + "step": 15852 + }, + { + "epoch": 0.78, + "grad_norm": 1.818724274635315, + "learning_rate": 1.0835515588714857e-05, + "loss": 5.4607, + "step": 15856 + }, + { + "epoch": 0.78, + "grad_norm": 2.0503363609313965, + "learning_rate": 1.0825633677553239e-05, + "loss": 5.4722, + "step": 15860 + }, + { + "epoch": 0.78, + "grad_norm": 2.076927423477173, + "learning_rate": 1.081575176639162e-05, + "loss": 5.4039, + "step": 15864 + }, + { + "epoch": 0.78, + "grad_norm": 1.991584300994873, + "learning_rate": 1.0805869855230003e-05, + "loss": 5.4651, + "step": 15868 + }, + { + "epoch": 0.78, + "grad_norm": 2.276181936264038, + "learning_rate": 1.0795987944068383e-05, + "loss": 5.5605, + "step": 15872 + }, + { + "epoch": 0.78, + "grad_norm": 2.0964319705963135, + "learning_rate": 1.0786106032906766e-05, + "loss": 5.4315, + "step": 15876 + }, + { + "epoch": 0.78, + "grad_norm": 2.0152671337127686, + "learning_rate": 1.0776224121745146e-05, + "loss": 5.5034, + "step": 15880 + }, + { + "epoch": 0.78, + "grad_norm": 2.065906524658203, + "learning_rate": 1.0766342210583528e-05, + "loss": 5.5383, + "step": 15884 + }, + { + "epoch": 0.79, + "grad_norm": 1.9010733366012573, + "learning_rate": 1.0756460299421908e-05, + "loss": 5.5424, + "step": 15888 + }, + { + "epoch": 0.79, + "grad_norm": 1.9711081981658936, + "learning_rate": 1.0746578388260289e-05, + "loss": 5.4899, + "step": 15892 + }, + { + "epoch": 0.79, + "grad_norm": 1.9570908546447754, + "learning_rate": 1.0736696477098672e-05, + "loss": 5.5072, + "step": 15896 + }, + { + "epoch": 0.79, + "grad_norm": 2.1524243354797363, + "learning_rate": 1.0726814565937053e-05, + "loss": 5.4421, + "step": 15900 + }, + { + "epoch": 0.79, + "grad_norm": 1.8811511993408203, + "learning_rate": 1.0716932654775435e-05, + "loss": 5.3984, + "step": 15904 + }, + { + "epoch": 0.79, + "grad_norm": 1.914753794670105, + "learning_rate": 1.0707050743613815e-05, + "loss": 5.5874, + "step": 15908 + }, + { + "epoch": 0.79, + "grad_norm": 1.9757344722747803, + "learning_rate": 1.0697168832452197e-05, + "loss": 5.5402, + "step": 15912 + }, + { + "epoch": 0.79, + "grad_norm": 2.1234004497528076, + "learning_rate": 1.0689757399080982e-05, + "loss": 5.4948, + "step": 15916 + }, + { + "epoch": 0.79, + "grad_norm": 1.9321492910385132, + "learning_rate": 1.0679875487919364e-05, + "loss": 5.3983, + "step": 15920 + }, + { + "epoch": 0.79, + "grad_norm": 2.0924885272979736, + "learning_rate": 1.0669993576757746e-05, + "loss": 5.3964, + "step": 15924 + }, + { + "epoch": 0.79, + "grad_norm": 2.1206881999969482, + "learning_rate": 1.0660111665596127e-05, + "loss": 5.4496, + "step": 15928 + }, + { + "epoch": 0.79, + "grad_norm": 2.230121374130249, + "learning_rate": 1.0650229754434509e-05, + "loss": 5.3782, + "step": 15932 + }, + { + "epoch": 0.79, + "grad_norm": 1.9307682514190674, + "learning_rate": 1.0640347843272889e-05, + "loss": 5.4723, + "step": 15936 + }, + { + "epoch": 0.79, + "grad_norm": 2.326744794845581, + "learning_rate": 1.0630465932111271e-05, + "loss": 5.4944, + "step": 15940 + }, + { + "epoch": 0.79, + "grad_norm": 2.0075511932373047, + "learning_rate": 1.0620584020949651e-05, + "loss": 5.4213, + "step": 15944 + }, + { + "epoch": 0.79, + "grad_norm": 2.0154924392700195, + "learning_rate": 1.0610702109788034e-05, + "loss": 5.5201, + "step": 15948 + }, + { + "epoch": 0.79, + "grad_norm": 2.3839402198791504, + "learning_rate": 1.0600820198626416e-05, + "loss": 5.5373, + "step": 15952 + }, + { + "epoch": 0.79, + "grad_norm": 1.8833853006362915, + "learning_rate": 1.0590938287464796e-05, + "loss": 5.5104, + "step": 15956 + }, + { + "epoch": 0.79, + "grad_norm": 2.0425021648406982, + "learning_rate": 1.0581056376303178e-05, + "loss": 5.5934, + "step": 15960 + }, + { + "epoch": 0.79, + "grad_norm": 1.8929284811019897, + "learning_rate": 1.0571174465141558e-05, + "loss": 5.5095, + "step": 15964 + }, + { + "epoch": 0.79, + "grad_norm": 1.8911198377609253, + "learning_rate": 1.056129255397994e-05, + "loss": 5.4591, + "step": 15968 + }, + { + "epoch": 0.79, + "grad_norm": 1.856986403465271, + "learning_rate": 1.055141064281832e-05, + "loss": 5.35, + "step": 15972 + }, + { + "epoch": 0.79, + "grad_norm": 1.8045761585235596, + "learning_rate": 1.0541528731656703e-05, + "loss": 5.5255, + "step": 15976 + }, + { + "epoch": 0.79, + "grad_norm": 1.9339076280593872, + "learning_rate": 1.0531646820495085e-05, + "loss": 5.4632, + "step": 15980 + }, + { + "epoch": 0.79, + "grad_norm": 2.076307535171509, + "learning_rate": 1.0521764909333465e-05, + "loss": 5.453, + "step": 15984 + }, + { + "epoch": 0.79, + "grad_norm": 2.0790350437164307, + "learning_rate": 1.0511882998171847e-05, + "loss": 5.4504, + "step": 15988 + }, + { + "epoch": 0.79, + "grad_norm": 2.015014171600342, + "learning_rate": 1.0502001087010228e-05, + "loss": 5.4082, + "step": 15992 + }, + { + "epoch": 0.79, + "grad_norm": 1.9248104095458984, + "learning_rate": 1.049211917584861e-05, + "loss": 5.5261, + "step": 15996 + }, + { + "epoch": 0.79, + "grad_norm": 1.9845161437988281, + "learning_rate": 1.048223726468699e-05, + "loss": 5.495, + "step": 16000 + }, + { + "epoch": 0.79, + "grad_norm": 2.0011472702026367, + "learning_rate": 1.0472355353525372e-05, + "loss": 5.3597, + "step": 16004 + }, + { + "epoch": 0.79, + "grad_norm": 2.0203723907470703, + "learning_rate": 1.0462473442363753e-05, + "loss": 5.5997, + "step": 16008 + }, + { + "epoch": 0.79, + "grad_norm": 1.9304739236831665, + "learning_rate": 1.0452591531202136e-05, + "loss": 5.4136, + "step": 16012 + }, + { + "epoch": 0.79, + "grad_norm": 2.1026952266693115, + "learning_rate": 1.0442709620040517e-05, + "loss": 5.5538, + "step": 16016 + }, + { + "epoch": 0.79, + "grad_norm": 1.9498411417007446, + "learning_rate": 1.0432827708878897e-05, + "loss": 5.5066, + "step": 16020 + }, + { + "epoch": 0.79, + "grad_norm": 2.043534517288208, + "learning_rate": 1.0422945797717279e-05, + "loss": 5.4616, + "step": 16024 + }, + { + "epoch": 0.79, + "grad_norm": 1.9264411926269531, + "learning_rate": 1.041306388655566e-05, + "loss": 5.5081, + "step": 16028 + }, + { + "epoch": 0.79, + "grad_norm": 1.9634205102920532, + "learning_rate": 1.0403181975394042e-05, + "loss": 5.436, + "step": 16032 + }, + { + "epoch": 0.79, + "grad_norm": 1.9793167114257812, + "learning_rate": 1.0393300064232422e-05, + "loss": 5.4457, + "step": 16036 + }, + { + "epoch": 0.79, + "grad_norm": 2.0029139518737793, + "learning_rate": 1.0383418153070806e-05, + "loss": 5.5097, + "step": 16040 + }, + { + "epoch": 0.79, + "grad_norm": 1.93049156665802, + "learning_rate": 1.0373536241909186e-05, + "loss": 5.5088, + "step": 16044 + }, + { + "epoch": 0.79, + "grad_norm": 1.9566268920898438, + "learning_rate": 1.0363654330747566e-05, + "loss": 5.509, + "step": 16048 + }, + { + "epoch": 0.79, + "grad_norm": 1.9150116443634033, + "learning_rate": 1.0353772419585948e-05, + "loss": 5.432, + "step": 16052 + }, + { + "epoch": 0.79, + "grad_norm": 1.9457064867019653, + "learning_rate": 1.0343890508424329e-05, + "loss": 5.5555, + "step": 16056 + }, + { + "epoch": 0.79, + "grad_norm": 2.037177562713623, + "learning_rate": 1.0334008597262711e-05, + "loss": 5.4792, + "step": 16060 + }, + { + "epoch": 0.79, + "grad_norm": 2.0516912937164307, + "learning_rate": 1.0324126686101091e-05, + "loss": 5.5053, + "step": 16064 + }, + { + "epoch": 0.79, + "grad_norm": 1.7669730186462402, + "learning_rate": 1.0314244774939475e-05, + "loss": 5.3994, + "step": 16068 + }, + { + "epoch": 0.79, + "grad_norm": 2.1283509731292725, + "learning_rate": 1.0304362863777855e-05, + "loss": 5.466, + "step": 16072 + }, + { + "epoch": 0.79, + "grad_norm": 1.7946540117263794, + "learning_rate": 1.0294480952616237e-05, + "loss": 5.4215, + "step": 16076 + }, + { + "epoch": 0.79, + "grad_norm": 2.273894786834717, + "learning_rate": 1.0284599041454618e-05, + "loss": 5.408, + "step": 16080 + }, + { + "epoch": 0.79, + "grad_norm": 2.055126667022705, + "learning_rate": 1.0274717130292998e-05, + "loss": 5.5304, + "step": 16084 + }, + { + "epoch": 0.79, + "grad_norm": 2.0089943408966064, + "learning_rate": 1.026483521913138e-05, + "loss": 5.4147, + "step": 16088 + }, + { + "epoch": 0.8, + "grad_norm": 2.053406000137329, + "learning_rate": 1.025495330796976e-05, + "loss": 5.5624, + "step": 16092 + }, + { + "epoch": 0.8, + "grad_norm": 1.9701050519943237, + "learning_rate": 1.0245071396808144e-05, + "loss": 5.3615, + "step": 16096 + }, + { + "epoch": 0.8, + "grad_norm": 1.9590353965759277, + "learning_rate": 1.0235189485646525e-05, + "loss": 5.4707, + "step": 16100 + }, + { + "epoch": 0.8, + "grad_norm": 2.097073793411255, + "learning_rate": 1.0225307574484907e-05, + "loss": 5.4783, + "step": 16104 + }, + { + "epoch": 0.8, + "grad_norm": 1.9956692457199097, + "learning_rate": 1.0215425663323287e-05, + "loss": 5.4288, + "step": 16108 + }, + { + "epoch": 0.8, + "grad_norm": 1.8568942546844482, + "learning_rate": 1.020554375216167e-05, + "loss": 5.5691, + "step": 16112 + }, + { + "epoch": 0.8, + "grad_norm": 1.9717293977737427, + "learning_rate": 1.019566184100005e-05, + "loss": 5.4763, + "step": 16116 + }, + { + "epoch": 0.8, + "grad_norm": 2.085775375366211, + "learning_rate": 1.018577992983843e-05, + "loss": 5.5406, + "step": 16120 + }, + { + "epoch": 0.8, + "grad_norm": 1.9938271045684814, + "learning_rate": 1.0175898018676814e-05, + "loss": 5.52, + "step": 16124 + }, + { + "epoch": 0.8, + "grad_norm": 2.1408092975616455, + "learning_rate": 1.0166016107515194e-05, + "loss": 5.4718, + "step": 16128 + }, + { + "epoch": 0.8, + "grad_norm": 2.084689140319824, + "learning_rate": 1.0156134196353576e-05, + "loss": 5.5263, + "step": 16132 + }, + { + "epoch": 0.8, + "grad_norm": 1.8502254486083984, + "learning_rate": 1.0146252285191956e-05, + "loss": 5.4056, + "step": 16136 + }, + { + "epoch": 0.8, + "grad_norm": 1.832261323928833, + "learning_rate": 1.0136370374030339e-05, + "loss": 5.4597, + "step": 16140 + }, + { + "epoch": 0.8, + "grad_norm": 1.9201068878173828, + "learning_rate": 1.0126488462868719e-05, + "loss": 5.5378, + "step": 16144 + }, + { + "epoch": 0.8, + "grad_norm": 1.8467752933502197, + "learning_rate": 1.0116606551707101e-05, + "loss": 5.4567, + "step": 16148 + }, + { + "epoch": 0.8, + "grad_norm": 2.1131222248077393, + "learning_rate": 1.0106724640545481e-05, + "loss": 5.35, + "step": 16152 + }, + { + "epoch": 0.8, + "grad_norm": 2.0209567546844482, + "learning_rate": 1.0096842729383863e-05, + "loss": 5.5339, + "step": 16156 + }, + { + "epoch": 0.8, + "grad_norm": 2.1684625148773193, + "learning_rate": 1.0086960818222245e-05, + "loss": 5.4267, + "step": 16160 + }, + { + "epoch": 0.8, + "grad_norm": 2.1010987758636475, + "learning_rate": 1.0077078907060626e-05, + "loss": 5.4042, + "step": 16164 + }, + { + "epoch": 0.8, + "grad_norm": 1.9701296091079712, + "learning_rate": 1.0067196995899008e-05, + "loss": 5.6227, + "step": 16168 + }, + { + "epoch": 0.8, + "grad_norm": 1.8271695375442505, + "learning_rate": 1.0057315084737388e-05, + "loss": 5.4151, + "step": 16172 + }, + { + "epoch": 0.8, + "grad_norm": 2.2199959754943848, + "learning_rate": 1.004743317357577e-05, + "loss": 5.5352, + "step": 16176 + }, + { + "epoch": 0.8, + "grad_norm": 2.2069809436798096, + "learning_rate": 1.003755126241415e-05, + "loss": 5.4987, + "step": 16180 + }, + { + "epoch": 0.8, + "grad_norm": 2.027318239212036, + "learning_rate": 1.0027669351252533e-05, + "loss": 5.5284, + "step": 16184 + }, + { + "epoch": 0.8, + "grad_norm": 1.9697614908218384, + "learning_rate": 1.0017787440090915e-05, + "loss": 5.5906, + "step": 16188 + }, + { + "epoch": 0.8, + "grad_norm": 2.252358913421631, + "learning_rate": 1.0007905528929295e-05, + "loss": 5.4911, + "step": 16192 + }, + { + "epoch": 0.8, + "grad_norm": 2.1323323249816895, + "learning_rate": 9.998023617767677e-06, + "loss": 5.5276, + "step": 16196 + }, + { + "epoch": 0.8, + "grad_norm": 2.277160167694092, + "learning_rate": 9.988141706606058e-06, + "loss": 5.4718, + "step": 16200 + }, + { + "epoch": 0.8, + "grad_norm": 1.8769733905792236, + "learning_rate": 9.97825979544444e-06, + "loss": 5.3622, + "step": 16204 + }, + { + "epoch": 0.8, + "grad_norm": 1.8731465339660645, + "learning_rate": 9.96837788428282e-06, + "loss": 5.5215, + "step": 16208 + }, + { + "epoch": 0.8, + "grad_norm": 1.8808999061584473, + "learning_rate": 9.958495973121202e-06, + "loss": 5.3953, + "step": 16212 + }, + { + "epoch": 0.8, + "grad_norm": 1.8657152652740479, + "learning_rate": 9.948614061959584e-06, + "loss": 5.3316, + "step": 16216 + }, + { + "epoch": 0.8, + "grad_norm": 1.849173665046692, + "learning_rate": 9.938732150797965e-06, + "loss": 5.4471, + "step": 16220 + }, + { + "epoch": 0.8, + "grad_norm": 2.220717668533325, + "learning_rate": 9.928850239636347e-06, + "loss": 5.5204, + "step": 16224 + }, + { + "epoch": 0.8, + "grad_norm": 2.0210342407226562, + "learning_rate": 9.918968328474727e-06, + "loss": 5.3364, + "step": 16228 + }, + { + "epoch": 0.8, + "grad_norm": 1.9695372581481934, + "learning_rate": 9.909086417313109e-06, + "loss": 5.485, + "step": 16232 + }, + { + "epoch": 0.8, + "grad_norm": 1.875001311302185, + "learning_rate": 9.89920450615149e-06, + "loss": 5.3977, + "step": 16236 + }, + { + "epoch": 0.8, + "grad_norm": 2.136852741241455, + "learning_rate": 9.889322594989871e-06, + "loss": 5.5784, + "step": 16240 + }, + { + "epoch": 0.8, + "grad_norm": 1.8972970247268677, + "learning_rate": 9.879440683828253e-06, + "loss": 5.4973, + "step": 16244 + }, + { + "epoch": 0.8, + "grad_norm": 2.14034366607666, + "learning_rate": 9.869558772666634e-06, + "loss": 5.546, + "step": 16248 + }, + { + "epoch": 0.8, + "grad_norm": 2.0028293132781982, + "learning_rate": 9.859676861505016e-06, + "loss": 5.3869, + "step": 16252 + }, + { + "epoch": 0.8, + "grad_norm": 1.9485490322113037, + "learning_rate": 9.849794950343396e-06, + "loss": 5.4852, + "step": 16256 + }, + { + "epoch": 0.8, + "grad_norm": 1.9260238409042358, + "learning_rate": 9.839913039181778e-06, + "loss": 5.473, + "step": 16260 + }, + { + "epoch": 0.8, + "grad_norm": 2.1630001068115234, + "learning_rate": 9.830031128020159e-06, + "loss": 5.4207, + "step": 16264 + }, + { + "epoch": 0.8, + "grad_norm": 1.9328508377075195, + "learning_rate": 9.82014921685854e-06, + "loss": 5.4433, + "step": 16268 + }, + { + "epoch": 0.8, + "grad_norm": 1.9832642078399658, + "learning_rate": 9.810267305696923e-06, + "loss": 5.4387, + "step": 16272 + }, + { + "epoch": 0.8, + "grad_norm": 2.0616111755371094, + "learning_rate": 9.800385394535303e-06, + "loss": 5.3895, + "step": 16276 + }, + { + "epoch": 0.8, + "grad_norm": 2.1709606647491455, + "learning_rate": 9.790503483373685e-06, + "loss": 5.5639, + "step": 16280 + }, + { + "epoch": 0.8, + "grad_norm": 2.1553335189819336, + "learning_rate": 9.780621572212066e-06, + "loss": 5.5419, + "step": 16284 + }, + { + "epoch": 0.8, + "grad_norm": 1.7869793176651, + "learning_rate": 9.770739661050448e-06, + "loss": 5.4425, + "step": 16288 + }, + { + "epoch": 0.8, + "grad_norm": 1.902388095855713, + "learning_rate": 9.760857749888828e-06, + "loss": 5.3651, + "step": 16292 + }, + { + "epoch": 0.81, + "grad_norm": 2.135723829269409, + "learning_rate": 9.750975838727212e-06, + "loss": 5.5838, + "step": 16296 + }, + { + "epoch": 0.81, + "grad_norm": 2.3749701976776123, + "learning_rate": 9.741093927565592e-06, + "loss": 5.5562, + "step": 16300 + }, + { + "epoch": 0.81, + "grad_norm": 1.9421731233596802, + "learning_rate": 9.731212016403973e-06, + "loss": 5.432, + "step": 16304 + }, + { + "epoch": 0.81, + "grad_norm": 1.87296462059021, + "learning_rate": 9.721330105242355e-06, + "loss": 5.5429, + "step": 16308 + }, + { + "epoch": 0.81, + "grad_norm": 2.0159077644348145, + "learning_rate": 9.711448194080735e-06, + "loss": 5.5088, + "step": 16312 + }, + { + "epoch": 0.81, + "grad_norm": 1.9539657831192017, + "learning_rate": 9.701566282919117e-06, + "loss": 5.49, + "step": 16316 + }, + { + "epoch": 0.81, + "grad_norm": 2.0840256214141846, + "learning_rate": 9.691684371757497e-06, + "loss": 5.4972, + "step": 16320 + }, + { + "epoch": 0.81, + "grad_norm": 2.0817222595214844, + "learning_rate": 9.68180246059588e-06, + "loss": 5.5185, + "step": 16324 + }, + { + "epoch": 0.81, + "grad_norm": 2.172551393508911, + "learning_rate": 9.671920549434262e-06, + "loss": 5.5465, + "step": 16328 + }, + { + "epoch": 0.81, + "grad_norm": 1.8779146671295166, + "learning_rate": 9.662038638272644e-06, + "loss": 5.5047, + "step": 16332 + }, + { + "epoch": 0.81, + "grad_norm": 1.9017332792282104, + "learning_rate": 9.652156727111024e-06, + "loss": 5.4705, + "step": 16336 + }, + { + "epoch": 0.81, + "grad_norm": 2.0198209285736084, + "learning_rate": 9.642274815949404e-06, + "loss": 5.4886, + "step": 16340 + }, + { + "epoch": 0.81, + "grad_norm": 2.0686516761779785, + "learning_rate": 9.632392904787786e-06, + "loss": 5.5188, + "step": 16344 + }, + { + "epoch": 0.81, + "grad_norm": 1.962902545928955, + "learning_rate": 9.622510993626167e-06, + "loss": 5.5414, + "step": 16348 + }, + { + "epoch": 0.81, + "grad_norm": 2.0159122943878174, + "learning_rate": 9.612629082464549e-06, + "loss": 5.422, + "step": 16352 + }, + { + "epoch": 0.81, + "grad_norm": 2.2383811473846436, + "learning_rate": 9.60274717130293e-06, + "loss": 5.5384, + "step": 16356 + }, + { + "epoch": 0.81, + "grad_norm": 1.993220567703247, + "learning_rate": 9.592865260141313e-06, + "loss": 5.3551, + "step": 16360 + }, + { + "epoch": 0.81, + "grad_norm": 2.130995035171509, + "learning_rate": 9.582983348979693e-06, + "loss": 5.4926, + "step": 16364 + }, + { + "epoch": 0.81, + "grad_norm": 1.9019581079483032, + "learning_rate": 9.573101437818074e-06, + "loss": 5.5623, + "step": 16368 + }, + { + "epoch": 0.81, + "grad_norm": 2.1175732612609863, + "learning_rate": 9.563219526656456e-06, + "loss": 5.4529, + "step": 16372 + }, + { + "epoch": 0.81, + "grad_norm": 1.8680050373077393, + "learning_rate": 9.553337615494836e-06, + "loss": 5.4273, + "step": 16376 + }, + { + "epoch": 0.81, + "grad_norm": 2.0988821983337402, + "learning_rate": 9.543455704333218e-06, + "loss": 5.477, + "step": 16380 + }, + { + "epoch": 0.81, + "grad_norm": 2.198651075363159, + "learning_rate": 9.5335737931716e-06, + "loss": 5.5427, + "step": 16384 + }, + { + "epoch": 0.81, + "grad_norm": 2.1054868698120117, + "learning_rate": 9.523691882009982e-06, + "loss": 5.4173, + "step": 16388 + }, + { + "epoch": 0.81, + "grad_norm": 2.0211637020111084, + "learning_rate": 9.513809970848363e-06, + "loss": 5.6035, + "step": 16392 + }, + { + "epoch": 0.81, + "grad_norm": 2.0547540187835693, + "learning_rate": 9.503928059686745e-06, + "loss": 5.4983, + "step": 16396 + }, + { + "epoch": 0.81, + "grad_norm": 1.9957647323608398, + "learning_rate": 9.494046148525125e-06, + "loss": 5.5374, + "step": 16400 + }, + { + "epoch": 0.81, + "grad_norm": 2.0535998344421387, + "learning_rate": 9.484164237363505e-06, + "loss": 5.5643, + "step": 16404 + }, + { + "epoch": 0.81, + "grad_norm": 2.1046228408813477, + "learning_rate": 9.474282326201887e-06, + "loss": 5.4727, + "step": 16408 + }, + { + "epoch": 0.81, + "grad_norm": 2.1698873043060303, + "learning_rate": 9.46440041504027e-06, + "loss": 5.5637, + "step": 16412 + }, + { + "epoch": 0.81, + "grad_norm": 1.9448730945587158, + "learning_rate": 9.454518503878652e-06, + "loss": 5.4932, + "step": 16416 + }, + { + "epoch": 0.81, + "grad_norm": 2.0663201808929443, + "learning_rate": 9.444636592717032e-06, + "loss": 5.5067, + "step": 16420 + }, + { + "epoch": 0.81, + "grad_norm": 1.8125361204147339, + "learning_rate": 9.434754681555414e-06, + "loss": 5.5381, + "step": 16424 + }, + { + "epoch": 0.81, + "grad_norm": 2.077420473098755, + "learning_rate": 9.424872770393794e-06, + "loss": 5.5642, + "step": 16428 + }, + { + "epoch": 0.81, + "grad_norm": 1.9312140941619873, + "learning_rate": 9.414990859232176e-06, + "loss": 5.3661, + "step": 16432 + }, + { + "epoch": 0.81, + "grad_norm": 1.8529763221740723, + "learning_rate": 9.405108948070557e-06, + "loss": 5.4776, + "step": 16436 + }, + { + "epoch": 0.81, + "grad_norm": 1.8764407634735107, + "learning_rate": 9.395227036908939e-06, + "loss": 5.4928, + "step": 16440 + }, + { + "epoch": 0.81, + "grad_norm": 2.139594316482544, + "learning_rate": 9.385345125747321e-06, + "loss": 5.4108, + "step": 16444 + }, + { + "epoch": 0.81, + "grad_norm": 2.1105079650878906, + "learning_rate": 9.375463214585701e-06, + "loss": 5.4876, + "step": 16448 + }, + { + "epoch": 0.81, + "grad_norm": 1.9484140872955322, + "learning_rate": 9.365581303424083e-06, + "loss": 5.4148, + "step": 16452 + }, + { + "epoch": 0.81, + "grad_norm": 2.0571391582489014, + "learning_rate": 9.355699392262464e-06, + "loss": 5.6282, + "step": 16456 + }, + { + "epoch": 0.81, + "grad_norm": 1.7625576257705688, + "learning_rate": 9.345817481100846e-06, + "loss": 5.5073, + "step": 16460 + }, + { + "epoch": 0.81, + "grad_norm": 2.1183722019195557, + "learning_rate": 9.335935569939226e-06, + "loss": 5.4659, + "step": 16464 + }, + { + "epoch": 0.81, + "grad_norm": 2.135255813598633, + "learning_rate": 9.326053658777608e-06, + "loss": 5.4262, + "step": 16468 + }, + { + "epoch": 0.81, + "grad_norm": 1.7497916221618652, + "learning_rate": 9.31617174761599e-06, + "loss": 5.537, + "step": 16472 + }, + { + "epoch": 0.81, + "grad_norm": 1.9297901391983032, + "learning_rate": 9.30628983645437e-06, + "loss": 5.4848, + "step": 16476 + }, + { + "epoch": 0.81, + "grad_norm": 1.992133617401123, + "learning_rate": 9.296407925292753e-06, + "loss": 5.5307, + "step": 16480 + }, + { + "epoch": 0.81, + "grad_norm": 1.8346421718597412, + "learning_rate": 9.286526014131133e-06, + "loss": 5.4071, + "step": 16484 + }, + { + "epoch": 0.81, + "grad_norm": 1.9083247184753418, + "learning_rate": 9.276644102969515e-06, + "loss": 5.4248, + "step": 16488 + }, + { + "epoch": 0.81, + "grad_norm": 2.0954606533050537, + "learning_rate": 9.266762191807895e-06, + "loss": 5.5175, + "step": 16492 + }, + { + "epoch": 0.82, + "grad_norm": 2.101158857345581, + "learning_rate": 9.256880280646278e-06, + "loss": 5.5345, + "step": 16496 + }, + { + "epoch": 0.82, + "grad_norm": 2.170283794403076, + "learning_rate": 9.24699836948466e-06, + "loss": 5.5719, + "step": 16500 + }, + { + "epoch": 0.82, + "grad_norm": 2.078697681427002, + "learning_rate": 9.23711645832304e-06, + "loss": 5.4199, + "step": 16504 + }, + { + "epoch": 0.82, + "grad_norm": 2.1254682540893555, + "learning_rate": 9.227234547161422e-06, + "loss": 5.3647, + "step": 16508 + }, + { + "epoch": 0.82, + "grad_norm": 2.2468132972717285, + "learning_rate": 9.217352635999802e-06, + "loss": 5.4673, + "step": 16512 + }, + { + "epoch": 0.82, + "grad_norm": 2.053579330444336, + "learning_rate": 9.207470724838184e-06, + "loss": 5.4083, + "step": 16516 + }, + { + "epoch": 0.82, + "grad_norm": 2.2088301181793213, + "learning_rate": 9.197588813676565e-06, + "loss": 5.5694, + "step": 16520 + }, + { + "epoch": 0.82, + "grad_norm": 1.9974719285964966, + "learning_rate": 9.187706902514947e-06, + "loss": 5.4396, + "step": 16524 + }, + { + "epoch": 0.82, + "grad_norm": 2.066420555114746, + "learning_rate": 9.177824991353329e-06, + "loss": 5.5418, + "step": 16528 + }, + { + "epoch": 0.82, + "grad_norm": 2.2263917922973633, + "learning_rate": 9.16794308019171e-06, + "loss": 5.4904, + "step": 16532 + }, + { + "epoch": 0.82, + "grad_norm": 2.0956313610076904, + "learning_rate": 9.158061169030091e-06, + "loss": 5.47, + "step": 16536 + }, + { + "epoch": 0.82, + "grad_norm": 2.120701551437378, + "learning_rate": 9.148179257868472e-06, + "loss": 5.4498, + "step": 16540 + }, + { + "epoch": 0.82, + "grad_norm": 2.068040609359741, + "learning_rate": 9.138297346706854e-06, + "loss": 5.5373, + "step": 16544 + }, + { + "epoch": 0.82, + "grad_norm": 2.125322103500366, + "learning_rate": 9.128415435545234e-06, + "loss": 5.4782, + "step": 16548 + }, + { + "epoch": 0.82, + "grad_norm": 1.8984034061431885, + "learning_rate": 9.118533524383616e-06, + "loss": 5.5614, + "step": 16552 + }, + { + "epoch": 0.82, + "grad_norm": 2.038201332092285, + "learning_rate": 9.108651613221998e-06, + "loss": 5.5308, + "step": 16556 + }, + { + "epoch": 0.82, + "grad_norm": 2.083704948425293, + "learning_rate": 9.098769702060379e-06, + "loss": 5.3986, + "step": 16560 + }, + { + "epoch": 0.82, + "grad_norm": 2.2222559452056885, + "learning_rate": 9.08888779089876e-06, + "loss": 5.4164, + "step": 16564 + }, + { + "epoch": 0.82, + "grad_norm": 2.319937229156494, + "learning_rate": 9.079005879737141e-06, + "loss": 5.5564, + "step": 16568 + }, + { + "epoch": 0.82, + "grad_norm": 1.930305004119873, + "learning_rate": 9.069123968575523e-06, + "loss": 5.5181, + "step": 16572 + }, + { + "epoch": 0.82, + "grad_norm": 2.175090789794922, + "learning_rate": 9.059242057413904e-06, + "loss": 5.4816, + "step": 16576 + }, + { + "epoch": 0.82, + "grad_norm": 1.9041398763656616, + "learning_rate": 9.049360146252286e-06, + "loss": 5.4462, + "step": 16580 + }, + { + "epoch": 0.82, + "grad_norm": 1.92721426486969, + "learning_rate": 9.039478235090668e-06, + "loss": 5.4993, + "step": 16584 + }, + { + "epoch": 0.82, + "grad_norm": 2.098320484161377, + "learning_rate": 9.029596323929048e-06, + "loss": 5.4334, + "step": 16588 + }, + { + "epoch": 0.82, + "grad_norm": 1.988296389579773, + "learning_rate": 9.01971441276743e-06, + "loss": 5.5241, + "step": 16592 + }, + { + "epoch": 0.82, + "grad_norm": 1.9492675065994263, + "learning_rate": 9.00983250160581e-06, + "loss": 5.4024, + "step": 16596 + }, + { + "epoch": 0.82, + "grad_norm": 2.0381886959075928, + "learning_rate": 8.999950590444192e-06, + "loss": 5.4861, + "step": 16600 + }, + { + "epoch": 0.82, + "grad_norm": 2.2015647888183594, + "learning_rate": 8.990068679282573e-06, + "loss": 5.4589, + "step": 16604 + }, + { + "epoch": 0.82, + "grad_norm": 1.970509648323059, + "learning_rate": 8.980186768120955e-06, + "loss": 5.5072, + "step": 16608 + }, + { + "epoch": 0.82, + "grad_norm": 2.048265218734741, + "learning_rate": 8.970304856959337e-06, + "loss": 5.4781, + "step": 16612 + }, + { + "epoch": 0.82, + "grad_norm": 2.1781177520751953, + "learning_rate": 8.960422945797719e-06, + "loss": 5.4382, + "step": 16616 + }, + { + "epoch": 0.82, + "grad_norm": 2.3206918239593506, + "learning_rate": 8.9505410346361e-06, + "loss": 5.5371, + "step": 16620 + }, + { + "epoch": 0.82, + "grad_norm": 2.129166603088379, + "learning_rate": 8.94065912347448e-06, + "loss": 5.4773, + "step": 16624 + }, + { + "epoch": 0.82, + "grad_norm": 1.8786160945892334, + "learning_rate": 8.930777212312862e-06, + "loss": 5.3599, + "step": 16628 + }, + { + "epoch": 0.82, + "grad_norm": 2.114015579223633, + "learning_rate": 8.920895301151242e-06, + "loss": 5.5262, + "step": 16632 + }, + { + "epoch": 0.82, + "grad_norm": 1.9730268716812134, + "learning_rate": 8.911013389989624e-06, + "loss": 5.3921, + "step": 16636 + }, + { + "epoch": 0.82, + "grad_norm": 2.007050037384033, + "learning_rate": 8.901131478828005e-06, + "loss": 5.4521, + "step": 16640 + }, + { + "epoch": 0.82, + "grad_norm": 1.9782698154449463, + "learning_rate": 8.891249567666388e-06, + "loss": 5.4024, + "step": 16644 + }, + { + "epoch": 0.82, + "grad_norm": 2.118109941482544, + "learning_rate": 8.881367656504769e-06, + "loss": 5.4549, + "step": 16648 + }, + { + "epoch": 0.82, + "grad_norm": 1.9269788265228271, + "learning_rate": 8.871485745343149e-06, + "loss": 5.3879, + "step": 16652 + }, + { + "epoch": 0.82, + "grad_norm": 2.0356998443603516, + "learning_rate": 8.861603834181531e-06, + "loss": 5.4729, + "step": 16656 + }, + { + "epoch": 0.82, + "grad_norm": 1.9945244789123535, + "learning_rate": 8.851721923019912e-06, + "loss": 5.489, + "step": 16660 + }, + { + "epoch": 0.82, + "grad_norm": 2.1839029788970947, + "learning_rate": 8.841840011858294e-06, + "loss": 5.3626, + "step": 16664 + }, + { + "epoch": 0.82, + "grad_norm": 1.9056282043457031, + "learning_rate": 8.831958100696674e-06, + "loss": 5.5155, + "step": 16668 + }, + { + "epoch": 0.82, + "grad_norm": 1.971134066581726, + "learning_rate": 8.822076189535058e-06, + "loss": 5.4582, + "step": 16672 + }, + { + "epoch": 0.82, + "grad_norm": 2.2608683109283447, + "learning_rate": 8.812194278373438e-06, + "loss": 5.5671, + "step": 16676 + }, + { + "epoch": 0.82, + "grad_norm": 1.8873885869979858, + "learning_rate": 8.80231236721182e-06, + "loss": 5.5378, + "step": 16680 + }, + { + "epoch": 0.82, + "grad_norm": 2.1268815994262695, + "learning_rate": 8.7924304560502e-06, + "loss": 5.5464, + "step": 16684 + }, + { + "epoch": 0.82, + "grad_norm": 2.120333671569824, + "learning_rate": 8.782548544888581e-06, + "loss": 5.4566, + "step": 16688 + }, + { + "epoch": 0.82, + "grad_norm": 2.202099323272705, + "learning_rate": 8.772666633726963e-06, + "loss": 5.3333, + "step": 16692 + }, + { + "epoch": 0.82, + "grad_norm": 1.8824645280838013, + "learning_rate": 8.762784722565343e-06, + "loss": 5.5016, + "step": 16696 + }, + { + "epoch": 0.83, + "grad_norm": 2.0147457122802734, + "learning_rate": 8.752902811403727e-06, + "loss": 5.4586, + "step": 16700 + }, + { + "epoch": 0.83, + "grad_norm": 2.041895627975464, + "learning_rate": 8.743020900242107e-06, + "loss": 5.5022, + "step": 16704 + }, + { + "epoch": 0.83, + "grad_norm": 2.077690362930298, + "learning_rate": 8.73313898908049e-06, + "loss": 5.5781, + "step": 16708 + }, + { + "epoch": 0.83, + "grad_norm": 1.9477964639663696, + "learning_rate": 8.72325707791887e-06, + "loss": 5.5584, + "step": 16712 + }, + { + "epoch": 0.83, + "grad_norm": 1.8398356437683105, + "learning_rate": 8.713375166757252e-06, + "loss": 5.5032, + "step": 16716 + }, + { + "epoch": 0.83, + "grad_norm": 1.9973992109298706, + "learning_rate": 8.703493255595632e-06, + "loss": 5.37, + "step": 16720 + }, + { + "epoch": 0.83, + "grad_norm": 2.0176520347595215, + "learning_rate": 8.693611344434013e-06, + "loss": 5.5138, + "step": 16724 + }, + { + "epoch": 0.83, + "grad_norm": 2.1837217807769775, + "learning_rate": 8.683729433272396e-06, + "loss": 5.5308, + "step": 16728 + }, + { + "epoch": 0.83, + "grad_norm": 2.0156595706939697, + "learning_rate": 8.673847522110777e-06, + "loss": 5.5722, + "step": 16732 + }, + { + "epoch": 0.83, + "grad_norm": 2.3676466941833496, + "learning_rate": 8.663965610949159e-06, + "loss": 5.5775, + "step": 16736 + }, + { + "epoch": 0.83, + "grad_norm": 2.270716667175293, + "learning_rate": 8.65408369978754e-06, + "loss": 5.4107, + "step": 16740 + }, + { + "epoch": 0.83, + "grad_norm": 1.9551721811294556, + "learning_rate": 8.644201788625921e-06, + "loss": 5.6022, + "step": 16744 + }, + { + "epoch": 0.83, + "grad_norm": 1.9827896356582642, + "learning_rate": 8.634319877464302e-06, + "loss": 5.4923, + "step": 16748 + }, + { + "epoch": 0.83, + "grad_norm": 2.1360533237457275, + "learning_rate": 8.624437966302682e-06, + "loss": 5.4328, + "step": 16752 + }, + { + "epoch": 0.83, + "grad_norm": 2.094109296798706, + "learning_rate": 8.614556055141066e-06, + "loss": 5.416, + "step": 16756 + }, + { + "epoch": 0.83, + "grad_norm": 1.9513869285583496, + "learning_rate": 8.604674143979446e-06, + "loss": 5.403, + "step": 16760 + }, + { + "epoch": 0.83, + "grad_norm": 2.0020523071289062, + "learning_rate": 8.594792232817828e-06, + "loss": 5.4628, + "step": 16764 + }, + { + "epoch": 0.83, + "grad_norm": 2.405801296234131, + "learning_rate": 8.584910321656209e-06, + "loss": 5.5799, + "step": 16768 + }, + { + "epoch": 0.83, + "grad_norm": 2.114650249481201, + "learning_rate": 8.57502841049459e-06, + "loss": 5.4151, + "step": 16772 + }, + { + "epoch": 0.83, + "grad_norm": 1.9269883632659912, + "learning_rate": 8.565146499332971e-06, + "loss": 5.4321, + "step": 16776 + }, + { + "epoch": 0.83, + "grad_norm": 1.894822597503662, + "learning_rate": 8.555264588171353e-06, + "loss": 5.4419, + "step": 16780 + }, + { + "epoch": 0.83, + "grad_norm": 2.0186209678649902, + "learning_rate": 8.545382677009735e-06, + "loss": 5.5657, + "step": 16784 + }, + { + "epoch": 0.83, + "grad_norm": 1.920372486114502, + "learning_rate": 8.535500765848115e-06, + "loss": 5.5334, + "step": 16788 + }, + { + "epoch": 0.83, + "grad_norm": 2.1055715084075928, + "learning_rate": 8.525618854686497e-06, + "loss": 5.4952, + "step": 16792 + }, + { + "epoch": 0.83, + "grad_norm": 2.0137712955474854, + "learning_rate": 8.515736943524878e-06, + "loss": 5.5128, + "step": 16796 + }, + { + "epoch": 0.83, + "grad_norm": 2.0473227500915527, + "learning_rate": 8.50585503236326e-06, + "loss": 5.6793, + "step": 16800 + }, + { + "epoch": 0.83, + "grad_norm": 2.0757429599761963, + "learning_rate": 8.49597312120164e-06, + "loss": 5.4888, + "step": 16804 + }, + { + "epoch": 0.83, + "grad_norm": 1.9529187679290771, + "learning_rate": 8.486091210040022e-06, + "loss": 5.2731, + "step": 16808 + }, + { + "epoch": 0.83, + "grad_norm": 1.897220492362976, + "learning_rate": 8.476209298878403e-06, + "loss": 5.4103, + "step": 16812 + }, + { + "epoch": 0.83, + "grad_norm": 2.0591204166412354, + "learning_rate": 8.466327387716785e-06, + "loss": 5.5609, + "step": 16816 + }, + { + "epoch": 0.83, + "grad_norm": 1.8429813385009766, + "learning_rate": 8.456445476555167e-06, + "loss": 5.3453, + "step": 16820 + }, + { + "epoch": 0.83, + "grad_norm": 1.854067087173462, + "learning_rate": 8.446563565393547e-06, + "loss": 5.4855, + "step": 16824 + }, + { + "epoch": 0.83, + "grad_norm": 1.87723708152771, + "learning_rate": 8.43668165423193e-06, + "loss": 5.5087, + "step": 16828 + }, + { + "epoch": 0.83, + "grad_norm": 2.25486159324646, + "learning_rate": 8.42679974307031e-06, + "loss": 5.6419, + "step": 16832 + }, + { + "epoch": 0.83, + "grad_norm": 2.1054129600524902, + "learning_rate": 8.416917831908692e-06, + "loss": 5.4713, + "step": 16836 + }, + { + "epoch": 0.83, + "grad_norm": 1.9546363353729248, + "learning_rate": 8.407035920747072e-06, + "loss": 5.51, + "step": 16840 + }, + { + "epoch": 0.83, + "grad_norm": 1.8574483394622803, + "learning_rate": 8.397154009585454e-06, + "loss": 5.4273, + "step": 16844 + }, + { + "epoch": 0.83, + "grad_norm": 2.135690927505493, + "learning_rate": 8.387272098423836e-06, + "loss": 5.4408, + "step": 16848 + }, + { + "epoch": 0.83, + "grad_norm": 1.9898631572723389, + "learning_rate": 8.377390187262217e-06, + "loss": 5.2905, + "step": 16852 + }, + { + "epoch": 0.83, + "grad_norm": 2.016470193862915, + "learning_rate": 8.367508276100599e-06, + "loss": 5.3266, + "step": 16856 + }, + { + "epoch": 0.83, + "grad_norm": 1.9973735809326172, + "learning_rate": 8.357626364938979e-06, + "loss": 5.4987, + "step": 16860 + }, + { + "epoch": 0.83, + "grad_norm": 2.116567373275757, + "learning_rate": 8.347744453777361e-06, + "loss": 5.4306, + "step": 16864 + }, + { + "epoch": 0.83, + "grad_norm": 2.044475793838501, + "learning_rate": 8.337862542615741e-06, + "loss": 5.4907, + "step": 16868 + }, + { + "epoch": 0.83, + "grad_norm": 2.090527296066284, + "learning_rate": 8.327980631454123e-06, + "loss": 5.3698, + "step": 16872 + }, + { + "epoch": 0.83, + "grad_norm": 1.98384428024292, + "learning_rate": 8.318098720292506e-06, + "loss": 5.469, + "step": 16876 + }, + { + "epoch": 0.83, + "grad_norm": 1.775121808052063, + "learning_rate": 8.308216809130886e-06, + "loss": 5.461, + "step": 16880 + }, + { + "epoch": 0.83, + "grad_norm": 1.9661427736282349, + "learning_rate": 8.298334897969268e-06, + "loss": 5.494, + "step": 16884 + }, + { + "epoch": 0.83, + "grad_norm": 2.0031895637512207, + "learning_rate": 8.288452986807648e-06, + "loss": 5.5011, + "step": 16888 + }, + { + "epoch": 0.83, + "grad_norm": 2.221911907196045, + "learning_rate": 8.27857107564603e-06, + "loss": 5.4296, + "step": 16892 + }, + { + "epoch": 0.83, + "grad_norm": 2.0504343509674072, + "learning_rate": 8.26868916448441e-06, + "loss": 5.5495, + "step": 16896 + }, + { + "epoch": 0.84, + "grad_norm": 2.1068339347839355, + "learning_rate": 8.258807253322794e-06, + "loss": 5.408, + "step": 16900 + }, + { + "epoch": 0.84, + "grad_norm": 2.0044867992401123, + "learning_rate": 8.248925342161175e-06, + "loss": 5.512, + "step": 16904 + }, + { + "epoch": 0.84, + "grad_norm": 2.3192813396453857, + "learning_rate": 8.239043430999555e-06, + "loss": 5.4185, + "step": 16908 + }, + { + "epoch": 0.84, + "grad_norm": 1.8410991430282593, + "learning_rate": 8.229161519837937e-06, + "loss": 5.4222, + "step": 16912 + }, + { + "epoch": 0.84, + "grad_norm": 2.0134191513061523, + "learning_rate": 8.219279608676318e-06, + "loss": 5.4357, + "step": 16916 + }, + { + "epoch": 0.84, + "grad_norm": 2.0390844345092773, + "learning_rate": 8.2093976975147e-06, + "loss": 5.5363, + "step": 16920 + }, + { + "epoch": 0.84, + "grad_norm": 2.12786602973938, + "learning_rate": 8.19951578635308e-06, + "loss": 5.31, + "step": 16924 + }, + { + "epoch": 0.84, + "grad_norm": 1.9766027927398682, + "learning_rate": 8.189633875191464e-06, + "loss": 5.534, + "step": 16928 + }, + { + "epoch": 0.84, + "grad_norm": 1.7689497470855713, + "learning_rate": 8.179751964029844e-06, + "loss": 5.3465, + "step": 16932 + }, + { + "epoch": 0.84, + "grad_norm": 2.117271900177002, + "learning_rate": 8.169870052868225e-06, + "loss": 5.4583, + "step": 16936 + }, + { + "epoch": 0.84, + "grad_norm": 2.0808498859405518, + "learning_rate": 8.159988141706607e-06, + "loss": 5.477, + "step": 16940 + }, + { + "epoch": 0.84, + "grad_norm": 2.0178062915802, + "learning_rate": 8.150106230544987e-06, + "loss": 5.5178, + "step": 16944 + }, + { + "epoch": 0.84, + "grad_norm": 1.7878342866897583, + "learning_rate": 8.140224319383369e-06, + "loss": 5.533, + "step": 16948 + }, + { + "epoch": 0.84, + "grad_norm": 2.0112874507904053, + "learning_rate": 8.13034240822175e-06, + "loss": 5.342, + "step": 16952 + }, + { + "epoch": 0.84, + "grad_norm": 2.224484443664551, + "learning_rate": 8.120460497060133e-06, + "loss": 5.4574, + "step": 16956 + }, + { + "epoch": 0.84, + "grad_norm": 2.29886531829834, + "learning_rate": 8.110578585898514e-06, + "loss": 5.5541, + "step": 16960 + }, + { + "epoch": 0.84, + "grad_norm": 1.8924994468688965, + "learning_rate": 8.100696674736896e-06, + "loss": 5.4196, + "step": 16964 + }, + { + "epoch": 0.84, + "grad_norm": 2.2159488201141357, + "learning_rate": 8.090814763575276e-06, + "loss": 5.4989, + "step": 16968 + }, + { + "epoch": 0.84, + "grad_norm": 2.170715570449829, + "learning_rate": 8.080932852413656e-06, + "loss": 5.5074, + "step": 16972 + }, + { + "epoch": 0.84, + "grad_norm": 2.0515708923339844, + "learning_rate": 8.071050941252038e-06, + "loss": 5.4033, + "step": 16976 + }, + { + "epoch": 0.84, + "grad_norm": 2.0467865467071533, + "learning_rate": 8.061169030090419e-06, + "loss": 5.4566, + "step": 16980 + }, + { + "epoch": 0.84, + "grad_norm": 1.9163670539855957, + "learning_rate": 8.0512871189288e-06, + "loss": 5.5335, + "step": 16984 + }, + { + "epoch": 0.84, + "grad_norm": 2.1390318870544434, + "learning_rate": 8.041405207767183e-06, + "loss": 5.5707, + "step": 16988 + }, + { + "epoch": 0.84, + "grad_norm": 1.9965319633483887, + "learning_rate": 8.031523296605565e-06, + "loss": 5.4165, + "step": 16992 + }, + { + "epoch": 0.84, + "grad_norm": 2.137233257293701, + "learning_rate": 8.021641385443945e-06, + "loss": 5.3596, + "step": 16996 + }, + { + "epoch": 0.84, + "grad_norm": 2.152256727218628, + "learning_rate": 8.011759474282327e-06, + "loss": 5.3396, + "step": 17000 + }, + { + "epoch": 0.84, + "grad_norm": 2.28680682182312, + "learning_rate": 8.001877563120708e-06, + "loss": 5.5507, + "step": 17004 + }, + { + "epoch": 0.84, + "grad_norm": 2.26821231842041, + "learning_rate": 7.991995651959088e-06, + "loss": 5.4969, + "step": 17008 + }, + { + "epoch": 0.84, + "grad_norm": 2.275667428970337, + "learning_rate": 7.98211374079747e-06, + "loss": 5.441, + "step": 17012 + }, + { + "epoch": 0.84, + "grad_norm": 2.080756902694702, + "learning_rate": 7.972231829635852e-06, + "loss": 5.4398, + "step": 17016 + }, + { + "epoch": 0.84, + "grad_norm": 2.10422420501709, + "learning_rate": 7.962349918474234e-06, + "loss": 5.437, + "step": 17020 + }, + { + "epoch": 0.84, + "grad_norm": 1.858323335647583, + "learning_rate": 7.952468007312615e-06, + "loss": 5.5432, + "step": 17024 + }, + { + "epoch": 0.84, + "grad_norm": 2.4101650714874268, + "learning_rate": 7.942586096150997e-06, + "loss": 5.412, + "step": 17028 + }, + { + "epoch": 0.84, + "grad_norm": 2.2219436168670654, + "learning_rate": 7.932704184989377e-06, + "loss": 5.5279, + "step": 17032 + }, + { + "epoch": 0.84, + "grad_norm": 2.182474374771118, + "learning_rate": 7.922822273827757e-06, + "loss": 5.4662, + "step": 17036 + }, + { + "epoch": 0.84, + "grad_norm": 2.060351610183716, + "learning_rate": 7.91294036266614e-06, + "loss": 5.504, + "step": 17040 + }, + { + "epoch": 0.84, + "grad_norm": 2.1096701622009277, + "learning_rate": 7.903058451504522e-06, + "loss": 5.4539, + "step": 17044 + }, + { + "epoch": 0.84, + "grad_norm": 2.0492708683013916, + "learning_rate": 7.893176540342904e-06, + "loss": 5.4398, + "step": 17048 + }, + { + "epoch": 0.84, + "grad_norm": 2.032947301864624, + "learning_rate": 7.883294629181284e-06, + "loss": 5.5253, + "step": 17052 + }, + { + "epoch": 0.84, + "grad_norm": 2.0764636993408203, + "learning_rate": 7.873412718019666e-06, + "loss": 5.4655, + "step": 17056 + }, + { + "epoch": 0.84, + "grad_norm": 2.105656862258911, + "learning_rate": 7.863530806858046e-06, + "loss": 5.4761, + "step": 17060 + }, + { + "epoch": 0.84, + "grad_norm": 1.975953459739685, + "learning_rate": 7.853648895696428e-06, + "loss": 5.5364, + "step": 17064 + }, + { + "epoch": 0.84, + "grad_norm": 2.0592944622039795, + "learning_rate": 7.843766984534809e-06, + "loss": 5.4987, + "step": 17068 + }, + { + "epoch": 0.84, + "grad_norm": 2.1122117042541504, + "learning_rate": 7.833885073373191e-06, + "loss": 5.4162, + "step": 17072 + }, + { + "epoch": 0.84, + "grad_norm": 2.143172264099121, + "learning_rate": 7.824003162211573e-06, + "loss": 5.4959, + "step": 17076 + }, + { + "epoch": 0.84, + "grad_norm": 1.9919787645339966, + "learning_rate": 7.814121251049953e-06, + "loss": 5.469, + "step": 17080 + }, + { + "epoch": 0.84, + "grad_norm": 1.9146004915237427, + "learning_rate": 7.804239339888335e-06, + "loss": 5.4748, + "step": 17084 + }, + { + "epoch": 0.84, + "grad_norm": 2.3150486946105957, + "learning_rate": 7.794357428726716e-06, + "loss": 5.4056, + "step": 17088 + }, + { + "epoch": 0.84, + "grad_norm": 2.1717705726623535, + "learning_rate": 7.784475517565098e-06, + "loss": 5.5389, + "step": 17092 + }, + { + "epoch": 0.84, + "grad_norm": 2.1674489974975586, + "learning_rate": 7.774593606403478e-06, + "loss": 5.3378, + "step": 17096 + }, + { + "epoch": 0.84, + "grad_norm": 2.17425537109375, + "learning_rate": 7.76471169524186e-06, + "loss": 5.5094, + "step": 17100 + }, + { + "epoch": 0.85, + "grad_norm": 2.2170867919921875, + "learning_rate": 7.754829784080242e-06, + "loss": 5.4591, + "step": 17104 + }, + { + "epoch": 0.85, + "grad_norm": 2.0710206031799316, + "learning_rate": 7.744947872918623e-06, + "loss": 5.461, + "step": 17108 + }, + { + "epoch": 0.85, + "grad_norm": 1.9662617444992065, + "learning_rate": 7.735065961757005e-06, + "loss": 5.5232, + "step": 17112 + }, + { + "epoch": 0.85, + "grad_norm": 2.1950018405914307, + "learning_rate": 7.725184050595385e-06, + "loss": 5.4845, + "step": 17116 + }, + { + "epoch": 0.85, + "grad_norm": 2.166281223297119, + "learning_rate": 7.715302139433767e-06, + "loss": 5.4899, + "step": 17120 + }, + { + "epoch": 0.85, + "grad_norm": 2.0825867652893066, + "learning_rate": 7.705420228272148e-06, + "loss": 5.5889, + "step": 17124 + }, + { + "epoch": 0.85, + "grad_norm": 2.0458121299743652, + "learning_rate": 7.69553831711053e-06, + "loss": 5.4465, + "step": 17128 + }, + { + "epoch": 0.85, + "grad_norm": 1.972931146621704, + "learning_rate": 7.685656405948912e-06, + "loss": 5.447, + "step": 17132 + }, + { + "epoch": 0.85, + "grad_norm": 2.2071616649627686, + "learning_rate": 7.675774494787292e-06, + "loss": 5.5405, + "step": 17136 + }, + { + "epoch": 0.85, + "grad_norm": 2.24798583984375, + "learning_rate": 7.665892583625674e-06, + "loss": 5.5034, + "step": 17140 + }, + { + "epoch": 0.85, + "grad_norm": 2.3352463245391846, + "learning_rate": 7.656010672464054e-06, + "loss": 5.5322, + "step": 17144 + }, + { + "epoch": 0.85, + "grad_norm": 2.1701347827911377, + "learning_rate": 7.646128761302436e-06, + "loss": 5.6042, + "step": 17148 + }, + { + "epoch": 0.85, + "grad_norm": 2.0654942989349365, + "learning_rate": 7.636246850140817e-06, + "loss": 5.5102, + "step": 17152 + }, + { + "epoch": 0.85, + "grad_norm": 2.410454273223877, + "learning_rate": 7.626364938979198e-06, + "loss": 5.4408, + "step": 17156 + }, + { + "epoch": 0.85, + "grad_norm": 2.0221352577209473, + "learning_rate": 7.616483027817581e-06, + "loss": 5.4086, + "step": 17160 + }, + { + "epoch": 0.85, + "grad_norm": 2.211092233657837, + "learning_rate": 7.606601116655962e-06, + "loss": 5.4589, + "step": 17164 + }, + { + "epoch": 0.85, + "grad_norm": 1.9467920064926147, + "learning_rate": 7.596719205494343e-06, + "loss": 5.4867, + "step": 17168 + }, + { + "epoch": 0.85, + "grad_norm": 2.1144025325775146, + "learning_rate": 7.586837294332725e-06, + "loss": 5.5143, + "step": 17172 + }, + { + "epoch": 0.85, + "grad_norm": 2.1652915477752686, + "learning_rate": 7.576955383171106e-06, + "loss": 5.533, + "step": 17176 + }, + { + "epoch": 0.85, + "grad_norm": 1.9289984703063965, + "learning_rate": 7.567073472009486e-06, + "loss": 5.5406, + "step": 17180 + }, + { + "epoch": 0.85, + "grad_norm": 2.028322458267212, + "learning_rate": 7.557191560847867e-06, + "loss": 5.4118, + "step": 17184 + }, + { + "epoch": 0.85, + "grad_norm": 2.2385053634643555, + "learning_rate": 7.54730964968625e-06, + "loss": 5.5023, + "step": 17188 + }, + { + "epoch": 0.85, + "grad_norm": 1.8756178617477417, + "learning_rate": 7.5374277385246315e-06, + "loss": 5.4028, + "step": 17192 + }, + { + "epoch": 0.85, + "grad_norm": 2.0008492469787598, + "learning_rate": 7.527545827363013e-06, + "loss": 5.4572, + "step": 17196 + }, + { + "epoch": 0.85, + "grad_norm": 1.9606680870056152, + "learning_rate": 7.517663916201394e-06, + "loss": 5.3813, + "step": 17200 + }, + { + "epoch": 0.85, + "grad_norm": 1.765757441520691, + "learning_rate": 7.507782005039775e-06, + "loss": 5.433, + "step": 17204 + }, + { + "epoch": 0.85, + "grad_norm": 2.2999327182769775, + "learning_rate": 7.497900093878156e-06, + "loss": 5.4443, + "step": 17208 + }, + { + "epoch": 0.85, + "grad_norm": 1.831790804862976, + "learning_rate": 7.488018182716537e-06, + "loss": 5.4745, + "step": 17212 + }, + { + "epoch": 0.85, + "grad_norm": 2.0281448364257812, + "learning_rate": 7.47813627155492e-06, + "loss": 5.3546, + "step": 17216 + }, + { + "epoch": 0.85, + "grad_norm": 2.163875102996826, + "learning_rate": 7.468254360393301e-06, + "loss": 5.4957, + "step": 17220 + }, + { + "epoch": 0.85, + "grad_norm": 2.0201468467712402, + "learning_rate": 7.458372449231682e-06, + "loss": 5.3797, + "step": 17224 + }, + { + "epoch": 0.85, + "grad_norm": 1.9520927667617798, + "learning_rate": 7.448490538070063e-06, + "loss": 5.5663, + "step": 17228 + }, + { + "epoch": 0.85, + "grad_norm": 1.9706037044525146, + "learning_rate": 7.4386086269084445e-06, + "loss": 5.4462, + "step": 17232 + }, + { + "epoch": 0.85, + "grad_norm": 2.2334280014038086, + "learning_rate": 7.428726715746826e-06, + "loss": 5.5028, + "step": 17236 + }, + { + "epoch": 0.85, + "grad_norm": 2.089432716369629, + "learning_rate": 7.418844804585207e-06, + "loss": 5.5537, + "step": 17240 + }, + { + "epoch": 0.85, + "grad_norm": 2.0354325771331787, + "learning_rate": 7.408962893423589e-06, + "loss": 5.4, + "step": 17244 + }, + { + "epoch": 0.85, + "grad_norm": 1.9282554388046265, + "learning_rate": 7.39908098226197e-06, + "loss": 5.3915, + "step": 17248 + }, + { + "epoch": 0.85, + "grad_norm": 2.133868455886841, + "learning_rate": 7.389199071100351e-06, + "loss": 5.4868, + "step": 17252 + }, + { + "epoch": 0.85, + "grad_norm": 2.0873701572418213, + "learning_rate": 7.379317159938733e-06, + "loss": 5.6375, + "step": 17256 + }, + { + "epoch": 0.85, + "grad_norm": 1.9751038551330566, + "learning_rate": 7.369435248777114e-06, + "loss": 5.4291, + "step": 17260 + }, + { + "epoch": 0.85, + "grad_norm": 1.8549004793167114, + "learning_rate": 7.359553337615495e-06, + "loss": 5.4953, + "step": 17264 + }, + { + "epoch": 0.85, + "grad_norm": 1.9882365465164185, + "learning_rate": 7.349671426453876e-06, + "loss": 5.5191, + "step": 17268 + }, + { + "epoch": 0.85, + "grad_norm": 2.0008509159088135, + "learning_rate": 7.339789515292258e-06, + "loss": 5.573, + "step": 17272 + }, + { + "epoch": 0.85, + "grad_norm": 1.9084336757659912, + "learning_rate": 7.3299076041306395e-06, + "loss": 5.5864, + "step": 17276 + }, + { + "epoch": 0.85, + "grad_norm": 2.3234941959381104, + "learning_rate": 7.320025692969021e-06, + "loss": 5.4648, + "step": 17280 + }, + { + "epoch": 0.85, + "grad_norm": 2.033445358276367, + "learning_rate": 7.310143781807402e-06, + "loss": 5.5663, + "step": 17284 + }, + { + "epoch": 0.85, + "grad_norm": 2.036726474761963, + "learning_rate": 7.300261870645783e-06, + "loss": 5.3706, + "step": 17288 + }, + { + "epoch": 0.85, + "grad_norm": 2.135927677154541, + "learning_rate": 7.290379959484164e-06, + "loss": 5.4365, + "step": 17292 + }, + { + "epoch": 0.85, + "grad_norm": 2.0434539318084717, + "learning_rate": 7.280498048322546e-06, + "loss": 5.4914, + "step": 17296 + }, + { + "epoch": 0.85, + "grad_norm": 1.9364794492721558, + "learning_rate": 7.270616137160927e-06, + "loss": 5.358, + "step": 17300 + }, + { + "epoch": 0.85, + "grad_norm": 2.1555495262145996, + "learning_rate": 7.260734225999309e-06, + "loss": 5.4667, + "step": 17304 + }, + { + "epoch": 0.86, + "grad_norm": 2.0687687397003174, + "learning_rate": 7.25085231483769e-06, + "loss": 5.5075, + "step": 17308 + }, + { + "epoch": 0.86, + "grad_norm": 2.2169644832611084, + "learning_rate": 7.240970403676071e-06, + "loss": 5.5186, + "step": 17312 + }, + { + "epoch": 0.86, + "grad_norm": 2.0690207481384277, + "learning_rate": 7.2310884925144525e-06, + "loss": 5.4944, + "step": 17316 + }, + { + "epoch": 0.86, + "grad_norm": 2.172851324081421, + "learning_rate": 7.221206581352834e-06, + "loss": 5.5178, + "step": 17320 + }, + { + "epoch": 0.86, + "grad_norm": 2.178602457046509, + "learning_rate": 7.211324670191215e-06, + "loss": 5.5693, + "step": 17324 + }, + { + "epoch": 0.86, + "grad_norm": 1.9525049924850464, + "learning_rate": 7.201442759029596e-06, + "loss": 5.3773, + "step": 17328 + }, + { + "epoch": 0.86, + "grad_norm": 2.0250043869018555, + "learning_rate": 7.191560847867978e-06, + "loss": 5.4672, + "step": 17332 + }, + { + "epoch": 0.86, + "grad_norm": 2.229799747467041, + "learning_rate": 7.1816789367063594e-06, + "loss": 5.4451, + "step": 17336 + }, + { + "epoch": 0.86, + "grad_norm": 2.2048773765563965, + "learning_rate": 7.171797025544741e-06, + "loss": 5.5388, + "step": 17340 + }, + { + "epoch": 0.86, + "grad_norm": 2.1948986053466797, + "learning_rate": 7.161915114383122e-06, + "loss": 5.4121, + "step": 17344 + }, + { + "epoch": 0.86, + "grad_norm": 2.410446882247925, + "learning_rate": 7.152033203221503e-06, + "loss": 5.5068, + "step": 17348 + }, + { + "epoch": 0.86, + "grad_norm": 2.0198326110839844, + "learning_rate": 7.142151292059884e-06, + "loss": 5.4786, + "step": 17352 + }, + { + "epoch": 0.86, + "grad_norm": 2.1943955421447754, + "learning_rate": 7.1322693808982655e-06, + "loss": 5.4957, + "step": 17356 + }, + { + "epoch": 0.86, + "grad_norm": 2.1132426261901855, + "learning_rate": 7.122387469736648e-06, + "loss": 5.3965, + "step": 17360 + }, + { + "epoch": 0.86, + "grad_norm": 2.0462357997894287, + "learning_rate": 7.112505558575029e-06, + "loss": 5.5153, + "step": 17364 + }, + { + "epoch": 0.86, + "grad_norm": 2.0501723289489746, + "learning_rate": 7.10262364741341e-06, + "loss": 5.5062, + "step": 17368 + }, + { + "epoch": 0.86, + "grad_norm": 2.148674726486206, + "learning_rate": 7.092741736251791e-06, + "loss": 5.2946, + "step": 17372 + }, + { + "epoch": 0.86, + "grad_norm": 2.0384411811828613, + "learning_rate": 7.082859825090172e-06, + "loss": 5.6131, + "step": 17376 + }, + { + "epoch": 0.86, + "grad_norm": 2.235848903656006, + "learning_rate": 7.072977913928554e-06, + "loss": 5.3982, + "step": 17380 + }, + { + "epoch": 0.86, + "grad_norm": 2.0050299167633057, + "learning_rate": 7.063096002766935e-06, + "loss": 5.5681, + "step": 17384 + }, + { + "epoch": 0.86, + "grad_norm": 1.9482308626174927, + "learning_rate": 7.053214091605318e-06, + "loss": 5.4242, + "step": 17388 + }, + { + "epoch": 0.86, + "grad_norm": 2.077125072479248, + "learning_rate": 7.043332180443699e-06, + "loss": 5.4092, + "step": 17392 + }, + { + "epoch": 0.86, + "grad_norm": 2.2242355346679688, + "learning_rate": 7.033450269282079e-06, + "loss": 5.4268, + "step": 17396 + }, + { + "epoch": 0.86, + "grad_norm": 2.2366597652435303, + "learning_rate": 7.0235683581204605e-06, + "loss": 5.4304, + "step": 17400 + }, + { + "epoch": 0.86, + "grad_norm": 2.3268561363220215, + "learning_rate": 7.013686446958842e-06, + "loss": 5.4141, + "step": 17404 + }, + { + "epoch": 0.86, + "grad_norm": 2.1040186882019043, + "learning_rate": 7.003804535797223e-06, + "loss": 5.4129, + "step": 17408 + }, + { + "epoch": 0.86, + "grad_norm": 2.0050957202911377, + "learning_rate": 6.993922624635604e-06, + "loss": 5.5029, + "step": 17412 + }, + { + "epoch": 0.86, + "grad_norm": 1.914214849472046, + "learning_rate": 6.984040713473987e-06, + "loss": 5.5394, + "step": 17416 + }, + { + "epoch": 0.86, + "grad_norm": 2.112946033477783, + "learning_rate": 6.974158802312368e-06, + "loss": 5.4547, + "step": 17420 + }, + { + "epoch": 0.86, + "grad_norm": 1.980510950088501, + "learning_rate": 6.9642768911507495e-06, + "loss": 5.4997, + "step": 17424 + }, + { + "epoch": 0.86, + "grad_norm": 1.985985517501831, + "learning_rate": 6.954394979989131e-06, + "loss": 5.3671, + "step": 17428 + }, + { + "epoch": 0.86, + "grad_norm": 1.897262454032898, + "learning_rate": 6.944513068827511e-06, + "loss": 5.4578, + "step": 17432 + }, + { + "epoch": 0.86, + "grad_norm": 1.9851828813552856, + "learning_rate": 6.934631157665892e-06, + "loss": 5.4318, + "step": 17436 + }, + { + "epoch": 0.86, + "grad_norm": 1.8977246284484863, + "learning_rate": 6.9247492465042735e-06, + "loss": 5.4777, + "step": 17440 + }, + { + "epoch": 0.86, + "grad_norm": 2.1280171871185303, + "learning_rate": 6.9148673353426564e-06, + "loss": 5.583, + "step": 17444 + }, + { + "epoch": 0.86, + "grad_norm": 2.3764641284942627, + "learning_rate": 6.904985424181038e-06, + "loss": 5.5559, + "step": 17448 + }, + { + "epoch": 0.86, + "grad_norm": 1.9994136095046997, + "learning_rate": 6.895103513019419e-06, + "loss": 5.5662, + "step": 17452 + }, + { + "epoch": 0.86, + "grad_norm": 2.108659267425537, + "learning_rate": 6.8852216018578e-06, + "loss": 5.4098, + "step": 17456 + }, + { + "epoch": 0.86, + "grad_norm": 1.9477959871292114, + "learning_rate": 6.875339690696181e-06, + "loss": 5.4161, + "step": 17460 + }, + { + "epoch": 0.86, + "grad_norm": 2.2120134830474854, + "learning_rate": 6.865457779534562e-06, + "loss": 5.5344, + "step": 17464 + }, + { + "epoch": 0.86, + "grad_norm": 1.9351931810379028, + "learning_rate": 6.855575868372943e-06, + "loss": 5.5115, + "step": 17468 + }, + { + "epoch": 0.86, + "grad_norm": 1.9376587867736816, + "learning_rate": 6.845693957211324e-06, + "loss": 5.4218, + "step": 17472 + }, + { + "epoch": 0.86, + "grad_norm": 1.8527143001556396, + "learning_rate": 6.835812046049707e-06, + "loss": 5.4781, + "step": 17476 + }, + { + "epoch": 0.86, + "grad_norm": 1.9370919466018677, + "learning_rate": 6.825930134888088e-06, + "loss": 5.6185, + "step": 17480 + }, + { + "epoch": 0.86, + "grad_norm": 1.8956094980239868, + "learning_rate": 6.816048223726469e-06, + "loss": 5.5335, + "step": 17484 + }, + { + "epoch": 0.86, + "grad_norm": 2.1358373165130615, + "learning_rate": 6.806166312564851e-06, + "loss": 5.5501, + "step": 17488 + }, + { + "epoch": 0.86, + "grad_norm": 1.8500255346298218, + "learning_rate": 6.796284401403232e-06, + "loss": 5.4718, + "step": 17492 + }, + { + "epoch": 0.86, + "grad_norm": 1.9620647430419922, + "learning_rate": 6.786402490241612e-06, + "loss": 5.4566, + "step": 17496 + }, + { + "epoch": 0.86, + "grad_norm": 2.0902743339538574, + "learning_rate": 6.776520579079993e-06, + "loss": 5.484, + "step": 17500 + }, + { + "epoch": 0.86, + "grad_norm": 2.329399824142456, + "learning_rate": 6.766638667918376e-06, + "loss": 5.5337, + "step": 17504 + }, + { + "epoch": 0.87, + "grad_norm": 1.9751675128936768, + "learning_rate": 6.7567567567567575e-06, + "loss": 5.4638, + "step": 17508 + }, + { + "epoch": 0.87, + "grad_norm": 2.188885450363159, + "learning_rate": 6.746874845595139e-06, + "loss": 5.5174, + "step": 17512 + }, + { + "epoch": 0.87, + "grad_norm": 2.1230249404907227, + "learning_rate": 6.73699293443352e-06, + "loss": 5.4525, + "step": 17516 + }, + { + "epoch": 0.87, + "grad_norm": 2.0748202800750732, + "learning_rate": 6.727111023271901e-06, + "loss": 5.5633, + "step": 17520 + }, + { + "epoch": 0.87, + "grad_norm": 1.8766546249389648, + "learning_rate": 6.717229112110282e-06, + "loss": 5.5295, + "step": 17524 + }, + { + "epoch": 0.87, + "grad_norm": 2.1104044914245605, + "learning_rate": 6.707347200948664e-06, + "loss": 5.5265, + "step": 17528 + }, + { + "epoch": 0.87, + "grad_norm": 1.9020673036575317, + "learning_rate": 6.697465289787046e-06, + "loss": 5.4684, + "step": 17532 + }, + { + "epoch": 0.87, + "grad_norm": 1.9456652402877808, + "learning_rate": 6.687583378625427e-06, + "loss": 5.3826, + "step": 17536 + }, + { + "epoch": 0.87, + "grad_norm": 2.117117166519165, + "learning_rate": 6.677701467463808e-06, + "loss": 5.4525, + "step": 17540 + }, + { + "epoch": 0.87, + "grad_norm": 2.0873782634735107, + "learning_rate": 6.667819556302189e-06, + "loss": 5.5402, + "step": 17544 + }, + { + "epoch": 0.87, + "grad_norm": 2.0289838314056396, + "learning_rate": 6.6579376451405705e-06, + "loss": 5.5603, + "step": 17548 + }, + { + "epoch": 0.87, + "grad_norm": 2.2275471687316895, + "learning_rate": 6.648055733978952e-06, + "loss": 5.3792, + "step": 17552 + }, + { + "epoch": 0.87, + "grad_norm": 1.9133155345916748, + "learning_rate": 6.638173822817333e-06, + "loss": 5.4289, + "step": 17556 + }, + { + "epoch": 0.87, + "grad_norm": 2.193645477294922, + "learning_rate": 6.628291911655715e-06, + "loss": 5.5224, + "step": 17560 + }, + { + "epoch": 0.87, + "grad_norm": 2.1608972549438477, + "learning_rate": 6.618410000494096e-06, + "loss": 5.5412, + "step": 17564 + }, + { + "epoch": 0.87, + "grad_norm": 2.141594648361206, + "learning_rate": 6.6085280893324774e-06, + "loss": 5.4787, + "step": 17568 + }, + { + "epoch": 0.87, + "grad_norm": 1.9416935443878174, + "learning_rate": 6.598646178170859e-06, + "loss": 5.3674, + "step": 17572 + }, + { + "epoch": 0.87, + "grad_norm": 2.110677480697632, + "learning_rate": 6.58876426700924e-06, + "loss": 5.4024, + "step": 17576 + }, + { + "epoch": 0.87, + "grad_norm": 2.2235372066497803, + "learning_rate": 6.578882355847621e-06, + "loss": 5.4458, + "step": 17580 + }, + { + "epoch": 0.87, + "grad_norm": 2.280282974243164, + "learning_rate": 6.569000444686002e-06, + "loss": 5.5981, + "step": 17584 + }, + { + "epoch": 0.87, + "grad_norm": 2.1084625720977783, + "learning_rate": 6.559118533524384e-06, + "loss": 5.4475, + "step": 17588 + }, + { + "epoch": 0.87, + "grad_norm": 2.004232406616211, + "learning_rate": 6.5492366223627656e-06, + "loss": 5.4314, + "step": 17592 + }, + { + "epoch": 0.87, + "grad_norm": 1.9286199808120728, + "learning_rate": 6.539354711201147e-06, + "loss": 5.3943, + "step": 17596 + }, + { + "epoch": 0.87, + "grad_norm": 1.9742239713668823, + "learning_rate": 6.529472800039528e-06, + "loss": 5.4633, + "step": 17600 + }, + { + "epoch": 0.87, + "grad_norm": 2.1503305435180664, + "learning_rate": 6.519590888877909e-06, + "loss": 5.4654, + "step": 17604 + }, + { + "epoch": 0.87, + "grad_norm": 1.996319055557251, + "learning_rate": 6.50970897771629e-06, + "loss": 5.4479, + "step": 17608 + }, + { + "epoch": 0.87, + "grad_norm": 2.1689870357513428, + "learning_rate": 6.499827066554672e-06, + "loss": 5.4242, + "step": 17612 + }, + { + "epoch": 0.87, + "grad_norm": 2.0061464309692383, + "learning_rate": 6.489945155393053e-06, + "loss": 5.3833, + "step": 17616 + }, + { + "epoch": 0.87, + "grad_norm": 2.1201388835906982, + "learning_rate": 6.480063244231435e-06, + "loss": 5.4374, + "step": 17620 + }, + { + "epoch": 0.87, + "grad_norm": 2.196545124053955, + "learning_rate": 6.470181333069816e-06, + "loss": 5.3564, + "step": 17624 + }, + { + "epoch": 0.87, + "grad_norm": 2.073232412338257, + "learning_rate": 6.460299421908197e-06, + "loss": 5.4057, + "step": 17628 + }, + { + "epoch": 0.87, + "grad_norm": 1.9354524612426758, + "learning_rate": 6.4504175107465785e-06, + "loss": 5.4718, + "step": 17632 + }, + { + "epoch": 0.87, + "grad_norm": 2.032994508743286, + "learning_rate": 6.44053559958496e-06, + "loss": 5.358, + "step": 17636 + }, + { + "epoch": 0.87, + "grad_norm": 2.130598545074463, + "learning_rate": 6.430653688423341e-06, + "loss": 5.4658, + "step": 17640 + }, + { + "epoch": 0.87, + "grad_norm": 1.8692468404769897, + "learning_rate": 6.420771777261722e-06, + "loss": 5.4004, + "step": 17644 + }, + { + "epoch": 0.87, + "grad_norm": 1.9368531703948975, + "learning_rate": 6.410889866100104e-06, + "loss": 5.4809, + "step": 17648 + }, + { + "epoch": 0.87, + "grad_norm": 2.235506534576416, + "learning_rate": 6.4010079549384855e-06, + "loss": 5.4532, + "step": 17652 + }, + { + "epoch": 0.87, + "grad_norm": 1.9980324506759644, + "learning_rate": 6.391126043776867e-06, + "loss": 5.5156, + "step": 17656 + }, + { + "epoch": 0.87, + "grad_norm": 1.947649598121643, + "learning_rate": 6.381244132615248e-06, + "loss": 5.3676, + "step": 17660 + }, + { + "epoch": 0.87, + "grad_norm": 2.365041971206665, + "learning_rate": 6.371362221453629e-06, + "loss": 5.4387, + "step": 17664 + }, + { + "epoch": 0.87, + "grad_norm": 2.2820627689361572, + "learning_rate": 6.36148031029201e-06, + "loss": 5.4541, + "step": 17668 + }, + { + "epoch": 0.87, + "grad_norm": 2.02691650390625, + "learning_rate": 6.3515983991303915e-06, + "loss": 5.5084, + "step": 17672 + }, + { + "epoch": 0.87, + "grad_norm": 2.0064783096313477, + "learning_rate": 6.3417164879687744e-06, + "loss": 5.5205, + "step": 17676 + }, + { + "epoch": 0.87, + "grad_norm": 1.9961150884628296, + "learning_rate": 6.331834576807155e-06, + "loss": 5.3904, + "step": 17680 + }, + { + "epoch": 0.87, + "grad_norm": 2.2273404598236084, + "learning_rate": 6.321952665645536e-06, + "loss": 5.4756, + "step": 17684 + }, + { + "epoch": 0.87, + "grad_norm": 2.078472852706909, + "learning_rate": 6.312070754483917e-06, + "loss": 5.5535, + "step": 17688 + }, + { + "epoch": 0.87, + "grad_norm": 1.9248629808425903, + "learning_rate": 6.3021888433222984e-06, + "loss": 5.4378, + "step": 17692 + }, + { + "epoch": 0.87, + "grad_norm": 2.1985530853271484, + "learning_rate": 6.29230693216068e-06, + "loss": 5.4345, + "step": 17696 + }, + { + "epoch": 0.87, + "grad_norm": 2.087536096572876, + "learning_rate": 6.282425020999061e-06, + "loss": 5.5135, + "step": 17700 + }, + { + "epoch": 0.87, + "grad_norm": 2.0699515342712402, + "learning_rate": 6.272543109837444e-06, + "loss": 5.3821, + "step": 17704 + }, + { + "epoch": 0.87, + "grad_norm": 2.309680223464966, + "learning_rate": 6.262661198675825e-06, + "loss": 5.3426, + "step": 17708 + }, + { + "epoch": 0.88, + "grad_norm": 1.9877557754516602, + "learning_rate": 6.252779287514206e-06, + "loss": 5.554, + "step": 17712 + }, + { + "epoch": 0.88, + "grad_norm": 2.1621484756469727, + "learning_rate": 6.2428973763525866e-06, + "loss": 5.4897, + "step": 17716 + }, + { + "epoch": 0.88, + "grad_norm": 2.0568161010742188, + "learning_rate": 6.233015465190968e-06, + "loss": 5.4726, + "step": 17720 + }, + { + "epoch": 0.88, + "grad_norm": 1.8659361600875854, + "learning_rate": 6.22313355402935e-06, + "loss": 5.461, + "step": 17724 + }, + { + "epoch": 0.88, + "grad_norm": 1.8461517095565796, + "learning_rate": 6.213251642867731e-06, + "loss": 5.3407, + "step": 17728 + }, + { + "epoch": 0.88, + "grad_norm": 2.2194485664367676, + "learning_rate": 6.203369731706112e-06, + "loss": 5.3863, + "step": 17732 + }, + { + "epoch": 0.88, + "grad_norm": 2.2594525814056396, + "learning_rate": 6.1934878205444935e-06, + "loss": 5.4737, + "step": 17736 + }, + { + "epoch": 0.88, + "grad_norm": 2.067777156829834, + "learning_rate": 6.1836059093828755e-06, + "loss": 5.4817, + "step": 17740 + }, + { + "epoch": 0.88, + "grad_norm": 1.9696800708770752, + "learning_rate": 6.173723998221257e-06, + "loss": 5.5215, + "step": 17744 + }, + { + "epoch": 0.88, + "grad_norm": 1.9700802564620972, + "learning_rate": 6.163842087059637e-06, + "loss": 5.5821, + "step": 17748 + }, + { + "epoch": 0.88, + "grad_norm": 2.2519845962524414, + "learning_rate": 6.153960175898018e-06, + "loss": 5.5501, + "step": 17752 + }, + { + "epoch": 0.88, + "grad_norm": 2.1531550884246826, + "learning_rate": 6.1440782647364e-06, + "loss": 5.354, + "step": 17756 + }, + { + "epoch": 0.88, + "grad_norm": 2.299639940261841, + "learning_rate": 6.134196353574782e-06, + "loss": 5.5487, + "step": 17760 + }, + { + "epoch": 0.88, + "grad_norm": 1.9032407999038696, + "learning_rate": 6.124314442413163e-06, + "loss": 5.4551, + "step": 17764 + }, + { + "epoch": 0.88, + "grad_norm": 2.121720552444458, + "learning_rate": 6.114432531251545e-06, + "loss": 5.4185, + "step": 17768 + }, + { + "epoch": 0.88, + "grad_norm": 1.955588698387146, + "learning_rate": 6.104550620089926e-06, + "loss": 5.5947, + "step": 17772 + }, + { + "epoch": 0.88, + "grad_norm": 1.9518580436706543, + "learning_rate": 6.094668708928307e-06, + "loss": 5.4159, + "step": 17776 + }, + { + "epoch": 0.88, + "grad_norm": 2.2284739017486572, + "learning_rate": 6.084786797766688e-06, + "loss": 5.4806, + "step": 17780 + }, + { + "epoch": 0.88, + "grad_norm": 1.9473198652267456, + "learning_rate": 6.07490488660507e-06, + "loss": 5.3457, + "step": 17784 + }, + { + "epoch": 0.88, + "grad_norm": 2.25762939453125, + "learning_rate": 6.065022975443451e-06, + "loss": 5.3905, + "step": 17788 + }, + { + "epoch": 0.88, + "grad_norm": 2.13055682182312, + "learning_rate": 6.055141064281832e-06, + "loss": 5.4596, + "step": 17792 + }, + { + "epoch": 0.88, + "grad_norm": 2.078608751296997, + "learning_rate": 6.045259153120214e-06, + "loss": 5.4836, + "step": 17796 + }, + { + "epoch": 0.88, + "grad_norm": 2.032860040664673, + "learning_rate": 6.0353772419585954e-06, + "loss": 5.3315, + "step": 17800 + }, + { + "epoch": 0.88, + "grad_norm": 2.18186616897583, + "learning_rate": 6.025495330796977e-06, + "loss": 5.5165, + "step": 17804 + }, + { + "epoch": 0.88, + "grad_norm": 1.8949894905090332, + "learning_rate": 6.015613419635358e-06, + "loss": 5.4296, + "step": 17808 + }, + { + "epoch": 0.88, + "grad_norm": 1.9019147157669067, + "learning_rate": 6.005731508473739e-06, + "loss": 5.3742, + "step": 17812 + }, + { + "epoch": 0.88, + "grad_norm": 1.9749938249588013, + "learning_rate": 5.99584959731212e-06, + "loss": 5.4702, + "step": 17816 + }, + { + "epoch": 0.88, + "grad_norm": 1.9528026580810547, + "learning_rate": 5.9859676861505015e-06, + "loss": 5.4179, + "step": 17820 + }, + { + "epoch": 0.88, + "grad_norm": 2.04555082321167, + "learning_rate": 5.976085774988883e-06, + "loss": 5.4807, + "step": 17824 + }, + { + "epoch": 0.88, + "grad_norm": 2.2078750133514404, + "learning_rate": 5.966203863827265e-06, + "loss": 5.4206, + "step": 17828 + }, + { + "epoch": 0.88, + "grad_norm": 2.1232731342315674, + "learning_rate": 5.956321952665646e-06, + "loss": 5.5452, + "step": 17832 + }, + { + "epoch": 0.88, + "grad_norm": 1.724265217781067, + "learning_rate": 5.946440041504027e-06, + "loss": 5.3831, + "step": 17836 + }, + { + "epoch": 0.88, + "grad_norm": 2.0802602767944336, + "learning_rate": 5.936558130342408e-06, + "loss": 5.3925, + "step": 17840 + }, + { + "epoch": 0.88, + "grad_norm": 1.913464069366455, + "learning_rate": 5.92667621918079e-06, + "loss": 5.4786, + "step": 17844 + }, + { + "epoch": 0.88, + "grad_norm": 1.9357552528381348, + "learning_rate": 5.916794308019171e-06, + "loss": 5.4415, + "step": 17848 + }, + { + "epoch": 0.88, + "grad_norm": 1.9869678020477295, + "learning_rate": 5.906912396857552e-06, + "loss": 5.4731, + "step": 17852 + }, + { + "epoch": 0.88, + "grad_norm": 2.1964402198791504, + "learning_rate": 5.897030485695934e-06, + "loss": 5.4538, + "step": 17856 + }, + { + "epoch": 0.88, + "grad_norm": 2.2341887950897217, + "learning_rate": 5.887148574534315e-06, + "loss": 5.4484, + "step": 17860 + }, + { + "epoch": 0.88, + "grad_norm": 2.2685790061950684, + "learning_rate": 5.8772666633726965e-06, + "loss": 5.4935, + "step": 17864 + }, + { + "epoch": 0.88, + "grad_norm": 1.9977366924285889, + "learning_rate": 5.867384752211079e-06, + "loss": 5.4866, + "step": 17868 + }, + { + "epoch": 0.88, + "grad_norm": 1.8025336265563965, + "learning_rate": 5.857502841049459e-06, + "loss": 5.5315, + "step": 17872 + }, + { + "epoch": 0.88, + "grad_norm": 1.8028703927993774, + "learning_rate": 5.84762092988784e-06, + "loss": 5.4048, + "step": 17876 + }, + { + "epoch": 0.88, + "grad_norm": 2.0206375122070312, + "learning_rate": 5.837739018726221e-06, + "loss": 5.5057, + "step": 17880 + }, + { + "epoch": 0.88, + "grad_norm": 2.2146549224853516, + "learning_rate": 5.8278571075646034e-06, + "loss": 5.4375, + "step": 17884 + }, + { + "epoch": 0.88, + "grad_norm": 2.1485488414764404, + "learning_rate": 5.817975196402985e-06, + "loss": 5.4375, + "step": 17888 + }, + { + "epoch": 0.88, + "grad_norm": 1.9976389408111572, + "learning_rate": 5.808093285241366e-06, + "loss": 5.5645, + "step": 17892 + }, + { + "epoch": 0.88, + "grad_norm": 2.1577677726745605, + "learning_rate": 5.798211374079747e-06, + "loss": 5.4917, + "step": 17896 + }, + { + "epoch": 0.88, + "grad_norm": 2.085784673690796, + "learning_rate": 5.788329462918129e-06, + "loss": 5.5115, + "step": 17900 + }, + { + "epoch": 0.88, + "grad_norm": 2.250061511993408, + "learning_rate": 5.77844755175651e-06, + "loss": 5.4932, + "step": 17904 + }, + { + "epoch": 0.88, + "grad_norm": 2.076542854309082, + "learning_rate": 5.768565640594891e-06, + "loss": 5.3858, + "step": 17908 + }, + { + "epoch": 0.89, + "grad_norm": 2.0458431243896484, + "learning_rate": 5.761154207223677e-06, + "loss": 5.5596, + "step": 17912 + }, + { + "epoch": 0.89, + "grad_norm": 2.075693130493164, + "learning_rate": 5.7512722960620585e-06, + "loss": 5.4962, + "step": 17916 + }, + { + "epoch": 0.89, + "grad_norm": 2.09002685546875, + "learning_rate": 5.74139038490044e-06, + "loss": 5.3926, + "step": 17920 + }, + { + "epoch": 0.89, + "grad_norm": 2.0116419792175293, + "learning_rate": 5.731508473738821e-06, + "loss": 5.4649, + "step": 17924 + }, + { + "epoch": 0.89, + "grad_norm": 2.0170979499816895, + "learning_rate": 5.721626562577203e-06, + "loss": 5.4906, + "step": 17928 + }, + { + "epoch": 0.89, + "grad_norm": 2.033344268798828, + "learning_rate": 5.711744651415584e-06, + "loss": 5.5144, + "step": 17932 + }, + { + "epoch": 0.89, + "grad_norm": 1.9842782020568848, + "learning_rate": 5.701862740253965e-06, + "loss": 5.4478, + "step": 17936 + }, + { + "epoch": 0.89, + "grad_norm": 2.059737205505371, + "learning_rate": 5.691980829092347e-06, + "loss": 5.4969, + "step": 17940 + }, + { + "epoch": 0.89, + "grad_norm": 2.116508960723877, + "learning_rate": 5.682098917930728e-06, + "loss": 5.5734, + "step": 17944 + }, + { + "epoch": 0.89, + "grad_norm": 2.062220573425293, + "learning_rate": 5.672217006769109e-06, + "loss": 5.3472, + "step": 17948 + }, + { + "epoch": 0.89, + "grad_norm": 2.176339626312256, + "learning_rate": 5.66233509560749e-06, + "loss": 5.3356, + "step": 17952 + }, + { + "epoch": 0.89, + "grad_norm": 2.2417047023773193, + "learning_rate": 5.652453184445872e-06, + "loss": 5.429, + "step": 17956 + }, + { + "epoch": 0.89, + "grad_norm": 1.9376815557479858, + "learning_rate": 5.6425712732842535e-06, + "loss": 5.4263, + "step": 17960 + }, + { + "epoch": 0.89, + "grad_norm": 1.9888767004013062, + "learning_rate": 5.632689362122635e-06, + "loss": 5.5286, + "step": 17964 + }, + { + "epoch": 0.89, + "grad_norm": 2.1054704189300537, + "learning_rate": 5.622807450961016e-06, + "loss": 5.4473, + "step": 17968 + }, + { + "epoch": 0.89, + "grad_norm": 2.3070156574249268, + "learning_rate": 5.612925539799398e-06, + "loss": 5.6394, + "step": 17972 + }, + { + "epoch": 0.89, + "grad_norm": 1.956694483757019, + "learning_rate": 5.603043628637779e-06, + "loss": 5.3773, + "step": 17976 + }, + { + "epoch": 0.89, + "grad_norm": 2.1512622833251953, + "learning_rate": 5.59316171747616e-06, + "loss": 5.5051, + "step": 17980 + }, + { + "epoch": 0.89, + "grad_norm": 1.9715803861618042, + "learning_rate": 5.583279806314542e-06, + "loss": 5.4564, + "step": 17984 + }, + { + "epoch": 0.89, + "grad_norm": 2.07094144821167, + "learning_rate": 5.573397895152923e-06, + "loss": 5.486, + "step": 17988 + }, + { + "epoch": 0.89, + "grad_norm": 2.0776047706604004, + "learning_rate": 5.563515983991304e-06, + "loss": 5.4101, + "step": 17992 + }, + { + "epoch": 0.89, + "grad_norm": 2.0641090869903564, + "learning_rate": 5.553634072829685e-06, + "loss": 5.4413, + "step": 17996 + }, + { + "epoch": 0.89, + "grad_norm": 2.069200038909912, + "learning_rate": 5.543752161668067e-06, + "loss": 5.4239, + "step": 18000 + }, + { + "epoch": 0.89, + "grad_norm": 1.9378856420516968, + "learning_rate": 5.5338702505064486e-06, + "loss": 5.4994, + "step": 18004 + }, + { + "epoch": 0.89, + "grad_norm": 1.9170506000518799, + "learning_rate": 5.52398833934483e-06, + "loss": 5.4589, + "step": 18008 + }, + { + "epoch": 0.89, + "grad_norm": 2.1123738288879395, + "learning_rate": 5.514106428183211e-06, + "loss": 5.4226, + "step": 18012 + }, + { + "epoch": 0.89, + "grad_norm": 1.9895274639129639, + "learning_rate": 5.504224517021592e-06, + "loss": 5.522, + "step": 18016 + }, + { + "epoch": 0.89, + "grad_norm": 2.216384172439575, + "learning_rate": 5.494342605859973e-06, + "loss": 5.4323, + "step": 18020 + }, + { + "epoch": 0.89, + "grad_norm": 2.1332645416259766, + "learning_rate": 5.484460694698355e-06, + "loss": 5.3666, + "step": 18024 + }, + { + "epoch": 0.89, + "grad_norm": 2.0537400245666504, + "learning_rate": 5.474578783536737e-06, + "loss": 5.4988, + "step": 18028 + }, + { + "epoch": 0.89, + "grad_norm": 1.9897429943084717, + "learning_rate": 5.464696872375118e-06, + "loss": 5.4833, + "step": 18032 + }, + { + "epoch": 0.89, + "grad_norm": 2.066513776779175, + "learning_rate": 5.454814961213499e-06, + "loss": 5.4636, + "step": 18036 + }, + { + "epoch": 0.89, + "grad_norm": 2.251376152038574, + "learning_rate": 5.44493305005188e-06, + "loss": 5.5205, + "step": 18040 + }, + { + "epoch": 0.89, + "grad_norm": 2.1560051441192627, + "learning_rate": 5.4350511388902615e-06, + "loss": 5.4888, + "step": 18044 + }, + { + "epoch": 0.89, + "grad_norm": 1.9921746253967285, + "learning_rate": 5.425169227728643e-06, + "loss": 5.503, + "step": 18048 + }, + { + "epoch": 0.89, + "grad_norm": 1.9806662797927856, + "learning_rate": 5.415287316567024e-06, + "loss": 5.5299, + "step": 18052 + }, + { + "epoch": 0.89, + "grad_norm": 2.086308479309082, + "learning_rate": 5.405405405405406e-06, + "loss": 5.3775, + "step": 18056 + }, + { + "epoch": 0.89, + "grad_norm": 1.9652711153030396, + "learning_rate": 5.395523494243787e-06, + "loss": 5.5683, + "step": 18060 + }, + { + "epoch": 0.89, + "grad_norm": 1.8376736640930176, + "learning_rate": 5.3856415830821685e-06, + "loss": 5.3905, + "step": 18064 + }, + { + "epoch": 0.89, + "grad_norm": 2.1515750885009766, + "learning_rate": 5.37575967192055e-06, + "loss": 5.421, + "step": 18068 + }, + { + "epoch": 0.89, + "grad_norm": 2.18635630607605, + "learning_rate": 5.365877760758931e-06, + "loss": 5.4513, + "step": 18072 + }, + { + "epoch": 0.89, + "grad_norm": 2.2322137355804443, + "learning_rate": 5.355995849597312e-06, + "loss": 5.5696, + "step": 18076 + }, + { + "epoch": 0.89, + "grad_norm": 2.1388771533966064, + "learning_rate": 5.346113938435693e-06, + "loss": 5.5154, + "step": 18080 + }, + { + "epoch": 0.89, + "grad_norm": 2.1032564640045166, + "learning_rate": 5.336232027274075e-06, + "loss": 5.4737, + "step": 18084 + }, + { + "epoch": 0.89, + "grad_norm": 2.2419564723968506, + "learning_rate": 5.326350116112457e-06, + "loss": 5.4804, + "step": 18088 + }, + { + "epoch": 0.89, + "grad_norm": 2.1092734336853027, + "learning_rate": 5.316468204950838e-06, + "loss": 5.523, + "step": 18092 + }, + { + "epoch": 0.89, + "grad_norm": 1.9642736911773682, + "learning_rate": 5.306586293789219e-06, + "loss": 5.4615, + "step": 18096 + }, + { + "epoch": 0.89, + "grad_norm": 2.285712480545044, + "learning_rate": 5.296704382627601e-06, + "loss": 5.4944, + "step": 18100 + }, + { + "epoch": 0.89, + "grad_norm": 1.8048274517059326, + "learning_rate": 5.2868224714659814e-06, + "loss": 5.3293, + "step": 18104 + }, + { + "epoch": 0.89, + "grad_norm": 1.9001215696334839, + "learning_rate": 5.276940560304363e-06, + "loss": 5.3799, + "step": 18108 + }, + { + "epoch": 0.89, + "grad_norm": 2.052248954772949, + "learning_rate": 5.267058649142744e-06, + "loss": 5.54, + "step": 18112 + }, + { + "epoch": 0.9, + "grad_norm": 1.9618264436721802, + "learning_rate": 5.257176737981126e-06, + "loss": 5.5206, + "step": 18116 + }, + { + "epoch": 0.9, + "grad_norm": 2.086357355117798, + "learning_rate": 5.247294826819507e-06, + "loss": 5.4713, + "step": 18120 + }, + { + "epoch": 0.9, + "grad_norm": 1.989790678024292, + "learning_rate": 5.237412915657888e-06, + "loss": 5.4183, + "step": 18124 + }, + { + "epoch": 0.9, + "grad_norm": 2.4142305850982666, + "learning_rate": 5.22753100449627e-06, + "loss": 5.4867, + "step": 18128 + }, + { + "epoch": 0.9, + "grad_norm": 1.9405925273895264, + "learning_rate": 5.217649093334652e-06, + "loss": 5.4259, + "step": 18132 + }, + { + "epoch": 0.9, + "grad_norm": 1.9569774866104126, + "learning_rate": 5.207767182173033e-06, + "loss": 5.4619, + "step": 18136 + }, + { + "epoch": 0.9, + "grad_norm": 2.1723146438598633, + "learning_rate": 5.197885271011413e-06, + "loss": 5.4719, + "step": 18140 + }, + { + "epoch": 0.9, + "grad_norm": 2.3420112133026123, + "learning_rate": 5.188003359849795e-06, + "loss": 5.4274, + "step": 18144 + }, + { + "epoch": 0.9, + "grad_norm": 2.2983791828155518, + "learning_rate": 5.1781214486881765e-06, + "loss": 5.5184, + "step": 18148 + }, + { + "epoch": 0.9, + "grad_norm": 2.061795711517334, + "learning_rate": 5.168239537526558e-06, + "loss": 5.4416, + "step": 18152 + }, + { + "epoch": 0.9, + "grad_norm": 2.11879301071167, + "learning_rate": 5.15835762636494e-06, + "loss": 5.3997, + "step": 18156 + }, + { + "epoch": 0.9, + "grad_norm": 2.072601556777954, + "learning_rate": 5.148475715203321e-06, + "loss": 5.4744, + "step": 18160 + }, + { + "epoch": 0.9, + "grad_norm": 2.037374258041382, + "learning_rate": 5.138593804041702e-06, + "loss": 5.5549, + "step": 18164 + }, + { + "epoch": 0.9, + "grad_norm": 2.1012215614318848, + "learning_rate": 5.128711892880083e-06, + "loss": 5.5601, + "step": 18168 + }, + { + "epoch": 0.9, + "grad_norm": 1.9614689350128174, + "learning_rate": 5.118829981718465e-06, + "loss": 5.3775, + "step": 18172 + }, + { + "epoch": 0.9, + "grad_norm": 2.3375091552734375, + "learning_rate": 5.108948070556846e-06, + "loss": 5.5373, + "step": 18176 + }, + { + "epoch": 0.9, + "grad_norm": 1.9838519096374512, + "learning_rate": 5.099066159395227e-06, + "loss": 5.3491, + "step": 18180 + }, + { + "epoch": 0.9, + "grad_norm": 2.0909204483032227, + "learning_rate": 5.089184248233608e-06, + "loss": 5.3386, + "step": 18184 + }, + { + "epoch": 0.9, + "grad_norm": 2.396127223968506, + "learning_rate": 5.07930233707199e-06, + "loss": 5.4791, + "step": 18188 + }, + { + "epoch": 0.9, + "grad_norm": 2.117344379425049, + "learning_rate": 5.0694204259103715e-06, + "loss": 5.3322, + "step": 18192 + }, + { + "epoch": 0.9, + "grad_norm": 2.0109291076660156, + "learning_rate": 5.059538514748753e-06, + "loss": 5.4152, + "step": 18196 + }, + { + "epoch": 0.9, + "grad_norm": 2.051154851913452, + "learning_rate": 5.049656603587134e-06, + "loss": 5.504, + "step": 18200 + }, + { + "epoch": 0.9, + "grad_norm": 1.9939186573028564, + "learning_rate": 5.039774692425515e-06, + "loss": 5.377, + "step": 18204 + }, + { + "epoch": 0.9, + "grad_norm": 2.0900635719299316, + "learning_rate": 5.029892781263896e-06, + "loss": 5.4448, + "step": 18208 + }, + { + "epoch": 0.9, + "grad_norm": 1.7526922225952148, + "learning_rate": 5.020010870102278e-06, + "loss": 5.2948, + "step": 18212 + }, + { + "epoch": 0.9, + "grad_norm": 2.082535982131958, + "learning_rate": 5.01012895894066e-06, + "loss": 5.4976, + "step": 18216 + }, + { + "epoch": 0.9, + "grad_norm": 2.088693380355835, + "learning_rate": 5.000247047779041e-06, + "loss": 5.4579, + "step": 18220 + }, + { + "epoch": 0.9, + "grad_norm": 1.9704039096832275, + "learning_rate": 4.990365136617422e-06, + "loss": 5.5093, + "step": 18224 + }, + { + "epoch": 0.9, + "grad_norm": 2.2970244884490967, + "learning_rate": 4.980483225455803e-06, + "loss": 5.5017, + "step": 18228 + }, + { + "epoch": 0.9, + "grad_norm": 2.0478765964508057, + "learning_rate": 4.9706013142941845e-06, + "loss": 5.4761, + "step": 18232 + }, + { + "epoch": 0.9, + "grad_norm": 2.1008124351501465, + "learning_rate": 4.960719403132566e-06, + "loss": 5.5902, + "step": 18236 + }, + { + "epoch": 0.9, + "grad_norm": 2.41119384765625, + "learning_rate": 4.950837491970947e-06, + "loss": 5.613, + "step": 18240 + }, + { + "epoch": 0.9, + "grad_norm": 1.9696173667907715, + "learning_rate": 4.940955580809329e-06, + "loss": 5.4408, + "step": 18244 + }, + { + "epoch": 0.9, + "grad_norm": 2.0599782466888428, + "learning_rate": 4.93107366964771e-06, + "loss": 5.455, + "step": 18248 + }, + { + "epoch": 0.9, + "grad_norm": 2.1011674404144287, + "learning_rate": 4.921191758486091e-06, + "loss": 5.4402, + "step": 18252 + }, + { + "epoch": 0.9, + "grad_norm": 2.209735870361328, + "learning_rate": 4.9113098473244735e-06, + "loss": 5.378, + "step": 18256 + }, + { + "epoch": 0.9, + "grad_norm": 2.093017101287842, + "learning_rate": 4.901427936162855e-06, + "loss": 5.4387, + "step": 18260 + }, + { + "epoch": 0.9, + "grad_norm": 2.14595103263855, + "learning_rate": 4.891546025001235e-06, + "loss": 5.5165, + "step": 18264 + }, + { + "epoch": 0.9, + "grad_norm": 2.3589043617248535, + "learning_rate": 4.881664113839616e-06, + "loss": 5.4973, + "step": 18268 + }, + { + "epoch": 0.9, + "grad_norm": 2.041437864303589, + "learning_rate": 4.871782202677998e-06, + "loss": 5.5404, + "step": 18272 + }, + { + "epoch": 0.9, + "grad_norm": 2.244915723800659, + "learning_rate": 4.8619002915163795e-06, + "loss": 5.3885, + "step": 18276 + }, + { + "epoch": 0.9, + "grad_norm": 2.1960060596466064, + "learning_rate": 4.852018380354761e-06, + "loss": 5.534, + "step": 18280 + }, + { + "epoch": 0.9, + "grad_norm": 2.032874345779419, + "learning_rate": 4.842136469193142e-06, + "loss": 5.4292, + "step": 18284 + }, + { + "epoch": 0.9, + "grad_norm": 1.9136545658111572, + "learning_rate": 4.832254558031524e-06, + "loss": 5.511, + "step": 18288 + }, + { + "epoch": 0.9, + "grad_norm": 2.0950520038604736, + "learning_rate": 4.822372646869905e-06, + "loss": 5.414, + "step": 18292 + }, + { + "epoch": 0.9, + "grad_norm": 2.0211408138275146, + "learning_rate": 4.812490735708286e-06, + "loss": 5.4139, + "step": 18296 + }, + { + "epoch": 0.9, + "grad_norm": 1.994523286819458, + "learning_rate": 4.802608824546668e-06, + "loss": 5.4814, + "step": 18300 + }, + { + "epoch": 0.9, + "grad_norm": 2.0099375247955322, + "learning_rate": 4.792726913385049e-06, + "loss": 5.4478, + "step": 18304 + }, + { + "epoch": 0.9, + "grad_norm": 1.9458421468734741, + "learning_rate": 4.78284500222343e-06, + "loss": 5.4538, + "step": 18308 + }, + { + "epoch": 0.9, + "grad_norm": 2.13274884223938, + "learning_rate": 4.772963091061811e-06, + "loss": 5.5146, + "step": 18312 + }, + { + "epoch": 0.9, + "grad_norm": 2.223994731903076, + "learning_rate": 4.763081179900193e-06, + "loss": 5.3984, + "step": 18316 + }, + { + "epoch": 0.91, + "grad_norm": 2.0752246379852295, + "learning_rate": 4.753199268738575e-06, + "loss": 5.4198, + "step": 18320 + }, + { + "epoch": 0.91, + "grad_norm": 1.913928508758545, + "learning_rate": 4.743317357576956e-06, + "loss": 5.4548, + "step": 18324 + }, + { + "epoch": 0.91, + "grad_norm": 1.9709479808807373, + "learning_rate": 4.733435446415337e-06, + "loss": 5.432, + "step": 18328 + }, + { + "epoch": 0.91, + "grad_norm": 2.19464111328125, + "learning_rate": 4.723553535253718e-06, + "loss": 5.5339, + "step": 18332 + }, + { + "epoch": 0.91, + "grad_norm": 1.9611873626708984, + "learning_rate": 4.7136716240920994e-06, + "loss": 5.6126, + "step": 18336 + }, + { + "epoch": 0.91, + "grad_norm": 1.9830701351165771, + "learning_rate": 4.703789712930481e-06, + "loss": 5.413, + "step": 18340 + }, + { + "epoch": 0.91, + "grad_norm": 2.189093828201294, + "learning_rate": 4.693907801768863e-06, + "loss": 5.4765, + "step": 18344 + }, + { + "epoch": 0.91, + "grad_norm": 1.831568956375122, + "learning_rate": 4.684025890607244e-06, + "loss": 5.4116, + "step": 18348 + }, + { + "epoch": 0.91, + "grad_norm": 1.8038347959518433, + "learning_rate": 4.674143979445625e-06, + "loss": 5.3937, + "step": 18352 + }, + { + "epoch": 0.91, + "grad_norm": 2.043757200241089, + "learning_rate": 4.664262068284006e-06, + "loss": 5.3792, + "step": 18356 + }, + { + "epoch": 0.91, + "grad_norm": 2.10916805267334, + "learning_rate": 4.6543801571223876e-06, + "loss": 5.4894, + "step": 18360 + }, + { + "epoch": 0.91, + "grad_norm": 2.1079790592193604, + "learning_rate": 4.644498245960769e-06, + "loss": 5.418, + "step": 18364 + }, + { + "epoch": 0.91, + "grad_norm": 2.068610191345215, + "learning_rate": 4.63461633479915e-06, + "loss": 5.4669, + "step": 18368 + }, + { + "epoch": 0.91, + "grad_norm": 2.140131950378418, + "learning_rate": 4.624734423637532e-06, + "loss": 5.4256, + "step": 18372 + }, + { + "epoch": 0.91, + "grad_norm": 1.9517799615859985, + "learning_rate": 4.614852512475913e-06, + "loss": 5.4603, + "step": 18376 + }, + { + "epoch": 0.91, + "grad_norm": 2.0555922985076904, + "learning_rate": 4.6049706013142945e-06, + "loss": 5.594, + "step": 18380 + }, + { + "epoch": 0.91, + "grad_norm": 2.1756505966186523, + "learning_rate": 4.595088690152676e-06, + "loss": 5.4045, + "step": 18384 + }, + { + "epoch": 0.91, + "grad_norm": 2.0365309715270996, + "learning_rate": 4.585206778991057e-06, + "loss": 5.4668, + "step": 18388 + }, + { + "epoch": 0.91, + "grad_norm": 1.979575753211975, + "learning_rate": 4.575324867829438e-06, + "loss": 5.5546, + "step": 18392 + }, + { + "epoch": 0.91, + "grad_norm": 2.1645045280456543, + "learning_rate": 4.565442956667819e-06, + "loss": 5.4158, + "step": 18396 + }, + { + "epoch": 0.91, + "grad_norm": 2.400611162185669, + "learning_rate": 4.555561045506201e-06, + "loss": 5.4139, + "step": 18400 + }, + { + "epoch": 0.91, + "grad_norm": 2.1293954849243164, + "learning_rate": 4.545679134344583e-06, + "loss": 5.3817, + "step": 18404 + }, + { + "epoch": 0.91, + "grad_norm": 2.0387656688690186, + "learning_rate": 4.535797223182964e-06, + "loss": 5.533, + "step": 18408 + }, + { + "epoch": 0.91, + "grad_norm": 2.231621265411377, + "learning_rate": 4.525915312021345e-06, + "loss": 5.4445, + "step": 18412 + }, + { + "epoch": 0.91, + "grad_norm": 2.0062594413757324, + "learning_rate": 4.516033400859727e-06, + "loss": 5.3092, + "step": 18416 + }, + { + "epoch": 0.91, + "grad_norm": 1.9188722372055054, + "learning_rate": 4.506151489698108e-06, + "loss": 5.4884, + "step": 18420 + }, + { + "epoch": 0.91, + "grad_norm": 1.9304651021957397, + "learning_rate": 4.496269578536489e-06, + "loss": 5.5277, + "step": 18424 + }, + { + "epoch": 0.91, + "grad_norm": 2.186587333679199, + "learning_rate": 4.48638766737487e-06, + "loss": 5.494, + "step": 18428 + }, + { + "epoch": 0.91, + "grad_norm": 2.170325994491577, + "learning_rate": 4.476505756213252e-06, + "loss": 5.5372, + "step": 18432 + }, + { + "epoch": 0.91, + "grad_norm": 2.1663074493408203, + "learning_rate": 4.466623845051633e-06, + "loss": 5.4398, + "step": 18436 + }, + { + "epoch": 0.91, + "grad_norm": 2.090306043624878, + "learning_rate": 4.456741933890014e-06, + "loss": 5.5293, + "step": 18440 + }, + { + "epoch": 0.91, + "grad_norm": 1.9806874990463257, + "learning_rate": 4.4468600227283964e-06, + "loss": 5.4678, + "step": 18444 + }, + { + "epoch": 0.91, + "grad_norm": 2.011406183242798, + "learning_rate": 4.436978111566778e-06, + "loss": 5.4633, + "step": 18448 + }, + { + "epoch": 0.91, + "grad_norm": 1.8348437547683716, + "learning_rate": 4.427096200405159e-06, + "loss": 5.4313, + "step": 18452 + }, + { + "epoch": 0.91, + "grad_norm": 2.4174129962921143, + "learning_rate": 4.417214289243539e-06, + "loss": 5.5076, + "step": 18456 + }, + { + "epoch": 0.91, + "grad_norm": 2.0693600177764893, + "learning_rate": 4.407332378081921e-06, + "loss": 5.505, + "step": 18460 + }, + { + "epoch": 0.91, + "grad_norm": 1.9076710939407349, + "learning_rate": 4.3974504669203025e-06, + "loss": 5.3791, + "step": 18464 + }, + { + "epoch": 0.91, + "grad_norm": 2.2067015171051025, + "learning_rate": 4.387568555758684e-06, + "loss": 5.4183, + "step": 18468 + }, + { + "epoch": 0.91, + "grad_norm": 2.229694128036499, + "learning_rate": 4.377686644597066e-06, + "loss": 5.4481, + "step": 18472 + }, + { + "epoch": 0.91, + "grad_norm": 2.3484323024749756, + "learning_rate": 4.367804733435447e-06, + "loss": 5.55, + "step": 18476 + }, + { + "epoch": 0.91, + "grad_norm": 2.2795610427856445, + "learning_rate": 4.357922822273828e-06, + "loss": 5.4688, + "step": 18480 + }, + { + "epoch": 0.91, + "grad_norm": 2.2929630279541016, + "learning_rate": 4.348040911112209e-06, + "loss": 5.5401, + "step": 18484 + }, + { + "epoch": 0.91, + "grad_norm": 2.089888334274292, + "learning_rate": 4.338158999950591e-06, + "loss": 5.3467, + "step": 18488 + }, + { + "epoch": 0.91, + "grad_norm": 2.0417628288269043, + "learning_rate": 4.328277088788972e-06, + "loss": 5.3768, + "step": 18492 + }, + { + "epoch": 0.91, + "grad_norm": 2.065882682800293, + "learning_rate": 4.318395177627353e-06, + "loss": 5.4829, + "step": 18496 + }, + { + "epoch": 0.91, + "grad_norm": 2.1247379779815674, + "learning_rate": 4.308513266465734e-06, + "loss": 5.4237, + "step": 18500 + }, + { + "epoch": 0.91, + "grad_norm": 1.8752723932266235, + "learning_rate": 4.298631355304116e-06, + "loss": 5.426, + "step": 18504 + }, + { + "epoch": 0.91, + "grad_norm": 2.2132692337036133, + "learning_rate": 4.2887494441424975e-06, + "loss": 5.4543, + "step": 18508 + }, + { + "epoch": 0.91, + "grad_norm": 2.0311672687530518, + "learning_rate": 4.278867532980879e-06, + "loss": 5.6122, + "step": 18512 + }, + { + "epoch": 0.91, + "grad_norm": 2.083220958709717, + "learning_rate": 4.26898562181926e-06, + "loss": 5.4746, + "step": 18516 + }, + { + "epoch": 0.92, + "grad_norm": 1.9595463275909424, + "learning_rate": 4.259103710657641e-06, + "loss": 5.3808, + "step": 18520 + }, + { + "epoch": 0.92, + "grad_norm": 2.2142648696899414, + "learning_rate": 4.249221799496022e-06, + "loss": 5.4515, + "step": 18524 + }, + { + "epoch": 0.92, + "grad_norm": 1.9906036853790283, + "learning_rate": 4.239339888334404e-06, + "loss": 5.4185, + "step": 18528 + }, + { + "epoch": 0.92, + "grad_norm": 2.193829298019409, + "learning_rate": 4.229457977172786e-06, + "loss": 5.5095, + "step": 18532 + }, + { + "epoch": 0.92, + "grad_norm": 2.1080212593078613, + "learning_rate": 4.219576066011167e-06, + "loss": 5.4778, + "step": 18536 + }, + { + "epoch": 0.92, + "grad_norm": 1.9774274826049805, + "learning_rate": 4.209694154849548e-06, + "loss": 5.557, + "step": 18540 + }, + { + "epoch": 0.92, + "grad_norm": 1.9373048543930054, + "learning_rate": 4.19981224368793e-06, + "loss": 5.2974, + "step": 18544 + }, + { + "epoch": 0.92, + "grad_norm": 2.1534974575042725, + "learning_rate": 4.1899303325263105e-06, + "loss": 5.3349, + "step": 18548 + }, + { + "epoch": 0.92, + "grad_norm": 1.8971552848815918, + "learning_rate": 4.180048421364692e-06, + "loss": 5.4853, + "step": 18552 + }, + { + "epoch": 0.92, + "grad_norm": 2.132871627807617, + "learning_rate": 4.170166510203073e-06, + "loss": 5.4839, + "step": 18556 + }, + { + "epoch": 0.92, + "grad_norm": 2.068107843399048, + "learning_rate": 4.160284599041455e-06, + "loss": 5.5167, + "step": 18560 + }, + { + "epoch": 0.92, + "grad_norm": 1.9894949197769165, + "learning_rate": 4.150402687879836e-06, + "loss": 5.5169, + "step": 18564 + }, + { + "epoch": 0.92, + "grad_norm": 1.914143443107605, + "learning_rate": 4.1405207767182174e-06, + "loss": 5.484, + "step": 18568 + }, + { + "epoch": 0.92, + "grad_norm": 2.168916940689087, + "learning_rate": 4.1306388655565995e-06, + "loss": 5.4795, + "step": 18572 + }, + { + "epoch": 0.92, + "grad_norm": 2.0595452785491943, + "learning_rate": 4.120756954394981e-06, + "loss": 5.5072, + "step": 18576 + }, + { + "epoch": 0.92, + "grad_norm": 1.9776825904846191, + "learning_rate": 4.110875043233361e-06, + "loss": 5.5014, + "step": 18580 + }, + { + "epoch": 0.92, + "grad_norm": 2.120623826980591, + "learning_rate": 4.100993132071742e-06, + "loss": 5.4891, + "step": 18584 + }, + { + "epoch": 0.92, + "grad_norm": 1.789355754852295, + "learning_rate": 4.091111220910124e-06, + "loss": 5.5514, + "step": 18588 + }, + { + "epoch": 0.92, + "grad_norm": 2.0536649227142334, + "learning_rate": 4.0812293097485056e-06, + "loss": 5.3581, + "step": 18592 + }, + { + "epoch": 0.92, + "grad_norm": 1.785683035850525, + "learning_rate": 4.071347398586887e-06, + "loss": 5.4293, + "step": 18596 + }, + { + "epoch": 0.92, + "grad_norm": 1.9793387651443481, + "learning_rate": 4.061465487425268e-06, + "loss": 5.4698, + "step": 18600 + }, + { + "epoch": 0.92, + "grad_norm": 2.0507895946502686, + "learning_rate": 4.05158357626365e-06, + "loss": 5.3736, + "step": 18604 + }, + { + "epoch": 0.92, + "grad_norm": 2.2948641777038574, + "learning_rate": 4.041701665102031e-06, + "loss": 5.4733, + "step": 18608 + }, + { + "epoch": 0.92, + "grad_norm": 2.0238101482391357, + "learning_rate": 4.0318197539404125e-06, + "loss": 5.4463, + "step": 18612 + }, + { + "epoch": 0.92, + "grad_norm": 2.103618860244751, + "learning_rate": 4.021937842778794e-06, + "loss": 5.4943, + "step": 18616 + }, + { + "epoch": 0.92, + "grad_norm": 2.038139581680298, + "learning_rate": 4.012055931617175e-06, + "loss": 5.5276, + "step": 18620 + }, + { + "epoch": 0.92, + "grad_norm": 1.837792158126831, + "learning_rate": 4.002174020455556e-06, + "loss": 5.4706, + "step": 18624 + }, + { + "epoch": 0.92, + "grad_norm": 2.0344648361206055, + "learning_rate": 3.992292109293937e-06, + "loss": 5.3494, + "step": 18628 + }, + { + "epoch": 0.92, + "grad_norm": 2.0213499069213867, + "learning_rate": 3.982410198132319e-06, + "loss": 5.5867, + "step": 18632 + }, + { + "epoch": 0.92, + "grad_norm": 2.209096670150757, + "learning_rate": 3.972528286970701e-06, + "loss": 5.5146, + "step": 18636 + }, + { + "epoch": 0.92, + "grad_norm": 1.924126148223877, + "learning_rate": 3.962646375809082e-06, + "loss": 5.4478, + "step": 18640 + }, + { + "epoch": 0.92, + "grad_norm": 2.2356016635894775, + "learning_rate": 3.952764464647463e-06, + "loss": 5.5293, + "step": 18644 + }, + { + "epoch": 0.92, + "grad_norm": 2.2954514026641846, + "learning_rate": 3.942882553485844e-06, + "loss": 5.4561, + "step": 18648 + }, + { + "epoch": 0.92, + "grad_norm": 2.275831460952759, + "learning_rate": 3.9330006423242254e-06, + "loss": 5.4746, + "step": 18652 + }, + { + "epoch": 0.92, + "grad_norm": 2.116305112838745, + "learning_rate": 3.923118731162607e-06, + "loss": 5.6006, + "step": 18656 + }, + { + "epoch": 0.92, + "grad_norm": 2.1635780334472656, + "learning_rate": 3.913236820000989e-06, + "loss": 5.5606, + "step": 18660 + }, + { + "epoch": 0.92, + "grad_norm": 1.8676637411117554, + "learning_rate": 3.90335490883937e-06, + "loss": 5.4456, + "step": 18664 + }, + { + "epoch": 0.92, + "grad_norm": 2.344409227371216, + "learning_rate": 3.893472997677751e-06, + "loss": 5.5425, + "step": 18668 + }, + { + "epoch": 0.92, + "grad_norm": 2.0696935653686523, + "learning_rate": 3.883591086516132e-06, + "loss": 5.4989, + "step": 18672 + }, + { + "epoch": 0.92, + "grad_norm": 2.1459434032440186, + "learning_rate": 3.8737091753545136e-06, + "loss": 5.4776, + "step": 18676 + }, + { + "epoch": 0.92, + "grad_norm": 2.0918266773223877, + "learning_rate": 3.863827264192895e-06, + "loss": 5.408, + "step": 18680 + }, + { + "epoch": 0.92, + "grad_norm": 1.6926395893096924, + "learning_rate": 3.853945353031276e-06, + "loss": 5.4212, + "step": 18684 + }, + { + "epoch": 0.92, + "grad_norm": 1.9440195560455322, + "learning_rate": 3.844063441869658e-06, + "loss": 5.4617, + "step": 18688 + }, + { + "epoch": 0.92, + "grad_norm": 1.8561819791793823, + "learning_rate": 3.834181530708039e-06, + "loss": 5.5185, + "step": 18692 + }, + { + "epoch": 0.92, + "grad_norm": 2.1534957885742188, + "learning_rate": 3.8242996195464205e-06, + "loss": 5.3607, + "step": 18696 + }, + { + "epoch": 0.92, + "grad_norm": 1.9762316942214966, + "learning_rate": 3.8144177083848017e-06, + "loss": 5.4843, + "step": 18700 + }, + { + "epoch": 0.92, + "grad_norm": 1.9604766368865967, + "learning_rate": 3.8045357972231833e-06, + "loss": 5.5596, + "step": 18704 + }, + { + "epoch": 0.92, + "grad_norm": 2.186389684677124, + "learning_rate": 3.7946538860615645e-06, + "loss": 5.5269, + "step": 18708 + }, + { + "epoch": 0.92, + "grad_norm": 2.211263656616211, + "learning_rate": 3.7847719748999458e-06, + "loss": 5.4103, + "step": 18712 + }, + { + "epoch": 0.92, + "grad_norm": 2.0040395259857178, + "learning_rate": 3.7748900637383274e-06, + "loss": 5.4427, + "step": 18716 + }, + { + "epoch": 0.92, + "grad_norm": 2.08552885055542, + "learning_rate": 3.7650081525767086e-06, + "loss": 5.3101, + "step": 18720 + }, + { + "epoch": 0.93, + "grad_norm": 2.090424060821533, + "learning_rate": 3.75512624141509e-06, + "loss": 5.4502, + "step": 18724 + }, + { + "epoch": 0.93, + "grad_norm": 1.8602705001831055, + "learning_rate": 3.745244330253471e-06, + "loss": 5.4708, + "step": 18728 + }, + { + "epoch": 0.93, + "grad_norm": 1.9746719598770142, + "learning_rate": 3.7353624190918527e-06, + "loss": 5.3835, + "step": 18732 + }, + { + "epoch": 0.93, + "grad_norm": 1.9959601163864136, + "learning_rate": 3.725480507930234e-06, + "loss": 5.4041, + "step": 18736 + }, + { + "epoch": 0.93, + "grad_norm": 2.080343008041382, + "learning_rate": 3.715598596768615e-06, + "loss": 5.5822, + "step": 18740 + }, + { + "epoch": 0.93, + "grad_norm": 2.260117292404175, + "learning_rate": 3.7057166856069963e-06, + "loss": 5.4475, + "step": 18744 + }, + { + "epoch": 0.93, + "grad_norm": 2.2036492824554443, + "learning_rate": 3.695834774445378e-06, + "loss": 5.5317, + "step": 18748 + }, + { + "epoch": 0.93, + "grad_norm": 2.4585494995117188, + "learning_rate": 3.685952863283759e-06, + "loss": 5.5552, + "step": 18752 + }, + { + "epoch": 0.93, + "grad_norm": 2.2832350730895996, + "learning_rate": 3.6760709521221404e-06, + "loss": 5.4459, + "step": 18756 + }, + { + "epoch": 0.93, + "grad_norm": 2.144115924835205, + "learning_rate": 3.666189040960522e-06, + "loss": 5.3545, + "step": 18760 + }, + { + "epoch": 0.93, + "grad_norm": 2.214642286300659, + "learning_rate": 3.6563071297989032e-06, + "loss": 5.3576, + "step": 18764 + }, + { + "epoch": 0.93, + "grad_norm": 1.9967424869537354, + "learning_rate": 3.6464252186372844e-06, + "loss": 5.4909, + "step": 18768 + }, + { + "epoch": 0.93, + "grad_norm": 2.0453543663024902, + "learning_rate": 3.6365433074756657e-06, + "loss": 5.5049, + "step": 18772 + }, + { + "epoch": 0.93, + "grad_norm": 2.09987473487854, + "learning_rate": 3.6266613963140473e-06, + "loss": 5.5611, + "step": 18776 + }, + { + "epoch": 0.93, + "grad_norm": 1.9165294170379639, + "learning_rate": 3.6167794851524285e-06, + "loss": 5.3851, + "step": 18780 + }, + { + "epoch": 0.93, + "grad_norm": 2.128594160079956, + "learning_rate": 3.6068975739908097e-06, + "loss": 5.444, + "step": 18784 + }, + { + "epoch": 0.93, + "grad_norm": 2.029412031173706, + "learning_rate": 3.5970156628291918e-06, + "loss": 5.6027, + "step": 18788 + }, + { + "epoch": 0.93, + "grad_norm": 2.0921339988708496, + "learning_rate": 3.587133751667573e-06, + "loss": 5.4492, + "step": 18792 + }, + { + "epoch": 0.93, + "grad_norm": 2.300293445587158, + "learning_rate": 3.5772518405059538e-06, + "loss": 5.5684, + "step": 18796 + }, + { + "epoch": 0.93, + "grad_norm": 2.034621000289917, + "learning_rate": 3.567369929344335e-06, + "loss": 5.5198, + "step": 18800 + }, + { + "epoch": 0.93, + "grad_norm": 2.3296048641204834, + "learning_rate": 3.557488018182717e-06, + "loss": 5.5555, + "step": 18804 + }, + { + "epoch": 0.93, + "grad_norm": 2.2251405715942383, + "learning_rate": 3.5476061070210983e-06, + "loss": 5.4131, + "step": 18808 + }, + { + "epoch": 0.93, + "grad_norm": 2.1167004108428955, + "learning_rate": 3.537724195859479e-06, + "loss": 5.3361, + "step": 18812 + }, + { + "epoch": 0.93, + "grad_norm": 1.959514856338501, + "learning_rate": 3.527842284697861e-06, + "loss": 5.3026, + "step": 18816 + }, + { + "epoch": 0.93, + "grad_norm": 2.3255879878997803, + "learning_rate": 3.5179603735362423e-06, + "loss": 5.3271, + "step": 18820 + }, + { + "epoch": 0.93, + "grad_norm": 2.0048890113830566, + "learning_rate": 3.5080784623746235e-06, + "loss": 5.4256, + "step": 18824 + }, + { + "epoch": 0.93, + "grad_norm": 2.1140854358673096, + "learning_rate": 3.4981965512130043e-06, + "loss": 5.3592, + "step": 18828 + }, + { + "epoch": 0.93, + "grad_norm": 1.857484221458435, + "learning_rate": 3.4883146400513864e-06, + "loss": 5.4143, + "step": 18832 + }, + { + "epoch": 0.93, + "grad_norm": 2.063621997833252, + "learning_rate": 3.4784327288897676e-06, + "loss": 5.5095, + "step": 18836 + }, + { + "epoch": 0.93, + "grad_norm": 1.9271105527877808, + "learning_rate": 3.468550817728149e-06, + "loss": 5.4678, + "step": 18840 + }, + { + "epoch": 0.93, + "grad_norm": 1.9842108488082886, + "learning_rate": 3.4586689065665296e-06, + "loss": 5.4825, + "step": 18844 + }, + { + "epoch": 0.93, + "grad_norm": 2.128753662109375, + "learning_rate": 3.4487869954049117e-06, + "loss": 5.4128, + "step": 18848 + }, + { + "epoch": 0.93, + "grad_norm": 2.0321123600006104, + "learning_rate": 3.438905084243293e-06, + "loss": 5.4735, + "step": 18852 + }, + { + "epoch": 0.93, + "grad_norm": 2.1090190410614014, + "learning_rate": 3.429023173081674e-06, + "loss": 5.4828, + "step": 18856 + }, + { + "epoch": 0.93, + "grad_norm": 2.016251802444458, + "learning_rate": 3.4191412619200557e-06, + "loss": 5.56, + "step": 18860 + }, + { + "epoch": 0.93, + "grad_norm": 1.863065481185913, + "learning_rate": 3.409259350758437e-06, + "loss": 5.456, + "step": 18864 + }, + { + "epoch": 0.93, + "grad_norm": 2.226012706756592, + "learning_rate": 3.399377439596818e-06, + "loss": 5.5674, + "step": 18868 + }, + { + "epoch": 0.93, + "grad_norm": 2.3455286026000977, + "learning_rate": 3.3894955284351994e-06, + "loss": 5.5108, + "step": 18872 + }, + { + "epoch": 0.93, + "grad_norm": 1.991924524307251, + "learning_rate": 3.379613617273581e-06, + "loss": 5.5043, + "step": 18876 + }, + { + "epoch": 0.93, + "grad_norm": 2.287811279296875, + "learning_rate": 3.3697317061119622e-06, + "loss": 5.4942, + "step": 18880 + }, + { + "epoch": 0.93, + "grad_norm": 2.099695920944214, + "learning_rate": 3.3598497949503434e-06, + "loss": 5.4295, + "step": 18884 + }, + { + "epoch": 0.93, + "grad_norm": 2.0772879123687744, + "learning_rate": 3.349967883788725e-06, + "loss": 5.4999, + "step": 18888 + }, + { + "epoch": 0.93, + "grad_norm": 2.076507329940796, + "learning_rate": 3.3400859726271063e-06, + "loss": 5.3828, + "step": 18892 + }, + { + "epoch": 0.93, + "grad_norm": 1.9823633432388306, + "learning_rate": 3.3302040614654875e-06, + "loss": 5.4312, + "step": 18896 + }, + { + "epoch": 0.93, + "grad_norm": 2.120603322982788, + "learning_rate": 3.3203221503038687e-06, + "loss": 5.4585, + "step": 18900 + }, + { + "epoch": 0.93, + "grad_norm": 1.9160094261169434, + "learning_rate": 3.3104402391422504e-06, + "loss": 5.4471, + "step": 18904 + }, + { + "epoch": 0.93, + "grad_norm": 2.0175113677978516, + "learning_rate": 3.3005583279806316e-06, + "loss": 5.3894, + "step": 18908 + }, + { + "epoch": 0.93, + "grad_norm": 2.052076578140259, + "learning_rate": 3.2906764168190128e-06, + "loss": 5.5049, + "step": 18912 + }, + { + "epoch": 0.93, + "grad_norm": 2.0342178344726562, + "learning_rate": 3.280794505657394e-06, + "loss": 5.4793, + "step": 18916 + }, + { + "epoch": 0.93, + "grad_norm": 2.004908561706543, + "learning_rate": 3.2709125944957756e-06, + "loss": 5.5442, + "step": 18920 + }, + { + "epoch": 0.94, + "grad_norm": 2.1404547691345215, + "learning_rate": 3.261030683334157e-06, + "loss": 5.4585, + "step": 18924 + }, + { + "epoch": 0.94, + "grad_norm": 2.1052112579345703, + "learning_rate": 3.251148772172538e-06, + "loss": 5.5015, + "step": 18928 + }, + { + "epoch": 0.94, + "grad_norm": 2.0737648010253906, + "learning_rate": 3.24126686101092e-06, + "loss": 5.5249, + "step": 18932 + }, + { + "epoch": 0.94, + "grad_norm": 2.041416883468628, + "learning_rate": 3.231384949849301e-06, + "loss": 5.5562, + "step": 18936 + }, + { + "epoch": 0.94, + "grad_norm": 2.186037063598633, + "learning_rate": 3.221503038687682e-06, + "loss": 5.4618, + "step": 18940 + }, + { + "epoch": 0.94, + "grad_norm": 2.2970268726348877, + "learning_rate": 3.2116211275260633e-06, + "loss": 5.3398, + "step": 18944 + }, + { + "epoch": 0.94, + "grad_norm": 2.2040741443634033, + "learning_rate": 3.2017392163644454e-06, + "loss": 5.48, + "step": 18948 + }, + { + "epoch": 0.94, + "grad_norm": 2.0961127281188965, + "learning_rate": 3.191857305202826e-06, + "loss": 5.5835, + "step": 18952 + }, + { + "epoch": 0.94, + "grad_norm": 2.112224578857422, + "learning_rate": 3.1819753940412074e-06, + "loss": 5.5615, + "step": 18956 + }, + { + "epoch": 0.94, + "grad_norm": 1.9072363376617432, + "learning_rate": 3.1720934828795895e-06, + "loss": 5.4394, + "step": 18960 + }, + { + "epoch": 0.94, + "grad_norm": 2.0917303562164307, + "learning_rate": 3.1622115717179707e-06, + "loss": 5.4445, + "step": 18964 + }, + { + "epoch": 0.94, + "grad_norm": 1.9776668548583984, + "learning_rate": 3.152329660556352e-06, + "loss": 5.4521, + "step": 18968 + }, + { + "epoch": 0.94, + "grad_norm": 2.028456449508667, + "learning_rate": 3.1424477493947327e-06, + "loss": 5.4723, + "step": 18972 + }, + { + "epoch": 0.94, + "grad_norm": 2.263448715209961, + "learning_rate": 3.1325658382331147e-06, + "loss": 5.4633, + "step": 18976 + }, + { + "epoch": 0.94, + "grad_norm": 2.058852434158325, + "learning_rate": 3.122683927071496e-06, + "loss": 5.4923, + "step": 18980 + }, + { + "epoch": 0.94, + "grad_norm": 2.173109769821167, + "learning_rate": 3.112802015909877e-06, + "loss": 5.3346, + "step": 18984 + }, + { + "epoch": 0.94, + "grad_norm": 2.0743250846862793, + "learning_rate": 3.1029201047482584e-06, + "loss": 5.5121, + "step": 18988 + }, + { + "epoch": 0.94, + "grad_norm": 2.125735282897949, + "learning_rate": 3.09303819358664e-06, + "loss": 5.3462, + "step": 18992 + }, + { + "epoch": 0.94, + "grad_norm": 1.9164544343948364, + "learning_rate": 3.0831562824250212e-06, + "loss": 5.4138, + "step": 18996 + }, + { + "epoch": 0.94, + "grad_norm": 1.9155079126358032, + "learning_rate": 3.0732743712634024e-06, + "loss": 5.3672, + "step": 19000 + }, + { + "epoch": 0.94, + "grad_norm": 2.128096342086792, + "learning_rate": 3.0633924601017836e-06, + "loss": 5.4021, + "step": 19004 + }, + { + "epoch": 0.94, + "grad_norm": 2.0933046340942383, + "learning_rate": 3.0535105489401653e-06, + "loss": 5.5964, + "step": 19008 + }, + { + "epoch": 0.94, + "grad_norm": 1.9437744617462158, + "learning_rate": 3.0436286377785465e-06, + "loss": 5.5408, + "step": 19012 + }, + { + "epoch": 0.94, + "grad_norm": 2.2945642471313477, + "learning_rate": 3.0337467266169277e-06, + "loss": 5.4818, + "step": 19016 + }, + { + "epoch": 0.94, + "grad_norm": 2.067274570465088, + "learning_rate": 3.023864815455309e-06, + "loss": 5.4306, + "step": 19020 + }, + { + "epoch": 0.94, + "grad_norm": 1.847240686416626, + "learning_rate": 3.0139829042936906e-06, + "loss": 5.451, + "step": 19024 + }, + { + "epoch": 0.94, + "grad_norm": 2.06215238571167, + "learning_rate": 3.004100993132072e-06, + "loss": 5.406, + "step": 19028 + }, + { + "epoch": 0.94, + "grad_norm": 1.9967362880706787, + "learning_rate": 2.994219081970453e-06, + "loss": 5.4419, + "step": 19032 + }, + { + "epoch": 0.94, + "grad_norm": 2.127033233642578, + "learning_rate": 2.9843371708088346e-06, + "loss": 5.4244, + "step": 19036 + }, + { + "epoch": 0.94, + "grad_norm": 2.093514919281006, + "learning_rate": 2.974455259647216e-06, + "loss": 5.4256, + "step": 19040 + }, + { + "epoch": 0.94, + "grad_norm": 2.253448963165283, + "learning_rate": 2.9645733484855975e-06, + "loss": 5.5029, + "step": 19044 + }, + { + "epoch": 0.94, + "grad_norm": 1.996626377105713, + "learning_rate": 2.9546914373239783e-06, + "loss": 5.5046, + "step": 19048 + }, + { + "epoch": 0.94, + "grad_norm": 2.028315782546997, + "learning_rate": 2.94480952616236e-06, + "loss": 5.5133, + "step": 19052 + }, + { + "epoch": 0.94, + "grad_norm": 1.8789162635803223, + "learning_rate": 2.934927615000741e-06, + "loss": 5.5341, + "step": 19056 + }, + { + "epoch": 0.94, + "grad_norm": 2.1197164058685303, + "learning_rate": 2.9250457038391228e-06, + "loss": 5.426, + "step": 19060 + }, + { + "epoch": 0.94, + "grad_norm": 2.14929461479187, + "learning_rate": 2.915163792677504e-06, + "loss": 5.5187, + "step": 19064 + }, + { + "epoch": 0.94, + "grad_norm": 2.1319830417633057, + "learning_rate": 2.905281881515885e-06, + "loss": 5.4122, + "step": 19068 + }, + { + "epoch": 0.94, + "grad_norm": 2.1806631088256836, + "learning_rate": 2.895399970354267e-06, + "loss": 5.483, + "step": 19072 + }, + { + "epoch": 0.94, + "grad_norm": 2.130634307861328, + "learning_rate": 2.885518059192648e-06, + "loss": 5.352, + "step": 19076 + }, + { + "epoch": 0.94, + "grad_norm": 1.8736112117767334, + "learning_rate": 2.8756361480310292e-06, + "loss": 5.4776, + "step": 19080 + }, + { + "epoch": 0.94, + "grad_norm": 1.9261388778686523, + "learning_rate": 2.8657542368694105e-06, + "loss": 5.4317, + "step": 19084 + }, + { + "epoch": 0.94, + "grad_norm": 2.2391560077667236, + "learning_rate": 2.855872325707792e-06, + "loss": 5.4883, + "step": 19088 + }, + { + "epoch": 0.94, + "grad_norm": 2.0670218467712402, + "learning_rate": 2.8459904145461733e-06, + "loss": 5.4033, + "step": 19092 + }, + { + "epoch": 0.94, + "grad_norm": 1.906471610069275, + "learning_rate": 2.8361085033845545e-06, + "loss": 5.4008, + "step": 19096 + }, + { + "epoch": 0.94, + "grad_norm": 1.9118762016296387, + "learning_rate": 2.826226592222936e-06, + "loss": 5.4519, + "step": 19100 + }, + { + "epoch": 0.94, + "grad_norm": 2.1099867820739746, + "learning_rate": 2.8163446810613174e-06, + "loss": 5.4299, + "step": 19104 + }, + { + "epoch": 0.94, + "grad_norm": 1.8047434091567993, + "learning_rate": 2.806462769899699e-06, + "loss": 5.4628, + "step": 19108 + }, + { + "epoch": 0.94, + "grad_norm": 2.058469772338867, + "learning_rate": 2.79658085873808e-06, + "loss": 5.4885, + "step": 19112 + }, + { + "epoch": 0.94, + "grad_norm": 2.310060501098633, + "learning_rate": 2.7866989475764614e-06, + "loss": 5.5399, + "step": 19116 + }, + { + "epoch": 0.94, + "grad_norm": 2.1850757598876953, + "learning_rate": 2.7768170364148426e-06, + "loss": 5.397, + "step": 19120 + }, + { + "epoch": 0.94, + "grad_norm": 1.979308009147644, + "learning_rate": 2.7669351252532243e-06, + "loss": 5.4149, + "step": 19124 + }, + { + "epoch": 0.95, + "grad_norm": 2.1982696056365967, + "learning_rate": 2.7570532140916055e-06, + "loss": 5.4355, + "step": 19128 + }, + { + "epoch": 0.95, + "grad_norm": 2.1380321979522705, + "learning_rate": 2.7471713029299867e-06, + "loss": 5.4576, + "step": 19132 + }, + { + "epoch": 0.95, + "grad_norm": 2.1724483966827393, + "learning_rate": 2.7372893917683683e-06, + "loss": 5.4849, + "step": 19136 + }, + { + "epoch": 0.95, + "grad_norm": 2.1920723915100098, + "learning_rate": 2.7274074806067496e-06, + "loss": 5.5044, + "step": 19140 + }, + { + "epoch": 0.95, + "grad_norm": 2.0876924991607666, + "learning_rate": 2.7175255694451308e-06, + "loss": 5.5164, + "step": 19144 + }, + { + "epoch": 0.95, + "grad_norm": 2.028972864151001, + "learning_rate": 2.707643658283512e-06, + "loss": 5.4324, + "step": 19148 + }, + { + "epoch": 0.95, + "grad_norm": 2.1649794578552246, + "learning_rate": 2.6977617471218936e-06, + "loss": 5.4183, + "step": 19152 + }, + { + "epoch": 0.95, + "grad_norm": 2.0742039680480957, + "learning_rate": 2.687879835960275e-06, + "loss": 5.5804, + "step": 19156 + }, + { + "epoch": 0.95, + "grad_norm": 2.0070149898529053, + "learning_rate": 2.677997924798656e-06, + "loss": 5.4086, + "step": 19160 + }, + { + "epoch": 0.95, + "grad_norm": 2.270585298538208, + "learning_rate": 2.6681160136370377e-06, + "loss": 5.3827, + "step": 19164 + }, + { + "epoch": 0.95, + "grad_norm": 1.9209606647491455, + "learning_rate": 2.658234102475419e-06, + "loss": 5.4162, + "step": 19168 + }, + { + "epoch": 0.95, + "grad_norm": 2.2000796794891357, + "learning_rate": 2.6483521913138005e-06, + "loss": 5.5244, + "step": 19172 + }, + { + "epoch": 0.95, + "grad_norm": 2.1105525493621826, + "learning_rate": 2.6384702801521813e-06, + "loss": 5.554, + "step": 19176 + }, + { + "epoch": 0.95, + "grad_norm": 2.1889917850494385, + "learning_rate": 2.628588368990563e-06, + "loss": 5.434, + "step": 19180 + }, + { + "epoch": 0.95, + "grad_norm": 2.020902395248413, + "learning_rate": 2.618706457828944e-06, + "loss": 5.5205, + "step": 19184 + }, + { + "epoch": 0.95, + "grad_norm": 1.8815635442733765, + "learning_rate": 2.608824546667326e-06, + "loss": 5.3136, + "step": 19188 + }, + { + "epoch": 0.95, + "grad_norm": 2.1251041889190674, + "learning_rate": 2.5989426355057066e-06, + "loss": 5.4888, + "step": 19192 + }, + { + "epoch": 0.95, + "grad_norm": 2.0390384197235107, + "learning_rate": 2.5890607243440882e-06, + "loss": 5.487, + "step": 19196 + }, + { + "epoch": 0.95, + "grad_norm": 2.1622934341430664, + "learning_rate": 2.57917881318247e-06, + "loss": 5.4759, + "step": 19200 + }, + { + "epoch": 0.95, + "grad_norm": 2.190906524658203, + "learning_rate": 2.569296902020851e-06, + "loss": 5.505, + "step": 19204 + }, + { + "epoch": 0.95, + "grad_norm": 2.2829856872558594, + "learning_rate": 2.5594149908592323e-06, + "loss": 5.5217, + "step": 19208 + }, + { + "epoch": 0.95, + "grad_norm": 1.9600523710250854, + "learning_rate": 2.5495330796976135e-06, + "loss": 5.5042, + "step": 19212 + }, + { + "epoch": 0.95, + "grad_norm": 1.987808346748352, + "learning_rate": 2.539651168535995e-06, + "loss": 5.5081, + "step": 19216 + }, + { + "epoch": 0.95, + "grad_norm": 2.160310745239258, + "learning_rate": 2.5297692573743764e-06, + "loss": 5.41, + "step": 19220 + }, + { + "epoch": 0.95, + "grad_norm": 2.06400728225708, + "learning_rate": 2.5198873462127576e-06, + "loss": 5.4132, + "step": 19224 + }, + { + "epoch": 0.95, + "grad_norm": 1.9328960180282593, + "learning_rate": 2.510005435051139e-06, + "loss": 5.4076, + "step": 19228 + }, + { + "epoch": 0.95, + "grad_norm": 2.1214709281921387, + "learning_rate": 2.5001235238895204e-06, + "loss": 5.4865, + "step": 19232 + }, + { + "epoch": 0.95, + "grad_norm": 1.868773102760315, + "learning_rate": 2.4902416127279016e-06, + "loss": 5.41, + "step": 19236 + }, + { + "epoch": 0.95, + "grad_norm": 1.9432061910629272, + "learning_rate": 2.480359701566283e-06, + "loss": 5.4604, + "step": 19240 + }, + { + "epoch": 0.95, + "grad_norm": 1.9144906997680664, + "learning_rate": 2.4704777904046645e-06, + "loss": 5.4058, + "step": 19244 + }, + { + "epoch": 0.95, + "grad_norm": 2.061899185180664, + "learning_rate": 2.4605958792430457e-06, + "loss": 5.3946, + "step": 19248 + }, + { + "epoch": 0.95, + "grad_norm": 2.211974859237671, + "learning_rate": 2.4507139680814273e-06, + "loss": 5.5185, + "step": 19252 + }, + { + "epoch": 0.95, + "grad_norm": 2.155259847640991, + "learning_rate": 2.440832056919808e-06, + "loss": 5.5318, + "step": 19256 + }, + { + "epoch": 0.95, + "grad_norm": 2.0900609493255615, + "learning_rate": 2.4309501457581898e-06, + "loss": 5.3854, + "step": 19260 + }, + { + "epoch": 0.95, + "grad_norm": 2.067147970199585, + "learning_rate": 2.421068234596571e-06, + "loss": 5.5763, + "step": 19264 + }, + { + "epoch": 0.95, + "grad_norm": 1.9264382123947144, + "learning_rate": 2.4111863234349526e-06, + "loss": 5.4579, + "step": 19268 + }, + { + "epoch": 0.95, + "grad_norm": 1.9205288887023926, + "learning_rate": 2.401304412273334e-06, + "loss": 5.5091, + "step": 19272 + }, + { + "epoch": 0.95, + "grad_norm": 2.4955880641937256, + "learning_rate": 2.391422501111715e-06, + "loss": 5.4334, + "step": 19276 + }, + { + "epoch": 0.95, + "grad_norm": 2.0582950115203857, + "learning_rate": 2.3815405899500967e-06, + "loss": 5.5041, + "step": 19280 + }, + { + "epoch": 0.95, + "grad_norm": 1.9333152770996094, + "learning_rate": 2.371658678788478e-06, + "loss": 5.5563, + "step": 19284 + }, + { + "epoch": 0.95, + "grad_norm": 2.0240728855133057, + "learning_rate": 2.361776767626859e-06, + "loss": 5.4964, + "step": 19288 + }, + { + "epoch": 0.95, + "grad_norm": 2.198359251022339, + "learning_rate": 2.3518948564652403e-06, + "loss": 5.5561, + "step": 19292 + }, + { + "epoch": 0.95, + "grad_norm": 2.372915506362915, + "learning_rate": 2.342012945303622e-06, + "loss": 5.5221, + "step": 19296 + }, + { + "epoch": 0.95, + "grad_norm": 2.183767557144165, + "learning_rate": 2.332131034142003e-06, + "loss": 5.4475, + "step": 19300 + }, + { + "epoch": 0.95, + "grad_norm": 2.019258499145508, + "learning_rate": 2.3222491229803844e-06, + "loss": 5.4531, + "step": 19304 + }, + { + "epoch": 0.95, + "grad_norm": 2.098344564437866, + "learning_rate": 2.312367211818766e-06, + "loss": 5.4763, + "step": 19308 + }, + { + "epoch": 0.95, + "grad_norm": 1.9019840955734253, + "learning_rate": 2.3024853006571472e-06, + "loss": 5.5028, + "step": 19312 + }, + { + "epoch": 0.95, + "grad_norm": 1.9219045639038086, + "learning_rate": 2.2926033894955284e-06, + "loss": 5.4599, + "step": 19316 + }, + { + "epoch": 0.95, + "grad_norm": 2.0754764080047607, + "learning_rate": 2.2827214783339097e-06, + "loss": 5.5103, + "step": 19320 + }, + { + "epoch": 0.95, + "grad_norm": 1.8941829204559326, + "learning_rate": 2.2728395671722913e-06, + "loss": 5.4677, + "step": 19324 + }, + { + "epoch": 0.95, + "grad_norm": 2.03706431388855, + "learning_rate": 2.2629576560106725e-06, + "loss": 5.3742, + "step": 19328 + }, + { + "epoch": 0.96, + "grad_norm": 2.2529075145721436, + "learning_rate": 2.253075744849054e-06, + "loss": 5.4839, + "step": 19332 + }, + { + "epoch": 0.96, + "grad_norm": 2.053737163543701, + "learning_rate": 2.243193833687435e-06, + "loss": 5.466, + "step": 19336 + }, + { + "epoch": 0.96, + "grad_norm": 2.1574110984802246, + "learning_rate": 2.2333119225258166e-06, + "loss": 5.3834, + "step": 19340 + }, + { + "epoch": 0.96, + "grad_norm": 2.3568503856658936, + "learning_rate": 2.2234300113641982e-06, + "loss": 5.4472, + "step": 19344 + }, + { + "epoch": 0.96, + "grad_norm": 2.0544795989990234, + "learning_rate": 2.2135481002025794e-06, + "loss": 5.3323, + "step": 19348 + }, + { + "epoch": 0.96, + "grad_norm": 2.053724527359009, + "learning_rate": 2.2036661890409606e-06, + "loss": 5.4349, + "step": 19352 + }, + { + "epoch": 0.96, + "grad_norm": 2.132596015930176, + "learning_rate": 2.193784277879342e-06, + "loss": 5.4897, + "step": 19356 + }, + { + "epoch": 0.96, + "grad_norm": 2.1658577919006348, + "learning_rate": 2.1839023667177235e-06, + "loss": 5.4621, + "step": 19360 + }, + { + "epoch": 0.96, + "grad_norm": 2.2812771797180176, + "learning_rate": 2.1740204555561047e-06, + "loss": 5.3762, + "step": 19364 + }, + { + "epoch": 0.96, + "grad_norm": 2.056185483932495, + "learning_rate": 2.164138544394486e-06, + "loss": 5.4444, + "step": 19368 + }, + { + "epoch": 0.96, + "grad_norm": 2.1531660556793213, + "learning_rate": 2.154256633232867e-06, + "loss": 5.5484, + "step": 19372 + }, + { + "epoch": 0.96, + "grad_norm": 2.3134000301361084, + "learning_rate": 2.1443747220712488e-06, + "loss": 5.5518, + "step": 19376 + }, + { + "epoch": 0.96, + "grad_norm": 2.110779047012329, + "learning_rate": 2.13449281090963e-06, + "loss": 5.5833, + "step": 19380 + }, + { + "epoch": 0.96, + "grad_norm": 1.998799443244934, + "learning_rate": 2.124610899748011e-06, + "loss": 5.4789, + "step": 19384 + }, + { + "epoch": 0.96, + "grad_norm": 1.902309536933899, + "learning_rate": 2.114728988586393e-06, + "loss": 5.4095, + "step": 19388 + }, + { + "epoch": 0.96, + "grad_norm": 2.1755199432373047, + "learning_rate": 2.104847077424774e-06, + "loss": 5.5097, + "step": 19392 + }, + { + "epoch": 0.96, + "grad_norm": 2.089999198913574, + "learning_rate": 2.0949651662631553e-06, + "loss": 5.4806, + "step": 19396 + }, + { + "epoch": 0.96, + "grad_norm": 2.029437780380249, + "learning_rate": 2.0850832551015365e-06, + "loss": 5.406, + "step": 19400 + }, + { + "epoch": 0.96, + "grad_norm": 2.0628230571746826, + "learning_rate": 2.075201343939918e-06, + "loss": 5.5123, + "step": 19404 + }, + { + "epoch": 0.96, + "grad_norm": 2.2448441982269287, + "learning_rate": 2.0653194327782997e-06, + "loss": 5.5619, + "step": 19408 + }, + { + "epoch": 0.96, + "grad_norm": 1.9651387929916382, + "learning_rate": 2.0554375216166805e-06, + "loss": 5.4864, + "step": 19412 + }, + { + "epoch": 0.96, + "grad_norm": 2.1724853515625, + "learning_rate": 2.045555610455062e-06, + "loss": 5.4135, + "step": 19416 + }, + { + "epoch": 0.96, + "grad_norm": 1.981934666633606, + "learning_rate": 2.0356736992934434e-06, + "loss": 5.5062, + "step": 19420 + }, + { + "epoch": 0.96, + "grad_norm": 2.0844616889953613, + "learning_rate": 2.025791788131825e-06, + "loss": 5.4076, + "step": 19424 + }, + { + "epoch": 0.96, + "grad_norm": 2.2012956142425537, + "learning_rate": 2.0159098769702062e-06, + "loss": 5.4093, + "step": 19428 + }, + { + "epoch": 0.96, + "grad_norm": 2.386598587036133, + "learning_rate": 2.0060279658085874e-06, + "loss": 5.4528, + "step": 19432 + }, + { + "epoch": 0.96, + "grad_norm": 1.9763364791870117, + "learning_rate": 1.9961460546469687e-06, + "loss": 5.5794, + "step": 19436 + }, + { + "epoch": 0.96, + "grad_norm": 1.9587762355804443, + "learning_rate": 1.9862641434853503e-06, + "loss": 5.4508, + "step": 19440 + }, + { + "epoch": 0.96, + "grad_norm": 1.9256550073623657, + "learning_rate": 1.9763822323237315e-06, + "loss": 5.4952, + "step": 19444 + }, + { + "epoch": 0.96, + "grad_norm": 2.1667470932006836, + "learning_rate": 1.9665003211621127e-06, + "loss": 5.4601, + "step": 19448 + }, + { + "epoch": 0.96, + "grad_norm": 2.241722583770752, + "learning_rate": 1.9566184100004944e-06, + "loss": 5.4552, + "step": 19452 + }, + { + "epoch": 0.96, + "grad_norm": 2.051391363143921, + "learning_rate": 1.9467364988388756e-06, + "loss": 5.4654, + "step": 19456 + }, + { + "epoch": 0.96, + "grad_norm": 2.4016683101654053, + "learning_rate": 1.9368545876772568e-06, + "loss": 5.4888, + "step": 19460 + }, + { + "epoch": 0.96, + "grad_norm": 2.049546480178833, + "learning_rate": 1.926972676515638e-06, + "loss": 5.4593, + "step": 19464 + }, + { + "epoch": 0.96, + "grad_norm": 2.011448621749878, + "learning_rate": 1.9170907653540196e-06, + "loss": 5.4591, + "step": 19468 + }, + { + "epoch": 0.96, + "grad_norm": 2.133927345275879, + "learning_rate": 1.9072088541924008e-06, + "loss": 5.5276, + "step": 19472 + }, + { + "epoch": 0.96, + "grad_norm": 2.1823043823242188, + "learning_rate": 1.8973269430307823e-06, + "loss": 5.4671, + "step": 19476 + }, + { + "epoch": 0.96, + "grad_norm": 2.2862277030944824, + "learning_rate": 1.8874450318691637e-06, + "loss": 5.3897, + "step": 19480 + }, + { + "epoch": 0.96, + "grad_norm": 1.9178924560546875, + "learning_rate": 1.877563120707545e-06, + "loss": 5.4061, + "step": 19484 + }, + { + "epoch": 0.96, + "grad_norm": 2.043980836868286, + "learning_rate": 1.8676812095459263e-06, + "loss": 5.4469, + "step": 19488 + }, + { + "epoch": 0.96, + "grad_norm": 2.0812246799468994, + "learning_rate": 1.8577992983843076e-06, + "loss": 5.4019, + "step": 19492 + }, + { + "epoch": 0.96, + "grad_norm": 2.1293153762817383, + "learning_rate": 1.847917387222689e-06, + "loss": 5.5026, + "step": 19496 + }, + { + "epoch": 0.96, + "grad_norm": 2.150707960128784, + "learning_rate": 1.8380354760610702e-06, + "loss": 5.5382, + "step": 19500 + }, + { + "epoch": 0.96, + "grad_norm": 2.0206191539764404, + "learning_rate": 1.8281535648994516e-06, + "loss": 5.3712, + "step": 19504 + }, + { + "epoch": 0.96, + "grad_norm": 2.0948994159698486, + "learning_rate": 1.8182716537378328e-06, + "loss": 5.3896, + "step": 19508 + }, + { + "epoch": 0.96, + "grad_norm": 2.020963191986084, + "learning_rate": 1.8083897425762143e-06, + "loss": 5.3902, + "step": 19512 + }, + { + "epoch": 0.96, + "grad_norm": 2.0603950023651123, + "learning_rate": 1.7985078314145959e-06, + "loss": 5.4755, + "step": 19516 + }, + { + "epoch": 0.96, + "grad_norm": 1.9533694982528687, + "learning_rate": 1.7886259202529769e-06, + "loss": 5.3764, + "step": 19520 + }, + { + "epoch": 0.96, + "grad_norm": 2.0614960193634033, + "learning_rate": 1.7787440090913585e-06, + "loss": 5.5436, + "step": 19524 + }, + { + "epoch": 0.96, + "grad_norm": 1.9552912712097168, + "learning_rate": 1.7688620979297395e-06, + "loss": 5.4593, + "step": 19528 + }, + { + "epoch": 0.97, + "grad_norm": 2.1261250972747803, + "learning_rate": 1.7589801867681212e-06, + "loss": 5.3616, + "step": 19532 + }, + { + "epoch": 0.97, + "grad_norm": 1.9987773895263672, + "learning_rate": 1.7490982756065022e-06, + "loss": 5.4697, + "step": 19536 + }, + { + "epoch": 0.97, + "grad_norm": 1.9402105808258057, + "learning_rate": 1.7392163644448838e-06, + "loss": 5.4345, + "step": 19540 + }, + { + "epoch": 0.97, + "grad_norm": 1.9414345026016235, + "learning_rate": 1.7293344532832648e-06, + "loss": 5.3996, + "step": 19544 + }, + { + "epoch": 0.97, + "grad_norm": 2.1511659622192383, + "learning_rate": 1.7194525421216464e-06, + "loss": 5.3444, + "step": 19548 + }, + { + "epoch": 0.97, + "grad_norm": 2.011024236679077, + "learning_rate": 1.7095706309600279e-06, + "loss": 5.5161, + "step": 19552 + }, + { + "epoch": 0.97, + "grad_norm": 2.05631422996521, + "learning_rate": 1.699688719798409e-06, + "loss": 5.451, + "step": 19556 + }, + { + "epoch": 0.97, + "grad_norm": 2.112424373626709, + "learning_rate": 1.6898068086367905e-06, + "loss": 5.5912, + "step": 19560 + }, + { + "epoch": 0.97, + "grad_norm": 2.210801839828491, + "learning_rate": 1.6799248974751717e-06, + "loss": 5.5403, + "step": 19564 + }, + { + "epoch": 0.97, + "grad_norm": 2.0372581481933594, + "learning_rate": 1.6700429863135531e-06, + "loss": 5.3212, + "step": 19568 + }, + { + "epoch": 0.97, + "grad_norm": 1.9437897205352783, + "learning_rate": 1.6601610751519344e-06, + "loss": 5.49, + "step": 19572 + }, + { + "epoch": 0.97, + "grad_norm": 2.0939133167266846, + "learning_rate": 1.6502791639903158e-06, + "loss": 5.5276, + "step": 19576 + }, + { + "epoch": 0.97, + "grad_norm": 2.0417065620422363, + "learning_rate": 1.640397252828697e-06, + "loss": 5.4277, + "step": 19580 + }, + { + "epoch": 0.97, + "grad_norm": 1.9575787782669067, + "learning_rate": 1.6305153416670784e-06, + "loss": 5.4794, + "step": 19584 + }, + { + "epoch": 0.97, + "grad_norm": 2.010903835296631, + "learning_rate": 1.62063343050546e-06, + "loss": 5.5096, + "step": 19588 + }, + { + "epoch": 0.97, + "grad_norm": 1.9882851839065552, + "learning_rate": 1.610751519343841e-06, + "loss": 5.3729, + "step": 19592 + }, + { + "epoch": 0.97, + "grad_norm": 2.00439190864563, + "learning_rate": 1.6008696081822227e-06, + "loss": 5.4072, + "step": 19596 + }, + { + "epoch": 0.97, + "grad_norm": 2.154120445251465, + "learning_rate": 1.5909876970206037e-06, + "loss": 5.546, + "step": 19600 + }, + { + "epoch": 0.97, + "grad_norm": 2.1139557361602783, + "learning_rate": 1.5811057858589853e-06, + "loss": 5.5374, + "step": 19604 + }, + { + "epoch": 0.97, + "grad_norm": 2.38275408744812, + "learning_rate": 1.5712238746973663e-06, + "loss": 5.6017, + "step": 19608 + }, + { + "epoch": 0.97, + "grad_norm": 2.0247836112976074, + "learning_rate": 1.561341963535748e-06, + "loss": 5.4871, + "step": 19612 + }, + { + "epoch": 0.97, + "grad_norm": 1.97475004196167, + "learning_rate": 1.5514600523741292e-06, + "loss": 5.5351, + "step": 19616 + }, + { + "epoch": 0.97, + "grad_norm": 2.0974037647247314, + "learning_rate": 1.5415781412125106e-06, + "loss": 5.5301, + "step": 19620 + }, + { + "epoch": 0.97, + "grad_norm": 2.1904966831207275, + "learning_rate": 1.5316962300508918e-06, + "loss": 5.4878, + "step": 19624 + }, + { + "epoch": 0.97, + "grad_norm": 2.0616374015808105, + "learning_rate": 1.5218143188892732e-06, + "loss": 5.4575, + "step": 19628 + }, + { + "epoch": 0.97, + "grad_norm": 2.0660183429718018, + "learning_rate": 1.5119324077276545e-06, + "loss": 5.3544, + "step": 19632 + }, + { + "epoch": 0.97, + "grad_norm": 1.9736340045928955, + "learning_rate": 1.502050496566036e-06, + "loss": 5.4619, + "step": 19636 + }, + { + "epoch": 0.97, + "grad_norm": 2.014892339706421, + "learning_rate": 1.4921685854044173e-06, + "loss": 5.3977, + "step": 19640 + }, + { + "epoch": 0.97, + "grad_norm": 1.8507190942764282, + "learning_rate": 1.4822866742427987e-06, + "loss": 5.4187, + "step": 19644 + }, + { + "epoch": 0.97, + "grad_norm": 2.1220788955688477, + "learning_rate": 1.47240476308118e-06, + "loss": 5.4812, + "step": 19648 + }, + { + "epoch": 0.97, + "grad_norm": 2.1677944660186768, + "learning_rate": 1.4625228519195614e-06, + "loss": 5.3252, + "step": 19652 + }, + { + "epoch": 0.97, + "grad_norm": 2.054081439971924, + "learning_rate": 1.4526409407579426e-06, + "loss": 5.5602, + "step": 19656 + }, + { + "epoch": 0.97, + "grad_norm": 2.1197407245635986, + "learning_rate": 1.442759029596324e-06, + "loss": 5.4585, + "step": 19660 + }, + { + "epoch": 0.97, + "grad_norm": 2.005307674407959, + "learning_rate": 1.4328771184347052e-06, + "loss": 5.4986, + "step": 19664 + }, + { + "epoch": 0.97, + "grad_norm": 2.428250312805176, + "learning_rate": 1.4229952072730867e-06, + "loss": 5.3953, + "step": 19668 + }, + { + "epoch": 0.97, + "grad_norm": 2.051632881164551, + "learning_rate": 1.413113296111468e-06, + "loss": 5.4902, + "step": 19672 + }, + { + "epoch": 0.97, + "grad_norm": 2.1171364784240723, + "learning_rate": 1.4032313849498495e-06, + "loss": 5.411, + "step": 19676 + }, + { + "epoch": 0.97, + "grad_norm": 1.9378799200057983, + "learning_rate": 1.3933494737882307e-06, + "loss": 5.401, + "step": 19680 + }, + { + "epoch": 0.97, + "grad_norm": 2.151566743850708, + "learning_rate": 1.3834675626266121e-06, + "loss": 5.4693, + "step": 19684 + }, + { + "epoch": 0.97, + "grad_norm": 1.948789358139038, + "learning_rate": 1.3735856514649934e-06, + "loss": 5.4927, + "step": 19688 + }, + { + "epoch": 0.97, + "grad_norm": 2.4134631156921387, + "learning_rate": 1.3637037403033748e-06, + "loss": 5.5223, + "step": 19692 + }, + { + "epoch": 0.97, + "grad_norm": 2.0687310695648193, + "learning_rate": 1.353821829141756e-06, + "loss": 5.6205, + "step": 19696 + }, + { + "epoch": 0.97, + "grad_norm": 2.03088641166687, + "learning_rate": 1.3439399179801374e-06, + "loss": 5.4587, + "step": 19700 + }, + { + "epoch": 0.97, + "grad_norm": 2.0663702487945557, + "learning_rate": 1.3340580068185188e-06, + "loss": 5.4831, + "step": 19704 + }, + { + "epoch": 0.97, + "grad_norm": 1.904789686203003, + "learning_rate": 1.3241760956569003e-06, + "loss": 5.5511, + "step": 19708 + }, + { + "epoch": 0.97, + "grad_norm": 2.119781970977783, + "learning_rate": 1.3142941844952815e-06, + "loss": 5.4318, + "step": 19712 + }, + { + "epoch": 0.97, + "grad_norm": 1.9469795227050781, + "learning_rate": 1.304412273333663e-06, + "loss": 5.4406, + "step": 19716 + }, + { + "epoch": 0.97, + "grad_norm": 2.30818247795105, + "learning_rate": 1.2945303621720441e-06, + "loss": 5.4206, + "step": 19720 + }, + { + "epoch": 0.97, + "grad_norm": 1.9821704626083374, + "learning_rate": 1.2846484510104255e-06, + "loss": 5.5063, + "step": 19724 + }, + { + "epoch": 0.97, + "grad_norm": 2.095536470413208, + "learning_rate": 1.2747665398488068e-06, + "loss": 5.545, + "step": 19728 + }, + { + "epoch": 0.97, + "grad_norm": 2.048340320587158, + "learning_rate": 1.2648846286871882e-06, + "loss": 5.386, + "step": 19732 + }, + { + "epoch": 0.98, + "grad_norm": 2.1303012371063232, + "learning_rate": 1.2550027175255694e-06, + "loss": 5.433, + "step": 19736 + }, + { + "epoch": 0.98, + "grad_norm": 2.1081206798553467, + "learning_rate": 1.2451208063639508e-06, + "loss": 5.4296, + "step": 19740 + }, + { + "epoch": 0.98, + "grad_norm": 2.0982227325439453, + "learning_rate": 1.2352388952023322e-06, + "loss": 5.4267, + "step": 19744 + }, + { + "epoch": 0.98, + "grad_norm": 1.9972717761993408, + "learning_rate": 1.2253569840407137e-06, + "loss": 5.55, + "step": 19748 + }, + { + "epoch": 0.98, + "grad_norm": 2.1928958892822266, + "learning_rate": 1.2154750728790949e-06, + "loss": 5.5068, + "step": 19752 + }, + { + "epoch": 0.98, + "grad_norm": 1.973193883895874, + "learning_rate": 1.2055931617174763e-06, + "loss": 5.4586, + "step": 19756 + }, + { + "epoch": 0.98, + "grad_norm": 1.975972056388855, + "learning_rate": 1.1957112505558575e-06, + "loss": 5.2701, + "step": 19760 + }, + { + "epoch": 0.98, + "grad_norm": 2.053972005844116, + "learning_rate": 1.185829339394239e-06, + "loss": 5.4591, + "step": 19764 + }, + { + "epoch": 0.98, + "grad_norm": 2.108062267303467, + "learning_rate": 1.1759474282326202e-06, + "loss": 5.4726, + "step": 19768 + }, + { + "epoch": 0.98, + "grad_norm": 2.1291093826293945, + "learning_rate": 1.1660655170710016e-06, + "loss": 5.459, + "step": 19772 + }, + { + "epoch": 0.98, + "grad_norm": 2.2266008853912354, + "learning_rate": 1.156183605909383e-06, + "loss": 5.4267, + "step": 19776 + }, + { + "epoch": 0.98, + "grad_norm": 2.031205892562866, + "learning_rate": 1.1463016947477642e-06, + "loss": 5.3383, + "step": 19780 + }, + { + "epoch": 0.98, + "grad_norm": 2.1661624908447266, + "learning_rate": 1.1364197835861456e-06, + "loss": 5.4843, + "step": 19784 + }, + { + "epoch": 0.98, + "grad_norm": 2.1752281188964844, + "learning_rate": 1.126537872424527e-06, + "loss": 5.4579, + "step": 19788 + }, + { + "epoch": 0.98, + "grad_norm": 2.279290199279785, + "learning_rate": 1.1166559612629083e-06, + "loss": 5.4887, + "step": 19792 + }, + { + "epoch": 0.98, + "grad_norm": 1.843815803527832, + "learning_rate": 1.1067740501012897e-06, + "loss": 5.4187, + "step": 19796 + }, + { + "epoch": 0.98, + "grad_norm": 1.9832565784454346, + "learning_rate": 1.096892138939671e-06, + "loss": 5.5083, + "step": 19800 + }, + { + "epoch": 0.98, + "grad_norm": 1.7962124347686768, + "learning_rate": 1.0870102277780524e-06, + "loss": 5.5071, + "step": 19804 + }, + { + "epoch": 0.98, + "grad_norm": 2.0890607833862305, + "learning_rate": 1.0771283166164336e-06, + "loss": 5.56, + "step": 19808 + }, + { + "epoch": 0.98, + "grad_norm": 1.972830057144165, + "learning_rate": 1.067246405454815e-06, + "loss": 5.4381, + "step": 19812 + }, + { + "epoch": 0.98, + "grad_norm": 1.968068242073059, + "learning_rate": 1.0573644942931964e-06, + "loss": 5.5253, + "step": 19816 + }, + { + "epoch": 0.98, + "grad_norm": 2.201663017272949, + "learning_rate": 1.0474825831315776e-06, + "loss": 5.5018, + "step": 19820 + }, + { + "epoch": 0.98, + "grad_norm": 1.8504890203475952, + "learning_rate": 1.037600671969959e-06, + "loss": 5.4115, + "step": 19824 + }, + { + "epoch": 0.98, + "grad_norm": 2.112267017364502, + "learning_rate": 1.0277187608083403e-06, + "loss": 5.3483, + "step": 19828 + }, + { + "epoch": 0.98, + "grad_norm": 2.159766435623169, + "learning_rate": 1.0178368496467217e-06, + "loss": 5.381, + "step": 19832 + }, + { + "epoch": 0.98, + "grad_norm": 2.1608057022094727, + "learning_rate": 1.0079549384851031e-06, + "loss": 5.4216, + "step": 19836 + }, + { + "epoch": 0.98, + "grad_norm": 1.8269460201263428, + "learning_rate": 9.980730273234843e-07, + "loss": 5.4642, + "step": 19840 + }, + { + "epoch": 0.98, + "grad_norm": 2.0793392658233643, + "learning_rate": 9.881911161618658e-07, + "loss": 5.4693, + "step": 19844 + }, + { + "epoch": 0.98, + "grad_norm": 2.1695215702056885, + "learning_rate": 9.783092050002472e-07, + "loss": 5.5016, + "step": 19848 + }, + { + "epoch": 0.98, + "grad_norm": 1.9932477474212646, + "learning_rate": 9.684272938386284e-07, + "loss": 5.4885, + "step": 19852 + }, + { + "epoch": 0.98, + "grad_norm": 1.9905871152877808, + "learning_rate": 9.585453826770098e-07, + "loss": 5.4697, + "step": 19856 + }, + { + "epoch": 0.98, + "grad_norm": 1.9750021696090698, + "learning_rate": 9.486634715153911e-07, + "loss": 5.3438, + "step": 19860 + }, + { + "epoch": 0.98, + "grad_norm": 1.9781215190887451, + "learning_rate": 9.387815603537725e-07, + "loss": 5.4243, + "step": 19864 + }, + { + "epoch": 0.98, + "grad_norm": 2.112760543823242, + "learning_rate": 9.288996491921538e-07, + "loss": 5.496, + "step": 19868 + }, + { + "epoch": 0.98, + "grad_norm": 2.029419422149658, + "learning_rate": 9.190177380305351e-07, + "loss": 5.4413, + "step": 19872 + }, + { + "epoch": 0.98, + "grad_norm": 2.0092504024505615, + "learning_rate": 9.091358268689164e-07, + "loss": 5.3783, + "step": 19876 + }, + { + "epoch": 0.98, + "grad_norm": 2.1386382579803467, + "learning_rate": 8.992539157072979e-07, + "loss": 5.4708, + "step": 19880 + }, + { + "epoch": 0.98, + "grad_norm": 1.9855784177780151, + "learning_rate": 8.893720045456793e-07, + "loss": 5.5455, + "step": 19884 + }, + { + "epoch": 0.98, + "grad_norm": 2.007969856262207, + "learning_rate": 8.794900933840606e-07, + "loss": 5.5531, + "step": 19888 + }, + { + "epoch": 0.98, + "grad_norm": 1.9627690315246582, + "learning_rate": 8.696081822224419e-07, + "loss": 5.4983, + "step": 19892 + }, + { + "epoch": 0.98, + "grad_norm": 2.0733351707458496, + "learning_rate": 8.597262710608232e-07, + "loss": 5.4475, + "step": 19896 + }, + { + "epoch": 0.98, + "grad_norm": 2.1221306324005127, + "learning_rate": 8.498443598992045e-07, + "loss": 5.5056, + "step": 19900 + }, + { + "epoch": 0.98, + "grad_norm": 1.9957540035247803, + "learning_rate": 8.399624487375859e-07, + "loss": 5.5603, + "step": 19904 + }, + { + "epoch": 0.98, + "grad_norm": 2.1626851558685303, + "learning_rate": 8.300805375759672e-07, + "loss": 5.485, + "step": 19908 + }, + { + "epoch": 0.98, + "grad_norm": 1.9834141731262207, + "learning_rate": 8.201986264143485e-07, + "loss": 5.3205, + "step": 19912 + }, + { + "epoch": 0.98, + "grad_norm": 2.065784454345703, + "learning_rate": 8.1031671525273e-07, + "loss": 5.5108, + "step": 19916 + }, + { + "epoch": 0.98, + "grad_norm": 2.299056053161621, + "learning_rate": 8.004348040911113e-07, + "loss": 5.5952, + "step": 19920 + }, + { + "epoch": 0.98, + "grad_norm": 2.0861730575561523, + "learning_rate": 7.905528929294927e-07, + "loss": 5.4168, + "step": 19924 + }, + { + "epoch": 0.98, + "grad_norm": 2.2378146648406982, + "learning_rate": 7.80670981767874e-07, + "loss": 5.4531, + "step": 19928 + }, + { + "epoch": 0.98, + "grad_norm": 1.9151175022125244, + "learning_rate": 7.707890706062553e-07, + "loss": 5.4655, + "step": 19932 + }, + { + "epoch": 0.99, + "grad_norm": 2.122283697128296, + "learning_rate": 7.609071594446366e-07, + "loss": 5.4724, + "step": 19936 + }, + { + "epoch": 0.99, + "grad_norm": 2.1069324016571045, + "learning_rate": 7.51025248283018e-07, + "loss": 5.5641, + "step": 19940 + }, + { + "epoch": 0.99, + "grad_norm": 2.006833791732788, + "learning_rate": 7.411433371213994e-07, + "loss": 5.4192, + "step": 19944 + }, + { + "epoch": 0.99, + "grad_norm": 2.0627007484436035, + "learning_rate": 7.312614259597807e-07, + "loss": 5.5582, + "step": 19948 + }, + { + "epoch": 0.99, + "grad_norm": 1.9530029296875, + "learning_rate": 7.21379514798162e-07, + "loss": 5.5445, + "step": 19952 + }, + { + "epoch": 0.99, + "grad_norm": 2.1795194149017334, + "learning_rate": 7.114976036365433e-07, + "loss": 5.5048, + "step": 19956 + }, + { + "epoch": 0.99, + "grad_norm": 2.1359763145446777, + "learning_rate": 7.016156924749248e-07, + "loss": 5.432, + "step": 19960 + }, + { + "epoch": 0.99, + "grad_norm": 2.1916072368621826, + "learning_rate": 6.917337813133061e-07, + "loss": 5.4901, + "step": 19964 + }, + { + "epoch": 0.99, + "grad_norm": 2.025327205657959, + "learning_rate": 6.818518701516874e-07, + "loss": 5.3755, + "step": 19968 + }, + { + "epoch": 0.99, + "grad_norm": 2.0348243713378906, + "learning_rate": 6.719699589900687e-07, + "loss": 5.5042, + "step": 19972 + }, + { + "epoch": 0.99, + "grad_norm": 2.128100633621216, + "learning_rate": 6.620880478284501e-07, + "loss": 5.4892, + "step": 19976 + }, + { + "epoch": 0.99, + "grad_norm": 2.0665409564971924, + "learning_rate": 6.522061366668315e-07, + "loss": 5.4291, + "step": 19980 + }, + { + "epoch": 0.99, + "grad_norm": 2.2713510990142822, + "learning_rate": 6.423242255052128e-07, + "loss": 5.5485, + "step": 19984 + }, + { + "epoch": 0.99, + "grad_norm": 1.9565412998199463, + "learning_rate": 6.324423143435941e-07, + "loss": 5.3597, + "step": 19988 + }, + { + "epoch": 0.99, + "grad_norm": 2.122183084487915, + "learning_rate": 6.225604031819754e-07, + "loss": 5.5183, + "step": 19992 + }, + { + "epoch": 0.99, + "grad_norm": 2.2601876258850098, + "learning_rate": 6.126784920203568e-07, + "loss": 5.3787, + "step": 19996 + }, + { + "epoch": 0.99, + "grad_norm": 1.9778292179107666, + "learning_rate": 6.027965808587382e-07, + "loss": 5.3719, + "step": 20000 + }, + { + "epoch": 0.99, + "grad_norm": 2.0290167331695557, + "learning_rate": 5.929146696971195e-07, + "loss": 5.4688, + "step": 20004 + }, + { + "epoch": 0.99, + "grad_norm": 2.108403444290161, + "learning_rate": 5.830327585355008e-07, + "loss": 5.3245, + "step": 20008 + }, + { + "epoch": 0.99, + "grad_norm": 2.3024027347564697, + "learning_rate": 5.731508473738821e-07, + "loss": 5.4963, + "step": 20012 + }, + { + "epoch": 0.99, + "grad_norm": 2.154008150100708, + "learning_rate": 5.632689362122635e-07, + "loss": 5.4071, + "step": 20016 + }, + { + "epoch": 0.99, + "grad_norm": 2.2379016876220703, + "learning_rate": 5.533870250506449e-07, + "loss": 5.4005, + "step": 20020 + }, + { + "epoch": 0.99, + "grad_norm": 2.0857393741607666, + "learning_rate": 5.435051138890262e-07, + "loss": 5.5153, + "step": 20024 + }, + { + "epoch": 0.99, + "grad_norm": 1.9951947927474976, + "learning_rate": 5.336232027274075e-07, + "loss": 5.4283, + "step": 20028 + }, + { + "epoch": 0.99, + "grad_norm": 2.357630729675293, + "learning_rate": 5.237412915657888e-07, + "loss": 5.4484, + "step": 20032 + }, + { + "epoch": 0.99, + "grad_norm": 2.1432507038116455, + "learning_rate": 5.138593804041701e-07, + "loss": 5.4331, + "step": 20036 + }, + { + "epoch": 0.99, + "grad_norm": 1.8710029125213623, + "learning_rate": 5.039774692425516e-07, + "loss": 5.331, + "step": 20040 + }, + { + "epoch": 0.99, + "grad_norm": 2.1204705238342285, + "learning_rate": 4.940955580809329e-07, + "loss": 5.3974, + "step": 20044 + }, + { + "epoch": 0.99, + "grad_norm": 2.2792441844940186, + "learning_rate": 4.842136469193142e-07, + "loss": 5.3835, + "step": 20048 + }, + { + "epoch": 0.99, + "grad_norm": 2.204084873199463, + "learning_rate": 4.7433173575769557e-07, + "loss": 5.457, + "step": 20052 + }, + { + "epoch": 0.99, + "grad_norm": 2.0620133876800537, + "learning_rate": 4.644498245960769e-07, + "loss": 5.4719, + "step": 20056 + }, + { + "epoch": 0.99, + "grad_norm": 2.0034713745117188, + "learning_rate": 4.545679134344582e-07, + "loss": 5.3951, + "step": 20060 + }, + { + "epoch": 0.99, + "grad_norm": 1.9210227727890015, + "learning_rate": 4.4468600227283963e-07, + "loss": 5.5225, + "step": 20064 + }, + { + "epoch": 0.99, + "grad_norm": 2.102591037750244, + "learning_rate": 4.3480409111122095e-07, + "loss": 5.5268, + "step": 20068 + }, + { + "epoch": 0.99, + "grad_norm": 2.0443971157073975, + "learning_rate": 4.2492217994960227e-07, + "loss": 5.5364, + "step": 20072 + }, + { + "epoch": 0.99, + "grad_norm": 2.135408878326416, + "learning_rate": 4.150402687879836e-07, + "loss": 5.4666, + "step": 20076 + }, + { + "epoch": 0.99, + "grad_norm": 2.1176159381866455, + "learning_rate": 4.05158357626365e-07, + "loss": 5.4501, + "step": 20080 + }, + { + "epoch": 0.99, + "grad_norm": 1.8199737071990967, + "learning_rate": 3.9527644646474633e-07, + "loss": 5.4657, + "step": 20084 + }, + { + "epoch": 0.99, + "grad_norm": 2.200198173522949, + "learning_rate": 3.8539453530312765e-07, + "loss": 5.6004, + "step": 20088 + }, + { + "epoch": 0.99, + "grad_norm": 1.9499452114105225, + "learning_rate": 3.75512624141509e-07, + "loss": 5.5219, + "step": 20092 + }, + { + "epoch": 0.99, + "grad_norm": 2.084623098373413, + "learning_rate": 3.6563071297989034e-07, + "loss": 5.5269, + "step": 20096 + }, + { + "epoch": 0.99, + "grad_norm": 2.1379587650299072, + "learning_rate": 3.5574880181827166e-07, + "loss": 5.2866, + "step": 20100 + }, + { + "epoch": 0.99, + "grad_norm": 2.1811280250549316, + "learning_rate": 3.4586689065665304e-07, + "loss": 5.5607, + "step": 20104 + }, + { + "epoch": 0.99, + "grad_norm": 2.0732221603393555, + "learning_rate": 3.3598497949503435e-07, + "loss": 5.5122, + "step": 20108 + }, + { + "epoch": 0.99, + "grad_norm": 2.181933641433716, + "learning_rate": 3.2610306833341573e-07, + "loss": 5.5222, + "step": 20112 + }, + { + "epoch": 0.99, + "grad_norm": 1.8368821144104004, + "learning_rate": 3.1622115717179705e-07, + "loss": 5.3693, + "step": 20116 + }, + { + "epoch": 0.99, + "grad_norm": 2.0082316398620605, + "learning_rate": 3.063392460101784e-07, + "loss": 5.3343, + "step": 20120 + }, + { + "epoch": 0.99, + "grad_norm": 1.952849268913269, + "learning_rate": 2.9645733484855974e-07, + "loss": 5.377, + "step": 20124 + }, + { + "epoch": 0.99, + "grad_norm": 1.9544694423675537, + "learning_rate": 2.8657542368694106e-07, + "loss": 5.4439, + "step": 20128 + }, + { + "epoch": 0.99, + "grad_norm": 2.0548641681671143, + "learning_rate": 2.7669351252532243e-07, + "loss": 5.5092, + "step": 20132 + }, + { + "epoch": 0.99, + "grad_norm": 2.101043939590454, + "learning_rate": 2.6681160136370375e-07, + "loss": 5.3574, + "step": 20136 + }, + { + "epoch": 1.0, + "grad_norm": 2.1047868728637695, + "learning_rate": 2.5692969020208507e-07, + "loss": 5.513, + "step": 20140 + }, + { + "epoch": 1.0, + "grad_norm": 1.9184880256652832, + "learning_rate": 2.4704777904046644e-07, + "loss": 5.3459, + "step": 20144 + }, + { + "epoch": 1.0, + "grad_norm": 2.2628798484802246, + "learning_rate": 2.3716586787884778e-07, + "loss": 5.4285, + "step": 20148 + }, + { + "epoch": 1.0, + "grad_norm": 2.3330535888671875, + "learning_rate": 2.272839567172291e-07, + "loss": 5.4673, + "step": 20152 + }, + { + "epoch": 1.0, + "grad_norm": 2.0739777088165283, + "learning_rate": 2.1740204555561048e-07, + "loss": 5.4847, + "step": 20156 + }, + { + "epoch": 1.0, + "grad_norm": 2.010349750518799, + "learning_rate": 2.075201343939918e-07, + "loss": 5.4364, + "step": 20160 + }, + { + "epoch": 1.0, + "grad_norm": 1.8513480424880981, + "learning_rate": 1.9763822323237317e-07, + "loss": 5.528, + "step": 20164 + }, + { + "epoch": 1.0, + "grad_norm": 2.0048437118530273, + "learning_rate": 1.877563120707545e-07, + "loss": 5.3088, + "step": 20168 + }, + { + "epoch": 1.0, + "grad_norm": 1.953896403312683, + "learning_rate": 1.7787440090913583e-07, + "loss": 5.4566, + "step": 20172 + }, + { + "epoch": 1.0, + "grad_norm": 1.9878010749816895, + "learning_rate": 1.6799248974751718e-07, + "loss": 5.3852, + "step": 20176 + }, + { + "epoch": 1.0, + "grad_norm": 2.017378330230713, + "learning_rate": 1.5811057858589852e-07, + "loss": 5.334, + "step": 20180 + }, + { + "epoch": 1.0, + "grad_norm": 1.9125686883926392, + "learning_rate": 1.4822866742427987e-07, + "loss": 5.4209, + "step": 20184 + }, + { + "epoch": 1.0, + "grad_norm": 2.205244302749634, + "learning_rate": 1.3834675626266121e-07, + "loss": 5.443, + "step": 20188 + }, + { + "epoch": 1.0, + "grad_norm": 1.9783563613891602, + "learning_rate": 1.2846484510104253e-07, + "loss": 5.5629, + "step": 20192 + }, + { + "epoch": 1.0, + "grad_norm": 2.003634214401245, + "learning_rate": 1.1858293393942389e-07, + "loss": 5.4012, + "step": 20196 + }, + { + "epoch": 1.0, + "grad_norm": 2.183382511138916, + "learning_rate": 1.0870102277780524e-07, + "loss": 5.4401, + "step": 20200 + }, + { + "epoch": 1.0, + "grad_norm": 2.161017656326294, + "learning_rate": 9.881911161618658e-08, + "loss": 5.5125, + "step": 20204 + }, + { + "epoch": 1.0, + "grad_norm": 2.1500487327575684, + "learning_rate": 8.893720045456792e-08, + "loss": 5.5267, + "step": 20208 + }, + { + "epoch": 1.0, + "grad_norm": 2.1386756896972656, + "learning_rate": 7.905528929294926e-08, + "loss": 5.4015, + "step": 20212 + }, + { + "epoch": 1.0, + "grad_norm": 2.1625399589538574, + "learning_rate": 6.917337813133061e-08, + "loss": 5.467, + "step": 20216 + }, + { + "epoch": 1.0, + "grad_norm": 2.1189723014831543, + "learning_rate": 5.9291466969711946e-08, + "loss": 5.4626, + "step": 20220 + }, + { + "epoch": 1.0, + "grad_norm": 2.1535654067993164, + "learning_rate": 4.940955580809329e-08, + "loss": 5.507, + "step": 20224 + }, + { + "epoch": 1.0, + "grad_norm": 1.8377280235290527, + "learning_rate": 3.952764464647463e-08, + "loss": 5.4731, + "step": 20228 + }, + { + "epoch": 1.0, + "grad_norm": 1.746285080909729, + "learning_rate": 2.9645733484855973e-08, + "loss": 5.3422, + "step": 20232 + }, + { + "epoch": 1.0, + "grad_norm": 2.335073947906494, + "learning_rate": 1.9763822323237315e-08, + "loss": 5.4435, + "step": 20236 + }, + { + "epoch": 1.0, + "step": 20239, + "total_flos": 8.525167704460493e+16, + "train_loss": 5.640737008268616, + "train_runtime": 3394.3955, + "train_samples_per_second": 95.396, + "train_steps_per_second": 5.962 + } + ], + "logging_steps": 4, + "max_steps": 20239, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2024, + "total_flos": 8.525167704460493e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}