{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9002671415850401, "eval_steps": 500, "global_step": 9099, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.023737668991089, "learning_rate": 4.998021173444148e-05, "loss": 10.2295, "step": 4 }, { "epoch": 0.0, "grad_norm": 2.7276954650878906, "learning_rate": 4.9960423468882955e-05, "loss": 9.2429, "step": 8 }, { "epoch": 0.0, "grad_norm": 2.202863931655884, "learning_rate": 4.9940635203324435e-05, "loss": 8.8335, "step": 12 }, { "epoch": 0.0, "grad_norm": 2.1424026489257812, "learning_rate": 4.992084693776591e-05, "loss": 8.5825, "step": 16 }, { "epoch": 0.0, "grad_norm": 2.1057238578796387, "learning_rate": 4.990105867220739e-05, "loss": 8.2023, "step": 20 }, { "epoch": 0.0, "grad_norm": 2.0647401809692383, "learning_rate": 4.988127040664886e-05, "loss": 7.7905, "step": 24 }, { "epoch": 0.0, "grad_norm": 1.845037817955017, "learning_rate": 4.986148214109034e-05, "loss": 7.5639, "step": 28 }, { "epoch": 0.0, "grad_norm": 1.766366958618164, "learning_rate": 4.984169387553181e-05, "loss": 7.3338, "step": 32 }, { "epoch": 0.0, "grad_norm": 1.7267919778823853, "learning_rate": 4.982190560997329e-05, "loss": 7.3396, "step": 36 }, { "epoch": 0.0, "grad_norm": 1.3584438562393188, "learning_rate": 4.9802117344414766e-05, "loss": 7.2269, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.5067508220672607, "learning_rate": 4.978232907885624e-05, "loss": 6.8696, "step": 44 }, { "epoch": 0.0, "grad_norm": 1.3194962739944458, "learning_rate": 4.976254081329771e-05, "loss": 6.7392, "step": 48 }, { "epoch": 0.01, "grad_norm": 1.4466848373413086, "learning_rate": 4.974275254773919e-05, "loss": 6.5025, "step": 52 }, { "epoch": 0.01, "grad_norm": 1.3488733768463135, "learning_rate": 4.9722964282180664e-05, "loss": 6.4044, "step": 56 }, { "epoch": 0.01, "grad_norm": 1.301389217376709, "learning_rate": 4.9703176016622144e-05, "loss": 6.2985, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.354225754737854, "learning_rate": 4.9683387751063624e-05, "loss": 6.1545, "step": 64 }, { "epoch": 0.01, "grad_norm": 1.1548670530319214, "learning_rate": 4.9663599485505096e-05, "loss": 6.1522, "step": 68 }, { "epoch": 0.01, "grad_norm": 1.2819015979766846, "learning_rate": 4.9643811219946576e-05, "loss": 5.9876, "step": 72 }, { "epoch": 0.01, "grad_norm": 0.9923768639564514, "learning_rate": 4.962402295438805e-05, "loss": 5.8414, "step": 76 }, { "epoch": 0.01, "grad_norm": 1.005074381828308, "learning_rate": 4.960423468882953e-05, "loss": 5.7825, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.9730919599533081, "learning_rate": 4.9584446423271e-05, "loss": 5.675, "step": 84 }, { "epoch": 0.01, "grad_norm": 1.032512903213501, "learning_rate": 4.956465815771248e-05, "loss": 5.644, "step": 88 }, { "epoch": 0.01, "grad_norm": 1.2836546897888184, "learning_rate": 4.9544869892153954e-05, "loss": 5.7789, "step": 92 }, { "epoch": 0.01, "grad_norm": 0.9425686001777649, "learning_rate": 4.9525081626595434e-05, "loss": 5.429, "step": 96 }, { "epoch": 0.01, "grad_norm": 0.8604844212532043, "learning_rate": 4.950529336103691e-05, "loss": 5.4202, "step": 100 }, { "epoch": 0.01, "grad_norm": 0.8457801342010498, "learning_rate": 4.948550509547839e-05, "loss": 5.2959, "step": 104 }, { "epoch": 0.01, "grad_norm": 0.9246242046356201, "learning_rate": 4.946571682991986e-05, "loss": 5.4305, "step": 108 }, { "epoch": 0.01, "grad_norm": 0.9483153223991394, "learning_rate": 4.944592856436134e-05, "loss": 5.382, "step": 112 }, { "epoch": 0.01, "grad_norm": 0.8151757717132568, "learning_rate": 4.942614029880281e-05, "loss": 5.2487, "step": 116 }, { "epoch": 0.01, "grad_norm": 0.8092381954193115, "learning_rate": 4.940635203324429e-05, "loss": 5.4251, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.0206758975982666, "learning_rate": 4.9386563767685765e-05, "loss": 5.245, "step": 124 }, { "epoch": 0.01, "grad_norm": 0.9263086318969727, "learning_rate": 4.936677550212724e-05, "loss": 5.3408, "step": 128 }, { "epoch": 0.01, "grad_norm": 0.8204173445701599, "learning_rate": 4.934698723656872e-05, "loss": 5.3163, "step": 132 }, { "epoch": 0.01, "grad_norm": 0.817661702632904, "learning_rate": 4.932719897101019e-05, "loss": 5.2058, "step": 136 }, { "epoch": 0.01, "grad_norm": 0.8175525069236755, "learning_rate": 4.930741070545167e-05, "loss": 5.0683, "step": 140 }, { "epoch": 0.01, "grad_norm": 0.8956682682037354, "learning_rate": 4.928762243989314e-05, "loss": 5.285, "step": 144 }, { "epoch": 0.01, "grad_norm": 1.1034317016601562, "learning_rate": 4.926783417433462e-05, "loss": 5.2186, "step": 148 }, { "epoch": 0.02, "grad_norm": 0.8372359275817871, "learning_rate": 4.9248045908776096e-05, "loss": 5.0079, "step": 152 }, { "epoch": 0.02, "grad_norm": 0.8574570417404175, "learning_rate": 4.9228257643217576e-05, "loss": 5.1058, "step": 156 }, { "epoch": 0.02, "grad_norm": 1.0270808935165405, "learning_rate": 4.920846937765905e-05, "loss": 5.3269, "step": 160 }, { "epoch": 0.02, "grad_norm": 0.8500723242759705, "learning_rate": 4.918868111210053e-05, "loss": 5.1533, "step": 164 }, { "epoch": 0.02, "grad_norm": 0.8077535033226013, "learning_rate": 4.9168892846542e-05, "loss": 4.9847, "step": 168 }, { "epoch": 0.02, "grad_norm": 0.9095591902732849, "learning_rate": 4.914910458098348e-05, "loss": 4.8715, "step": 172 }, { "epoch": 0.02, "grad_norm": 1.0022218227386475, "learning_rate": 4.9129316315424954e-05, "loss": 4.9394, "step": 176 }, { "epoch": 0.02, "grad_norm": 1.0106725692749023, "learning_rate": 4.9109528049866433e-05, "loss": 5.1306, "step": 180 }, { "epoch": 0.02, "grad_norm": 0.980566143989563, "learning_rate": 4.9089739784307906e-05, "loss": 5.0595, "step": 184 }, { "epoch": 0.02, "grad_norm": 1.0247553586959839, "learning_rate": 4.9069951518749386e-05, "loss": 5.0103, "step": 188 }, { "epoch": 0.02, "grad_norm": 0.925584077835083, "learning_rate": 4.905016325319086e-05, "loss": 4.7923, "step": 192 }, { "epoch": 0.02, "grad_norm": 0.9778655171394348, "learning_rate": 4.903037498763234e-05, "loss": 4.8465, "step": 196 }, { "epoch": 0.02, "grad_norm": 0.8761100172996521, "learning_rate": 4.901058672207381e-05, "loss": 4.856, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.8272585272789001, "learning_rate": 4.899079845651529e-05, "loss": 4.8365, "step": 204 }, { "epoch": 0.02, "grad_norm": 0.8929518461227417, "learning_rate": 4.8971010190956764e-05, "loss": 4.7708, "step": 208 }, { "epoch": 0.02, "grad_norm": 1.0088162422180176, "learning_rate": 4.895122192539824e-05, "loss": 4.8667, "step": 212 }, { "epoch": 0.02, "grad_norm": 1.0307660102844238, "learning_rate": 4.893143365983972e-05, "loss": 4.9297, "step": 216 }, { "epoch": 0.02, "grad_norm": 0.8324564099311829, "learning_rate": 4.891164539428119e-05, "loss": 4.6895, "step": 220 }, { "epoch": 0.02, "grad_norm": 0.9149929285049438, "learning_rate": 4.889185712872267e-05, "loss": 5.0596, "step": 224 }, { "epoch": 0.02, "grad_norm": 0.9188018441200256, "learning_rate": 4.887206886316414e-05, "loss": 4.7217, "step": 228 }, { "epoch": 0.02, "grad_norm": 0.8771291971206665, "learning_rate": 4.885228059760562e-05, "loss": 4.5893, "step": 232 }, { "epoch": 0.02, "grad_norm": 0.9374723434448242, "learning_rate": 4.8832492332047095e-05, "loss": 4.7205, "step": 236 }, { "epoch": 0.02, "grad_norm": 0.9590020179748535, "learning_rate": 4.8812704066488575e-05, "loss": 4.6909, "step": 240 }, { "epoch": 0.02, "grad_norm": 0.8914527893066406, "learning_rate": 4.879291580093005e-05, "loss": 4.7799, "step": 244 }, { "epoch": 0.02, "grad_norm": 0.96151202917099, "learning_rate": 4.877312753537153e-05, "loss": 4.6818, "step": 248 }, { "epoch": 0.02, "grad_norm": 0.9983839988708496, "learning_rate": 4.8753339269813e-05, "loss": 4.9377, "step": 252 }, { "epoch": 0.03, "grad_norm": 1.3818860054016113, "learning_rate": 4.873355100425448e-05, "loss": 4.8015, "step": 256 }, { "epoch": 0.03, "grad_norm": 0.8486915826797485, "learning_rate": 4.871376273869595e-05, "loss": 4.6542, "step": 260 }, { "epoch": 0.03, "grad_norm": 0.9783093333244324, "learning_rate": 4.869397447313743e-05, "loss": 4.635, "step": 264 }, { "epoch": 0.03, "grad_norm": 0.9911454916000366, "learning_rate": 4.8674186207578906e-05, "loss": 4.7617, "step": 268 }, { "epoch": 0.03, "grad_norm": 0.8163164258003235, "learning_rate": 4.8654397942020385e-05, "loss": 4.6394, "step": 272 }, { "epoch": 0.03, "grad_norm": 0.9522330164909363, "learning_rate": 4.8634609676461865e-05, "loss": 4.661, "step": 276 }, { "epoch": 0.03, "grad_norm": 1.0596697330474854, "learning_rate": 4.861482141090334e-05, "loss": 4.5832, "step": 280 }, { "epoch": 0.03, "grad_norm": 0.9185882806777954, "learning_rate": 4.859503314534482e-05, "loss": 4.5482, "step": 284 }, { "epoch": 0.03, "grad_norm": 0.980002760887146, "learning_rate": 4.857524487978629e-05, "loss": 4.6268, "step": 288 }, { "epoch": 0.03, "grad_norm": 0.7909073829650879, "learning_rate": 4.855545661422777e-05, "loss": 4.735, "step": 292 }, { "epoch": 0.03, "grad_norm": 0.9752382636070251, "learning_rate": 4.853566834866924e-05, "loss": 4.6384, "step": 296 }, { "epoch": 0.03, "grad_norm": 1.0073622465133667, "learning_rate": 4.8515880083110716e-05, "loss": 4.5566, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.1823135614395142, "learning_rate": 4.849609181755219e-05, "loss": 4.6323, "step": 304 }, { "epoch": 0.03, "grad_norm": 0.8075012564659119, "learning_rate": 4.847630355199367e-05, "loss": 4.4955, "step": 308 }, { "epoch": 0.03, "grad_norm": 0.9888399839401245, "learning_rate": 4.845651528643514e-05, "loss": 4.5405, "step": 312 }, { "epoch": 0.03, "grad_norm": 0.9254989624023438, "learning_rate": 4.843672702087662e-05, "loss": 4.4871, "step": 316 }, { "epoch": 0.03, "grad_norm": 1.068520426750183, "learning_rate": 4.8416938755318095e-05, "loss": 4.7617, "step": 320 }, { "epoch": 0.03, "grad_norm": 0.8666436672210693, "learning_rate": 4.8397150489759574e-05, "loss": 4.6256, "step": 324 }, { "epoch": 0.03, "grad_norm": 0.9162269830703735, "learning_rate": 4.837736222420105e-05, "loss": 4.3914, "step": 328 }, { "epoch": 0.03, "grad_norm": 0.8688908219337463, "learning_rate": 4.835757395864253e-05, "loss": 4.469, "step": 332 }, { "epoch": 0.03, "grad_norm": 0.9131215214729309, "learning_rate": 4.8337785693084007e-05, "loss": 4.5437, "step": 336 }, { "epoch": 0.03, "grad_norm": 0.847606360912323, "learning_rate": 4.831799742752548e-05, "loss": 4.6195, "step": 340 }, { "epoch": 0.03, "grad_norm": 0.9658231735229492, "learning_rate": 4.829820916196696e-05, "loss": 4.5982, "step": 344 }, { "epoch": 0.03, "grad_norm": 0.8523685336112976, "learning_rate": 4.827842089640843e-05, "loss": 4.42, "step": 348 }, { "epoch": 0.03, "grad_norm": 0.9430307149887085, "learning_rate": 4.825863263084991e-05, "loss": 4.5254, "step": 352 }, { "epoch": 0.04, "grad_norm": 1.079181432723999, "learning_rate": 4.8238844365291385e-05, "loss": 4.76, "step": 356 }, { "epoch": 0.04, "grad_norm": 0.9053409099578857, "learning_rate": 4.8219056099732865e-05, "loss": 4.3891, "step": 360 }, { "epoch": 0.04, "grad_norm": 1.0313445329666138, "learning_rate": 4.819926783417434e-05, "loss": 4.6794, "step": 364 }, { "epoch": 0.04, "grad_norm": 0.9754608273506165, "learning_rate": 4.817947956861582e-05, "loss": 4.3483, "step": 368 }, { "epoch": 0.04, "grad_norm": 0.8928918838500977, "learning_rate": 4.815969130305729e-05, "loss": 4.3506, "step": 372 }, { "epoch": 0.04, "grad_norm": 0.9732823967933655, "learning_rate": 4.813990303749877e-05, "loss": 4.4418, "step": 376 }, { "epoch": 0.04, "grad_norm": 0.9762834906578064, "learning_rate": 4.8120114771940236e-05, "loss": 4.4823, "step": 380 }, { "epoch": 0.04, "grad_norm": 1.0655723810195923, "learning_rate": 4.8100326506381716e-05, "loss": 4.3974, "step": 384 }, { "epoch": 0.04, "grad_norm": 0.9439470767974854, "learning_rate": 4.808053824082319e-05, "loss": 4.5157, "step": 388 }, { "epoch": 0.04, "grad_norm": 0.9962547421455383, "learning_rate": 4.806074997526467e-05, "loss": 4.3487, "step": 392 }, { "epoch": 0.04, "grad_norm": 1.0140503644943237, "learning_rate": 4.804096170970615e-05, "loss": 4.2984, "step": 396 }, { "epoch": 0.04, "grad_norm": 0.9673125147819519, "learning_rate": 4.802117344414762e-05, "loss": 4.361, "step": 400 }, { "epoch": 0.04, "grad_norm": 1.0120090246200562, "learning_rate": 4.80013851785891e-05, "loss": 4.4019, "step": 404 }, { "epoch": 0.04, "grad_norm": 0.902720034122467, "learning_rate": 4.7981596913030574e-05, "loss": 4.392, "step": 408 }, { "epoch": 0.04, "grad_norm": 0.8929727077484131, "learning_rate": 4.796180864747205e-05, "loss": 4.7008, "step": 412 }, { "epoch": 0.04, "grad_norm": 1.045725703239441, "learning_rate": 4.7942020381913526e-05, "loss": 4.4655, "step": 416 }, { "epoch": 0.04, "grad_norm": 0.9211822748184204, "learning_rate": 4.7922232116355006e-05, "loss": 4.464, "step": 420 }, { "epoch": 0.04, "grad_norm": 0.9875116348266602, "learning_rate": 4.790244385079648e-05, "loss": 4.3493, "step": 424 }, { "epoch": 0.04, "grad_norm": 0.9416815042495728, "learning_rate": 4.788265558523796e-05, "loss": 4.2362, "step": 428 }, { "epoch": 0.04, "grad_norm": 0.9395657777786255, "learning_rate": 4.786286731967943e-05, "loss": 4.3159, "step": 432 }, { "epoch": 0.04, "grad_norm": 0.8677350878715515, "learning_rate": 4.784307905412091e-05, "loss": 4.3343, "step": 436 }, { "epoch": 0.04, "grad_norm": 0.9983429908752441, "learning_rate": 4.7823290788562384e-05, "loss": 4.3666, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.076252818107605, "learning_rate": 4.7803502523003864e-05, "loss": 4.4674, "step": 444 }, { "epoch": 0.04, "grad_norm": 0.8906400203704834, "learning_rate": 4.778371425744534e-05, "loss": 4.3918, "step": 448 }, { "epoch": 0.04, "grad_norm": 0.9424994587898254, "learning_rate": 4.7763925991886817e-05, "loss": 4.3358, "step": 452 }, { "epoch": 0.05, "grad_norm": 0.9982520341873169, "learning_rate": 4.774413772632829e-05, "loss": 4.4196, "step": 456 }, { "epoch": 0.05, "grad_norm": 0.9584160447120667, "learning_rate": 4.772434946076977e-05, "loss": 4.2583, "step": 460 }, { "epoch": 0.05, "grad_norm": 0.9916421175003052, "learning_rate": 4.770456119521124e-05, "loss": 4.4004, "step": 464 }, { "epoch": 0.05, "grad_norm": 1.0897949934005737, "learning_rate": 4.7684772929652715e-05, "loss": 4.3993, "step": 468 }, { "epoch": 0.05, "grad_norm": 1.3799035549163818, "learning_rate": 4.7664984664094195e-05, "loss": 4.3915, "step": 472 }, { "epoch": 0.05, "grad_norm": 1.0133105516433716, "learning_rate": 4.764519639853567e-05, "loss": 4.0608, "step": 476 }, { "epoch": 0.05, "grad_norm": 1.1998016834259033, "learning_rate": 4.762540813297715e-05, "loss": 4.4948, "step": 480 }, { "epoch": 0.05, "grad_norm": 1.1928074359893799, "learning_rate": 4.760561986741862e-05, "loss": 4.1837, "step": 484 }, { "epoch": 0.05, "grad_norm": 1.0764780044555664, "learning_rate": 4.75858316018601e-05, "loss": 4.3355, "step": 488 }, { "epoch": 0.05, "grad_norm": 1.0785058736801147, "learning_rate": 4.756604333630157e-05, "loss": 4.3751, "step": 492 }, { "epoch": 0.05, "grad_norm": 1.0299701690673828, "learning_rate": 4.754625507074305e-05, "loss": 4.2652, "step": 496 }, { "epoch": 0.05, "grad_norm": 0.9715741276741028, "learning_rate": 4.7526466805184526e-05, "loss": 4.2677, "step": 500 }, { "epoch": 0.05, "grad_norm": 1.1598002910614014, "learning_rate": 4.7506678539626005e-05, "loss": 4.2647, "step": 504 }, { "epoch": 0.05, "grad_norm": 0.9881719946861267, "learning_rate": 4.748689027406748e-05, "loss": 4.2163, "step": 508 }, { "epoch": 0.05, "grad_norm": 1.0998167991638184, "learning_rate": 4.746710200850896e-05, "loss": 4.1632, "step": 512 }, { "epoch": 0.05, "grad_norm": 1.055464267730713, "learning_rate": 4.744731374295043e-05, "loss": 4.2287, "step": 516 }, { "epoch": 0.05, "grad_norm": 1.1115161180496216, "learning_rate": 4.742752547739191e-05, "loss": 4.3554, "step": 520 }, { "epoch": 0.05, "grad_norm": 0.8956658840179443, "learning_rate": 4.7407737211833384e-05, "loss": 3.9656, "step": 524 }, { "epoch": 0.05, "grad_norm": 1.2546039819717407, "learning_rate": 4.738794894627486e-05, "loss": 4.2604, "step": 528 }, { "epoch": 0.05, "grad_norm": 1.1724745035171509, "learning_rate": 4.7368160680716336e-05, "loss": 4.291, "step": 532 }, { "epoch": 0.05, "grad_norm": 1.068641185760498, "learning_rate": 4.7348372415157816e-05, "loss": 4.0505, "step": 536 }, { "epoch": 0.05, "grad_norm": 1.0498847961425781, "learning_rate": 4.732858414959929e-05, "loss": 4.2438, "step": 540 }, { "epoch": 0.05, "grad_norm": 1.0670949220657349, "learning_rate": 4.730879588404077e-05, "loss": 4.1264, "step": 544 }, { "epoch": 0.05, "grad_norm": 1.076369285583496, "learning_rate": 4.728900761848225e-05, "loss": 4.2795, "step": 548 }, { "epoch": 0.05, "grad_norm": 1.1954712867736816, "learning_rate": 4.7269219352923714e-05, "loss": 4.2029, "step": 552 }, { "epoch": 0.06, "grad_norm": 0.9923123717308044, "learning_rate": 4.7249431087365194e-05, "loss": 4.2668, "step": 556 }, { "epoch": 0.06, "grad_norm": 1.1335583925247192, "learning_rate": 4.722964282180667e-05, "loss": 3.9782, "step": 560 }, { "epoch": 0.06, "grad_norm": 1.0587942600250244, "learning_rate": 4.720985455624815e-05, "loss": 4.0898, "step": 564 }, { "epoch": 0.06, "grad_norm": 0.9978137016296387, "learning_rate": 4.719006629068962e-05, "loss": 3.9701, "step": 568 }, { "epoch": 0.06, "grad_norm": 1.0784775018692017, "learning_rate": 4.71702780251311e-05, "loss": 4.0768, "step": 572 }, { "epoch": 0.06, "grad_norm": 1.1316741704940796, "learning_rate": 4.715048975957257e-05, "loss": 4.1185, "step": 576 }, { "epoch": 0.06, "grad_norm": 1.1574583053588867, "learning_rate": 4.713070149401405e-05, "loss": 4.1627, "step": 580 }, { "epoch": 0.06, "grad_norm": 1.4153751134872437, "learning_rate": 4.7110913228455525e-05, "loss": 4.4233, "step": 584 }, { "epoch": 0.06, "grad_norm": 1.0705913305282593, "learning_rate": 4.7091124962897005e-05, "loss": 4.1492, "step": 588 }, { "epoch": 0.06, "grad_norm": 1.3845056295394897, "learning_rate": 4.707133669733848e-05, "loss": 4.1557, "step": 592 }, { "epoch": 0.06, "grad_norm": 0.989825427532196, "learning_rate": 4.705154843177996e-05, "loss": 4.1221, "step": 596 }, { "epoch": 0.06, "grad_norm": 1.1429495811462402, "learning_rate": 4.703176016622143e-05, "loss": 4.1611, "step": 600 }, { "epoch": 0.06, "grad_norm": 1.1858593225479126, "learning_rate": 4.701197190066291e-05, "loss": 4.121, "step": 604 }, { "epoch": 0.06, "grad_norm": 0.9340394735336304, "learning_rate": 4.699218363510439e-05, "loss": 4.0173, "step": 608 }, { "epoch": 0.06, "grad_norm": 1.0611008405685425, "learning_rate": 4.697239536954586e-05, "loss": 4.0978, "step": 612 }, { "epoch": 0.06, "grad_norm": 1.1144658327102661, "learning_rate": 4.695260710398734e-05, "loss": 3.9948, "step": 616 }, { "epoch": 0.06, "grad_norm": 1.0355262756347656, "learning_rate": 4.6932818838428815e-05, "loss": 3.9703, "step": 620 }, { "epoch": 0.06, "grad_norm": 1.0302252769470215, "learning_rate": 4.6913030572870295e-05, "loss": 4.055, "step": 624 }, { "epoch": 0.06, "grad_norm": 0.9846487641334534, "learning_rate": 4.689324230731177e-05, "loss": 4.0066, "step": 628 }, { "epoch": 0.06, "grad_norm": 1.2877755165100098, "learning_rate": 4.687345404175325e-05, "loss": 4.1539, "step": 632 }, { "epoch": 0.06, "grad_norm": 1.046048641204834, "learning_rate": 4.6853665776194714e-05, "loss": 3.8136, "step": 636 }, { "epoch": 0.06, "grad_norm": 1.0364129543304443, "learning_rate": 4.6833877510636193e-05, "loss": 4.1889, "step": 640 }, { "epoch": 0.06, "grad_norm": 1.102586030960083, "learning_rate": 4.6814089245077666e-05, "loss": 3.9993, "step": 644 }, { "epoch": 0.06, "grad_norm": 1.0004945993423462, "learning_rate": 4.6794300979519146e-05, "loss": 3.9929, "step": 648 }, { "epoch": 0.06, "grad_norm": 1.0600558519363403, "learning_rate": 4.677451271396062e-05, "loss": 4.0179, "step": 652 }, { "epoch": 0.06, "grad_norm": 1.0851588249206543, "learning_rate": 4.67547244484021e-05, "loss": 3.9701, "step": 656 }, { "epoch": 0.07, "grad_norm": 1.2407004833221436, "learning_rate": 4.673493618284357e-05, "loss": 4.1936, "step": 660 }, { "epoch": 0.07, "grad_norm": 1.4276139736175537, "learning_rate": 4.671514791728505e-05, "loss": 4.0371, "step": 664 }, { "epoch": 0.07, "grad_norm": 1.1554222106933594, "learning_rate": 4.669535965172653e-05, "loss": 4.1037, "step": 668 }, { "epoch": 0.07, "grad_norm": 1.062843918800354, "learning_rate": 4.6675571386168004e-05, "loss": 3.8831, "step": 672 }, { "epoch": 0.07, "grad_norm": 1.2357606887817383, "learning_rate": 4.6655783120609484e-05, "loss": 4.0885, "step": 676 }, { "epoch": 0.07, "grad_norm": 1.1083883047103882, "learning_rate": 4.663599485505096e-05, "loss": 3.8199, "step": 680 }, { "epoch": 0.07, "grad_norm": 1.0669097900390625, "learning_rate": 4.6616206589492436e-05, "loss": 4.1267, "step": 684 }, { "epoch": 0.07, "grad_norm": 1.0153981447219849, "learning_rate": 4.659641832393391e-05, "loss": 3.833, "step": 688 }, { "epoch": 0.07, "grad_norm": 1.2568185329437256, "learning_rate": 4.657663005837539e-05, "loss": 4.1119, "step": 692 }, { "epoch": 0.07, "grad_norm": 1.2034963369369507, "learning_rate": 4.655684179281686e-05, "loss": 3.7792, "step": 696 }, { "epoch": 0.07, "grad_norm": 1.192213773727417, "learning_rate": 4.653705352725834e-05, "loss": 4.0965, "step": 700 }, { "epoch": 0.07, "grad_norm": 1.1414371728897095, "learning_rate": 4.6517265261699815e-05, "loss": 3.8453, "step": 704 }, { "epoch": 0.07, "grad_norm": 1.2463369369506836, "learning_rate": 4.6497476996141294e-05, "loss": 3.9088, "step": 708 }, { "epoch": 0.07, "grad_norm": 1.062708854675293, "learning_rate": 4.647768873058277e-05, "loss": 3.9295, "step": 712 }, { "epoch": 0.07, "grad_norm": 1.05110764503479, "learning_rate": 4.645790046502425e-05, "loss": 3.9937, "step": 716 }, { "epoch": 0.07, "grad_norm": 1.0769037008285522, "learning_rate": 4.643811219946571e-05, "loss": 3.9416, "step": 720 }, { "epoch": 0.07, "grad_norm": 1.1095948219299316, "learning_rate": 4.641832393390719e-05, "loss": 3.7524, "step": 724 }, { "epoch": 0.07, "grad_norm": 1.1786772012710571, "learning_rate": 4.639853566834867e-05, "loss": 3.7961, "step": 728 }, { "epoch": 0.07, "grad_norm": 1.1711845397949219, "learning_rate": 4.6378747402790145e-05, "loss": 3.9437, "step": 732 }, { "epoch": 0.07, "grad_norm": 1.0217204093933105, "learning_rate": 4.6358959137231625e-05, "loss": 3.8099, "step": 736 }, { "epoch": 0.07, "grad_norm": 1.1621912717819214, "learning_rate": 4.63391708716731e-05, "loss": 3.9684, "step": 740 }, { "epoch": 0.07, "grad_norm": 1.2898588180541992, "learning_rate": 4.631938260611458e-05, "loss": 3.8558, "step": 744 }, { "epoch": 0.07, "grad_norm": 1.2988104820251465, "learning_rate": 4.629959434055605e-05, "loss": 3.8503, "step": 748 }, { "epoch": 0.07, "grad_norm": 1.1195327043533325, "learning_rate": 4.627980607499753e-05, "loss": 3.8956, "step": 752 }, { "epoch": 0.07, "grad_norm": 1.2289355993270874, "learning_rate": 4.6260017809439003e-05, "loss": 3.6909, "step": 756 }, { "epoch": 0.08, "grad_norm": 1.2389601469039917, "learning_rate": 4.624022954388048e-05, "loss": 3.9477, "step": 760 }, { "epoch": 0.08, "grad_norm": 1.2713934183120728, "learning_rate": 4.6220441278321956e-05, "loss": 3.9466, "step": 764 }, { "epoch": 0.08, "grad_norm": 1.19290292263031, "learning_rate": 4.6200653012763436e-05, "loss": 3.7048, "step": 768 }, { "epoch": 0.08, "grad_norm": 1.107513666152954, "learning_rate": 4.618086474720491e-05, "loss": 3.8013, "step": 772 }, { "epoch": 0.08, "grad_norm": 0.9974806904792786, "learning_rate": 4.616107648164639e-05, "loss": 3.7578, "step": 776 }, { "epoch": 0.08, "grad_norm": 1.180731177330017, "learning_rate": 4.614128821608786e-05, "loss": 3.9304, "step": 780 }, { "epoch": 0.08, "grad_norm": 1.5762641429901123, "learning_rate": 4.612149995052934e-05, "loss": 3.9271, "step": 784 }, { "epoch": 0.08, "grad_norm": 1.1390140056610107, "learning_rate": 4.6101711684970814e-05, "loss": 4.0947, "step": 788 }, { "epoch": 0.08, "grad_norm": 1.2429100275039673, "learning_rate": 4.6081923419412294e-05, "loss": 3.9939, "step": 792 }, { "epoch": 0.08, "grad_norm": 1.1867088079452515, "learning_rate": 4.606213515385377e-05, "loss": 3.8247, "step": 796 }, { "epoch": 0.08, "grad_norm": 1.0802291631698608, "learning_rate": 4.6042346888295246e-05, "loss": 3.6472, "step": 800 }, { "epoch": 0.08, "grad_norm": 1.3064371347427368, "learning_rate": 4.602255862273672e-05, "loss": 3.785, "step": 804 }, { "epoch": 0.08, "grad_norm": 1.3858779668807983, "learning_rate": 4.600277035717819e-05, "loss": 4.0065, "step": 808 }, { "epoch": 0.08, "grad_norm": 1.09247887134552, "learning_rate": 4.598298209161967e-05, "loss": 3.8801, "step": 812 }, { "epoch": 0.08, "grad_norm": 1.1438707113265991, "learning_rate": 4.5963193826061145e-05, "loss": 3.7864, "step": 816 }, { "epoch": 0.08, "grad_norm": 1.107773780822754, "learning_rate": 4.5943405560502625e-05, "loss": 3.6436, "step": 820 }, { "epoch": 0.08, "grad_norm": 1.0622732639312744, "learning_rate": 4.59236172949441e-05, "loss": 3.8597, "step": 824 }, { "epoch": 0.08, "grad_norm": 1.0812435150146484, "learning_rate": 4.590382902938558e-05, "loss": 3.8085, "step": 828 }, { "epoch": 0.08, "grad_norm": 1.1576310396194458, "learning_rate": 4.588404076382705e-05, "loss": 3.6459, "step": 832 }, { "epoch": 0.08, "grad_norm": 1.1088974475860596, "learning_rate": 4.586425249826853e-05, "loss": 3.8514, "step": 836 }, { "epoch": 0.08, "grad_norm": 1.1565968990325928, "learning_rate": 4.584446423271e-05, "loss": 3.7429, "step": 840 }, { "epoch": 0.08, "grad_norm": 1.226541519165039, "learning_rate": 4.582467596715148e-05, "loss": 3.9345, "step": 844 }, { "epoch": 0.08, "grad_norm": 1.3901216983795166, "learning_rate": 4.5804887701592955e-05, "loss": 4.0038, "step": 848 }, { "epoch": 0.08, "grad_norm": 1.0757673978805542, "learning_rate": 4.5785099436034435e-05, "loss": 3.6689, "step": 852 }, { "epoch": 0.08, "grad_norm": 1.0806201696395874, "learning_rate": 4.576531117047591e-05, "loss": 3.8034, "step": 856 }, { "epoch": 0.09, "grad_norm": 1.0815987586975098, "learning_rate": 4.574552290491739e-05, "loss": 3.5688, "step": 860 }, { "epoch": 0.09, "grad_norm": 1.2539079189300537, "learning_rate": 4.572573463935886e-05, "loss": 3.7314, "step": 864 }, { "epoch": 0.09, "grad_norm": 1.2489246129989624, "learning_rate": 4.570594637380034e-05, "loss": 3.7358, "step": 868 }, { "epoch": 0.09, "grad_norm": 1.188781976699829, "learning_rate": 4.568615810824181e-05, "loss": 3.8216, "step": 872 }, { "epoch": 0.09, "grad_norm": 1.1977195739746094, "learning_rate": 4.566636984268329e-05, "loss": 3.7418, "step": 876 }, { "epoch": 0.09, "grad_norm": 1.3144996166229248, "learning_rate": 4.564658157712477e-05, "loss": 3.8569, "step": 880 }, { "epoch": 0.09, "grad_norm": 1.194718360900879, "learning_rate": 4.5626793311566246e-05, "loss": 3.9339, "step": 884 }, { "epoch": 0.09, "grad_norm": 1.1180779933929443, "learning_rate": 4.560700504600772e-05, "loss": 3.7593, "step": 888 }, { "epoch": 0.09, "grad_norm": 1.2359132766723633, "learning_rate": 4.558721678044919e-05, "loss": 3.7554, "step": 892 }, { "epoch": 0.09, "grad_norm": 1.456741213798523, "learning_rate": 4.556742851489067e-05, "loss": 3.8741, "step": 896 }, { "epoch": 0.09, "grad_norm": 1.21878981590271, "learning_rate": 4.5547640249332144e-05, "loss": 3.7214, "step": 900 }, { "epoch": 0.09, "grad_norm": 1.2724573612213135, "learning_rate": 4.5527851983773624e-05, "loss": 3.7083, "step": 904 }, { "epoch": 0.09, "grad_norm": 1.3674492835998535, "learning_rate": 4.55080637182151e-05, "loss": 3.7782, "step": 908 }, { "epoch": 0.09, "grad_norm": 1.1962213516235352, "learning_rate": 4.5488275452656577e-05, "loss": 3.8053, "step": 912 }, { "epoch": 0.09, "grad_norm": 1.204624056816101, "learning_rate": 4.546848718709805e-05, "loss": 3.8807, "step": 916 }, { "epoch": 0.09, "grad_norm": 1.162916660308838, "learning_rate": 4.544869892153953e-05, "loss": 3.62, "step": 920 }, { "epoch": 0.09, "grad_norm": 1.1495506763458252, "learning_rate": 4.5428910655981e-05, "loss": 3.6001, "step": 924 }, { "epoch": 0.09, "grad_norm": 1.1593899726867676, "learning_rate": 4.540912239042248e-05, "loss": 4.0318, "step": 928 }, { "epoch": 0.09, "grad_norm": 1.1468162536621094, "learning_rate": 4.5389334124863955e-05, "loss": 3.6793, "step": 932 }, { "epoch": 0.09, "grad_norm": 1.0815014839172363, "learning_rate": 4.5369545859305434e-05, "loss": 3.8516, "step": 936 }, { "epoch": 0.09, "grad_norm": 1.1018073558807373, "learning_rate": 4.5349757593746914e-05, "loss": 3.7732, "step": 940 }, { "epoch": 0.09, "grad_norm": 1.1128356456756592, "learning_rate": 4.532996932818839e-05, "loss": 3.5784, "step": 944 }, { "epoch": 0.09, "grad_norm": 1.449112057685852, "learning_rate": 4.531018106262987e-05, "loss": 3.8318, "step": 948 }, { "epoch": 0.09, "grad_norm": 1.1925861835479736, "learning_rate": 4.529039279707134e-05, "loss": 3.7108, "step": 952 }, { "epoch": 0.09, "grad_norm": 1.1462700366973877, "learning_rate": 4.527060453151282e-05, "loss": 3.6801, "step": 956 }, { "epoch": 0.09, "grad_norm": 1.218001127243042, "learning_rate": 4.525081626595429e-05, "loss": 3.7581, "step": 960 }, { "epoch": 0.1, "grad_norm": 1.09965980052948, "learning_rate": 4.523102800039577e-05, "loss": 3.661, "step": 964 }, { "epoch": 0.1, "grad_norm": 1.1705689430236816, "learning_rate": 4.5211239734837245e-05, "loss": 3.8608, "step": 968 }, { "epoch": 0.1, "grad_norm": 1.2540526390075684, "learning_rate": 4.519145146927872e-05, "loss": 3.5236, "step": 972 }, { "epoch": 0.1, "grad_norm": 1.1072841882705688, "learning_rate": 4.517166320372019e-05, "loss": 3.5754, "step": 976 }, { "epoch": 0.1, "grad_norm": 1.0101158618927002, "learning_rate": 4.515187493816167e-05, "loss": 3.7373, "step": 980 }, { "epoch": 0.1, "grad_norm": 1.3311535120010376, "learning_rate": 4.5132086672603144e-05, "loss": 3.7958, "step": 984 }, { "epoch": 0.1, "grad_norm": 1.1845935583114624, "learning_rate": 4.511229840704462e-05, "loss": 3.8895, "step": 988 }, { "epoch": 0.1, "grad_norm": 1.1750415563583374, "learning_rate": 4.5092510141486096e-05, "loss": 3.8246, "step": 992 }, { "epoch": 0.1, "grad_norm": 1.423757553100586, "learning_rate": 4.5072721875927576e-05, "loss": 3.5447, "step": 996 }, { "epoch": 0.1, "grad_norm": 1.1389232873916626, "learning_rate": 4.5052933610369056e-05, "loss": 3.5603, "step": 1000 }, { "epoch": 0.1, "grad_norm": 1.1800868511199951, "learning_rate": 4.503314534481053e-05, "loss": 3.9599, "step": 1004 }, { "epoch": 0.1, "grad_norm": 1.163454294204712, "learning_rate": 4.501335707925201e-05, "loss": 3.7299, "step": 1008 }, { "epoch": 0.1, "grad_norm": 1.1138209104537964, "learning_rate": 4.499356881369348e-05, "loss": 3.5834, "step": 1012 }, { "epoch": 0.1, "grad_norm": 1.2494182586669922, "learning_rate": 4.497378054813496e-05, "loss": 3.7193, "step": 1016 }, { "epoch": 0.1, "grad_norm": 1.1176248788833618, "learning_rate": 4.4953992282576434e-05, "loss": 3.5979, "step": 1020 }, { "epoch": 0.1, "grad_norm": 1.1829919815063477, "learning_rate": 4.4934204017017914e-05, "loss": 3.6594, "step": 1024 }, { "epoch": 0.1, "grad_norm": 1.1001139879226685, "learning_rate": 4.4914415751459386e-05, "loss": 3.65, "step": 1028 }, { "epoch": 0.1, "grad_norm": 1.2622652053833008, "learning_rate": 4.4894627485900866e-05, "loss": 3.6189, "step": 1032 }, { "epoch": 0.1, "grad_norm": 1.2330125570297241, "learning_rate": 4.487483922034234e-05, "loss": 3.7518, "step": 1036 }, { "epoch": 0.1, "grad_norm": 1.5254411697387695, "learning_rate": 4.485505095478382e-05, "loss": 3.6814, "step": 1040 }, { "epoch": 0.1, "grad_norm": 1.231075644493103, "learning_rate": 4.483526268922529e-05, "loss": 3.6836, "step": 1044 }, { "epoch": 0.1, "grad_norm": 1.404321551322937, "learning_rate": 4.481547442366677e-05, "loss": 3.6597, "step": 1048 }, { "epoch": 0.1, "grad_norm": 1.1372672319412231, "learning_rate": 4.4795686158108244e-05, "loss": 3.7171, "step": 1052 }, { "epoch": 0.1, "grad_norm": 1.3405723571777344, "learning_rate": 4.477589789254972e-05, "loss": 3.7671, "step": 1056 }, { "epoch": 0.1, "grad_norm": 1.246029019355774, "learning_rate": 4.47561096269912e-05, "loss": 4.1252, "step": 1060 }, { "epoch": 0.11, "grad_norm": 1.395970344543457, "learning_rate": 4.473632136143267e-05, "loss": 3.554, "step": 1064 }, { "epoch": 0.11, "grad_norm": 1.4145135879516602, "learning_rate": 4.471653309587415e-05, "loss": 3.7314, "step": 1068 }, { "epoch": 0.11, "grad_norm": 1.1709779500961304, "learning_rate": 4.469674483031562e-05, "loss": 3.4601, "step": 1072 }, { "epoch": 0.11, "grad_norm": 1.4412345886230469, "learning_rate": 4.46769565647571e-05, "loss": 3.4616, "step": 1076 }, { "epoch": 0.11, "grad_norm": 1.3158358335494995, "learning_rate": 4.4657168299198575e-05, "loss": 3.5625, "step": 1080 }, { "epoch": 0.11, "grad_norm": 1.1684952974319458, "learning_rate": 4.4637380033640055e-05, "loss": 3.4344, "step": 1084 }, { "epoch": 0.11, "grad_norm": 1.3412691354751587, "learning_rate": 4.461759176808153e-05, "loss": 3.6387, "step": 1088 }, { "epoch": 0.11, "grad_norm": 1.2844157218933105, "learning_rate": 4.459780350252301e-05, "loss": 3.5851, "step": 1092 }, { "epoch": 0.11, "grad_norm": 1.217627763748169, "learning_rate": 4.457801523696448e-05, "loss": 3.5215, "step": 1096 }, { "epoch": 0.11, "grad_norm": 1.3247570991516113, "learning_rate": 4.455822697140596e-05, "loss": 3.5599, "step": 1100 }, { "epoch": 0.11, "grad_norm": 1.402744174003601, "learning_rate": 4.453843870584743e-05, "loss": 3.6444, "step": 1104 }, { "epoch": 0.11, "grad_norm": 1.2150934934616089, "learning_rate": 4.451865044028891e-05, "loss": 3.5712, "step": 1108 }, { "epoch": 0.11, "grad_norm": 1.1183940172195435, "learning_rate": 4.4498862174730386e-05, "loss": 3.6094, "step": 1112 }, { "epoch": 0.11, "grad_norm": 1.2478612661361694, "learning_rate": 4.4479073909171866e-05, "loss": 3.3976, "step": 1116 }, { "epoch": 0.11, "grad_norm": 1.2105813026428223, "learning_rate": 4.445928564361334e-05, "loss": 3.5313, "step": 1120 }, { "epoch": 0.11, "grad_norm": 1.1932976245880127, "learning_rate": 4.443949737805482e-05, "loss": 3.4661, "step": 1124 }, { "epoch": 0.11, "grad_norm": 1.3227620124816895, "learning_rate": 4.441970911249629e-05, "loss": 3.5443, "step": 1128 }, { "epoch": 0.11, "grad_norm": 1.1562778949737549, "learning_rate": 4.439992084693777e-05, "loss": 3.5925, "step": 1132 }, { "epoch": 0.11, "grad_norm": 1.3220064640045166, "learning_rate": 4.4380132581379244e-05, "loss": 3.5389, "step": 1136 }, { "epoch": 0.11, "grad_norm": 1.2712664604187012, "learning_rate": 4.436034431582072e-05, "loss": 3.6925, "step": 1140 }, { "epoch": 0.11, "grad_norm": 1.2455075979232788, "learning_rate": 4.4340556050262196e-05, "loss": 3.5731, "step": 1144 }, { "epoch": 0.11, "grad_norm": 1.1540822982788086, "learning_rate": 4.432076778470367e-05, "loss": 3.4803, "step": 1148 }, { "epoch": 0.11, "grad_norm": 1.2069053649902344, "learning_rate": 4.430097951914515e-05, "loss": 3.5102, "step": 1152 }, { "epoch": 0.11, "grad_norm": 1.3267216682434082, "learning_rate": 4.428119125358662e-05, "loss": 3.7385, "step": 1156 }, { "epoch": 0.11, "grad_norm": 1.311906099319458, "learning_rate": 4.42614029880281e-05, "loss": 3.6113, "step": 1160 }, { "epoch": 0.12, "grad_norm": 1.1880320310592651, "learning_rate": 4.4241614722469575e-05, "loss": 3.4399, "step": 1164 }, { "epoch": 0.12, "grad_norm": 1.5865764617919922, "learning_rate": 4.4221826456911054e-05, "loss": 3.7722, "step": 1168 }, { "epoch": 0.12, "grad_norm": 1.119573950767517, "learning_rate": 4.420203819135253e-05, "loss": 3.5, "step": 1172 }, { "epoch": 0.12, "grad_norm": 1.2991716861724854, "learning_rate": 4.418224992579401e-05, "loss": 3.5364, "step": 1176 }, { "epoch": 0.12, "grad_norm": 1.1628044843673706, "learning_rate": 4.416246166023548e-05, "loss": 3.3195, "step": 1180 }, { "epoch": 0.12, "grad_norm": 1.0280978679656982, "learning_rate": 4.414267339467696e-05, "loss": 3.4377, "step": 1184 }, { "epoch": 0.12, "grad_norm": 1.0991793870925903, "learning_rate": 4.412288512911843e-05, "loss": 3.5634, "step": 1188 }, { "epoch": 0.12, "grad_norm": 1.0973697900772095, "learning_rate": 4.410309686355991e-05, "loss": 3.5387, "step": 1192 }, { "epoch": 0.12, "grad_norm": 1.5861742496490479, "learning_rate": 4.4083308598001385e-05, "loss": 3.5671, "step": 1196 }, { "epoch": 0.12, "grad_norm": 1.3537582159042358, "learning_rate": 4.4063520332442865e-05, "loss": 3.7076, "step": 1200 }, { "epoch": 0.12, "grad_norm": 1.1639502048492432, "learning_rate": 4.404373206688434e-05, "loss": 3.577, "step": 1204 }, { "epoch": 0.12, "grad_norm": 1.0819730758666992, "learning_rate": 4.402394380132582e-05, "loss": 3.5282, "step": 1208 }, { "epoch": 0.12, "grad_norm": 1.2191344499588013, "learning_rate": 4.40041555357673e-05, "loss": 3.447, "step": 1212 }, { "epoch": 0.12, "grad_norm": 1.1389458179473877, "learning_rate": 4.398436727020877e-05, "loss": 3.4366, "step": 1216 }, { "epoch": 0.12, "grad_norm": 1.2213441133499146, "learning_rate": 4.396457900465025e-05, "loss": 3.5216, "step": 1220 }, { "epoch": 0.12, "grad_norm": 1.4312680959701538, "learning_rate": 4.3944790739091716e-05, "loss": 3.5432, "step": 1224 }, { "epoch": 0.12, "grad_norm": 1.196575403213501, "learning_rate": 4.3925002473533196e-05, "loss": 3.3649, "step": 1228 }, { "epoch": 0.12, "grad_norm": 1.1721031665802002, "learning_rate": 4.390521420797467e-05, "loss": 3.3323, "step": 1232 }, { "epoch": 0.12, "grad_norm": 1.1894913911819458, "learning_rate": 4.388542594241615e-05, "loss": 3.431, "step": 1236 }, { "epoch": 0.12, "grad_norm": 1.127156138420105, "learning_rate": 4.386563767685762e-05, "loss": 3.5986, "step": 1240 }, { "epoch": 0.12, "grad_norm": 1.2058907747268677, "learning_rate": 4.38458494112991e-05, "loss": 3.4094, "step": 1244 }, { "epoch": 0.12, "grad_norm": 1.1665937900543213, "learning_rate": 4.3826061145740574e-05, "loss": 3.5399, "step": 1248 }, { "epoch": 0.12, "grad_norm": 1.0757410526275635, "learning_rate": 4.3806272880182054e-05, "loss": 3.3555, "step": 1252 }, { "epoch": 0.12, "grad_norm": 1.1566282510757446, "learning_rate": 4.378648461462353e-05, "loss": 3.5372, "step": 1256 }, { "epoch": 0.12, "grad_norm": 1.1599324941635132, "learning_rate": 4.3766696349065006e-05, "loss": 3.6885, "step": 1260 }, { "epoch": 0.13, "grad_norm": 1.2669498920440674, "learning_rate": 4.374690808350648e-05, "loss": 3.5088, "step": 1264 }, { "epoch": 0.13, "grad_norm": 1.2056463956832886, "learning_rate": 4.372711981794796e-05, "loss": 3.4207, "step": 1268 }, { "epoch": 0.13, "grad_norm": 1.285954475402832, "learning_rate": 4.370733155238944e-05, "loss": 3.4905, "step": 1272 }, { "epoch": 0.13, "grad_norm": 1.5418660640716553, "learning_rate": 4.368754328683091e-05, "loss": 3.46, "step": 1276 }, { "epoch": 0.13, "grad_norm": 1.3269129991531372, "learning_rate": 4.366775502127239e-05, "loss": 3.651, "step": 1280 }, { "epoch": 0.13, "grad_norm": 1.206346869468689, "learning_rate": 4.3647966755713864e-05, "loss": 3.4792, "step": 1284 }, { "epoch": 0.13, "grad_norm": 1.5066856145858765, "learning_rate": 4.3628178490155344e-05, "loss": 3.7163, "step": 1288 }, { "epoch": 0.13, "grad_norm": 1.660408616065979, "learning_rate": 4.360839022459682e-05, "loss": 3.3533, "step": 1292 }, { "epoch": 0.13, "grad_norm": 1.3185946941375732, "learning_rate": 4.35886019590383e-05, "loss": 3.6158, "step": 1296 }, { "epoch": 0.13, "grad_norm": 1.2295507192611694, "learning_rate": 4.356881369347977e-05, "loss": 3.3683, "step": 1300 }, { "epoch": 0.13, "grad_norm": 1.2218221426010132, "learning_rate": 4.354902542792125e-05, "loss": 3.3305, "step": 1304 }, { "epoch": 0.13, "grad_norm": 1.4210175275802612, "learning_rate": 4.352923716236272e-05, "loss": 3.4825, "step": 1308 }, { "epoch": 0.13, "grad_norm": 1.4661409854888916, "learning_rate": 4.3509448896804195e-05, "loss": 3.4458, "step": 1312 }, { "epoch": 0.13, "grad_norm": 1.2703677415847778, "learning_rate": 4.348966063124567e-05, "loss": 3.2625, "step": 1316 }, { "epoch": 0.13, "grad_norm": 1.4040093421936035, "learning_rate": 4.346987236568715e-05, "loss": 3.5308, "step": 1320 }, { "epoch": 0.13, "grad_norm": 1.1892530918121338, "learning_rate": 4.345008410012862e-05, "loss": 3.4657, "step": 1324 }, { "epoch": 0.13, "grad_norm": 1.3836407661437988, "learning_rate": 4.34302958345701e-05, "loss": 3.4017, "step": 1328 }, { "epoch": 0.13, "grad_norm": 1.1580448150634766, "learning_rate": 4.341050756901158e-05, "loss": 3.367, "step": 1332 }, { "epoch": 0.13, "grad_norm": 1.1472089290618896, "learning_rate": 4.339071930345305e-05, "loss": 3.4479, "step": 1336 }, { "epoch": 0.13, "grad_norm": 1.2365572452545166, "learning_rate": 4.337093103789453e-05, "loss": 3.4469, "step": 1340 }, { "epoch": 0.13, "grad_norm": 1.2386647462844849, "learning_rate": 4.3351142772336006e-05, "loss": 3.6044, "step": 1344 }, { "epoch": 0.13, "grad_norm": 1.3203320503234863, "learning_rate": 4.3331354506777485e-05, "loss": 3.5549, "step": 1348 }, { "epoch": 0.13, "grad_norm": 1.4202224016189575, "learning_rate": 4.331156624121896e-05, "loss": 3.6182, "step": 1352 }, { "epoch": 0.13, "grad_norm": 1.1654404401779175, "learning_rate": 4.329177797566044e-05, "loss": 3.4088, "step": 1356 }, { "epoch": 0.13, "grad_norm": 1.2231131792068481, "learning_rate": 4.327198971010191e-05, "loss": 3.4264, "step": 1360 }, { "epoch": 0.13, "grad_norm": 1.1695729494094849, "learning_rate": 4.325220144454339e-05, "loss": 3.4689, "step": 1364 }, { "epoch": 0.14, "grad_norm": 1.392952561378479, "learning_rate": 4.3232413178984864e-05, "loss": 3.4129, "step": 1368 }, { "epoch": 0.14, "grad_norm": 1.9919403791427612, "learning_rate": 4.321262491342634e-05, "loss": 3.7645, "step": 1372 }, { "epoch": 0.14, "grad_norm": 1.3091975450515747, "learning_rate": 4.3192836647867816e-05, "loss": 3.4731, "step": 1376 }, { "epoch": 0.14, "grad_norm": 1.440266489982605, "learning_rate": 4.3173048382309296e-05, "loss": 3.3394, "step": 1380 }, { "epoch": 0.14, "grad_norm": 1.3461400270462036, "learning_rate": 4.315326011675077e-05, "loss": 3.3563, "step": 1384 }, { "epoch": 0.14, "grad_norm": 1.4940065145492554, "learning_rate": 4.313347185119225e-05, "loss": 3.4845, "step": 1388 }, { "epoch": 0.14, "grad_norm": 1.401813268661499, "learning_rate": 4.311368358563372e-05, "loss": 3.3725, "step": 1392 }, { "epoch": 0.14, "grad_norm": 1.2095842361450195, "learning_rate": 4.3093895320075194e-05, "loss": 3.5393, "step": 1396 }, { "epoch": 0.14, "grad_norm": 1.553962230682373, "learning_rate": 4.3074107054516674e-05, "loss": 3.6136, "step": 1400 }, { "epoch": 0.14, "grad_norm": 1.1763421297073364, "learning_rate": 4.305431878895815e-05, "loss": 3.3757, "step": 1404 }, { "epoch": 0.14, "grad_norm": 1.5987308025360107, "learning_rate": 4.303453052339963e-05, "loss": 3.5515, "step": 1408 }, { "epoch": 0.14, "grad_norm": 1.3491801023483276, "learning_rate": 4.30147422578411e-05, "loss": 3.5995, "step": 1412 }, { "epoch": 0.14, "grad_norm": 1.194741129875183, "learning_rate": 4.299495399228258e-05, "loss": 3.3052, "step": 1416 }, { "epoch": 0.14, "grad_norm": 1.3502442836761475, "learning_rate": 4.297516572672405e-05, "loss": 3.3727, "step": 1420 }, { "epoch": 0.14, "grad_norm": 1.4272186756134033, "learning_rate": 4.295537746116553e-05, "loss": 3.3057, "step": 1424 }, { "epoch": 0.14, "grad_norm": 1.1727451086044312, "learning_rate": 4.2935589195607005e-05, "loss": 3.4478, "step": 1428 }, { "epoch": 0.14, "grad_norm": 1.367772102355957, "learning_rate": 4.2915800930048485e-05, "loss": 3.3518, "step": 1432 }, { "epoch": 0.14, "grad_norm": 1.3269891738891602, "learning_rate": 4.289601266448996e-05, "loss": 3.3426, "step": 1436 }, { "epoch": 0.14, "grad_norm": 1.188839316368103, "learning_rate": 4.287622439893144e-05, "loss": 3.4735, "step": 1440 }, { "epoch": 0.14, "grad_norm": 1.3754055500030518, "learning_rate": 4.285643613337291e-05, "loss": 3.3271, "step": 1444 }, { "epoch": 0.14, "grad_norm": 1.2622300386428833, "learning_rate": 4.283664786781439e-05, "loss": 3.7099, "step": 1448 }, { "epoch": 0.14, "grad_norm": 1.2850700616836548, "learning_rate": 4.281685960225586e-05, "loss": 3.3448, "step": 1452 }, { "epoch": 0.14, "grad_norm": 1.3063966035842896, "learning_rate": 4.279707133669734e-05, "loss": 3.4525, "step": 1456 }, { "epoch": 0.14, "grad_norm": 1.2147005796432495, "learning_rate": 4.2777283071138816e-05, "loss": 3.3215, "step": 1460 }, { "epoch": 0.14, "grad_norm": 1.347546100616455, "learning_rate": 4.2757494805580295e-05, "loss": 3.5355, "step": 1464 }, { "epoch": 0.15, "grad_norm": 1.228791356086731, "learning_rate": 4.273770654002177e-05, "loss": 3.3136, "step": 1468 }, { "epoch": 0.15, "grad_norm": 1.5091530084609985, "learning_rate": 4.271791827446325e-05, "loss": 3.321, "step": 1472 }, { "epoch": 0.15, "grad_norm": 1.1982457637786865, "learning_rate": 4.269813000890472e-05, "loss": 3.3217, "step": 1476 }, { "epoch": 0.15, "grad_norm": 1.2437925338745117, "learning_rate": 4.2678341743346194e-05, "loss": 3.666, "step": 1480 }, { "epoch": 0.15, "grad_norm": 1.508147120475769, "learning_rate": 4.2658553477787674e-05, "loss": 3.1524, "step": 1484 }, { "epoch": 0.15, "grad_norm": 1.1853328943252563, "learning_rate": 4.2638765212229146e-05, "loss": 3.3933, "step": 1488 }, { "epoch": 0.15, "grad_norm": 1.410609245300293, "learning_rate": 4.2618976946670626e-05, "loss": 3.6355, "step": 1492 }, { "epoch": 0.15, "grad_norm": 1.2669347524642944, "learning_rate": 4.25991886811121e-05, "loss": 3.3659, "step": 1496 }, { "epoch": 0.15, "grad_norm": 1.1890374422073364, "learning_rate": 4.257940041555358e-05, "loss": 3.4131, "step": 1500 }, { "epoch": 0.15, "grad_norm": 1.2087454795837402, "learning_rate": 4.255961214999505e-05, "loss": 3.437, "step": 1504 }, { "epoch": 0.15, "grad_norm": 1.2528936862945557, "learning_rate": 4.253982388443653e-05, "loss": 3.3099, "step": 1508 }, { "epoch": 0.15, "grad_norm": 1.1920204162597656, "learning_rate": 4.2520035618878004e-05, "loss": 3.4506, "step": 1512 }, { "epoch": 0.15, "grad_norm": 1.289352297782898, "learning_rate": 4.2500247353319484e-05, "loss": 3.5612, "step": 1516 }, { "epoch": 0.15, "grad_norm": 1.3303660154342651, "learning_rate": 4.248045908776096e-05, "loss": 3.4003, "step": 1520 }, { "epoch": 0.15, "grad_norm": 1.2178258895874023, "learning_rate": 4.246067082220244e-05, "loss": 3.09, "step": 1524 }, { "epoch": 0.15, "grad_norm": 1.2495688199996948, "learning_rate": 4.244088255664391e-05, "loss": 3.362, "step": 1528 }, { "epoch": 0.15, "grad_norm": 1.2434120178222656, "learning_rate": 4.242109429108539e-05, "loss": 3.2701, "step": 1532 }, { "epoch": 0.15, "grad_norm": 1.4451349973678589, "learning_rate": 4.240130602552686e-05, "loss": 3.3683, "step": 1536 }, { "epoch": 0.15, "grad_norm": 1.4097487926483154, "learning_rate": 4.238151775996834e-05, "loss": 3.2426, "step": 1540 }, { "epoch": 0.15, "grad_norm": 1.2663370370864868, "learning_rate": 4.236172949440982e-05, "loss": 3.177, "step": 1544 }, { "epoch": 0.15, "grad_norm": 1.4020271301269531, "learning_rate": 4.2341941228851295e-05, "loss": 3.3555, "step": 1548 }, { "epoch": 0.15, "grad_norm": 1.355491042137146, "learning_rate": 4.2322152963292774e-05, "loss": 3.2385, "step": 1552 }, { "epoch": 0.15, "grad_norm": 1.521461844444275, "learning_rate": 4.230236469773425e-05, "loss": 3.153, "step": 1556 }, { "epoch": 0.15, "grad_norm": 1.278916597366333, "learning_rate": 4.228257643217573e-05, "loss": 3.3441, "step": 1560 }, { "epoch": 0.15, "grad_norm": 1.2973657846450806, "learning_rate": 4.226278816661719e-05, "loss": 3.3513, "step": 1564 }, { "epoch": 0.16, "grad_norm": 1.254542350769043, "learning_rate": 4.224299990105867e-05, "loss": 3.3336, "step": 1568 }, { "epoch": 0.16, "grad_norm": 1.475539207458496, "learning_rate": 4.2223211635500146e-05, "loss": 3.2832, "step": 1572 }, { "epoch": 0.16, "grad_norm": 1.3780232667922974, "learning_rate": 4.2203423369941626e-05, "loss": 3.4761, "step": 1576 }, { "epoch": 0.16, "grad_norm": 1.28725266456604, "learning_rate": 4.21836351043831e-05, "loss": 3.3786, "step": 1580 }, { "epoch": 0.16, "grad_norm": 1.1149132251739502, "learning_rate": 4.216384683882458e-05, "loss": 3.4289, "step": 1584 }, { "epoch": 0.16, "grad_norm": 1.2939023971557617, "learning_rate": 4.214405857326605e-05, "loss": 3.3372, "step": 1588 }, { "epoch": 0.16, "grad_norm": 1.5904688835144043, "learning_rate": 4.212427030770753e-05, "loss": 3.3784, "step": 1592 }, { "epoch": 0.16, "grad_norm": 1.3073712587356567, "learning_rate": 4.2104482042149004e-05, "loss": 3.4208, "step": 1596 }, { "epoch": 0.16, "grad_norm": 1.3639119863510132, "learning_rate": 4.2084693776590483e-05, "loss": 3.2751, "step": 1600 }, { "epoch": 0.16, "grad_norm": 1.1963696479797363, "learning_rate": 4.206490551103196e-05, "loss": 3.3383, "step": 1604 }, { "epoch": 0.16, "grad_norm": 1.503912091255188, "learning_rate": 4.2045117245473436e-05, "loss": 3.3637, "step": 1608 }, { "epoch": 0.16, "grad_norm": 1.4482710361480713, "learning_rate": 4.2025328979914916e-05, "loss": 3.5253, "step": 1612 }, { "epoch": 0.16, "grad_norm": 1.301410436630249, "learning_rate": 4.200554071435639e-05, "loss": 3.3642, "step": 1616 }, { "epoch": 0.16, "grad_norm": 1.2556508779525757, "learning_rate": 4.198575244879787e-05, "loss": 3.2129, "step": 1620 }, { "epoch": 0.16, "grad_norm": 1.2119685411453247, "learning_rate": 4.196596418323934e-05, "loss": 3.2957, "step": 1624 }, { "epoch": 0.16, "grad_norm": 1.4289143085479736, "learning_rate": 4.194617591768082e-05, "loss": 3.5322, "step": 1628 }, { "epoch": 0.16, "grad_norm": 1.314368486404419, "learning_rate": 4.1926387652122294e-05, "loss": 3.1358, "step": 1632 }, { "epoch": 0.16, "grad_norm": 1.2228057384490967, "learning_rate": 4.1906599386563774e-05, "loss": 3.4493, "step": 1636 }, { "epoch": 0.16, "grad_norm": 1.2432242631912231, "learning_rate": 4.188681112100525e-05, "loss": 3.2671, "step": 1640 }, { "epoch": 0.16, "grad_norm": 1.34592866897583, "learning_rate": 4.1867022855446726e-05, "loss": 3.3529, "step": 1644 }, { "epoch": 0.16, "grad_norm": 1.360672116279602, "learning_rate": 4.184723458988819e-05, "loss": 3.1993, "step": 1648 }, { "epoch": 0.16, "grad_norm": 1.199696660041809, "learning_rate": 4.182744632432967e-05, "loss": 3.376, "step": 1652 }, { "epoch": 0.16, "grad_norm": 1.3399593830108643, "learning_rate": 4.180765805877115e-05, "loss": 3.4037, "step": 1656 }, { "epoch": 0.16, "grad_norm": 1.1756970882415771, "learning_rate": 4.1787869793212625e-05, "loss": 3.3409, "step": 1660 }, { "epoch": 0.16, "grad_norm": 1.292723298072815, "learning_rate": 4.1768081527654105e-05, "loss": 3.3704, "step": 1664 }, { "epoch": 0.17, "grad_norm": 1.324936032295227, "learning_rate": 4.174829326209558e-05, "loss": 3.3542, "step": 1668 }, { "epoch": 0.17, "grad_norm": 1.5456078052520752, "learning_rate": 4.172850499653706e-05, "loss": 3.2134, "step": 1672 }, { "epoch": 0.17, "grad_norm": 1.175718903541565, "learning_rate": 4.170871673097853e-05, "loss": 3.1983, "step": 1676 }, { "epoch": 0.17, "grad_norm": 1.1541227102279663, "learning_rate": 4.168892846542001e-05, "loss": 3.1252, "step": 1680 }, { "epoch": 0.17, "grad_norm": 1.4465668201446533, "learning_rate": 4.166914019986148e-05, "loss": 3.2788, "step": 1684 }, { "epoch": 0.17, "grad_norm": 1.340494155883789, "learning_rate": 4.164935193430296e-05, "loss": 3.5389, "step": 1688 }, { "epoch": 0.17, "grad_norm": 1.2862470149993896, "learning_rate": 4.1629563668744435e-05, "loss": 3.3313, "step": 1692 }, { "epoch": 0.17, "grad_norm": 1.4783071279525757, "learning_rate": 4.1609775403185915e-05, "loss": 3.2759, "step": 1696 }, { "epoch": 0.17, "grad_norm": 1.1228585243225098, "learning_rate": 4.158998713762739e-05, "loss": 3.3506, "step": 1700 }, { "epoch": 0.17, "grad_norm": 1.2861573696136475, "learning_rate": 4.157019887206887e-05, "loss": 3.3374, "step": 1704 }, { "epoch": 0.17, "grad_norm": 1.3766535520553589, "learning_rate": 4.155041060651034e-05, "loss": 3.264, "step": 1708 }, { "epoch": 0.17, "grad_norm": 1.4737287759780884, "learning_rate": 4.153062234095182e-05, "loss": 3.4986, "step": 1712 }, { "epoch": 0.17, "grad_norm": 1.3399109840393066, "learning_rate": 4.1510834075393293e-05, "loss": 3.5896, "step": 1716 }, { "epoch": 0.17, "grad_norm": 1.3930294513702393, "learning_rate": 4.149104580983477e-05, "loss": 3.2878, "step": 1720 }, { "epoch": 0.17, "grad_norm": 1.2427089214324951, "learning_rate": 4.1471257544276246e-05, "loss": 3.3721, "step": 1724 }, { "epoch": 0.17, "grad_norm": 1.3937251567840576, "learning_rate": 4.1451469278717726e-05, "loss": 3.3628, "step": 1728 }, { "epoch": 0.17, "grad_norm": 1.4732547998428345, "learning_rate": 4.14316810131592e-05, "loss": 3.3437, "step": 1732 }, { "epoch": 0.17, "grad_norm": 1.2372941970825195, "learning_rate": 4.141189274760067e-05, "loss": 3.2885, "step": 1736 }, { "epoch": 0.17, "grad_norm": 1.3826080560684204, "learning_rate": 4.139210448204215e-05, "loss": 3.2253, "step": 1740 }, { "epoch": 0.17, "grad_norm": 1.3599334955215454, "learning_rate": 4.1372316216483624e-05, "loss": 3.3872, "step": 1744 }, { "epoch": 0.17, "grad_norm": 1.286747694015503, "learning_rate": 4.1352527950925104e-05, "loss": 3.4709, "step": 1748 }, { "epoch": 0.17, "grad_norm": 1.3777498006820679, "learning_rate": 4.133273968536658e-05, "loss": 3.2032, "step": 1752 }, { "epoch": 0.17, "grad_norm": 1.3295429944992065, "learning_rate": 4.131295141980806e-05, "loss": 3.4612, "step": 1756 }, { "epoch": 0.17, "grad_norm": 1.2335968017578125, "learning_rate": 4.129316315424953e-05, "loss": 3.2492, "step": 1760 }, { "epoch": 0.17, "grad_norm": 1.282143235206604, "learning_rate": 4.127337488869101e-05, "loss": 3.3274, "step": 1764 }, { "epoch": 0.17, "grad_norm": 1.2488516569137573, "learning_rate": 4.125358662313248e-05, "loss": 3.3067, "step": 1768 }, { "epoch": 0.18, "grad_norm": 1.3893166780471802, "learning_rate": 4.123379835757396e-05, "loss": 3.1867, "step": 1772 }, { "epoch": 0.18, "grad_norm": 1.3564475774765015, "learning_rate": 4.1214010092015435e-05, "loss": 3.2346, "step": 1776 }, { "epoch": 0.18, "grad_norm": 1.2899243831634521, "learning_rate": 4.1194221826456915e-05, "loss": 3.3261, "step": 1780 }, { "epoch": 0.18, "grad_norm": 1.25933837890625, "learning_rate": 4.117443356089839e-05, "loss": 3.3999, "step": 1784 }, { "epoch": 0.18, "grad_norm": 1.4917421340942383, "learning_rate": 4.115464529533987e-05, "loss": 3.5482, "step": 1788 }, { "epoch": 0.18, "grad_norm": 1.2294834852218628, "learning_rate": 4.113485702978134e-05, "loss": 3.1481, "step": 1792 }, { "epoch": 0.18, "grad_norm": 1.3130604028701782, "learning_rate": 4.111506876422282e-05, "loss": 3.5579, "step": 1796 }, { "epoch": 0.18, "grad_norm": 1.234863519668579, "learning_rate": 4.109528049866429e-05, "loss": 3.2204, "step": 1800 }, { "epoch": 0.18, "grad_norm": 1.606605052947998, "learning_rate": 4.107549223310577e-05, "loss": 3.3306, "step": 1804 }, { "epoch": 0.18, "grad_norm": 1.2031803131103516, "learning_rate": 4.1055703967547245e-05, "loss": 3.3349, "step": 1808 }, { "epoch": 0.18, "grad_norm": 1.354557991027832, "learning_rate": 4.1035915701988725e-05, "loss": 3.1236, "step": 1812 }, { "epoch": 0.18, "grad_norm": 1.3591339588165283, "learning_rate": 4.10161274364302e-05, "loss": 3.1869, "step": 1816 }, { "epoch": 0.18, "grad_norm": 1.277288556098938, "learning_rate": 4.099633917087167e-05, "loss": 3.0739, "step": 1820 }, { "epoch": 0.18, "grad_norm": 1.4283719062805176, "learning_rate": 4.097655090531315e-05, "loss": 3.3606, "step": 1824 }, { "epoch": 0.18, "grad_norm": 1.3387796878814697, "learning_rate": 4.0956762639754624e-05, "loss": 3.3761, "step": 1828 }, { "epoch": 0.18, "grad_norm": 1.3217228651046753, "learning_rate": 4.09369743741961e-05, "loss": 3.3802, "step": 1832 }, { "epoch": 0.18, "grad_norm": 1.2054520845413208, "learning_rate": 4.0917186108637576e-05, "loss": 3.1521, "step": 1836 }, { "epoch": 0.18, "grad_norm": 1.6184253692626953, "learning_rate": 4.0897397843079056e-05, "loss": 3.3772, "step": 1840 }, { "epoch": 0.18, "grad_norm": 1.4544942378997803, "learning_rate": 4.087760957752053e-05, "loss": 3.2255, "step": 1844 }, { "epoch": 0.18, "grad_norm": 1.247625708580017, "learning_rate": 4.085782131196201e-05, "loss": 3.2605, "step": 1848 }, { "epoch": 0.18, "grad_norm": 1.2736012935638428, "learning_rate": 4.083803304640348e-05, "loss": 3.1793, "step": 1852 }, { "epoch": 0.18, "grad_norm": 1.2771648168563843, "learning_rate": 4.081824478084496e-05, "loss": 3.1654, "step": 1856 }, { "epoch": 0.18, "grad_norm": 1.1745537519454956, "learning_rate": 4.0798456515286434e-05, "loss": 3.1628, "step": 1860 }, { "epoch": 0.18, "grad_norm": 1.2880799770355225, "learning_rate": 4.0778668249727914e-05, "loss": 3.2518, "step": 1864 }, { "epoch": 0.18, "grad_norm": 1.3577632904052734, "learning_rate": 4.075887998416939e-05, "loss": 3.171, "step": 1868 }, { "epoch": 0.19, "grad_norm": 1.3156131505966187, "learning_rate": 4.0739091718610867e-05, "loss": 3.4448, "step": 1872 }, { "epoch": 0.19, "grad_norm": 1.2552664279937744, "learning_rate": 4.0719303453052346e-05, "loss": 3.3617, "step": 1876 }, { "epoch": 0.19, "grad_norm": 1.3123071193695068, "learning_rate": 4.069951518749382e-05, "loss": 3.271, "step": 1880 }, { "epoch": 0.19, "grad_norm": 1.3457539081573486, "learning_rate": 4.06797269219353e-05, "loss": 3.2374, "step": 1884 }, { "epoch": 0.19, "grad_norm": 1.3469061851501465, "learning_rate": 4.065993865637677e-05, "loss": 3.2502, "step": 1888 }, { "epoch": 0.19, "grad_norm": 1.2933361530303955, "learning_rate": 4.064015039081825e-05, "loss": 3.1439, "step": 1892 }, { "epoch": 0.19, "grad_norm": 1.2762932777404785, "learning_rate": 4.0620362125259724e-05, "loss": 3.1837, "step": 1896 }, { "epoch": 0.19, "grad_norm": 1.3552213907241821, "learning_rate": 4.06005738597012e-05, "loss": 3.1888, "step": 1900 }, { "epoch": 0.19, "grad_norm": 1.3573856353759766, "learning_rate": 4.058078559414267e-05, "loss": 3.1358, "step": 1904 }, { "epoch": 0.19, "grad_norm": 1.308092474937439, "learning_rate": 4.056099732858415e-05, "loss": 3.3578, "step": 1908 }, { "epoch": 0.19, "grad_norm": 1.4167733192443848, "learning_rate": 4.054120906302562e-05, "loss": 3.4611, "step": 1912 }, { "epoch": 0.19, "grad_norm": 1.3537653684616089, "learning_rate": 4.05214207974671e-05, "loss": 3.224, "step": 1916 }, { "epoch": 0.19, "grad_norm": 1.4007222652435303, "learning_rate": 4.0501632531908576e-05, "loss": 3.2109, "step": 1920 }, { "epoch": 0.19, "grad_norm": 1.3303008079528809, "learning_rate": 4.0481844266350055e-05, "loss": 3.4183, "step": 1924 }, { "epoch": 0.19, "grad_norm": 1.5393402576446533, "learning_rate": 4.0462056000791535e-05, "loss": 3.1778, "step": 1928 }, { "epoch": 0.19, "grad_norm": 1.2925307750701904, "learning_rate": 4.044226773523301e-05, "loss": 3.3246, "step": 1932 }, { "epoch": 0.19, "grad_norm": 1.2742151021957397, "learning_rate": 4.042247946967449e-05, "loss": 3.2618, "step": 1936 }, { "epoch": 0.19, "grad_norm": 1.341132402420044, "learning_rate": 4.040269120411596e-05, "loss": 3.2285, "step": 1940 }, { "epoch": 0.19, "grad_norm": 1.279293179512024, "learning_rate": 4.038290293855744e-05, "loss": 3.2013, "step": 1944 }, { "epoch": 0.19, "grad_norm": 1.304842233657837, "learning_rate": 4.036311467299891e-05, "loss": 3.3386, "step": 1948 }, { "epoch": 0.19, "grad_norm": 1.4351922273635864, "learning_rate": 4.034332640744039e-05, "loss": 3.3236, "step": 1952 }, { "epoch": 0.19, "grad_norm": 1.3184465169906616, "learning_rate": 4.0323538141881866e-05, "loss": 3.3001, "step": 1956 }, { "epoch": 0.19, "grad_norm": 1.358441948890686, "learning_rate": 4.0303749876323346e-05, "loss": 3.3153, "step": 1960 }, { "epoch": 0.19, "grad_norm": 1.3413522243499756, "learning_rate": 4.028396161076482e-05, "loss": 3.2905, "step": 1964 }, { "epoch": 0.19, "grad_norm": 1.5526001453399658, "learning_rate": 4.02641733452063e-05, "loss": 3.2665, "step": 1968 }, { "epoch": 0.2, "grad_norm": 1.4052098989486694, "learning_rate": 4.024438507964777e-05, "loss": 3.0877, "step": 1972 }, { "epoch": 0.2, "grad_norm": 1.362823724746704, "learning_rate": 4.022459681408925e-05, "loss": 3.0379, "step": 1976 }, { "epoch": 0.2, "grad_norm": 1.3832095861434937, "learning_rate": 4.0204808548530724e-05, "loss": 3.2831, "step": 1980 }, { "epoch": 0.2, "grad_norm": 1.3092079162597656, "learning_rate": 4.01850202829722e-05, "loss": 3.2422, "step": 1984 }, { "epoch": 0.2, "grad_norm": 1.3156044483184814, "learning_rate": 4.0165232017413676e-05, "loss": 3.2832, "step": 1988 }, { "epoch": 0.2, "grad_norm": 1.2616153955459595, "learning_rate": 4.014544375185515e-05, "loss": 3.2577, "step": 1992 }, { "epoch": 0.2, "grad_norm": 1.3820970058441162, "learning_rate": 4.012565548629663e-05, "loss": 3.1686, "step": 1996 }, { "epoch": 0.2, "grad_norm": 1.3397316932678223, "learning_rate": 4.01058672207381e-05, "loss": 3.1746, "step": 2000 }, { "epoch": 0.2, "grad_norm": 1.2949211597442627, "learning_rate": 4.008607895517958e-05, "loss": 3.1891, "step": 2004 }, { "epoch": 0.2, "grad_norm": 1.457186222076416, "learning_rate": 4.0066290689621055e-05, "loss": 3.2126, "step": 2008 }, { "epoch": 0.2, "grad_norm": 1.3785138130187988, "learning_rate": 4.0046502424062534e-05, "loss": 3.3084, "step": 2012 }, { "epoch": 0.2, "grad_norm": 1.2410658597946167, "learning_rate": 4.002671415850401e-05, "loss": 3.2643, "step": 2016 }, { "epoch": 0.2, "grad_norm": 1.4620486497879028, "learning_rate": 4.000692589294549e-05, "loss": 3.1098, "step": 2020 }, { "epoch": 0.2, "grad_norm": 1.2891669273376465, "learning_rate": 3.998713762738696e-05, "loss": 3.1495, "step": 2024 }, { "epoch": 0.2, "grad_norm": 1.4822944402694702, "learning_rate": 3.996734936182844e-05, "loss": 3.2116, "step": 2028 }, { "epoch": 0.2, "grad_norm": 1.3625112771987915, "learning_rate": 3.994756109626991e-05, "loss": 3.2135, "step": 2032 }, { "epoch": 0.2, "grad_norm": 1.4246516227722168, "learning_rate": 3.992777283071139e-05, "loss": 3.0766, "step": 2036 }, { "epoch": 0.2, "grad_norm": 1.4355683326721191, "learning_rate": 3.9907984565152865e-05, "loss": 3.1343, "step": 2040 }, { "epoch": 0.2, "grad_norm": 1.3231664896011353, "learning_rate": 3.9888196299594345e-05, "loss": 2.9431, "step": 2044 }, { "epoch": 0.2, "grad_norm": 1.5020241737365723, "learning_rate": 3.986840803403582e-05, "loss": 3.3893, "step": 2048 }, { "epoch": 0.2, "grad_norm": 1.5987601280212402, "learning_rate": 3.98486197684773e-05, "loss": 3.3362, "step": 2052 }, { "epoch": 0.2, "grad_norm": 1.5132125616073608, "learning_rate": 3.982883150291877e-05, "loss": 3.5167, "step": 2056 }, { "epoch": 0.2, "grad_norm": 1.2821168899536133, "learning_rate": 3.980904323736025e-05, "loss": 3.2242, "step": 2060 }, { "epoch": 0.2, "grad_norm": 1.3775780200958252, "learning_rate": 3.978925497180172e-05, "loss": 3.2939, "step": 2064 }, { "epoch": 0.2, "grad_norm": 1.260902762413025, "learning_rate": 3.9769466706243196e-05, "loss": 3.0705, "step": 2068 }, { "epoch": 0.21, "grad_norm": 1.3593419790267944, "learning_rate": 3.9749678440684676e-05, "loss": 3.3769, "step": 2072 }, { "epoch": 0.21, "grad_norm": 1.4187499284744263, "learning_rate": 3.972989017512615e-05, "loss": 3.0845, "step": 2076 }, { "epoch": 0.21, "grad_norm": 1.3207932710647583, "learning_rate": 3.971010190956763e-05, "loss": 3.062, "step": 2080 }, { "epoch": 0.21, "grad_norm": 1.2566810846328735, "learning_rate": 3.96903136440091e-05, "loss": 3.2373, "step": 2084 }, { "epoch": 0.21, "grad_norm": 1.4184499979019165, "learning_rate": 3.967052537845058e-05, "loss": 3.243, "step": 2088 }, { "epoch": 0.21, "grad_norm": 1.358438491821289, "learning_rate": 3.9650737112892054e-05, "loss": 3.1513, "step": 2092 }, { "epoch": 0.21, "grad_norm": 1.1665936708450317, "learning_rate": 3.9630948847333534e-05, "loss": 3.1287, "step": 2096 }, { "epoch": 0.21, "grad_norm": 1.3452047109603882, "learning_rate": 3.961116058177501e-05, "loss": 3.2622, "step": 2100 }, { "epoch": 0.21, "grad_norm": 1.434053659439087, "learning_rate": 3.9591372316216486e-05, "loss": 3.3259, "step": 2104 }, { "epoch": 0.21, "grad_norm": 1.3744926452636719, "learning_rate": 3.957158405065796e-05, "loss": 3.2474, "step": 2108 }, { "epoch": 0.21, "grad_norm": 1.656575322151184, "learning_rate": 3.955179578509944e-05, "loss": 3.1016, "step": 2112 }, { "epoch": 0.21, "grad_norm": 1.3220798969268799, "learning_rate": 3.953200751954091e-05, "loss": 3.167, "step": 2116 }, { "epoch": 0.21, "grad_norm": 1.250577449798584, "learning_rate": 3.951221925398239e-05, "loss": 3.0891, "step": 2120 }, { "epoch": 0.21, "grad_norm": 1.4257659912109375, "learning_rate": 3.9492430988423865e-05, "loss": 3.2264, "step": 2124 }, { "epoch": 0.21, "grad_norm": 1.6712124347686768, "learning_rate": 3.9472642722865344e-05, "loss": 3.2472, "step": 2128 }, { "epoch": 0.21, "grad_norm": 1.240073323249817, "learning_rate": 3.945285445730682e-05, "loss": 3.2035, "step": 2132 }, { "epoch": 0.21, "grad_norm": 1.3379197120666504, "learning_rate": 3.94330661917483e-05, "loss": 3.1321, "step": 2136 }, { "epoch": 0.21, "grad_norm": 1.395456314086914, "learning_rate": 3.941327792618978e-05, "loss": 3.2416, "step": 2140 }, { "epoch": 0.21, "grad_norm": 1.4799938201904297, "learning_rate": 3.939348966063125e-05, "loss": 3.1631, "step": 2144 }, { "epoch": 0.21, "grad_norm": 1.4667563438415527, "learning_rate": 3.937370139507273e-05, "loss": 3.247, "step": 2148 }, { "epoch": 0.21, "grad_norm": 1.5173306465148926, "learning_rate": 3.93539131295142e-05, "loss": 3.242, "step": 2152 }, { "epoch": 0.21, "grad_norm": 1.4049328565597534, "learning_rate": 3.9334124863955675e-05, "loss": 3.2837, "step": 2156 }, { "epoch": 0.21, "grad_norm": 1.4764662981033325, "learning_rate": 3.931433659839715e-05, "loss": 3.201, "step": 2160 }, { "epoch": 0.21, "grad_norm": 1.3218275308609009, "learning_rate": 3.929454833283863e-05, "loss": 3.0854, "step": 2164 }, { "epoch": 0.21, "grad_norm": 1.3813103437423706, "learning_rate": 3.92747600672801e-05, "loss": 3.1529, "step": 2168 }, { "epoch": 0.21, "grad_norm": 1.2131150960922241, "learning_rate": 3.925497180172158e-05, "loss": 3.0118, "step": 2172 }, { "epoch": 0.22, "grad_norm": 1.3104761838912964, "learning_rate": 3.9235183536163053e-05, "loss": 3.0601, "step": 2176 }, { "epoch": 0.22, "grad_norm": 1.6685932874679565, "learning_rate": 3.921539527060453e-05, "loss": 3.1585, "step": 2180 }, { "epoch": 0.22, "grad_norm": 1.3816349506378174, "learning_rate": 3.9195607005046006e-05, "loss": 3.2128, "step": 2184 }, { "epoch": 0.22, "grad_norm": 1.293393611907959, "learning_rate": 3.9175818739487486e-05, "loss": 3.2281, "step": 2188 }, { "epoch": 0.22, "grad_norm": 1.4393213987350464, "learning_rate": 3.915603047392896e-05, "loss": 3.298, "step": 2192 }, { "epoch": 0.22, "grad_norm": 1.3526524305343628, "learning_rate": 3.913624220837044e-05, "loss": 3.0613, "step": 2196 }, { "epoch": 0.22, "grad_norm": 1.4308154582977295, "learning_rate": 3.911645394281192e-05, "loss": 3.2307, "step": 2200 }, { "epoch": 0.22, "grad_norm": 1.688053011894226, "learning_rate": 3.909666567725339e-05, "loss": 3.1716, "step": 2204 }, { "epoch": 0.22, "grad_norm": 1.5624258518218994, "learning_rate": 3.907687741169487e-05, "loss": 3.1115, "step": 2208 }, { "epoch": 0.22, "grad_norm": 1.3107057809829712, "learning_rate": 3.9057089146136344e-05, "loss": 3.3096, "step": 2212 }, { "epoch": 0.22, "grad_norm": 1.3054726123809814, "learning_rate": 3.9037300880577823e-05, "loss": 3.0534, "step": 2216 }, { "epoch": 0.22, "grad_norm": 1.2478541135787964, "learning_rate": 3.9017512615019296e-05, "loss": 3.2014, "step": 2220 }, { "epoch": 0.22, "grad_norm": 1.2822325229644775, "learning_rate": 3.8997724349460776e-05, "loss": 3.1986, "step": 2224 }, { "epoch": 0.22, "grad_norm": 1.383755087852478, "learning_rate": 3.897793608390225e-05, "loss": 3.1122, "step": 2228 }, { "epoch": 0.22, "grad_norm": 1.3420864343643188, "learning_rate": 3.895814781834373e-05, "loss": 3.0604, "step": 2232 }, { "epoch": 0.22, "grad_norm": 1.3208297491073608, "learning_rate": 3.89383595527852e-05, "loss": 3.0086, "step": 2236 }, { "epoch": 0.22, "grad_norm": 1.3529295921325684, "learning_rate": 3.8918571287226675e-05, "loss": 3.146, "step": 2240 }, { "epoch": 0.22, "grad_norm": 1.4611573219299316, "learning_rate": 3.889878302166815e-05, "loss": 3.1959, "step": 2244 }, { "epoch": 0.22, "grad_norm": 1.354971170425415, "learning_rate": 3.887899475610963e-05, "loss": 2.8893, "step": 2248 }, { "epoch": 0.22, "grad_norm": 1.6441867351531982, "learning_rate": 3.88592064905511e-05, "loss": 3.2257, "step": 2252 }, { "epoch": 0.22, "grad_norm": 1.3111629486083984, "learning_rate": 3.883941822499258e-05, "loss": 3.0993, "step": 2256 }, { "epoch": 0.22, "grad_norm": 1.3077386617660522, "learning_rate": 3.881962995943406e-05, "loss": 3.088, "step": 2260 }, { "epoch": 0.22, "grad_norm": 1.331356167793274, "learning_rate": 3.879984169387553e-05, "loss": 3.31, "step": 2264 }, { "epoch": 0.22, "grad_norm": 1.4217604398727417, "learning_rate": 3.878005342831701e-05, "loss": 3.1174, "step": 2268 }, { "epoch": 0.22, "grad_norm": 1.2443788051605225, "learning_rate": 3.8760265162758485e-05, "loss": 3.11, "step": 2272 }, { "epoch": 0.23, "grad_norm": 1.288475751876831, "learning_rate": 3.8740476897199965e-05, "loss": 3.2074, "step": 2276 }, { "epoch": 0.23, "grad_norm": 1.427825689315796, "learning_rate": 3.872068863164144e-05, "loss": 3.0412, "step": 2280 }, { "epoch": 0.23, "grad_norm": 1.4200148582458496, "learning_rate": 3.870090036608292e-05, "loss": 3.1752, "step": 2284 }, { "epoch": 0.23, "grad_norm": 1.460418939590454, "learning_rate": 3.868111210052439e-05, "loss": 3.2623, "step": 2288 }, { "epoch": 0.23, "grad_norm": 1.38273024559021, "learning_rate": 3.866132383496587e-05, "loss": 3.0194, "step": 2292 }, { "epoch": 0.23, "grad_norm": 1.5908851623535156, "learning_rate": 3.864153556940734e-05, "loss": 3.1621, "step": 2296 }, { "epoch": 0.23, "grad_norm": 1.4536399841308594, "learning_rate": 3.862174730384882e-05, "loss": 3.1168, "step": 2300 }, { "epoch": 0.23, "grad_norm": 1.3599523305892944, "learning_rate": 3.8601959038290296e-05, "loss": 3.2446, "step": 2304 }, { "epoch": 0.23, "grad_norm": 1.3411145210266113, "learning_rate": 3.8582170772731775e-05, "loss": 3.0645, "step": 2308 }, { "epoch": 0.23, "grad_norm": 1.4459635019302368, "learning_rate": 3.856238250717325e-05, "loss": 3.0084, "step": 2312 }, { "epoch": 0.23, "grad_norm": 1.358909249305725, "learning_rate": 3.854259424161473e-05, "loss": 3.1009, "step": 2316 }, { "epoch": 0.23, "grad_norm": 1.5240005254745483, "learning_rate": 3.85228059760562e-05, "loss": 3.1275, "step": 2320 }, { "epoch": 0.23, "grad_norm": 1.4764708280563354, "learning_rate": 3.8503017710497674e-05, "loss": 3.0676, "step": 2324 }, { "epoch": 0.23, "grad_norm": 1.3841642141342163, "learning_rate": 3.8483229444939154e-05, "loss": 3.066, "step": 2328 }, { "epoch": 0.23, "grad_norm": 1.3241703510284424, "learning_rate": 3.8463441179380627e-05, "loss": 3.1452, "step": 2332 }, { "epoch": 0.23, "grad_norm": 1.4375591278076172, "learning_rate": 3.8443652913822106e-05, "loss": 2.9545, "step": 2336 }, { "epoch": 0.23, "grad_norm": 1.4808233976364136, "learning_rate": 3.842386464826358e-05, "loss": 3.2496, "step": 2340 }, { "epoch": 0.23, "grad_norm": 1.2886080741882324, "learning_rate": 3.840407638270506e-05, "loss": 3.1226, "step": 2344 }, { "epoch": 0.23, "grad_norm": 1.4224919080734253, "learning_rate": 3.838428811714653e-05, "loss": 3.196, "step": 2348 }, { "epoch": 0.23, "grad_norm": 1.2736425399780273, "learning_rate": 3.836449985158801e-05, "loss": 3.1041, "step": 2352 }, { "epoch": 0.23, "grad_norm": 1.2044873237609863, "learning_rate": 3.8344711586029485e-05, "loss": 3.1979, "step": 2356 }, { "epoch": 0.23, "grad_norm": 1.2397358417510986, "learning_rate": 3.8324923320470964e-05, "loss": 3.0942, "step": 2360 }, { "epoch": 0.23, "grad_norm": 1.3050447702407837, "learning_rate": 3.830513505491244e-05, "loss": 3.1241, "step": 2364 }, { "epoch": 0.23, "grad_norm": 1.239399790763855, "learning_rate": 3.828534678935392e-05, "loss": 3.0856, "step": 2368 }, { "epoch": 0.23, "grad_norm": 1.5021380186080933, "learning_rate": 3.826555852379539e-05, "loss": 3.0362, "step": 2372 }, { "epoch": 0.24, "grad_norm": 1.2373955249786377, "learning_rate": 3.824577025823687e-05, "loss": 3.0351, "step": 2376 }, { "epoch": 0.24, "grad_norm": 1.3865991830825806, "learning_rate": 3.822598199267834e-05, "loss": 3.1084, "step": 2380 }, { "epoch": 0.24, "grad_norm": 1.2980425357818604, "learning_rate": 3.820619372711982e-05, "loss": 3.1217, "step": 2384 }, { "epoch": 0.24, "grad_norm": 1.5192391872406006, "learning_rate": 3.8186405461561295e-05, "loss": 3.1863, "step": 2388 }, { "epoch": 0.24, "grad_norm": 1.4014787673950195, "learning_rate": 3.8166617196002775e-05, "loss": 2.9426, "step": 2392 }, { "epoch": 0.24, "grad_norm": 1.4827275276184082, "learning_rate": 3.814682893044425e-05, "loss": 3.0311, "step": 2396 }, { "epoch": 0.24, "grad_norm": 1.7020756006240845, "learning_rate": 3.812704066488573e-05, "loss": 3.229, "step": 2400 }, { "epoch": 0.24, "grad_norm": 1.3895395994186401, "learning_rate": 3.81072523993272e-05, "loss": 3.1177, "step": 2404 }, { "epoch": 0.24, "grad_norm": 1.469364881515503, "learning_rate": 3.808746413376867e-05, "loss": 3.0913, "step": 2408 }, { "epoch": 0.24, "grad_norm": 1.2792705297470093, "learning_rate": 3.806767586821015e-05, "loss": 3.0318, "step": 2412 }, { "epoch": 0.24, "grad_norm": 1.4794644117355347, "learning_rate": 3.8047887602651626e-05, "loss": 3.1361, "step": 2416 }, { "epoch": 0.24, "grad_norm": 1.6199162006378174, "learning_rate": 3.8028099337093106e-05, "loss": 3.0314, "step": 2420 }, { "epoch": 0.24, "grad_norm": 1.303435206413269, "learning_rate": 3.800831107153458e-05, "loss": 3.0878, "step": 2424 }, { "epoch": 0.24, "grad_norm": 1.4103130102157593, "learning_rate": 3.798852280597606e-05, "loss": 3.1757, "step": 2428 }, { "epoch": 0.24, "grad_norm": 1.2540159225463867, "learning_rate": 3.796873454041753e-05, "loss": 3.2547, "step": 2432 }, { "epoch": 0.24, "grad_norm": 1.3388727903366089, "learning_rate": 3.794894627485901e-05, "loss": 3.2623, "step": 2436 }, { "epoch": 0.24, "grad_norm": 1.2804844379425049, "learning_rate": 3.7929158009300484e-05, "loss": 3.0305, "step": 2440 }, { "epoch": 0.24, "grad_norm": 1.4325406551361084, "learning_rate": 3.7909369743741964e-05, "loss": 3.1235, "step": 2444 }, { "epoch": 0.24, "grad_norm": 1.4079090356826782, "learning_rate": 3.7889581478183437e-05, "loss": 3.0626, "step": 2448 }, { "epoch": 0.24, "grad_norm": 1.3477818965911865, "learning_rate": 3.7869793212624916e-05, "loss": 3.0653, "step": 2452 }, { "epoch": 0.24, "grad_norm": 1.4144619703292847, "learning_rate": 3.785000494706639e-05, "loss": 3.0494, "step": 2456 }, { "epoch": 0.24, "grad_norm": 1.279802680015564, "learning_rate": 3.783021668150787e-05, "loss": 3.0711, "step": 2460 }, { "epoch": 0.24, "grad_norm": 1.32356858253479, "learning_rate": 3.781042841594934e-05, "loss": 3.0562, "step": 2464 }, { "epoch": 0.24, "grad_norm": 1.3349965810775757, "learning_rate": 3.779064015039082e-05, "loss": 3.0004, "step": 2468 }, { "epoch": 0.24, "grad_norm": 1.32090425491333, "learning_rate": 3.77708518848323e-05, "loss": 3.2252, "step": 2472 }, { "epoch": 0.24, "grad_norm": 1.335302472114563, "learning_rate": 3.7751063619273774e-05, "loss": 3.2227, "step": 2476 }, { "epoch": 0.25, "grad_norm": 1.4220561981201172, "learning_rate": 3.7731275353715254e-05, "loss": 3.0286, "step": 2480 }, { "epoch": 0.25, "grad_norm": 1.3459542989730835, "learning_rate": 3.771148708815673e-05, "loss": 3.0613, "step": 2484 }, { "epoch": 0.25, "grad_norm": 1.3894877433776855, "learning_rate": 3.7691698822598206e-05, "loss": 3.1524, "step": 2488 }, { "epoch": 0.25, "grad_norm": 1.3688431978225708, "learning_rate": 3.767191055703967e-05, "loss": 3.2377, "step": 2492 }, { "epoch": 0.25, "grad_norm": 1.2429877519607544, "learning_rate": 3.765212229148115e-05, "loss": 3.0412, "step": 2496 }, { "epoch": 0.25, "grad_norm": 1.6966149806976318, "learning_rate": 3.7632334025922625e-05, "loss": 2.9925, "step": 2500 }, { "epoch": 0.25, "grad_norm": 1.4553391933441162, "learning_rate": 3.7612545760364105e-05, "loss": 3.1891, "step": 2504 }, { "epoch": 0.25, "grad_norm": 1.3509441614151, "learning_rate": 3.759275749480558e-05, "loss": 3.0536, "step": 2508 }, { "epoch": 0.25, "grad_norm": 1.3694161176681519, "learning_rate": 3.757296922924706e-05, "loss": 3.2306, "step": 2512 }, { "epoch": 0.25, "grad_norm": 1.6577290296554565, "learning_rate": 3.755318096368853e-05, "loss": 3.1636, "step": 2516 }, { "epoch": 0.25, "grad_norm": 1.3860623836517334, "learning_rate": 3.753339269813001e-05, "loss": 3.1207, "step": 2520 }, { "epoch": 0.25, "grad_norm": 1.5849056243896484, "learning_rate": 3.751360443257148e-05, "loss": 3.0021, "step": 2524 }, { "epoch": 0.25, "grad_norm": 1.4191337823867798, "learning_rate": 3.749381616701296e-05, "loss": 3.1331, "step": 2528 }, { "epoch": 0.25, "grad_norm": 1.3399465084075928, "learning_rate": 3.747402790145444e-05, "loss": 3.2503, "step": 2532 }, { "epoch": 0.25, "grad_norm": 1.290664553642273, "learning_rate": 3.7454239635895916e-05, "loss": 3.114, "step": 2536 }, { "epoch": 0.25, "grad_norm": 1.2612792253494263, "learning_rate": 3.7434451370337395e-05, "loss": 3.1496, "step": 2540 }, { "epoch": 0.25, "grad_norm": 1.4211294651031494, "learning_rate": 3.741466310477887e-05, "loss": 3.0861, "step": 2544 }, { "epoch": 0.25, "grad_norm": 1.4412263631820679, "learning_rate": 3.739487483922035e-05, "loss": 3.1606, "step": 2548 }, { "epoch": 0.25, "grad_norm": 1.3415204286575317, "learning_rate": 3.737508657366182e-05, "loss": 3.0834, "step": 2552 }, { "epoch": 0.25, "grad_norm": 1.5815222263336182, "learning_rate": 3.73552983081033e-05, "loss": 2.9571, "step": 2556 }, { "epoch": 0.25, "grad_norm": 1.2796626091003418, "learning_rate": 3.7335510042544774e-05, "loss": 3.0611, "step": 2560 }, { "epoch": 0.25, "grad_norm": 1.4062576293945312, "learning_rate": 3.731572177698625e-05, "loss": 3.1557, "step": 2564 }, { "epoch": 0.25, "grad_norm": 1.3327045440673828, "learning_rate": 3.7295933511427726e-05, "loss": 2.9271, "step": 2568 }, { "epoch": 0.25, "grad_norm": 1.3985902070999146, "learning_rate": 3.7276145245869206e-05, "loss": 3.0211, "step": 2572 }, { "epoch": 0.25, "grad_norm": 1.494410514831543, "learning_rate": 3.725635698031067e-05, "loss": 2.9367, "step": 2576 }, { "epoch": 0.26, "grad_norm": 1.3023014068603516, "learning_rate": 3.723656871475215e-05, "loss": 3.0077, "step": 2580 }, { "epoch": 0.26, "grad_norm": 1.4200507402420044, "learning_rate": 3.7216780449193625e-05, "loss": 3.1887, "step": 2584 }, { "epoch": 0.26, "grad_norm": 1.4320076704025269, "learning_rate": 3.7196992183635104e-05, "loss": 2.9131, "step": 2588 }, { "epoch": 0.26, "grad_norm": 1.2773874998092651, "learning_rate": 3.7177203918076584e-05, "loss": 2.9756, "step": 2592 }, { "epoch": 0.26, "grad_norm": 1.3249197006225586, "learning_rate": 3.715741565251806e-05, "loss": 3.0856, "step": 2596 }, { "epoch": 0.26, "grad_norm": 1.574328899383545, "learning_rate": 3.713762738695954e-05, "loss": 2.9265, "step": 2600 }, { "epoch": 0.26, "grad_norm": 1.2989048957824707, "learning_rate": 3.711783912140101e-05, "loss": 3.1064, "step": 2604 }, { "epoch": 0.26, "grad_norm": 1.2440621852874756, "learning_rate": 3.709805085584249e-05, "loss": 3.096, "step": 2608 }, { "epoch": 0.26, "grad_norm": 1.2897485494613647, "learning_rate": 3.707826259028396e-05, "loss": 2.9686, "step": 2612 }, { "epoch": 0.26, "grad_norm": 1.4826507568359375, "learning_rate": 3.705847432472544e-05, "loss": 3.0006, "step": 2616 }, { "epoch": 0.26, "grad_norm": 1.1983760595321655, "learning_rate": 3.7038686059166915e-05, "loss": 3.1168, "step": 2620 }, { "epoch": 0.26, "grad_norm": 1.8693993091583252, "learning_rate": 3.7018897793608395e-05, "loss": 3.1812, "step": 2624 }, { "epoch": 0.26, "grad_norm": 1.3697383403778076, "learning_rate": 3.699910952804987e-05, "loss": 3.0964, "step": 2628 }, { "epoch": 0.26, "grad_norm": 1.4124512672424316, "learning_rate": 3.697932126249135e-05, "loss": 3.0129, "step": 2632 }, { "epoch": 0.26, "grad_norm": 1.1903616189956665, "learning_rate": 3.695953299693282e-05, "loss": 2.9507, "step": 2636 }, { "epoch": 0.26, "grad_norm": 1.5959855318069458, "learning_rate": 3.69397447313743e-05, "loss": 3.0499, "step": 2640 }, { "epoch": 0.26, "grad_norm": 1.506282091140747, "learning_rate": 3.691995646581577e-05, "loss": 3.0591, "step": 2644 }, { "epoch": 0.26, "grad_norm": 1.6031521558761597, "learning_rate": 3.690016820025725e-05, "loss": 2.9865, "step": 2648 }, { "epoch": 0.26, "grad_norm": 1.400120735168457, "learning_rate": 3.6880379934698726e-05, "loss": 3.0445, "step": 2652 }, { "epoch": 0.26, "grad_norm": 1.300150752067566, "learning_rate": 3.6860591669140205e-05, "loss": 3.0002, "step": 2656 }, { "epoch": 0.26, "grad_norm": 1.2600607872009277, "learning_rate": 3.684080340358168e-05, "loss": 3.0244, "step": 2660 }, { "epoch": 0.26, "grad_norm": 1.358891487121582, "learning_rate": 3.682101513802315e-05, "loss": 3.1586, "step": 2664 }, { "epoch": 0.26, "grad_norm": 1.4360027313232422, "learning_rate": 3.680122687246463e-05, "loss": 3.2983, "step": 2668 }, { "epoch": 0.26, "grad_norm": 1.2610455751419067, "learning_rate": 3.6781438606906104e-05, "loss": 3.0377, "step": 2672 }, { "epoch": 0.26, "grad_norm": 1.2880274057388306, "learning_rate": 3.6761650341347583e-05, "loss": 3.0465, "step": 2676 }, { "epoch": 0.27, "grad_norm": 1.603733777999878, "learning_rate": 3.6741862075789056e-05, "loss": 2.9944, "step": 2680 }, { "epoch": 0.27, "grad_norm": 1.4317169189453125, "learning_rate": 3.6722073810230536e-05, "loss": 3.1036, "step": 2684 }, { "epoch": 0.27, "grad_norm": 1.2459261417388916, "learning_rate": 3.670228554467201e-05, "loss": 2.8453, "step": 2688 }, { "epoch": 0.27, "grad_norm": 1.4440115690231323, "learning_rate": 3.668249727911349e-05, "loss": 3.0867, "step": 2692 }, { "epoch": 0.27, "grad_norm": 1.3969016075134277, "learning_rate": 3.666270901355496e-05, "loss": 3.0322, "step": 2696 }, { "epoch": 0.27, "grad_norm": 1.2696243524551392, "learning_rate": 3.664292074799644e-05, "loss": 3.1173, "step": 2700 }, { "epoch": 0.27, "grad_norm": 1.5136852264404297, "learning_rate": 3.6623132482437914e-05, "loss": 3.021, "step": 2704 }, { "epoch": 0.27, "grad_norm": 1.4152220487594604, "learning_rate": 3.6603344216879394e-05, "loss": 2.925, "step": 2708 }, { "epoch": 0.27, "grad_norm": 1.7811540365219116, "learning_rate": 3.658355595132087e-05, "loss": 3.0219, "step": 2712 }, { "epoch": 0.27, "grad_norm": 1.384444236755371, "learning_rate": 3.656376768576235e-05, "loss": 3.036, "step": 2716 }, { "epoch": 0.27, "grad_norm": 1.4472016096115112, "learning_rate": 3.654397942020382e-05, "loss": 3.1301, "step": 2720 }, { "epoch": 0.27, "grad_norm": 1.2954609394073486, "learning_rate": 3.65241911546453e-05, "loss": 3.0274, "step": 2724 }, { "epoch": 0.27, "grad_norm": 1.3119304180145264, "learning_rate": 3.650440288908677e-05, "loss": 3.0739, "step": 2728 }, { "epoch": 0.27, "grad_norm": 1.405961513519287, "learning_rate": 3.648461462352825e-05, "loss": 2.9857, "step": 2732 }, { "epoch": 0.27, "grad_norm": 1.532257318496704, "learning_rate": 3.6464826357969725e-05, "loss": 3.0947, "step": 2736 }, { "epoch": 0.27, "grad_norm": 1.4909120798110962, "learning_rate": 3.6445038092411205e-05, "loss": 3.1899, "step": 2740 }, { "epoch": 0.27, "grad_norm": 1.2964402437210083, "learning_rate": 3.642524982685268e-05, "loss": 2.9136, "step": 2744 }, { "epoch": 0.27, "grad_norm": 1.3999255895614624, "learning_rate": 3.640546156129415e-05, "loss": 3.1076, "step": 2748 }, { "epoch": 0.27, "grad_norm": 1.310899257659912, "learning_rate": 3.638567329573563e-05, "loss": 3.2241, "step": 2752 }, { "epoch": 0.27, "grad_norm": 1.2926506996154785, "learning_rate": 3.63658850301771e-05, "loss": 2.9607, "step": 2756 }, { "epoch": 0.27, "grad_norm": 1.4596292972564697, "learning_rate": 3.634609676461858e-05, "loss": 2.9604, "step": 2760 }, { "epoch": 0.27, "grad_norm": 1.4320791959762573, "learning_rate": 3.6326308499060056e-05, "loss": 3.1414, "step": 2764 }, { "epoch": 0.27, "grad_norm": 1.3382930755615234, "learning_rate": 3.6306520233501535e-05, "loss": 2.8968, "step": 2768 }, { "epoch": 0.27, "grad_norm": 1.3211889266967773, "learning_rate": 3.628673196794301e-05, "loss": 2.8903, "step": 2772 }, { "epoch": 0.27, "grad_norm": 1.323534369468689, "learning_rate": 3.626694370238449e-05, "loss": 2.9924, "step": 2776 }, { "epoch": 0.28, "grad_norm": 1.314077377319336, "learning_rate": 3.624715543682596e-05, "loss": 2.9786, "step": 2780 }, { "epoch": 0.28, "grad_norm": 1.7178696393966675, "learning_rate": 3.622736717126744e-05, "loss": 3.1973, "step": 2784 }, { "epoch": 0.28, "grad_norm": 1.5973659753799438, "learning_rate": 3.6207578905708914e-05, "loss": 2.9483, "step": 2788 }, { "epoch": 0.28, "grad_norm": 1.4203895330429077, "learning_rate": 3.618779064015039e-05, "loss": 2.9089, "step": 2792 }, { "epoch": 0.28, "grad_norm": 1.486110806465149, "learning_rate": 3.6168002374591866e-05, "loss": 3.0919, "step": 2796 }, { "epoch": 0.28, "grad_norm": 1.4070860147476196, "learning_rate": 3.6148214109033346e-05, "loss": 3.0393, "step": 2800 }, { "epoch": 0.28, "grad_norm": 1.4570317268371582, "learning_rate": 3.6128425843474826e-05, "loss": 3.0708, "step": 2804 }, { "epoch": 0.28, "grad_norm": 1.8382024765014648, "learning_rate": 3.61086375779163e-05, "loss": 3.1437, "step": 2808 }, { "epoch": 0.28, "grad_norm": 1.3467711210250854, "learning_rate": 3.608884931235778e-05, "loss": 3.047, "step": 2812 }, { "epoch": 0.28, "grad_norm": 1.209226369857788, "learning_rate": 3.606906104679925e-05, "loss": 2.9569, "step": 2816 }, { "epoch": 0.28, "grad_norm": 1.458509087562561, "learning_rate": 3.604927278124073e-05, "loss": 3.0049, "step": 2820 }, { "epoch": 0.28, "grad_norm": 1.4587996006011963, "learning_rate": 3.6029484515682204e-05, "loss": 2.9481, "step": 2824 }, { "epoch": 0.28, "grad_norm": 1.2717212438583374, "learning_rate": 3.600969625012368e-05, "loss": 2.9896, "step": 2828 }, { "epoch": 0.28, "grad_norm": 1.3680723905563354, "learning_rate": 3.598990798456515e-05, "loss": 3.0087, "step": 2832 }, { "epoch": 0.28, "grad_norm": 1.3884750604629517, "learning_rate": 3.597011971900663e-05, "loss": 3.0768, "step": 2836 }, { "epoch": 0.28, "grad_norm": 1.5019512176513672, "learning_rate": 3.59503314534481e-05, "loss": 2.9614, "step": 2840 }, { "epoch": 0.28, "grad_norm": 1.706018328666687, "learning_rate": 3.593054318788958e-05, "loss": 2.9818, "step": 2844 }, { "epoch": 0.28, "grad_norm": 1.3418842554092407, "learning_rate": 3.5910754922331055e-05, "loss": 2.9335, "step": 2848 }, { "epoch": 0.28, "grad_norm": 1.6864101886749268, "learning_rate": 3.5890966656772535e-05, "loss": 3.1771, "step": 2852 }, { "epoch": 0.28, "grad_norm": 1.488398551940918, "learning_rate": 3.587117839121401e-05, "loss": 2.9231, "step": 2856 }, { "epoch": 0.28, "grad_norm": 1.43323814868927, "learning_rate": 3.585139012565549e-05, "loss": 2.9957, "step": 2860 }, { "epoch": 0.28, "grad_norm": 1.4537419080734253, "learning_rate": 3.583160186009697e-05, "loss": 2.9353, "step": 2864 }, { "epoch": 0.28, "grad_norm": 1.3649722337722778, "learning_rate": 3.581181359453844e-05, "loss": 2.9827, "step": 2868 }, { "epoch": 0.28, "grad_norm": 1.420401692390442, "learning_rate": 3.579202532897992e-05, "loss": 3.0755, "step": 2872 }, { "epoch": 0.28, "grad_norm": 1.5330480337142944, "learning_rate": 3.577223706342139e-05, "loss": 3.0098, "step": 2876 }, { "epoch": 0.28, "grad_norm": 1.2478176355361938, "learning_rate": 3.575244879786287e-05, "loss": 3.0078, "step": 2880 }, { "epoch": 0.29, "grad_norm": 1.4124771356582642, "learning_rate": 3.5732660532304345e-05, "loss": 3.0649, "step": 2884 }, { "epoch": 0.29, "grad_norm": 1.5072100162506104, "learning_rate": 3.5712872266745825e-05, "loss": 3.1248, "step": 2888 }, { "epoch": 0.29, "grad_norm": 1.3434092998504639, "learning_rate": 3.56930840011873e-05, "loss": 3.1075, "step": 2892 }, { "epoch": 0.29, "grad_norm": 1.3640666007995605, "learning_rate": 3.567329573562878e-05, "loss": 2.9797, "step": 2896 }, { "epoch": 0.29, "grad_norm": 1.4746694564819336, "learning_rate": 3.565350747007025e-05, "loss": 2.9813, "step": 2900 }, { "epoch": 0.29, "grad_norm": 1.4786782264709473, "learning_rate": 3.563371920451173e-05, "loss": 3.111, "step": 2904 }, { "epoch": 0.29, "grad_norm": 1.471091628074646, "learning_rate": 3.56139309389532e-05, "loss": 2.9263, "step": 2908 }, { "epoch": 0.29, "grad_norm": 1.4228413105010986, "learning_rate": 3.5594142673394676e-05, "loss": 3.1006, "step": 2912 }, { "epoch": 0.29, "grad_norm": 1.2829701900482178, "learning_rate": 3.557435440783615e-05, "loss": 2.9774, "step": 2916 }, { "epoch": 0.29, "grad_norm": 1.3806427717208862, "learning_rate": 3.555456614227763e-05, "loss": 3.0992, "step": 2920 }, { "epoch": 0.29, "grad_norm": 1.5112413167953491, "learning_rate": 3.553477787671911e-05, "loss": 2.8947, "step": 2924 }, { "epoch": 0.29, "grad_norm": 1.3634873628616333, "learning_rate": 3.551498961116058e-05, "loss": 3.0911, "step": 2928 }, { "epoch": 0.29, "grad_norm": 1.299893856048584, "learning_rate": 3.549520134560206e-05, "loss": 3.0254, "step": 2932 }, { "epoch": 0.29, "grad_norm": 1.524451494216919, "learning_rate": 3.5475413080043534e-05, "loss": 2.9585, "step": 2936 }, { "epoch": 0.29, "grad_norm": 1.3160494565963745, "learning_rate": 3.5455624814485014e-05, "loss": 2.9677, "step": 2940 }, { "epoch": 0.29, "grad_norm": 1.422784686088562, "learning_rate": 3.543583654892649e-05, "loss": 3.0225, "step": 2944 }, { "epoch": 0.29, "grad_norm": 1.3876713514328003, "learning_rate": 3.5416048283367967e-05, "loss": 2.9325, "step": 2948 }, { "epoch": 0.29, "grad_norm": 1.3324222564697266, "learning_rate": 3.539626001780944e-05, "loss": 3.0236, "step": 2952 }, { "epoch": 0.29, "grad_norm": 1.5146019458770752, "learning_rate": 3.537647175225092e-05, "loss": 3.0919, "step": 2956 }, { "epoch": 0.29, "grad_norm": 1.576621174812317, "learning_rate": 3.535668348669239e-05, "loss": 3.0733, "step": 2960 }, { "epoch": 0.29, "grad_norm": 1.651235580444336, "learning_rate": 3.533689522113387e-05, "loss": 3.2117, "step": 2964 }, { "epoch": 0.29, "grad_norm": 1.377480387687683, "learning_rate": 3.5317106955575345e-05, "loss": 2.8459, "step": 2968 }, { "epoch": 0.29, "grad_norm": 1.462600588798523, "learning_rate": 3.5297318690016824e-05, "loss": 2.9828, "step": 2972 }, { "epoch": 0.29, "grad_norm": 1.320587396621704, "learning_rate": 3.52775304244583e-05, "loss": 2.9569, "step": 2976 }, { "epoch": 0.29, "grad_norm": 1.3665657043457031, "learning_rate": 3.525774215889978e-05, "loss": 2.9964, "step": 2980 }, { "epoch": 0.3, "grad_norm": 1.4614739418029785, "learning_rate": 3.523795389334125e-05, "loss": 2.9624, "step": 2984 }, { "epoch": 0.3, "grad_norm": 1.582934021949768, "learning_rate": 3.521816562778273e-05, "loss": 2.9726, "step": 2988 }, { "epoch": 0.3, "grad_norm": 1.4328818321228027, "learning_rate": 3.51983773622242e-05, "loss": 3.0368, "step": 2992 }, { "epoch": 0.3, "grad_norm": 1.3142259120941162, "learning_rate": 3.517858909666568e-05, "loss": 2.8137, "step": 2996 }, { "epoch": 0.3, "grad_norm": 1.4220584630966187, "learning_rate": 3.5158800831107155e-05, "loss": 3.0077, "step": 3000 }, { "epoch": 0.3, "grad_norm": 1.5171457529067993, "learning_rate": 3.513901256554863e-05, "loss": 2.9489, "step": 3004 }, { "epoch": 0.3, "grad_norm": 1.649688720703125, "learning_rate": 3.511922429999011e-05, "loss": 3.092, "step": 3008 }, { "epoch": 0.3, "grad_norm": 1.4517191648483276, "learning_rate": 3.509943603443158e-05, "loss": 2.8212, "step": 3012 }, { "epoch": 0.3, "grad_norm": 1.4763597249984741, "learning_rate": 3.507964776887306e-05, "loss": 3.145, "step": 3016 }, { "epoch": 0.3, "grad_norm": 1.3676544427871704, "learning_rate": 3.5059859503314534e-05, "loss": 3.0658, "step": 3020 }, { "epoch": 0.3, "grad_norm": 1.242475152015686, "learning_rate": 3.504007123775601e-05, "loss": 2.9705, "step": 3024 }, { "epoch": 0.3, "grad_norm": 1.509164810180664, "learning_rate": 3.5020282972197486e-05, "loss": 3.0112, "step": 3028 }, { "epoch": 0.3, "grad_norm": 1.3613548278808594, "learning_rate": 3.5000494706638966e-05, "loss": 2.9177, "step": 3032 }, { "epoch": 0.3, "grad_norm": 1.3977431058883667, "learning_rate": 3.498070644108044e-05, "loss": 3.0567, "step": 3036 }, { "epoch": 0.3, "grad_norm": 1.3139115571975708, "learning_rate": 3.496091817552192e-05, "loss": 2.9182, "step": 3040 }, { "epoch": 0.3, "grad_norm": 1.3715747594833374, "learning_rate": 3.494112990996339e-05, "loss": 3.0362, "step": 3044 }, { "epoch": 0.3, "grad_norm": 1.2754712104797363, "learning_rate": 3.492134164440487e-05, "loss": 2.8626, "step": 3048 }, { "epoch": 0.3, "grad_norm": 1.5584040880203247, "learning_rate": 3.4901553378846344e-05, "loss": 2.7835, "step": 3052 }, { "epoch": 0.3, "grad_norm": 1.359312891960144, "learning_rate": 3.4881765113287824e-05, "loss": 2.9583, "step": 3056 }, { "epoch": 0.3, "grad_norm": 1.4567357301712036, "learning_rate": 3.48619768477293e-05, "loss": 2.8088, "step": 3060 }, { "epoch": 0.3, "grad_norm": 1.3756351470947266, "learning_rate": 3.4842188582170776e-05, "loss": 2.9143, "step": 3064 }, { "epoch": 0.3, "grad_norm": 1.4864003658294678, "learning_rate": 3.482240031661225e-05, "loss": 3.0454, "step": 3068 }, { "epoch": 0.3, "grad_norm": 1.4282584190368652, "learning_rate": 3.480261205105373e-05, "loss": 2.863, "step": 3072 }, { "epoch": 0.3, "grad_norm": 1.3447134494781494, "learning_rate": 3.478282378549521e-05, "loss": 3.1251, "step": 3076 }, { "epoch": 0.3, "grad_norm": 1.3359801769256592, "learning_rate": 3.476303551993668e-05, "loss": 3.1131, "step": 3080 }, { "epoch": 0.31, "grad_norm": 1.6050612926483154, "learning_rate": 3.4743247254378155e-05, "loss": 2.8146, "step": 3084 }, { "epoch": 0.31, "grad_norm": 1.4369301795959473, "learning_rate": 3.472345898881963e-05, "loss": 3.0191, "step": 3088 }, { "epoch": 0.31, "grad_norm": 1.4939532279968262, "learning_rate": 3.470367072326111e-05, "loss": 2.8275, "step": 3092 }, { "epoch": 0.31, "grad_norm": 1.4874589443206787, "learning_rate": 3.468388245770258e-05, "loss": 2.9788, "step": 3096 }, { "epoch": 0.31, "grad_norm": 1.4248600006103516, "learning_rate": 3.466409419214406e-05, "loss": 2.8865, "step": 3100 }, { "epoch": 0.31, "grad_norm": 1.3846864700317383, "learning_rate": 3.464430592658553e-05, "loss": 2.6846, "step": 3104 }, { "epoch": 0.31, "grad_norm": 1.660626769065857, "learning_rate": 3.462451766102701e-05, "loss": 3.0301, "step": 3108 }, { "epoch": 0.31, "grad_norm": 1.3925195932388306, "learning_rate": 3.4604729395468486e-05, "loss": 2.893, "step": 3112 }, { "epoch": 0.31, "grad_norm": 1.4249762296676636, "learning_rate": 3.4584941129909965e-05, "loss": 2.9668, "step": 3116 }, { "epoch": 0.31, "grad_norm": 1.4822063446044922, "learning_rate": 3.456515286435144e-05, "loss": 2.7548, "step": 3120 }, { "epoch": 0.31, "grad_norm": 1.3979532718658447, "learning_rate": 3.454536459879292e-05, "loss": 2.9613, "step": 3124 }, { "epoch": 0.31, "grad_norm": 1.3357387781143188, "learning_rate": 3.452557633323439e-05, "loss": 3.0212, "step": 3128 }, { "epoch": 0.31, "grad_norm": 1.5871549844741821, "learning_rate": 3.450578806767587e-05, "loss": 2.8224, "step": 3132 }, { "epoch": 0.31, "grad_norm": 1.32512366771698, "learning_rate": 3.448599980211735e-05, "loss": 2.808, "step": 3136 }, { "epoch": 0.31, "grad_norm": 1.383267879486084, "learning_rate": 3.446621153655882e-05, "loss": 2.877, "step": 3140 }, { "epoch": 0.31, "grad_norm": 1.4139374494552612, "learning_rate": 3.44464232710003e-05, "loss": 2.7878, "step": 3144 }, { "epoch": 0.31, "grad_norm": 1.4128443002700806, "learning_rate": 3.4426635005441776e-05, "loss": 2.7679, "step": 3148 }, { "epoch": 0.31, "grad_norm": 1.4073232412338257, "learning_rate": 3.4406846739883256e-05, "loss": 2.8843, "step": 3152 }, { "epoch": 0.31, "grad_norm": 1.4618744850158691, "learning_rate": 3.438705847432473e-05, "loss": 3.0241, "step": 3156 }, { "epoch": 0.31, "grad_norm": 1.5047119855880737, "learning_rate": 3.436727020876621e-05, "loss": 3.0856, "step": 3160 }, { "epoch": 0.31, "grad_norm": 1.426459789276123, "learning_rate": 3.434748194320768e-05, "loss": 3.0243, "step": 3164 }, { "epoch": 0.31, "grad_norm": 1.3475228548049927, "learning_rate": 3.4327693677649154e-05, "loss": 2.9228, "step": 3168 }, { "epoch": 0.31, "grad_norm": 1.3559972047805786, "learning_rate": 3.430790541209063e-05, "loss": 2.9625, "step": 3172 }, { "epoch": 0.31, "grad_norm": 1.4422370195388794, "learning_rate": 3.428811714653211e-05, "loss": 2.8789, "step": 3176 }, { "epoch": 0.31, "grad_norm": 1.404948115348816, "learning_rate": 3.426832888097358e-05, "loss": 3.017, "step": 3180 }, { "epoch": 0.32, "grad_norm": 1.3570481538772583, "learning_rate": 3.424854061541506e-05, "loss": 2.7785, "step": 3184 }, { "epoch": 0.32, "grad_norm": 1.295659065246582, "learning_rate": 3.422875234985653e-05, "loss": 2.7838, "step": 3188 }, { "epoch": 0.32, "grad_norm": 1.3800030946731567, "learning_rate": 3.420896408429801e-05, "loss": 2.9911, "step": 3192 }, { "epoch": 0.32, "grad_norm": 1.3445378541946411, "learning_rate": 3.418917581873949e-05, "loss": 2.9278, "step": 3196 }, { "epoch": 0.32, "grad_norm": 1.3806275129318237, "learning_rate": 3.4169387553180965e-05, "loss": 2.9189, "step": 3200 }, { "epoch": 0.32, "grad_norm": 1.3769384622573853, "learning_rate": 3.4149599287622444e-05, "loss": 2.924, "step": 3204 }, { "epoch": 0.32, "grad_norm": 1.5868099927902222, "learning_rate": 3.412981102206392e-05, "loss": 2.9324, "step": 3208 }, { "epoch": 0.32, "grad_norm": 1.3903356790542603, "learning_rate": 3.41100227565054e-05, "loss": 2.7864, "step": 3212 }, { "epoch": 0.32, "grad_norm": 1.3730672597885132, "learning_rate": 3.409023449094687e-05, "loss": 2.934, "step": 3216 }, { "epoch": 0.32, "grad_norm": 1.3796464204788208, "learning_rate": 3.407044622538835e-05, "loss": 2.9562, "step": 3220 }, { "epoch": 0.32, "grad_norm": 1.617128849029541, "learning_rate": 3.405065795982982e-05, "loss": 3.0319, "step": 3224 }, { "epoch": 0.32, "grad_norm": 1.5228337049484253, "learning_rate": 3.40308696942713e-05, "loss": 2.875, "step": 3228 }, { "epoch": 0.32, "grad_norm": 1.3389008045196533, "learning_rate": 3.4011081428712775e-05, "loss": 2.7897, "step": 3232 }, { "epoch": 0.32, "grad_norm": 1.3617353439331055, "learning_rate": 3.3991293163154255e-05, "loss": 2.9357, "step": 3236 }, { "epoch": 0.32, "grad_norm": 1.3685081005096436, "learning_rate": 3.397150489759573e-05, "loss": 2.9993, "step": 3240 }, { "epoch": 0.32, "grad_norm": 1.4341316223144531, "learning_rate": 3.395171663203721e-05, "loss": 3.1416, "step": 3244 }, { "epoch": 0.32, "grad_norm": 1.3631387948989868, "learning_rate": 3.393192836647868e-05, "loss": 2.9022, "step": 3248 }, { "epoch": 0.32, "grad_norm": 1.3081270456314087, "learning_rate": 3.391214010092015e-05, "loss": 3.0646, "step": 3252 }, { "epoch": 0.32, "grad_norm": 1.3105672597885132, "learning_rate": 3.389235183536163e-05, "loss": 2.8765, "step": 3256 }, { "epoch": 0.32, "grad_norm": 1.881330966949463, "learning_rate": 3.3872563569803106e-05, "loss": 2.818, "step": 3260 }, { "epoch": 0.32, "grad_norm": 1.445777416229248, "learning_rate": 3.3852775304244586e-05, "loss": 2.786, "step": 3264 }, { "epoch": 0.32, "grad_norm": 1.4523110389709473, "learning_rate": 3.383298703868606e-05, "loss": 3.077, "step": 3268 }, { "epoch": 0.32, "grad_norm": 1.3693511486053467, "learning_rate": 3.381319877312754e-05, "loss": 2.9497, "step": 3272 }, { "epoch": 0.32, "grad_norm": 1.4519122838974, "learning_rate": 3.379341050756901e-05, "loss": 2.8887, "step": 3276 }, { "epoch": 0.32, "grad_norm": 1.2643674612045288, "learning_rate": 3.377362224201049e-05, "loss": 2.8879, "step": 3280 }, { "epoch": 0.32, "grad_norm": 1.5315420627593994, "learning_rate": 3.3753833976451964e-05, "loss": 2.8984, "step": 3284 }, { "epoch": 0.33, "grad_norm": 1.4942806959152222, "learning_rate": 3.3734045710893444e-05, "loss": 3.0528, "step": 3288 }, { "epoch": 0.33, "grad_norm": 1.8208317756652832, "learning_rate": 3.3714257445334917e-05, "loss": 2.8531, "step": 3292 }, { "epoch": 0.33, "grad_norm": 1.6653791666030884, "learning_rate": 3.3694469179776396e-05, "loss": 2.9974, "step": 3296 }, { "epoch": 0.33, "grad_norm": 1.4518672227859497, "learning_rate": 3.367468091421787e-05, "loss": 2.8841, "step": 3300 }, { "epoch": 0.33, "grad_norm": 1.373217225074768, "learning_rate": 3.365489264865935e-05, "loss": 2.8967, "step": 3304 }, { "epoch": 0.33, "grad_norm": 1.567406415939331, "learning_rate": 3.363510438310082e-05, "loss": 3.0588, "step": 3308 }, { "epoch": 0.33, "grad_norm": 1.296132206916809, "learning_rate": 3.36153161175423e-05, "loss": 2.9882, "step": 3312 }, { "epoch": 0.33, "grad_norm": 1.3636564016342163, "learning_rate": 3.3595527851983775e-05, "loss": 2.8311, "step": 3316 }, { "epoch": 0.33, "grad_norm": 1.4256870746612549, "learning_rate": 3.3575739586425254e-05, "loss": 2.8547, "step": 3320 }, { "epoch": 0.33, "grad_norm": 1.5391736030578613, "learning_rate": 3.355595132086673e-05, "loss": 3.0205, "step": 3324 }, { "epoch": 0.33, "grad_norm": 1.5316370725631714, "learning_rate": 3.353616305530821e-05, "loss": 2.8755, "step": 3328 }, { "epoch": 0.33, "grad_norm": 1.5849004983901978, "learning_rate": 3.351637478974968e-05, "loss": 2.8707, "step": 3332 }, { "epoch": 0.33, "grad_norm": 1.5995546579360962, "learning_rate": 3.349658652419115e-05, "loss": 2.839, "step": 3336 }, { "epoch": 0.33, "grad_norm": 1.3852463960647583, "learning_rate": 3.347679825863263e-05, "loss": 2.8356, "step": 3340 }, { "epoch": 0.33, "grad_norm": 1.2991340160369873, "learning_rate": 3.3457009993074105e-05, "loss": 2.8408, "step": 3344 }, { "epoch": 0.33, "grad_norm": 1.4714194536209106, "learning_rate": 3.3437221727515585e-05, "loss": 2.898, "step": 3348 }, { "epoch": 0.33, "grad_norm": 1.466454267501831, "learning_rate": 3.341743346195706e-05, "loss": 3.1129, "step": 3352 }, { "epoch": 0.33, "grad_norm": 1.3871067762374878, "learning_rate": 3.339764519639854e-05, "loss": 2.8544, "step": 3356 }, { "epoch": 0.33, "grad_norm": 1.4386447668075562, "learning_rate": 3.337785693084001e-05, "loss": 2.8696, "step": 3360 }, { "epoch": 0.33, "grad_norm": 1.4348955154418945, "learning_rate": 3.335806866528149e-05, "loss": 3.0176, "step": 3364 }, { "epoch": 0.33, "grad_norm": 1.3820732831954956, "learning_rate": 3.333828039972296e-05, "loss": 2.8816, "step": 3368 }, { "epoch": 0.33, "grad_norm": 1.4179422855377197, "learning_rate": 3.331849213416444e-05, "loss": 2.8338, "step": 3372 }, { "epoch": 0.33, "grad_norm": 1.338954210281372, "learning_rate": 3.3298703868605916e-05, "loss": 2.8356, "step": 3376 }, { "epoch": 0.33, "grad_norm": 1.5031296014785767, "learning_rate": 3.3278915603047396e-05, "loss": 3.0582, "step": 3380 }, { "epoch": 0.33, "grad_norm": 1.4210484027862549, "learning_rate": 3.325912733748887e-05, "loss": 2.7175, "step": 3384 }, { "epoch": 0.34, "grad_norm": 1.301172137260437, "learning_rate": 3.323933907193035e-05, "loss": 2.8864, "step": 3388 }, { "epoch": 0.34, "grad_norm": 1.3612853288650513, "learning_rate": 3.321955080637182e-05, "loss": 2.9919, "step": 3392 }, { "epoch": 0.34, "grad_norm": 1.8065041303634644, "learning_rate": 3.31997625408133e-05, "loss": 2.9198, "step": 3396 }, { "epoch": 0.34, "grad_norm": 1.6538633108139038, "learning_rate": 3.3179974275254774e-05, "loss": 2.9522, "step": 3400 }, { "epoch": 0.34, "grad_norm": 1.440852403640747, "learning_rate": 3.3160186009696254e-05, "loss": 3.0007, "step": 3404 }, { "epoch": 0.34, "grad_norm": 1.3337926864624023, "learning_rate": 3.314039774413773e-05, "loss": 3.0386, "step": 3408 }, { "epoch": 0.34, "grad_norm": 1.6055779457092285, "learning_rate": 3.3120609478579206e-05, "loss": 2.7901, "step": 3412 }, { "epoch": 0.34, "grad_norm": 1.7633087635040283, "learning_rate": 3.3100821213020686e-05, "loss": 2.9472, "step": 3416 }, { "epoch": 0.34, "grad_norm": 1.3165653944015503, "learning_rate": 3.308103294746215e-05, "loss": 2.8783, "step": 3420 }, { "epoch": 0.34, "grad_norm": 1.4195142984390259, "learning_rate": 3.306124468190363e-05, "loss": 2.985, "step": 3424 }, { "epoch": 0.34, "grad_norm": 1.3331257104873657, "learning_rate": 3.3041456416345105e-05, "loss": 2.7907, "step": 3428 }, { "epoch": 0.34, "grad_norm": 1.7462310791015625, "learning_rate": 3.3021668150786584e-05, "loss": 2.9509, "step": 3432 }, { "epoch": 0.34, "grad_norm": 1.522599220275879, "learning_rate": 3.300187988522806e-05, "loss": 2.8467, "step": 3436 }, { "epoch": 0.34, "grad_norm": 1.349449872970581, "learning_rate": 3.298209161966954e-05, "loss": 3.0331, "step": 3440 }, { "epoch": 0.34, "grad_norm": 1.2950477600097656, "learning_rate": 3.296230335411101e-05, "loss": 2.8946, "step": 3444 }, { "epoch": 0.34, "grad_norm": 1.456489086151123, "learning_rate": 3.294251508855249e-05, "loss": 2.9786, "step": 3448 }, { "epoch": 0.34, "grad_norm": 1.4597196578979492, "learning_rate": 3.292272682299396e-05, "loss": 2.8709, "step": 3452 }, { "epoch": 0.34, "grad_norm": 1.3608200550079346, "learning_rate": 3.290293855743544e-05, "loss": 2.8589, "step": 3456 }, { "epoch": 0.34, "grad_norm": 1.4015833139419556, "learning_rate": 3.2883150291876915e-05, "loss": 2.9041, "step": 3460 }, { "epoch": 0.34, "grad_norm": 1.5029760599136353, "learning_rate": 3.2863362026318395e-05, "loss": 2.9721, "step": 3464 }, { "epoch": 0.34, "grad_norm": 1.54849374294281, "learning_rate": 3.2843573760759875e-05, "loss": 2.9531, "step": 3468 }, { "epoch": 0.34, "grad_norm": 1.4009016752243042, "learning_rate": 3.282378549520135e-05, "loss": 2.8118, "step": 3472 }, { "epoch": 0.34, "grad_norm": 1.4506298303604126, "learning_rate": 3.280399722964283e-05, "loss": 2.8969, "step": 3476 }, { "epoch": 0.34, "grad_norm": 1.4417248964309692, "learning_rate": 3.27842089640843e-05, "loss": 2.8477, "step": 3480 }, { "epoch": 0.34, "grad_norm": 1.5801173448562622, "learning_rate": 3.276442069852578e-05, "loss": 3.0832, "step": 3484 }, { "epoch": 0.35, "grad_norm": 1.4388275146484375, "learning_rate": 3.274463243296725e-05, "loss": 2.938, "step": 3488 }, { "epoch": 0.35, "grad_norm": 1.5307226181030273, "learning_rate": 3.272484416740873e-05, "loss": 2.934, "step": 3492 }, { "epoch": 0.35, "grad_norm": 1.3734909296035767, "learning_rate": 3.2705055901850206e-05, "loss": 2.8592, "step": 3496 }, { "epoch": 0.35, "grad_norm": 1.4125416278839111, "learning_rate": 3.2685267636291685e-05, "loss": 2.8323, "step": 3500 }, { "epoch": 0.35, "grad_norm": 1.3895879983901978, "learning_rate": 3.266547937073315e-05, "loss": 3.025, "step": 3504 }, { "epoch": 0.35, "grad_norm": 1.3139837980270386, "learning_rate": 3.264569110517463e-05, "loss": 2.9754, "step": 3508 }, { "epoch": 0.35, "grad_norm": 1.4398940801620483, "learning_rate": 3.2625902839616104e-05, "loss": 2.7895, "step": 3512 }, { "epoch": 0.35, "grad_norm": 1.474226951599121, "learning_rate": 3.2606114574057584e-05, "loss": 2.7609, "step": 3516 }, { "epoch": 0.35, "grad_norm": 1.440250039100647, "learning_rate": 3.258632630849906e-05, "loss": 2.8932, "step": 3520 }, { "epoch": 0.35, "grad_norm": 1.4383162260055542, "learning_rate": 3.2566538042940536e-05, "loss": 2.8672, "step": 3524 }, { "epoch": 0.35, "grad_norm": 1.4906786680221558, "learning_rate": 3.2546749777382016e-05, "loss": 2.8692, "step": 3528 }, { "epoch": 0.35, "grad_norm": 1.699826717376709, "learning_rate": 3.252696151182349e-05, "loss": 2.7653, "step": 3532 }, { "epoch": 0.35, "grad_norm": 1.4905091524124146, "learning_rate": 3.250717324626497e-05, "loss": 2.9857, "step": 3536 }, { "epoch": 0.35, "grad_norm": 1.6657990217208862, "learning_rate": 3.248738498070644e-05, "loss": 3.0826, "step": 3540 }, { "epoch": 0.35, "grad_norm": 1.464480996131897, "learning_rate": 3.246759671514792e-05, "loss": 2.833, "step": 3544 }, { "epoch": 0.35, "grad_norm": 1.5183041095733643, "learning_rate": 3.2447808449589394e-05, "loss": 2.8327, "step": 3548 }, { "epoch": 0.35, "grad_norm": 1.3862593173980713, "learning_rate": 3.2428020184030874e-05, "loss": 2.7403, "step": 3552 }, { "epoch": 0.35, "grad_norm": 1.560221552848816, "learning_rate": 3.240823191847235e-05, "loss": 3.0822, "step": 3556 }, { "epoch": 0.35, "grad_norm": 1.4953581094741821, "learning_rate": 3.238844365291383e-05, "loss": 2.873, "step": 3560 }, { "epoch": 0.35, "grad_norm": 1.3668919801712036, "learning_rate": 3.23686553873553e-05, "loss": 2.8884, "step": 3564 }, { "epoch": 0.35, "grad_norm": 1.5106139183044434, "learning_rate": 3.234886712179678e-05, "loss": 2.9894, "step": 3568 }, { "epoch": 0.35, "grad_norm": 1.5733063220977783, "learning_rate": 3.232907885623825e-05, "loss": 2.6832, "step": 3572 }, { "epoch": 0.35, "grad_norm": 1.3589937686920166, "learning_rate": 3.230929059067973e-05, "loss": 2.8609, "step": 3576 }, { "epoch": 0.35, "grad_norm": 1.5051617622375488, "learning_rate": 3.2289502325121205e-05, "loss": 3.0103, "step": 3580 }, { "epoch": 0.35, "grad_norm": 1.423579454421997, "learning_rate": 3.2269714059562685e-05, "loss": 2.8885, "step": 3584 }, { "epoch": 0.36, "grad_norm": 1.4297691583633423, "learning_rate": 3.224992579400416e-05, "loss": 2.8898, "step": 3588 }, { "epoch": 0.36, "grad_norm": 1.5489140748977661, "learning_rate": 3.223013752844563e-05, "loss": 2.836, "step": 3592 }, { "epoch": 0.36, "grad_norm": 1.4612491130828857, "learning_rate": 3.221034926288711e-05, "loss": 2.7572, "step": 3596 }, { "epoch": 0.36, "grad_norm": 1.412541389465332, "learning_rate": 3.219056099732858e-05, "loss": 2.9328, "step": 3600 }, { "epoch": 0.36, "grad_norm": 1.5174839496612549, "learning_rate": 3.217077273177006e-05, "loss": 2.7265, "step": 3604 }, { "epoch": 0.36, "grad_norm": 1.732851505279541, "learning_rate": 3.2150984466211536e-05, "loss": 2.8555, "step": 3608 }, { "epoch": 0.36, "grad_norm": 1.3930217027664185, "learning_rate": 3.2131196200653016e-05, "loss": 2.8495, "step": 3612 }, { "epoch": 0.36, "grad_norm": 1.9050196409225464, "learning_rate": 3.211140793509449e-05, "loss": 2.7914, "step": 3616 }, { "epoch": 0.36, "grad_norm": 1.423581600189209, "learning_rate": 3.209161966953597e-05, "loss": 2.8277, "step": 3620 }, { "epoch": 0.36, "grad_norm": 1.490317702293396, "learning_rate": 3.207183140397744e-05, "loss": 2.8735, "step": 3624 }, { "epoch": 0.36, "grad_norm": 1.434913992881775, "learning_rate": 3.205204313841892e-05, "loss": 2.8089, "step": 3628 }, { "epoch": 0.36, "grad_norm": 1.666888952255249, "learning_rate": 3.2032254872860394e-05, "loss": 2.791, "step": 3632 }, { "epoch": 0.36, "grad_norm": 1.7636072635650635, "learning_rate": 3.2012466607301873e-05, "loss": 2.8418, "step": 3636 }, { "epoch": 0.36, "grad_norm": 1.707155704498291, "learning_rate": 3.1992678341743346e-05, "loss": 2.7568, "step": 3640 }, { "epoch": 0.36, "grad_norm": 1.5901198387145996, "learning_rate": 3.1972890076184826e-05, "loss": 2.7845, "step": 3644 }, { "epoch": 0.36, "grad_norm": 1.4522852897644043, "learning_rate": 3.19531018106263e-05, "loss": 2.8471, "step": 3648 }, { "epoch": 0.36, "grad_norm": 1.348705768585205, "learning_rate": 3.193331354506778e-05, "loss": 3.13, "step": 3652 }, { "epoch": 0.36, "grad_norm": 1.645375370979309, "learning_rate": 3.191352527950925e-05, "loss": 2.8477, "step": 3656 }, { "epoch": 0.36, "grad_norm": 1.5673881769180298, "learning_rate": 3.189373701395073e-05, "loss": 2.6724, "step": 3660 }, { "epoch": 0.36, "grad_norm": 1.3590004444122314, "learning_rate": 3.1873948748392204e-05, "loss": 2.8683, "step": 3664 }, { "epoch": 0.36, "grad_norm": 1.4108936786651611, "learning_rate": 3.1854160482833684e-05, "loss": 2.8131, "step": 3668 }, { "epoch": 0.36, "grad_norm": 1.6254466772079468, "learning_rate": 3.183437221727516e-05, "loss": 2.9229, "step": 3672 }, { "epoch": 0.36, "grad_norm": 1.460128903388977, "learning_rate": 3.181458395171663e-05, "loss": 2.7728, "step": 3676 }, { "epoch": 0.36, "grad_norm": 1.473266363143921, "learning_rate": 3.179479568615811e-05, "loss": 2.8825, "step": 3680 }, { "epoch": 0.36, "grad_norm": 1.422609567642212, "learning_rate": 3.177500742059958e-05, "loss": 2.8954, "step": 3684 }, { "epoch": 0.36, "grad_norm": 1.5324984788894653, "learning_rate": 3.175521915504106e-05, "loss": 2.8591, "step": 3688 }, { "epoch": 0.37, "grad_norm": 1.472915530204773, "learning_rate": 3.1735430889482535e-05, "loss": 2.7172, "step": 3692 }, { "epoch": 0.37, "grad_norm": 1.3716154098510742, "learning_rate": 3.1715642623924015e-05, "loss": 2.7983, "step": 3696 }, { "epoch": 0.37, "grad_norm": 1.2956911325454712, "learning_rate": 3.169585435836549e-05, "loss": 2.8473, "step": 3700 }, { "epoch": 0.37, "grad_norm": 1.3868067264556885, "learning_rate": 3.167606609280697e-05, "loss": 2.7424, "step": 3704 }, { "epoch": 0.37, "grad_norm": 1.554824709892273, "learning_rate": 3.165627782724844e-05, "loss": 2.9807, "step": 3708 }, { "epoch": 0.37, "grad_norm": 2.6641464233398438, "learning_rate": 3.163648956168992e-05, "loss": 3.1424, "step": 3712 }, { "epoch": 0.37, "grad_norm": 1.5256997346878052, "learning_rate": 3.161670129613139e-05, "loss": 2.7976, "step": 3716 }, { "epoch": 0.37, "grad_norm": 1.4320893287658691, "learning_rate": 3.159691303057287e-05, "loss": 2.7902, "step": 3720 }, { "epoch": 0.37, "grad_norm": 1.3281612396240234, "learning_rate": 3.1577124765014346e-05, "loss": 2.673, "step": 3724 }, { "epoch": 0.37, "grad_norm": 1.5591663122177124, "learning_rate": 3.1557336499455825e-05, "loss": 2.6409, "step": 3728 }, { "epoch": 0.37, "grad_norm": 1.4519292116165161, "learning_rate": 3.15375482338973e-05, "loss": 2.9247, "step": 3732 }, { "epoch": 0.37, "grad_norm": 1.34962797164917, "learning_rate": 3.151775996833878e-05, "loss": 2.8384, "step": 3736 }, { "epoch": 0.37, "grad_norm": 1.7062203884124756, "learning_rate": 3.149797170278026e-05, "loss": 2.7923, "step": 3740 }, { "epoch": 0.37, "grad_norm": 1.5387376546859741, "learning_rate": 3.147818343722173e-05, "loss": 3.0167, "step": 3744 }, { "epoch": 0.37, "grad_norm": 1.3127942085266113, "learning_rate": 3.145839517166321e-05, "loss": 2.9422, "step": 3748 }, { "epoch": 0.37, "grad_norm": 1.4698482751846313, "learning_rate": 3.143860690610468e-05, "loss": 2.9139, "step": 3752 }, { "epoch": 0.37, "grad_norm": 1.472330927848816, "learning_rate": 3.1418818640546156e-05, "loss": 2.8468, "step": 3756 }, { "epoch": 0.37, "grad_norm": 1.507949948310852, "learning_rate": 3.139903037498763e-05, "loss": 2.8117, "step": 3760 }, { "epoch": 0.37, "grad_norm": 1.3722580671310425, "learning_rate": 3.137924210942911e-05, "loss": 3.0344, "step": 3764 }, { "epoch": 0.37, "grad_norm": 1.6335688829421997, "learning_rate": 3.135945384387058e-05, "loss": 2.9591, "step": 3768 }, { "epoch": 0.37, "grad_norm": 1.608832836151123, "learning_rate": 3.133966557831206e-05, "loss": 2.9696, "step": 3772 }, { "epoch": 0.37, "grad_norm": 1.4067091941833496, "learning_rate": 3.1319877312753535e-05, "loss": 2.797, "step": 3776 }, { "epoch": 0.37, "grad_norm": 1.439083218574524, "learning_rate": 3.1300089047195014e-05, "loss": 2.9332, "step": 3780 }, { "epoch": 0.37, "grad_norm": 1.4436142444610596, "learning_rate": 3.128030078163649e-05, "loss": 2.9546, "step": 3784 }, { "epoch": 0.37, "grad_norm": 1.5676676034927368, "learning_rate": 3.126051251607797e-05, "loss": 2.9537, "step": 3788 }, { "epoch": 0.38, "grad_norm": 1.564913272857666, "learning_rate": 3.124072425051944e-05, "loss": 3.0004, "step": 3792 }, { "epoch": 0.38, "grad_norm": 1.5197542905807495, "learning_rate": 3.122093598496092e-05, "loss": 2.9121, "step": 3796 }, { "epoch": 0.38, "grad_norm": 1.467848539352417, "learning_rate": 3.12011477194024e-05, "loss": 2.8398, "step": 3800 }, { "epoch": 0.38, "grad_norm": 1.415881872177124, "learning_rate": 3.118135945384387e-05, "loss": 2.8903, "step": 3804 }, { "epoch": 0.38, "grad_norm": 1.3919771909713745, "learning_rate": 3.116157118828535e-05, "loss": 2.862, "step": 3808 }, { "epoch": 0.38, "grad_norm": 1.5213255882263184, "learning_rate": 3.1141782922726825e-05, "loss": 2.876, "step": 3812 }, { "epoch": 0.38, "grad_norm": 1.4630500078201294, "learning_rate": 3.1121994657168305e-05, "loss": 2.9567, "step": 3816 }, { "epoch": 0.38, "grad_norm": 1.4120430946350098, "learning_rate": 3.110220639160978e-05, "loss": 2.7293, "step": 3820 }, { "epoch": 0.38, "grad_norm": 1.336676001548767, "learning_rate": 3.108241812605126e-05, "loss": 2.8274, "step": 3824 }, { "epoch": 0.38, "grad_norm": 1.5005687475204468, "learning_rate": 3.106262986049273e-05, "loss": 2.9078, "step": 3828 }, { "epoch": 0.38, "grad_norm": 1.5053759813308716, "learning_rate": 3.104284159493421e-05, "loss": 2.8964, "step": 3832 }, { "epoch": 0.38, "grad_norm": 1.764050841331482, "learning_rate": 3.102305332937568e-05, "loss": 2.7096, "step": 3836 }, { "epoch": 0.38, "grad_norm": 1.731428623199463, "learning_rate": 3.100326506381716e-05, "loss": 2.8654, "step": 3840 }, { "epoch": 0.38, "grad_norm": 1.5615731477737427, "learning_rate": 3.098347679825863e-05, "loss": 2.7207, "step": 3844 }, { "epoch": 0.38, "grad_norm": 1.3528928756713867, "learning_rate": 3.096368853270011e-05, "loss": 2.7666, "step": 3848 }, { "epoch": 0.38, "grad_norm": 1.4369572401046753, "learning_rate": 3.094390026714158e-05, "loss": 3.0385, "step": 3852 }, { "epoch": 0.38, "grad_norm": 1.7137233018875122, "learning_rate": 3.092411200158306e-05, "loss": 2.7059, "step": 3856 }, { "epoch": 0.38, "grad_norm": 1.4573568105697632, "learning_rate": 3.090432373602454e-05, "loss": 2.6826, "step": 3860 }, { "epoch": 0.38, "grad_norm": 1.6437382698059082, "learning_rate": 3.0884535470466014e-05, "loss": 2.8949, "step": 3864 }, { "epoch": 0.38, "grad_norm": 1.5055683851242065, "learning_rate": 3.086474720490749e-05, "loss": 2.7713, "step": 3868 }, { "epoch": 0.38, "grad_norm": 1.3726445436477661, "learning_rate": 3.0844958939348966e-05, "loss": 2.7203, "step": 3872 }, { "epoch": 0.38, "grad_norm": 1.4541021585464478, "learning_rate": 3.0825170673790446e-05, "loss": 2.6903, "step": 3876 }, { "epoch": 0.38, "grad_norm": 1.6873700618743896, "learning_rate": 3.080538240823192e-05, "loss": 2.6891, "step": 3880 }, { "epoch": 0.38, "grad_norm": 1.5541645288467407, "learning_rate": 3.07855941426734e-05, "loss": 2.7473, "step": 3884 }, { "epoch": 0.38, "grad_norm": 1.5389822721481323, "learning_rate": 3.076580587711487e-05, "loss": 2.7178, "step": 3888 }, { "epoch": 0.39, "grad_norm": 1.3678780794143677, "learning_rate": 3.074601761155635e-05, "loss": 2.914, "step": 3892 }, { "epoch": 0.39, "grad_norm": 1.504895806312561, "learning_rate": 3.0726229345997824e-05, "loss": 2.635, "step": 3896 }, { "epoch": 0.39, "grad_norm": 1.5606142282485962, "learning_rate": 3.0706441080439304e-05, "loss": 2.8645, "step": 3900 }, { "epoch": 0.39, "grad_norm": 1.5420433282852173, "learning_rate": 3.068665281488078e-05, "loss": 3.0066, "step": 3904 }, { "epoch": 0.39, "grad_norm": 1.494759202003479, "learning_rate": 3.0666864549322257e-05, "loss": 2.8071, "step": 3908 }, { "epoch": 0.39, "grad_norm": 1.386305570602417, "learning_rate": 3.064707628376373e-05, "loss": 2.9353, "step": 3912 }, { "epoch": 0.39, "grad_norm": 1.5420827865600586, "learning_rate": 3.062728801820521e-05, "loss": 2.9036, "step": 3916 }, { "epoch": 0.39, "grad_norm": 1.4949458837509155, "learning_rate": 3.060749975264668e-05, "loss": 2.5632, "step": 3920 }, { "epoch": 0.39, "grad_norm": 1.6547415256500244, "learning_rate": 3.058771148708816e-05, "loss": 2.8553, "step": 3924 }, { "epoch": 0.39, "grad_norm": 2.1204848289489746, "learning_rate": 3.0567923221529635e-05, "loss": 2.9203, "step": 3928 }, { "epoch": 0.39, "grad_norm": 1.4536625146865845, "learning_rate": 3.054813495597111e-05, "loss": 2.7916, "step": 3932 }, { "epoch": 0.39, "grad_norm": 1.4841111898422241, "learning_rate": 3.052834669041259e-05, "loss": 2.7978, "step": 3936 }, { "epoch": 0.39, "grad_norm": 1.5112613439559937, "learning_rate": 3.050855842485406e-05, "loss": 2.7701, "step": 3940 }, { "epoch": 0.39, "grad_norm": 1.4966477155685425, "learning_rate": 3.0488770159295537e-05, "loss": 2.6693, "step": 3944 }, { "epoch": 0.39, "grad_norm": 1.3681539297103882, "learning_rate": 3.0468981893737013e-05, "loss": 2.7811, "step": 3948 }, { "epoch": 0.39, "grad_norm": 1.4147274494171143, "learning_rate": 3.044919362817849e-05, "loss": 2.7958, "step": 3952 }, { "epoch": 0.39, "grad_norm": 1.451531171798706, "learning_rate": 3.0429405362619966e-05, "loss": 2.8591, "step": 3956 }, { "epoch": 0.39, "grad_norm": 1.4994832277297974, "learning_rate": 3.0409617097061442e-05, "loss": 2.7079, "step": 3960 }, { "epoch": 0.39, "grad_norm": 1.4195365905761719, "learning_rate": 3.038982883150292e-05, "loss": 2.8642, "step": 3964 }, { "epoch": 0.39, "grad_norm": 1.4782614707946777, "learning_rate": 3.0370040565944398e-05, "loss": 2.8319, "step": 3968 }, { "epoch": 0.39, "grad_norm": 1.6576443910598755, "learning_rate": 3.0350252300385874e-05, "loss": 2.6855, "step": 3972 }, { "epoch": 0.39, "grad_norm": 1.8916692733764648, "learning_rate": 3.033046403482735e-05, "loss": 2.8664, "step": 3976 }, { "epoch": 0.39, "grad_norm": 1.4349229335784912, "learning_rate": 3.0310675769268827e-05, "loss": 2.7252, "step": 3980 }, { "epoch": 0.39, "grad_norm": 1.5039000511169434, "learning_rate": 3.0290887503710303e-05, "loss": 2.8368, "step": 3984 }, { "epoch": 0.39, "grad_norm": 1.552319884300232, "learning_rate": 3.027109923815178e-05, "loss": 2.7585, "step": 3988 }, { "epoch": 0.39, "grad_norm": 1.618980884552002, "learning_rate": 3.0251310972593256e-05, "loss": 2.8395, "step": 3992 }, { "epoch": 0.4, "grad_norm": 1.370418667793274, "learning_rate": 3.0231522707034732e-05, "loss": 2.8074, "step": 3996 }, { "epoch": 0.4, "grad_norm": 1.3681377172470093, "learning_rate": 3.021173444147621e-05, "loss": 2.8299, "step": 4000 }, { "epoch": 0.4, "grad_norm": 1.4201312065124512, "learning_rate": 3.0191946175917685e-05, "loss": 2.8259, "step": 4004 }, { "epoch": 0.4, "grad_norm": 1.8192863464355469, "learning_rate": 3.017215791035916e-05, "loss": 2.9578, "step": 4008 }, { "epoch": 0.4, "grad_norm": 1.5189778804779053, "learning_rate": 3.015236964480063e-05, "loss": 2.7655, "step": 4012 }, { "epoch": 0.4, "grad_norm": 1.401796579360962, "learning_rate": 3.0132581379242107e-05, "loss": 2.7549, "step": 4016 }, { "epoch": 0.4, "grad_norm": 1.5482715368270874, "learning_rate": 3.0112793113683583e-05, "loss": 2.5986, "step": 4020 }, { "epoch": 0.4, "grad_norm": 1.446227788925171, "learning_rate": 3.0093004848125063e-05, "loss": 3.0037, "step": 4024 }, { "epoch": 0.4, "grad_norm": 1.5745714902877808, "learning_rate": 3.007321658256654e-05, "loss": 2.9434, "step": 4028 }, { "epoch": 0.4, "grad_norm": 1.4580100774765015, "learning_rate": 3.0053428317008016e-05, "loss": 2.7611, "step": 4032 }, { "epoch": 0.4, "grad_norm": 1.6320377588272095, "learning_rate": 3.0033640051449492e-05, "loss": 2.7802, "step": 4036 }, { "epoch": 0.4, "grad_norm": 1.5840559005737305, "learning_rate": 3.001385178589097e-05, "loss": 2.8459, "step": 4040 }, { "epoch": 0.4, "grad_norm": 1.4639407396316528, "learning_rate": 2.9994063520332445e-05, "loss": 2.8289, "step": 4044 }, { "epoch": 0.4, "grad_norm": 1.4280468225479126, "learning_rate": 2.997427525477392e-05, "loss": 2.6556, "step": 4048 }, { "epoch": 0.4, "grad_norm": 1.4254354238510132, "learning_rate": 2.9954486989215397e-05, "loss": 2.8135, "step": 4052 }, { "epoch": 0.4, "grad_norm": 1.5892139673233032, "learning_rate": 2.9934698723656874e-05, "loss": 2.8655, "step": 4056 }, { "epoch": 0.4, "grad_norm": 1.422926425933838, "learning_rate": 2.991491045809835e-05, "loss": 2.9126, "step": 4060 }, { "epoch": 0.4, "grad_norm": 1.5437527894973755, "learning_rate": 2.9895122192539826e-05, "loss": 2.7983, "step": 4064 }, { "epoch": 0.4, "grad_norm": 1.6389095783233643, "learning_rate": 2.9875333926981303e-05, "loss": 2.8345, "step": 4068 }, { "epoch": 0.4, "grad_norm": 1.9963791370391846, "learning_rate": 2.985554566142278e-05, "loss": 2.8074, "step": 4072 }, { "epoch": 0.4, "grad_norm": 1.4846253395080566, "learning_rate": 2.9835757395864255e-05, "loss": 2.8744, "step": 4076 }, { "epoch": 0.4, "grad_norm": 1.5970510244369507, "learning_rate": 2.981596913030573e-05, "loss": 2.8114, "step": 4080 }, { "epoch": 0.4, "grad_norm": 1.6389985084533691, "learning_rate": 2.9796180864747208e-05, "loss": 2.6592, "step": 4084 }, { "epoch": 0.4, "grad_norm": 1.4449435472488403, "learning_rate": 2.9776392599188684e-05, "loss": 2.7522, "step": 4088 }, { "epoch": 0.4, "grad_norm": 1.3930374383926392, "learning_rate": 2.975660433363016e-05, "loss": 2.6228, "step": 4092 }, { "epoch": 0.41, "grad_norm": 1.4793918132781982, "learning_rate": 2.9736816068071633e-05, "loss": 2.7966, "step": 4096 }, { "epoch": 0.41, "grad_norm": 1.457022786140442, "learning_rate": 2.971702780251311e-05, "loss": 2.8147, "step": 4100 }, { "epoch": 0.41, "grad_norm": 1.3806568384170532, "learning_rate": 2.9697239536954586e-05, "loss": 2.7444, "step": 4104 }, { "epoch": 0.41, "grad_norm": 1.445950984954834, "learning_rate": 2.9677451271396062e-05, "loss": 2.6564, "step": 4108 }, { "epoch": 0.41, "grad_norm": 1.4374557733535767, "learning_rate": 2.965766300583754e-05, "loss": 2.8971, "step": 4112 }, { "epoch": 0.41, "grad_norm": 1.4371333122253418, "learning_rate": 2.9637874740279015e-05, "loss": 2.9071, "step": 4116 }, { "epoch": 0.41, "grad_norm": 1.5124506950378418, "learning_rate": 2.961808647472049e-05, "loss": 2.9047, "step": 4120 }, { "epoch": 0.41, "grad_norm": 1.5280250310897827, "learning_rate": 2.9598298209161968e-05, "loss": 2.8296, "step": 4124 }, { "epoch": 0.41, "grad_norm": 1.3567237854003906, "learning_rate": 2.9578509943603444e-05, "loss": 2.757, "step": 4128 }, { "epoch": 0.41, "grad_norm": 1.5457067489624023, "learning_rate": 2.955872167804492e-05, "loss": 2.8907, "step": 4132 }, { "epoch": 0.41, "grad_norm": 1.4458136558532715, "learning_rate": 2.9538933412486397e-05, "loss": 2.7835, "step": 4136 }, { "epoch": 0.41, "grad_norm": 1.4487221240997314, "learning_rate": 2.9519145146927873e-05, "loss": 2.7497, "step": 4140 }, { "epoch": 0.41, "grad_norm": 1.5274755954742432, "learning_rate": 2.949935688136935e-05, "loss": 2.7221, "step": 4144 }, { "epoch": 0.41, "grad_norm": 1.5352815389633179, "learning_rate": 2.9479568615810826e-05, "loss": 2.8334, "step": 4148 }, { "epoch": 0.41, "grad_norm": 1.5403484106063843, "learning_rate": 2.9459780350252302e-05, "loss": 2.7065, "step": 4152 }, { "epoch": 0.41, "grad_norm": 1.4784417152404785, "learning_rate": 2.9439992084693778e-05, "loss": 2.6853, "step": 4156 }, { "epoch": 0.41, "grad_norm": 1.4991648197174072, "learning_rate": 2.9420203819135255e-05, "loss": 2.6927, "step": 4160 }, { "epoch": 0.41, "grad_norm": 1.5489524602890015, "learning_rate": 2.940041555357673e-05, "loss": 2.8819, "step": 4164 }, { "epoch": 0.41, "grad_norm": 1.5099831819534302, "learning_rate": 2.9380627288018207e-05, "loss": 2.7281, "step": 4168 }, { "epoch": 0.41, "grad_norm": 1.400437355041504, "learning_rate": 2.9360839022459684e-05, "loss": 2.706, "step": 4172 }, { "epoch": 0.41, "grad_norm": 1.4548041820526123, "learning_rate": 2.9341050756901163e-05, "loss": 2.7494, "step": 4176 }, { "epoch": 0.41, "grad_norm": 1.3302807807922363, "learning_rate": 2.9321262491342633e-05, "loss": 2.9415, "step": 4180 }, { "epoch": 0.41, "grad_norm": 1.5597517490386963, "learning_rate": 2.930147422578411e-05, "loss": 2.8837, "step": 4184 }, { "epoch": 0.41, "grad_norm": 1.456819772720337, "learning_rate": 2.9281685960225585e-05, "loss": 2.7019, "step": 4188 }, { "epoch": 0.41, "grad_norm": 1.4813134670257568, "learning_rate": 2.9261897694667062e-05, "loss": 2.7553, "step": 4192 }, { "epoch": 0.42, "grad_norm": 1.4024277925491333, "learning_rate": 2.9242109429108538e-05, "loss": 2.7829, "step": 4196 }, { "epoch": 0.42, "grad_norm": 1.5183507204055786, "learning_rate": 2.9222321163550014e-05, "loss": 2.6526, "step": 4200 }, { "epoch": 0.42, "grad_norm": 1.73296320438385, "learning_rate": 2.920253289799149e-05, "loss": 2.8376, "step": 4204 }, { "epoch": 0.42, "grad_norm": 1.4100421667099, "learning_rate": 2.9182744632432967e-05, "loss": 2.8162, "step": 4208 }, { "epoch": 0.42, "grad_norm": 1.3729844093322754, "learning_rate": 2.9162956366874443e-05, "loss": 2.6933, "step": 4212 }, { "epoch": 0.42, "grad_norm": 1.381945252418518, "learning_rate": 2.914316810131592e-05, "loss": 2.6763, "step": 4216 }, { "epoch": 0.42, "grad_norm": 1.4343644380569458, "learning_rate": 2.9123379835757396e-05, "loss": 2.6836, "step": 4220 }, { "epoch": 0.42, "grad_norm": 1.4591138362884521, "learning_rate": 2.9103591570198872e-05, "loss": 2.6811, "step": 4224 }, { "epoch": 0.42, "grad_norm": 1.4086369276046753, "learning_rate": 2.908380330464035e-05, "loss": 2.7654, "step": 4228 }, { "epoch": 0.42, "grad_norm": 1.3589953184127808, "learning_rate": 2.9064015039081825e-05, "loss": 2.9029, "step": 4232 }, { "epoch": 0.42, "grad_norm": 1.4481515884399414, "learning_rate": 2.9044226773523305e-05, "loss": 2.8348, "step": 4236 }, { "epoch": 0.42, "grad_norm": 1.4589226245880127, "learning_rate": 2.902443850796478e-05, "loss": 2.7427, "step": 4240 }, { "epoch": 0.42, "grad_norm": 1.3930855989456177, "learning_rate": 2.9004650242406257e-05, "loss": 2.8394, "step": 4244 }, { "epoch": 0.42, "grad_norm": 1.670841097831726, "learning_rate": 2.8984861976847734e-05, "loss": 2.8234, "step": 4248 }, { "epoch": 0.42, "grad_norm": 2.0551598072052, "learning_rate": 2.896507371128921e-05, "loss": 2.8201, "step": 4252 }, { "epoch": 0.42, "grad_norm": 1.6182607412338257, "learning_rate": 2.8945285445730686e-05, "loss": 2.7768, "step": 4256 }, { "epoch": 0.42, "grad_norm": 1.6682448387145996, "learning_rate": 2.8925497180172163e-05, "loss": 2.7996, "step": 4260 }, { "epoch": 0.42, "grad_norm": 1.5060317516326904, "learning_rate": 2.8905708914613632e-05, "loss": 2.9885, "step": 4264 }, { "epoch": 0.42, "grad_norm": 1.6912871599197388, "learning_rate": 2.888592064905511e-05, "loss": 2.7473, "step": 4268 }, { "epoch": 0.42, "grad_norm": 1.4391567707061768, "learning_rate": 2.8866132383496585e-05, "loss": 2.7497, "step": 4272 }, { "epoch": 0.42, "grad_norm": 1.4954835176467896, "learning_rate": 2.884634411793806e-05, "loss": 2.724, "step": 4276 }, { "epoch": 0.42, "grad_norm": 1.4786354303359985, "learning_rate": 2.8826555852379537e-05, "loss": 2.9594, "step": 4280 }, { "epoch": 0.42, "grad_norm": 1.5478767156600952, "learning_rate": 2.8806767586821014e-05, "loss": 2.7347, "step": 4284 }, { "epoch": 0.42, "grad_norm": 1.488110899925232, "learning_rate": 2.878697932126249e-05, "loss": 2.736, "step": 4288 }, { "epoch": 0.42, "grad_norm": 1.4168086051940918, "learning_rate": 2.8767191055703966e-05, "loss": 2.6479, "step": 4292 }, { "epoch": 0.43, "grad_norm": 1.413324236869812, "learning_rate": 2.8747402790145446e-05, "loss": 2.7392, "step": 4296 }, { "epoch": 0.43, "grad_norm": 1.40789794921875, "learning_rate": 2.8727614524586922e-05, "loss": 2.6474, "step": 4300 }, { "epoch": 0.43, "grad_norm": 1.3412038087844849, "learning_rate": 2.87078262590284e-05, "loss": 2.6181, "step": 4304 }, { "epoch": 0.43, "grad_norm": 1.4228928089141846, "learning_rate": 2.8688037993469875e-05, "loss": 2.8285, "step": 4308 }, { "epoch": 0.43, "grad_norm": 1.498350977897644, "learning_rate": 2.866824972791135e-05, "loss": 2.7477, "step": 4312 }, { "epoch": 0.43, "grad_norm": 1.5861128568649292, "learning_rate": 2.8648461462352828e-05, "loss": 2.7869, "step": 4316 }, { "epoch": 0.43, "grad_norm": 1.544293761253357, "learning_rate": 2.8628673196794304e-05, "loss": 2.711, "step": 4320 }, { "epoch": 0.43, "grad_norm": 1.576124906539917, "learning_rate": 2.860888493123578e-05, "loss": 2.8703, "step": 4324 }, { "epoch": 0.43, "grad_norm": 1.5502452850341797, "learning_rate": 2.8589096665677257e-05, "loss": 2.7711, "step": 4328 }, { "epoch": 0.43, "grad_norm": 1.3446522951126099, "learning_rate": 2.8569308400118733e-05, "loss": 2.8535, "step": 4332 }, { "epoch": 0.43, "grad_norm": 1.4643163681030273, "learning_rate": 2.854952013456021e-05, "loss": 2.65, "step": 4336 }, { "epoch": 0.43, "grad_norm": 1.6467000246047974, "learning_rate": 2.8529731869001686e-05, "loss": 2.8216, "step": 4340 }, { "epoch": 0.43, "grad_norm": 1.387041687965393, "learning_rate": 2.8509943603443162e-05, "loss": 2.7273, "step": 4344 }, { "epoch": 0.43, "grad_norm": 1.4695614576339722, "learning_rate": 2.849015533788463e-05, "loss": 2.6167, "step": 4348 }, { "epoch": 0.43, "grad_norm": 1.6175564527511597, "learning_rate": 2.847036707232611e-05, "loss": 2.761, "step": 4352 }, { "epoch": 0.43, "grad_norm": 1.5921447277069092, "learning_rate": 2.8450578806767588e-05, "loss": 2.7493, "step": 4356 }, { "epoch": 0.43, "grad_norm": 1.5560427904129028, "learning_rate": 2.8430790541209064e-05, "loss": 2.8581, "step": 4360 }, { "epoch": 0.43, "grad_norm": 1.5728269815444946, "learning_rate": 2.841100227565054e-05, "loss": 2.6552, "step": 4364 }, { "epoch": 0.43, "grad_norm": 1.6176984310150146, "learning_rate": 2.8391214010092017e-05, "loss": 2.698, "step": 4368 }, { "epoch": 0.43, "grad_norm": 1.4440550804138184, "learning_rate": 2.8371425744533493e-05, "loss": 2.8222, "step": 4372 }, { "epoch": 0.43, "grad_norm": 1.721948266029358, "learning_rate": 2.835163747897497e-05, "loss": 2.6922, "step": 4376 }, { "epoch": 0.43, "grad_norm": 1.679369330406189, "learning_rate": 2.8331849213416445e-05, "loss": 2.8892, "step": 4380 }, { "epoch": 0.43, "grad_norm": 1.4985451698303223, "learning_rate": 2.8312060947857922e-05, "loss": 2.7287, "step": 4384 }, { "epoch": 0.43, "grad_norm": 1.4109586477279663, "learning_rate": 2.8292272682299398e-05, "loss": 2.7606, "step": 4388 }, { "epoch": 0.43, "grad_norm": 1.5079187154769897, "learning_rate": 2.8272484416740874e-05, "loss": 2.6592, "step": 4392 }, { "epoch": 0.43, "grad_norm": 1.4183659553527832, "learning_rate": 2.825269615118235e-05, "loss": 2.7749, "step": 4396 }, { "epoch": 0.44, "grad_norm": 1.3497616052627563, "learning_rate": 2.8232907885623827e-05, "loss": 2.8151, "step": 4400 }, { "epoch": 0.44, "grad_norm": 1.5650898218154907, "learning_rate": 2.8213119620065303e-05, "loss": 2.9046, "step": 4404 }, { "epoch": 0.44, "grad_norm": 1.5899450778961182, "learning_rate": 2.819333135450678e-05, "loss": 2.6898, "step": 4408 }, { "epoch": 0.44, "grad_norm": 1.4732282161712646, "learning_rate": 2.8173543088948256e-05, "loss": 2.7099, "step": 4412 }, { "epoch": 0.44, "grad_norm": 1.4923087358474731, "learning_rate": 2.8153754823389732e-05, "loss": 2.7293, "step": 4416 }, { "epoch": 0.44, "grad_norm": 1.509701132774353, "learning_rate": 2.813396655783121e-05, "loss": 3.0249, "step": 4420 }, { "epoch": 0.44, "grad_norm": 1.492567777633667, "learning_rate": 2.8114178292272685e-05, "loss": 2.779, "step": 4424 }, { "epoch": 0.44, "grad_norm": 1.551950216293335, "learning_rate": 2.809439002671416e-05, "loss": 2.8592, "step": 4428 }, { "epoch": 0.44, "grad_norm": 1.5013272762298584, "learning_rate": 2.8074601761155634e-05, "loss": 2.6557, "step": 4432 }, { "epoch": 0.44, "grad_norm": 1.757328987121582, "learning_rate": 2.805481349559711e-05, "loss": 2.917, "step": 4436 }, { "epoch": 0.44, "grad_norm": 1.4949613809585571, "learning_rate": 2.8035025230038587e-05, "loss": 2.8814, "step": 4440 }, { "epoch": 0.44, "grad_norm": 1.5500638484954834, "learning_rate": 2.8015236964480063e-05, "loss": 2.7875, "step": 4444 }, { "epoch": 0.44, "grad_norm": 1.724369764328003, "learning_rate": 2.799544869892154e-05, "loss": 2.8852, "step": 4448 }, { "epoch": 0.44, "grad_norm": 1.6037100553512573, "learning_rate": 2.7975660433363016e-05, "loss": 2.8543, "step": 4452 }, { "epoch": 0.44, "grad_norm": 1.5066895484924316, "learning_rate": 2.7955872167804492e-05, "loss": 2.743, "step": 4456 }, { "epoch": 0.44, "grad_norm": 1.412857174873352, "learning_rate": 2.793608390224597e-05, "loss": 2.6795, "step": 4460 }, { "epoch": 0.44, "grad_norm": 1.4990521669387817, "learning_rate": 2.7916295636687445e-05, "loss": 2.7017, "step": 4464 }, { "epoch": 0.44, "grad_norm": 1.7534558773040771, "learning_rate": 2.789650737112892e-05, "loss": 2.6926, "step": 4468 }, { "epoch": 0.44, "grad_norm": 1.6677272319793701, "learning_rate": 2.7876719105570398e-05, "loss": 2.7715, "step": 4472 }, { "epoch": 0.44, "grad_norm": 1.4068310260772705, "learning_rate": 2.7856930840011874e-05, "loss": 2.7641, "step": 4476 }, { "epoch": 0.44, "grad_norm": 1.6524847745895386, "learning_rate": 2.783714257445335e-05, "loss": 2.8915, "step": 4480 }, { "epoch": 0.44, "grad_norm": 1.362470030784607, "learning_rate": 2.7817354308894826e-05, "loss": 2.7326, "step": 4484 }, { "epoch": 0.44, "grad_norm": 1.7215932607650757, "learning_rate": 2.7797566043336303e-05, "loss": 2.9945, "step": 4488 }, { "epoch": 0.44, "grad_norm": 1.5028218030929565, "learning_rate": 2.777777777777778e-05, "loss": 2.7282, "step": 4492 }, { "epoch": 0.44, "grad_norm": 1.628304123878479, "learning_rate": 2.7757989512219255e-05, "loss": 2.7148, "step": 4496 }, { "epoch": 0.45, "grad_norm": 1.5549771785736084, "learning_rate": 2.7738201246660732e-05, "loss": 2.7639, "step": 4500 }, { "epoch": 0.45, "grad_norm": 1.7809425592422485, "learning_rate": 2.7718412981102208e-05, "loss": 2.8016, "step": 4504 }, { "epoch": 0.45, "grad_norm": 1.5097914934158325, "learning_rate": 2.7698624715543688e-05, "loss": 2.7919, "step": 4508 }, { "epoch": 0.45, "grad_norm": 1.6162583827972412, "learning_rate": 2.7678836449985164e-05, "loss": 2.7603, "step": 4512 }, { "epoch": 0.45, "grad_norm": 1.3952536582946777, "learning_rate": 2.7659048184426634e-05, "loss": 2.8791, "step": 4516 }, { "epoch": 0.45, "grad_norm": 1.4246437549591064, "learning_rate": 2.763925991886811e-05, "loss": 2.7032, "step": 4520 }, { "epoch": 0.45, "grad_norm": 1.5354806184768677, "learning_rate": 2.7619471653309586e-05, "loss": 2.6509, "step": 4524 }, { "epoch": 0.45, "grad_norm": 1.414692997932434, "learning_rate": 2.7599683387751063e-05, "loss": 2.8077, "step": 4528 }, { "epoch": 0.45, "grad_norm": 1.4562816619873047, "learning_rate": 2.757989512219254e-05, "loss": 2.6559, "step": 4532 }, { "epoch": 0.45, "grad_norm": 1.5134601593017578, "learning_rate": 2.7560106856634015e-05, "loss": 2.9881, "step": 4536 }, { "epoch": 0.45, "grad_norm": 1.4460077285766602, "learning_rate": 2.754031859107549e-05, "loss": 2.9132, "step": 4540 }, { "epoch": 0.45, "grad_norm": 1.2956750392913818, "learning_rate": 2.7520530325516968e-05, "loss": 2.6041, "step": 4544 }, { "epoch": 0.45, "grad_norm": 1.3744826316833496, "learning_rate": 2.7500742059958444e-05, "loss": 2.7083, "step": 4548 }, { "epoch": 0.45, "grad_norm": 1.8652702569961548, "learning_rate": 2.748095379439992e-05, "loss": 2.7688, "step": 4552 }, { "epoch": 0.45, "grad_norm": 1.4958977699279785, "learning_rate": 2.7461165528841397e-05, "loss": 2.6397, "step": 4556 }, { "epoch": 0.45, "grad_norm": 1.7164748907089233, "learning_rate": 2.7441377263282873e-05, "loss": 2.8242, "step": 4560 }, { "epoch": 0.45, "grad_norm": 1.3987586498260498, "learning_rate": 2.742158899772435e-05, "loss": 2.7012, "step": 4564 }, { "epoch": 0.45, "grad_norm": 1.44004487991333, "learning_rate": 2.740180073216583e-05, "loss": 2.6568, "step": 4568 }, { "epoch": 0.45, "grad_norm": 1.5092453956604004, "learning_rate": 2.7382012466607306e-05, "loss": 2.8711, "step": 4572 }, { "epoch": 0.45, "grad_norm": 1.4860295057296753, "learning_rate": 2.7362224201048782e-05, "loss": 2.6946, "step": 4576 }, { "epoch": 0.45, "grad_norm": 1.4830384254455566, "learning_rate": 2.7342435935490258e-05, "loss": 2.6399, "step": 4580 }, { "epoch": 0.45, "grad_norm": 1.5015872716903687, "learning_rate": 2.7322647669931734e-05, "loss": 2.794, "step": 4584 }, { "epoch": 0.45, "grad_norm": 1.514248013496399, "learning_rate": 2.730285940437321e-05, "loss": 2.8176, "step": 4588 }, { "epoch": 0.45, "grad_norm": 1.651678204536438, "learning_rate": 2.7283071138814687e-05, "loss": 2.8214, "step": 4592 }, { "epoch": 0.45, "grad_norm": 1.5051383972167969, "learning_rate": 2.7263282873256163e-05, "loss": 2.7998, "step": 4596 }, { "epoch": 0.46, "grad_norm": 1.4921104907989502, "learning_rate": 2.7243494607697633e-05, "loss": 2.7712, "step": 4600 }, { "epoch": 0.46, "grad_norm": 1.4588526487350464, "learning_rate": 2.722370634213911e-05, "loss": 2.718, "step": 4604 }, { "epoch": 0.46, "grad_norm": 1.5112457275390625, "learning_rate": 2.7203918076580586e-05, "loss": 2.8277, "step": 4608 }, { "epoch": 0.46, "grad_norm": 1.5032070875167847, "learning_rate": 2.7184129811022062e-05, "loss": 2.6329, "step": 4612 }, { "epoch": 0.46, "grad_norm": 1.3996548652648926, "learning_rate": 2.7164341545463538e-05, "loss": 2.5617, "step": 4616 }, { "epoch": 0.46, "grad_norm": 1.5661203861236572, "learning_rate": 2.7144553279905015e-05, "loss": 2.6544, "step": 4620 }, { "epoch": 0.46, "grad_norm": 1.4876741170883179, "learning_rate": 2.7124765014346494e-05, "loss": 2.7655, "step": 4624 }, { "epoch": 0.46, "grad_norm": 1.550423264503479, "learning_rate": 2.710497674878797e-05, "loss": 2.7317, "step": 4628 }, { "epoch": 0.46, "grad_norm": 1.477411150932312, "learning_rate": 2.7085188483229447e-05, "loss": 2.6941, "step": 4632 }, { "epoch": 0.46, "grad_norm": 1.4628989696502686, "learning_rate": 2.7065400217670923e-05, "loss": 2.791, "step": 4636 }, { "epoch": 0.46, "grad_norm": 1.4556288719177246, "learning_rate": 2.70456119521124e-05, "loss": 2.7437, "step": 4640 }, { "epoch": 0.46, "grad_norm": 1.7020779848098755, "learning_rate": 2.7025823686553876e-05, "loss": 2.5774, "step": 4644 }, { "epoch": 0.46, "grad_norm": 1.8932197093963623, "learning_rate": 2.7006035420995352e-05, "loss": 2.7657, "step": 4648 }, { "epoch": 0.46, "grad_norm": 1.6236851215362549, "learning_rate": 2.698624715543683e-05, "loss": 2.7994, "step": 4652 }, { "epoch": 0.46, "grad_norm": 1.6066538095474243, "learning_rate": 2.6966458889878305e-05, "loss": 2.8015, "step": 4656 }, { "epoch": 0.46, "grad_norm": 1.4361777305603027, "learning_rate": 2.694667062431978e-05, "loss": 2.7335, "step": 4660 }, { "epoch": 0.46, "grad_norm": 1.6797633171081543, "learning_rate": 2.6926882358761258e-05, "loss": 2.7286, "step": 4664 }, { "epoch": 0.46, "grad_norm": 1.4555542469024658, "learning_rate": 2.6907094093202734e-05, "loss": 2.6814, "step": 4668 }, { "epoch": 0.46, "grad_norm": 1.4565298557281494, "learning_rate": 2.688730582764421e-05, "loss": 2.7205, "step": 4672 }, { "epoch": 0.46, "grad_norm": 1.4873201847076416, "learning_rate": 2.6867517562085687e-05, "loss": 2.866, "step": 4676 }, { "epoch": 0.46, "grad_norm": 1.62325918674469, "learning_rate": 2.6847729296527163e-05, "loss": 2.7617, "step": 4680 }, { "epoch": 0.46, "grad_norm": 1.4098718166351318, "learning_rate": 2.682794103096864e-05, "loss": 2.6488, "step": 4684 }, { "epoch": 0.46, "grad_norm": 1.5865626335144043, "learning_rate": 2.6808152765410112e-05, "loss": 2.5687, "step": 4688 }, { "epoch": 0.46, "grad_norm": 1.4347946643829346, "learning_rate": 2.678836449985159e-05, "loss": 2.7482, "step": 4692 }, { "epoch": 0.46, "grad_norm": 1.5309094190597534, "learning_rate": 2.6768576234293065e-05, "loss": 2.8154, "step": 4696 }, { "epoch": 0.47, "grad_norm": 1.4870914220809937, "learning_rate": 2.674878796873454e-05, "loss": 2.7054, "step": 4700 }, { "epoch": 0.47, "grad_norm": 1.362409234046936, "learning_rate": 2.6728999703176017e-05, "loss": 2.6008, "step": 4704 }, { "epoch": 0.47, "grad_norm": 1.7659647464752197, "learning_rate": 2.6709211437617494e-05, "loss": 2.7721, "step": 4708 }, { "epoch": 0.47, "grad_norm": 1.5317940711975098, "learning_rate": 2.668942317205897e-05, "loss": 2.6605, "step": 4712 }, { "epoch": 0.47, "grad_norm": 1.5721125602722168, "learning_rate": 2.6669634906500446e-05, "loss": 2.9969, "step": 4716 }, { "epoch": 0.47, "grad_norm": 1.5925160646438599, "learning_rate": 2.6649846640941923e-05, "loss": 2.6141, "step": 4720 }, { "epoch": 0.47, "grad_norm": 1.5406255722045898, "learning_rate": 2.66300583753834e-05, "loss": 2.6511, "step": 4724 }, { "epoch": 0.47, "grad_norm": 1.5240296125411987, "learning_rate": 2.6610270109824875e-05, "loss": 2.6192, "step": 4728 }, { "epoch": 0.47, "grad_norm": 1.5159399509429932, "learning_rate": 2.659048184426635e-05, "loss": 2.6747, "step": 4732 }, { "epoch": 0.47, "grad_norm": 1.5311481952667236, "learning_rate": 2.6570693578707828e-05, "loss": 2.6445, "step": 4736 }, { "epoch": 0.47, "grad_norm": 1.610335111618042, "learning_rate": 2.6550905313149304e-05, "loss": 2.7823, "step": 4740 }, { "epoch": 0.47, "grad_norm": 1.5212194919586182, "learning_rate": 2.653111704759078e-05, "loss": 2.7615, "step": 4744 }, { "epoch": 0.47, "grad_norm": 1.5369598865509033, "learning_rate": 2.6511328782032257e-05, "loss": 2.5804, "step": 4748 }, { "epoch": 0.47, "grad_norm": 1.4863191843032837, "learning_rate": 2.6491540516473733e-05, "loss": 2.8282, "step": 4752 }, { "epoch": 0.47, "grad_norm": 1.6071970462799072, "learning_rate": 2.647175225091521e-05, "loss": 2.7363, "step": 4756 }, { "epoch": 0.47, "grad_norm": 1.543915867805481, "learning_rate": 2.6451963985356686e-05, "loss": 2.8169, "step": 4760 }, { "epoch": 0.47, "grad_norm": 1.399947166442871, "learning_rate": 2.6432175719798162e-05, "loss": 2.7116, "step": 4764 }, { "epoch": 0.47, "grad_norm": 1.3791353702545166, "learning_rate": 2.641238745423964e-05, "loss": 2.8288, "step": 4768 }, { "epoch": 0.47, "grad_norm": 1.5583422183990479, "learning_rate": 2.639259918868111e-05, "loss": 2.6392, "step": 4772 }, { "epoch": 0.47, "grad_norm": 1.4231597185134888, "learning_rate": 2.6372810923122588e-05, "loss": 2.6542, "step": 4776 }, { "epoch": 0.47, "grad_norm": 1.5944623947143555, "learning_rate": 2.6353022657564064e-05, "loss": 2.7268, "step": 4780 }, { "epoch": 0.47, "grad_norm": 1.586701512336731, "learning_rate": 2.633323439200554e-05, "loss": 2.8372, "step": 4784 }, { "epoch": 0.47, "grad_norm": 1.9561891555786133, "learning_rate": 2.6313446126447017e-05, "loss": 2.6418, "step": 4788 }, { "epoch": 0.47, "grad_norm": 1.4587674140930176, "learning_rate": 2.6293657860888493e-05, "loss": 2.6083, "step": 4792 }, { "epoch": 0.47, "grad_norm": 1.4791700839996338, "learning_rate": 2.627386959532997e-05, "loss": 2.7053, "step": 4796 }, { "epoch": 0.47, "grad_norm": 1.6022498607635498, "learning_rate": 2.6254081329771446e-05, "loss": 2.7117, "step": 4800 }, { "epoch": 0.48, "grad_norm": 1.499300241470337, "learning_rate": 2.6234293064212922e-05, "loss": 2.879, "step": 4804 }, { "epoch": 0.48, "grad_norm": 1.4296376705169678, "learning_rate": 2.62145047986544e-05, "loss": 2.9341, "step": 4808 }, { "epoch": 0.48, "grad_norm": 1.4815930128097534, "learning_rate": 2.6194716533095875e-05, "loss": 2.505, "step": 4812 }, { "epoch": 0.48, "grad_norm": 1.395080327987671, "learning_rate": 2.617492826753735e-05, "loss": 2.7681, "step": 4816 }, { "epoch": 0.48, "grad_norm": 1.5271371603012085, "learning_rate": 2.6155140001978827e-05, "loss": 2.7507, "step": 4820 }, { "epoch": 0.48, "grad_norm": 1.522066593170166, "learning_rate": 2.6135351736420304e-05, "loss": 2.7203, "step": 4824 }, { "epoch": 0.48, "grad_norm": 1.4368566274642944, "learning_rate": 2.611556347086178e-05, "loss": 2.6228, "step": 4828 }, { "epoch": 0.48, "grad_norm": 1.473230004310608, "learning_rate": 2.6095775205303256e-05, "loss": 2.726, "step": 4832 }, { "epoch": 0.48, "grad_norm": 1.4038417339324951, "learning_rate": 2.6075986939744736e-05, "loss": 2.6796, "step": 4836 }, { "epoch": 0.48, "grad_norm": 1.5209479331970215, "learning_rate": 2.6056198674186212e-05, "loss": 2.8567, "step": 4840 }, { "epoch": 0.48, "grad_norm": 1.7008355855941772, "learning_rate": 2.603641040862769e-05, "loss": 2.8475, "step": 4844 }, { "epoch": 0.48, "grad_norm": 1.6162598133087158, "learning_rate": 2.6016622143069165e-05, "loss": 2.6449, "step": 4848 }, { "epoch": 0.48, "grad_norm": 1.4284156560897827, "learning_rate": 2.599683387751064e-05, "loss": 2.7213, "step": 4852 }, { "epoch": 0.48, "grad_norm": 1.3640947341918945, "learning_rate": 2.597704561195211e-05, "loss": 2.6178, "step": 4856 }, { "epoch": 0.48, "grad_norm": 1.6424022912979126, "learning_rate": 2.5957257346393587e-05, "loss": 2.6476, "step": 4860 }, { "epoch": 0.48, "grad_norm": 1.6182935237884521, "learning_rate": 2.5937469080835063e-05, "loss": 2.6933, "step": 4864 }, { "epoch": 0.48, "grad_norm": 1.3428994417190552, "learning_rate": 2.591768081527654e-05, "loss": 2.6471, "step": 4868 }, { "epoch": 0.48, "grad_norm": 1.550610899925232, "learning_rate": 2.5897892549718016e-05, "loss": 2.651, "step": 4872 }, { "epoch": 0.48, "grad_norm": 1.8785467147827148, "learning_rate": 2.5878104284159492e-05, "loss": 2.6447, "step": 4876 }, { "epoch": 0.48, "grad_norm": 1.5535615682601929, "learning_rate": 2.585831601860097e-05, "loss": 2.6525, "step": 4880 }, { "epoch": 0.48, "grad_norm": 1.5342074632644653, "learning_rate": 2.5838527753042445e-05, "loss": 2.7041, "step": 4884 }, { "epoch": 0.48, "grad_norm": 1.6638293266296387, "learning_rate": 2.581873948748392e-05, "loss": 2.6759, "step": 4888 }, { "epoch": 0.48, "grad_norm": 1.4886223077774048, "learning_rate": 2.5798951221925398e-05, "loss": 2.6616, "step": 4892 }, { "epoch": 0.48, "grad_norm": 1.539483904838562, "learning_rate": 2.5779162956366877e-05, "loss": 2.6294, "step": 4896 }, { "epoch": 0.48, "grad_norm": 1.415686845779419, "learning_rate": 2.5759374690808354e-05, "loss": 2.5885, "step": 4900 }, { "epoch": 0.49, "grad_norm": 1.6120994091033936, "learning_rate": 2.573958642524983e-05, "loss": 2.6169, "step": 4904 }, { "epoch": 0.49, "grad_norm": 1.713667392730713, "learning_rate": 2.5719798159691306e-05, "loss": 2.7591, "step": 4908 }, { "epoch": 0.49, "grad_norm": 1.5422923564910889, "learning_rate": 2.5700009894132783e-05, "loss": 2.7633, "step": 4912 }, { "epoch": 0.49, "grad_norm": 1.5131072998046875, "learning_rate": 2.568022162857426e-05, "loss": 2.6749, "step": 4916 }, { "epoch": 0.49, "grad_norm": 1.5619624853134155, "learning_rate": 2.5660433363015735e-05, "loss": 2.5834, "step": 4920 }, { "epoch": 0.49, "grad_norm": 1.4721622467041016, "learning_rate": 2.564064509745721e-05, "loss": 2.6411, "step": 4924 }, { "epoch": 0.49, "grad_norm": 1.424367070198059, "learning_rate": 2.5620856831898688e-05, "loss": 2.6848, "step": 4928 }, { "epoch": 0.49, "grad_norm": 1.480597734451294, "learning_rate": 2.5601068566340164e-05, "loss": 2.5919, "step": 4932 }, { "epoch": 0.49, "grad_norm": 1.5287058353424072, "learning_rate": 2.558128030078164e-05, "loss": 2.7616, "step": 4936 }, { "epoch": 0.49, "grad_norm": 1.7165645360946655, "learning_rate": 2.556149203522311e-05, "loss": 2.7636, "step": 4940 }, { "epoch": 0.49, "grad_norm": 1.4173622131347656, "learning_rate": 2.5541703769664586e-05, "loss": 2.7301, "step": 4944 }, { "epoch": 0.49, "grad_norm": 1.4607168436050415, "learning_rate": 2.5521915504106063e-05, "loss": 2.609, "step": 4948 }, { "epoch": 0.49, "grad_norm": 1.4109200239181519, "learning_rate": 2.550212723854754e-05, "loss": 2.7442, "step": 4952 }, { "epoch": 0.49, "grad_norm": 1.5535697937011719, "learning_rate": 2.548233897298902e-05, "loss": 2.5371, "step": 4956 }, { "epoch": 0.49, "grad_norm": 1.4528133869171143, "learning_rate": 2.5462550707430495e-05, "loss": 2.5971, "step": 4960 }, { "epoch": 0.49, "grad_norm": 1.7372537851333618, "learning_rate": 2.544276244187197e-05, "loss": 2.8321, "step": 4964 }, { "epoch": 0.49, "grad_norm": 1.6011797189712524, "learning_rate": 2.5422974176313448e-05, "loss": 2.7647, "step": 4968 }, { "epoch": 0.49, "grad_norm": 1.6484735012054443, "learning_rate": 2.5403185910754924e-05, "loss": 2.5399, "step": 4972 }, { "epoch": 0.49, "grad_norm": 1.5052474737167358, "learning_rate": 2.53833976451964e-05, "loss": 2.6719, "step": 4976 }, { "epoch": 0.49, "grad_norm": 1.5310105085372925, "learning_rate": 2.5363609379637877e-05, "loss": 2.6558, "step": 4980 }, { "epoch": 0.49, "grad_norm": 1.469632863998413, "learning_rate": 2.5343821114079353e-05, "loss": 2.7404, "step": 4984 }, { "epoch": 0.49, "grad_norm": 1.53377103805542, "learning_rate": 2.532403284852083e-05, "loss": 2.7085, "step": 4988 }, { "epoch": 0.49, "grad_norm": 1.6303514242172241, "learning_rate": 2.5304244582962306e-05, "loss": 2.749, "step": 4992 }, { "epoch": 0.49, "grad_norm": 1.5342525243759155, "learning_rate": 2.5284456317403782e-05, "loss": 2.8031, "step": 4996 }, { "epoch": 0.49, "grad_norm": 1.7527660131454468, "learning_rate": 2.526466805184526e-05, "loss": 2.6969, "step": 5000 }, { "epoch": 0.5, "grad_norm": 1.5336272716522217, "learning_rate": 2.5244879786286735e-05, "loss": 2.6488, "step": 5004 }, { "epoch": 0.5, "grad_norm": 1.6374998092651367, "learning_rate": 2.522509152072821e-05, "loss": 2.5628, "step": 5008 }, { "epoch": 0.5, "grad_norm": 1.4945316314697266, "learning_rate": 2.5205303255169687e-05, "loss": 2.8031, "step": 5012 }, { "epoch": 0.5, "grad_norm": 1.5056184530258179, "learning_rate": 2.5185514989611164e-05, "loss": 2.7111, "step": 5016 }, { "epoch": 0.5, "grad_norm": 1.389101505279541, "learning_rate": 2.516572672405264e-05, "loss": 2.6636, "step": 5020 }, { "epoch": 0.5, "grad_norm": 1.8190226554870605, "learning_rate": 2.5145938458494113e-05, "loss": 2.692, "step": 5024 }, { "epoch": 0.5, "grad_norm": 1.553562879562378, "learning_rate": 2.512615019293559e-05, "loss": 2.6122, "step": 5028 }, { "epoch": 0.5, "grad_norm": 1.710195779800415, "learning_rate": 2.5106361927377066e-05, "loss": 2.7634, "step": 5032 }, { "epoch": 0.5, "grad_norm": 1.669340968132019, "learning_rate": 2.5086573661818542e-05, "loss": 2.6008, "step": 5036 }, { "epoch": 0.5, "grad_norm": 1.7336004972457886, "learning_rate": 2.5066785396260018e-05, "loss": 2.6427, "step": 5040 }, { "epoch": 0.5, "grad_norm": 1.6529115438461304, "learning_rate": 2.5046997130701495e-05, "loss": 2.6774, "step": 5044 }, { "epoch": 0.5, "grad_norm": 1.463104248046875, "learning_rate": 2.502720886514297e-05, "loss": 2.5906, "step": 5048 }, { "epoch": 0.5, "grad_norm": 1.594033122062683, "learning_rate": 2.5007420599584447e-05, "loss": 2.5802, "step": 5052 }, { "epoch": 0.5, "grad_norm": 1.5680876970291138, "learning_rate": 2.4987632334025923e-05, "loss": 2.7835, "step": 5056 }, { "epoch": 0.5, "grad_norm": 1.551176905632019, "learning_rate": 2.49678440684674e-05, "loss": 2.7352, "step": 5060 }, { "epoch": 0.5, "grad_norm": 1.6078176498413086, "learning_rate": 2.4948055802908876e-05, "loss": 2.6514, "step": 5064 }, { "epoch": 0.5, "grad_norm": 1.4909130334854126, "learning_rate": 2.4928267537350352e-05, "loss": 2.6863, "step": 5068 }, { "epoch": 0.5, "grad_norm": 1.485734462738037, "learning_rate": 2.490847927179183e-05, "loss": 2.6079, "step": 5072 }, { "epoch": 0.5, "grad_norm": 1.413549542427063, "learning_rate": 2.4888691006233305e-05, "loss": 2.7355, "step": 5076 }, { "epoch": 0.5, "grad_norm": 1.5501868724822998, "learning_rate": 2.486890274067478e-05, "loss": 2.6522, "step": 5080 }, { "epoch": 0.5, "grad_norm": 1.4532791376113892, "learning_rate": 2.4849114475116258e-05, "loss": 2.7269, "step": 5084 }, { "epoch": 0.5, "grad_norm": 1.6605114936828613, "learning_rate": 2.482932620955773e-05, "loss": 2.6386, "step": 5088 }, { "epoch": 0.5, "grad_norm": 1.6736440658569336, "learning_rate": 2.480953794399921e-05, "loss": 2.8607, "step": 5092 }, { "epoch": 0.5, "grad_norm": 1.7026129961013794, "learning_rate": 2.4789749678440687e-05, "loss": 2.7838, "step": 5096 }, { "epoch": 0.5, "grad_norm": 1.4763104915618896, "learning_rate": 2.4769961412882163e-05, "loss": 2.5136, "step": 5100 }, { "epoch": 0.5, "grad_norm": 1.555331826210022, "learning_rate": 2.475017314732364e-05, "loss": 2.5966, "step": 5104 }, { "epoch": 0.51, "grad_norm": 1.4764961004257202, "learning_rate": 2.4730384881765116e-05, "loss": 2.7076, "step": 5108 }, { "epoch": 0.51, "grad_norm": 1.5390756130218506, "learning_rate": 2.4710596616206592e-05, "loss": 2.7615, "step": 5112 }, { "epoch": 0.51, "grad_norm": 1.503321647644043, "learning_rate": 2.4690808350648068e-05, "loss": 2.62, "step": 5116 }, { "epoch": 0.51, "grad_norm": 1.5027825832366943, "learning_rate": 2.4671020085089545e-05, "loss": 2.9407, "step": 5120 }, { "epoch": 0.51, "grad_norm": 1.4294583797454834, "learning_rate": 2.465123181953102e-05, "loss": 2.8738, "step": 5124 }, { "epoch": 0.51, "grad_norm": 1.5709290504455566, "learning_rate": 2.4631443553972494e-05, "loss": 2.6954, "step": 5128 }, { "epoch": 0.51, "grad_norm": 1.6391737461090088, "learning_rate": 2.461165528841397e-05, "loss": 2.8727, "step": 5132 }, { "epoch": 0.51, "grad_norm": 1.5847351551055908, "learning_rate": 2.4591867022855447e-05, "loss": 2.5896, "step": 5136 }, { "epoch": 0.51, "grad_norm": 1.4603075981140137, "learning_rate": 2.4572078757296923e-05, "loss": 2.7727, "step": 5140 }, { "epoch": 0.51, "grad_norm": 1.8148667812347412, "learning_rate": 2.45522904917384e-05, "loss": 2.518, "step": 5144 }, { "epoch": 0.51, "grad_norm": 1.6938127279281616, "learning_rate": 2.4532502226179875e-05, "loss": 2.575, "step": 5148 }, { "epoch": 0.51, "grad_norm": 1.495416522026062, "learning_rate": 2.4512713960621352e-05, "loss": 2.7263, "step": 5152 }, { "epoch": 0.51, "grad_norm": 1.5447607040405273, "learning_rate": 2.4492925695062828e-05, "loss": 2.6644, "step": 5156 }, { "epoch": 0.51, "grad_norm": 1.594707727432251, "learning_rate": 2.4473137429504304e-05, "loss": 2.5444, "step": 5160 }, { "epoch": 0.51, "grad_norm": 1.613542914390564, "learning_rate": 2.445334916394578e-05, "loss": 2.5996, "step": 5164 }, { "epoch": 0.51, "grad_norm": 1.6237636804580688, "learning_rate": 2.443356089838726e-05, "loss": 2.6802, "step": 5168 }, { "epoch": 0.51, "grad_norm": 1.5020370483398438, "learning_rate": 2.4413772632828733e-05, "loss": 2.6467, "step": 5172 }, { "epoch": 0.51, "grad_norm": 1.441662311553955, "learning_rate": 2.439398436727021e-05, "loss": 2.6707, "step": 5176 }, { "epoch": 0.51, "grad_norm": 1.626299500465393, "learning_rate": 2.4374196101711686e-05, "loss": 2.5029, "step": 5180 }, { "epoch": 0.51, "grad_norm": 1.5790330171585083, "learning_rate": 2.4354407836153162e-05, "loss": 2.6478, "step": 5184 }, { "epoch": 0.51, "grad_norm": 1.4696558713912964, "learning_rate": 2.433461957059464e-05, "loss": 2.6289, "step": 5188 }, { "epoch": 0.51, "grad_norm": 1.3996411561965942, "learning_rate": 2.4314831305036115e-05, "loss": 2.5142, "step": 5192 }, { "epoch": 0.51, "grad_norm": 1.630058765411377, "learning_rate": 2.429504303947759e-05, "loss": 2.6655, "step": 5196 }, { "epoch": 0.51, "grad_norm": 1.648018717765808, "learning_rate": 2.4275254773919068e-05, "loss": 2.425, "step": 5200 }, { "epoch": 0.51, "grad_norm": 1.6513872146606445, "learning_rate": 2.4255466508360544e-05, "loss": 2.7594, "step": 5204 }, { "epoch": 0.52, "grad_norm": 1.592578649520874, "learning_rate": 2.423567824280202e-05, "loss": 2.5232, "step": 5208 }, { "epoch": 0.52, "grad_norm": 1.6589707136154175, "learning_rate": 2.4215889977243493e-05, "loss": 2.6596, "step": 5212 }, { "epoch": 0.52, "grad_norm": 1.5870932340621948, "learning_rate": 2.419610171168497e-05, "loss": 2.5496, "step": 5216 }, { "epoch": 0.52, "grad_norm": 1.6115508079528809, "learning_rate": 2.4176313446126446e-05, "loss": 2.7024, "step": 5220 }, { "epoch": 0.52, "grad_norm": 1.5552654266357422, "learning_rate": 2.4156525180567922e-05, "loss": 2.6954, "step": 5224 }, { "epoch": 0.52, "grad_norm": 1.530119776725769, "learning_rate": 2.4136736915009402e-05, "loss": 2.6178, "step": 5228 }, { "epoch": 0.52, "grad_norm": 1.5883251428604126, "learning_rate": 2.4116948649450878e-05, "loss": 2.7307, "step": 5232 }, { "epoch": 0.52, "grad_norm": 1.6557033061981201, "learning_rate": 2.4097160383892355e-05, "loss": 2.7623, "step": 5236 }, { "epoch": 0.52, "grad_norm": 1.6295868158340454, "learning_rate": 2.407737211833383e-05, "loss": 2.5603, "step": 5240 }, { "epoch": 0.52, "grad_norm": 1.4882088899612427, "learning_rate": 2.4057583852775307e-05, "loss": 2.7022, "step": 5244 }, { "epoch": 0.52, "grad_norm": 1.559869647026062, "learning_rate": 2.4037795587216784e-05, "loss": 2.8356, "step": 5248 }, { "epoch": 0.52, "grad_norm": 1.5058995485305786, "learning_rate": 2.401800732165826e-05, "loss": 2.8771, "step": 5252 }, { "epoch": 0.52, "grad_norm": 1.5419751405715942, "learning_rate": 2.3998219056099733e-05, "loss": 2.6347, "step": 5256 }, { "epoch": 0.52, "grad_norm": 1.5456287860870361, "learning_rate": 2.397843079054121e-05, "loss": 2.7462, "step": 5260 }, { "epoch": 0.52, "grad_norm": 1.6244553327560425, "learning_rate": 2.3958642524982685e-05, "loss": 2.7263, "step": 5264 }, { "epoch": 0.52, "grad_norm": 1.5929856300354004, "learning_rate": 2.3938854259424162e-05, "loss": 2.6368, "step": 5268 }, { "epoch": 0.52, "grad_norm": 2.1067686080932617, "learning_rate": 2.3919065993865638e-05, "loss": 2.784, "step": 5272 }, { "epoch": 0.52, "grad_norm": 1.4130641222000122, "learning_rate": 2.3899277728307114e-05, "loss": 2.6074, "step": 5276 }, { "epoch": 0.52, "grad_norm": 1.516021490097046, "learning_rate": 2.387948946274859e-05, "loss": 2.7861, "step": 5280 }, { "epoch": 0.52, "grad_norm": 1.6379324197769165, "learning_rate": 2.3859701197190067e-05, "loss": 2.6482, "step": 5284 }, { "epoch": 0.52, "grad_norm": 1.7864313125610352, "learning_rate": 2.3839912931631543e-05, "loss": 2.6253, "step": 5288 }, { "epoch": 0.52, "grad_norm": 1.6106088161468506, "learning_rate": 2.382012466607302e-05, "loss": 2.5256, "step": 5292 }, { "epoch": 0.52, "grad_norm": 1.5859754085540771, "learning_rate": 2.3800336400514496e-05, "loss": 2.6219, "step": 5296 }, { "epoch": 0.52, "grad_norm": 1.5231300592422485, "learning_rate": 2.3780548134955972e-05, "loss": 2.7499, "step": 5300 }, { "epoch": 0.52, "grad_norm": 1.472907543182373, "learning_rate": 2.376075986939745e-05, "loss": 2.5311, "step": 5304 }, { "epoch": 0.53, "grad_norm": 1.8170127868652344, "learning_rate": 2.3740971603838925e-05, "loss": 2.5569, "step": 5308 }, { "epoch": 0.53, "grad_norm": 1.5469038486480713, "learning_rate": 2.37211833382804e-05, "loss": 2.6323, "step": 5312 }, { "epoch": 0.53, "grad_norm": 1.594907283782959, "learning_rate": 2.3701395072721878e-05, "loss": 2.8168, "step": 5316 }, { "epoch": 0.53, "grad_norm": 1.5616902112960815, "learning_rate": 2.3681606807163354e-05, "loss": 2.7958, "step": 5320 }, { "epoch": 0.53, "grad_norm": 1.5545421838760376, "learning_rate": 2.366181854160483e-05, "loss": 2.6999, "step": 5324 }, { "epoch": 0.53, "grad_norm": 1.4910705089569092, "learning_rate": 2.3642030276046307e-05, "loss": 2.6652, "step": 5328 }, { "epoch": 0.53, "grad_norm": 1.510505199432373, "learning_rate": 2.3622242010487783e-05, "loss": 2.6772, "step": 5332 }, { "epoch": 0.53, "grad_norm": 1.5067452192306519, "learning_rate": 2.360245374492926e-05, "loss": 2.765, "step": 5336 }, { "epoch": 0.53, "grad_norm": 1.6641408205032349, "learning_rate": 2.3582665479370732e-05, "loss": 2.5982, "step": 5340 }, { "epoch": 0.53, "grad_norm": 1.6969571113586426, "learning_rate": 2.356287721381221e-05, "loss": 2.8366, "step": 5344 }, { "epoch": 0.53, "grad_norm": 1.5563101768493652, "learning_rate": 2.3543088948253685e-05, "loss": 2.6941, "step": 5348 }, { "epoch": 0.53, "grad_norm": 1.5236217975616455, "learning_rate": 2.352330068269516e-05, "loss": 2.6801, "step": 5352 }, { "epoch": 0.53, "grad_norm": 1.6700359582901, "learning_rate": 2.3503512417136637e-05, "loss": 2.8281, "step": 5356 }, { "epoch": 0.53, "grad_norm": 1.5832792520523071, "learning_rate": 2.3483724151578114e-05, "loss": 2.6856, "step": 5360 }, { "epoch": 0.53, "grad_norm": 1.7960867881774902, "learning_rate": 2.3463935886019593e-05, "loss": 2.6418, "step": 5364 }, { "epoch": 0.53, "grad_norm": 2.743746042251587, "learning_rate": 2.344414762046107e-05, "loss": 2.7925, "step": 5368 }, { "epoch": 0.53, "grad_norm": 1.7574864625930786, "learning_rate": 2.3424359354902546e-05, "loss": 2.8191, "step": 5372 }, { "epoch": 0.53, "grad_norm": 1.4211304187774658, "learning_rate": 2.3404571089344022e-05, "loss": 2.6055, "step": 5376 }, { "epoch": 0.53, "grad_norm": 1.6393153667449951, "learning_rate": 2.33847828237855e-05, "loss": 2.541, "step": 5380 }, { "epoch": 0.53, "grad_norm": 1.6058558225631714, "learning_rate": 2.336499455822697e-05, "loss": 2.5992, "step": 5384 }, { "epoch": 0.53, "grad_norm": 1.4362446069717407, "learning_rate": 2.3345206292668448e-05, "loss": 2.6894, "step": 5388 }, { "epoch": 0.53, "grad_norm": 1.5442392826080322, "learning_rate": 2.3325418027109924e-05, "loss": 2.6369, "step": 5392 }, { "epoch": 0.53, "grad_norm": 1.654779314994812, "learning_rate": 2.33056297615514e-05, "loss": 2.7932, "step": 5396 }, { "epoch": 0.53, "grad_norm": 1.576923131942749, "learning_rate": 2.3285841495992877e-05, "loss": 2.6261, "step": 5400 }, { "epoch": 0.53, "grad_norm": 1.595237374305725, "learning_rate": 2.3266053230434353e-05, "loss": 2.5116, "step": 5404 }, { "epoch": 0.54, "grad_norm": 1.7787840366363525, "learning_rate": 2.324626496487583e-05, "loss": 2.7103, "step": 5408 }, { "epoch": 0.54, "grad_norm": 1.4982911348342896, "learning_rate": 2.3226476699317306e-05, "loss": 2.587, "step": 5412 }, { "epoch": 0.54, "grad_norm": 1.5009403228759766, "learning_rate": 2.3206688433758782e-05, "loss": 2.7021, "step": 5416 }, { "epoch": 0.54, "grad_norm": 1.5201959609985352, "learning_rate": 2.318690016820026e-05, "loss": 2.5477, "step": 5420 }, { "epoch": 0.54, "grad_norm": 1.480794906616211, "learning_rate": 2.3167111902641735e-05, "loss": 2.725, "step": 5424 }, { "epoch": 0.54, "grad_norm": 1.5079982280731201, "learning_rate": 2.314732363708321e-05, "loss": 2.4785, "step": 5428 }, { "epoch": 0.54, "grad_norm": 1.601902723312378, "learning_rate": 2.3127535371524688e-05, "loss": 2.5547, "step": 5432 }, { "epoch": 0.54, "grad_norm": 1.5328859090805054, "learning_rate": 2.3107747105966164e-05, "loss": 2.6307, "step": 5436 }, { "epoch": 0.54, "grad_norm": 1.5834133625030518, "learning_rate": 2.308795884040764e-05, "loss": 2.656, "step": 5440 }, { "epoch": 0.54, "grad_norm": 1.6440998315811157, "learning_rate": 2.3068170574849116e-05, "loss": 2.7507, "step": 5444 }, { "epoch": 0.54, "grad_norm": 1.4416550397872925, "learning_rate": 2.3048382309290593e-05, "loss": 2.8631, "step": 5448 }, { "epoch": 0.54, "grad_norm": 1.5241260528564453, "learning_rate": 2.302859404373207e-05, "loss": 2.815, "step": 5452 }, { "epoch": 0.54, "grad_norm": 1.8453205823898315, "learning_rate": 2.3008805778173545e-05, "loss": 2.6714, "step": 5456 }, { "epoch": 0.54, "grad_norm": 1.5847132205963135, "learning_rate": 2.2989017512615022e-05, "loss": 2.6486, "step": 5460 }, { "epoch": 0.54, "grad_norm": 1.6181340217590332, "learning_rate": 2.2969229247056498e-05, "loss": 2.5461, "step": 5464 }, { "epoch": 0.54, "grad_norm": 1.4716991186141968, "learning_rate": 2.294944098149797e-05, "loss": 2.6739, "step": 5468 }, { "epoch": 0.54, "grad_norm": 1.4331846237182617, "learning_rate": 2.2929652715939447e-05, "loss": 2.5011, "step": 5472 }, { "epoch": 0.54, "grad_norm": 1.7750493288040161, "learning_rate": 2.2909864450380924e-05, "loss": 2.683, "step": 5476 }, { "epoch": 0.54, "grad_norm": 1.6344447135925293, "learning_rate": 2.28900761848224e-05, "loss": 2.8271, "step": 5480 }, { "epoch": 0.54, "grad_norm": 1.4624501466751099, "learning_rate": 2.2870287919263876e-05, "loss": 2.6828, "step": 5484 }, { "epoch": 0.54, "grad_norm": 1.3957983255386353, "learning_rate": 2.2850499653705353e-05, "loss": 2.5376, "step": 5488 }, { "epoch": 0.54, "grad_norm": 1.4633480310440063, "learning_rate": 2.283071138814683e-05, "loss": 2.6159, "step": 5492 }, { "epoch": 0.54, "grad_norm": 1.607138752937317, "learning_rate": 2.2810923122588305e-05, "loss": 2.5313, "step": 5496 }, { "epoch": 0.54, "grad_norm": 1.5315511226654053, "learning_rate": 2.2791134857029785e-05, "loss": 2.6017, "step": 5500 }, { "epoch": 0.54, "grad_norm": 1.3674393892288208, "learning_rate": 2.277134659147126e-05, "loss": 2.5482, "step": 5504 }, { "epoch": 0.54, "grad_norm": 1.4765186309814453, "learning_rate": 2.2751558325912734e-05, "loss": 2.5593, "step": 5508 }, { "epoch": 0.55, "grad_norm": 1.5527360439300537, "learning_rate": 2.273177006035421e-05, "loss": 2.7501, "step": 5512 }, { "epoch": 0.55, "grad_norm": 1.7403366565704346, "learning_rate": 2.2711981794795687e-05, "loss": 2.6449, "step": 5516 }, { "epoch": 0.55, "grad_norm": 1.3762569427490234, "learning_rate": 2.2692193529237163e-05, "loss": 2.5219, "step": 5520 }, { "epoch": 0.55, "grad_norm": 1.5803818702697754, "learning_rate": 2.267240526367864e-05, "loss": 2.5245, "step": 5524 }, { "epoch": 0.55, "grad_norm": 1.6432803869247437, "learning_rate": 2.2652616998120116e-05, "loss": 2.657, "step": 5528 }, { "epoch": 0.55, "grad_norm": 1.4227681159973145, "learning_rate": 2.2632828732561592e-05, "loss": 2.6232, "step": 5532 }, { "epoch": 0.55, "grad_norm": 1.6246461868286133, "learning_rate": 2.261304046700307e-05, "loss": 2.7291, "step": 5536 }, { "epoch": 0.55, "grad_norm": 1.5243287086486816, "learning_rate": 2.2593252201444545e-05, "loss": 2.828, "step": 5540 }, { "epoch": 0.55, "grad_norm": 1.4307969808578491, "learning_rate": 2.257346393588602e-05, "loss": 2.6103, "step": 5544 }, { "epoch": 0.55, "grad_norm": 1.4919029474258423, "learning_rate": 2.2553675670327497e-05, "loss": 2.652, "step": 5548 }, { "epoch": 0.55, "grad_norm": 1.633585810661316, "learning_rate": 2.253388740476897e-05, "loss": 2.5161, "step": 5552 }, { "epoch": 0.55, "grad_norm": 1.5435733795166016, "learning_rate": 2.2514099139210447e-05, "loss": 2.5211, "step": 5556 }, { "epoch": 0.55, "grad_norm": 1.804867148399353, "learning_rate": 2.2494310873651926e-05, "loss": 2.6671, "step": 5560 }, { "epoch": 0.55, "grad_norm": 1.7000728845596313, "learning_rate": 2.2474522608093403e-05, "loss": 2.5494, "step": 5564 }, { "epoch": 0.55, "grad_norm": 1.597815752029419, "learning_rate": 2.245473434253488e-05, "loss": 2.7588, "step": 5568 }, { "epoch": 0.55, "grad_norm": 1.824008584022522, "learning_rate": 2.2434946076976355e-05, "loss": 2.6684, "step": 5572 }, { "epoch": 0.55, "grad_norm": 1.4856268167495728, "learning_rate": 2.2415157811417832e-05, "loss": 2.5801, "step": 5576 }, { "epoch": 0.55, "grad_norm": 1.4334465265274048, "learning_rate": 2.2395369545859308e-05, "loss": 2.6735, "step": 5580 }, { "epoch": 0.55, "grad_norm": 1.529638648033142, "learning_rate": 2.2375581280300784e-05, "loss": 2.4979, "step": 5584 }, { "epoch": 0.55, "grad_norm": 1.5678493976593018, "learning_rate": 2.235579301474226e-05, "loss": 2.458, "step": 5588 }, { "epoch": 0.55, "grad_norm": 1.4850939512252808, "learning_rate": 2.2336004749183734e-05, "loss": 2.5068, "step": 5592 }, { "epoch": 0.55, "grad_norm": 1.8440673351287842, "learning_rate": 2.231621648362521e-05, "loss": 2.6083, "step": 5596 }, { "epoch": 0.55, "grad_norm": 1.4649991989135742, "learning_rate": 2.2296428218066686e-05, "loss": 2.7386, "step": 5600 }, { "epoch": 0.55, "grad_norm": 1.5300774574279785, "learning_rate": 2.2276639952508163e-05, "loss": 2.6624, "step": 5604 }, { "epoch": 0.55, "grad_norm": 1.4183005094528198, "learning_rate": 2.225685168694964e-05, "loss": 2.5477, "step": 5608 }, { "epoch": 0.56, "grad_norm": 1.5531784296035767, "learning_rate": 2.2237063421391115e-05, "loss": 2.5075, "step": 5612 }, { "epoch": 0.56, "grad_norm": 1.6235988140106201, "learning_rate": 2.221727515583259e-05, "loss": 2.5529, "step": 5616 }, { "epoch": 0.56, "grad_norm": 1.6714985370635986, "learning_rate": 2.2197486890274068e-05, "loss": 2.7613, "step": 5620 }, { "epoch": 0.56, "grad_norm": 1.5148706436157227, "learning_rate": 2.2177698624715544e-05, "loss": 2.8314, "step": 5624 }, { "epoch": 0.56, "grad_norm": 1.6229020357131958, "learning_rate": 2.215791035915702e-05, "loss": 2.5539, "step": 5628 }, { "epoch": 0.56, "grad_norm": 1.7232576608657837, "learning_rate": 2.2138122093598497e-05, "loss": 2.756, "step": 5632 }, { "epoch": 0.56, "grad_norm": 1.4733608961105347, "learning_rate": 2.2118333828039973e-05, "loss": 2.6255, "step": 5636 }, { "epoch": 0.56, "grad_norm": 1.748321294784546, "learning_rate": 2.209854556248145e-05, "loss": 2.675, "step": 5640 }, { "epoch": 0.56, "grad_norm": 1.5811595916748047, "learning_rate": 2.2078757296922926e-05, "loss": 2.7092, "step": 5644 }, { "epoch": 0.56, "grad_norm": 1.3841744661331177, "learning_rate": 2.2058969031364402e-05, "loss": 2.5024, "step": 5648 }, { "epoch": 0.56, "grad_norm": 1.558066964149475, "learning_rate": 2.203918076580588e-05, "loss": 2.5734, "step": 5652 }, { "epoch": 0.56, "grad_norm": 1.4873226881027222, "learning_rate": 2.2019392500247355e-05, "loss": 2.7111, "step": 5656 }, { "epoch": 0.56, "grad_norm": 1.5889838933944702, "learning_rate": 2.199960423468883e-05, "loss": 2.6065, "step": 5660 }, { "epoch": 0.56, "grad_norm": 1.4627254009246826, "learning_rate": 2.1979815969130307e-05, "loss": 2.6233, "step": 5664 }, { "epoch": 0.56, "grad_norm": 1.6303706169128418, "learning_rate": 2.1960027703571784e-05, "loss": 2.7411, "step": 5668 }, { "epoch": 0.56, "grad_norm": 1.7008352279663086, "learning_rate": 2.194023943801326e-05, "loss": 2.5536, "step": 5672 }, { "epoch": 0.56, "grad_norm": 1.538040041923523, "learning_rate": 2.1920451172454733e-05, "loss": 2.6556, "step": 5676 }, { "epoch": 0.56, "grad_norm": 1.5172909498214722, "learning_rate": 2.190066290689621e-05, "loss": 2.6931, "step": 5680 }, { "epoch": 0.56, "grad_norm": 1.3563427925109863, "learning_rate": 2.1880874641337686e-05, "loss": 2.738, "step": 5684 }, { "epoch": 0.56, "grad_norm": 1.6162517070770264, "learning_rate": 2.1861086375779162e-05, "loss": 2.6649, "step": 5688 }, { "epoch": 0.56, "grad_norm": 1.6602249145507812, "learning_rate": 2.1841298110220638e-05, "loss": 2.6552, "step": 5692 }, { "epoch": 0.56, "grad_norm": 1.4889838695526123, "learning_rate": 2.1821509844662118e-05, "loss": 2.4506, "step": 5696 }, { "epoch": 0.56, "grad_norm": 1.5726191997528076, "learning_rate": 2.1801721579103594e-05, "loss": 2.696, "step": 5700 }, { "epoch": 0.56, "grad_norm": 1.7843947410583496, "learning_rate": 2.178193331354507e-05, "loss": 2.6979, "step": 5704 }, { "epoch": 0.56, "grad_norm": 1.5087847709655762, "learning_rate": 2.1762145047986547e-05, "loss": 2.5383, "step": 5708 }, { "epoch": 0.57, "grad_norm": 1.6036449670791626, "learning_rate": 2.1742356782428023e-05, "loss": 2.5809, "step": 5712 }, { "epoch": 0.57, "grad_norm": 1.578401803970337, "learning_rate": 2.17225685168695e-05, "loss": 2.7029, "step": 5716 }, { "epoch": 0.57, "grad_norm": 1.510796308517456, "learning_rate": 2.1702780251310972e-05, "loss": 2.4606, "step": 5720 }, { "epoch": 0.57, "grad_norm": 1.6872867345809937, "learning_rate": 2.168299198575245e-05, "loss": 2.6832, "step": 5724 }, { "epoch": 0.57, "grad_norm": 1.6195570230484009, "learning_rate": 2.1663203720193925e-05, "loss": 2.6499, "step": 5728 }, { "epoch": 0.57, "grad_norm": 1.4994075298309326, "learning_rate": 2.16434154546354e-05, "loss": 2.7302, "step": 5732 }, { "epoch": 0.57, "grad_norm": 1.5541733503341675, "learning_rate": 2.1623627189076878e-05, "loss": 2.6049, "step": 5736 }, { "epoch": 0.57, "grad_norm": 1.5584886074066162, "learning_rate": 2.1603838923518354e-05, "loss": 2.5976, "step": 5740 }, { "epoch": 0.57, "grad_norm": 1.5630571842193604, "learning_rate": 2.158405065795983e-05, "loss": 2.5042, "step": 5744 }, { "epoch": 0.57, "grad_norm": 1.4705463647842407, "learning_rate": 2.1564262392401307e-05, "loss": 2.6125, "step": 5748 }, { "epoch": 0.57, "grad_norm": 1.5950294733047485, "learning_rate": 2.1544474126842783e-05, "loss": 2.5934, "step": 5752 }, { "epoch": 0.57, "grad_norm": 1.4994343519210815, "learning_rate": 2.152468586128426e-05, "loss": 2.6526, "step": 5756 }, { "epoch": 0.57, "grad_norm": 1.4799262285232544, "learning_rate": 2.1504897595725736e-05, "loss": 2.6464, "step": 5760 }, { "epoch": 0.57, "grad_norm": 1.4674098491668701, "learning_rate": 2.1485109330167212e-05, "loss": 2.6307, "step": 5764 }, { "epoch": 0.57, "grad_norm": 1.519846796989441, "learning_rate": 2.146532106460869e-05, "loss": 2.6029, "step": 5768 }, { "epoch": 0.57, "grad_norm": 1.6543153524398804, "learning_rate": 2.1445532799050165e-05, "loss": 2.5781, "step": 5772 }, { "epoch": 0.57, "grad_norm": 1.8287581205368042, "learning_rate": 2.142574453349164e-05, "loss": 2.6109, "step": 5776 }, { "epoch": 0.57, "grad_norm": 1.6444793939590454, "learning_rate": 2.1405956267933117e-05, "loss": 2.7402, "step": 5780 }, { "epoch": 0.57, "grad_norm": 1.7062760591506958, "learning_rate": 2.1386168002374594e-05, "loss": 2.8537, "step": 5784 }, { "epoch": 0.57, "grad_norm": 1.794710636138916, "learning_rate": 2.136637973681607e-05, "loss": 2.8538, "step": 5788 }, { "epoch": 0.57, "grad_norm": 1.6265027523040771, "learning_rate": 2.1346591471257546e-05, "loss": 2.7576, "step": 5792 }, { "epoch": 0.57, "grad_norm": 1.637753963470459, "learning_rate": 2.1326803205699023e-05, "loss": 2.6573, "step": 5796 }, { "epoch": 0.57, "grad_norm": 1.5703392028808594, "learning_rate": 2.13070149401405e-05, "loss": 2.5352, "step": 5800 }, { "epoch": 0.57, "grad_norm": 1.7320841550827026, "learning_rate": 2.1287226674581972e-05, "loss": 2.6481, "step": 5804 }, { "epoch": 0.57, "grad_norm": 1.5625312328338623, "learning_rate": 2.1267438409023448e-05, "loss": 2.5449, "step": 5808 }, { "epoch": 0.58, "grad_norm": 1.4894498586654663, "learning_rate": 2.1247650143464924e-05, "loss": 2.7572, "step": 5812 }, { "epoch": 0.58, "grad_norm": 1.9753198623657227, "learning_rate": 2.12278618779064e-05, "loss": 2.6719, "step": 5816 }, { "epoch": 0.58, "grad_norm": 1.6790626049041748, "learning_rate": 2.1208073612347877e-05, "loss": 2.6557, "step": 5820 }, { "epoch": 0.58, "grad_norm": 1.5721454620361328, "learning_rate": 2.1188285346789353e-05, "loss": 2.5762, "step": 5824 }, { "epoch": 0.58, "grad_norm": 1.654955506324768, "learning_rate": 2.116849708123083e-05, "loss": 2.6838, "step": 5828 }, { "epoch": 0.58, "grad_norm": 1.5922191143035889, "learning_rate": 2.114870881567231e-05, "loss": 2.8244, "step": 5832 }, { "epoch": 0.58, "grad_norm": 1.5361075401306152, "learning_rate": 2.1128920550113786e-05, "loss": 2.6246, "step": 5836 }, { "epoch": 0.58, "grad_norm": 1.4784355163574219, "learning_rate": 2.1109132284555262e-05, "loss": 2.5971, "step": 5840 }, { "epoch": 0.58, "grad_norm": 1.9370943307876587, "learning_rate": 2.108934401899674e-05, "loss": 2.4188, "step": 5844 }, { "epoch": 0.58, "grad_norm": 1.6138865947723389, "learning_rate": 2.106955575343821e-05, "loss": 2.7327, "step": 5848 }, { "epoch": 0.58, "grad_norm": 1.6351326704025269, "learning_rate": 2.1049767487879688e-05, "loss": 2.6568, "step": 5852 }, { "epoch": 0.58, "grad_norm": 1.4805229902267456, "learning_rate": 2.1029979222321164e-05, "loss": 2.7057, "step": 5856 }, { "epoch": 0.58, "grad_norm": 1.7400959730148315, "learning_rate": 2.101019095676264e-05, "loss": 2.7251, "step": 5860 }, { "epoch": 0.58, "grad_norm": 1.5572409629821777, "learning_rate": 2.0990402691204117e-05, "loss": 2.5459, "step": 5864 }, { "epoch": 0.58, "grad_norm": 1.6387975215911865, "learning_rate": 2.0970614425645593e-05, "loss": 2.7208, "step": 5868 }, { "epoch": 0.58, "grad_norm": 1.6002295017242432, "learning_rate": 2.095082616008707e-05, "loss": 2.5598, "step": 5872 }, { "epoch": 0.58, "grad_norm": 1.6020545959472656, "learning_rate": 2.0931037894528546e-05, "loss": 2.6175, "step": 5876 }, { "epoch": 0.58, "grad_norm": 1.5802550315856934, "learning_rate": 2.0911249628970022e-05, "loss": 2.7237, "step": 5880 }, { "epoch": 0.58, "grad_norm": 1.6119384765625, "learning_rate": 2.0891461363411498e-05, "loss": 2.7419, "step": 5884 }, { "epoch": 0.58, "grad_norm": 1.748727798461914, "learning_rate": 2.087167309785297e-05, "loss": 2.5774, "step": 5888 }, { "epoch": 0.58, "grad_norm": 1.707313060760498, "learning_rate": 2.085188483229445e-05, "loss": 2.764, "step": 5892 }, { "epoch": 0.58, "grad_norm": 1.5973708629608154, "learning_rate": 2.0832096566735927e-05, "loss": 2.5781, "step": 5896 }, { "epoch": 0.58, "grad_norm": 1.5735652446746826, "learning_rate": 2.0812308301177404e-05, "loss": 2.6071, "step": 5900 }, { "epoch": 0.58, "grad_norm": 1.6533702611923218, "learning_rate": 2.079252003561888e-05, "loss": 2.7356, "step": 5904 }, { "epoch": 0.58, "grad_norm": 1.7489056587219238, "learning_rate": 2.0772731770060356e-05, "loss": 2.6429, "step": 5908 }, { "epoch": 0.58, "grad_norm": 1.793553352355957, "learning_rate": 2.0752943504501833e-05, "loss": 2.7273, "step": 5912 }, { "epoch": 0.59, "grad_norm": 1.7507030963897705, "learning_rate": 2.073315523894331e-05, "loss": 2.6684, "step": 5916 }, { "epoch": 0.59, "grad_norm": 1.6355698108673096, "learning_rate": 2.0713366973384785e-05, "loss": 2.5185, "step": 5920 }, { "epoch": 0.59, "grad_norm": 1.636668086051941, "learning_rate": 2.069357870782626e-05, "loss": 2.6138, "step": 5924 }, { "epoch": 0.59, "grad_norm": 2.161924123764038, "learning_rate": 2.0673790442267738e-05, "loss": 2.7274, "step": 5928 }, { "epoch": 0.59, "grad_norm": 1.6482921838760376, "learning_rate": 2.065400217670921e-05, "loss": 2.5224, "step": 5932 }, { "epoch": 0.59, "grad_norm": 1.7071282863616943, "learning_rate": 2.0634213911150687e-05, "loss": 2.514, "step": 5936 }, { "epoch": 0.59, "grad_norm": 1.6767092943191528, "learning_rate": 2.0614425645592163e-05, "loss": 2.5072, "step": 5940 }, { "epoch": 0.59, "grad_norm": 1.5524712800979614, "learning_rate": 2.059463738003364e-05, "loss": 2.402, "step": 5944 }, { "epoch": 0.59, "grad_norm": 1.7386505603790283, "learning_rate": 2.0574849114475116e-05, "loss": 2.5022, "step": 5948 }, { "epoch": 0.59, "grad_norm": 1.5279655456542969, "learning_rate": 2.0555060848916592e-05, "loss": 2.5708, "step": 5952 }, { "epoch": 0.59, "grad_norm": 1.6289914846420288, "learning_rate": 2.053527258335807e-05, "loss": 2.6689, "step": 5956 }, { "epoch": 0.59, "grad_norm": 1.536629557609558, "learning_rate": 2.0515484317799545e-05, "loss": 2.5802, "step": 5960 }, { "epoch": 0.59, "grad_norm": 1.6183369159698486, "learning_rate": 2.049569605224102e-05, "loss": 2.7748, "step": 5964 }, { "epoch": 0.59, "grad_norm": 1.5437310934066772, "learning_rate": 2.04759077866825e-05, "loss": 2.5857, "step": 5968 }, { "epoch": 0.59, "grad_norm": 1.4670217037200928, "learning_rate": 2.0456119521123974e-05, "loss": 2.7475, "step": 5972 }, { "epoch": 0.59, "grad_norm": 1.4012467861175537, "learning_rate": 2.043633125556545e-05, "loss": 2.4938, "step": 5976 }, { "epoch": 0.59, "grad_norm": 1.6246941089630127, "learning_rate": 2.0416542990006927e-05, "loss": 2.589, "step": 5980 }, { "epoch": 0.59, "grad_norm": 1.5825214385986328, "learning_rate": 2.0396754724448403e-05, "loss": 2.7124, "step": 5984 }, { "epoch": 0.59, "grad_norm": 1.598771572113037, "learning_rate": 2.037696645888988e-05, "loss": 2.604, "step": 5988 }, { "epoch": 0.59, "grad_norm": 1.6887823343276978, "learning_rate": 2.0357178193331356e-05, "loss": 2.463, "step": 5992 }, { "epoch": 0.59, "grad_norm": 1.5830531120300293, "learning_rate": 2.0337389927772832e-05, "loss": 2.5329, "step": 5996 }, { "epoch": 0.59, "grad_norm": 1.5762251615524292, "learning_rate": 2.0317601662214308e-05, "loss": 2.5809, "step": 6000 }, { "epoch": 0.59, "grad_norm": 1.8028018474578857, "learning_rate": 2.0297813396655785e-05, "loss": 2.5708, "step": 6004 }, { "epoch": 0.59, "grad_norm": 1.4725638628005981, "learning_rate": 2.027802513109726e-05, "loss": 2.7582, "step": 6008 }, { "epoch": 0.59, "grad_norm": 2.119457244873047, "learning_rate": 2.0258236865538737e-05, "loss": 2.7722, "step": 6012 }, { "epoch": 0.6, "grad_norm": 1.713395357131958, "learning_rate": 2.023844859998021e-05, "loss": 2.5151, "step": 6016 }, { "epoch": 0.6, "grad_norm": 1.5877728462219238, "learning_rate": 2.0218660334421686e-05, "loss": 2.647, "step": 6020 }, { "epoch": 0.6, "grad_norm": 1.6302303075790405, "learning_rate": 2.0198872068863163e-05, "loss": 2.6777, "step": 6024 }, { "epoch": 0.6, "grad_norm": 1.739067554473877, "learning_rate": 2.0179083803304642e-05, "loss": 2.736, "step": 6028 }, { "epoch": 0.6, "grad_norm": 1.683003306388855, "learning_rate": 2.015929553774612e-05, "loss": 2.5068, "step": 6032 }, { "epoch": 0.6, "grad_norm": 1.5176719427108765, "learning_rate": 2.0139507272187595e-05, "loss": 2.4223, "step": 6036 }, { "epoch": 0.6, "grad_norm": 1.7050352096557617, "learning_rate": 2.011971900662907e-05, "loss": 2.7499, "step": 6040 }, { "epoch": 0.6, "grad_norm": 1.6220310926437378, "learning_rate": 2.0099930741070548e-05, "loss": 2.6881, "step": 6044 }, { "epoch": 0.6, "grad_norm": 1.61378014087677, "learning_rate": 2.0080142475512024e-05, "loss": 2.7385, "step": 6048 }, { "epoch": 0.6, "grad_norm": 1.709804892539978, "learning_rate": 2.00603542099535e-05, "loss": 2.4782, "step": 6052 }, { "epoch": 0.6, "grad_norm": 1.5907514095306396, "learning_rate": 2.0040565944394973e-05, "loss": 2.6041, "step": 6056 }, { "epoch": 0.6, "grad_norm": 1.587908148765564, "learning_rate": 2.002077767883645e-05, "loss": 2.6641, "step": 6060 }, { "epoch": 0.6, "grad_norm": 1.4979619979858398, "learning_rate": 2.0000989413277926e-05, "loss": 2.6292, "step": 6064 }, { "epoch": 0.6, "grad_norm": 1.6397794485092163, "learning_rate": 1.9981201147719402e-05, "loss": 2.4687, "step": 6068 }, { "epoch": 0.6, "grad_norm": 1.5526165962219238, "learning_rate": 1.996141288216088e-05, "loss": 2.5717, "step": 6072 }, { "epoch": 0.6, "grad_norm": 1.6154141426086426, "learning_rate": 1.9941624616602355e-05, "loss": 2.4732, "step": 6076 }, { "epoch": 0.6, "grad_norm": 1.5267953872680664, "learning_rate": 1.992183635104383e-05, "loss": 2.5728, "step": 6080 }, { "epoch": 0.6, "grad_norm": 1.5741758346557617, "learning_rate": 1.9902048085485308e-05, "loss": 2.6762, "step": 6084 }, { "epoch": 0.6, "grad_norm": 1.4406962394714355, "learning_rate": 1.9882259819926784e-05, "loss": 2.4681, "step": 6088 }, { "epoch": 0.6, "grad_norm": 1.5490901470184326, "learning_rate": 1.986247155436826e-05, "loss": 2.7022, "step": 6092 }, { "epoch": 0.6, "grad_norm": 1.6361629962921143, "learning_rate": 1.9842683288809737e-05, "loss": 2.4699, "step": 6096 }, { "epoch": 0.6, "grad_norm": 1.4828660488128662, "learning_rate": 1.9822895023251213e-05, "loss": 2.4622, "step": 6100 }, { "epoch": 0.6, "grad_norm": 1.5388118028640747, "learning_rate": 1.980310675769269e-05, "loss": 2.5757, "step": 6104 }, { "epoch": 0.6, "grad_norm": 1.7506879568099976, "learning_rate": 1.9783318492134165e-05, "loss": 2.5815, "step": 6108 }, { "epoch": 0.6, "grad_norm": 1.6351765394210815, "learning_rate": 1.9763530226575642e-05, "loss": 2.7742, "step": 6112 }, { "epoch": 0.61, "grad_norm": 1.708899736404419, "learning_rate": 1.9743741961017118e-05, "loss": 2.5452, "step": 6116 }, { "epoch": 0.61, "grad_norm": 1.6561249494552612, "learning_rate": 1.9723953695458594e-05, "loss": 2.5306, "step": 6120 }, { "epoch": 0.61, "grad_norm": 1.5460957288742065, "learning_rate": 1.970416542990007e-05, "loss": 2.529, "step": 6124 }, { "epoch": 0.61, "grad_norm": 1.5808017253875732, "learning_rate": 1.9684377164341547e-05, "loss": 2.587, "step": 6128 }, { "epoch": 0.61, "grad_norm": 1.5812370777130127, "learning_rate": 1.9664588898783023e-05, "loss": 2.6806, "step": 6132 }, { "epoch": 0.61, "grad_norm": 1.6108824014663696, "learning_rate": 1.96448006332245e-05, "loss": 2.7478, "step": 6136 }, { "epoch": 0.61, "grad_norm": 1.501314401626587, "learning_rate": 1.9625012367665976e-05, "loss": 2.7401, "step": 6140 }, { "epoch": 0.61, "grad_norm": 1.5375041961669922, "learning_rate": 1.960522410210745e-05, "loss": 2.6419, "step": 6144 }, { "epoch": 0.61, "grad_norm": 1.5446549654006958, "learning_rate": 1.9585435836548925e-05, "loss": 2.4504, "step": 6148 }, { "epoch": 0.61, "grad_norm": 1.6299073696136475, "learning_rate": 1.95656475709904e-05, "loss": 2.79, "step": 6152 }, { "epoch": 0.61, "grad_norm": 1.5294485092163086, "learning_rate": 1.9545859305431878e-05, "loss": 2.5702, "step": 6156 }, { "epoch": 0.61, "grad_norm": 1.7888283729553223, "learning_rate": 1.9526071039873354e-05, "loss": 2.7481, "step": 6160 }, { "epoch": 0.61, "grad_norm": 1.5239052772521973, "learning_rate": 1.9506282774314834e-05, "loss": 2.7299, "step": 6164 }, { "epoch": 0.61, "grad_norm": 1.5439248085021973, "learning_rate": 1.948649450875631e-05, "loss": 2.4492, "step": 6168 }, { "epoch": 0.61, "grad_norm": 1.5575426816940308, "learning_rate": 1.9466706243197787e-05, "loss": 2.5791, "step": 6172 }, { "epoch": 0.61, "grad_norm": 1.6688716411590576, "learning_rate": 1.9446917977639263e-05, "loss": 2.5319, "step": 6176 }, { "epoch": 0.61, "grad_norm": 1.6760215759277344, "learning_rate": 1.942712971208074e-05, "loss": 2.7329, "step": 6180 }, { "epoch": 0.61, "grad_norm": 1.5455204248428345, "learning_rate": 1.9407341446522212e-05, "loss": 2.6252, "step": 6184 }, { "epoch": 0.61, "grad_norm": 1.5902513265609741, "learning_rate": 1.938755318096369e-05, "loss": 2.5223, "step": 6188 }, { "epoch": 0.61, "grad_norm": 1.5582561492919922, "learning_rate": 1.9367764915405165e-05, "loss": 2.5884, "step": 6192 }, { "epoch": 0.61, "grad_norm": 1.6034671068191528, "learning_rate": 1.934797664984664e-05, "loss": 2.6888, "step": 6196 }, { "epoch": 0.61, "grad_norm": 1.6465849876403809, "learning_rate": 1.9328188384288117e-05, "loss": 2.4756, "step": 6200 }, { "epoch": 0.61, "grad_norm": 1.6276373863220215, "learning_rate": 1.9308400118729594e-05, "loss": 2.5836, "step": 6204 }, { "epoch": 0.61, "grad_norm": 1.5838865041732788, "learning_rate": 1.928861185317107e-05, "loss": 2.6465, "step": 6208 }, { "epoch": 0.61, "grad_norm": 1.6422873735427856, "learning_rate": 1.9268823587612546e-05, "loss": 2.604, "step": 6212 }, { "epoch": 0.62, "grad_norm": 1.591674566268921, "learning_rate": 1.9249035322054023e-05, "loss": 2.5624, "step": 6216 }, { "epoch": 0.62, "grad_norm": 1.670748233795166, "learning_rate": 1.92292470564955e-05, "loss": 2.5349, "step": 6220 }, { "epoch": 0.62, "grad_norm": 1.5904476642608643, "learning_rate": 1.9209458790936975e-05, "loss": 2.4124, "step": 6224 }, { "epoch": 0.62, "grad_norm": 1.5372642278671265, "learning_rate": 1.9189670525378452e-05, "loss": 2.8056, "step": 6228 }, { "epoch": 0.62, "grad_norm": 1.6156257390975952, "learning_rate": 1.9169882259819928e-05, "loss": 2.3666, "step": 6232 }, { "epoch": 0.62, "grad_norm": 1.5358694791793823, "learning_rate": 1.9150093994261404e-05, "loss": 2.5517, "step": 6236 }, { "epoch": 0.62, "grad_norm": 1.6131166219711304, "learning_rate": 1.913030572870288e-05, "loss": 2.699, "step": 6240 }, { "epoch": 0.62, "grad_norm": 1.7722057104110718, "learning_rate": 1.9110517463144357e-05, "loss": 2.6431, "step": 6244 }, { "epoch": 0.62, "grad_norm": 1.7471140623092651, "learning_rate": 1.9090729197585833e-05, "loss": 2.7873, "step": 6248 }, { "epoch": 0.62, "grad_norm": 1.698923945426941, "learning_rate": 1.907094093202731e-05, "loss": 2.6642, "step": 6252 }, { "epoch": 0.62, "grad_norm": 1.5123364925384521, "learning_rate": 1.9051152666468786e-05, "loss": 2.6528, "step": 6256 }, { "epoch": 0.62, "grad_norm": 1.5572673082351685, "learning_rate": 1.9031364400910262e-05, "loss": 2.4895, "step": 6260 }, { "epoch": 0.62, "grad_norm": 1.8590190410614014, "learning_rate": 1.901157613535174e-05, "loss": 2.6208, "step": 6264 }, { "epoch": 0.62, "grad_norm": 1.452551245689392, "learning_rate": 1.899178786979321e-05, "loss": 2.5474, "step": 6268 }, { "epoch": 0.62, "grad_norm": 1.5120790004730225, "learning_rate": 1.8971999604234688e-05, "loss": 2.4449, "step": 6272 }, { "epoch": 0.62, "grad_norm": 1.7119253873825073, "learning_rate": 1.8952211338676164e-05, "loss": 2.5996, "step": 6276 }, { "epoch": 0.62, "grad_norm": 1.4548217058181763, "learning_rate": 1.893242307311764e-05, "loss": 2.6629, "step": 6280 }, { "epoch": 0.62, "grad_norm": 1.6181526184082031, "learning_rate": 1.8912634807559117e-05, "loss": 2.5607, "step": 6284 }, { "epoch": 0.62, "grad_norm": 1.5881283283233643, "learning_rate": 1.8892846542000593e-05, "loss": 2.7344, "step": 6288 }, { "epoch": 0.62, "grad_norm": 1.5534508228302002, "learning_rate": 1.887305827644207e-05, "loss": 2.5413, "step": 6292 }, { "epoch": 0.62, "grad_norm": 1.4985178709030151, "learning_rate": 1.8853270010883546e-05, "loss": 2.6648, "step": 6296 }, { "epoch": 0.62, "grad_norm": 1.698745846748352, "learning_rate": 1.8833481745325026e-05, "loss": 2.5784, "step": 6300 }, { "epoch": 0.62, "grad_norm": 1.42083740234375, "learning_rate": 1.8813693479766502e-05, "loss": 2.7346, "step": 6304 }, { "epoch": 0.62, "grad_norm": 1.6639600992202759, "learning_rate": 1.8793905214207978e-05, "loss": 2.7116, "step": 6308 }, { "epoch": 0.62, "grad_norm": 1.6525336503982544, "learning_rate": 1.877411694864945e-05, "loss": 2.696, "step": 6312 }, { "epoch": 0.62, "grad_norm": 1.6521825790405273, "learning_rate": 1.8754328683090927e-05, "loss": 2.4546, "step": 6316 }, { "epoch": 0.63, "grad_norm": 1.7254414558410645, "learning_rate": 1.8734540417532404e-05, "loss": 2.6165, "step": 6320 }, { "epoch": 0.63, "grad_norm": 1.47000253200531, "learning_rate": 1.871475215197388e-05, "loss": 2.4961, "step": 6324 }, { "epoch": 0.63, "grad_norm": 1.6371259689331055, "learning_rate": 1.8694963886415356e-05, "loss": 2.4793, "step": 6328 }, { "epoch": 0.63, "grad_norm": 1.928614854812622, "learning_rate": 1.8675175620856833e-05, "loss": 2.8014, "step": 6332 }, { "epoch": 0.63, "grad_norm": 1.5594316720962524, "learning_rate": 1.865538735529831e-05, "loss": 2.4129, "step": 6336 }, { "epoch": 0.63, "grad_norm": 1.5634496212005615, "learning_rate": 1.8635599089739785e-05, "loss": 2.5991, "step": 6340 }, { "epoch": 0.63, "grad_norm": 1.7114262580871582, "learning_rate": 1.861581082418126e-05, "loss": 2.6678, "step": 6344 }, { "epoch": 0.63, "grad_norm": 1.5006263256072998, "learning_rate": 1.8596022558622738e-05, "loss": 2.6402, "step": 6348 }, { "epoch": 0.63, "grad_norm": 1.6565409898757935, "learning_rate": 1.857623429306421e-05, "loss": 2.6237, "step": 6352 }, { "epoch": 0.63, "grad_norm": 1.792551875114441, "learning_rate": 1.855644602750569e-05, "loss": 2.4968, "step": 6356 }, { "epoch": 0.63, "grad_norm": 1.714000940322876, "learning_rate": 1.8536657761947167e-05, "loss": 2.5924, "step": 6360 }, { "epoch": 0.63, "grad_norm": 2.0721774101257324, "learning_rate": 1.8516869496388643e-05, "loss": 2.7513, "step": 6364 }, { "epoch": 0.63, "grad_norm": 1.5976616144180298, "learning_rate": 1.849708123083012e-05, "loss": 2.5484, "step": 6368 }, { "epoch": 0.63, "grad_norm": 1.5501139163970947, "learning_rate": 1.8477292965271596e-05, "loss": 2.6753, "step": 6372 }, { "epoch": 0.63, "grad_norm": 1.590151309967041, "learning_rate": 1.8457504699713072e-05, "loss": 2.5246, "step": 6376 }, { "epoch": 0.63, "grad_norm": 1.6754822731018066, "learning_rate": 1.843771643415455e-05, "loss": 2.6071, "step": 6380 }, { "epoch": 0.63, "grad_norm": 1.625549554824829, "learning_rate": 1.8417928168596025e-05, "loss": 2.7181, "step": 6384 }, { "epoch": 0.63, "grad_norm": 1.7473890781402588, "learning_rate": 1.83981399030375e-05, "loss": 2.5226, "step": 6388 }, { "epoch": 0.63, "grad_norm": 1.6598209142684937, "learning_rate": 1.8378351637478978e-05, "loss": 2.5456, "step": 6392 }, { "epoch": 0.63, "grad_norm": 1.5726118087768555, "learning_rate": 1.835856337192045e-05, "loss": 2.661, "step": 6396 }, { "epoch": 0.63, "grad_norm": 1.6404038667678833, "learning_rate": 1.8338775106361927e-05, "loss": 2.451, "step": 6400 }, { "epoch": 0.63, "grad_norm": 1.9309062957763672, "learning_rate": 1.8318986840803403e-05, "loss": 2.495, "step": 6404 }, { "epoch": 0.63, "grad_norm": 1.5471394062042236, "learning_rate": 1.829919857524488e-05, "loss": 2.6116, "step": 6408 }, { "epoch": 0.63, "grad_norm": 1.7516142129898071, "learning_rate": 1.8279410309686356e-05, "loss": 2.7225, "step": 6412 }, { "epoch": 0.63, "grad_norm": 1.534562110900879, "learning_rate": 1.8259622044127832e-05, "loss": 2.7206, "step": 6416 }, { "epoch": 0.64, "grad_norm": 1.9905803203582764, "learning_rate": 1.823983377856931e-05, "loss": 2.6701, "step": 6420 }, { "epoch": 0.64, "grad_norm": 1.5553815364837646, "learning_rate": 1.8220045513010785e-05, "loss": 2.5828, "step": 6424 }, { "epoch": 0.64, "grad_norm": 1.5264707803726196, "learning_rate": 1.820025724745226e-05, "loss": 2.5069, "step": 6428 }, { "epoch": 0.64, "grad_norm": 1.8345638513565063, "learning_rate": 1.8180468981893737e-05, "loss": 2.7473, "step": 6432 }, { "epoch": 0.64, "grad_norm": 1.6059725284576416, "learning_rate": 1.8160680716335214e-05, "loss": 2.6397, "step": 6436 }, { "epoch": 0.64, "grad_norm": 1.5249567031860352, "learning_rate": 1.814089245077669e-05, "loss": 2.6957, "step": 6440 }, { "epoch": 0.64, "grad_norm": 1.6217684745788574, "learning_rate": 1.8121104185218166e-05, "loss": 2.506, "step": 6444 }, { "epoch": 0.64, "grad_norm": 1.7154309749603271, "learning_rate": 1.8101315919659643e-05, "loss": 2.6848, "step": 6448 }, { "epoch": 0.64, "grad_norm": 1.553000807762146, "learning_rate": 1.808152765410112e-05, "loss": 2.4827, "step": 6452 }, { "epoch": 0.64, "grad_norm": 1.7927863597869873, "learning_rate": 1.8061739388542595e-05, "loss": 2.5527, "step": 6456 }, { "epoch": 0.64, "grad_norm": 1.4993387460708618, "learning_rate": 1.804195112298407e-05, "loss": 2.732, "step": 6460 }, { "epoch": 0.64, "grad_norm": 1.5990402698516846, "learning_rate": 1.8022162857425548e-05, "loss": 2.6775, "step": 6464 }, { "epoch": 0.64, "grad_norm": 1.5883631706237793, "learning_rate": 1.8002374591867024e-05, "loss": 2.5361, "step": 6468 }, { "epoch": 0.64, "grad_norm": 1.4568392038345337, "learning_rate": 1.79825863263085e-05, "loss": 2.7561, "step": 6472 }, { "epoch": 0.64, "grad_norm": 1.5289112329483032, "learning_rate": 1.7962798060749977e-05, "loss": 2.4471, "step": 6476 }, { "epoch": 0.64, "grad_norm": 1.6319639682769775, "learning_rate": 1.794300979519145e-05, "loss": 2.6101, "step": 6480 }, { "epoch": 0.64, "grad_norm": 1.5441527366638184, "learning_rate": 1.7923221529632926e-05, "loss": 2.5532, "step": 6484 }, { "epoch": 0.64, "grad_norm": 1.6211721897125244, "learning_rate": 1.7903433264074402e-05, "loss": 2.6647, "step": 6488 }, { "epoch": 0.64, "grad_norm": 1.587514877319336, "learning_rate": 1.7883644998515882e-05, "loss": 2.6054, "step": 6492 }, { "epoch": 0.64, "grad_norm": 1.764523983001709, "learning_rate": 1.786385673295736e-05, "loss": 2.5837, "step": 6496 }, { "epoch": 0.64, "grad_norm": 1.555456519126892, "learning_rate": 1.7844068467398835e-05, "loss": 2.6482, "step": 6500 }, { "epoch": 0.64, "grad_norm": 1.5590410232543945, "learning_rate": 1.782428020184031e-05, "loss": 2.834, "step": 6504 }, { "epoch": 0.64, "grad_norm": 1.5642460584640503, "learning_rate": 1.7804491936281787e-05, "loss": 2.6191, "step": 6508 }, { "epoch": 0.64, "grad_norm": 1.7200212478637695, "learning_rate": 1.7784703670723264e-05, "loss": 2.6683, "step": 6512 }, { "epoch": 0.64, "grad_norm": 1.682166576385498, "learning_rate": 1.776491540516474e-05, "loss": 2.5589, "step": 6516 }, { "epoch": 0.65, "grad_norm": 1.6353708505630493, "learning_rate": 1.7745127139606213e-05, "loss": 2.5668, "step": 6520 }, { "epoch": 0.65, "grad_norm": 1.5286115407943726, "learning_rate": 1.772533887404769e-05, "loss": 2.4588, "step": 6524 }, { "epoch": 0.65, "grad_norm": 1.6446589231491089, "learning_rate": 1.7705550608489166e-05, "loss": 2.6396, "step": 6528 }, { "epoch": 0.65, "grad_norm": 1.5995491743087769, "learning_rate": 1.7685762342930642e-05, "loss": 2.4982, "step": 6532 }, { "epoch": 0.65, "grad_norm": 1.5419460535049438, "learning_rate": 1.766597407737212e-05, "loss": 2.6596, "step": 6536 }, { "epoch": 0.65, "grad_norm": 1.5353944301605225, "learning_rate": 1.7646185811813595e-05, "loss": 2.5443, "step": 6540 }, { "epoch": 0.65, "grad_norm": 1.683626651763916, "learning_rate": 1.762639754625507e-05, "loss": 2.6385, "step": 6544 }, { "epoch": 0.65, "grad_norm": 1.5726920366287231, "learning_rate": 1.7606609280696547e-05, "loss": 2.7459, "step": 6548 }, { "epoch": 0.65, "grad_norm": 1.7215642929077148, "learning_rate": 1.7586821015138024e-05, "loss": 2.4393, "step": 6552 }, { "epoch": 0.65, "grad_norm": 1.5964510440826416, "learning_rate": 1.75670327495795e-05, "loss": 2.7072, "step": 6556 }, { "epoch": 0.65, "grad_norm": 1.8928728103637695, "learning_rate": 1.7547244484020976e-05, "loss": 2.664, "step": 6560 }, { "epoch": 0.65, "grad_norm": 1.6763553619384766, "learning_rate": 1.7527456218462453e-05, "loss": 2.4276, "step": 6564 }, { "epoch": 0.65, "grad_norm": 1.570327639579773, "learning_rate": 1.750766795290393e-05, "loss": 2.621, "step": 6568 }, { "epoch": 0.65, "grad_norm": 1.5460453033447266, "learning_rate": 1.7487879687345405e-05, "loss": 2.5846, "step": 6572 }, { "epoch": 0.65, "grad_norm": 1.5720558166503906, "learning_rate": 1.746809142178688e-05, "loss": 2.6405, "step": 6576 }, { "epoch": 0.65, "grad_norm": 1.6330138444900513, "learning_rate": 1.7448303156228358e-05, "loss": 2.6873, "step": 6580 }, { "epoch": 0.65, "grad_norm": 1.7572230100631714, "learning_rate": 1.7428514890669834e-05, "loss": 2.6039, "step": 6584 }, { "epoch": 0.65, "grad_norm": 1.5484024286270142, "learning_rate": 1.740872662511131e-05, "loss": 2.5413, "step": 6588 }, { "epoch": 0.65, "grad_norm": 1.8520413637161255, "learning_rate": 1.7388938359552787e-05, "loss": 2.6781, "step": 6592 }, { "epoch": 0.65, "grad_norm": 1.4541682004928589, "learning_rate": 1.7369150093994263e-05, "loss": 2.48, "step": 6596 }, { "epoch": 0.65, "grad_norm": 1.503838062286377, "learning_rate": 1.734936182843574e-05, "loss": 2.5952, "step": 6600 }, { "epoch": 0.65, "grad_norm": 1.73090660572052, "learning_rate": 1.7329573562877216e-05, "loss": 2.5526, "step": 6604 }, { "epoch": 0.65, "grad_norm": 1.8160631656646729, "learning_rate": 1.730978529731869e-05, "loss": 2.8937, "step": 6608 }, { "epoch": 0.65, "grad_norm": 1.646761178970337, "learning_rate": 1.7289997031760165e-05, "loss": 2.6505, "step": 6612 }, { "epoch": 0.65, "grad_norm": 1.8188313245773315, "learning_rate": 1.727020876620164e-05, "loss": 2.5133, "step": 6616 }, { "epoch": 0.65, "grad_norm": 1.4916778802871704, "learning_rate": 1.7250420500643118e-05, "loss": 2.7022, "step": 6620 }, { "epoch": 0.66, "grad_norm": 1.6285133361816406, "learning_rate": 1.7230632235084594e-05, "loss": 2.8066, "step": 6624 }, { "epoch": 0.66, "grad_norm": 1.8589283227920532, "learning_rate": 1.7210843969526074e-05, "loss": 2.6522, "step": 6628 }, { "epoch": 0.66, "grad_norm": 1.4183986186981201, "learning_rate": 1.719105570396755e-05, "loss": 2.4366, "step": 6632 }, { "epoch": 0.66, "grad_norm": 1.6243681907653809, "learning_rate": 1.7171267438409026e-05, "loss": 2.4741, "step": 6636 }, { "epoch": 0.66, "grad_norm": 1.6193912029266357, "learning_rate": 1.7151479172850503e-05, "loss": 2.5141, "step": 6640 }, { "epoch": 0.66, "grad_norm": 1.7768150568008423, "learning_rate": 1.713169090729198e-05, "loss": 2.5269, "step": 6644 }, { "epoch": 0.66, "grad_norm": 1.7248855829238892, "learning_rate": 1.7111902641733452e-05, "loss": 2.6412, "step": 6648 }, { "epoch": 0.66, "grad_norm": 1.6122647523880005, "learning_rate": 1.7092114376174928e-05, "loss": 2.6304, "step": 6652 }, { "epoch": 0.66, "grad_norm": 1.7491798400878906, "learning_rate": 1.7072326110616405e-05, "loss": 2.7254, "step": 6656 }, { "epoch": 0.66, "grad_norm": 1.5350676774978638, "learning_rate": 1.705253784505788e-05, "loss": 2.6792, "step": 6660 }, { "epoch": 0.66, "grad_norm": 1.5873485803604126, "learning_rate": 1.7032749579499357e-05, "loss": 2.7011, "step": 6664 }, { "epoch": 0.66, "grad_norm": 2.228262186050415, "learning_rate": 1.7012961313940834e-05, "loss": 2.4938, "step": 6668 }, { "epoch": 0.66, "grad_norm": 1.5721497535705566, "learning_rate": 1.699317304838231e-05, "loss": 2.502, "step": 6672 }, { "epoch": 0.66, "grad_norm": 1.7099807262420654, "learning_rate": 1.6973384782823786e-05, "loss": 2.5514, "step": 6676 }, { "epoch": 0.66, "grad_norm": 1.7094590663909912, "learning_rate": 1.6953596517265263e-05, "loss": 2.5132, "step": 6680 }, { "epoch": 0.66, "grad_norm": 1.536875605583191, "learning_rate": 1.693380825170674e-05, "loss": 2.5599, "step": 6684 }, { "epoch": 0.66, "grad_norm": 1.6018167734146118, "learning_rate": 1.6914019986148215e-05, "loss": 2.5827, "step": 6688 }, { "epoch": 0.66, "grad_norm": 1.582042932510376, "learning_rate": 1.689423172058969e-05, "loss": 2.6136, "step": 6692 }, { "epoch": 0.66, "grad_norm": 1.5590091943740845, "learning_rate": 1.6874443455031168e-05, "loss": 2.7667, "step": 6696 }, { "epoch": 0.66, "grad_norm": 1.6942031383514404, "learning_rate": 1.6854655189472644e-05, "loss": 2.6389, "step": 6700 }, { "epoch": 0.66, "grad_norm": 1.9794580936431885, "learning_rate": 1.683486692391412e-05, "loss": 2.4073, "step": 6704 }, { "epoch": 0.66, "grad_norm": 1.7205188274383545, "learning_rate": 1.6815078658355597e-05, "loss": 2.6566, "step": 6708 }, { "epoch": 0.66, "grad_norm": 1.6077735424041748, "learning_rate": 1.6795290392797073e-05, "loss": 2.5308, "step": 6712 }, { "epoch": 0.66, "grad_norm": 1.5309977531433105, "learning_rate": 1.677550212723855e-05, "loss": 2.6203, "step": 6716 }, { "epoch": 0.66, "grad_norm": 1.6211974620819092, "learning_rate": 1.6755713861680026e-05, "loss": 2.6303, "step": 6720 }, { "epoch": 0.67, "grad_norm": 1.4833593368530273, "learning_rate": 1.6735925596121502e-05, "loss": 2.4867, "step": 6724 }, { "epoch": 0.67, "grad_norm": 1.793488621711731, "learning_rate": 1.671613733056298e-05, "loss": 2.5169, "step": 6728 }, { "epoch": 0.67, "grad_norm": 1.5642069578170776, "learning_rate": 1.669634906500445e-05, "loss": 2.5192, "step": 6732 }, { "epoch": 0.67, "grad_norm": 1.84517240524292, "learning_rate": 1.6676560799445928e-05, "loss": 2.5577, "step": 6736 }, { "epoch": 0.67, "grad_norm": 1.5825902223587036, "learning_rate": 1.6656772533887404e-05, "loss": 2.5124, "step": 6740 }, { "epoch": 0.67, "grad_norm": 1.642572045326233, "learning_rate": 1.663698426832888e-05, "loss": 2.5925, "step": 6744 }, { "epoch": 0.67, "grad_norm": 1.5614920854568481, "learning_rate": 1.6617196002770357e-05, "loss": 2.64, "step": 6748 }, { "epoch": 0.67, "grad_norm": 1.5319268703460693, "learning_rate": 1.6597407737211833e-05, "loss": 2.3939, "step": 6752 }, { "epoch": 0.67, "grad_norm": 1.553107738494873, "learning_rate": 1.657761947165331e-05, "loss": 2.5153, "step": 6756 }, { "epoch": 0.67, "grad_norm": 1.5315675735473633, "learning_rate": 1.6557831206094786e-05, "loss": 2.4994, "step": 6760 }, { "epoch": 0.67, "grad_norm": 1.5716221332550049, "learning_rate": 1.6538042940536265e-05, "loss": 2.4707, "step": 6764 }, { "epoch": 0.67, "grad_norm": 2.1599626541137695, "learning_rate": 1.651825467497774e-05, "loss": 2.4953, "step": 6768 }, { "epoch": 0.67, "grad_norm": 1.762488603591919, "learning_rate": 1.6498466409419218e-05, "loss": 2.5602, "step": 6772 }, { "epoch": 0.67, "grad_norm": 1.584439754486084, "learning_rate": 1.647867814386069e-05, "loss": 2.5398, "step": 6776 }, { "epoch": 0.67, "grad_norm": 1.592529535293579, "learning_rate": 1.6458889878302167e-05, "loss": 2.504, "step": 6780 }, { "epoch": 0.67, "grad_norm": 1.6364370584487915, "learning_rate": 1.6439101612743643e-05, "loss": 2.5175, "step": 6784 }, { "epoch": 0.67, "grad_norm": 1.4882373809814453, "learning_rate": 1.641931334718512e-05, "loss": 2.506, "step": 6788 }, { "epoch": 0.67, "grad_norm": 1.6269882917404175, "learning_rate": 1.6399525081626596e-05, "loss": 2.5361, "step": 6792 }, { "epoch": 0.67, "grad_norm": 1.6206145286560059, "learning_rate": 1.6379736816068072e-05, "loss": 2.5575, "step": 6796 }, { "epoch": 0.67, "grad_norm": 1.5736054182052612, "learning_rate": 1.635994855050955e-05, "loss": 2.6325, "step": 6800 }, { "epoch": 0.67, "grad_norm": 1.6561203002929688, "learning_rate": 1.6340160284951025e-05, "loss": 2.4781, "step": 6804 }, { "epoch": 0.67, "grad_norm": 1.612779974937439, "learning_rate": 1.63203720193925e-05, "loss": 2.6126, "step": 6808 }, { "epoch": 0.67, "grad_norm": 1.7301238775253296, "learning_rate": 1.6300583753833978e-05, "loss": 2.6143, "step": 6812 }, { "epoch": 0.67, "grad_norm": 1.7517168521881104, "learning_rate": 1.628079548827545e-05, "loss": 2.6524, "step": 6816 }, { "epoch": 0.67, "grad_norm": 1.5563359260559082, "learning_rate": 1.6261007222716927e-05, "loss": 2.5678, "step": 6820 }, { "epoch": 0.68, "grad_norm": 1.7279319763183594, "learning_rate": 1.6241218957158407e-05, "loss": 2.6764, "step": 6824 }, { "epoch": 0.68, "grad_norm": 1.6466785669326782, "learning_rate": 1.6221430691599883e-05, "loss": 2.4621, "step": 6828 }, { "epoch": 0.68, "grad_norm": 1.578447937965393, "learning_rate": 1.620164242604136e-05, "loss": 2.6362, "step": 6832 }, { "epoch": 0.68, "grad_norm": 1.7554636001586914, "learning_rate": 1.6181854160482836e-05, "loss": 2.6348, "step": 6836 }, { "epoch": 0.68, "grad_norm": 1.8078508377075195, "learning_rate": 1.6162065894924312e-05, "loss": 2.4432, "step": 6840 }, { "epoch": 0.68, "grad_norm": 1.6505666971206665, "learning_rate": 1.6142277629365788e-05, "loss": 2.5918, "step": 6844 }, { "epoch": 0.68, "grad_norm": 1.6466803550720215, "learning_rate": 1.6122489363807265e-05, "loss": 2.7365, "step": 6848 }, { "epoch": 0.68, "grad_norm": 1.682653546333313, "learning_rate": 1.610270109824874e-05, "loss": 2.6347, "step": 6852 }, { "epoch": 0.68, "grad_norm": 1.5635805130004883, "learning_rate": 1.6082912832690217e-05, "loss": 2.4988, "step": 6856 }, { "epoch": 0.68, "grad_norm": 1.6750497817993164, "learning_rate": 1.606312456713169e-05, "loss": 2.4018, "step": 6860 }, { "epoch": 0.68, "grad_norm": 1.5516955852508545, "learning_rate": 1.6043336301573167e-05, "loss": 2.5583, "step": 6864 }, { "epoch": 0.68, "grad_norm": 1.7607613801956177, "learning_rate": 1.6023548036014643e-05, "loss": 2.4629, "step": 6868 }, { "epoch": 0.68, "grad_norm": 1.6372771263122559, "learning_rate": 1.600375977045612e-05, "loss": 2.7317, "step": 6872 }, { "epoch": 0.68, "grad_norm": 1.5994541645050049, "learning_rate": 1.5983971504897595e-05, "loss": 2.4516, "step": 6876 }, { "epoch": 0.68, "grad_norm": 1.6625773906707764, "learning_rate": 1.5964183239339072e-05, "loss": 2.5038, "step": 6880 }, { "epoch": 0.68, "grad_norm": 1.8105216026306152, "learning_rate": 1.5944394973780548e-05, "loss": 2.4708, "step": 6884 }, { "epoch": 0.68, "grad_norm": 1.7151480913162231, "learning_rate": 1.5924606708222024e-05, "loss": 2.4271, "step": 6888 }, { "epoch": 0.68, "grad_norm": 1.6709083318710327, "learning_rate": 1.59048184426635e-05, "loss": 2.7086, "step": 6892 }, { "epoch": 0.68, "grad_norm": 1.6832211017608643, "learning_rate": 1.5885030177104977e-05, "loss": 2.6396, "step": 6896 }, { "epoch": 0.68, "grad_norm": 1.6037044525146484, "learning_rate": 1.5865241911546453e-05, "loss": 2.3786, "step": 6900 }, { "epoch": 0.68, "grad_norm": 1.6449296474456787, "learning_rate": 1.584545364598793e-05, "loss": 2.6229, "step": 6904 }, { "epoch": 0.68, "grad_norm": 1.7927825450897217, "learning_rate": 1.5825665380429406e-05, "loss": 2.5644, "step": 6908 }, { "epoch": 0.68, "grad_norm": 1.5636272430419922, "learning_rate": 1.5805877114870882e-05, "loss": 2.5509, "step": 6912 }, { "epoch": 0.68, "grad_norm": 1.5654444694519043, "learning_rate": 1.578608884931236e-05, "loss": 2.4836, "step": 6916 }, { "epoch": 0.68, "grad_norm": 1.7937673330307007, "learning_rate": 1.5766300583753835e-05, "loss": 2.669, "step": 6920 }, { "epoch": 0.69, "grad_norm": 1.6205195188522339, "learning_rate": 1.574651231819531e-05, "loss": 2.7591, "step": 6924 }, { "epoch": 0.69, "grad_norm": 1.5670419931411743, "learning_rate": 1.5726724052636788e-05, "loss": 2.7567, "step": 6928 }, { "epoch": 0.69, "grad_norm": 1.5903569459915161, "learning_rate": 1.5706935787078264e-05, "loss": 2.505, "step": 6932 }, { "epoch": 0.69, "grad_norm": 1.7012027502059937, "learning_rate": 1.568714752151974e-05, "loss": 2.4375, "step": 6936 }, { "epoch": 0.69, "grad_norm": 1.5950981378555298, "learning_rate": 1.5667359255961217e-05, "loss": 2.6312, "step": 6940 }, { "epoch": 0.69, "grad_norm": 1.647019624710083, "learning_rate": 1.564757099040269e-05, "loss": 2.547, "step": 6944 }, { "epoch": 0.69, "grad_norm": 1.6310683488845825, "learning_rate": 1.5627782724844166e-05, "loss": 2.5012, "step": 6948 }, { "epoch": 0.69, "grad_norm": 1.5516809225082397, "learning_rate": 1.5607994459285642e-05, "loss": 2.6731, "step": 6952 }, { "epoch": 0.69, "grad_norm": 1.5987101793289185, "learning_rate": 1.558820619372712e-05, "loss": 2.4518, "step": 6956 }, { "epoch": 0.69, "grad_norm": 1.5934362411499023, "learning_rate": 1.5568417928168598e-05, "loss": 2.4448, "step": 6960 }, { "epoch": 0.69, "grad_norm": 1.6797455549240112, "learning_rate": 1.5548629662610075e-05, "loss": 2.6067, "step": 6964 }, { "epoch": 0.69, "grad_norm": 1.9424878358840942, "learning_rate": 1.552884139705155e-05, "loss": 2.7723, "step": 6968 }, { "epoch": 0.69, "grad_norm": 1.659338355064392, "learning_rate": 1.5509053131493027e-05, "loss": 2.5958, "step": 6972 }, { "epoch": 0.69, "grad_norm": 1.5043983459472656, "learning_rate": 1.5489264865934504e-05, "loss": 2.5618, "step": 6976 }, { "epoch": 0.69, "grad_norm": 1.7023625373840332, "learning_rate": 1.546947660037598e-05, "loss": 2.5923, "step": 6980 }, { "epoch": 0.69, "grad_norm": 2.293471097946167, "learning_rate": 1.5449688334817456e-05, "loss": 2.7371, "step": 6984 }, { "epoch": 0.69, "grad_norm": 1.6672747135162354, "learning_rate": 1.542990006925893e-05, "loss": 2.5335, "step": 6988 }, { "epoch": 0.69, "grad_norm": 1.706365704536438, "learning_rate": 1.5410111803700405e-05, "loss": 2.4963, "step": 6992 }, { "epoch": 0.69, "grad_norm": 1.7993139028549194, "learning_rate": 1.5390323538141882e-05, "loss": 2.5698, "step": 6996 }, { "epoch": 0.69, "grad_norm": 1.6543632745742798, "learning_rate": 1.5370535272583358e-05, "loss": 2.5999, "step": 7000 }, { "epoch": 0.69, "grad_norm": 1.583425760269165, "learning_rate": 1.5350747007024834e-05, "loss": 2.585, "step": 7004 }, { "epoch": 0.69, "grad_norm": 1.6158406734466553, "learning_rate": 1.533095874146631e-05, "loss": 2.6547, "step": 7008 }, { "epoch": 0.69, "grad_norm": 1.6902023553848267, "learning_rate": 1.5311170475907787e-05, "loss": 2.5624, "step": 7012 }, { "epoch": 0.69, "grad_norm": 1.5671786069869995, "learning_rate": 1.5291382210349263e-05, "loss": 2.5003, "step": 7016 }, { "epoch": 0.69, "grad_norm": 1.4708104133605957, "learning_rate": 1.527159394479074e-05, "loss": 2.4721, "step": 7020 }, { "epoch": 0.69, "grad_norm": 1.622897744178772, "learning_rate": 1.5251805679232218e-05, "loss": 2.6162, "step": 7024 }, { "epoch": 0.7, "grad_norm": 1.5571931600570679, "learning_rate": 1.523201741367369e-05, "loss": 2.5378, "step": 7028 }, { "epoch": 0.7, "grad_norm": 1.4922298192977905, "learning_rate": 1.5212229148115167e-05, "loss": 2.5156, "step": 7032 }, { "epoch": 0.7, "grad_norm": 2.0152604579925537, "learning_rate": 1.5192440882556643e-05, "loss": 2.4795, "step": 7036 }, { "epoch": 0.7, "grad_norm": 1.5429670810699463, "learning_rate": 1.517265261699812e-05, "loss": 2.5534, "step": 7040 }, { "epoch": 0.7, "grad_norm": 1.5789587497711182, "learning_rate": 1.5152864351439598e-05, "loss": 2.647, "step": 7044 }, { "epoch": 0.7, "grad_norm": 1.6377935409545898, "learning_rate": 1.5133076085881074e-05, "loss": 2.7136, "step": 7048 }, { "epoch": 0.7, "grad_norm": 1.6484628915786743, "learning_rate": 1.511328782032255e-05, "loss": 2.5635, "step": 7052 }, { "epoch": 0.7, "grad_norm": 1.7103774547576904, "learning_rate": 1.5093499554764027e-05, "loss": 2.767, "step": 7056 }, { "epoch": 0.7, "grad_norm": 1.6443290710449219, "learning_rate": 1.5073711289205503e-05, "loss": 2.5986, "step": 7060 }, { "epoch": 0.7, "grad_norm": 1.6503266096115112, "learning_rate": 1.505392302364698e-05, "loss": 2.4505, "step": 7064 }, { "epoch": 0.7, "grad_norm": 1.656258225440979, "learning_rate": 1.5034134758088456e-05, "loss": 2.5093, "step": 7068 }, { "epoch": 0.7, "grad_norm": 1.5531201362609863, "learning_rate": 1.501434649252993e-05, "loss": 2.5512, "step": 7072 }, { "epoch": 0.7, "grad_norm": 1.5085419416427612, "learning_rate": 1.4994558226971406e-05, "loss": 2.4362, "step": 7076 }, { "epoch": 0.7, "grad_norm": 1.5943050384521484, "learning_rate": 1.4974769961412883e-05, "loss": 2.5309, "step": 7080 }, { "epoch": 0.7, "grad_norm": 1.7004754543304443, "learning_rate": 1.4954981695854359e-05, "loss": 2.573, "step": 7084 }, { "epoch": 0.7, "grad_norm": 1.6682156324386597, "learning_rate": 1.4935193430295835e-05, "loss": 2.5718, "step": 7088 }, { "epoch": 0.7, "grad_norm": 1.489130973815918, "learning_rate": 1.4915405164737312e-05, "loss": 2.7728, "step": 7092 }, { "epoch": 0.7, "grad_norm": 1.72616708278656, "learning_rate": 1.4895616899178788e-05, "loss": 2.6457, "step": 7096 }, { "epoch": 0.7, "grad_norm": 1.5968120098114014, "learning_rate": 1.4875828633620264e-05, "loss": 2.4886, "step": 7100 }, { "epoch": 0.7, "grad_norm": 1.7065930366516113, "learning_rate": 1.485604036806174e-05, "loss": 2.6894, "step": 7104 }, { "epoch": 0.7, "grad_norm": 1.8437420129776, "learning_rate": 1.4836252102503217e-05, "loss": 2.6103, "step": 7108 }, { "epoch": 0.7, "grad_norm": 1.5512233972549438, "learning_rate": 1.4816463836944692e-05, "loss": 2.4657, "step": 7112 }, { "epoch": 0.7, "grad_norm": 1.6278142929077148, "learning_rate": 1.4796675571386168e-05, "loss": 2.7127, "step": 7116 }, { "epoch": 0.7, "grad_norm": 1.5600184202194214, "learning_rate": 1.4776887305827644e-05, "loss": 2.4707, "step": 7120 }, { "epoch": 0.7, "grad_norm": 1.5605080127716064, "learning_rate": 1.475709904026912e-05, "loss": 2.5009, "step": 7124 }, { "epoch": 0.71, "grad_norm": 1.6376603841781616, "learning_rate": 1.4737310774710597e-05, "loss": 2.4116, "step": 7128 }, { "epoch": 0.71, "grad_norm": 1.5430164337158203, "learning_rate": 1.4717522509152073e-05, "loss": 2.4315, "step": 7132 }, { "epoch": 0.71, "grad_norm": 1.9249303340911865, "learning_rate": 1.469773424359355e-05, "loss": 2.5128, "step": 7136 }, { "epoch": 0.71, "grad_norm": 1.914553165435791, "learning_rate": 1.4677945978035026e-05, "loss": 2.5726, "step": 7140 }, { "epoch": 0.71, "grad_norm": 1.6890403032302856, "learning_rate": 1.4658157712476502e-05, "loss": 2.5233, "step": 7144 }, { "epoch": 0.71, "grad_norm": 1.6215218305587769, "learning_rate": 1.463836944691798e-05, "loss": 2.4541, "step": 7148 }, { "epoch": 0.71, "grad_norm": 1.6023436784744263, "learning_rate": 1.4618581181359457e-05, "loss": 2.5825, "step": 7152 }, { "epoch": 0.71, "grad_norm": 1.6800665855407715, "learning_rate": 1.459879291580093e-05, "loss": 2.5275, "step": 7156 }, { "epoch": 0.71, "grad_norm": 1.5266000032424927, "learning_rate": 1.4579004650242406e-05, "loss": 2.6414, "step": 7160 }, { "epoch": 0.71, "grad_norm": 1.660843014717102, "learning_rate": 1.4559216384683882e-05, "loss": 2.4934, "step": 7164 }, { "epoch": 0.71, "grad_norm": 1.5643970966339111, "learning_rate": 1.4539428119125358e-05, "loss": 2.4847, "step": 7168 }, { "epoch": 0.71, "grad_norm": 1.4681652784347534, "learning_rate": 1.4519639853566835e-05, "loss": 2.423, "step": 7172 }, { "epoch": 0.71, "grad_norm": 1.5900237560272217, "learning_rate": 1.4499851588008311e-05, "loss": 2.5645, "step": 7176 }, { "epoch": 0.71, "grad_norm": 1.6541297435760498, "learning_rate": 1.4480063322449789e-05, "loss": 2.5434, "step": 7180 }, { "epoch": 0.71, "grad_norm": 1.5105384588241577, "learning_rate": 1.4460275056891265e-05, "loss": 2.6391, "step": 7184 }, { "epoch": 0.71, "grad_norm": 1.6248685121536255, "learning_rate": 1.4440486791332742e-05, "loss": 2.6772, "step": 7188 }, { "epoch": 0.71, "grad_norm": 1.559591293334961, "learning_rate": 1.4420698525774218e-05, "loss": 2.5175, "step": 7192 }, { "epoch": 0.71, "grad_norm": 1.73567533493042, "learning_rate": 1.4400910260215691e-05, "loss": 2.5799, "step": 7196 }, { "epoch": 0.71, "grad_norm": 1.602149248123169, "learning_rate": 1.4381121994657167e-05, "loss": 2.5475, "step": 7200 }, { "epoch": 0.71, "grad_norm": 1.6269313097000122, "learning_rate": 1.4361333729098644e-05, "loss": 2.5729, "step": 7204 }, { "epoch": 0.71, "grad_norm": 1.5919511318206787, "learning_rate": 1.4341545463540122e-05, "loss": 2.4055, "step": 7208 }, { "epoch": 0.71, "grad_norm": 1.9625754356384277, "learning_rate": 1.4321757197981598e-05, "loss": 2.6867, "step": 7212 }, { "epoch": 0.71, "grad_norm": 1.7777873277664185, "learning_rate": 1.4301968932423074e-05, "loss": 2.7518, "step": 7216 }, { "epoch": 0.71, "grad_norm": 1.632881760597229, "learning_rate": 1.428218066686455e-05, "loss": 2.3954, "step": 7220 }, { "epoch": 0.71, "grad_norm": 1.7910826206207275, "learning_rate": 1.4262392401306027e-05, "loss": 2.5477, "step": 7224 }, { "epoch": 0.72, "grad_norm": 1.6172887086868286, "learning_rate": 1.4242604135747503e-05, "loss": 2.6665, "step": 7228 }, { "epoch": 0.72, "grad_norm": 1.7775293588638306, "learning_rate": 1.422281587018898e-05, "loss": 2.5237, "step": 7232 }, { "epoch": 0.72, "grad_norm": 1.6558274030685425, "learning_rate": 1.4203027604630456e-05, "loss": 2.5909, "step": 7236 }, { "epoch": 0.72, "grad_norm": 1.6271092891693115, "learning_rate": 1.418323933907193e-05, "loss": 2.5299, "step": 7240 }, { "epoch": 0.72, "grad_norm": 1.5768835544586182, "learning_rate": 1.4163451073513407e-05, "loss": 2.5376, "step": 7244 }, { "epoch": 0.72, "grad_norm": 1.641741156578064, "learning_rate": 1.4143662807954883e-05, "loss": 2.5802, "step": 7248 }, { "epoch": 0.72, "grad_norm": 1.5866329669952393, "learning_rate": 1.412387454239636e-05, "loss": 2.4445, "step": 7252 }, { "epoch": 0.72, "grad_norm": 1.6858747005462646, "learning_rate": 1.4104086276837836e-05, "loss": 2.51, "step": 7256 }, { "epoch": 0.72, "grad_norm": 1.6350436210632324, "learning_rate": 1.4084298011279312e-05, "loss": 2.6297, "step": 7260 }, { "epoch": 0.72, "grad_norm": 1.7628724575042725, "learning_rate": 1.4064509745720788e-05, "loss": 2.6294, "step": 7264 }, { "epoch": 0.72, "grad_norm": 1.6731958389282227, "learning_rate": 1.4044721480162265e-05, "loss": 2.5577, "step": 7268 }, { "epoch": 0.72, "grad_norm": 1.7393099069595337, "learning_rate": 1.4024933214603741e-05, "loss": 2.6079, "step": 7272 }, { "epoch": 0.72, "grad_norm": 1.7639967203140259, "learning_rate": 1.4005144949045217e-05, "loss": 2.7389, "step": 7276 }, { "epoch": 0.72, "grad_norm": 1.5961958169937134, "learning_rate": 1.3985356683486692e-05, "loss": 2.5846, "step": 7280 }, { "epoch": 0.72, "grad_norm": 1.6526871919631958, "learning_rate": 1.3965568417928168e-05, "loss": 2.5965, "step": 7284 }, { "epoch": 0.72, "grad_norm": 1.7771737575531006, "learning_rate": 1.3945780152369645e-05, "loss": 2.5905, "step": 7288 }, { "epoch": 0.72, "grad_norm": 1.713395118713379, "learning_rate": 1.3925991886811121e-05, "loss": 2.5063, "step": 7292 }, { "epoch": 0.72, "grad_norm": 1.5294148921966553, "learning_rate": 1.3906203621252597e-05, "loss": 2.4081, "step": 7296 }, { "epoch": 0.72, "grad_norm": 1.6980494260787964, "learning_rate": 1.3886415355694074e-05, "loss": 2.4693, "step": 7300 }, { "epoch": 0.72, "grad_norm": 1.703873634338379, "learning_rate": 1.386662709013555e-05, "loss": 2.4529, "step": 7304 }, { "epoch": 0.72, "grad_norm": 1.5093457698822021, "learning_rate": 1.3846838824577026e-05, "loss": 2.439, "step": 7308 }, { "epoch": 0.72, "grad_norm": 1.6407102346420288, "learning_rate": 1.3827050559018503e-05, "loss": 2.4581, "step": 7312 }, { "epoch": 0.72, "grad_norm": 1.485217809677124, "learning_rate": 1.380726229345998e-05, "loss": 2.5162, "step": 7316 }, { "epoch": 0.72, "grad_norm": 1.5687867403030396, "learning_rate": 1.3787474027901457e-05, "loss": 2.4868, "step": 7320 }, { "epoch": 0.72, "grad_norm": 1.5972037315368652, "learning_rate": 1.376768576234293e-05, "loss": 2.5713, "step": 7324 }, { "epoch": 0.73, "grad_norm": 1.6357609033584595, "learning_rate": 1.3747897496784406e-05, "loss": 2.5051, "step": 7328 }, { "epoch": 0.73, "grad_norm": 1.7274508476257324, "learning_rate": 1.3728109231225883e-05, "loss": 2.596, "step": 7332 }, { "epoch": 0.73, "grad_norm": 1.9209715127944946, "learning_rate": 1.3708320965667359e-05, "loss": 2.7628, "step": 7336 }, { "epoch": 0.73, "grad_norm": 1.5816470384597778, "learning_rate": 1.3688532700108835e-05, "loss": 2.4371, "step": 7340 }, { "epoch": 0.73, "grad_norm": 1.6190577745437622, "learning_rate": 1.3668744434550313e-05, "loss": 2.4256, "step": 7344 }, { "epoch": 0.73, "grad_norm": 1.627508282661438, "learning_rate": 1.364895616899179e-05, "loss": 2.6653, "step": 7348 }, { "epoch": 0.73, "grad_norm": 1.6422009468078613, "learning_rate": 1.3629167903433266e-05, "loss": 2.6699, "step": 7352 }, { "epoch": 0.73, "grad_norm": 1.546067476272583, "learning_rate": 1.3609379637874742e-05, "loss": 2.4263, "step": 7356 }, { "epoch": 0.73, "grad_norm": 1.621297836303711, "learning_rate": 1.3589591372316218e-05, "loss": 2.4296, "step": 7360 }, { "epoch": 0.73, "grad_norm": 1.643576741218567, "learning_rate": 1.3569803106757691e-05, "loss": 2.5806, "step": 7364 }, { "epoch": 0.73, "grad_norm": 1.6136977672576904, "learning_rate": 1.3550014841199168e-05, "loss": 2.4887, "step": 7368 }, { "epoch": 0.73, "grad_norm": 1.704002857208252, "learning_rate": 1.3530226575640644e-05, "loss": 2.583, "step": 7372 }, { "epoch": 0.73, "grad_norm": 1.6694648265838623, "learning_rate": 1.3510438310082122e-05, "loss": 2.4127, "step": 7376 }, { "epoch": 0.73, "grad_norm": 1.6585307121276855, "learning_rate": 1.3490650044523598e-05, "loss": 2.4128, "step": 7380 }, { "epoch": 0.73, "grad_norm": 1.7365046739578247, "learning_rate": 1.3470861778965075e-05, "loss": 2.6269, "step": 7384 }, { "epoch": 0.73, "grad_norm": 1.6612383127212524, "learning_rate": 1.3451073513406551e-05, "loss": 2.5896, "step": 7388 }, { "epoch": 0.73, "grad_norm": 1.7151292562484741, "learning_rate": 1.3431285247848027e-05, "loss": 2.4705, "step": 7392 }, { "epoch": 0.73, "grad_norm": 1.608148455619812, "learning_rate": 1.3411496982289504e-05, "loss": 2.4956, "step": 7396 }, { "epoch": 0.73, "grad_norm": 1.9409472942352295, "learning_rate": 1.339170871673098e-05, "loss": 2.5163, "step": 7400 }, { "epoch": 0.73, "grad_norm": 1.6790823936462402, "learning_rate": 1.3371920451172456e-05, "loss": 2.5319, "step": 7404 }, { "epoch": 0.73, "grad_norm": 1.833620548248291, "learning_rate": 1.3352132185613931e-05, "loss": 2.5467, "step": 7408 }, { "epoch": 0.73, "grad_norm": 1.7631961107254028, "learning_rate": 1.3332343920055407e-05, "loss": 2.5748, "step": 7412 }, { "epoch": 0.73, "grad_norm": 1.5768530368804932, "learning_rate": 1.3312555654496884e-05, "loss": 2.4908, "step": 7416 }, { "epoch": 0.73, "grad_norm": 1.6658759117126465, "learning_rate": 1.329276738893836e-05, "loss": 2.5698, "step": 7420 }, { "epoch": 0.73, "grad_norm": 1.7477085590362549, "learning_rate": 1.3272979123379836e-05, "loss": 2.3864, "step": 7424 }, { "epoch": 0.73, "grad_norm": 1.6787872314453125, "learning_rate": 1.3253190857821313e-05, "loss": 2.5065, "step": 7428 }, { "epoch": 0.74, "grad_norm": 1.6128181219100952, "learning_rate": 1.3233402592262789e-05, "loss": 2.6635, "step": 7432 }, { "epoch": 0.74, "grad_norm": 1.4591010808944702, "learning_rate": 1.3213614326704265e-05, "loss": 2.4067, "step": 7436 }, { "epoch": 0.74, "grad_norm": 1.7754063606262207, "learning_rate": 1.3193826061145742e-05, "loss": 2.5308, "step": 7440 }, { "epoch": 0.74, "grad_norm": 1.708154559135437, "learning_rate": 1.3174037795587218e-05, "loss": 2.4604, "step": 7444 }, { "epoch": 0.74, "grad_norm": 1.539201021194458, "learning_rate": 1.3154249530028694e-05, "loss": 2.5372, "step": 7448 }, { "epoch": 0.74, "grad_norm": 1.8630115985870361, "learning_rate": 1.3134461264470169e-05, "loss": 2.5022, "step": 7452 }, { "epoch": 0.74, "grad_norm": 1.7771594524383545, "learning_rate": 1.3114672998911645e-05, "loss": 2.3826, "step": 7456 }, { "epoch": 0.74, "grad_norm": 1.8175123929977417, "learning_rate": 1.3094884733353121e-05, "loss": 2.5922, "step": 7460 }, { "epoch": 0.74, "grad_norm": 1.4767266511917114, "learning_rate": 1.3075096467794598e-05, "loss": 2.5334, "step": 7464 }, { "epoch": 0.74, "grad_norm": 1.7710050344467163, "learning_rate": 1.3055308202236074e-05, "loss": 2.5901, "step": 7468 }, { "epoch": 0.74, "grad_norm": 1.78278648853302, "learning_rate": 1.303551993667755e-05, "loss": 2.5621, "step": 7472 }, { "epoch": 0.74, "grad_norm": 1.553341269493103, "learning_rate": 1.3015731671119027e-05, "loss": 2.5296, "step": 7476 }, { "epoch": 0.74, "grad_norm": 1.4547873735427856, "learning_rate": 1.2995943405560505e-05, "loss": 2.5812, "step": 7480 }, { "epoch": 0.74, "grad_norm": 1.6034144163131714, "learning_rate": 1.2976155140001981e-05, "loss": 2.3507, "step": 7484 }, { "epoch": 0.74, "grad_norm": 1.6461652517318726, "learning_rate": 1.2956366874443457e-05, "loss": 2.6291, "step": 7488 }, { "epoch": 0.74, "grad_norm": 1.6804616451263428, "learning_rate": 1.293657860888493e-05, "loss": 2.4514, "step": 7492 }, { "epoch": 0.74, "grad_norm": 1.5887171030044556, "learning_rate": 1.2916790343326407e-05, "loss": 2.386, "step": 7496 }, { "epoch": 0.74, "grad_norm": 1.6213395595550537, "learning_rate": 1.2897002077767883e-05, "loss": 2.5569, "step": 7500 }, { "epoch": 0.74, "grad_norm": 1.7485177516937256, "learning_rate": 1.287721381220936e-05, "loss": 2.5454, "step": 7504 }, { "epoch": 0.74, "grad_norm": 1.660869836807251, "learning_rate": 1.2857425546650836e-05, "loss": 2.5177, "step": 7508 }, { "epoch": 0.74, "grad_norm": 1.674647331237793, "learning_rate": 1.2837637281092314e-05, "loss": 2.3855, "step": 7512 }, { "epoch": 0.74, "grad_norm": 2.053084373474121, "learning_rate": 1.281784901553379e-05, "loss": 2.7179, "step": 7516 }, { "epoch": 0.74, "grad_norm": 1.5778189897537231, "learning_rate": 1.2798060749975266e-05, "loss": 2.5341, "step": 7520 }, { "epoch": 0.74, "grad_norm": 1.670196294784546, "learning_rate": 1.2778272484416743e-05, "loss": 2.4186, "step": 7524 }, { "epoch": 0.74, "grad_norm": 1.7191470861434937, "learning_rate": 1.2758484218858219e-05, "loss": 2.4801, "step": 7528 }, { "epoch": 0.75, "grad_norm": 1.6857608556747437, "learning_rate": 1.2738695953299695e-05, "loss": 2.4867, "step": 7532 }, { "epoch": 0.75, "grad_norm": 1.6951642036437988, "learning_rate": 1.2718907687741168e-05, "loss": 2.5551, "step": 7536 }, { "epoch": 0.75, "grad_norm": 1.9836032390594482, "learning_rate": 1.2699119422182646e-05, "loss": 2.4202, "step": 7540 }, { "epoch": 0.75, "grad_norm": 1.6894361972808838, "learning_rate": 1.2679331156624122e-05, "loss": 2.5553, "step": 7544 }, { "epoch": 0.75, "grad_norm": 1.6373331546783447, "learning_rate": 1.2659542891065599e-05, "loss": 2.3549, "step": 7548 }, { "epoch": 0.75, "grad_norm": 1.5669854879379272, "learning_rate": 1.2639754625507075e-05, "loss": 2.4814, "step": 7552 }, { "epoch": 0.75, "grad_norm": 1.697510838508606, "learning_rate": 1.2619966359948551e-05, "loss": 2.5292, "step": 7556 }, { "epoch": 0.75, "grad_norm": 1.6051623821258545, "learning_rate": 1.2600178094390028e-05, "loss": 2.4635, "step": 7560 }, { "epoch": 0.75, "grad_norm": 1.550139307975769, "learning_rate": 1.2580389828831504e-05, "loss": 2.5052, "step": 7564 }, { "epoch": 0.75, "grad_norm": 1.8294285535812378, "learning_rate": 1.256060156327298e-05, "loss": 2.6251, "step": 7568 }, { "epoch": 0.75, "grad_norm": 1.731762170791626, "learning_rate": 1.2540813297714457e-05, "loss": 2.6812, "step": 7572 }, { "epoch": 0.75, "grad_norm": 1.7276403903961182, "learning_rate": 1.2521025032155931e-05, "loss": 2.5587, "step": 7576 }, { "epoch": 0.75, "grad_norm": 1.8602039813995361, "learning_rate": 1.2501236766597408e-05, "loss": 2.5649, "step": 7580 }, { "epoch": 0.75, "grad_norm": 1.5579040050506592, "learning_rate": 1.2481448501038886e-05, "loss": 2.6513, "step": 7584 }, { "epoch": 0.75, "grad_norm": 1.7131482362747192, "learning_rate": 1.246166023548036e-05, "loss": 2.4914, "step": 7588 }, { "epoch": 0.75, "grad_norm": 1.68119215965271, "learning_rate": 1.2441871969921837e-05, "loss": 2.5319, "step": 7592 }, { "epoch": 0.75, "grad_norm": 1.8896154165267944, "learning_rate": 1.2422083704363313e-05, "loss": 2.5683, "step": 7596 }, { "epoch": 0.75, "grad_norm": 1.5681391954421997, "learning_rate": 1.240229543880479e-05, "loss": 2.4047, "step": 7600 }, { "epoch": 0.75, "grad_norm": 1.4728906154632568, "learning_rate": 1.2382507173246266e-05, "loss": 2.5071, "step": 7604 }, { "epoch": 0.75, "grad_norm": 1.5875272750854492, "learning_rate": 1.2362718907687742e-05, "loss": 2.5415, "step": 7608 }, { "epoch": 0.75, "grad_norm": 1.6239516735076904, "learning_rate": 1.2342930642129218e-05, "loss": 2.4362, "step": 7612 }, { "epoch": 0.75, "grad_norm": 1.812610149383545, "learning_rate": 1.2323142376570695e-05, "loss": 2.5034, "step": 7616 }, { "epoch": 0.75, "grad_norm": 1.6190868616104126, "learning_rate": 1.2303354111012171e-05, "loss": 2.4786, "step": 7620 }, { "epoch": 0.75, "grad_norm": 1.6609690189361572, "learning_rate": 1.2283565845453647e-05, "loss": 2.372, "step": 7624 }, { "epoch": 0.75, "grad_norm": 1.700492024421692, "learning_rate": 1.2263777579895122e-05, "loss": 2.4527, "step": 7628 }, { "epoch": 0.76, "grad_norm": 1.8237019777297974, "learning_rate": 1.2243989314336598e-05, "loss": 2.548, "step": 7632 }, { "epoch": 0.76, "grad_norm": 1.558048129081726, "learning_rate": 1.2224201048778074e-05, "loss": 2.5439, "step": 7636 }, { "epoch": 0.76, "grad_norm": 1.7001163959503174, "learning_rate": 1.220441278321955e-05, "loss": 2.5146, "step": 7640 }, { "epoch": 0.76, "grad_norm": 1.5829248428344727, "learning_rate": 1.2184624517661027e-05, "loss": 2.5473, "step": 7644 }, { "epoch": 0.76, "grad_norm": 1.5922554731369019, "learning_rate": 1.2164836252102505e-05, "loss": 2.5482, "step": 7648 }, { "epoch": 0.76, "grad_norm": 1.5552897453308105, "learning_rate": 1.214504798654398e-05, "loss": 2.7146, "step": 7652 }, { "epoch": 0.76, "grad_norm": 1.6641972064971924, "learning_rate": 1.2125259720985456e-05, "loss": 2.5317, "step": 7656 }, { "epoch": 0.76, "grad_norm": 1.727538824081421, "learning_rate": 1.2105471455426932e-05, "loss": 2.511, "step": 7660 }, { "epoch": 0.76, "grad_norm": 1.7466334104537964, "learning_rate": 1.2085683189868409e-05, "loss": 2.6592, "step": 7664 }, { "epoch": 0.76, "grad_norm": 1.578805923461914, "learning_rate": 1.2065894924309885e-05, "loss": 2.5405, "step": 7668 }, { "epoch": 0.76, "grad_norm": 1.7704498767852783, "learning_rate": 1.204610665875136e-05, "loss": 2.7104, "step": 7672 }, { "epoch": 0.76, "grad_norm": 1.901253342628479, "learning_rate": 1.2026318393192838e-05, "loss": 2.68, "step": 7676 }, { "epoch": 0.76, "grad_norm": 1.6251075267791748, "learning_rate": 1.2006530127634314e-05, "loss": 2.5327, "step": 7680 }, { "epoch": 0.76, "grad_norm": 1.9433711767196655, "learning_rate": 1.198674186207579e-05, "loss": 2.664, "step": 7684 }, { "epoch": 0.76, "grad_norm": 1.6232560873031616, "learning_rate": 1.1966953596517267e-05, "loss": 2.3881, "step": 7688 }, { "epoch": 0.76, "grad_norm": 1.625562310218811, "learning_rate": 1.1947165330958741e-05, "loss": 2.3981, "step": 7692 }, { "epoch": 0.76, "grad_norm": 1.8203229904174805, "learning_rate": 1.1927377065400218e-05, "loss": 2.5587, "step": 7696 }, { "epoch": 0.76, "grad_norm": 1.5125036239624023, "learning_rate": 1.1907588799841694e-05, "loss": 2.548, "step": 7700 }, { "epoch": 0.76, "grad_norm": 1.7927870750427246, "learning_rate": 1.188780053428317e-05, "loss": 2.5317, "step": 7704 }, { "epoch": 0.76, "grad_norm": 1.853130578994751, "learning_rate": 1.1868012268724647e-05, "loss": 2.5032, "step": 7708 }, { "epoch": 0.76, "grad_norm": 1.8347195386886597, "learning_rate": 1.1848224003166123e-05, "loss": 2.3406, "step": 7712 }, { "epoch": 0.76, "grad_norm": 1.6292403936386108, "learning_rate": 1.18284357376076e-05, "loss": 2.4874, "step": 7716 }, { "epoch": 0.76, "grad_norm": 1.6553095579147339, "learning_rate": 1.1808647472049076e-05, "loss": 2.5065, "step": 7720 }, { "epoch": 0.76, "grad_norm": 1.9831068515777588, "learning_rate": 1.1788859206490552e-05, "loss": 2.5079, "step": 7724 }, { "epoch": 0.76, "grad_norm": 1.7396299839019775, "learning_rate": 1.1769070940932028e-05, "loss": 2.5171, "step": 7728 }, { "epoch": 0.77, "grad_norm": 1.8842509984970093, "learning_rate": 1.1749282675373505e-05, "loss": 2.6561, "step": 7732 }, { "epoch": 0.77, "grad_norm": 1.5185445547103882, "learning_rate": 1.1729494409814979e-05, "loss": 2.6638, "step": 7736 }, { "epoch": 0.77, "grad_norm": 1.7892464399337769, "learning_rate": 1.1709706144256455e-05, "loss": 2.4509, "step": 7740 }, { "epoch": 0.77, "grad_norm": 1.6770362854003906, "learning_rate": 1.1689917878697933e-05, "loss": 2.536, "step": 7744 }, { "epoch": 0.77, "grad_norm": 1.6587915420532227, "learning_rate": 1.167012961313941e-05, "loss": 2.515, "step": 7748 }, { "epoch": 0.77, "grad_norm": 1.7976752519607544, "learning_rate": 1.1650341347580886e-05, "loss": 2.4524, "step": 7752 }, { "epoch": 0.77, "grad_norm": 1.7145442962646484, "learning_rate": 1.163055308202236e-05, "loss": 2.56, "step": 7756 }, { "epoch": 0.77, "grad_norm": 1.6737264394760132, "learning_rate": 1.1610764816463837e-05, "loss": 2.45, "step": 7760 }, { "epoch": 0.77, "grad_norm": 1.683347225189209, "learning_rate": 1.1590976550905313e-05, "loss": 2.4508, "step": 7764 }, { "epoch": 0.77, "grad_norm": 1.6320801973342896, "learning_rate": 1.157118828534679e-05, "loss": 2.6438, "step": 7768 }, { "epoch": 0.77, "grad_norm": 1.8501825332641602, "learning_rate": 1.1551400019788266e-05, "loss": 2.4262, "step": 7772 }, { "epoch": 0.77, "grad_norm": 1.8582115173339844, "learning_rate": 1.1531611754229742e-05, "loss": 2.4314, "step": 7776 }, { "epoch": 0.77, "grad_norm": 1.5969737768173218, "learning_rate": 1.1511823488671219e-05, "loss": 2.4917, "step": 7780 }, { "epoch": 0.77, "grad_norm": 1.5447652339935303, "learning_rate": 1.1492035223112695e-05, "loss": 2.5134, "step": 7784 }, { "epoch": 0.77, "grad_norm": 1.7579408884048462, "learning_rate": 1.1472246957554171e-05, "loss": 2.4526, "step": 7788 }, { "epoch": 0.77, "grad_norm": 1.6958955526351929, "learning_rate": 1.1452458691995648e-05, "loss": 2.5719, "step": 7792 }, { "epoch": 0.77, "grad_norm": 1.6478573083877563, "learning_rate": 1.1432670426437124e-05, "loss": 2.5227, "step": 7796 }, { "epoch": 0.77, "grad_norm": 1.5438200235366821, "learning_rate": 1.1412882160878599e-05, "loss": 2.3835, "step": 7800 }, { "epoch": 0.77, "grad_norm": 1.5718754529953003, "learning_rate": 1.1393093895320075e-05, "loss": 2.4568, "step": 7804 }, { "epoch": 0.77, "grad_norm": 1.6531394720077515, "learning_rate": 1.1373305629761551e-05, "loss": 2.3707, "step": 7808 }, { "epoch": 0.77, "grad_norm": 1.8568472862243652, "learning_rate": 1.135351736420303e-05, "loss": 2.583, "step": 7812 }, { "epoch": 0.77, "grad_norm": 1.7812650203704834, "learning_rate": 1.1333729098644506e-05, "loss": 2.5865, "step": 7816 }, { "epoch": 0.77, "grad_norm": 1.7314536571502686, "learning_rate": 1.131394083308598e-05, "loss": 2.5384, "step": 7820 }, { "epoch": 0.77, "grad_norm": 1.8775840997695923, "learning_rate": 1.1294152567527457e-05, "loss": 2.6276, "step": 7824 }, { "epoch": 0.77, "grad_norm": 1.3948659896850586, "learning_rate": 1.1274364301968933e-05, "loss": 2.5302, "step": 7828 }, { "epoch": 0.77, "grad_norm": 1.690691590309143, "learning_rate": 1.125457603641041e-05, "loss": 2.5569, "step": 7832 }, { "epoch": 0.78, "grad_norm": 1.606754183769226, "learning_rate": 1.1234787770851885e-05, "loss": 2.3866, "step": 7836 }, { "epoch": 0.78, "grad_norm": 1.5848886966705322, "learning_rate": 1.1214999505293362e-05, "loss": 2.5407, "step": 7840 }, { "epoch": 0.78, "grad_norm": 1.5651729106903076, "learning_rate": 1.1195211239734838e-05, "loss": 2.2891, "step": 7844 }, { "epoch": 0.78, "grad_norm": 1.704177975654602, "learning_rate": 1.1175422974176314e-05, "loss": 2.6963, "step": 7848 }, { "epoch": 0.78, "grad_norm": 1.605334997177124, "learning_rate": 1.115563470861779e-05, "loss": 2.5576, "step": 7852 }, { "epoch": 0.78, "grad_norm": 1.530823826789856, "learning_rate": 1.1135846443059267e-05, "loss": 2.4859, "step": 7856 }, { "epoch": 0.78, "grad_norm": 1.6835219860076904, "learning_rate": 1.1116058177500742e-05, "loss": 2.462, "step": 7860 }, { "epoch": 0.78, "grad_norm": 1.6420457363128662, "learning_rate": 1.1096269911942218e-05, "loss": 2.6175, "step": 7864 }, { "epoch": 0.78, "grad_norm": 1.6330996751785278, "learning_rate": 1.1076481646383694e-05, "loss": 2.6308, "step": 7868 }, { "epoch": 0.78, "grad_norm": 2.031244993209839, "learning_rate": 1.105669338082517e-05, "loss": 2.4599, "step": 7872 }, { "epoch": 0.78, "grad_norm": 1.5154794454574585, "learning_rate": 1.1036905115266647e-05, "loss": 2.4949, "step": 7876 }, { "epoch": 0.78, "grad_norm": 1.7397664785385132, "learning_rate": 1.1017116849708125e-05, "loss": 2.3962, "step": 7880 }, { "epoch": 0.78, "grad_norm": 1.7765430212020874, "learning_rate": 1.09973285841496e-05, "loss": 2.4214, "step": 7884 }, { "epoch": 0.78, "grad_norm": 1.6177083253860474, "learning_rate": 1.0977540318591076e-05, "loss": 2.5221, "step": 7888 }, { "epoch": 0.78, "grad_norm": 1.6060839891433716, "learning_rate": 1.0957752053032552e-05, "loss": 2.6571, "step": 7892 }, { "epoch": 0.78, "grad_norm": 1.6707481145858765, "learning_rate": 1.0937963787474029e-05, "loss": 2.373, "step": 7896 }, { "epoch": 0.78, "grad_norm": 1.8542253971099854, "learning_rate": 1.0918175521915505e-05, "loss": 2.4029, "step": 7900 }, { "epoch": 0.78, "grad_norm": 1.6408201456069946, "learning_rate": 1.089838725635698e-05, "loss": 2.4065, "step": 7904 }, { "epoch": 0.78, "grad_norm": 1.5829615592956543, "learning_rate": 1.0878598990798458e-05, "loss": 2.6859, "step": 7908 }, { "epoch": 0.78, "grad_norm": 1.5252026319503784, "learning_rate": 1.0858810725239934e-05, "loss": 2.5326, "step": 7912 }, { "epoch": 0.78, "grad_norm": 1.5747402906417847, "learning_rate": 1.083902245968141e-05, "loss": 2.5311, "step": 7916 }, { "epoch": 0.78, "grad_norm": 1.592838168144226, "learning_rate": 1.0819234194122887e-05, "loss": 2.3582, "step": 7920 }, { "epoch": 0.78, "grad_norm": 1.5652263164520264, "learning_rate": 1.0799445928564361e-05, "loss": 2.5613, "step": 7924 }, { "epoch": 0.78, "grad_norm": 1.5245355367660522, "learning_rate": 1.0779657663005837e-05, "loss": 2.7002, "step": 7928 }, { "epoch": 0.78, "grad_norm": 1.724204421043396, "learning_rate": 1.0759869397447314e-05, "loss": 2.9039, "step": 7932 }, { "epoch": 0.79, "grad_norm": 1.7106647491455078, "learning_rate": 1.074008113188879e-05, "loss": 2.5086, "step": 7936 }, { "epoch": 0.79, "grad_norm": 1.5160934925079346, "learning_rate": 1.0720292866330266e-05, "loss": 2.447, "step": 7940 }, { "epoch": 0.79, "grad_norm": 1.833022952079773, "learning_rate": 1.0700504600771743e-05, "loss": 2.4753, "step": 7944 }, { "epoch": 0.79, "grad_norm": 1.7385578155517578, "learning_rate": 1.0680716335213219e-05, "loss": 2.6183, "step": 7948 }, { "epoch": 0.79, "grad_norm": 1.8667320013046265, "learning_rate": 1.0660928069654695e-05, "loss": 2.5605, "step": 7952 }, { "epoch": 0.79, "grad_norm": 1.5835785865783691, "learning_rate": 1.0641139804096172e-05, "loss": 2.4019, "step": 7956 }, { "epoch": 0.79, "grad_norm": 1.7108229398727417, "learning_rate": 1.0621351538537648e-05, "loss": 2.708, "step": 7960 }, { "epoch": 0.79, "grad_norm": 1.674804925918579, "learning_rate": 1.0601563272979124e-05, "loss": 2.5102, "step": 7964 }, { "epoch": 0.79, "grad_norm": 1.955705165863037, "learning_rate": 1.0581775007420599e-05, "loss": 2.5221, "step": 7968 }, { "epoch": 0.79, "grad_norm": 1.740157961845398, "learning_rate": 1.0561986741862075e-05, "loss": 2.5232, "step": 7972 }, { "epoch": 0.79, "grad_norm": 1.6809014081954956, "learning_rate": 1.0542198476303553e-05, "loss": 2.3372, "step": 7976 }, { "epoch": 0.79, "grad_norm": 1.7483688592910767, "learning_rate": 1.052241021074503e-05, "loss": 2.4913, "step": 7980 }, { "epoch": 0.79, "grad_norm": 1.5279000997543335, "learning_rate": 1.0502621945186506e-05, "loss": 2.6386, "step": 7984 }, { "epoch": 0.79, "grad_norm": 1.7373197078704834, "learning_rate": 1.048283367962798e-05, "loss": 2.5823, "step": 7988 }, { "epoch": 0.79, "grad_norm": 1.6561895608901978, "learning_rate": 1.0463045414069457e-05, "loss": 2.4531, "step": 7992 }, { "epoch": 0.79, "grad_norm": 1.6324809789657593, "learning_rate": 1.0443257148510933e-05, "loss": 2.4943, "step": 7996 }, { "epoch": 0.79, "grad_norm": 1.4932702779769897, "learning_rate": 1.042346888295241e-05, "loss": 2.5745, "step": 8000 }, { "epoch": 0.79, "grad_norm": 1.6975860595703125, "learning_rate": 1.0403680617393886e-05, "loss": 2.5195, "step": 8004 }, { "epoch": 0.79, "grad_norm": 1.5818710327148438, "learning_rate": 1.0383892351835362e-05, "loss": 2.5146, "step": 8008 }, { "epoch": 0.79, "grad_norm": 1.6648439168930054, "learning_rate": 1.0364104086276839e-05, "loss": 2.5038, "step": 8012 }, { "epoch": 0.79, "grad_norm": 1.8664870262145996, "learning_rate": 1.0344315820718315e-05, "loss": 2.4395, "step": 8016 }, { "epoch": 0.79, "grad_norm": 1.6299059391021729, "learning_rate": 1.0324527555159791e-05, "loss": 2.632, "step": 8020 }, { "epoch": 0.79, "grad_norm": 1.6017898321151733, "learning_rate": 1.0304739289601268e-05, "loss": 2.523, "step": 8024 }, { "epoch": 0.79, "grad_norm": 1.8422410488128662, "learning_rate": 1.0284951024042744e-05, "loss": 2.5135, "step": 8028 }, { "epoch": 0.79, "grad_norm": 1.7916709184646606, "learning_rate": 1.0265162758484218e-05, "loss": 2.3953, "step": 8032 }, { "epoch": 0.8, "grad_norm": 1.6924244165420532, "learning_rate": 1.0245374492925695e-05, "loss": 2.5083, "step": 8036 }, { "epoch": 0.8, "grad_norm": 1.6407674551010132, "learning_rate": 1.0225586227367171e-05, "loss": 2.5489, "step": 8040 }, { "epoch": 0.8, "grad_norm": 2.751863718032837, "learning_rate": 1.0205797961808649e-05, "loss": 2.5538, "step": 8044 }, { "epoch": 0.8, "grad_norm": 1.8018790483474731, "learning_rate": 1.0186009696250125e-05, "loss": 2.4931, "step": 8048 }, { "epoch": 0.8, "grad_norm": 1.6749464273452759, "learning_rate": 1.01662214306916e-05, "loss": 2.5116, "step": 8052 }, { "epoch": 0.8, "grad_norm": 1.8620166778564453, "learning_rate": 1.0146433165133076e-05, "loss": 2.4857, "step": 8056 }, { "epoch": 0.8, "grad_norm": 1.8505146503448486, "learning_rate": 1.0126644899574553e-05, "loss": 2.535, "step": 8060 }, { "epoch": 0.8, "grad_norm": 1.4946340322494507, "learning_rate": 1.0106856634016029e-05, "loss": 2.3432, "step": 8064 }, { "epoch": 0.8, "grad_norm": 1.8005412817001343, "learning_rate": 1.0087068368457505e-05, "loss": 2.4651, "step": 8068 }, { "epoch": 0.8, "grad_norm": 1.786773443222046, "learning_rate": 1.006728010289898e-05, "loss": 2.4242, "step": 8072 }, { "epoch": 0.8, "grad_norm": 1.7355870008468628, "learning_rate": 1.0047491837340458e-05, "loss": 2.4785, "step": 8076 }, { "epoch": 0.8, "grad_norm": 1.677388310432434, "learning_rate": 1.0027703571781934e-05, "loss": 2.5572, "step": 8080 }, { "epoch": 0.8, "grad_norm": 1.7865822315216064, "learning_rate": 1.000791530622341e-05, "loss": 2.771, "step": 8084 }, { "epoch": 0.8, "grad_norm": 1.8583182096481323, "learning_rate": 9.988127040664887e-06, "loss": 2.4894, "step": 8088 }, { "epoch": 0.8, "grad_norm": 1.6651523113250732, "learning_rate": 9.968338775106362e-06, "loss": 2.5608, "step": 8092 }, { "epoch": 0.8, "grad_norm": 1.5435396432876587, "learning_rate": 9.948550509547838e-06, "loss": 2.5702, "step": 8096 }, { "epoch": 0.8, "grad_norm": 1.6652929782867432, "learning_rate": 9.928762243989314e-06, "loss": 2.5155, "step": 8100 }, { "epoch": 0.8, "grad_norm": 1.5997744798660278, "learning_rate": 9.90897397843079e-06, "loss": 2.412, "step": 8104 }, { "epoch": 0.8, "grad_norm": 1.518398404121399, "learning_rate": 9.889185712872267e-06, "loss": 2.4749, "step": 8108 }, { "epoch": 0.8, "grad_norm": 1.6166468858718872, "learning_rate": 9.869397447313745e-06, "loss": 2.528, "step": 8112 }, { "epoch": 0.8, "grad_norm": 1.7324260473251343, "learning_rate": 9.84960918175522e-06, "loss": 2.4854, "step": 8116 }, { "epoch": 0.8, "grad_norm": 1.6371716260910034, "learning_rate": 9.829820916196696e-06, "loss": 2.5037, "step": 8120 }, { "epoch": 0.8, "grad_norm": 1.711093544960022, "learning_rate": 9.810032650638172e-06, "loss": 2.6155, "step": 8124 }, { "epoch": 0.8, "grad_norm": 1.6455191373825073, "learning_rate": 9.790244385079648e-06, "loss": 2.4078, "step": 8128 }, { "epoch": 0.8, "grad_norm": 1.6366294622421265, "learning_rate": 9.770456119521125e-06, "loss": 2.4642, "step": 8132 }, { "epoch": 0.8, "grad_norm": 1.668990969657898, "learning_rate": 9.7506678539626e-06, "loss": 2.5074, "step": 8136 }, { "epoch": 0.81, "grad_norm": 1.5362697839736938, "learning_rate": 9.730879588404076e-06, "loss": 2.4896, "step": 8140 }, { "epoch": 0.81, "grad_norm": 1.8022223711013794, "learning_rate": 9.711091322845554e-06, "loss": 2.5353, "step": 8144 }, { "epoch": 0.81, "grad_norm": 1.6878408193588257, "learning_rate": 9.69130305728703e-06, "loss": 2.5586, "step": 8148 }, { "epoch": 0.81, "grad_norm": 1.689186692237854, "learning_rate": 9.671514791728506e-06, "loss": 2.655, "step": 8152 }, { "epoch": 0.81, "grad_norm": 1.6996088027954102, "learning_rate": 9.651726526169981e-06, "loss": 2.7126, "step": 8156 }, { "epoch": 0.81, "grad_norm": 2.8080925941467285, "learning_rate": 9.631938260611457e-06, "loss": 2.7099, "step": 8160 }, { "epoch": 0.81, "grad_norm": 1.7663508653640747, "learning_rate": 9.612149995052934e-06, "loss": 2.4897, "step": 8164 }, { "epoch": 0.81, "grad_norm": 1.7868987321853638, "learning_rate": 9.59236172949441e-06, "loss": 2.4844, "step": 8168 }, { "epoch": 0.81, "grad_norm": 1.6591540575027466, "learning_rate": 9.572573463935886e-06, "loss": 2.4604, "step": 8172 }, { "epoch": 0.81, "grad_norm": 1.7677662372589111, "learning_rate": 9.552785198377363e-06, "loss": 2.5354, "step": 8176 }, { "epoch": 0.81, "grad_norm": 1.6505751609802246, "learning_rate": 9.532996932818839e-06, "loss": 2.5834, "step": 8180 }, { "epoch": 0.81, "grad_norm": 1.7923264503479004, "learning_rate": 9.513208667260315e-06, "loss": 2.5445, "step": 8184 }, { "epoch": 0.81, "grad_norm": 1.6527979373931885, "learning_rate": 9.493420401701792e-06, "loss": 2.3743, "step": 8188 }, { "epoch": 0.81, "grad_norm": 1.490796446800232, "learning_rate": 9.473632136143268e-06, "loss": 2.4511, "step": 8192 }, { "epoch": 0.81, "grad_norm": 1.6742923259735107, "learning_rate": 9.453843870584744e-06, "loss": 2.5265, "step": 8196 }, { "epoch": 0.81, "grad_norm": 1.6631675958633423, "learning_rate": 9.434055605026219e-06, "loss": 2.3355, "step": 8200 }, { "epoch": 0.81, "grad_norm": 1.9126391410827637, "learning_rate": 9.414267339467695e-06, "loss": 2.5204, "step": 8204 }, { "epoch": 0.81, "grad_norm": 1.6832706928253174, "learning_rate": 9.394479073909172e-06, "loss": 2.541, "step": 8208 }, { "epoch": 0.81, "grad_norm": 1.8622807264328003, "learning_rate": 9.37469080835065e-06, "loss": 2.6101, "step": 8212 }, { "epoch": 0.81, "grad_norm": 1.7272640466690063, "learning_rate": 9.354902542792126e-06, "loss": 2.7245, "step": 8216 }, { "epoch": 0.81, "grad_norm": 1.6699191331863403, "learning_rate": 9.3351142772336e-06, "loss": 2.4707, "step": 8220 }, { "epoch": 0.81, "grad_norm": 1.599859595298767, "learning_rate": 9.315326011675077e-06, "loss": 2.5186, "step": 8224 }, { "epoch": 0.81, "grad_norm": 1.6403051614761353, "learning_rate": 9.295537746116553e-06, "loss": 2.6446, "step": 8228 }, { "epoch": 0.81, "grad_norm": 1.6128116846084595, "learning_rate": 9.27574948055803e-06, "loss": 2.6126, "step": 8232 }, { "epoch": 0.81, "grad_norm": 1.588544487953186, "learning_rate": 9.255961214999506e-06, "loss": 2.6034, "step": 8236 }, { "epoch": 0.82, "grad_norm": 1.7381192445755005, "learning_rate": 9.236172949440982e-06, "loss": 2.6789, "step": 8240 }, { "epoch": 0.82, "grad_norm": 1.6580698490142822, "learning_rate": 9.216384683882458e-06, "loss": 2.6275, "step": 8244 }, { "epoch": 0.82, "grad_norm": 1.779470443725586, "learning_rate": 9.196596418323935e-06, "loss": 2.6094, "step": 8248 }, { "epoch": 0.82, "grad_norm": 1.6793197393417358, "learning_rate": 9.176808152765411e-06, "loss": 2.3446, "step": 8252 }, { "epoch": 0.82, "grad_norm": 1.7751526832580566, "learning_rate": 9.157019887206887e-06, "loss": 2.5427, "step": 8256 }, { "epoch": 0.82, "grad_norm": 1.687898874282837, "learning_rate": 9.137231621648364e-06, "loss": 2.5313, "step": 8260 }, { "epoch": 0.82, "grad_norm": 1.5011862516403198, "learning_rate": 9.117443356089838e-06, "loss": 2.4652, "step": 8264 }, { "epoch": 0.82, "grad_norm": 1.7039753198623657, "learning_rate": 9.097655090531315e-06, "loss": 2.5125, "step": 8268 }, { "epoch": 0.82, "grad_norm": 1.7907326221466064, "learning_rate": 9.077866824972791e-06, "loss": 2.4032, "step": 8272 }, { "epoch": 0.82, "grad_norm": 1.689347267150879, "learning_rate": 9.058078559414267e-06, "loss": 2.4636, "step": 8276 }, { "epoch": 0.82, "grad_norm": 1.7220097780227661, "learning_rate": 9.038290293855745e-06, "loss": 2.4757, "step": 8280 }, { "epoch": 0.82, "grad_norm": 1.8284951448440552, "learning_rate": 9.01850202829722e-06, "loss": 2.473, "step": 8284 }, { "epoch": 0.82, "grad_norm": 1.7803459167480469, "learning_rate": 8.998713762738696e-06, "loss": 2.4781, "step": 8288 }, { "epoch": 0.82, "grad_norm": 1.6893064975738525, "learning_rate": 8.978925497180173e-06, "loss": 2.5115, "step": 8292 }, { "epoch": 0.82, "grad_norm": 1.8316203355789185, "learning_rate": 8.959137231621649e-06, "loss": 2.5759, "step": 8296 }, { "epoch": 0.82, "grad_norm": 1.6714107990264893, "learning_rate": 8.939348966063125e-06, "loss": 2.5676, "step": 8300 }, { "epoch": 0.82, "grad_norm": 1.5850313901901245, "learning_rate": 8.9195607005046e-06, "loss": 2.3758, "step": 8304 }, { "epoch": 0.82, "grad_norm": 1.595004677772522, "learning_rate": 8.899772434946078e-06, "loss": 2.4782, "step": 8308 }, { "epoch": 0.82, "grad_norm": 1.756579041481018, "learning_rate": 8.879984169387554e-06, "loss": 2.4742, "step": 8312 }, { "epoch": 0.82, "grad_norm": 1.601770043373108, "learning_rate": 8.86019590382903e-06, "loss": 2.3767, "step": 8316 }, { "epoch": 0.82, "grad_norm": 1.8463127613067627, "learning_rate": 8.840407638270507e-06, "loss": 2.5466, "step": 8320 }, { "epoch": 0.82, "grad_norm": 1.8509941101074219, "learning_rate": 8.820619372711983e-06, "loss": 2.606, "step": 8324 }, { "epoch": 0.82, "grad_norm": 1.6294124126434326, "learning_rate": 8.800831107153458e-06, "loss": 2.4114, "step": 8328 }, { "epoch": 0.82, "grad_norm": 1.5632402896881104, "learning_rate": 8.781042841594934e-06, "loss": 2.3389, "step": 8332 }, { "epoch": 0.82, "grad_norm": 1.602977991104126, "learning_rate": 8.76125457603641e-06, "loss": 2.5443, "step": 8336 }, { "epoch": 0.83, "grad_norm": 1.8994452953338623, "learning_rate": 8.741466310477887e-06, "loss": 2.4973, "step": 8340 }, { "epoch": 0.83, "grad_norm": 1.703715205192566, "learning_rate": 8.721678044919363e-06, "loss": 2.3838, "step": 8344 }, { "epoch": 0.83, "grad_norm": 1.7744590044021606, "learning_rate": 8.70188977936084e-06, "loss": 2.2877, "step": 8348 }, { "epoch": 0.83, "grad_norm": 1.7415697574615479, "learning_rate": 8.682101513802316e-06, "loss": 2.5678, "step": 8352 }, { "epoch": 0.83, "grad_norm": 1.6912627220153809, "learning_rate": 8.662313248243792e-06, "loss": 2.3746, "step": 8356 }, { "epoch": 0.83, "grad_norm": 1.591155767440796, "learning_rate": 8.642524982685268e-06, "loss": 2.4637, "step": 8360 }, { "epoch": 0.83, "grad_norm": 1.625097393989563, "learning_rate": 8.622736717126745e-06, "loss": 2.4071, "step": 8364 }, { "epoch": 0.83, "grad_norm": 1.7765227556228638, "learning_rate": 8.60294845156822e-06, "loss": 2.4339, "step": 8368 }, { "epoch": 0.83, "grad_norm": 1.7714086771011353, "learning_rate": 8.583160186009696e-06, "loss": 2.4657, "step": 8372 }, { "epoch": 0.83, "grad_norm": 1.5393376350402832, "learning_rate": 8.563371920451174e-06, "loss": 2.4295, "step": 8376 }, { "epoch": 0.83, "grad_norm": 1.6363029479980469, "learning_rate": 8.54358365489265e-06, "loss": 2.3262, "step": 8380 }, { "epoch": 0.83, "grad_norm": 1.6590903997421265, "learning_rate": 8.523795389334126e-06, "loss": 2.5488, "step": 8384 }, { "epoch": 0.83, "grad_norm": 2.057309150695801, "learning_rate": 8.504007123775601e-06, "loss": 2.4571, "step": 8388 }, { "epoch": 0.83, "grad_norm": 1.643871784210205, "learning_rate": 8.484218858217077e-06, "loss": 2.5755, "step": 8392 }, { "epoch": 0.83, "grad_norm": 1.5784872770309448, "learning_rate": 8.464430592658554e-06, "loss": 2.3636, "step": 8396 }, { "epoch": 0.83, "grad_norm": 1.5051548480987549, "learning_rate": 8.44464232710003e-06, "loss": 2.3239, "step": 8400 }, { "epoch": 0.83, "grad_norm": 1.7668542861938477, "learning_rate": 8.424854061541506e-06, "loss": 2.4491, "step": 8404 }, { "epoch": 0.83, "grad_norm": 1.7257317304611206, "learning_rate": 8.405065795982982e-06, "loss": 2.4014, "step": 8408 }, { "epoch": 0.83, "grad_norm": 1.6491328477859497, "learning_rate": 8.385277530424459e-06, "loss": 2.5093, "step": 8412 }, { "epoch": 0.83, "grad_norm": 1.5918080806732178, "learning_rate": 8.365489264865935e-06, "loss": 2.7333, "step": 8416 }, { "epoch": 0.83, "grad_norm": 1.709262490272522, "learning_rate": 8.345700999307411e-06, "loss": 2.3297, "step": 8420 }, { "epoch": 0.83, "grad_norm": 1.5516706705093384, "learning_rate": 8.325912733748888e-06, "loss": 2.3839, "step": 8424 }, { "epoch": 0.83, "grad_norm": 1.711029291152954, "learning_rate": 8.306124468190364e-06, "loss": 2.4985, "step": 8428 }, { "epoch": 0.83, "grad_norm": 1.5779407024383545, "learning_rate": 8.286336202631839e-06, "loss": 2.4944, "step": 8432 }, { "epoch": 0.83, "grad_norm": 1.8659144639968872, "learning_rate": 8.266547937073315e-06, "loss": 2.4244, "step": 8436 }, { "epoch": 0.84, "grad_norm": 1.93182373046875, "learning_rate": 8.246759671514791e-06, "loss": 2.4782, "step": 8440 }, { "epoch": 0.84, "grad_norm": 1.5297750234603882, "learning_rate": 8.22697140595627e-06, "loss": 2.5646, "step": 8444 }, { "epoch": 0.84, "grad_norm": 1.7578099966049194, "learning_rate": 8.207183140397746e-06, "loss": 2.5986, "step": 8448 }, { "epoch": 0.84, "grad_norm": 1.7178421020507812, "learning_rate": 8.18739487483922e-06, "loss": 2.4336, "step": 8452 }, { "epoch": 0.84, "grad_norm": 1.6756671667099, "learning_rate": 8.167606609280697e-06, "loss": 2.5326, "step": 8456 }, { "epoch": 0.84, "grad_norm": 1.681978464126587, "learning_rate": 8.147818343722173e-06, "loss": 2.4505, "step": 8460 }, { "epoch": 0.84, "grad_norm": 1.777003288269043, "learning_rate": 8.12803007816365e-06, "loss": 2.4595, "step": 8464 }, { "epoch": 0.84, "grad_norm": 1.7001906633377075, "learning_rate": 8.108241812605126e-06, "loss": 2.3165, "step": 8468 }, { "epoch": 0.84, "grad_norm": 1.6168255805969238, "learning_rate": 8.0884535470466e-06, "loss": 2.4513, "step": 8472 }, { "epoch": 0.84, "grad_norm": 1.7238038778305054, "learning_rate": 8.068665281488078e-06, "loss": 2.5386, "step": 8476 }, { "epoch": 0.84, "grad_norm": 1.7556039094924927, "learning_rate": 8.048877015929555e-06, "loss": 2.5683, "step": 8480 }, { "epoch": 0.84, "grad_norm": 1.8705509901046753, "learning_rate": 8.029088750371031e-06, "loss": 2.411, "step": 8484 }, { "epoch": 0.84, "grad_norm": 1.5241179466247559, "learning_rate": 8.009300484812507e-06, "loss": 2.424, "step": 8488 }, { "epoch": 0.84, "grad_norm": 1.6394929885864258, "learning_rate": 7.989512219253984e-06, "loss": 2.4855, "step": 8492 }, { "epoch": 0.84, "grad_norm": 1.7148469686508179, "learning_rate": 7.969723953695458e-06, "loss": 2.4967, "step": 8496 }, { "epoch": 0.84, "grad_norm": 1.7260617017745972, "learning_rate": 7.949935688136934e-06, "loss": 2.5231, "step": 8500 }, { "epoch": 0.84, "grad_norm": 1.84506356716156, "learning_rate": 7.93014742257841e-06, "loss": 2.3639, "step": 8504 }, { "epoch": 0.84, "grad_norm": 1.5255876779556274, "learning_rate": 7.910359157019887e-06, "loss": 2.4189, "step": 8508 }, { "epoch": 0.84, "grad_norm": 1.7980495691299438, "learning_rate": 7.890570891461365e-06, "loss": 2.4907, "step": 8512 }, { "epoch": 0.84, "grad_norm": 1.7311155796051025, "learning_rate": 7.87078262590284e-06, "loss": 2.5981, "step": 8516 }, { "epoch": 0.84, "grad_norm": 1.6992567777633667, "learning_rate": 7.850994360344316e-06, "loss": 2.5617, "step": 8520 }, { "epoch": 0.84, "grad_norm": 1.6677074432373047, "learning_rate": 7.831206094785792e-06, "loss": 2.5869, "step": 8524 }, { "epoch": 0.84, "grad_norm": 1.717405080795288, "learning_rate": 7.811417829227269e-06, "loss": 2.534, "step": 8528 }, { "epoch": 0.84, "grad_norm": 1.7169833183288574, "learning_rate": 7.791629563668745e-06, "loss": 2.5644, "step": 8532 }, { "epoch": 0.84, "grad_norm": 1.6315467357635498, "learning_rate": 7.77184129811022e-06, "loss": 2.5054, "step": 8536 }, { "epoch": 0.84, "grad_norm": 1.7004671096801758, "learning_rate": 7.752053032551696e-06, "loss": 2.3946, "step": 8540 }, { "epoch": 0.85, "grad_norm": 1.7185847759246826, "learning_rate": 7.732264766993174e-06, "loss": 2.6361, "step": 8544 }, { "epoch": 0.85, "grad_norm": 1.7826788425445557, "learning_rate": 7.71247650143465e-06, "loss": 2.4693, "step": 8548 }, { "epoch": 0.85, "grad_norm": 1.5494498014450073, "learning_rate": 7.692688235876127e-06, "loss": 2.475, "step": 8552 }, { "epoch": 0.85, "grad_norm": 1.6112264394760132, "learning_rate": 7.672899970317603e-06, "loss": 2.685, "step": 8556 }, { "epoch": 0.85, "grad_norm": 1.583173394203186, "learning_rate": 7.653111704759078e-06, "loss": 2.4246, "step": 8560 }, { "epoch": 0.85, "grad_norm": 1.6651097536087036, "learning_rate": 7.633323439200554e-06, "loss": 2.4226, "step": 8564 }, { "epoch": 0.85, "grad_norm": 1.6389166116714478, "learning_rate": 7.613535173642031e-06, "loss": 2.4979, "step": 8568 }, { "epoch": 0.85, "grad_norm": 1.7426066398620605, "learning_rate": 7.5937469080835074e-06, "loss": 2.5513, "step": 8572 }, { "epoch": 0.85, "grad_norm": 1.6744520664215088, "learning_rate": 7.573958642524984e-06, "loss": 2.4503, "step": 8576 }, { "epoch": 0.85, "grad_norm": 1.7158594131469727, "learning_rate": 7.554170376966458e-06, "loss": 2.4517, "step": 8580 }, { "epoch": 0.85, "grad_norm": 1.6604433059692383, "learning_rate": 7.5343821114079355e-06, "loss": 2.4247, "step": 8584 }, { "epoch": 0.85, "grad_norm": 1.636860728263855, "learning_rate": 7.514593845849412e-06, "loss": 2.2244, "step": 8588 }, { "epoch": 0.85, "grad_norm": 1.6336443424224854, "learning_rate": 7.494805580290888e-06, "loss": 2.5156, "step": 8592 }, { "epoch": 0.85, "grad_norm": 1.8386069536209106, "learning_rate": 7.4750173147323645e-06, "loss": 2.4954, "step": 8596 }, { "epoch": 0.85, "grad_norm": 1.6216977834701538, "learning_rate": 7.45522904917384e-06, "loss": 2.4894, "step": 8600 }, { "epoch": 0.85, "grad_norm": 1.7700227499008179, "learning_rate": 7.435440783615316e-06, "loss": 2.6143, "step": 8604 }, { "epoch": 0.85, "grad_norm": 1.6691062450408936, "learning_rate": 7.415652518056793e-06, "loss": 2.4329, "step": 8608 }, { "epoch": 0.85, "grad_norm": 1.517760992050171, "learning_rate": 7.395864252498269e-06, "loss": 2.402, "step": 8612 }, { "epoch": 0.85, "grad_norm": 1.5939360857009888, "learning_rate": 7.376075986939745e-06, "loss": 2.4902, "step": 8616 }, { "epoch": 0.85, "grad_norm": 1.6323703527450562, "learning_rate": 7.356287721381221e-06, "loss": 2.6149, "step": 8620 }, { "epoch": 0.85, "grad_norm": 1.5932726860046387, "learning_rate": 7.336499455822697e-06, "loss": 2.4341, "step": 8624 }, { "epoch": 0.85, "grad_norm": 1.7558265924453735, "learning_rate": 7.316711190264173e-06, "loss": 2.6402, "step": 8628 }, { "epoch": 0.85, "grad_norm": 1.7289772033691406, "learning_rate": 7.29692292470565e-06, "loss": 2.3923, "step": 8632 }, { "epoch": 0.85, "grad_norm": 1.800723910331726, "learning_rate": 7.277134659147127e-06, "loss": 2.4707, "step": 8636 }, { "epoch": 0.85, "grad_norm": 1.672587275505066, "learning_rate": 7.257346393588603e-06, "loss": 2.526, "step": 8640 }, { "epoch": 0.86, "grad_norm": 1.686955451965332, "learning_rate": 7.237558128030078e-06, "loss": 2.4433, "step": 8644 }, { "epoch": 0.86, "grad_norm": 1.5995945930480957, "learning_rate": 7.217769862471554e-06, "loss": 2.609, "step": 8648 }, { "epoch": 0.86, "grad_norm": 1.6582813262939453, "learning_rate": 7.197981596913031e-06, "loss": 2.3675, "step": 8652 }, { "epoch": 0.86, "grad_norm": 1.6642727851867676, "learning_rate": 7.178193331354508e-06, "loss": 2.4762, "step": 8656 }, { "epoch": 0.86, "grad_norm": 1.6755313873291016, "learning_rate": 7.158405065795984e-06, "loss": 2.4811, "step": 8660 }, { "epoch": 0.86, "grad_norm": 1.787563681602478, "learning_rate": 7.138616800237459e-06, "loss": 2.4481, "step": 8664 }, { "epoch": 0.86, "grad_norm": 1.5694202184677124, "learning_rate": 7.118828534678936e-06, "loss": 2.44, "step": 8668 }, { "epoch": 0.86, "grad_norm": 1.6753268241882324, "learning_rate": 7.099040269120412e-06, "loss": 2.664, "step": 8672 }, { "epoch": 0.86, "grad_norm": 1.8283534049987793, "learning_rate": 7.079252003561888e-06, "loss": 2.4761, "step": 8676 }, { "epoch": 0.86, "grad_norm": 1.730433702468872, "learning_rate": 7.059463738003365e-06, "loss": 2.5623, "step": 8680 }, { "epoch": 0.86, "grad_norm": 1.7397688627243042, "learning_rate": 7.03967547244484e-06, "loss": 2.6005, "step": 8684 }, { "epoch": 0.86, "grad_norm": 1.7049192190170288, "learning_rate": 7.0198872068863165e-06, "loss": 2.4976, "step": 8688 }, { "epoch": 0.86, "grad_norm": 1.7921533584594727, "learning_rate": 7.000098941327793e-06, "loss": 2.5239, "step": 8692 }, { "epoch": 0.86, "grad_norm": 1.7059839963912964, "learning_rate": 6.980310675769269e-06, "loss": 2.403, "step": 8696 }, { "epoch": 0.86, "grad_norm": 1.636533260345459, "learning_rate": 6.9605224102107455e-06, "loss": 2.4023, "step": 8700 }, { "epoch": 0.86, "grad_norm": 1.6458524465560913, "learning_rate": 6.940734144652221e-06, "loss": 2.6021, "step": 8704 }, { "epoch": 0.86, "grad_norm": 1.8312097787857056, "learning_rate": 6.920945879093697e-06, "loss": 2.4073, "step": 8708 }, { "epoch": 0.86, "grad_norm": 1.6643646955490112, "learning_rate": 6.901157613535174e-06, "loss": 2.4315, "step": 8712 }, { "epoch": 0.86, "grad_norm": 1.7501451969146729, "learning_rate": 6.88136934797665e-06, "loss": 2.4771, "step": 8716 }, { "epoch": 0.86, "grad_norm": 1.6172442436218262, "learning_rate": 6.861581082418127e-06, "loss": 2.4944, "step": 8720 }, { "epoch": 0.86, "grad_norm": 1.7708154916763306, "learning_rate": 6.841792816859603e-06, "loss": 2.5976, "step": 8724 }, { "epoch": 0.86, "grad_norm": 1.640822410583496, "learning_rate": 6.822004551301078e-06, "loss": 2.4854, "step": 8728 }, { "epoch": 0.86, "grad_norm": 1.7448416948318481, "learning_rate": 6.802216285742554e-06, "loss": 2.5023, "step": 8732 }, { "epoch": 0.86, "grad_norm": 1.7678117752075195, "learning_rate": 6.7824280201840315e-06, "loss": 2.3739, "step": 8736 }, { "epoch": 0.86, "grad_norm": 1.7742412090301514, "learning_rate": 6.762639754625508e-06, "loss": 2.5075, "step": 8740 }, { "epoch": 0.87, "grad_norm": 1.6153099536895752, "learning_rate": 6.742851489066984e-06, "loss": 2.3971, "step": 8744 }, { "epoch": 0.87, "grad_norm": 2.1363136768341064, "learning_rate": 6.72306322350846e-06, "loss": 2.5768, "step": 8748 }, { "epoch": 0.87, "grad_norm": 1.694061279296875, "learning_rate": 6.703274957949936e-06, "loss": 2.4628, "step": 8752 }, { "epoch": 0.87, "grad_norm": 1.5482895374298096, "learning_rate": 6.683486692391412e-06, "loss": 2.3695, "step": 8756 }, { "epoch": 0.87, "grad_norm": 1.6727862358093262, "learning_rate": 6.663698426832889e-06, "loss": 2.5452, "step": 8760 }, { "epoch": 0.87, "grad_norm": 2.0361154079437256, "learning_rate": 6.643910161274365e-06, "loss": 2.7449, "step": 8764 }, { "epoch": 0.87, "grad_norm": 1.6247440576553345, "learning_rate": 6.62412189571584e-06, "loss": 2.5772, "step": 8768 }, { "epoch": 0.87, "grad_norm": 1.6015691757202148, "learning_rate": 6.604333630157317e-06, "loss": 2.5729, "step": 8772 }, { "epoch": 0.87, "grad_norm": 1.7053395509719849, "learning_rate": 6.584545364598793e-06, "loss": 2.541, "step": 8776 }, { "epoch": 0.87, "grad_norm": 1.6078673601150513, "learning_rate": 6.564757099040269e-06, "loss": 2.5443, "step": 8780 }, { "epoch": 0.87, "grad_norm": 1.6850202083587646, "learning_rate": 6.544968833481746e-06, "loss": 2.6706, "step": 8784 }, { "epoch": 0.87, "grad_norm": 1.6219288110733032, "learning_rate": 6.525180567923223e-06, "loss": 2.5243, "step": 8788 }, { "epoch": 0.87, "grad_norm": 1.8155920505523682, "learning_rate": 6.5053923023646975e-06, "loss": 2.5544, "step": 8792 }, { "epoch": 0.87, "grad_norm": 1.7686444520950317, "learning_rate": 6.485604036806174e-06, "loss": 2.3905, "step": 8796 }, { "epoch": 0.87, "grad_norm": 1.9808334112167358, "learning_rate": 6.46581577124765e-06, "loss": 2.5068, "step": 8800 }, { "epoch": 0.87, "grad_norm": 1.7255302667617798, "learning_rate": 6.446027505689127e-06, "loss": 2.4938, "step": 8804 }, { "epoch": 0.87, "grad_norm": 1.6764934062957764, "learning_rate": 6.426239240130604e-06, "loss": 2.3194, "step": 8808 }, { "epoch": 0.87, "grad_norm": 1.5869959592819214, "learning_rate": 6.406450974572078e-06, "loss": 2.4587, "step": 8812 }, { "epoch": 0.87, "grad_norm": 1.5677440166473389, "learning_rate": 6.386662709013555e-06, "loss": 2.4172, "step": 8816 }, { "epoch": 0.87, "grad_norm": 1.6823347806930542, "learning_rate": 6.366874443455032e-06, "loss": 2.4559, "step": 8820 }, { "epoch": 0.87, "grad_norm": 1.6375516653060913, "learning_rate": 6.347086177896508e-06, "loss": 2.5198, "step": 8824 }, { "epoch": 0.87, "grad_norm": 1.629103422164917, "learning_rate": 6.327297912337984e-06, "loss": 2.5148, "step": 8828 }, { "epoch": 0.87, "grad_norm": 1.5278247594833374, "learning_rate": 6.30750964677946e-06, "loss": 2.4411, "step": 8832 }, { "epoch": 0.87, "grad_norm": 1.8897967338562012, "learning_rate": 6.287721381220936e-06, "loss": 2.4329, "step": 8836 }, { "epoch": 0.87, "grad_norm": 1.6970409154891968, "learning_rate": 6.2679331156624125e-06, "loss": 2.7446, "step": 8840 }, { "epoch": 0.88, "grad_norm": 1.5253138542175293, "learning_rate": 6.248144850103889e-06, "loss": 2.4237, "step": 8844 }, { "epoch": 0.88, "grad_norm": 1.6197317838668823, "learning_rate": 6.228356584545364e-06, "loss": 2.5055, "step": 8848 }, { "epoch": 0.88, "grad_norm": 1.5637788772583008, "learning_rate": 6.2085683189868415e-06, "loss": 2.3537, "step": 8852 }, { "epoch": 0.88, "grad_norm": 1.662288784980774, "learning_rate": 6.188780053428318e-06, "loss": 2.3261, "step": 8856 }, { "epoch": 0.88, "grad_norm": 1.642146348953247, "learning_rate": 6.168991787869793e-06, "loss": 2.4903, "step": 8860 }, { "epoch": 0.88, "grad_norm": 1.6560384035110474, "learning_rate": 6.1492035223112696e-06, "loss": 2.5563, "step": 8864 }, { "epoch": 0.88, "grad_norm": 1.7499784231185913, "learning_rate": 6.129415256752746e-06, "loss": 2.5078, "step": 8868 }, { "epoch": 0.88, "grad_norm": 1.8147586584091187, "learning_rate": 6.109626991194222e-06, "loss": 2.3585, "step": 8872 }, { "epoch": 0.88, "grad_norm": 1.6541246175765991, "learning_rate": 6.0898387256356985e-06, "loss": 2.5436, "step": 8876 }, { "epoch": 0.88, "grad_norm": 1.8309890031814575, "learning_rate": 6.070050460077174e-06, "loss": 2.6095, "step": 8880 }, { "epoch": 0.88, "grad_norm": 1.725372552871704, "learning_rate": 6.050262194518651e-06, "loss": 2.5652, "step": 8884 }, { "epoch": 0.88, "grad_norm": 1.6648577451705933, "learning_rate": 6.0304739289601275e-06, "loss": 2.4624, "step": 8888 }, { "epoch": 0.88, "grad_norm": 1.6224662065505981, "learning_rate": 6.010685663401603e-06, "loss": 2.4616, "step": 8892 }, { "epoch": 0.88, "grad_norm": 1.535581111907959, "learning_rate": 5.990897397843079e-06, "loss": 2.3335, "step": 8896 }, { "epoch": 0.88, "grad_norm": 1.5054175853729248, "learning_rate": 5.971109132284556e-06, "loss": 2.3097, "step": 8900 }, { "epoch": 0.88, "grad_norm": 1.7295256853103638, "learning_rate": 5.951320866726032e-06, "loss": 2.538, "step": 8904 }, { "epoch": 0.88, "grad_norm": 1.6823656558990479, "learning_rate": 5.931532601167508e-06, "loss": 2.4051, "step": 8908 }, { "epoch": 0.88, "grad_norm": 1.702989935874939, "learning_rate": 5.911744335608984e-06, "loss": 2.3926, "step": 8912 }, { "epoch": 0.88, "grad_norm": 1.6187750101089478, "learning_rate": 5.89195607005046e-06, "loss": 2.2627, "step": 8916 }, { "epoch": 0.88, "grad_norm": 1.6742513179779053, "learning_rate": 5.872167804491937e-06, "loss": 2.3604, "step": 8920 }, { "epoch": 0.88, "grad_norm": 1.7713173627853394, "learning_rate": 5.852379538933413e-06, "loss": 2.557, "step": 8924 }, { "epoch": 0.88, "grad_norm": 1.624006748199463, "learning_rate": 5.832591273374889e-06, "loss": 2.4546, "step": 8928 }, { "epoch": 0.88, "grad_norm": 1.8159652948379517, "learning_rate": 5.812803007816365e-06, "loss": 2.4153, "step": 8932 }, { "epoch": 0.88, "grad_norm": 1.6506266593933105, "learning_rate": 5.793014742257842e-06, "loss": 2.6401, "step": 8936 }, { "epoch": 0.88, "grad_norm": 1.7533698081970215, "learning_rate": 5.773226476699318e-06, "loss": 2.5884, "step": 8940 }, { "epoch": 0.88, "grad_norm": 1.6324323415756226, "learning_rate": 5.7534382111407935e-06, "loss": 2.2965, "step": 8944 }, { "epoch": 0.89, "grad_norm": 1.5913504362106323, "learning_rate": 5.73364994558227e-06, "loss": 2.4982, "step": 8948 }, { "epoch": 0.89, "grad_norm": 1.9502431154251099, "learning_rate": 5.713861680023746e-06, "loss": 2.3815, "step": 8952 }, { "epoch": 0.89, "grad_norm": 1.5532846450805664, "learning_rate": 5.694073414465222e-06, "loss": 2.4715, "step": 8956 }, { "epoch": 0.89, "grad_norm": 1.6778051853179932, "learning_rate": 5.674285148906699e-06, "loss": 2.2548, "step": 8960 }, { "epoch": 0.89, "grad_norm": 1.5361812114715576, "learning_rate": 5.654496883348174e-06, "loss": 2.4623, "step": 8964 }, { "epoch": 0.89, "grad_norm": 1.6158546209335327, "learning_rate": 5.634708617789651e-06, "loss": 2.4732, "step": 8968 }, { "epoch": 0.89, "grad_norm": 1.7343107461929321, "learning_rate": 5.614920352231128e-06, "loss": 2.4489, "step": 8972 }, { "epoch": 0.89, "grad_norm": 1.6918219327926636, "learning_rate": 5.595132086672603e-06, "loss": 2.4923, "step": 8976 }, { "epoch": 0.89, "grad_norm": 1.725468397140503, "learning_rate": 5.5753438211140795e-06, "loss": 2.5126, "step": 8980 }, { "epoch": 0.89, "grad_norm": 1.6052392721176147, "learning_rate": 5.555555555555556e-06, "loss": 2.3489, "step": 8984 }, { "epoch": 0.89, "grad_norm": 1.6850762367248535, "learning_rate": 5.535767289997032e-06, "loss": 2.5302, "step": 8988 }, { "epoch": 0.89, "grad_norm": 2.009579658508301, "learning_rate": 5.5159790244385085e-06, "loss": 2.4038, "step": 8992 }, { "epoch": 0.89, "grad_norm": 1.787927269935608, "learning_rate": 5.496190758879984e-06, "loss": 2.4092, "step": 8996 }, { "epoch": 0.89, "grad_norm": 1.583129644393921, "learning_rate": 5.476402493321461e-06, "loss": 2.5511, "step": 9000 }, { "epoch": 0.89, "grad_norm": 1.5953203439712524, "learning_rate": 5.4566142277629374e-06, "loss": 2.3661, "step": 9004 }, { "epoch": 0.89, "grad_norm": 1.6569699048995972, "learning_rate": 5.436825962204413e-06, "loss": 2.3866, "step": 9008 }, { "epoch": 0.89, "grad_norm": 1.5289307832717896, "learning_rate": 5.417037696645889e-06, "loss": 2.4527, "step": 9012 }, { "epoch": 0.89, "grad_norm": 1.5741091966629028, "learning_rate": 5.3972494310873655e-06, "loss": 2.5182, "step": 9016 }, { "epoch": 0.89, "grad_norm": 1.672529697418213, "learning_rate": 5.377461165528842e-06, "loss": 2.4246, "step": 9020 }, { "epoch": 0.89, "grad_norm": 1.5960274934768677, "learning_rate": 5.357672899970318e-06, "loss": 2.3121, "step": 9024 }, { "epoch": 0.89, "grad_norm": 1.9494187831878662, "learning_rate": 5.337884634411794e-06, "loss": 2.3272, "step": 9028 }, { "epoch": 0.89, "grad_norm": 1.8283107280731201, "learning_rate": 5.31809636885327e-06, "loss": 2.4936, "step": 9032 }, { "epoch": 0.89, "grad_norm": 1.6064289808273315, "learning_rate": 5.298308103294747e-06, "loss": 2.5352, "step": 9036 }, { "epoch": 0.89, "grad_norm": 1.6828787326812744, "learning_rate": 5.278519837736223e-06, "loss": 2.2623, "step": 9040 }, { "epoch": 0.89, "grad_norm": 1.93602454662323, "learning_rate": 5.258731572177699e-06, "loss": 2.4931, "step": 9044 }, { "epoch": 0.9, "grad_norm": 1.6569011211395264, "learning_rate": 5.238943306619174e-06, "loss": 2.4178, "step": 9048 }, { "epoch": 0.9, "grad_norm": 1.6328504085540771, "learning_rate": 5.219155041060652e-06, "loss": 2.6022, "step": 9052 }, { "epoch": 0.9, "grad_norm": 1.7538514137268066, "learning_rate": 5.199366775502128e-06, "loss": 2.3894, "step": 9056 }, { "epoch": 0.9, "grad_norm": 1.7447439432144165, "learning_rate": 5.179578509943603e-06, "loss": 2.3315, "step": 9060 }, { "epoch": 0.9, "grad_norm": 1.793074607849121, "learning_rate": 5.15979024438508e-06, "loss": 2.4952, "step": 9064 }, { "epoch": 0.9, "grad_norm": 1.6562350988388062, "learning_rate": 5.140001978826556e-06, "loss": 2.3996, "step": 9068 }, { "epoch": 0.9, "grad_norm": 1.7558027505874634, "learning_rate": 5.120213713268032e-06, "loss": 2.4778, "step": 9072 }, { "epoch": 0.9, "grad_norm": 1.6544588804244995, "learning_rate": 5.100425447709509e-06, "loss": 2.5229, "step": 9076 }, { "epoch": 0.9, "grad_norm": 1.775094985961914, "learning_rate": 5.080637182150984e-06, "loss": 2.3536, "step": 9080 }, { "epoch": 0.9, "grad_norm": 1.9515008926391602, "learning_rate": 5.060848916592461e-06, "loss": 2.4411, "step": 9084 }, { "epoch": 0.9, "grad_norm": 1.8084925413131714, "learning_rate": 5.041060651033938e-06, "loss": 2.3756, "step": 9088 }, { "epoch": 0.9, "grad_norm": 1.6303074359893799, "learning_rate": 5.021272385475413e-06, "loss": 2.3407, "step": 9092 }, { "epoch": 0.9, "grad_norm": 1.6556874513626099, "learning_rate": 5.0014841199168894e-06, "loss": 2.4766, "step": 9096 } ], "logging_steps": 4, "max_steps": 10107, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1011, "total_flos": 1.1128110498761933e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }