diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27100 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999741314639005, + "eval_steps": 500, + "global_step": 19328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.52734375, + "learning_rate": 1.0346611484738748e-07, + "loss": 2.0672, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.55078125, + "learning_rate": 5.173305742369374e-07, + "loss": 2.0719, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.60546875, + "learning_rate": 1.0346611484738748e-06, + "loss": 2.0752, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.65625, + "learning_rate": 1.5519917227108122e-06, + "loss": 2.0881, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.53125, + "learning_rate": 2.0693222969477496e-06, + "loss": 2.0211, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 2.586652871184687e-06, + "loss": 2.0231, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.5625, + "learning_rate": 3.1039834454216244e-06, + "loss": 2.0532, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.470703125, + "learning_rate": 3.6213140196585623e-06, + "loss": 2.0431, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.546875, + "learning_rate": 4.138644593895499e-06, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.5, + "learning_rate": 4.655975168132437e-06, + "loss": 2.0141, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.57421875, + "learning_rate": 5.173305742369374e-06, + "loss": 2.0169, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.466796875, + "learning_rate": 5.6906363166063115e-06, + "loss": 2.0446, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.52734375, + "learning_rate": 6.207966890843249e-06, + "loss": 2.0514, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.50390625, + "learning_rate": 6.725297465080186e-06, + "loss": 2.028, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 0.5078125, + "learning_rate": 7.2426280393171246e-06, + "loss": 1.9917, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 0.47265625, + "learning_rate": 7.75995861355406e-06, + "loss": 2.0271, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.277289187790999e-06, + "loss": 1.9452, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 0.455078125, + "learning_rate": 8.794619762027937e-06, + "loss": 1.9469, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 0.515625, + "learning_rate": 9.311950336264873e-06, + "loss": 1.9631, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 0.498046875, + "learning_rate": 9.82928091050181e-06, + "loss": 1.939, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.4765625, + "learning_rate": 1.0346611484738748e-05, + "loss": 1.939, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.486328125, + "learning_rate": 1.0863942058975686e-05, + "loss": 1.9396, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.46484375, + "learning_rate": 1.1381272633212623e-05, + "loss": 1.9, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.52734375, + "learning_rate": 1.1898603207449561e-05, + "loss": 1.9245, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.498046875, + "learning_rate": 1.2415933781686498e-05, + "loss": 1.8493, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.515625, + "learning_rate": 1.2933264355923436e-05, + "loss": 1.8828, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.51171875, + "learning_rate": 1.3450594930160373e-05, + "loss": 1.8809, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.46875, + "learning_rate": 1.3967925504397311e-05, + "loss": 1.8241, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.46484375, + "learning_rate": 1.4485256078634249e-05, + "loss": 1.8636, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.51171875, + "learning_rate": 1.5002586652871187e-05, + "loss": 1.8901, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.494140625, + "learning_rate": 1.551991722710812e-05, + "loss": 1.8464, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.49609375, + "learning_rate": 1.603724780134506e-05, + "loss": 1.7929, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.51953125, + "learning_rate": 1.6554578375581997e-05, + "loss": 1.8162, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.50390625, + "learning_rate": 1.7071908949818935e-05, + "loss": 1.8365, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.52734375, + "learning_rate": 1.7589239524055874e-05, + "loss": 1.8309, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.6171875, + "learning_rate": 1.8106570098292812e-05, + "loss": 1.8047, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.5546875, + "learning_rate": 1.8623900672529747e-05, + "loss": 1.8204, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.72265625, + "learning_rate": 1.9141231246766685e-05, + "loss": 1.8022, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.578125, + "learning_rate": 1.965856182100362e-05, + "loss": 1.8294, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.54296875, + "learning_rate": 2.0175892395240558e-05, + "loss": 1.8189, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.578125, + "learning_rate": 2.0693222969477496e-05, + "loss": 1.8118, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.57421875, + "learning_rate": 2.1210553543714435e-05, + "loss": 1.8137, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 0.58203125, + "learning_rate": 2.1727884117951373e-05, + "loss": 1.7953, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 0.61328125, + "learning_rate": 2.224521469218831e-05, + "loss": 1.8048, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 0.62109375, + "learning_rate": 2.2762545266425246e-05, + "loss": 1.7902, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 0.62890625, + "learning_rate": 2.3279875840662184e-05, + "loss": 1.8015, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 0.625, + "learning_rate": 2.3797206414899122e-05, + "loss": 1.8083, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.63671875, + "learning_rate": 2.4314536989136057e-05, + "loss": 1.7645, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 0.6484375, + "learning_rate": 2.4831867563372996e-05, + "loss": 1.7412, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 0.66015625, + "learning_rate": 2.5349198137609937e-05, + "loss": 1.7416, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 0.640625, + "learning_rate": 2.5866528711846872e-05, + "loss": 1.7907, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 0.66015625, + "learning_rate": 2.6383859286083807e-05, + "loss": 1.7905, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 0.6640625, + "learning_rate": 2.6901189860320745e-05, + "loss": 1.7633, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 0.67578125, + "learning_rate": 2.7418520434557683e-05, + "loss": 1.7678, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 0.66015625, + "learning_rate": 2.7935851008794622e-05, + "loss": 1.7868, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 0.68359375, + "learning_rate": 2.8453181583031557e-05, + "loss": 1.7642, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 0.70703125, + "learning_rate": 2.8970512157268498e-05, + "loss": 1.7907, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 0.69921875, + "learning_rate": 2.9487842731505433e-05, + "loss": 1.7673, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.7109375, + "learning_rate": 3.0005173305742375e-05, + "loss": 1.7521, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.71875, + "learning_rate": 3.052250387997931e-05, + "loss": 1.7589, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 0.69921875, + "learning_rate": 3.103983445421624e-05, + "loss": 1.7534, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.7109375, + "learning_rate": 3.1557165028453186e-05, + "loss": 1.7469, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.72265625, + "learning_rate": 3.207449560269012e-05, + "loss": 1.7285, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.73046875, + "learning_rate": 3.259182617692706e-05, + "loss": 1.7777, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.74609375, + "learning_rate": 3.3109156751163994e-05, + "loss": 1.6814, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.76171875, + "learning_rate": 3.362648732540093e-05, + "loss": 1.739, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.74609375, + "learning_rate": 3.414381789963787e-05, + "loss": 1.7609, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.73046875, + "learning_rate": 3.46611484738748e-05, + "loss": 1.7366, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 0.75, + "learning_rate": 3.517847904811175e-05, + "loss": 1.7704, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 0.7578125, + "learning_rate": 3.569580962234868e-05, + "loss": 1.6942, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.76171875, + "learning_rate": 3.6213140196585624e-05, + "loss": 1.6884, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.8046875, + "learning_rate": 3.6730470770822555e-05, + "loss": 1.7542, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.71875, + "learning_rate": 3.724780134505949e-05, + "loss": 1.7435, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.765625, + "learning_rate": 3.776513191929643e-05, + "loss": 1.7158, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.7734375, + "learning_rate": 3.828246249353337e-05, + "loss": 1.7178, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 0.79296875, + "learning_rate": 3.879979306777031e-05, + "loss": 1.7613, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.78515625, + "learning_rate": 3.931712364200724e-05, + "loss": 1.7211, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 380.0, + "learning_rate": 3.9834454216244185e-05, + "loss": 1.796, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 4.0351784790481116e-05, + "loss": 1.7092, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.765625, + "learning_rate": 4.086911536471806e-05, + "loss": 1.7148, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 4.138644593895499e-05, + "loss": 1.6965, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.7578125, + "learning_rate": 4.190377651319193e-05, + "loss": 1.7083, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 0.75390625, + "learning_rate": 4.242110708742887e-05, + "loss": 1.6792, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 0.7578125, + "learning_rate": 4.293843766166581e-05, + "loss": 1.7314, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 0.8046875, + "learning_rate": 4.3455768235902746e-05, + "loss": 1.741, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 0.78125, + "learning_rate": 4.397309881013968e-05, + "loss": 1.7095, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 0.8046875, + "learning_rate": 4.449042938437662e-05, + "loss": 1.7073, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.80078125, + "learning_rate": 4.5007759958613554e-05, + "loss": 1.7033, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 0.78515625, + "learning_rate": 4.552509053285049e-05, + "loss": 1.6986, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 4.604242110708743e-05, + "loss": 1.6743, + "step": 445 + }, + { + "epoch": 0.02, + "grad_norm": 0.82421875, + "learning_rate": 4.655975168132437e-05, + "loss": 1.6705, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 0.73828125, + "learning_rate": 4.707708225556131e-05, + "loss": 1.6937, + "step": 455 + }, + { + "epoch": 0.02, + "grad_norm": 0.79296875, + "learning_rate": 4.7594412829798245e-05, + "loss": 1.7355, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 0.76953125, + "learning_rate": 4.811174340403518e-05, + "loss": 1.6826, + "step": 465 + }, + { + "epoch": 0.02, + "grad_norm": 0.83984375, + "learning_rate": 4.8629073978272115e-05, + "loss": 1.6627, + "step": 470 + }, + { + "epoch": 0.02, + "grad_norm": 0.78125, + "learning_rate": 4.914640455250906e-05, + "loss": 1.6788, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 0.78125, + "learning_rate": 4.966373512674599e-05, + "loss": 1.7126, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 0.77734375, + "learning_rate": 5.018106570098293e-05, + "loss": 1.7152, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 0.80859375, + "learning_rate": 5.0698396275219874e-05, + "loss": 1.6685, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 5.12157268494568e-05, + "loss": 1.6562, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 0.7578125, + "learning_rate": 5.1733057423693744e-05, + "loss": 1.651, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 0.76953125, + "learning_rate": 5.225038799793068e-05, + "loss": 1.6997, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 5.2767718572167614e-05, + "loss": 1.6904, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.7890625, + "learning_rate": 5.328504914640455e-05, + "loss": 1.6754, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.7578125, + "learning_rate": 5.380237972064149e-05, + "loss": 1.7106, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.7890625, + "learning_rate": 5.4319710294878435e-05, + "loss": 1.6913, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.80078125, + "learning_rate": 5.483704086911537e-05, + "loss": 1.6779, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.796875, + "learning_rate": 5.5354371443352305e-05, + "loss": 1.6658, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.76953125, + "learning_rate": 5.5871702017589243e-05, + "loss": 1.6931, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 0.76953125, + "learning_rate": 5.6389032591826175e-05, + "loss": 1.6653, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.75, + "learning_rate": 5.690636316606311e-05, + "loss": 1.6921, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 5.742369374030006e-05, + "loss": 1.6621, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 5.7941024314536996e-05, + "loss": 1.6868, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 0.75, + "learning_rate": 5.845835488877393e-05, + "loss": 1.672, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 0.75, + "learning_rate": 5.8975685463010866e-05, + "loss": 1.6741, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 0.75390625, + "learning_rate": 5.9493016037247804e-05, + "loss": 1.6989, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 0.734375, + "learning_rate": 6.001034661148475e-05, + "loss": 1.6492, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 0.78125, + "learning_rate": 6.0527677185721674e-05, + "loss": 1.7075, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 0.7421875, + "learning_rate": 6.104500775995862e-05, + "loss": 1.6889, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 0.71875, + "learning_rate": 6.156233833419556e-05, + "loss": 1.6749, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 0.73046875, + "learning_rate": 6.207966890843248e-05, + "loss": 1.6923, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 0.72265625, + "learning_rate": 6.259699948266943e-05, + "loss": 1.6846, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 0.72265625, + "learning_rate": 6.311433005690637e-05, + "loss": 1.6878, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 0.7265625, + "learning_rate": 6.363166063114331e-05, + "loss": 1.665, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 0.72265625, + "learning_rate": 6.414899120538024e-05, + "loss": 1.6745, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 0.7890625, + "learning_rate": 6.466632177961717e-05, + "loss": 1.6813, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 0.73828125, + "learning_rate": 6.518365235385413e-05, + "loss": 1.6718, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 0.796875, + "learning_rate": 6.570098292809105e-05, + "loss": 1.6575, + "step": 635 + }, + { + "epoch": 0.03, + "grad_norm": 0.703125, + "learning_rate": 6.621831350232799e-05, + "loss": 1.6663, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 0.73828125, + "learning_rate": 6.673564407656493e-05, + "loss": 1.659, + "step": 645 + }, + { + "epoch": 0.03, + "grad_norm": 0.71875, + "learning_rate": 6.725297465080186e-05, + "loss": 1.6705, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 0.75390625, + "learning_rate": 6.77703052250388e-05, + "loss": 1.6884, + "step": 655 + }, + { + "epoch": 0.03, + "grad_norm": 0.71875, + "learning_rate": 6.828763579927574e-05, + "loss": 1.6974, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 0.70703125, + "learning_rate": 6.880496637351268e-05, + "loss": 1.6686, + "step": 665 + }, + { + "epoch": 0.03, + "grad_norm": 0.69140625, + "learning_rate": 6.93222969477496e-05, + "loss": 1.6708, + "step": 670 + }, + { + "epoch": 0.03, + "grad_norm": 0.70703125, + "learning_rate": 6.983962752198656e-05, + "loss": 1.6826, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 0.70703125, + "learning_rate": 7.03569580962235e-05, + "loss": 1.6771, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 0.67578125, + "learning_rate": 7.087428867046043e-05, + "loss": 1.657, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 0.671875, + "learning_rate": 7.139161924469736e-05, + "loss": 1.6522, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 0.69140625, + "learning_rate": 7.19089498189343e-05, + "loss": 1.6453, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 0.71875, + "learning_rate": 7.242628039317125e-05, + "loss": 1.6442, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 0.671875, + "learning_rate": 7.294361096740819e-05, + "loss": 1.631, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 0.71484375, + "learning_rate": 7.346094154164511e-05, + "loss": 1.6205, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 0.6875, + "learning_rate": 7.397827211588205e-05, + "loss": 1.6374, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 0.68359375, + "learning_rate": 7.449560269011899e-05, + "loss": 1.6693, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 0.68359375, + "learning_rate": 7.501293326435593e-05, + "loss": 1.6841, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 0.68359375, + "learning_rate": 7.553026383859286e-05, + "loss": 1.6304, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 0.68359375, + "learning_rate": 7.60475944128298e-05, + "loss": 1.6373, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 0.7109375, + "learning_rate": 7.656492498706674e-05, + "loss": 1.6395, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 0.6875, + "learning_rate": 7.708225556130368e-05, + "loss": 1.6679, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 0.67578125, + "learning_rate": 7.759958613554062e-05, + "loss": 1.6473, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 0.6796875, + "learning_rate": 7.811691670977755e-05, + "loss": 1.6029, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 0.6640625, + "learning_rate": 7.863424728401448e-05, + "loss": 1.6523, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 0.7109375, + "learning_rate": 7.915157785825143e-05, + "loss": 1.6281, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 0.6953125, + "learning_rate": 7.966890843248837e-05, + "loss": 1.6032, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 0.703125, + "learning_rate": 8.018623900672531e-05, + "loss": 1.6495, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 0.66796875, + "learning_rate": 8.070356958096223e-05, + "loss": 1.6309, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 0.6796875, + "learning_rate": 8.122090015519917e-05, + "loss": 1.6321, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 0.6640625, + "learning_rate": 8.173823072943612e-05, + "loss": 1.6487, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 0.6171875, + "learning_rate": 8.225556130367305e-05, + "loss": 1.648, + "step": 795 + }, + { + "epoch": 0.04, + "grad_norm": 0.65234375, + "learning_rate": 8.277289187790999e-05, + "loss": 1.6622, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 0.65625, + "learning_rate": 8.329022245214692e-05, + "loss": 1.6737, + "step": 805 + }, + { + "epoch": 0.04, + "grad_norm": 0.66015625, + "learning_rate": 8.380755302638386e-05, + "loss": 1.6437, + "step": 810 + }, + { + "epoch": 0.04, + "grad_norm": 0.640625, + "learning_rate": 8.43248836006208e-05, + "loss": 1.6797, + "step": 815 + }, + { + "epoch": 0.04, + "grad_norm": 0.6328125, + "learning_rate": 8.484221417485774e-05, + "loss": 1.6351, + "step": 820 + }, + { + "epoch": 0.04, + "grad_norm": 0.66796875, + "learning_rate": 8.535954474909468e-05, + "loss": 1.6527, + "step": 825 + }, + { + "epoch": 0.04, + "grad_norm": 0.640625, + "learning_rate": 8.587687532333161e-05, + "loss": 1.6279, + "step": 830 + }, + { + "epoch": 0.04, + "grad_norm": 0.6796875, + "learning_rate": 8.639420589756855e-05, + "loss": 1.6513, + "step": 835 + }, + { + "epoch": 0.04, + "grad_norm": 0.67578125, + "learning_rate": 8.691153647180549e-05, + "loss": 1.6415, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 0.65234375, + "learning_rate": 8.742886704604243e-05, + "loss": 1.6349, + "step": 845 + }, + { + "epoch": 0.04, + "grad_norm": 0.6484375, + "learning_rate": 8.794619762027935e-05, + "loss": 1.6161, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 0.6484375, + "learning_rate": 8.846352819451629e-05, + "loss": 1.6249, + "step": 855 + }, + { + "epoch": 0.04, + "grad_norm": 0.671875, + "learning_rate": 8.898085876875324e-05, + "loss": 1.6353, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 0.65234375, + "learning_rate": 8.949818934299018e-05, + "loss": 1.6625, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 0.68359375, + "learning_rate": 9.001551991722711e-05, + "loss": 1.6277, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 0.62109375, + "learning_rate": 9.053285049146405e-05, + "loss": 1.675, + "step": 875 + }, + { + "epoch": 0.05, + "grad_norm": 0.6328125, + "learning_rate": 9.105018106570098e-05, + "loss": 1.654, + "step": 880 + }, + { + "epoch": 0.05, + "grad_norm": 0.640625, + "learning_rate": 9.156751163993792e-05, + "loss": 1.6671, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 9.208484221417486e-05, + "loss": 1.6186, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 0.625, + "learning_rate": 9.26021727884118e-05, + "loss": 1.643, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 0.625, + "learning_rate": 9.311950336264874e-05, + "loss": 1.6187, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 0.62109375, + "learning_rate": 9.363683393688568e-05, + "loss": 1.6145, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 9.415416451112261e-05, + "loss": 1.6412, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 0.66015625, + "learning_rate": 9.467149508535955e-05, + "loss": 1.6214, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 9.518882565959649e-05, + "loss": 1.6311, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.6171875, + "learning_rate": 9.570615623383343e-05, + "loss": 1.6373, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 0.6171875, + "learning_rate": 9.622348680807037e-05, + "loss": 1.6238, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 0.625, + "learning_rate": 9.67408173823073e-05, + "loss": 1.6447, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 0.65625, + "learning_rate": 9.725814795654423e-05, + "loss": 1.6108, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 0.6484375, + "learning_rate": 9.777547853078117e-05, + "loss": 1.6195, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 0.66015625, + "learning_rate": 9.829280910501812e-05, + "loss": 1.6216, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 0.6015625, + "learning_rate": 9.881013967925506e-05, + "loss": 1.6312, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 0.609375, + "learning_rate": 9.932747025349198e-05, + "loss": 1.6351, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 0.6171875, + "learning_rate": 9.984480082772892e-05, + "loss": 1.6219, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 0.58984375, + "learning_rate": 0.00010036213140196586, + "loss": 1.6085, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 0.59375, + "learning_rate": 0.0001008794619762028, + "loss": 1.5999, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 0.6015625, + "learning_rate": 0.00010139679255043975, + "loss": 1.638, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 0.62109375, + "learning_rate": 0.00010191412312467667, + "loss": 1.6279, + "step": 985 + }, + { + "epoch": 0.05, + "grad_norm": 0.59375, + "learning_rate": 0.0001024314536989136, + "loss": 1.6012, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 0.609375, + "learning_rate": 0.00010294878427315055, + "loss": 1.5978, + "step": 995 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 0.00010346611484738749, + "loss": 1.6574, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 0.62890625, + "learning_rate": 0.00010398344542162441, + "loss": 1.6144, + "step": 1005 + }, + { + "epoch": 0.05, + "grad_norm": 0.57421875, + "learning_rate": 0.00010450077599586136, + "loss": 1.5865, + "step": 1010 + }, + { + "epoch": 0.05, + "grad_norm": 0.59375, + "learning_rate": 0.00010501810657009829, + "loss": 1.5854, + "step": 1015 + }, + { + "epoch": 0.05, + "grad_norm": 0.58984375, + "learning_rate": 0.00010553543714433523, + "loss": 1.5843, + "step": 1020 + }, + { + "epoch": 0.05, + "grad_norm": 28.125, + "learning_rate": 0.00010605276771857218, + "loss": 1.737, + "step": 1025 + }, + { + "epoch": 0.05, + "grad_norm": 0.59765625, + "learning_rate": 0.0001065700982928091, + "loss": 1.6076, + "step": 1030 + }, + { + "epoch": 0.05, + "grad_norm": 0.6015625, + "learning_rate": 0.00010708742886704606, + "loss": 1.6308, + "step": 1035 + }, + { + "epoch": 0.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00010760475944128298, + "loss": 1.5846, + "step": 1040 + }, + { + "epoch": 0.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00010812209001551992, + "loss": 1.6332, + "step": 1045 + }, + { + "epoch": 0.05, + "grad_norm": 0.57421875, + "learning_rate": 0.00010863942058975687, + "loss": 1.6076, + "step": 1050 + }, + { + "epoch": 0.05, + "grad_norm": 0.60546875, + "learning_rate": 0.0001091567511639938, + "loss": 1.6014, + "step": 1055 + }, + { + "epoch": 0.05, + "grad_norm": 0.5703125, + "learning_rate": 0.00010967408173823073, + "loss": 1.6394, + "step": 1060 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00011019141231246769, + "loss": 1.5853, + "step": 1065 + }, + { + "epoch": 0.06, + "grad_norm": 0.74609375, + "learning_rate": 0.00011070874288670461, + "loss": 1.5903, + "step": 1070 + }, + { + "epoch": 0.06, + "grad_norm": 0.609375, + "learning_rate": 0.00011122607346094154, + "loss": 1.6, + "step": 1075 + }, + { + "epoch": 0.06, + "grad_norm": 0.6015625, + "learning_rate": 0.00011174340403517849, + "loss": 1.636, + "step": 1080 + }, + { + "epoch": 0.06, + "grad_norm": 0.62109375, + "learning_rate": 0.00011226073460941543, + "loss": 1.632, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 0.58984375, + "learning_rate": 0.00011277806518365235, + "loss": 1.5872, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.0001132953957578893, + "loss": 1.6298, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 0.578125, + "learning_rate": 0.00011381272633212623, + "loss": 1.6073, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 0.58203125, + "learning_rate": 0.00011433005690636318, + "loss": 1.5932, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 0.5859375, + "learning_rate": 0.00011484738748060012, + "loss": 1.5957, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 0.7109375, + "learning_rate": 0.00011536471805483704, + "loss": 1.5781, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 0.5859375, + "learning_rate": 0.00011588204862907399, + "loss": 1.5761, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00011639937920331092, + "loss": 1.5804, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 0.59375, + "learning_rate": 0.00011691670977754786, + "loss": 1.6005, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00011743404035178481, + "loss": 1.5513, + "step": 1135 + }, + { + "epoch": 0.06, + "grad_norm": 0.625, + "learning_rate": 0.00011795137092602173, + "loss": 1.6244, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 0.5703125, + "learning_rate": 0.00011846870150025866, + "loss": 1.5941, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 0.5703125, + "learning_rate": 0.00011898603207449561, + "loss": 1.6306, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 0.625, + "learning_rate": 0.00011950336264873255, + "loss": 1.6149, + "step": 1155 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.0001200206932229695, + "loss": 1.603, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 0.578125, + "learning_rate": 0.00012053802379720642, + "loss": 1.5644, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 0.59765625, + "learning_rate": 0.00012105535437144335, + "loss": 1.5883, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 0.55078125, + "learning_rate": 0.0001215726849456803, + "loss": 1.5846, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 0.59375, + "learning_rate": 0.00012209001551991724, + "loss": 1.6083, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 0.578125, + "learning_rate": 0.00012260734609415415, + "loss": 1.6051, + "step": 1185 + }, + { + "epoch": 0.06, + "grad_norm": 0.5625, + "learning_rate": 0.00012312467666839111, + "loss": 1.6224, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 0.5859375, + "learning_rate": 0.00012364200724262805, + "loss": 1.6303, + "step": 1195 + }, + { + "epoch": 0.06, + "grad_norm": 0.56640625, + "learning_rate": 0.00012415933781686496, + "loss": 1.5796, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 0.5703125, + "learning_rate": 0.00012467666839110193, + "loss": 1.5645, + "step": 1205 + }, + { + "epoch": 0.06, + "grad_norm": 0.55859375, + "learning_rate": 0.00012519399896533887, + "loss": 1.591, + "step": 1210 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00012571132953957578, + "loss": 1.6042, + "step": 1215 + }, + { + "epoch": 0.06, + "grad_norm": 0.58203125, + "learning_rate": 0.00012622866011381274, + "loss": 1.5599, + "step": 1220 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00012674599068804966, + "loss": 1.6363, + "step": 1225 + }, + { + "epoch": 0.06, + "grad_norm": 0.5390625, + "learning_rate": 0.00012726332126228662, + "loss": 1.5934, + "step": 1230 + }, + { + "epoch": 0.06, + "grad_norm": 0.55859375, + "learning_rate": 0.00012778065183652356, + "loss": 1.5754, + "step": 1235 + }, + { + "epoch": 0.06, + "grad_norm": 0.578125, + "learning_rate": 0.00012829798241076047, + "loss": 1.586, + "step": 1240 + }, + { + "epoch": 0.06, + "grad_norm": 0.5625, + "learning_rate": 0.00012881531298499744, + "loss": 1.5841, + "step": 1245 + }, + { + "epoch": 0.06, + "grad_norm": 0.57421875, + "learning_rate": 0.00012933264355923435, + "loss": 1.5874, + "step": 1250 + }, + { + "epoch": 0.06, + "grad_norm": 0.5546875, + "learning_rate": 0.00012984997413347129, + "loss": 1.592, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00013036730470770825, + "loss": 1.6185, + "step": 1260 + }, + { + "epoch": 0.07, + "grad_norm": 0.57421875, + "learning_rate": 0.00013088463528194516, + "loss": 1.603, + "step": 1265 + }, + { + "epoch": 0.07, + "grad_norm": 0.578125, + "learning_rate": 0.0001314019658561821, + "loss": 1.542, + "step": 1270 + }, + { + "epoch": 0.07, + "grad_norm": 0.578125, + "learning_rate": 0.00013191929643041904, + "loss": 1.5686, + "step": 1275 + }, + { + "epoch": 0.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00013243662700465598, + "loss": 1.5893, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 0.5859375, + "learning_rate": 0.00013295395757889294, + "loss": 1.6039, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 0.5703125, + "learning_rate": 0.00013347128815312985, + "loss": 1.6043, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 0.56640625, + "learning_rate": 0.0001339886187273668, + "loss": 1.5886, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 0.53125, + "learning_rate": 0.00013450594930160373, + "loss": 1.6044, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 0.56640625, + "learning_rate": 0.00013502327987584067, + "loss": 1.5665, + "step": 1305 + }, + { + "epoch": 0.07, + "grad_norm": 0.5703125, + "learning_rate": 0.0001355406104500776, + "loss": 1.5608, + "step": 1310 + }, + { + "epoch": 0.07, + "grad_norm": 0.546875, + "learning_rate": 0.00013605794102431454, + "loss": 1.6247, + "step": 1315 + }, + { + "epoch": 0.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00013657527159855148, + "loss": 1.5377, + "step": 1320 + }, + { + "epoch": 0.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00013709260217278842, + "loss": 1.5695, + "step": 1325 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.00013760993274702536, + "loss": 1.5705, + "step": 1330 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.0001381272633212623, + "loss": 1.5698, + "step": 1335 + }, + { + "epoch": 0.07, + "grad_norm": 0.546875, + "learning_rate": 0.0001386445938954992, + "loss": 1.5721, + "step": 1340 + }, + { + "epoch": 0.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00013916192446973617, + "loss": 1.5982, + "step": 1345 + }, + { + "epoch": 0.07, + "grad_norm": 0.578125, + "learning_rate": 0.0001396792550439731, + "loss": 1.5812, + "step": 1350 + }, + { + "epoch": 0.07, + "grad_norm": 0.5546875, + "learning_rate": 0.00014019658561821005, + "loss": 1.62, + "step": 1355 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.000140713916192447, + "loss": 1.6233, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 0.57421875, + "learning_rate": 0.0001412312467666839, + "loss": 1.5828, + "step": 1365 + }, + { + "epoch": 0.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00014174857734092087, + "loss": 1.594, + "step": 1370 + }, + { + "epoch": 0.07, + "grad_norm": 0.56640625, + "learning_rate": 0.0001422659079151578, + "loss": 1.6036, + "step": 1375 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.00014278323848939471, + "loss": 1.616, + "step": 1380 + }, + { + "epoch": 0.07, + "grad_norm": 0.53125, + "learning_rate": 0.00014330056906363168, + "loss": 1.5825, + "step": 1385 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.0001438178996378686, + "loss": 1.5786, + "step": 1390 + }, + { + "epoch": 0.07, + "grad_norm": 0.54296875, + "learning_rate": 0.00014433523021210553, + "loss": 1.5681, + "step": 1395 + }, + { + "epoch": 0.07, + "grad_norm": 0.55859375, + "learning_rate": 0.0001448525607863425, + "loss": 1.5683, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 0.54296875, + "learning_rate": 0.0001453698913605794, + "loss": 1.5987, + "step": 1405 + }, + { + "epoch": 0.07, + "grad_norm": 0.53515625, + "learning_rate": 0.00014588722193481637, + "loss": 1.5762, + "step": 1410 + }, + { + "epoch": 0.07, + "grad_norm": 0.61328125, + "learning_rate": 0.00014640455250905328, + "loss": 1.5804, + "step": 1415 + }, + { + "epoch": 0.07, + "grad_norm": 0.58203125, + "learning_rate": 0.00014692188308329022, + "loss": 1.5967, + "step": 1420 + }, + { + "epoch": 0.07, + "grad_norm": 0.55078125, + "learning_rate": 0.00014743921365752719, + "loss": 1.564, + "step": 1425 + }, + { + "epoch": 0.07, + "grad_norm": 0.53515625, + "learning_rate": 0.0001479565442317641, + "loss": 1.5913, + "step": 1430 + }, + { + "epoch": 0.07, + "grad_norm": 0.546875, + "learning_rate": 0.00014847387480600104, + "loss": 1.5789, + "step": 1435 + }, + { + "epoch": 0.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00014899120538023797, + "loss": 1.5692, + "step": 1440 + }, + { + "epoch": 0.07, + "grad_norm": 0.53125, + "learning_rate": 0.0001495085359544749, + "loss": 1.5682, + "step": 1445 + }, + { + "epoch": 0.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00015002586652871185, + "loss": 1.5949, + "step": 1450 + }, + { + "epoch": 0.08, + "grad_norm": 0.578125, + "learning_rate": 0.0001505431971029488, + "loss": 1.6124, + "step": 1455 + }, + { + "epoch": 0.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00015106052767718573, + "loss": 1.611, + "step": 1460 + }, + { + "epoch": 0.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00015157785825142266, + "loss": 1.5875, + "step": 1465 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.0001520951888256596, + "loss": 1.5513, + "step": 1470 + }, + { + "epoch": 0.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00015261251939989654, + "loss": 1.5774, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00015312984997413348, + "loss": 1.58, + "step": 1480 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00015364718054837042, + "loss": 1.584, + "step": 1485 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00015416451112260736, + "loss": 1.6072, + "step": 1490 + }, + { + "epoch": 0.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0001546818416968443, + "loss": 1.545, + "step": 1495 + }, + { + "epoch": 0.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00015519917227108123, + "loss": 1.59, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00015571650284531817, + "loss": 1.5463, + "step": 1505 + }, + { + "epoch": 0.08, + "grad_norm": 0.53515625, + "learning_rate": 0.0001562338334195551, + "loss": 1.5767, + "step": 1510 + }, + { + "epoch": 0.08, + "grad_norm": 0.55078125, + "learning_rate": 0.00015675116399379205, + "loss": 1.6077, + "step": 1515 + }, + { + "epoch": 0.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00015726849456802896, + "loss": 1.5653, + "step": 1520 + }, + { + "epoch": 0.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00015778582514226592, + "loss": 1.5685, + "step": 1525 + }, + { + "epoch": 0.08, + "grad_norm": 0.52734375, + "learning_rate": 0.00015830315571650286, + "loss": 1.5647, + "step": 1530 + }, + { + "epoch": 0.08, + "grad_norm": 0.52734375, + "learning_rate": 0.0001588204862907398, + "loss": 1.5662, + "step": 1535 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00015933781686497674, + "loss": 1.5859, + "step": 1540 + }, + { + "epoch": 0.08, + "grad_norm": 0.55078125, + "learning_rate": 0.00015985514743921365, + "loss": 1.5762, + "step": 1545 + }, + { + "epoch": 0.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00016037247801345062, + "loss": 1.5691, + "step": 1550 + }, + { + "epoch": 0.08, + "grad_norm": 0.5625, + "learning_rate": 0.00016088980858768755, + "loss": 1.5798, + "step": 1555 + }, + { + "epoch": 0.08, + "grad_norm": 0.54296875, + "learning_rate": 0.00016140713916192446, + "loss": 1.5793, + "step": 1560 + }, + { + "epoch": 0.08, + "grad_norm": 0.52734375, + "learning_rate": 0.00016192446973616143, + "loss": 1.5979, + "step": 1565 + }, + { + "epoch": 0.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00016244180031039834, + "loss": 1.5795, + "step": 1570 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00016295913088463528, + "loss": 1.5537, + "step": 1575 + }, + { + "epoch": 0.08, + "grad_norm": 0.53515625, + "learning_rate": 0.00016347646145887224, + "loss": 1.5306, + "step": 1580 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00016399379203310916, + "loss": 1.5812, + "step": 1585 + }, + { + "epoch": 0.08, + "grad_norm": 0.53515625, + "learning_rate": 0.0001645111226073461, + "loss": 1.5892, + "step": 1590 + }, + { + "epoch": 0.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00016502845318158303, + "loss": 1.5867, + "step": 1595 + }, + { + "epoch": 0.08, + "grad_norm": 0.5390625, + "learning_rate": 0.00016554578375581997, + "loss": 1.5385, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00016606311433005694, + "loss": 1.5428, + "step": 1605 + }, + { + "epoch": 0.08, + "grad_norm": 0.53125, + "learning_rate": 0.00016658044490429385, + "loss": 1.5596, + "step": 1610 + }, + { + "epoch": 0.08, + "grad_norm": 0.5390625, + "learning_rate": 0.00016709777547853079, + "loss": 1.5558, + "step": 1615 + }, + { + "epoch": 0.08, + "grad_norm": 0.546875, + "learning_rate": 0.00016761510605276772, + "loss": 1.5684, + "step": 1620 + }, + { + "epoch": 0.08, + "grad_norm": 0.5625, + "learning_rate": 0.00016813243662700466, + "loss": 1.572, + "step": 1625 + }, + { + "epoch": 0.08, + "grad_norm": 0.52734375, + "learning_rate": 0.0001686497672012416, + "loss": 1.5505, + "step": 1630 + }, + { + "epoch": 0.08, + "grad_norm": 0.53125, + "learning_rate": 0.00016916709777547854, + "loss": 1.5732, + "step": 1635 + }, + { + "epoch": 0.08, + "grad_norm": 0.51953125, + "learning_rate": 0.00016968442834971548, + "loss": 1.5629, + "step": 1640 + }, + { + "epoch": 0.09, + "grad_norm": 0.5390625, + "learning_rate": 0.00017020175892395241, + "loss": 1.5986, + "step": 1645 + }, + { + "epoch": 0.09, + "grad_norm": 0.54296875, + "learning_rate": 0.00017071908949818935, + "loss": 1.5384, + "step": 1650 + }, + { + "epoch": 0.09, + "grad_norm": 0.53515625, + "learning_rate": 0.0001712364200724263, + "loss": 1.5552, + "step": 1655 + }, + { + "epoch": 0.09, + "grad_norm": 0.54296875, + "learning_rate": 0.00017175375064666323, + "loss": 1.5812, + "step": 1660 + }, + { + "epoch": 0.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00017227108122090017, + "loss": 1.5385, + "step": 1665 + }, + { + "epoch": 0.09, + "grad_norm": 0.53125, + "learning_rate": 0.0001727884117951371, + "loss": 1.5383, + "step": 1670 + }, + { + "epoch": 0.09, + "grad_norm": 0.51953125, + "learning_rate": 0.00017330574236937404, + "loss": 1.5549, + "step": 1675 + }, + { + "epoch": 0.09, + "grad_norm": 0.53515625, + "learning_rate": 0.00017382307294361098, + "loss": 1.5509, + "step": 1680 + }, + { + "epoch": 0.09, + "grad_norm": 0.53515625, + "learning_rate": 0.0001743404035178479, + "loss": 1.6073, + "step": 1685 + }, + { + "epoch": 0.09, + "grad_norm": 0.55078125, + "learning_rate": 0.00017485773409208486, + "loss": 1.5538, + "step": 1690 + }, + { + "epoch": 0.09, + "grad_norm": 0.56640625, + "learning_rate": 0.0001753750646663218, + "loss": 1.595, + "step": 1695 + }, + { + "epoch": 0.09, + "grad_norm": 0.55859375, + "learning_rate": 0.0001758923952405587, + "loss": 1.5658, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 0.51953125, + "learning_rate": 0.00017640972581479567, + "loss": 1.5508, + "step": 1705 + }, + { + "epoch": 0.09, + "grad_norm": 0.51953125, + "learning_rate": 0.00017692705638903259, + "loss": 1.5406, + "step": 1710 + }, + { + "epoch": 0.09, + "grad_norm": 0.55078125, + "learning_rate": 0.00017744438696326955, + "loss": 1.5676, + "step": 1715 + }, + { + "epoch": 0.09, + "grad_norm": 0.53515625, + "learning_rate": 0.0001779617175375065, + "loss": 1.5622, + "step": 1720 + }, + { + "epoch": 0.09, + "grad_norm": 0.5390625, + "learning_rate": 0.0001784790481117434, + "loss": 1.5609, + "step": 1725 + }, + { + "epoch": 0.09, + "grad_norm": 0.546875, + "learning_rate": 0.00017899637868598037, + "loss": 1.578, + "step": 1730 + }, + { + "epoch": 0.09, + "grad_norm": 0.53125, + "learning_rate": 0.00017951370926021728, + "loss": 1.5461, + "step": 1735 + }, + { + "epoch": 0.09, + "grad_norm": 0.5234375, + "learning_rate": 0.00018003103983445421, + "loss": 1.5818, + "step": 1740 + }, + { + "epoch": 0.09, + "grad_norm": 0.5234375, + "learning_rate": 0.00018054837040869118, + "loss": 1.5628, + "step": 1745 + }, + { + "epoch": 0.09, + "grad_norm": 5.3125, + "learning_rate": 0.0001810657009829281, + "loss": 1.568, + "step": 1750 + }, + { + "epoch": 0.09, + "grad_norm": 0.51171875, + "learning_rate": 0.00018158303155716503, + "loss": 1.5412, + "step": 1755 + }, + { + "epoch": 0.09, + "grad_norm": 0.57421875, + "learning_rate": 0.00018210036213140197, + "loss": 1.6057, + "step": 1760 + }, + { + "epoch": 0.09, + "grad_norm": 0.546875, + "learning_rate": 0.0001826176927056389, + "loss": 1.5805, + "step": 1765 + }, + { + "epoch": 0.09, + "grad_norm": 0.82421875, + "learning_rate": 0.00018313502327987584, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 0.09, + "grad_norm": 0.6015625, + "learning_rate": 0.00018365235385411278, + "loss": 1.5586, + "step": 1775 + }, + { + "epoch": 0.09, + "grad_norm": 2.8125, + "learning_rate": 0.00018416968442834972, + "loss": 1.5524, + "step": 1780 + }, + { + "epoch": 0.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00018468701500258666, + "loss": 1.5908, + "step": 1785 + }, + { + "epoch": 0.09, + "grad_norm": 0.5078125, + "learning_rate": 0.0001852043455768236, + "loss": 1.5742, + "step": 1790 + }, + { + "epoch": 0.09, + "grad_norm": 0.50390625, + "learning_rate": 0.00018572167615106054, + "loss": 1.5759, + "step": 1795 + }, + { + "epoch": 0.09, + "grad_norm": 0.9140625, + "learning_rate": 0.00018623900672529747, + "loss": 1.6109, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 0.53125, + "learning_rate": 0.0001867563372995344, + "loss": 1.545, + "step": 1805 + }, + { + "epoch": 0.09, + "grad_norm": 0.546875, + "learning_rate": 0.00018727366787377135, + "loss": 1.5727, + "step": 1810 + }, + { + "epoch": 0.09, + "grad_norm": 0.5625, + "learning_rate": 0.0001877909984480083, + "loss": 1.5265, + "step": 1815 + }, + { + "epoch": 0.09, + "grad_norm": 0.53515625, + "learning_rate": 0.00018830832902224523, + "loss": 1.5541, + "step": 1820 + }, + { + "epoch": 0.09, + "grad_norm": 0.5390625, + "learning_rate": 0.00018882565959648216, + "loss": 1.5675, + "step": 1825 + }, + { + "epoch": 0.09, + "grad_norm": 1.328125, + "learning_rate": 0.0001893429901707191, + "loss": 1.5542, + "step": 1830 + }, + { + "epoch": 0.09, + "grad_norm": 0.5390625, + "learning_rate": 0.00018986032074495604, + "loss": 1.5244, + "step": 1835 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019037765131919298, + "loss": 1.5473, + "step": 1840 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019089498189342992, + "loss": 1.5554, + "step": 1845 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00019141231246766686, + "loss": 1.5603, + "step": 1850 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.0001919296430419038, + "loss": 1.5157, + "step": 1855 + }, + { + "epoch": 0.1, + "grad_norm": 0.51171875, + "learning_rate": 0.00019244697361614073, + "loss": 1.5392, + "step": 1860 + }, + { + "epoch": 0.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00019296430419037764, + "loss": 1.5601, + "step": 1865 + }, + { + "epoch": 0.1, + "grad_norm": 0.490234375, + "learning_rate": 0.0001934816347646146, + "loss": 1.5473, + "step": 1870 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019399896533885155, + "loss": 1.5782, + "step": 1875 + }, + { + "epoch": 0.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00019451629591308846, + "loss": 1.5659, + "step": 1880 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00019503362648732542, + "loss": 1.5565, + "step": 1885 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019555095706156234, + "loss": 1.5245, + "step": 1890 + }, + { + "epoch": 0.1, + "grad_norm": 0.515625, + "learning_rate": 0.00019606828763579927, + "loss": 1.5682, + "step": 1895 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019658561821003624, + "loss": 1.5439, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 5.0, + "learning_rate": 0.00019710294878427315, + "loss": 1.5547, + "step": 1905 + }, + { + "epoch": 0.1, + "grad_norm": 0.56640625, + "learning_rate": 0.00019762027935851012, + "loss": 1.5742, + "step": 1910 + }, + { + "epoch": 0.1, + "grad_norm": 0.5390625, + "learning_rate": 0.00019813760993274703, + "loss": 1.524, + "step": 1915 + }, + { + "epoch": 0.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00019865494050698396, + "loss": 1.5328, + "step": 1920 + }, + { + "epoch": 0.1, + "grad_norm": 0.578125, + "learning_rate": 0.00019917227108122093, + "loss": 1.5554, + "step": 1925 + }, + { + "epoch": 0.1, + "grad_norm": 0.5625, + "learning_rate": 0.00019968960165545784, + "loss": 1.57, + "step": 1930 + }, + { + "epoch": 0.1, + "grad_norm": 19.5, + "learning_rate": 0.00019999999347649694, + "loss": 1.5967, + "step": 1935 + }, + { + "epoch": 0.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00019999992008709735, + "loss": 1.5606, + "step": 1940 + }, + { + "epoch": 0.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00019999976515397937, + "loss": 1.5559, + "step": 1945 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019999952867726936, + "loss": 1.5675, + "step": 1950 + }, + { + "epoch": 0.1, + "grad_norm": 0.5390625, + "learning_rate": 0.0001999992106571601, + "loss": 1.5612, + "step": 1955 + }, + { + "epoch": 0.1, + "grad_norm": 0.55859375, + "learning_rate": 0.00019999881109391098, + "loss": 1.5634, + "step": 1960 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.0001999983299878478, + "loss": 1.5404, + "step": 1965 + }, + { + "epoch": 0.1, + "grad_norm": 0.55078125, + "learning_rate": 0.00019999776733936286, + "loss": 1.5875, + "step": 1970 + }, + { + "epoch": 0.1, + "grad_norm": 0.61328125, + "learning_rate": 0.00019999712314891496, + "loss": 1.5651, + "step": 1975 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00019999639741702943, + "loss": 1.5764, + "step": 1980 + }, + { + "epoch": 0.1, + "grad_norm": 0.5390625, + "learning_rate": 0.00019999559014429802, + "loss": 1.5411, + "step": 1985 + }, + { + "epoch": 0.1, + "grad_norm": 0.50390625, + "learning_rate": 0.00019999470133137906, + "loss": 1.5388, + "step": 1990 + }, + { + "epoch": 0.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00019999373097899728, + "loss": 1.5659, + "step": 1995 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00019999267908794394, + "loss": 1.5484, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 0.53125, + "learning_rate": 0.00019999154565907682, + "loss": 1.5623, + "step": 2005 + }, + { + "epoch": 0.1, + "grad_norm": 0.546875, + "learning_rate": 0.00019999033069332013, + "loss": 1.5429, + "step": 2010 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.0001999890341916646, + "loss": 1.5245, + "step": 2015 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.0001999876561551675, + "loss": 1.5415, + "step": 2020 + }, + { + "epoch": 0.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00019998619658495245, + "loss": 1.5505, + "step": 2025 + }, + { + "epoch": 0.11, + "grad_norm": 0.515625, + "learning_rate": 0.00019998465548220972, + "loss": 1.5406, + "step": 2030 + }, + { + "epoch": 0.11, + "grad_norm": 0.52734375, + "learning_rate": 0.00019998303284819594, + "loss": 1.5452, + "step": 2035 + }, + { + "epoch": 0.11, + "grad_norm": 0.515625, + "learning_rate": 0.00019998132868423427, + "loss": 1.5519, + "step": 2040 + }, + { + "epoch": 0.11, + "grad_norm": 0.5, + "learning_rate": 0.00019997954299171434, + "loss": 1.5671, + "step": 2045 + }, + { + "epoch": 0.11, + "grad_norm": 0.53515625, + "learning_rate": 0.0001999776757720923, + "loss": 1.5425, + "step": 2050 + }, + { + "epoch": 0.11, + "grad_norm": 0.5390625, + "learning_rate": 0.00019997572702689073, + "loss": 1.5056, + "step": 2055 + }, + { + "epoch": 0.11, + "grad_norm": 0.5234375, + "learning_rate": 0.00019997369675769873, + "loss": 1.511, + "step": 2060 + }, + { + "epoch": 0.11, + "grad_norm": 0.53125, + "learning_rate": 0.00019997158496617184, + "loss": 1.5355, + "step": 2065 + }, + { + "epoch": 0.11, + "grad_norm": 0.5390625, + "learning_rate": 0.00019996939165403208, + "loss": 1.5132, + "step": 2070 + }, + { + "epoch": 0.11, + "grad_norm": 0.52734375, + "learning_rate": 0.000199967116823068, + "loss": 1.5543, + "step": 2075 + }, + { + "epoch": 0.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00019996476047513454, + "loss": 1.5596, + "step": 2080 + }, + { + "epoch": 0.11, + "grad_norm": 0.5234375, + "learning_rate": 0.0001999623226121532, + "loss": 1.5632, + "step": 2085 + }, + { + "epoch": 0.11, + "grad_norm": 0.494140625, + "learning_rate": 0.0001999598032361119, + "loss": 1.5241, + "step": 2090 + }, + { + "epoch": 0.11, + "grad_norm": 0.55859375, + "learning_rate": 0.00019995720234906498, + "loss": 1.5281, + "step": 2095 + }, + { + "epoch": 0.11, + "grad_norm": 0.5078125, + "learning_rate": 0.00019995451995313335, + "loss": 1.5327, + "step": 2100 + }, + { + "epoch": 0.11, + "grad_norm": 0.53125, + "learning_rate": 0.00019995175605050434, + "loss": 1.5475, + "step": 2105 + }, + { + "epoch": 0.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001999489106434317, + "loss": 1.5329, + "step": 2110 + }, + { + "epoch": 0.11, + "grad_norm": 0.55078125, + "learning_rate": 0.0001999459837342357, + "loss": 1.5358, + "step": 2115 + }, + { + "epoch": 0.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00019994297532530312, + "loss": 1.5547, + "step": 2120 + }, + { + "epoch": 0.11, + "grad_norm": 0.498046875, + "learning_rate": 0.00019993988541908703, + "loss": 1.552, + "step": 2125 + }, + { + "epoch": 0.11, + "grad_norm": 0.55078125, + "learning_rate": 0.00019993671401810712, + "loss": 1.561, + "step": 2130 + }, + { + "epoch": 0.11, + "grad_norm": 0.5, + "learning_rate": 0.00019993346112494946, + "loss": 1.5502, + "step": 2135 + }, + { + "epoch": 0.11, + "grad_norm": 0.54296875, + "learning_rate": 0.00019993012674226655, + "loss": 1.5832, + "step": 2140 + }, + { + "epoch": 0.11, + "grad_norm": 0.5546875, + "learning_rate": 0.0001999267108727774, + "loss": 1.5401, + "step": 2145 + }, + { + "epoch": 0.11, + "grad_norm": 0.52734375, + "learning_rate": 0.00019992321351926744, + "loss": 1.5634, + "step": 2150 + }, + { + "epoch": 0.11, + "grad_norm": 0.53125, + "learning_rate": 0.00019991963468458853, + "loss": 1.5423, + "step": 2155 + }, + { + "epoch": 0.11, + "grad_norm": 0.50390625, + "learning_rate": 0.00019991597437165899, + "loss": 1.5138, + "step": 2160 + }, + { + "epoch": 0.11, + "grad_norm": 0.5234375, + "learning_rate": 0.00019991223258346362, + "loss": 1.4988, + "step": 2165 + }, + { + "epoch": 0.11, + "grad_norm": 0.54296875, + "learning_rate": 0.00019990840932305353, + "loss": 1.5531, + "step": 2170 + }, + { + "epoch": 0.11, + "grad_norm": 0.5078125, + "learning_rate": 0.0001999045045935464, + "loss": 1.522, + "step": 2175 + }, + { + "epoch": 0.11, + "grad_norm": 0.50390625, + "learning_rate": 0.00019990051839812633, + "loss": 1.5382, + "step": 2180 + }, + { + "epoch": 0.11, + "grad_norm": 0.5078125, + "learning_rate": 0.00019989645074004376, + "loss": 1.5066, + "step": 2185 + }, + { + "epoch": 0.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001998923016226156, + "loss": 1.5335, + "step": 2190 + }, + { + "epoch": 0.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001998880710492253, + "loss": 1.5735, + "step": 2195 + }, + { + "epoch": 0.11, + "grad_norm": 0.51953125, + "learning_rate": 0.0001998837590233225, + "loss": 1.5011, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00019987936554842346, + "loss": 1.5417, + "step": 2205 + }, + { + "epoch": 0.11, + "grad_norm": 0.5234375, + "learning_rate": 0.00019987489062811076, + "loss": 1.4801, + "step": 2210 + }, + { + "epoch": 0.11, + "grad_norm": 0.515625, + "learning_rate": 0.00019987033426603344, + "loss": 1.5865, + "step": 2215 + }, + { + "epoch": 0.11, + "grad_norm": 1.03125, + "learning_rate": 0.00019986569646590692, + "loss": 1.5352, + "step": 2220 + }, + { + "epoch": 0.12, + "grad_norm": 0.546875, + "learning_rate": 0.00019986097723151305, + "loss": 1.5486, + "step": 2225 + }, + { + "epoch": 0.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00019985617656670005, + "loss": 1.5046, + "step": 2230 + }, + { + "epoch": 0.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00019985129447538258, + "loss": 1.5207, + "step": 2235 + }, + { + "epoch": 0.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00019984633096154167, + "loss": 1.5484, + "step": 2240 + }, + { + "epoch": 0.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00019984128602922477, + "loss": 1.4857, + "step": 2245 + }, + { + "epoch": 0.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00019983615968254573, + "loss": 1.5185, + "step": 2250 + }, + { + "epoch": 0.12, + "grad_norm": 0.5234375, + "learning_rate": 0.0001998309519256847, + "loss": 1.5341, + "step": 2255 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019982566276288834, + "loss": 1.5127, + "step": 2260 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019982029219846962, + "loss": 1.5261, + "step": 2265 + }, + { + "epoch": 0.12, + "grad_norm": 0.546875, + "learning_rate": 0.00019981484023680787, + "loss": 1.5209, + "step": 2270 + }, + { + "epoch": 0.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00019980930688234886, + "loss": 1.5647, + "step": 2275 + }, + { + "epoch": 0.12, + "grad_norm": 0.498046875, + "learning_rate": 0.00019980369213960472, + "loss": 1.5466, + "step": 2280 + }, + { + "epoch": 0.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00019979799601315388, + "loss": 1.5414, + "step": 2285 + }, + { + "epoch": 0.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00019979221850764117, + "loss": 1.5479, + "step": 2290 + }, + { + "epoch": 0.12, + "grad_norm": 0.70703125, + "learning_rate": 0.0001997863596277778, + "loss": 1.5523, + "step": 2295 + }, + { + "epoch": 0.12, + "grad_norm": 0.546875, + "learning_rate": 0.00019978041937834137, + "loss": 1.5381, + "step": 2300 + }, + { + "epoch": 0.12, + "grad_norm": 0.49609375, + "learning_rate": 0.0001997743977641757, + "loss": 1.5089, + "step": 2305 + }, + { + "epoch": 0.12, + "grad_norm": 6.5625, + "learning_rate": 0.00019976829479019113, + "loss": 1.5425, + "step": 2310 + }, + { + "epoch": 0.12, + "grad_norm": 0.5078125, + "learning_rate": 0.0001997621104613642, + "loss": 1.5251, + "step": 2315 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019975584478273782, + "loss": 1.5214, + "step": 2320 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019974949775942134, + "loss": 1.5123, + "step": 2325 + }, + { + "epoch": 0.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00019974306939659026, + "loss": 1.5431, + "step": 2330 + }, + { + "epoch": 0.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00019973655969948663, + "loss": 1.5478, + "step": 2335 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019972996867341863, + "loss": 1.5273, + "step": 2340 + }, + { + "epoch": 0.12, + "grad_norm": 0.498046875, + "learning_rate": 0.00019972329632376084, + "loss": 1.5277, + "step": 2345 + }, + { + "epoch": 0.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00019971654265595415, + "loss": 1.5046, + "step": 2350 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00019970970767550577, + "loss": 1.5588, + "step": 2355 + }, + { + "epoch": 0.12, + "grad_norm": 0.52734375, + "learning_rate": 0.0001997027913879892, + "loss": 1.6231, + "step": 2360 + }, + { + "epoch": 0.12, + "grad_norm": 0.5078125, + "learning_rate": 0.0001996957937990442, + "loss": 1.5189, + "step": 2365 + }, + { + "epoch": 0.12, + "grad_norm": 0.51171875, + "learning_rate": 0.00019968871491437691, + "loss": 1.5559, + "step": 2370 + }, + { + "epoch": 0.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00019968155473975974, + "loss": 1.5194, + "step": 2375 + }, + { + "epoch": 0.12, + "grad_norm": 0.5, + "learning_rate": 0.0001996743132810313, + "loss": 1.5081, + "step": 2380 + }, + { + "epoch": 0.12, + "grad_norm": 0.51171875, + "learning_rate": 0.0001996669905440966, + "loss": 1.5502, + "step": 2385 + }, + { + "epoch": 0.12, + "grad_norm": 0.51953125, + "learning_rate": 0.0001996595865349269, + "loss": 1.5206, + "step": 2390 + }, + { + "epoch": 0.12, + "grad_norm": 0.515625, + "learning_rate": 0.00019965210125955966, + "loss": 1.5631, + "step": 2395 + }, + { + "epoch": 0.12, + "grad_norm": 0.51953125, + "learning_rate": 0.00019964453472409867, + "loss": 1.4963, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00019963688693471396, + "loss": 1.5388, + "step": 2405 + }, + { + "epoch": 0.12, + "grad_norm": 0.50390625, + "learning_rate": 0.00019962915789764182, + "loss": 1.5142, + "step": 2410 + }, + { + "epoch": 0.12, + "grad_norm": 0.5, + "learning_rate": 0.00019962134761918488, + "loss": 1.5213, + "step": 2415 + }, + { + "epoch": 0.13, + "grad_norm": 0.5390625, + "learning_rate": 0.00019961345610571183, + "loss": 1.5471, + "step": 2420 + }, + { + "epoch": 0.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00019960548336365774, + "loss": 1.507, + "step": 2425 + }, + { + "epoch": 0.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00019959742939952392, + "loss": 1.5371, + "step": 2430 + }, + { + "epoch": 0.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00019958929421987783, + "loss": 1.5465, + "step": 2435 + }, + { + "epoch": 0.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00019958107783135326, + "loss": 1.4912, + "step": 2440 + }, + { + "epoch": 0.13, + "grad_norm": 0.51953125, + "learning_rate": 0.00019957278024065013, + "loss": 1.5464, + "step": 2445 + }, + { + "epoch": 0.13, + "grad_norm": 0.50390625, + "learning_rate": 0.00019956440145453458, + "loss": 1.5109, + "step": 2450 + }, + { + "epoch": 0.13, + "grad_norm": 0.55078125, + "learning_rate": 0.00019955594147983905, + "loss": 1.5218, + "step": 2455 + }, + { + "epoch": 0.13, + "grad_norm": 0.55859375, + "learning_rate": 0.00019954740032346208, + "loss": 1.4954, + "step": 2460 + }, + { + "epoch": 0.13, + "grad_norm": 0.51953125, + "learning_rate": 0.0001995387779923685, + "loss": 1.5223, + "step": 2465 + }, + { + "epoch": 0.13, + "grad_norm": 0.5390625, + "learning_rate": 0.0001995300744935892, + "loss": 1.5373, + "step": 2470 + }, + { + "epoch": 0.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00019952128983422146, + "loss": 1.5302, + "step": 2475 + }, + { + "epoch": 0.13, + "grad_norm": 0.51953125, + "learning_rate": 0.00019951242402142848, + "loss": 1.5762, + "step": 2480 + }, + { + "epoch": 0.13, + "grad_norm": 0.498046875, + "learning_rate": 0.0001995034770624399, + "loss": 1.4838, + "step": 2485 + }, + { + "epoch": 0.13, + "grad_norm": 0.515625, + "learning_rate": 0.00019949444896455137, + "loss": 1.5001, + "step": 2490 + }, + { + "epoch": 0.13, + "grad_norm": 0.52734375, + "learning_rate": 0.00019948533973512472, + "loss": 1.5255, + "step": 2495 + }, + { + "epoch": 0.13, + "grad_norm": 0.51953125, + "learning_rate": 0.000199476149381588, + "loss": 1.5226, + "step": 2500 + }, + { + "epoch": 0.13, + "grad_norm": 0.53125, + "learning_rate": 0.0001994668779114353, + "loss": 1.5017, + "step": 2505 + }, + { + "epoch": 0.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00019945752533222704, + "loss": 1.5007, + "step": 2510 + }, + { + "epoch": 0.13, + "grad_norm": 0.515625, + "learning_rate": 0.00019944809165158955, + "loss": 1.5411, + "step": 2515 + }, + { + "epoch": 0.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0001994385768772155, + "loss": 1.5656, + "step": 2520 + }, + { + "epoch": 0.13, + "grad_norm": 0.51953125, + "learning_rate": 0.00019942898101686356, + "loss": 1.5273, + "step": 2525 + }, + { + "epoch": 0.13, + "grad_norm": 0.50390625, + "learning_rate": 0.00019941930407835857, + "loss": 1.5197, + "step": 2530 + }, + { + "epoch": 0.13, + "grad_norm": 0.52734375, + "learning_rate": 0.00019940954606959143, + "loss": 1.4927, + "step": 2535 + }, + { + "epoch": 0.13, + "grad_norm": 0.52734375, + "learning_rate": 0.00019939970699851925, + "loss": 1.5054, + "step": 2540 + }, + { + "epoch": 0.13, + "grad_norm": 0.51171875, + "learning_rate": 0.0001993897868731651, + "loss": 1.5288, + "step": 2545 + }, + { + "epoch": 0.13, + "grad_norm": 0.5234375, + "learning_rate": 0.00019937978570161834, + "loss": 1.4935, + "step": 2550 + }, + { + "epoch": 0.13, + "grad_norm": 0.515625, + "learning_rate": 0.00019936970349203423, + "loss": 1.5252, + "step": 2555 + }, + { + "epoch": 0.13, + "grad_norm": 0.515625, + "learning_rate": 0.00019935954025263416, + "loss": 1.5234, + "step": 2560 + }, + { + "epoch": 0.13, + "grad_norm": 0.53125, + "learning_rate": 0.00019934929599170568, + "loss": 1.5277, + "step": 2565 + }, + { + "epoch": 0.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00019933897071760235, + "loss": 1.5002, + "step": 2570 + }, + { + "epoch": 0.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00019932856443874374, + "loss": 1.4713, + "step": 2575 + }, + { + "epoch": 0.13, + "grad_norm": 0.5390625, + "learning_rate": 0.00019931807716361554, + "loss": 1.5328, + "step": 2580 + }, + { + "epoch": 0.13, + "grad_norm": 0.494140625, + "learning_rate": 0.00019930750890076947, + "loss": 1.5025, + "step": 2585 + }, + { + "epoch": 0.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0001992968596588233, + "loss": 1.5213, + "step": 2590 + }, + { + "epoch": 0.13, + "grad_norm": 0.4921875, + "learning_rate": 0.00019928612944646084, + "loss": 1.4832, + "step": 2595 + }, + { + "epoch": 0.13, + "grad_norm": 0.5, + "learning_rate": 0.00019927531827243188, + "loss": 1.5389, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 0.5078125, + "learning_rate": 0.0001992644261455523, + "loss": 1.5366, + "step": 2605 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 0.0001992534530747039, + "loss": 1.5488, + "step": 2610 + }, + { + "epoch": 0.14, + "grad_norm": 0.498046875, + "learning_rate": 0.00019924239906883457, + "loss": 1.4925, + "step": 2615 + }, + { + "epoch": 0.14, + "grad_norm": 0.50390625, + "learning_rate": 0.00019923126413695817, + "loss": 1.4764, + "step": 2620 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019922004828815454, + "loss": 1.5102, + "step": 2625 + }, + { + "epoch": 0.14, + "grad_norm": 0.54296875, + "learning_rate": 0.0001992087515315695, + "loss": 1.5488, + "step": 2630 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.00019919737387641485, + "loss": 1.5403, + "step": 2635 + }, + { + "epoch": 0.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00019918591533196834, + "loss": 1.5077, + "step": 2640 + }, + { + "epoch": 0.14, + "grad_norm": 0.5234375, + "learning_rate": 0.00019917437590757375, + "loss": 1.4922, + "step": 2645 + }, + { + "epoch": 0.14, + "grad_norm": 0.4921875, + "learning_rate": 0.00019916275561264075, + "loss": 1.5049, + "step": 2650 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 0.00019915105445664493, + "loss": 1.5126, + "step": 2655 + }, + { + "epoch": 0.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00019913927244912788, + "loss": 1.5371, + "step": 2660 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.0001991274095996971, + "loss": 1.4777, + "step": 2665 + }, + { + "epoch": 0.14, + "grad_norm": 0.5234375, + "learning_rate": 0.00019911546591802604, + "loss": 1.5001, + "step": 2670 + }, + { + "epoch": 0.14, + "grad_norm": 0.5390625, + "learning_rate": 0.00019910344141385396, + "loss": 1.5312, + "step": 2675 + }, + { + "epoch": 0.14, + "grad_norm": 0.52734375, + "learning_rate": 0.00019909133609698616, + "loss": 1.4799, + "step": 2680 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 0.00019907914997729372, + "loss": 1.4944, + "step": 2685 + }, + { + "epoch": 0.14, + "grad_norm": 0.54296875, + "learning_rate": 0.00019906688306471366, + "loss": 1.4979, + "step": 2690 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.00019905453536924893, + "loss": 1.5127, + "step": 2695 + }, + { + "epoch": 0.14, + "grad_norm": 0.494140625, + "learning_rate": 0.0001990421069009683, + "loss": 1.5091, + "step": 2700 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.0001990295976700064, + "loss": 1.5043, + "step": 2705 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019901700768656372, + "loss": 1.5113, + "step": 2710 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.0001990043369609066, + "loss": 1.5135, + "step": 2715 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 0.00019899158550336729, + "loss": 1.5443, + "step": 2720 + }, + { + "epoch": 0.14, + "grad_norm": 0.50390625, + "learning_rate": 0.00019897875332434376, + "loss": 1.4857, + "step": 2725 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019896584043429988, + "loss": 1.5084, + "step": 2730 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019895284684376524, + "loss": 1.4913, + "step": 2735 + }, + { + "epoch": 0.14, + "grad_norm": 0.50390625, + "learning_rate": 0.0001989397725633354, + "loss": 1.5016, + "step": 2740 + }, + { + "epoch": 0.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00019892661760367156, + "loss": 1.5035, + "step": 2745 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019891338197550081, + "loss": 1.5148, + "step": 2750 + }, + { + "epoch": 0.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00019890006568961597, + "loss": 1.4888, + "step": 2755 + }, + { + "epoch": 0.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00019888666875687565, + "loss": 1.5469, + "step": 2760 + }, + { + "epoch": 0.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00019887319118820418, + "loss": 1.5647, + "step": 2765 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 0.0001988596329945917, + "loss": 1.5234, + "step": 2770 + }, + { + "epoch": 0.14, + "grad_norm": 0.53515625, + "learning_rate": 0.0001988459941870941, + "loss": 1.5043, + "step": 2775 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.00019883227477683296, + "loss": 1.5245, + "step": 2780 + }, + { + "epoch": 0.14, + "grad_norm": 0.53125, + "learning_rate": 0.00019881847477499557, + "loss": 1.5148, + "step": 2785 + }, + { + "epoch": 0.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00019880459419283503, + "loss": 1.5096, + "step": 2790 + }, + { + "epoch": 0.14, + "grad_norm": 0.51953125, + "learning_rate": 0.0001987906330416701, + "loss": 1.5181, + "step": 2795 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 0.00019877659133288515, + "loss": 1.5406, + "step": 2800 + }, + { + "epoch": 0.15, + "grad_norm": 0.498046875, + "learning_rate": 0.0001987624690779304, + "loss": 1.5078, + "step": 2805 + }, + { + "epoch": 0.15, + "grad_norm": 0.5390625, + "learning_rate": 0.00019874826628832164, + "loss": 1.5313, + "step": 2810 + }, + { + "epoch": 0.15, + "grad_norm": 0.5078125, + "learning_rate": 0.00019873398297564037, + "loss": 1.4807, + "step": 2815 + }, + { + "epoch": 0.15, + "grad_norm": 0.494140625, + "learning_rate": 0.0001987196191515337, + "loss": 1.5308, + "step": 2820 + }, + { + "epoch": 0.15, + "grad_norm": 0.5078125, + "learning_rate": 0.0001987051748277145, + "loss": 1.5101, + "step": 2825 + }, + { + "epoch": 0.15, + "grad_norm": 0.498046875, + "learning_rate": 0.00019869065001596118, + "loss": 1.4953, + "step": 2830 + }, + { + "epoch": 0.15, + "grad_norm": 0.5, + "learning_rate": 0.00019867604472811786, + "loss": 1.4817, + "step": 2835 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00019866135897609423, + "loss": 1.4956, + "step": 2840 + }, + { + "epoch": 0.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00019864659277186555, + "loss": 1.5151, + "step": 2845 + }, + { + "epoch": 0.15, + "grad_norm": 0.515625, + "learning_rate": 0.0001986317461274728, + "loss": 1.4996, + "step": 2850 + }, + { + "epoch": 0.15, + "grad_norm": 0.51953125, + "learning_rate": 0.00019861681905502246, + "loss": 1.5088, + "step": 2855 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.0001986018115666867, + "loss": 1.507, + "step": 2860 + }, + { + "epoch": 0.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00019858672367470312, + "loss": 1.516, + "step": 2865 + }, + { + "epoch": 0.15, + "grad_norm": 0.48828125, + "learning_rate": 0.000198571555391375, + "loss": 1.5231, + "step": 2870 + }, + { + "epoch": 0.15, + "grad_norm": 0.5, + "learning_rate": 0.00019855630672907108, + "loss": 1.4969, + "step": 2875 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00019854097770022577, + "loss": 1.4924, + "step": 2880 + }, + { + "epoch": 0.15, + "grad_norm": 0.52734375, + "learning_rate": 0.0001985255683173389, + "loss": 1.5237, + "step": 2885 + }, + { + "epoch": 0.15, + "grad_norm": 0.51953125, + "learning_rate": 0.00019851007859297585, + "loss": 1.5364, + "step": 2890 + }, + { + "epoch": 0.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00019849450853976755, + "loss": 1.5116, + "step": 2895 + }, + { + "epoch": 0.15, + "grad_norm": 0.486328125, + "learning_rate": 0.0001984788581704104, + "loss": 1.4999, + "step": 2900 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.0001984631274976663, + "loss": 1.4907, + "step": 2905 + }, + { + "epoch": 0.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00019844731653436264, + "loss": 1.5079, + "step": 2910 + }, + { + "epoch": 0.15, + "grad_norm": 0.53125, + "learning_rate": 0.0001984314252933923, + "loss": 1.4708, + "step": 2915 + }, + { + "epoch": 0.15, + "grad_norm": 0.5078125, + "learning_rate": 0.00019841545378771356, + "loss": 1.4926, + "step": 2920 + }, + { + "epoch": 0.15, + "grad_norm": 0.51953125, + "learning_rate": 0.0001983994020303502, + "loss": 1.4863, + "step": 2925 + }, + { + "epoch": 0.15, + "grad_norm": 0.5390625, + "learning_rate": 0.00019838327003439147, + "loss": 1.4937, + "step": 2930 + }, + { + "epoch": 0.15, + "grad_norm": 0.5390625, + "learning_rate": 0.00019836705781299196, + "loss": 1.535, + "step": 2935 + }, + { + "epoch": 0.15, + "grad_norm": 0.5078125, + "learning_rate": 0.00019835076537937178, + "loss": 1.4663, + "step": 2940 + }, + { + "epoch": 0.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00019833439274681634, + "loss": 1.5312, + "step": 2945 + }, + { + "epoch": 0.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00019831793992867652, + "loss": 1.5367, + "step": 2950 + }, + { + "epoch": 0.15, + "grad_norm": 0.5234375, + "learning_rate": 0.0001983014069383686, + "loss": 1.5108, + "step": 2955 + }, + { + "epoch": 0.15, + "grad_norm": 0.5390625, + "learning_rate": 0.00019828479378937417, + "loss": 1.5443, + "step": 2960 + }, + { + "epoch": 0.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00019826810049524026, + "loss": 1.506, + "step": 2965 + }, + { + "epoch": 0.15, + "grad_norm": 0.48828125, + "learning_rate": 0.00019825132706957917, + "loss": 1.5365, + "step": 2970 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00019823447352606858, + "loss": 1.4662, + "step": 2975 + }, + { + "epoch": 0.15, + "grad_norm": 0.49609375, + "learning_rate": 0.00019821753987845156, + "loss": 1.5271, + "step": 2980 + }, + { + "epoch": 0.15, + "grad_norm": 0.54296875, + "learning_rate": 0.0001982005261405364, + "loss": 1.4906, + "step": 2985 + }, + { + "epoch": 0.15, + "grad_norm": 0.53515625, + "learning_rate": 0.0001981834323261968, + "loss": 1.4914, + "step": 2990 + }, + { + "epoch": 0.15, + "grad_norm": 0.5, + "learning_rate": 0.00019816625844937163, + "loss": 1.5608, + "step": 2995 + }, + { + "epoch": 0.16, + "grad_norm": 0.51953125, + "learning_rate": 0.0001981490045240652, + "loss": 1.4996, + "step": 3000 + }, + { + "epoch": 0.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00019813167056434693, + "loss": 1.4952, + "step": 3005 + }, + { + "epoch": 0.16, + "grad_norm": 0.53125, + "learning_rate": 0.00019811425658435166, + "loss": 1.5259, + "step": 3010 + }, + { + "epoch": 0.16, + "grad_norm": 0.5078125, + "learning_rate": 0.00019809676259827935, + "loss": 1.4858, + "step": 3015 + }, + { + "epoch": 0.16, + "grad_norm": 0.5, + "learning_rate": 0.0001980791886203953, + "loss": 1.5333, + "step": 3020 + }, + { + "epoch": 0.16, + "grad_norm": 0.5234375, + "learning_rate": 0.00019806153466502997, + "loss": 1.5455, + "step": 3025 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019804380074657906, + "loss": 1.5162, + "step": 3030 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019802598687950352, + "loss": 1.5035, + "step": 3035 + }, + { + "epoch": 0.16, + "grad_norm": 0.52734375, + "learning_rate": 0.00019800809307832942, + "loss": 1.4887, + "step": 3040 + }, + { + "epoch": 0.16, + "grad_norm": 0.5546875, + "learning_rate": 0.00019799011935764803, + "loss": 1.4994, + "step": 3045 + }, + { + "epoch": 0.16, + "grad_norm": 0.51171875, + "learning_rate": 0.0001979720657321158, + "loss": 1.5093, + "step": 3050 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019795393221645437, + "loss": 1.5287, + "step": 3055 + }, + { + "epoch": 0.16, + "grad_norm": 0.52734375, + "learning_rate": 0.00019793571882545047, + "loss": 1.4979, + "step": 3060 + }, + { + "epoch": 0.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00019791742557395602, + "loss": 1.5138, + "step": 3065 + }, + { + "epoch": 0.16, + "grad_norm": 0.5234375, + "learning_rate": 0.000197899052476888, + "loss": 1.525, + "step": 3070 + }, + { + "epoch": 0.16, + "grad_norm": 0.53515625, + "learning_rate": 0.00019788059954922856, + "loss": 1.4849, + "step": 3075 + }, + { + "epoch": 0.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00019786206680602486, + "loss": 1.531, + "step": 3080 + }, + { + "epoch": 0.16, + "grad_norm": 0.56640625, + "learning_rate": 0.00019784345426238927, + "loss": 1.528, + "step": 3085 + }, + { + "epoch": 0.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00019782476193349905, + "loss": 1.5096, + "step": 3090 + }, + { + "epoch": 0.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00019780598983459678, + "loss": 1.5177, + "step": 3095 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019778713798098983, + "loss": 1.5012, + "step": 3100 + }, + { + "epoch": 0.16, + "grad_norm": 0.5078125, + "learning_rate": 0.00019776820638805077, + "loss": 1.4774, + "step": 3105 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.0001977491950712171, + "loss": 1.5024, + "step": 3110 + }, + { + "epoch": 0.16, + "grad_norm": 0.498046875, + "learning_rate": 0.0001977301040459914, + "loss": 1.5284, + "step": 3115 + }, + { + "epoch": 0.16, + "grad_norm": 0.5234375, + "learning_rate": 0.00019771093332794117, + "loss": 1.5153, + "step": 3120 + }, + { + "epoch": 0.16, + "grad_norm": 0.53125, + "learning_rate": 0.000197691682932699, + "loss": 1.5114, + "step": 3125 + }, + { + "epoch": 0.16, + "grad_norm": 0.54296875, + "learning_rate": 0.00019767235287596237, + "loss": 1.4915, + "step": 3130 + }, + { + "epoch": 0.16, + "grad_norm": 0.5078125, + "learning_rate": 0.0001976529431734937, + "loss": 1.4751, + "step": 3135 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019763345384112043, + "loss": 1.5189, + "step": 3140 + }, + { + "epoch": 0.16, + "grad_norm": 0.50390625, + "learning_rate": 0.0001976138848947349, + "loss": 1.5114, + "step": 3145 + }, + { + "epoch": 0.16, + "grad_norm": 0.55859375, + "learning_rate": 0.00019759423635029434, + "loss": 1.5104, + "step": 3150 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.00019757450822382094, + "loss": 1.5031, + "step": 3155 + }, + { + "epoch": 0.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00019755470053140178, + "loss": 1.4643, + "step": 3160 + }, + { + "epoch": 0.16, + "grad_norm": 0.515625, + "learning_rate": 0.0001975348132891888, + "loss": 1.483, + "step": 3165 + }, + { + "epoch": 0.16, + "grad_norm": 0.498046875, + "learning_rate": 0.00019751484651339877, + "loss": 1.4759, + "step": 3170 + }, + { + "epoch": 0.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00019749480022031337, + "loss": 1.4971, + "step": 3175 + }, + { + "epoch": 0.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00019747467442627912, + "loss": 1.4768, + "step": 3180 + }, + { + "epoch": 0.16, + "grad_norm": 0.51171875, + "learning_rate": 0.00019745446914770732, + "loss": 1.4869, + "step": 3185 + }, + { + "epoch": 0.17, + "grad_norm": 0.5234375, + "learning_rate": 0.00019743418440107418, + "loss": 1.5524, + "step": 3190 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019741382020292063, + "loss": 1.4935, + "step": 3195 + }, + { + "epoch": 0.17, + "grad_norm": 0.52734375, + "learning_rate": 0.00019739337656985234, + "loss": 1.5014, + "step": 3200 + }, + { + "epoch": 0.17, + "grad_norm": 0.494140625, + "learning_rate": 0.0001973728535185399, + "loss": 1.5191, + "step": 3205 + }, + { + "epoch": 0.17, + "grad_norm": 0.51953125, + "learning_rate": 0.00019735225106571854, + "loss": 1.4759, + "step": 3210 + }, + { + "epoch": 0.17, + "grad_norm": 0.49609375, + "learning_rate": 0.00019733156922818835, + "loss": 1.5358, + "step": 3215 + }, + { + "epoch": 0.17, + "grad_norm": 0.515625, + "learning_rate": 0.00019731080802281396, + "loss": 1.4926, + "step": 3220 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019728996746652496, + "loss": 1.4928, + "step": 3225 + }, + { + "epoch": 0.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00019726904757631544, + "loss": 1.4744, + "step": 3230 + }, + { + "epoch": 0.17, + "grad_norm": 0.5546875, + "learning_rate": 0.0001972480483692443, + "loss": 1.516, + "step": 3235 + }, + { + "epoch": 0.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00019722696986243515, + "loss": 1.4968, + "step": 3240 + }, + { + "epoch": 0.17, + "grad_norm": 0.51953125, + "learning_rate": 0.00019720581207307612, + "loss": 1.5516, + "step": 3245 + }, + { + "epoch": 0.17, + "grad_norm": 0.48046875, + "learning_rate": 0.0001971845750184201, + "loss": 1.4762, + "step": 3250 + }, + { + "epoch": 0.17, + "grad_norm": 0.51953125, + "learning_rate": 0.00019716325871578462, + "loss": 1.4957, + "step": 3255 + }, + { + "epoch": 0.17, + "grad_norm": 0.490234375, + "learning_rate": 0.0001971418631825517, + "loss": 1.4826, + "step": 3260 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019712038843616817, + "loss": 1.4764, + "step": 3265 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019709883449414535, + "loss": 1.4964, + "step": 3270 + }, + { + "epoch": 0.17, + "grad_norm": 0.515625, + "learning_rate": 0.00019707720137405907, + "loss": 1.5065, + "step": 3275 + }, + { + "epoch": 0.17, + "grad_norm": 0.5546875, + "learning_rate": 0.00019705548909354983, + "loss": 1.4792, + "step": 3280 + }, + { + "epoch": 0.17, + "grad_norm": 0.57421875, + "learning_rate": 0.00019703369767032266, + "loss": 1.4646, + "step": 3285 + }, + { + "epoch": 0.17, + "grad_norm": 0.546875, + "learning_rate": 0.0001970118271221471, + "loss": 1.5114, + "step": 3290 + }, + { + "epoch": 0.17, + "grad_norm": 0.5390625, + "learning_rate": 0.0001969898774668572, + "loss": 1.4886, + "step": 3295 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019696784872235158, + "loss": 1.509, + "step": 3300 + }, + { + "epoch": 0.17, + "grad_norm": 0.5234375, + "learning_rate": 0.0001969457409065933, + "loss": 1.4883, + "step": 3305 + }, + { + "epoch": 0.17, + "grad_norm": 0.54296875, + "learning_rate": 0.00019692355403760987, + "loss": 1.5194, + "step": 3310 + }, + { + "epoch": 0.17, + "grad_norm": 0.5078125, + "learning_rate": 0.00019690128813349333, + "loss": 1.5052, + "step": 3315 + }, + { + "epoch": 0.17, + "grad_norm": 0.49609375, + "learning_rate": 0.00019687894321240016, + "loss": 1.4941, + "step": 3320 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.00019685651929255123, + "loss": 1.5108, + "step": 3325 + }, + { + "epoch": 0.17, + "grad_norm": 0.50390625, + "learning_rate": 0.0001968340163922319, + "loss": 1.5232, + "step": 3330 + }, + { + "epoch": 0.17, + "grad_norm": 0.50390625, + "learning_rate": 0.00019681143452979178, + "loss": 1.5024, + "step": 3335 + }, + { + "epoch": 0.17, + "grad_norm": 0.5390625, + "learning_rate": 0.0001967887737236451, + "loss": 1.4631, + "step": 3340 + }, + { + "epoch": 0.17, + "grad_norm": 0.53515625, + "learning_rate": 0.00019676603399227023, + "loss": 1.5092, + "step": 3345 + }, + { + "epoch": 0.17, + "grad_norm": 0.4921875, + "learning_rate": 0.0001967432153542101, + "loss": 1.5084, + "step": 3350 + }, + { + "epoch": 0.17, + "grad_norm": 0.53125, + "learning_rate": 0.00019672031782807178, + "loss": 1.4948, + "step": 3355 + }, + { + "epoch": 0.17, + "grad_norm": 0.490234375, + "learning_rate": 0.0001966973414325269, + "loss": 1.4951, + "step": 3360 + }, + { + "epoch": 0.17, + "grad_norm": 0.52734375, + "learning_rate": 0.00019667428618631126, + "loss": 1.5071, + "step": 3365 + }, + { + "epoch": 0.17, + "grad_norm": 0.51953125, + "learning_rate": 0.00019665115210822489, + "loss": 1.4793, + "step": 3370 + }, + { + "epoch": 0.17, + "grad_norm": 0.4921875, + "learning_rate": 0.00019662793921713226, + "loss": 1.5236, + "step": 3375 + }, + { + "epoch": 0.17, + "grad_norm": 0.5078125, + "learning_rate": 0.00019660464753196207, + "loss": 1.4882, + "step": 3380 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00019658127707170716, + "loss": 1.4904, + "step": 3385 + }, + { + "epoch": 0.18, + "grad_norm": 0.5, + "learning_rate": 0.00019655782785542476, + "loss": 1.4852, + "step": 3390 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 0.0001965342999022362, + "loss": 1.4834, + "step": 3395 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.0001965106932313271, + "loss": 1.5002, + "step": 3400 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.0001964870078619472, + "loss": 1.4868, + "step": 3405 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00019646324381341045, + "loss": 1.514, + "step": 3410 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 0.000196439401105095, + "loss": 1.4797, + "step": 3415 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 0.00019641547975644304, + "loss": 1.4759, + "step": 3420 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.00019639147978696097, + "loss": 1.5039, + "step": 3425 + }, + { + "epoch": 0.18, + "grad_norm": 0.490234375, + "learning_rate": 0.0001963674012162193, + "loss": 1.4789, + "step": 3430 + }, + { + "epoch": 0.18, + "grad_norm": 0.5, + "learning_rate": 0.00019634324406385252, + "loss": 1.5134, + "step": 3435 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00019631900834955935, + "loss": 1.4714, + "step": 3440 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.00019629469409310253, + "loss": 1.5027, + "step": 3445 + }, + { + "epoch": 0.18, + "grad_norm": 0.51171875, + "learning_rate": 0.00019627030131430875, + "loss": 1.5045, + "step": 3450 + }, + { + "epoch": 0.18, + "grad_norm": 0.51171875, + "learning_rate": 0.0001962458300330689, + "loss": 1.5231, + "step": 3455 + }, + { + "epoch": 0.18, + "grad_norm": 0.52734375, + "learning_rate": 0.0001962212802693377, + "loss": 1.5213, + "step": 3460 + }, + { + "epoch": 0.18, + "grad_norm": 0.53125, + "learning_rate": 0.000196196652043134, + "loss": 1.5295, + "step": 3465 + }, + { + "epoch": 0.18, + "grad_norm": 0.4921875, + "learning_rate": 0.0001961719453745406, + "loss": 1.5081, + "step": 3470 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.0001961471602837042, + "loss": 1.4992, + "step": 3475 + }, + { + "epoch": 0.18, + "grad_norm": 0.48828125, + "learning_rate": 0.00019612229679083555, + "loss": 1.5028, + "step": 3480 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 0.0001960973549162093, + "loss": 1.5116, + "step": 3485 + }, + { + "epoch": 0.18, + "grad_norm": 0.494140625, + "learning_rate": 0.00019607233468016392, + "loss": 1.4824, + "step": 3490 + }, + { + "epoch": 0.18, + "grad_norm": 0.54296875, + "learning_rate": 0.00019604723610310194, + "loss": 1.5318, + "step": 3495 + }, + { + "epoch": 0.18, + "grad_norm": 0.5078125, + "learning_rate": 0.00019602205920548965, + "loss": 1.5108, + "step": 3500 + }, + { + "epoch": 0.18, + "grad_norm": 0.5, + "learning_rate": 0.0001959968040078572, + "loss": 1.5155, + "step": 3505 + }, + { + "epoch": 0.18, + "grad_norm": 0.51171875, + "learning_rate": 0.00019597147053079873, + "loss": 1.4939, + "step": 3510 + }, + { + "epoch": 0.18, + "grad_norm": 0.5390625, + "learning_rate": 0.00019594605879497202, + "loss": 1.4775, + "step": 3515 + }, + { + "epoch": 0.18, + "grad_norm": 0.51171875, + "learning_rate": 0.00019592056882109885, + "loss": 1.5044, + "step": 3520 + }, + { + "epoch": 0.18, + "grad_norm": 0.53125, + "learning_rate": 0.00019589500062996463, + "loss": 1.4672, + "step": 3525 + }, + { + "epoch": 0.18, + "grad_norm": 0.52734375, + "learning_rate": 0.00019586935424241873, + "loss": 1.4761, + "step": 3530 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00019584362967937406, + "loss": 1.4901, + "step": 3535 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00019581782696180748, + "loss": 1.4989, + "step": 3540 + }, + { + "epoch": 0.18, + "grad_norm": 0.5, + "learning_rate": 0.0001957919461107595, + "loss": 1.5152, + "step": 3545 + }, + { + "epoch": 0.18, + "grad_norm": 0.5546875, + "learning_rate": 0.00019576598714733431, + "loss": 1.5188, + "step": 3550 + }, + { + "epoch": 0.18, + "grad_norm": 0.515625, + "learning_rate": 0.00019573995009269988, + "loss": 1.4681, + "step": 3555 + }, + { + "epoch": 0.18, + "grad_norm": 0.49609375, + "learning_rate": 0.00019571383496808775, + "loss": 1.4673, + "step": 3560 + }, + { + "epoch": 0.18, + "grad_norm": 0.51171875, + "learning_rate": 0.00019568764179479323, + "loss": 1.4654, + "step": 3565 + }, + { + "epoch": 0.18, + "grad_norm": 0.5078125, + "learning_rate": 0.0001956613705941752, + "loss": 1.5073, + "step": 3570 + }, + { + "epoch": 0.18, + "grad_norm": 0.5078125, + "learning_rate": 0.00019563502138765618, + "loss": 1.4786, + "step": 3575 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00019560859419672237, + "loss": 1.5432, + "step": 3580 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.00019558208904292342, + "loss": 1.497, + "step": 3585 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.0001955555059478727, + "loss": 1.4615, + "step": 3590 + }, + { + "epoch": 0.19, + "grad_norm": 0.484375, + "learning_rate": 0.00019552884493324703, + "loss": 1.4729, + "step": 3595 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00019550210602078684, + "loss": 1.5153, + "step": 3600 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.000195475289232296, + "loss": 1.4976, + "step": 3605 + }, + { + "epoch": 0.19, + "grad_norm": 0.53125, + "learning_rate": 0.00019544839458964202, + "loss": 1.5, + "step": 3610 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.0001954214221147557, + "loss": 1.4885, + "step": 3615 + }, + { + "epoch": 0.19, + "grad_norm": 0.53515625, + "learning_rate": 0.00019539437182963153, + "loss": 1.4852, + "step": 3620 + }, + { + "epoch": 0.19, + "grad_norm": 0.486328125, + "learning_rate": 0.00019536724375632727, + "loss": 1.4751, + "step": 3625 + }, + { + "epoch": 0.19, + "grad_norm": 0.52734375, + "learning_rate": 0.00019534003791696417, + "loss": 1.4512, + "step": 3630 + }, + { + "epoch": 0.19, + "grad_norm": 0.515625, + "learning_rate": 0.00019531275433372694, + "loss": 1.4634, + "step": 3635 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00019528539302886362, + "loss": 1.4936, + "step": 3640 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00019525795402468567, + "loss": 1.4721, + "step": 3645 + }, + { + "epoch": 0.19, + "grad_norm": 0.5078125, + "learning_rate": 0.00019523043734356787, + "loss": 1.5107, + "step": 3650 + }, + { + "epoch": 0.19, + "grad_norm": 0.515625, + "learning_rate": 0.00019520284300794837, + "loss": 1.5082, + "step": 3655 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00019517517104032864, + "loss": 1.4966, + "step": 3660 + }, + { + "epoch": 0.19, + "grad_norm": 0.50390625, + "learning_rate": 0.00019514742146327344, + "loss": 1.4804, + "step": 3665 + }, + { + "epoch": 0.19, + "grad_norm": 0.515625, + "learning_rate": 0.00019511959429941087, + "loss": 1.5073, + "step": 3670 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.0001950916895714322, + "loss": 1.5222, + "step": 3675 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.000195063707302092, + "loss": 1.4928, + "step": 3680 + }, + { + "epoch": 0.19, + "grad_norm": 0.5078125, + "learning_rate": 0.0001950356475142081, + "loss": 1.4753, + "step": 3685 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.00019500751023066154, + "loss": 1.4971, + "step": 3690 + }, + { + "epoch": 0.19, + "grad_norm": 0.490234375, + "learning_rate": 0.00019497929547439643, + "loss": 1.4678, + "step": 3695 + }, + { + "epoch": 0.19, + "grad_norm": 0.5078125, + "learning_rate": 0.0001949510032684202, + "loss": 1.4907, + "step": 3700 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00019492263363580343, + "loss": 1.5243, + "step": 3705 + }, + { + "epoch": 0.19, + "grad_norm": 0.53515625, + "learning_rate": 0.00019489418659967975, + "loss": 1.485, + "step": 3710 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.00019486566218324597, + "loss": 1.4388, + "step": 3715 + }, + { + "epoch": 0.19, + "grad_norm": 0.48828125, + "learning_rate": 0.00019483706040976194, + "loss": 1.4807, + "step": 3720 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0001948083813025506, + "loss": 1.4988, + "step": 3725 + }, + { + "epoch": 0.19, + "grad_norm": 0.49609375, + "learning_rate": 0.0001947796248849981, + "loss": 1.4606, + "step": 3730 + }, + { + "epoch": 0.19, + "grad_norm": 0.53125, + "learning_rate": 0.0001947507911805534, + "loss": 1.4861, + "step": 3735 + }, + { + "epoch": 0.19, + "grad_norm": 0.498046875, + "learning_rate": 0.00019472188021272868, + "loss": 1.4952, + "step": 3740 + }, + { + "epoch": 0.19, + "grad_norm": 0.5390625, + "learning_rate": 0.00019469289200509896, + "loss": 1.4767, + "step": 3745 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00019466382658130232, + "loss": 1.5096, + "step": 3750 + }, + { + "epoch": 0.19, + "grad_norm": 0.4921875, + "learning_rate": 0.00019463468396503989, + "loss": 1.4809, + "step": 3755 + }, + { + "epoch": 0.19, + "grad_norm": 0.5078125, + "learning_rate": 0.0001946054641800756, + "loss": 1.4876, + "step": 3760 + }, + { + "epoch": 0.19, + "grad_norm": 0.5, + "learning_rate": 0.00019457616725023635, + "loss": 1.4583, + "step": 3765 + }, + { + "epoch": 0.2, + "grad_norm": 0.53125, + "learning_rate": 0.000194546793199412, + "loss": 1.4932, + "step": 3770 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.00019451734205155527, + "loss": 1.4892, + "step": 3775 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.00019448781383068174, + "loss": 1.5205, + "step": 3780 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001944582085608698, + "loss": 1.5026, + "step": 3785 + }, + { + "epoch": 0.2, + "grad_norm": 0.4921875, + "learning_rate": 0.00019442852626626076, + "loss": 1.4821, + "step": 3790 + }, + { + "epoch": 0.2, + "grad_norm": 0.53125, + "learning_rate": 0.0001943987669710586, + "loss": 1.5057, + "step": 3795 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001943689306995303, + "loss": 1.466, + "step": 3800 + }, + { + "epoch": 0.2, + "grad_norm": 0.4921875, + "learning_rate": 0.00019433901747600537, + "loss": 1.4606, + "step": 3805 + }, + { + "epoch": 0.2, + "grad_norm": 1.453125, + "learning_rate": 0.00019430902732487626, + "loss": 1.4627, + "step": 3810 + }, + { + "epoch": 0.2, + "grad_norm": 2.890625, + "learning_rate": 0.00019427896027059802, + "loss": 1.4864, + "step": 3815 + }, + { + "epoch": 0.2, + "grad_norm": 0.53125, + "learning_rate": 0.00019424881633768853, + "loss": 1.497, + "step": 3820 + }, + { + "epoch": 0.2, + "grad_norm": 0.58203125, + "learning_rate": 0.00019421859555072822, + "loss": 1.4953, + "step": 3825 + }, + { + "epoch": 0.2, + "grad_norm": 0.5078125, + "learning_rate": 0.0001941882979343603, + "loss": 1.4895, + "step": 3830 + }, + { + "epoch": 0.2, + "grad_norm": 0.5, + "learning_rate": 0.00019415792351329058, + "loss": 1.4654, + "step": 3835 + }, + { + "epoch": 0.2, + "grad_norm": 0.53515625, + "learning_rate": 0.00019412747231228753, + "loss": 1.5066, + "step": 3840 + }, + { + "epoch": 0.2, + "grad_norm": 0.6015625, + "learning_rate": 0.00019409694435618222, + "loss": 1.5092, + "step": 3845 + }, + { + "epoch": 0.2, + "grad_norm": 0.5078125, + "learning_rate": 0.00019406633966986828, + "loss": 1.4891, + "step": 3850 + }, + { + "epoch": 0.2, + "grad_norm": 0.51953125, + "learning_rate": 0.0001940356582783019, + "loss": 1.4741, + "step": 3855 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001940049002065019, + "loss": 1.4906, + "step": 3860 + }, + { + "epoch": 0.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00019397406547954954, + "loss": 1.4775, + "step": 3865 + }, + { + "epoch": 0.2, + "grad_norm": 0.53125, + "learning_rate": 0.00019394315412258868, + "loss": 1.4981, + "step": 3870 + }, + { + "epoch": 0.2, + "grad_norm": 0.5078125, + "learning_rate": 0.00019391216616082552, + "loss": 1.4714, + "step": 3875 + }, + { + "epoch": 0.2, + "grad_norm": 0.54296875, + "learning_rate": 0.0001938811016195289, + "loss": 1.504, + "step": 3880 + }, + { + "epoch": 0.2, + "grad_norm": 0.5546875, + "learning_rate": 0.00019384996052402995, + "loss": 1.5086, + "step": 3885 + }, + { + "epoch": 0.2, + "grad_norm": 0.52734375, + "learning_rate": 0.00019381874289972238, + "loss": 1.4597, + "step": 3890 + }, + { + "epoch": 0.2, + "grad_norm": 0.52734375, + "learning_rate": 0.0001937874487720621, + "loss": 1.527, + "step": 3895 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.00019375607816656768, + "loss": 1.4881, + "step": 3900 + }, + { + "epoch": 0.2, + "grad_norm": 0.51171875, + "learning_rate": 0.0001937246311088198, + "loss": 1.4398, + "step": 3905 + }, + { + "epoch": 0.2, + "grad_norm": 0.5234375, + "learning_rate": 0.0001936931076244616, + "loss": 1.5191, + "step": 3910 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001936615077391985, + "loss": 1.5062, + "step": 3915 + }, + { + "epoch": 0.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00019362983147879826, + "loss": 1.5024, + "step": 3920 + }, + { + "epoch": 0.2, + "grad_norm": 0.5, + "learning_rate": 0.00019359807886909093, + "loss": 1.4988, + "step": 3925 + }, + { + "epoch": 0.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00019356624993596878, + "loss": 1.4855, + "step": 3930 + }, + { + "epoch": 0.2, + "grad_norm": 0.5, + "learning_rate": 0.00019353434470538629, + "loss": 1.5001, + "step": 3935 + }, + { + "epoch": 0.2, + "grad_norm": 0.5078125, + "learning_rate": 0.00019350236320336023, + "loss": 1.4953, + "step": 3940 + }, + { + "epoch": 0.2, + "grad_norm": 0.494140625, + "learning_rate": 0.0001934703054559695, + "loss": 1.4843, + "step": 3945 + }, + { + "epoch": 0.2, + "grad_norm": 0.57421875, + "learning_rate": 0.0001934381714893552, + "loss": 1.4679, + "step": 3950 + }, + { + "epoch": 0.2, + "grad_norm": 0.53125, + "learning_rate": 0.00019340596132972062, + "loss": 1.4638, + "step": 3955 + }, + { + "epoch": 0.2, + "grad_norm": 0.515625, + "learning_rate": 0.0001933736750033311, + "loss": 1.465, + "step": 3960 + }, + { + "epoch": 0.21, + "grad_norm": 0.51953125, + "learning_rate": 0.00019334131253651414, + "loss": 1.4728, + "step": 3965 + }, + { + "epoch": 0.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00019330887395565936, + "loss": 1.4941, + "step": 3970 + }, + { + "epoch": 0.21, + "grad_norm": 0.5234375, + "learning_rate": 0.00019327635928721834, + "loss": 1.4869, + "step": 3975 + }, + { + "epoch": 0.21, + "grad_norm": 0.5390625, + "learning_rate": 0.00019324376855770484, + "loss": 1.5167, + "step": 3980 + }, + { + "epoch": 0.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00019321110179369448, + "loss": 1.4614, + "step": 3985 + }, + { + "epoch": 0.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00019317835902182506, + "loss": 1.516, + "step": 3990 + }, + { + "epoch": 0.21, + "grad_norm": 0.52734375, + "learning_rate": 0.0001931455402687963, + "loss": 1.4819, + "step": 3995 + }, + { + "epoch": 0.21, + "grad_norm": 0.5, + "learning_rate": 0.00019311264556136975, + "loss": 1.4911, + "step": 4000 + }, + { + "epoch": 0.21, + "grad_norm": 0.52734375, + "learning_rate": 0.00019307967492636905, + "loss": 1.4753, + "step": 4005 + }, + { + "epoch": 0.21, + "grad_norm": 0.546875, + "learning_rate": 0.00019304662839067974, + "loss": 1.487, + "step": 4010 + }, + { + "epoch": 0.21, + "grad_norm": 0.50390625, + "learning_rate": 0.00019301350598124913, + "loss": 1.4955, + "step": 4015 + }, + { + "epoch": 0.21, + "grad_norm": 0.54296875, + "learning_rate": 0.00019298030772508658, + "loss": 1.4849, + "step": 4020 + }, + { + "epoch": 0.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00019294703364926315, + "loss": 1.5116, + "step": 4025 + }, + { + "epoch": 0.21, + "grad_norm": 0.5, + "learning_rate": 0.00019291368378091176, + "loss": 1.4563, + "step": 4030 + }, + { + "epoch": 0.21, + "grad_norm": 0.53125, + "learning_rate": 0.0001928802581472272, + "loss": 1.4931, + "step": 4035 + }, + { + "epoch": 0.21, + "grad_norm": 0.515625, + "learning_rate": 0.000192846756775466, + "loss": 1.4745, + "step": 4040 + }, + { + "epoch": 0.21, + "grad_norm": 0.51953125, + "learning_rate": 0.00019281317969294643, + "loss": 1.4956, + "step": 4045 + }, + { + "epoch": 0.21, + "grad_norm": 0.53515625, + "learning_rate": 0.00019277952692704848, + "loss": 1.4962, + "step": 4050 + }, + { + "epoch": 0.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00019274579850521393, + "loss": 1.4451, + "step": 4055 + }, + { + "epoch": 0.21, + "grad_norm": 0.5546875, + "learning_rate": 0.00019271199445494624, + "loss": 1.5035, + "step": 4060 + }, + { + "epoch": 0.21, + "grad_norm": 0.515625, + "learning_rate": 0.00019267811480381042, + "loss": 1.4918, + "step": 4065 + }, + { + "epoch": 0.21, + "grad_norm": 0.5078125, + "learning_rate": 0.0001926441595794333, + "loss": 1.4766, + "step": 4070 + }, + { + "epoch": 0.21, + "grad_norm": 0.5234375, + "learning_rate": 0.00019261012880950323, + "loss": 1.4601, + "step": 4075 + }, + { + "epoch": 0.21, + "grad_norm": 0.53515625, + "learning_rate": 0.00019257602252177017, + "loss": 1.4964, + "step": 4080 + }, + { + "epoch": 0.21, + "grad_norm": 0.51171875, + "learning_rate": 0.00019254184074404568, + "loss": 1.4682, + "step": 4085 + }, + { + "epoch": 0.21, + "grad_norm": 0.490234375, + "learning_rate": 0.0001925075835042029, + "loss": 1.4707, + "step": 4090 + }, + { + "epoch": 0.21, + "grad_norm": 0.50390625, + "learning_rate": 0.00019247325083017648, + "loss": 1.4259, + "step": 4095 + }, + { + "epoch": 0.21, + "grad_norm": 0.51953125, + "learning_rate": 0.00019243884274996255, + "loss": 1.4643, + "step": 4100 + }, + { + "epoch": 0.21, + "grad_norm": 0.515625, + "learning_rate": 0.00019240435929161878, + "loss": 1.4715, + "step": 4105 + }, + { + "epoch": 0.21, + "grad_norm": 0.5390625, + "learning_rate": 0.00019236980048326427, + "loss": 1.4774, + "step": 4110 + }, + { + "epoch": 0.21, + "grad_norm": 0.51171875, + "learning_rate": 0.0001923351663530796, + "loss": 1.4701, + "step": 4115 + }, + { + "epoch": 0.21, + "grad_norm": 0.5390625, + "learning_rate": 0.00019230045692930677, + "loss": 1.4838, + "step": 4120 + }, + { + "epoch": 0.21, + "grad_norm": 0.54296875, + "learning_rate": 0.00019226567224024912, + "loss": 1.4794, + "step": 4125 + }, + { + "epoch": 0.21, + "grad_norm": 0.53125, + "learning_rate": 0.0001922308123142714, + "loss": 1.4795, + "step": 4130 + }, + { + "epoch": 0.21, + "grad_norm": 0.52734375, + "learning_rate": 0.00019219587717979973, + "loss": 1.4763, + "step": 4135 + }, + { + "epoch": 0.21, + "grad_norm": 0.515625, + "learning_rate": 0.00019216086686532153, + "loss": 1.4965, + "step": 4140 + }, + { + "epoch": 0.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00019212578139938554, + "loss": 1.5114, + "step": 4145 + }, + { + "epoch": 0.21, + "grad_norm": 0.53125, + "learning_rate": 0.00019209062081060178, + "loss": 1.4927, + "step": 4150 + }, + { + "epoch": 0.21, + "grad_norm": 0.51953125, + "learning_rate": 0.00019205538512764156, + "loss": 1.4886, + "step": 4155 + }, + { + "epoch": 0.22, + "grad_norm": 0.51953125, + "learning_rate": 0.0001920200743792373, + "loss": 1.4667, + "step": 4160 + }, + { + "epoch": 0.22, + "grad_norm": 0.51953125, + "learning_rate": 0.00019198468859418278, + "loss": 1.4878, + "step": 4165 + }, + { + "epoch": 0.22, + "grad_norm": 0.50390625, + "learning_rate": 0.00019194922780133293, + "loss": 1.5009, + "step": 4170 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00019191369202960378, + "loss": 1.4706, + "step": 4175 + }, + { + "epoch": 0.22, + "grad_norm": 0.51953125, + "learning_rate": 0.00019187808130797254, + "loss": 1.4836, + "step": 4180 + }, + { + "epoch": 0.22, + "grad_norm": 0.5234375, + "learning_rate": 0.00019184239566547755, + "loss": 1.4744, + "step": 4185 + }, + { + "epoch": 0.22, + "grad_norm": 0.52734375, + "learning_rate": 0.00019180663513121825, + "loss": 1.4759, + "step": 4190 + }, + { + "epoch": 0.22, + "grad_norm": 0.4921875, + "learning_rate": 0.0001917707997343551, + "loss": 1.4465, + "step": 4195 + }, + { + "epoch": 0.22, + "grad_norm": 0.52734375, + "learning_rate": 0.00019173488950410968, + "loss": 1.4752, + "step": 4200 + }, + { + "epoch": 0.22, + "grad_norm": 0.5, + "learning_rate": 0.00019169890446976454, + "loss": 1.4593, + "step": 4205 + }, + { + "epoch": 0.22, + "grad_norm": 0.5078125, + "learning_rate": 0.00019166284466066319, + "loss": 1.4992, + "step": 4210 + }, + { + "epoch": 0.22, + "grad_norm": 0.53125, + "learning_rate": 0.00019162671010621024, + "loss": 1.4524, + "step": 4215 + }, + { + "epoch": 0.22, + "grad_norm": 0.484375, + "learning_rate": 0.0001915905008358711, + "loss": 1.4715, + "step": 4220 + }, + { + "epoch": 0.22, + "grad_norm": 0.54296875, + "learning_rate": 0.0001915542168791722, + "loss": 1.4902, + "step": 4225 + }, + { + "epoch": 0.22, + "grad_norm": 0.5, + "learning_rate": 0.0001915178582657009, + "loss": 1.4699, + "step": 4230 + }, + { + "epoch": 0.22, + "grad_norm": 0.5234375, + "learning_rate": 0.00019148142502510533, + "loss": 1.4632, + "step": 4235 + }, + { + "epoch": 0.22, + "grad_norm": 0.515625, + "learning_rate": 0.00019144491718709456, + "loss": 1.4785, + "step": 4240 + }, + { + "epoch": 0.22, + "grad_norm": 0.50390625, + "learning_rate": 0.00019140833478143847, + "loss": 1.4622, + "step": 4245 + }, + { + "epoch": 0.22, + "grad_norm": 0.5, + "learning_rate": 0.0001913716778379677, + "loss": 1.4746, + "step": 4250 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00019133494638657374, + "loss": 1.4865, + "step": 4255 + }, + { + "epoch": 0.22, + "grad_norm": 0.53125, + "learning_rate": 0.0001912981404572088, + "loss": 1.4808, + "step": 4260 + }, + { + "epoch": 0.22, + "grad_norm": 0.515625, + "learning_rate": 0.00019126126007988585, + "loss": 1.5115, + "step": 4265 + }, + { + "epoch": 0.22, + "grad_norm": 0.50390625, + "learning_rate": 0.0001912243052846785, + "loss": 1.4847, + "step": 4270 + }, + { + "epoch": 0.22, + "grad_norm": 0.51953125, + "learning_rate": 0.0001911872761017211, + "loss": 1.4664, + "step": 4275 + }, + { + "epoch": 0.22, + "grad_norm": 0.55078125, + "learning_rate": 0.00019115017256120866, + "loss": 1.4713, + "step": 4280 + }, + { + "epoch": 0.22, + "grad_norm": 0.5078125, + "learning_rate": 0.0001911129946933968, + "loss": 1.4886, + "step": 4285 + }, + { + "epoch": 0.22, + "grad_norm": 0.50390625, + "learning_rate": 0.00019107574252860178, + "loss": 1.4937, + "step": 4290 + }, + { + "epoch": 0.22, + "grad_norm": 0.515625, + "learning_rate": 0.00019103841609720043, + "loss": 1.433, + "step": 4295 + }, + { + "epoch": 0.22, + "grad_norm": 0.494140625, + "learning_rate": 0.0001910010154296301, + "loss": 1.4664, + "step": 4300 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.0001909635405563887, + "loss": 1.4946, + "step": 4305 + }, + { + "epoch": 0.22, + "grad_norm": 0.4921875, + "learning_rate": 0.0001909259915080347, + "loss": 1.4688, + "step": 4310 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.000190888368315187, + "loss": 1.4631, + "step": 4315 + }, + { + "epoch": 0.22, + "grad_norm": 0.52734375, + "learning_rate": 0.000190850671008525, + "loss": 1.458, + "step": 4320 + }, + { + "epoch": 0.22, + "grad_norm": 0.77734375, + "learning_rate": 0.00019081289961878848, + "loss": 1.4863, + "step": 4325 + }, + { + "epoch": 0.22, + "grad_norm": 0.490234375, + "learning_rate": 0.00019077505417677764, + "loss": 1.4789, + "step": 4330 + }, + { + "epoch": 0.22, + "grad_norm": 0.51953125, + "learning_rate": 0.00019073713471335312, + "loss": 1.4919, + "step": 4335 + }, + { + "epoch": 0.22, + "grad_norm": 0.515625, + "learning_rate": 0.00019069914125943586, + "loss": 1.4988, + "step": 4340 + }, + { + "epoch": 0.22, + "grad_norm": 0.515625, + "learning_rate": 0.0001906610738460072, + "loss": 1.5027, + "step": 4345 + }, + { + "epoch": 0.23, + "grad_norm": 1.3203125, + "learning_rate": 0.00019062293250410873, + "loss": 1.4981, + "step": 4350 + }, + { + "epoch": 0.23, + "grad_norm": 0.5546875, + "learning_rate": 0.00019058471726484232, + "loss": 1.4197, + "step": 4355 + }, + { + "epoch": 0.23, + "grad_norm": 0.5078125, + "learning_rate": 0.00019054642815937012, + "loss": 1.4616, + "step": 4360 + }, + { + "epoch": 0.23, + "grad_norm": 0.5078125, + "learning_rate": 0.00019050806521891456, + "loss": 1.4806, + "step": 4365 + }, + { + "epoch": 0.23, + "grad_norm": 0.490234375, + "learning_rate": 0.0001904696284747582, + "loss": 1.4921, + "step": 4370 + }, + { + "epoch": 0.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00019043111795824383, + "loss": 1.4368, + "step": 4375 + }, + { + "epoch": 0.23, + "grad_norm": 0.52734375, + "learning_rate": 0.00019039253370077436, + "loss": 1.4956, + "step": 4380 + }, + { + "epoch": 0.23, + "grad_norm": 0.5390625, + "learning_rate": 0.0001903538757338129, + "loss": 1.4884, + "step": 4385 + }, + { + "epoch": 0.23, + "grad_norm": 0.54296875, + "learning_rate": 0.00019031514408888257, + "loss": 1.5047, + "step": 4390 + }, + { + "epoch": 0.23, + "grad_norm": 0.52734375, + "learning_rate": 0.00019027633879756663, + "loss": 1.441, + "step": 4395 + }, + { + "epoch": 0.23, + "grad_norm": 0.53515625, + "learning_rate": 0.0001902374598915084, + "loss": 1.4652, + "step": 4400 + }, + { + "epoch": 0.23, + "grad_norm": 0.53515625, + "learning_rate": 0.0001901985074024112, + "loss": 1.4759, + "step": 4405 + }, + { + "epoch": 0.23, + "grad_norm": 0.52734375, + "learning_rate": 0.00019015948136203836, + "loss": 1.5075, + "step": 4410 + }, + { + "epoch": 0.23, + "grad_norm": 0.51171875, + "learning_rate": 0.00019012038180221322, + "loss": 1.5099, + "step": 4415 + }, + { + "epoch": 0.23, + "grad_norm": 0.515625, + "learning_rate": 0.00019008120875481897, + "loss": 1.4607, + "step": 4420 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.00019004196225179886, + "loss": 1.4744, + "step": 4425 + }, + { + "epoch": 0.23, + "grad_norm": 0.5390625, + "learning_rate": 0.00019000264232515594, + "loss": 1.488, + "step": 4430 + }, + { + "epoch": 0.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00018996324900695318, + "loss": 1.462, + "step": 4435 + }, + { + "epoch": 0.23, + "grad_norm": 0.49609375, + "learning_rate": 0.0001899237823293134, + "loss": 1.4705, + "step": 4440 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.00018988424232441918, + "loss": 1.4757, + "step": 4445 + }, + { + "epoch": 0.23, + "grad_norm": 0.52734375, + "learning_rate": 0.000189844629024513, + "loss": 1.4953, + "step": 4450 + }, + { + "epoch": 0.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00018980494246189698, + "loss": 1.5236, + "step": 4455 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.00018976518266893304, + "loss": 1.4874, + "step": 4460 + }, + { + "epoch": 0.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00018972534967804286, + "loss": 1.4537, + "step": 4465 + }, + { + "epoch": 0.23, + "grad_norm": 0.5078125, + "learning_rate": 0.00018968544352170776, + "loss": 1.4815, + "step": 4470 + }, + { + "epoch": 0.23, + "grad_norm": 0.5078125, + "learning_rate": 0.00018964546423246871, + "loss": 1.4856, + "step": 4475 + }, + { + "epoch": 0.23, + "grad_norm": 0.5234375, + "learning_rate": 0.00018960541184292638, + "loss": 1.4881, + "step": 4480 + }, + { + "epoch": 0.23, + "grad_norm": 0.5234375, + "learning_rate": 0.00018956528638574096, + "loss": 1.4695, + "step": 4485 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.00018952508789363227, + "loss": 1.4625, + "step": 4490 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.0001894848163993797, + "loss": 1.475, + "step": 4495 + }, + { + "epoch": 0.23, + "grad_norm": 0.5390625, + "learning_rate": 0.00018944447193582217, + "loss": 1.4838, + "step": 4500 + }, + { + "epoch": 0.23, + "grad_norm": 0.55078125, + "learning_rate": 0.00018940405453585798, + "loss": 1.4659, + "step": 4505 + }, + { + "epoch": 0.23, + "grad_norm": 0.53515625, + "learning_rate": 0.00018936356423244512, + "loss": 1.5054, + "step": 4510 + }, + { + "epoch": 0.23, + "grad_norm": 0.515625, + "learning_rate": 0.0001893230010586009, + "loss": 1.4518, + "step": 4515 + }, + { + "epoch": 0.23, + "grad_norm": 0.5, + "learning_rate": 0.000189282365047402, + "loss": 1.4761, + "step": 4520 + }, + { + "epoch": 0.23, + "grad_norm": 0.5546875, + "learning_rate": 0.00018924165623198462, + "loss": 1.4798, + "step": 4525 + }, + { + "epoch": 0.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00018920087464554427, + "loss": 1.4675, + "step": 4530 + }, + { + "epoch": 0.23, + "grad_norm": 0.53125, + "learning_rate": 0.00018916002032133574, + "loss": 1.4589, + "step": 4535 + }, + { + "epoch": 0.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00018911909329267325, + "loss": 1.4513, + "step": 4540 + }, + { + "epoch": 0.24, + "grad_norm": 0.51953125, + "learning_rate": 0.00018907809359293025, + "loss": 1.4666, + "step": 4545 + }, + { + "epoch": 0.24, + "grad_norm": 0.5, + "learning_rate": 0.0001890370212555394, + "loss": 1.4913, + "step": 4550 + }, + { + "epoch": 0.24, + "grad_norm": 0.53515625, + "learning_rate": 0.00018899587631399266, + "loss": 1.4915, + "step": 4555 + }, + { + "epoch": 0.24, + "grad_norm": 0.51953125, + "learning_rate": 0.0001889546588018412, + "loss": 1.476, + "step": 4560 + }, + { + "epoch": 0.24, + "grad_norm": 0.51953125, + "learning_rate": 0.0001889133687526953, + "loss": 1.4865, + "step": 4565 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.00018887200620022442, + "loss": 1.4912, + "step": 4570 + }, + { + "epoch": 0.24, + "grad_norm": 0.5234375, + "learning_rate": 0.0001888305711781572, + "loss": 1.493, + "step": 4575 + }, + { + "epoch": 0.24, + "grad_norm": 0.5234375, + "learning_rate": 0.0001887890637202813, + "loss": 1.5079, + "step": 4580 + }, + { + "epoch": 0.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00018874748386044345, + "loss": 1.448, + "step": 4585 + }, + { + "epoch": 0.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00018870583163254948, + "loss": 1.496, + "step": 4590 + }, + { + "epoch": 0.24, + "grad_norm": 0.5078125, + "learning_rate": 0.00018866410707056417, + "loss": 1.5013, + "step": 4595 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.0001886223102085113, + "loss": 1.4647, + "step": 4600 + }, + { + "epoch": 0.24, + "grad_norm": 0.54296875, + "learning_rate": 0.00018858044108047365, + "loss": 1.4696, + "step": 4605 + }, + { + "epoch": 0.24, + "grad_norm": 0.515625, + "learning_rate": 0.00018853849972059282, + "loss": 1.4539, + "step": 4610 + }, + { + "epoch": 0.24, + "grad_norm": 0.494140625, + "learning_rate": 0.00018849648616306943, + "loss": 1.4826, + "step": 4615 + }, + { + "epoch": 0.24, + "grad_norm": 0.6328125, + "learning_rate": 0.00018845440044216294, + "loss": 1.4955, + "step": 4620 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.0001884122425921916, + "loss": 1.4647, + "step": 4625 + }, + { + "epoch": 0.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00018837001264753256, + "loss": 1.4642, + "step": 4630 + }, + { + "epoch": 0.24, + "grad_norm": 0.5703125, + "learning_rate": 0.00018832771064262167, + "loss": 1.5195, + "step": 4635 + }, + { + "epoch": 0.24, + "grad_norm": 0.46875, + "learning_rate": 0.0001882853366119536, + "loss": 1.4337, + "step": 4640 + }, + { + "epoch": 0.24, + "grad_norm": 0.5078125, + "learning_rate": 0.00018824289059008175, + "loss": 1.4322, + "step": 4645 + }, + { + "epoch": 0.24, + "grad_norm": 0.5, + "learning_rate": 0.0001882003726116182, + "loss": 1.4621, + "step": 4650 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.00018815778271123374, + "loss": 1.4737, + "step": 4655 + }, + { + "epoch": 0.24, + "grad_norm": 0.53125, + "learning_rate": 0.00018811512092365776, + "loss": 1.481, + "step": 4660 + }, + { + "epoch": 0.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00018807238728367828, + "loss": 1.4978, + "step": 4665 + }, + { + "epoch": 0.24, + "grad_norm": 0.515625, + "learning_rate": 0.000188029581826142, + "loss": 1.491, + "step": 4670 + }, + { + "epoch": 0.24, + "grad_norm": 0.53125, + "learning_rate": 0.00018798670458595402, + "loss": 1.4686, + "step": 4675 + }, + { + "epoch": 0.24, + "grad_norm": 0.482421875, + "learning_rate": 0.0001879437555980781, + "loss": 1.4545, + "step": 4680 + }, + { + "epoch": 0.24, + "grad_norm": 0.53125, + "learning_rate": 0.0001879007348975365, + "loss": 1.4859, + "step": 4685 + }, + { + "epoch": 0.24, + "grad_norm": 0.50390625, + "learning_rate": 0.0001878576425194099, + "loss": 1.4621, + "step": 4690 + }, + { + "epoch": 0.24, + "grad_norm": 0.515625, + "learning_rate": 0.00018781447849883744, + "loss": 1.502, + "step": 4695 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.00018777124287101672, + "loss": 1.4946, + "step": 4700 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.0001877279356712037, + "loss": 1.4527, + "step": 4705 + }, + { + "epoch": 0.24, + "grad_norm": 0.51171875, + "learning_rate": 0.00018768455693471273, + "loss": 1.4475, + "step": 4710 + }, + { + "epoch": 0.24, + "grad_norm": 0.5546875, + "learning_rate": 0.0001876411066969164, + "loss": 1.4761, + "step": 4715 + }, + { + "epoch": 0.24, + "grad_norm": 0.51953125, + "learning_rate": 0.00018759758499324578, + "loss": 1.4905, + "step": 4720 + }, + { + "epoch": 0.24, + "grad_norm": 0.515625, + "learning_rate": 0.00018755399185919002, + "loss": 1.4484, + "step": 4725 + }, + { + "epoch": 0.24, + "grad_norm": 0.5, + "learning_rate": 0.0001875103273302967, + "loss": 1.4452, + "step": 4730 + }, + { + "epoch": 0.24, + "grad_norm": 0.51953125, + "learning_rate": 0.00018746659144217148, + "loss": 1.4934, + "step": 4735 + }, + { + "epoch": 0.25, + "grad_norm": 0.5234375, + "learning_rate": 0.00018742278423047824, + "loss": 1.4497, + "step": 4740 + }, + { + "epoch": 0.25, + "grad_norm": 0.5, + "learning_rate": 0.00018737890573093907, + "loss": 1.4335, + "step": 4745 + }, + { + "epoch": 0.25, + "grad_norm": 0.5, + "learning_rate": 0.00018733495597933412, + "loss": 1.4861, + "step": 4750 + }, + { + "epoch": 0.25, + "grad_norm": 0.515625, + "learning_rate": 0.00018729093501150174, + "loss": 1.4859, + "step": 4755 + }, + { + "epoch": 0.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00018724684286333822, + "loss": 1.4934, + "step": 4760 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.00018720267957079805, + "loss": 1.4894, + "step": 4765 + }, + { + "epoch": 0.25, + "grad_norm": 0.515625, + "learning_rate": 0.0001871584451698936, + "loss": 1.4395, + "step": 4770 + }, + { + "epoch": 0.25, + "grad_norm": 0.51953125, + "learning_rate": 0.00018711413969669526, + "loss": 1.4548, + "step": 4775 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.00018706976318733141, + "loss": 1.486, + "step": 4780 + }, + { + "epoch": 0.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00018702531567798837, + "loss": 1.4525, + "step": 4785 + }, + { + "epoch": 0.25, + "grad_norm": 0.515625, + "learning_rate": 0.00018698079720491024, + "loss": 1.5032, + "step": 4790 + }, + { + "epoch": 0.25, + "grad_norm": 0.51171875, + "learning_rate": 0.00018693620780439916, + "loss": 1.4398, + "step": 4795 + }, + { + "epoch": 0.25, + "grad_norm": 0.51953125, + "learning_rate": 0.00018689154751281494, + "loss": 1.4601, + "step": 4800 + }, + { + "epoch": 0.25, + "grad_norm": 0.5, + "learning_rate": 0.00018684681636657529, + "loss": 1.4332, + "step": 4805 + }, + { + "epoch": 0.25, + "grad_norm": 0.53125, + "learning_rate": 0.0001868020144021557, + "loss": 1.4677, + "step": 4810 + }, + { + "epoch": 0.25, + "grad_norm": 0.5390625, + "learning_rate": 0.00018675714165608935, + "loss": 1.445, + "step": 4815 + }, + { + "epoch": 0.25, + "grad_norm": 0.50390625, + "learning_rate": 0.00018671219816496722, + "loss": 1.4806, + "step": 4820 + }, + { + "epoch": 0.25, + "grad_norm": 0.51171875, + "learning_rate": 0.00018666718396543792, + "loss": 1.4517, + "step": 4825 + }, + { + "epoch": 0.25, + "grad_norm": 0.53125, + "learning_rate": 0.00018662209909420772, + "loss": 1.4778, + "step": 4830 + }, + { + "epoch": 0.25, + "grad_norm": 0.5546875, + "learning_rate": 0.0001865769435880405, + "loss": 1.4846, + "step": 4835 + }, + { + "epoch": 0.25, + "grad_norm": 0.51953125, + "learning_rate": 0.00018653171748375785, + "loss": 1.4785, + "step": 4840 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.00018648642081823877, + "loss": 1.4718, + "step": 4845 + }, + { + "epoch": 0.25, + "grad_norm": 0.515625, + "learning_rate": 0.0001864410536284199, + "loss": 1.4274, + "step": 4850 + }, + { + "epoch": 0.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00018639561595129537, + "loss": 1.5004, + "step": 4855 + }, + { + "epoch": 0.25, + "grad_norm": 0.53125, + "learning_rate": 0.0001863501078239168, + "loss": 1.4704, + "step": 4860 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.0001863045292833932, + "loss": 1.4659, + "step": 4865 + }, + { + "epoch": 0.25, + "grad_norm": 0.55078125, + "learning_rate": 0.00018625888036689103, + "loss": 1.4741, + "step": 4870 + }, + { + "epoch": 0.25, + "grad_norm": 0.53515625, + "learning_rate": 0.0001862131611116342, + "loss": 1.5111, + "step": 4875 + }, + { + "epoch": 0.25, + "grad_norm": 0.5, + "learning_rate": 0.0001861673715549039, + "loss": 1.4657, + "step": 4880 + }, + { + "epoch": 0.25, + "grad_norm": 0.53125, + "learning_rate": 0.0001861215117340386, + "loss": 1.4569, + "step": 4885 + }, + { + "epoch": 0.25, + "grad_norm": 0.51171875, + "learning_rate": 0.00018607558168643422, + "loss": 1.4685, + "step": 4890 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.0001860295814495438, + "loss": 1.4891, + "step": 4895 + }, + { + "epoch": 0.25, + "grad_norm": 0.55078125, + "learning_rate": 0.00018598351106087772, + "loss": 1.4572, + "step": 4900 + }, + { + "epoch": 0.25, + "grad_norm": 0.51953125, + "learning_rate": 0.0001859373705580035, + "loss": 1.4894, + "step": 4905 + }, + { + "epoch": 0.25, + "grad_norm": 0.515625, + "learning_rate": 0.00018589115997854586, + "loss": 1.4507, + "step": 4910 + }, + { + "epoch": 0.25, + "grad_norm": 0.53515625, + "learning_rate": 0.00018584487936018661, + "loss": 1.4602, + "step": 4915 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 0.00018579852874066476, + "loss": 1.4457, + "step": 4920 + }, + { + "epoch": 0.25, + "grad_norm": 0.50390625, + "learning_rate": 0.0001857521081577764, + "loss": 1.462, + "step": 4925 + }, + { + "epoch": 0.26, + "grad_norm": 0.51953125, + "learning_rate": 0.0001857056176493745, + "loss": 1.4695, + "step": 4930 + }, + { + "epoch": 0.26, + "grad_norm": 0.5234375, + "learning_rate": 0.00018565905725336933, + "loss": 1.4426, + "step": 4935 + }, + { + "epoch": 0.26, + "grad_norm": 0.51953125, + "learning_rate": 0.00018561242700772788, + "loss": 1.5005, + "step": 4940 + }, + { + "epoch": 0.26, + "grad_norm": 0.53125, + "learning_rate": 0.00018556572695047427, + "loss": 1.4763, + "step": 4945 + }, + { + "epoch": 0.26, + "grad_norm": 0.498046875, + "learning_rate": 0.0001855189571196895, + "loss": 1.4547, + "step": 4950 + }, + { + "epoch": 0.26, + "grad_norm": 0.5390625, + "learning_rate": 0.00018547211755351147, + "loss": 1.4776, + "step": 4955 + }, + { + "epoch": 0.26, + "grad_norm": 0.515625, + "learning_rate": 0.0001854252082901349, + "loss": 1.4449, + "step": 4960 + }, + { + "epoch": 0.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00018537822936781132, + "loss": 1.4937, + "step": 4965 + }, + { + "epoch": 0.26, + "grad_norm": 0.52734375, + "learning_rate": 0.00018533118082484927, + "loss": 1.4612, + "step": 4970 + }, + { + "epoch": 0.26, + "grad_norm": 0.515625, + "learning_rate": 0.0001852840626996138, + "loss": 1.4653, + "step": 4975 + }, + { + "epoch": 0.26, + "grad_norm": 0.5, + "learning_rate": 0.00018523687503052685, + "loss": 1.4569, + "step": 4980 + }, + { + "epoch": 0.26, + "grad_norm": 0.5, + "learning_rate": 0.00018518961785606703, + "loss": 1.4476, + "step": 4985 + }, + { + "epoch": 0.26, + "grad_norm": 0.5390625, + "learning_rate": 0.00018514229121476962, + "loss": 1.4912, + "step": 4990 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00018509489514522657, + "loss": 1.4891, + "step": 4995 + }, + { + "epoch": 0.26, + "grad_norm": 0.52734375, + "learning_rate": 0.00018504742968608639, + "loss": 1.4496, + "step": 5000 + }, + { + "epoch": 0.26, + "grad_norm": 0.515625, + "learning_rate": 0.00018499989487605423, + "loss": 1.4243, + "step": 5005 + }, + { + "epoch": 0.26, + "grad_norm": 0.50390625, + "learning_rate": 0.00018495229075389183, + "loss": 1.4679, + "step": 5010 + }, + { + "epoch": 0.26, + "grad_norm": 0.52734375, + "learning_rate": 0.00018490461735841732, + "loss": 1.4744, + "step": 5015 + }, + { + "epoch": 0.26, + "grad_norm": 0.51953125, + "learning_rate": 0.0001848568747285054, + "loss": 1.483, + "step": 5020 + }, + { + "epoch": 0.26, + "grad_norm": 0.53125, + "learning_rate": 0.00018480906290308722, + "loss": 1.4682, + "step": 5025 + }, + { + "epoch": 0.26, + "grad_norm": 0.5234375, + "learning_rate": 0.00018476118192115037, + "loss": 1.4677, + "step": 5030 + }, + { + "epoch": 0.26, + "grad_norm": 0.5, + "learning_rate": 0.00018471323182173884, + "loss": 1.4572, + "step": 5035 + }, + { + "epoch": 0.26, + "grad_norm": 0.515625, + "learning_rate": 0.00018466521264395288, + "loss": 1.4501, + "step": 5040 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.0001846171244269492, + "loss": 1.486, + "step": 5045 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00018456896720994072, + "loss": 1.484, + "step": 5050 + }, + { + "epoch": 0.26, + "grad_norm": 0.54296875, + "learning_rate": 0.0001845207410321967, + "loss": 1.4618, + "step": 5055 + }, + { + "epoch": 0.26, + "grad_norm": 0.5234375, + "learning_rate": 0.00018447244593304253, + "loss": 1.4758, + "step": 5060 + }, + { + "epoch": 0.26, + "grad_norm": 0.515625, + "learning_rate": 0.0001844240819518599, + "loss": 1.4699, + "step": 5065 + }, + { + "epoch": 0.26, + "grad_norm": 0.5234375, + "learning_rate": 0.00018437564912808665, + "loss": 1.4397, + "step": 5070 + }, + { + "epoch": 0.26, + "grad_norm": 0.50390625, + "learning_rate": 0.0001843271475012167, + "loss": 1.4299, + "step": 5075 + }, + { + "epoch": 0.26, + "grad_norm": 0.5546875, + "learning_rate": 0.00018427857711080013, + "loss": 1.4455, + "step": 5080 + }, + { + "epoch": 0.26, + "grad_norm": 0.5078125, + "learning_rate": 0.00018422993799644302, + "loss": 1.5103, + "step": 5085 + }, + { + "epoch": 0.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00018418123019780765, + "loss": 1.5017, + "step": 5090 + }, + { + "epoch": 0.26, + "grad_norm": 0.5390625, + "learning_rate": 0.0001841324537546121, + "loss": 1.481, + "step": 5095 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00018408360870663063, + "loss": 1.4721, + "step": 5100 + }, + { + "epoch": 0.26, + "grad_norm": 0.4921875, + "learning_rate": 0.0001840346950936932, + "loss": 1.4436, + "step": 5105 + }, + { + "epoch": 0.26, + "grad_norm": 0.498046875, + "learning_rate": 0.00018398571295568595, + "loss": 1.4581, + "step": 5110 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00018393666233255073, + "loss": 1.4752, + "step": 5115 + }, + { + "epoch": 0.26, + "grad_norm": 0.51171875, + "learning_rate": 0.00018388754326428524, + "loss": 1.5162, + "step": 5120 + }, + { + "epoch": 0.27, + "grad_norm": 0.5390625, + "learning_rate": 0.00018383835579094304, + "loss": 1.4762, + "step": 5125 + }, + { + "epoch": 0.27, + "grad_norm": 0.53125, + "learning_rate": 0.0001837890999526335, + "loss": 1.431, + "step": 5130 + }, + { + "epoch": 0.27, + "grad_norm": 0.5390625, + "learning_rate": 0.0001837397757895216, + "loss": 1.4589, + "step": 5135 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.00018369038334182825, + "loss": 1.4569, + "step": 5140 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00018364092264982985, + "loss": 1.4379, + "step": 5145 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00018359139375385852, + "loss": 1.4665, + "step": 5150 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.000183541796694302, + "loss": 1.4636, + "step": 5155 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 0.00018349213151160366, + "loss": 1.4701, + "step": 5160 + }, + { + "epoch": 0.27, + "grad_norm": 0.5234375, + "learning_rate": 0.00018344239824626227, + "loss": 1.4937, + "step": 5165 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.0001833925969388323, + "loss": 1.5028, + "step": 5170 + }, + { + "epoch": 0.27, + "grad_norm": 0.5078125, + "learning_rate": 0.00018334272762992354, + "loss": 1.4863, + "step": 5175 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 0.0001832927903602014, + "loss": 1.4618, + "step": 5180 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.0001832427851703866, + "loss": 1.5137, + "step": 5185 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.00018319271210125523, + "loss": 1.4636, + "step": 5190 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 0.00018314257119363876, + "loss": 1.4666, + "step": 5195 + }, + { + "epoch": 0.27, + "grad_norm": 0.52734375, + "learning_rate": 0.00018309236248842403, + "loss": 1.4487, + "step": 5200 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00018304208602655306, + "loss": 1.4146, + "step": 5205 + }, + { + "epoch": 0.27, + "grad_norm": 0.5078125, + "learning_rate": 0.00018299174184902323, + "loss": 1.442, + "step": 5210 + }, + { + "epoch": 0.27, + "grad_norm": 0.498046875, + "learning_rate": 0.0001829413299968871, + "loss": 1.4594, + "step": 5215 + }, + { + "epoch": 0.27, + "grad_norm": 0.498046875, + "learning_rate": 0.00018289085051125233, + "loss": 1.4576, + "step": 5220 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 0.00018284030343328181, + "loss": 1.4952, + "step": 5225 + }, + { + "epoch": 0.27, + "grad_norm": 0.5546875, + "learning_rate": 0.00018278968880419363, + "loss": 1.4293, + "step": 5230 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 0.00018273900666526078, + "loss": 1.4402, + "step": 5235 + }, + { + "epoch": 0.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00018268825705781145, + "loss": 1.4513, + "step": 5240 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00018263744002322874, + "loss": 1.4658, + "step": 5245 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.00018258655560295087, + "loss": 1.4694, + "step": 5250 + }, + { + "epoch": 0.27, + "grad_norm": 0.51953125, + "learning_rate": 0.00018253560383847082, + "loss": 1.4721, + "step": 5255 + }, + { + "epoch": 0.27, + "grad_norm": 0.53125, + "learning_rate": 0.00018248458477133662, + "loss": 1.4458, + "step": 5260 + }, + { + "epoch": 0.27, + "grad_norm": 0.52734375, + "learning_rate": 0.00018243349844315117, + "loss": 1.4631, + "step": 5265 + }, + { + "epoch": 0.27, + "grad_norm": 0.53125, + "learning_rate": 0.00018238234489557215, + "loss": 1.4368, + "step": 5270 + }, + { + "epoch": 0.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0001823311241703122, + "loss": 1.4617, + "step": 5275 + }, + { + "epoch": 0.27, + "grad_norm": 0.5546875, + "learning_rate": 0.0001822798363091385, + "loss": 1.4589, + "step": 5280 + }, + { + "epoch": 0.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00018222848135387323, + "loss": 1.4841, + "step": 5285 + }, + { + "epoch": 0.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0001821770593463931, + "loss": 1.4629, + "step": 5290 + }, + { + "epoch": 0.27, + "grad_norm": 0.515625, + "learning_rate": 0.00018212557032862953, + "loss": 1.4797, + "step": 5295 + }, + { + "epoch": 0.27, + "grad_norm": 0.53125, + "learning_rate": 0.0001820740143425687, + "loss": 1.4725, + "step": 5300 + }, + { + "epoch": 0.27, + "grad_norm": 0.5078125, + "learning_rate": 0.00018202239143025125, + "loss": 1.4393, + "step": 5305 + }, + { + "epoch": 0.27, + "grad_norm": 0.49609375, + "learning_rate": 0.00018197070163377248, + "loss": 1.4215, + "step": 5310 + }, + { + "epoch": 0.27, + "grad_norm": 0.53515625, + "learning_rate": 0.0001819189449952822, + "loss": 1.5017, + "step": 5315 + }, + { + "epoch": 0.28, + "grad_norm": 0.5078125, + "learning_rate": 0.00018186712155698475, + "loss": 1.4348, + "step": 5320 + }, + { + "epoch": 0.28, + "grad_norm": 0.53125, + "learning_rate": 0.0001818152313611389, + "loss": 1.4518, + "step": 5325 + }, + { + "epoch": 0.28, + "grad_norm": 0.53515625, + "learning_rate": 0.00018176327445005788, + "loss": 1.4392, + "step": 5330 + }, + { + "epoch": 0.28, + "grad_norm": 0.5390625, + "learning_rate": 0.0001817112508661093, + "loss": 1.4806, + "step": 5335 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.0001816591606517152, + "loss": 1.4508, + "step": 5340 + }, + { + "epoch": 0.28, + "grad_norm": 0.51953125, + "learning_rate": 0.0001816070038493519, + "loss": 1.4718, + "step": 5345 + }, + { + "epoch": 0.28, + "grad_norm": 0.5546875, + "learning_rate": 0.00018155478050155, + "loss": 1.4973, + "step": 5350 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.00018150249065089445, + "loss": 1.466, + "step": 5355 + }, + { + "epoch": 0.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00018145013434002434, + "loss": 1.4631, + "step": 5360 + }, + { + "epoch": 0.28, + "grad_norm": 0.53125, + "learning_rate": 0.00018139771161163295, + "loss": 1.4552, + "step": 5365 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.00018134522250846783, + "loss": 1.4604, + "step": 5370 + }, + { + "epoch": 0.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00018129266707333052, + "loss": 1.4751, + "step": 5375 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.00018124004534907675, + "loss": 1.4653, + "step": 5380 + }, + { + "epoch": 0.28, + "grad_norm": 0.498046875, + "learning_rate": 0.00018118735737861625, + "loss": 1.4767, + "step": 5385 + }, + { + "epoch": 0.28, + "grad_norm": 0.494140625, + "learning_rate": 0.00018113460320491278, + "loss": 1.4715, + "step": 5390 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.0001810817828709841, + "loss": 1.4591, + "step": 5395 + }, + { + "epoch": 0.28, + "grad_norm": 0.53515625, + "learning_rate": 0.0001810288964199019, + "loss": 1.458, + "step": 5400 + }, + { + "epoch": 0.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00018097594389479178, + "loss": 1.4588, + "step": 5405 + }, + { + "epoch": 0.28, + "grad_norm": 0.474609375, + "learning_rate": 0.00018092292533883325, + "loss": 1.4609, + "step": 5410 + }, + { + "epoch": 0.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00018086984079525965, + "loss": 1.4566, + "step": 5415 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.00018081669030735814, + "loss": 1.4482, + "step": 5420 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.0001807634739184696, + "loss": 1.4717, + "step": 5425 + }, + { + "epoch": 0.28, + "grad_norm": 0.55078125, + "learning_rate": 0.00018071019167198872, + "loss": 1.4687, + "step": 5430 + }, + { + "epoch": 0.28, + "grad_norm": 0.53515625, + "learning_rate": 0.0001806568436113638, + "loss": 1.4822, + "step": 5435 + }, + { + "epoch": 0.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00018060342978009697, + "loss": 1.489, + "step": 5440 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.00018054995022174377, + "loss": 1.4717, + "step": 5445 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.00018049640497991355, + "loss": 1.4647, + "step": 5450 + }, + { + "epoch": 0.28, + "grad_norm": 0.53125, + "learning_rate": 0.0001804427940982691, + "loss": 1.4401, + "step": 5455 + }, + { + "epoch": 0.28, + "grad_norm": 0.51171875, + "learning_rate": 0.00018038911762052675, + "loss": 1.4547, + "step": 5460 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.00018033537559045633, + "loss": 1.4169, + "step": 5465 + }, + { + "epoch": 0.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00018028156805188113, + "loss": 1.4402, + "step": 5470 + }, + { + "epoch": 0.28, + "grad_norm": 0.5234375, + "learning_rate": 0.00018022769504867788, + "loss": 1.3978, + "step": 5475 + }, + { + "epoch": 0.28, + "grad_norm": 0.53515625, + "learning_rate": 0.00018017375662477658, + "loss": 1.4334, + "step": 5480 + }, + { + "epoch": 0.28, + "grad_norm": 0.5078125, + "learning_rate": 0.00018011975282416077, + "loss": 1.5001, + "step": 5485 + }, + { + "epoch": 0.28, + "grad_norm": 0.51171875, + "learning_rate": 0.00018006568369086708, + "loss": 1.471, + "step": 5490 + }, + { + "epoch": 0.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00018001154926898565, + "loss": 1.472, + "step": 5495 + }, + { + "epoch": 0.28, + "grad_norm": 0.515625, + "learning_rate": 0.00017995734960265963, + "loss": 1.4583, + "step": 5500 + }, + { + "epoch": 0.28, + "grad_norm": 0.53125, + "learning_rate": 0.00017990308473608555, + "loss": 1.4341, + "step": 5505 + }, + { + "epoch": 0.29, + "grad_norm": 0.50390625, + "learning_rate": 0.00017984875471351302, + "loss": 1.4571, + "step": 5510 + }, + { + "epoch": 0.29, + "grad_norm": 0.5546875, + "learning_rate": 0.00017979435957924476, + "loss": 1.4274, + "step": 5515 + }, + { + "epoch": 0.29, + "grad_norm": 0.53515625, + "learning_rate": 0.00017973989937763665, + "loss": 1.436, + "step": 5520 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.0001796853741530976, + "loss": 1.4752, + "step": 5525 + }, + { + "epoch": 0.29, + "grad_norm": 1.0078125, + "learning_rate": 0.0001796307839500895, + "loss": 1.4609, + "step": 5530 + }, + { + "epoch": 0.29, + "grad_norm": 0.51171875, + "learning_rate": 0.00017957612881312732, + "loss": 1.4496, + "step": 5535 + }, + { + "epoch": 0.29, + "grad_norm": 0.51171875, + "learning_rate": 0.00017952140878677895, + "loss": 1.4494, + "step": 5540 + }, + { + "epoch": 0.29, + "grad_norm": 0.515625, + "learning_rate": 0.0001794666239156651, + "loss": 1.4503, + "step": 5545 + }, + { + "epoch": 0.29, + "grad_norm": 0.52734375, + "learning_rate": 0.00017941177424445943, + "loss": 1.4531, + "step": 5550 + }, + { + "epoch": 0.29, + "grad_norm": 0.5234375, + "learning_rate": 0.00017935685981788847, + "loss": 1.4794, + "step": 5555 + }, + { + "epoch": 0.29, + "grad_norm": 0.50390625, + "learning_rate": 0.00017930188068073153, + "loss": 1.4549, + "step": 5560 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017924683687782066, + "loss": 1.4862, + "step": 5565 + }, + { + "epoch": 0.29, + "grad_norm": 0.53515625, + "learning_rate": 0.00017919172845404067, + "loss": 1.4818, + "step": 5570 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017913655545432903, + "loss": 1.4655, + "step": 5575 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017908131792367587, + "loss": 1.4568, + "step": 5580 + }, + { + "epoch": 0.29, + "grad_norm": 0.52734375, + "learning_rate": 0.00017902601590712408, + "loss": 1.4522, + "step": 5585 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017897064944976887, + "loss": 1.4274, + "step": 5590 + }, + { + "epoch": 0.29, + "grad_norm": 0.5234375, + "learning_rate": 0.00017891521859675824, + "loss": 1.4413, + "step": 5595 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017885972339329255, + "loss": 1.4872, + "step": 5600 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017880416388462472, + "loss": 1.4727, + "step": 5605 + }, + { + "epoch": 0.29, + "grad_norm": 0.515625, + "learning_rate": 0.0001787485401160601, + "loss": 1.4454, + "step": 5610 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017869285213295634, + "loss": 1.4622, + "step": 5615 + }, + { + "epoch": 0.29, + "grad_norm": 0.5234375, + "learning_rate": 0.0001786370999807236, + "loss": 1.4323, + "step": 5620 + }, + { + "epoch": 0.29, + "grad_norm": 0.5234375, + "learning_rate": 0.00017858128370482426, + "loss": 1.4826, + "step": 5625 + }, + { + "epoch": 0.29, + "grad_norm": 0.52734375, + "learning_rate": 0.00017852540335077302, + "loss": 1.468, + "step": 5630 + }, + { + "epoch": 0.29, + "grad_norm": 0.498046875, + "learning_rate": 0.00017846945896413685, + "loss": 1.4539, + "step": 5635 + }, + { + "epoch": 0.29, + "grad_norm": 0.53125, + "learning_rate": 0.00017841345059053492, + "loss": 1.4769, + "step": 5640 + }, + { + "epoch": 0.29, + "grad_norm": 0.53515625, + "learning_rate": 0.00017835737827563857, + "loss": 1.4582, + "step": 5645 + }, + { + "epoch": 0.29, + "grad_norm": 0.52734375, + "learning_rate": 0.00017830124206517128, + "loss": 1.4439, + "step": 5650 + }, + { + "epoch": 0.29, + "grad_norm": 0.546875, + "learning_rate": 0.00017824504200490866, + "loss": 1.4926, + "step": 5655 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017818877814067833, + "loss": 1.4687, + "step": 5660 + }, + { + "epoch": 0.29, + "grad_norm": 0.5703125, + "learning_rate": 0.00017813245051836, + "loss": 1.4605, + "step": 5665 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.00017807605918388538, + "loss": 1.4395, + "step": 5670 + }, + { + "epoch": 0.29, + "grad_norm": 0.51171875, + "learning_rate": 0.00017801960418323802, + "loss": 1.4392, + "step": 5675 + }, + { + "epoch": 0.29, + "grad_norm": 0.5390625, + "learning_rate": 0.0001779630855624535, + "loss": 1.4397, + "step": 5680 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017790650336761926, + "loss": 1.4667, + "step": 5685 + }, + { + "epoch": 0.29, + "grad_norm": 0.53515625, + "learning_rate": 0.00017784985764487455, + "loss": 1.4608, + "step": 5690 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078125, + "learning_rate": 0.00017779314844041047, + "loss": 1.4725, + "step": 5695 + }, + { + "epoch": 0.29, + "grad_norm": 0.515625, + "learning_rate": 0.00017773637580046974, + "loss": 1.4486, + "step": 5700 + }, + { + "epoch": 0.3, + "grad_norm": 0.52734375, + "learning_rate": 0.00017767953977134704, + "loss": 1.4713, + "step": 5705 + }, + { + "epoch": 0.3, + "grad_norm": 0.5234375, + "learning_rate": 0.00017762264039938855, + "loss": 1.4404, + "step": 5710 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.0001775656777309922, + "loss": 1.4726, + "step": 5715 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 0.0001775086518126075, + "loss": 1.472, + "step": 5720 + }, + { + "epoch": 0.3, + "grad_norm": 0.51171875, + "learning_rate": 0.00017745156269073555, + "loss": 1.412, + "step": 5725 + }, + { + "epoch": 0.3, + "grad_norm": 0.53125, + "learning_rate": 0.00017739441041192896, + "loss": 1.4437, + "step": 5730 + }, + { + "epoch": 0.3, + "grad_norm": 0.51171875, + "learning_rate": 0.00017733719502279185, + "loss": 1.4726, + "step": 5735 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017727991656997985, + "loss": 1.4665, + "step": 5740 + }, + { + "epoch": 0.3, + "grad_norm": 0.546875, + "learning_rate": 0.00017722257510019996, + "loss": 1.4804, + "step": 5745 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017716517066021056, + "loss": 1.4558, + "step": 5750 + }, + { + "epoch": 0.3, + "grad_norm": 0.5, + "learning_rate": 0.00017710770329682144, + "loss": 1.4385, + "step": 5755 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017705017305689365, + "loss": 1.4651, + "step": 5760 + }, + { + "epoch": 0.3, + "grad_norm": 0.53125, + "learning_rate": 0.00017699257998733952, + "loss": 1.4543, + "step": 5765 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.00017693492413512263, + "loss": 1.4433, + "step": 5770 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.00017687720554725772, + "loss": 1.4974, + "step": 5775 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.00017681942427081072, + "loss": 1.4828, + "step": 5780 + }, + { + "epoch": 0.3, + "grad_norm": 0.49609375, + "learning_rate": 0.00017676158035289868, + "loss": 1.4557, + "step": 5785 + }, + { + "epoch": 0.3, + "grad_norm": 0.48828125, + "learning_rate": 0.00017670367384068971, + "loss": 1.4556, + "step": 5790 + }, + { + "epoch": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00017664570478140296, + "loss": 1.4639, + "step": 5795 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00017658767322230862, + "loss": 1.4575, + "step": 5800 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017652957921072783, + "loss": 1.4221, + "step": 5805 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.0001764714227940326, + "loss": 1.4913, + "step": 5810 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.0001764132040196459, + "loss": 1.477, + "step": 5815 + }, + { + "epoch": 0.3, + "grad_norm": 0.5546875, + "learning_rate": 0.0001763549229350415, + "loss": 1.4424, + "step": 5820 + }, + { + "epoch": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00017629657958774403, + "loss": 1.4894, + "step": 5825 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00017623817402532884, + "loss": 1.4499, + "step": 5830 + }, + { + "epoch": 0.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00017617970629542207, + "loss": 1.4797, + "step": 5835 + }, + { + "epoch": 0.3, + "grad_norm": 0.53125, + "learning_rate": 0.00017612117644570047, + "loss": 1.4671, + "step": 5840 + }, + { + "epoch": 0.3, + "grad_norm": 0.498046875, + "learning_rate": 0.0001760625845238915, + "loss": 1.4581, + "step": 5845 + }, + { + "epoch": 0.3, + "grad_norm": 0.5390625, + "learning_rate": 0.0001760039305777733, + "loss": 1.4995, + "step": 5850 + }, + { + "epoch": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 0.0001759452146551744, + "loss": 1.4475, + "step": 5855 + }, + { + "epoch": 0.3, + "grad_norm": 0.58203125, + "learning_rate": 0.00017588643680397408, + "loss": 1.4706, + "step": 5860 + }, + { + "epoch": 0.3, + "grad_norm": 3.921875, + "learning_rate": 0.00017582759707210203, + "loss": 1.4339, + "step": 5865 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 0.0001757686955075383, + "loss": 1.4899, + "step": 5870 + }, + { + "epoch": 0.3, + "grad_norm": 0.5546875, + "learning_rate": 0.00017570973215831357, + "loss": 1.4016, + "step": 5875 + }, + { + "epoch": 0.3, + "grad_norm": 0.5234375, + "learning_rate": 0.00017565070707250868, + "loss": 1.4766, + "step": 5880 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.000175591620298255, + "loss": 1.4583, + "step": 5885 + }, + { + "epoch": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00017553247188373402, + "loss": 1.4627, + "step": 5890 + }, + { + "epoch": 0.3, + "grad_norm": 0.515625, + "learning_rate": 0.00017547326187717773, + "loss": 1.476, + "step": 5895 + }, + { + "epoch": 0.31, + "grad_norm": 0.51953125, + "learning_rate": 0.00017541399032686811, + "loss": 1.4523, + "step": 5900 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 0.00017535465728113746, + "loss": 1.4557, + "step": 5905 + }, + { + "epoch": 0.31, + "grad_norm": 0.50390625, + "learning_rate": 0.0001752952627883682, + "loss": 1.4563, + "step": 5910 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 0.0001752358068969928, + "loss": 1.4079, + "step": 5915 + }, + { + "epoch": 0.31, + "grad_norm": 0.5078125, + "learning_rate": 0.0001751762896554939, + "loss": 1.4517, + "step": 5920 + }, + { + "epoch": 0.31, + "grad_norm": 0.5390625, + "learning_rate": 0.0001751167111124041, + "loss": 1.4412, + "step": 5925 + }, + { + "epoch": 0.31, + "grad_norm": 0.498046875, + "learning_rate": 0.00017505707131630597, + "loss": 1.4473, + "step": 5930 + }, + { + "epoch": 0.31, + "grad_norm": 0.51171875, + "learning_rate": 0.00017499737031583207, + "loss": 1.4326, + "step": 5935 + }, + { + "epoch": 0.31, + "grad_norm": 0.5390625, + "learning_rate": 0.00017493760815966486, + "loss": 1.4869, + "step": 5940 + }, + { + "epoch": 0.31, + "grad_norm": 0.51171875, + "learning_rate": 0.00017487778489653667, + "loss": 1.4441, + "step": 5945 + }, + { + "epoch": 0.31, + "grad_norm": 0.51953125, + "learning_rate": 0.00017481790057522964, + "loss": 1.4568, + "step": 5950 + }, + { + "epoch": 0.31, + "grad_norm": 0.5234375, + "learning_rate": 0.00017475795524457568, + "loss": 1.4663, + "step": 5955 + }, + { + "epoch": 0.31, + "grad_norm": 0.55078125, + "learning_rate": 0.00017469794895345656, + "loss": 1.4548, + "step": 5960 + }, + { + "epoch": 0.31, + "grad_norm": 0.48828125, + "learning_rate": 0.0001746378817508036, + "loss": 1.4363, + "step": 5965 + }, + { + "epoch": 0.31, + "grad_norm": 0.5078125, + "learning_rate": 0.00017457775368559793, + "loss": 1.463, + "step": 5970 + }, + { + "epoch": 0.31, + "grad_norm": 0.515625, + "learning_rate": 0.00017451756480687017, + "loss": 1.4627, + "step": 5975 + }, + { + "epoch": 0.31, + "grad_norm": 0.515625, + "learning_rate": 0.0001744573151637007, + "loss": 1.4662, + "step": 5980 + }, + { + "epoch": 0.31, + "grad_norm": 0.486328125, + "learning_rate": 0.00017439700480521934, + "loss": 1.4552, + "step": 5985 + }, + { + "epoch": 0.31, + "grad_norm": 0.498046875, + "learning_rate": 0.0001743366337806054, + "loss": 1.4738, + "step": 5990 + }, + { + "epoch": 0.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0001742762021390877, + "loss": 1.4809, + "step": 5995 + }, + { + "epoch": 0.31, + "grad_norm": 0.52734375, + "learning_rate": 0.0001742157099299445, + "loss": 1.4729, + "step": 6000 + }, + { + "epoch": 0.31, + "grad_norm": 0.53125, + "learning_rate": 0.00017415515720250346, + "loss": 1.4271, + "step": 6005 + }, + { + "epoch": 0.31, + "grad_norm": 0.52734375, + "learning_rate": 0.00017409454400614153, + "loss": 1.4528, + "step": 6010 + }, + { + "epoch": 0.31, + "grad_norm": 0.51171875, + "learning_rate": 0.00017403387039028503, + "loss": 1.4727, + "step": 6015 + }, + { + "epoch": 0.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0001739731364044095, + "loss": 1.4952, + "step": 6020 + }, + { + "epoch": 0.31, + "grad_norm": 0.51953125, + "learning_rate": 0.00017391234209803975, + "loss": 1.4583, + "step": 6025 + }, + { + "epoch": 0.31, + "grad_norm": 0.6171875, + "learning_rate": 0.00017385148752074975, + "loss": 1.4748, + "step": 6030 + }, + { + "epoch": 0.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0001737905727221626, + "loss": 1.4519, + "step": 6035 + }, + { + "epoch": 0.31, + "grad_norm": 0.515625, + "learning_rate": 0.00017372959775195057, + "loss": 1.4362, + "step": 6040 + }, + { + "epoch": 0.31, + "grad_norm": 0.50390625, + "learning_rate": 0.00017366856265983493, + "loss": 1.4305, + "step": 6045 + }, + { + "epoch": 0.31, + "grad_norm": 0.53515625, + "learning_rate": 0.00017360746749558602, + "loss": 1.4634, + "step": 6050 + }, + { + "epoch": 0.31, + "grad_norm": 0.51953125, + "learning_rate": 0.00017354631230902316, + "loss": 1.4423, + "step": 6055 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 0.00017348509715001457, + "loss": 1.3851, + "step": 6060 + }, + { + "epoch": 0.31, + "grad_norm": 0.55078125, + "learning_rate": 0.00017342382206847744, + "loss": 1.4521, + "step": 6065 + }, + { + "epoch": 0.31, + "grad_norm": 0.51171875, + "learning_rate": 0.00017336248711437774, + "loss": 1.4607, + "step": 6070 + }, + { + "epoch": 0.31, + "grad_norm": 0.53515625, + "learning_rate": 0.00017330109233773037, + "loss": 1.4446, + "step": 6075 + }, + { + "epoch": 0.31, + "grad_norm": 0.5078125, + "learning_rate": 0.00017323963778859892, + "loss": 1.4198, + "step": 6080 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 0.00017317812351709576, + "loss": 1.4496, + "step": 6085 + }, + { + "epoch": 0.32, + "grad_norm": 0.52734375, + "learning_rate": 0.00017311654957338196, + "loss": 1.4998, + "step": 6090 + }, + { + "epoch": 0.32, + "grad_norm": 0.53515625, + "learning_rate": 0.00017305491600766725, + "loss": 1.4964, + "step": 6095 + }, + { + "epoch": 0.32, + "grad_norm": 0.5078125, + "learning_rate": 0.00017299322287020995, + "loss": 1.467, + "step": 6100 + }, + { + "epoch": 0.32, + "grad_norm": 0.52734375, + "learning_rate": 0.00017293147021131701, + "loss": 1.4281, + "step": 6105 + }, + { + "epoch": 0.32, + "grad_norm": 0.55078125, + "learning_rate": 0.00017286965808134387, + "loss": 1.4406, + "step": 6110 + }, + { + "epoch": 0.32, + "grad_norm": 0.51953125, + "learning_rate": 0.00017280778653069442, + "loss": 1.446, + "step": 6115 + }, + { + "epoch": 0.32, + "grad_norm": 0.5234375, + "learning_rate": 0.00017274585560982117, + "loss": 1.4484, + "step": 6120 + }, + { + "epoch": 0.32, + "grad_norm": 0.5390625, + "learning_rate": 0.00017268386536922487, + "loss": 1.457, + "step": 6125 + }, + { + "epoch": 0.32, + "grad_norm": 0.515625, + "learning_rate": 0.00017262181585945473, + "loss": 1.4568, + "step": 6130 + }, + { + "epoch": 0.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00017255970713110825, + "loss": 1.4554, + "step": 6135 + }, + { + "epoch": 0.32, + "grad_norm": 0.498046875, + "learning_rate": 0.00017249753923483124, + "loss": 1.4656, + "step": 6140 + }, + { + "epoch": 0.32, + "grad_norm": 0.51953125, + "learning_rate": 0.00017243531222131778, + "loss": 1.466, + "step": 6145 + }, + { + "epoch": 0.32, + "grad_norm": 0.53125, + "learning_rate": 0.0001723730261413101, + "loss": 1.4508, + "step": 6150 + }, + { + "epoch": 0.32, + "grad_norm": 0.66796875, + "learning_rate": 0.00017231068104559864, + "loss": 1.4772, + "step": 6155 + }, + { + "epoch": 0.32, + "grad_norm": 0.5078125, + "learning_rate": 0.00017224827698502195, + "loss": 1.4592, + "step": 6160 + }, + { + "epoch": 0.32, + "grad_norm": 0.52734375, + "learning_rate": 0.00017218581401046666, + "loss": 1.4591, + "step": 6165 + }, + { + "epoch": 0.32, + "grad_norm": 0.56640625, + "learning_rate": 0.00017212329217286743, + "loss": 1.4274, + "step": 6170 + }, + { + "epoch": 0.32, + "grad_norm": 0.52734375, + "learning_rate": 0.000172060711523207, + "loss": 1.4351, + "step": 6175 + }, + { + "epoch": 0.32, + "grad_norm": 0.57421875, + "learning_rate": 0.00017199807211251588, + "loss": 1.4678, + "step": 6180 + }, + { + "epoch": 0.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00017193537399187272, + "loss": 1.4544, + "step": 6185 + }, + { + "epoch": 0.32, + "grad_norm": 0.5, + "learning_rate": 0.00017187261721240388, + "loss": 1.4647, + "step": 6190 + }, + { + "epoch": 0.32, + "grad_norm": 0.5390625, + "learning_rate": 0.00017180980182528364, + "loss": 1.4611, + "step": 6195 + }, + { + "epoch": 0.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00017174692788173403, + "loss": 1.4442, + "step": 6200 + }, + { + "epoch": 0.32, + "grad_norm": 0.5234375, + "learning_rate": 0.00017168399543302486, + "loss": 1.4441, + "step": 6205 + }, + { + "epoch": 0.32, + "grad_norm": 0.53515625, + "learning_rate": 0.00017162100453047363, + "loss": 1.4559, + "step": 6210 + }, + { + "epoch": 0.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00017155795522544548, + "loss": 1.449, + "step": 6215 + }, + { + "epoch": 0.32, + "grad_norm": 0.5703125, + "learning_rate": 0.0001714948475693532, + "loss": 1.4631, + "step": 6220 + }, + { + "epoch": 0.32, + "grad_norm": 0.50390625, + "learning_rate": 0.0001714316816136572, + "loss": 1.4371, + "step": 6225 + }, + { + "epoch": 0.32, + "grad_norm": 0.5234375, + "learning_rate": 0.00017136845740986533, + "loss": 1.4613, + "step": 6230 + }, + { + "epoch": 0.32, + "grad_norm": 0.5234375, + "learning_rate": 0.00017130517500953306, + "loss": 1.4082, + "step": 6235 + }, + { + "epoch": 0.32, + "grad_norm": 0.5390625, + "learning_rate": 0.0001712418344642632, + "loss": 1.4541, + "step": 6240 + }, + { + "epoch": 0.32, + "grad_norm": 0.55859375, + "learning_rate": 0.00017117843582570608, + "loss": 1.4396, + "step": 6245 + }, + { + "epoch": 0.32, + "grad_norm": 0.50390625, + "learning_rate": 0.0001711149791455593, + "loss": 1.4454, + "step": 6250 + }, + { + "epoch": 0.32, + "grad_norm": 0.54296875, + "learning_rate": 0.00017105146447556787, + "loss": 1.4688, + "step": 6255 + }, + { + "epoch": 0.32, + "grad_norm": 0.515625, + "learning_rate": 0.00017098789186752403, + "loss": 1.5107, + "step": 6260 + }, + { + "epoch": 0.32, + "grad_norm": 0.494140625, + "learning_rate": 0.0001709242613732673, + "loss": 1.4628, + "step": 6265 + }, + { + "epoch": 0.32, + "grad_norm": 0.53515625, + "learning_rate": 0.0001708605730446844, + "loss": 1.4343, + "step": 6270 + }, + { + "epoch": 0.32, + "grad_norm": 0.515625, + "learning_rate": 0.0001707968269337092, + "loss": 1.4394, + "step": 6275 + }, + { + "epoch": 0.32, + "grad_norm": 0.546875, + "learning_rate": 0.00017073302309232268, + "loss": 1.4945, + "step": 6280 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00017066916157255292, + "loss": 1.4309, + "step": 6285 + }, + { + "epoch": 0.33, + "grad_norm": 0.50390625, + "learning_rate": 0.00017060524242647502, + "loss": 1.4539, + "step": 6290 + }, + { + "epoch": 0.33, + "grad_norm": 0.498046875, + "learning_rate": 0.00017054126570621107, + "loss": 1.4287, + "step": 6295 + }, + { + "epoch": 0.33, + "grad_norm": 0.494140625, + "learning_rate": 0.00017047723146393012, + "loss": 1.4452, + "step": 6300 + }, + { + "epoch": 0.33, + "grad_norm": 0.5234375, + "learning_rate": 0.00017041313975184807, + "loss": 1.484, + "step": 6305 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.00017034899062222776, + "loss": 1.4645, + "step": 6310 + }, + { + "epoch": 0.33, + "grad_norm": 0.546875, + "learning_rate": 0.00017028478412737882, + "loss": 1.4366, + "step": 6315 + }, + { + "epoch": 0.33, + "grad_norm": 0.515625, + "learning_rate": 0.00017022052031965762, + "loss": 1.4023, + "step": 6320 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00017015619925146735, + "loss": 1.4506, + "step": 6325 + }, + { + "epoch": 0.33, + "grad_norm": 0.51171875, + "learning_rate": 0.0001700918209752578, + "loss": 1.4303, + "step": 6330 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.00017002738554352552, + "loss": 1.4716, + "step": 6335 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00016996289300881353, + "loss": 1.4636, + "step": 6340 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00016989834342371146, + "loss": 1.4566, + "step": 6345 + }, + { + "epoch": 0.33, + "grad_norm": 0.50390625, + "learning_rate": 0.00016983373684085557, + "loss": 1.4486, + "step": 6350 + }, + { + "epoch": 0.33, + "grad_norm": 0.50390625, + "learning_rate": 0.00016976907331292846, + "loss": 1.4181, + "step": 6355 + }, + { + "epoch": 0.33, + "grad_norm": 0.52734375, + "learning_rate": 0.00016970435289265923, + "loss": 1.4574, + "step": 6360 + }, + { + "epoch": 0.33, + "grad_norm": 0.51171875, + "learning_rate": 0.00016963957563282336, + "loss": 1.4554, + "step": 6365 + }, + { + "epoch": 0.33, + "grad_norm": 0.51171875, + "learning_rate": 0.00016957474158624266, + "loss": 1.4864, + "step": 6370 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.0001695098508057853, + "loss": 1.4319, + "step": 6375 + }, + { + "epoch": 0.33, + "grad_norm": 0.51171875, + "learning_rate": 0.00016944490334436566, + "loss": 1.4662, + "step": 6380 + }, + { + "epoch": 0.33, + "grad_norm": 0.52734375, + "learning_rate": 0.00016937989925494432, + "loss": 1.4459, + "step": 6385 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00016931483859052813, + "loss": 1.4277, + "step": 6390 + }, + { + "epoch": 0.33, + "grad_norm": 0.53515625, + "learning_rate": 0.00016924972140417, + "loss": 1.4903, + "step": 6395 + }, + { + "epoch": 0.33, + "grad_norm": 0.48828125, + "learning_rate": 0.00016918454774896892, + "loss": 1.4032, + "step": 6400 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00016911931767807, + "loss": 1.4108, + "step": 6405 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.00016905403124466427, + "loss": 1.466, + "step": 6410 + }, + { + "epoch": 0.33, + "grad_norm": 0.53515625, + "learning_rate": 0.00016898868850198878, + "loss": 1.4487, + "step": 6415 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.0001689232895033265, + "loss": 1.4416, + "step": 6420 + }, + { + "epoch": 0.33, + "grad_norm": 0.54296875, + "learning_rate": 0.00016885783430200616, + "loss": 1.4537, + "step": 6425 + }, + { + "epoch": 0.33, + "grad_norm": 0.5390625, + "learning_rate": 0.0001687923229514025, + "loss": 1.444, + "step": 6430 + }, + { + "epoch": 0.33, + "grad_norm": 0.51171875, + "learning_rate": 0.00016872675550493594, + "loss": 1.4532, + "step": 6435 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.00016866113201607257, + "loss": 1.4436, + "step": 6440 + }, + { + "epoch": 0.33, + "grad_norm": 0.5234375, + "learning_rate": 0.0001685954525383244, + "loss": 1.4328, + "step": 6445 + }, + { + "epoch": 0.33, + "grad_norm": 0.5078125, + "learning_rate": 0.0001685297171252488, + "loss": 1.4651, + "step": 6450 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.0001684639258304491, + "loss": 1.4652, + "step": 6455 + }, + { + "epoch": 0.33, + "grad_norm": 0.5234375, + "learning_rate": 0.00016839807870757387, + "loss": 1.4639, + "step": 6460 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.00016833217581031738, + "loss": 1.4381, + "step": 6465 + }, + { + "epoch": 0.33, + "grad_norm": 0.53515625, + "learning_rate": 0.00016826621719241938, + "loss": 1.4469, + "step": 6470 + }, + { + "epoch": 0.33, + "grad_norm": 0.48828125, + "learning_rate": 0.00016820020290766498, + "loss": 1.434, + "step": 6475 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.00016813413300988478, + "loss": 1.4403, + "step": 6480 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.0001680680075529546, + "loss": 1.4353, + "step": 6485 + }, + { + "epoch": 0.34, + "grad_norm": 0.52734375, + "learning_rate": 0.00016800182659079568, + "loss": 1.4958, + "step": 6490 + }, + { + "epoch": 0.34, + "grad_norm": 0.5234375, + "learning_rate": 0.0001679355901773745, + "loss": 1.4549, + "step": 6495 + }, + { + "epoch": 0.34, + "grad_norm": 0.53515625, + "learning_rate": 0.0001678692983667027, + "loss": 1.4271, + "step": 6500 + }, + { + "epoch": 0.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00016780295121283717, + "loss": 1.4726, + "step": 6505 + }, + { + "epoch": 0.34, + "grad_norm": 0.53125, + "learning_rate": 0.00016773654876987983, + "loss": 1.4884, + "step": 6510 + }, + { + "epoch": 0.34, + "grad_norm": 0.51953125, + "learning_rate": 0.00016767009109197782, + "loss": 1.4783, + "step": 6515 + }, + { + "epoch": 0.34, + "grad_norm": 0.5, + "learning_rate": 0.00016760357823332318, + "loss": 1.4433, + "step": 6520 + }, + { + "epoch": 0.34, + "grad_norm": 0.494140625, + "learning_rate": 0.00016753701024815304, + "loss": 1.4583, + "step": 6525 + }, + { + "epoch": 0.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00016747038719074945, + "loss": 1.4556, + "step": 6530 + }, + { + "epoch": 0.34, + "grad_norm": 0.486328125, + "learning_rate": 0.00016740370911543938, + "loss": 1.4543, + "step": 6535 + }, + { + "epoch": 0.34, + "grad_norm": 0.51171875, + "learning_rate": 0.00016733697607659463, + "loss": 1.4557, + "step": 6540 + }, + { + "epoch": 0.34, + "grad_norm": 0.54296875, + "learning_rate": 0.0001672701881286319, + "loss": 1.4402, + "step": 6545 + }, + { + "epoch": 0.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00016720334532601254, + "loss": 1.4782, + "step": 6550 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.00016713644772324275, + "loss": 1.4437, + "step": 6555 + }, + { + "epoch": 0.34, + "grad_norm": 0.51171875, + "learning_rate": 0.00016706949537487336, + "loss": 1.4701, + "step": 6560 + }, + { + "epoch": 0.34, + "grad_norm": 0.53515625, + "learning_rate": 0.0001670024883354998, + "loss": 1.437, + "step": 6565 + }, + { + "epoch": 0.34, + "grad_norm": 0.5234375, + "learning_rate": 0.0001669354266597622, + "loss": 1.4347, + "step": 6570 + }, + { + "epoch": 0.34, + "grad_norm": 0.5, + "learning_rate": 0.0001668683104023452, + "loss": 1.463, + "step": 6575 + }, + { + "epoch": 0.34, + "grad_norm": 0.5703125, + "learning_rate": 0.00016680113961797788, + "loss": 1.4704, + "step": 6580 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.00016673391436143384, + "loss": 1.4307, + "step": 6585 + }, + { + "epoch": 0.34, + "grad_norm": 0.51953125, + "learning_rate": 0.00016666663468753118, + "loss": 1.4539, + "step": 6590 + }, + { + "epoch": 0.34, + "grad_norm": 0.51171875, + "learning_rate": 0.00016659930065113219, + "loss": 1.4447, + "step": 6595 + }, + { + "epoch": 0.34, + "grad_norm": 0.546875, + "learning_rate": 0.00016653191230714366, + "loss": 1.44, + "step": 6600 + }, + { + "epoch": 0.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00016646446971051653, + "loss": 1.4302, + "step": 6605 + }, + { + "epoch": 0.34, + "grad_norm": 0.515625, + "learning_rate": 0.00016639697291624615, + "loss": 1.4595, + "step": 6610 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.00016632942197937185, + "loss": 1.4512, + "step": 6615 + }, + { + "epoch": 0.34, + "grad_norm": 0.52734375, + "learning_rate": 0.00016626181695497726, + "loss": 1.4692, + "step": 6620 + }, + { + "epoch": 0.34, + "grad_norm": 0.546875, + "learning_rate": 0.00016619415789819012, + "loss": 1.4509, + "step": 6625 + }, + { + "epoch": 0.34, + "grad_norm": 0.546875, + "learning_rate": 0.00016612644486418211, + "loss": 1.47, + "step": 6630 + }, + { + "epoch": 0.34, + "grad_norm": 0.56640625, + "learning_rate": 0.00016605867790816901, + "loss": 1.463, + "step": 6635 + }, + { + "epoch": 0.34, + "grad_norm": 0.5078125, + "learning_rate": 0.00016599085708541065, + "loss": 1.423, + "step": 6640 + }, + { + "epoch": 0.34, + "grad_norm": 0.51953125, + "learning_rate": 0.0001659229824512106, + "loss": 1.4258, + "step": 6645 + }, + { + "epoch": 0.34, + "grad_norm": 0.53515625, + "learning_rate": 0.0001658550540609164, + "loss": 1.4762, + "step": 6650 + }, + { + "epoch": 0.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00016578707196991953, + "loss": 1.4485, + "step": 6655 + }, + { + "epoch": 0.34, + "grad_norm": 0.5234375, + "learning_rate": 0.00016571903623365506, + "loss": 1.4438, + "step": 6660 + }, + { + "epoch": 0.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00016565094690760193, + "loss": 1.4411, + "step": 6665 + }, + { + "epoch": 0.35, + "grad_norm": 0.51953125, + "learning_rate": 0.00016558280404728275, + "loss": 1.4442, + "step": 6670 + }, + { + "epoch": 0.35, + "grad_norm": 0.51953125, + "learning_rate": 0.00016551460770826383, + "loss": 1.4678, + "step": 6675 + }, + { + "epoch": 0.35, + "grad_norm": 0.50390625, + "learning_rate": 0.00016544635794615498, + "loss": 1.4506, + "step": 6680 + }, + { + "epoch": 0.35, + "grad_norm": 0.49609375, + "learning_rate": 0.00016537805481660968, + "loss": 1.4264, + "step": 6685 + }, + { + "epoch": 0.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016530969837532487, + "loss": 1.4492, + "step": 6690 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 0.000165241288678041, + "loss": 1.4742, + "step": 6695 + }, + { + "epoch": 0.35, + "grad_norm": 0.5078125, + "learning_rate": 0.00016517282578054187, + "loss": 1.454, + "step": 6700 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 0.0001651043097386548, + "loss": 1.4484, + "step": 6705 + }, + { + "epoch": 0.35, + "grad_norm": 0.494140625, + "learning_rate": 0.0001650357406082503, + "loss": 1.4599, + "step": 6710 + }, + { + "epoch": 0.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016496711844524224, + "loss": 1.434, + "step": 6715 + }, + { + "epoch": 0.35, + "grad_norm": 0.5078125, + "learning_rate": 0.00016489844330558773, + "loss": 1.4689, + "step": 6720 + }, + { + "epoch": 0.35, + "grad_norm": 0.55078125, + "learning_rate": 0.00016482971524528714, + "loss": 1.4879, + "step": 6725 + }, + { + "epoch": 0.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016476093432038385, + "loss": 1.4342, + "step": 6730 + }, + { + "epoch": 0.35, + "grad_norm": 0.5078125, + "learning_rate": 0.00016469210058696446, + "loss": 1.4312, + "step": 6735 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.0001646232141011586, + "loss": 1.4278, + "step": 6740 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.00016455427491913888, + "loss": 1.4728, + "step": 6745 + }, + { + "epoch": 0.35, + "grad_norm": 0.53515625, + "learning_rate": 0.000164485283097121, + "loss": 1.4484, + "step": 6750 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 0.00016441623869136343, + "loss": 1.4546, + "step": 6755 + }, + { + "epoch": 0.35, + "grad_norm": 0.53125, + "learning_rate": 0.00016434714175816764, + "loss": 1.4523, + "step": 6760 + }, + { + "epoch": 0.35, + "grad_norm": 0.5703125, + "learning_rate": 0.00016427799235387784, + "loss": 1.4494, + "step": 6765 + }, + { + "epoch": 0.35, + "grad_norm": 0.494140625, + "learning_rate": 0.00016420879053488107, + "loss": 1.4351, + "step": 6770 + }, + { + "epoch": 0.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016413953635760714, + "loss": 1.4547, + "step": 6775 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 0.0001640702298785285, + "loss": 1.4584, + "step": 6780 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.00016400087115416034, + "loss": 1.4429, + "step": 6785 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.0001639314602410603, + "loss": 1.4234, + "step": 6790 + }, + { + "epoch": 0.35, + "grad_norm": 0.50390625, + "learning_rate": 0.00016386199719582874, + "loss": 1.4738, + "step": 6795 + }, + { + "epoch": 0.35, + "grad_norm": 0.5078125, + "learning_rate": 0.0001637924820751084, + "loss": 1.4311, + "step": 6800 + }, + { + "epoch": 0.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016372291493558453, + "loss": 1.4664, + "step": 6805 + }, + { + "epoch": 0.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00016365329583398487, + "loss": 1.4486, + "step": 6810 + }, + { + "epoch": 0.35, + "grad_norm": 0.5390625, + "learning_rate": 0.00016358362482707942, + "loss": 1.4347, + "step": 6815 + }, + { + "epoch": 0.35, + "grad_norm": 0.490234375, + "learning_rate": 0.0001635139019716806, + "loss": 1.3988, + "step": 6820 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 0.000163444127324643, + "loss": 1.4205, + "step": 6825 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.00016337430094286358, + "loss": 1.4269, + "step": 6830 + }, + { + "epoch": 0.35, + "grad_norm": 0.51171875, + "learning_rate": 0.00016330442288328134, + "loss": 1.4432, + "step": 6835 + }, + { + "epoch": 0.35, + "grad_norm": 0.54296875, + "learning_rate": 0.00016323449320287755, + "loss": 1.4516, + "step": 6840 + }, + { + "epoch": 0.35, + "grad_norm": 0.50390625, + "learning_rate": 0.0001631645119586755, + "loss": 1.4376, + "step": 6845 + }, + { + "epoch": 0.35, + "grad_norm": 0.5078125, + "learning_rate": 0.0001630944792077405, + "loss": 1.457, + "step": 6850 + }, + { + "epoch": 0.35, + "grad_norm": 0.6015625, + "learning_rate": 0.00016302439500718002, + "loss": 1.472, + "step": 6855 + }, + { + "epoch": 0.35, + "grad_norm": 0.5234375, + "learning_rate": 0.00016295425941414323, + "loss": 1.4462, + "step": 6860 + }, + { + "epoch": 0.36, + "grad_norm": 0.498046875, + "learning_rate": 0.00016288407248582146, + "loss": 1.4394, + "step": 6865 + }, + { + "epoch": 0.36, + "grad_norm": 0.52734375, + "learning_rate": 0.0001628138342794477, + "loss": 1.4604, + "step": 6870 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016274354485229688, + "loss": 1.4599, + "step": 6875 + }, + { + "epoch": 0.36, + "grad_norm": 0.51171875, + "learning_rate": 0.0001626732042616857, + "loss": 1.4046, + "step": 6880 + }, + { + "epoch": 0.36, + "grad_norm": 0.5, + "learning_rate": 0.00016260281256497247, + "loss": 1.4412, + "step": 6885 + }, + { + "epoch": 0.36, + "grad_norm": 0.52734375, + "learning_rate": 0.00016253236981955726, + "loss": 1.4617, + "step": 6890 + }, + { + "epoch": 0.36, + "grad_norm": 0.515625, + "learning_rate": 0.00016246187608288178, + "loss": 1.4696, + "step": 6895 + }, + { + "epoch": 0.36, + "grad_norm": 0.515625, + "learning_rate": 0.00016239133141242925, + "loss": 1.4534, + "step": 6900 + }, + { + "epoch": 0.36, + "grad_norm": 0.51171875, + "learning_rate": 0.0001623207358657245, + "loss": 1.4349, + "step": 6905 + }, + { + "epoch": 0.36, + "grad_norm": 0.5078125, + "learning_rate": 0.0001622500895003338, + "loss": 1.4505, + "step": 6910 + }, + { + "epoch": 0.36, + "grad_norm": 0.5625, + "learning_rate": 0.00016217939237386485, + "loss": 1.432, + "step": 6915 + }, + { + "epoch": 0.36, + "grad_norm": 0.515625, + "learning_rate": 0.00016210864454396678, + "loss": 1.4469, + "step": 6920 + }, + { + "epoch": 0.36, + "grad_norm": 0.51171875, + "learning_rate": 0.00016203784606833, + "loss": 1.4463, + "step": 6925 + }, + { + "epoch": 0.36, + "grad_norm": 0.50390625, + "learning_rate": 0.00016196699700468634, + "loss": 1.4452, + "step": 6930 + }, + { + "epoch": 0.36, + "grad_norm": 2.90625, + "learning_rate": 0.0001618960974108088, + "loss": 1.4029, + "step": 6935 + }, + { + "epoch": 0.36, + "grad_norm": 0.5234375, + "learning_rate": 0.0001618251473445115, + "loss": 1.4377, + "step": 6940 + }, + { + "epoch": 0.36, + "grad_norm": 0.5078125, + "learning_rate": 0.00016175414686364994, + "loss": 1.4317, + "step": 6945 + }, + { + "epoch": 0.36, + "grad_norm": 0.53515625, + "learning_rate": 0.00016168309602612052, + "loss": 1.4971, + "step": 6950 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016161199488986077, + "loss": 1.4458, + "step": 6955 + }, + { + "epoch": 0.36, + "grad_norm": 0.54296875, + "learning_rate": 0.00016154084351284925, + "loss": 1.4253, + "step": 6960 + }, + { + "epoch": 0.36, + "grad_norm": 0.4921875, + "learning_rate": 0.00016146964195310555, + "loss": 1.4211, + "step": 6965 + }, + { + "epoch": 0.36, + "grad_norm": 0.55078125, + "learning_rate": 0.00016139839026869005, + "loss": 1.4844, + "step": 6970 + }, + { + "epoch": 0.36, + "grad_norm": 0.5078125, + "learning_rate": 0.00016132708851770408, + "loss": 1.4599, + "step": 6975 + }, + { + "epoch": 0.36, + "grad_norm": 0.51953125, + "learning_rate": 0.00016125573675828983, + "loss": 1.4546, + "step": 6980 + }, + { + "epoch": 0.36, + "grad_norm": 0.5078125, + "learning_rate": 0.00016118433504863012, + "loss": 1.4241, + "step": 6985 + }, + { + "epoch": 0.36, + "grad_norm": 0.51953125, + "learning_rate": 0.00016111288344694875, + "loss": 1.4573, + "step": 6990 + }, + { + "epoch": 0.36, + "grad_norm": 0.55078125, + "learning_rate": 0.00016104138201150994, + "loss": 1.4118, + "step": 6995 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016096983080061874, + "loss": 1.4024, + "step": 7000 + }, + { + "epoch": 0.36, + "grad_norm": 0.49609375, + "learning_rate": 0.00016089822987262067, + "loss": 1.4301, + "step": 7005 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016082657928590183, + "loss": 1.4426, + "step": 7010 + }, + { + "epoch": 0.36, + "grad_norm": 0.53515625, + "learning_rate": 0.00016075487909888886, + "loss": 1.4379, + "step": 7015 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.0001606831293700488, + "loss": 1.4465, + "step": 7020 + }, + { + "epoch": 0.36, + "grad_norm": 0.50390625, + "learning_rate": 0.00016061133015788905, + "loss": 1.466, + "step": 7025 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016053948152095745, + "loss": 1.4491, + "step": 7030 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.0001604675835178421, + "loss": 1.4804, + "step": 7035 + }, + { + "epoch": 0.36, + "grad_norm": 0.53125, + "learning_rate": 0.00016039563620717128, + "loss": 1.4269, + "step": 7040 + }, + { + "epoch": 0.36, + "grad_norm": 0.5546875, + "learning_rate": 0.00016032363964761363, + "loss": 1.4239, + "step": 7045 + }, + { + "epoch": 0.36, + "grad_norm": 0.51171875, + "learning_rate": 0.00016025159389787788, + "loss": 1.4139, + "step": 7050 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 0.00016017949901671276, + "loss": 1.444, + "step": 7055 + }, + { + "epoch": 0.37, + "grad_norm": 0.765625, + "learning_rate": 0.00016010735506290726, + "loss": 1.4524, + "step": 7060 + }, + { + "epoch": 0.37, + "grad_norm": 0.53515625, + "learning_rate": 0.00016003516209529023, + "loss": 1.4677, + "step": 7065 + }, + { + "epoch": 0.37, + "grad_norm": 0.52734375, + "learning_rate": 0.00015996292017273058, + "loss": 1.4294, + "step": 7070 + }, + { + "epoch": 0.37, + "grad_norm": 0.51171875, + "learning_rate": 0.0001598906293541371, + "loss": 1.4313, + "step": 7075 + }, + { + "epoch": 0.37, + "grad_norm": 0.52734375, + "learning_rate": 0.00015981828969845844, + "loss": 1.4712, + "step": 7080 + }, + { + "epoch": 0.37, + "grad_norm": 0.53125, + "learning_rate": 0.00015974590126468315, + "loss": 1.4572, + "step": 7085 + }, + { + "epoch": 0.37, + "grad_norm": 0.498046875, + "learning_rate": 0.00015967346411183941, + "loss": 1.4446, + "step": 7090 + }, + { + "epoch": 0.37, + "grad_norm": 0.54296875, + "learning_rate": 0.00015960097829899528, + "loss": 1.4755, + "step": 7095 + }, + { + "epoch": 0.37, + "grad_norm": 0.53515625, + "learning_rate": 0.0001595284438852584, + "loss": 1.4535, + "step": 7100 + }, + { + "epoch": 0.37, + "grad_norm": 0.55078125, + "learning_rate": 0.00015945586092977612, + "loss": 1.4407, + "step": 7105 + }, + { + "epoch": 0.37, + "grad_norm": 0.515625, + "learning_rate": 0.00015938322949173527, + "loss": 1.4454, + "step": 7110 + }, + { + "epoch": 0.37, + "grad_norm": 0.51171875, + "learning_rate": 0.00015931054963036232, + "loss": 1.4415, + "step": 7115 + }, + { + "epoch": 0.37, + "grad_norm": 0.5, + "learning_rate": 0.00015923782140492317, + "loss": 1.4375, + "step": 7120 + }, + { + "epoch": 0.37, + "grad_norm": 0.55859375, + "learning_rate": 0.00015916504487472314, + "loss": 1.4466, + "step": 7125 + }, + { + "epoch": 0.37, + "grad_norm": 0.5078125, + "learning_rate": 0.000159092220099107, + "loss": 1.4097, + "step": 7130 + }, + { + "epoch": 0.37, + "grad_norm": 0.5390625, + "learning_rate": 0.0001590193471374588, + "loss": 1.4065, + "step": 7135 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 0.00015894642604920192, + "loss": 1.4808, + "step": 7140 + }, + { + "epoch": 0.37, + "grad_norm": 0.52734375, + "learning_rate": 0.00015887345689379897, + "loss": 1.4544, + "step": 7145 + }, + { + "epoch": 0.37, + "grad_norm": 0.61328125, + "learning_rate": 0.00015880043973075177, + "loss": 1.4441, + "step": 7150 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 0.00015872737461960126, + "loss": 1.4611, + "step": 7155 + }, + { + "epoch": 0.37, + "grad_norm": 0.5390625, + "learning_rate": 0.00015865426161992753, + "loss": 1.4104, + "step": 7160 + }, + { + "epoch": 0.37, + "grad_norm": 0.54296875, + "learning_rate": 0.00015858110079134966, + "loss": 1.4313, + "step": 7165 + }, + { + "epoch": 0.37, + "grad_norm": 0.54296875, + "learning_rate": 0.00015850789219352577, + "loss": 1.4332, + "step": 7170 + }, + { + "epoch": 0.37, + "grad_norm": 0.53515625, + "learning_rate": 0.0001584346358861529, + "loss": 1.4422, + "step": 7175 + }, + { + "epoch": 0.37, + "grad_norm": 0.53515625, + "learning_rate": 0.00015836133192896702, + "loss": 1.4478, + "step": 7180 + }, + { + "epoch": 0.37, + "grad_norm": 0.494140625, + "learning_rate": 0.00015828798038174298, + "loss": 1.4422, + "step": 7185 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 0.0001582145813042944, + "loss": 1.4665, + "step": 7190 + }, + { + "epoch": 0.37, + "grad_norm": 0.53515625, + "learning_rate": 0.0001581411347564736, + "loss": 1.4325, + "step": 7195 + }, + { + "epoch": 0.37, + "grad_norm": 0.546875, + "learning_rate": 0.00015806764079817178, + "loss": 1.3919, + "step": 7200 + }, + { + "epoch": 0.37, + "grad_norm": 0.515625, + "learning_rate": 0.0001579940994893186, + "loss": 1.4532, + "step": 7205 + }, + { + "epoch": 0.37, + "grad_norm": 0.52734375, + "learning_rate": 0.00015792051088988246, + "loss": 1.4209, + "step": 7210 + }, + { + "epoch": 0.37, + "grad_norm": 1.546875, + "learning_rate": 0.00015784687505987033, + "loss": 1.4561, + "step": 7215 + }, + { + "epoch": 0.37, + "grad_norm": 0.51171875, + "learning_rate": 0.00015777319205932758, + "loss": 1.442, + "step": 7220 + }, + { + "epoch": 0.37, + "grad_norm": 0.494140625, + "learning_rate": 0.00015769946194833817, + "loss": 1.4268, + "step": 7225 + }, + { + "epoch": 0.37, + "grad_norm": 0.52734375, + "learning_rate": 0.00015762568478702436, + "loss": 1.4749, + "step": 7230 + }, + { + "epoch": 0.37, + "grad_norm": 0.515625, + "learning_rate": 0.00015755186063554696, + "loss": 1.4237, + "step": 7235 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 0.00015747798955410483, + "loss": 1.4823, + "step": 7240 + }, + { + "epoch": 0.37, + "grad_norm": 0.5078125, + "learning_rate": 0.00015740407160293535, + "loss": 1.4482, + "step": 7245 + }, + { + "epoch": 0.38, + "grad_norm": 0.49609375, + "learning_rate": 0.00015733010684231395, + "loss": 1.4474, + "step": 7250 + }, + { + "epoch": 0.38, + "grad_norm": 0.5, + "learning_rate": 0.00015725609533255434, + "loss": 1.4608, + "step": 7255 + }, + { + "epoch": 0.38, + "grad_norm": 0.51171875, + "learning_rate": 0.00015718203713400828, + "loss": 1.4731, + "step": 7260 + }, + { + "epoch": 0.38, + "grad_norm": 0.494140625, + "learning_rate": 0.0001571079323070656, + "loss": 1.412, + "step": 7265 + }, + { + "epoch": 0.38, + "grad_norm": 0.51953125, + "learning_rate": 0.00015703378091215428, + "loss": 1.4288, + "step": 7270 + }, + { + "epoch": 0.38, + "grad_norm": 0.546875, + "learning_rate": 0.00015695958300974007, + "loss": 1.4882, + "step": 7275 + }, + { + "epoch": 0.38, + "grad_norm": 0.53515625, + "learning_rate": 0.0001568853386603268, + "loss": 1.4557, + "step": 7280 + }, + { + "epoch": 0.38, + "grad_norm": 0.5390625, + "learning_rate": 0.0001568110479244561, + "loss": 1.4651, + "step": 7285 + }, + { + "epoch": 0.38, + "grad_norm": 0.5703125, + "learning_rate": 0.00015673671086270741, + "loss": 1.4711, + "step": 7290 + }, + { + "epoch": 0.38, + "grad_norm": 0.5390625, + "learning_rate": 0.00015666232753569807, + "loss": 1.4877, + "step": 7295 + }, + { + "epoch": 0.38, + "grad_norm": 0.5078125, + "learning_rate": 0.000156587898004083, + "loss": 1.4273, + "step": 7300 + }, + { + "epoch": 0.38, + "grad_norm": 0.54296875, + "learning_rate": 0.00015651342232855486, + "loss": 1.4321, + "step": 7305 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015643890056984394, + "loss": 1.4391, + "step": 7310 + }, + { + "epoch": 0.38, + "grad_norm": 0.51953125, + "learning_rate": 0.00015636433278871814, + "loss": 1.453, + "step": 7315 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015628971904598277, + "loss": 1.4338, + "step": 7320 + }, + { + "epoch": 0.38, + "grad_norm": 0.54296875, + "learning_rate": 0.00015621505940248076, + "loss": 1.4304, + "step": 7325 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015614035391909242, + "loss": 1.4304, + "step": 7330 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015606560265673535, + "loss": 1.4565, + "step": 7335 + }, + { + "epoch": 0.38, + "grad_norm": 0.52734375, + "learning_rate": 0.00015599080567636463, + "loss": 1.465, + "step": 7340 + }, + { + "epoch": 0.38, + "grad_norm": 0.5078125, + "learning_rate": 0.00015591596303897256, + "loss": 1.449, + "step": 7345 + }, + { + "epoch": 0.38, + "grad_norm": 0.57421875, + "learning_rate": 0.00015584107480558858, + "loss": 1.4558, + "step": 7350 + }, + { + "epoch": 0.38, + "grad_norm": 0.5078125, + "learning_rate": 0.00015576614103727946, + "loss": 1.464, + "step": 7355 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015569116179514896, + "loss": 1.4592, + "step": 7360 + }, + { + "epoch": 0.38, + "grad_norm": 0.5390625, + "learning_rate": 0.00015561613714033804, + "loss": 1.4774, + "step": 7365 + }, + { + "epoch": 0.38, + "grad_norm": 0.49609375, + "learning_rate": 0.00015554106713402466, + "loss": 1.4422, + "step": 7370 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015546595183742372, + "loss": 1.3744, + "step": 7375 + }, + { + "epoch": 0.38, + "grad_norm": 0.50390625, + "learning_rate": 0.00015539079131178705, + "loss": 1.4724, + "step": 7380 + }, + { + "epoch": 0.38, + "grad_norm": 0.515625, + "learning_rate": 0.00015531558561840343, + "loss": 1.4461, + "step": 7385 + }, + { + "epoch": 0.38, + "grad_norm": 0.5234375, + "learning_rate": 0.00015524033481859842, + "loss": 1.4322, + "step": 7390 + }, + { + "epoch": 0.38, + "grad_norm": 0.50390625, + "learning_rate": 0.00015516503897373434, + "loss": 1.4249, + "step": 7395 + }, + { + "epoch": 0.38, + "grad_norm": 0.52734375, + "learning_rate": 0.00015508969814521025, + "loss": 1.4321, + "step": 7400 + }, + { + "epoch": 0.38, + "grad_norm": 0.51171875, + "learning_rate": 0.00015501431239446197, + "loss": 1.4365, + "step": 7405 + }, + { + "epoch": 0.38, + "grad_norm": 0.515625, + "learning_rate": 0.00015493888178296191, + "loss": 1.4282, + "step": 7410 + }, + { + "epoch": 0.38, + "grad_norm": 0.50390625, + "learning_rate": 0.00015486340637221895, + "loss": 1.4296, + "step": 7415 + }, + { + "epoch": 0.38, + "grad_norm": 0.546875, + "learning_rate": 0.00015478788622377872, + "loss": 1.4462, + "step": 7420 + }, + { + "epoch": 0.38, + "grad_norm": 0.51171875, + "learning_rate": 0.00015471232139922312, + "loss": 1.443, + "step": 7425 + }, + { + "epoch": 0.38, + "grad_norm": 0.53515625, + "learning_rate": 0.00015463671196017055, + "loss": 1.4506, + "step": 7430 + }, + { + "epoch": 0.38, + "grad_norm": 0.515625, + "learning_rate": 0.00015456105796827588, + "loss": 1.4532, + "step": 7435 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.00015448535948523018, + "loss": 1.45, + "step": 7440 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.00015440961657276088, + "loss": 1.4341, + "step": 7445 + }, + { + "epoch": 0.39, + "grad_norm": 0.50390625, + "learning_rate": 0.0001543338292926316, + "loss": 1.4292, + "step": 7450 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.0001542579977066422, + "loss": 1.4081, + "step": 7455 + }, + { + "epoch": 0.39, + "grad_norm": 0.5, + "learning_rate": 0.00015418212187662858, + "loss": 1.4163, + "step": 7460 + }, + { + "epoch": 0.39, + "grad_norm": 0.54296875, + "learning_rate": 0.00015410620186446277, + "loss": 1.4281, + "step": 7465 + }, + { + "epoch": 0.39, + "grad_norm": 0.53125, + "learning_rate": 0.00015403023773205286, + "loss": 1.4514, + "step": 7470 + }, + { + "epoch": 0.39, + "grad_norm": 0.52734375, + "learning_rate": 0.00015395422954134278, + "loss": 1.4534, + "step": 7475 + }, + { + "epoch": 0.39, + "grad_norm": 0.5546875, + "learning_rate": 0.0001538781773543126, + "loss": 1.4488, + "step": 7480 + }, + { + "epoch": 0.39, + "grad_norm": 0.54296875, + "learning_rate": 0.0001538020812329781, + "loss": 1.4453, + "step": 7485 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.00015372594123939094, + "loss": 1.4087, + "step": 7490 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.00015364975743563858, + "loss": 1.4497, + "step": 7495 + }, + { + "epoch": 0.39, + "grad_norm": 0.55078125, + "learning_rate": 0.0001535735298838441, + "loss": 1.4217, + "step": 7500 + }, + { + "epoch": 0.39, + "grad_norm": 0.5, + "learning_rate": 0.00015349725864616639, + "loss": 1.4277, + "step": 7505 + }, + { + "epoch": 0.39, + "grad_norm": 0.5625, + "learning_rate": 0.00015342094378479988, + "loss": 1.4472, + "step": 7510 + }, + { + "epoch": 0.39, + "grad_norm": 0.54296875, + "learning_rate": 0.0001533445853619746, + "loss": 1.4571, + "step": 7515 + }, + { + "epoch": 0.39, + "grad_norm": 0.53125, + "learning_rate": 0.0001532681834399561, + "loss": 1.4408, + "step": 7520 + }, + { + "epoch": 0.39, + "grad_norm": 0.498046875, + "learning_rate": 0.0001531917380810454, + "loss": 1.4034, + "step": 7525 + }, + { + "epoch": 0.39, + "grad_norm": 0.546875, + "learning_rate": 0.00015311524934757893, + "loss": 1.4527, + "step": 7530 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.0001530387173019285, + "loss": 1.4673, + "step": 7535 + }, + { + "epoch": 0.39, + "grad_norm": 0.52734375, + "learning_rate": 0.00015296214200650126, + "loss": 1.4032, + "step": 7540 + }, + { + "epoch": 0.39, + "grad_norm": 0.5, + "learning_rate": 0.00015288552352373956, + "loss": 1.4031, + "step": 7545 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.000152808861916121, + "loss": 1.4263, + "step": 7550 + }, + { + "epoch": 0.39, + "grad_norm": 0.52734375, + "learning_rate": 0.00015273215724615846, + "loss": 1.4538, + "step": 7555 + }, + { + "epoch": 0.39, + "grad_norm": 0.50390625, + "learning_rate": 0.00015265540957639973, + "loss": 1.4294, + "step": 7560 + }, + { + "epoch": 0.39, + "grad_norm": 0.50390625, + "learning_rate": 0.00015257861896942777, + "loss": 1.4324, + "step": 7565 + }, + { + "epoch": 0.39, + "grad_norm": 0.52734375, + "learning_rate": 0.0001525017854878606, + "loss": 1.4431, + "step": 7570 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.0001524249091943511, + "loss": 1.4652, + "step": 7575 + }, + { + "epoch": 0.39, + "grad_norm": 0.50390625, + "learning_rate": 0.00015234799015158713, + "loss": 1.446, + "step": 7580 + }, + { + "epoch": 0.39, + "grad_norm": 0.546875, + "learning_rate": 0.00015227102842229134, + "loss": 1.4417, + "step": 7585 + }, + { + "epoch": 0.39, + "grad_norm": 0.498046875, + "learning_rate": 0.0001521940240692213, + "loss": 1.426, + "step": 7590 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.00015211697715516927, + "loss": 1.4178, + "step": 7595 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.0001520398877429622, + "loss": 1.4294, + "step": 7600 + }, + { + "epoch": 0.39, + "grad_norm": 0.515625, + "learning_rate": 0.00015196275589546168, + "loss": 1.442, + "step": 7605 + }, + { + "epoch": 0.39, + "grad_norm": 0.5234375, + "learning_rate": 0.000151885581675564, + "loss": 1.4626, + "step": 7610 + }, + { + "epoch": 0.39, + "grad_norm": 0.5390625, + "learning_rate": 0.0001518083651461999, + "loss": 1.4439, + "step": 7615 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.0001517311063703347, + "loss": 1.3871, + "step": 7620 + }, + { + "epoch": 0.39, + "grad_norm": 0.55078125, + "learning_rate": 0.00015165380541096803, + "loss": 1.4404, + "step": 7625 + }, + { + "epoch": 0.39, + "grad_norm": 0.53515625, + "learning_rate": 0.00015157646233113412, + "loss": 1.4486, + "step": 7630 + }, + { + "epoch": 0.4, + "grad_norm": 0.5, + "learning_rate": 0.0001514990771939014, + "loss": 1.4766, + "step": 7635 + }, + { + "epoch": 0.4, + "grad_norm": 0.51171875, + "learning_rate": 0.00015142165006237266, + "loss": 1.4596, + "step": 7640 + }, + { + "epoch": 0.4, + "grad_norm": 0.515625, + "learning_rate": 0.0001513441809996849, + "loss": 1.4263, + "step": 7645 + }, + { + "epoch": 0.4, + "grad_norm": 0.5078125, + "learning_rate": 0.0001512666700690093, + "loss": 1.4117, + "step": 7650 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00015118911733355125, + "loss": 1.4577, + "step": 7655 + }, + { + "epoch": 0.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00015111152285655013, + "loss": 1.4462, + "step": 7660 + }, + { + "epoch": 0.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00015103388670127947, + "loss": 1.433, + "step": 7665 + }, + { + "epoch": 0.4, + "grad_norm": 0.5, + "learning_rate": 0.0001509562089310467, + "loss": 1.4355, + "step": 7670 + }, + { + "epoch": 0.4, + "grad_norm": 0.515625, + "learning_rate": 0.0001508784896091932, + "loss": 1.3756, + "step": 7675 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.0001508007287990943, + "loss": 1.4544, + "step": 7680 + }, + { + "epoch": 0.4, + "grad_norm": 0.5078125, + "learning_rate": 0.00015072292656415906, + "loss": 1.4182, + "step": 7685 + }, + { + "epoch": 0.4, + "grad_norm": 0.53125, + "learning_rate": 0.00015064508296783037, + "loss": 1.4554, + "step": 7690 + }, + { + "epoch": 0.4, + "grad_norm": 0.498046875, + "learning_rate": 0.00015056719807358485, + "loss": 1.4284, + "step": 7695 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00015048927194493276, + "loss": 1.4262, + "step": 7700 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00015041130464541808, + "loss": 1.4831, + "step": 7705 + }, + { + "epoch": 0.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00015033329623861822, + "loss": 1.4461, + "step": 7710 + }, + { + "epoch": 0.4, + "grad_norm": 0.4921875, + "learning_rate": 0.00015025524678814427, + "loss": 1.4351, + "step": 7715 + }, + { + "epoch": 0.4, + "grad_norm": 0.515625, + "learning_rate": 0.00015017715635764063, + "loss": 1.4615, + "step": 7720 + }, + { + "epoch": 0.4, + "grad_norm": 0.498046875, + "learning_rate": 0.00015009902501078525, + "loss": 1.4451, + "step": 7725 + }, + { + "epoch": 0.4, + "grad_norm": 0.484375, + "learning_rate": 0.0001500208528112893, + "loss": 1.424, + "step": 7730 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00014994263982289746, + "loss": 1.4405, + "step": 7735 + }, + { + "epoch": 0.4, + "grad_norm": 0.51171875, + "learning_rate": 0.00014986438610938748, + "loss": 1.4386, + "step": 7740 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00014978609173457044, + "loss": 1.4277, + "step": 7745 + }, + { + "epoch": 0.4, + "grad_norm": 0.5546875, + "learning_rate": 0.0001497077567622905, + "loss": 1.4664, + "step": 7750 + }, + { + "epoch": 0.4, + "grad_norm": 0.515625, + "learning_rate": 0.00014962938125642503, + "loss": 1.4372, + "step": 7755 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00014955096528088428, + "loss": 1.4382, + "step": 7760 + }, + { + "epoch": 0.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00014947250889961168, + "loss": 1.4066, + "step": 7765 + }, + { + "epoch": 0.4, + "grad_norm": 0.5078125, + "learning_rate": 0.0001493940121765835, + "loss": 1.4029, + "step": 7770 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.00014931547517580898, + "loss": 1.4455, + "step": 7775 + }, + { + "epoch": 0.4, + "grad_norm": 0.51171875, + "learning_rate": 0.00014923689796133007, + "loss": 1.4802, + "step": 7780 + }, + { + "epoch": 0.4, + "grad_norm": 0.498046875, + "learning_rate": 0.0001491582805972217, + "loss": 1.4362, + "step": 7785 + }, + { + "epoch": 0.4, + "grad_norm": 0.5390625, + "learning_rate": 0.00014907962314759143, + "loss": 1.4446, + "step": 7790 + }, + { + "epoch": 0.4, + "grad_norm": 0.5078125, + "learning_rate": 0.00014900092567657946, + "loss": 1.4295, + "step": 7795 + }, + { + "epoch": 0.4, + "grad_norm": 0.50390625, + "learning_rate": 0.00014892218824835872, + "loss": 1.4239, + "step": 7800 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 0.0001488434109271347, + "loss": 1.4603, + "step": 7805 + }, + { + "epoch": 0.4, + "grad_norm": 0.51171875, + "learning_rate": 0.00014876459377714541, + "loss": 1.458, + "step": 7810 + }, + { + "epoch": 0.4, + "grad_norm": 0.5, + "learning_rate": 0.0001486857368626613, + "loss": 1.4326, + "step": 7815 + }, + { + "epoch": 0.4, + "grad_norm": 0.55078125, + "learning_rate": 0.00014860684024798536, + "loss": 1.4336, + "step": 7820 + }, + { + "epoch": 0.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00014852790399745276, + "loss": 1.4285, + "step": 7825 + }, + { + "epoch": 0.41, + "grad_norm": 0.515625, + "learning_rate": 0.00014844892817543118, + "loss": 1.444, + "step": 7830 + }, + { + "epoch": 0.41, + "grad_norm": 0.54296875, + "learning_rate": 0.00014836991284632048, + "loss": 1.4552, + "step": 7835 + }, + { + "epoch": 0.41, + "grad_norm": 0.53515625, + "learning_rate": 0.00014829085807455274, + "loss": 1.4585, + "step": 7840 + }, + { + "epoch": 0.41, + "grad_norm": 0.51953125, + "learning_rate": 0.00014821176392459224, + "loss": 1.4027, + "step": 7845 + }, + { + "epoch": 0.41, + "grad_norm": 0.546875, + "learning_rate": 0.0001481326304609353, + "loss": 1.4385, + "step": 7850 + }, + { + "epoch": 0.41, + "grad_norm": 0.54296875, + "learning_rate": 0.0001480534577481104, + "loss": 1.4407, + "step": 7855 + }, + { + "epoch": 0.41, + "grad_norm": 0.51953125, + "learning_rate": 0.00014797424585067789, + "loss": 1.4317, + "step": 7860 + }, + { + "epoch": 0.41, + "grad_norm": 0.53125, + "learning_rate": 0.0001478949948332302, + "loss": 1.4478, + "step": 7865 + }, + { + "epoch": 0.41, + "grad_norm": 0.546875, + "learning_rate": 0.00014781570476039163, + "loss": 1.4349, + "step": 7870 + }, + { + "epoch": 0.41, + "grad_norm": 0.54296875, + "learning_rate": 0.00014773637569681823, + "loss": 1.4618, + "step": 7875 + }, + { + "epoch": 0.41, + "grad_norm": 0.515625, + "learning_rate": 0.00014765700770719796, + "loss": 1.4413, + "step": 7880 + }, + { + "epoch": 0.41, + "grad_norm": 0.515625, + "learning_rate": 0.00014757760085625047, + "loss": 1.459, + "step": 7885 + }, + { + "epoch": 0.41, + "grad_norm": 0.55078125, + "learning_rate": 0.00014749815520872717, + "loss": 1.4563, + "step": 7890 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.00014741867082941095, + "loss": 1.461, + "step": 7895 + }, + { + "epoch": 0.41, + "grad_norm": 0.5078125, + "learning_rate": 0.00014733914778311647, + "loss": 1.4266, + "step": 7900 + }, + { + "epoch": 0.41, + "grad_norm": 0.49609375, + "learning_rate": 0.00014725958613468976, + "loss": 1.4292, + "step": 7905 + }, + { + "epoch": 0.41, + "grad_norm": 0.546875, + "learning_rate": 0.00014717998594900844, + "loss": 1.4546, + "step": 7910 + }, + { + "epoch": 0.41, + "grad_norm": 0.51953125, + "learning_rate": 0.0001471003472909815, + "loss": 1.4307, + "step": 7915 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.0001470206702255493, + "loss": 1.4127, + "step": 7920 + }, + { + "epoch": 0.41, + "grad_norm": 0.546875, + "learning_rate": 0.00014694095481768358, + "loss": 1.418, + "step": 7925 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.00014686120113238725, + "loss": 1.4108, + "step": 7930 + }, + { + "epoch": 0.41, + "grad_norm": 0.49609375, + "learning_rate": 0.00014678140923469452, + "loss": 1.4327, + "step": 7935 + }, + { + "epoch": 0.41, + "grad_norm": 0.5390625, + "learning_rate": 0.0001467015791896707, + "loss": 1.4625, + "step": 7940 + }, + { + "epoch": 0.41, + "grad_norm": 0.546875, + "learning_rate": 0.00014662171106241223, + "loss": 1.4198, + "step": 7945 + }, + { + "epoch": 0.41, + "grad_norm": 0.54296875, + "learning_rate": 0.0001465418049180466, + "loss": 1.4784, + "step": 7950 + }, + { + "epoch": 0.41, + "grad_norm": 0.5234375, + "learning_rate": 0.00014646186082173233, + "loss": 1.4499, + "step": 7955 + }, + { + "epoch": 0.41, + "grad_norm": 0.51953125, + "learning_rate": 0.0001463818788386588, + "loss": 1.4569, + "step": 7960 + }, + { + "epoch": 0.41, + "grad_norm": 0.53515625, + "learning_rate": 0.00014630185903404642, + "loss": 1.4456, + "step": 7965 + }, + { + "epoch": 0.41, + "grad_norm": 0.55078125, + "learning_rate": 0.00014622180147314632, + "loss": 1.4491, + "step": 7970 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.0001461417062212405, + "loss": 1.464, + "step": 7975 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.00014606157334364162, + "loss": 1.4202, + "step": 7980 + }, + { + "epoch": 0.41, + "grad_norm": 0.5078125, + "learning_rate": 0.00014598140290569307, + "loss": 1.4205, + "step": 7985 + }, + { + "epoch": 0.41, + "grad_norm": 0.52734375, + "learning_rate": 0.00014590119497276887, + "loss": 1.4561, + "step": 7990 + }, + { + "epoch": 0.41, + "grad_norm": 0.5078125, + "learning_rate": 0.0001458209496102736, + "loss": 1.4398, + "step": 7995 + }, + { + "epoch": 0.41, + "grad_norm": 0.53515625, + "learning_rate": 0.00014574066688364235, + "loss": 1.4299, + "step": 8000 + }, + { + "epoch": 0.41, + "grad_norm": 0.5390625, + "learning_rate": 0.0001456603468583407, + "loss": 1.4382, + "step": 8005 + }, + { + "epoch": 0.41, + "grad_norm": 0.5234375, + "learning_rate": 0.00014557998959986466, + "loss": 1.4097, + "step": 8010 + }, + { + "epoch": 0.41, + "grad_norm": 0.53125, + "learning_rate": 0.00014549959517374056, + "loss": 1.4518, + "step": 8015 + }, + { + "epoch": 0.41, + "grad_norm": 0.50390625, + "learning_rate": 0.00014541916364552504, + "loss": 1.438, + "step": 8020 + }, + { + "epoch": 0.42, + "grad_norm": 0.5546875, + "learning_rate": 0.00014533869508080504, + "loss": 1.458, + "step": 8025 + }, + { + "epoch": 0.42, + "grad_norm": 0.5078125, + "learning_rate": 0.00014525818954519765, + "loss": 1.4477, + "step": 8030 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.0001451776471043502, + "loss": 1.4434, + "step": 8035 + }, + { + "epoch": 0.42, + "grad_norm": 0.53125, + "learning_rate": 0.00014509706782393992, + "loss": 1.4283, + "step": 8040 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234375, + "learning_rate": 0.00014501645176967428, + "loss": 1.452, + "step": 8045 + }, + { + "epoch": 0.42, + "grad_norm": 0.51171875, + "learning_rate": 0.00014493579900729065, + "loss": 1.4493, + "step": 8050 + }, + { + "epoch": 0.42, + "grad_norm": 0.49609375, + "learning_rate": 0.00014485510960255638, + "loss": 1.4199, + "step": 8055 + }, + { + "epoch": 0.42, + "grad_norm": 0.515625, + "learning_rate": 0.0001447743836212686, + "loss": 1.4594, + "step": 8060 + }, + { + "epoch": 0.42, + "grad_norm": 0.5546875, + "learning_rate": 0.00014469362112925436, + "loss": 1.4306, + "step": 8065 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390625, + "learning_rate": 0.0001446128221923704, + "loss": 1.4349, + "step": 8070 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014453198687650336, + "loss": 1.459, + "step": 8075 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390625, + "learning_rate": 0.0001444511152475693, + "loss": 1.4249, + "step": 8080 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014437020737151403, + "loss": 1.4304, + "step": 8085 + }, + { + "epoch": 0.42, + "grad_norm": 0.53125, + "learning_rate": 0.00014428926331431293, + "loss": 1.428, + "step": 8090 + }, + { + "epoch": 0.42, + "grad_norm": 0.51171875, + "learning_rate": 0.00014420828314197078, + "loss": 1.4365, + "step": 8095 + }, + { + "epoch": 0.42, + "grad_norm": 0.50390625, + "learning_rate": 0.00014412726692052195, + "loss": 1.4066, + "step": 8100 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014404621471603005, + "loss": 1.4333, + "step": 8105 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014396512659458824, + "loss": 1.4428, + "step": 8110 + }, + { + "epoch": 0.42, + "grad_norm": 0.54296875, + "learning_rate": 0.0001438840026223187, + "loss": 1.4507, + "step": 8115 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234375, + "learning_rate": 0.00014380284286537307, + "loss": 1.4596, + "step": 8120 + }, + { + "epoch": 0.42, + "grad_norm": 0.51953125, + "learning_rate": 0.00014372164738993206, + "loss": 1.4695, + "step": 8125 + }, + { + "epoch": 0.42, + "grad_norm": 0.484375, + "learning_rate": 0.00014364041626220556, + "loss": 1.4101, + "step": 8130 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234375, + "learning_rate": 0.00014355914954843247, + "loss": 1.424, + "step": 8135 + }, + { + "epoch": 0.42, + "grad_norm": 0.5625, + "learning_rate": 0.00014347784731488078, + "loss": 1.448, + "step": 8140 + }, + { + "epoch": 0.42, + "grad_norm": 0.51171875, + "learning_rate": 0.00014339650962784736, + "loss": 1.4513, + "step": 8145 + }, + { + "epoch": 0.42, + "grad_norm": 0.515625, + "learning_rate": 0.00014331513655365806, + "loss": 1.4363, + "step": 8150 + }, + { + "epoch": 0.42, + "grad_norm": 0.5078125, + "learning_rate": 0.00014323372815866757, + "loss": 1.44, + "step": 8155 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014315228450925943, + "loss": 1.469, + "step": 8160 + }, + { + "epoch": 0.42, + "grad_norm": 0.515625, + "learning_rate": 0.00014307080567184575, + "loss": 1.4107, + "step": 8165 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234375, + "learning_rate": 0.00014298929171286753, + "loss": 1.4158, + "step": 8170 + }, + { + "epoch": 0.42, + "grad_norm": 0.53125, + "learning_rate": 0.00014290774269879434, + "loss": 1.417, + "step": 8175 + }, + { + "epoch": 0.42, + "grad_norm": 0.53515625, + "learning_rate": 0.00014282615869612433, + "loss": 1.45, + "step": 8180 + }, + { + "epoch": 0.42, + "grad_norm": 0.5, + "learning_rate": 0.00014274453977138415, + "loss": 1.4191, + "step": 8185 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234375, + "learning_rate": 0.000142662885991129, + "loss": 1.446, + "step": 8190 + }, + { + "epoch": 0.42, + "grad_norm": 0.50390625, + "learning_rate": 0.00014258119742194242, + "loss": 1.4128, + "step": 8195 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390625, + "learning_rate": 0.00014249947413043642, + "loss": 1.4124, + "step": 8200 + }, + { + "epoch": 0.42, + "grad_norm": 0.52734375, + "learning_rate": 0.00014241771618325123, + "loss": 1.4235, + "step": 8205 + }, + { + "epoch": 0.42, + "grad_norm": 0.55859375, + "learning_rate": 0.00014233592364705535, + "loss": 1.4351, + "step": 8210 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.0001422540965885455, + "loss": 1.4278, + "step": 8215 + }, + { + "epoch": 0.43, + "grad_norm": 0.52734375, + "learning_rate": 0.00014217223507444662, + "loss": 1.4011, + "step": 8220 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00014209033917151167, + "loss": 1.4438, + "step": 8225 + }, + { + "epoch": 0.43, + "grad_norm": 0.5390625, + "learning_rate": 0.00014200840894652167, + "loss": 1.4237, + "step": 8230 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00014192644446628556, + "loss": 1.3854, + "step": 8235 + }, + { + "epoch": 0.43, + "grad_norm": 0.515625, + "learning_rate": 0.00014184444579764036, + "loss": 1.4199, + "step": 8240 + }, + { + "epoch": 0.43, + "grad_norm": 0.52734375, + "learning_rate": 0.0001417624130074508, + "loss": 1.4375, + "step": 8245 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00014168034616260963, + "loss": 1.4375, + "step": 8250 + }, + { + "epoch": 0.43, + "grad_norm": 0.515625, + "learning_rate": 0.00014159824533003718, + "loss": 1.4656, + "step": 8255 + }, + { + "epoch": 0.43, + "grad_norm": 0.546875, + "learning_rate": 0.0001415161105766816, + "loss": 1.4362, + "step": 8260 + }, + { + "epoch": 0.43, + "grad_norm": 0.51171875, + "learning_rate": 0.0001414339419695187, + "loss": 1.4265, + "step": 8265 + }, + { + "epoch": 0.43, + "grad_norm": 0.53515625, + "learning_rate": 0.00014135173957555182, + "loss": 1.4623, + "step": 8270 + }, + { + "epoch": 0.43, + "grad_norm": 0.5, + "learning_rate": 0.00014126950346181195, + "loss": 1.4236, + "step": 8275 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00014118723369535747, + "loss": 1.4621, + "step": 8280 + }, + { + "epoch": 0.43, + "grad_norm": 0.53515625, + "learning_rate": 0.0001411049303432743, + "loss": 1.4385, + "step": 8285 + }, + { + "epoch": 0.43, + "grad_norm": 0.5703125, + "learning_rate": 0.00014102259347267574, + "loss": 1.4639, + "step": 8290 + }, + { + "epoch": 0.43, + "grad_norm": 0.55859375, + "learning_rate": 0.00014094022315070236, + "loss": 1.4278, + "step": 8295 + }, + { + "epoch": 0.43, + "grad_norm": 0.5078125, + "learning_rate": 0.00014085781944452201, + "loss": 1.426, + "step": 8300 + }, + { + "epoch": 0.43, + "grad_norm": 0.5078125, + "learning_rate": 0.0001407753824213298, + "loss": 1.4421, + "step": 8305 + }, + { + "epoch": 0.43, + "grad_norm": 0.51953125, + "learning_rate": 0.00014069291214834802, + "loss": 1.4385, + "step": 8310 + }, + { + "epoch": 0.43, + "grad_norm": 0.51953125, + "learning_rate": 0.00014061040869282608, + "loss": 1.4284, + "step": 8315 + }, + { + "epoch": 0.43, + "grad_norm": 0.515625, + "learning_rate": 0.00014052787212204032, + "loss": 1.4321, + "step": 8320 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00014044530250329425, + "loss": 1.416, + "step": 8325 + }, + { + "epoch": 0.43, + "grad_norm": 0.50390625, + "learning_rate": 0.0001403626999039183, + "loss": 1.4127, + "step": 8330 + }, + { + "epoch": 0.43, + "grad_norm": 0.546875, + "learning_rate": 0.00014028006439126967, + "loss": 1.4603, + "step": 8335 + }, + { + "epoch": 0.43, + "grad_norm": 0.53125, + "learning_rate": 0.00014019739603273251, + "loss": 1.4149, + "step": 8340 + }, + { + "epoch": 0.43, + "grad_norm": 0.5546875, + "learning_rate": 0.00014011469489571776, + "loss": 1.4229, + "step": 8345 + }, + { + "epoch": 0.43, + "grad_norm": 0.50390625, + "learning_rate": 0.00014003196104766304, + "loss": 1.4199, + "step": 8350 + }, + { + "epoch": 0.43, + "grad_norm": 0.5078125, + "learning_rate": 0.00013994919455603263, + "loss": 1.4012, + "step": 8355 + }, + { + "epoch": 0.43, + "grad_norm": 0.546875, + "learning_rate": 0.00013986639548831752, + "loss": 1.4263, + "step": 8360 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00013978356391203514, + "loss": 1.4384, + "step": 8365 + }, + { + "epoch": 0.43, + "grad_norm": 0.50390625, + "learning_rate": 0.0001397006998947295, + "loss": 1.4276, + "step": 8370 + }, + { + "epoch": 0.43, + "grad_norm": 0.53125, + "learning_rate": 0.00013961780350397112, + "loss": 1.4205, + "step": 8375 + }, + { + "epoch": 0.43, + "grad_norm": 0.515625, + "learning_rate": 0.00013953487480735679, + "loss": 1.4579, + "step": 8380 + }, + { + "epoch": 0.43, + "grad_norm": 0.51171875, + "learning_rate": 0.00013945191387250972, + "loss": 1.4331, + "step": 8385 + }, + { + "epoch": 0.43, + "grad_norm": 0.52734375, + "learning_rate": 0.00013936892076707937, + "loss": 1.4885, + "step": 8390 + }, + { + "epoch": 0.43, + "grad_norm": 0.51953125, + "learning_rate": 0.0001392858955587415, + "loss": 1.397, + "step": 8395 + }, + { + "epoch": 0.43, + "grad_norm": 0.51953125, + "learning_rate": 0.00013920283831519802, + "loss": 1.4128, + "step": 8400 + }, + { + "epoch": 0.43, + "grad_norm": 0.5, + "learning_rate": 0.0001391197491041769, + "loss": 1.4026, + "step": 8405 + }, + { + "epoch": 0.44, + "grad_norm": 0.5703125, + "learning_rate": 0.00013903662799343226, + "loss": 1.4581, + "step": 8410 + }, + { + "epoch": 0.44, + "grad_norm": 0.55078125, + "learning_rate": 0.00013895347505074417, + "loss": 1.4506, + "step": 8415 + }, + { + "epoch": 0.44, + "grad_norm": 0.5234375, + "learning_rate": 0.0001388702903439187, + "loss": 1.4552, + "step": 8420 + }, + { + "epoch": 0.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00013878707394078782, + "loss": 1.433, + "step": 8425 + }, + { + "epoch": 0.44, + "grad_norm": 0.5078125, + "learning_rate": 0.00013870382590920933, + "loss": 1.4658, + "step": 8430 + }, + { + "epoch": 0.44, + "grad_norm": 1.25, + "learning_rate": 0.0001386205463170668, + "loss": 1.4267, + "step": 8435 + }, + { + "epoch": 0.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00013853723523226955, + "loss": 1.4343, + "step": 8440 + }, + { + "epoch": 0.44, + "grad_norm": 0.546875, + "learning_rate": 0.00013845389272275268, + "loss": 1.4299, + "step": 8445 + }, + { + "epoch": 0.44, + "grad_norm": 0.515625, + "learning_rate": 0.0001383705188564767, + "loss": 1.4275, + "step": 8450 + }, + { + "epoch": 0.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00013828711370142792, + "loss": 1.4335, + "step": 8455 + }, + { + "epoch": 0.44, + "grad_norm": 0.5546875, + "learning_rate": 0.00013820367732561803, + "loss": 1.4597, + "step": 8460 + }, + { + "epoch": 0.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00013812020979708418, + "loss": 1.422, + "step": 8465 + }, + { + "epoch": 0.44, + "grad_norm": 0.494140625, + "learning_rate": 0.00013803671118388895, + "loss": 1.4521, + "step": 8470 + }, + { + "epoch": 0.44, + "grad_norm": 0.5078125, + "learning_rate": 0.0001379531815541203, + "loss": 1.4216, + "step": 8475 + }, + { + "epoch": 0.44, + "grad_norm": 0.51953125, + "learning_rate": 0.00013786962097589144, + "loss": 1.4213, + "step": 8480 + }, + { + "epoch": 0.44, + "grad_norm": 0.53125, + "learning_rate": 0.0001377860295173408, + "loss": 1.4642, + "step": 8485 + }, + { + "epoch": 0.44, + "grad_norm": 0.484375, + "learning_rate": 0.00013770240724663208, + "loss": 1.397, + "step": 8490 + }, + { + "epoch": 0.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00013761875423195396, + "loss": 1.4256, + "step": 8495 + }, + { + "epoch": 0.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00013753507054152034, + "loss": 1.4535, + "step": 8500 + }, + { + "epoch": 0.44, + "grad_norm": 0.53125, + "learning_rate": 0.00013745135624357007, + "loss": 1.445, + "step": 8505 + }, + { + "epoch": 0.44, + "grad_norm": 0.5078125, + "learning_rate": 0.0001373676114063669, + "loss": 1.4233, + "step": 8510 + }, + { + "epoch": 0.44, + "grad_norm": 0.515625, + "learning_rate": 0.00013728383609819958, + "loss": 1.4573, + "step": 8515 + }, + { + "epoch": 0.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00013720003038738163, + "loss": 1.4007, + "step": 8520 + }, + { + "epoch": 0.44, + "grad_norm": 0.494140625, + "learning_rate": 0.00013711619434225145, + "loss": 1.4374, + "step": 8525 + }, + { + "epoch": 0.44, + "grad_norm": 0.51953125, + "learning_rate": 0.0001370323280311721, + "loss": 1.4193, + "step": 8530 + }, + { + "epoch": 0.44, + "grad_norm": 0.494140625, + "learning_rate": 0.00013694843152253132, + "loss": 1.4188, + "step": 8535 + }, + { + "epoch": 0.44, + "grad_norm": 0.53125, + "learning_rate": 0.00013686450488474154, + "loss": 1.4665, + "step": 8540 + }, + { + "epoch": 0.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00013678054818623965, + "loss": 1.4242, + "step": 8545 + }, + { + "epoch": 0.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00013669656149548718, + "loss": 1.4485, + "step": 8550 + }, + { + "epoch": 0.44, + "grad_norm": 0.5390625, + "learning_rate": 0.00013661254488097003, + "loss": 1.4056, + "step": 8555 + }, + { + "epoch": 0.44, + "grad_norm": 0.515625, + "learning_rate": 0.00013652849841119856, + "loss": 1.4034, + "step": 8560 + }, + { + "epoch": 0.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00013644442215470737, + "loss": 1.4411, + "step": 8565 + }, + { + "epoch": 0.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00013636031618005553, + "loss": 1.4342, + "step": 8570 + }, + { + "epoch": 0.44, + "grad_norm": 0.53125, + "learning_rate": 0.0001362761805558261, + "loss": 1.4277, + "step": 8575 + }, + { + "epoch": 0.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00013619201535062657, + "loss": 1.4415, + "step": 8580 + }, + { + "epoch": 0.44, + "grad_norm": 0.5546875, + "learning_rate": 0.00013610782063308837, + "loss": 1.4548, + "step": 8585 + }, + { + "epoch": 0.44, + "grad_norm": 0.515625, + "learning_rate": 0.00013602359647186708, + "loss": 1.4578, + "step": 8590 + }, + { + "epoch": 0.44, + "grad_norm": 0.515625, + "learning_rate": 0.00013593934293564222, + "loss": 1.4366, + "step": 8595 + }, + { + "epoch": 0.44, + "grad_norm": 0.5078125, + "learning_rate": 0.00013585506009311738, + "loss": 1.4126, + "step": 8600 + }, + { + "epoch": 0.45, + "grad_norm": 0.515625, + "learning_rate": 0.00013577074801301992, + "loss": 1.4599, + "step": 8605 + }, + { + "epoch": 0.45, + "grad_norm": 0.51953125, + "learning_rate": 0.00013568640676410115, + "loss": 1.3858, + "step": 8610 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013560203641513606, + "loss": 1.4707, + "step": 8615 + }, + { + "epoch": 0.45, + "grad_norm": 0.5, + "learning_rate": 0.0001355176370349235, + "loss": 1.4366, + "step": 8620 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013543320869228585, + "loss": 1.4617, + "step": 8625 + }, + { + "epoch": 0.45, + "grad_norm": 0.5, + "learning_rate": 0.00013534875145606925, + "loss": 1.3993, + "step": 8630 + }, + { + "epoch": 0.45, + "grad_norm": 0.52734375, + "learning_rate": 0.00013526426539514324, + "loss": 1.4384, + "step": 8635 + }, + { + "epoch": 0.45, + "grad_norm": 0.5, + "learning_rate": 0.00013517975057840097, + "loss": 1.4434, + "step": 8640 + }, + { + "epoch": 0.45, + "grad_norm": 0.4921875, + "learning_rate": 0.00013509520707475907, + "loss": 1.4362, + "step": 8645 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013501063495315743, + "loss": 1.4159, + "step": 8650 + }, + { + "epoch": 0.45, + "grad_norm": 0.51953125, + "learning_rate": 0.0001349260342825595, + "loss": 1.3869, + "step": 8655 + }, + { + "epoch": 0.45, + "grad_norm": 0.5078125, + "learning_rate": 0.00013484140513195166, + "loss": 1.3795, + "step": 8660 + }, + { + "epoch": 0.45, + "grad_norm": 0.54296875, + "learning_rate": 0.0001347567475703439, + "loss": 1.3984, + "step": 8665 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 0.00013467206166676914, + "loss": 1.4237, + "step": 8670 + }, + { + "epoch": 0.45, + "grad_norm": 0.52734375, + "learning_rate": 0.0001345873474902835, + "loss": 1.4428, + "step": 8675 + }, + { + "epoch": 0.45, + "grad_norm": 0.54296875, + "learning_rate": 0.000134502605109966, + "loss": 1.4228, + "step": 8680 + }, + { + "epoch": 0.45, + "grad_norm": 0.55859375, + "learning_rate": 0.00013441783459491893, + "loss": 1.4034, + "step": 8685 + }, + { + "epoch": 0.45, + "grad_norm": 0.5703125, + "learning_rate": 0.00013433303601426727, + "loss": 1.4506, + "step": 8690 + }, + { + "epoch": 0.45, + "grad_norm": 0.50390625, + "learning_rate": 0.0001342482094371591, + "loss": 1.4052, + "step": 8695 + }, + { + "epoch": 0.45, + "grad_norm": 0.54296875, + "learning_rate": 0.00013416335493276511, + "loss": 1.4684, + "step": 8700 + }, + { + "epoch": 0.45, + "grad_norm": 0.5390625, + "learning_rate": 0.00013407847257027896, + "loss": 1.3952, + "step": 8705 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013399356241891686, + "loss": 1.4229, + "step": 8710 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013390862454791785, + "loss": 1.4695, + "step": 8715 + }, + { + "epoch": 0.45, + "grad_norm": 0.5078125, + "learning_rate": 0.00013382365902654336, + "loss": 1.4372, + "step": 8720 + }, + { + "epoch": 0.45, + "grad_norm": 0.49609375, + "learning_rate": 0.00013373866592407765, + "loss": 1.4514, + "step": 8725 + }, + { + "epoch": 0.45, + "grad_norm": 0.5078125, + "learning_rate": 0.00013365364530982716, + "loss": 1.4135, + "step": 8730 + }, + { + "epoch": 0.45, + "grad_norm": 0.5234375, + "learning_rate": 0.00013356859725312104, + "loss": 1.4251, + "step": 8735 + }, + { + "epoch": 0.45, + "grad_norm": 0.5078125, + "learning_rate": 0.0001334835218233106, + "loss": 1.4028, + "step": 8740 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 0.00013339841908976963, + "loss": 1.3512, + "step": 8745 + }, + { + "epoch": 0.45, + "grad_norm": 0.55078125, + "learning_rate": 0.00013331328912189407, + "loss": 1.4351, + "step": 8750 + }, + { + "epoch": 0.45, + "grad_norm": 0.51953125, + "learning_rate": 0.00013322813198910212, + "loss": 1.4621, + "step": 8755 + }, + { + "epoch": 0.45, + "grad_norm": 0.51953125, + "learning_rate": 0.0001331429477608342, + "loss": 1.4065, + "step": 8760 + }, + { + "epoch": 0.45, + "grad_norm": 0.52734375, + "learning_rate": 0.00013305773650655267, + "loss": 1.4684, + "step": 8765 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013297249829574202, + "loss": 1.4672, + "step": 8770 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 0.00013288723319790875, + "loss": 1.4093, + "step": 8775 + }, + { + "epoch": 0.45, + "grad_norm": 0.498046875, + "learning_rate": 0.00013280194128258122, + "loss": 1.4403, + "step": 8780 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 0.00013271662261930971, + "loss": 1.4036, + "step": 8785 + }, + { + "epoch": 0.45, + "grad_norm": 0.53515625, + "learning_rate": 0.00013263127727766624, + "loss": 1.4239, + "step": 8790 + }, + { + "epoch": 0.46, + "grad_norm": 0.51171875, + "learning_rate": 0.00013254590532724468, + "loss": 1.4219, + "step": 8795 + }, + { + "epoch": 0.46, + "grad_norm": 0.5078125, + "learning_rate": 0.00013246050683766048, + "loss": 1.4247, + "step": 8800 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00013237508187855093, + "loss": 1.4521, + "step": 8805 + }, + { + "epoch": 0.46, + "grad_norm": 0.51953125, + "learning_rate": 0.0001322896305195746, + "loss": 1.4264, + "step": 8810 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00013220415283041195, + "loss": 1.4408, + "step": 8815 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00013211864888076457, + "loss": 1.4116, + "step": 8820 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00013203311874035567, + "loss": 1.4312, + "step": 8825 + }, + { + "epoch": 0.46, + "grad_norm": 0.53125, + "learning_rate": 0.00013194756247892977, + "loss": 1.4617, + "step": 8830 + }, + { + "epoch": 0.46, + "grad_norm": 0.54296875, + "learning_rate": 0.00013186198016625268, + "loss": 1.4574, + "step": 8835 + }, + { + "epoch": 0.46, + "grad_norm": 0.5625, + "learning_rate": 0.00013177637187211143, + "loss": 1.433, + "step": 8840 + }, + { + "epoch": 0.46, + "grad_norm": 0.515625, + "learning_rate": 0.00013169073766631427, + "loss": 1.4582, + "step": 8845 + }, + { + "epoch": 0.46, + "grad_norm": 0.53125, + "learning_rate": 0.00013160507761869063, + "loss": 1.4727, + "step": 8850 + }, + { + "epoch": 0.46, + "grad_norm": 0.51953125, + "learning_rate": 0.00013151939179909086, + "loss": 1.4673, + "step": 8855 + }, + { + "epoch": 0.46, + "grad_norm": 0.53515625, + "learning_rate": 0.0001314336802773865, + "loss": 1.4465, + "step": 8860 + }, + { + "epoch": 0.46, + "grad_norm": 0.49609375, + "learning_rate": 0.00013134794312346992, + "loss": 1.4471, + "step": 8865 + }, + { + "epoch": 0.46, + "grad_norm": 0.515625, + "learning_rate": 0.00013126218040725447, + "loss": 1.4217, + "step": 8870 + }, + { + "epoch": 0.46, + "grad_norm": 0.51953125, + "learning_rate": 0.00013117639219867427, + "loss": 1.4203, + "step": 8875 + }, + { + "epoch": 0.46, + "grad_norm": 0.53125, + "learning_rate": 0.00013109057856768434, + "loss": 1.4375, + "step": 8880 + }, + { + "epoch": 0.46, + "grad_norm": 0.546875, + "learning_rate": 0.00013100473958426028, + "loss": 1.4119, + "step": 8885 + }, + { + "epoch": 0.46, + "grad_norm": 0.55078125, + "learning_rate": 0.00013091887531839852, + "loss": 1.4325, + "step": 8890 + }, + { + "epoch": 0.46, + "grad_norm": 0.50390625, + "learning_rate": 0.00013083298584011597, + "loss": 1.3835, + "step": 8895 + }, + { + "epoch": 0.46, + "grad_norm": 0.5625, + "learning_rate": 0.0001307470712194502, + "loss": 1.4462, + "step": 8900 + }, + { + "epoch": 0.46, + "grad_norm": 0.515625, + "learning_rate": 0.0001306611315264592, + "loss": 1.4318, + "step": 8905 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00013057516683122152, + "loss": 1.431, + "step": 8910 + }, + { + "epoch": 0.46, + "grad_norm": 0.54296875, + "learning_rate": 0.00013048917720383593, + "loss": 1.435, + "step": 8915 + }, + { + "epoch": 0.46, + "grad_norm": 0.5078125, + "learning_rate": 0.00013040316271442173, + "loss": 1.4275, + "step": 8920 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.0001303171234331183, + "loss": 1.422, + "step": 8925 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00013023105943008539, + "loss": 1.4462, + "step": 8930 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.0001301449707755028, + "loss": 1.4093, + "step": 8935 + }, + { + "epoch": 0.46, + "grad_norm": 0.5546875, + "learning_rate": 0.00013005885753957048, + "loss": 1.4153, + "step": 8940 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00012997271979250843, + "loss": 1.4319, + "step": 8945 + }, + { + "epoch": 0.46, + "grad_norm": 0.5234375, + "learning_rate": 0.00012988655760455667, + "loss": 1.4326, + "step": 8950 + }, + { + "epoch": 0.46, + "grad_norm": 0.53125, + "learning_rate": 0.000129800371045975, + "loss": 1.3916, + "step": 8955 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00012971416018704333, + "loss": 1.4428, + "step": 8960 + }, + { + "epoch": 0.46, + "grad_norm": 0.515625, + "learning_rate": 0.00012962792509806117, + "loss": 1.4368, + "step": 8965 + }, + { + "epoch": 0.46, + "grad_norm": 0.5078125, + "learning_rate": 0.0001295416658493479, + "loss": 1.4149, + "step": 8970 + }, + { + "epoch": 0.46, + "grad_norm": 0.5703125, + "learning_rate": 0.0001294553825112426, + "loss": 1.4172, + "step": 8975 + }, + { + "epoch": 0.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00012936907515410392, + "loss": 1.4337, + "step": 8980 + }, + { + "epoch": 0.46, + "grad_norm": 0.5390625, + "learning_rate": 0.00012928274384831014, + "loss": 1.3825, + "step": 8985 + }, + { + "epoch": 0.47, + "grad_norm": 0.5, + "learning_rate": 0.00012919638866425913, + "loss": 1.4085, + "step": 8990 + }, + { + "epoch": 0.47, + "grad_norm": 0.51953125, + "learning_rate": 0.00012911000967236815, + "loss": 1.4035, + "step": 8995 + }, + { + "epoch": 0.47, + "grad_norm": 0.546875, + "learning_rate": 0.00012902360694307387, + "loss": 1.4052, + "step": 9000 + }, + { + "epoch": 0.47, + "grad_norm": 0.515625, + "learning_rate": 0.00012893718054683242, + "loss": 1.4425, + "step": 9005 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.00012885073055411903, + "loss": 1.4268, + "step": 9010 + }, + { + "epoch": 0.47, + "grad_norm": 0.53515625, + "learning_rate": 0.00012876425703542844, + "loss": 1.4251, + "step": 9015 + }, + { + "epoch": 0.47, + "grad_norm": 0.53515625, + "learning_rate": 0.00012867776006127428, + "loss": 1.4199, + "step": 9020 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390625, + "learning_rate": 0.00012859123970218958, + "loss": 1.4357, + "step": 9025 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.00012850469602872623, + "loss": 1.4119, + "step": 9030 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.0001284181291114553, + "loss": 1.3873, + "step": 9035 + }, + { + "epoch": 0.47, + "grad_norm": 0.52734375, + "learning_rate": 0.00012833153902096664, + "loss": 1.3895, + "step": 9040 + }, + { + "epoch": 0.47, + "grad_norm": 0.53125, + "learning_rate": 0.00012824492582786916, + "loss": 1.4456, + "step": 9045 + }, + { + "epoch": 0.47, + "grad_norm": 0.515625, + "learning_rate": 0.00012815828960279047, + "loss": 1.41, + "step": 9050 + }, + { + "epoch": 0.47, + "grad_norm": 0.52734375, + "learning_rate": 0.00012807163041637706, + "loss": 1.4373, + "step": 9055 + }, + { + "epoch": 0.47, + "grad_norm": 0.52734375, + "learning_rate": 0.0001279849483392941, + "loss": 1.3842, + "step": 9060 + }, + { + "epoch": 0.47, + "grad_norm": 0.55078125, + "learning_rate": 0.00012789824344222546, + "loss": 1.4345, + "step": 9065 + }, + { + "epoch": 0.47, + "grad_norm": 0.5078125, + "learning_rate": 0.00012781151579587357, + "loss": 1.4377, + "step": 9070 + }, + { + "epoch": 0.47, + "grad_norm": 0.55078125, + "learning_rate": 0.00012772476547095944, + "loss": 1.4504, + "step": 9075 + }, + { + "epoch": 0.47, + "grad_norm": 0.5078125, + "learning_rate": 0.00012763799253822256, + "loss": 1.4384, + "step": 9080 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.00012755119706842088, + "loss": 1.4835, + "step": 9085 + }, + { + "epoch": 0.47, + "grad_norm": 0.494140625, + "learning_rate": 0.00012746437913233066, + "loss": 1.4315, + "step": 9090 + }, + { + "epoch": 0.47, + "grad_norm": 0.5078125, + "learning_rate": 0.0001273775388007466, + "loss": 1.4752, + "step": 9095 + }, + { + "epoch": 0.47, + "grad_norm": 0.5625, + "learning_rate": 0.00012729067614448156, + "loss": 1.4255, + "step": 9100 + }, + { + "epoch": 0.47, + "grad_norm": 0.50390625, + "learning_rate": 0.00012720379123436665, + "loss": 1.422, + "step": 9105 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390625, + "learning_rate": 0.00012711688414125108, + "loss": 1.44, + "step": 9110 + }, + { + "epoch": 0.47, + "grad_norm": 0.515625, + "learning_rate": 0.0001270299549360022, + "loss": 1.4008, + "step": 9115 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.0001269430036895054, + "loss": 1.4218, + "step": 9120 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390625, + "learning_rate": 0.00012685603047266398, + "loss": 1.4274, + "step": 9125 + }, + { + "epoch": 0.47, + "grad_norm": 0.51953125, + "learning_rate": 0.0001267690353563992, + "loss": 1.4504, + "step": 9130 + }, + { + "epoch": 0.47, + "grad_norm": 0.53125, + "learning_rate": 0.00012668201841165017, + "loss": 1.4366, + "step": 9135 + }, + { + "epoch": 0.47, + "grad_norm": 0.55078125, + "learning_rate": 0.0001265949797093738, + "loss": 1.4325, + "step": 9140 + }, + { + "epoch": 0.47, + "grad_norm": 0.53125, + "learning_rate": 0.00012650791932054473, + "loss": 1.4445, + "step": 9145 + }, + { + "epoch": 0.47, + "grad_norm": 0.498046875, + "learning_rate": 0.00012642083731615532, + "loss": 1.4038, + "step": 9150 + }, + { + "epoch": 0.47, + "grad_norm": 0.515625, + "learning_rate": 0.0001263337337672155, + "loss": 1.4125, + "step": 9155 + }, + { + "epoch": 0.47, + "grad_norm": 0.58203125, + "learning_rate": 0.00012624660874475287, + "loss": 1.442, + "step": 9160 + }, + { + "epoch": 0.47, + "grad_norm": 0.51171875, + "learning_rate": 0.00012615946231981238, + "loss": 1.4092, + "step": 9165 + }, + { + "epoch": 0.47, + "grad_norm": 0.5078125, + "learning_rate": 0.00012607229456345658, + "loss": 1.4424, + "step": 9170 + }, + { + "epoch": 0.47, + "grad_norm": 0.546875, + "learning_rate": 0.0001259851055467653, + "loss": 1.4196, + "step": 9175 + }, + { + "epoch": 0.47, + "grad_norm": 0.5625, + "learning_rate": 0.00012589789534083582, + "loss": 1.422, + "step": 9180 + }, + { + "epoch": 0.48, + "grad_norm": 0.54296875, + "learning_rate": 0.0001258106640167826, + "loss": 1.4399, + "step": 9185 + }, + { + "epoch": 0.48, + "grad_norm": 0.478515625, + "learning_rate": 0.0001257234116457374, + "loss": 1.4069, + "step": 9190 + }, + { + "epoch": 0.48, + "grad_norm": 0.51171875, + "learning_rate": 0.0001256361382988491, + "loss": 1.4012, + "step": 9195 + }, + { + "epoch": 0.48, + "grad_norm": 0.515625, + "learning_rate": 0.00012554884404728368, + "loss": 1.3978, + "step": 9200 + }, + { + "epoch": 0.48, + "grad_norm": 0.51953125, + "learning_rate": 0.00012546152896222417, + "loss": 1.3993, + "step": 9205 + }, + { + "epoch": 0.48, + "grad_norm": 0.52734375, + "learning_rate": 0.00012537419311487057, + "loss": 1.419, + "step": 9210 + }, + { + "epoch": 0.48, + "grad_norm": 0.51953125, + "learning_rate": 0.00012528683657643988, + "loss": 1.4224, + "step": 9215 + }, + { + "epoch": 0.48, + "grad_norm": 0.56640625, + "learning_rate": 0.0001251994594181659, + "loss": 1.4741, + "step": 9220 + }, + { + "epoch": 0.48, + "grad_norm": 0.53125, + "learning_rate": 0.00012511206171129927, + "loss": 1.4216, + "step": 9225 + }, + { + "epoch": 0.48, + "grad_norm": 0.55078125, + "learning_rate": 0.00012502464352710742, + "loss": 1.4371, + "step": 9230 + }, + { + "epoch": 0.48, + "grad_norm": 0.5, + "learning_rate": 0.0001249372049368744, + "loss": 1.4272, + "step": 9235 + }, + { + "epoch": 0.48, + "grad_norm": 0.51953125, + "learning_rate": 0.00012484974601190097, + "loss": 1.4451, + "step": 9240 + }, + { + "epoch": 0.48, + "grad_norm": 0.5078125, + "learning_rate": 0.00012476226682350442, + "loss": 1.4301, + "step": 9245 + }, + { + "epoch": 0.48, + "grad_norm": 0.515625, + "learning_rate": 0.00012467476744301866, + "loss": 1.4413, + "step": 9250 + }, + { + "epoch": 0.48, + "grad_norm": 0.52734375, + "learning_rate": 0.00012458724794179392, + "loss": 1.3858, + "step": 9255 + }, + { + "epoch": 0.48, + "grad_norm": 0.5390625, + "learning_rate": 0.00012449970839119697, + "loss": 1.4114, + "step": 9260 + }, + { + "epoch": 0.48, + "grad_norm": 0.5546875, + "learning_rate": 0.00012441214886261076, + "loss": 1.4585, + "step": 9265 + }, + { + "epoch": 0.48, + "grad_norm": 0.5234375, + "learning_rate": 0.00012432456942743477, + "loss": 1.4263, + "step": 9270 + }, + { + "epoch": 0.48, + "grad_norm": 0.51953125, + "learning_rate": 0.00012423697015708456, + "loss": 1.4141, + "step": 9275 + }, + { + "epoch": 0.48, + "grad_norm": 0.53515625, + "learning_rate": 0.0001241493511229918, + "loss": 1.4286, + "step": 9280 + }, + { + "epoch": 0.48, + "grad_norm": 0.5078125, + "learning_rate": 0.0001240617123966045, + "loss": 1.404, + "step": 9285 + }, + { + "epoch": 0.48, + "grad_norm": 0.5546875, + "learning_rate": 0.00012397405404938652, + "loss": 1.4444, + "step": 9290 + }, + { + "epoch": 0.48, + "grad_norm": 0.51171875, + "learning_rate": 0.00012388637615281777, + "loss": 1.3767, + "step": 9295 + }, + { + "epoch": 0.48, + "grad_norm": 0.51953125, + "learning_rate": 0.00012379867877839414, + "loss": 1.4326, + "step": 9300 + }, + { + "epoch": 0.48, + "grad_norm": 0.53125, + "learning_rate": 0.00012371096199762747, + "loss": 1.4363, + "step": 9305 + }, + { + "epoch": 0.48, + "grad_norm": 0.5078125, + "learning_rate": 0.0001236232258820452, + "loss": 1.3958, + "step": 9310 + }, + { + "epoch": 0.48, + "grad_norm": 0.52734375, + "learning_rate": 0.0001235354705031908, + "loss": 1.4464, + "step": 9315 + }, + { + "epoch": 0.48, + "grad_norm": 0.52734375, + "learning_rate": 0.00012344769593262324, + "loss": 1.3948, + "step": 9320 + }, + { + "epoch": 0.48, + "grad_norm": 0.546875, + "learning_rate": 0.0001233599022419173, + "loss": 1.4061, + "step": 9325 + }, + { + "epoch": 0.48, + "grad_norm": 0.53125, + "learning_rate": 0.0001232720895026632, + "loss": 1.4299, + "step": 9330 + }, + { + "epoch": 0.48, + "grad_norm": 0.51171875, + "learning_rate": 0.00012318425778646685, + "loss": 1.4029, + "step": 9335 + }, + { + "epoch": 0.48, + "grad_norm": 0.50390625, + "learning_rate": 0.0001230964071649495, + "loss": 1.37, + "step": 9340 + }, + { + "epoch": 0.48, + "grad_norm": 0.51171875, + "learning_rate": 0.00012300853770974787, + "loss": 1.3944, + "step": 9345 + }, + { + "epoch": 0.48, + "grad_norm": 0.5078125, + "learning_rate": 0.00012292064949251405, + "loss": 1.3961, + "step": 9350 + }, + { + "epoch": 0.48, + "grad_norm": 0.54296875, + "learning_rate": 0.00012283274258491543, + "loss": 1.3656, + "step": 9355 + }, + { + "epoch": 0.48, + "grad_norm": 0.53515625, + "learning_rate": 0.00012274481705863463, + "loss": 1.4532, + "step": 9360 + }, + { + "epoch": 0.48, + "grad_norm": 0.50390625, + "learning_rate": 0.00012265687298536942, + "loss": 1.4187, + "step": 9365 + }, + { + "epoch": 0.48, + "grad_norm": 0.50390625, + "learning_rate": 0.00012256891043683276, + "loss": 1.4343, + "step": 9370 + }, + { + "epoch": 0.49, + "grad_norm": 0.515625, + "learning_rate": 0.00012248092948475263, + "loss": 1.4114, + "step": 9375 + }, + { + "epoch": 0.49, + "grad_norm": 0.53515625, + "learning_rate": 0.000122392930200872, + "loss": 1.3942, + "step": 9380 + }, + { + "epoch": 0.49, + "grad_norm": 0.52734375, + "learning_rate": 0.00012230491265694888, + "loss": 1.4235, + "step": 9385 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.000122216876924756, + "loss": 1.4181, + "step": 9390 + }, + { + "epoch": 0.49, + "grad_norm": 0.5078125, + "learning_rate": 0.00012212882307608116, + "loss": 1.4147, + "step": 9395 + }, + { + "epoch": 0.49, + "grad_norm": 0.53515625, + "learning_rate": 0.00012204075118272669, + "loss": 1.3878, + "step": 9400 + }, + { + "epoch": 0.49, + "grad_norm": 0.55078125, + "learning_rate": 0.0001219526613165098, + "loss": 1.4422, + "step": 9405 + }, + { + "epoch": 0.49, + "grad_norm": 0.52734375, + "learning_rate": 0.00012186455354926228, + "loss": 1.451, + "step": 9410 + }, + { + "epoch": 0.49, + "grad_norm": 0.5390625, + "learning_rate": 0.00012177642795283053, + "loss": 1.4068, + "step": 9415 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.00012168828459907551, + "loss": 1.4096, + "step": 9420 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.00012160012355987265, + "loss": 1.4282, + "step": 9425 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.00012151194490711178, + "loss": 1.4695, + "step": 9430 + }, + { + "epoch": 0.49, + "grad_norm": 0.55078125, + "learning_rate": 0.00012142374871269713, + "loss": 1.4556, + "step": 9435 + }, + { + "epoch": 0.49, + "grad_norm": 0.50390625, + "learning_rate": 0.00012133553504854718, + "loss": 1.3891, + "step": 9440 + }, + { + "epoch": 0.49, + "grad_norm": 0.5, + "learning_rate": 0.00012124730398659474, + "loss": 1.4015, + "step": 9445 + }, + { + "epoch": 0.49, + "grad_norm": 0.51953125, + "learning_rate": 0.0001211590555987867, + "loss": 1.4103, + "step": 9450 + }, + { + "epoch": 0.49, + "grad_norm": 0.53125, + "learning_rate": 0.00012107078995708417, + "loss": 1.4507, + "step": 9455 + }, + { + "epoch": 0.49, + "grad_norm": 0.5390625, + "learning_rate": 0.00012098250713346231, + "loss": 1.4796, + "step": 9460 + }, + { + "epoch": 0.49, + "grad_norm": 0.55078125, + "learning_rate": 0.00012089420719991022, + "loss": 1.4649, + "step": 9465 + }, + { + "epoch": 0.49, + "grad_norm": 0.515625, + "learning_rate": 0.00012080589022843107, + "loss": 1.4003, + "step": 9470 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.0001207175562910418, + "loss": 1.4453, + "step": 9475 + }, + { + "epoch": 0.49, + "grad_norm": 0.515625, + "learning_rate": 0.00012062920545977327, + "loss": 1.4391, + "step": 9480 + }, + { + "epoch": 0.49, + "grad_norm": 0.515625, + "learning_rate": 0.00012054083780667012, + "loss": 1.4181, + "step": 9485 + }, + { + "epoch": 0.49, + "grad_norm": 0.51171875, + "learning_rate": 0.00012045245340379063, + "loss": 1.4388, + "step": 9490 + }, + { + "epoch": 0.49, + "grad_norm": 0.5078125, + "learning_rate": 0.0001203640523232068, + "loss": 1.44, + "step": 9495 + }, + { + "epoch": 0.49, + "grad_norm": 0.5390625, + "learning_rate": 0.00012027563463700427, + "loss": 1.4697, + "step": 9500 + }, + { + "epoch": 0.49, + "grad_norm": 0.5390625, + "learning_rate": 0.00012018720041728206, + "loss": 1.4354, + "step": 9505 + }, + { + "epoch": 0.49, + "grad_norm": 0.51171875, + "learning_rate": 0.00012009874973615287, + "loss": 1.3918, + "step": 9510 + }, + { + "epoch": 0.49, + "grad_norm": 0.51171875, + "learning_rate": 0.00012001028266574268, + "loss": 1.4441, + "step": 9515 + }, + { + "epoch": 0.49, + "grad_norm": 0.5078125, + "learning_rate": 0.00011992179927819093, + "loss": 1.4441, + "step": 9520 + }, + { + "epoch": 0.49, + "grad_norm": 0.5390625, + "learning_rate": 0.00011983329964565028, + "loss": 1.378, + "step": 9525 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.00011974478384028672, + "loss": 1.4261, + "step": 9530 + }, + { + "epoch": 0.49, + "grad_norm": 0.54296875, + "learning_rate": 0.00011965625193427934, + "loss": 1.4045, + "step": 9535 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.00011956770399982045, + "loss": 1.4679, + "step": 9540 + }, + { + "epoch": 0.49, + "grad_norm": 0.5078125, + "learning_rate": 0.00011947914010911534, + "loss": 1.4444, + "step": 9545 + }, + { + "epoch": 0.49, + "grad_norm": 0.52734375, + "learning_rate": 0.0001193905603343824, + "loss": 1.434, + "step": 9550 + }, + { + "epoch": 0.49, + "grad_norm": 0.50390625, + "learning_rate": 0.00011930196474785294, + "loss": 1.4215, + "step": 9555 + }, + { + "epoch": 0.49, + "grad_norm": 0.5234375, + "learning_rate": 0.00011921335342177111, + "loss": 1.4442, + "step": 9560 + }, + { + "epoch": 0.49, + "grad_norm": 0.515625, + "learning_rate": 0.00011912472642839394, + "loss": 1.4107, + "step": 9565 + }, + { + "epoch": 0.5, + "grad_norm": 0.515625, + "learning_rate": 0.00011903608383999125, + "loss": 1.3996, + "step": 9570 + }, + { + "epoch": 0.5, + "grad_norm": 0.53515625, + "learning_rate": 0.00011894742572884554, + "loss": 1.3999, + "step": 9575 + }, + { + "epoch": 0.5, + "grad_norm": 0.51953125, + "learning_rate": 0.00011885875216725205, + "loss": 1.4767, + "step": 9580 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.00011877006322751847, + "loss": 1.4236, + "step": 9585 + }, + { + "epoch": 0.5, + "grad_norm": 0.51171875, + "learning_rate": 0.00011868135898196519, + "loss": 1.429, + "step": 9590 + }, + { + "epoch": 0.5, + "grad_norm": 0.53515625, + "learning_rate": 0.00011859263950292496, + "loss": 1.4233, + "step": 9595 + }, + { + "epoch": 0.5, + "grad_norm": 0.515625, + "learning_rate": 0.00011850390486274303, + "loss": 1.4237, + "step": 9600 + }, + { + "epoch": 0.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00011841515513377697, + "loss": 1.4137, + "step": 9605 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.00011832639038839666, + "loss": 1.4343, + "step": 9610 + }, + { + "epoch": 0.5, + "grad_norm": 0.484375, + "learning_rate": 0.00011823761069898425, + "loss": 1.4201, + "step": 9615 + }, + { + "epoch": 0.5, + "grad_norm": 0.53515625, + "learning_rate": 0.00011814881613793404, + "loss": 1.4431, + "step": 9620 + }, + { + "epoch": 0.5, + "grad_norm": 0.52734375, + "learning_rate": 0.0001180600067776525, + "loss": 1.4408, + "step": 9625 + }, + { + "epoch": 0.5, + "grad_norm": 0.51171875, + "learning_rate": 0.00011797118269055812, + "loss": 1.4132, + "step": 9630 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.0001178823439490814, + "loss": 1.4418, + "step": 9635 + }, + { + "epoch": 0.5, + "grad_norm": 0.5078125, + "learning_rate": 0.00011779349062566485, + "loss": 1.4081, + "step": 9640 + }, + { + "epoch": 0.5, + "grad_norm": 0.54296875, + "learning_rate": 0.00011770462279276282, + "loss": 1.4312, + "step": 9645 + }, + { + "epoch": 0.5, + "grad_norm": 0.55859375, + "learning_rate": 0.0001176157405228415, + "loss": 1.4477, + "step": 9650 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.0001175268438883789, + "loss": 1.4527, + "step": 9655 + }, + { + "epoch": 0.5, + "grad_norm": 0.5234375, + "learning_rate": 0.0001174379329618646, + "loss": 1.3892, + "step": 9660 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.00011734900781580003, + "loss": 1.435, + "step": 9665 + }, + { + "epoch": 0.5, + "grad_norm": 0.51953125, + "learning_rate": 0.00011726006852269804, + "loss": 1.4074, + "step": 9670 + }, + { + "epoch": 0.5, + "grad_norm": 0.546875, + "learning_rate": 0.00011717111515508319, + "loss": 1.4093, + "step": 9675 + }, + { + "epoch": 0.5, + "grad_norm": 0.53515625, + "learning_rate": 0.00011708214778549131, + "loss": 1.4405, + "step": 9680 + }, + { + "epoch": 0.5, + "grad_norm": 0.53515625, + "learning_rate": 0.00011699316648646986, + "loss": 1.4514, + "step": 9685 + }, + { + "epoch": 0.5, + "grad_norm": 0.50390625, + "learning_rate": 0.00011690417133057747, + "loss": 1.4149, + "step": 9690 + }, + { + "epoch": 0.5, + "grad_norm": 0.51171875, + "learning_rate": 0.00011681516239038423, + "loss": 1.4184, + "step": 9695 + }, + { + "epoch": 0.5, + "grad_norm": 0.52734375, + "learning_rate": 0.00011672613973847136, + "loss": 1.4329, + "step": 9700 + }, + { + "epoch": 0.5, + "grad_norm": 0.53125, + "learning_rate": 0.00011663710344743135, + "loss": 1.4197, + "step": 9705 + }, + { + "epoch": 0.5, + "grad_norm": 0.52734375, + "learning_rate": 0.00011654805358986766, + "loss": 1.425, + "step": 9710 + }, + { + "epoch": 0.5, + "grad_norm": 0.54296875, + "learning_rate": 0.00011645899023839499, + "loss": 1.4101, + "step": 9715 + }, + { + "epoch": 0.5, + "grad_norm": 0.5625, + "learning_rate": 0.00011636991346563893, + "loss": 1.4285, + "step": 9720 + }, + { + "epoch": 0.5, + "grad_norm": 0.515625, + "learning_rate": 0.00011628082334423608, + "loss": 1.3963, + "step": 9725 + }, + { + "epoch": 0.5, + "grad_norm": 0.55078125, + "learning_rate": 0.00011619171994683389, + "loss": 1.4059, + "step": 9730 + }, + { + "epoch": 0.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00011610260334609063, + "loss": 1.3875, + "step": 9735 + }, + { + "epoch": 0.5, + "grad_norm": 0.51171875, + "learning_rate": 0.00011601347361467534, + "loss": 1.4455, + "step": 9740 + }, + { + "epoch": 0.5, + "grad_norm": 0.57421875, + "learning_rate": 0.00011592433082526781, + "loss": 1.4541, + "step": 9745 + }, + { + "epoch": 0.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00011583517505055839, + "loss": 1.4156, + "step": 9750 + }, + { + "epoch": 0.5, + "grad_norm": 0.50390625, + "learning_rate": 0.00011574600636324813, + "loss": 1.3933, + "step": 9755 + }, + { + "epoch": 0.5, + "grad_norm": 0.55078125, + "learning_rate": 0.00011565682483604852, + "loss": 1.3996, + "step": 9760 + }, + { + "epoch": 0.51, + "grad_norm": 0.5078125, + "learning_rate": 0.00011556763054168154, + "loss": 1.4398, + "step": 9765 + }, + { + "epoch": 0.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00011547842355287961, + "loss": 1.4131, + "step": 9770 + }, + { + "epoch": 0.51, + "grad_norm": 0.51171875, + "learning_rate": 0.00011538920394238551, + "loss": 1.3948, + "step": 9775 + }, + { + "epoch": 0.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00011529997178295223, + "loss": 1.4212, + "step": 9780 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.00011521072714734309, + "loss": 1.4149, + "step": 9785 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.00011512147010833152, + "loss": 1.4217, + "step": 9790 + }, + { + "epoch": 0.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00011503220073870111, + "loss": 1.4138, + "step": 9795 + }, + { + "epoch": 0.51, + "grad_norm": 0.515625, + "learning_rate": 0.00011494291911124544, + "loss": 1.4112, + "step": 9800 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.0001148536252987682, + "loss": 1.3989, + "step": 9805 + }, + { + "epoch": 0.51, + "grad_norm": 0.515625, + "learning_rate": 0.00011476431937408285, + "loss": 1.4537, + "step": 9810 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.0001146750014100129, + "loss": 1.4093, + "step": 9815 + }, + { + "epoch": 0.51, + "grad_norm": 0.53125, + "learning_rate": 0.00011458567147939154, + "loss": 1.4279, + "step": 9820 + }, + { + "epoch": 0.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00011449632965506183, + "loss": 1.4378, + "step": 9825 + }, + { + "epoch": 0.51, + "grad_norm": 0.515625, + "learning_rate": 0.00011440697600987642, + "loss": 1.435, + "step": 9830 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.00011431761061669768, + "loss": 1.4388, + "step": 9835 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390625, + "learning_rate": 0.00011422823354839753, + "loss": 1.4265, + "step": 9840 + }, + { + "epoch": 0.51, + "grad_norm": 0.5703125, + "learning_rate": 0.00011413884487785742, + "loss": 1.4337, + "step": 9845 + }, + { + "epoch": 0.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00011404944467796828, + "loss": 1.4373, + "step": 9850 + }, + { + "epoch": 0.51, + "grad_norm": 0.5234375, + "learning_rate": 0.00011396003302163034, + "loss": 1.4282, + "step": 9855 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390625, + "learning_rate": 0.00011387060998175329, + "loss": 1.4358, + "step": 9860 + }, + { + "epoch": 0.51, + "grad_norm": 0.546875, + "learning_rate": 0.00011378117563125608, + "loss": 1.4105, + "step": 9865 + }, + { + "epoch": 0.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00011369173004306683, + "loss": 1.4332, + "step": 9870 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390625, + "learning_rate": 0.00011360227329012287, + "loss": 1.443, + "step": 9875 + }, + { + "epoch": 0.51, + "grad_norm": 0.55859375, + "learning_rate": 0.00011351280544537064, + "loss": 1.4231, + "step": 9880 + }, + { + "epoch": 0.51, + "grad_norm": 0.51953125, + "learning_rate": 0.00011342332658176555, + "loss": 1.4377, + "step": 9885 + }, + { + "epoch": 0.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00011333383677227214, + "loss": 1.4763, + "step": 9890 + }, + { + "epoch": 0.51, + "grad_norm": 0.50390625, + "learning_rate": 0.00011324433608986369, + "loss": 1.3878, + "step": 9895 + }, + { + "epoch": 0.51, + "grad_norm": 0.515625, + "learning_rate": 0.00011315482460752252, + "loss": 1.4342, + "step": 9900 + }, + { + "epoch": 0.51, + "grad_norm": 0.51171875, + "learning_rate": 0.0001130653023982396, + "loss": 1.4205, + "step": 9905 + }, + { + "epoch": 0.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00011297576953501481, + "loss": 1.4384, + "step": 9910 + }, + { + "epoch": 0.51, + "grad_norm": 0.51171875, + "learning_rate": 0.00011288622609085657, + "loss": 1.4072, + "step": 9915 + }, + { + "epoch": 0.51, + "grad_norm": 0.53515625, + "learning_rate": 0.00011279667213878205, + "loss": 1.417, + "step": 9920 + }, + { + "epoch": 0.51, + "grad_norm": 0.52734375, + "learning_rate": 0.00011270710775181687, + "loss": 1.3894, + "step": 9925 + }, + { + "epoch": 0.51, + "grad_norm": 0.54296875, + "learning_rate": 0.00011261753300299529, + "loss": 1.3755, + "step": 9930 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390625, + "learning_rate": 0.00011252794796535988, + "loss": 1.3901, + "step": 9935 + }, + { + "epoch": 0.51, + "grad_norm": 0.546875, + "learning_rate": 0.0001124383527119617, + "loss": 1.4307, + "step": 9940 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390625, + "learning_rate": 0.00011234874731586012, + "loss": 1.463, + "step": 9945 + }, + { + "epoch": 0.51, + "grad_norm": 0.51953125, + "learning_rate": 0.00011225913185012276, + "loss": 1.3902, + "step": 9950 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 0.00011216950638782545, + "loss": 1.3977, + "step": 9955 + }, + { + "epoch": 0.52, + "grad_norm": 0.51953125, + "learning_rate": 0.00011207987100205219, + "loss": 1.4353, + "step": 9960 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.00011199022576589506, + "loss": 1.4362, + "step": 9965 + }, + { + "epoch": 0.52, + "grad_norm": 0.515625, + "learning_rate": 0.00011190057075245422, + "loss": 1.4327, + "step": 9970 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.00011181090603483768, + "loss": 1.4427, + "step": 9975 + }, + { + "epoch": 0.52, + "grad_norm": 0.5859375, + "learning_rate": 0.00011172123168616153, + "loss": 1.4526, + "step": 9980 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00011163154777954956, + "loss": 1.3659, + "step": 9985 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 0.00011154185438813345, + "loss": 1.4265, + "step": 9990 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.00011145215158505258, + "loss": 1.4005, + "step": 9995 + }, + { + "epoch": 0.52, + "grad_norm": 0.5703125, + "learning_rate": 0.00011136243944345402, + "loss": 1.4192, + "step": 10000 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00011127271803649243, + "loss": 1.4063, + "step": 10005 + }, + { + "epoch": 0.52, + "grad_norm": 0.546875, + "learning_rate": 0.00011118298743733004, + "loss": 1.4345, + "step": 10010 + }, + { + "epoch": 0.52, + "grad_norm": 0.51953125, + "learning_rate": 0.00011109324771913659, + "loss": 1.4432, + "step": 10015 + }, + { + "epoch": 0.52, + "grad_norm": 0.5703125, + "learning_rate": 0.00011100349895508921, + "loss": 1.4541, + "step": 10020 + }, + { + "epoch": 0.52, + "grad_norm": 0.5078125, + "learning_rate": 0.0001109137412183725, + "loss": 1.4266, + "step": 10025 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 0.00011082397458217823, + "loss": 1.4216, + "step": 10030 + }, + { + "epoch": 0.52, + "grad_norm": 0.5390625, + "learning_rate": 0.0001107341991197056, + "loss": 1.4363, + "step": 10035 + }, + { + "epoch": 0.52, + "grad_norm": 0.546875, + "learning_rate": 0.00011064441490416083, + "loss": 1.3989, + "step": 10040 + }, + { + "epoch": 0.52, + "grad_norm": 0.546875, + "learning_rate": 0.00011055462200875743, + "loss": 1.4238, + "step": 10045 + }, + { + "epoch": 0.52, + "grad_norm": 0.53125, + "learning_rate": 0.00011046482050671589, + "loss": 1.3771, + "step": 10050 + }, + { + "epoch": 0.52, + "grad_norm": 0.51171875, + "learning_rate": 0.00011037501047126379, + "loss": 1.4204, + "step": 10055 + }, + { + "epoch": 0.52, + "grad_norm": 0.56640625, + "learning_rate": 0.0001102851919756356, + "loss": 1.4242, + "step": 10060 + }, + { + "epoch": 0.52, + "grad_norm": 0.50390625, + "learning_rate": 0.00011019536509307276, + "loss": 1.443, + "step": 10065 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00011010552989682343, + "loss": 1.4455, + "step": 10070 + }, + { + "epoch": 0.52, + "grad_norm": 0.52734375, + "learning_rate": 0.00011001568646014269, + "loss": 1.4177, + "step": 10075 + }, + { + "epoch": 0.52, + "grad_norm": 0.5625, + "learning_rate": 0.00010992583485629227, + "loss": 1.4191, + "step": 10080 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 0.00010983597515854055, + "loss": 1.394, + "step": 10085 + }, + { + "epoch": 0.52, + "grad_norm": 0.53125, + "learning_rate": 0.00010974610744016254, + "loss": 1.4331, + "step": 10090 + }, + { + "epoch": 0.52, + "grad_norm": 0.5625, + "learning_rate": 0.00010965623177443978, + "loss": 1.4209, + "step": 10095 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 0.00010956634823466028, + "loss": 1.4153, + "step": 10100 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.00010947645689411849, + "loss": 1.4491, + "step": 10105 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.00010938655782611517, + "loss": 1.3941, + "step": 10110 + }, + { + "epoch": 0.52, + "grad_norm": 0.55078125, + "learning_rate": 0.0001092966511039575, + "loss": 1.4261, + "step": 10115 + }, + { + "epoch": 0.52, + "grad_norm": 0.5625, + "learning_rate": 0.00010920673680095874, + "loss": 1.4626, + "step": 10120 + }, + { + "epoch": 0.52, + "grad_norm": 0.5078125, + "learning_rate": 0.00010911681499043849, + "loss": 1.4246, + "step": 10125 + }, + { + "epoch": 0.52, + "grad_norm": 0.5, + "learning_rate": 0.00010902688574572233, + "loss": 1.4021, + "step": 10130 + }, + { + "epoch": 0.52, + "grad_norm": 0.546875, + "learning_rate": 0.00010893694914014201, + "loss": 1.4188, + "step": 10135 + }, + { + "epoch": 0.52, + "grad_norm": 0.53515625, + "learning_rate": 0.0001088470052470352, + "loss": 1.4214, + "step": 10140 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00010875705413974561, + "loss": 1.4519, + "step": 10145 + }, + { + "epoch": 0.53, + "grad_norm": 0.50390625, + "learning_rate": 0.00010866709589162276, + "loss": 1.4177, + "step": 10150 + }, + { + "epoch": 0.53, + "grad_norm": 0.515625, + "learning_rate": 0.00010857713057602197, + "loss": 1.4231, + "step": 10155 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.0001084871582663044, + "loss": 1.4382, + "step": 10160 + }, + { + "epoch": 0.53, + "grad_norm": 0.515625, + "learning_rate": 0.00010839717903583684, + "loss": 1.415, + "step": 10165 + }, + { + "epoch": 0.53, + "grad_norm": 0.5546875, + "learning_rate": 0.00010830719295799181, + "loss": 1.4295, + "step": 10170 + }, + { + "epoch": 0.53, + "grad_norm": 0.498046875, + "learning_rate": 0.00010821720010614733, + "loss": 1.4191, + "step": 10175 + }, + { + "epoch": 0.53, + "grad_norm": 0.53125, + "learning_rate": 0.000108127200553687, + "loss": 1.3981, + "step": 10180 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.0001080371943739998, + "loss": 1.4236, + "step": 10185 + }, + { + "epoch": 0.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00010794718164048026, + "loss": 1.4121, + "step": 10190 + }, + { + "epoch": 0.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00010785716242652809, + "loss": 1.4075, + "step": 10195 + }, + { + "epoch": 0.53, + "grad_norm": 0.5390625, + "learning_rate": 0.00010776713680554842, + "loss": 1.4368, + "step": 10200 + }, + { + "epoch": 0.53, + "grad_norm": 0.53125, + "learning_rate": 0.00010767710485095151, + "loss": 1.3704, + "step": 10205 + }, + { + "epoch": 0.53, + "grad_norm": 0.53125, + "learning_rate": 0.00010758706663615284, + "loss": 1.4133, + "step": 10210 + }, + { + "epoch": 0.53, + "grad_norm": 0.5234375, + "learning_rate": 0.00010749702223457299, + "loss": 1.4003, + "step": 10215 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00010740697171963754, + "loss": 1.4234, + "step": 10220 + }, + { + "epoch": 0.53, + "grad_norm": 0.5234375, + "learning_rate": 0.0001073169151647771, + "loss": 1.4117, + "step": 10225 + }, + { + "epoch": 0.53, + "grad_norm": 0.55078125, + "learning_rate": 0.00010722685264342722, + "loss": 1.4075, + "step": 10230 + }, + { + "epoch": 0.53, + "grad_norm": 0.515625, + "learning_rate": 0.00010713678422902825, + "loss": 1.4053, + "step": 10235 + }, + { + "epoch": 0.53, + "grad_norm": 0.53125, + "learning_rate": 0.0001070467099950254, + "loss": 1.4544, + "step": 10240 + }, + { + "epoch": 0.53, + "grad_norm": 0.5234375, + "learning_rate": 0.0001069566300148686, + "loss": 1.4289, + "step": 10245 + }, + { + "epoch": 0.53, + "grad_norm": 0.515625, + "learning_rate": 0.00010686654436201249, + "loss": 1.4284, + "step": 10250 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00010677645310991628, + "loss": 1.4181, + "step": 10255 + }, + { + "epoch": 0.53, + "grad_norm": 0.55078125, + "learning_rate": 0.00010668635633204384, + "loss": 1.4545, + "step": 10260 + }, + { + "epoch": 0.53, + "grad_norm": 0.546875, + "learning_rate": 0.00010659625410186345, + "loss": 1.4424, + "step": 10265 + }, + { + "epoch": 0.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00010650614649284791, + "loss": 1.4275, + "step": 10270 + }, + { + "epoch": 0.53, + "grad_norm": 0.52734375, + "learning_rate": 0.00010641603357847434, + "loss": 1.4008, + "step": 10275 + }, + { + "epoch": 0.53, + "grad_norm": 0.5390625, + "learning_rate": 0.00010632591543222426, + "loss": 1.4401, + "step": 10280 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00010623579212758336, + "loss": 1.3828, + "step": 10285 + }, + { + "epoch": 0.53, + "grad_norm": 0.5234375, + "learning_rate": 0.00010614566373804167, + "loss": 1.4171, + "step": 10290 + }, + { + "epoch": 0.53, + "grad_norm": 0.52734375, + "learning_rate": 0.00010605553033709321, + "loss": 1.4277, + "step": 10295 + }, + { + "epoch": 0.53, + "grad_norm": 0.52734375, + "learning_rate": 0.0001059653919982362, + "loss": 1.4299, + "step": 10300 + }, + { + "epoch": 0.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00010587524879497286, + "loss": 1.4266, + "step": 10305 + }, + { + "epoch": 0.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00010578510080080937, + "loss": 1.418, + "step": 10310 + }, + { + "epoch": 0.53, + "grad_norm": 0.5703125, + "learning_rate": 0.0001056949480892558, + "loss": 1.4423, + "step": 10315 + }, + { + "epoch": 0.53, + "grad_norm": 0.53515625, + "learning_rate": 0.00010560479073382605, + "loss": 1.4533, + "step": 10320 + }, + { + "epoch": 0.53, + "grad_norm": 0.52734375, + "learning_rate": 0.00010551462880803793, + "loss": 1.4045, + "step": 10325 + }, + { + "epoch": 0.53, + "grad_norm": 0.53515625, + "learning_rate": 0.0001054244623854128, + "loss": 1.4342, + "step": 10330 + }, + { + "epoch": 0.53, + "grad_norm": 0.51953125, + "learning_rate": 0.00010533429153947582, + "loss": 1.4028, + "step": 10335 + }, + { + "epoch": 0.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00010524411634375566, + "loss": 1.4093, + "step": 10340 + }, + { + "epoch": 0.54, + "grad_norm": 0.546875, + "learning_rate": 0.00010515393687178467, + "loss": 1.4353, + "step": 10345 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010506375319709852, + "loss": 1.4377, + "step": 10350 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010497356539323643, + "loss": 1.4349, + "step": 10355 + }, + { + "epoch": 0.54, + "grad_norm": 0.53125, + "learning_rate": 0.00010488337353374093, + "loss": 1.421, + "step": 10360 + }, + { + "epoch": 0.54, + "grad_norm": 0.515625, + "learning_rate": 0.00010479317769215793, + "loss": 1.4066, + "step": 10365 + }, + { + "epoch": 0.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00010470297794203643, + "loss": 1.4299, + "step": 10370 + }, + { + "epoch": 0.54, + "grad_norm": 0.5, + "learning_rate": 0.00010461277435692882, + "loss": 1.4323, + "step": 10375 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010452256701039045, + "loss": 1.4183, + "step": 10380 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625, + "learning_rate": 0.00010443235597597985, + "loss": 1.4495, + "step": 10385 + }, + { + "epoch": 0.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00010434214132725846, + "loss": 1.4152, + "step": 10390 + }, + { + "epoch": 0.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00010425192313779075, + "loss": 1.4081, + "step": 10395 + }, + { + "epoch": 0.54, + "grad_norm": 0.55078125, + "learning_rate": 0.00010416170148114404, + "loss": 1.4235, + "step": 10400 + }, + { + "epoch": 0.54, + "grad_norm": 0.54296875, + "learning_rate": 0.0001040714764308885, + "loss": 1.3903, + "step": 10405 + }, + { + "epoch": 0.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00010398124806059701, + "loss": 1.4415, + "step": 10410 + }, + { + "epoch": 0.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00010389101644384524, + "loss": 1.4446, + "step": 10415 + }, + { + "epoch": 0.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00010380078165421144, + "loss": 1.4458, + "step": 10420 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625, + "learning_rate": 0.00010371054376527647, + "loss": 1.4719, + "step": 10425 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010362030285062369, + "loss": 1.4487, + "step": 10430 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010353005898383905, + "loss": 1.4335, + "step": 10435 + }, + { + "epoch": 0.54, + "grad_norm": 0.5234375, + "learning_rate": 0.00010343981223851074, + "loss": 1.4065, + "step": 10440 + }, + { + "epoch": 0.54, + "grad_norm": 0.50390625, + "learning_rate": 0.00010334956268822937, + "loss": 1.4032, + "step": 10445 + }, + { + "epoch": 0.54, + "grad_norm": 0.56640625, + "learning_rate": 0.00010325931040658783, + "loss": 1.4184, + "step": 10450 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010316905546718128, + "loss": 1.4337, + "step": 10455 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010307879794360701, + "loss": 1.3685, + "step": 10460 + }, + { + "epoch": 0.54, + "grad_norm": 0.5859375, + "learning_rate": 0.0001029885379094644, + "loss": 1.4441, + "step": 10465 + }, + { + "epoch": 0.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00010289827543835493, + "loss": 1.4085, + "step": 10470 + }, + { + "epoch": 0.54, + "grad_norm": 0.546875, + "learning_rate": 0.00010280801060388199, + "loss": 1.4465, + "step": 10475 + }, + { + "epoch": 0.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00010271774347965097, + "loss": 1.4511, + "step": 10480 + }, + { + "epoch": 0.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00010262747413926907, + "loss": 1.4433, + "step": 10485 + }, + { + "epoch": 0.54, + "grad_norm": 0.515625, + "learning_rate": 0.00010253720265634537, + "loss": 1.4181, + "step": 10490 + }, + { + "epoch": 0.54, + "grad_norm": 0.51953125, + "learning_rate": 0.00010244692910449061, + "loss": 1.4608, + "step": 10495 + }, + { + "epoch": 0.54, + "grad_norm": 0.482421875, + "learning_rate": 0.00010235665355731727, + "loss": 1.4125, + "step": 10500 + }, + { + "epoch": 0.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00010226637608843947, + "loss": 1.4069, + "step": 10505 + }, + { + "epoch": 0.54, + "grad_norm": 0.51953125, + "learning_rate": 0.00010217609677147287, + "loss": 1.4075, + "step": 10510 + }, + { + "epoch": 0.54, + "grad_norm": 0.53125, + "learning_rate": 0.00010208581568003459, + "loss": 1.3924, + "step": 10515 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625, + "learning_rate": 0.00010199553288774333, + "loss": 1.4404, + "step": 10520 + }, + { + "epoch": 0.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00010190524846821903, + "loss": 1.4055, + "step": 10525 + }, + { + "epoch": 0.54, + "grad_norm": 0.5703125, + "learning_rate": 0.00010181496249508305, + "loss": 1.4068, + "step": 10530 + }, + { + "epoch": 0.55, + "grad_norm": 0.578125, + "learning_rate": 0.00010172467504195798, + "loss": 1.4726, + "step": 10535 + }, + { + "epoch": 0.55, + "grad_norm": 0.54296875, + "learning_rate": 0.00010163438618246763, + "loss": 1.4363, + "step": 10540 + }, + { + "epoch": 0.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00010154409599023693, + "loss": 1.4466, + "step": 10545 + }, + { + "epoch": 0.55, + "grad_norm": 0.5625, + "learning_rate": 0.00010145380453889195, + "loss": 1.4278, + "step": 10550 + }, + { + "epoch": 0.55, + "grad_norm": 0.51953125, + "learning_rate": 0.00010136351190205975, + "loss": 1.4285, + "step": 10555 + }, + { + "epoch": 0.55, + "grad_norm": 0.54296875, + "learning_rate": 0.00010127321815336837, + "loss": 1.4437, + "step": 10560 + }, + { + "epoch": 0.55, + "grad_norm": 0.53125, + "learning_rate": 0.00010118292336644668, + "loss": 1.3674, + "step": 10565 + }, + { + "epoch": 0.55, + "grad_norm": 0.51953125, + "learning_rate": 0.00010109262761492458, + "loss": 1.3475, + "step": 10570 + }, + { + "epoch": 0.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00010100233097243255, + "loss": 1.4186, + "step": 10575 + }, + { + "epoch": 0.55, + "grad_norm": 0.50390625, + "learning_rate": 0.00010091203351260194, + "loss": 1.3973, + "step": 10580 + }, + { + "epoch": 0.55, + "grad_norm": 0.498046875, + "learning_rate": 0.00010082173530906467, + "loss": 1.4284, + "step": 10585 + }, + { + "epoch": 0.55, + "grad_norm": 0.52734375, + "learning_rate": 0.00010073143643545339, + "loss": 1.4689, + "step": 10590 + }, + { + "epoch": 0.55, + "grad_norm": 0.53125, + "learning_rate": 0.00010064113696540111, + "loss": 1.3903, + "step": 10595 + }, + { + "epoch": 0.55, + "grad_norm": 0.52734375, + "learning_rate": 0.00010055083697254156, + "loss": 1.4076, + "step": 10600 + }, + { + "epoch": 0.55, + "grad_norm": 0.48828125, + "learning_rate": 0.00010046053653050862, + "loss": 1.4381, + "step": 10605 + }, + { + "epoch": 0.55, + "grad_norm": 0.51171875, + "learning_rate": 0.00010037023571293682, + "loss": 1.4145, + "step": 10610 + }, + { + "epoch": 0.55, + "grad_norm": 0.55859375, + "learning_rate": 0.00010027993459346079, + "loss": 1.4353, + "step": 10615 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00010018963324571551, + "loss": 1.4071, + "step": 10620 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00010009933174333608, + "loss": 1.3875, + "step": 10625 + }, + { + "epoch": 0.55, + "grad_norm": 0.54296875, + "learning_rate": 0.00010000903015995783, + "loss": 1.4194, + "step": 10630 + }, + { + "epoch": 0.55, + "grad_norm": 0.546875, + "learning_rate": 9.991872856921601e-05, + "loss": 1.4326, + "step": 10635 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 9.9828427044746e-05, + "loss": 1.4118, + "step": 10640 + }, + { + "epoch": 0.55, + "grad_norm": 0.53125, + "learning_rate": 9.973812566018309e-05, + "loss": 1.4773, + "step": 10645 + }, + { + "epoch": 0.55, + "grad_norm": 0.53515625, + "learning_rate": 9.96478244891624e-05, + "loss": 1.4413, + "step": 10650 + }, + { + "epoch": 0.55, + "grad_norm": 0.515625, + "learning_rate": 9.955752360531896e-05, + "loss": 1.3984, + "step": 10655 + }, + { + "epoch": 0.55, + "grad_norm": 0.546875, + "learning_rate": 9.94672230822875e-05, + "loss": 1.4695, + "step": 10660 + }, + { + "epoch": 0.55, + "grad_norm": 0.546875, + "learning_rate": 9.937692299370251e-05, + "loss": 1.404, + "step": 10665 + }, + { + "epoch": 0.55, + "grad_norm": 0.53125, + "learning_rate": 9.928662341319808e-05, + "loss": 1.4331, + "step": 10670 + }, + { + "epoch": 0.55, + "grad_norm": 0.53515625, + "learning_rate": 9.919632441440791e-05, + "loss": 1.441, + "step": 10675 + }, + { + "epoch": 0.55, + "grad_norm": 0.515625, + "learning_rate": 9.910602607096522e-05, + "loss": 1.4424, + "step": 10680 + }, + { + "epoch": 0.55, + "grad_norm": 0.5390625, + "learning_rate": 9.90157284565027e-05, + "loss": 1.4336, + "step": 10685 + }, + { + "epoch": 0.55, + "grad_norm": 0.53125, + "learning_rate": 9.892543164465243e-05, + "loss": 1.4476, + "step": 10690 + }, + { + "epoch": 0.55, + "grad_norm": 0.54296875, + "learning_rate": 9.883513570904587e-05, + "loss": 1.4135, + "step": 10695 + }, + { + "epoch": 0.55, + "grad_norm": 0.5, + "learning_rate": 9.874484072331371e-05, + "loss": 1.3951, + "step": 10700 + }, + { + "epoch": 0.55, + "grad_norm": 0.515625, + "learning_rate": 9.865454676108592e-05, + "loss": 1.4369, + "step": 10705 + }, + { + "epoch": 0.55, + "grad_norm": 0.546875, + "learning_rate": 9.856425389599159e-05, + "loss": 1.3727, + "step": 10710 + }, + { + "epoch": 0.55, + "grad_norm": 0.515625, + "learning_rate": 9.847396220165898e-05, + "loss": 1.4404, + "step": 10715 + }, + { + "epoch": 0.55, + "grad_norm": 0.494140625, + "learning_rate": 9.838367175171531e-05, + "loss": 1.4034, + "step": 10720 + }, + { + "epoch": 0.55, + "grad_norm": 0.51171875, + "learning_rate": 9.829338261978686e-05, + "loss": 1.4248, + "step": 10725 + }, + { + "epoch": 0.56, + "grad_norm": 0.5625, + "learning_rate": 9.82030948794988e-05, + "loss": 1.4239, + "step": 10730 + }, + { + "epoch": 0.56, + "grad_norm": 0.515625, + "learning_rate": 9.811280860447515e-05, + "loss": 1.3853, + "step": 10735 + }, + { + "epoch": 0.56, + "grad_norm": 0.55859375, + "learning_rate": 9.802252386833875e-05, + "loss": 1.394, + "step": 10740 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 9.793224074471125e-05, + "loss": 1.4399, + "step": 10745 + }, + { + "epoch": 0.56, + "grad_norm": 0.546875, + "learning_rate": 9.784195930721284e-05, + "loss": 1.3797, + "step": 10750 + }, + { + "epoch": 0.56, + "grad_norm": 0.5234375, + "learning_rate": 9.775167962946248e-05, + "loss": 1.3967, + "step": 10755 + }, + { + "epoch": 0.56, + "grad_norm": 0.515625, + "learning_rate": 9.76614017850776e-05, + "loss": 1.4251, + "step": 10760 + }, + { + "epoch": 0.56, + "grad_norm": 0.5234375, + "learning_rate": 9.757112584767422e-05, + "loss": 1.3978, + "step": 10765 + }, + { + "epoch": 0.56, + "grad_norm": 0.52734375, + "learning_rate": 9.748085189086668e-05, + "loss": 1.4219, + "step": 10770 + }, + { + "epoch": 0.56, + "grad_norm": 0.51953125, + "learning_rate": 9.739057998826786e-05, + "loss": 1.41, + "step": 10775 + }, + { + "epoch": 0.56, + "grad_norm": 0.51953125, + "learning_rate": 9.730031021348881e-05, + "loss": 1.4403, + "step": 10780 + }, + { + "epoch": 0.56, + "grad_norm": 0.515625, + "learning_rate": 9.721004264013899e-05, + "loss": 1.3703, + "step": 10785 + }, + { + "epoch": 0.56, + "grad_norm": 0.54296875, + "learning_rate": 9.711977734182593e-05, + "loss": 1.4361, + "step": 10790 + }, + { + "epoch": 0.56, + "grad_norm": 0.5859375, + "learning_rate": 9.702951439215543e-05, + "loss": 1.4309, + "step": 10795 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 9.693925386473127e-05, + "loss": 1.3903, + "step": 10800 + }, + { + "epoch": 0.56, + "grad_norm": 0.52734375, + "learning_rate": 9.684899583315531e-05, + "loss": 1.4016, + "step": 10805 + }, + { + "epoch": 0.56, + "grad_norm": 0.546875, + "learning_rate": 9.67587403710274e-05, + "loss": 1.4281, + "step": 10810 + }, + { + "epoch": 0.56, + "grad_norm": 0.54296875, + "learning_rate": 9.666848755194519e-05, + "loss": 1.4201, + "step": 10815 + }, + { + "epoch": 0.56, + "grad_norm": 0.53515625, + "learning_rate": 9.65782374495043e-05, + "loss": 1.4347, + "step": 10820 + }, + { + "epoch": 0.56, + "grad_norm": 0.53125, + "learning_rate": 9.648799013729802e-05, + "loss": 1.4321, + "step": 10825 + }, + { + "epoch": 0.56, + "grad_norm": 0.53125, + "learning_rate": 9.63977456889175e-05, + "loss": 1.4376, + "step": 10830 + }, + { + "epoch": 0.56, + "grad_norm": 0.5234375, + "learning_rate": 9.630750417795141e-05, + "loss": 1.4297, + "step": 10835 + }, + { + "epoch": 0.56, + "grad_norm": 0.55078125, + "learning_rate": 9.621726567798614e-05, + "loss": 1.457, + "step": 10840 + }, + { + "epoch": 0.56, + "grad_norm": 0.51953125, + "learning_rate": 9.612703026260553e-05, + "loss": 1.3704, + "step": 10845 + }, + { + "epoch": 0.56, + "grad_norm": 0.52734375, + "learning_rate": 9.603679800539102e-05, + "loss": 1.3975, + "step": 10850 + }, + { + "epoch": 0.56, + "grad_norm": 0.55078125, + "learning_rate": 9.594656897992133e-05, + "loss": 1.4303, + "step": 10855 + }, + { + "epoch": 0.56, + "grad_norm": 0.55859375, + "learning_rate": 9.585634325977268e-05, + "loss": 1.4059, + "step": 10860 + }, + { + "epoch": 0.56, + "grad_norm": 0.54296875, + "learning_rate": 9.57661209185185e-05, + "loss": 1.4084, + "step": 10865 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 9.567590202972952e-05, + "loss": 1.4019, + "step": 10870 + }, + { + "epoch": 0.56, + "grad_norm": 0.54296875, + "learning_rate": 9.558568666697362e-05, + "loss": 1.4165, + "step": 10875 + }, + { + "epoch": 0.56, + "grad_norm": 0.51953125, + "learning_rate": 9.549547490381585e-05, + "loss": 1.4164, + "step": 10880 + }, + { + "epoch": 0.56, + "grad_norm": 0.51171875, + "learning_rate": 9.540526681381824e-05, + "loss": 1.3971, + "step": 10885 + }, + { + "epoch": 0.56, + "grad_norm": 0.5546875, + "learning_rate": 9.531506247053995e-05, + "loss": 1.4246, + "step": 10890 + }, + { + "epoch": 0.56, + "grad_norm": 0.5703125, + "learning_rate": 9.522486194753695e-05, + "loss": 1.4365, + "step": 10895 + }, + { + "epoch": 0.56, + "grad_norm": 0.52734375, + "learning_rate": 9.513466531836221e-05, + "loss": 1.3987, + "step": 10900 + }, + { + "epoch": 0.56, + "grad_norm": 0.52734375, + "learning_rate": 9.504447265656544e-05, + "loss": 1.4129, + "step": 10905 + }, + { + "epoch": 0.56, + "grad_norm": 0.5078125, + "learning_rate": 9.495428403569317e-05, + "loss": 1.4259, + "step": 10910 + }, + { + "epoch": 0.56, + "grad_norm": 0.53515625, + "learning_rate": 9.486409952928858e-05, + "loss": 1.429, + "step": 10915 + }, + { + "epoch": 0.56, + "grad_norm": 0.5390625, + "learning_rate": 9.477391921089158e-05, + "loss": 1.4268, + "step": 10920 + }, + { + "epoch": 0.57, + "grad_norm": 0.5390625, + "learning_rate": 9.468374315403858e-05, + "loss": 1.3592, + "step": 10925 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.459357143226255e-05, + "loss": 1.4077, + "step": 10930 + }, + { + "epoch": 0.57, + "grad_norm": 0.5625, + "learning_rate": 9.450340411909293e-05, + "loss": 1.4344, + "step": 10935 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.441324128805555e-05, + "loss": 1.3846, + "step": 10940 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.432308301267261e-05, + "loss": 1.3887, + "step": 10945 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.423292936646257e-05, + "loss": 1.4068, + "step": 10950 + }, + { + "epoch": 0.57, + "grad_norm": 0.55078125, + "learning_rate": 9.414278042294012e-05, + "loss": 1.389, + "step": 10955 + }, + { + "epoch": 0.57, + "grad_norm": 0.5078125, + "learning_rate": 9.405263625561613e-05, + "loss": 1.408, + "step": 10960 + }, + { + "epoch": 0.57, + "grad_norm": 0.51171875, + "learning_rate": 9.396249693799754e-05, + "loss": 1.4058, + "step": 10965 + }, + { + "epoch": 0.57, + "grad_norm": 0.53125, + "learning_rate": 9.387236254358741e-05, + "loss": 1.4122, + "step": 10970 + }, + { + "epoch": 0.57, + "grad_norm": 0.5390625, + "learning_rate": 9.378223314588467e-05, + "loss": 1.3976, + "step": 10975 + }, + { + "epoch": 0.57, + "grad_norm": 0.53515625, + "learning_rate": 9.36921088183843e-05, + "loss": 1.439, + "step": 10980 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.360198963457705e-05, + "loss": 1.4165, + "step": 10985 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.351187566794953e-05, + "loss": 1.4397, + "step": 10990 + }, + { + "epoch": 0.57, + "grad_norm": 0.55078125, + "learning_rate": 9.342176699198406e-05, + "loss": 1.4671, + "step": 10995 + }, + { + "epoch": 0.57, + "grad_norm": 0.53125, + "learning_rate": 9.333166368015869e-05, + "loss": 1.44, + "step": 11000 + }, + { + "epoch": 0.57, + "grad_norm": 0.51171875, + "learning_rate": 9.324156580594704e-05, + "loss": 1.4439, + "step": 11005 + }, + { + "epoch": 0.57, + "grad_norm": 0.5234375, + "learning_rate": 9.315147344281836e-05, + "loss": 1.4235, + "step": 11010 + }, + { + "epoch": 0.57, + "grad_norm": 0.53515625, + "learning_rate": 9.306138666423733e-05, + "loss": 1.4024, + "step": 11015 + }, + { + "epoch": 0.57, + "grad_norm": 0.53125, + "learning_rate": 9.297130554366413e-05, + "loss": 1.3865, + "step": 11020 + }, + { + "epoch": 0.57, + "grad_norm": 0.54296875, + "learning_rate": 9.288123015455436e-05, + "loss": 1.4025, + "step": 11025 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.279116057035882e-05, + "loss": 1.4177, + "step": 11030 + }, + { + "epoch": 0.57, + "grad_norm": 0.5390625, + "learning_rate": 9.270109686452375e-05, + "loss": 1.4183, + "step": 11035 + }, + { + "epoch": 0.57, + "grad_norm": 0.5390625, + "learning_rate": 9.261103911049041e-05, + "loss": 1.4063, + "step": 11040 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.252098738169538e-05, + "loss": 1.4324, + "step": 11045 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.24309417515702e-05, + "loss": 1.469, + "step": 11050 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.234090229354149e-05, + "loss": 1.3761, + "step": 11055 + }, + { + "epoch": 0.57, + "grad_norm": 0.53515625, + "learning_rate": 9.225086908103082e-05, + "loss": 1.386, + "step": 11060 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.216084218745472e-05, + "loss": 1.4339, + "step": 11065 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.207082168622448e-05, + "loss": 1.3859, + "step": 11070 + }, + { + "epoch": 0.57, + "grad_norm": 0.578125, + "learning_rate": 9.198080765074625e-05, + "loss": 1.396, + "step": 11075 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 9.189080015442085e-05, + "loss": 1.3841, + "step": 11080 + }, + { + "epoch": 0.57, + "grad_norm": 0.57421875, + "learning_rate": 9.180079927064386e-05, + "loss": 1.382, + "step": 11085 + }, + { + "epoch": 0.57, + "grad_norm": 0.56640625, + "learning_rate": 9.171080507280532e-05, + "loss": 1.4463, + "step": 11090 + }, + { + "epoch": 0.57, + "grad_norm": 0.54296875, + "learning_rate": 9.162081763428999e-05, + "loss": 1.4456, + "step": 11095 + }, + { + "epoch": 0.57, + "grad_norm": 0.55078125, + "learning_rate": 9.153083702847695e-05, + "loss": 1.4228, + "step": 11100 + }, + { + "epoch": 0.57, + "grad_norm": 0.515625, + "learning_rate": 9.14408633287399e-05, + "loss": 1.3921, + "step": 11105 + }, + { + "epoch": 0.57, + "grad_norm": 0.51953125, + "learning_rate": 9.135089660844669e-05, + "loss": 1.4088, + "step": 11110 + }, + { + "epoch": 0.58, + "grad_norm": 0.5390625, + "learning_rate": 9.126093694095961e-05, + "loss": 1.3919, + "step": 11115 + }, + { + "epoch": 0.58, + "grad_norm": 0.5078125, + "learning_rate": 9.117098439963522e-05, + "loss": 1.4446, + "step": 11120 + }, + { + "epoch": 0.58, + "grad_norm": 0.5234375, + "learning_rate": 9.108103905782419e-05, + "loss": 1.3774, + "step": 11125 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 9.099110098887136e-05, + "loss": 1.407, + "step": 11130 + }, + { + "epoch": 0.58, + "grad_norm": 0.58203125, + "learning_rate": 9.090117026611564e-05, + "loss": 1.4042, + "step": 11135 + }, + { + "epoch": 0.58, + "grad_norm": 0.5390625, + "learning_rate": 9.081124696288995e-05, + "loss": 1.4179, + "step": 11140 + }, + { + "epoch": 0.58, + "grad_norm": 0.546875, + "learning_rate": 9.072133115252112e-05, + "loss": 1.4189, + "step": 11145 + }, + { + "epoch": 0.58, + "grad_norm": 0.5546875, + "learning_rate": 9.063142290832997e-05, + "loss": 1.3894, + "step": 11150 + }, + { + "epoch": 0.58, + "grad_norm": 0.5703125, + "learning_rate": 9.054152230363102e-05, + "loss": 1.4122, + "step": 11155 + }, + { + "epoch": 0.58, + "grad_norm": 0.51953125, + "learning_rate": 9.045162941173266e-05, + "loss": 1.406, + "step": 11160 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 9.036174430593694e-05, + "loss": 1.4228, + "step": 11165 + }, + { + "epoch": 0.58, + "grad_norm": 0.51171875, + "learning_rate": 9.027186705953958e-05, + "loss": 1.4127, + "step": 11170 + }, + { + "epoch": 0.58, + "grad_norm": 0.5390625, + "learning_rate": 9.018199774582988e-05, + "loss": 1.3923, + "step": 11175 + }, + { + "epoch": 0.58, + "grad_norm": 0.51953125, + "learning_rate": 9.009213643809072e-05, + "loss": 1.4126, + "step": 11180 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 9.000228320959833e-05, + "loss": 1.3914, + "step": 11185 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 8.991243813362252e-05, + "loss": 1.4298, + "step": 11190 + }, + { + "epoch": 0.58, + "grad_norm": 0.5234375, + "learning_rate": 8.982260128342628e-05, + "loss": 1.42, + "step": 11195 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 8.973277273226607e-05, + "loss": 1.4267, + "step": 11200 + }, + { + "epoch": 0.58, + "grad_norm": 0.5234375, + "learning_rate": 8.96429525533914e-05, + "loss": 1.4044, + "step": 11205 + }, + { + "epoch": 0.58, + "grad_norm": 0.5546875, + "learning_rate": 8.95531408200451e-05, + "loss": 1.3861, + "step": 11210 + }, + { + "epoch": 0.58, + "grad_norm": 0.54296875, + "learning_rate": 8.946333760546303e-05, + "loss": 1.4079, + "step": 11215 + }, + { + "epoch": 0.58, + "grad_norm": 0.5234375, + "learning_rate": 8.937354298287414e-05, + "loss": 1.4124, + "step": 11220 + }, + { + "epoch": 0.58, + "grad_norm": 0.50390625, + "learning_rate": 8.928375702550036e-05, + "loss": 1.3697, + "step": 11225 + }, + { + "epoch": 0.58, + "grad_norm": 0.5703125, + "learning_rate": 8.919397980655657e-05, + "loss": 1.4066, + "step": 11230 + }, + { + "epoch": 0.58, + "grad_norm": 0.51171875, + "learning_rate": 8.910421139925045e-05, + "loss": 1.4038, + "step": 11235 + }, + { + "epoch": 0.58, + "grad_norm": 0.54296875, + "learning_rate": 8.901445187678264e-05, + "loss": 1.4115, + "step": 11240 + }, + { + "epoch": 0.58, + "grad_norm": 0.5546875, + "learning_rate": 8.892470131234639e-05, + "loss": 1.4057, + "step": 11245 + }, + { + "epoch": 0.58, + "grad_norm": 0.55859375, + "learning_rate": 8.883495977912775e-05, + "loss": 1.4335, + "step": 11250 + }, + { + "epoch": 0.58, + "grad_norm": 0.50390625, + "learning_rate": 8.874522735030532e-05, + "loss": 1.3702, + "step": 11255 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 8.865550409905037e-05, + "loss": 1.3975, + "step": 11260 + }, + { + "epoch": 0.58, + "grad_norm": 0.50390625, + "learning_rate": 8.856579009852657e-05, + "loss": 1.4373, + "step": 11265 + }, + { + "epoch": 0.58, + "grad_norm": 0.51953125, + "learning_rate": 8.847608542189017e-05, + "loss": 1.418, + "step": 11270 + }, + { + "epoch": 0.58, + "grad_norm": 0.52734375, + "learning_rate": 8.83863901422897e-05, + "loss": 1.4033, + "step": 11275 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 8.829670433286613e-05, + "loss": 1.4357, + "step": 11280 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 8.820702806675263e-05, + "loss": 1.4209, + "step": 11285 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 8.811736141707466e-05, + "loss": 1.4346, + "step": 11290 + }, + { + "epoch": 0.58, + "grad_norm": 0.5390625, + "learning_rate": 8.802770445694975e-05, + "loss": 1.4062, + "step": 11295 + }, + { + "epoch": 0.58, + "grad_norm": 0.5390625, + "learning_rate": 8.793805725948764e-05, + "loss": 1.4005, + "step": 11300 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 8.784841989778996e-05, + "loss": 1.4151, + "step": 11305 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 8.775879244495052e-05, + "loss": 1.4449, + "step": 11310 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 8.766917497405481e-05, + "loss": 1.4397, + "step": 11315 + }, + { + "epoch": 0.59, + "grad_norm": 0.5625, + "learning_rate": 8.757956755818041e-05, + "loss": 1.424, + "step": 11320 + }, + { + "epoch": 0.59, + "grad_norm": 0.52734375, + "learning_rate": 8.748997027039653e-05, + "loss": 1.4169, + "step": 11325 + }, + { + "epoch": 0.59, + "grad_norm": 0.51953125, + "learning_rate": 8.740038318376423e-05, + "loss": 1.4071, + "step": 11330 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.731080637133618e-05, + "loss": 1.4427, + "step": 11335 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.722123990615673e-05, + "loss": 1.4193, + "step": 11340 + }, + { + "epoch": 0.59, + "grad_norm": 0.54296875, + "learning_rate": 8.713168386126173e-05, + "loss": 1.3958, + "step": 11345 + }, + { + "epoch": 0.59, + "grad_norm": 0.546875, + "learning_rate": 8.704213830967861e-05, + "loss": 1.457, + "step": 11350 + }, + { + "epoch": 0.59, + "grad_norm": 0.5390625, + "learning_rate": 8.695260332442616e-05, + "loss": 1.4133, + "step": 11355 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.686307897851463e-05, + "loss": 1.4035, + "step": 11360 + }, + { + "epoch": 0.59, + "grad_norm": 0.52734375, + "learning_rate": 8.677356534494553e-05, + "loss": 1.3896, + "step": 11365 + }, + { + "epoch": 0.59, + "grad_norm": 0.51171875, + "learning_rate": 8.668406249671169e-05, + "loss": 1.4115, + "step": 11370 + }, + { + "epoch": 0.59, + "grad_norm": 0.546875, + "learning_rate": 8.65945705067971e-05, + "loss": 1.4179, + "step": 11375 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 8.650508944817692e-05, + "loss": 1.4196, + "step": 11380 + }, + { + "epoch": 0.59, + "grad_norm": 0.56640625, + "learning_rate": 8.641561939381737e-05, + "loss": 1.4511, + "step": 11385 + }, + { + "epoch": 0.59, + "grad_norm": 0.53515625, + "learning_rate": 8.632616041667577e-05, + "loss": 1.4401, + "step": 11390 + }, + { + "epoch": 0.59, + "grad_norm": 0.53515625, + "learning_rate": 8.623671258970028e-05, + "loss": 1.4221, + "step": 11395 + }, + { + "epoch": 0.59, + "grad_norm": 0.54296875, + "learning_rate": 8.614727598583015e-05, + "loss": 1.4375, + "step": 11400 + }, + { + "epoch": 0.59, + "grad_norm": 0.53125, + "learning_rate": 8.605785067799527e-05, + "loss": 1.3985, + "step": 11405 + }, + { + "epoch": 0.59, + "grad_norm": 0.53125, + "learning_rate": 8.596843673911643e-05, + "loss": 1.4173, + "step": 11410 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.58790342421052e-05, + "loss": 1.443, + "step": 11415 + }, + { + "epoch": 0.59, + "grad_norm": 0.52734375, + "learning_rate": 8.578964325986368e-05, + "loss": 1.3976, + "step": 11420 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.570026386528475e-05, + "loss": 1.3657, + "step": 11425 + }, + { + "epoch": 0.59, + "grad_norm": 0.53125, + "learning_rate": 8.561089613125166e-05, + "loss": 1.4348, + "step": 11430 + }, + { + "epoch": 0.59, + "grad_norm": 0.546875, + "learning_rate": 8.55215401306383e-05, + "loss": 1.4046, + "step": 11435 + }, + { + "epoch": 0.59, + "grad_norm": 0.52734375, + "learning_rate": 8.543219593630892e-05, + "loss": 1.4324, + "step": 11440 + }, + { + "epoch": 0.59, + "grad_norm": 0.54296875, + "learning_rate": 8.534286362111812e-05, + "loss": 1.42, + "step": 11445 + }, + { + "epoch": 0.59, + "grad_norm": 0.546875, + "learning_rate": 8.525354325791092e-05, + "loss": 1.4366, + "step": 11450 + }, + { + "epoch": 0.59, + "grad_norm": 0.5703125, + "learning_rate": 8.516423491952247e-05, + "loss": 1.4362, + "step": 11455 + }, + { + "epoch": 0.59, + "grad_norm": 0.5390625, + "learning_rate": 8.50749386787782e-05, + "loss": 1.384, + "step": 11460 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.498565460849362e-05, + "loss": 1.3842, + "step": 11465 + }, + { + "epoch": 0.59, + "grad_norm": 0.51953125, + "learning_rate": 8.489638278147433e-05, + "loss": 1.4209, + "step": 11470 + }, + { + "epoch": 0.59, + "grad_norm": 0.53515625, + "learning_rate": 8.480712327051599e-05, + "loss": 1.3873, + "step": 11475 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 8.471787614840416e-05, + "loss": 1.4494, + "step": 11480 + }, + { + "epoch": 0.59, + "grad_norm": 0.50390625, + "learning_rate": 8.462864148791432e-05, + "loss": 1.422, + "step": 11485 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234375, + "learning_rate": 8.453941936181181e-05, + "loss": 1.434, + "step": 11490 + }, + { + "epoch": 0.59, + "grad_norm": 0.53515625, + "learning_rate": 8.445020984285169e-05, + "loss": 1.4391, + "step": 11495 + }, + { + "epoch": 0.59, + "grad_norm": 0.55078125, + "learning_rate": 8.436101300377881e-05, + "loss": 1.3965, + "step": 11500 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 8.427182891732762e-05, + "loss": 1.4083, + "step": 11505 + }, + { + "epoch": 0.6, + "grad_norm": 0.5234375, + "learning_rate": 8.418265765622225e-05, + "loss": 1.3746, + "step": 11510 + }, + { + "epoch": 0.6, + "grad_norm": 0.546875, + "learning_rate": 8.409349929317623e-05, + "loss": 1.3924, + "step": 11515 + }, + { + "epoch": 0.6, + "grad_norm": 0.51953125, + "learning_rate": 8.400435390089277e-05, + "loss": 1.3971, + "step": 11520 + }, + { + "epoch": 0.6, + "grad_norm": 0.55078125, + "learning_rate": 8.391522155206429e-05, + "loss": 1.395, + "step": 11525 + }, + { + "epoch": 0.6, + "grad_norm": 0.5234375, + "learning_rate": 8.382610231937276e-05, + "loss": 1.439, + "step": 11530 + }, + { + "epoch": 0.6, + "grad_norm": 0.578125, + "learning_rate": 8.373699627548934e-05, + "loss": 1.425, + "step": 11535 + }, + { + "epoch": 0.6, + "grad_norm": 0.5625, + "learning_rate": 8.364790349307448e-05, + "loss": 1.389, + "step": 11540 + }, + { + "epoch": 0.6, + "grad_norm": 0.5234375, + "learning_rate": 8.355882404477778e-05, + "loss": 1.4014, + "step": 11545 + }, + { + "epoch": 0.6, + "grad_norm": 0.52734375, + "learning_rate": 8.346975800323804e-05, + "loss": 1.4194, + "step": 11550 + }, + { + "epoch": 0.6, + "grad_norm": 0.51953125, + "learning_rate": 8.338070544108304e-05, + "loss": 1.4122, + "step": 11555 + }, + { + "epoch": 0.6, + "grad_norm": 0.51171875, + "learning_rate": 8.329166643092963e-05, + "loss": 1.4194, + "step": 11560 + }, + { + "epoch": 0.6, + "grad_norm": 0.53515625, + "learning_rate": 8.320264104538357e-05, + "loss": 1.3873, + "step": 11565 + }, + { + "epoch": 0.6, + "grad_norm": 0.55859375, + "learning_rate": 8.311362935703955e-05, + "loss": 1.4282, + "step": 11570 + }, + { + "epoch": 0.6, + "grad_norm": 0.5625, + "learning_rate": 8.302463143848102e-05, + "loss": 1.4116, + "step": 11575 + }, + { + "epoch": 0.6, + "grad_norm": 0.53515625, + "learning_rate": 8.293564736228034e-05, + "loss": 1.4194, + "step": 11580 + }, + { + "epoch": 0.6, + "grad_norm": 0.52734375, + "learning_rate": 8.284667720099839e-05, + "loss": 1.4211, + "step": 11585 + }, + { + "epoch": 0.6, + "grad_norm": 0.52734375, + "learning_rate": 8.275772102718489e-05, + "loss": 1.4032, + "step": 11590 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 8.2668778913378e-05, + "loss": 1.3712, + "step": 11595 + }, + { + "epoch": 0.6, + "grad_norm": 0.54296875, + "learning_rate": 8.257985093210455e-05, + "loss": 1.4273, + "step": 11600 + }, + { + "epoch": 0.6, + "grad_norm": 0.53515625, + "learning_rate": 8.249093715587972e-05, + "loss": 1.4499, + "step": 11605 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 8.240203765720722e-05, + "loss": 1.3837, + "step": 11610 + }, + { + "epoch": 0.6, + "grad_norm": 0.515625, + "learning_rate": 8.231315250857902e-05, + "loss": 1.4142, + "step": 11615 + }, + { + "epoch": 0.6, + "grad_norm": 0.56640625, + "learning_rate": 8.222428178247548e-05, + "loss": 1.3888, + "step": 11620 + }, + { + "epoch": 0.6, + "grad_norm": 0.51953125, + "learning_rate": 8.21354255513651e-05, + "loss": 1.384, + "step": 11625 + }, + { + "epoch": 0.6, + "grad_norm": 0.55078125, + "learning_rate": 8.204658388770466e-05, + "loss": 1.3775, + "step": 11630 + }, + { + "epoch": 0.6, + "grad_norm": 0.51171875, + "learning_rate": 8.195775686393897e-05, + "loss": 1.4199, + "step": 11635 + }, + { + "epoch": 0.6, + "grad_norm": 0.5234375, + "learning_rate": 8.1868944552501e-05, + "loss": 1.3916, + "step": 11640 + }, + { + "epoch": 0.6, + "grad_norm": 0.5078125, + "learning_rate": 8.178014702581162e-05, + "loss": 1.3924, + "step": 11645 + }, + { + "epoch": 0.6, + "grad_norm": 0.546875, + "learning_rate": 8.169136435627971e-05, + "loss": 1.4131, + "step": 11650 + }, + { + "epoch": 0.6, + "grad_norm": 0.54296875, + "learning_rate": 8.160259661630201e-05, + "loss": 1.3814, + "step": 11655 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 8.151384387826313e-05, + "loss": 1.3941, + "step": 11660 + }, + { + "epoch": 0.6, + "grad_norm": 0.51171875, + "learning_rate": 8.142510621453536e-05, + "loss": 1.395, + "step": 11665 + }, + { + "epoch": 0.6, + "grad_norm": 0.53515625, + "learning_rate": 8.13363836974788e-05, + "loss": 1.3861, + "step": 11670 + }, + { + "epoch": 0.6, + "grad_norm": 0.53125, + "learning_rate": 8.124767639944109e-05, + "loss": 1.4077, + "step": 11675 + }, + { + "epoch": 0.6, + "grad_norm": 0.51171875, + "learning_rate": 8.115898439275756e-05, + "loss": 1.4453, + "step": 11680 + }, + { + "epoch": 0.6, + "grad_norm": 0.5, + "learning_rate": 8.107030774975101e-05, + "loss": 1.4159, + "step": 11685 + }, + { + "epoch": 0.6, + "grad_norm": 0.52734375, + "learning_rate": 8.098164654273174e-05, + "loss": 1.4389, + "step": 11690 + }, + { + "epoch": 0.61, + "grad_norm": 0.54296875, + "learning_rate": 8.089300084399747e-05, + "loss": 1.3747, + "step": 11695 + }, + { + "epoch": 0.61, + "grad_norm": 0.54296875, + "learning_rate": 8.08043707258332e-05, + "loss": 1.4058, + "step": 11700 + }, + { + "epoch": 0.61, + "grad_norm": 0.53515625, + "learning_rate": 8.071575626051133e-05, + "loss": 1.4204, + "step": 11705 + }, + { + "epoch": 0.61, + "grad_norm": 0.53515625, + "learning_rate": 8.062715752029142e-05, + "loss": 1.4552, + "step": 11710 + }, + { + "epoch": 0.61, + "grad_norm": 0.50390625, + "learning_rate": 8.053857457742025e-05, + "loss": 1.4289, + "step": 11715 + }, + { + "epoch": 0.61, + "grad_norm": 0.53515625, + "learning_rate": 8.045000750413169e-05, + "loss": 1.4027, + "step": 11720 + }, + { + "epoch": 0.61, + "grad_norm": 0.53515625, + "learning_rate": 8.036145637264673e-05, + "loss": 1.4515, + "step": 11725 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 8.027292125517324e-05, + "loss": 1.4288, + "step": 11730 + }, + { + "epoch": 0.61, + "grad_norm": 0.50390625, + "learning_rate": 8.018440222390616e-05, + "loss": 1.4238, + "step": 11735 + }, + { + "epoch": 0.61, + "grad_norm": 0.52734375, + "learning_rate": 8.009589935102723e-05, + "loss": 1.4264, + "step": 11740 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 8.000741270870507e-05, + "loss": 1.4174, + "step": 11745 + }, + { + "epoch": 0.61, + "grad_norm": 0.51171875, + "learning_rate": 7.991894236909498e-05, + "loss": 1.4294, + "step": 11750 + }, + { + "epoch": 0.61, + "grad_norm": 0.5390625, + "learning_rate": 7.98304884043391e-05, + "loss": 1.4169, + "step": 11755 + }, + { + "epoch": 0.61, + "grad_norm": 0.55859375, + "learning_rate": 7.974205088656606e-05, + "loss": 1.4359, + "step": 11760 + }, + { + "epoch": 0.61, + "grad_norm": 0.5234375, + "learning_rate": 7.965362988789121e-05, + "loss": 1.4157, + "step": 11765 + }, + { + "epoch": 0.61, + "grad_norm": 0.5078125, + "learning_rate": 7.956522548041635e-05, + "loss": 1.4213, + "step": 11770 + }, + { + "epoch": 0.61, + "grad_norm": 0.5546875, + "learning_rate": 7.947683773622982e-05, + "loss": 1.4131, + "step": 11775 + }, + { + "epoch": 0.61, + "grad_norm": 0.515625, + "learning_rate": 7.938846672740627e-05, + "loss": 1.4234, + "step": 11780 + }, + { + "epoch": 0.61, + "grad_norm": 0.51953125, + "learning_rate": 7.930011252600683e-05, + "loss": 1.4135, + "step": 11785 + }, + { + "epoch": 0.61, + "grad_norm": 0.5390625, + "learning_rate": 7.92117752040788e-05, + "loss": 1.4611, + "step": 11790 + }, + { + "epoch": 0.61, + "grad_norm": 0.50390625, + "learning_rate": 7.912345483365581e-05, + "loss": 1.4269, + "step": 11795 + }, + { + "epoch": 0.61, + "grad_norm": 0.51171875, + "learning_rate": 7.903515148675762e-05, + "loss": 1.4278, + "step": 11800 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 7.894686523539013e-05, + "loss": 1.3783, + "step": 11805 + }, + { + "epoch": 0.61, + "grad_norm": 0.5234375, + "learning_rate": 7.885859615154527e-05, + "loss": 1.4143, + "step": 11810 + }, + { + "epoch": 0.61, + "grad_norm": 0.52734375, + "learning_rate": 7.877034430720102e-05, + "loss": 1.4065, + "step": 11815 + }, + { + "epoch": 0.61, + "grad_norm": 0.5, + "learning_rate": 7.868210977432123e-05, + "loss": 1.389, + "step": 11820 + }, + { + "epoch": 0.61, + "grad_norm": 0.56640625, + "learning_rate": 7.85938926248557e-05, + "loss": 1.4302, + "step": 11825 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 7.850569293074006e-05, + "loss": 1.416, + "step": 11830 + }, + { + "epoch": 0.61, + "grad_norm": 0.5546875, + "learning_rate": 7.841751076389563e-05, + "loss": 1.3923, + "step": 11835 + }, + { + "epoch": 0.61, + "grad_norm": 0.52734375, + "learning_rate": 7.832934619622954e-05, + "loss": 1.4171, + "step": 11840 + }, + { + "epoch": 0.61, + "grad_norm": 0.53125, + "learning_rate": 7.824119929963444e-05, + "loss": 1.4211, + "step": 11845 + }, + { + "epoch": 0.61, + "grad_norm": 0.5390625, + "learning_rate": 7.81530701459887e-05, + "loss": 1.4153, + "step": 11850 + }, + { + "epoch": 0.61, + "grad_norm": 0.49609375, + "learning_rate": 7.806495880715614e-05, + "loss": 1.4169, + "step": 11855 + }, + { + "epoch": 0.61, + "grad_norm": 0.5625, + "learning_rate": 7.797686535498611e-05, + "loss": 1.4031, + "step": 11860 + }, + { + "epoch": 0.61, + "grad_norm": 0.52734375, + "learning_rate": 7.788878986131331e-05, + "loss": 1.4202, + "step": 11865 + }, + { + "epoch": 0.61, + "grad_norm": 0.55078125, + "learning_rate": 7.780073239795787e-05, + "loss": 1.3808, + "step": 11870 + }, + { + "epoch": 0.61, + "grad_norm": 0.51953125, + "learning_rate": 7.771269303672513e-05, + "loss": 1.4002, + "step": 11875 + }, + { + "epoch": 0.61, + "grad_norm": 0.51953125, + "learning_rate": 7.762467184940574e-05, + "loss": 1.3846, + "step": 11880 + }, + { + "epoch": 0.61, + "grad_norm": 0.55859375, + "learning_rate": 7.75366689077755e-05, + "loss": 1.4054, + "step": 11885 + }, + { + "epoch": 0.62, + "grad_norm": 0.52734375, + "learning_rate": 7.744868428359536e-05, + "loss": 1.3909, + "step": 11890 + }, + { + "epoch": 0.62, + "grad_norm": 0.5234375, + "learning_rate": 7.736071804861127e-05, + "loss": 1.4172, + "step": 11895 + }, + { + "epoch": 0.62, + "grad_norm": 0.53125, + "learning_rate": 7.727277027455428e-05, + "loss": 1.3996, + "step": 11900 + }, + { + "epoch": 0.62, + "grad_norm": 0.52734375, + "learning_rate": 7.718484103314026e-05, + "loss": 1.4351, + "step": 11905 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.709693039607012e-05, + "loss": 1.3794, + "step": 11910 + }, + { + "epoch": 0.62, + "grad_norm": 0.52734375, + "learning_rate": 7.700903843502947e-05, + "loss": 1.3866, + "step": 11915 + }, + { + "epoch": 0.62, + "grad_norm": 0.5546875, + "learning_rate": 7.692116522168877e-05, + "loss": 1.428, + "step": 11920 + }, + { + "epoch": 0.62, + "grad_norm": 0.5703125, + "learning_rate": 7.683331082770311e-05, + "loss": 1.4656, + "step": 11925 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.674547532471235e-05, + "loss": 1.4387, + "step": 11930 + }, + { + "epoch": 0.62, + "grad_norm": 0.515625, + "learning_rate": 7.665765878434084e-05, + "loss": 1.4312, + "step": 11935 + }, + { + "epoch": 0.62, + "grad_norm": 0.5390625, + "learning_rate": 7.656986127819754e-05, + "loss": 1.4365, + "step": 11940 + }, + { + "epoch": 0.62, + "grad_norm": 0.62890625, + "learning_rate": 7.648208287787584e-05, + "loss": 1.3558, + "step": 11945 + }, + { + "epoch": 0.62, + "grad_norm": 0.51171875, + "learning_rate": 7.639432365495357e-05, + "loss": 1.3889, + "step": 11950 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.630658368099291e-05, + "loss": 1.422, + "step": 11955 + }, + { + "epoch": 0.62, + "grad_norm": 0.51171875, + "learning_rate": 7.62188630275404e-05, + "loss": 1.3943, + "step": 11960 + }, + { + "epoch": 0.62, + "grad_norm": 0.54296875, + "learning_rate": 7.613116176612672e-05, + "loss": 1.3815, + "step": 11965 + }, + { + "epoch": 0.62, + "grad_norm": 0.52734375, + "learning_rate": 7.604347996826682e-05, + "loss": 1.4084, + "step": 11970 + }, + { + "epoch": 0.62, + "grad_norm": 0.515625, + "learning_rate": 7.595581770545978e-05, + "loss": 1.4034, + "step": 11975 + }, + { + "epoch": 0.62, + "grad_norm": 0.54296875, + "learning_rate": 7.58681750491887e-05, + "loss": 1.412, + "step": 11980 + }, + { + "epoch": 0.62, + "grad_norm": 0.55078125, + "learning_rate": 7.578055207092071e-05, + "loss": 1.3972, + "step": 11985 + }, + { + "epoch": 0.62, + "grad_norm": 0.546875, + "learning_rate": 7.569294884210694e-05, + "loss": 1.3899, + "step": 11990 + }, + { + "epoch": 0.62, + "grad_norm": 0.55859375, + "learning_rate": 7.560536543418235e-05, + "loss": 1.4266, + "step": 11995 + }, + { + "epoch": 0.62, + "grad_norm": 0.54296875, + "learning_rate": 7.551780191856575e-05, + "loss": 1.4088, + "step": 12000 + }, + { + "epoch": 0.62, + "grad_norm": 0.55078125, + "learning_rate": 7.543025836665977e-05, + "loss": 1.4214, + "step": 12005 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.53427348498507e-05, + "loss": 1.4133, + "step": 12010 + }, + { + "epoch": 0.62, + "grad_norm": 0.51171875, + "learning_rate": 7.525523143950859e-05, + "loss": 1.3915, + "step": 12015 + }, + { + "epoch": 0.62, + "grad_norm": 0.546875, + "learning_rate": 7.516774820698695e-05, + "loss": 1.4307, + "step": 12020 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.5080285223623e-05, + "loss": 1.4384, + "step": 12025 + }, + { + "epoch": 0.62, + "grad_norm": 0.5234375, + "learning_rate": 7.499284256073731e-05, + "loss": 1.3982, + "step": 12030 + }, + { + "epoch": 0.62, + "grad_norm": 0.53515625, + "learning_rate": 7.490542028963396e-05, + "loss": 1.4055, + "step": 12035 + }, + { + "epoch": 0.62, + "grad_norm": 0.5546875, + "learning_rate": 7.481801848160035e-05, + "loss": 1.4069, + "step": 12040 + }, + { + "epoch": 0.62, + "grad_norm": 0.546875, + "learning_rate": 7.473063720790727e-05, + "loss": 1.3999, + "step": 12045 + }, + { + "epoch": 0.62, + "grad_norm": 0.5234375, + "learning_rate": 7.464327653980865e-05, + "loss": 1.4145, + "step": 12050 + }, + { + "epoch": 0.62, + "grad_norm": 0.546875, + "learning_rate": 7.455593654854176e-05, + "loss": 1.4143, + "step": 12055 + }, + { + "epoch": 0.62, + "grad_norm": 0.5234375, + "learning_rate": 7.446861730532688e-05, + "loss": 1.435, + "step": 12060 + }, + { + "epoch": 0.62, + "grad_norm": 0.546875, + "learning_rate": 7.438131888136746e-05, + "loss": 1.4205, + "step": 12065 + }, + { + "epoch": 0.62, + "grad_norm": 0.515625, + "learning_rate": 7.429404134784987e-05, + "loss": 1.389, + "step": 12070 + }, + { + "epoch": 0.62, + "grad_norm": 0.5390625, + "learning_rate": 7.420678477594361e-05, + "loss": 1.4084, + "step": 12075 + }, + { + "epoch": 0.62, + "grad_norm": 0.578125, + "learning_rate": 7.411954923680091e-05, + "loss": 1.4572, + "step": 12080 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.403233480155697e-05, + "loss": 1.3838, + "step": 12085 + }, + { + "epoch": 0.63, + "grad_norm": 0.5, + "learning_rate": 7.394514154132975e-05, + "loss": 1.3921, + "step": 12090 + }, + { + "epoch": 0.63, + "grad_norm": 0.515625, + "learning_rate": 7.385796952721991e-05, + "loss": 1.4163, + "step": 12095 + }, + { + "epoch": 0.63, + "grad_norm": 0.55078125, + "learning_rate": 7.377081883031079e-05, + "loss": 1.4066, + "step": 12100 + }, + { + "epoch": 0.63, + "grad_norm": 0.546875, + "learning_rate": 7.368368952166839e-05, + "loss": 1.43, + "step": 12105 + }, + { + "epoch": 0.63, + "grad_norm": 0.55859375, + "learning_rate": 7.359658167234125e-05, + "loss": 1.4091, + "step": 12110 + }, + { + "epoch": 0.63, + "grad_norm": 0.515625, + "learning_rate": 7.350949535336041e-05, + "loss": 1.3923, + "step": 12115 + }, + { + "epoch": 0.63, + "grad_norm": 0.53125, + "learning_rate": 7.342243063573932e-05, + "loss": 1.4047, + "step": 12120 + }, + { + "epoch": 0.63, + "grad_norm": 0.515625, + "learning_rate": 7.333538759047389e-05, + "loss": 1.3858, + "step": 12125 + }, + { + "epoch": 0.63, + "grad_norm": 0.53515625, + "learning_rate": 7.324836628854226e-05, + "loss": 1.407, + "step": 12130 + }, + { + "epoch": 0.63, + "grad_norm": 0.55078125, + "learning_rate": 7.316136680090494e-05, + "loss": 1.4015, + "step": 12135 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.307438919850456e-05, + "loss": 1.3805, + "step": 12140 + }, + { + "epoch": 0.63, + "grad_norm": 0.55859375, + "learning_rate": 7.298743355226599e-05, + "loss": 1.4403, + "step": 12145 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234375, + "learning_rate": 7.290049993309611e-05, + "loss": 1.4015, + "step": 12150 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234375, + "learning_rate": 7.281358841188392e-05, + "loss": 1.4074, + "step": 12155 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234375, + "learning_rate": 7.272669905950036e-05, + "loss": 1.409, + "step": 12160 + }, + { + "epoch": 0.63, + "grad_norm": 0.55078125, + "learning_rate": 7.263983194679827e-05, + "loss": 1.4018, + "step": 12165 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.25529871446124e-05, + "loss": 1.4238, + "step": 12170 + }, + { + "epoch": 0.63, + "grad_norm": 0.53125, + "learning_rate": 7.246616472375928e-05, + "loss": 1.4122, + "step": 12175 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.237936475503719e-05, + "loss": 1.4233, + "step": 12180 + }, + { + "epoch": 0.63, + "grad_norm": 0.55859375, + "learning_rate": 7.229258730922615e-05, + "loss": 1.4236, + "step": 12185 + }, + { + "epoch": 0.63, + "grad_norm": 0.53515625, + "learning_rate": 7.22058324570877e-05, + "loss": 1.3993, + "step": 12190 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.21191002693651e-05, + "loss": 1.4025, + "step": 12195 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.203239081678299e-05, + "loss": 1.4303, + "step": 12200 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.194570417004759e-05, + "loss": 1.3899, + "step": 12205 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.185904039984648e-05, + "loss": 1.4111, + "step": 12210 + }, + { + "epoch": 0.63, + "grad_norm": 0.51171875, + "learning_rate": 7.177239957684851e-05, + "loss": 1.3872, + "step": 12215 + }, + { + "epoch": 0.63, + "grad_norm": 0.54296875, + "learning_rate": 7.168578177170397e-05, + "loss": 1.3984, + "step": 12220 + }, + { + "epoch": 0.63, + "grad_norm": 0.55078125, + "learning_rate": 7.159918705504424e-05, + "loss": 1.4333, + "step": 12225 + }, + { + "epoch": 0.63, + "grad_norm": 0.52734375, + "learning_rate": 7.151261549748195e-05, + "loss": 1.4313, + "step": 12230 + }, + { + "epoch": 0.63, + "grad_norm": 0.5625, + "learning_rate": 7.14260671696108e-05, + "loss": 1.4067, + "step": 12235 + }, + { + "epoch": 0.63, + "grad_norm": 0.5078125, + "learning_rate": 7.13395421420056e-05, + "loss": 1.4108, + "step": 12240 + }, + { + "epoch": 0.63, + "grad_norm": 0.52734375, + "learning_rate": 7.125304048522211e-05, + "loss": 1.4413, + "step": 12245 + }, + { + "epoch": 0.63, + "grad_norm": 0.5078125, + "learning_rate": 7.116656226979708e-05, + "loss": 1.3683, + "step": 12250 + }, + { + "epoch": 0.63, + "grad_norm": 0.51171875, + "learning_rate": 7.108010756624808e-05, + "loss": 1.3989, + "step": 12255 + }, + { + "epoch": 0.63, + "grad_norm": 0.55078125, + "learning_rate": 7.099367644507357e-05, + "loss": 1.4072, + "step": 12260 + }, + { + "epoch": 0.63, + "grad_norm": 0.49609375, + "learning_rate": 7.090726897675277e-05, + "loss": 1.3872, + "step": 12265 + }, + { + "epoch": 0.63, + "grad_norm": 0.515625, + "learning_rate": 7.082088523174558e-05, + "loss": 1.4026, + "step": 12270 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 7.073452528049254e-05, + "loss": 1.3791, + "step": 12275 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 7.06481891934149e-05, + "loss": 1.3941, + "step": 12280 + }, + { + "epoch": 0.64, + "grad_norm": 0.546875, + "learning_rate": 7.056187704091434e-05, + "loss": 1.3936, + "step": 12285 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 7.047558889337302e-05, + "loss": 1.4152, + "step": 12290 + }, + { + "epoch": 0.64, + "grad_norm": 0.5625, + "learning_rate": 7.03893248211536e-05, + "loss": 1.4064, + "step": 12295 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 7.030308489459904e-05, + "loss": 1.394, + "step": 12300 + }, + { + "epoch": 0.64, + "grad_norm": 0.53125, + "learning_rate": 7.021686918403266e-05, + "loss": 1.4682, + "step": 12305 + }, + { + "epoch": 0.64, + "grad_norm": 0.56640625, + "learning_rate": 7.013067775975799e-05, + "loss": 1.415, + "step": 12310 + }, + { + "epoch": 0.64, + "grad_norm": 0.6484375, + "learning_rate": 7.004451069205881e-05, + "loss": 1.3852, + "step": 12315 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 6.995836805119897e-05, + "loss": 1.4032, + "step": 12320 + }, + { + "epoch": 0.64, + "grad_norm": 0.55859375, + "learning_rate": 6.98722499074225e-05, + "loss": 1.4187, + "step": 12325 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.978615633095331e-05, + "loss": 1.4065, + "step": 12330 + }, + { + "epoch": 0.64, + "grad_norm": 0.53125, + "learning_rate": 6.970008739199543e-05, + "loss": 1.418, + "step": 12335 + }, + { + "epoch": 0.64, + "grad_norm": 0.5859375, + "learning_rate": 6.961404316073267e-05, + "loss": 1.4124, + "step": 12340 + }, + { + "epoch": 0.64, + "grad_norm": 0.5234375, + "learning_rate": 6.95280237073288e-05, + "loss": 1.4179, + "step": 12345 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 6.944202910192732e-05, + "loss": 1.4075, + "step": 12350 + }, + { + "epoch": 0.64, + "grad_norm": 0.5703125, + "learning_rate": 6.93560594146515e-05, + "loss": 1.367, + "step": 12355 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 6.927011471560422e-05, + "loss": 1.4334, + "step": 12360 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 6.918419507486813e-05, + "loss": 1.4224, + "step": 12365 + }, + { + "epoch": 0.64, + "grad_norm": 0.498046875, + "learning_rate": 6.909830056250527e-05, + "loss": 1.3795, + "step": 12370 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.901243124855733e-05, + "loss": 1.3588, + "step": 12375 + }, + { + "epoch": 0.64, + "grad_norm": 0.51953125, + "learning_rate": 6.892658720304535e-05, + "loss": 1.4085, + "step": 12380 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 6.884076849596988e-05, + "loss": 1.3963, + "step": 12385 + }, + { + "epoch": 0.64, + "grad_norm": 0.515625, + "learning_rate": 6.875497519731067e-05, + "loss": 1.3857, + "step": 12390 + }, + { + "epoch": 0.64, + "grad_norm": 0.53125, + "learning_rate": 6.866920737702688e-05, + "loss": 1.4228, + "step": 12395 + }, + { + "epoch": 0.64, + "grad_norm": 0.515625, + "learning_rate": 6.858346510505678e-05, + "loss": 1.3961, + "step": 12400 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.849774845131791e-05, + "loss": 1.425, + "step": 12405 + }, + { + "epoch": 0.64, + "grad_norm": 0.546875, + "learning_rate": 6.841205748570685e-05, + "loss": 1.3729, + "step": 12410 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.832639227809927e-05, + "loss": 1.4257, + "step": 12415 + }, + { + "epoch": 0.64, + "grad_norm": 0.51953125, + "learning_rate": 6.82407528983498e-05, + "loss": 1.3877, + "step": 12420 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.815513941629204e-05, + "loss": 1.4204, + "step": 12425 + }, + { + "epoch": 0.64, + "grad_norm": 0.5078125, + "learning_rate": 6.806955190173848e-05, + "loss": 1.431, + "step": 12430 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.798399042448039e-05, + "loss": 1.3797, + "step": 12435 + }, + { + "epoch": 0.64, + "grad_norm": 0.53515625, + "learning_rate": 6.789845505428782e-05, + "loss": 1.4246, + "step": 12440 + }, + { + "epoch": 0.64, + "grad_norm": 0.515625, + "learning_rate": 6.781294586090962e-05, + "loss": 1.3974, + "step": 12445 + }, + { + "epoch": 0.64, + "grad_norm": 0.54296875, + "learning_rate": 6.772746291407315e-05, + "loss": 1.4657, + "step": 12450 + }, + { + "epoch": 0.64, + "grad_norm": 0.51171875, + "learning_rate": 6.764200628348449e-05, + "loss": 1.3938, + "step": 12455 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 6.755657603882816e-05, + "loss": 1.3811, + "step": 12460 + }, + { + "epoch": 0.64, + "grad_norm": 0.56640625, + "learning_rate": 6.747117224976726e-05, + "loss": 1.3983, + "step": 12465 + }, + { + "epoch": 0.65, + "grad_norm": 0.54296875, + "learning_rate": 6.738579498594322e-05, + "loss": 1.4247, + "step": 12470 + }, + { + "epoch": 0.65, + "grad_norm": 0.50390625, + "learning_rate": 6.730044431697595e-05, + "loss": 1.3805, + "step": 12475 + }, + { + "epoch": 0.65, + "grad_norm": 0.5390625, + "learning_rate": 6.721512031246358e-05, + "loss": 1.395, + "step": 12480 + }, + { + "epoch": 0.65, + "grad_norm": 0.5078125, + "learning_rate": 6.712982304198254e-05, + "loss": 1.417, + "step": 12485 + }, + { + "epoch": 0.65, + "grad_norm": 0.53515625, + "learning_rate": 6.704455257508743e-05, + "loss": 1.4102, + "step": 12490 + }, + { + "epoch": 0.65, + "grad_norm": 0.55078125, + "learning_rate": 6.695930898131107e-05, + "loss": 1.4234, + "step": 12495 + }, + { + "epoch": 0.65, + "grad_norm": 0.53515625, + "learning_rate": 6.687409233016422e-05, + "loss": 1.4232, + "step": 12500 + }, + { + "epoch": 0.65, + "grad_norm": 0.53515625, + "learning_rate": 6.678890269113587e-05, + "loss": 1.396, + "step": 12505 + }, + { + "epoch": 0.65, + "grad_norm": 0.5546875, + "learning_rate": 6.670374013369279e-05, + "loss": 1.4411, + "step": 12510 + }, + { + "epoch": 0.65, + "grad_norm": 0.515625, + "learning_rate": 6.661860472727981e-05, + "loss": 1.4217, + "step": 12515 + }, + { + "epoch": 0.65, + "grad_norm": 0.5390625, + "learning_rate": 6.65334965413195e-05, + "loss": 1.4315, + "step": 12520 + }, + { + "epoch": 0.65, + "grad_norm": 0.51171875, + "learning_rate": 6.644841564521237e-05, + "loss": 1.4366, + "step": 12525 + }, + { + "epoch": 0.65, + "grad_norm": 0.5625, + "learning_rate": 6.636336210833654e-05, + "loss": 1.4163, + "step": 12530 + }, + { + "epoch": 0.65, + "grad_norm": 0.53125, + "learning_rate": 6.627833600004791e-05, + "loss": 1.4071, + "step": 12535 + }, + { + "epoch": 0.65, + "grad_norm": 0.515625, + "learning_rate": 6.619333738967996e-05, + "loss": 1.3856, + "step": 12540 + }, + { + "epoch": 0.65, + "grad_norm": 0.58984375, + "learning_rate": 6.610836634654382e-05, + "loss": 1.441, + "step": 12545 + }, + { + "epoch": 0.65, + "grad_norm": 0.546875, + "learning_rate": 6.602342293992805e-05, + "loss": 1.409, + "step": 12550 + }, + { + "epoch": 0.65, + "grad_norm": 0.53515625, + "learning_rate": 6.593850723909875e-05, + "loss": 1.3984, + "step": 12555 + }, + { + "epoch": 0.65, + "grad_norm": 0.486328125, + "learning_rate": 6.585361931329937e-05, + "loss": 1.3951, + "step": 12560 + }, + { + "epoch": 0.65, + "grad_norm": 0.51953125, + "learning_rate": 6.576875923175075e-05, + "loss": 1.4047, + "step": 12565 + }, + { + "epoch": 0.65, + "grad_norm": 0.515625, + "learning_rate": 6.568392706365099e-05, + "loss": 1.3699, + "step": 12570 + }, + { + "epoch": 0.65, + "grad_norm": 0.51171875, + "learning_rate": 6.559912287817547e-05, + "loss": 1.3845, + "step": 12575 + }, + { + "epoch": 0.65, + "grad_norm": 0.5703125, + "learning_rate": 6.551434674447676e-05, + "loss": 1.3977, + "step": 12580 + }, + { + "epoch": 0.65, + "grad_norm": 0.5546875, + "learning_rate": 6.542959873168446e-05, + "loss": 1.4172, + "step": 12585 + }, + { + "epoch": 0.65, + "grad_norm": 0.51171875, + "learning_rate": 6.534487890890536e-05, + "loss": 1.4026, + "step": 12590 + }, + { + "epoch": 0.65, + "grad_norm": 0.53125, + "learning_rate": 6.526018734522317e-05, + "loss": 1.4244, + "step": 12595 + }, + { + "epoch": 0.65, + "grad_norm": 0.57421875, + "learning_rate": 6.517552410969863e-05, + "loss": 1.4092, + "step": 12600 + }, + { + "epoch": 0.65, + "grad_norm": 0.5625, + "learning_rate": 6.50908892713693e-05, + "loss": 1.4277, + "step": 12605 + }, + { + "epoch": 0.65, + "grad_norm": 0.51953125, + "learning_rate": 6.50062828992497e-05, + "loss": 1.4094, + "step": 12610 + }, + { + "epoch": 0.65, + "grad_norm": 0.546875, + "learning_rate": 6.4921705062331e-05, + "loss": 1.4182, + "step": 12615 + }, + { + "epoch": 0.65, + "grad_norm": 0.54296875, + "learning_rate": 6.48371558295812e-05, + "loss": 1.4341, + "step": 12620 + }, + { + "epoch": 0.65, + "grad_norm": 0.5390625, + "learning_rate": 6.475263526994494e-05, + "loss": 1.3804, + "step": 12625 + }, + { + "epoch": 0.65, + "grad_norm": 0.5546875, + "learning_rate": 6.466814345234348e-05, + "loss": 1.4144, + "step": 12630 + }, + { + "epoch": 0.65, + "grad_norm": 0.61328125, + "learning_rate": 6.458368044567466e-05, + "loss": 1.4298, + "step": 12635 + }, + { + "epoch": 0.65, + "grad_norm": 0.55078125, + "learning_rate": 6.449924631881277e-05, + "loss": 1.3918, + "step": 12640 + }, + { + "epoch": 0.65, + "grad_norm": 0.5234375, + "learning_rate": 6.441484114060865e-05, + "loss": 1.4225, + "step": 12645 + }, + { + "epoch": 0.65, + "grad_norm": 0.578125, + "learning_rate": 6.43304649798894e-05, + "loss": 1.4302, + "step": 12650 + }, + { + "epoch": 0.65, + "grad_norm": 0.53515625, + "learning_rate": 6.424611790545862e-05, + "loss": 1.4131, + "step": 12655 + }, + { + "epoch": 0.65, + "grad_norm": 0.53125, + "learning_rate": 6.416179998609604e-05, + "loss": 1.4137, + "step": 12660 + }, + { + "epoch": 0.66, + "grad_norm": 0.51953125, + "learning_rate": 6.407751129055772e-05, + "loss": 1.384, + "step": 12665 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 6.399325188757583e-05, + "loss": 1.4397, + "step": 12670 + }, + { + "epoch": 0.66, + "grad_norm": 0.53515625, + "learning_rate": 6.390902184585869e-05, + "loss": 1.4054, + "step": 12675 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 6.382482123409064e-05, + "loss": 1.4166, + "step": 12680 + }, + { + "epoch": 0.66, + "grad_norm": 0.5625, + "learning_rate": 6.374065012093206e-05, + "loss": 1.4157, + "step": 12685 + }, + { + "epoch": 0.66, + "grad_norm": 0.54296875, + "learning_rate": 6.365650857501926e-05, + "loss": 1.4305, + "step": 12690 + }, + { + "epoch": 0.66, + "grad_norm": 0.5, + "learning_rate": 6.357239666496446e-05, + "loss": 1.3844, + "step": 12695 + }, + { + "epoch": 0.66, + "grad_norm": 0.55859375, + "learning_rate": 6.348831445935566e-05, + "loss": 1.444, + "step": 12700 + }, + { + "epoch": 0.66, + "grad_norm": 0.55078125, + "learning_rate": 6.340426202675669e-05, + "loss": 1.45, + "step": 12705 + }, + { + "epoch": 0.66, + "grad_norm": 0.51953125, + "learning_rate": 6.332023943570706e-05, + "loss": 1.4239, + "step": 12710 + }, + { + "epoch": 0.66, + "grad_norm": 0.55859375, + "learning_rate": 6.323624675472202e-05, + "loss": 1.4083, + "step": 12715 + }, + { + "epoch": 0.66, + "grad_norm": 0.55078125, + "learning_rate": 6.315228405229232e-05, + "loss": 1.4053, + "step": 12720 + }, + { + "epoch": 0.66, + "grad_norm": 0.54296875, + "learning_rate": 6.306835139688438e-05, + "loss": 1.4305, + "step": 12725 + }, + { + "epoch": 0.66, + "grad_norm": 0.51953125, + "learning_rate": 6.298444885694001e-05, + "loss": 1.3939, + "step": 12730 + }, + { + "epoch": 0.66, + "grad_norm": 0.51953125, + "learning_rate": 6.290057650087656e-05, + "loss": 1.3813, + "step": 12735 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 6.281673439708668e-05, + "loss": 1.4162, + "step": 12740 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 6.273292261393846e-05, + "loss": 1.4266, + "step": 12745 + }, + { + "epoch": 0.66, + "grad_norm": 0.546875, + "learning_rate": 6.264914121977512e-05, + "loss": 1.417, + "step": 12750 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 6.256539028291523e-05, + "loss": 1.4123, + "step": 12755 + }, + { + "epoch": 0.66, + "grad_norm": 0.53515625, + "learning_rate": 6.248166987165247e-05, + "loss": 1.4137, + "step": 12760 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 6.239798005425561e-05, + "loss": 1.4262, + "step": 12765 + }, + { + "epoch": 0.66, + "grad_norm": 0.56640625, + "learning_rate": 6.231432089896848e-05, + "loss": 1.4328, + "step": 12770 + }, + { + "epoch": 0.66, + "grad_norm": 0.53125, + "learning_rate": 6.223069247400998e-05, + "loss": 1.4066, + "step": 12775 + }, + { + "epoch": 0.66, + "grad_norm": 0.57421875, + "learning_rate": 6.214709484757382e-05, + "loss": 1.4116, + "step": 12780 + }, + { + "epoch": 0.66, + "grad_norm": 0.53515625, + "learning_rate": 6.206352808782873e-05, + "loss": 1.3878, + "step": 12785 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 6.197999226291816e-05, + "loss": 1.4295, + "step": 12790 + }, + { + "epoch": 0.66, + "grad_norm": 0.546875, + "learning_rate": 6.189648744096043e-05, + "loss": 1.4139, + "step": 12795 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 6.181301369004847e-05, + "loss": 1.4571, + "step": 12800 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 6.172957107824999e-05, + "loss": 1.4152, + "step": 12805 + }, + { + "epoch": 0.66, + "grad_norm": 0.52734375, + "learning_rate": 6.164615967360723e-05, + "loss": 1.4134, + "step": 12810 + }, + { + "epoch": 0.66, + "grad_norm": 0.5078125, + "learning_rate": 6.156277954413701e-05, + "loss": 1.3994, + "step": 12815 + }, + { + "epoch": 0.66, + "grad_norm": 0.55078125, + "learning_rate": 6.147943075783062e-05, + "loss": 1.4376, + "step": 12820 + }, + { + "epoch": 0.66, + "grad_norm": 0.5390625, + "learning_rate": 6.139611338265386e-05, + "loss": 1.4031, + "step": 12825 + }, + { + "epoch": 0.66, + "grad_norm": 0.53125, + "learning_rate": 6.131282748654681e-05, + "loss": 1.362, + "step": 12830 + }, + { + "epoch": 0.66, + "grad_norm": 0.53125, + "learning_rate": 6.1229573137424e-05, + "loss": 1.4034, + "step": 12835 + }, + { + "epoch": 0.66, + "grad_norm": 0.4921875, + "learning_rate": 6.114635040317414e-05, + "loss": 1.3764, + "step": 12840 + }, + { + "epoch": 0.66, + "grad_norm": 0.53515625, + "learning_rate": 6.10631593516602e-05, + "loss": 1.4128, + "step": 12845 + }, + { + "epoch": 0.66, + "grad_norm": 0.55859375, + "learning_rate": 6.098000005071933e-05, + "loss": 1.4145, + "step": 12850 + }, + { + "epoch": 0.67, + "grad_norm": 0.5546875, + "learning_rate": 6.089687256816276e-05, + "loss": 1.4306, + "step": 12855 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 6.081377697177576e-05, + "loss": 1.4322, + "step": 12860 + }, + { + "epoch": 0.67, + "grad_norm": 0.50390625, + "learning_rate": 6.073071332931768e-05, + "loss": 1.4077, + "step": 12865 + }, + { + "epoch": 0.67, + "grad_norm": 0.5625, + "learning_rate": 6.064768170852169e-05, + "loss": 1.3876, + "step": 12870 + }, + { + "epoch": 0.67, + "grad_norm": 0.53515625, + "learning_rate": 6.0564682177094976e-05, + "loss": 1.4286, + "step": 12875 + }, + { + "epoch": 0.67, + "grad_norm": 0.5234375, + "learning_rate": 6.048171480271847e-05, + "loss": 1.4198, + "step": 12880 + }, + { + "epoch": 0.67, + "grad_norm": 0.51171875, + "learning_rate": 6.0398779653046876e-05, + "loss": 1.4129, + "step": 12885 + }, + { + "epoch": 0.67, + "grad_norm": 0.546875, + "learning_rate": 6.031587679570869e-05, + "loss": 1.3972, + "step": 12890 + }, + { + "epoch": 0.67, + "grad_norm": 0.5546875, + "learning_rate": 6.0233006298306024e-05, + "loss": 1.3879, + "step": 12895 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 6.015016822841465e-05, + "loss": 1.4311, + "step": 12900 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 6.006736265358381e-05, + "loss": 1.4233, + "step": 12905 + }, + { + "epoch": 0.67, + "grad_norm": 0.51953125, + "learning_rate": 5.9984589641336354e-05, + "loss": 1.4113, + "step": 12910 + }, + { + "epoch": 0.67, + "grad_norm": 0.5234375, + "learning_rate": 5.9901849259168484e-05, + "loss": 1.4208, + "step": 12915 + }, + { + "epoch": 0.67, + "grad_norm": 0.546875, + "learning_rate": 5.981914157454988e-05, + "loss": 1.4341, + "step": 12920 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 5.9736466654923476e-05, + "loss": 1.3909, + "step": 12925 + }, + { + "epoch": 0.67, + "grad_norm": 0.515625, + "learning_rate": 5.9653824567705564e-05, + "loss": 1.3685, + "step": 12930 + }, + { + "epoch": 0.67, + "grad_norm": 0.50390625, + "learning_rate": 5.9571215380285604e-05, + "loss": 1.4367, + "step": 12935 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 5.9488639160026274e-05, + "loss": 1.4329, + "step": 12940 + }, + { + "epoch": 0.67, + "grad_norm": 0.57421875, + "learning_rate": 5.940609597426332e-05, + "loss": 1.4143, + "step": 12945 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 5.932358589030562e-05, + "loss": 1.407, + "step": 12950 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 5.9241108975434976e-05, + "loss": 1.4059, + "step": 12955 + }, + { + "epoch": 0.67, + "grad_norm": 0.55859375, + "learning_rate": 5.9158665296906235e-05, + "loss": 1.4186, + "step": 12960 + }, + { + "epoch": 0.67, + "grad_norm": 0.53515625, + "learning_rate": 5.9076254921947024e-05, + "loss": 1.4117, + "step": 12965 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 5.899387791775794e-05, + "loss": 1.3814, + "step": 12970 + }, + { + "epoch": 0.67, + "grad_norm": 0.53515625, + "learning_rate": 5.8911534351512276e-05, + "loss": 1.4122, + "step": 12975 + }, + { + "epoch": 0.67, + "grad_norm": 0.53515625, + "learning_rate": 5.882922429035611e-05, + "loss": 1.4037, + "step": 12980 + }, + { + "epoch": 0.67, + "grad_norm": 0.5546875, + "learning_rate": 5.874694780140817e-05, + "loss": 1.435, + "step": 12985 + }, + { + "epoch": 0.67, + "grad_norm": 0.55078125, + "learning_rate": 5.866470495175982e-05, + "loss": 1.4324, + "step": 12990 + }, + { + "epoch": 0.67, + "grad_norm": 0.55078125, + "learning_rate": 5.858249580847499e-05, + "loss": 1.4104, + "step": 12995 + }, + { + "epoch": 0.67, + "grad_norm": 0.54296875, + "learning_rate": 5.850032043859013e-05, + "loss": 1.411, + "step": 13000 + }, + { + "epoch": 0.67, + "grad_norm": 0.51953125, + "learning_rate": 5.841817890911413e-05, + "loss": 1.4082, + "step": 13005 + }, + { + "epoch": 0.67, + "grad_norm": 0.55078125, + "learning_rate": 5.8336071287028315e-05, + "loss": 1.4199, + "step": 13010 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 5.825399763928634e-05, + "loss": 1.4118, + "step": 13015 + }, + { + "epoch": 0.67, + "grad_norm": 0.52734375, + "learning_rate": 5.817195803281421e-05, + "loss": 1.3896, + "step": 13020 + }, + { + "epoch": 0.67, + "grad_norm": 0.55078125, + "learning_rate": 5.808995253451006e-05, + "loss": 1.4292, + "step": 13025 + }, + { + "epoch": 0.67, + "grad_norm": 0.51953125, + "learning_rate": 5.8007981211244276e-05, + "loss": 1.4138, + "step": 13030 + }, + { + "epoch": 0.67, + "grad_norm": 0.515625, + "learning_rate": 5.79260441298594e-05, + "loss": 1.3991, + "step": 13035 + }, + { + "epoch": 0.67, + "grad_norm": 0.51171875, + "learning_rate": 5.7844141357170087e-05, + "loss": 1.4028, + "step": 13040 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 5.776227295996284e-05, + "loss": 1.3874, + "step": 13045 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.768043900499631e-05, + "loss": 1.4333, + "step": 13050 + }, + { + "epoch": 0.68, + "grad_norm": 0.52734375, + "learning_rate": 5.759863955900099e-05, + "loss": 1.4307, + "step": 13055 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.751687468867929e-05, + "loss": 1.4232, + "step": 13060 + }, + { + "epoch": 0.68, + "grad_norm": 0.55078125, + "learning_rate": 5.74351444607053e-05, + "loss": 1.4245, + "step": 13065 + }, + { + "epoch": 0.68, + "grad_norm": 0.54296875, + "learning_rate": 5.7353448941724966e-05, + "loss": 1.3847, + "step": 13070 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 5.727178819835592e-05, + "loss": 1.4309, + "step": 13075 + }, + { + "epoch": 0.68, + "grad_norm": 0.55078125, + "learning_rate": 5.7190162297187475e-05, + "loss": 1.4215, + "step": 13080 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.7108571304780355e-05, + "loss": 1.4169, + "step": 13085 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390625, + "learning_rate": 5.702701528766703e-05, + "loss": 1.4033, + "step": 13090 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.694549431235133e-05, + "loss": 1.4193, + "step": 13095 + }, + { + "epoch": 0.68, + "grad_norm": 0.546875, + "learning_rate": 5.6864008445308603e-05, + "loss": 1.4367, + "step": 13100 + }, + { + "epoch": 0.68, + "grad_norm": 0.52734375, + "learning_rate": 5.678255775298542e-05, + "loss": 1.4123, + "step": 13105 + }, + { + "epoch": 0.68, + "grad_norm": 0.56640625, + "learning_rate": 5.6701142301799784e-05, + "loss": 1.4158, + "step": 13110 + }, + { + "epoch": 0.68, + "grad_norm": 0.52734375, + "learning_rate": 5.6619762158140955e-05, + "loss": 1.4283, + "step": 13115 + }, + { + "epoch": 0.68, + "grad_norm": 0.55859375, + "learning_rate": 5.6538417388369404e-05, + "loss": 1.4124, + "step": 13120 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 5.6457108058816674e-05, + "loss": 1.4147, + "step": 13125 + }, + { + "epoch": 0.68, + "grad_norm": 0.55078125, + "learning_rate": 5.6375834235785495e-05, + "loss": 1.4181, + "step": 13130 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.62945959855496e-05, + "loss": 1.3914, + "step": 13135 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390625, + "learning_rate": 5.6213393374353814e-05, + "loss": 1.4017, + "step": 13140 + }, + { + "epoch": 0.68, + "grad_norm": 0.51171875, + "learning_rate": 5.6132226468413715e-05, + "loss": 1.3974, + "step": 13145 + }, + { + "epoch": 0.68, + "grad_norm": 0.55078125, + "learning_rate": 5.60510953339159e-05, + "loss": 1.4176, + "step": 13150 + }, + { + "epoch": 0.68, + "grad_norm": 0.52734375, + "learning_rate": 5.597000003701779e-05, + "loss": 1.4182, + "step": 13155 + }, + { + "epoch": 0.68, + "grad_norm": 0.5703125, + "learning_rate": 5.5888940643847574e-05, + "loss": 1.4126, + "step": 13160 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.580791722050408e-05, + "loss": 1.41, + "step": 13165 + }, + { + "epoch": 0.68, + "grad_norm": 0.55078125, + "learning_rate": 5.5726929833056954e-05, + "loss": 1.3945, + "step": 13170 + }, + { + "epoch": 0.68, + "grad_norm": 0.51953125, + "learning_rate": 5.5645978547546284e-05, + "loss": 1.3876, + "step": 13175 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390625, + "learning_rate": 5.5565063429982865e-05, + "loss": 1.4207, + "step": 13180 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.5484184546347983e-05, + "loss": 1.4061, + "step": 13185 + }, + { + "epoch": 0.68, + "grad_norm": 0.546875, + "learning_rate": 5.540334196259326e-05, + "loss": 1.3876, + "step": 13190 + }, + { + "epoch": 0.68, + "grad_norm": 0.56640625, + "learning_rate": 5.532253574464083e-05, + "loss": 1.4197, + "step": 13195 + }, + { + "epoch": 0.68, + "grad_norm": 0.53515625, + "learning_rate": 5.5241765958383154e-05, + "loss": 1.4209, + "step": 13200 + }, + { + "epoch": 0.68, + "grad_norm": 0.53515625, + "learning_rate": 5.516103266968299e-05, + "loss": 1.3889, + "step": 13205 + }, + { + "epoch": 0.68, + "grad_norm": 0.53515625, + "learning_rate": 5.508033594437325e-05, + "loss": 1.4103, + "step": 13210 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.4999675848257147e-05, + "loss": 1.3954, + "step": 13215 + }, + { + "epoch": 0.68, + "grad_norm": 0.55859375, + "learning_rate": 5.491905244710796e-05, + "loss": 1.4404, + "step": 13220 + }, + { + "epoch": 0.68, + "grad_norm": 0.54296875, + "learning_rate": 5.48384658066691e-05, + "loss": 1.3895, + "step": 13225 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390625, + "learning_rate": 5.47579159926539e-05, + "loss": 1.4484, + "step": 13230 + }, + { + "epoch": 0.68, + "grad_norm": 0.54296875, + "learning_rate": 5.467740307074574e-05, + "loss": 1.4115, + "step": 13235 + }, + { + "epoch": 0.68, + "grad_norm": 0.53125, + "learning_rate": 5.459692710659792e-05, + "loss": 1.4206, + "step": 13240 + }, + { + "epoch": 0.69, + "grad_norm": 0.55078125, + "learning_rate": 5.451648816583362e-05, + "loss": 1.429, + "step": 13245 + }, + { + "epoch": 0.69, + "grad_norm": 0.5546875, + "learning_rate": 5.443608631404573e-05, + "loss": 1.4297, + "step": 13250 + }, + { + "epoch": 0.69, + "grad_norm": 0.53515625, + "learning_rate": 5.435572161679698e-05, + "loss": 1.3839, + "step": 13255 + }, + { + "epoch": 0.69, + "grad_norm": 0.5390625, + "learning_rate": 5.42753941396198e-05, + "loss": 1.3952, + "step": 13260 + }, + { + "epoch": 0.69, + "grad_norm": 0.55078125, + "learning_rate": 5.419510394801628e-05, + "loss": 1.3986, + "step": 13265 + }, + { + "epoch": 0.69, + "grad_norm": 0.51953125, + "learning_rate": 5.411485110745802e-05, + "loss": 1.4064, + "step": 13270 + }, + { + "epoch": 0.69, + "grad_norm": 0.5234375, + "learning_rate": 5.4034635683386245e-05, + "loss": 1.4064, + "step": 13275 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.395445774121166e-05, + "loss": 1.3951, + "step": 13280 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.387431734631443e-05, + "loss": 1.4237, + "step": 13285 + }, + { + "epoch": 0.69, + "grad_norm": 0.515625, + "learning_rate": 5.379421456404397e-05, + "loss": 1.4112, + "step": 13290 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.371414945971918e-05, + "loss": 1.4125, + "step": 13295 + }, + { + "epoch": 0.69, + "grad_norm": 0.578125, + "learning_rate": 5.3634122098628146e-05, + "loss": 1.4689, + "step": 13300 + }, + { + "epoch": 0.69, + "grad_norm": 0.5625, + "learning_rate": 5.3554132546028294e-05, + "loss": 1.3824, + "step": 13305 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.3474180867146004e-05, + "loss": 1.4219, + "step": 13310 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.339426712717697e-05, + "loss": 1.4107, + "step": 13315 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.331439139128587e-05, + "loss": 1.4263, + "step": 13320 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.323455372460644e-05, + "loss": 1.3911, + "step": 13325 + }, + { + "epoch": 0.69, + "grad_norm": 0.55078125, + "learning_rate": 5.315475419224124e-05, + "loss": 1.4176, + "step": 13330 + }, + { + "epoch": 0.69, + "grad_norm": 0.55078125, + "learning_rate": 5.3074992859261895e-05, + "loss": 1.4085, + "step": 13335 + }, + { + "epoch": 0.69, + "grad_norm": 0.578125, + "learning_rate": 5.299526979070879e-05, + "loss": 1.3893, + "step": 13340 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.2915585051591196e-05, + "loss": 1.3686, + "step": 13345 + }, + { + "epoch": 0.69, + "grad_norm": 0.53125, + "learning_rate": 5.2835938706886966e-05, + "loss": 1.3746, + "step": 13350 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.275633082154279e-05, + "loss": 1.3683, + "step": 13355 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.2676761460473934e-05, + "loss": 1.4154, + "step": 13360 + }, + { + "epoch": 0.69, + "grad_norm": 0.77734375, + "learning_rate": 5.259723068856434e-05, + "loss": 1.387, + "step": 13365 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.251773857066629e-05, + "loss": 1.3847, + "step": 13370 + }, + { + "epoch": 0.69, + "grad_norm": 0.5078125, + "learning_rate": 5.243828517160072e-05, + "loss": 1.3706, + "step": 13375 + }, + { + "epoch": 0.69, + "grad_norm": 0.5234375, + "learning_rate": 5.235887055615696e-05, + "loss": 1.4202, + "step": 13380 + }, + { + "epoch": 0.69, + "grad_norm": 0.51171875, + "learning_rate": 5.227949478909265e-05, + "loss": 1.3961, + "step": 13385 + }, + { + "epoch": 0.69, + "grad_norm": 0.5546875, + "learning_rate": 5.2200157935133865e-05, + "loss": 1.4123, + "step": 13390 + }, + { + "epoch": 0.69, + "grad_norm": 0.53515625, + "learning_rate": 5.2120860058974786e-05, + "loss": 1.4374, + "step": 13395 + }, + { + "epoch": 0.69, + "grad_norm": 0.5546875, + "learning_rate": 5.204160122527795e-05, + "loss": 1.3853, + "step": 13400 + }, + { + "epoch": 0.69, + "grad_norm": 0.54296875, + "learning_rate": 5.196238149867398e-05, + "loss": 1.4036, + "step": 13405 + }, + { + "epoch": 0.69, + "grad_norm": 0.56640625, + "learning_rate": 5.188320094376172e-05, + "loss": 1.4003, + "step": 13410 + }, + { + "epoch": 0.69, + "grad_norm": 0.546875, + "learning_rate": 5.180405962510789e-05, + "loss": 1.4341, + "step": 13415 + }, + { + "epoch": 0.69, + "grad_norm": 0.52734375, + "learning_rate": 5.172495760724736e-05, + "loss": 1.4119, + "step": 13420 + }, + { + "epoch": 0.69, + "grad_norm": 0.51171875, + "learning_rate": 5.1645894954682896e-05, + "loss": 1.3961, + "step": 13425 + }, + { + "epoch": 0.69, + "grad_norm": 0.52734375, + "learning_rate": 5.156687173188521e-05, + "loss": 1.3973, + "step": 13430 + }, + { + "epoch": 0.7, + "grad_norm": 0.5546875, + "learning_rate": 5.148788800329278e-05, + "loss": 1.411, + "step": 13435 + }, + { + "epoch": 0.7, + "grad_norm": 0.51953125, + "learning_rate": 5.140894383331196e-05, + "loss": 1.4306, + "step": 13440 + }, + { + "epoch": 0.7, + "grad_norm": 0.515625, + "learning_rate": 5.133003928631679e-05, + "loss": 1.3455, + "step": 13445 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 5.1251174426649076e-05, + "loss": 1.4141, + "step": 13450 + }, + { + "epoch": 0.7, + "grad_norm": 0.55078125, + "learning_rate": 5.117234931861813e-05, + "loss": 1.3502, + "step": 13455 + }, + { + "epoch": 0.7, + "grad_norm": 0.5234375, + "learning_rate": 5.109356402650096e-05, + "loss": 1.4024, + "step": 13460 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 5.1014818614542116e-05, + "loss": 1.415, + "step": 13465 + }, + { + "epoch": 0.7, + "grad_norm": 0.52734375, + "learning_rate": 5.0936113146953525e-05, + "loss": 1.4265, + "step": 13470 + }, + { + "epoch": 0.7, + "grad_norm": 0.515625, + "learning_rate": 5.085744768791465e-05, + "loss": 1.3924, + "step": 13475 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 5.0778822301572226e-05, + "loss": 1.3742, + "step": 13480 + }, + { + "epoch": 0.7, + "grad_norm": 0.5234375, + "learning_rate": 5.070023705204041e-05, + "loss": 1.4227, + "step": 13485 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 5.062169200340058e-05, + "loss": 1.4204, + "step": 13490 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 5.054318721970137e-05, + "loss": 1.3805, + "step": 13495 + }, + { + "epoch": 0.7, + "grad_norm": 0.54296875, + "learning_rate": 5.046472276495848e-05, + "loss": 1.3879, + "step": 13500 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 5.038629870315486e-05, + "loss": 1.3919, + "step": 13505 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390625, + "learning_rate": 5.030791509824041e-05, + "loss": 1.4157, + "step": 13510 + }, + { + "epoch": 0.7, + "grad_norm": 0.5546875, + "learning_rate": 5.0229572014132156e-05, + "loss": 1.4404, + "step": 13515 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 5.0151269514713927e-05, + "loss": 1.4115, + "step": 13520 + }, + { + "epoch": 0.7, + "grad_norm": 0.5546875, + "learning_rate": 5.007300766383659e-05, + "loss": 1.4164, + "step": 13525 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 4.999478652531782e-05, + "loss": 1.3713, + "step": 13530 + }, + { + "epoch": 0.7, + "grad_norm": 0.55078125, + "learning_rate": 4.99166061629421e-05, + "loss": 1.3609, + "step": 13535 + }, + { + "epoch": 0.7, + "grad_norm": 0.52734375, + "learning_rate": 4.9838466640460627e-05, + "loss": 1.3923, + "step": 13540 + }, + { + "epoch": 0.7, + "grad_norm": 0.52734375, + "learning_rate": 4.976036802159133e-05, + "loss": 1.4069, + "step": 13545 + }, + { + "epoch": 0.7, + "grad_norm": 0.57421875, + "learning_rate": 4.968231037001879e-05, + "loss": 1.3967, + "step": 13550 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390625, + "learning_rate": 4.96042937493942e-05, + "loss": 1.3987, + "step": 13555 + }, + { + "epoch": 0.7, + "grad_norm": 0.5, + "learning_rate": 4.95263182233352e-05, + "loss": 1.4095, + "step": 13560 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 4.9448383855426006e-05, + "loss": 1.3958, + "step": 13565 + }, + { + "epoch": 0.7, + "grad_norm": 0.5859375, + "learning_rate": 4.937049070921727e-05, + "loss": 1.4014, + "step": 13570 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 4.9292638848226024e-05, + "loss": 1.4096, + "step": 13575 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 4.9214828335935556e-05, + "loss": 1.4241, + "step": 13580 + }, + { + "epoch": 0.7, + "grad_norm": 0.53125, + "learning_rate": 4.913705923579556e-05, + "loss": 1.4062, + "step": 13585 + }, + { + "epoch": 0.7, + "grad_norm": 0.546875, + "learning_rate": 4.905933161122187e-05, + "loss": 1.3818, + "step": 13590 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390625, + "learning_rate": 4.89816455255966e-05, + "loss": 1.4079, + "step": 13595 + }, + { + "epoch": 0.7, + "grad_norm": 0.5234375, + "learning_rate": 4.890400104226782e-05, + "loss": 1.4027, + "step": 13600 + }, + { + "epoch": 0.7, + "grad_norm": 0.52734375, + "learning_rate": 4.882639822454983e-05, + "loss": 1.3885, + "step": 13605 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 4.87488371357229e-05, + "loss": 1.4119, + "step": 13610 + }, + { + "epoch": 0.7, + "grad_norm": 0.5625, + "learning_rate": 4.867131783903333e-05, + "loss": 1.4109, + "step": 13615 + }, + { + "epoch": 0.7, + "grad_norm": 0.53515625, + "learning_rate": 4.859384039769319e-05, + "loss": 1.4487, + "step": 13620 + }, + { + "epoch": 0.7, + "grad_norm": 0.52734375, + "learning_rate": 4.851640487488057e-05, + "loss": 1.404, + "step": 13625 + }, + { + "epoch": 0.71, + "grad_norm": 0.51171875, + "learning_rate": 4.8439011333739314e-05, + "loss": 1.4212, + "step": 13630 + }, + { + "epoch": 0.71, + "grad_norm": 0.53125, + "learning_rate": 4.836165983737909e-05, + "loss": 1.3863, + "step": 13635 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.828435044887516e-05, + "loss": 1.3797, + "step": 13640 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.820708323126856e-05, + "loss": 1.4255, + "step": 13645 + }, + { + "epoch": 0.71, + "grad_norm": 0.55859375, + "learning_rate": 4.81298582475659e-05, + "loss": 1.3684, + "step": 13650 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.805267556073938e-05, + "loss": 1.3921, + "step": 13655 + }, + { + "epoch": 0.71, + "grad_norm": 0.54296875, + "learning_rate": 4.797553523372663e-05, + "loss": 1.419, + "step": 13660 + }, + { + "epoch": 0.71, + "grad_norm": 0.546875, + "learning_rate": 4.7898437329430815e-05, + "loss": 1.4267, + "step": 13665 + }, + { + "epoch": 0.71, + "grad_norm": 0.53125, + "learning_rate": 4.7821381910720484e-05, + "loss": 1.3806, + "step": 13670 + }, + { + "epoch": 0.71, + "grad_norm": 0.546875, + "learning_rate": 4.774436904042959e-05, + "loss": 1.4022, + "step": 13675 + }, + { + "epoch": 0.71, + "grad_norm": 0.51171875, + "learning_rate": 4.766739878135725e-05, + "loss": 1.4005, + "step": 13680 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.759047119626798e-05, + "loss": 1.4232, + "step": 13685 + }, + { + "epoch": 0.71, + "grad_norm": 0.54296875, + "learning_rate": 4.751358634789143e-05, + "loss": 1.4182, + "step": 13690 + }, + { + "epoch": 0.71, + "grad_norm": 0.53125, + "learning_rate": 4.743674429892245e-05, + "loss": 1.4279, + "step": 13695 + }, + { + "epoch": 0.71, + "grad_norm": 0.54296875, + "learning_rate": 4.73599451120209e-05, + "loss": 1.3974, + "step": 13700 + }, + { + "epoch": 0.71, + "grad_norm": 0.515625, + "learning_rate": 4.728318884981175e-05, + "loss": 1.3948, + "step": 13705 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.7206475574884976e-05, + "loss": 1.4153, + "step": 13710 + }, + { + "epoch": 0.71, + "grad_norm": 0.546875, + "learning_rate": 4.712980534979553e-05, + "loss": 1.394, + "step": 13715 + }, + { + "epoch": 0.71, + "grad_norm": 0.5234375, + "learning_rate": 4.7053178237063135e-05, + "loss": 1.3956, + "step": 13720 + }, + { + "epoch": 0.71, + "grad_norm": 0.5546875, + "learning_rate": 4.697659429917246e-05, + "loss": 1.4027, + "step": 13725 + }, + { + "epoch": 0.71, + "grad_norm": 0.53125, + "learning_rate": 4.690005359857297e-05, + "loss": 1.3886, + "step": 13730 + }, + { + "epoch": 0.71, + "grad_norm": 0.5078125, + "learning_rate": 4.6823556197678865e-05, + "loss": 1.4122, + "step": 13735 + }, + { + "epoch": 0.71, + "grad_norm": 0.5390625, + "learning_rate": 4.674710215886895e-05, + "loss": 1.408, + "step": 13740 + }, + { + "epoch": 0.71, + "grad_norm": 0.54296875, + "learning_rate": 4.667069154448679e-05, + "loss": 1.4116, + "step": 13745 + }, + { + "epoch": 0.71, + "grad_norm": 0.498046875, + "learning_rate": 4.659432441684047e-05, + "loss": 1.3956, + "step": 13750 + }, + { + "epoch": 0.71, + "grad_norm": 0.55078125, + "learning_rate": 4.6518000838202694e-05, + "loss": 1.4042, + "step": 13755 + }, + { + "epoch": 0.71, + "grad_norm": 0.546875, + "learning_rate": 4.6441720870810545e-05, + "loss": 1.4179, + "step": 13760 + }, + { + "epoch": 0.71, + "grad_norm": 0.52734375, + "learning_rate": 4.636548457686557e-05, + "loss": 1.3851, + "step": 13765 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.628929201853375e-05, + "loss": 1.4158, + "step": 13770 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.621314325794539e-05, + "loss": 1.3648, + "step": 13775 + }, + { + "epoch": 0.71, + "grad_norm": 0.5546875, + "learning_rate": 4.613703835719511e-05, + "loss": 1.3847, + "step": 13780 + }, + { + "epoch": 0.71, + "grad_norm": 0.55859375, + "learning_rate": 4.606097737834163e-05, + "loss": 1.4183, + "step": 13785 + }, + { + "epoch": 0.71, + "grad_norm": 0.53515625, + "learning_rate": 4.5984960383408005e-05, + "loss": 1.3988, + "step": 13790 + }, + { + "epoch": 0.71, + "grad_norm": 0.5390625, + "learning_rate": 4.590898743438138e-05, + "loss": 1.4255, + "step": 13795 + }, + { + "epoch": 0.71, + "grad_norm": 0.515625, + "learning_rate": 4.5833058593212984e-05, + "loss": 1.4159, + "step": 13800 + }, + { + "epoch": 0.71, + "grad_norm": 0.55078125, + "learning_rate": 4.575717392181801e-05, + "loss": 1.4114, + "step": 13805 + }, + { + "epoch": 0.71, + "grad_norm": 0.56640625, + "learning_rate": 4.568133348207572e-05, + "loss": 1.4001, + "step": 13810 + }, + { + "epoch": 0.71, + "grad_norm": 0.515625, + "learning_rate": 4.5605537335829275e-05, + "loss": 1.3891, + "step": 13815 + }, + { + "epoch": 0.72, + "grad_norm": 0.54296875, + "learning_rate": 4.5529785544885715e-05, + "loss": 1.4303, + "step": 13820 + }, + { + "epoch": 0.72, + "grad_norm": 0.5234375, + "learning_rate": 4.545407817101598e-05, + "loss": 1.4256, + "step": 13825 + }, + { + "epoch": 0.72, + "grad_norm": 0.55078125, + "learning_rate": 4.5378415275954634e-05, + "loss": 1.4324, + "step": 13830 + }, + { + "epoch": 0.72, + "grad_norm": 0.5703125, + "learning_rate": 4.53027969214001e-05, + "loss": 1.4147, + "step": 13835 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 4.5227223169014456e-05, + "loss": 1.4096, + "step": 13840 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 4.5151694080423414e-05, + "loss": 1.417, + "step": 13845 + }, + { + "epoch": 0.72, + "grad_norm": 0.57421875, + "learning_rate": 4.50762097172162e-05, + "loss": 1.4225, + "step": 13850 + }, + { + "epoch": 0.72, + "grad_norm": 0.5546875, + "learning_rate": 4.500077014094566e-05, + "loss": 1.4455, + "step": 13855 + }, + { + "epoch": 0.72, + "grad_norm": 0.54296875, + "learning_rate": 4.492537541312805e-05, + "loss": 1.415, + "step": 13860 + }, + { + "epoch": 0.72, + "grad_norm": 0.53515625, + "learning_rate": 4.485002559524314e-05, + "loss": 1.4214, + "step": 13865 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 4.477472074873396e-05, + "loss": 1.424, + "step": 13870 + }, + { + "epoch": 0.72, + "grad_norm": 0.53515625, + "learning_rate": 4.469946093500694e-05, + "loss": 1.3948, + "step": 13875 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 4.4624246215431796e-05, + "loss": 1.3954, + "step": 13880 + }, + { + "epoch": 0.72, + "grad_norm": 0.54296875, + "learning_rate": 4.4549076651341493e-05, + "loss": 1.449, + "step": 13885 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 4.4473952304032065e-05, + "loss": 1.3918, + "step": 13890 + }, + { + "epoch": 0.72, + "grad_norm": 0.5234375, + "learning_rate": 4.439887323476277e-05, + "loss": 1.4122, + "step": 13895 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 4.432383950475595e-05, + "loss": 1.4061, + "step": 13900 + }, + { + "epoch": 0.72, + "grad_norm": 0.578125, + "learning_rate": 4.4248851175196956e-05, + "loss": 1.4182, + "step": 13905 + }, + { + "epoch": 0.72, + "grad_norm": 0.5234375, + "learning_rate": 4.4173908307234045e-05, + "loss": 1.4331, + "step": 13910 + }, + { + "epoch": 0.72, + "grad_norm": 0.55859375, + "learning_rate": 4.40990109619785e-05, + "loss": 1.4382, + "step": 13915 + }, + { + "epoch": 0.72, + "grad_norm": 0.53125, + "learning_rate": 4.402415920050447e-05, + "loss": 1.406, + "step": 13920 + }, + { + "epoch": 0.72, + "grad_norm": 0.5703125, + "learning_rate": 4.394935308384893e-05, + "loss": 1.4181, + "step": 13925 + }, + { + "epoch": 0.72, + "grad_norm": 0.54296875, + "learning_rate": 4.387459267301155e-05, + "loss": 1.4027, + "step": 13930 + }, + { + "epoch": 0.72, + "grad_norm": 0.50390625, + "learning_rate": 4.379987802895483e-05, + "loss": 1.3861, + "step": 13935 + }, + { + "epoch": 0.72, + "grad_norm": 0.55859375, + "learning_rate": 4.3725209212603925e-05, + "loss": 1.402, + "step": 13940 + }, + { + "epoch": 0.72, + "grad_norm": 0.55859375, + "learning_rate": 4.3650586284846636e-05, + "loss": 1.4507, + "step": 13945 + }, + { + "epoch": 0.72, + "grad_norm": 0.546875, + "learning_rate": 4.357600930653327e-05, + "loss": 1.4307, + "step": 13950 + }, + { + "epoch": 0.72, + "grad_norm": 0.5390625, + "learning_rate": 4.350147833847674e-05, + "loss": 1.4185, + "step": 13955 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 4.3426993441452414e-05, + "loss": 1.3886, + "step": 13960 + }, + { + "epoch": 0.72, + "grad_norm": 0.53515625, + "learning_rate": 4.335255467619814e-05, + "loss": 1.402, + "step": 13965 + }, + { + "epoch": 0.72, + "grad_norm": 0.5390625, + "learning_rate": 4.3278162103414033e-05, + "loss": 1.4353, + "step": 13970 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 4.320381578376264e-05, + "loss": 1.4512, + "step": 13975 + }, + { + "epoch": 0.72, + "grad_norm": 0.5234375, + "learning_rate": 4.312951577786876e-05, + "loss": 1.4378, + "step": 13980 + }, + { + "epoch": 0.72, + "grad_norm": 0.53515625, + "learning_rate": 4.305526214631948e-05, + "loss": 1.4088, + "step": 13985 + }, + { + "epoch": 0.72, + "grad_norm": 0.54296875, + "learning_rate": 4.2981054949663926e-05, + "loss": 1.3837, + "step": 13990 + }, + { + "epoch": 0.72, + "grad_norm": 0.53125, + "learning_rate": 4.290689424841351e-05, + "loss": 1.3985, + "step": 13995 + }, + { + "epoch": 0.72, + "grad_norm": 0.51953125, + "learning_rate": 4.283278010304167e-05, + "loss": 1.411, + "step": 14000 + }, + { + "epoch": 0.72, + "grad_norm": 0.51953125, + "learning_rate": 4.2758712573983915e-05, + "loss": 1.3721, + "step": 14005 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 4.268469172163764e-05, + "loss": 1.4247, + "step": 14010 + }, + { + "epoch": 0.73, + "grad_norm": 0.55859375, + "learning_rate": 4.261071760636228e-05, + "loss": 1.4248, + "step": 14015 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234375, + "learning_rate": 4.2536790288479135e-05, + "loss": 1.4128, + "step": 14020 + }, + { + "epoch": 0.73, + "grad_norm": 0.5546875, + "learning_rate": 4.246290982827137e-05, + "loss": 1.385, + "step": 14025 + }, + { + "epoch": 0.73, + "grad_norm": 0.5390625, + "learning_rate": 4.238907628598384e-05, + "loss": 1.3891, + "step": 14030 + }, + { + "epoch": 0.73, + "grad_norm": 0.52734375, + "learning_rate": 4.231528972182324e-05, + "loss": 1.4083, + "step": 14035 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 4.2241550195957924e-05, + "loss": 1.4192, + "step": 14040 + }, + { + "epoch": 0.73, + "grad_norm": 0.52734375, + "learning_rate": 4.2167857768517935e-05, + "loss": 1.4116, + "step": 14045 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 4.2094212499594785e-05, + "loss": 1.3968, + "step": 14050 + }, + { + "epoch": 0.73, + "grad_norm": 0.55078125, + "learning_rate": 4.2020614449241705e-05, + "loss": 1.4164, + "step": 14055 + }, + { + "epoch": 0.73, + "grad_norm": 0.55078125, + "learning_rate": 4.194706367747323e-05, + "loss": 1.3963, + "step": 14060 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234375, + "learning_rate": 4.187356024426549e-05, + "loss": 1.4072, + "step": 14065 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 4.1800104209556e-05, + "loss": 1.4131, + "step": 14070 + }, + { + "epoch": 0.73, + "grad_norm": 0.55859375, + "learning_rate": 4.1726695633243527e-05, + "loss": 1.4076, + "step": 14075 + }, + { + "epoch": 0.73, + "grad_norm": 0.51171875, + "learning_rate": 4.165333457518823e-05, + "loss": 1.3496, + "step": 14080 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234375, + "learning_rate": 4.1580021095211486e-05, + "loss": 1.4122, + "step": 14085 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 4.150675525309593e-05, + "loss": 1.4152, + "step": 14090 + }, + { + "epoch": 0.73, + "grad_norm": 0.52734375, + "learning_rate": 4.1433537108585216e-05, + "loss": 1.4124, + "step": 14095 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 4.1360366721384234e-05, + "loss": 1.4217, + "step": 14100 + }, + { + "epoch": 0.73, + "grad_norm": 0.53515625, + "learning_rate": 4.128724415115889e-05, + "loss": 1.4085, + "step": 14105 + }, + { + "epoch": 0.73, + "grad_norm": 0.51953125, + "learning_rate": 4.121416945753611e-05, + "loss": 1.3978, + "step": 14110 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 4.114114270010372e-05, + "loss": 1.3935, + "step": 14115 + }, + { + "epoch": 0.73, + "grad_norm": 0.5625, + "learning_rate": 4.106816393841052e-05, + "loss": 1.4313, + "step": 14120 + }, + { + "epoch": 0.73, + "grad_norm": 0.5625, + "learning_rate": 4.099523323196616e-05, + "loss": 1.4194, + "step": 14125 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 4.092235064024111e-05, + "loss": 1.3693, + "step": 14130 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 4.0849516222666564e-05, + "loss": 1.4015, + "step": 14135 + }, + { + "epoch": 0.73, + "grad_norm": 0.55859375, + "learning_rate": 4.077673003863446e-05, + "loss": 1.4321, + "step": 14140 + }, + { + "epoch": 0.73, + "grad_norm": 0.53515625, + "learning_rate": 4.0703992147497425e-05, + "loss": 1.397, + "step": 14145 + }, + { + "epoch": 0.73, + "grad_norm": 0.546875, + "learning_rate": 4.063130260856872e-05, + "loss": 1.4254, + "step": 14150 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234375, + "learning_rate": 4.055866148112208e-05, + "loss": 1.3843, + "step": 14155 + }, + { + "epoch": 0.73, + "grad_norm": 0.53515625, + "learning_rate": 4.0486068824391856e-05, + "loss": 1.3909, + "step": 14160 + }, + { + "epoch": 0.73, + "grad_norm": 0.52734375, + "learning_rate": 4.041352469757283e-05, + "loss": 1.4008, + "step": 14165 + }, + { + "epoch": 0.73, + "grad_norm": 0.55859375, + "learning_rate": 4.034102915982031e-05, + "loss": 1.4209, + "step": 14170 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 4.026858227024978e-05, + "loss": 1.4422, + "step": 14175 + }, + { + "epoch": 0.73, + "grad_norm": 0.56640625, + "learning_rate": 4.0196184087937235e-05, + "loss": 1.424, + "step": 14180 + }, + { + "epoch": 0.73, + "grad_norm": 0.5625, + "learning_rate": 4.012383467191889e-05, + "loss": 1.4286, + "step": 14185 + }, + { + "epoch": 0.73, + "grad_norm": 0.59765625, + "learning_rate": 4.005153408119123e-05, + "loss": 1.4388, + "step": 14190 + }, + { + "epoch": 0.73, + "grad_norm": 0.5390625, + "learning_rate": 3.9979282374710824e-05, + "loss": 1.3725, + "step": 14195 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 3.9907079611394485e-05, + "loss": 1.4091, + "step": 14200 + }, + { + "epoch": 0.73, + "grad_norm": 0.51171875, + "learning_rate": 3.983492585011906e-05, + "loss": 1.3604, + "step": 14205 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 3.9762821149721485e-05, + "loss": 1.4136, + "step": 14210 + }, + { + "epoch": 0.74, + "grad_norm": 0.5234375, + "learning_rate": 3.9690765568998665e-05, + "loss": 1.3978, + "step": 14215 + }, + { + "epoch": 0.74, + "grad_norm": 0.5234375, + "learning_rate": 3.9618759166707396e-05, + "loss": 1.375, + "step": 14220 + }, + { + "epoch": 0.74, + "grad_norm": 0.5546875, + "learning_rate": 3.9546802001564454e-05, + "loss": 1.4229, + "step": 14225 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 3.9474894132246435e-05, + "loss": 1.4022, + "step": 14230 + }, + { + "epoch": 0.74, + "grad_norm": 0.52734375, + "learning_rate": 3.940303561738977e-05, + "loss": 1.4145, + "step": 14235 + }, + { + "epoch": 0.74, + "grad_norm": 0.5234375, + "learning_rate": 3.933122651559054e-05, + "loss": 1.4006, + "step": 14240 + }, + { + "epoch": 0.74, + "grad_norm": 0.515625, + "learning_rate": 3.925946688540464e-05, + "loss": 1.4169, + "step": 14245 + }, + { + "epoch": 0.74, + "grad_norm": 0.53515625, + "learning_rate": 3.918775678534759e-05, + "loss": 1.4083, + "step": 14250 + }, + { + "epoch": 0.74, + "grad_norm": 0.54296875, + "learning_rate": 3.911609627389453e-05, + "loss": 1.4037, + "step": 14255 + }, + { + "epoch": 0.74, + "grad_norm": 0.57421875, + "learning_rate": 3.904448540948012e-05, + "loss": 1.4371, + "step": 14260 + }, + { + "epoch": 0.74, + "grad_norm": 0.546875, + "learning_rate": 3.897292425049859e-05, + "loss": 1.3692, + "step": 14265 + }, + { + "epoch": 0.74, + "grad_norm": 0.546875, + "learning_rate": 3.89014128553036e-05, + "loss": 1.4395, + "step": 14270 + }, + { + "epoch": 0.74, + "grad_norm": 0.546875, + "learning_rate": 3.8829951282208297e-05, + "loss": 1.3866, + "step": 14275 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390625, + "learning_rate": 3.875853958948508e-05, + "loss": 1.4246, + "step": 14280 + }, + { + "epoch": 0.74, + "grad_norm": 0.60546875, + "learning_rate": 3.868717783536578e-05, + "loss": 1.4123, + "step": 14285 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390625, + "learning_rate": 3.861586607804147e-05, + "loss": 1.4189, + "step": 14290 + }, + { + "epoch": 0.74, + "grad_norm": 0.53515625, + "learning_rate": 3.8544604375662495e-05, + "loss": 1.4224, + "step": 14295 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 3.847339278633827e-05, + "loss": 1.4446, + "step": 14300 + }, + { + "epoch": 0.74, + "grad_norm": 0.50390625, + "learning_rate": 3.8402231368137454e-05, + "loss": 1.3983, + "step": 14305 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390625, + "learning_rate": 3.8331120179087754e-05, + "loss": 1.4149, + "step": 14310 + }, + { + "epoch": 0.74, + "grad_norm": 0.56640625, + "learning_rate": 3.8260059277175965e-05, + "loss": 1.3856, + "step": 14315 + }, + { + "epoch": 0.74, + "grad_norm": 0.55078125, + "learning_rate": 3.818904872034777e-05, + "loss": 1.4143, + "step": 14320 + }, + { + "epoch": 0.74, + "grad_norm": 0.5625, + "learning_rate": 3.8118088566507884e-05, + "loss": 1.4227, + "step": 14325 + }, + { + "epoch": 0.74, + "grad_norm": 0.5546875, + "learning_rate": 3.804717887351991e-05, + "loss": 1.412, + "step": 14330 + }, + { + "epoch": 0.74, + "grad_norm": 0.5234375, + "learning_rate": 3.797631969920633e-05, + "loss": 1.424, + "step": 14335 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390625, + "learning_rate": 3.7905511101348334e-05, + "loss": 1.432, + "step": 14340 + }, + { + "epoch": 0.74, + "grad_norm": 0.51953125, + "learning_rate": 3.7834753137685955e-05, + "loss": 1.4125, + "step": 14345 + }, + { + "epoch": 0.74, + "grad_norm": 0.546875, + "learning_rate": 3.776404586591794e-05, + "loss": 1.4444, + "step": 14350 + }, + { + "epoch": 0.74, + "grad_norm": 0.54296875, + "learning_rate": 3.769338934370163e-05, + "loss": 1.4182, + "step": 14355 + }, + { + "epoch": 0.74, + "grad_norm": 0.54296875, + "learning_rate": 3.762278362865308e-05, + "loss": 1.4039, + "step": 14360 + }, + { + "epoch": 0.74, + "grad_norm": 0.5234375, + "learning_rate": 3.755222877834679e-05, + "loss": 1.4062, + "step": 14365 + }, + { + "epoch": 0.74, + "grad_norm": 0.54296875, + "learning_rate": 3.7481724850315894e-05, + "loss": 1.3981, + "step": 14370 + }, + { + "epoch": 0.74, + "grad_norm": 0.53515625, + "learning_rate": 3.741127190205196e-05, + "loss": 1.3981, + "step": 14375 + }, + { + "epoch": 0.74, + "grad_norm": 0.515625, + "learning_rate": 3.734086999100502e-05, + "loss": 1.4209, + "step": 14380 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 3.7270519174583404e-05, + "loss": 1.3797, + "step": 14385 + }, + { + "epoch": 0.74, + "grad_norm": 0.5625, + "learning_rate": 3.7200219510153845e-05, + "loss": 1.406, + "step": 14390 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390625, + "learning_rate": 3.7129971055041345e-05, + "loss": 1.4059, + "step": 14395 + }, + { + "epoch": 0.75, + "grad_norm": 0.57421875, + "learning_rate": 3.705977386652921e-05, + "loss": 1.3911, + "step": 14400 + }, + { + "epoch": 0.75, + "grad_norm": 0.68359375, + "learning_rate": 3.69896280018588e-05, + "loss": 1.438, + "step": 14405 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 3.6919533518229734e-05, + "loss": 1.3953, + "step": 14410 + }, + { + "epoch": 0.75, + "grad_norm": 0.5625, + "learning_rate": 3.6849490472799716e-05, + "loss": 1.4092, + "step": 14415 + }, + { + "epoch": 0.75, + "grad_norm": 0.52734375, + "learning_rate": 3.677949892268453e-05, + "loss": 1.4013, + "step": 14420 + }, + { + "epoch": 0.75, + "grad_norm": 0.53515625, + "learning_rate": 3.670955892495787e-05, + "loss": 1.4047, + "step": 14425 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 3.663967053665147e-05, + "loss": 1.3838, + "step": 14430 + }, + { + "epoch": 0.75, + "grad_norm": 0.55078125, + "learning_rate": 3.6569833814754995e-05, + "loss": 1.3798, + "step": 14435 + }, + { + "epoch": 0.75, + "grad_norm": 0.51171875, + "learning_rate": 3.650004881621596e-05, + "loss": 1.4116, + "step": 14440 + }, + { + "epoch": 0.75, + "grad_norm": 0.53515625, + "learning_rate": 3.6430315597939636e-05, + "loss": 1.3809, + "step": 14445 + }, + { + "epoch": 0.75, + "grad_norm": 0.55078125, + "learning_rate": 3.636063421678917e-05, + "loss": 1.4039, + "step": 14450 + }, + { + "epoch": 0.75, + "grad_norm": 0.55078125, + "learning_rate": 3.629100472958538e-05, + "loss": 1.4236, + "step": 14455 + }, + { + "epoch": 0.75, + "grad_norm": 0.546875, + "learning_rate": 3.6221427193106814e-05, + "loss": 1.3737, + "step": 14460 + }, + { + "epoch": 0.75, + "grad_norm": 0.5703125, + "learning_rate": 3.615190166408959e-05, + "loss": 1.391, + "step": 14465 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 3.608242819922746e-05, + "loss": 1.4241, + "step": 14470 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 3.6013006855171726e-05, + "loss": 1.4078, + "step": 14475 + }, + { + "epoch": 0.75, + "grad_norm": 0.56640625, + "learning_rate": 3.5943637688531216e-05, + "loss": 1.4076, + "step": 14480 + }, + { + "epoch": 0.75, + "grad_norm": 0.546875, + "learning_rate": 3.58743207558721e-05, + "loss": 1.3783, + "step": 14485 + }, + { + "epoch": 0.75, + "grad_norm": 0.53125, + "learning_rate": 3.580505611371806e-05, + "loss": 1.4014, + "step": 14490 + }, + { + "epoch": 0.75, + "grad_norm": 0.53125, + "learning_rate": 3.573584381855012e-05, + "loss": 1.3963, + "step": 14495 + }, + { + "epoch": 0.75, + "grad_norm": 0.5625, + "learning_rate": 3.566668392680662e-05, + "loss": 1.4465, + "step": 14500 + }, + { + "epoch": 0.75, + "grad_norm": 0.54296875, + "learning_rate": 3.5597576494883086e-05, + "loss": 1.3737, + "step": 14505 + }, + { + "epoch": 0.75, + "grad_norm": 0.5625, + "learning_rate": 3.552852157913238e-05, + "loss": 1.4044, + "step": 14510 + }, + { + "epoch": 0.75, + "grad_norm": 0.5625, + "learning_rate": 3.545951923586448e-05, + "loss": 1.4472, + "step": 14515 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 3.539056952134655e-05, + "loss": 1.3723, + "step": 14520 + }, + { + "epoch": 0.75, + "grad_norm": 0.55859375, + "learning_rate": 3.532167249180271e-05, + "loss": 1.4226, + "step": 14525 + }, + { + "epoch": 0.75, + "grad_norm": 0.53515625, + "learning_rate": 3.525282820341428e-05, + "loss": 1.3928, + "step": 14530 + }, + { + "epoch": 0.75, + "grad_norm": 0.55859375, + "learning_rate": 3.5184036712319444e-05, + "loss": 1.4194, + "step": 14535 + }, + { + "epoch": 0.75, + "grad_norm": 0.54296875, + "learning_rate": 3.5115298074613466e-05, + "loss": 1.4332, + "step": 14540 + }, + { + "epoch": 0.75, + "grad_norm": 0.54296875, + "learning_rate": 3.504661234634834e-05, + "loss": 1.3965, + "step": 14545 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 3.497797958353305e-05, + "loss": 1.4264, + "step": 14550 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 3.490939984213334e-05, + "loss": 1.3683, + "step": 14555 + }, + { + "epoch": 0.75, + "grad_norm": 0.53125, + "learning_rate": 3.484087317807176e-05, + "loss": 1.4378, + "step": 14560 + }, + { + "epoch": 0.75, + "grad_norm": 0.51953125, + "learning_rate": 3.477239964722748e-05, + "loss": 1.3627, + "step": 14565 + }, + { + "epoch": 0.75, + "grad_norm": 0.52734375, + "learning_rate": 3.470397930543645e-05, + "loss": 1.3664, + "step": 14570 + }, + { + "epoch": 0.75, + "grad_norm": 0.53125, + "learning_rate": 3.4635612208491194e-05, + "loss": 1.3869, + "step": 14575 + }, + { + "epoch": 0.75, + "grad_norm": 0.5546875, + "learning_rate": 3.456729841214083e-05, + "loss": 1.3984, + "step": 14580 + }, + { + "epoch": 0.75, + "grad_norm": 0.5234375, + "learning_rate": 3.4499037972091064e-05, + "loss": 1.3961, + "step": 14585 + }, + { + "epoch": 0.75, + "grad_norm": 0.55078125, + "learning_rate": 3.443083094400395e-05, + "loss": 1.3964, + "step": 14590 + }, + { + "epoch": 0.76, + "grad_norm": 0.5625, + "learning_rate": 3.4362677383498123e-05, + "loss": 1.4334, + "step": 14595 + }, + { + "epoch": 0.76, + "grad_norm": 0.515625, + "learning_rate": 3.429457734614857e-05, + "loss": 1.3997, + "step": 14600 + }, + { + "epoch": 0.76, + "grad_norm": 0.50390625, + "learning_rate": 3.422653088748668e-05, + "loss": 1.382, + "step": 14605 + }, + { + "epoch": 0.76, + "grad_norm": 0.53515625, + "learning_rate": 3.4158538063000046e-05, + "loss": 1.4022, + "step": 14610 + }, + { + "epoch": 0.76, + "grad_norm": 0.51171875, + "learning_rate": 3.409059892813261e-05, + "loss": 1.3996, + "step": 14615 + }, + { + "epoch": 0.76, + "grad_norm": 0.5625, + "learning_rate": 3.402271353828452e-05, + "loss": 1.444, + "step": 14620 + }, + { + "epoch": 0.76, + "grad_norm": 0.546875, + "learning_rate": 3.3954881948812125e-05, + "loss": 1.4276, + "step": 14625 + }, + { + "epoch": 0.76, + "grad_norm": 0.5859375, + "learning_rate": 3.38871042150278e-05, + "loss": 1.3799, + "step": 14630 + }, + { + "epoch": 0.76, + "grad_norm": 0.5234375, + "learning_rate": 3.381938039220011e-05, + "loss": 1.4047, + "step": 14635 + }, + { + "epoch": 0.76, + "grad_norm": 0.546875, + "learning_rate": 3.3751710535553615e-05, + "loss": 1.3985, + "step": 14640 + }, + { + "epoch": 0.76, + "grad_norm": 0.5234375, + "learning_rate": 3.368409470026892e-05, + "loss": 1.3995, + "step": 14645 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 3.3616532941482494e-05, + "loss": 1.4014, + "step": 14650 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 3.354902531428673e-05, + "loss": 1.4169, + "step": 14655 + }, + { + "epoch": 0.76, + "grad_norm": 0.55078125, + "learning_rate": 3.3481571873729924e-05, + "loss": 1.4251, + "step": 14660 + }, + { + "epoch": 0.76, + "grad_norm": 0.5078125, + "learning_rate": 3.341417267481616e-05, + "loss": 1.4041, + "step": 14665 + }, + { + "epoch": 0.76, + "grad_norm": 0.5390625, + "learning_rate": 3.334682777250534e-05, + "loss": 1.4032, + "step": 14670 + }, + { + "epoch": 0.76, + "grad_norm": 0.5234375, + "learning_rate": 3.3279537221712975e-05, + "loss": 1.4211, + "step": 14675 + }, + { + "epoch": 0.76, + "grad_norm": 0.54296875, + "learning_rate": 3.321230107731035e-05, + "loss": 1.3886, + "step": 14680 + }, + { + "epoch": 0.76, + "grad_norm": 0.55078125, + "learning_rate": 3.314511939412438e-05, + "loss": 1.3968, + "step": 14685 + }, + { + "epoch": 0.76, + "grad_norm": 0.5234375, + "learning_rate": 3.307799222693756e-05, + "loss": 1.4073, + "step": 14690 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 3.301091963048788e-05, + "loss": 1.3847, + "step": 14695 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 3.294390165946889e-05, + "loss": 1.4143, + "step": 14700 + }, + { + "epoch": 0.76, + "grad_norm": 0.5390625, + "learning_rate": 3.287693836852959e-05, + "loss": 1.3882, + "step": 14705 + }, + { + "epoch": 0.76, + "grad_norm": 0.54296875, + "learning_rate": 3.281002981227439e-05, + "loss": 1.3611, + "step": 14710 + }, + { + "epoch": 0.76, + "grad_norm": 0.796875, + "learning_rate": 3.2743176045263024e-05, + "loss": 1.3952, + "step": 14715 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546875, + "learning_rate": 3.2676377122010605e-05, + "loss": 1.4149, + "step": 14720 + }, + { + "epoch": 0.76, + "grad_norm": 0.53125, + "learning_rate": 3.260963309698749e-05, + "loss": 1.3637, + "step": 14725 + }, + { + "epoch": 0.76, + "grad_norm": 0.57421875, + "learning_rate": 3.254294402461933e-05, + "loss": 1.4297, + "step": 14730 + }, + { + "epoch": 0.76, + "grad_norm": 0.5390625, + "learning_rate": 3.2476309959286846e-05, + "loss": 1.3733, + "step": 14735 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546875, + "learning_rate": 3.240973095532601e-05, + "loss": 1.4047, + "step": 14740 + }, + { + "epoch": 0.76, + "grad_norm": 0.53515625, + "learning_rate": 3.2343207067027856e-05, + "loss": 1.4128, + "step": 14745 + }, + { + "epoch": 0.76, + "grad_norm": 0.54296875, + "learning_rate": 3.227673834863852e-05, + "loss": 1.4404, + "step": 14750 + }, + { + "epoch": 0.76, + "grad_norm": 0.56640625, + "learning_rate": 3.221032485435904e-05, + "loss": 1.4148, + "step": 14755 + }, + { + "epoch": 0.76, + "grad_norm": 0.53515625, + "learning_rate": 3.214396663834553e-05, + "loss": 1.4185, + "step": 14760 + }, + { + "epoch": 0.76, + "grad_norm": 0.55859375, + "learning_rate": 3.2077663754708983e-05, + "loss": 1.4212, + "step": 14765 + }, + { + "epoch": 0.76, + "grad_norm": 0.5390625, + "learning_rate": 3.201141625751532e-05, + "loss": 1.4082, + "step": 14770 + }, + { + "epoch": 0.76, + "grad_norm": 0.53125, + "learning_rate": 3.194522420078518e-05, + "loss": 1.3706, + "step": 14775 + }, + { + "epoch": 0.76, + "grad_norm": 0.546875, + "learning_rate": 3.187908763849412e-05, + "loss": 1.4007, + "step": 14780 + }, + { + "epoch": 0.76, + "grad_norm": 0.53125, + "learning_rate": 3.181300662457237e-05, + "loss": 1.4022, + "step": 14785 + }, + { + "epoch": 0.77, + "grad_norm": 0.55859375, + "learning_rate": 3.1746981212904944e-05, + "loss": 1.4264, + "step": 14790 + }, + { + "epoch": 0.77, + "grad_norm": 0.546875, + "learning_rate": 3.168101145733139e-05, + "loss": 1.4189, + "step": 14795 + }, + { + "epoch": 0.77, + "grad_norm": 0.5625, + "learning_rate": 3.161509741164596e-05, + "loss": 1.4334, + "step": 14800 + }, + { + "epoch": 0.77, + "grad_norm": 0.51953125, + "learning_rate": 3.1549239129597484e-05, + "loss": 1.4155, + "step": 14805 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 3.148343666488931e-05, + "loss": 1.3467, + "step": 14810 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 3.141769007117921e-05, + "loss": 1.4213, + "step": 14815 + }, + { + "epoch": 0.77, + "grad_norm": 0.5234375, + "learning_rate": 3.135199940207947e-05, + "loss": 1.3895, + "step": 14820 + }, + { + "epoch": 0.77, + "grad_norm": 0.5625, + "learning_rate": 3.1286364711156734e-05, + "loss": 1.4077, + "step": 14825 + }, + { + "epoch": 0.77, + "grad_norm": 0.5078125, + "learning_rate": 3.1220786051932064e-05, + "loss": 1.4263, + "step": 14830 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 3.1155263477880703e-05, + "loss": 1.3859, + "step": 14835 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 3.108979704243228e-05, + "loss": 1.4295, + "step": 14840 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 3.1024386798970586e-05, + "loss": 1.3763, + "step": 14845 + }, + { + "epoch": 0.77, + "grad_norm": 0.546875, + "learning_rate": 3.0959032800833657e-05, + "loss": 1.4086, + "step": 14850 + }, + { + "epoch": 0.77, + "grad_norm": 0.5546875, + "learning_rate": 3.089373510131354e-05, + "loss": 1.4125, + "step": 14855 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 3.0828493753656495e-05, + "loss": 1.4476, + "step": 14860 + }, + { + "epoch": 0.77, + "grad_norm": 0.578125, + "learning_rate": 3.076330881106278e-05, + "loss": 1.4296, + "step": 14865 + }, + { + "epoch": 0.77, + "grad_norm": 0.53125, + "learning_rate": 3.069818032668668e-05, + "loss": 1.434, + "step": 14870 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 3.0633108353636376e-05, + "loss": 1.4098, + "step": 14875 + }, + { + "epoch": 0.77, + "grad_norm": 0.546875, + "learning_rate": 3.056809294497406e-05, + "loss": 1.3864, + "step": 14880 + }, + { + "epoch": 0.77, + "grad_norm": 0.5625, + "learning_rate": 3.050313415371573e-05, + "loss": 1.3848, + "step": 14885 + }, + { + "epoch": 0.77, + "grad_norm": 0.54296875, + "learning_rate": 3.0438232032831292e-05, + "loss": 1.3968, + "step": 14890 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 3.0373386635244327e-05, + "loss": 1.4014, + "step": 14895 + }, + { + "epoch": 0.77, + "grad_norm": 0.515625, + "learning_rate": 3.0308598013832256e-05, + "loss": 1.3744, + "step": 14900 + }, + { + "epoch": 0.77, + "grad_norm": 0.55859375, + "learning_rate": 3.0243866221426166e-05, + "loss": 1.3868, + "step": 14905 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 3.0179191310810838e-05, + "loss": 1.3965, + "step": 14910 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 3.0114573334724592e-05, + "loss": 1.4171, + "step": 14915 + }, + { + "epoch": 0.77, + "grad_norm": 0.5234375, + "learning_rate": 3.005001234585939e-05, + "loss": 1.4186, + "step": 14920 + }, + { + "epoch": 0.77, + "grad_norm": 0.55859375, + "learning_rate": 2.9985508396860717e-05, + "loss": 1.4451, + "step": 14925 + }, + { + "epoch": 0.77, + "grad_norm": 0.55078125, + "learning_rate": 2.9921061540327545e-05, + "loss": 1.4018, + "step": 14930 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 2.9856671828812244e-05, + "loss": 1.3988, + "step": 14935 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 2.9792339314820662e-05, + "loss": 1.3915, + "step": 14940 + }, + { + "epoch": 0.77, + "grad_norm": 0.578125, + "learning_rate": 2.972806405081191e-05, + "loss": 1.4495, + "step": 14945 + }, + { + "epoch": 0.77, + "grad_norm": 0.5390625, + "learning_rate": 2.96638460891985e-05, + "loss": 1.4278, + "step": 14950 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 2.9599685482346218e-05, + "loss": 1.3901, + "step": 14955 + }, + { + "epoch": 0.77, + "grad_norm": 0.57421875, + "learning_rate": 2.9535582282573982e-05, + "loss": 1.3876, + "step": 14960 + }, + { + "epoch": 0.77, + "grad_norm": 0.55078125, + "learning_rate": 2.947153654215402e-05, + "loss": 1.4109, + "step": 14965 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 2.940754831331163e-05, + "loss": 1.3847, + "step": 14970 + }, + { + "epoch": 0.77, + "grad_norm": 0.54296875, + "learning_rate": 2.9343617648225273e-05, + "loss": 1.3946, + "step": 14975 + }, + { + "epoch": 0.78, + "grad_norm": 0.5625, + "learning_rate": 2.927974459902637e-05, + "loss": 1.4048, + "step": 14980 + }, + { + "epoch": 0.78, + "grad_norm": 0.5390625, + "learning_rate": 2.9215929217799454e-05, + "loss": 1.4128, + "step": 14985 + }, + { + "epoch": 0.78, + "grad_norm": 0.5234375, + "learning_rate": 2.9152171556581998e-05, + "loss": 1.3907, + "step": 14990 + }, + { + "epoch": 0.78, + "grad_norm": 0.53515625, + "learning_rate": 2.9088471667364447e-05, + "loss": 1.4094, + "step": 14995 + }, + { + "epoch": 0.78, + "grad_norm": 0.53125, + "learning_rate": 2.9024829602090033e-05, + "loss": 1.4178, + "step": 15000 + }, + { + "epoch": 0.78, + "grad_norm": 0.5390625, + "learning_rate": 2.8961245412654936e-05, + "loss": 1.4357, + "step": 15005 + }, + { + "epoch": 0.78, + "grad_norm": 0.51171875, + "learning_rate": 2.889771915090812e-05, + "loss": 1.433, + "step": 15010 + }, + { + "epoch": 0.78, + "grad_norm": 0.53515625, + "learning_rate": 2.883425086865129e-05, + "loss": 1.4404, + "step": 15015 + }, + { + "epoch": 0.78, + "grad_norm": 0.52734375, + "learning_rate": 2.8770840617638927e-05, + "loss": 1.3897, + "step": 15020 + }, + { + "epoch": 0.78, + "grad_norm": 0.5390625, + "learning_rate": 2.8707488449578068e-05, + "loss": 1.4184, + "step": 15025 + }, + { + "epoch": 0.78, + "grad_norm": 0.55859375, + "learning_rate": 2.8644194416128523e-05, + "loss": 1.4287, + "step": 15030 + }, + { + "epoch": 0.78, + "grad_norm": 0.54296875, + "learning_rate": 2.8580958568902616e-05, + "loss": 1.409, + "step": 15035 + }, + { + "epoch": 0.78, + "grad_norm": 0.5390625, + "learning_rate": 2.85177809594653e-05, + "loss": 1.4083, + "step": 15040 + }, + { + "epoch": 0.78, + "grad_norm": 0.54296875, + "learning_rate": 2.8454661639333923e-05, + "loss": 1.3748, + "step": 15045 + }, + { + "epoch": 0.78, + "grad_norm": 0.515625, + "learning_rate": 2.839160065997839e-05, + "loss": 1.4489, + "step": 15050 + }, + { + "epoch": 0.78, + "grad_norm": 0.53515625, + "learning_rate": 2.832859807282102e-05, + "loss": 1.3881, + "step": 15055 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 2.8265653929236537e-05, + "loss": 1.4145, + "step": 15060 + }, + { + "epoch": 0.78, + "grad_norm": 0.51953125, + "learning_rate": 2.8202768280551894e-05, + "loss": 1.4271, + "step": 15065 + }, + { + "epoch": 0.78, + "grad_norm": 0.52734375, + "learning_rate": 2.813994117804648e-05, + "loss": 1.3984, + "step": 15070 + }, + { + "epoch": 0.78, + "grad_norm": 0.546875, + "learning_rate": 2.807717267295189e-05, + "loss": 1.3794, + "step": 15075 + }, + { + "epoch": 0.78, + "grad_norm": 0.5234375, + "learning_rate": 2.8014462816451958e-05, + "loss": 1.4091, + "step": 15080 + }, + { + "epoch": 0.78, + "grad_norm": 0.52734375, + "learning_rate": 2.7951811659682625e-05, + "loss": 1.4287, + "step": 15085 + }, + { + "epoch": 0.78, + "grad_norm": 0.52734375, + "learning_rate": 2.7889219253732046e-05, + "loss": 1.4234, + "step": 15090 + }, + { + "epoch": 0.78, + "grad_norm": 0.546875, + "learning_rate": 2.7826685649640428e-05, + "loss": 1.4419, + "step": 15095 + }, + { + "epoch": 0.78, + "grad_norm": 0.56640625, + "learning_rate": 2.7764210898400066e-05, + "loss": 1.4219, + "step": 15100 + }, + { + "epoch": 0.78, + "grad_norm": 0.546875, + "learning_rate": 2.770179505095518e-05, + "loss": 1.4274, + "step": 15105 + }, + { + "epoch": 0.78, + "grad_norm": 0.5859375, + "learning_rate": 2.7639438158202037e-05, + "loss": 1.4231, + "step": 15110 + }, + { + "epoch": 0.78, + "grad_norm": 0.55859375, + "learning_rate": 2.757714027098882e-05, + "loss": 1.3985, + "step": 15115 + }, + { + "epoch": 0.78, + "grad_norm": 0.53125, + "learning_rate": 2.7514901440115615e-05, + "loss": 1.3931, + "step": 15120 + }, + { + "epoch": 0.78, + "grad_norm": 0.515625, + "learning_rate": 2.745272171633424e-05, + "loss": 1.4121, + "step": 15125 + }, + { + "epoch": 0.78, + "grad_norm": 0.5546875, + "learning_rate": 2.7390601150348437e-05, + "loss": 1.3921, + "step": 15130 + }, + { + "epoch": 0.78, + "grad_norm": 0.54296875, + "learning_rate": 2.7328539792813668e-05, + "loss": 1.382, + "step": 15135 + }, + { + "epoch": 0.78, + "grad_norm": 0.5234375, + "learning_rate": 2.7266537694337147e-05, + "loss": 1.42, + "step": 15140 + }, + { + "epoch": 0.78, + "grad_norm": 0.54296875, + "learning_rate": 2.7204594905477655e-05, + "loss": 1.3715, + "step": 15145 + }, + { + "epoch": 0.78, + "grad_norm": 0.546875, + "learning_rate": 2.714271147674572e-05, + "loss": 1.3723, + "step": 15150 + }, + { + "epoch": 0.78, + "grad_norm": 0.55078125, + "learning_rate": 2.7080887458603432e-05, + "loss": 1.4189, + "step": 15155 + }, + { + "epoch": 0.78, + "grad_norm": 0.52734375, + "learning_rate": 2.7019122901464477e-05, + "loss": 1.4318, + "step": 15160 + }, + { + "epoch": 0.78, + "grad_norm": 0.54296875, + "learning_rate": 2.6957417855693934e-05, + "loss": 1.3811, + "step": 15165 + }, + { + "epoch": 0.78, + "grad_norm": 0.55078125, + "learning_rate": 2.6895772371608473e-05, + "loss": 1.4058, + "step": 15170 + }, + { + "epoch": 0.79, + "grad_norm": 0.53125, + "learning_rate": 2.6834186499476145e-05, + "loss": 1.3842, + "step": 15175 + }, + { + "epoch": 0.79, + "grad_norm": 0.53515625, + "learning_rate": 2.677266028951645e-05, + "loss": 1.3844, + "step": 15180 + }, + { + "epoch": 0.79, + "grad_norm": 0.51171875, + "learning_rate": 2.67111937919001e-05, + "loss": 1.4222, + "step": 15185 + }, + { + "epoch": 0.79, + "grad_norm": 0.56640625, + "learning_rate": 2.6649787056749254e-05, + "loss": 1.4068, + "step": 15190 + }, + { + "epoch": 0.79, + "grad_norm": 0.5625, + "learning_rate": 2.658844013413727e-05, + "loss": 1.4152, + "step": 15195 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 2.6527153074088797e-05, + "loss": 1.3608, + "step": 15200 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 2.6465925926579548e-05, + "loss": 1.4029, + "step": 15205 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 2.6404758741536505e-05, + "loss": 1.386, + "step": 15210 + }, + { + "epoch": 0.79, + "grad_norm": 0.53515625, + "learning_rate": 2.634365156883768e-05, + "loss": 1.4222, + "step": 15215 + }, + { + "epoch": 0.79, + "grad_norm": 0.5234375, + "learning_rate": 2.628260445831222e-05, + "loss": 1.3958, + "step": 15220 + }, + { + "epoch": 0.79, + "grad_norm": 0.5546875, + "learning_rate": 2.622161745974019e-05, + "loss": 1.4211, + "step": 15225 + }, + { + "epoch": 0.79, + "grad_norm": 0.51171875, + "learning_rate": 2.6160690622852746e-05, + "loss": 1.4084, + "step": 15230 + }, + { + "epoch": 0.79, + "grad_norm": 0.53515625, + "learning_rate": 2.6099823997331886e-05, + "loss": 1.3864, + "step": 15235 + }, + { + "epoch": 0.79, + "grad_norm": 0.53125, + "learning_rate": 2.6039017632810582e-05, + "loss": 1.4074, + "step": 15240 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 2.597827157887267e-05, + "loss": 1.4254, + "step": 15245 + }, + { + "epoch": 0.79, + "grad_norm": 0.5078125, + "learning_rate": 2.5917585885052742e-05, + "loss": 1.3996, + "step": 15250 + }, + { + "epoch": 0.79, + "grad_norm": 0.53515625, + "learning_rate": 2.585696060083621e-05, + "loss": 1.4215, + "step": 15255 + }, + { + "epoch": 0.79, + "grad_norm": 0.51171875, + "learning_rate": 2.5796395775659243e-05, + "loss": 1.3848, + "step": 15260 + }, + { + "epoch": 0.79, + "grad_norm": 0.54296875, + "learning_rate": 2.5735891458908713e-05, + "loss": 1.3692, + "step": 15265 + }, + { + "epoch": 0.79, + "grad_norm": 0.5546875, + "learning_rate": 2.5675447699922084e-05, + "loss": 1.4185, + "step": 15270 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 2.5615064547987487e-05, + "loss": 1.4189, + "step": 15275 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 2.555474205234366e-05, + "loss": 1.353, + "step": 15280 + }, + { + "epoch": 0.79, + "grad_norm": 0.54296875, + "learning_rate": 2.5494480262179855e-05, + "loss": 1.3896, + "step": 15285 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 2.543427922663576e-05, + "loss": 1.4108, + "step": 15290 + }, + { + "epoch": 0.79, + "grad_norm": 0.5625, + "learning_rate": 2.537413899480161e-05, + "loss": 1.3963, + "step": 15295 + }, + { + "epoch": 0.79, + "grad_norm": 0.53125, + "learning_rate": 2.5314059615718034e-05, + "loss": 1.4301, + "step": 15300 + }, + { + "epoch": 0.79, + "grad_norm": 0.5390625, + "learning_rate": 2.525404113837605e-05, + "loss": 1.3944, + "step": 15305 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 2.5194083611716935e-05, + "loss": 1.4192, + "step": 15310 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 2.5134187084632356e-05, + "loss": 1.4197, + "step": 15315 + }, + { + "epoch": 0.79, + "grad_norm": 0.53125, + "learning_rate": 2.507435160596422e-05, + "loss": 1.4077, + "step": 15320 + }, + { + "epoch": 0.79, + "grad_norm": 0.51953125, + "learning_rate": 2.5014577224504642e-05, + "loss": 1.3889, + "step": 15325 + }, + { + "epoch": 0.79, + "grad_norm": 0.546875, + "learning_rate": 2.4954863988995892e-05, + "loss": 1.4001, + "step": 15330 + }, + { + "epoch": 0.79, + "grad_norm": 0.515625, + "learning_rate": 2.4895211948130394e-05, + "loss": 1.4222, + "step": 15335 + }, + { + "epoch": 0.79, + "grad_norm": 0.5390625, + "learning_rate": 2.48356211505507e-05, + "loss": 1.3745, + "step": 15340 + }, + { + "epoch": 0.79, + "grad_norm": 0.5390625, + "learning_rate": 2.4776091644849432e-05, + "loss": 1.4021, + "step": 15345 + }, + { + "epoch": 0.79, + "grad_norm": 0.53515625, + "learning_rate": 2.4716623479569136e-05, + "loss": 1.4166, + "step": 15350 + }, + { + "epoch": 0.79, + "grad_norm": 0.55078125, + "learning_rate": 2.4657216703202435e-05, + "loss": 1.4277, + "step": 15355 + }, + { + "epoch": 0.79, + "grad_norm": 0.515625, + "learning_rate": 2.459787136419186e-05, + "loss": 1.3755, + "step": 15360 + }, + { + "epoch": 0.79, + "grad_norm": 0.51953125, + "learning_rate": 2.4538587510929878e-05, + "loss": 1.4201, + "step": 15365 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.4479365191758717e-05, + "loss": 1.4043, + "step": 15370 + }, + { + "epoch": 0.8, + "grad_norm": 0.546875, + "learning_rate": 2.4420204454970542e-05, + "loss": 1.4268, + "step": 15375 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 2.4361105348807256e-05, + "loss": 1.4083, + "step": 15380 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 2.430206792146049e-05, + "loss": 1.3965, + "step": 15385 + }, + { + "epoch": 0.8, + "grad_norm": 0.546875, + "learning_rate": 2.424309222107164e-05, + "loss": 1.4135, + "step": 15390 + }, + { + "epoch": 0.8, + "grad_norm": 0.55859375, + "learning_rate": 2.418417829573165e-05, + "loss": 1.431, + "step": 15395 + }, + { + "epoch": 0.8, + "grad_norm": 0.53125, + "learning_rate": 2.4125326193481213e-05, + "loss": 1.3714, + "step": 15400 + }, + { + "epoch": 0.8, + "grad_norm": 0.546875, + "learning_rate": 2.4066535962310553e-05, + "loss": 1.4141, + "step": 15405 + }, + { + "epoch": 0.8, + "grad_norm": 0.54296875, + "learning_rate": 2.4007807650159464e-05, + "loss": 1.425, + "step": 15410 + }, + { + "epoch": 0.8, + "grad_norm": 0.55859375, + "learning_rate": 2.394914130491719e-05, + "loss": 1.4037, + "step": 15415 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 2.3890536974422518e-05, + "loss": 1.381, + "step": 15420 + }, + { + "epoch": 0.8, + "grad_norm": 0.51953125, + "learning_rate": 2.3831994706463623e-05, + "loss": 1.401, + "step": 15425 + }, + { + "epoch": 0.8, + "grad_norm": 0.51953125, + "learning_rate": 2.3773514548778132e-05, + "loss": 1.4207, + "step": 15430 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 2.3715096549052908e-05, + "loss": 1.4072, + "step": 15435 + }, + { + "epoch": 0.8, + "grad_norm": 0.5078125, + "learning_rate": 2.3656740754924233e-05, + "loss": 1.3835, + "step": 15440 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.3598447213977625e-05, + "loss": 1.3809, + "step": 15445 + }, + { + "epoch": 0.8, + "grad_norm": 0.54296875, + "learning_rate": 2.354021597374787e-05, + "loss": 1.3715, + "step": 15450 + }, + { + "epoch": 0.8, + "grad_norm": 0.51953125, + "learning_rate": 2.3482047081718884e-05, + "loss": 1.3965, + "step": 15455 + }, + { + "epoch": 0.8, + "grad_norm": 0.54296875, + "learning_rate": 2.342394058532378e-05, + "loss": 1.3943, + "step": 15460 + }, + { + "epoch": 0.8, + "grad_norm": 0.5546875, + "learning_rate": 2.336589653194482e-05, + "loss": 1.4223, + "step": 15465 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625, + "learning_rate": 2.3307914968913347e-05, + "loss": 1.3886, + "step": 15470 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625, + "learning_rate": 2.324999594350965e-05, + "loss": 1.3564, + "step": 15475 + }, + { + "epoch": 0.8, + "grad_norm": 0.56640625, + "learning_rate": 2.319213950296314e-05, + "loss": 1.4224, + "step": 15480 + }, + { + "epoch": 0.8, + "grad_norm": 0.56640625, + "learning_rate": 2.313434569445213e-05, + "loss": 1.392, + "step": 15485 + }, + { + "epoch": 0.8, + "grad_norm": 0.53515625, + "learning_rate": 2.3076614565103916e-05, + "loss": 1.4152, + "step": 15490 + }, + { + "epoch": 0.8, + "grad_norm": 0.58203125, + "learning_rate": 2.3018946161994594e-05, + "loss": 1.4077, + "step": 15495 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.2961340532149177e-05, + "loss": 1.4043, + "step": 15500 + }, + { + "epoch": 0.8, + "grad_norm": 0.53125, + "learning_rate": 2.2903797722541487e-05, + "loss": 1.3987, + "step": 15505 + }, + { + "epoch": 0.8, + "grad_norm": 0.5078125, + "learning_rate": 2.2846317780094127e-05, + "loss": 1.3903, + "step": 15510 + }, + { + "epoch": 0.8, + "grad_norm": 0.5234375, + "learning_rate": 2.2788900751678367e-05, + "loss": 1.4161, + "step": 15515 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 2.2731546684114247e-05, + "loss": 1.3782, + "step": 15520 + }, + { + "epoch": 0.8, + "grad_norm": 0.55859375, + "learning_rate": 2.2674255624170472e-05, + "loss": 1.3911, + "step": 15525 + }, + { + "epoch": 0.8, + "grad_norm": 0.53125, + "learning_rate": 2.261702761856429e-05, + "loss": 1.3903, + "step": 15530 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.2559862713961632e-05, + "loss": 1.4021, + "step": 15535 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625, + "learning_rate": 2.2502760956976877e-05, + "loss": 1.4184, + "step": 15540 + }, + { + "epoch": 0.8, + "grad_norm": 0.53125, + "learning_rate": 2.2445722394172973e-05, + "loss": 1.3844, + "step": 15545 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.2388747072061335e-05, + "loss": 1.3948, + "step": 15550 + }, + { + "epoch": 0.8, + "grad_norm": 0.55078125, + "learning_rate": 2.2331835037101823e-05, + "loss": 1.411, + "step": 15555 + }, + { + "epoch": 0.81, + "grad_norm": 0.53515625, + "learning_rate": 2.2274986335702597e-05, + "loss": 1.4426, + "step": 15560 + }, + { + "epoch": 0.81, + "grad_norm": 0.55859375, + "learning_rate": 2.2218201014220263e-05, + "loss": 1.3654, + "step": 15565 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 2.2161479118959737e-05, + "loss": 1.414, + "step": 15570 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 2.2104820696174235e-05, + "loss": 1.3961, + "step": 15575 + }, + { + "epoch": 0.81, + "grad_norm": 0.546875, + "learning_rate": 2.204822579206509e-05, + "loss": 1.4196, + "step": 15580 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 2.1991694452781975e-05, + "loss": 1.4202, + "step": 15585 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 2.1935226724422686e-05, + "loss": 1.4266, + "step": 15590 + }, + { + "epoch": 0.81, + "grad_norm": 0.55078125, + "learning_rate": 2.187882265303317e-05, + "loss": 1.4199, + "step": 15595 + }, + { + "epoch": 0.81, + "grad_norm": 0.55078125, + "learning_rate": 2.182248228460738e-05, + "loss": 1.4131, + "step": 15600 + }, + { + "epoch": 0.81, + "grad_norm": 0.5390625, + "learning_rate": 2.1766205665087426e-05, + "loss": 1.4076, + "step": 15605 + }, + { + "epoch": 0.81, + "grad_norm": 0.5078125, + "learning_rate": 2.170999284036338e-05, + "loss": 1.3784, + "step": 15610 + }, + { + "epoch": 0.81, + "grad_norm": 0.515625, + "learning_rate": 2.1653843856273325e-05, + "loss": 1.4066, + "step": 15615 + }, + { + "epoch": 0.81, + "grad_norm": 0.53515625, + "learning_rate": 2.1597758758603236e-05, + "loss": 1.4271, + "step": 15620 + }, + { + "epoch": 0.81, + "grad_norm": 0.56640625, + "learning_rate": 2.154173759308703e-05, + "loss": 1.3941, + "step": 15625 + }, + { + "epoch": 0.81, + "grad_norm": 0.515625, + "learning_rate": 2.1485780405406498e-05, + "loss": 1.377, + "step": 15630 + }, + { + "epoch": 0.81, + "grad_norm": 0.578125, + "learning_rate": 2.142988724119127e-05, + "loss": 1.4119, + "step": 15635 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 2.1374058146018693e-05, + "loss": 1.4651, + "step": 15640 + }, + { + "epoch": 0.81, + "grad_norm": 0.546875, + "learning_rate": 2.131829316541395e-05, + "loss": 1.3771, + "step": 15645 + }, + { + "epoch": 0.81, + "grad_norm": 0.53125, + "learning_rate": 2.126259234484992e-05, + "loss": 1.4179, + "step": 15650 + }, + { + "epoch": 0.81, + "grad_norm": 0.546875, + "learning_rate": 2.120695572974718e-05, + "loss": 1.3984, + "step": 15655 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 2.1151383365473875e-05, + "loss": 1.4018, + "step": 15660 + }, + { + "epoch": 0.81, + "grad_norm": 0.57421875, + "learning_rate": 2.109587529734586e-05, + "loss": 1.4305, + "step": 15665 + }, + { + "epoch": 0.81, + "grad_norm": 0.55859375, + "learning_rate": 2.1040431570626483e-05, + "loss": 1.4443, + "step": 15670 + }, + { + "epoch": 0.81, + "grad_norm": 0.55078125, + "learning_rate": 2.0985052230526714e-05, + "loss": 1.423, + "step": 15675 + }, + { + "epoch": 0.81, + "grad_norm": 0.54296875, + "learning_rate": 2.092973732220489e-05, + "loss": 1.3989, + "step": 15680 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 2.0874486890766908e-05, + "loss": 1.3782, + "step": 15685 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 2.0819300981266066e-05, + "loss": 1.3892, + "step": 15690 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 2.0764179638703076e-05, + "loss": 1.3963, + "step": 15695 + }, + { + "epoch": 0.81, + "grad_norm": 0.53125, + "learning_rate": 2.070912290802589e-05, + "loss": 1.415, + "step": 15700 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 2.0654130834129903e-05, + "loss": 1.4013, + "step": 15705 + }, + { + "epoch": 0.81, + "grad_norm": 0.53515625, + "learning_rate": 2.0599203461857707e-05, + "loss": 1.4032, + "step": 15710 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 2.054434083599921e-05, + "loss": 1.4276, + "step": 15715 + }, + { + "epoch": 0.81, + "grad_norm": 0.5703125, + "learning_rate": 2.0489543001291402e-05, + "loss": 1.4058, + "step": 15720 + }, + { + "epoch": 0.81, + "grad_norm": 0.546875, + "learning_rate": 2.0434810002418547e-05, + "loss": 1.414, + "step": 15725 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 2.0380141884012004e-05, + "loss": 1.3732, + "step": 15730 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 2.0325538690650236e-05, + "loss": 1.3867, + "step": 15735 + }, + { + "epoch": 0.81, + "grad_norm": 0.57421875, + "learning_rate": 2.0271000466858726e-05, + "loss": 1.4122, + "step": 15740 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 2.0216527257110006e-05, + "loss": 1.4012, + "step": 15745 + }, + { + "epoch": 0.81, + "grad_norm": 0.52734375, + "learning_rate": 2.0162119105823607e-05, + "loss": 1.4146, + "step": 15750 + }, + { + "epoch": 0.82, + "grad_norm": 0.55078125, + "learning_rate": 2.010777605736599e-05, + "loss": 1.3965, + "step": 15755 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 2.0053498156050555e-05, + "loss": 1.3938, + "step": 15760 + }, + { + "epoch": 0.82, + "grad_norm": 0.54296875, + "learning_rate": 1.9999285446137518e-05, + "loss": 1.3813, + "step": 15765 + }, + { + "epoch": 0.82, + "grad_norm": 0.5625, + "learning_rate": 1.9945137971833983e-05, + "loss": 1.3778, + "step": 15770 + }, + { + "epoch": 0.82, + "grad_norm": 0.5703125, + "learning_rate": 1.9891055777293865e-05, + "loss": 1.4193, + "step": 15775 + }, + { + "epoch": 0.82, + "grad_norm": 0.54296875, + "learning_rate": 1.9837038906617843e-05, + "loss": 1.4143, + "step": 15780 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 1.9783087403853273e-05, + "loss": 1.4312, + "step": 15785 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.9729201312994273e-05, + "loss": 1.4178, + "step": 15790 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.9675380677981603e-05, + "loss": 1.4112, + "step": 15795 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.962162554270267e-05, + "loss": 1.3914, + "step": 15800 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.95679359509914e-05, + "loss": 1.4105, + "step": 15805 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 1.951431194662834e-05, + "loss": 1.3702, + "step": 15810 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.946075357334053e-05, + "loss": 1.3917, + "step": 15815 + }, + { + "epoch": 0.82, + "grad_norm": 0.5390625, + "learning_rate": 1.9407260874801513e-05, + "loss": 1.3756, + "step": 15820 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.9353833894631247e-05, + "loss": 1.3712, + "step": 15825 + }, + { + "epoch": 0.82, + "grad_norm": 0.55078125, + "learning_rate": 1.9300472676396076e-05, + "loss": 1.3687, + "step": 15830 + }, + { + "epoch": 0.82, + "grad_norm": 0.5546875, + "learning_rate": 1.9247177263608794e-05, + "loss": 1.4126, + "step": 15835 + }, + { + "epoch": 0.82, + "grad_norm": 0.5703125, + "learning_rate": 1.9193947699728488e-05, + "loss": 1.4815, + "step": 15840 + }, + { + "epoch": 0.82, + "grad_norm": 0.53515625, + "learning_rate": 1.9140784028160574e-05, + "loss": 1.3857, + "step": 15845 + }, + { + "epoch": 0.82, + "grad_norm": 0.578125, + "learning_rate": 1.908768629225669e-05, + "loss": 1.4191, + "step": 15850 + }, + { + "epoch": 0.82, + "grad_norm": 0.5625, + "learning_rate": 1.9034654535314767e-05, + "loss": 1.3807, + "step": 15855 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.8981688800578877e-05, + "loss": 1.454, + "step": 15860 + }, + { + "epoch": 0.82, + "grad_norm": 0.65625, + "learning_rate": 1.8928789131239343e-05, + "loss": 1.3978, + "step": 15865 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.887595557043248e-05, + "loss": 1.3991, + "step": 15870 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.8823188161240813e-05, + "loss": 1.4167, + "step": 15875 + }, + { + "epoch": 0.82, + "grad_norm": 0.54296875, + "learning_rate": 1.8770486946692876e-05, + "loss": 1.4233, + "step": 15880 + }, + { + "epoch": 0.82, + "grad_norm": 0.53515625, + "learning_rate": 1.8717851969763266e-05, + "loss": 1.4152, + "step": 15885 + }, + { + "epoch": 0.82, + "grad_norm": 0.5234375, + "learning_rate": 1.866528327337249e-05, + "loss": 1.4035, + "step": 15890 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 1.861278090038705e-05, + "loss": 1.3953, + "step": 15895 + }, + { + "epoch": 0.82, + "grad_norm": 0.5546875, + "learning_rate": 1.8560344893619396e-05, + "loss": 1.4082, + "step": 15900 + }, + { + "epoch": 0.82, + "grad_norm": 0.5234375, + "learning_rate": 1.850797529582785e-05, + "loss": 1.4083, + "step": 15905 + }, + { + "epoch": 0.82, + "grad_norm": 0.52734375, + "learning_rate": 1.8455672149716496e-05, + "loss": 1.4182, + "step": 15910 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.840343549793535e-05, + "loss": 1.3979, + "step": 15915 + }, + { + "epoch": 0.82, + "grad_norm": 0.55859375, + "learning_rate": 1.835126538308013e-05, + "loss": 1.4196, + "step": 15920 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 1.8299161847692358e-05, + "loss": 1.3812, + "step": 15925 + }, + { + "epoch": 0.82, + "grad_norm": 0.53515625, + "learning_rate": 1.8247124934259186e-05, + "loss": 1.4142, + "step": 15930 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 1.819515468521349e-05, + "loss": 1.4185, + "step": 15935 + }, + { + "epoch": 0.82, + "grad_norm": 0.58203125, + "learning_rate": 1.8143251142933793e-05, + "loss": 1.4173, + "step": 15940 + }, + { + "epoch": 0.82, + "grad_norm": 0.53125, + "learning_rate": 1.809141434974423e-05, + "loss": 1.4131, + "step": 15945 + }, + { + "epoch": 0.83, + "grad_norm": 0.546875, + "learning_rate": 1.803964434791442e-05, + "loss": 1.4071, + "step": 15950 + }, + { + "epoch": 0.83, + "grad_norm": 0.53515625, + "learning_rate": 1.7987941179659608e-05, + "loss": 1.356, + "step": 15955 + }, + { + "epoch": 0.83, + "grad_norm": 0.52734375, + "learning_rate": 1.793630488714053e-05, + "loss": 1.4231, + "step": 15960 + }, + { + "epoch": 0.83, + "grad_norm": 0.51171875, + "learning_rate": 1.788473551246339e-05, + "loss": 1.3839, + "step": 15965 + }, + { + "epoch": 0.83, + "grad_norm": 0.55078125, + "learning_rate": 1.7833233097679746e-05, + "loss": 1.4259, + "step": 15970 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.778179768478666e-05, + "loss": 1.4312, + "step": 15975 + }, + { + "epoch": 0.83, + "grad_norm": 0.5078125, + "learning_rate": 1.7730429315726494e-05, + "loss": 1.3773, + "step": 15980 + }, + { + "epoch": 0.83, + "grad_norm": 0.5625, + "learning_rate": 1.7679128032387004e-05, + "loss": 1.3844, + "step": 15985 + }, + { + "epoch": 0.83, + "grad_norm": 0.57421875, + "learning_rate": 1.762789387660113e-05, + "loss": 1.4109, + "step": 15990 + }, + { + "epoch": 0.83, + "grad_norm": 0.54296875, + "learning_rate": 1.7576726890147177e-05, + "loss": 1.3568, + "step": 15995 + }, + { + "epoch": 0.83, + "grad_norm": 0.54296875, + "learning_rate": 1.7525627114748645e-05, + "loss": 1.4048, + "step": 16000 + }, + { + "epoch": 0.83, + "grad_norm": 0.5546875, + "learning_rate": 1.7474594592074235e-05, + "loss": 1.4195, + "step": 16005 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.742362936373776e-05, + "loss": 1.4214, + "step": 16010 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.737273147129821e-05, + "loss": 1.41, + "step": 16015 + }, + { + "epoch": 0.83, + "grad_norm": 0.54296875, + "learning_rate": 1.7321900956259653e-05, + "loss": 1.4071, + "step": 16020 + }, + { + "epoch": 0.83, + "grad_norm": 0.515625, + "learning_rate": 1.727113786007125e-05, + "loss": 1.3967, + "step": 16025 + }, + { + "epoch": 0.83, + "grad_norm": 0.56640625, + "learning_rate": 1.7220442224127097e-05, + "loss": 1.3973, + "step": 16030 + }, + { + "epoch": 0.83, + "grad_norm": 0.52734375, + "learning_rate": 1.7169814089766344e-05, + "loss": 1.4093, + "step": 16035 + }, + { + "epoch": 0.83, + "grad_norm": 0.52734375, + "learning_rate": 1.7119253498273113e-05, + "loss": 1.4139, + "step": 16040 + }, + { + "epoch": 0.83, + "grad_norm": 0.55078125, + "learning_rate": 1.7068760490876422e-05, + "loss": 1.3937, + "step": 16045 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.701833510875015e-05, + "loss": 1.4067, + "step": 16050 + }, + { + "epoch": 0.83, + "grad_norm": 0.51953125, + "learning_rate": 1.696797739301308e-05, + "loss": 1.3547, + "step": 16055 + }, + { + "epoch": 0.83, + "grad_norm": 0.55078125, + "learning_rate": 1.6917687384728785e-05, + "loss": 1.4478, + "step": 16060 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.686746512490569e-05, + "loss": 1.4055, + "step": 16065 + }, + { + "epoch": 0.83, + "grad_norm": 0.54296875, + "learning_rate": 1.6817310654496852e-05, + "loss": 1.4043, + "step": 16070 + }, + { + "epoch": 0.83, + "grad_norm": 0.546875, + "learning_rate": 1.6767224014400173e-05, + "loss": 1.4098, + "step": 16075 + }, + { + "epoch": 0.83, + "grad_norm": 0.53515625, + "learning_rate": 1.6717205245458178e-05, + "loss": 1.4182, + "step": 16080 + }, + { + "epoch": 0.83, + "grad_norm": 0.51953125, + "learning_rate": 1.6667254388458088e-05, + "loss": 1.3897, + "step": 16085 + }, + { + "epoch": 0.83, + "grad_norm": 0.5390625, + "learning_rate": 1.661737148413167e-05, + "loss": 1.3928, + "step": 16090 + }, + { + "epoch": 0.83, + "grad_norm": 0.578125, + "learning_rate": 1.6567556573155374e-05, + "loss": 1.401, + "step": 16095 + }, + { + "epoch": 0.83, + "grad_norm": 0.5390625, + "learning_rate": 1.6517809696150143e-05, + "loss": 1.4434, + "step": 16100 + }, + { + "epoch": 0.83, + "grad_norm": 0.5234375, + "learning_rate": 1.64681308936815e-05, + "loss": 1.4293, + "step": 16105 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.641852020625937e-05, + "loss": 1.403, + "step": 16110 + }, + { + "epoch": 0.83, + "grad_norm": 0.5390625, + "learning_rate": 1.6368977674338216e-05, + "loss": 1.4235, + "step": 16115 + }, + { + "epoch": 0.83, + "grad_norm": 0.53515625, + "learning_rate": 1.631950333831688e-05, + "loss": 1.4061, + "step": 16120 + }, + { + "epoch": 0.83, + "grad_norm": 0.546875, + "learning_rate": 1.6270097238538597e-05, + "loss": 1.4604, + "step": 16125 + }, + { + "epoch": 0.83, + "grad_norm": 0.53125, + "learning_rate": 1.6220759415290998e-05, + "loss": 1.396, + "step": 16130 + }, + { + "epoch": 0.83, + "grad_norm": 0.53515625, + "learning_rate": 1.6171489908805992e-05, + "loss": 1.3742, + "step": 16135 + }, + { + "epoch": 0.84, + "grad_norm": 0.52734375, + "learning_rate": 1.6122288759259795e-05, + "loss": 1.4133, + "step": 16140 + }, + { + "epoch": 0.84, + "grad_norm": 0.5625, + "learning_rate": 1.6073156006772893e-05, + "loss": 1.3686, + "step": 16145 + }, + { + "epoch": 0.84, + "grad_norm": 0.56640625, + "learning_rate": 1.6024091691410013e-05, + "loss": 1.3744, + "step": 16150 + }, + { + "epoch": 0.84, + "grad_norm": 0.5390625, + "learning_rate": 1.597509585318001e-05, + "loss": 1.3963, + "step": 16155 + }, + { + "epoch": 0.84, + "grad_norm": 0.546875, + "learning_rate": 1.592616853203597e-05, + "loss": 1.3879, + "step": 16160 + }, + { + "epoch": 0.84, + "grad_norm": 0.5390625, + "learning_rate": 1.587730976787508e-05, + "loss": 1.4037, + "step": 16165 + }, + { + "epoch": 0.84, + "grad_norm": 0.515625, + "learning_rate": 1.582851960053865e-05, + "loss": 1.3981, + "step": 16170 + }, + { + "epoch": 0.84, + "grad_norm": 0.546875, + "learning_rate": 1.577979806981198e-05, + "loss": 1.4068, + "step": 16175 + }, + { + "epoch": 0.84, + "grad_norm": 0.5390625, + "learning_rate": 1.573114521542447e-05, + "loss": 1.4194, + "step": 16180 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 1.5682561077049496e-05, + "loss": 1.4033, + "step": 16185 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.5634045694304412e-05, + "loss": 1.4413, + "step": 16190 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 1.5585599106750515e-05, + "loss": 1.3756, + "step": 16195 + }, + { + "epoch": 0.84, + "grad_norm": 0.5546875, + "learning_rate": 1.553722135389294e-05, + "loss": 1.4271, + "step": 16200 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 1.548891247518075e-05, + "loss": 1.4217, + "step": 16205 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.5440672510006848e-05, + "loss": 1.418, + "step": 16210 + }, + { + "epoch": 0.84, + "grad_norm": 0.5234375, + "learning_rate": 1.5392501497707945e-05, + "loss": 1.3715, + "step": 16215 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.5344399477564462e-05, + "loss": 1.3594, + "step": 16220 + }, + { + "epoch": 0.84, + "grad_norm": 0.52734375, + "learning_rate": 1.529636648880063e-05, + "loss": 1.4132, + "step": 16225 + }, + { + "epoch": 0.84, + "grad_norm": 0.546875, + "learning_rate": 1.5248402570584353e-05, + "loss": 1.4164, + "step": 16230 + }, + { + "epoch": 0.84, + "grad_norm": 0.5390625, + "learning_rate": 1.520050776202726e-05, + "loss": 1.4124, + "step": 16235 + }, + { + "epoch": 0.84, + "grad_norm": 0.5703125, + "learning_rate": 1.5152682102184546e-05, + "loss": 1.3827, + "step": 16240 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 1.5104925630055078e-05, + "loss": 1.4168, + "step": 16245 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 1.5057238384581296e-05, + "loss": 1.4161, + "step": 16250 + }, + { + "epoch": 0.84, + "grad_norm": 0.51953125, + "learning_rate": 1.5009620404649193e-05, + "loss": 1.424, + "step": 16255 + }, + { + "epoch": 0.84, + "grad_norm": 0.5703125, + "learning_rate": 1.4962071729088255e-05, + "loss": 1.3947, + "step": 16260 + }, + { + "epoch": 0.84, + "grad_norm": 0.53125, + "learning_rate": 1.4914592396671468e-05, + "loss": 1.4185, + "step": 16265 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.486718244611528e-05, + "loss": 1.4036, + "step": 16270 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.481984191607959e-05, + "loss": 1.397, + "step": 16275 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 1.477257084516761e-05, + "loss": 1.4109, + "step": 16280 + }, + { + "epoch": 0.84, + "grad_norm": 0.5390625, + "learning_rate": 1.4725369271925982e-05, + "loss": 1.3913, + "step": 16285 + }, + { + "epoch": 0.84, + "grad_norm": 0.546875, + "learning_rate": 1.4678237234844649e-05, + "loss": 1.4272, + "step": 16290 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 1.4631174772356881e-05, + "loss": 1.4109, + "step": 16295 + }, + { + "epoch": 0.84, + "grad_norm": 0.55859375, + "learning_rate": 1.4584181922839157e-05, + "loss": 1.4192, + "step": 16300 + }, + { + "epoch": 0.84, + "grad_norm": 0.5234375, + "learning_rate": 1.4537258724611235e-05, + "loss": 1.3831, + "step": 16305 + }, + { + "epoch": 0.84, + "grad_norm": 0.53515625, + "learning_rate": 1.4490405215936066e-05, + "loss": 1.3819, + "step": 16310 + }, + { + "epoch": 0.84, + "grad_norm": 0.54296875, + "learning_rate": 1.4443621435019793e-05, + "loss": 1.3904, + "step": 16315 + }, + { + "epoch": 0.84, + "grad_norm": 0.55078125, + "learning_rate": 1.4396907420011651e-05, + "loss": 1.4392, + "step": 16320 + }, + { + "epoch": 0.84, + "grad_norm": 0.5703125, + "learning_rate": 1.4350263209004034e-05, + "loss": 1.453, + "step": 16325 + }, + { + "epoch": 0.84, + "grad_norm": 0.51953125, + "learning_rate": 1.4303688840032381e-05, + "loss": 1.4595, + "step": 16330 + }, + { + "epoch": 0.85, + "grad_norm": 0.546875, + "learning_rate": 1.4257184351075237e-05, + "loss": 1.4095, + "step": 16335 + }, + { + "epoch": 0.85, + "grad_norm": 0.515625, + "learning_rate": 1.4210749780054066e-05, + "loss": 1.3962, + "step": 16340 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 1.4164385164833394e-05, + "loss": 1.3973, + "step": 16345 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 1.4118090543220697e-05, + "loss": 1.4014, + "step": 16350 + }, + { + "epoch": 0.85, + "grad_norm": 0.51171875, + "learning_rate": 1.4071865952966368e-05, + "loss": 1.3986, + "step": 16355 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.4025711431763644e-05, + "loss": 1.3975, + "step": 16360 + }, + { + "epoch": 0.85, + "grad_norm": 0.546875, + "learning_rate": 1.3979627017248687e-05, + "loss": 1.4011, + "step": 16365 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 1.393361274700049e-05, + "loss": 1.4472, + "step": 16370 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.3887668658540842e-05, + "loss": 1.4033, + "step": 16375 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 1.3841794789334239e-05, + "loss": 1.4049, + "step": 16380 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.3795991176788004e-05, + "loss": 1.4162, + "step": 16385 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 1.3750257858252124e-05, + "loss": 1.406, + "step": 16390 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 1.3704594871019305e-05, + "loss": 1.425, + "step": 16395 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.3659002252324838e-05, + "loss": 1.3878, + "step": 16400 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 1.3613480039346682e-05, + "loss": 1.4231, + "step": 16405 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.3568028269205391e-05, + "loss": 1.4099, + "step": 16410 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.3522646978964027e-05, + "loss": 1.4498, + "step": 16415 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.3477336205628233e-05, + "loss": 1.435, + "step": 16420 + }, + { + "epoch": 0.85, + "grad_norm": 0.494140625, + "learning_rate": 1.3432095986146109e-05, + "loss": 1.4224, + "step": 16425 + }, + { + "epoch": 0.85, + "grad_norm": 0.515625, + "learning_rate": 1.3386926357408257e-05, + "loss": 1.3966, + "step": 16430 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 1.3341827356247682e-05, + "loss": 1.3741, + "step": 16435 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.3296799019439865e-05, + "loss": 1.4017, + "step": 16440 + }, + { + "epoch": 0.85, + "grad_norm": 0.52734375, + "learning_rate": 1.3251841383702557e-05, + "loss": 1.4242, + "step": 16445 + }, + { + "epoch": 0.85, + "grad_norm": 0.578125, + "learning_rate": 1.3206954485695944e-05, + "loss": 1.3895, + "step": 16450 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.3162138362022491e-05, + "loss": 1.405, + "step": 16455 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.3117393049226978e-05, + "loss": 1.4151, + "step": 16460 + }, + { + "epoch": 0.85, + "grad_norm": 0.51953125, + "learning_rate": 1.3072718583796405e-05, + "loss": 1.3724, + "step": 16465 + }, + { + "epoch": 0.85, + "grad_norm": 0.57421875, + "learning_rate": 1.3028115002160035e-05, + "loss": 1.4355, + "step": 16470 + }, + { + "epoch": 0.85, + "grad_norm": 0.53125, + "learning_rate": 1.2983582340689304e-05, + "loss": 1.4304, + "step": 16475 + }, + { + "epoch": 0.85, + "grad_norm": 0.546875, + "learning_rate": 1.2939120635697855e-05, + "loss": 1.3647, + "step": 16480 + }, + { + "epoch": 0.85, + "grad_norm": 0.515625, + "learning_rate": 1.2894729923441407e-05, + "loss": 1.3749, + "step": 16485 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 1.2850410240117849e-05, + "loss": 1.403, + "step": 16490 + }, + { + "epoch": 0.85, + "grad_norm": 0.546875, + "learning_rate": 1.280616162186713e-05, + "loss": 1.4169, + "step": 16495 + }, + { + "epoch": 0.85, + "grad_norm": 0.58203125, + "learning_rate": 1.2761984104771252e-05, + "loss": 1.3677, + "step": 16500 + }, + { + "epoch": 0.85, + "grad_norm": 0.54296875, + "learning_rate": 1.271787772485421e-05, + "loss": 1.4086, + "step": 16505 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 1.2673842518082024e-05, + "loss": 1.4118, + "step": 16510 + }, + { + "epoch": 0.85, + "grad_norm": 0.53125, + "learning_rate": 1.262987852036267e-05, + "loss": 1.4067, + "step": 16515 + }, + { + "epoch": 0.85, + "grad_norm": 0.5390625, + "learning_rate": 1.2585985767546083e-05, + "loss": 1.4026, + "step": 16520 + }, + { + "epoch": 0.85, + "grad_norm": 0.55078125, + "learning_rate": 1.2542164295424031e-05, + "loss": 1.4008, + "step": 16525 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 1.249841413973022e-05, + "loss": 1.3992, + "step": 16530 + }, + { + "epoch": 0.86, + "grad_norm": 0.515625, + "learning_rate": 1.2454735336140167e-05, + "loss": 1.3753, + "step": 16535 + }, + { + "epoch": 0.86, + "grad_norm": 0.5546875, + "learning_rate": 1.2411127920271271e-05, + "loss": 1.4067, + "step": 16540 + }, + { + "epoch": 0.86, + "grad_norm": 0.5703125, + "learning_rate": 1.2367591927682598e-05, + "loss": 1.4082, + "step": 16545 + }, + { + "epoch": 0.86, + "grad_norm": 0.54296875, + "learning_rate": 1.2324127393875084e-05, + "loss": 1.431, + "step": 16550 + }, + { + "epoch": 0.86, + "grad_norm": 0.53125, + "learning_rate": 1.2280734354291346e-05, + "loss": 1.3781, + "step": 16555 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 1.2237412844315722e-05, + "loss": 1.4195, + "step": 16560 + }, + { + "epoch": 0.86, + "grad_norm": 0.55078125, + "learning_rate": 1.2194162899274208e-05, + "loss": 1.3911, + "step": 16565 + }, + { + "epoch": 0.86, + "grad_norm": 0.54296875, + "learning_rate": 1.215098455443443e-05, + "loss": 1.3816, + "step": 16570 + }, + { + "epoch": 0.86, + "grad_norm": 0.5234375, + "learning_rate": 1.2107877845005644e-05, + "loss": 1.4202, + "step": 16575 + }, + { + "epoch": 0.86, + "grad_norm": 0.52734375, + "learning_rate": 1.2064842806138698e-05, + "loss": 1.3935, + "step": 16580 + }, + { + "epoch": 0.86, + "grad_norm": 0.57421875, + "learning_rate": 1.2021879472926023e-05, + "loss": 1.4332, + "step": 16585 + }, + { + "epoch": 0.86, + "grad_norm": 0.5546875, + "learning_rate": 1.1978987880401493e-05, + "loss": 1.3974, + "step": 16590 + }, + { + "epoch": 0.86, + "grad_norm": 0.51171875, + "learning_rate": 1.1936168063540554e-05, + "loss": 1.418, + "step": 16595 + }, + { + "epoch": 0.86, + "grad_norm": 0.51953125, + "learning_rate": 1.1893420057260118e-05, + "loss": 1.4039, + "step": 16600 + }, + { + "epoch": 0.86, + "grad_norm": 0.52734375, + "learning_rate": 1.1850743896418537e-05, + "loss": 1.3886, + "step": 16605 + }, + { + "epoch": 0.86, + "grad_norm": 0.53125, + "learning_rate": 1.1808139615815527e-05, + "loss": 1.3926, + "step": 16610 + }, + { + "epoch": 0.86, + "grad_norm": 0.54296875, + "learning_rate": 1.1765607250192245e-05, + "loss": 1.4107, + "step": 16615 + }, + { + "epoch": 0.86, + "grad_norm": 0.55859375, + "learning_rate": 1.1723146834231214e-05, + "loss": 1.4003, + "step": 16620 + }, + { + "epoch": 0.86, + "grad_norm": 0.5390625, + "learning_rate": 1.1680758402556257e-05, + "loss": 1.4199, + "step": 16625 + }, + { + "epoch": 0.86, + "grad_norm": 0.5234375, + "learning_rate": 1.1638441989732473e-05, + "loss": 1.3873, + "step": 16630 + }, + { + "epoch": 0.86, + "grad_norm": 0.5703125, + "learning_rate": 1.1596197630266292e-05, + "loss": 1.4124, + "step": 16635 + }, + { + "epoch": 0.86, + "grad_norm": 0.5390625, + "learning_rate": 1.1554025358605369e-05, + "loss": 1.4054, + "step": 16640 + }, + { + "epoch": 0.86, + "grad_norm": 0.57421875, + "learning_rate": 1.1511925209138575e-05, + "loss": 1.4418, + "step": 16645 + }, + { + "epoch": 0.86, + "grad_norm": 0.515625, + "learning_rate": 1.1469897216195924e-05, + "loss": 1.4035, + "step": 16650 + }, + { + "epoch": 0.86, + "grad_norm": 0.5546875, + "learning_rate": 1.142794141404866e-05, + "loss": 1.3955, + "step": 16655 + }, + { + "epoch": 0.86, + "grad_norm": 0.5234375, + "learning_rate": 1.1386057836909137e-05, + "loss": 1.3921, + "step": 16660 + }, + { + "epoch": 0.86, + "grad_norm": 0.51953125, + "learning_rate": 1.1344246518930823e-05, + "loss": 1.4074, + "step": 16665 + }, + { + "epoch": 0.86, + "grad_norm": 0.5390625, + "learning_rate": 1.1302507494208191e-05, + "loss": 1.3626, + "step": 16670 + }, + { + "epoch": 0.86, + "grad_norm": 0.55078125, + "learning_rate": 1.1260840796776873e-05, + "loss": 1.4112, + "step": 16675 + }, + { + "epoch": 0.86, + "grad_norm": 0.578125, + "learning_rate": 1.1219246460613452e-05, + "loss": 1.4386, + "step": 16680 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 1.1177724519635547e-05, + "loss": 1.4362, + "step": 16685 + }, + { + "epoch": 0.86, + "grad_norm": 0.55859375, + "learning_rate": 1.113627500770167e-05, + "loss": 1.3881, + "step": 16690 + }, + { + "epoch": 0.86, + "grad_norm": 0.51953125, + "learning_rate": 1.109489795861135e-05, + "loss": 1.392, + "step": 16695 + }, + { + "epoch": 0.86, + "grad_norm": 0.546875, + "learning_rate": 1.1053593406105001e-05, + "loss": 1.3767, + "step": 16700 + }, + { + "epoch": 0.86, + "grad_norm": 0.58203125, + "learning_rate": 1.1012361383863946e-05, + "loss": 1.3679, + "step": 16705 + }, + { + "epoch": 0.86, + "grad_norm": 0.51953125, + "learning_rate": 1.0971201925510288e-05, + "loss": 1.4073, + "step": 16710 + }, + { + "epoch": 0.86, + "grad_norm": 0.546875, + "learning_rate": 1.0930115064607016e-05, + "loss": 1.4165, + "step": 16715 + }, + { + "epoch": 0.87, + "grad_norm": 0.51953125, + "learning_rate": 1.0889100834657917e-05, + "loss": 1.403, + "step": 16720 + }, + { + "epoch": 0.87, + "grad_norm": 0.56640625, + "learning_rate": 1.0848159269107538e-05, + "loss": 1.3617, + "step": 16725 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 1.0807290401341219e-05, + "loss": 1.3885, + "step": 16730 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 1.0766494264684934e-05, + "loss": 1.3835, + "step": 16735 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 1.0725770892405407e-05, + "loss": 1.3934, + "step": 16740 + }, + { + "epoch": 0.87, + "grad_norm": 0.51953125, + "learning_rate": 1.0685120317710029e-05, + "loss": 1.3962, + "step": 16745 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 1.064454257374683e-05, + "loss": 1.4156, + "step": 16750 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 1.0604037693604396e-05, + "loss": 1.427, + "step": 16755 + }, + { + "epoch": 0.87, + "grad_norm": 0.53125, + "learning_rate": 1.0563605710311974e-05, + "loss": 1.3918, + "step": 16760 + }, + { + "epoch": 0.87, + "grad_norm": 0.5625, + "learning_rate": 1.0523246656839314e-05, + "loss": 1.4136, + "step": 16765 + }, + { + "epoch": 0.87, + "grad_norm": 0.5546875, + "learning_rate": 1.0482960566096733e-05, + "loss": 1.4192, + "step": 16770 + }, + { + "epoch": 0.87, + "grad_norm": 0.53515625, + "learning_rate": 1.0442747470935022e-05, + "loss": 1.4162, + "step": 16775 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 1.0402607404145449e-05, + "loss": 1.3932, + "step": 16780 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 1.0362540398459752e-05, + "loss": 1.3889, + "step": 16785 + }, + { + "epoch": 0.87, + "grad_norm": 0.55859375, + "learning_rate": 1.0322546486550112e-05, + "loss": 1.4413, + "step": 16790 + }, + { + "epoch": 0.87, + "grad_norm": 0.5234375, + "learning_rate": 1.0282625701029037e-05, + "loss": 1.4081, + "step": 16795 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 1.0242778074449455e-05, + "loss": 1.4092, + "step": 16800 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 1.0203003639304643e-05, + "loss": 1.4168, + "step": 16805 + }, + { + "epoch": 0.87, + "grad_norm": 0.53125, + "learning_rate": 1.0163302428028188e-05, + "loss": 1.4001, + "step": 16810 + }, + { + "epoch": 0.87, + "grad_norm": 0.5546875, + "learning_rate": 1.0123674472993916e-05, + "loss": 1.4336, + "step": 16815 + }, + { + "epoch": 0.87, + "grad_norm": 0.5234375, + "learning_rate": 1.0084119806516001e-05, + "loss": 1.4195, + "step": 16820 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 1.0044638460848798e-05, + "loss": 1.4218, + "step": 16825 + }, + { + "epoch": 0.87, + "grad_norm": 0.55078125, + "learning_rate": 1.000523046818691e-05, + "loss": 1.3904, + "step": 16830 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 9.965895860665075e-06, + "loss": 1.4194, + "step": 16835 + }, + { + "epoch": 0.87, + "grad_norm": 0.51953125, + "learning_rate": 9.926634670358236e-06, + "loss": 1.386, + "step": 16840 + }, + { + "epoch": 0.87, + "grad_norm": 0.55859375, + "learning_rate": 9.887446929281453e-06, + "loss": 1.4482, + "step": 16845 + }, + { + "epoch": 0.87, + "grad_norm": 0.515625, + "learning_rate": 9.848332669389916e-06, + "loss": 1.405, + "step": 16850 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 9.809291922578823e-06, + "loss": 1.4228, + "step": 16855 + }, + { + "epoch": 0.87, + "grad_norm": 0.515625, + "learning_rate": 9.7703247206835e-06, + "loss": 1.3764, + "step": 16860 + }, + { + "epoch": 0.87, + "grad_norm": 0.5234375, + "learning_rate": 9.731431095479281e-06, + "loss": 1.3871, + "step": 16865 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 9.692611078681513e-06, + "loss": 1.4231, + "step": 16870 + }, + { + "epoch": 0.87, + "grad_norm": 0.5546875, + "learning_rate": 9.653864701945469e-06, + "loss": 1.4045, + "step": 16875 + }, + { + "epoch": 0.87, + "grad_norm": 0.5390625, + "learning_rate": 9.615191996866446e-06, + "loss": 1.4119, + "step": 16880 + }, + { + "epoch": 0.87, + "grad_norm": 0.53515625, + "learning_rate": 9.576592994979617e-06, + "loss": 1.3788, + "step": 16885 + }, + { + "epoch": 0.87, + "grad_norm": 0.54296875, + "learning_rate": 9.53806772776008e-06, + "loss": 1.4353, + "step": 16890 + }, + { + "epoch": 0.87, + "grad_norm": 0.56640625, + "learning_rate": 9.499616226622766e-06, + "loss": 1.4092, + "step": 16895 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 9.4612385229225e-06, + "loss": 1.4103, + "step": 16900 + }, + { + "epoch": 0.87, + "grad_norm": 0.55859375, + "learning_rate": 9.422934647953929e-06, + "loss": 1.4503, + "step": 16905 + }, + { + "epoch": 0.87, + "grad_norm": 0.52734375, + "learning_rate": 9.38470463295148e-06, + "loss": 1.4105, + "step": 16910 + }, + { + "epoch": 0.88, + "grad_norm": 0.53125, + "learning_rate": 9.346548509089326e-06, + "loss": 1.4304, + "step": 16915 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 9.308466307481423e-06, + "loss": 1.3896, + "step": 16920 + }, + { + "epoch": 0.88, + "grad_norm": 0.55859375, + "learning_rate": 9.270458059181452e-06, + "loss": 1.4248, + "step": 16925 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 9.23252379518279e-06, + "loss": 1.4388, + "step": 16930 + }, + { + "epoch": 0.88, + "grad_norm": 0.609375, + "learning_rate": 9.194663546418436e-06, + "loss": 1.3883, + "step": 16935 + }, + { + "epoch": 0.88, + "grad_norm": 0.5546875, + "learning_rate": 9.156877343761094e-06, + "loss": 1.3937, + "step": 16940 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 9.11916521802304e-06, + "loss": 1.4132, + "step": 16945 + }, + { + "epoch": 0.88, + "grad_norm": 0.515625, + "learning_rate": 9.081527199956196e-06, + "loss": 1.4238, + "step": 16950 + }, + { + "epoch": 0.88, + "grad_norm": 0.5078125, + "learning_rate": 9.043963320252025e-06, + "loss": 1.3871, + "step": 16955 + }, + { + "epoch": 0.88, + "grad_norm": 0.52734375, + "learning_rate": 9.006473609541511e-06, + "loss": 1.4014, + "step": 16960 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 8.96905809839519e-06, + "loss": 1.4254, + "step": 16965 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 8.931716817323099e-06, + "loss": 1.4089, + "step": 16970 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 8.89444979677474e-06, + "loss": 1.4142, + "step": 16975 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 8.857257067139013e-06, + "loss": 1.4016, + "step": 16980 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 8.820138658744304e-06, + "loss": 1.4131, + "step": 16985 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 8.783094601858355e-06, + "loss": 1.4196, + "step": 16990 + }, + { + "epoch": 0.88, + "grad_norm": 0.515625, + "learning_rate": 8.746124926688325e-06, + "loss": 1.3525, + "step": 16995 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 8.709229663380658e-06, + "loss": 1.3739, + "step": 17000 + }, + { + "epoch": 0.88, + "grad_norm": 0.515625, + "learning_rate": 8.67240884202113e-06, + "loss": 1.4177, + "step": 17005 + }, + { + "epoch": 0.88, + "grad_norm": 0.5078125, + "learning_rate": 8.635662492634855e-06, + "loss": 1.3755, + "step": 17010 + }, + { + "epoch": 0.88, + "grad_norm": 0.52734375, + "learning_rate": 8.59899064518619e-06, + "loss": 1.3941, + "step": 17015 + }, + { + "epoch": 0.88, + "grad_norm": 0.546875, + "learning_rate": 8.562393329578767e-06, + "loss": 1.4086, + "step": 17020 + }, + { + "epoch": 0.88, + "grad_norm": 0.546875, + "learning_rate": 8.525870575655392e-06, + "loss": 1.4255, + "step": 17025 + }, + { + "epoch": 0.88, + "grad_norm": 0.55078125, + "learning_rate": 8.489422413198112e-06, + "loss": 1.3802, + "step": 17030 + }, + { + "epoch": 0.88, + "grad_norm": 0.55078125, + "learning_rate": 8.453048871928138e-06, + "loss": 1.4057, + "step": 17035 + }, + { + "epoch": 0.88, + "grad_norm": 0.5234375, + "learning_rate": 8.416749981505856e-06, + "loss": 1.3872, + "step": 17040 + }, + { + "epoch": 0.88, + "grad_norm": 0.5234375, + "learning_rate": 8.380525771530701e-06, + "loss": 1.4199, + "step": 17045 + }, + { + "epoch": 0.88, + "grad_norm": 0.55078125, + "learning_rate": 8.34437627154131e-06, + "loss": 1.3667, + "step": 17050 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 8.308301511015327e-06, + "loss": 1.4286, + "step": 17055 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 8.272301519369519e-06, + "loss": 1.4153, + "step": 17060 + }, + { + "epoch": 0.88, + "grad_norm": 0.55078125, + "learning_rate": 8.236376325959583e-06, + "loss": 1.4186, + "step": 17065 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 8.200525960080308e-06, + "loss": 1.387, + "step": 17070 + }, + { + "epoch": 0.88, + "grad_norm": 0.51171875, + "learning_rate": 8.16475045096543e-06, + "loss": 1.418, + "step": 17075 + }, + { + "epoch": 0.88, + "grad_norm": 0.53515625, + "learning_rate": 8.129049827787693e-06, + "loss": 1.3995, + "step": 17080 + }, + { + "epoch": 0.88, + "grad_norm": 0.51953125, + "learning_rate": 8.093424119658678e-06, + "loss": 1.3728, + "step": 17085 + }, + { + "epoch": 0.88, + "grad_norm": 0.54296875, + "learning_rate": 8.057873355628964e-06, + "loss": 1.3687, + "step": 17090 + }, + { + "epoch": 0.88, + "grad_norm": 0.52734375, + "learning_rate": 8.022397564687989e-06, + "loss": 1.4024, + "step": 17095 + }, + { + "epoch": 0.88, + "grad_norm": 0.5546875, + "learning_rate": 7.986996775764077e-06, + "loss": 1.4226, + "step": 17100 + }, + { + "epoch": 0.88, + "grad_norm": 0.5390625, + "learning_rate": 7.951671017724316e-06, + "loss": 1.4087, + "step": 17105 + }, + { + "epoch": 0.89, + "grad_norm": 0.51171875, + "learning_rate": 7.916420319374707e-06, + "loss": 1.3865, + "step": 17110 + }, + { + "epoch": 0.89, + "grad_norm": 0.515625, + "learning_rate": 7.88124470945999e-06, + "loss": 1.413, + "step": 17115 + }, + { + "epoch": 0.89, + "grad_norm": 0.52734375, + "learning_rate": 7.846144216663697e-06, + "loss": 1.3887, + "step": 17120 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.811118869608081e-06, + "loss": 1.4014, + "step": 17125 + }, + { + "epoch": 0.89, + "grad_norm": 0.5546875, + "learning_rate": 7.776168696854147e-06, + "loss": 1.4044, + "step": 17130 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 7.741293726901589e-06, + "loss": 1.4081, + "step": 17135 + }, + { + "epoch": 0.89, + "grad_norm": 0.53515625, + "learning_rate": 7.70649398818879e-06, + "loss": 1.3762, + "step": 17140 + }, + { + "epoch": 0.89, + "grad_norm": 0.51953125, + "learning_rate": 7.671769509092741e-06, + "loss": 1.3945, + "step": 17145 + }, + { + "epoch": 0.89, + "grad_norm": 0.52734375, + "learning_rate": 7.637120317929114e-06, + "loss": 1.374, + "step": 17150 + }, + { + "epoch": 0.89, + "grad_norm": 0.5234375, + "learning_rate": 7.6025464429521635e-06, + "loss": 1.3886, + "step": 17155 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 7.56804791235477e-06, + "loss": 1.3781, + "step": 17160 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.533624754268287e-06, + "loss": 1.4105, + "step": 17165 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.499276996762694e-06, + "loss": 1.4119, + "step": 17170 + }, + { + "epoch": 0.89, + "grad_norm": 0.53515625, + "learning_rate": 7.465004667846431e-06, + "loss": 1.4016, + "step": 17175 + }, + { + "epoch": 0.89, + "grad_norm": 0.55078125, + "learning_rate": 7.430807795466488e-06, + "loss": 1.4167, + "step": 17180 + }, + { + "epoch": 0.89, + "grad_norm": 0.58984375, + "learning_rate": 7.396686407508246e-06, + "loss": 1.4252, + "step": 17185 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.362640531795606e-06, + "loss": 1.3954, + "step": 17190 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.328670196090836e-06, + "loss": 1.3732, + "step": 17195 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.294775428094669e-06, + "loss": 1.3902, + "step": 17200 + }, + { + "epoch": 0.89, + "grad_norm": 0.515625, + "learning_rate": 7.26095625544615e-06, + "loss": 1.39, + "step": 17205 + }, + { + "epoch": 0.89, + "grad_norm": 0.55078125, + "learning_rate": 7.227212705722719e-06, + "loss": 1.444, + "step": 17210 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 7.1935448064401445e-06, + "loss": 1.4021, + "step": 17215 + }, + { + "epoch": 0.89, + "grad_norm": 0.5546875, + "learning_rate": 7.159952585052532e-06, + "loss": 1.4215, + "step": 17220 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390625, + "learning_rate": 7.126436068952202e-06, + "loss": 1.4201, + "step": 17225 + }, + { + "epoch": 0.89, + "grad_norm": 0.578125, + "learning_rate": 7.092995285469816e-06, + "loss": 1.4256, + "step": 17230 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 7.05963026187425e-06, + "loss": 1.4086, + "step": 17235 + }, + { + "epoch": 0.89, + "grad_norm": 0.5234375, + "learning_rate": 7.026341025372629e-06, + "loss": 1.3837, + "step": 17240 + }, + { + "epoch": 0.89, + "grad_norm": 0.515625, + "learning_rate": 6.993127603110216e-06, + "loss": 1.4214, + "step": 17245 + }, + { + "epoch": 0.89, + "grad_norm": 0.58203125, + "learning_rate": 6.959990022170515e-06, + "loss": 1.4172, + "step": 17250 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390625, + "learning_rate": 6.926928309575154e-06, + "loss": 1.4337, + "step": 17255 + }, + { + "epoch": 0.89, + "grad_norm": 0.53515625, + "learning_rate": 6.893942492283934e-06, + "loss": 1.4106, + "step": 17260 + }, + { + "epoch": 0.89, + "grad_norm": 0.5234375, + "learning_rate": 6.861032597194683e-06, + "loss": 1.3771, + "step": 17265 + }, + { + "epoch": 0.89, + "grad_norm": 0.55078125, + "learning_rate": 6.828198651143425e-06, + "loss": 1.4111, + "step": 17270 + }, + { + "epoch": 0.89, + "grad_norm": 0.53515625, + "learning_rate": 6.795440680904164e-06, + "loss": 1.3926, + "step": 17275 + }, + { + "epoch": 0.89, + "grad_norm": 0.50390625, + "learning_rate": 6.762758713189044e-06, + "loss": 1.4102, + "step": 17280 + }, + { + "epoch": 0.89, + "grad_norm": 0.546875, + "learning_rate": 6.730152774648113e-06, + "loss": 1.3879, + "step": 17285 + }, + { + "epoch": 0.89, + "grad_norm": 0.55078125, + "learning_rate": 6.697622891869515e-06, + "loss": 1.3946, + "step": 17290 + }, + { + "epoch": 0.89, + "grad_norm": 0.53125, + "learning_rate": 6.665169091379364e-06, + "loss": 1.3882, + "step": 17295 + }, + { + "epoch": 0.9, + "grad_norm": 0.5234375, + "learning_rate": 6.63279139964168e-06, + "loss": 1.3692, + "step": 17300 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 6.6004898430585e-06, + "loss": 1.3792, + "step": 17305 + }, + { + "epoch": 0.9, + "grad_norm": 0.55078125, + "learning_rate": 6.568264447969697e-06, + "loss": 1.4321, + "step": 17310 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390625, + "learning_rate": 6.536115240653096e-06, + "loss": 1.445, + "step": 17315 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 6.504042247324371e-06, + "loss": 1.411, + "step": 17320 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 6.47204549413708e-06, + "loss": 1.4009, + "step": 17325 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390625, + "learning_rate": 6.440125007182551e-06, + "loss": 1.4019, + "step": 17330 + }, + { + "epoch": 0.9, + "grad_norm": 0.55078125, + "learning_rate": 6.408280812489964e-06, + "loss": 1.4255, + "step": 17335 + }, + { + "epoch": 0.9, + "grad_norm": 0.56640625, + "learning_rate": 6.37651293602628e-06, + "loss": 1.3937, + "step": 17340 + }, + { + "epoch": 0.9, + "grad_norm": 0.53515625, + "learning_rate": 6.344821403696255e-06, + "loss": 1.3765, + "step": 17345 + }, + { + "epoch": 0.9, + "grad_norm": 0.5078125, + "learning_rate": 6.313206241342328e-06, + "loss": 1.3954, + "step": 17350 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 6.281667474744712e-06, + "loss": 1.4106, + "step": 17355 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 6.2502051296213226e-06, + "loss": 1.4296, + "step": 17360 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 6.2188192316277374e-06, + "loss": 1.3906, + "step": 17365 + }, + { + "epoch": 0.9, + "grad_norm": 0.51953125, + "learning_rate": 6.1875098063571835e-06, + "loss": 1.4083, + "step": 17370 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 6.156276879340583e-06, + "loss": 1.4249, + "step": 17375 + }, + { + "epoch": 0.9, + "grad_norm": 0.5234375, + "learning_rate": 6.125120476046431e-06, + "loss": 1.396, + "step": 17380 + }, + { + "epoch": 0.9, + "grad_norm": 0.57421875, + "learning_rate": 6.094040621880837e-06, + "loss": 1.3768, + "step": 17385 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 6.0630373421875055e-06, + "loss": 1.4468, + "step": 17390 + }, + { + "epoch": 0.9, + "grad_norm": 0.51953125, + "learning_rate": 6.032110662247659e-06, + "loss": 1.4076, + "step": 17395 + }, + { + "epoch": 0.9, + "grad_norm": 0.52734375, + "learning_rate": 6.0012606072800905e-06, + "loss": 1.387, + "step": 17400 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 5.970487202441122e-06, + "loss": 1.3739, + "step": 17405 + }, + { + "epoch": 0.9, + "grad_norm": 0.5234375, + "learning_rate": 5.939790472824535e-06, + "loss": 1.4175, + "step": 17410 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 5.909170443461598e-06, + "loss": 1.421, + "step": 17415 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 5.878627139321047e-06, + "loss": 1.4289, + "step": 17420 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 5.848160585309048e-06, + "loss": 1.408, + "step": 17425 + }, + { + "epoch": 0.9, + "grad_norm": 0.52734375, + "learning_rate": 5.817770806269207e-06, + "loss": 1.4263, + "step": 17430 + }, + { + "epoch": 0.9, + "grad_norm": 0.53125, + "learning_rate": 5.787457826982457e-06, + "loss": 1.4055, + "step": 17435 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 5.757221672167168e-06, + "loss": 1.4094, + "step": 17440 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 5.727062366479041e-06, + "loss": 1.3952, + "step": 17445 + }, + { + "epoch": 0.9, + "grad_norm": 0.52734375, + "learning_rate": 5.696979934511137e-06, + "loss": 1.3921, + "step": 17450 + }, + { + "epoch": 0.9, + "grad_norm": 0.546875, + "learning_rate": 5.666974400793779e-06, + "loss": 1.4552, + "step": 17455 + }, + { + "epoch": 0.9, + "grad_norm": 0.5234375, + "learning_rate": 5.637045789794626e-06, + "loss": 1.4358, + "step": 17460 + }, + { + "epoch": 0.9, + "grad_norm": 0.54296875, + "learning_rate": 5.607194125918602e-06, + "loss": 1.43, + "step": 17465 + }, + { + "epoch": 0.9, + "grad_norm": 0.5546875, + "learning_rate": 5.577419433507891e-06, + "loss": 1.4005, + "step": 17470 + }, + { + "epoch": 0.9, + "grad_norm": 0.53125, + "learning_rate": 5.547721736841871e-06, + "loss": 1.4112, + "step": 17475 + }, + { + "epoch": 0.9, + "grad_norm": 0.53515625, + "learning_rate": 5.518101060137204e-06, + "loss": 1.3764, + "step": 17480 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390625, + "learning_rate": 5.488557427547692e-06, + "loss": 1.4062, + "step": 17485 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390625, + "learning_rate": 5.459090863164351e-06, + "loss": 1.4151, + "step": 17490 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 5.429701391015296e-06, + "loss": 1.4052, + "step": 17495 + }, + { + "epoch": 0.91, + "grad_norm": 0.5546875, + "learning_rate": 5.400389035065845e-06, + "loss": 1.4168, + "step": 17500 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 5.371153819218389e-06, + "loss": 1.4148, + "step": 17505 + }, + { + "epoch": 0.91, + "grad_norm": 0.546875, + "learning_rate": 5.341995767312435e-06, + "loss": 1.4043, + "step": 17510 + }, + { + "epoch": 0.91, + "grad_norm": 0.51953125, + "learning_rate": 5.312914903124566e-06, + "loss": 1.398, + "step": 17515 + }, + { + "epoch": 0.91, + "grad_norm": 0.56640625, + "learning_rate": 5.2839112503684e-06, + "loss": 1.426, + "step": 17520 + }, + { + "epoch": 0.91, + "grad_norm": 0.5390625, + "learning_rate": 5.254984832694632e-06, + "loss": 1.3925, + "step": 17525 + }, + { + "epoch": 0.91, + "grad_norm": 0.51953125, + "learning_rate": 5.226135673690957e-06, + "loss": 1.3968, + "step": 17530 + }, + { + "epoch": 0.91, + "grad_norm": 0.5546875, + "learning_rate": 5.19736379688206e-06, + "loss": 1.3513, + "step": 17535 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625, + "learning_rate": 5.168669225729616e-06, + "loss": 1.3746, + "step": 17540 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625, + "learning_rate": 5.140051983632266e-06, + "loss": 1.4029, + "step": 17545 + }, + { + "epoch": 0.91, + "grad_norm": 0.5390625, + "learning_rate": 5.111512093925619e-06, + "loss": 1.4421, + "step": 17550 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 5.083049579882149e-06, + "loss": 1.3623, + "step": 17555 + }, + { + "epoch": 0.91, + "grad_norm": 0.55078125, + "learning_rate": 5.054664464711267e-06, + "loss": 1.4099, + "step": 17560 + }, + { + "epoch": 0.91, + "grad_norm": 0.5390625, + "learning_rate": 5.026356771559282e-06, + "loss": 1.4274, + "step": 17565 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.998126523509361e-06, + "loss": 1.3686, + "step": 17570 + }, + { + "epoch": 0.91, + "grad_norm": 0.54296875, + "learning_rate": 4.969973743581502e-06, + "loss": 1.3974, + "step": 17575 + }, + { + "epoch": 0.91, + "grad_norm": 0.546875, + "learning_rate": 4.941898454732563e-06, + "loss": 1.4286, + "step": 17580 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625, + "learning_rate": 4.913900679856176e-06, + "loss": 1.4128, + "step": 17585 + }, + { + "epoch": 0.91, + "grad_norm": 0.5234375, + "learning_rate": 4.885980441782823e-06, + "loss": 1.4221, + "step": 17590 + }, + { + "epoch": 0.91, + "grad_norm": 0.5078125, + "learning_rate": 4.858137763279702e-06, + "loss": 1.403, + "step": 17595 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 4.830372667050753e-06, + "loss": 1.4253, + "step": 17600 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.802685175736732e-06, + "loss": 1.393, + "step": 17605 + }, + { + "epoch": 0.91, + "grad_norm": 0.56640625, + "learning_rate": 4.775075311915045e-06, + "loss": 1.4162, + "step": 17610 + }, + { + "epoch": 0.91, + "grad_norm": 0.515625, + "learning_rate": 4.747543098099838e-06, + "loss": 1.3916, + "step": 17615 + }, + { + "epoch": 0.91, + "grad_norm": 0.546875, + "learning_rate": 4.720088556741897e-06, + "loss": 1.4114, + "step": 17620 + }, + { + "epoch": 0.91, + "grad_norm": 0.578125, + "learning_rate": 4.6927117102287034e-06, + "loss": 1.375, + "step": 17625 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 4.665412580884365e-06, + "loss": 1.4022, + "step": 17630 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 4.638191190969665e-06, + "loss": 1.4275, + "step": 17635 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.611047562681903e-06, + "loss": 1.3914, + "step": 17640 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.58398171815504e-06, + "loss": 1.3729, + "step": 17645 + }, + { + "epoch": 0.91, + "grad_norm": 0.55078125, + "learning_rate": 4.556993679459587e-06, + "loss": 1.3844, + "step": 17650 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.530083468602631e-06, + "loss": 1.4087, + "step": 17655 + }, + { + "epoch": 0.91, + "grad_norm": 0.53515625, + "learning_rate": 4.503251107527751e-06, + "loss": 1.4423, + "step": 17660 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 4.476496618115078e-06, + "loss": 1.3984, + "step": 17665 + }, + { + "epoch": 0.91, + "grad_norm": 0.5390625, + "learning_rate": 4.449820022181239e-06, + "loss": 1.4143, + "step": 17670 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 4.423221341479344e-06, + "loss": 1.3943, + "step": 17675 + }, + { + "epoch": 0.91, + "grad_norm": 0.55078125, + "learning_rate": 4.396700597698955e-06, + "loss": 1.3885, + "step": 17680 + }, + { + "epoch": 0.91, + "grad_norm": 0.57421875, + "learning_rate": 4.3702578124660834e-06, + "loss": 1.3702, + "step": 17685 + }, + { + "epoch": 0.92, + "grad_norm": 0.54296875, + "learning_rate": 4.34389300734318e-06, + "loss": 1.4119, + "step": 17690 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 4.317606203829127e-06, + "loss": 1.3952, + "step": 17695 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 4.291397423359156e-06, + "loss": 1.4155, + "step": 17700 + }, + { + "epoch": 0.92, + "grad_norm": 0.52734375, + "learning_rate": 4.265266687304892e-06, + "loss": 1.4023, + "step": 17705 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 4.239214016974335e-06, + "loss": 1.3853, + "step": 17710 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 4.213239433611848e-06, + "loss": 1.4075, + "step": 17715 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 4.1873429583980325e-06, + "loss": 1.4139, + "step": 17720 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 4.161524612449896e-06, + "loss": 1.4018, + "step": 17725 + }, + { + "epoch": 0.92, + "grad_norm": 0.5234375, + "learning_rate": 4.135784416820665e-06, + "loss": 1.4292, + "step": 17730 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 4.110122392499915e-06, + "loss": 1.3752, + "step": 17735 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 4.0845385604133755e-06, + "loss": 1.3981, + "step": 17740 + }, + { + "epoch": 0.92, + "grad_norm": 0.5234375, + "learning_rate": 4.059032941423113e-06, + "loss": 1.3676, + "step": 17745 + }, + { + "epoch": 0.92, + "grad_norm": 0.54296875, + "learning_rate": 4.033605556327347e-06, + "loss": 1.3836, + "step": 17750 + }, + { + "epoch": 0.92, + "grad_norm": 0.55859375, + "learning_rate": 4.008256425860546e-06, + "loss": 1.4058, + "step": 17755 + }, + { + "epoch": 0.92, + "grad_norm": 0.546875, + "learning_rate": 3.982985570693354e-06, + "loss": 1.4037, + "step": 17760 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 3.957793011432564e-06, + "loss": 1.4318, + "step": 17765 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 3.932678768621145e-06, + "loss": 1.381, + "step": 17770 + }, + { + "epoch": 0.92, + "grad_norm": 0.5546875, + "learning_rate": 3.907642862738214e-06, + "loss": 1.4374, + "step": 17775 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 3.882685314199009e-06, + "loss": 1.3676, + "step": 17780 + }, + { + "epoch": 0.92, + "grad_norm": 0.5625, + "learning_rate": 3.857806143354814e-06, + "loss": 1.4109, + "step": 17785 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 3.833005370493081e-06, + "loss": 1.3809, + "step": 17790 + }, + { + "epoch": 0.92, + "grad_norm": 0.54296875, + "learning_rate": 3.808283015837277e-06, + "loss": 1.4036, + "step": 17795 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 3.7836390995469873e-06, + "loss": 1.383, + "step": 17800 + }, + { + "epoch": 0.92, + "grad_norm": 0.52734375, + "learning_rate": 3.7590736417177365e-06, + "loss": 1.4214, + "step": 17805 + }, + { + "epoch": 0.92, + "grad_norm": 0.5234375, + "learning_rate": 3.7345866623811677e-06, + "loss": 1.381, + "step": 17810 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 3.7101781815048753e-06, + "loss": 1.3782, + "step": 17815 + }, + { + "epoch": 0.92, + "grad_norm": 0.54296875, + "learning_rate": 3.6858482189924716e-06, + "loss": 1.4236, + "step": 17820 + }, + { + "epoch": 0.92, + "grad_norm": 0.55078125, + "learning_rate": 3.6615967946835084e-06, + "loss": 1.3932, + "step": 17825 + }, + { + "epoch": 0.92, + "grad_norm": 0.5078125, + "learning_rate": 3.637423928353523e-06, + "loss": 1.3883, + "step": 17830 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 3.6133296397139804e-06, + "loss": 1.4071, + "step": 17835 + }, + { + "epoch": 0.92, + "grad_norm": 0.52734375, + "learning_rate": 3.5893139484122982e-06, + "loss": 1.3812, + "step": 17840 + }, + { + "epoch": 0.92, + "grad_norm": 0.54296875, + "learning_rate": 3.565376874031756e-06, + "loss": 1.3953, + "step": 17845 + }, + { + "epoch": 0.92, + "grad_norm": 0.53515625, + "learning_rate": 3.541518436091562e-06, + "loss": 1.4084, + "step": 17850 + }, + { + "epoch": 0.92, + "grad_norm": 0.52734375, + "learning_rate": 3.517738654046776e-06, + "loss": 1.4153, + "step": 17855 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 3.4940375472883536e-06, + "loss": 1.4134, + "step": 17860 + }, + { + "epoch": 0.92, + "grad_norm": 0.51953125, + "learning_rate": 3.470415135143046e-06, + "loss": 1.4192, + "step": 17865 + }, + { + "epoch": 0.92, + "grad_norm": 0.546875, + "learning_rate": 3.446871436873478e-06, + "loss": 1.4571, + "step": 17870 + }, + { + "epoch": 0.92, + "grad_norm": 0.53125, + "learning_rate": 3.42340647167807e-06, + "loss": 1.3937, + "step": 17875 + }, + { + "epoch": 0.93, + "grad_norm": 0.52734375, + "learning_rate": 3.400020258691061e-06, + "loss": 1.3724, + "step": 17880 + }, + { + "epoch": 0.93, + "grad_norm": 0.51171875, + "learning_rate": 3.3767128169824304e-06, + "loss": 1.4317, + "step": 17885 + }, + { + "epoch": 0.93, + "grad_norm": 0.53125, + "learning_rate": 3.353484165557941e-06, + "loss": 1.4134, + "step": 17890 + }, + { + "epoch": 0.93, + "grad_norm": 0.58203125, + "learning_rate": 3.33033432335913e-06, + "loss": 1.4244, + "step": 17895 + }, + { + "epoch": 0.93, + "grad_norm": 0.5078125, + "learning_rate": 3.307263309263242e-06, + "loss": 1.3976, + "step": 17900 + }, + { + "epoch": 0.93, + "grad_norm": 0.53125, + "learning_rate": 3.284271142083284e-06, + "loss": 1.4189, + "step": 17905 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 3.2613578405679023e-06, + "loss": 1.3897, + "step": 17910 + }, + { + "epoch": 0.93, + "grad_norm": 0.53515625, + "learning_rate": 3.238523423401496e-06, + "loss": 1.4193, + "step": 17915 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390625, + "learning_rate": 3.2157679092040927e-06, + "loss": 1.4181, + "step": 17920 + }, + { + "epoch": 0.93, + "grad_norm": 0.53125, + "learning_rate": 3.193091316531427e-06, + "loss": 1.407, + "step": 17925 + }, + { + "epoch": 0.93, + "grad_norm": 0.5234375, + "learning_rate": 3.1704936638748296e-06, + "loss": 1.4167, + "step": 17930 + }, + { + "epoch": 0.93, + "grad_norm": 0.55078125, + "learning_rate": 3.1479749696612713e-06, + "loss": 1.3896, + "step": 17935 + }, + { + "epoch": 0.93, + "grad_norm": 0.5546875, + "learning_rate": 3.1255352522533755e-06, + "loss": 1.4004, + "step": 17940 + }, + { + "epoch": 0.93, + "grad_norm": 0.54296875, + "learning_rate": 3.1031745299493266e-06, + "loss": 1.395, + "step": 17945 + }, + { + "epoch": 0.93, + "grad_norm": 0.55078125, + "learning_rate": 3.0808928209828837e-06, + "loss": 1.408, + "step": 17950 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 3.058690143523424e-06, + "loss": 1.3846, + "step": 17955 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390625, + "learning_rate": 3.0365665156758315e-06, + "loss": 1.4238, + "step": 17960 + }, + { + "epoch": 0.93, + "grad_norm": 0.5234375, + "learning_rate": 3.014521955480565e-06, + "loss": 1.4062, + "step": 17965 + }, + { + "epoch": 0.93, + "grad_norm": 0.51953125, + "learning_rate": 2.9925564809135776e-06, + "loss": 1.3923, + "step": 17970 + }, + { + "epoch": 0.93, + "grad_norm": 0.51171875, + "learning_rate": 2.970670109886353e-06, + "loss": 1.4395, + "step": 17975 + }, + { + "epoch": 0.93, + "grad_norm": 0.53515625, + "learning_rate": 2.94886286024586e-06, + "loss": 1.4237, + "step": 17980 + }, + { + "epoch": 0.93, + "grad_norm": 0.5859375, + "learning_rate": 2.927134749774585e-06, + "loss": 1.4126, + "step": 17985 + }, + { + "epoch": 0.93, + "grad_norm": 0.5, + "learning_rate": 2.9054857961904216e-06, + "loss": 1.3602, + "step": 17990 + }, + { + "epoch": 0.93, + "grad_norm": 0.52734375, + "learning_rate": 2.8839160171467485e-06, + "loss": 1.4304, + "step": 17995 + }, + { + "epoch": 0.93, + "grad_norm": 0.53515625, + "learning_rate": 2.862425430232385e-06, + "loss": 1.4215, + "step": 18000 + }, + { + "epoch": 0.93, + "grad_norm": 0.54296875, + "learning_rate": 2.84101405297158e-06, + "loss": 1.3593, + "step": 18005 + }, + { + "epoch": 0.93, + "grad_norm": 0.5234375, + "learning_rate": 2.8196819028239565e-06, + "loss": 1.4182, + "step": 18010 + }, + { + "epoch": 0.93, + "grad_norm": 0.54296875, + "learning_rate": 2.7984289971845657e-06, + "loss": 1.4155, + "step": 18015 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 2.777255353383845e-06, + "loss": 1.3748, + "step": 18020 + }, + { + "epoch": 0.93, + "grad_norm": 0.515625, + "learning_rate": 2.756160988687573e-06, + "loss": 1.3857, + "step": 18025 + }, + { + "epoch": 0.93, + "grad_norm": 0.53125, + "learning_rate": 2.735145920296889e-06, + "loss": 1.4111, + "step": 18030 + }, + { + "epoch": 0.93, + "grad_norm": 0.55078125, + "learning_rate": 2.7142101653482852e-06, + "loss": 1.4205, + "step": 18035 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 2.693353740913562e-06, + "loss": 1.4234, + "step": 18040 + }, + { + "epoch": 0.93, + "grad_norm": 0.56640625, + "learning_rate": 2.6725766639998485e-06, + "loss": 1.4098, + "step": 18045 + }, + { + "epoch": 0.93, + "grad_norm": 0.5234375, + "learning_rate": 2.651878951549536e-06, + "loss": 1.3929, + "step": 18050 + }, + { + "epoch": 0.93, + "grad_norm": 0.546875, + "learning_rate": 2.6312606204403343e-06, + "loss": 1.4238, + "step": 18055 + }, + { + "epoch": 0.93, + "grad_norm": 0.5234375, + "learning_rate": 2.610721687485207e-06, + "loss": 1.3936, + "step": 18060 + }, + { + "epoch": 0.93, + "grad_norm": 0.54296875, + "learning_rate": 2.5902621694324005e-06, + "loss": 1.3911, + "step": 18065 + }, + { + "epoch": 0.93, + "grad_norm": 0.55078125, + "learning_rate": 2.569882082965358e-06, + "loss": 1.4094, + "step": 18070 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 2.5495814447027643e-06, + "loss": 1.3685, + "step": 18075 + }, + { + "epoch": 0.94, + "grad_norm": 0.515625, + "learning_rate": 2.5293602711985444e-06, + "loss": 1.3835, + "step": 18080 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 2.5092185789418078e-06, + "loss": 1.3872, + "step": 18085 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 2.48915638435685e-06, + "loss": 1.3883, + "step": 18090 + }, + { + "epoch": 0.94, + "grad_norm": 0.55078125, + "learning_rate": 2.469173703803129e-06, + "loss": 1.4209, + "step": 18095 + }, + { + "epoch": 0.94, + "grad_norm": 0.546875, + "learning_rate": 2.4492705535753003e-06, + "loss": 1.389, + "step": 18100 + }, + { + "epoch": 0.94, + "grad_norm": 0.5546875, + "learning_rate": 2.429446949903147e-06, + "loss": 1.4392, + "step": 18105 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 2.409702908951561e-06, + "loss": 1.3789, + "step": 18110 + }, + { + "epoch": 0.94, + "grad_norm": 0.498046875, + "learning_rate": 2.3900384468205974e-06, + "loss": 1.3633, + "step": 18115 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 2.3704535795454065e-06, + "loss": 1.3901, + "step": 18120 + }, + { + "epoch": 0.94, + "grad_norm": 0.55859375, + "learning_rate": 2.350948323096214e-06, + "loss": 1.4299, + "step": 18125 + }, + { + "epoch": 0.94, + "grad_norm": 0.546875, + "learning_rate": 2.331522693378374e-06, + "loss": 1.3986, + "step": 18130 + }, + { + "epoch": 0.94, + "grad_norm": 0.55859375, + "learning_rate": 2.3121767062322387e-06, + "loss": 1.4388, + "step": 18135 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 2.2929103774332882e-06, + "loss": 1.3955, + "step": 18140 + }, + { + "epoch": 0.94, + "grad_norm": 0.51953125, + "learning_rate": 2.2737237226920003e-06, + "loss": 1.4058, + "step": 18145 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 2.2546167576539155e-06, + "loss": 1.4167, + "step": 18150 + }, + { + "epoch": 0.94, + "grad_norm": 0.515625, + "learning_rate": 2.2355894978995593e-06, + "loss": 1.4242, + "step": 18155 + }, + { + "epoch": 0.94, + "grad_norm": 0.5546875, + "learning_rate": 2.2166419589444875e-06, + "loss": 1.4274, + "step": 18160 + }, + { + "epoch": 0.94, + "grad_norm": 0.5390625, + "learning_rate": 2.1977741562392294e-06, + "loss": 1.414, + "step": 18165 + }, + { + "epoch": 0.94, + "grad_norm": 0.5546875, + "learning_rate": 2.178986105169334e-06, + "loss": 1.4187, + "step": 18170 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 2.1602778210552564e-06, + "loss": 1.3972, + "step": 18175 + }, + { + "epoch": 0.94, + "grad_norm": 0.546875, + "learning_rate": 2.141649319152461e-06, + "loss": 1.3914, + "step": 18180 + }, + { + "epoch": 0.94, + "grad_norm": 0.53515625, + "learning_rate": 2.1231006146513187e-06, + "loss": 1.4213, + "step": 18185 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 2.1046317226771417e-06, + "loss": 1.4014, + "step": 18190 + }, + { + "epoch": 0.94, + "grad_norm": 0.53125, + "learning_rate": 2.086242658290194e-06, + "loss": 1.4132, + "step": 18195 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 2.0679334364855806e-06, + "loss": 1.3834, + "step": 18200 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 2.049704072193337e-06, + "loss": 1.4112, + "step": 18205 + }, + { + "epoch": 0.94, + "grad_norm": 0.5390625, + "learning_rate": 2.031554580278394e-06, + "loss": 1.4272, + "step": 18210 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 2.013484975540536e-06, + "loss": 1.4039, + "step": 18215 + }, + { + "epoch": 0.94, + "grad_norm": 0.5859375, + "learning_rate": 1.995495272714376e-06, + "loss": 1.406, + "step": 18220 + }, + { + "epoch": 0.94, + "grad_norm": 0.51171875, + "learning_rate": 1.9775854864694134e-06, + "loss": 1.3711, + "step": 18225 + }, + { + "epoch": 0.94, + "grad_norm": 0.53515625, + "learning_rate": 1.959755631409976e-06, + "loss": 1.4097, + "step": 18230 + }, + { + "epoch": 0.94, + "grad_norm": 0.58984375, + "learning_rate": 1.9420057220751907e-06, + "loss": 1.4077, + "step": 18235 + }, + { + "epoch": 0.94, + "grad_norm": 0.64453125, + "learning_rate": 1.924335772939012e-06, + "loss": 1.3996, + "step": 18240 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 1.90674579841017e-06, + "loss": 1.4285, + "step": 18245 + }, + { + "epoch": 0.94, + "grad_norm": 0.57421875, + "learning_rate": 1.8892358128322018e-06, + "loss": 1.43, + "step": 18250 + }, + { + "epoch": 0.94, + "grad_norm": 0.55078125, + "learning_rate": 1.8718058304834307e-06, + "loss": 1.4526, + "step": 18255 + }, + { + "epoch": 0.94, + "grad_norm": 0.5234375, + "learning_rate": 1.8544558655768983e-06, + "loss": 1.4166, + "step": 18260 + }, + { + "epoch": 0.94, + "grad_norm": 0.5546875, + "learning_rate": 1.8371859322604323e-06, + "loss": 1.4201, + "step": 18265 + }, + { + "epoch": 0.95, + "grad_norm": 0.55859375, + "learning_rate": 1.8199960446165898e-06, + "loss": 1.3988, + "step": 18270 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.8028862166626691e-06, + "loss": 1.3912, + "step": 18275 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 1.7858564623506547e-06, + "loss": 1.3984, + "step": 18280 + }, + { + "epoch": 0.95, + "grad_norm": 0.54296875, + "learning_rate": 1.768906795567249e-06, + "loss": 1.4118, + "step": 18285 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 1.7520372301338516e-06, + "loss": 1.37, + "step": 18290 + }, + { + "epoch": 0.95, + "grad_norm": 0.5625, + "learning_rate": 1.7352477798065703e-06, + "loss": 1.3932, + "step": 18295 + }, + { + "epoch": 0.95, + "grad_norm": 0.55859375, + "learning_rate": 1.718538458276131e-06, + "loss": 1.4033, + "step": 18300 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 1.701909279167946e-06, + "loss": 1.402, + "step": 18305 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 1.6853602560421012e-06, + "loss": 1.4002, + "step": 18310 + }, + { + "epoch": 0.95, + "grad_norm": 0.51953125, + "learning_rate": 1.6688914023932801e-06, + "loss": 1.4128, + "step": 18315 + }, + { + "epoch": 0.95, + "grad_norm": 0.55078125, + "learning_rate": 1.6525027316507957e-06, + "loss": 1.3474, + "step": 18320 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390625, + "learning_rate": 1.6361942571786138e-06, + "loss": 1.3617, + "step": 18325 + }, + { + "epoch": 0.95, + "grad_norm": 0.55078125, + "learning_rate": 1.619965992275274e-06, + "loss": 1.4078, + "step": 18330 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.6038179501739138e-06, + "loss": 1.4057, + "step": 18335 + }, + { + "epoch": 0.95, + "grad_norm": 0.515625, + "learning_rate": 1.587750144042266e-06, + "loss": 1.3938, + "step": 18340 + }, + { + "epoch": 0.95, + "grad_norm": 0.53515625, + "learning_rate": 1.5717625869826168e-06, + "loss": 1.3701, + "step": 18345 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 1.555855292031827e-06, + "loss": 1.4177, + "step": 18350 + }, + { + "epoch": 0.95, + "grad_norm": 0.53515625, + "learning_rate": 1.540028272161309e-06, + "loss": 1.4019, + "step": 18355 + }, + { + "epoch": 0.95, + "grad_norm": 0.51953125, + "learning_rate": 1.5242815402770172e-06, + "loss": 1.3823, + "step": 18360 + }, + { + "epoch": 0.95, + "grad_norm": 0.55078125, + "learning_rate": 1.5086151092194356e-06, + "loss": 1.4226, + "step": 18365 + }, + { + "epoch": 0.95, + "grad_norm": 0.5703125, + "learning_rate": 1.4930289917635453e-06, + "loss": 1.3835, + "step": 18370 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.4775232006188799e-06, + "loss": 1.3652, + "step": 18375 + }, + { + "epoch": 0.95, + "grad_norm": 0.54296875, + "learning_rate": 1.4620977484294362e-06, + "loss": 1.4092, + "step": 18380 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390625, + "learning_rate": 1.4467526477737082e-06, + "loss": 1.4041, + "step": 18385 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 1.4314879111646861e-06, + "loss": 1.4034, + "step": 18390 + }, + { + "epoch": 0.95, + "grad_norm": 0.51953125, + "learning_rate": 1.4163035510498023e-06, + "loss": 1.4054, + "step": 18395 + }, + { + "epoch": 0.95, + "grad_norm": 0.53515625, + "learning_rate": 1.4011995798109522e-06, + "loss": 1.3963, + "step": 18400 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390625, + "learning_rate": 1.386176009764506e-06, + "loss": 1.4419, + "step": 18405 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390625, + "learning_rate": 1.3712328531612306e-06, + "loss": 1.414, + "step": 18410 + }, + { + "epoch": 0.95, + "grad_norm": 0.53515625, + "learning_rate": 1.3563701221863567e-06, + "loss": 1.3657, + "step": 18415 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.3415878289595008e-06, + "loss": 1.4276, + "step": 18420 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.3268859855347093e-06, + "loss": 1.3805, + "step": 18425 + }, + { + "epoch": 0.95, + "grad_norm": 0.54296875, + "learning_rate": 1.312264603900437e-06, + "loss": 1.4341, + "step": 18430 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234375, + "learning_rate": 1.2977236959795025e-06, + "loss": 1.3819, + "step": 18435 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 1.2832632736290983e-06, + "loss": 1.4432, + "step": 18440 + }, + { + "epoch": 0.95, + "grad_norm": 0.54296875, + "learning_rate": 1.2688833486408257e-06, + "loss": 1.4064, + "step": 18445 + }, + { + "epoch": 0.95, + "grad_norm": 0.546875, + "learning_rate": 1.254583932740594e-06, + "loss": 1.4189, + "step": 18450 + }, + { + "epoch": 0.95, + "grad_norm": 0.52734375, + "learning_rate": 1.2403650375887088e-06, + "loss": 1.3865, + "step": 18455 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 1.2262266747797847e-06, + "loss": 1.4215, + "step": 18460 + }, + { + "epoch": 0.96, + "grad_norm": 0.5625, + "learning_rate": 1.2121688558427768e-06, + "loss": 1.3978, + "step": 18465 + }, + { + "epoch": 0.96, + "grad_norm": 0.52734375, + "learning_rate": 1.1981915922409603e-06, + "loss": 1.4087, + "step": 18470 + }, + { + "epoch": 0.96, + "grad_norm": 0.55859375, + "learning_rate": 1.1842948953719403e-06, + "loss": 1.3789, + "step": 18475 + }, + { + "epoch": 0.96, + "grad_norm": 0.5234375, + "learning_rate": 1.1704787765675963e-06, + "loss": 1.3762, + "step": 18480 + }, + { + "epoch": 0.96, + "grad_norm": 0.5390625, + "learning_rate": 1.1567432470941163e-06, + "loss": 1.4136, + "step": 18485 + }, + { + "epoch": 0.96, + "grad_norm": 0.546875, + "learning_rate": 1.1430883181519635e-06, + "loss": 1.4364, + "step": 18490 + }, + { + "epoch": 0.96, + "grad_norm": 0.51953125, + "learning_rate": 1.1295140008758864e-06, + "loss": 1.4176, + "step": 18495 + }, + { + "epoch": 0.96, + "grad_norm": 0.5390625, + "learning_rate": 1.1160203063349195e-06, + "loss": 1.4348, + "step": 18500 + }, + { + "epoch": 0.96, + "grad_norm": 0.56640625, + "learning_rate": 1.1026072455322945e-06, + "loss": 1.4194, + "step": 18505 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 1.089274829405562e-06, + "loss": 1.3916, + "step": 18510 + }, + { + "epoch": 0.96, + "grad_norm": 0.5234375, + "learning_rate": 1.0760230688264593e-06, + "loss": 1.4221, + "step": 18515 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 1.0628519746009757e-06, + "loss": 1.3982, + "step": 18520 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 1.0497615574693309e-06, + "loss": 1.4214, + "step": 18525 + }, + { + "epoch": 0.96, + "grad_norm": 0.5234375, + "learning_rate": 1.0367518281059307e-06, + "loss": 1.4061, + "step": 18530 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 1.0238227971194004e-06, + "loss": 1.3746, + "step": 18535 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 1.010974475052584e-06, + "loss": 1.4275, + "step": 18540 + }, + { + "epoch": 0.96, + "grad_norm": 0.546875, + "learning_rate": 9.982068723824677e-07, + "loss": 1.3601, + "step": 18545 + }, + { + "epoch": 0.96, + "grad_norm": 0.546875, + "learning_rate": 9.855199995202457e-07, + "loss": 1.3907, + "step": 18550 + }, + { + "epoch": 0.96, + "grad_norm": 0.5546875, + "learning_rate": 9.729138668112648e-07, + "loss": 1.4076, + "step": 18555 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 9.603884845350575e-07, + "loss": 1.4104, + "step": 18560 + }, + { + "epoch": 0.96, + "grad_norm": 0.5703125, + "learning_rate": 9.479438629052873e-07, + "loss": 1.3915, + "step": 18565 + }, + { + "epoch": 0.96, + "grad_norm": 0.5703125, + "learning_rate": 9.3558001206977e-07, + "loss": 1.4368, + "step": 18570 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 9.232969421104521e-07, + "loss": 1.4154, + "step": 18575 + }, + { + "epoch": 0.96, + "grad_norm": 0.54296875, + "learning_rate": 9.110946630434214e-07, + "loss": 1.4285, + "step": 18580 + }, + { + "epoch": 0.96, + "grad_norm": 0.54296875, + "learning_rate": 8.989731848188743e-07, + "loss": 1.4052, + "step": 18585 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 8.869325173211262e-07, + "loss": 1.3765, + "step": 18590 + }, + { + "epoch": 0.96, + "grad_norm": 0.55859375, + "learning_rate": 8.749726703685901e-07, + "loss": 1.4041, + "step": 18595 + }, + { + "epoch": 0.96, + "grad_norm": 0.57421875, + "learning_rate": 8.630936537137757e-07, + "loss": 1.3355, + "step": 18600 + }, + { + "epoch": 0.96, + "grad_norm": 0.515625, + "learning_rate": 8.51295477043279e-07, + "loss": 1.4103, + "step": 18605 + }, + { + "epoch": 0.96, + "grad_norm": 0.5234375, + "learning_rate": 8.395781499777932e-07, + "loss": 1.3912, + "step": 18610 + }, + { + "epoch": 0.96, + "grad_norm": 0.5234375, + "learning_rate": 8.279416820720531e-07, + "loss": 1.41, + "step": 18615 + }, + { + "epoch": 0.96, + "grad_norm": 0.54296875, + "learning_rate": 8.163860828148906e-07, + "loss": 1.3804, + "step": 18620 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 8.049113616291793e-07, + "loss": 1.4257, + "step": 18625 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 7.935175278718232e-07, + "loss": 1.3851, + "step": 18630 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 7.822045908337905e-07, + "loss": 1.3834, + "step": 18635 + }, + { + "epoch": 0.96, + "grad_norm": 0.51953125, + "learning_rate": 7.709725597400908e-07, + "loss": 1.4218, + "step": 18640 + }, + { + "epoch": 0.96, + "grad_norm": 0.53125, + "learning_rate": 7.598214437497531e-07, + "loss": 1.3998, + "step": 18645 + }, + { + "epoch": 0.96, + "grad_norm": 0.52734375, + "learning_rate": 7.487512519557815e-07, + "loss": 1.3893, + "step": 18650 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 7.377619933852664e-07, + "loss": 1.3919, + "step": 18655 + }, + { + "epoch": 0.97, + "grad_norm": 0.56640625, + "learning_rate": 7.268536769992507e-07, + "loss": 1.408, + "step": 18660 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 7.160263116927968e-07, + "loss": 1.388, + "step": 18665 + }, + { + "epoch": 0.97, + "grad_norm": 0.5546875, + "learning_rate": 7.052799062949312e-07, + "loss": 1.4445, + "step": 18670 + }, + { + "epoch": 0.97, + "grad_norm": 0.58984375, + "learning_rate": 6.946144695686885e-07, + "loss": 1.3965, + "step": 18675 + }, + { + "epoch": 0.97, + "grad_norm": 0.5546875, + "learning_rate": 6.840300102110785e-07, + "loss": 1.4341, + "step": 18680 + }, + { + "epoch": 0.97, + "grad_norm": 0.57421875, + "learning_rate": 6.735265368530641e-07, + "loss": 1.3783, + "step": 18685 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 6.631040580595605e-07, + "loss": 1.3876, + "step": 18690 + }, + { + "epoch": 0.97, + "grad_norm": 0.5234375, + "learning_rate": 6.527625823294692e-07, + "loss": 1.3976, + "step": 18695 + }, + { + "epoch": 0.97, + "grad_norm": 0.53125, + "learning_rate": 6.425021180956114e-07, + "loss": 1.3861, + "step": 18700 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 6.323226737247723e-07, + "loss": 1.4201, + "step": 18705 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 6.222242575176341e-07, + "loss": 1.4344, + "step": 18710 + }, + { + "epoch": 0.97, + "grad_norm": 0.53125, + "learning_rate": 6.122068777088319e-07, + "loss": 1.433, + "step": 18715 + }, + { + "epoch": 0.97, + "grad_norm": 0.58203125, + "learning_rate": 6.022705424669317e-07, + "loss": 1.4217, + "step": 18720 + }, + { + "epoch": 0.97, + "grad_norm": 0.5625, + "learning_rate": 5.924152598943966e-07, + "loss": 1.4262, + "step": 18725 + }, + { + "epoch": 0.97, + "grad_norm": 0.55859375, + "learning_rate": 5.826410380275759e-07, + "loss": 1.3997, + "step": 18730 + }, + { + "epoch": 0.97, + "grad_norm": 0.56640625, + "learning_rate": 5.729478848367609e-07, + "loss": 1.4183, + "step": 18735 + }, + { + "epoch": 0.97, + "grad_norm": 0.54296875, + "learning_rate": 5.633358082260954e-07, + "loss": 1.3599, + "step": 18740 + }, + { + "epoch": 0.97, + "grad_norm": 0.51953125, + "learning_rate": 5.53804816033654e-07, + "loss": 1.408, + "step": 18745 + }, + { + "epoch": 0.97, + "grad_norm": 0.55859375, + "learning_rate": 5.443549160313421e-07, + "loss": 1.404, + "step": 18750 + }, + { + "epoch": 0.97, + "grad_norm": 0.5546875, + "learning_rate": 5.349861159249959e-07, + "loss": 1.4162, + "step": 18755 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 5.256984233542595e-07, + "loss": 1.3979, + "step": 18760 + }, + { + "epoch": 0.97, + "grad_norm": 0.5078125, + "learning_rate": 5.16491845892697e-07, + "loss": 1.3693, + "step": 18765 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 5.073663910476811e-07, + "loss": 1.3507, + "step": 18770 + }, + { + "epoch": 0.97, + "grad_norm": 0.515625, + "learning_rate": 4.983220662604482e-07, + "loss": 1.4163, + "step": 18775 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 4.893588789060988e-07, + "loss": 1.393, + "step": 18780 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 4.80476836293542e-07, + "loss": 1.4092, + "step": 18785 + }, + { + "epoch": 0.97, + "grad_norm": 0.5234375, + "learning_rate": 4.7167594566555064e-07, + "loss": 1.3873, + "step": 18790 + }, + { + "epoch": 0.97, + "grad_norm": 0.53125, + "learning_rate": 4.6295621419868427e-07, + "loss": 1.401, + "step": 18795 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 4.5431764900334404e-07, + "loss": 1.4024, + "step": 18800 + }, + { + "epoch": 0.97, + "grad_norm": 0.546875, + "learning_rate": 4.457602571237507e-07, + "loss": 1.3943, + "step": 18805 + }, + { + "epoch": 0.97, + "grad_norm": 0.5546875, + "learning_rate": 4.3728404553793344e-07, + "loss": 1.395, + "step": 18810 + }, + { + "epoch": 0.97, + "grad_norm": 0.5234375, + "learning_rate": 4.288890211576857e-07, + "loss": 1.3996, + "step": 18815 + }, + { + "epoch": 0.97, + "grad_norm": 0.53125, + "learning_rate": 4.205751908286537e-07, + "loss": 1.4113, + "step": 18820 + }, + { + "epoch": 0.97, + "grad_norm": 0.51953125, + "learning_rate": 4.1234256133024785e-07, + "loss": 1.4166, + "step": 18825 + }, + { + "epoch": 0.97, + "grad_norm": 0.52734375, + "learning_rate": 4.0419113937566475e-07, + "loss": 1.4103, + "step": 18830 + }, + { + "epoch": 0.97, + "grad_norm": 0.5390625, + "learning_rate": 3.961209316118653e-07, + "loss": 1.3753, + "step": 18835 + }, + { + "epoch": 0.97, + "grad_norm": 0.5546875, + "learning_rate": 3.8813194461961856e-07, + "loss": 1.4105, + "step": 18840 + }, + { + "epoch": 0.97, + "grad_norm": 0.53515625, + "learning_rate": 3.8022418491344693e-07, + "loss": 1.4321, + "step": 18845 + }, + { + "epoch": 0.98, + "grad_norm": 0.53515625, + "learning_rate": 3.723976589416256e-07, + "loss": 1.4311, + "step": 18850 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 3.6465237308621615e-07, + "loss": 1.4195, + "step": 18855 + }, + { + "epoch": 0.98, + "grad_norm": 0.54296875, + "learning_rate": 3.5698833366299975e-07, + "loss": 1.4145, + "step": 18860 + }, + { + "epoch": 0.98, + "grad_norm": 0.55859375, + "learning_rate": 3.4940554692154405e-07, + "loss": 1.3957, + "step": 18865 + }, + { + "epoch": 0.98, + "grad_norm": 0.51171875, + "learning_rate": 3.41904019045125e-07, + "loss": 1.4162, + "step": 18870 + }, + { + "epoch": 0.98, + "grad_norm": 0.5234375, + "learning_rate": 3.344837561507719e-07, + "loss": 1.3873, + "step": 18875 + }, + { + "epoch": 0.98, + "grad_norm": 0.54296875, + "learning_rate": 3.2714476428925553e-07, + "loss": 1.4291, + "step": 18880 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625, + "learning_rate": 3.1988704944506677e-07, + "loss": 1.4272, + "step": 18885 + }, + { + "epoch": 0.98, + "grad_norm": 0.55078125, + "learning_rate": 3.127106175364158e-07, + "loss": 1.4175, + "step": 18890 + }, + { + "epoch": 0.98, + "grad_norm": 0.53125, + "learning_rate": 3.05615474415244e-07, + "loss": 1.422, + "step": 18895 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 2.9860162586718974e-07, + "loss": 1.4079, + "step": 18900 + }, + { + "epoch": 0.98, + "grad_norm": 0.55078125, + "learning_rate": 2.9166907761162264e-07, + "loss": 1.4483, + "step": 18905 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 2.8481783530159843e-07, + "loss": 1.4063, + "step": 18910 + }, + { + "epoch": 0.98, + "grad_norm": 0.54296875, + "learning_rate": 2.780479045238704e-07, + "loss": 1.4074, + "step": 18915 + }, + { + "epoch": 0.98, + "grad_norm": 0.54296875, + "learning_rate": 2.7135929079891156e-07, + "loss": 1.4116, + "step": 18920 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 2.6475199958085897e-07, + "loss": 1.3984, + "step": 18925 + }, + { + "epoch": 0.98, + "grad_norm": 0.546875, + "learning_rate": 2.582260362575584e-07, + "loss": 1.4304, + "step": 18930 + }, + { + "epoch": 0.98, + "grad_norm": 0.51953125, + "learning_rate": 2.5178140615051973e-07, + "loss": 1.404, + "step": 18935 + }, + { + "epoch": 0.98, + "grad_norm": 0.51171875, + "learning_rate": 2.4541811451493925e-07, + "loss": 1.4332, + "step": 18940 + }, + { + "epoch": 0.98, + "grad_norm": 0.53125, + "learning_rate": 2.391361665396885e-07, + "loss": 1.4137, + "step": 18945 + }, + { + "epoch": 0.98, + "grad_norm": 0.5703125, + "learning_rate": 2.3293556734730326e-07, + "loss": 1.4516, + "step": 18950 + }, + { + "epoch": 0.98, + "grad_norm": 0.53515625, + "learning_rate": 2.268163219939945e-07, + "loss": 1.4191, + "step": 18955 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 2.2077843546960408e-07, + "loss": 1.3984, + "step": 18960 + }, + { + "epoch": 0.98, + "grad_norm": 0.56640625, + "learning_rate": 2.1482191269768247e-07, + "loss": 1.4172, + "step": 18965 + }, + { + "epoch": 0.98, + "grad_norm": 0.55078125, + "learning_rate": 2.089467585353777e-07, + "loss": 1.3938, + "step": 18970 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390625, + "learning_rate": 2.0315297777353525e-07, + "loss": 1.4371, + "step": 18975 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 1.9744057513660928e-07, + "loss": 1.4102, + "step": 18980 + }, + { + "epoch": 0.98, + "grad_norm": 0.5234375, + "learning_rate": 1.9180955528270706e-07, + "loss": 1.4085, + "step": 18985 + }, + { + "epoch": 0.98, + "grad_norm": 0.546875, + "learning_rate": 1.8625992280357773e-07, + "loss": 1.3943, + "step": 18990 + }, + { + "epoch": 0.98, + "grad_norm": 0.52734375, + "learning_rate": 1.8079168222461252e-07, + "loss": 1.4169, + "step": 18995 + }, + { + "epoch": 0.98, + "grad_norm": 0.53125, + "learning_rate": 1.7540483800481122e-07, + "loss": 1.379, + "step": 19000 + }, + { + "epoch": 0.98, + "grad_norm": 0.51171875, + "learning_rate": 1.7009939453680456e-07, + "loss": 1.3613, + "step": 19005 + }, + { + "epoch": 0.98, + "grad_norm": 0.53125, + "learning_rate": 1.6487535614687633e-07, + "loss": 1.4412, + "step": 19010 + }, + { + "epoch": 0.98, + "grad_norm": 0.5546875, + "learning_rate": 1.5973272709487453e-07, + "loss": 1.4065, + "step": 19015 + }, + { + "epoch": 0.98, + "grad_norm": 0.49609375, + "learning_rate": 1.5467151157431136e-07, + "loss": 1.3788, + "step": 19020 + }, + { + "epoch": 0.98, + "grad_norm": 0.5703125, + "learning_rate": 1.4969171371228552e-07, + "loss": 1.4031, + "step": 19025 + }, + { + "epoch": 0.98, + "grad_norm": 0.53125, + "learning_rate": 1.447933375695265e-07, + "loss": 1.376, + "step": 19030 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625, + "learning_rate": 1.3997638714033923e-07, + "loss": 1.3969, + "step": 19035 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 1.3524086635265942e-07, + "loss": 1.3911, + "step": 19040 + }, + { + "epoch": 0.99, + "grad_norm": 0.54296875, + "learning_rate": 1.305867790679982e-07, + "loss": 1.4282, + "step": 19045 + }, + { + "epoch": 0.99, + "grad_norm": 0.55078125, + "learning_rate": 1.2601412908147536e-07, + "loss": 1.4233, + "step": 19050 + }, + { + "epoch": 0.99, + "grad_norm": 0.5390625, + "learning_rate": 1.2152292012181932e-07, + "loss": 1.4246, + "step": 19055 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 1.1711315585131166e-07, + "loss": 1.4038, + "step": 19060 + }, + { + "epoch": 0.99, + "grad_norm": 0.50390625, + "learning_rate": 1.1278483986586486e-07, + "loss": 1.3726, + "step": 19065 + }, + { + "epoch": 0.99, + "grad_norm": 0.55078125, + "learning_rate": 1.085379756949223e-07, + "loss": 1.4142, + "step": 19070 + }, + { + "epoch": 0.99, + "grad_norm": 0.515625, + "learning_rate": 1.0437256680155827e-07, + "loss": 1.3689, + "step": 19075 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 1.0028861658238909e-07, + "loss": 1.4363, + "step": 19080 + }, + { + "epoch": 0.99, + "grad_norm": 0.515625, + "learning_rate": 9.628612836763973e-08, + "loss": 1.4179, + "step": 19085 + }, + { + "epoch": 0.99, + "grad_norm": 0.55078125, + "learning_rate": 9.236510542107723e-08, + "loss": 1.3981, + "step": 19090 + }, + { + "epoch": 0.99, + "grad_norm": 0.55859375, + "learning_rate": 8.85255509400662e-08, + "loss": 1.4081, + "step": 19095 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 8.476746805550218e-08, + "loss": 1.4231, + "step": 19100 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 8.109085983188936e-08, + "loss": 1.3895, + "step": 19105 + }, + { + "epoch": 0.99, + "grad_norm": 0.55078125, + "learning_rate": 7.74957292672629e-08, + "loss": 1.4086, + "step": 19110 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 7.39820792932333e-08, + "loss": 1.3892, + "step": 19115 + }, + { + "epoch": 0.99, + "grad_norm": 0.51171875, + "learning_rate": 7.054991277496425e-08, + "loss": 1.4197, + "step": 19120 + }, + { + "epoch": 0.99, + "grad_norm": 0.5625, + "learning_rate": 6.719923251116145e-08, + "loss": 1.4062, + "step": 19125 + }, + { + "epoch": 0.99, + "grad_norm": 0.546875, + "learning_rate": 6.393004123411706e-08, + "loss": 1.424, + "step": 19130 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 6.074234160963199e-08, + "loss": 1.3763, + "step": 19135 + }, + { + "epoch": 0.99, + "grad_norm": 0.56640625, + "learning_rate": 5.763613623709363e-08, + "loss": 1.4039, + "step": 19140 + }, + { + "epoch": 0.99, + "grad_norm": 0.5546875, + "learning_rate": 5.461142764940919e-08, + "loss": 1.4028, + "step": 19145 + }, + { + "epoch": 0.99, + "grad_norm": 0.54296875, + "learning_rate": 5.166821831305013e-08, + "loss": 1.3935, + "step": 19150 + }, + { + "epoch": 0.99, + "grad_norm": 0.55859375, + "learning_rate": 4.880651062800779e-08, + "loss": 1.4228, + "step": 19155 + }, + { + "epoch": 0.99, + "grad_norm": 0.55859375, + "learning_rate": 4.6026306927848814e-08, + "loss": 1.4005, + "step": 19160 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 4.332760947962644e-08, + "loss": 1.4044, + "step": 19165 + }, + { + "epoch": 0.99, + "grad_norm": 0.56640625, + "learning_rate": 4.0710420483980326e-08, + "loss": 1.4302, + "step": 19170 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 3.817474207505889e-08, + "loss": 1.3764, + "step": 19175 + }, + { + "epoch": 0.99, + "grad_norm": 0.5546875, + "learning_rate": 3.572057632055259e-08, + "loss": 1.4099, + "step": 19180 + }, + { + "epoch": 0.99, + "grad_norm": 0.515625, + "learning_rate": 3.3347925221682844e-08, + "loss": 1.3482, + "step": 19185 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 3.1056790713202e-08, + "loss": 1.4299, + "step": 19190 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 2.884717466338227e-08, + "loss": 1.4004, + "step": 19195 + }, + { + "epoch": 0.99, + "grad_norm": 0.54296875, + "learning_rate": 2.6719078874026803e-08, + "loss": 1.4316, + "step": 19200 + }, + { + "epoch": 0.99, + "grad_norm": 0.53515625, + "learning_rate": 2.4672505080458597e-08, + "loss": 1.3985, + "step": 19205 + }, + { + "epoch": 0.99, + "grad_norm": 0.52734375, + "learning_rate": 2.2707454951553797e-08, + "loss": 1.4001, + "step": 19210 + }, + { + "epoch": 0.99, + "grad_norm": 0.5078125, + "learning_rate": 2.082393008966399e-08, + "loss": 1.4053, + "step": 19215 + }, + { + "epoch": 0.99, + "grad_norm": 0.5234375, + "learning_rate": 1.9021932030705015e-08, + "loss": 1.4332, + "step": 19220 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 1.7301462244079246e-08, + "loss": 1.4338, + "step": 19225 + }, + { + "epoch": 0.99, + "grad_norm": 0.5234375, + "learning_rate": 1.5662522132742218e-08, + "loss": 1.3848, + "step": 19230 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859375, + "learning_rate": 1.4105113033124895e-08, + "loss": 1.4195, + "step": 19235 + }, + { + "epoch": 1.0, + "grad_norm": 0.5546875, + "learning_rate": 1.2629236215211393e-08, + "loss": 1.3918, + "step": 19240 + }, + { + "epoch": 1.0, + "grad_norm": 0.53125, + "learning_rate": 1.1234892882494574e-08, + "loss": 1.4252, + "step": 19245 + }, + { + "epoch": 1.0, + "grad_norm": 0.55078125, + "learning_rate": 9.922084171953839e-09, + "loss": 1.4026, + "step": 19250 + }, + { + "epoch": 1.0, + "grad_norm": 0.55859375, + "learning_rate": 8.690811154121737e-09, + "loss": 1.3924, + "step": 19255 + }, + { + "epoch": 1.0, + "grad_norm": 0.54296875, + "learning_rate": 7.541074833006257e-09, + "loss": 1.3749, + "step": 19260 + }, + { + "epoch": 1.0, + "grad_norm": 0.55859375, + "learning_rate": 6.472876146168538e-09, + "loss": 1.4085, + "step": 19265 + }, + { + "epoch": 1.0, + "grad_norm": 0.53125, + "learning_rate": 5.486215964645158e-09, + "loss": 1.442, + "step": 19270 + }, + { + "epoch": 1.0, + "grad_norm": 0.54296875, + "learning_rate": 4.581095092992538e-09, + "loss": 1.4407, + "step": 19275 + }, + { + "epoch": 1.0, + "grad_norm": 0.5390625, + "learning_rate": 3.757514269286944e-09, + "loss": 1.4129, + "step": 19280 + }, + { + "epoch": 1.0, + "grad_norm": 0.53515625, + "learning_rate": 3.0154741651022833e-09, + "loss": 1.4254, + "step": 19285 + }, + { + "epoch": 1.0, + "grad_norm": 0.53125, + "learning_rate": 2.354975385543412e-09, + "loss": 1.413, + "step": 19290 + }, + { + "epoch": 1.0, + "grad_norm": 0.52734375, + "learning_rate": 1.776018469179519e-09, + "loss": 1.397, + "step": 19295 + }, + { + "epoch": 1.0, + "grad_norm": 0.5234375, + "learning_rate": 1.2786038881329455e-09, + "loss": 1.4139, + "step": 19300 + }, + { + "epoch": 1.0, + "grad_norm": 0.515625, + "learning_rate": 8.627320480125711e-10, + "loss": 1.4221, + "step": 19305 + }, + { + "epoch": 1.0, + "grad_norm": 0.53125, + "learning_rate": 5.284032879249168e-10, + "loss": 1.3758, + "step": 19310 + }, + { + "epoch": 1.0, + "grad_norm": 0.515625, + "learning_rate": 2.7561788049634827e-10, + "loss": 1.3815, + "step": 19315 + }, + { + "epoch": 1.0, + "grad_norm": 0.56640625, + "learning_rate": 1.0437603187307688e-10, + "loss": 1.4298, + "step": 19320 + }, + { + "epoch": 1.0, + "grad_norm": 0.51953125, + "learning_rate": 1.4677881676750105e-11, + "loss": 1.3775, + "step": 19325 + }, + { + "epoch": 1.0, + "eval_loss": 1.4011043310165405, + "eval_runtime": 25105.0488, + "eval_samples_per_second": 6.821, + "eval_steps_per_second": 1.705, + "step": 19328 + }, + { + "epoch": 1.0, + "step": 19328, + "total_flos": 1.3590087469130318e+19, + "train_loss": 1.456397372168421, + "train_runtime": 104474.4259, + "train_samples_per_second": 1.48, + "train_steps_per_second": 0.185 + } + ], + "logging_steps": 5, + "max_steps": 19328, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.3590087469130318e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}